On 12/10/2009 08:38 PM, or...@il.ibm.com wrote:
From: Orit Wasserman <or...@il.ibm.com>


(changelog)

@@ -1525,6 +1539,22 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
                        new_offset = vmcs_read64(TSC_OFFSET) + delta;
                        vmcs_write64(TSC_OFFSET, new_offset);
                }
+
+               if (l1_shadow_vmcs != NULL) {
+                       l1_shadow_vmcs->host_tr_base =
+                               vmcs_readl(HOST_TR_BASE);
+                       l1_shadow_vmcs->host_gdtr_base =
+                               vmcs_readl(HOST_GDTR_BASE);
+                       l1_shadow_vmcs->host_ia32_sysenter_esp =
+                               vmcs_readl(HOST_IA32_SYSENTER_ESP);
+
+                       if (tsc_this<  vcpu->arch.host_tsc)
+                               l1_shadow_vmcs->tsc_offset =
+                                       vmcs_read64(TSC_OFFSET);
+
+                       if (vmx->nested.nested_mode)
+                               load_vmcs_host_state(l1_shadow_vmcs);
+               }

Please share this code with non-nested vmcs setup.

@@ -3794,6 +3824,11 @@ static void enable_irq_window(struct kvm_vcpu *vcpu)
  {
        u32 cpu_based_vm_exec_control;

+       if (to_vmx(vcpu)->nested.nested_mode) {
+               nested_vmx_intr(vcpu);
+               return;
+       }

I think this happens too late? enable_irq_window() is called after we've given up on injecting the interrupt because interrupts are disabled. But if we're running a guest, we can vmexit and inject the interrupt. This code will only vmexit.

Hm, I see the vmexit code has an in_interrupt case, but I'd like this to be more regular: adjust vmx_interrupt_allowed() to allow interrupts if in a guest, and vmx_inject_irq() to force the vmexit. This way interrupts have a single code path.


  static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
  {
+       if (to_vmx(vcpu)->nested.nested_mode) {
+               if (!nested_vmx_intr(vcpu))
+                       return 0;
+       }

... and you do that... so I wonder why the changes to enable_irq_window() are needed?

+
        return (vmcs_readl(GUEST_RFLAGS)&  X86_EFLAGS_IF)&&
                !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)&
                        (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
@@ -4042,6 +4082,10 @@ static int handle_exception(struct kvm_vcpu *vcpu)
                   not interested (exception bitmap 12 does not include 
NM_VECTOR)
                   enable fpu and resume l2 (avoid switching to l1)
                */
+
+               if (vmx->nested.nested_mode)
+                       vmx->nested.nested_run_pending = 1; /* removing this 
line cause hung on boot of l2*/
+

This indicates a hack?

                vmx_fpu_activate(vcpu);

                return 1;
@@ -4169,7 +4213,33 @@ static int handle_cr(struct kvm_vcpu *vcpu)
                trace_kvm_cr_write(cr, val);
                switch (cr) {
                case 0:
-                       kvm_set_cr0(vcpu, val);
+                       if (to_vmx(vcpu)->nested.nested_mode) {
+                               /* assume only X86_CR0_TS is handled by l0 */
+                               long new_cr0 = vmcs_readl(GUEST_CR0);
+                               long new_cr0_read_shadow = 
vmcs_readl(CR0_READ_SHADOW);
+
+                               vmx_fpu_deactivate(vcpu);
+
+                               if (val&  X86_CR0_TS) {
+                                       new_cr0 |= X86_CR0_TS;
+                                       new_cr0_read_shadow |= X86_CR0_TS;
+                                       vcpu->arch.cr0 |= X86_CR0_TS;
+                               } else {
+                                       new_cr0&= ~X86_CR0_TS;
+                                       new_cr0_read_shadow&= ~X86_CR0_TS;
+                                       vcpu->arch.cr0&= X86_CR0_TS;
+                               }
+
+                               vmcs_writel(GUEST_CR0, new_cr0);
+                               vmcs_writel(CR0_READ_SHADOW, 
new_cr0_read_shadow);

Don't you need to #vmexit if the new cr0 violates the cr0_bits_always_on constraint, or if it changes bits in cr0 that the guest intercepts?

+
+                               if (!(val&  X86_CR0_TS) || !(val&  X86_CR0_PE))
+                                       vmx_fpu_activate(vcpu);
+
+                               to_vmx(vcpu)->nested.nested_run_pending = 1;

Please split into a function.

+                       } else
+                               kvm_set_cr0(vcpu, val);
+
                        skip_emulated_instruction(vcpu);
                        return 1;
                case 3:
@@ -4196,8 +4266,15 @@ static int handle_cr(struct kvm_vcpu *vcpu)
                break;
        case 2: /* clts */
                vmx_fpu_deactivate(vcpu);
-               vcpu->arch.cr0&= ~X86_CR0_TS;
-               vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
+               if (to_vmx(vcpu)->nested.nested_mode) {
+                       vmcs_writel(GUEST_CR0, vmcs_readl(GUEST_CR0)&  
~X86_CR0_TS);
+                       vmcs_writel(CR0_READ_SHADOW, 
vmcs_readl(CR0_READ_SHADOW)&  ~X86_CR0_TS);
+                       vcpu->arch.cr0&= ~X86_CR0_TS;
+                       to_vmx(vcpu)->nested.nested_run_pending = 1;

Won't the guest want to intercept this some time?

        /* Access CR3 don't cause VMExit in paging mode, so we need
         * to sync with guest real CR3. */
        if (enable_ept&&  is_paging(vcpu))
@@ -5347,6 +5435,60 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx)
                | vmx->rmode.irq.vector;
  }

+static int nested_handle_valid_idt(struct kvm_vcpu *vcpu)

I asked for this to be renamed.

  #ifdef CONFIG_X86_64
  #define R "r"
  #define Q "q"
@@ -5358,8 +5500,17 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx)
  static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
  {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
+       int r;
        u32 nested_exception_bitmap = 0;

+       if (vmx->nested.nested_mode) {
+               r = nested_handle_valid_idt(vcpu);

This will cause vmread()s, before the guest is launched, of state that is not saved anywhere. This means it is broken across migration, or after set_regs().

In general we follow the following pattern:

  read from memory
  vmwrite
  vmlaunch/vmresume
  vmread
  write to memory
  loop

There are exceptions where we allow state to be cached, mostly registers. But we keep accessors for them so that save/restore works.

+
+static int nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu)
+{
+       if (to_vmx(vcpu)->nested.nested_mode) {
+               struct page *msr_page = NULL;
+               u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX];
+               u32 exit_code = vmcs_read32(VM_EXIT_REASON);
+               struct shadow_vmcs *l2svmcs = get_shadow_vmcs(vcpu);
+
+               if (!cpu_has_vmx_msr_bitmap()
+                   || !nested_cpu_has_vmx_msr_bitmap(vcpu))
+                       return 1;
+
+               msr_page = nested_get_page(vcpu,
+                                          l2svmcs->msr_bitmap);
+
+               if (!msr_page) {
+                       printk(KERN_INFO "%s error in nested_get_page\n",
+                              __func__);
+                       return 0;
+               }
+
+               switch (exit_code) {
+               case EXIT_REASON_MSR_READ:
+                       if (msr_index<= 0x1fff) {
+                               if (test_bit(msr_index,
+                                            (unsigned long *)(msr_page +
+                                                              0x000)))
+                                       return 1;
+                       } else if ((msr_index>= 0xc0000000)&&
+                                  (msr_index<= 0xc0001fff)) {
+                               msr_index&= 0x1fff;
+                               if (test_bit(msr_index,
+                                            (unsigned long *)(msr_page +
+                                                              0x400)))
+                                       return 1;
+                       }
+                       break;
+               case EXIT_REASON_MSR_WRITE:
+                       if (msr_index<= 0x1fff) {
+                               if (test_bit(msr_index,
+                                            (unsigned long *)(msr_page +
+                                                              0x800)))
+                                               return 1;
+                       } else if ((msr_index>= 0xc0000000)&&
+                                  (msr_index<= 0xc0001fff)) {
+                               msr_index&= 0x1fff;
+                               if (test_bit(msr_index,
+                                            (unsigned long *)(msr_page +
+                                                              0xc00)))
+                                       return 1;
+                       }
+                       break;
+               }
+       }
+

Please refactor with a single test_bit, just calculate the offsets differently (+400*8 for high msrs, +800*8 for writes).

+
+static int nested_vmx_exit_handled(struct kvm_vcpu *vcpu, bool kvm_override)
+{
+       u32 exit_code = vmcs_read32(VM_EXIT_REASON);
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+       struct shadow_vmcs *l2svmcs;
+
+       int r = 0;
+
+       if (vmx->nested.nested_run_pending)
+               return 0;
+
+       if (unlikely(vmx->fail)) {
+               printk(KERN_INFO "%s failed vm entry %x\n",
+                      __func__, vmcs_read32(VM_INSTRUCTION_ERROR));
+               return 1;
+       }
+
+       if (kvm_override) {

What's kvm_override?

+               switch (exit_code) {
+               case EXIT_REASON_EXTERNAL_INTERRUPT:
+                       return 0;
+               case EXIT_REASON_EXCEPTION_NMI:
+                       if (!is_exception(intr_info))
+                               return 0;
+
+                       if (is_page_fault(intr_info)&&  (!enable_ept))
+                               return 0;
+
+                       break;
+               case EXIT_REASON_EPT_VIOLATION:
+                       if (enable_ept)
+                               return 0;
+
+                       break;
+               }
+       }
+
+
+       if (!nested_map_current(vcpu))
+               return 0;
+
+       l2svmcs = get_shadow_vmcs(vcpu);
+
+       switch (exit_code) {
+       case EXIT_REASON_INVLPG:
+               if (l2svmcs->cpu_based_vm_exec_control&
+                   CPU_BASED_INVLPG_EXITING)
+                       r = 1;
+               break;
+       case EXIT_REASON_MSR_READ:
+       case EXIT_REASON_MSR_WRITE:
+               r = nested_vmx_exit_handled_msr(vcpu);
+               break;
+       case EXIT_REASON_CR_ACCESS: {
+               unsigned long exit_qualification =
+                       vmcs_readl(EXIT_QUALIFICATION);
+               int cr = exit_qualification&  15;
+               int reg = (exit_qualification>>  8)&  15;
+               unsigned long val = kvm_register_read(vcpu, reg);
+
+               switch ((exit_qualification>>  4)&  3) {
+               case 0: /* mov to cr */
+                       switch (cr) {
+                       case 0:
+                               if (l2svmcs->cr0_guest_host_mask&
+                                   (val ^ l2svmcs->cr0_read_shadow))
+                                       r = 1;
+ break;
+                       case 3:
+                               if (l2svmcs->cpu_based_vm_exec_control&
+                                   CPU_BASED_CR3_LOAD_EXITING)
+                                       r = 1;
+                               break;
+                       case 4:
+                               if (l2svmcs->cr4_guest_host_mask&
+                                   (l2svmcs->cr4_read_shadow ^ val))
+                                       r = 1;
+                               break;
+                       case 8:
+                               if (l2svmcs->cpu_based_vm_exec_control&
+                                   CPU_BASED_CR8_LOAD_EXITING)
+                                       r = 1;
+                               break;
+                       }
+                       break;
+               case 2: /* clts */
+                       if (l2svmcs->cr0_guest_host_mask&  X86_CR0_TS)
+                               r = 1;
+                       break;
+               case 1: /*mov from cr*/
+                       switch (cr) {
+                       case 0:
+                               r = 1;
+                       case 3:
+                               if (l2svmcs->cpu_based_vm_exec_control&
+                                   CPU_BASED_CR3_STORE_EXITING)
+                                       r = 1;
+                               break;
+                       case 4:
+                               r = 1;
+                               break;
+                       case 8:
+                               if (l2svmcs->cpu_based_vm_exec_control&
+                                   CPU_BASED_CR8_STORE_EXITING)
+                                       r = 1;
+                               break;
+                       }
+                       break;
+               case 3: /* lmsw */
+                       if (l2svmcs->cr0_guest_host_mask&
+                           (val ^ l2svmcs->cr0_read_shadow))
+                               r = 1;
+                       break;
+               }
+               break;
+       }
+       case EXIT_REASON_DR_ACCESS: {
+               if (l2svmcs->cpu_based_vm_exec_control&
+                   CPU_BASED_MOV_DR_EXITING)
+                       r = 1;
+               break;
+       }
+
+       case EXIT_REASON_EXCEPTION_NMI: {
+
+               if (is_external_interrupt(intr_info)&&
+                   (l2svmcs->pin_based_vm_exec_control&
+                    PIN_BASED_EXT_INTR_MASK))
+                       r = 1;
+               else if (is_nmi(intr_info)&&
+                   (l2svmcs->pin_based_vm_exec_control&
+                    PIN_BASED_NMI_EXITING))
+                       r = 1;
+               else if (is_exception(intr_info)&&
+                   (l2svmcs->exception_bitmap&
+                    (1u<<  (intr_info&  INTR_INFO_VECTOR_MASK))))
+                       r = 1;
+               else if (is_page_fault(intr_info))
+                       r = 1;
+               break;
+       }
+
+       case EXIT_REASON_EXTERNAL_INTERRUPT:
+               if (l2svmcs->pin_based_vm_exec_control&
+                   PIN_BASED_EXT_INTR_MASK)
+                       r = 1;
+               break;
+       default:
+               r = 1;
+       }
+       nested_unmap_current(vcpu);
+

Please move these to the normal handlers so it is possible to follow the code.

--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to