On Mon, Nov 26, 2012 at 03:51:04AM +0000, Zhang, Yang Z wrote:
> Gleb Natapov wrote on 2012-11-25:
> > On Wed, Nov 21, 2012 at 04:09:39PM +0800, Yang Zhang wrote:
> >> Posted Interrupt allows vAPICV interrupts to inject into guest directly
> >> without any vmexit.
> >> 
> >> - When delivering a interrupt to guest, if target vcpu is running,
> >>   update Posted-interrupt requests bitmap and send a notification event
> >>   to the vcpu. Then the vcpu will handle this interrupt automatically,
> >>   without any software involvemnt.
> > Looks like you allocating one irq vector per vcpu per pcpu and then
> > migrate it or reallocate when vcpu move from one pcpu to another.
> > This is not scalable and migrating irq migration slows things down.
> > What's wrong with allocating one global vector for posted interrupt
> > during vmx initialization and use it for all vcpus?
> 
> Consider the following situation: 
> If vcpu A is running when notification event which belong to vcpu B is 
> arrived, since the vector match the vcpu A's notification vector, then this 
> event will be consumed by vcpu A(even it do nothing) and the interrupt cannot 
> be handled in time.
The exact same situation is possible with your code. vcpu B can be
migrated from pcpu and vcpu A will take its place and will be assigned
the same vector as vcpu B. But I fail to see why is this a problem. vcpu
A will ignore PI since pir will be empty and vcpu B should detect new
event during next vmentry. 

> 
> >> - If target vcpu is not running or there already a notification event
> >>   pending in the vcpu, do nothing. The interrupt will be handled by old
> >>   way.
> >> Signed-off-by: Yang Zhang <yang.z.zh...@intel.com>
> >> ---
> >>  arch/x86/include/asm/kvm_host.h |    3 + arch/x86/include/asm/vmx.h   
> >>    |    4 + arch/x86/kernel/apic/io_apic.c  |  138
> >>  ++++++++++++++++++++++++++++ arch/x86/kvm/lapic.c            |   31
> >>  ++++++- arch/x86/kvm/lapic.h            |    8 ++ arch/x86/kvm/vmx.c  
> >>             |  192 +++++++++++++++++++++++++++++++++++++--
> >>  arch/x86/kvm/x86.c              |    2 + include/linux/kvm_host.h     
> >>    |    1 + virt/kvm/kvm_main.c             |    2 + 9 files changed,
> >>  372 insertions(+), 9 deletions(-)
> >> diff --git a/arch/x86/include/asm/kvm_host.h
> >> b/arch/x86/include/asm/kvm_host.h index 8e07a86..1145894 100644 ---
> >> a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h
> >> @@ -683,9 +683,12 @@ struct kvm_x86_ops {
> >>    void (*enable_irq_window)(struct kvm_vcpu *vcpu);       void
> >>  (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);         
> >> int
> >>  (*has_virtual_interrupt_delivery)(struct kvm_vcpu *vcpu); +       int
> >>  (*has_posted_interrupt)(struct kvm_vcpu *vcpu);   void
> >>  (*update_irq)(struct kvm_vcpu *vcpu);     void (*set_eoi_exitmap)(struct
> >>  kvm_vcpu *vcpu, int vector,                       int need_eoi, int 
> >> global);
> >> +  int (*send_nv)(struct kvm_vcpu *vcpu, int vector);
> >> +  void (*pi_migrate)(struct kvm_vcpu *vcpu);
> >>    int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
> >>    int (*get_tdp_level)(void);
> >>    u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
> >> diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
> >> index 1003341..7b9e1d0 100644
> >> --- a/arch/x86/include/asm/vmx.h
> >> +++ b/arch/x86/include/asm/vmx.h
> >> @@ -152,6 +152,7 @@
> >>  #define PIN_BASED_EXT_INTR_MASK                 0x00000001
> >>  #define PIN_BASED_NMI_EXITING                   0x00000008
> >>  #define PIN_BASED_VIRTUAL_NMIS                  0x00000020
> >> +#define PIN_BASED_POSTED_INTR                   0x00000080
> >> 
> >>  #define VM_EXIT_SAVE_DEBUG_CONTROLS             0x00000002 #define
> >>  VM_EXIT_HOST_ADDR_SPACE_SIZE            0x00000200 @@ -174,6 +175,7 @@
> >>  /* VMCS Encodings */ enum vmcs_field {    VIRTUAL_PROCESSOR_ID          
> >>   = 0x00000000, +  POSTED_INTR_NV                  = 0x00000002,
> >>    GUEST_ES_SELECTOR               = 0x00000800,   GUEST_CS_SELECTOR     
> >>           = 0x00000802,    GUEST_SS_SELECTOR               = 0x00000804,
> >>  @@ -208,6 +210,8 @@ enum vmcs_field {     VIRTUAL_APIC_PAGE_ADDR_HIGH    
> >>  = 0x00002013,     APIC_ACCESS_ADDR                = 0x00002014,
> >>    APIC_ACCESS_ADDR_HIGH           = 0x00002015,
> >> +  POSTED_INTR_DESC_ADDR           = 0x00002016,
> >> +  POSTED_INTR_DESC_ADDR_HIGH      = 0x00002017,
> >>    EPT_POINTER                     = 0x0000201a,
> >>    EPT_POINTER_HIGH                = 0x0000201b,
> >>    EOI_EXIT_BITMAP0                = 0x0000201c,
> >> diff --git a/arch/x86/kernel/apic/io_apic.c 
> >> b/arch/x86/kernel/apic/io_apic.c
> >> index 1817fa9..97cb8ee 100644
> >> --- a/arch/x86/kernel/apic/io_apic.c
> >> +++ b/arch/x86/kernel/apic/io_apic.c
> >> @@ -3277,6 +3277,144 @@ int arch_setup_dmar_msi(unsigned int irq)
> >>  }
> >>  #endif
> >> +static int
> >> +pi_set_affinity(struct irq_data *data, const struct cpumask *mask,
> >> +                bool force)
> >> +{
> >> +  unsigned int dest;
> >> +  struct irq_cfg *cfg = (struct irq_cfg *)data->chip_data;
> >> +  if (cpumask_equal(cfg->domain, mask))
> >> +          return IRQ_SET_MASK_OK;
> >> +
> >> +  if (__ioapic_set_affinity(data, mask, &dest))
> >> +          return -1;
> >> +
> >> +  return IRQ_SET_MASK_OK;
> >> +}
> >> +
> >> +static void pi_mask(struct irq_data *data)
> >> +{
> >> +  ;
> >> +}
> >> +
> >> +static void pi_unmask(struct irq_data *data)
> >> +{
> >> +  ;
> >> +}
> >> +
> >> +static struct irq_chip pi_chip = {
> >> +  .name       = "POSTED-INTR",
> >> +  .irq_ack    = ack_apic_edge,
> >> +  .irq_unmask = pi_unmask,
> >> +  .irq_mask   = pi_mask,
> >> +  .irq_set_affinity   = pi_set_affinity,
> >> +};
> >> +
> >> +int arch_pi_migrate(int irq, int cpu)
> >> +{
> >> +  struct irq_data *data = irq_get_irq_data(irq);
> >> +  struct irq_cfg *cfg;
> >> +  struct irq_desc *desc = irq_to_desc(irq);
> >> +  unsigned long flags;
> >> +
> >> +  if (!desc)
> >> +          return -EINVAL;
> >> +
> >> +  cfg = irq_cfg(irq);
> >> +  if (cpumask_equal(cfg->domain, cpumask_of(cpu)))
> >> +          return cfg->vector;
> >> +
> >> +  irq_set_affinity(irq, cpumask_of(cpu));
> >> +  raw_spin_lock_irqsave(&desc->lock, flags);
> >> +  irq_move_irq(data);
> >> +  raw_spin_unlock_irqrestore(&desc->lock, flags);
> >> +
> >> +  if (cfg->move_in_progress)
> >> +          send_cleanup_vector(cfg);
> >> +  return cfg->vector;
> >> +}
> >> +EXPORT_SYMBOL_GPL(arch_pi_migrate);
> >> +
> >> +static int arch_pi_create_irq(const struct cpumask *mask)
> >> +{
> >> +  int node = cpu_to_node(0);
> >> +  unsigned int irq_want;
> >> +  struct irq_cfg *cfg;
> >> +  unsigned long flags;
> >> +  unsigned int ret = 0;
> >> +  int irq;
> >> +
> >> +  irq_want = nr_irqs_gsi;
> >> +
> >> +  irq = alloc_irq_from(irq_want, node);
> >> +  if (irq < 0)
> >> +          return 0;
> >> +  cfg = alloc_irq_cfg(irq_want, node);
> > s/irq_want/irq.
> > 
> >> +  if (!cfg) {
> >> +          free_irq_at(irq, NULL);
> >> +          return 0;
> >> +  }
> >> +
> >> +  raw_spin_lock_irqsave(&vector_lock, flags);
> >> +  if (!__assign_irq_vector(irq, cfg, mask))
> >> +          ret = irq;
> >> +  raw_spin_unlock_irqrestore(&vector_lock, flags);
> >> +
> >> +  if (ret) {
> >> +          irq_set_chip_data(irq, cfg);
> >> +          irq_clear_status_flags(irq, IRQ_NOREQUEST);
> >> +  } else {
> >> +          free_irq_at(irq, cfg);
> >> +  }
> >> +  return ret;
> >> +}
> > 
> > This function is mostly cut&paste of create_irq_nr().
> 
> Yes, this function allow to allocate vector from specified cpu.
>  
Does not justify code duplication.

> >> +
> >> +int arch_pi_alloc_irq(void *vmx)
> >> +{
> >> +  int irq, cpu = smp_processor_id();
> >> +  struct irq_cfg *cfg;
> >> +
> >> +  irq = arch_pi_create_irq(cpumask_of(cpu));
> >> +  if (!irq) {
> >> +          pr_err("Posted Interrupt: no free irq\n");
> >> +          return -EINVAL;
> >> +  }
> >> +  irq_set_handler_data(irq, vmx);
> >> +  irq_set_chip_and_handler_name(irq, &pi_chip, handle_edge_irq, "edge");
> >> +  irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
> >> +  irq_set_affinity(irq, cpumask_of(cpu));
> >> +
> >> +  cfg = irq_cfg(irq);
> >> +  if (cfg->move_in_progress)
> >> +          send_cleanup_vector(cfg);
> >> +
> >> +  return irq;
> >> +}
> >> +EXPORT_SYMBOL_GPL(arch_pi_alloc_irq);
> >> +
> >> +void arch_pi_free_irq(unsigned int irq, void *vmx)
> >> +{
> >> +  if (irq) {
> >> +          irq_set_handler_data(irq, NULL);
> >> +          /* This will mask the irq */
> >> +          free_irq(irq, vmx);
> >> +          destroy_irq(irq);
> >> +  }
> >> +}
> >> +EXPORT_SYMBOL_GPL(arch_pi_free_irq);
> >> +
> >> +int arch_pi_get_vector(unsigned int irq)
> >> +{
> >> +  struct irq_cfg *cfg;
> >> +
> >> +  if (!irq)
> >> +          return -EINVAL;
> >> +
> >> +  cfg = irq_cfg(irq);
> >> +  return cfg->vector;
> >> +}
> >> +EXPORT_SYMBOL_GPL(arch_pi_get_vector);
> >> +
> >>  #ifdef CONFIG_HPET_TIMER
> >>  
> >>  static int hpet_msi_set_affinity(struct irq_data *data,
> >> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> >> index af48361..04220de 100644
> >> --- a/arch/x86/kvm/lapic.c
> >> +++ b/arch/x86/kvm/lapic.c
> >> @@ -656,7 +656,7 @@ void kvm_set_eoi_exitmap(struct kvm_vcpu *vcpu, int
> > vector,
> >>  static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
> >>                         int vector, int level, int trig_mode)
> >>  {
> >> -  int result = 0;
> >> +  int result = 0, send;
> >>    struct kvm_vcpu *vcpu = apic->vcpu;
> >>  
> >>    switch (delivery_mode) {
> >> @@ -674,6 +674,13 @@ static int __apic_accept_irq(struct kvm_lapic *apic, 
> >> int
> > delivery_mode,
> >>            } else {
> >>                    apic_clear_vector(vector, apic->regs + APIC_TMR);
> >>                    kvm_set_eoi_exitmap(vcpu, vector, 0, 0);
> >> +                  if (kvm_apic_pi_enabled(vcpu)) {
> > Provide send_nv() that returns 0 if pi is disabled.
> > 
> >> +                          send = kvm_x86_ops->send_nv(vcpu, vector);
> >> +                          if (send) {
> > No need "send" variable here.
> 
> ok.
>  
> >> +                                  result = 1;
> >> +                                  break;
> >> +                          }
> >> +                  }
> >>            }
> >>  
> >>            result = !apic_test_and_set_irr(vector, apic);
> >> @@ -1541,6 +1548,10 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
> >> 
> >>    if (kvm_x86_ops->has_virtual_interrupt_delivery(vcpu))
> >>            apic->vid_enabled = true;
> >> +
> >> +  if (kvm_x86_ops->has_posted_interrupt(vcpu))
> >> +          apic->pi_enabled = true;
> >> +
> > This is global state, no need per apic variable.
> > 
> >>    return 0;
> >>  nomem_free_apic:
> >>    kfree(apic);
> >> @@ -1575,6 +1586,24 @@ int kvm_apic_get_highest_irr(struct kvm_vcpu
> > *vcpu)
> >>  }
> >>  EXPORT_SYMBOL_GPL(kvm_apic_get_highest_irr);
> >> +void kvm_apic_update_irr(struct kvm_vcpu *vcpu, unsigned int *pir)
> >> +{
> >> +  struct kvm_lapic *apic = vcpu->arch.apic;
> >> +  unsigned int *reg;
> >> +  unsigned int i;
> >> +
> >> +  if (!apic || !apic_enabled(apic))
> > Use kvm_vcpu_has_lapic() instead of !apic.
> 
> ok.
>  
> >> +          return;
> >> +
> >> +  for (i = 0; i <= 7; i++) {
> >> +          reg = apic->regs + APIC_IRR + i * 0x10;
> >> +          *reg |= pir[i];
> > Non atomic access to IRR. Other threads may set bit there concurrently.
> Ok.
>  
> >> +          pir[i] = 0;
> >> +  }
> > Should set apic->irr_pending to true when setting irr bit.
> Right. Will add it in next version.
> 
> >> +  return;
> >> +}
> >> +EXPORT_SYMBOL_GPL(kvm_apic_update_irr);
> >> +
> >>  int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
> >>  {
> >>    u32 lvt0 = kvm_apic_get_reg(vcpu->arch.apic, APIC_LVT0);
> >> diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
> >> index 2503a64..ad35868 100644
> >> --- a/arch/x86/kvm/lapic.h
> >> +++ b/arch/x86/kvm/lapic.h
> >> @@ -21,6 +21,7 @@ struct kvm_lapic {
> >>    struct kvm_vcpu *vcpu;  bool irr_pending;       bool vid_enabled; +     
> >> bool
> >>  pi_enabled;       /* Number of bits set in ISR. */        s16 isr_count;  
> >> /* The
> >>  highest vector set in ISR; if -1 - invalid, must scan ISR. */ @@ -43,6
> >>  +44,7 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu); int
> >>  kvm_cpu_has_extint(struct kvm_vcpu *v); int kvm_cpu_get_extint(struct
> >>  kvm_vcpu *v); int kvm_apic_get_highest_irr(struct kvm_vcpu *vcpu);
> >>  +void kvm_apic_update_irr(struct kvm_vcpu *vcpu, unsigned int *pir);
> >>  void kvm_lapic_reset(struct kvm_vcpu *vcpu); u64
> >>  kvm_lapic_get_cr8(struct kvm_vcpu *vcpu); void
> >>  kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
> >> @@ -94,6 +96,12 @@ static inline bool kvm_apic_vid_enabled(struct kvm_vcpu
> > *vcpu)
> >>    return apic->vid_enabled;
> >>  }
> >> +static inline bool kvm_apic_pi_enabled(struct kvm_vcpu *vcpu)
> >> +{
> >> +  struct kvm_lapic *apic = vcpu->arch.apic;
> >> +  return apic->pi_enabled;
> >> +}
> >> +
> >>  int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data);
> >>  void kvm_lapic_init(void);
> >> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> >> index f6ef090..6448b96 100644
> >> --- a/arch/x86/kvm/vmx.c
> >> +++ b/arch/x86/kvm/vmx.c
> >> @@ -31,6 +31,7 @@
> >>  #include <linux/ftrace_event.h> #include <linux/slab.h> #include
> >>  <linux/tboot.h> +#include <linux/interrupt.h> #include
> >>  "kvm_cache_regs.h" #include "x86.h"
> >> @@ -89,6 +90,8 @@ module_param(enable_apicv_reg, bool, S_IRUGO);
> >>  static bool __read_mostly enable_apicv_vid = 0;
> >>  module_param(enable_apicv_vid, bool, S_IRUGO);
> >> +static bool __read_mostly enable_apicv_pi = 0;
> >> +module_param(enable_apicv_pi, bool, S_IRUGO);
> >>  /*
> >>   * If nested=1, nested virtualization is supported, i.e., guests may use
> >>   * VMX and be a hypervisor for its own guests. If nested=0, guests may not
> >> @@ -372,6 +375,44 @@ struct nested_vmx {
> >>    struct page *apic_access_page;
> >>  };
> >> +/* Posted-Interrupt Descriptor */
> >> +struct pi_desc {
> >> +  u32 pir[8];     /* Posted interrupt requested */
> >> +  union {
> >> +          struct {
> >> +                  u8  on:1,
> >> +                      rsvd:7;
> >> +          } control;
> >> +          u32 rsvd[8];
> >> +  } u;
> >> +} __aligned(64);
> >> +
> >> +#define POSTED_INTR_ON  0
> >> +u8 pi_test_on(struct pi_desc *pi_desc)
> >> +{
> >> +  return test_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->u.control);
> >> +}
> >> +void pi_set_on(struct pi_desc *pi_desc)
> >> +{
> >> +  set_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->u.control);
> >> +}
> >> +
> >> +void pi_clear_on(struct pi_desc *pi_desc)
> >> +{
> >> +  clear_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->u.control);
> >> +}
> >> +
> >> +u8 pi_test_and_set_on(struct pi_desc *pi_desc)
> >> +{
> >> +  return test_and_set_bit(POSTED_INTR_ON,
> >> +                  (unsigned long *)&pi_desc->u.control);
> >> +}
> >> +
> >> +void pi_set_pir(int vector, struct pi_desc *pi_desc)
> >> +{
> >> +  set_bit(vector, (unsigned long *)pi_desc->pir);
> >> +}
> >> +
> >>  struct vcpu_vmx {         struct kvm_vcpu       vcpu;     unsigned long   
> >>      
> >>  host_rsp; @@ -439,6 +480,11 @@ struct vcpu_vmx {  u64
> >>  eoi_exit_bitmap[4];       u64 eoi_exit_bitmap_global[4];
> >> +  /* Posted interrupt descriptor */
> >> +  struct pi_desc *pi;
> >> +  u32 irq;
> >> +  u32 vector;
> >> +
> >>    /* Support for a guest hypervisor (nested VMX) */
> >>    struct nested_vmx nested;
> >>  };
> >> @@ -698,6 +744,11 @@ static u64 host_efer;
> >> 
> >>  static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
> >> +int arch_pi_get_vector(unsigned int irq);
> >> +int arch_pi_alloc_irq(struct vcpu_vmx *vmx);
> >> +void arch_pi_free_irq(unsigned int irq, struct vcpu_vmx *vmx);
> >> +int arch_pi_migrate(int irq, int cpu);
> >> +
> >>  /*
> >>   * Keep MSR_STAR at the end, as setup_msrs() will try to optimize it
> >>   * away by decrementing the array size.
> >> @@ -783,6 +834,11 @@ static inline bool
> > cpu_has_vmx_virtual_intr_delivery(void)
> >>            SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
> >>  }
> >> +static inline bool cpu_has_vmx_posted_intr(void)
> >> +{
> >> +  return vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR;
> >> +}
> >> +
> >>  static inline bool cpu_has_vmx_flexpriority(void)
> >>  {
> >>    return cpu_has_vmx_tpr_shadow() &&
> >> @@ -1555,6 +1611,11 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu,
> > int cpu)
> >>            struct desc_ptr *gdt = &__get_cpu_var(host_gdt);
> >>            unsigned long sysenter_esp;
> >> +          if (enable_apicv_pi && to_vmx(vcpu)->pi)
> >> +                  pi_set_on(to_vmx(vcpu)->pi);
> >> +
> > Why?
> 
> Here means the vcpu start migration. So we should prevent the notification 
> event until migration end.
> 
You check for IN_GUEST_MODE while sending notification. Why is this not
enough? Also why vmx_vcpu_load() call means that vcpu start migration?

> >> +          kvm_make_request(KVM_REQ_POSTED_INTR, vcpu);
> >> +
> >>            kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);              
> >> local_irq_disable();
> >>            list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link, @@ -1582,6
> >>  +1643,8 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu)                
> >> vcpu->cpu
> >>  = -1;             kvm_cpu_vmxoff();       }
> >> +  if (enable_apicv_pi && to_vmx(vcpu)->pi)
> >> +          pi_set_on(to_vmx(vcpu)->pi);
> > Why?
> 
> When vcpu schedule out, no need to send notification event to it, just set 
> the PIR and wakeup it is enough.
Same as above. When vcpu is scheduled out it will no be in IN_GUEST_MODE
mode. Also in this case we probably should set bit directly in IRR and leave
PIR alone.

> 
> >>  }
> >>  
> >>  static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
> >> @@ -2451,12 +2514,6 @@ static __init int setup_vmcs_config(struct
> > vmcs_config *vmcs_conf)
> >>    u32 _vmexit_control = 0;
> >>    u32 _vmentry_control = 0;
> >> -  min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
> >> -  opt = PIN_BASED_VIRTUAL_NMIS;
> >> -  if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
> >> -                          &_pin_based_exec_control) < 0)
> >> -          return -EIO;
> >> -
> >>    min = CPU_BASED_HLT_EXITING |
> >>  #ifdef CONFIG_X86_64
> >>          CPU_BASED_CR8_LOAD_EXITING |
> >> @@ -2531,6 +2588,17 @@ static __init int setup_vmcs_config(struct
> > vmcs_config *vmcs_conf)
> >>                            &_vmexit_control) < 0)
> >>            return -EIO;
> >> +  min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
> >> +  opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR;
> >> +  if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
> >> +                          &_pin_based_exec_control) < 0)
> >> +          return -EIO;
> >> +
> >> +  if (!(_cpu_based_2nd_exec_control &
> >> +          SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) ||
> >> +          !(_vmexit_control & VM_EXIT_ACK_INTR_ON_EXIT))
> >> +          _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;
> >> +
> >>    min = 0;        opt = VM_ENTRY_LOAD_IA32_PAT;   if 
> >> (adjust_vmx_controls(min,
> >>  opt, MSR_IA32_VMX_ENTRY_CTLS, @@ -2715,6 +2783,9 @@ static __init int
> >>  hardware_setup(void)      if (!cpu_has_vmx_virtual_intr_delivery())
> >>            enable_apicv_vid = 0;
> >> +  if (!cpu_has_vmx_posted_intr() || !x2apic_enabled())
> > In nested guest x2apic may be enabled without irq remapping. Check for
> > irq remapping here.
> 
> There are no posted interrupt available in nested case. We don't need to 
> check IR here.
> 
One day emulation will be added. If pre-request for PI is IR check
for IR.

BTW why IR is needed for PI. To deliver assigned devices interrupts
directly into a guest sure, but why is it required for delivering
interrupts from emulated devices or IPIs?

> > 
> >> +          enable_apicv_pi = 0;
> >> +
> >>    if (nested)
> >>            nested_vmx_setup_ctls_msrs();
> >> @@ -3881,6 +3952,93 @@ static void ept_set_mmio_spte_mask(void)
> >>    kvm_mmu_set_mmio_spte_mask(0xffull << 49 | 0x6ull);
> >>  }
> >> +irqreturn_t pi_handler(int irq, void *data)
> >> +{
> >> +  struct vcpu_vmx *vmx = data;
> >> +
> >> +  kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu);
> >> +  kvm_vcpu_kick(&vmx->vcpu);
> >> +
> >> +  return IRQ_HANDLED;
> >> +}
> >> +
> >> +static int vmx_has_posted_interrupt(struct kvm_vcpu *vcpu)
> >> +{
> >> +  return irqchip_in_kernel(vcpu->kvm) && enable_apicv_pi;
> >> +}
> >> +
> >> +static void vmx_pi_migrate(struct kvm_vcpu *vcpu)
> >> +{
> >> +  int ret = 0;
> >> +  struct vcpu_vmx *vmx = to_vmx(vcpu);
> >> +
> >> +  if (!enable_apicv_pi)
> >> +          return ;
> >> +
> >> +  preempt_disable();
> >> +  local_irq_disable();
> >> +  if (!vmx->irq) {
> >> +          ret = arch_pi_alloc_irq(vmx);
> >> +          if (ret < 0) {
> >> +                  vmx->irq = -1;
> >> +                  goto out;
> >> +          }
> >> +          vmx->irq = ret;
> >> +
> >> +          ret = request_irq(vmx->irq, pi_handler, IRQF_NO_THREAD,
> >> +                                  "Posted Interrupt", vmx);
> >> +          if (ret) {
> >> +                  vmx->irq = -1;
> >> +                  goto out;
> >> +          }
> >> +
> >> +          ret = arch_pi_get_vector(vmx->irq);
> >> +  } else
> >> +          ret = arch_pi_migrate(vmx->irq, smp_processor_id());
> >> +
> >> +  if (ret < 0) {
> >> +          vmx->irq = -1;
> >> +          goto out;
> >> +  } else {
> >> +          vmx->vector = ret;
> >> +          vmcs_write16(POSTED_INTR_NV, vmx->vector);
> >> +          pi_clear_on(vmx->pi);
> >> +  }
> >> +out:
> >> +  local_irq_enable();
> >> +  preempt_enable();
> >> +  return ;
> >> +}
> >> +
> >> +static int vmx_send_nv(struct kvm_vcpu *vcpu,
> >> +          int vector)
> >> +{
> >> +  struct vcpu_vmx *vmx = to_vmx(vcpu);
> >> +
> >> +  if (unlikely(vmx->irq == -1))
> >> +          return 0;
> >> +
> >> +  if (vcpu->cpu == smp_processor_id()) {
> >> +          pi_set_on(vmx->pi);
> > Why? You clear this bit anyway in vmx_update_irq() during guest entry.
> Here means the target vcpu already in vmx non-root mode. Then it will consume 
> the interrupt on next vm entry and we don't need to send the notification 
> event from other cpu, just update PIR is enough.
I understand why you avoid sending PI IPI here, but you do not update
pir in this case either. You only set "on" bit here and set vector directly
in IRR in __apic_accept_irq() since vmx_send_nv() returns 0 in this case.
Interrupt is delivered from IRR on the next entry.

> 
> >> +          return 0; +     } + +   pi_set_pir(vector, vmx->pi); +  if
> >> (!pi_test_and_set_on(vmx->pi) && (vcpu->mode == IN_GUEST_MODE)) {
> >> +          apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), vmx->vector); +    
> >>         return
> >> 1; +       } +     return 0; +} + +static void free_pi(struct vcpu_vmx 
> >> *vmx) +{
> >> +  if (enable_apicv_pi) { +                kfree(vmx->pi);
> >> +          arch_pi_free_irq(vmx->irq, vmx); +      } +} +
> >>  /*
> >>   * Sets up the vmcs for emulated real mode.
> >>   */
> >> @@ -3890,6 +4048,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
> >>    unsigned long a;
> >>  #endif
> >>    int i;
> >> +  u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl;
> >> 
> >>    /* I/O */       vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a)); @@
> >>  -3901,8 +4060,10 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
> >>    vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
> >>  
> >>    /* Control */
> >> -  vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
> >> -          vmcs_config.pin_based_exec_ctrl);
> >> +  if (!enable_apicv_pi)
> >> +          pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
> >> +
> >> +  vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, pin_based_exec_ctrl);
> >> 
> >>    vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
> > vmx_exec_control(vmx));
> >> 
> >> @@ -3920,6 +4081,12 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
> >>            vmcs_write16(GUEST_INTR_STATUS, 0);
> >>    }
> >> +  if (enable_apicv_pi) {
> >> +          vmx->pi = kmalloc(sizeof(struct pi_desc),
> >> +                          GFP_KERNEL | __GFP_ZERO);
> >> +          vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((vmx->pi)));
> >> +  }
> >> +
> >>    if (ple_gap) {          vmcs_write32(PLE_GAP, ple_gap);
> >>            vmcs_write32(PLE_WINDOW, ple_window); @@ -6161,6 +6328,11 @@ 
> >> static
> >>  void vmx_update_irq(struct kvm_vcpu *vcpu)        if (!enable_apicv_vid)
> >>            return ;
> >> +  if (enable_apicv_pi) {
> >> +          kvm_apic_update_irr(vcpu, (unsigned int *)vmx->pi->pir);
> >> +          pi_clear_on(vmx->pi);
> > Why do you do that? Isn't VMX process posted interrupts on vmentry if "on" 
> > bit
> > is set?
Can you answer this question?

> > 
> >> +  }
> >> +
> >>    vector = kvm_apic_get_highest_irr(vcpu);
> >>    if (vector == -1)
> >>            return;
> >> @@ -6586,6 +6758,7 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
> >> 
> >>    free_vpid(vmx);         free_nested(vmx); +     free_pi(vmx);
> >>    free_loaded_vmcs(vmx->loaded_vmcs);     kfree(vmx->guest_msrs);
> >>    kvm_vcpu_uninit(vcpu); @@ -7483,8 +7656,11 @@ static struct
> >>  kvm_x86_ops vmx_x86_ops = {       .enable_irq_window = enable_irq_window,
> >>    .update_cr8_intercept = update_cr8_intercept,
> >>    .has_virtual_interrupt_delivery = vmx_has_virtual_interrupt_delivery,
> >>  + .has_posted_interrupt = vmx_has_posted_interrupt,       .update_irq =
> >>  vmx_update_irq,   .set_eoi_exitmap = vmx_set_eoi_exitmap,
> >> +  .send_nv = vmx_send_nv,
> >> +  .pi_migrate = vmx_pi_migrate,
> >> 
> >>    .set_tss_addr = vmx_set_tss_addr,
> >>    .get_tdp_level = get_ept_level,
> >> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> >> index 8b8de3b..f035267 100644
> >> --- a/arch/x86/kvm/x86.c
> >> +++ b/arch/x86/kvm/x86.c
> >> @@ -5250,6 +5250,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
> >>    bool req_immediate_exit = 0;
> >>  
> >>    if (vcpu->requests) {
> >> +          if (kvm_check_request(KVM_REQ_POSTED_INTR, vcpu))
> >> +                  kvm_x86_ops->pi_migrate(vcpu);
> >>            if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
> >>                    kvm_mmu_unload(vcpu);
> >>            if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
> >> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> >> index ecc5543..f8d8d34 100644
> >> --- a/include/linux/kvm_host.h
> >> +++ b/include/linux/kvm_host.h
> >> @@ -107,6 +107,7 @@ static inline bool is_error_page(struct page *page)
> >>  #define KVM_REQ_IMMEDIATE_EXIT    15
> >>  #define KVM_REQ_PMU               16
> >>  #define KVM_REQ_PMI               17
> >> +#define KVM_REQ_POSTED_INTR       18
> >> 
> >>  #define KVM_USERSPACE_IRQ_SOURCE_ID               0
> >>  #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID  1
> >> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> >> index be70035..05baf1c 100644
> >> --- a/virt/kvm/kvm_main.c
> >> +++ b/virt/kvm/kvm_main.c
> >> @@ -1625,6 +1625,8 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
> >>                    smp_send_reschedule(cpu);
> >>    put_cpu();
> >>  }
> >> +EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
> >> +
> >>  #endif /* !CONFIG_S390 */
> >>  
> >>  void kvm_resched(struct kvm_vcpu *vcpu)
> >> --
> >> 1.7.1
> > 
> > --
> >                     Gleb.
> > --
> > To unsubscribe from this list: send the line "unsubscribe kvm" in
> > the body of a message to majord...@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
> 
> Best regards,
> Yang
> 

--
                        Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to