Re: [PATCH v9 3/3] x86, apicv: add virtual interrupt delivery support

2013-01-10 Thread Gleb Natapov
On Thu, Jan 10, 2013 at 03:26:08PM +0800, Yang Zhang wrote:
 From: Yang Zhang yang.z.zh...@intel.com
 
 Virtual interrupt delivery avoids KVM to inject vAPIC interrupts
 manually, which is fully taken care of by the hardware. This needs
 some special awareness into existing interrupr injection path:
 
 - for pending interrupt, instead of direct injection, we may need
   update architecture specific indicators before resuming to guest.
 
 - A pending interrupt, which is masked by ISR, should be also
   considered in above update action, since hardware will decide
   when to inject it at right time. Current has_interrupt and
   get_interrupt only returns a valid vector from injection p.o.v.
 
 Signed-off-by: Kevin Tian kevin.t...@intel.com
 Signed-off-by: Yang Zhang yang.z.zh...@intel.com
 ---
  arch/x86/include/asm/kvm_host.h |5 +
  arch/x86/include/asm/vmx.h  |   11 +++
  arch/x86/kvm/irq.c  |   56 +++-
  arch/x86/kvm/lapic.c|   72 +--
  arch/x86/kvm/lapic.h|   23 +
  arch/x86/kvm/svm.c  |   18 
  arch/x86/kvm/vmx.c  |  191 
 +--
  arch/x86/kvm/x86.c  |   14 +++-
  include/linux/kvm_host.h|3 +
  virt/kvm/ioapic.c   |   18 
  virt/kvm/ioapic.h   |4 +
  virt/kvm/irq_comm.c |   22 +
  virt/kvm/kvm_main.c |5 +
  13 files changed, 399 insertions(+), 43 deletions(-)
 
 diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
 index 572a562..f471856 100644
 --- a/arch/x86/include/asm/kvm_host.h
 +++ b/arch/x86/include/asm/kvm_host.h
 @@ -697,6 +697,10 @@ struct kvm_x86_ops {
   void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
   void (*enable_irq_window)(struct kvm_vcpu *vcpu);
   void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
 + int (*has_virtual_interrupt_delivery)(struct kvm_vcpu *vcpu);
 + void (*update_apic_irq)(struct kvm_vcpu *vcpu, int max_irr);
 + void (*update_eoi_exitmap)(struct kvm_vcpu *vcpu);
 + void (*set_svi)(int isr);
   void (*enable_virtual_x2apic_mode)(struct kvm_vcpu *vcpu);
   void (*disable_virtual_x2apic_mode)(struct kvm_vcpu *vcpu);
   int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
 @@ -993,6 +997,7 @@ int kvm_age_hva(struct kvm *kvm, unsigned long hva);
  int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
  void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
  int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
 +int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v);
  int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
  int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
  int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
 diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
 index 0a54df0..694586c 100644
 --- a/arch/x86/include/asm/vmx.h
 +++ b/arch/x86/include/asm/vmx.h
 @@ -62,6 +62,7 @@
  #define EXIT_REASON_MCE_DURING_VMENTRY  41
  #define EXIT_REASON_TPR_BELOW_THRESHOLD 43
  #define EXIT_REASON_APIC_ACCESS 44
 +#define EXIT_REASON_EOI_INDUCED 45
  #define EXIT_REASON_EPT_VIOLATION   48
  #define EXIT_REASON_EPT_MISCONFIG   49
  #define EXIT_REASON_WBINVD  54
 @@ -144,6 +145,7 @@
  #define SECONDARY_EXEC_WBINVD_EXITING0x0040
  #define SECONDARY_EXEC_UNRESTRICTED_GUEST0x0080
  #define SECONDARY_EXEC_APIC_REGISTER_VIRT   0x0100
 +#define SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY0x0200
  #define SECONDARY_EXEC_PAUSE_LOOP_EXITING0x0400
  #define SECONDARY_EXEC_ENABLE_INVPCID0x1000
  
 @@ -181,6 +183,7 @@ enum vmcs_field {
   GUEST_GS_SELECTOR   = 0x080a,
   GUEST_LDTR_SELECTOR = 0x080c,
   GUEST_TR_SELECTOR   = 0x080e,
 + GUEST_INTR_STATUS   = 0x0810,
   HOST_ES_SELECTOR= 0x0c00,
   HOST_CS_SELECTOR= 0x0c02,
   HOST_SS_SELECTOR= 0x0c04,
 @@ -208,6 +211,14 @@ enum vmcs_field {
   APIC_ACCESS_ADDR_HIGH   = 0x2015,
   EPT_POINTER = 0x201a,
   EPT_POINTER_HIGH= 0x201b,
 + EOI_EXIT_BITMAP0= 0x201c,
 + EOI_EXIT_BITMAP0_HIGH   = 0x201d,
 + EOI_EXIT_BITMAP1= 0x201e,
 + EOI_EXIT_BITMAP1_HIGH   = 0x201f,
 + EOI_EXIT_BITMAP2= 0x2020,
 + EOI_EXIT_BITMAP2_HIGH   = 0x2021,
 + EOI_EXIT_BITMAP3= 0x2022,
 + EOI_EXIT_BITMAP3_HIGH   = 0x2023,
   GUEST_PHYSICAL_ADDRESS  = 0x2400,
   GUEST_PHYSICAL_ADDRESS_HIGH = 0x2401,
   VMCS_LINK_POINTER   = 0x2800,
 diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
 index b111aee..e113440 100644
 --- 

Re: [PATCH v9 2/3] x86, apicv: add virtual x2apic support

2013-01-10 Thread Gleb Natapov
On Thu, Jan 10, 2013 at 03:26:07PM +0800, Yang Zhang wrote:
 +static void vmx_enable_virtual_x2apic_mode(struct kvm_vcpu *vcpu)
 +{
 + u32 exec_control;
 + int msr;
 + struct vcpu_vmx *vmx = to_vmx(vcpu);
 +
 + if (!cpu_has_vmx_virtualize_x2apic_mode())
 + return;
 +
 + exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
 + /* virtualize x2apic mode relies on tpr shadow */
 + if (!(exec_control  CPU_BASED_TPR_SHADOW))
 + return;
 +
 + exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
 + exec_control = ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
 + exec_control |= SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
 + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
 + vmx-virtual_x2apic_enabled = true;
 +
 + if (!cpu_has_vmx_virtual_intr_delivery())
 + return;
 +
 + for (msr = 0x800; msr = 0x8ff; msr++)
 + vmx_intercept_for_msr_read(msr, false, false);
 +
 + /* APIC ID */
 + vmx_intercept_for_msr_read(0x802, false, true);
Why are you enabling apic id read intercept?

 + /* TMCCT */
 + vmx_intercept_for_msr_read(0x839, false, true);
 + /* TPR */
 + vmx_intercept_for_msr_write(0x808, false, false);
 + /* EOI */
 + vmx_intercept_for_msr_write(0x80b, false, false);
 + /* SELF-IPI */
 + vmx_intercept_for_msr_write(0x83f, false, false);
 +
 +}
 +

--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [PATCH v9 2/3] x86, apicv: add virtual x2apic support

2013-01-10 Thread Zhang, Yang Z
Gleb Natapov wrote on 2013-01-10:
 On Thu, Jan 10, 2013 at 03:26:07PM +0800, Yang Zhang wrote:
 +static void vmx_enable_virtual_x2apic_mode(struct kvm_vcpu *vcpu)
 +{
 +u32 exec_control;
 +int msr;
 +struct vcpu_vmx *vmx = to_vmx(vcpu);
 +
 +if (!cpu_has_vmx_virtualize_x2apic_mode())
 +return;
 +
 +exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
 +/* virtualize x2apic mode relies on tpr shadow */
 +if (!(exec_control  CPU_BASED_TPR_SHADOW))
 +return;
 +
 +exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
 +exec_control = ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
 +exec_control |= SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
 +vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
 +vmx-virtual_x2apic_enabled = true;
 +
 +if (!cpu_has_vmx_virtual_intr_delivery())
 +return;
 +
 +for (msr = 0x800; msr = 0x8ff; msr++)
 +vmx_intercept_for_msr_read(msr, false, false);
 +
 +/* APIC ID */
 +vmx_intercept_for_msr_read(0x802, false, true);
 Why are you enabling apic id read intercept?
Current code to read apic id in x2apic mode has some hacks:

if (apic_x2apic_mode(apic))
   val = kvm_apic_id(apic);
else
   val = kvm_apic_id(apic)  24;

static inline int kvm_apic_id(struct kvm_lapic *apic)
{
return (kvm_apic_get_reg(apic, APIC_ID)  24)  0xff;
}

According SPEC, in x2apic mode, the whole id reg is used, but in KVM, it only 
use the highest eight bits.

 +/* TMCCT */
 +vmx_intercept_for_msr_read(0x839, false, true);
 +/* TPR */
 +vmx_intercept_for_msr_write(0x808, false, false);
 +/* EOI */
 +vmx_intercept_for_msr_write(0x80b, false, false);
 +/* SELF-IPI */
 +vmx_intercept_for_msr_write(0x83f, false, false);
 +
 +}
 +
 
 --
   Gleb.


Best regards,
Yang


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [PATCH v9 2/3] x86, apicv: add virtual x2apic support

2013-01-10 Thread Zhang, Yang Z
Gleb Natapov wrote on 2013-01-10:
 On Thu, Jan 10, 2013 at 03:26:07PM +0800, Yang Zhang wrote:
 From: Yang Zhang yang.z.zh...@intel.com
 
 basically to benefit from apicv, we need to enable virtualized x2apic mode.
 Currently, we only enable it when guest is really using x2apic.
 
 Also, clear MSR bitmap for corresponding x2apic MSRs when guest enabled
 x2apic:
 0x800 - 0x8ff: no read intercept for apicv register virtualization,
 except APIC ID and TMCCT.
 APIC ID and TMCCT: need software's assistance to get right value.
 TPR,EOI,SELF-IPI: no write intercept for virtual interrupt delivery.
 Signed-off-by: Kevin Tian kevin.t...@intel.com
 Signed-off-by: Yang Zhang yang.z.zh...@intel.com
 ---
  arch/x86/include/asm/kvm_host.h |2 + arch/x86/include/asm/vmx.h   
|1 + arch/x86/kvm/lapic.c|5 +-
  arch/x86/kvm/svm.c  |6 + arch/x86/kvm/vmx.c   
|  194 +-- 5 files changed, 200
  insertions(+), 8 deletions(-)
 diff --git a/arch/x86/include/asm/kvm_host.h
 b/arch/x86/include/asm/kvm_host.h index c431b33..572a562 100644 ---
 a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h
 @@ -697,6 +697,8 @@ struct kvm_x86_ops {
  void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
  void (*enable_irq_window)(struct kvm_vcpu *vcpu);
  void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
 +void (*enable_virtual_x2apic_mode)(struct kvm_vcpu *vcpu);
 +void (*disable_virtual_x2apic_mode)(struct kvm_vcpu *vcpu);
 Make one callback with enable/disable parameter. And do not forget SVM.
 
  int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
  int (*get_tdp_level)(void);
  u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
 diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
 index 44c3f7e..0a54df0 100644
 --- a/arch/x86/include/asm/vmx.h
 +++ b/arch/x86/include/asm/vmx.h
 @@ -139,6 +139,7 @@
  #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x0001 #define
  SECONDARY_EXEC_ENABLE_EPT   0x0002 #define
  SECONDARY_EXEC_RDTSCP   0x0008 +#define
  SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE   0x0010 #define
  SECONDARY_EXEC_ENABLE_VPID  0x0020 #define
  SECONDARY_EXEC_WBINVD_EXITING   0x0040 #define
  SECONDARY_EXEC_UNRESTRICTED_GUEST   0x0080
 diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
 index 0664c13..ec38906 100644
 --- a/arch/x86/kvm/lapic.c
 +++ b/arch/x86/kvm/lapic.c
 @@ -1328,7 +1328,10 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu,
 u64 value)
  u32 id = kvm_apic_id(apic);
  u32 ldr = ((id  4)  16) | (1  (id  0xf));
  kvm_apic_set_ldr(apic, ldr);
 -}
 +kvm_x86_ops-enable_virtual_x2apic_mode(vcpu);
 +} else
 +kvm_x86_ops-disable_virtual_x2apic_mode(vcpu);
 +
 You just broke SVM.
  apic-base_address = apic-vcpu-arch.apic_base 
   MSR_IA32_APICBASE_BASE;
 diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
 index d29d3cd..0b82cb1 100644
 --- a/arch/x86/kvm/svm.c
 +++ b/arch/x86/kvm/svm.c
 @@ -3571,6 +3571,11 @@ static void update_cr8_intercept(struct kvm_vcpu
 *vcpu, int tpr, int irr)
  set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
  }
 +static void svm_enable_virtual_x2apic_mode(struct kvm_vcpu *vcpu)
 +{
 +return;
 +}
 +
  static int svm_nmi_allowed(struct kvm_vcpu *vcpu) { struct vcpu_svm
  *svm = to_svm(vcpu); @@ -4290,6 +4295,7 @@ static struct kvm_x86_ops
  svm_x86_ops = { .enable_nmi_window = enable_nmi_window,
  .enable_irq_window = enable_irq_window, .update_cr8_intercept =
  update_cr8_intercept,
 +.enable_virtual_x2apic_mode = svm_enable_virtual_x2apic_mode,
 
  .set_tss_addr = svm_set_tss_addr,
  .get_tdp_level = get_npt_level,
 diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
 index 688f43f..b203ce7 100644
 --- a/arch/x86/kvm/vmx.c
 +++ b/arch/x86/kvm/vmx.c
 @@ -433,6 +433,8 @@ struct vcpu_vmx {
 
  bool rdtscp_enabled;
 +bool virtual_x2apic_enabled;
 +
  /* Support for a guest hypervisor (nested VMX) */
  struct nested_vmx nested;
  };
 @@ -767,12 +769,23 @@ static inline bool
 cpu_has_vmx_virtualize_apic_accesses(void)
  SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
  }
 +static inline bool cpu_has_vmx_virtualize_x2apic_mode(void)
 +{
 +return vmcs_config.cpu_based_2nd_exec_ctrl 
 +SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
 +}
 +
  static inline bool cpu_has_vmx_apic_register_virt(void)
  {
  return vmcs_config.cpu_based_2nd_exec_ctrl 
  SECONDARY_EXEC_APIC_REGISTER_VIRT;
  }
 +static inline bool cpu_has_vmx_virtual_intr_delivery(void)
 +{
 +return false;
 +}
 +
  static inline bool cpu_has_vmx_flexpriority(void)
  {
  return cpu_has_vmx_tpr_shadow() 
 @@ -2544,6 +2557,7 @@ static __init int 

Re: [PATCH 01/12] tap: multiqueue support

2013-01-10 Thread Stefan Hajnoczi
On Wed, Jan 09, 2013 at 11:25:24PM +0800, Jason Wang wrote:
 On 01/09/2013 05:56 PM, Stefan Hajnoczi wrote:
  On Fri, Dec 28, 2012 at 06:31:53PM +0800, Jason Wang wrote:
  diff --git a/qapi-schema.json b/qapi-schema.json
  index 5dfa052..583eb7c 100644
  --- a/qapi-schema.json
  +++ b/qapi-schema.json
  @@ -2465,7 +2465,7 @@
   { 'type': 'NetdevTapOptions',
 'data': {
   '*ifname': 'str',
  -'*fd': 'str',
  +'*fd': ['String'],
  This change is not backwards-compatible.  You need to add a '*fds':
  ['String'] field instead.
 
 I'm not quite understand this case, I think it still work when we we
 just specify one fd.

You are right, the QemuOpts visitor shows no incompatibility.

But there is also a QMP interface: netdev_add.  I think changing the
type to a string list breaks compatibility there.

Stefan
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 1/5] virtio: add functions for piecewise addition of buffers

2013-01-10 Thread Paolo Bonzini
Il 08/01/2013 01:12, Rusty Russell ha scritto:
  Unfortunately, that cannot work because not all architectures support
  chained scatterlists.
  
  WHAT?  I can't figure out what an arch needs to do to support this?
 
  It needs to use the iterator functions in its DMA driver.
 But we don't care for virtio.

True.

  All archs we care about support them, though, so I think we can ignore
  this issue for now.
 
  Kind of... In principle all QEMU-supported arches can use virtio, and
  the speedup can be quite useful.  And there is no Kconfig symbol for SG
  chains that I can use to disable virtio-scsi on unsupported arches. :/
 Well, we #error if it's not supported.  Then the lazy architectures can
 fix it.

Yeah, that would be one approach.

But frankly, your patch is really disgusting. :)  Not your fault, of
course, but I still prefer a limited amount of duplication.

Perhaps we can get the best of both worlds, I'll take a look when I have
some time.

Paolo
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] [PATCH 00/12] Multiqueue virtio-net

2013-01-10 Thread Stefan Hajnoczi
On Wed, Jan 09, 2013 at 11:33:25PM +0800, Jason Wang wrote:
 On 01/09/2013 11:32 PM, Michael S. Tsirkin wrote:
  On Wed, Jan 09, 2013 at 03:29:24PM +0100, Stefan Hajnoczi wrote:
  On Fri, Dec 28, 2012 at 06:31:52PM +0800, Jason Wang wrote:
  Perf Numbers:
 
  Two Intel Xeon 5620 with direct connected intel 82599EB
  Host/Guest kernel: David net tree
  vhost enabled
 
  - lots of improvents of both latency and cpu utilization in 
  request-reponse test
  - get regression of guest sending small packets which because TCP tends 
  to batch
less when the latency were improved
 
  1q/2q/4q
  TCP_RR
   size #sessions trans.rate  norm trans.rate  norm trans.rate  norm
  1 1 9393.26   595.64  9408.18   597.34  9375.19   584.12
  1 2072162.1   2214.24 129880.22 2456.13 196949.81 2298.13
  1 50107513.38 2653.99 139721.93 2490.58 259713.82 2873.57
  1 100   126734.63 2676.54 145553.5  2406.63 265252.68 2943
  64 19453.42   632.33  9371.37   616.13  9338.19   615.97
  64 20   70620.03  2093.68 125155.75 2409.15 191239.91 2253.32
  64 50   1069662448.29 146518.67 2514.47 242134.07 2720.91
  64 100  117046.35 2394.56 190153.09 2696.82 238881.29 2704.41
  256 1   8733.29   736.36  8701.07   680.83  8608.92   530.1
  256 20  69279.89  2274.45 115103.07 2299.76 144555.16 1963.53
  256 50  97676.02  2296.09 150719.57 2522.92 254510.5  3028.44
  256 100 150221.55 2949.56 197569.3  2790.92 300695.78 3494.83
  TCP_CRR
   size #sessions trans.rate  norm trans.rate  norm trans.rate  norm
  1 1 2848.37  163.41 2230.39  130.89 2013.09  120.47
  1 2023434.5  562.11 31057.43 531.07 49488.28 564.41
  1 5028514.88 582.17 40494.23 605.92 60113.35 654.97
  1 100   28827.22 584.73 48813.25 661.6  61783.62 676.56
  64 12780.08  159.4  2201.07  127.96 2006.8   117.63
  64 20   23318.51 564.47 30982.44 530.24 49734.95 566.13
  64 50   28585.72 582.54 40576.7  610.08 60167.89 656.56
  64 100  28747.37 584.17 49081.87 667.87 60612.94 662
  256 1   2772.08  160.51 2231.84  131.05 2003.62  113.45
  256 20  23086.35 559.8  30929.09 528.16 48454.9  555.22
  256 50  28354.7  579.85 40578.31 60760261.71 657.87
  256 100 28844.55 585.67 48541.86 659.08 61941.07 676.72
  TCP_STREAM guest receiving
   size #sessions throughput  norm throughput  norm throughput  norm
  1 1 16.27   1.33   16.11.12   16.13   0.99
  1 2 33.04   2.08   32.96   2.19   32.75   1.98
  1 4 66.62   6.83   68.35.56   66.14   2.65
  64 1896.55  56.67  914.02  58.14  898.9   61.56
  64 21830.46 91.02  1812.02 64.59  1835.57 66.26
  64 43626.61 142.55 3636.25 100.64 3607.46 75.03
  256 1   2619.49 131.23 2543.19 129.03 2618.69 132.39
  256 2   5136.58 203.02 5163.31 141.11 5236.51 149.4
  256 4   7063.99 242.83 9365.4  208.49 9421.03 159.94
  512 1   3592.43 165.24 3603.12 167.19 3552.5  169.57
  512 2   7042.62 246.59 7068.46 180.87 7258.52 186.3
  512 4   6996.08 241.49 9298.34 206.12 9418.52 159.33
  1024 1  4339.54 192.95 4370.2  191.92 4211.72 192.49
  1024 2  7439.45 254.77 9403.99 215.24 9120.82 222.67
  1024 4  7953.86 272.11 9403.87 208.23 9366.98 159.49
  4096 1  7696.28 272.04 7611.41 270.38 7778.71 267.76
  4096 2  7530.35 261.1  8905.43 246.27 8990.18 267.57
  4096 4  7121.6  247.02 9411.75 206.71 9654.96 184.67
  16384 1 7795.73 268.54 7780.94 267.2  7634.26 260.73
  16384 2 7436.57 255.81 9381.86 220.85 9392220.36
  16384 4 7199.07 247.81 9420.96 205.87 9373.69 159.57
  TCP_MAERTS guest sending
   size #sessions throughput  norm throughput  norm throughput  norm
  1 1 15.94   0.62   15.55   0.61   15.13   0.59
  1 2 36.11   0.83   32.46   0.69   32.28   0.69
  1 4 71.59   1  68.91   0.94   61.52   0.77
  64 1630.71  22.52  622.11  22.35  605.09  21.84
  64 21442.36 30.57  1292.15 25.82  1282.67 25.55
  64 43186.79 42.59  2844.96 36.03  2529.69 30.06
  256 1   1760.96 58.07  1738.44 57.43  1695.99 56.19
  256 2   4834.23 95.19  3524.85 64.21  3511.94 64.45
  256 4   9324.63 145.74 8956.49 116.39 6720.17 73.86
  512 1   2678.03 84.1   2630.68 82.93  2636.54 82.57
  512 2   9368.17 195.61 9408.82 204.53 5316.3  92.99
  512 4   9186.34 209.68 9358.72 183.82 9489.29 160.42
  1024 1  3620.71 109.88 3625.54 109.83 3606.61 112.35
  1024 2  9429258.32 7082.79 120.55 7403.53 134.78
  1024 4  9430.66 290.44 9499.29 232.31 9414.6  190.92
  4096 1  9339.28 296.48 9374.23 372.88 9348.76 298.49
  4096 2  9410.53 378.69 9412.61 286.18 9409.75 278.31
  4096 4  9487.35 374.1  9556.91 288.81 9441.94 221.64
  16384 1 9380.43 403.8  9379.78 399.13 9382.42 393.55
  16384 2 9367.69 406.93 9415.04 312.68 9409.29 300.9
  16384 4 9391.96 405.17 9695.12 310.54 9423.76 223.47
  Trying to understand the performance results:
 
  What is the host device configuration?  tap + bridge?
 
 Yes.
 
  Did you use host CPU affinity for the vhost threads?
 
 I use numactl to pin cpu threads and vhost threads in the same numa node.
  Can multiqueue tap take advantage of multiqueue host NICs or is
 

Re: [PATCH v9 2/3] x86, apicv: add virtual x2apic support

2013-01-10 Thread Gleb Natapov
On Thu, Jan 10, 2013 at 08:32:06AM +, Zhang, Yang Z wrote:
 Gleb Natapov wrote on 2013-01-10:
  On Thu, Jan 10, 2013 at 03:26:07PM +0800, Yang Zhang wrote:
  From: Yang Zhang yang.z.zh...@intel.com
  
  basically to benefit from apicv, we need to enable virtualized x2apic mode.
  Currently, we only enable it when guest is really using x2apic.
  
  Also, clear MSR bitmap for corresponding x2apic MSRs when guest enabled
  x2apic:
  0x800 - 0x8ff: no read intercept for apicv register virtualization,
except APIC ID and TMCCT.
  APIC ID and TMCCT: need software's assistance to get right value.
  TPR,EOI,SELF-IPI: no write intercept for virtual interrupt delivery.
  Signed-off-by: Kevin Tian kevin.t...@intel.com
  Signed-off-by: Yang Zhang yang.z.zh...@intel.com
  ---
   arch/x86/include/asm/kvm_host.h |2 + arch/x86/include/asm/vmx.h   
 |1 + arch/x86/kvm/lapic.c|5 +-
   arch/x86/kvm/svm.c  |6 + arch/x86/kvm/vmx.c   
 |  194 +-- 5 files changed, 200
   insertions(+), 8 deletions(-)
  diff --git a/arch/x86/include/asm/kvm_host.h
  b/arch/x86/include/asm/kvm_host.h index c431b33..572a562 100644 ---
  a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h
  @@ -697,6 +697,8 @@ struct kvm_x86_ops {
 void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
 void (*enable_irq_window)(struct kvm_vcpu *vcpu);
 void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
  +  void (*enable_virtual_x2apic_mode)(struct kvm_vcpu *vcpu);
  +  void (*disable_virtual_x2apic_mode)(struct kvm_vcpu *vcpu);
  Make one callback with enable/disable parameter. And do not forget SVM.
  
 int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
 int (*get_tdp_level)(void);
 u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
  diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
  index 44c3f7e..0a54df0 100644
  --- a/arch/x86/include/asm/vmx.h
  +++ b/arch/x86/include/asm/vmx.h
  @@ -139,6 +139,7 @@
   #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x0001 #define
   SECONDARY_EXEC_ENABLE_EPT   0x0002 #define
   SECONDARY_EXEC_RDTSCP 0x0008 +#define
   SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE   0x0010 #define
   SECONDARY_EXEC_ENABLE_VPID  0x0020 #define
   SECONDARY_EXEC_WBINVD_EXITING 0x0040 #define
   SECONDARY_EXEC_UNRESTRICTED_GUEST 0x0080
  diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
  index 0664c13..ec38906 100644
  --- a/arch/x86/kvm/lapic.c
  +++ b/arch/x86/kvm/lapic.c
  @@ -1328,7 +1328,10 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu,
  u64 value)
 u32 id = kvm_apic_id(apic);
 u32 ldr = ((id  4)  16) | (1  (id  0xf));
 kvm_apic_set_ldr(apic, ldr);
  -  }
  +  kvm_x86_ops-enable_virtual_x2apic_mode(vcpu);
  +  } else
  +  kvm_x86_ops-disable_virtual_x2apic_mode(vcpu);
  +
  You just broke SVM.
 apic-base_address = apic-vcpu-arch.apic_base 
  MSR_IA32_APICBASE_BASE;
  diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
  index d29d3cd..0b82cb1 100644
  --- a/arch/x86/kvm/svm.c
  +++ b/arch/x86/kvm/svm.c
  @@ -3571,6 +3571,11 @@ static void update_cr8_intercept(struct kvm_vcpu
  *vcpu, int tpr, int irr)
 set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
   }
  +static void svm_enable_virtual_x2apic_mode(struct kvm_vcpu *vcpu)
  +{
  +  return;
  +}
  +
   static int svm_nmi_allowed(struct kvm_vcpu *vcpu) {   struct vcpu_svm
   *svm = to_svm(vcpu); @@ -4290,6 +4295,7 @@ static struct kvm_x86_ops
   svm_x86_ops = {   .enable_nmi_window = enable_nmi_window,
 .enable_irq_window = enable_irq_window, .update_cr8_intercept =
   update_cr8_intercept,
  +  .enable_virtual_x2apic_mode = svm_enable_virtual_x2apic_mode,
  
 .set_tss_addr = svm_set_tss_addr,
 .get_tdp_level = get_npt_level,
  diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
  index 688f43f..b203ce7 100644
  --- a/arch/x86/kvm/vmx.c
  +++ b/arch/x86/kvm/vmx.c
  @@ -433,6 +433,8 @@ struct vcpu_vmx {
  
 bool rdtscp_enabled;
  +  bool virtual_x2apic_enabled;
  +
 /* Support for a guest hypervisor (nested VMX) */
 struct nested_vmx nested;
   };
  @@ -767,12 +769,23 @@ static inline bool
  cpu_has_vmx_virtualize_apic_accesses(void)
 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
   }
  +static inline bool cpu_has_vmx_virtualize_x2apic_mode(void)
  +{
  +  return vmcs_config.cpu_based_2nd_exec_ctrl 
  +  SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
  +}
  +
   static inline bool cpu_has_vmx_apic_register_virt(void)
   {
 return vmcs_config.cpu_based_2nd_exec_ctrl 
 SECONDARY_EXEC_APIC_REGISTER_VIRT;
   }
  +static inline bool cpu_has_vmx_virtual_intr_delivery(void)
  +{
  +  return false;
  +}
  +
   static inline bool 

Re: [PATCH v9 2/3] x86, apicv: add virtual x2apic support

2013-01-10 Thread Gleb Natapov
On Thu, Jan 10, 2013 at 08:31:51AM +, Zhang, Yang Z wrote:
 Gleb Natapov wrote on 2013-01-10:
  On Thu, Jan 10, 2013 at 03:26:07PM +0800, Yang Zhang wrote:
  +static void vmx_enable_virtual_x2apic_mode(struct kvm_vcpu *vcpu)
  +{
  +  u32 exec_control;
  +  int msr;
  +  struct vcpu_vmx *vmx = to_vmx(vcpu);
  +
  +  if (!cpu_has_vmx_virtualize_x2apic_mode())
  +  return;
  +
  +  exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
  +  /* virtualize x2apic mode relies on tpr shadow */
  +  if (!(exec_control  CPU_BASED_TPR_SHADOW))
  +  return;
  +
  +  exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
  +  exec_control = ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
  +  exec_control |= SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
  +  vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
  +  vmx-virtual_x2apic_enabled = true;
  +
  +  if (!cpu_has_vmx_virtual_intr_delivery())
  +  return;
  +
  +  for (msr = 0x800; msr = 0x8ff; msr++)
  +  vmx_intercept_for_msr_read(msr, false, false);
  +
  +  /* APIC ID */
  +  vmx_intercept_for_msr_read(0x802, false, true);
  Why are you enabling apic id read intercept?
 Current code to read apic id in x2apic mode has some hacks:
 
 if (apic_x2apic_mode(apic))
val = kvm_apic_id(apic);
 else
val = kvm_apic_id(apic)  24;
 
 static inline int kvm_apic_id(struct kvm_lapic *apic)
 {
 return (kvm_apic_get_reg(apic, APIC_ID)  24)  0xff;
 }
 
 According SPEC, in x2apic mode, the whole id reg is used, but in KVM, it only 
 use the highest eight bits.
 
Correct. Put the comment above apic id intercept that we do that since
in x2apic mode apic id is not correctly reflected in apic page.

  +  /* TMCCT */
  +  vmx_intercept_for_msr_read(0x839, false, true);
  +  /* TPR */
  +  vmx_intercept_for_msr_write(0x808, false, false);
  +  /* EOI */
  +  vmx_intercept_for_msr_write(0x80b, false, false);
  +  /* SELF-IPI */
  +  vmx_intercept_for_msr_write(0x83f, false, false);
  +
  +}
  +
  
  --
  Gleb.
 
 
 Best regards,
 Yang
 

--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] [PATCH 10/12] virtio-net: multiqueue support

2013-01-10 Thread Wanlong Gao
On 01/10/2013 03:16 PM, Jason Wang wrote:
 On Thursday, January 10, 2013 02:49:14 PM Wanlong Gao wrote:
 On 01/10/2013 02:43 PM, Jason Wang wrote:
 On Wednesday, January 09, 2013 11:26:33 PM Jason Wang wrote:
 On 01/09/2013 06:01 PM, Wanlong Gao wrote:
 On 01/09/2013 05:30 PM, Jason Wang wrote:
 On 01/09/2013 04:23 PM, Wanlong Gao wrote:
 On 01/08/2013 06:14 PM, Jason Wang wrote:
 On 01/08/2013 06:00 PM, Wanlong Gao wrote:
 On 01/08/2013 05:51 PM, Jason Wang wrote:
 On 01/08/2013 05:49 PM, Wanlong Gao wrote:
 On 01/08/2013 05:29 PM, Jason Wang wrote:
 On 01/08/2013 05:07 PM, Wanlong Gao wrote:
 On 12/28/2012 06:32 PM, Jason Wang wrote:
 +} else if (nc-peer-info-type !=
 NET_CLIENT_OPTIONS_KIND_TAP) {
 +ret = -1;
 +} else {
 +ret = tap_detach(nc-peer);
 +}
 +
 +return ret;
 +}
 +

 [...]

 I got guest kernel panic when using this way and set queues=4.

 Does it happens w/o or w/ a fd parameter? What's the qemu command line?
 Did you meet it during boot time?

 The QEMU command line is

 /work/git/qemu/x86_64-softmmu/qemu-system-x86_64 -name f17 -M pc-0.15
 -enable-kvm -m 3096 \ -smp 4,sockets=4,cores=1,threads=1 \
 -uuid c31a9f3e-4161-c53a-339c-5dc36d0497cb -no-user-config -nodefaults \
 -chardev
 socket,id=charmonitor,path=/var/lib/libvirt/qemu/f17.monitor,server,nowa
 i
 t \ -mon chardev=charmonitor,id=monitor,mode=control \
 -rtc base=utc -no-shutdown \
 -device piix3-usb-uhci,id=usb,bus=pci.0,addr=0x1.0x2 \
 -device
 virtio-scsi-pci,id=scsi0,bus=pci.0,addr=0xb,num_queues=4,hotplug=on \
 -device virtio-serial-pci,id=virtio-serial0,bus=pci.0,addr=0x5 \
 -drive file=/vm/f17.img,if=none,id=drive-virtio-disk0,format=qcow2 \
 -device
 virtio-blk-pci,scsi=off,bus=pci.0,addr=0x6,drive=drive-virtio-disk0,id=v
 i
 rtio-disk0,bootindex=1 \ -drive
 file=/vm2/f17-kernel.img,if=none,id=drive-virtio-disk1,format=qcow2 \
 -device
 virtio-blk-pci,scsi=off,bus=pci.0,addr=0x8,drive=drive-virtio-disk1,id=v
 i
 rtio-disk1 \ -drive
 file=/vm/virtio-scsi/scsi3.img,if=none,id=drive-scsi0-0-2-0,format=raw \
 -device
 scsi-hd,bus=scsi0.0,channel=0,scsi-id=0,lun=2,drive=drive-scsi0-0-2-0,id
 =
 scsi0-0-2-0,removable=on \ -drive
 file=/vm/virtio-scsi/scsi4.img,if=none,id=drive-scsi0-0-3-0,format=raw \
 -device
 scsi-hd,bus=scsi0.0,channel=0,scsi-id=0,lun=3,drive=drive-scsi0-0-3-0,id
 =
 scsi0-0-3-0 \ -drive
 file=/vm/virtio-scsi/scsi1.img,if=none,id=drive-scsi0-0-0-0,format=raw \
 -device
 scsi-hd,bus=scsi0.0,channel=0,scsi-id=0,lun=0,drive=drive-scsi0-0-0-0,id
 =
 scsi0-0-0-0 \ -drive
 file=/vm/virtio-scsi/scsi2.img,if=none,id=drive-scsi0-0-1-0,format=raw \
 -device
 scsi-hd,bus=scsi0.0,channel=0,scsi-id=0,lun=1,drive=drive-scsi0-0-1-0,id
 =
 scsi0-0-1-0 \ -chardev pty,id=charserial0 -device
 isa-serial,chardev=charserial0,id=serial0 \ -chardev
 file,id=charserial1,path=/vm/f17.log \
 -device isa-serial,chardev=charserial1,id=serial1 \
 -device usb-tablet,id=input0 -vga std \
 -device virtio-balloon-pci,id=balloon0,bus=pci.0,addr=0x7 \
 -netdev tap,id=hostnet0,vhost=on,queues=4 \
 -device
 virtio-net-pci,netdev=hostnet0,id=net0,mac=52:54:00:ce:7b:29,bus=pci.0,a
 d
 dr=0x3 \ -monitor stdio

 I got panic just after booting the system, did nothing,  waited for a
 while, the guest panicked.

 [   28.053004] BUG: soft lockup - CPU#1 stuck for 23s! [ip:592]
 [   28.053004] Modules linked in: ip6t_REJECT nf_conntrack_ipv6
 nf_defrag_ipv6 xt_state nf_conntrack ip6table_filter ip6_tables uinput
 joydev microcode virtio_balloon pcspkr virtio_net i2c_piix4 i2c_core
 virtio_scsi virtio_blk floppy [   28.053004] CPU 1
 [   28.053004] Pid: 592, comm: ip Not tainted 3.8.0-rc1-net+ #3 Bochs
 Bochs
 [   28.053004] RIP: 0010:[8137a9ab]  [8137a9ab]
 virtqueue_get_buf+0xb/0x120 [   28.053004] RSP: 0018:8800bc913550
 EFLAGS: 0246
 [   28.053004] RAX:  RBX: 8800bc49c000 RCX:
 8800bc49e000 [   28.053004] RDX:  RSI:
 8800bc913584 RDI: 8800bcfd4000 [   28.053004] RBP:
 8800bc913558 R08: 8800bcfd0800 R09:  [
 28.053004] R10: 8800bc49c000 R11: 880036cc4de0 R12:
 8800bcfd4000 [   28.053004] R13: 8800bc913558 R14:
 8137ad73 R15: 000200d0 [   28.053004] FS:
 7fb27a589740() GS:8800c148() knlGS:
 [

   28.053004] CS:  0010 DS:  ES:  CR0: 80050033

 [   28.053004] CR2: 00640530 CR3: baeff000 CR4:
 06e0 [   28.053004] DR0:  DR1:
  DR2:  [   28.053004] DR3:
  DR6: 0ff0 DR7: 0400 [
 28.053004] Process ip (pid: 592, threadinfo 8800bc912000, task
 880036da2e20) [   28.053004] Stack:
 [   28.053004]  8800bcfd0800 8800bc913638 a003e9bb
 8800bc913656 [   28.053004]  00010002 8800c17ebb08
 0050ff10 ea0002f244c0 [   28.053004]  00020582
   ea0002f244c0 [  

Re: [Qemu-devel] [PATCH 00/12] Multiqueue virtio-net

2013-01-10 Thread Jason Wang
On 01/10/2013 04:44 PM, Stefan Hajnoczi wrote:
 On Wed, Jan 09, 2013 at 11:33:25PM +0800, Jason Wang wrote:
 On 01/09/2013 11:32 PM, Michael S. Tsirkin wrote:
 On Wed, Jan 09, 2013 at 03:29:24PM +0100, Stefan Hajnoczi wrote:
 On Fri, Dec 28, 2012 at 06:31:52PM +0800, Jason Wang wrote:
 Perf Numbers:

 Two Intel Xeon 5620 with direct connected intel 82599EB
 Host/Guest kernel: David net tree
 vhost enabled

 - lots of improvents of both latency and cpu utilization in 
 request-reponse test
 - get regression of guest sending small packets which because TCP tends 
 to batch
   less when the latency were improved

 1q/2q/4q
 TCP_RR
  size #sessions trans.rate  norm trans.rate  norm trans.rate  norm
 1 1 9393.26   595.64  9408.18   597.34  9375.19   584.12
 1 2072162.1   2214.24 129880.22 2456.13 196949.81 2298.13
 1 50107513.38 2653.99 139721.93 2490.58 259713.82 2873.57
 1 100   126734.63 2676.54 145553.5  2406.63 265252.68 2943
 64 19453.42   632.33  9371.37   616.13  9338.19   615.97
 64 20   70620.03  2093.68 125155.75 2409.15 191239.91 2253.32
 64 50   1069662448.29 146518.67 2514.47 242134.07 2720.91
 64 100  117046.35 2394.56 190153.09 2696.82 238881.29 2704.41
 256 1   8733.29   736.36  8701.07   680.83  8608.92   530.1
 256 20  69279.89  2274.45 115103.07 2299.76 144555.16 1963.53
 256 50  97676.02  2296.09 150719.57 2522.92 254510.5  3028.44
 256 100 150221.55 2949.56 197569.3  2790.92 300695.78 3494.83
 TCP_CRR
  size #sessions trans.rate  norm trans.rate  norm trans.rate  norm
 1 1 2848.37  163.41 2230.39  130.89 2013.09  120.47
 1 2023434.5  562.11 31057.43 531.07 49488.28 564.41
 1 5028514.88 582.17 40494.23 605.92 60113.35 654.97
 1 100   28827.22 584.73 48813.25 661.6  61783.62 676.56
 64 12780.08  159.4  2201.07  127.96 2006.8   117.63
 64 20   23318.51 564.47 30982.44 530.24 49734.95 566.13
 64 50   28585.72 582.54 40576.7  610.08 60167.89 656.56
 64 100  28747.37 584.17 49081.87 667.87 60612.94 662
 256 1   2772.08  160.51 2231.84  131.05 2003.62  113.45
 256 20  23086.35 559.8  30929.09 528.16 48454.9  555.22
 256 50  28354.7  579.85 40578.31 60760261.71 657.87
 256 100 28844.55 585.67 48541.86 659.08 61941.07 676.72
 TCP_STREAM guest receiving
  size #sessions throughput  norm throughput  norm throughput  norm
 1 1 16.27   1.33   16.11.12   16.13   0.99
 1 2 33.04   2.08   32.96   2.19   32.75   1.98
 1 4 66.62   6.83   68.35.56   66.14   2.65
 64 1896.55  56.67  914.02  58.14  898.9   61.56
 64 21830.46 91.02  1812.02 64.59  1835.57 66.26
 64 43626.61 142.55 3636.25 100.64 3607.46 75.03
 256 1   2619.49 131.23 2543.19 129.03 2618.69 132.39
 256 2   5136.58 203.02 5163.31 141.11 5236.51 149.4
 256 4   7063.99 242.83 9365.4  208.49 9421.03 159.94
 512 1   3592.43 165.24 3603.12 167.19 3552.5  169.57
 512 2   7042.62 246.59 7068.46 180.87 7258.52 186.3
 512 4   6996.08 241.49 9298.34 206.12 9418.52 159.33
 1024 1  4339.54 192.95 4370.2  191.92 4211.72 192.49
 1024 2  7439.45 254.77 9403.99 215.24 9120.82 222.67
 1024 4  7953.86 272.11 9403.87 208.23 9366.98 159.49
 4096 1  7696.28 272.04 7611.41 270.38 7778.71 267.76
 4096 2  7530.35 261.1  8905.43 246.27 8990.18 267.57
 4096 4  7121.6  247.02 9411.75 206.71 9654.96 184.67
 16384 1 7795.73 268.54 7780.94 267.2  7634.26 260.73
 16384 2 7436.57 255.81 9381.86 220.85 9392220.36
 16384 4 7199.07 247.81 9420.96 205.87 9373.69 159.57
 TCP_MAERTS guest sending
  size #sessions throughput  norm throughput  norm throughput  norm
 1 1 15.94   0.62   15.55   0.61   15.13   0.59
 1 2 36.11   0.83   32.46   0.69   32.28   0.69
 1 4 71.59   1  68.91   0.94   61.52   0.77
 64 1630.71  22.52  622.11  22.35  605.09  21.84
 64 21442.36 30.57  1292.15 25.82  1282.67 25.55
 64 43186.79 42.59  2844.96 36.03  2529.69 30.06
 256 1   1760.96 58.07  1738.44 57.43  1695.99 56.19
 256 2   4834.23 95.19  3524.85 64.21  3511.94 64.45
 256 4   9324.63 145.74 8956.49 116.39 6720.17 73.86
 512 1   2678.03 84.1   2630.68 82.93  2636.54 82.57
 512 2   9368.17 195.61 9408.82 204.53 5316.3  92.99
 512 4   9186.34 209.68 9358.72 183.82 9489.29 160.42
 1024 1  3620.71 109.88 3625.54 109.83 3606.61 112.35
 1024 2  9429258.32 7082.79 120.55 7403.53 134.78
 1024 4  9430.66 290.44 9499.29 232.31 9414.6  190.92
 4096 1  9339.28 296.48 9374.23 372.88 9348.76 298.49
 4096 2  9410.53 378.69 9412.61 286.18 9409.75 278.31
 4096 4  9487.35 374.1  9556.91 288.81 9441.94 221.64
 16384 1 9380.43 403.8  9379.78 399.13 9382.42 393.55
 16384 2 9367.69 406.93 9415.04 312.68 9409.29 300.9
 16384 4 9391.96 405.17 9695.12 310.54 9423.76 223.47
 Trying to understand the performance results:

 What is the host device configuration?  tap + bridge?
 Yes.
 Did you use host CPU affinity for the vhost threads?
 I use numactl to pin cpu threads and vhost threads in the same numa node.
 Can multiqueue tap take advantage of multiqueue host NICs or is
 virtio-net multiqueue unaware of the physical NIC 

Re: [Qemu-devel] [PATCH 10/12] virtio-net: multiqueue support

2013-01-10 Thread Jason Wang
On 01/10/2013 05:06 PM, Wanlong Gao wrote:
 On 01/10/2013 03:16 PM, Jason Wang wrote:
 On Thursday, January 10, 2013 02:49:14 PM Wanlong Gao wrote:
 On 01/10/2013 02:43 PM, Jason Wang wrote:
 On Wednesday, January 09, 2013 11:26:33 PM Jason Wang wrote:
 On 01/09/2013 06:01 PM, Wanlong Gao wrote:
 On 01/09/2013 05:30 PM, Jason Wang wrote:
 On 01/09/2013 04:23 PM, Wanlong Gao wrote:
 On 01/08/2013 06:14 PM, Jason Wang wrote:
 On 01/08/2013 06:00 PM, Wanlong Gao wrote:
 On 01/08/2013 05:51 PM, Jason Wang wrote:
 On 01/08/2013 05:49 PM, Wanlong Gao wrote:
 On 01/08/2013 05:29 PM, Jason Wang wrote:
 On 01/08/2013 05:07 PM, Wanlong Gao wrote:
 On 12/28/2012 06:32 PM, Jason Wang wrote:
 +} else if (nc-peer-info-type !=
 NET_CLIENT_OPTIONS_KIND_TAP) {
 +ret = -1;
 +} else {
 +ret = tap_detach(nc-peer);
 +}
 +
 +return ret;
 +}
 +
 [...]

 I got guest kernel panic when using this way and set queues=4.
 Does it happens w/o or w/ a fd parameter? What's the qemu command line?
 Did you meet it during boot time?
 The QEMU command line is

 /work/git/qemu/x86_64-softmmu/qemu-system-x86_64 -name f17 -M pc-0.15
 -enable-kvm -m 3096 \ -smp 4,sockets=4,cores=1,threads=1 \
 -uuid c31a9f3e-4161-c53a-339c-5dc36d0497cb -no-user-config -nodefaults \
 -chardev
 socket,id=charmonitor,path=/var/lib/libvirt/qemu/f17.monitor,server,nowa
 i
 t \ -mon chardev=charmonitor,id=monitor,mode=control \
 -rtc base=utc -no-shutdown \
 -device piix3-usb-uhci,id=usb,bus=pci.0,addr=0x1.0x2 \
 -device
 virtio-scsi-pci,id=scsi0,bus=pci.0,addr=0xb,num_queues=4,hotplug=on \
 -device virtio-serial-pci,id=virtio-serial0,bus=pci.0,addr=0x5 \
 -drive file=/vm/f17.img,if=none,id=drive-virtio-disk0,format=qcow2 \
 -device
 virtio-blk-pci,scsi=off,bus=pci.0,addr=0x6,drive=drive-virtio-disk0,id=v
 i
 rtio-disk0,bootindex=1 \ -drive
 file=/vm2/f17-kernel.img,if=none,id=drive-virtio-disk1,format=qcow2 \
 -device
 virtio-blk-pci,scsi=off,bus=pci.0,addr=0x8,drive=drive-virtio-disk1,id=v
 i
 rtio-disk1 \ -drive
 file=/vm/virtio-scsi/scsi3.img,if=none,id=drive-scsi0-0-2-0,format=raw \
 -device
 scsi-hd,bus=scsi0.0,channel=0,scsi-id=0,lun=2,drive=drive-scsi0-0-2-0,id
 =
 scsi0-0-2-0,removable=on \ -drive
 file=/vm/virtio-scsi/scsi4.img,if=none,id=drive-scsi0-0-3-0,format=raw \
 -device
 scsi-hd,bus=scsi0.0,channel=0,scsi-id=0,lun=3,drive=drive-scsi0-0-3-0,id
 =
 scsi0-0-3-0 \ -drive
 file=/vm/virtio-scsi/scsi1.img,if=none,id=drive-scsi0-0-0-0,format=raw \
 -device
 scsi-hd,bus=scsi0.0,channel=0,scsi-id=0,lun=0,drive=drive-scsi0-0-0-0,id
 =
 scsi0-0-0-0 \ -drive
 file=/vm/virtio-scsi/scsi2.img,if=none,id=drive-scsi0-0-1-0,format=raw \
 -device
 scsi-hd,bus=scsi0.0,channel=0,scsi-id=0,lun=1,drive=drive-scsi0-0-1-0,id
 =
 scsi0-0-1-0 \ -chardev pty,id=charserial0 -device
 isa-serial,chardev=charserial0,id=serial0 \ -chardev
 file,id=charserial1,path=/vm/f17.log \
 -device isa-serial,chardev=charserial1,id=serial1 \
 -device usb-tablet,id=input0 -vga std \
 -device virtio-balloon-pci,id=balloon0,bus=pci.0,addr=0x7 \
 -netdev tap,id=hostnet0,vhost=on,queues=4 \
 -device
 virtio-net-pci,netdev=hostnet0,id=net0,mac=52:54:00:ce:7b:29,bus=pci.0,a
 d
 dr=0x3 \ -monitor stdio

 I got panic just after booting the system, did nothing,  waited for a
 while, the guest panicked.

 [   28.053004] BUG: soft lockup - CPU#1 stuck for 23s! [ip:592]
 [   28.053004] Modules linked in: ip6t_REJECT nf_conntrack_ipv6
 nf_defrag_ipv6 xt_state nf_conntrack ip6table_filter ip6_tables uinput
 joydev microcode virtio_balloon pcspkr virtio_net i2c_piix4 i2c_core
 virtio_scsi virtio_blk floppy [   28.053004] CPU 1
 [   28.053004] Pid: 592, comm: ip Not tainted 3.8.0-rc1-net+ #3 Bochs
 Bochs
 [   28.053004] RIP: 0010:[8137a9ab]  [8137a9ab]
 virtqueue_get_buf+0xb/0x120 [   28.053004] RSP: 0018:8800bc913550
 EFLAGS: 0246
 [   28.053004] RAX:  RBX: 8800bc49c000 RCX:
 8800bc49e000 [   28.053004] RDX:  RSI:
 8800bc913584 RDI: 8800bcfd4000 [   28.053004] RBP:
 8800bc913558 R08: 8800bcfd0800 R09:  [
 28.053004] R10: 8800bc49c000 R11: 880036cc4de0 R12:
 8800bcfd4000 [   28.053004] R13: 8800bc913558 R14:
 8137ad73 R15: 000200d0 [   28.053004] FS:
 7fb27a589740() GS:8800c148() knlGS:
 [

   28.053004] CS:  0010 DS:  ES:  CR0: 80050033

 [   28.053004] CR2: 00640530 CR3: baeff000 CR4:
 06e0 [   28.053004] DR0:  DR1:
  DR2:  [   28.053004] DR3:
  DR6: 0ff0 DR7: 0400 [
 28.053004] Process ip (pid: 592, threadinfo 8800bc912000, task
 880036da2e20) [   28.053004] Stack:
 [   28.053004]  8800bcfd0800 8800bc913638 a003e9bb
 8800bc913656 [   28.053004]  00010002 8800c17ebb08
 0050ff10 ea0002f244c0 [   28.053004]  00020582
 

Re: [okeanos-dev] Re: KVM versions, machine types and failed migrations

2013-01-10 Thread Daniel P. Berrange
On Wed, Jan 09, 2013 at 03:27:53PM +0200, Vangelis Koukis wrote:
 On Wed, Jan 09, 2013 at 01:10:45pm +, Daniel P. Berrange wrote:
  When doing migration, the fundamental requirement is that the guest
  OS visible machine ABI must not change. Thus there are three key
  things to take care of when launching QEMU on the migration target
  host.
  
   - The device PCI/USB addresses must be identical to the source
   - The machine type must be identical to the source
   - The CPU model must be identical to the source
  
 
 Thanks for the detailed list of requirements, we'll take it into account
 for the relevant Ganeti patch.
 
  If you don't follow those requirements, either QEMU or the guest OS
  or both will crash  burn during migration  you get to keep both
  pieces :-)
  
 
 My point is, are these requirements left up to the caller of kvm
 -incoming to satisfy? Since the migration will most probably break,
 wouldn't it be best for QEMU to detect this and complain loudly, instead
 of continuing with the migration, failing silently and destroying the
 VM?
 
 Sure there could be some yes, do it, I know it is going to break
 option, which will make QEMU proceed with the migration. However, in 99%
 of the cases this is just user error, e.g. the user has upgraded the
 version on the other end and has not specified -M explicitly. It would
 be best if QEMU was able to detect and warn the user about what is going
 to happen, because it does lead to the VM dying.

What you describe is certainly desirable, but it is quite hard to achieve
with current QEMU. Much of the work with moving to the new QEMU object
model  configuration descriptions has been motivated by a desire to
enable improvements migration handling. As you suggest, the goal is that
the source QEMU be able to send a complete  reliable hardware description
to the destination QEMU during migration.It is getting closer, but we're
not there yet.

Regards,
Daniel
-- 
|: http://berrange.com  -o-http://www.flickr.com/photos/dberrange/ :|
|: http://libvirt.org  -o- http://virt-manager.org :|
|: http://autobuild.org   -o- http://search.cpan.org/~danberr/ :|
|: http://entangle-photo.org   -o-   http://live.gnome.org/gtk-vnc :|
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [kvmarm] [PATCH v5.1 0/2] KVM: ARM: Rename KVM_SET_DEVICE_ADDRESS

2013-01-10 Thread Peter Maydell
On 9 January 2013 22:30, Alexander Graf ag...@suse.de wrote:
 In fact, in this particular case one could merge all of the patches except 
 for the
 particular ioctl implementation and just give the respective addresses default
 values until there is an API to set them, similar to how we did things with 
 PVR
 in the beginning.

That's what we started with and we got review comments saying yuck,
don't hard code this...

-- PMM
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 01/12] tap: multiqueue support

2013-01-10 Thread Stefan Hajnoczi
On Fri, Dec 28, 2012 at 06:31:53PM +0800, Jason Wang wrote:

Mainly suggestions to make the code easier to understand, but see the
comment about the 1:1 queue/NetClientState model for a general issue
with this approach.

 Recently, linux support multiqueue tap which could let userspace call 
 TUNSETIFF
 for a signle device many times to create multiple file descriptors as

s/signle/single/

(Noting these if you respin.)

 independent queues. User could also enable/disabe a specific queue through

s/disabe/disable/

 TUNSETQUEUE.
 
 The patch adds the generic infrastructure to create multiqueue taps. To 
 achieve
 this a new parameter queues were introduced to specify how many queues were
 expected to be created for tap. The fd parameter were also changed to 
 support
 a list of file descriptors which could be used by management (such as libvirt)
 to pass pre-created file descriptors (queues) to qemu.
 
 Each TAPState were still associated to a tap fd, which mean multiple TAPStates
 were created when user needs multiqueue taps.
 
 Only linux part were implemented now, since it's the only OS that support
 multiqueue tap.
 
 Signed-off-by: Jason Wang jasow...@redhat.com
 ---
  net/tap-aix.c |   18 -
  net/tap-bsd.c |   18 -
  net/tap-haiku.c   |   18 -
  net/tap-linux.c   |   70 +++-
  net/tap-linux.h   |4 +
  net/tap-solaris.c |   18 -
  net/tap-win32.c   |   10 ++
  net/tap.c |  248 
 +
  net/tap.h |8 ++-
  qapi-schema.json  |5 +-
  10 files changed, 335 insertions(+), 82 deletions(-)

This patch should be split up:
1. linux-headers: import linux/if_tun.h multiqueue constants
2. tap: add Linux multiqueue support (tap_open(), tap_fd_attach(), 
tap_fd_detach())
3. tap: queue attach/detach (tap_attach(), tap_detach())
4. tap: split out net_init_one_tap() function (pure code motion, to make later 
diffs easy to review)
5. tap: add queues and multi-fd options (net_init_tap()/net_init_one_tap() 
changes)

Each commit description can explain how this works in more detail.  I
think I've figured it out now but it would have helped to separate
things out from the start.

 diff --git a/net/tap-aix.c b/net/tap-aix.c
 index f27c177..f931ef3 100644
 --- a/net/tap-aix.c
 +++ b/net/tap-aix.c
 @@ -25,7 +25,8 @@
  #include net/tap.h
  #include stdio.h
  
 -int tap_open(char *ifname, int ifname_size, int *vnet_hdr, int 
 vnet_hdr_required)
 +int tap_open(char *ifname, int ifname_size, int *vnet_hdr,
 + int vnet_hdr_required, int mq_required)
  {
  fprintf(stderr, no tap on AIX\n);
  return -1;
 @@ -59,3 +60,18 @@ void tap_fd_set_offload(int fd, int csum, int tso4,
  int tso6, int ecn, int ufo)
  {
  }
 +
 +int tap_fd_attach(int fd)
 +{
 +return -1;
 +}
 +
 +int tap_fd_detach(int fd)
 +{
 +return -1;
 +}
 +
 +int tap_fd_ifname(int fd, char *ifname)
 +{
 +return -1;
 +}
 diff --git a/net/tap-bsd.c b/net/tap-bsd.c
 index a3b717d..07c287d 100644
 --- a/net/tap-bsd.c
 +++ b/net/tap-bsd.c
 @@ -33,7 +33,8 @@
  #include net/if_tap.h
  #endif
  
 -int tap_open(char *ifname, int ifname_size, int *vnet_hdr, int 
 vnet_hdr_required)
 +int tap_open(char *ifname, int ifname_size, int *vnet_hdr,
 + int vnet_hdr_required, int mq_required)
  {
  int fd;
  #ifdef TAPGIFNAME
 @@ -145,3 +146,18 @@ void tap_fd_set_offload(int fd, int csum, int tso4,
  int tso6, int ecn, int ufo)
  {
  }
 +
 +int tap_fd_attach(int fd)
 +{
 +return -1;
 +}
 +
 +int tap_fd_detach(int fd)
 +{
 +return -1;
 +}
 +
 +int tap_fd_ifname(int fd, char *ifname)
 +{
 +return -1;
 +}
 diff --git a/net/tap-haiku.c b/net/tap-haiku.c
 index 34739d1..62ab423 100644
 --- a/net/tap-haiku.c
 +++ b/net/tap-haiku.c
 @@ -25,7 +25,8 @@
  #include net/tap.h
  #include stdio.h
  
 -int tap_open(char *ifname, int ifname_size, int *vnet_hdr, int 
 vnet_hdr_required)
 +int tap_open(char *ifname, int ifname_size, int *vnet_hdr,
 + int vnet_hdr_required, int mq_required)
  {
  fprintf(stderr, no tap on Haiku\n);
  return -1;
 @@ -59,3 +60,18 @@ void tap_fd_set_offload(int fd, int csum, int tso4,
  int tso6, int ecn, int ufo)
  {
  }
 +
 +int tap_fd_attach(int fd)
 +{
 +return -1;
 +}
 +
 +int tap_fd_detach(int fd)
 +{
 +return -1;
 +}
 +
 +int tap_fd_ifname(int fd, char *ifname)
 +{
 +return -1;
 +}
 diff --git a/net/tap-linux.c b/net/tap-linux.c
 index c6521be..0854ef5 100644
 --- a/net/tap-linux.c
 +++ b/net/tap-linux.c
 @@ -35,7 +35,8 @@
  
  #define PATH_NET_TUN /dev/net/tun
  
 -int tap_open(char *ifname, int ifname_size, int *vnet_hdr, int 
 vnet_hdr_required)
 +int tap_open(char *ifname, int ifname_size, int *vnet_hdr,
 + int vnet_hdr_required, int mq_required)
  {
  struct ifreq ifr;
  int fd, ret;
 @@ -67,6 +68,20 @@ int tap_open(char *ifname, int ifname_size, int *vnet_hdr, 
 int 

Re: [kvmarm] [PATCH v5.1 0/2] KVM: ARM: Rename KVM_SET_DEVICE_ADDRESS

2013-01-10 Thread Alexander Graf


Am 10.01.2013 um 11:17 schrieb Peter Maydell peter.mayd...@linaro.org:

 On 9 January 2013 22:30, Alexander Graf ag...@suse.de wrote:
 In fact, in this particular case one could merge all of the patches except 
 for the
 particular ioctl implementation and just give the respective addresses 
 default
 values until there is an API to set them, similar to how we did things with 
 PVR
 in the beginning.
 
 That's what we started with and we got review comments saying yuck,
 don't hard code this...

And you shouldn't in the long run. But as an intermediate step, it's the best 
solution IMHO.

Either way, let's just finally get something in that doesn't clutter the global 
namespace :).

Alex

 
 -- PMM
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [kvmarm] [PATCH v5.1 0/2] KVM: ARM: Rename KVM_SET_DEVICE_ADDRESS

2013-01-10 Thread Alexander Graf

On 09.01.2013, at 23:34, Alexander Graf wrote:

 
 On 09.01.2013, at 23:26, Christoffer Dall wrote:
 
 On Wed, Jan 9, 2013 at 5:10 PM, Scott Wood scottw...@freescale.com wrote:
 On 01/09/2013 03:37:20 PM, Alexander Graf wrote:
 
 
 
 Am 09.01.2013 um 22:15 schrieb Scott Wood scottw...@freescale.com:
 
 I get that there's a tradeoff between getting something in now, versus
 waiting until the API is more refined.  Tagging it with a particular ISA
 seems like an odd way of saying soon to be deprecated, though.  What
 happens if we're still squabbling over the perfect replacement API when
 we're trying to push PPC MPIC stuff in?
 
 Then we're the ones who have to come up with a good interface.
 
 
 How about another bad one, with PPC in the name, and some pleas to hurry
 things up? :-)
 
 It's not as if there haven't been last-minute requests for API changes on
 the PPC side in the past...
 
 
 
 This is getting out of hand.
 
 Do you have another API for PPC, which was send for review and not
 commented on several months ago that we can unify right now?
 
 If not, let's go with the ARM name and work on the generic API in the mean 
 time.
 
 The end result will be something along 5 lines in a header files and 3
 lines in a switch case that return -EINVAL if the interface is
 completely deprecated later on, which is not a big problem.
 
 Agreed [1].
 
 So what exactly are we waiting for? Acks from kvm maintainers, right?

In fact, we should probably CC them :)


Alex

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [kvmarm] [PATCH v5.1 0/2] KVM: ARM: Rename KVM_SET_DEVICE_ADDRESS

2013-01-10 Thread Gleb Natapov
On Thu, Jan 10, 2013 at 12:15:55PM +0100, Alexander Graf wrote:
 
 On 09.01.2013, at 23:34, Alexander Graf wrote:
 
  
  On 09.01.2013, at 23:26, Christoffer Dall wrote:
  
  On Wed, Jan 9, 2013 at 5:10 PM, Scott Wood scottw...@freescale.com wrote:
  On 01/09/2013 03:37:20 PM, Alexander Graf wrote:
  
  
  
  Am 09.01.2013 um 22:15 schrieb Scott Wood scottw...@freescale.com:
  
  I get that there's a tradeoff between getting something in now, versus
  waiting until the API is more refined.  Tagging it with a particular ISA
  seems like an odd way of saying soon to be deprecated, though.  What
  happens if we're still squabbling over the perfect replacement API when
  we're trying to push PPC MPIC stuff in?
  
  Then we're the ones who have to come up with a good interface.
  
  
  How about another bad one, with PPC in the name, and some pleas to hurry
  things up? :-)
  
  It's not as if there haven't been last-minute requests for API changes on
  the PPC side in the past...
  
  
  
  This is getting out of hand.
  
  Do you have another API for PPC, which was send for review and not
  commented on several months ago that we can unify right now?
  
  If not, let's go with the ARM name and work on the generic API in the mean 
  time.
  
  The end result will be something along 5 lines in a header files and 3
  lines in a switch case that return -EINVAL if the interface is
  completely deprecated later on, which is not a big problem.
  
  Agreed [1].
  
  So what exactly are we waiting for? Acks from kvm maintainers, right?
 
 In fact, we should probably CC them :)
 
 
I am looking at them right now :) Give me a couple of days please.

--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC PATCH 0/9] KVM: PPC: In-kernel PAPR interrupt controller emulation

2013-01-10 Thread Alexander Graf

On 10.01.2013, at 06:09, Paul Mackerras wrote:

 On Sat, Dec 15, 2012 at 01:06:13PM +1100, Benjamin Herrenschmidt wrote:
 On Sat, 2012-12-15 at 01:46 +0100, Alexander Graf wrote:
 On 05.11.2012, at 04:18, Paul Mackerras wrote:
 
 This series implements in-kernel emulation of the XICS interrupt
 controller specified in the PAPR specification and used by pseries
 guests.  Since the XICS is organized a bit differently to the
 interrupt controllers used on x86 machines, I have defined some new
 ioctls for setting up the XICS and for saving and restoring its
 state.  It may be possible to use some of the currently x86-specific
 ioctls instead.
 
 Is this one already following the new world order that we discussed during 
 KVM Forum? :)
 
 The new world order  (sorry, looks like nobody took notes and
 people expect me to do a write up from memory now ... :-)
 
 Well, mostly we agreed that the x86 stuff wasn't going to work for us.
 
 So basically what we discussed boils down to:
 
 - Move the existing generic KVM ioctl to create the kernel APIC to
 x86 since it's no as-is useful for other archs who, among other things,
 might need to pass argument based on the machine type (type of interrupt
 subsystem for example, etc...)
 
 Assuming you're talking about KVM_CREATE_IRQCHIP, it is already
 handled entirely in arch code (arch/x86/kvm/x86.c and
 arch/ia64/kvm/kvm-ia64.c), along with KVM_GET_IRQCHIP and
 KVM_SET_IRQCHIP.

This part is about QEMU.

 
 - Once that's done, well, instanciating interrupt controller objects
 becomes pretty much an arch specific matter. We could try to keep some
 ioctls somewhat common though it's not *that* useful because the various
 types  arguments are going to be fairly arch specific, so goes for
 configuration. Examples of what could be kept common are:
 
   * Create irq chip, takes at least a type argument, possibly a few
 more type-specific args (or a union, but then let's keep space in there
 since we can't change the size of the struct later as it would impact
 the ioctl number afaik).
 
 The existing KVM_CREATE_IRQCHIP is defined as _IO(...) which means
 that it doesn't read or write memory, but there is still the 3rd
 argument to the ioctl, which would give us 64 bits to indicate the
 type of the top-level IRQ controller (XICS, MPIC, ...), but not much
 more.
 
 It sounds like the agreement at the meeting was more along the lines
 of the KVM_CREATE_IRQCHIP_ARGS ioctl (as in the patches I posted)
 which can be called multiple times to instantiate pieces of the
 interrupt framework, rather than having a KVM_CREATE_IRQCHIP that gets
 called once early on to say that there we are using in-kernel
 interrupt controller emulation, followed by other calls to configure
 the various parts of the framework.  Is that accurate?

KVM_CREATE_IRQCHIP should get a parameter indicating the type of IRQCHIP (XICS 
/ MPIC / APIC / VGIC / ...). The parameter-less version defaults to an arch 
specific default (APIC for x86, VGIC for arm, error on PPC).

I'm not 100% sure yet whether we want to support models with multiple IRQCHIPs. 
If so, we need to return a device token from the KVM_CREATE_IRQCHIP ioctl. 
Maybe it makes more sense to model specific cases like this as separate type 
though with specific, explicit subdevice ids. Not sure.

Other instantiation attributes we don't know that early on should be settable 
between the time frame of vm creation and first execution. An example for this 
are device addresses. Check out the threads KVM: ARM: Rename 
KVM_SET_DEVICE_ADDRESS and KVM: ARM: Introduce KVM_SET_DEVICE_ADDRESS ioctl 
for more information on this particular bit.


Alex

 
  * Assigning the address of an irq chip when it can change (see ARM
 patches)
 
 - The existing business with irqfd etc... is mostly ok, except the
 interfaces messing around with MSIs (see virtio-pci use of kvm functions
 directly). The assignment of an irq number for an MSI must be a hook,
 probably a PCI controller hook, so x86 can get it done via its existing
 kernel interfaces and sane architectures can keep the assignment in qemu
 where it belongs.
 
 So, if I've understood you correctly about what was agreed, the set of
 ioctls that is implemented in the patches I posted is in line with
 what was agreed, isn't it?  If not, where does it differ?  (To recap,
 I had KVM_CREATE_IRQCHIP_ARGS, KVM_IRQCHIP_GET_SOURCES and
 KVM_IRQCHIP_SET_SOURCES, plus a one-reg interface to get/set the
 vcpu-specific state.)
 
 Paul.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC 02/12] target-i386: Don't set any KVM flag by default if KVM is disabled

2013-01-10 Thread Michael S. Tsirkin
On Wed, Jan 09, 2013 at 04:53:42PM -0200, Eduardo Habkost wrote:
 This is a cleanup that tries to solve two small issues:
 
  - We don't need a separate kvm_pv_eoi_features variable just to keep a
constant calculated at compile-time, and this style would require
adding a separate variable (that's declared twice because of the
CONFIG_KVM ifdef) for each feature that's going to be enabled/disable
by machine-type compat code.
  - The pc-1.3 code is setting the kvm_pv_eoi flag on cpuid_kvm_features
even when KVM is disabled at runtime. This small incosistency in
the cpuid_kvm_features field isn't a problem today because
cpuid_kvm_features is ignored by the TCG code, but it may cause
unexpected problems later when refactoring the CPUID handling code.
 
 This patch eliminates the kvm_pv_eoi_features variable and simply uses
 kvm_enabled() inside the enable_kvm_pv_eoi() compat function, so it
 enables kvm_pv_eoi only if KVM is enabled. I believe this makes the
 behavior of enable_kvm_pv_eoi() clearer and easier to understand.
 
 Signed-off-by: Eduardo Habkost ehabk...@redhat.com

Acked-by: Michael S. Tsirkin m...@redhat.com



 ---
 Cc: kvm@vger.kernel.org
 Cc: Michael S. Tsirkin m...@redhat.com
 Cc: Gleb Natapov g...@redhat.com
 Cc: Marcelo Tosatti mtosa...@redhat.com
 
 Changes v2:
  - Coding style fix
 
 Changes v3:
  - Eliminate #ifdef by using the fake KVM_FEATURE_PV_EOI #define
 
 Changes v4:
  - Check kvm_enabled() when actually using kvm_default_features
  - Eliminate Yet Another #ifdef by using the fake KVM_FEATURE_*
#defines on kvm_default_features initialization
 ---
  target-i386/cpu.c | 16 ++--
  1 file changed, 6 insertions(+), 10 deletions(-)
 
 diff --git a/target-i386/cpu.c b/target-i386/cpu.c
 index e4dc370..57a22b7 100644
 --- a/target-i386/cpu.c
 +++ b/target-i386/cpu.c
 @@ -206,22 +206,16 @@ typedef struct model_features_t {
  int check_cpuid = 0;
  int enforce_cpuid = 0;
  
 -#if defined(CONFIG_KVM)
  static uint32_t kvm_default_features = (1  KVM_FEATURE_CLOCKSOURCE) |
  (1  KVM_FEATURE_NOP_IO_DELAY) |
  (1  KVM_FEATURE_CLOCKSOURCE2) |
  (1  KVM_FEATURE_ASYNC_PF) |
  (1  KVM_FEATURE_STEAL_TIME) |
  (1  KVM_FEATURE_CLOCKSOURCE_STABLE_BIT);
 -static const uint32_t kvm_pv_eoi_features = (0x1  KVM_FEATURE_PV_EOI);
 -#else
 -static uint32_t kvm_default_features = 0;
 -static const uint32_t kvm_pv_eoi_features = 0;
 -#endif
  
  void enable_kvm_pv_eoi(void)
  {
 -kvm_default_features |= kvm_pv_eoi_features;
 +kvm_default_features |= (1UL  KVM_FEATURE_PV_EOI);
  }
  
  void host_cpuid(uint32_t function, uint32_t count,
 @@ -1343,13 +1337,15 @@ static int cpu_x86_parse_featurestr(x86_def_t 
 *x86_cpu_def, char *features)
  unsigned int i;
  char *featurestr; /* Single 'key=value string being parsed */
  /* Features to be added */
 -FeatureWordArray plus_features = {
 -[FEAT_KVM] = kvm_default_features,
 -};
 +FeatureWordArray plus_features = { 0 };
  /* Features to be removed */
  FeatureWordArray minus_features = { 0 };
  uint32_t numvalue;
  
 +if (kvm_enabled()) {
 +plus_features[FEAT_KVM] = kvm_default_features;
 +}
 +
  add_flagname_to_bitmaps(hypervisor, plus_features);
  
  featurestr = features ? strtok(features, ,) : NULL;
 -- 
 1.7.11.7
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] [PATCH 00/12] Multiqueue virtio-net

2013-01-10 Thread Stefan Hajnoczi
On Thu, Jan 10, 2013 at 05:34:14PM +0800, Jason Wang wrote:
 On 01/10/2013 04:44 PM, Stefan Hajnoczi wrote:
  On Wed, Jan 09, 2013 at 11:33:25PM +0800, Jason Wang wrote:
  On 01/09/2013 11:32 PM, Michael S. Tsirkin wrote:
  On Wed, Jan 09, 2013 at 03:29:24PM +0100, Stefan Hajnoczi wrote:
  On Fri, Dec 28, 2012 at 06:31:52PM +0800, Jason Wang wrote:
  Perf Numbers:
 
  Two Intel Xeon 5620 with direct connected intel 82599EB
  Host/Guest kernel: David net tree
  vhost enabled
 
  - lots of improvents of both latency and cpu utilization in 
  request-reponse test
  - get regression of guest sending small packets which because TCP tends 
  to batch
less when the latency were improved
 
  1q/2q/4q
  TCP_RR
   size #sessions trans.rate  norm trans.rate  norm trans.rate  norm
  1 1 9393.26   595.64  9408.18   597.34  9375.19   584.12
  1 2072162.1   2214.24 129880.22 2456.13 196949.81 2298.13
  1 50107513.38 2653.99 139721.93 2490.58 259713.82 2873.57
  1 100   126734.63 2676.54 145553.5  2406.63 265252.68 2943
  64 19453.42   632.33  9371.37   616.13  9338.19   615.97
  64 20   70620.03  2093.68 125155.75 2409.15 191239.91 2253.32
  64 50   1069662448.29 146518.67 2514.47 242134.07 2720.91
  64 100  117046.35 2394.56 190153.09 2696.82 238881.29 2704.41
  256 1   8733.29   736.36  8701.07   680.83  8608.92   530.1
  256 20  69279.89  2274.45 115103.07 2299.76 144555.16 1963.53
  256 50  97676.02  2296.09 150719.57 2522.92 254510.5  3028.44
  256 100 150221.55 2949.56 197569.3  2790.92 300695.78 3494.83
  TCP_CRR
   size #sessions trans.rate  norm trans.rate  norm trans.rate  norm
  1 1 2848.37  163.41 2230.39  130.89 2013.09  120.47
  1 2023434.5  562.11 31057.43 531.07 49488.28 564.41
  1 5028514.88 582.17 40494.23 605.92 60113.35 654.97
  1 100   28827.22 584.73 48813.25 661.6  61783.62 676.56
  64 12780.08  159.4  2201.07  127.96 2006.8   117.63
  64 20   23318.51 564.47 30982.44 530.24 49734.95 566.13
  64 50   28585.72 582.54 40576.7  610.08 60167.89 656.56
  64 100  28747.37 584.17 49081.87 667.87 60612.94 662
  256 1   2772.08  160.51 2231.84  131.05 2003.62  113.45
  256 20  23086.35 559.8  30929.09 528.16 48454.9  555.22
  256 50  28354.7  579.85 40578.31 60760261.71 657.87
  256 100 28844.55 585.67 48541.86 659.08 61941.07 676.72
  TCP_STREAM guest receiving
   size #sessions throughput  norm throughput  norm throughput  norm
  1 1 16.27   1.33   16.11.12   16.13   0.99
  1 2 33.04   2.08   32.96   2.19   32.75   1.98
  1 4 66.62   6.83   68.35.56   66.14   2.65
  64 1896.55  56.67  914.02  58.14  898.9   61.56
  64 21830.46 91.02  1812.02 64.59  1835.57 66.26
  64 43626.61 142.55 3636.25 100.64 3607.46 75.03
  256 1   2619.49 131.23 2543.19 129.03 2618.69 132.39
  256 2   5136.58 203.02 5163.31 141.11 5236.51 149.4
  256 4   7063.99 242.83 9365.4  208.49 9421.03 159.94
  512 1   3592.43 165.24 3603.12 167.19 3552.5  169.57
  512 2   7042.62 246.59 7068.46 180.87 7258.52 186.3
  512 4   6996.08 241.49 9298.34 206.12 9418.52 159.33
  1024 1  4339.54 192.95 4370.2  191.92 4211.72 192.49
  1024 2  7439.45 254.77 9403.99 215.24 9120.82 222.67
  1024 4  7953.86 272.11 9403.87 208.23 9366.98 159.49
  4096 1  7696.28 272.04 7611.41 270.38 7778.71 267.76
  4096 2  7530.35 261.1  8905.43 246.27 8990.18 267.57
  4096 4  7121.6  247.02 9411.75 206.71 9654.96 184.67
  16384 1 7795.73 268.54 7780.94 267.2  7634.26 260.73
  16384 2 7436.57 255.81 9381.86 220.85 9392220.36
  16384 4 7199.07 247.81 9420.96 205.87 9373.69 159.57
  TCP_MAERTS guest sending
   size #sessions throughput  norm throughput  norm throughput  norm
  1 1 15.94   0.62   15.55   0.61   15.13   0.59
  1 2 36.11   0.83   32.46   0.69   32.28   0.69
  1 4 71.59   1  68.91   0.94   61.52   0.77
  64 1630.71  22.52  622.11  22.35  605.09  21.84
  64 21442.36 30.57  1292.15 25.82  1282.67 25.55
  64 43186.79 42.59  2844.96 36.03  2529.69 30.06
  256 1   1760.96 58.07  1738.44 57.43  1695.99 56.19
  256 2   4834.23 95.19  3524.85 64.21  3511.94 64.45
  256 4   9324.63 145.74 8956.49 116.39 6720.17 73.86
  512 1   2678.03 84.1   2630.68 82.93  2636.54 82.57
  512 2   9368.17 195.61 9408.82 204.53 5316.3  92.99
  512 4   9186.34 209.68 9358.72 183.82 9489.29 160.42
  1024 1  3620.71 109.88 3625.54 109.83 3606.61 112.35
  1024 2  9429258.32 7082.79 120.55 7403.53 134.78
  1024 4  9430.66 290.44 9499.29 232.31 9414.6  190.92
  4096 1  9339.28 296.48 9374.23 372.88 9348.76 298.49
  4096 2  9410.53 378.69 9412.61 286.18 9409.75 278.31
  4096 4  9487.35 374.1  9556.91 288.81 9441.94 221.64
  16384 1 9380.43 403.8  9379.78 399.13 9382.42 393.55
  16384 2 9367.69 406.93 9415.04 312.68 9409.29 300.9
  16384 4 9391.96 405.17 9695.12 310.54 9423.76 223.47
  Trying to understand the performance results:
 
  What is the host device configuration?  tap + bridge?
  Yes.
  Did you use host CPU affinity for the vhost threads?
  I use numactl to pin cpu threads 

Re: [kvmarm] [PATCH v5.1 0/2] KVM: ARM: Rename KVM_SET_DEVICE_ADDRESS

2013-01-10 Thread Marc Zyngier
On Thu, 10 Jan 2013 10:17:09 +, Peter Maydell
peter.mayd...@linaro.org wrote:
 On 9 January 2013 22:30, Alexander Graf ag...@suse.de wrote:
 In fact, in this particular case one could merge all of the patches
 except for the
 particular ioctl implementation and just give the respective addresses
 default
 values until there is an API to set them, similar to how we did things
 with PVR
 in the beginning.
 
 That's what we started with and we got review comments saying yuck,
 don't hard code this...

Even worse: hard-coding things breaks kvmtool (we do not emulate a
vexpress and have a rather different memory map) and arm64 (the
requirements on GIC mappings are different).

So hard coding addresses is already unacceptable.

M.
-- 
Fast, cheap, reliable. Pick two.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [PATCH v9 2/3] x86, apicv: add virtual x2apic support

2013-01-10 Thread Zhang, Yang Z
Gleb Natapov wrote on 2013-01-10:
 On Thu, Jan 10, 2013 at 08:32:06AM +, Zhang, Yang Z wrote:
 Gleb Natapov wrote on 2013-01-10:
 On Thu, Jan 10, 2013 at 03:26:07PM +0800, Yang Zhang wrote:
 From: Yang Zhang yang.z.zh...@intel.com
 
 basically to benefit from apicv, we need to enable virtualized x2apic mode.
 Currently, we only enable it when guest is really using x2apic.
 
 Also, clear MSR bitmap for corresponding x2apic MSRs when guest enabled
 x2apic:
 0x800 - 0x8ff: no read intercept for apicv register virtualization,
   except APIC ID and TMCCT.
 APIC ID and TMCCT: need software's assistance to get right value.
 TPR,EOI,SELF-IPI: no write intercept for virtual interrupt delivery.
 Signed-off-by: Kevin Tian kevin.t...@intel.com
 Signed-off-by: Yang Zhang yang.z.zh...@intel.com
 ---
  arch/x86/include/asm/kvm_host.h |2 + arch/x86/include/asm/vmx.h
|1 + arch/x86/kvm/lapic.c|5 +-
arch/x86/kvm/svm.c  |6 + arch/x86/kvm/vmx.c |  194
+-- 5 files
 changed, 200
  insertions(+), 8 deletions(-)
 diff --git a/arch/x86/include/asm/kvm_host.h
 b/arch/x86/include/asm/kvm_host.h index c431b33..572a562 100644 ---
 a/arch/x86/include/asm/kvm_host.h +++
 b/arch/x86/include/asm/kvm_host.h @@ -697,6 +697,8 @@ struct
 kvm_x86_ops {
void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
void (*enable_irq_window)(struct kvm_vcpu *vcpu);
void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
 +  void (*enable_virtual_x2apic_mode)(struct kvm_vcpu *vcpu);
 +  void (*disable_virtual_x2apic_mode)(struct kvm_vcpu *vcpu);
 Make one callback with enable/disable parameter. And do not forget SVM.
 
int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
int (*get_tdp_level)(void);
u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool
 is_mmio);
 diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
 index 44c3f7e..0a54df0 100644
 --- a/arch/x86/include/asm/vmx.h
 +++ b/arch/x86/include/asm/vmx.h
 @@ -139,6 +139,7 @@
  #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x0001 #define
  SECONDARY_EXEC_ENABLE_EPT   0x0002 #define
  SECONDARY_EXEC_RDTSCP 0x0008 +#define
  SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE   0x0010 #define
  SECONDARY_EXEC_ENABLE_VPID  0x0020 #define
  SECONDARY_EXEC_WBINVD_EXITING 0x0040 #define
  SECONDARY_EXEC_UNRESTRICTED_GUEST 0x0080
 diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
 index 0664c13..ec38906 100644
 --- a/arch/x86/kvm/lapic.c
 +++ b/arch/x86/kvm/lapic.c
 @@ -1328,7 +1328,10 @@ void kvm_lapic_set_base(struct kvm_vcpu
 *vcpu,
 u64 value)
u32 id = kvm_apic_id(apic);
u32 ldr = ((id  4)  16) | (1  (id  0xf));
kvm_apic_set_ldr(apic, ldr);
 -  }
 +  kvm_x86_ops-enable_virtual_x2apic_mode(vcpu);
 +  } else
 +  kvm_x86_ops-disable_virtual_x2apic_mode(vcpu);
 +
 You just broke SVM.
apic-base_address = apic-vcpu-arch.apic_base 
 MSR_IA32_APICBASE_BASE;
 diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
 index d29d3cd..0b82cb1 100644
 --- a/arch/x86/kvm/svm.c
 +++ b/arch/x86/kvm/svm.c
 @@ -3571,6 +3571,11 @@ static void update_cr8_intercept(struct
 kvm_vcpu
 *vcpu, int tpr, int irr)
set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
  }
 +static void svm_enable_virtual_x2apic_mode(struct kvm_vcpu *vcpu)
 +{
 +  return;
 +}
 +
  static int svm_nmi_allowed(struct kvm_vcpu *vcpu) {   struct vcpu_svm
  *svm = to_svm(vcpu); @@ -4290,6 +4295,7 @@ static struct kvm_x86_ops
  svm_x86_ops = {   .enable_nmi_window = enable_nmi_window,
.enable_irq_window = enable_irq_window, .update_cr8_intercept =
  update_cr8_intercept,
 +  .enable_virtual_x2apic_mode = svm_enable_virtual_x2apic_mode,
 
.set_tss_addr = svm_set_tss_addr,
.get_tdp_level = get_npt_level,
 diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
 index 688f43f..b203ce7 100644
 --- a/arch/x86/kvm/vmx.c
 +++ b/arch/x86/kvm/vmx.c
 @@ -433,6 +433,8 @@ struct vcpu_vmx {
 
bool rdtscp_enabled;
 +  bool virtual_x2apic_enabled;
 +
/* Support for a guest hypervisor (nested VMX) */
struct nested_vmx nested;
  };
 @@ -767,12 +769,23 @@ static inline bool
 cpu_has_vmx_virtualize_apic_accesses(void)
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
  }
 +static inline bool cpu_has_vmx_virtualize_x2apic_mode(void)
 +{
 +  return vmcs_config.cpu_based_2nd_exec_ctrl 
 +  SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
 +}
 +
  static inline bool cpu_has_vmx_apic_register_virt(void)
  {
return vmcs_config.cpu_based_2nd_exec_ctrl 
SECONDARY_EXEC_APIC_REGISTER_VIRT;
  }
 +static inline bool cpu_has_vmx_virtual_intr_delivery(void)
 +{
 +  return false;
 +}
 +
  static inline bool cpu_has_vmx_flexpriority(void)
  {
return cpu_has_vmx_tpr_shadow() 
 @@ -2544,6 +2557,7 @@ 

Re: [kvmarm] [PATCH v5.1 0/2] KVM: ARM: Rename KVM_SET_DEVICE_ADDRESS

2013-01-10 Thread Alexander Graf

On 10.01.2013, at 12:53, Marc Zyngier wrote:

 On Thu, 10 Jan 2013 10:17:09 +, Peter Maydell
 peter.mayd...@linaro.org wrote:
 On 9 January 2013 22:30, Alexander Graf ag...@suse.de wrote:
 In fact, in this particular case one could merge all of the patches
 except for the
 particular ioctl implementation and just give the respective addresses
 default
 values until there is an API to set them, similar to how we did things
 with PVR
 in the beginning.
 
 That's what we started with and we got review comments saying yuck,
 don't hard code this...
 
 Even worse: hard-coding things breaks kvmtool (we do not emulate a
 vexpress and have a rather different memory map) and arm64 (the
 requirements on GIC mappings are different).
 
 So hard coding addresses is already unacceptable.

I see. The current approach with an intermediate ARM specific ioctl is 
certainly good with me then. And who knows - with a bit of luck we might be 
able to get a unified ioctl model for this before the code hits Linus' tree 
(read: APIs are really stable).


Alex

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [PATCH v9 3/3] x86, apicv: add virtual interrupt delivery support

2013-01-10 Thread Zhang, Yang Z
Gleb Natapov wrote on 2013-01-10:
 On Thu, Jan 10, 2013 at 03:26:08PM +0800, Yang Zhang wrote:
 From: Yang Zhang yang.z.zh...@intel.com
 
 Virtual interrupt delivery avoids KVM to inject vAPIC interrupts
 manually, which is fully taken care of by the hardware. This needs
 some special awareness into existing interrupr injection path:
 
 - for pending interrupt, instead of direct injection, we may need
   update architecture specific indicators before resuming to guest.
 - A pending interrupt, which is masked by ISR, should be also
   considered in above update action, since hardware will decide
   when to inject it at right time. Current has_interrupt and
   get_interrupt only returns a valid vector from injection p.o.v.
 Signed-off-by: Kevin Tian kevin.t...@intel.com
 Signed-off-by: Yang Zhang yang.z.zh...@intel.com
 ---
  arch/x86/include/asm/kvm_host.h |5 + arch/x86/include/asm/vmx.h   
|   11 +++ arch/x86/kvm/irq.c  |   56 +++-
  arch/x86/kvm/lapic.c|   72 +--
  arch/x86/kvm/lapic.h|   23 + arch/x86/kvm/svm.c   
|   18  arch/x86/kvm/vmx.c  |  191
  +-- arch/x86/kvm/x86.c
   |   14 +++- include/linux/kvm_host.h|3 +
  virt/kvm/ioapic.c   |   18  virt/kvm/ioapic.h 
   |4 + virt/kvm/irq_comm.c |   22 +
  virt/kvm/kvm_main.c |5 + 13 files changed, 399
  insertions(+), 43 deletions(-)
 diff --git a/arch/x86/include/asm/kvm_host.h
 b/arch/x86/include/asm/kvm_host.h index 572a562..f471856 100644 ---
 a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h
 @@ -697,6 +697,10 @@ struct kvm_x86_ops {
  void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
  void (*enable_irq_window)(struct kvm_vcpu *vcpu);
  void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
 +int (*has_virtual_interrupt_delivery)(struct kvm_vcpu *vcpu);
 +void (*update_apic_irq)(struct kvm_vcpu *vcpu, int max_irr);
 +void (*update_eoi_exitmap)(struct kvm_vcpu *vcpu);
 +void (*set_svi)(int isr);
  void (*enable_virtual_x2apic_mode)(struct kvm_vcpu *vcpu);  void
  (*disable_virtual_x2apic_mode)(struct kvm_vcpu *vcpu);  int
  (*set_tss_addr)(struct kvm *kvm, unsigned int addr); @@ -993,6 +997,7
  @@ int kvm_age_hva(struct kvm *kvm, unsigned long hva); int
  kvm_test_age_hva(struct kvm *kvm, unsigned long hva); void
  kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); int
  cpuid_maxphyaddr(struct kvm_vcpu *vcpu); +int
  kvm_cpu_has_injectable_intr(struct kvm_vcpu *v); int
  kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); int
  kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); int
  kvm_cpu_get_interrupt(struct kvm_vcpu *v);
 diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
 index 0a54df0..694586c 100644
 --- a/arch/x86/include/asm/vmx.h
 +++ b/arch/x86/include/asm/vmx.h
 @@ -62,6 +62,7 @@
  #define EXIT_REASON_MCE_DURING_VMENTRY  41 #define
  EXIT_REASON_TPR_BELOW_THRESHOLD 43 #define EXIT_REASON_APIC_ACCESS
  44 +#define EXIT_REASON_EOI_INDUCED 45 #define
  EXIT_REASON_EPT_VIOLATION   48 #define EXIT_REASON_EPT_MISCONFIG  
  49 #define EXIT_REASON_WBINVD  54 @@ -144,6 +145,7 @@
  #define SECONDARY_EXEC_WBINVD_EXITING   0x0040 #define
  SECONDARY_EXEC_UNRESTRICTED_GUEST   0x0080 #define
  SECONDARY_EXEC_APIC_REGISTER_VIRT   0x0100 +#define
  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY0x0200 #define
  SECONDARY_EXEC_PAUSE_LOOP_EXITING   0x0400 #define
  SECONDARY_EXEC_ENABLE_INVPCID   0x1000
 @@ -181,6 +183,7 @@ enum vmcs_field {
  GUEST_GS_SELECTOR   = 0x080a,   GUEST_LDTR_SELECTOR   
   = 0x080c,  GUEST_TR_SELECTOR   = 0x080e,
  +   GUEST_INTR_STATUS   = 0x0810,   HOST_ES_SELECTOR 
= 0x0c00, HOST_CS_SELECTOR= 0x0c02,
  HOST_SS_SELECTOR= 0x0c04, @@ -208,6 +211,14 @@
  enum vmcs_field {   APIC_ACCESS_ADDR_HIGH   = 0x2015,   
 EPT_POINTER  
= 0x201a, EPT_POINTER_HIGH=
  0x201b,
 +EOI_EXIT_BITMAP0= 0x201c,
 +EOI_EXIT_BITMAP0_HIGH   = 0x201d,
 +EOI_EXIT_BITMAP1= 0x201e,
 +EOI_EXIT_BITMAP1_HIGH   = 0x201f,
 +EOI_EXIT_BITMAP2= 0x2020,
 +EOI_EXIT_BITMAP2_HIGH   = 0x2021,
 +EOI_EXIT_BITMAP3= 0x2022,
 +EOI_EXIT_BITMAP3_HIGH   = 0x2023,
  GUEST_PHYSICAL_ADDRESS  = 0x2400,
  GUEST_PHYSICAL_ADDRESS_HIGH = 0x2401,
  VMCS_LINK_POINTER   = 0x2800,
 diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
 index b111aee..e113440 100644
 --- a/arch/x86/kvm/irq.c
 +++ 

Re: [PATCH v9 2/3] x86, apicv: add virtual x2apic support

2013-01-10 Thread Gleb Natapov
On Thu, Jan 10, 2013 at 11:54:21AM +, Zhang, Yang Z wrote:
  The right logic should be:
  When register virtualization enabled, read all apic msr(except apic id reg 
  and
  tmcct which have the hook) not cause vmexit and only write TPR not cause
  vmexit.
  When vid enabled, write to TPR, EOI and SELF IPI not cause vmexit.
  It's better to put the patch of envirtualize x2apic mode as first patch.
  
  There is no point whatsoever to enable virtualize x2apic without
  allowing at least one non intercepted access to x2apic MSR range and
  this is what your patch is doing when run on cpu without vid capability.
 No, at least read/write TPR will not cause vmexit if virtualize x2apic mode 
 is enabled.
For that you need to disable 808H MSR intercept, which your patch does not do.

 I am not sure whether I understand your comments right in previous 
 discussion, here is my thinking:
 1. enable virtualize x2apic mode if guest is really using x2apic and clear 
 TPR in msr read  and write bitmap. This will benefit reading TPR.
 2. If APIC registers virtualization is enabled, clear all bit in rang 
 0x800-0x8ff(except apic id reg and tmcct).
Clear all read bits in the range.

 3. If vid is enabled, clear EOI and SELF IPI in msr write map.
 
Looks OK.

 One concern you mentioned is that vid enabled and virtualize x2apic is 
 disabled but guest still use x2apic. In this case, we still use MSR bitmap to 
 intercept x2apic. This means the vEOI update will never happen. But we still 
 can benefit from interrupt window.
 
What interrupt windows? Without virtualized x2apic TPR/EOI
virtualization will not happen and we rely on that in the code.

--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [PATCH v9 2/3] x86, apicv: add virtual x2apic support

2013-01-10 Thread Zhang, Yang Z
Gleb Natapov wrote on 2013-01-10:
 On Thu, Jan 10, 2013 at 11:54:21AM +, Zhang, Yang Z wrote:
 The right logic should be:
 When register virtualization enabled, read all apic msr(except apic id reg 
 and
 tmcct which have the hook) not cause vmexit and only write TPR not cause
 vmexit.
 When vid enabled, write to TPR, EOI and SELF IPI not cause vmexit.
 It's better to put the patch of envirtualize x2apic mode as first patch.
 
 There is no point whatsoever to enable virtualize x2apic without
 allowing at least one non intercepted access to x2apic MSR range and
 this is what your patch is doing when run on cpu without vid capability.
 No, at least read/write TPR will not cause vmexit if virtualize x2apic mode 
 is
 enabled.
 For that you need to disable 808H MSR intercept, which your patch does not do.
I want to do this in next patch.
 
 I am not sure whether I understand your comments right in previous
 discussion, here is my thinking: 1. enable virtualize x2apic mode if
 guest is really using x2apic and clear TPR in msr read  and write
 bitmap. This will benefit reading TPR. 2. If APIC registers
 virtualization is enabled, clear all bit in rang
 0x800-0x8ff(except apic id reg and tmcct).
 Clear all read bits in the range.
 
 3. If vid is enabled, clear EOI and SELF IPI in msr write map.
 
 Looks OK.
 
 One concern you mentioned is that vid enabled and virtualize x2apic is 
 disabled
 but guest still use x2apic. In this case, we still use MSR bitmap to 
 intercept x2apic.
 This means the vEOI update will never happen. But we still can benefit from
 interrupt window.
 
 What interrupt windows? Without virtualized x2apic TPR/EOI
 virtualization will not happen and we rely on that in the code.
Yes, but we can eliminate vmexit of interrupt window. Interrupt still can 
delivery to guest without vmexit when guest enables interrupt if vid is 
enabled. See SDM vol 3, 29.2.2.

Best regards,
Yang


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v9 2/3] x86, apicv: add virtual x2apic support

2013-01-10 Thread Gleb Natapov
On Thu, Jan 10, 2013 at 12:22:51PM +, Zhang, Yang Z wrote:
 Gleb Natapov wrote on 2013-01-10:
  On Thu, Jan 10, 2013 at 11:54:21AM +, Zhang, Yang Z wrote:
  The right logic should be:
  When register virtualization enabled, read all apic msr(except apic id 
  reg and
  tmcct which have the hook) not cause vmexit and only write TPR not cause
  vmexit.
  When vid enabled, write to TPR, EOI and SELF IPI not cause vmexit.
  It's better to put the patch of envirtualize x2apic mode as first patch.
  
  There is no point whatsoever to enable virtualize x2apic without
  allowing at least one non intercepted access to x2apic MSR range and
  this is what your patch is doing when run on cpu without vid capability.
  No, at least read/write TPR will not cause vmexit if virtualize x2apic 
  mode is
  enabled.
  For that you need to disable 808H MSR intercept, which your patch does not 
  do.
 I want to do this in next patch.
  
Then don't. There is no point in the patch that enables virtualize
x2apic and does not allow at least one non-intercepted MSR access.

  I am not sure whether I understand your comments right in previous
  discussion, here is my thinking: 1. enable virtualize x2apic mode if
  guest is really using x2apic and clear TPR in msr read  and write
  bitmap. This will benefit reading TPR. 2. If APIC registers
  virtualization is enabled, clear all bit in rang
  0x800-0x8ff(except apic id reg and tmcct).
  Clear all read bits in the range.
  
  3. If vid is enabled, clear EOI and SELF IPI in msr write map.
  
  Looks OK.
  
  One concern you mentioned is that vid enabled and virtualize x2apic is 
  disabled
  but guest still use x2apic. In this case, we still use MSR bitmap to 
  intercept x2apic.
  This means the vEOI update will never happen. But we still can benefit from
  interrupt window.
  
  What interrupt windows? Without virtualized x2apic TPR/EOI
  virtualization will not happen and we rely on that in the code.
 Yes, but we can eliminate vmexit of interrupt window. Interrupt still can 
 delivery to guest without vmexit when guest enables interrupt if vid is 
 enabled. See SDM vol 3, 29.2.2.
 
What does it matter that you eliminated vmexit of interrupt window if
you broke everything else? The vid patch assumes that apic emulation
either entirely happens in a software when vid is disabled or in a CPU
if vid is enabled. Mixed mode will not work (apic-isr_count = 1 trick
will break things for instance). And it is not worth it to complicate
the code to make it work.

--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PULL 0/8] ppc patch queue 2013-01-10

2013-01-10 Thread Alexander Graf
Hi Marcelo / Gleb,

This is my current patch queue for ppc.  Please pull.

Highlights this time:

  - Book3S: enable potential sPAPR guest emulation on PR KVM on pHyp
  - BookE: EPR (External Proxy Register) support

Alex

The following changes since commit 908e7d7999bcce70ac52e7f390a8f5cbc55948de:
  Gleb Natapov (1):
KVM: MMU: simplify folding of dirty bit into accessed_dirty

are available in the git repository at:

  git://github.com/agraf/linux-2.6.git kvm-ppc-next

Alexander Graf (6):
  KVM: PPC: Only WARN on invalid emulation
  KVM: PPC: Book3S: PR: Enable alternative instruction for SC 1
  KVM: PPC: BookE: Allow irq deliveries to inject requests
  KVM: PPC: BookE: Emulate mfspr on EPR
  KVM: PPC: BookE: Implement EPR exit
  KVM: PPC: BookE: Add EPR ONE_REG sync

Mihai Caraman (2):
  KVM: PPC: Fix SREGS documentation reference
  KVM: PPC: Fix mfspr/mtspr MMUCFG emulation

 Documentation/virtual/kvm/api.txt   |   43 --
 arch/powerpc/include/asm/kvm_host.h |2 +
 arch/powerpc/include/asm/kvm_ppc.h  |   10 
 arch/powerpc/include/uapi/asm/kvm.h |6 -
 arch/powerpc/kvm/book3s_emulate.c   |   30 
 arch/powerpc/kvm/book3s_pr.c|5 
 arch/powerpc/kvm/booke.c|   40 +++-
 arch/powerpc/kvm/booke_emulate.c|3 ++
 arch/powerpc/kvm/emulate.c  |5 
 arch/powerpc/kvm/powerpc.c  |   13 +-
 include/linux/kvm_host.h|1 +
 include/uapi/linux/kvm.h|6 +
 12 files changed, 153 insertions(+), 11 deletions(-)
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 6/8] KVM: PPC: BookE: Emulate mfspr on EPR

2013-01-10 Thread Alexander Graf
The EPR register is potentially valid for PR KVM as well, so we need
to emulate accesses to it. It's only defined for reading, so only
handle the mfspr case.

Signed-off-by: Alexander Graf ag...@suse.de
---
 arch/powerpc/kvm/booke_emulate.c |3 +++
 1 files changed, 3 insertions(+), 0 deletions(-)

diff --git a/arch/powerpc/kvm/booke_emulate.c b/arch/powerpc/kvm/booke_emulate.c
index 4685b8c..27a4b28 100644
--- a/arch/powerpc/kvm/booke_emulate.c
+++ b/arch/powerpc/kvm/booke_emulate.c
@@ -269,6 +269,9 @@ int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int 
sprn, ulong *spr_val)
case SPRN_ESR:
*spr_val = vcpu-arch.shared-esr;
break;
+   case SPRN_EPR:
+   *spr_val = vcpu-arch.epr;
+   break;
case SPRN_CSRR0:
*spr_val = vcpu-arch.csrr0;
break;
-- 
1.6.0.2

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 8/8] KVM: PPC: BookE: Add EPR ONE_REG sync

2013-01-10 Thread Alexander Graf
We need to be able to read and write the contents of the EPR register
from user space.

This patch implements that logic through the ONE_REG API and declares
its (never implemented) SREGS counterpart as deprecated.

Signed-off-by: Alexander Graf ag...@suse.de
---
 Documentation/virtual/kvm/api.txt   |1 +
 arch/powerpc/include/uapi/asm/kvm.h |6 +-
 arch/powerpc/kvm/booke.c|   21 +
 3 files changed, 27 insertions(+), 1 deletions(-)

diff --git a/Documentation/virtual/kvm/api.txt 
b/Documentation/virtual/kvm/api.txt
index a98ed09..09905cb 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -1774,6 +1774,7 @@ registers, find a list below:
   PPC   | KVM_REG_PPC_VPA_SLB   | 128
   PPC   | KVM_REG_PPC_VPA_DTL   | 128
   PPC   | KVM_REG_PPC_EPCR | 32
+  PPC   | KVM_REG_PPC_EPR  | 32
 
 4.69 KVM_GET_ONE_REG
 
diff --git a/arch/powerpc/include/uapi/asm/kvm.h 
b/arch/powerpc/include/uapi/asm/kvm.h
index 2fba8a6..16064d0 100644
--- a/arch/powerpc/include/uapi/asm/kvm.h
+++ b/arch/powerpc/include/uapi/asm/kvm.h
@@ -114,7 +114,10 @@ struct kvm_regs {
 /* Embedded Floating Point (SPE) -- IVOR32-34 if KVM_SREGS_E_IVOR */
 #define KVM_SREGS_E_SPE(1  9)
 
-/* External Proxy (EXP) -- EPR */
+/*
+ * DEPRECATED! USE ONE_REG FOR THIS ONE!
+ * External Proxy (EXP) -- EPR
+ */
 #define KVM_SREGS_EXP  (1  10)
 
 /* External PID (E.PD) -- EPSC/EPLC */
@@ -412,5 +415,6 @@ struct kvm_get_htab_header {
 #define KVM_REG_PPC_VPA_DTL(KVM_REG_PPC | KVM_REG_SIZE_U128 | 0x84)
 
 #define KVM_REG_PPC_EPCR   (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x85)
+#define KVM_REG_PPC_EPR(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x86)
 
 #endif /* __LINUX_KVM_POWERPC_H */
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 940ec80..8779cd4 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -300,6 +300,15 @@ static void set_guest_esr(struct kvm_vcpu *vcpu, u32 esr)
 #endif
 }
 
+static unsigned long get_guest_epr(struct kvm_vcpu *vcpu)
+{
+#ifdef CONFIG_KVM_BOOKE_HV
+   return mfspr(SPRN_GEPR);
+#else
+   return vcpu-arch.epr;
+#endif
+}
+
 /* Deliver the interrupt of the corresponding priority, if possible. */
 static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
 unsigned int priority)
@@ -1405,6 +1414,11 @@ int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, 
struct kvm_one_reg *reg)
 vcpu-arch.dbg_reg.dac[dac], sizeof(u64));
break;
}
+   case KVM_REG_PPC_EPR: {
+   u32 epr = get_guest_epr(vcpu);
+   r = put_user(epr, (u32 __user *)(long)reg-addr);
+   break;
+   }
 #if defined(CONFIG_64BIT)
case KVM_REG_PPC_EPCR:
r = put_user(vcpu-arch.epcr, (u32 __user *)(long)reg-addr);
@@ -1437,6 +1451,13 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, 
struct kvm_one_reg *reg)
 (u64 __user *)(long)reg-addr, sizeof(u64));
break;
}
+   case KVM_REG_PPC_EPR: {
+   u32 new_epr;
+   r = get_user(new_epr, (u32 __user *)(long)reg-addr);
+   if (!r)
+   kvmppc_set_epr(vcpu, new_epr);
+   break;
+   }
 #if defined(CONFIG_64BIT)
case KVM_REG_PPC_EPCR: {
u32 new_epcr;
-- 
1.6.0.2

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 7/8] KVM: PPC: BookE: Implement EPR exit

2013-01-10 Thread Alexander Graf
The External Proxy Facility in FSL BookE chips allows the interrupt
controller to automatically acknowledge an interrupt as soon as a
core gets its pending external interrupt delivered.

Today, user space implements the interrupt controller, so we need to
check on it during such a cycle.

This patch implements logic for user space to enable EPR exiting,
disable EPR exiting and EPR exiting itself, so that user space can
acknowledge an interrupt when an external interrupt has successfully
been delivered into the guest vcpu.

Signed-off-by: Alexander Graf ag...@suse.de
---
 Documentation/virtual/kvm/api.txt   |   40 +-
 arch/powerpc/include/asm/kvm_host.h |2 +
 arch/powerpc/include/asm/kvm_ppc.h  |9 +++
 arch/powerpc/kvm/booke.c|   14 +++-
 arch/powerpc/kvm/powerpc.c  |   10 
 include/linux/kvm_host.h|1 +
 include/uapi/linux/kvm.h|6 +
 7 files changed, 79 insertions(+), 3 deletions(-)

diff --git a/Documentation/virtual/kvm/api.txt 
b/Documentation/virtual/kvm/api.txt
index 4fc2bfc..a98ed09 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -2246,8 +2246,8 @@ executed a memory-mapped I/O instruction which could not 
be satisfied
 by kvm.  The 'data' member contains the written data if 'is_write' is
 true, and should be filled by application code otherwise.
 
-NOTE: For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_DCR
-  and KVM_EXIT_PAPR the corresponding
+NOTE: For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_DCR,
+  KVM_EXIT_PAPR and KVM_EXIT_EPR the corresponding
 operations are complete (and guest state is consistent) only after userspace
 has re-entered the kernel with KVM_RUN.  The kernel side will first finish
 incomplete operations and then check for pending signals.  Userspace
@@ -2366,6 +2366,25 @@ interrupt for the target subchannel has been dequeued 
and subchannel_id,
 subchannel_nr, io_int_parm and io_int_word contain the parameters for that
 interrupt. ipb is needed for instruction parameter decoding.
 
+   /* KVM_EXIT_EPR */
+   struct {
+   __u32 epr;
+   } epr;
+
+On FSL BookE PowerPC chips, the interrupt controller has a fast patch
+interrupt acknowledge path to the core. When the core successfully
+delivers an interrupt, it automatically populates the EPR register with
+the interrupt vector number and acknowledges the interrupt inside
+the interrupt controller.
+
+In case the interrupt controller lives in user space, we need to do
+the interrupt acknowledge cycle through it to fetch the next to be
+delivered interrupt vector using this exit.
+
+It gets triggered whenever both KVM_CAP_PPC_EPR are enabled and an
+external interrupt has just been delivered into the guest. User space
+should put the acknowledged interrupt vector into the 'epr' field.
+
/* Fix the size of the union. */
char padding[256];
};
@@ -2501,3 +2520,20 @@ handled in-kernel, while the other I/O instructions are 
passed to userspace.
 
 When this capability is enabled, KVM_EXIT_S390_TSCH will occur on TEST
 SUBCHANNEL intercepts.
+
+6.5 KVM_CAP_PPC_EPR
+
+Architectures: ppc
+Parameters: args[0] defines whether the proxy facility is active
+Returns: 0 on success; -1 on error
+
+This capability enables or disables the delivery of interrupts through the
+external proxy facility.
+
+When enabled (args[0] != 0), every time the guest gets an external interrupt
+delivered, it automatically exits into user space with a KVM_EXIT_EPR exit
+to receive the topmost interrupt vector.
+
+When disabled (args[0] == 0), behavior is as if this facility is unsupported.
+
+When this capability is enabled, KVM_EXIT_EPR can occur.
diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index ab49c6c..8a72d59 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -520,6 +520,8 @@ struct kvm_vcpu_arch {
u8 sane;
u8 cpu_type;
u8 hcall_needed;
+   u8 epr_enabled;
+   u8 epr_needed;
 
u32 cpr0_cfgaddr; /* holds the last set cpr0_cfgaddr */
 
diff --git a/arch/powerpc/include/asm/kvm_ppc.h 
b/arch/powerpc/include/asm/kvm_ppc.h
index 5f5f69a..493630e 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -264,6 +264,15 @@ static inline void kvm_linear_init(void)
 {}
 #endif
 
+static inline void kvmppc_set_epr(struct kvm_vcpu *vcpu, u32 epr)
+{
+#ifdef CONFIG_KVM_BOOKE_HV
+   mtspr(SPRN_GEPR, epr);
+#elif defined(CONFIG_BOOKE)
+   vcpu-arch.epr = epr;
+#endif
+}
+
 int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
  struct kvm_config_tlb *cfg);
 int kvm_vcpu_ioctl_dirty_tlb(struct kvm_vcpu *vcpu,
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 964f447..940ec80 100644
--- 

[PATCH 4/8] KVM: PPC: Fix mfspr/mtspr MMUCFG emulation

2013-01-10 Thread Alexander Graf
From: Mihai Caraman mihai.cara...@freescale.com

On mfspr/mtspr emulation path Book3E's MMUCFG SPR with value 1015 clashes
with G4's MSSSR0 SPR. Move MSSSR0 emulation from generic part to Books3S.
MSSSR0 also clashes with Book3S's DABRX SPR. DABRX was not explicitly
handled so Book3S execution flow will behave as before.

Signed-off-by: Mihai Caraman mihai.cara...@freescale.com
Signed-off-by: Alexander Graf ag...@suse.de
---
 arch/powerpc/kvm/book3s_emulate.c |2 ++
 arch/powerpc/kvm/emulate.c|5 -
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_emulate.c 
b/arch/powerpc/kvm/book3s_emulate.c
index c88161b..836c569 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -455,6 +455,7 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int 
sprn, ulong spr_val)
case SPRN_PMC3_GEKKO:
case SPRN_PMC4_GEKKO:
case SPRN_WPAR_GEKKO:
+   case SPRN_MSSSR0:
break;
 unprivileged:
default:
@@ -551,6 +552,7 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int 
sprn, ulong *spr_val)
case SPRN_PMC3_GEKKO:
case SPRN_PMC4_GEKKO:
case SPRN_WPAR_GEKKO:
+   case SPRN_MSSSR0:
*spr_val = 0;
break;
default:
diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c
index b0855e5..71abcf4 100644
--- a/arch/powerpc/kvm/emulate.c
+++ b/arch/powerpc/kvm/emulate.c
@@ -149,8 +149,6 @@ static int kvmppc_emulate_mtspr(struct kvm_vcpu *vcpu, int 
sprn, int rs)
case SPRN_TBWL: break;
case SPRN_TBWU: break;
 
-   case SPRN_MSSSR0: break;
-
case SPRN_DEC:
vcpu-arch.dec = spr_val;
kvmppc_emulate_dec(vcpu);
@@ -201,9 +199,6 @@ static int kvmppc_emulate_mfspr(struct kvm_vcpu *vcpu, int 
sprn, int rt)
case SPRN_PIR:
spr_val = vcpu-vcpu_id;
break;
-   case SPRN_MSSSR0:
-   spr_val = 0;
-   break;
 
/* Note: mftb and TBRL/TBWL are user-accessible, so
 * the guest can always access the real TB anyways.
-- 
1.6.0.2

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 5/8] KVM: PPC: BookE: Allow irq deliveries to inject requests

2013-01-10 Thread Alexander Graf
When injecting an interrupt into guest context, we usually don't need
to check for requests anymore. At least not until today.

With the introduction of EPR, we will have to create a request when the
guest has successfully accepted an external interrupt though.

So we need to prepare the interrupt delivery to abort guest entry
gracefully. Otherwise we'd delay the EPR request.

Signed-off-by: Alexander Graf ag...@suse.de
---
 arch/powerpc/kvm/booke.c |5 +
 1 files changed, 5 insertions(+), 0 deletions(-)

diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 69f1140..964f447 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -581,6 +581,11 @@ int kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu)
 
kvmppc_core_check_exceptions(vcpu);
 
+   if (vcpu-requests) {
+   /* Exception delivery raised request; start over */
+   return 1;
+   }
+
if (vcpu-arch.shared-msr  MSR_WE) {
local_irq_enable();
kvm_vcpu_block(vcpu);
-- 
1.6.0.2

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 3/8] KVM: PPC: Book3S: PR: Enable alternative instruction for SC 1

2013-01-10 Thread Alexander Graf
When running on top of pHyp, the hypercall instruction sc 1 goes
straight into pHyp without trapping in supervisor mode.

So if we want to support PAPR guest in this configuration we need to
add a second way of accessing PAPR hypercalls, preferably with the
exact same semantics except for the instruction.

So let's overlay an officially reserved instruction and emulate PAPR
hypercalls whenever we hit that one.

Signed-off-by: Alexander Graf ag...@suse.de
---
 arch/powerpc/include/asm/kvm_ppc.h |1 +
 arch/powerpc/kvm/book3s_emulate.c  |   28 
 arch/powerpc/kvm/book3s_pr.c   |5 +
 3 files changed, 34 insertions(+), 0 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_ppc.h 
b/arch/powerpc/include/asm/kvm_ppc.h
index 572aa75..5f5f69a 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -44,6 +44,7 @@ enum emulation_result {
EMULATE_DO_DCR,   /* kvm_run filled with DCR request */
EMULATE_FAIL, /* can't emulate this instruction */
EMULATE_AGAIN,/* something went wrong. go again */
+   EMULATE_DO_PAPR,  /* kvm_run filled with PAPR request */
 };
 
 extern int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
diff --git a/arch/powerpc/kvm/book3s_emulate.c 
b/arch/powerpc/kvm/book3s_emulate.c
index d31a716..c88161b 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -34,6 +34,8 @@
 #define OP_31_XOP_MTSRIN   242
 #define OP_31_XOP_TLBIEL   274
 #define OP_31_XOP_TLBIE306
+/* Opcode is officially reserved, reuse it as sc 1 when sc 1 doesn't trap */
+#define OP_31_XOP_FAKE_SC1 308
 #define OP_31_XOP_SLBMTE   402
 #define OP_31_XOP_SLBIE434
 #define OP_31_XOP_SLBIA498
@@ -170,6 +172,32 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct 
kvm_vcpu *vcpu,
vcpu-arch.mmu.tlbie(vcpu, addr, large);
break;
}
+#ifdef CONFIG_KVM_BOOK3S_64_PR
+   case OP_31_XOP_FAKE_SC1:
+   {
+   /* SC 1 papr hypercalls */
+   ulong cmd = kvmppc_get_gpr(vcpu, 3);
+   int i;
+
+   if ((vcpu-arch.shared-msr  MSR_PR) ||
+   !vcpu-arch.papr_enabled) {
+   emulated = EMULATE_FAIL;
+   break;
+   }
+
+   if (kvmppc_h_pr(vcpu, cmd) == EMULATE_DONE)
+   break;
+
+   run-papr_hcall.nr = cmd;
+   for (i = 0; i  9; ++i) {
+   ulong gpr = kvmppc_get_gpr(vcpu, 4 + i);
+   run-papr_hcall.args[i] = gpr;
+   }
+
+   emulated = EMULATE_DO_PAPR;
+   break;
+   }
+#endif
case OP_31_XOP_EIOIO:
break;
case OP_31_XOP_SLBMTE:
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 28d38ad..73ed11c 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -760,6 +760,11 @@ program_interrupt:
run-exit_reason = KVM_EXIT_MMIO;
r = RESUME_HOST_NV;
break;
+   case EMULATE_DO_PAPR:
+   run-exit_reason = KVM_EXIT_PAPR_HCALL;
+   vcpu-arch.hcall_needed = 1;
+   r = RESUME_HOST_NV;
+   break;
default:
BUG();
}
-- 
1.6.0.2

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/8] KVM: PPC: Only WARN on invalid emulation

2013-01-10 Thread Alexander Graf
When we hit an emulation result that we didn't expect, that is an error,
but it's nothing that warrants a BUG(), because it can be guest triggered.

So instead, let's only WARN() the user that this happened.

Signed-off-by: Alexander Graf ag...@suse.de
---
 arch/powerpc/kvm/powerpc.c |3 ++-
 1 files changed, 2 insertions(+), 1 deletions(-)

diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index be83fca..e2225e5 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -237,7 +237,8 @@ int kvmppc_emulate_mmio(struct kvm_run *run, struct 
kvm_vcpu *vcpu)
r = RESUME_HOST;
break;
default:
-   BUG();
+   WARN_ON(1);
+   r = RESUME_GUEST;
}
 
return r;
-- 
1.6.0.2

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/8] KVM: PPC: Fix SREGS documentation reference

2013-01-10 Thread Alexander Graf
From: Mihai Caraman mihai.cara...@freescale.com

Reflect the uapi folder change in SREGS API documentation.

Signed-off-by: Mihai Caraman mihai.cara...@freescale.com
Reviewed-by: Amos Kong kongjian...@gmail.com
Signed-off-by: Alexander Graf ag...@suse.de
---
 Documentation/virtual/kvm/api.txt |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/Documentation/virtual/kvm/api.txt 
b/Documentation/virtual/kvm/api.txt
index f2d6391..4fc2bfc 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -345,7 +345,7 @@ struct kvm_sregs {
__u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64];
 };
 
-/* ppc -- see arch/powerpc/include/asm/kvm.h */
+/* ppc -- see arch/powerpc/include/uapi/asm/kvm.h */
 
 interrupt_bitmap is a bitmap of pending external interrupts.  At most
 one bit may be set.  This interrupt has been acknowledged by the APIC
-- 
1.6.0.2

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] emulator: simple ALU tests

2013-01-10 Thread Marcelo Tosatti
On Fri, Jan 04, 2013 at 04:17:04PM +0200, Avi Kivity wrote:
 Signed-off-by: Avi Kivity avi.kiv...@gmail.com
 ---
  x86/emulator.c | 26 ++
  1 file changed, 26 insertions(+)

Applied, thanks.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 0/2] KVM: s390: Bugfixes for virtio-ccw.

2013-01-10 Thread Marcelo Tosatti
On Mon, Jan 07, 2013 at 03:51:50PM +0100, Cornelia Huck wrote:
 Hi,
 
 Christian discovered some problems with regard to serialization
 in the virtio-ccw guest driver. Per-device data structures might
 contain data obtained by channel programs issued later on, leading
 to confusing behaviour. We cannot rely on the common I/O layer
 serialization here.
 
 Rather than adding extra serialization, we decided to keep it simple
 with per-request allocated data structures and retries on busy.
 These patches have been run in our internal testing without problems
 for a bit now.
 
 Please apply to kvm-next.

Applied, thanks.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 01/12] tap: multiqueue support

2013-01-10 Thread Jason Wang
On 01/10/2013 06:28 PM, Stefan Hajnoczi wrote:
 On Fri, Dec 28, 2012 at 06:31:53PM +0800, Jason Wang wrote:

 Mainly suggestions to make the code easier to understand, but see the
 comment about the 1:1 queue/NetClientState model for a general issue
 with this approach.

Ok, thanks for the reviewing.
 Recently, linux support multiqueue tap which could let userspace call 
 TUNSETIFF
 for a signle device many times to create multiple file descriptors as
 s/signle/single/

 (Noting these if you respin.)

Sorry about this, will be careful.
 independent queues. User could also enable/disabe a specific queue through
 s/disabe/disable/

 TUNSETQUEUE.

 The patch adds the generic infrastructure to create multiqueue taps. To 
 achieve
 this a new parameter queues were introduced to specify how many queues were
 expected to be created for tap. The fd parameter were also changed to 
 support
 a list of file descriptors which could be used by management (such as 
 libvirt)
 to pass pre-created file descriptors (queues) to qemu.

 Each TAPState were still associated to a tap fd, which mean multiple 
 TAPStates
 were created when user needs multiqueue taps.

 Only linux part were implemented now, since it's the only OS that support
 multiqueue tap.

 Signed-off-by: Jason Wang jasow...@redhat.com
 ---
  net/tap-aix.c |   18 -
  net/tap-bsd.c |   18 -
  net/tap-haiku.c   |   18 -
  net/tap-linux.c   |   70 +++-
  net/tap-linux.h   |4 +
  net/tap-solaris.c |   18 -
  net/tap-win32.c   |   10 ++
  net/tap.c |  248 
 +
  net/tap.h |8 ++-
  qapi-schema.json  |5 +-
  10 files changed, 335 insertions(+), 82 deletions(-)
 This patch should be split up:
 1. linux-headers: import linux/if_tun.h multiqueue constants
 2. tap: add Linux multiqueue support (tap_open(), tap_fd_attach(), 
 tap_fd_detach())
 3. tap: queue attach/detach (tap_attach(), tap_detach())
 4. tap: split out net_init_one_tap() function (pure code motion, to make 
 later diffs easy to review)
 5. tap: add queues and multi-fd options 
 (net_init_tap()/net_init_one_tap() changes)

 Each commit description can explain how this works in more detail.  I
 think I've figured it out now but it would have helped to separate
 things out from the start.

Ok.
 diff --git a/net/tap-aix.c b/net/tap-aix.c
 index f27c177..f931ef3 100644
 --- a/net/tap-aix.c
 +++ b/net/tap-aix.c
 @@ -25,7 +25,8 @@
  #include net/tap.h
  #include stdio.h
  
 -int tap_open(char *ifname, int ifname_size, int *vnet_hdr, int 
 vnet_hdr_required)
 +int tap_open(char *ifname, int ifname_size, int *vnet_hdr,
 + int vnet_hdr_required, int mq_required)
  {
  fprintf(stderr, no tap on AIX\n);
  return -1;
 @@ -59,3 +60,18 @@ void tap_fd_set_offload(int fd, int csum, int tso4,
  int tso6, int ecn, int ufo)
  {
  }
 +
 +int tap_fd_attach(int fd)
 +{
 +return -1;
 +}
 +
 +int tap_fd_detach(int fd)
 +{
 +return -1;
 +}
 +
 +int tap_fd_ifname(int fd, char *ifname)
 +{
 +return -1;
 +}
 diff --git a/net/tap-bsd.c b/net/tap-bsd.c
 index a3b717d..07c287d 100644
 --- a/net/tap-bsd.c
 +++ b/net/tap-bsd.c
 @@ -33,7 +33,8 @@
  #include net/if_tap.h
  #endif
  
 -int tap_open(char *ifname, int ifname_size, int *vnet_hdr, int 
 vnet_hdr_required)
 +int tap_open(char *ifname, int ifname_size, int *vnet_hdr,
 + int vnet_hdr_required, int mq_required)
  {
  int fd;
  #ifdef TAPGIFNAME
 @@ -145,3 +146,18 @@ void tap_fd_set_offload(int fd, int csum, int tso4,
  int tso6, int ecn, int ufo)
  {
  }
 +
 +int tap_fd_attach(int fd)
 +{
 +return -1;
 +}
 +
 +int tap_fd_detach(int fd)
 +{
 +return -1;
 +}
 +
 +int tap_fd_ifname(int fd, char *ifname)
 +{
 +return -1;
 +}
 diff --git a/net/tap-haiku.c b/net/tap-haiku.c
 index 34739d1..62ab423 100644
 --- a/net/tap-haiku.c
 +++ b/net/tap-haiku.c
 @@ -25,7 +25,8 @@
  #include net/tap.h
  #include stdio.h
  
 -int tap_open(char *ifname, int ifname_size, int *vnet_hdr, int 
 vnet_hdr_required)
 +int tap_open(char *ifname, int ifname_size, int *vnet_hdr,
 + int vnet_hdr_required, int mq_required)
  {
  fprintf(stderr, no tap on Haiku\n);
  return -1;
 @@ -59,3 +60,18 @@ void tap_fd_set_offload(int fd, int csum, int tso4,
  int tso6, int ecn, int ufo)
  {
  }
 +
 +int tap_fd_attach(int fd)
 +{
 +return -1;
 +}
 +
 +int tap_fd_detach(int fd)
 +{
 +return -1;
 +}
 +
 +int tap_fd_ifname(int fd, char *ifname)
 +{
 +return -1;
 +}
 diff --git a/net/tap-linux.c b/net/tap-linux.c
 index c6521be..0854ef5 100644
 --- a/net/tap-linux.c
 +++ b/net/tap-linux.c
 @@ -35,7 +35,8 @@
  
  #define PATH_NET_TUN /dev/net/tun
  
 -int tap_open(char *ifname, int ifname_size, int *vnet_hdr, int 
 vnet_hdr_required)
 +int tap_open(char *ifname, int ifname_size, int *vnet_hdr,
 + int vnet_hdr_required, int mq_required)
  {
  

Re: [Qemu-devel] [PATCH 00/12] Multiqueue virtio-net

2013-01-10 Thread Jason Wang
On 01/10/2013 07:49 PM, Stefan Hajnoczi wrote:
 On Thu, Jan 10, 2013 at 05:34:14PM +0800, Jason Wang wrote:
 On 01/10/2013 04:44 PM, Stefan Hajnoczi wrote:
 On Wed, Jan 09, 2013 at 11:33:25PM +0800, Jason Wang wrote:
 On 01/09/2013 11:32 PM, Michael S. Tsirkin wrote:
 On Wed, Jan 09, 2013 at 03:29:24PM +0100, Stefan Hajnoczi wrote:
 On Fri, Dec 28, 2012 at 06:31:52PM +0800, Jason Wang wrote:
 Perf Numbers:

 Two Intel Xeon 5620 with direct connected intel 82599EB
 Host/Guest kernel: David net tree
 vhost enabled

 - lots of improvents of both latency and cpu utilization in 
 request-reponse test
 - get regression of guest sending small packets which because TCP tends 
 to batch
   less when the latency were improved

 1q/2q/4q
 TCP_RR
  size #sessions trans.rate  norm trans.rate  norm trans.rate  norm
 1 1 9393.26   595.64  9408.18   597.34  9375.19   584.12
 1 2072162.1   2214.24 129880.22 2456.13 196949.81 2298.13
 1 50107513.38 2653.99 139721.93 2490.58 259713.82 2873.57
 1 100   126734.63 2676.54 145553.5  2406.63 265252.68 2943
 64 19453.42   632.33  9371.37   616.13  9338.19   615.97
 64 20   70620.03  2093.68 125155.75 2409.15 191239.91 2253.32
 64 50   1069662448.29 146518.67 2514.47 242134.07 2720.91
 64 100  117046.35 2394.56 190153.09 2696.82 238881.29 2704.41
 256 1   8733.29   736.36  8701.07   680.83  8608.92   530.1
 256 20  69279.89  2274.45 115103.07 2299.76 144555.16 1963.53
 256 50  97676.02  2296.09 150719.57 2522.92 254510.5  3028.44
 256 100 150221.55 2949.56 197569.3  2790.92 300695.78 3494.83
 TCP_CRR
  size #sessions trans.rate  norm trans.rate  norm trans.rate  norm
 1 1 2848.37  163.41 2230.39  130.89 2013.09  120.47
 1 2023434.5  562.11 31057.43 531.07 49488.28 564.41
 1 5028514.88 582.17 40494.23 605.92 60113.35 654.97
 1 100   28827.22 584.73 48813.25 661.6  61783.62 676.56
 64 12780.08  159.4  2201.07  127.96 2006.8   117.63
 64 20   23318.51 564.47 30982.44 530.24 49734.95 566.13
 64 50   28585.72 582.54 40576.7  610.08 60167.89 656.56
 64 100  28747.37 584.17 49081.87 667.87 60612.94 662
 256 1   2772.08  160.51 2231.84  131.05 2003.62  113.45
 256 20  23086.35 559.8  30929.09 528.16 48454.9  555.22
 256 50  28354.7  579.85 40578.31 60760261.71 657.87
 256 100 28844.55 585.67 48541.86 659.08 61941.07 676.72
 TCP_STREAM guest receiving
  size #sessions throughput  norm throughput  norm throughput  norm
 1 1 16.27   1.33   16.11.12   16.13   0.99
 1 2 33.04   2.08   32.96   2.19   32.75   1.98
 1 4 66.62   6.83   68.35.56   66.14   2.65
 64 1896.55  56.67  914.02  58.14  898.9   61.56
 64 21830.46 91.02  1812.02 64.59  1835.57 66.26
 64 43626.61 142.55 3636.25 100.64 3607.46 75.03
 256 1   2619.49 131.23 2543.19 129.03 2618.69 132.39
 256 2   5136.58 203.02 5163.31 141.11 5236.51 149.4
 256 4   7063.99 242.83 9365.4  208.49 9421.03 159.94
 512 1   3592.43 165.24 3603.12 167.19 3552.5  169.57
 512 2   7042.62 246.59 7068.46 180.87 7258.52 186.3
 512 4   6996.08 241.49 9298.34 206.12 9418.52 159.33
 1024 1  4339.54 192.95 4370.2  191.92 4211.72 192.49
 1024 2  7439.45 254.77 9403.99 215.24 9120.82 222.67
 1024 4  7953.86 272.11 9403.87 208.23 9366.98 159.49
 4096 1  7696.28 272.04 7611.41 270.38 7778.71 267.76
 4096 2  7530.35 261.1  8905.43 246.27 8990.18 267.57
 4096 4  7121.6  247.02 9411.75 206.71 9654.96 184.67
 16384 1 7795.73 268.54 7780.94 267.2  7634.26 260.73
 16384 2 7436.57 255.81 9381.86 220.85 9392220.36
 16384 4 7199.07 247.81 9420.96 205.87 9373.69 159.57
 TCP_MAERTS guest sending
  size #sessions throughput  norm throughput  norm throughput  norm
 1 1 15.94   0.62   15.55   0.61   15.13   0.59
 1 2 36.11   0.83   32.46   0.69   32.28   0.69
 1 4 71.59   1  68.91   0.94   61.52   0.77
 64 1630.71  22.52  622.11  22.35  605.09  21.84
 64 21442.36 30.57  1292.15 25.82  1282.67 25.55
 64 43186.79 42.59  2844.96 36.03  2529.69 30.06
 256 1   1760.96 58.07  1738.44 57.43  1695.99 56.19
 256 2   4834.23 95.19  3524.85 64.21  3511.94 64.45
 256 4   9324.63 145.74 8956.49 116.39 6720.17 73.86
 512 1   2678.03 84.1   2630.68 82.93  2636.54 82.57
 512 2   9368.17 195.61 9408.82 204.53 5316.3  92.99
 512 4   9186.34 209.68 9358.72 183.82 9489.29 160.42
 1024 1  3620.71 109.88 3625.54 109.83 3606.61 112.35
 1024 2  9429258.32 7082.79 120.55 7403.53 134.78
 1024 4  9430.66 290.44 9499.29 232.31 9414.6  190.92
 4096 1  9339.28 296.48 9374.23 372.88 9348.76 298.49
 4096 2  9410.53 378.69 9412.61 286.18 9409.75 278.31
 4096 4  9487.35 374.1  9556.91 288.81 9441.94 221.64
 16384 1 9380.43 403.8  9379.78 399.13 9382.42 393.55
 16384 2 9367.69 406.93 9415.04 312.68 9409.29 300.9
 16384 4 9391.96 405.17 9695.12 310.54 9423.76 223.47
 Trying to understand the performance results:

 What is the host device configuration?  tap + bridge?
 Yes.
 Did you use host CPU affinity for the vhost threads?
 I use numactl to pin cpu threads and vhost threads in the same numa node.
 Can 

[RFC PATCH 0/2] make mac programming for virtio net more robust

2013-01-10 Thread akong
From: Amos Kong ak...@redhat.com

Currenly mac is programmed byte by byte. This means that we
have an intermediate step where mac is wrong. 

Second patch introduced a new vq control command to set mac
address in one time.

Amos Kong (2):
  move virtnet_send_command() above virtnet_set_mac_address()
  virtio-net: introduce a new control to set macaddr

 drivers/net/virtio_net.c| 105 ++--
 include/uapi/linux/virtio_net.h |   8 ++-
 2 files changed, 66 insertions(+), 47 deletions(-)

-- 
1.7.11.7

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC PATCH 1/2] move virtnet_send_command() above virtnet_set_mac_address()

2013-01-10 Thread akong
From: Amos Kong ak...@redhat.com

We will send vq command to set mac address in virtnet_set_mac_address()
a little fix of coding style

Signed-off-by: Amos Kong ak...@redhat.com
---
 drivers/net/virtio_net.c | 89 
 1 file changed, 44 insertions(+), 45 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index a6fcf15..395ab4f 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -753,6 +753,50 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct 
net_device *dev)
return NETDEV_TX_OK;
 }
 
+/*
+ * Send command via the control virtqueue and check status.  Commands
+ * supported by the hypervisor, as indicated by feature bits, should
+ * never fail unless improperly formated.
+ */
+static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd,
+struct scatterlist *data, int out, int in)
+{
+   struct scatterlist *s, sg[VIRTNET_SEND_COMMAND_SG_MAX + 2];
+   struct virtio_net_ctrl_hdr ctrl;
+   virtio_net_ctrl_ack status = ~0;
+   unsigned int tmp;
+   int i;
+
+   /* Caller should know better */
+   BUG_ON(!virtio_has_feature(vi-vdev, VIRTIO_NET_F_CTRL_VQ) ||
+   (out + in  VIRTNET_SEND_COMMAND_SG_MAX));
+
+   out++; /* Add header */
+   in++; /* Add return status */
+
+   ctrl.class = class;
+   ctrl.cmd = cmd;
+
+   sg_init_table(sg, out + in);
+
+   sg_set_buf(sg[0], ctrl, sizeof(ctrl));
+   for_each_sg(data, s, out + in - 2, i)
+   sg_set_buf(sg[i + 1], sg_virt(s), s-length);
+   sg_set_buf(sg[out + in - 1], status, sizeof(status));
+
+   BUG_ON(virtqueue_add_buf(vi-cvq, sg, out, in, vi, GFP_ATOMIC)  0);
+
+   virtqueue_kick(vi-cvq);
+
+   /* Spin for a response, the kick causes an ioport write, trapping
+* into the hypervisor, so the request should be handled immediately.
+*/
+   while (!virtqueue_get_buf(vi-cvq, tmp))
+   cpu_relax();
+
+   return status == VIRTIO_NET_OK;
+}
+
 static int virtnet_set_mac_address(struct net_device *dev, void *p)
 {
struct virtnet_info *vi = netdev_priv(dev);
@@ -819,51 +863,6 @@ static void virtnet_netpoll(struct net_device *dev)
 }
 #endif
 
-/*
- * Send command via the control virtqueue and check status.  Commands
- * supported by the hypervisor, as indicated by feature bits, should
- * never fail unless improperly formated.
- */
-static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd,
-struct scatterlist *data, int out, int in)
-{
-   struct scatterlist *s, sg[VIRTNET_SEND_COMMAND_SG_MAX + 2];
-   struct virtio_net_ctrl_hdr ctrl;
-   virtio_net_ctrl_ack status = ~0;
-   unsigned int tmp;
-   int i;
-
-   /* Caller should know better */
-   BUG_ON(!virtio_has_feature(vi-vdev, VIRTIO_NET_F_CTRL_VQ) ||
-   (out + in  VIRTNET_SEND_COMMAND_SG_MAX));
-
-   out++; /* Add header */
-   in++; /* Add return status */
-
-   ctrl.class = class;
-   ctrl.cmd = cmd;
-
-   sg_init_table(sg, out + in);
-
-   sg_set_buf(sg[0], ctrl, sizeof(ctrl));
-   for_each_sg(data, s, out + in - 2, i)
-   sg_set_buf(sg[i + 1], sg_virt(s), s-length);
-   sg_set_buf(sg[out + in - 1], status, sizeof(status));
-
-   BUG_ON(virtqueue_add_buf(vi-cvq, sg, out, in, vi, GFP_ATOMIC)  0);
-
-   virtqueue_kick(vi-cvq);
-
-   /*
-* Spin for a response, the kick causes an ioport write, trapping
-* into the hypervisor, so the request should be handled immediately.
-*/
-   while (!virtqueue_get_buf(vi-cvq, tmp))
-   cpu_relax();
-
-   return status == VIRTIO_NET_OK;
-}
-
 static void virtnet_ack_link_announce(struct virtnet_info *vi)
 {
rtnl_lock();
-- 
1.7.11.7

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC PATCH 2/2] virtio-net: introduce a new control to set macaddr

2013-01-10 Thread akong
From: Amos Kong ak...@redhat.com

Currently we write MAC address to pci config space byte by byte,
this means that we have an intermediate step where mac is wrong.
This patch introduced a new control command to set MAC address
in one time.

VIRTIO_NET_F_CTRL_MAC_ADDR is a new feature bit for compatibility.

Signed-off-by: Amos Kong ak...@redhat.com
---
 drivers/net/virtio_net.c| 16 +++-
 include/uapi/linux/virtio_net.h |  8 +++-
 2 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 395ab4f..ff22bcd 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -803,13 +803,26 @@ static int virtnet_set_mac_address(struct net_device 
*dev, void *p)
struct virtio_device *vdev = vi-vdev;
int ret;
 
+   struct scatterlist sg;
+
ret = eth_mac_addr(dev, p);
if (ret)
return ret;
 
-   if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC))
+   if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR)) {
+   /* Set MAC address by sending vq command */
+   sg_init_one(sg, dev-dev_addr, dev-addr_len);
+   virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
+VIRTIO_NET_CTRL_MAC_ADDR_SET,
+sg, 1, 0);
+   return 0;
+   }
+
+   if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC)) {
+   /* Set MAC address by writing config space */
vdev-config-set(vdev, offsetof(struct virtio_net_config, mac),
  dev-dev_addr, dev-addr_len);
+   }
 
return 0;
 }
@@ -1627,6 +1640,7 @@ static unsigned int features[] = {
VIRTIO_NET_F_MRG_RXBUF, VIRTIO_NET_F_STATUS, VIRTIO_NET_F_CTRL_VQ,
VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN,
VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_MQ,
+   VIRTIO_NET_F_CTRL_MAC_ADDR,
 };
 
 static struct virtio_driver virtio_net_driver = {
diff --git a/include/uapi/linux/virtio_net.h b/include/uapi/linux/virtio_net.h
index 848e358..a5a8c88 100644
--- a/include/uapi/linux/virtio_net.h
+++ b/include/uapi/linux/virtio_net.h
@@ -53,6 +53,7 @@
 * network */
 #define VIRTIO_NET_F_MQ22  /* Device supports Receive Flow
 * Steering */
+#define VIRTIO_NET_F_CTRL_MAC_ADDR 23  /* Set MAC address */
 
 #define VIRTIO_NET_S_LINK_UP   1   /* Link is up */
 #define VIRTIO_NET_S_ANNOUNCE  2   /* Announcement is needed */
@@ -127,7 +128,7 @@ typedef __u8 virtio_net_ctrl_ack;
  #define VIRTIO_NET_CTRL_RX_NOBCAST  5
 
 /*
- * Control the MAC filter table.
+ * Control the MAC
  *
  * The MAC filter table is managed by the hypervisor, the guest should
  * assume the size is infinite.  Filtering should be considered
@@ -140,6 +141,10 @@ typedef __u8 virtio_net_ctrl_ack;
  * first sg list contains unicast addresses, the second is for multicast.
  * This functionality is present if the VIRTIO_NET_F_CTRL_RX feature
  * is available.
+ *
+ * The ADDR_SET command requests one out scatterlist, it contains a
+ * 6 bytes MAC address. This functionality is present if the
+ * VIRTIO_NET_F_CTRL_MAC_ADDR feature is available.
  */
 struct virtio_net_ctrl_mac {
__u32 entries;
@@ -148,6 +153,7 @@ struct virtio_net_ctrl_mac {
 
 #define VIRTIO_NET_CTRL_MAC1
  #define VIRTIO_NET_CTRL_MAC_TABLE_SET0
+ #define VIRTIO_NET_CTRL_MAC_ADDR_SET 1
 
 /*
  * Control VLAN filtering
-- 
1.7.11.7

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC PATCH 1/2] move virtnet_send_command() above virtnet_set_mac_address()

2013-01-10 Thread Jason Wang
On 01/10/2013 10:45 PM, ak...@redhat.com wrote:
 From: Amos Kong ak...@redhat.com

 We will send vq command to set mac address in virtnet_set_mac_address()
 a little fix of coding style

Maybe what you need is just a forward declaration.

 Signed-off-by: Amos Kong ak...@redhat.com
 ---
  drivers/net/virtio_net.c | 89 
 
  1 file changed, 44 insertions(+), 45 deletions(-)

 diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
 index a6fcf15..395ab4f 100644
 --- a/drivers/net/virtio_net.c
 +++ b/drivers/net/virtio_net.c
 @@ -753,6 +753,50 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, 
 struct net_device *dev)
   return NETDEV_TX_OK;
  }
  
 +/*
 + * Send command via the control virtqueue and check status.  Commands
 + * supported by the hypervisor, as indicated by feature bits, should
 + * never fail unless improperly formated.
 + */
 +static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd,
 +  struct scatterlist *data, int out, int in)
 +{
 + struct scatterlist *s, sg[VIRTNET_SEND_COMMAND_SG_MAX + 2];
 + struct virtio_net_ctrl_hdr ctrl;
 + virtio_net_ctrl_ack status = ~0;
 + unsigned int tmp;
 + int i;
 +
 + /* Caller should know better */
 + BUG_ON(!virtio_has_feature(vi-vdev, VIRTIO_NET_F_CTRL_VQ) ||
 + (out + in  VIRTNET_SEND_COMMAND_SG_MAX));
 +
 + out++; /* Add header */
 + in++; /* Add return status */
 +
 + ctrl.class = class;
 + ctrl.cmd = cmd;
 +
 + sg_init_table(sg, out + in);
 +
 + sg_set_buf(sg[0], ctrl, sizeof(ctrl));
 + for_each_sg(data, s, out + in - 2, i)
 + sg_set_buf(sg[i + 1], sg_virt(s), s-length);
 + sg_set_buf(sg[out + in - 1], status, sizeof(status));
 +
 + BUG_ON(virtqueue_add_buf(vi-cvq, sg, out, in, vi, GFP_ATOMIC)  0);
 +
 + virtqueue_kick(vi-cvq);
 +
 + /* Spin for a response, the kick causes an ioport write, trapping
 +  * into the hypervisor, so the request should be handled immediately.
 +  */
 + while (!virtqueue_get_buf(vi-cvq, tmp))
 + cpu_relax();
 +
 + return status == VIRTIO_NET_OK;
 +}
 +
  static int virtnet_set_mac_address(struct net_device *dev, void *p)
  {
   struct virtnet_info *vi = netdev_priv(dev);
 @@ -819,51 +863,6 @@ static void virtnet_netpoll(struct net_device *dev)
  }
  #endif
  
 -/*
 - * Send command via the control virtqueue and check status.  Commands
 - * supported by the hypervisor, as indicated by feature bits, should
 - * never fail unless improperly formated.
 - */
 -static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd,
 -  struct scatterlist *data, int out, int in)
 -{
 - struct scatterlist *s, sg[VIRTNET_SEND_COMMAND_SG_MAX + 2];
 - struct virtio_net_ctrl_hdr ctrl;
 - virtio_net_ctrl_ack status = ~0;
 - unsigned int tmp;
 - int i;
 -
 - /* Caller should know better */
 - BUG_ON(!virtio_has_feature(vi-vdev, VIRTIO_NET_F_CTRL_VQ) ||
 - (out + in  VIRTNET_SEND_COMMAND_SG_MAX));
 -
 - out++; /* Add header */
 - in++; /* Add return status */
 -
 - ctrl.class = class;
 - ctrl.cmd = cmd;
 -
 - sg_init_table(sg, out + in);
 -
 - sg_set_buf(sg[0], ctrl, sizeof(ctrl));
 - for_each_sg(data, s, out + in - 2, i)
 - sg_set_buf(sg[i + 1], sg_virt(s), s-length);
 - sg_set_buf(sg[out + in - 1], status, sizeof(status));
 -
 - BUG_ON(virtqueue_add_buf(vi-cvq, sg, out, in, vi, GFP_ATOMIC)  0);
 -
 - virtqueue_kick(vi-cvq);
 -
 - /*
 -  * Spin for a response, the kick causes an ioport write, trapping
 -  * into the hypervisor, so the request should be handled immediately.
 -  */
 - while (!virtqueue_get_buf(vi-cvq, tmp))
 - cpu_relax();
 -
 - return status == VIRTIO_NET_OK;
 -}
 -
  static void virtnet_ack_link_announce(struct virtnet_info *vi)
  {
   rtnl_lock();

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC PATCH] virtio-net: introduce a new macaddr control

2013-01-10 Thread akong
From: Amos Kong ak...@redhat.com

In virtio-net guest driver, currently we write MAC address to
pci config space byte by byte, this means that we have an
intermediate step where mac is wrong. This patch introduced
a new control command to set MAC address in one time.

VIRTIO_NET_F_CTRL_MAC_ADDR is a new feature bit for compatibility.

Signed-off-by: Amos Kong ak...@redhat.com
---
 hw/virtio-net.c | 9 +
 hw/virtio-net.h | 9 -
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/hw/virtio-net.c b/hw/virtio-net.c
index dc7c6d6..fc11106 100644
--- a/hw/virtio-net.c
+++ b/hw/virtio-net.c
@@ -247,6 +247,7 @@ static uint32_t virtio_net_get_features(VirtIODevice *vdev, 
uint32_t features)
 VirtIONet *n = to_virtio_net(vdev);
 
 features |= (1  VIRTIO_NET_F_MAC);
+features |= (1  VIRTIO_NET_F_CTRL_MAC_ADDR);
 
 if (!peer_has_vnet_hdr(n)) {
 features = ~(0x1  VIRTIO_NET_F_CSUM);
@@ -282,6 +283,7 @@ static uint32_t virtio_net_bad_features(VirtIODevice *vdev)
 /* Linux kernel 2.6.25.  It understood MAC (as everyone must),
  * but also these: */
 features |= (1  VIRTIO_NET_F_MAC);
+features |= (1  VIRTIO_NET_F_CTRL_MAC_ADDR);
 features |= (1  VIRTIO_NET_F_CSUM);
 features |= (1  VIRTIO_NET_F_HOST_TSO4);
 features |= (1  VIRTIO_NET_F_HOST_TSO6);
@@ -349,6 +351,13 @@ static int virtio_net_handle_mac(VirtIONet *n, uint8_t cmd,
 {
 struct virtio_net_ctrl_mac mac_data;
 
+if (cmd == VIRTIO_NET_CTRL_MAC_ADDR_SET  elem-out_num == 2) {
+/* Set MAC address */
+memcpy(n-mac, elem-out_sg[1].iov_base, elem-out_sg[1].iov_len);
+qemu_format_nic_info_str(n-nic-nc, n-mac);
+return VIRTIO_NET_OK;
+}
+
 if (cmd != VIRTIO_NET_CTRL_MAC_TABLE_SET || elem-out_num != 3 ||
 elem-out_sg[1].iov_len  sizeof(mac_data) ||
 elem-out_sg[2].iov_len  sizeof(mac_data))
diff --git a/hw/virtio-net.h b/hw/virtio-net.h
index d46fb98..9394cc0 100644
--- a/hw/virtio-net.h
+++ b/hw/virtio-net.h
@@ -44,6 +44,8 @@
 #define VIRTIO_NET_F_CTRL_VLAN  19  /* Control channel VLAN filtering */
 #define VIRTIO_NET_F_CTRL_RX_EXTRA 20   /* Extra RX mode control support */
 
+#define VIRTIO_NET_F_CTRL_MAC_ADDR   23 /* Set MAC address */
+
 #define VIRTIO_NET_S_LINK_UP1   /* Link is up */
 
 #define TX_TIMER_INTERVAL 15 /* 150 us */
@@ -106,7 +108,7 @@ typedef uint8_t virtio_net_ctrl_ack;
  #define VIRTIO_NET_CTRL_RX_MODE_NOBCAST  5
 
 /*
- * Control the MAC filter table.
+ * Control the MAC
  *
  * The MAC filter table is managed by the hypervisor, the guest should
  * assume the size is infinite.  Filtering should be considered
@@ -119,6 +121,10 @@ typedef uint8_t virtio_net_ctrl_ack;
  * first sg list contains unicast addresses, the second is for multicast.
  * This functionality is present if the VIRTIO_NET_F_CTRL_RX feature
  * is available.
+ *
+ * The ADDR_SET command requests one out scatterlist, it contains a
+ * 6 bytes MAC address. This functionality is present if the
+ * VIRTIO_NET_F_CTRL_MAC_ADDR feature is available.
  */
 struct virtio_net_ctrl_mac {
 uint32_t entries;
@@ -126,6 +132,7 @@ struct virtio_net_ctrl_mac {
 };
 #define VIRTIO_NET_CTRL_MAC1
  #define VIRTIO_NET_CTRL_MAC_TABLE_SET0
+ #define VIRTIO_NET_CTRL_MAC_ADDR_SET 1
 
 /*
  * Control VLAN filtering
-- 
1.7.11.7

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] [RFC PATCH 2/2] virtio-net: introduce a new control to set macaddr

2013-01-10 Thread Jason Wang
On 01/10/2013 10:45 PM, ak...@redhat.com wrote:
 From: Amos Kong ak...@redhat.com

 Currently we write MAC address to pci config space byte by byte,
 this means that we have an intermediate step where mac is wrong.
 This patch introduced a new control command to set MAC address
 in one time.

 VIRTIO_NET_F_CTRL_MAC_ADDR is a new feature bit for compatibility.

 Signed-off-by: Amos Kong ak...@redhat.com
 ---
  drivers/net/virtio_net.c| 16 +++-
  include/uapi/linux/virtio_net.h |  8 +++-
  2 files changed, 22 insertions(+), 2 deletions(-)

 diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
 index 395ab4f..ff22bcd 100644
 --- a/drivers/net/virtio_net.c
 +++ b/drivers/net/virtio_net.c
 @@ -803,13 +803,26 @@ static int virtnet_set_mac_address(struct net_device 
 *dev, void *p)
   struct virtio_device *vdev = vi-vdev;
   int ret;
  
 + struct scatterlist sg;
 +
   ret = eth_mac_addr(dev, p);
   if (ret)
   return ret;
  
 - if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC))
 + if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR)) {
 + /* Set MAC address by sending vq command */
 + sg_init_one(sg, dev-dev_addr, dev-addr_len);
 + virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
 +  VIRTIO_NET_CTRL_MAC_ADDR_SET,
 +  sg, 1, 0);
 + return 0;
 + }
 +

Better to check the return of virtnet_send_command() and give a warn
like other command. Btw, need we fail back to try the old way then?
 + if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC)) {
 + /* Set MAC address by writing config space */
   vdev-config-set(vdev, offsetof(struct virtio_net_config, mac),
 dev-dev_addr, dev-addr_len);
 + }
  
   return 0;
  }
 @@ -1627,6 +1640,7 @@ static unsigned int features[] = {
   VIRTIO_NET_F_MRG_RXBUF, VIRTIO_NET_F_STATUS, VIRTIO_NET_F_CTRL_VQ,
   VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN,
   VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_MQ,
 + VIRTIO_NET_F_CTRL_MAC_ADDR,
  };
  
  static struct virtio_driver virtio_net_driver = {
 diff --git a/include/uapi/linux/virtio_net.h b/include/uapi/linux/virtio_net.h
 index 848e358..a5a8c88 100644
 --- a/include/uapi/linux/virtio_net.h
 +++ b/include/uapi/linux/virtio_net.h
 @@ -53,6 +53,7 @@
* network */
  #define VIRTIO_NET_F_MQ  22  /* Device supports Receive Flow
* Steering */
 +#define VIRTIO_NET_F_CTRL_MAC_ADDR 23/* Set MAC address */
  
  #define VIRTIO_NET_S_LINK_UP 1   /* Link is up */
  #define VIRTIO_NET_S_ANNOUNCE2   /* Announcement is needed */
 @@ -127,7 +128,7 @@ typedef __u8 virtio_net_ctrl_ack;
   #define VIRTIO_NET_CTRL_RX_NOBCAST  5
  
  /*
 - * Control the MAC filter table.
 + * Control the MAC
   *
   * The MAC filter table is managed by the hypervisor, the guest should
   * assume the size is infinite.  Filtering should be considered
 @@ -140,6 +141,10 @@ typedef __u8 virtio_net_ctrl_ack;
   * first sg list contains unicast addresses, the second is for multicast.
   * This functionality is present if the VIRTIO_NET_F_CTRL_RX feature
   * is available.
 + *
 + * The ADDR_SET command requests one out scatterlist, it contains a
 + * 6 bytes MAC address. This functionality is present if the
 + * VIRTIO_NET_F_CTRL_MAC_ADDR feature is available.
   */
  struct virtio_net_ctrl_mac {
   __u32 entries;
 @@ -148,6 +153,7 @@ struct virtio_net_ctrl_mac {
  
  #define VIRTIO_NET_CTRL_MAC1
   #define VIRTIO_NET_CTRL_MAC_TABLE_SET0
 + #define VIRTIO_NET_CTRL_MAC_ADDR_SET 1
  
  /*
   * Control VLAN filtering

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC PATCH 1/2] move virtnet_send_command() above virtnet_set_mac_address()

2013-01-10 Thread Michael S. Tsirkin
On Thu, Jan 10, 2013 at 10:51:52PM +0800, Jason Wang wrote:
 On 01/10/2013 10:45 PM, ak...@redhat.com wrote:
  From: Amos Kong ak...@redhat.com
 
  We will send vq command to set mac address in virtnet_set_mac_address()
  a little fix of coding style
 
 Maybe what you need is just a forward declaration.


Nah functions should be ordered sensibly.

 
  Signed-off-by: Amos Kong ak...@redhat.com
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC PATCH 0/2] make mac programming for virtio net more robust

2013-01-10 Thread Amos Kong
On Thu, Jan 10, 2013 at 10:45:39PM +0800, ak...@redhat.com wrote:
 From: Amos Kong ak...@redhat.com
 
 Currenly mac is programmed byte by byte. This means that we
 have an intermediate step where mac is wrong. 
 
 Second patch introduced a new vq control command to set mac
 address in one time.

I just posted RFC patches (kernel  qemu) out,
specifiction update will be sent when the solution
is accepted.

MST has another solution:
1. add a feature to disable mac address in pci space
2. introduce a new vq control cmd to add new mac address
   to mac filter table. (no long check n-mac in receiver_filter())

Welcome to give comments, thanks!
Amos
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC PATCH 2/2] virtio-net: introduce a new control to set macaddr

2013-01-10 Thread Michael S. Tsirkin
On Thu, Jan 10, 2013 at 10:45:41PM +0800, ak...@redhat.com wrote:
 From: Amos Kong ak...@redhat.com
 
 Currently we write MAC address to pci config space byte by byte,
 this means that we have an intermediate step where mac is wrong.
 This patch introduced a new control command to set MAC address
 in one time.
 
 VIRTIO_NET_F_CTRL_MAC_ADDR is a new feature bit for compatibility.
 
 Signed-off-by: Amos Kong ak...@redhat.com
 ---
  drivers/net/virtio_net.c| 16 +++-
  include/uapi/linux/virtio_net.h |  8 +++-
  2 files changed, 22 insertions(+), 2 deletions(-)
 
 diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
 index 395ab4f..ff22bcd 100644
 --- a/drivers/net/virtio_net.c
 +++ b/drivers/net/virtio_net.c
 @@ -803,13 +803,26 @@ static int virtnet_set_mac_address(struct net_device 
 *dev, void *p)
   struct virtio_device *vdev = vi-vdev;
   int ret;
  
 + struct scatterlist sg;
 +
   ret = eth_mac_addr(dev, p);
   if (ret)
   return ret;
  
 - if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC))
 + if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR)) {
 + /* Set MAC address by sending vq command */
 + sg_init_one(sg, dev-dev_addr, dev-addr_len);
 + virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
 +  VIRTIO_NET_CTRL_MAC_ADDR_SET,
 +  sg, 1, 0);

I think we should check status and return an error if the command failed.
Also, delay eth_mac_addr until after it passes.

 + return 0;
 + }
 +
 + if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC)) {
 + /* Set MAC address by writing config space */
   vdev-config-set(vdev, offsetof(struct virtio_net_config, mac),
 dev-dev_addr, dev-addr_len);
 + }
  

By the way, why don't we fail the command if VIRTIO_NET_F_MAC is off?
Rusty?

   return 0;
  }
 @@ -1627,6 +1640,7 @@ static unsigned int features[] = {
   VIRTIO_NET_F_MRG_RXBUF, VIRTIO_NET_F_STATUS, VIRTIO_NET_F_CTRL_VQ,
   VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN,
   VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_MQ,
 + VIRTIO_NET_F_CTRL_MAC_ADDR,
  };
  
  static struct virtio_driver virtio_net_driver = {
 diff --git a/include/uapi/linux/virtio_net.h b/include/uapi/linux/virtio_net.h
 index 848e358..a5a8c88 100644
 --- a/include/uapi/linux/virtio_net.h
 +++ b/include/uapi/linux/virtio_net.h
 @@ -53,6 +53,7 @@
* network */
  #define VIRTIO_NET_F_MQ  22  /* Device supports Receive Flow
* Steering */
 +#define VIRTIO_NET_F_CTRL_MAC_ADDR 23/* Set MAC address */
  
  #define VIRTIO_NET_S_LINK_UP 1   /* Link is up */
  #define VIRTIO_NET_S_ANNOUNCE2   /* Announcement is needed */
 @@ -127,7 +128,7 @@ typedef __u8 virtio_net_ctrl_ack;
   #define VIRTIO_NET_CTRL_RX_NOBCAST  5
  
  /*
 - * Control the MAC filter table.
 + * Control the MAC
   *
   * The MAC filter table is managed by the hypervisor, the guest should
   * assume the size is infinite.  Filtering should be considered
 @@ -140,6 +141,10 @@ typedef __u8 virtio_net_ctrl_ack;
   * first sg list contains unicast addresses, the second is for multicast.
   * This functionality is present if the VIRTIO_NET_F_CTRL_RX feature
   * is available.
 + *
 + * The ADDR_SET command requests one out scatterlist, it contains a
 + * 6 bytes MAC address. This functionality is present if the
 + * VIRTIO_NET_F_CTRL_MAC_ADDR feature is available.
   */
  struct virtio_net_ctrl_mac {
   __u32 entries;
 @@ -148,6 +153,7 @@ struct virtio_net_ctrl_mac {
  
  #define VIRTIO_NET_CTRL_MAC1
   #define VIRTIO_NET_CTRL_MAC_TABLE_SET0
 + #define VIRTIO_NET_CTRL_MAC_ADDR_SET 1
  
  /*
   * Control VLAN filtering
 -- 
 1.7.11.7
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC PATCH 0/2] make mac programming for virtio net more robust

2013-01-10 Thread Michael S. Tsirkin
On Thu, Jan 10, 2013 at 10:45:39PM +0800, ak...@redhat.com wrote:
 From: Amos Kong ak...@redhat.com
 
 Currenly mac is programmed byte by byte. This means that we
 have an intermediate step where mac is wrong. 
 
 Second patch introduced a new vq control command to set mac
 address in one time.

As you mention we could alternatively do it without
new commands, simply add a feature bit that says that MACs are
in the mac table.
This would be a much bigger patch, and I'm fine with either way.
Rusty what do you think?

 Amos Kong (2):
   move virtnet_send_command() above virtnet_set_mac_address()
   virtio-net: introduce a new control to set macaddr
 
  drivers/net/virtio_net.c| 105 
 ++--
  include/uapi/linux/virtio_net.h |   8 ++-
  2 files changed, 66 insertions(+), 47 deletions(-)
 
 -- 
 1.7.11.7
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 0/2] KVM/ARM implementation of PSCI

2013-01-10 Thread Marc Zyngier
This is an implementation of the Power State Coordination Interface,
as described there (registration required):
http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.den0022a/index.html

It allows CPUs to be powered on, off, suspended or migrated. This
series is only concerned with power-{on,off}. KVM Tool uses it to boot
SMP VMs. This is also supported by the mach-virt platform:

git://git.kernel.org/pub/scm/linux/kernel/git/will/linux.git virt/mach

Tested on the arm64 model, based on kvm-arm-master as of today.

From v1:
- Documentation
- Move the one-time init to a separate function
- One mask fits all
- Use wait_event_interruptible instead of the open coded version

Marc Zyngier (2):
  ARM: KVM: move one-time init to its own function
  ARM: KVM: Power State Coordination Interface implementation

 Documentation/virtual/kvm/api.txt  |  4 ++
 arch/arm/include/asm/kvm_emulate.h |  5 +++
 arch/arm/include/asm/kvm_host.h|  5 ++-
 arch/arm/include/asm/kvm_psci.h| 23 ++
 arch/arm/include/uapi/asm/kvm.h| 16 +++
 arch/arm/kvm/Makefile  |  2 +-
 arch/arm/kvm/arm.c | 59 +
 arch/arm/kvm/psci.c| 90 ++
 include/uapi/linux/kvm.h   |  1 +
 9 files changed, 184 insertions(+), 21 deletions(-)
 create mode 100644 arch/arm/include/asm/kvm_psci.h
 create mode 100644 arch/arm/kvm/psci.c

-- 
1.8.1


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 1/2] ARM: KVM: move one-time init to its own function

2013-01-10 Thread Marc Zyngier
As more code to be run only once per vcpu is expected, move
the VGIC init code into a separate function.

Signed-off-by: Marc Zyngier marc.zyng...@arm.com
---
 arch/arm/include/asm/kvm_host.h |  3 +++
 arch/arm/kvm/arm.c  | 33 +++--
 2 files changed, 26 insertions(+), 10 deletions(-)

diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
index 334b81d..14a71bb 100644
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -127,6 +127,9 @@ struct kvm_vcpu_arch {
 
/* Cache some mmu pages needed inside spinlock regions */
struct kvm_mmu_memory_cache mmu_page_cache;
+
+   /* Detect first run of a vcpu */
+   bool has_run_once;
 };
 
 struct kvm_vm_stat {
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index dc67a3c..a28e49d 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -643,6 +643,26 @@ static int handle_exit(struct kvm_vcpu *vcpu, struct 
kvm_run *run,
}
 }
 
+static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu)
+{
+   if (likely(vcpu-arch.has_run_once))
+   return 0;
+
+   vcpu-arch.has_run_once = true;
+
+   /*
+* Initialize the VGIC before running a vcpu the first time on
+* this VM.
+*/
+   if (irqchip_in_kernel(vcpu-kvm)  !vgic_initialized(vcpu-kvm)) {
+   int ret = kvm_vgic_init(vcpu-kvm);
+   if (ret)
+   return ret;
+   }
+
+   return 0;
+}
+
 /**
  * kvm_arch_vcpu_ioctl_run - the main VCPU run function to execute guest code
  * @vcpu:  The VCPU pointer
@@ -663,16 +683,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct 
kvm_run *run)
if (unlikely(vcpu-arch.target  0))
return -ENOEXEC;
 
-   /*
-* Initialize the VGIC before running a vcpu the first time on
-* this VM.
-*/
-   if (irqchip_in_kernel(vcpu-kvm) 
-   unlikely(!vgic_initialized(vcpu-kvm))) {
-   ret = kvm_vgic_init(vcpu-kvm);
-   if (ret)
-   return ret;
-   }
+   ret = kvm_vcpu_first_run_init(vcpu);
+   if (ret)
+   return ret;
 
if (run-exit_reason == KVM_EXIT_MMIO) {
ret = kvm_handle_mmio_return(vcpu, vcpu-run);
-- 
1.8.1


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 2/2] ARM: KVM: Power State Coordination Interface implementation

2013-01-10 Thread Marc Zyngier
Implement the PSCI specification (ARM DEN 0022A) to control
virtual CPUs being powered on or off.

PSCI/KVM is detected using the KVM_CAP_ARM_PSCI capability.

A virtual CPU can now be initialized in a powered off state,
using the KVM_ARM_VCPU_POWER_OFF feature flag.

The guest can use either SMC or HVC to execute a PSCI function.

Signed-off-by: Marc Zyngier marc.zyng...@arm.com
---
 Documentation/virtual/kvm/api.txt  |  4 ++
 arch/arm/include/asm/kvm_emulate.h |  5 +++
 arch/arm/include/asm/kvm_host.h|  2 +-
 arch/arm/include/asm/kvm_psci.h| 23 ++
 arch/arm/include/uapi/asm/kvm.h| 16 +++
 arch/arm/kvm/Makefile  |  2 +-
 arch/arm/kvm/arm.c | 26 +++
 arch/arm/kvm/psci.c| 90 ++
 include/uapi/linux/kvm.h   |  1 +
 9 files changed, 158 insertions(+), 11 deletions(-)
 create mode 100644 arch/arm/include/asm/kvm_psci.h
 create mode 100644 arch/arm/kvm/psci.c

diff --git a/Documentation/virtual/kvm/api.txt 
b/Documentation/virtual/kvm/api.txt
index 497fd48..e0fa0ea 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -2185,6 +2185,10 @@ return ENOEXEC for that vcpu.
 Note that because some registers reflect machine topology, all vcpus
 should be created before this ioctl is invoked.
 
+Possible features:
+   - KVM_ARM_VCPU_POWER_OFF: Starts the CPU in a power-off state.
+ Depends on KVM_CAP_ARM_PSCI.
+
 
 4.78 KVM_GET_REG_LIST
 
diff --git a/arch/arm/include/asm/kvm_emulate.h 
b/arch/arm/include/asm/kvm_emulate.h
index 375795b..eb07342 100644
--- a/arch/arm/include/asm/kvm_emulate.h
+++ b/arch/arm/include/asm/kvm_emulate.h
@@ -34,6 +34,11 @@ void kvm_inject_undefined(struct kvm_vcpu *vcpu);
 void kvm_inject_dabt(struct kvm_vcpu *vcpu, unsigned long addr);
 void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr);
 
+static inline bool vcpu_mode_is_32bit(struct kvm_vcpu *vcpu)
+{
+   return 1;
+}
+
 static inline u32 *vcpu_pc(struct kvm_vcpu *vcpu)
 {
return (u32 *)vcpu-arch.regs.usr_regs.ARM_pc;
diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
index 14a71bb..b8eb3e0 100644
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -31,7 +31,7 @@
 #define KVM_COALESCED_MMIO_PAGE_OFFSET 1
 #define KVM_HAVE_ONE_REG
 
-#define KVM_VCPU_MAX_FEATURES 0
+#define KVM_VCPU_MAX_FEATURES 1
 
 /* We don't currently support large pages. */
 #define KVM_HPAGE_GFN_SHIFT(x) 0
diff --git a/arch/arm/include/asm/kvm_psci.h b/arch/arm/include/asm/kvm_psci.h
new file mode 100644
index 000..992d7f1
--- /dev/null
+++ b/arch/arm/include/asm/kvm_psci.h
@@ -0,0 +1,23 @@
+/*
+ * Copyright (C) 2012 - ARM Ltd
+ * Author: Marc Zyngier marc.zyng...@arm.com
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see http://www.gnu.org/licenses/.
+ */
+
+#ifndef __ARM_KVM_PSCI_H__
+#define __ARM_KVM_PSCI_H__
+
+int kvm_psci_call(struct kvm_vcpu *vcpu);
+
+#endif /* __ARM_KVM_PSCI_H__ */
diff --git a/arch/arm/include/uapi/asm/kvm.h b/arch/arm/include/uapi/asm/kvm.h
index 94e893b..972b90d 100644
--- a/arch/arm/include/uapi/asm/kvm.h
+++ b/arch/arm/include/uapi/asm/kvm.h
@@ -81,6 +81,8 @@ struct kvm_regs {
 #define KVM_VGIC_V2_DIST_SIZE  0x1000
 #define KVM_VGIC_V2_CPU_SIZE   0x2000
 
+#define KVM_ARM_VCPU_POWER_OFF 0 /* CPU is started in OFF state */
+
 struct kvm_vcpu_init {
__u32 target;
__u32 features[7];
@@ -161,4 +163,18 @@ struct kvm_arch_memory_slot {
 /* Highest supported SPI, from VGIC_NR_IRQS */
 #define KVM_ARM_IRQ_GIC_MAX127
 
+/* PSCI interface */
+#define KVM_PSCI_FN_BASE   0x95c1ba5e
+#define KVM_PSCI_FN(n) (KVM_PSCI_FN_BASE + (n))
+
+#define KVM_PSCI_FN_CPU_SUSPENDKVM_PSCI_FN(0)
+#define KVM_PSCI_FN_CPU_OFFKVM_PSCI_FN(1)
+#define KVM_PSCI_FN_CPU_ON KVM_PSCI_FN(2)
+#define KVM_PSCI_FN_MIGRATEKVM_PSCI_FN(3)
+
+#define KVM_PSCI_RET_SUCCESS   0
+#define KVM_PSCI_RET_NI((unsigned long)-1)
+#define KVM_PSCI_RET_INVAL ((unsigned long)-2)
+#define KVM_PSCI_RET_DENIED((unsigned long)-3)
+
 #endif /* __ARM_KVM_H__ */
diff --git a/arch/arm/kvm/Makefile b/arch/arm/kvm/Makefile
index 43c4cad..6a2d571 100644
--- a/arch/arm/kvm/Makefile
+++ b/arch/arm/kvm/Makefile
@@ -18,6 +18,6 @@ kvm-arm-y = $(addprefix ../../../virt/kvm/, 

Re: [PATCH v5 5/5] KVM: x86: improve reexecute_instruction

2013-01-10 Thread Marcelo Tosatti
On Tue, Jan 08, 2013 at 02:38:36PM +0800, Xiao Guangrong wrote:
 The current reexecute_instruction can not well detect the failed instruction
 emulation. It allows guest to retry all the instructions except it accesses
 on error pfn
 
 For example, some cases are nested-write-protect - if the page we want to
 write is used as PDE but it chains to itself. Under this case, we should
 stop the emulation and report the case to userspace
 
 Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com
 ---
  arch/x86/include/asm/kvm_host.h |7 +++
  arch/x86/kvm/paging_tmpl.h  |   27 ---
  arch/x86/kvm/x86.c  |8 +++-
  3 files changed, 34 insertions(+), 8 deletions(-)
 
 diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
 index c431b33..d6ab8d2 100644
 --- a/arch/x86/include/asm/kvm_host.h
 +++ b/arch/x86/include/asm/kvm_host.h
 @@ -502,6 +502,13 @@ struct kvm_vcpu_arch {
   u64 msr_val;
   struct gfn_to_hva_cache data;
   } pv_eoi;
 +
 + /*
 +  * Indicate whether the access faults on its page table in guest
 +  * which is set when fix page fault and used to detect unhandeable
 +  * instruction.
 +  */
 + bool write_fault_to_shadow_pgtable;
  };
 
  struct kvm_lpage_info {
 diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
 index 67b390d..df50560 100644
 --- a/arch/x86/kvm/paging_tmpl.h
 +++ b/arch/x86/kvm/paging_tmpl.h
 @@ -497,26 +497,34 @@ out_gpte_changed:
   * created when kvm establishes shadow page table that stop kvm using large
   * page size. Do it early can avoid unnecessary #PF and emulation.
   *
 + * @write_fault_to_shadow_pgtable will return true if the fault gfn is
 + * currently used as its page table.
 + *
   * Note: the PDPT page table is not checked for PAE-32 bit guest. It is ok
   * since the PDPT is always shadowed, that means, we can not use large page
   * size to map the gfn which is used as PDPT.
   */
  static bool
  FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu,
 -   struct guest_walker *walker, int user_fault)
 +   struct guest_walker *walker, int user_fault,
 +   bool *write_fault_to_shadow_pgtable)
  {
   int level;
   gfn_t mask = ~(KVM_PAGES_PER_HPAGE(walker-level) - 1);
 + bool self_changed = false;
 
   if (!(walker-pte_access  ACC_WRITE_MASK ||
 (!is_write_protection(vcpu)  !user_fault)))
   return false;
 
 - for (level = walker-level; level = walker-max_level; level++)
 - if (!((walker-gfn ^ walker-table_gfn[level - 1])  mask))
 - return true;
 + for (level = walker-level; level = walker-max_level; level++) {
 + gfn_t gfn = walker-gfn ^ walker-table_gfn[level - 1];
 +
 + self_changed |= !(gfn  mask);
 + *write_fault_to_shadow_pgtable |= !gfn;
 + }
 
 - return false;
 + return self_changed;
  }
 
  /*
 @@ -544,7 +552,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t 
 addr, u32 error_code,
   int level = PT_PAGE_TABLE_LEVEL;
   int force_pt_level;
   unsigned long mmu_seq;
 - bool map_writable;
 + bool map_writable, is_self_change_mapping;
 
   pgprintk(%s: addr %lx err %x\n, __func__, addr, error_code);
 
 @@ -572,9 +580,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, 
 gva_t addr, u32 error_code,
   return 0;
   }
 
 + vcpu-arch.write_fault_to_shadow_pgtable = false;
 +
 + is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu,
 +   walker, user_fault, vcpu-arch.write_fault_to_shadow_pgtable);
 +
   if (walker.level = PT_DIRECTORY_LEVEL)
   force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn)
 -|| FNAME(is_self_change_mapping)(vcpu, walker, user_fault);
 +|| is_self_change_mapping;
   else
   force_pt_level = 1;
   if (!force_pt_level) {
 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
 index 6f13e03..2957012 100644
 --- a/arch/x86/kvm/x86.c
 +++ b/arch/x86/kvm/x86.c
 @@ -4810,7 +4810,13 @@ static bool reexecute_instruction(struct kvm_vcpu 
 *vcpu, gva_t cr2)
* guest to let CPU execute the instruction.
*/
   kvm_mmu_unprotect_page(vcpu-kvm, gpa_to_gfn(gpa));
 - return true;
 +
 + /*
 +  * If the access faults on its page table, it can not
 +  * be fixed by unprotecting shadow page and it should
 +  * be reported to userspace.
 +  */
 + return !vcpu-arch.write_fault_to_shadow_pgtable;
  }
 
  static bool retry_instruction(struct x86_emulate_ctxt *ctxt,

Also should make sure vcpu-arch.write_fault_to_shadow_pgtable is never
reused. Say, clean when exiting x86_emulate_instruction?

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More 

Re: [PATCH v5 5/5] KVM: x86: improve reexecute_instruction

2013-01-10 Thread Marcelo Tosatti
On Tue, Jan 08, 2013 at 02:38:36PM +0800, Xiao Guangrong wrote:
 The current reexecute_instruction can not well detect the failed instruction
 emulation. It allows guest to retry all the instructions except it accesses
 on error pfn
 
 For example, some cases are nested-write-protect - if the page we want to
 write is used as PDE but it chains to itself. Under this case, we should
 stop the emulation and report the case to userspace
 
 Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com
 ---
  arch/x86/include/asm/kvm_host.h |7 +++
  arch/x86/kvm/paging_tmpl.h  |   27 ---
  arch/x86/kvm/x86.c  |8 +++-
  3 files changed, 34 insertions(+), 8 deletions(-)
 
 diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
 index c431b33..d6ab8d2 100644
 --- a/arch/x86/include/asm/kvm_host.h
 +++ b/arch/x86/include/asm/kvm_host.h
 @@ -502,6 +502,13 @@ struct kvm_vcpu_arch {
   u64 msr_val;
   struct gfn_to_hva_cache data;
   } pv_eoi;
 +
 + /*
 +  * Indicate whether the access faults on its page table in guest
 +  * which is set when fix page fault and used to detect unhandeable
 +  * instruction.
 +  */
 + bool write_fault_to_shadow_pgtable;
  };
 
  struct kvm_lpage_info {
 diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
 index 67b390d..df50560 100644
 --- a/arch/x86/kvm/paging_tmpl.h
 +++ b/arch/x86/kvm/paging_tmpl.h
 @@ -497,26 +497,34 @@ out_gpte_changed:
   * created when kvm establishes shadow page table that stop kvm using large
   * page size. Do it early can avoid unnecessary #PF and emulation.
   *
 + * @write_fault_to_shadow_pgtable will return true if the fault gfn is
 + * currently used as its page table.
 + *
   * Note: the PDPT page table is not checked for PAE-32 bit guest. It is ok
   * since the PDPT is always shadowed, that means, we can not use large page
   * size to map the gfn which is used as PDPT.
   */
  static bool
  FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu,
 -   struct guest_walker *walker, int user_fault)
 +   struct guest_walker *walker, int user_fault,
 +   bool *write_fault_to_shadow_pgtable)
  {
   int level;
   gfn_t mask = ~(KVM_PAGES_PER_HPAGE(walker-level) - 1);
 + bool self_changed = false;
 
   if (!(walker-pte_access  ACC_WRITE_MASK ||
 (!is_write_protection(vcpu)  !user_fault)))
   return false;
 
 - for (level = walker-level; level = walker-max_level; level++)
 - if (!((walker-gfn ^ walker-table_gfn[level - 1])  mask))
 - return true;
 + for (level = walker-level; level = walker-max_level; level++) {
 + gfn_t gfn = walker-gfn ^ walker-table_gfn[level - 1];
 +
 + self_changed |= !(gfn  mask);
 + *write_fault_to_shadow_pgtable |= !gfn;
 + }
 
 - return false;
 + return self_changed;
  }
 
  /*
 @@ -544,7 +552,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t 
 addr, u32 error_code,
   int level = PT_PAGE_TABLE_LEVEL;
   int force_pt_level;
   unsigned long mmu_seq;
 - bool map_writable;
 + bool map_writable, is_self_change_mapping;
 
   pgprintk(%s: addr %lx err %x\n, __func__, addr, error_code);
 
 @@ -572,9 +580,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, 
 gva_t addr, u32 error_code,
   return 0;
   }
 
 + vcpu-arch.write_fault_to_shadow_pgtable = false;
 +
 + is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu,
 +   walker, user_fault, vcpu-arch.write_fault_to_shadow_pgtable);
 +
   if (walker.level = PT_DIRECTORY_LEVEL)
   force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn)
 -|| FNAME(is_self_change_mapping)(vcpu, walker, user_fault);
 +|| is_self_change_mapping;
   else
   force_pt_level = 1;
   if (!force_pt_level) {
 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
 index 6f13e03..2957012 100644
 --- a/arch/x86/kvm/x86.c
 +++ b/arch/x86/kvm/x86.c
 @@ -4810,7 +4810,13 @@ static bool reexecute_instruction(struct kvm_vcpu 
 *vcpu, gva_t cr2)
* guest to let CPU execute the instruction.
*/
   kvm_mmu_unprotect_page(vcpu-kvm, gpa_to_gfn(gpa));
 - return true;
 +
 + /*
 +  * If the access faults on its page table, it can not
 +  * be fixed by unprotecting shadow page and it should
 +  * be reported to userspace.
 +  */
 + return !vcpu-arch.write_fault_to_shadow_pgtable;
  }

This sounds wrong: only reporting emulation failure in case 
of a write fault to shadow pagetable? 

The current pattern is sane:

if (condition_1 which allows reexecution is true)
return true;

if (condition_2 which allows reexecution is true)
return true;
...
return false;

Applied 1-2.
--
To 

Re: [PATCH v5 5/5] KVM: x86: improve reexecute_instruction

2013-01-10 Thread Gleb Natapov
On Thu, Jan 10, 2013 at 03:30:36PM -0200, Marcelo Tosatti wrote:
 On Tue, Jan 08, 2013 at 02:38:36PM +0800, Xiao Guangrong wrote:
  The current reexecute_instruction can not well detect the failed instruction
  emulation. It allows guest to retry all the instructions except it accesses
  on error pfn
  
  For example, some cases are nested-write-protect - if the page we want to
  write is used as PDE but it chains to itself. Under this case, we should
  stop the emulation and report the case to userspace
  
  Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com
  ---
   arch/x86/include/asm/kvm_host.h |7 +++
   arch/x86/kvm/paging_tmpl.h  |   27 ---
   arch/x86/kvm/x86.c  |8 +++-
   3 files changed, 34 insertions(+), 8 deletions(-)
  
  diff --git a/arch/x86/include/asm/kvm_host.h 
  b/arch/x86/include/asm/kvm_host.h
  index c431b33..d6ab8d2 100644
  --- a/arch/x86/include/asm/kvm_host.h
  +++ b/arch/x86/include/asm/kvm_host.h
  @@ -502,6 +502,13 @@ struct kvm_vcpu_arch {
  u64 msr_val;
  struct gfn_to_hva_cache data;
  } pv_eoi;
  +
  +   /*
  +* Indicate whether the access faults on its page table in guest
  +* which is set when fix page fault and used to detect unhandeable
  +* instruction.
  +*/
  +   bool write_fault_to_shadow_pgtable;
   };
  
   struct kvm_lpage_info {
  diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
  index 67b390d..df50560 100644
  --- a/arch/x86/kvm/paging_tmpl.h
  +++ b/arch/x86/kvm/paging_tmpl.h
  @@ -497,26 +497,34 @@ out_gpte_changed:
* created when kvm establishes shadow page table that stop kvm using large
* page size. Do it early can avoid unnecessary #PF and emulation.
*
  + * @write_fault_to_shadow_pgtable will return true if the fault gfn is
  + * currently used as its page table.
  + *
* Note: the PDPT page table is not checked for PAE-32 bit guest. It is ok
* since the PDPT is always shadowed, that means, we can not use large page
* size to map the gfn which is used as PDPT.
*/
   static bool
   FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu,
  - struct guest_walker *walker, int user_fault)
  + struct guest_walker *walker, int user_fault,
  + bool *write_fault_to_shadow_pgtable)
   {
  int level;
  gfn_t mask = ~(KVM_PAGES_PER_HPAGE(walker-level) - 1);
  +   bool self_changed = false;
  
  if (!(walker-pte_access  ACC_WRITE_MASK ||
(!is_write_protection(vcpu)  !user_fault)))
  return false;
  
  -   for (level = walker-level; level = walker-max_level; level++)
  -   if (!((walker-gfn ^ walker-table_gfn[level - 1])  mask))
  -   return true;
  +   for (level = walker-level; level = walker-max_level; level++) {
  +   gfn_t gfn = walker-gfn ^ walker-table_gfn[level - 1];
  +
  +   self_changed |= !(gfn  mask);
  +   *write_fault_to_shadow_pgtable |= !gfn;
  +   }
  
  -   return false;
  +   return self_changed;
   }
  
   /*
  @@ -544,7 +552,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, 
  gva_t addr, u32 error_code,
  int level = PT_PAGE_TABLE_LEVEL;
  int force_pt_level;
  unsigned long mmu_seq;
  -   bool map_writable;
  +   bool map_writable, is_self_change_mapping;
  
  pgprintk(%s: addr %lx err %x\n, __func__, addr, error_code);
  
  @@ -572,9 +580,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, 
  gva_t addr, u32 error_code,
  return 0;
  }
  
  +   vcpu-arch.write_fault_to_shadow_pgtable = false;
  +
  +   is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu,
  + walker, user_fault, vcpu-arch.write_fault_to_shadow_pgtable);
  +
  if (walker.level = PT_DIRECTORY_LEVEL)
  force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn)
  -  || FNAME(is_self_change_mapping)(vcpu, walker, user_fault);
  +  || is_self_change_mapping;
  else
  force_pt_level = 1;
  if (!force_pt_level) {
  diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
  index 6f13e03..2957012 100644
  --- a/arch/x86/kvm/x86.c
  +++ b/arch/x86/kvm/x86.c
  @@ -4810,7 +4810,13 @@ static bool reexecute_instruction(struct kvm_vcpu 
  *vcpu, gva_t cr2)
   * guest to let CPU execute the instruction.
   */
  kvm_mmu_unprotect_page(vcpu-kvm, gpa_to_gfn(gpa));
  -   return true;
  +
  +   /*
  +* If the access faults on its page table, it can not
  +* be fixed by unprotecting shadow page and it should
  +* be reported to userspace.
  +*/
  +   return !vcpu-arch.write_fault_to_shadow_pgtable;
   }
  
   static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
 
 Also should make sure vcpu-arch.write_fault_to_shadow_pgtable is never
 reused. Say, clean when exiting x86_emulate_instruction?
Clear it right here for clarity.

--
   

KVM/arm64 updates

2013-01-10 Thread Marc Zyngier
Guys,

I've updated the branches for KVM/arm64, renaming them all to keep the
interested parties on their toes. We now have:

- kvm-arm/psci: kvm-arm-master + the KVM/PSCI implementation

- kvm-arm/pre-arm64: the above + the cleanup branch

- arm64/soc-armv8-model: Catalin Marinas' arm64 branch

- arm64/psci: Implementation of PSCI for the above

- kvm-arm64/kvm-prereq: a bunch of random bits that KVM/arm requires to
compile on arm64.

- kvm-arm64/kvm: KVM/arm64 itself, and the only branch you should use
unless you have masochistic tendencies. Comes with all the above and
Mark Rutland's timer rework.

All that is at:
git://git.kernel.org/pub/scm/linux/kernel/git/maz/arm-platforms.git

You'll also need Will Deacon's KVM Tool port:
git://git.kernel.org/pub/scm/linux/kernel/git/will/kvmtool.git kvmtool/arm

A few random notes:
- If you're using the Foundation Model, use the provided DTS for your
host kernel (arch/arm64/boot/dts/foundation-v8.dts).
- The only supported models are the AEMv8 and the Foundation models. If
you're using something else and have any issue, first reproduce it with
one of the supported implementations.

What's new:
- Rebased on 3.8-rc3
- Resynced with kvm-arm-master (new and improved VGIC API...)
- Fixed 32bit VFP world switching

Enjoy,

M.
-- 
Jazz is not dead. It just smells funny...

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 0/7 -v2] KVM: Alleviate mmu_lock hold time when we start dirty logging

2013-01-10 Thread Marcelo Tosatti
On Tue, Jan 08, 2013 at 07:42:38PM +0900, Takuya Yoshikawa wrote:
 Changelog v1-v2:
   The condition in patch 1 was changed like this:
 npages  (mem-flags  KVM_MEM_LOG_DIRTY_PAGES)
 
 This patch set makes kvm_mmu_slot_remove_write_access() rmap based and
 adds conditional rescheduling to it.
 
 The motivation for this change is of course to reduce the mmu_lock hold
 time when we start dirty logging for a large memory slot.  You may not
 see the problem if you just give 8GB or less of the memory to the guest
 with THP enabled on the host -- this is for the worst case.
 
 Takuya Yoshikawa (7):
   KVM: Write protect the updated slot only when dirty logging is enabled
   KVM: MMU: Remove unused parameter level from __rmap_write_protect()
   KVM: MMU: Make kvm_mmu_slot_remove_write_access() rmap based
   KVM: Remove unused slot_bitmap from kvm_mmu_page
   KVM: Make kvm_mmu_change_mmu_pages() take mmu_lock by itself
   KVM: Make kvm_mmu_slot_remove_write_access() take mmu_lock by itself
   KVM: MMU: Conditionally reschedule when kvm_mmu_slot_remove_write_access() 
 takes a long time
 
  Documentation/virtual/kvm/mmu.txt |7 
  arch/x86/include/asm/kvm_host.h   |5 ---
  arch/x86/kvm/mmu.c|   56 +++-
  arch/x86/kvm/x86.c|   12 ---
  virt/kvm/kvm_main.c   |1 -
  5 files changed, 37 insertions(+), 44 deletions(-)
 
 -- 
 1.7.5.4

Reviewed-by: Marcelo Tosatti mtosa...@redhat.com

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v3 0/7] Streamline arithmetic instruction emulation

2013-01-10 Thread Marcelo Tosatti
On Fri, Jan 04, 2013 at 04:18:47PM +0200, Avi Kivity wrote:
 The current arithmetic instruction emulation is fairly clumsy: after
 decode, each instruction gets a switch (size), and for every size
 we fetch the operands, prepare flags, emulate the instruction, then store
 back the flags and operands.
 
 This patchset simplifies things by moving everything into common code
 except the instruction itself.  All the pre- and post- processing is
 coded just once.  The per-instrution code looks like:
 
   add %bl, %al
   ret
 
   add %bx, %ax
   ret
 
   add %ebx, %eax
   ret
 
   add %rbx, %rax
   ret
 
 The savings in size, for the ten instructions converted in this patchset,
 are fairly large:
 
text  data bss dec hex filename
   63724 0   0   63724f8ec arch/x86/kvm/emulate.o.before
   61268 0   0   61268ef54 arch/x86/kvm/emulate.o.after
 
 - around 2500 bytes.
 
 v3: fix reversed operand order in 2-operand macro
 
 v2: rebased
 
 Avi Kivity (7):
   KVM: x86 emulator: framework for streamlining arithmetic opcodes
   KVM: x86 emulator: Support for declaring single operand fastops
   KVM: x86 emulator: introduce NoWrite flag
   KVM: x86 emulator: mark CMP, CMPS, SCAS, TEST as NoWrite
   KVM: x86 emulator: convert NOT, NEG to fastop
   KVM: x86 emulator: add macros for defining 2-operand fastop emulation
   KVM: x86 emulator: convert basic ALU ops to fastop
 
  arch/x86/kvm/emulate.c | 215 
 +++--
  1 file changed, 120 insertions(+), 95 deletions(-)

Applied, thanks.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [kvmarm] [PATCH v2 0/2] KVM/ARM implementation of PSCI

2013-01-10 Thread Christoffer Dall
On Thu, Jan 10, 2013 at 11:06 AM, Marc Zyngier marc.zyng...@arm.com wrote:
 This is an implementation of the Power State Coordination Interface,
 as described there (registration required):
 http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.den0022a/index.html

 It allows CPUs to be powered on, off, suspended or migrated. This
 series is only concerned with power-{on,off}. KVM Tool uses it to boot
 SMP VMs. This is also supported by the mach-virt platform:

 git://git.kernel.org/pub/scm/linux/kernel/git/will/linux.git virt/mach

 Tested on the arm64 model, based on kvm-arm-master as of today.

 From v1:
 - Documentation
 - Move the one-time init to a separate function
 - One mask fits all
 - Use wait_event_interruptible instead of the open coded version


Thanks, applied.

-Christoffer
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v5 5/5] KVM: x86: improve reexecute_instruction

2013-01-10 Thread Xiao Guangrong
On 01/11/2013 01:26 AM, Marcelo Tosatti wrote:
 On Tue, Jan 08, 2013 at 02:38:36PM +0800, Xiao Guangrong wrote:
 The current reexecute_instruction can not well detect the failed instruction
 emulation. It allows guest to retry all the instructions except it accesses
 on error pfn

 For example, some cases are nested-write-protect - if the page we want to
 write is used as PDE but it chains to itself. Under this case, we should
 stop the emulation and report the case to userspace

 Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com
 ---
  arch/x86/include/asm/kvm_host.h |7 +++
  arch/x86/kvm/paging_tmpl.h  |   27 ---
  arch/x86/kvm/x86.c  |8 +++-
  3 files changed, 34 insertions(+), 8 deletions(-)

 diff --git a/arch/x86/include/asm/kvm_host.h 
 b/arch/x86/include/asm/kvm_host.h
 index c431b33..d6ab8d2 100644
 --- a/arch/x86/include/asm/kvm_host.h
 +++ b/arch/x86/include/asm/kvm_host.h
 @@ -502,6 +502,13 @@ struct kvm_vcpu_arch {
  u64 msr_val;
  struct gfn_to_hva_cache data;
  } pv_eoi;
 +
 +/*
 + * Indicate whether the access faults on its page table in guest
 + * which is set when fix page fault and used to detect unhandeable
 + * instruction.
 + */
 +bool write_fault_to_shadow_pgtable;
  };

  struct kvm_lpage_info {
 diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
 index 67b390d..df50560 100644
 --- a/arch/x86/kvm/paging_tmpl.h
 +++ b/arch/x86/kvm/paging_tmpl.h
 @@ -497,26 +497,34 @@ out_gpte_changed:
   * created when kvm establishes shadow page table that stop kvm using large
   * page size. Do it early can avoid unnecessary #PF and emulation.
   *
 + * @write_fault_to_shadow_pgtable will return true if the fault gfn is
 + * currently used as its page table.
 + *
   * Note: the PDPT page table is not checked for PAE-32 bit guest. It is ok
   * since the PDPT is always shadowed, that means, we can not use large page
   * size to map the gfn which is used as PDPT.
   */
  static bool
  FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu,
 -  struct guest_walker *walker, int user_fault)
 +  struct guest_walker *walker, int user_fault,
 +  bool *write_fault_to_shadow_pgtable)
  {
  int level;
  gfn_t mask = ~(KVM_PAGES_PER_HPAGE(walker-level) - 1);
 +bool self_changed = false;

  if (!(walker-pte_access  ACC_WRITE_MASK ||
(!is_write_protection(vcpu)  !user_fault)))
  return false;

 -for (level = walker-level; level = walker-max_level; level++)
 -if (!((walker-gfn ^ walker-table_gfn[level - 1])  mask))
 -return true;
 +for (level = walker-level; level = walker-max_level; level++) {
 +gfn_t gfn = walker-gfn ^ walker-table_gfn[level - 1];
 +
 +self_changed |= !(gfn  mask);
 +*write_fault_to_shadow_pgtable |= !gfn;
 +}

 -return false;
 +return self_changed;
  }

  /*
 @@ -544,7 +552,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, 
 gva_t addr, u32 error_code,
  int level = PT_PAGE_TABLE_LEVEL;
  int force_pt_level;
  unsigned long mmu_seq;
 -bool map_writable;
 +bool map_writable, is_self_change_mapping;

  pgprintk(%s: addr %lx err %x\n, __func__, addr, error_code);

 @@ -572,9 +580,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, 
 gva_t addr, u32 error_code,
  return 0;
  }

 +vcpu-arch.write_fault_to_shadow_pgtable = false;
 +
 +is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu,
 +  walker, user_fault, vcpu-arch.write_fault_to_shadow_pgtable);
 +
  if (walker.level = PT_DIRECTORY_LEVEL)
  force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn)
 -   || FNAME(is_self_change_mapping)(vcpu, walker, user_fault);
 +   || is_self_change_mapping;
  else
  force_pt_level = 1;
  if (!force_pt_level) {
 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
 index 6f13e03..2957012 100644
 --- a/arch/x86/kvm/x86.c
 +++ b/arch/x86/kvm/x86.c
 @@ -4810,7 +4810,13 @@ static bool reexecute_instruction(struct kvm_vcpu 
 *vcpu, gva_t cr2)
   * guest to let CPU execute the instruction.
   */
  kvm_mmu_unprotect_page(vcpu-kvm, gpa_to_gfn(gpa));
 -return true;
 +
 +/*
 + * If the access faults on its page table, it can not
 + * be fixed by unprotecting shadow page and it should
 + * be reported to userspace.
 + */
 +return !vcpu-arch.write_fault_to_shadow_pgtable;
  }
 
 This sounds wrong: only reporting emulation failure in case 
 of a write fault to shadow pagetable? 

We suppose unprotecting target-gfn can avoid emulation, the same
as current code. :(

 
 The current pattern is sane:
 
 if (condition_1 which allows reexecution is true)
   return true;
 
 if (condition_2 which allows reexecution is 

Re: [PATCH v5 5/5] KVM: x86: improve reexecute_instruction

2013-01-10 Thread Xiao Guangrong
On 01/11/2013 01:30 AM, Marcelo Tosatti wrote:
 On Tue, Jan 08, 2013 at 02:38:36PM +0800, Xiao Guangrong wrote:
 The current reexecute_instruction can not well detect the failed instruction
 emulation. It allows guest to retry all the instructions except it accesses
 on error pfn

 For example, some cases are nested-write-protect - if the page we want to
 write is used as PDE but it chains to itself. Under this case, we should
 stop the emulation and report the case to userspace

 Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com
 ---
  arch/x86/include/asm/kvm_host.h |7 +++
  arch/x86/kvm/paging_tmpl.h  |   27 ---
  arch/x86/kvm/x86.c  |8 +++-
  3 files changed, 34 insertions(+), 8 deletions(-)

 diff --git a/arch/x86/include/asm/kvm_host.h 
 b/arch/x86/include/asm/kvm_host.h
 index c431b33..d6ab8d2 100644
 --- a/arch/x86/include/asm/kvm_host.h
 +++ b/arch/x86/include/asm/kvm_host.h
 @@ -502,6 +502,13 @@ struct kvm_vcpu_arch {
  u64 msr_val;
  struct gfn_to_hva_cache data;
  } pv_eoi;
 +
 +/*
 + * Indicate whether the access faults on its page table in guest
 + * which is set when fix page fault and used to detect unhandeable
 + * instruction.
 + */
 +bool write_fault_to_shadow_pgtable;
  };

  struct kvm_lpage_info {
 diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
 index 67b390d..df50560 100644
 --- a/arch/x86/kvm/paging_tmpl.h
 +++ b/arch/x86/kvm/paging_tmpl.h
 @@ -497,26 +497,34 @@ out_gpte_changed:
   * created when kvm establishes shadow page table that stop kvm using large
   * page size. Do it early can avoid unnecessary #PF and emulation.
   *
 + * @write_fault_to_shadow_pgtable will return true if the fault gfn is
 + * currently used as its page table.
 + *
   * Note: the PDPT page table is not checked for PAE-32 bit guest. It is ok
   * since the PDPT is always shadowed, that means, we can not use large page
   * size to map the gfn which is used as PDPT.
   */
  static bool
  FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu,
 -  struct guest_walker *walker, int user_fault)
 +  struct guest_walker *walker, int user_fault,
 +  bool *write_fault_to_shadow_pgtable)
  {
  int level;
  gfn_t mask = ~(KVM_PAGES_PER_HPAGE(walker-level) - 1);
 +bool self_changed = false;

  if (!(walker-pte_access  ACC_WRITE_MASK ||
(!is_write_protection(vcpu)  !user_fault)))
  return false;

 -for (level = walker-level; level = walker-max_level; level++)
 -if (!((walker-gfn ^ walker-table_gfn[level - 1])  mask))
 -return true;
 +for (level = walker-level; level = walker-max_level; level++) {
 +gfn_t gfn = walker-gfn ^ walker-table_gfn[level - 1];
 +
 +self_changed |= !(gfn  mask);
 +*write_fault_to_shadow_pgtable |= !gfn;
 +}

 -return false;
 +return self_changed;
  }

  /*
 @@ -544,7 +552,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, 
 gva_t addr, u32 error_code,
  int level = PT_PAGE_TABLE_LEVEL;
  int force_pt_level;
  unsigned long mmu_seq;
 -bool map_writable;
 +bool map_writable, is_self_change_mapping;

  pgprintk(%s: addr %lx err %x\n, __func__, addr, error_code);

 @@ -572,9 +580,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, 
 gva_t addr, u32 error_code,
  return 0;
  }

 +vcpu-arch.write_fault_to_shadow_pgtable = false;
 +
 +is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu,
 +  walker, user_fault, vcpu-arch.write_fault_to_shadow_pgtable);
 +
  if (walker.level = PT_DIRECTORY_LEVEL)
  force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn)
 -   || FNAME(is_self_change_mapping)(vcpu, walker, user_fault);
 +   || is_self_change_mapping;
  else
  force_pt_level = 1;
  if (!force_pt_level) {
 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
 index 6f13e03..2957012 100644
 --- a/arch/x86/kvm/x86.c
 +++ b/arch/x86/kvm/x86.c
 @@ -4810,7 +4810,13 @@ static bool reexecute_instruction(struct kvm_vcpu 
 *vcpu, gva_t cr2)
   * guest to let CPU execute the instruction.
   */
  kvm_mmu_unprotect_page(vcpu-kvm, gpa_to_gfn(gpa));
 -return true;
 +
 +/*
 + * If the access faults on its page table, it can not
 + * be fixed by unprotecting shadow page and it should
 + * be reported to userspace.
 + */
 +return !vcpu-arch.write_fault_to_shadow_pgtable;
  }

  static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
 
 Also should make sure vcpu-arch.write_fault_to_shadow_pgtable is never
 reused. Say, clean when exiting x86_emulate_instruction?

Yes, it is more clear.

But i am thinking if it is really needed because 'cr2' is only valid when it
is called on page fault path, 

Re: [RFC PATCH 0/9] KVM: PPC: In-kernel PAPR interrupt controller emulation

2013-01-10 Thread Scott Wood

On 01/10/2013 05:42:16 AM, Alexander Graf wrote:
Other instantiation attributes we don't know that early on should be  
settable between the time frame of vm creation and first execution.  
An example for this are device addresses. Check out the threads KVM:  
ARM: Rename KVM_SET_DEVICE_ADDRESS and KVM: ARM: Introduce  
KVM_SET_DEVICE_ADDRESS ioctl for more information on this particular  
bit.


FWIW, on Freescale chips the MPIC base address can move at runtime if  
the guest writes to CCSRBAR (though QEMU doesn't currently implement  
this).


-Scott
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Proper taming of oom-killer with kvm

2013-01-10 Thread Andrey Korolyov
Hi,

I have recently run in the following issue: under certain conditions,
if emulator process have exceeded its own memory limit in the cgroup
and oom shot it, /proc entry may stay long indefinitely. There are two
possible side-effects - at first, if one will try to read cmdline from
such entry, his request will hang indefinitely long too, e.g. if
issuing ``ps aux'' once per minute will fill out default PID limit in
less a half of day by ps processes in D state. Second effect may
appear only on heavily loaded node - scheduler process will eat 100%
of selected cores (almost always at only one), with system becoming
unresponsive in a couple of minutes. This should be reproduced easily:

- start kvm process
- put in to memory cgroup and set the limits
- disable oom_killer via oom_control
- simply put the process into oom condition (using balloon, it should
be very simple)
- check kvm process state - if it stuck in D state, all should be
okay, since you`re able to catch oom condition - simply send TERM
signal and raise memory limit by nonsignificant amount and process
wiil end normally.
If you`re  observing kvm process with triggered under_oom flag and in
_sleep_ state, TERM signal will kill it, with nice lock described
above.

I have solved problem by quite stupid workaround - after getting
informed of oom event(w/ disabled oom in cg), I`m  freezing kvm
process via freezer cg, therefore moving it to D state, then sending
it TERM, then raising memory limits and finally unfreezing it - it is
very ugly, but at least I have get rid of the problem.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] KVM: trace: Fix exit decoding.

2013-01-10 Thread Marcelo Tosatti
On Tue, Jan 08, 2013 at 01:00:01PM +0100, Cornelia Huck wrote:
 trace_kvm_userspace_exit has been missing the KVM_EXIT_WATCHDOG exit.
 
 CC: Bharat Bhushan r65...@freescale.com
 Signed-off-by: Cornelia Huck cornelia.h...@de.ibm.com
 ---
  include/trace/events/kvm.h | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

Applied, thanks.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC/PATCH 1/1] s390/kvm: Fix BUG in include/linux/kvm_host.h:745

2013-01-10 Thread Marcelo Tosatti
On Tue, Jan 08, 2013 at 04:23:10PM +0100, Christian Borntraeger wrote:
 commit b080935c8638e08134629d0a9ebdf35669bec14d
 kvm: Directly account vtime to system on guest switch
 
 also removed the irq_disable/enable around kvm guest switch, which
 is correct in itself. Unfortunately, there is a BUG ON that (correctly)
 checks for preemptible to cover the call to rcu later on.
 (Introduced with commit 8fa2206821953a50a3a02ea33fcfb3ced2fd9997
 KVM: make guest mode entry to be rcu quiescent state)
 
 This check might trigger depending on the kernel config.
 Lets make sure that no preemption happens during kvm_guest_enter.
 We can enable preemption again after the call to
 rcu_virt_note_context_switch returns.
 
 Please note that we continue to run s390 guests with interrupts
 enabled.
 
 CC: Frederic Weisbecker fweis...@gmail.com
 CC: Gleb Natapov g...@redhat.com
 Signed-off-by: Christian Borntraeger borntrae...@de.ibm.com

Applied, thanks.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v5 5/5] KVM: x86: improve reexecute_instruction

2013-01-10 Thread Marcelo Tosatti
On Fri, Jan 11, 2013 at 02:05:33AM +0800, Xiao Guangrong wrote:
 On 01/11/2013 01:26 AM, Marcelo Tosatti wrote:
  On Tue, Jan 08, 2013 at 02:38:36PM +0800, Xiao Guangrong wrote:
  The current reexecute_instruction can not well detect the failed 
  instruction
  emulation. It allows guest to retry all the instructions except it accesses
  on error pfn
 
  For example, some cases are nested-write-protect - if the page we want to
  write is used as PDE but it chains to itself. Under this case, we should
  stop the emulation and report the case to userspace
 
  Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com
  ---
   arch/x86/include/asm/kvm_host.h |7 +++
   arch/x86/kvm/paging_tmpl.h  |   27 ---
   arch/x86/kvm/x86.c  |8 +++-
   3 files changed, 34 insertions(+), 8 deletions(-)
 
  diff --git a/arch/x86/include/asm/kvm_host.h 
  b/arch/x86/include/asm/kvm_host.h
  index c431b33..d6ab8d2 100644
  --- a/arch/x86/include/asm/kvm_host.h
  +++ b/arch/x86/include/asm/kvm_host.h
  @@ -502,6 +502,13 @@ struct kvm_vcpu_arch {
 u64 msr_val;
 struct gfn_to_hva_cache data;
 } pv_eoi;
  +
  +  /*
  +   * Indicate whether the access faults on its page table in guest
  +   * which is set when fix page fault and used to detect unhandeable
  +   * instruction.
  +   */
  +  bool write_fault_to_shadow_pgtable;
   };
 
   struct kvm_lpage_info {
  diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
  index 67b390d..df50560 100644
  --- a/arch/x86/kvm/paging_tmpl.h
  +++ b/arch/x86/kvm/paging_tmpl.h
  @@ -497,26 +497,34 @@ out_gpte_changed:
* created when kvm establishes shadow page table that stop kvm using 
  large
* page size. Do it early can avoid unnecessary #PF and emulation.
*
  + * @write_fault_to_shadow_pgtable will return true if the fault gfn is
  + * currently used as its page table.
  + *
* Note: the PDPT page table is not checked for PAE-32 bit guest. It is ok
* since the PDPT is always shadowed, that means, we can not use large 
  page
* size to map the gfn which is used as PDPT.
*/
   static bool
   FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu,
  -struct guest_walker *walker, int user_fault)
  +struct guest_walker *walker, int user_fault,
  +bool *write_fault_to_shadow_pgtable)
   {
 int level;
 gfn_t mask = ~(KVM_PAGES_PER_HPAGE(walker-level) - 1);
  +  bool self_changed = false;
 
 if (!(walker-pte_access  ACC_WRITE_MASK ||
   (!is_write_protection(vcpu)  !user_fault)))
 return false;
 
  -  for (level = walker-level; level = walker-max_level; level++)
  -  if (!((walker-gfn ^ walker-table_gfn[level - 1])  mask))
  -  return true;
  +  for (level = walker-level; level = walker-max_level; level++) {
  +  gfn_t gfn = walker-gfn ^ walker-table_gfn[level - 1];
  +
  +  self_changed |= !(gfn  mask);
  +  *write_fault_to_shadow_pgtable |= !gfn;
  +  }
 
  -  return false;
  +  return self_changed;
   }
 
   /*
  @@ -544,7 +552,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, 
  gva_t addr, u32 error_code,
 int level = PT_PAGE_TABLE_LEVEL;
 int force_pt_level;
 unsigned long mmu_seq;
  -  bool map_writable;
  +  bool map_writable, is_self_change_mapping;
 
 pgprintk(%s: addr %lx err %x\n, __func__, addr, error_code);
 
  @@ -572,9 +580,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, 
  gva_t addr, u32 error_code,
 return 0;
 }
 
  +  vcpu-arch.write_fault_to_shadow_pgtable = false;
  +
  +  is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu,
  +walker, user_fault, vcpu-arch.write_fault_to_shadow_pgtable);
  +
 if (walker.level = PT_DIRECTORY_LEVEL)
 force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn)
  - || FNAME(is_self_change_mapping)(vcpu, walker, user_fault);
  + || is_self_change_mapping;
 else
 force_pt_level = 1;
 if (!force_pt_level) {
  diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
  index 6f13e03..2957012 100644
  --- a/arch/x86/kvm/x86.c
  +++ b/arch/x86/kvm/x86.c
  @@ -4810,7 +4810,13 @@ static bool reexecute_instruction(struct kvm_vcpu 
  *vcpu, gva_t cr2)
  * guest to let CPU execute the instruction.
  */
 kvm_mmu_unprotect_page(vcpu-kvm, gpa_to_gfn(gpa));
  -  return true;
  +
  +  /*
  +   * If the access faults on its page table, it can not
  +   * be fixed by unprotecting shadow page and it should
  +   * be reported to userspace.
  +   */
  +  return !vcpu-arch.write_fault_to_shadow_pgtable;
   }
  
  This sounds wrong: only reporting emulation failure in case 
  of a write fault to shadow pagetable? 
 
 We suppose unprotecting target-gfn can avoid emulation, the same
 as current code. :(

Current code treats access to non-mapped guest address as 

Re: [PATCH v5 5/5] KVM: x86: improve reexecute_instruction

2013-01-10 Thread Xiao Guangrong
On 01/11/2013 03:48 AM, Marcelo Tosatti wrote:
 On Fri, Jan 11, 2013 at 02:05:33AM +0800, Xiao Guangrong wrote:
 On 01/11/2013 01:26 AM, Marcelo Tosatti wrote:
 On Tue, Jan 08, 2013 at 02:38:36PM +0800, Xiao Guangrong wrote:
 The current reexecute_instruction can not well detect the failed 
 instruction
 emulation. It allows guest to retry all the instructions except it accesses
 on error pfn

 For example, some cases are nested-write-protect - if the page we want to
 write is used as PDE but it chains to itself. Under this case, we should
 stop the emulation and report the case to userspace

 Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com
 ---
  arch/x86/include/asm/kvm_host.h |7 +++
  arch/x86/kvm/paging_tmpl.h  |   27 ---
  arch/x86/kvm/x86.c  |8 +++-
  3 files changed, 34 insertions(+), 8 deletions(-)

 diff --git a/arch/x86/include/asm/kvm_host.h 
 b/arch/x86/include/asm/kvm_host.h
 index c431b33..d6ab8d2 100644
 --- a/arch/x86/include/asm/kvm_host.h
 +++ b/arch/x86/include/asm/kvm_host.h
 @@ -502,6 +502,13 @@ struct kvm_vcpu_arch {
u64 msr_val;
struct gfn_to_hva_cache data;
} pv_eoi;
 +
 +  /*
 +   * Indicate whether the access faults on its page table in guest
 +   * which is set when fix page fault and used to detect unhandeable
 +   * instruction.
 +   */
 +  bool write_fault_to_shadow_pgtable;
  };

  struct kvm_lpage_info {
 diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
 index 67b390d..df50560 100644
 --- a/arch/x86/kvm/paging_tmpl.h
 +++ b/arch/x86/kvm/paging_tmpl.h
 @@ -497,26 +497,34 @@ out_gpte_changed:
   * created when kvm establishes shadow page table that stop kvm using 
 large
   * page size. Do it early can avoid unnecessary #PF and emulation.
   *
 + * @write_fault_to_shadow_pgtable will return true if the fault gfn is
 + * currently used as its page table.
 + *
   * Note: the PDPT page table is not checked for PAE-32 bit guest. It is ok
   * since the PDPT is always shadowed, that means, we can not use large 
 page
   * size to map the gfn which is used as PDPT.
   */
  static bool
  FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu,
 -struct guest_walker *walker, int user_fault)
 +struct guest_walker *walker, int user_fault,
 +bool *write_fault_to_shadow_pgtable)
  {
int level;
gfn_t mask = ~(KVM_PAGES_PER_HPAGE(walker-level) - 1);
 +  bool self_changed = false;

if (!(walker-pte_access  ACC_WRITE_MASK ||
  (!is_write_protection(vcpu)  !user_fault)))
return false;

 -  for (level = walker-level; level = walker-max_level; level++)
 -  if (!((walker-gfn ^ walker-table_gfn[level - 1])  mask))
 -  return true;
 +  for (level = walker-level; level = walker-max_level; level++) {
 +  gfn_t gfn = walker-gfn ^ walker-table_gfn[level - 1];
 +
 +  self_changed |= !(gfn  mask);
 +  *write_fault_to_shadow_pgtable |= !gfn;
 +  }

 -  return false;
 +  return self_changed;
  }

  /*
 @@ -544,7 +552,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, 
 gva_t addr, u32 error_code,
int level = PT_PAGE_TABLE_LEVEL;
int force_pt_level;
unsigned long mmu_seq;
 -  bool map_writable;
 +  bool map_writable, is_self_change_mapping;

pgprintk(%s: addr %lx err %x\n, __func__, addr, error_code);

 @@ -572,9 +580,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, 
 gva_t addr, u32 error_code,
return 0;
}

 +  vcpu-arch.write_fault_to_shadow_pgtable = false;
 +
 +  is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu,
 +walker, user_fault, vcpu-arch.write_fault_to_shadow_pgtable);
 +
if (walker.level = PT_DIRECTORY_LEVEL)
force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn)
 - || FNAME(is_self_change_mapping)(vcpu, walker, user_fault);
 + || is_self_change_mapping;
else
force_pt_level = 1;
if (!force_pt_level) {
 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
 index 6f13e03..2957012 100644
 --- a/arch/x86/kvm/x86.c
 +++ b/arch/x86/kvm/x86.c
 @@ -4810,7 +4810,13 @@ static bool reexecute_instruction(struct kvm_vcpu 
 *vcpu, gva_t cr2)
 * guest to let CPU execute the instruction.
 */
kvm_mmu_unprotect_page(vcpu-kvm, gpa_to_gfn(gpa));
 -  return true;
 +
 +  /*
 +   * If the access faults on its page table, it can not
 +   * be fixed by unprotecting shadow page and it should
 +   * be reported to userspace.
 +   */
 +  return !vcpu-arch.write_fault_to_shadow_pgtable;
  }

 This sounds wrong: only reporting emulation failure in case 
 of a write fault to shadow pagetable? 

 We suppose unprotecting target-gfn can avoid emulation, the same
 as current code. :(
 
 Current code treats access to non-mapped guest address as indication to
 exit reporting emulation failure.
 
 The patch above restricts 

Re: [PULL 0/2] vfio-pci: Fixes for 1.4 stable

2013-01-10 Thread Anthony Liguori
Pulled, thanks.

Regards,

Anthony Liguori

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v9 1/3] x86, apicv: add APICv register virtualization support

2013-01-10 Thread Marcelo Tosatti
On Thu, Jan 10, 2013 at 03:26:06PM +0800, Yang Zhang wrote:
 - APIC read doesn't cause VM-Exit
 - APIC write becomes trap-like
 
 Signed-off-by: Kevin Tian kevin.t...@intel.com
 Signed-off-by: Yang Zhang yang.z.zh...@intel.com
 ---
  arch/x86/include/asm/vmx.h |2 ++
  arch/x86/kvm/lapic.c   |   15 +++
  arch/x86/kvm/lapic.h   |2 ++
  arch/x86/kvm/vmx.c |   33 -
  4 files changed, 51 insertions(+), 1 deletions(-)
 
 diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
 index e385df9..44c3f7e 100644
 --- a/arch/x86/include/asm/vmx.h
 +++ b/arch/x86/include/asm/vmx.h
 @@ -66,6 +66,7 @@
  #define EXIT_REASON_EPT_MISCONFIG   49
  #define EXIT_REASON_WBINVD  54
  #define EXIT_REASON_XSETBV  55
 +#define EXIT_REASON_APIC_WRITE  56
  #define EXIT_REASON_INVPCID 58
  
  #define VMX_EXIT_REASONS \
 @@ -141,6 +142,7 @@
  #define SECONDARY_EXEC_ENABLE_VPID  0x0020
  #define SECONDARY_EXEC_WBINVD_EXITING0x0040
  #define SECONDARY_EXEC_UNRESTRICTED_GUEST0x0080
 +#define SECONDARY_EXEC_APIC_REGISTER_VIRT   0x0100
  #define SECONDARY_EXEC_PAUSE_LOOP_EXITING0x0400
  #define SECONDARY_EXEC_ENABLE_INVPCID0x1000
  
 diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
 index 9392f52..0664c13 100644
 --- a/arch/x86/kvm/lapic.c
 +++ b/arch/x86/kvm/lapic.c
 @@ -1212,6 +1212,21 @@ void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu)
  }
  EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi);
  
 +/* emulate APIC access in a trap manner */
 +void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset)
 +{
 + u32 val = 0;
 +
 + /* hw has done the conditional check and inst decode */
 + offset = 0xff0;
 +
 + apic_reg_read(vcpu-arch.apic, offset, 4, val);
 +
 + /* TODO: optimize to just emulate side effect w/o one more write */
 + apic_reg_write(vcpu-arch.apic, offset, val);
 +}
 +EXPORT_SYMBOL_GPL(kvm_apic_write_nodecode);
 +
  void kvm_free_lapic(struct kvm_vcpu *vcpu)
  {
   struct kvm_lapic *apic = vcpu-arch.apic;
 diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
 index e5ebf9f..9a8ee22 100644
 --- a/arch/x86/kvm/lapic.h
 +++ b/arch/x86/kvm/lapic.h
 @@ -64,6 +64,8 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
  u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu);
  void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data);
  
 +void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset);
 +
  void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr);
  void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu);
  void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu);
 diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
 index 55dfc37..688f43f 100644
 --- a/arch/x86/kvm/vmx.c
 +++ b/arch/x86/kvm/vmx.c
 @@ -84,6 +84,9 @@ module_param(vmm_exclusive, bool, S_IRUGO);
  static bool __read_mostly fasteoi = 1;
  module_param(fasteoi, bool, S_IRUGO);
  
 +static bool __read_mostly enable_apicv_reg_vid = 1;
 +module_param(enable_apicv_reg_vid, bool, S_IRUGO);
 +
  /*
   * If nested=1, nested virtualization is supported, i.e., guests may use
   * VMX and be a hypervisor for its own guests. If nested=0, guests may not
 @@ -764,6 +767,12 @@ static inline bool 
 cpu_has_vmx_virtualize_apic_accesses(void)
   SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
  }
  
 +static inline bool cpu_has_vmx_apic_register_virt(void)
 +{
 + return vmcs_config.cpu_based_2nd_exec_ctrl 
 + SECONDARY_EXEC_APIC_REGISTER_VIRT;
 +}
 +
  static inline bool cpu_has_vmx_flexpriority(void)
  {
   return cpu_has_vmx_tpr_shadow() 
 @@ -2541,7 +2550,8 @@ static __init int setup_vmcs_config(struct vmcs_config 
 *vmcs_conf)
   SECONDARY_EXEC_UNRESTRICTED_GUEST |
   SECONDARY_EXEC_PAUSE_LOOP_EXITING |
   SECONDARY_EXEC_RDTSCP |
 - SECONDARY_EXEC_ENABLE_INVPCID;
 + SECONDARY_EXEC_ENABLE_INVPCID |
 + SECONDARY_EXEC_APIC_REGISTER_VIRT;
   if (adjust_vmx_controls(min2, opt2,
   MSR_IA32_VMX_PROCBASED_CTLS2,
   _cpu_based_2nd_exec_control)  0)
 @@ -2552,6 +2562,11 @@ static __init int setup_vmcs_config(struct vmcs_config 
 *vmcs_conf)
   SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
   _cpu_based_exec_control = ~CPU_BASED_TPR_SHADOW;
  #endif
 +
 + if (!(_cpu_based_exec_control  CPU_BASED_TPR_SHADOW))
 + _cpu_based_2nd_exec_control = ~(
 + SECONDARY_EXEC_APIC_REGISTER_VIRT);

No need for () around SECONDARY_EXEC_APIC_REGISTER_VIRT.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  

Re: [PATCH v9 3/3] x86, apicv: add virtual interrupt delivery support

2013-01-10 Thread Marcelo Tosatti
Hi,

Getting into good shape.

On Thu, Jan 10, 2013 at 03:26:08PM +0800, Yang Zhang wrote:
 From: Yang Zhang yang.z.zh...@intel.com
 
 Virtual interrupt delivery avoids KVM to inject vAPIC interrupts
 manually, which is fully taken care of by the hardware. This needs
 some special awareness into existing interrupr injection path:
 
 - for pending interrupt, instead of direct injection, we may need
   update architecture specific indicators before resuming to guest.
 
 - A pending interrupt, which is masked by ISR, should be also
   considered in above update action, since hardware will decide
   when to inject it at right time. Current has_interrupt and
   get_interrupt only returns a valid vector from injection p.o.v.
 
 Signed-off-by: Kevin Tian kevin.t...@intel.com
 Signed-off-by: Yang Zhang yang.z.zh...@intel.com
 ---
  arch/x86/include/asm/kvm_host.h |5 +
  arch/x86/include/asm/vmx.h  |   11 +++
  arch/x86/kvm/irq.c  |   56 +++-
  arch/x86/kvm/lapic.c|   72 +--
  arch/x86/kvm/lapic.h|   23 +
  arch/x86/kvm/svm.c  |   18 
  arch/x86/kvm/vmx.c  |  191 
 +--
  arch/x86/kvm/x86.c  |   14 +++-
  include/linux/kvm_host.h|3 +
  virt/kvm/ioapic.c   |   18 
  virt/kvm/ioapic.h   |4 +
  virt/kvm/irq_comm.c |   22 +
  virt/kvm/kvm_main.c |5 +
  13 files changed, 399 insertions(+), 43 deletions(-)
 

  static void recalculate_apic_map(struct kvm *kvm)
  {
   struct kvm_apic_map *new, *old = NULL;
 @@ -236,12 +219,14 @@ static inline void kvm_apic_set_id(struct kvm_lapic 
 *apic, u8 id)
  {
   apic_set_reg(apic, APIC_ID, id  24);
   recalculate_apic_map(apic-vcpu-kvm);
 + ioapic_update_eoi_exitmap(apic-vcpu-kvm);
  }

Move ioapic_update_eoi_exitmap into recalculate_apic_map.

  static inline void kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id)
  {
   apic_set_reg(apic, APIC_LDR, id);
   recalculate_apic_map(apic-vcpu-kvm);
 + ioapic_update_eoi_exitmap(apic-vcpu-kvm);
  }
  

 diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
 index b203ce7..990409a 100644
 --- a/arch/x86/kvm/vmx.c
 +++ b/arch/x86/kvm/vmx.c
 @@ -434,6 +434,7 @@ struct vcpu_vmx {
   bool rdtscp_enabled;
  
   bool virtual_x2apic_enabled;
 + unsigned long eoi_exit_bitmap[4];

Use DECLARE_BITMAP (unsigned long is 4 bytes on 32-bit host).

   /* Support for a guest hypervisor (nested VMX) */
   struct nested_vmx nested;
 @@ -783,7 +784,8 @@ static inline bool cpu_has_vmx_apic_register_virt(void)
  
  static inline bool cpu_has_vmx_virtual_intr_delivery(void)
  {
 - return false;
 + return vmcs_config.cpu_based_2nd_exec_ctrl 
 + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
  }
  
  static inline bool cpu_has_vmx_flexpriority(void)
 @@ -2565,7 +2567,8 @@ static __init int setup_vmcs_config(struct vmcs_config 
 *vmcs_conf)
   SECONDARY_EXEC_PAUSE_LOOP_EXITING |
   SECONDARY_EXEC_RDTSCP |
   SECONDARY_EXEC_ENABLE_INVPCID |
 - SECONDARY_EXEC_APIC_REGISTER_VIRT;
 + SECONDARY_EXEC_APIC_REGISTER_VIRT |
 + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
   if (adjust_vmx_controls(min2, opt2,
   MSR_IA32_VMX_PROCBASED_CTLS2,
   _cpu_based_2nd_exec_control)  0)
 @@ -2579,7 +2582,8 @@ static __init int setup_vmcs_config(struct vmcs_config 
 *vmcs_conf)
  
   if (!(_cpu_based_exec_control  CPU_BASED_TPR_SHADOW))
   _cpu_based_2nd_exec_control = ~(
 - SECONDARY_EXEC_APIC_REGISTER_VIRT);
 + SECONDARY_EXEC_APIC_REGISTER_VIRT |
 + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);

Nevermind the earlier comment about ().

 +static void set_eoi_exitmap_one(struct kvm_vcpu *vcpu,
 + u32 vector)
 +{
 + struct vcpu_vmx *vmx = to_vmx(vcpu);
 +
 + if (WARN_ONCE((vector  255),
 + KVM VMX: vector (%d) out of range\n, vector))
 + return;
 + __set_bit(vector, vmx-eoi_exit_bitmap);
 +}
 +
 +void vmx_check_ioapic_entry(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq)
 +{
 + struct kvm_lapic **dst;
 + struct kvm_apic_map *map;
 + unsigned long bitmap = 1;
 + int i;
 +
 + rcu_read_lock();
 + map = rcu_dereference(vcpu-kvm-arch.apic_map);
 +
 + if (unlikely(!map)) {
 + set_eoi_exitmap_one(vcpu, irq-vector);
 + goto out;
 + }
 +
 + if (irq-dest_mode == 0) { /* physical mode */
 + if (irq-delivery_mode == APIC_DM_LOWEST ||
 + irq-dest_id == 0xff) {
 + set_eoi_exitmap_one(vcpu, irq-vector);
 + goto out;
 + }
 +   

Re: [kvmarm] [PATCH v5.1 0/2] KVM: ARM: Rename KVM_SET_DEVICE_ADDRESS

2013-01-10 Thread Marcelo Tosatti
On Wed, Jan 09, 2013 at 10:37:20PM +0100, Alexander Graf wrote:
 
  
   We can start to introduce (and fix ARM) with a generic ioctl in the 
   MPIC patches then.
  
   The ioctl is already generic, except for its name.
  It's making a few wrong assumptions:
   * maximum size of value is u64
  
  This is tolerable IMHO.
  
   * combining device id (variable) with addr type id (const) into a single 
  field. It could just be split into multiple fields
  
  I agree, but that could be lived with as well.
  
  I get that there's a tradeoff between getting something in now, versus 
  waiting until the API is more refined.  Tagging it with a particular ISA 
  seems like an odd way of saying soon to be deprecated, though.  What 
  happens if we're still squabbling over the perfect replacement API when 
  we're trying to push PPC MPIC stuff in?

As mentioned, i fail to see the benefit in sharing 0.0x% of the code (the
interface), while the remaining code is not shared.

 Then we're the ones who have to come up with a good interface.

Or just have KVM_SET_PPC_DEVICE_ADDRESS. Is there a downside to that?

  
  Perhaps the threshold for an API becoming permanent should not be 
  acceptance into the tree, but rather the removal of an experimental tag 
  (including a way of shutting off experimental APIs to make sure you're not 
  depending on them).  Sort of like CONFIG_EXPERIMENTAL, except actually used 
  for its intended purpose (distributions should have it *off* by default), 
  and preferably managed at runtime.  Sort of like drivers/staging, except 
  for APIs rather than drivers.  Changes at that point should require more 
  justification than before merging, but would not have the strict 
  compatibility requirement that non-experimental APIs have.  This would make 
  collaboration and testing easier on APIs that aren't ready to be permanent.
 
 This tag does exist. It's called not in Linus' tree :).
 
  
   * the id is 100% architecture specific. It shouldn't be. At least the 
  device id field should be generic.
  
  That's a documentation issue that could be changed to have all 
  architectures adopt what is currently specified for ARM, without breaking 
  compatibility.
 
 Depends on how we want to design the final layout. I don't have the 
 impression that we've reached final conclusion on this interface. That's all 
 I'm saying.
 
  
  I'm not sure if we can come up with more problems in that API when staring 
  at it a bit longer and/or we would actually start using it for more 
  things. So for the sake of not holding up the ARM code, I'm perfectly fine 
  to clutter ARM's ioctl handling code with an ioctl that is already 
  deprecated at its introduction, as long as we don't hold everything else 
  up meanwhile.
  
  I'm not in a position to block it, and if I were I presumably would have 
  seen this in time for feedback to matter.  That's different from actually 
  being happy. :-)
 
 For the ARM specific ioctl it has my ack. I'd say let's go with that and 
 start to work on a good interface ASAP. But we need an ok from Marcelo or 
 Gleb too.
 
 
 Alex
 
 --
 To unsubscribe from this list: send the line unsubscribe kvm in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [kvmarm] [PATCH v5.1 0/2] KVM: ARM: Rename KVM_SET_DEVICE_ADDRESS

2013-01-10 Thread Marcelo Tosatti
On Wed, Jan 09, 2013 at 03:15:43PM -0600, Scott Wood wrote:
 On 01/09/2013 02:12:16 PM, Alexander Graf wrote:
 
 On 09.01.2013, at 20:50, Scott Wood wrote:
 
  On 01/09/2013 10:48:47 AM, Alexander Graf wrote:
  On 09.01.2013, at 17:26, Christoffer Dall wrote:
   Renames the KVM_SET_DEVICE_ADDRESS to KVM_ARM_SET_DEVICE_ADDR
   to make it obvious that this is ARM specific in lack of a
 better generic
   interface.
  
   Once we agree on a better interface the KVM/ARM code can also
 take
   advantage of that, but until then we don't want to hold up
 the KVM/ARM
   patches.
  Works for me. Scott, are you happy with this one too?
 
  Not really, given that it will stay around forever even after
 something new is introduced.
 
 But only in ARM specific code.
 
 ...which I'll probably have to deal with when Freescale's
 virtualization-capable ARM chips come along.  I don't see it's only
 in that other architecture as it might as well not exist.

Doesnt it make sense to make it extensible by adding some reserved
space and a flags field? (to the ioctl) (independently of anything
else).

So that, say, supporting new ARM processors does not require adding 
a new ioctl?

Having a single ioctl does not increase code sharing significantly
because large percentage of the code both in QEMU and the kernel are not
shared (well perhaps except the interface).

  If you're going to change the name, why not just change it to
 KVM_SET_DEVICE_CONFIG?  Can we change the name later if nothing
 else changes (so it's still binary compatible)?
 
 Because that again implies that it's generic enough. And to reach
 that conclusion will take more time than we should spend on this
 now.
 
 If the conclusion later on is that it is good enough, can the name
 be changed then?
 
  We can start to introduce (and fix ARM) with a generic ioctl in
 the MPIC patches then.
 
  The ioctl is already generic, except for its name.
 
 It's making a few wrong assumptions:
 
   * maximum size of value is u64
 
 This is tolerable IMHO.
 
   * combining device id (variable) with addr type id (const) into
 a single field. It could just be split into multiple fields
 
 I agree, but that could be lived with as well.
 
 I get that there's a tradeoff between getting something in now,
 versus waiting until the API is more refined.  Tagging it with a
 particular ISA seems like an odd way of saying soon to be
 deprecated, though.  What happens if we're still squabbling over
 the perfect replacement API when we're trying to push PPC MPIC stuff
 in?
 
 Perhaps the threshold for an API becoming permanent should not be
 acceptance into the tree, but rather the removal of an
 experimental tag (including a way of shutting off experimental
 APIs to make sure you're not depending on them).  Sort of like
 CONFIG_EXPERIMENTAL, except actually used for its intended purpose
 (distributions should have it *off* by default), and preferably
 managed at runtime.  Sort of like drivers/staging, except for APIs
 rather than drivers.  Changes at that point should require more
 justification than before merging, but would not have the strict
 compatibility requirement that non-experimental APIs have.  This
 would make collaboration and testing easier on APIs that aren't
 ready to be permanent.
 
   * the id is 100% architecture specific. It shouldn't be. At
 least the device id field should be generic.
 
 That's a documentation issue that could be changed to have all
 architectures adopt what is currently specified for ARM, without
 breaking compatibility.
 
 I'm not sure if we can come up with more problems in that API when
 staring at it a bit longer and/or we would actually start using it
 for more things. So for the sake of not holding up the ARM code,
 I'm perfectly fine to clutter ARM's ioctl handling code with an
 ioctl that is already deprecated at its introduction, as long as
 we don't hold everything else up meanwhile.
 
 I'm not in a position to block it, and if I were I presumably would
 have seen this in time for feedback to matter.  That's different
 from actually being happy. :-)

 -Scott
 --
 To unsubscribe from this list: send the line unsubscribe kvm in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [kvmarm] [PATCH v5.1 0/2] KVM: ARM: Rename KVM_SET_DEVICE_ADDRESS

2013-01-10 Thread Scott Wood

On 01/10/2013 04:28:01 PM, Marcelo Tosatti wrote:

On Wed, Jan 09, 2013 at 10:37:20PM +0100, Alexander Graf wrote:

 
   We can start to introduce (and fix ARM) with a generic ioctl  
in the MPIC patches then.

  
   The ioctl is already generic, except for its name.
  It's making a few wrong assumptions:
   * maximum size of value is u64
 
  This is tolerable IMHO.
 
   * combining device id (variable) with addr type id (const) into  
a single field. It could just be split into multiple fields

 
  I agree, but that could be lived with as well.
 
  I get that there's a tradeoff between getting something in now,  
versus waiting until the API is more refined.  Tagging it with a  
particular ISA seems like an odd way of saying soon to be  
deprecated, though.  What happens if we're still squabbling over the  
perfect replacement API when we're trying to push PPC MPIC stuff in?


As mentioned, i fail to see the benefit in sharing 0.0x% of the code  
(the

interface), while the remaining code is not shared.


Pointlessly making things architecture-specific just makes more work  
for people who later need the functionality on another architecture.   
It might not be much in this case (unless a particular device ends up  
being used on multiple architectures), but the principle still  
applies.  It's also more work for tools like strace, and could get in  
the way of the userspace caller using common code.



 Then we're the ones who have to come up with a good interface.

Or just have KVM_SET_PPC_DEVICE_ADDRESS. Is there a downside to that?


Besides the above, and my original complaint that it shouldn't be  
specific to addresses?


-Scott
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] [PATCH qom-cpu 3/7] target-i386: Disable kvm_mmu by default

2013-01-10 Thread Andreas Färber
Am 07.01.2013 19:20, schrieb Eduardo Habkost:
 KVM_CAP_PV_MMU capability reporting was removed from the kernel since
 v2.6.33 (see commit a68a6a7282373), and was completely removed from the
 kernel since v3.3 (see commit fb92045843). It doesn't make sense to keep
 it enabled by default, as it would cause unnecessary hassle when using
 the enforce flag.
 
 This disables kvm_mmu on all machine-types. With this fix, the possible
 scenarios when migrating from QEMU = 1.3 to QEMU 1.4 are;
 
 ++
  src kernel | dst kernel | Result
 ++
  = 2.6.33  | any| kvm_mmu was already disabled and will stay disabled
  = 2.6.32  | = 3.3 | correct live migration is impossible
  = 2.6.32  | = 3.2 | kvm_mmu will be disabled on next guest reboot *
 ++

When using ASCII art, please remember to use at most 76 characters, to
avoid linewraps in git-log. Shortening the second column fixes this.

Andreas

 
  * If they are running kernel = 2.6.32 and want kvm_mmu to be kept
enabled on guest reboot, they can explicitly add +kvm_mmu to the QEMU
command-line. Using 2.6.33 and higher, it is not possible to enable
kvm_mmu explicitly anymore.
 
 Signed-off-by: Eduardo Habkost ehabk...@redhat.com

-- 
SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg, Germany
GF: Jeff Hawn, Jennifer Guild, Felix Imendörffer; HRB 16746 AG Nürnberg
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] [PATCH qom-cpu 6/7] target-i386/cpu.c: Add feature name array for ext4_features

2013-01-10 Thread Andreas Färber
Am 07.01.2013 19:20, schrieb Eduardo Habkost:
 Feature names were taken from the X86_FEATURE_* constants in the Linux
 kernel code.
 
 Signed-off-by: Eduardo Habkost ehabk...@redhat.com
 ---
 Cc: Gleb Natapov g...@redhat.com
 ---
  target-i386/cpu.c | 17 +
  1 file changed, 17 insertions(+)
 
 diff --git a/target-i386/cpu.c b/target-i386/cpu.c
 index 4b3ee63..a54c464 100644
 --- a/target-i386/cpu.c
 +++ b/target-i386/cpu.c
 @@ -95,6 +95,17 @@ static const char *ext3_feature_name[] = {
  NULL, NULL, NULL, NULL,
  };
  
 +static const char *ext4_feature_name[] = {
 +NULL, NULL, xstore,xstore-en,
 +NULL, NULL, xcrypt,xcrypt-en,

Missing spaces, fixed.

Andreas

 +ace2, ace2-en, phe, phe-en,
 +pmm, pmm-en, NULL, NULL,
 +NULL, NULL, NULL, NULL,
 +NULL, NULL, NULL, NULL,
 +NULL, NULL, NULL, NULL,
 +NULL, NULL, NULL, NULL,
 +};
 +
  static const char *kvm_feature_name[] = {
  kvmclock, kvm_nopiodelay, kvm_mmu, kvmclock,
  kvm_asyncpf, kvm_steal_time, kvm_pv_eoi, NULL,
[snip]

-- 
SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg, Germany
GF: Jeff Hawn, Jennifer Guild, Felix Imendörffer; HRB 16746 AG Nürnberg
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] [RFC 01/12] kvm: add KVM_FEATURE_CLOCKSOURCE_STABLE_BIT fake #define

2013-01-10 Thread Igor Mammedov
On Wed,  9 Jan 2013 16:53:41 -0200
Eduardo Habkost ehabk...@redhat.com wrote:

 Signed-off-by: Eduardo Habkost ehabk...@redhat.com
 ---
 Cc: kvm@vger.kernel.org
 Cc: Michael S. Tsirkin m...@redhat.com
 Cc: Gleb Natapov g...@redhat.com
 Cc: Marcelo Tosatti mtosa...@redhat.com
 ---
  include/sysemu/kvm.h | 1 +
  1 file changed, 1 insertion(+)
 
 diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
 index 6bdd513..22acf91 100644
 --- a/include/sysemu/kvm.h
 +++ b/include/sysemu/kvm.h
 @@ -36,6 +36,7 @@
  #define KVM_FEATURE_ASYNC_PF 0
  #define KVM_FEATURE_STEAL_TIME   0
  #define KVM_FEATURE_PV_EOI   0
 +#define KVM_FEATURE_CLOCKSOURCE_STABLE_BIT 0
  #endif
  
  extern int kvm_allowed;
 -- 
 1.7.11.7
 
 
Perhaps kvm: add fake KVM_FEATURE_CLOCKSOURCE_STABLE_BIT for TCG only
configurations would be better

-- 
Regards,
  Igor
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC 07/12] target-i386/cpu: Introduce apic_id_for_cpu() function

2013-01-10 Thread Igor Mammedov
On Wed,  9 Jan 2013 16:53:47 -0200
Eduardo Habkost ehabk...@redhat.com wrote:

 This function will be used by both the CPU initialization code and the
 fw_cfg table initialization code.
 
 Later this function will be updated to generate APIC IDs according to
 the CPU topology.
 
 Signed-off-by: Eduardo Habkost ehabk...@redhat.com
 ---
  target-i386/cpu.c | 17 -
  target-i386/cpu.h |  2 ++
  2 files changed, 18 insertions(+), 1 deletion(-)
 
 diff --git a/target-i386/cpu.c b/target-i386/cpu.c
 index 492656c..33787dc 100644
 --- a/target-i386/cpu.c
 +++ b/target-i386/cpu.c
 @@ -2192,6 +2192,21 @@ void x86_cpu_realize(Object *obj, Error **errp)
  cpu_reset(CPU(cpu));
  }
  
 +/* Calculates initial APIC ID for a specific CPU index
 + *
 + * Currently we need to be able to calculate the APIC ID from the CPU index
 + * alone (without requiring a CPU object), as the QEMU-Seabios interfaces 
 have
 + * no concept of CPU index, and the NUMA tables on fw_cfg need the APIC ID 
 of
 + * all CPUs up to max_cpus.
 + */
 +uint32_t apic_id_for_cpu(unsigned int cpu_index)
 +{
 +/* right now APIC ID == CPU index. this will eventually change to use
 + * the CPU topology configuration properly
 + */
 +return cpu_index;
 +}
 +
  static void x86_cpu_initfn(Object *obj)
  {
  CPUState *cs = CPU(obj);
 @@ -2226,7 +2241,7 @@ static void x86_cpu_initfn(Object *obj)
  x86_cpuid_get_tsc_freq,
  x86_cpuid_set_tsc_freq, NULL, NULL, NULL);
  
 -env-cpuid_apic_id = cs-cpu_index;
 +env-cpuid_apic_id = apic_id_for_cpu(cs-cpu_index);
APIC ID is set by board so CPU shouldn't do it by itself.

Could you create static property for it and set it from board using property
instead?

  
  /* init various static tables used in TCG mode */
  if (tcg_enabled()  !inited) {
 diff --git a/target-i386/cpu.h b/target-i386/cpu.h
 index f3c9df5..dbd9899 100644
 --- a/target-i386/cpu.h
 +++ b/target-i386/cpu.h
 @@ -1238,4 +1238,6 @@ void disable_kvm_pv_eoi(void);
  /* Return name of 32-bit register, from a R_* constant */
  const char *get_register_name_32(unsigned int reg);
  
 +uint32_t apic_id_for_cpu(unsigned int cpu_index);
 +
  #endif /* CPU_I386_H */
 -- 
 1.7.11.7
 


-- 
Regards,
  Igor
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC 12/12] pc: Generate APIC IDs according to CPU topology

2013-01-10 Thread Igor Mammedov
On Wed,  9 Jan 2013 16:53:52 -0200
Eduardo Habkost ehabk...@redhat.com wrote:

 This keeps compatibility on machine-types pc-1.2 and older, and prints a
 warning in case the requested configuration won't get the correct
 topology.
 
 I couldn't think of a better way to warn about broken topology when in
 compat mode other than using error_report(). The warning message will be
 probably be buried in a log file somewhere, but it's better than
 nothing.
 
 Signed-off-by: Eduardo Habkost ehabk...@redhat.com
 ---
 Changes v1 - v2:
  - Move code to cpu.c
  - keep using cpu_index on *-user
  - Use SMP.contiguous_apic_ids global property
  - Prints warning in case the compatibility mode will expose incorrect
topology
 
 Changes v2 - v3:
  - Now all code is inside hw/pc.c
  - Use a real PC class and a contiguous_apic_ids property
 
 Changes v3 - v4:
  - Instead of using a global property, use a separate machine init
function and a PCInitArgs field, to implement compatibility mode
  - Use error_report() instead of fprintf(stderr) for the warning
  - Use a field on PCInitArgs instead of a static variable to check
if warning was already printed
 
 Changes v4 - v5:
  - Don't use PCInitArgs: simply add a enable_compat_apic_id_mode()
function and a static compat_apic_id_mode variable, to enable the
compatibility mode
  - Move APIC ID calculation code to cpu.c
 ---
  hw/pc_piix.c  | 13 +++--
  target-i386/cpu.c | 28 
  target-i386/cpu.h |  1 +
  3 files changed, 36 insertions(+), 6 deletions(-)
 
 diff --git a/hw/pc_piix.c b/hw/pc_piix.c
 index 13d7cc8..be4fea1 100644
 --- a/hw/pc_piix.c
 +++ b/hw/pc_piix.c
 @@ -233,11 +233,18 @@ static void pc_init_pci(QEMUMachineInitArgs *args)
   initrd_filename, cpu_model, 1, 1);
  }
  
 +
 +static void pc_init_pci_1_3(QEMUMachineInitArgs *args)
 +{
 +enable_compat_apic_id_mode();
 +pc_init_pci(args);
 +}
 +
  /* PC machine init function for pc-0.14 to pc-1.2 */
  static void pc_init_pci_1_2(QEMUMachineInitArgs *args)
  {
  disable_kvm_pv_eoi();
 -pc_init_pci(args);
 +pc_init_pci_1_3(args);
  }
  
  /* PC init function for pc-0.10 to pc-0.13, and reused by xenfv */
 @@ -250,6 +257,7 @@ static void pc_init_pci_no_kvmclock(QEMUMachineInitArgs 
 *args)
  const char *initrd_filename = args-initrd_filename;
  const char *boot_device = args-boot_device;
  disable_kvm_pv_eoi();
 +enable_compat_apic_id_mode();
  pc_init1(get_system_memory(),
   get_system_io(),
   ram_size, boot_device,
 @@ -268,6 +276,7 @@ static void pc_init_isa(QEMUMachineInitArgs *args)
  if (cpu_model == NULL)
  cpu_model = 486;
  disable_kvm_pv_eoi();
 +enable_compat_apic_id_mode();
  pc_init1(get_system_memory(),
   get_system_io(),
   ram_size, boot_device,
 @@ -305,7 +314,7 @@ static QEMUMachine pc_machine_v1_4 = {
  static QEMUMachine pc_machine_v1_3 = {
  .name = pc-1.3,
  .desc = Standard PC,
 -.init = pc_init_pci,
 +.init = pc_init_pci_1_3,
  .max_cpus = 255,
  .compat_props = (GlobalProperty[]) {
  PC_COMPAT_1_3,
 diff --git a/target-i386/cpu.c b/target-i386/cpu.c
 index 33787dc..f34192c 100644
 --- a/target-i386/cpu.c
 +++ b/target-i386/cpu.c
 @@ -23,6 +23,8 @@
  
  #include cpu.h
  #include sysemu/kvm.h
 +#include sysemu/cpus.h
 +#include topology.h
  
  #include qemu/option.h
  #include qemu/config-file.h
 @@ -2192,6 +2194,14 @@ void x86_cpu_realize(Object *obj, Error **errp)
  cpu_reset(CPU(cpu));
  }
  
 +/* Enables contiguous-apic-ID mode, for compatibility */
 +static bool compat_apic_id_mode;
 +
 +void enable_compat_apic_id_mode(void)
 +{
 +compat_apic_id_mode = true;
 +}
 +
  /* Calculates initial APIC ID for a specific CPU index
   *
   * Currently we need to be able to calculate the APIC ID from the CPU index
 @@ -2201,10 +2211,20 @@ void x86_cpu_realize(Object *obj, Error **errp)
   */
  uint32_t apic_id_for_cpu(unsigned int cpu_index)
if you move ^^^ to board, there won't be need in [8/12]
  {
 -/* right now APIC ID == CPU index. this will eventually change to use
 - * the CPU topology configuration properly
 - */
 -return cpu_index;
 +uint32_t correct_id;
 +static bool warned;
 +
 +correct_id = topo_apicid_for_cpu(smp_cores, smp_threads, cpu_index);
 +if (compat_apic_id_mode) {
 +if (cpu_index != correct_id  !warned) {
 +error_report(APIC IDs set in compatibility mode, 
 + CPU topology won't match the configuration);
 +warned = true;
 +}
 +return cpu_index;
 +} else {
 +return correct_id;
 +}
  }
  
  static void x86_cpu_initfn(Object *obj)
 diff --git a/target-i386/cpu.h b/target-i386/cpu.h
 index dbd9899..d9a9e8f 100644
 --- a/target-i386/cpu.h
 +++ b/target-i386/cpu.h
 @@ -1239,5 +1239,6 @@ void disable_kvm_pv_eoi(void);
  const char 

Re: [RFC 12/12] pc: Generate APIC IDs according to CPU topology

2013-01-10 Thread Eduardo Habkost
On Fri, Jan 11, 2013 at 12:36:35AM +0100, Igor Mammedov wrote:
[...]
  diff --git a/target-i386/cpu.c b/target-i386/cpu.c
  index 33787dc..f34192c 100644
  --- a/target-i386/cpu.c
  +++ b/target-i386/cpu.c
  @@ -23,6 +23,8 @@
   
   #include cpu.h
   #include sysemu/kvm.h
  +#include sysemu/cpus.h
  +#include topology.h
   
   #include qemu/option.h
   #include qemu/config-file.h
  @@ -2192,6 +2194,14 @@ void x86_cpu_realize(Object *obj, Error **errp)
   cpu_reset(CPU(cpu));
   }
   
  +/* Enables contiguous-apic-ID mode, for compatibility */
  +static bool compat_apic_id_mode;
  +
  +void enable_compat_apic_id_mode(void)
  +{
  +compat_apic_id_mode = true;
  +}
  +
   /* Calculates initial APIC ID for a specific CPU index
*
* Currently we need to be able to calculate the APIC ID from the CPU index
  @@ -2201,10 +2211,20 @@ void x86_cpu_realize(Object *obj, Error **errp)
*/
   uint32_t apic_id_for_cpu(unsigned int cpu_index)
 if you move ^^^ to board, there won't be need in [8/12]

True. The previous version I had (apic-id-topology-using-apic-id-prop
branch) didn't need it. But I just wanted to avoid touching the CPU
creation code by now.

BTW, *-user doesn't have a board but does have an APIC ID (because
*-user has CPUID and CPUID exposes the APIC ID). This would mean we
would need APIC ID calculation code present in two separated places
(which I would like to avoid).


   {
  -/* right now APIC ID == CPU index. this will eventually change to use
  - * the CPU topology configuration properly
  - */
  -return cpu_index;
  +uint32_t correct_id;
  +static bool warned;
  +
  +correct_id = topo_apicid_for_cpu(smp_cores, smp_threads, cpu_index);
  +if (compat_apic_id_mode) {
  +if (cpu_index != correct_id  !warned) {
  +error_report(APIC IDs set in compatibility mode, 
  + CPU topology won't match the configuration);
  +warned = true;
  +}
  +return cpu_index;
  +} else {
  +return correct_id;
  +}
   }
   
   static void x86_cpu_initfn(Object *obj)
  diff --git a/target-i386/cpu.h b/target-i386/cpu.h
  index dbd9899..d9a9e8f 100644
  --- a/target-i386/cpu.h
  +++ b/target-i386/cpu.h
  @@ -1239,5 +1239,6 @@ void disable_kvm_pv_eoi(void);
   const char *get_register_name_32(unsigned int reg);
   
   uint32_t apic_id_for_cpu(unsigned int cpu_index);
  +void enable_compat_apic_id_mode(void);
   
   #endif /* CPU_I386_H */
  -- 
  1.7.11.7
  
  --
  To unsubscribe from this list: send the line unsubscribe kvm in
  the body of a message to majord...@vger.kernel.org
  More majordomo info at  http://vger.kernel.org/majordomo-info.html
 
 
 -- 
 Regards,
   Igor

-- 
Eduardo
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC 07/12] target-i386/cpu: Introduce apic_id_for_cpu() function

2013-01-10 Thread Eduardo Habkost
On Fri, Jan 11, 2013 at 12:31:23AM +0100, Igor Mammedov wrote:
 On Wed,  9 Jan 2013 16:53:47 -0200
 Eduardo Habkost ehabk...@redhat.com wrote:
 
  This function will be used by both the CPU initialization code and the
  fw_cfg table initialization code.
  
  Later this function will be updated to generate APIC IDs according to
  the CPU topology.
  
  Signed-off-by: Eduardo Habkost ehabk...@redhat.com
  ---
   target-i386/cpu.c | 17 -
   target-i386/cpu.h |  2 ++
   2 files changed, 18 insertions(+), 1 deletion(-)
  
  diff --git a/target-i386/cpu.c b/target-i386/cpu.c
  index 492656c..33787dc 100644
  --- a/target-i386/cpu.c
  +++ b/target-i386/cpu.c
  @@ -2192,6 +2192,21 @@ void x86_cpu_realize(Object *obj, Error **errp)
   cpu_reset(CPU(cpu));
   }
   
  +/* Calculates initial APIC ID for a specific CPU index
  + *
  + * Currently we need to be able to calculate the APIC ID from the CPU index
  + * alone (without requiring a CPU object), as the QEMU-Seabios 
  interfaces have
  + * no concept of CPU index, and the NUMA tables on fw_cfg need the APIC 
  ID of
  + * all CPUs up to max_cpus.
  + */
  +uint32_t apic_id_for_cpu(unsigned int cpu_index)
  +{
  +/* right now APIC ID == CPU index. this will eventually change to use
  + * the CPU topology configuration properly
  + */
  +return cpu_index;
  +}
  +
   static void x86_cpu_initfn(Object *obj)
   {
   CPUState *cs = CPU(obj);
  @@ -2226,7 +2241,7 @@ static void x86_cpu_initfn(Object *obj)
   x86_cpuid_get_tsc_freq,
   x86_cpuid_set_tsc_freq, NULL, NULL, NULL);
   
  -env-cpuid_apic_id = cs-cpu_index;
  +env-cpuid_apic_id = apic_id_for_cpu(cs-cpu_index);
 APIC ID is set by board so CPU shouldn't do it by itself.
 
 Could you create static property for it and set it from board using property
 instead?

I had a version that had the PC CPU creation code set the apic-id using
a apic-id property, but I am using this method to keep the bug fix
simpler and less intrusive, as I we don't have much time to remodel the
CPU creation code before soft freeze.

The previous version can be seen at:
  
https://github.com/ehabkost/qemu-hacks/tree/apic-id-topology-using-apic-id-prop
The relevant apic-id-setting code is at:
  
https://github.com/ehabkost/qemu-hacks/commit/a79156d0ad99ee89c0def535ce4cafcc4aa9a486

I believe the final solution for calculating the APIC IDs should instead
involve passing the package ID and bit width of core+thread IDs to a
package/socket object (so we don't need to expose the APIC IDs outside
QEMU). But whatever design we're going to choose to allow CPU hotplug, I
didn't want to introduce complexity that will get into our way when we
remodel the CPU creation/hotplug interfaces later.


 
   
   /* init various static tables used in TCG mode */
   if (tcg_enabled()  !inited) {
  diff --git a/target-i386/cpu.h b/target-i386/cpu.h
  index f3c9df5..dbd9899 100644
  --- a/target-i386/cpu.h
  +++ b/target-i386/cpu.h
  @@ -1238,4 +1238,6 @@ void disable_kvm_pv_eoi(void);
   /* Return name of 32-bit register, from a R_* constant */
   const char *get_register_name_32(unsigned int reg);
   
  +uint32_t apic_id_for_cpu(unsigned int cpu_index);
  +
   #endif /* CPU_I386_H */
  -- 
  1.7.11.7
  
 
 
 -- 
 Regards,
   Igor

-- 
Eduardo
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC 02/12] target-i386: Don't set any KVM flag by default if KVM is disabled

2013-01-10 Thread Igor Mammedov
On Wed,  9 Jan 2013 16:53:42 -0200
Eduardo Habkost ehabk...@redhat.com wrote:

 This is a cleanup that tries to solve two small issues:
 
  - We don't need a separate kvm_pv_eoi_features variable just to keep a
constant calculated at compile-time, and this style would require
adding a separate variable (that's declared twice because of the
CONFIG_KVM ifdef) for each feature that's going to be enabled/disable
by machine-type compat code.
  - The pc-1.3 code is setting the kvm_pv_eoi flag on cpuid_kvm_features
even when KVM is disabled at runtime. This small incosistency in
the cpuid_kvm_features field isn't a problem today because
cpuid_kvm_features is ignored by the TCG code, but it may cause
unexpected problems later when refactoring the CPUID handling code.
 
 This patch eliminates the kvm_pv_eoi_features variable and simply uses
 kvm_enabled() inside the enable_kvm_pv_eoi() compat function, so it
 enables kvm_pv_eoi only if KVM is enabled. I believe this makes the
 behavior of enable_kvm_pv_eoi() clearer and easier to understand.
 
 Signed-off-by: Eduardo Habkost ehabk...@redhat.com
 ---
 Cc: kvm@vger.kernel.org
 Cc: Michael S. Tsirkin m...@redhat.com
 Cc: Gleb Natapov g...@redhat.com
 Cc: Marcelo Tosatti mtosa...@redhat.com
 
[snip]
  void host_cpuid(uint32_t function, uint32_t count,
 @@ -1343,13 +1337,15 @@ static int cpu_x86_parse_featurestr(x86_def_t 
 *x86_cpu_def, char *features)
  unsigned int i;
  char *featurestr; /* Single 'key=value string being parsed */
  /* Features to be added */
 -FeatureWordArray plus_features = {
 -[FEAT_KVM] = kvm_default_features,
 -};
 +FeatureWordArray plus_features = { 0 };
  /* Features to be removed */
  FeatureWordArray minus_features = { 0 };
  uint32_t numvalue;
  
 +if (kvm_enabled()) {
 +plus_features[FEAT_KVM] = kvm_default_features;
 +}
While touching it please move setting defaults to cpu_x86_register() or
cpu_x86_find_by_name() to so that cpu_x86_parse_featurestr() would deal only
with custom settings.

[snip]

-- 
Regards,
  Igor
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC 07/12] target-i386/cpu: Introduce apic_id_for_cpu() function

2013-01-10 Thread Eduardo Habkost
On Fri, Jan 11, 2013 at 12:31:23AM +0100, Igor Mammedov wrote:
 On Wed,  9 Jan 2013 16:53:47 -0200
 Eduardo Habkost ehabk...@redhat.com wrote:
 
  This function will be used by both the CPU initialization code and the
  fw_cfg table initialization code.
  
  Later this function will be updated to generate APIC IDs according to
  the CPU topology.
  
  Signed-off-by: Eduardo Habkost ehabk...@redhat.com
  ---
   target-i386/cpu.c | 17 -
   target-i386/cpu.h |  2 ++
   2 files changed, 18 insertions(+), 1 deletion(-)
  
  diff --git a/target-i386/cpu.c b/target-i386/cpu.c
  index 492656c..33787dc 100644
  --- a/target-i386/cpu.c
  +++ b/target-i386/cpu.c
  @@ -2192,6 +2192,21 @@ void x86_cpu_realize(Object *obj, Error **errp)
   cpu_reset(CPU(cpu));
   }
   
  +/* Calculates initial APIC ID for a specific CPU index
  + *
  + * Currently we need to be able to calculate the APIC ID from the CPU index
  + * alone (without requiring a CPU object), as the QEMU-Seabios 
  interfaces have
  + * no concept of CPU index, and the NUMA tables on fw_cfg need the APIC 
  ID of
  + * all CPUs up to max_cpus.
  + */
  +uint32_t apic_id_for_cpu(unsigned int cpu_index)
  +{
  +/* right now APIC ID == CPU index. this will eventually change to use
  + * the CPU topology configuration properly
  + */
  +return cpu_index;
  +}
  +
   static void x86_cpu_initfn(Object *obj)
   {
   CPUState *cs = CPU(obj);
  @@ -2226,7 +2241,7 @@ static void x86_cpu_initfn(Object *obj)
   x86_cpuid_get_tsc_freq,
   x86_cpuid_set_tsc_freq, NULL, NULL, NULL);
   
  -env-cpuid_apic_id = cs-cpu_index;
  +env-cpuid_apic_id = apic_id_for_cpu(cs-cpu_index);
 APIC ID is set by board so CPU shouldn't do it by itself.
 
 Could you create static property for it and set it from board using property
 instead?

I had a version that had the PC CPU creation code set the apic-id using
a apic-id property, but I am using this method to keep the bug fix
simpler and less intrusive, as I we don't have much time to remodel the
CPU creation code before soft freeze.

The previous version can be seen at:
  
https://github.com/ehabkost/qemu-hacks/tree/apic-id-topology-using-apic-id-prop
The relevant apic-id-setting code is at:
  
https://github.com/ehabkost/qemu-hacks/commit/a79156d0ad99ee89c0def535ce4cafcc4aa9a486

I believe the final solution for calculating the APIC IDs should instead
involve passing the package ID and bit width of core+thread IDs to a
package/socket object (so we don't need to expose the APIC IDs outside
QEMU). But whatever design we're going to choose to allow CPU hotplug, I
didn't want to introduce complexity that will get into our way when we
remodel the CPU creation/hotplug interfaces later.


 
   
   /* init various static tables used in TCG mode */
   if (tcg_enabled()  !inited) {
  diff --git a/target-i386/cpu.h b/target-i386/cpu.h
  index f3c9df5..dbd9899 100644
  --- a/target-i386/cpu.h
  +++ b/target-i386/cpu.h
  @@ -1238,4 +1238,6 @@ void disable_kvm_pv_eoi(void);
   /* Return name of 32-bit register, from a R_* constant */
   const char *get_register_name_32(unsigned int reg);
   
  +uint32_t apic_id_for_cpu(unsigned int cpu_index);
  +
   #endif /* CPU_I386_H */
  -- 
  1.7.11.7
  
 
 
 -- 
 Regards,
   Igor

-- 
Eduardo
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC 02/12] target-i386: Don't set any KVM flag by default if KVM is disabled

2013-01-10 Thread Eduardo Habkost
On Fri, Jan 11, 2013 at 12:07:40AM +0100, Igor Mammedov wrote:
[...]
  @@ -1343,13 +1337,15 @@ static int cpu_x86_parse_featurestr(x86_def_t 
  *x86_cpu_def, char *features)
   unsigned int i;
   char *featurestr; /* Single 'key=value string being parsed */
   /* Features to be added */
  -FeatureWordArray plus_features = {
  -[FEAT_KVM] = kvm_default_features,
  -};
  +FeatureWordArray plus_features = { 0 };
   /* Features to be removed */
   FeatureWordArray minus_features = { 0 };
   uint32_t numvalue;
   
  +if (kvm_enabled()) {
  +plus_features[FEAT_KVM] = kvm_default_features;
  +}
 While touching it please move setting defaults to cpu_x86_register() or
 cpu_x86_find_by_name() to so that cpu_x86_parse_featurestr() would deal only
 with custom settings.

OK, I will do it in the next version.

-- 
Eduardo
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] [RFC 01/12] kvm: add KVM_FEATURE_CLOCKSOURCE_STABLE_BIT fake #define

2013-01-10 Thread Eduardo Habkost
On Fri, Jan 11, 2013 at 12:15:16AM +0100, Igor Mammedov wrote:
[...]
  diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
  index 6bdd513..22acf91 100644
  --- a/include/sysemu/kvm.h
  +++ b/include/sysemu/kvm.h
  @@ -36,6 +36,7 @@
   #define KVM_FEATURE_ASYNC_PF 0
   #define KVM_FEATURE_STEAL_TIME   0
   #define KVM_FEATURE_PV_EOI   0
  +#define KVM_FEATURE_CLOCKSOURCE_STABLE_BIT 0
   #endif
snip
   
 Perhaps kvm: add fake KVM_FEATURE_CLOCKSOURCE_STABLE_BIT for TCG only
 configurations would be better

I'll change it in the next version. Thanks.

-- 
Eduardo
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [kvmarm] [PATCH v5.1 0/2] KVM: ARM: Rename KVM_SET_DEVICE_ADDRESS

2013-01-10 Thread Marcelo Tosatti
On Thu, Jan 10, 2013 at 04:40:12PM -0600, Scott Wood wrote:
 On 01/10/2013 04:28:01 PM, Marcelo Tosatti wrote:
 On Wed, Jan 09, 2013 at 10:37:20PM +0100, Alexander Graf wrote:
 
  
We can start to introduce (and fix ARM) with a generic
 ioctl in the MPIC patches then.
   
The ioctl is already generic, except for its name.
   It's making a few wrong assumptions:
* maximum size of value is u64
  
   This is tolerable IMHO.
  
* combining device id (variable) with addr type id (const)
 into a single field. It could just be split into multiple fields
  
   I agree, but that could be lived with as well.
  
   I get that there's a tradeoff between getting something in
 now, versus waiting until the API is more refined.  Tagging it
 with a particular ISA seems like an odd way of saying soon to be
 deprecated, though.  What happens if we're still squabbling over
 the perfect replacement API when we're trying to push PPC MPIC
 stuff in?
 
 As mentioned, i fail to see the benefit in sharing 0.0x% of the
 code (the
 interface), while the remaining code is not shared.
 
 Pointlessly making things architecture-specific just makes more work
 for people who later need the functionality on another architecture.
 It might not be much in this case (unless a particular device ends
 up being used on multiple architectures), but the principle still
 applies.  It's also more work for tools like strace, and could get
 in the way of the userspace caller using common code.
 
  Then we're the ones who have to come up with a good interface.
 
 Or just have KVM_SET_PPC_DEVICE_ADDRESS. Is there a downside to that?
 
 Besides the above, and my original complaint that it shouldn't be
 specific to addresses?
 
 -Scott

I did not really grasp that ('shouldnt be specific to addresses'), but
anyway.

OK, can you write down your proposed improvements to the interface?
In case you have something ready, otherwise there is time pressure 
to merge the ARM port.

That is, if you have interest/energy to spend in a possibly reusable 
interface, as long as that does not delay integration of the ARM code,
i don't think the ARM people will mind that.

As mentioned in the thread, they (ARM) require configurable device 
address.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] [PATCH qom-cpu 0/7] disable kvm_mmu + -cpu enforce fixes (v3)

2013-01-10 Thread Andreas Färber
Am 07.01.2013 19:20, schrieb Eduardo Habkost:
 Eduardo Habkost (7):
   kvm: Add fake KVM constants to avoid #ifdefs on KVM-specific code
[...]
   target-i386: Disable kvm_mmu by default
   target-i386/cpu: Introduce FeatureWord typedefs
   target-i386: kvm_check_features_against_host(): Use feature_word_info
   target-i386/cpu.c: Add feature name array for ext4_features
   target-i386: check/enforce: Check all feature words

Thanks, applied these to qom-cpu:
https://github.com/afaerber/qemu-cpu/commits/qom-cpu

Andreas

-- 
SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg, Germany
GF: Jeff Hawn, Jennifer Guild, Felix Imendörffer; HRB 16746 AG Nürnberg
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [kvmarm] [PATCH v5.1 0/2] KVM: ARM: Rename KVM_SET_DEVICE_ADDRESS

2013-01-10 Thread Scott Wood

On 01/10/2013 06:35:02 PM, Marcelo Tosatti wrote:

On Thu, Jan 10, 2013 at 04:40:12PM -0600, Scott Wood wrote:
 On 01/10/2013 04:28:01 PM, Marcelo Tosatti wrote:
 Or just have KVM_SET_PPC_DEVICE_ADDRESS. Is there a downside to  
that?


 Besides the above, and my original complaint that it shouldn't be
 specific to addresses?

 -Scott

I did not really grasp that ('shouldnt be specific to addresses'), but
anyway.


A device may have other configuration parameters that need to be set,
besides addresses.  PPC MPIC will require information about the vendor
and version, for example.


OK, can you write down your proposed improvements to the interface?
In case you have something ready, otherwise there is time pressure
to merge the ARM port.


My original request was just to change the name to something like
KVM_SET_DEVICE_CONFIG or KVM_SET_DEVICE_ATTR, and not make the id
encoding architecture-specific (preferably, separate into a device id
field and an attribute id field rather than using bitfields).  Actual
values for device id could be architecture-specific (or there could be a
global enumeration), and attribute id values would be device-specific.

Alex suggested that an ideal interface might accept values larger than  
64

bits, though I think it's good enough -- there are currently no proposed
uses that need more than 64 bits for a single attribute (unlike  
ONE_REG),

and if it is needed, such configuration could be split up between
multiple attributes, or the attribute could specify that value be a
userspace pointer to the actual data (as with ONE_REG).

Here's a writeup (the ARM details would go under ARM/vGIC-specific
documentation):

4.80 KVM_SET_DEVICE_ATTR

Capability: KVM_CAP_SET_DEVICE_ATTR
Type: vm ioctl
Parameters: struct kvm_device_attr (in)
Returns: 0 on success, -1 on error
Errors:
  ENODEV: The device id is unknown
  ENXIO:  Device not supported on current system
  Other errors may be returned by specific devices and attributes.

struct kvm_device_attr {
__u32 device;
__u32 attr;
__u64 value;
};

Specify an attribute of a device emulated or directly exposed by the
kernel, which the host kernel needs to know about.  The device field is  
an

architecture-specific identifier for a specific device.  The attr field
is a device-specific identifier for a specific attribute.  Individual
attributes may have particular requirements for when they can and cannot
be set.


That is, if you have interest/energy to spend in a possibly reusable
interface, as long as that does not delay integration of the ARM code,
i don't think the ARM people will mind that.


The impression I've been given is that just about any change will delay
the integration at this point.  If that's the case, and everyone's OK
with having an interface that is deprecated on arrival, then fine.

-Scott
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [PATCH v9 2/3] x86, apicv: add virtual x2apic support

2013-01-10 Thread Zhang, Yang Z
Gleb Natapov wrote on 2013-01-10:
 On Thu, Jan 10, 2013 at 08:32:06AM +, Zhang, Yang Z wrote:
 Gleb Natapov wrote on 2013-01-10:
 On Thu, Jan 10, 2013 at 03:26:07PM +0800, Yang Zhang wrote:
 From: Yang Zhang yang.z.zh...@intel.com
 
 basically to benefit from apicv, we need to enable virtualized x2apic mode.
 Currently, we only enable it when guest is really using x2apic.
 
 Also, clear MSR bitmap for corresponding x2apic MSRs when guest enabled
 x2apic:
 0x800 - 0x8ff: no read intercept for apicv register virtualization,
   except APIC ID and TMCCT.
 APIC ID and TMCCT: need software's assistance to get right value.
 TPR,EOI,SELF-IPI: no write intercept for virtual interrupt delivery.
 Signed-off-by: Kevin Tian kevin.t...@intel.com
 Signed-off-by: Yang Zhang yang.z.zh...@intel.com
 ---
  arch/x86/include/asm/kvm_host.h |2 + arch/x86/include/asm/vmx.h
|1 + arch/x86/kvm/lapic.c|5 +-
arch/x86/kvm/svm.c  |6 + arch/x86/kvm/vmx.c |  194
+-- 5 files
 changed, 200
  insertions(+), 8 deletions(-)
 diff --git a/arch/x86/include/asm/kvm_host.h
 b/arch/x86/include/asm/kvm_host.h index c431b33..572a562 100644 ---
 a/arch/x86/include/asm/kvm_host.h +++
 b/arch/x86/include/asm/kvm_host.h @@ -697,6 +697,8 @@ struct
 kvm_x86_ops {
void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
void (*enable_irq_window)(struct kvm_vcpu *vcpu);
void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
 +  void (*enable_virtual_x2apic_mode)(struct kvm_vcpu *vcpu);
 +  void (*disable_virtual_x2apic_mode)(struct kvm_vcpu *vcpu);
 Make one callback with enable/disable parameter. And do not forget SVM.
 
int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
int (*get_tdp_level)(void);
u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool
 is_mmio);
 diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
 index 44c3f7e..0a54df0 100644
 --- a/arch/x86/include/asm/vmx.h
 +++ b/arch/x86/include/asm/vmx.h
 @@ -139,6 +139,7 @@
  #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x0001 #define
  SECONDARY_EXEC_ENABLE_EPT   0x0002 #define
  SECONDARY_EXEC_RDTSCP 0x0008 +#define
  SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE   0x0010 #define
  SECONDARY_EXEC_ENABLE_VPID  0x0020 #define
  SECONDARY_EXEC_WBINVD_EXITING 0x0040 #define
  SECONDARY_EXEC_UNRESTRICTED_GUEST 0x0080
 diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
 index 0664c13..ec38906 100644
 --- a/arch/x86/kvm/lapic.c
 +++ b/arch/x86/kvm/lapic.c
 @@ -1328,7 +1328,10 @@ void kvm_lapic_set_base(struct kvm_vcpu
 *vcpu,
 u64 value)
u32 id = kvm_apic_id(apic);
u32 ldr = ((id  4)  16) | (1  (id  0xf));
kvm_apic_set_ldr(apic, ldr);
 -  }
 +  kvm_x86_ops-enable_virtual_x2apic_mode(vcpu);
 +  } else
 +  kvm_x86_ops-disable_virtual_x2apic_mode(vcpu);
 +
 You just broke SVM.
apic-base_address = apic-vcpu-arch.apic_base 
 MSR_IA32_APICBASE_BASE;
 diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
 index d29d3cd..0b82cb1 100644
 --- a/arch/x86/kvm/svm.c
 +++ b/arch/x86/kvm/svm.c
 @@ -3571,6 +3571,11 @@ static void update_cr8_intercept(struct
 kvm_vcpu
 *vcpu, int tpr, int irr)
set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
  }
 +static void svm_enable_virtual_x2apic_mode(struct kvm_vcpu *vcpu)
 +{
 +  return;
 +}
 +
  static int svm_nmi_allowed(struct kvm_vcpu *vcpu) {   struct vcpu_svm
  *svm = to_svm(vcpu); @@ -4290,6 +4295,7 @@ static struct kvm_x86_ops
  svm_x86_ops = {   .enable_nmi_window = enable_nmi_window,
.enable_irq_window = enable_irq_window, .update_cr8_intercept =
  update_cr8_intercept,
 +  .enable_virtual_x2apic_mode = svm_enable_virtual_x2apic_mode,
 
.set_tss_addr = svm_set_tss_addr,
.get_tdp_level = get_npt_level,
 diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
 index 688f43f..b203ce7 100644
 --- a/arch/x86/kvm/vmx.c
 +++ b/arch/x86/kvm/vmx.c
 @@ -433,6 +433,8 @@ struct vcpu_vmx {
 
bool rdtscp_enabled;
 +  bool virtual_x2apic_enabled;
 +
/* Support for a guest hypervisor (nested VMX) */
struct nested_vmx nested;
  };
 @@ -767,12 +769,23 @@ static inline bool
 cpu_has_vmx_virtualize_apic_accesses(void)
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
  }
 +static inline bool cpu_has_vmx_virtualize_x2apic_mode(void)
 +{
 +  return vmcs_config.cpu_based_2nd_exec_ctrl 
 +  SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
 +}
 +
  static inline bool cpu_has_vmx_apic_register_virt(void)
  {
return vmcs_config.cpu_based_2nd_exec_ctrl 
SECONDARY_EXEC_APIC_REGISTER_VIRT;
  }
 +static inline bool cpu_has_vmx_virtual_intr_delivery(void)
 +{
 +  return false;
 +}
 +
  static inline bool cpu_has_vmx_flexpriority(void)
  {
return cpu_has_vmx_tpr_shadow() 
 @@ -2544,6 +2557,7 @@ 

Re: [RFC PATCH 0/2] make mac programming for virtio net more robust

2013-01-10 Thread Rusty Russell
Michael S. Tsirkin m...@redhat.com writes:

 On Thu, Jan 10, 2013 at 10:45:39PM +0800, ak...@redhat.com wrote:
 From: Amos Kong ak...@redhat.com
 
 Currenly mac is programmed byte by byte. This means that we
 have an intermediate step where mac is wrong. 
 
 Second patch introduced a new vq control command to set mac
 address in one time.

 As you mention we could alternatively do it without
 new commands, simply add a feature bit that says that MACs are
 in the mac table.
 This would be a much bigger patch, and I'm fine with either way.
 Rusty what do you think?

Hmm, mac filtering and my mac address are not quite the same thing.  I
don't know if it matters for anyone: does it?  The mac address is abused
for things like identifying machines, etc.

If we keep it as a separate concept, Amos' patch seems to make sense.

Cheers,
Rusty.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC PATCH 2/2] virtio-net: introduce a new control to set macaddr

2013-01-10 Thread Rusty Russell
Michael S. Tsirkin m...@redhat.com writes:
 +if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC)) {
 +/* Set MAC address by writing config space */
  vdev-config-set(vdev, offsetof(struct virtio_net_config, mac),
dev-dev_addr, dev-addr_len);
 +}
  

 By the way, why don't we fail the command if VIRTIO_NET_F_MAC is off?
 Rusty?

Looked back through the history for this one.  I think the theory is that
if the guest doesn't set VIRTIO_NET_F_MAC, it means it doesn't care:
it's up to the guest.

Cheers,
Rusty.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH KVM v2 2/4] KVM: additional i8254 output fixes

2013-01-10 Thread Matthew Ogilvie
On Tue, Jan 08, 2013 at 09:41:36AM +0200, Gleb Natapov wrote:
 On Mon, Jan 07, 2013 at 06:35:47PM -0600, mmogi...@miniinfo.net wrote:
  On Mon, 7 Jan 2013 14:04:03 +0200, Gleb Natapov g...@redhat.com wrote:
   On Wed, Dec 26, 2012 at 10:39:54PM -0700, Matthew Ogilvie wrote:
   Make git_get_out() consistent with spec.  Currently pit_get_out()
   doesn't affect IRQ0, but it can be read by the guest in other ways.
   This makes it consistent with proposed changes in qemu's i8254 model
   as well.
   
   See http://bochs.sourceforge.net/techspec/intel-82c54-timer.pdf.gz
   or search the net for 23124406.pdf.
   
   Signed-off-by: Matthew Ogilvie mmogilvi_q...@miniinfo.net
   ---
arch/x86/kvm/i8254.c | 44 ++--
1 file changed, 34 insertions(+), 10 deletions(-)
   
   diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
   index cd4ec60..fd38938 100644
   --- a/arch/x86/kvm/i8254.c
   +++ b/arch/x86/kvm/i8254.c
   @@ -144,6 +144,10 @@ static int pit_get_count(struct kvm *kvm, int
   channel)

WARN_ON(!mutex_is_locked(kvm-arch.vpit-pit_state.lock));

   +/* FIXME: Add some way to represent a paused timer and return
   + *   the paused-at counter value, to better model gate pausing,
   + *   wait until next CLK pulse to load counter logic, etc.
   + */
t = kpit_elapsed(kvm, c, channel);
d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC);

   @@ -155,8 +159,7 @@ static int pit_get_count(struct kvm *kvm, int
   channel)
counter = (c-count - d)  0x;
break;
case 3:
   -/* XXX: may be incorrect for odd counts */
   -counter = c-count - (mod_64((2 * d), c-count));
   +counter = (c-count - (mod_64((2 * d), c-count)))  
   0xfffe;
break;
default:
counter = c-count - mod_64(d, c-count);
   @@ -180,20 +183,18 @@ static int pit_get_out(struct kvm *kvm, int
   channel)
switch (c-mode) {
default:
case 0:
   -out = (d = c-count);
   -break;
case 1:
   -out = (d  c-count);
   +out = (d = c-count);
break;
case 2:
   -out = ((mod_64(d, c-count) == 0)  (d != 0));
   +out = (mod_64(d, c-count) != (c-count - 1) || c-gate 
   == 0);
break;
case 3:
   -out = (mod_64(d, c-count)  ((c-count + 1)  1));
   +out = (mod_64(d, c-count)  ((c-count + 1)  1) || 
   c-gate == 0);
break;
case 4:
case 5:
   -out = (d == c-count);
   +out = (d != c-count);
break;
}

   @@ -367,7 +368,7 @@ static void pit_load_count(struct kvm *kvm, int
   channel, u32 val)

/*
 * The largest possible initial count is 0; this is equivalent
   - * to 216 for binary counting and 104 for BCD counting.
   + * to pow(2,16) for binary counting and pow(10,4) for BCD 
   counting.
 */
if (val == 0)
val = 0x1;
   @@ -376,6 +377,26 @@ static void pit_load_count(struct kvm *kvm, int
   channel, u32 val)

if (channel != 0) {
ps-channels[channel].count_load_time = ktime_get();
   +
   +/* In gate-triggered one-shot modes,
   + * indirectly model some pit_get_out()
   + * cases by setting the load time way
   + * back until gate-triggered.
   + * (Generally only affects reading status
   + * from channel 2 speaker,
   + * due to hard-wired gates on other
   + * channels.)
   + *
   + * FIXME: This might be redesigned if a paused
   + * timer state is added for pit_get_count().
   + */
   +if (ps-channels[channel].mode == 1 ||
   +ps-channels[channel].mode == 5) {
   +u64 delta = muldiv64(val+2, NSEC_PER_SEC, 
   KVM_PIT_FREQ);
   +ps-channels[channel].count_load_time =
   +   
   ktime_sub(ps-channels[channel].count_load_time,
   +  ns_to_ktime(delta));
   I do not understand what are you trying to do here. You assume that
   trigger will happen 2 clocks after counter is loaded?
   
  
  Modes 1 and 5 are single-shot, and they do not start counting until GATE
  is triggered, potentially well after count is loaded.  So this is
  attempting to model the start of countdown has not been triggered
  state as being mostly identical to the already triggered and also
  expired some number of clocks (2) ago state.
  
 So 

Re: [PATCH KVM v2 1/4] KVM: fix i8254 IRQ0 to be normally high

2013-01-10 Thread Matthew Ogilvie
On Tue, Jan 08, 2013 at 09:31:19AM +0200, Gleb Natapov wrote:
 On Mon, Jan 07, 2013 at 06:17:22PM -0600, mmogi...@miniinfo.net wrote:
  On Mon, 7 Jan 2013 11:39:18 +0200, Gleb Natapov g...@redhat.com wrote:
   On Wed, Dec 26, 2012 at 10:39:53PM -0700, Matthew Ogilvie wrote:
   Reading the spec, it is clear that most modes normally leave the IRQ
   output line high, and only pulse it low to generate a leading edge.
   Especially the most commonly used mode 2.
   
   The KVM i8254 model does not try to emulate the duration of the pulse at
   all, so just swap the high/low settings it to leave it high most of
   the time.
   
   This fix is a prerequisite to improving the i8259 model to handle
   the trailing edge of an interupt request as indicated in its spec:
   If it gets a trailing edge of an IRQ line before it starts to service
   the interrupt, the request should be canceled.
   
   See http://bochs.sourceforge.net/techspec/intel-82c54-timer.pdf.gz
   or search the net for 23124406.pdf.
   
   Risks:
   
   There is a risk that migrating a running guest between versions
   with and without this patch will lose or gain a single timer
   interrupt during the migration process.  The only case where
   Can you elaborate on how exactly this can happen? Do not see it.
   
  
  KVM 8254: In the corrected model, when the count expires, the model
  briefly pulses output low and then high again, with the low to high
  transition being what triggers the interrupt.  In the old model,
  when the count expires, the model expects the output line
  to already be low, and briefly pulses it high (triggering the
  interrupt) and then low again.  But if the line was already
  high (because it migrated from the corrected model),
  this won't generate a new leading edge (low to high) and won't
  trigger a new interrupt (the first post-back-migration pulse turns
  into a simple trailing edge instead of a pulse).
  
  Unless there is something I'm missing?
  
 No, I missed that pic-last_irr/ioapic-irr  will be migrated as 1. But
 this means that next interrupt after migration from new to old will
 always be lost.  What about clearing pit bit from last_irr/irr before
 migration? Should not affect new-new migration and should fix new-old
 one. The only problem is that we may need to consult irq routing table
 to know how pit is connected to ioapic.

We should not clear both last_irr and irr.  That
cancels the interrupt early if the CPU hasn't started servicing
it yet:  If the guest CPU has interrupts disabled
and hasn't begun to service the interrupt, a new-new migration could
lose the interrupt much earlier in the countdown cycle than it otherwise
might be lost.

Potentially we could clear the last_irr bit only.  This effectively
makes migration behave like the old unfixed code.  But if this is
considered acceptable, I would be more inclined to not fix IRQ0 at all,
rather than make IRQ0 work subtly differently normally vs during
migration.  One of my earlier patch series (qemu v7 dated Nov 25)
attempts to use basically this strategy for the qemu model, at
least in the short term (and then potentially fix it properly
in the longer term), although the details of series might
be tweaked.

Or the minimal-risk strategy is to ONLY fix the cascade IRQ2's
trailing edge [the original i825* problem that spawned this whole
thing], leaving all other IRQs as-is.

 
 Still do not see how can we gain one interrupt.

In most cases I don't think we get an extra interrupt from
a direct fix.  But some kinds of attempts to work around missing
interrupts during migration can cause cause extra interrupts in other
cases.  In the old qemu model (but perhaps not kvm), the mode 2 leading
edge occurs one clock tick earlier than it ought to.  So if you
try to be tricky with adjusting levels during migration, you
may introduce possible cases where it gets an interrupt in
the old model, then migrates and gets another interrupt one tick
later in the new model...

Also, it occurs to me that maybe there might be an off-by-one issue in
the kvm model of mode 2 that ought to be fixed as well?  Although the  
zero length pulse in kvm suggests that one-off issues in counters do  
not matter.  In the qemu model, the leading edge of OUT in mode 2
moves by one tick...

 
  The qemu 8254 model actually models each edge at independent
  clock ticks instead of combining both into a very brief pulse at one time.
  I've found it handy to draw out old and new timing diagrams on paper
  (for each mode), and then carefully think about what happens with respect
  to levels and edges when you transition back and forth between old and
  new models at various points in the timing cycle.  (Note I've spent more
  time examining the qemu models rather than the kvm models.)
  
 Yes, drawing it definitely helps :)
 
   this is likely to be serious is probably losing a single-shot (mode 4)
   interrupt, but if my understanding of how things work is good, then
   that should only be 

Re: [kvmarm] [PATCH v5.1 0/2] KVM: ARM: Rename KVM_SET_DEVICE_ADDRESS

2013-01-10 Thread Christoffer Dall




On 10/01/2013, at 20.10, Scott Wood scottw...@freescale.com wrote:

 On 01/10/2013 06:35:02 PM, Marcelo Tosatti wrote:
 On Thu, Jan 10, 2013 at 04:40:12PM -0600, Scott Wood wrote:
  On 01/10/2013 04:28:01 PM, Marcelo Tosatti wrote:
  Or just have KVM_SET_PPC_DEVICE_ADDRESS. Is there a downside to that?
 
  Besides the above, and my original complaint that it shouldn't be
  specific to addresses?
 
  -Scott
 I did not really grasp that ('shouldnt be specific to addresses'), but
 anyway.
 
 A device may have other configuration parameters that need to be set,
 besides addresses.  PPC MPIC will require information about the vendor
 and version, for example.
 
 OK, can you write down your proposed improvements to the interface?
 In case you have something ready, otherwise there is time pressure
 to merge the ARM port.
 
 My original request was just to change the name to something like
 KVM_SET_DEVICE_CONFIG or KVM_SET_DEVICE_ATTR, and not make the id
 encoding architecture-specific (preferably, separate into a device id
 field and an attribute id field rather than using bitfields).  Actual
 values for device id could be architecture-specific (or there could be a
 global enumeration), and attribute id values would be device-specific.
 
 Alex suggested that an ideal interface might accept values larger than 64
 bits, though I think it's good enough -- there are currently no proposed
 uses that need more than 64 bits for a single attribute (unlike ONE_REG),
 and if it is needed, such configuration could be split up between
 multiple attributes, or the attribute could specify that value be a
 userspace pointer to the actual data (as with ONE_REG).
 
 Here's a writeup (the ARM details would go under ARM/vGIC-specific
 documentation):
 
 4.80 KVM_SET_DEVICE_ATTR
 
 Capability: KVM_CAP_SET_DEVICE_ATTR
 Type: vm ioctl
 Parameters: struct kvm_device_attr (in)
 Returns: 0 on success, -1 on error
 Errors:
  ENODEV: The device id is unknown
  ENXIO:  Device not supported on current system
  Other errors may be returned by specific devices and attributes.
 
 struct kvm_device_attr {
__u32 device;
__u32 attr;
__u64 value;
 };
 
 Specify an attribute of a device emulated or directly exposed by the
 kernel, which the host kernel needs to know about.  The device field is an
 architecture-specific identifier for a specific device.  The attr field
 is a device-specific identifier for a specific attribute.  Individual
 attributes may have particular requirements for when they can and cannot
 be set.
 
 That is, if you have interest/energy to spend in a possibly reusable
 interface, as long as that does not delay integration of the ARM code,
 i don't think the ARM people will mind that.
 
 The impression I've been given is that just about any change will delay
 the integration at this point.  If that's the case, and everyone's OK
 with having an interface that is deprecated on arrival, then fine.


That is not entirely the case, but there wasn't event agreement on this revised 
API, and we didn't want to wait for weeks until a decision was made. Alex 
suggested we use a DEV_REG API similar to the ONE_REG API, but I am quite 
strongly against having such an API for this specific purpose as it is too 
semantically distant to what we do on ARM. (Having a DEV_REG API for other 
purposes might be fine though). 

So I have no problem with your suggestion, although we could consider having a 
size and addr field in the struct instead and be slightly more extensible. But 
I don't feel strongly about it. 

If we can agree on Scott's suggestion or with my modification (Alex, Gleb, 
Marcelo ?) then I'll change the KVM/ARM patches to use this and resend them 
right away. But we have to agree now!

If not, I really think we should keep things as they are now, as we're really 
discussing idealistic scenarios here, and the ARM patches have been out of tree 
for long enough. As Marcelo pointed out, the benefits of the perfect API are 
really minimal.

-Christoffer--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [PATCH v9 2/3] x86, apicv: add virtual x2apic support

2013-01-10 Thread Zhang, Yang Z
Gleb Natapov wrote on 2013-01-10:
 On Thu, Jan 10, 2013 at 12:22:51PM +, Zhang, Yang Z wrote:
 Gleb Natapov wrote on 2013-01-10:
 On Thu, Jan 10, 2013 at 11:54:21AM +, Zhang, Yang Z wrote:
 The right logic should be:
 When register virtualization enabled, read all apic msr(except apic id 
 reg
 and
 tmcct which have the hook) not cause vmexit and only write TPR not cause
 vmexit.
 When vid enabled, write to TPR, EOI and SELF IPI not cause vmexit.
 It's better to put the patch of envirtualize x2apic mode as first patch.
 
 There is no point whatsoever to enable virtualize x2apic without
 allowing at least one non intercepted access to x2apic MSR range and
 this is what your patch is doing when run on cpu without vid capability.
 No, at least read/write TPR will not cause vmexit if virtualize x2apic 
 mode is
 enabled.
 For that you need to disable 808H MSR intercept, which your patch does not
 do.
 I want to do this in next patch.
 
 Then don't. There is no point in the patch that enables virtualize
 x2apic and does not allow at least one non-intercepted MSR access.
As I said, read/write TPR will not cause vmexit if virtual x2apic is set.

 I am not sure whether I understand your comments right in previous
 discussion, here is my thinking: 1. enable virtualize x2apic mode if
 guest is really using x2apic and clear TPR in msr read  and write
 bitmap. This will benefit reading TPR. 2. If APIC registers
 virtualization is enabled, clear all bit in rang
 0x800-0x8ff(except apic id reg and tmcct).
 Clear all read bits in the range.
 
 3. If vid is enabled, clear EOI and SELF IPI in msr write map.
 
 Looks OK.
 
 One concern you mentioned is that vid enabled and virtualize x2apic is
 disabled
 but guest still use x2apic. In this case, we still use MSR bitmap to
 intercept x2apic. This means the vEOI update will never happen. But we
 still can benefit from interrupt window.
 
 What interrupt windows? Without virtualized x2apic TPR/EOI
 virtualization will not happen and we rely on that in the code.
 Yes, but we can eliminate vmexit of interrupt window. Interrupt still can
 delivery to guest without vmexit when guest enables interrupt if vid is 
 enabled.
 See SDM vol 3, 29.2.2.
 
 What does it matter that you eliminated vmexit of interrupt window if
 you broke everything else? The vid patch assumes that apic emulation
 either entirely happens in a software when vid is disabled or in a CPU
 if vid is enabled. Mixed mode will not work (apic-isr_count = 1 trick
 will break things for instance). And it is not worth it to complicate
 the code to make it work.
Yes, you are right. It too complicated.
Another question? Why not to hide x2apic capability to guest when vid is 
supported and virtual x2apic mode is not supported? It should be more 
reasonable than disable vid when virtual x2apic mode is unavailable.

Best regards,
Yang


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC PATCH 0/2] make mac programming for virtio net more robust

2013-01-10 Thread Michael S. Tsirkin
On Fri, Jan 11, 2013 at 12:53:07PM +1030, Rusty Russell wrote:
 Michael S. Tsirkin m...@redhat.com writes:
 
  On Thu, Jan 10, 2013 at 10:45:39PM +0800, ak...@redhat.com wrote:
  From: Amos Kong ak...@redhat.com
  
  Currenly mac is programmed byte by byte. This means that we
  have an intermediate step where mac is wrong. 
  
  Second patch introduced a new vq control command to set mac
  address in one time.
 
  As you mention we could alternatively do it without
  new commands, simply add a feature bit that says that MACs are
  in the mac table.
  This would be a much bigger patch, and I'm fine with either way.
  Rusty what do you think?
 
 Hmm, mac filtering and my mac address are not quite the same thing.  I
 don't know if it matters for anyone: does it?
 The mac address is abused
 for things like identifying machines, etc.

I don't know either. I think net core differentiates between mac and
uc_list because linux has to know which mac to use when building
up packets, so at some level, I agree it might be useful to identify the
machine.

BTW netdev/davem should have been copied on this, Amos I think it's a
good idea to remember to do it next time you post.

 
 If we keep it as a separate concept, Amos' patch seems to make sense.

Yes. It also keeps the patch small, I just thought I'd mention the
option.

 
 Cheers,
 Rusty.

-- 
MST
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC PATCH 0/9] KVM: PPC: In-kernel PAPR interrupt controller emulation

2013-01-10 Thread Alexander Graf

On 10.01.2013, at 06:09, Paul Mackerras wrote:

 On Sat, Dec 15, 2012 at 01:06:13PM +1100, Benjamin Herrenschmidt wrote:
 On Sat, 2012-12-15 at 01:46 +0100, Alexander Graf wrote:
 On 05.11.2012, at 04:18, Paul Mackerras wrote:
 
 This series implements in-kernel emulation of the XICS interrupt
 controller specified in the PAPR specification and used by pseries
 guests.  Since the XICS is organized a bit differently to the
 interrupt controllers used on x86 machines, I have defined some new
 ioctls for setting up the XICS and for saving and restoring its
 state.  It may be possible to use some of the currently x86-specific
 ioctls instead.
 
 Is this one already following the new world order that we discussed during 
 KVM Forum? :)
 
 The new world order  (sorry, looks like nobody took notes and
 people expect me to do a write up from memory now ... :-)
 
 Well, mostly we agreed that the x86 stuff wasn't going to work for us.
 
 So basically what we discussed boils down to:
 
 - Move the existing generic KVM ioctl to create the kernel APIC to
 x86 since it's no as-is useful for other archs who, among other things,
 might need to pass argument based on the machine type (type of interrupt
 subsystem for example, etc...)
 
 Assuming you're talking about KVM_CREATE_IRQCHIP, it is already
 handled entirely in arch code (arch/x86/kvm/x86.c and
 arch/ia64/kvm/kvm-ia64.c), along with KVM_GET_IRQCHIP and
 KVM_SET_IRQCHIP.

This part is about QEMU.

 
 - Once that's done, well, instanciating interrupt controller objects
 becomes pretty much an arch specific matter. We could try to keep some
 ioctls somewhat common though it's not *that* useful because the various
 types  arguments are going to be fairly arch specific, so goes for
 configuration. Examples of what could be kept common are:
 
   * Create irq chip, takes at least a type argument, possibly a few
 more type-specific args (or a union, but then let's keep space in there
 since we can't change the size of the struct later as it would impact
 the ioctl number afaik).
 
 The existing KVM_CREATE_IRQCHIP is defined as _IO(...) which means
 that it doesn't read or write memory, but there is still the 3rd
 argument to the ioctl, which would give us 64 bits to indicate the
 type of the top-level IRQ controller (XICS, MPIC, ...), but not much
 more.
 
 It sounds like the agreement at the meeting was more along the lines
 of the KVM_CREATE_IRQCHIP_ARGS ioctl (as in the patches I posted)
 which can be called multiple times to instantiate pieces of the
 interrupt framework, rather than having a KVM_CREATE_IRQCHIP that gets
 called once early on to say that there we are using in-kernel
 interrupt controller emulation, followed by other calls to configure
 the various parts of the framework.  Is that accurate?

KVM_CREATE_IRQCHIP should get a parameter indicating the type of IRQCHIP (XICS 
/ MPIC / APIC / VGIC / ...). The parameter-less version defaults to an arch 
specific default (APIC for x86, VGIC for arm, error on PPC).

I'm not 100% sure yet whether we want to support models with multiple IRQCHIPs. 
If so, we need to return a device token from the KVM_CREATE_IRQCHIP ioctl. 
Maybe it makes more sense to model specific cases like this as separate type 
though with specific, explicit subdevice ids. Not sure.

Other instantiation attributes we don't know that early on should be settable 
between the time frame of vm creation and first execution. An example for this 
are device addresses. Check out the threads KVM: ARM: Rename 
KVM_SET_DEVICE_ADDRESS and KVM: ARM: Introduce KVM_SET_DEVICE_ADDRESS ioctl 
for more information on this particular bit.


Alex

 
  * Assigning the address of an irq chip when it can change (see ARM
 patches)
 
 - The existing business with irqfd etc... is mostly ok, except the
 interfaces messing around with MSIs (see virtio-pci use of kvm functions
 directly). The assignment of an irq number for an MSI must be a hook,
 probably a PCI controller hook, so x86 can get it done via its existing
 kernel interfaces and sane architectures can keep the assignment in qemu
 where it belongs.
 
 So, if I've understood you correctly about what was agreed, the set of
 ioctls that is implemented in the patches I posted is in line with
 what was agreed, isn't it?  If not, where does it differ?  (To recap,
 I had KVM_CREATE_IRQCHIP_ARGS, KVM_IRQCHIP_GET_SOURCES and
 KVM_IRQCHIP_SET_SOURCES, plus a one-reg interface to get/set the
 vcpu-specific state.)
 
 Paul.

--
To unsubscribe from this list: send the line unsubscribe kvm-ppc in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PULL 0/8] ppc patch queue 2013-01-10

2013-01-10 Thread Alexander Graf
Hi Marcelo / Gleb,

This is my current patch queue for ppc.  Please pull.

Highlights this time:

  - Book3S: enable potential sPAPR guest emulation on PR KVM on pHyp
  - BookE: EPR (External Proxy Register) support

Alex

The following changes since commit 908e7d7999bcce70ac52e7f390a8f5cbc55948de:
  Gleb Natapov (1):
KVM: MMU: simplify folding of dirty bit into accessed_dirty

are available in the git repository at:

  git://github.com/agraf/linux-2.6.git kvm-ppc-next

Alexander Graf (6):
  KVM: PPC: Only WARN on invalid emulation
  KVM: PPC: Book3S: PR: Enable alternative instruction for SC 1
  KVM: PPC: BookE: Allow irq deliveries to inject requests
  KVM: PPC: BookE: Emulate mfspr on EPR
  KVM: PPC: BookE: Implement EPR exit
  KVM: PPC: BookE: Add EPR ONE_REG sync

Mihai Caraman (2):
  KVM: PPC: Fix SREGS documentation reference
  KVM: PPC: Fix mfspr/mtspr MMUCFG emulation

 Documentation/virtual/kvm/api.txt   |   43 --
 arch/powerpc/include/asm/kvm_host.h |2 +
 arch/powerpc/include/asm/kvm_ppc.h  |   10 
 arch/powerpc/include/uapi/asm/kvm.h |6 -
 arch/powerpc/kvm/book3s_emulate.c   |   30 
 arch/powerpc/kvm/book3s_pr.c|5 
 arch/powerpc/kvm/booke.c|   40 +++-
 arch/powerpc/kvm/booke_emulate.c|3 ++
 arch/powerpc/kvm/emulate.c  |5 
 arch/powerpc/kvm/powerpc.c  |   13 +-
 include/linux/kvm_host.h|1 +
 include/uapi/linux/kvm.h|6 +
 12 files changed, 153 insertions(+), 11 deletions(-)
--
To unsubscribe from this list: send the line unsubscribe kvm-ppc in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 7/8] KVM: PPC: BookE: Implement EPR exit

2013-01-10 Thread Alexander Graf
The External Proxy Facility in FSL BookE chips allows the interrupt
controller to automatically acknowledge an interrupt as soon as a
core gets its pending external interrupt delivered.

Today, user space implements the interrupt controller, so we need to
check on it during such a cycle.

This patch implements logic for user space to enable EPR exiting,
disable EPR exiting and EPR exiting itself, so that user space can
acknowledge an interrupt when an external interrupt has successfully
been delivered into the guest vcpu.

Signed-off-by: Alexander Graf ag...@suse.de
---
 Documentation/virtual/kvm/api.txt   |   40 +-
 arch/powerpc/include/asm/kvm_host.h |2 +
 arch/powerpc/include/asm/kvm_ppc.h  |9 +++
 arch/powerpc/kvm/booke.c|   14 +++-
 arch/powerpc/kvm/powerpc.c  |   10 
 include/linux/kvm_host.h|1 +
 include/uapi/linux/kvm.h|6 +
 7 files changed, 79 insertions(+), 3 deletions(-)

diff --git a/Documentation/virtual/kvm/api.txt 
b/Documentation/virtual/kvm/api.txt
index 4fc2bfc..a98ed09 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -2246,8 +2246,8 @@ executed a memory-mapped I/O instruction which could not 
be satisfied
 by kvm.  The 'data' member contains the written data if 'is_write' is
 true, and should be filled by application code otherwise.
 
-NOTE: For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_DCR
-  and KVM_EXIT_PAPR the corresponding
+NOTE: For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_DCR,
+  KVM_EXIT_PAPR and KVM_EXIT_EPR the corresponding
 operations are complete (and guest state is consistent) only after userspace
 has re-entered the kernel with KVM_RUN.  The kernel side will first finish
 incomplete operations and then check for pending signals.  Userspace
@@ -2366,6 +2366,25 @@ interrupt for the target subchannel has been dequeued 
and subchannel_id,
 subchannel_nr, io_int_parm and io_int_word contain the parameters for that
 interrupt. ipb is needed for instruction parameter decoding.
 
+   /* KVM_EXIT_EPR */
+   struct {
+   __u32 epr;
+   } epr;
+
+On FSL BookE PowerPC chips, the interrupt controller has a fast patch
+interrupt acknowledge path to the core. When the core successfully
+delivers an interrupt, it automatically populates the EPR register with
+the interrupt vector number and acknowledges the interrupt inside
+the interrupt controller.
+
+In case the interrupt controller lives in user space, we need to do
+the interrupt acknowledge cycle through it to fetch the next to be
+delivered interrupt vector using this exit.
+
+It gets triggered whenever both KVM_CAP_PPC_EPR are enabled and an
+external interrupt has just been delivered into the guest. User space
+should put the acknowledged interrupt vector into the 'epr' field.
+
/* Fix the size of the union. */
char padding[256];
};
@@ -2501,3 +2520,20 @@ handled in-kernel, while the other I/O instructions are 
passed to userspace.
 
 When this capability is enabled, KVM_EXIT_S390_TSCH will occur on TEST
 SUBCHANNEL intercepts.
+
+6.5 KVM_CAP_PPC_EPR
+
+Architectures: ppc
+Parameters: args[0] defines whether the proxy facility is active
+Returns: 0 on success; -1 on error
+
+This capability enables or disables the delivery of interrupts through the
+external proxy facility.
+
+When enabled (args[0] != 0), every time the guest gets an external interrupt
+delivered, it automatically exits into user space with a KVM_EXIT_EPR exit
+to receive the topmost interrupt vector.
+
+When disabled (args[0] == 0), behavior is as if this facility is unsupported.
+
+When this capability is enabled, KVM_EXIT_EPR can occur.
diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index ab49c6c..8a72d59 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -520,6 +520,8 @@ struct kvm_vcpu_arch {
u8 sane;
u8 cpu_type;
u8 hcall_needed;
+   u8 epr_enabled;
+   u8 epr_needed;
 
u32 cpr0_cfgaddr; /* holds the last set cpr0_cfgaddr */
 
diff --git a/arch/powerpc/include/asm/kvm_ppc.h 
b/arch/powerpc/include/asm/kvm_ppc.h
index 5f5f69a..493630e 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -264,6 +264,15 @@ static inline void kvm_linear_init(void)
 {}
 #endif
 
+static inline void kvmppc_set_epr(struct kvm_vcpu *vcpu, u32 epr)
+{
+#ifdef CONFIG_KVM_BOOKE_HV
+   mtspr(SPRN_GEPR, epr);
+#elif defined(CONFIG_BOOKE)
+   vcpu-arch.epr = epr;
+#endif
+}
+
 int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
  struct kvm_config_tlb *cfg);
 int kvm_vcpu_ioctl_dirty_tlb(struct kvm_vcpu *vcpu,
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 964f447..940ec80 100644
--- 

  1   2   >