Re: [PATCH v9 3/3] x86, apicv: add virtual interrupt delivery support
On Thu, Jan 10, 2013 at 03:26:08PM +0800, Yang Zhang wrote: From: Yang Zhang yang.z.zh...@intel.com Virtual interrupt delivery avoids KVM to inject vAPIC interrupts manually, which is fully taken care of by the hardware. This needs some special awareness into existing interrupr injection path: - for pending interrupt, instead of direct injection, we may need update architecture specific indicators before resuming to guest. - A pending interrupt, which is masked by ISR, should be also considered in above update action, since hardware will decide when to inject it at right time. Current has_interrupt and get_interrupt only returns a valid vector from injection p.o.v. Signed-off-by: Kevin Tian kevin.t...@intel.com Signed-off-by: Yang Zhang yang.z.zh...@intel.com --- arch/x86/include/asm/kvm_host.h |5 + arch/x86/include/asm/vmx.h | 11 +++ arch/x86/kvm/irq.c | 56 +++- arch/x86/kvm/lapic.c| 72 +-- arch/x86/kvm/lapic.h| 23 + arch/x86/kvm/svm.c | 18 arch/x86/kvm/vmx.c | 191 +-- arch/x86/kvm/x86.c | 14 +++- include/linux/kvm_host.h|3 + virt/kvm/ioapic.c | 18 virt/kvm/ioapic.h |4 + virt/kvm/irq_comm.c | 22 + virt/kvm/kvm_main.c |5 + 13 files changed, 399 insertions(+), 43 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 572a562..f471856 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -697,6 +697,10 @@ struct kvm_x86_ops { void (*enable_nmi_window)(struct kvm_vcpu *vcpu); void (*enable_irq_window)(struct kvm_vcpu *vcpu); void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); + int (*has_virtual_interrupt_delivery)(struct kvm_vcpu *vcpu); + void (*update_apic_irq)(struct kvm_vcpu *vcpu, int max_irr); + void (*update_eoi_exitmap)(struct kvm_vcpu *vcpu); + void (*set_svi)(int isr); void (*enable_virtual_x2apic_mode)(struct kvm_vcpu *vcpu); void (*disable_virtual_x2apic_mode)(struct kvm_vcpu *vcpu); int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); @@ -993,6 +997,7 @@ int kvm_age_hva(struct kvm *kvm, unsigned long hva); int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); int cpuid_maxphyaddr(struct kvm_vcpu *vcpu); +int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v); int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); int kvm_cpu_get_interrupt(struct kvm_vcpu *v); diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 0a54df0..694586c 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -62,6 +62,7 @@ #define EXIT_REASON_MCE_DURING_VMENTRY 41 #define EXIT_REASON_TPR_BELOW_THRESHOLD 43 #define EXIT_REASON_APIC_ACCESS 44 +#define EXIT_REASON_EOI_INDUCED 45 #define EXIT_REASON_EPT_VIOLATION 48 #define EXIT_REASON_EPT_MISCONFIG 49 #define EXIT_REASON_WBINVD 54 @@ -144,6 +145,7 @@ #define SECONDARY_EXEC_WBINVD_EXITING0x0040 #define SECONDARY_EXEC_UNRESTRICTED_GUEST0x0080 #define SECONDARY_EXEC_APIC_REGISTER_VIRT 0x0100 +#define SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY0x0200 #define SECONDARY_EXEC_PAUSE_LOOP_EXITING0x0400 #define SECONDARY_EXEC_ENABLE_INVPCID0x1000 @@ -181,6 +183,7 @@ enum vmcs_field { GUEST_GS_SELECTOR = 0x080a, GUEST_LDTR_SELECTOR = 0x080c, GUEST_TR_SELECTOR = 0x080e, + GUEST_INTR_STATUS = 0x0810, HOST_ES_SELECTOR= 0x0c00, HOST_CS_SELECTOR= 0x0c02, HOST_SS_SELECTOR= 0x0c04, @@ -208,6 +211,14 @@ enum vmcs_field { APIC_ACCESS_ADDR_HIGH = 0x2015, EPT_POINTER = 0x201a, EPT_POINTER_HIGH= 0x201b, + EOI_EXIT_BITMAP0= 0x201c, + EOI_EXIT_BITMAP0_HIGH = 0x201d, + EOI_EXIT_BITMAP1= 0x201e, + EOI_EXIT_BITMAP1_HIGH = 0x201f, + EOI_EXIT_BITMAP2= 0x2020, + EOI_EXIT_BITMAP2_HIGH = 0x2021, + EOI_EXIT_BITMAP3= 0x2022, + EOI_EXIT_BITMAP3_HIGH = 0x2023, GUEST_PHYSICAL_ADDRESS = 0x2400, GUEST_PHYSICAL_ADDRESS_HIGH = 0x2401, VMCS_LINK_POINTER = 0x2800, diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index b111aee..e113440 100644 ---
Re: [PATCH v9 2/3] x86, apicv: add virtual x2apic support
On Thu, Jan 10, 2013 at 03:26:07PM +0800, Yang Zhang wrote: +static void vmx_enable_virtual_x2apic_mode(struct kvm_vcpu *vcpu) +{ + u32 exec_control; + int msr; + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (!cpu_has_vmx_virtualize_x2apic_mode()) + return; + + exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + /* virtualize x2apic mode relies on tpr shadow */ + if (!(exec_control CPU_BASED_TPR_SHADOW)) + return; + + exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + exec_control = ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + exec_control |= SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); + vmx-virtual_x2apic_enabled = true; + + if (!cpu_has_vmx_virtual_intr_delivery()) + return; + + for (msr = 0x800; msr = 0x8ff; msr++) + vmx_intercept_for_msr_read(msr, false, false); + + /* APIC ID */ + vmx_intercept_for_msr_read(0x802, false, true); Why are you enabling apic id read intercept? + /* TMCCT */ + vmx_intercept_for_msr_read(0x839, false, true); + /* TPR */ + vmx_intercept_for_msr_write(0x808, false, false); + /* EOI */ + vmx_intercept_for_msr_write(0x80b, false, false); + /* SELF-IPI */ + vmx_intercept_for_msr_write(0x83f, false, false); + +} + -- Gleb. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH v9 2/3] x86, apicv: add virtual x2apic support
Gleb Natapov wrote on 2013-01-10: On Thu, Jan 10, 2013 at 03:26:07PM +0800, Yang Zhang wrote: +static void vmx_enable_virtual_x2apic_mode(struct kvm_vcpu *vcpu) +{ +u32 exec_control; +int msr; +struct vcpu_vmx *vmx = to_vmx(vcpu); + +if (!cpu_has_vmx_virtualize_x2apic_mode()) +return; + +exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); +/* virtualize x2apic mode relies on tpr shadow */ +if (!(exec_control CPU_BASED_TPR_SHADOW)) +return; + +exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); +exec_control = ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; +exec_control |= SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; +vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); +vmx-virtual_x2apic_enabled = true; + +if (!cpu_has_vmx_virtual_intr_delivery()) +return; + +for (msr = 0x800; msr = 0x8ff; msr++) +vmx_intercept_for_msr_read(msr, false, false); + +/* APIC ID */ +vmx_intercept_for_msr_read(0x802, false, true); Why are you enabling apic id read intercept? Current code to read apic id in x2apic mode has some hacks: if (apic_x2apic_mode(apic)) val = kvm_apic_id(apic); else val = kvm_apic_id(apic) 24; static inline int kvm_apic_id(struct kvm_lapic *apic) { return (kvm_apic_get_reg(apic, APIC_ID) 24) 0xff; } According SPEC, in x2apic mode, the whole id reg is used, but in KVM, it only use the highest eight bits. +/* TMCCT */ +vmx_intercept_for_msr_read(0x839, false, true); +/* TPR */ +vmx_intercept_for_msr_write(0x808, false, false); +/* EOI */ +vmx_intercept_for_msr_write(0x80b, false, false); +/* SELF-IPI */ +vmx_intercept_for_msr_write(0x83f, false, false); + +} + -- Gleb. Best regards, Yang -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH v9 2/3] x86, apicv: add virtual x2apic support
Gleb Natapov wrote on 2013-01-10: On Thu, Jan 10, 2013 at 03:26:07PM +0800, Yang Zhang wrote: From: Yang Zhang yang.z.zh...@intel.com basically to benefit from apicv, we need to enable virtualized x2apic mode. Currently, we only enable it when guest is really using x2apic. Also, clear MSR bitmap for corresponding x2apic MSRs when guest enabled x2apic: 0x800 - 0x8ff: no read intercept for apicv register virtualization, except APIC ID and TMCCT. APIC ID and TMCCT: need software's assistance to get right value. TPR,EOI,SELF-IPI: no write intercept for virtual interrupt delivery. Signed-off-by: Kevin Tian kevin.t...@intel.com Signed-off-by: Yang Zhang yang.z.zh...@intel.com --- arch/x86/include/asm/kvm_host.h |2 + arch/x86/include/asm/vmx.h |1 + arch/x86/kvm/lapic.c|5 +- arch/x86/kvm/svm.c |6 + arch/x86/kvm/vmx.c | 194 +-- 5 files changed, 200 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c431b33..572a562 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -697,6 +697,8 @@ struct kvm_x86_ops { void (*enable_nmi_window)(struct kvm_vcpu *vcpu); void (*enable_irq_window)(struct kvm_vcpu *vcpu); void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); +void (*enable_virtual_x2apic_mode)(struct kvm_vcpu *vcpu); +void (*disable_virtual_x2apic_mode)(struct kvm_vcpu *vcpu); Make one callback with enable/disable parameter. And do not forget SVM. int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); int (*get_tdp_level)(void); u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 44c3f7e..0a54df0 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -139,6 +139,7 @@ #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x0001 #define SECONDARY_EXEC_ENABLE_EPT 0x0002 #define SECONDARY_EXEC_RDTSCP 0x0008 +#define SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE 0x0010 #define SECONDARY_EXEC_ENABLE_VPID 0x0020 #define SECONDARY_EXEC_WBINVD_EXITING 0x0040 #define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x0080 diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 0664c13..ec38906 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1328,7 +1328,10 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) u32 id = kvm_apic_id(apic); u32 ldr = ((id 4) 16) | (1 (id 0xf)); kvm_apic_set_ldr(apic, ldr); -} +kvm_x86_ops-enable_virtual_x2apic_mode(vcpu); +} else +kvm_x86_ops-disable_virtual_x2apic_mode(vcpu); + You just broke SVM. apic-base_address = apic-vcpu-arch.apic_base MSR_IA32_APICBASE_BASE; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index d29d3cd..0b82cb1 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3571,6 +3571,11 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) set_cr_intercept(svm, INTERCEPT_CR8_WRITE); } +static void svm_enable_virtual_x2apic_mode(struct kvm_vcpu *vcpu) +{ +return; +} + static int svm_nmi_allowed(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -4290,6 +4295,7 @@ static struct kvm_x86_ops svm_x86_ops = { .enable_nmi_window = enable_nmi_window, .enable_irq_window = enable_irq_window, .update_cr8_intercept = update_cr8_intercept, +.enable_virtual_x2apic_mode = svm_enable_virtual_x2apic_mode, .set_tss_addr = svm_set_tss_addr, .get_tdp_level = get_npt_level, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 688f43f..b203ce7 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -433,6 +433,8 @@ struct vcpu_vmx { bool rdtscp_enabled; +bool virtual_x2apic_enabled; + /* Support for a guest hypervisor (nested VMX) */ struct nested_vmx nested; }; @@ -767,12 +769,23 @@ static inline bool cpu_has_vmx_virtualize_apic_accesses(void) SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; } +static inline bool cpu_has_vmx_virtualize_x2apic_mode(void) +{ +return vmcs_config.cpu_based_2nd_exec_ctrl +SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; +} + static inline bool cpu_has_vmx_apic_register_virt(void) { return vmcs_config.cpu_based_2nd_exec_ctrl SECONDARY_EXEC_APIC_REGISTER_VIRT; } +static inline bool cpu_has_vmx_virtual_intr_delivery(void) +{ +return false; +} + static inline bool cpu_has_vmx_flexpriority(void) { return cpu_has_vmx_tpr_shadow() @@ -2544,6 +2557,7 @@ static __init int
Re: [PATCH 01/12] tap: multiqueue support
On Wed, Jan 09, 2013 at 11:25:24PM +0800, Jason Wang wrote: On 01/09/2013 05:56 PM, Stefan Hajnoczi wrote: On Fri, Dec 28, 2012 at 06:31:53PM +0800, Jason Wang wrote: diff --git a/qapi-schema.json b/qapi-schema.json index 5dfa052..583eb7c 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -2465,7 +2465,7 @@ { 'type': 'NetdevTapOptions', 'data': { '*ifname': 'str', -'*fd': 'str', +'*fd': ['String'], This change is not backwards-compatible. You need to add a '*fds': ['String'] field instead. I'm not quite understand this case, I think it still work when we we just specify one fd. You are right, the QemuOpts visitor shows no incompatibility. But there is also a QMP interface: netdev_add. I think changing the type to a string list breaks compatibility there. Stefan -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2 1/5] virtio: add functions for piecewise addition of buffers
Il 08/01/2013 01:12, Rusty Russell ha scritto: Unfortunately, that cannot work because not all architectures support chained scatterlists. WHAT? I can't figure out what an arch needs to do to support this? It needs to use the iterator functions in its DMA driver. But we don't care for virtio. True. All archs we care about support them, though, so I think we can ignore this issue for now. Kind of... In principle all QEMU-supported arches can use virtio, and the speedup can be quite useful. And there is no Kconfig symbol for SG chains that I can use to disable virtio-scsi on unsupported arches. :/ Well, we #error if it's not supported. Then the lazy architectures can fix it. Yeah, that would be one approach. But frankly, your patch is really disgusting. :) Not your fault, of course, but I still prefer a limited amount of duplication. Perhaps we can get the best of both worlds, I'll take a look when I have some time. Paolo -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] [PATCH 00/12] Multiqueue virtio-net
On Wed, Jan 09, 2013 at 11:33:25PM +0800, Jason Wang wrote: On 01/09/2013 11:32 PM, Michael S. Tsirkin wrote: On Wed, Jan 09, 2013 at 03:29:24PM +0100, Stefan Hajnoczi wrote: On Fri, Dec 28, 2012 at 06:31:52PM +0800, Jason Wang wrote: Perf Numbers: Two Intel Xeon 5620 with direct connected intel 82599EB Host/Guest kernel: David net tree vhost enabled - lots of improvents of both latency and cpu utilization in request-reponse test - get regression of guest sending small packets which because TCP tends to batch less when the latency were improved 1q/2q/4q TCP_RR size #sessions trans.rate norm trans.rate norm trans.rate norm 1 1 9393.26 595.64 9408.18 597.34 9375.19 584.12 1 2072162.1 2214.24 129880.22 2456.13 196949.81 2298.13 1 50107513.38 2653.99 139721.93 2490.58 259713.82 2873.57 1 100 126734.63 2676.54 145553.5 2406.63 265252.68 2943 64 19453.42 632.33 9371.37 616.13 9338.19 615.97 64 20 70620.03 2093.68 125155.75 2409.15 191239.91 2253.32 64 50 1069662448.29 146518.67 2514.47 242134.07 2720.91 64 100 117046.35 2394.56 190153.09 2696.82 238881.29 2704.41 256 1 8733.29 736.36 8701.07 680.83 8608.92 530.1 256 20 69279.89 2274.45 115103.07 2299.76 144555.16 1963.53 256 50 97676.02 2296.09 150719.57 2522.92 254510.5 3028.44 256 100 150221.55 2949.56 197569.3 2790.92 300695.78 3494.83 TCP_CRR size #sessions trans.rate norm trans.rate norm trans.rate norm 1 1 2848.37 163.41 2230.39 130.89 2013.09 120.47 1 2023434.5 562.11 31057.43 531.07 49488.28 564.41 1 5028514.88 582.17 40494.23 605.92 60113.35 654.97 1 100 28827.22 584.73 48813.25 661.6 61783.62 676.56 64 12780.08 159.4 2201.07 127.96 2006.8 117.63 64 20 23318.51 564.47 30982.44 530.24 49734.95 566.13 64 50 28585.72 582.54 40576.7 610.08 60167.89 656.56 64 100 28747.37 584.17 49081.87 667.87 60612.94 662 256 1 2772.08 160.51 2231.84 131.05 2003.62 113.45 256 20 23086.35 559.8 30929.09 528.16 48454.9 555.22 256 50 28354.7 579.85 40578.31 60760261.71 657.87 256 100 28844.55 585.67 48541.86 659.08 61941.07 676.72 TCP_STREAM guest receiving size #sessions throughput norm throughput norm throughput norm 1 1 16.27 1.33 16.11.12 16.13 0.99 1 2 33.04 2.08 32.96 2.19 32.75 1.98 1 4 66.62 6.83 68.35.56 66.14 2.65 64 1896.55 56.67 914.02 58.14 898.9 61.56 64 21830.46 91.02 1812.02 64.59 1835.57 66.26 64 43626.61 142.55 3636.25 100.64 3607.46 75.03 256 1 2619.49 131.23 2543.19 129.03 2618.69 132.39 256 2 5136.58 203.02 5163.31 141.11 5236.51 149.4 256 4 7063.99 242.83 9365.4 208.49 9421.03 159.94 512 1 3592.43 165.24 3603.12 167.19 3552.5 169.57 512 2 7042.62 246.59 7068.46 180.87 7258.52 186.3 512 4 6996.08 241.49 9298.34 206.12 9418.52 159.33 1024 1 4339.54 192.95 4370.2 191.92 4211.72 192.49 1024 2 7439.45 254.77 9403.99 215.24 9120.82 222.67 1024 4 7953.86 272.11 9403.87 208.23 9366.98 159.49 4096 1 7696.28 272.04 7611.41 270.38 7778.71 267.76 4096 2 7530.35 261.1 8905.43 246.27 8990.18 267.57 4096 4 7121.6 247.02 9411.75 206.71 9654.96 184.67 16384 1 7795.73 268.54 7780.94 267.2 7634.26 260.73 16384 2 7436.57 255.81 9381.86 220.85 9392220.36 16384 4 7199.07 247.81 9420.96 205.87 9373.69 159.57 TCP_MAERTS guest sending size #sessions throughput norm throughput norm throughput norm 1 1 15.94 0.62 15.55 0.61 15.13 0.59 1 2 36.11 0.83 32.46 0.69 32.28 0.69 1 4 71.59 1 68.91 0.94 61.52 0.77 64 1630.71 22.52 622.11 22.35 605.09 21.84 64 21442.36 30.57 1292.15 25.82 1282.67 25.55 64 43186.79 42.59 2844.96 36.03 2529.69 30.06 256 1 1760.96 58.07 1738.44 57.43 1695.99 56.19 256 2 4834.23 95.19 3524.85 64.21 3511.94 64.45 256 4 9324.63 145.74 8956.49 116.39 6720.17 73.86 512 1 2678.03 84.1 2630.68 82.93 2636.54 82.57 512 2 9368.17 195.61 9408.82 204.53 5316.3 92.99 512 4 9186.34 209.68 9358.72 183.82 9489.29 160.42 1024 1 3620.71 109.88 3625.54 109.83 3606.61 112.35 1024 2 9429258.32 7082.79 120.55 7403.53 134.78 1024 4 9430.66 290.44 9499.29 232.31 9414.6 190.92 4096 1 9339.28 296.48 9374.23 372.88 9348.76 298.49 4096 2 9410.53 378.69 9412.61 286.18 9409.75 278.31 4096 4 9487.35 374.1 9556.91 288.81 9441.94 221.64 16384 1 9380.43 403.8 9379.78 399.13 9382.42 393.55 16384 2 9367.69 406.93 9415.04 312.68 9409.29 300.9 16384 4 9391.96 405.17 9695.12 310.54 9423.76 223.47 Trying to understand the performance results: What is the host device configuration? tap + bridge? Yes. Did you use host CPU affinity for the vhost threads? I use numactl to pin cpu threads and vhost threads in the same numa node. Can multiqueue tap take advantage of multiqueue host NICs or is
Re: [PATCH v9 2/3] x86, apicv: add virtual x2apic support
On Thu, Jan 10, 2013 at 08:32:06AM +, Zhang, Yang Z wrote: Gleb Natapov wrote on 2013-01-10: On Thu, Jan 10, 2013 at 03:26:07PM +0800, Yang Zhang wrote: From: Yang Zhang yang.z.zh...@intel.com basically to benefit from apicv, we need to enable virtualized x2apic mode. Currently, we only enable it when guest is really using x2apic. Also, clear MSR bitmap for corresponding x2apic MSRs when guest enabled x2apic: 0x800 - 0x8ff: no read intercept for apicv register virtualization, except APIC ID and TMCCT. APIC ID and TMCCT: need software's assistance to get right value. TPR,EOI,SELF-IPI: no write intercept for virtual interrupt delivery. Signed-off-by: Kevin Tian kevin.t...@intel.com Signed-off-by: Yang Zhang yang.z.zh...@intel.com --- arch/x86/include/asm/kvm_host.h |2 + arch/x86/include/asm/vmx.h |1 + arch/x86/kvm/lapic.c|5 +- arch/x86/kvm/svm.c |6 + arch/x86/kvm/vmx.c | 194 +-- 5 files changed, 200 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c431b33..572a562 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -697,6 +697,8 @@ struct kvm_x86_ops { void (*enable_nmi_window)(struct kvm_vcpu *vcpu); void (*enable_irq_window)(struct kvm_vcpu *vcpu); void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); + void (*enable_virtual_x2apic_mode)(struct kvm_vcpu *vcpu); + void (*disable_virtual_x2apic_mode)(struct kvm_vcpu *vcpu); Make one callback with enable/disable parameter. And do not forget SVM. int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); int (*get_tdp_level)(void); u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 44c3f7e..0a54df0 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -139,6 +139,7 @@ #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x0001 #define SECONDARY_EXEC_ENABLE_EPT 0x0002 #define SECONDARY_EXEC_RDTSCP 0x0008 +#define SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE 0x0010 #define SECONDARY_EXEC_ENABLE_VPID 0x0020 #define SECONDARY_EXEC_WBINVD_EXITING 0x0040 #define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x0080 diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 0664c13..ec38906 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1328,7 +1328,10 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) u32 id = kvm_apic_id(apic); u32 ldr = ((id 4) 16) | (1 (id 0xf)); kvm_apic_set_ldr(apic, ldr); - } + kvm_x86_ops-enable_virtual_x2apic_mode(vcpu); + } else + kvm_x86_ops-disable_virtual_x2apic_mode(vcpu); + You just broke SVM. apic-base_address = apic-vcpu-arch.apic_base MSR_IA32_APICBASE_BASE; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index d29d3cd..0b82cb1 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3571,6 +3571,11 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) set_cr_intercept(svm, INTERCEPT_CR8_WRITE); } +static void svm_enable_virtual_x2apic_mode(struct kvm_vcpu *vcpu) +{ + return; +} + static int svm_nmi_allowed(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -4290,6 +4295,7 @@ static struct kvm_x86_ops svm_x86_ops = { .enable_nmi_window = enable_nmi_window, .enable_irq_window = enable_irq_window, .update_cr8_intercept = update_cr8_intercept, + .enable_virtual_x2apic_mode = svm_enable_virtual_x2apic_mode, .set_tss_addr = svm_set_tss_addr, .get_tdp_level = get_npt_level, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 688f43f..b203ce7 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -433,6 +433,8 @@ struct vcpu_vmx { bool rdtscp_enabled; + bool virtual_x2apic_enabled; + /* Support for a guest hypervisor (nested VMX) */ struct nested_vmx nested; }; @@ -767,12 +769,23 @@ static inline bool cpu_has_vmx_virtualize_apic_accesses(void) SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; } +static inline bool cpu_has_vmx_virtualize_x2apic_mode(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; +} + static inline bool cpu_has_vmx_apic_register_virt(void) { return vmcs_config.cpu_based_2nd_exec_ctrl SECONDARY_EXEC_APIC_REGISTER_VIRT; } +static inline bool cpu_has_vmx_virtual_intr_delivery(void) +{ + return false; +} + static inline bool
Re: [PATCH v9 2/3] x86, apicv: add virtual x2apic support
On Thu, Jan 10, 2013 at 08:31:51AM +, Zhang, Yang Z wrote: Gleb Natapov wrote on 2013-01-10: On Thu, Jan 10, 2013 at 03:26:07PM +0800, Yang Zhang wrote: +static void vmx_enable_virtual_x2apic_mode(struct kvm_vcpu *vcpu) +{ + u32 exec_control; + int msr; + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (!cpu_has_vmx_virtualize_x2apic_mode()) + return; + + exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + /* virtualize x2apic mode relies on tpr shadow */ + if (!(exec_control CPU_BASED_TPR_SHADOW)) + return; + + exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + exec_control = ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + exec_control |= SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); + vmx-virtual_x2apic_enabled = true; + + if (!cpu_has_vmx_virtual_intr_delivery()) + return; + + for (msr = 0x800; msr = 0x8ff; msr++) + vmx_intercept_for_msr_read(msr, false, false); + + /* APIC ID */ + vmx_intercept_for_msr_read(0x802, false, true); Why are you enabling apic id read intercept? Current code to read apic id in x2apic mode has some hacks: if (apic_x2apic_mode(apic)) val = kvm_apic_id(apic); else val = kvm_apic_id(apic) 24; static inline int kvm_apic_id(struct kvm_lapic *apic) { return (kvm_apic_get_reg(apic, APIC_ID) 24) 0xff; } According SPEC, in x2apic mode, the whole id reg is used, but in KVM, it only use the highest eight bits. Correct. Put the comment above apic id intercept that we do that since in x2apic mode apic id is not correctly reflected in apic page. + /* TMCCT */ + vmx_intercept_for_msr_read(0x839, false, true); + /* TPR */ + vmx_intercept_for_msr_write(0x808, false, false); + /* EOI */ + vmx_intercept_for_msr_write(0x80b, false, false); + /* SELF-IPI */ + vmx_intercept_for_msr_write(0x83f, false, false); + +} + -- Gleb. Best regards, Yang -- Gleb. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] [PATCH 10/12] virtio-net: multiqueue support
On 01/10/2013 03:16 PM, Jason Wang wrote: On Thursday, January 10, 2013 02:49:14 PM Wanlong Gao wrote: On 01/10/2013 02:43 PM, Jason Wang wrote: On Wednesday, January 09, 2013 11:26:33 PM Jason Wang wrote: On 01/09/2013 06:01 PM, Wanlong Gao wrote: On 01/09/2013 05:30 PM, Jason Wang wrote: On 01/09/2013 04:23 PM, Wanlong Gao wrote: On 01/08/2013 06:14 PM, Jason Wang wrote: On 01/08/2013 06:00 PM, Wanlong Gao wrote: On 01/08/2013 05:51 PM, Jason Wang wrote: On 01/08/2013 05:49 PM, Wanlong Gao wrote: On 01/08/2013 05:29 PM, Jason Wang wrote: On 01/08/2013 05:07 PM, Wanlong Gao wrote: On 12/28/2012 06:32 PM, Jason Wang wrote: +} else if (nc-peer-info-type != NET_CLIENT_OPTIONS_KIND_TAP) { +ret = -1; +} else { +ret = tap_detach(nc-peer); +} + +return ret; +} + [...] I got guest kernel panic when using this way and set queues=4. Does it happens w/o or w/ a fd parameter? What's the qemu command line? Did you meet it during boot time? The QEMU command line is /work/git/qemu/x86_64-softmmu/qemu-system-x86_64 -name f17 -M pc-0.15 -enable-kvm -m 3096 \ -smp 4,sockets=4,cores=1,threads=1 \ -uuid c31a9f3e-4161-c53a-339c-5dc36d0497cb -no-user-config -nodefaults \ -chardev socket,id=charmonitor,path=/var/lib/libvirt/qemu/f17.monitor,server,nowa i t \ -mon chardev=charmonitor,id=monitor,mode=control \ -rtc base=utc -no-shutdown \ -device piix3-usb-uhci,id=usb,bus=pci.0,addr=0x1.0x2 \ -device virtio-scsi-pci,id=scsi0,bus=pci.0,addr=0xb,num_queues=4,hotplug=on \ -device virtio-serial-pci,id=virtio-serial0,bus=pci.0,addr=0x5 \ -drive file=/vm/f17.img,if=none,id=drive-virtio-disk0,format=qcow2 \ -device virtio-blk-pci,scsi=off,bus=pci.0,addr=0x6,drive=drive-virtio-disk0,id=v i rtio-disk0,bootindex=1 \ -drive file=/vm2/f17-kernel.img,if=none,id=drive-virtio-disk1,format=qcow2 \ -device virtio-blk-pci,scsi=off,bus=pci.0,addr=0x8,drive=drive-virtio-disk1,id=v i rtio-disk1 \ -drive file=/vm/virtio-scsi/scsi3.img,if=none,id=drive-scsi0-0-2-0,format=raw \ -device scsi-hd,bus=scsi0.0,channel=0,scsi-id=0,lun=2,drive=drive-scsi0-0-2-0,id = scsi0-0-2-0,removable=on \ -drive file=/vm/virtio-scsi/scsi4.img,if=none,id=drive-scsi0-0-3-0,format=raw \ -device scsi-hd,bus=scsi0.0,channel=0,scsi-id=0,lun=3,drive=drive-scsi0-0-3-0,id = scsi0-0-3-0 \ -drive file=/vm/virtio-scsi/scsi1.img,if=none,id=drive-scsi0-0-0-0,format=raw \ -device scsi-hd,bus=scsi0.0,channel=0,scsi-id=0,lun=0,drive=drive-scsi0-0-0-0,id = scsi0-0-0-0 \ -drive file=/vm/virtio-scsi/scsi2.img,if=none,id=drive-scsi0-0-1-0,format=raw \ -device scsi-hd,bus=scsi0.0,channel=0,scsi-id=0,lun=1,drive=drive-scsi0-0-1-0,id = scsi0-0-1-0 \ -chardev pty,id=charserial0 -device isa-serial,chardev=charserial0,id=serial0 \ -chardev file,id=charserial1,path=/vm/f17.log \ -device isa-serial,chardev=charserial1,id=serial1 \ -device usb-tablet,id=input0 -vga std \ -device virtio-balloon-pci,id=balloon0,bus=pci.0,addr=0x7 \ -netdev tap,id=hostnet0,vhost=on,queues=4 \ -device virtio-net-pci,netdev=hostnet0,id=net0,mac=52:54:00:ce:7b:29,bus=pci.0,a d dr=0x3 \ -monitor stdio I got panic just after booting the system, did nothing, waited for a while, the guest panicked. [ 28.053004] BUG: soft lockup - CPU#1 stuck for 23s! [ip:592] [ 28.053004] Modules linked in: ip6t_REJECT nf_conntrack_ipv6 nf_defrag_ipv6 xt_state nf_conntrack ip6table_filter ip6_tables uinput joydev microcode virtio_balloon pcspkr virtio_net i2c_piix4 i2c_core virtio_scsi virtio_blk floppy [ 28.053004] CPU 1 [ 28.053004] Pid: 592, comm: ip Not tainted 3.8.0-rc1-net+ #3 Bochs Bochs [ 28.053004] RIP: 0010:[8137a9ab] [8137a9ab] virtqueue_get_buf+0xb/0x120 [ 28.053004] RSP: 0018:8800bc913550 EFLAGS: 0246 [ 28.053004] RAX: RBX: 8800bc49c000 RCX: 8800bc49e000 [ 28.053004] RDX: RSI: 8800bc913584 RDI: 8800bcfd4000 [ 28.053004] RBP: 8800bc913558 R08: 8800bcfd0800 R09: [ 28.053004] R10: 8800bc49c000 R11: 880036cc4de0 R12: 8800bcfd4000 [ 28.053004] R13: 8800bc913558 R14: 8137ad73 R15: 000200d0 [ 28.053004] FS: 7fb27a589740() GS:8800c148() knlGS: [ 28.053004] CS: 0010 DS: ES: CR0: 80050033 [ 28.053004] CR2: 00640530 CR3: baeff000 CR4: 06e0 [ 28.053004] DR0: DR1: DR2: [ 28.053004] DR3: DR6: 0ff0 DR7: 0400 [ 28.053004] Process ip (pid: 592, threadinfo 8800bc912000, task 880036da2e20) [ 28.053004] Stack: [ 28.053004] 8800bcfd0800 8800bc913638 a003e9bb 8800bc913656 [ 28.053004] 00010002 8800c17ebb08 0050ff10 ea0002f244c0 [ 28.053004] 00020582 ea0002f244c0 [
Re: [Qemu-devel] [PATCH 00/12] Multiqueue virtio-net
On 01/10/2013 04:44 PM, Stefan Hajnoczi wrote: On Wed, Jan 09, 2013 at 11:33:25PM +0800, Jason Wang wrote: On 01/09/2013 11:32 PM, Michael S. Tsirkin wrote: On Wed, Jan 09, 2013 at 03:29:24PM +0100, Stefan Hajnoczi wrote: On Fri, Dec 28, 2012 at 06:31:52PM +0800, Jason Wang wrote: Perf Numbers: Two Intel Xeon 5620 with direct connected intel 82599EB Host/Guest kernel: David net tree vhost enabled - lots of improvents of both latency and cpu utilization in request-reponse test - get regression of guest sending small packets which because TCP tends to batch less when the latency were improved 1q/2q/4q TCP_RR size #sessions trans.rate norm trans.rate norm trans.rate norm 1 1 9393.26 595.64 9408.18 597.34 9375.19 584.12 1 2072162.1 2214.24 129880.22 2456.13 196949.81 2298.13 1 50107513.38 2653.99 139721.93 2490.58 259713.82 2873.57 1 100 126734.63 2676.54 145553.5 2406.63 265252.68 2943 64 19453.42 632.33 9371.37 616.13 9338.19 615.97 64 20 70620.03 2093.68 125155.75 2409.15 191239.91 2253.32 64 50 1069662448.29 146518.67 2514.47 242134.07 2720.91 64 100 117046.35 2394.56 190153.09 2696.82 238881.29 2704.41 256 1 8733.29 736.36 8701.07 680.83 8608.92 530.1 256 20 69279.89 2274.45 115103.07 2299.76 144555.16 1963.53 256 50 97676.02 2296.09 150719.57 2522.92 254510.5 3028.44 256 100 150221.55 2949.56 197569.3 2790.92 300695.78 3494.83 TCP_CRR size #sessions trans.rate norm trans.rate norm trans.rate norm 1 1 2848.37 163.41 2230.39 130.89 2013.09 120.47 1 2023434.5 562.11 31057.43 531.07 49488.28 564.41 1 5028514.88 582.17 40494.23 605.92 60113.35 654.97 1 100 28827.22 584.73 48813.25 661.6 61783.62 676.56 64 12780.08 159.4 2201.07 127.96 2006.8 117.63 64 20 23318.51 564.47 30982.44 530.24 49734.95 566.13 64 50 28585.72 582.54 40576.7 610.08 60167.89 656.56 64 100 28747.37 584.17 49081.87 667.87 60612.94 662 256 1 2772.08 160.51 2231.84 131.05 2003.62 113.45 256 20 23086.35 559.8 30929.09 528.16 48454.9 555.22 256 50 28354.7 579.85 40578.31 60760261.71 657.87 256 100 28844.55 585.67 48541.86 659.08 61941.07 676.72 TCP_STREAM guest receiving size #sessions throughput norm throughput norm throughput norm 1 1 16.27 1.33 16.11.12 16.13 0.99 1 2 33.04 2.08 32.96 2.19 32.75 1.98 1 4 66.62 6.83 68.35.56 66.14 2.65 64 1896.55 56.67 914.02 58.14 898.9 61.56 64 21830.46 91.02 1812.02 64.59 1835.57 66.26 64 43626.61 142.55 3636.25 100.64 3607.46 75.03 256 1 2619.49 131.23 2543.19 129.03 2618.69 132.39 256 2 5136.58 203.02 5163.31 141.11 5236.51 149.4 256 4 7063.99 242.83 9365.4 208.49 9421.03 159.94 512 1 3592.43 165.24 3603.12 167.19 3552.5 169.57 512 2 7042.62 246.59 7068.46 180.87 7258.52 186.3 512 4 6996.08 241.49 9298.34 206.12 9418.52 159.33 1024 1 4339.54 192.95 4370.2 191.92 4211.72 192.49 1024 2 7439.45 254.77 9403.99 215.24 9120.82 222.67 1024 4 7953.86 272.11 9403.87 208.23 9366.98 159.49 4096 1 7696.28 272.04 7611.41 270.38 7778.71 267.76 4096 2 7530.35 261.1 8905.43 246.27 8990.18 267.57 4096 4 7121.6 247.02 9411.75 206.71 9654.96 184.67 16384 1 7795.73 268.54 7780.94 267.2 7634.26 260.73 16384 2 7436.57 255.81 9381.86 220.85 9392220.36 16384 4 7199.07 247.81 9420.96 205.87 9373.69 159.57 TCP_MAERTS guest sending size #sessions throughput norm throughput norm throughput norm 1 1 15.94 0.62 15.55 0.61 15.13 0.59 1 2 36.11 0.83 32.46 0.69 32.28 0.69 1 4 71.59 1 68.91 0.94 61.52 0.77 64 1630.71 22.52 622.11 22.35 605.09 21.84 64 21442.36 30.57 1292.15 25.82 1282.67 25.55 64 43186.79 42.59 2844.96 36.03 2529.69 30.06 256 1 1760.96 58.07 1738.44 57.43 1695.99 56.19 256 2 4834.23 95.19 3524.85 64.21 3511.94 64.45 256 4 9324.63 145.74 8956.49 116.39 6720.17 73.86 512 1 2678.03 84.1 2630.68 82.93 2636.54 82.57 512 2 9368.17 195.61 9408.82 204.53 5316.3 92.99 512 4 9186.34 209.68 9358.72 183.82 9489.29 160.42 1024 1 3620.71 109.88 3625.54 109.83 3606.61 112.35 1024 2 9429258.32 7082.79 120.55 7403.53 134.78 1024 4 9430.66 290.44 9499.29 232.31 9414.6 190.92 4096 1 9339.28 296.48 9374.23 372.88 9348.76 298.49 4096 2 9410.53 378.69 9412.61 286.18 9409.75 278.31 4096 4 9487.35 374.1 9556.91 288.81 9441.94 221.64 16384 1 9380.43 403.8 9379.78 399.13 9382.42 393.55 16384 2 9367.69 406.93 9415.04 312.68 9409.29 300.9 16384 4 9391.96 405.17 9695.12 310.54 9423.76 223.47 Trying to understand the performance results: What is the host device configuration? tap + bridge? Yes. Did you use host CPU affinity for the vhost threads? I use numactl to pin cpu threads and vhost threads in the same numa node. Can multiqueue tap take advantage of multiqueue host NICs or is virtio-net multiqueue unaware of the physical NIC
Re: [Qemu-devel] [PATCH 10/12] virtio-net: multiqueue support
On 01/10/2013 05:06 PM, Wanlong Gao wrote: On 01/10/2013 03:16 PM, Jason Wang wrote: On Thursday, January 10, 2013 02:49:14 PM Wanlong Gao wrote: On 01/10/2013 02:43 PM, Jason Wang wrote: On Wednesday, January 09, 2013 11:26:33 PM Jason Wang wrote: On 01/09/2013 06:01 PM, Wanlong Gao wrote: On 01/09/2013 05:30 PM, Jason Wang wrote: On 01/09/2013 04:23 PM, Wanlong Gao wrote: On 01/08/2013 06:14 PM, Jason Wang wrote: On 01/08/2013 06:00 PM, Wanlong Gao wrote: On 01/08/2013 05:51 PM, Jason Wang wrote: On 01/08/2013 05:49 PM, Wanlong Gao wrote: On 01/08/2013 05:29 PM, Jason Wang wrote: On 01/08/2013 05:07 PM, Wanlong Gao wrote: On 12/28/2012 06:32 PM, Jason Wang wrote: +} else if (nc-peer-info-type != NET_CLIENT_OPTIONS_KIND_TAP) { +ret = -1; +} else { +ret = tap_detach(nc-peer); +} + +return ret; +} + [...] I got guest kernel panic when using this way and set queues=4. Does it happens w/o or w/ a fd parameter? What's the qemu command line? Did you meet it during boot time? The QEMU command line is /work/git/qemu/x86_64-softmmu/qemu-system-x86_64 -name f17 -M pc-0.15 -enable-kvm -m 3096 \ -smp 4,sockets=4,cores=1,threads=1 \ -uuid c31a9f3e-4161-c53a-339c-5dc36d0497cb -no-user-config -nodefaults \ -chardev socket,id=charmonitor,path=/var/lib/libvirt/qemu/f17.monitor,server,nowa i t \ -mon chardev=charmonitor,id=monitor,mode=control \ -rtc base=utc -no-shutdown \ -device piix3-usb-uhci,id=usb,bus=pci.0,addr=0x1.0x2 \ -device virtio-scsi-pci,id=scsi0,bus=pci.0,addr=0xb,num_queues=4,hotplug=on \ -device virtio-serial-pci,id=virtio-serial0,bus=pci.0,addr=0x5 \ -drive file=/vm/f17.img,if=none,id=drive-virtio-disk0,format=qcow2 \ -device virtio-blk-pci,scsi=off,bus=pci.0,addr=0x6,drive=drive-virtio-disk0,id=v i rtio-disk0,bootindex=1 \ -drive file=/vm2/f17-kernel.img,if=none,id=drive-virtio-disk1,format=qcow2 \ -device virtio-blk-pci,scsi=off,bus=pci.0,addr=0x8,drive=drive-virtio-disk1,id=v i rtio-disk1 \ -drive file=/vm/virtio-scsi/scsi3.img,if=none,id=drive-scsi0-0-2-0,format=raw \ -device scsi-hd,bus=scsi0.0,channel=0,scsi-id=0,lun=2,drive=drive-scsi0-0-2-0,id = scsi0-0-2-0,removable=on \ -drive file=/vm/virtio-scsi/scsi4.img,if=none,id=drive-scsi0-0-3-0,format=raw \ -device scsi-hd,bus=scsi0.0,channel=0,scsi-id=0,lun=3,drive=drive-scsi0-0-3-0,id = scsi0-0-3-0 \ -drive file=/vm/virtio-scsi/scsi1.img,if=none,id=drive-scsi0-0-0-0,format=raw \ -device scsi-hd,bus=scsi0.0,channel=0,scsi-id=0,lun=0,drive=drive-scsi0-0-0-0,id = scsi0-0-0-0 \ -drive file=/vm/virtio-scsi/scsi2.img,if=none,id=drive-scsi0-0-1-0,format=raw \ -device scsi-hd,bus=scsi0.0,channel=0,scsi-id=0,lun=1,drive=drive-scsi0-0-1-0,id = scsi0-0-1-0 \ -chardev pty,id=charserial0 -device isa-serial,chardev=charserial0,id=serial0 \ -chardev file,id=charserial1,path=/vm/f17.log \ -device isa-serial,chardev=charserial1,id=serial1 \ -device usb-tablet,id=input0 -vga std \ -device virtio-balloon-pci,id=balloon0,bus=pci.0,addr=0x7 \ -netdev tap,id=hostnet0,vhost=on,queues=4 \ -device virtio-net-pci,netdev=hostnet0,id=net0,mac=52:54:00:ce:7b:29,bus=pci.0,a d dr=0x3 \ -monitor stdio I got panic just after booting the system, did nothing, waited for a while, the guest panicked. [ 28.053004] BUG: soft lockup - CPU#1 stuck for 23s! [ip:592] [ 28.053004] Modules linked in: ip6t_REJECT nf_conntrack_ipv6 nf_defrag_ipv6 xt_state nf_conntrack ip6table_filter ip6_tables uinput joydev microcode virtio_balloon pcspkr virtio_net i2c_piix4 i2c_core virtio_scsi virtio_blk floppy [ 28.053004] CPU 1 [ 28.053004] Pid: 592, comm: ip Not tainted 3.8.0-rc1-net+ #3 Bochs Bochs [ 28.053004] RIP: 0010:[8137a9ab] [8137a9ab] virtqueue_get_buf+0xb/0x120 [ 28.053004] RSP: 0018:8800bc913550 EFLAGS: 0246 [ 28.053004] RAX: RBX: 8800bc49c000 RCX: 8800bc49e000 [ 28.053004] RDX: RSI: 8800bc913584 RDI: 8800bcfd4000 [ 28.053004] RBP: 8800bc913558 R08: 8800bcfd0800 R09: [ 28.053004] R10: 8800bc49c000 R11: 880036cc4de0 R12: 8800bcfd4000 [ 28.053004] R13: 8800bc913558 R14: 8137ad73 R15: 000200d0 [ 28.053004] FS: 7fb27a589740() GS:8800c148() knlGS: [ 28.053004] CS: 0010 DS: ES: CR0: 80050033 [ 28.053004] CR2: 00640530 CR3: baeff000 CR4: 06e0 [ 28.053004] DR0: DR1: DR2: [ 28.053004] DR3: DR6: 0ff0 DR7: 0400 [ 28.053004] Process ip (pid: 592, threadinfo 8800bc912000, task 880036da2e20) [ 28.053004] Stack: [ 28.053004] 8800bcfd0800 8800bc913638 a003e9bb 8800bc913656 [ 28.053004] 00010002 8800c17ebb08 0050ff10 ea0002f244c0 [ 28.053004] 00020582
Re: [okeanos-dev] Re: KVM versions, machine types and failed migrations
On Wed, Jan 09, 2013 at 03:27:53PM +0200, Vangelis Koukis wrote: On Wed, Jan 09, 2013 at 01:10:45pm +, Daniel P. Berrange wrote: When doing migration, the fundamental requirement is that the guest OS visible machine ABI must not change. Thus there are three key things to take care of when launching QEMU on the migration target host. - The device PCI/USB addresses must be identical to the source - The machine type must be identical to the source - The CPU model must be identical to the source Thanks for the detailed list of requirements, we'll take it into account for the relevant Ganeti patch. If you don't follow those requirements, either QEMU or the guest OS or both will crash burn during migration you get to keep both pieces :-) My point is, are these requirements left up to the caller of kvm -incoming to satisfy? Since the migration will most probably break, wouldn't it be best for QEMU to detect this and complain loudly, instead of continuing with the migration, failing silently and destroying the VM? Sure there could be some yes, do it, I know it is going to break option, which will make QEMU proceed with the migration. However, in 99% of the cases this is just user error, e.g. the user has upgraded the version on the other end and has not specified -M explicitly. It would be best if QEMU was able to detect and warn the user about what is going to happen, because it does lead to the VM dying. What you describe is certainly desirable, but it is quite hard to achieve with current QEMU. Much of the work with moving to the new QEMU object model configuration descriptions has been motivated by a desire to enable improvements migration handling. As you suggest, the goal is that the source QEMU be able to send a complete reliable hardware description to the destination QEMU during migration.It is getting closer, but we're not there yet. Regards, Daniel -- |: http://berrange.com -o-http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :| -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [kvmarm] [PATCH v5.1 0/2] KVM: ARM: Rename KVM_SET_DEVICE_ADDRESS
On 9 January 2013 22:30, Alexander Graf ag...@suse.de wrote: In fact, in this particular case one could merge all of the patches except for the particular ioctl implementation and just give the respective addresses default values until there is an API to set them, similar to how we did things with PVR in the beginning. That's what we started with and we got review comments saying yuck, don't hard code this... -- PMM -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 01/12] tap: multiqueue support
On Fri, Dec 28, 2012 at 06:31:53PM +0800, Jason Wang wrote: Mainly suggestions to make the code easier to understand, but see the comment about the 1:1 queue/NetClientState model for a general issue with this approach. Recently, linux support multiqueue tap which could let userspace call TUNSETIFF for a signle device many times to create multiple file descriptors as s/signle/single/ (Noting these if you respin.) independent queues. User could also enable/disabe a specific queue through s/disabe/disable/ TUNSETQUEUE. The patch adds the generic infrastructure to create multiqueue taps. To achieve this a new parameter queues were introduced to specify how many queues were expected to be created for tap. The fd parameter were also changed to support a list of file descriptors which could be used by management (such as libvirt) to pass pre-created file descriptors (queues) to qemu. Each TAPState were still associated to a tap fd, which mean multiple TAPStates were created when user needs multiqueue taps. Only linux part were implemented now, since it's the only OS that support multiqueue tap. Signed-off-by: Jason Wang jasow...@redhat.com --- net/tap-aix.c | 18 - net/tap-bsd.c | 18 - net/tap-haiku.c | 18 - net/tap-linux.c | 70 +++- net/tap-linux.h |4 + net/tap-solaris.c | 18 - net/tap-win32.c | 10 ++ net/tap.c | 248 + net/tap.h |8 ++- qapi-schema.json |5 +- 10 files changed, 335 insertions(+), 82 deletions(-) This patch should be split up: 1. linux-headers: import linux/if_tun.h multiqueue constants 2. tap: add Linux multiqueue support (tap_open(), tap_fd_attach(), tap_fd_detach()) 3. tap: queue attach/detach (tap_attach(), tap_detach()) 4. tap: split out net_init_one_tap() function (pure code motion, to make later diffs easy to review) 5. tap: add queues and multi-fd options (net_init_tap()/net_init_one_tap() changes) Each commit description can explain how this works in more detail. I think I've figured it out now but it would have helped to separate things out from the start. diff --git a/net/tap-aix.c b/net/tap-aix.c index f27c177..f931ef3 100644 --- a/net/tap-aix.c +++ b/net/tap-aix.c @@ -25,7 +25,8 @@ #include net/tap.h #include stdio.h -int tap_open(char *ifname, int ifname_size, int *vnet_hdr, int vnet_hdr_required) +int tap_open(char *ifname, int ifname_size, int *vnet_hdr, + int vnet_hdr_required, int mq_required) { fprintf(stderr, no tap on AIX\n); return -1; @@ -59,3 +60,18 @@ void tap_fd_set_offload(int fd, int csum, int tso4, int tso6, int ecn, int ufo) { } + +int tap_fd_attach(int fd) +{ +return -1; +} + +int tap_fd_detach(int fd) +{ +return -1; +} + +int tap_fd_ifname(int fd, char *ifname) +{ +return -1; +} diff --git a/net/tap-bsd.c b/net/tap-bsd.c index a3b717d..07c287d 100644 --- a/net/tap-bsd.c +++ b/net/tap-bsd.c @@ -33,7 +33,8 @@ #include net/if_tap.h #endif -int tap_open(char *ifname, int ifname_size, int *vnet_hdr, int vnet_hdr_required) +int tap_open(char *ifname, int ifname_size, int *vnet_hdr, + int vnet_hdr_required, int mq_required) { int fd; #ifdef TAPGIFNAME @@ -145,3 +146,18 @@ void tap_fd_set_offload(int fd, int csum, int tso4, int tso6, int ecn, int ufo) { } + +int tap_fd_attach(int fd) +{ +return -1; +} + +int tap_fd_detach(int fd) +{ +return -1; +} + +int tap_fd_ifname(int fd, char *ifname) +{ +return -1; +} diff --git a/net/tap-haiku.c b/net/tap-haiku.c index 34739d1..62ab423 100644 --- a/net/tap-haiku.c +++ b/net/tap-haiku.c @@ -25,7 +25,8 @@ #include net/tap.h #include stdio.h -int tap_open(char *ifname, int ifname_size, int *vnet_hdr, int vnet_hdr_required) +int tap_open(char *ifname, int ifname_size, int *vnet_hdr, + int vnet_hdr_required, int mq_required) { fprintf(stderr, no tap on Haiku\n); return -1; @@ -59,3 +60,18 @@ void tap_fd_set_offload(int fd, int csum, int tso4, int tso6, int ecn, int ufo) { } + +int tap_fd_attach(int fd) +{ +return -1; +} + +int tap_fd_detach(int fd) +{ +return -1; +} + +int tap_fd_ifname(int fd, char *ifname) +{ +return -1; +} diff --git a/net/tap-linux.c b/net/tap-linux.c index c6521be..0854ef5 100644 --- a/net/tap-linux.c +++ b/net/tap-linux.c @@ -35,7 +35,8 @@ #define PATH_NET_TUN /dev/net/tun -int tap_open(char *ifname, int ifname_size, int *vnet_hdr, int vnet_hdr_required) +int tap_open(char *ifname, int ifname_size, int *vnet_hdr, + int vnet_hdr_required, int mq_required) { struct ifreq ifr; int fd, ret; @@ -67,6 +68,20 @@ int tap_open(char *ifname, int ifname_size, int *vnet_hdr, int
Re: [kvmarm] [PATCH v5.1 0/2] KVM: ARM: Rename KVM_SET_DEVICE_ADDRESS
Am 10.01.2013 um 11:17 schrieb Peter Maydell peter.mayd...@linaro.org: On 9 January 2013 22:30, Alexander Graf ag...@suse.de wrote: In fact, in this particular case one could merge all of the patches except for the particular ioctl implementation and just give the respective addresses default values until there is an API to set them, similar to how we did things with PVR in the beginning. That's what we started with and we got review comments saying yuck, don't hard code this... And you shouldn't in the long run. But as an intermediate step, it's the best solution IMHO. Either way, let's just finally get something in that doesn't clutter the global namespace :). Alex -- PMM -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [kvmarm] [PATCH v5.1 0/2] KVM: ARM: Rename KVM_SET_DEVICE_ADDRESS
On 09.01.2013, at 23:34, Alexander Graf wrote: On 09.01.2013, at 23:26, Christoffer Dall wrote: On Wed, Jan 9, 2013 at 5:10 PM, Scott Wood scottw...@freescale.com wrote: On 01/09/2013 03:37:20 PM, Alexander Graf wrote: Am 09.01.2013 um 22:15 schrieb Scott Wood scottw...@freescale.com: I get that there's a tradeoff between getting something in now, versus waiting until the API is more refined. Tagging it with a particular ISA seems like an odd way of saying soon to be deprecated, though. What happens if we're still squabbling over the perfect replacement API when we're trying to push PPC MPIC stuff in? Then we're the ones who have to come up with a good interface. How about another bad one, with PPC in the name, and some pleas to hurry things up? :-) It's not as if there haven't been last-minute requests for API changes on the PPC side in the past... This is getting out of hand. Do you have another API for PPC, which was send for review and not commented on several months ago that we can unify right now? If not, let's go with the ARM name and work on the generic API in the mean time. The end result will be something along 5 lines in a header files and 3 lines in a switch case that return -EINVAL if the interface is completely deprecated later on, which is not a big problem. Agreed [1]. So what exactly are we waiting for? Acks from kvm maintainers, right? In fact, we should probably CC them :) Alex -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [kvmarm] [PATCH v5.1 0/2] KVM: ARM: Rename KVM_SET_DEVICE_ADDRESS
On Thu, Jan 10, 2013 at 12:15:55PM +0100, Alexander Graf wrote: On 09.01.2013, at 23:34, Alexander Graf wrote: On 09.01.2013, at 23:26, Christoffer Dall wrote: On Wed, Jan 9, 2013 at 5:10 PM, Scott Wood scottw...@freescale.com wrote: On 01/09/2013 03:37:20 PM, Alexander Graf wrote: Am 09.01.2013 um 22:15 schrieb Scott Wood scottw...@freescale.com: I get that there's a tradeoff between getting something in now, versus waiting until the API is more refined. Tagging it with a particular ISA seems like an odd way of saying soon to be deprecated, though. What happens if we're still squabbling over the perfect replacement API when we're trying to push PPC MPIC stuff in? Then we're the ones who have to come up with a good interface. How about another bad one, with PPC in the name, and some pleas to hurry things up? :-) It's not as if there haven't been last-minute requests for API changes on the PPC side in the past... This is getting out of hand. Do you have another API for PPC, which was send for review and not commented on several months ago that we can unify right now? If not, let's go with the ARM name and work on the generic API in the mean time. The end result will be something along 5 lines in a header files and 3 lines in a switch case that return -EINVAL if the interface is completely deprecated later on, which is not a big problem. Agreed [1]. So what exactly are we waiting for? Acks from kvm maintainers, right? In fact, we should probably CC them :) I am looking at them right now :) Give me a couple of days please. -- Gleb. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC PATCH 0/9] KVM: PPC: In-kernel PAPR interrupt controller emulation
On 10.01.2013, at 06:09, Paul Mackerras wrote: On Sat, Dec 15, 2012 at 01:06:13PM +1100, Benjamin Herrenschmidt wrote: On Sat, 2012-12-15 at 01:46 +0100, Alexander Graf wrote: On 05.11.2012, at 04:18, Paul Mackerras wrote: This series implements in-kernel emulation of the XICS interrupt controller specified in the PAPR specification and used by pseries guests. Since the XICS is organized a bit differently to the interrupt controllers used on x86 machines, I have defined some new ioctls for setting up the XICS and for saving and restoring its state. It may be possible to use some of the currently x86-specific ioctls instead. Is this one already following the new world order that we discussed during KVM Forum? :) The new world order (sorry, looks like nobody took notes and people expect me to do a write up from memory now ... :-) Well, mostly we agreed that the x86 stuff wasn't going to work for us. So basically what we discussed boils down to: - Move the existing generic KVM ioctl to create the kernel APIC to x86 since it's no as-is useful for other archs who, among other things, might need to pass argument based on the machine type (type of interrupt subsystem for example, etc...) Assuming you're talking about KVM_CREATE_IRQCHIP, it is already handled entirely in arch code (arch/x86/kvm/x86.c and arch/ia64/kvm/kvm-ia64.c), along with KVM_GET_IRQCHIP and KVM_SET_IRQCHIP. This part is about QEMU. - Once that's done, well, instanciating interrupt controller objects becomes pretty much an arch specific matter. We could try to keep some ioctls somewhat common though it's not *that* useful because the various types arguments are going to be fairly arch specific, so goes for configuration. Examples of what could be kept common are: * Create irq chip, takes at least a type argument, possibly a few more type-specific args (or a union, but then let's keep space in there since we can't change the size of the struct later as it would impact the ioctl number afaik). The existing KVM_CREATE_IRQCHIP is defined as _IO(...) which means that it doesn't read or write memory, but there is still the 3rd argument to the ioctl, which would give us 64 bits to indicate the type of the top-level IRQ controller (XICS, MPIC, ...), but not much more. It sounds like the agreement at the meeting was more along the lines of the KVM_CREATE_IRQCHIP_ARGS ioctl (as in the patches I posted) which can be called multiple times to instantiate pieces of the interrupt framework, rather than having a KVM_CREATE_IRQCHIP that gets called once early on to say that there we are using in-kernel interrupt controller emulation, followed by other calls to configure the various parts of the framework. Is that accurate? KVM_CREATE_IRQCHIP should get a parameter indicating the type of IRQCHIP (XICS / MPIC / APIC / VGIC / ...). The parameter-less version defaults to an arch specific default (APIC for x86, VGIC for arm, error on PPC). I'm not 100% sure yet whether we want to support models with multiple IRQCHIPs. If so, we need to return a device token from the KVM_CREATE_IRQCHIP ioctl. Maybe it makes more sense to model specific cases like this as separate type though with specific, explicit subdevice ids. Not sure. Other instantiation attributes we don't know that early on should be settable between the time frame of vm creation and first execution. An example for this are device addresses. Check out the threads KVM: ARM: Rename KVM_SET_DEVICE_ADDRESS and KVM: ARM: Introduce KVM_SET_DEVICE_ADDRESS ioctl for more information on this particular bit. Alex * Assigning the address of an irq chip when it can change (see ARM patches) - The existing business with irqfd etc... is mostly ok, except the interfaces messing around with MSIs (see virtio-pci use of kvm functions directly). The assignment of an irq number for an MSI must be a hook, probably a PCI controller hook, so x86 can get it done via its existing kernel interfaces and sane architectures can keep the assignment in qemu where it belongs. So, if I've understood you correctly about what was agreed, the set of ioctls that is implemented in the patches I posted is in line with what was agreed, isn't it? If not, where does it differ? (To recap, I had KVM_CREATE_IRQCHIP_ARGS, KVM_IRQCHIP_GET_SOURCES and KVM_IRQCHIP_SET_SOURCES, plus a one-reg interface to get/set the vcpu-specific state.) Paul. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC 02/12] target-i386: Don't set any KVM flag by default if KVM is disabled
On Wed, Jan 09, 2013 at 04:53:42PM -0200, Eduardo Habkost wrote: This is a cleanup that tries to solve two small issues: - We don't need a separate kvm_pv_eoi_features variable just to keep a constant calculated at compile-time, and this style would require adding a separate variable (that's declared twice because of the CONFIG_KVM ifdef) for each feature that's going to be enabled/disable by machine-type compat code. - The pc-1.3 code is setting the kvm_pv_eoi flag on cpuid_kvm_features even when KVM is disabled at runtime. This small incosistency in the cpuid_kvm_features field isn't a problem today because cpuid_kvm_features is ignored by the TCG code, but it may cause unexpected problems later when refactoring the CPUID handling code. This patch eliminates the kvm_pv_eoi_features variable and simply uses kvm_enabled() inside the enable_kvm_pv_eoi() compat function, so it enables kvm_pv_eoi only if KVM is enabled. I believe this makes the behavior of enable_kvm_pv_eoi() clearer and easier to understand. Signed-off-by: Eduardo Habkost ehabk...@redhat.com Acked-by: Michael S. Tsirkin m...@redhat.com --- Cc: kvm@vger.kernel.org Cc: Michael S. Tsirkin m...@redhat.com Cc: Gleb Natapov g...@redhat.com Cc: Marcelo Tosatti mtosa...@redhat.com Changes v2: - Coding style fix Changes v3: - Eliminate #ifdef by using the fake KVM_FEATURE_PV_EOI #define Changes v4: - Check kvm_enabled() when actually using kvm_default_features - Eliminate Yet Another #ifdef by using the fake KVM_FEATURE_* #defines on kvm_default_features initialization --- target-i386/cpu.c | 16 ++-- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/target-i386/cpu.c b/target-i386/cpu.c index e4dc370..57a22b7 100644 --- a/target-i386/cpu.c +++ b/target-i386/cpu.c @@ -206,22 +206,16 @@ typedef struct model_features_t { int check_cpuid = 0; int enforce_cpuid = 0; -#if defined(CONFIG_KVM) static uint32_t kvm_default_features = (1 KVM_FEATURE_CLOCKSOURCE) | (1 KVM_FEATURE_NOP_IO_DELAY) | (1 KVM_FEATURE_CLOCKSOURCE2) | (1 KVM_FEATURE_ASYNC_PF) | (1 KVM_FEATURE_STEAL_TIME) | (1 KVM_FEATURE_CLOCKSOURCE_STABLE_BIT); -static const uint32_t kvm_pv_eoi_features = (0x1 KVM_FEATURE_PV_EOI); -#else -static uint32_t kvm_default_features = 0; -static const uint32_t kvm_pv_eoi_features = 0; -#endif void enable_kvm_pv_eoi(void) { -kvm_default_features |= kvm_pv_eoi_features; +kvm_default_features |= (1UL KVM_FEATURE_PV_EOI); } void host_cpuid(uint32_t function, uint32_t count, @@ -1343,13 +1337,15 @@ static int cpu_x86_parse_featurestr(x86_def_t *x86_cpu_def, char *features) unsigned int i; char *featurestr; /* Single 'key=value string being parsed */ /* Features to be added */ -FeatureWordArray plus_features = { -[FEAT_KVM] = kvm_default_features, -}; +FeatureWordArray plus_features = { 0 }; /* Features to be removed */ FeatureWordArray minus_features = { 0 }; uint32_t numvalue; +if (kvm_enabled()) { +plus_features[FEAT_KVM] = kvm_default_features; +} + add_flagname_to_bitmaps(hypervisor, plus_features); featurestr = features ? strtok(features, ,) : NULL; -- 1.7.11.7 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] [PATCH 00/12] Multiqueue virtio-net
On Thu, Jan 10, 2013 at 05:34:14PM +0800, Jason Wang wrote: On 01/10/2013 04:44 PM, Stefan Hajnoczi wrote: On Wed, Jan 09, 2013 at 11:33:25PM +0800, Jason Wang wrote: On 01/09/2013 11:32 PM, Michael S. Tsirkin wrote: On Wed, Jan 09, 2013 at 03:29:24PM +0100, Stefan Hajnoczi wrote: On Fri, Dec 28, 2012 at 06:31:52PM +0800, Jason Wang wrote: Perf Numbers: Two Intel Xeon 5620 with direct connected intel 82599EB Host/Guest kernel: David net tree vhost enabled - lots of improvents of both latency and cpu utilization in request-reponse test - get regression of guest sending small packets which because TCP tends to batch less when the latency were improved 1q/2q/4q TCP_RR size #sessions trans.rate norm trans.rate norm trans.rate norm 1 1 9393.26 595.64 9408.18 597.34 9375.19 584.12 1 2072162.1 2214.24 129880.22 2456.13 196949.81 2298.13 1 50107513.38 2653.99 139721.93 2490.58 259713.82 2873.57 1 100 126734.63 2676.54 145553.5 2406.63 265252.68 2943 64 19453.42 632.33 9371.37 616.13 9338.19 615.97 64 20 70620.03 2093.68 125155.75 2409.15 191239.91 2253.32 64 50 1069662448.29 146518.67 2514.47 242134.07 2720.91 64 100 117046.35 2394.56 190153.09 2696.82 238881.29 2704.41 256 1 8733.29 736.36 8701.07 680.83 8608.92 530.1 256 20 69279.89 2274.45 115103.07 2299.76 144555.16 1963.53 256 50 97676.02 2296.09 150719.57 2522.92 254510.5 3028.44 256 100 150221.55 2949.56 197569.3 2790.92 300695.78 3494.83 TCP_CRR size #sessions trans.rate norm trans.rate norm trans.rate norm 1 1 2848.37 163.41 2230.39 130.89 2013.09 120.47 1 2023434.5 562.11 31057.43 531.07 49488.28 564.41 1 5028514.88 582.17 40494.23 605.92 60113.35 654.97 1 100 28827.22 584.73 48813.25 661.6 61783.62 676.56 64 12780.08 159.4 2201.07 127.96 2006.8 117.63 64 20 23318.51 564.47 30982.44 530.24 49734.95 566.13 64 50 28585.72 582.54 40576.7 610.08 60167.89 656.56 64 100 28747.37 584.17 49081.87 667.87 60612.94 662 256 1 2772.08 160.51 2231.84 131.05 2003.62 113.45 256 20 23086.35 559.8 30929.09 528.16 48454.9 555.22 256 50 28354.7 579.85 40578.31 60760261.71 657.87 256 100 28844.55 585.67 48541.86 659.08 61941.07 676.72 TCP_STREAM guest receiving size #sessions throughput norm throughput norm throughput norm 1 1 16.27 1.33 16.11.12 16.13 0.99 1 2 33.04 2.08 32.96 2.19 32.75 1.98 1 4 66.62 6.83 68.35.56 66.14 2.65 64 1896.55 56.67 914.02 58.14 898.9 61.56 64 21830.46 91.02 1812.02 64.59 1835.57 66.26 64 43626.61 142.55 3636.25 100.64 3607.46 75.03 256 1 2619.49 131.23 2543.19 129.03 2618.69 132.39 256 2 5136.58 203.02 5163.31 141.11 5236.51 149.4 256 4 7063.99 242.83 9365.4 208.49 9421.03 159.94 512 1 3592.43 165.24 3603.12 167.19 3552.5 169.57 512 2 7042.62 246.59 7068.46 180.87 7258.52 186.3 512 4 6996.08 241.49 9298.34 206.12 9418.52 159.33 1024 1 4339.54 192.95 4370.2 191.92 4211.72 192.49 1024 2 7439.45 254.77 9403.99 215.24 9120.82 222.67 1024 4 7953.86 272.11 9403.87 208.23 9366.98 159.49 4096 1 7696.28 272.04 7611.41 270.38 7778.71 267.76 4096 2 7530.35 261.1 8905.43 246.27 8990.18 267.57 4096 4 7121.6 247.02 9411.75 206.71 9654.96 184.67 16384 1 7795.73 268.54 7780.94 267.2 7634.26 260.73 16384 2 7436.57 255.81 9381.86 220.85 9392220.36 16384 4 7199.07 247.81 9420.96 205.87 9373.69 159.57 TCP_MAERTS guest sending size #sessions throughput norm throughput norm throughput norm 1 1 15.94 0.62 15.55 0.61 15.13 0.59 1 2 36.11 0.83 32.46 0.69 32.28 0.69 1 4 71.59 1 68.91 0.94 61.52 0.77 64 1630.71 22.52 622.11 22.35 605.09 21.84 64 21442.36 30.57 1292.15 25.82 1282.67 25.55 64 43186.79 42.59 2844.96 36.03 2529.69 30.06 256 1 1760.96 58.07 1738.44 57.43 1695.99 56.19 256 2 4834.23 95.19 3524.85 64.21 3511.94 64.45 256 4 9324.63 145.74 8956.49 116.39 6720.17 73.86 512 1 2678.03 84.1 2630.68 82.93 2636.54 82.57 512 2 9368.17 195.61 9408.82 204.53 5316.3 92.99 512 4 9186.34 209.68 9358.72 183.82 9489.29 160.42 1024 1 3620.71 109.88 3625.54 109.83 3606.61 112.35 1024 2 9429258.32 7082.79 120.55 7403.53 134.78 1024 4 9430.66 290.44 9499.29 232.31 9414.6 190.92 4096 1 9339.28 296.48 9374.23 372.88 9348.76 298.49 4096 2 9410.53 378.69 9412.61 286.18 9409.75 278.31 4096 4 9487.35 374.1 9556.91 288.81 9441.94 221.64 16384 1 9380.43 403.8 9379.78 399.13 9382.42 393.55 16384 2 9367.69 406.93 9415.04 312.68 9409.29 300.9 16384 4 9391.96 405.17 9695.12 310.54 9423.76 223.47 Trying to understand the performance results: What is the host device configuration? tap + bridge? Yes. Did you use host CPU affinity for the vhost threads? I use numactl to pin cpu threads
Re: [kvmarm] [PATCH v5.1 0/2] KVM: ARM: Rename KVM_SET_DEVICE_ADDRESS
On Thu, 10 Jan 2013 10:17:09 +, Peter Maydell peter.mayd...@linaro.org wrote: On 9 January 2013 22:30, Alexander Graf ag...@suse.de wrote: In fact, in this particular case one could merge all of the patches except for the particular ioctl implementation and just give the respective addresses default values until there is an API to set them, similar to how we did things with PVR in the beginning. That's what we started with and we got review comments saying yuck, don't hard code this... Even worse: hard-coding things breaks kvmtool (we do not emulate a vexpress and have a rather different memory map) and arm64 (the requirements on GIC mappings are different). So hard coding addresses is already unacceptable. M. -- Fast, cheap, reliable. Pick two. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH v9 2/3] x86, apicv: add virtual x2apic support
Gleb Natapov wrote on 2013-01-10: On Thu, Jan 10, 2013 at 08:32:06AM +, Zhang, Yang Z wrote: Gleb Natapov wrote on 2013-01-10: On Thu, Jan 10, 2013 at 03:26:07PM +0800, Yang Zhang wrote: From: Yang Zhang yang.z.zh...@intel.com basically to benefit from apicv, we need to enable virtualized x2apic mode. Currently, we only enable it when guest is really using x2apic. Also, clear MSR bitmap for corresponding x2apic MSRs when guest enabled x2apic: 0x800 - 0x8ff: no read intercept for apicv register virtualization, except APIC ID and TMCCT. APIC ID and TMCCT: need software's assistance to get right value. TPR,EOI,SELF-IPI: no write intercept for virtual interrupt delivery. Signed-off-by: Kevin Tian kevin.t...@intel.com Signed-off-by: Yang Zhang yang.z.zh...@intel.com --- arch/x86/include/asm/kvm_host.h |2 + arch/x86/include/asm/vmx.h |1 + arch/x86/kvm/lapic.c|5 +- arch/x86/kvm/svm.c |6 + arch/x86/kvm/vmx.c | 194 +-- 5 files changed, 200 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c431b33..572a562 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -697,6 +697,8 @@ struct kvm_x86_ops { void (*enable_nmi_window)(struct kvm_vcpu *vcpu); void (*enable_irq_window)(struct kvm_vcpu *vcpu); void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); + void (*enable_virtual_x2apic_mode)(struct kvm_vcpu *vcpu); + void (*disable_virtual_x2apic_mode)(struct kvm_vcpu *vcpu); Make one callback with enable/disable parameter. And do not forget SVM. int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); int (*get_tdp_level)(void); u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 44c3f7e..0a54df0 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -139,6 +139,7 @@ #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x0001 #define SECONDARY_EXEC_ENABLE_EPT 0x0002 #define SECONDARY_EXEC_RDTSCP 0x0008 +#define SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE 0x0010 #define SECONDARY_EXEC_ENABLE_VPID 0x0020 #define SECONDARY_EXEC_WBINVD_EXITING 0x0040 #define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x0080 diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 0664c13..ec38906 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1328,7 +1328,10 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) u32 id = kvm_apic_id(apic); u32 ldr = ((id 4) 16) | (1 (id 0xf)); kvm_apic_set_ldr(apic, ldr); - } + kvm_x86_ops-enable_virtual_x2apic_mode(vcpu); + } else + kvm_x86_ops-disable_virtual_x2apic_mode(vcpu); + You just broke SVM. apic-base_address = apic-vcpu-arch.apic_base MSR_IA32_APICBASE_BASE; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index d29d3cd..0b82cb1 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3571,6 +3571,11 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) set_cr_intercept(svm, INTERCEPT_CR8_WRITE); } +static void svm_enable_virtual_x2apic_mode(struct kvm_vcpu *vcpu) +{ + return; +} + static int svm_nmi_allowed(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -4290,6 +4295,7 @@ static struct kvm_x86_ops svm_x86_ops = { .enable_nmi_window = enable_nmi_window, .enable_irq_window = enable_irq_window, .update_cr8_intercept = update_cr8_intercept, + .enable_virtual_x2apic_mode = svm_enable_virtual_x2apic_mode, .set_tss_addr = svm_set_tss_addr, .get_tdp_level = get_npt_level, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 688f43f..b203ce7 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -433,6 +433,8 @@ struct vcpu_vmx { bool rdtscp_enabled; + bool virtual_x2apic_enabled; + /* Support for a guest hypervisor (nested VMX) */ struct nested_vmx nested; }; @@ -767,12 +769,23 @@ static inline bool cpu_has_vmx_virtualize_apic_accesses(void) SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; } +static inline bool cpu_has_vmx_virtualize_x2apic_mode(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; +} + static inline bool cpu_has_vmx_apic_register_virt(void) { return vmcs_config.cpu_based_2nd_exec_ctrl SECONDARY_EXEC_APIC_REGISTER_VIRT; } +static inline bool cpu_has_vmx_virtual_intr_delivery(void) +{ + return false; +} + static inline bool cpu_has_vmx_flexpriority(void) { return cpu_has_vmx_tpr_shadow() @@ -2544,6 +2557,7 @@
Re: [kvmarm] [PATCH v5.1 0/2] KVM: ARM: Rename KVM_SET_DEVICE_ADDRESS
On 10.01.2013, at 12:53, Marc Zyngier wrote: On Thu, 10 Jan 2013 10:17:09 +, Peter Maydell peter.mayd...@linaro.org wrote: On 9 January 2013 22:30, Alexander Graf ag...@suse.de wrote: In fact, in this particular case one could merge all of the patches except for the particular ioctl implementation and just give the respective addresses default values until there is an API to set them, similar to how we did things with PVR in the beginning. That's what we started with and we got review comments saying yuck, don't hard code this... Even worse: hard-coding things breaks kvmtool (we do not emulate a vexpress and have a rather different memory map) and arm64 (the requirements on GIC mappings are different). So hard coding addresses is already unacceptable. I see. The current approach with an intermediate ARM specific ioctl is certainly good with me then. And who knows - with a bit of luck we might be able to get a unified ioctl model for this before the code hits Linus' tree (read: APIs are really stable). Alex -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH v9 3/3] x86, apicv: add virtual interrupt delivery support
Gleb Natapov wrote on 2013-01-10: On Thu, Jan 10, 2013 at 03:26:08PM +0800, Yang Zhang wrote: From: Yang Zhang yang.z.zh...@intel.com Virtual interrupt delivery avoids KVM to inject vAPIC interrupts manually, which is fully taken care of by the hardware. This needs some special awareness into existing interrupr injection path: - for pending interrupt, instead of direct injection, we may need update architecture specific indicators before resuming to guest. - A pending interrupt, which is masked by ISR, should be also considered in above update action, since hardware will decide when to inject it at right time. Current has_interrupt and get_interrupt only returns a valid vector from injection p.o.v. Signed-off-by: Kevin Tian kevin.t...@intel.com Signed-off-by: Yang Zhang yang.z.zh...@intel.com --- arch/x86/include/asm/kvm_host.h |5 + arch/x86/include/asm/vmx.h | 11 +++ arch/x86/kvm/irq.c | 56 +++- arch/x86/kvm/lapic.c| 72 +-- arch/x86/kvm/lapic.h| 23 + arch/x86/kvm/svm.c | 18 arch/x86/kvm/vmx.c | 191 +-- arch/x86/kvm/x86.c | 14 +++- include/linux/kvm_host.h|3 + virt/kvm/ioapic.c | 18 virt/kvm/ioapic.h |4 + virt/kvm/irq_comm.c | 22 + virt/kvm/kvm_main.c |5 + 13 files changed, 399 insertions(+), 43 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 572a562..f471856 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -697,6 +697,10 @@ struct kvm_x86_ops { void (*enable_nmi_window)(struct kvm_vcpu *vcpu); void (*enable_irq_window)(struct kvm_vcpu *vcpu); void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); +int (*has_virtual_interrupt_delivery)(struct kvm_vcpu *vcpu); +void (*update_apic_irq)(struct kvm_vcpu *vcpu, int max_irr); +void (*update_eoi_exitmap)(struct kvm_vcpu *vcpu); +void (*set_svi)(int isr); void (*enable_virtual_x2apic_mode)(struct kvm_vcpu *vcpu); void (*disable_virtual_x2apic_mode)(struct kvm_vcpu *vcpu); int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); @@ -993,6 +997,7 @@ int kvm_age_hva(struct kvm *kvm, unsigned long hva); int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); int cpuid_maxphyaddr(struct kvm_vcpu *vcpu); +int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v); int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); int kvm_cpu_get_interrupt(struct kvm_vcpu *v); diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 0a54df0..694586c 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -62,6 +62,7 @@ #define EXIT_REASON_MCE_DURING_VMENTRY 41 #define EXIT_REASON_TPR_BELOW_THRESHOLD 43 #define EXIT_REASON_APIC_ACCESS 44 +#define EXIT_REASON_EOI_INDUCED 45 #define EXIT_REASON_EPT_VIOLATION 48 #define EXIT_REASON_EPT_MISCONFIG 49 #define EXIT_REASON_WBINVD 54 @@ -144,6 +145,7 @@ #define SECONDARY_EXEC_WBINVD_EXITING 0x0040 #define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x0080 #define SECONDARY_EXEC_APIC_REGISTER_VIRT 0x0100 +#define SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY0x0200 #define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x0400 #define SECONDARY_EXEC_ENABLE_INVPCID 0x1000 @@ -181,6 +183,7 @@ enum vmcs_field { GUEST_GS_SELECTOR = 0x080a, GUEST_LDTR_SELECTOR = 0x080c, GUEST_TR_SELECTOR = 0x080e, + GUEST_INTR_STATUS = 0x0810, HOST_ES_SELECTOR = 0x0c00, HOST_CS_SELECTOR= 0x0c02, HOST_SS_SELECTOR= 0x0c04, @@ -208,6 +211,14 @@ enum vmcs_field { APIC_ACCESS_ADDR_HIGH = 0x2015, EPT_POINTER = 0x201a, EPT_POINTER_HIGH= 0x201b, +EOI_EXIT_BITMAP0= 0x201c, +EOI_EXIT_BITMAP0_HIGH = 0x201d, +EOI_EXIT_BITMAP1= 0x201e, +EOI_EXIT_BITMAP1_HIGH = 0x201f, +EOI_EXIT_BITMAP2= 0x2020, +EOI_EXIT_BITMAP2_HIGH = 0x2021, +EOI_EXIT_BITMAP3= 0x2022, +EOI_EXIT_BITMAP3_HIGH = 0x2023, GUEST_PHYSICAL_ADDRESS = 0x2400, GUEST_PHYSICAL_ADDRESS_HIGH = 0x2401, VMCS_LINK_POINTER = 0x2800, diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index b111aee..e113440 100644 --- a/arch/x86/kvm/irq.c +++
Re: [PATCH v9 2/3] x86, apicv: add virtual x2apic support
On Thu, Jan 10, 2013 at 11:54:21AM +, Zhang, Yang Z wrote: The right logic should be: When register virtualization enabled, read all apic msr(except apic id reg and tmcct which have the hook) not cause vmexit and only write TPR not cause vmexit. When vid enabled, write to TPR, EOI and SELF IPI not cause vmexit. It's better to put the patch of envirtualize x2apic mode as first patch. There is no point whatsoever to enable virtualize x2apic without allowing at least one non intercepted access to x2apic MSR range and this is what your patch is doing when run on cpu without vid capability. No, at least read/write TPR will not cause vmexit if virtualize x2apic mode is enabled. For that you need to disable 808H MSR intercept, which your patch does not do. I am not sure whether I understand your comments right in previous discussion, here is my thinking: 1. enable virtualize x2apic mode if guest is really using x2apic and clear TPR in msr read and write bitmap. This will benefit reading TPR. 2. If APIC registers virtualization is enabled, clear all bit in rang 0x800-0x8ff(except apic id reg and tmcct). Clear all read bits in the range. 3. If vid is enabled, clear EOI and SELF IPI in msr write map. Looks OK. One concern you mentioned is that vid enabled and virtualize x2apic is disabled but guest still use x2apic. In this case, we still use MSR bitmap to intercept x2apic. This means the vEOI update will never happen. But we still can benefit from interrupt window. What interrupt windows? Without virtualized x2apic TPR/EOI virtualization will not happen and we rely on that in the code. -- Gleb. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH v9 2/3] x86, apicv: add virtual x2apic support
Gleb Natapov wrote on 2013-01-10: On Thu, Jan 10, 2013 at 11:54:21AM +, Zhang, Yang Z wrote: The right logic should be: When register virtualization enabled, read all apic msr(except apic id reg and tmcct which have the hook) not cause vmexit and only write TPR not cause vmexit. When vid enabled, write to TPR, EOI and SELF IPI not cause vmexit. It's better to put the patch of envirtualize x2apic mode as first patch. There is no point whatsoever to enable virtualize x2apic without allowing at least one non intercepted access to x2apic MSR range and this is what your patch is doing when run on cpu without vid capability. No, at least read/write TPR will not cause vmexit if virtualize x2apic mode is enabled. For that you need to disable 808H MSR intercept, which your patch does not do. I want to do this in next patch. I am not sure whether I understand your comments right in previous discussion, here is my thinking: 1. enable virtualize x2apic mode if guest is really using x2apic and clear TPR in msr read and write bitmap. This will benefit reading TPR. 2. If APIC registers virtualization is enabled, clear all bit in rang 0x800-0x8ff(except apic id reg and tmcct). Clear all read bits in the range. 3. If vid is enabled, clear EOI and SELF IPI in msr write map. Looks OK. One concern you mentioned is that vid enabled and virtualize x2apic is disabled but guest still use x2apic. In this case, we still use MSR bitmap to intercept x2apic. This means the vEOI update will never happen. But we still can benefit from interrupt window. What interrupt windows? Without virtualized x2apic TPR/EOI virtualization will not happen and we rely on that in the code. Yes, but we can eliminate vmexit of interrupt window. Interrupt still can delivery to guest without vmexit when guest enables interrupt if vid is enabled. See SDM vol 3, 29.2.2. Best regards, Yang -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v9 2/3] x86, apicv: add virtual x2apic support
On Thu, Jan 10, 2013 at 12:22:51PM +, Zhang, Yang Z wrote: Gleb Natapov wrote on 2013-01-10: On Thu, Jan 10, 2013 at 11:54:21AM +, Zhang, Yang Z wrote: The right logic should be: When register virtualization enabled, read all apic msr(except apic id reg and tmcct which have the hook) not cause vmexit and only write TPR not cause vmexit. When vid enabled, write to TPR, EOI and SELF IPI not cause vmexit. It's better to put the patch of envirtualize x2apic mode as first patch. There is no point whatsoever to enable virtualize x2apic without allowing at least one non intercepted access to x2apic MSR range and this is what your patch is doing when run on cpu without vid capability. No, at least read/write TPR will not cause vmexit if virtualize x2apic mode is enabled. For that you need to disable 808H MSR intercept, which your patch does not do. I want to do this in next patch. Then don't. There is no point in the patch that enables virtualize x2apic and does not allow at least one non-intercepted MSR access. I am not sure whether I understand your comments right in previous discussion, here is my thinking: 1. enable virtualize x2apic mode if guest is really using x2apic and clear TPR in msr read and write bitmap. This will benefit reading TPR. 2. If APIC registers virtualization is enabled, clear all bit in rang 0x800-0x8ff(except apic id reg and tmcct). Clear all read bits in the range. 3. If vid is enabled, clear EOI and SELF IPI in msr write map. Looks OK. One concern you mentioned is that vid enabled and virtualize x2apic is disabled but guest still use x2apic. In this case, we still use MSR bitmap to intercept x2apic. This means the vEOI update will never happen. But we still can benefit from interrupt window. What interrupt windows? Without virtualized x2apic TPR/EOI virtualization will not happen and we rely on that in the code. Yes, but we can eliminate vmexit of interrupt window. Interrupt still can delivery to guest without vmexit when guest enables interrupt if vid is enabled. See SDM vol 3, 29.2.2. What does it matter that you eliminated vmexit of interrupt window if you broke everything else? The vid patch assumes that apic emulation either entirely happens in a software when vid is disabled or in a CPU if vid is enabled. Mixed mode will not work (apic-isr_count = 1 trick will break things for instance). And it is not worth it to complicate the code to make it work. -- Gleb. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PULL 0/8] ppc patch queue 2013-01-10
Hi Marcelo / Gleb, This is my current patch queue for ppc. Please pull. Highlights this time: - Book3S: enable potential sPAPR guest emulation on PR KVM on pHyp - BookE: EPR (External Proxy Register) support Alex The following changes since commit 908e7d7999bcce70ac52e7f390a8f5cbc55948de: Gleb Natapov (1): KVM: MMU: simplify folding of dirty bit into accessed_dirty are available in the git repository at: git://github.com/agraf/linux-2.6.git kvm-ppc-next Alexander Graf (6): KVM: PPC: Only WARN on invalid emulation KVM: PPC: Book3S: PR: Enable alternative instruction for SC 1 KVM: PPC: BookE: Allow irq deliveries to inject requests KVM: PPC: BookE: Emulate mfspr on EPR KVM: PPC: BookE: Implement EPR exit KVM: PPC: BookE: Add EPR ONE_REG sync Mihai Caraman (2): KVM: PPC: Fix SREGS documentation reference KVM: PPC: Fix mfspr/mtspr MMUCFG emulation Documentation/virtual/kvm/api.txt | 43 -- arch/powerpc/include/asm/kvm_host.h |2 + arch/powerpc/include/asm/kvm_ppc.h | 10 arch/powerpc/include/uapi/asm/kvm.h |6 - arch/powerpc/kvm/book3s_emulate.c | 30 arch/powerpc/kvm/book3s_pr.c|5 arch/powerpc/kvm/booke.c| 40 +++- arch/powerpc/kvm/booke_emulate.c|3 ++ arch/powerpc/kvm/emulate.c |5 arch/powerpc/kvm/powerpc.c | 13 +- include/linux/kvm_host.h|1 + include/uapi/linux/kvm.h|6 + 12 files changed, 153 insertions(+), 11 deletions(-) -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 6/8] KVM: PPC: BookE: Emulate mfspr on EPR
The EPR register is potentially valid for PR KVM as well, so we need to emulate accesses to it. It's only defined for reading, so only handle the mfspr case. Signed-off-by: Alexander Graf ag...@suse.de --- arch/powerpc/kvm/booke_emulate.c |3 +++ 1 files changed, 3 insertions(+), 0 deletions(-) diff --git a/arch/powerpc/kvm/booke_emulate.c b/arch/powerpc/kvm/booke_emulate.c index 4685b8c..27a4b28 100644 --- a/arch/powerpc/kvm/booke_emulate.c +++ b/arch/powerpc/kvm/booke_emulate.c @@ -269,6 +269,9 @@ int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val) case SPRN_ESR: *spr_val = vcpu-arch.shared-esr; break; + case SPRN_EPR: + *spr_val = vcpu-arch.epr; + break; case SPRN_CSRR0: *spr_val = vcpu-arch.csrr0; break; -- 1.6.0.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 8/8] KVM: PPC: BookE: Add EPR ONE_REG sync
We need to be able to read and write the contents of the EPR register from user space. This patch implements that logic through the ONE_REG API and declares its (never implemented) SREGS counterpart as deprecated. Signed-off-by: Alexander Graf ag...@suse.de --- Documentation/virtual/kvm/api.txt |1 + arch/powerpc/include/uapi/asm/kvm.h |6 +- arch/powerpc/kvm/booke.c| 21 + 3 files changed, 27 insertions(+), 1 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index a98ed09..09905cb 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1774,6 +1774,7 @@ registers, find a list below: PPC | KVM_REG_PPC_VPA_SLB | 128 PPC | KVM_REG_PPC_VPA_DTL | 128 PPC | KVM_REG_PPC_EPCR | 32 + PPC | KVM_REG_PPC_EPR | 32 4.69 KVM_GET_ONE_REG diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h index 2fba8a6..16064d0 100644 --- a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h @@ -114,7 +114,10 @@ struct kvm_regs { /* Embedded Floating Point (SPE) -- IVOR32-34 if KVM_SREGS_E_IVOR */ #define KVM_SREGS_E_SPE(1 9) -/* External Proxy (EXP) -- EPR */ +/* + * DEPRECATED! USE ONE_REG FOR THIS ONE! + * External Proxy (EXP) -- EPR + */ #define KVM_SREGS_EXP (1 10) /* External PID (E.PD) -- EPSC/EPLC */ @@ -412,5 +415,6 @@ struct kvm_get_htab_header { #define KVM_REG_PPC_VPA_DTL(KVM_REG_PPC | KVM_REG_SIZE_U128 | 0x84) #define KVM_REG_PPC_EPCR (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x85) +#define KVM_REG_PPC_EPR(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x86) #endif /* __LINUX_KVM_POWERPC_H */ diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 940ec80..8779cd4 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -300,6 +300,15 @@ static void set_guest_esr(struct kvm_vcpu *vcpu, u32 esr) #endif } +static unsigned long get_guest_epr(struct kvm_vcpu *vcpu) +{ +#ifdef CONFIG_KVM_BOOKE_HV + return mfspr(SPRN_GEPR); +#else + return vcpu-arch.epr; +#endif +} + /* Deliver the interrupt of the corresponding priority, if possible. */ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority) @@ -1405,6 +1414,11 @@ int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) vcpu-arch.dbg_reg.dac[dac], sizeof(u64)); break; } + case KVM_REG_PPC_EPR: { + u32 epr = get_guest_epr(vcpu); + r = put_user(epr, (u32 __user *)(long)reg-addr); + break; + } #if defined(CONFIG_64BIT) case KVM_REG_PPC_EPCR: r = put_user(vcpu-arch.epcr, (u32 __user *)(long)reg-addr); @@ -1437,6 +1451,13 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) (u64 __user *)(long)reg-addr, sizeof(u64)); break; } + case KVM_REG_PPC_EPR: { + u32 new_epr; + r = get_user(new_epr, (u32 __user *)(long)reg-addr); + if (!r) + kvmppc_set_epr(vcpu, new_epr); + break; + } #if defined(CONFIG_64BIT) case KVM_REG_PPC_EPCR: { u32 new_epcr; -- 1.6.0.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 7/8] KVM: PPC: BookE: Implement EPR exit
The External Proxy Facility in FSL BookE chips allows the interrupt controller to automatically acknowledge an interrupt as soon as a core gets its pending external interrupt delivered. Today, user space implements the interrupt controller, so we need to check on it during such a cycle. This patch implements logic for user space to enable EPR exiting, disable EPR exiting and EPR exiting itself, so that user space can acknowledge an interrupt when an external interrupt has successfully been delivered into the guest vcpu. Signed-off-by: Alexander Graf ag...@suse.de --- Documentation/virtual/kvm/api.txt | 40 +- arch/powerpc/include/asm/kvm_host.h |2 + arch/powerpc/include/asm/kvm_ppc.h |9 +++ arch/powerpc/kvm/booke.c| 14 +++- arch/powerpc/kvm/powerpc.c | 10 include/linux/kvm_host.h|1 + include/uapi/linux/kvm.h|6 + 7 files changed, 79 insertions(+), 3 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 4fc2bfc..a98ed09 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -2246,8 +2246,8 @@ executed a memory-mapped I/O instruction which could not be satisfied by kvm. The 'data' member contains the written data if 'is_write' is true, and should be filled by application code otherwise. -NOTE: For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_DCR - and KVM_EXIT_PAPR the corresponding +NOTE: For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_DCR, + KVM_EXIT_PAPR and KVM_EXIT_EPR the corresponding operations are complete (and guest state is consistent) only after userspace has re-entered the kernel with KVM_RUN. The kernel side will first finish incomplete operations and then check for pending signals. Userspace @@ -2366,6 +2366,25 @@ interrupt for the target subchannel has been dequeued and subchannel_id, subchannel_nr, io_int_parm and io_int_word contain the parameters for that interrupt. ipb is needed for instruction parameter decoding. + /* KVM_EXIT_EPR */ + struct { + __u32 epr; + } epr; + +On FSL BookE PowerPC chips, the interrupt controller has a fast patch +interrupt acknowledge path to the core. When the core successfully +delivers an interrupt, it automatically populates the EPR register with +the interrupt vector number and acknowledges the interrupt inside +the interrupt controller. + +In case the interrupt controller lives in user space, we need to do +the interrupt acknowledge cycle through it to fetch the next to be +delivered interrupt vector using this exit. + +It gets triggered whenever both KVM_CAP_PPC_EPR are enabled and an +external interrupt has just been delivered into the guest. User space +should put the acknowledged interrupt vector into the 'epr' field. + /* Fix the size of the union. */ char padding[256]; }; @@ -2501,3 +2520,20 @@ handled in-kernel, while the other I/O instructions are passed to userspace. When this capability is enabled, KVM_EXIT_S390_TSCH will occur on TEST SUBCHANNEL intercepts. + +6.5 KVM_CAP_PPC_EPR + +Architectures: ppc +Parameters: args[0] defines whether the proxy facility is active +Returns: 0 on success; -1 on error + +This capability enables or disables the delivery of interrupts through the +external proxy facility. + +When enabled (args[0] != 0), every time the guest gets an external interrupt +delivered, it automatically exits into user space with a KVM_EXIT_EPR exit +to receive the topmost interrupt vector. + +When disabled (args[0] == 0), behavior is as if this facility is unsupported. + +When this capability is enabled, KVM_EXIT_EPR can occur. diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index ab49c6c..8a72d59 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -520,6 +520,8 @@ struct kvm_vcpu_arch { u8 sane; u8 cpu_type; u8 hcall_needed; + u8 epr_enabled; + u8 epr_needed; u32 cpr0_cfgaddr; /* holds the last set cpr0_cfgaddr */ diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 5f5f69a..493630e 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -264,6 +264,15 @@ static inline void kvm_linear_init(void) {} #endif +static inline void kvmppc_set_epr(struct kvm_vcpu *vcpu, u32 epr) +{ +#ifdef CONFIG_KVM_BOOKE_HV + mtspr(SPRN_GEPR, epr); +#elif defined(CONFIG_BOOKE) + vcpu-arch.epr = epr; +#endif +} + int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu, struct kvm_config_tlb *cfg); int kvm_vcpu_ioctl_dirty_tlb(struct kvm_vcpu *vcpu, diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 964f447..940ec80 100644 ---
[PATCH 4/8] KVM: PPC: Fix mfspr/mtspr MMUCFG emulation
From: Mihai Caraman mihai.cara...@freescale.com On mfspr/mtspr emulation path Book3E's MMUCFG SPR with value 1015 clashes with G4's MSSSR0 SPR. Move MSSSR0 emulation from generic part to Books3S. MSSSR0 also clashes with Book3S's DABRX SPR. DABRX was not explicitly handled so Book3S execution flow will behave as before. Signed-off-by: Mihai Caraman mihai.cara...@freescale.com Signed-off-by: Alexander Graf ag...@suse.de --- arch/powerpc/kvm/book3s_emulate.c |2 ++ arch/powerpc/kvm/emulate.c|5 - 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c index c88161b..836c569 100644 --- a/arch/powerpc/kvm/book3s_emulate.c +++ b/arch/powerpc/kvm/book3s_emulate.c @@ -455,6 +455,7 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val) case SPRN_PMC3_GEKKO: case SPRN_PMC4_GEKKO: case SPRN_WPAR_GEKKO: + case SPRN_MSSSR0: break; unprivileged: default: @@ -551,6 +552,7 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val) case SPRN_PMC3_GEKKO: case SPRN_PMC4_GEKKO: case SPRN_WPAR_GEKKO: + case SPRN_MSSSR0: *spr_val = 0; break; default: diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c index b0855e5..71abcf4 100644 --- a/arch/powerpc/kvm/emulate.c +++ b/arch/powerpc/kvm/emulate.c @@ -149,8 +149,6 @@ static int kvmppc_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) case SPRN_TBWL: break; case SPRN_TBWU: break; - case SPRN_MSSSR0: break; - case SPRN_DEC: vcpu-arch.dec = spr_val; kvmppc_emulate_dec(vcpu); @@ -201,9 +199,6 @@ static int kvmppc_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) case SPRN_PIR: spr_val = vcpu-vcpu_id; break; - case SPRN_MSSSR0: - spr_val = 0; - break; /* Note: mftb and TBRL/TBWL are user-accessible, so * the guest can always access the real TB anyways. -- 1.6.0.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 5/8] KVM: PPC: BookE: Allow irq deliveries to inject requests
When injecting an interrupt into guest context, we usually don't need to check for requests anymore. At least not until today. With the introduction of EPR, we will have to create a request when the guest has successfully accepted an external interrupt though. So we need to prepare the interrupt delivery to abort guest entry gracefully. Otherwise we'd delay the EPR request. Signed-off-by: Alexander Graf ag...@suse.de --- arch/powerpc/kvm/booke.c |5 + 1 files changed, 5 insertions(+), 0 deletions(-) diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 69f1140..964f447 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -581,6 +581,11 @@ int kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu) kvmppc_core_check_exceptions(vcpu); + if (vcpu-requests) { + /* Exception delivery raised request; start over */ + return 1; + } + if (vcpu-arch.shared-msr MSR_WE) { local_irq_enable(); kvm_vcpu_block(vcpu); -- 1.6.0.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 3/8] KVM: PPC: Book3S: PR: Enable alternative instruction for SC 1
When running on top of pHyp, the hypercall instruction sc 1 goes straight into pHyp without trapping in supervisor mode. So if we want to support PAPR guest in this configuration we need to add a second way of accessing PAPR hypercalls, preferably with the exact same semantics except for the instruction. So let's overlay an officially reserved instruction and emulate PAPR hypercalls whenever we hit that one. Signed-off-by: Alexander Graf ag...@suse.de --- arch/powerpc/include/asm/kvm_ppc.h |1 + arch/powerpc/kvm/book3s_emulate.c | 28 arch/powerpc/kvm/book3s_pr.c |5 + 3 files changed, 34 insertions(+), 0 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 572aa75..5f5f69a 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -44,6 +44,7 @@ enum emulation_result { EMULATE_DO_DCR, /* kvm_run filled with DCR request */ EMULATE_FAIL, /* can't emulate this instruction */ EMULATE_AGAIN,/* something went wrong. go again */ + EMULATE_DO_PAPR, /* kvm_run filled with PAPR request */ }; extern int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu); diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c index d31a716..c88161b 100644 --- a/arch/powerpc/kvm/book3s_emulate.c +++ b/arch/powerpc/kvm/book3s_emulate.c @@ -34,6 +34,8 @@ #define OP_31_XOP_MTSRIN 242 #define OP_31_XOP_TLBIEL 274 #define OP_31_XOP_TLBIE306 +/* Opcode is officially reserved, reuse it as sc 1 when sc 1 doesn't trap */ +#define OP_31_XOP_FAKE_SC1 308 #define OP_31_XOP_SLBMTE 402 #define OP_31_XOP_SLBIE434 #define OP_31_XOP_SLBIA498 @@ -170,6 +172,32 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, vcpu-arch.mmu.tlbie(vcpu, addr, large); break; } +#ifdef CONFIG_KVM_BOOK3S_64_PR + case OP_31_XOP_FAKE_SC1: + { + /* SC 1 papr hypercalls */ + ulong cmd = kvmppc_get_gpr(vcpu, 3); + int i; + + if ((vcpu-arch.shared-msr MSR_PR) || + !vcpu-arch.papr_enabled) { + emulated = EMULATE_FAIL; + break; + } + + if (kvmppc_h_pr(vcpu, cmd) == EMULATE_DONE) + break; + + run-papr_hcall.nr = cmd; + for (i = 0; i 9; ++i) { + ulong gpr = kvmppc_get_gpr(vcpu, 4 + i); + run-papr_hcall.args[i] = gpr; + } + + emulated = EMULATE_DO_PAPR; + break; + } +#endif case OP_31_XOP_EIOIO: break; case OP_31_XOP_SLBMTE: diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 28d38ad..73ed11c 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -760,6 +760,11 @@ program_interrupt: run-exit_reason = KVM_EXIT_MMIO; r = RESUME_HOST_NV; break; + case EMULATE_DO_PAPR: + run-exit_reason = KVM_EXIT_PAPR_HCALL; + vcpu-arch.hcall_needed = 1; + r = RESUME_HOST_NV; + break; default: BUG(); } -- 1.6.0.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/8] KVM: PPC: Only WARN on invalid emulation
When we hit an emulation result that we didn't expect, that is an error, but it's nothing that warrants a BUG(), because it can be guest triggered. So instead, let's only WARN() the user that this happened. Signed-off-by: Alexander Graf ag...@suse.de --- arch/powerpc/kvm/powerpc.c |3 ++- 1 files changed, 2 insertions(+), 1 deletions(-) diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index be83fca..e2225e5 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -237,7 +237,8 @@ int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu) r = RESUME_HOST; break; default: - BUG(); + WARN_ON(1); + r = RESUME_GUEST; } return r; -- 1.6.0.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/8] KVM: PPC: Fix SREGS documentation reference
From: Mihai Caraman mihai.cara...@freescale.com Reflect the uapi folder change in SREGS API documentation. Signed-off-by: Mihai Caraman mihai.cara...@freescale.com Reviewed-by: Amos Kong kongjian...@gmail.com Signed-off-by: Alexander Graf ag...@suse.de --- Documentation/virtual/kvm/api.txt |2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index f2d6391..4fc2bfc 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -345,7 +345,7 @@ struct kvm_sregs { __u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64]; }; -/* ppc -- see arch/powerpc/include/asm/kvm.h */ +/* ppc -- see arch/powerpc/include/uapi/asm/kvm.h */ interrupt_bitmap is a bitmap of pending external interrupts. At most one bit may be set. This interrupt has been acknowledged by the APIC -- 1.6.0.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] emulator: simple ALU tests
On Fri, Jan 04, 2013 at 04:17:04PM +0200, Avi Kivity wrote: Signed-off-by: Avi Kivity avi.kiv...@gmail.com --- x86/emulator.c | 26 ++ 1 file changed, 26 insertions(+) Applied, thanks. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 0/2] KVM: s390: Bugfixes for virtio-ccw.
On Mon, Jan 07, 2013 at 03:51:50PM +0100, Cornelia Huck wrote: Hi, Christian discovered some problems with regard to serialization in the virtio-ccw guest driver. Per-device data structures might contain data obtained by channel programs issued later on, leading to confusing behaviour. We cannot rely on the common I/O layer serialization here. Rather than adding extra serialization, we decided to keep it simple with per-request allocated data structures and retries on busy. These patches have been run in our internal testing without problems for a bit now. Please apply to kvm-next. Applied, thanks. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 01/12] tap: multiqueue support
On 01/10/2013 06:28 PM, Stefan Hajnoczi wrote: On Fri, Dec 28, 2012 at 06:31:53PM +0800, Jason Wang wrote: Mainly suggestions to make the code easier to understand, but see the comment about the 1:1 queue/NetClientState model for a general issue with this approach. Ok, thanks for the reviewing. Recently, linux support multiqueue tap which could let userspace call TUNSETIFF for a signle device many times to create multiple file descriptors as s/signle/single/ (Noting these if you respin.) Sorry about this, will be careful. independent queues. User could also enable/disabe a specific queue through s/disabe/disable/ TUNSETQUEUE. The patch adds the generic infrastructure to create multiqueue taps. To achieve this a new parameter queues were introduced to specify how many queues were expected to be created for tap. The fd parameter were also changed to support a list of file descriptors which could be used by management (such as libvirt) to pass pre-created file descriptors (queues) to qemu. Each TAPState were still associated to a tap fd, which mean multiple TAPStates were created when user needs multiqueue taps. Only linux part were implemented now, since it's the only OS that support multiqueue tap. Signed-off-by: Jason Wang jasow...@redhat.com --- net/tap-aix.c | 18 - net/tap-bsd.c | 18 - net/tap-haiku.c | 18 - net/tap-linux.c | 70 +++- net/tap-linux.h |4 + net/tap-solaris.c | 18 - net/tap-win32.c | 10 ++ net/tap.c | 248 + net/tap.h |8 ++- qapi-schema.json |5 +- 10 files changed, 335 insertions(+), 82 deletions(-) This patch should be split up: 1. linux-headers: import linux/if_tun.h multiqueue constants 2. tap: add Linux multiqueue support (tap_open(), tap_fd_attach(), tap_fd_detach()) 3. tap: queue attach/detach (tap_attach(), tap_detach()) 4. tap: split out net_init_one_tap() function (pure code motion, to make later diffs easy to review) 5. tap: add queues and multi-fd options (net_init_tap()/net_init_one_tap() changes) Each commit description can explain how this works in more detail. I think I've figured it out now but it would have helped to separate things out from the start. Ok. diff --git a/net/tap-aix.c b/net/tap-aix.c index f27c177..f931ef3 100644 --- a/net/tap-aix.c +++ b/net/tap-aix.c @@ -25,7 +25,8 @@ #include net/tap.h #include stdio.h -int tap_open(char *ifname, int ifname_size, int *vnet_hdr, int vnet_hdr_required) +int tap_open(char *ifname, int ifname_size, int *vnet_hdr, + int vnet_hdr_required, int mq_required) { fprintf(stderr, no tap on AIX\n); return -1; @@ -59,3 +60,18 @@ void tap_fd_set_offload(int fd, int csum, int tso4, int tso6, int ecn, int ufo) { } + +int tap_fd_attach(int fd) +{ +return -1; +} + +int tap_fd_detach(int fd) +{ +return -1; +} + +int tap_fd_ifname(int fd, char *ifname) +{ +return -1; +} diff --git a/net/tap-bsd.c b/net/tap-bsd.c index a3b717d..07c287d 100644 --- a/net/tap-bsd.c +++ b/net/tap-bsd.c @@ -33,7 +33,8 @@ #include net/if_tap.h #endif -int tap_open(char *ifname, int ifname_size, int *vnet_hdr, int vnet_hdr_required) +int tap_open(char *ifname, int ifname_size, int *vnet_hdr, + int vnet_hdr_required, int mq_required) { int fd; #ifdef TAPGIFNAME @@ -145,3 +146,18 @@ void tap_fd_set_offload(int fd, int csum, int tso4, int tso6, int ecn, int ufo) { } + +int tap_fd_attach(int fd) +{ +return -1; +} + +int tap_fd_detach(int fd) +{ +return -1; +} + +int tap_fd_ifname(int fd, char *ifname) +{ +return -1; +} diff --git a/net/tap-haiku.c b/net/tap-haiku.c index 34739d1..62ab423 100644 --- a/net/tap-haiku.c +++ b/net/tap-haiku.c @@ -25,7 +25,8 @@ #include net/tap.h #include stdio.h -int tap_open(char *ifname, int ifname_size, int *vnet_hdr, int vnet_hdr_required) +int tap_open(char *ifname, int ifname_size, int *vnet_hdr, + int vnet_hdr_required, int mq_required) { fprintf(stderr, no tap on Haiku\n); return -1; @@ -59,3 +60,18 @@ void tap_fd_set_offload(int fd, int csum, int tso4, int tso6, int ecn, int ufo) { } + +int tap_fd_attach(int fd) +{ +return -1; +} + +int tap_fd_detach(int fd) +{ +return -1; +} + +int tap_fd_ifname(int fd, char *ifname) +{ +return -1; +} diff --git a/net/tap-linux.c b/net/tap-linux.c index c6521be..0854ef5 100644 --- a/net/tap-linux.c +++ b/net/tap-linux.c @@ -35,7 +35,8 @@ #define PATH_NET_TUN /dev/net/tun -int tap_open(char *ifname, int ifname_size, int *vnet_hdr, int vnet_hdr_required) +int tap_open(char *ifname, int ifname_size, int *vnet_hdr, + int vnet_hdr_required, int mq_required) {
Re: [Qemu-devel] [PATCH 00/12] Multiqueue virtio-net
On 01/10/2013 07:49 PM, Stefan Hajnoczi wrote: On Thu, Jan 10, 2013 at 05:34:14PM +0800, Jason Wang wrote: On 01/10/2013 04:44 PM, Stefan Hajnoczi wrote: On Wed, Jan 09, 2013 at 11:33:25PM +0800, Jason Wang wrote: On 01/09/2013 11:32 PM, Michael S. Tsirkin wrote: On Wed, Jan 09, 2013 at 03:29:24PM +0100, Stefan Hajnoczi wrote: On Fri, Dec 28, 2012 at 06:31:52PM +0800, Jason Wang wrote: Perf Numbers: Two Intel Xeon 5620 with direct connected intel 82599EB Host/Guest kernel: David net tree vhost enabled - lots of improvents of both latency and cpu utilization in request-reponse test - get regression of guest sending small packets which because TCP tends to batch less when the latency were improved 1q/2q/4q TCP_RR size #sessions trans.rate norm trans.rate norm trans.rate norm 1 1 9393.26 595.64 9408.18 597.34 9375.19 584.12 1 2072162.1 2214.24 129880.22 2456.13 196949.81 2298.13 1 50107513.38 2653.99 139721.93 2490.58 259713.82 2873.57 1 100 126734.63 2676.54 145553.5 2406.63 265252.68 2943 64 19453.42 632.33 9371.37 616.13 9338.19 615.97 64 20 70620.03 2093.68 125155.75 2409.15 191239.91 2253.32 64 50 1069662448.29 146518.67 2514.47 242134.07 2720.91 64 100 117046.35 2394.56 190153.09 2696.82 238881.29 2704.41 256 1 8733.29 736.36 8701.07 680.83 8608.92 530.1 256 20 69279.89 2274.45 115103.07 2299.76 144555.16 1963.53 256 50 97676.02 2296.09 150719.57 2522.92 254510.5 3028.44 256 100 150221.55 2949.56 197569.3 2790.92 300695.78 3494.83 TCP_CRR size #sessions trans.rate norm trans.rate norm trans.rate norm 1 1 2848.37 163.41 2230.39 130.89 2013.09 120.47 1 2023434.5 562.11 31057.43 531.07 49488.28 564.41 1 5028514.88 582.17 40494.23 605.92 60113.35 654.97 1 100 28827.22 584.73 48813.25 661.6 61783.62 676.56 64 12780.08 159.4 2201.07 127.96 2006.8 117.63 64 20 23318.51 564.47 30982.44 530.24 49734.95 566.13 64 50 28585.72 582.54 40576.7 610.08 60167.89 656.56 64 100 28747.37 584.17 49081.87 667.87 60612.94 662 256 1 2772.08 160.51 2231.84 131.05 2003.62 113.45 256 20 23086.35 559.8 30929.09 528.16 48454.9 555.22 256 50 28354.7 579.85 40578.31 60760261.71 657.87 256 100 28844.55 585.67 48541.86 659.08 61941.07 676.72 TCP_STREAM guest receiving size #sessions throughput norm throughput norm throughput norm 1 1 16.27 1.33 16.11.12 16.13 0.99 1 2 33.04 2.08 32.96 2.19 32.75 1.98 1 4 66.62 6.83 68.35.56 66.14 2.65 64 1896.55 56.67 914.02 58.14 898.9 61.56 64 21830.46 91.02 1812.02 64.59 1835.57 66.26 64 43626.61 142.55 3636.25 100.64 3607.46 75.03 256 1 2619.49 131.23 2543.19 129.03 2618.69 132.39 256 2 5136.58 203.02 5163.31 141.11 5236.51 149.4 256 4 7063.99 242.83 9365.4 208.49 9421.03 159.94 512 1 3592.43 165.24 3603.12 167.19 3552.5 169.57 512 2 7042.62 246.59 7068.46 180.87 7258.52 186.3 512 4 6996.08 241.49 9298.34 206.12 9418.52 159.33 1024 1 4339.54 192.95 4370.2 191.92 4211.72 192.49 1024 2 7439.45 254.77 9403.99 215.24 9120.82 222.67 1024 4 7953.86 272.11 9403.87 208.23 9366.98 159.49 4096 1 7696.28 272.04 7611.41 270.38 7778.71 267.76 4096 2 7530.35 261.1 8905.43 246.27 8990.18 267.57 4096 4 7121.6 247.02 9411.75 206.71 9654.96 184.67 16384 1 7795.73 268.54 7780.94 267.2 7634.26 260.73 16384 2 7436.57 255.81 9381.86 220.85 9392220.36 16384 4 7199.07 247.81 9420.96 205.87 9373.69 159.57 TCP_MAERTS guest sending size #sessions throughput norm throughput norm throughput norm 1 1 15.94 0.62 15.55 0.61 15.13 0.59 1 2 36.11 0.83 32.46 0.69 32.28 0.69 1 4 71.59 1 68.91 0.94 61.52 0.77 64 1630.71 22.52 622.11 22.35 605.09 21.84 64 21442.36 30.57 1292.15 25.82 1282.67 25.55 64 43186.79 42.59 2844.96 36.03 2529.69 30.06 256 1 1760.96 58.07 1738.44 57.43 1695.99 56.19 256 2 4834.23 95.19 3524.85 64.21 3511.94 64.45 256 4 9324.63 145.74 8956.49 116.39 6720.17 73.86 512 1 2678.03 84.1 2630.68 82.93 2636.54 82.57 512 2 9368.17 195.61 9408.82 204.53 5316.3 92.99 512 4 9186.34 209.68 9358.72 183.82 9489.29 160.42 1024 1 3620.71 109.88 3625.54 109.83 3606.61 112.35 1024 2 9429258.32 7082.79 120.55 7403.53 134.78 1024 4 9430.66 290.44 9499.29 232.31 9414.6 190.92 4096 1 9339.28 296.48 9374.23 372.88 9348.76 298.49 4096 2 9410.53 378.69 9412.61 286.18 9409.75 278.31 4096 4 9487.35 374.1 9556.91 288.81 9441.94 221.64 16384 1 9380.43 403.8 9379.78 399.13 9382.42 393.55 16384 2 9367.69 406.93 9415.04 312.68 9409.29 300.9 16384 4 9391.96 405.17 9695.12 310.54 9423.76 223.47 Trying to understand the performance results: What is the host device configuration? tap + bridge? Yes. Did you use host CPU affinity for the vhost threads? I use numactl to pin cpu threads and vhost threads in the same numa node. Can
[RFC PATCH 0/2] make mac programming for virtio net more robust
From: Amos Kong ak...@redhat.com Currenly mac is programmed byte by byte. This means that we have an intermediate step where mac is wrong. Second patch introduced a new vq control command to set mac address in one time. Amos Kong (2): move virtnet_send_command() above virtnet_set_mac_address() virtio-net: introduce a new control to set macaddr drivers/net/virtio_net.c| 105 ++-- include/uapi/linux/virtio_net.h | 8 ++- 2 files changed, 66 insertions(+), 47 deletions(-) -- 1.7.11.7 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH 1/2] move virtnet_send_command() above virtnet_set_mac_address()
From: Amos Kong ak...@redhat.com We will send vq command to set mac address in virtnet_set_mac_address() a little fix of coding style Signed-off-by: Amos Kong ak...@redhat.com --- drivers/net/virtio_net.c | 89 1 file changed, 44 insertions(+), 45 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index a6fcf15..395ab4f 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -753,6 +753,50 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) return NETDEV_TX_OK; } +/* + * Send command via the control virtqueue and check status. Commands + * supported by the hypervisor, as indicated by feature bits, should + * never fail unless improperly formated. + */ +static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd, +struct scatterlist *data, int out, int in) +{ + struct scatterlist *s, sg[VIRTNET_SEND_COMMAND_SG_MAX + 2]; + struct virtio_net_ctrl_hdr ctrl; + virtio_net_ctrl_ack status = ~0; + unsigned int tmp; + int i; + + /* Caller should know better */ + BUG_ON(!virtio_has_feature(vi-vdev, VIRTIO_NET_F_CTRL_VQ) || + (out + in VIRTNET_SEND_COMMAND_SG_MAX)); + + out++; /* Add header */ + in++; /* Add return status */ + + ctrl.class = class; + ctrl.cmd = cmd; + + sg_init_table(sg, out + in); + + sg_set_buf(sg[0], ctrl, sizeof(ctrl)); + for_each_sg(data, s, out + in - 2, i) + sg_set_buf(sg[i + 1], sg_virt(s), s-length); + sg_set_buf(sg[out + in - 1], status, sizeof(status)); + + BUG_ON(virtqueue_add_buf(vi-cvq, sg, out, in, vi, GFP_ATOMIC) 0); + + virtqueue_kick(vi-cvq); + + /* Spin for a response, the kick causes an ioport write, trapping +* into the hypervisor, so the request should be handled immediately. +*/ + while (!virtqueue_get_buf(vi-cvq, tmp)) + cpu_relax(); + + return status == VIRTIO_NET_OK; +} + static int virtnet_set_mac_address(struct net_device *dev, void *p) { struct virtnet_info *vi = netdev_priv(dev); @@ -819,51 +863,6 @@ static void virtnet_netpoll(struct net_device *dev) } #endif -/* - * Send command via the control virtqueue and check status. Commands - * supported by the hypervisor, as indicated by feature bits, should - * never fail unless improperly formated. - */ -static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd, -struct scatterlist *data, int out, int in) -{ - struct scatterlist *s, sg[VIRTNET_SEND_COMMAND_SG_MAX + 2]; - struct virtio_net_ctrl_hdr ctrl; - virtio_net_ctrl_ack status = ~0; - unsigned int tmp; - int i; - - /* Caller should know better */ - BUG_ON(!virtio_has_feature(vi-vdev, VIRTIO_NET_F_CTRL_VQ) || - (out + in VIRTNET_SEND_COMMAND_SG_MAX)); - - out++; /* Add header */ - in++; /* Add return status */ - - ctrl.class = class; - ctrl.cmd = cmd; - - sg_init_table(sg, out + in); - - sg_set_buf(sg[0], ctrl, sizeof(ctrl)); - for_each_sg(data, s, out + in - 2, i) - sg_set_buf(sg[i + 1], sg_virt(s), s-length); - sg_set_buf(sg[out + in - 1], status, sizeof(status)); - - BUG_ON(virtqueue_add_buf(vi-cvq, sg, out, in, vi, GFP_ATOMIC) 0); - - virtqueue_kick(vi-cvq); - - /* -* Spin for a response, the kick causes an ioport write, trapping -* into the hypervisor, so the request should be handled immediately. -*/ - while (!virtqueue_get_buf(vi-cvq, tmp)) - cpu_relax(); - - return status == VIRTIO_NET_OK; -} - static void virtnet_ack_link_announce(struct virtnet_info *vi) { rtnl_lock(); -- 1.7.11.7 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH 2/2] virtio-net: introduce a new control to set macaddr
From: Amos Kong ak...@redhat.com Currently we write MAC address to pci config space byte by byte, this means that we have an intermediate step where mac is wrong. This patch introduced a new control command to set MAC address in one time. VIRTIO_NET_F_CTRL_MAC_ADDR is a new feature bit for compatibility. Signed-off-by: Amos Kong ak...@redhat.com --- drivers/net/virtio_net.c| 16 +++- include/uapi/linux/virtio_net.h | 8 +++- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 395ab4f..ff22bcd 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -803,13 +803,26 @@ static int virtnet_set_mac_address(struct net_device *dev, void *p) struct virtio_device *vdev = vi-vdev; int ret; + struct scatterlist sg; + ret = eth_mac_addr(dev, p); if (ret) return ret; - if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC)) + if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR)) { + /* Set MAC address by sending vq command */ + sg_init_one(sg, dev-dev_addr, dev-addr_len); + virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC, +VIRTIO_NET_CTRL_MAC_ADDR_SET, +sg, 1, 0); + return 0; + } + + if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC)) { + /* Set MAC address by writing config space */ vdev-config-set(vdev, offsetof(struct virtio_net_config, mac), dev-dev_addr, dev-addr_len); + } return 0; } @@ -1627,6 +1640,7 @@ static unsigned int features[] = { VIRTIO_NET_F_MRG_RXBUF, VIRTIO_NET_F_STATUS, VIRTIO_NET_F_CTRL_VQ, VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN, VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_MQ, + VIRTIO_NET_F_CTRL_MAC_ADDR, }; static struct virtio_driver virtio_net_driver = { diff --git a/include/uapi/linux/virtio_net.h b/include/uapi/linux/virtio_net.h index 848e358..a5a8c88 100644 --- a/include/uapi/linux/virtio_net.h +++ b/include/uapi/linux/virtio_net.h @@ -53,6 +53,7 @@ * network */ #define VIRTIO_NET_F_MQ22 /* Device supports Receive Flow * Steering */ +#define VIRTIO_NET_F_CTRL_MAC_ADDR 23 /* Set MAC address */ #define VIRTIO_NET_S_LINK_UP 1 /* Link is up */ #define VIRTIO_NET_S_ANNOUNCE 2 /* Announcement is needed */ @@ -127,7 +128,7 @@ typedef __u8 virtio_net_ctrl_ack; #define VIRTIO_NET_CTRL_RX_NOBCAST 5 /* - * Control the MAC filter table. + * Control the MAC * * The MAC filter table is managed by the hypervisor, the guest should * assume the size is infinite. Filtering should be considered @@ -140,6 +141,10 @@ typedef __u8 virtio_net_ctrl_ack; * first sg list contains unicast addresses, the second is for multicast. * This functionality is present if the VIRTIO_NET_F_CTRL_RX feature * is available. + * + * The ADDR_SET command requests one out scatterlist, it contains a + * 6 bytes MAC address. This functionality is present if the + * VIRTIO_NET_F_CTRL_MAC_ADDR feature is available. */ struct virtio_net_ctrl_mac { __u32 entries; @@ -148,6 +153,7 @@ struct virtio_net_ctrl_mac { #define VIRTIO_NET_CTRL_MAC1 #define VIRTIO_NET_CTRL_MAC_TABLE_SET0 + #define VIRTIO_NET_CTRL_MAC_ADDR_SET 1 /* * Control VLAN filtering -- 1.7.11.7 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC PATCH 1/2] move virtnet_send_command() above virtnet_set_mac_address()
On 01/10/2013 10:45 PM, ak...@redhat.com wrote: From: Amos Kong ak...@redhat.com We will send vq command to set mac address in virtnet_set_mac_address() a little fix of coding style Maybe what you need is just a forward declaration. Signed-off-by: Amos Kong ak...@redhat.com --- drivers/net/virtio_net.c | 89 1 file changed, 44 insertions(+), 45 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index a6fcf15..395ab4f 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -753,6 +753,50 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) return NETDEV_TX_OK; } +/* + * Send command via the control virtqueue and check status. Commands + * supported by the hypervisor, as indicated by feature bits, should + * never fail unless improperly formated. + */ +static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd, + struct scatterlist *data, int out, int in) +{ + struct scatterlist *s, sg[VIRTNET_SEND_COMMAND_SG_MAX + 2]; + struct virtio_net_ctrl_hdr ctrl; + virtio_net_ctrl_ack status = ~0; + unsigned int tmp; + int i; + + /* Caller should know better */ + BUG_ON(!virtio_has_feature(vi-vdev, VIRTIO_NET_F_CTRL_VQ) || + (out + in VIRTNET_SEND_COMMAND_SG_MAX)); + + out++; /* Add header */ + in++; /* Add return status */ + + ctrl.class = class; + ctrl.cmd = cmd; + + sg_init_table(sg, out + in); + + sg_set_buf(sg[0], ctrl, sizeof(ctrl)); + for_each_sg(data, s, out + in - 2, i) + sg_set_buf(sg[i + 1], sg_virt(s), s-length); + sg_set_buf(sg[out + in - 1], status, sizeof(status)); + + BUG_ON(virtqueue_add_buf(vi-cvq, sg, out, in, vi, GFP_ATOMIC) 0); + + virtqueue_kick(vi-cvq); + + /* Spin for a response, the kick causes an ioport write, trapping + * into the hypervisor, so the request should be handled immediately. + */ + while (!virtqueue_get_buf(vi-cvq, tmp)) + cpu_relax(); + + return status == VIRTIO_NET_OK; +} + static int virtnet_set_mac_address(struct net_device *dev, void *p) { struct virtnet_info *vi = netdev_priv(dev); @@ -819,51 +863,6 @@ static void virtnet_netpoll(struct net_device *dev) } #endif -/* - * Send command via the control virtqueue and check status. Commands - * supported by the hypervisor, as indicated by feature bits, should - * never fail unless improperly formated. - */ -static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd, - struct scatterlist *data, int out, int in) -{ - struct scatterlist *s, sg[VIRTNET_SEND_COMMAND_SG_MAX + 2]; - struct virtio_net_ctrl_hdr ctrl; - virtio_net_ctrl_ack status = ~0; - unsigned int tmp; - int i; - - /* Caller should know better */ - BUG_ON(!virtio_has_feature(vi-vdev, VIRTIO_NET_F_CTRL_VQ) || - (out + in VIRTNET_SEND_COMMAND_SG_MAX)); - - out++; /* Add header */ - in++; /* Add return status */ - - ctrl.class = class; - ctrl.cmd = cmd; - - sg_init_table(sg, out + in); - - sg_set_buf(sg[0], ctrl, sizeof(ctrl)); - for_each_sg(data, s, out + in - 2, i) - sg_set_buf(sg[i + 1], sg_virt(s), s-length); - sg_set_buf(sg[out + in - 1], status, sizeof(status)); - - BUG_ON(virtqueue_add_buf(vi-cvq, sg, out, in, vi, GFP_ATOMIC) 0); - - virtqueue_kick(vi-cvq); - - /* - * Spin for a response, the kick causes an ioport write, trapping - * into the hypervisor, so the request should be handled immediately. - */ - while (!virtqueue_get_buf(vi-cvq, tmp)) - cpu_relax(); - - return status == VIRTIO_NET_OK; -} - static void virtnet_ack_link_announce(struct virtnet_info *vi) { rtnl_lock(); -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH] virtio-net: introduce a new macaddr control
From: Amos Kong ak...@redhat.com In virtio-net guest driver, currently we write MAC address to pci config space byte by byte, this means that we have an intermediate step where mac is wrong. This patch introduced a new control command to set MAC address in one time. VIRTIO_NET_F_CTRL_MAC_ADDR is a new feature bit for compatibility. Signed-off-by: Amos Kong ak...@redhat.com --- hw/virtio-net.c | 9 + hw/virtio-net.h | 9 - 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/hw/virtio-net.c b/hw/virtio-net.c index dc7c6d6..fc11106 100644 --- a/hw/virtio-net.c +++ b/hw/virtio-net.c @@ -247,6 +247,7 @@ static uint32_t virtio_net_get_features(VirtIODevice *vdev, uint32_t features) VirtIONet *n = to_virtio_net(vdev); features |= (1 VIRTIO_NET_F_MAC); +features |= (1 VIRTIO_NET_F_CTRL_MAC_ADDR); if (!peer_has_vnet_hdr(n)) { features = ~(0x1 VIRTIO_NET_F_CSUM); @@ -282,6 +283,7 @@ static uint32_t virtio_net_bad_features(VirtIODevice *vdev) /* Linux kernel 2.6.25. It understood MAC (as everyone must), * but also these: */ features |= (1 VIRTIO_NET_F_MAC); +features |= (1 VIRTIO_NET_F_CTRL_MAC_ADDR); features |= (1 VIRTIO_NET_F_CSUM); features |= (1 VIRTIO_NET_F_HOST_TSO4); features |= (1 VIRTIO_NET_F_HOST_TSO6); @@ -349,6 +351,13 @@ static int virtio_net_handle_mac(VirtIONet *n, uint8_t cmd, { struct virtio_net_ctrl_mac mac_data; +if (cmd == VIRTIO_NET_CTRL_MAC_ADDR_SET elem-out_num == 2) { +/* Set MAC address */ +memcpy(n-mac, elem-out_sg[1].iov_base, elem-out_sg[1].iov_len); +qemu_format_nic_info_str(n-nic-nc, n-mac); +return VIRTIO_NET_OK; +} + if (cmd != VIRTIO_NET_CTRL_MAC_TABLE_SET || elem-out_num != 3 || elem-out_sg[1].iov_len sizeof(mac_data) || elem-out_sg[2].iov_len sizeof(mac_data)) diff --git a/hw/virtio-net.h b/hw/virtio-net.h index d46fb98..9394cc0 100644 --- a/hw/virtio-net.h +++ b/hw/virtio-net.h @@ -44,6 +44,8 @@ #define VIRTIO_NET_F_CTRL_VLAN 19 /* Control channel VLAN filtering */ #define VIRTIO_NET_F_CTRL_RX_EXTRA 20 /* Extra RX mode control support */ +#define VIRTIO_NET_F_CTRL_MAC_ADDR 23 /* Set MAC address */ + #define VIRTIO_NET_S_LINK_UP1 /* Link is up */ #define TX_TIMER_INTERVAL 15 /* 150 us */ @@ -106,7 +108,7 @@ typedef uint8_t virtio_net_ctrl_ack; #define VIRTIO_NET_CTRL_RX_MODE_NOBCAST 5 /* - * Control the MAC filter table. + * Control the MAC * * The MAC filter table is managed by the hypervisor, the guest should * assume the size is infinite. Filtering should be considered @@ -119,6 +121,10 @@ typedef uint8_t virtio_net_ctrl_ack; * first sg list contains unicast addresses, the second is for multicast. * This functionality is present if the VIRTIO_NET_F_CTRL_RX feature * is available. + * + * The ADDR_SET command requests one out scatterlist, it contains a + * 6 bytes MAC address. This functionality is present if the + * VIRTIO_NET_F_CTRL_MAC_ADDR feature is available. */ struct virtio_net_ctrl_mac { uint32_t entries; @@ -126,6 +132,7 @@ struct virtio_net_ctrl_mac { }; #define VIRTIO_NET_CTRL_MAC1 #define VIRTIO_NET_CTRL_MAC_TABLE_SET0 + #define VIRTIO_NET_CTRL_MAC_ADDR_SET 1 /* * Control VLAN filtering -- 1.7.11.7 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] [RFC PATCH 2/2] virtio-net: introduce a new control to set macaddr
On 01/10/2013 10:45 PM, ak...@redhat.com wrote: From: Amos Kong ak...@redhat.com Currently we write MAC address to pci config space byte by byte, this means that we have an intermediate step where mac is wrong. This patch introduced a new control command to set MAC address in one time. VIRTIO_NET_F_CTRL_MAC_ADDR is a new feature bit for compatibility. Signed-off-by: Amos Kong ak...@redhat.com --- drivers/net/virtio_net.c| 16 +++- include/uapi/linux/virtio_net.h | 8 +++- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 395ab4f..ff22bcd 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -803,13 +803,26 @@ static int virtnet_set_mac_address(struct net_device *dev, void *p) struct virtio_device *vdev = vi-vdev; int ret; + struct scatterlist sg; + ret = eth_mac_addr(dev, p); if (ret) return ret; - if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC)) + if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR)) { + /* Set MAC address by sending vq command */ + sg_init_one(sg, dev-dev_addr, dev-addr_len); + virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC, + VIRTIO_NET_CTRL_MAC_ADDR_SET, + sg, 1, 0); + return 0; + } + Better to check the return of virtnet_send_command() and give a warn like other command. Btw, need we fail back to try the old way then? + if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC)) { + /* Set MAC address by writing config space */ vdev-config-set(vdev, offsetof(struct virtio_net_config, mac), dev-dev_addr, dev-addr_len); + } return 0; } @@ -1627,6 +1640,7 @@ static unsigned int features[] = { VIRTIO_NET_F_MRG_RXBUF, VIRTIO_NET_F_STATUS, VIRTIO_NET_F_CTRL_VQ, VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN, VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_MQ, + VIRTIO_NET_F_CTRL_MAC_ADDR, }; static struct virtio_driver virtio_net_driver = { diff --git a/include/uapi/linux/virtio_net.h b/include/uapi/linux/virtio_net.h index 848e358..a5a8c88 100644 --- a/include/uapi/linux/virtio_net.h +++ b/include/uapi/linux/virtio_net.h @@ -53,6 +53,7 @@ * network */ #define VIRTIO_NET_F_MQ 22 /* Device supports Receive Flow * Steering */ +#define VIRTIO_NET_F_CTRL_MAC_ADDR 23/* Set MAC address */ #define VIRTIO_NET_S_LINK_UP 1 /* Link is up */ #define VIRTIO_NET_S_ANNOUNCE2 /* Announcement is needed */ @@ -127,7 +128,7 @@ typedef __u8 virtio_net_ctrl_ack; #define VIRTIO_NET_CTRL_RX_NOBCAST 5 /* - * Control the MAC filter table. + * Control the MAC * * The MAC filter table is managed by the hypervisor, the guest should * assume the size is infinite. Filtering should be considered @@ -140,6 +141,10 @@ typedef __u8 virtio_net_ctrl_ack; * first sg list contains unicast addresses, the second is for multicast. * This functionality is present if the VIRTIO_NET_F_CTRL_RX feature * is available. + * + * The ADDR_SET command requests one out scatterlist, it contains a + * 6 bytes MAC address. This functionality is present if the + * VIRTIO_NET_F_CTRL_MAC_ADDR feature is available. */ struct virtio_net_ctrl_mac { __u32 entries; @@ -148,6 +153,7 @@ struct virtio_net_ctrl_mac { #define VIRTIO_NET_CTRL_MAC1 #define VIRTIO_NET_CTRL_MAC_TABLE_SET0 + #define VIRTIO_NET_CTRL_MAC_ADDR_SET 1 /* * Control VLAN filtering -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC PATCH 1/2] move virtnet_send_command() above virtnet_set_mac_address()
On Thu, Jan 10, 2013 at 10:51:52PM +0800, Jason Wang wrote: On 01/10/2013 10:45 PM, ak...@redhat.com wrote: From: Amos Kong ak...@redhat.com We will send vq command to set mac address in virtnet_set_mac_address() a little fix of coding style Maybe what you need is just a forward declaration. Nah functions should be ordered sensibly. Signed-off-by: Amos Kong ak...@redhat.com -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC PATCH 0/2] make mac programming for virtio net more robust
On Thu, Jan 10, 2013 at 10:45:39PM +0800, ak...@redhat.com wrote: From: Amos Kong ak...@redhat.com Currenly mac is programmed byte by byte. This means that we have an intermediate step where mac is wrong. Second patch introduced a new vq control command to set mac address in one time. I just posted RFC patches (kernel qemu) out, specifiction update will be sent when the solution is accepted. MST has another solution: 1. add a feature to disable mac address in pci space 2. introduce a new vq control cmd to add new mac address to mac filter table. (no long check n-mac in receiver_filter()) Welcome to give comments, thanks! Amos -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC PATCH 2/2] virtio-net: introduce a new control to set macaddr
On Thu, Jan 10, 2013 at 10:45:41PM +0800, ak...@redhat.com wrote: From: Amos Kong ak...@redhat.com Currently we write MAC address to pci config space byte by byte, this means that we have an intermediate step where mac is wrong. This patch introduced a new control command to set MAC address in one time. VIRTIO_NET_F_CTRL_MAC_ADDR is a new feature bit for compatibility. Signed-off-by: Amos Kong ak...@redhat.com --- drivers/net/virtio_net.c| 16 +++- include/uapi/linux/virtio_net.h | 8 +++- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 395ab4f..ff22bcd 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -803,13 +803,26 @@ static int virtnet_set_mac_address(struct net_device *dev, void *p) struct virtio_device *vdev = vi-vdev; int ret; + struct scatterlist sg; + ret = eth_mac_addr(dev, p); if (ret) return ret; - if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC)) + if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR)) { + /* Set MAC address by sending vq command */ + sg_init_one(sg, dev-dev_addr, dev-addr_len); + virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC, + VIRTIO_NET_CTRL_MAC_ADDR_SET, + sg, 1, 0); I think we should check status and return an error if the command failed. Also, delay eth_mac_addr until after it passes. + return 0; + } + + if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC)) { + /* Set MAC address by writing config space */ vdev-config-set(vdev, offsetof(struct virtio_net_config, mac), dev-dev_addr, dev-addr_len); + } By the way, why don't we fail the command if VIRTIO_NET_F_MAC is off? Rusty? return 0; } @@ -1627,6 +1640,7 @@ static unsigned int features[] = { VIRTIO_NET_F_MRG_RXBUF, VIRTIO_NET_F_STATUS, VIRTIO_NET_F_CTRL_VQ, VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN, VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_MQ, + VIRTIO_NET_F_CTRL_MAC_ADDR, }; static struct virtio_driver virtio_net_driver = { diff --git a/include/uapi/linux/virtio_net.h b/include/uapi/linux/virtio_net.h index 848e358..a5a8c88 100644 --- a/include/uapi/linux/virtio_net.h +++ b/include/uapi/linux/virtio_net.h @@ -53,6 +53,7 @@ * network */ #define VIRTIO_NET_F_MQ 22 /* Device supports Receive Flow * Steering */ +#define VIRTIO_NET_F_CTRL_MAC_ADDR 23/* Set MAC address */ #define VIRTIO_NET_S_LINK_UP 1 /* Link is up */ #define VIRTIO_NET_S_ANNOUNCE2 /* Announcement is needed */ @@ -127,7 +128,7 @@ typedef __u8 virtio_net_ctrl_ack; #define VIRTIO_NET_CTRL_RX_NOBCAST 5 /* - * Control the MAC filter table. + * Control the MAC * * The MAC filter table is managed by the hypervisor, the guest should * assume the size is infinite. Filtering should be considered @@ -140,6 +141,10 @@ typedef __u8 virtio_net_ctrl_ack; * first sg list contains unicast addresses, the second is for multicast. * This functionality is present if the VIRTIO_NET_F_CTRL_RX feature * is available. + * + * The ADDR_SET command requests one out scatterlist, it contains a + * 6 bytes MAC address. This functionality is present if the + * VIRTIO_NET_F_CTRL_MAC_ADDR feature is available. */ struct virtio_net_ctrl_mac { __u32 entries; @@ -148,6 +153,7 @@ struct virtio_net_ctrl_mac { #define VIRTIO_NET_CTRL_MAC1 #define VIRTIO_NET_CTRL_MAC_TABLE_SET0 + #define VIRTIO_NET_CTRL_MAC_ADDR_SET 1 /* * Control VLAN filtering -- 1.7.11.7 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC PATCH 0/2] make mac programming for virtio net more robust
On Thu, Jan 10, 2013 at 10:45:39PM +0800, ak...@redhat.com wrote: From: Amos Kong ak...@redhat.com Currenly mac is programmed byte by byte. This means that we have an intermediate step where mac is wrong. Second patch introduced a new vq control command to set mac address in one time. As you mention we could alternatively do it without new commands, simply add a feature bit that says that MACs are in the mac table. This would be a much bigger patch, and I'm fine with either way. Rusty what do you think? Amos Kong (2): move virtnet_send_command() above virtnet_set_mac_address() virtio-net: introduce a new control to set macaddr drivers/net/virtio_net.c| 105 ++-- include/uapi/linux/virtio_net.h | 8 ++- 2 files changed, 66 insertions(+), 47 deletions(-) -- 1.7.11.7 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2 0/2] KVM/ARM implementation of PSCI
This is an implementation of the Power State Coordination Interface, as described there (registration required): http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.den0022a/index.html It allows CPUs to be powered on, off, suspended or migrated. This series is only concerned with power-{on,off}. KVM Tool uses it to boot SMP VMs. This is also supported by the mach-virt platform: git://git.kernel.org/pub/scm/linux/kernel/git/will/linux.git virt/mach Tested on the arm64 model, based on kvm-arm-master as of today. From v1: - Documentation - Move the one-time init to a separate function - One mask fits all - Use wait_event_interruptible instead of the open coded version Marc Zyngier (2): ARM: KVM: move one-time init to its own function ARM: KVM: Power State Coordination Interface implementation Documentation/virtual/kvm/api.txt | 4 ++ arch/arm/include/asm/kvm_emulate.h | 5 +++ arch/arm/include/asm/kvm_host.h| 5 ++- arch/arm/include/asm/kvm_psci.h| 23 ++ arch/arm/include/uapi/asm/kvm.h| 16 +++ arch/arm/kvm/Makefile | 2 +- arch/arm/kvm/arm.c | 59 + arch/arm/kvm/psci.c| 90 ++ include/uapi/linux/kvm.h | 1 + 9 files changed, 184 insertions(+), 21 deletions(-) create mode 100644 arch/arm/include/asm/kvm_psci.h create mode 100644 arch/arm/kvm/psci.c -- 1.8.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2 1/2] ARM: KVM: move one-time init to its own function
As more code to be run only once per vcpu is expected, move the VGIC init code into a separate function. Signed-off-by: Marc Zyngier marc.zyng...@arm.com --- arch/arm/include/asm/kvm_host.h | 3 +++ arch/arm/kvm/arm.c | 33 +++-- 2 files changed, 26 insertions(+), 10 deletions(-) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 334b81d..14a71bb 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -127,6 +127,9 @@ struct kvm_vcpu_arch { /* Cache some mmu pages needed inside spinlock regions */ struct kvm_mmu_memory_cache mmu_page_cache; + + /* Detect first run of a vcpu */ + bool has_run_once; }; struct kvm_vm_stat { diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index dc67a3c..a28e49d 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -643,6 +643,26 @@ static int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run, } } +static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu) +{ + if (likely(vcpu-arch.has_run_once)) + return 0; + + vcpu-arch.has_run_once = true; + + /* +* Initialize the VGIC before running a vcpu the first time on +* this VM. +*/ + if (irqchip_in_kernel(vcpu-kvm) !vgic_initialized(vcpu-kvm)) { + int ret = kvm_vgic_init(vcpu-kvm); + if (ret) + return ret; + } + + return 0; +} + /** * kvm_arch_vcpu_ioctl_run - the main VCPU run function to execute guest code * @vcpu: The VCPU pointer @@ -663,16 +683,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) if (unlikely(vcpu-arch.target 0)) return -ENOEXEC; - /* -* Initialize the VGIC before running a vcpu the first time on -* this VM. -*/ - if (irqchip_in_kernel(vcpu-kvm) - unlikely(!vgic_initialized(vcpu-kvm))) { - ret = kvm_vgic_init(vcpu-kvm); - if (ret) - return ret; - } + ret = kvm_vcpu_first_run_init(vcpu); + if (ret) + return ret; if (run-exit_reason == KVM_EXIT_MMIO) { ret = kvm_handle_mmio_return(vcpu, vcpu-run); -- 1.8.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2 2/2] ARM: KVM: Power State Coordination Interface implementation
Implement the PSCI specification (ARM DEN 0022A) to control virtual CPUs being powered on or off. PSCI/KVM is detected using the KVM_CAP_ARM_PSCI capability. A virtual CPU can now be initialized in a powered off state, using the KVM_ARM_VCPU_POWER_OFF feature flag. The guest can use either SMC or HVC to execute a PSCI function. Signed-off-by: Marc Zyngier marc.zyng...@arm.com --- Documentation/virtual/kvm/api.txt | 4 ++ arch/arm/include/asm/kvm_emulate.h | 5 +++ arch/arm/include/asm/kvm_host.h| 2 +- arch/arm/include/asm/kvm_psci.h| 23 ++ arch/arm/include/uapi/asm/kvm.h| 16 +++ arch/arm/kvm/Makefile | 2 +- arch/arm/kvm/arm.c | 26 +++ arch/arm/kvm/psci.c| 90 ++ include/uapi/linux/kvm.h | 1 + 9 files changed, 158 insertions(+), 11 deletions(-) create mode 100644 arch/arm/include/asm/kvm_psci.h create mode 100644 arch/arm/kvm/psci.c diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 497fd48..e0fa0ea 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -2185,6 +2185,10 @@ return ENOEXEC for that vcpu. Note that because some registers reflect machine topology, all vcpus should be created before this ioctl is invoked. +Possible features: + - KVM_ARM_VCPU_POWER_OFF: Starts the CPU in a power-off state. + Depends on KVM_CAP_ARM_PSCI. + 4.78 KVM_GET_REG_LIST diff --git a/arch/arm/include/asm/kvm_emulate.h b/arch/arm/include/asm/kvm_emulate.h index 375795b..eb07342 100644 --- a/arch/arm/include/asm/kvm_emulate.h +++ b/arch/arm/include/asm/kvm_emulate.h @@ -34,6 +34,11 @@ void kvm_inject_undefined(struct kvm_vcpu *vcpu); void kvm_inject_dabt(struct kvm_vcpu *vcpu, unsigned long addr); void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr); +static inline bool vcpu_mode_is_32bit(struct kvm_vcpu *vcpu) +{ + return 1; +} + static inline u32 *vcpu_pc(struct kvm_vcpu *vcpu) { return (u32 *)vcpu-arch.regs.usr_regs.ARM_pc; diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 14a71bb..b8eb3e0 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -31,7 +31,7 @@ #define KVM_COALESCED_MMIO_PAGE_OFFSET 1 #define KVM_HAVE_ONE_REG -#define KVM_VCPU_MAX_FEATURES 0 +#define KVM_VCPU_MAX_FEATURES 1 /* We don't currently support large pages. */ #define KVM_HPAGE_GFN_SHIFT(x) 0 diff --git a/arch/arm/include/asm/kvm_psci.h b/arch/arm/include/asm/kvm_psci.h new file mode 100644 index 000..992d7f1 --- /dev/null +++ b/arch/arm/include/asm/kvm_psci.h @@ -0,0 +1,23 @@ +/* + * Copyright (C) 2012 - ARM Ltd + * Author: Marc Zyngier marc.zyng...@arm.com + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. + */ + +#ifndef __ARM_KVM_PSCI_H__ +#define __ARM_KVM_PSCI_H__ + +int kvm_psci_call(struct kvm_vcpu *vcpu); + +#endif /* __ARM_KVM_PSCI_H__ */ diff --git a/arch/arm/include/uapi/asm/kvm.h b/arch/arm/include/uapi/asm/kvm.h index 94e893b..972b90d 100644 --- a/arch/arm/include/uapi/asm/kvm.h +++ b/arch/arm/include/uapi/asm/kvm.h @@ -81,6 +81,8 @@ struct kvm_regs { #define KVM_VGIC_V2_DIST_SIZE 0x1000 #define KVM_VGIC_V2_CPU_SIZE 0x2000 +#define KVM_ARM_VCPU_POWER_OFF 0 /* CPU is started in OFF state */ + struct kvm_vcpu_init { __u32 target; __u32 features[7]; @@ -161,4 +163,18 @@ struct kvm_arch_memory_slot { /* Highest supported SPI, from VGIC_NR_IRQS */ #define KVM_ARM_IRQ_GIC_MAX127 +/* PSCI interface */ +#define KVM_PSCI_FN_BASE 0x95c1ba5e +#define KVM_PSCI_FN(n) (KVM_PSCI_FN_BASE + (n)) + +#define KVM_PSCI_FN_CPU_SUSPENDKVM_PSCI_FN(0) +#define KVM_PSCI_FN_CPU_OFFKVM_PSCI_FN(1) +#define KVM_PSCI_FN_CPU_ON KVM_PSCI_FN(2) +#define KVM_PSCI_FN_MIGRATEKVM_PSCI_FN(3) + +#define KVM_PSCI_RET_SUCCESS 0 +#define KVM_PSCI_RET_NI((unsigned long)-1) +#define KVM_PSCI_RET_INVAL ((unsigned long)-2) +#define KVM_PSCI_RET_DENIED((unsigned long)-3) + #endif /* __ARM_KVM_H__ */ diff --git a/arch/arm/kvm/Makefile b/arch/arm/kvm/Makefile index 43c4cad..6a2d571 100644 --- a/arch/arm/kvm/Makefile +++ b/arch/arm/kvm/Makefile @@ -18,6 +18,6 @@ kvm-arm-y = $(addprefix ../../../virt/kvm/,
Re: [PATCH v5 5/5] KVM: x86: improve reexecute_instruction
On Tue, Jan 08, 2013 at 02:38:36PM +0800, Xiao Guangrong wrote: The current reexecute_instruction can not well detect the failed instruction emulation. It allows guest to retry all the instructions except it accesses on error pfn For example, some cases are nested-write-protect - if the page we want to write is used as PDE but it chains to itself. Under this case, we should stop the emulation and report the case to userspace Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com --- arch/x86/include/asm/kvm_host.h |7 +++ arch/x86/kvm/paging_tmpl.h | 27 --- arch/x86/kvm/x86.c |8 +++- 3 files changed, 34 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c431b33..d6ab8d2 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -502,6 +502,13 @@ struct kvm_vcpu_arch { u64 msr_val; struct gfn_to_hva_cache data; } pv_eoi; + + /* + * Indicate whether the access faults on its page table in guest + * which is set when fix page fault and used to detect unhandeable + * instruction. + */ + bool write_fault_to_shadow_pgtable; }; struct kvm_lpage_info { diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 67b390d..df50560 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -497,26 +497,34 @@ out_gpte_changed: * created when kvm establishes shadow page table that stop kvm using large * page size. Do it early can avoid unnecessary #PF and emulation. * + * @write_fault_to_shadow_pgtable will return true if the fault gfn is + * currently used as its page table. + * * Note: the PDPT page table is not checked for PAE-32 bit guest. It is ok * since the PDPT is always shadowed, that means, we can not use large page * size to map the gfn which is used as PDPT. */ static bool FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu, - struct guest_walker *walker, int user_fault) + struct guest_walker *walker, int user_fault, + bool *write_fault_to_shadow_pgtable) { int level; gfn_t mask = ~(KVM_PAGES_PER_HPAGE(walker-level) - 1); + bool self_changed = false; if (!(walker-pte_access ACC_WRITE_MASK || (!is_write_protection(vcpu) !user_fault))) return false; - for (level = walker-level; level = walker-max_level; level++) - if (!((walker-gfn ^ walker-table_gfn[level - 1]) mask)) - return true; + for (level = walker-level; level = walker-max_level; level++) { + gfn_t gfn = walker-gfn ^ walker-table_gfn[level - 1]; + + self_changed |= !(gfn mask); + *write_fault_to_shadow_pgtable |= !gfn; + } - return false; + return self_changed; } /* @@ -544,7 +552,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, int level = PT_PAGE_TABLE_LEVEL; int force_pt_level; unsigned long mmu_seq; - bool map_writable; + bool map_writable, is_self_change_mapping; pgprintk(%s: addr %lx err %x\n, __func__, addr, error_code); @@ -572,9 +580,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, return 0; } + vcpu-arch.write_fault_to_shadow_pgtable = false; + + is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu, + walker, user_fault, vcpu-arch.write_fault_to_shadow_pgtable); + if (walker.level = PT_DIRECTORY_LEVEL) force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn) -|| FNAME(is_self_change_mapping)(vcpu, walker, user_fault); +|| is_self_change_mapping; else force_pt_level = 1; if (!force_pt_level) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6f13e03..2957012 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4810,7 +4810,13 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2) * guest to let CPU execute the instruction. */ kvm_mmu_unprotect_page(vcpu-kvm, gpa_to_gfn(gpa)); - return true; + + /* + * If the access faults on its page table, it can not + * be fixed by unprotecting shadow page and it should + * be reported to userspace. + */ + return !vcpu-arch.write_fault_to_shadow_pgtable; } static bool retry_instruction(struct x86_emulate_ctxt *ctxt, Also should make sure vcpu-arch.write_fault_to_shadow_pgtable is never reused. Say, clean when exiting x86_emulate_instruction? -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More
Re: [PATCH v5 5/5] KVM: x86: improve reexecute_instruction
On Tue, Jan 08, 2013 at 02:38:36PM +0800, Xiao Guangrong wrote: The current reexecute_instruction can not well detect the failed instruction emulation. It allows guest to retry all the instructions except it accesses on error pfn For example, some cases are nested-write-protect - if the page we want to write is used as PDE but it chains to itself. Under this case, we should stop the emulation and report the case to userspace Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com --- arch/x86/include/asm/kvm_host.h |7 +++ arch/x86/kvm/paging_tmpl.h | 27 --- arch/x86/kvm/x86.c |8 +++- 3 files changed, 34 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c431b33..d6ab8d2 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -502,6 +502,13 @@ struct kvm_vcpu_arch { u64 msr_val; struct gfn_to_hva_cache data; } pv_eoi; + + /* + * Indicate whether the access faults on its page table in guest + * which is set when fix page fault and used to detect unhandeable + * instruction. + */ + bool write_fault_to_shadow_pgtable; }; struct kvm_lpage_info { diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 67b390d..df50560 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -497,26 +497,34 @@ out_gpte_changed: * created when kvm establishes shadow page table that stop kvm using large * page size. Do it early can avoid unnecessary #PF and emulation. * + * @write_fault_to_shadow_pgtable will return true if the fault gfn is + * currently used as its page table. + * * Note: the PDPT page table is not checked for PAE-32 bit guest. It is ok * since the PDPT is always shadowed, that means, we can not use large page * size to map the gfn which is used as PDPT. */ static bool FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu, - struct guest_walker *walker, int user_fault) + struct guest_walker *walker, int user_fault, + bool *write_fault_to_shadow_pgtable) { int level; gfn_t mask = ~(KVM_PAGES_PER_HPAGE(walker-level) - 1); + bool self_changed = false; if (!(walker-pte_access ACC_WRITE_MASK || (!is_write_protection(vcpu) !user_fault))) return false; - for (level = walker-level; level = walker-max_level; level++) - if (!((walker-gfn ^ walker-table_gfn[level - 1]) mask)) - return true; + for (level = walker-level; level = walker-max_level; level++) { + gfn_t gfn = walker-gfn ^ walker-table_gfn[level - 1]; + + self_changed |= !(gfn mask); + *write_fault_to_shadow_pgtable |= !gfn; + } - return false; + return self_changed; } /* @@ -544,7 +552,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, int level = PT_PAGE_TABLE_LEVEL; int force_pt_level; unsigned long mmu_seq; - bool map_writable; + bool map_writable, is_self_change_mapping; pgprintk(%s: addr %lx err %x\n, __func__, addr, error_code); @@ -572,9 +580,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, return 0; } + vcpu-arch.write_fault_to_shadow_pgtable = false; + + is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu, + walker, user_fault, vcpu-arch.write_fault_to_shadow_pgtable); + if (walker.level = PT_DIRECTORY_LEVEL) force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn) -|| FNAME(is_self_change_mapping)(vcpu, walker, user_fault); +|| is_self_change_mapping; else force_pt_level = 1; if (!force_pt_level) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6f13e03..2957012 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4810,7 +4810,13 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2) * guest to let CPU execute the instruction. */ kvm_mmu_unprotect_page(vcpu-kvm, gpa_to_gfn(gpa)); - return true; + + /* + * If the access faults on its page table, it can not + * be fixed by unprotecting shadow page and it should + * be reported to userspace. + */ + return !vcpu-arch.write_fault_to_shadow_pgtable; } This sounds wrong: only reporting emulation failure in case of a write fault to shadow pagetable? The current pattern is sane: if (condition_1 which allows reexecution is true) return true; if (condition_2 which allows reexecution is true) return true; ... return false; Applied 1-2. -- To
Re: [PATCH v5 5/5] KVM: x86: improve reexecute_instruction
On Thu, Jan 10, 2013 at 03:30:36PM -0200, Marcelo Tosatti wrote: On Tue, Jan 08, 2013 at 02:38:36PM +0800, Xiao Guangrong wrote: The current reexecute_instruction can not well detect the failed instruction emulation. It allows guest to retry all the instructions except it accesses on error pfn For example, some cases are nested-write-protect - if the page we want to write is used as PDE but it chains to itself. Under this case, we should stop the emulation and report the case to userspace Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com --- arch/x86/include/asm/kvm_host.h |7 +++ arch/x86/kvm/paging_tmpl.h | 27 --- arch/x86/kvm/x86.c |8 +++- 3 files changed, 34 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c431b33..d6ab8d2 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -502,6 +502,13 @@ struct kvm_vcpu_arch { u64 msr_val; struct gfn_to_hva_cache data; } pv_eoi; + + /* +* Indicate whether the access faults on its page table in guest +* which is set when fix page fault and used to detect unhandeable +* instruction. +*/ + bool write_fault_to_shadow_pgtable; }; struct kvm_lpage_info { diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 67b390d..df50560 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -497,26 +497,34 @@ out_gpte_changed: * created when kvm establishes shadow page table that stop kvm using large * page size. Do it early can avoid unnecessary #PF and emulation. * + * @write_fault_to_shadow_pgtable will return true if the fault gfn is + * currently used as its page table. + * * Note: the PDPT page table is not checked for PAE-32 bit guest. It is ok * since the PDPT is always shadowed, that means, we can not use large page * size to map the gfn which is used as PDPT. */ static bool FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu, - struct guest_walker *walker, int user_fault) + struct guest_walker *walker, int user_fault, + bool *write_fault_to_shadow_pgtable) { int level; gfn_t mask = ~(KVM_PAGES_PER_HPAGE(walker-level) - 1); + bool self_changed = false; if (!(walker-pte_access ACC_WRITE_MASK || (!is_write_protection(vcpu) !user_fault))) return false; - for (level = walker-level; level = walker-max_level; level++) - if (!((walker-gfn ^ walker-table_gfn[level - 1]) mask)) - return true; + for (level = walker-level; level = walker-max_level; level++) { + gfn_t gfn = walker-gfn ^ walker-table_gfn[level - 1]; + + self_changed |= !(gfn mask); + *write_fault_to_shadow_pgtable |= !gfn; + } - return false; + return self_changed; } /* @@ -544,7 +552,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, int level = PT_PAGE_TABLE_LEVEL; int force_pt_level; unsigned long mmu_seq; - bool map_writable; + bool map_writable, is_self_change_mapping; pgprintk(%s: addr %lx err %x\n, __func__, addr, error_code); @@ -572,9 +580,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, return 0; } + vcpu-arch.write_fault_to_shadow_pgtable = false; + + is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu, + walker, user_fault, vcpu-arch.write_fault_to_shadow_pgtable); + if (walker.level = PT_DIRECTORY_LEVEL) force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn) - || FNAME(is_self_change_mapping)(vcpu, walker, user_fault); + || is_self_change_mapping; else force_pt_level = 1; if (!force_pt_level) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6f13e03..2957012 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4810,7 +4810,13 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2) * guest to let CPU execute the instruction. */ kvm_mmu_unprotect_page(vcpu-kvm, gpa_to_gfn(gpa)); - return true; + + /* +* If the access faults on its page table, it can not +* be fixed by unprotecting shadow page and it should +* be reported to userspace. +*/ + return !vcpu-arch.write_fault_to_shadow_pgtable; } static bool retry_instruction(struct x86_emulate_ctxt *ctxt, Also should make sure vcpu-arch.write_fault_to_shadow_pgtable is never reused. Say, clean when exiting x86_emulate_instruction? Clear it right here for clarity. --
KVM/arm64 updates
Guys, I've updated the branches for KVM/arm64, renaming them all to keep the interested parties on their toes. We now have: - kvm-arm/psci: kvm-arm-master + the KVM/PSCI implementation - kvm-arm/pre-arm64: the above + the cleanup branch - arm64/soc-armv8-model: Catalin Marinas' arm64 branch - arm64/psci: Implementation of PSCI for the above - kvm-arm64/kvm-prereq: a bunch of random bits that KVM/arm requires to compile on arm64. - kvm-arm64/kvm: KVM/arm64 itself, and the only branch you should use unless you have masochistic tendencies. Comes with all the above and Mark Rutland's timer rework. All that is at: git://git.kernel.org/pub/scm/linux/kernel/git/maz/arm-platforms.git You'll also need Will Deacon's KVM Tool port: git://git.kernel.org/pub/scm/linux/kernel/git/will/kvmtool.git kvmtool/arm A few random notes: - If you're using the Foundation Model, use the provided DTS for your host kernel (arch/arm64/boot/dts/foundation-v8.dts). - The only supported models are the AEMv8 and the Foundation models. If you're using something else and have any issue, first reproduce it with one of the supported implementations. What's new: - Rebased on 3.8-rc3 - Resynced with kvm-arm-master (new and improved VGIC API...) - Fixed 32bit VFP world switching Enjoy, M. -- Jazz is not dead. It just smells funny... -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 0/7 -v2] KVM: Alleviate mmu_lock hold time when we start dirty logging
On Tue, Jan 08, 2013 at 07:42:38PM +0900, Takuya Yoshikawa wrote: Changelog v1-v2: The condition in patch 1 was changed like this: npages (mem-flags KVM_MEM_LOG_DIRTY_PAGES) This patch set makes kvm_mmu_slot_remove_write_access() rmap based and adds conditional rescheduling to it. The motivation for this change is of course to reduce the mmu_lock hold time when we start dirty logging for a large memory slot. You may not see the problem if you just give 8GB or less of the memory to the guest with THP enabled on the host -- this is for the worst case. Takuya Yoshikawa (7): KVM: Write protect the updated slot only when dirty logging is enabled KVM: MMU: Remove unused parameter level from __rmap_write_protect() KVM: MMU: Make kvm_mmu_slot_remove_write_access() rmap based KVM: Remove unused slot_bitmap from kvm_mmu_page KVM: Make kvm_mmu_change_mmu_pages() take mmu_lock by itself KVM: Make kvm_mmu_slot_remove_write_access() take mmu_lock by itself KVM: MMU: Conditionally reschedule when kvm_mmu_slot_remove_write_access() takes a long time Documentation/virtual/kvm/mmu.txt |7 arch/x86/include/asm/kvm_host.h |5 --- arch/x86/kvm/mmu.c| 56 +++- arch/x86/kvm/x86.c| 12 --- virt/kvm/kvm_main.c |1 - 5 files changed, 37 insertions(+), 44 deletions(-) -- 1.7.5.4 Reviewed-by: Marcelo Tosatti mtosa...@redhat.com -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v3 0/7] Streamline arithmetic instruction emulation
On Fri, Jan 04, 2013 at 04:18:47PM +0200, Avi Kivity wrote: The current arithmetic instruction emulation is fairly clumsy: after decode, each instruction gets a switch (size), and for every size we fetch the operands, prepare flags, emulate the instruction, then store back the flags and operands. This patchset simplifies things by moving everything into common code except the instruction itself. All the pre- and post- processing is coded just once. The per-instrution code looks like: add %bl, %al ret add %bx, %ax ret add %ebx, %eax ret add %rbx, %rax ret The savings in size, for the ten instructions converted in this patchset, are fairly large: text data bss dec hex filename 63724 0 0 63724f8ec arch/x86/kvm/emulate.o.before 61268 0 0 61268ef54 arch/x86/kvm/emulate.o.after - around 2500 bytes. v3: fix reversed operand order in 2-operand macro v2: rebased Avi Kivity (7): KVM: x86 emulator: framework for streamlining arithmetic opcodes KVM: x86 emulator: Support for declaring single operand fastops KVM: x86 emulator: introduce NoWrite flag KVM: x86 emulator: mark CMP, CMPS, SCAS, TEST as NoWrite KVM: x86 emulator: convert NOT, NEG to fastop KVM: x86 emulator: add macros for defining 2-operand fastop emulation KVM: x86 emulator: convert basic ALU ops to fastop arch/x86/kvm/emulate.c | 215 +++-- 1 file changed, 120 insertions(+), 95 deletions(-) Applied, thanks. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [kvmarm] [PATCH v2 0/2] KVM/ARM implementation of PSCI
On Thu, Jan 10, 2013 at 11:06 AM, Marc Zyngier marc.zyng...@arm.com wrote: This is an implementation of the Power State Coordination Interface, as described there (registration required): http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.den0022a/index.html It allows CPUs to be powered on, off, suspended or migrated. This series is only concerned with power-{on,off}. KVM Tool uses it to boot SMP VMs. This is also supported by the mach-virt platform: git://git.kernel.org/pub/scm/linux/kernel/git/will/linux.git virt/mach Tested on the arm64 model, based on kvm-arm-master as of today. From v1: - Documentation - Move the one-time init to a separate function - One mask fits all - Use wait_event_interruptible instead of the open coded version Thanks, applied. -Christoffer -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v5 5/5] KVM: x86: improve reexecute_instruction
On 01/11/2013 01:26 AM, Marcelo Tosatti wrote: On Tue, Jan 08, 2013 at 02:38:36PM +0800, Xiao Guangrong wrote: The current reexecute_instruction can not well detect the failed instruction emulation. It allows guest to retry all the instructions except it accesses on error pfn For example, some cases are nested-write-protect - if the page we want to write is used as PDE but it chains to itself. Under this case, we should stop the emulation and report the case to userspace Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com --- arch/x86/include/asm/kvm_host.h |7 +++ arch/x86/kvm/paging_tmpl.h | 27 --- arch/x86/kvm/x86.c |8 +++- 3 files changed, 34 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c431b33..d6ab8d2 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -502,6 +502,13 @@ struct kvm_vcpu_arch { u64 msr_val; struct gfn_to_hva_cache data; } pv_eoi; + +/* + * Indicate whether the access faults on its page table in guest + * which is set when fix page fault and used to detect unhandeable + * instruction. + */ +bool write_fault_to_shadow_pgtable; }; struct kvm_lpage_info { diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 67b390d..df50560 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -497,26 +497,34 @@ out_gpte_changed: * created when kvm establishes shadow page table that stop kvm using large * page size. Do it early can avoid unnecessary #PF and emulation. * + * @write_fault_to_shadow_pgtable will return true if the fault gfn is + * currently used as its page table. + * * Note: the PDPT page table is not checked for PAE-32 bit guest. It is ok * since the PDPT is always shadowed, that means, we can not use large page * size to map the gfn which is used as PDPT. */ static bool FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu, - struct guest_walker *walker, int user_fault) + struct guest_walker *walker, int user_fault, + bool *write_fault_to_shadow_pgtable) { int level; gfn_t mask = ~(KVM_PAGES_PER_HPAGE(walker-level) - 1); +bool self_changed = false; if (!(walker-pte_access ACC_WRITE_MASK || (!is_write_protection(vcpu) !user_fault))) return false; -for (level = walker-level; level = walker-max_level; level++) -if (!((walker-gfn ^ walker-table_gfn[level - 1]) mask)) -return true; +for (level = walker-level; level = walker-max_level; level++) { +gfn_t gfn = walker-gfn ^ walker-table_gfn[level - 1]; + +self_changed |= !(gfn mask); +*write_fault_to_shadow_pgtable |= !gfn; +} -return false; +return self_changed; } /* @@ -544,7 +552,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, int level = PT_PAGE_TABLE_LEVEL; int force_pt_level; unsigned long mmu_seq; -bool map_writable; +bool map_writable, is_self_change_mapping; pgprintk(%s: addr %lx err %x\n, __func__, addr, error_code); @@ -572,9 +580,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, return 0; } +vcpu-arch.write_fault_to_shadow_pgtable = false; + +is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu, + walker, user_fault, vcpu-arch.write_fault_to_shadow_pgtable); + if (walker.level = PT_DIRECTORY_LEVEL) force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn) - || FNAME(is_self_change_mapping)(vcpu, walker, user_fault); + || is_self_change_mapping; else force_pt_level = 1; if (!force_pt_level) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6f13e03..2957012 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4810,7 +4810,13 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2) * guest to let CPU execute the instruction. */ kvm_mmu_unprotect_page(vcpu-kvm, gpa_to_gfn(gpa)); -return true; + +/* + * If the access faults on its page table, it can not + * be fixed by unprotecting shadow page and it should + * be reported to userspace. + */ +return !vcpu-arch.write_fault_to_shadow_pgtable; } This sounds wrong: only reporting emulation failure in case of a write fault to shadow pagetable? We suppose unprotecting target-gfn can avoid emulation, the same as current code. :( The current pattern is sane: if (condition_1 which allows reexecution is true) return true; if (condition_2 which allows reexecution is
Re: [PATCH v5 5/5] KVM: x86: improve reexecute_instruction
On 01/11/2013 01:30 AM, Marcelo Tosatti wrote: On Tue, Jan 08, 2013 at 02:38:36PM +0800, Xiao Guangrong wrote: The current reexecute_instruction can not well detect the failed instruction emulation. It allows guest to retry all the instructions except it accesses on error pfn For example, some cases are nested-write-protect - if the page we want to write is used as PDE but it chains to itself. Under this case, we should stop the emulation and report the case to userspace Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com --- arch/x86/include/asm/kvm_host.h |7 +++ arch/x86/kvm/paging_tmpl.h | 27 --- arch/x86/kvm/x86.c |8 +++- 3 files changed, 34 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c431b33..d6ab8d2 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -502,6 +502,13 @@ struct kvm_vcpu_arch { u64 msr_val; struct gfn_to_hva_cache data; } pv_eoi; + +/* + * Indicate whether the access faults on its page table in guest + * which is set when fix page fault and used to detect unhandeable + * instruction. + */ +bool write_fault_to_shadow_pgtable; }; struct kvm_lpage_info { diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 67b390d..df50560 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -497,26 +497,34 @@ out_gpte_changed: * created when kvm establishes shadow page table that stop kvm using large * page size. Do it early can avoid unnecessary #PF and emulation. * + * @write_fault_to_shadow_pgtable will return true if the fault gfn is + * currently used as its page table. + * * Note: the PDPT page table is not checked for PAE-32 bit guest. It is ok * since the PDPT is always shadowed, that means, we can not use large page * size to map the gfn which is used as PDPT. */ static bool FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu, - struct guest_walker *walker, int user_fault) + struct guest_walker *walker, int user_fault, + bool *write_fault_to_shadow_pgtable) { int level; gfn_t mask = ~(KVM_PAGES_PER_HPAGE(walker-level) - 1); +bool self_changed = false; if (!(walker-pte_access ACC_WRITE_MASK || (!is_write_protection(vcpu) !user_fault))) return false; -for (level = walker-level; level = walker-max_level; level++) -if (!((walker-gfn ^ walker-table_gfn[level - 1]) mask)) -return true; +for (level = walker-level; level = walker-max_level; level++) { +gfn_t gfn = walker-gfn ^ walker-table_gfn[level - 1]; + +self_changed |= !(gfn mask); +*write_fault_to_shadow_pgtable |= !gfn; +} -return false; +return self_changed; } /* @@ -544,7 +552,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, int level = PT_PAGE_TABLE_LEVEL; int force_pt_level; unsigned long mmu_seq; -bool map_writable; +bool map_writable, is_self_change_mapping; pgprintk(%s: addr %lx err %x\n, __func__, addr, error_code); @@ -572,9 +580,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, return 0; } +vcpu-arch.write_fault_to_shadow_pgtable = false; + +is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu, + walker, user_fault, vcpu-arch.write_fault_to_shadow_pgtable); + if (walker.level = PT_DIRECTORY_LEVEL) force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn) - || FNAME(is_self_change_mapping)(vcpu, walker, user_fault); + || is_self_change_mapping; else force_pt_level = 1; if (!force_pt_level) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6f13e03..2957012 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4810,7 +4810,13 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2) * guest to let CPU execute the instruction. */ kvm_mmu_unprotect_page(vcpu-kvm, gpa_to_gfn(gpa)); -return true; + +/* + * If the access faults on its page table, it can not + * be fixed by unprotecting shadow page and it should + * be reported to userspace. + */ +return !vcpu-arch.write_fault_to_shadow_pgtable; } static bool retry_instruction(struct x86_emulate_ctxt *ctxt, Also should make sure vcpu-arch.write_fault_to_shadow_pgtable is never reused. Say, clean when exiting x86_emulate_instruction? Yes, it is more clear. But i am thinking if it is really needed because 'cr2' is only valid when it is called on page fault path,
Re: [RFC PATCH 0/9] KVM: PPC: In-kernel PAPR interrupt controller emulation
On 01/10/2013 05:42:16 AM, Alexander Graf wrote: Other instantiation attributes we don't know that early on should be settable between the time frame of vm creation and first execution. An example for this are device addresses. Check out the threads KVM: ARM: Rename KVM_SET_DEVICE_ADDRESS and KVM: ARM: Introduce KVM_SET_DEVICE_ADDRESS ioctl for more information on this particular bit. FWIW, on Freescale chips the MPIC base address can move at runtime if the guest writes to CCSRBAR (though QEMU doesn't currently implement this). -Scott -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Proper taming of oom-killer with kvm
Hi, I have recently run in the following issue: under certain conditions, if emulator process have exceeded its own memory limit in the cgroup and oom shot it, /proc entry may stay long indefinitely. There are two possible side-effects - at first, if one will try to read cmdline from such entry, his request will hang indefinitely long too, e.g. if issuing ``ps aux'' once per minute will fill out default PID limit in less a half of day by ps processes in D state. Second effect may appear only on heavily loaded node - scheduler process will eat 100% of selected cores (almost always at only one), with system becoming unresponsive in a couple of minutes. This should be reproduced easily: - start kvm process - put in to memory cgroup and set the limits - disable oom_killer via oom_control - simply put the process into oom condition (using balloon, it should be very simple) - check kvm process state - if it stuck in D state, all should be okay, since you`re able to catch oom condition - simply send TERM signal and raise memory limit by nonsignificant amount and process wiil end normally. If you`re observing kvm process with triggered under_oom flag and in _sleep_ state, TERM signal will kill it, with nice lock described above. I have solved problem by quite stupid workaround - after getting informed of oom event(w/ disabled oom in cg), I`m freezing kvm process via freezer cg, therefore moving it to D state, then sending it TERM, then raising memory limits and finally unfreezing it - it is very ugly, but at least I have get rid of the problem. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] KVM: trace: Fix exit decoding.
On Tue, Jan 08, 2013 at 01:00:01PM +0100, Cornelia Huck wrote: trace_kvm_userspace_exit has been missing the KVM_EXIT_WATCHDOG exit. CC: Bharat Bhushan r65...@freescale.com Signed-off-by: Cornelia Huck cornelia.h...@de.ibm.com --- include/trace/events/kvm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) Applied, thanks. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC/PATCH 1/1] s390/kvm: Fix BUG in include/linux/kvm_host.h:745
On Tue, Jan 08, 2013 at 04:23:10PM +0100, Christian Borntraeger wrote: commit b080935c8638e08134629d0a9ebdf35669bec14d kvm: Directly account vtime to system on guest switch also removed the irq_disable/enable around kvm guest switch, which is correct in itself. Unfortunately, there is a BUG ON that (correctly) checks for preemptible to cover the call to rcu later on. (Introduced with commit 8fa2206821953a50a3a02ea33fcfb3ced2fd9997 KVM: make guest mode entry to be rcu quiescent state) This check might trigger depending on the kernel config. Lets make sure that no preemption happens during kvm_guest_enter. We can enable preemption again after the call to rcu_virt_note_context_switch returns. Please note that we continue to run s390 guests with interrupts enabled. CC: Frederic Weisbecker fweis...@gmail.com CC: Gleb Natapov g...@redhat.com Signed-off-by: Christian Borntraeger borntrae...@de.ibm.com Applied, thanks. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v5 5/5] KVM: x86: improve reexecute_instruction
On Fri, Jan 11, 2013 at 02:05:33AM +0800, Xiao Guangrong wrote: On 01/11/2013 01:26 AM, Marcelo Tosatti wrote: On Tue, Jan 08, 2013 at 02:38:36PM +0800, Xiao Guangrong wrote: The current reexecute_instruction can not well detect the failed instruction emulation. It allows guest to retry all the instructions except it accesses on error pfn For example, some cases are nested-write-protect - if the page we want to write is used as PDE but it chains to itself. Under this case, we should stop the emulation and report the case to userspace Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com --- arch/x86/include/asm/kvm_host.h |7 +++ arch/x86/kvm/paging_tmpl.h | 27 --- arch/x86/kvm/x86.c |8 +++- 3 files changed, 34 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c431b33..d6ab8d2 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -502,6 +502,13 @@ struct kvm_vcpu_arch { u64 msr_val; struct gfn_to_hva_cache data; } pv_eoi; + + /* + * Indicate whether the access faults on its page table in guest + * which is set when fix page fault and used to detect unhandeable + * instruction. + */ + bool write_fault_to_shadow_pgtable; }; struct kvm_lpage_info { diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 67b390d..df50560 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -497,26 +497,34 @@ out_gpte_changed: * created when kvm establishes shadow page table that stop kvm using large * page size. Do it early can avoid unnecessary #PF and emulation. * + * @write_fault_to_shadow_pgtable will return true if the fault gfn is + * currently used as its page table. + * * Note: the PDPT page table is not checked for PAE-32 bit guest. It is ok * since the PDPT is always shadowed, that means, we can not use large page * size to map the gfn which is used as PDPT. */ static bool FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu, -struct guest_walker *walker, int user_fault) +struct guest_walker *walker, int user_fault, +bool *write_fault_to_shadow_pgtable) { int level; gfn_t mask = ~(KVM_PAGES_PER_HPAGE(walker-level) - 1); + bool self_changed = false; if (!(walker-pte_access ACC_WRITE_MASK || (!is_write_protection(vcpu) !user_fault))) return false; - for (level = walker-level; level = walker-max_level; level++) - if (!((walker-gfn ^ walker-table_gfn[level - 1]) mask)) - return true; + for (level = walker-level; level = walker-max_level; level++) { + gfn_t gfn = walker-gfn ^ walker-table_gfn[level - 1]; + + self_changed |= !(gfn mask); + *write_fault_to_shadow_pgtable |= !gfn; + } - return false; + return self_changed; } /* @@ -544,7 +552,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, int level = PT_PAGE_TABLE_LEVEL; int force_pt_level; unsigned long mmu_seq; - bool map_writable; + bool map_writable, is_self_change_mapping; pgprintk(%s: addr %lx err %x\n, __func__, addr, error_code); @@ -572,9 +580,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, return 0; } + vcpu-arch.write_fault_to_shadow_pgtable = false; + + is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu, +walker, user_fault, vcpu-arch.write_fault_to_shadow_pgtable); + if (walker.level = PT_DIRECTORY_LEVEL) force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn) - || FNAME(is_self_change_mapping)(vcpu, walker, user_fault); + || is_self_change_mapping; else force_pt_level = 1; if (!force_pt_level) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6f13e03..2957012 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4810,7 +4810,13 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2) * guest to let CPU execute the instruction. */ kvm_mmu_unprotect_page(vcpu-kvm, gpa_to_gfn(gpa)); - return true; + + /* + * If the access faults on its page table, it can not + * be fixed by unprotecting shadow page and it should + * be reported to userspace. + */ + return !vcpu-arch.write_fault_to_shadow_pgtable; } This sounds wrong: only reporting emulation failure in case of a write fault to shadow pagetable? We suppose unprotecting target-gfn can avoid emulation, the same as current code. :( Current code treats access to non-mapped guest address as
Re: [PATCH v5 5/5] KVM: x86: improve reexecute_instruction
On 01/11/2013 03:48 AM, Marcelo Tosatti wrote: On Fri, Jan 11, 2013 at 02:05:33AM +0800, Xiao Guangrong wrote: On 01/11/2013 01:26 AM, Marcelo Tosatti wrote: On Tue, Jan 08, 2013 at 02:38:36PM +0800, Xiao Guangrong wrote: The current reexecute_instruction can not well detect the failed instruction emulation. It allows guest to retry all the instructions except it accesses on error pfn For example, some cases are nested-write-protect - if the page we want to write is used as PDE but it chains to itself. Under this case, we should stop the emulation and report the case to userspace Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com --- arch/x86/include/asm/kvm_host.h |7 +++ arch/x86/kvm/paging_tmpl.h | 27 --- arch/x86/kvm/x86.c |8 +++- 3 files changed, 34 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c431b33..d6ab8d2 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -502,6 +502,13 @@ struct kvm_vcpu_arch { u64 msr_val; struct gfn_to_hva_cache data; } pv_eoi; + + /* + * Indicate whether the access faults on its page table in guest + * which is set when fix page fault and used to detect unhandeable + * instruction. + */ + bool write_fault_to_shadow_pgtable; }; struct kvm_lpage_info { diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 67b390d..df50560 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -497,26 +497,34 @@ out_gpte_changed: * created when kvm establishes shadow page table that stop kvm using large * page size. Do it early can avoid unnecessary #PF and emulation. * + * @write_fault_to_shadow_pgtable will return true if the fault gfn is + * currently used as its page table. + * * Note: the PDPT page table is not checked for PAE-32 bit guest. It is ok * since the PDPT is always shadowed, that means, we can not use large page * size to map the gfn which is used as PDPT. */ static bool FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu, -struct guest_walker *walker, int user_fault) +struct guest_walker *walker, int user_fault, +bool *write_fault_to_shadow_pgtable) { int level; gfn_t mask = ~(KVM_PAGES_PER_HPAGE(walker-level) - 1); + bool self_changed = false; if (!(walker-pte_access ACC_WRITE_MASK || (!is_write_protection(vcpu) !user_fault))) return false; - for (level = walker-level; level = walker-max_level; level++) - if (!((walker-gfn ^ walker-table_gfn[level - 1]) mask)) - return true; + for (level = walker-level; level = walker-max_level; level++) { + gfn_t gfn = walker-gfn ^ walker-table_gfn[level - 1]; + + self_changed |= !(gfn mask); + *write_fault_to_shadow_pgtable |= !gfn; + } - return false; + return self_changed; } /* @@ -544,7 +552,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, int level = PT_PAGE_TABLE_LEVEL; int force_pt_level; unsigned long mmu_seq; - bool map_writable; + bool map_writable, is_self_change_mapping; pgprintk(%s: addr %lx err %x\n, __func__, addr, error_code); @@ -572,9 +580,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, return 0; } + vcpu-arch.write_fault_to_shadow_pgtable = false; + + is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu, +walker, user_fault, vcpu-arch.write_fault_to_shadow_pgtable); + if (walker.level = PT_DIRECTORY_LEVEL) force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn) - || FNAME(is_self_change_mapping)(vcpu, walker, user_fault); + || is_self_change_mapping; else force_pt_level = 1; if (!force_pt_level) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6f13e03..2957012 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4810,7 +4810,13 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2) * guest to let CPU execute the instruction. */ kvm_mmu_unprotect_page(vcpu-kvm, gpa_to_gfn(gpa)); - return true; + + /* + * If the access faults on its page table, it can not + * be fixed by unprotecting shadow page and it should + * be reported to userspace. + */ + return !vcpu-arch.write_fault_to_shadow_pgtable; } This sounds wrong: only reporting emulation failure in case of a write fault to shadow pagetable? We suppose unprotecting target-gfn can avoid emulation, the same as current code. :( Current code treats access to non-mapped guest address as indication to exit reporting emulation failure. The patch above restricts
Re: [PULL 0/2] vfio-pci: Fixes for 1.4 stable
Pulled, thanks. Regards, Anthony Liguori -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v9 1/3] x86, apicv: add APICv register virtualization support
On Thu, Jan 10, 2013 at 03:26:06PM +0800, Yang Zhang wrote: - APIC read doesn't cause VM-Exit - APIC write becomes trap-like Signed-off-by: Kevin Tian kevin.t...@intel.com Signed-off-by: Yang Zhang yang.z.zh...@intel.com --- arch/x86/include/asm/vmx.h |2 ++ arch/x86/kvm/lapic.c | 15 +++ arch/x86/kvm/lapic.h |2 ++ arch/x86/kvm/vmx.c | 33 - 4 files changed, 51 insertions(+), 1 deletions(-) diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index e385df9..44c3f7e 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -66,6 +66,7 @@ #define EXIT_REASON_EPT_MISCONFIG 49 #define EXIT_REASON_WBINVD 54 #define EXIT_REASON_XSETBV 55 +#define EXIT_REASON_APIC_WRITE 56 #define EXIT_REASON_INVPCID 58 #define VMX_EXIT_REASONS \ @@ -141,6 +142,7 @@ #define SECONDARY_EXEC_ENABLE_VPID 0x0020 #define SECONDARY_EXEC_WBINVD_EXITING0x0040 #define SECONDARY_EXEC_UNRESTRICTED_GUEST0x0080 +#define SECONDARY_EXEC_APIC_REGISTER_VIRT 0x0100 #define SECONDARY_EXEC_PAUSE_LOOP_EXITING0x0400 #define SECONDARY_EXEC_ENABLE_INVPCID0x1000 diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 9392f52..0664c13 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1212,6 +1212,21 @@ void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi); +/* emulate APIC access in a trap manner */ +void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset) +{ + u32 val = 0; + + /* hw has done the conditional check and inst decode */ + offset = 0xff0; + + apic_reg_read(vcpu-arch.apic, offset, 4, val); + + /* TODO: optimize to just emulate side effect w/o one more write */ + apic_reg_write(vcpu-arch.apic, offset, val); +} +EXPORT_SYMBOL_GPL(kvm_apic_write_nodecode); + void kvm_free_lapic(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu-arch.apic; diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index e5ebf9f..9a8ee22 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -64,6 +64,8 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu); void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data); +void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset); + void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr); void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu); void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 55dfc37..688f43f 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -84,6 +84,9 @@ module_param(vmm_exclusive, bool, S_IRUGO); static bool __read_mostly fasteoi = 1; module_param(fasteoi, bool, S_IRUGO); +static bool __read_mostly enable_apicv_reg_vid = 1; +module_param(enable_apicv_reg_vid, bool, S_IRUGO); + /* * If nested=1, nested virtualization is supported, i.e., guests may use * VMX and be a hypervisor for its own guests. If nested=0, guests may not @@ -764,6 +767,12 @@ static inline bool cpu_has_vmx_virtualize_apic_accesses(void) SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; } +static inline bool cpu_has_vmx_apic_register_virt(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl + SECONDARY_EXEC_APIC_REGISTER_VIRT; +} + static inline bool cpu_has_vmx_flexpriority(void) { return cpu_has_vmx_tpr_shadow() @@ -2541,7 +2550,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) SECONDARY_EXEC_UNRESTRICTED_GUEST | SECONDARY_EXEC_PAUSE_LOOP_EXITING | SECONDARY_EXEC_RDTSCP | - SECONDARY_EXEC_ENABLE_INVPCID; + SECONDARY_EXEC_ENABLE_INVPCID | + SECONDARY_EXEC_APIC_REGISTER_VIRT; if (adjust_vmx_controls(min2, opt2, MSR_IA32_VMX_PROCBASED_CTLS2, _cpu_based_2nd_exec_control) 0) @@ -2552,6 +2562,11 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) _cpu_based_exec_control = ~CPU_BASED_TPR_SHADOW; #endif + + if (!(_cpu_based_exec_control CPU_BASED_TPR_SHADOW)) + _cpu_based_2nd_exec_control = ~( + SECONDARY_EXEC_APIC_REGISTER_VIRT); No need for () around SECONDARY_EXEC_APIC_REGISTER_VIRT. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at
Re: [PATCH v9 3/3] x86, apicv: add virtual interrupt delivery support
Hi, Getting into good shape. On Thu, Jan 10, 2013 at 03:26:08PM +0800, Yang Zhang wrote: From: Yang Zhang yang.z.zh...@intel.com Virtual interrupt delivery avoids KVM to inject vAPIC interrupts manually, which is fully taken care of by the hardware. This needs some special awareness into existing interrupr injection path: - for pending interrupt, instead of direct injection, we may need update architecture specific indicators before resuming to guest. - A pending interrupt, which is masked by ISR, should be also considered in above update action, since hardware will decide when to inject it at right time. Current has_interrupt and get_interrupt only returns a valid vector from injection p.o.v. Signed-off-by: Kevin Tian kevin.t...@intel.com Signed-off-by: Yang Zhang yang.z.zh...@intel.com --- arch/x86/include/asm/kvm_host.h |5 + arch/x86/include/asm/vmx.h | 11 +++ arch/x86/kvm/irq.c | 56 +++- arch/x86/kvm/lapic.c| 72 +-- arch/x86/kvm/lapic.h| 23 + arch/x86/kvm/svm.c | 18 arch/x86/kvm/vmx.c | 191 +-- arch/x86/kvm/x86.c | 14 +++- include/linux/kvm_host.h|3 + virt/kvm/ioapic.c | 18 virt/kvm/ioapic.h |4 + virt/kvm/irq_comm.c | 22 + virt/kvm/kvm_main.c |5 + 13 files changed, 399 insertions(+), 43 deletions(-) static void recalculate_apic_map(struct kvm *kvm) { struct kvm_apic_map *new, *old = NULL; @@ -236,12 +219,14 @@ static inline void kvm_apic_set_id(struct kvm_lapic *apic, u8 id) { apic_set_reg(apic, APIC_ID, id 24); recalculate_apic_map(apic-vcpu-kvm); + ioapic_update_eoi_exitmap(apic-vcpu-kvm); } Move ioapic_update_eoi_exitmap into recalculate_apic_map. static inline void kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id) { apic_set_reg(apic, APIC_LDR, id); recalculate_apic_map(apic-vcpu-kvm); + ioapic_update_eoi_exitmap(apic-vcpu-kvm); } diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index b203ce7..990409a 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -434,6 +434,7 @@ struct vcpu_vmx { bool rdtscp_enabled; bool virtual_x2apic_enabled; + unsigned long eoi_exit_bitmap[4]; Use DECLARE_BITMAP (unsigned long is 4 bytes on 32-bit host). /* Support for a guest hypervisor (nested VMX) */ struct nested_vmx nested; @@ -783,7 +784,8 @@ static inline bool cpu_has_vmx_apic_register_virt(void) static inline bool cpu_has_vmx_virtual_intr_delivery(void) { - return false; + return vmcs_config.cpu_based_2nd_exec_ctrl + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY; } static inline bool cpu_has_vmx_flexpriority(void) @@ -2565,7 +2567,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) SECONDARY_EXEC_PAUSE_LOOP_EXITING | SECONDARY_EXEC_RDTSCP | SECONDARY_EXEC_ENABLE_INVPCID | - SECONDARY_EXEC_APIC_REGISTER_VIRT; + SECONDARY_EXEC_APIC_REGISTER_VIRT | + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY; if (adjust_vmx_controls(min2, opt2, MSR_IA32_VMX_PROCBASED_CTLS2, _cpu_based_2nd_exec_control) 0) @@ -2579,7 +2582,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) if (!(_cpu_based_exec_control CPU_BASED_TPR_SHADOW)) _cpu_based_2nd_exec_control = ~( - SECONDARY_EXEC_APIC_REGISTER_VIRT); + SECONDARY_EXEC_APIC_REGISTER_VIRT | + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); Nevermind the earlier comment about (). +static void set_eoi_exitmap_one(struct kvm_vcpu *vcpu, + u32 vector) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (WARN_ONCE((vector 255), + KVM VMX: vector (%d) out of range\n, vector)) + return; + __set_bit(vector, vmx-eoi_exit_bitmap); +} + +void vmx_check_ioapic_entry(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq) +{ + struct kvm_lapic **dst; + struct kvm_apic_map *map; + unsigned long bitmap = 1; + int i; + + rcu_read_lock(); + map = rcu_dereference(vcpu-kvm-arch.apic_map); + + if (unlikely(!map)) { + set_eoi_exitmap_one(vcpu, irq-vector); + goto out; + } + + if (irq-dest_mode == 0) { /* physical mode */ + if (irq-delivery_mode == APIC_DM_LOWEST || + irq-dest_id == 0xff) { + set_eoi_exitmap_one(vcpu, irq-vector); + goto out; + } +
Re: [kvmarm] [PATCH v5.1 0/2] KVM: ARM: Rename KVM_SET_DEVICE_ADDRESS
On Wed, Jan 09, 2013 at 10:37:20PM +0100, Alexander Graf wrote: We can start to introduce (and fix ARM) with a generic ioctl in the MPIC patches then. The ioctl is already generic, except for its name. It's making a few wrong assumptions: * maximum size of value is u64 This is tolerable IMHO. * combining device id (variable) with addr type id (const) into a single field. It could just be split into multiple fields I agree, but that could be lived with as well. I get that there's a tradeoff between getting something in now, versus waiting until the API is more refined. Tagging it with a particular ISA seems like an odd way of saying soon to be deprecated, though. What happens if we're still squabbling over the perfect replacement API when we're trying to push PPC MPIC stuff in? As mentioned, i fail to see the benefit in sharing 0.0x% of the code (the interface), while the remaining code is not shared. Then we're the ones who have to come up with a good interface. Or just have KVM_SET_PPC_DEVICE_ADDRESS. Is there a downside to that? Perhaps the threshold for an API becoming permanent should not be acceptance into the tree, but rather the removal of an experimental tag (including a way of shutting off experimental APIs to make sure you're not depending on them). Sort of like CONFIG_EXPERIMENTAL, except actually used for its intended purpose (distributions should have it *off* by default), and preferably managed at runtime. Sort of like drivers/staging, except for APIs rather than drivers. Changes at that point should require more justification than before merging, but would not have the strict compatibility requirement that non-experimental APIs have. This would make collaboration and testing easier on APIs that aren't ready to be permanent. This tag does exist. It's called not in Linus' tree :). * the id is 100% architecture specific. It shouldn't be. At least the device id field should be generic. That's a documentation issue that could be changed to have all architectures adopt what is currently specified for ARM, without breaking compatibility. Depends on how we want to design the final layout. I don't have the impression that we've reached final conclusion on this interface. That's all I'm saying. I'm not sure if we can come up with more problems in that API when staring at it a bit longer and/or we would actually start using it for more things. So for the sake of not holding up the ARM code, I'm perfectly fine to clutter ARM's ioctl handling code with an ioctl that is already deprecated at its introduction, as long as we don't hold everything else up meanwhile. I'm not in a position to block it, and if I were I presumably would have seen this in time for feedback to matter. That's different from actually being happy. :-) For the ARM specific ioctl it has my ack. I'd say let's go with that and start to work on a good interface ASAP. But we need an ok from Marcelo or Gleb too. Alex -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [kvmarm] [PATCH v5.1 0/2] KVM: ARM: Rename KVM_SET_DEVICE_ADDRESS
On Wed, Jan 09, 2013 at 03:15:43PM -0600, Scott Wood wrote: On 01/09/2013 02:12:16 PM, Alexander Graf wrote: On 09.01.2013, at 20:50, Scott Wood wrote: On 01/09/2013 10:48:47 AM, Alexander Graf wrote: On 09.01.2013, at 17:26, Christoffer Dall wrote: Renames the KVM_SET_DEVICE_ADDRESS to KVM_ARM_SET_DEVICE_ADDR to make it obvious that this is ARM specific in lack of a better generic interface. Once we agree on a better interface the KVM/ARM code can also take advantage of that, but until then we don't want to hold up the KVM/ARM patches. Works for me. Scott, are you happy with this one too? Not really, given that it will stay around forever even after something new is introduced. But only in ARM specific code. ...which I'll probably have to deal with when Freescale's virtualization-capable ARM chips come along. I don't see it's only in that other architecture as it might as well not exist. Doesnt it make sense to make it extensible by adding some reserved space and a flags field? (to the ioctl) (independently of anything else). So that, say, supporting new ARM processors does not require adding a new ioctl? Having a single ioctl does not increase code sharing significantly because large percentage of the code both in QEMU and the kernel are not shared (well perhaps except the interface). If you're going to change the name, why not just change it to KVM_SET_DEVICE_CONFIG? Can we change the name later if nothing else changes (so it's still binary compatible)? Because that again implies that it's generic enough. And to reach that conclusion will take more time than we should spend on this now. If the conclusion later on is that it is good enough, can the name be changed then? We can start to introduce (and fix ARM) with a generic ioctl in the MPIC patches then. The ioctl is already generic, except for its name. It's making a few wrong assumptions: * maximum size of value is u64 This is tolerable IMHO. * combining device id (variable) with addr type id (const) into a single field. It could just be split into multiple fields I agree, but that could be lived with as well. I get that there's a tradeoff between getting something in now, versus waiting until the API is more refined. Tagging it with a particular ISA seems like an odd way of saying soon to be deprecated, though. What happens if we're still squabbling over the perfect replacement API when we're trying to push PPC MPIC stuff in? Perhaps the threshold for an API becoming permanent should not be acceptance into the tree, but rather the removal of an experimental tag (including a way of shutting off experimental APIs to make sure you're not depending on them). Sort of like CONFIG_EXPERIMENTAL, except actually used for its intended purpose (distributions should have it *off* by default), and preferably managed at runtime. Sort of like drivers/staging, except for APIs rather than drivers. Changes at that point should require more justification than before merging, but would not have the strict compatibility requirement that non-experimental APIs have. This would make collaboration and testing easier on APIs that aren't ready to be permanent. * the id is 100% architecture specific. It shouldn't be. At least the device id field should be generic. That's a documentation issue that could be changed to have all architectures adopt what is currently specified for ARM, without breaking compatibility. I'm not sure if we can come up with more problems in that API when staring at it a bit longer and/or we would actually start using it for more things. So for the sake of not holding up the ARM code, I'm perfectly fine to clutter ARM's ioctl handling code with an ioctl that is already deprecated at its introduction, as long as we don't hold everything else up meanwhile. I'm not in a position to block it, and if I were I presumably would have seen this in time for feedback to matter. That's different from actually being happy. :-) -Scott -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [kvmarm] [PATCH v5.1 0/2] KVM: ARM: Rename KVM_SET_DEVICE_ADDRESS
On 01/10/2013 04:28:01 PM, Marcelo Tosatti wrote: On Wed, Jan 09, 2013 at 10:37:20PM +0100, Alexander Graf wrote: We can start to introduce (and fix ARM) with a generic ioctl in the MPIC patches then. The ioctl is already generic, except for its name. It's making a few wrong assumptions: * maximum size of value is u64 This is tolerable IMHO. * combining device id (variable) with addr type id (const) into a single field. It could just be split into multiple fields I agree, but that could be lived with as well. I get that there's a tradeoff between getting something in now, versus waiting until the API is more refined. Tagging it with a particular ISA seems like an odd way of saying soon to be deprecated, though. What happens if we're still squabbling over the perfect replacement API when we're trying to push PPC MPIC stuff in? As mentioned, i fail to see the benefit in sharing 0.0x% of the code (the interface), while the remaining code is not shared. Pointlessly making things architecture-specific just makes more work for people who later need the functionality on another architecture. It might not be much in this case (unless a particular device ends up being used on multiple architectures), but the principle still applies. It's also more work for tools like strace, and could get in the way of the userspace caller using common code. Then we're the ones who have to come up with a good interface. Or just have KVM_SET_PPC_DEVICE_ADDRESS. Is there a downside to that? Besides the above, and my original complaint that it shouldn't be specific to addresses? -Scott -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] [PATCH qom-cpu 3/7] target-i386: Disable kvm_mmu by default
Am 07.01.2013 19:20, schrieb Eduardo Habkost: KVM_CAP_PV_MMU capability reporting was removed from the kernel since v2.6.33 (see commit a68a6a7282373), and was completely removed from the kernel since v3.3 (see commit fb92045843). It doesn't make sense to keep it enabled by default, as it would cause unnecessary hassle when using the enforce flag. This disables kvm_mmu on all machine-types. With this fix, the possible scenarios when migrating from QEMU = 1.3 to QEMU 1.4 are; ++ src kernel | dst kernel | Result ++ = 2.6.33 | any| kvm_mmu was already disabled and will stay disabled = 2.6.32 | = 3.3 | correct live migration is impossible = 2.6.32 | = 3.2 | kvm_mmu will be disabled on next guest reboot * ++ When using ASCII art, please remember to use at most 76 characters, to avoid linewraps in git-log. Shortening the second column fixes this. Andreas * If they are running kernel = 2.6.32 and want kvm_mmu to be kept enabled on guest reboot, they can explicitly add +kvm_mmu to the QEMU command-line. Using 2.6.33 and higher, it is not possible to enable kvm_mmu explicitly anymore. Signed-off-by: Eduardo Habkost ehabk...@redhat.com -- SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg, Germany GF: Jeff Hawn, Jennifer Guild, Felix Imendörffer; HRB 16746 AG Nürnberg -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] [PATCH qom-cpu 6/7] target-i386/cpu.c: Add feature name array for ext4_features
Am 07.01.2013 19:20, schrieb Eduardo Habkost: Feature names were taken from the X86_FEATURE_* constants in the Linux kernel code. Signed-off-by: Eduardo Habkost ehabk...@redhat.com --- Cc: Gleb Natapov g...@redhat.com --- target-i386/cpu.c | 17 + 1 file changed, 17 insertions(+) diff --git a/target-i386/cpu.c b/target-i386/cpu.c index 4b3ee63..a54c464 100644 --- a/target-i386/cpu.c +++ b/target-i386/cpu.c @@ -95,6 +95,17 @@ static const char *ext3_feature_name[] = { NULL, NULL, NULL, NULL, }; +static const char *ext4_feature_name[] = { +NULL, NULL, xstore,xstore-en, +NULL, NULL, xcrypt,xcrypt-en, Missing spaces, fixed. Andreas +ace2, ace2-en, phe, phe-en, +pmm, pmm-en, NULL, NULL, +NULL, NULL, NULL, NULL, +NULL, NULL, NULL, NULL, +NULL, NULL, NULL, NULL, +NULL, NULL, NULL, NULL, +}; + static const char *kvm_feature_name[] = { kvmclock, kvm_nopiodelay, kvm_mmu, kvmclock, kvm_asyncpf, kvm_steal_time, kvm_pv_eoi, NULL, [snip] -- SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg, Germany GF: Jeff Hawn, Jennifer Guild, Felix Imendörffer; HRB 16746 AG Nürnberg -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] [RFC 01/12] kvm: add KVM_FEATURE_CLOCKSOURCE_STABLE_BIT fake #define
On Wed, 9 Jan 2013 16:53:41 -0200 Eduardo Habkost ehabk...@redhat.com wrote: Signed-off-by: Eduardo Habkost ehabk...@redhat.com --- Cc: kvm@vger.kernel.org Cc: Michael S. Tsirkin m...@redhat.com Cc: Gleb Natapov g...@redhat.com Cc: Marcelo Tosatti mtosa...@redhat.com --- include/sysemu/kvm.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h index 6bdd513..22acf91 100644 --- a/include/sysemu/kvm.h +++ b/include/sysemu/kvm.h @@ -36,6 +36,7 @@ #define KVM_FEATURE_ASYNC_PF 0 #define KVM_FEATURE_STEAL_TIME 0 #define KVM_FEATURE_PV_EOI 0 +#define KVM_FEATURE_CLOCKSOURCE_STABLE_BIT 0 #endif extern int kvm_allowed; -- 1.7.11.7 Perhaps kvm: add fake KVM_FEATURE_CLOCKSOURCE_STABLE_BIT for TCG only configurations would be better -- Regards, Igor -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC 07/12] target-i386/cpu: Introduce apic_id_for_cpu() function
On Wed, 9 Jan 2013 16:53:47 -0200 Eduardo Habkost ehabk...@redhat.com wrote: This function will be used by both the CPU initialization code and the fw_cfg table initialization code. Later this function will be updated to generate APIC IDs according to the CPU topology. Signed-off-by: Eduardo Habkost ehabk...@redhat.com --- target-i386/cpu.c | 17 - target-i386/cpu.h | 2 ++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/target-i386/cpu.c b/target-i386/cpu.c index 492656c..33787dc 100644 --- a/target-i386/cpu.c +++ b/target-i386/cpu.c @@ -2192,6 +2192,21 @@ void x86_cpu_realize(Object *obj, Error **errp) cpu_reset(CPU(cpu)); } +/* Calculates initial APIC ID for a specific CPU index + * + * Currently we need to be able to calculate the APIC ID from the CPU index + * alone (without requiring a CPU object), as the QEMU-Seabios interfaces have + * no concept of CPU index, and the NUMA tables on fw_cfg need the APIC ID of + * all CPUs up to max_cpus. + */ +uint32_t apic_id_for_cpu(unsigned int cpu_index) +{ +/* right now APIC ID == CPU index. this will eventually change to use + * the CPU topology configuration properly + */ +return cpu_index; +} + static void x86_cpu_initfn(Object *obj) { CPUState *cs = CPU(obj); @@ -2226,7 +2241,7 @@ static void x86_cpu_initfn(Object *obj) x86_cpuid_get_tsc_freq, x86_cpuid_set_tsc_freq, NULL, NULL, NULL); -env-cpuid_apic_id = cs-cpu_index; +env-cpuid_apic_id = apic_id_for_cpu(cs-cpu_index); APIC ID is set by board so CPU shouldn't do it by itself. Could you create static property for it and set it from board using property instead? /* init various static tables used in TCG mode */ if (tcg_enabled() !inited) { diff --git a/target-i386/cpu.h b/target-i386/cpu.h index f3c9df5..dbd9899 100644 --- a/target-i386/cpu.h +++ b/target-i386/cpu.h @@ -1238,4 +1238,6 @@ void disable_kvm_pv_eoi(void); /* Return name of 32-bit register, from a R_* constant */ const char *get_register_name_32(unsigned int reg); +uint32_t apic_id_for_cpu(unsigned int cpu_index); + #endif /* CPU_I386_H */ -- 1.7.11.7 -- Regards, Igor -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC 12/12] pc: Generate APIC IDs according to CPU topology
On Wed, 9 Jan 2013 16:53:52 -0200 Eduardo Habkost ehabk...@redhat.com wrote: This keeps compatibility on machine-types pc-1.2 and older, and prints a warning in case the requested configuration won't get the correct topology. I couldn't think of a better way to warn about broken topology when in compat mode other than using error_report(). The warning message will be probably be buried in a log file somewhere, but it's better than nothing. Signed-off-by: Eduardo Habkost ehabk...@redhat.com --- Changes v1 - v2: - Move code to cpu.c - keep using cpu_index on *-user - Use SMP.contiguous_apic_ids global property - Prints warning in case the compatibility mode will expose incorrect topology Changes v2 - v3: - Now all code is inside hw/pc.c - Use a real PC class and a contiguous_apic_ids property Changes v3 - v4: - Instead of using a global property, use a separate machine init function and a PCInitArgs field, to implement compatibility mode - Use error_report() instead of fprintf(stderr) for the warning - Use a field on PCInitArgs instead of a static variable to check if warning was already printed Changes v4 - v5: - Don't use PCInitArgs: simply add a enable_compat_apic_id_mode() function and a static compat_apic_id_mode variable, to enable the compatibility mode - Move APIC ID calculation code to cpu.c --- hw/pc_piix.c | 13 +++-- target-i386/cpu.c | 28 target-i386/cpu.h | 1 + 3 files changed, 36 insertions(+), 6 deletions(-) diff --git a/hw/pc_piix.c b/hw/pc_piix.c index 13d7cc8..be4fea1 100644 --- a/hw/pc_piix.c +++ b/hw/pc_piix.c @@ -233,11 +233,18 @@ static void pc_init_pci(QEMUMachineInitArgs *args) initrd_filename, cpu_model, 1, 1); } + +static void pc_init_pci_1_3(QEMUMachineInitArgs *args) +{ +enable_compat_apic_id_mode(); +pc_init_pci(args); +} + /* PC machine init function for pc-0.14 to pc-1.2 */ static void pc_init_pci_1_2(QEMUMachineInitArgs *args) { disable_kvm_pv_eoi(); -pc_init_pci(args); +pc_init_pci_1_3(args); } /* PC init function for pc-0.10 to pc-0.13, and reused by xenfv */ @@ -250,6 +257,7 @@ static void pc_init_pci_no_kvmclock(QEMUMachineInitArgs *args) const char *initrd_filename = args-initrd_filename; const char *boot_device = args-boot_device; disable_kvm_pv_eoi(); +enable_compat_apic_id_mode(); pc_init1(get_system_memory(), get_system_io(), ram_size, boot_device, @@ -268,6 +276,7 @@ static void pc_init_isa(QEMUMachineInitArgs *args) if (cpu_model == NULL) cpu_model = 486; disable_kvm_pv_eoi(); +enable_compat_apic_id_mode(); pc_init1(get_system_memory(), get_system_io(), ram_size, boot_device, @@ -305,7 +314,7 @@ static QEMUMachine pc_machine_v1_4 = { static QEMUMachine pc_machine_v1_3 = { .name = pc-1.3, .desc = Standard PC, -.init = pc_init_pci, +.init = pc_init_pci_1_3, .max_cpus = 255, .compat_props = (GlobalProperty[]) { PC_COMPAT_1_3, diff --git a/target-i386/cpu.c b/target-i386/cpu.c index 33787dc..f34192c 100644 --- a/target-i386/cpu.c +++ b/target-i386/cpu.c @@ -23,6 +23,8 @@ #include cpu.h #include sysemu/kvm.h +#include sysemu/cpus.h +#include topology.h #include qemu/option.h #include qemu/config-file.h @@ -2192,6 +2194,14 @@ void x86_cpu_realize(Object *obj, Error **errp) cpu_reset(CPU(cpu)); } +/* Enables contiguous-apic-ID mode, for compatibility */ +static bool compat_apic_id_mode; + +void enable_compat_apic_id_mode(void) +{ +compat_apic_id_mode = true; +} + /* Calculates initial APIC ID for a specific CPU index * * Currently we need to be able to calculate the APIC ID from the CPU index @@ -2201,10 +2211,20 @@ void x86_cpu_realize(Object *obj, Error **errp) */ uint32_t apic_id_for_cpu(unsigned int cpu_index) if you move ^^^ to board, there won't be need in [8/12] { -/* right now APIC ID == CPU index. this will eventually change to use - * the CPU topology configuration properly - */ -return cpu_index; +uint32_t correct_id; +static bool warned; + +correct_id = topo_apicid_for_cpu(smp_cores, smp_threads, cpu_index); +if (compat_apic_id_mode) { +if (cpu_index != correct_id !warned) { +error_report(APIC IDs set in compatibility mode, + CPU topology won't match the configuration); +warned = true; +} +return cpu_index; +} else { +return correct_id; +} } static void x86_cpu_initfn(Object *obj) diff --git a/target-i386/cpu.h b/target-i386/cpu.h index dbd9899..d9a9e8f 100644 --- a/target-i386/cpu.h +++ b/target-i386/cpu.h @@ -1239,5 +1239,6 @@ void disable_kvm_pv_eoi(void); const char
Re: [RFC 12/12] pc: Generate APIC IDs according to CPU topology
On Fri, Jan 11, 2013 at 12:36:35AM +0100, Igor Mammedov wrote: [...] diff --git a/target-i386/cpu.c b/target-i386/cpu.c index 33787dc..f34192c 100644 --- a/target-i386/cpu.c +++ b/target-i386/cpu.c @@ -23,6 +23,8 @@ #include cpu.h #include sysemu/kvm.h +#include sysemu/cpus.h +#include topology.h #include qemu/option.h #include qemu/config-file.h @@ -2192,6 +2194,14 @@ void x86_cpu_realize(Object *obj, Error **errp) cpu_reset(CPU(cpu)); } +/* Enables contiguous-apic-ID mode, for compatibility */ +static bool compat_apic_id_mode; + +void enable_compat_apic_id_mode(void) +{ +compat_apic_id_mode = true; +} + /* Calculates initial APIC ID for a specific CPU index * * Currently we need to be able to calculate the APIC ID from the CPU index @@ -2201,10 +2211,20 @@ void x86_cpu_realize(Object *obj, Error **errp) */ uint32_t apic_id_for_cpu(unsigned int cpu_index) if you move ^^^ to board, there won't be need in [8/12] True. The previous version I had (apic-id-topology-using-apic-id-prop branch) didn't need it. But I just wanted to avoid touching the CPU creation code by now. BTW, *-user doesn't have a board but does have an APIC ID (because *-user has CPUID and CPUID exposes the APIC ID). This would mean we would need APIC ID calculation code present in two separated places (which I would like to avoid). { -/* right now APIC ID == CPU index. this will eventually change to use - * the CPU topology configuration properly - */ -return cpu_index; +uint32_t correct_id; +static bool warned; + +correct_id = topo_apicid_for_cpu(smp_cores, smp_threads, cpu_index); +if (compat_apic_id_mode) { +if (cpu_index != correct_id !warned) { +error_report(APIC IDs set in compatibility mode, + CPU topology won't match the configuration); +warned = true; +} +return cpu_index; +} else { +return correct_id; +} } static void x86_cpu_initfn(Object *obj) diff --git a/target-i386/cpu.h b/target-i386/cpu.h index dbd9899..d9a9e8f 100644 --- a/target-i386/cpu.h +++ b/target-i386/cpu.h @@ -1239,5 +1239,6 @@ void disable_kvm_pv_eoi(void); const char *get_register_name_32(unsigned int reg); uint32_t apic_id_for_cpu(unsigned int cpu_index); +void enable_compat_apic_id_mode(void); #endif /* CPU_I386_H */ -- 1.7.11.7 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- Regards, Igor -- Eduardo -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC 07/12] target-i386/cpu: Introduce apic_id_for_cpu() function
On Fri, Jan 11, 2013 at 12:31:23AM +0100, Igor Mammedov wrote: On Wed, 9 Jan 2013 16:53:47 -0200 Eduardo Habkost ehabk...@redhat.com wrote: This function will be used by both the CPU initialization code and the fw_cfg table initialization code. Later this function will be updated to generate APIC IDs according to the CPU topology. Signed-off-by: Eduardo Habkost ehabk...@redhat.com --- target-i386/cpu.c | 17 - target-i386/cpu.h | 2 ++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/target-i386/cpu.c b/target-i386/cpu.c index 492656c..33787dc 100644 --- a/target-i386/cpu.c +++ b/target-i386/cpu.c @@ -2192,6 +2192,21 @@ void x86_cpu_realize(Object *obj, Error **errp) cpu_reset(CPU(cpu)); } +/* Calculates initial APIC ID for a specific CPU index + * + * Currently we need to be able to calculate the APIC ID from the CPU index + * alone (without requiring a CPU object), as the QEMU-Seabios interfaces have + * no concept of CPU index, and the NUMA tables on fw_cfg need the APIC ID of + * all CPUs up to max_cpus. + */ +uint32_t apic_id_for_cpu(unsigned int cpu_index) +{ +/* right now APIC ID == CPU index. this will eventually change to use + * the CPU topology configuration properly + */ +return cpu_index; +} + static void x86_cpu_initfn(Object *obj) { CPUState *cs = CPU(obj); @@ -2226,7 +2241,7 @@ static void x86_cpu_initfn(Object *obj) x86_cpuid_get_tsc_freq, x86_cpuid_set_tsc_freq, NULL, NULL, NULL); -env-cpuid_apic_id = cs-cpu_index; +env-cpuid_apic_id = apic_id_for_cpu(cs-cpu_index); APIC ID is set by board so CPU shouldn't do it by itself. Could you create static property for it and set it from board using property instead? I had a version that had the PC CPU creation code set the apic-id using a apic-id property, but I am using this method to keep the bug fix simpler and less intrusive, as I we don't have much time to remodel the CPU creation code before soft freeze. The previous version can be seen at: https://github.com/ehabkost/qemu-hacks/tree/apic-id-topology-using-apic-id-prop The relevant apic-id-setting code is at: https://github.com/ehabkost/qemu-hacks/commit/a79156d0ad99ee89c0def535ce4cafcc4aa9a486 I believe the final solution for calculating the APIC IDs should instead involve passing the package ID and bit width of core+thread IDs to a package/socket object (so we don't need to expose the APIC IDs outside QEMU). But whatever design we're going to choose to allow CPU hotplug, I didn't want to introduce complexity that will get into our way when we remodel the CPU creation/hotplug interfaces later. /* init various static tables used in TCG mode */ if (tcg_enabled() !inited) { diff --git a/target-i386/cpu.h b/target-i386/cpu.h index f3c9df5..dbd9899 100644 --- a/target-i386/cpu.h +++ b/target-i386/cpu.h @@ -1238,4 +1238,6 @@ void disable_kvm_pv_eoi(void); /* Return name of 32-bit register, from a R_* constant */ const char *get_register_name_32(unsigned int reg); +uint32_t apic_id_for_cpu(unsigned int cpu_index); + #endif /* CPU_I386_H */ -- 1.7.11.7 -- Regards, Igor -- Eduardo -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC 02/12] target-i386: Don't set any KVM flag by default if KVM is disabled
On Wed, 9 Jan 2013 16:53:42 -0200 Eduardo Habkost ehabk...@redhat.com wrote: This is a cleanup that tries to solve two small issues: - We don't need a separate kvm_pv_eoi_features variable just to keep a constant calculated at compile-time, and this style would require adding a separate variable (that's declared twice because of the CONFIG_KVM ifdef) for each feature that's going to be enabled/disable by machine-type compat code. - The pc-1.3 code is setting the kvm_pv_eoi flag on cpuid_kvm_features even when KVM is disabled at runtime. This small incosistency in the cpuid_kvm_features field isn't a problem today because cpuid_kvm_features is ignored by the TCG code, but it may cause unexpected problems later when refactoring the CPUID handling code. This patch eliminates the kvm_pv_eoi_features variable and simply uses kvm_enabled() inside the enable_kvm_pv_eoi() compat function, so it enables kvm_pv_eoi only if KVM is enabled. I believe this makes the behavior of enable_kvm_pv_eoi() clearer and easier to understand. Signed-off-by: Eduardo Habkost ehabk...@redhat.com --- Cc: kvm@vger.kernel.org Cc: Michael S. Tsirkin m...@redhat.com Cc: Gleb Natapov g...@redhat.com Cc: Marcelo Tosatti mtosa...@redhat.com [snip] void host_cpuid(uint32_t function, uint32_t count, @@ -1343,13 +1337,15 @@ static int cpu_x86_parse_featurestr(x86_def_t *x86_cpu_def, char *features) unsigned int i; char *featurestr; /* Single 'key=value string being parsed */ /* Features to be added */ -FeatureWordArray plus_features = { -[FEAT_KVM] = kvm_default_features, -}; +FeatureWordArray plus_features = { 0 }; /* Features to be removed */ FeatureWordArray minus_features = { 0 }; uint32_t numvalue; +if (kvm_enabled()) { +plus_features[FEAT_KVM] = kvm_default_features; +} While touching it please move setting defaults to cpu_x86_register() or cpu_x86_find_by_name() to so that cpu_x86_parse_featurestr() would deal only with custom settings. [snip] -- Regards, Igor -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC 07/12] target-i386/cpu: Introduce apic_id_for_cpu() function
On Fri, Jan 11, 2013 at 12:31:23AM +0100, Igor Mammedov wrote: On Wed, 9 Jan 2013 16:53:47 -0200 Eduardo Habkost ehabk...@redhat.com wrote: This function will be used by both the CPU initialization code and the fw_cfg table initialization code. Later this function will be updated to generate APIC IDs according to the CPU topology. Signed-off-by: Eduardo Habkost ehabk...@redhat.com --- target-i386/cpu.c | 17 - target-i386/cpu.h | 2 ++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/target-i386/cpu.c b/target-i386/cpu.c index 492656c..33787dc 100644 --- a/target-i386/cpu.c +++ b/target-i386/cpu.c @@ -2192,6 +2192,21 @@ void x86_cpu_realize(Object *obj, Error **errp) cpu_reset(CPU(cpu)); } +/* Calculates initial APIC ID for a specific CPU index + * + * Currently we need to be able to calculate the APIC ID from the CPU index + * alone (without requiring a CPU object), as the QEMU-Seabios interfaces have + * no concept of CPU index, and the NUMA tables on fw_cfg need the APIC ID of + * all CPUs up to max_cpus. + */ +uint32_t apic_id_for_cpu(unsigned int cpu_index) +{ +/* right now APIC ID == CPU index. this will eventually change to use + * the CPU topology configuration properly + */ +return cpu_index; +} + static void x86_cpu_initfn(Object *obj) { CPUState *cs = CPU(obj); @@ -2226,7 +2241,7 @@ static void x86_cpu_initfn(Object *obj) x86_cpuid_get_tsc_freq, x86_cpuid_set_tsc_freq, NULL, NULL, NULL); -env-cpuid_apic_id = cs-cpu_index; +env-cpuid_apic_id = apic_id_for_cpu(cs-cpu_index); APIC ID is set by board so CPU shouldn't do it by itself. Could you create static property for it and set it from board using property instead? I had a version that had the PC CPU creation code set the apic-id using a apic-id property, but I am using this method to keep the bug fix simpler and less intrusive, as I we don't have much time to remodel the CPU creation code before soft freeze. The previous version can be seen at: https://github.com/ehabkost/qemu-hacks/tree/apic-id-topology-using-apic-id-prop The relevant apic-id-setting code is at: https://github.com/ehabkost/qemu-hacks/commit/a79156d0ad99ee89c0def535ce4cafcc4aa9a486 I believe the final solution for calculating the APIC IDs should instead involve passing the package ID and bit width of core+thread IDs to a package/socket object (so we don't need to expose the APIC IDs outside QEMU). But whatever design we're going to choose to allow CPU hotplug, I didn't want to introduce complexity that will get into our way when we remodel the CPU creation/hotplug interfaces later. /* init various static tables used in TCG mode */ if (tcg_enabled() !inited) { diff --git a/target-i386/cpu.h b/target-i386/cpu.h index f3c9df5..dbd9899 100644 --- a/target-i386/cpu.h +++ b/target-i386/cpu.h @@ -1238,4 +1238,6 @@ void disable_kvm_pv_eoi(void); /* Return name of 32-bit register, from a R_* constant */ const char *get_register_name_32(unsigned int reg); +uint32_t apic_id_for_cpu(unsigned int cpu_index); + #endif /* CPU_I386_H */ -- 1.7.11.7 -- Regards, Igor -- Eduardo -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC 02/12] target-i386: Don't set any KVM flag by default if KVM is disabled
On Fri, Jan 11, 2013 at 12:07:40AM +0100, Igor Mammedov wrote: [...] @@ -1343,13 +1337,15 @@ static int cpu_x86_parse_featurestr(x86_def_t *x86_cpu_def, char *features) unsigned int i; char *featurestr; /* Single 'key=value string being parsed */ /* Features to be added */ -FeatureWordArray plus_features = { -[FEAT_KVM] = kvm_default_features, -}; +FeatureWordArray plus_features = { 0 }; /* Features to be removed */ FeatureWordArray minus_features = { 0 }; uint32_t numvalue; +if (kvm_enabled()) { +plus_features[FEAT_KVM] = kvm_default_features; +} While touching it please move setting defaults to cpu_x86_register() or cpu_x86_find_by_name() to so that cpu_x86_parse_featurestr() would deal only with custom settings. OK, I will do it in the next version. -- Eduardo -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] [RFC 01/12] kvm: add KVM_FEATURE_CLOCKSOURCE_STABLE_BIT fake #define
On Fri, Jan 11, 2013 at 12:15:16AM +0100, Igor Mammedov wrote: [...] diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h index 6bdd513..22acf91 100644 --- a/include/sysemu/kvm.h +++ b/include/sysemu/kvm.h @@ -36,6 +36,7 @@ #define KVM_FEATURE_ASYNC_PF 0 #define KVM_FEATURE_STEAL_TIME 0 #define KVM_FEATURE_PV_EOI 0 +#define KVM_FEATURE_CLOCKSOURCE_STABLE_BIT 0 #endif snip Perhaps kvm: add fake KVM_FEATURE_CLOCKSOURCE_STABLE_BIT for TCG only configurations would be better I'll change it in the next version. Thanks. -- Eduardo -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [kvmarm] [PATCH v5.1 0/2] KVM: ARM: Rename KVM_SET_DEVICE_ADDRESS
On Thu, Jan 10, 2013 at 04:40:12PM -0600, Scott Wood wrote: On 01/10/2013 04:28:01 PM, Marcelo Tosatti wrote: On Wed, Jan 09, 2013 at 10:37:20PM +0100, Alexander Graf wrote: We can start to introduce (and fix ARM) with a generic ioctl in the MPIC patches then. The ioctl is already generic, except for its name. It's making a few wrong assumptions: * maximum size of value is u64 This is tolerable IMHO. * combining device id (variable) with addr type id (const) into a single field. It could just be split into multiple fields I agree, but that could be lived with as well. I get that there's a tradeoff between getting something in now, versus waiting until the API is more refined. Tagging it with a particular ISA seems like an odd way of saying soon to be deprecated, though. What happens if we're still squabbling over the perfect replacement API when we're trying to push PPC MPIC stuff in? As mentioned, i fail to see the benefit in sharing 0.0x% of the code (the interface), while the remaining code is not shared. Pointlessly making things architecture-specific just makes more work for people who later need the functionality on another architecture. It might not be much in this case (unless a particular device ends up being used on multiple architectures), but the principle still applies. It's also more work for tools like strace, and could get in the way of the userspace caller using common code. Then we're the ones who have to come up with a good interface. Or just have KVM_SET_PPC_DEVICE_ADDRESS. Is there a downside to that? Besides the above, and my original complaint that it shouldn't be specific to addresses? -Scott I did not really grasp that ('shouldnt be specific to addresses'), but anyway. OK, can you write down your proposed improvements to the interface? In case you have something ready, otherwise there is time pressure to merge the ARM port. That is, if you have interest/energy to spend in a possibly reusable interface, as long as that does not delay integration of the ARM code, i don't think the ARM people will mind that. As mentioned in the thread, they (ARM) require configurable device address. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] [PATCH qom-cpu 0/7] disable kvm_mmu + -cpu enforce fixes (v3)
Am 07.01.2013 19:20, schrieb Eduardo Habkost: Eduardo Habkost (7): kvm: Add fake KVM constants to avoid #ifdefs on KVM-specific code [...] target-i386: Disable kvm_mmu by default target-i386/cpu: Introduce FeatureWord typedefs target-i386: kvm_check_features_against_host(): Use feature_word_info target-i386/cpu.c: Add feature name array for ext4_features target-i386: check/enforce: Check all feature words Thanks, applied these to qom-cpu: https://github.com/afaerber/qemu-cpu/commits/qom-cpu Andreas -- SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg, Germany GF: Jeff Hawn, Jennifer Guild, Felix Imendörffer; HRB 16746 AG Nürnberg -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [kvmarm] [PATCH v5.1 0/2] KVM: ARM: Rename KVM_SET_DEVICE_ADDRESS
On 01/10/2013 06:35:02 PM, Marcelo Tosatti wrote: On Thu, Jan 10, 2013 at 04:40:12PM -0600, Scott Wood wrote: On 01/10/2013 04:28:01 PM, Marcelo Tosatti wrote: Or just have KVM_SET_PPC_DEVICE_ADDRESS. Is there a downside to that? Besides the above, and my original complaint that it shouldn't be specific to addresses? -Scott I did not really grasp that ('shouldnt be specific to addresses'), but anyway. A device may have other configuration parameters that need to be set, besides addresses. PPC MPIC will require information about the vendor and version, for example. OK, can you write down your proposed improvements to the interface? In case you have something ready, otherwise there is time pressure to merge the ARM port. My original request was just to change the name to something like KVM_SET_DEVICE_CONFIG or KVM_SET_DEVICE_ATTR, and not make the id encoding architecture-specific (preferably, separate into a device id field and an attribute id field rather than using bitfields). Actual values for device id could be architecture-specific (or there could be a global enumeration), and attribute id values would be device-specific. Alex suggested that an ideal interface might accept values larger than 64 bits, though I think it's good enough -- there are currently no proposed uses that need more than 64 bits for a single attribute (unlike ONE_REG), and if it is needed, such configuration could be split up between multiple attributes, or the attribute could specify that value be a userspace pointer to the actual data (as with ONE_REG). Here's a writeup (the ARM details would go under ARM/vGIC-specific documentation): 4.80 KVM_SET_DEVICE_ATTR Capability: KVM_CAP_SET_DEVICE_ATTR Type: vm ioctl Parameters: struct kvm_device_attr (in) Returns: 0 on success, -1 on error Errors: ENODEV: The device id is unknown ENXIO: Device not supported on current system Other errors may be returned by specific devices and attributes. struct kvm_device_attr { __u32 device; __u32 attr; __u64 value; }; Specify an attribute of a device emulated or directly exposed by the kernel, which the host kernel needs to know about. The device field is an architecture-specific identifier for a specific device. The attr field is a device-specific identifier for a specific attribute. Individual attributes may have particular requirements for when they can and cannot be set. That is, if you have interest/energy to spend in a possibly reusable interface, as long as that does not delay integration of the ARM code, i don't think the ARM people will mind that. The impression I've been given is that just about any change will delay the integration at this point. If that's the case, and everyone's OK with having an interface that is deprecated on arrival, then fine. -Scott -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH v9 2/3] x86, apicv: add virtual x2apic support
Gleb Natapov wrote on 2013-01-10: On Thu, Jan 10, 2013 at 08:32:06AM +, Zhang, Yang Z wrote: Gleb Natapov wrote on 2013-01-10: On Thu, Jan 10, 2013 at 03:26:07PM +0800, Yang Zhang wrote: From: Yang Zhang yang.z.zh...@intel.com basically to benefit from apicv, we need to enable virtualized x2apic mode. Currently, we only enable it when guest is really using x2apic. Also, clear MSR bitmap for corresponding x2apic MSRs when guest enabled x2apic: 0x800 - 0x8ff: no read intercept for apicv register virtualization, except APIC ID and TMCCT. APIC ID and TMCCT: need software's assistance to get right value. TPR,EOI,SELF-IPI: no write intercept for virtual interrupt delivery. Signed-off-by: Kevin Tian kevin.t...@intel.com Signed-off-by: Yang Zhang yang.z.zh...@intel.com --- arch/x86/include/asm/kvm_host.h |2 + arch/x86/include/asm/vmx.h |1 + arch/x86/kvm/lapic.c|5 +- arch/x86/kvm/svm.c |6 + arch/x86/kvm/vmx.c | 194 +-- 5 files changed, 200 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c431b33..572a562 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -697,6 +697,8 @@ struct kvm_x86_ops { void (*enable_nmi_window)(struct kvm_vcpu *vcpu); void (*enable_irq_window)(struct kvm_vcpu *vcpu); void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); + void (*enable_virtual_x2apic_mode)(struct kvm_vcpu *vcpu); + void (*disable_virtual_x2apic_mode)(struct kvm_vcpu *vcpu); Make one callback with enable/disable parameter. And do not forget SVM. int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); int (*get_tdp_level)(void); u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 44c3f7e..0a54df0 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -139,6 +139,7 @@ #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x0001 #define SECONDARY_EXEC_ENABLE_EPT 0x0002 #define SECONDARY_EXEC_RDTSCP 0x0008 +#define SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE 0x0010 #define SECONDARY_EXEC_ENABLE_VPID 0x0020 #define SECONDARY_EXEC_WBINVD_EXITING 0x0040 #define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x0080 diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 0664c13..ec38906 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1328,7 +1328,10 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) u32 id = kvm_apic_id(apic); u32 ldr = ((id 4) 16) | (1 (id 0xf)); kvm_apic_set_ldr(apic, ldr); - } + kvm_x86_ops-enable_virtual_x2apic_mode(vcpu); + } else + kvm_x86_ops-disable_virtual_x2apic_mode(vcpu); + You just broke SVM. apic-base_address = apic-vcpu-arch.apic_base MSR_IA32_APICBASE_BASE; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index d29d3cd..0b82cb1 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3571,6 +3571,11 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) set_cr_intercept(svm, INTERCEPT_CR8_WRITE); } +static void svm_enable_virtual_x2apic_mode(struct kvm_vcpu *vcpu) +{ + return; +} + static int svm_nmi_allowed(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -4290,6 +4295,7 @@ static struct kvm_x86_ops svm_x86_ops = { .enable_nmi_window = enable_nmi_window, .enable_irq_window = enable_irq_window, .update_cr8_intercept = update_cr8_intercept, + .enable_virtual_x2apic_mode = svm_enable_virtual_x2apic_mode, .set_tss_addr = svm_set_tss_addr, .get_tdp_level = get_npt_level, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 688f43f..b203ce7 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -433,6 +433,8 @@ struct vcpu_vmx { bool rdtscp_enabled; + bool virtual_x2apic_enabled; + /* Support for a guest hypervisor (nested VMX) */ struct nested_vmx nested; }; @@ -767,12 +769,23 @@ static inline bool cpu_has_vmx_virtualize_apic_accesses(void) SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; } +static inline bool cpu_has_vmx_virtualize_x2apic_mode(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; +} + static inline bool cpu_has_vmx_apic_register_virt(void) { return vmcs_config.cpu_based_2nd_exec_ctrl SECONDARY_EXEC_APIC_REGISTER_VIRT; } +static inline bool cpu_has_vmx_virtual_intr_delivery(void) +{ + return false; +} + static inline bool cpu_has_vmx_flexpriority(void) { return cpu_has_vmx_tpr_shadow() @@ -2544,6 +2557,7 @@
Re: [RFC PATCH 0/2] make mac programming for virtio net more robust
Michael S. Tsirkin m...@redhat.com writes: On Thu, Jan 10, 2013 at 10:45:39PM +0800, ak...@redhat.com wrote: From: Amos Kong ak...@redhat.com Currenly mac is programmed byte by byte. This means that we have an intermediate step where mac is wrong. Second patch introduced a new vq control command to set mac address in one time. As you mention we could alternatively do it without new commands, simply add a feature bit that says that MACs are in the mac table. This would be a much bigger patch, and I'm fine with either way. Rusty what do you think? Hmm, mac filtering and my mac address are not quite the same thing. I don't know if it matters for anyone: does it? The mac address is abused for things like identifying machines, etc. If we keep it as a separate concept, Amos' patch seems to make sense. Cheers, Rusty. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC PATCH 2/2] virtio-net: introduce a new control to set macaddr
Michael S. Tsirkin m...@redhat.com writes: +if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC)) { +/* Set MAC address by writing config space */ vdev-config-set(vdev, offsetof(struct virtio_net_config, mac), dev-dev_addr, dev-addr_len); +} By the way, why don't we fail the command if VIRTIO_NET_F_MAC is off? Rusty? Looked back through the history for this one. I think the theory is that if the guest doesn't set VIRTIO_NET_F_MAC, it means it doesn't care: it's up to the guest. Cheers, Rusty. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH KVM v2 2/4] KVM: additional i8254 output fixes
On Tue, Jan 08, 2013 at 09:41:36AM +0200, Gleb Natapov wrote: On Mon, Jan 07, 2013 at 06:35:47PM -0600, mmogi...@miniinfo.net wrote: On Mon, 7 Jan 2013 14:04:03 +0200, Gleb Natapov g...@redhat.com wrote: On Wed, Dec 26, 2012 at 10:39:54PM -0700, Matthew Ogilvie wrote: Make git_get_out() consistent with spec. Currently pit_get_out() doesn't affect IRQ0, but it can be read by the guest in other ways. This makes it consistent with proposed changes in qemu's i8254 model as well. See http://bochs.sourceforge.net/techspec/intel-82c54-timer.pdf.gz or search the net for 23124406.pdf. Signed-off-by: Matthew Ogilvie mmogilvi_q...@miniinfo.net --- arch/x86/kvm/i8254.c | 44 ++-- 1 file changed, 34 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index cd4ec60..fd38938 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -144,6 +144,10 @@ static int pit_get_count(struct kvm *kvm, int channel) WARN_ON(!mutex_is_locked(kvm-arch.vpit-pit_state.lock)); +/* FIXME: Add some way to represent a paused timer and return + * the paused-at counter value, to better model gate pausing, + * wait until next CLK pulse to load counter logic, etc. + */ t = kpit_elapsed(kvm, c, channel); d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC); @@ -155,8 +159,7 @@ static int pit_get_count(struct kvm *kvm, int channel) counter = (c-count - d) 0x; break; case 3: -/* XXX: may be incorrect for odd counts */ -counter = c-count - (mod_64((2 * d), c-count)); +counter = (c-count - (mod_64((2 * d), c-count))) 0xfffe; break; default: counter = c-count - mod_64(d, c-count); @@ -180,20 +183,18 @@ static int pit_get_out(struct kvm *kvm, int channel) switch (c-mode) { default: case 0: -out = (d = c-count); -break; case 1: -out = (d c-count); +out = (d = c-count); break; case 2: -out = ((mod_64(d, c-count) == 0) (d != 0)); +out = (mod_64(d, c-count) != (c-count - 1) || c-gate == 0); break; case 3: -out = (mod_64(d, c-count) ((c-count + 1) 1)); +out = (mod_64(d, c-count) ((c-count + 1) 1) || c-gate == 0); break; case 4: case 5: -out = (d == c-count); +out = (d != c-count); break; } @@ -367,7 +368,7 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val) /* * The largest possible initial count is 0; this is equivalent - * to 216 for binary counting and 104 for BCD counting. + * to pow(2,16) for binary counting and pow(10,4) for BCD counting. */ if (val == 0) val = 0x1; @@ -376,6 +377,26 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val) if (channel != 0) { ps-channels[channel].count_load_time = ktime_get(); + +/* In gate-triggered one-shot modes, + * indirectly model some pit_get_out() + * cases by setting the load time way + * back until gate-triggered. + * (Generally only affects reading status + * from channel 2 speaker, + * due to hard-wired gates on other + * channels.) + * + * FIXME: This might be redesigned if a paused + * timer state is added for pit_get_count(). + */ +if (ps-channels[channel].mode == 1 || +ps-channels[channel].mode == 5) { +u64 delta = muldiv64(val+2, NSEC_PER_SEC, KVM_PIT_FREQ); +ps-channels[channel].count_load_time = + ktime_sub(ps-channels[channel].count_load_time, + ns_to_ktime(delta)); I do not understand what are you trying to do here. You assume that trigger will happen 2 clocks after counter is loaded? Modes 1 and 5 are single-shot, and they do not start counting until GATE is triggered, potentially well after count is loaded. So this is attempting to model the start of countdown has not been triggered state as being mostly identical to the already triggered and also expired some number of clocks (2) ago state. So
Re: [PATCH KVM v2 1/4] KVM: fix i8254 IRQ0 to be normally high
On Tue, Jan 08, 2013 at 09:31:19AM +0200, Gleb Natapov wrote: On Mon, Jan 07, 2013 at 06:17:22PM -0600, mmogi...@miniinfo.net wrote: On Mon, 7 Jan 2013 11:39:18 +0200, Gleb Natapov g...@redhat.com wrote: On Wed, Dec 26, 2012 at 10:39:53PM -0700, Matthew Ogilvie wrote: Reading the spec, it is clear that most modes normally leave the IRQ output line high, and only pulse it low to generate a leading edge. Especially the most commonly used mode 2. The KVM i8254 model does not try to emulate the duration of the pulse at all, so just swap the high/low settings it to leave it high most of the time. This fix is a prerequisite to improving the i8259 model to handle the trailing edge of an interupt request as indicated in its spec: If it gets a trailing edge of an IRQ line before it starts to service the interrupt, the request should be canceled. See http://bochs.sourceforge.net/techspec/intel-82c54-timer.pdf.gz or search the net for 23124406.pdf. Risks: There is a risk that migrating a running guest between versions with and without this patch will lose or gain a single timer interrupt during the migration process. The only case where Can you elaborate on how exactly this can happen? Do not see it. KVM 8254: In the corrected model, when the count expires, the model briefly pulses output low and then high again, with the low to high transition being what triggers the interrupt. In the old model, when the count expires, the model expects the output line to already be low, and briefly pulses it high (triggering the interrupt) and then low again. But if the line was already high (because it migrated from the corrected model), this won't generate a new leading edge (low to high) and won't trigger a new interrupt (the first post-back-migration pulse turns into a simple trailing edge instead of a pulse). Unless there is something I'm missing? No, I missed that pic-last_irr/ioapic-irr will be migrated as 1. But this means that next interrupt after migration from new to old will always be lost. What about clearing pit bit from last_irr/irr before migration? Should not affect new-new migration and should fix new-old one. The only problem is that we may need to consult irq routing table to know how pit is connected to ioapic. We should not clear both last_irr and irr. That cancels the interrupt early if the CPU hasn't started servicing it yet: If the guest CPU has interrupts disabled and hasn't begun to service the interrupt, a new-new migration could lose the interrupt much earlier in the countdown cycle than it otherwise might be lost. Potentially we could clear the last_irr bit only. This effectively makes migration behave like the old unfixed code. But if this is considered acceptable, I would be more inclined to not fix IRQ0 at all, rather than make IRQ0 work subtly differently normally vs during migration. One of my earlier patch series (qemu v7 dated Nov 25) attempts to use basically this strategy for the qemu model, at least in the short term (and then potentially fix it properly in the longer term), although the details of series might be tweaked. Or the minimal-risk strategy is to ONLY fix the cascade IRQ2's trailing edge [the original i825* problem that spawned this whole thing], leaving all other IRQs as-is. Still do not see how can we gain one interrupt. In most cases I don't think we get an extra interrupt from a direct fix. But some kinds of attempts to work around missing interrupts during migration can cause cause extra interrupts in other cases. In the old qemu model (but perhaps not kvm), the mode 2 leading edge occurs one clock tick earlier than it ought to. So if you try to be tricky with adjusting levels during migration, you may introduce possible cases where it gets an interrupt in the old model, then migrates and gets another interrupt one tick later in the new model... Also, it occurs to me that maybe there might be an off-by-one issue in the kvm model of mode 2 that ought to be fixed as well? Although the zero length pulse in kvm suggests that one-off issues in counters do not matter. In the qemu model, the leading edge of OUT in mode 2 moves by one tick... The qemu 8254 model actually models each edge at independent clock ticks instead of combining both into a very brief pulse at one time. I've found it handy to draw out old and new timing diagrams on paper (for each mode), and then carefully think about what happens with respect to levels and edges when you transition back and forth between old and new models at various points in the timing cycle. (Note I've spent more time examining the qemu models rather than the kvm models.) Yes, drawing it definitely helps :) this is likely to be serious is probably losing a single-shot (mode 4) interrupt, but if my understanding of how things work is good, then that should only be
Re: [kvmarm] [PATCH v5.1 0/2] KVM: ARM: Rename KVM_SET_DEVICE_ADDRESS
On 10/01/2013, at 20.10, Scott Wood scottw...@freescale.com wrote: On 01/10/2013 06:35:02 PM, Marcelo Tosatti wrote: On Thu, Jan 10, 2013 at 04:40:12PM -0600, Scott Wood wrote: On 01/10/2013 04:28:01 PM, Marcelo Tosatti wrote: Or just have KVM_SET_PPC_DEVICE_ADDRESS. Is there a downside to that? Besides the above, and my original complaint that it shouldn't be specific to addresses? -Scott I did not really grasp that ('shouldnt be specific to addresses'), but anyway. A device may have other configuration parameters that need to be set, besides addresses. PPC MPIC will require information about the vendor and version, for example. OK, can you write down your proposed improvements to the interface? In case you have something ready, otherwise there is time pressure to merge the ARM port. My original request was just to change the name to something like KVM_SET_DEVICE_CONFIG or KVM_SET_DEVICE_ATTR, and not make the id encoding architecture-specific (preferably, separate into a device id field and an attribute id field rather than using bitfields). Actual values for device id could be architecture-specific (or there could be a global enumeration), and attribute id values would be device-specific. Alex suggested that an ideal interface might accept values larger than 64 bits, though I think it's good enough -- there are currently no proposed uses that need more than 64 bits for a single attribute (unlike ONE_REG), and if it is needed, such configuration could be split up between multiple attributes, or the attribute could specify that value be a userspace pointer to the actual data (as with ONE_REG). Here's a writeup (the ARM details would go under ARM/vGIC-specific documentation): 4.80 KVM_SET_DEVICE_ATTR Capability: KVM_CAP_SET_DEVICE_ATTR Type: vm ioctl Parameters: struct kvm_device_attr (in) Returns: 0 on success, -1 on error Errors: ENODEV: The device id is unknown ENXIO: Device not supported on current system Other errors may be returned by specific devices and attributes. struct kvm_device_attr { __u32 device; __u32 attr; __u64 value; }; Specify an attribute of a device emulated or directly exposed by the kernel, which the host kernel needs to know about. The device field is an architecture-specific identifier for a specific device. The attr field is a device-specific identifier for a specific attribute. Individual attributes may have particular requirements for when they can and cannot be set. That is, if you have interest/energy to spend in a possibly reusable interface, as long as that does not delay integration of the ARM code, i don't think the ARM people will mind that. The impression I've been given is that just about any change will delay the integration at this point. If that's the case, and everyone's OK with having an interface that is deprecated on arrival, then fine. That is not entirely the case, but there wasn't event agreement on this revised API, and we didn't want to wait for weeks until a decision was made. Alex suggested we use a DEV_REG API similar to the ONE_REG API, but I am quite strongly against having such an API for this specific purpose as it is too semantically distant to what we do on ARM. (Having a DEV_REG API for other purposes might be fine though). So I have no problem with your suggestion, although we could consider having a size and addr field in the struct instead and be slightly more extensible. But I don't feel strongly about it. If we can agree on Scott's suggestion or with my modification (Alex, Gleb, Marcelo ?) then I'll change the KVM/ARM patches to use this and resend them right away. But we have to agree now! If not, I really think we should keep things as they are now, as we're really discussing idealistic scenarios here, and the ARM patches have been out of tree for long enough. As Marcelo pointed out, the benefits of the perfect API are really minimal. -Christoffer-- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH v9 2/3] x86, apicv: add virtual x2apic support
Gleb Natapov wrote on 2013-01-10: On Thu, Jan 10, 2013 at 12:22:51PM +, Zhang, Yang Z wrote: Gleb Natapov wrote on 2013-01-10: On Thu, Jan 10, 2013 at 11:54:21AM +, Zhang, Yang Z wrote: The right logic should be: When register virtualization enabled, read all apic msr(except apic id reg and tmcct which have the hook) not cause vmexit and only write TPR not cause vmexit. When vid enabled, write to TPR, EOI and SELF IPI not cause vmexit. It's better to put the patch of envirtualize x2apic mode as first patch. There is no point whatsoever to enable virtualize x2apic without allowing at least one non intercepted access to x2apic MSR range and this is what your patch is doing when run on cpu without vid capability. No, at least read/write TPR will not cause vmexit if virtualize x2apic mode is enabled. For that you need to disable 808H MSR intercept, which your patch does not do. I want to do this in next patch. Then don't. There is no point in the patch that enables virtualize x2apic and does not allow at least one non-intercepted MSR access. As I said, read/write TPR will not cause vmexit if virtual x2apic is set. I am not sure whether I understand your comments right in previous discussion, here is my thinking: 1. enable virtualize x2apic mode if guest is really using x2apic and clear TPR in msr read and write bitmap. This will benefit reading TPR. 2. If APIC registers virtualization is enabled, clear all bit in rang 0x800-0x8ff(except apic id reg and tmcct). Clear all read bits in the range. 3. If vid is enabled, clear EOI and SELF IPI in msr write map. Looks OK. One concern you mentioned is that vid enabled and virtualize x2apic is disabled but guest still use x2apic. In this case, we still use MSR bitmap to intercept x2apic. This means the vEOI update will never happen. But we still can benefit from interrupt window. What interrupt windows? Without virtualized x2apic TPR/EOI virtualization will not happen and we rely on that in the code. Yes, but we can eliminate vmexit of interrupt window. Interrupt still can delivery to guest without vmexit when guest enables interrupt if vid is enabled. See SDM vol 3, 29.2.2. What does it matter that you eliminated vmexit of interrupt window if you broke everything else? The vid patch assumes that apic emulation either entirely happens in a software when vid is disabled or in a CPU if vid is enabled. Mixed mode will not work (apic-isr_count = 1 trick will break things for instance). And it is not worth it to complicate the code to make it work. Yes, you are right. It too complicated. Another question? Why not to hide x2apic capability to guest when vid is supported and virtual x2apic mode is not supported? It should be more reasonable than disable vid when virtual x2apic mode is unavailable. Best regards, Yang -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC PATCH 0/2] make mac programming for virtio net more robust
On Fri, Jan 11, 2013 at 12:53:07PM +1030, Rusty Russell wrote: Michael S. Tsirkin m...@redhat.com writes: On Thu, Jan 10, 2013 at 10:45:39PM +0800, ak...@redhat.com wrote: From: Amos Kong ak...@redhat.com Currenly mac is programmed byte by byte. This means that we have an intermediate step where mac is wrong. Second patch introduced a new vq control command to set mac address in one time. As you mention we could alternatively do it without new commands, simply add a feature bit that says that MACs are in the mac table. This would be a much bigger patch, and I'm fine with either way. Rusty what do you think? Hmm, mac filtering and my mac address are not quite the same thing. I don't know if it matters for anyone: does it? The mac address is abused for things like identifying machines, etc. I don't know either. I think net core differentiates between mac and uc_list because linux has to know which mac to use when building up packets, so at some level, I agree it might be useful to identify the machine. BTW netdev/davem should have been copied on this, Amos I think it's a good idea to remember to do it next time you post. If we keep it as a separate concept, Amos' patch seems to make sense. Yes. It also keeps the patch small, I just thought I'd mention the option. Cheers, Rusty. -- MST -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC PATCH 0/9] KVM: PPC: In-kernel PAPR interrupt controller emulation
On 10.01.2013, at 06:09, Paul Mackerras wrote: On Sat, Dec 15, 2012 at 01:06:13PM +1100, Benjamin Herrenschmidt wrote: On Sat, 2012-12-15 at 01:46 +0100, Alexander Graf wrote: On 05.11.2012, at 04:18, Paul Mackerras wrote: This series implements in-kernel emulation of the XICS interrupt controller specified in the PAPR specification and used by pseries guests. Since the XICS is organized a bit differently to the interrupt controllers used on x86 machines, I have defined some new ioctls for setting up the XICS and for saving and restoring its state. It may be possible to use some of the currently x86-specific ioctls instead. Is this one already following the new world order that we discussed during KVM Forum? :) The new world order (sorry, looks like nobody took notes and people expect me to do a write up from memory now ... :-) Well, mostly we agreed that the x86 stuff wasn't going to work for us. So basically what we discussed boils down to: - Move the existing generic KVM ioctl to create the kernel APIC to x86 since it's no as-is useful for other archs who, among other things, might need to pass argument based on the machine type (type of interrupt subsystem for example, etc...) Assuming you're talking about KVM_CREATE_IRQCHIP, it is already handled entirely in arch code (arch/x86/kvm/x86.c and arch/ia64/kvm/kvm-ia64.c), along with KVM_GET_IRQCHIP and KVM_SET_IRQCHIP. This part is about QEMU. - Once that's done, well, instanciating interrupt controller objects becomes pretty much an arch specific matter. We could try to keep some ioctls somewhat common though it's not *that* useful because the various types arguments are going to be fairly arch specific, so goes for configuration. Examples of what could be kept common are: * Create irq chip, takes at least a type argument, possibly a few more type-specific args (or a union, but then let's keep space in there since we can't change the size of the struct later as it would impact the ioctl number afaik). The existing KVM_CREATE_IRQCHIP is defined as _IO(...) which means that it doesn't read or write memory, but there is still the 3rd argument to the ioctl, which would give us 64 bits to indicate the type of the top-level IRQ controller (XICS, MPIC, ...), but not much more. It sounds like the agreement at the meeting was more along the lines of the KVM_CREATE_IRQCHIP_ARGS ioctl (as in the patches I posted) which can be called multiple times to instantiate pieces of the interrupt framework, rather than having a KVM_CREATE_IRQCHIP that gets called once early on to say that there we are using in-kernel interrupt controller emulation, followed by other calls to configure the various parts of the framework. Is that accurate? KVM_CREATE_IRQCHIP should get a parameter indicating the type of IRQCHIP (XICS / MPIC / APIC / VGIC / ...). The parameter-less version defaults to an arch specific default (APIC for x86, VGIC for arm, error on PPC). I'm not 100% sure yet whether we want to support models with multiple IRQCHIPs. If so, we need to return a device token from the KVM_CREATE_IRQCHIP ioctl. Maybe it makes more sense to model specific cases like this as separate type though with specific, explicit subdevice ids. Not sure. Other instantiation attributes we don't know that early on should be settable between the time frame of vm creation and first execution. An example for this are device addresses. Check out the threads KVM: ARM: Rename KVM_SET_DEVICE_ADDRESS and KVM: ARM: Introduce KVM_SET_DEVICE_ADDRESS ioctl for more information on this particular bit. Alex * Assigning the address of an irq chip when it can change (see ARM patches) - The existing business with irqfd etc... is mostly ok, except the interfaces messing around with MSIs (see virtio-pci use of kvm functions directly). The assignment of an irq number for an MSI must be a hook, probably a PCI controller hook, so x86 can get it done via its existing kernel interfaces and sane architectures can keep the assignment in qemu where it belongs. So, if I've understood you correctly about what was agreed, the set of ioctls that is implemented in the patches I posted is in line with what was agreed, isn't it? If not, where does it differ? (To recap, I had KVM_CREATE_IRQCHIP_ARGS, KVM_IRQCHIP_GET_SOURCES and KVM_IRQCHIP_SET_SOURCES, plus a one-reg interface to get/set the vcpu-specific state.) Paul. -- To unsubscribe from this list: send the line unsubscribe kvm-ppc in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PULL 0/8] ppc patch queue 2013-01-10
Hi Marcelo / Gleb, This is my current patch queue for ppc. Please pull. Highlights this time: - Book3S: enable potential sPAPR guest emulation on PR KVM on pHyp - BookE: EPR (External Proxy Register) support Alex The following changes since commit 908e7d7999bcce70ac52e7f390a8f5cbc55948de: Gleb Natapov (1): KVM: MMU: simplify folding of dirty bit into accessed_dirty are available in the git repository at: git://github.com/agraf/linux-2.6.git kvm-ppc-next Alexander Graf (6): KVM: PPC: Only WARN on invalid emulation KVM: PPC: Book3S: PR: Enable alternative instruction for SC 1 KVM: PPC: BookE: Allow irq deliveries to inject requests KVM: PPC: BookE: Emulate mfspr on EPR KVM: PPC: BookE: Implement EPR exit KVM: PPC: BookE: Add EPR ONE_REG sync Mihai Caraman (2): KVM: PPC: Fix SREGS documentation reference KVM: PPC: Fix mfspr/mtspr MMUCFG emulation Documentation/virtual/kvm/api.txt | 43 -- arch/powerpc/include/asm/kvm_host.h |2 + arch/powerpc/include/asm/kvm_ppc.h | 10 arch/powerpc/include/uapi/asm/kvm.h |6 - arch/powerpc/kvm/book3s_emulate.c | 30 arch/powerpc/kvm/book3s_pr.c|5 arch/powerpc/kvm/booke.c| 40 +++- arch/powerpc/kvm/booke_emulate.c|3 ++ arch/powerpc/kvm/emulate.c |5 arch/powerpc/kvm/powerpc.c | 13 +- include/linux/kvm_host.h|1 + include/uapi/linux/kvm.h|6 + 12 files changed, 153 insertions(+), 11 deletions(-) -- To unsubscribe from this list: send the line unsubscribe kvm-ppc in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 7/8] KVM: PPC: BookE: Implement EPR exit
The External Proxy Facility in FSL BookE chips allows the interrupt controller to automatically acknowledge an interrupt as soon as a core gets its pending external interrupt delivered. Today, user space implements the interrupt controller, so we need to check on it during such a cycle. This patch implements logic for user space to enable EPR exiting, disable EPR exiting and EPR exiting itself, so that user space can acknowledge an interrupt when an external interrupt has successfully been delivered into the guest vcpu. Signed-off-by: Alexander Graf ag...@suse.de --- Documentation/virtual/kvm/api.txt | 40 +- arch/powerpc/include/asm/kvm_host.h |2 + arch/powerpc/include/asm/kvm_ppc.h |9 +++ arch/powerpc/kvm/booke.c| 14 +++- arch/powerpc/kvm/powerpc.c | 10 include/linux/kvm_host.h|1 + include/uapi/linux/kvm.h|6 + 7 files changed, 79 insertions(+), 3 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 4fc2bfc..a98ed09 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -2246,8 +2246,8 @@ executed a memory-mapped I/O instruction which could not be satisfied by kvm. The 'data' member contains the written data if 'is_write' is true, and should be filled by application code otherwise. -NOTE: For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_DCR - and KVM_EXIT_PAPR the corresponding +NOTE: For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_DCR, + KVM_EXIT_PAPR and KVM_EXIT_EPR the corresponding operations are complete (and guest state is consistent) only after userspace has re-entered the kernel with KVM_RUN. The kernel side will first finish incomplete operations and then check for pending signals. Userspace @@ -2366,6 +2366,25 @@ interrupt for the target subchannel has been dequeued and subchannel_id, subchannel_nr, io_int_parm and io_int_word contain the parameters for that interrupt. ipb is needed for instruction parameter decoding. + /* KVM_EXIT_EPR */ + struct { + __u32 epr; + } epr; + +On FSL BookE PowerPC chips, the interrupt controller has a fast patch +interrupt acknowledge path to the core. When the core successfully +delivers an interrupt, it automatically populates the EPR register with +the interrupt vector number and acknowledges the interrupt inside +the interrupt controller. + +In case the interrupt controller lives in user space, we need to do +the interrupt acknowledge cycle through it to fetch the next to be +delivered interrupt vector using this exit. + +It gets triggered whenever both KVM_CAP_PPC_EPR are enabled and an +external interrupt has just been delivered into the guest. User space +should put the acknowledged interrupt vector into the 'epr' field. + /* Fix the size of the union. */ char padding[256]; }; @@ -2501,3 +2520,20 @@ handled in-kernel, while the other I/O instructions are passed to userspace. When this capability is enabled, KVM_EXIT_S390_TSCH will occur on TEST SUBCHANNEL intercepts. + +6.5 KVM_CAP_PPC_EPR + +Architectures: ppc +Parameters: args[0] defines whether the proxy facility is active +Returns: 0 on success; -1 on error + +This capability enables or disables the delivery of interrupts through the +external proxy facility. + +When enabled (args[0] != 0), every time the guest gets an external interrupt +delivered, it automatically exits into user space with a KVM_EXIT_EPR exit +to receive the topmost interrupt vector. + +When disabled (args[0] == 0), behavior is as if this facility is unsupported. + +When this capability is enabled, KVM_EXIT_EPR can occur. diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index ab49c6c..8a72d59 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -520,6 +520,8 @@ struct kvm_vcpu_arch { u8 sane; u8 cpu_type; u8 hcall_needed; + u8 epr_enabled; + u8 epr_needed; u32 cpr0_cfgaddr; /* holds the last set cpr0_cfgaddr */ diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 5f5f69a..493630e 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -264,6 +264,15 @@ static inline void kvm_linear_init(void) {} #endif +static inline void kvmppc_set_epr(struct kvm_vcpu *vcpu, u32 epr) +{ +#ifdef CONFIG_KVM_BOOKE_HV + mtspr(SPRN_GEPR, epr); +#elif defined(CONFIG_BOOKE) + vcpu-arch.epr = epr; +#endif +} + int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu, struct kvm_config_tlb *cfg); int kvm_vcpu_ioctl_dirty_tlb(struct kvm_vcpu *vcpu, diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 964f447..940ec80 100644 ---