Re: [PATCH 3/4] KVM: x86: inject nested page faults on emulated instructions
On Tue, Sep 02, 2014 at 05:13:49PM +0200, Paolo Bonzini wrote: This is required for the following patch to work correctly. If a nested page fault happens during emulation, we must inject a vmexit, not a page fault. Luckily we already have the required machinery: it is enough to return X86EMUL_INTERCEPTED instead of X86EMUL_PROPAGATE_FAULT. I wonder why this patch is needed. X86EMUL_PROPAGATE_FAULT causes ctxt-have_exception to be set to true in x86_emulate_insn(). x86_emulate_instruction() checks ctxt-have_exception and calls inject_emulated_exception() if it is true. inject_emulated_exception() calls kvm_propagate_fault() where we check if the fault was nested and generate vmexit or a page fault accordingly. Reported-by: Valentine Sinitsyn valentine.sinit...@gmail.com Signed-off-by: Paolo Bonzini pbonz...@redhat.com --- arch/x86/kvm/x86.c | 18 ++ 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e4ed85e07a01..9e3b74c044ed 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -416,6 +416,16 @@ void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) vcpu-arch.mmu.inject_page_fault(vcpu, fault); } +static inline int kvm_propagate_or_intercept(struct kvm_vcpu *vcpu, + struct x86_exception *exception) +{ + if (likely(!exception-nested_page_fault)) + return X86EMUL_PROPAGATE_FAULT; + + vcpu-arch.mmu.inject_page_fault(vcpu, exception); + return X86EMUL_INTERCEPTED; +} + void kvm_inject_nmi(struct kvm_vcpu *vcpu) { atomic_inc(vcpu-arch.nmi_queued); @@ -4122,7 +4132,7 @@ static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, int ret; if (gpa == UNMAPPED_GVA) - return X86EMUL_PROPAGATE_FAULT; + return kvm_propagate_or_intercept(vcpu, exception); ret = kvm_read_guest_page(vcpu-kvm, gpa PAGE_SHIFT, data, offset, toread); if (ret 0) { @@ -4152,7 +4162,7 @@ static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt, gpa_t gpa = vcpu-arch.walk_mmu-gva_to_gpa(vcpu, 
addr, access|PFERR_FETCH_MASK, exception); if (unlikely(gpa == UNMAPPED_GVA)) - return X86EMUL_PROPAGATE_FAULT; + return kvm_propagate_or_intercept(vcpu, exception); offset = addr (PAGE_SIZE-1); if (WARN_ON(offset + bytes PAGE_SIZE)) @@ -4203,7 +4213,7 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, int ret; if (gpa == UNMAPPED_GVA) - return X86EMUL_PROPAGATE_FAULT; + return kvm_propagate_or_intercept(vcpu, exception); ret = kvm_write_guest(vcpu-kvm, gpa, data, towrite); if (ret 0) { r = X86EMUL_IO_NEEDED; @@ -4350,7 +4360,7 @@ static int emulator_read_write_onepage(unsigned long addr, void *val, ret = vcpu_mmio_gva_to_gpa(vcpu, addr, gpa, exception, write); if (ret 0) - return X86EMUL_PROPAGATE_FAULT; + return kvm_propagate_or_intercept(vcpu, exception); /* For APIC access vmexit */ if (ret) -- 1.8.3.1 -- Gleb. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2 03/15] arm/arm64: KVM: refactor vgic_handle_mmio() function
On 2014/8/21 21:06, Andre Przywara wrote: Currently we only need to deal with one MMIO region for the GIC emulation, but we soon need to extend this. Refactor the existing code to allow easier addition of different ranges without code duplication. Signed-off-by: Andre Przywara andre.przyw...@arm.com --- virt/kvm/arm/vgic.c | 72 --- 1 file changed, 51 insertions(+), 21 deletions(-) diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c index bba8692..3b6f78d 100644 --- a/virt/kvm/arm/vgic.c +++ b/virt/kvm/arm/vgic.c @@ -925,37 +925,28 @@ static bool vgic_validate_access(const struct vgic_dist *dist, return true; } -/** - * vgic_handle_mmio - handle an in-kernel MMIO access +/* + * vgic_handle_mmio_range - handle an in-kernel MMIO access * @vcpu:pointer to the vcpu performing the access * @run: pointer to the kvm_run structure * @mmio:pointer to the data describing the access + * @ranges: pointer to the register defining structure + * @mmio_base: base address for this mapping * - * returns true if the MMIO access has been performed in kernel space, - * and false if it needs to be emulated in user space. 
+ * returns true if the MMIO access could be performed */ -bool vgic_handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *run, - struct kvm_exit_mmio *mmio) +static bool vgic_handle_mmio_range(struct kvm_vcpu *vcpu, struct kvm_run *run, + struct kvm_exit_mmio *mmio, + const struct mmio_range *ranges, + unsigned long mmio_base) { const struct mmio_range *range; struct vgic_dist *dist = vcpu-kvm-arch.vgic; - unsigned long base = dist-vgic_dist_base; bool updated_state; unsigned long offset; - if (!irqchip_in_kernel(vcpu-kvm) || - mmio-phys_addr base || - (mmio-phys_addr + mmio-len) (base + KVM_VGIC_V2_DIST_SIZE)) - return false; - - /* We don't support ldrd / strd or ldm / stm to the emulated vgic */ - if (mmio-len 4) { - kvm_inject_dabt(vcpu, mmio-phys_addr); - return true; - } - - offset = mmio-phys_addr - base; - range = find_matching_range(vgic_dist_ranges, mmio, offset); + offset = mmio-phys_addr - mmio_base; + range = find_matching_range(ranges, mmio, offset); if (unlikely(!range || !range-handle_mmio)) { pr_warn(Unhandled access %d %08llx %d\n, mmio-is_write, mmio-phys_addr, mmio-len); @@ -963,7 +954,7 @@ bool vgic_handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *run, } spin_lock(vcpu-kvm-arch.vgic.lock); - offset = mmio-phys_addr - range-base - base; + offset -= range-base; if (vgic_validate_access(dist, range, offset)) { updated_state = range-handle_mmio(vcpu, mmio, offset); } else { @@ -981,6 +972,45 @@ bool vgic_handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *run, return true; } Andre, is the IS_IN_RANGE definition wrong? Compared with Marc's old code, the upper-bound comparison seems to be missing a "=" (it should be "<=" rather than "<").
#define IS_IN_RANGE(addr, alen, base, len) \ (((addr) >= (base)) && (((addr) + (alen)) <= ((base) + (len)))) +#define IS_IN_RANGE(addr, alen, base, len) \ + (((addr) >= (base)) && (((addr) + (alen)) < ((base) + (len)))) + +static bool vgic_v2_handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *run, + struct kvm_exit_mmio *mmio) +{ + unsigned long base = vcpu->kvm->arch.vgic.vgic_dist_base; + + if (!IS_IN_RANGE(mmio->phys_addr, mmio->len, base, + KVM_VGIC_V2_DIST_SIZE)) + return false; + + /* GICv2 does not support accesses wider than 32 bits */ + if (mmio->len > 4) { + kvm_inject_dabt(vcpu, mmio->phys_addr); + return true; + } + + return vgic_handle_mmio_range(vcpu, run, mmio, vgic_dist_ranges, base); +} + +/** + * vgic_handle_mmio - handle an in-kernel MMIO access for the GIC emulation + * @vcpu: pointer to the vcpu performing the access + * @run: pointer to the kvm_run structure + * @mmio: pointer to the data describing the access + * + * returns true if the MMIO access has been performed in kernel space, + * and false if it needs to be emulated in user space. + */ +bool vgic_handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *run, + struct kvm_exit_mmio *mmio) +{ + if (!irqchip_in_kernel(vcpu->kvm)) + return false; + + return vgic_v2_handle_mmio(vcpu, run, mmio); +} + static u8 *vgic_get_sgi_sources(struct vgic_dist *dist, int vcpu_id, int sgi) { return dist->irq_sgi_sources + vcpu_id * VGIC_NR_SGIS + sgi; -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo
Re: [Qemu-devel] [question] virtio-blk performance degradation happened with virtio-serial
If virtio-blk and virtio-serial share an IRQ, the guest operating system has to check each virtqueue for activity. Maybe there is some inefficiency doing that. AFAIK virtio-serial registers 64 virtqueues (on 31 ports + console) even if everything is unused. That could be the case if MSI is disabled. Do the windows virtio drivers enable MSIs, in their inf file? It depends on the version of the drivers, but it is a reasonable guess at what differs between Linux and Windows. Haoyu, can you give us the output of lspci from a Linux guest? I made a test with fio on rhel-6.5 guest, the same degradation happened too, this degradation can be reproduced on rhel6.5 guest 100%. virtio_console module installed: 64K-write-sequence: 285 MBPS, 4380 IOPS virtio_console module uninstalled: 64K-write-sequence: 370 MBPS, 5670 IOPS And, virio-blk's interrupt mode always is MSI, no matter if virtio_console module is installed or uninstalled. 25:2245933 PCI-MSI-edge virtio1-requests fio command: fio -filename /dev/vda -direct=1 -iodepth=1 -thread -rw=write -ioengine=psync -bs=64k -size=30G -numjobs=1 -name=mytest QEMU comamnd: /usr/bin/kvm -id 5497356709352 -chardev socket,id=qmp,path=/var/run/qemu-server/5497356709352.qmp,server,nowait -mon chardev=qmp,mode=control -vnc :0,websocket,to=200 -enable-kvm -pidfile /var/run/qemu-server/5497356709352.pid -daemonize -name io-test-rhel-6.5 -smp sockets=1,cores=1 -cpu core2duo -nodefaults -vga cirrus -no-hpet -k en-us -boot menu=on,splash-time=8000 -m 4096 -usb -drive file=/sf/data/local/zhanghaoyu/rhel-server-6.5-x86_64-dvd.iso,if=none,id=drive-ide0,media=cdrom,aio=native,forecast=disable -device ide-cd,bus=ide.0,unit=0,drive=drive-ide0,id=ide0,bootindex=200 -drive file=/sf/data/local/images/host-1051721dff13/io-test-rhel-6.5.vm/vm-disk-1.qcow2,if=none,id=drive-virtio1,cache=none,aio=native -device virtio-blk-pci,drive=drive-virtio1,id=virtio1,bus=pci.0,addr=0xb -drive 
file=/sf/data/local/images/host-1051721dff13/io-test-rhel-6.5.vm/vm-disk-2.qcow2,if=none,id=drive-virtio2,cache=none,aio=native -device virtio-blk-pci,drive=drive-virtio2,id=virtio2,bus=pci .0,addr=0xc,bootindex=101 -netdev type=tap,id=net0,ifname=164922379979200,script=/sf/etc/kvm/vtp-bridge,vhost=on,vhostforce=on -device virtio-net-pci,mac=FE:FC:FE:C6:47:F6,netdev=net0,bus=pci.0,addr=0x12,id=net0,bootindex=300 -rtc driftfix=slew,clock=rt -global kvm-pit.lost_tick_policy=discard -global PIIX4_PM.disable_s3=1 -global PIIX4_PM.disable_s4=1 -chardev socket,path=/run/virtser/1649223799792.sock,server,nowait,id=channelser -device virtio-serial,vectors=4 -device virtserialport,chardev=channelser,name=channelser.virtserial0.0 [environment] Host:linux-3.10(RHEL7-rc1) QEMU: qemu-2.0.1 Guest: RHEL6.5 # lspci -tv -[:00]-+-00.0 Intel Corporation 440FX - 82441FX PMC [Natoma] +-01.0 Intel Corporation 82371SB PIIX3 ISA [Natoma/Triton II] +-01.1 Intel Corporation 82371SB PIIX3 IDE [Natoma/Triton II] +-01.2 Intel Corporation 82371SB PIIX3 USB [Natoma/Triton II] +-01.3 Intel Corporation 82371AB/EB/MB PIIX4 ACPI +-02.0 Cirrus Logic GD 5446 +-03.0 Red Hat, Inc Virtio console +-0b.0 Red Hat, Inc Virtio block device +-0c.0 Red Hat, Inc Virtio block device \-12.0 Red Hat, Inc Virtio network device # lspci -vvv 00:00.0 Host bridge: Intel Corporation 440FX - 82441FX PMC [Natoma] (rev 02) Subsystem: Red Hat, Inc Qemu virtual machine Control: I/O+ Mem+ BusMaster- SpecCycle- MemWINV- VGASnoop- ParErr- Stepping- SERR+ FastB2B- DisINTx- Status: Cap- 66MHz- UDF- FastB2B- ParErr- DEVSEL=fast TAbort- TAbort- MAbort- SERR- PERR- INTx- 00:01.0 ISA bridge: Intel Corporation 82371SB PIIX3 ISA [Natoma/Triton II] Subsystem: Red Hat, Inc Qemu virtual machine Control: I/O+ Mem+ BusMaster- SpecCycle- MemWINV- VGASnoop- ParErr- Stepping- SERR+ FastB2B- DisINTx- Status: Cap- 66MHz- UDF- FastB2B- ParErr- DEVSEL=medium TAbort- TAbort- MAbort- SERR- PERR- INTx- 00:01.1 IDE interface: Intel Corporation 82371SB 
PIIX3 IDE [Natoma/Triton II] (prog-if 80 [Master]) Subsystem: Red Hat, Inc Qemu virtual machine Control: I/O+ Mem+ BusMaster+ SpecCycle- MemWINV- VGASnoop- ParErr- Stepping- SERR+ FastB2B- DisINTx- Status: Cap- 66MHz- UDF- FastB2B+ ParErr- DEVSEL=medium TAbort- TAbort- MAbort- SERR- PERR- INTx- Latency: 0 Region 0: [virtual] Memory at 01f0 (32-bit, non-prefetchable) [size=8] Region 1: [virtual] Memory at 03f0 (type 3, non-prefetchable) Region 2: [virtual] Memory at 0170 (32-bit, non-prefetchable) [size=8] Region 3: [virtual] Memory at 0370 (type 3, non-prefetchable) Region 4: I/O ports at c0e0 [size=16] Kernel driver in use: ata_piix Kernel modules: ata_generic, pata_acpi, ata_piix 00:01.2 USB controller: Intel
Re: kvm-unit-test failures
Il 03/09/2014 20:25, Chris J Arges ha scritto: snip I'm not sure about the reason for the warp, but indeed the offset and uptime match (I'll check them against the trace tomorrow) so it's just that the VM's TSC base is not taken into account correctly. Can you gather another trace with the problematic patch reverted? Paolo Here is the third trace running with 0d3da0d2 reverted from the latest kvm queue branch 11cc9ea3: http://people.canonical.com/~arges/kvm/trace-3.dat.xz Thanks! And---yay!---I reproduced it on another machine. Paolo $ uptime 18:25:13 up 5 min, 1 user, load average: 0.21, 0.74, 0.44 qemu-system-x86_64 -enable-kvm -device pc-testdev -device isa-debug-exit,iobase=0xf4,iosize=0x4 -display none -serial stdio -device pci-testdev -kernel x86/kvmclock_test.flat -smp 2 --append 1000 1409768537 enabling apic enabling apic kvm-clock: cpu 0, msr 0x:44e520 kvm-clock: cpu 0, msr 0x:44e520 Wallclock test, threshold 5 Seconds get from host: 1409768537 Seconds get from kvmclock: 1409768538 Offset:1 Wallclock test, threshold 5 Seconds get from host: 1409768537 Seconds get from kvmclock: 1409768538 Offset:1 Check the stability of raw cycle ... Total vcpus: 2 Test loops: 1000 Total warps: 0 Total stalls: 0 Worst warp: 0 Raw cycle is stable Monotonic cycle test: Total vcpus: 2 Test loops: 1000 Total warps: 0 Total stalls: 0 Worst warp: 0 Measure the performance of raw cycle ... Total vcpus: 2 Test loops: 1000 TSC cycles: 1241970306 Measure the performance of adjusted cycle ... Total vcpus: 2 Test loops: 1000 TSC cycles: 3266701026 Return value from qemu: 1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC][patch 3/6] KVM: s390: Add GISA support
From: Frank Blaschka frank.blasc...@de.ibm.com This patch adds GISA (Guest Interrupt State Area) support to s390 kvm. GISA can be used for exitless interrupts. The patch provides a set of functions for GISA related operations like accessing GISA fields or registering ISCs for alert. Exploiters of GISA will follow with additional patches. Signed-off-by: Frank Blaschka frank.blasc...@de.ibm.com --- arch/s390/include/asm/kvm_host.h | 72 arch/s390/kvm/kvm-s390.c | 167 +++ arch/s390/kvm/kvm-s390.h | 28 ++ 3 files changed, 265 insertions(+), 2 deletions(-) --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -129,11 +129,12 @@ struct kvm_s390_sie_block { __u8reserved60; /* 0x0060 */ __u8ecb;/* 0x0061 */ __u8ecb2; /* 0x0062 */ - __u8reserved63[1]; /* 0x0063 */ + __u8ecb3; /* 0x0063 */ __u32 scaol; /* 0x0064 */ __u8reserved68[4]; /* 0x0068 */ __u32 todpr; /* 0x006c */ - __u8reserved70[32]; /* 0x0070 */ + __u32 gd; /* 0x0070 */ + __u8reserved74[28]; /* 0x0074 */ psw_t gpsw; /* 0x0090 */ __u64 gg14; /* 0x00a0 */ __u64 gg15; /* 0x00a8 */ @@ -300,6 +301,70 @@ struct kvm_s390_interrupt_info { #define ACTION_STORE_ON_STOP (10) #define ACTION_STOP_ON_STOP(11) +#define KVM_S390_GISA_FORMAT_0 0 +#define KVM_S390_GISA_FORMAT_1 1 + +struct kvm_s390_gisa_f0 { + u32 next_alert; + u8 ipm; + u16 rsv0:14; + u16 g:1; + u16 c:1; + u8 iam; + u32 rsv1; + u32 count; +} __packed; + +struct kvm_s390_gisa_f1 { + u32 next_alert; + u8 ipm; + u8 simm; + u8 nimm; + u8 iam; + u64 aisma; + u32 rsv0:6; + u32 g:1; + u32 c:1; + u32 rsv1:24; + u64 rsv2; + u32 count; +} __packed; + +union kvm_s390_gisa { + struct kvm_s390_gisa_f0 f0; + struct kvm_s390_gisa_f1 f1; +}; + +struct kvm_s390_gait { + u32 gd; + u16 : 5; + u16 gisc : 3; + u16 rpu : 8; + u16: 10; + u16 gaisbo : 6; + u64 gaisba; +} __packed; + +struct kvm_s390_aifte { + u64 faisba; + u64 gaita; + u16 simm : 8; + u16 : 5; + u16 afi : 3; + u16 reserved1; + u16 reserved2; + u16 faal; +} __packed; + +struct kvm_s390_gib { + 
u32 alo; + u32 reserved1; + u32 : 5; + u32 nisc : 3; + u32 : 24; + u8 reserverd2[20]; +} __packed; + struct kvm_s390_local_interrupt { spinlock_t lock; struct list_head list; @@ -420,6 +485,9 @@ struct kvm_arch{ struct s390_io_adapter *adapters[MAX_S390_IO_ADAPTERS]; wait_queue_head_t ipte_wq; spinlock_t start_stop_lock; + union kvm_s390_gisa *gisa; + unsigned long iam; + atomic_t in_sie; }; #define KVM_HVA_ERR_BAD(-1UL) --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -404,6 +404,16 @@ long kvm_arch_vm_ioctl(struct file *filp return r; } +static u8 kvm_s390_gisa_get_alert_mask(struct kvm *kvm) +{ + return (u8)ACCESS_ONCE(kvm-arch.iam); +} + +static void kvm_s390_gisa_set_alert_mask(struct kvm *kvm, u8 iam) +{ + xchg(kvm-arch.iam, iam); +} + int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { int rc; @@ -461,6 +471,14 @@ int kvm_arch_init_vm(struct kvm *kvm, un kvm-arch.css_support = 0; kvm-arch.use_irqchip = 0; + kvm-arch.gisa = (union kvm_s390_gisa *)get_zeroed_page( + GFP_KERNEL | GFP_DMA); + if (!kvm-arch.gisa) + goto out_nogmap; + kvm_s390_gisa_set_next_alert(kvm, (u32)(unsigned long)kvm-arch.gisa); + kvm_s390_gisa_set_alert_mask(kvm, 0); + atomic_set(kvm-arch.in_sie, 0); + spin_lock_init(kvm-arch.start_stop_lock); return 0; @@ -520,6 +538,7 @@ void kvm_arch_sync_events(struct kvm *kv void kvm_arch_destroy_vm(struct kvm *kvm) { + free_page((unsigned long)kvm-arch.gisa); kvm_free_vcpus(kvm); free_page((unsigned long)(kvm-arch.sca)); debug_unregister(kvm-arch.dbf); @@ -656,6 +675,19 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu return rc; } +u32 kvm_s390_gisa_get_fmt(void) +{ + if (test_facility(70) || test_facility(72)) + return KVM_S390_GISA_FORMAT_1; + else + return KVM_S390_GISA_FORMAT_0; +} + +static u32 kvm_s390_build_gd(struct kvm *kvm) +{ + return (u32)(unsigned long)kvm-arch.gisa | kvm_s390_gisa_get_fmt(); +} + struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
[RFC][patch 5/6] s390: Add PCI bus support
From: Frank Blaschka frank.blasc...@de.ibm.com This patch implements a pci bus for s390x together with some infrastructure to generate and handle hotplug events. It also provides device configuration/unconfiguration via sclp instruction interception. Signed-off-by: Frank Blaschka frank.blasc...@de.ibm.com --- default-configs/s390x-softmmu.mak |1 hw/s390x/Makefile.objs|1 hw/s390x/css.c|5 hw/s390x/css.h|1 hw/s390x/s390-pci-bus.c | 287 ++ hw/s390x/s390-pci-bus.h | 139 ++ hw/s390x/s390-virtio-ccw.c|2 hw/s390x/sclp.c | 10 + include/hw/s390x/sclp.h |8 + target-s390x/Makefile.objs|2 target-s390x/ioinst.c | 52 ++ target-s390x/ioinst.h |1 target-s390x/kvm.c|5 target-s390x/pci_ic.c | 230 ++ target-s390x/pci_ic.h | 214 15 files changed, 956 insertions(+), 2 deletions(-) --- a/default-configs/s390x-softmmu.mak +++ b/default-configs/s390x-softmmu.mak @@ -1,4 +1,5 @@ CONFIG_VIRTIO=y +CONFIG_PCI=y CONFIG_SCLPCONSOLE=y CONFIG_S390_FLIC=y CONFIG_S390_FLIC_KVM=$(CONFIG_KVM) --- a/hw/s390x/Makefile.objs +++ b/hw/s390x/Makefile.objs @@ -8,3 +8,4 @@ obj-y += ipl.o obj-y += css.o obj-y += s390-virtio-ccw.o obj-y += virtio-ccw.o +obj-$(CONFIG_KVM) += s390-pci-bus.o --- a/hw/s390x/css.c +++ b/hw/s390x/css.c @@ -1281,6 +1281,11 @@ void css_generate_chp_crws(uint8_t cssid /* TODO */ } +void css_generate_css_crws(uint8_t cssid) +{ +css_queue_crw(CRW_RSC_CSS, 0, 0, 0); +} + int css_enable_mcsse(void) { trace_css_enable_facility(mcsse); --- a/hw/s390x/css.h +++ b/hw/s390x/css.h @@ -99,6 +99,7 @@ void css_queue_crw(uint8_t rsc, uint8_t void css_generate_sch_crws(uint8_t cssid, uint8_t ssid, uint16_t schid, int hotplugged, int add); void css_generate_chp_crws(uint8_t cssid, uint8_t chpid); +void css_generate_css_crws(uint8_t cssid); void css_adapter_interrupt(uint8_t isc); #define CSS_IO_ADAPTER_VIRTIO 1 --- /dev/null +++ b/hw/s390x/s390-pci-bus.c @@ -0,0 +1,287 @@ +/* + * s390 PCI BUS + * + * Copyright 2014 IBM Corp. 
+ * Author(s): Frank Blaschka frank.blasc...@de.ibm.com + *Hong Bo Li lih...@cn.ibm.com + *Yi Min Zhao zyi...@cn.ibm.com + * + * This work is licensed under the terms of the GNU GPL, version 2 or (at + * your option) any later version. See the COPYING file in the top-level + * directory. + */ + +#include hw/pci/pci.h +#include hw/s390x/css.h +#include hw/s390x/sclp.h +#include qemu/error-report.h +#include s390-pci-bus.h + +/* #define DEBUG_S390PCI_BUS */ +#ifdef DEBUG_S390PCI_BUS +#define DPRINTF(fmt, ...) \ +do { fprintf(stderr, S390pci-bus: fmt, ## __VA_ARGS__); } while (0) +#else +#define DPRINTF(fmt, ...) \ +do { } while (0) +#endif + +static QTAILQ_HEAD(, SeiContainer) pending_sei = +QTAILQ_HEAD_INITIALIZER(pending_sei); +static QTAILQ_HEAD(, S390PCIBusDevice) device_list = +QTAILQ_HEAD_INITIALIZER(device_list); + +int chsc_sei_nt2_get_event(void *res) +{ +ChscSeiNt2Res *nt2_res = (ChscSeiNt2Res *)res; +PciCcdfAvail *accdf; +PciCcdfErr *eccdf; +int rc = 1; +SeiContainer *sei_cont; + +sei_cont = QTAILQ_FIRST(pending_sei); +if (sei_cont) { +QTAILQ_REMOVE(pending_sei, sei_cont, link); +nt2_res-nt = 2; +nt2_res-cc = sei_cont-cc; +switch (sei_cont-cc) { +case 1: /* error event */ +eccdf = (PciCcdfErr *)nt2_res-ccdf; +eccdf-fid = cpu_to_be32(sei_cont-fid); +eccdf-fh = cpu_to_be32(sei_cont-fh); +break; +case 2: /* availability event */ +accdf = (PciCcdfAvail *)nt2_res-ccdf; +accdf-fid = cpu_to_be32(sei_cont-fid); +accdf-fh = cpu_to_be32(sei_cont-fh); +accdf-pec = cpu_to_be16(sei_cont-pec); +break; +default: +abort(); +} +g_free(sei_cont); +rc = 0; +} + +return rc; +} + +int chsc_sei_nt2_have_event(void) +{ +return !QTAILQ_EMPTY(pending_sei); +} + +static S390PCIBusDevice *s390_pci_find_dev_by_fid(uint32_t fid) +{ +S390PCIBusDevice *pbdev; + +QTAILQ_FOREACH(pbdev, device_list, next) { +if (pbdev-fid == fid) { +return pbdev; +} +} +return NULL; +} + +void s390_pci_sclp_configure(int configure, SCCB *sccb) +{ +PciCfgSccb *psccb = (PciCfgSccb *)sccb; +S390PCIBusDevice 
*pbdev = s390_pci_find_dev_by_fid(be32_to_cpu(psccb-aid)); +uint16_t rc; + +if (pbdev) { +if ((configure == 1 pbdev-configured == true) || +(configure == 0 pbdev-configured == false)) { +rc = SCLP_RC_NO_ACTION_REQUIRED; +} else { +pbdev-configured =
[RFC][patch 4/6] KVM: s390: Add PCI pass-through support
From: Frank Blaschka frank.blasc...@de.ibm.com This patch implements PCI pass-through kernel support for s390. Design approach is very similar to the x86 device assignment. User space executes the KVM_ASSIGN_PCI_DEVICE ioctl to create a proxy instance in the kernel KVM and connect this instance to the host pci device. s390 pci instructions are intercepted in kernel and operations are passed directly to the assigned pci device. To take advantage of all system z specific virtualization features we need to access the SIE control block residing in KVM. Also we have to enable z pci devices with special configuration information coming from the SIE block as well. Signed-off-by: Frank Blaschka frank.blasc...@de.ibm.com --- arch/s390/include/asm/kvm_host.h |1 arch/s390/kvm/Makefile |2 arch/s390/kvm/intercept.c|1 arch/s390/kvm/kvm-s390.c | 33 arch/s390/kvm/kvm-s390.h | 17 arch/s390/kvm/pci.c | 2130 +++ arch/s390/kvm/priv.c | 21 7 files changed, 2202 insertions(+), 3 deletions(-) --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -488,6 +488,7 @@ struct kvm_arch{ union kvm_s390_gisa *gisa; unsigned long iam; atomic_t in_sie; + struct list_head ppt_dev_list; }; #define KVM_HVA_ERR_BAD(-1UL) --- a/arch/s390/kvm/Makefile +++ b/arch/s390/kvm/Makefile @@ -12,6 +12,6 @@ common-objs = $(KVM)/kvm_main.o $(KVM)/e ccflags-y := -Ivirt/kvm -Iarch/s390/kvm kvm-objs := $(common-objs) kvm-s390.o intercept.o interrupt.o priv.o sigp.o -kvm-objs += diag.o gaccess.o guestdbg.o +kvm-objs += diag.o gaccess.o guestdbg.o pci.o obj-$(CONFIG_KVM) += kvm.o --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -34,6 +34,7 @@ static const intercept_handler_t instruc [0xb6] = kvm_s390_handle_stctl, [0xb7] = kvm_s390_handle_lctl, [0xb9] = kvm_s390_handle_b9, + [0xe3] = kvm_s390_handle_e3, [0xe5] = kvm_s390_handle_e5, [0xeb] = kvm_s390_handle_eb, }; --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -397,6 +397,24 @@ long kvm_arch_vm_ioctl(struct 
file *filp r = kvm_s390_vm_has_attr(kvm, attr); break; } + case KVM_ASSIGN_PCI_DEVICE: { + struct kvm_assigned_pci_dev assigned_dev; + + r = -EFAULT; + if (copy_from_user(assigned_dev, argp, sizeof(assigned_dev))) + break; + r = kvm_s390_ioctrl_assign_pci(kvm, assigned_dev); + break; + } + case KVM_DEASSIGN_PCI_DEVICE: { + struct kvm_assigned_pci_dev assigned_dev; + + r = -EFAULT; + if (copy_from_user(assigned_dev, argp, sizeof(assigned_dev))) + break; + r = kvm_s390_ioctrl_deassign_pci(kvm, assigned_dev); + break; + } default: r = -ENOTTY; } @@ -478,6 +496,7 @@ int kvm_arch_init_vm(struct kvm *kvm, un kvm_s390_gisa_set_next_alert(kvm, (u32)(unsigned long)kvm-arch.gisa); kvm_s390_gisa_set_alert_mask(kvm, 0); atomic_set(kvm-arch.in_sie, 0); + INIT_LIST_HEAD(kvm-arch.ppt_dev_list); spin_lock_init(kvm-arch.start_stop_lock); @@ -538,6 +557,7 @@ void kvm_arch_sync_events(struct kvm *kv void kvm_arch_destroy_vm(struct kvm *kvm) { + s390_pci_cleanup(kvm); free_page((unsigned long)kvm-arch.gisa); kvm_free_vcpus(kvm); free_page((unsigned long)(kvm-arch.sca)); @@ -656,7 +676,10 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu vcpu-arch.sie_block-ecb |= 0x10; vcpu-arch.sie_block-ecb2 = 8; - vcpu-arch.sie_block-eca = 0xD1002000U; + vcpu-arch.sie_block-eca = 0xD1202000U; + vcpu-arch.sie_block-ecb2 |= 0x02; + vcpu-arch.sie_block-ecb3 = 0x20; + if (sclp_has_siif()) vcpu-arch.sie_block-eca |= 1; vcpu-arch.sie_block-fac = (int) (long) vfacilities; @@ -1920,6 +1943,12 @@ static int __init kvm_s390_init(void) if (ret) return ret; + ret = s390_pci_init(); + if (ret) { + kvm_exit(); + return ret; + } + /* * guests can ask for up to 255+1 double words, we need a full page * to hold the maximum amount of facilities. 
On the other hand, we @@ -1932,7 +1961,7 @@ static int __init kvm_s390_init(void) } memcpy(vfacilities, S390_lowcore.stfle_fac_list, 16); vfacilities[0] = 0xff82fff3f4fc2000UL; - vfacilities[1] = 0x005cUL; + vfacilities[1] = 0x07dcUL; return 0; } --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -167,6 +167,7 @@ int kvm_s390_mask_adapter(struct kvm *kv /* implemented in priv.c */ int is_valid_psw(psw_t *psw); int
[RFC][patch 2/6] s390: pci: export pci functions for pass-through usage
From: Frank Blaschka frank.blasc...@de.ibm.com This patch exports a couple of zPCI functions. The new pci pass-through driver for KVM will use this functions to enable the device with virtualization information and update the device dma translation table on the host. We add a new interface to purge the translation table of a device. Also we moved some zPCI functions to the pci_insn header file. Signed-off-by: Frank Blaschka frank.blasc...@de.ibm.com --- arch/s390/include/asm/pci.h |6 ++ arch/s390/include/asm/pci_clp.h |3 - arch/s390/include/asm/pci_insn.h | 92 arch/s390/pci/pci_clp.c |4 + arch/s390/pci/pci_dma.c | 24 - arch/s390/pci/pci_insn.c | 97 --- 6 files changed, 126 insertions(+), 100 deletions(-) --- a/arch/s390/include/asm/pci.h +++ b/arch/s390/include/asm/pci.h @@ -140,6 +140,7 @@ int zpci_register_ioat(struct zpci_dev * int zpci_unregister_ioat(struct zpci_dev *, u8); /* CLP */ +u8 clp_instr(void *data); int clp_scan_pci_devices(void); int clp_rescan_pci_devices(void); int clp_rescan_pci_devices_simple(void); @@ -177,6 +178,11 @@ struct zpci_dev *get_zdev_by_fid(u32); /* DMA */ int zpci_dma_init(void); void zpci_dma_exit(void); +int dma_update_trans(struct zpci_dev *zdev, unsigned long pa, +dma_addr_t dma_addr, size_t size, int flags); +void dma_update_cpu_trans(struct zpci_dev *zdev, void *page_addr, + dma_addr_t dma_addr, int flags); +void dma_purge_rto_entries(struct zpci_dev *zdev); /* FMB */ int zpci_fmb_enable_device(struct zpci_dev *); --- a/arch/s390/include/asm/pci_clp.h +++ b/arch/s390/include/asm/pci_clp.h @@ -148,7 +148,8 @@ struct clp_req_set_pci { u16 reserved2; u8 oc; /* operation controls */ u8 ndas;/* number of dma spaces */ - u64 reserved3; + u32 reserved3; + u32 gd; /* GISA Designation */ } __packed; /* Set PCI function response */ --- a/arch/s390/include/asm/pci_insn.h +++ b/arch/s390/include/asm/pci_insn.h @@ -1,6 +1,8 @@ #ifndef _ASM_S390_PCI_INSN_H #define _ASM_S390_PCI_INSN_H +#include asm/processor.h + /* Load/Store status codes 
*/ #define ZPCI_PCI_ST_FUNC_NOT_ENABLED 4 #define ZPCI_PCI_ST_FUNC_IN_ERR8 @@ -83,4 +85,94 @@ int zpci_store(u64 data, u64 req, u64 of int zpci_store_block(const u64 *data, u64 req, u64 offset); void zpci_set_irq_ctrl(u16 ctl, char *unused, u8 isc); +static inline u8 __mpcifc(u64 req, struct zpci_fib *fib, u8 *status) +{ + u8 cc; + + asm volatile ( + .insn rxy,0xe3d0,%[req],%[fib]\n + ipm %[cc]\n + srl %[cc],28\n + : [cc] =d (cc), [req] +d (req), [fib] +Q (*fib) + : : cc); + *status = req 24 0xff; + return cc; +} + +static inline u8 __rpcit(u64 fn, u64 addr, u64 range, u8 *status) +{ + register u64 __addr asm(2) = addr; + register u64 __range asm(3) = range; + u8 cc; + + asm volatile ( + .insn rre,0xb9d3,%[fn],%[addr]\n + ipm %[cc]\n + srl %[cc],28\n + : [cc] =d (cc), [fn] +d (fn) + : [addr] d (__addr), d (__range) + : cc); + *status = fn 24 0xff; + return cc; +} + +static inline int __pcilg(u64 *data, u64 req, u64 offset, u8 *status) +{ + register u64 __req asm(2) = req; + register u64 __offset asm(3) = offset; + int cc = -ENXIO; + u64 __data; + + asm volatile ( + .insn rre,0xb9d2,%[data],%[req]\n + 0: ipm %[cc]\n + srl %[cc],28\n + 1:\n + EX_TABLE(0b, 1b) + : [cc] +d (cc), [data] =d (__data), [req] +d (__req) + : d (__offset) + : cc); + *status = __req 24 0xff; + if (!cc) + *data = __data; + + return cc; +} + +static inline int __pcistg(u64 data, u64 req, u64 offset, u8 *status) +{ + register u64 __req asm(2) = req; + register u64 __offset asm(3) = offset; + int cc = -ENXIO; + + asm volatile ( + .insn rre,0xb9d0,%[data],%[req]\n + 0: ipm %[cc]\n + srl %[cc],28\n + 1:\n + EX_TABLE(0b, 1b) + : [cc] +d (cc), [req] +d (__req) + : d (__offset), [data] d (data) + : cc); + *status = __req 24 0xff; + return cc; +} + +static inline int __pcistb(const u64 *data, u64 req, u64 offset, u8 *status) +{ + int cc = -ENXIO; + + asm volatile ( + .insn rsy,0xebd0,%[req],%[offset],%[data]\n +
[RFC][patch 6/6] s390: Add PCI pass-through device support
From: Frank Blaschka frank.blasc...@de.ibm.com This patch adds a new device class handling s390 pci pass-through device assignment. The approach is very similar to the x86 device assignment. The device executes the KVM_ASSIGN_PCI_DEVICE ioctl to create a proxy instance in the kernel KVM and connect this instance to the host pci device. Signed-off-by: Frank Blaschka frank.blasc...@de.ibm.com --- hw/s390x/Makefile.objs |2 hw/s390x/s390-pci-bus.c | 14 +- hw/s390x/s390_pci.c | 321 hw/s390x/s390_pci.h | 31 4 files changed, 365 insertions(+), 3 deletions(-) --- a/hw/s390x/Makefile.objs +++ b/hw/s390x/Makefile.objs @@ -8,4 +8,4 @@ obj-y += ipl.o obj-y += css.o obj-y += s390-virtio-ccw.o obj-y += virtio-ccw.o -obj-$(CONFIG_KVM) += s390-pci-bus.o +obj-$(CONFIG_KVM) += s390-pci-bus.o s390_pci.o --- a/hw/s390x/s390-pci-bus.c +++ b/hw/s390x/s390-pci-bus.c @@ -16,6 +16,7 @@ #include hw/s390x/sclp.h #include qemu/error-report.h #include s390-pci-bus.h +#include s390_pci.h /* #define DEBUG_S390PCI_BUS */ #ifdef DEBUG_S390PCI_BUS @@ -219,8 +220,17 @@ static void s390_pcihost_hot_plug(Hotplu pbdev-pdev = pci_dev; pbdev-configured = true; -pbdev-fh = s390_pci_get_pfh(pci_dev); -pbdev-is_virt = 1; +if (!strcmp(pci_dev-name, s390-pci)) { +S390PCIDevice *sdev = DO_UPCAST(S390PCIDevice, pdev, pci_dev); +pbdev-fh = s390_pci_get_fh(sdev-host); +if (!pbdev-fh) { +g_free(pbdev); +return; +} +} else { +pbdev-fh = s390_pci_get_pfh(pci_dev); +pbdev-is_virt = 1; +} QTAILQ_INSERT_TAIL(device_list, pbdev, next); if (dev-hotplugged) { --- /dev/null +++ b/hw/s390x/s390_pci.c @@ -0,0 +1,321 @@ +/* + * s390 PCI pass-through device assignment + * + * Copyright 2014 IBM Corp. + * Author(s): Frank Blaschka frank.blasc...@de.ibm.com + *Hong Bo Li lih...@cn.ibm.com + *Yi Min Zhao zyi...@cn.ibm.com + * + * This work is licensed under the terms of the GNU GPL, version 2 or (at + * your option) any later version. See the COPYING file in the top-level + * directory. 
+ */ + +#include hw/pci/pci.h +#include hw/pci/pci_host.h +#include hw/pci/pci_bus.h +#include net/net.h +#include hw/s390x/css.h +#include hw/s390x/sclp.h +#include exec/exec-all.h +#include sysemu/sysemu.h +#include exec/address-spaces.h +#include qemu/error-report.h +#include qapi/qmp/qerror.h + +#include s390_pci.h +#include s390-pci-bus.h + +/* #define DEBUG_S390PCI */ +#ifdef DEBUG_S390PCI +#define DPRINTF(fmt, ...) \ +do { fprintf(stderr, s390pci: fmt, ## __VA_ARGS__); } while (0) +#else +#define DPRINTF(fmt, ...) \ +do { } while (0) +#endif + +#define ASSIGN_FLAG_HOSTIRQ 0x1 + +uint32_t s390_pci_get_fh(PCIHostDeviceAddress host) +{ +char fh_path[128]; +struct stat st; +FILE *fd; +uint32_t fh; + +snprintf(fh_path, sizeof(fh_path), +/sys/bus/pci/devices/%04x:%02x:%02x.%x/function_handle, +host.domain, host.bus, host.slot, host.function); + +if (stat(fh_path, st)) { +error_report(get function handle faild: no host device specified); +return -1; +} + +fd = fopen(fh_path, r); +if (fd == NULL) { +error_report(%s: %s: %m, __func__, fh_path); +return 0; +} +if (fscanf(fd, %x, fh) != 1) { +fclose(fd); +return 0; +} +fclose(fd); +return fh; +} + +uint32_t s390_pci_get_fid(PCIHostDeviceAddress host) +{ +char fid_path[128]; +struct stat st; +FILE *fd; +uint32_t fid; + +snprintf(fid_path, sizeof(fid_path), +/sys/bus/pci/devices/%04x:%02x:%02x.%x/function_id, +host.domain, host.bus, host.slot, host.function); + +if (stat(fid_path, st)) { +error_report(get function id faild: no host device specified); +return -1; +} + +fd = fopen(fid_path, r); +if (fd == NULL) { +error_report(%s: %s: %m, __func__, fid_path); +return -1; +} +if (fscanf(fd, %x, fid) != 1) { +fclose(fd); +return -1; +} +fclose(fd); +return fid; +} + +static int get_real_id(const char *devpath, const char *idname, uint16_t *val) +{ +FILE *f; +char name[128]; +long id; + +snprintf(name, sizeof(name), %s%s, devpath, idname); +f = fopen(name, r); +if (f == NULL) { +error_report(%s: %s: %m, __func__, name); 
+return -1; +} +if (fscanf(f, %li\n, id) == 1) { +*val = id; +} else { +fclose(f); +return -1; +} +fclose(f); + +return 0; +} + +static int get_real_vendor_id(const char *devpath, uint16_t *val) +{ +return get_real_id(devpath, vendor, val); +} + +static int get_real_device_id(const char *devpath, uint16_t *val) +{ +return get_real_id(devpath, device, val); +} + +static void assign_failed_examine(S390PCIDevice *dev) +{ +char name[PATH_MAX], dir[PATH_MAX],
[RFC][patch 0/6] pci pass-through support for qemu/KVM on s390
This set of patches implements pci pass-through support for qemu/KVM on s390. PCI support on s390 is very different from other platforms. Major differences are: 1) all PCI operations are driven by special s390 instructions 2) all s390 PCI instructions are privileged 3) PCI config and memory spaces cannot be mmap'ed 4) no classic interrupts (INTX, MSI). The pci hw understands the concept of requesting MSIX irqs but irqs are delivered as s390 adapter irqs. 5) For DMA access there is always an IOMMU required. s390 pci implementation does not support a complete memory to iommu mapping; dma mappings are created on request. 6) The OS does not get any information about the physical layout of the PCI bus. 7) To take advantage of system z specific virtualization features we need to access the SIE control block residing in the kernel KVM 8) To enable system z specific virtualization features we have to manipulate the zpci device in kernel. For these reasons I decided to implement a kernel based approach similar to x86 device assignment. There is a new qemu device (s390-pci) representing a pass through device on the host. Here is a sample qemu device configuration: -device s390-pci,host=:00:00.0 The device executes the KVM_ASSIGN_PCI_DEVICE ioctl to create a proxy instance in the kernel KVM and connect this instance to the host pci device. kernel patches apply to linux-kvm s390: cio: chsc function to register GIB s390: pci: export pci functions for pass-through usage KVM: s390: Add GISA support KVM: s390: Add PCI pass-through support qemu patches apply to qemu-master s390: Add PCI bus support s390: Add PCI pass-through device support Feedback and discussion are highly welcome ... Thx! Frank -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC][patch 1/6] s390: cio: chsc function to register GIB
From: Frank Blaschka frank.blasc...@de.ibm.com This patch provides a new chsc function to register/unregister a GIB (Guest Information Block). Signed-off-by: Frank Blaschka frank.blasc...@de.ibm.com --- arch/s390/include/asm/cio.h |1 drivers/s390/cio/chsc.c | 50 2 files changed, 51 insertions(+) --- a/arch/s390/include/asm/cio.h +++ b/arch/s390/include/asm/cio.h @@ -311,5 +311,6 @@ extern int cio_get_iplinfo(struct cio_ip /* Function from drivers/s390/cio/chsc.c */ int chsc_sstpc(void *page, unsigned int op, u16 ctrl); int chsc_sstpi(void *page, void *result, size_t size); +int chsc_sgib(u32 gibo); #endif --- a/drivers/s390/cio/chsc.c +++ b/drivers/s390/cio/chsc.c @@ -1188,6 +1188,56 @@ out: EXPORT_SYMBOL_GPL(chsc_siosl); /** + * chsc_sgib() - register guest information block + * @gibo: guest information block + * + * gibo must be allocated in low memory + * + * Returns 0 on success. + */ +int chsc_sgib(u32 gibo) +{ + struct { + struct chsc_header request; + u16 operation_code; + u16 : 16; + u32 : 4; + u32 fmt : 4; + u32 : 24; + u32 : 32; + u32 : 32; + u32 gibo; + u64 : 64; + u32 : 16; + u32 aix : 8; + u32 : 8; + u32 reserved[1007]; + struct chsc_header response; + } __packed *scssc; + unsigned long flags; + int rc; + + spin_lock_irqsave(chsc_page_lock, flags); + memset(chsc_page, 0, PAGE_SIZE); + scssc = chsc_page; + + scssc-request.length = 0x0fe0; + scssc-request.code = 0x0021; + scssc-operation_code = 1; + scssc-gibo = gibo; + + rc = chsc(scssc); + if (rc) + rc = -EIO; + else + rc = chsc_error_from_response(scssc-response.code); + + spin_unlock_irqrestore(chsc_page_lock, flags); + return rc; +} +EXPORT_SYMBOL_GPL(chsc_sgib); + +/** * chsc_scm_info() - store SCM information (SSI) * @scm_area: request and response block for SSI * @token: continuation token -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: kvm-unit-test failures
Il 04/09/2014 11:53, Paolo Bonzini ha scritto: Il 03/09/2014 20:25, Chris J Arges ha scritto: snip I'm not sure about the reason for the warp, but indeed the offset and uptime match (I'll check them against the trace tomorrow) so it's just that the VM's TSC base is not taken into account correctly. Can you gather another trace with the problematic patch reverted? Paolo Here is the third trace running with 0d3da0d2 reverted from the latest kvm queue branch 11cc9ea3: http://people.canonical.com/~arges/kvm/trace-3.dat.xz Thanks! And---yay!---I reproduced it on another machine. And my bisection landed on the merge of the timer branch (commit e7fda6c4c3c1a7d6996dd75fd84670fa0b5d448f). Here is the log: $ git bisect bad origin/master $ git bisect good v3.16 $ git bisect good kvm-3.17-1 # 42cbc04fd3b5e3f9b011bf9fa3ce0b3d1e10b58b good[ 6929.863545] loaded kvm module (v3.17-rc1-158-g451fd72219dd) bad [ 6971.932790] loaded kvm module (for-linus) bad [ 7216.073128] loaded kvm module (v3.16-6426-gae045e245542) good[ 7286.198948] loaded kvm module (v3.16-3283-g53ee983378ff) good[ 7350.534060] loaded kvm module (v3.16-rc7-1668-gaef4f5b6db65) good[ 7439.037038] loaded kvm module (v3.16-4006-g91c2ff7708d4) good[ 7481.188637] loaded kvm module (v3.16-rc6-450-g7ba3c21c17d0) bad [ 7535.292730] loaded kvm module (v3.16-4635-ge7fda6c4c3c1) good[ 7589.722691] loaded kvm module (v3.16-rc5-110-g9b0fd802e8c0) good[ 7630.286418] loaded kvm module (v3.16-4467-ged5c41d30ef2) good[ 7712.470986] loaded kvm module (v3.16-rc1-35-g885d078bfe92) good[ 7763.443626] loaded kvm module (v3.16-rc1-381-g1b0733837a9b) good[ 7825.497414] loaded kvm module (v3.16-rc5-116-g7806f60e1d20) good[ 7893.174056] loaded kvm module (v3.16-rc1-384-gc6f1224573c3) This means that: - Tomasz's patch (commit 0d3da0d26e3c3515997c99451ce3b0ad1a69a36c) is fine, it just enables the (wrong) master clock more often - the failure is within that branch. 
I then cherry-picked Tomasz's patch during a new bisection, and landed on one of my original suspects: commit cbcf2dd3b3d4d990610259e8d878fc8dc1f17d80 Author: Thomas Gleixner t...@linutronix.de Date: Wed Jul 16 21:04:54 2014 + x86: kvm: Make kvm_get_time_and_clockread() nanoseconds based Convert the relevant base data right away to nanoseconds instead of doing the conversion on every readout. Reduces text size by 160 bytes. Signed-off-by: Thomas Gleixner t...@linutronix.de Cc: Gleb Natapov g...@kernel.org Cc: kvm@vger.kernel.org Acked-by: Paolo Bonzini pbonz...@redhat.com Signed-off-by: John Stultz john.stu...@linaro.org Again, here is the log: $ git bisect bad 953dec21aed4038464fec02f96a2f1b8701a5bce $ git bisect good 1af447bd8cbfb808a320885d214555fb2d32e6e6 good[ 8384.334892] loaded kvm module (v3.16-rc5-81-g68f6783d2831) bad [ 8525.975170] loaded kvm module (v3.16-rc5-99-gf519b1a2e08c) good[ 8562.204988] loaded kvm module (v3.16-rc5-90-g41fa4215f8e8) bad [ 8629.133287] loaded kvm module (v3.16-rc5-94-g48f18fd6addc) bad [ 8772.846612] loaded kvm module (v3.16-rc5-92-gcbcf2dd3b3d4) good[ 8836.509602] loaded kvm module (v3.16-rc5-91-gbb0b58127c5a) Paolo -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: kvm-unit-test failures
On Thu, Sep 04, 2014 at 01:33:10PM +0200, Paolo Bonzini wrote: Il 04/09/2014 11:53, Paolo Bonzini ha scritto: Il 03/09/2014 20:25, Chris J Arges ha scritto: snip I'm not sure about the reason for the warp, but indeed the offset and uptime match (I'll check them against the trace tomorrow) so it's just that the VM's TSC base is not taken into account correctly. Can you gather another trace with the problematic patch reverted? Paolo Here is the third trace running with 0d3da0d2 reverted from the latest kvm queue branch 11cc9ea3: http://people.canonical.com/~arges/kvm/trace-3.dat.xz Thanks! And---yay!---I reproduced it on another machine. And my bisection landed on the merge of the timer branch (commit e7fda6c4c3c1a7d6996dd75fd84670fa0b5d448f). Here is the log: $ git bisect bad origin/master $ git bisect good v3.16 $ git bisect good kvm-3.17-1 # 42cbc04fd3b5e3f9b011bf9fa3ce0b3d1e10b58b good[ 6929.863545] loaded kvm module (v3.17-rc1-158-g451fd72219dd) bad [ 6971.932790] loaded kvm module (for-linus) bad [ 7216.073128] loaded kvm module (v3.16-6426-gae045e245542) good[ 7286.198948] loaded kvm module (v3.16-3283-g53ee983378ff) good[ 7350.534060] loaded kvm module (v3.16-rc7-1668-gaef4f5b6db65) good[ 7439.037038] loaded kvm module (v3.16-4006-g91c2ff7708d4) good[ 7481.188637] loaded kvm module (v3.16-rc6-450-g7ba3c21c17d0) bad [ 7535.292730] loaded kvm module (v3.16-4635-ge7fda6c4c3c1) good[ 7589.722691] loaded kvm module (v3.16-rc5-110-g9b0fd802e8c0) good[ 7630.286418] loaded kvm module (v3.16-4467-ged5c41d30ef2) good[ 7712.470986] loaded kvm module (v3.16-rc1-35-g885d078bfe92) good[ 7763.443626] loaded kvm module (v3.16-rc1-381-g1b0733837a9b) good[ 7825.497414] loaded kvm module (v3.16-rc5-116-g7806f60e1d20) good[ 7893.174056] loaded kvm module (v3.16-rc1-384-gc6f1224573c3) This means that: - Tomasz's patch (commit 0d3da0d26e3c3515997c99451ce3b0ad1a69a36c) is fine, it just enables the (wrong) master clock more often - the failure is within that branch. 
I then cherry-picked Tomasz's patch during a new bisection, and landed on one of my original suspects: commit cbcf2dd3b3d4d990610259e8d878fc8dc1f17d80 Author: Thomas Gleixner t...@linutronix.de Date: Wed Jul 16 21:04:54 2014 + x86: kvm: Make kvm_get_time_and_clockread() nanoseconds based Convert the relevant base data right away to nanoseconds instead of doing the conversion on every readout. Reduces text size by 160 bytes. Signed-off-by: Thomas Gleixner t...@linutronix.de Cc: Gleb Natapov g...@kernel.org Cc: kvm@vger.kernel.org Acked-by: Paolo Bonzini pbonz...@redhat.com Signed-off-by: John Stultz john.stu...@linaro.org Yes, I also look into this bad commit recently which lead to guest hang after live migration or after local save/restore. Regards, Wanpeng Li Again, here is the log: $ git bisect bad 953dec21aed4038464fec02f96a2f1b8701a5bce $ git bisect good 1af447bd8cbfb808a320885d214555fb2d32e6e6 good[ 8384.334892] loaded kvm module (v3.16-rc5-81-g68f6783d2831) bad [ 8525.975170] loaded kvm module (v3.16-rc5-99-gf519b1a2e08c) good[ 8562.204988] loaded kvm module (v3.16-rc5-90-g41fa4215f8e8) bad [ 8629.133287] loaded kvm module (v3.16-rc5-94-g48f18fd6addc) bad [ 8772.846612] loaded kvm module (v3.16-rc5-92-gcbcf2dd3b3d4) good[ 8836.509602] loaded kvm module (v3.16-rc5-91-gbb0b58127c5a) Paolo -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v5 0/3] ivshmem: update documentation, add client/server tools
Here is a patchset containing an update on ivshmem specs documentation and importing ivshmem server and client tools. These tools have been written from scratch and are not related to what is available in nahanni repository. I put them in contrib/ directory as the qemu-doc.texi already stated that the server was supposed to be there. Changes since v4: - squashed patches 3-13 from v4 into first patch - reused reported error when parsing arguments in server - fixed spelling mistakes in documentation in second patch Changes since v3: - first patch is untouched - just restored the Reviewed-By Claudio in second patch - following patches 3-8 take into account Stefan's comments - patches 9-12 take into account Gonglei's comments - patch 13 adjusts ivshmem-server default values - last patch introduces a change in the ivshmem client-server protocol to check a protocol version at connect time Changes since v2: - fixed license issues in ivshmem client/server (I took hw/virtio/virtio-rng.c file as a reference). Changes since v1: - moved client/server import patch before doc update, - tried to re-organise the ivshmem_device_spec.txt file based on Claudio comments (still not sure if the result is that great, comments welcome), - incorporated comments from Claudio, Eric and Cam, - added more details on the server - client messages exchange (but sorry, no ASCII art here). By the way, there are still some functionalities that need description (use of ioeventfd, the lack of irqfd support) and some parts of the ivshmem code clearly need cleanup. I will try to address this in future patches when these first patches are ok. 
-- David Marchand David Marchand (3): contrib: add ivshmem client and server docs: update ivshmem device spec ivshmem: add check on protocol version in QEMU Makefile|8 + configure |3 + contrib/ivshmem-client/ivshmem-client.c | 413 +++ contrib/ivshmem-client/ivshmem-client.h | 240 ++ contrib/ivshmem-client/main.c | 237 ++ contrib/ivshmem-server/ivshmem-server.c | 402 ++ contrib/ivshmem-server/ivshmem-server.h | 187 ++ contrib/ivshmem-server/main.c | 244 ++ docs/specs/ivshmem_device_spec.txt | 127 +++--- hw/misc/ivshmem.c | 43 +++- include/hw/misc/ivshmem.h | 17 ++ qemu-doc.texi | 10 +- 12 files changed, 1888 insertions(+), 43 deletions(-) create mode 100644 contrib/ivshmem-client/ivshmem-client.c create mode 100644 contrib/ivshmem-client/ivshmem-client.h create mode 100644 contrib/ivshmem-client/main.c create mode 100644 contrib/ivshmem-server/ivshmem-server.c create mode 100644 contrib/ivshmem-server/ivshmem-server.h create mode 100644 contrib/ivshmem-server/main.c create mode 100644 include/hw/misc/ivshmem.h -- 1.7.10.4 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v5 3/3] ivshmem: add check on protocol version in QEMU
Send a protocol version as the first message from server, clients must close communication if they don't support this protocol version. Older QEMUs should be fine with this change in the protocol since they overrides their own vm_id on reception of an id associated to no eventfd. Signed-off-by: David Marchand david.march...@6wind.com --- contrib/ivshmem-client/ivshmem-client.c | 14 +++--- contrib/ivshmem-client/ivshmem-client.h |1 + contrib/ivshmem-server/ivshmem-server.c |7 + contrib/ivshmem-server/ivshmem-server.h |1 + docs/specs/ivshmem_device_spec.txt |9 --- hw/misc/ivshmem.c | 43 --- include/hw/misc/ivshmem.h | 17 7 files changed, 77 insertions(+), 15 deletions(-) create mode 100644 include/hw/misc/ivshmem.h diff --git a/contrib/ivshmem-client/ivshmem-client.c b/contrib/ivshmem-client/ivshmem-client.c index ad210c8..0c4e016 100644 --- a/contrib/ivshmem-client/ivshmem-client.c +++ b/contrib/ivshmem-client/ivshmem-client.c @@ -184,10 +184,18 @@ ivshmem_client_connect(IvshmemClient *client) goto err_close; } -/* first, we expect our index + a fd == -1 */ +/* first, we expect a protocol version */ +if (read_one_msg(client, tmp, fd) 0 || +(tmp != IVSHMEM_PROTOCOL_VERSION) || fd != -1) { +debug_log(client, cannot read from server\n); +goto err_close; +} +debug_log(client, our_id=%ld\n, client-local.id); + +/* then, we expect our index + a fd == -1 */ if (read_one_msg(client, client-local.id, fd) 0 || client-local.id 0 || fd != -1) { -debug_log(client, cannot read from server\n); +debug_log(client, cannot read from server (2)\n); goto err_close; } debug_log(client, our_id=%ld\n, client-local.id); @@ -196,7 +204,7 @@ ivshmem_client_connect(IvshmemClient *client) * is not used */ if (read_one_msg(client, tmp, fd) 0 || tmp != -1 || fd 0) { -debug_log(client, cannot read from server (2)\n); +debug_log(client, cannot read from server (3)\n); goto err_close; } debug_log(client, shm_fd=%d\n, fd); diff --git a/contrib/ivshmem-client/ivshmem-client.h 
b/contrib/ivshmem-client/ivshmem-client.h index 45f2b64..8d6ab35 100644 --- a/contrib/ivshmem-client/ivshmem-client.h +++ b/contrib/ivshmem-client/ivshmem-client.h @@ -23,6 +23,7 @@ #include sys/select.h #include qemu/queue.h +#include hw/misc/ivshmem.h /** * Maximum number of notification vectors supported by the client diff --git a/contrib/ivshmem-server/ivshmem-server.c b/contrib/ivshmem-server/ivshmem-server.c index f441da7..670c58c 100644 --- a/contrib/ivshmem-server/ivshmem-server.c +++ b/contrib/ivshmem-server/ivshmem-server.c @@ -99,6 +99,13 @@ send_initial_info(IvshmemServer *server, IvshmemServerPeer *peer) { int ret; +/* send our protool version first */ +ret = send_one_msg(peer-sock_fd, IVSHMEM_PROTOCOL_VERSION, -1); +if (ret 0) { +debug_log(server, cannot send version: %s\n, strerror(errno)); +return -1; +} + /* send the peer id to the client */ ret = send_one_msg(peer-sock_fd, peer-id, -1); if (ret 0) { diff --git a/contrib/ivshmem-server/ivshmem-server.h b/contrib/ivshmem-server/ivshmem-server.h index 5ccc7af..e76e4fe 100644 --- a/contrib/ivshmem-server/ivshmem-server.h +++ b/contrib/ivshmem-server/ivshmem-server.h @@ -30,6 +30,7 @@ #include sys/select.h #include qemu/queue.h +#include hw/misc/ivshmem.h /** * Maximum number of notification vectors supported by the server diff --git a/docs/specs/ivshmem_device_spec.txt b/docs/specs/ivshmem_device_spec.txt index 12f338e..3435116 100644 --- a/docs/specs/ivshmem_device_spec.txt +++ b/docs/specs/ivshmem_device_spec.txt @@ -64,6 +64,8 @@ It creates a shared memory object then waits for clients to connect on a unix socket. 
For each client (QEMU process) that connects to the server: +- the server sends a protocol version, if client does not support it, the client + closes the communication, - the server assigns an ID for this client and sends this ID to him as the first message, - the server sends a fd to the shared memory object to this client, @@ -86,9 +88,10 @@ been provided in qemu.git/contrib/ivshmem-client for debug. *QEMU as an ivshmem client* -At initialisation, when creating the ivshmem device, QEMU gets its ID from the -server then makes it available through BAR0 IVPosition register for the VM to -use (see 'PCI device registers' subsection). +At initialisation, when creating the ivshmem device, QEMU first receives a +protocol version and closes communication with server if it does not match. +Then, QEMU gets its ID from the server then makes it available through BAR0 +IVPosition register for the VM to use (see 'PCI device registers' subsection). QEMU then uses the fd to the shared memory to map it to BAR2.
[PATCH v5 1/3] contrib: add ivshmem client and server
When using ivshmem devices, notifications between guests can be sent as interrupts using a ivshmem-server (typical use described in documentation). The client is provided as a debug tool. Signed-off-by: Olivier Matz olivier.m...@6wind.com Signed-off-by: David Marchand david.march...@6wind.com --- Makefile|8 + configure |3 + contrib/ivshmem-client/ivshmem-client.c | 405 +++ contrib/ivshmem-client/ivshmem-client.h | 239 ++ contrib/ivshmem-client/main.c | 237 ++ contrib/ivshmem-server/ivshmem-server.c | 395 ++ contrib/ivshmem-server/ivshmem-server.h | 186 ++ contrib/ivshmem-server/main.c | 244 +++ qemu-doc.texi | 10 +- 9 files changed, 1724 insertions(+), 3 deletions(-) create mode 100644 contrib/ivshmem-client/ivshmem-client.c create mode 100644 contrib/ivshmem-client/ivshmem-client.h create mode 100644 contrib/ivshmem-client/main.c create mode 100644 contrib/ivshmem-server/ivshmem-server.c create mode 100644 contrib/ivshmem-server/ivshmem-server.h create mode 100644 contrib/ivshmem-server/main.c diff --git a/Makefile b/Makefile index b33aaac..0575898 100644 --- a/Makefile +++ b/Makefile @@ -283,6 +283,14 @@ $(qga-obj-y) qemu-ga.o: $(QGALIB_GEN) qemu-ga$(EXESUF): $(qga-obj-y) libqemuutil.a libqemustub.a $(call LINK, $^) +IVSHMEM_CLIENT_OBJS=$(addprefix $(SRC_PATH)/contrib/ivshmem-client/, ivshmem-client.o main.o) +ivshmem-client$(EXESUF): $(IVSHMEM_CLIENT_OBJS) + $(call LINK, $^) + +IVSHMEM_SERVER_OBJS=$(addprefix $(SRC_PATH)/contrib/ivshmem-server/, ivshmem-server.o main.o) +ivshmem-server$(EXESUF): $(IVSHMEM_SERVER_OBJS) libqemuutil.a libqemustub.a + $(call LINK, $^) + clean: # avoid old build problems by removing potentially incorrect old files rm -f config.mak op-i386.h opc-i386.h gen-op-i386.h op-arm.h opc-arm.h gen-op-arm.h diff --git a/configure b/configure index 961bf6f..a41a16c 100755 --- a/configure +++ b/configure @@ -4125,6 +4125,9 @@ if test $want_tools = yes ; then if [ $linux = yes -o $bsd = yes -o $solaris = yes ] ; then tools=qemu-nbd\$(EXESUF) 
$tools fi + if [ $kvm = yes ] ; then +tools=ivshmem-client\$(EXESUF) ivshmem-server\$(EXESUF) $tools + fi fi if test $softmmu = yes ; then if test $virtfs != no ; then diff --git a/contrib/ivshmem-client/ivshmem-client.c b/contrib/ivshmem-client/ivshmem-client.c new file mode 100644 index 000..ad210c8 --- /dev/null +++ b/contrib/ivshmem-client/ivshmem-client.c @@ -0,0 +1,405 @@ +/* + * Copyright 6WIND S.A., 2014 + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * (at your option) any later version. See the COPYING file in the + * top-level directory. + */ + +#include sys/types.h +#include sys/socket.h +#include sys/un.h + +#include qemu-common.h +#include qemu/queue.h + +#include ivshmem-client.h + +/* log a message on stdout if verbose=1 */ +#define debug_log(client, fmt, ...) do { \ +if ((client)-verbose) { \ +printf(fmt, ## __VA_ARGS__); \ +}\ +} while (0) + +/* read message from the unix socket */ +static int +read_one_msg(IvshmemClient *client, long *index, int *fd) +{ +int ret; +struct msghdr msg; +struct iovec iov[1]; +union { +struct cmsghdr cmsg; +char control[CMSG_SPACE(sizeof(int))]; +} msg_control; +struct cmsghdr *cmsg; + +iov[0].iov_base = index; +iov[0].iov_len = sizeof(*index); + +memset(msg, 0, sizeof(msg)); +msg.msg_iov = iov; +msg.msg_iovlen = 1; +msg.msg_control = msg_control; +msg.msg_controllen = sizeof(msg_control); + +ret = recvmsg(client-sock_fd, msg, 0); +if (ret 0) { +debug_log(client, cannot read message: %s\n, strerror(errno)); +return -1; +} +if (ret == 0) { +debug_log(client, lost connection to server\n); +return -1; +} + +*fd = -1; + +for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) { + +if (cmsg-cmsg_len != CMSG_LEN(sizeof(int)) || +cmsg-cmsg_level != SOL_SOCKET || +cmsg-cmsg_type != SCM_RIGHTS) { +continue; +} + +memcpy(fd, CMSG_DATA(cmsg), sizeof(*fd)); +} + +return 0; +} + +/* free a peer when the server advertise a disconnection or when the + * client is freed */ +static void 
+free_peer(IvshmemClient *client, IvshmemClientPeer *peer) +{ +unsigned vector; + +QTAILQ_REMOVE(client-peer_list, peer, next); +for (vector = 0; vector peer-vectors_count; vector++) { +close(peer-vectors[vector]); +} + +g_free(peer); +} + +/* handle message coming from server (new peer, new vectors) */ +static int +handle_server_msg(IvshmemClient *client) +{ +IvshmemClientPeer *peer;
[PATCH v5 2/3] docs: update ivshmem device spec
Add some notes on the parts needed to use ivshmem devices: more specifically, explain the purpose of an ivshmem server and the basic concept to use the ivshmem devices in guests. Move some parts of the documentation and re-organise it. Signed-off-by: David Marchand david.march...@6wind.com Reviewed-by: Claudio Fontana claudio.font...@huawei.com --- docs/specs/ivshmem_device_spec.txt | 124 +++- 1 file changed, 93 insertions(+), 31 deletions(-) diff --git a/docs/specs/ivshmem_device_spec.txt b/docs/specs/ivshmem_device_spec.txt index 667a862..12f338e 100644 --- a/docs/specs/ivshmem_device_spec.txt +++ b/docs/specs/ivshmem_device_spec.txt @@ -2,30 +2,103 @@ Device Specification for Inter-VM shared memory device -- -The Inter-VM shared memory device is designed to share a region of memory to -userspace in multiple virtual guests. The memory region does not belong to any -guest, but is a POSIX memory object on the host. Optionally, the device may -support sending interrupts to other guests sharing the same memory region. +The Inter-VM shared memory device is designed to share a memory region (created +on the host via the POSIX shared memory API) between multiple QEMU processes +running different guests. In order for all guests to be able to pick up the +shared memory area, it is modeled by QEMU as a PCI device exposing said memory +to the guest as a PCI BAR. +The memory region does not belong to any guest, but is a POSIX memory object on +the host. The host can access this shared memory if needed. + +The device also provides an optional communication mechanism between guests +sharing the same memory object. More details about that in the section 'Guest to +guest communication' section. The Inter-VM PCI device --- -*BARs* +From the VM point of view, the ivshmem PCI device supports three BARs. + +- BAR0 is a 1 Kbyte MMIO region to support registers and interrupts when MSI is + not used. +- BAR1 is used for MSI-X when it is enabled in the device. 
+- BAR2 is used to access the shared memory object. + +It is your choice how to use the device but you must choose between two +behaviors : + +- basically, if you only need the shared memory part, you will map BAR2. + This way, you have access to the shared memory in guest and can use it as you + see fit (memnic, for example, uses it in userland + http://dpdk.org/browse/memnic). + +- BAR0 and BAR1 are used to implement an optional communication mechanism + through interrupts in the guests. If you need an event mechanism between the + guests accessing the shared memory, you will most likely want to write a + kernel driver that will handle interrupts. See details in the section 'Guest + to guest communication' section. + +The behavior is chosen when starting your QEMU processes: +- no communication mechanism needed, the first QEMU to start creates the shared + memory on the host, subsequent QEMU processes will use it. + +- communication mechanism needed, an ivshmem server must be started before any + QEMU processes, then each QEMU process connects to the server unix socket. + +For more details on the QEMU ivshmem parameters, see qemu-doc documentation. + + +Guest to guest communication + + +This section details the communication mechanism between the guests accessing +the ivhsmem shared memory. -The device supports three BARs. BAR0 is a 1 Kbyte MMIO region to support -registers. BAR1 is used for MSI-X when it is enabled in the device. BAR2 is -used to map the shared memory object from the host. The size of BAR2 is -specified when the guest is started and must be a power of 2 in size. +*ivshmem server* -*Registers* +This server code is available in qemu.git/contrib/ivshmem-server. -The device currently supports 4 registers of 32-bits each. Registers -are used for synchronization between guests sharing the same memory object when -interrupts are supported (this requires using the shared memory server). +The server must be started on the host before any guest. 
+It creates a shared memory object then waits for clients to connect on a unix +socket. -The server assigns each VM an ID number and sends this ID number to the QEMU -process when the guest starts. +For each client (QEMU process) that connects to the server: +- the server assigns an ID for this client and sends this ID to him as the first + message, +- the server sends a fd to the shared memory object to this client, +- the server creates a new set of host eventfds associated to the new client and + sends this set to all already connected clients, +- finally, the server sends all the eventfds sets for all clients to the new + client. + +The server signals all clients when one of them disconnects. + +The client IDs are limited to 16 bits because of the current implementation (see +Doorbell register in 'PCI device registers' subsection). Hence
[PATCH] KVM: x86: fix kvmclock breakage from timers branch merge
Commit cbcf2dd3b3d4 (x86: kvm: Make kvm_get_time_and_clockread() nanoseconds based, 2014-07-16) forgot to add tk-xtime_sec, thus breaking kvmclock on hosts that have a reliable TSC. Add it back; and since the field boot_ns is not anymore related to the host boot-based clock, rename boot_ns-nsec_base and the existing nsec_base-snsec_base. Cc: Thomas Gleixner t...@linutronix.de Cc: John Stultz john.stu...@linaro.org Reported-by: Chris J Arges chris.j.ar...@canonical.com Signed-off-by: Paolo Bonzini pbonz...@redhat.com --- arch/x86/kvm/x86.c | 11 ++- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8f1e22d3b286..92493e10937c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1020,8 +1020,8 @@ struct pvclock_gtod_data { u32 shift; } clock; - u64 boot_ns; u64 nsec_base; + u64 snsec_base; }; static struct pvclock_gtod_data pvclock_gtod_data; @@ -1042,8 +1042,9 @@ static void update_pvclock_gtod(struct timekeeper *tk) vdata-clock.mult = tk-tkr.mult; vdata-clock.shift = tk-tkr.shift; - vdata-boot_ns = boot_ns; - vdata-nsec_base= tk-tkr.xtime_nsec; + vdata-nsec_base= tk-xtime_sec * (u64)NSEC_PER_SEC + + boot_ns; + vdata-snsec_base = tk-tkr.xtime_nsec; write_seqcount_end(vdata-seq); } @@ -1413,10 +1414,10 @@ static int do_monotonic_boot(s64 *t, cycle_t *cycle_now) do { seq = read_seqcount_begin(gtod-seq); mode = gtod-clock.vclock_mode; - ns = gtod-nsec_base; + ns = gtod-snsec_base; ns += vgettsc(cycle_now); ns = gtod-clock.shift; - ns += gtod-boot_ns; + ns += gtod-nsec_base; } while (unlikely(read_seqcount_retry(gtod-seq, seq))); *t = ns; -- 2.1.0 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: kvm-unit-test failures
Il 04/09/2014 14:24, Wanpeng Li ha scritto: On Thu, Sep 04, 2014 at 01:33:10PM +0200, Paolo Bonzini wrote: Il 04/09/2014 11:53, Paolo Bonzini ha scritto: Il 03/09/2014 20:25, Chris J Arges ha scritto: snip I'm not sure about the reason for the warp, but indeed the offset and uptime match (I'll check them against the trace tomorrow) so it's just that the VM's TSC base is not taken into account correctly. Can you gather another trace with the problematic patch reverted? Paolo Here is the third trace running with 0d3da0d2 reverted from the latest kvm queue branch 11cc9ea3: http://people.canonical.com/~arges/kvm/trace-3.dat.xz Thanks! And---yay!---I reproduced it on another machine. And my bisection landed on the merge of the timer branch (commit e7fda6c4c3c1a7d6996dd75fd84670fa0b5d448f). Here is the log: $ git bisect bad origin/master $ git bisect good v3.16 $ git bisect good kvm-3.17-1 # 42cbc04fd3b5e3f9b011bf9fa3ce0b3d1e10b58b good[ 6929.863545] loaded kvm module (v3.17-rc1-158-g451fd72219dd) bad [ 6971.932790] loaded kvm module (for-linus) bad [ 7216.073128] loaded kvm module (v3.16-6426-gae045e245542) good[ 7286.198948] loaded kvm module (v3.16-3283-g53ee983378ff) good[ 7350.534060] loaded kvm module (v3.16-rc7-1668-gaef4f5b6db65) good[ 7439.037038] loaded kvm module (v3.16-4006-g91c2ff7708d4) good[ 7481.188637] loaded kvm module (v3.16-rc6-450-g7ba3c21c17d0) bad [ 7535.292730] loaded kvm module (v3.16-4635-ge7fda6c4c3c1) good[ 7589.722691] loaded kvm module (v3.16-rc5-110-g9b0fd802e8c0) good[ 7630.286418] loaded kvm module (v3.16-4467-ged5c41d30ef2) good[ 7712.470986] loaded kvm module (v3.16-rc1-35-g885d078bfe92) good[ 7763.443626] loaded kvm module (v3.16-rc1-381-g1b0733837a9b) good[ 7825.497414] loaded kvm module (v3.16-rc5-116-g7806f60e1d20) good[ 7893.174056] loaded kvm module (v3.16-rc1-384-gc6f1224573c3) This means that: - Tomasz's patch (commit 0d3da0d26e3c3515997c99451ce3b0ad1a69a36c) is fine, it just enables the (wrong) master clock more often - the 
failure is within that branch. I then cherry-picked Tomasz's patch during a new bisection, and landed on one of my original suspects: commit cbcf2dd3b3d4d990610259e8d878fc8dc1f17d80 Author: Thomas Gleixner t...@linutronix.de Date: Wed Jul 16 21:04:54 2014 + x86: kvm: Make kvm_get_time_and_clockread() nanoseconds based Convert the relevant base data right away to nanoseconds instead of doing the conversion on every readout. Reduces text size by 160 bytes. Signed-off-by: Thomas Gleixner t...@linutronix.de Cc: Gleb Natapov g...@kernel.org Cc: kvm@vger.kernel.org Acked-by: Paolo Bonzini pbonz...@redhat.com Signed-off-by: John Stultz john.stu...@linaro.org Yes, I also looked into this bad commit recently, which led to a guest hang after live migration or after local save/restore. Thanks for the report! Wanpeng, can you test and/or review the patch I just posted ([PATCH] KVM: x86: fix kvmclock breakage from timers branch merge)? Paolo -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC][patch 0/6] pci pass-through support for qemu/KVM on s390
On Thu, 2014-09-04 at 12:52 +0200, frank.blasc...@de.ibm.com wrote: This set of patches implements pci pass-through support for qemu/KVM on s390. PCI support on s390 is very different from other platforms. Major differences are: 1) all PCI operations are driven by special s390 instructions Generating config cycles is always arch specific. 2) all s390 PCI instructions are privileged While the operations to generate config cycles on x86 are not privileged, they must be arbitrated between accesses, so in a sense they're privileged. 3) PCI config and memory spaces can not be mmap'ed VFIO has mapping flags that allow any region to specify mmap support. 4) no classic interrupts (INTX, MSI). The pci hw understands the concept of requesting MSIX irqs but irqs are delivered as s390 adapter irqs. VFIO delivers interrupts as eventfds regardless of the underlying platform mechanism. 5) For DMA access there is always an IOMMU required. x86 requires the same. s390 pci implementation does not support a complete memory to iommu mapping, dma mappings are created on request. Sounds like POWER. 6) The OS does not get any information about the physical layout of the PCI bus. If that means that every device is isolated (seems unlikely for multifunction devices) then that makes IOMMU group support really easy. 7) To take advantage of system z specific virtualization features we need to access the SIE control block residing in the kernel KVM The KVM-VFIO device allows interaction between VFIO devices and KVM. 8) To enable system z specific virtualization features we have to manipulate the zpci device in kernel. VFIO supports different device backends, currently pci_dev and working towards platform devices. zpci might just be an extension to standard pci. For these reasons I decided to implement a kernel based approach similar to x86 device assignment. There is a new qemu device (s390-pci) representing a pass through device on the host. 
Here is a sample qemu device configuration: -device s390-pci,host=:00:00.0 The device executes the KVM_ASSIGN_PCI_DEVICE ioctl to create a proxy instance in the kernel KVM and connect this instance to the host pci device. kernel patches apply to linux-kvm s390: cio: chsc function to register GIB s390: pci: export pci functions for pass-through usage KVM: s390: Add GISA support KVM: s390: Add PCI pass-through support qemu patches apply to qemu-master s390: Add PCI bus support s390: Add PCI pass-through device support Feedback and discussion are highly welcome ... KVM-based device assignment needs to go away. It's a horrible model for devices, it offers very little protection to the kernel, assumes every device is fully isolated and visible to the IOMMU, relies on a smattering of sysfs files to operate, etc. x86, POWER, and ARM are all moving to VFIO-based device assignment. Why is s390 special enough to repeat all the mistakes that x86 made? Thanks, Alex -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 3/4] KVM: x86: inject nested page faults on emulated instructions
Il 04/09/2014 09:02, Gleb Natapov ha scritto: On Tue, Sep 02, 2014 at 05:13:49PM +0200, Paolo Bonzini wrote: This is required for the following patch to work correctly. If a nested page fault happens during emulation, we must inject a vmexit, not a page fault. Luckily we already have the required machinery: it is enough to return X86EMUL_INTERCEPTED instead of X86EMUL_PROPAGATE_FAULT. I wonder why this patch is needed. X86EMUL_PROPAGATE_FAULT causes ctxt-have_exception to be set to true in x86_emulate_insn(). x86_emulate_instruction() checks ctxt-have_exception and calls inject_emulated_exception() if it is true. inject_emulated_exception() calls kvm_propagate_fault() where we check if the fault was nested and generate vmexit or a page fault accordingly. Good question. :) If you do that, KVM gets down to the if (writeback) and writes the ctxt-eip from L2 into the L1 EIP. Possibly this patch can be replaced by just this? diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 022513b..475e979 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5312,7 +5312,7 @@ restart: if (ctxt-have_exception) { inject_emulated_exception(vcpu); - r = EMULATE_DONE; + return EMULATE_DONE; } else if (vcpu-arch.pio.count) { if (!vcpu-arch.pio.in) { /* FIXME: return into emulator if single-stepping. */ But I'm not sure how to test it, and I like the idea of treating nested page faults like other nested vmexits during emulation (which is what this patch does). 
If I included this patch, I could then remove kvm_propagate_fault like (I think) this: diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 92493e10937c..e096db566ac2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4910,9 +4902,10 @@ static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) static void inject_emulated_exception(struct kvm_vcpu *vcpu) { struct x86_emulate_ctxt *ctxt = vcpu-arch.emulate_ctxt; - if (ctxt-exception.vector == PF_VECTOR) - kvm_propagate_fault(vcpu, ctxt-exception); - else if (ctxt-exception.error_code_valid) + if (ctxt-exception.vector == PF_VECTOR) { + WARN_ON(fault-nested_page_fault); + vcpu-arch.walk_mmu-inject_page_fault(vcpu, fault); + } else if (ctxt-exception.error_code_valid) kvm_queue_exception_e(vcpu, ctxt-exception.vector, ctxt-exception.error_code); else What do you think? Paolo -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC][patch 3/6] KVM: s390: Add GISA support
On Thu, Sep 04, 2014 at 12:52:26PM +0200, frank.blasc...@de.ibm.com wrote: +void kvm_s390_gisa_register_alert(struct kvm *kvm, u32 gisc) +{ + int bito = BITS_PER_BYTE * 7 + gisc; + + set_bit(bito ^ (BITS_PER_LONG - 1), kvm-arch.iam); +} Just a very minor nit: you could also use set_bit_inv() friends. +static inline u64 kvm_s390_get_base_disp_rxy(struct kvm_vcpu *vcpu) +{ + u32 x2 = (vcpu-arch.sie_block-ipa 0x000f); + u32 base2 = vcpu-arch.sie_block-ipb 28; + u32 disp2 = ((vcpu-arch.sie_block-ipb 0x0fff) 16) + + ((vcpu-arch.sie_block-ipb 0xff00) 4); + + return (base2 ? vcpu-run-s.regs.gprs[base2] : 0) + + (x2 ? vcpu-run-s.regs.gprs[x2] : 0) + (u64)disp2; +} Not very readable ;) However.. for the RXY instruction format the 20 bit displacement is usually signed and not unsigned like your code seems to treat it. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 3/4] KVM: x86: inject nested page faults on emulated instructions
On Thu, Sep 04, 2014 at 04:12:19PM +0200, Paolo Bonzini wrote: Il 04/09/2014 09:02, Gleb Natapov ha scritto: On Tue, Sep 02, 2014 at 05:13:49PM +0200, Paolo Bonzini wrote: This is required for the following patch to work correctly. If a nested page fault happens during emulation, we must inject a vmexit, not a page fault. Luckily we already have the required machinery: it is enough to return X86EMUL_INTERCEPTED instead of X86EMUL_PROPAGATE_FAULT. I wonder why this patch is needed. X86EMUL_PROPAGATE_FAULT causes ctxt-have_exception to be set to true in x86_emulate_insn(). x86_emulate_instruction() checks ctxt-have_exception and calls inject_emulated_exception() if it is true. inject_emulated_exception() calls kvm_propagate_fault() where we check if the fault was nested and generate vmexit or a page fault accordingly. Good question. :) If you do that, KVM gets down to the if (writeback) and writes the ctxt-eip from L2 into the L1 EIP. Heh, that's a bummer. We should not write back if an instruction caused a vmexit. Possibly this patch can be replaced by just this? diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 022513b..475e979 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5312,7 +5312,7 @@ restart: if (ctxt-have_exception) { inject_emulated_exception(vcpu); - r = EMULATE_DONE; + return EMULATE_DONE; If there was no vmexit we still want to writeback. Perhaps: writeback = inject_emulated_exception(vcpu); and return false if there was vmexit due to nested page fault (or any fault, can't L1 ask for #GP/#UD intercept that need to be handled here too?) } else if (vcpu-arch.pio.count) { if (!vcpu-arch.pio.in) { /* FIXME: return into emulator if single-stepping. */ But I'm not sure how to test it, and I like the idea of treating nested page faults like other nested vmexits during emulation (which is what this patch does). 
IMO exits due to instruction intercept and exits due to other interceptable events that may happen during instruction emulation are sufficiently different to be handled slightly different. If my assumption about #GP above are correct with current approach it can be easily handled inside inject_emulated_exception(). -- Gleb. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v5 0/3] ivshmem: update documentation, add client/server tools
On Thu, Sep 04, 2014 at 02:50:58PM +0200, David Marchand wrote: Here is a patchset containing an update on ivshmem specs documentation and importing ivshmem server and client tools. These tools have been written from scratch and are not related to what is available in the nahanni repository. I put them in contrib/ directory as the qemu-doc.texi was already telling the server was supposed to be there. I think it's a very nice patchset, and very useful. Some comments on the patches follow. Changes since v4: - squashed patches 3-13 from v4 into first patch - reused reported error when parsing arguments in server - fixed spelling mistakes in documentation in second patch Changes since v3: - first patch is untouched - just restored the Reviewed-By Claudio in second patch - following patches 3-8 take into account Stefan's comments - patches 9-12 take into account Gonglei's comments - patch 13 adjusts ivshmem-server default values - last patch introduces a change in the ivshmem client-server protocol to check a protocol version at connect time Changes since v2: - fixed license issues in ivshmem client/server (I took hw/virtio/virtio-rng.c file as a reference). Changes since v1: - moved client/server import patch before doc update, - tried to re-organise the ivshmem_device_spec.txt file based on Claudio comments (still not sure if the result is that great, comments welcome), - incorporated comments from Claudio, Eric and Cam, - added more details on the server - client messages exchange (but sorry, no ASCII art here). By the way, there are still some functionalities that need description (use of ioeventfd, the lack of irqfd support) and some parts of the ivshmem code clearly need cleanup. I will try to address this in future patches when these first patches are ok. 
-- David Marchand David Marchand (3): contrib: add ivshmem client and server docs: update ivshmem device spec ivshmem: add check on protocol version in QEMU Makefile|8 + configure |3 + contrib/ivshmem-client/ivshmem-client.c | 413 +++ contrib/ivshmem-client/ivshmem-client.h | 240 ++ contrib/ivshmem-client/main.c | 237 ++ contrib/ivshmem-server/ivshmem-server.c | 402 ++ contrib/ivshmem-server/ivshmem-server.h | 187 ++ contrib/ivshmem-server/main.c | 244 ++ docs/specs/ivshmem_device_spec.txt | 127 +++--- hw/misc/ivshmem.c | 43 +++- include/hw/misc/ivshmem.h | 17 ++ qemu-doc.texi | 10 +- 12 files changed, 1888 insertions(+), 43 deletions(-) create mode 100644 contrib/ivshmem-client/ivshmem-client.c create mode 100644 contrib/ivshmem-client/ivshmem-client.h create mode 100644 contrib/ivshmem-client/main.c create mode 100644 contrib/ivshmem-server/ivshmem-server.c create mode 100644 contrib/ivshmem-server/ivshmem-server.h create mode 100644 contrib/ivshmem-server/main.c create mode 100644 include/hw/misc/ivshmem.h -- 1.7.10.4 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v5 1/3] contrib: add ivshmem client and server
On Thu, Sep 04, 2014 at 02:50:59PM +0200, David Marchand wrote: When using ivshmem devices, notifications between guests can be sent as interrupts using a ivshmem-server (typical use described in documentation). The client is provided as a debug tool. Signed-off-by: Olivier Matz olivier.m...@6wind.com Signed-off-by: David Marchand david.march...@6wind.com --- Makefile|8 + configure |3 + contrib/ivshmem-client/ivshmem-client.c | 405 +++ contrib/ivshmem-client/ivshmem-client.h | 239 ++ contrib/ivshmem-client/main.c | 237 ++ contrib/ivshmem-server/ivshmem-server.c | 395 ++ contrib/ivshmem-server/ivshmem-server.h | 186 ++ contrib/ivshmem-server/main.c | 244 +++ qemu-doc.texi | 10 +- 9 files changed, 1724 insertions(+), 3 deletions(-) create mode 100644 contrib/ivshmem-client/ivshmem-client.c create mode 100644 contrib/ivshmem-client/ivshmem-client.h create mode 100644 contrib/ivshmem-client/main.c create mode 100644 contrib/ivshmem-server/ivshmem-server.c create mode 100644 contrib/ivshmem-server/ivshmem-server.h create mode 100644 contrib/ivshmem-server/main.c diff --git a/Makefile b/Makefile index b33aaac..0575898 100644 --- a/Makefile +++ b/Makefile @@ -283,6 +283,14 @@ $(qga-obj-y) qemu-ga.o: $(QGALIB_GEN) qemu-ga$(EXESUF): $(qga-obj-y) libqemuutil.a libqemustub.a $(call LINK, $^) +IVSHMEM_CLIENT_OBJS=$(addprefix $(SRC_PATH)/contrib/ivshmem-client/, ivshmem-client.o main.o) +ivshmem-client$(EXESUF): $(IVSHMEM_CLIENT_OBJS) + $(call LINK, $^) + +IVSHMEM_SERVER_OBJS=$(addprefix $(SRC_PATH)/contrib/ivshmem-server/, ivshmem-server.o main.o) +ivshmem-server$(EXESUF): $(IVSHMEM_SERVER_OBJS) libqemuutil.a libqemustub.a + $(call LINK, $^) + clean: # avoid old build problems by removing potentially incorrect old files rm -f config.mak op-i386.h opc-i386.h gen-op-i386.h op-arm.h opc-arm.h gen-op-arm.h diff --git a/configure b/configure index 961bf6f..a41a16c 100755 --- a/configure +++ b/configure @@ -4125,6 +4125,9 @@ if test $want_tools = yes ; then if [ $linux = yes -o $bsd 
= yes -o $solaris = yes ] ; then tools=qemu-nbd\$(EXESUF) $tools fi + if [ $kvm = yes ] ; then +tools=ivshmem-client\$(EXESUF) ivshmem-server\$(EXESUF) $tools + fi fi if test $softmmu = yes ; then if test $virtfs != no ; then diff --git a/contrib/ivshmem-client/ivshmem-client.c b/contrib/ivshmem-client/ivshmem-client.c new file mode 100644 index 000..ad210c8 --- /dev/null +++ b/contrib/ivshmem-client/ivshmem-client.c @@ -0,0 +1,405 @@ +/* + * Copyright 6WIND S.A., 2014 + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * (at your option) any later version. See the COPYING file in the + * top-level directory. + */ + +#include sys/types.h +#include sys/socket.h +#include sys/un.h + +#include qemu-common.h +#include qemu/queue.h + +#include ivshmem-client.h + +/* log a message on stdout if verbose=1 */ +#define debug_log(client, fmt, ...) do { \ +if ((client)-verbose) { \ +printf(fmt, ## __VA_ARGS__); \ +}\ +} while (0) + +/* read message from the unix socket */ +static int +read_one_msg(IvshmemClient *client, long *index, int *fd) +{ +int ret; +struct msghdr msg; +struct iovec iov[1]; +union { +struct cmsghdr cmsg; +char control[CMSG_SPACE(sizeof(int))]; +} msg_control; +struct cmsghdr *cmsg; + +iov[0].iov_base = index; +iov[0].iov_len = sizeof(*index); + +memset(msg, 0, sizeof(msg)); +msg.msg_iov = iov; +msg.msg_iovlen = 1; +msg.msg_control = msg_control; +msg.msg_controllen = sizeof(msg_control); + +ret = recvmsg(client-sock_fd, msg, 0); +if (ret 0) { +debug_log(client, cannot read message: %s\n, strerror(errno)); +return -1; +} +if (ret == 0) { +debug_log(client, lost connection to server\n); +return -1; +} + +*fd = -1; + +for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) { + +if (cmsg-cmsg_len != CMSG_LEN(sizeof(int)) || +cmsg-cmsg_level != SOL_SOCKET || +cmsg-cmsg_type != SCM_RIGHTS) { +continue; +} + +memcpy(fd, CMSG_DATA(cmsg), sizeof(*fd)); +} + +return 0; +} + +/* free a peer when the server advertise a 
disconnection or when the + * client is freed */ +static void +free_peer(IvshmemClient *client, IvshmemClientPeer *peer) +{ +unsigned vector; + +QTAILQ_REMOVE(client-peer_list, peer, next); +for (vector = 0; vector peer-vectors_count; vector++) { +
Re: [PATCH v5 1/3] contrib: add ivshmem client and server
On Thu, Sep 04, 2014 at 02:50:59PM +0200, David Marchand wrote: When using ivshmem devices, notifications between guests can be sent as interrupts using a ivshmem-server (typical use described in documentation). The client is provided as a debug tool. Signed-off-by: Olivier Matz olivier.m...@6wind.com Signed-off-by: David Marchand david.march...@6wind.com --- Makefile|8 + configure |3 + contrib/ivshmem-client/ivshmem-client.c | 405 +++ contrib/ivshmem-client/ivshmem-client.h | 239 ++ contrib/ivshmem-client/main.c | 237 ++ contrib/ivshmem-server/ivshmem-server.c | 395 ++ contrib/ivshmem-server/ivshmem-server.h | 186 ++ contrib/ivshmem-server/main.c | 244 +++ qemu-doc.texi | 10 +- 9 files changed, 1724 insertions(+), 3 deletions(-) create mode 100644 contrib/ivshmem-client/ivshmem-client.c create mode 100644 contrib/ivshmem-client/ivshmem-client.h create mode 100644 contrib/ivshmem-client/main.c create mode 100644 contrib/ivshmem-server/ivshmem-server.c create mode 100644 contrib/ivshmem-server/ivshmem-server.h create mode 100644 contrib/ivshmem-server/main.c diff --git a/Makefile b/Makefile index b33aaac..0575898 100644 --- a/Makefile +++ b/Makefile @@ -283,6 +283,14 @@ $(qga-obj-y) qemu-ga.o: $(QGALIB_GEN) qemu-ga$(EXESUF): $(qga-obj-y) libqemuutil.a libqemustub.a $(call LINK, $^) +IVSHMEM_CLIENT_OBJS=$(addprefix $(SRC_PATH)/contrib/ivshmem-client/, ivshmem-client.o main.o) +ivshmem-client$(EXESUF): $(IVSHMEM_CLIENT_OBJS) + $(call LINK, $^) + +IVSHMEM_SERVER_OBJS=$(addprefix $(SRC_PATH)/contrib/ivshmem-server/, ivshmem-server.o main.o) +ivshmem-server$(EXESUF): $(IVSHMEM_SERVER_OBJS) libqemuutil.a libqemustub.a + $(call LINK, $^) + clean: # avoid old build problems by removing potentially incorrect old files rm -f config.mak op-i386.h opc-i386.h gen-op-i386.h op-arm.h opc-arm.h gen-op-arm.h diff --git a/configure b/configure index 961bf6f..a41a16c 100755 --- a/configure +++ b/configure @@ -4125,6 +4125,9 @@ if test $want_tools = yes ; then if [ $linux = yes -o $bsd 
= yes -o $solaris = yes ] ; then tools=qemu-nbd\$(EXESUF) $tools fi + if [ $kvm = yes ] ; then +tools=ivshmem-client\$(EXESUF) ivshmem-server\$(EXESUF) $tools + fi fi if test $softmmu = yes ; then if test $virtfs != no ; then diff --git a/contrib/ivshmem-client/ivshmem-client.c b/contrib/ivshmem-client/ivshmem-client.c new file mode 100644 index 000..ad210c8 --- /dev/null +++ b/contrib/ivshmem-client/ivshmem-client.c @@ -0,0 +1,405 @@ +/* + * Copyright 6WIND S.A., 2014 + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * (at your option) any later version. See the COPYING file in the + * top-level directory. + */ + +#include sys/types.h +#include sys/socket.h +#include sys/un.h + +#include qemu-common.h +#include qemu/queue.h + +#include ivshmem-client.h + +/* log a message on stdout if verbose=1 */ +#define debug_log(client, fmt, ...) do { \ +if ((client)-verbose) { \ +printf(fmt, ## __VA_ARGS__); \ +}\ +} while (0) + +/* read message from the unix socket */ +static int +read_one_msg(IvshmemClient *client, long *index, int *fd) +{ +int ret; +struct msghdr msg; +struct iovec iov[1]; +union { +struct cmsghdr cmsg; +char control[CMSG_SPACE(sizeof(int))]; +} msg_control; +struct cmsghdr *cmsg; + +iov[0].iov_base = index; +iov[0].iov_len = sizeof(*index); + +memset(msg, 0, sizeof(msg)); +msg.msg_iov = iov; +msg.msg_iovlen = 1; +msg.msg_control = msg_control; +msg.msg_controllen = sizeof(msg_control); + +ret = recvmsg(client-sock_fd, msg, 0); +if (ret 0) { +debug_log(client, cannot read message: %s\n, strerror(errno)); +return -1; +} +if (ret == 0) { +debug_log(client, lost connection to server\n); +return -1; +} + +*fd = -1; + +for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) { + +if (cmsg-cmsg_len != CMSG_LEN(sizeof(int)) || +cmsg-cmsg_level != SOL_SOCKET || +cmsg-cmsg_type != SCM_RIGHTS) { +continue; +} + +memcpy(fd, CMSG_DATA(cmsg), sizeof(*fd)); +} + +return 0; +} + +/* free a peer when the server advertise a 
disconnection or when the + * client is freed */ +static void +free_peer(IvshmemClient *client, IvshmemClientPeer *peer) +{ +unsigned vector; + +QTAILQ_REMOVE(client-peer_list, peer, next); +for (vector = 0; vector peer-vectors_count; vector++) { +
Re: [PATCH] KVM: x86: fix kvmclock breakage from timers branch merge
On 09/04/2014 07:58 AM, Paolo Bonzini wrote: Commit cbcf2dd3b3d4 (x86: kvm: Make kvm_get_time_and_clockread() nanoseconds based, 2014-07-16) forgot to add tk-xtime_sec, thus breaking kvmclock on hosts that have a reliable TSC. Add it back; and since the field boot_ns is not anymore related to the host boot-based clock, rename boot_ns-nsec_base and the existing nsec_base-snsec_base. Cc: Thomas Gleixner t...@linutronix.de Cc: John Stultz john.stu...@linaro.org Reported-by: Chris J Arges chris.j.ar...@canonical.com Signed-off-by: Paolo Bonzini pbonz...@redhat.com --- arch/x86/kvm/x86.c | 11 ++- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8f1e22d3b286..92493e10937c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1020,8 +1020,8 @@ struct pvclock_gtod_data { u32 shift; } clock; - u64 boot_ns; u64 nsec_base; + u64 snsec_base; }; static struct pvclock_gtod_data pvclock_gtod_data; @@ -1042,8 +1042,9 @@ static void update_pvclock_gtod(struct timekeeper *tk) vdata-clock.mult = tk-tkr.mult; vdata-clock.shift = tk-tkr.shift; - vdata-boot_ns = boot_ns; - vdata-nsec_base= tk-tkr.xtime_nsec; + vdata-nsec_base= tk-xtime_sec * (u64)NSEC_PER_SEC + + boot_ns; + vdata-snsec_base = tk-tkr.xtime_nsec; write_seqcount_end(vdata-seq); } @@ -1413,10 +1414,10 @@ static int do_monotonic_boot(s64 *t, cycle_t *cycle_now) do { seq = read_seqcount_begin(gtod-seq); mode = gtod-clock.vclock_mode; - ns = gtod-nsec_base; + ns = gtod-snsec_base; ns += vgettsc(cycle_now); ns = gtod-clock.shift; - ns += gtod-boot_ns; + ns += gtod-nsec_base; } while (unlikely(read_seqcount_retry(gtod-seq, seq))); *t = ns; Paulo, I've tested with the above patch and I still have issues with the kvmclock test offset; however the cycle tests pass now. 
Here is trace data: http://people.canonical.com/~arges/kvm/trace-4.dat.xz Uptime: 15:58:02 up 1:00, 1 user, load average: 0.59, 0.60, 0.31 Here is the output: ./x86-run x86/kvmclock_test.flat -smp 2 --append 1000 `date +%s` qemu-system-x86_64 -enable-kvm -device pc-testdev -device isa-debug-exit,iobase=0xf4,iosize=0x4 -display none -serial stdio -device pci-testdev -kernel x86/kvmclock_test.flat -smp 2 --append 1000 1409846210 enabling apic enabling apic kvm-clock: cpu 0, msr 0x:44d4c0 kvm-clock: cpu 0, msr 0x:44d4c0 Wallclock test, threshold 5 Seconds get from host: 1409846210 Seconds get from kvmclock: 2819688866 Offset:1409842656 offset too large! Check the stability of raw cycle ... Total vcpus: 2 Test loops: 1000 Total warps: 0 Total stalls: 0 Worst warp: 0 Raw cycle is stable Monotonic cycle test: Total vcpus: 2 Test loops: 1000 Total warps: 0 Total stalls: 0 Worst warp: 0 Measure the performance of raw cycle ... Total vcpus: 2 Test loops: 1000 TSC cycles: 1139288710 Measure the performance of adjusted cycle ... Total vcpus: 2 Test loops: 1000 TSC cycles: 1138643774 Return value from qemu: 3 My observation is that the kvmclock value seems to be positively biased by the boot_ns value. --chris j arges -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] KVM: x86: fix kvmclock breakage from timers branch merge
Il 04/09/2014 18:00, Chris J Arges ha scritto: Uptime: 15:58:02 up 1:00, 1 user, load average: 0.59, 0.60, 0.31 Here is the output: ./x86-run x86/kvmclock_test.flat -smp 2 --append 1000 `date +%s` qemu-system-x86_64 -enable-kvm -device pc-testdev -device isa-debug-exit,iobase=0xf4,iosize=0x4 -display none -serial stdio -device pci-testdev -kernel x86/kvmclock_test.flat -smp 2 --append 1000 1409846210 enabling apic enabling apic kvm-clock: cpu 0, msr 0x:44d4c0 kvm-clock: cpu 0, msr 0x:44d4c0 Wallclock test, threshold 5 Seconds get from host: 1409846210 Seconds get from kvmclock: 2819688866 Offset:1409842656 With kvm/queue this would have been roughly -3600, now it's host_wallclock-3600. So the patch hasn't fixed the -3600 part for you. Can you try applying this patch on top of 3.16? This is my backport of Thomas's patch. If this works for you, we only have to find out how to compute boot_ns and nsec_base in the new timekeeping world order of 3.17... Thomas, do you have any ideas? Every time a VM is started, the kvmclock starts at the boot time of the host, instead of the current wallclock time. 
Paolo diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d38abc81db65..70de23f1de51 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1020,9 +1020,8 @@ struct pvclock_gtod_data { u32 shift; } clock; - /* open coded 'struct timespec' */ - u64 monotonic_time_snsec; - time_t monotonic_time_sec; + u64 boot_ns; + u64 nsec_base; }; static struct pvclock_gtod_data pvclock_gtod_data; @@ -1030,6 +1029,12 @@ static struct pvclock_gtod_data pvclock_gtod_data; static void update_pvclock_gtod(struct timekeeper *tk) { struct pvclock_gtod_data *vdata = pvclock_gtod_data; + u64 boot_ns; + + boot_ns = timespec_to_ns(tk-total_sleep_time) + + tk-wall_to_monotonic.tv_sec * (u64)NSEC_PER_SEC + + tk-wall_to_monotonic.tv_nsec + + tk-xtime_sec * (u64)NSEC_PER_SEC; write_seqcount_begin(vdata-seq); @@ -1040,17 +1044,8 @@ static void update_pvclock_gtod(struct timekeeper *tk) vdata-clock.mult = tk-mult; vdata-clock.shift = tk-shift; - vdata-monotonic_time_sec = tk-xtime_sec - + tk-wall_to_monotonic.tv_sec; - vdata-monotonic_time_snsec = tk-xtime_nsec - + (tk-wall_to_monotonic.tv_nsec -tk-shift); - while (vdata-monotonic_time_snsec = - (((u64)NSEC_PER_SEC) tk-shift)) { - vdata-monotonic_time_snsec -= - ((u64)NSEC_PER_SEC) tk-shift; - vdata-monotonic_time_sec++; - } + vdata-boot_ns = boot_ns; + vdata-nsec_base= tk-xtime_nsec; write_seqcount_end(vdata-seq); } @@ -1414,23 +1409,22 @@ static inline u64 vgettsc(cycle_t *cycle_now) return v * gtod-clock.mult; } -static int do_monotonic(struct timespec *ts, cycle_t *cycle_now) +static int do_monotonic_boot(s64 *t, cycle_t *cycle_now) { + struct pvclock_gtod_data *gtod = pvclock_gtod_data; unsigned long seq; - u64 ns; int mode; - struct pvclock_gtod_data *gtod = pvclock_gtod_data; + u64 ns; - ts-tv_nsec = 0; do { seq = read_seqcount_begin(gtod-seq); mode = gtod-clock.vclock_mode; - ts-tv_sec = gtod-monotonic_time_sec; - ns = gtod-monotonic_time_snsec; + ns = gtod-nsec_base; ns += vgettsc(cycle_now); ns = gtod-clock.shift; + ns += 
gtod-boot_ns; } while (unlikely(read_seqcount_retry(gtod-seq, seq))); - timespec_add_ns(ts, ns); + *t = ns; return mode; } @@ -1438,19 +1432,11 @@ static int do_monotonic(struct timespec *ts, cycle_t *cycle_now) /* returns true if host is using tsc clocksource */ static bool kvm_get_time_and_clockread(s64 *kernel_ns, cycle_t *cycle_now) { - struct timespec ts; - /* checked again under seqlock below */ if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC) return false; - if (do_monotonic(ts, cycle_now) != VCLOCK_TSC) - return false; - - monotonic_to_bootbased(ts); - *kernel_ns = timespec_to_ns(ts); - - return true; + return do_monotonic_boot(kernel_ns, cycle_now) == VCLOCK_TSC; } #endif My observation is that the kvmclock value seems to be positively biased by the boot_ns value. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at
Re: [PATCH 3/4] KVM: x86: inject nested page faults on emulated instructions
Il 04/09/2014 17:05, Gleb Natapov ha scritto: if (ctxt->have_exception) { inject_emulated_exception(vcpu); - r = EMULATE_DONE; + return EMULATE_DONE; If there was no vmexit we still want to writeback. Perhaps: writeback = inject_emulated_exception(vcpu); and return false if there was a vmexit due to a nested page fault (or any fault, can't L1 ask for a #GP/#UD intercept that needs to be handled here too?) Sounds good. Paolo -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 3/4] KVM: x86: inject nested page faults on emulated instructions
Il 04/09/2014 17:05, Gleb Natapov ha scritto: If you do that, KVM gets down to the if (writeback) and writes the ctxt-eip from L2 into the L1 EIP. Heh, that's a bummer. We should not write back if an instruction caused a vmexit. You're right, that works. Paolo -- 8 - Subject: [PATCH] KVM: x86: skip writeback on injection of nested exception If a nested page fault happens during emulation, we will inject a vmexit, not a page fault. However because writeback happens after the injection, we will write ctxt-eip from L2 into the L1 EIP. We do not write back if an instruction caused an interception vmexit---do the same for page faults. Signed-off-by: Paolo Bonzini pbonz...@redhat.com --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/x86.c | 15 ++- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 08cc299..c989651 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -893,7 +893,6 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault); int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, gfn_t gfn, void *data, int offset, int len, u32 access); -void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault); bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl); static inline int __kvm_irq_line_state(unsigned long *irq_state, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e4ed85e..3541946 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -408,12 +408,14 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) } EXPORT_SYMBOL_GPL(kvm_inject_page_fault); -void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) +static bool kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) { if (mmu_is_nested(vcpu) !fault-nested_page_fault) vcpu-arch.nested_mmu.inject_page_fault(vcpu, fault); else 
vcpu-arch.mmu.inject_page_fault(vcpu, fault); + + return fault-nested_page_fault; } void kvm_inject_nmi(struct kvm_vcpu *vcpu) @@ -4929,16 +4931,18 @@ static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) } } -static void inject_emulated_exception(struct kvm_vcpu *vcpu) +static bool inject_emulated_exception(struct kvm_vcpu *vcpu) { struct x86_emulate_ctxt *ctxt = vcpu-arch.emulate_ctxt; if (ctxt-exception.vector == PF_VECTOR) - kvm_propagate_fault(vcpu, ctxt-exception); - else if (ctxt-exception.error_code_valid) + return kvm_propagate_fault(vcpu, ctxt-exception); + + if (ctxt-exception.error_code_valid) kvm_queue_exception_e(vcpu, ctxt-exception.vector, ctxt-exception.error_code); else kvm_queue_exception(vcpu, ctxt-exception.vector); + return false; } static void init_emulate_ctxt(struct kvm_vcpu *vcpu) @@ -5300,8 +5304,9 @@ restart: } if (ctxt-have_exception) { - inject_emulated_exception(vcpu); r = EMULATE_DONE; + if (inject_emulated_exception(vcpu)) + return r; } else if (vcpu-arch.pio.count) { if (!vcpu-arch.pio.in) { /* FIXME: return into emulator if single-stepping. */ -- 1.9.3 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] KVM: x86: fix kvmclock breakage from timers branch merge
Il 04/09/2014 14:58, Paolo Bonzini ha scritto: Commit cbcf2dd3b3d4 (x86: kvm: Make kvm_get_time_and_clockread() nanoseconds based, 2014-07-16) forgot to add tk-xtime_sec, thus breaking kvmclock on hosts that have a reliable TSC. Add it back; and since the field boot_ns is not anymore related to the host boot-based clock, rename boot_ns-nsec_base and the existing nsec_base-snsec_base. Cc: Thomas Gleixner t...@linutronix.de Cc: John Stultz john.stu...@linaro.org Reported-by: Chris J Arges chris.j.ar...@canonical.com Signed-off-by: Paolo Bonzini pbonz...@redhat.com --- arch/x86/kvm/x86.c | 11 ++- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8f1e22d3b286..92493e10937c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1020,8 +1020,8 @@ struct pvclock_gtod_data { u32 shift; } clock; - u64 boot_ns; u64 nsec_base; + u64 snsec_base; }; static struct pvclock_gtod_data pvclock_gtod_data; @@ -1042,8 +1042,9 @@ static void update_pvclock_gtod(struct timekeeper *tk) vdata-clock.mult = tk-tkr.mult; vdata-clock.shift = tk-tkr.shift; - vdata-boot_ns = boot_ns; - vdata-nsec_base= tk-tkr.xtime_nsec; + vdata-nsec_base= tk-xtime_sec * (u64)NSEC_PER_SEC + + boot_ns; + vdata-snsec_base = tk-tkr.xtime_nsec; Hmm, I found this comment in kernel/time/timekeeping.c /* * The xtime based monotonic readout is: * nsec = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec + now(); * The ktime based monotonic readout is: * nsec = base_mono + now(); so this patch makes no sense. The offs_boot part must be broken. 
Paolo write_seqcount_end(vdata-seq); } @@ -1413,10 +1414,10 @@ static int do_monotonic_boot(s64 *t, cycle_t *cycle_now) do { seq = read_seqcount_begin(gtod-seq); mode = gtod-clock.vclock_mode; - ns = gtod-nsec_base; + ns = gtod-snsec_base; ns += vgettsc(cycle_now); ns = gtod-clock.shift; - ns += gtod-boot_ns; + ns += gtod-nsec_base; } while (unlikely(read_seqcount_retry(gtod-seq, seq))); *t = ns; -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] KVM: x86: fix kvmclock breakage from timers branch merge
On 09/04/2014 12:14 PM, Paolo Bonzini wrote: Il 04/09/2014 18:00, Chris J Arges ha scritto: Uptime: 15:58:02 up 1:00, 1 user, load average: 0.59, 0.60, 0.31 Here is the output: ./x86-run x86/kvmclock_test.flat -smp 2 --append 1000 `date +%s` qemu-system-x86_64 -enable-kvm -device pc-testdev -device isa-debug-exit,iobase=0xf4,iosize=0x4 -display none -serial stdio -device pci-testdev -kernel x86/kvmclock_test.flat -smp 2 --append 1000 1409846210 enabling apic enabling apic kvm-clock: cpu 0, msr 0x:44d4c0 kvm-clock: cpu 0, msr 0x:44d4c0 Wallclock test, threshold 5 Seconds get from host: 1409846210 Seconds get from kvmclock: 2819688866 Offset:1409842656 With kvm/queue this would have been roughly -3600, now it's host_wallclock-3600. So the patch hasn't fixed the -3600 part for you. Can you try applying this patch on top of 3.16? This is my backport of Thomas's patch. If this works for you, we only have to find out how to compute boot_ns and nsec_base in the new timekeeping world order of 3.17... Paolo, The patch below applied to 3.16 still allows the testcase to pass on my hardware. --chris Thomas, do you have any ideas? Every time a VM is started, the kvmclock starts at the boot time of the host, instead of the current wallclock time. 
Paolo diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d38abc81db65..70de23f1de51 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1020,9 +1020,8 @@ struct pvclock_gtod_data { u32 shift; } clock; - /* open coded 'struct timespec' */ - u64 monotonic_time_snsec; - time_t monotonic_time_sec; + u64 boot_ns; + u64 nsec_base; }; static struct pvclock_gtod_data pvclock_gtod_data; @@ -1030,6 +1029,12 @@ static struct pvclock_gtod_data pvclock_gtod_data; static void update_pvclock_gtod(struct timekeeper *tk) { struct pvclock_gtod_data *vdata = pvclock_gtod_data; + u64 boot_ns; + + boot_ns = timespec_to_ns(tk-total_sleep_time) + + tk-wall_to_monotonic.tv_sec * (u64)NSEC_PER_SEC + + tk-wall_to_monotonic.tv_nsec + + tk-xtime_sec * (u64)NSEC_PER_SEC; write_seqcount_begin(vdata-seq); @@ -1040,17 +1044,8 @@ static void update_pvclock_gtod(struct timekeeper *tk) vdata-clock.mult = tk-mult; vdata-clock.shift = tk-shift; - vdata-monotonic_time_sec = tk-xtime_sec - + tk-wall_to_monotonic.tv_sec; - vdata-monotonic_time_snsec = tk-xtime_nsec - + (tk-wall_to_monotonic.tv_nsec - tk-shift); - while (vdata-monotonic_time_snsec = - (((u64)NSEC_PER_SEC) tk-shift)) { - vdata-monotonic_time_snsec -= - ((u64)NSEC_PER_SEC) tk-shift; - vdata-monotonic_time_sec++; - } + vdata-boot_ns = boot_ns; + vdata-nsec_base= tk-xtime_nsec; write_seqcount_end(vdata-seq); } @@ -1414,23 +1409,22 @@ static inline u64 vgettsc(cycle_t *cycle_now) return v * gtod-clock.mult; } -static int do_monotonic(struct timespec *ts, cycle_t *cycle_now) +static int do_monotonic_boot(s64 *t, cycle_t *cycle_now) { + struct pvclock_gtod_data *gtod = pvclock_gtod_data; unsigned long seq; - u64 ns; int mode; - struct pvclock_gtod_data *gtod = pvclock_gtod_data; + u64 ns; - ts-tv_nsec = 0; do { seq = read_seqcount_begin(gtod-seq); mode = gtod-clock.vclock_mode; - ts-tv_sec = gtod-monotonic_time_sec; - ns = gtod-monotonic_time_snsec; + ns = gtod-nsec_base; ns += vgettsc(cycle_now); ns = gtod-clock.shift; + ns += 
gtod-boot_ns; } while (unlikely(read_seqcount_retry(gtod-seq, seq))); - timespec_add_ns(ts, ns); + *t = ns; return mode; } @@ -1438,19 +1432,11 @@ static int do_monotonic(struct timespec *ts, cycle_t *cycle_now) /* returns true if host is using tsc clocksource */ static bool kvm_get_time_and_clockread(s64 *kernel_ns, cycle_t *cycle_now) { - struct timespec ts; - /* checked again under seqlock below */ if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC) return false; - if (do_monotonic(ts, cycle_now) != VCLOCK_TSC) - return false; - - monotonic_to_bootbased(ts); - *kernel_ns = timespec_to_ns(ts); - - return true; + return do_monotonic_boot(kernel_ns, cycle_now) == VCLOCK_TSC; } #endif My observation is that the kvmclock value seems to be positively biased by the boot_ns value. -- To unsubscribe from
Re: [PATCH] KVM: x86: fix kvmclock breakage from timers branch merge
On Thu, Sep 4, 2014 at 9:00 AM, Chris J Arges chris.j.ar...@canonical.com wrote: On 09/04/2014 07:58 AM, Paolo Bonzini wrote: Commit cbcf2dd3b3d4 (x86: kvm: Make kvm_get_time_and_clockread() nanoseconds based, 2014-07-16) forgot to add tk-xtime_sec, thus breaking kvmclock on hosts that have a reliable TSC. Add it back; and since the field boot_ns is not anymore related to the host boot-based clock, rename boot_ns-nsec_base and the existing nsec_base-snsec_base. Cc: Thomas Gleixner t...@linutronix.de Cc: John Stultz john.stu...@linaro.org Reported-by: Chris J Arges chris.j.ar...@canonical.com Signed-off-by: Paolo Bonzini pbonz...@redhat.com --- arch/x86/kvm/x86.c | 11 ++- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8f1e22d3b286..92493e10937c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1020,8 +1020,8 @@ struct pvclock_gtod_data { u32 shift; } clock; - u64 boot_ns; u64 nsec_base; + u64 snsec_base; }; static struct pvclock_gtod_data pvclock_gtod_data; @@ -1042,8 +1042,9 @@ static void update_pvclock_gtod(struct timekeeper *tk) vdata-clock.mult = tk-tkr.mult; vdata-clock.shift = tk-tkr.shift; - vdata-boot_ns = boot_ns; - vdata-nsec_base= tk-tkr.xtime_nsec; + vdata-nsec_base= tk-xtime_sec * (u64)NSEC_PER_SEC + + boot_ns; + vdata-snsec_base = tk-tkr.xtime_nsec; write_seqcount_end(vdata-seq); } @@ -1413,10 +1414,10 @@ static int do_monotonic_boot(s64 *t, cycle_t *cycle_now) do { seq = read_seqcount_begin(gtod-seq); mode = gtod-clock.vclock_mode; - ns = gtod-nsec_base; + ns = gtod-snsec_base; ns += vgettsc(cycle_now); ns = gtod-clock.shift; - ns += gtod-boot_ns; + ns += gtod-nsec_base; } while (unlikely(read_seqcount_retry(gtod-seq, seq))); *t = ns; Paulo, I've tested with the above patch and I still have issues with the kvmclock test offset; however the cycle tests pass now. 
Here is trace data: http://people.canonical.com/~arges/kvm/trace-4.dat.xz Uptime: 15:58:02 up 1:00, 1 user, load average: 0.59, 0.60, 0.31 Here is the output: ./x86-run x86/kvmclock_test.flat -smp 2 --append 1000 `date +%s` qemu-system-x86_64 -enable-kvm -device pc-testdev -device isa-debug-exit,iobase=0xf4,iosize=0x4 -display none -serial stdio -device pci-testdev -kernel x86/kvmclock_test.flat -smp 2 --append 1000 1409846210 enabling apic enabling apic kvm-clock: cpu 0, msr 0x:44d4c0 kvm-clock: cpu 0, msr 0x:44d4c0 Wallclock test, threshold 5 Seconds get from host: 1409846210 Seconds get from kvmclock: 2819688866 Offset:1409842656 offset too large! Hey, thanks for reporting the issue and sending an initial patch (even if its not quite all sorted yet). Is the test you're using here available somewhere? Are there any special requirements to run it? thanks -john -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/3] KVM: remove redundant assignment of return value in kvm_dev_ioctl
The first statement of kvm_dev_ioctl is long r = -EINVAL; No need to reassign the same value. Signed-off-by: Christian Borntraeger borntrae...@de.ibm.com --- virt/kvm/kvm_main.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 0a824a0..5ea65d2 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2610,9 +2610,8 @@ static long kvm_dev_ioctl(struct file *filp, long r = -EINVAL; switch (ioctl) { case KVM_GET_API_VERSION: - r = -EINVAL; if (arg) goto out; r = KVM_API_VERSION; break; @@ -2622,9 +2621,8 @@ static long kvm_dev_ioctl(struct file *filp, case KVM_CHECK_EXTENSION: r = kvm_vm_ioctl_check_extension_generic(NULL, arg); break; case KVM_GET_VCPU_MMAP_SIZE: - r = -EINVAL; if (arg) goto out; r = PAGE_SIZE; /* struct kvm_run */ #ifdef CONFIG_X86 -- 1.8.4.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/3] KVM: remove redundant check of in_spin_loop
The expression `vcpu-spin_loop.in_spin_loop' is always true, because it is evaluated only when the condition `!vcpu-spin_loop.in_spin_loop' is false. Signed-off-by: Christian Borntraeger borntrae...@de.ibm.com --- virt/kvm/kvm_main.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 7176929..0a824a0 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1768,10 +1768,9 @@ static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu) #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT bool eligible; eligible = !vcpu-spin_loop.in_spin_loop || - (vcpu-spin_loop.in_spin_loop -vcpu-spin_loop.dy_eligible); + vcpu-spin_loop.dy_eligible; if (vcpu-spin_loop.in_spin_loop) kvm_vcpu_set_dy_eligible(vcpu, !vcpu-spin_loop.dy_eligible); -- 1.8.4.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 0/3] cleanup of redundant statements
Paolo, I was playing with some static code checkers. Here is some fallout from the kvm common code. Only minor things that are not real errors, just redundant statements. One could argue here and there that these statements make the code easier to understand. So, please have a look and either drop or apply the patches. Christian Borntraeger (3): KVM: remove redundant check of in_spin_loop KVM: remove redundant assignment of return value in kvm_dev_ioctl KVM: remove redundant assignments in __kvm_set_memory_region virt/kvm/kvm_main.c | 8 +--- 1 file changed, 1 insertion(+), 7 deletions(-) -- 1.8.4.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 3/3] KVM: remove redundant assignments in __kvm_set_memory_region
__kvm_set_memory_region sets r to EINVAL very early. Doing it again is not necessary. The same is true later on, where r is assigned -ENOMEM twice. Signed-off-by: Christian Borntraeger borntrae...@de.ibm.com --- virt/kvm/kvm_main.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 5ea65d2..2d868ad 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -776,9 +776,8 @@ int __kvm_set_memory_region(struct kvm *kvm, slot = id_to_memslot(kvm-memslots, mem-slot); base_gfn = mem-guest_phys_addr PAGE_SHIFT; npages = mem-memory_size PAGE_SHIFT; - r = -EINVAL; if (npages KVM_MEM_MAX_NR_PAGES) goto out; if (!npages) @@ -790,9 +789,8 @@ int __kvm_set_memory_region(struct kvm *kvm, new.base_gfn = base_gfn; new.npages = npages; new.flags = mem-flags; - r = -EINVAL; if (npages) { if (!old.npages) change = KVM_MR_CREATE; else { /* Modify an existing slot. */ @@ -846,9 +844,8 @@ int __kvm_set_memory_region(struct kvm *kvm, goto out_free; } if ((change == KVM_MR_DELETE) || (change == KVM_MR_MOVE)) { - r = -ENOMEM; slots = kmemdup(kvm-memslots, sizeof(struct kvm_memslots), GFP_KERNEL); if (!slots) goto out_free; -- 1.8.4.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] KVM: x86: fix kvmclock breakage from timers branch merge
Il 04/09/2014 21:00, John Stultz ha scritto: Hey, thanks for reporting the issue and sending an initial patch (even if it's not quite all sorted yet). Is the test you're using here available somewhere? Are there any special requirements to run it? You need KVM on a machine with clocksource=tsc. Grab the tests from git://git.kernel.org/pub/scm/virt/kvm/kvm-unit-tests.git and run them with ./configure make ./x86-run x86/kvmclock_test.flat --append 1000 `date +%s` Thanks, Paolo -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] KVM: x86: fix kvmclock breakage from timers branch merge
Il 04/09/2014 20:16, Chris J Arges ha scritto: +boot_ns = timespec_to_ns(&tk->total_sleep_time) ++ tk->wall_to_monotonic.tv_sec * (u64)NSEC_PER_SEC ++ tk->wall_to_monotonic.tv_nsec ++ tk->xtime_sec * (u64)NSEC_PER_SEC; So this means that the above 3.16-based code is not the same as boot_ns = ktime_to_ns(ktime_add(tk->tkr.base_mono, tk->offs_boot)); in 3.17. Everything else in the patch you tested is the same as the code that is in 3.17, so that's a start. Paolo -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] KVM: x86: fix kvmclock breakage from timers branch merge
Il 04/09/2014 21:15, Paolo Bonzini ha scritto: Il 04/09/2014 20:16, Chris J Arges ha scritto: + boot_ns = timespec_to_ns(tk-total_sleep_time) + + tk-wall_to_monotonic.tv_sec * (u64)NSEC_PER_SEC + + tk-wall_to_monotonic.tv_nsec + + tk-xtime_sec * (u64)NSEC_PER_SEC; So this means that the above 3.16-based code is not the same as boot_ns = ktime_to_ns(ktime_add(tk-tkr.base_mono, tk-offs_boot)); in 3.17. Everything else in the patch you tested is the same as the code that is in 3.17, so that's a start. Paolo Based on commit 02cba1598a2a3b689e79ad6dad2532521f638271 we have: offs_real - offs_boot = wall_to_monotonic + total_sleep_time The patch I posted this morning separates tk-xtime_sec out of boot_ns, so all that is missing should be a change in boot_ns from base_mono + offs_boot to offs_real - offs_boot. Chris, can you try this patch on top of the previous one: diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 92493e10937c..811eecc43fe8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1031,7 +1031,7 @@ static void update_pvclock_gtod(struct timekeeper *tk) struct pvclock_gtod_data *vdata = pvclock_gtod_data; u64 boot_ns; - boot_ns = ktime_to_ns(ktime_add(tk-tkr.base_mono, tk-offs_boot)); + boot_ns = ktime_to_ns(ktime_sub(tk-offs_real, tk-offs_boot)); write_seqcount_begin(vdata-seq); If it doesn't work, then commit 02cba1598a2a3b689e79ad6dad2532521f638271 is also broken. Paolo -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] KVM: x86: fix kvmclock breakage from timers branch merge
On 09/04/2014 02:42 PM, Paolo Bonzini wrote: Il 04/09/2014 21:15, Paolo Bonzini ha scritto: Il 04/09/2014 20:16, Chris J Arges ha scritto: + boot_ns = timespec_to_ns(tk-total_sleep_time) + + tk-wall_to_monotonic.tv_sec * (u64)NSEC_PER_SEC + + tk-wall_to_monotonic.tv_nsec + + tk-xtime_sec * (u64)NSEC_PER_SEC; So this means that the above 3.16-based code is not the same as boot_ns = ktime_to_ns(ktime_add(tk-tkr.base_mono, tk-offs_boot)); in 3.17. Everything else in the patch you tested is the same as the code that is in 3.17, so that's a start. Paolo Based on commit 02cba1598a2a3b689e79ad6dad2532521f638271 we have: offs_real - offs_boot = wall_to_monotonic + total_sleep_time The patch I posted this morning separates tk-xtime_sec out of boot_ns, so all that is missing should be a change in boot_ns from base_mono + offs_boot to offs_real - offs_boot. Chris, can you try this patch on top of the previous one: diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 92493e10937c..811eecc43fe8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1031,7 +1031,7 @@ static void update_pvclock_gtod(struct timekeeper *tk) struct pvclock_gtod_data *vdata = pvclock_gtod_data; u64 boot_ns; - boot_ns = ktime_to_ns(ktime_add(tk-tkr.base_mono, tk-offs_boot)); + boot_ns = ktime_to_ns(ktime_sub(tk-offs_real, tk-offs_boot)); write_seqcount_begin(vdata-seq); If it doesn't work, then commit 02cba1598a2a3b689e79ad6dad2532521f638271 is also broken. Paolo Paolo, That modification do your additional patch didn't work. However I was able to modify the code as follows to get this test case working. 
The only additional modification was: + vdata-nsec_base= tk-xtime_sec * (u64)NSEC_PER_SEC + - boot_ns; --chris j arges -- diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7b25aa2..60c0a9b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1023,8 +1023,8 @@ struct pvclock_gtod_data { u32 shift; } clock; - u64 boot_ns; u64 nsec_base; + u64 snsec_base; }; static struct pvclock_gtod_data pvclock_gtod_data; @@ -1034,7 +1034,7 @@ static void update_pvclock_gtod(struct timekeeper *tk) struct pvclock_gtod_data *vdata = pvclock_gtod_data; u64 boot_ns; - boot_ns = ktime_to_ns(ktime_add(tk-tkr.base_mono, tk-offs_boot)); + boot_ns = ktime_to_ns(ktime_sub(tk-offs_real, tk-offs_boot)); write_seqcount_begin(vdata-seq); @@ -1045,8 +1045,9 @@ static void update_pvclock_gtod(struct timekeeper *tk) vdata-clock.mult = tk-tkr.mult; vdata-clock.shift = tk-tkr.shift; - vdata-boot_ns = boot_ns; - vdata-nsec_base= tk-tkr.xtime_nsec; + vdata-nsec_base= tk-xtime_sec * (u64)NSEC_PER_SEC + - boot_ns; + vdata-snsec_base = tk-tkr.xtime_nsec; write_seqcount_end(vdata-seq); } @@ -1416,10 +1417,10 @@ static int do_monotonic_boot(s64 *t, cycle_t *cycle_now) do { seq = read_seqcount_begin(gtod-seq); mode = gtod-clock.vclock_mode; - ns = gtod-nsec_base; + ns = gtod-snsec_base; ns += vgettsc(cycle_now); ns = gtod-clock.shift; - ns += gtod-boot_ns; + ns += gtod-nsec_base; } while (unlikely(read_seqcount_retry(gtod-seq, seq))); *t = ns; -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 0/3] cleanup of redundant statements
Il 04/09/2014 21:13, Christian Borntraeger ha scritto: Paolo, I was playing with some static code checkers. Here is some fallout from the kvm common code. Only minor things that are not real error, just redundant statements. One could argue here and there that these statement make the code easier to understand. So, please have a look and either drop or apply the patches. I think all the patches are an improvement. Thanks! Paolo -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] KVM: x86: fix kvmclock breakage from timers branch merge
Il 04/09/2014 22:37, Chris J Arges ha scritto: - boot_ns = ktime_to_ns(ktime_add(tk-tkr.base_mono, tk-offs_boot)); + boot_ns = ktime_to_ns(ktime_sub(tk-offs_real, tk-offs_boot)); write_seqcount_begin(vdata-seq); If it doesn't work, then commit 02cba1598a2a3b689e79ad6dad2532521f638271 is also broken. Paolo Paolo, That modification to your additional patch didn't work. However I was able to modify the code as follows to get this test case working. The only additional modification was: + vdata-nsec_base= tk-xtime_sec * (u64)NSEC_PER_SEC + - boot_ns; Right, it should have been boot_ns = ktime_to_ns(ktime_sub(tk-offs_boot, tk-offs_real)); I'll post the patch shortly. Thanks! Paolo -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] KVM: x86: fix kvmclock breakage from timers branch merge
On 09/04/2014 03:40 PM, Paolo Bonzini wrote: Il 04/09/2014 22:37, Chris J Arges ha scritto: - boot_ns = ktime_to_ns(ktime_add(tk-tkr.base_mono, tk-offs_boot)); + boot_ns = ktime_to_ns(ktime_sub(tk-offs_real, tk-offs_boot)); write_seqcount_begin(vdata-seq); If it doesn't work, then commit 02cba1598a2a3b689e79ad6dad2532521f638271 is also broken. Paolo Paolo, That modification to your additional patch didn't work. However I was able to modify the code as follows to get this test case working. The only additional modification was: +vdata-nsec_base= tk-xtime_sec * (u64)NSEC_PER_SEC +- boot_ns; Right, it should have been boot_ns = ktime_to_ns(ktime_sub(tk-offs_boot, tk-offs_real)); I'll post the patch shortly. Thanks! Paolo Paolo, Great, tested that modification really quick and it also works for me! All test cases are now passing on my machine; thanks for all the debugging and help. --chris -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] KVM: x86: fix kvmclock breakage from timers branch merge
On Thu, 4 Sep 2014, Paolo Bonzini wrote: Commit cbcf2dd3b3d4 (x86: kvm: Make kvm_get_time_and_clockread() nanoseconds based, 2014-07-16) forgot to add tk-xtime_sec, thus breaking kvmclock on Errm. How is boottime related to xtime_sec? hosts that have a reliable TSC. Add it back; and since the field boot_ns is not anymore related to the host boot-based clock, rename boot_ns-nsec_base and the existing nsec_base-snsec_base. This is simply wrong. The original code before that changed did: vdata-monotonic_time_sec = tk-xtime_sec + tk-wall_to_monotonic.tv_sec; vdata-monotonic_time_snsec = tk-xtime_nsec + (tk-wall_to_monotonic.tv_nsec tk-shift); So this is the momentary monotonic base time And the readout function did: ts-tv_nsec = 0; do { seq = read_seqcount_begin(gtod-seq); mode = gtod-clock.vclock_mode; ts-tv_sec = gtod-monotonic_time_sec; ns = gtod-monotonic_time_snsec; ns += vgettsc(cycle_now); ns = gtod-clock.shift; } while (unlikely(read_seqcount_retry(gtod-seq, seq))); timespec_add_ns(ts, ns); So this does: now = monotonic_base + delta_nsec And the caller converted it to boot time with: monotonic_to_bootbased(ts); So the time calculation does: now = monotonic_base + delta_nsec + mono_to_boot Because: monotonic_base + mono_to_boot = boot_time_base The calculation can be written as: now = boot_time_base + delta_nsec The new code does boot_ns = ktime_to_ns(ktime_add(tk-base_mono, tk-offs_boot)); So thats boot_time_base = monotonic_base + mono_to_boot; vdata-boot_ns = boot_ns; vdata-nsec_base= tk-tkr.xtime_nsec; And the readout does: do { seq = read_seqcount_begin(gtod-seq); mode = gtod-clock.vclock_mode; ns = gtod-nsec_base; ns += vgettsc(cycle_now); ns = gtod-clock.shift; ns += gtod-boot_ns; } while (unlikely(read_seqcount_retry(gtod-seq, seq))); *t = ns; Which is: boot_time_base + delta_nsec Now I have no idea why you think it needs to add xtime_sec. 
If the result is wrong, then we need to figure out which one of the supplied values is wrong and not blindly add xtime_sec just because that makes it magically correct. Can you please provide a proper background why you think that adding xtime_sec is a good idea? Thanks, tglx -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] KVM: x86: fix kvmclock breakage from timers branch merge
Commit cbcf2dd3b3d4 (x86: kvm: Make kvm_get_time_and_clockread() nanoseconds based, 2014-07-16) used the wrong formula for boot_ns, thus breaking kvmclock on hosts that have a reliable TSC. To find the right formula, let's first backport the switch to nanoseconds to 3.16-era timekeeping logic. The full patch (which works) is at https://lkml.org/lkml/2014/9/4/462. The key line here is boot_ns = timespec_to_ns(tk-total_sleep_time) + timespec_to_ns(tk-wall_to_monotonic) + tk-xtime_sec * (u64)NSEC_PER_SEC; Because the above patch works, the conclusion is that the above formula is not the same as commit cbcf2dd3b3d4's boot_ns = ktime_to_ns(ktime_add(tk-tkr.base_mono, tk-offs_boot)); As to what is the right one, commit 02cba1598a2a (timekeeping: Simplify getboottime(), 2014-07-16) provides a hint: offs_real = -wall_to_monotonic offs_boot = total_sleep_time offs_real - offs_boot = -wall_to_monotonic - total_sleep_time that is offs_boot - offs_real = wall_to_monotonic + total_sleep_time which is what this patch uses, adding xtime_sec separately. The boot_ns moniker is not too clear, so rename boot_ns to nsec_base and the existing nsec_base to snsec_base. Cc: Thomas Gleixner t...@linutronix.de Cc: John Stultz john.stu...@linaro.org Reported-by: Chris J Arges chris.j.ar...@canonical.com Signed-off-by: Paolo Bonzini pbonz...@redhat.com --- Thomas/John, the problem with the above explanation is that tk_update_ktime_data has base_mono = xtime_sec + wtm, and from there base_mono + offs_boot = xtime_sec + wtm + total_sleep_time. Except that doesn't work, so something must be wrong in tk_update_ktime_data's comment.
arch/x86/kvm/x86.c | 13 +++-- 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8f1e22d3b286..c55203bea337 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1020,8 +1020,8 @@ struct pvclock_gtod_data { u32 shift; } clock; - u64 boot_ns; u64 nsec_base; + u64 snsec_base; }; static struct pvclock_gtod_data pvclock_gtod_data; @@ -1031,7 +1031,7 @@ static void update_pvclock_gtod(struct timekeeper *tk) struct pvclock_gtod_data *vdata = pvclock_gtod_data; u64 boot_ns; - boot_ns = ktime_to_ns(ktime_add(tk-tkr.base_mono, tk-offs_boot)); + boot_ns = ktime_to_ns(ktime_sub(tk-tkr.offs_boot, tk-offs_real)); write_seqcount_begin(vdata-seq); @@ -1042,8 +1042,9 @@ static void update_pvclock_gtod(struct timekeeper *tk) vdata-clock.mult = tk-tkr.mult; vdata-clock.shift = tk-tkr.shift; - vdata-boot_ns = boot_ns; - vdata-nsec_base= tk-tkr.xtime_nsec; + vdata-nsec_base= tk-xtime_sec * (u64)NSEC_PER_SEC + + boot_ns; + vdata-snsec_base = tk-tkr.xtime_nsec; write_seqcount_end(vdata-seq); } @@ -1413,10 +1414,10 @@ static int do_monotonic_boot(s64 *t, cycle_t *cycle_now) do { seq = read_seqcount_begin(gtod-seq); mode = gtod-clock.vclock_mode; - ns = gtod-nsec_base; + ns = gtod-snsec_base; ns += vgettsc(cycle_now); ns = gtod-clock.shift; - ns += gtod-boot_ns; + ns += gtod-nsec_base; } while (unlikely(read_seqcount_retry(gtod-seq, seq))); *t = ns; -- 1.8.3.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] KVM: x86: fix kvmclock breakage from timers branch merge
Il 04/09/2014 22:58, Thomas Gleixner ha scritto: This is simply wrong. It is. Now I have no idea why you think it needs to add xtime_sec. If the result is wrong, then we need to figure out which one of the supplied values is wrong and not blindly add xtime_sec just because that makes it magically correct. Can you please provide a proper background why you think that adding xtime_sec is a good idea? It's not a good idea indeed. I didn't fully digest the 3.16-3.17 timekeeping changes and messed up this patch. However, there is a bug in the base_mono + offs_boot formula, given that: - bisection leads to the merge commit of John's timers branch - bisecting within John's timers branch, with a KVM commit on top to make the code much easier to trigger, leads to commit cbcf2dd3b3d4 (x86: kvm: Make kvm_get_time_and_clockread() nanoseconds based, 2014-07-16). - I backported your patch to 3.16, using wall_to_monotonic + total_sleep_time + xtime_sec (wtm+xtime_sec as in pre-cbcf2dd3b3d4 code, total_sleep_time from 3.16 monotonic_to_bootbased) and it works - In v2 of the patch I fixed the bug by changing the formula base_mono + offs_boot to offs_boot - offs_real (and then adding xtime_sec separately as in the 3.16 backport), but the two formulas base_mono + offs_boot and offs_boot - offs_real + xtime_sec ought to be identical. I find offs_boot - offs_real + xtime more readable than the alternative base_mono + offs_boot + xtime_nsec, so the fix doubles as a cleanup for me and I'm fine with it. But something must be wrong in the timekeeping code. Paolo -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] KVM: x86: fix kvmclock breakage from timers branch merge
On Thu, 4 Sep 2014, Paolo Bonzini wrote: Commit cbcf2dd3b3d4 (x86: kvm: Make kvm_get_time_and_clockread() nanoseconds based, 2014-07-16) used the wrong formula for boot_ns, thus breaking kvmclock on hosts that have a reliable TSC. To find the right formula, let's first backport the switch to nanoseconds to 3.16-era timekeeping logic. The full patch (which works) is at https://lkml.org/lkml/2014/9/4/462. The key line here is boot_ns = timespec_to_ns(tk-total_sleep_time) + timespec_to_ns(tk-wall_to_monotonic) + tk-xtime_sec * (u64)NSEC_PER_SEC; Because the above patch works, the conclusion is that the above formula is not the same as commit cbcf2dd3b3d4's boot_ns = ktime_to_ns(ktime_add(tk-tkr.base_mono, tk-offs_boot)); As to what is the right one, commit 02cba1598a2a (timekeeping: Simplify getboottime(), 2014-07-16) provides a hint: offs_real = -wall-to_monotonic offs_boot = total_sleep_time offs_real - offs_boot = -wall_to_monotonic - total_sleep_time that is offs_boot - offs_real = wall_to_monotonic + total_sleep_time which is what this patch uses, adding xtime_sec separately. The boot_ns moniker is not too clear, so rename boot_ns to nsec_base and the existing nsec_base to snsec_base. Cc: Thomas Gleixner t...@linutronix.de Cc: John Stultz john.stu...@linaro.org Reported-by: Chris J Arges chris.j.ar...@canonical.com Signed-off-by: Paolo Bonzini pbonz...@redhat.com --- Thomas/John, the problem with the above explanation is that tk_update_ktime_data has base_mono = xtime_sec + wtm, and from there base_mono + offs_boot = xtime_sec + wtm + total_sleep_time. Except that doesn't work, so something must be wrong in tk_update_ktime_data's comment. Right. I'm staring into it and we need to fix the core code issue and not the usage site. Thanks, tglx -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] KVM: x86: fix kvmclock breakage from timers branch merge
On Thu, 4 Sep 2014, Paolo Bonzini wrote: Il 04/09/2014 22:58, Thomas Gleixner ha scritto: This is simply wrong. It is. Now I have no idea why you think it needs to add xtime_sec. If the result is wrong, then we need to figure out which one of the supplied values is wrong and not blindly add xtime_sec just because that makes it magically correct. Can you please provide a proper background why you think that adding xtime_sec is a good idea? It's not a good idea indeed. I didn't fully digest the 3.16-3.17 timekeeping changes and messed up this patch. However, there is a bug in the base_mono + offs_boot formula, given that: - bisection leads to the merge commit of John's timers branch - bisecting within John's timers branch, with a KVM commit on top to make the code much easier to trigger, leads to commit cbcf2dd3b3d4 (x86: kvm: Make kvm_get_time_and_clockread() nanoseconds based, 2014-07-16). - I backported your patch to 3.16, using wall_to_monotonic + total_sleep_time + xtime_sec (wtm+xtime_sec as in pre-cbcf2dd3b3d4 code, total_sleep_time from 3.16 monotonic_to_bootbased) and it works - In v2 of the patch I fixed the bug by changing the formula base_mono + offs_boot to offs_boot - offs_real (and then adding xtime_sec separately as in the 3.16 backport), but the two formulas base_mono + offs_boot and offs_boot - offs_real + xtime_sec ought to be identical. I find offs_boot - offs_real + xtime more readable than the alternative base_mono + offs_boot + xtime_nsec, so the fix doubles as a cleanup for me and I'm fine with it. But something must be wrong in the timekeeping code. I think I have a vague idea what happened, but I'm way too tired now to write it up fully. I'll do that tomorrow morning with brain awake. Thanks, tglx -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 1/4] PCI: Export MSI message relevant functions
On Mon, May 19, 2014 at 01:01:07PM +1000, Gavin Shan wrote: The patch exports 2 MSI message relevant functions, which will be used by VFIO PCI driver. The VFIO PCI driver would be built as a module. Signed-off-by: Gavin Shan gws...@linux.vnet.ibm.com Acked-by: Bjorn Helgaas bhelg...@google.com I think Alex will merge this along with the other ones. Sorry this took so long. I don't really like this, but I just can't figure out any solution that's better. --- drivers/pci/msi.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 955ab79..2350271 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -324,6 +324,7 @@ void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg) __get_cached_msi_msg(entry, msg); } +EXPORT_SYMBOL_GPL(get_cached_msi_msg); void __write_msi_msg(struct msi_desc *entry, struct msi_msg *msg) { @@ -368,6 +369,7 @@ void write_msi_msg(unsigned int irq, struct msi_msg *msg) __write_msi_msg(entry, msg); } +EXPORT_SYMBOL_GPL(write_msi_msg); static void free_msi_irqs(struct pci_dev *dev) { -- 1.8.3.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[Bug 83381] 4-ports 82576 detect 2 ports when add intel_iommu=on pci=assign-busses.
https://bugzilla.kernel.org/show_bug.cgi?id=83381 Wanpeng Li wanpeng...@linux.intel.com changed: What|Removed |Added CC||alex.william...@redhat.com, ||wanpeng...@linux.intel.com --- Comment #7 from Wanpeng Li wanpeng...@linux.intel.com --- Hi Chao, -- You are receiving this mail because: You are watching the assignee of the bug. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[Bug 83381] 4-ports 82576 detect 2 ports when add intel_iommu=on pci=assign-busses.
https://bugzilla.kernel.org/show_bug.cgi?id=83381 Wanpeng Li wanpeng...@linux.intel.com changed: What|Removed |Added CC||wanpeng...@linux.intel.com --- Comment #8 from Wanpeng Li wanpeng...@linux.intel.com --- Cc Alex Williamson alex.william...@redhat.com -- You are receiving this mail because: You are watching the assignee of the bug. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 1/4] PCI: Export MSI message relevant functions
On Thu, Sep 04, 2014 at 04:57:36PM -0600, Bjorn Helgaas wrote: On Mon, May 19, 2014 at 01:01:07PM +1000, Gavin Shan wrote: The patch exports 2 MSI message relevant functions, which will be used by VFIO PCI driver. The VFIO PCI driver would be built as a module. Signed-off-by: Gavin Shan gws...@linux.vnet.ibm.com Acked-by: Bjorn Helgaas bhelg...@google.com I think Alex will merge this along with the other ones. Sorry this took so long. I don't really like this, but I just can't figure out any solution that's better. Thanks, Bjorn. I thought you must have forgotten this. Let's get it in first and I'll do more investigation later to see if I can figure out something better. Thanks, Gavin --- drivers/pci/msi.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 955ab79..2350271 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -324,6 +324,7 @@ void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg) __get_cached_msi_msg(entry, msg); } +EXPORT_SYMBOL_GPL(get_cached_msi_msg); void __write_msi_msg(struct msi_desc *entry, struct msi_msg *msg) { @@ -368,6 +369,7 @@ void write_msi_msg(unsigned int irq, struct msi_msg *msg) __write_msi_msg(entry, msg); } +EXPORT_SYMBOL_GPL(write_msi_msg); static void free_msi_irqs(struct pci_dev *dev) { -- 1.8.3.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[Bug 83381] 4-ports 82576 detect 2 ports when add intel_iommu=on pci=assign-busses.
https://bugzilla.kernel.org/show_bug.cgi?id=83381 --- Comment #9 from Alex Williamson alex.william...@redhat.com --- Is this a regression? Has it ever worked? Why is this filed against kvm since it appears to have no relation to qemu or kvm? -- You are receiving this mail because: You are watching the assignee of the bug. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2 12/15] arm/arm64: KVM: add virtual GICv3 distributor emulation
On 2014/8/21 21:06, Andre Przywara wrote: +void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_vcpu *c_vcpu; + struct vgic_dist *dist = &kvm->arch.vgic; + u16 target_cpus; + u64 mpidr, mpidr_h, mpidr_l; + int sgi, mode, c, vcpu_id; + int updated = 0; + + vcpu_id = vcpu->vcpu_id; + + sgi = (reg >> 24) & 0xf; + mode = (reg >> 40) & 0x1; + target_cpus = reg & 0xffff; + mpidr = ((reg >> 48) & 0xff) << MPIDR_LEVEL_SHIFT(3); + mpidr |= ((reg >> 32) & 0xff) << MPIDR_LEVEL_SHIFT(2); + mpidr |= ((reg >> 16) & 0xff) << MPIDR_LEVEL_SHIFT(1); + mpidr &= ~MPIDR_LEVEL_MASK; + + /* + * We take the dist lock here, because we come from the sysregs + * code path and not from MMIO (where this is already done) + */ + spin_lock(&dist->lock); + kvm_for_each_vcpu(c, c_vcpu, kvm) { Hi Andre, I have a suggestion: move the + if (target_cpus == 0) + break; check out of the kvm_for_each_vcpu loop, like this: if (!mode && target_cpus == 0) /* this check does not need to be repeated inside the kvm_for_each_vcpu loop */ return; spin_lock(&dist->lock); kvm_for_each_vcpu(c, c_vcpu, kvm) { + if (mode && c == vcpu_id) /* not to myself */ + continue; + if (!mode) { + mpidr_h = kvm_vcpu_get_mpidr(c_vcpu); + mpidr_l = MPIDR_AFFINITY_LEVEL(mpidr_h, 0); + mpidr_h &= ~MPIDR_LEVEL_MASK; + if (mpidr != mpidr_h) + continue; + if (!(target_cpus & BIT(mpidr_l))) + continue; + target_cpus &= ~BIT(mpidr_l); + } + /* Flag the SGI as pending */ + vgic_dist_irq_set(c_vcpu, sgi); + updated = 1; + kvm_debug("SGI%d from CPU%d to CPU%d\n", sgi, vcpu_id, c); + } + if (updated) + vgic_update_state(vcpu->kvm); + spin_unlock(&dist->lock); + if (updated) + vgic_kick_vcpus(vcpu->kvm); +} + + -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[question] git clone kvm.git failed
Hi all, I encountered the errors below during git clone of kvm.git: # git clone git://git.kernel.org/pub/scm/virt/kvm/kvm.git kvm_0905 Cloning into 'kvm_0905'... remote: Counting objects: 3819711, done. remote: Compressing objects: 100% (575699/575699), done. remote: Total 3819711 (delta 3219203), reused 3812285 (delta 3211836) Receiving objects: 100% (3819711/3819711), 804.71 MiB | 122 KiB/s, done. Resolving deltas: 100% (3219203/3219203), done. error: unable to create file include/linux/types.h (File too large) error: unable to create file include/linux/u64_stats_sync.h (File too large) error: unable to create file include/linux/uaccess.h (File too large) error: unable to create file include/linux/ucb1400.h (File too large) error: unable to create file include/linux/ucs2_string.h (File too large) error: unable to create file include/linux/udp.h (File too large) error: unable to create file include/linux/uidgid.h (File too large) error: unable to create file include/linux/uinput.h (File too large) error: unable to create file include/linux/uio.h (File too large) error: unable to create file include/linux/uio_driver.h (File too large) fatal: cannot create directory at 'include/linux/unaligned': File too large How can I resolve these errors? Thanks, Zhang Haoyu -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Patch - support e500-specific: Performance monitor
Is there any specific reason not to copy the extra handler for IVOR35 on e500? --- a/arch/powerpc/kvm/e500.c +++ b/arch/powerpc/kvm/e500.c @@ -527,7 +527,7 @@ static struct kvmppc_ops kvm_ops_e500 = { static int __init kvmppc_e500_init(void) { int r, i; - unsigned long ivor[3]; + unsigned long ivor[4]; /* Process remaining handlers above the generic first 16 */ unsigned long *handler = &kvmppc_booke_handler_addr[16]; unsigned long handler_len; @@ -545,7 +545,8 @@ static int __init kvmppc_e500_init(void) ivor[0] = mfspr(SPRN_IVOR32); ivor[1] = mfspr(SPRN_IVOR33); ivor[2] = mfspr(SPRN_IVOR34); - for (i = 0; i < 3; i++) { +ivor[3] = mfspr(SPRN_IVOR35); + for (i = 0; i < 4; i++) { if (ivor[i] > ivor[max_ivor]) max_ivor = i; Please ignore this patch if you find it irrelevant or if it has already been submitted. Thanks, Amit Tomar. -- To unsubscribe from this list: send the line unsubscribe kvm-ppc in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: Patch - support e500-specific: Performance monitor
-Original Message- From: kvm-ppc-ow...@vger.kernel.org [mailto:kvm-ppc-ow...@vger.kernel.org] On Behalf Of Amit Tomar Sent: Thursday, September 04, 2014 8:34 PM To: ag...@suse.de; kvm-ppc@vger.kernel.org; Caraman Mihai Claudiu-B02008; pbonz...@redhat.com Subject: Patch - support e500-specific: Performance monitor Is There specific any reason not to copy extra handler IOVR 35 for e500? Because we do not support the Performance Monitor for guests. Why do you want to add this? Is there any specific requirement? Thanks -Bharat --- a/arch/powerpc/kvm/e500.c +++ b/arch/powerpc/kvm/e500.c @@ -527,7 +527,7 @@ static struct kvmppc_ops kvm_ops_e500 = { static int __init kvmppc_e500_init(void) { int r, i; - unsigned long ivor[3]; + unsigned long ivor[4]; /* Process remaining handlers above the generic first 16 */ unsigned long *handler = &kvmppc_booke_handler_addr[16]; unsigned long handler_len; @@ -545,7 +545,8 @@ static int __init kvmppc_e500_init(void) ivor[0] = mfspr(SPRN_IVOR32); ivor[1] = mfspr(SPRN_IVOR33); ivor[2] = mfspr(SPRN_IVOR34); - for (i = 0; i < 3; i++) { +ivor[3] = mfspr(SPRN_IVOR35); + for (i = 0; i < 4; i++) { if (ivor[i] > ivor[max_ivor]) max_ivor = i; Please ignore this patch if find it irrelevant or been submitted. Thanks, Amit Tomar. -- To unsubscribe from this list: send the line unsubscribe kvm-ppc in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe kvm-ppc in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html