Re: [PATCH 3/4] KVM: x86: inject nested page faults on emulated instructions

2014-09-04 Thread Gleb Natapov
On Tue, Sep 02, 2014 at 05:13:49PM +0200, Paolo Bonzini wrote:
 This is required for the following patch to work correctly.  If a nested page
 fault happens during emulation, we must inject a vmexit, not a page fault.
 Luckily we already have the required machinery: it is enough to return
 X86EMUL_INTERCEPTED instead of X86EMUL_PROPAGATE_FAULT.
 
I wonder why this patch is needed. X86EMUL_PROPAGATE_FAULT causes
ctxt->have_exception to be set to true in x86_emulate_insn().
x86_emulate_instruction() checks ctxt->have_exception and calls
inject_emulated_exception() if it is true. inject_emulated_exception()
calls kvm_propagate_fault() where we check if the fault was nested and
generate vmexit or a page fault accordingly.

 Reported-by: Valentine Sinitsyn valentine.sinit...@gmail.com
 Signed-off-by: Paolo Bonzini pbonz...@redhat.com
 ---
  arch/x86/kvm/x86.c | 18 ++
  1 file changed, 14 insertions(+), 4 deletions(-)
 
 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
 index e4ed85e07a01..9e3b74c044ed 100644
 --- a/arch/x86/kvm/x86.c
 +++ b/arch/x86/kvm/x86.c
 @@ -416,6 +416,16 @@ void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct 
 x86_exception *fault)
   vcpu-arch.mmu.inject_page_fault(vcpu, fault);
  }
  
 +static inline int kvm_propagate_or_intercept(struct kvm_vcpu *vcpu,
 +  struct x86_exception *exception)
 +{
 + if (likely(!exception-nested_page_fault))
 + return X86EMUL_PROPAGATE_FAULT;
 +
 + vcpu-arch.mmu.inject_page_fault(vcpu, exception);
 + return X86EMUL_INTERCEPTED;
 +}
 +
  void kvm_inject_nmi(struct kvm_vcpu *vcpu)
  {
   atomic_inc(vcpu-arch.nmi_queued);
 @@ -4122,7 +4132,7 @@ static int kvm_read_guest_virt_helper(gva_t addr, void 
 *val, unsigned int bytes,
   int ret;
  
   if (gpa == UNMAPPED_GVA)
 - return X86EMUL_PROPAGATE_FAULT;
 + return kvm_propagate_or_intercept(vcpu, exception);
   ret = kvm_read_guest_page(vcpu-kvm, gpa  PAGE_SHIFT, data,
 offset, toread);
   if (ret  0) {
 @@ -4152,7 +4162,7 @@ static int kvm_fetch_guest_virt(struct x86_emulate_ctxt 
 *ctxt,
   gpa_t gpa = vcpu-arch.walk_mmu-gva_to_gpa(vcpu, addr, 
 access|PFERR_FETCH_MASK,
   exception);
   if (unlikely(gpa == UNMAPPED_GVA))
 - return X86EMUL_PROPAGATE_FAULT;
 + return kvm_propagate_or_intercept(vcpu, exception);
  
   offset = addr  (PAGE_SIZE-1);
   if (WARN_ON(offset + bytes  PAGE_SIZE))
 @@ -4203,7 +4213,7 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt 
 *ctxt,
   int ret;
  
   if (gpa == UNMAPPED_GVA)
 - return X86EMUL_PROPAGATE_FAULT;
 + return kvm_propagate_or_intercept(vcpu, exception);
   ret = kvm_write_guest(vcpu-kvm, gpa, data, towrite);
   if (ret  0) {
   r = X86EMUL_IO_NEEDED;
 @@ -4350,7 +4360,7 @@ static int emulator_read_write_onepage(unsigned long 
 addr, void *val,
   ret = vcpu_mmio_gva_to_gpa(vcpu, addr, gpa, exception, write);
  
   if (ret  0)
 - return X86EMUL_PROPAGATE_FAULT;
 + return kvm_propagate_or_intercept(vcpu, exception);
  
   /* For APIC access vmexit */
   if (ret)
 -- 
 1.8.3.1
 
 

--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 03/15] arm/arm64: KVM: refactor vgic_handle_mmio() function

2014-09-04 Thread wanghaibin
On 2014/8/21 21:06, Andre Przywara wrote:

 Currently we only need to deal with one MMIO region for the GIC
 emulation, but we soon need to extend this. Refactor the existing
 code to allow easier addition of different ranges without code
 duplication.
 
 Signed-off-by: Andre Przywara andre.przyw...@arm.com
 ---
  virt/kvm/arm/vgic.c |   72 
 ---
  1 file changed, 51 insertions(+), 21 deletions(-)
 
 diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
 index bba8692..3b6f78d 100644
 --- a/virt/kvm/arm/vgic.c
 +++ b/virt/kvm/arm/vgic.c
 @@ -925,37 +925,28 @@ static bool vgic_validate_access(const struct vgic_dist 
 *dist,
   return true;
  }
  
 -/**
 - * vgic_handle_mmio - handle an in-kernel MMIO access
 +/*
 + * vgic_handle_mmio_range - handle an in-kernel MMIO access
   * @vcpu:pointer to the vcpu performing the access
   * @run: pointer to the kvm_run structure
   * @mmio:pointer to the data describing the access
 + * @ranges:  pointer to the register defining structure
 + * @mmio_base:   base address for this mapping
   *
 - * returns true if the MMIO access has been performed in kernel space,
 - * and false if it needs to be emulated in user space.
 + * returns true if the MMIO access could be performed
   */
 -bool vgic_handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *run,
 -   struct kvm_exit_mmio *mmio)
 +static bool vgic_handle_mmio_range(struct kvm_vcpu *vcpu, struct kvm_run 
 *run,
 + struct kvm_exit_mmio *mmio,
 + const struct mmio_range *ranges,
 + unsigned long mmio_base)
  {
   const struct mmio_range *range;
   struct vgic_dist *dist = vcpu-kvm-arch.vgic;
 - unsigned long base = dist-vgic_dist_base;
   bool updated_state;
   unsigned long offset;
  
 - if (!irqchip_in_kernel(vcpu-kvm) ||
 - mmio-phys_addr  base ||
 - (mmio-phys_addr + mmio-len)  (base + KVM_VGIC_V2_DIST_SIZE))
 - return false;
 -
 - /* We don't support ldrd / strd or ldm / stm to the emulated vgic */
 - if (mmio-len  4) {
 - kvm_inject_dabt(vcpu, mmio-phys_addr);
 - return true;
 - }
 -
 - offset = mmio-phys_addr - base;
 - range = find_matching_range(vgic_dist_ranges, mmio, offset);
 + offset = mmio-phys_addr - mmio_base;
 + range = find_matching_range(ranges, mmio, offset);
   if (unlikely(!range || !range-handle_mmio)) {
   pr_warn(Unhandled access %d %08llx %d\n,
   mmio-is_write, mmio-phys_addr, mmio-len);
 @@ -963,7 +954,7 @@ bool vgic_handle_mmio(struct kvm_vcpu *vcpu, struct 
 kvm_run *run,
   }
  
   spin_lock(vcpu-kvm-arch.vgic.lock);
 - offset = mmio-phys_addr - range-base - base;
 + offset -= range-base;
   if (vgic_validate_access(dist, range, offset)) {
   updated_state = range-handle_mmio(vcpu, mmio, offset);
   } else {
 @@ -981,6 +972,45 @@ bool vgic_handle_mmio(struct kvm_vcpu *vcpu, struct 
 kvm_run *run,
   return true;
  }
  


Andre, Is the IS_IN_RANGE definition wrong?

Compared with Marc's old code, a "<=" seems to be missing here.

#define IS_IN_RANGE(addr, alen, base, len) \
(((addr) >= (base)) && (((addr) + (alen)) <= ((base) + (len))))

 +#define IS_IN_RANGE(addr, alen, base, len) \
 + (((addr) = (base))  (((addr) + (alen))  ((base) + (len
 +
 +static bool vgic_v2_handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *run,
 + struct kvm_exit_mmio *mmio)
 +{
 + unsigned long base = vcpu-kvm-arch.vgic.vgic_dist_base;
 +
 + if (!IS_IN_RANGE(mmio-phys_addr, mmio-len, base,
 +  KVM_VGIC_V2_DIST_SIZE))
 + return false;
 +
 + /* GICv2 does not support accesses wider than 32 bits */
 + if (mmio-len  4) {
 + kvm_inject_dabt(vcpu, mmio-phys_addr);
 + return true;
 + }
 +
 + return vgic_handle_mmio_range(vcpu, run, mmio, vgic_dist_ranges, base);
 +}
 +
 +/**
 + * vgic_handle_mmio - handle an in-kernel MMIO access for the GIC emulation
 + * @vcpu:  pointer to the vcpu performing the access
 + * @run:   pointer to the kvm_run structure
 + * @mmio:  pointer to the data describing the access
 + *
 + * returns true if the MMIO access has been performed in kernel space,
 + * and false if it needs to be emulated in user space.
 + */
 +bool vgic_handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *run,
 +   struct kvm_exit_mmio *mmio)
 +{
 + if (!irqchip_in_kernel(vcpu-kvm))
 + return false;
 +
 + return vgic_v2_handle_mmio(vcpu, run, mmio);
 +}
 +
  static u8 *vgic_get_sgi_sources(struct vgic_dist *dist, int vcpu_id, int sgi)
  {
   return dist-irq_sgi_sources + vcpu_id * VGIC_NR_SGIS + sgi;



--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo 

Re: [Qemu-devel] [question] virtio-blk performance degradation happened with virtio-serial

2014-09-04 Thread Zhang Haoyu
   If virtio-blk and virtio-serial share an IRQ, the guest operating system 
   has to check each virtqueue for activity. Maybe there is some 
   inefficiency doing that.
   AFAIK virtio-serial registers 64 virtqueues (on 31 ports + console) even 
   if everything is unused.
  
  That could be the case if MSI is disabled.
 
 Do the windows virtio drivers enable MSIs, in their inf file?

It depends on the version of the drivers, but it is a reasonable guess
at what differs between Linux and Windows.  Haoyu, can you give us the
output of lspci from a Linux guest?

I ran a test with fio on a rhel-6.5 guest and the same degradation happened;
it can be reproduced on the rhel6.5 guest 100% of the time.
virtio_console module installed:
64K-write-sequence: 285 MBPS, 4380 IOPS
virtio_console module uninstalled:
64K-write-sequence: 370 MBPS, 5670 IOPS

And virtio-blk's interrupt mode is always MSI, no matter whether the virtio_console
module is installed or uninstalled.
25:2245933   PCI-MSI-edge  virtio1-requests

fio command:
fio -filename /dev/vda -direct=1 -iodepth=1 -thread -rw=write -ioengine=psync 
-bs=64k -size=30G -numjobs=1 -name=mytest

QEMU command:
/usr/bin/kvm -id 5497356709352 -chardev 
socket,id=qmp,path=/var/run/qemu-server/5497356709352.qmp,server,nowait -mon 
chardev=qmp,mode=control -vnc :0,websocket,to=200 -enable-kvm -pidfile 
/var/run/qemu-server/5497356709352.pid -daemonize -name io-test-rhel-6.5 -smp 
sockets=1,cores=1 -cpu core2duo -nodefaults -vga cirrus -no-hpet -k en-us -boot 
menu=on,splash-time=8000 -m 4096 -usb -drive 
file=/sf/data/local/zhanghaoyu/rhel-server-6.5-x86_64-dvd.iso,if=none,id=drive-ide0,media=cdrom,aio=native,forecast=disable
 -device ide-cd,bus=ide.0,unit=0,drive=drive-ide0,id=ide0,bootindex=200 -drive 
file=/sf/data/local/images/host-1051721dff13/io-test-rhel-6.5.vm/vm-disk-1.qcow2,if=none,id=drive-virtio1,cache=none,aio=native
 -device virtio-blk-pci,drive=drive-virtio1,id=virtio1,bus=pci.0,addr=0xb 
-drive 
file=/sf/data/local/images/host-1051721dff13/io-test-rhel-6.5.vm/vm-disk-2.qcow2,if=none,id=drive-virtio2,cache=none,aio=native
 -device virtio-blk-pci,drive=drive-virtio2,id=virtio2,bus=pci
 .0,addr=0xc,bootindex=101 -netdev 
type=tap,id=net0,ifname=164922379979200,script=/sf/etc/kvm/vtp-bridge,vhost=on,vhostforce=on
 -device 
virtio-net-pci,mac=FE:FC:FE:C6:47:F6,netdev=net0,bus=pci.0,addr=0x12,id=net0,bootindex=300
 -rtc driftfix=slew,clock=rt -global kvm-pit.lost_tick_policy=discard -global 
PIIX4_PM.disable_s3=1 -global PIIX4_PM.disable_s4=1 -chardev 
socket,path=/run/virtser/1649223799792.sock,server,nowait,id=channelser -device 
virtio-serial,vectors=4 -device 
virtserialport,chardev=channelser,name=channelser.virtserial0.0

[environment]
Host:linux-3.10(RHEL7-rc1)
QEMU: qemu-2.0.1
Guest: RHEL6.5

# lspci -tv
-[:00]-+-00.0  Intel Corporation 440FX - 82441FX PMC [Natoma]
   +-01.0  Intel Corporation 82371SB PIIX3 ISA [Natoma/Triton II]
   +-01.1  Intel Corporation 82371SB PIIX3 IDE [Natoma/Triton II]
   +-01.2  Intel Corporation 82371SB PIIX3 USB [Natoma/Triton II]
   +-01.3  Intel Corporation 82371AB/EB/MB PIIX4 ACPI
   +-02.0  Cirrus Logic GD 5446
   +-03.0  Red Hat, Inc Virtio console
   +-0b.0  Red Hat, Inc Virtio block device
   +-0c.0  Red Hat, Inc Virtio block device
   \-12.0  Red Hat, Inc Virtio network device

# lspci -vvv
00:00.0 Host bridge: Intel Corporation 440FX - 82441FX PMC [Natoma] (rev 02)
Subsystem: Red Hat, Inc Qemu virtual machine
Control: I/O+ Mem+ BusMaster- SpecCycle- MemWINV- VGASnoop- ParErr- 
Stepping- SERR+ FastB2B- DisINTx-
Status: Cap- 66MHz- UDF- FastB2B- ParErr- DEVSEL=fast TAbort- TAbort- 
MAbort- SERR- PERR- INTx-

00:01.0 ISA bridge: Intel Corporation 82371SB PIIX3 ISA [Natoma/Triton II]
Subsystem: Red Hat, Inc Qemu virtual machine
Control: I/O+ Mem+ BusMaster- SpecCycle- MemWINV- VGASnoop- ParErr- 
Stepping- SERR+ FastB2B- DisINTx-
Status: Cap- 66MHz- UDF- FastB2B- ParErr- DEVSEL=medium TAbort- 
TAbort- MAbort- SERR- PERR- INTx-

00:01.1 IDE interface: Intel Corporation 82371SB PIIX3 IDE [Natoma/Triton II] 
(prog-if 80 [Master])
Subsystem: Red Hat, Inc Qemu virtual machine
Control: I/O+ Mem+ BusMaster+ SpecCycle- MemWINV- VGASnoop- ParErr- 
Stepping- SERR+ FastB2B- DisINTx-
Status: Cap- 66MHz- UDF- FastB2B+ ParErr- DEVSEL=medium TAbort- 
TAbort- MAbort- SERR- PERR- INTx-
Latency: 0
Region 0: [virtual] Memory at 01f0 (32-bit, non-prefetchable) 
[size=8]
Region 1: [virtual] Memory at 03f0 (type 3, non-prefetchable)
Region 2: [virtual] Memory at 0170 (32-bit, non-prefetchable) 
[size=8]
Region 3: [virtual] Memory at 0370 (type 3, non-prefetchable)
Region 4: I/O ports at c0e0 [size=16]
Kernel driver in use: ata_piix
Kernel modules: ata_generic, pata_acpi, ata_piix

00:01.2 USB controller: Intel 

Re: kvm-unit-test failures

2014-09-04 Thread Paolo Bonzini
Il 03/09/2014 20:25, Chris J Arges ha scritto:
 snip
 I'm not sure about the reason for the warp, but indeed the offset and
 uptime match (I'll check them against the trace tomorrow) so it's just
 that the VM's TSC base is not taken into account correctly.

 Can you gather another trace with the problematic patch reverted?

 Paolo

 
 Here is the third trace running with 0d3da0d2 reverted from the latest
 kvm queue branch 11cc9ea3:
 
 http://people.canonical.com/~arges/kvm/trace-3.dat.xz

Thanks!  And---yay!---I reproduced it on another machine.

Paolo

 $ uptime
  18:25:13 up 5 min,  1 user,  load average: 0.21, 0.74, 0.44
 
 qemu-system-x86_64 -enable-kvm -device pc-testdev -device
 isa-debug-exit,iobase=0xf4,iosize=0x4 -display none -serial stdio
 -device pci-testdev -kernel x86/kvmclock_test.flat -smp 2 --append
 1000 1409768537
 enabling apic
 enabling apic
 kvm-clock: cpu 0, msr 0x:44e520
 kvm-clock: cpu 0, msr 0x:44e520
 Wallclock test, threshold 5
 Seconds get from host: 1409768537
 Seconds get from kvmclock: 1409768538
 Offset:1
 Wallclock test, threshold 5
 Seconds get from host: 1409768537
 Seconds get from kvmclock: 1409768538
 Offset:1
 Check the stability of raw cycle ...
 Total vcpus: 2
 Test  loops: 1000
 Total warps:  0
 Total stalls: 0
 Worst warp:   0
 Raw cycle is stable
 Monotonic cycle test:
 Total vcpus: 2
 Test  loops: 1000
 Total warps:  0
 Total stalls: 0
 Worst warp:   0
 Measure the performance of raw cycle ...
 Total vcpus: 2
 Test  loops: 1000
 TSC cycles:  1241970306
 Measure the performance of adjusted cycle ...
 Total vcpus: 2
 Test  loops: 1000
 TSC cycles:  3266701026
 Return value from qemu: 1
 --
 To unsubscribe from this list: send the line unsubscribe kvm in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html
 

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC][patch 3/6] KVM: s390: Add GISA support

2014-09-04 Thread frank . blaschka
From: Frank Blaschka frank.blasc...@de.ibm.com

This patch adds GISA (Guest Interrupt State Area) support
to s390 kvm. GISA can be used for exitless interrupts. The
patch provides a set of functions for GISA related operations
like accessing GISA fields or registering ISCs for alert.
Exploiters of GISA will follow with additional patches.

Signed-off-by: Frank Blaschka frank.blasc...@de.ibm.com
---
 arch/s390/include/asm/kvm_host.h |   72 
 arch/s390/kvm/kvm-s390.c |  167 +++
 arch/s390/kvm/kvm-s390.h |   28 ++
 3 files changed, 265 insertions(+), 2 deletions(-)

--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -129,11 +129,12 @@ struct kvm_s390_sie_block {
__u8reserved60; /* 0x0060 */
__u8ecb;/* 0x0061 */
__u8ecb2;   /* 0x0062 */
-   __u8reserved63[1];  /* 0x0063 */
+   __u8ecb3;   /* 0x0063 */
__u32   scaol;  /* 0x0064 */
__u8reserved68[4];  /* 0x0068 */
__u32   todpr;  /* 0x006c */
-   __u8reserved70[32]; /* 0x0070 */
+   __u32   gd; /* 0x0070 */
+   __u8reserved74[28]; /* 0x0074 */
psw_t   gpsw;   /* 0x0090 */
__u64   gg14;   /* 0x00a0 */
__u64   gg15;   /* 0x00a8 */
@@ -300,6 +301,70 @@ struct kvm_s390_interrupt_info {
 #define ACTION_STORE_ON_STOP   (10)
 #define ACTION_STOP_ON_STOP(11)
 
+#define KVM_S390_GISA_FORMAT_0 0
+#define KVM_S390_GISA_FORMAT_1 1
+
+struct kvm_s390_gisa_f0 {
+   u32 next_alert;
+   u8 ipm;
+   u16 rsv0:14;
+   u16 g:1;
+   u16 c:1;
+   u8 iam;
+   u32 rsv1;
+   u32 count;
+} __packed;
+
+struct kvm_s390_gisa_f1 {
+   u32 next_alert;
+   u8 ipm;
+   u8 simm;
+   u8 nimm;
+   u8 iam;
+   u64 aisma;
+   u32 rsv0:6;
+   u32 g:1;
+   u32 c:1;
+   u32 rsv1:24;
+   u64 rsv2;
+   u32 count;
+} __packed;
+
+union kvm_s390_gisa {
+   struct kvm_s390_gisa_f0 f0;
+   struct kvm_s390_gisa_f1 f1;
+};
+
+struct kvm_s390_gait {
+   u32 gd;
+   u16  : 5;
+   u16 gisc : 3;
+   u16 rpu  : 8;
+   u16: 10;
+   u16 gaisbo :  6;
+   u64 gaisba;
+} __packed;
+
+struct kvm_s390_aifte {
+   u64 faisba;
+   u64 gaita;
+   u16 simm : 8;
+   u16  : 5;
+   u16 afi  : 3;
+   u16 reserved1;
+   u16 reserved2;
+   u16 faal;
+} __packed;
+
+struct kvm_s390_gib {
+   u32 alo;
+   u32 reserved1;
+   u32  : 5;
+   u32 nisc : 3;
+   u32  : 24;
+   u8 reserverd2[20];
+} __packed;
+
 struct kvm_s390_local_interrupt {
spinlock_t lock;
struct list_head list;
@@ -420,6 +485,9 @@ struct kvm_arch{
struct s390_io_adapter *adapters[MAX_S390_IO_ADAPTERS];
wait_queue_head_t ipte_wq;
spinlock_t start_stop_lock;
+   union kvm_s390_gisa *gisa;
+   unsigned long iam;
+   atomic_t in_sie;
 };
 
 #define KVM_HVA_ERR_BAD(-1UL)
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -404,6 +404,16 @@ long kvm_arch_vm_ioctl(struct file *filp
return r;
 }
 
+static u8 kvm_s390_gisa_get_alert_mask(struct kvm *kvm)
+{
+   return (u8)ACCESS_ONCE(kvm-arch.iam);
+}
+
+static void kvm_s390_gisa_set_alert_mask(struct kvm *kvm, u8 iam)
+{
+   xchg(kvm-arch.iam, iam);
+}
+
 int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 {
int rc;
@@ -461,6 +471,14 @@ int kvm_arch_init_vm(struct kvm *kvm, un
kvm-arch.css_support = 0;
kvm-arch.use_irqchip = 0;
 
+   kvm-arch.gisa = (union kvm_s390_gisa *)get_zeroed_page(
+   GFP_KERNEL | GFP_DMA);
+   if (!kvm-arch.gisa)
+   goto out_nogmap;
+   kvm_s390_gisa_set_next_alert(kvm, (u32)(unsigned long)kvm-arch.gisa);
+   kvm_s390_gisa_set_alert_mask(kvm, 0);
+   atomic_set(kvm-arch.in_sie, 0);
+
spin_lock_init(kvm-arch.start_stop_lock);
 
return 0;
@@ -520,6 +538,7 @@ void kvm_arch_sync_events(struct kvm *kv
 
 void kvm_arch_destroy_vm(struct kvm *kvm)
 {
+   free_page((unsigned long)kvm-arch.gisa);
kvm_free_vcpus(kvm);
free_page((unsigned long)(kvm-arch.sca));
debug_unregister(kvm-arch.dbf);
@@ -656,6 +675,19 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu
return rc;
 }
 
+u32 kvm_s390_gisa_get_fmt(void)
+{
+   if (test_facility(70) || test_facility(72))
+   return KVM_S390_GISA_FORMAT_1;
+   else
+   return KVM_S390_GISA_FORMAT_0;
+}
+
+static u32 kvm_s390_build_gd(struct kvm *kvm)
+{
+   return (u32)(unsigned long)kvm-arch.gisa | kvm_s390_gisa_get_fmt();
+}
+
 struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
 

[RFC][patch 5/6] s390: Add PCI bus support

2014-09-04 Thread frank . blaschka
From: Frank Blaschka frank.blasc...@de.ibm.com

This patch implements a pci bus for s390x together with some infrastructure
to generate and handle hotplug events. It also provides device 
configuration/unconfiguration via sclp instruction interception.

Signed-off-by: Frank Blaschka frank.blasc...@de.ibm.com
---
 default-configs/s390x-softmmu.mak |1 
 hw/s390x/Makefile.objs|1 
 hw/s390x/css.c|5 
 hw/s390x/css.h|1 
 hw/s390x/s390-pci-bus.c   |  287 ++
 hw/s390x/s390-pci-bus.h   |  139 ++
 hw/s390x/s390-virtio-ccw.c|2 
 hw/s390x/sclp.c   |   10 +
 include/hw/s390x/sclp.h   |8 +
 target-s390x/Makefile.objs|2 
 target-s390x/ioinst.c |   52 ++
 target-s390x/ioinst.h |1 
 target-s390x/kvm.c|5 
 target-s390x/pci_ic.c |  230 ++
 target-s390x/pci_ic.h |  214 
 15 files changed, 956 insertions(+), 2 deletions(-)

--- a/default-configs/s390x-softmmu.mak
+++ b/default-configs/s390x-softmmu.mak
@@ -1,4 +1,5 @@
 CONFIG_VIRTIO=y
+CONFIG_PCI=y
 CONFIG_SCLPCONSOLE=y
 CONFIG_S390_FLIC=y
 CONFIG_S390_FLIC_KVM=$(CONFIG_KVM)
--- a/hw/s390x/Makefile.objs
+++ b/hw/s390x/Makefile.objs
@@ -8,3 +8,4 @@ obj-y += ipl.o
 obj-y += css.o
 obj-y += s390-virtio-ccw.o
 obj-y += virtio-ccw.o
+obj-$(CONFIG_KVM) += s390-pci-bus.o
--- a/hw/s390x/css.c
+++ b/hw/s390x/css.c
@@ -1281,6 +1281,11 @@ void css_generate_chp_crws(uint8_t cssid
 /* TODO */
 }
 
+void css_generate_css_crws(uint8_t cssid)
+{
+css_queue_crw(CRW_RSC_CSS, 0, 0, 0);
+}
+
 int css_enable_mcsse(void)
 {
 trace_css_enable_facility(mcsse);
--- a/hw/s390x/css.h
+++ b/hw/s390x/css.h
@@ -99,6 +99,7 @@ void css_queue_crw(uint8_t rsc, uint8_t
 void css_generate_sch_crws(uint8_t cssid, uint8_t ssid, uint16_t schid,
int hotplugged, int add);
 void css_generate_chp_crws(uint8_t cssid, uint8_t chpid);
+void css_generate_css_crws(uint8_t cssid);
 void css_adapter_interrupt(uint8_t isc);
 
 #define CSS_IO_ADAPTER_VIRTIO 1
--- /dev/null
+++ b/hw/s390x/s390-pci-bus.c
@@ -0,0 +1,287 @@
+/*
+ * s390 PCI BUS
+ *
+ * Copyright 2014 IBM Corp.
+ * Author(s): Frank Blaschka frank.blasc...@de.ibm.com
+ *Hong Bo Li lih...@cn.ibm.com
+ *Yi Min Zhao zyi...@cn.ibm.com
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or (at
+ * your option) any later version. See the COPYING file in the top-level
+ * directory.
+ */
+
+#include hw/pci/pci.h
+#include hw/s390x/css.h
+#include hw/s390x/sclp.h
+#include qemu/error-report.h
+#include s390-pci-bus.h
+
+/* #define DEBUG_S390PCI_BUS */
+#ifdef DEBUG_S390PCI_BUS
+#define DPRINTF(fmt, ...) \
+do { fprintf(stderr, S390pci-bus:  fmt, ## __VA_ARGS__); } while (0)
+#else
+#define DPRINTF(fmt, ...) \
+do { } while (0)
+#endif
+
+static QTAILQ_HEAD(, SeiContainer) pending_sei =
+QTAILQ_HEAD_INITIALIZER(pending_sei);
+static QTAILQ_HEAD(, S390PCIBusDevice) device_list =
+QTAILQ_HEAD_INITIALIZER(device_list);
+
+int chsc_sei_nt2_get_event(void *res)
+{
+ChscSeiNt2Res *nt2_res = (ChscSeiNt2Res *)res;
+PciCcdfAvail *accdf;
+PciCcdfErr *eccdf;
+int rc = 1;
+SeiContainer *sei_cont;
+
+sei_cont = QTAILQ_FIRST(pending_sei);
+if (sei_cont) {
+QTAILQ_REMOVE(pending_sei, sei_cont, link);
+nt2_res-nt = 2;
+nt2_res-cc = sei_cont-cc;
+switch (sei_cont-cc) {
+case 1: /* error event */
+eccdf = (PciCcdfErr *)nt2_res-ccdf;
+eccdf-fid = cpu_to_be32(sei_cont-fid);
+eccdf-fh = cpu_to_be32(sei_cont-fh);
+break;
+case 2: /* availability event */
+accdf = (PciCcdfAvail *)nt2_res-ccdf;
+accdf-fid = cpu_to_be32(sei_cont-fid);
+accdf-fh = cpu_to_be32(sei_cont-fh);
+accdf-pec = cpu_to_be16(sei_cont-pec);
+break;
+default:
+abort();
+}
+g_free(sei_cont);
+rc = 0;
+}
+
+return rc;
+}
+
+int chsc_sei_nt2_have_event(void)
+{
+return !QTAILQ_EMPTY(pending_sei);
+}
+
+static S390PCIBusDevice *s390_pci_find_dev_by_fid(uint32_t fid)
+{
+S390PCIBusDevice *pbdev;
+
+QTAILQ_FOREACH(pbdev, device_list, next) {
+if (pbdev-fid == fid) {
+return pbdev;
+}
+}
+return NULL;
+}
+
+void s390_pci_sclp_configure(int configure, SCCB *sccb)
+{
+PciCfgSccb *psccb = (PciCfgSccb *)sccb;
+S390PCIBusDevice *pbdev = 
s390_pci_find_dev_by_fid(be32_to_cpu(psccb-aid));
+uint16_t rc;
+
+if (pbdev) {
+if ((configure == 1  pbdev-configured == true) ||
+(configure == 0  pbdev-configured == false)) {
+rc = SCLP_RC_NO_ACTION_REQUIRED;
+} else {
+pbdev-configured = 

[RFC][patch 4/6] KVM: s390: Add PCI pass-through support

2014-09-04 Thread frank . blaschka
From: Frank Blaschka frank.blasc...@de.ibm.com

This patch implements PCI pass-through kernel support for s390.
Design approach is very similar to the x86 device assignment.
User space executes the KVM_ASSIGN_PCI_DEVICE ioctl to create
a proxy instance in the kernel KVM and connect this instance to the
host pci device. s390 pci instructions are intercepted in kernel and
operations are passed directly to the assigned pci device.
To take advantage of all system z specific virtualization features
we need to access the SIE control block residing in KVM. Also we have to
enable z pci devices with special configuration information coming
from the SIE block as well.

Signed-off-by: Frank Blaschka frank.blasc...@de.ibm.com
---
 arch/s390/include/asm/kvm_host.h |1 
 arch/s390/kvm/Makefile   |2 
 arch/s390/kvm/intercept.c|1 
 arch/s390/kvm/kvm-s390.c |   33 
 arch/s390/kvm/kvm-s390.h |   17 
 arch/s390/kvm/pci.c  | 2130 +++
 arch/s390/kvm/priv.c |   21 
 7 files changed, 2202 insertions(+), 3 deletions(-)

--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -488,6 +488,7 @@ struct kvm_arch{
union kvm_s390_gisa *gisa;
unsigned long iam;
atomic_t in_sie;
+   struct list_head ppt_dev_list;
 };
 
 #define KVM_HVA_ERR_BAD(-1UL)
--- a/arch/s390/kvm/Makefile
+++ b/arch/s390/kvm/Makefile
@@ -12,6 +12,6 @@ common-objs = $(KVM)/kvm_main.o $(KVM)/e
 ccflags-y := -Ivirt/kvm -Iarch/s390/kvm
 
 kvm-objs := $(common-objs) kvm-s390.o intercept.o interrupt.o priv.o sigp.o
-kvm-objs += diag.o gaccess.o guestdbg.o
+kvm-objs += diag.o gaccess.o guestdbg.o pci.o
 
 obj-$(CONFIG_KVM) += kvm.o
--- a/arch/s390/kvm/intercept.c
+++ b/arch/s390/kvm/intercept.c
@@ -34,6 +34,7 @@ static const intercept_handler_t instruc
[0xb6] = kvm_s390_handle_stctl,
[0xb7] = kvm_s390_handle_lctl,
[0xb9] = kvm_s390_handle_b9,
+   [0xe3] = kvm_s390_handle_e3,
[0xe5] = kvm_s390_handle_e5,
[0xeb] = kvm_s390_handle_eb,
 };
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -397,6 +397,24 @@ long kvm_arch_vm_ioctl(struct file *filp
r = kvm_s390_vm_has_attr(kvm, attr);
break;
}
+   case KVM_ASSIGN_PCI_DEVICE: {
+   struct kvm_assigned_pci_dev assigned_dev;
+
+   r = -EFAULT;
+   if (copy_from_user(assigned_dev, argp, sizeof(assigned_dev)))
+   break;
+   r = kvm_s390_ioctrl_assign_pci(kvm, assigned_dev);
+   break;
+   }
+   case KVM_DEASSIGN_PCI_DEVICE: {
+   struct kvm_assigned_pci_dev assigned_dev;
+
+   r = -EFAULT;
+   if (copy_from_user(assigned_dev, argp, sizeof(assigned_dev)))
+   break;
+   r = kvm_s390_ioctrl_deassign_pci(kvm, assigned_dev);
+   break;
+   }
default:
r = -ENOTTY;
}
@@ -478,6 +496,7 @@ int kvm_arch_init_vm(struct kvm *kvm, un
kvm_s390_gisa_set_next_alert(kvm, (u32)(unsigned long)kvm-arch.gisa);
kvm_s390_gisa_set_alert_mask(kvm, 0);
atomic_set(kvm-arch.in_sie, 0);
+   INIT_LIST_HEAD(kvm-arch.ppt_dev_list);
 
spin_lock_init(kvm-arch.start_stop_lock);
 
@@ -538,6 +557,7 @@ void kvm_arch_sync_events(struct kvm *kv
 
 void kvm_arch_destroy_vm(struct kvm *kvm)
 {
+   s390_pci_cleanup(kvm);
free_page((unsigned long)kvm-arch.gisa);
kvm_free_vcpus(kvm);
free_page((unsigned long)(kvm-arch.sca));
@@ -656,7 +676,10 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu
vcpu-arch.sie_block-ecb |= 0x10;
 
vcpu-arch.sie_block-ecb2  = 8;
-   vcpu-arch.sie_block-eca   = 0xD1002000U;
+   vcpu-arch.sie_block-eca   = 0xD1202000U;
+   vcpu-arch.sie_block-ecb2 |= 0x02;
+   vcpu-arch.sie_block-ecb3 = 0x20;
+
if (sclp_has_siif())
vcpu-arch.sie_block-eca |= 1;
vcpu-arch.sie_block-fac   = (int) (long) vfacilities;
@@ -1920,6 +1943,12 @@ static int __init kvm_s390_init(void)
if (ret)
return ret;
 
+   ret = s390_pci_init();
+   if (ret) {
+   kvm_exit();
+   return ret;
+   }
+
/*
 * guests can ask for up to 255+1 double words, we need a full page
 * to hold the maximum amount of facilities. On the other hand, we
@@ -1932,7 +1961,7 @@ static int __init kvm_s390_init(void)
}
memcpy(vfacilities, S390_lowcore.stfle_fac_list, 16);
vfacilities[0] = 0xff82fff3f4fc2000UL;
-   vfacilities[1] = 0x005cUL;
+   vfacilities[1] = 0x07dcUL;
return 0;
 }
 
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -167,6 +167,7 @@ int kvm_s390_mask_adapter(struct kvm *kv
 /* implemented in priv.c */
 int is_valid_psw(psw_t *psw);
 int 

[RFC][patch 2/6] s390: pci: export pci functions for pass-through usage

2014-09-04 Thread frank . blaschka
From: Frank Blaschka frank.blasc...@de.ibm.com

This patch exports a couple of zPCI functions. The new pci
pass-through driver for KVM will use these functions to enable the
device with virtualization information and update the device dma
translation table on the host. We add a new interface to purge
the translation table of a device. Also we moved some zPCI functions
to the pci_insn header file.

Signed-off-by: Frank Blaschka frank.blasc...@de.ibm.com
---
 arch/s390/include/asm/pci.h  |6 ++
 arch/s390/include/asm/pci_clp.h  |3 -
 arch/s390/include/asm/pci_insn.h |   92 
 arch/s390/pci/pci_clp.c  |4 +
 arch/s390/pci/pci_dma.c  |   24 -
 arch/s390/pci/pci_insn.c |   97 ---
 6 files changed, 126 insertions(+), 100 deletions(-)

--- a/arch/s390/include/asm/pci.h
+++ b/arch/s390/include/asm/pci.h
@@ -140,6 +140,7 @@ int zpci_register_ioat(struct zpci_dev *
 int zpci_unregister_ioat(struct zpci_dev *, u8);
 
 /* CLP */
+u8 clp_instr(void *data);
 int clp_scan_pci_devices(void);
 int clp_rescan_pci_devices(void);
 int clp_rescan_pci_devices_simple(void);
@@ -177,6 +178,11 @@ struct zpci_dev *get_zdev_by_fid(u32);
 /* DMA */
 int zpci_dma_init(void);
 void zpci_dma_exit(void);
+int dma_update_trans(struct zpci_dev *zdev, unsigned long pa,
+dma_addr_t dma_addr, size_t size, int flags);
+void dma_update_cpu_trans(struct zpci_dev *zdev, void *page_addr,
+ dma_addr_t dma_addr, int flags);
+void dma_purge_rto_entries(struct zpci_dev *zdev);
 
 /* FMB */
 int zpci_fmb_enable_device(struct zpci_dev *);
--- a/arch/s390/include/asm/pci_clp.h
+++ b/arch/s390/include/asm/pci_clp.h
@@ -148,7 +148,8 @@ struct clp_req_set_pci {
u16 reserved2;
u8 oc;  /* operation controls */
u8 ndas;/* number of dma spaces */
-   u64 reserved3;
+   u32 reserved3;
+   u32 gd; /* GISA Designation */
 } __packed;
 
 /* Set PCI function response */
--- a/arch/s390/include/asm/pci_insn.h
+++ b/arch/s390/include/asm/pci_insn.h
@@ -1,6 +1,8 @@
 #ifndef _ASM_S390_PCI_INSN_H
 #define _ASM_S390_PCI_INSN_H
 
+#include asm/processor.h
+
 /* Load/Store status codes */
 #define ZPCI_PCI_ST_FUNC_NOT_ENABLED   4
 #define ZPCI_PCI_ST_FUNC_IN_ERR8
@@ -83,4 +85,94 @@ int zpci_store(u64 data, u64 req, u64 of
 int zpci_store_block(const u64 *data, u64 req, u64 offset);
 void zpci_set_irq_ctrl(u16 ctl, char *unused, u8 isc);
 
+static inline u8 __mpcifc(u64 req, struct zpci_fib *fib, u8 *status)
+{
+   u8 cc;
+
+   asm volatile (
+  .insn   rxy,0xe3d0,%[req],%[fib]\n
+  ipm %[cc]\n
+  srl %[cc],28\n
+   : [cc] =d (cc), [req] +d (req), [fib] +Q (*fib)
+   : : cc);
+   *status = req  24  0xff;
+   return cc;
+}
+
+static inline u8 __rpcit(u64 fn, u64 addr, u64 range, u8 *status)
+{
+   register u64 __addr asm(2) = addr;
+   register u64 __range asm(3) = range;
+   u8 cc;
+
+   asm volatile (
+  .insn   rre,0xb9d3,%[fn],%[addr]\n
+  ipm %[cc]\n
+  srl %[cc],28\n
+   : [cc] =d (cc), [fn] +d (fn)
+   : [addr] d (__addr), d (__range)
+   : cc);
+   *status = fn  24  0xff;
+   return cc;
+}
+
+static inline int __pcilg(u64 *data, u64 req, u64 offset, u8 *status)
+{
+   register u64 __req asm(2) = req;
+   register u64 __offset asm(3) = offset;
+   int cc = -ENXIO;
+   u64 __data;
+
+   asm volatile (
+  .insn   rre,0xb9d2,%[data],%[req]\n
+   0: ipm %[cc]\n
+  srl %[cc],28\n
+   1:\n
+   EX_TABLE(0b, 1b)
+   : [cc] +d (cc), [data] =d (__data), [req] +d (__req)
+   :  d (__offset)
+   : cc);
+   *status = __req  24  0xff;
+   if (!cc)
+   *data = __data;
+
+   return cc;
+}
+
+static inline int __pcistg(u64 data, u64 req, u64 offset, u8 *status)
+{
+   register u64 __req asm(2) = req;
+   register u64 __offset asm(3) = offset;
+   int cc = -ENXIO;
+
+   asm volatile (
+  .insn   rre,0xb9d0,%[data],%[req]\n
+   0: ipm %[cc]\n
+  srl %[cc],28\n
+   1:\n
+   EX_TABLE(0b, 1b)
+   : [cc] +d (cc), [req] +d (__req)
+   : d (__offset), [data] d (data)
+   : cc);
+   *status = __req  24  0xff;
+   return cc;
+}
+
+static inline int __pcistb(const u64 *data, u64 req, u64 offset, u8 *status)
+{
+   int cc = -ENXIO;
+
+   asm volatile (
+  .insn   rsy,0xebd0,%[req],%[offset],%[data]\n
+ 

[RFC][patch 6/6] s390: Add PCI pass-through device support

2014-09-04 Thread frank . blaschka
From: Frank Blaschka frank.blasc...@de.ibm.com

This patch adds a new device class handling s390 pci pass-through device
assignment. The approach is very similar to the x86 device assignment.
The device executes the KVM_ASSIGN_PCI_DEVICE ioctl to create a proxy instance
in the kernel KVM and connect this instance to the host pci device.

Signed-off-by: Frank Blaschka frank.blasc...@de.ibm.com
---
 hw/s390x/Makefile.objs  |2 
 hw/s390x/s390-pci-bus.c |   14 +-
 hw/s390x/s390_pci.c |  321 
 hw/s390x/s390_pci.h |   31 
 4 files changed, 365 insertions(+), 3 deletions(-)

--- a/hw/s390x/Makefile.objs
+++ b/hw/s390x/Makefile.objs
@@ -8,4 +8,4 @@ obj-y += ipl.o
 obj-y += css.o
 obj-y += s390-virtio-ccw.o
 obj-y += virtio-ccw.o
-obj-$(CONFIG_KVM) += s390-pci-bus.o
+obj-$(CONFIG_KVM) += s390-pci-bus.o s390_pci.o
--- a/hw/s390x/s390-pci-bus.c
+++ b/hw/s390x/s390-pci-bus.c
@@ -16,6 +16,7 @@
 #include hw/s390x/sclp.h
 #include qemu/error-report.h
 #include s390-pci-bus.h
+#include s390_pci.h
 
 /* #define DEBUG_S390PCI_BUS */
 #ifdef DEBUG_S390PCI_BUS
@@ -219,8 +220,17 @@ static void s390_pcihost_hot_plug(Hotplu
 pbdev-pdev = pci_dev;
 pbdev-configured = true;
 
-pbdev-fh = s390_pci_get_pfh(pci_dev);
-pbdev-is_virt = 1;
+if (!strcmp(pci_dev-name, s390-pci)) {
+S390PCIDevice *sdev = DO_UPCAST(S390PCIDevice, pdev, pci_dev);
+pbdev-fh = s390_pci_get_fh(sdev-host);
+if (!pbdev-fh) {
+g_free(pbdev);
+return;
+}
+} else {
+pbdev-fh = s390_pci_get_pfh(pci_dev);
+pbdev-is_virt = 1;
+}
 
 QTAILQ_INSERT_TAIL(device_list, pbdev, next);
 if (dev-hotplugged) {
--- /dev/null
+++ b/hw/s390x/s390_pci.c
@@ -0,0 +1,321 @@
+/*
+ * s390 PCI pass-through device assignment
+ *
+ * Copyright 2014 IBM Corp.
+ * Author(s): Frank Blaschka frank.blasc...@de.ibm.com
+ *Hong Bo Li lih...@cn.ibm.com
+ *Yi Min Zhao zyi...@cn.ibm.com
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or (at
+ * your option) any later version. See the COPYING file in the top-level
+ * directory.
+ */
+
+#include hw/pci/pci.h
+#include hw/pci/pci_host.h
+#include hw/pci/pci_bus.h
+#include net/net.h
+#include hw/s390x/css.h
+#include hw/s390x/sclp.h
+#include exec/exec-all.h
+#include sysemu/sysemu.h
+#include exec/address-spaces.h
+#include qemu/error-report.h
+#include qapi/qmp/qerror.h
+
+#include s390_pci.h
+#include s390-pci-bus.h
+
+/* #define DEBUG_S390PCI */
+#ifdef DEBUG_S390PCI
+#define DPRINTF(fmt, ...) \
+do { fprintf(stderr, s390pci:  fmt, ## __VA_ARGS__); } while (0)
+#else
+#define DPRINTF(fmt, ...) \
+do { } while (0)
+#endif
+
+#define ASSIGN_FLAG_HOSTIRQ 0x1
+
+uint32_t s390_pci_get_fh(PCIHostDeviceAddress host)
+{
+char fh_path[128];
+struct stat st;
+FILE *fd;
+uint32_t fh;
+
+snprintf(fh_path, sizeof(fh_path),
+/sys/bus/pci/devices/%04x:%02x:%02x.%x/function_handle,
+host.domain, host.bus, host.slot, host.function);
+
+if (stat(fh_path, st)) {
+error_report(get function handle faild: no host device specified);
+return -1;
+}
+
+fd = fopen(fh_path, r);
+if (fd == NULL) {
+error_report(%s: %s: %m, __func__, fh_path);
+return 0;
+}
+if (fscanf(fd, %x, fh) != 1) {
+fclose(fd);
+return 0;
+}
+fclose(fd);
+return fh;
+}
+
+uint32_t s390_pci_get_fid(PCIHostDeviceAddress host)
+{
+char fid_path[128];
+struct stat st;
+FILE *fd;
+uint32_t fid;
+
+snprintf(fid_path, sizeof(fid_path),
+/sys/bus/pci/devices/%04x:%02x:%02x.%x/function_id,
+host.domain, host.bus, host.slot, host.function);
+
+if (stat(fid_path, st)) {
+error_report(get function id faild: no host device specified);
+return -1;
+}
+
+fd = fopen(fid_path, r);
+if (fd == NULL) {
+error_report(%s: %s: %m, __func__, fid_path);
+return -1;
+}
+if (fscanf(fd, %x, fid) != 1) {
+fclose(fd);
+return -1;
+}
+fclose(fd);
+return fid;
+}
+
+static int get_real_id(const char *devpath, const char *idname, uint16_t *val)
+{
+FILE *f;
+char name[128];
+long id;
+
+snprintf(name, sizeof(name), %s%s, devpath, idname);
+f = fopen(name, r);
+if (f == NULL) {
+error_report(%s: %s: %m, __func__, name);
+return -1;
+}
+if (fscanf(f, %li\n, id) == 1) {
+*val = id;
+} else {
+fclose(f);
+return -1;
+}
+fclose(f);
+
+return 0;
+}
+
+static int get_real_vendor_id(const char *devpath, uint16_t *val)
+{
+return get_real_id(devpath, vendor, val);
+}
+
+static int get_real_device_id(const char *devpath, uint16_t *val)
+{
+return get_real_id(devpath, device, val);
+}
+
+static void assign_failed_examine(S390PCIDevice *dev)
+{
+char name[PATH_MAX], dir[PATH_MAX], 

[RFC][patch 0/6] pci pass-through support for qemu/KVM on s390

2014-09-04 Thread frank . blaschka
This set of patches implements pci pass-through support for qemu/KVM on s390.
PCI support on s390 is very different from other platforms.
Major differences are:

1) all PCI operations are driven by special s390 instructions
2) all s390 PCI instructions are privileged
3) PCI config and memory spaces can not be mmap'ed
4) no classic interrupts (INTX, MSI). The pci hw understands the concept
   of requesting MSIX irqs but irqs are delivered as s390 adapter irqs.
5) For DMA access there is always an IOMMU required. s390 pci implementation
   does not support a complete memory to iommu mapping, dma mappings are
   created on request.
6) The OS does not get any information about the physical layout
   of the PCI bus.
7) To take advantage of system z specific virtualization features
   we need to access the SIE control block residing in the kernel KVM
8) To enable system z specific virtualization features we have to manipulate
   the zpci device in kernel.

For these reasons I decided to implement a kernel-based approach similar
to x86 device assignment. There is a new qemu device (s390-pci) representing a
pass through device on the host. Here is a sample qemu device configuration:

-device s390-pci,host=:00:00.0

The device executes the KVM_ASSIGN_PCI_DEVICE ioctl to create a proxy instance
in the kernel KVM and connect this instance to the host pci device.

kernel patches apply to linux-kvm

s390: cio: chsc function to register GIB
s390: pci: export pci functions for pass-through usage
KVM: s390: Add GISA support
KVM: s390: Add PCI pass-through support

qemu patches apply to qemu-master

s390: Add PCI bus support
s390: Add PCI pass-through device support

Feedback and discussion is highly welcome ...
Thx!

Frank

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC][patch 1/6] s390: cio: chsc function to register GIB

2014-09-04 Thread frank . blaschka
From: Frank Blaschka frank.blasc...@de.ibm.com

This patch provides a new chsc function to register/unregister
a GIB (Guest Information Block).

Signed-off-by: Frank Blaschka frank.blasc...@de.ibm.com
---
 arch/s390/include/asm/cio.h |1 
 drivers/s390/cio/chsc.c |   50 
 2 files changed, 51 insertions(+)

--- a/arch/s390/include/asm/cio.h
+++ b/arch/s390/include/asm/cio.h
@@ -311,5 +311,6 @@ extern int cio_get_iplinfo(struct cio_ip
 /* Function from drivers/s390/cio/chsc.c */
 int chsc_sstpc(void *page, unsigned int op, u16 ctrl);
 int chsc_sstpi(void *page, void *result, size_t size);
+int chsc_sgib(u32 gibo);
 
 #endif
--- a/drivers/s390/cio/chsc.c
+++ b/drivers/s390/cio/chsc.c
@@ -1188,6 +1188,56 @@ out:
 EXPORT_SYMBOL_GPL(chsc_siosl);
 
 /**
+ * chsc_sgib() - register guest information block
+ * @gibo: guest information block
+ *
+ * gibo must be allocated in low memory
+ *
+ * Returns 0 on success.
+ */
+int chsc_sgib(u32 gibo)
+{
+   struct {
+   struct chsc_header request;
+   u16 operation_code;
+   u16 : 16;
+   u32 : 4;
+   u32 fmt : 4;
+   u32 : 24;
+   u32 : 32;
+   u32 : 32;
+   u32 gibo;
+   u64 : 64;
+   u32 : 16;
+   u32 aix : 8;
+   u32 : 8;
+   u32 reserved[1007];
+   struct chsc_header response;
+   } __packed *scssc;
+   unsigned long flags;
+   int rc;
+
+   spin_lock_irqsave(chsc_page_lock, flags);
+   memset(chsc_page, 0, PAGE_SIZE);
+   scssc = chsc_page;
+
+   scssc-request.length = 0x0fe0;
+   scssc-request.code = 0x0021;
+   scssc-operation_code = 1;
+   scssc-gibo = gibo;
+
+   rc = chsc(scssc);
+   if (rc)
+   rc = -EIO;
+   else
+   rc = chsc_error_from_response(scssc-response.code);
+
+   spin_unlock_irqrestore(chsc_page_lock, flags);
+   return rc;
+}
+EXPORT_SYMBOL_GPL(chsc_sgib);
+
+/**
  * chsc_scm_info() - store SCM information (SSI)
  * @scm_area: request and response block for SSI
  * @token: continuation token

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: kvm-unit-test failures

2014-09-04 Thread Paolo Bonzini
Il 04/09/2014 11:53, Paolo Bonzini ha scritto:
 Il 03/09/2014 20:25, Chris J Arges ha scritto:
 snip
 I'm not sure about the reason for the warp, but indeed the offset and
 uptime match (I'll check them against the trace tomorrow) so it's just
 that the VM's TSC base is not taken into account correctly.

 Can you gather another trace with the problematic patch reverted?

 Paolo


 Here is the third trace running with 0d3da0d2 reverted from the latest
 kvm queue branch 11cc9ea3:

 http://people.canonical.com/~arges/kvm/trace-3.dat.xz
 
 Thanks!  And---yay!---I reproduced it on another machine.

And my bisection landed on the merge of the timer branch (commit
e7fda6c4c3c1a7d6996dd75fd84670fa0b5d448f).  Here is the log:

$ git bisect bad origin/master
$ git bisect good v3.16
$ git bisect good kvm-3.17-1 # 42cbc04fd3b5e3f9b011bf9fa3ce0b3d1e10b58b

good[ 6929.863545] loaded kvm module (v3.17-rc1-158-g451fd72219dd)
bad [ 6971.932790] loaded kvm module (for-linus)
bad [ 7216.073128] loaded kvm module (v3.16-6426-gae045e245542)
good[ 7286.198948] loaded kvm module (v3.16-3283-g53ee983378ff)
good[ 7350.534060] loaded kvm module (v3.16-rc7-1668-gaef4f5b6db65)
good[ 7439.037038] loaded kvm module (v3.16-4006-g91c2ff7708d4)
good[ 7481.188637] loaded kvm module (v3.16-rc6-450-g7ba3c21c17d0)
bad [ 7535.292730] loaded kvm module (v3.16-4635-ge7fda6c4c3c1)
good[ 7589.722691] loaded kvm module (v3.16-rc5-110-g9b0fd802e8c0)
good[ 7630.286418] loaded kvm module (v3.16-4467-ged5c41d30ef2)
good[ 7712.470986] loaded kvm module (v3.16-rc1-35-g885d078bfe92)
good[ 7763.443626] loaded kvm module (v3.16-rc1-381-g1b0733837a9b)
good[ 7825.497414] loaded kvm module (v3.16-rc5-116-g7806f60e1d20)
good[ 7893.174056] loaded kvm module (v3.16-rc1-384-gc6f1224573c3)

This means that:

- Tomasz's patch (commit 0d3da0d26e3c3515997c99451ce3b0ad1a69a36c) is
fine, it just enables the (wrong) master clock more often

- the failure is within that branch.

I then cherry-picked Tomasz's patch during a new bisection, and landed
on one of my original suspects:

commit cbcf2dd3b3d4d990610259e8d878fc8dc1f17d80
Author: Thomas Gleixner t...@linutronix.de
Date:   Wed Jul 16 21:04:54 2014 +

x86: kvm: Make kvm_get_time_and_clockread() nanoseconds based

Convert the relevant base data right away to nanoseconds instead of
doing the conversion on every readout. Reduces text size by 160
bytes.

Signed-off-by: Thomas Gleixner t...@linutronix.de
Cc: Gleb Natapov g...@kernel.org
Cc: kvm@vger.kernel.org
Acked-by: Paolo Bonzini pbonz...@redhat.com
Signed-off-by: John Stultz john.stu...@linaro.org

Again, here is the log:

$ git bisect bad 953dec21aed4038464fec02f96a2f1b8701a5bce
$ git bisect good 1af447bd8cbfb808a320885d214555fb2d32e6e6

good[ 8384.334892] loaded kvm module (v3.16-rc5-81-g68f6783d2831)
bad [ 8525.975170] loaded kvm module (v3.16-rc5-99-gf519b1a2e08c)
good[ 8562.204988] loaded kvm module (v3.16-rc5-90-g41fa4215f8e8)
bad [ 8629.133287] loaded kvm module (v3.16-rc5-94-g48f18fd6addc)
bad [ 8772.846612] loaded kvm module (v3.16-rc5-92-gcbcf2dd3b3d4)
good[ 8836.509602] loaded kvm module (v3.16-rc5-91-gbb0b58127c5a)

Paolo
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: kvm-unit-test failures

2014-09-04 Thread Wanpeng Li
On Thu, Sep 04, 2014 at 01:33:10PM +0200, Paolo Bonzini wrote:
Il 04/09/2014 11:53, Paolo Bonzini ha scritto:
 Il 03/09/2014 20:25, Chris J Arges ha scritto:
 snip
 I'm not sure about the reason for the warp, but indeed the offset and
 uptime match (I'll check them against the trace tomorrow) so it's just
 that the VM's TSC base is not taken into account correctly.

 Can you gather another trace with the problematic patch reverted?

 Paolo


 Here is the third trace running with 0d3da0d2 reverted from the latest
 kvm queue branch 11cc9ea3:

 http://people.canonical.com/~arges/kvm/trace-3.dat.xz
 
 Thanks!  And---yay!---I reproduced it on another machine.

And my bisection landed on the merge of the timer branch (commit
e7fda6c4c3c1a7d6996dd75fd84670fa0b5d448f).  Here is the log:

$ git bisect bad origin/master
$ git bisect good v3.16
$ git bisect good kvm-3.17-1 # 42cbc04fd3b5e3f9b011bf9fa3ce0b3d1e10b58b

good[ 6929.863545] loaded kvm module (v3.17-rc1-158-g451fd72219dd)
bad [ 6971.932790] loaded kvm module (for-linus)
bad [ 7216.073128] loaded kvm module (v3.16-6426-gae045e245542)
good[ 7286.198948] loaded kvm module (v3.16-3283-g53ee983378ff)
good[ 7350.534060] loaded kvm module (v3.16-rc7-1668-gaef4f5b6db65)
good[ 7439.037038] loaded kvm module (v3.16-4006-g91c2ff7708d4)
good[ 7481.188637] loaded kvm module (v3.16-rc6-450-g7ba3c21c17d0)
bad [ 7535.292730] loaded kvm module (v3.16-4635-ge7fda6c4c3c1)
good[ 7589.722691] loaded kvm module (v3.16-rc5-110-g9b0fd802e8c0)
good[ 7630.286418] loaded kvm module (v3.16-4467-ged5c41d30ef2)
good[ 7712.470986] loaded kvm module (v3.16-rc1-35-g885d078bfe92)
good[ 7763.443626] loaded kvm module (v3.16-rc1-381-g1b0733837a9b)
good[ 7825.497414] loaded kvm module (v3.16-rc5-116-g7806f60e1d20)
good[ 7893.174056] loaded kvm module (v3.16-rc1-384-gc6f1224573c3)

This means that:

- Tomasz's patch (commit 0d3da0d26e3c3515997c99451ce3b0ad1a69a36c) is
fine, it just enables the (wrong) master clock more often

- the failure is within that branch.

I then cherry-picked Tomasz's patch during a new bisection, and landed
on one of my original suspects:

commit cbcf2dd3b3d4d990610259e8d878fc8dc1f17d80
Author: Thomas Gleixner t...@linutronix.de
Date:   Wed Jul 16 21:04:54 2014 +

x86: kvm: Make kvm_get_time_and_clockread() nanoseconds based

Convert the relevant base data right away to nanoseconds instead of
doing the conversion on every readout. Reduces text size by 160
bytes.

Signed-off-by: Thomas Gleixner t...@linutronix.de
Cc: Gleb Natapov g...@kernel.org
Cc: kvm@vger.kernel.org
Acked-by: Paolo Bonzini pbonz...@redhat.com
Signed-off-by: John Stultz john.stu...@linaro.org


Yes, I also looked into this bad commit recently; it leads to a guest hang
after live migration or after local save/restore.

Regards,
Wanpeng Li 

Again, here is the log:

$ git bisect bad 953dec21aed4038464fec02f96a2f1b8701a5bce
$ git bisect good 1af447bd8cbfb808a320885d214555fb2d32e6e6

good[ 8384.334892] loaded kvm module (v3.16-rc5-81-g68f6783d2831)
bad [ 8525.975170] loaded kvm module (v3.16-rc5-99-gf519b1a2e08c)
good[ 8562.204988] loaded kvm module (v3.16-rc5-90-g41fa4215f8e8)
bad [ 8629.133287] loaded kvm module (v3.16-rc5-94-g48f18fd6addc)
bad [ 8772.846612] loaded kvm module (v3.16-rc5-92-gcbcf2dd3b3d4)
good[ 8836.509602] loaded kvm module (v3.16-rc5-91-gbb0b58127c5a)

Paolo
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v5 0/3] ivshmem: update documentation, add client/server tools

2014-09-04 Thread David Marchand
Here is a patchset containing an update on ivshmem specs documentation and
importing ivshmem server and client tools.
These tools have been written from scratch and are not related to what is
available in nahanni repository.
I put them in the contrib/ directory since qemu-doc.texi already stated that
the server was supposed to be there.

Changes since v4:
- squashed patches 3-13 from v4 into first patch
- reused reported error when parsing arguments in server
- fixed spelling mistakes in documentation in second patch

Changes since v3:
- first patch is untouched
- just restored the Reviewed-By Claudio in second patch
- following patches 3-8 take into account Stefan's comments
- patches 9-12 take into account Gonglei's comments
- patch 13 adjusts ivshmem-server default values
- last patch introduces a change in the ivshmem client-server protocol to
  check a protocol version at connect time

Changes since v2:
- fixed license issues in ivshmem client/server (I took hw/virtio/virtio-rng.c
  file as a reference).

Changes since v1:
- moved client/server import patch before doc update,
- tried to re-organise the ivshmem_device_spec.txt file based on Claudio
  comments (still not sure if the result is that great, comments welcome),
- incorporated comments from Claudio, Eric and Cam,
- added more details on the server - client messages exchange (but sorry, no
  ASCII art here).

By the way, there are still some functionalities that need description (use of
ioeventfd, the lack of irqfd support) and some parts of the ivshmem code clearly
need cleanup. I will try to address this in future patches when these first
patches are ok.


-- 
David Marchand

David Marchand (3):
  contrib: add ivshmem client and server
  docs: update ivshmem device spec
  ivshmem: add check on protocol version in QEMU

 Makefile|8 +
 configure   |3 +
 contrib/ivshmem-client/ivshmem-client.c |  413 +++
 contrib/ivshmem-client/ivshmem-client.h |  240 ++
 contrib/ivshmem-client/main.c   |  237 ++
 contrib/ivshmem-server/ivshmem-server.c |  402 ++
 contrib/ivshmem-server/ivshmem-server.h |  187 ++
 contrib/ivshmem-server/main.c   |  244 ++
 docs/specs/ivshmem_device_spec.txt  |  127 +++---
 hw/misc/ivshmem.c   |   43 +++-
 include/hw/misc/ivshmem.h   |   17 ++
 qemu-doc.texi   |   10 +-
 12 files changed, 1888 insertions(+), 43 deletions(-)
 create mode 100644 contrib/ivshmem-client/ivshmem-client.c
 create mode 100644 contrib/ivshmem-client/ivshmem-client.h
 create mode 100644 contrib/ivshmem-client/main.c
 create mode 100644 contrib/ivshmem-server/ivshmem-server.c
 create mode 100644 contrib/ivshmem-server/ivshmem-server.h
 create mode 100644 contrib/ivshmem-server/main.c
 create mode 100644 include/hw/misc/ivshmem.h

-- 
1.7.10.4

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v5 3/3] ivshmem: add check on protocol version in QEMU

2014-09-04 Thread David Marchand
Send a protocol version as the first message from server, clients must close
communication if they don't support this protocol version.
Older QEMUs should be fine with this change in the protocol since they override
their own vm_id on reception of an id associated with no eventfd.

Signed-off-by: David Marchand david.march...@6wind.com
---
 contrib/ivshmem-client/ivshmem-client.c |   14 +++---
 contrib/ivshmem-client/ivshmem-client.h |1 +
 contrib/ivshmem-server/ivshmem-server.c |7 +
 contrib/ivshmem-server/ivshmem-server.h |1 +
 docs/specs/ivshmem_device_spec.txt  |9 ---
 hw/misc/ivshmem.c   |   43 ---
 include/hw/misc/ivshmem.h   |   17 
 7 files changed, 77 insertions(+), 15 deletions(-)
 create mode 100644 include/hw/misc/ivshmem.h

diff --git a/contrib/ivshmem-client/ivshmem-client.c 
b/contrib/ivshmem-client/ivshmem-client.c
index ad210c8..0c4e016 100644
--- a/contrib/ivshmem-client/ivshmem-client.c
+++ b/contrib/ivshmem-client/ivshmem-client.c
@@ -184,10 +184,18 @@ ivshmem_client_connect(IvshmemClient *client)
 goto err_close;
 }
 
-/* first, we expect our index + a fd == -1 */
+/* first, we expect a protocol version */
+if (read_one_msg(client, tmp, fd)  0 ||
+(tmp != IVSHMEM_PROTOCOL_VERSION) || fd != -1) {
+debug_log(client, cannot read from server\n);
+goto err_close;
+}
+debug_log(client, our_id=%ld\n, client-local.id);
+
+/* then, we expect our index + a fd == -1 */
 if (read_one_msg(client, client-local.id, fd)  0 ||
 client-local.id  0 || fd != -1) {
-debug_log(client, cannot read from server\n);
+debug_log(client, cannot read from server (2)\n);
 goto err_close;
 }
 debug_log(client, our_id=%ld\n, client-local.id);
@@ -196,7 +204,7 @@ ivshmem_client_connect(IvshmemClient *client)
  * is not used */
 if (read_one_msg(client, tmp, fd)  0 ||
 tmp != -1 || fd  0) {
-debug_log(client, cannot read from server (2)\n);
+debug_log(client, cannot read from server (3)\n);
 goto err_close;
 }
 debug_log(client, shm_fd=%d\n, fd);
diff --git a/contrib/ivshmem-client/ivshmem-client.h 
b/contrib/ivshmem-client/ivshmem-client.h
index 45f2b64..8d6ab35 100644
--- a/contrib/ivshmem-client/ivshmem-client.h
+++ b/contrib/ivshmem-client/ivshmem-client.h
@@ -23,6 +23,7 @@
 #include sys/select.h
 
 #include qemu/queue.h
+#include hw/misc/ivshmem.h
 
 /**
  * Maximum number of notification vectors supported by the client
diff --git a/contrib/ivshmem-server/ivshmem-server.c 
b/contrib/ivshmem-server/ivshmem-server.c
index f441da7..670c58c 100644
--- a/contrib/ivshmem-server/ivshmem-server.c
+++ b/contrib/ivshmem-server/ivshmem-server.c
@@ -99,6 +99,13 @@ send_initial_info(IvshmemServer *server, IvshmemServerPeer 
*peer)
 {
 int ret;
 
+/* send our protool version first */
+ret = send_one_msg(peer-sock_fd, IVSHMEM_PROTOCOL_VERSION, -1);
+if (ret  0) {
+debug_log(server, cannot send version: %s\n, strerror(errno));
+return -1;
+}
+
 /* send the peer id to the client */
 ret = send_one_msg(peer-sock_fd, peer-id, -1);
 if (ret  0) {
diff --git a/contrib/ivshmem-server/ivshmem-server.h 
b/contrib/ivshmem-server/ivshmem-server.h
index 5ccc7af..e76e4fe 100644
--- a/contrib/ivshmem-server/ivshmem-server.h
+++ b/contrib/ivshmem-server/ivshmem-server.h
@@ -30,6 +30,7 @@
 #include sys/select.h
 
 #include qemu/queue.h
+#include hw/misc/ivshmem.h
 
 /**
  * Maximum number of notification vectors supported by the server
diff --git a/docs/specs/ivshmem_device_spec.txt 
b/docs/specs/ivshmem_device_spec.txt
index 12f338e..3435116 100644
--- a/docs/specs/ivshmem_device_spec.txt
+++ b/docs/specs/ivshmem_device_spec.txt
@@ -64,6 +64,8 @@ It creates a shared memory object then waits for clients to 
connect on a unix
 socket.
 
 For each client (QEMU process) that connects to the server:
+- the server sends a protocol version, if client does not support it, the 
client
+  closes the communication,
 - the server assigns an ID for this client and sends this ID to him as the 
first
   message,
 - the server sends a fd to the shared memory object to this client,
@@ -86,9 +88,10 @@ been provided in qemu.git/contrib/ivshmem-client for debug.
 
 *QEMU as an ivshmem client*
 
-At initialisation, when creating the ivshmem device, QEMU gets its ID from the
-server then makes it available through BAR0 IVPosition register for the VM to
-use (see 'PCI device registers' subsection).
+At initialisation, when creating the ivshmem device, QEMU first receives a
+protocol version and closes communication with server if it does not match.
+Then, QEMU gets its ID from the server then makes it available through BAR0
+IVPosition register for the VM to use (see 'PCI device registers' subsection).
 QEMU then uses the fd to the shared memory to map it to BAR2.
 

[PATCH v5 1/3] contrib: add ivshmem client and server

2014-09-04 Thread David Marchand
When using ivshmem devices, notifications between guests can be sent as
interrupts using a ivshmem-server (typical use described in documentation).
The client is provided as a debug tool.

Signed-off-by: Olivier Matz olivier.m...@6wind.com
Signed-off-by: David Marchand david.march...@6wind.com
---
 Makefile|8 +
 configure   |3 +
 contrib/ivshmem-client/ivshmem-client.c |  405 +++
 contrib/ivshmem-client/ivshmem-client.h |  239 ++
 contrib/ivshmem-client/main.c   |  237 ++
 contrib/ivshmem-server/ivshmem-server.c |  395 ++
 contrib/ivshmem-server/ivshmem-server.h |  186 ++
 contrib/ivshmem-server/main.c   |  244 +++
 qemu-doc.texi   |   10 +-
 9 files changed, 1724 insertions(+), 3 deletions(-)
 create mode 100644 contrib/ivshmem-client/ivshmem-client.c
 create mode 100644 contrib/ivshmem-client/ivshmem-client.h
 create mode 100644 contrib/ivshmem-client/main.c
 create mode 100644 contrib/ivshmem-server/ivshmem-server.c
 create mode 100644 contrib/ivshmem-server/ivshmem-server.h
 create mode 100644 contrib/ivshmem-server/main.c

diff --git a/Makefile b/Makefile
index b33aaac..0575898 100644
--- a/Makefile
+++ b/Makefile
@@ -283,6 +283,14 @@ $(qga-obj-y) qemu-ga.o: $(QGALIB_GEN)
 qemu-ga$(EXESUF): $(qga-obj-y) libqemuutil.a libqemustub.a
$(call LINK, $^)
 
+IVSHMEM_CLIENT_OBJS=$(addprefix $(SRC_PATH)/contrib/ivshmem-client/, 
ivshmem-client.o main.o)
+ivshmem-client$(EXESUF): $(IVSHMEM_CLIENT_OBJS)
+   $(call LINK, $^)
+
+IVSHMEM_SERVER_OBJS=$(addprefix $(SRC_PATH)/contrib/ivshmem-server/, 
ivshmem-server.o main.o)
+ivshmem-server$(EXESUF): $(IVSHMEM_SERVER_OBJS) libqemuutil.a libqemustub.a
+   $(call LINK, $^)
+
 clean:
 # avoid old build problems by removing potentially incorrect old files
rm -f config.mak op-i386.h opc-i386.h gen-op-i386.h op-arm.h opc-arm.h 
gen-op-arm.h
diff --git a/configure b/configure
index 961bf6f..a41a16c 100755
--- a/configure
+++ b/configure
@@ -4125,6 +4125,9 @@ if test $want_tools = yes ; then
   if [ $linux = yes -o $bsd = yes -o $solaris = yes ] ; then
 tools=qemu-nbd\$(EXESUF) $tools
   fi
+  if [ $kvm = yes ] ; then
+tools=ivshmem-client\$(EXESUF) ivshmem-server\$(EXESUF) $tools
+  fi
 fi
 if test $softmmu = yes ; then
   if test $virtfs != no ; then
diff --git a/contrib/ivshmem-client/ivshmem-client.c 
b/contrib/ivshmem-client/ivshmem-client.c
new file mode 100644
index 000..ad210c8
--- /dev/null
+++ b/contrib/ivshmem-client/ivshmem-client.c
@@ -0,0 +1,405 @@
+/*
+ * Copyright 6WIND S.A., 2014
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * (at your option) any later version.  See the COPYING file in the
+ * top-level directory.
+ */
+
+#include sys/types.h
+#include sys/socket.h
+#include sys/un.h
+
+#include qemu-common.h
+#include qemu/queue.h
+
+#include ivshmem-client.h
+
+/* log a message on stdout if verbose=1 */
+#define debug_log(client, fmt, ...) do { \
+if ((client)-verbose) { \
+printf(fmt, ## __VA_ARGS__); \
+}\
+} while (0)
+
+/* read message from the unix socket */
+static int
+read_one_msg(IvshmemClient *client, long *index, int *fd)
+{
+int ret;
+struct msghdr msg;
+struct iovec iov[1];
+union {
+struct cmsghdr cmsg;
+char control[CMSG_SPACE(sizeof(int))];
+} msg_control;
+struct cmsghdr *cmsg;
+
+iov[0].iov_base = index;
+iov[0].iov_len = sizeof(*index);
+
+memset(msg, 0, sizeof(msg));
+msg.msg_iov = iov;
+msg.msg_iovlen = 1;
+msg.msg_control = msg_control;
+msg.msg_controllen = sizeof(msg_control);
+
+ret = recvmsg(client-sock_fd, msg, 0);
+if (ret  0) {
+debug_log(client, cannot read message: %s\n, strerror(errno));
+return -1;
+}
+if (ret == 0) {
+debug_log(client, lost connection to server\n);
+return -1;
+}
+
+*fd = -1;
+
+for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
+
+if (cmsg-cmsg_len != CMSG_LEN(sizeof(int)) ||
+cmsg-cmsg_level != SOL_SOCKET ||
+cmsg-cmsg_type != SCM_RIGHTS) {
+continue;
+}
+
+memcpy(fd, CMSG_DATA(cmsg), sizeof(*fd));
+}
+
+return 0;
+}
+
+/* free a peer when the server advertise a disconnection or when the
+ * client is freed */
+static void
+free_peer(IvshmemClient *client, IvshmemClientPeer *peer)
+{
+unsigned vector;
+
+QTAILQ_REMOVE(client-peer_list, peer, next);
+for (vector = 0; vector  peer-vectors_count; vector++) {
+close(peer-vectors[vector]);
+}
+
+g_free(peer);
+}
+
+/* handle message coming from server (new peer, new vectors) */
+static int
+handle_server_msg(IvshmemClient *client)
+{
+IvshmemClientPeer *peer;

[PATCH v5 2/3] docs: update ivshmem device spec

2014-09-04 Thread David Marchand
Add some notes on the parts needed to use ivshmem devices: more specifically,
explain the purpose of an ivshmem server and the basic concept to use the
ivshmem devices in guests.
Move some parts of the documentation and re-organise it.

Signed-off-by: David Marchand david.march...@6wind.com
Reviewed-by: Claudio Fontana claudio.font...@huawei.com
---
 docs/specs/ivshmem_device_spec.txt |  124 +++-
 1 file changed, 93 insertions(+), 31 deletions(-)

diff --git a/docs/specs/ivshmem_device_spec.txt 
b/docs/specs/ivshmem_device_spec.txt
index 667a862..12f338e 100644
--- a/docs/specs/ivshmem_device_spec.txt
+++ b/docs/specs/ivshmem_device_spec.txt
@@ -2,30 +2,103 @@
 Device Specification for Inter-VM shared memory device
 --
 
-The Inter-VM shared memory device is designed to share a region of memory to
-userspace in multiple virtual guests.  The memory region does not belong to any
-guest, but is a POSIX memory object on the host.  Optionally, the device may
-support sending interrupts to other guests sharing the same memory region.
+The Inter-VM shared memory device is designed to share a memory region (created
+on the host via the POSIX shared memory API) between multiple QEMU processes
+running different guests. In order for all guests to be able to pick up the
+shared memory area, it is modeled by QEMU as a PCI device exposing said memory
+to the guest as a PCI BAR.
+The memory region does not belong to any guest, but is a POSIX memory object on
+the host. The host can access this shared memory if needed.
+
+The device also provides an optional communication mechanism between guests
+sharing the same memory object. More details about that in the section 'Guest 
to
+guest communication' section.
 
 
 The Inter-VM PCI device
 ---
 
-*BARs*
+From the VM point of view, the ivshmem PCI device supports three BARs.
+
+- BAR0 is a 1 Kbyte MMIO region to support registers and interrupts when MSI is
+  not used.
+- BAR1 is used for MSI-X when it is enabled in the device.
+- BAR2 is used to access the shared memory object.
+
+It is your choice how to use the device but you must choose between two
+behaviors :
+
+- basically, if you only need the shared memory part, you will map BAR2.
+  This way, you have access to the shared memory in guest and can use it as you
+  see fit (memnic, for example, uses it in userland
+  http://dpdk.org/browse/memnic).
+
+- BAR0 and BAR1 are used to implement an optional communication mechanism
+  through interrupts in the guests. If you need an event mechanism between the
+  guests accessing the shared memory, you will most likely want to write a
+  kernel driver that will handle interrupts. See details in the section 'Guest
+  to guest communication' section.
+
+The behavior is chosen when starting your QEMU processes:
+- no communication mechanism needed, the first QEMU to start creates the shared
+  memory on the host, subsequent QEMU processes will use it.
+
+- communication mechanism needed, an ivshmem server must be started before any
+  QEMU processes, then each QEMU process connects to the server unix socket.
+
+For more details on the QEMU ivshmem parameters, see qemu-doc documentation.
+
+
+Guest to guest communication
+
+
+This section details the communication mechanism between the guests accessing
+the ivhsmem shared memory.
 
-The device supports three BARs.  BAR0 is a 1 Kbyte MMIO region to support
-registers.  BAR1 is used for MSI-X when it is enabled in the device.  BAR2 is
-used to map the shared memory object from the host.  The size of BAR2 is
-specified when the guest is started and must be a power of 2 in size.
+*ivshmem server*
 
-*Registers*
+This server code is available in qemu.git/contrib/ivshmem-server.
 
-The device currently supports 4 registers of 32-bits each.  Registers
-are used for synchronization between guests sharing the same memory object when
-interrupts are supported (this requires using the shared memory server).
+The server must be started on the host before any guest.
+It creates a shared memory object then waits for clients to connect on a unix
+socket.
 
-The server assigns each VM an ID number and sends this ID number to the QEMU
-process when the guest starts.
+For each client (QEMU process) that connects to the server:
+- the server assigns an ID for this client and sends this ID to him as the 
first
+  message,
+- the server sends a fd to the shared memory object to this client,
+- the server creates a new set of host eventfds associated to the new client 
and
+  sends this set to all already connected clients,
+- finally, the server sends all the eventfds sets for all clients to the new
+  client.
+
+The server signals all clients when one of them disconnects.
+
+The client IDs are limited to 16 bits because of the current implementation 
(see
+Doorbell register in 'PCI device registers' subsection). Hence 

[PATCH] KVM: x86: fix kvmclock breakage from timers branch merge

2014-09-04 Thread Paolo Bonzini
Commit cbcf2dd3b3d4 (x86: kvm: Make kvm_get_time_and_clockread() nanoseconds
based, 2014-07-16) forgot to add tk->xtime_sec, thus breaking kvmclock on
hosts that have a reliable TSC.  Add it back; and since the field boot_ns
is no longer related to the host boot-based clock, rename boot_ns -> nsec_base
and the existing nsec_base -> snsec_base.

Cc: Thomas Gleixner t...@linutronix.de
Cc: John Stultz john.stu...@linaro.org
Reported-by: Chris J Arges chris.j.ar...@canonical.com
Signed-off-by: Paolo Bonzini pbonz...@redhat.com
---
 arch/x86/kvm/x86.c | 11 ++-
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 8f1e22d3b286..92493e10937c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1020,8 +1020,8 @@ struct pvclock_gtod_data {
u32 shift;
} clock;
 
-   u64 boot_ns;
u64 nsec_base;
+   u64 snsec_base;
 };
 
 static struct pvclock_gtod_data pvclock_gtod_data;
@@ -1042,8 +1042,9 @@ static void update_pvclock_gtod(struct timekeeper *tk)
vdata-clock.mult   = tk-tkr.mult;
vdata-clock.shift  = tk-tkr.shift;
 
-   vdata-boot_ns  = boot_ns;
-   vdata-nsec_base= tk-tkr.xtime_nsec;
+   vdata-nsec_base= tk-xtime_sec * (u64)NSEC_PER_SEC
+   + boot_ns;
+   vdata-snsec_base   = tk-tkr.xtime_nsec;
 
write_seqcount_end(vdata-seq);
 }
@@ -1413,10 +1414,10 @@ static int do_monotonic_boot(s64 *t, cycle_t *cycle_now)
do {
seq = read_seqcount_begin(gtod-seq);
mode = gtod-clock.vclock_mode;
-   ns = gtod-nsec_base;
+   ns = gtod-snsec_base;
ns += vgettsc(cycle_now);
ns = gtod-clock.shift;
-   ns += gtod-boot_ns;
+   ns += gtod-nsec_base;
} while (unlikely(read_seqcount_retry(gtod-seq, seq)));
*t = ns;
 
-- 
2.1.0

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: kvm-unit-test failures

2014-09-04 Thread Paolo Bonzini
Il 04/09/2014 14:24, Wanpeng Li ha scritto:
 On Thu, Sep 04, 2014 at 01:33:10PM +0200, Paolo Bonzini wrote:
 Il 04/09/2014 11:53, Paolo Bonzini ha scritto:
 Il 03/09/2014 20:25, Chris J Arges ha scritto:
 snip
 I'm not sure about the reason for the warp, but indeed the offset and
 uptime match (I'll check them against the trace tomorrow) so it's just
 that the VM's TSC base is not taken into account correctly.

 Can you gather another trace with the problematic patch reverted?

 Paolo


 Here is the third trace running with 0d3da0d2 reverted from the latest
 kvm queue branch 11cc9ea3:

 http://people.canonical.com/~arges/kvm/trace-3.dat.xz

 Thanks!  And---yay!---I reproduced it on another machine.

 And my bisection landed on the merge of the timer branch (commit
 e7fda6c4c3c1a7d6996dd75fd84670fa0b5d448f).  Here is the log:

 $ git bisect bad origin/master
 $ git bisect good v3.16
 $ git bisect good kvm-3.17-1 # 42cbc04fd3b5e3f9b011bf9fa3ce0b3d1e10b58b

 good[ 6929.863545] loaded kvm module (v3.17-rc1-158-g451fd72219dd)
 bad [ 6971.932790] loaded kvm module (for-linus)
 bad [ 7216.073128] loaded kvm module (v3.16-6426-gae045e245542)
 good[ 7286.198948] loaded kvm module (v3.16-3283-g53ee983378ff)
 good[ 7350.534060] loaded kvm module (v3.16-rc7-1668-gaef4f5b6db65)
 good[ 7439.037038] loaded kvm module (v3.16-4006-g91c2ff7708d4)
 good[ 7481.188637] loaded kvm module (v3.16-rc6-450-g7ba3c21c17d0)
 bad [ 7535.292730] loaded kvm module (v3.16-4635-ge7fda6c4c3c1)
 good[ 7589.722691] loaded kvm module (v3.16-rc5-110-g9b0fd802e8c0)
 good[ 7630.286418] loaded kvm module (v3.16-4467-ged5c41d30ef2)
 good[ 7712.470986] loaded kvm module (v3.16-rc1-35-g885d078bfe92)
 good[ 7763.443626] loaded kvm module (v3.16-rc1-381-g1b0733837a9b)
 good[ 7825.497414] loaded kvm module (v3.16-rc5-116-g7806f60e1d20)
 good[ 7893.174056] loaded kvm module (v3.16-rc1-384-gc6f1224573c3)

 This means that:

 - Tomasz's patch (commit 0d3da0d26e3c3515997c99451ce3b0ad1a69a36c) is
 fine, it just enables the (wrong) master clock more often

 - the failure is within that branch.

 I then cherry-picked Tomasz's patch during a new bisection, and landed
 on one of my original suspects:

 commit cbcf2dd3b3d4d990610259e8d878fc8dc1f17d80
 Author: Thomas Gleixner t...@linutronix.de
 Date:   Wed Jul 16 21:04:54 2014 +

x86: kvm: Make kvm_get_time_and_clockread() nanoseconds based

Convert the relevant base data right away to nanoseconds instead of
doing the conversion on every readout. Reduces text size by 160
bytes.

Signed-off-by: Thomas Gleixner t...@linutronix.de
Cc: Gleb Natapov g...@kernel.org
Cc: kvm@vger.kernel.org
Acked-by: Paolo Bonzini pbonz...@redhat.com
Signed-off-by: John Stultz john.stu...@linaro.org

 
 Yes, I also looked into this bad commit recently, which leads to a guest hang
 after live migration or after local save/restore.

Thanks for the report!

Wanpeng, can you test and/or review the patch I just posted ([PATCH]
KVM: x86: fix kvmclock breakage from timers branch merge)?

Paolo
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC][patch 0/6] pci pass-through support for qemu/KVM on s390

2014-09-04 Thread Alex Williamson
On Thu, 2014-09-04 at 12:52 +0200, frank.blasc...@de.ibm.com wrote:
 This set of patches implements pci pass-through support for qemu/KVM on s390.
 PCI support on s390 is very different from other platforms.
 Major differences are:
 
 1) all PCI operations are driven by special s390 instructions

Generating config cycles is always arch specific.

 2) all s390 PCI instructions are privileged

While the operations to generate config cycles on x86 are not
privileged, they must be arbitrated between accesses, so in a sense
they're privileged.

 3) PCI config and memory spaces can not be mmap'ed

VFIO has mapping flags that allow any region to specify mmap support.

 4) no classic interrupts (INTX, MSI). The pci hw understands the concept
of requesting MSIX irqs but irqs are delivered as s390 adapter irqs.

VFIO delivers interrupts as eventfds regardless of the underlying
platform mechanism.

 5) For DMA access there is always an IOMMU required.

x86 requires the same.

  s390 pci implementation
does not support a complete memory to iommu mapping, dma mappings are
created on request.

Sounds like POWER.

 6) The OS does not get any information about the physical layout
of the PCI bus.

If that means that every device is isolated (seems unlikely for
multifunction devices) then that makes IOMMU group support really easy.

 7) To take advantage of system z specific virtualization features
we need to access the SIE control block residing in the kernel KVM

The KVM-VFIO device allows interaction between VFIO devices and KVM.

 8) To enable system z specific virtualization features we have to manipulate
the zpci device in kernel.

VFIO supports different device backends, currently pci_dev and working
towards platform devices.  zpci might just be an extension to standard
pci.

 For these reasons I decided to implement a kernel-based approach similar
 to x86 device assignment. There is a new qemu device (s390-pci) representing a
 pass through device on the host. Here is a sample qemu device configuration:
 
 -device s390-pci,host=:00:00.0
 
 The device executes the KVM_ASSIGN_PCI_DEVICE ioctl to create a proxy instance
 in the kernel KVM and connect this instance to the host pci device.
 
 kernel patches apply to linux-kvm
 
 s390: cio: chsc function to register GIB
 s390: pci: export pci functions for pass-through usage
 KVM: s390: Add GISA support
 KVM: s390: Add PCI pass-through support
 
 qemu patches apply to qemu-master
 
 s390: Add PCI bus support
 s390: Add PCI pass-through device support
 
 Feedback and discussion is highly welcome ...

KVM-based device assignment needs to go away.  It's a horrible model for
devices, it offers very little protection to the kernel, assumes every
device is fully isolated and visible to the IOMMU, relies on a smattering
of sysfs files to operate, etc.  x86, POWER, and ARM are all moving to
VFIO-based device assignment.  Why is s390 special enough to repeat all
the mistakes that x86 did?  Thanks,

Alex

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 3/4] KVM: x86: inject nested page faults on emulated instructions

2014-09-04 Thread Paolo Bonzini
Il 04/09/2014 09:02, Gleb Natapov ha scritto:
 On Tue, Sep 02, 2014 at 05:13:49PM +0200, Paolo Bonzini wrote:
  This is required for the following patch to work correctly.  If a nested 
  page
  fault happens during emulation, we must inject a vmexit, not a page fault.
  Luckily we already have the required machinery: it is enough to return
  X86EMUL_INTERCEPTED instead of X86EMUL_PROPAGATE_FAULT.
  
 I wonder why this patch is needed. X86EMUL_PROPAGATE_FAULT causes
 ctxt->have_exception to be set to true in x86_emulate_insn().
 x86_emulate_instruction() checks ctxt->have_exception and calls
 inject_emulated_exception() if it is true. inject_emulated_exception()
 calls kvm_propagate_fault() where we check if the fault was nested and
 generate vmexit or a page fault accordingly.

Good question. :)

If you do that, KVM gets down to the "if (writeback)" and writes the
ctxt->eip from L2 into the L1 EIP.

Possibly this patch can be replaced by just this?

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 022513b..475e979 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5312,7 +5312,7 @@ restart:
 
if (ctxt-have_exception) {
inject_emulated_exception(vcpu);
-   r = EMULATE_DONE;
+   return EMULATE_DONE;
} else if (vcpu-arch.pio.count) {
if (!vcpu-arch.pio.in) {
/* FIXME: return into emulator if single-stepping.  */

But I'm not sure how to test it, and I like the idea of treating nested page
faults like other nested vmexits during emulation (which is what this patch
does).

If I included this patch, I could then remove kvm_propagate_fault
like (I think) this:

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 92493e10937c..e096db566ac2 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4910,9 +4902,10 @@ static void toggle_interruptibility(struct kvm_vcpu 
*vcpu, u32 mask)
 static void inject_emulated_exception(struct kvm_vcpu *vcpu)
 {
struct x86_emulate_ctxt *ctxt = vcpu-arch.emulate_ctxt;
-   if (ctxt-exception.vector == PF_VECTOR)
-   kvm_propagate_fault(vcpu, ctxt-exception);
-   else if (ctxt-exception.error_code_valid)
+   if (ctxt-exception.vector == PF_VECTOR) {
+   WARN_ON(fault-nested_page_fault);
+   vcpu-arch.walk_mmu-inject_page_fault(vcpu, fault);
+   } else if (ctxt-exception.error_code_valid)
kvm_queue_exception_e(vcpu, ctxt-exception.vector,
  ctxt-exception.error_code);
else

What do you think?

Paolo
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC][patch 3/6] KVM: s390: Add GISA support

2014-09-04 Thread Heiko Carstens
On Thu, Sep 04, 2014 at 12:52:26PM +0200, frank.blasc...@de.ibm.com wrote:
 +void kvm_s390_gisa_register_alert(struct kvm *kvm, u32 gisc)
 +{
 + int bito = BITS_PER_BYTE * 7 + gisc;
 +
 + set_bit(bito ^ (BITS_PER_LONG - 1), kvm-arch.iam);
 +}

Just a very minor nit: you could also use set_bit_inv() & friends.

 +static inline u64 kvm_s390_get_base_disp_rxy(struct kvm_vcpu *vcpu)
 +{
 + u32 x2 = (vcpu-arch.sie_block-ipa  0x000f);
 + u32 base2 = vcpu-arch.sie_block-ipb  28;
 + u32 disp2 = ((vcpu-arch.sie_block-ipb  0x0fff)  16) +
 + ((vcpu-arch.sie_block-ipb  0xff00)  4);
 +
 + return (base2 ? vcpu-run-s.regs.gprs[base2] : 0) +
 + (x2 ? vcpu-run-s.regs.gprs[x2] : 0) + (u64)disp2;
 +}

Not very readable ;) However.. for the RXY instruction format the 20 bit
displacement is usually signed and not unsigned like your code seems to
treat it.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 3/4] KVM: x86: inject nested page faults on emulated instructions

2014-09-04 Thread Gleb Natapov
On Thu, Sep 04, 2014 at 04:12:19PM +0200, Paolo Bonzini wrote:
 Il 04/09/2014 09:02, Gleb Natapov ha scritto:
  On Tue, Sep 02, 2014 at 05:13:49PM +0200, Paolo Bonzini wrote:
   This is required for the following patch to work correctly.  If a nested 
   page
   fault happens during emulation, we must inject a vmexit, not a page 
   fault.
   Luckily we already have the required machinery: it is enough to return
   X86EMUL_INTERCEPTED instead of X86EMUL_PROPAGATE_FAULT.
   
  I wonder why this patch is needed. X86EMUL_PROPAGATE_FAULT causes
  ctxt-have_exception to be set to true in x86_emulate_insn().
  x86_emulate_instruction() checks ctxt-have_exception and calls
  inject_emulated_exception() if it is true. inject_emulated_exception()
  calls kvm_propagate_fault() where we check if the fault was nested and
  generate vmexit or a page fault accordingly.
 
 Good question. :)
 
 If you do that, KVM gets down to the if (writeback) and writes the 
 ctxt-eip from L2 into the L1 EIP.
Heh, that's a bummer. We should not write back if an instruction caused a 
vmexit.

 
 Possibly this patch can be replaced by just this?
 
 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
 index 022513b..475e979 100644
 --- a/arch/x86/kvm/x86.c
 +++ b/arch/x86/kvm/x86.c
 @@ -5312,7 +5312,7 @@ restart:
  
 if (ctxt-have_exception) {
 inject_emulated_exception(vcpu);
 -   r = EMULATE_DONE;
 +   return EMULATE_DONE;
If there was no vmexit we still want to writeback. Perhaps:
writeback = inject_emulated_exception(vcpu);
and return false if there was vmexit due to nested page fault (or any fault,
can't L1 ask for #GP/#UD intercept that need to be handled here too?)

 } else if (vcpu-arch.pio.count) {
 if (!vcpu-arch.pio.in) {
 /* FIXME: return into emulator if single-stepping.  */
 
 But I'm not sure how to test it, and I like the idea of treating nested page
 faults like other nested vmexits during emulation (which is what this patch
 does).
IMO exits due to instruction intercepts and exits due to other interceptable
events that may happen during instruction emulation are sufficiently different
to be handled slightly differently. If my assumption about #GP above is
correct, then with the current approach it can be easily handled inside
inject_emulated_exception().

--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v5 0/3] ivshmem: update documentation, add client/server tools

2014-09-04 Thread Michael S. Tsirkin
On Thu, Sep 04, 2014 at 02:50:58PM +0200, David Marchand wrote:
 Here is a patchset containing an update on ivshmem specs documentation and
 importing ivshmem server and client tools.
 These tools have been written from scratch and are not related to what is
 available in nahanni repository.
 I put them in contrib/ directory as the qemu-doc.texi was already telling the
 server was supposed to be there.

I think it's a very nice patchset, and very useful.
Some comments on the patches follow.

 Changes since v4:
 - squashed patches 3-13 from v4 into first patch
 - reused reported error when parsing arguments in server
 - fixed spelling mistakes in documentation in second patch
 
 Changes since v3:
 - first patch is untouched
 - just restored the Reviewed-By Claudio in second patch
 - following patches 3-8 take into account Stefan's comments
 - patches 9-12 take into account Gonglei's comments
 - patch 13 adjusts ivshmem-server default values
 - last patch introduces a change in the ivshmem client-server protocol to
   check a protocol version at connect time
 
 Changes since v2:
 - fixed license issues in ivshmem client/server (I took hw/virtio/virtio-rng.c
   file as a reference).
 
 Changes since v1:
 - moved client/server import patch before doc update,
 - tried to re-organise the ivshmem_device_spec.txt file based on Claudio
   comments (still not sure if the result is that great, comments welcome),
 - incorporated comments from Claudio, Eric and Cam,
 - added more details on the server - client messages exchange (but sorry, no
   ASCII art here).
 
 By the way, there are still some functionalities that need description (use
 of ioeventfd, the lack of irqfd support) and some parts of the ivshmem code
 clearly need cleanup. I will try to address this in future patches when these
 first patches are ok.
 
 
 -- 
 David Marchand
 
 David Marchand (3):
   contrib: add ivshmem client and server
   docs: update ivshmem device spec
   ivshmem: add check on protocol version in QEMU
 
  Makefile|8 +
  configure   |3 +
  contrib/ivshmem-client/ivshmem-client.c |  413 
 +++
  contrib/ivshmem-client/ivshmem-client.h |  240 ++
  contrib/ivshmem-client/main.c   |  237 ++
  contrib/ivshmem-server/ivshmem-server.c |  402 ++
  contrib/ivshmem-server/ivshmem-server.h |  187 ++
  contrib/ivshmem-server/main.c   |  244 ++
  docs/specs/ivshmem_device_spec.txt  |  127 +++---
  hw/misc/ivshmem.c   |   43 +++-
  include/hw/misc/ivshmem.h   |   17 ++
  qemu-doc.texi   |   10 +-
  12 files changed, 1888 insertions(+), 43 deletions(-)
  create mode 100644 contrib/ivshmem-client/ivshmem-client.c
  create mode 100644 contrib/ivshmem-client/ivshmem-client.h
  create mode 100644 contrib/ivshmem-client/main.c
  create mode 100644 contrib/ivshmem-server/ivshmem-server.c
  create mode 100644 contrib/ivshmem-server/ivshmem-server.h
  create mode 100644 contrib/ivshmem-server/main.c
  create mode 100644 include/hw/misc/ivshmem.h
 
 -- 
 1.7.10.4
 
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v5 1/3] contrib: add ivshmem client and server

2014-09-04 Thread Michael S. Tsirkin
On Thu, Sep 04, 2014 at 02:50:59PM +0200, David Marchand wrote:
 When using ivshmem devices, notifications between guests can be sent as
 interrupts using a ivshmem-server (typical use described in documentation).
 The client is provided as a debug tool.
 
 Signed-off-by: Olivier Matz olivier.m...@6wind.com
 Signed-off-by: David Marchand david.march...@6wind.com
 ---
  Makefile|8 +
  configure   |3 +
  contrib/ivshmem-client/ivshmem-client.c |  405 
 +++
  contrib/ivshmem-client/ivshmem-client.h |  239 ++
  contrib/ivshmem-client/main.c   |  237 ++
  contrib/ivshmem-server/ivshmem-server.c |  395 ++
  contrib/ivshmem-server/ivshmem-server.h |  186 ++
  contrib/ivshmem-server/main.c   |  244 +++
  qemu-doc.texi   |   10 +-
  9 files changed, 1724 insertions(+), 3 deletions(-)
  create mode 100644 contrib/ivshmem-client/ivshmem-client.c
  create mode 100644 contrib/ivshmem-client/ivshmem-client.h
  create mode 100644 contrib/ivshmem-client/main.c
  create mode 100644 contrib/ivshmem-server/ivshmem-server.c
  create mode 100644 contrib/ivshmem-server/ivshmem-server.h
  create mode 100644 contrib/ivshmem-server/main.c
 
 diff --git a/Makefile b/Makefile
 index b33aaac..0575898 100644
 --- a/Makefile
 +++ b/Makefile
 @@ -283,6 +283,14 @@ $(qga-obj-y) qemu-ga.o: $(QGALIB_GEN)
  qemu-ga$(EXESUF): $(qga-obj-y) libqemuutil.a libqemustub.a
   $(call LINK, $^)
  
 +IVSHMEM_CLIENT_OBJS=$(addprefix $(SRC_PATH)/contrib/ivshmem-client/, 
 ivshmem-client.o main.o)
 +ivshmem-client$(EXESUF): $(IVSHMEM_CLIENT_OBJS)
 + $(call LINK, $^)
 +
 +IVSHMEM_SERVER_OBJS=$(addprefix $(SRC_PATH)/contrib/ivshmem-server/, 
 ivshmem-server.o main.o)
 +ivshmem-server$(EXESUF): $(IVSHMEM_SERVER_OBJS) libqemuutil.a libqemustub.a
 + $(call LINK, $^)
 +
  clean:
  # avoid old build problems by removing potentially incorrect old files
   rm -f config.mak op-i386.h opc-i386.h gen-op-i386.h op-arm.h opc-arm.h 
 gen-op-arm.h
 diff --git a/configure b/configure
 index 961bf6f..a41a16c 100755
 --- a/configure
 +++ b/configure
 @@ -4125,6 +4125,9 @@ if test $want_tools = yes ; then
if [ $linux = yes -o $bsd = yes -o $solaris = yes ] ; then
  tools=qemu-nbd\$(EXESUF) $tools
fi
 +  if [ $kvm = yes ] ; then
 +tools=ivshmem-client\$(EXESUF) ivshmem-server\$(EXESUF) $tools
 +  fi
  fi
  if test $softmmu = yes ; then
if test $virtfs != no ; then
 diff --git a/contrib/ivshmem-client/ivshmem-client.c 
 b/contrib/ivshmem-client/ivshmem-client.c
 new file mode 100644
 index 000..ad210c8
 --- /dev/null
 +++ b/contrib/ivshmem-client/ivshmem-client.c
 @@ -0,0 +1,405 @@
 +/*
 + * Copyright 6WIND S.A., 2014
 + *
 + * This work is licensed under the terms of the GNU GPL, version 2 or
 + * (at your option) any later version.  See the COPYING file in the
 + * top-level directory.
 + */
 +
 +#include sys/types.h
 +#include sys/socket.h
 +#include sys/un.h
 +
 +#include qemu-common.h
 +#include qemu/queue.h
 +
 +#include ivshmem-client.h
 +
 +/* log a message on stdout if verbose=1 */
 +#define debug_log(client, fmt, ...) do { \
 +if ((client)-verbose) { \
 +printf(fmt, ## __VA_ARGS__); \
 +}\
 +} while (0)
 +
 +/* read message from the unix socket */
 +static int
 +read_one_msg(IvshmemClient *client, long *index, int *fd)
 +{
 +int ret;
 +struct msghdr msg;
 +struct iovec iov[1];
 +union {
 +struct cmsghdr cmsg;
 +char control[CMSG_SPACE(sizeof(int))];
 +} msg_control;
 +struct cmsghdr *cmsg;
 +
 +iov[0].iov_base = index;
 +iov[0].iov_len = sizeof(*index);
 +
 +memset(msg, 0, sizeof(msg));
 +msg.msg_iov = iov;
 +msg.msg_iovlen = 1;
 +msg.msg_control = msg_control;
 +msg.msg_controllen = sizeof(msg_control);
 +
 +ret = recvmsg(client-sock_fd, msg, 0);
 +if (ret  0) {
 +debug_log(client, cannot read message: %s\n, strerror(errno));
 +return -1;
 +}
 +if (ret == 0) {
 +debug_log(client, lost connection to server\n);
 +return -1;
 +}
 +
 +*fd = -1;
 +
 +for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
 +
 +if (cmsg-cmsg_len != CMSG_LEN(sizeof(int)) ||
 +cmsg-cmsg_level != SOL_SOCKET ||
 +cmsg-cmsg_type != SCM_RIGHTS) {
 +continue;
 +}
 +
 +memcpy(fd, CMSG_DATA(cmsg), sizeof(*fd));
 +}
 +
 +return 0;
 +}
 +
 +/* free a peer when the server advertise a disconnection or when the
 + * client is freed */
 +static void
 +free_peer(IvshmemClient *client, IvshmemClientPeer *peer)
 +{
 +unsigned vector;
 +
 +QTAILQ_REMOVE(client-peer_list, peer, next);
 +for (vector = 0; vector  peer-vectors_count; vector++) {
 +

Re: [PATCH v5 1/3] contrib: add ivshmem client and server

2014-09-04 Thread Michael S. Tsirkin
On Thu, Sep 04, 2014 at 02:50:59PM +0200, David Marchand wrote:
 When using ivshmem devices, notifications between guests can be sent as
 interrupts using a ivshmem-server (typical use described in documentation).
 The client is provided as a debug tool.
 
 Signed-off-by: Olivier Matz olivier.m...@6wind.com
 Signed-off-by: David Marchand david.march...@6wind.com
 ---
  Makefile|8 +
  configure   |3 +
  contrib/ivshmem-client/ivshmem-client.c |  405 
 +++
  contrib/ivshmem-client/ivshmem-client.h |  239 ++
  contrib/ivshmem-client/main.c   |  237 ++
  contrib/ivshmem-server/ivshmem-server.c |  395 ++
  contrib/ivshmem-server/ivshmem-server.h |  186 ++
  contrib/ivshmem-server/main.c   |  244 +++
  qemu-doc.texi   |   10 +-
  9 files changed, 1724 insertions(+), 3 deletions(-)
  create mode 100644 contrib/ivshmem-client/ivshmem-client.c
  create mode 100644 contrib/ivshmem-client/ivshmem-client.h
  create mode 100644 contrib/ivshmem-client/main.c
  create mode 100644 contrib/ivshmem-server/ivshmem-server.c
  create mode 100644 contrib/ivshmem-server/ivshmem-server.h
  create mode 100644 contrib/ivshmem-server/main.c
 
 diff --git a/Makefile b/Makefile
 index b33aaac..0575898 100644
 --- a/Makefile
 +++ b/Makefile
 @@ -283,6 +283,14 @@ $(qga-obj-y) qemu-ga.o: $(QGALIB_GEN)
  qemu-ga$(EXESUF): $(qga-obj-y) libqemuutil.a libqemustub.a
   $(call LINK, $^)
  
 +IVSHMEM_CLIENT_OBJS=$(addprefix $(SRC_PATH)/contrib/ivshmem-client/, 
 ivshmem-client.o main.o)
 +ivshmem-client$(EXESUF): $(IVSHMEM_CLIENT_OBJS)
 + $(call LINK, $^)
 +
 +IVSHMEM_SERVER_OBJS=$(addprefix $(SRC_PATH)/contrib/ivshmem-server/, 
 ivshmem-server.o main.o)
 +ivshmem-server$(EXESUF): $(IVSHMEM_SERVER_OBJS) libqemuutil.a libqemustub.a
 + $(call LINK, $^)
 +
  clean:
  # avoid old build problems by removing potentially incorrect old files
   rm -f config.mak op-i386.h opc-i386.h gen-op-i386.h op-arm.h opc-arm.h 
 gen-op-arm.h
 diff --git a/configure b/configure
 index 961bf6f..a41a16c 100755
 --- a/configure
 +++ b/configure
 @@ -4125,6 +4125,9 @@ if test $want_tools = yes ; then
if [ $linux = yes -o $bsd = yes -o $solaris = yes ] ; then
  tools=qemu-nbd\$(EXESUF) $tools
fi
 +  if [ $kvm = yes ] ; then
 +tools=ivshmem-client\$(EXESUF) ivshmem-server\$(EXESUF) $tools
 +  fi
  fi
  if test $softmmu = yes ; then
if test $virtfs != no ; then
 diff --git a/contrib/ivshmem-client/ivshmem-client.c 
 b/contrib/ivshmem-client/ivshmem-client.c
 new file mode 100644
 index 000..ad210c8
 --- /dev/null
 +++ b/contrib/ivshmem-client/ivshmem-client.c
 @@ -0,0 +1,405 @@
 +/*
 + * Copyright 6WIND S.A., 2014
 + *
 + * This work is licensed under the terms of the GNU GPL, version 2 or
 + * (at your option) any later version.  See the COPYING file in the
 + * top-level directory.
 + */
 +
 +#include sys/types.h
 +#include sys/socket.h
 +#include sys/un.h
 +
 +#include qemu-common.h
 +#include qemu/queue.h
 +
 +#include ivshmem-client.h
 +
 +/* log a message on stdout if verbose=1 */
 +#define debug_log(client, fmt, ...) do { \
 +if ((client)-verbose) { \
 +printf(fmt, ## __VA_ARGS__); \
 +}\
 +} while (0)
 +
 +/* read message from the unix socket */
 +static int
 +read_one_msg(IvshmemClient *client, long *index, int *fd)
 +{
 +int ret;
 +struct msghdr msg;
 +struct iovec iov[1];
 +union {
 +struct cmsghdr cmsg;
 +char control[CMSG_SPACE(sizeof(int))];
 +} msg_control;
 +struct cmsghdr *cmsg;
 +
 +iov[0].iov_base = index;
 +iov[0].iov_len = sizeof(*index);
 +
 +memset(msg, 0, sizeof(msg));
 +msg.msg_iov = iov;
 +msg.msg_iovlen = 1;
 +msg.msg_control = msg_control;
 +msg.msg_controllen = sizeof(msg_control);
 +
 +ret = recvmsg(client-sock_fd, msg, 0);
 +if (ret  0) {
 +debug_log(client, cannot read message: %s\n, strerror(errno));
 +return -1;
 +}
 +if (ret == 0) {
 +debug_log(client, lost connection to server\n);
 +return -1;
 +}
 +
 +*fd = -1;
 +
 +for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
 +
 +if (cmsg-cmsg_len != CMSG_LEN(sizeof(int)) ||
 +cmsg-cmsg_level != SOL_SOCKET ||
 +cmsg-cmsg_type != SCM_RIGHTS) {
 +continue;
 +}
 +
 +memcpy(fd, CMSG_DATA(cmsg), sizeof(*fd));
 +}
 +
 +return 0;
 +}
 +
 +/* free a peer when the server advertise a disconnection or when the
 + * client is freed */
 +static void
 +free_peer(IvshmemClient *client, IvshmemClientPeer *peer)
 +{
 +unsigned vector;
 +
 +QTAILQ_REMOVE(client-peer_list, peer, next);
 +for (vector = 0; vector  peer-vectors_count; vector++) {
 +

Re: [PATCH] KVM: x86: fix kvmclock breakage from timers branch merge

2014-09-04 Thread Chris J Arges


On 09/04/2014 07:58 AM, Paolo Bonzini wrote:
 Commit cbcf2dd3b3d4 ("x86: kvm: Make kvm_get_time_and_clockread() nanoseconds
 based", 2014-07-16) forgot to add tk->xtime_sec, thus breaking kvmclock on
 hosts that have a reliable TSC.  Add it back; and since the field boot_ns
 is no longer related to the host boot-based clock, rename boot_ns->nsec_base
 and the existing nsec_base->snsec_base.
 
 Cc: Thomas Gleixner t...@linutronix.de
 Cc: John Stultz john.stu...@linaro.org
 Reported-by: Chris J Arges chris.j.ar...@canonical.com
 Signed-off-by: Paolo Bonzini pbonz...@redhat.com
 ---
  arch/x86/kvm/x86.c | 11 ++-
  1 file changed, 6 insertions(+), 5 deletions(-)
 
 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
 index 8f1e22d3b286..92493e10937c 100644
 --- a/arch/x86/kvm/x86.c
 +++ b/arch/x86/kvm/x86.c
 @@ -1020,8 +1020,8 @@ struct pvclock_gtod_data {
   u32 shift;
   } clock;
  
 - u64 boot_ns;
   u64 nsec_base;
 + u64 snsec_base;
  };
  
  static struct pvclock_gtod_data pvclock_gtod_data;
 @@ -1042,8 +1042,9 @@ static void update_pvclock_gtod(struct timekeeper *tk)
   vdata-clock.mult   = tk-tkr.mult;
   vdata-clock.shift  = tk-tkr.shift;
  
 - vdata-boot_ns  = boot_ns;
 - vdata-nsec_base= tk-tkr.xtime_nsec;
 + vdata-nsec_base= tk-xtime_sec * (u64)NSEC_PER_SEC
 + + boot_ns;
 + vdata-snsec_base   = tk-tkr.xtime_nsec;
  
   write_seqcount_end(vdata-seq);
  }
 @@ -1413,10 +1414,10 @@ static int do_monotonic_boot(s64 *t, cycle_t 
 *cycle_now)
   do {
   seq = read_seqcount_begin(gtod-seq);
   mode = gtod-clock.vclock_mode;
 - ns = gtod-nsec_base;
 + ns = gtod-snsec_base;
   ns += vgettsc(cycle_now);
   ns = gtod-clock.shift;
 - ns += gtod-boot_ns;
 + ns += gtod-nsec_base;
   } while (unlikely(read_seqcount_retry(gtod-seq, seq)));
   *t = ns;
  
 

Paolo,
I've tested with the above patch and I still have issues with the
kvmclock test offset; however the cycle tests pass now.

Here is trace data:
http://people.canonical.com/~arges/kvm/trace-4.dat.xz

Uptime:
 15:58:02 up  1:00,  1 user,  load average: 0.59, 0.60, 0.31

Here is the output:

./x86-run x86/kvmclock_test.flat -smp 2 --append 1000 `date +%s`
qemu-system-x86_64 -enable-kvm -device pc-testdev -device
isa-debug-exit,iobase=0xf4,iosize=0x4 -display none -serial stdio
-device pci-testdev -kernel x86/kvmclock_test.flat -smp 2 --append
1000 1409846210
enabling apic
enabling apic
kvm-clock: cpu 0, msr 0x:44d4c0
kvm-clock: cpu 0, msr 0x:44d4c0
Wallclock test, threshold 5
Seconds get from host: 1409846210
Seconds get from kvmclock: 2819688866
Offset:1409842656
offset too large!
Check the stability of raw cycle ...
Total vcpus: 2
Test  loops: 1000
Total warps:  0
Total stalls: 0
Worst warp:   0
Raw cycle is stable
Monotonic cycle test:
Total vcpus: 2
Test  loops: 1000
Total warps:  0
Total stalls: 0
Worst warp:   0
Measure the performance of raw cycle ...
Total vcpus: 2
Test  loops: 1000
TSC cycles:  1139288710
Measure the performance of adjusted cycle ...
Total vcpus: 2
Test  loops: 1000
TSC cycles:  1138643774
Return value from qemu: 3

My observation is that the kvmclock value seems to be positively biased
by the boot_ns value.

--chris j arges
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] KVM: x86: fix kvmclock breakage from timers branch merge

2014-09-04 Thread Paolo Bonzini
Il 04/09/2014 18:00, Chris J Arges ha scritto:
 Uptime:
  15:58:02 up  1:00,  1 user,  load average: 0.59, 0.60, 0.31
 
 Here is the output:
 
 ./x86-run x86/kvmclock_test.flat -smp 2 --append 1000 `date +%s`
 qemu-system-x86_64 -enable-kvm -device pc-testdev -device
 isa-debug-exit,iobase=0xf4,iosize=0x4 -display none -serial stdio
 -device pci-testdev -kernel x86/kvmclock_test.flat -smp 2 --append
 1000 1409846210
 enabling apic
 enabling apic
 kvm-clock: cpu 0, msr 0x:44d4c0
 kvm-clock: cpu 0, msr 0x:44d4c0
 Wallclock test, threshold 5
 Seconds get from host: 1409846210
 Seconds get from kvmclock: 2819688866
 Offset:1409842656

With kvm/queue this would have been roughly -3600, now it's 
host_wallclock-3600.  So the patch hasn't fixed the -3600 part for you.

Can you try applying this patch on top of 3.16?  This is my backport of
Thomas's patch.  If this works for you, we only have to find out how
to compute boot_ns and nsec_base in the new timekeeping world order of
3.17...

Thomas, do you have any ideas?  Every time a VM is started, the kvmclock
starts at the boot time of the host, instead of the current wallclock time.

Paolo

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d38abc81db65..70de23f1de51 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1020,9 +1020,8 @@ struct pvclock_gtod_data {
u32 shift;
} clock;
 
-   /* open coded 'struct timespec' */
-   u64 monotonic_time_snsec;
-   time_t  monotonic_time_sec;
+   u64 boot_ns;
+   u64 nsec_base;
 };
 
 static struct pvclock_gtod_data pvclock_gtod_data;
@@ -1030,6 +1029,12 @@ static struct pvclock_gtod_data pvclock_gtod_data;
 static void update_pvclock_gtod(struct timekeeper *tk)
 {
struct pvclock_gtod_data *vdata = pvclock_gtod_data;
+   u64 boot_ns;
+
+   boot_ns = timespec_to_ns(tk-total_sleep_time)
+   + tk-wall_to_monotonic.tv_sec * (u64)NSEC_PER_SEC
+   + tk-wall_to_monotonic.tv_nsec
+   + tk-xtime_sec * (u64)NSEC_PER_SEC;
 
write_seqcount_begin(vdata-seq);
 
@@ -1040,17 +1044,8 @@ static void update_pvclock_gtod(struct timekeeper *tk)
vdata-clock.mult   = tk-mult;
vdata-clock.shift  = tk-shift;
 
-   vdata-monotonic_time_sec   = tk-xtime_sec
-   + tk-wall_to_monotonic.tv_sec;
-   vdata-monotonic_time_snsec = tk-xtime_nsec
-   + (tk-wall_to_monotonic.tv_nsec
-tk-shift);
-   while (vdata-monotonic_time_snsec =
-   (((u64)NSEC_PER_SEC)  tk-shift)) {
-   vdata-monotonic_time_snsec -=
-   ((u64)NSEC_PER_SEC)  tk-shift;
-   vdata-monotonic_time_sec++;
-   }
+   vdata-boot_ns  = boot_ns;
+   vdata-nsec_base= tk-xtime_nsec;
 
write_seqcount_end(vdata-seq);
 }
@@ -1414,23 +1409,22 @@ static inline u64 vgettsc(cycle_t *cycle_now)
return v * gtod-clock.mult;
 }
 
-static int do_monotonic(struct timespec *ts, cycle_t *cycle_now)
+static int do_monotonic_boot(s64 *t, cycle_t *cycle_now)
 {
+   struct pvclock_gtod_data *gtod = pvclock_gtod_data;
unsigned long seq;
-   u64 ns;
int mode;
-   struct pvclock_gtod_data *gtod = pvclock_gtod_data;
+   u64 ns;
 
-   ts-tv_nsec = 0;
do {
seq = read_seqcount_begin(gtod-seq);
mode = gtod-clock.vclock_mode;
-   ts-tv_sec = gtod-monotonic_time_sec;
-   ns = gtod-monotonic_time_snsec;
+   ns = gtod-nsec_base;
ns += vgettsc(cycle_now);
ns = gtod-clock.shift;
+   ns += gtod-boot_ns;
} while (unlikely(read_seqcount_retry(gtod-seq, seq)));
-   timespec_add_ns(ts, ns);
+   *t = ns;
 
return mode;
 }
@@ -1438,19 +1432,11 @@ static int do_monotonic(struct timespec *ts, cycle_t 
*cycle_now)
 /* returns true if host is using tsc clocksource */
 static bool kvm_get_time_and_clockread(s64 *kernel_ns, cycle_t *cycle_now)
 {
-   struct timespec ts;
-
/* checked again under seqlock below */
if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC)
return false;
 
-   if (do_monotonic(ts, cycle_now) != VCLOCK_TSC)
-   return false;
-
-   monotonic_to_bootbased(ts);
-   *kernel_ns = timespec_to_ns(ts);
-
-   return true;
+   return do_monotonic_boot(kernel_ns, cycle_now) == VCLOCK_TSC;
 }
 #endif
 


 My observation is that the kvmclock value seems to be positively biased
 by the boot_ns value.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH 3/4] KVM: x86: inject nested page faults on emulated instructions

2014-09-04 Thread Paolo Bonzini
Il 04/09/2014 17:05, Gleb Natapov ha scritto:
  if (ctxt-have_exception) {
  inject_emulated_exception(vcpu);
  -   r = EMULATE_DONE;
  +   return EMULATE_DONE;
 If there was no vmexit we still want to writeback. Perhaps:
 writeback = inject_emulated_exception(vcpu);
 and return false if there was vmexit due to nested page fault (or any fault,
 can't L1 ask for #GP/#UD intercept that need to be handled here too?)
 

Sounds good.

Paolo
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 3/4] KVM: x86: inject nested page faults on emulated instructions

2014-09-04 Thread Paolo Bonzini
Il 04/09/2014 17:05, Gleb Natapov ha scritto:
  
  If you do that, KVM gets down to the if (writeback) and writes the 
  ctxt->eip from L2 into the L1 EIP.
 Heh, that's a bummer. We should not write back if an instruction caused a 
 vmexit.
 

You're right, that works.

Paolo

-- 8 -
Subject: [PATCH] KVM: x86: skip writeback on injection of nested exception

If a nested page fault happens during emulation, we will inject a vmexit,
not a page fault.  However because writeback happens after the injection,
we will write ctxt->eip from L2 into the L1 EIP.  We do not write back
if an instruction caused an interception vmexit---do the same for page
faults.

Signed-off-by: Paolo Bonzini pbonz...@redhat.com
---
 arch/x86/include/asm/kvm_host.h |  1 -
 arch/x86/kvm/x86.c  | 15 ++-
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 08cc299..c989651 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -893,7 +893,6 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct 
x86_exception *fault);
 int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
gfn_t gfn, void *data, int offset, int len,
u32 access);
-void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault);
 bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl);
 
 static inline int __kvm_irq_line_state(unsigned long *irq_state,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e4ed85e..3541946 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -408,12 +408,14 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct 
x86_exception *fault)
 }
 EXPORT_SYMBOL_GPL(kvm_inject_page_fault);
 
-void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
+static bool kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception 
*fault)
 {
if (mmu_is_nested(vcpu)  !fault-nested_page_fault)
vcpu-arch.nested_mmu.inject_page_fault(vcpu, fault);
else
vcpu-arch.mmu.inject_page_fault(vcpu, fault);
+
+   return fault-nested_page_fault;
 }
 
 void kvm_inject_nmi(struct kvm_vcpu *vcpu)
@@ -4929,16 +4931,18 @@ static void toggle_interruptibility(struct kvm_vcpu 
*vcpu, u32 mask)
}
 }
 
-static void inject_emulated_exception(struct kvm_vcpu *vcpu)
+static bool inject_emulated_exception(struct kvm_vcpu *vcpu)
 {
struct x86_emulate_ctxt *ctxt = vcpu-arch.emulate_ctxt;
if (ctxt-exception.vector == PF_VECTOR)
-   kvm_propagate_fault(vcpu, ctxt-exception);
-   else if (ctxt-exception.error_code_valid)
+   return kvm_propagate_fault(vcpu, ctxt-exception);
+
+   if (ctxt-exception.error_code_valid)
kvm_queue_exception_e(vcpu, ctxt-exception.vector,
  ctxt-exception.error_code);
else
kvm_queue_exception(vcpu, ctxt-exception.vector);
+   return false;
 }
 
 static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
@@ -5300,8 +5304,9 @@ restart:
}
 
if (ctxt-have_exception) {
-   inject_emulated_exception(vcpu);
r = EMULATE_DONE;
+   if (inject_emulated_exception(vcpu))
+   return r;
} else if (vcpu-arch.pio.count) {
if (!vcpu-arch.pio.in) {
/* FIXME: return into emulator if single-stepping.  */
-- 
1.9.3


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] KVM: x86: fix kvmclock breakage from timers branch merge

2014-09-04 Thread Paolo Bonzini
Il 04/09/2014 14:58, Paolo Bonzini ha scritto:
 Commit cbcf2dd3b3d4 (x86: kvm: Make kvm_get_time_and_clockread() nanoseconds
 based, 2014-07-16) forgot to add tk->xtime_sec, thus breaking kvmclock on
 hosts that have a reliable TSC.  Add it back; and since the field boot_ns
 is not anymore related to the host boot-based clock, rename boot_ns->nsec_base
 and the existing nsec_base->snsec_base.
 
 Cc: Thomas Gleixner t...@linutronix.de
 Cc: John Stultz john.stu...@linaro.org
 Reported-by: Chris J Arges chris.j.ar...@canonical.com
 Signed-off-by: Paolo Bonzini pbonz...@redhat.com
 ---
  arch/x86/kvm/x86.c | 11 ++-
  1 file changed, 6 insertions(+), 5 deletions(-)
 
 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
 index 8f1e22d3b286..92493e10937c 100644
 --- a/arch/x86/kvm/x86.c
 +++ b/arch/x86/kvm/x86.c
 @@ -1020,8 +1020,8 @@ struct pvclock_gtod_data {
   u32 shift;
   } clock;
  
 - u64 boot_ns;
   u64 nsec_base;
 + u64 snsec_base;
  };
  
  static struct pvclock_gtod_data pvclock_gtod_data;
 @@ -1042,8 +1042,9 @@ static void update_pvclock_gtod(struct timekeeper *tk)
   vdata-clock.mult   = tk-tkr.mult;
   vdata-clock.shift  = tk-tkr.shift;
  
 - vdata-boot_ns  = boot_ns;
 - vdata-nsec_base= tk-tkr.xtime_nsec;
 + vdata-nsec_base= tk-xtime_sec * (u64)NSEC_PER_SEC
 + + boot_ns;
 + vdata-snsec_base   = tk-tkr.xtime_nsec;

Hmm, I found this comment in kernel/time/timekeeping.c

/*
 * The xtime based monotonic readout is:
 *  nsec = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec + now();
 * The ktime based monotonic readout is:
 *  nsec = base_mono + now();

so this patch makes no sense.  The offs_boot part must be broken.

Paolo

  
   write_seqcount_end(vdata-seq);
  }
 @@ -1413,10 +1414,10 @@ static int do_monotonic_boot(s64 *t, cycle_t 
 *cycle_now)
   do {
   seq = read_seqcount_begin(gtod-seq);
   mode = gtod-clock.vclock_mode;
 - ns = gtod-nsec_base;
 + ns = gtod-snsec_base;
   ns += vgettsc(cycle_now);
   ns = gtod-clock.shift;
 - ns += gtod-boot_ns;
 + ns += gtod-nsec_base;
   } while (unlikely(read_seqcount_retry(gtod-seq, seq)));
   *t = ns;
  
 

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] KVM: x86: fix kvmclock breakage from timers branch merge

2014-09-04 Thread Chris J Arges


On 09/04/2014 12:14 PM, Paolo Bonzini wrote:
 Il 04/09/2014 18:00, Chris J Arges ha scritto:
 Uptime:
  15:58:02 up  1:00,  1 user,  load average: 0.59, 0.60, 0.31

 Here is the output:

 ./x86-run x86/kvmclock_test.flat -smp 2 --append 1000 `date +%s`
 qemu-system-x86_64 -enable-kvm -device pc-testdev -device
 isa-debug-exit,iobase=0xf4,iosize=0x4 -display none -serial stdio
 -device pci-testdev -kernel x86/kvmclock_test.flat -smp 2 --append
 1000 1409846210
 enabling apic
 enabling apic
 kvm-clock: cpu 0, msr 0x:44d4c0
 kvm-clock: cpu 0, msr 0x:44d4c0
 Wallclock test, threshold 5
 Seconds get from host: 1409846210
 Seconds get from kvmclock: 2819688866
 Offset:1409842656
 
 With kvm/queue this would have been roughly -3600, now it's 
 host_wallclock-3600.  So the patch hasn't fixed the -3600 part for you.
 
 Can you try applying this patch on top of 3.16?  This is my backport of
 Thomas's patch.  If this works for you, we only have to find out how
 to compute boot_ns and nsec_base in the new timekeeping world order of
 3.17...

Paolo,
The patch below applied to 3.16 still allows the testcase to pass on my
hardware.
--chris

 
 Thomas, do you have any ideas?  Every time a VM is started, the kvmclock
 starts at the boot time of the host, instead of the current wallclock time.
 
 Paolo
 
 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
 index d38abc81db65..70de23f1de51 100644
 --- a/arch/x86/kvm/x86.c
 +++ b/arch/x86/kvm/x86.c
 @@ -1020,9 +1020,8 @@ struct pvclock_gtod_data {
   u32 shift;
   } clock;
  
 - /* open coded 'struct timespec' */
 - u64 monotonic_time_snsec;
 - time_t  monotonic_time_sec;
 + u64 boot_ns;
 + u64 nsec_base;
  };
  
  static struct pvclock_gtod_data pvclock_gtod_data;
 @@ -1030,6 +1029,12 @@ static struct pvclock_gtod_data pvclock_gtod_data;
  static void update_pvclock_gtod(struct timekeeper *tk)
  {
   struct pvclock_gtod_data *vdata = pvclock_gtod_data;
 + u64 boot_ns;
 +
 + boot_ns = timespec_to_ns(tk-total_sleep_time)
 + + tk-wall_to_monotonic.tv_sec * (u64)NSEC_PER_SEC
 + + tk-wall_to_monotonic.tv_nsec
 + + tk-xtime_sec * (u64)NSEC_PER_SEC;
  
   write_seqcount_begin(vdata-seq);
  
 @@ -1040,17 +1044,8 @@ static void update_pvclock_gtod(struct timekeeper *tk)
   vdata-clock.mult   = tk-mult;
   vdata-clock.shift  = tk-shift;
  
 - vdata-monotonic_time_sec   = tk-xtime_sec
 - + tk-wall_to_monotonic.tv_sec;
 - vdata-monotonic_time_snsec = tk-xtime_nsec
 - + (tk-wall_to_monotonic.tv_nsec
 -  tk-shift);
 - while (vdata-monotonic_time_snsec =
 - (((u64)NSEC_PER_SEC)  tk-shift)) {
 - vdata-monotonic_time_snsec -=
 - ((u64)NSEC_PER_SEC)  tk-shift;
 - vdata-monotonic_time_sec++;
 - }
 + vdata-boot_ns  = boot_ns;
 + vdata-nsec_base= tk-xtime_nsec;
  
   write_seqcount_end(vdata-seq);
  }
 @@ -1414,23 +1409,22 @@ static inline u64 vgettsc(cycle_t *cycle_now)
   return v * gtod-clock.mult;
  }
  
 -static int do_monotonic(struct timespec *ts, cycle_t *cycle_now)
 +static int do_monotonic_boot(s64 *t, cycle_t *cycle_now)
  {
 + struct pvclock_gtod_data *gtod = pvclock_gtod_data;
   unsigned long seq;
 - u64 ns;
   int mode;
 - struct pvclock_gtod_data *gtod = pvclock_gtod_data;
 + u64 ns;
  
 - ts-tv_nsec = 0;
   do {
   seq = read_seqcount_begin(gtod-seq);
   mode = gtod-clock.vclock_mode;
 - ts-tv_sec = gtod-monotonic_time_sec;
 - ns = gtod-monotonic_time_snsec;
 + ns = gtod-nsec_base;
   ns += vgettsc(cycle_now);
   ns = gtod-clock.shift;
 + ns += gtod-boot_ns;
   } while (unlikely(read_seqcount_retry(gtod-seq, seq)));
 - timespec_add_ns(ts, ns);
 + *t = ns;
  
   return mode;
  }
 @@ -1438,19 +1432,11 @@ static int do_monotonic(struct timespec *ts, cycle_t 
 *cycle_now)
  /* returns true if host is using tsc clocksource */
  static bool kvm_get_time_and_clockread(s64 *kernel_ns, cycle_t *cycle_now)
  {
 - struct timespec ts;
 -
   /* checked again under seqlock below */
   if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC)
   return false;
  
 - if (do_monotonic(ts, cycle_now) != VCLOCK_TSC)
 - return false;
 -
 - monotonic_to_bootbased(ts);
 - *kernel_ns = timespec_to_ns(ts);
 -
 - return true;
 + return do_monotonic_boot(kernel_ns, cycle_now) == VCLOCK_TSC;
  }
  #endif
  
 
 
 My observation is that the kvmclock value seems to be positively biased
 by the boot_ns value.
 
--
To unsubscribe from 

Re: [PATCH] KVM: x86: fix kvmclock breakage from timers branch merge

2014-09-04 Thread John Stultz
On Thu, Sep 4, 2014 at 9:00 AM, Chris J Arges
chris.j.ar...@canonical.com wrote:


 On 09/04/2014 07:58 AM, Paolo Bonzini wrote:
 Commit cbcf2dd3b3d4 (x86: kvm: Make kvm_get_time_and_clockread() nanoseconds
 based, 2014-07-16) forgot to add tk->xtime_sec, thus breaking kvmclock on
 hosts that have a reliable TSC.  Add it back; and since the field boot_ns
 is not anymore related to the host boot-based clock, rename
 boot_ns->nsec_base
 and the existing nsec_base->snsec_base.

 Cc: Thomas Gleixner t...@linutronix.de
 Cc: John Stultz john.stu...@linaro.org
 Reported-by: Chris J Arges chris.j.ar...@canonical.com
 Signed-off-by: Paolo Bonzini pbonz...@redhat.com
 ---
  arch/x86/kvm/x86.c | 11 ++-
  1 file changed, 6 insertions(+), 5 deletions(-)

 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
 index 8f1e22d3b286..92493e10937c 100644
 --- a/arch/x86/kvm/x86.c
 +++ b/arch/x86/kvm/x86.c
 @@ -1020,8 +1020,8 @@ struct pvclock_gtod_data {
   u32 shift;
   } clock;

 - u64 boot_ns;
   u64 nsec_base;
 + u64 snsec_base;
  };

  static struct pvclock_gtod_data pvclock_gtod_data;
 @@ -1042,8 +1042,9 @@ static void update_pvclock_gtod(struct timekeeper *tk)
   vdata-clock.mult   = tk-tkr.mult;
   vdata-clock.shift  = tk-tkr.shift;

 - vdata-boot_ns  = boot_ns;
 - vdata-nsec_base= tk-tkr.xtime_nsec;
 + vdata-nsec_base= tk-xtime_sec * (u64)NSEC_PER_SEC
 + + boot_ns;
 + vdata-snsec_base   = tk-tkr.xtime_nsec;

   write_seqcount_end(vdata-seq);
  }
 @@ -1413,10 +1414,10 @@ static int do_monotonic_boot(s64 *t, cycle_t 
 *cycle_now)
   do {
   seq = read_seqcount_begin(gtod-seq);
   mode = gtod-clock.vclock_mode;
 - ns = gtod-nsec_base;
 + ns = gtod-snsec_base;
   ns += vgettsc(cycle_now);
   ns = gtod-clock.shift;
 - ns += gtod-boot_ns;
 + ns += gtod-nsec_base;
   } while (unlikely(read_seqcount_retry(gtod-seq, seq)));
   *t = ns;



 Paolo,
 I've tested with the above patch and I still have issues with the
 kvmclock test offset; however the cycle tests pass now.

 Here is trace data:
 http://people.canonical.com/~arges/kvm/trace-4.dat.xz

 Uptime:
  15:58:02 up  1:00,  1 user,  load average: 0.59, 0.60, 0.31

 Here is the output:

 ./x86-run x86/kvmclock_test.flat -smp 2 --append 1000 `date +%s`
 qemu-system-x86_64 -enable-kvm -device pc-testdev -device
 isa-debug-exit,iobase=0xf4,iosize=0x4 -display none -serial stdio
 -device pci-testdev -kernel x86/kvmclock_test.flat -smp 2 --append
 1000 1409846210
 enabling apic
 enabling apic
 kvm-clock: cpu 0, msr 0x:44d4c0
 kvm-clock: cpu 0, msr 0x:44d4c0
 Wallclock test, threshold 5
 Seconds get from host: 1409846210
 Seconds get from kvmclock: 2819688866
 Offset:1409842656
 offset too large!


Hey, thanks for reporting the issue and sending an initial patch (even
if it's not quite all sorted yet).

Is the test you're using here available somewhere? Are there any
special requirements to run it?

thanks
-john
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/3] KVM: remove redundant assigment of return value in kvm_dev_ioctl

2014-09-04 Thread Christian Borntraeger
The first statement of kvm_dev_ioctl is
long r = -EINVAL;

No need to reassign the same value.

Signed-off-by: Christian Borntraeger borntrae...@de.ibm.com
---
 virt/kvm/kvm_main.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 0a824a0..5ea65d2 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2610,9 +2610,8 @@ static long kvm_dev_ioctl(struct file *filp,
long r = -EINVAL;
 
switch (ioctl) {
case KVM_GET_API_VERSION:
-   r = -EINVAL;
if (arg)
goto out;
r = KVM_API_VERSION;
break;
@@ -2622,9 +2621,8 @@ static long kvm_dev_ioctl(struct file *filp,
case KVM_CHECK_EXTENSION:
r = kvm_vm_ioctl_check_extension_generic(NULL, arg);
break;
case KVM_GET_VCPU_MMAP_SIZE:
-   r = -EINVAL;
if (arg)
goto out;
r = PAGE_SIZE; /* struct kvm_run */
 #ifdef CONFIG_X86
-- 
1.8.4.2

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/3] KVM: remove redundant check of in_spin_loop

2014-09-04 Thread Christian Borntraeger
The expression `vcpu->spin_loop.in_spin_loop' is always true,
because it is evaluated only when the condition
`!vcpu->spin_loop.in_spin_loop' is false.

Signed-off-by: Christian Borntraeger borntrae...@de.ibm.com
---
 virt/kvm/kvm_main.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 7176929..0a824a0 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1768,10 +1768,9 @@ static bool kvm_vcpu_eligible_for_directed_yield(struct 
kvm_vcpu *vcpu)
 #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
bool eligible;
 
eligible = !vcpu-spin_loop.in_spin_loop ||
-   (vcpu-spin_loop.in_spin_loop 
-vcpu-spin_loop.dy_eligible);
+   vcpu-spin_loop.dy_eligible;
 
if (vcpu-spin_loop.in_spin_loop)
kvm_vcpu_set_dy_eligible(vcpu, !vcpu-spin_loop.dy_eligible);
 
-- 
1.8.4.2

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 0/3] cleanup of redundant statements

2014-09-04 Thread Christian Borntraeger
Paolo,

I was playing with some static code checkers. Here is some fallout
from the kvm common code. Only minor things that are not real errors,
just redundant statements.

One could argue here and there that these statements make the code easier
to understand. So, please have a look and either drop or apply the patches.

Christian Borntraeger (3):
  KVM: remove redundant check of in_spin_loop
  KVM: remove redundant assigment of return value in kvm_dev_ioctl
  KVM: remove redundant assignments in __kvm_set_memory_region

 virt/kvm/kvm_main.c | 8 +---
 1 file changed, 1 insertion(+), 7 deletions(-)

-- 
1.8.4.2

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 3/3] KVM: remove redundant assignments in __kvm_set_memory_region

2014-09-04 Thread Christian Borntraeger
__kvm_set_memory_region sets r to -EINVAL very early.
Doing it again is not necessary. The same is true later on, where
r is assigned -ENOMEM twice.

Signed-off-by: Christian Borntraeger borntrae...@de.ibm.com
---
 virt/kvm/kvm_main.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 5ea65d2..2d868ad 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -776,9 +776,8 @@ int __kvm_set_memory_region(struct kvm *kvm,
slot = id_to_memslot(kvm-memslots, mem-slot);
base_gfn = mem-guest_phys_addr  PAGE_SHIFT;
npages = mem-memory_size  PAGE_SHIFT;
 
-   r = -EINVAL;
if (npages  KVM_MEM_MAX_NR_PAGES)
goto out;
 
if (!npages)
@@ -790,9 +789,8 @@ int __kvm_set_memory_region(struct kvm *kvm,
new.base_gfn = base_gfn;
new.npages = npages;
new.flags = mem-flags;
 
-   r = -EINVAL;
if (npages) {
if (!old.npages)
change = KVM_MR_CREATE;
else { /* Modify an existing slot. */
@@ -846,9 +844,8 @@ int __kvm_set_memory_region(struct kvm *kvm,
goto out_free;
}
 
if ((change == KVM_MR_DELETE) || (change == KVM_MR_MOVE)) {
-   r = -ENOMEM;
slots = kmemdup(kvm-memslots, sizeof(struct kvm_memslots),
GFP_KERNEL);
if (!slots)
goto out_free;
-- 
1.8.4.2

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] KVM: x86: fix kvmclock breakage from timers branch merge

2014-09-04 Thread Paolo Bonzini
Il 04/09/2014 21:00, John Stultz ha scritto:
 
 Hey, thanks for reporting the issue and sending an initial patch (even
 if its not quite all sorted yet).
 
 Is the test you're using here available somewhere? Are there any
 special requirements to run it?

You need KVM on a machine with clocksource=tsc.  Grab the tests from
git://git.kernel.org/pub/scm/virt/kvm/kvm-unit-tests.git

and run them with

./configure
make
./x86-run x86/kvmclock_test.flat  --append 1000 `date +%s`

Thanks,

Paolo
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] KVM: x86: fix kvmclock breakage from timers branch merge

2014-09-04 Thread Paolo Bonzini
Il 04/09/2014 20:16, Chris J Arges ha scritto:
 +boot_ns = timespec_to_ns(tk-total_sleep_time)
 ++ tk-wall_to_monotonic.tv_sec * (u64)NSEC_PER_SEC
 ++ tk-wall_to_monotonic.tv_nsec
 ++ tk-xtime_sec * (u64)NSEC_PER_SEC;

So this means that the above 3.16-based code is not the same as

boot_ns = ktime_to_ns(ktime_add(tk->tkr.base_mono, tk->offs_boot));

in 3.17.  Everything else in the patch you tested is the same as the
code that is in 3.17, so that's a start.

Paolo
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] KVM: x86: fix kvmclock breakage from timers branch merge

2014-09-04 Thread Paolo Bonzini
Il 04/09/2014 21:15, Paolo Bonzini ha scritto:
 Il 04/09/2014 20:16, Chris J Arges ha scritto:
 +   boot_ns = timespec_to_ns(tk-total_sleep_time)
 +   + tk-wall_to_monotonic.tv_sec * (u64)NSEC_PER_SEC
 +   + tk-wall_to_monotonic.tv_nsec
 +   + tk-xtime_sec * (u64)NSEC_PER_SEC;
 
 So this means that the above 3.16-based code is not the same as
 
 boot_ns = ktime_to_ns(ktime_add(tk-tkr.base_mono, tk-offs_boot));
 
 in 3.17.  Everything else in the patch you tested is the same as the
 code that is in 3.17, so that's a start.
 
 Paolo
 

Based on commit 02cba1598a2a3b689e79ad6dad2532521f638271 we have:

   offs_real - offs_boot = wall_to_monotonic + total_sleep_time

The patch I posted this morning separates tk->xtime_sec out of boot_ns, so
all that is missing should be a change in boot_ns from base_mono + offs_boot
to offs_real - offs_boot.

Chris, can you try this patch on top of the previous one:

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 92493e10937c..811eecc43fe8 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1031,7 +1031,7 @@ static void update_pvclock_gtod(struct timekeeper *tk)
struct pvclock_gtod_data *vdata = pvclock_gtod_data;
u64 boot_ns;
 
-   boot_ns = ktime_to_ns(ktime_add(tk-tkr.base_mono, tk-offs_boot));
+   boot_ns = ktime_to_ns(ktime_sub(tk-offs_real, tk-offs_boot));
 
write_seqcount_begin(vdata-seq);

If it doesn't work, then commit 02cba1598a2a3b689e79ad6dad2532521f638271
is also broken.

Paolo
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] KVM: x86: fix kvmclock breakage from timers branch merge

2014-09-04 Thread Chris J Arges


On 09/04/2014 02:42 PM, Paolo Bonzini wrote:
 Il 04/09/2014 21:15, Paolo Bonzini ha scritto:
 Il 04/09/2014 20:16, Chris J Arges ha scritto:
 +  boot_ns = timespec_to_ns(tk-total_sleep_time)
 +  + tk-wall_to_monotonic.tv_sec * (u64)NSEC_PER_SEC
 +  + tk-wall_to_monotonic.tv_nsec
 +  + tk-xtime_sec * (u64)NSEC_PER_SEC;

 So this means that the above 3.16-based code is not the same as

 boot_ns = ktime_to_ns(ktime_add(tk-tkr.base_mono, tk-offs_boot));

 in 3.17.  Everything else in the patch you tested is the same as the
 code that is in 3.17, so that's a start.

 Paolo

 
 Based on commit 02cba1598a2a3b689e79ad6dad2532521f638271 we have:
 
offs_real - offs_boot = wall_to_monotonic + total_sleep_time
 
 The patch I posted this morning separates tk-xtime_sec out of boot_ns, so
 all that is missing should be a change in boot_ns from base_mono + offs_boot
 to offs_real - offs_boot.
 
 Chris, can you try this patch on top of the previous one:
 
 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
 index 92493e10937c..811eecc43fe8 100644
 --- a/arch/x86/kvm/x86.c
 +++ b/arch/x86/kvm/x86.c
 @@ -1031,7 +1031,7 @@ static void update_pvclock_gtod(struct timekeeper *tk)
   struct pvclock_gtod_data *vdata = pvclock_gtod_data;
   u64 boot_ns;
  
 - boot_ns = ktime_to_ns(ktime_add(tk-tkr.base_mono, tk-offs_boot));
 + boot_ns = ktime_to_ns(ktime_sub(tk-offs_real, tk-offs_boot));
  
   write_seqcount_begin(vdata-seq);
 
 If it doesn't work, then commit 02cba1598a2a3b689e79ad6dad2532521f638271
 is also broken.
 
 Paolo
 

Paolo,
That modification do your additional patch didn't work. However I was
able to modify the code as follows to get this test case working. The
only additional modification was:
+   vdata-nsec_base= tk-xtime_sec * (u64)NSEC_PER_SEC
+   - boot_ns;

--chris j arges

--

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7b25aa2..60c0a9b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1023,8 +1023,8 @@ struct pvclock_gtod_data {
u32 shift;
} clock;

-   u64 boot_ns;
u64 nsec_base;
+   u64 snsec_base;
 };

 static struct pvclock_gtod_data pvclock_gtod_data;
@@ -1034,7 +1034,7 @@ static void update_pvclock_gtod(struct timekeeper *tk)
struct pvclock_gtod_data *vdata = pvclock_gtod_data;
u64 boot_ns;

-   boot_ns = ktime_to_ns(ktime_add(tk-tkr.base_mono, tk-offs_boot));
+   boot_ns = ktime_to_ns(ktime_sub(tk-offs_real, tk-offs_boot));

write_seqcount_begin(vdata-seq);

@@ -1045,8 +1045,9 @@ static void update_pvclock_gtod(struct timekeeper *tk)
vdata-clock.mult   = tk-tkr.mult;
vdata-clock.shift  = tk-tkr.shift;

-   vdata-boot_ns  = boot_ns;
-   vdata-nsec_base= tk-tkr.xtime_nsec;
+   vdata-nsec_base= tk-xtime_sec * (u64)NSEC_PER_SEC
+   - boot_ns;
+   vdata-snsec_base   = tk-tkr.xtime_nsec;

write_seqcount_end(vdata-seq);
 }
@@ -1416,10 +1417,10 @@ static int do_monotonic_boot(s64 *t, cycle_t
*cycle_now)
do {
seq = read_seqcount_begin(gtod-seq);
mode = gtod-clock.vclock_mode;
-   ns = gtod-nsec_base;
+   ns = gtod-snsec_base;
ns += vgettsc(cycle_now);
ns = gtod-clock.shift;
-   ns += gtod-boot_ns;
+   ns += gtod-nsec_base;
} while (unlikely(read_seqcount_retry(gtod-seq, seq)));
*t = ns;
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 0/3] cleanup of redundant statements

2014-09-04 Thread Paolo Bonzini
Il 04/09/2014 21:13, Christian Borntraeger ha scritto:
 Paolo,
 
 I was playing with some static code checkers. Here is some fallout
 from the kvm common code. Only minor things that are not real error,
 just redundant statements.
 
 One could argue here and there that these statement make the code easier
 to understand. So, please have a look and either drop or apply the patches.

I think all the patches are an improvement.  Thanks!

Paolo
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] KVM: x86: fix kvmclock breakage from timers branch merge

2014-09-04 Thread Paolo Bonzini
Il 04/09/2014 22:37, Chris J Arges ha scritto:
  -  boot_ns = ktime_to_ns(ktime_add(tk-tkr.base_mono, tk-offs_boot));
  +  boot_ns = ktime_to_ns(ktime_sub(tk-offs_real, tk-offs_boot));
   
 write_seqcount_begin(vdata-seq);
  
  If it doesn't work, then commit 02cba1598a2a3b689e79ad6dad2532521f638271
  is also broken.
  
  Paolo
  
 Paolo,
 That modification do your additional patch didn't work. However I was
 able to modify the code as follows to get this test case working. The
 only additional modification was:
 + vdata-nsec_base= tk-xtime_sec * (u64)NSEC_PER_SEC
 + - boot_ns;

Right, it should have been

boot_ns = ktime_to_ns(ktime_sub(tk->offs_boot, tk->offs_real));

I'll post the patch shortly.

Thanks!

Paolo
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] KVM: x86: fix kvmclock breakage from timers branch merge

2014-09-04 Thread Chris J Arges


On 09/04/2014 03:40 PM, Paolo Bonzini wrote:
 Il 04/09/2014 22:37, Chris J Arges ha scritto:
 -  boot_ns = ktime_to_ns(ktime_add(tk-tkr.base_mono, tk-offs_boot));
 +  boot_ns = ktime_to_ns(ktime_sub(tk-offs_real, tk-offs_boot));
  
write_seqcount_begin(vdata-seq);

 If it doesn't work, then commit 02cba1598a2a3b689e79ad6dad2532521f638271
 is also broken.

 Paolo

 Paolo,
 That modification do your additional patch didn't work. However I was
 able to modify the code as follows to get this test case working. The
 only additional modification was:
 +vdata-nsec_base= tk-xtime_sec * (u64)NSEC_PER_SEC
 +- boot_ns;
 
 Right, it should have been
 
   boot_ns = ktime_to_ns(ktime_sub(tk-offs_boot, tk-offs_real));
 
 I'll post the patch shortly.
 
 Thanks!
 
 Paolo
 

Paolo,

Great, tested that modification really quick and it also works for me!
All test cases are now passing on my machine; thanks for all the
debugging and help.

--chris
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] KVM: x86: fix kvmclock breakage from timers branch merge

2014-09-04 Thread Thomas Gleixner
On Thu, 4 Sep 2014, Paolo Bonzini wrote:

 Commit cbcf2dd3b3d4 (x86: kvm: Make kvm_get_time_and_clockread() nanoseconds
 based, 2014-07-16) forgot to add tk->xtime_sec, thus breaking kvmclock on

Errm. How is boottime related to xtime_sec?

 hosts that have a reliable TSC.  Add it back; and since the field boot_ns
 is not anymore related to the host boot-based clock, rename boot_ns->nsec_base
 and the existing nsec_base->snsec_base.

This is simply wrong.

The original code before that change did:

   vdata-monotonic_time_sec   = tk-xtime_sec
   + tk-wall_to_monotonic.tv_sec;
   vdata-monotonic_time_snsec = tk-xtime_nsec
   + (tk-wall_to_monotonic.tv_nsec
tk-shift);
So this is the momentary monotonic base time

And the readout function did:

   ts-tv_nsec = 0;
   do {
   seq = read_seqcount_begin(gtod-seq);
   mode = gtod-clock.vclock_mode;
   ts-tv_sec = gtod-monotonic_time_sec;
   ns = gtod-monotonic_time_snsec;
   ns += vgettsc(cycle_now);
   ns = gtod-clock.shift;
} while (unlikely(read_seqcount_retry(gtod-seq, seq)));
   timespec_add_ns(ts, ns);

So this does:

   now = monotonic_base + delta_nsec

And the caller converted it to boot time with:

   monotonic_to_bootbased(ts);

So the time calculation does:

  now = monotonic_base + delta_nsec + mono_to_boot

Because: monotonic_base + mono_to_boot = boot_time_base
 
The calculation can be written as:

  now = boot_time_base + delta_nsec


The new code does

boot_ns = ktime_to_ns(ktime_add(tk-base_mono, tk-offs_boot));

So that's

   boot_time_base = monotonic_base + mono_to_boot;

vdata-boot_ns  = boot_ns;
vdata-nsec_base= tk-tkr.xtime_nsec;

And the readout does:

do {
   seq = read_seqcount_begin(gtod-seq);
   mode = gtod-clock.vclock_mode;
   ns = gtod-nsec_base;
   ns += vgettsc(cycle_now);
   ns = gtod-clock.shift;
   ns += gtod-boot_ns;
   } while (unlikely(read_seqcount_retry(gtod-seq, seq)));
   *t = ns;

Which is:

   boot_time_base + delta_nsec

Now I have no idea why you think it needs to add xtime_sec. If the
result is wrong, then we need to figure out which one of the supplied
values is wrong and not blindly add xtime_sec just because that makes
it magically correct.

Can you please provide a proper background why you think that adding
xtime_sec is a good idea?

Thanks,

tglx


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] KVM: x86: fix kvmclock breakage from timers branch merge

2014-09-04 Thread Paolo Bonzini
Commit cbcf2dd3b3d4 (x86: kvm: Make kvm_get_time_and_clockread() nanoseconds
based, 2014-07-16) used the wrong formula for boot_ns, thus breaking kvmclock on
hosts that have a reliable TSC.

To find the right formula, let's first backport the switch to nanoseconds
to 3.16-era timekeeping logic.  The full patch (which works) is at
https://lkml.org/lkml/2014/9/4/462.  The key line here is

boot_ns = timespec_to_ns(tk-total_sleep_time)
+ timespec_to_ns(tk-wall_to_monotonic)
+ tk-xtime_sec * (u64)NSEC_PER_SEC;

Because the above patch works, the conclusion is that the above formula
is not the same as commit cbcf2dd3b3d4's

boot_ns = ktime_to_ns(ktime_add(tk-tkr.base_mono, tk-offs_boot));

As to what is the right one, commit 02cba1598a2a (timekeeping: Simplify 
getboottime(),
2014-07-16) provides a hint:

   offs_real = -wall_to_monotonic
   offs_boot =  total_sleep_time

   offs_real - offs_boot = -wall_to_monotonic - total_sleep_time

that is

   offs_boot - offs_real =  wall_to_monotonic + total_sleep_time

which is what this patch uses, adding xtime_sec separately.  The boot_ns
moniker is not too clear, so rename boot_ns to nsec_base and the existing
nsec_base to snsec_base.

Cc: Thomas Gleixner t...@linutronix.de
Cc: John Stultz john.stu...@linaro.org
Reported-by: Chris J Arges chris.j.ar...@canonical.com
Signed-off-by: Paolo Bonzini pbonz...@redhat.com
---
Thomas/John, the problem with the above explanation is that
tk_update_ktime_data has base_mono = xtime_sec + wtm, and from
there base_mono + offs_boot = xtime_sec + wtm + total_sleep_time.
Except that doesn't work, so something must be wrong in
tk_update_ktime_data's comment.

 arch/x86/kvm/x86.c | 13 +++--
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 8f1e22d3b286..c55203bea337 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1020,8 +1020,8 @@ struct pvclock_gtod_data {
u32 shift;
} clock;
 
-   u64 boot_ns;
u64 nsec_base;
+   u64 snsec_base;
 };
 
 static struct pvclock_gtod_data pvclock_gtod_data;
@@ -1031,7 +1031,7 @@ static void update_pvclock_gtod(struct timekeeper *tk)
struct pvclock_gtod_data *vdata = pvclock_gtod_data;
u64 boot_ns;
 
-   boot_ns = ktime_to_ns(ktime_add(tk-tkr.base_mono, tk-offs_boot));
+   boot_ns = ktime_to_ns(ktime_sub(tk-tkr.offs_boot, tk-offs_real));
 
write_seqcount_begin(vdata-seq);
 
@@ -1042,8 +1042,9 @@ static void update_pvclock_gtod(struct timekeeper *tk)
vdata-clock.mult   = tk-tkr.mult;
vdata-clock.shift  = tk-tkr.shift;
 
-   vdata-boot_ns  = boot_ns;
-   vdata-nsec_base= tk-tkr.xtime_nsec;
+   vdata-nsec_base= tk-xtime_sec * (u64)NSEC_PER_SEC
+   + boot_ns;
+   vdata-snsec_base   = tk-tkr.xtime_nsec;
 
write_seqcount_end(vdata-seq);
 }
@@ -1413,10 +1414,10 @@ static int do_monotonic_boot(s64 *t, cycle_t *cycle_now)
do {
seq = read_seqcount_begin(gtod-seq);
mode = gtod-clock.vclock_mode;
-   ns = gtod-nsec_base;
+   ns = gtod-snsec_base;
ns += vgettsc(cycle_now);
ns = gtod-clock.shift;
-   ns += gtod-boot_ns;
+   ns += gtod-nsec_base;
} while (unlikely(read_seqcount_retry(gtod-seq, seq)));
*t = ns;
 
-- 
1.8.3.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] KVM: x86: fix kvmclock breakage from timers branch merge

2014-09-04 Thread Paolo Bonzini
Il 04/09/2014 22:58, Thomas Gleixner ha scritto:
 This is simply wrong.

It is.

 Now I have no idea why you think it needs to add xtime_sec. If the
 result is wrong, then we need to figure out which one of the supplied
 values is wrong and not blindly add xtime_sec just because that makes
 it magically correct.
 
 Can you please provide a proper background why you think that adding
 xtime_sec is a good idea?

It's not a good idea indeed.  I didn't fully digest the 3.16->3.17
timekeeping changes and messed up this patch.

However, there is a bug in the base_mono + offs_boot formula, given
that:

- bisection leads to the merge commit of John's timers branch

- bisecting within John's timers branch, with a KVM commit on top to
  make the code much easier to trigger, leads to commit cbcf2dd3b3d4
  (x86: kvm: Make kvm_get_time_and_clockread() nanoseconds based,
  2014-07-16).

- I backported your patch to 3.16, using wall_to_monotonic +
  total_sleep_time + xtime_sec (wtm+xtime_sec as in pre-cbcf2dd3b3d4
  code, total_sleep_time from 3.16 monotonic_to_bootbased) and it works

- In v2 of the patch I fixed the bug by changing the formula
  base_mono + offs_boot to offs_boot - offs_real (and then adding
  xtime_sec separately as in the 3.16 backport), but the two formulas
  base_mono + offs_boot and offs_boot - offs_real + xtime_sec ought
  to be identical.

I find offs_boot - offs_real + xtime more readable than the
alternative base_mono + offs_boot + xtime_nsec, so the fix doubles as
a cleanup for me and I'm fine with it.  But something must be wrong in
the timekeeping code.

Paolo
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] KVM: x86: fix kvmclock breakage from timers branch merge

2014-09-04 Thread Thomas Gleixner
On Thu, 4 Sep 2014, Paolo Bonzini wrote:

 Commit cbcf2dd3b3d4 (x86: kvm: Make kvm_get_time_and_clockread() nanoseconds
 based, 2014-07-16) used the wrong formula for boot_ns, thus breaking kvmclock 
 on
 hosts that have a reliable TSC.
 
 To find the right formula, let's first backport the switch to nanoseconds
 to 3.16-era timekeeping logic.  The full patch (which works) is at
 https://lkml.org/lkml/2014/9/4/462.  The key line here is
 
 boot_ns = timespec_to_ns(&tk->total_sleep_time)
 + timespec_to_ns(&tk->wall_to_monotonic)
 + tk->xtime_sec * (u64)NSEC_PER_SEC;
 
 Because the above patch works, the conclusion is that the above formula
 is not the same as commit cbcf2dd3b3d4's
 
 boot_ns = ktime_to_ns(ktime_add(tk->tkr.base_mono, tk->offs_boot));
 
 As to what is the right one, commit 02cba1598a2a (timekeeping: Simplify 
 getboottime(),
 2014-07-16) provides a hint:
 
offs_real = -wall_to_monotonic
offs_boot =  total_sleep_time
 
offs_real - offs_boot = -wall_to_monotonic - total_sleep_time
 
 that is
 
offs_boot - offs_real =  wall_to_monotonic + total_sleep_time
 
 which is what this patch uses, adding xtime_sec separately.  The boot_ns
 moniker is not too clear, so rename boot_ns to nsec_base and the existing
 nsec_base to snsec_base.
 
 Cc: Thomas Gleixner t...@linutronix.de
 Cc: John Stultz john.stu...@linaro.org
 Reported-by: Chris J Arges chris.j.ar...@canonical.com
 Signed-off-by: Paolo Bonzini pbonz...@redhat.com
 ---
   Thomas/John, the problem with the above explanation is that
   tk_update_ktime_data has base_mono = xtime_sec + wtm, and from
   there base_mono + offs_boot = xtime_sec + wtm + total_sleep_time.
   Except that doesn't work, so something must be wrong in
   tk_update_ktime_data's comment.

Right. I'm staring into it and we need to fix the core code issue and
not the usage site.

Thanks,

tglx
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] KVM: x86: fix kvmclock breakage from timers branch merge

2014-09-04 Thread Thomas Gleixner
On Thu, 4 Sep 2014, Paolo Bonzini wrote:
 Il 04/09/2014 22:58, Thomas Gleixner ha scritto:
  This is simply wrong.
 
 It is.
 
  Now I have no idea why you think it needs to add xtime_sec. If the
  result is wrong, then we need to figure out which one of the supplied
  values is wrong and not blindly add xtime_sec just because that makes
  it magically correct.
  
  Can you please provide a proper background why you think that adding
  xtime_sec is a good idea?
 
 It's not a good idea indeed.  I didn't fully digest the 3.16->3.17
 timekeeping changes and messed up this patch.
 
 However, there is a bug in the base_mono + offs_boot formula, given
 that:
 
 - bisection leads to the merge commit of John's timers branch
 
 - bisecting within John's timers branch, with a KVM commit on top to
   make the code much easier to trigger, leads to commit cbcf2dd3b3d4
   (x86: kvm: Make kvm_get_time_and_clockread() nanoseconds based,
   2014-07-16).
 
 - I backported your patch to 3.16, using wall_to_monotonic +
   total_sleep_time + xtime_sec (wtm+xtime_sec as in pre-cbcf2dd3b3d4
   code, total_sleep_time from 3.16 monotonic_to_bootbased) and it works
 
 - In v2 of the patch I fixed the bug by changing the formula
   base_mono + offs_boot to offs_boot - offs_real (and then adding
   xtime_sec separately as in the 3.16 backport), but the two formulas
   base_mono + offs_boot and offs_boot - offs_real + xtime_sec ought
   to be identical.
 
 I find offs_boot - offs_real + xtime more readable than the
 alternative base_mono + offs_boot + xtime_nsec, so the fix doubles as
 a cleanup for me and I'm fine with it.  But something must be wrong in
 the timekeeping code.

I think I have a vague idea what happened, but I'm way too tired now
to write it up fully. I'll do that tomorrow morning with brain awake.

Thanks,

tglx


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/4] PCI: Export MSI message relevant functions

2014-09-04 Thread Bjorn Helgaas
On Mon, May 19, 2014 at 01:01:07PM +1000, Gavin Shan wrote:
 The patch exports 2 MSI message relevant functions, which will be
 used by VFIO PCI driver. The VFIO PCI driver would be built as
 a module.
 
 Signed-off-by: Gavin Shan gws...@linux.vnet.ibm.com

Acked-by: Bjorn Helgaas bhelg...@google.com

I think Alex will merge this along with the other ones.  Sorry this
took so long.  I don't really like this, but I just can't figure out
any solution that's better.

 ---
  drivers/pci/msi.c | 2 ++
  1 file changed, 2 insertions(+)
 
 diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
 index 955ab79..2350271 100644
 --- a/drivers/pci/msi.c
 +++ b/drivers/pci/msi.c
 @@ -324,6 +324,7 @@ void get_cached_msi_msg(unsigned int irq, struct msi_msg 
 *msg)
  
   __get_cached_msi_msg(entry, msg);
  }
 +EXPORT_SYMBOL_GPL(get_cached_msi_msg);
  
  void __write_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
  {
 @@ -368,6 +369,7 @@ void write_msi_msg(unsigned int irq, struct msi_msg *msg)
  
   __write_msi_msg(entry, msg);
  }
 +EXPORT_SYMBOL_GPL(write_msi_msg);
  
  static void free_msi_irqs(struct pci_dev *dev)
  {
 -- 
 1.8.3.2
 
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Bug 83381] 4-ports 82576 detect 2 ports when add intel_iommu=on pci=assign-busses.

2014-09-04 Thread bugzilla-daemon
https://bugzilla.kernel.org/show_bug.cgi?id=83381

Wanpeng Li wanpeng...@linux.intel.com changed:

   What|Removed |Added

 CC||alex.william...@redhat.com,
   ||wanpeng...@linux.intel.com

--- Comment #7 from Wanpeng Li wanpeng...@linux.intel.com ---
Hi Chao,

-- 
You are receiving this mail because:
You are watching the assignee of the bug.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Bug 83381] 4-ports 82576 detect 2 ports when add intel_iommu=on pci=assign-busses.

2014-09-04 Thread bugzilla-daemon
https://bugzilla.kernel.org/show_bug.cgi?id=83381

Wanpeng Li wanpeng...@linux.intel.com changed:

   What|Removed |Added

 CC||wanpeng...@linux.intel.com

--- Comment #8 from Wanpeng Li wanpeng...@linux.intel.com ---
Cc Alex Williamson alex.william...@redhat.com

-- 
You are receiving this mail because:
You are watching the assignee of the bug.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/4] PCI: Export MSI message relevant functions

2014-09-04 Thread Gavin Shan
On Thu, Sep 04, 2014 at 04:57:36PM -0600, Bjorn Helgaas wrote:
On Mon, May 19, 2014 at 01:01:07PM +1000, Gavin Shan wrote:
 The patch exports 2 MSI message relevant functions, which will be
 used by VFIO PCI driver. The VFIO PCI driver would be built as
 a module.
 
 Signed-off-by: Gavin Shan gws...@linux.vnet.ibm.com

Acked-by: Bjorn Helgaas bhelg...@google.com

I think Alex will merge this along with the other ones.  Sorry this
took so long.  I don't really like this, but I just can't figure out
any solution that's better.


Thanks, Bjorn. I thought you must have forgotten this. Let's get it in first,
and I'll investigate further later to see if I can figure out something
better.

Thanks,
Gavin

 ---
  drivers/pci/msi.c | 2 ++
  1 file changed, 2 insertions(+)
 
 diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
 index 955ab79..2350271 100644
 --- a/drivers/pci/msi.c
 +++ b/drivers/pci/msi.c
 @@ -324,6 +324,7 @@ void get_cached_msi_msg(unsigned int irq, struct msi_msg 
 *msg)
  
  __get_cached_msi_msg(entry, msg);
  }
 +EXPORT_SYMBOL_GPL(get_cached_msi_msg);
  
  void __write_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
  {
 @@ -368,6 +369,7 @@ void write_msi_msg(unsigned int irq, struct msi_msg *msg)
  
  __write_msi_msg(entry, msg);
  }
 +EXPORT_SYMBOL_GPL(write_msi_msg);
  
  static void free_msi_irqs(struct pci_dev *dev)
  {
 -- 
 1.8.3.2
 


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Bug 83381] 4-ports 82576 detect 2 ports when add intel_iommu=on pci=assign-busses.

2014-09-04 Thread bugzilla-daemon
https://bugzilla.kernel.org/show_bug.cgi?id=83381

--- Comment #9 from Alex Williamson alex.william...@redhat.com ---
Is this a regression?

Has it ever worked?

Why is this filed against kvm since it appears to have no relation to qemu or
kvm?

-- 
You are receiving this mail because:
You are watching the assignee of the bug.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 12/15] arm/arm64: KVM: add virtual GICv3 distributor emulation

2014-09-04 Thread wanghaibin
On 2014/8/21 21:06, Andre Przywara wrote:


 +void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg)
 +{
 + struct kvm *kvm = vcpu-kvm;
 + struct kvm_vcpu *c_vcpu;
 + struct vgic_dist *dist = kvm-arch.vgic;
 + u16 target_cpus;
 + u64 mpidr, mpidr_h, mpidr_l;
 + int sgi, mode, c, vcpu_id;
 + int updated = 0;
 +
 + vcpu_id = vcpu-vcpu_id;
 +
 + sgi = (reg  24)  0xf;
 + mode = (reg  40)  0x1;
 + target_cpus = reg  0x;
 + mpidr = ((reg  48)  0xff)  MPIDR_LEVEL_SHIFT(3);
 + mpidr |= ((reg  32)  0xff)  MPIDR_LEVEL_SHIFT(2);
 + mpidr |= ((reg  16)  0xff)  MPIDR_LEVEL_SHIFT(1);
 + mpidr = ~MPIDR_LEVEL_MASK;
 +

 + /*
 +  * We take the dist lock here, because we come from the sysregs
 +  * code path and not from MMIO (where this is already done)
 +  */
 + spin_lock(dist-lock);
 + kvm_for_each_vcpu(c, c_vcpu, kvm) {


Hi Andre, I have a suggestion. Move the

 + if (target_cpus == 0)
 + break;

code out of the kvm_for_each_vcpu loop, like this:


if (!mode && target_cpus == 0)   /* this check does not need to be made inside the kvm_for_each_vcpu loop */
return;

spin_lock(dist-lock);
kvm_for_each_vcpu(c, c_vcpu, kvm) {

 + if (mode  c == vcpu_id)   /* not to myself */
 + continue;
 + if (!mode) {
 + mpidr_h = kvm_vcpu_get_mpidr(c_vcpu);
 + mpidr_l = MPIDR_AFFINITY_LEVEL(mpidr_h, 0);
 + mpidr_h = ~MPIDR_LEVEL_MASK;
 + if (mpidr != mpidr_h)
 + continue;
 + if (!(target_cpus  BIT(mpidr_l)))
 + continue;
 + target_cpus = ~BIT(mpidr_l);
 + }
 + /* Flag the SGI as pending */
 + vgic_dist_irq_set(c_vcpu, sgi);
 + updated = 1;
 + kvm_debug(SGI%d from CPU%d to CPU%d\n, sgi, vcpu_id, c);
 + }
 + if (updated)
 + vgic_update_state(vcpu-kvm);
 + spin_unlock(dist-lock);
 + if (updated)
 + vgic_kick_vcpus(vcpu-kvm);
 +}
 +
 +



--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[question] git clone kvm.git failed

2014-09-04 Thread Zhang Haoyu
Hi, all
I encountered the errors below while cloning kvm.git:

# git clone git://git.kernel.org/pub/scm/virt/kvm/kvm.git kvm_0905
Cloning into 'kvm_0905'...
remote: Counting objects: 3819711, done.
remote: Compressing objects: 100% (575699/575699), done.
remote: Total 3819711 (delta 3219203), reused 3812285 (delta 3211836)
Receiving objects: 100% (3819711/3819711), 804.71 MiB | 122 KiB/s, done.
Resolving deltas: 100% (3219203/3219203), done.
error: unable to create file include/linux/types.h (File too large)
error: unable to create file include/linux/u64_stats_sync.h (File too large)
error: unable to create file include/linux/uaccess.h (File too large)
error: unable to create file include/linux/ucb1400.h (File too large)
error: unable to create file include/linux/ucs2_string.h (File too large)
error: unable to create file include/linux/udp.h (File too large)
error: unable to create file include/linux/uidgid.h (File too large)
error: unable to create file include/linux/uinput.h (File too large)
error: unable to create file include/linux/uio.h (File too large)
error: unable to create file include/linux/uio_driver.h (File too large)
fatal: cannot create directory at 'include/linux/unaligned': File too large

How can I resolve these errors?

Thanks,
Zhang Haoyu

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Patch - support e500-specific: Performance monitor

2014-09-04 Thread Amit Tomar

Is there any specific reason not to copy the extra handler for IVOR35 on e500?


--- a/arch/powerpc/kvm/e500.c
+++ b/arch/powerpc/kvm/e500.c
@@ -527,7 +527,7 @@ static struct kvmppc_ops kvm_ops_e500 = {
 static int __init kvmppc_e500_init(void)
 {
int r, i;
-   unsigned long ivor[3];
+  unsigned long ivor[4];
/* Process remaining handlers above the generic first 16 */
unsigned long *handler = kvmppc_booke_handler_addr[16];
unsigned long handler_len;
@@ -545,7 +545,8 @@ static int __init kvmppc_e500_init(void)
ivor[0] = mfspr(SPRN_IVOR32);
ivor[1] = mfspr(SPRN_IVOR33);
ivor[2] = mfspr(SPRN_IVOR34);
-   for (i = 0; i  3; i++) {
+ivor[3] = mfspr(SPRN_IVOR35);
+   for (i = 0; i  4; i++) {
if (ivor[i]  ivor[max_ivor])
max_ivor = i;


Please ignore this patch if you find it irrelevant or if it has already been submitted.

Thanks,
Amit Tomar.
--
To unsubscribe from this list: send the line unsubscribe kvm-ppc in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: Patch - support e500-specific: Performance monitor

2014-09-04 Thread bharat.bhus...@freescale.com


 -Original Message-
 From: kvm-ppc-ow...@vger.kernel.org [mailto:kvm-ppc-ow...@vger.kernel.org] On
 Behalf Of Amit Tomar
 Sent: Thursday, September 04, 2014 8:34 PM
 To: ag...@suse.de; kvm-ppc@vger.kernel.org; Caraman Mihai Claudiu-B02008;
 pbonz...@redhat.com
 Subject: Patch - support e500-specific: Performance monitor
 
 Is There specific any reason not to copy extra handler IOVR 35 for e500?

Because we do not support the Performance Monitor for guests.
Why do you want to add this? Is there a specific requirement?

Thanks
-Bharat

 
 
 --- a/arch/powerpc/kvm/e500.c
 +++ b/arch/powerpc/kvm/e500.c
 @@ -527,7 +527,7 @@ static struct kvmppc_ops kvm_ops_e500 = {
   static int __init kvmppc_e500_init(void)
   {
  int r, i;
 -   unsigned long ivor[3];
 +  unsigned long ivor[4];
  /* Process remaining handlers above the generic first 16 */
  unsigned long *handler = kvmppc_booke_handler_addr[16];
  unsigned long handler_len;
 @@ -545,7 +545,8 @@ static int __init kvmppc_e500_init(void)
  ivor[0] = mfspr(SPRN_IVOR32);
  ivor[1] = mfspr(SPRN_IVOR33);
  ivor[2] = mfspr(SPRN_IVOR34);
 -   for (i = 0; i  3; i++) {
 +ivor[3] = mfspr(SPRN_IVOR35);
 +   for (i = 0; i  4; i++) {
  if (ivor[i]  ivor[max_ivor])
  max_ivor = i;
 
 
 Please ignore this patch if find it irrelevant or been submitted.
 
 Thanks,
 Amit Tomar.
 --
 To unsubscribe from this list: send the line unsubscribe kvm-ppc in the body
 of a message to majord...@vger.kernel.org More majordomo info at
 http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line unsubscribe kvm-ppc in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html