[kvm-devel] [PATCH 3/3] propagate errors from ioport registering up to pci level

2008-04-16 Thread Glauber de Oliveira Costa
From: Glauber Costa <[EMAIL PROTECTED]>

In situations like pci-passthrough, the ioport registering can
fail, because another device is already present and in charge of
an io address. The current state would crash qemu, but we can propagate
the errors up to the pci layer, avoiding the crash.

Signed-off-by: Glauber Costa <[EMAIL PROTECTED]>
---
 qemu/hw/pci-passthrough.c |   28 
 qemu/hw/pci.c |   30 --
 qemu/hw/pci.h |2 +-
 3 files changed, 45 insertions(+), 15 deletions(-)

diff --git a/qemu/hw/pci-passthrough.c b/qemu/hw/pci-passthrough.c
index 7ffcc7b..3912447 100644
--- a/qemu/hw/pci-passthrough.c
+++ b/qemu/hw/pci-passthrough.c
@@ -127,7 +127,7 @@ pt_ioport_read(b)
 pt_ioport_read(w)
 pt_ioport_read(l)
 
-static void pt_iomem_map(PCIDevice * d, int region_num,
+static int pt_iomem_map(PCIDevice * d, int region_num,
 uint32_t e_phys, uint32_t e_size, int type)
 {
pt_dev_t *r_dev = (pt_dev_t *) d;
@@ -141,6 +141,7 @@ static void pt_iomem_map(PCIDevice * d, int region_num,
cpu_register_physical_memory(e_phys,
 r_dev->dev.io_regions[region_num].size,
 r_dev->v_addrs[region_num].memory_index);
+   return 0;
 }
 
 
@@ -148,7 +149,8 @@ static void pt_ioport_map(PCIDevice * pci_dev, int 
region_num,
  uint32_t addr, uint32_t size, int type)
 {
pt_dev_t *r_dev = (pt_dev_t *) pci_dev;
-   int i;
+   int i, err;
+
uint32_t ((*rf[])(void *, uint32_t)) =  { pt_ioport_readb,
  pt_ioport_readw,
  pt_ioport_readl
@@ -163,10 +165,14 @@ static void pt_ioport_map(PCIDevice * pci_dev, int 
region_num,
  "region_num=%d \n", addr, type, size, region_num);
 
for (i = 0; i < 3; i++) {
-   register_ioport_write(addr, size, 1dev.unregister = pt_pci_unregister;
}
 
if (kvm_enabled() && !qemu_kvm_irqchip_in_kernel())
diff --git a/qemu/hw/pci.c b/qemu/hw/pci.c
index 7e4ce2d..5265b81 100644
--- a/qemu/hw/pci.c
+++ b/qemu/hw/pci.c
@@ -48,7 +48,7 @@ struct PCIBus {
 int irq_count[];
 };
 
-static void pci_update_mappings(PCIDevice *d);
+static int pci_update_mappings(PCIDevice *d);
 static void pci_set_irq(void *opaque, int irq_num, int level);
 void pci_pt_update_irq(PCIDevice *d);
 
@@ -133,13 +133,14 @@ void pci_device_save(PCIDevice *s, QEMUFile *f)
 int pci_device_load(PCIDevice *s, QEMUFile *f)
 {
 uint32_t version_id;
-int i;
+int i, err;
 
 version_id = qemu_get_be32(f);
 if (version_id > 2)
 return -EINVAL;
 qemu_get_buffer(f, s->config, 256);
-pci_update_mappings(s);
+if ((err = pci_update_mappings(s)) < 0)
+   return err;
 
 if (version_id >= 2)
 for (i = 0; i < 4; i ++)
@@ -192,7 +193,7 @@ static target_phys_addr_t 
pci_to_cpu_addr(target_phys_addr_t addr)
 return addr + pci_mem_base;
 }
 
-static void pci_unregister_io_regions(PCIDevice *pci_dev)
+void pci_unregister_io_regions(PCIDevice *pci_dev)
 {
 PCIIORegion *r;
 int i;
@@ -256,11 +257,22 @@ void pci_register_io_region(PCIDevice *pci_dev, int 
region_num,
 *(uint32_t *)(pci_dev->config + addr) = cpu_to_le32(type);
 }
 
+static int map_pci_region(PCIDevice *d, int i, PCIIORegion *r)
+{
+   int err = 0;
 
-static void pci_update_mappings(PCIDevice *d)
+   if ((err = r->map_func(d, i, r->addr, r->size, r->type)) < 0) {
+   fprintf(stderr, "Could not map pci device %s\n", d->name);
+   pci_unregister_device(d);
+   }
+   r->status = PCI_STATUS_REGISTERED;
+   return err;
+}
+
+static int pci_update_mappings(PCIDevice *d)
 {
 PCIIORegion *r;
-int cmd, i;
+int cmd, i, err;
 uint32_t last_addr, new_addr, config_ofs;
 
 cmd = le16_to_cpu(*(uint16_t *)(d->config + PCI_COMMAND));
@@ -328,10 +340,8 @@ static void pci_update_mappings(PCIDevice *d)
 }
 }
  

[kvm-devel] [PATCH 1/3] don't exit on errors while registering ioports

2008-04-16 Thread Glauber de Oliveira Costa
From: Glauber Costa <[EMAIL PROTECTED]>

Currently, any error in register_ioports makes qemu
abort through hw_error(). But there are situations
in which those errors are not fatal. Just return
< 0 instead.

Signed-off-by: Glauber Costa <[EMAIL PROTECTED]>
---
 qemu/vl.c |   12 +++-
 1 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/qemu/vl.c b/qemu/vl.c
index 35a0465..d7e07e2 100644
--- a/qemu/vl.c
+++ b/qemu/vl.c
@@ -351,13 +351,13 @@ int register_ioport_read(int start, int length, int size,
 } else if (size == 4) {
 bsize = 2;
 } else {
-hw_error("register_ioport_read: invalid size");
+fprintf(stderr, "register_ioport_read: invalid size\n");
 return -1;
 }
 for(i = start; i < start + length; i += size) {
 ioport_read_table[bsize][i] = func;
 if (ioport_opaque[i] != NULL && ioport_opaque[i] != opaque)
-hw_error("register_ioport_read: invalid opaque");
+fprintf(stderr, "register_ioport_read: invalid opaque\n");
 ioport_opaque[i] = opaque;
 }
 return 0;
@@ -376,13 +376,15 @@ int register_ioport_write(int start, int length, int size,
 } else if (size == 4) {
 bsize = 2;
 } else {
-hw_error("register_ioport_write: invalid size");
+fprintf(stderr, "register_ioport_write: invalid size\n");
 return -1;
 }
 for(i = start; i < start + length; i += size) {
 ioport_write_table[bsize][i] = func;
-if (ioport_opaque[i] != NULL && ioport_opaque[i] != opaque)
-hw_error("register_ioport_write: invalid opaque");
+if (ioport_opaque[i] != NULL && ioport_opaque[i] != opaque) {
+fprintf(stderr, "register_ioport_write: invalid opaque\n");
+return -1;
+}
 ioport_opaque[i] = opaque;
 }
 return 0;
-- 
1.5.5


-
This SF.net email is sponsored by the 2008 JavaOne(SM) Conference 
Don't miss this year's exciting event. There's still time to save $100. 
Use priority code J8TL2D2. 
http://ad.doubleclick.net/clk;198757673;13503038;p?http://java.sun.com/javaone
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


[kvm-devel] [PATCH 2/3] map regions as registered

2008-04-16 Thread Glauber de Oliveira Costa
From: Glauber Costa <[EMAIL PROTECTED]>

record which io regions were already successfully registered

Signed-off-by: Glauber Costa <[EMAIL PROTECTED]>
---
 qemu/hw/pci.c |5 +++--
 qemu/hw/pci.h |3 +++
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/qemu/hw/pci.c b/qemu/hw/pci.c
index 1937408..7e4ce2d 100644
--- a/qemu/hw/pci.c
+++ b/qemu/hw/pci.c
@@ -199,7 +199,7 @@ static void pci_unregister_io_regions(PCIDevice *pci_dev)
 
 for(i = 0; i < PCI_NUM_REGIONS; i++) {
 r = &pci_dev->io_regions[i];
-if (!r->size)
+if ((!r->size) || (r->status != PCI_STATUS_REGISTERED))
 continue;
 if (r->type == PCI_ADDRESS_SPACE_IO) {
 isa_unassign_ioport(r->addr, r->size);
@@ -321,7 +321,7 @@ static void pci_update_mappings(PCIDevice *d)
 } else {
 isa_unassign_ioport(r->addr, r->size);
 }
-} else {
+} else if (r->status == PCI_STATUS_REGISTERED) {
 cpu_register_physical_memory(pci_to_cpu_addr(r->addr),
  r->size,
  IO_MEM_UNASSIGNED);
@@ -330,6 +330,7 @@ static void pci_update_mappings(PCIDevice *d)
 r->addr = new_addr;
 if (r->addr != -1) {
 r->map_func(d, i, r->addr, r->size, r->type);
+r->status = PCI_STATUS_REGISTERED;
 }
 }
 }
diff --git a/qemu/hw/pci.h b/qemu/hw/pci.h
index e11fbbf..6350ad2 100644
--- a/qemu/hw/pci.h
+++ b/qemu/hw/pci.h
@@ -27,9 +27,12 @@ typedef struct PCIIORegion {
 uint32_t addr; /* current PCI mapping address. -1 means not mapped */
 uint32_t size;
 uint8_t type;
+uint8_t status;
 PCIMapIORegionFunc *map_func;
 } PCIIORegion;
 
+#define PCI_STATUS_REGISTERED  1
+
 #define PCI_ROM_SLOT 6
 #define PCI_NUM_REGIONS 7
 
-- 
1.5.5


-
This SF.net email is sponsored by the 2008 JavaOne(SM) Conference 
Don't miss this year's exciting event. There's still time to save $100. 
Use priority code J8TL2D2. 
http://ad.doubleclick.net/clk;198757673;13503038;p?http://java.sun.com/javaone
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


[kvm-devel] [PATCH 0/3] Qemu crashes with pci passthrough

2008-04-16 Thread Glauber de Oliveira Costa
Hi, 

I've got some qemu crashes while trying to passthrough an ide device
to a kvm guest. After some investigation, it turned out that 
register_ioport_{read/write} will abort on errors instead of returning
a meaningful error.

However, even if we do return an error, the asynchronous nature of pci
config space mapping updates makes it a little bit hard to treat.

This series of patches basically treats errors in the mapping functions in
the pci layer. If anything goes wrong, we unregister the pci device, unmapping
any mappings that already happened to be successful.

After these patches are applied, a lot of warnings appear. And, you know,
every time there is a warning, god kills a kitten. But I'm not planning on
touching the other pieces of qemu code for this until we settle (or not) on
this solution.

Comments are very welcome, especially from qemu folks (since it is a bit 
invasive)




-
This SF.net email is sponsored by the 2008 JavaOne(SM) Conference 
Don't miss this year's exciting event. There's still time to save $100. 
Use priority code J8TL2D2. 
http://ad.doubleclick.net/clk;198757673;13503038;p?http://java.sun.com/javaone
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [PATCH] kvm: move kvmclock initialization inside kvm_guest_init

2008-04-16 Thread Glauber de Oliveira Costa
Glauber Costa wrote:
> It makes no sense for the clock initialization to be
> hanging around in setup_32.c when we have a generic kvm guest
> initialization function available. So, we move kvmclock_init()
> inside such a function, leading to a cleaner code.
>
> Signed-off-by: Glauber Costa <[EMAIL PROTECTED]>
> ---
>  arch/x86/kernel/kvm.c  |2 ++
>  arch/x86/kernel/setup_32.c |4 
>  include/linux/kvm_para.h   |5 +
>  3 files changed, 7 insertions(+), 4 deletions(-)
>
> diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
> index d9121f9..5cad368 100644
> --- a/arch/x86/kernel/kvm.c
> +++ b/arch/x86/kernel/kvm.c
> @@ -210,6 +210,8 @@ static void paravirt_ops_setup(void)
>   pv_info.name = "KVM";
>   pv_info.paravirt_enabled = 1;
>  
> + kvmclock_init();
> +
>   if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY))
>   pv_cpu_ops.io_delay = kvm_io_delay;
>  
> diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
> index 65f3a23..029350c 100644
> --- a/arch/x86/kernel/setup_32.c
> +++ b/arch/x86/kernel/setup_32.c
> @@ -771,10 +771,6 @@ void __init setup_arch(char **cmdline_p)
>  
>   max_low_pfn = setup_memory();
>  
> -#ifdef CONFIG_KVM_CLOCK
> - kvmclock_init();
> -#endif
> -
>  #ifdef CONFIG_VMI
>   /*
>* Must be after max_low_pfn is determined, and before kernel
> diff --git a/include/linux/kvm_para.h b/include/linux/kvm_para.h
> index 3ddce03..c5e662c 100644
> --- a/include/linux/kvm_para.h
> +++ b/include/linux/kvm_para.h
> @@ -28,6 +28,11 @@ void __init kvm_guest_init(void);
>  #else
>  #define kvm_guest_init() do { } while (0)
>  #endif
> +#ifdef CONFIG_KVM_CLOCK
> +void kvmclock_init(void);
> +#else
> +#define kvmclock_init() do { } while (0)
> +#endif
>  
>  static inline int kvm_para_has_feature(unsigned int feature)
>  {
>   

Forget about it. Marcelo just screamed at me (and somehow I heard it) 
that this creates a bogus dependency between the clock and the mmu functions. 
Duh.

I'll resend a better version

-
This SF.net email is sponsored by the 2008 JavaOne(SM) Conference 
Don't miss this year's exciting event. There's still time to save $100. 
Use priority code J8TL2D2. 
http://ad.doubleclick.net/clk;198757673;13503038;p?http://java.sun.com/javaone
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


[kvm-devel] [PATCH] call write_guest_time as soon as we register the paravirt clock.

2008-02-21 Thread Glauber de Oliveira Costa
From: Glauber Costa <[EMAIL PROTECTED]>

In situations like cpu hotplugging, a cpu can arrive
late to the game and register its paravirt clock
while everything else is already running, which will lead to
breakage, since the time readings will return bogus values.

To prevent this, we write system time as soon as the guest
registers its clock

Signed-off-by: Glauber Costa <[EMAIL PROTECTED]>
---
 arch/x86/kvm/x86.c |2 ++
 1 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 5dfc21f..03c0c6d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -595,6 +595,8 @@ int kvm_set_msr_common(struct kvm_vcpu *
 
if (is_error_page(vcpu->arch.time_page))
vcpu->arch.time_page = NULL;
+
+   kvm_write_guest_time(vcpu);
break;
}
default:
-- 
1.4.2


-
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse012070mrt/direct/01/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


[kvm-devel] [PATCH 2/2] [PATCH] kvmclock implementation, the guest part.

2008-02-15 Thread Glauber de Oliveira Costa
This is the guest part of kvm clock implementation
It does not do tsc-only timing, as tsc can have deltas
between cpus, and it did not seem worthy to me to keep
adjusting them.

We do use it, however, for fine-grained adjustment.

Other than that, time comes from the host.

Signed-off-by: Glauber de Oliveira Costa <[EMAIL PROTECTED]>
---
 arch/x86/Kconfig   |   10 +++
 arch/x86/kernel/Makefile   |1 
 arch/x86/kernel/kvmclock.c |  161 
 arch/x86/kernel/setup_32.c |5 +
 arch/x86/kernel/setup_64.c |5 +
 5 files changed, 182 insertions(+), 0 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 3be2305..cc2bc37 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -372,6 +372,16 @@ config VMI
  at the moment), by linking the kernel to a GPL-ed ROM module
  provided by the hypervisor.
 
+config KVM_CLOCK
+   bool "KVM paravirtualized clock"
+   select PARAVIRT
+   help
+ Turning on this option will allow you to run a paravirtualized clock
+ when running over the KVM hypervisor. Instead of relying on a PIT
+ (or probably other) emulation by the underlying device model, the host
+ provides the guest with timing infrastructure such as time of day, and
+ system time
+
 source "arch/x86/lguest/Kconfig"
 
 config PARAVIRT
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 76ec0f8..5b91b82 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -69,6 +69,7 @@ obj-$(CONFIG_DEBUG_RODATA_TEST)   += test_
 obj-$(CONFIG_DEBUG_NX_TEST)+= test_nx.o
 
 obj-$(CONFIG_VMI)  += vmi_32.o vmiclock_32.o
+obj-$(CONFIG_KVM_CLOCK)+= kvmclock.o
 obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o
 
 ifdef CONFIG_INPUT_PCSPKR
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
new file mode 100644
index 000..b8da3bf
--- /dev/null
+++ b/arch/x86/kernel/kvmclock.c
@@ -0,0 +1,161 @@
+/*  KVM paravirtual clock driver. A clocksource implementation
+Copyright (C) 2008 Glauber de Oliveira Costa, Red Hat Inc.
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+*/
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define KVM_SCALE 22
+
+static int kvmclock = 1;
+
+static int parse_no_kvmclock(char *arg)
+{
+   kvmclock = 0;
+   return 0;
+}
+early_param("no-kvmclock", parse_no_kvmclock);
+
+struct shared_info shared_info __attribute__((__aligned__(PAGE_SIZE)));
+
+/* The hypervisor will put information about time periodically here */
+static struct kvm_vcpu_time_info hv_clock[NR_CPUS];
+#define get_clock(cpu, field) hv_clock[cpu].field
+
+static inline u64 kvm_get_delta(u64 last_tsc)
+{
+   int cpu = smp_processor_id();
+   u64 delta = native_read_tsc() - last_tsc;
+   return (delta * get_clock(cpu, tsc_to_system_mul)) >> KVM_SCALE;
+}
+
+static struct kvm_wall_clock wall_clock;
+static cycle_t kvm_clock_read(void);
+/*
+ * The wallclock is the time of day when we booted. Since then, some time may
+ * have elapsed since the hypervisor wrote the data. So we try to account for
+ * that with system time
+ */
+unsigned long kvm_get_wallclock(void)
+{
+   u32 wc_sec, wc_nsec;
+   u64 delta;
+   struct timespec ts;
+   int version, nsec;
+   int low, high;
+
+   low = (int)__pa(&wall_clock);
+   high = ((u64)__pa(&wall_clock) >> 32);
+
+   delta = kvm_clock_read();
+
+   native_write_msr(MSR_KVM_WALL_CLOCK, low, high);
+   do {
+   version = wall_clock.wc_version;
+   rmb();
+   wc_sec = wall_clock.wc_sec;
+   wc_nsec = wall_clock.wc_nsec;
+   rmb();
+   } while ((wall_clock.wc_version != version) || (version & 1));
+
+   delta = kvm_clock_read() - delta;
+   delta += wc_nsec;
+   nsec = do_div(delta, NSEC_PER_SEC);
+   set_normalized_timespec(&ts, wc_sec + delta, nsec);
+   /*
+* Of all mechanisms of time adjustment I've tested, this one
+* was the champion!
+*/
+   return ts.tv_sec + 1;
+}
+
+int kvm_set_wallclock(unsigned long now)
+{
+   return 0;
+}
+
+/*
+ * This

[kvm-devel] [PATCH 0/2 -v(many)] kvmclock

2008-02-15 Thread Glauber de Oliveira Costa
I think this version addresses avi's last comments.
I'm not resending userspace since it is unchanged




-
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse012070mrt/direct/01/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


[kvm-devel] [PATCH 1/2] [PATCH] kvmclock - the host part.

2008-02-15 Thread Glauber de Oliveira Costa
This is the host part of the kvm clocksource implementation. As it does
not include clockevents, it is a fairly simple implementation. We
only have to register a per-vcpu area, and start writing to it periodically.

The area is binary compatible with xen, as we use the same shadow_info 
structure.

Signed-off-by: Glauber de Oliveira Costa <[EMAIL PROTECTED]>
---
 arch/x86/kvm/x86.c |   96 
 include/asm-x86/kvm_host.h |7 +++
 include/asm-x86/kvm_para.h |   25 +++
 include/linux/kvm.h|1 
 4 files changed, 128 insertions(+), 1 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0c910c7..5dfc21f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -19,6 +19,7 @@ #include "segment_descriptor.h"
 #include "irq.h"
 #include "mmu.h"
 
+#include 
 #include 
 #include 
 #include 
@@ -424,7 +425,7 @@ static u32 msrs_to_save[] = {
 #ifdef CONFIG_X86_64
MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
 #endif
-   MSR_IA32_TIME_STAMP_COUNTER,
+   MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
 };
 
 static unsigned num_msrs_to_save;
@@ -482,6 +483,70 @@ static int do_set_msr(struct kvm_vcpu *v
return kvm_set_msr(vcpu, index, *data);
 }
 
+static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
+{
+   static int version;
+   struct kvm_wall_clock wc;
+   struct timespec wc_ts;
+
+   if (!wall_clock)
+   return
+
+   mutex_lock(&kvm->lock);
+
+   version++;
+   kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
+
+   wc_ts = current_kernel_time();
+   wc.wc_sec = wc_ts.tv_sec;
+   wc.wc_nsec = wc_ts.tv_nsec;
+   wc.wc_version = version;
+   kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
+
+   version++;
+   kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
+
+   mutex_unlock(&kvm->lock);
+}
+
+static void kvm_write_guest_time(struct kvm_vcpu *v)
+{
+   struct timespec ts;
+   unsigned long flags;
+   struct kvm_vcpu_arch *vcpu = &v->arch;
+   void *shared_kaddr;
+
+   if ((!vcpu->time_page))
+   return;
+
+   /* Keep irq disabled to prevent changes to the clock */
+   local_irq_save(flags);
+   kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER,
+ &vcpu->hv_clock.tsc_timestamp);
+   ktime_get_ts(&ts);
+   local_irq_restore(flags);
+
+   /* With all the info we got, fill in the values */
+
+   vcpu->hv_clock.system_time = ts.tv_nsec +
+(NSEC_PER_SEC * (u64)ts.tv_sec);
+   /*
+* The interface expects us to write an even number signaling that the
+* update is finished. Since the guest won't see the intermediate
+* state, we just write "2" at the end
+*/
+   vcpu->hv_clock.version = 2;
+
+   shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0);
+
+   memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
+   sizeof(vcpu->hv_clock));
+
+   kunmap_atomic(shared_kaddr, KM_USER0);
+
+   mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
+}
+
 
 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 {
@@ -511,6 +576,27 @@ int kvm_set_msr_common(struct kvm_vcpu *
case MSR_IA32_MISC_ENABLE:
vcpu->arch.ia32_misc_enable_msr = data;
break;
+   case MSR_KVM_WALL_CLOCK:
+   vcpu->kvm->arch.wall_clock = data;
+   kvm_write_wall_clock(vcpu->kvm, data);
+   break;
+   case MSR_KVM_SYSTEM_TIME: {
+   vcpu->arch.time = data & PAGE_MASK;
+   vcpu->arch.time_offset = data & ~PAGE_MASK;
+
+   vcpu->arch.hv_clock.tsc_to_system_mul =
+   clocksource_khz2mult(tsc_khz, 22);
+   vcpu->arch.hv_clock.tsc_shift = 22;
+
+   down_read(¤t->mm->mmap_sem);
+   vcpu->arch.time_page =
+   gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
+   up_read(¤t->mm->mmap_sem);
+
+   if (is_error_page(vcpu->arch.time_page))
+   vcpu->arch.time_page = NULL;
+   break;
+   }
default:
pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data);
return 1;
@@ -569,6 +655,12 @@ int kvm_get_msr_common(struct kvm_vcpu *
case MSR_EFER:
data = vcpu->arch.shadow_efer;
break;
+   case MSR_KVM_WALL_CLOCK:
+   data = vcpu->kvm->arch.wall_clock;
+   break;
+   case MSR_KVM_SYSTEM_TIME:
+   data = vcpu->arch.time;
+   

[kvm-devel] [PATCH 2/2] fill cpuid with clock information

2008-02-11 Thread Glauber de Oliveira Costa
we advertise the presence of our paravirt clock
through cpuid, if it is defined in headers

Signed-off-by: Glauber de Oliveira Costa <[EMAIL PROTECTED]>
---
 qemu/qemu-kvm-x86.c |6 +-
 1 files changed, 5 insertions(+), 1 deletions(-)

diff --git a/qemu/qemu-kvm-x86.c b/qemu/qemu-kvm-x86.c
index 4a52d4d..4daf6cc 100644
--- a/qemu/qemu-kvm-x86.c
+++ b/qemu/qemu-kvm-x86.c
@@ -513,6 +513,10 @@ #endif
 int cpuid_nent = 0;
 CPUState copy;
 uint32_t i, limit;
+int has_clocksource = 0;
+#ifdef KVM_CAP_CLOCKSOURCE
+has_clocksource = kvm_check_extension(kvm_context, KVM_CAP_CLOCKSOURCE);
+#endif
 
 copy = *cenv;
 
@@ -530,7 +534,7 @@ #ifdef KVM_CPUID_SIGNATURE
 pv_ent = &cpuid_ent[cpuid_nent++];
 memset(pv_ent, 0, sizeof(*pv_ent));
 pv_ent->function = KVM_CPUID_FEATURES;
-pv_ent->eax = 0;
+pv_ent->eax = (has_clocksource << KVM_FEATURE_CLOCKSOURCE);
 #endif
 
 copy.regs[R_EAX] = 0;
-- 
1.4.2


-
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse012070mrt/direct/01/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


[kvm-devel] [PATCH 1/2] add kvm_para.h header to qemu-kvm-x86.c

2008-02-11 Thread Glauber de Oliveira Costa
With this, functions such as cpuid can make
use of paravirt definitions

Signed-off-by: Glauber de Oliveira Costa <[EMAIL PROTECTED]>
---
 qemu/qemu-kvm-x86.c |1 +
 1 files changed, 1 insertions(+), 0 deletions(-)

diff --git a/qemu/qemu-kvm-x86.c b/qemu/qemu-kvm-x86.c
index 037abb1..4a52d4d 100644
--- a/qemu/qemu-kvm-x86.c
+++ b/qemu/qemu-kvm-x86.c
@@ -9,6 +9,7 @@ #include "qemu-kvm.h"
 #include 
 #include 
 #include 
+#include 
 
 #define MSR_IA32_TSC   0x10
 
-- 
1.4.2


-
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse012070mrt/direct/01/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


[kvm-devel] [PATCH 0/2] kvm clock : userspace part

2008-02-11 Thread Glauber de Oliveira Costa


-
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse012070mrt/direct/01/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


[kvm-devel] [PATCH 2/2] kvmclock implementation, the guest part.

2008-02-11 Thread Glauber de Oliveira Costa
This is the guest part of kvm clock implementation
It does not do tsc-only timing, as tsc can have deltas
between cpus, and it did not seem worthy to me to keep
adjusting them.

We do use it, however, for fine-grained adjustment.

Other than that, time comes from the host.

Signed-off-by: Glauber de Oliveira Costa <[EMAIL PROTECTED]>
---
 arch/x86/Kconfig   |   10 +++
 arch/x86/kernel/Makefile   |1 
 arch/x86/kernel/kvmclock.c |  161 
 arch/x86/kernel/setup_32.c |5 +
 arch/x86/kernel/setup_64.c |5 +
 5 files changed, 182 insertions(+), 0 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 65b4491..a4b33b1 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -349,6 +349,16 @@ config VMI
  at the moment), by linking the kernel to a GPL-ed ROM module
  provided by the hypervisor.
 
+config KVM_CLOCK
+   bool "KVM paravirtualized clock"
+   select PARAVIRT
+   help
+ Turning on this option will allow you to run a paravirtualized clock
+ when running over the KVM hypervisor. Instead of relying on a PIT
+ (or probably other) emulation by the underlying device model, the host
+ provides the guest with timing infrastructure such as time of day, and
+ system time
+
 source "arch/x86/lguest/Kconfig"
 
 config PARAVIRT
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 6f81300..4b10872 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -68,6 +68,7 @@ obj-$(CONFIG_DEBUG_RODATA_TEST)   += test_
 obj-$(CONFIG_DEBUG_NX_TEST)+= test_nx.o
 
 obj-$(CONFIG_VMI)  += vmi_32.o vmiclock_32.o
+obj-$(CONFIG_KVM_CLOCK)+= kvmclock.o
 obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o
 
 ifdef CONFIG_INPUT_PCSPKR
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
new file mode 100644
index 000..809d8a5
--- /dev/null
+++ b/arch/x86/kernel/kvmclock.c
@@ -0,0 +1,161 @@
+/*  KVM paravirtual clock driver. A clocksource implementation
+Copyright (C) 2008 Glauber de Oliveira Costa, Red Hat Inc.
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+*/
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define KVM_SCALE 22
+
+static int kvmclock = 1;
+
+static int parse_no_kvmclock(char *arg)
+{
+   kvmclock = 0;
+   return 0;
+}
+early_param("no-kvmclock", parse_no_kvmclock);
+
+struct shared_info shared_info __attribute__((__aligned__(PAGE_SIZE)));
+
+/* The hypervisor will put information about time periodically here */
+static struct kvm_vcpu_time_info hv_clock[NR_CPUS];
+#define get_clock(cpu, field) hv_clock[cpu].field
+
+static inline u64 kvm_get_delta(u64 last_tsc)
+{
+   int cpu = smp_processor_id();
+   u64 delta = native_read_tsc() - last_tsc;
+   return (delta * get_clock(cpu, tsc_to_system_mul)) >> KVM_SCALE;
+}
+
+static struct kvm_wall_clock wall_clock;
+/*
+ * The wallclock is the time of day when we booted. Since then, some time may
+ * have elapsed since the hypervisor wrote the data. So we try to account for
+ * that. Even if the tsc is not accurate, it gives us a more accurate timing
+ * than not adjusting at all
+ */
+unsigned long kvm_get_wallclock(void)
+{
+   u32 wc_sec, wc_nsec;
+   u64 delta, last_tsc;
+   struct timespec ts;
+   int version, nsec, cpu = smp_processor_id();
+   int low,high;
+
+   low = (int)__pa(&wall_clock);
+   high = ((u64)__pa(&wall_clock) >> 32);
+
+   native_write_msr(MSR_KVM_WALL_CLOCK, low, high);
+   do {
+   version = wall_clock.wc_version;
+   rmb();
+   wc_sec = wall_clock.wc_sec;
+   wc_nsec = wall_clock.wc_nsec;
+   last_tsc = get_clock(cpu, tsc_timestamp);
+   rmb();
+   } while ((wall_clock.wc_version != version) || (version & 1));
+
+   delta = kvm_get_delta(last_tsc);
+   delta += wc_nsec;
+   nsec = do_div(delta, NSEC_PER_SEC);
+   set_normalized_timespec(&ts, wc_sec + delta, nsec);
+   /*
+* Of all mechanisms of time adjustment I've tested, this one
+* was the champion!
+*/
+ 

[kvm-devel] [PATCH 1/2] kvmclock - the host part.

2008-02-11 Thread Glauber de Oliveira Costa
This is the host part of the kvm clocksource implementation. As it does
not include clockevents, it is a fairly simple implementation. We
only have to register a per-vcpu area, and start writing to it periodically.

The area is binary compatible with xen, as we use the same shadow_info 
structure.

Signed-off-by: Glauber de Oliveira Costa <[EMAIL PROTECTED]>
---
 arch/x86/kvm/x86.c |   95 
 include/asm-x86/kvm_host.h |7 +++
 include/asm-x86/kvm_para.h |   24 +++
 include/linux/kvm.h|1 
 4 files changed, 126 insertions(+), 1 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0987191..03adc9b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -19,6 +19,7 @@ #include "segment_descriptor.h"
 #include "irq.h"
 #include "mmu.h"
 
+#include 
 #include 
 #include 
 #include 
@@ -420,7 +421,7 @@ static u32 msrs_to_save[] = {
 #ifdef CONFIG_X86_64
MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
 #endif
-   MSR_IA32_TIME_STAMP_COUNTER,
+   MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
 };
 
 static unsigned num_msrs_to_save;
@@ -478,6 +479,71 @@ static int do_set_msr(struct kvm_vcpu *v
return kvm_set_msr(vcpu, index, *data);
 }
 
+static void kvm_write_wall_clock(struct kvm_vcpu *v, gpa_t wall_clock)
+{
+   int version = 1;
+   struct kvm_wall_clock wc;
+   unsigned long flags;
+   struct timespec wc_ts;
+
+   local_irq_save(flags);
+   kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER,
+ &v->arch.hv_clock.tsc_timestamp);
+   wc_ts = current_kernel_time();
+   local_irq_restore(flags);
+
+   down_write(¤t->mm->mmap_sem);
+   kvm_write_guest(v->kvm, wall_clock, &version, sizeof(version));
+   up_write(¤t->mm->mmap_sem);
+
+   /* With all the info we got, fill in the values */
+   wc.wc_sec = wc_ts.tv_sec;
+   wc.wc_nsec = wc_ts.tv_nsec;
+   wc.wc_version = ++version;
+
+   down_write(¤t->mm->mmap_sem);
+   kvm_write_guest(v->kvm, wall_clock, &wc, sizeof(wc));
+   up_write(¤t->mm->mmap_sem);
+}
+
+static void kvm_write_guest_time(struct kvm_vcpu *v)
+{
+   struct timespec ts;
+   unsigned long flags;
+   struct kvm_vcpu_arch *vcpu = &v->arch;
+   void *shared_kaddr;
+
+   if ((!vcpu->time_page))
+   return;
+
+   /* Keep irq disabled to prevent changes to the clock */
+   local_irq_save(flags);
+   kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER,
+ &vcpu->hv_clock.tsc_timestamp);
+   ktime_get_ts(&ts);
+   local_irq_restore(flags);
+
+   /* With all the info we got, fill in the values */
+
+   vcpu->hv_clock.system_time = ts.tv_nsec +
+(NSEC_PER_SEC * (u64)ts.tv_sec);
+   /*
+* The interface expects us to write an even number signaling that the
+* update is finished. Since the guest won't see the intermediate 
states,
+* we just write "2" at the end
+*/
+   vcpu->hv_clock.version = 2;
+
+   shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0);
+
+   memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
+   sizeof(vcpu->hv_clock));
+
+   kunmap_atomic(shared_kaddr, KM_USER0);
+
+   mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
+}
+
 
 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 {
@@ -503,6 +569,25 @@ int kvm_set_msr_common(struct kvm_vcpu *
case MSR_IA32_MISC_ENABLE:
vcpu->arch.ia32_misc_enable_msr = data;
break;
+   case MSR_KVM_WALL_CLOCK:
+   vcpu->kvm->arch.wall_clock = data;
+   kvm_write_wall_clock(vcpu, data);
+   break;
+   case MSR_KVM_SYSTEM_TIME: {
+   vcpu->arch.time = data & PAGE_MASK;
+   vcpu->arch.time_offset = data & ~PAGE_MASK;
+
+   vcpu->arch.hv_clock.tsc_to_system_mul =
+   clocksource_khz2mult(tsc_khz, 22);
+   vcpu->arch.hv_clock.tsc_shift = 22;
+
+   down_write(&current->mm->mmap_sem);
+   vcpu->arch.time_page = gfn_to_page(vcpu->kvm, data >> 
PAGE_SHIFT);
+   up_write(&current->mm->mmap_sem);
+   if (is_error_page(vcpu->arch.time_page))
+   vcpu->arch.time_page = NULL;
+   break;
+   }
default:
pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data);
return 1;
@@ -560,6 +645,12 @@ int kvm_get_msr_common(struct kvm_vcpu *
case MSR_EFER:
data = vcpu->arch.shadow_efer;
bre

[kvm-devel] [PATCH 0/2] kvm clock - merge last comments

2008-02-11 Thread Glauber de Oliveira Costa
Here's a new version that merges last comments from avi.
Also, it makes it available to x86_64 as well.
Userspace part will follow shortly



-
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse012070mrt/direct/01/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


[kvm-devel] Fwd: [PATCH] boot a linux kernel from non-ide device

2008-02-08 Thread Glauber de Oliveira Costa
Reposting to kvm-devel, since aliguori notices that I'm relying on
non-upstream features of qemu

-- Forwarded message --
From: Glauber de Oliveira Costa <[EMAIL PROTECTED]>
Date: Feb 8, 2008 5:05 AM
Subject: [PATCH] boot a linux kernel from non-ide device
To: [EMAIL PROTECTED]


Since it's now possible to use the -drive option, the test for something
in the index 0 of the IDE bus is too restrictive.

A better idea, IMHO, is to check if the user specified any bootable device,
and only if not, fallback to the default, compatible behaviour of checking
hda regardless of the presence of a boot=on arg.

--
Glauber de Oliveira Costa.
"Free as in Freedom"
http://glommer.net

"The less confident you are, the more serious you have to act."



-- 
Glauber de Oliveira Costa.
"Free as in Freedom"
http://glommer.net

"The less confident you are, the more serious you have to act."
commit 65b8fcf3bc5b754b720a04fa7efed6fdcd472a6a
Author: Glauber de Oliveira Costa <[EMAIL PROTECTED]>
Date:   Fri Feb 8 05:02:47 2008 -0200

[PATCH] boot a linux kernel from non-ide device

Since it's now possible to use the -drive option, the test for something
in the index 0 of the IDE bus is too restrictive.

A better idea, IMHO, is to check if the user specified any bootable device,
and only if not, fallback to the default, compatible behaviour of checking
hda regardless of the presence of a boot=on arg.

diff --git a/qemu/hw/pc.c b/qemu/hw/pc.c
index 5ce28ab..a9b0f71 100644
--- a/qemu/hw/pc.c
+++ b/qemu/hw/pc.c
@@ -398,11 +398,14 @@ static void generate_bootsect(uint32_t gpr[8], uint16_t segs[6], uint16_t ip)
 {
 uint8_t bootsect[512], *p;
 int i;
-int hda;
-
-hda = drive_get_index(IF_IDE, 0, 0);
-if (hda == -1) {
-	fprintf(stderr, "A disk image must be given for 'hda' when booting "
+int hda = -1, boot_device;
+
+if (extboot_drive != -1)
+boot_device = extboot_drive;
+else if ((hda = drive_get_index(IF_IDE, 0, 0)) != -1) 
+	boot_device = hda;
+else {
+	fprintf(stderr, "A bootable disk image must be given when booting "
 		"a Linux kernel\n");
 	exit(1);
 }
@@ -410,7 +413,7 @@ static void generate_bootsect(uint32_t gpr[8], uint16_t segs[6], uint16_t ip)
 memset(bootsect, 0, sizeof(bootsect));
 
 /* Copy the MSDOS partition table if possible */
-bdrv_read(drives_table[hda].bdrv, 0, bootsect, 1);
+bdrv_read(drives_table[boot_device].bdrv, 0, bootsect, 1);
 
 /* Make sure we have a partition signature */
 bootsect[510] = 0x55;
@@ -447,7 +450,7 @@ static void generate_bootsect(uint32_t gpr[8], uint16_t segs[6], uint16_t ip)
 *p++ = segs[1];		/* CS */
 *p++ = segs[1] >> 8;
 
-bdrv_set_boot_sector(drives_table[hda].bdrv, bootsect, sizeof(bootsect));
+bdrv_set_boot_sector(drives_table[boot_device].bdrv, bootsect, sizeof(bootsect));
 }
 
 static int load_kernel(const char *filename, uint8_t *addr,
-
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse012070mrt/direct/01/___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [PATCH 2/2] kvmclock implementation, the guest part.

2008-01-17 Thread Glauber de Oliveira Costa
Gerd Hoffmann wrote:
>> +struct shared_info shared_info __attribute__((__aligned__(PAGE_SIZE)));
> 
> leftover from old version?
> 
>> +unsigned long kvm_get_wallclock(void)
>> +{
>> +u32 wc_sec, wc_nsec;
>> +u64 delta, last_tsc;
>> +struct timespec ts;
>> +int version, nsec, cpu = smp_processor_id();
>> +
>> +native_write_msr(MSR_KVM_WALL_CLOCK, __pa(&wall_clock));
> 
> Huh?  Shouldn't that be done once at boot time?
> 
> cheers,
>   Gerd
> 
> 
It can, but I don't think so. The closer to read we do, the more 
precision we get on unstable tsc systems.


-
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse012070mrt/direct/01/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [PATCH 1/2] kvmclock - the host part.

2008-01-16 Thread Glauber de Oliveira Costa
Anthony Liguori wrote:
> Glauber de Oliveira Costa wrote:
>> This is the host part of kvm clocksource implementation. As it does
>> not include clockevents, it is a fairly simple implementation. We
>> only have to register a per-vcpu area, and start writing to it 
>> periodically.
>>
>> The area is binary compatible with xen, as we use the same shadow_info 
>> structure.
>>
>> Signed-off-by: Glauber de Oliveira Costa <[EMAIL PROTECTED]>
>> ---
>>  arch/x86/kvm/x86.c |   98 
>> +++-
>>  include/asm-x86/kvm_host.h |6 +++
>>  include/asm-x86/kvm_para.h |   24 +++
>>  include/linux/kvm.h|1 +
>>  4 files changed, 128 insertions(+), 1 deletions(-)
>>
>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>> index 8a90403..fd69aa1 100644
>> --- a/arch/x86/kvm/x86.c
>> +++ b/arch/x86/kvm/x86.c
>> @@ -19,6 +19,7 @@
>>  #include "irq.h"
>>  #include "mmu.h"
>>  
>> +#include 
>>  #include 
>>  #include 
>>  #include 
>> @@ -412,7 +413,7 @@ static u32 msrs_to_save[] = {
>>  #ifdef CONFIG_X86_64
>>  MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
>>  #endif
>> -MSR_IA32_TIME_STAMP_COUNTER,
>> +MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME,
>>  };
>>  
>>  static unsigned num_msrs_to_save;
>> @@ -467,6 +468,73 @@ static int do_set_msr(struct kvm_vcpu *vcpu, 
>> unsigned index, u64 *data)
>>  return kvm_set_msr(vcpu, index, *data);
>>  }
>>  
>> +static void kvm_write_wall_clock(struct kvm_vcpu *v, gpa_t wall_clock)
>> +{
>> +int version = 1;
>> +struct wall_clock wc;
>> +unsigned long flags;
>> +struct timespec wc_ts;
>> +
>> +local_irq_save(flags);
>> +kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER,
>> +  &v->arch.hv_clock.tsc_timestamp);
>> +wc_ts = current_kernel_time();
>> +local_irq_restore(flags);
>> +
>> +down_write(&current->mm->mmap_sem);
>> +kvm_write_guest(v->kvm, wall_clock, &version, sizeof(version));
>> +up_write(&current->mm->mmap_sem);
>> +
>> +/* With all the info we got, fill in the values */
>> +wc.wc_sec = wc_ts.tv_sec;
>> +wc.wc_nsec = wc_ts.tv_nsec;
>> +wc.wc_version = ++version;
>> +
>> +down_write(&current->mm->mmap_sem);
>> +kvm_write_guest(v->kvm, wall_clock, &wc, sizeof(wc));
>> +up_write(&current->mm->mmap_sem);
>>   
> 
> Can we get a comment explaining why we only write the version field and 
> then immediately increment the version and write the whole struct?  It's 
> not at all obvious why the first write is needed to me.
If the comment is the only pending thing, can we add the comment in a 
later commit?

>> +}
>> +static void kvm_write_guest_time(struct kvm_vcpu *v)
>> +{
>> +struct timespec ts;
>> +unsigned long flags;
>> +struct kvm_vcpu_arch *vcpu = &v->arch;
>> +void *shared_kaddr;
>> +
>> +if ((!vcpu->time_page))
>> +return;
>> +
>> +/* Keep irq disabled to prevent changes to the clock */
>> +local_irq_save(flags);
>> +kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER,
>> +  &vcpu->hv_clock.tsc_timestamp);
>> +ktime_get_ts(&ts);
>> +local_irq_restore(flags);
>> +
>> +/* With all the info we got, fill in the values */
>> +
>> +vcpu->hv_clock.system_time = ts.tv_nsec +
>> + (NSEC_PER_SEC * (u64)ts.tv_sec);
>> +/*
>> + * The interface expects us to write an even number signaling 
>> that the
>> + * update is finished. Since the guest won't see the intermediate 
>> states,
>> + * we just write "2" at the end
>> + */
>> +vcpu->hv_clock.version = 2;
>> +
>> +preempt_disable();
>> +
>> +shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0);
>> +
>> +memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
>> +sizeof(vcpu->hv_clock));
>> +
>> +kunmap_atomic(shared_kaddr, KM_USER0);
>>   
> 
> Instead of doing a kmap/memcpy, I think it would be better to store the 
> GPA of the time page and do a kvm_write_guest().  Otherwise, you're 
> pinning this page in memory.
this functions end up being called from various contexts. Some with the 
mmap_sem held, some uncontended. kvm_write_guest needs it held, so it 
would turn the code into a big spaguetti. Using the kmap was avi's 
suggestion to get around it, which I personally liked: we only grab the 
semaphore when the msr is registered.


-
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse012070mrt/direct/01/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


[kvm-devel] RFC: qemu acpi hotplug

2008-01-16 Thread Glauber de Oliveira Costa
When it's more close to inclusion, I'd also post it to main qemu list. 
But right now, I'm just aiming at a first round around this draft.


The attached patch is enough to make the notifications DEVICE_CHECK and 
EJECT reach the kernel. As far as I understand, some userspace black 
magic that keeps changing its scroll is needed to really put the 
processors logically off/on after the notify (acpi code itself will 
never call cpu_up/down)


Just let me know what you think.
>From c45432c0cec8241dbcd6ed6cf38c953b17a6f826 Mon Sep 17 00:00:00 2001
From: Glauber de Oliveira Costa <[EMAIL PROTECTED]>
Date: Wed, 16 Jan 2008 18:43:11 -0200
Subject: [PATCH] RFC: qemu cpu hotplug

Signed-off-by: Glauber de Oliveira Costa <[EMAIL PROTECTED]>
---
 bios/acpi-dsdt.dsl|   87 +-
 bios/rombios32.c  |2 +
 qemu/hw/acpi.c|  125 +
 qemu/hw/pc.c  |4 +-
 qemu/monitor.c|9 
 qemu/pc-bios/bios.bin |  Bin
 6 files changed, 214 insertions(+), 13 deletions(-)

diff --git a/bios/acpi-dsdt.dsl b/bios/acpi-dsdt.dsl
index df255ce..497b866 100755
--- a/bios/acpi-dsdt.dsl
+++ b/bios/acpi-dsdt.dsl
@@ -27,18 +27,35 @@ DefinitionBlock (
 {
 Scope (_PR)
 {
-Processor (CPU0, 0x00, 0xb010, 0x06) {}
-Processor (CPU1, 0x01, 0xb010, 0x06) {}
-Processor (CPU2, 0x02, 0xb010, 0x06) {}
-Processor (CPU3, 0x03, 0xb010, 0x06) {}
-Processor (CPU4, 0x04, 0xb010, 0x06) {}
-Processor (CPU5, 0x05, 0xb010, 0x06) {}
-Processor (CPU6, 0x06, 0xb010, 0x06) {}
-Processor (CPU7, 0x07, 0xb010, 0x06) {}
-Processor (CPU8, 0x08, 0xb010, 0x06) {}
-Processor (CPU9, 0x09, 0xb010, 0x06) {}
-Processor (CPUA, 0x0a, 0xb010, 0x06) {}
-Processor (CPUB, 0x0b, 0xb010, 0x06) {}
+	OperationRegion( PRO, SystemIO, 0xaf00, 0x02)
+	Field (PRO, ByteAcc, NoLock, WriteAsZeros)
+	{
+		PR0U, 1,
+		PR1U, 1,
+		PR2U, 1,
+		PR3U, 1,
+		PR4U, 1,
+		PADU, 3,
+
+		PR0D, 1,
+		PR1D, 1,
+		PR2D, 1,
+		PR3D, 1,
+		PR4D, 1,
+		PADD, 3,
+	}
+Processor (CPU0, 0x00, 0xb010, 0x06) { Method (_STA) { Return(0x1)} }
+Processor (CPU1, 0x01, 0xb010, 0x06) { Method (_STA) { Return(0x1)} }
+Processor (CPU2, 0x02, 0xb010, 0x06) { Method (_STA) { Return(0x1)} }
+Processor (CPU3, 0x03, 0xb010, 0x06) { Method (_STA) { Return(0x1)} }
+Processor (CPU4, 0x04, 0xb010, 0x06) { Method (_STA) { Return(0x1)} } 
+Processor (CPU5, 0x05, 0xb010, 0x06) { Method (_STA) { Return(0x1)} }
+Processor (CPU6, 0x06, 0xb010, 0x06) { Method (_STA) { Return(0x1)} }
+Processor (CPU7, 0x07, 0xb010, 0x06) { Method (_STA) { Return(0x1)} }
+Processor (CPU8, 0x08, 0xb010, 0x06) { Method (_STA) { Return(0x1)} }
+Processor (CPU9, 0x09, 0xb010, 0x06) { Method (_STA) { Return(0x1)} }
+Processor (CPUA, 0x0a, 0xb010, 0x06) { Method (_STA) { Return(0x1)} }
+Processor (CPUB, 0x0b, 0xb010, 0x06) { Method (_STA) { Return(0x1)} }
 Processor (CPUC, 0x0c, 0xb010, 0x06) {}
 Processor (CPUD, 0x0d, 0xb010, 0x06) {}
 Processor (CPUE, 0x0e, 0xb010, 0x06) {}
@@ -559,6 +576,51 @@ DefinitionBlock (
 }
 }
 }
+Scope(\_GPE)
+{
+   Method(_L00) {  
+	  Return(0x01)
+   }
+   Method(_L01) { 
+ If (\_PR.PR1U) {
+	  Notify(\_PR.CPU1, 1)
+	 }
+	 If (\_PR.PR1D){
+	  Notify(\_PR.CPU1, 3) 
+	 }
+	 Return(0x01)
+   }
+
+   Method(_L02) { 
+ If (\_PR.PR2U) {
+	  Notify(\_PR.CPU2, 1)
+	 }
+	 If (\_PR.PR2D){
+	  Notify(\_PR.CPU2, 3) 
+	 }
+	 Return(0x01)
+   }
+
+   Method(_L03) { 
+ If (\_PR.PR3U) {
+	  Notify(\_PR.CPU3, 1)
+	 }
+	 If (\_PR.PR3D){
+	  Notify(\_PR.CPU3, 3) 
+	 }
+	 Return(0x01)
+   }
+
+   Method(_L04) { 
+ If (\_PR.PR4U) {
+	  Notify(\_PR.CPU4, 1)
+	 }
+	 IF (\_PR.PR4D) {
+	  Notify(\_PR.CPU4, 3) 
+	 }
+	 Return(0x01)
+   }
+}
 
 /* S5 = power off state */
 Name (_S5, Package (4) {
@@ -567,4 +629,5 @@ DefinitionBlock (
 0x00, // reserved
 0x00, // reserved
 })
+
 }
diff --git a/bios/rombios32.c b/bios/rombios32.c
index 967c119..4580462 100755
--- a/bios/rombios32.c
+++ b/bios/rombios32.c
@@ -1329,6 +1329,8 @@ void acpi_bios_init(void)
 fadt->pm_tmr_len = 4;
 fadt->plvl2_lat = cpu_to_le16(0x0fff); // C2 state not supported
 fadt->plvl3_lat = cpu_to_le16(0x0fff); // C3 state not supported
+fadt->gpe0_blk = cpu_to_le32(0xafe0);
+fadt->gpe0_blk_len = 4;
 /* WBINVD + PROC_C1 + SLP_BUTTON + FIX_RTC */
 fadt->flags = cpu_to_le32((1 << 0) | (1 << 2) | (1 << 5) | (1 << 6));
 acpi_build_table_header((struct acpi_table_header *)fadt, "FACP", 
diff --git a/qemu/hw/acpi.c

[kvm-devel] [PATCH 2/2] kvmclock implementation, the guest part.

2008-01-16 Thread Glauber de Oliveira Costa
This is the guest part of kvm clock implementation
It does not do tsc-only timing, as tsc can have deltas
between cpus, and it did not seem worthy to me to keep
adjusting them.

We do use it, however, for fine-grained adjustment.

Other than that, time comes from the host.

Signed-off-by: Glauber de Oliveira Costa <[EMAIL PROTECTED]>
---
 arch/x86/Kconfig|   10 +++
 arch/x86/kernel/Makefile_32 |1 +
 arch/x86/kernel/kvmclock.c  |  154 +++
 arch/x86/kernel/setup_32.c  |5 ++
 4 files changed, 170 insertions(+), 0 deletions(-)
 create mode 100644 arch/x86/kernel/kvmclock.c

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index ab2df55..968315e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -350,6 +350,16 @@ config VMI
  at the moment), by linking the kernel to a GPL-ed ROM module
  provided by the hypervisor.
 
+config KVM_CLOCK
+   bool "KVM paravirtualized clock"
+   select PARAVIRT
+   help
+ Turning on this option will allow you to run a paravirtualized clock
+ when running over the KVM hypervisor. Instead of relying on a PIT
+ (or probably other) emulation by the underlying device model, the host
+ provides the guest with timing infrastructure, as time of day, and
+ timer expiration.
+
 source "arch/x86/lguest/Kconfig"
 
 endif
diff --git a/arch/x86/kernel/Makefile_32 b/arch/x86/kernel/Makefile_32
index a7bc93c..f6332b6 100644
--- a/arch/x86/kernel/Makefile_32
+++ b/arch/x86/kernel/Makefile_32
@@ -44,6 +44,7 @@ obj-$(CONFIG_K8_NB)   += k8.o
 obj-$(CONFIG_MGEODE_LX)+= geode_32.o mfgpt_32.o
 
 obj-$(CONFIG_VMI)  += vmi_32.o vmiclock_32.o
+obj-$(CONFIG_KVM_CLOCK)+= kvmclock.o
 obj-$(CONFIG_PARAVIRT) += paravirt_32.o
 obj-y  += pcspeaker.o
 
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
new file mode 100644
index 000..56be828
--- /dev/null
+++ b/arch/x86/kernel/kvmclock.c
@@ -0,0 +1,154 @@
+/*  KVM paravirtual clock driver. A clocksource implementation
+Copyright (C) 2008 Glauber de Oliveira Costa, Red Hat Inc.
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+*/
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define KVM_SCALE 22
+
+static int kvmclock = 1;
+
+static int parse_no_kvmclock(char *arg)
+{
+   kvmclock = 0;
+   return 0;
+}
+early_param("no-kvmclock", parse_no_kvmclock);
+
+struct shared_info shared_info __attribute__((__aligned__(PAGE_SIZE)));
+
+/* The hypervisor will put information about time periodically here */
+static struct kvm_vcpu_time_info hv_clock[NR_CPUS];
+#define get_clock(cpu, field) hv_clock[cpu].field
+
+static inline u64 kvm_get_delta(u64 last_tsc)
+{
+   int cpu = smp_processor_id();
+   u64 delta = native_read_tsc() - last_tsc;
+   return (delta * get_clock(cpu, tsc_to_system_mul)) >> KVM_SCALE;
+}
+
+static struct wall_clock wall_clock;
+/*
+ * The wallclock is the time of day when we booted. Since then, some time may
+ * have elapsed since the hypervisor wrote the data. So we try to account for
+ * that. Even if the tsc is not accurate, it gives us a more accurate timing
+ * than not adjusting at all
+ */
+unsigned long kvm_get_wallclock(void)
+{
+   u32 wc_sec, wc_nsec;
+   u64 delta, last_tsc;
+   struct timespec ts;
+   int version, nsec, cpu = smp_processor_id();
+
+   native_write_msr(MSR_KVM_WALL_CLOCK, __pa(&wall_clock));
+   do {
+   version = wall_clock.wc_version;
+   rmb();
+   wc_sec = wall_clock.wc_sec;
+   wc_nsec = wall_clock.wc_nsec;
+   last_tsc = get_clock(cpu, tsc_timestamp);
+   rmb();
+   } while ((wall_clock.wc_version != version) || (version & 1));
+
+   delta = kvm_get_delta(last_tsc);
+   delta += wc_nsec;
+   nsec = do_div(delta, NSEC_PER_SEC);
+   set_normalized_timespec(&ts, wc_sec + delta, nsec);
+   /*
+* Of all mechanisms of time adjustment I've tested, this one
+* was the champion!
+*/
+   return ts.tv_sec + 1;
+}
+
+int kvm_set_wallclock(unsigned long now)
+{
+   return 0;
+}

[kvm-devel] [PATCH 1/2] kvmclock - the host part.

2008-01-16 Thread Glauber de Oliveira Costa
This is the host part of kvm clocksource implementation. As it does
not include clockevents, it is a fairly simple implementation. We
only have to register a per-vcpu area, and start writing to it periodically.

The area is binary compatible with xen, as we use the same shadow_info 
structure.

Signed-off-by: Glauber de Oliveira Costa <[EMAIL PROTECTED]>
---
 arch/x86/kvm/x86.c |   98 +++-
 include/asm-x86/kvm_host.h |6 +++
 include/asm-x86/kvm_para.h |   24 +++
 include/linux/kvm.h|1 +
 4 files changed, 128 insertions(+), 1 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 8a90403..fd69aa1 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -19,6 +19,7 @@
 #include "irq.h"
 #include "mmu.h"
 
+#include 
 #include 
 #include 
 #include 
@@ -412,7 +413,7 @@ static u32 msrs_to_save[] = {
 #ifdef CONFIG_X86_64
MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
 #endif
-   MSR_IA32_TIME_STAMP_COUNTER,
+   MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME,
 };
 
 static unsigned num_msrs_to_save;
@@ -467,6 +468,73 @@ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned 
index, u64 *data)
return kvm_set_msr(vcpu, index, *data);
 }
 
+static void kvm_write_wall_clock(struct kvm_vcpu *v, gpa_t wall_clock)
+{
+   int version = 1;
+   struct wall_clock wc;
+   unsigned long flags;
+   struct timespec wc_ts;
+
+   local_irq_save(flags);
+   kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER,
+ &v->arch.hv_clock.tsc_timestamp);
+   wc_ts = current_kernel_time();
+   local_irq_restore(flags);
+
+   down_write(&current->mm->mmap_sem);
+   kvm_write_guest(v->kvm, wall_clock, &version, sizeof(version));
+   up_write(&current->mm->mmap_sem);
+
+   /* With all the info we got, fill in the values */
+   wc.wc_sec = wc_ts.tv_sec;
+   wc.wc_nsec = wc_ts.tv_nsec;
+   wc.wc_version = ++version;
+
+   down_write(&current->mm->mmap_sem);
+   kvm_write_guest(v->kvm, wall_clock, &wc, sizeof(wc));
+   up_write(&current->mm->mmap_sem);
+}
+static void kvm_write_guest_time(struct kvm_vcpu *v)
+{
+   struct timespec ts;
+   unsigned long flags;
+   struct kvm_vcpu_arch *vcpu = &v->arch;
+   void *shared_kaddr;
+
+   if ((!vcpu->time_page))
+   return;
+
+   /* Keep irq disabled to prevent changes to the clock */
+   local_irq_save(flags);
+   kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER,
+ &vcpu->hv_clock.tsc_timestamp);
+   ktime_get_ts(&ts);
+   local_irq_restore(flags);
+
+   /* With all the info we got, fill in the values */
+
+   vcpu->hv_clock.system_time = ts.tv_nsec +
+(NSEC_PER_SEC * (u64)ts.tv_sec);
+   /*
+* The interface expects us to write an even number signaling that the
+* update is finished. Since the guest won't see the intermediate 
states,
+* we just write "2" at the end
+*/
+   vcpu->hv_clock.version = 2;
+
+   preempt_disable();
+
+   shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0);
+
+   memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
+   sizeof(vcpu->hv_clock));
+
+   kunmap_atomic(shared_kaddr, KM_USER0);
+   preempt_enable();
+
+   mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
+}
+
 
 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 {
@@ -494,6 +562,25 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 
data)
case MSR_IA32_MISC_ENABLE:
vcpu->arch.ia32_misc_enable_msr = data;
break;
+   case MSR_KVM_WALL_CLOCK:
+   vcpu->arch.wall_clock = data;
+   kvm_write_wall_clock(vcpu, data);
+   break;
+   case MSR_KVM_SYSTEM_TIME: {
+   vcpu->arch.time = data & PAGE_MASK;
+   vcpu->arch.time_offset = data & ~PAGE_MASK;
+
+   vcpu->arch.hv_clock.tsc_to_system_mul =
+   clocksource_khz2mult(tsc_khz, 22);
+   vcpu->arch.hv_clock.tsc_shift = 22;
+
+   down_write(&current->mm->mmap_sem);
+   vcpu->arch.time_page = gfn_to_page(vcpu->kvm, data >> 
PAGE_SHIFT);
+   up_write(&current->mm->mmap_sem);
+   if (is_error_page(vcpu->arch.time_page))
+   vcpu->arch.time_page = NULL;
+   break;
+   }
default:
pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data);
return 1;
@@ -553,6 +640,13 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 
*pdata)
data = vcpu->arch.shadow_e

[kvm-devel] [PATCH 0/2] kvm clock - xen compatible by accident

2008-01-16 Thread Glauber de Oliveira Costa
I think I've misunderstood what you guys wanted to achieve with "xen
compatible", but now I get it. It's something that's kvm specific, 
but happens to be able to communicate with xen guests, provided they 
do a kvm-aware initialization.

So, here's the two patches for it, using two msrs and non-xen data structures

Userspace is the same, so I'm only sending these ones



-
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse012070mrt/direct/01/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


[kvm-devel] [PATCH 2/2] fill cpuid with clocksource information

2008-01-15 Thread Glauber de Oliveira Costa
In this patch, we probe the host checking for clocksource capabilities.
In case it is found, it is advertised to the guest through the appropriate
cpuid mechanism

Signed-off-by: Glauber de Oliveira Costa <[EMAIL PROTECTED]>
---
 qemu/qemu-kvm-x86.c |7 ++-
 1 files changed, 6 insertions(+), 1 deletions(-)

diff --git a/qemu/qemu-kvm-x86.c b/qemu/qemu-kvm-x86.c
index 21ec112..6b11f02 100644
--- a/qemu/qemu-kvm-x86.c
+++ b/qemu/qemu-kvm-x86.c
@@ -517,6 +517,11 @@ int kvm_arch_qemu_init_env(CPUState *cenv)
 int cpuid_nent = 0;
 CPUState copy;
 uint32_t i, limit;
+int clocksource_feature = 0;
+#ifdef KVM_CAP_CLOCKSOURCE
+clocksource_feature = kvm_check_extension(kvm_context, 
KVM_CAP_CLOCKSOURCE);
+clocksource_feature <<= KVM_FEATURE_CLOCKSOURCE; 
+#endif
 
 copy = *cenv;
 
@@ -534,7 +539,7 @@ int kvm_arch_qemu_init_env(CPUState *cenv)
 pv_ent = &cpuid_ent[cpuid_nent++];
 memset(pv_ent, 0, sizeof(*pv_ent));
 pv_ent->function = KVM_CPUID_FEATURES;
-pv_ent->eax = 0;
+pv_ent->eax = clocksource_feature;
 #endif
 
 copy.regs[R_EAX] = 0;
-- 
1.5.0.6


-
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse012070mrt/direct/01/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


[kvm-devel] [PATCH 1/2] include kvm_para.h

2008-01-15 Thread Glauber de Oliveira Costa
have qemu-kvm to include kvm_para, allowing access to paravirt definitions

Signed-off-by: Glauber de Oliveira Costa <[EMAIL PROTECTED]>
---
 qemu/qemu-kvm-x86.c |1 +
 1 files changed, 1 insertions(+), 0 deletions(-)

diff --git a/qemu/qemu-kvm-x86.c b/qemu/qemu-kvm-x86.c
index c79ca36..21ec112 100644
--- a/qemu/qemu-kvm-x86.c
+++ b/qemu/qemu-kvm-x86.c
@@ -14,6 +14,7 @@ extern int kvm_irqchip;
 #include 
 #include 
 #include 
+#include 
 
 #define MSR_IA32_TSC   0x10
 
-- 
1.5.0.6


-
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse012070mrt/direct/01/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


[kvm-devel] [PATCH 0/2] kvm clock - userspace with patches

2008-01-15 Thread Glauber de Oliveira Costa
userspace, with patches attached.



-
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse012070mrt/direct/01/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


[kvm-devel] [PATCH 3/3] [PATCH] kvmclock implementation, the guest part.

2008-01-15 Thread Glauber de Oliveira Costa
This is the guest part of kvm clock implementation
It does not do tsc-only timing, as tsc can have deltas
between cpus, and it did not seem worthy to me to keep
adjusting them.

We do use it, however, for fine-grained adjustment.

Other than that, time comes from the host.

Signed-off-by: Glauber de Oliveira Costa <[EMAIL PROTECTED]>
---
 arch/x86/Kconfig|   10 +++
 arch/x86/kernel/Makefile_32 |1 +
 arch/x86/kernel/kvmclock.c  |  161 +++
 arch/x86/kernel/setup_32.c  |5 ++
 4 files changed, 177 insertions(+), 0 deletions(-)
 create mode 100644 arch/x86/kernel/kvmclock.c

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index ab2df55..968315e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -350,6 +350,16 @@ config VMI
  at the moment), by linking the kernel to a GPL-ed ROM module
  provided by the hypervisor.
 
+config KVM_CLOCK
+   bool "KVM paravirtualized clock"
+   select PARAVIRT
+   help
+ Turning on this option will allow you to run a paravirtualized clock
+ when running over the KVM hypervisor. Instead of relying on a PIT
+ (or probably other) emulation by the underlying device model, the host
+ provides the guest with timing infrastructure, as time of day, and
+ timer expiration.
+
 source "arch/x86/lguest/Kconfig"
 
 endif
diff --git a/arch/x86/kernel/Makefile_32 b/arch/x86/kernel/Makefile_32
index a7bc93c..f6332b6 100644
--- a/arch/x86/kernel/Makefile_32
+++ b/arch/x86/kernel/Makefile_32
@@ -44,6 +44,7 @@ obj-$(CONFIG_K8_NB)   += k8.o
 obj-$(CONFIG_MGEODE_LX)+= geode_32.o mfgpt_32.o
 
 obj-$(CONFIG_VMI)  += vmi_32.o vmiclock_32.o
+obj-$(CONFIG_KVM_CLOCK)+= kvmclock.o
 obj-$(CONFIG_PARAVIRT) += paravirt_32.o
 obj-y  += pcspeaker.o
 
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
new file mode 100644
index 000..317bee0
--- /dev/null
+++ b/arch/x86/kernel/kvmclock.c
@@ -0,0 +1,161 @@
+/*  KVM paravirtual clock driver. A clocksource implementation
+Copyright (C) 2008 Glauber de Oliveira Costa, Red Hat Inc.
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+*/
+
+#include 
+#include 
+#include 
+#include 
+
+
+#define KVM_SCALE 22
+
+static int kvmclock = 1;
+
+static int parse_no_kvmclock(char *arg)
+{
+   kvmclock = 0;
+   return 0;
+}
+early_param("no-kvmclock", parse_no_kvmclock);
+
+struct xen_shared_info shared_info __attribute__((__aligned__(PAGE_SIZE)));
+
+/* The hypervisor will put information about time periodically here */
+DEFINE_PER_CPU(struct xen_vcpu_time_info *, hv_clock);
+#define get_clock(cpu, field) per_cpu(hv_clock, cpu)->field
+
+static inline u64 kvm_get_delta(u64 last_tsc)
+{
+   int cpu = smp_processor_id();
+   u64 delta = native_read_tsc() - last_tsc;
+   return (delta * get_clock(cpu, tsc_to_system_mul)) >> KVM_SCALE;
+}
+
+/*
+ * The wallclock is the time of day when we booted. Since then, some time may
+ * have elapsed since the hypervisor wrote the data. So we try to account for
+ * that. Even if the tsc is not accurate, it gives us a more accurate timing
+ * than not adjusting at all
+ */
+unsigned long kvm_get_wallclock(void)
+{
+   u32 wc_sec, wc_nsec;
+   u64 delta, last_tsc;
+   struct timespec ts;
+   int version, nsec, cpu = smp_processor_id();
+
+   do {
+   version = shared_info.wc_version;
+   rmb();
+   wc_sec = shared_info.wc_sec;
+   wc_nsec = shared_info.wc_nsec;
+   last_tsc = get_clock(cpu, tsc_timestamp);
+   rmb();
+   } while ((shared_info.wc_version != version) || (version & 1));
+
+   delta = kvm_get_delta(last_tsc);
+   delta += wc_nsec;
+   nsec = do_div(delta, NSEC_PER_SEC);
+   set_normalized_timespec(&ts, wc_sec + delta, nsec);
+   /*
+* Of all mechanisms of time adjustment I've tested, this one
+* was the champion!
+*/
+   return ts.tv_sec + 1;
+}
+
+int kvm_set_wallclock(unsigned long now)
+{
+   return 0;
+}
+
+/*
+ * This is our read_clock function. The host puts an tsc timestamp each time
+ *

[kvm-devel] [PATCH 2/3] kvmclock - the host part.

2008-01-15 Thread Glauber de Oliveira Costa
This is the host part of kvm clocksource implementation. As it does
not include clockevents, it is a fairly simple implementation. We
only have to register a per-vcpu area, and start writing to it periodically.

The area is binary compatible with xen, as we use the same shadow_info 
structure.

Signed-off-by: Glauber de Oliveira Costa <[EMAIL PROTECTED]>
---
 arch/x86/kvm/x86.c |   79 +++-
 include/asm-x86/kvm_host.h |4 ++
 include/asm-x86/kvm_para.h |   37 
 include/linux/kvm.h|1 +
 4 files changed, 120 insertions(+), 1 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 8a90403..53b5692 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -19,6 +19,7 @@
 #include "irq.h"
 #include "mmu.h"
 
+#include 
 #include 
 #include 
 #include 
@@ -412,7 +413,7 @@ static u32 msrs_to_save[] = {
 #ifdef CONFIG_X86_64
MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
 #endif
-   MSR_IA32_TIME_STAMP_COUNTER,
+   MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_PARAVIRT_CLOCK,
 };
 
 static unsigned num_msrs_to_save;
@@ -467,6 +468,60 @@ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned 
index, u64 *data)
return kvm_set_msr(vcpu, index, *data);
 }
 
+#define WC_OFFSET offsetof(struct xen_shared_info, wc_version)
+
+static void kvm_write_guest_time(struct kvm_vcpu *v)
+{
+   struct timespec ts, wc_ts;
+   int wc_args[3]; /* version, wc_sec, wc_nsec */
+   unsigned long flags;
+   struct kvm_vcpu_arch *vcpu = &v->arch;
+   struct xen_shared_info *shared_kaddr;
+
+   if ((!vcpu->shared_page))
+   return;
+
+   /* Keep irq disabled to prevent changes to the clock */
+   local_irq_save(flags);
+   kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER,
+ &vcpu->hv_clock.tsc_timestamp);
+   wc_ts = current_kernel_time();
+   ktime_get_ts(&ts);
+   local_irq_restore(flags);
+
+   /* With all the info we got, fill in the values */
+   wc_args[1] = wc_ts.tv_sec;
+   wc_args[2] = wc_ts.tv_nsec;
+
+   vcpu->hv_clock.system_time = ts.tv_nsec +
+(NSEC_PER_SEC * (u64)ts.tv_sec);
+   /*
+* The interface expects us to write an even number signaling that the
+* update is finished. Since the guest won't see the intermediate 
states,
+* we just write "2" at the end
+*/
+   wc_args[0] = 2;
+   vcpu->hv_clock.version = 2;
+
+   preempt_disable();
+
+   shared_kaddr = kmap_atomic(vcpu->shared_page, KM_USER0);
+
+   /*
+* We could write everything at once, but it can break future
+* implementations. We're just a tiny and lonely clock, so let's
+* write only what matters here
+*/
+   memcpy(&shared_kaddr->wc_version, wc_args, sizeof(wc_args));
+   memcpy(&shared_kaddr->vcpu_info[v->vcpu_id].time, &vcpu->hv_clock,
+   sizeof(vcpu->hv_clock));
+
+   kunmap_atomic(shared_kaddr, KM_USER0);
+   preempt_enable();
+
+   mark_page_dirty(v->kvm, vcpu->shared_info >> PAGE_SHIFT);
+}
+
 
 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 {
@@ -494,6 +549,20 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 
data)
case MSR_IA32_MISC_ENABLE:
vcpu->arch.ia32_misc_enable_msr = data;
break;
+   case MSR_KVM_PARAVIRT_CLOCK: {
+   vcpu->arch.shared_info = data;
+
+   vcpu->arch.hv_clock.tsc_to_system_mul =
+   clocksource_khz2mult(tsc_khz, 22);
+   vcpu->arch.hv_clock.tsc_shift = 22;
+
+   down_write(¤t->mm->mmap_sem);
+   vcpu->arch.shared_page = gfn_to_page(vcpu->kvm, data >> 
PAGE_SHIFT);
+   up_write(¤t->mm->mmap_sem);
+   if (is_error_page(vcpu->arch.shared_page))
+   vcpu->arch.shared_page = NULL;
+   break;
+   }
default:
pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data);
return 1;
@@ -553,6 +622,10 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 
*pdata)
data = vcpu->arch.shadow_efer;
break;
 #endif
+   case MSR_KVM_PARAVIRT_CLOCK:
+   data = vcpu->arch.shared_info;
+   break;
+
default:
pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
return 1;
@@ -680,6 +753,7 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_USER_MEMORY:
case KVM_CAP_SET_TSS_ADDR:
case KVM_CAP_EXT_CPUID:
+   case KVM_CAP_CLOCKSOURCE:
r = 1;
break;
case K

[kvm-devel] [PATCH 0/3] kvm clock - with the patches this time

2008-01-15 Thread Glauber de Oliveira Costa
The 3 patches should be attached now.

userspace follows



-
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse012070mrt/direct/01/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


[kvm-devel] [PATCH 1/3] put kvm_para.h include outside __KERNEL__

2008-01-15 Thread Glauber de Oliveira Costa
kvm_para.h potentially contains definitions that are to be used by 
kvm-userspace,
so it should not be included inside the __KERNEL__ block. To protect its own 
data structures,
kvm_para.h already includes its own __KERNEL__ block.

Signed-off-by: Glauber de Oliveira Costa <[EMAIL PROTECTED]>
Acked-by: Amit Shah <[EMAIL PROTECTED]>
---
 include/linux/kvm_para.h |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/include/linux/kvm_para.h b/include/linux/kvm_para.h
index 6af91a5..5497aac 100644
--- a/include/linux/kvm_para.h
+++ b/include/linux/kvm_para.h
@@ -14,12 +14,12 @@
 
 #define KVM_HC_VAPIC_POLL_IRQ1
 
-#ifdef __KERNEL__
 /*
  * hypercalls use architecture specific
  */
 #include 
 
+#ifdef __KERNEL__
 static inline int kvm_para_has_feature(unsigned int feature)
 {
if (kvm_arch_para_features() & (1UL << feature))
-- 
1.5.0.6


-
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse012070mrt/direct/01/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [PATCH 0/3] kvm clock, next iteration

2008-01-15 Thread Glauber de Oliveira Costa
Avi Kivity wrote:
> Glauber de Oliveira Costa wrote:
>> Here's a new version, with some comments addressed.
>> I'm using only one msr instead of two as Avi suggested.
>>
>> Reason is that, following another commentary of Avi,
>> I'm doing kmap_atomic's instead of kvm_write. however,
>> as all the information is part of the same page, both kmaps
>> end up pointing to the same place, rendering the two msrs
>> useless.
>>
>> userspace follows
>>
>>   
> Patches missing?
> 
really? I sent them through my default send mail script.
I'll send now.

-
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse012070mrt/direct/01/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


[kvm-devel] [PATCH 0/2] kvm clock - userspace

2008-01-14 Thread Glauber de Oliveira Costa
Here are the two patches of the userspace part of the series.
First patch is unchanged.



-
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services for
just about anything Open Source.
http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


[kvm-devel] [PATCH 0/3] kvm clock, next iteration

2008-01-14 Thread Glauber de Oliveira Costa
Here's a new version, with some comments addressed.
I'm using only one msr instead of two as Avi suggested.

Reason is that, following another commentary of Avi,
I'm doing kmap_atomic's instead of kvm_write. however,
as all the information is part of the same page, both kmaps
end up pointing to the same place, rendering the two msrs
useless.

userspace follows



-
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services for
just about anything Open Source.
http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [PATCH 1/2] include kvm_para.h

2008-01-14 Thread Glauber de Oliveira Costa
Avi Kivity wrote:
> Glauber de Oliveira Costa wrote:
>> have qemu-kvm to include kvm_para, allowing access to paravirt 
>> definitions
>>
>> Signed-off-by: Glauber de Oliveira Costa <[EMAIL PROTECTED]>
>> ---
>>  qemu/qemu-kvm-x86.c |1 +
>>  1 files changed, 1 insertions(+), 0 deletions(-)
>>
>> diff --git a/qemu/qemu-kvm-x86.c b/qemu/qemu-kvm-x86.c
>> index c79ca36..21ec112 100644
>> --- a/qemu/qemu-kvm-x86.c
>> +++ b/qemu/qemu-kvm-x86.c
>> @@ -14,6 +14,7 @@ extern int kvm_irqchip;
>>  #include 
>>  #include 
>>  #include 
>> +#include 
>>  
> 
> IIRC, early versions of kvm_para.h were userspace unfriendly, so we need 
> a version code check (or better, a KVM_CAP_ check) around this.
> 
No need. Including kvm_para.h in userspace-unfriendly tools won't break 
anything. Code that handles it in userspace is already enclosed in the 
KVM_CAP_ check.

(Well, in reality, I forgot to enclose one line with it. But it's a bug,
and it will be fixed in the series I'm about to send out.)


-
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services for
just about anything Open Source.
http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [PATCH 3/3] [PATCH] kvmclock implementation, the guest part.

2008-01-11 Thread Glauber de Oliveira Costa
Gerd Hoffmann wrote:
>   Hi,
> 
>> diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
>> index d083ff5..7728b87 100644
>> --- a/arch/x86/xen/time.c
>> +++ b/arch/x86/xen/time.c
>> -static cycle_t xen_clocksource_read(void);
>> +cycle_t xen_clocksource_read(void);
> 
> Huh?  You kill the static, but don't use the functions anywhere?  Looks
> like half-done code sharing with xen paravirt clock ...
> 
> cheers,
>   Gerd
> 
It's called experimentation. I was sure I had deleted this part of the patch 
altogether, but I must have
missed it. Thanks for pointing it out.



-
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services for
just about anything Open Source.
http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


[kvm-devel] [PATCH 2/2] fill cpuid with clocksource information

2008-01-11 Thread Glauber de Oliveira Costa
In this patch, we probe the host checking for clocksource capabilities.
In case it is found, it is advertised to the guest through the appropriate
cpuid mechanism

Signed-off-by: Glauber de Oliveira Costa <[EMAIL PROTECTED]>
---
 qemu/qemu-kvm-x86.c |6 +-
 1 files changed, 5 insertions(+), 1 deletions(-)

diff --git a/qemu/qemu-kvm-x86.c b/qemu/qemu-kvm-x86.c
index 21ec112..c008cb7 100644
--- a/qemu/qemu-kvm-x86.c
+++ b/qemu/qemu-kvm-x86.c
@@ -517,6 +517,10 @@ int kvm_arch_qemu_init_env(CPUState *cenv)
 int cpuid_nent = 0;
 CPUState copy;
 uint32_t i, limit;
+int has_clocksource = 0;
+#ifdef KVM_CAP_CLOCKSOURCE
+has_clocksource = kvm_check_extension(kvm_context, KVM_CAP_CLOCKSOURCE);
+#endif
 
 copy = *cenv;
 
@@ -534,7 +538,7 @@ int kvm_arch_qemu_init_env(CPUState *cenv)
 pv_ent = &cpuid_ent[cpuid_nent++];
 memset(pv_ent, 0, sizeof(*pv_ent));
 pv_ent->function = KVM_CPUID_FEATURES;
-pv_ent->eax = 0;
+pv_ent->eax = (has_clocksource << KVM_FEATURE_CLOCKSOURCE);
 #endif
 
 copy.regs[R_EAX] = 0;
-- 
1.5.0.6


-
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services for
just about anything Open Source.
http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


[kvm-devel] [PATCH 1/2] include kvm_para.h

2008-01-11 Thread Glauber de Oliveira Costa
have qemu-kvm to include kvm_para, allowing access to paravirt definitions

Signed-off-by: Glauber de Oliveira Costa <[EMAIL PROTECTED]>
---
 qemu/qemu-kvm-x86.c |1 +
 1 files changed, 1 insertions(+), 0 deletions(-)

diff --git a/qemu/qemu-kvm-x86.c b/qemu/qemu-kvm-x86.c
index c79ca36..21ec112 100644
--- a/qemu/qemu-kvm-x86.c
+++ b/qemu/qemu-kvm-x86.c
@@ -14,6 +14,7 @@ extern int kvm_irqchip;
 #include 
 #include 
 #include 
+#include 
 
 #define MSR_IA32_TSC   0x10
 
-- 
1.5.0.6


-
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services for
just about anything Open Source.
http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


[kvm-devel] [PATCH 0/2] KVM clock - userspace part

2008-01-11 Thread Glauber de Oliveira Costa
These are pretty straightforward. Just probe the host, and advertise the guest



-
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services for
just about anything Open Source.
http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


[kvm-devel] [PATCH 3/3] [PATCH] kvmclock implementation, the guest part.

2008-01-11 Thread Glauber de Oliveira Costa
This is the guest part of kvm clock implementation
It does not do tsc-only timing, as tsc can have deltas
between cpus, and it did not seem worthwhile to me to keep
adjusting them.

We do use it, however, for fine-grained adjustment.

Other than that, time comes from the host.

Signed-off-by: Glauber de Oliveira Costa <[EMAIL PROTECTED]>
---
 arch/x86/Kconfig|   10 +++
 arch/x86/kernel/Makefile_32 |1 +
 arch/x86/kernel/kvmclock.c  |  160 +++
 arch/x86/kernel/setup_32.c  |5 ++
 arch/x86/xen/time.c |6 +-
 5 files changed, 179 insertions(+), 3 deletions(-)
 create mode 100644 arch/x86/kernel/kvmclock.c

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index ab2df55..968315e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -350,6 +350,16 @@ config VMI
  at the moment), by linking the kernel to a GPL-ed ROM module
  provided by the hypervisor.
 
+config KVM_CLOCK
+   bool "KVM paravirtualized clock"
+   select PARAVIRT
+   help
+ Turning on this option will allow you to run a paravirtualized clock
+ when running over the KVM hypervisor. Instead of relying on a PIT
+ (or probably other) emulation by the underlying device model, the host
+ provides the guest with timing infrastructure, as time of day, and
+ timer expiration.
+
 source "arch/x86/lguest/Kconfig"
 
 endif
diff --git a/arch/x86/kernel/Makefile_32 b/arch/x86/kernel/Makefile_32
index a7bc93c..f6332b6 100644
--- a/arch/x86/kernel/Makefile_32
+++ b/arch/x86/kernel/Makefile_32
@@ -44,6 +44,7 @@ obj-$(CONFIG_K8_NB)   += k8.o
 obj-$(CONFIG_MGEODE_LX)+= geode_32.o mfgpt_32.o
 
 obj-$(CONFIG_VMI)  += vmi_32.o vmiclock_32.o
+obj-$(CONFIG_KVM_CLOCK)+= kvmclock.o
 obj-$(CONFIG_PARAVIRT) += paravirt_32.o
 obj-y  += pcspeaker.o
 
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
new file mode 100644
index 000..f2094f4
--- /dev/null
+++ b/arch/x86/kernel/kvmclock.c
@@ -0,0 +1,160 @@
+/*  KVM paravirtual clock driver. A clocksource implementation
+Copyright (C) 2007 Glauber de Oliveira Costa, Red Hat Inc.
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+*/
+
+#include 
+#include 
+#include 
+#include 
+
+
+#define KVM_SCALE 22
+
+static int kvmclock = 1;
+
+static int parse_no_kvmclock(char *arg)
+{
+   kvmclock = 0;
+   return 0;
+}
+early_param("no-kvmclock", parse_no_kvmclock);
+
+struct shared_info shared_info __attribute__((__aligned__(PAGE_SIZE)));
+
+/* The hypervisor will put information about time periodically here */
+DEFINE_PER_CPU(struct vcpu_time_info *, hv_clock);
+#define get_clock(cpu, field) per_cpu(hv_clock, cpu)->field
+
+static inline u64 kvm_get_delta(u64 last_tsc)
+{
+   int cpu = smp_processor_id();
+   u64 delta = native_read_tsc() - last_tsc;
+   return (delta * get_clock(cpu, tsc_to_system_mul)) >> KVM_SCALE;
+}
+
+/*
+ * The wallclock is the time of day when we booted. Since then, some time may
+ * have elapsed since the hypervisor wrote the data. So we try to account for
+ * that. Even if the tsc is not accurate, it gives us a more accurate timing
+ * than not adjusting at all
+ */
+unsigned long kvm_get_wallclock(void)
+{
+   u32 wc_sec, wc_nsec;
+   u64 delta, last_tsc;
+   struct timespec ts;
+   int version, nsec, cpu = smp_processor_id();
+
+   do {
+   version = shared_info.wc_version;
+   rmb();
+   wc_sec = shared_info.wc_sec;
+   wc_nsec = shared_info.wc_nsec;
+   last_tsc = get_clock(cpu, tsc_timestamp);
+   rmb();
+   } while ((shared_info.wc_version != version) || (version & 1));
+
+   delta = kvm_get_delta(last_tsc);
+   delta += wc_nsec;
+   nsec = do_div(delta, NSEC_PER_SEC);
+   set_normalized_timespec(&ts, wc_sec + delta, nsec);
+   /*
+* Of all mechanisms of time adjustment I've tested, this one
+* was the champion!
+*/
+   return ts.tv_sec + 1;
+}
+
+int kvm_set_wallclock(unsigned long now)
+{
+   return 0;
+}
+
+/*
+ * This is our read_clock function. The host puts an tsc 

[kvm-devel] [PATCH 2/3] [PATCH] kvmclock - the host part.

2008-01-11 Thread Glauber de Oliveira Costa
This is the host part of kvm clocksource implementation. As it does
not include clockevents, it is a fairly simple implementation. We
only have to register a per-vcpu area, and start writing to it periodically.

The area is binary compatible with xen, as we use the same shadow_info 
structure.

Signed-off-by: Glauber de Oliveira Costa <[EMAIL PROTECTED]>
---
 arch/x86/kvm/x86.c |   64 +++-
 include/asm-x86/kvm_host.h |5 +++
 include/asm-x86/kvm_para.h |5 +++
 include/linux/kvm.h|1 +
 4 files changed, 74 insertions(+), 1 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 8a90403..d3fd920 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -19,6 +19,7 @@
 #include "irq.h"
 #include "mmu.h"
 
+#include 
 #include 
 #include 
 #include 
@@ -412,7 +413,7 @@ static u32 msrs_to_save[] = {
 #ifdef CONFIG_X86_64
MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
 #endif
-   MSR_IA32_TIME_STAMP_COUNTER,
+   MSR_IA32_TIME_STAMP_COUNTER, MSR_PARAVIRT_CLOCK,
 };
 
 static unsigned num_msrs_to_save;
@@ -467,6 +468,50 @@ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned 
index, u64 *data)
return kvm_set_msr(vcpu, index, *data);
 }
 
+#define WC_OFFSET offsetof(struct shared_info, wc_version)
+
+static void kvm_write_guest_time(struct kvm_vcpu *v)
+{
+   struct timespec ts, wc_ts;
+   int wc_args[3]; /* version, wc_sec, wc_nsec */
+   unsigned long flags;
+   struct kvm_vcpu_arch *vcpu = &v->arch;
+
+   if (!vcpu->shared_info)
+   return;
+
+   /* Make the update of both version numbers visible to guest */
+   wc_args[0] = ++vcpu->wc_version;
+   kvm_write_guest(v->kvm, vcpu->shared_info + WC_OFFSET, wc_args,
+   sizeof(wc_args));
+   vcpu->hv_clock.version++;
+   kvm_write_guest(v->kvm, vcpu->clock_addr, &vcpu->hv_clock,
+   sizeof(vcpu->hv_clock));
+
+   /* Keep irq disabled to prevent changes to the clock */
+   local_irq_save(flags);
+   kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER,
+ &vcpu->hv_clock.tsc_timestamp);
+   wc_ts = current_kernel_time();
+   ktime_get_ts(&ts);
+   local_irq_restore(flags);
+
+   /* With all the info we got, fill in the values */
+   wc_args[1] = wc_ts.tv_sec;
+   wc_args[2] = wc_ts.tv_nsec;
+   wc_args[0] = ++vcpu->wc_version;
+
+   vcpu->hv_clock.system_time = ts.tv_nsec +
+(NSEC_PER_SEC * (u64)ts.tv_sec);
+   vcpu->hv_clock.version++;
+
+   /* And finally, let the guest see them */
+   kvm_write_guest(v->kvm, vcpu->shared_info + WC_OFFSET, wc_args,
+   sizeof(wc_args));
+   kvm_write_guest(v->kvm, vcpu->clock_addr, &vcpu->hv_clock,
+   sizeof(vcpu->hv_clock));
+}
+
 
 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 {
@@ -494,6 +539,18 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 
data)
case MSR_IA32_MISC_ENABLE:
vcpu->arch.ia32_misc_enable_msr = data;
break;
+   case MSR_PARAVIRT_CLOCK: {
+   vcpu->arch.shared_info = data;
+
+   vcpu->arch.clock_addr = data + offsetof(struct vcpu_info, time)
++ sizeof(struct vcpu_info) * vcpu->vcpu_id;
+
+   vcpu->arch.hv_clock.tsc_to_system_mul =
+   clocksource_khz2mult(tsc_khz, 22);
+   vcpu->arch.hv_clock.tsc_shift = 22;
+   kvm_write_guest_time(vcpu);
+   break;
+   }
default:
pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data);
return 1;
@@ -553,6 +610,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 
*pdata)
data = vcpu->arch.shadow_efer;
break;
 #endif
+   case MSR_PARAVIRT_CLOCK:
+   data = vcpu->arch.shared_info;
+   break;
default:
pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
return 1;
@@ -680,6 +740,7 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_USER_MEMORY:
case KVM_CAP_SET_TSS_ADDR:
case KVM_CAP_EXT_CPUID:
+   case KVM_CAP_CLOCKSOURCE:
r = 1;
break;
case KVM_CAP_VAPIC:
@@ -737,6 +798,7 @@ out:
 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
kvm_x86_ops->vcpu_load(vcpu, cpu);
+   kvm_write_guest_time(vcpu);
 }
 
 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
index d6db0de..bbc4b51 100644
--- a/include/asm-x86/kvm_host.h
+++ b/include/asm-x86/kvm_host.h

[kvm-devel] [PATCH 1/3] put kvm_para.h include outside __KERNEL__

2008-01-11 Thread Glauber de Oliveira Costa
kvm_para.h potentially contains definitions that are to be used by 
kvm-userspace,
so it should not be included inside the __KERNEL__ block. To protect its own 
data structures,
kvm_para.h already includes its own __KERNEL__ block.

Signed-off-by: Glauber de Oliveira Costa <[EMAIL PROTECTED]>
---
 include/linux/kvm_para.h |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/include/linux/kvm_para.h b/include/linux/kvm_para.h
index 6af91a5..5497aac 100644
--- a/include/linux/kvm_para.h
+++ b/include/linux/kvm_para.h
@@ -14,12 +14,12 @@
 
 #define KVM_HC_VAPIC_POLL_IRQ1
 
-#ifdef __KERNEL__
 /*
  * hypercalls use architecture specific
  */
 #include 
 
+#ifdef __KERNEL__
 static inline int kvm_para_has_feature(unsigned int feature)
 {
if (kvm_arch_para_features() & (1UL << feature))
-- 
1.5.0.6


-
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services for
just about anything Open Source.
http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


[kvm-devel] [PATCH 0/3] KVM clock, new iteration

2008-01-11 Thread Glauber de Oliveira Costa
Hi folks,

A lot of time has elapsed since I last sent a version of kvm clock,
with a lot of things in the middle, including my vacations and a nasty bug
in the clock itself ;-)

Here it is a new version. I'm following avi's last suggestion of using an msr
to make it more migration-proof. If it's not approved, we can easily switch back
to the hypercall mechanism. That's the easy part ;-)

Clock updates are done every time we're scheduled. It is _not_ enough to check
if we're changing cpus, because the tsc will change its frequency when the cpu 
is
idle (if using mwait idle). So after a long idle period, one cpu will end 
up stepping
ahead, causing time to go backward (remember the bug I talked about? :-) )

As I feel it is ready for inclusion (which obviously does not mean it is perfect, 
nor that
I don't value any further comments), I'm also posting the userspace patches in 
a separate
series that will follow shortly.



-
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services for
just about anything Open Source.
http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [PATCH/RFC 0/2] CPU hotplug virtio driver

2008-01-09 Thread Glauber de Oliveira Costa
Christian Borntraeger wrote:
> Am Mittwoch, 9. Januar 2008 schrieb Glauber de Oliveira Costa:
>> I'm sending a first draft of my proposed cpu hotplug driver for kvm/virtio
>> The first patch is the kernel module, while the second, the userspace pci 
> device.
> 
> I personally prefer to use non paravirtualized cpu hotplug and implement the 
> existing interfaces instead in the hypervisor. (sigp/sclp for s390 and 
> maybe acpi for x86 etc.). This is not a performance critical operation.
> 
> 
> Christian
It's barely "paravirtualized". It's a device driver talking to 
hardware from the guest POV, not much different than acpi itself is, but 
saner. The only difference is that acpi support "already exists", but 
virtio is a lot easier, and it wouldn't take long to write a driver like 
that for windows or anything else (not that I am volunteering ;-)).

Also, in the future the interface can be extended in ways acpi can't 
handle. (and just because nobody will change the hardware to embrace new 
things that fast)

That said, if acpi is really the preference here, and these patches have 
a chance, no problem. But it will take me a little more time to implement 
them ;-)

-
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services for
just about anything Open Source.
http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [PATCH/RFC 0/2] CPU hotplug virtio driver

2008-01-09 Thread Glauber de Oliveira Costa
Avi Kivity wrote:
> Glauber de Oliveira Costa wrote:
>> I'm sending a first draft of my proposed cpu hotplug driver for 
>> kvm/virtio
>> The first patch is the kernel module, while the second, the userspace 
>> pci device.
>>
>> The host boots with the maximum cpus it should ever use, through the 
>> -smp parameter.
>> Due to real machine constraints (which qemu copies), i386 does not 
>> allow for any addition
>> of cpus after boot, so this is the most general way.
>>
>> I do however, include an "attempt_buffer" in the userspace part. It's 
>> purpose is to
>> allow tools like virt-manager to set a max_cpus (-smp), and a desired 
>> number of cpus
>> in their configuration files. (and AFAICT, there's is no easy way for 
>> them to tell when the
>> backend driver is up and running)
>>
>> Other than that, it should be pretty much straightforward.
>>
>> Looking forward for your comments
>>
>>   
> 
> I would much prefer to see cpu hotplug implemented via acpi.  Such an 
> implementation would work on older kernels without change, and will also 
> work with other operating systems.  It isn't a high-speed interface so 
> virtio doesn't buy us anything.
> 
> Linux appears to support it (CONFIG_ACPI_HOTPLUG_CPU) so all that's 
> needed is the host side support (likely qemu/bios only).  Of course 
> hacking on acpi is fun, if you're the kind of person than enjoys dental 
> surgery.
> 
Given that acpi is such fun, how can it be that virtio doesn't buy us _anything_?

-
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services for
just about anything Open Source.
http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


[kvm-devel] [PATCH] cpu hotplug driver: userspace back end

2008-01-09 Thread Glauber de Oliveira Costa
Signed-off-by: Glauber de Oliveira Costa <[EMAIL PROTECTED]>
---
 qemu/Makefile.target |2 +-
 qemu/hw/pc.c |4 +-
 qemu/hw/pc.h |3 +
 qemu/hw/virtio-hotplug.c |  111 ++
 qemu/monitor.c   |   11 +
 qemu/qemu-kvm.h  |2 +
 6 files changed, 131 insertions(+), 2 deletions(-)
 create mode 100644 qemu/hw/virtio-hotplug.c

diff --git a/qemu/Makefile.target b/qemu/Makefile.target
index bb7be0f..4d5679a 100644
--- a/qemu/Makefile.target
+++ b/qemu/Makefile.target
@@ -464,7 +464,7 @@ VL_OBJS += rtl8139.o
 VL_OBJS+= hypercall.o
 
 # virtio devices
-VL_OBJS += virtio.o virtio-net.o virtio-blk.o
+VL_OBJS += virtio.o virtio-net.o virtio-blk.o virtio-hotplug.o
 
 ifeq ($(TARGET_BASE_ARCH), i386)
 # Hardware support
diff --git a/qemu/hw/pc.c b/qemu/hw/pc.c
index 3972ab4..c9d8f89 100644
--- a/qemu/hw/pc.c
+++ b/qemu/hw/pc.c
@@ -1029,7 +1029,9 @@ static void pc_init1(ram_addr_t ram_size, int 
vga_ram_size,
 }
 }
 
-#define USE_HYPERCALL
+virtio_hotplug_init(pci_bus);
+
+#undef USE_HYPERCALL
 #ifdef USE_HYPERCALL
 pci_hypercall_init(pci_bus);
 #endif
diff --git a/qemu/hw/pc.h b/qemu/hw/pc.h
index 7d1832f..3e2cfb4 100644
--- a/qemu/hw/pc.h
+++ b/qemu/hw/pc.h
@@ -151,6 +151,9 @@ void virtio_net_poll(void);
 void *virtio_blk_init(PCIBus *bus, uint16_t vendor, uint16_t device,
  BlockDriverState *bs);
 
+/* virtio-hotplug.c */
+void *virtio_hotplug_init(PCIBus *bus);
+
 /* extboot.c */
 
 void extboot_init(BlockDriverState *bs, int cmd);
diff --git a/qemu/hw/virtio-hotplug.c b/qemu/hw/virtio-hotplug.c
new file mode 100644
index 000..e51f26f
--- /dev/null
+++ b/qemu/hw/virtio-hotplug.c
@@ -0,0 +1,111 @@
+#include "virtio.h"
+#include "net.h"
+#include "pc.h"
+#include "sysemu.h"
+
+typedef struct VirtIOHotplug {
+VirtIODevice vdev;
+VirtQueue *cmd_vq;
+int buffer_ready;
+struct VirtIOHotplug *next;
+int job_status;
+} VirtIOHotplug;
+
+typedef struct VirtioHotplugHdr {
+uint8_t cmd;
+uint8_t status;
+} VirtioHotplugHdr;
+
+VirtIOHotplug *virtio_hotplug;
+uint32_t attempt_buffer;
+
+#define VIRTIO_ID_HOTPLUG 4 /* arbitrary */
+
+#define CMD_CPU_SET 1
+
+static VirtIOHotplug *to_virtio_hotplug(VirtIODevice *vdev)
+{
+return (VirtIOHotplug *)vdev;
+}
+
+static void virtio_hotplug_update_config(VirtIODevice *vdev, uint8_t *config)
+{
+/* nothing to do */
+}
+
+static uint32_t virtio_hotplug_get_features(VirtIODevice *vdev)
+{
+/* no features yet */
+return 0;
+}
+
+static void virtio_hotplug_send(VirtIODevice *vdev, VirtQueue *vq, uint8_t cmd,
+   const uint32_t arg)
+{
+VirtQueueElement elem;
+VirtioHotplugHdr *hdr;
+uint32_t *data;
+
+if (virtqueue_pop(vq, &elem) == 0) {
+fprintf(stderr, "pop failure\n"); 
+return;
+}
+
+hdr = (void *)elem.in_sg[0].iov_base;
+hdr->cmd = cmd;
+
+data = (int *)elem.in_sg[1].iov_base;
+*data = arg;
+ 
+virtqueue_push(vq, &elem, sizeof(*hdr) + elem.in_sg[1].iov_len);
+virtio_notify(vdev, vq);
+}
+
+int hotplug_send_cmd(int value)
+{
+   
+if (!virtio_hotplug->buffer_ready) {
+attempt_buffer = value;
+return;
+}
+
+virtio_hotplug_send(&virtio_hotplug->vdev, virtio_hotplug->cmd_vq, 
+CMD_CPU_SET, value);
+
+return 0;
+}
+
+/* TX */
+static void virtio_hotplug_handle_cmd(VirtIODevice *vdev, VirtQueue *vq)
+{
+VirtIOHotplug *n = to_virtio_hotplug(vdev);
+
+n->buffer_ready = 1;
+
+if (attempt_buffer) {
+uint32_t value = attempt_buffer;
+attempt_buffer = 0;
+hotplug_send_cmd(value);
+}
+
+}
+
+void *virtio_hotplug_init(PCIBus *bus)
+{
+VirtIOHotplug *n;
+
+n = (VirtIOHotplug *)virtio_init_pci(bus, "virtio-hotplug", 0x2523, 0x2325,
+0, VIRTIO_ID_HOTPLUG,
+0xff, 0x80, 0x00,
+6, sizeof(VirtIOHotplug));
+
+n->vdev.update_config = virtio_hotplug_update_config;
+n->vdev.get_features = virtio_hotplug_get_features;
+n->cmd_vq = virtio_add_queue(&n->vdev, 128, virtio_hotplug_handle_cmd);
+
+n->buffer_ready = 0;
+
+virtio_hotplug = n;
+
+return &n->vdev;
+}
diff --git a/qemu/monitor.c b/qemu/monitor.c
index e03c473..f0c55c6 100644
--- a/qemu/monitor.c
+++ b/qemu/monitor.c
@@ -346,6 +346,16 @@ static void do_cpu_set(int index)
 term_printf("Invalid CPU index\n");
 }
 
+static void do_cpu_set_nr(int value)
+{
+if ((value < 1)) {
+   term_printf("value out of range\n");
+   return;
+}
+
+hotplug_send_cmd(value);
+}
+
 static void do_info_jit(void)
 {
 dump_exec_info(NULL, monitor_fprintf);
@@ -1339,6 +1349,7 @@ static term_cmd_t term_cmds[] = {
 

[kvm-devel] [PATCH] cpu hotplug driver: kernel module

2008-01-09 Thread Glauber de Oliveira Costa
Signed-off-by: Glauber de Oliveira Costa <[EMAIL PROTECTED]>
---
 drivers/virtio/Kconfig  |6 +
 drivers/virtio/Makefile |1 +
 drivers/virtio/virtio_cpu.c |  226 +++
 drivers/virtio/virtio_pci.c |1 +
 kernel/cpu.c|2 +
 5 files changed, 236 insertions(+), 0 deletions(-)
 create mode 100644 drivers/virtio/virtio_cpu.c

diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
index 73a414c..6665d4d 100644
--- a/drivers/virtio/Kconfig
+++ b/drivers/virtio/Kconfig
@@ -23,3 +23,9 @@ config VIRTIO_PCI
 
  If unsure, say M.
 
+config VIRTIO_CPU
+   tristate "virtio-cpu"
+   depends on VIRTIO_PCI && EXPERIMENTAL
+   ---help---
+ Virtio driver that lets the host dynamically change the number
+ of online CPUs in a KVM guest (CPU hotplug/hot-unplug).
+
diff --git a/drivers/virtio/Makefile b/drivers/virtio/Makefile
index cc84999..057ebcc 100644
--- a/drivers/virtio/Makefile
+++ b/drivers/virtio/Makefile
@@ -1,3 +1,4 @@
 obj-$(CONFIG_VIRTIO) += virtio.o
 obj-$(CONFIG_VIRTIO_RING) += virtio_ring.o
 obj-$(CONFIG_VIRTIO_PCI) += virtio_pci.o
+obj-$(CONFIG_VIRTIO_CPU) += virtio_cpu.o
diff --git a/drivers/virtio/virtio_cpu.c b/drivers/virtio/virtio_cpu.c
new file mode 100644
index 000..8dc0c7d
--- /dev/null
+++ b/drivers/virtio/virtio_cpu.c
@@ -0,0 +1,226 @@
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+
+#define VIRTIO_ID_CPU 4
+
+#define CMD_CPU_SET 1
+
+/* Wire header preceding every hotplug command on the virtqueue. */
+struct virtio_hotplug_hdr {
+   u8 cmd;      /* CMD_CPU_SET is the only command so far */
+   u8 status;   /* NOTE(review): never written in this file — confirm use */
+};
+
+/* One receive buffer as posted to the host: header plus 32-bit argument. */
+struct hotplug_buf {
+   struct virtio_hotplug_hdr hdr;
+   u32 arg;     /* for CMD_CPU_SET: desired number of online CPUs */
+};
+
+/* Per-device driver state. */
+struct virtio_hotplug {
+   struct virtio_device *dev;
+   struct virtqueue *cmd_receive;       /* queue the host sends commands on */
+   struct task_struct *hotplug_thread;  /* worker that onlines/offlines CPUs */
+   wait_queue_head_t hotplug_wait;      /* wakes the worker on new commands */
+   u32 hotplug_target;                  /* desired number of online CPUs */
+   int thread_started;
+};
+
+
+/* Match any device advertising the (ad-hoc) virtio CPU device id. */
+static struct virtio_device_id id_table[] = {
+   { VIRTIO_ID_CPU, VIRTIO_DEV_ANY_ID},
+   { 0 },
+};
+
+/* Allocate a zeroed buffer for virtqueue traffic (GFP_KERNEL: may sleep). */
+static void *__alloc_virtio_buf(int size)   /* was non-static: file-local helper */
+{
+   return kzalloc(size, GFP_KERNEL);
+}
+
+#define alloc_hotplug_buf() (__alloc_virtio_buf(sizeof(struct hotplug_buf)))
+
+/* Post 'buf' on 'vq' as a two-entry scatterlist (header + argument) and
+ * kick the other side.  Returns 0 or the add_buf() error; on error the
+ * caller still owns 'buf'. */
+static int __send_hotplug_buf(u8 cmd, struct hotplug_buf *buf,
+                             struct virtqueue *vq)
+{
+   struct scatterlist sg[2];
+   int err;
+
+   /* 'cmd' was accepted but never stored; initialize the header so the
+    * buffer is well-defined even before the host fills it in. */
+   buf->hdr.cmd = cmd;
+
+   sg_init_table(sg, 2);
+
+   sg_set_buf(&sg[0], &buf->hdr, sizeof(buf->hdr));
+   sg_set_buf(&sg[1], &buf->arg, sizeof(buf->arg));
+
+   err = vq->vq_ops->add_buf(vq, sg, 0, 2, buf);
+   if (err)
+           return err;
+
+   vq->vq_ops->kick(vq);
+
+   return 0;
+}
+
+/* Prepare the virtio driver to receive a command: allocate a buffer and
+ * post it on the receive queue.  Returns 0 or a negative errno. */
+static int prepare_to_receive(struct virtio_hotplug *v)
+{
+   struct hotplug_buf *buf = alloc_hotplug_buf();
+   int err;
+
+   if (!buf)
+           return -ENOMEM;
+
+   err = __send_hotplug_buf(0, buf, v->cmd_receive);
+   if (err)
+           kfree(buf);     /* was leaked when add_buf() failed */
+
+   return err;
+}
+
+/* Drain completed command buffers, update the target CPU count, then
+ * online/offline CPUs until num_online_cpus() matches the target.
+ * NOTE(review): assumes online CPUs are the contiguous range 0..n-1,
+ * so cpu 'num_online_cpus()' is the next one to bring up — confirm. */
+static void hotplug_add_cpus(struct virtio_hotplug *v)
+{
+   struct sys_device *dev;
+   struct virtqueue *vq = v->cmd_receive;
+   struct hotplug_buf *buf;
+   int i, len;
+   int err;
+
+   /* Drain every completed buffer; the last CMD_CPU_SET wins. */
+   while ((buf = vq->vq_ops->get_buf(vq, &len)) != NULL) {
+           switch (buf->hdr.cmd) {
+           case CMD_CPU_SET:
+                   v->hotplug_target = buf->arg;
+
+                   /* Clamp: cannot online more CPUs than are present. */
+                   if (v->hotplug_target > num_present_cpus())
+                           v->hotplug_target = num_present_cpus();
+                   break;
+           default:
+                   printk("%s: Unrecognized command %d\n",
+                           __func__, buf->hdr.cmd);
+                   break;
+           }
+           /* was leaked: only the final NULL was passed to kfree() */
+           kfree(buf);
+   }
+
+   /* Bail out of both loops on error so a CPU that refuses to change
+    * state cannot spin this loop forever. */
+   while ((i = num_online_cpus()) < v->hotplug_target) {
+           err = cpu_up(i);
+           if (err)
+                   break;
+           dev = get_cpu_sysdev(i);
+           if (dev)
+                   kobject_uevent(&dev->kobj, KOBJ_ONLINE);
+   }
+
+   while ((i = num_online_cpus()) > v->hotplug_target) {
+           i--;
+           dev = get_cpu_sysdev(i);
+           err = cpu_down(i);
+           if (err)
+                   break;
+           /* was KOBJ_ONLINE: taking a CPU down must signal OFFLINE */
+           if (dev)
+                   kobject_uevent(&dev->kobj, KOBJ_OFFLINE);
+   }
+}
+
+static int virtio_do_hotplug(void *p)
+{
+   struct virtio_hotplug *v = p;
+   DEFINE_WAIT(wait);
+   set_freezable();
+
+   for (;;) {
+   prepare_to_receive(v);
+   prepare_to_wait(&v->hotplug_wait, &wait, TASK_UNINTERRUPTIBLE);
+   v->thread_started = 1;
+   schedule();
+   finish_wait(&v->hotplug_wait, &wait);
+
+   try_to_freeze();
+
+   if (kthread_should_stop())
+   break;
+
+   h

[kvm-devel] [PATCH/RFC 0/2] CPU hotplug virtio driver

2008-01-09 Thread Glauber de Oliveira Costa
I'm sending a first draft of my proposed cpu hotplug driver for kvm/virtio
The first patch is the kernel module, while the second, the userspace pci 
device.

The host boots with the maximum cpus it should ever use, through the -smp 
parameter.
Due to real machine constraints (which qemu copies), i386 does not allow for 
any addition
of cpus after boot, so this is the most general way.

I do however, include an "attempt_buffer" in the userspace part. It's purpose 
is to
allow tools like virt-manager to set a max_cpus (-smp), and a desired number of 
cpus
in their configuration files. (and AFAICT, there is no easy way for them to 
tell when the
backend driver is up and running)

Other than that, it should be pretty much straightforward.

Looking forward for your comments



-
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services for
just about anything Open Source.
http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] Guest kernel hangs in smp kvm for older kernels prior to tsc sync cleanup

2007-12-20 Thread Glauber de Oliveira Costa
On Dec 19, 2007 1:41 PM, Avi Kivity <[EMAIL PROTECTED]> wrote:
> Glauber de Oliveira Costa wrote:
> > Changes in rate does not sound good. It's possibly what's screwing up
> > my paravirt clock implementation in smp.
> >
>
> You should renew the timebase on vcpu migration, and hook cpufreq so
> that changes in frequency are reflected in the timebase.

 To be conservative, I do it in every vcpu run, and have any kind of
cpu frequency scaling disabled. And it does not work.

In a trace in the host, I see that vcpu runs happens very often in
vcpu 0 (probably because exits happen often there, so we have to go
back),
and comparatively, very few times in vcpu 1.

So what's probably happening is : vcpu 1 does system_time + tsc_delta,
 but vcpu 0 has already updated it so many times, the tsc does not
keep up,
and it ends up going backwards.

I'm running (in the host), the following test, upon module loading
(and Ingo can please tell me if I'm doing something idiotic in it,
compromising my conclusions)

void test (int foo)
{
   u64 start, stop;
   start = native_read_tsc();
   udelay(foo);
   stop = native_read_tsc();
   printk("%d Result: %lld\n", foo, foo * 1000 - cycles_2_ns(stop
- start));
}

Output is:

30 Result: -126
90 Result: 576
300 Result: 2627
1000 Result: 9381
3000 Result: 28238
5000 Result: 48086


So the delta is expected to get bigger if a vcpu goes a long time
without having its time updated.
Xen manages to keep the guest tsc stable and steady by doing
synchronization from time to time.

We can either: (If I'm right at this, of course):

* put a periodic timer in the host to update the system time from time to time;
* use some sort of global timestamp, instead of the per-cpu one.
* do something akin to what xen does, and still rely on the tsc.

Any thoughts?
-- 
Glauber de Oliveira Costa.
"Free as in Freedom"
http://glommer.net

"The less confident you are, the more serious you have to act."

-
SF.Net email is sponsored by:
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services
for just about anything Open Source.
http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] Guest kernel hangs in smp kvm for older kernels prior to tsc sync cleanup

2007-12-19 Thread Glauber de Oliveira Costa
On Dec 19, 2007 12:27 PM, Avi Kivity <[EMAIL PROTECTED]> wrote:
> Ingo Molnar wrote:
> > * Avi Kivity <[EMAIL PROTECTED]> wrote:
> >
> >
> >> Avi Kivity wrote:
> >>
> >>>  Testing shows wrmsr and rdtsc function normally.
> >>>
> >>> I'll try pinning the vcpus to cpus and see if that helps.
> >>>
> >>>
> >> It does.
> >>
> >
> > do we let the guest read the physical CPU's TSC? That would be trouble.
> >
> >
>
> vmx (and svm) allow us to add an offset to the physical tsc.  We set it
> on startup to -tsc (so that an rdtsc on boot would return 0), and
> massage it on vcpu migration so that guest rdtsc is monotonic.
>
> The net effect is that tsc on a vcpu can experience large forward jumps
> and changes in rate, but no negative jumps.
>

Changes in rate does not sound good. It's possibly what's screwing up
my paravirt clock implementation in smp.
Since the host updates guest time prior to putting vcpu to run, two
vcpus that start running at different times will have different system
values.

Now if the vcpu that started running later probes the time first,
we'll see the time going backwards. A constant tsc rate is the only way
my limited mind sees around the problem (besides, obviously, _not_
making the system time per-vcpu).


-- 
Glauber de Oliveira Costa.
"Free as in Freedom"
http://glommer.net

"The less confident you are, the more serious you have to act."

-
SF.Net email is sponsored by:
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services
for just about anything Open Source.
http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [PATCH] fix -kernel option

2007-12-10 Thread Glauber de Oliveira Costa
Dor Laor wrote:
> Glauber de Oliveira Costa wrote:
>>
>> Currently, the -kernel option is not working.
>>
>> Reason is, because we're registering chunks for regions 0-0xa and
>> 0x10-ram_size, the phys_ram_addr + PA is broken.
>> The real fix should be to rewrite all the load_linux() code to not rely
>> on this, but meanwhile, filling in the gap up to 0xc - the beginning
>> of extended memory - makes it work again
>>
>> Signed-off-by: Glauber de Oliveira Costa <[EMAIL PROTECTED]>
>> ---
>>  qemu/hw/pc.c |   11 ---
>>  1 files changed, 8 insertions(+), 3 deletions(-)
>>
>> diff --git a/qemu/hw/pc.c b/qemu/hw/pc.c
>> index 6c71b09..e4a5f2d 100644
>> --- a/qemu/hw/pc.c
>> +++ b/qemu/hw/pc.c
>> @@ -725,13 +725,18 @@ static void pc_init1(ram_addr_t ram_size, int 
>> vga_ram_size, int boot_device,
>>  #ifdef USE_KVM
>>   #ifdef KVM_CAP_USER_MEMORY
>>  if (kvm_allowed && kvm_qemu_check_extension(KVM_CAP_USER_MEMORY)) {
>> +ram_addr = qemu_ram_alloc(0xa);
>> +cpu_register_physical_memory(0, 0xa, ram_addr);
>> +kvm_cpu_register_physical_memory(0, 0xa, ram_addr);
>> +
>> +   /* move the pointer up to 0xc, which is the next
>> +   address we'll touch */
>> +qemu_ram_alloc(0x2);
>>
> It should be 0x6 instead of 0x2 since the code below should 
> start at offset of
> 0x10.
> This finally solved my problem with running virtio using kvm, before it 
> only worked for -no-kvm.
> In general this fixes phys_mem_base + PA.
> 
I disagree. We allocate the piece 0xc onwards a little bit later. So 
we only need to fill the gap 0xa -> 0xc

-
SF.Net email is sponsored by:
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services for
just about anything Open Source.
http://sourceforge.net/services/buy/index.php
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


[kvm-devel] [PATCH] fix -kernel option

2007-12-07 Thread Glauber de Oliveira Costa
Currently, the -kernel option is not working.

Reason is, because we're registering chunks for regions 0-0xa and
0x10-ram_size, the phys_ram_addr + PA is broken.
The real fix should be to rewrite all the load_linux() code to not rely
on this, but meanwhile, filling in the gap up to 0xc - the beginning
of extended memory - makes it work again

Signed-off-by: Glauber de Oliveira Costa <[EMAIL PROTECTED]>
---
 qemu/hw/pc.c |   11 ---
 1 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/qemu/hw/pc.c b/qemu/hw/pc.c
index 6c71b09..e4a5f2d 100644
--- a/qemu/hw/pc.c
+++ b/qemu/hw/pc.c
@@ -725,13 +725,18 @@ static void pc_init1(ram_addr_t ram_size, int 
vga_ram_size, int boot_device,
 #ifdef USE_KVM
  #ifdef KVM_CAP_USER_MEMORY
 if (kvm_allowed && kvm_qemu_check_extension(KVM_CAP_USER_MEMORY)) {
+ram_addr = qemu_ram_alloc(0xa);
+cpu_register_physical_memory(0, 0xa, ram_addr);
+kvm_cpu_register_physical_memory(0, 0xa, ram_addr);
+
+   /* move the pointer up to 0xc, which is the next
+   address we'll touch */
+qemu_ram_alloc(0x2);
+
 ram_addr = qemu_ram_alloc(ram_size - 0x10);
 cpu_register_physical_memory(0x10, ram_size - 0x10, ram_addr);
 kvm_cpu_register_physical_memory(0x10, ram_size - 0x10,
  ram_addr);
-ram_addr = qemu_ram_alloc(0xa);
-cpu_register_physical_memory(0, 0xa, ram_addr);
-kvm_cpu_register_physical_memory(0, 0xa, ram_addr);
 } else
  #endif
 #endif
-- 
1.5.0.6


-
SF.Net email is sponsored by:
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services for
just about anything Open Source.
http://sourceforge.net/services/buy/index.php
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [PATCH 24/24] make vsmp a paravirt client

2007-11-13 Thread Glauber de Oliveira Costa
-BEGIN PGP SIGNED MESSAGE-
Hash: SHA1

Andi Kleen escreveu:
>> The vsmp_64.c file is now compiled unconditionally, according to which
>> me and kiran agreed to. The detection code is always run, but will only
>> trigger when a suitable box is found. Accordingly, the paravirt structs
>> are only touched when PARAVIRT is on. Otherwise, we don't even have the
>> symbols.
> 
> That seems dumb. What good is it if it doesn't patch the interrupt code?

If vsmp is selected, PARAVIRT will be too, and the interrupt code will
be patched.
the vsmp option triggers a select statement.

the ifdef only exists because, as I said, the code itself will be always
compiled in, to avoid an ifdef in setup_64.c. So it's just a taking it
from here, putting it there issue. Kiran seem to prefer this way, but I
don't really have a preference.

-BEGIN PGP SIGNATURE-
Version: GnuPG v1.4.7 (GNU/Linux)
Comment: Using GnuPG with Remi - http://enigmail.mozdev.org

iD8DBQFHOZ3IjYI8LaFUWXMRAjPkAJ0XosdyMcj1j4h6XW5dVaj95NH7cgCeIW5o
CGnBnZOTGz9DIu5D997bsZ4=
=BP57
-END PGP SIGNATURE-

-
This SF.net email is sponsored by: Splunk Inc.
Still grepping through log files to find problems?  Stop.
Now Search log events and configuration files using AJAX and a browser.
Download your FREE copy of Splunk now >> http://get.splunk.com/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [PATCH 2/3] kvmclock - the host part.

2007-11-13 Thread Glauber de Oliveira Costa
-BEGIN PGP SIGNED MESSAGE-
Hash: SHA1

Dong, Eddie escreveu:
>> +static void kvm_write_guest_time(struct kvm_vcpu *vcpu) +{
>> +struct timespec ts;
>> +int r;
>> +
>> +if (!vcpu->clock_gpa)
>> +return;
>> +
>> +/* Updates version to the next odd number, indicating
>> we're writing */
>> +vcpu->hv_clock.version++;
>> +kvm_write_guest(vcpu->kvm, vcpu->clock_gpa,
>> &vcpu->hv_clock, PAGE_SIZE);
>> +
>> +kvm_get_msr(vcpu, MSR_IA32_TIME_STAMP_COUNTER,
>> +  &vcpu->hv_clock.last_tsc);
>> +
>> +ktime_get_ts(&ts);
> 
> Do we need to disable preemption here?
After thinking for a little while, you are theoretically right.
In the current state, we could even be preempted between all operations
;-) Maybe after avi's suggestion of moving the call to it it will end up
in a preempt safe region, but anyway, it's safer to add the preempt
markers here.
I'll put it in next version, thanks

-BEGIN PGP SIGNATURE-
Version: GnuPG v1.4.7 (GNU/Linux)
Comment: Using GnuPG with Remi - http://enigmail.mozdev.org

iD8DBQFHOZBrjYI8LaFUWXMRAo81AKCfbkzhLl7F6BUjzUHVyErCFeHxFACg1teB
eqsOnJiAqB3JiYf+2YdMZ4o=
=ENKU
-END PGP SIGNATURE-

-
This SF.net email is sponsored by: Splunk Inc.
Still grepping through log files to find problems?  Stop.
Now Search log events and configuration files using AJAX and a browser.
Download your FREE copy of Splunk now >> http://get.splunk.com/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [PATCH 0/24] paravirt_ops for unified x86 - that's me again!

2007-11-13 Thread Glauber de Oliveira Costa
-BEGIN PGP SIGNED MESSAGE-
Hash: SHA1

Amit Shah escreveu:
> On Saturday 10 November 2007 00:12:41 Glauber de Oliveira Costa wrote:
>> Hey folks,
>>
>> Here's a new spin of the pvops64 patch series.
>> We didn't get that many comments from the last time,
>> so it should be probably almost ready to get in. Heya!
>>
>> >From the last version, the most notable changes are:
>>
>> * consolidation of system.h, merging jeremy's comments about ordering
>>   concerns
>> * consolidation of smp functions that goes through smp_ops. They're sharing
>>   a bunch of code now.
>>
>> Other than that, just some issues that arose from the rebase.
>>
>> Please, not that this patch series _does not_ apply over linus git anymore,
>> but rather, over tglx cleanup series.
>>
>> The first patch in this series is already on linus', but not on tglx', so
>> I'm sending it again, because you'll need it if you want to compile it
>> anyway.
>>
>> tglx, in the absense of any outstanding NACKs, or any very big call for
>> improvements, could you please pull it in your tree?
>>
>> Have fun,
> 
> Glauber, are you planning on consolidating the dma_ops structure for 32- and 
> 64-bit? 32-bit doesn't currently have a dma_mapping_ops structure, which 
> makes paravirtualizing DMA access difficult on 32-bit.
Until its get merged, definitely not. Although important, this is
significant work, and can delay us even more. But I was not planning to
do it at all (well, you were the first that raised the issue...)

So if the reason for your question is you are planning to work on it, go
ahead ;-)
-BEGIN PGP SIGNATURE-
Version: GnuPG v1.4.7 (GNU/Linux)
Comment: Using GnuPG with Remi - http://enigmail.mozdev.org

iD8DBQFHOY3mjYI8LaFUWXMRAvnqAJ4ridsG0ZB2aI7U36hbZFBO0PoDgQCgqvjc
n6RbVz7Jw8t9qCyKUN+hLGg=
=crYj
-END PGP SIGNATURE-

-
This SF.net email is sponsored by: Splunk Inc.
Still grepping through log files to find problems?  Stop.
Now Search log events and configuration files using AJAX and a browser.
Download your FREE copy of Splunk now >> http://get.splunk.com/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [PATCH 24/24] make vsmp a paravirt client

2007-11-13 Thread Glauber de Oliveira Costa
-BEGIN PGP SIGNED MESSAGE-
Hash: SHA1

Andi Kleen escreveu:
> On Fri, Nov 09, 2007 at 04:43:05PM -0200, Glauber de Oliveira Costa wrote:
>> This patch makes vsmp a paravirt client. It now uses the whole
>> infrastructure provided by pvops. When we detect we're running
>> a vsmp box, we change the irq-related paravirt operations (and so,
>> it have to happen quite early), and the patching function
> 
> The PARAVIRT ifdefs look wrong. Surely you don't need them at all
> because it cannot work at all without paravirt.

The vsmp_64.c file is now compiled unconditionally, according to which
me and kiran agreed to. The detection code is always run, but will only
trigger when a suitable box is found. Accordingly, the paravirt structs
are only touched when PARAVIRT is on. Otherwise, we don't even have the
symbols.

> Also you got some white space damage.

Thanks, will fix.

> And the "EM64T based comment" is wrong because there are AMD based 
> vSMPs too.
Just got it as-is from the old Kconfig. Do you think it should be fixed
as well?

> -Andi

-BEGIN PGP SIGNATURE-
Version: GnuPG v1.4.7 (GNU/Linux)
Comment: Using GnuPG with Remi - http://enigmail.mozdev.org

iD8DBQFHOYxKjYI8LaFUWXMRApS3AJwJSYjW4Lw3dnPR4yMNfXABnMoQQQCcDMnf
3wBQoPjGO/8HO1Os4Y21vIU=
=S+KO
-END PGP SIGNATURE-

-
This SF.net email is sponsored by: Splunk Inc.
Still grepping through log files to find problems?  Stop.
Now Search log events and configuration files using AJAX and a browser.
Download your FREE copy of Splunk now >> http://get.splunk.com/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [PATCH 2/3] kvmclock - the host part.

2007-11-13 Thread Glauber de Oliveira Costa
-BEGIN PGP SIGNED MESSAGE-
Hash: SHA1

Avi Kivity escreveu:
> Glauber de Oliveira Costa wrote:
>> This is the host part of kvm clocksource implementation. As it does
>> not include clockevents, it is a fairly simple implementation. We
>> only have to register a per-vcpu area, and start writting to it
>> periodically.
>>
>>   
> 
> Missing live migration support  (a way for userspace to read and write
> the guest clock address).  Should probably be in a separate patch.

I think it's a matter of issuing a hypercall for reading the clock
address. It's fair simple, and can be done in a later version of this patch.
As for writting, the register hypercall itself can be used. It has no
special side-effects we should care about.

>> @@ -1924,6 +1955,7 @@ out:
>>  goto preempted;
>>  }
>>  
>> +kvm_write_guest_time(vcpu);
>>  post_kvm_run_save(vcpu, kvm_run);
>>   
> 
> Why here?  Seems like we're leaving the guest for a while at this place.
> 
> Suggest putting it on top of __vcpu_run(), guarded by a flag, and
> setting the flag every time we put the vcpu.

No special preference. It just sounded exity enough to me. I can move to
where you suggest.

-BEGIN PGP SIGNATURE-
Version: GnuPG v1.4.7 (GNU/Linux)
Comment: Using GnuPG with Remi - http://enigmail.mozdev.org

iD8DBQFHOYpojYI8LaFUWXMRApf8AJ4jQ/ZTBlub1IwFkJrYZyart+f7bwCfT+9m
l1Rblsmw96ZatCf60g2dNYY=
=DBpn
-END PGP SIGNATURE-

-
This SF.net email is sponsored by: Splunk Inc.
Still grepping through log files to find problems?  Stop.
Now Search log events and configuration files using AJAX and a browser.
Download your FREE copy of Splunk now >> http://get.splunk.com/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [PATCH 5/24] smp x86 consolidation

2007-11-09 Thread Glauber de Oliveira Costa
-BEGIN PGP SIGNED MESSAGE-
Hash: SHA1

Jeremy Fitzhardinge escreveu:
> Glauber de Oliveira Costa wrote:
>> This patch consolidates part of the pieces of smp for both architectures.
>> (i386 and x86_64). It makes part the calls go through smp_ops, and shares
>> code for those functions in smpcommon.c
>>
>> There's more room for code sharing here, but it is left as an exercise to
>> the reader ;-)
>>   
> 
> I'm getting link errors in 32-bit:
> 
> arch/x86/kernel/built-in.o: In function `native_smp_send_reschedule':
> /home/jeremy/hg/xen/paravirt/linux/arch/x86/kernel/smpcommon.c:262: undefined 
> reference to `genapic'
> arch/x86/kernel/built-in.o: In function `native_smp_call_function_mask':
> /home/jeremy/hg/xen/paravirt/linux/arch/x86/kernel/smpcommon.c:113: undefined 
> reference to `genapic'
> 
Ok, it compiled just fine here. I bet it's due to  one of that i386 lots
of variants.
Which subarchitecture are you compiling for, jeremy ?
> J

-BEGIN PGP SIGNATURE-
Version: GnuPG v1.4.7 (GNU/Linux)
Comment: Using GnuPG with Remi - http://enigmail.mozdev.org

iD8DBQFHNQmKjYI8LaFUWXMRAlrLAKCHOb28oE/veBkbVeJZDCbjE8OADwCg81ye
nsBfBe2iLOFHF6dxT4mRauc=
=xjo4
-END PGP SIGNATURE-

-
This SF.net email is sponsored by: Splunk Inc.
Still grepping through log files to find problems?  Stop.
Now Search log events and configuration files using AJAX and a browser.
Download your FREE copy of Splunk now >> http://get.splunk.com/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [PATCH 1/24] mm/sparse-vmemmap.c: make sure init_mm is included

2007-11-09 Thread Glauber de Oliveira Costa
-BEGIN PGP SIGNED MESSAGE-
Hash: SHA1

Jeremy Fitzhardinge escreveu:
> Glauber de Oliveira Costa wrote:
>> mm/sparse-vmemmap.c uses init_mm in some places.  However, it is not
>> present in any of the headers currently included in the file.
>>
>> init_mm is defined as extern in sched.h, so we add it to the headers list
>>
>> Up to now, this problem was masked by the fact that functions like
>> set_pte_at() and pmd_populate_kernel() are usually macros that expand to
>> simpler variants that does not use the first parameter at all.
>>
>> Signed-off-by: Glauber de Oliveira Costa <[EMAIL PROTECTED]>
>> Signed-off-by: Andrew Morton <[EMAIL PROTECTED]>
>> Signed-off-by: Linus Torvalds <[EMAIL PROTECTED]>
>> ---
>>  mm/sparse-vmemmap.c |1 +
>>  1 files changed, 1 insertions(+), 0 deletions(-)
>>
>> diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
>> index d3b718b..22620f6 100644
>> --- a/mm/sparse-vmemmap.c
>> +++ b/mm/sparse-vmemmap.c
>> @@ -24,6 +24,7 @@
>>  #include 
>>  #include 
>>  #include 
>> +#include 
>>   
> 
> This is already in git.
> 
> J
As I told in the 0th message, yes, I'm aware.
Just it does not seem to be in tglx's , so if people are willing to try
this out, they'll needed. Thus it's included in the series.

-BEGIN PGP SIGNATURE-
Version: GnuPG v1.4.7 (GNU/Linux)
Comment: Using GnuPG with Remi - http://enigmail.mozdev.org

iD8DBQFHNQjijYI8LaFUWXMRAn+UAJ48V1EyWoXkWu1+J0Y0ze59H7ZG2QCcDdgW
4qVJgJcQDfJrvZk8TSo901s=
=RwdO
-END PGP SIGNATURE-

-
This SF.net email is sponsored by: Splunk Inc.
Still grepping through log files to find problems?  Stop.
Now Search log events and configuration files using AJAX and a browser.
Download your FREE copy of Splunk now >> http://get.splunk.com/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


[kvm-devel] [PATCH 6/24] Add debugreg/load_rsp native hooks

2007-11-09 Thread Glauber de Oliveira Costa
This patch adds native hooks for debugreg handling functions,
and for the native load_rsp0 function. The later also have its
call sites patched. There's some room for consolidation in the
processor*.h headers, and it is done, for paravirt related functions

Signed-off-by: Glauber de Oliveira Costa <[EMAIL PROTECTED]>
Signed-off-by: Steven Rostedt <[EMAIL PROTECTED]>
Acked-by: Jeremy Fitzhardinge <[EMAIL PROTECTED]>
---
 arch/x86/kernel/process_64.c   |2 +-
 arch/x86/kernel/smpboot_64.c   |2 +-
 include/asm-x86/msr.h  |   67 --
 include/asm-x86/processor.h|  146 
 include/asm-x86/processor_32.h |  138 +-
 include/asm-x86/processor_64.h |   32 -
 6 files changed, 165 insertions(+), 222 deletions(-)

diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 6472f37..9647a10 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -589,7 +589,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct 
*next_p)
/*
 * Reload esp0, LDT and the page table pointer:
 */
-   tss->rsp0 = next->rsp0;
+   load_esp0(tss, next);
 
/* 
 * Switch DS and ES.
diff --git a/arch/x86/kernel/smpboot_64.c b/arch/x86/kernel/smpboot_64.c
index 17e54fa..8fb0f90 100644
--- a/arch/x86/kernel/smpboot_64.c
+++ b/arch/x86/kernel/smpboot_64.c
@@ -613,7 +613,7 @@ do_rest:
start_rip = setup_trampoline();
 
init_rsp = c_idle.idle->thread.rsp;
-   per_cpu(init_tss,cpu).rsp0 = init_rsp;
+   load_esp0(&per_cpu(init_tss, cpu), &c_idle.idle->thread);
initial_code = start_secondary;
clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
 
diff --git a/include/asm-x86/msr.h b/include/asm-x86/msr.h
index ba4b314..48f73c7 100644
--- a/include/asm-x86/msr.h
+++ b/include/asm-x86/msr.h
@@ -253,73 +253,6 @@ static inline int wrmsr_safe_on_cpu(unsigned int cpu, u32 
msr_no, u32 l, u32 h)
  : "=a" (low), "=d" (high) \
  : "c" (counter))
 
-static inline void cpuid(int op, unsigned int *eax, unsigned int *ebx,
-unsigned int *ecx, unsigned int *edx)
-{
-   __asm__("cpuid"
-   : "=a" (*eax),
- "=b" (*ebx),
- "=c" (*ecx),
- "=d" (*edx)
-   : "0" (op));
-}
-
-/* Some CPUID calls want 'count' to be placed in ecx */
-static inline void cpuid_count(int op, int count, int *eax, int *ebx, int *ecx,
-  int *edx)
-{
-   __asm__("cpuid"
-   : "=a" (*eax),
- "=b" (*ebx),
- "=c" (*ecx),
- "=d" (*edx)
-   : "0" (op), "c" (count));
-}
-
-/*
- * CPUID functions returning a single datum
- */
-static inline unsigned int cpuid_eax(unsigned int op)
-{
-   unsigned int eax;
-
-   __asm__("cpuid"
-   : "=a" (eax)
-   : "0" (op)
-   : "bx", "cx", "dx");
-   return eax;
-}
-static inline unsigned int cpuid_ebx(unsigned int op)
-{
-   unsigned int eax, ebx;
-
-   __asm__("cpuid"
-   : "=a" (eax), "=b" (ebx)
-   : "0" (op)
-   : "cx", "dx" );
-   return ebx;
-}
-static inline unsigned int cpuid_ecx(unsigned int op)
-{
-   unsigned int eax, ecx;
-
-   __asm__("cpuid"
-   : "=a" (eax), "=c" (ecx)
-   : "0" (op)
-   : "bx", "dx" );
-   return ecx;
-}
-static inline unsigned int cpuid_edx(unsigned int op)
-{
-   unsigned int eax, edx;
-
-   __asm__("cpuid"
-   : "=a" (eax), "=d" (edx)
-   : "0" (op)
-   : "bx", "cx");
-   return edx;
-}
-
 #ifdef CONFIG_SMP
 void rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h);
 void wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h);
diff --git a/include/asm-x86/processor.h b/include/asm-x86/processor.h
index 46e1c04..a576a72 100644
--- a/include/asm-x86/processor.h
+++ b/include/asm-x86/processor.h
@@ -1,5 +1,151 @@
+#ifndef _X86_PROCESSOR_H_
+#define _X86_PROCESSOR_H_
+
+#include 
+#include 
+
+static inline void native_cpuid(unsigned int *eax, unsigned int *ebx,
+   unsigned int *ecx, unsigned int *edx)
+{
+   /* ecx is often an input as well as an output. */
+   __asm__("cpuid"
+   : "=a" (*eax),
+ "=b" (*ebx),
+

[kvm-devel] [PATCH 24/24] make vsmp a paravirt client

2007-11-09 Thread Glauber de Oliveira Costa
This patch makes vsmp a paravirt client. It now uses the whole
infrastructure provided by pvops. When we detect we're running
a vsmp box, we change the irq-related paravirt operations (and so,
it have to happen quite early), and the patching function

Signed-off-by: Glauber de Oliveira Costa <[EMAIL PROTECTED]>
Signed-off-by: Steven Rostedt <[EMAIL PROTECTED]>
Acked-by: Jeremy Fitzhardinge <[EMAIL PROTECTED]>
---
 arch/x86/Kconfig.x86_64 |3 +-
 arch/x86/kernel/Makefile_64 |3 +-
 arch/x86/kernel/setup_64.c  |2 +
 arch/x86/kernel/vsmp_64.c   |   82 +-
 include/asm-x86/setup.h |6 ++-
 5 files changed, 80 insertions(+), 16 deletions(-)

diff --git a/arch/x86/Kconfig.x86_64 b/arch/x86/Kconfig.x86_64
index 568ba7a..fe4ef61 100644
--- a/arch/x86/Kconfig.x86_64
+++ b/arch/x86/Kconfig.x86_64
@@ -148,15 +148,14 @@ config X86_PC
bool "PC-compatible"
help
  Choose this option if your computer is a standard PC or compatible.
-
 config X86_VSMP
bool "Support for ScaleMP vSMP"
depends on PCI
+   select PARAVIRT
 help
  Support for ScaleMP vSMP systems.  Say 'Y' here if this kernel is
  supposed to run on these EM64T-based machines.  Only choose this 
option
  if you have one of these machines.
-
 endchoice
 
 choice
diff --git a/arch/x86/kernel/Makefile_64 b/arch/x86/kernel/Makefile_64
index 0714528..34e5c7d 100644
--- a/arch/x86/kernel/Makefile_64
+++ b/arch/x86/kernel/Makefile_64
@@ -8,7 +8,7 @@ obj-y   := process_64.o signal_64.o entry_64.o traps_64.o 
irq_64.o \
x8664_ksyms_64.o i387_64.o syscall_64.o vsyscall_64.o \
setup64.o bootflag.o e820_64.o reboot_64.o quirks.o i8237.o \
pci-dma_64.o pci-nommu_64.o alternative.o hpet.o tsc_64.o 
bugs_64.o \
-   i8253.o rtc.o
+   i8253.o rtc.o vsmp_64.o
 
 obj-$(CONFIG_STACKTRACE)   += stacktrace.o
 obj-y  += cpu/
@@ -29,7 +29,6 @@ obj-$(CONFIG_CALGARY_IOMMU)   += pci-calgary_64.o tce_64.o
 obj-$(CONFIG_SWIOTLB)  += pci-swiotlb_64.o
 obj-$(CONFIG_KPROBES)  += kprobes_64.o
 obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o
-obj-$(CONFIG_X86_VSMP) += vsmp_64.o
 obj-$(CONFIG_K8_NB)+= k8.o
 obj-$(CONFIG_AUDIT)+= audit_64.o
 
diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c
index 1c9f237..8cc5915 100644
--- a/arch/x86/kernel/setup_64.c
+++ b/arch/x86/kernel/setup_64.c
@@ -338,6 +338,8 @@ void __init setup_arch(char **cmdline_p)
 
init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT));
 
+   vsmp_init();
+
dmi_scan_machine();
 
 #ifdef CONFIG_SMP
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
index d971210..24b0541 100644
--- a/arch/x86/kernel/vsmp_64.c
+++ b/arch/x86/kernel/vsmp_64.c
@@ -8,31 +8,95 @@
  *
  * Ravikiran Thirumalai <[EMAIL PROTECTED]>,
  * Shai Fultheim <[EMAIL PROTECTED]>
+ * Paravirt ops integration: Glauber de Oliveira Costa <[EMAIL PROTECTED]>
  */
-
 #include 
 #include 
 #include 
 #include 
 #include 
+#include 
+
+/*
+ * Interrupt control for the VSMP architecture:
+ */
+
+static inline unsigned long vsmp_save_fl(void)
+{
+   unsigned long flags = native_save_fl();
+
+   if (!(flags & X86_EFLAGS_AC))
+   return X86_EFLAGS_IF;
+   return 0;
+}
+
+static inline void vsmp_restore_fl(unsigned long flags)
+{
+   if (flags & X86_EFLAGS_IF)
+   flags &= ~X86_EFLAGS_AC;
+   if (!(flags & X86_EFLAGS_IF))
+   flags &= X86_EFLAGS_AC;
+   native_restore_fl(flags);
+}
+
+static inline void vsmp_irq_disable(void)
+{
+   unsigned long flags = native_save_fl();
 
-static int __init vsmp_init(void)
+   vsmp_restore_fl(flags & ~X86_EFLAGS_IF);
+}
+
+static inline void vsmp_irq_enable(void)
+{
+   unsigned long flags = native_save_fl();
+
+   vsmp_restore_fl(flags | X86_EFLAGS_IF);
+}
+
+#ifdef CONFIG_PARAVIRT
+static unsigned __init vsmp_patch(u8 type, u16 clobbers, void *ibuf,
+ unsigned long addr, unsigned len)
+{
+   switch (type) {
+   case PARAVIRT_PATCH(pv_irq_ops.irq_enable):
+   case PARAVIRT_PATCH(pv_irq_ops.irq_disable):
+   case PARAVIRT_PATCH(pv_irq_ops.save_fl):
+   case PARAVIRT_PATCH(pv_irq_ops.restore_fl):
+   return paravirt_patch_default(type, clobbers, ibuf, addr, len);
+   default:
+   return native_patch(type, clobbers, ibuf, addr, len);
+   }
+
+}
+#endif
+
+void __init vsmp_init(void)
 {
void *address;
-   unsigned int cap, ctl;
+   unsigned int cap, ctl, cfg;
 
if (!early_pci_allowed())
-   return 0;
+   return;
 
/* Check if we are running on a ScaleMP vSMP box */
if ((read_pci_config_16(0, 0x1f, 0

[kvm-devel] [PATCH 20/24] tweak io_64.h for paravirt.

2007-11-09 Thread Glauber de Oliveira Costa
We need something here because we can't call the in and out instructions
directly. However, we have to be careful, because no indirections are
allowed in misc_64.c, and paravirt_ops is a kind of one. So just
call it directly there

Signed-off-by: Glauber de Oliveira Costa <[EMAIL PROTECTED]>
Signed-off-by: Steven Rostedt <[EMAIL PROTECTED]>
Acked-by: Jeremy Fitzhardinge <[EMAIL PROTECTED]>
---
 arch/x86/boot/compressed/misc_64.c |6 +
 include/asm-x86/io_64.h|   37 +--
 2 files changed, 36 insertions(+), 7 deletions(-)

diff --git a/arch/x86/boot/compressed/misc_64.c 
b/arch/x86/boot/compressed/misc_64.c
index 6ea015a..6640a17 100644
--- a/arch/x86/boot/compressed/misc_64.c
+++ b/arch/x86/boot/compressed/misc_64.c
@@ -9,6 +9,12 @@
  * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
  */
 
+/*
+ * we have to be careful, because no indirections are allowed here, and
+ * paravirt_ops is a kind of one. As it will only run in baremetal anyway,
+ * we just keep it from happening
+ */
+#undef CONFIG_PARAVIRT
 #define _LINUX_STRING_H_ 1
 #define __LINUX_BITMAP_H 1
 
diff --git a/include/asm-x86/io_64.h b/include/asm-x86/io_64.h
index a037b07..57fcdd9 100644
--- a/include/asm-x86/io_64.h
+++ b/include/asm-x86/io_64.h
@@ -35,12 +35,24 @@
   *  - Arnaldo Carvalho de Melo <[EMAIL PROTECTED]>
   */
 
-#define __SLOW_DOWN_IO "\noutb %%al,$0x80"
+static inline void native_io_delay(void)
+{
+   asm volatile("outb %%al,$0x80" : : : "memory");
+}
 
-#ifdef REALLY_SLOW_IO
-#define __FULL_SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO 
__SLOW_DOWN_IO
+#if defined(CONFIG_PARAVIRT)
+#include 
 #else
-#define __FULL_SLOW_DOWN_IO __SLOW_DOWN_IO
+
+static inline void slow_down_io(void)
+{
+   native_io_delay();
+#ifdef REALLY_SLOW_IO
+   native_io_delay();
+   native_io_delay();
+   native_io_delay();
+#endif
+}
 #endif
 
 /*
@@ -52,9 +64,15 @@ static inline void out##s(unsigned x value, unsigned short 
port) {
 #define __OUT2(s,s1,s2) \
 __asm__ __volatile__ ("out" #s " %" s1 "0,%" s2 "1"
 
+#ifndef REALLY_SLOW_IO
+#define REALLY_SLOW_IO
+#define UNSET_REALLY_SLOW_IO
+#endif
+
 #define __OUT(s,s1,x) \
 __OUT1(s,x) __OUT2(s,s1,"w") : : "a" (value), "Nd" (port)); } \
-__OUT1(s##_p,x) __OUT2(s,s1,"w") __FULL_SLOW_DOWN_IO : : "a" (value), "Nd" 
(port));} \
+__OUT1(s##_p, x) __OUT2(s, s1, "w") : : "a" (value), "Nd" (port)); \
+   slow_down_io(); }
 
 #define __IN1(s) \
 static inline RETURN_TYPE in##s(unsigned short port) { RETURN_TYPE _v;
@@ -63,8 +81,13 @@ static inline RETURN_TYPE in##s(unsigned short port) { 
RETURN_TYPE _v;
 __asm__ __volatile__ ("in" #s " %" s2 "1,%" s1 "0"
 
 #define __IN(s,s1,i...) \
-__IN1(s) __IN2(s,s1,"w") : "=a" (_v) : "Nd" (port) ,##i ); return _v; } \
-__IN1(s##_p) __IN2(s,s1,"w") __FULL_SLOW_DOWN_IO : "=a" (_v) : "Nd" (port) 
,##i ); return _v; } \
+__IN1(s) __IN2(s, s1, "w") : "=a" (_v) : "Nd" (port), ##i); return _v; } \
+__IN1(s##_p) __IN2(s, s1, "w") : "=a" (_v) : "Nd" (port), ##i);  \
+   slow_down_io(); return _v; }
+
+#ifdef UNSET_REALLY_SLOW_IO
+#undef REALLY_SLOW_IO
+#endif
 
 #define __INS(s) \
 static inline void ins##s(unsigned short port, void * addr, unsigned long 
count) \
-- 
1.4.4.2


-
This SF.net email is sponsored by: Splunk Inc.
Still grepping through log files to find problems?  Stop.
Now Search log events and configuration files using AJAX and a browser.
Download your FREE copy of Splunk now >> http://get.splunk.com/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


[kvm-devel] [PATCH 18/24] export cpu_gdt_descr

2007-11-09 Thread Glauber de Oliveira Costa
With paravirtualization, hypervisors need to handle the gdt,
which was up to this point only used in very early
initialization code. Hypervisors (lguest being the current case)
are commonly modules, so make it an export

Signed-off-by: Glauber de Oliveira Costa <[EMAIL PROTECTED]>
Signed-off-by: Steven Rostedt <[EMAIL PROTECTED]>
Acked-by: Jeremy Fitzhardinge <[EMAIL PROTECTED]>
---
 arch/x86/kernel/x8664_ksyms_64.c |6 ++
 1 files changed, 6 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 105712e..f97aed4 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -8,6 +8,7 @@
 #include 
 #include 
 #include 
+#include 
 
 EXPORT_SYMBOL(kernel_thread);
 
@@ -51,3 +52,8 @@ EXPORT_SYMBOL(__memcpy);
 EXPORT_SYMBOL(load_gs_index);
 
 EXPORT_SYMBOL(_proxy_pda);
+
+#ifdef CONFIG_PARAVIRT
+/* Virtualized guests may want to use it */
+EXPORT_SYMBOL_GPL(cpu_gdt_descr);
+#endif
-- 
1.4.4.2


-
This SF.net email is sponsored by: Splunk Inc.
Still grepping through log files to find problems?  Stop.
Now Search log events and configuration files using AJAX and a browser.
Download your FREE copy of Splunk now >> http://get.splunk.com/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


[kvm-devel] [PATCH 16/24] add native functions for descriptors handling

2007-11-09 Thread Glauber de Oliveira Costa
This patch turns the basic descriptor handling into native_
functions. It is basically write_idt, load_idt, write_gdt,
load_gdt, set_ldt, store_tr, load_tls, and the ones
for updating a single entry.

In the process of doing that, we change the definition of
load_LDT_nolock, and caller sites have to be patched. We
also patch call sites that now needs a typecast.

Signed-off-by: Glauber de Oliveira Costa <[EMAIL PROTECTED]>
Signed-off-by: Steven Rostedt <[EMAIL PROTECTED]>
Acked-by: Jeremy Fitzhardinge <[EMAIL PROTECTED]>
---
 include/asm-x86/desc.h   |   59 
 include/asm-x86/desc_32.h|   45 -
 include/asm-x86/desc_64.h|  191 ++---
 include/asm-x86/mmu_context_64.h |   23 -
 4 files changed, 169 insertions(+), 149 deletions(-)

diff --git a/include/asm-x86/desc.h b/include/asm-x86/desc.h
index 6065c50..276dc6e 100644
--- a/include/asm-x86/desc.h
+++ b/include/asm-x86/desc.h
@@ -1,5 +1,64 @@
+#ifndef _ASM_DESC_H_
+#define _ASM_DESC_H_
+
 #ifdef CONFIG_X86_32
 # include "desc_32.h"
 #else
 # include "desc_64.h"
 #endif
+
+#ifndef __ASSEMBLY__
+#define LDT_entry_a(info) \
+   info)->base_addr & 0x) << 16) | ((info)->limit & 0x0))
+
+#define LDT_entry_b(info) \
+   (((info)->base_addr & 0xff00) | \
+   (((info)->base_addr & 0x00ff) >> 16) | \
+   ((info)->limit & 0xf) | \
+   (((info)->read_exec_only ^ 1) << 9) | \
+   ((info)->contents << 10) | \
+   (((info)->seg_not_present ^ 1) << 15) | \
+   ((info)->seg_32bit << 22) | \
+   ((info)->limit_in_pages << 23) | \
+   ((info)->useable << 20) | \
+   0x7000)
+
+#define _LDT_empty(info) (\
+   (info)->base_addr   == 0&& \
+   (info)->limit   == 0&& \
+   (info)->contents== 0&& \
+   (info)->read_exec_only  == 1&& \
+   (info)->seg_32bit   == 0&& \
+   (info)->limit_in_pages  == 0&& \
+   (info)->seg_not_present == 1&& \
+   (info)->useable == 0)
+
+#ifdef CONFIG_X86_64
+#define LDT_empty(info) (_LDT_empty(info) && ((info)->lm == 0))
+#else
+#define LDT_empty(info) _LDT_empty(info)
+#endif
+
+static inline void clear_LDT(void)
+{
+   set_ldt(NULL, 0);
+}
+
+/*
+ * load one particular LDT into the current CPU
+ */
+static inline void load_LDT_nolock(mm_context_t *pc)
+{
+   set_ldt(pc->ldt, pc->size);
+}
+
+static inline void load_LDT(mm_context_t *pc)
+{
+   preempt_disable();
+   load_LDT_nolock(pc);
+   preempt_enable();
+}
+
+#endif /* __ASSEMBLY__ */
+
+#endif
diff --git a/include/asm-x86/desc_32.h b/include/asm-x86/desc_32.h
index c547403..84bb843 100644
--- a/include/asm-x86/desc_32.h
+++ b/include/asm-x86/desc_32.h
@@ -162,51 +162,6 @@ static inline void __set_tss_desc(unsigned int cpu, 
unsigned int entry, const vo
 
 #define set_tss_desc(cpu,addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr)
 
-#define LDT_entry_a(info) \
-   info)->base_addr & 0x) << 16) | ((info)->limit & 0x0))
-
-#define LDT_entry_b(info) \
-   (((info)->base_addr & 0xff00) | \
-   (((info)->base_addr & 0x00ff) >> 16) | \
-   ((info)->limit & 0xf) | \
-   (((info)->read_exec_only ^ 1) << 9) | \
-   ((info)->contents << 10) | \
-   (((info)->seg_not_present ^ 1) << 15) | \
-   ((info)->seg_32bit << 22) | \
-   ((info)->limit_in_pages << 23) | \
-   ((info)->useable << 20) | \
-   0x7000)
-
-#define LDT_empty(info) (\
-   (info)->base_addr   == 0&& \
-   (info)->limit   == 0&& \
-   (info)->contents== 0&& \
-   (info)->read_exec_only  == 1&& \
-   (info)->seg_32bit   == 0&& \
-   (info)->limit_in_pages  == 0&& \
-   (info)->seg_not_present == 1&& \
-   (info)->useable == 0)
-
-static inline void clear_LDT(void)
-{
-   set_ldt(NULL, 0);
-}
-
-/*
- * load one particular LDT into the current CPU
- */
-static inline void load_LDT_nolock(mm_context_t *pc)
-{
-   set_ldt(pc->ldt, pc->size);
-}
-
-static inline void load_LDT(mm_context_t *pc)
-{
-   preempt_disable();
-   load_LDT_nolock(pc);
-   preempt_enable();
-}
-
 static inline unsigned long get_desc_base(unsigned long *desc)
 {
unsigned long base;
diff --git a/include/asm-x86/desc_64.h b/include/asm-x86/desc_64.h
index 7d48df7..d12cd07 100644
--- a/include/asm-x86/desc_64.h
+++ b/include/asm-x86/desc_64.h
@@ -16,11 +16,12 @@
 
 extern struct desc_struct cpu_gdt_tabl

[kvm-devel] [PATCH 21/24] native versions for page table entries values

2007-11-09 Thread Glauber de Oliveira Costa
This patch turns the page operations (set and make a page table)
into native_ versions. The operations themselves will later be
overridden by paravirt.

It uses unsigned long long for consistency with 32-bit. So we
have to fix fault_64.c to get rid of warnings.

Signed-off-by: Glauber de Oliveira Costa <[EMAIL PROTECTED]>
Signed-off-by: Steven Rostedt <[EMAIL PROTECTED]>
Acked-by: Jeremy Fitzhardinge <[EMAIL PROTECTED]>
---
 arch/x86/mm/fault_64.c|8 +++---
 include/asm-x86/page_64.h |   56 +
 2 files changed, 55 insertions(+), 9 deletions(-)

diff --git a/arch/x86/mm/fault_64.c b/arch/x86/mm/fault_64.c
index 161c0d1..86b7307 100644
--- a/arch/x86/mm/fault_64.c
+++ b/arch/x86/mm/fault_64.c
@@ -157,22 +157,22 @@ void dump_pagetable(unsigned long address)
pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK); 
pgd += pgd_index(address);
if (bad_address(pgd)) goto bad;
-   printk("PGD %lx ", pgd_val(*pgd));
+   printk("PGD %llx ", pgd_val(*pgd));
if (!pgd_present(*pgd)) goto ret; 
 
pud = pud_offset(pgd, address);
if (bad_address(pud)) goto bad;
-   printk("PUD %lx ", pud_val(*pud));
+   printk("PUD %llx ", pud_val(*pud));
if (!pud_present(*pud)) goto ret;
 
pmd = pmd_offset(pud, address);
if (bad_address(pmd)) goto bad;
-   printk("PMD %lx ", pmd_val(*pmd));
+   printk("PMD %llx ", pmd_val(*pmd));
if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;
 
pte = pte_offset_kernel(pmd, address);
if (bad_address(pte)) goto bad;
-   printk("PTE %lx", pte_val(*pte)); 
+   printk("PTE %llx", pte_val(*pte));
 ret:
printk("\n");
return;
diff --git a/include/asm-x86/page_64.h b/include/asm-x86/page_64.h
index 6fdc904..b8da60c 100644
--- a/include/asm-x86/page_64.h
+++ b/include/asm-x86/page_64.h
@@ -65,16 +65,62 @@ typedef struct { unsigned long pgprot; } pgprot_t;
 
 extern unsigned long phys_base;
 
-#define pte_val(x) ((x).pte)
-#define pmd_val(x) ((x).pmd)
-#define pud_val(x) ((x).pud)
-#define pgd_val(x) ((x).pgd)
-#define pgprot_val(x)  ((x).pgprot)
+static inline unsigned long long native_pte_val(pte_t pte)
+{
+   return pte.pte;
+}
+
+static inline unsigned long long native_pud_val(pud_t pud)
+{
+   return pud.pud;
+}
+
+
+static inline unsigned long long native_pmd_val(pmd_t pmd)
+{
+   return pmd.pmd;
+}
+
+static inline unsigned long long native_pgd_val(pgd_t pgd)
+{
+   return pgd.pgd;
+}
+
+static inline pte_t native_make_pte(unsigned long long pte)
+{
+   return (pte_t){ pte };
+}
+
+static inline pud_t native_make_pud(unsigned long long pud)
+{
+   return (pud_t){ pud };
+}
+
+static inline pmd_t native_make_pmd(unsigned long long pmd)
+{
+   return (pmd_t){ pmd };
+}
+
+static inline pgd_t native_make_pgd(unsigned long long pgd)
+{
+   return (pgd_t){ pgd };
+}
+
+#ifdef CONFIG_PARAVIRT
+#include 
+#else
+#define pte_val(x) native_pte_val(x)
+#define pmd_val(x) native_pmd_val(x)
+#define pud_val(x) native_pud_val(x)
+#define pgd_val(x) native_pgd_val(x)
 
 #define __pte(x) ((pte_t) { (x) } )
 #define __pmd(x) ((pmd_t) { (x) } )
 #define __pud(x) ((pud_t) { (x) } )
 #define __pgd(x) ((pgd_t) { (x) } )
+#endif /* CONFIG_PARAVIRT */
+
+#define pgprot_val(x)  ((x).pgprot)
 #define __pgprot(x)((pgprot_t) { (x) } )
 
 #endif /* !__ASSEMBLY__ */
-- 
1.4.4.2


-
This SF.net email is sponsored by: Splunk Inc.
Still grepping through log files to find problems?  Stop.
Now Search log events and configuration files using AJAX and a browser.
Download your FREE copy of Splunk now >> http://get.splunk.com/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


[kvm-devel] [PATCH 22/24] prepare x86_64 architecture initialization for paravirt

2007-11-09 Thread Glauber de Oliveira Costa
This patch prepares the x86_64 architecture initialization for
paravirt. It requires a memory initialization step, which is done
by implementing 64-bit version for machine_specific_memory_setup,
and putting an ARCH_SETUP hook, for guest-dependent initialization.
This last step is done akin to i386

Signed-off-by: Glauber de Oliveira Costa <[EMAIL PROTECTED]>
Signed-off-by: Steven Rostedt <[EMAIL PROTECTED]>
Acked-by: Jeremy Fitzhardinge <[EMAIL PROTECTED]>
---
 arch/x86/kernel/e820_64.c  |9 +++--
 arch/x86/kernel/setup_64.c |   28 +++-
 include/asm-x86/setup.h|   11 ---
 3 files changed, 42 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c
index 0128b0b..eed900b 100644
--- a/arch/x86/kernel/e820_64.c
+++ b/arch/x86/kernel/e820_64.c
@@ -639,8 +639,10 @@ void early_panic(char *msg)
panic(msg);
 }
 
-void __init setup_memory_region(void)
+/* We're not void only for x86 32-bit compat */
+char * __init machine_specific_memory_setup(void)
 {
+   char *who = "BIOS-e820";
/*
 * Try to copy the BIOS-supplied E820-map.
 *
@@ -651,7 +653,10 @@ void __init setup_memory_region(void)
if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) < 0)
early_panic("Cannot find a valid memory map");
printk(KERN_INFO "BIOS-provided physical RAM map:\n");
-   e820_print_map("BIOS-e820");
+   e820_print_map(who);
+
+   /* In case someone cares... */
+   return who;
 }
 
 static int __init parse_memopt(char *p)
diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c
index 2451a63..1c9f237 100644
--- a/arch/x86/kernel/setup_64.c
+++ b/arch/x86/kernel/setup_64.c
@@ -39,6 +39,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -61,6 +62,12 @@
 #include 
 #include 
 
+#ifdef CONFIG_PARAVIRT
+#include 
+#else
+#define ARCH_SETUP
+#endif
+
 /*
  * Machine setup..
  */
@@ -244,6 +251,16 @@ static void discover_ebda(void)
 * 4K EBDA area at 0x40E
 */
ebda_addr = *(unsigned short *)__va(EBDA_ADDR_POINTER);
+   /*
+* There can be some situations, like paravirtualized guests,
+* in which there is no available ebda information. In such
+* case, just skip it
+*/
+   if (!ebda_addr) {
+   ebda_size = 0;
+   return;
+   }
+
ebda_addr <<= 4;
 
ebda_size = *(unsigned short *)__va(ebda_addr);
@@ -257,6 +274,12 @@ static void discover_ebda(void)
ebda_size = 64*1024;
 }
 
+/* Overridden in paravirt.c if CONFIG_PARAVIRT */
+void __attribute__((weak)) memory_setup(void)
+{
+   machine_specific_memory_setup();
+}
+
 void __init setup_arch(char **cmdline_p)
 {
printk(KERN_INFO "Command line: %s\n", boot_command_line);
@@ -272,7 +295,10 @@ void __init setup_arch(char **cmdline_p)
rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0);
rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
 #endif
-   setup_memory_region();
+
+   ARCH_SETUP
+
+   memory_setup();
copy_edd();
 
if (!boot_params.hdr.root_flags)
diff --git a/include/asm-x86/setup.h b/include/asm-x86/setup.h
index 24d786e..071e054 100644
--- a/include/asm-x86/setup.h
+++ b/include/asm-x86/setup.h
@@ -3,6 +3,13 @@
 
 #define COMMAND_LINE_SIZE 2048
 
+#ifndef __ASSEMBLY__
+char *machine_specific_memory_setup(void);
+#ifndef CONFIG_PARAVIRT
+#define paravirt_post_allocator_init() do {} while (0)
+#endif
+#endif /* __ASSEMBLY__ */
+
 #ifdef __KERNEL__
 
 #ifdef __i386__
@@ -51,9 +58,7 @@ void __init add_memory_region(unsigned long long start,
 
 extern unsigned long init_pg_tables_end;
 
-#ifndef CONFIG_PARAVIRT
-#define paravirt_post_allocator_init() do {} while (0)
-#endif
+
 
 #endif /* __i386__ */
 #endif /* _SETUP */
-- 
1.4.4.2


-
This SF.net email is sponsored by: Splunk Inc.
Still grepping through log files to find problems?  Stop.
Now Search log events and configuration files using AJAX and a browser.
Download your FREE copy of Splunk now >> http://get.splunk.com/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


[kvm-devel] [PATCH 17/24] This patch add provisions for time related functions so they

2007-11-09 Thread Glauber de Oliveira Costa
can be later replaced by paravirt versions.

it basically encloses {g,s}et_wallclock inside the
already existent functions update_persistent_clock and
read_persistent_clock, and defines {s,g}et_wallclock
to the core of such functions.

it also allows for a later-in-the-game time initialization, as done
by i386. Paravirt guests can set a function to do their own
initialization this way.

Signed-off-by: Glauber de Oliveira Costa <[EMAIL PROTECTED]>
Signed-off-by: Steven Rostedt <[EMAIL PROTECTED]>
Acked-by: Jeremy Fitzhardinge <[EMAIL PROTECTED]>
---
 arch/x86/kernel/time_64.c |   12 +---
 include/asm-x86/time.h|   26 +-
 2 files changed, 30 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c
index f88bf6b..89943d8 100644
--- a/arch/x86/kernel/time_64.c
+++ b/arch/x86/kernel/time_64.c
@@ -21,6 +21,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 
 volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
 
@@ -54,7 +56,7 @@ static irqreturn_t timer_event_interrupt(int irq, void 
*dev_id)
 /* calibrate_cpu is used on systems with fixed rate TSCs to determine
  * processor frequency */
 #define TICK_COUNT 1
-static unsigned int __init tsc_calibrate_cpu_khz(void)
+unsigned long __init native_calculate_cpu_khz(void)
 {
int tsc_start, tsc_now;
int i, no_ctr_free;
@@ -104,20 +106,23 @@ static struct irqaction irq0 = {
.name   = "timer"
 };
 
-void __init time_init(void)
+void __init hpet_time_init(void)
 {
if (!hpet_enable())
setup_pit_timer();
 
setup_irq(0, &irq0);
+}
 
+void __init time_init(void)
+{
tsc_calibrate();
 
cpu_khz = tsc_khz;
if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) &&
boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
boot_cpu_data.x86 == 16)
-   cpu_khz = tsc_calibrate_cpu_khz();
+   cpu_khz = calculate_cpu_khz();
 
if (unsynchronized_tsc())
mark_tsc_unstable("TSCs unsynchronized");
@@ -130,4 +135,5 @@ void __init time_init(void)
printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n",
cpu_khz / 1000, cpu_khz % 1000);
init_tsc_clocksource();
+   late_time_init = choose_time_init();
 }
diff --git a/include/asm-x86/time.h b/include/asm-x86/time.h
index b3f94cd..68779b0 100644
--- a/include/asm-x86/time.h
+++ b/include/asm-x86/time.h
@@ -1,8 +1,12 @@
-#ifndef _ASMi386_TIME_H
-#define _ASMi386_TIME_H
+#ifndef _ASMX86_TIME_H
+#define _ASMX86_TIME_H
+
+extern void (*late_time_init)(void);
+extern void hpet_time_init(void);
 
-#include 
 #include 
+#ifdef CONFIG_X86_32
+#include 
 
 static inline unsigned long native_get_wallclock(void)
 {
@@ -28,8 +32,20 @@ static inline int native_set_wallclock(unsigned long nowtime)
return retval;
 }
 
-extern void (*late_time_init)(void);
-extern void hpet_time_init(void);
+#else
+extern void native_time_init_hook(void);
+
+static inline unsigned long native_get_wallclock(void)
+{
+   return mach_get_cmos_time();
+}
+
+static inline int native_set_wallclock(unsigned long nowtime)
+{
+   return mach_set_rtc_mmss(nowtime);
+}
+
+#endif
 
 #ifdef CONFIG_PARAVIRT
 #include 
-- 
1.4.4.2


-
This SF.net email is sponsored by: Splunk Inc.
Still grepping through log files to find problems?  Stop.
Now Search log events and configuration files using AJAX and a browser.
Download your FREE copy of Splunk now >> http://get.splunk.com/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


[kvm-devel] [PATCH 19/24] turn privileged operation into a macro in head_64.S

2007-11-09 Thread Glauber de Oliveira Costa
under paravirt, read cr2 cannot be issued directly anymore.
So wrap it in a macro, defined to the operation itself in case
paravirt is off, but to something else if we have paravirt
in the game

Signed-off-by: Glauber de Oliveira Costa <[EMAIL PROTECTED]>
Signed-off-by: Steven Rostedt <[EMAIL PROTECTED]>
Acked-by: Jeremy Fitzhardinge <[EMAIL PROTECTED]>
---
 arch/x86/kernel/head_64.S |9 -
 1 files changed, 8 insertions(+), 1 deletions(-)

diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index b6167fe..c31b1c9 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -19,6 +19,13 @@
 #include 
 #include 
 
+#ifdef CONFIG_PARAVIRT
+#include 
+#include 
+#else
+#define GET_CR2_INTO_RCX movq %cr2, %rcx
+#endif
+
 /* we are not able to switch in one step to the final KERNEL ADRESS SPACE
  * because we need identity-mapped pages.
  *
@@ -267,7 +274,7 @@ ENTRY(early_idt_handler)
xorl %eax,%eax
movq 8(%rsp),%rsi   # get rip
movq (%rsp),%rdx
-   movq %cr2,%rcx
+   GET_CR2_INTO_RCX
leaq early_idt_msg(%rip),%rdi
call early_printk
cmpl $2,early_recursion_flag(%rip)
-- 
1.4.4.2


-
This SF.net email is sponsored by: Splunk Inc.
Still grepping through log files to find problems?  Stop.
Now Search log events and configuration files using AJAX and a browser.
Download your FREE copy of Splunk now >> http://get.splunk.com/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


[kvm-devel] [PATCH 15/24] native versions for set pagetables

2007-11-09 Thread Glauber de Oliveira Costa
This patch turns the set_p{te,md,ud,gd} functions into their
native_ versions. There is no need to patch any caller.

Also, it adds pte_update() and pte_update_defer() calls whenever
we modify a page table entry. This last part was coded to match
i386 as close as possible.

Pieces of the header are moved to below the #ifdef CONFIG_PARAVIRT
site, as they are users of the newly defined set_* macros.

Signed-off-by: Glauber de Oliveira Costa <[EMAIL PROTECTED]>
Signed-off-by: Steven Rostedt <[EMAIL PROTECTED]>
Acked-by: Jeremy Fitzhardinge <[EMAIL PROTECTED]>
---
 include/asm-x86/pgtable_64.h |  192 --
 1 files changed, 128 insertions(+), 64 deletions(-)

diff --git a/include/asm-x86/pgtable_64.h b/include/asm-x86/pgtable_64.h
index 9b0ff47..592d613 100644
--- a/include/asm-x86/pgtable_64.h
+++ b/include/asm-x86/pgtable_64.h
@@ -57,56 +57,107 @@ extern unsigned long 
empty_zero_page[PAGE_SIZE/sizeof(unsigned long)];
  */
 #define PTRS_PER_PTE   512
 
-#ifndef __ASSEMBLY__
+#ifdef CONFIG_PARAVIRT
+#include 
+#else
+
+#define set_pte native_set_pte
+#define set_pte_at(mm, addr, ptep, pteval) set_pte(ptep, pteval)
+#define set_pmd native_set_pmd
+#define set_pud native_set_pud
+#define set_pgd native_set_pgd
+#define pte_clear(mm, addr, xp)\
+do {   \
+   set_pte_at(mm, addr, xp, __pte(0)); \
+} while (0)
 
-#define pte_ERROR(e) \
-   printk("%s:%d: bad pte %p(%016lx).\n", __FILE__, __LINE__, &(e), 
pte_val(e))
-#define pmd_ERROR(e) \
-   printk("%s:%d: bad pmd %p(%016lx).\n", __FILE__, __LINE__, &(e), 
pmd_val(e))
-#define pud_ERROR(e) \
-   printk("%s:%d: bad pud %p(%016lx).\n", __FILE__, __LINE__, &(e), 
pud_val(e))
-#define pgd_ERROR(e) \
-   printk("%s:%d: bad pgd %p(%016lx).\n", __FILE__, __LINE__, &(e), 
pgd_val(e))
+#define pmd_clear(xp)  do { set_pmd(xp, __pmd(0)); } while (0)
+#define pud_clear native_pud_clear
+#define pgd_clear native_pgd_clear
+#define pte_update(mm, addr, ptep)  do { } while (0)
+#define pte_update_defer(mm, addr, ptep)do { } while (0)
 
-#define pgd_none(x)(!pgd_val(x))
-#define pud_none(x)(!pud_val(x))
+#endif
 
-static inline void set_pte(pte_t *dst, pte_t val)
+#ifndef __ASSEMBLY__
+
+static inline void native_set_pte(pte_t *dst, pte_t val)
 {
-   pte_val(*dst) = pte_val(val);
+   dst->pte = pte_val(val);
 } 
-#define set_pte_at(mm,addr,ptep,pteval) set_pte(ptep,pteval)
 
-static inline void set_pmd(pmd_t *dst, pmd_t val)
+static inline void native_set_pmd(pmd_t *dst, pmd_t val)
 {
-pmd_val(*dst) = pmd_val(val); 
+   dst->pmd = pmd_val(val);
 } 
 
-static inline void set_pud(pud_t *dst, pud_t val)
+static inline void native_set_pud(pud_t *dst, pud_t val)
 {
-   pud_val(*dst) = pud_val(val);
+   dst->pud = pud_val(val);
 }
 
-static inline void pud_clear (pud_t *pud)
+static inline void native_set_pgd(pgd_t *dst, pgd_t val)
 {
-   set_pud(pud, __pud(0));
+   dst->pgd = pgd_val(val);
 }
 
-static inline void set_pgd(pgd_t *dst, pgd_t val)
+static inline void native_pud_clear(pud_t *pud)
 {
-   pgd_val(*dst) = pgd_val(val); 
-} 
+   set_pud(pud, __pud(0));
+}
 
-static inline void pgd_clear (pgd_t * pgd)
+static inline void native_pgd_clear(pgd_t *pgd)
 {
set_pgd(pgd, __pgd(0));
 }
 
-#define ptep_get_and_clear(mm,addr,xp) __pte(xchg(&(xp)->pte, 0))
+static inline void native_set_pte_at(struct mm_struct *mm, unsigned long addr,
+pte_t *ptep, pte_t pteval)
+{
+   native_set_pte(ptep, pteval);
+}
+
+static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr,
+   pte_t *ptep)
+{
+   native_set_pte_at(mm, addr, ptep, __pte(0));
+}
+
+static inline void native_pmd_clear(pmd_t *pmd)
+{
+   native_set_pmd(pmd, __pmd(0));
+}
+
+
+#define pte_ERROR(e)   \
+   printk("%s:%d: bad pte %p(%016llx).\n", \
+   __FILE__, __LINE__, &(e), (u64)pte_val(e))
+#define pmd_ERROR(e)   \
+   printk("%s:%d: bad pmd %p(%016llx).\n", \
+   __FILE__, __LINE__, &(e), (u64)pmd_val(e))
+#define pud_ERROR(e)   \
+   printk("%s:%d: bad pud %p(%016llx).\n", \
+__FILE__, __LINE__, &(e), (u64)pud_val(e))
+#define pgd_ERROR(e)   \
+   printk("%s:%d: bad pgd %p(%016llx).\n", \
+   __FILE__, __LINE__, &(e), (u64)pgd_val(e))
+
+#define pgd_none(x)(!pgd_val(x))
+#define pud_none(x)(!pud_val(x))
 
 struct mm_struct;
 
-static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned 
long addr, pte_t *ptep, int full)
+static inline pte_t pte

[kvm-devel] [PATCH 5/24] smp x86 consolidation

2007-11-09 Thread Glauber de Oliveira Costa
This patch consolidates part of the pieces of smp for both architectures.
(i386 and x86_64). It makes part of the calls go through smp_ops, and shares
code for those functions in smpcommon.c

There's more room for code sharing here, but it is left as an exercise to
the reader ;-)

Signed-off-by: Glauber de Oliveira Costa <[EMAIL PROTECTED]>
Signed-off-by: Steven Rostedt <[EMAIL PROTECTED]>
Acked-by: Jeremy Fitzhardinge <[EMAIL PROTECTED]>
---
 arch/x86/kernel/Makefile_32|2 +-
 arch/x86/kernel/Makefile_64|2 +-
 arch/x86/kernel/smp_32.c   |  218 --
 arch/x86/kernel/smp_64.c   |  245 -
 arch/x86/kernel/smpboot_64.c   |8 +-
 arch/x86/kernel/smpcommon.c|  291 
 arch/x86/kernel/smpcommon_32.c |   81 ---
 include/asm-x86/idle.h |   18 +++-
 include/asm-x86/smp.h  |   66 +
 include/asm-x86/smp_32.h   |   58 
 include/asm-x86/smp_64.h   |4 -
 11 files changed, 379 insertions(+), 614 deletions(-)

diff --git a/arch/x86/kernel/Makefile_32 b/arch/x86/kernel/Makefile_32
index b08179a..b0da543 100644
--- a/arch/x86/kernel/Makefile_32
+++ b/arch/x86/kernel/Makefile_32
@@ -20,7 +20,7 @@ obj-$(CONFIG_MICROCODE)   += microcode.o
 obj-$(CONFIG_PCI)  += early-quirks.o
 obj-$(CONFIG_APM)  += apm_32.o
 obj-$(CONFIG_X86_SMP)  += smp_32.o smpboot_32.o tsc_sync.o
-obj-$(CONFIG_SMP)  += smpcommon_32.o
+obj-$(CONFIG_SMP)  += smpcommon.o
 obj-$(CONFIG_X86_TRAMPOLINE)   += trampoline_32.o
 obj-$(CONFIG_X86_MPPARSE)  += mpparse_32.o
 obj-$(CONFIG_X86_LOCAL_APIC)   += apic_32.o nmi_32.o
diff --git a/arch/x86/kernel/Makefile_64 b/arch/x86/kernel/Makefile_64
index 686de84..ffee997 100644
--- a/arch/x86/kernel/Makefile_64
+++ b/arch/x86/kernel/Makefile_64
@@ -17,7 +17,7 @@ obj-y += acpi/
 obj-$(CONFIG_X86_MSR)  += msr.o
 obj-$(CONFIG_MICROCODE)+= microcode.o
 obj-$(CONFIG_X86_CPUID)+= cpuid.o
-obj-$(CONFIG_SMP)  += smp_64.o smpboot_64.o trampoline_64.o 
tsc_sync.o
+obj-$(CONFIG_SMP)  += smp_64.o smpboot_64.o trampoline_64.o 
tsc_sync.o smpcommon.o
 obj-y  += apic_64.o  nmi_64.o
 obj-y  += io_apic_64.o mpparse_64.o genapic_64.o 
genapic_flat_64.o
 obj-$(CONFIG_KEXEC)+= machine_kexec_64.o relocate_kernel_64.o 
crash.o
diff --git a/arch/x86/kernel/smp_32.c b/arch/x86/kernel/smp_32.c
index fcaa026..a7cc319 100644
--- a/arch/x86/kernel/smp_32.c
+++ b/arch/x86/kernel/smp_32.c
@@ -464,213 +464,6 @@ void flush_tlb_all(void)
on_each_cpu(do_flush_tlb_all, NULL, 1, 1);
 }
 
-/*
- * this function sends a 'reschedule' IPI to another CPU.
- * it goes straight through and wastes no time serializing
- * anything. Worst case is that we lose a reschedule ...
- */
-static void native_smp_send_reschedule(int cpu)
-{
-   WARN_ON(cpu_is_offline(cpu));
-   send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
-}
-
-/*
- * Structure and data for smp_call_function(). This is designed to minimise
- * static memory requirements. It also looks cleaner.
- */
-static DEFINE_SPINLOCK(call_lock);
-
-struct call_data_struct {
-   void (*func) (void *info);
-   void *info;
-   atomic_t started;
-   atomic_t finished;
-   int wait;
-};
-
-void lock_ipi_call_lock(void)
-{
-   spin_lock_irq(&call_lock);
-}
-
-void unlock_ipi_call_lock(void)
-{
-   spin_unlock_irq(&call_lock);
-}
-
-static struct call_data_struct *call_data;
-
-static void __smp_call_function(void (*func) (void *info), void *info,
-   int nonatomic, int wait)
-{
-   struct call_data_struct data;
-   int cpus = num_online_cpus() - 1;
-
-   if (!cpus)
-   return;
-
-   data.func = func;
-   data.info = info;
-   atomic_set(&data.started, 0);
-   data.wait = wait;
-   if (wait)
-   atomic_set(&data.finished, 0);
-
-   call_data = &data;
-   mb();
-   
-   /* Send a message to all other CPUs and wait for them to respond */
-   send_IPI_allbutself(CALL_FUNCTION_VECTOR);
-
-   /* Wait for response */
-   while (atomic_read(&data.started) != cpus)
-   cpu_relax();
-
-   if (wait)
-   while (atomic_read(&data.finished) != cpus)
-   cpu_relax();
-}
-
-
-/**
- * smp_call_function_mask(): Run a function on a set of other CPUs.
- * @mask: The set of cpus to run on.  Must not include the current cpu.
- * @func: The function to run. This must be fast and non-blocking.
- * @info: An arbitrary pointer to pass to the function.
- * @wait: If true, wait (atomically) until function has completed on other 
CPUs.
- *
-  * Returns 0 on success, else a negative status code.
-

[kvm-devel] [PATCH 7/24] consolidate msr.h

2007-11-09 Thread Glauber de Oliveira Costa
This patch goes one step forward in consolidating the msr.h header.
It shares code between i386 and x86_64, instead of duplicating the
code for tsc reading, msr reading/writing, etc.

Signed-off-by: Glauber de Oliveira Costa <[EMAIL PROTECTED]>
Signed-off-by: Steven Rostedt <[EMAIL PROTECTED]>
Acked-by: Jeremy Fitzhardinge <[EMAIL PROTECTED]>
---
 arch/x86/ia32/syscall32.c|2 +-
 arch/x86/kernel/setup64.c|6 +-
 arch/x86/kernel/tsc_64.c |   17 +++-
 arch/x86/kernel/vsyscall_64.c|4 +-
 arch/x86/vdso/vgetcpu.c  |4 +-
 include/asm-x86/alternative_32.h |   17 +++-
 include/asm-x86/alternative_64.h |   27 -
 include/asm-x86/msr.h|  225 ++
 include/asm-x86/tsc.h|   33 +-
 9 files changed, 151 insertions(+), 184 deletions(-)

diff --git a/arch/x86/ia32/syscall32.c b/arch/x86/ia32/syscall32.c
index d751d96..a1247ed 100644
--- a/arch/x86/ia32/syscall32.c
+++ b/arch/x86/ia32/syscall32.c
@@ -82,5 +82,5 @@ void syscall32_cpu_init(void)
checking_wrmsrl(MSR_IA32_SYSENTER_ESP, 0ULL);
checking_wrmsrl(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target);
 
-   wrmsrl(MSR_CSTAR, ia32_cstar_target);
+   wrmsrl(MSR_CSTAR, (u64)ia32_cstar_target);
 }
diff --git a/arch/x86/kernel/setup64.c b/arch/x86/kernel/setup64.c
index 3558ac7..50b7514 100644
--- a/arch/x86/kernel/setup64.c
+++ b/arch/x86/kernel/setup64.c
@@ -122,7 +122,7 @@ void pda_init(int cpu)
asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0)); 
/* Memory clobbers used to order PDA accessed */
mb();
-   wrmsrl(MSR_GS_BASE, pda);
+   wrmsrl(MSR_GS_BASE, (u64)pda);
mb();
 
pda->cpunumber = cpu; 
@@ -161,8 +161,8 @@ void syscall_init(void)
 * but only a 32bit target. LSTAR sets the 64bit rip.
 */ 
wrmsrl(MSR_STAR,  ((u64)__USER32_CS)<<48  | ((u64)__KERNEL_CS)<<32); 
-   wrmsrl(MSR_LSTAR, system_call); 
-   wrmsrl(MSR_CSTAR, ignore_sysret);
+   wrmsrl(MSR_LSTAR, (u64)system_call);
+   wrmsrl(MSR_CSTAR, (u64)ignore_sysret);
 
 #ifdef CONFIG_IA32_EMULATION   
syscall32_cpu_init ();
diff --git a/arch/x86/kernel/tsc_64.c b/arch/x86/kernel/tsc_64.c
index 9c70af4..4502539 100644
--- a/arch/x86/kernel/tsc_64.c
+++ b/arch/x86/kernel/tsc_64.c
@@ -30,7 +30,7 @@ static unsigned long long cycles_2_ns(unsigned long long cyc)
return (cyc * cyc2ns_scale) >> NS_SCALE;
 }
 
-unsigned long long sched_clock(void)
+unsigned long long native_sched_clock(void)
 {
unsigned long a = 0;
 
@@ -44,6 +44,19 @@ unsigned long long sched_clock(void)
return cycles_2_ns(a);
 }
 
+/* We need to define a real function for sched_clock, to override the
+   weak default version */
+#ifdef CONFIG_PARAVIRT
+unsigned long long sched_clock(void)
+{
+   return paravirt_sched_clock();
+}
+#else
+unsigned long long
+sched_clock(void) __attribute__((alias("native_sched_clock")));
+#endif
+
+
 static int tsc_unstable;
 
 inline int check_tsc_unstable(void)
@@ -256,7 +269,7 @@ static cycle_t read_tsc(void)
 
 static cycle_t __vsyscall_fn vread_tsc(void)
 {
-   cycle_t ret = (cycle_t)get_cycles_sync();
+   cycle_t ret = (cycle_t)vget_cycles_sync();
return ret;
 }
 
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index ad4005c..1425d02 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -190,7 +190,7 @@ time_t __vsyscall(1) vtime(time_t *t)
 long __vsyscall(2)
 vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
 {
-   unsigned int dummy, p;
+   unsigned int p;
unsigned long j = 0;
 
/* Fast cache - only recompute value once per jiffies and avoid
@@ -205,7 +205,7 @@ vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache 
*tcache)
p = tcache->blob[1];
} else if (__vgetcpu_mode == VGETCPU_RDTSCP) {
/* Load per CPU data from RDTSCP */
-   rdtscp(dummy, dummy, p);
+   native_read_tscp(&p);
} else {
/* Load per CPU data from GDT */
asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
diff --git a/arch/x86/vdso/vgetcpu.c b/arch/x86/vdso/vgetcpu.c
index 91f6e85..61d0def 100644
--- a/arch/x86/vdso/vgetcpu.c
+++ b/arch/x86/vdso/vgetcpu.c
@@ -15,7 +15,7 @@
 
 long __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
 {
-   unsigned int dummy, p;
+   unsigned int p;
unsigned long j = 0;
 
/* Fast cache - only recompute value once per jiffies and avoid
@@ -30,7 +30,7 @@ long __vdso_getcpu(unsigned *cpu, unsigned *node, struct 
getcpu_cache *tcache)
p = tcache->blob[1];
} else if (*vdso_vgetcpu_mode == VGETCPU_RDTSCP) {
/* Load per CPU data

[kvm-devel] [PATCH 13/24] report ring kernel is running without paravirt

2007-11-09 Thread Glauber de Oliveira Costa
When paravirtualization is disabled, the kernel is always
running at ring 0. So report it in the appropriate macro.

Signed-off-by: Glauber de Oliveira Costa <[EMAIL PROTECTED]>
Signed-off-by: Steven Rostedt <[EMAIL PROTECTED]>
Acked-by: Jeremy Fitzhardinge <[EMAIL PROTECTED]>
---
 include/asm-x86/segment_64.h |4 
 1 files changed, 4 insertions(+), 0 deletions(-)

diff --git a/include/asm-x86/segment_64.h b/include/asm-x86/segment_64.h
index 04b8ab2..240c1bf 100644
--- a/include/asm-x86/segment_64.h
+++ b/include/asm-x86/segment_64.h
@@ -50,4 +50,8 @@
 #define GDT_SIZE (GDT_ENTRIES * 8)
 #define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES * 8) 
 
+#ifndef CONFIG_PARAVIRT
+#define get_kernel_rpl()  0
+#endif
+
 #endif
-- 
1.4.4.2


-
This SF.net email is sponsored by: Splunk Inc.
Still grepping through log files to find problems?  Stop.
Now Search log events and configuration files using AJAX and a browser.
Download your FREE copy of Splunk now >> http://get.splunk.com/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


[kvm-devel] [PATCH 14/24] export math_state_restore

2007-11-09 Thread Glauber de Oliveira Costa
Export math_state_restore symbol, so it can be used for hypervisors.
They are commonly loaded as modules (lguest being an example).

Signed-off-by: Glauber de Oliveira Costa <[EMAIL PROTECTED]>
Signed-off-by: Steven Rostedt <[EMAIL PROTECTED]>
Acked-by: Jeremy Fitzhardinge <[EMAIL PROTECTED]>
---
 arch/x86/kernel/traps_64.c |1 +
 1 files changed, 1 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c
index 4d752a8..0876692 100644
--- a/arch/x86/kernel/traps_64.c
+++ b/arch/x86/kernel/traps_64.c
@@ -1069,6 +1069,7 @@ asmlinkage void math_state_restore(void)
task_thread_info(me)->status |= TS_USEDFPU;
me->fpu_counter++;
 }
+EXPORT_SYMBOL_GPL(math_state_restore);
 
 void __init trap_init(void)
 {
-- 
1.4.4.2


-
This SF.net email is sponsored by: Splunk Inc.
Still grepping through log files to find problems?  Stop.
Now Search log events and configuration files using AJAX and a browser.
Download your FREE copy of Splunk now >> http://get.splunk.com/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


[kvm-devel] [PATCH 3/24] consolidate spinlock.h

2007-11-09 Thread Glauber de Oliveira Costa
The cli and sti instructions need to be replaced by paravirt hooks.
For the i386 architecture, this is already done. The code requirements
aren't much different from x86_64 POV, so this part is consolidated in
the common header

Signed-off-by: Glauber de Oliveira Costa <[EMAIL PROTECTED]>
Signed-off-by: Steven Rostedt <[EMAIL PROTECTED]>
Acked-by: Jeremy Fitzhardinge <[EMAIL PROTECTED]>
---
 include/asm-x86/spinlock.h|   14 ++
 include/asm-x86/spinlock_32.h |9 -
 include/asm-x86/spinlock_64.h |8 +---
 3 files changed, 19 insertions(+), 12 deletions(-)

diff --git a/include/asm-x86/spinlock.h b/include/asm-x86/spinlock.h
index d74d85e..e1d555a 100644
--- a/include/asm-x86/spinlock.h
+++ b/include/asm-x86/spinlock.h
@@ -1,5 +1,19 @@
+#ifndef _X86_SPINLOCK_H_
+#define _X86_SPINLOCK_H_
+
+#ifdef CONFIG_PARAVIRT
+#include 
+#else
+#define CLI_STRING "cli"
+#define STI_STRING "sti"
+#define CLI_STI_CLOBBERS
+#define CLI_STI_INPUT_ARGS
+#endif /* CONFIG_PARAVIRT */
+
 #ifdef CONFIG_X86_32
 # include "spinlock_32.h"
 #else
 # include "spinlock_64.h"
 #endif
+
+#endif
diff --git a/include/asm-x86/spinlock_32.h b/include/asm-x86/spinlock_32.h
index d3bcebe..ebbf371 100644
--- a/include/asm-x86/spinlock_32.h
+++ b/include/asm-x86/spinlock_32.h
@@ -7,15 +7,6 @@
 #include 
 #include 
 
-#ifdef CONFIG_PARAVIRT
-#include 
-#else
-#define CLI_STRING "cli"
-#define STI_STRING "sti"
-#define CLI_STI_CLOBBERS
-#define CLI_STI_INPUT_ARGS
-#endif /* CONFIG_PARAVIRT */
-
 /*
  * Your basic SMP spinlocks, allowing only a single CPU anywhere
  *
diff --git a/include/asm-x86/spinlock_64.h b/include/asm-x86/spinlock_64.h
index 88bf981..e56b17e 100644
--- a/include/asm-x86/spinlock_64.h
+++ b/include/asm-x86/spinlock_64.h
@@ -48,12 +48,12 @@ static inline void __raw_spin_lock_flags(raw_spinlock_t 
*lock, unsigned long fla
"jns 5f\n"
"testl $0x200, %1\n\t"  /* interrupts were disabled? */
"jz 4f\n\t"
-   "sti\n"
+   STI_STRING "\n"
"3:\t"
"rep;nop\n\t"
"cmpl $0, %0\n\t"
"jle 3b\n\t"
-   "cli\n\t"
+   CLI_STRING "\n\t"
"jmp 1b\n"
"4:\t"
"rep;nop\n\t"
@@ -61,7 +61,9 @@ static inline void __raw_spin_lock_flags(raw_spinlock_t 
*lock, unsigned long fla
"jg 1b\n\t"
"jmp 4b\n"
"5:\n\t"
-   : "+m" (lock->slock) : "r" ((unsigned)flags) : "memory");
+   : "+m" (lock->slock)
+   : "r" ((unsigned)flags) CLI_STI_INPUT_ARGS
+   : "memory" CLI_STI_CLOBBERS);
 }
 #endif
 
-- 
1.4.4.2


-
This SF.net email is sponsored by: Splunk Inc.
Still grepping through log files to find problems?  Stop.
Now Search log events and configuration files using AJAX and a browser.
Download your FREE copy of Splunk now >> http://get.splunk.com/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


[kvm-devel] [PATCH 11/24] read/write_crX, clts and wbinvd for 64-bit paravirt

2007-11-09 Thread Glauber de Oliveira Costa
This patch introduces, and patch callers when needed, native
versions for read/write_crX functions, clts and wbinvd.

Signed-off-by: Glauber de Oliveira Costa <[EMAIL PROTECTED]>
Signed-off-by: Steven Rostedt <[EMAIL PROTECTED]>
Acked-by: Jeremy Fitzhardinge <[EMAIL PROTECTED]>
---
 arch/x86/mm/pageattr_64.c |3 ++-
 1 files changed, 2 insertions(+), 1 deletions(-)

diff --git a/arch/x86/mm/pageattr_64.c b/arch/x86/mm/pageattr_64.c
index 14ab327..3a483a8 100644
--- a/arch/x86/mm/pageattr_64.c
+++ b/arch/x86/mm/pageattr_64.c
@@ -13,6 +13,7 @@
 #include 
 #include 
 #include 
+#include 
 
 pte_t *lookup_address(unsigned long address)
 {
@@ -84,7 +85,7 @@ static void flush_kernel_map(void *arg)
   much cheaper than WBINVD. */
/* clflush is still broken. Disable for now. */
if (1 || !cpu_has_clflush) {
-   asm volatile("wbinvd" ::: "memory");
+   wbinvd();
} else {
list_for_each_entry(pg, l, lru) {
void *addr = page_address(pg);
-- 
1.4.4.2


-
This SF.net email is sponsored by: Splunk Inc.
Still grepping through log files to find problems?  Stop.
Now Search log events and configuration files using AJAX and a browser.
Download your FREE copy of Splunk now >> http://get.splunk.com/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


[kvm-devel] [PATCH 8/24] consolidate system.h

2007-11-09 Thread Glauber de Oliveira Costa
This patch consolidates system.h header. For i386, it adds functions
read/write_cr8 that aren't really needed, but will also not hurt. If they are
used somewhere in i386 code, there's a bug anyway, and should be fixed.

Signed-off-by: Glauber de Oliveira Costa <[EMAIL PROTECTED]>
Signed-off-by: Steven Rostedt <[EMAIL PROTECTED]>
Acked-by: Jeremy Fitzhardinge <[EMAIL PROTECTED]>
---
 include/asm-x86/system.h|  134 +++
 include/asm-x86/system_32.h |  102 
 include/asm-x86/system_64.h |   77 -
 3 files changed, 134 insertions(+), 179 deletions(-)

diff --git a/include/asm-x86/system.h b/include/asm-x86/system.h
index 692562b..ef20916 100644
--- a/include/asm-x86/system.h
+++ b/include/asm-x86/system.h
@@ -1,5 +1,139 @@
+#ifndef _X86_SYSTEM_H_
+#define _X86_SYSTEM_H_
+
+#include 
+
+static inline void native_clts(void)
+{
+   asm volatile ("clts");
+}
+
+/*
+ * Volatile isn't enough to prevent the compiler from reordering the
+ * read/write functions for the control registers and messing everything up.
+ * A memory clobber would solve the problem, but would prevent reordering of
+ * all loads stores around it, which can hurt performance. Solution is to
+ * use a variable and mimic reads and writes to it to enforce serialization
+ */
+static unsigned long __force_order;
+
+static inline unsigned long native_read_cr0(void)
+{
+   unsigned long val;
+   asm volatile("mov %%cr0,%0\n\t" :"=r" (val), "=m" (__force_order));
+   return val;
+}
+
+static inline void native_write_cr0(unsigned long val)
+{
+   asm volatile("mov %0,%%cr0": :"r" (val), "m" (__force_order));
+}
+
+static inline unsigned long native_read_cr2(void)
+{
+   unsigned long val;
+   asm volatile("mov %%cr2,%0\n\t" :"=r" (val), "=m" (__force_order));
+   return val;
+}
+
+static inline void native_write_cr2(unsigned long val)
+{
+   asm volatile("mov %0,%%cr2": :"r" (val), "m" (__force_order));
+}
+
+static inline unsigned long native_read_cr3(void)
+{
+   unsigned long val;
+   asm volatile("mov %%cr3,%0\n\t" :"=r" (val), "=m" (__force_order));
+   return val;
+}
+
+static inline void native_write_cr3(unsigned long val)
+{
+   asm volatile("mov %0,%%cr3": :"r" (val), "m" (__force_order));
+}
+
+static inline unsigned long native_read_cr4(void)
+{
+   unsigned long val;
+   asm volatile("mov %%cr4,%0\n\t" :"=r" (val), "=m" (__force_order));
+   return val;
+}
+
+static inline unsigned long native_read_cr4_safe(void)
+{
+   unsigned long val;
+   /* This could fault if %cr4 does not exist. In x86_64, a cr4 always
+* exists, so it will never fail. */
+#ifdef CONFIG_X86_32
+   asm volatile("1: mov %%cr4, %0  \n"
+   "2: \n"
+   ".section __ex_table,\"a\"  \n"
+   ".long 1b,2b\n"
+   ".previous  \n"
+   : "=r" (val), "=m" (__force_order) : "0" (0));
+#else
+   val = native_read_cr4();
+#endif
+   return val;
+}
+
+static inline void native_write_cr4(unsigned long val)
+{
+   asm volatile("mov %0,%%cr4": :"r" (val), "m" (__force_order));
+}
+
+static inline unsigned long native_read_cr8(void)
+{
+   unsigned long cr8;
+   asm volatile("mov %%cr8,%0" : "=r" (cr8), "=m" (__force_order));
+   return cr8;
+}
+
+static inline void native_write_cr8(unsigned long val)
+{
+   asm volatile("mov %0,%%cr8" : : "r" (val));
+}
+
+static inline void native_wbinvd(void)
+{
+   asm volatile("wbinvd": : :"memory");
+}
+
+static inline void clflush(volatile void *__p)
+{
+   asm volatile("clflush %0" : "+m" (*(char __force *)__p));
+}
+
+#ifdef CONFIG_PARAVIRT
+#include 
+#else
+#define read_cr0() (native_read_cr0())
+#define write_cr0(x)   (native_write_cr0(x))
+#define read_cr2() (native_read_cr2())
+#define write_cr2(x)   (native_write_cr2(x))
+#define read_cr3() (native_read_cr3())
+#define write_cr3(x)   (native_write_cr3(x))
+#define read_cr4() (native_read_cr4())
+#define read_cr4_safe()(native_read_cr4_safe())
+#define write_cr4(x)   (native_write_cr4(x))
+#define read_cr8() (native_read_cr8())
+#define write_cr8(x)   (native_write_cr8(x))
+#define wbinvd()   (native_wbinvd())
+
+/* Clear the 'TS' bit */
+#define clts() (native_clts())
+
+#endif/* CONFIG_PARAVIRT */
+
+#define stts() write_cr0(8 | rea

[kvm-devel] [PATCH 12/24] provide native irq initialization function

2007-11-09 Thread Glauber de Oliveira Costa
The interrupt initialization routine becomes native_init_IRQ and will
be overriden later in case paravirt is on. The interrupt array is made visible
for guests such as lguest, which will need to have their own initialization
mechanism (though using most of the same irq lines) later on.

Signed-off-by: Glauber de Oliveira Costa <[EMAIL PROTECTED]>
Signed-off-by: Steven Rostedt <[EMAIL PROTECTED]>
Acked-by: Jeremy Fitzhardinge <[EMAIL PROTECTED]>
---
 arch/x86/kernel/i8259_64.c |7 +--
 include/asm-x86/irq_64.h   |3 +++
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/i8259_64.c b/arch/x86/kernel/i8259_64.c
index 3041e59..53955f4 100644
--- a/arch/x86/kernel/i8259_64.c
+++ b/arch/x86/kernel/i8259_64.c
@@ -77,7 +77,7 @@ BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BUILD_16_IRQS(0xe) 
BUILD_16_IRQS(0xf)
IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f)
 
 /* for the irq vectors */
-static void (*interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = {
+void (*interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = {
  IRQLIST_16(0x2), IRQLIST_16(0x3),
IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7),
IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb),
@@ -456,7 +456,10 @@ void __init init_ISA_irqs (void)
}
 }
 
-void __init init_IRQ(void)
+/* Overridden in paravirt.c */
+void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
+
+void __init native_init_IRQ(void)
 {
int i;
 
diff --git a/include/asm-x86/irq_64.h b/include/asm-x86/irq_64.h
index 5006c6e..4f02446 100644
--- a/include/asm-x86/irq_64.h
+++ b/include/asm-x86/irq_64.h
@@ -46,6 +46,9 @@ static __inline__ int irq_canonicalize(int irq)
 extern void fixup_irqs(cpumask_t map);
 #endif
 
+#include 
+void native_init_IRQ(void);
+
 #define __ARCH_HAS_DO_SOFTIRQ 1
 
 #endif /* _ASM_IRQ_H */
-- 
1.4.4.2


-
This SF.net email is sponsored by: Splunk Inc.
Still grepping through log files to find problems?  Stop.
Now Search log events and configuration files using AJAX and a browser.
Download your FREE copy of Splunk now >> http://get.splunk.com/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


[kvm-devel] [PATCH 9/24] Wipe out traditional opt from x86_64 Makefile

2007-11-09 Thread Glauber de Oliveira Costa
Among other things, using -traditional as a gcc option stops us from
using macro token pasting, which is a feature we heavily rely on.

There was still a use of -traditional in arch/x86/kernel/Makefile_64,
which this patch removes.

I don't see any problems building kernels in my x86_64 box without
-traditional.

Signed-off-by: Glauber de Oliveira Costa <[EMAIL PROTECTED]>
Signed-off-by: Steven Rostedt <[EMAIL PROTECTED]>
Acked-by: Jeremy Fitzhardinge <[EMAIL PROTECTED]>
---
 arch/x86/kernel/Makefile_64 |1 -
 1 files changed, 0 insertions(+), 1 deletions(-)

diff --git a/arch/x86/kernel/Makefile_64 b/arch/x86/kernel/Makefile_64
index ffee997..0714528 100644
--- a/arch/x86/kernel/Makefile_64
+++ b/arch/x86/kernel/Makefile_64
@@ -3,7 +3,6 @@
 #
 
 extra-y:= head_64.o head64.o init_task.o vmlinux.lds
-EXTRA_AFLAGS   := -traditional
 obj-y  := process_64.o signal_64.o entry_64.o traps_64.o irq_64.o \
ptrace_64.o time_64.o ioport_64.o ldt.o setup_64.o i8259_64.o 
sys_x86_64.o \
x8664_ksyms_64.o i387_64.o syscall_64.o vsyscall_64.o \
-- 
1.4.4.2


-
This SF.net email is sponsored by: Splunk Inc.
Still grepping through log files to find problems?  Stop.
Now Search log events and configuration files using AJAX and a browser.
Download your FREE copy of Splunk now >> http://get.splunk.com/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


[kvm-devel] [PATCH 10/24] paravirt hooks at entry functions.

2007-11-09 Thread Glauber de Oliveira Costa
Those are the hooks needed for paravirt at entry_64.S
In general, they follow the way of i386.

Signed-off-by: Glauber de Oliveira Costa <[EMAIL PROTECTED]>
Signed-off-by: Steven Rostedt <[EMAIL PROTECTED]>
Acked-by: Jeremy Fitzhardinge <[EMAIL PROTECTED]>
---
 arch/x86/kernel/entry_64.S |  108 +++-
 1 files changed, 66 insertions(+), 42 deletions(-)

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 3a058bb..b6d7008 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -50,6 +50,7 @@
 #include 
 #include 
 #include 
+#include 
 
.code64
 
@@ -57,6 +58,20 @@
 #define retint_kernel retint_restore_args
 #endif 
 
+#ifdef CONFIG_PARAVIRT
+ENTRY(native_irq_enable_syscall_ret)
+   movq%gs:pda_oldrsp,%rsp
+   swapgs
+   sysretq
+/* 
+ * This could well be defined as a C function, but as it is only used here,
+ * let it be locally defined
+ */
+ENTRY(native_swapgs)
+   swapgs
+   retq
+#endif /* CONFIG_PARAVIRT */
+
 
 .macro TRACE_IRQS_IRETQ offset=ARGOFFSET
 #ifdef CONFIG_TRACE_IRQFLAGS
@@ -216,14 +231,21 @@ ENTRY(system_call)
CFI_DEF_CFA rsp,PDA_STACKOFFSET
CFI_REGISTERrip,rcx
/*CFI_REGISTER  rflags,r11*/
-   swapgs
+   SWAPGS_UNSAFE_STACK
+   /*
+* A hypervisor implementation might want to use a label
+* after the swapgs, so that it can do the swapgs
+* for the guest and jump here on syscall.
+*/
+ENTRY(system_call_after_swapgs)
+
movq%rsp,%gs:pda_oldrsp 
movq%gs:pda_kernelstack,%rsp
/*
 * No need to follow this irqs off/on section - it's straight
 * and short:
 */
-   sti 
+   ENABLE_INTERRUPTS(CLBR_NONE)
SAVE_ARGS 8,1
movq  %rax,ORIG_RAX-ARGOFFSET(%rsp) 
movq  %rcx,RIP-ARGOFFSET(%rsp)
@@ -246,7 +268,7 @@ ret_from_sys_call:
 sysret_check:  
LOCKDEP_SYS_EXIT
GET_THREAD_INFO(%rcx)
-   cli
+   DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
movl threadinfo_flags(%rcx),%edx
andl %edi,%edx
@@ -260,9 +282,7 @@ sysret_check:
CFI_REGISTERrip,rcx
RESTORE_ARGS 0,-ARG_SKIP,1
/*CFI_REGISTER  rflags,r11*/
-   movq%gs:pda_oldrsp,%rsp
-   swapgs
-   sysretq
+   ENABLE_INTERRUPTS_SYSCALL_RET
 
CFI_RESTORE_STATE
/* Handle reschedules */
@@ -271,7 +291,7 @@ sysret_careful:
bt $TIF_NEED_RESCHED,%edx
jnc sysret_signal
TRACE_IRQS_ON
-   sti
+   ENABLE_INTERRUPTS(CLBR_NONE)
pushq %rdi
CFI_ADJUST_CFA_OFFSET 8
call schedule
@@ -282,7 +302,7 @@ sysret_careful:
/* Handle a signal */ 
 sysret_signal:
TRACE_IRQS_ON
-   sti
+   ENABLE_INTERRUPTS(CLBR_NONE)
testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx
jz1f
 
@@ -295,7 +315,7 @@ sysret_signal:
 1: movl $_TIF_NEED_RESCHED,%edi
/* Use IRET because user could have changed frame. This
   works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
-   cli
+   DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
jmp int_with_check

@@ -327,7 +347,7 @@ tracesys:
  */
.globl int_ret_from_sys_call
 int_ret_from_sys_call:
-   cli
+   DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
testl $3,CS-ARGOFFSET(%rsp)
je retint_restore_args
@@ -349,20 +369,20 @@ int_careful:
bt $TIF_NEED_RESCHED,%edx
jnc  int_very_careful
TRACE_IRQS_ON
-   sti
+   ENABLE_INTERRUPTS(CLBR_NONE)
pushq %rdi
CFI_ADJUST_CFA_OFFSET 8
call schedule
popq %rdi
CFI_ADJUST_CFA_OFFSET -8
-   cli
+   DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
jmp int_with_check
 
/* handle signals and tracing -- both require a full stack frame */
 int_very_careful:
TRACE_IRQS_ON
-   sti
+   ENABLE_INTERRUPTS(CLBR_NONE)
SAVE_REST
/* Check for syscall exit trace */  
testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx
@@ -385,7 +405,7 @@ int_signal:
 1: movl $_TIF_NEED_RESCHED,%edi
 int_restore_rest:
RESTORE_REST
-   cli
+   DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
jmp int_with_check
CFI_ENDPROC
@@ -506,7 +526,7 @@ END(stub_rt_sigreturn)
CFI_DEF_CFA_REGISTERrbp
testl $3,CS(%rdi)
je 1f
-   swapgs  
+   SWAPGS
/* irqcount is used to check if a CPU is already on an interrupt
   stack or not. While this is essentially redundant with preempt_count
   it is a little cheaper to use a separate counter in the PDA
@@ -527,7 +547,7 @@ ENTRY(common_interrupt)
interrupt do_IRQ
/* 0(%rsp): oldrsp-ARGOFFSET */
 ret

[kvm-devel] [PATCH 4/24] tlb functions consolidation

2007-11-09 Thread Glauber de Oliveira Costa
This patch consolidates part of the tlb handling functions for the x86
architecture. In this approach, we start by the parts actually used for
paravirt in i386.

Signed-off-by: Glauber de Oliveira Costa <[EMAIL PROTECTED]>
Signed-off-by: Steven Rostedt <[EMAIL PROTECTED]>
Acked-by: Jeremy Fitzhardinge <[EMAIL PROTECTED]>
---
 arch/x86/kernel/smp_64.c  |5 ++-
 include/asm-x86/tlbflush.h|   77 +
 include/asm-x86/tlbflush_32.h |   77 -
 include/asm-x86/tlbflush_64.h |   43 +++
 4 files changed, 85 insertions(+), 117 deletions(-)

diff --git a/arch/x86/kernel/smp_64.c b/arch/x86/kernel/smp_64.c
index 62b0f2a..ce3935b 100644
--- a/arch/x86/kernel/smp_64.c
+++ b/arch/x86/kernel/smp_64.c
@@ -166,11 +166,12 @@ out:
add_pda(irq_tlb_count, 1);
 }
 
-static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
-   unsigned long va)
+void native_flush_tlb_others(const cpumask_t *cpumaskp,
+struct mm_struct *mm, unsigned long va)
 {
int sender;
union smp_flush_state *f;
+   cpumask_t cpumask = *cpumaskp;
 
/* Caller has disabled preemption */
sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
diff --git a/include/asm-x86/tlbflush.h b/include/asm-x86/tlbflush.h
index 9af4cc8..93283cf 100644
--- a/include/asm-x86/tlbflush.h
+++ b/include/asm-x86/tlbflush.h
@@ -1,5 +1,82 @@
+#ifndef _X86_TLBFLUSH_H_
+#define _X86_TLBFLUSH_H_
+
+#ifdef CONFIG_PARAVIRT
+#include 
+#else
+#define __flush_tlb() __native_flush_tlb()
+#define __flush_tlb_global() __native_flush_tlb_global()
+#define __flush_tlb_single(addr) __native_flush_tlb_single(addr)
+#endif
+
+static inline void __native_flush_tlb(void)
+{
+   write_cr3(read_cr3());
+}
+
+static inline void __native_flush_tlb_global(void)
+{
+   unsigned long cr4 = read_cr4();
+   write_cr4(cr4 & ~X86_CR4_PGE);  /* clear PGE */
+   write_cr4(cr4); /* write old PGE again and flush TLBs */
+}
+
+#define __native_flush_tlb_single(addr)\
+   __asm__ __volatile__("invlpg (%0)" ::"r" (addr) : "memory")
+
+#ifdef CONFIG_SMP
+
+#include 
+#include 
+
+#define local_flush_tlb() \
+   __flush_tlb()
+
+extern void flush_tlb_all(void);
+extern void flush_tlb_current_task(void);
+extern void flush_tlb_mm(struct mm_struct *);
+extern void flush_tlb_page(struct vm_area_struct *, unsigned long);
+
+#define flush_tlb()flush_tlb_current_task()
+
+static inline void flush_tlb_range(struct vm_area_struct *vma,
+  unsigned long start, unsigned long end)
+{
+   flush_tlb_mm(vma->vm_mm);
+}
+
+void native_flush_tlb_others(const cpumask_t *cpumask, struct mm_struct *mm,
+unsigned long va);
+
+#define TLBSTATE_OK1
+#define TLBSTATE_LAZY  2
+
+#ifdef CONFIG_X86_64
+/* Roughly an IPI every 20MB with 4k pages for freeing page table
+   ranges. Cost is about 42k of memory for each CPU. */
+#define ARCH_FREE_PTE_NR 5350
+
+#else /* X86_64 */
+struct tlb_state
+{
+   struct mm_struct *active_mm;
+   int state;
+   char __cacheline_padding[L1_CACHE_BYTES-8];
+};
+DECLARE_PER_CPU(struct tlb_state, cpu_tlbstate);
+#endif /* X86_64 */
+
+#endif
+
+#ifndef CONFIG_PARAVIRT
+#define flush_tlb_others(mask, mm, va) \
+   native_flush_tlb_others(&mask, mm, va)
+#endif
+
 #ifdef CONFIG_X86_32
 # include "tlbflush_32.h"
 #else
 # include "tlbflush_64.h"
 #endif
+
+#endif
diff --git a/include/asm-x86/tlbflush_32.h b/include/asm-x86/tlbflush_32.h
index 2bd5b95..07eaf37 100644
--- a/include/asm-x86/tlbflush_32.h
+++ b/include/asm-x86/tlbflush_32.h
@@ -1,49 +1,8 @@
 #ifndef _I386_TLBFLUSH_H
 #define _I386_TLBFLUSH_H
 
-#include 
 #include 
 
-#ifdef CONFIG_PARAVIRT
-#include 
-#else
-#define __flush_tlb() __native_flush_tlb()
-#define __flush_tlb_global() __native_flush_tlb_global()
-#define __flush_tlb_single(addr) __native_flush_tlb_single(addr)
-#endif
-
-#define __native_flush_tlb()   \
-   do {\
-   unsigned int tmpreg;\
-   \
-   __asm__ __volatile__(   \
-   "movl %%cr3, %0;  \n"   \
-   "movl %0, %%cr3;  # flush TLB \n"   \
-   : "=r" (tmpreg) \
-   :: "memory");   \
-   } while (0)
-
-/*
- * Global pages have to be flushed a bit diff

[kvm-devel] [PATCH 1/24] mm/sparse-vmemmap.c: make sure init_mm is included

2007-11-09 Thread Glauber de Oliveira Costa
mm/sparse-vmemmap.c uses init_mm in some places.  However, it is not
present in any of the headers currently included in the file.

init_mm is defined as extern in sched.h, so we add it to the headers list

Up to now, this problem was masked by the fact that functions like
set_pte_at() and pmd_populate_kernel() are usually macros that expand to
simpler variants that do not use the first parameter at all.

Signed-off-by: Glauber de Oliveira Costa <[EMAIL PROTECTED]>
Signed-off-by: Andrew Morton <[EMAIL PROTECTED]>
Signed-off-by: Linus Torvalds <[EMAIL PROTECTED]>
---
 mm/sparse-vmemmap.c |1 +
 1 files changed, 1 insertions(+), 0 deletions(-)

diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index d3b718b..22620f6 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -24,6 +24,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
-- 
1.4.4.2


-
This SF.net email is sponsored by: Splunk Inc.
Still grepping through log files to find problems?  Stop.
Now Search log events and configuration files using AJAX and a browser.
Download your FREE copy of Splunk now >> http://get.splunk.com/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


[kvm-devel] [PATCH 0/24] paravirt_ops for unified x86 - that's me again!

2007-11-09 Thread Glauber de Oliveira Costa
Hey folks,

Here's a new spin of the pvops64 patch series.
We didn't get that many comments from the last time,
so it should be probably almost ready to get in. Heya!

>From the last version, the most notable changes are:
* consolidation of system.h, merging jeremy's comments about ordering
  concerns
* consolidation of smp functions that goes through smp_ops. They're sharing
  a bunch of code now.

Other than that, just some issues that arose from the rebase.

Please note that this patch series _does not_ apply over Linus' git anymore,
but rather, over tglx cleanup series.

The first patch in this series is already on linus', but not on tglx', so
I'm sending it again, because you'll need it if you want to compile it
anyway.

tglx, in the absense of any outstanding NACKs, or any very big call for
improvements, could you please pull it in your tree?

Have fun,



-
This SF.net email is sponsored by: Splunk Inc.
Still grepping through log files to find problems?  Stop.
Now Search log events and configuration files using AJAX and a browser.
Download your FREE copy of Splunk now >> http://get.splunk.com/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


[kvm-devel] [PATCH 2/24] irqflags consolidation

2007-11-09 Thread Glauber de Oliveira Costa
This patch consolidates the irqflags include files containing common
paravirt definitions. The native definition for interrupt handling, halt,
and such, are the same for 32 and 64 bit, and they are kept in irqflags.h.
The differences are split in the arch-specific files.

The syscall function, irq_enable_sysexit, has a very specific i386 naming,
and its name is then changed to a more general one.

Signed-off-by: Glauber de Oliveira Costa <[EMAIL PROTECTED]>
Signed-off-by: Steven Rostedt <[EMAIL PROTECTED]>
Acked-by: Jeremy Fitzhardinge <[EMAIL PROTECTED]>
---
 arch/x86/kernel/asm-offsets_32.c |2 +-
 arch/x86/kernel/entry_32.S   |8 +-
 arch/x86/kernel/paravirt_32.c|   10 +-
 arch/x86/kernel/vmi_32.c |4 +-
 arch/x86/xen/enlighten.c |2 +-
 include/asm-x86/irqflags.h   |  246 +-
 include/asm-x86/irqflags_32.h|  174 ---
 include/asm-x86/irqflags_64.h|  154 
 include/asm-x86/paravirt.h   |9 +-
 9 files changed, 261 insertions(+), 348 deletions(-)

diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 0e45981..c1ccfab 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -123,7 +123,7 @@ void foo(void)
OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
-   OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit);
+   OFFSET(PV_CPU_irq_enable_syscall_ret, pv_cpu_ops, 
irq_enable_syscall_ret);
OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0);
 #endif
 
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index dc7f938..d63609d 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -58,7 +58,7 @@
  * for paravirtualization.  The following will never clobber any registers:
  *   INTERRUPT_RETURN (aka. "iret")
  *   GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
- *   ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit").
+ *   ENABLE_INTERRUPTS_SYSCALL_RET (aka "sti; sysexit").
  *
  * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
  * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
@@ -351,7 +351,7 @@ sysenter_past_esp:
xorl %ebp,%ebp
TRACE_IRQS_ON
 1: mov  PT_FS(%esp), %fs
-   ENABLE_INTERRUPTS_SYSEXIT
+   ENABLE_INTERRUPTS_SYSCALL_RET
CFI_ENDPROC
 .pushsection .fixup,"ax"
 2: movl $0,PT_FS(%esp)
@@ -882,10 +882,10 @@ ENTRY(native_iret)
 .previous
 END(native_iret)
 
-ENTRY(native_irq_enable_sysexit)
+ENTRY(native_irq_enable_syscall_ret)
sti
sysexit
-END(native_irq_enable_sysexit)
+END(native_irq_enable_syscall_ret)
 #endif
 
 KPROBE_ENTRY(int3)
diff --git a/arch/x86/kernel/paravirt_32.c b/arch/x86/kernel/paravirt_32.c
index 6a80d67..04f51d0 100644
--- a/arch/x86/kernel/paravirt_32.c
+++ b/arch/x86/kernel/paravirt_32.c
@@ -60,7 +60,7 @@ DEF_NATIVE(pv_irq_ops, irq_enable, "sti");
 DEF_NATIVE(pv_irq_ops, restore_fl, "push %eax; popf");
 DEF_NATIVE(pv_irq_ops, save_fl, "pushf; pop %eax");
 DEF_NATIVE(pv_cpu_ops, iret, "iret");
-DEF_NATIVE(pv_cpu_ops, irq_enable_sysexit, "sti; sysexit");
+DEF_NATIVE(pv_cpu_ops, irq_enable_syscall_ret, "sti; sysexit");
 DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax");
 DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3");
 DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax");
@@ -88,7 +88,7 @@ static unsigned native_patch(u8 type, u16 clobbers, void 
*ibuf,
SITE(pv_irq_ops, restore_fl);
SITE(pv_irq_ops, save_fl);
SITE(pv_cpu_ops, iret);
-   SITE(pv_cpu_ops, irq_enable_sysexit);
+   SITE(pv_cpu_ops, irq_enable_syscall_ret);
SITE(pv_mmu_ops, read_cr2);
SITE(pv_mmu_ops, read_cr3);
SITE(pv_mmu_ops, write_cr3);
@@ -186,7 +186,7 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void 
*insnbuf,
/* If the operation is a nop, then nop the callsite */
ret = paravirt_patch_nop();
else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) ||
-type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit))
+type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_syscall_ret))
/* If operation requires a jmp, then jmp */
ret = paravirt_patch_jmp(insnbuf, opfunc, addr, len);
else
@@ -237,7 +237,7 @@ static void native_flush_tlb_single(unsigned long addr)
 
 /* These are in entry.S */
 extern void native_iret(void);
-extern void native_irq_enable_sysexit(void);
+extern void native_irq_enable_syscall_ret(void);
 
 static int __init print_banner(void)
 {
@@ -384,7 +384,7 @@ struct pv_cpu_ops pv_cpu_op

[kvm-devel] [PATCH 2/3] kvmclock - the host part.

2007-11-08 Thread Glauber de Oliveira Costa
This is the host part of kvm clocksource implementation. As it does
not include clockevents, it is a fairly simple implementation. We
only have to register a per-vcpu area, and start writing to it periodically.

Signed-off-by: Glauber de Oliveira Costa <[EMAIL PROTECTED]>
---
 drivers/kvm/kvm_main.c |1 +
 drivers/kvm/x86.c  |   32 
 drivers/kvm/x86.h  |4 
 3 files changed, 37 insertions(+), 0 deletions(-)

diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c
index d095002..c2c79b8 100644
--- a/drivers/kvm/kvm_main.c
+++ b/drivers/kvm/kvm_main.c
@@ -1243,6 +1243,7 @@ static long kvm_dev_ioctl(struct file *filp,
case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
case KVM_CAP_USER_MEMORY:
case KVM_CAP_SET_TSS_ADDR:
+   case KVM_CAP_CLOCKSOURCE:
r = 1;
break;
default:
diff --git a/drivers/kvm/x86.c b/drivers/kvm/x86.c
index e905d46..ef31fed 100644
--- a/drivers/kvm/x86.c
+++ b/drivers/kvm/x86.c
@@ -19,6 +19,7 @@
 #include "segment_descriptor.h"
 #include "irq.h"
 
+#include 
 #include 
 #include 
 #include 
@@ -1628,6 +1629,28 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_emulate_halt);
 
+static void kvm_write_guest_time(struct kvm_vcpu *vcpu)
+{
+   struct timespec ts;
+   int r;
+
+   if (!vcpu->clock_gpa)
+   return;
+
+   /* Updates version to the next odd number, indicating we're writing */
+   vcpu->hv_clock.version++;
+   kvm_write_guest(vcpu->kvm, vcpu->clock_gpa, &vcpu->hv_clock, PAGE_SIZE);
+
+   kvm_get_msr(vcpu, MSR_IA32_TIME_STAMP_COUNTER,
+ &vcpu->hv_clock.last_tsc);
+
+   ktime_get_ts(&ts);
+   vcpu->hv_clock.now_ns = ts.tv_nsec + (NSEC_PER_SEC * (u64)ts.tv_sec);
+   vcpu->hv_clock.wc_sec = get_seconds();
+   vcpu->hv_clock.version++;
+   kvm_write_guest(vcpu->kvm, vcpu->clock_gpa, &vcpu->hv_clock, PAGE_SIZE);
+}
+
 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
 {
unsigned long nr, a0, a1, a2, a3, ret;
@@ -1648,7 +1671,15 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
a3 &= 0x;
}
 
+   ret = 0;
switch (nr) {
+   case  KVM_HCALL_REGISTER_CLOCK:
+
+   vcpu->clock_gpa = a0 << PAGE_SHIFT;
+   vcpu->hv_clock.tsc_mult = clocksource_khz2mult(tsc_khz, 22);
+
+   break;
+
default:
ret = -KVM_ENOSYS;
break;
@@ -1924,6 +1955,7 @@ out:
goto preempted;
}
 
+   kvm_write_guest_time(vcpu);
post_kvm_run_save(vcpu, kvm_run);
 
return r;
diff --git a/drivers/kvm/x86.h b/drivers/kvm/x86.h
index 663b822..fd77b66 100644
--- a/drivers/kvm/x86.h
+++ b/drivers/kvm/x86.h
@@ -83,6 +83,10 @@ struct kvm_vcpu {
/* emulate context */
 
struct x86_emulate_ctxt emulate_ctxt;
+
+   struct kvm_hv_clock_s hv_clock;
+   gpa_t clock_gpa; /* guest frame number, physical addr */
+
 };
 
 int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code);
-- 
1.5.0.6


-
This SF.net email is sponsored by: Splunk Inc.
Still grepping through log files to find problems?  Stop.
Now Search log events and configuration files using AJAX and a browser.
Download your FREE copy of Splunk now >> http://get.splunk.com/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


[kvm-devel] [PATCH 3/3] kvmclock implementation, the guest part.

2007-11-08 Thread Glauber de Oliveira Costa
This is the guest part of kvm clock implementation
It does not do tsc-only timing, as tsc can have deltas
between cpus, and it did not seem worthwhile to me to keep
adjusting them.

We do use it, however, for fine-grained adjustment.

Other than that, time comes from the host.

Signed-off-by: Glauber de Oliveira Costa <[EMAIL PROTECTED]>
---
 arch/x86/Kconfig.i386   |   10 +++
 arch/x86/kernel/Makefile_32 |1 +
 arch/x86/kernel/kvmclock.c  |  171 +++
 arch/x86/kernel/setup_32.c  |5 +
 4 files changed, 187 insertions(+), 0 deletions(-)
 create mode 100644 arch/x86/kernel/kvmclock.c

diff --git a/arch/x86/Kconfig.i386 b/arch/x86/Kconfig.i386
index 7331efe..5fe4025 100644
--- a/arch/x86/Kconfig.i386
+++ b/arch/x86/Kconfig.i386
@@ -257,6 +257,16 @@ config VMI
  at the moment), by linking the kernel to a GPL-ed ROM module
  provided by the hypervisor.
 
+config KVM_CLOCK
+   bool "KVM paravirtualized clock"
+   select PARAVIRT
+   help
+ Turning on this option will allow you to run a paravirtualized clock
+ when running over the KVM hypervisor. Instead of relying on a PIT
+ (or probably other) emulation by the underlying device model, the host
+ provides the guest with timing infrastructure, as time of day, and
+ timer expiration.
+
 source "arch/x86/lguest/Kconfig"
 
 endif
diff --git a/arch/x86/kernel/Makefile_32 b/arch/x86/kernel/Makefile_32
index b9d6798..df76d8c 100644
--- a/arch/x86/kernel/Makefile_32
+++ b/arch/x86/kernel/Makefile_32
@@ -43,6 +43,7 @@ obj-$(CONFIG_K8_NB)   += k8.o
 obj-$(CONFIG_MGEODE_LX)+= geode_32.o mfgpt_32.o
 
 obj-$(CONFIG_VMI)  += vmi_32.o vmiclock_32.o
+obj-$(CONFIG_KVM_CLOCK)+= kvmclock.o
 obj-$(CONFIG_PARAVIRT) += paravirt_32.o
 obj-y  += pcspeaker.o
 
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
new file mode 100644
index 000..df14613
--- /dev/null
+++ b/arch/x86/kernel/kvmclock.c
@@ -0,0 +1,171 @@
+/*  KVM paravirtual clock driver. A clocksource implementation
+Copyright (C) 2007 Glauber de Oliveira Costa, Red Hat Inc.
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+*/
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+
+#define KVM_SCALE 22
+
+#define get_clock(cpu, field) hv_clock[cpu].fields.field
+
+static int kvmclock = 1;
+
+static int parse_no_kvmclock(char *arg)
+{
+   kvmclock = 0;
+   return 0;
+}
+early_param("no-kvmclock", parse_no_kvmclock);
+
+/* The hypervisor will put information about time periodically here */
+union kvm_hv_clock hv_clock[NR_CPUS] __attribute__((__aligned__(PAGE_SIZE)));
+
+static inline u64 kvm_get_delta(u64 last_tsc)
+{
+   int cpu = smp_processor_id();
+   u64 delta = native_read_tsc() - last_tsc;
+   return (delta * get_clock(cpu, tsc_mult)) >> KVM_SCALE;
+}
+
+/*
+ * The wallclock is the time of day when we booted. Since then, some time may
+ * have elapsed since the hypervisor wrote the data. So we try to account for
+ * that. Even if the tsc is not accurate, it gives us a more accurate timing
+ * than not adjusting at all
+ */
+unsigned long kvm_get_wallclock(void)
+{
+   u64 wc_sec, delta, last_tsc;
+   struct timespec ts;
+   int version, nsec, cpu = smp_processor_id();
+
+   do {
+   version = get_clock(cpu, version);
+   rmb();
+   last_tsc = get_clock(cpu, last_tsc);
+   rmb();
+   wc_sec = get_clock(cpu, wc_sec);
+   rmb();
+   } while ((get_clock(cpu, version) != version) && !(version & 1));
+
+   delta = kvm_get_delta(last_tsc);
+   nsec = do_div(delta, NSEC_PER_SEC);
+   set_normalized_timespec(&ts, wc_sec + delta, nsec);
+
+   /*
+* Of all mechanisms of time adjustment I've tested, this one
+* was the champion!
+*/
+   return ts.tv_sec + 1;
+}
+
+int kvm_set_wallclock(unsigned long now)
+{
+   return 0;
+}
+
+/*
+ * This is our read_clock function. The host puts an tsc timestamp each time
+ * it updates a new time, and then we can use it to derive a

[kvm-devel] [PATCH 1/3] include files for kvmclock

2007-11-08 Thread Glauber de Oliveira Costa
This patch introduces the include files for kvm clock.
They'll be needed for both guest and host part.

Signed-off-by: Glauber de Oliveira Costa <[EMAIL PROTECTED]>
---
 include/asm-x86/kvm_para.h |   25 +
 include/linux/kvm.h|1 +
 include/linux/kvm_para.h   |2 ++
 3 files changed, 28 insertions(+), 0 deletions(-)

diff --git a/include/asm-x86/kvm_para.h b/include/asm-x86/kvm_para.h
index c6f3fd8..0f6b813 100644
--- a/include/asm-x86/kvm_para.h
+++ b/include/asm-x86/kvm_para.h
@@ -10,15 +10,40 @@
  * paravirtualization, the appropriate feature bit should be checked.
  */
 #define KVM_CPUID_FEATURES 0x4001
+#define KVM_FEATURE_CLOCKSOURCE 0
 
 #ifdef __KERNEL__
 #include 
+extern void kvmclock_init(void);
+
+/*
+ * Guest has page alignment and padding requirements. At the host, it will
+ * only lead to wasted space at the vcpu struct. For this reason, the struct
+ * is not anonymous
+ */
+union kvm_hv_clock {
+   struct kvm_hv_clock_s {
+   u64 tsc_mult;
+   u64 now_ns;
+   /* That's the wall clock, not the water closet */
+   u64 wc_sec;
+   u64 last_tsc;
+   /* At first, we could use the tsc value as a marker, but Jeremy
+* well noted that it will cause us locking problems in 32-bit
+* sys, so we have a special version field */
+   u32 version;
+   } fields;
+   char page_align[PAGE_SIZE];
+};
+
 
 /* This instruction is vmcall.  On non-VT architectures, it will generate a
  * trap that we will then rewrite to the appropriate instruction.
  */
 #define KVM_HYPERCALL ".byte 0x0f,0x01,0xc1"
 
+#define KVM_HCALL_REGISTER_CLOCK   1
+
 /* For KVM hypercalls, a three-byte sequence of either the vmrun or the vmmrun
  * instruction.  The hypervisor may replace it with something else but only the
  * instructions are guaranteed to be supported.
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 71d33d6..9862241 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -359,6 +359,7 @@ struct kvm_signal_mask {
 #define KVM_CAP_MMU_SHADOW_CACHE_CONTROL 2
 #define KVM_CAP_USER_MEMORY 3
 #define KVM_CAP_SET_TSS_ADDR 4
+#define KVM_CAP_CLOCKSOURCE  5
 
 /*
  * ioctls for VM fds
diff --git a/include/linux/kvm_para.h b/include/linux/kvm_para.h
index e4db25f..094efc7 100644
--- a/include/linux/kvm_para.h
+++ b/include/linux/kvm_para.h
@@ -11,6 +11,8 @@
 
 /* Return values for hypercalls */
 #define KVM_ENOSYS 1000
+#define KVM_EINVAL 1019
+#define KVM_ENODEV 1022
 
 #ifdef __KERNEL__
 /*
-- 
1.5.0.6


-
This SF.net email is sponsored by: Splunk Inc.
Still grepping through log files to find problems?  Stop.
Now Search log events and configuration files using AJAX and a browser.
Download your FREE copy of Splunk now >> http://get.splunk.com/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


[kvm-devel] [PATCH 0/3] Kvm clocksource, new spin

2007-11-08 Thread Glauber de Oliveira Costa
Hi folks,

Here's a new spin of the clocksource implementation.
In this new version:
* followed avi's suggestion of:
  - letting the cpu itself register its memory area.
  - using a gfn instead of a phys addr as a parameter, to be sure we can 
cover the whole memory area
  - write guest time at exits.

Also, I 'm not using an anonymous struct in the kvm_hv_clock union, so the
vcpu struct can grab just what it needs, and not the whole padding the guest 
needs

This is it.

Have fun



-
This SF.net email is sponsored by: Splunk Inc.
Still grepping through log files to find problems?  Stop.
Now Search log events and configuration files using AJAX and a browser.
Download your FREE copy of Splunk now >> http://get.splunk.com/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] kvmclock - the host part.

2007-11-07 Thread Glauber de Oliveira Costa
-BEGIN PGP SIGNED MESSAGE-
Hash: SHA1

Avi Kivity escreveu:
> Glauber de Oliveira Costa wrote:
>> This is the host part of kvm clocksource implementation. As it does
>> not include clockevents, it is a fairly simple implementation. We
>> only have to register a per-vcpu area, and start writting to it periodically.
>>
>> Signed-off-by: Glauber de Oliveira Costa <[EMAIL PROTECTED]>
>> ---
>>  drivers/kvm/irq.c  |1 +
>>  drivers/kvm/kvm_main.c |2 +
>>  drivers/kvm/svm.c  |1 +
>>  drivers/kvm/vmx.c  |1 +
>>  drivers/kvm/x86.c  |   59 
>> 
>>  drivers/kvm/x86.h  |   13 ++
>>  6 files changed, 77 insertions(+), 0 deletions(-)
>>
>> diff --git a/drivers/kvm/irq.c b/drivers/kvm/irq.c
>> index 22bfeee..0344879 100644
>> --- a/drivers/kvm/irq.c
>> +++ b/drivers/kvm/irq.c
>> @@ -92,6 +92,7 @@ void kvm_vcpu_kick_request(struct kvm_vcpu *vcpu, int 
>> request)
>>  
>>  void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
>>  {
>> +vcpu->time_needs_update = 1;
>>   
> 
> Why here and not in __vcpu_run()?  It isn't timer irq related.
Because my plan was exactly, updating it at each timer interrupt.
There's a trade off between
updating every run (hopefully more precision, but more overhead), versus
updating at timer irqs, or other events.

What would you prefer?

>> @@ -1242,6 +1243,7 @@ static long kvm_dev_ioctl(struct file *filp,
>>  case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
>>  case KVM_CAP_USER_MEMORY:
>>  case KVM_CAP_SET_TSS_ADDR:
>> +case KVM_CAP_CLK:
>>   
> 
> It's just a clock source now, right?  so _CLOCK_SOURCE.
Right.

>> +static void kvm_write_guest_time(struct kvm_vcpu *vcpu)
>> +{
>> +struct timespec ts;
>> +void *clock_addr;
>> +
>> +
>> +if (!vcpu->clock_page)
>> +return;
>> +
>> +/* Updates version to the next odd number, indicating we're writing */
>> +vcpu->hv_clock.version++;
>>   
> 
> No one can actually see this as you're updating a private structure. 
> You need to copy it to guestspace.
That's true, I'm just copying it at the end, the whole thing. thanks.

>> +/* Updating the tsc count is the first thing we do */
>> +kvm_get_msr(vcpu, MSR_IA32_TIME_STAMP_COUNTER, 
>> &vcpu->hv_clock.last_tsc);
>> +ktime_get_ts(&ts);
>> +vcpu->hv_clock.now_ns = ts.tv_nsec + (NSEC_PER_SEC * (u64)ts.tv_sec);
>> +vcpu->hv_clock.wc_sec = get_seconds();
>> +vcpu->hv_clock.version++;
>> +
>> +clock_addr = vcpu->clock_addr;
>> +memcpy(clock_addr, &vcpu->hv_clock, sizeof(vcpu->hv_clock));
>> +mark_page_dirty(vcpu->kvm, vcpu->clock_gfn);
>>   
> 
> Just use kvm_write_guest().
Too slow. Updating guest time, even only at timer interrupts, was too
frequent an operation, and the kmap / kunmap (atomic) at every iteration
rendered the whole thing unusable.

>> +
>> +vcpu->time_needs_update = 0;
>> +}
>> +
>>  int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
>>  {
>>  unsigned long nr, a0, a1, a2, a3, ret;
>> @@ -1648,7 +1674,33 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
>>  a3 &= 0x;
>>  }
>>  
>> +ret = 0;
>>  switch (nr) {
>> +case  KVM_HCALL_REGISTER_CLOCK: {
>> +struct kvm_vcpu *dst_vcpu;
>> +
>> +if (!((a1 < KVM_MAX_VCPUS) && (vcpu->kvm->vcpus[a1]))) {
>> +ret = -KVM_EINVAL;
>> +break;
>> +}
>> +
>> +dst_vcpu = vcpu->kvm->vcpus[a1];
>>   
> 
> What if !dst_vcpu?  What about locking?
> 
> Suggest simply using vcpu.  Every guest cpu can register its own
Earlier version had a check for !dst_vcpu, you are absolutely right.

Locking was not a problem in practice, because these operations are done
 serialized, by the same cpu.

This hypercall is called by cpu_up, which, at least in the beginning,
it's called by cpu0. And that's why each vcpu cannot register its own.
(And why we don't need locking).

Well, theoretically each vcpu can register its own clocksource; it
will just be a little bit more complicated: we have to fire off an IPI,
have the other cpu catch it, and call the hypercall.

But I honestly don't like it.
Usually, the cpu leaves start_secondary with a clock already registered,
so the kernel relies on it.


Re: [kvm-devel] include files for kvmclock

2007-11-07 Thread Glauber de Oliveira Costa
-BEGIN PGP SIGNED MESSAGE-
Hash: SHA1

Akio Takebe escreveu:
> Hi, Glauber
> 
> This is interesting facility. :-)
> 
>> +#define KVM_HCALL_REGISTER_CLOCK1
>> +
>> +union kvm_hv_clock {
>> +struct {
>> +u64 tsc_mult;
>> +u64 now_ns;
>> +/* That's the wall clock, not the water closet */
>> +u64 wc_sec;
>> +u64 wc_nsec;
>> +u64 last_tsc;
>> +/* At first, we could use the tsc value as a marker, but Jeremy
>> + * well noted that it will cause us locking problems in 32-bit
>> + * sys, so we have a special version field */
>> +u32 version;
>> +};
>> +char page_align[PAGE_SIZE];
>> +};
>> +
> Why does kvm_hv_clock need page_align?
Each vcpu will register a page of its own. On the guest side, it will be
an array of pages. So we make it page-sized.

> And also the kvm_hv_clock is alloced with kvm_vcpu,
There are no requirements on the host part at all. So it doesn't really
matter. In the next version, I may make it even a simple pointer.

> so the align is not enough, isn't it?
> I thik __atribute__((__aligne__(PAGE_SIZE is better than it.
It deals with the start of the structure, but not with its size. See the
guest part: Where it matters, I do use it.


Thanks

-BEGIN PGP SIGNATURE-
Version: GnuPG v1.4.7 (GNU/Linux)
Comment: Using GnuPG with Remi - http://enigmail.mozdev.org

iD8DBQFHMbqyjYI8LaFUWXMRAgfOAKCTeKF3cWbhILYSXY+MjtXo8B87EwCeNNhn
z9RDYaCWHIxsqlciMF0i27w=
=EIEM
-END PGP SIGNATURE-

-
This SF.net email is sponsored by: Splunk Inc.
Still grepping through log files to find problems?  Stop.
Now Search log events and configuration files using AJAX and a browser.
Download your FREE copy of Splunk now >> http://get.splunk.com/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] include files for kvmclock

2007-11-07 Thread Glauber de Oliveira Costa
-BEGIN PGP SIGNED MESSAGE-
Hash: SHA1

Jeremy Fitzhardinge escreveu:
> Avi Kivity wrote:
>> Glauber de Oliveira Costa wrote:
>>   
>>>> +union kvm_hv_clock {
>>>> +   struct {
>>>> +   u64 tsc_mult;
>>>> +   u64 now_ns;
>>>> +   /* That's the wall clock, not the water closet */
>>>> +   u64 wc_sec;
>>>> +   u64 wc_nsec;
>>>> 
>>>>   
>> Do we really need 128-bit time?  you must be planning to live forever.
>>   
> 
> Well, he's planning on having lots of very small nanoseconds.
> 
> J
The wc_nsec is legacy, and should be gone. It's not really used in
current code. However, you gave me a very good idea. Living forever
would be awesome! Where can I apply ?
-BEGIN PGP SIGNATURE-
Version: GnuPG v1.4.7 (GNU/Linux)
Comment: Using GnuPG with Remi - http://enigmail.mozdev.org

iD8DBQFHMbRfjYI8LaFUWXMRAjsQAJ4vDBW0M48fMaL9sl6XfN0+Pd82egCgutwe
9rR7+H8MUQznyinlJc76kbo=
=b3wx
-END PGP SIGNATURE-

-
This SF.net email is sponsored by: Splunk Inc.
Still grepping through log files to find problems?  Stop.
Now Search log events and configuration files using AJAX and a browser.
Download your FREE copy of Splunk now >> http://get.splunk.com/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] include files for kvmclock

2007-11-06 Thread Glauber de Oliveira Costa
-BEGIN PGP SIGNED MESSAGE-
Hash: SHA1

Jeremy Fitzhardinge escreveu:
> Glauber de Oliveira Costa wrote:
>> This patch introduces the include files for kvm clock.
>> They'll be needed for both guest and host part.
>>
>> Signed-off-by: Glauber de Oliveira Costa <[EMAIL PROTECTED]>
>> ---
>>  include/asm-x86/kvm_para.h |   23 +++
>>  include/linux/kvm.h|1 +
>>  include/linux/kvm_para.h   |   20 
>>  3 files changed, 44 insertions(+), 0 deletions(-)
>>
>> diff --git a/include/asm-x86/kvm_para.h b/include/asm-x86/kvm_para.h
>> index c6f3fd8..af9fb75 100644
>> --- a/include/asm-x86/kvm_para.h
>> +++ b/include/asm-x86/kvm_para.h
>> @@ -10,15 +10,38 @@
>>   * paravirtualization, the appropriate feature bit should be checked.
>>   */
>>  #define KVM_CPUID_FEATURES  0x4001
>> +#define KVM_FEATURE_CLOCKEVENTS 0
>> +#define KVM_FEATURE_CLOCKSOURCE 1
>> +
>>  
>>  #ifdef __KERNEL__
>>  #include 
>> +extern void kvmclock_init(void);
>> +
>> +union kvm_hv_clock {
>>   
> 
> Why two copies of this structure?
> 
It's called silly mistake. ;-)

-BEGIN PGP SIGNATURE-
Version: GnuPG v1.4.7 (GNU/Linux)
Comment: Using GnuPG with Remi - http://enigmail.mozdev.org

iD8DBQFHMPGvjYI8LaFUWXMRAgt2AJ9NKgq2LCueUidH56ZgUYA+5wBhGwCfdqQB
otFP1/SFowaANQ8FojEtJUE=
=8Xqp
-END PGP SIGNATURE-

-
This SF.net email is sponsored by: Splunk Inc.
Still grepping through log files to find problems?  Stop.
Now Search log events and configuration files using AJAX and a browser.
Download your FREE copy of Splunk now >> http://get.splunk.com/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] include files for kvmclock

2007-11-06 Thread Glauber de Oliveira Costa
On 11/6/07, Glauber de Oliveira Costa <[EMAIL PROTECTED]> wrote:
> This patch introduces the include files for kvm clock.
> They'll be needed for both guest and host part.

And of course, this was my test files by mistake ;-)
Oh god... ;-)

Patches aren't numbered, but this one should go first. And please just ignore
the replica of the hv_clock union definition in the patch below. It
should all go in asm/kvm_para.h

The other two patches are fine, and can be applied in any order, so
it's not worth resending. I'll grab your comments first.

> Signed-off-by: Glauber de Oliveira Costa <[EMAIL PROTECTED]>
> ---
>  include/asm-x86/kvm_para.h |   23 +++
>  include/linux/kvm.h|1 +
>  include/linux/kvm_para.h   |   20 
>  3 files changed, 44 insertions(+), 0 deletions(-)
>
> diff --git a/include/asm-x86/kvm_para.h b/include/asm-x86/kvm_para.h
> index c6f3fd8..af9fb75 100644
> --- a/include/asm-x86/kvm_para.h
> +++ b/include/asm-x86/kvm_para.h
> @@ -10,15 +10,38 @@
>   * paravirtualization, the appropriate feature bit should be checked.
>   */
>  #define KVM_CPUID_FEATURES 0x4001
> +#define KVM_FEATURE_CLOCKEVENTS 0
> +#define KVM_FEATURE_CLOCKSOURCE 1
> +
>
>  #ifdef __KERNEL__
>  #include 
> +extern void kvmclock_init(void);
> +
> +union kvm_hv_clock {
> +   struct {
> +   u64 tsc_mult;
> +   u64 now_ns;
> +   /* That's the wall clock, not the water closet */
> +   u64 wc_sec;
> +   u64 wc_nsec;
> +   u64 last_tsc;
> +   /* At first, we could use the tsc value as a marker, but 
> Jeremy
> +* well noted that it will cause us locking problems in 32-bit
> +* sys, so we have a special version field */
> +   u32 version;
> +   };
> +   char page_align[PAGE_SIZE];
> +};
> +
>
>  /* This instruction is vmcall.  On non-VT architectures, it will generate a
>   * trap that we will then rewrite to the appropriate instruction.
>   */
>  #define KVM_HYPERCALL ".byte 0x0f,0x01,0xc1"
>
> +#define KVM_HCALL_REGISTER_CLOCK   1
> +
>  /* For KVM hypercalls, a three-byte sequence of either the vmrun or the 
> vmmrun
>   * instruction.  The hypervisor may replace it with something else but only 
> the
>   * instructions are guaranteed to be supported.
> diff --git a/include/linux/kvm.h b/include/linux/kvm.h
> index 71d33d6..7ac8786 100644
> --- a/include/linux/kvm.h
> +++ b/include/linux/kvm.h
> @@ -359,6 +359,7 @@ struct kvm_signal_mask {
>  #define KVM_CAP_MMU_SHADOW_CACHE_CONTROL 2
>  #define KVM_CAP_USER_MEMORY 3
>  #define KVM_CAP_SET_TSS_ADDR 4
> +#define KVM_CAP_CLK  5
>
>  /*
>   * ioctls for VM fds
> diff --git a/include/linux/kvm_para.h b/include/linux/kvm_para.h
> index e4db25f..567a192 100644
> --- a/include/linux/kvm_para.h
> +++ b/include/linux/kvm_para.h
> @@ -11,8 +11,28 @@
>
>  /* Return values for hypercalls */
>  #define KVM_ENOSYS 1000
> +#define KVM_ENODEV 1019
> +#define KVM_EINVAL 1022
>
>  #ifdef __KERNEL__
> +#define KVM_HCALL_REGISTER_CLOCK   1
> +
> +union kvm_hv_clock {
> +   struct {
> +   u64 tsc_mult;
> +   u64 now_ns;
> +   /* That's the wall clock, not the water closet */
> +   u64 wc_sec;
> +   u64 wc_nsec;
> +   u64 last_tsc;
> +   /* At first, we could use the tsc value as a marker, but 
> Jeremy
> +* well noted that it will cause us locking problems in 32-bit
> +* sys, so we have a special version field */
> +   u32 version;
> +   };
> +   char page_align[PAGE_SIZE];
> +};
> +
>  /*
>   * hypercalls use architecture specific
>   */
> --
> 1.5.0.6
>
>
> -
> This SF.net email is sponsored by: Splunk Inc.
> Still grepping through log files to find problems?  Stop.
> Now Search log events and configuration files using AJAX and a browser.
> Download your FREE copy of Splunk now >> http://get.splunk.com/
> ___
> kvm-devel mailing list
> kvm-devel@lists.sourceforge.net
> https://lists.sourceforge.net/lists/listinfo/kvm-devel
>


-- 
Glauber de Oliveira Costa.
"Free as in Freedom"
http://glommer.net

"The less confident you are, the more serious you have to act."

-
This SF.net email is sponsored by: Splunk Inc.
Still grepping through log files to find problems?  Stop.
Now Search log events and configuration files using AJAX and a browser.
Download your FREE copy of Splunk now >> http://get.splunk.com/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


[kvm-devel] kvmclock implementation, the guest part.

2007-11-06 Thread Glauber de Oliveira Costa
This is the guest part of kvm clock implementation
It does not do tsc-only timing, as tsc can have deltas
between cpus, and it did not seem worthwhile to me to keep
adjusting them.

We do use it, however, for fine-grained adjustment.

Other than that, time comes from the host.

Signed-off-by: Glauber de Oliveira Costa <[EMAIL PROTECTED]>
---
 arch/i386/Kconfig   |   10 +++
 arch/x86/kernel/Makefile_32 |1 +
 arch/x86/kernel/kvmclock.c  |  164 +++
 arch/x86/kernel/setup_32.c  |5 ++
 4 files changed, 180 insertions(+), 0 deletions(-)
 create mode 100644 arch/x86/kernel/kvmclock.c

diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index b4437ce..a3b45f1 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -257,6 +257,16 @@ config VMI
  at the moment), by linking the kernel to a GPL-ed ROM module
  provided by the hypervisor.
 
+config KVM_CLOCK
+   bool "KVM paravirtualized clock"
+   select PARAVIRT
+   help
+ Turning on this option will allow you to run a paravirtualized clock
+ when running over the KVM hypervisor. Instead of relying on a PIT
+ (or probably other) emulation by the underlying device model, the host
+ provides the guest with timing infrastructure, as time of day, and
+ timer expiration.
+
 source "arch/x86/lguest/Kconfig"
 
 endif
diff --git a/arch/x86/kernel/Makefile_32 b/arch/x86/kernel/Makefile_32
index b9d6798..df76d8c 100644
--- a/arch/x86/kernel/Makefile_32
+++ b/arch/x86/kernel/Makefile_32
@@ -43,6 +43,7 @@ obj-$(CONFIG_K8_NB)   += k8.o
 obj-$(CONFIG_MGEODE_LX)+= geode_32.o mfgpt_32.o
 
 obj-$(CONFIG_VMI)  += vmi_32.o vmiclock_32.o
+obj-$(CONFIG_KVM_CLOCK)+= kvmclock.o
 obj-$(CONFIG_PARAVIRT) += paravirt_32.o
 obj-y  += pcspeaker.o
 
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
new file mode 100644
index 000..8778d61
--- /dev/null
+++ b/arch/x86/kernel/kvmclock.c
@@ -0,0 +1,164 @@
+/*  KVM paravirtual clock driver. A clocksource implementation
+Copyright (C) 2007 Glauber de Oliveira Costa, Red Hat Inc.
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+*/
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+
+#define KVM_SCALE 22
+
+static int kvmclock = 1;
+
+static int parse_no_kvmclock(char *arg)
+{
+   kvmclock = 0;
+   return 0;
+}
+early_param("no-kvmclock", parse_no_kvmclock);
+
+/* The hypervisor will put information about time periodically here */
+union kvm_hv_clock hv_clock[NR_CPUS] __attribute__((__aligned__(PAGE_SIZE)));
+
+/*
+ * The wallclock is the time of day when we booted. Since then, some time may
+ * have elapsed since the hypervisor wrote the data. So we try to account for
+ * that. Even if the tsc is not accurate, it gives us a more accurate timing
+ * than not adjusting at all
+ */
+unsigned long kvm_get_wallclock(void)
+{
+   u64 wc_sec, delta, last_tsc;
+   struct timespec ts;
+   int version, nsec, cpu = smp_processor_id();
+
+   do {
+   version = hv_clock[cpu].version;
+   rmb();
+   last_tsc = hv_clock[cpu].last_tsc;
+   rmb();
+   wc_sec = hv_clock[cpu].wc_sec;
+   rmb();
+   } while ((hv_clock[cpu].version != version) && !(version & 1));
+
+   rdtscll(delta);
+   delta = delta - last_tsc;
+   delta = (delta * hv_clock[cpu].tsc_mult) >> KVM_SCALE;
+   nsec = do_div(delta, NSEC_PER_SEC);
+   set_normalized_timespec(&ts, wc_sec + delta, nsec);
+
+   /*
+* Of all mechanisms of time adjustment I've tested, this one
+* was the champion!
+*/
+   return ts.tv_sec + 1;
+}
+
+int kvm_set_wallclock(unsigned long now)
+{
+   return 0;
+}
+
+/*
+ * This is our read_clock function. The host puts an tsc timestamp each time
+ * it updates a new time, and then we can use it to derive a slightly more
+ * precise notion of elapsed time, converted to nanoseconds.
+ */
+static cycle_t kvm_clock_read(void)
+{
+
+   u64 delta, last_tsc, now;
+   u32 version;
+   int cpu = smp_proces

[kvm-devel] include files for kvmclock

2007-11-06 Thread Glauber de Oliveira Costa
This patch introduces the include files for kvm clock.
They'll be needed for both guest and host part.

Signed-off-by: Glauber de Oliveira Costa <[EMAIL PROTECTED]>
---
 include/asm-x86/kvm_para.h |   23 +++
 include/linux/kvm.h|1 +
 include/linux/kvm_para.h   |   20 
 3 files changed, 44 insertions(+), 0 deletions(-)

diff --git a/include/asm-x86/kvm_para.h b/include/asm-x86/kvm_para.h
index c6f3fd8..af9fb75 100644
--- a/include/asm-x86/kvm_para.h
+++ b/include/asm-x86/kvm_para.h
@@ -10,15 +10,38 @@
  * paravirtualization, the appropriate feature bit should be checked.
  */
 #define KVM_CPUID_FEATURES 0x4001
+#define KVM_FEATURE_CLOCKEVENTS 0
+#define KVM_FEATURE_CLOCKSOURCE 1
+
 
 #ifdef __KERNEL__
 #include 
+extern void kvmclock_init(void);
+
+union kvm_hv_clock {
+   struct {
+   u64 tsc_mult;
+   u64 now_ns;
+   /* That's the wall clock, not the water closet */
+   u64 wc_sec;
+   u64 wc_nsec;
+   u64 last_tsc;
+   /* At first, we could use the tsc value as a marker, but Jeremy
+* well noted that it will cause us locking problems in 32-bit
+* sys, so we have a special version field */
+   u32 version;
+   };
+   char page_align[PAGE_SIZE];
+};
+
 
 /* This instruction is vmcall.  On non-VT architectures, it will generate a
  * trap that we will then rewrite to the appropriate instruction.
  */
 #define KVM_HYPERCALL ".byte 0x0f,0x01,0xc1"
 
+#define KVM_HCALL_REGISTER_CLOCK   1
+
 /* For KVM hypercalls, a three-byte sequence of either the vmrun or the vmmrun
  * instruction.  The hypervisor may replace it with something else but only the
  * instructions are guaranteed to be supported.
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 71d33d6..7ac8786 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -359,6 +359,7 @@ struct kvm_signal_mask {
 #define KVM_CAP_MMU_SHADOW_CACHE_CONTROL 2
 #define KVM_CAP_USER_MEMORY 3
 #define KVM_CAP_SET_TSS_ADDR 4
+#define KVM_CAP_CLK  5
 
 /*
  * ioctls for VM fds
diff --git a/include/linux/kvm_para.h b/include/linux/kvm_para.h
index e4db25f..567a192 100644
--- a/include/linux/kvm_para.h
+++ b/include/linux/kvm_para.h
@@ -11,8 +11,28 @@
 
 /* Return values for hypercalls */
 #define KVM_ENOSYS 1000
+#define KVM_ENODEV 1019
+#define KVM_EINVAL 1022
 
 #ifdef __KERNEL__
+#define KVM_HCALL_REGISTER_CLOCK   1
+
+union kvm_hv_clock {
+   struct {
+   u64 tsc_mult;
+   u64 now_ns;
+   /* That's the wall clock, not the water closet */
+   u64 wc_sec;
+   u64 wc_nsec;
+   u64 last_tsc;
+   /* At first, we could use the tsc value as a marker, but Jeremy
+* well noted that it will cause us locking problems in 32-bit
+* sys, so we have a special version field */
+   u32 version;
+   };
+   char page_align[PAGE_SIZE];
+};
+
 /*
  * hypercalls use architecture specific
  */
-- 
1.5.0.6


-
This SF.net email is sponsored by: Splunk Inc.
Still grepping through log files to find problems?  Stop.
Now Search log events and configuration files using AJAX and a browser.
Download your FREE copy of Splunk now >> http://get.splunk.com/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


[kvm-devel] kvmclock - the host part.

2007-11-06 Thread Glauber de Oliveira Costa
This is the host part of kvm clocksource implementation. As it does
not include clockevents, it is a fairly simple implementation. We
only have to register a per-vcpu area, and start writing to it periodically.

Signed-off-by: Glauber de Oliveira Costa <[EMAIL PROTECTED]>
---
 drivers/kvm/irq.c  |1 +
 drivers/kvm/kvm_main.c |2 +
 drivers/kvm/svm.c  |1 +
 drivers/kvm/vmx.c  |1 +
 drivers/kvm/x86.c  |   59 
 drivers/kvm/x86.h  |   13 ++
 6 files changed, 77 insertions(+), 0 deletions(-)

diff --git a/drivers/kvm/irq.c b/drivers/kvm/irq.c
index 22bfeee..0344879 100644
--- a/drivers/kvm/irq.c
+++ b/drivers/kvm/irq.c
@@ -92,6 +92,7 @@ void kvm_vcpu_kick_request(struct kvm_vcpu *vcpu, int request)
 
 void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
 {
+   vcpu->time_needs_update = 1;
kvm_inject_apic_timer_irqs(vcpu);
/* TODO: PIT, RTC etc. */
 }
diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c
index 0b8edca..5834573 100644
--- a/drivers/kvm/kvm_main.c
+++ b/drivers/kvm/kvm_main.c
@@ -20,6 +20,7 @@
 #include "x86_emulate.h"
 #include "irq.h"
 
+#include 
 #include 
 #include 
 #include 
@@ -1242,6 +1243,7 @@ static long kvm_dev_ioctl(struct file *filp,
case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
case KVM_CAP_USER_MEMORY:
case KVM_CAP_SET_TSS_ADDR:
+   case KVM_CAP_CLK:
r = 1;
break;
default:
diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c
index 95a3489..cb8c19d 100644
--- a/drivers/kvm/svm.c
+++ b/drivers/kvm/svm.c
@@ -617,6 +617,7 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu)
 
__free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT));
kvm_vcpu_uninit(vcpu);
+   release_clock(vcpu);
kmem_cache_free(kvm_vcpu_cache, svm);
 }
 
diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c
index da3a339..b5edeed 100644
--- a/drivers/kvm/vmx.c
+++ b/drivers/kvm/vmx.c
@@ -2501,6 +2501,7 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
kfree(vmx->host_msrs);
kfree(vmx->guest_msrs);
kvm_vcpu_uninit(vcpu);
+   release_clock(vcpu);
kmem_cache_free(kvm_vcpu_cache, vmx);
 }
 
diff --git a/drivers/kvm/x86.c b/drivers/kvm/x86.c
index e905d46..d476488 100644
--- a/drivers/kvm/x86.c
+++ b/drivers/kvm/x86.c
@@ -19,6 +19,7 @@
 #include "segment_descriptor.h"
 #include "irq.h"
 
+#include 
 #include 
 #include 
 #include 
@@ -1628,6 +1629,31 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_emulate_halt);
 
+static void kvm_write_guest_time(struct kvm_vcpu *vcpu)
+{
+   struct timespec ts;
+   void *clock_addr;
+
+
+   if (!vcpu->clock_page)
+   return;
+
+   /* Updates version to the next odd number, indicating we're writing */
+   vcpu->hv_clock.version++;
+   /* Updating the tsc count is the first thing we do */
+   kvm_get_msr(vcpu, MSR_IA32_TIME_STAMP_COUNTER, 
&vcpu->hv_clock.last_tsc);
+   ktime_get_ts(&ts);
+   vcpu->hv_clock.now_ns = ts.tv_nsec + (NSEC_PER_SEC * (u64)ts.tv_sec);
+   vcpu->hv_clock.wc_sec = get_seconds();
+   vcpu->hv_clock.version++;
+
+   clock_addr = vcpu->clock_addr;
+   memcpy(clock_addr, &vcpu->hv_clock, sizeof(vcpu->hv_clock));
+   mark_page_dirty(vcpu->kvm, vcpu->clock_gfn);
+
+   vcpu->time_needs_update = 0;
+}
+
 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
 {
unsigned long nr, a0, a1, a2, a3, ret;
@@ -1648,7 +1674,33 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
a3 &= 0x;
}
 
+   ret = 0;
switch (nr) {
+   case  KVM_HCALL_REGISTER_CLOCK: {
+   struct kvm_vcpu *dst_vcpu;
+
+   if (!((a1 < KVM_MAX_VCPUS) && (vcpu->kvm->vcpus[a1]))) {
+   ret = -KVM_EINVAL;
+   break;
+   }
+
+   dst_vcpu = vcpu->kvm->vcpus[a1];
+   dst_vcpu->clock_page = gfn_to_page(vcpu->kvm, a0 >> PAGE_SHIFT);
+
+   if (!dst_vcpu->clock_page) {
+   ret = -KVM_EINVAL;
+   break;
+   }
+   dst_vcpu->clock_gfn = a0 >> PAGE_SHIFT;
+
+   dst_vcpu->hv_clock.tsc_mult = clocksource_khz2mult(tsc_khz, 22);
+   dst_vcpu->clock_addr = kmap(dst_vcpu->clock_page);
+
+   dst_vcpu->time_needs_update = 1;
+
+   break;
+   }
+
default:
ret = -KVM_ENOSYS;
break;
@@ -1816,6 +1868,12 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu,
 vcpu->irq_summary == 0);
 }
 
+void kvm_update_guest_tim

[kvm-devel] KVM paravirt clocksource - Take 3 out of

2007-11-06 Thread Glauber de Oliveira Costa
This is a new version of kvm paravirt clock implementation

This time, the clockevents part was completely wiped out.
Not that I'm dropping it: As said in my last message, I'm starting
to take the path of having a specialized irq chip, as for avi-san's
suggestion.

However, last iteration made clocksources and clockevents a lot more
independent, and so, turning it into a clocksource-only implementation
was just a matter of deleting code - that can be later added almost as-is.
It also favours people willing to use the clocksource, but going towards
userspace HPET emulation, or things like this.

The goal is to have this part included independently of the other bits.

>From last release:

* no more clockevents (for a while, hang on!)
* per-vcpu hv_clock area, which led to...
* no more purely tsc timing.

If you have a new concern with this version, or I failed to address a previous 
concern of yours, just voice it!



-
This SF.net email is sponsored by: Splunk Inc.
Still grepping through log files to find problems?  Stop.
Now Search log events and configuration files using AJAX and a browser.
Download your FREE copy of Splunk now >> http://get.splunk.com/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [PATCH 3/16] read/write_crX, clts and wbinvd for 64-bit paravirt

2007-11-01 Thread Glauber de Oliveira Costa
-BEGIN PGP SIGNED MESSAGE-
Hash: SHA1

Keir Fraser escreveu:
> On 1/11/07 15:30, "Jeremy Fitzhardinge" <[EMAIL PROTECTED]> wrote:
> 
>> Glauber de Oliveira Costa wrote:
>>> I in fact have seen bugs with mixed reads and writes to the same cr,
>>> (cr4), but adding the volatile
>>> flag to the read function seemed to fix it.
>> Well, volatile will make a read be repeated rather than caching the
>> previous value, but it has no effect on ordering.
> 
> volatile prevents the asm from being 'moved significantly', according to the
> gcc manual. I take that to mean that reordering is not allowed.
> 
According to a gcc developer to whom I asked this question, volatile
prevents the code
from being removed, but does not prevent it from being moved (pun intended). In
practice, it should force
a re-read, but not influence the ordering decisions from the compiler.
Besides, 'significantly'
sounds like a significantly imprecise word, whose specific meaning may
be implementation dependent.

So I agree that adding a memory location reference is probably the best
alternative.

-BEGIN PGP SIGNATURE-
Version: GnuPG v1.4.7 (GNU/Linux)
Comment: Using GnuPG with Remi - http://enigmail.mozdev.org

iD8DBQFHKftDjYI8LaFUWXMRAiLTAKDqf/M8umNYw6u7r9ONozTEUVy8SwCgygma
jWNKQmxmLpyPxr00KbQy9Vg=
=JM4K
-END PGP SIGNATURE-

-
This SF.net email is sponsored by: Splunk Inc.
Still grepping through log files to find problems?  Stop.
Now Search log events and configuration files using AJAX and a browser.
Download your FREE copy of Splunk now >> http://get.splunk.com/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [PATCH 11/16] turn priviled operation into a macro in head_64.S

2007-11-01 Thread Glauber de Oliveira Costa
-BEGIN PGP SIGNED MESSAGE-
Hash: SHA1

Jeremy Fitzhardinge escreveu:
> Glauber de Oliveira Costa wrote:
>> under paravirt, read cr2 cannot be issued directly anymore.
>> So wrap it in a macro, defined to the operation itself in case
>> paravirt is off, but to something else if we have paravirt
>> in the game
>>   
> 
> Is this actually needed?  It's only used in the early fault handler in
> head_64.S.  Will we be taking that path in the paravirt case?  If so,
> should we disable the fault handler altogether, since the hypervisor can
> probably provide better diagnositcs.
> 

Well, as you told me earlier, xen won't use it. Neither does lguest.
None of us goes through the normal boot process anyway. But maybe some
other technology uses it?

Zach, how does it work for vmware?


-BEGIN PGP SIGNATURE-
Version: GnuPG v1.4.7 (GNU/Linux)
Comment: Using GnuPG with Remi - http://enigmail.mozdev.org

iD8DBQFHKdmVjYI8LaFUWXMRAgxkAKCQB+VtpGSlm+zuyRRWmi3h+k9NqQCgyfmD
cpPyQcfxo9hcSI0WaDFmpWg=
=Oa4v
-END PGP SIGNATURE-

-
This SF.net email is sponsored by: Splunk Inc.
Still grepping through log files to find problems?  Stop.
Now Search log events and configuration files using AJAX and a browser.
Download your FREE copy of Splunk now >> http://get.splunk.com/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [PATCH 3/16] read/write_crX, clts and wbinvd for 64-bit paravirt

2007-11-01 Thread Glauber de Oliveira Costa
-BEGIN PGP SIGNED MESSAGE-
Hash: SHA1

Jeremy Fitzhardinge escreveu:
> Glauber de Oliveira Costa wrote:
>> This patch introduces, and patch callers when needed, native
>> versions for read/write_crX functions, clts and wbinvd.
>>
>> Signed-off-by: Glauber de Oliveira Costa <[EMAIL PROTECTED]>
>> Signed-off-by: Steven Rostedt <[EMAIL PROTECTED]>
>> Acked-by: Jeremy Fitzhardinge <[EMAIL PROTECTED]>
>> ---
>>  arch/x86/mm/pageattr_64.c   |3 +-
>>  include/asm-x86/system_64.h |   60 
>> ++
>>  2 files changed, 45 insertions(+), 18 deletions(-)
>>
>> diff --git a/arch/x86/mm/pageattr_64.c b/arch/x86/mm/pageattr_64.c
>> index c40afba..59a52b0 100644
>> --- a/arch/x86/mm/pageattr_64.c
>> +++ b/arch/x86/mm/pageattr_64.c
>> @@ -12,6 +12,7 @@
>>  #include 
>>  #include 
>>  #include 
>> +#include 
>>  
>>  pte_t *lookup_address(unsigned long address)
>>  { 
>> @@ -77,7 +78,7 @@ static void flush_kernel_map(void *arg)
>> much cheaper than WBINVD. */
>>  /* clflush is still broken. Disable for now. */
>>  if (1 || !cpu_has_clflush)
>> -asm volatile("wbinvd" ::: "memory");
>> +wbinvd();
>>  else list_for_each_entry(pg, l, lru) {
>>  void *adr = page_address(pg);
>>  clflush_cache_range(adr, PAGE_SIZE);
>> diff --git a/include/asm-x86/system_64.h b/include/asm-x86/system_64.h
>> index 4cb2384..b558cb2 100644
>> --- a/include/asm-x86/system_64.h
>> +++ b/include/asm-x86/system_64.h
>> @@ -65,53 +65,62 @@ extern void load_gs_index(unsigned);
>>  /*
>>   * Clear and set 'TS' bit respectively
>>   */
>> -#define clts() __asm__ __volatile__ ("clts")
>> +static inline void native_clts(void)
>> +{
>> +asm volatile ("clts");
>> +}
>>  
>> -static inline unsigned long read_cr0(void)
>> -{ 
>> +static inline unsigned long native_read_cr0(void)
>> +{
>>  unsigned long cr0;
>>  asm volatile("movq %%cr0,%0" : "=r" (cr0));
>>  return cr0;
>>  }
>>   
> 
> This is a pre-existing bug, but it seems to me that these read/write crX
> asms should have a constraint to stop the compiler from reordering them
> with respect to each other.  The brute-force approach would be to add
> "memory" clobbers, but the subtle fix would be to add a variable which
> is only used to sequence:
> 
I in fact have seen bugs with mixed reads and writes to the same cr,
(cr4), but adding the volatile
flag to the read function seemed to fix it. Yet, I agree with you that
the theorectical problem exists for the reorder, and your proposed fix
seems fine (although if we're really desperate about memory usage, we
can use a char instead a int and save 3 bytes!)

It also just ocurred to me that this part of the patch can also go into
the consolidation part. So I'll respin it.

Thanks for the comment
-BEGIN PGP SIGNATURE-
Version: GnuPG v1.4.7 (GNU/Linux)
Comment: Using GnuPG with Remi - http://enigmail.mozdev.org

iD8DBQFHKdkujYI8LaFUWXMRAhV2AKDPIjwGQnoLtldys/OWtIEs6biwxwCg1Jd/
o36S+qcb4sWJ6peqhrSRnos=
=YmeC
-END PGP SIGNATURE-

-
This SF.net email is sponsored by: Splunk Inc.
Still grepping through log files to find problems?  Stop.
Now Search log events and configuration files using AJAX and a browser.
Download your FREE copy of Splunk now >> http://get.splunk.com/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


[kvm-devel] [PATCH 16/16] make vsmp a paravirt client

2007-10-31 Thread Glauber de Oliveira Costa
This patch makes vsmp a paravirt client. It now uses the whole
infrastructure provided by pvops. When we detect we're running
a vsmp box, we change the irq-related paravirt operations (and so,
it has to happen quite early), and the patching function

Signed-off-by: Glauber de Oliveira Costa <[EMAIL PROTECTED]>
Signed-off-by: Steven Rostedt <[EMAIL PROTECTED]>
Acked-by: Jeremy Fitzhardinge <[EMAIL PROTECTED]>
---
 arch/x86/Kconfig.x86_64|3 +-
 arch/x86/kernel/setup_64.c |3 ++
 arch/x86/kernel/vsmp_64.c  |   72 +++
 include/asm-x86/setup.h|3 +-
 4 files changed, 71 insertions(+), 10 deletions(-)

diff --git a/arch/x86/Kconfig.x86_64 b/arch/x86/Kconfig.x86_64
index 04734dd..544bad5 100644
--- a/arch/x86/Kconfig.x86_64
+++ b/arch/x86/Kconfig.x86_64
@@ -148,15 +148,14 @@ config X86_PC
bool "PC-compatible"
help
  Choose this option if your computer is a standard PC or compatible.
-
 config X86_VSMP
bool "Support for ScaleMP vSMP"
depends on PCI
+   select PARAVIRT
 help
  Support for ScaleMP vSMP systems.  Say 'Y' here if this kernel is
  supposed to run on these EM64T-based machines.  Only choose this 
option
  if you have one of these machines.
-
 endchoice
 
 choice
diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c
index 44a11e3..c522549 100644
--- a/arch/x86/kernel/setup_64.c
+++ b/arch/x86/kernel/setup_64.c
@@ -335,6 +335,9 @@ void __init setup_arch(char **cmdline_p)
 
init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT));
 
+#ifdef CONFIG_VSMP
+   vsmp_init();
+#endif
dmi_scan_machine();
 
 #ifdef CONFIG_SMP
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
index 414caf0..547d3b3 100644
--- a/arch/x86/kernel/vsmp_64.c
+++ b/arch/x86/kernel/vsmp_64.c
@@ -8,18 +8,70 @@
  *
  * Ravikiran Thirumalai <[EMAIL PROTECTED]>,
  * Shai Fultheim <[EMAIL PROTECTED]>
+ * Paravirt ops integration: Glauber de Oliveira Costa <[EMAIL PROTECTED]>
  */
-
 #include 
 #include 
 #include 
 #include 
 #include 
+#include 
+
+/*
+ * Interrupt control for the VSMP architecture:
+ */
+
+static inline unsigned long vsmp_save_fl(void)
+{
+   unsigned long flags = native_save_fl();
+
+   if (flags & X86_EFLAGS_IF)
+   return X86_EFLAGS_IF;
+   return 0;
+}
 
-static int __init vsmp_init(void)
+static inline void vsmp_restore_fl(unsigned long flags)
+{
+   if (flags & X86_EFLAGS_IF)
+   flags &= ~X86_EFLAGS_AC;
+   if (!(flags & X86_EFLAGS_IF))
+   flags &= X86_EFLAGS_AC;
+   native_restore_fl(flags);
+}
+
+static inline void vsmp_irq_disable(void)
+{
+   unsigned long flags = native_save_fl();
+
+   vsmp_restore_fl((flags & ~X86_EFLAGS_IF));
+}
+
+static inline void vsmp_irq_enable(void)
+{
+   unsigned long flags = native_save_fl();
+
+   vsmp_restore_fl((flags | X86_EFLAGS_IF));
+}
+
+static unsigned __init vsmp_patch(u8 type, u16 clobbers, void *ibuf,
+ unsigned long addr, unsigned len)
+{
+   switch (type) {
+   case PARAVIRT_PATCH(pv_irq_ops.irq_enable):
+   case PARAVIRT_PATCH(pv_irq_ops.irq_disable):
+   case PARAVIRT_PATCH(pv_irq_ops.save_fl):
+   case PARAVIRT_PATCH(pv_irq_ops.restore_fl):
+   return paravirt_patch_default(type, clobbers, ibuf, addr, len);
+   default:
+   return native_patch(type, clobbers, ibuf, addr, len);
+   }
+
+}
+
+int __init vsmp_init(void)
 {
void *address;
-   unsigned int cap, ctl;
+   unsigned int cap, ctl, cfg;
 
if (!early_pci_allowed())
return 0;
@@ -29,8 +81,16 @@ static int __init vsmp_init(void)
(read_pci_config_16(0, 0x1f, 0, PCI_DEVICE_ID) != 
PCI_DEVICE_ID_SCALEMP_VSMP_CTL))
return 0;
 
+   /* If we are, use the distinguished irq functions */
+   pv_irq_ops.irq_disable = vsmp_irq_disable;
+   pv_irq_ops.irq_enable  = vsmp_irq_enable;
+   pv_irq_ops.save_fl  = vsmp_save_fl;
+   pv_irq_ops.restore_fl  = vsmp_restore_fl;
+   pv_init_ops.patch = vsmp_patch;
+
/* set vSMP magic bits to indicate vSMP capable kernel */
-   address = ioremap(read_pci_config(0, 0x1f, 0, PCI_BASE_ADDRESS_0), 8);
+   cfg = read_pci_config(0, 0x1f, 0, PCI_BASE_ADDRESS_0);
+   address = early_ioremap(cfg, 8);
cap = readl(address);
ctl = readl(address + 4);
printk("vSMP CTL: capabilities:0x%08x  control:0x%08x\n", cap, ctl);
@@ -42,8 +102,6 @@ static int __init vsmp_init(void)
printk("vSMP CTL: control set to:0x%08x\n", ctl);
}
 
-   iounmap(address);
+   early_iounmap(address, 8);
return 0;
 }
-
-core_initcall(vsmp_init);
diff --git a/include/asm-x86/setup.h b/include/asm-x86

[kvm-devel] [PATCH 14/16] prepare x86_64 architecture initialization for paravirt

2007-10-31 Thread Glauber de Oliveira Costa
This patch prepares the x86_64 architecture initialization for
paravirt. It requires a memory initialization step, which is done
by implementing 64-bit version for machine_specific_memory_setup,
and putting an ARCH_SETUP hook, for guest-dependent initialization.
This last step is done akin to i386

Signed-off-by: Glauber de Oliveira Costa <[EMAIL PROTECTED]>
Signed-off-by: Steven Rostedt <[EMAIL PROTECTED]>
Acked-by: Jeremy Fitzhardinge <[EMAIL PROTECTED]>
---
 arch/x86/kernel/e820_64.c  |9 +++--
 arch/x86/kernel/setup_64.c |   28 +++-
 include/asm-x86/setup.h|   11 ---
 3 files changed, 42 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c
index 04698e0..c67526c 100644
--- a/arch/x86/kernel/e820_64.c
+++ b/arch/x86/kernel/e820_64.c
@@ -593,8 +593,10 @@ void early_panic(char *msg)
panic(msg);
 }
 
-void __init setup_memory_region(void)
+/* We're not void only for x86 32-bit compat */
+char * __init machine_specific_memory_setup(void)
 {
+   char *who = "BIOS-e820";
/*
 * Try to copy the BIOS-supplied E820-map.
 *
@@ -605,7 +607,10 @@ void __init setup_memory_region(void)
if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) < 0)
early_panic("Cannot find a valid memory map");
printk(KERN_INFO "BIOS-provided physical RAM map:\n");
-   e820_print_map("BIOS-e820");
+   e820_print_map(who);
+
+   /* In case someone cares... */
+   return who;
 }
 
 static int __init parse_memopt(char *p)
diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c
index 238633d..44a11e3 100644
--- a/arch/x86/kernel/setup_64.c
+++ b/arch/x86/kernel/setup_64.c
@@ -39,6 +39,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -60,6 +61,12 @@
 #include 
 #include 
 
+#ifdef CONFIG_PARAVIRT
+#include 
+#else
+#define ARCH_SETUP
+#endif
+
 /*
  * Machine setup..
  */
@@ -241,6 +248,16 @@ static void discover_ebda(void)
 * 4K EBDA area at 0x40E
 */
ebda_addr = *(unsigned short *)__va(EBDA_ADDR_POINTER);
+   /*
+* There can be some situations, like paravirtualized guests,
+* in which there is no available ebda information. In such
+* case, just skip it
+*/
+   if (!ebda_addr) {
+   ebda_size = 0;
+   return;
+   }
+
ebda_addr <<= 4;
 
ebda_size = *(unsigned short *)__va(ebda_addr);
@@ -254,6 +271,12 @@ static void discover_ebda(void)
ebda_size = 64*1024;
 }
 
+/* Overridden in paravirt.c if CONFIG_PARAVIRT */
+void __attribute__((weak)) memory_setup(void)
+{
+   machine_specific_memory_setup();
+}
+
 void __init setup_arch(char **cmdline_p)
 {
printk(KERN_INFO "Command line: %s\n", boot_command_line);
@@ -269,7 +292,10 @@ void __init setup_arch(char **cmdline_p)
rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0);
rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
 #endif
-   setup_memory_region();
+
+   ARCH_SETUP
+
+   memory_setup();
copy_edd();
 
if (!boot_params.hdr.root_flags)
diff --git a/include/asm-x86/setup.h b/include/asm-x86/setup.h
index 24d786e..071e054 100644
--- a/include/asm-x86/setup.h
+++ b/include/asm-x86/setup.h
@@ -3,6 +3,13 @@
 
 #define COMMAND_LINE_SIZE 2048
 
+#ifndef __ASSEMBLY__
+char *machine_specific_memory_setup(void);
+#ifndef CONFIG_PARAVIRT
+#define paravirt_post_allocator_init() do {} while (0)
+#endif
+#endif /* __ASSEMBLY__ */
+
 #ifdef __KERNEL__
 
 #ifdef __i386__
@@ -51,9 +58,7 @@ void __init add_memory_region(unsigned long long start,
 
 extern unsigned long init_pg_tables_end;
 
-#ifndef CONFIG_PARAVIRT
-#define paravirt_post_allocator_init() do {} while (0)
-#endif
+
 
 #endif /* __i386__ */
 #endif /* _SETUP */
-- 
1.4.4.2


-
This SF.net email is sponsored by: Splunk Inc.
Still grepping through log files to find problems?  Stop.
Now Search log events and configuration files using AJAX and a browser.
Download your FREE copy of Splunk now >> http://get.splunk.com/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


[kvm-devel] [PATCH 12/16] tweak io_64.h for paravirt.

2007-10-31 Thread Glauber de Oliveira Costa
We need something here because we can't call in and out instructions
directly. However, we have to be careful, because no indirections are
allowed in misc_64.c , and paravirt_ops is a kind of one. So just
call it directly there

Signed-off-by: Glauber de Oliveira Costa <[EMAIL PROTECTED]>
Signed-off-by: Steven Rostedt <[EMAIL PROTECTED]>
Acked-by: Jeremy Fitzhardinge <[EMAIL PROTECTED]>
---
 arch/x86/boot/compressed/misc_64.c |6 +
 include/asm-x86/io_64.h|   37 +--
 2 files changed, 36 insertions(+), 7 deletions(-)

diff --git a/arch/x86/boot/compressed/misc_64.c 
b/arch/x86/boot/compressed/misc_64.c
index 6ea015a..6640a17 100644
--- a/arch/x86/boot/compressed/misc_64.c
+++ b/arch/x86/boot/compressed/misc_64.c
@@ -9,6 +9,12 @@
  * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
  */
 
+/*
+ * we have to be careful, because no indirections are allowed here, and
+ * paravirt_ops is a kind of one. As it will only run in baremetal anyway,
+ * we just keep it from happening
+ */
+#undef CONFIG_PARAVIRT
 #define _LINUX_STRING_H_ 1
 #define __LINUX_BITMAP_H 1
 
diff --git a/include/asm-x86/io_64.h b/include/asm-x86/io_64.h
index a037b07..57fcdd9 100644
--- a/include/asm-x86/io_64.h
+++ b/include/asm-x86/io_64.h
@@ -35,12 +35,24 @@
   *  - Arnaldo Carvalho de Melo <[EMAIL PROTECTED]>
   */
 
-#define __SLOW_DOWN_IO "\noutb %%al,$0x80"
+static inline void native_io_delay(void)
+{
+   asm volatile("outb %%al,$0x80" : : : "memory");
+}
 
-#ifdef REALLY_SLOW_IO
-#define __FULL_SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO 
__SLOW_DOWN_IO
+#if defined(CONFIG_PARAVIRT)
+#include 
 #else
-#define __FULL_SLOW_DOWN_IO __SLOW_DOWN_IO
+
+static inline void slow_down_io(void)
+{
+   native_io_delay();
+#ifdef REALLY_SLOW_IO
+   native_io_delay();
+   native_io_delay();
+   native_io_delay();
+#endif
+}
 #endif
 
 /*
@@ -52,9 +64,15 @@ static inline void out##s(unsigned x value, unsigned short 
port) {
 #define __OUT2(s,s1,s2) \
 __asm__ __volatile__ ("out" #s " %" s1 "0,%" s2 "1"
 
+#ifndef REALLY_SLOW_IO
+#define REALLY_SLOW_IO
+#define UNSET_REALLY_SLOW_IO
+#endif
+
 #define __OUT(s,s1,x) \
 __OUT1(s,x) __OUT2(s,s1,"w") : : "a" (value), "Nd" (port)); } \
-__OUT1(s##_p,x) __OUT2(s,s1,"w") __FULL_SLOW_DOWN_IO : : "a" (value), "Nd" 
(port));} \
+__OUT1(s##_p, x) __OUT2(s, s1, "w") : : "a" (value), "Nd" (port)); \
+   slow_down_io(); }
 
 #define __IN1(s) \
 static inline RETURN_TYPE in##s(unsigned short port) { RETURN_TYPE _v;
@@ -63,8 +81,13 @@ static inline RETURN_TYPE in##s(unsigned short port) { 
RETURN_TYPE _v;
 __asm__ __volatile__ ("in" #s " %" s2 "1,%" s1 "0"
 
 #define __IN(s,s1,i...) \
-__IN1(s) __IN2(s,s1,"w") : "=a" (_v) : "Nd" (port) ,##i ); return _v; } \
-__IN1(s##_p) __IN2(s,s1,"w") __FULL_SLOW_DOWN_IO : "=a" (_v) : "Nd" (port) 
,##i ); return _v; } \
+__IN1(s) __IN2(s, s1, "w") : "=a" (_v) : "Nd" (port), ##i); return _v; } \
+__IN1(s##_p) __IN2(s, s1, "w") : "=a" (_v) : "Nd" (port), ##i);  \
+   slow_down_io(); return _v; }
+
+#ifdef UNSET_REALLY_SLOW_IO
+#undef REALLY_SLOW_IO
+#endif
 
 #define __INS(s) \
 static inline void ins##s(unsigned short port, void * addr, unsigned long 
count) \
-- 
1.4.4.2


-
This SF.net email is sponsored by: Splunk Inc.
Still grepping through log files to find problems?  Stop.
Now Search log events and configuration files using AJAX and a browser.
Download your FREE copy of Splunk now >> http://get.splunk.com/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


  1   2   >