[kvm-devel] [PATCH 1/2] Revert kvm: qemu: GET_CONFIGURATION fixes to allow OpenSolaris CD-ROM access

2008-01-14 Thread Carlo Marcelo Arenas Belon
This reverts commit b64c20519d5826875679b6df85afebed27e1a9a8.

Conflicts:

qemu/hw/ide.c (keep uint64_t total_sectors)

Signed-off-by: Carlo Marcelo Arenas Belon [EMAIL PROTECTED]
---
 qemu/hw/ide.c |   27 ---
 1 files changed, 8 insertions(+), 19 deletions(-)

diff --git a/qemu/hw/ide.c b/qemu/hw/ide.c
index 90f2f2b..9a22db9 100644
--- a/qemu/hw/ide.c
+++ b/qemu/hw/ide.c
@@ -1640,7 +1640,6 @@ static void ide_atapi_cmd(IDEState *s)
 break;
 case GPCMD_GET_CONFIGURATION:
 {
-uint32_t len;
 uint64_t total_sectors;
 
 /* only feature 0 is supported */
@@ -1649,27 +1648,17 @@ static void ide_atapi_cmd(IDEState *s)
 ASC_INV_FIELD_IN_CMD_PACKET);
 break;
 }
-max_len = ube16_to_cpu(packet + 7);
-bdrv_get_geometry(s->bs, &total_sectors);
 memset(buf, 0, 32);
-if (total_sectors) {
-if (total_sectors > 1433600) {
-buf[7] = 0x10; /* DVD-ROM */
-} else {
-buf[7] = 0x08; /* CD-ROM */
-}
-} else {
-buf[7] = 0x00; /* no current profile */
-}
-buf[10] = 0x02 | 0x01; /* persistent and current */
-buf[11] = 0x08; /* size of profile list = 4 bytes per profile */
+bdrv_get_geometry(s->bs, &total_sectors);
+buf[3] = 16;
+buf[7] = total_sectors <= 1433600 ? 0x08 : 0x10; /* current profile */
+buf[10] = 0x10 | 0x1;
+buf[11] = 0x08; /* size of profile list */
 buf[13] = 0x10; /* DVD-ROM profile */
-buf[14] = buf[13] == buf[7]; /* (in)active */
+buf[14] = buf[7] == 0x10; /* (in)active */
 buf[17] = 0x08; /* CD-ROM profile */
-buf[18] = buf[17] == buf[7]; /* (in)active */
-len = 8 + 4 + buf[11]; /* headers + size of profile list */
-cpu_to_ube32(buf, len - 4); /* data length */
-ide_atapi_cmd_reply(s, len, max_len);
+buf[18] = buf[7] == 0x08; /* (in)active */
+ide_atapi_cmd_reply(s, 32, 32);
 break;
 }
 default:
-- 
1.5.3.7
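
For reference, the magic number 1433600 in the restored hunk is just the nominal
700 MiB CD capacity expressed in 512-byte sectors; anything larger is treated as a
DVD-ROM. A quick sketch of the arithmetic (standalone illustration, the names are
not from the patch):

/* Where the 1433600 CD/DVD threshold comes from (illustrative only):
 * a nominal 700 MiB CD, counted in qemu's 512-byte sectors. */
#include <stdio.h>

int main(void)
{
    unsigned long cd_bytes   = 700UL * 1024 * 1024; /* 700 MiB */
    unsigned long cd_sectors = cd_bytes / 512;      /* = 1433600 */

    printf("CD threshold: %lu sectors\n", cd_sectors);
    return 0;
}

Patch 2/2 below replaces this constant with CD_MAX_SECTORS, derived from the
80-minute capacity described in its Orange Book comment.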




[kvm-devel] [PATCH 2/2] qemu: Multi-profile DVD-ROM support

2008-01-14 Thread Carlo Marcelo Arenas Belon
This is version 2.2 of the original patch proposed to fix the MMC-6
GET CONFIGURATION command for multi-profile CD/DVD support, which was
originally committed as b64c20519d5826875679b6df85afebed27e1a9a8

  cvs -q diff -up -r1.79 -r1.80 hw/ide.c

Signed-off-by: Carlo Marcelo Arenas Belon [EMAIL PROTECTED]
---
 qemu/hw/ide.c |  123 +++--
 1 files changed, 110 insertions(+), 13 deletions(-)

diff --git a/qemu/hw/ide.c b/qemu/hw/ide.c
index 9a22db9..54496c2 100644
--- a/qemu/hw/ide.c
+++ b/qemu/hw/ide.c
@@ -1,5 +1,5 @@
 /*
- * QEMU IDE disk and CD-ROM Emulator
+ * QEMU IDE disk and CD/DVD-ROM Emulator
  *
  * Copyright (c) 2003 Fabrice Bellard
  * Copyright (c) 2006 Openedhand Ltd.
@@ -284,6 +284,58 @@
  * of MODE_SENSE_POWER_PAGE */
 #define GPMODE_CDROM_PAGE  0x0d
 
+/*
+ * Based on values from linux/cdrom.h but extending CD_MINS
+ * to the maximum common size allowed by the Orange Book ATIP
+ *
+ * 90 and 99 min CDs are also available but using them as the
+ * upper limit reduces the effectiveness of the heuristic to
+ * detect DVDs burned to less than 25% of their maximum capacity
+ */
+
+/* Some generally useful CD-ROM information */
+#define CD_MINS   80 /* max. minutes per CD */
+#define CD_SECS   60 /* seconds per minute */
+#define CD_FRAMES 75 /* frames per second */
+#define CD_FRAMESIZE2048 /* bytes per frame, cooked mode */
+#define CD_MAX_BYTES   (CD_MINS * CD_SECS * CD_FRAMES * CD_FRAMESIZE)
+#define CD_MAX_SECTORS (CD_MAX_BYTES / 512)
+
+/*
+ * The MMC values are not IDE specific and might need to be moved
+ * to a common header if they are also needed for the SCSI emulation
+ */
+
+/* Profile list from MMC-6 revision 1 table 91 */
+#define MMC_PROFILE_NONE        0x0000
+#define MMC_PROFILE_CD_ROM  0x0008
+#define MMC_PROFILE_CD_R0x0009
+#define MMC_PROFILE_CD_RW   0x000A
+#define MMC_PROFILE_DVD_ROM 0x0010
+#define MMC_PROFILE_DVD_R_SR    0x0011
+#define MMC_PROFILE_DVD_RAM 0x0012
+#define MMC_PROFILE_DVD_RW_RO   0x0013
+#define MMC_PROFILE_DVD_RW_SR   0x0014
+#define MMC_PROFILE_DVD_R_DL_SR 0x0015
+#define MMC_PROFILE_DVD_R_DL_JR 0x0016
+#define MMC_PROFILE_DVD_RW_DL   0x0017
+#define MMC_PROFILE_DVD_DDR 0x0018
+#define MMC_PROFILE_DVD_PLUS_RW 0x001A
+#define MMC_PROFILE_DVD_PLUS_R  0x001B
+#define MMC_PROFILE_DVD_PLUS_RW_DL  0x002A
+#define MMC_PROFILE_DVD_PLUS_R_DL   0x002B
+#define MMC_PROFILE_BD_ROM  0x0040
+#define MMC_PROFILE_BD_R_SRM    0x0041
+#define MMC_PROFILE_BD_R_RRM0x0042
+#define MMC_PROFILE_BD_RE   0x0043
+#define MMC_PROFILE_HDDVD_ROM   0x0050
+#define MMC_PROFILE_HDDVD_R 0x0051
+#define MMC_PROFILE_HDDVD_RAM   0x0052
+#define MMC_PROFILE_HDDVD_RW    0x0053
+#define MMC_PROFILE_HDDVD_R_DL  0x0058
+#define MMC_PROFILE_HDDVD_RW_DL 0x005A
+#define MMC_PROFILE_INVALID     0xFFFF
+
 #define ATAPI_INT_REASON_CD 0x01 /* 0 = data transfer */
 #define ATAPI_INT_REASON_IO 0x02 /* 1 = transfer to the host */
 #define ATAPI_INT_REASON_REL0x04
@@ -1290,6 +1342,22 @@ static void ide_atapi_cmd_read(IDEState *s, int lba, int nb_sectors,
 }
 }
 
+static inline uint8_t ide_atapi_set_profile(uint8_t *buf, uint8_t *index,
+uint16_t profile)
+{
+uint8_t *buf_profile = buf + 12; /* start of profiles */
+
+buf_profile += ((*index) * 4); /* start of indexed profile */
+cpu_to_ube16 (buf_profile, profile);
+buf_profile[2] = ((buf_profile[0] == buf[6]) && (buf_profile[1] == buf[7]));
+
+/* each profile adds 4 bytes to the response */
+(*index)++;
+buf[11] += 4; /* Additional Length */
+
+return 4;
+}
+
 static void ide_atapi_cmd(IDEState *s)
 {
 const uint8_t *packet;
@@ -1640,7 +1708,7 @@ static void ide_atapi_cmd(IDEState *s)
 break;
 case GPCMD_GET_CONFIGURATION:
 {
-uint64_t total_sectors;
+uint32_t len;
 
 /* only feature 0 is supported */
 if (packet[2] != 0 || packet[3] != 0) {
@@ -1648,17 +1716,46 @@ static void ide_atapi_cmd(IDEState *s)
 ASC_INV_FIELD_IN_CMD_PACKET);
 break;
 }
-memset(buf, 0, 32);
-bdrv_get_geometry(s->bs, &total_sectors);
-buf[3] = 16;
-buf[7] = total_sectors <= 1433600 ? 0x08 : 0x10; /* current profile */
-buf[10] = 0x10 | 0x1;
-buf[11] = 0x08; /* size of profile list */
-buf[13] = 0x10; /* DVD-ROM profile */
-buf[14] = buf[7] == 0x10; /* (in)active */
-buf[17] = 0x08; /* CD-ROM profile */
-buf[18] = 
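
The rest of this hunk is cut off in the archive. As a rough illustration of how the
new ide_atapi_set_profile() helper and the CD_MAX_SECTORS heuristic are meant to fit
together (a hedged sketch of the approach, not the missing hunk; buffer offsets
follow the MMC GET CONFIGURATION layout already used in patch 1/2: bytes 0-3 data
length, bytes 6-7 current profile, feature 0 descriptor starting at byte 8):

/* Sketch only: building a GET CONFIGURATION reply with the helper above.
 * Uses the patch's own helpers (cpu_to_ube16/32, ide_atapi_set_profile)
 * and the MMC_PROFILE_ and CD_MAX_SECTORS definitions it introduces. */
static void sketch_get_configuration(uint8_t *buf, uint64_t total_sectors)
{
    uint8_t index = 0;
    uint16_t current;
    uint32_t len;

    memset(buf, 0, 32);

    /* current profile from the medium-size heuristic */
    if (total_sectors > CD_MAX_SECTORS)
        current = MMC_PROFILE_DVD_ROM;
    else if (total_sectors)
        current = MMC_PROFILE_CD_ROM;
    else
        current = MMC_PROFILE_NONE;

    cpu_to_ube16(buf + 6, current);     /* current profile field */
    buf[10] = 0x02 | 0x01;              /* feature 0: persistent and current */

    /* one 4-byte entry per advertised profile; buf[11] grows inside */
    ide_atapi_set_profile(buf, &index, MMC_PROFILE_DVD_ROM);
    ide_atapi_set_profile(buf, &index, MMC_PROFILE_CD_ROM);

    len = 8 + 4 + buf[11];              /* header + feature 0 + profiles */
    cpu_to_ube32(buf, len - 4);         /* data length field excludes itself */
}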

Re: [kvm-devel] [PATCH] add acpi powerbutton support

2008-01-14 Thread Guido Guenther
On Mon, Jan 14, 2008 at 04:41:23AM -0800, Alexey Eremenko wrote:
 As far as I know, Qemu/KVM already has an ACPI power button; it can be used via
 the Qemu Monitor: system_shutdown command.
Before 46b1a7377b55a3b6317b18fff64f1a80de7d3120, system_powerdown was a
do { } while(0); on i386 and the fixed-feature power button wasn't enabled in the FADT.
 -- Guido



Re: [kvm-devel] KVM swapping with mmu notifiers

2008-01-14 Thread Marcelo Tosatti
Hi Andrea,

 path for kvm so I guess not worth optimizing in the short/mid term,
 but by optimizing the invalidate_page case we may halve the number of
 tlb flushes for some common case. I leave it for later, the swapping
 is heavily I/O bound anyway so some more IPIs on an SMP host shouldn't
 be very measurable (on UP host it makes no difference to flush
 multiple times in practice).
 
 Signed-off-by: Andrea Arcangeli [EMAIL PROTECTED]
 
 diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
 index 4086080..c527d7d 100644
 --- a/arch/x86/kvm/Kconfig
 +++ b/arch/x86/kvm/Kconfig
 @@ -18,6 +18,7 @@ config KVM
   tristate "Kernel-based Virtual Machine (KVM) support"
   depends on ARCH_SUPPORTS_KVM && EXPERIMENTAL
   select PREEMPT_NOTIFIERS
 + select MMU_NOTIFIER
   select ANON_INODES
   ---help---
 Support hosting fully virtualized guest machines using hardware
 diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
 index 324ff9a..103c270 100644
 --- a/arch/x86/kvm/mmu.c
 +++ b/arch/x86/kvm/mmu.c
 @@ -532,6 +532,36 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
   kvm_flush_remote_tlbs(kvm);
  }
  
 +static void unmap_spte(struct kvm *kvm, u64 *spte)
 +{
 + struct page *page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
 + get_page(page);
 + rmap_remove(kvm, spte);
 + set_shadow_pte(spte, shadow_trap_nonpresent_pte);
 + kvm_flush_remote_tlbs(kvm);
 + __free_page(page);
 +}
 +
 +void kvm_rmap_unmap_gfn(struct kvm *kvm, gfn_t gfn)
 +{
 + unsigned long *rmapp;
 + u64 *spte, *curr_spte;
 +
 + spin_lock(&kvm->mmu_lock);
 + gfn = unalias_gfn(kvm, gfn);
 + rmapp = gfn_to_rmap(kvm, gfn);

The alias and memslot maps are protected only by mmap_sem, so you
should make kvm_set_memory_region/set_memory_alias grab the mmu spinlock
in addition to mmap_sem in write mode.

kvm_mmu_zap_all() grabs the mmu lock.. that should probably move up into
the caller.

 +
 + spte = rmap_next(kvm, rmapp, NULL);
 + while (spte) {
 + BUG_ON(!(*spte & PT_PRESENT_MASK));
 + rmap_printk("rmap_swap_page: spte %p %llx\n", spte, *spte);
 + curr_spte = spte;
 + spte = rmap_next(kvm, rmapp, spte);
 + unmap_spte(kvm, curr_spte);
 + }
 + spin_unlock(&kvm->mmu_lock);
 +}
 +
  #ifdef MMU_DEBUG
  static int is_empty_shadow_page(u64 *spt)
  {
 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
 index 8a90403..e9a3f6e 100644
 --- a/arch/x86/kvm/x86.c
 +++ b/arch/x86/kvm/x86.c
 @@ -3159,6 +3159,36 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
  free_page((unsigned long)vcpu->arch.pio_data);
  }
  
 +static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
 +{
 + return container_of(mn, struct kvm, mmu_notifier);
 +}
 +
 +void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
 +   struct mm_struct *mm,
 +   unsigned long address)
 +{
 + struct kvm *kvm = mmu_notifier_to_kvm(mn);
 + gfn_t gfn = hva_to_gfn(kvm, address);
 + BUG_ON(mm != kvm->mm);
 + if (gfn == -1UL)
 + return;
 + kvm_rmap_unmap_gfn(kvm, gfn);

And then you also need to cover hva_to_gfn() to happen under the lock.

 +}
 +
 +void kvm_mmu_notifier_invalidate_range(struct mmu_notifier *mn,
 +struct mm_struct *mm,
 +unsigned long start, unsigned long end)
 +{
 + for (; start < end; start += PAGE_SIZE)
 + kvm_mmu_notifier_invalidate_page(mn, mm, start);
 +}
 +
 +static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
 + .invalidate_range   = kvm_mmu_notifier_invalidate_range,
 + .invalidate_page= kvm_mmu_notifier_invalidate_page,
 +};
 +
  struct  kvm *kvm_arch_create_vm(void)
  {
   struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
 @@ -3167,6 +3197,7 @@ struct  kvm *kvm_arch_create_vm(void)
   return ERR_PTR(-ENOMEM);
  
  INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
 + kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
  
   return kvm;
  }
 diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
 index d6db0de..feacd77 100644
 --- a/include/asm-x86/kvm_host.h
 +++ b/include/asm-x86/kvm_host.h
 @@ -404,6 +404,7 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu);
  int kvm_mmu_setup(struct kvm_vcpu *vcpu);
  void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
  
 +void kvm_rmap_unmap_gfn(struct kvm *kvm, gfn_t gfn);
  int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
  void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
  void kvm_mmu_zap_all(struct kvm *kvm);
 diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
 index 2714068..85da7fa 100644
 --- a/include/linux/kvm_host.h
 +++ b/include/linux/kvm_host.h
 @@ -117,6 +117,7 @@ struct kvm {
   struct kvm_io_bus pio_bus;
   struct kvm_vm_stat stat;

Re: [kvm-devel] KVM swapping with mmu notifiers

2008-01-14 Thread Andrea Arcangeli
On Mon, Jan 14, 2008 at 11:45:39AM -0200, Marcelo Tosatti wrote:
 The alias and memslot maps are protected only by mmap_sem, so you

yes, they are already protected and furthermore in write mode.

 should make kvm_set_memory_region/set_memory_alias grab the mmu spinlock
 in addition to mmap_sem in write mode.

The mmu notifiers already hold the mmap_sem in read mode so I don't
see why I should add the mmu_lock around memslots.

The mmu_lock AFAICS is only needed to serialize with other vcpu faults
when updating the sptes and I already take it there.

 And then you also need to cover hva_to_gfn() to happen under the lock.

hva_to_gfn only requires the mmap_sem in read mode and that's already
taken implicitly before the mmu notifiers are called.



Re: [kvm-devel] KVM swapping with mmu notifiers

2008-01-14 Thread Avi Kivity
Marcelo Tosatti wrote:
  
 +static void unmap_spte(struct kvm *kvm, u64 *spte)
 +{
  +struct page *page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
 +get_page(page);
 +rmap_remove(kvm, spte);
 +set_shadow_pte(spte, shadow_trap_nonpresent_pte);
 +kvm_flush_remote_tlbs(kvm);
 +__free_page(page);
 +}
 +
 +void kvm_rmap_unmap_gfn(struct kvm *kvm, gfn_t gfn)
 +{
 +unsigned long *rmapp;
 +u64 *spte, *curr_spte;
 +
  +spin_lock(&kvm->mmu_lock);
 +gfn = unalias_gfn(kvm, gfn);
 +rmapp = gfn_to_rmap(kvm, gfn);
 

 The alias and memslot maps are protected only by mmap_sem, so you
 should make kvm_set_memory_region/set_memory_alias grab the mmu spinlock
 in addition to mmap_sem in write mode.

 kvm_mmu_zap_all() grabs the mmu lock.. that should probably move up into
 the caller.

   

Aren't mmu notifiers called with mmap_sem held for read?

Maybe not from the swap path?

-- 
error compiling committee.c: too many arguments to function




Re: [kvm-devel] KVM swapping with mmu notifiers

2008-01-14 Thread Andrea Arcangeli
On Mon, Jan 14, 2008 at 04:09:03PM +0200, Avi Kivity wrote:
 Marcelo Tosatti wrote:
  +static void unmap_spte(struct kvm *kvm, u64 *spte)
 +{
 +   struct page *page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
 +   get_page(page);
 +   rmap_remove(kvm, spte);
 +   set_shadow_pte(spte, shadow_trap_nonpresent_pte);
 +   kvm_flush_remote_tlbs(kvm);
 +   __free_page(page);
 +}
 +
 +void kvm_rmap_unmap_gfn(struct kvm *kvm, gfn_t gfn)
 +{
 +   unsigned long *rmapp;
 +   u64 *spte, *curr_spte;
 +
 +   spin_lock(&kvm->mmu_lock);
 +   gfn = unalias_gfn(kvm, gfn);
 +   rmapp = gfn_to_rmap(kvm, gfn);
 

 The alias and memslot maps are protected only by mmap_sem, so you
 should make kvm_set_memory_region/set_memory_alias grab the mmu spinlock
 in addition to mmap_sem in write mode.

 kvm_mmu_zap_all() grabs the mmu lock.. that should probably move up into
 the caller.

   

 Aren't mmu notifiers called with mmap_sem held for read?

 Maybe not from the swap path?

Good point, the swap path isn't covered by the mmap_sem, so Marcelo's
right I need to fixup the locking a bit.



Re: [kvm-devel] KVM swapping with mmu notifiers

2008-01-14 Thread Avi Kivity
Andrea Arcangeli wrote:

 Aren't mmu notifiers called with mmap_sem held for read?

 Maybe not from the swap path?
 

 Good point, the swap path isn't covered by the mmap_sem, so Marcelo's
 right I need to fixup the locking a bit.
   

One idea I had at one time was to add ->lock() and ->unlock() callbacks to 
mmu notifiers that would be called in a sleepy context.  But that seems 
heavy handed.  Maybe it can be fixed in some clever way with rcu or with 
a rwlock around the memory slot map.
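
A minimal sketch of the rwlock idea, purely hypothetical (none of these names exist
in kvm; they only illustrate the locking shape: readers on the notifier/lookup path,
a writer when the slot map changes):

/* Hypothetical sketch of "a rwlock around the memory slot map".
 * rwlock_t spins, so the read side is usable from the non-sleeping
 * mmu-notifier callbacks; the write side serializes slot updates. */
static DEFINE_RWLOCK(kvm_memslot_lock);

static void notifier_side_lookup(struct kvm *kvm, unsigned long hva)
{
	read_lock(&kvm_memslot_lock);
	/* ... hva -> memslot -> rmap lookup, zap the matching sptes ... */
	read_unlock(&kvm_memslot_lock);
}

static void slot_update_side(struct kvm *kvm)
{
	write_lock(&kvm_memslot_lock);
	/* ... install or remove a memory slot / alias ... */
	write_unlock(&kvm_memslot_lock);
}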

-- 
error compiling committee.c: too many arguments to function




Re: [kvm-devel] [RFC] fix VMX TSC synchronicity

2008-01-14 Thread Marcelo Tosatti
Hi Avi,

On Sun, Jan 13, 2008 at 02:19:29PM +0200, Avi Kivity wrote:
 Marcelo Tosatti wrote:
 The boot TSC sync check is failing on recent Linux SMP guests on TSC
 stable hosts.
 
   
 
 What about tsc unstable hosts?  If your patch convinces the guest its
 tsc is stable, while the host tsc is not, then it may cause confusion
 later on.

The adjustment to zero won't fool the guest because it assumes that the
TSC's are synchronized. It will simply set the guest TSC to zero on all
VCPUs based on the time VCPU0 was initialized.

That is, setting -(vcpu[0].first_tsc) on all VCPU's will not correct any
synchronization problem.

 Following patch attempts to address the problems, which are:
 
 1) APIC_DM_STARTUP, which is only called for vcpu's other than vcpu0,
 will trigger ->vcpu_reset(), setting the TSC to 0. Fix that by moving
 the guest_write_tsc(0) to vcpu_load.
 
 2) vcpu's are initialized at different times by QEMU (vcpu0 init happens
 way earlier than the rest). Fix that by initializing the TSC of vcpu's 
 > 0 with reference to vcpu0 init tsc value. This way TSC synchronization
 is kept (apparently Xen does something similar).
 
 3) The code which adjusts the TSC of a VCPU on physical CPU switch is
 meant to guarantee that the guest sees a monotonically increasing value.
 However there is a large gap, in terms of clocks, between the time which
 the source CPU TSC is read and the time the destination CPU TSC is read.
 So move those two reads to happen at vcpu_clear.
 
 I believe that 3) is the reason for the -no-kvm-irqchip problems
 reported by Avi on RHEL5, with kernels  2.6.21 (where vcpu-cpu pinning
 would fix the problem). Unfortunately I could reproduce that problem.
 
 4-way guest with constant tick at 250Hz now reliably sees the TSC's
 synchronized, and idle guest CPU consumption is reduced by 50% (3-4%
 instead of 8%, the latter with acpi_pm_good boot parameter).
 
 
 diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
 index 4741806..e1c8cf4 100644
 --- a/arch/x86/kvm/vmx.c
 +++ b/arch/x86/kvm/vmx.c
 @@ -47,6 +47,8 @@ struct vcpu_vmx {
  struct kvm_vcpu   vcpu;
  int   launched;
  u8fail;
 +u64   first_tsc;
 +u64   tsc_this;
  u32   idt_vectoring_info;
  struct kvm_msr_entry *guest_msrs;
  struct kvm_msr_entry *host_msrs;
 @@ -254,6 +256,7 @@ static void vcpu_clear(struct vcpu_vmx *vmx)
  if (vmx->vcpu.cpu == -1)
  return;
  smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 0, 1);
 +rdtscll(vmx->tsc_this);
  vmx->launched = 0;
  }
  
 @@ -480,6 +483,8 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx)
  reload_host_efer(vmx);
  }
  
 +static void guest_write_tsc(u64 host_tsc, u64 guest_tsc);
 +
  /*
   * Switches to specified vcpu, until a matching vcpu_put(), but assumes
   * vcpu mutex is already taken.
 @@ -488,7 +493,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int 
 cpu)
  {
  struct vcpu_vmx *vmx = to_vmx(vcpu);
  u64 phys_addr = __pa(vmx->vmcs);
 -u64 tsc_this, delta;
 +u64 delta;
  
  if (vcpu->cpu != cpu) {
  vcpu_clear(vmx);
 @@ -511,6 +516,19 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int 
 cpu)
  struct descriptor_table dt;
  unsigned long sysenter_esp;
  
 +if (unlikely(vcpu->cpu == -1)) {
   
 
 This can happen after migration, I believe.

Right now the destination host will do ->vcpu_reset() on all VCPU's on
startup... so its already broken. This patch is not introducing any
issues, just changing where it happens :)

Perhaps fixing migration should be the subject of a separate patch?

 +rdtscll(vcpu->arch.host_tsc);
 +vmx->tsc_this = vcpu->arch.host_tsc;
 +if (vcpu->vcpu_id == 0) {
 +guest_write_tsc(vcpu->arch.host_tsc, 0);
 +vmx->first_tsc = vcpu->arch.host_tsc;
 +} else {
 +struct vcpu_vmx *cpu0;
 +cpu0 = to_vmx(vcpu->kvm->vcpus[0]);
 +guest_write_tsc(cpu0->first_tsc, 0);
 +}
 +}
 +
   
 
 Depending on vcpu_id == 0 can cause problems (for example, if vcpu 0 is 
 somehow not the first to run).

But QEMU will always run kvm_create() first (which does
kvm_create_vcpu(0)), then start the other threads later. So I don't see
how that can happen.

 We might initialize the tsc base on vm creation, and if the host tsc is 
 synced, then the guest tsc should also be.
 
  vcpu->cpu = cpu;
  /*
   * Linux uses per-cpu TSS and GDT, so set these when 
   switching
 @@ -526,8 +544,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int 
 cpu)
  /*
   * Make sure the time stamp counter is monotonous.
   */
 -
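
The remainder of the hunk is truncated in the archive. For readers following the
TSC discussion: on VMX the guest-visible TSC is the host TSC plus a fixed offset
held in the VMCS, so "writing" the guest TSC means programming TSC_OFFSET. A hedged
sketch of what guest_write_tsc() boils down to (the signature follows the
declaration added earlier in this patch; the body is an assumption based on the
existing vmx.c helper, not part of the truncated hunk):

/* Sketch (assumed, not quoted from the patch): guest TSC = host TSC + offset,
 * so setting the guest TSC to guest_tsc relative to host_tsc is one VMCS write. */
static void guest_write_tsc(u64 host_tsc, u64 guest_tsc)
{
	vmcs_write64(TSC_OFFSET, guest_tsc - host_tsc);
}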

Re: [kvm-devel] [PATCH] fix cpuid function 4

2008-01-14 Thread Dan Kenigsberg
On Mon, Jan 14, 2008 at 02:49:31PM +0100, Alexander Graf wrote:
 Hi,
 
 Currently CPUID function 4 is broken. This function's values rely on the
 value of ECX.
 To solve the issue cleanly, there is already a new API for cpuid
 settings, which is not used yet.
 Using the current interface, the function 4 can be easily passed
 through, by giving multiple function 4 outputs and increasing the
 index-identifier on the fly. This does not break compatibility.
 
 This fix is really important for Mac OS X, as it requires cache
 information. Please also see my previous patches for Mac OS X (or rather
 core duo target) compatibility.
 
 Regards,
 
 Alex

 diff --git a/kernel/x86.c b/kernel/x86.c
 index b55c177..73312e9 100644
 --- a/kernel/x86.c
 +++ b/kernel/x86.c
 @@ -783,7 +783,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
   struct kvm_cpuid *cpuid,
   struct kvm_cpuid_entry __user *entries)
  {
 - int r, i;
 + int r, i, n = 0;
   struct kvm_cpuid_entry *cpuid_entries;
  
   r = -E2BIG;
 @@ -803,8 +803,17 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu 
 *vcpu,
  vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx;
  vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx;
  vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx;
 - vcpu->arch.cpuid_entries[i].index = 0;
 - vcpu->arch.cpuid_entries[i].flags = 0;
 +switch (vcpu->arch.cpuid_entries[i].function) {
 +case 4:
 +vcpu->arch.cpuid_entries[i].index = n;
 +vcpu->arch.cpuid_entries[i].flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
 +n++;
 +break;
 +default:
 +vcpu->arch.cpuid_entries[i].index = 0;
 +vcpu->arch.cpuid_entries[i].flags = 0;
 +break;
 +}

I will not mention the whitespace damage here :-). Instead, I'd ask you
to review, comment, and even try, the patch that I posted here not long
ago, exposing all safe host cpuid functions to guests.

Thanks,
Dan.


diff --git a/libkvm/libkvm-x86.c b/libkvm/libkvm-x86.c
index 2fa8146..e9a8113 100644
--- a/libkvm/libkvm-x86.c
+++ b/libkvm/libkvm-x86.c
@@ -485,20 +485,109 @@ __u64 kvm_get_cr8(kvm_context_t kvm, int vcpu)
 }
 
 int kvm_setup_cpuid(kvm_context_t kvm, int vcpu, int nent,
-   struct kvm_cpuid_entry *entries)
+   struct kvm_cpuid_entry2 *entries)
 {
-   struct kvm_cpuid *cpuid;
-   int r;
+   int r = -ENOSYS;
 
+#ifdef KVM_CAP_EXT_CPUID
+   r = ioctl(kvm->fd, KVM_CHECK_EXTENSION, KVM_CAP_EXT_CPUID);
+   if (r <= 0)
+#endif
+   { /* kernel has no KVM_SET_CPUID2 */
+   int i;
+   struct kvm_cpuid *cpuid;
+
+   cpuid = malloc(sizeof(*cpuid) +
+   nent * sizeof(struct kvm_cpuid_entry));
+   if (!cpuid)
+   return -ENOMEM;
+
+   cpuid->nent = nent;
+   for (i = 0; i < nent; ++i) {
+   cpuid->entries[i].function = entries[i].function;
+   cpuid->entries[i].eax = entries[i].eax;
+   cpuid->entries[i].ebx = entries[i].ebx;
+   cpuid->entries[i].ecx = entries[i].ecx;
+   cpuid->entries[i].edx = entries[i].edx;
+   }
+   r = ioctl(kvm->vcpu_fd[vcpu], KVM_SET_CPUID, cpuid);
+   if (r)
+   r = -errno;
+
+   free(cpuid);
+   }
+#ifdef KVM_CAP_EXT_CPUID
+   else {
+   struct kvm_cpuid2 *cpuid;
cpuid = malloc(sizeof(*cpuid) + nent * sizeof(*entries));
if (!cpuid)
return -ENOMEM;
 
cpuid->nent = nent;
memcpy(cpuid->entries, entries, nent * sizeof(*entries));
-   r = ioctl(kvm->vcpu_fd[vcpu], KVM_SET_CPUID, cpuid);
+   r = ioctl(kvm->vcpu_fd[vcpu], KVM_SET_CPUID2, cpuid);
+   if (r)
+   r = -errno;
+
+   free(cpuid);
+   }
+#endif
+   return r;
+}
+
+int kvm_get_supported_cpuid(kvm_context_t kvm, int *nent,
+   struct kvm_cpuid_entry2 *entries)
+{
+   struct kvm_cpuid2 *cpuid;
+   int r = -ENOSYS;
+
+#ifdef KVM_CAP_EXT_CPUID
+   r = ioctl(kvm->fd, KVM_CHECK_EXTENSION, KVM_CAP_EXT_CPUID);
+   if (r <= 0)
+   return -ENOSYS;
+
+   cpuid = malloc(sizeof(*cpuid) + *nent * sizeof(*entries));
+   if (!cpuid)
+   return -ENOMEM;
+   cpuid->nent = *nent;
 
+   r = ioctl(kvm->vm_fd, KVM_GET_SUPPORTED_CPUID, cpuid);
+   if (r)
+   r = -errno;
+   else {
+   memcpy(entries, cpuid->entries, *nent * sizeof(*entries));
+   *nent = cpuid->nent;
+   }
free(cpuid);
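
For context on why leaf 4 needs an index at all: CPUID function 4 returns one cache
descriptor per ECX value, and software walks ECX upward until the cache-type field
in EAX reads 0, which is why a single (function, index=0) entry cannot describe the
cache hierarchy. A hedged userland sketch of that walk (plain GCC __cpuid_count(),
unrelated to the libkvm API above):

/* Sketch: enumerating CPUID leaf 4 - one sub-leaf per ECX value until the
 * cache type (EAX bits 4:0) comes back 0.  Illustrative only. */
#include <stdio.h>
#include <cpuid.h>

int main(void)
{
    unsigned int eax, ebx, ecx, edx, idx;

    for (idx = 0; ; idx++) {
        __cpuid_count(4, idx, eax, ebx, ecx, edx);
        if ((eax & 0x1f) == 0)          /* cache type 0: no more caches */
            break;
        printf("cache %u: type %u, level %u\n",
               idx, eax & 0x1f, (eax >> 5) & 0x7);
    }
    return 0;
}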

Re: [kvm-devel] [kvm-ppc-devel] RFC: MMIO endianness flag

2008-01-14 Thread Hollis Blanchard
I hope I've explained in the other mail I just sent why Qemu assuming
little-endian for everything is not OK.

One other important clarification: kvm_run->mmio.is_bigendian is about
*one* *particular* *MMIO* *access*. It has only a coincidental
relationship to the endianness mode of the guest.

-- 
Hollis Blanchard
IBM Linux Technology Center

On Mon, 2008-01-14 at 13:42 +0800, Xu, Anthony wrote:
  kvm_run->mmio.is_bigendian = vcpu->arch.some_register 
 From your example code, I can know is_bigendian indicate whether guest
 is in bigendian mode when accessing MMIO.
 
 Qemu is responsible for handling MMIO emulation; Qemu always assumes it
 is running in little-endian mode.
 So if guest accesses MMIO in bigendian mode,
   kvm need to transform it to littleendian before delivering this
 MMIO request to Qemu.
 
 
 This works for IA64 side.
 
 
 Thanks,
 - Anthony
 
 
 Hollis Blanchard wrote:
  We're just talking about a flag in the kvm_run.mmio structure, so it
  does not represent the state of any software, guest or host, other
  than that single MMIO access.
  
  This flag is only used for communication between host kernel and host
  userland, so the host kernel is always responsible for setting it.
  
  is_bigendian is just one more necessary piece of information for
  MMIO emulation. kvm_run already tells you that you are loading 4
  bytes from address 0. What you don't know today is if byte 0 is the
  least significant or most significant byte. If is_bigendian is set,
  you know that byte 0 is the MSB and byte 3 is the LSB. If not, the
  opposite is true.
  
  In the simplest case, IA64's in-kernel MMIO emulation code could look
  something like:
   kvm_run->mmio.phys_addr = addr;
   kvm_run->mmio.len = len;
   ...
   kvm_run->mmio.is_bigendian = vcpu->arch.some_register & BIGENDIAN;
  
  If IA64 has reverse-endian load/store instructions like PowerPC, then
  you would also need to consider the particular instruction used as
  well as the guest state.
  
  
  On Fri, 2008-01-11 at 10:02 +0800, Xu, Anthony wrote:
  Hi all,
  
  That's a good start to consider BE.
  Yes, IA64 support BE and LE.
  
  I have below comments.
  
  What does is_bigendian mean?
  Host is runing with BE or guest is running with BE.
  Who will set is_bigendian?
  
  For supporing BE,
  We need to consider host BE and guest BE.
  For IA64,  most OS is running with LE, and
  Application can run with BE or LE,
  For example, Qemu can run with BE or LE.
  
  IMHO, we need two flags,
  host_is_bigendian indicates Qemu is running with BE
   Guest_is_bigendian indicates the guest application who is accessing
   MMIO is running with LE.
  
  
  - Anthony
  
  
  
  
  
  
  Hollis Blanchard wrote:
  On Thu, 2008-01-10 at 17:28 +0200, Avi Kivity wrote:
  I'll apply that patch (with a #ifdef CONFIG_PPC so other archs
  don't use it by mistake).
  
  I don't think that's the right ifdef. For example, I believe IA64
  can run in BE mode and so will have the same issue, and there are
  certainly other architectures (less relevant to the current code)
  that definitely are in the same situation.
  
  We need to plumb this through to the libkvm users anyways. Take a
  look at the patch below and tell me if you think it's not the right
  approach. x86 simply won't consider 'is_bigendian'. I spent a lot
  of time on this, and it's by far the cleanest solution I could come
  up with. 
  
  
  
   diff --git a/libkvm/config-i386.mak b/libkvm/config-i386.mak
   --- a/libkvm/config-i386.mak
   +++ b/libkvm/config-i386.mak
   @@ -2,5 +2,6 @@ LIBDIR := /lib
   LIBDIR := /lib
   CFLAGS += -m32
   CFLAGS += -D__i386__
  +CFLAGS += -DARCH_MMIO_ENDIAN_LITTLE
  
   libkvm-$(ARCH)-objs := libkvm-x86.o
   diff --git a/libkvm/config-ia64.mak b/libkvm/config-ia64.mak
   --- a/libkvm/config-ia64.mak
   +++ b/libkvm/config-ia64.mak
   @@ -1,5 +1,6 @@
  
   LIBDIR := /lib
   CFLAGS += -D__ia64__
  +CFLAGS += -DARCH_MMIO_ENDIAN_LITTLE -DARCH_MMIO_ENDIAN_BIG
  
   libkvm-$(ARCH)-objs := libkvm-ia64.o
  diff --git a/libkvm/config-powerpc.mak b/libkvm/config-powerpc.mak
  --- a/libkvm/config-powerpc.mak
  +++ b/libkvm/config-powerpc.mak
  @@ -2,5 +2,6 @@ LIBDIR := /lib
   LIBDIR := /lib
   CFLAGS += -m32
   CFLAGS += -D__powerpc__
  +CFLAGS += -DARCH_MMIO_ENDIAN_BIG -DARCH_MMIO_ENDIAN_LITTLE
  
   libkvm-$(ARCH)-objs := libkvm-powerpc.o
  diff --git a/libkvm/config-x86_64.mak b/libkvm/config-x86_64.mak
  --- a/libkvm/config-x86_64.mak
  +++ b/libkvm/config-x86_64.mak
  @@ -2,5 +2,6 @@ LIBDIR := /lib64
   LIBDIR := /lib64
   CFLAGS += -m64
   CFLAGS += -D__x86_64__
  +CFLAGS += -DARCH_MMIO_ENDIAN_LITTLE
  
   libkvm-$(ARCH)-objs := libkvm-x86.o
  diff --git a/libkvm/libkvm.c b/libkvm/libkvm.c
  --- a/libkvm/libkvm.c
  +++ b/libkvm/libkvm.c
  @@ -774,21 +774,56 @@ int kvm_set_sregs(kvm_context_t kvm, int
    return ioctl(kvm->vcpu_fd[vcpu], KVM_SET_SREGS, sregs);  }
  
  +#ifdef ARCH_MMIO_ENDIAN_BIG
  +static int 

Re: [kvm-devel] [kvm-ppc-devel] RFC: MMIO endianness flag

2008-01-14 Thread Hollis Blanchard
On Sun, 2008-01-13 at 11:42 +0200, Avi Kivity wrote:

 Do we really need to propagate endianness all the way to the user?  
 Perhaps libkvm could call the regular mmio functions and do the 
 transformation itself.
 
 Or maybe even the kernel can do this by itself?

The kernel *already* does this by itself, and I'm attempting to explain
why that is not sufficient.

My point is precisely that the endianness information must be propagated
to the user, otherwise the user may not have all the information it
needs to emulate it.

Here is the concrete example:
  * guest writes to MMIO
  * KVM passes MMIO information (physical address, number of bytes,
value) to qemu
  * Qemu knows from the address that this access is for a passthough
device, a special case the administrator has pre-configured
  * Qemu does mmap(/dev/mem), and writes length bytes of value
at offset address.

Now here's the catch: what endianness does qemu use when doing the
write? If qemu only does BE, then a LE access from the guest will be
byte-reversed when presented to the real hardware.

-- 
Hollis Blanchard
IBM Linux Technology Center
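
A hedged sketch of what the per-access flag buys the consumer: userland can
reconstruct the value the guest intended no matter which endianness the access
used (the helper name is invented; kvm_run->mmio.is_bigendian is the proposed
field, not an existing one):

#include <stdint.h>

/* Sketch only: byte 0 is the MSB when is_bigendian is set, the LSB otherwise,
 * exactly as described in the proposal above. */
static uint32_t mmio_value(const uint8_t *data, int len, int is_bigendian)
{
    uint32_t val = 0;
    int i;

    for (i = 0; i < len; i++) {
        if (is_bigendian)
            val = (val << 8) | data[i];
        else
            val |= (uint32_t)data[i] << (8 * i);
    }
    return val;
}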




Re: [kvm-devel] [PATCH] add acpi powerbutton support

2008-01-14 Thread Avi Kivity

Guido Guenther wrote:
  

- backout the madt interrupt source override changes


Should these stay, but with active low instead of active high?
  


They can go away, since active low is the default.

For reference, I'm attaching the patches I'm using.

--
error compiling committee.c: too many arguments to function

From 736880c1e550f84778e0cd13ae7dd27988dee902 Mon Sep 17 00:00:00 2001
From: Avi Kivity [EMAIL PROTECTED]
Date: Mon, 14 Jan 2008 17:34:11 +0200
Subject: [PATCH] kvm: bios: pci interrupts are active low

Signed-off-by: Avi Kivity [EMAIL PROTECTED]
---
 bios/acpi-dsdt.dsl |   16 
 1 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/bios/acpi-dsdt.dsl b/bios/acpi-dsdt.dsl
index df255ce..92fd126 100755
--- a/bios/acpi-dsdt.dsl
+++ b/bios/acpi-dsdt.dsl
@@ -382,7 +382,7 @@ DefinitionBlock (
  Name(_HID, EISAID("PNP0C0F")) // PCI interrupt link
 Name(_UID, 1)
 Name(_PRS, ResourceTemplate(){
-Interrupt (, Level, ActiveHigh, Shared)
+Interrupt (, Level, ActiveLow, Shared)
 { 5, 9, 10, 11 }
 })
 Method (_STA, 0, NotSerialized)
@@ -402,7 +402,7 @@ DefinitionBlock (
 {
 Name (PRR0, ResourceTemplate ()
 {
-Interrupt (, Level, ActiveHigh, Shared)
+Interrupt (, Level, ActiveLow, Shared)
 {1}
 })
 CreateDWordField (PRR0, 0x05, TMP)
@@ -427,7 +427,7 @@ DefinitionBlock (
  Name(_HID, EISAID("PNP0C0F")) // PCI interrupt link
 Name(_UID, 2)
 Name(_PRS, ResourceTemplate(){
-Interrupt (, Level, ActiveHigh, Shared)
+Interrupt (, Level, ActiveLow, Shared)
 { 5, 9, 10, 11 }
 })
 Method (_STA, 0, NotSerialized)
@@ -447,7 +447,7 @@ DefinitionBlock (
 {
 Name (PRR0, ResourceTemplate ()
 {
-Interrupt (, Level, ActiveHigh, Shared)
+Interrupt (, Level, ActiveLow, Shared)
 {1}
 })
 CreateDWordField (PRR0, 0x05, TMP)
@@ -472,7 +472,7 @@ DefinitionBlock (
  Name(_HID, EISAID("PNP0C0F")) // PCI interrupt link
 Name(_UID, 3)
 Name(_PRS, ResourceTemplate(){
-Interrupt (, Level, ActiveHigh, Shared)
+Interrupt (, Level, ActiveLow, Shared)
 { 5, 9, 10, 11 }
 })
 Method (_STA, 0, NotSerialized)
@@ -492,7 +492,7 @@ DefinitionBlock (
 {
 Name (PRR0, ResourceTemplate ()
 {
-Interrupt (, Level, ActiveHigh, Shared)
+Interrupt (, Level, ActiveLow, Shared)
 {1}
 })
 CreateDWordField (PRR0, 0x05, TMP)
@@ -517,7 +517,7 @@ DefinitionBlock (
  Name(_HID, EISAID("PNP0C0F")) // PCI interrupt link
 Name(_UID, 4)
 Name(_PRS, ResourceTemplate(){
-Interrupt (, Level, ActiveHigh, Shared)
+Interrupt (, Level, ActiveLow, Shared)
 { 5, 9, 10, 11 }
 })
 Method (_STA, 0, NotSerialized)
@@ -537,7 +537,7 @@ DefinitionBlock (
 {
 Name (PRR0, ResourceTemplate ()
 {
-Interrupt (, Level, ActiveHigh, Shared)
+Interrupt (, Level, ActiveLow, Shared)
 {1}
 })
 CreateDWordField (PRR0, 0x05, TMP)
-- 
1.5.3.7

From 96ecf8e23154baa8020c58316cc013b8fa28689c Mon Sep 17 00:00:00 2001
From: Avi Kivity [EMAIL PROTECTED]
Date: Mon, 14 Jan 2008 17:47:35 +0200
Subject: [PATCH] kvm: bios: don't advertise the pci interrupts as active high

Signed-off-by: Avi Kivity [EMAIL PROTECTED]
---
 bios/rombios32.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/bios/rombios32.c b/bios/rombios32.c
index 967c119..ebfbb24 100755
--- a/bios/rombios32.c
+++ b/bios/rombios32.c
@@ -1369,7 +1369,7 @@ void acpi_bios_init(void)
 
 intsrcovr = (struct madt_intsrcovr*)(io_apic + 1);
 for ( i = 0; i  16; i++ ) {
-if ( PCI_ISA_IRQ_MASK & (1U << i) ) {
+if ( 0 && (PCI_ISA_IRQ_MASK & (1U << i) )) {
 memset(intsrcovr, 0, sizeof(*intsrcovr));
  intsrcovr->type   = APIC_XRUPT_OVERRIDE;
  intsrcovr->length = sizeof(*intsrcovr);
-- 
1.5.3.7

From 5a5564a0998f42c5394090167d0dbf1e111dde10 Mon Sep 17 00:00:00 2001
From: Avi Kivity [EMAIL 

Re: [kvm-devel] Setting hardware breakpoints in guest OS

2008-01-14 Thread Avi Kivity
[EMAIL PROTECTED] wrote:
 While we tried to make debugging inside the guest work, this
 was never really tested, so it's likely broken.  I'll try to
 look at what it will take to make it work; I don't think there's
 much needed.
 

   

Thinking a bit, it may well be broken only when using the external 
module.  If you use a distro-provided kvm (or compile your own kernel 
with kvm included) then it might work.

Then again, distro provided kvms tend to be old and therefore slow.

 That sounds encouraging -- I had imagined there might be some
 impossibility factor in sharing something like hardware breakpoints
 between host and guest.

   

Both vmx and svm fully allow virtualizing the hardware breakpoints (and 
even things like last branch recording).

 For now I'm simply sticking to QEMU+kqemu when I expect deliberate
 trickiness or need to do hard-breakpoint debugging, and QEMU/KVM (which is
 up to 50% faster when doing Windows software builds on my PC, nice!) when I
 don't care.

 I haven't had any problems loading and using the kvm drivers and kqemu at
 the same time, and I have assumed that there ought to be no issues in doing
 so, since they work quite differently and (from my very dangerously limited
 understanding) ought not to be competing for any mutually exclusive
 hardware resources. Is that a reasonable assumption?
   

I believe so.  I know VirtualBox and VMware have problems coexisting 
with kvm (since they tend to switch to real mode which is forbidden by 
vmx), but if kqemu works, then there shouldn't be any hidden problems.

   
 What hardware are you using?  If you have both AMD and Intel
 hardware, you might have better luck switching, since this is
 very subarch dependent.
 

 Intel Core Duo (T2400 @ 1.83GHz according to /proc/cpuinfo), running 32-bit
 Linux 2.6.21.5 using KVM drivers built from the kvm-59 sourceball.

 Sorry, I don't have other vendors or CPU bitnesses to test on.

 PS: When I build KVM out of the box, I get a qemu binary called
 qemu-system-x86_64, though I have a 32-bit CPU and a 32-bit OS. Forgive my
 ignorance on this, but...why does the name of the binary imply a 64-bit
 flavour?
   

The qemu binary is 32-bit, but is capable of running a 64-bit guest if 
you have a 64-bit cpu and a 64-bit kernel (still retaining 32-bit 
userspace).

You can generate a 32-bit only binary, but that does not have any 
advantages over the 64-bit capable binary.

Yes, it confuses me too.

-- 
error compiling committee.c: too many arguments to function




Re: [kvm-devel] KVM swapping with mmu notifiers

2008-01-14 Thread Andrea Arcangeli
On Mon, Jan 14, 2008 at 05:43:58PM +0200, Avi Kivity wrote:
 heavy handed.  Maybe it can be fixed in some clever way with rcu or with a 
 rwlock around the memory slot map.

Ok, first the alias looked superfluous so I just dropped it (the whole
point of the alias is to never call get_user_pages in the aliased hva,
so the notifiers should never trigger in those alias gfn ranges).

And the code was doing hva->gfn->rmap, while I'm now doing hva->rmap
directly to avoid two loops over the memslot instead of only one. To
serialize hva_to_rmapp mmap_sem is taken (in any mode) or the mmu_lock
is taken, and it's enough to serialize the insertion with the mmu_lock
and setting userspace_addr to 0 before insertion if the buffer isn't
allocated by userland. Setting userspace_addr atomically will make the
memslot visible.

This is a new version of the kvm.git patch.

Please Marcelo let me know if you see more problems, thanks a lot for
the review!

Signed-off-by: Andrea Arcangeli [EMAIL PROTECTED]

diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 4086080..c527d7d 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -18,6 +18,7 @@ config KVM
tristate "Kernel-based Virtual Machine (KVM) support"
depends on ARCH_SUPPORTS_KVM && EXPERIMENTAL
select PREEMPT_NOTIFIERS
+   select MMU_NOTIFIER
select ANON_INODES
---help---
  Support hosting fully virtualized guest machines using hardware
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 324ff9a..189f3e1 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -532,6 +532,38 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
kvm_flush_remote_tlbs(kvm);
 }
 
+static void unmap_spte(struct kvm *kvm, u64 *spte)
+{
+   struct page *page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
+   get_page(page);
+   rmap_remove(kvm, spte);
+   set_shadow_pte(spte, shadow_trap_nonpresent_pte);
+   kvm_flush_remote_tlbs(kvm);
+   __free_page(page);
+}
+
+void kvm_rmap_unmap_hva(struct kvm *kvm, unsigned long hva)
+{
+   unsigned long *rmapp;
+   u64 *spte, *curr_spte;
+
+   spin_lock(&kvm->mmu_lock);
+   rmapp = kvm_hva_to_rmapp(kvm, hva);
+   if (!rmapp)
+   goto out_unlock;
+
+   spte = rmap_next(kvm, rmapp, NULL);
+   while (spte) {
+   BUG_ON(!(*spte & PT_PRESENT_MASK));
+   rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
+   curr_spte = spte;
+   spte = rmap_next(kvm, rmapp, spte);
+   unmap_spte(kvm, curr_spte);
+   }
+out_unlock:
+   spin_unlock(&kvm->mmu_lock);
+}
+
 #ifdef MMU_DEBUG
 static int is_empty_shadow_page(u64 *spt)
 {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 8a90403..271ce12 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3159,6 +3159,33 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
free_page((unsigned long)vcpu->arch.pio_data);
 }
 
+static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
+{
+   return container_of(mn, struct kvm, mmu_notifier);
+}
+
+void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long address)
+{
+   struct kvm *kvm = mmu_notifier_to_kvm(mn);
+   BUG_ON(mm != kvm->mm);
+   kvm_rmap_unmap_hva(kvm, address);
+}
+
+void kvm_mmu_notifier_invalidate_range(struct mmu_notifier *mn,
+  struct mm_struct *mm,
+  unsigned long start, unsigned long end)
+{
+   for (; start < end; start += PAGE_SIZE)
+   kvm_mmu_notifier_invalidate_page(mn, mm, start);
+}
+
+static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
+   .invalidate_range   = kvm_mmu_notifier_invalidate_range,
+   .invalidate_page= kvm_mmu_notifier_invalidate_page,
+};
+
 struct  kvm *kvm_arch_create_vm(void)
 {
struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
@@ -3167,6 +3194,7 @@ struct  kvm *kvm_arch_create_vm(void)
return ERR_PTR(-ENOMEM);
 
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
+   kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
 
return kvm;
 }
@@ -3219,14 +3247,20 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
 */
if (!user_alloc) {
if (npages && !old.rmap) {
-   memslot->userspace_addr = do_mmap(NULL, 0,
-npages * PAGE_SIZE,
-PROT_READ | PROT_WRITE,
-MAP_SHARED | MAP_ANONYMOUS,
-0);
-
-   if (IS_ERR((void *)memslot->userspace_addr))
-   return PTR_ERR((void *)memslot->userspace_addr);

Re: [kvm-devel] [kvm-ppc-devel] RFC: MMIO endianness flag

2008-01-14 Thread Avi Kivity
Hollis Blanchard wrote:
 On Sun, 2008-01-13 at 11:42 +0200, Avi Kivity wrote:

   
 Do we really need to propagate endianness all the way to the user?  
 Perhaps libkvm could call the regular mmio functions and do the 
 transformation itself.

 Or maybe even the kernel can do this by itself?
 

 The kernel *already* does this by itself, and I'm attempting to explain
 why that is not sufficient.

 My point is precisely that the endianness information must be propagated
 to the user, otherwise the user may not have all the information it
 needs to emulate it.

 Here is the concrete example:
   * guest writes to MMIO
   * KVM passes MMIO information (physical address, number of bytes,
 value) to qemu
   * Qemu knows from the address that this access is for a passthough
 device, a special case the administrator has pre-configured
   * Qemu does mmap(/dev/mem), and writes length bytes of value
 at offset address.

 Now here's the catch: what endianness does qemu use when doing the
 write? If qemu only does BE, then a LE access from the guest will be
 byte-reversed when presented to the real hardware.
   

Right, I forgot that bit when replying.

So, the data is always in natural endianness?  If so, we can add an 
endianness indicator to the mmio callbacks, which qemu will mostly 
ignore, and only examine it on passthrough.

btw, isn't passthrough better handled through the tlb?  i.e. actually 
let the guest access the specially-configured memory?  You can have qemu 
mmap /dev/mem and install it as a memslot, and things should work, no?  
(well, you might need to set some cachability flag or other).
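
A hedged sketch of that memslot route, using the kernel's KVM_SET_USER_MEMORY_REGION
ioctl (error handling, the exact libkvm wrappers of this era, and the cachability
handling mentioned above are all omitted or assumed):

/* Sketch: mmap a device region from /dev/mem and expose it to the guest as
 * ordinary guest-physical memory via a memory slot. */
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int map_passthrough_region(int vm_fd, __u64 guest_phys,
                                  off_t dev_phys, size_t size, int slot)
{
    int fd = open("/dev/mem", O_RDWR | O_SYNC);
    void *va = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED,
                    fd, dev_phys);

    struct kvm_userspace_memory_region region = {
        .slot            = slot,
        .guest_phys_addr = guest_phys,
        .memory_size     = size,
        .userspace_addr  = (unsigned long)va,
    };
    /* guest loads/stores to guest_phys now go through the normal memslot
     * path instead of exiting to userspace for mmio emulation */
    return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
}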

-- 
error compiling committee.c: too many arguments to function




[kvm-devel] only for better reading

2008-01-14 Thread inflo
hi,
this patch is only for better reading.

flo

--- x86.c   2008-01-14 18:05:07.0 +0100
+++ changed/x86.c   2008-01-14 18:06:10.0 +0100
@@ -407,10 +407,15 @@
  * capabilities of the host cpu.
  */
 static u32 msrs_to_save[] = {
-   MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
+   MSR_IA32_SYSENTER_CS, 
+   MSR_IA32_SYSENTER_ESP, 
+   MSR_IA32_SYSENTER_EIP,
MSR_K6_STAR,
 #ifdef CONFIG_X86_64
-   MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
+   MSR_CSTAR, 
+   MSR_KERNEL_GS_BASE, 
+   MSR_SYSCALL_MASK, 
+   MSR_LSTAR,
 #endif
MSR_IA32_TIME_STAMP_COUNTER,
 };



Re: [kvm-devel] mmu notifiers

2008-01-14 Thread Christoph Lameter
On Sat, 12 Jan 2008, Avi Kivity wrote:

 Two kvm instances mmap() the file (from anywhere) into the guest address
 space.  That memory is shared, and will be backed by the same page structs at
 the same offset.

Duh. Impossible. Two instances of Linux cannot share page structs. So how 
are you doing this? Or is this just an idea?






Re: [kvm-devel] [PATCH] mmu notifiers #v2

2008-01-14 Thread Benjamin Herrenschmidt

On Mon, 2008-01-14 at 12:02 -0800, Christoph Lameter wrote:
 On Sun, 13 Jan 2008, Andrea Arcangeli wrote:
 
  About the locking perhaps I'm underestimating it, but by following the
  TLB flushing analogy, by simply clearing the shadow ptes (with kvm
  mmu_lock spinlock to avoid racing with other vcpu spte accesses of
  course) and flushing the shadow-pte after clearing the main linux pte,
  it should be enough to serialize against shadow-pte page faults that
  would call into get_user_pages. Flushing the host TLB before or after
  the shadow-ptes shouldn't matter.
 
 Hmmm... In most of the callsites we hold a writelock on mmap_sem right?

Not in unmap_mapping_range() afaik.

  Comments welcome... especially from SGI/IBM/Quadrics and all other
  potential users of this functionality.
 
  There are also certain details I'm uncertain about, like passing 'mm'
  to the lowlevel methods, my KVM usage of the invalidate_page()
  notifier for example only uses 'mm' for a BUG_ON for example:
 
 Passing mm is fine as long as mmap_sem is held.

Passing mm is always a good idea, regardless of the mmap_sem, it can be
useful for lots of other things :-)

  diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
  --- a/include/asm-generic/pgtable.h
  +++ b/include/asm-generic/pgtable.h
  @@ -86,6 +86,7 @@ do {  
  \
  pte_t __pte;\
__pte = ptep_get_and_clear((__vma)->vm_mm, __address, __ptep);  \
  flush_tlb_page(__vma, __address);   \
+   mmu_notifier(invalidate_page, (__vma)->vm_mm, __address);   \
  __pte;  \
   })
   #endif
 
 Hmmm... this is ptep_clear_flush? What about the other uses of 
 flush_tlb_page in asm-generic/pgtable.h and related uses in arch code?
 (would help if your patches would mention the function name in the diff 
 headers)

Note that last I looked, a lot of these were stale. Might be time to
resume my spring/summer cleaning of page table accessors...

  +#define mmu_notifier(function, mm, args...)
  \
  +   do {\
  +   struct mmu_notifier *__mn;  \
  +   struct hlist_node *__n; \
  +   \
+   hlist_for_each_entry(__mn, __n, &(mm)->mmu_notifier, hlist) \
+   if (__mn->ops->function)\
+   __mn->ops->function(__mn, mm, args);\
  +   } while (0)
 
 Does this have to be inline? ptep_clear_flush will become quite big
 




Re: [kvm-devel] mmu notifiers

2008-01-14 Thread Avi Kivity
Christoph Lameter wrote:
 On Sat, 12 Jan 2008, Avi Kivity wrote:

   
 Two kvm instances mmap() the file (from anywhere) into the guest address
 space.  That memory is shared, and will be backed by the same page structs at
 the same offset.
 

 Duh. Impossible. Two instances of Linux cannot share page structs. So how 
 are you doing this? Or is this just an idea?

   

I was describing one Linux host running two guest instances.  The page 
structs are in the host, so they are shared by mmap().

kvm userspace is just an ordinary host process, it can mmap() any file 
it likes and then assign that virtual memory range to the guest (as 
guest physical memory).

-- 
error compiling committee.c: too many arguments to function




Re: [kvm-devel] [PATCH] fix cpuid function 4

2008-01-14 Thread Alexander Graf
Dan Kenigsberg wrote:
 On Mon, Jan 14, 2008 at 02:49:31PM +0100, Alexander Graf wrote:
   
 Hi,

 Currently CPUID function 4 is broken. This function's values rely on the
 value of ECX.
 To solve the issue cleanly, there is already a new API for cpuid
 settings, which is not used yet.
 Using the current interface, the function 4 can be easily passed
 through, by giving multiple function 4 outputs and increasing the
 index-identifier on the fly. This does not break compatibility.

 This fix is really important for Mac OS X, as it requires cache
 information. Please also see my previous patches for Mac OS X (or rather
 core duo target) compatibility.

 Regards,

 Alex
 

   
 diff --git a/kernel/x86.c b/kernel/x86.c
 index b55c177..73312e9 100644
 --- a/kernel/x86.c
 +++ b/kernel/x86.c
 @@ -783,7 +783,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu 
 *vcpu,
  struct kvm_cpuid *cpuid,
  struct kvm_cpuid_entry __user *entries)
  {
 -int r, i;
 +int r, i, n = 0;
  struct kvm_cpuid_entry *cpuid_entries;
  
  r = -E2BIG;
 @@ -803,8 +803,17 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu 
 *vcpu,
  vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx;
  vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx;
  vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx;
 -vcpu->arch.cpuid_entries[i].index = 0;
 -vcpu->arch.cpuid_entries[i].flags = 0;
 +switch (vcpu->arch.cpuid_entries[i].function) {
 +case 4:
 +vcpu->arch.cpuid_entries[i].index = n;
 +vcpu->arch.cpuid_entries[i].flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
 +n++;
 +break;
 +default:
 +vcpu->arch.cpuid_entries[i].index = 0;
 +vcpu->arch.cpuid_entries[i].flags = 0;
 +break;
 +}
 

 I will not mention the whitespace damage here :-). Instead, I'd ask you
   
Oh well, after having been in the qemu source, I just got used to using
spaces instead of tabs ;-).

 to review, comment, and even try, the patch that I posted here not long
 ago, exposing all safe host cpuid functions to guests.
   
Sure.
Basically your patch targets a completely different use case than
mine though. You want to expose the host features on the virtual CPU,
whereas my goal is to have a virtual Core Duo/Solo CPU, even if your
host CPU is actually an SVM capable one.

So my CoreDuo CPU definition still fails to populate a proper CPUID
function 4. With the -cpu host option, Linux works (as it's bright
enough to know that some values are just plain wrong), but Darwin
crashes. I am not exactly sure why it is, but I guess it's due to the
function 4 values exposing a 2-core CPU, which kvm simply doesn't emulate.

So far I still like the approach of the cpuid2 API, so it would be great
if you could take the patch one step further and have qemu defined CPUs
work properly as well.

I do not know too much about the kvm code style, so I can't really say
anything regarding that.

Regards,

Alex



Re: [kvm-devel] mmu notifiers

2008-01-14 Thread Avi Kivity
Christoph Lameter wrote:
 On Sun, 13 Jan 2008, Avi Kivity wrote:

   
 I was just explaining how kvm shares memory among guests (which does not
 require mmu notifiers); if you have some other configuration that can benefit
 from mmu notifiers, then, well, great.
 

 I think you have two page tables pointing to the same memory location 
 right (not to page structs but two ptes)? Without a mmu notifier the pages 
 in this memory range cannot be evicted because otherwise ptes of the other 
 instance will point to a page that is now used for a different purpose.
   

Even with just one guest we can't swap well without mmu notifiers.

kvm constructs new page tables for the guest that the Linux vm doesn't 
know about, so when Linux removes all the ptes, we need a callback to 
remove the kvm private ptes (and tlb entries).
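
The hookup that makes that callback happen is small; a hedged sketch of the
registration side (the ops table is the one from Andrea's patches earlier in this
thread, and the mmu_notifier_register() call is an assumption about the API's
final shape):

/* Sketch: tie kvm's private (shadow) page tables to the qemu process mm, so
 * that pte teardown (swap-out, munmap, truncate) calls back into kvm and the
 * matching sptes get zapped and the guest tlb flushed. */
static int kvm_hook_mmu_notifier(struct kvm *kvm)
{
	kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
	return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
}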

-- 
error compiling committee.c: too many arguments to function

