Re: [PATCH] regression: vmalloc easily fail.

2008-10-28 Thread Avi Kivity

Nick Piggin wrote:

Right... that was to add a guard page like the old vmalloc allocator.
vmallocs still add their extra page too, so most of them will have
a 2 page guard area, but I didn't think this would hurt significantly.

I'm not against the patch, but I wonder exactly what is filling it up
and how? (can you look at the vmalloc proc function to find out?)


Maybe we're allocating two guard pages, but freeing only one?

--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Add VMRUN handler v5

2008-10-28 Thread Alexander Graf


On 28.10.2008, at 19:38, Mike Day wrote:


On 20/10/08 19:04 +0200, Alexander Graf wrote:

+static int vmrun_interception(struct vcpu_svm *svm, struct kvm_run  
*kvm_run)

+{
+   nsvm_printk("VMrun\n");
+   if (nested_svm_check_permissions(svm))
+   return 1;
+
+   svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
+   skip_emulated_instruction(&svm->vcpu);
+
+   if (nested_svm_do(svm, svm->vmcb->save.rax, 0,
+ NULL, nested_svm_vmrun))
+   return 1;
+
+   if (nested_svm_do(svm, svm->vmcb->control.msrpm_base_pa, 0,
+ NULL, nested_svm_vmrun_msrpm))
+   return 1;
+
+   return 1;
+}


A nitpick, but you could remove the last if() statement and one of
the last two return statements. Unless you foresee more calls to
nested_svm_do() in here.


I had the IOPM merger in here and actually like the fall-through  
aspect of the function :-). But I guess this again is a personal taste  
thing.


Alex

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[01/03] [PATCH] KVM: ia64: Re-organize data structure of guests' data area

2008-10-28 Thread Zhang, Xiantao
>From 77601150901d7bb6b5542c14275709e81212062d Mon Sep 17 00:00:00 2001
From: Xiantao Zhang <[EMAIL PROTECTED]>
Date: Thu, 23 Oct 2008 14:56:44 +0800
Subject: [PATCH] KVM: ia64: Re-organize data structure of guests' data area.
 1. Increase the size of data area to 64M
 2. Support more vcpus and memory, 128 vcpus and 256G memory
 are supported for guests.
 3. Add the boundary check for memory and vcpu allocation.

With this patch, kvm guest's data area looks as follows:
  *
  *+--+  --- KVM_VM_DATA_SIZE
  *| vcpu[n]'s data   |   | 
___KVM_STK_OFFSET
  *|  |   |/   |
  *|..|   |   /vcpu's struct&stack |
  *|..|   |  /-| 0
  *| vcpu[5]'s data   |   | /   vpd|
  *| vcpu[4]'s data   |   |/---|
  *| vcpu[3]'s data   |   / vtlb   |
  *| vcpu[2]'s data   |  /||
  *| vcpu[1]'s data   |/  | vhpt   |
  *| vcpu[0]'s data   ||
  *+--+   |
  *|memory dirty log  |   |
  *+--+   |
  *|vm's data struct  |   |
  *+--+   |
  *|  |   |
  *|  |   |
  *|  |   |
  *|  |   |
  *|  |   |
  *|  |   |
  *|  |   |
  *|   vm's p2m table  |  |
  *|  |   |
  *|  |   |
  *|  |   |  |
  * vm's data->|  |   |  |
  *+--+ --- 0
  * To support large memory, needs to increase the size of p2m.
  * To support more vcpus, needs to ensure it has enough space to
  * hold vcpus' data.
  */

Signed-off-by: Xiantao Zhang <[EMAIL PROTECTED]>
---
 arch/ia64/include/asm/kvm_host.h |  192 --
 arch/ia64/kvm/kvm-ia64.c |   60 ++--
 arch/ia64/kvm/kvm_minstate.h |4 +-
 arch/ia64/kvm/misc.h |3 +-
 arch/ia64/kvm/vcpu.c |5 +-
 arch/ia64/kvm/vtlb.c |4 +-
 6 files changed, 161 insertions(+), 107 deletions(-)

diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h
index c60d324..678e264 100644
--- a/arch/ia64/include/asm/kvm_host.h
+++ b/arch/ia64/include/asm/kvm_host.h
@@ -23,17 +23,6 @@
 #ifndef __ASM_KVM_HOST_H
 #define __ASM_KVM_HOST_H

-
-#include 
-#include 
-#include 
-#include 
-#include 
-
-#include 
-#include 
-
-#define KVM_MAX_VCPUS 4
 #define KVM_MEMORY_SLOTS 32
 /* memory slots that does not exposed to userspace */
 #define KVM_PRIVATE_MEM_SLOTS 4
@@ -52,68 +41,127 @@
 #define EXIT_REASON_PTC_G  8

 /*Define vmm address space and vm data space.*/
-#define KVM_VMM_SIZE (16UL<<20)
+#define KVM_VMM_SIZE (__IA64_UL_CONST(16)<<20)
 #define KVM_VMM_SHIFT 24
-#define KVM_VMM_BASE 0xD000UL
-#define VMM_SIZE (8UL<<20)
+#define KVM_VMM_BASE 0xD000
+#define VMM_SIZE (__IA64_UL_CONST(8)<<20)

 /*
  * Define vm_buffer, used by PAL Services, base address.
- * Note: vmbuffer is in the VMM-BLOCK, the size must be < 8M
+ * Note: vm_buffer is in the VMM-BLOCK, the size must be < 8M
  */
 #define KVM_VM_BUFFER_BASE (KVM_VMM_BASE + VMM_SIZE)
-#define KVM_VM_BUFFER_SIZE (8UL<<20)
-
-/*Define Virtual machine data layout.*/
-#define KVM_VM_DATA_SHIFT  24
-#define KVM_VM_DATA_SIZE (1UL << KVM_VM_DATA_SHIFT)
-#define KVM_VM_DATA_BASE (KVM_VMM_BASE + KVM_VMM_SIZE)
-
-
-#define KVM_P2M_BASEKVM_VM_DATA_BASE
-#define KVM_P2M_OFS 0
-#define KVM_P2M_SIZE(8UL << 20)
-
-#define KVM_VHPT_BASE   (KVM_P2M_BASE + KVM_P2M_SIZE)
-#define KVM_VHPT_OFSKVM_P2M_SIZE
-#define KVM_VHPT_BLOCK_SIZE   (2UL << 20)
-#define VHPT_SHIFT  18
-#define VHPT_SIZE   (1UL << VHPT_SHIFT)
-#define VHPT_NUM_ENTRIES (1<<(VHPT_SHIFT-5))
-
-#define KVM_VTLB_BASE   (KVM_VHPT_BASE+KVM_VHPT_BLOCK_SIZE)
-#define KVM_VTLB_OFS(KVM_VHPT_OFS+KVM_VHPT_BLOCK_SIZE)
-#define KVM_VTLB_BLOCK_SIZE   (1UL<<20)
-#define VTLB_SHIFT  17
-#define VTLB_SIZE   (1UL<| |   |  |
+ *   +--+ --- 0
+ * To support large memory, needs to increase the size of p2m.
+ * To support more vcpus, needs to ensure it has enough space to
+ * hold vcpus' data.
+ */
+
+#define KVM_VM_DATA_SHIFT  26
+#define KVM_VM_DATA_SIZE   (__IA64_UL_CONST(1) << KVM_VM_DATA_SHIFT)
+#define KVM_VM_DATA_BASE   (KVM_VMM_BASE + KVM_VM_DATA_SI

[03/03][PATCH] KVM: ia64: kvm halt logic doesn't need lock to protect.

2008-10-28 Thread Zhang, Xiantao
>From 4858a5c47c5dce88a62a6edf427d8709f3ebda15 Mon Sep 17 00:00:00 2001
From: Xiantao Zhang <[EMAIL PROTECTED]>
Date: Thu, 23 Oct 2008 15:03:38 +0800
Subject: [PATCH] KVM: ia64: kvm halt logic doesn't need lock to protect.

Remove the lock protection for kvm halt logic; otherwise,
once other vcpus want to acquire the lock, they have to
wait until all vcpus are woken up from halt.
Signed-off-by: Xiantao Zhang <[EMAIL PROTECTED]>
---
 arch/ia64/kvm/kvm-ia64.c |2 --
 1 files changed, 0 insertions(+), 2 deletions(-)

diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 6b1e31b..93c7f18 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -439,7 +439,6 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu)
expires = div64_u64(itc_diff, cyc_per_usec);
kt = ktime_set(0, 1000 * expires);
 
-   down_read(&vcpu->kvm->slots_lock);
vcpu->arch.ht_active = 1;
hrtimer_start(p_ht, kt, HRTIMER_MODE_ABS);
 
@@ -452,7 +451,6 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu)
if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
vcpu->arch.mp_state =
KVM_MP_STATE_RUNNABLE;
-   up_read(&vcpu->kvm->slots_lock);
 
if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE)
return -EINTR;
-- 
1.6.0


0003-KVM-ia64-kvm-halt-logic-doesn-t-need-lock-to-prote.patch
Description: 0003-KVM-ia64-kvm-halt-logic-doesn-t-need-lock-to-prote.patch


[02/03][PATCH] KVM: ia64: Ensure SIGINT delivered to main thread (vcpu 0).

2008-10-28 Thread Zhang, Xiantao
>From dd0f4f43e038d33472dbbf6d6b75d4d84d1bc3f9 Mon Sep 17 00:00:00 2001
From: Xiantao Zhang <[EMAIL PROTECTED]>
Date: Thu, 23 Oct 2008 15:02:52 +0800
Subject: [PATCH] KVM: ia64: Ensure SIGINT delivered to main thread (vcpu 0).

Before APs go into the blocked state, make sure SIGINT is
masked; otherwise, an AP may eat the SIGINT sent by the user to kill the
guest, leaving Qemu hanging because the main thread can't get it to free
the guest's resources.

Signed-off-by: Xiantao Zhang <[EMAIL PROTECTED]>
---
 arch/ia64/kvm/kvm-ia64.c |   12 ++--
 1 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 54a90b8..6b1e31b 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -673,16 +673,16 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct 
kvm_run *kvm_run)
 
vcpu_load(vcpu);
 
+   if (vcpu->sigset_active)
+   sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
+
if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
kvm_vcpu_block(vcpu);
clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
-   vcpu_put(vcpu);
-   return -EAGAIN;
+   r = -EAGAIN;
+   goto out;
}
 
-   if (vcpu->sigset_active)
-   sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
-
if (vcpu->mmio_needed) {
memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
kvm_set_mmio_data(vcpu);
@@ -690,7 +690,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct 
kvm_run *kvm_run)
vcpu->mmio_needed = 0;
}
r = __vcpu_run(vcpu, kvm_run);
-
+out:
if (vcpu->sigset_active)
sigprocmask(SIG_SETMASK, &sigsaved, NULL);
 
-- 
1.6.0


0002-KVM-ia64-Ensure-SIGINT-delivered-to-main-thread-v.patch
Description: 0002-KVM-ia64-Ensure-SIGINT-delivered-to-main-thread-v.patch


[00/03][PATCH] kvm-ia64 updates for linux-2.6.28-rc2

2008-10-28 Thread Zhang, Xiantao
Hi, Avi
Please review and apply the three patches!  The last two are key fixes 
for linux-2.6.28-rc2, and please also push them into upstream. 
Thanks!
Xiantao--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/2] Add HPET emulation to qemu (v3)

2008-10-28 Thread Alexander Graf





Am 27.10.2008 um 08:07 schrieb Beth Kon <[EMAIL PROTECTED]>:


On Tue, 2008-10-21 at 10:21 -0500, Anthony Liguori wrote:

Beth Kon wrote:


Thanks for the feedback, Anthony. I'll only respond where I have
specific comments. Otherwise, I agree to your suggestions and will  
make

the changes.


+if(timer_enabled(timer) && hpet_enabled(timer->state)) {
+qemu_irq_pulse(irq);
+/* windows wants timer0 on irq2 and linux wants irq0,
+ * so we pulse both
+ */
+if (do_ioapic)
+qemu_irq_pulse(timer->state->irqs[2]);



This seems curious and not quite right.  We should be able to detect
whether the HPET is being used in IO APIC mode and raise the  
appropriate

interrupt instead of generating a spurious irq0 interrupt.


After digging further on this, it turns out that the need for the 2
interrupts was caused by what looks like a problem with the way qemu  
is

generating interrupts for the ioapic. I will send out a separate patch
for that issue, and make the necessary changes in this hpet code.

+}
+}
+
+static void hpet_save(QEMUFile *f, void *opaque)
+{
+HPETState *s = opaque;
+int i;
+qemu_put_be64s(f, &s->config);
+qemu_put_be64s(f, &s->isr);
+/* save current counter value */
+s->hpet_counter = hpet_get_ticks(s);
+qemu_put_be64s(f, &s->hpet_counter);
+
+for(i = 0; i < HPET_NUM_TIMERS; i++) {
+qemu_put_8s(f, &s->timer[i].tn);
+qemu_put_be64s(f, &s->timer[i].config);
+qemu_put_be64s(f, &s->timer[i].cmp);
+qemu_put_be64s(f, &s->timer[i].fsb);
+qemu_put_be64s(f, &s->timer[i].period);
+if (s->timer[i].qemu_timer) {
+qemu_put_timer(f, s->timer[i].qemu_timer);
+}



Would qemu_timer ever be NULL?


You're right... the answer is no. I'll fix that.


+
+
+diff = hpet_calculate_diff(t, cur_tick);
+qemu_mod_timer(t->qemu_timer, qemu_get_clock(vm_clock)
++ (int64_t)ticks_to_ns(diff));



May want to convert ticks_to_ns to take and return an int64_t.  The
explicit casting could introduce very subtle bugs.

It seems better this way to me, since muldiv64 in ticks_to_ns takes  
uint64_t.
The likelihood of diff being big enough to create a problem seems  
small enough. Am I

missing something?

+case HPET_COUNTER:
+if (hpet_enabled(s))
+cur_tick = hpet_get_ticks(s);



Any reason for hpet_get_ticks(s) to not have this check integrated  
into it?
When the hpet is being disabled, we need to get the actual count,  
even though the
hpet_enabled check would return false. So if I made this change it  
would introduce an
ordering issue in the disable code (i.e., get the ticks before  
setting the hpet to

disabled)



+
+/* XXX this is a dirty hack for HPET support w/o LPC
+   Actually this is a config descriptor for the RCBA */



What's the dirty hack?
This comment is left over from Alexander Graf's code. I'm not sure  
why it is in this location and I'll remove it. But
in comments on the first version of hpet code I produced, Alexander  
said, regarding the fixed assignment of HPET_BASE:


"This is a dirty hack that I did to make Mac OS X happy. Actually  
the HPET base address gets specified in the RCBA on the
LPC and is configured by the BIOS to point to a valid address, with  
0xfed00000 being the default (IIRC if you write 0 to

the fields you end up with that address)."


Basically IIRC on the ICH-7 the HPET base address is configured  
indirectly by writing an address to the RCBA, which is mmio based  
space configured in the LPC pci device config space.
Since we don't have an LPC device, but a PIIX ISA bridge, there was no  
space to configure this on. That's why I faked and hardcoded some  
parts here, as the OS should read the acpi tables to get the address  
anyways.


Please double-check that information, as I don't have the specs  
with me atm.


Alex




But in other areas of qemu code I see base addresses being hardcoded  
and am not sure anything different needs to be done

here. Comments?





Regards,

Anthony Liguori


--
Elizabeth Kon (Beth)
IBM Linux Technology Center
Open Hypervisor Team
email: [EMAIL PROTECTED]


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] Re: [PATCH 3/3] Add KVM support to QEMU

2008-10-28 Thread Anthony Liguori

Glauber Costa wrote:

On Tue, Oct 28, 2008 at 7:51 PM, Anthony Liguori <[EMAIL PROTECTED]> wrote:
  


This is part of the reason for this exercise.  I'd rather introduce KVM
support first and then look at abstracting things, than vice versa.  A
number of the hooks in the current QEMUAccel tree are there for the wrong
reason (to support the out-of-tree IO thread, for instance).

If you just introduce something with various hooks and say, these are hooks
we'll need, it's not possible to really evaluate whether the hooks are
needed because nothing in the tree makes use of them.



We talked extensively on monday about it, and I'm in agreement with it.
  


Something I was thinking about this morning, and I think the first place 
where we'll definitely need a hook, is how to deal with 
kvm_load_registers().  I think there's overlap between KVM and the IO 
thread here.


There are two reasons (I can think of) that most of the device model 
code can't run in conjunction with TCG.  The first is that TCG may 
modify CPUState in a non-atomic way.  The device model may need to 
access CPUState although there are very few places that it does.  The 
other reason is accessing guest memory.  TCG does not preserve atomicity 
when a guest accesses device memory.  There are probably only a few 
places in the device model (like virtio) that depend on atomicity.


If we implemented an API that implemented a lock/unlock for CPUState and 
for portions of memory, then I think this could be used both as a hook 
for kvm_load_registers and as a way to introduce an IO thread with TCG.


The CPUState lock/unlock is pretty straightforward.  For the memory 
implementation to be efficient, I think you would have to acquire the 
lock when TCG brings a physical address into the TLB (preferably, at a 
page granularity), or whenever someone tries to access memory (via 
cpu_physical_memory_rw).  I think in the vast majority of the cases, 
there wouldn't be any contention and TCG could run alongside 
the IO thread.


Another place "hook" is updating a slot's dirty bitmap.  Right now, with 
my patchset we don't have live migration or the VGA RAM optimization.  
There's nothing about the VGA RAM optimization that wouldn't work for 
QEMU.  I'm not sure that it really is an optimization in the context of 
TCG, but I certainly don't think it's any worse.  The only thing you 
really need is to query the KVM dirty bitmap when it comes time to 
enable start over querying the VGA dirty bits.


The same is needed for live migration, so I think what we really need is 
to change the memory dirty bit tracking API to have a concept of refresh 
that we can use to hook for KVM.


FWIW, I included a TODO in my patch if people are interested in 
tackling any of these things.


Regards,

Anthony Liguori

Regards,

Anthony Liguori


Regards,

Anthony Liguori



surprised,
 Gerd
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

  








  


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] regression: vmalloc easily fail.

2008-10-28 Thread Nick Piggin
On Tue, Oct 28, 2008 at 08:55:13PM -0200, Glauber Costa wrote:
> Commit db64fe02258f1507e13fe5212a989922323685ce broke
> KVM (the symptom) for me. The cause is that vmalloc
> allocations fail, despite of the fact that /proc/meminfo
> shows plenty of vmalloc space available.
> 
> After some investigation, it seems to me that the current
> way to compute the next addr in the rb-tree transversal
> leaves a spare page between each allocation. After a few
> allocations, regardless of their size, we run out of vmalloc
> space.

Right... that was to add a guard page like the old vmalloc allocator.
vmallocs still add their extra page too, so most of them will have
a 2 page guard area, but I didn't think this would hurt significantly.

I'm not against the patch, but I wonder exactly what is filling it up
and how? (can you look at the vmalloc proc function to find out?)

> 
> Signed-off-by: Glauber Costa <[EMAIL PROTECTED]>
> Cc: Jeremy Fitzhardinge <[EMAIL PROTECTED]>
> Cc: Krzysztof Helt <[EMAIL PROTECTED]>
> ---
>  mm/vmalloc.c |2 +-
>  1 files changed, 1 insertions(+), 1 deletions(-)
> 
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index 0365369..a33b0d1 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -363,7 +363,7 @@ retry:
>   }
>  
>   while (addr + size >= first->va_start && addr + size <= vend) {
> - addr = ALIGN(first->va_end + PAGE_SIZE, align);
> + addr = ALIGN(first->va_end, align);
>  
>   n = rb_next(&first->rb_node);
>   if (n)
> -- 
> 1.5.6.5
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] Re: [PATCH 3/3] Add KVM support to QEMU

2008-10-28 Thread Glauber Costa
On Tue, Oct 28, 2008 at 7:51 PM, Anthony Liguori <[EMAIL PROTECTED]> wrote:
> Gerd Hoffmann wrote:
>>
>> Anthony Liguori wrote:
>>
>>>
>>> This patch only implements the bare minimum support to get a guest
>>> booting.  It
>>> has very little impact the rest of QEMU and attempts to integrate nicely
>>> with
>>> the rest of QEMU.
>>>
>>
>> Huh?  That isn't based on the qemu-accel patches ...
>>
>
> This is part of the reason for this exercise.  I'd rather introduce KVM
> support first and then look at abstracting things, than vice versa.  A
> number of the hooks in the current QEMUAccel tree are there for the wrong
> reason (to support the out-of-tree IO thread, for instance).
>
> If you just introduce something with various hooks and say, these are hooks
> we'll need, it's not possible to really evaluate whether the hooks are
> needed because nothing in the tree makes use of them.

We talked extensively on monday about it, and I'm in agreement with it.

>
> Regards,
>
> Anthony Liguori
>
>> surprised,
>>  Gerd
>> --
>> To unsubscribe from this list: send the line "unsubscribe kvm" in
>> the body of a message to [EMAIL PROTECTED]
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>
>
>
>
>



-- 
Glauber  Costa.
"Free as in Freedom"
http://glommer.net

"The less confident you are, the more serious you have to act."
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: kvm + kqemu enabled at the same time

2008-10-28 Thread Anthony Liguori

Martin Kejík wrote:

Hello,
I've compiled the KVM enabled QEMU with support for both KVM and KQEMU. Both 
modules loaded and
QEMU running saying "kvm: enabled" and "kqemu: enabled for user code".

How does this work?? What does QEMU really do in this situation when we look 
closer to CPU??
  


You're using both accelerators at the same time and getting 2x 
acceleration.  It will actually go faster than native now :-)


Seriously, it's working based on sheer luck.  If you look at the 
cpu_exec() loop (which is the core execution loop in QEMU), you'll see:



#ifdef USE_KQEMU
if (kqemu_is_ok(env) && env->interrupt_request == 0) {
int ret;
env->eflags = env->eflags | 
cc_table[CC_OP].compute_all() | (DF & DF_MASK);

ret = kqemu_cpu_exec(env);
/* put eflags in CPU temporary format */
CC_SRC = env->eflags & (CC_O | CC_S | CC_Z | CC_A | 
CC_P | CC_C);

DF = 1 - (2 * ((env->eflags >> 10) & 1));
CC_OP = CC_OP_EFLAGS;
env->eflags &= ~(DF_MASK | CC_O | CC_S | CC_Z | CC_A | 
CC_P | CC_C);

if (ret == 1) {
/* exception */
longjmp(env->jmp_env, 1);
} else if (ret == 2) {
/* softmmu execution needed */
} else {
if (env->interrupt_request != 0) {
/* hardware interrupt will be executed just 
after */

} else {
/* otherwise, we restart */
longjmp(env->jmp_env, 1);
}
}
}
#endif

if (kvm_enabled()) {
kvm_cpu_exec(env);
longjmp(env->jmp_env, 1);
}


What's letting this work is kqemu_is_ok(env).  This check looks like this:


static inline int kqemu_is_ok(CPUState *env)
{
return(env->kqemu_enabled &&
   (env->cr[0] & CR0_PE_MASK) &&
   !(env->hflags & HF_INHIBIT_IRQ_MASK) &&
   (env->eflags & IF_MASK) &&
   !(env->eflags & VM_MASK) &&
   (env->kqemu_enabled == 2 ||
((env->hflags & HF_CPL_MASK) == 3 &&
 (env->eflags & IOPL_MASK) != IOPL_MASK)));
}


This is checking whether you're in protected mode, not in an interrupt 
window, interrupts are enabled, you aren't in vm86 mode, and if not 
using kernel-kqemu, CPL == 3 and IOPL < CPL.


As an optimization, KVM does not synchronize CPUState very often which 
means that env is very stale.  This means that it's likely that CPUState 
is in the initial CPU state (in real mode).


As long as kqemu never gets to execute, you should be ok.  If you force 
KVM to sync CPUState, you'll see kqemu actually execute and bad things 
will happen.


But the more important question is, why in the world are you doing this 
in the first place?


Regards,

Anthony Liguori


thanx
  


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] regression: vmalloc easily fail.

2008-10-28 Thread Roland Dreier
 > I suspect it's a case of off-by-one... ALIGN() might round down, and
 > the "+ (PAGE_SIZE-1)" was there to make it round up.
 > Except for that missing -1 ...

ALIGN() has always rounded up, at least back to 2.4.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


kvm + kqemu enabled at the same time

2008-10-28 Thread Martin Kejík
Hello,
I've compiled the KVM enabled QEMU with support for both KVM and KQEMU. Both 
modules loaded and
QEMU running saying "kvm: enabled" and "kqemu: enabled for user code".

How does this work?? What does QEMU really do in this situation when we look 
closer to CPU??

thanx
-- 
Martin (Kejda) Kejík
kejda(at)centrum(dot)cz


signature.asc
Description: This is a digitally signed message part.


Re: [PATCH 3/3] Add KVM support to QEMU

2008-10-28 Thread Anthony Liguori

Gerd Hoffmann wrote:

Anthony Liguori wrote:
  

This patch only implements the bare minimum support to get a guest booting.  It
has very little impact the rest of QEMU and attempts to integrate nicely with
the rest of QEMU.



Huh?  That isn't based on the qemu-accel patches ...
  


This is part of the reason for this exercise.  I'd rather introduce KVM 
support first and then look at abstracting things, than vice versa.  A 
number of the hooks in the current QEMUAccel tree are there for the 
wrong reason (to support the out-of-tree IO thread, for instance).


If you just introduce something with various hooks and say, these are 
hooks we'll need, it's not possible to really evaluate whether the hooks 
are needed because nothing in the tree makes use of them.


Regards,

Anthony Liguori


surprised,
  Gerd
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
  


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] regression: vmalloc easily fail.

2008-10-28 Thread Arjan van de Ven
On Tue, 28 Oct 2008 14:22:16 -0700
Roland Dreier <[EMAIL PROTECTED]> wrote:

>  > I'm guessing that the missing comment explains that this is
>  > intentional, to trap buffer overflows?
> 
> Actually, speaking of comments, it's interesting that
> __get_vm_area_node() -- which is called from vmalloc() -- does:
> 
>   /*
>* We always allocate a guard page.
>*/
>   size += PAGE_SIZE;
> 
>   va = alloc_vmap_area(size, align, start, end, node, gfp_mask);
> 
> and alloc_vmap_area() adds another PAGE_SIZE, as the original email
> pointed out:
> 
>   while (addr + size >= first->va_start && addr + size
> <= vend) { addr = ALIGN(first->va_end + PAGE_SIZE, align);
> 
> I wonder if the double padding is causing a problem when things get
> too fragmented?

I suspect it's a case of off-by-one... ALIGN() might round down, and
the "+ (PAGE_SIZE-1)" was there to make it round up.
Except for that missing -1 ...

-- 
Arjan van de VenIntel Open Source Technology Centre
For development, discussion and tips for power savings, 
visit http://www.lesswatts.org
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 3/3] Add KVM support to QEMU

2008-10-28 Thread Gerd Hoffmann
Anthony Liguori wrote:
> This patch only implements the bare minimum support to get a guest booting.  
> It
> has very little impact the rest of QEMU and attempts to integrate nicely with
> the rest of QEMU.

Huh?  That isn't based on the qemu-accel patches ...

surprised,
  Gerd
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] regression: vmalloc easily fail.

2008-10-28 Thread Matias Zabaljauregui
hello,

>> I'm guessing that the missing comment explains that this is intentional,
>> to trap buffer overflows?

yes, IIRC the pages between vmalloc areas are there for safety reasons.
(like the interval inserted before the first area, defined by VMALLOC_OFFSET)

regards
Matias
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] regression: vmalloc easily fail.

2008-10-28 Thread Roland Dreier
 > I'm guessing that the missing comment explains that this is
 > intentional, to trap buffer overflows?

Actually, speaking of comments, it's interesting that
__get_vm_area_node() -- which is called from vmalloc() -- does:

/*
 * We always allocate a guard page.
 */
size += PAGE_SIZE;

va = alloc_vmap_area(size, align, start, end, node, gfp_mask);

and alloc_vmap_area() adds another PAGE_SIZE, as the original email
pointed out:

while (addr + size >= first->va_start && addr + size <= vend) {
addr = ALIGN(first->va_end + PAGE_SIZE, align);

I wonder if the double padding is causing a problem when things get too
fragmented?

 - R.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] [PATCH 3/3] Add KVM support to QEMU

2008-10-28 Thread Anthony Liguori

Hollis Blanchard wrote:

Just a quick skim...

On Tue, Oct 28, 2008 at 3:13 PM, Anthony Liguori <[EMAIL PROTECTED]> wrote:
  

+int kvm_cpu_exec(CPUState *env)
+{
+struct kvm_run *run = env->kvm_run;
+int ret;
+
+dprintf("kvm_cpu_exec()\n");
+
+do {
+kvm_arch_pre_run(env, run);
+
+if ((env->interrupt_request & CPU_INTERRUPT_EXIT)) {
+dprintf("interrupt exit requested\n");
+ret = 0;
+break;
+}
+
+dprintf("setting tpr\n");
+run->cr8 = cpu_get_apic_tpr(env);



This belongs in the arch_pre_run hook above.
  


Good catch, I've updated the patch.


How did you decide which exit handlers should go into
architecture-specific code? Looking at just the KVM architecture set:
  


Based on whether the implementation required target-specific code.


IO: x86 and ia64, not PowerPC or s390
  


cpu_{in,out}[bwl] are defined in vl.c and are available for all 
architectures.  They are no-ops on most architectures because they are 
never used.



MMIO: everybody except s390
  


cpu_physical_memory_rw() is defined by everyone.


DCRs: PowerPC only
  


This will have to be an architecture specific handler.


IRQ window: not sure
  


It's a no-op implementation.  I would think that this would be needed on 
PPC.  If you want to inject an interrupt, but the guest is unable to 
handle an interrupt, you need to exit to userspace when the guest 
re-enables interrupts.  Otherwise, you may never return to userspace for 
the interrupt to be injected.


How do you handle that now?  Does PPC have something that makes this 
unnecessary?


Regards,

Anthony Liguori


-Hollis
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
  


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] regression: vmalloc easily fail.

2008-10-28 Thread Glauber Costa
On Tue, Oct 28, 2008 at 11:03:22PM +0200, Avi Kivity wrote:
> Glauber Costa wrote:
>> Commit db64fe02258f1507e13fe5212a989922323685ce broke
>> KVM (the symptom) for me. The cause is that vmalloc
>> allocations fail, despite of the fact that /proc/meminfo
>> shows plenty of vmalloc space available.
>>
>> After some investigation, it seems to me that the current
>> way to compute the next addr in the rb-tree transversal
>> leaves a spare page between each allocation. After a few
>> allocations, regardless of their size, we run out of vmalloc
>> space.
>>
>>  while (addr + size >= first->va_start && addr + size <= vend) {
>> -addr = ALIGN(first->va_end + PAGE_SIZE, align);
>> +addr = ALIGN(first->va_end, align);
>>  n = rb_next(&first->rb_node);
>>  if (n)
>>   
>
> I'm guessing that the missing comment explains that this is intentional,  
> to trap buffer overflows?
>
> (okay that was a cheap shot.  I don't comment nearly enough either)
>
> Even if you leave a page between allocations, I don't see how you can  
> fail a one page allocation, unless you've allocated at least N/2 pages  
> (where N is the size of the vmalloc space in pages).

I'm hoping Nick will comment on it. I might well be wrong,
but it nicely fixes the problem for me, and actually, you don't need 
"at least N/2 pages". The size of the allocations hardly matters, just
the number of allocations we did. Since kvm does some small
vmalloc allocations, that may be the reason we are triggering it.


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] regression: vmalloc easily fail.

2008-10-28 Thread Avi Kivity

Glauber Costa wrote:

Commit db64fe02258f1507e13fe5212a989922323685ce broke
KVM (the symptom) for me. The cause is that vmalloc
allocations fail, despite of the fact that /proc/meminfo
shows plenty of vmalloc space available.

After some investigation, it seems to me that the current
way to compute the next addr in the rb-tree transversal
leaves a spare page between each allocation. After a few
allocations, regardless of their size, we run out of vmalloc
space.

 
 		while (addr + size >= first->va_start && addr + size <= vend) {

-   addr = ALIGN(first->va_end + PAGE_SIZE, align);
+   addr = ALIGN(first->va_end, align);
 
 			n = rb_next(&first->rb_node);

if (n)
  


I'm guessing that the missing comment explains that this is intentional, 
to trap buffer overflows?


(okay that was a cheap shot.  I don't comment nearly enough either)

Even if you leave a page between allocations, I don't see how you can 
fail a one page allocation, unless you've allocated at least N/2 pages 
(where N is the size of the vmalloc space in pages).


--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] regression: vmalloc easily fail.

2008-10-28 Thread Glauber Costa
Commit db64fe02258f1507e13fe5212a989922323685ce broke
KVM (the symptom) for me. The cause is that vmalloc
allocations fail, despite the fact that /proc/meminfo
shows plenty of vmalloc space available.

After some investigation, it seems to me that the current
way to compute the next addr in the rb-tree traversal
leaves a spare page between each allocation. After a few
allocations, regardless of their size, we run out of vmalloc
space.

Signed-off-by: Glauber Costa <[EMAIL PROTECTED]>
Cc: Jeremy Fitzhardinge <[EMAIL PROTECTED]>
Cc: Krzysztof Helt <[EMAIL PROTECTED]>
---
 mm/vmalloc.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 0365369..a33b0d1 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -363,7 +363,7 @@ retry:
}
 
while (addr + size >= first->va_start && addr + size <= vend) {
-   addr = ALIGN(first->va_end + PAGE_SIZE, align);
+   addr = ALIGN(first->va_end, align);
 
n = rb_next(&first->rb_node);
if (n)
-- 
1.5.6.5

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] [PATCH 3/3] Add KVM support to QEMU

2008-10-28 Thread Hollis Blanchard
Just a quick skim...

On Tue, Oct 28, 2008 at 3:13 PM, Anthony Liguori <[EMAIL PROTECTED]> wrote:
> +int kvm_cpu_exec(CPUState *env)
> +{
> +struct kvm_run *run = env->kvm_run;
> +int ret;
> +
> +dprintf("kvm_cpu_exec()\n");
> +
> +do {
> +kvm_arch_pre_run(env, run);
> +
> +if ((env->interrupt_request & CPU_INTERRUPT_EXIT)) {
> +dprintf("interrupt exit requested\n");
> +ret = 0;
> +break;
> +}
> +
> +dprintf("setting tpr\n");
> +run->cr8 = cpu_get_apic_tpr(env);

This belongs in the arch_pre_run hook above.

> +ret = kvm_vcpu_ioctl(env, KVM_RUN, 0);
> +kvm_arch_post_run(env, run);
> +
> +if (ret == -EINTR || ret == -EAGAIN) {
> +dprintf("io window exit\n");
> +ret = 0;
> +break;
> +}
> +
> +if (ret < 0) {
> +dprintf("kvm run failed %s\n", strerror(-ret));
> +abort();
> +}
> +
> +ret = 0; /* exit loop */
> +switch (run->exit_reason) {
> +case KVM_EXIT_IO:
> +dprintf("handle_io\n");
> +ret = kvm_handle_io(env, run->io.port,
> +(uint8_t *)run + run->io.data_offset,
> +run->io.direction,
> +run->io.size,
> +run->io.count);
> +break;
> +case KVM_EXIT_MMIO:
> +dprintf("handle_mmio\n");
> +cpu_physical_memory_rw(run->mmio.phys_addr,
> +   run->mmio.data,
> +   run->mmio.len,
> +   run->mmio.is_write);
> +ret = 1;
> +break;
> +case KVM_EXIT_IRQ_WINDOW_OPEN:
> +dprintf("irq_window_open\n");
> +break;
> +case KVM_EXIT_SHUTDOWN:
> +dprintf("shutdown\n");
> +qemu_system_reset_request();
> +ret = 1;
> +break;
> +case KVM_EXIT_UNKNOWN:
> +dprintf("kvm_exit_unknown\n");
> +break;
> +case KVM_EXIT_FAIL_ENTRY:
> +dprintf("kvm_exit_fail_entry\n");
> +break;
> +case KVM_EXIT_EXCEPTION:
> +dprintf("kvm_exit_exception\n");
> +break;
> +case KVM_EXIT_DEBUG:
> +dprintf("kvm_exit_debug\n");
> +break;
> +default:
> +dprintf("kvm_arch_handle_exit\n");
> +ret = kvm_arch_handle_exit(env, run);
> +break;
> +}
> +} while (ret > 0);
> +
> +return ret;
> +}

How did you decide which exit handlers should go into
architecture-specific code? Looking at just the KVM architecture set:
IO: x86 and ia64, not PowerPC or s390
MMIO: everybody except s390
DCRs: PowerPC only
IRQ window: not sure

-Hollis
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


KVM: MMU: increase per-vcpu rmap cache alloc size

2008-10-28 Thread Marcelo Tosatti

The page fault path can use two rmap_desc structures, if:

- walk_addr's dirty pte update allocates one rmap_desc.
- mmu_lock is dropped, sptes are zapped resulting in rmap_desc being
freed.
- fetch->mmu_set_spte allocates another rmap_desc.

Increase to 4 for safety.

Signed-off-by: Marcelo Tosatti <[EMAIL PROTECTED]>


diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 79cb4a9..2477a24 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -316,7 +316,7 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
if (r)
goto out;
r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache,
-  rmap_desc_cache, 1);
+  rmap_desc_cache, 4);
if (r)
goto out;
r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 0/2] kvm: disable virtualization on kdump

2008-10-28 Thread Eric W. Biederman
Eduardo Habkost <[EMAIL PROTECTED]> writes:

> I am still wondering if a simple function pointer (instead of a full
> notifier interface) would be good enough. It looks like a reasonable
> tradeoff.

Oh sorry.   As long as we do the whole rcu protected thing so it is safe
to call the function without taking locks it should work.  I'm not
thrilled about a function pointer but it should work.

> I think I will get flamed if I try to pull to the core a bunch of code
> that always lived in the KVM module.  8)

Why is KVM modular anyway?  That seems like some pretty core cpu 
functionality...

> And even if we pull those functions to the core, we will still have
> a function pointer on the new code anyway, because we would need to
> support vmx and svm.

Depending.  It doesn't sound like svm has the problem where init doesn't
work so svm really doesn't need to do this.

Eric
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 3/3] Add KVM support to QEMU

2008-10-28 Thread Anthony Liguori
This patch adds very basic KVM support.  KVM is a kernel module for Linux that
allows userspace programs to make use of hardware virtualization support.  It
currently supports x86 hardware virtualization using Intel VT-x or AMD-V.  It
also supports IA64 VT-i, PPC 440, and S390.

This patch only implements the bare minimum support to get a guest booting.  It
has very little impact on the rest of QEMU and attempts to integrate nicely with
the rest of QEMU.

Even though this implementation is basic, it is significantly faster than TCG.
Booting and shutting down a Linux guest:

w/TCG:  1:32.36 elapsed  84% CPU

w/KVM:  0:31.14 elapsed  59% CPU

Right now, KVM is disabled by default and must be explicitly enabled with
 -enable-kvm.  We can enable it by default later when we have had better
testing.

Signed-off-by: Anthony Liguori <[EMAIL PROTECTED]>

diff --git a/KVM_TODO b/KVM_TODO
new file mode 100644
index 000..9529049
--- /dev/null
+++ b/KVM_TODO
@@ -0,0 +1,9 @@
+1) Add hooks for load/save of register state
+  o Fixes gdbstub, save/restore, and vmport
+2) Add VGA optimization
+3) Add IO thread
+4) Add guest SMP support
+5) Add TPR optimization
+6) Add support for in-kernel APIC
+7) Add support for in-kernel PIT
+8) Merge in additional changes in kvm-userspace tree
diff --git a/Makefile.target b/Makefile.target
index e2edf9d..903d66d 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -183,6 +183,9 @@ CFLAGS+=-I/opt/SUNWspro/prod/include/cc
 endif
 endif
 
+kvm.o: CFLAGS+=$(KVM_CFLAGS)
+kvm-all.o: CFLAGS+=$(KVM_CFLAGS)
+
 all: $(PROGS)
 
 #
@@ -475,6 +478,9 @@ ifndef CONFIG_USER_ONLY
 
 OBJS=vl.o osdep.o monitor.o pci.o loader.o isa_mmio.o machine.o net-checksum.o
 OBJS+=fw_cfg.o aio.o buffered_file.o migration.o migration-tcp.o
+ifdef CONFIG_KVM
+OBJS+=kvm.o kvm-all.o
+endif
 ifdef CONFIG_WIN32
 OBJS+=block-raw-win32.o
 else
diff --git a/configure b/configure
index aefa69b..7aed99d 100755
--- a/configure
+++ b/configure
@@ -113,6 +113,7 @@ aio="yes"
 nptl="yes"
 mixemu="no"
 bluez="yes"
+kvm="yes"
 
 # OS specific
 targetos=`uname -s`
@@ -300,6 +301,8 @@ for opt do
   ;;
   --disable-bluez) bluez="no"
   ;;
+  --disable-kvm) kvm="no"
+  ;;
   --enable-profiler) profiler="yes"
   ;;
   --enable-cocoa)
@@ -439,6 +442,7 @@ echo "  --disable-brlapi disable BrlAPI"
 echo "  --disable-vnc-tlsdisable TLS encryption for VNC server"
 echo "  --disable-curses disable curses output"
 echo "  --disable-bluez  disable bluez stack connectivity"
+echo "  --disable-kvmdisable KVM acceleration support"
 echo "  --disable-nptl   disable usermode NPTL support"
 echo "  --enable-system  enable all system emulation targets"
 echo "  --disable-system disable all system emulation targets"
@@ -933,6 +937,30 @@ EOF
 fi
 
 ##
+# kvm probe
+if test "$kvm" = "yes" ; then
+cat > $TMPC <<EOF
+#include <linux/kvm.h>
+#if !defined(KVM_API_VERSION) || \
+KVM_API_VERSION < 12 || \
+KVM_API_VERSION > 12 || \
+!defined(KVM_CAP_USER_MEMORY) || \
+!defined(KVM_CAP_SET_TSS_ADDR)
+#error Invalid KVM version
+#endif
+int main(void) { return 0; }
+EOF
+  # FIXME make this configurable
+  kvm_cflags=-I/lib/modules/`uname -r`/build/include
+  if $cc $ARCH_CFLAGS -o $TMPE ${OS_CFLAGS} $kvm_cflags $TMPC \
+  2>/dev/null ; then
+:
+  else
+kvm="no"
+  fi
+fi
+
+##
 # AIO probe
 if test "$aio" = "yes" ; then
   aio=no
@@ -1018,6 +1046,7 @@ echo "uname -r  $uname_release"
 echo "NPTL support  $nptl"
 echo "vde support   $vde"
 echo "AIO support   $aio"
+echo "KVM support   $kvm"
 
 if test $sdl_too_old = "yes"; then
 echo "-> Your SDL version is too old - please upgrade to have SDL support"
@@ -1388,6 +1417,15 @@ interp_prefix1=`echo "$interp_prefix" | sed 
"s/%M/$target_cpu/g"`
 echo "#define CONFIG_QEMU_PREFIX \"$interp_prefix1\"" >> $config_h
 gdb_xml_files=""
 
+# FIXME allow i386 to build on x86_64 and vice versa
+if test "$kvm" = "yes" -a "$target_cpu" != "$cpu" ; then
+  kvm="no"
+fi
+# Disable KVM for linux-user
+if test "$kvm" = "yes" -a "$target_softmmu" = "no" ; then
+  kvm="no"
+fi
+
 case "$target_cpu" in
   i386)
 echo "TARGET_ARCH=i386" >> $config_mak
@@ -1397,6 +1435,11 @@ case "$target_cpu" in
 then
   echo "#define USE_KQEMU 1" >> $config_h
 fi
+if test "$kvm" = "yes" ; then
+  echo "CONFIG_KVM=yes" >> $config_mak
+  echo "KVM_CFLAGS=$kvm_cflags" >> $config_mak
+  echo "#define CONFIG_KVM" >> $config_h
+fi
 gcc3minver=`$cc --version 2> /dev/null| fgrep "(GCC) 3." | awk '{ print $3 
}' | cut -f2 -d.`
 if test -n "$gcc3minver" && test $gcc3minver -gt 3
 then
@@ -1414,6 +1457,11 @@ case "$target_cpu" in
 then
   echo "#define USE_KQEMU 1" >> $config_h
 fi
+if test "$kvm" = "yes" ; then
+  echo "CONFIG_KVM=yes" >> $config_mak
+  echo "KVM_CFL

[PATCH 2/3] Split CPUID from op_helper

2008-10-28 Thread Anthony Liguori
KVM needs to call CPUID from outside of the TCG code.  This patch
splits out the CPUID logic into a separate helper that both the op
helper and KVM can call.

Signed-off-by: Anthony Liguori <[EMAIL PROTECTED]>

diff --git a/target-i386/cpu.h b/target-i386/cpu.h
index b1678ef..263a477 100644
--- a/target-i386/cpu.h
+++ b/target-i386/cpu.h
@@ -730,6 +730,10 @@ void cpu_smm_update(CPUX86State *env);
 /* will be suppressed */
 void cpu_x86_update_cr0(CPUX86State *env, uint32_t new_cr0);
 
+void cpu_x86_cpuid(CPUX86State *env, uint32_t index,
+   uint32_t *eax, uint32_t *ebx,
+   uint32_t *ecx, uint32_t *edx);
+
 /* used to debug */
 #define X86_DUMP_FPU  0x0001 /* dump FPU state too */
 #define X86_DUMP_CCOP 0x0002 /* dump qemu flag cache */
diff --git a/target-i386/helper.c b/target-i386/helper.c
index c2e1a88..905ae9b 100644
--- a/target-i386/helper.c
+++ b/target-i386/helper.c
@@ -1287,3 +1287,169 @@ target_phys_addr_t cpu_get_phys_page_debug(CPUState 
*env, target_ulong addr)
 return paddr;
 }
 #endif /* !CONFIG_USER_ONLY */
+
+void cpu_x86_cpuid(CPUX86State *env, uint32_t index,
+   uint32_t *eax, uint32_t *ebx,
+   uint32_t *ecx, uint32_t *edx)
+{
+/* test if maximum index reached */
+if (index & 0x8000) {
+if (index > env->cpuid_xlevel)
+index = env->cpuid_level;
+} else {
+if (index > env->cpuid_level)
+index = env->cpuid_level;
+}
+
+switch(index) {
+case 0:
+*eax = env->cpuid_level;
+*ebx = env->cpuid_vendor1;
+*edx = env->cpuid_vendor2;
+*ecx = env->cpuid_vendor3;
+break;
+case 1:
+*eax = env->cpuid_version;
+*ebx = (env->cpuid_apic_id << 24) | 8 << 8; /* CLFLUSH size in quad 
words, Linux wants it. */
+*ecx = env->cpuid_ext_features;
+*edx = env->cpuid_features;
+break;
+case 2:
+/* cache info: needed for Pentium Pro compatibility */
+*eax = 1;
+*ebx = 0;
+*ecx = 0;
+*edx = 0x2c307d;
+break;
+case 4:
+/* cache info: needed for Core compatibility */
+switch (*ecx) {
+case 0: /* L1 dcache info */
+*eax = 0x121;
+*ebx = 0x1c0003f;
+*ecx = 0x03f;
+*edx = 0x001;
+break;
+case 1: /* L1 icache info */
+*eax = 0x122;
+*ebx = 0x1c0003f;
+*ecx = 0x03f;
+*edx = 0x001;
+break;
+case 2: /* L2 cache info */
+*eax = 0x143;
+*ebx = 0x3c0003f;
+*ecx = 0xfff;
+*edx = 0x001;
+break;
+default: /* end of info */
+*eax = 0;
+*ebx = 0;
+*ecx = 0;
+*edx = 0;
+break;
+}
+
+break;
+case 5:
+/* mwait info: needed for Core compatibility */
+*eax = 0; /* Smallest monitor-line size in bytes */
+*ebx = 0; /* Largest monitor-line size in bytes */
+*ecx = CPUID_MWAIT_EMX | CPUID_MWAIT_IBE;
+*edx = 0;
+break;
+case 6:
+/* Thermal and Power Leaf */
+*eax = 0;
+*ebx = 0;
+*ecx = 0;
+*edx = 0;
+break;
+case 9:
+/* Direct Cache Access Information Leaf */
+*eax = 0; /* Bits 0-31 in DCA_CAP MSR */
+*ebx = 0;
+*ecx = 0;
+*edx = 0;
+break;
+case 0xA:
+/* Architectural Performance Monitoring Leaf */
+*eax = 0;
+*ebx = 0;
+*ecx = 0;
+*edx = 0;
+break;
+case 0x8000:
+*eax = env->cpuid_xlevel;
+*ebx = env->cpuid_vendor1;
+*edx = env->cpuid_vendor2;
+*ecx = env->cpuid_vendor3;
+break;
+case 0x8001:
+*eax = env->cpuid_features;
+*ebx = 0;
+*ecx = env->cpuid_ext3_features;
+*edx = env->cpuid_ext2_features;
+break;
+case 0x8002:
+case 0x8003:
+case 0x8004:
+*eax = env->cpuid_model[(index - 0x8002) * 4 + 0];
+*ebx = env->cpuid_model[(index - 0x8002) * 4 + 1];
+*ecx = env->cpuid_model[(index - 0x8002) * 4 + 2];
+*edx = env->cpuid_model[(index - 0x8002) * 4 + 3];
+break;
+case 0x8005:
+/* cache info (L1 cache) */
+*eax = 0x01ff01ff;
+*ebx = 0x01ff01ff;
+*ecx = 0x40020140;
+*edx = 0x40020140;
+break;
+case 0x8006:
+/* cache info (L2 cache) */
+*eax = 0;
+*ebx = 0x42004200;
+*ecx = 0x02008140;
+*edx = 0;
+break;
+case 0x8008:
+/* virtual & phys address size in low 2 bytes. */
+/* XXX: This value must match the one used in the MMU code. */ 
+if 

[PATCH 1/3] Add additional CPU flag definitions

2008-10-28 Thread Anthony Liguori
Some x86 CPU definitions that KVM needs

Signed-off-by: Anthony Liguori <[EMAIL PROTECTED]>

diff --git a/target-i386/cpu.h b/target-i386/cpu.h
index 3c11e0f..b1678ef 100644
--- a/target-i386/cpu.h
+++ b/target-i386/cpu.h
@@ -159,9 +159,11 @@
 #define HF_MP_MASK   (1 << HF_MP_SHIFT)
 #define HF_EM_MASK   (1 << HF_EM_SHIFT)
 #define HF_TS_MASK   (1 << HF_TS_SHIFT)
+#define HF_IOPL_MASK (3 << HF_IOPL_SHIFT)
 #define HF_LMA_MASK  (1 << HF_LMA_SHIFT)
 #define HF_CS64_MASK (1 << HF_CS64_SHIFT)
 #define HF_OSFXSR_MASK   (1 << HF_OSFXSR_SHIFT)
+#define HF_VM_MASK   (1 << HF_VM_SHIFT)
 #define HF_SMM_MASK  (1 << HF_SMM_SHIFT)
 #define HF_SVME_MASK (1 << HF_SVME_SHIFT)
 #define HF_SVMI_MASK (1 << HF_SVMI_SHIFT)
@@ -178,6 +180,9 @@
 #define HF2_NMI_MASK  (1 << HF2_NMI_SHIFT)
 #define HF2_VINTR_MASK(1 << HF2_VINTR_SHIFT)
 
+#define CR0_PE_SHIFT 0
+#define CR0_MP_SHIFT 1
+
 #define CR0_PE_MASK  (1 << 0)
 #define CR0_MP_MASK  (1 << 1)
 #define CR0_EM_MASK  (1 << 2)
@@ -196,7 +201,8 @@
 #define CR4_PAE_MASK  (1 << 5)
 #define CR4_PGE_MASK  (1 << 7)
 #define CR4_PCE_MASK  (1 << 8)
-#define CR4_OSFXSR_MASK (1 << 9)
+#define CR4_OSFXSR_SHIFT 9
+#define CR4_OSFXSR_MASK (1 << CR4_OSFXSR_SHIFT)
 #define CR4_OSXMMEXCPT_MASK  (1 << 10)
 
 #define PG_PRESENT_BIT 0
@@ -229,6 +235,7 @@
 #define PG_ERROR_RSVD_MASK 0x08
 #define PG_ERROR_I_D_MASK  0x10
 
+#define MSR_IA32_TSC0x10
 #define MSR_IA32_APICBASE   0x1b
 #define MSR_IA32_APICBASE_BSP   (1<<8)
 #define MSR_IA32_APICBASE_ENABLE(1<<11)
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 0/2] kvm: disable virtualization on kdump

2008-10-28 Thread Eduardo Habkost
On Mon, Oct 27, 2008 at 10:32:43AM -0700, Eric W. Biederman wrote:
> Avi Kivity <[EMAIL PROTECTED]> writes:

> >
> > I wouldn't mind notifiers (with a nice comment explaining that you must know
> > what you're doing, though that's the case with most kernel APIs).  I'm fine 
> > with
> > either approach.
> 
> This is the 3rd request I have seen for a notifier.  This is the first
> request I have seen for code that must be executed in the kexec on
> panic path.  So history suggest to me that notifiers make it
> unreasonably easy to get code onto the kexec on panic code path.
> 
> Occasionally the kexec on panic code path is tested to see how
> well it works in strange situations like being called from
> a stack overflow etc.
> 
> The rest of the history is that previous attempts like lkcd
> had very programmer friendly interfaces, that worked fine
> in test environments giving beautiful core dumps, but when things
> broke in the field they were essentially useless.  The kdump
> approach is still not completely reliable but it does work
> well enough that people get useful crash dumps sometimes.
> 
> I feel anything that makes the kexec on panic code path harder
> to verify it will work when things are crazy broken, like
> a notifier is something we should avoid.

I am still wondering if a simple function pointer (instead of a full
notifier interface) would be good enough. It looks like a reasonable
tradeoff.

I think I will get flamed if I try to pull to the core a bunch of code
that always lived in the KVM module.  8)

And even if we pull those functions to the core, we will still have
a function pointer on the new code anyway, because we would need to
support vmx and svm.

-- 
Eduardo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: RFC: VMX: initialize TSC offset relative to vm creation time

2008-10-28 Thread David S. Ahern


Marcelo Tosatti wrote:
> On Sat, Sep 13, 2008 at 07:55:02AM +0300, Avi Kivity wrote:
>> Marcelo Tosatti wrote:
>>> VMX initializes the TSC offset for each vcpu at different times, and
>>> also reinitializes it for vcpus other than 0 on APIC SIPI message.
>>>
>>> This bug causes the TSC's to appear unsynchronized in the guest, even if
>>> the host is good.
>>>
>>> Older Linux kernels don't handle the situation very well, so
>>> gettimeofday is likely to go backwards in time:
>>>
>>> http://www.mail-archive.com/kvm@vger.kernel.org/msg02955.html
>>> http://sourceforge.net/tracker/index.php?func=detail&aid=2025534&group_id=180599&atid=893831
>>>
>>> Fix it by initializing the offset of each vcpu relative to vm creation
>>> time, and moving it from vmx_vcpu_reset to vmx_vcpu_setup, out of the
>>> APIC MP init path.
>>>
>>>
>>>   
>> This is good in principle, but we need to detect if we're on a multiple
>> board host (or a host with unsynced tscs) and do something else in that
>> case.
> 
> I think this is a separate, and difficult, problem. For instance older
> Linux guests that correct the TSC across CPU's are broken at the moment
> in the unsynced TSC case.
> 
> That is, the fact that KVM does not handle unsynced TSC's on the host is
> not an argument against this patch which clearly fixes a bug.
> 
> Take commit 019960ae9933161c2809fa4ee608ba30d9639fd2 for example.
> 

Has anything changed "recently" with the TSC code? Recently here being
the past 2 months since you first crafted the patch. I ask because in
the past few runs based on kvm.git trees (e.g., as recently as a pull on
10/26), this tsc offset patch no longer fixes the problem.

The following one does fix the problem with kvm.git pulled on 10/26/08:

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 64e2439..d5da717 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -860,7 +860,7 @@ static void guest_write_tsc(u64 guest_tsc)
u64 host_tsc;

rdtscll(host_tsc);
-   vmcs_write64(TSC_OFFSET, guest_tsc - host_tsc);
+   vmcs_write64(TSC_OFFSET, 0);
 }

 /*

This is the vmx counterpart (or at least to my understanding) to a
suggestion Ben had for the svm code.

david
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Add VMRUN handler v5

2008-10-28 Thread Mike Day
On 20/10/08 19:04 +0200, Alexander Graf wrote:

> +static int vmrun_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
> +{
> + nsvm_printk("VMrun\n");
> + if (nested_svm_check_permissions(svm))
> + return 1;
> +
> + svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
> + skip_emulated_instruction(&svm->vcpu);
> +
> + if (nested_svm_do(svm, svm->vmcb->save.rax, 0,
> +   NULL, nested_svm_vmrun))
> + return 1;
> +
> + if (nested_svm_do(svm, svm->vmcb->control.msrpm_base_pa, 0,
> +   NULL, nested_svm_vmrun_msrpm))
> + return 1;
> +
> + return 1;
> +}

A nitpick, but you could remove the last if() statement and one of 
the last two return statements. Unless you foresee more calls to
nested_svm_do() in here.

Mike

-- 
Mike Day
http://www.ncultra.org
AIM: ncmikeday |  Yahoo IM: ultra.runner
PGP key: http://www.ncultra.org/ncmike/pubkey.asc
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 5/6] device assignment: support for assigning PCI devices to guests

2008-10-28 Thread Mark McLoughlin
On Tue, 2008-10-28 at 12:06 +0200, [EMAIL PROTECTED] wrote:
...
> +static int get_real_device(AssignedDevice *pci_dev, uint8_t r_bus,
> +   uint8_t r_dev, uint8_t r_func)
> +{
> +char dir[128], name[128];
> +int fd, r = 0;
> +FILE *f;
> +unsigned long long start, end, size, flags;
> +PCIRegion *rp;
> +PCIDevRegions *dev = &pci_dev->real_device;
> +
> +dev->region_number = 0;
> +
> +snprintf(dir, sizeof(dir), "/sys/bus/pci/devices/:%02x:%02x.%x/",
> +  r_bus, r_dev, r_func);
> +
> +snprintf(name, sizeof(name), "%sconfig", dir);
> +
> +fd = open(name, O_RDWR);
> +if (fd == -1) {
> +fprintf(stderr, "%s: %s: %m\n", __func__, name);
> +return 1;
> +}
> +dev->config_fd = fd;
> +again:
> +r = read(fd, pci_dev->dev.config, sizeof(pci_dev->dev.config));
> +if (r < 0) {
> +if (errno == EINTR || errno == EAGAIN)
> +goto again;
> +fprintf(stderr, "%s: read failed, errno = %d\n", __func__, errno);
> +}
> +
> +snprintf(name, sizeof(name), "%sresource", dir);
> +
> +f = fopen(name, "r");
> +if (f == NULL) {
> +fprintf(stderr, "%s: %s: %m\n", __func__, name);
> +return 1;
> +}
> +r = -1;
> +while (fscanf(f, "%lli %lli %lli\n", &start, &end, &flags) == 3) {
> +r++;
> +rp = dev->regions + r;

Could, in theory, overflow dev->regions here. Suggest:

+for (r = 0; r < MAX_IO_REGIONS; r++) {
+if (fscanf(f, "%lli %lli %lli\n", &start, &end, &flags) != 3)
+break;

> +rp->valid = 0;
> +size = end - start + 1;
> +flags &= IORESOURCE_IO | IORESOURCE_MEM | IORESOURCE_PREFETCH;
> +if (size == 0 || (flags & ~IORESOURCE_PREFETCH) == 0)
> +continue;
> +if (flags & IORESOURCE_MEM) {
> +flags &= ~IORESOURCE_IO;
> + snprintf(name, sizeof(name), "%sresource%d", dir, r);
> +fd = open(name, O_RDWR);
> +if (fd == -1)
> +continue;   /* probably ROM */
> +rp->resource_fd = fd;
> +} else
> +flags &= ~IORESOURCE_PREFETCH;
> +
> +rp->type = flags;
> +rp->valid = 1;
> +rp->base_addr = start;
> +rp->size = size;
> +DEBUG("region %d size %d start 0x%x type %d resource_fd %d\n",
> +  r, rp->size, start, rp->type, rp->resource_fd);
> +}
> +fclose(f);
> +
> +dev->region_number = r;
> +return 0;
> +}
> +
> +static int disable_iommu;

Why is this global?

The flag is set per-device on the command-line and only affects whether
we pass KVM_DEV_ASSIGN_ENABLE_IOMMU to kvm_assign_pci_device()

> +int nr_assigned_devices;
> +static LIST_HEAD(, AssignedDevInfo) adev_head;
> +
> +static uint32_t calc_assigned_dev_id(uint8_t bus, uint8_t devfn)
> +{
> +return (uint32_t)bus << 8 | (uint32_t)devfn;
> +}
> +
> +static AssignedDevice *register_real_device(PCIBus *e_bus,
> +const char *e_dev_name,
> +int e_devfn, uint8_t r_bus,
> +uint8_t r_dev, uint8_t r_func)
> +{
> +int r;
> +AssignedDevice *pci_dev;
> +uint8_t e_device, e_intx;
> +
> +DEBUG("Registering real physical device %s (devfn=0x%x)\n",
> +  e_dev_name, e_devfn);
> +
> +pci_dev = (AssignedDevice *)
> +pci_register_device(e_bus, e_dev_name, sizeof(AssignedDevice),
> +e_devfn, assigned_dev_pci_read_config,
> +assigned_dev_pci_write_config);
> +if (NULL == pci_dev) {
> +fprintf(stderr, "%s: Error: Couldn't register real device %s\n",
> +__func__, e_dev_name);
> +return NULL;
> +}
> +if (get_real_device(pci_dev, r_bus, r_dev, r_func)) {
> +fprintf(stderr, "%s: Error: Couldn't get real device (%s)!\n",
> +__func__, e_dev_name);
> +goto out;
> +}
> +
> +/* handle real device's MMIO/PIO BARs */
> +if (assigned_dev_register_regions(pci_dev->real_device.regions,
> +  pci_dev->real_device.region_number,
> +  pci_dev))
> +goto out;
> +
> +/* handle interrupt routing */
> +e_device = (pci_dev->dev.devfn >> 3) & 0x1f;
> +e_intx = pci_dev->dev.config[0x3d] - 1;
> +pci_dev->intpin = e_intx;
> +pci_dev->run = 0;
> +pci_dev->girq = 0;
> +pci_dev->h_busnr = r_bus;
> +pci_dev->h_devfn = PCI_DEVFN(r_dev, r_func);
> +
> +#ifdef KVM_CAP_DEVICE_ASSIGNMENT
> +if (kvm_enabled()) {
> +struct kvm_assigned_pci_dev assigned_dev_data;
> +
> +memset(&assigned_dev_data, 0, sizeof(assigned_dev_data));
> +assigned_dev_data.assigned_dev_id  =
> +calc_assigned_dev_id(pci_dev->h_busnr,
> + (uint32_t)pci_dev->h_devfn);
> +

Re: [PATCH 3/6] qemu: piix: Introduce functions to get pin number from irq and vice versa

2008-10-28 Thread Muli Ben-Yehuda
On Tue, Oct 28, 2008 at 06:21:35PM +0200, Avi Kivity wrote:
> Muli Ben-Yehuda wrote:
>
>  
>>> Well, what is this needed for in the first place?
>>> 
>>
>> This specific function is not used. I assume Amit added it for
>> completeness with piix_get_irq. piix_get_irq, as far as I can tell, is
>> used in only one place (when the guest updates a device's
>> configuration space interrupt register) to go from interrupt pin
>> (intx) to guest IRQ line.
>>   
>
> In that case, a solution suggests itself...

Yes, of course! I don't know how I missed it!

Err...

What is it?

Seriously, I removed piix3_get_pin as soon as I noticed it wasn't
actually used, but I am not convinced that there are no aliasing
issues remaining with piix_get_irq---most likely because I do not
understand PCI interrupt routing to any sufficient degree. Do you see
problems remaining with piix_get_irq?

Cheers,
Muli
-- 
The First Workshop on I/O Virtualization (WIOV '08)
Dec 2008, San Diego, CA, http://www.usenix.org/wiov08/
   <->
SYSTOR 2009---The Israeli Experimental Systems Conference
http://www.haifa.il.ibm.com/conferences/systor2009/
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[patch] v4 - fold struct vcpu_info into CPUState

2008-10-28 Thread Jes Sorensen

Hi,

Here's an updated version of the patch. It should fix the problems
Hollis ran into, and also compile on x86_64 again :-)

I managed to get rid of all the runtime use of qemu_kvm_cpu_env(),
except for the hotplug code. But I think it's reasonable to do the
walk of the linked list in that case. However, the more I have looked
at this, the more obvious to me it becomes that it is right to
expose struct CPUState to libkvm, and avoid passing around int vcpu.

Comments and test reports very welcome!

Cheers,
Jes

Merge vcpu_info into CPUState.

Moves definition of vcpu related structs to new header qemu-kvm-vcpu.h
and declares this struct in i386/ia64/ppc CPUState structs if USE_KVM
is defined. In addition, convert qemu-kvm.c to pull vcpu_info out of
CPUState.

This eliminates ugly static sized array of struct vcpu_info.

Signed-off-by: Jes Sorensen <[EMAIL PROTECTED]>

---
 libkvm/kvm-common.h |8 +-
 libkvm/libkvm.c |   28 
 libkvm/libkvm.h |   10 +--
 qemu/hw/acpi.c  |   18 +
 qemu/qemu-kvm-ia64.c|4 -
 qemu/qemu-kvm-powerpc.c |5 -
 qemu/qemu-kvm-vcpu.h|   34 ++
 qemu/qemu-kvm-x86.c |   11 +--
 qemu/qemu-kvm.c |  151 ++--
 qemu/qemu-kvm.h |6 -
 qemu/target-i386/cpu.h  |4 +
 qemu/target-ia64/cpu.h  |5 +
 qemu/target-ppc/cpu.h   |5 +
 13 files changed, 172 insertions(+), 117 deletions(-)

Index: kvm-userspace.git/libkvm/kvm-common.h
===
--- kvm-userspace.git.orig/libkvm/kvm-common.h
+++ kvm-userspace.git/libkvm/kvm-common.h
@@ -84,11 +84,11 @@
 void kvm_show_code(kvm_context_t kvm, int vcpu);
 
 int handle_halt(kvm_context_t kvm, int vcpu);
-int handle_shutdown(kvm_context_t kvm, int vcpu);
-void post_kvm_run(kvm_context_t kvm, int vcpu);
-int pre_kvm_run(kvm_context_t kvm, int vcpu);
+int handle_shutdown(kvm_context_t kvm, void *env);
+void post_kvm_run(kvm_context_t kvm, void *env);
+int pre_kvm_run(kvm_context_t kvm, void *env);
 int handle_io_window(kvm_context_t kvm);
-int handle_debug(kvm_context_t kvm, int vcpu);
+int handle_debug(kvm_context_t kvm, void *env);
 int try_push_interrupts(kvm_context_t kvm);
 
 #endif
Index: kvm-userspace.git/libkvm/libkvm.c
===
--- kvm-userspace.git.orig/libkvm/libkvm.c
+++ kvm-userspace.git/libkvm/libkvm.c
@@ -738,9 +738,9 @@
return 0;
 }
 
-int handle_debug(kvm_context_t kvm, int vcpu)
+int handle_debug(kvm_context_t kvm, void *env)
 {
-   return kvm->callbacks->debug(kvm->opaque, vcpu);
+   return kvm->callbacks->debug(kvm->opaque, env);
 }
 
 int kvm_get_regs(kvm_context_t kvm, int vcpu, struct kvm_regs *regs)
@@ -822,9 +822,9 @@
return kvm->callbacks->halt(kvm->opaque, vcpu);
 }
 
-int handle_shutdown(kvm_context_t kvm, int vcpu)
+int handle_shutdown(kvm_context_t kvm, void *env)
 {
-   return kvm->callbacks->shutdown(kvm->opaque, vcpu);
+   return kvm->callbacks->shutdown(kvm->opaque, env);
 }
 
 int try_push_interrupts(kvm_context_t kvm)
@@ -837,14 +837,14 @@
return kvm->callbacks->try_push_nmi(kvm->opaque);
 }
 
-void post_kvm_run(kvm_context_t kvm, int vcpu)
+void post_kvm_run(kvm_context_t kvm, void *env)
 {
-   kvm->callbacks->post_kvm_run(kvm->opaque, vcpu);
+   kvm->callbacks->post_kvm_run(kvm->opaque, env);
 }
 
-int pre_kvm_run(kvm_context_t kvm, int vcpu)
+int pre_kvm_run(kvm_context_t kvm, void *env)
 {
-   return kvm->callbacks->pre_kvm_run(kvm->opaque, vcpu);
+   return kvm->callbacks->pre_kvm_run(kvm->opaque, env);
 }
 
 int kvm_get_interrupt_flag(kvm_context_t kvm, int vcpu)
@@ -872,7 +872,7 @@
 #endif
 }
 
-int kvm_run(kvm_context_t kvm, int vcpu)
+int kvm_run(kvm_context_t kvm, int vcpu, void *env)
 {
int r;
int fd = kvm->vcpu_fd[vcpu];
@@ -886,19 +886,19 @@
if (!kvm->irqchip_in_kernel)
run->request_interrupt_window = try_push_interrupts(kvm);
 #endif
-   r = pre_kvm_run(kvm, vcpu);
+   r = pre_kvm_run(kvm, env);
if (r)
return r;
r = ioctl(fd, KVM_RUN, 0);
 
if (r == -1 && errno != EINTR && errno != EAGAIN) {
r = -errno;
-   post_kvm_run(kvm, vcpu);
+   post_kvm_run(kvm, env);
fprintf(stderr, "kvm_run: %s\n", strerror(-r));
return r;
}
 
-   post_kvm_run(kvm, vcpu);
+   post_kvm_run(kvm, env);
 
 #if defined(KVM_CAP_COALESCED_MMIO)
if (kvm->coalesced_mmio) {
@@ -948,7 +948,7 @@
r = handle_io(kvm, run, vcpu);
break;
case KVM_EXIT_DEBUG:
-   r = handle_debug(kvm, vcpu);
+   r = handle_debug(kvm, env);
break;
case KVM_EXIT_MMIO:
r = handle_mmio(kvm, run);
@@ -962,7 +962,7 @@
 #endif
 

Re: [PATCH 3/6] qemu: piix: Introduce functions to get pin number from irq and vice versa

2008-10-28 Thread Avi Kivity

Muli Ben-Yehuda wrote:

 


Well, what is this needed for in the first place?



This specific function is not used. I assume Amit added it for
completeness with piix_get_irq. piix_get_irq, as far as I can tell, is
used in only one place (when the guest updates a device's
configuration space interrupt register) to go from interrupt pin
(intx) to guest IRQ line.
  


In that case, a solution suggests itself...

--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] [PATCH] qemu: ppc: xer access prototypes no more used & implemented

2008-10-28 Thread Ehrhardt Christian
From: Christian Ehrhardt <[EMAIL PROTECTED]>

Revision 5500 of the qemu repository removed all code using
ppc_load_xer & ppc_store_xer as well as their implementation.

Another patch fixes its usage in kvm-userspace for powerpc, but I think
that header can now be cleaned up, therefore this patch to qemu-devel.

Signed-off-by: Christian Ehrhardt <[EMAIL PROTECTED]>
---

[diffstat]
 cpu.h |2 --
 1 file changed, 2 deletions(-)

[diff]

diff --git a/qemu/target-ppc/cpu.h b/qemu/target-ppc/cpu.h
--- a/qemu/target-ppc/cpu.h
+++ b/qemu/target-ppc/cpu.h
@@ -725,8 +725,6 @@
 #endif
 void do_store_sr (CPUPPCState *env, int srnum, target_ulong value);
 #endif /* !defined(CONFIG_USER_ONLY) */
-target_ulong ppc_load_xer (CPUPPCState *env);
-void ppc_store_xer (CPUPPCState *env, target_ulong value);
 void ppc_store_msr (CPUPPCState *env, target_ulong value);
 
 void cpu_ppc_reset (void *opaque);
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 05 of 10] [PATCH] libcflat: ppc: add timebase accessor

2008-10-28 Thread Ehrhardt Christian
Provide a timebase accessor for ppc testcases.

Signed-off-by: Hollis Blanchard <[EMAIL PROTECTED]>
Signed-off-by: Christian Ehrhardt <[EMAIL PROTECTED]>

[diffstat]
 config-powerpc-44x.mak  |3 ++-
 test/lib/powerpc/44x/timebase.S |   28 
 test/lib/powerpc/44x/timebase.h |   25 +
 3 files changed, 55 insertions(+), 1 deletion(-)

[diff]

diff --git a/user/config-powerpc-44x.mak b/user/config-powerpc-44x.mak
--- a/user/config-powerpc-44x.mak
+++ b/user/config-powerpc-44x.mak
@@ -5,7 +5,8 @@
 
 cflatobjs += \
test/lib/powerpc/44x/map.o \
-   test/lib/powerpc/44x/tlbwe.o
+   test/lib/powerpc/44x/tlbwe.o \
+   test/lib/powerpc/44x/timebase.o
 
 simpletests += \
test/powerpc/44x/tlbsx.bin \
diff --git a/user/test/lib/powerpc/44x/timebase.S 
b/user/test/lib/powerpc/44x/timebase.S
new file mode 100644
--- /dev/null
+++ b/user/test/lib/powerpc/44x/timebase.S
@@ -0,0 +1,28 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright IBM Corp. 2008
+ *
+ * Authors: Hollis Blanchard <[EMAIL PROTECTED]>
+ */
+
+/* unsigned long long mftb(void); */
+.global mftb
+mftb:
+   mftbu   r5
+   mftbl   r4
+   mftbu   r3
+   cmpw    r3, r5
+   bne mftb
+   blr
diff --git a/user/test/lib/powerpc/44x/timebase.h 
b/user/test/lib/powerpc/44x/timebase.h
new file mode 100644
--- /dev/null
+++ b/user/test/lib/powerpc/44x/timebase.h
@@ -0,0 +1,25 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright IBM Corp. 2008
+ *
+ * Authors: Hollis Blanchard <[EMAIL PROTECTED]>
+ */
+
+#ifndef __TIMEBASE_H__
+#define __TIMEBASE_H__
+
+unsigned long long mftb(void);
+
+#endif /* __TIMEBASE_H__ */
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 04 of 10] [PATCH] user: ppc: implement PowerPC 44x libcflat

2008-10-28 Thread Ehrhardt Christian
From: Hollis Blanchard <[EMAIL PROTECTED]>

- Create a 44x-specific makefile.
- Reorganize PowerPC makefiles to separate "simple" tests from those which
  link with libcflat.
- Create a minimal libcflat testcase (which just exits).

Signed-off-by: Hollis Blanchard <[EMAIL PROTECTED]>
Signed-off-by: Christian Ehrhardt <[EMAIL PROTECTED]>
---

[diffstat]
 config-powerpc-44x.mak   |   14 +++
 config-powerpc.mak   |   46 ++
 test/lib/powerpc/44x/map.c   |   51 +++
 test/lib/powerpc/44x/tlbwe.S |   29 
 test/lib/powerpc/io.c|   35 +
 test/powerpc/cstart.S|   38 
 test/powerpc/exit.c  |   23 +++
 7 files changed, 221 insertions(+), 15 deletions(-)

[diff]

diff --git a/user/config-powerpc-44x.mak b/user/config-powerpc-44x.mak
new file mode 100644
--- /dev/null
+++ b/user/config-powerpc-44x.mak
@@ -0,0 +1,14 @@
+
+
+# for some reason binutils hates tlbsx unless we say we're 405  :(
+CFLAGS += -Wa,-m405 -I test/lib/powerpc/44x
+
+cflatobjs += \
+   test/lib/powerpc/44x/map.o \
+   test/lib/powerpc/44x/tlbwe.o
+
+simpletests += \
+   test/powerpc/44x/tlbsx.bin \
+   test/powerpc/44x/tlbwe_16KB.bin \
+   test/powerpc/44x/tlbwe_hole.bin \
+   test/powerpc/44x/tlbwe.bin
diff --git a/user/config-powerpc.mak b/user/config-powerpc.mak
--- a/user/config-powerpc.mak
+++ b/user/config-powerpc.mak
@@ -1,26 +1,42 @@
+platform := 44x
+
 CFLAGS += -m32
 CFLAGS += -D__powerpc__
 CFLAGS += -I $(KERNELDIR)/include
-# for some reaons binutils hates tlbsx unless we say we're 405  :(
-CFLAGS += -Wa,-mregnames,-m405
+CFLAGS += -Wa,-mregnames -I test/lib
 
-%.bin: %.o
-   $(OBJCOPY) -O binary $^ $@
+cstart := test/powerpc/cstart.o
 
-testobjs := \
-   io.bin \
-   spin.bin \
-   sprg.bin \
-   44x/tlbsx.bin \
-   44x/tlbwe_16KB.bin \
-   44x/tlbwe_hole.bin \
-   44x/tlbwe.bin
+cflatobjs += \
+   test/lib/powerpc/io.o
 
-tests := $(addprefix test/powerpc/, $(testobjs))
+$(libcflat): LDFLAGS += -nostdlib
+$(libcflat): CFLAGS += -ffreestanding
 
-all: kvmtrace kvmctl $(tests)
+# these tests do not use libcflat
+simpletests := \
+   test/powerpc/spin.bin \
+   test/powerpc/io.bin \
+   test/powerpc/sprg.bin
+
+# theses tests use cstart.o, libcflat, and libgcc
+tests := \
+   test/powerpc/exit.bin
+
+include config-powerpc-$(platform).mak
+
+
+all: kvmtrace kvmctl $(libcflat) $(simpletests) $(tests)
+
+$(simpletests): %.bin: %.o
+   $(CC) -nostdlib $^ -Wl,-T,flat.lds -o $@
+
+$(tests): %.bin: $(cstart) %.o $(libcflat)
+   $(CC) -nostdlib $^ $(libgcc) -Wl,-T,flat.lds -o $@
 
 kvmctl_objs = main-ppc.o iotable.o ../libkvm/libkvm.a
 
 arch_clean:
-   rm -f $(tests)
+   $(RM) $(simpletests) $(tests) $(cstart)
+   $(RM) $(patsubst %.bin, %.elf, $(simpletests) $(tests))
+   $(RM) $(patsubst %.bin, %.o, $(simpletests) $(tests))
diff --git a/user/test/lib/powerpc/44x/map.c b/user/test/lib/powerpc/44x/map.c
new file mode 100644
--- /dev/null
+++ b/user/test/lib/powerpc/44x/map.c
@@ -0,0 +1,51 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright IBM Corp. 2008
+ *
+ * Authors: Hollis Blanchard <[EMAIL PROTECTED]>
+ */
+
+#include "libcflat.h"
+
+#define TLB_SIZE 64
+
+extern void tlbwe(unsigned int index,
+ unsigned char tid,
+ unsigned int word0,
+ unsigned int word1,
+ unsigned int word2);
+
+unsigned int next_free_index;
+
+#define PAGE_SHIFT 12
+#define PAGE_MASK (~((1<= TLB_SIZE)
+   panic("TLB overflow");
+
+   w0 = (vaddr & PAGE_MASK) | V;
+   w1 = paddr & PAGE_MASK;
+   w2 = 0x3;
+
+   tlbwe(next_free_index, 0, w0, w1, w2);
+}
diff --git a/user/test/lib/powerpc/44x/tlbwe.S 
b/user/test/lib/powerpc/44x/tlbwe.S
new file mode 100644
--- /dev/null
+++ b/user/test/lib/powerpc/44x/tlbwe.S
@@ -0,0 +1,29 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied w

[PATCH 10 of 10] [PATCH] kvm-userspace: ppc: fix initial ppc memory setup

2008-10-28 Thread Ehrhardt Christian
From: Christian Ehrhardt <[EMAIL PROTECTED]>

The old memory initialization code was broken for all cases not fitting in one
ram stick. This patch fixes the ram_stick calculation, now sets the proper
base addresses per stick and removes the old workaround.

Signed-off-by: Christian Ehrhardt <[EMAIL PROTECTED]>
---

[diffstat]
 ppc440.c|   12 +---
 ppc440.h|8 ++--
 ppc440_bamboo.c |   30 --
 3 files changed, 31 insertions(+), 19 deletions(-)

[diff]

diff --git a/qemu/hw/ppc440.c b/qemu/hw/ppc440.c
--- a/qemu/hw/ppc440.c
+++ b/qemu/hw/ppc440.c
@@ -3,6 +3,7 @@
  *
  * Copyright 2007 IBM Corporation.
  * Authors: Jerone Young <[EMAIL PROTECTED]>
+ * Christian Ehrhardt <[EMAIL PROTECTED]>
  *
  * This work is licensed under the GNU GPL license version 2 or later.
  *
@@ -24,15 +25,15 @@
 
 
 void ppc440ep_init(CPUState *env,
-   target_phys_addr_t ram_bases[2],
-   target_phys_addr_t ram_sizes[2],
+   target_phys_addr_t ram_bases[PPC440_MAX_RAM_SLOTS],
+   target_phys_addr_t ram_sizes[PPC440_MAX_RAM_SLOTS],
+   int nbanks,
qemu_irq **picp,
ppc4xx_pci_t **pcip,
int do_init)
 {
ppc4xx_mmio_t *mmio;
qemu_irq *pic, *irqs;
-   ram_addr_t offset;
ppc4xx_pci_t *pci;
int i;
 
@@ -55,10 +56,7 @@
/* SDRAM controller */
printf("trying to setup sdram controller\n");
/* XXX 440EP's ECC interrupts are on UIC1 */
-   ppc405_sdram_init(env, pic[14], 2, ram_bases, ram_sizes, do_init);
-   offset = 0;
-   for (i = 0; i < 2; i++)
-   offset += ram_sizes[i];
+   ppc405_sdram_init(env, pic[14], nbanks, ram_bases, ram_sizes, do_init);
 
/* PCI */
pci = ppc4xx_pci_init(env, pic,
diff --git a/qemu/hw/ppc440.h b/qemu/hw/ppc440.h
--- a/qemu/hw/ppc440.h
+++ b/qemu/hw/ppc440.h
@@ -3,6 +3,7 @@
  *
  * Copyright 2007 IBM Corporation.
  * Authors: Jerone Young <[EMAIL PROTECTED]>
+ * Christian Ehrhardt <[EMAIL PROTECTED]>
  *
  * This work is licensed under the GNU GPL licence version 2 or later
  *
@@ -20,9 +21,12 @@
 #include "exec-all.h"
 #include "boards.h"
 
+#define PPC440_MAX_RAM_SLOTS 4
+
 void ppc440ep_init(CPUState *env,
-   target_phys_addr_t ram_bases[2],
-   target_phys_addr_t ram_sizes[2],
+   target_phys_addr_t ram_bases[PPC440_MAX_RAM_SLOTS],
+   target_phys_addr_t ram_sizes[PPC440_MAX_RAM_SLOTS],
+   int nbanks,
qemu_irq **picp,
ppc4xx_pci_t **pcip,
int do_init);
diff --git a/qemu/hw/ppc440_bamboo.c b/qemu/hw/ppc440_bamboo.c
--- a/qemu/hw/ppc440_bamboo.c
+++ b/qemu/hw/ppc440_bamboo.c
@@ -2,7 +2,9 @@
  * Qemu PowerPC 440 board emualtion
  *
  * Copyright 2007 IBM Corporation.
- * Authors: Jerone Young <[EMAIL PROTECTED]>
+ * Authors:
+ * Jerone Young <[EMAIL PROTECTED]>
+ * Christian Ehrhardt <[EMAIL PROTECTED]>
  *
  * This work is licensed under the GNU GPL license version 2 or later.
  *
@@ -30,7 +32,8 @@
const char *cpu_model)
 {
char *buf=NULL;
-   target_phys_addr_t ram_bases[4], ram_sizes[4];
+   target_phys_addr_t ram_bases[PPC440_MAX_RAM_SLOTS];
+   target_phys_addr_t ram_sizes[PPC440_MAX_RAM_SLOTS];
NICInfo *nd;
qemu_irq *pic;
ppc4xx_pci_t *pci;
@@ -46,6 +49,8 @@
int ret;
int ram_stick_sizes[] = {256<<20, 128<<20, 64<<20,
32<<20, 16<<20, 8<<20 }; /* in bytes */
+   int nbanks = 0; /* number of used memory banks */
+   int next_bank_offset = 0;
ram_addr_t tmp_ram_size;
int i=0, k=0;
uint32_t cpu_freq;
@@ -55,15 +60,22 @@
printf("%s: START\n", __func__);
 
/* Setup Memory */
-   printf("Ram size passed is: %i MB\n",
-   bytes_to_mb((int)ram_size));
+   if (ram_size < 8<<20) {
+   printf("ERROR: ram size too small (min 8mb)\n");
+   exit(1);
+   } else
+   printf("Ram size passed is: %i MB\n",
+   bytes_to_mb((int)ram_size));
 
tmp_ram_size = ram_size;
 
-   for (i=0; i < (sizeof(ram_sizes)/sizeof(ram_sizes[0])); i++) {
-   for (k=0; k < 
(sizeof(ram_stick_sizes)/sizeof(ram_stick_sizes[0])); k++) {
+   for (i = 0; i < PPC440_MAX_RAM_SLOTS; i++) {
+   for (k = 0; k < (sizeof(ram_stick_sizes)/sizeof(int)); k++) {
if ((tmp_ram_size/ram_stick_sizes[k]) > 0) {
ram_sizes[i] = ram_stick_sizes[k];
+   ram_bases[i] = next_bank_offset;
+   next_bank_offset += ram_stick_sizes[k];
+   nbanks++;
tmp_ram_size -= ram_stick_sizes[k];
break;
   

[PATCH 08 of 10] [PATCH] qemu: ppc: if not a uImage, try to load kernel as ELF

2008-10-28 Thread Ehrhardt Christian
From: Hollis Blanchard <[EMAIL PROTECTED]>

This allows qemu to load "bare metal" ELF kernels, useful for standalone
benchmarks and testcases.

We could/should also load the specified file as a flat binary, if both uImage
and ELF loaders fail. (See hw/arm_boot.c.)

Signed-off-by: Hollis Blanchard <[EMAIL PROTECTED]>
Signed-off-by: Christian Ehrhardt <[EMAIL PROTECTED]>
---

[diffstat]
 ppc440_bamboo.c |7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

[diff]

diff --git a/qemu/hw/ppc440_bamboo.c b/qemu/hw/ppc440_bamboo.c
--- a/qemu/hw/ppc440_bamboo.c
+++ b/qemu/hw/ppc440_bamboo.c
@@ -35,8 +35,8 @@
qemu_irq *pic;
ppc4xx_pci_t *pci;
CPUState *env;
-   target_ulong ep=0;
-   target_ulong la=0;
+   uint64_t ep=0;
+   uint64_t la=0;
int is_linux=1; /* Will assume allways is Linux for now */
target_long kernel_size=0;
target_ulong initrd_base=0;
@@ -97,6 +97,9 @@
/* load kernel with uboot loader */
printf("%s: load kernel\n", __func__);
ret = load_uimage(kernel_filename, &ep, &la, &kernel_size, &is_linux);
+   if (ret < 0)
+   ret = load_elf(kernel_filename, 0, &ep, &la, NULL);
+
if (ret < 0) {
fprintf(stderr, "qemu: could not load kernel '%s'\n",
kernel_filename);
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 02 of 10] [PATCH] user: ppc: fix threading bugs in main-ppc.c

2008-10-28 Thread Ehrhardt Christian
From: Hollis Blanchard <[EMAIL PROTECTED]>

- call io_table_register() before any vcpus have started
- wait for all vcpus to exit before exiting the parent thread

Signed-off-by: Hollis Blanchard <[EMAIL PROTECTED]>
Signed-off-by: Christian Ehrhardt <[EMAIL PROTECTED]>
---

[diffstat]
 main-ppc.c |   32 
 1 file changed, 12 insertions(+), 20 deletions(-)

[diff]

diff --git a/user/main-ppc.c b/user/main-ppc.c
--- a/user/main-ppc.c
+++ b/user/main-ppc.c
@@ -51,7 +51,7 @@
 struct io_table mmio_table;
 
 static int ncpus = 1;
-static sem_t init_sem;
+static sem_t exited_sem;
 static __thread int vcpu;
 static sigset_t kernel_sigmask;
 static sigset_t ipi_sigmask;
@@ -220,16 +220,8 @@
asm volatile ("sync; isync");
 }
 
-static void init_vcpu(int n, unsigned long entry)
+static void init_vcpu(int n)
 {
-   /* XXX must set initial TLB state and stack
-   struct kvm_regs regs = {
-   .pc = entry,
-   };
-
-   kvm_set_regs(kvm, 0, &regs);
-   */
-
sigemptyset(&ipi_sigmask);
sigaddset(&ipi_sigmask, IPI_SIGNAL);
sigprocmask(SIG_UNBLOCK, &ipi_sigmask, NULL);
@@ -237,7 +229,6 @@
vcpus[n].tid = gettid();
vcpu = n;
kvm_set_signal_mask(kvm, n, &kernel_sigmask);
-   sem_post(&init_sem);
 }
 
 static void *do_create_vcpu(void *_n)
@@ -245,8 +236,9 @@
int n = (long)_n;
 
kvm_create_vcpu(kvm, n);
-   init_vcpu(n, 0x0);
+   init_vcpu(n);
kvm_run(kvm, n);
+   sem_post(&exited_sem);
return NULL;
 }
 
@@ -368,14 +360,14 @@
len = load_file(vm_mem, argv[optind], 1);
sync_caches(vm_mem, len);
 
-   sem_init(&init_sem, 0, 0);
-   init_vcpu(0, 0x0);
-   for (i = 1; i < ncpus; ++i)
-   start_vcpu(i);
-   for (i = 0; i < ncpus; ++i)
-   sem_wait(&init_sem);
-
io_table_register(&mmio_table, 0xf000, 64, mmio_handler, NULL);
 
-   return kvm_run(kvm, 0);
+   sem_init(&exited_sem, 0, 0);
+   for (i = 0; i < ncpus; ++i)
+   start_vcpu(i);
+   /* Wait for all vcpus to exit. */
+   for (i = 0; i < ncpus; ++i)
+   sem_wait(&exited_sem);
+
+   return 0;
 }
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 5/6] device assignment: support for assigning PCI devices to guests

2008-10-28 Thread Muli Ben-Yehuda
On Tue, Oct 28, 2008 at 10:45:57AM -0500, Anthony Liguori wrote:

>> +ifeq ($(USE_KVM), 1)
>> +OBJS+= device-assignment.o
>> +endif
>
> I don't think you want to build this on PPC so I think you need a
> stronger check.

Good point. How about checking TARGET_BASE_ARCH = i386?

>> +static void assigned_dev_ioport_writel(void *opaque, uint32_t addr,
>> +   uint32_t value)
>> +{
>> +AssignedDevRegion *r_access = opaque;
>> +uint32_t r_pio = guest_to_host_ioport(r_access, addr);
>> +
>> +DEBUG("%s: r_pio=%08x e_physbase=%08x r_virtbase=%08lx value=%08x\n",
>> +  r_pio, (int)r_access->e_physbase,
>> +  (unsigned long)r_access->r_virtbase, value);
>>   
>
> The format doesn't match the parameter count.

Yep, already fixed.

>> +static void assigned_dev_ioport_map(PCIDevice *pci_dev, int region_num,
>> +uint32_t addr, uint32_t size, int 
>> type)
>> +{
>> +AssignedDevice *r_dev = (AssignedDevice *) pci_dev;
>> +AssignedDevRegion *region = &r_dev->v_addrs[region_num];
>> +uint32_t old_port = region->u.r_baseport;
>> +uint32_t old_num = region->e_size;
>> +int first_map = (old_num == 0);
>> +struct ioperm_data data;
>> +int i;
>> +
>> +region->e_physbase = addr;
>> +region->e_size = size;
>> +
>> +DEBUG("e_phys=0x%x r_baseport=%x type=0x%x len=%d region_num=%d \n",
>> +  addr, region->u.r_baseport, type, size, region_num);
>> +
>> +memset(&data, 0, sizeof(data));
>> +
>> +if (!first_map) {
>> +data.start_port = old_port;
>> +data.num = old_num; +   data.turn_on = 0;
>> +
>> +for (i = 0; i < smp_cpus; ++i)
>> +kvm_ioperm(qemu_kvm_cpu_env(i), &data);
>>   
>
> How does this interact with VCPU hot-plug?

I have no idea. Weidong?

>> +#ifdef KVM_CAP_IOMMU
>> +/* We always enable the IOMMU if present
>> + * (or when not disabled on the command line)
>> + */
>> +r = kvm_check_extension(kvm_context, KVM_CAP_IOMMU);
>> +if (r && !disable_iommu)
>> +assigned_dev_data.flags |= KVM_DEV_ASSIGN_ENABLE_IOMMU;
>> +#endif
>> +r = kvm_assign_pci_device(kvm_context, &assigned_dev_data);
>> +if (r < 0) {
>> +fprintf(stderr, "Could not notify kernel about "
>> +"assigned device \"%s\"\n", e_dev_name);
>> +perror("register_real_device");
>> +goto out;
>> +}
>> +}
>>   
>
> You still succeed if KVM_CAP_DEVICE_ASSIGNMENT isn't defined?  That
> means a newer userspace compiled on an older kernel will silently
> fail if they try to do device assignment.  There's probably no
> reason to build this file if KVM_CAP_DEVICE_ASSIGNMENT isn't defined
> (see how the in-kernel PIT gets conditionally build depending on
> whether that cap is available).

Ok, I'll take a look at this.

>> +#endif
>> +term_printf("Registered host PCI device %02x:%02x.%1x "
>> +"(\"%s\") as guest device %02x:%02x.%1x\n",
>> +r_bus, r_dev, r_func, e_dev_name,
>> +pci_bus_num(e_bus), e_device, r_func);
>>
>>   
>
> If I read the code correctly, this term_printf() happens regardless
> of whether this is being done for PCI hotplug or for command-line
> assignment?  That's a problem as it'll print garbage on the monitor
> when you start QEMU which could break management applications.

Is there a more suitable alternative or shall I just nuke it?

>> diff --git a/qemu/hw/pc.c b/qemu/hw/pc.c
>> index d559f0c..5fdb726 100644
>> --- a/qemu/hw/pc.c
>> +++ b/qemu/hw/pc.c
>> @@ -33,6 +33,7 @@
>>  #include "boards.h"
>>  #include "console.h"
>>  #include "fw_cfg.h"
>> +#include "device-assignment.h"
>>   #include "qemu-kvm.h"
>>  @@ -1157,6 +1158,21 @@ static void pc_init1(ram_addr_t ram_size, int 
>> vga_ram_size,
>>   if (pci_enabled)
>>  virtio_balloon_init(pci_bus);
>> +
>> +if (kvm_enabled() && device_assignment_enabled) {
>> +int i;
>>   
>
> Stray tab.

Grrr. Silly emacs.

>
>> +for (i = 0; i < assigned_devices_index; i++) {
>> +if (add_assigned_device(assigned_devices[i]) < 0) {
>> +fprintf(stderr, "Warning: could not add assigned device 
>> %s\n",
>> +assigned_devices[i]);
>> +}
>> +}
>> +
>> +if (init_all_assigned_devices(pci_bus)) {
>> +fprintf(stderr, "Failed to initialize assigned devices\n");
>> +exit (1);
>> +}
>> +}
>>  }
>>  +#if defined(TARGET_I386) || defined(TARGET_X86_64) || defined(__linux__)
>> +case QEMU_OPTION_pcidevice:
>> +device_assignment_enabled = 1;
>> +if (assigned_devices_index >= MAX_DEV_ASSIGN_CMDLINE) {
>> +fprintf(stderr, "Too many assigned devices\n");
>> +exit(1);
>> +}
>> +assigned_devices[assigned_devices_index] = optarg;
>> +assigned_devices_index++;
>> +break;
>>   
>

[PATCH 07 of 10] [PATCH] qemu: ppc: define maximum SMP limit as 1 for Bamboo

2008-10-28 Thread Ehrhardt Christian
From: Christian Ehrhardt <[EMAIL PROTECTED]>

Fix for qemu runtime error. Full error message:
Number of SMP cpus requested (1), exceeds max cpus supported by machine 
`bamboo' (0)

Signed-off-by: Christian Ehrhardt <[EMAIL PROTECTED]>
Signed-off-by: Hollis Blanchard <[EMAIL PROTECTED]>
---

[diffstat]
 ppc440_bamboo.c |7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

[diff]

diff --git a/qemu/hw/ppc440_bamboo.c b/qemu/hw/ppc440_bamboo.c
--- a/qemu/hw/ppc440_bamboo.c
+++ b/qemu/hw/ppc440_bamboo.c
@@ -202,7 +202,8 @@
 }
 
 QEMUMachine bamboo_machine = {
-   "bamboo",
-   "bamboo",
-   bamboo_init,
+   .name = "bamboo",
+   .desc = "bamboo",
+   .init = bamboo_init,
+   .max_cpus = 1,
 };
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 06 of 10] [PATCH] user: ppc: add stub nmi handler

2008-10-28 Thread Ehrhardt Christian
From: Hollis Blanchard <[EMAIL PROTECTED]>

Add an NMI stub handler to user/main-ppc.c.

Signed-off-by: Hollis Blanchard <[EMAIL PROTECTED]>
Signed-off-by: Christian Ehrhardt <[EMAIL PROTECTED]>
---

[diffstat]
 main-ppc.c |6 ++
 1 file changed, 6 insertions(+)

[diff]

diff --git a/user/main-ppc.c b/user/main-ppc.c
--- a/user/main-ppc.c
+++ b/user/main-ppc.c
@@ -83,6 +83,11 @@
 }
 
 static int test_try_push_interrupts(void *opaque)
+{
+   return 0;
+}
+
+static int test_try_push_nmi(void *opaque)
 {
return 0;
 }
@@ -175,6 +180,7 @@
.halt= test_halt,
.io_window = test_io_window,
.try_push_interrupts = test_try_push_interrupts,
+   .try_push_nmi = test_try_push_nmi,
.post_kvm_run = test_post_kvm_run,
.pre_kvm_run = test_pre_kvm_run,
.powerpc_dcr_read = test_dcr_read,
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 03 of 10] [PATCH] user: ppc: better error reporting in load_file

2008-10-28 Thread Ehrhardt Christian
From: Hollis Blanchard <[EMAIL PROTECTED]>

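Declare the read result `r` in load_file() as ssize_t instead of int, and
print the number of bytes read so far when the read fails.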

Signed-off-by: Hollis Blanchard <[EMAIL PROTECTED]>
Signed-off-by: Christian Ehrhardt <[EMAIL PROTECTED]>
---

[diffstat]
 main-ppc.c |3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

[diff]

diff --git a/user/main-ppc.c b/user/main-ppc.c
--- a/user/main-ppc.c
+++ b/user/main-ppc.c
@@ -183,7 +183,7 @@
 
 static unsigned long load_file(void *mem, const char *fname, int inval_icache)
 {
-   int r;
+   ssize_t r;
int fd;
unsigned long bytes = 0;
 
@@ -200,6 +200,7 @@
 
if (r == -1) {
perror("read");
+   printf("read %d bytes\n", bytes);
exit(1);
}
 
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 09 of 10] [PATCH] kvm: external module: Treat NONARCH_CONFIG as a list

2008-10-28 Thread Ehrhardt Christian
From: Hollis Blanchard <[EMAIL PROTECTED]>

As discussed on the list, the unifdef changes break powerpc (and possibly
more). A fix is to treat NONARCH_CONFIG as a list instead of a single item.

Signed-off-by: Hollis Blanchard <[EMAIL PROTECTED]>
Signed-off-by: Christian Ehrhardt <[EMAIL PROTECTED]>
---

[diffstat]
 Makefile |3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

[diff]

diff --git a/kernel/Makefile b/kernel/Makefile
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -25,8 +25,9 @@
gawk -v version=$(version) -f $(ARCH_DIR)/hack-module.awk $1.orig \
| sed '/\#include/! s/\blapic\b/l_apic/g' > $1 && rm $1.orig
 
+unifdef_uflags = $(foreach arch, $(NONARCH_CONFIG), -UCONFIG_$(arch))
 unifdef = mv $1 $1.orig && \
- unifdef -DCONFIG_$(ARCH_CONFIG) -UCONFIG_$(NONARCH_CONFIG) $1.orig > 
$1; \
+ unifdef -DCONFIG_$(ARCH_CONFIG) $(unifdef_uflags) $1.orig > $1; \
   [ $$? -le 2 ] && rm $1.orig
 
 hack = $(call _hack,$T/$(strip $1))
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 00 of 10] kvm-userspace: ppc: userspace fixes for powerpc

2008-10-28 Thread Ehrhardt Christian
From: Christian Ehrhardt <[EMAIL PROTECTED]>

This is a set of various fixes in kvm-userspace for powerpc. This time without
the split between user/* and the rest and without the qemu patch (sent
separately to qemu-devel now).

Signed-off-by: Christian Ehrhardt <[EMAIL PROTECTED]>

b/kernel/Makefile  |3 +
b/qemu/hw/ppc440.c |   12 +++
b/qemu/hw/ppc440.h |8 +++--
b/qemu/hw/ppc440_bamboo.c  |7 ++--
b/qemu/qemu-kvm-powerpc.c  |4 +-
b/user/config-powerpc-44x.mak  |   14 +
b/user/config-powerpc.mak  |   46 -
b/user/main-ppc.c  |   32 +++-
b/user/test/lib/powerpc/44x/map.c  |   51 +
b/user/test/lib/powerpc/44x/timebase.S |   28 ++
b/user/test/lib/powerpc/44x/timebase.h |   25 
b/user/test/lib/powerpc/44x/tlbwe.S|   29 ++
b/user/test/lib/powerpc/io.c   |   35 ++
b/user/test/powerpc/cstart.S   |   38 
b/user/test/powerpc/exit.c |   23 ++
qemu/hw/ppc440_bamboo.c|   36 +++
user/config-powerpc-44x.mak|3 +
user/main-ppc.c|9 +
18 files changed, 339 insertions(+), 64 deletions(-)
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 01 of 10] [PATCH] kvm-userspace: powerpc: fix env->xer access

2008-10-28 Thread Ehrhardt Christian
From: Christian Ehrhardt <[EMAIL PROTECTED]>

Since qemu revision 5500, which was merged with the last qemu merge, env->xer
is accessed directly.

Signed-off-by: Christian Ehrhardt <[EMAIL PROTECTED]>
---

[diffstat]
 qemu-kvm-powerpc.c |4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

[diff]

diff --git a/qemu/qemu-kvm-powerpc.c b/qemu/qemu-kvm-powerpc.c
--- a/qemu/qemu-kvm-powerpc.c
+++ b/qemu/qemu-kvm-powerpc.c
@@ -57,7 +57,7 @@
 
 regs.ctr = env->ctr;
 regs.lr  = env->lr;
-regs.xer = ppc_load_xer(env);
+regs.xer = env->xer;
 regs.msr = env->msr;
 
 regs.srr0 = env->spr[SPR_SRR0];
@@ -93,7 +93,7 @@
 
 env->ctr =regs.ctr;
 env->lr = regs.lr;
-ppc_store_xer(env,regs.xer);
+env->xer = regs.xer;
 env->msr = regs.msr;
 /* calculate hflags based on the current msr using the ppc qemu helper */
 hreg_compute_hflags(env);
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 5/6] device assignment: support for assigning PCI devices to guests

2008-10-28 Thread Muli Ben-Yehuda
On Tue, Oct 28, 2008 at 11:36:10PM +0800, Han, Weidong wrote:
> > diff --git a/qemu/qemu-kvm.c b/qemu/qemu-kvm.c
> > index c5f3f29..5e66832 100644
> > --- a/qemu/qemu-kvm.c
> > +++ b/qemu/qemu-kvm.c
> > @@ -20,6 +20,7 @@ int kvm_pit = 1;
> >  #include "console.h"
> >  #include "block.h"
> >  #include "compatfd.h"
> > +#include "hw/device-assignment.h"
> 
> It's not necessary.

Indeed, leftovers from my ioperm bits. Removed.

> >  #include "qemu-kvm.h"
> >  #include 
> > @@ -27,6 +28,7 @@ int kvm_pit = 1;
> >  #include 
> >  #include 
> >  #include 
> > +#include 
> 
> It's not necessary.

This one is needed on my compile system for the ioperm() declaration.

Cheers,
Muli

-- 
The First Workshop on I/O Virtualization (WIOV '08)
Dec 2008, San Diego, CA, http://www.usenix.org/wiov08/
   <->
SYSTOR 2009---The Israeli Experimental Systems Conference
http://www.haifa.il.ibm.com/conferences/systor2009/
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 5/6] device assignment: support for assigning PCI devices to guests

2008-10-28 Thread Anthony Liguori

[EMAIL PROTECTED] wrote:

Signed-off-by: Amit Shah <[EMAIL PROTECTED]>
Signed-off-by: Muli Ben-Yehuda <[EMAIL PROTECTED]>
---
 qemu/Makefile.target|3 +
 qemu/hw/device-assignment.c |  641 +++
 qemu/hw/device-assignment.h |  117 
 qemu/hw/pc.c|   16 +
 qemu/hw/pci.c   |7 +
 qemu/qemu-kvm.c |   14 +
 qemu/qemu-kvm.h |8 +
 qemu/vl.c   |   28 ++
 8 files changed, 834 insertions(+), 0 deletions(-)
 create mode 100644 qemu/hw/device-assignment.c
 create mode 100644 qemu/hw/device-assignment.h

diff --git a/qemu/Makefile.target b/qemu/Makefile.target
index d9bdeca..5d44e08 100644
--- a/qemu/Makefile.target
+++ b/qemu/Makefile.target
@@ -621,6 +621,9 @@ OBJS+= ide.o pckbd.o ps2.o vga.o $(SOUND_HW) dma.o
 OBJS+= fdc.o mc146818rtc.o serial.o i8259.o i8254.o pcspk.o pc.o
 OBJS+= cirrus_vga.o apic.o parallel.o acpi.o piix_pci.o
 OBJS+= usb-uhci.o vmmouse.o vmport.o vmware_vga.o extboot.o
+ifeq ($(USE_KVM), 1)
+OBJS+= device-assignment.o
+endif
  


I don't think you want to build this on PPC so I think you need a 
stronger check.



+static void assigned_dev_ioport_writel(void *opaque, uint32_t addr,
+   uint32_t value)
+{
+AssignedDevRegion *r_access = opaque;
+uint32_t r_pio = guest_to_host_ioport(r_access, addr);
+
+DEBUG("%s: r_pio=%08x e_physbase=%08x r_virtbase=%08lx value=%08x\n",
+ r_pio, (int)r_access->e_physbase,
+  (unsigned long)r_access->r_virtbase, value);
  


The format doesn't match the parameter count.


+static void assigned_dev_iomem_map(PCIDevice *pci_dev, int region_num,
+   uint32_t e_phys, uint32_t e_size, int type)
+{
+AssignedDevice *r_dev = (AssignedDevice *) pci_dev;
+AssignedDevRegion *region = &r_dev->v_addrs[region_num];
+uint32_t old_ephys = region->e_physbase;
+uint32_t old_esize = region->e_size;
+int first_map = (region->e_size == 0);
+int ret = 0;
+
+DEBUG("e_phys=%08x r_virt=%x type=%d len=%08x region_num=%d \n",
+  e_phys, (uint32_t)region->r_virtbase, type, e_size, region_num);
+
+region->e_physbase = e_phys;
+region->e_size = e_size;
+
+if (!first_map)
+   kvm_destroy_phys_mem(kvm_context, old_ephys, old_esize);
+
+if (e_size > 0)
+   ret = kvm_register_phys_mem(kvm_context, e_phys,
+region->u.r_virtbase, e_size, 0);
+if (ret != 0) {
+   fprintf(stderr, "%s: Error: create new mapping failed\n", __func__);
+   exit(1);
+}
+}
+
+static void assigned_dev_ioport_map(PCIDevice *pci_dev, int region_num,
+uint32_t addr, uint32_t size, int type)
+{
+AssignedDevice *r_dev = (AssignedDevice *) pci_dev;
+AssignedDevRegion *region = &r_dev->v_addrs[region_num];
+uint32_t old_port = region->u.r_baseport;
+uint32_t old_num = region->e_size;
+int first_map = (old_num == 0);
+struct ioperm_data data;
+int i;
+
+region->e_physbase = addr;
+region->e_size = size;
+
+DEBUG("e_phys=0x%x r_baseport=%x type=0x%x len=%d region_num=%d \n",
+  addr, region->u.r_baseport, type, size, region_num);
+
+memset(&data, 0, sizeof(data));
+
+if (!first_map) {
+   data.start_port = old_port;
+	data.num = old_num; 
+	data.turn_on = 0;

+
+   for (i = 0; i < smp_cpus; ++i)
+   kvm_ioperm(qemu_kvm_cpu_env(i), &data);
  


How does this interact with VCPU hot-plug?


+}
+
+data.start_port = region->u.r_baseport;
+data.num = size;
+data.turn_on = 1;
+ 
+for (i = 0; i < smp_cpus; ++i)

+   kvm_ioperm(qemu_kvm_cpu_env(i), &data);
+ 
+register_ioport_read(addr, size, 1, assigned_dev_ioport_readb,

+ (r_dev->v_addrs + region_num));
+register_ioport_read(addr, size, 2, assigned_dev_ioport_readw,
+ (r_dev->v_addrs + region_num));
+register_ioport_read(addr, size, 4, assigned_dev_ioport_readl,
+ (r_dev->v_addrs + region_num));
+register_ioport_write(addr, size, 1, assigned_dev_ioport_writeb,
+  (r_dev->v_addrs + region_num));
+register_ioport_write(addr, size, 2, assigned_dev_ioport_writew,
+  (r_dev->v_addrs + region_num));
+register_ioport_write(addr, size, 4, assigned_dev_ioport_writel,
+  (r_dev->v_addrs + region_num));
+}
+
+static void assigned_dev_pci_write_config(PCIDevice *d, uint32_t address,
+  uint32_t val, int len)
+{
+int fd;
+ssize_t ret;
+
+DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n",
+  ((d->devfn >> 3) & 0x1F), (d->devfn & 0x7),
+  (uint16_t) address, val, len);
+
+if (address == 0x4) {
+pci_default_write_config(d, address, val, len);
+/* Continue to program the card */
+}
+
+if ((addre

Re: [PATCH 3/6] qemu: piix: Introduce functions to get pin number from irq and vice versa

2008-10-28 Thread Muli Ben-Yehuda
On Tue, Oct 28, 2008 at 12:46:39PM +0200, Avi Kivity wrote:
> Muli Ben-Yehuda wrote:
>> On Sun, Oct 26, 2008 at 03:31:24PM +0200, Avi Kivity wrote:
>>   
>>> Amit Shah wrote:
>>> 
  +int piix3_get_pin(int pic_irq)
 +{
 +int i;
 +for (i = 0; i < 4; i++)
 +if (piix3_dev->config[0x60+i] == pic_irq)
 +return i;
 +return -1;
 +}
 
>>> What happens if two pci interrupts are routed to one irq line?
>>> 
>>
>> This one I'm still thinking about.
>>   
>
> Well, what is this needed for in the first place?

This specific function is not used. I assume Amit added it for
completeness with piix_get_irq. piix_get_irq, as far as I can tell, is
used in only one place (when the guest updates a device's
configuration space interrupt register) to go from interrupt pin
(intx) to guest IRQ line.

Cheers,
Muli
-- 
The First Workshop on I/O Virtualization (WIOV '08)
Dec 2008, San Diego, CA, http://www.usenix.org/wiov08/
   <->
SYSTOR 2009---The Israeli Experimental Systems Conference
http://www.haifa.il.ibm.com/conferences/systor2009/
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 5/6] device assignment: support for assigning PCI devices to guests

2008-10-28 Thread Muli Ben-Yehuda
On Tue, Oct 28, 2008 at 10:10:07PM +0800, Han, Weidong wrote:

> > +DEBUG("r_pio=%08x e_physbase=%08x r_virtbase=%08lx value=%08x\n",
> > + r_pio, (int)r_access->e_physbase,
> > + (unsigned long)r_access->r_virtbase, value);
> 
> should be (unsigned long)r_access->u.r_virtbase

Thanks, actually it should be u.r_baseport for IO ports and there were
a number of other bogosities there too. Here's a quick incremental
patch compiled with DEBUG() enabled.

>From 9b917528647b55a1046a5a19d9e2427bb2d86db7 Mon Sep 17 00:00:00 2001
From: Muli Ben-Yehuda <[EMAIL PROTECTED]>
Date: Tue, 28 Oct 2008 17:30:30 +0200
Subject: [PATCH 1/1] fix DEBUG statements

(thanks to Weidong Han for spotting)

Signed-off-by: Muli Ben-Yehuda <[EMAIL PROTECTED]>
---
 qemu/hw/device-assignment.c |   32 
 1 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/qemu/hw/device-assignment.c b/qemu/hw/device-assignment.c
index 89b05f9..8b56599 100644
--- a/qemu/hw/device-assignment.c
+++ b/qemu/hw/device-assignment.c
@@ -63,9 +63,9 @@ static void assigned_dev_ioport_writeb(void *opaque, uint32_t 
addr,
 AssignedDevRegion *r_access = opaque;
 uint32_t r_pio = guest_to_host_ioport(r_access, addr);
 
-DEBUG("r_pio=%08x e_physbase=%08x r_virtbase=%08lx value=%08x\n",
+DEBUG("r_pio=%08x e_physbase=%08x r_baseport=%08lx value=%08x\n",
  r_pio, (int)r_access->e_physbase,
- (unsigned long)r_access->r_virtbase, value);
+ (unsigned long)r_access->u.r_baseport, value);
 
 outb(value, r_pio);
 }
@@ -76,9 +76,9 @@ static void assigned_dev_ioport_writew(void *opaque, uint32_t 
addr,
 AssignedDevRegion *r_access = opaque;
 uint32_t r_pio = guest_to_host_ioport(r_access, addr);
 
-DEBUG("r_pio=%08x e_physbase=%08x r_virtbase=%08lx value=%08x\n",
-  __func__, r_pio, (int)r_access->e_physbase,
-  (unsigned long)r_access->r_virtbase, value);
+DEBUG("r_pio=%08x e_physbase=%08x r_baseport=%08lx value=%08x\n",
+  r_pio, (int)r_access->e_physbase,
+ (unsigned long)r_access->u.r_baseport, value);
 
 outw(value, r_pio);
 }
@@ -89,9 +89,9 @@ static void assigned_dev_ioport_writel(void *opaque, uint32_t 
addr,
 AssignedDevRegion *r_access = opaque;
 uint32_t r_pio = guest_to_host_ioport(r_access, addr);
 
-DEBUG("%s: r_pio=%08x e_physbase=%08x r_virtbase=%08lx value=%08x\n",
+DEBUG("r_pio=%08x e_physbase=%08x r_baseport=%08lx value=%08x\n",
  r_pio, (int)r_access->e_physbase,
-  (unsigned long)r_access->r_virtbase, value);
+  (unsigned long)r_access->u.r_baseport, value);
 
 outl(value, r_pio);
 }
@@ -104,9 +104,9 @@ static uint32_t assigned_dev_ioport_readb(void *opaque, 
uint32_t addr)
 
 value = inb(r_pio);
 
-DEBUG("r_pio=%08x e_physbase=%08x r_virtbase=%08lx value=%08x\n",
+DEBUG("r_pio=%08x e_physbase=%08x r_=%08lx value=%08x\n",
   r_pio, (int)r_access->e_physbase,
-  (unsigned long)r_access->r_virtbase, value);
+  (unsigned long)r_access->u.r_baseport, value);
 
 return value;
 }
@@ -119,9 +119,9 @@ static uint32_t assigned_dev_ioport_readw(void *opaque, 
uint32_t addr)
 
 value = inw(r_pio);
 
-DEBUG("r_pio=%08x e_physbase=%08x r_virtbase=%08lx value=%08x\n",
+DEBUG("r_pio=%08x e_physbase=%08x r_baseport=%08lx value=%08x\n",
   r_pio, (int)r_access->e_physbase,
- (unsigned long)r_access->r_virtbase, value);
+ (unsigned long)r_access->u.r_baseport, value);
 
 return value;
 }
@@ -134,9 +134,9 @@ static uint32_t assigned_dev_ioport_readl(void *opaque, 
uint32_t addr)
 
 value = inl(r_pio);
 
-DEBUG("r_pio=%08x e_physbase=%08x r_virtbase=%08lx value=%08x\n",
+DEBUG("r_pio=%08x e_physbase=%08x r_baseport=%08lx value=%08x\n",
   r_pio, (int)r_access->e_physbase,
-  (unsigned long)r_access->r_virtbase, value);
+  (unsigned long)r_access->u.r_baseport, value);
 
 return value;
 }
@@ -151,8 +151,8 @@ static void assigned_dev_iomem_map(PCIDevice *pci_dev, int 
region_num,
 int first_map = (region->e_size == 0);
 int ret = 0;
 
-DEBUG("e_phys=%08x r_virt=%x type=%d len=%08x region_num=%d \n",
-  e_phys, (uint32_t)region->r_virtbase, type, e_size, region_num);
+DEBUG("e_phys=%08x r_virt=%p type=%d len=%08x region_num=%d \n",
+  e_phys, region->u.r_virtbase, type, e_size, region_num);
 
 region->e_physbase = e_phys;
 region->e_size = e_size;
@@ -425,7 +425,7 @@ again:
 rp->valid = 1;
 rp->base_addr = start;
 rp->size = size;
-DEBUG("region %d size %d start 0x%x type %d resource_fd %d\n",
+DEBUG("region %d size %d start 0x%llx type %d resource_fd %d\n",
   r, rp->size, start, rp->type, rp->resource_fd);
 }
 fclose(f);
-- 
1.5.6.5

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

RE: [PATCH 5/6] device assignment: support for assigning PCI devices to guests

2008-10-28 Thread Han, Weidong
[EMAIL PROTECTED] wrote:
> From: Muli Ben-Yehuda <[EMAIL PROTECTED]>
> 
> This patch has been contributed to by the following people:
> 
> Or Sagi <[EMAIL PROTECTED]>
> Nir Peleg <[EMAIL PROTECTED]>
> Amit Shah <[EMAIL PROTECTED]>
> Ben-Ami Yassour <[EMAIL PROTECTED]>
> Weidong Han <[EMAIL PROTECTED]>
> Glauber de Oliveira Costa <[EMAIL PROTECTED]>
> Muli Ben-Yehuda <[EMAIL PROTECTED]>
> 
> With this patch, we can assign a device on the host machine to a
> guest.
> 
> A new command-line option, -pcidevice is added.
> To invoke it for a device sitting at PCI bus:dev.fn 04:08.0, use this:
> 
> -pcidevice host=04:08.0
> 
> * The host driver for the device, if any, is to be removed before
> assigning the device (else device assignment will fail).
> 
> * A device that shares IRQ with another host device cannot currently
> be assigned.
> 
> * The RAW_IO capability is needed for this to work
> 
> This works only with the in-kernel irqchip method; to use the
> userspace irqchip, a kernel module (irqhook) and some extra changes
> are needed.
> 
> [muli: lots of small fixes from Muli and Weidong Han addressing all v7
> review comments]
> 
> Signed-off-by: Amit Shah <[EMAIL PROTECTED]>
> Signed-off-by: Muli Ben-Yehuda <[EMAIL PROTECTED]>
> ---
>  qemu/Makefile.target|3 +
>  qemu/hw/device-assignment.c |  641
>  +++
>  qemu/hw/device-assignment.h |  117  qemu/hw/pc.c
>  |   16 + qemu/hw/pci.c   |7 +
>  qemu/qemu-kvm.c |   14 +
>  qemu/qemu-kvm.h |8 +
>  qemu/vl.c   |   28 ++
>  8 files changed, 834 insertions(+), 0 deletions(-)
>  create mode 100644 qemu/hw/device-assignment.c
>  create mode 100644 qemu/hw/device-assignment.h
> 
> diff --git a/qemu/qemu-kvm.c b/qemu/qemu-kvm.c
> index c5f3f29..5e66832 100644
> --- a/qemu/qemu-kvm.c
> +++ b/qemu/qemu-kvm.c
> @@ -20,6 +20,7 @@ int kvm_pit = 1;
>  #include "console.h"
>  #include "block.h"
>  #include "compatfd.h"
> +#include "hw/device-assignment.h"

It's not necessary.

> 
>  #include "qemu-kvm.h"
>  #include 
> @@ -27,6 +28,7 @@ int kvm_pit = 1;
>  #include 
>  #include 
>  #include 
> +#include 

It's not necessary.

Regards,
Weidong

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [PATCH 5/6] device assignment: support for assigning PCI devices to guests

2008-10-28 Thread Han, Weidong
Han, Weidong wrote:
> [EMAIL PROTECTED] wrote:
>> From: Muli Ben-Yehuda <[EMAIL PROTECTED]>
>> 
>> This patch has been contributed to by the following people:
>> 
>> Or Sagi <[EMAIL PROTECTED]>
>> Nir Peleg <[EMAIL PROTECTED]>
>> Amit Shah <[EMAIL PROTECTED]>
>> Ben-Ami Yassour <[EMAIL PROTECTED]>
>> Weidong Han <[EMAIL PROTECTED]>
>> Glauber de Oliveira Costa <[EMAIL PROTECTED]>
>> Muli Ben-Yehuda <[EMAIL PROTECTED]>
>> 
>> With this patch, we can assign a device on the host machine to a
>> guest. 
>> 
>> A new command-line option, -pcidevice is added.
>> To invoke it for a device sitting at PCI bus:dev.fn 04:08.0, use
>> this: 
>> 
>> -pcidevice host=04:08.0
>> 
>> * The host driver for the device, if any, is to be removed before
>> assigning the device (else device assignment will fail).
>> 
>> * A device that shares IRQ with another host device cannot currently
>> be assigned. 
>> 
>> * The RAW_IO capability is needed for this to work
>> 
>> This works only with the in-kernel irqchip method; to use the
>> userspace irqchip, a kernel module (irqhook) and some extra changes
>> are needed. 
>> 
>> [muli: lots of small fixes from Muli and Weidong Han addressing all
>> v7 review comments] 
>> 
>> Signed-off-by: Amit Shah <[EMAIL PROTECTED]>
>> Signed-off-by: Muli Ben-Yehuda <[EMAIL PROTECTED]>
>> ---
>>  qemu/Makefile.target|3 +
>>  qemu/hw/device-assignment.c |  641
>>  +++
>>  qemu/hw/device-assignment.h |  117  qemu/hw/pc.c
>>  |   16 + qemu/hw/pci.c   |7 +
>>  qemu/qemu-kvm.c |   14 +
>>  qemu/qemu-kvm.h |8 +
>>  qemu/vl.c   |   28 ++
>>  8 files changed, 834 insertions(+), 0 deletions(-)
>>  create mode 100644 qemu/hw/device-assignment.c
>>  create mode 100644 qemu/hw/device-assignment.h
>> 
>> diff --git a/qemu/Makefile.target b/qemu/Makefile.target
>> index d9bdeca..5d44e08 100644
>> --- a/qemu/Makefile.target
>> +++ b/qemu/Makefile.target
>> @@ -621,6 +621,9 @@ OBJS+= ide.o pckbd.o ps2.o vga.o $(SOUND_HW)
>>  dma.o OBJS+= fdc.o mc146818rtc.o serial.o i8259.o i8254.o pcspk.o
>>  pc.o OBJS+= cirrus_vga.o apic.o parallel.o acpi.o piix_pci.o
>>  OBJS+= usb-uhci.o vmmouse.o vmport.o vmware_vga.o extboot.o +ifeq
>> ($(USE_KVM), 1) +OBJS+= device-assignment.o
>> +endif
>>  ifeq ($(USE_KVM_PIT), 1)
>>  OBJS+= i8254-kvm.o
>>  endif
>> diff --git a/qemu/hw/device-assignment.c
>> b/qemu/hw/device-assignment.c new file mode 100644 index
>> 000..89b05f9 --- /dev/null
>> +++ b/qemu/hw/device-assignment.c
>> @@ -0,0 +1,641 @@
>> +/*
>> + * Copyright (c) 2007, Neocleus Corporation.
>> + *
>> + * This program is free software; you can redistribute it and/or
>> modify it + * under the terms and conditions of the GNU General
>> Public License, + * version 2, as published by the Free Software
>> Foundation. + * + * This program is distributed in the hope it will
>> be useful, but WITHOUT + * ANY WARRANTY; without even the implied
>> warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE.
>> See the GNU General Public License for + * more details.
>> + *
>> + * You should have received a copy of the GNU General Public License
>> along with + * this program; if not, write to the Free Software
>> Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA
>> 02111-1307 USA. + * + *
>> + *  Assign a PCI device from the host to a guest VM. + *
>> + *  Adapted for KVM by Qumranet.
>> + *
>> + *  Copyright (c) 2007, Neocleus, Alex Novik ([EMAIL PROTECTED])
>> + *  Copyright (c) 2007, Neocleus, Guy Zana ([EMAIL PROTECTED])
>> + *  Copyright (C) 2008, Qumranet, Amit Shah ([EMAIL PROTECTED])
>> + *  Copyright (C) 2008, Red Hat, Amit Shah ([EMAIL PROTECTED])
>> + *  Copyright (C) 2008, IBM, Muli Ben-Yehuda ([EMAIL PROTECTED]) + */
>> +#include 
>> +#include 
>> +#include "qemu-kvm.h"
>> +#include "hw.h"
>> +#include "pc.h"
>> +#include "sysemu.h"
>> +#include "console.h"
>> +#include "device-assignment.h"
>> +
>> +/* From linux/ioport.h */
>> +#define IORESOURCE_IO   0x0100  /* Resource type */
>> +#define IORESOURCE_MEM  0x0200
>> +#define IORESOURCE_IRQ  0x0400
>> +#define IORESOURCE_DMA  0x0800
>> +#define IORESOURCE_PREFETCH 0x1000  /* No side effects */ +
>> +/* #define DEVICE_ASSIGNMENT_DEBUG 1 */
>> +
>> +#ifdef DEVICE_ASSIGNMENT_DEBUG
>> +#define DEBUG(fmt, ...)   \
>> +do {  \
>> +  fprintf(stderr, "%s: " fmt, __func__ , __VA_ARGS__);\ +  
>> } while (0) +#else
>> +#define DEBUG(fmt, ...) do { } while(0)
>> +#endif
>> +
>> +static uint32_t guest_to_host_ioport(AssignedDevRegion *region,
>> uint32_t addr) +{ +return region->u.r_baseport + (addr -
>> region->e_physbase); +} +
>> +static void assigned_dev_ioport_writeb(void *opaque, uint32_t addr,
>> +   uint32_t value) +{
>> +AssignedDevRegion *r_acces

[GIT PULL] KVM fixes for 2.6.28-rc2

2008-10-28 Thread Avi Kivity
Linus, please pull from the repo and branch @

  git://git.kernel.org/pub/scm/linux/kernel/git/avi/kvm.git kvm-updates/2.6.28

To receive a bunch of kvm fixes.  Most notably, fix the paravirt mmu bug
that prevents booting recent Linux kernels on kvm.

Shortlog, diffstat:

Avi Kivity (1):
  KVM: Future-proof device assignment ABI

Marcelo Tosatti (1):
  KVM: MMU: sync root on paravirt TLB flush

Sheng Yang (1):
  KVM: Fix guest shared interrupt with in-kernel irqchip

Xiantao Zhang (2):
  KVM: ia64: Fix halt emulation logic
  KVM: ia64: Makefile fix for forcing to re-generate asm-offsets.h

 arch/ia64/include/asm/kvm_host.h |6 ++-
 arch/ia64/kvm/Makefile   |8 +++-
 arch/ia64/kvm/kvm-ia64.c |   80 +++--
 arch/ia64/kvm/kvm_fw.c   |9 +++-
 arch/ia64/kvm/process.c  |2 +-
 arch/x86/include/asm/kvm_host.h  |3 +
 arch/x86/kvm/i8254.c |   11 -
 arch/x86/kvm/i8254.h |1 +
 arch/x86/kvm/mmu.c   |1 +
 arch/x86/kvm/x86.c   |6 ++-
 include/linux/kvm.h  |6 +++
 include/linux/kvm_host.h |7 +++-
 virt/kvm/irq_comm.c  |   42 ++-
 virt/kvm/kvm_main.c  |   12 --
 14 files changed, 138 insertions(+), 56 deletions(-)
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 3/6] qemu: piix: Introduce functions to get pin number from irq and vice versa

2008-10-28 Thread Avi Kivity

Muli Ben-Yehuda wrote:

On Sun, Oct 26, 2008 at 03:31:24PM +0200, Avi Kivity wrote:
  

Amit Shah wrote:


 +int piix3_get_pin(int pic_irq)
+{
+int i;
+for (i = 0; i < 4; i++)
+if (piix3_dev->config[0x60+i] == pic_irq)
+return i;
+return -1;
+}
  
  

What happens if two pci interrupts are routed to one irq line?



This one I'm still thinking about.
  


Well, what is this needed for in the first place?

--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [PATCH 5/6] device assignment: support for assigning PCI devices to guests

2008-10-28 Thread Han, Weidong
[EMAIL PROTECTED] wrote:
> From: Muli Ben-Yehuda <[EMAIL PROTECTED]>
> 
> This patch has been contributed to by the following people:
> 
> Or Sagi <[EMAIL PROTECTED]>
> Nir Peleg <[EMAIL PROTECTED]>
> Amit Shah <[EMAIL PROTECTED]>
> Ben-Ami Yassour <[EMAIL PROTECTED]>
> Weidong Han <[EMAIL PROTECTED]>
> Glauber de Oliveira Costa <[EMAIL PROTECTED]>
> Muli Ben-Yehuda <[EMAIL PROTECTED]>
> 
> With this patch, we can assign a device on the host machine to a
> guest.
> 
> A new command-line option, -pcidevice is added.
> To invoke it for a device sitting at PCI bus:dev.fn 04:08.0, use this:
> 
> -pcidevice host=04:08.0
> 
> * The host driver for the device, if any, is to be removed before
> assigning the device (else device assignment will fail).
> 
> * A device that shares IRQ with another host device cannot currently
> be assigned.
> 
> * The RAW_IO capability is needed for this to work
> 
> This works only with the in-kernel irqchip method; to use the
> userspace irqchip, a kernel module (irqhook) and some extra changes
> are needed.
> 
> [muli: lots of small fixes from Muli and Weidong Han addressing all v7
> review comments]
> 
> Signed-off-by: Amit Shah <[EMAIL PROTECTED]>
> Signed-off-by: Muli Ben-Yehuda <[EMAIL PROTECTED]>
> ---
>  qemu/Makefile.target|3 +
>  qemu/hw/device-assignment.c |  641
>  +++
>  qemu/hw/device-assignment.h |  117  qemu/hw/pc.c
>  |   16 + qemu/hw/pci.c   |7 +
>  qemu/qemu-kvm.c |   14 +
>  qemu/qemu-kvm.h |8 +
>  qemu/vl.c   |   28 ++
>  8 files changed, 834 insertions(+), 0 deletions(-)
>  create mode 100644 qemu/hw/device-assignment.c
>  create mode 100644 qemu/hw/device-assignment.h
> 
> diff --git a/qemu/Makefile.target b/qemu/Makefile.target
> index d9bdeca..5d44e08 100644
> --- a/qemu/Makefile.target
> +++ b/qemu/Makefile.target
> @@ -621,6 +621,9 @@ OBJS+= ide.o pckbd.o ps2.o vga.o $(SOUND_HW) dma.o
>  OBJS+= fdc.o mc146818rtc.o serial.o i8259.o i8254.o pcspk.o pc.o
>  OBJS+= cirrus_vga.o apic.o parallel.o acpi.o piix_pci.o
>  OBJS+= usb-uhci.o vmmouse.o vmport.o vmware_vga.o extboot.o
> +ifeq ($(USE_KVM), 1)
> +OBJS+= device-assignment.o
> +endif
>  ifeq ($(USE_KVM_PIT), 1)
>  OBJS+= i8254-kvm.o
>  endif
> diff --git a/qemu/hw/device-assignment.c b/qemu/hw/device-assignment.c
> new file mode 100644
> index 000..89b05f9
> --- /dev/null
> +++ b/qemu/hw/device-assignment.c
> @@ -0,0 +1,641 @@
> +/*
> + * Copyright (c) 2007, Neocleus Corporation.
> + *
> + * This program is free software; you can redistribute it and/or
> modify it + * under the terms and conditions of the GNU General
> Public License, + * version 2, as published by the Free Software
> Foundation. + *
> + * This program is distributed in the hope it will be useful, but
> WITHOUT + * ANY WARRANTY; without even the implied warranty of
> MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> General Public License for + * more details.
> + *
> + * You should have received a copy of the GNU General Public License
> along with + * this program; if not, write to the Free Software
> Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA
> 02111-1307 USA. + *
> + *
> + *  Assign a PCI device from the host to a guest VM.
> + *
> + *  Adapted for KVM by Qumranet.
> + *
> + *  Copyright (c) 2007, Neocleus, Alex Novik ([EMAIL PROTECTED])
> + *  Copyright (c) 2007, Neocleus, Guy Zana ([EMAIL PROTECTED])
> + *  Copyright (C) 2008, Qumranet, Amit Shah ([EMAIL PROTECTED])
> + *  Copyright (C) 2008, Red Hat, Amit Shah ([EMAIL PROTECTED])
> + *  Copyright (C) 2008, IBM, Muli Ben-Yehuda ([EMAIL PROTECTED])
> + */
> +#include 
> +#include 
> +#include "qemu-kvm.h"
> +#include "hw.h"
> +#include "pc.h"
> +#include "sysemu.h"
> +#include "console.h"
> +#include "device-assignment.h"
> +
> +/* From linux/ioport.h */
> +#define IORESOURCE_IO   0x0100  /* Resource type */
> +#define IORESOURCE_MEM  0x0200
> +#define IORESOURCE_IRQ  0x0400
> +#define IORESOURCE_DMA  0x0800
> +#define IORESOURCE_PREFETCH 0x1000  /* No side effects */
> +
> +/* #define DEVICE_ASSIGNMENT_DEBUG 1 */
> +
> +#ifdef DEVICE_ASSIGNMENT_DEBUG
> +#define DEBUG(fmt, ...)   \
> +do {  \
> +  fprintf(stderr, "%s: " fmt, __func__ , __VA_ARGS__);\
> +} while (0)
> +#else
> +#define DEBUG(fmt, ...) do { } while(0)
> +#endif
> +
> +static uint32_t guest_to_host_ioport(AssignedDevRegion *region,
> uint32_t addr) +{
> +return region->u.r_baseport + (addr - region->e_physbase);
> +}
> +
> +static void assigned_dev_ioport_writeb(void *opaque, uint32_t addr,
> +   uint32_t value)
> +{
> +AssignedDevRegion *r_access = opaque;
> +uint32_t r_pio = guest_to_host_ioport(r_access, addr);
> +
> +DEBUG("r_pio=%08x e_physbase=%0

Re: [PATCH] x86 emulator: move skip_emulated_instruction()

2008-10-28 Thread Avi Kivity

Guillaume Thouvenin wrote:

 If we call the emulator we shouldn't call skip_emulated_instruction()
in the first place, since the emulator already computes the next rip
for us. Thus we move ->skip_emulated_instruction() out of
kvm_emulate_pio() and into handle_io() (and the svm equivalent). We
also replaced "return 0" by "break" in the "do_io:" case because now
the shadow register state needs to be committed. Otherwise eip will never
be updated.
  


Applied, thanks.

--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/6] KVM/userspace: Device Assignment: Add ioctl wrappers needed for assigning devices

2008-10-28 Thread Muli Ben-Yehuda
On Sun, Oct 26, 2008 at 03:29:19PM +0200, Avi Kivity wrote:
> Amit Shah wrote:
>>  +#ifdef KVM_CAP_DEVICE_ASSIGNMENT
>> +int kvm_assign_pci_device(kvm_context_t kvm,
>> +  struct kvm_assigned_pci_dev *assigned_dev)
>> +{
>> +return ioctl(kvm->vm_fd, KVM_ASSIGN_PCI_DEVICE, assigned_dev);
>>   
>
> Convert -1s to -errno, to avoid problems with errno being
> overwritten later.

Done.

Cheers,
Muli
-- 
The First Workshop on I/O Virtualization (WIOV '08)
Dec 2008, San Diego, CA, http://www.usenix.org/wiov08/
   <->
SYSTOR 2009---The Israeli Experimental Systems Conference
http://www.haifa.il.ibm.com/conferences/systor2009/
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [v7] Userspace patches for PCI device assignment

2008-10-28 Thread Muli Ben-Yehuda
On Fri, Oct 24, 2008 at 10:59:58AM -0500, Anthony Liguori wrote:
> Amit Shah wrote:
>> This patchset enables device assignment for KVM hosts for PCI devices. It 
>> uses the Intel IOMMU by default if available.
>>
>> Major changes since the last send in no particular order:
>> - formatting changes: adhere to qemu style
>> - use strncmp, strncpy etc. instead of the insecure ones
>>   
>
> FWIW, strncpy almost never does what you expect it to.  snprintf()
> is much nicer.

Fixed all over. If you find a stray strncpy, shoot it.

Cheers,
Muli
-- 
The First Workshop on I/O Virtualization (WIOV '08)
Dec 2008, San Diego, CA, http://www.usenix.org/wiov08/
   <->
SYSTOR 2009---The Israeli Experimental Systems Conference
http://www.haifa.il.ibm.com/conferences/systor2009/
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 5/6] KVM/userspace: Device Assignment: Support for assigning PCI devices to guests

2008-10-28 Thread Muli Ben-Yehuda
On Mon, Oct 27, 2008 at 02:32:48PM +0800, Han, Weidong wrote:

> Yes, it's buggy. It should like:
> 
> uint32_t old_ephys = region->e_physbase;
> uint32_t old_esize = region->e_size;
> 
> ...
> 
> kvm_destroy_phys_mem(kvm_context, old_ephys, old_esize);

Fixed in v8. Thanks!

Cheers,
Muli
-- 
The First Workshop on I/O Virtualization (WIOV '08)
Dec 2008, San Diego, CA, http://www.usenix.org/wiov08/
   <->
SYSTOR 2009---The Israeli Experimental Systems Conference
http://www.haifa.il.ibm.com/conferences/systor2009/
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 3/6] qemu: piix: Introduce functions to get pin number from irq and vice versa

2008-10-28 Thread Muli Ben-Yehuda
On Sun, Oct 26, 2008 at 03:31:24PM +0200, Avi Kivity wrote:
> Amit Shah wrote:
>>  +int piix3_get_pin(int pic_irq)
>> +{
>> +int i;
>> +for (i = 0; i < 4; i++)
>> +if (piix3_dev->config[0x60+i] == pic_irq)
>> +return i;
>> +return -1;
>> +}
>>   
>
> What happens if two pci interrupts are routed to one irq line?

This one I'm still thinking about.

Cheers,
Muli
-- 
The First Workshop on I/O Virtualization (WIOV '08)
Dec 2008, San Diego, CA, http://www.usenix.org/wiov08/
   <->
SYSTOR 2009---The Israeli Experimental Systems Conference
http://www.haifa.il.ibm.com/conferences/systor2009/
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 5/6] KVM/userspace: Device Assignment: Support for assigning PCI devices to guests

2008-10-28 Thread Muli Ben-Yehuda
On Fri, Oct 24, 2008 at 11:22:48AM -0500, Anthony Liguori wrote:

> Amit Shah wrote:

>> +#include 
>>   
>
> Is this header really necessary?

No, removed.

>
>> +#include "device-assignment.h"
>> +
>> +/* From linux/ioport.h */
>> +#define IORESOURCE_IO   0x0100  /* Resource type */
>> +#define IORESOURCE_MEM  0x0200
>> +#define IORESOURCE_IRQ  0x0400
>> +#define IORESOURCE_DMA  0x0800
>> +#define IORESOURCE_PREFETCH 0x1000  /* No side effects */
>> +
>> +/* #define DEVICE_ASSIGNMENT_DEBUG 1 */
>> +
>> +#ifdef DEVICE_ASSIGNMENT_DEBUG
>> +#define DEBUG(fmt, args...)   \
>>   
>
> Please use C99 style varidacs.

Done.

>
>> +do {  \
>> +  fprintf(stderr, "%s: " fmt, __func__ , ## args);\
>> +} while (0)
>> +#else
>> +#define DEBUG(fmt, args...) do { } while(0)
>> +#endif
>> +
>> +static void assigned_dev_ioport_writeb(void *opaque, uint32_t addr,
>> +   uint32_t value)
>> +{
>> +AssignedDevRegion *r_access = (AssignedDevRegion *)opaque;
>>   
>
> Cast is unnecessary.

Removed.

>
>> +uint32_t r_pio = (unsigned long)r_access->r_virtbase
>> ++ (addr - r_access->e_physbase);
>>   
>
> It would be nice to make this a function to make it more obvious that you 
> were translated from guest to host regions.  The cast to unsigned long 
> should probably be target_ulong too.

Done.

>
>> +DEBUG(stderr, "%s: r_pio=%08x e_physbase=%08x"
>> +  " r_virtbase=%08lx value=%08x\n",
>> +  __func__, r_pio, (int)r_access->e_physbase,
>> +  (unsigned long)r_access->r_virtbase, value);
>>   
>
> This debug statement looks wrong to me.  You're passing stderr.
> It's true for all of these functions.

Fixed.

>
>> +static void assigned_dev_iomem_map(PCIDevice *pci_dev, int region_num,
>> +   uint32_t e_phys, uint32_t e_size, int 
>> type)
>> +{
>> +AssignedDevice *r_dev = (AssignedDevice *) pci_dev;
>> +AssignedDevRegion *region = &r_dev->v_addrs[region_num];
>> +int first_map = (region->e_size == 0);
>> +int ret = 0;
>> +
>> +DEBUG("%s: e_phys=%08x r_virt=%x type=%d len=%08x region_num=%d \n",
>> +  __func__, e_phys, (uint32_t)region->r_virtbase, type, e_size,
>> +  region_num);
>>   
>
> You already have __func__ in your debug printf().

Fixed.

>
>> +region->e_physbase = e_phys;
>> +region->e_size = e_size;
>> +
>> +/* FIXME: Add support for emulated MMIO for non-kvm guests */
>> +if (kvm_enabled()) {
>>   
>
> I don't think having a kvm_enabled() check here is very useful.  I
> think device-assignment.c should be conditional on USE_KVM, and the
> only kvm_enabled() check should be when creating the initial device
> assignment.  Practically speaking, QEMU is never going to support
> device assignment outside of the context of KVM because I strongly
> doubt anything like irqhook will make it upstream.

Reworked along your suggestions, please let me know if you have
further comments.

>> +if (!first_map)
>> +kvm_destroy_phys_mem(kvm_context, e_phys, e_size);
>> +if (e_size > 0)
>> +ret = kvm_register_phys_mem(kvm_context, e_phys,
>> +region->r_virtbase, e_size, 0);
>> +if (ret != 0)
>> +fprintf(stderr, "%s: Error: create new mapping failed\n", 
>> __func__);
>>   
>
> If we do get an error here, we shouldn't keep going.  This error is
> probably going to happen in practice if a guest tries to pass
> through too many devices and we run out of slots.

Fixed, we exit(1) now (is there a more graceful way to bail out?).

>> +}
>> +}
>> +
>> +static void assigned_dev_ioport_map(PCIDevice *pci_dev, int region_num,
>> +uint32_t addr, uint32_t size, int 
>> type)
>> +{
>> +AssignedDevice *r_dev = (AssignedDevice *) pci_dev;
>> +AssignedDevRegion *region = &r_dev->v_addrs[region_num];
>> +int r;
>> +
>> +region->e_physbase = addr;
>> +region->e_size = size;
>> +
>> +DEBUG("%s: e_phys=0x%x r_virt=%x type=0x%x len=%d region_num=%d \n",
>> +  __func__, addr, (uint32_t)region->r_virtbase, type, size, 
>> region_num);
>>   
>
> Need to fix this DEBUG().

Fixed.

>
>> +r = ioperm((uint32_t)region->r_virtbase, size, 1);
>>   
>
> I don't think this is enough for KVM.  This will only do the ioperm
> in the thread that triggered the IO.  If you have an SMP guest,
> ioperm needs to be done on each VCPU's thread.

Fixed.

>> +if (r < 0) {
>> +perror("assigned_dev_ioport_map: ioperm");
>> +return;
>> +}
>>   
>
> Again, if we fail, we have to exit QEMU gracefully.

Fixed.

>
>> +register_ioport_read(addr, size, 1, assigned_dev_ioport_readb,
>> + (void *) (r_dev->v_addrs + region_num));
>> +register_ioport_read(addr, size, 2, assigned_dev_iop

[PATCH 1/6] device assignment: add ioctl wrappers

2008-10-28 Thread muli
From: Amit Shah <[EMAIL PROTECTED]>

[muli: return -errno instead of ioctl retval]

Signed-off-by: Amit Shah <[EMAIL PROTECTED]>
Signed-off-by: Muli Ben-Yehuda <[EMAIL PROTECTED]>
---
 libkvm/libkvm.c |   25 +
 libkvm/libkvm.h |   27 +++
 2 files changed, 52 insertions(+), 0 deletions(-)

diff --git a/libkvm/libkvm.c b/libkvm/libkvm.c
index 444b97f..e7dba8a 100644
--- a/libkvm/libkvm.c
+++ b/libkvm/libkvm.c
@@ -1112,3 +1112,28 @@ int kvm_unregister_coalesced_mmio(kvm_context_t kvm, 
uint64_t addr, uint32_t siz
return -ENOSYS;
 }
 
+#ifdef KVM_CAP_DEVICE_ASSIGNMENT
+int kvm_assign_pci_device(kvm_context_t kvm,
+ struct kvm_assigned_pci_dev *assigned_dev)
+{
+   int ret;
+
+   ret = ioctl(kvm->vm_fd, KVM_ASSIGN_PCI_DEVICE, assigned_dev);
+   if (ret < 0)
+   return -errno;
+
+   return ret;
+}
+
+int kvm_assign_irq(kvm_context_t kvm,
+  struct kvm_assigned_irq *assigned_irq)
+{
+   int ret;
+
+   ret = ioctl(kvm->vm_fd, KVM_ASSIGN_IRQ, assigned_irq);
+   if (ret < 0)
+   return -errno;
+
+   return ret;
+}
+#endif
diff --git a/libkvm/libkvm.h b/libkvm/libkvm.h
index 423ce31..53d67f2 100644
--- a/libkvm/libkvm.h
+++ b/libkvm/libkvm.h
@@ -686,4 +686,31 @@ int kvm_s390_interrupt(kvm_context_t kvm, int slot,
 int kvm_s390_set_initial_psw(kvm_context_t kvm, int slot, psw_t psw);
 int kvm_s390_store_status(kvm_context_t kvm, int slot, unsigned long addr);
 #endif
+
+#ifdef KVM_CAP_DEVICE_ASSIGNMENT
+/*!
+ * \brief Notifies host kernel about a PCI device to be assigned to a guest
+ *
+ * Used for PCI device assignment, this function notifies the host
+ * kernel about the assigning of the physical PCI device to a guest.
+ *
+ * \param kvm Pointer to the current kvm_context
+ * \param assigned_dev Parameters, like bus, devfn number, etc
+ */
+int kvm_assign_pci_device(kvm_context_t kvm,
+ struct kvm_assigned_pci_dev *assigned_dev);
+
+/*!
+ * \brief Notifies host kernel about changes to IRQ for an assigned device
+ *
+ * Used for PCI device assignment, this function notifies the host
+ * kernel about the changes in IRQ number for an assigned physical
+ * PCI device.
+ *
+ * \param kvm Pointer to the current kvm_context
+ * \param assigned_irq Parameters, like dev id, host irq, guest irq, etc
+ */
+int kvm_assign_irq(kvm_context_t kvm,
+  struct kvm_assigned_irq *assigned_irq);
+#endif
 #endif
-- 
1.5.6.5

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 4/6] device assignment: build vtd.c for Intel IOMMU support

2008-10-28 Thread muli
From: Amit Shah <[EMAIL PROTECTED]>

Signed-off-by: Amit Shah <[EMAIL PROTECTED]>
Signed-off-by: Muli Ben-Yehuda <[EMAIL PROTECTED]>
---
 kernel/x86/Kbuild |3 +++
 1 files changed, 3 insertions(+), 0 deletions(-)

diff --git a/kernel/x86/Kbuild b/kernel/x86/Kbuild
index 2369d00..c4723b1 100644
--- a/kernel/x86/Kbuild
+++ b/kernel/x86/Kbuild
@@ -9,6 +9,9 @@ kvm-objs := kvm_main.o x86.o mmu.o x86_emulate.o 
../anon_inodes.o irq.o i8259.o
 ifeq ($(EXT_CONFIG_KVM_TRACE),y)
 kvm-objs += kvm_trace.o
 endif
+ifeq ($(CONFIG_DMAR),y)
+kvm-objs += vtd.o
+endif
 kvm-intel-objs := vmx.o vmx-debug.o ../external-module-compat.o
 kvm-amd-objs := svm.o ../external-module-compat.o
 
-- 
1.5.6.5

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/6] device assignment: introduce pci_map_irq to get irq nr from pin number

2008-10-28 Thread muli
From: Amit Shah <[EMAIL PROTECTED]>

Signed-off-by: Amit Shah <[EMAIL PROTECTED]>
Signed-off-by: Muli Ben-Yehuda <[EMAIL PROTECTED]>
---
 qemu/hw/pci.c |5 +
 qemu/hw/pci.h |1 +
 2 files changed, 6 insertions(+), 0 deletions(-)

diff --git a/qemu/hw/pci.c b/qemu/hw/pci.c
index 512dbea..c82cd20 100644
--- a/qemu/hw/pci.c
+++ b/qemu/hw/pci.c
@@ -560,6 +560,11 @@ static void pci_set_irq(void *opaque, int irq_num, int 
level)
 bus->set_irq(bus->irq_opaque, irq_num, bus->irq_count[irq_num] != 0);
 }
 
+int pci_map_irq(PCIDevice *pci_dev, int pin)
+{
+return pci_dev->bus->map_irq(pci_dev, pin);
+}
+
 /***/
 /* monitor info on PCI */
 
diff --git a/qemu/hw/pci.h b/qemu/hw/pci.h
index 60e4094..e11fbbf 100644
--- a/qemu/hw/pci.h
+++ b/qemu/hw/pci.h
@@ -81,6 +81,7 @@ void pci_register_io_region(PCIDevice *pci_dev, int 
region_num,
 uint32_t size, int type,
 PCIMapIORegionFunc *map_func);
 
+int pci_map_irq(PCIDevice *pci_dev, int pin);
 uint32_t pci_default_read_config(PCIDevice *d,
  uint32_t address, int len);
 void pci_default_write_config(PCIDevice *d,
-- 
1.5.6.5

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[v8] Userspace patches for PCI device assignment

2008-10-28 Thread muli

[v8] Userspace patches for PCI device assignment

This patchset enables device assignment for KVM hosts for PCI
devices. It uses the Intel IOMMU by default if available.

Changes from v7->v8 in no particular order:

- various formatting fixes, DEBUG cleanups, cast removals, etc.
- s/strncpy/snprintf/
- split initialization in two phases per aliguori's suggestion
- bail out on errors when we can't limp on
- do ioperm on every cpu and vcpu (Weidong Han)
- use pwrite/pread where applicable
- split r_virtbase into different fields for memory and IO
- fix destruction of MMIO regions (Disheng Su and Weidong Han)

Changes from v6->v7 in no particular order:

- formatting changes: adhere to qemu style
- use strncmp, strncpy etc. instead of the insecure ones
- move from array to linked list
- change iopl() to ioperm() (Weidong Han)
- other small changes as suggested during the review of v6.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 5/6] device assignment: support for assigning PCI devices to guests

2008-10-28 Thread muli
From: Muli Ben-Yehuda <[EMAIL PROTECTED]>

This patch has been contributed to by the following people:

Or Sagi <[EMAIL PROTECTED]>
Nir Peleg <[EMAIL PROTECTED]>
Amit Shah <[EMAIL PROTECTED]>
Ben-Ami Yassour <[EMAIL PROTECTED]>
Weidong Han <[EMAIL PROTECTED]>
Glauber de Oliveira Costa <[EMAIL PROTECTED]>
Muli Ben-Yehuda <[EMAIL PROTECTED]>

With this patch, we can assign a device on the host machine to a
guest.

A new command-line option, -pcidevice is added.
To invoke it for a device sitting at PCI bus:dev.fn 04:08.0, use this:

-pcidevice host=04:08.0

* The host driver for the device, if any, is to be removed before
assigning the device (else device assignment will fail).

* A device that shares IRQ with another host device cannot currently
be assigned.

* The RAW_IO capability is needed for this to work

This works only with the in-kernel irqchip method; to use the
userspace irqchip, a kernel module (irqhook) and some extra changes
are needed.

[muli: lots of small fixes from Muli and Weidong Han addressing all v7
review comments]

Signed-off-by: Amit Shah <[EMAIL PROTECTED]>
Signed-off-by: Muli Ben-Yehuda <[EMAIL PROTECTED]>
---
 qemu/Makefile.target|3 +
 qemu/hw/device-assignment.c |  641 +++
 qemu/hw/device-assignment.h |  117 
 qemu/hw/pc.c|   16 +
 qemu/hw/pci.c   |7 +
 qemu/qemu-kvm.c |   14 +
 qemu/qemu-kvm.h |8 +
 qemu/vl.c   |   28 ++
 8 files changed, 834 insertions(+), 0 deletions(-)
 create mode 100644 qemu/hw/device-assignment.c
 create mode 100644 qemu/hw/device-assignment.h

diff --git a/qemu/Makefile.target b/qemu/Makefile.target
index d9bdeca..5d44e08 100644
--- a/qemu/Makefile.target
+++ b/qemu/Makefile.target
@@ -621,6 +621,9 @@ OBJS+= ide.o pckbd.o ps2.o vga.o $(SOUND_HW) dma.o
 OBJS+= fdc.o mc146818rtc.o serial.o i8259.o i8254.o pcspk.o pc.o
 OBJS+= cirrus_vga.o apic.o parallel.o acpi.o piix_pci.o
 OBJS+= usb-uhci.o vmmouse.o vmport.o vmware_vga.o extboot.o
+ifeq ($(USE_KVM), 1)
+OBJS+= device-assignment.o
+endif
 ifeq ($(USE_KVM_PIT), 1)
 OBJS+= i8254-kvm.o
 endif
diff --git a/qemu/hw/device-assignment.c b/qemu/hw/device-assignment.c
new file mode 100644
index 000..89b05f9
--- /dev/null
+++ b/qemu/hw/device-assignment.c
@@ -0,0 +1,641 @@
+/*
+ * Copyright (c) 2007, Neocleus Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ *
+ *  Assign a PCI device from the host to a guest VM.
+ *
+ *  Adapted for KVM by Qumranet.
+ *
+ *  Copyright (c) 2007, Neocleus, Alex Novik ([EMAIL PROTECTED])
+ *  Copyright (c) 2007, Neocleus, Guy Zana ([EMAIL PROTECTED])
+ *  Copyright (C) 2008, Qumranet, Amit Shah ([EMAIL PROTECTED])
+ *  Copyright (C) 2008, Red Hat, Amit Shah ([EMAIL PROTECTED])
+ *  Copyright (C) 2008, IBM, Muli Ben-Yehuda ([EMAIL PROTECTED])
+ */
+#include 
+#include 
+#include "qemu-kvm.h"
+#include "hw.h"
+#include "pc.h"
+#include "sysemu.h"
+#include "console.h"
+#include "device-assignment.h"
+
+/* From linux/ioport.h */
+#define IORESOURCE_IO   0x0100  /* Resource type */
+#define IORESOURCE_MEM  0x0200
+#define IORESOURCE_IRQ  0x0400
+#define IORESOURCE_DMA  0x0800
+#define IORESOURCE_PREFETCH 0x1000  /* No side effects */
+
+/* #define DEVICE_ASSIGNMENT_DEBUG 1 */
+
+#ifdef DEVICE_ASSIGNMENT_DEBUG
+#define DEBUG(fmt, ...)   \
+do {  \
+  fprintf(stderr, "%s: " fmt, __func__ , __VA_ARGS__);\
+} while (0)
+#else
+#define DEBUG(fmt, ...) do { } while(0)
+#endif
+
+static uint32_t guest_to_host_ioport(AssignedDevRegion *region, uint32_t addr)
+{
+return region->u.r_baseport + (addr - region->e_physbase);
+}
+
+static void assigned_dev_ioport_writeb(void *opaque, uint32_t addr,
+   uint32_t value)
+{
+AssignedDevRegion *r_access = opaque;
+uint32_t r_pio = guest_to_host_ioport(r_access, addr);
+
+DEBUG("r_pio=%08x e_physbase=%08x r_virtbase=%08lx value=%08x\n",
+ r_pio, (int)r_access->e_physbase,
+ (unsigned long)r_access->r_virtbase, value);
+
+outb(value, r_pio);
+}
+
+static void assigned_dev_ioport_writew(void *opaque, uint32_t addr,
+   uint32_t value)
+{

[PATCH 6/6] device assignment: support for hot-plugging PCI devices

2008-10-28 Thread muli
From: Amit Shah <[EMAIL PROTECTED]>

This patch adds support for hot-plugging host PCI devices into
guests

Signed-off-by: Amit Shah <[EMAIL PROTECTED]>
Signed-off-by: Muli Ben-Yehuda <[EMAIL PROTECTED]>
---
 qemu/hw/device-hotplug.c |   21 +
 qemu/monitor.c   |2 +-
 2 files changed, 22 insertions(+), 1 deletions(-)

diff --git a/qemu/hw/device-hotplug.c b/qemu/hw/device-hotplug.c
index 8e2bc35..817e708 100644
--- a/qemu/hw/device-hotplug.c
+++ b/qemu/hw/device-hotplug.c
@@ -6,6 +6,7 @@
 #include "pc.h"
 #include "console.h"
 #include "block_int.h"
+#include "device-assignment.h"
 
 #define PCI_BASE_CLASS_STORAGE  0x01
 #define PCI_BASE_CLASS_NETWORK  0x02
@@ -27,6 +28,24 @@ static PCIDevice *qemu_system_hot_add_nic(const char *opts, 
int bus_nr)
 return pci_nic_init (pci_bus, &nd_table[ret], -1);
 }
 
+static PCIDevice *qemu_system_hot_assign_device(const char *opts, int bus_nr)
+{
+PCIBus *pci_bus;
+AssignedDevInfo *adev;
+
+pci_bus = pci_find_bus(bus_nr);
+if (!pci_bus) {
+term_printf ("Can't find pci_bus %d\n", bus_nr);
+return NULL;
+}
+adev = add_assigned_device(opts);
+if (adev == NULL) {
+term_printf ("Error adding device; check syntax\n");
+return NULL;
+}
+return init_assigned_device(adev, pci_bus);
+}
+
 static int add_init_drive(const char *opts)
 {
 int drive_opt_idx, drive_idx;
@@ -143,6 +162,8 @@ void device_hot_add(int pcibus, const char *type, const 
char *opts)
 dev = qemu_system_hot_add_nic(opts, pcibus);
 else if (strcmp(type, "storage") == 0)
 dev = qemu_system_hot_add_storage(opts, pcibus);
+else if (strcmp(type, "host") == 0)
+dev = qemu_system_hot_assign_device(opts, pcibus);
 else
 term_printf("invalid type: %s\n", type);
 
diff --git a/qemu/monitor.c b/qemu/monitor.c
index 79b6b4c..d1043b1 100644
--- a/qemu/monitor.c
+++ b/qemu/monitor.c
@@ -1529,7 +1529,7 @@ static const term_cmd_t term_cmds[] = {
 "[,cyls=c,heads=h,secs=s[,trans=t]]\n"
 "[snapshot=on|off][,cache=on|off]",
 "add drive to PCI storage controller" 
},
-{ "pci_add", "iss", device_hot_add, "bus nic|storage 
[[vlan=n][,macaddr=addr][,model=type]] [file=file][,if=type][,bus=nr]...", 
"hot-add PCI device" },
+{ "pci_add", "iss", device_hot_add, "bus nic|storage|host 
[[vlan=n][,macaddr=addr][,model=type]] [file=file][,if=type][,bus=nr]... 
[host=02:00.0[,name=string][,dma=none]" "hot-add PCI device" },
 { "pci_del", "ii", device_hot_remove, "bus slot-number", "hot remove PCI 
device" },
 #endif
 { "balloon", "i", do_balloon,
-- 
1.5.6.5

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 3/6] device assignment: introduce functions to correlate pin number and irq

2008-10-28 Thread muli
From: Amit Shah <[EMAIL PROTECTED]>

Signed-off-by: Amit Shah <[EMAIL PROTECTED]>
Signed-off-by: Muli Ben-Yehuda <[EMAIL PROTECTED]>
---
 qemu/hw/pc.h   |3 +++
 qemu/hw/piix_pci.c |   19 +++
 2 files changed, 22 insertions(+), 0 deletions(-)

diff --git a/qemu/hw/pc.h b/qemu/hw/pc.h
index 1f63678..3edf62f 100644
--- a/qemu/hw/pc.h
+++ b/qemu/hw/pc.h
@@ -112,6 +112,9 @@ void i440fx_init_memory_mappings(PCIDevice *d);
 
 int piix4_init(PCIBus *bus, int devfn);
 
+int piix3_get_pin(int pic_irq);
+int piix_get_irq(int pin);
+
 /* vga.c */
 enum vga_retrace_method {
 VGA_RETRACE_DUMB,
diff --git a/qemu/hw/piix_pci.c b/qemu/hw/piix_pci.c
index 6fbf47b..dc12c8a 100644
--- a/qemu/hw/piix_pci.c
+++ b/qemu/hw/piix_pci.c
@@ -243,6 +243,25 @@ static void piix3_set_irq(qemu_irq *pic, int irq_num, int 
level)
 }
 }
 
+int piix3_get_pin(int pic_irq)
+{
+int i;
+for (i = 0; i < 4; i++)
+if (piix3_dev->config[0x60+i] == pic_irq)
+return i;
+return -1;
+}
+
+int piix_get_irq(int pin)
+{
+if (piix3_dev)
+return piix3_dev->config[0x60+pin];
+if (piix4_dev)
+return piix4_dev->config[0x60+pin];
+
+return 0;
+}
+
 static void piix3_reset(PCIDevice *d)
 {
 uint8_t *pci_conf = d->config;
-- 
1.5.6.5

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/2] KVM: Fix kvm_free_physmem_slot memory leak.

2008-10-28 Thread François Diakhate
[Sorry, I realized I forgot to check style, here is the fixed patch]

Make sure that kvm_free_physmem_slot also frees the VM memory
if it was allocated by the kernel.

Signed-off-by: François Diakhaté <[EMAIL PROTECTED]>
---
 arch/x86/kvm/x86.c  |   10 +-
 virt/kvm/kvm_main.c |   19 +++
 2 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 883c137..818220b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4179,13 +4179,13 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
if (npages && !old.rmap) {
unsigned long userspace_addr;

-   down_write(&current->mm->mmap_sem);
+   down_write(&kvm->mm->mmap_sem);
userspace_addr = do_mmap(NULL, 0,
 npages * PAGE_SIZE,
 PROT_READ | PROT_WRITE,
 MAP_PRIVATE | MAP_ANONYMOUS,
 0);
-   up_write(&current->mm->mmap_sem);
+   up_write(&kvm->mm->mmap_sem);

if (IS_ERR((void *)userspace_addr))
return PTR_ERR((void *)userspace_addr);
@@ -4198,10 +4198,10 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
if (!old.user_alloc && old.rmap) {
int ret;

-   down_write(&current->mm->mmap_sem);
-   ret = do_munmap(current->mm, old.userspace_addr,
+   down_write(&kvm->mm->mmap_sem);
+   ret = do_munmap(kvm->mm, old.userspace_addr,
old.npages * PAGE_SIZE);
-   up_write(&current->mm->mmap_sem);
+   up_write(&kvm->mm->mmap_sem);
if (ret < 0)
printk(KERN_WARNING
   "kvm_vm_ioctl_set_memory_region: "
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index a87f45e..c7d6585 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -617,9 +617,20 @@ out:
 /*
  * Free any memory in @free but not in @dont.
  */
-static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
+static void kvm_free_physmem_slot(struct kvm *kvm,
+ struct kvm_memory_slot *free,
  struct kvm_memory_slot *dont)
 {
+   if (!dont || free->userspace_addr != dont->userspace_addr) {
+   struct kvm_userspace_memory_region mem = {
+   .slot = memslot_id(kvm, free),
+   .guest_phys_addr = free->base_gfn << PAGE_SHIFT,
+   .memory_size = 0,
+   .flags = 0,
+   };
+   kvm_arch_set_memory_region(kvm, &mem, *free, free->user_alloc);
+   }
+
if (!dont || free->rmap != dont->rmap)
vfree(free->rmap);

@@ -640,7 +651,7 @@ void kvm_free_physmem(struct kvm *kvm)
int i;

for (i = 0; i < kvm->nmemslots; ++i)
-   kvm_free_physmem_slot(&kvm->memslots[i], NULL);
+   kvm_free_physmem_slot(kvm, &kvm->memslots[i], NULL);
 }

 static void kvm_destroy_vm(struct kvm *kvm)
@@ -821,7 +832,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
goto out_free;
}

-   kvm_free_physmem_slot(&old, &new);
+   kvm_free_physmem_slot(kvm, &old, &new);
 #ifdef CONFIG_DMAR
/* map the pages in iommu page table */
r = kvm_iommu_map_pages(kvm, base_gfn, npages);
@@ -831,7 +842,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
return 0;

 out_free:
-   kvm_free_physmem_slot(&new, &old);
+   kvm_free_physmem_slot(kvm, &new, &old);
 out:
return r;

-- 
1.6.0.3
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH] x86 emulator: move skip_emulated_instruction()

2008-10-28 Thread Guillaume Thouvenin
 If we call the emulator we shouldn't call skip_emulated_instruction()
in the first place, since the emulator already computes the next rip
for us. Thus we move ->skip_emulated_instruction() out of
kvm_emulate_pio() and into handle_io() (and the svm equivalent). We
also replaced "return 0" by "break" in the "do_io:" case because now
the shadow register state needs to be committed. Otherwise eip will never
be updated.

Signed-off-by: Guillaume Thouvenin <[EMAIL PROTECTED]>
---
 arch/x86/kvm/svm.c |1 +
 arch/x86/kvm/vmx.c |1 +
 arch/x86/kvm/x86.c |2 --
 arch/x86/kvm/x86_emulate.c |2 +-
 4 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 743aebd..f0ad4d4 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1115,6 +1115,7 @@ static int io_interception(struct vcpu_svm *svm, struct 
kvm_run *kvm_run)
rep = (io_info & SVM_IOIO_REP_MASK) != 0;
down = (svm->vmcb->save.rflags & X86_EFLAGS_DF) != 0;
 
+   skip_emulated_instruction(&svm->vcpu);
return kvm_emulate_pio(&svm->vcpu, kvm_run, in, size, port);
 }
 
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 64e2439..789f819 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2687,6 +2687,7 @@ static int handle_io(struct kvm_vcpu *vcpu, struct 
kvm_run *kvm_run)
rep = (exit_qualification & 32) != 0;
port = exit_qualification >> 16;
 
+   skip_emulated_instruction(vcpu);
return kvm_emulate_pio(vcpu, kvm_run, in, size, port);
 }
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ceeac88..38f79b6 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2478,8 +2478,6 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run 
*run, int in,
val = kvm_register_read(vcpu, VCPU_REGS_RAX);
memcpy(vcpu->arch.pio_data, &val, 4);
 
-   kvm_x86_ops->skip_emulated_instruction(vcpu);
-
pio_dev = vcpu_find_pio_dev(vcpu, port, size, !in);
if (pio_dev) {
kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data);
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index 57d7cc4..8f60ace 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -1772,7 +1772,7 @@ special_insn:
c->eip = saved_eip;
goto cannot_emulate;
}
-   return 0;
+   break;
case 0xf4:  /* hlt */
ctxt->vcpu->arch.halt_request = 1;
break;
-- 
1.6.0.3.514.g2f91b

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 0/3] kvm-userspace: ppc: userspace fixes for powerpc

2008-10-28 Thread Christian Ehrhardt
OK, I should have sent these two series a few minutes apart so that they
would not get intermixed :-/

Additionally, I have the wrong header in this one; it should be [0/5] :-/++

Overall there is a series of three patches for powerpc kvm-userspace,
a series of five patches for kvm-userspace/user/, and a single patch I just
submitted while cleaning up our userspace repo to get the missing things
upstream.


Avi, let me know if I confused you, and I'll send them once again with
some time in between so that they are easier to keep in order.


[EMAIL PROTECTED] wrote:

From: Christian Ehrhardt <[EMAIL PROTECTED]>

This is a set of fixes for the powerpc tests kvm-userspace/user.

Patch 1&2 fix main-ppc.c while patch 3 introduces libcflat for powerpc.
Further on patch 4 provides a timebase accessor for the ppc testcases (not
used yet) and patch 5 finally adds a stub nmi handler to main-ppc.c.

[patches in series]
[PATCH 1/5] user: ppc: fix threading bugs in main-ppc.c
[PATCH 2/5] user: ppc: better error reporting in load_file
[PATCH 3/5] user: ppc: implement PowerPC 44x libcflat
[PATCH 4/5] libcflat: ppc: add timebase accessor
[PATCH 5/5] user: ppc: add stub nmi handler

---
[diffstat]
 b/user/config-powerpc-44x.mak  |   14 +
 b/user/config-powerpc.mak  |   46 -
 b/user/main-ppc.c  |   32 +++-
 b/user/test/lib/powerpc/44x/map.c  |   51 +
 b/user/test/lib/powerpc/44x/timebase.S |   28 ++
 b/user/test/lib/powerpc/44x/timebase.h |   25 
 b/user/test/lib/powerpc/44x/tlbwe.S|   29 ++
 b/user/test/lib/powerpc/io.c   |   35 ++
 b/user/test/powerpc/cstart.S   |   38 
 b/user/test/powerpc/exit.c |   23 ++
 user/config-powerpc-44x.mak|3 +
 user/main-ppc.c|9 +
 12 files changed, 296 insertions(+), 37 deletions(-)
--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
  



--

Grüsse / regards, 
Christian Ehrhardt

IBM Linux Technology Center, Open Virtualization

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 3/3] kvm-userspace: ppc: fix initial ppc memory setup

2008-10-28 Thread ehrhardt
From: Christian Ehrhardt <[EMAIL PROTECTED]>

The old memory initialization code was broken for all cases not fitting in one
ram stick. This patch fixes the ram_stick calculation, now sets the proper
base addresses per stick and removes the old workaround.

Signed-off-by: Christian Ehrhardt <[EMAIL PROTECTED]>
---

[diffstat]
 ppc440.c|   12 +---
 ppc440.h|8 ++--
 ppc440_bamboo.c |   30 --
 3 files changed, 31 insertions(+), 19 deletions(-)

[diff]

diff --git a/qemu/hw/ppc440.c b/qemu/hw/ppc440.c
--- a/qemu/hw/ppc440.c
+++ b/qemu/hw/ppc440.c
@@ -3,6 +3,7 @@
  *
  * Copyright 2007 IBM Corporation.
  * Authors: Jerone Young <[EMAIL PROTECTED]>
+ * Christian Ehrhardt <[EMAIL PROTECTED]>
  *
  * This work is licensed under the GNU GPL license version 2 or later.
  *
@@ -24,15 +25,15 @@
 
 
 void ppc440ep_init(CPUState *env,
-   target_phys_addr_t ram_bases[2],
-   target_phys_addr_t ram_sizes[2],
+   target_phys_addr_t ram_bases[PPC440_MAX_RAM_SLOTS],
+   target_phys_addr_t ram_sizes[PPC440_MAX_RAM_SLOTS],
+   int nbanks,
qemu_irq **picp,
ppc4xx_pci_t **pcip,
int do_init)
 {
ppc4xx_mmio_t *mmio;
qemu_irq *pic, *irqs;
-   ram_addr_t offset;
ppc4xx_pci_t *pci;
int i;
 
@@ -55,10 +56,7 @@
/* SDRAM controller */
printf("trying to setup sdram controller\n");
/* XXX 440EP's ECC interrupts are on UIC1 */
-   ppc405_sdram_init(env, pic[14], 2, ram_bases, ram_sizes, do_init);
-   offset = 0;
-   for (i = 0; i < 2; i++)
-   offset += ram_sizes[i];
+   ppc405_sdram_init(env, pic[14], nbanks, ram_bases, ram_sizes, do_init);
 
/* PCI */
pci = ppc4xx_pci_init(env, pic,
diff --git a/qemu/hw/ppc440.h b/qemu/hw/ppc440.h
--- a/qemu/hw/ppc440.h
+++ b/qemu/hw/ppc440.h
@@ -3,6 +3,7 @@
  *
  * Copyright 2007 IBM Corporation.
  * Authors: Jerone Young <[EMAIL PROTECTED]>
+ * Christian Ehrhardt <[EMAIL PROTECTED]>
  *
  * This work is licensed under the GNU GPL licence version 2 or later
  *
@@ -20,9 +21,12 @@
 #include "exec-all.h"
 #include "boards.h"
 
+#define PPC440_MAX_RAM_SLOTS 4
+
 void ppc440ep_init(CPUState *env,
-   target_phys_addr_t ram_bases[2],
-   target_phys_addr_t ram_sizes[2],
+   target_phys_addr_t ram_bases[PPC440_MAX_RAM_SLOTS],
+   target_phys_addr_t ram_sizes[PPC440_MAX_RAM_SLOTS],
+   int nbanks,
qemu_irq **picp,
ppc4xx_pci_t **pcip,
int do_init);
diff --git a/qemu/hw/ppc440_bamboo.c b/qemu/hw/ppc440_bamboo.c
--- a/qemu/hw/ppc440_bamboo.c
+++ b/qemu/hw/ppc440_bamboo.c
@@ -2,7 +2,9 @@
  * Qemu PowerPC 440 board emualtion
  *
  * Copyright 2007 IBM Corporation.
- * Authors: Jerone Young <[EMAIL PROTECTED]>
+ * Authors:
+ * Jerone Young <[EMAIL PROTECTED]>
+ * Christian Ehrhardt <[EMAIL PROTECTED]>
  *
  * This work is licensed under the GNU GPL license version 2 or later.
  *
@@ -30,7 +32,8 @@
const char *cpu_model)
 {
char *buf=NULL;
-   target_phys_addr_t ram_bases[4], ram_sizes[4];
+   target_phys_addr_t ram_bases[PPC440_MAX_RAM_SLOTS];
+   target_phys_addr_t ram_sizes[PPC440_MAX_RAM_SLOTS];
NICInfo *nd;
qemu_irq *pic;
ppc4xx_pci_t *pci;
@@ -46,6 +49,8 @@
int ret;
int ram_stick_sizes[] = {256<<20, 128<<20, 64<<20,
32<<20, 16<<20, 8<<20 }; /* in bytes */
+   int nbanks = 0; /* number of used memory banks */
+   int next_bank_offset = 0;
ram_addr_t tmp_ram_size;
int i=0, k=0;
uint32_t cpu_freq;
@@ -55,15 +60,22 @@
printf("%s: START\n", __func__);
 
/* Setup Memory */
-   printf("Ram size passed is: %i MB\n",
-   bytes_to_mb((int)ram_size));
+   if (ram_size < 8<<20) {
+   printf("ERROR: ram size too small (min 8mb)\n");
+   exit(1);
+   } else
+   printf("Ram size passed is: %i MB\n",
+   bytes_to_mb((int)ram_size));
 
tmp_ram_size = ram_size;
 
-   for (i=0; i < (sizeof(ram_sizes)/sizeof(ram_sizes[0])); i++) {
-   for (k=0; k < 
(sizeof(ram_stick_sizes)/sizeof(ram_stick_sizes[0])); k++) {
+   for (i = 0; i < PPC440_MAX_RAM_SLOTS; i++) {
+   for (k = 0; k < (sizeof(ram_stick_sizes)/sizeof(int)); k++) {
if ((tmp_ram_size/ram_stick_sizes[k]) > 0) {
ram_sizes[i] = ram_stick_sizes[k];
+   ram_bases[i] = next_bank_offset;
+   next_bank_offset += ram_stick_sizes[k];
+   nbanks++;
tmp_ram_size -= ram_stick_sizes[k];
break;
   

[PATCH 0/3] kvm-userspace: ppc: userspace fixes for powerpc

2008-10-28 Thread ehrhardt
From: Christian Ehrhardt <[EMAIL PROTECTED]>

This is a set of various functional fixes in kvm-userspace for powerpc.

Patch 1 fulfils the requirement to provide a maximum SMP CPU count in the
 machine struct; without that value qemu refuses to run the guest
 (cpu 1 > maxcpu 0).

Patch 2 is an intermediate fix to allow ppc (and hopefully all others) to
 build until we change the unifdef to sed'ing files as Avi suggested.
 Until then it would be nice if that patch could fix the build issues for
 all of us in the unifdef style.

Patch 3 is a rework of the powerpc 440 guest memory initialization. I looked
 at it because the -m option sometimes did not work, and it turned out that
 the memory setup is broken and only works due to a workaround.

qemu-devel is on cc for patch 1/3

[patches in series]
[PATCH 1/3] qemu: ppc: define maximum SMP limit as 1 for Bamboo
[PATCH 2/3] kvm: external module: Treat NONARCH_CONFIG as a list
[PATCH 3/3] kvm-userspace: ppc: fix initial ppc memory setup

---
[diffstat]
 b/kernel/Makefile |3 ++-
 b/qemu/hw/ppc440.c|   12 +---
 b/qemu/hw/ppc440.h|8 ++--
 b/qemu/hw/ppc440_bamboo.c |7 ---
 qemu/hw/ppc440_bamboo.c   |   30 --
 5 files changed, 37 insertions(+), 23 deletions(-)

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] qemu: ppc: if not a uImage, try to load kernel as ELF

2008-10-28 Thread ehrhardt
From: Hollis Blanchard <[EMAIL PROTECTED]>

This allows qemu to load "bare metal" ELF kernels, useful for standalone
benchmarks and testcases.

We could/should also load the specified file as a flat binary, if both uImage
and ELF loaders fail. (See hw/arm_boot.c.)

Signed-off-by: Hollis Blanchard <[EMAIL PROTECTED]>
Signed-off-by: Christian Ehrhardt <[EMAIL PROTECTED]>
---

[diffstat]
 ppc440_bamboo.c |7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

[diff]
diff --git a/qemu/hw/ppc440_bamboo.c b/qemu/hw/ppc440_bamboo.c
--- a/qemu/hw/ppc440_bamboo.c
+++ b/qemu/hw/ppc440_bamboo.c
@@ -35,8 +35,8 @@ void bamboo_init(ram_addr_t ram_size, in
qemu_irq *pic;
ppc4xx_pci_t *pci;
CPUState *env;
-   target_ulong ep=0;
-   target_ulong la=0;
+   uint64_t ep=0;
+   uint64_t la=0;
int is_linux=1; /* Will assume allways is Linux for now */
target_long kernel_size=0;
target_ulong initrd_base=0;
@@ -98,6 +98,9 @@ void bamboo_init(ram_addr_t ram_size, in
/* load kernel with uboot loader */
printf("%s: load kernel\n", __func__);
ret = load_uimage(kernel_filename, &ep, &la, &kernel_size, &is_linux);
+   if (ret < 0)
+   ret = load_elf(kernel_filename, 0, &ep, &la, NULL);
+
if (ret < 0) {
fprintf(stderr, "qemu: could not load kernel '%s'\n",
kernel_filename);

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/5] user: ppc: fix threading bugs in main-ppc.c

2008-10-28 Thread ehrhardt
From: Hollis Blanchard <[EMAIL PROTECTED]>

- call io_table_register() before any vcpus have started
- wait for all vcpus to exit before exiting the parent thread

Signed-off-by: Hollis Blanchard <[EMAIL PROTECTED]>
Signed-off-by: Christian Ehrhardt <[EMAIL PROTECTED]>
---

[diffstat]
 main-ppc.c |   32 
 1 file changed, 12 insertions(+), 20 deletions(-)

[diff]
diff --git a/user/main-ppc.c b/user/main-ppc.c
--- a/user/main-ppc.c
+++ b/user/main-ppc.c
@@ -51,7 +51,7 @@ struct io_table mmio_table;
 struct io_table mmio_table;
 
 static int ncpus = 1;
-static sem_t init_sem;
+static sem_t exited_sem;
 static __thread int vcpu;
 static sigset_t kernel_sigmask;
 static sigset_t ipi_sigmask;
@@ -220,16 +220,8 @@ void sync_caches(void *mem, unsigned lon
asm volatile ("sync; isync");
 }
 
-static void init_vcpu(int n, unsigned long entry)
+static void init_vcpu(int n)
 {
-   /* XXX must set initial TLB state and stack
-   struct kvm_regs regs = {
-   .pc = entry,
-   };
-
-   kvm_set_regs(kvm, 0, &regs);
-   */
-
sigemptyset(&ipi_sigmask);
sigaddset(&ipi_sigmask, IPI_SIGNAL);
sigprocmask(SIG_UNBLOCK, &ipi_sigmask, NULL);
@@ -237,7 +229,6 @@ static void init_vcpu(int n, unsigned lo
vcpus[n].tid = gettid();
vcpu = n;
kvm_set_signal_mask(kvm, n, &kernel_sigmask);
-   sem_post(&init_sem);
 }
 
 static void *do_create_vcpu(void *_n)
@@ -245,8 +236,9 @@ static void *do_create_vcpu(void *_n)
int n = (long)_n;
 
kvm_create_vcpu(kvm, n);
-   init_vcpu(n, 0x0);
+   init_vcpu(n);
kvm_run(kvm, n);
+   sem_post(&exited_sem);
return NULL;
 }
 
@@ -368,14 +360,14 @@ int main(int argc, char **argv)
len = load_file(vm_mem, argv[optind], 1);
sync_caches(vm_mem, len);
 
-   sem_init(&init_sem, 0, 0);
-   init_vcpu(0, 0x0);
-   for (i = 1; i < ncpus; ++i)
-   start_vcpu(i);
-   for (i = 0; i < ncpus; ++i)
-   sem_wait(&init_sem);
-
io_table_register(&mmio_table, 0xf000, 64, mmio_handler, NULL);
 
-   return kvm_run(kvm, 0);
+   sem_init(&exited_sem, 0, 0);
+   for (i = 0; i < ncpus; ++i)
+   start_vcpu(i);
+   /* Wait for all vcpus to exit. */
+   for (i = 0; i < ncpus; ++i)
+   sem_wait(&exited_sem);
+
+   return 0;
 }
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 0/3] kvm-userspace: ppc: userspace fixes for powerpc

2008-10-28 Thread ehrhardt
From: Christian Ehrhardt <[EMAIL PROTECTED]>

This is a set of fixes for the powerpc tests kvm-userspace/user.

Patch 1&2 fix main-ppc.c while patch 3 introduces libcflat for powerpc.
Further on patch 4 provides a timebase accessor for the ppc testcases (not
used yet) and patch 5 finally adds a stub nmi handler to main-ppc.c.

[patches in series]
[PATCH 1/5] user: ppc: fix threading bugs in main-ppc.c
[PATCH 2/5] user: ppc: better error reporting in load_file
[PATCH 3/5] user: ppc: implement PowerPC 44x libcflat
[PATCH 4/5] libcflat: ppc: add timebase accessor
[PATCH 5/5] user: ppc: add stub nmi handler

---
[diffstat]
 b/user/config-powerpc-44x.mak  |   14 +
 b/user/config-powerpc.mak  |   46 -
 b/user/main-ppc.c  |   32 +++-
 b/user/test/lib/powerpc/44x/map.c  |   51 +
 b/user/test/lib/powerpc/44x/timebase.S |   28 ++
 b/user/test/lib/powerpc/44x/timebase.h |   25 
 b/user/test/lib/powerpc/44x/tlbwe.S|   29 ++
 b/user/test/lib/powerpc/io.c   |   35 ++
 b/user/test/powerpc/cstart.S   |   38 
 b/user/test/powerpc/exit.c |   23 ++
 user/config-powerpc-44x.mak|3 +
 user/main-ppc.c|9 +
 12 files changed, 296 insertions(+), 37 deletions(-)
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 4/5] libcflat: ppc: add timebase accessor

2008-10-28 Thread ehrhardt
Provide a timebase accessor for ppc testcases.

Signed-off-by: Hollis Blanchard <[EMAIL PROTECTED]>
Signed-off-by: Christian Ehrhardt <[EMAIL PROTECTED]>

[diffstat]
 config-powerpc-44x.mak  |3 ++-
 test/lib/powerpc/44x/timebase.S |   28 
 test/lib/powerpc/44x/timebase.h |   25 +
 3 files changed, 55 insertions(+), 1 deletion(-)

[diff]
diff --git a/user/config-powerpc-44x.mak b/user/config-powerpc-44x.mak
--- a/user/config-powerpc-44x.mak
+++ b/user/config-powerpc-44x.mak
@@ -5,7 +5,8 @@
 
 cflatobjs += \
test/lib/powerpc/44x/map.o \
-   test/lib/powerpc/44x/tlbwe.o
+   test/lib/powerpc/44x/tlbwe.o \
+   test/lib/powerpc/44x/timebase.o
 
 simpletests += \
test/powerpc/44x/tlbsx.bin \
diff --git a/user/test/lib/powerpc/44x/timebase.S 
b/user/test/lib/powerpc/44x/timebase.S
new file mode 100644
--- /dev/null
+++ b/user/test/lib/powerpc/44x/timebase.S
@@ -0,0 +1,28 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright IBM Corp. 2008
+ *
+ * Authors: Hollis Blanchard <[EMAIL PROTECTED]>
+ */
+
+/* unsigned long long mftb(void); */
+.global mftb
+mftb:
+   mftbu   r5
+   mftbl   r4
+   mftbu   r3
+   cmpw    r3, r5
+   bne mftb
+   blr
diff --git a/user/test/lib/powerpc/44x/timebase.h 
b/user/test/lib/powerpc/44x/timebase.h
new file mode 100644
--- /dev/null
+++ b/user/test/lib/powerpc/44x/timebase.h
@@ -0,0 +1,25 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright IBM Corp. 2008
+ *
+ * Authors: Hollis Blanchard <[EMAIL PROTECTED]>
+ */
+
+#ifndef __TIMEBASE_H__
+#define __TIMEBASE_H__
+
+unsigned long long mftb(void);
+
+#endif /* __TIMEBASE_H__ */
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/5] user: ppc: better error reporting in load_file

2008-10-28 Thread ehrhardt
From: Hollis Blanchard <[EMAIL PROTECTED]>

Fancy description.

Signed-off-by: Hollis Blanchard <[EMAIL PROTECTED]>
Signed-off-by: Christian Ehrhardt <[EMAIL PROTECTED]>
---

[diffstat]
 main-ppc.c |3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

[diff]
diff --git a/user/main-ppc.c b/user/main-ppc.c
--- a/user/main-ppc.c
+++ b/user/main-ppc.c
@@ -183,7 +183,7 @@ static struct kvm_callbacks test_callbac
 
 static unsigned long load_file(void *mem, const char *fname, int inval_icache)
 {
-   int r;
+   ssize_t r;
int fd;
unsigned long bytes = 0;
 
@@ -200,6 +200,7 @@ static unsigned long load_file(void *mem
 
if (r == -1) {
perror("read");
+   printf("read %d bytes\n", bytes);
exit(1);
}
 
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 5/5] user: ppc: add stub nmi handler

2008-10-28 Thread ehrhardt
From: Hollis Blanchard <[EMAIL PROTECTED]>

Adding a nmi stub handler for user/main-ppc.c. We already pushed a stub
for qemu but not for the test suite in the user dir.

Signed-off-by: Hollis Blanchard <[EMAIL PROTECTED]>
Signed-off-by: Christian Ehrhardt <[EMAIL PROTECTED]>
---

[diffstat]
 main-ppc.c |6 ++
 1 file changed, 6 insertions(+)

[diff]
diff --git a/user/main-ppc.c b/user/main-ppc.c
--- a/user/main-ppc.c
+++ b/user/main-ppc.c
@@ -83,6 +83,11 @@ static int test_io_window(void *opaque)
 }
 
 static int test_try_push_interrupts(void *opaque)
+{
+   return 0;
+}
+
+static int test_try_push_nmi(void *opaque)
 {
return 0;
 }
@@ -175,6 +180,7 @@ static struct kvm_callbacks test_callbac
.halt= test_halt,
.io_window = test_io_window,
.try_push_interrupts = test_try_push_interrupts,
+   .try_push_nmi = test_try_push_nmi,
.post_kvm_run = test_post_kvm_run,
.pre_kvm_run = test_pre_kvm_run,
.powerpc_dcr_read = test_dcr_read,
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 3/5] user: ppc: implement PowerPC 44x libcflat

2008-10-28 Thread ehrhardt
From: Hollis Blanchard <[EMAIL PROTECTED]>

- Create a 44x-specific makefile.
- Reorganize PowerPC makefiles to separate "simple" tests from those which
  link with libcflat.
- Create a minimal libcflat testcase (which just exits).

Signed-off-by: Hollis Blanchard <[EMAIL PROTECTED]>
Signed-off-by: Christian Ehrhardt <[EMAIL PROTECTED]>
---

[diffstat]
 config-powerpc-44x.mak   |   14 +++
 config-powerpc.mak   |   46 ++
 test/lib/powerpc/44x/map.c   |   51 +++
 test/lib/powerpc/44x/tlbwe.S |   29 
 test/lib/powerpc/io.c|   35 +
 test/powerpc/cstart.S|   38 
 test/powerpc/exit.c  |   23 +++
 7 files changed, 221 insertions(+), 15 deletions(-)

[diff]

diff --git a/user/config-powerpc-44x.mak b/user/config-powerpc-44x.mak
new file mode 100644
--- /dev/null
+++ b/user/config-powerpc-44x.mak
@@ -0,0 +1,14 @@
+
+
+# for some reason binutils hates tlbsx unless we say we're 405  :(
+CFLAGS += -Wa,-m405 -I test/lib/powerpc/44x
+
+cflatobjs += \
+   test/lib/powerpc/44x/map.o \
+   test/lib/powerpc/44x/tlbwe.o
+
+simpletests += \
+   test/powerpc/44x/tlbsx.bin \
+   test/powerpc/44x/tlbwe_16KB.bin \
+   test/powerpc/44x/tlbwe_hole.bin \
+   test/powerpc/44x/tlbwe.bin
diff --git a/user/config-powerpc.mak b/user/config-powerpc.mak
--- a/user/config-powerpc.mak
+++ b/user/config-powerpc.mak
@@ -1,26 +1,42 @@
+platform := 44x
+
 CFLAGS += -m32
 CFLAGS += -D__powerpc__
 CFLAGS += -I $(KERNELDIR)/include
-# for some reaons binutils hates tlbsx unless we say we're 405  :(
-CFLAGS += -Wa,-mregnames,-m405
+CFLAGS += -Wa,-mregnames -I test/lib
 
-%.bin: %.o
-   $(OBJCOPY) -O binary $^ $@
+cstart := test/powerpc/cstart.o
 
-testobjs := \
-   io.bin \
-   spin.bin \
-   sprg.bin \
-   44x/tlbsx.bin \
-   44x/tlbwe_16KB.bin \
-   44x/tlbwe_hole.bin \
-   44x/tlbwe.bin
+cflatobjs += \
+   test/lib/powerpc/io.o
 
-tests := $(addprefix test/powerpc/, $(testobjs))
+$(libcflat): LDFLAGS += -nostdlib
+$(libcflat): CFLAGS += -ffreestanding
 
-all: kvmtrace kvmctl $(tests)
+# these tests do not use libcflat
+simpletests := \
+   test/powerpc/spin.bin \
+   test/powerpc/io.bin \
+   test/powerpc/sprg.bin
+
+# theses tests use cstart.o, libcflat, and libgcc
+tests := \
+   test/powerpc/exit.bin
+
+include config-powerpc-$(platform).mak
+
+
+all: kvmtrace kvmctl $(libcflat) $(simpletests) $(tests)
+
+$(simpletests): %.bin: %.o
+   $(CC) -nostdlib $^ -Wl,-T,flat.lds -o $@
+
+$(tests): %.bin: $(cstart) %.o $(libcflat)
+   $(CC) -nostdlib $^ $(libgcc) -Wl,-T,flat.lds -o $@
 
 kvmctl_objs = main-ppc.o iotable.o ../libkvm/libkvm.a
 
 arch_clean:
-   rm -f $(tests)
+   $(RM) $(simpletests) $(tests) $(cstart)
+   $(RM) $(patsubst %.bin, %.elf, $(simpletests) $(tests))
+   $(RM) $(patsubst %.bin, %.o, $(simpletests) $(tests))
diff --git a/user/test/lib/powerpc/44x/map.c b/user/test/lib/powerpc/44x/map.c
new file mode 100644
--- /dev/null
+++ b/user/test/lib/powerpc/44x/map.c
@@ -0,0 +1,51 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright IBM Corp. 2008
+ *
+ * Authors: Hollis Blanchard <[EMAIL PROTECTED]>
+ */
+
+#include "libcflat.h"
+
+#define TLB_SIZE 64
+
+extern void tlbwe(unsigned int index,
+ unsigned char tid,
+ unsigned int word0,
+ unsigned int word1,
+ unsigned int word2);
+
+unsigned int next_free_index;
+
+#define PAGE_SHIFT 12
+#define PAGE_MASK (~((1<= TLB_SIZE)
+   panic("TLB overflow");
+
+   w0 = (vaddr & PAGE_MASK) | V;
+   w1 = paddr & PAGE_MASK;
+   w2 = 0x3;
+
+   tlbwe(next_free_index, 0, w0, w1, w2);
+}
diff --git a/user/test/lib/powerpc/44x/tlbwe.S 
b/user/test/lib/powerpc/44x/tlbwe.S
new file mode 100644
--- /dev/null
+++ b/user/test/lib/powerpc/44x/tlbwe.S
@@ -0,0 +1,29 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied w

[PATCH 2/3] kvm: external module: Treat NONARCH_CONFIG as a list

2008-10-28 Thread ehrhardt
From: Hollis Blanchard <[EMAIL PROTECTED]>

As discussed on the list the unifdef changes break powerpc (and more ?). A fix
is to treat NONARCH_CONFIG as a list instead of a single item.

Signed-off-by: Hollis Blanchard <[EMAIL PROTECTED]>
Signed-off-by: Christian Ehrhardt <[EMAIL PROTECTED]>
---

[diffstat]
 Makefile |3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

[diff]
diff --git a/kernel/Makefile b/kernel/Makefile
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -25,8 +25,9 @@
gawk -v version=$(version) -f $(ARCH_DIR)/hack-module.awk $1.orig \
| sed '/\#include/! s/\blapic\b/l_apic/g' > $1 && rm $1.orig
 
+unifdef_uflags = $(foreach arch, $(NONARCH_CONFIG), -UCONFIG_$(arch))
 unifdef = mv $1 $1.orig && \
- unifdef -DCONFIG_$(ARCH_CONFIG) -UCONFIG_$(NONARCH_CONFIG) $1.orig > 
$1; \
+ unifdef -DCONFIG_$(ARCH_CONFIG) $(unifdef_uflags) $1.orig > $1; \
   [ $$? -le 2 ] && rm $1.orig
 
 hack = $(call _hack,$T/$(strip $1))
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/3] qemu: ppc: define maximum SMP limit as 1 for Bamboo

2008-10-28 Thread ehrhardt
From: Christian Ehrhardt <[EMAIL PROTECTED]>

Fix for qemu runtime error. Full error message:
Number of SMP cpus requested (1), exceeds max cpus supported by machine 
`bamboo' (0)

Signed-off-by: Christian Ehrhardt <[EMAIL PROTECTED]>
Signed-off-by: Hollis Blanchard <[EMAIL PROTECTED]>
---

[diffstat]
 ppc440_bamboo.c |7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

[diff]
diff --git a/qemu/hw/ppc440_bamboo.c b/qemu/hw/ppc440_bamboo.c
--- a/qemu/hw/ppc440_bamboo.c
+++ b/qemu/hw/ppc440_bamboo.c
@@ -203,7 +203,8 @@ void bamboo_init(ram_addr_t ram_size, in
 }

 QEMUMachine bamboo_machine = {
-   "bamboo",
-   "bamboo",
-   bamboo_init,
+   .name = "bamboo",
+   .desc = "bamboo",
+   .init = bamboo_init,
+   .max_cpus = 1,
 };
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/2] KVM: Fix kvm_free_physmem_slot memory leak.

2008-10-28 Thread François Diakhate
[Updated the patch taking your comments into account]

Make sure that kvm_free_physmem_slot also frees the VM memory
if it was allocated by the kernel.

Signed-off-by: François Diakhaté <[EMAIL PROTECTED]>
---
 arch/x86/kvm/x86.c  |   10 +-
 virt/kvm/kvm_main.c |   18 ++
 2 files changed, 19 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 883c137..818220b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4179,13 +4179,13 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
if (npages && !old.rmap) {
unsigned long userspace_addr;

-   down_write(&current->mm->mmap_sem);
+   down_write(&kvm->mm->mmap_sem);
userspace_addr = do_mmap(NULL, 0,
 npages * PAGE_SIZE,
 PROT_READ | PROT_WRITE,
 MAP_PRIVATE | MAP_ANONYMOUS,
 0);
-   up_write(&current->mm->mmap_sem);
+   up_write(&kvm->mm->mmap_sem);

if (IS_ERR((void *)userspace_addr))
return PTR_ERR((void *)userspace_addr);
@@ -4198,10 +4198,10 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
if (!old.user_alloc && old.rmap) {
int ret;

-   down_write(&current->mm->mmap_sem);
-   ret = do_munmap(current->mm, old.userspace_addr,
+   down_write(&kvm->mm->mmap_sem);
+   ret = do_munmap(kvm->mm, old.userspace_addr,
old.npages * PAGE_SIZE);
-   up_write(&current->mm->mmap_sem);
+   up_write(&kvm->mm->mmap_sem);
if (ret < 0)
printk(KERN_WARNING
   "kvm_vm_ioctl_set_memory_region: "
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index a87f45e..b420930 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -617,9 +617,19 @@ out:
 /*
  * Free any memory in @free but not in @dont.
  */
-static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
+static void kvm_free_physmem_slot(struct kvm * kvm, struct
kvm_memory_slot *free,
  struct kvm_memory_slot *dont)
 {
+   if(!dont || free->userspace_addr != dont->userspace_addr) {
+   struct kvm_userspace_memory_region mem = {
+   .slot = memslot_id(kvm, free),
+   .guest_phys_addr = free->base_gfn << PAGE_SHIFT,
+   .memory_size = 0,
+   .flags = 0,
+   };
+   kvm_arch_set_memory_region(kvm, &mem, *free, free->user_alloc);
+   }
+
if (!dont || free->rmap != dont->rmap)
vfree(free->rmap);

@@ -640,7 +650,7 @@ void kvm_free_physmem(struct kvm *kvm)
int i;

for (i = 0; i < kvm->nmemslots; ++i)
-   kvm_free_physmem_slot(&kvm->memslots[i], NULL);
+   kvm_free_physmem_slot(kvm, &kvm->memslots[i], NULL);
 }

 static void kvm_destroy_vm(struct kvm *kvm)
@@ -821,7 +831,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
goto out_free;
}

-   kvm_free_physmem_slot(&old, &new);
+   kvm_free_physmem_slot(kvm, &old, &new);
 #ifdef CONFIG_DMAR
/* map the pages in iommu page table */
r = kvm_iommu_map_pages(kvm, base_gfn, npages);
@@ -831,7 +841,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
return 0;

 out_free:
-   kvm_free_physmem_slot(&new, &old);
+   kvm_free_physmem_slot(kvm, &new, &old);
 out:
return r;

-- 
1.6.0.3