[kvm-devel] Gentoo livecd amd64 not working?

2007-05-14 Thread Wink Saville
This weekend I tried to get the Gentoo livecd for amd64 and i686 working.
The i686 version successfully installed and does "run". The amd64
livecd boots and
runs but then during install stops trying to unpack the 50th of 116 files at
the beginning of the install process.

I'm running on an Intel dual core 2 6600 with 2G of Ram. I created qemu/kvm
from git sources with the last git commit of 05eb943c9b547 on May 6 16:10:01.

What other information may be needed?

Regards,

Wink Saville

-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] Gentoo livecd amd64 not working?

2007-05-14 Thread Avi Kivity
Wink Saville wrote:
> This weekend I tried to get the Gentoo livecd for amd64 and i686 working.
> The i686 version successfully installed and does "run". The amd64
> livecd boots and
> runs but then during install stops trying to unpack the 50th of 116 files at
> the beginning of the install process.
>
> I'm running on an Intel dual core 2 6600 with 2G of Ram. I created qemu/kvm
> from git sources with the last git commit of 05eb943c9b547 on May 6 16:10:01.
>
> What other information may be needed?
>
>   

Is there anything in dmesg?

Please provide a link to the livecd you used.


-- 
Do not meddle in the internals of kernels, for they are subtle and quick to 
panic.


-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [PATCH 3/8] KVM: Adds ability to preempt an executing VCPU

2007-05-14 Thread Avi Kivity
Gregory Haskins wrote:
> The VCPU executes synchronously w.r.t. userspace today, and therefore
> interrupt injection is pretty straight forward.  However, we will soon need
> to be able to inject interrupts asynchronous to the execution of the VCPU
> due to the introduction of SMP, paravirtualized drivers, and asynchronous
> hypercalls.  This patch adds support to the interrupt mechanism to force
> a VCPU to VMEXIT when a new interrupt is pending.
>
>   

Comments below are fairly minor, but worthwhile IMO.



> Signed-off-by: Gregory Haskins <[EMAIL PROTECTED]>
> ---
>
>  drivers/kvm/kvm.h  |2 ++
>  drivers/kvm/kvm_main.c |   59 
> +++-
>  drivers/kvm/svm.c  |   43 +++
>  drivers/kvm/vmx.c  |   43 +++
>  4 files changed, 146 insertions(+), 1 deletions(-)
>
> diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h
> index 059f074..0f6cc32 100644
> --- a/drivers/kvm/kvm.h
> +++ b/drivers/kvm/kvm.h
> @@ -329,6 +329,8 @@ struct kvm_vcpu_irq {
>   struct kvm_irqdevice dev;
>   int  pending;
>   int  deferred;
> + struct task_struct  *task;
> + int  guest_mode;
>   

->guest_mode can be folded into ->task, by specifying that ->task != 
NULL is equivalent to ->guest_mode != 0.  This will make the rest of the 
code easier to read.

>  };
>  
>  struct kvm_vcpu {
> diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c
> index 199489b..a160638 100644
> --- a/drivers/kvm/kvm_main.c
> +++ b/drivers/kvm/kvm_main.c
> @@ -1868,6 +1868,9 @@ static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, 
> struct kvm_run *kvm_run)
>   kvm_arch_ops->decache_regs(vcpu);
>   }
>  
> + vcpu->irq.task = current;
> + smp_wmb();
> +
>   

This is best moved where ->guest_mode is set.

> +/*
>   * This function will be invoked whenever the vcpu->irq.dev raises its INTR
>   * line
>   */
> @@ -2318,10 +2335,50 @@ static void kvm_vcpu_intr(struct kvm_irqsink *this,
>  {
>   struct kvm_vcpu *vcpu = (struct kvm_vcpu*)this->private;
>   unsigned long flags;
> + int direct_ipi = -1;
>  
>   spin_lock_irqsave(&vcpu->irq.lock, flags);
>   

irqs are always enabled here, so spin_lock_irq() (and a corresponding 
spin_unlock_irq) is sufficient.

>  static void kvm_vcpu_irqsink_init(struct kvm_vcpu *vcpu)
> diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c
> index 4c03881..91546ae 100644
> --- a/drivers/kvm/svm.c
> +++ b/drivers/kvm/svm.c
> @@ -1542,11 +1542,40 @@ static int svm_vcpu_run(struct kvm_vcpu *vcpu, struct 
> kvm_run *kvm_run)
>   u16 gs_selector;
>   u16 ldt_selector;
>   int r;
> + unsigned long irq_flags;
>  
>  again:
> + /*
> +  * We disable interrupts until the next VMEXIT to eliminate a race
> +  * condition for delivery of virtual interrutps.  Note that this is
> +  * probably not as bad as it sounds, as interrupts will still invoke
> +  * a VMEXIT once transitioned to GUEST mode (and thus exit this lock
> +  * scope) even if they are disabled.
> +  *
> +  * FIXME: Do we need to do anything additional to mask IPI/NMIs?
>   

You can remove the FIXME.

> +  */
> + local_irq_save(irq_flags);
>   

Interrupts are always enabled here, so local_irq_disable() suffices.

> @@ -1688,6 +1717,13 @@ again:
>  #endif
>   : "cc", "memory" );
>  
> + /*
> +  * FIXME: We'd like to turn on interrupts ASAP, but is this so early
> +  * that we will mess up the state of the CPU before we fully
> +  * transition from guest to host?
> +  */
>   

You can remove the FIXME.  Pre-patch enabled interrupts much earlier.

> + local_irq_restore(irq_flags);
> +
>   if (vcpu->fpu_active) {
>   fx_save(vcpu->guest_fx_image);
>   fx_restore(vcpu->host_fx_image);
> @@ -1710,6 +1746,13 @@ again:
>   reload_tss(vcpu);
>  
>   /*
> +  * Signal that we have transitioned back to host mode
> +  */
> + spin_lock_irqsave(&vcpu->irq.lock, irq_flags);
> + vcpu->irq.guest_mode = 0;
> + spin_unlock_irqrestore(&vcpu->irq.lock, irq_flags);
>   

 >> Don't you need to check interrupts here?
 > No, we assume that host userspace won't sleep.
Right, I forgot again.


> (prof_on == KVM_PROFILING))
> diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c
> index ca858cb..7b81fff 100644
> --- a/drivers/kvm/vmx.c
> +++ b/drivers/kvm/vmx.c
> @@ -1895,6 +1895,7 @@ static int vmx_vcpu_run(struct kvm_vcpu *vcpu, struct 
> kvm_run *kvm_run)
>   u16 fs_sel, gs_sel, ldt_sel;
>   int fs_gs_ldt_reload_needed;
>   int r;
> + unsigned long irq_flags;
>  
>  preempted:
>   /*
> @@ -1929,9 +1930,37 @@ preempted:
>   if (vcpu->guest_debug.enabled)
>   kvm_guest_debug_pre(vcpu);
>  
> + /*
> +  * We disable interrupts until the next VMEXIT to eliminate a race
> +  * condition for de

Re: [kvm-devel] [PATCH 6/8] KVM: Adds support for real NMI injection on VMX processors

2007-05-14 Thread Avi Kivity
Gregory Haskins wrote:
> Signed-off-by: Gregory Haskins <[EMAIL PROTECTED]>
> ---
>
>  drivers/kvm/vmx.c |   63 
> +
>  drivers/kvm/vmx.h |3 +++
>  2 files changed, 61 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c
> index bee4831..1c99bc9 100644
> --- a/drivers/kvm/vmx.c
> +++ b/drivers/kvm/vmx.c
> @@ -1148,7 +1148,14 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu)
>  PIN_BASED_VM_EXEC_CONTROL,
>  PIN_BASED_EXT_INTR_MASK   /* 20.6.1 */
>  | PIN_BASED_NMI_EXITING   /* 20.6.1 */
> +| PIN_BASED_VIRTUAL_NMI   /* 20.6.1 */
>   );
> +
> + if (!(vmcs_read32(PIN_BASED_VM_EXEC_CONTROL) & PIN_BASED_VIRTUAL_NMI))
> + printk(KERN_WARNING "KVM: Warning - Host processor does " \
> +"not support virtual-NMI injection.  Using IRQ " \
> +"method\n");
>   

Warning is too severe here.  Things work (right?), there's nothing the 
user can do about it, and no need to alert kvm-devel.  KERN_DEBUG is 
sufficient.


-- 
error compiling committee.c: too many arguments to function


-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [PATCH 8/8] KVM: Adds support for TPR shadowing under VMX processors

2007-05-14 Thread Avi Kivity
Gregory Haskins wrote:
> Signed-off-by: Gregory Haskins <[EMAIL PROTECTED]>
> ---
>
>   

How was this tested?

> + printk(KERN_WARNING "KVM: Warning - Host processor does " \
> +"not support TPR-shadow\n");
>   

KERN_DEBUG.


-- 
error compiling committee.c: too many arguments to function


-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [PATCH 09/10] KVM: Add statistics from interrupt subsystem

2007-05-14 Thread Avi Kivity
Gregory Haskins wrote:
>* then wake up the vcpu (if necessary)
> @@ -2540,6 +2544,7 @@ static void kvm_vcpu_intr(struct kvm_irqsink *this,
>*/
>   direct_ipi = task_cpu(vcpu->irq.task);
>   BUG_ON(direct_ipi == smp_processor_id());
> + ++vcpu->stat.guest_intr;
>   } else {
>   /*
>   

Maybe call it guest_preempt to make it a bit more specific.


-- 
error compiling committee.c: too many arguments to function


-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [PATCH 2/2] KVM: in-kernel-apic modification to QEMU

2007-05-14 Thread Avi Kivity
Anthony Liguori wrote:
>>  int intno;
>>  
>> +#ifdef USE_KVM
>> +if (!kvm_allowed) {
>> +#endif
>>  intno = apic_get_interrupt(env);
>>  if (intno >= 0) {
>>   
>> 
>
> Indent the inner block please :-)
>
>   

Indenting the inner code would cause merging to be very difficult, 
should qemu change something there.  Maybe you can reverse the sense of 
the test and add the code after the #endif instead.

-- 
error compiling committee.c: too many arguments to function


-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [PATCH/RFC 4/9] Basic guest virtual devices infrastructure

2007-05-14 Thread Avi Kivity
Carsten Otte wrote:
> From: Carsten Otte <[EMAIL PROTECTED]>
>
> This patch adds support for a new bus type that manages paravirtualized 
> devices. The bus uses the s390 diagnose instruction to query devices, and
> match them with the corresponding drivers.
> Future enhancements should include hotplug and hotremoval of virtual devices
> triggered by the host, and suspend/resume of virtual devices for migration.
>
>   

Interesting.  We could use a variation of this for x86 as well, but I'm not 
sure how easy it is to integrate it into closed source OSes (Windows).  
The diag instruction could be replaced by a hypercall which would make 
the code generic.

-- 
error compiling committee.c: too many arguments to function


-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [PATCH/RFC 4/9] Basic guest virtual devices infrastructure

2007-05-14 Thread Carsten Otte
Avi Kivity wrote:
> Interesting.  We could use a variation this for x86 as well, but I'm not 
> sure how easy it is to integrate it into closed source OSes (Windows).  
> The diag instruction could be replaced by a hypercall which would make 
> the code generic.
I think we need to freeze the hypercall API at some time, and consider 
it a stable kernel external API. We do then need to document these 
calls, and non-GPL hypervisors can implement it. We could eventually 
have a similar situation with one of the other non-GPL hypervisors on 
s390 that run Linux.

so long,
Carsten

-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [PATCH/RFC 6/9] virtual block device driver

2007-05-14 Thread Avi Kivity
Carsten Otte wrote:
> From: Carsten Otte <[EMAIL PROTECTED]>
>
> This driver provides access to virtual block devices. It does use its own
> make_request function which passes the bio to a workqueue thread. The 
> workqueue
> thread does use the diagnose hypervisor call to call the hosting Linux.
> The hypervisor code in host userspace does use aio_submit to initiate the IO. 
> Once the IO is done, the host will use io_getevents and then generate an
> interrupt to the guest. The interrupt handler calls bio_endio.
> This device driver is currently architecture dependent. We intend to move the
> host API to hypercall instead of the diagnose instruction. Please review.
>
> Signed-off-by: Carsten Otte <[EMAIL PROTECTED]>
>   

> +struct vdisk_device * vdisk_get_device_by_fd(int fd)
> +{
> + struct device *dev;
> + struct vdev *vdev;
> + struct vdisk_device *vdisk;
> +
> + dev = driver_find_device(&vdisk_driver.driver, NULL, (void*)(long)fd, 
> __find_fd);
> + if (!dev)
> + return NULL;
> + vdev  = to_vdev(dev);
> + vdisk = (struct vdisk_device *)vdev->drv_private;
> + return vdisk;
> +}
>   

Is this the host file descriptor?  If so, we want to use something more 
abstract (if the host side is in kernel, there will be no fd, or if the 
device is implemented using >1 files (or <1 files)).

> +
> +#define VDISK_WRITE 1
> +#define VDISK_READ  0
> +
> +struct vdisk_request {
> + unsigned long buf;
> + unsigned long count;
> +};
> +
> +typedef struct vdisk_request (*vdisk_req_t)[VDISK_NR_REQ];
> +
> +struct vdisk_response {
> + unsigned long intparm;
> + unsigned long count;
> + unsigned long failed;
> +};
> +
> +typedef struct vdisk_response (*vdisk_irq_t)[VDISK_NR_RES];
> +
> +struct vdisk_device {
> + struct list_head head;
> + int blocksize;
> + long size;
> + int read_only;
> + struct gendisk *gd;
> + struct vdev *vdev;
> + spinlock_t lock;
> + struct rw_semaphore pump_sem;
> + int open_count;
> + int vfd;
> + struct vdisk_request (*submit_page)[VDISK_NR_REQ];
>   


> + struct workqueue_struct *wq;
> + vdisk_irq_t irq_page;
> + wait_queue_head_t wait;
> +};
> +
> +struct vdisk_work {
> + struct work_struct work;
> + struct bio* bio;
> +};
> +
> +struct vdisk_elem {
> + unsigned int fd;
> + unsigned int command;
> + unsigned long offset;
> + unsigned long buffer;
> + unsigned long nbytes;
>   

We'll want scatter/gather here.

> +};
> +
> +struct vdisk_iocb_container {
> + struct iocb iocb;
> + struct bio *bio;
> + struct vdisk_device *dev;
> + int ctx_index;
> + unsigned long context;
> + struct list_head list;
> +};
> +
> +// from aio_abi.h
> +typedef enum io_iocb_cmd {
> + IO_CMD_PREAD = 0,
> + IO_CMD_PWRITE = 1,
> +
> + IO_CMD_FSYNC = 2,
> + IO_CMD_FDSYNC = 3,
> +
> + IO_CMD_POLL = 5,
> + IO_CMD_NOOP = 6,
> +} io_iocb_cmd_t;
>   

Our own commands, please.  We need READV, WRITEV, and a barrier for 
journalling filesystems.  FDSYNC should work as a barrier, but is 
wasteful.  The FSYNC/FDSYNC distinction is meaningless.  POLL/NOOP are 
irrelevant.

> +static void vdisk_pump_bvecs(struct vdisk_device *dev, int op,
> +   loff_t start_offset, int requestno,
> +   struct bio* bio, struct bio_vec *(vectors[256]))
> +{
> + int i, rc;
> + loff_t offset = start_offset;
> + int nr_done = 0;
> + long size;
> + long flags=0;
> + DEFINE_WAIT(wait);
> +
> + spin_lock_irqsave(&dev->lock, flags);
> + prepare_to_wait_exclusive(&dev->wait, &wait,
> +   TASK_UNINTERRUPTIBLE);
> +
> + while (nr_done < requestno) {
> + memset(dev->submit_page, 0, PAGE_SIZE);
> + for (i=nr_done; i + (*dev->submit_page)[i-nr_done].buf =
> + (unsigned 
> long)page_address(vectors[i]->bv_page) +
> + vectors[i]->bv_offset;
> + (*dev->submit_page)[i-nr_done].count = 
> vectors[i]->bv_len;
> + }
> +
> + rc = diag_vdisk_submit_request(dev->vfd,
> + dev->submit_page,
> + op, offset,
> + requestno-nr_done, bio);
> +
> + if (rc < 0) {
> + // error case
> + size = 0;
> + for (i=0; i<(requestno-nr_done); i++)
> + size += (*dev->submit_page)[i].count;
> + bio_io_error(bio, size);
> + break;
> + }
> +
> + if (rc == requestno - nr_done)
> + // everything was submitted propper
> + break;
> +
> + if (rc) {
> + //request was partly submitted
> +  

Re: [kvm-devel] [PATCH/RFC 6/9] virtual block device driver

2007-05-14 Thread Avi Kivity
Carsten Otte wrote:
> From: Carsten Otte <[EMAIL PROTECTED]>
>
> This driver provides access to virtual block devices. It does use its own
> make_request function which passes the bio to a workqueue thread. The 
> workqueue
> thread does use the diagnose hypervisor call to call the hosting Linux.
> The hypervisor code in host userspace does use aio_submit to initiate the IO. 
> Once the IO is done, the host will use io_getevents and then generate an
> interrupt to the guest. The interrupt handler calls bio_endio.
> This device driver is currently architecture dependent. We intend to move the
> host API to hypercall instead of the diagnose instruction. Please review.
>
>   

Oh. Why not use Xen's pending block driver? It probably has everything 
needed.

-- 
error compiling committee.c: too many arguments to function


-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [PATCH/RFC 7/9] Virtual network guest device driver

2007-05-14 Thread Avi Kivity
Anthony Liguori wrote:
> Dor Laor wrote:
>   
>> Furthermore,
>>   
>> 
>>> the plan is to completely rearchitect the netback/netfront protocol for
>>> the next Xen release (this effort is referred to netchannel2).
>>> 
>>>   
>> But isn't Jeremy Fitzhardinge pushing a big patch queue into the
>> kernel?
>>   
>> 
>
> Yes, but it's not in the kernel yet and there's no guarantee it'll get 
> there in time for KVM's consumption.
>   

I doubt we could add the missing features to kvmnet, test, optimize, 
submit to netdev, apply comments, re-submit, re-write, update to latest 
netdev api, and fix all the bugs much faster.

-- 
error compiling committee.c: too many arguments to function


-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [PATCH 00/10] in-kernel APIC v3 (kernel side)

2007-05-14 Thread Gregory Haskins
>>> On Sun, May 13, 2007 at  8:38 AM, in message <[EMAIL PROTECTED]>,
Avi Kivity <[EMAIL PROTECTED]> wrote: 
> Gregory Haskins wrote:
>> The load- average on my system is about 1 while XP is idling.  qemu seems to 
> be mostly at "0%" but will bounce up to 1% on occasion.  Here is the output 
> of "top - b - p " over a few seconds:
>>
>> top -  09:17:45 up 16:58,  3 users,  load average: 1.02, 0.86, 0.42
>> Tasks:   1 total,   0 running,   1 sleeping,   0 stopped,   0 zombie
>> Cpu(s):  0.1%us,  0.2%sy,  0.0%ni, 99.5%id,  0.1%wa,  0.0%hi,  0.1%si,  
> 0.0%st
>> Mem:   3994704k total,  2018980k used,  1975724k free,70996k buffers
>> Swap:  2104472k total,0k used,  2104472k free,  1148284k cached
>>
>>   PID USER  PR  NI  VIRT  RES  SHR S %CPU %MEMTIME+  COMMAND 
>
>> 10359 ghaskins  15   0  598m  83m  75m S0  2.1   1:37.87 qemu- system- 
>> x86   
>  
>>   
> 
> A good test is to let Windows boot and idle itself, then compare the 
> process cpu time under the TIME+ column with model- 0 and model- 1.

Will do.  One thing that I notice that I can't explain yet is as follows:

When I boot windows + level-1, the point at which windows is running in 16 bit 
real-mode in the very beginning (before the splashscreen comes up), we seem to 
take a very large number of exits for instruction emulation.  This ends up 
being a little storm of activity for about 1 second or so.  I am not really 
sure why this happens with the new code and not with the old.  It doesnt seem 
to hurt anything other than extra CPU used.  But its weird nonetheless. 

> 
> Since the vast majority of exits in the scenario are hitting the tpr, 
> I'd be unsurprised if the time is 50% lower or so.

Yeah, the new code essentially converts all those TPR exits to lightweight.  
Nothing more, nothing less.  I could be crazy, but my perception is the GUI is 
*much* more responsive because of it, however.  Windows draw very fast and it 
actually seems usable.  Whereas trunk+ACPI always feels sluggish.  I don't know 
of a good benchmark to run to see if there really is an improvement, however.





-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [PATCH/RFC 4/9] Basic guest virtualdevices infrastructure

2007-05-14 Thread Dor Laor
>Avi Kivity wrote:
>> Interesting.  We could use a variation this for x86 as well, but I'm
not
>> sure how easy it is to integrate it into closed source OSes
(Windows).
>> The diag instruction could be replaced by a hypercall which would
make
>> the code generic.
>I think we need to freeze the hypercall API at some time, and consider
>it a stable kernel external API. We do then need to document these
>calls, and non-GPL hypervisors can implement it. We could eventually
>have a similar situation with one of the other non-GPL hypervisors on
>s390 that run Linux.

I think Avi meant using a virtual bus as an option for HVMs too (Windows
especially). Currently we're using the PCI bus. Using a new virtualized
bus might be a good idea; it's easy & clean for open source. The
question is whether it makes life easier for HVMs. For instance, on Windows
we'll need PnP support for these devices.

-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [PATCH/RFC 7/9] Virtual network guest device driver

2007-05-14 Thread Avi Kivity
ron minnich wrote:
> We had hoped to get something like this into Xen. On Xen, for example,
> the block device and ethernet device interfaces are as different as
> one could imagine. Disk I/O does not steal pages from the guest. The
> network does. Disk I/O is in 4k chunks, period, with a bitmap
> describing which of the 8 512-byte subunits are being sent. The enet
> device, on read, returns a page with your packet, but also potentially
> containing bits of other domain's packets too. The interfaces are as
> dissimilar as they can be, and I see no reason for such a huge
> variance between what are basically read/write devices.
>   

The reason for the variance is that hardware capabilities are very 
different for network and block. Block device requests are always 
guest-initiated and sector-aligned, and often span many pages. On the 
other hand, network packets are byte aligned, and rx packets are 
host-initiated, triggering the stolen pages concept (which 
unsurprisingly turned out not to be a win). Network has such esoteric 
features as TSO. Block is very interested in actually getting things 
onto the disk (barrier support).

In short, the "everything is a stream of bytes" grossly oversimplifies 
things.

> Another issue is that kvm, in its current form (-24) is beautifully
> simple. These additions seem to detract from the beauty a  bit. Might
> it be worth taking a little time to consider these ideas in order to
> preserve the basic elegance of KVM?
>   

kvm? elegant and simple? it's basically a pile of special cases.

But I agree that the growing code base is a problem. With the block 
driver we can probably keep the host side in userspace, but to do the 
same for networking is much more work. I do think (now) that it is doable.

-- 
error compiling committee.c: too many arguments to function


-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [PATCH 00/10] in-kernel APIC v3 (kernel side)

2007-05-14 Thread Dor Laor
>>
>> A good test is to let Windows boot and idle itself, then compare the
>> process cpu time under the TIME+ column with model- 0 and model- 1.
>
>Will do.  One that that I notice that I can't explain yet is as
follows:
>
>When I boot windows + level-1, the point at which windows is running in
16
>bit real-mode in the very beginning (before the splashscreen comes up),
we
>seem to take a very large number of exits for instruction emulation.
This
>ends up being a little storm of activity for about 1 second or so.  I
am
>not really sure why this happens with the new code and not with the
old.
>It doesnt seem to hurt anything other than extra CPU used.  But its
weird
>nonetheless.
>
>>
>> Since the vast majority of exits in the scenario are hitting the tpr,
>> I'd be unsurprised if the time if 50% lower or so.
>
>Yeah, the new code essentially converts all those TPR exits to
lightweight.
>Nothing more, nothing less.  I could be crazy, but my perception is the
GUI
>is *much* more responsive because of it, however.  Windows draw very
fast
>and it actually seems usable.  Whereas trunk+ACPI always feels
sluggish.  I
>don't know of a good benchmark to run to see if there really is an
>improvement, however.
>
- Since light exits are much faster than regular exits the performance
change is noticeable :)
- Don't use the usb_device tablet option since it uses polling and wastes
lots of cpu.
- You can use pcmark05 benchmark.

-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [PATCH 2/4] KVM-USER: Make the kvm_allowed flag always defined so we dont need #ifdefs

2007-05-14 Thread Avi Kivity
Gregory Haskins wrote:
> Non-performance critical code is made more awkward by having to always define
> both "#ifdef KVM" and "if (kvm_allowed)".  Define "kvm_allowed = 0" by
> default.  Anthony Liguori is credited with the idea.
>
> Signed-off-by: Gregory Haskins <[EMAIL PROTECTED]>
> ---
>
>  qemu/qemu-kvm.c |9 -
>  1 files changed, 8 insertions(+), 1 deletions(-)
>
> diff --git a/qemu/qemu-kvm.c b/qemu/qemu-kvm.c
> index 212570a..d4419a3 100644
> --- a/qemu/qemu-kvm.c
> +++ b/qemu/qemu-kvm.c
> @@ -3,6 +3,14 @@
>  #include "config-host.h"
>  
>  #ifdef USE_KVM
> + #define KVM_ALLOWED_DEFAULT 1
> +#else
> + #define KVM_ALLOWED_DEFAULT 0
> +#endif

You could do a

#ifndef USE_KVM
#define kvm_allowed 0
#else
extern int kvm_allowed
#endif

However, will the code actually compile ifndef USE_KVM? Suppose the 
headers aren't installed?

-- 
error compiling committee.c: too many arguments to function


-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [PATCH 00/10] in-kernel APIC v3 (kernel side)

2007-05-14 Thread Avi Kivity
Gregory Haskins wrote:

  

>> A good test is to let Windows boot and idle itself, then compare the 
>> process cpu time under the TIME+ column with model- 0 and model- 1.
>> 
>
> Will do.  One thing that I notice that I can't explain yet is as follows:
>
> When I boot windows + level-1, the point at which windows is running in 16 
> bit real-mode in the very beginning (before the splashscreen comes up), we 
> seem to take a very large number of exits for instruction emulation.  This 
> ends up being a little storm of activity for about 1 second or so.  I am not 
> really sure why this happens with the new code and not with the old.  It 
> doesnt seem to hurt anything other than extra CPU used.  But its weird 
> nonetheless. 
>
>   

Very strange. Maybe it is a problem with emulating the apic disabled 
mode. Or maybe the initial state of the apic is different between qemu 
and kvm+apic.

>> Since the vast majority of exits in the scenario are hitting the tpr, 
>> I'd be unsurprised if the time is 50% lower or so.
>> 
>
> Yeah, the new code essentially converts all those TPR exits to lightweight.  
> Nothing more, nothing less.  I could be crazy, but my perception is the GUI 
> is *much* more responsive because of it, however.  Windows draw very fast and 
> it actually seems usable.  Whereas trunk+ACPI always feels sluggish.  I don't 
> know of a good benchmark to run to see if there really is an improvement, 
> however.
>   

Just the cpu time spent during boot would be a good indication.

-- 
error compiling committee.c: too many arguments to function


-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [PATCH 05/10] KVM: Adds ability to signal userspace using a file-descriptor

2007-05-14 Thread Gregory Haskins
>>> On Sun, May 13, 2007 at  9:00 AM, in message <[EMAIL PROTECTED]>,
Avi Kivity <[EMAIL PROTECTED]> wrote: 
> Gregory Haskins wrote:
>> Signed- off- by: Gregory Haskins <[EMAIL PROTECTED]>
>>   
> 
> Please include patch descriptions.

Ack.

On that topic: Does anyone know how to retroactively change the patch comment 
in StGIT?

> 
>> ---
>>
>>  drivers/kvm/kvm.h  |2 +
>>  drivers/kvm/kvm_main.c |   82 
> 
>>  2 files changed, 84 insertions(+), 0 deletions(- )
>>
>> diff -- git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h
>> index 7b5d5e6..1c46830 100644
>> ---  a/drivers/kvm/kvm.h
>> +++ b/drivers/kvm/kvm.h
>> @@ - 333,6 +333,8 @@ struct kvm_vcpu_irq {
>>  int  deferred;
>>  struct task_struct  *task;
>>  int  guest_mode;
>> +wait_queue_head_t wq;
>> +int  usignal;
>>  };
>>  
>>  struct kvm_vcpu {
>> diff -- git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c
>> index 3304cce..9a6d2c5 100644
>> ---  a/drivers/kvm/kvm_main.c
>> +++ b/drivers/kvm/kvm_main.c
>> @@ - 40,6 +40,7 @@
>>  #include 
>>  #include 
>>  #include 
>> +#include 
>>  
>>  #include "x86_emulate.h"
>>  #include "segment_descriptor.h"
>> @@ - 326,6 +327,7 @@ static struct kvm *kvm_create_vm(void)
>>  memset(&vcpu- >irq, 0, sizeof(vcpu- >irq));
>>  spin_lock_init(&vcpu- >irq.lock);
>>  vcpu- >irq.deferred = - 1;
>> +init_waitqueue_head(&vcpu- >irq.wq);
>>  
>>  vcpu- >cpu = - 1;
>>  vcpu- >kvm = kvm;
>> @@ - 2288,11 +2290,78 @@ static int kvm_vcpu_release(struct inode *inode, 
> struct file *filp)
>>  return 0;
>>  }
>>  
>> +static unsigned int kvm_vcpu_poll(struct file *filp, poll_table *wait)
>> +{
>> +struct kvm_vcpu *vcpu = filp- >private_data;
>> +unsigned int events = 0;
>> +unsigned long flags;
>> +
>> +poll_wait(filp, &vcpu- >irq.wq, wait);
>> +
>> +spin_lock_irqsave(&vcpu- >irq.lock, flags);
>> +if (vcpu- >irq.usignal)
>> +events |= POLLIN;
>> +spin_unlock_irqrestore(&vcpu- >irq.lock, flags);
>> +
>> +return events;
>> +}
>> +
>> +static ssize_t kvm_vcpu_read(struct file *filp, char __user *buf, size_t 
> count,
>> + loff_t *ppos)
>> +{
>>   
> 
> Is having a read() (or a write()) actually necessary?

Based on what I know: yes.  It could be a case of ignorance, however ;)

Heres why I think its necessary:  You need poll to simply tell you when 
something is pending.  You can't clear the pending status in poll because you 
cannot predict the internal access pattern (e.g. I assume it could be polled 
multiple times by the kernel without returning immediately to userspace).  
Therefore, you need a second method to actually clear the pending "signal", 
which I use the read() method for.  I can be convinced otherwise, but that was 
my original thinking.

> 
>> +
>> +if (indirect_sig && waitqueue_active(&vcpu- >irq.wq))
>> +wake_up(&vcpu- >irq.wq);
>>  }
>>  
>>   
> 
> Did you check that we can actually deliver signals with this?  I think a 
> fasync_struct or something like that is necessary, but not sure.

Actually, my signals *didn't* seem to be working, but they werent working with 
"send_sig()" either so I just assumed I had a userspace coding problem.  Based 
on what I read, it seemed like what I did should work if you do a 
fcntl(F_SETSIG), etc.  But again, it could be ignorance.  I am not familiar 
with fasync_struct.  If you have any pointers, please forward.

> 
> Another implementation option (which I've only thought of now, sorry) is 
> to have an ioctl which returns a real eventfd, reducing some code 
> duplication.

So based on this, I assume eventfd must be in the kernel already?  Cool.  Even 
if its not, I like this idea much better than what I did.  There was still an 
unresolved problem regarding how I was going to expose the signaling mechanism 
to QEMU without giving away the vcpu_fd from the kvmctl library that this 
solves nicely.

With this methodology, I can simply provide a function like 
"kvm_vcpu_get_eventfd()" in the library, and return the eventfd directly to the 
QEMU process.  Then we dont have to worry about layering violations.  Nice!


-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [PATCH 05/10] KVM: Adds ability to signal userspace using a file-descriptor

2007-05-14 Thread Avi Kivity
Gregory Haskins wrote:
>>
>> Is having a read() (or a write()) actually necessary?
>> 
>
> Based on what I know: yes.  It could be a case of ignorance, however ;)
>
> Heres why I think its necessary:  You need poll to simply tell you when 
> something is pending.  You can't clear the pending status in poll because you 
> cannot predict the internal access pattern (e.g. I assume it could be polled 
> multiple times by the kernel without returning immediately to userspace).  
> Therefore, you need a second method to actually clear the pending "signal", 
> which I use the read() method for.  I can be convinced otherwise, but that 
> was my original thinking.
>   

I think you are right, but am cc'ing an expert. Davide, we're using an 
fd to signal something to userspace, but have nothing to actually read() 
or write(). Is a read() or write() avoidable?

>   
>>> +
>>> +   if (indirect_sig && waitqueue_active(&vcpu- >irq.wq))
>>> +   wake_up(&vcpu- >irq.wq);
>>>  }
>>>  
>>>   
>>>   
>> Did you check that we can actually deliver signals with this?  I think a 
>> fasync_struct or something like that is necessary, but not sure.
>> 
>
> Actually, my signals *didn't* seem to be working, but they werent working 
> with "send_sig()" either so I just assumed I had a userspace coding problem.  
> Based on what I read, it seemed like what I did should work if you do a 
> fcntl(F_SETSIG), etc.  But again, it could be ignorance.  I am not familiar 
> with fasync_struct.  If you have any pointers, please forward.
>
>   

fs/pipe.c. hairy stuff.

>> Another implementation option (which I've only thought of now, sorry) is 
>> to have an ioctl which returns a real eventfd, reducing some code 
>> duplication.
>> 
>
> So based on this, I assume eventfd must be in the kernel already?  Cool.

It is in 2.6.22-rc1. As is the anonymous inodes source which can be used 
to retire kvmfs (which will probably break the record for shortest-lived 
filesystem ever).

>   Even if its not, I like this idea much better than what I did.  There was 
> still an unresolved problem regarding how I was going to expose the signaling 
> mechanism to QEMU without giving away the vcpu_fd from the kvmctl library 
> that this solves nicely.
>
> With this methodology, I can simply provide a function like 
> "kvm_vcpu_get_eventfd()" in the library, and return the eventfd directly to 
> the QEMU process.  Then we dont have to worry about layering violations.  
> Nice!
>   

I hadn't thought of it. Looks like a win from all directions.

It means we need to package eventfd for the external module, but that's 
easily done.

-- 
error compiling committee.c: too many arguments to function


-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [PATCH 00/10] in-kernel APIC v3 (kernel side)

2007-05-14 Thread Gregory Haskins
>>> On Sun, May 13, 2007 at  9:10 AM, in message <[EMAIL PROTECTED]>,
Avi Kivity <[EMAIL PROTECTED]> wrote: 
> Gregory Haskins wrote:
>> I am pleased to announce v3, which builds upon v2 by adding:
>>   
> 
> While I haven't reviewed all this yet, it look like we can merge this 
> early next week.

Thats great news, thanks!

> 
> What remains to be done:
> -  boot test mainstream (Linux + Windows) guests on (vmx, svm) x (32, 64) 
> host x (32, 64) guest
> -  boot test non- mainstream guests
> -  measure Windows APIC HAL performance improvement
> -  measure Linux performance non- regression
> -  show newer userspace running older with Linux 2.6.22- rc1 and kvm- 24 
> userspace running with new code
> -  add non- intrusive external module support
> 
> I'll certainly help with the second item, as I've built a considerable 
> collection of guests, and possibly with others as time allows.  I guess 
> I can help with the last too.

I can handle the other items, except for SVM testing.  I currently only have a 
Merom-T7600 and Woodcrest-5130 Intel setup.  I should have some SVM hardware at 
some point soon, but not sure when (probably a few weeks out at the earliest).  
Any help also testing the SVM side would be appreciated.

> 
> Mainstream target is 2.6.23.




-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [PATCH 00/10] in-kernel APIC v3 (kernel side)

2007-05-14 Thread Avi Kivity
Gregory Haskins wrote:
> I can handle the other items, except for SVM testing.  I currently only have 
> a Merom-T7600 and Woodcrest-5130 Intel setup.  I should have some SVM 
> hardware at some point soon, but not sure when (probably a few weeks out at 
> the earliest).  Any help also testing the SVM side would be appreciated.
>   

I can do that.

-- 
error compiling committee.c: too many arguments to function


-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [PATCH/RFC 7/9] Virtual network guest device driver

2007-05-14 Thread Christian Bornträger
On Monday 14 May 2007 14:05, Avi Kivity wrote:
> But I agree that the growing code base is a problem. With the block 
> driver we can probably keep the host side in userspace, but to do the 
> same for networking is much more work. I do think (now) that it is doable.

Interesting. What kind of userspace networking do you have in mind?

One of the first tries from Carsten was to use tun/tap, which proved to be slow 
performance-wise.

What I had in mind was some kind of switch in userspace. That would allow 
non-root guests to define their own private networks. We could use Linux fast 
pipe implementation for guest-to-guest communication. 

The question is how to connect user space networks to the host ones?
- tun/tap is quite slow
- last time we checked, netfilter offered only IP hooks (if you don't use the 
bridging code)
- raw sockets get tricky if you do in/out at the same time because you have to 
manually deal with loops

This reminds me, that we actually have another party doing virtual networking 
between guests: UML. User mode linux actually can do networking/switching in 
userspace, but I cannot tell how well UMLs concept works out. 

Christian

-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [PATCH/RFC 7/9] Virtual network guest device driver

2007-05-14 Thread Avi Kivity
Christian Bornträger wrote:
> On Monday 14 May 2007 14:05, Avi Kivity wrote:
>   
>> But I agree that the growing code base is a problem. With the block 
>> driver we can probably keep the host side in userspace, but to do the 
>> same for networking is much more work. I do think (now) that it is doable.
>> 
>
> Interesting. What kind of userspace networking do you have in mind?
>
> One of the first tries from Carsten was to use tun/tap, which proved to be 
> slow 
> performance-wise.
>   

tun/tap, but extended with:

- true aio
- aio with scatter/gather (IO_CMD_PWRITEV/IO_CMD_PREADV)
- qemu support for native Linux aio (not the glibc hackaround currently 
in place), so we get event coalescing and cheap multi request submission
- tap support for tso

With these, we could conceivably reach speeds close to an in-kernel 
driver.  Unfortunately we'd only know after all the hard work was done.

> What I had in mind was some kind of switch in userspace. That would allow 
> non-root guests to define their own private networks. We could use Linux fast 
> pipe implementation for guest-to-guest communication. 
>
> The question is how to connect user space networks to the host ones?
> - tun/tap is quite slow
> - last time we checked, netfilter offered only IP hooks (if you don't use the 
> bridging code)
> - raw sockets get tricky if you do in/out at the same time because you have 
> to 
> manually deal with loops
>   

qemu has some support for this, see the '-net socket' option.


-- 
error compiling committee.c: too many arguments to function


-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [PATCH 00/10] in-kernel APIC v3 (kernel side)

2007-05-14 Thread Gregory Haskins
>>> On Sun, May 13, 2007 at 10:06 AM, in message
<[EMAIL PROTECTED]>,
"Dor Laor" <[EMAIL PROTECTED]> wrote: 
>> >
>>> Superb results, when I ran the old in-kernel apic I got much higher
> idle
>>> cpu consumption. I know that vmenter-vmexit latency was drastically
>>> improved but 0-1% cpu consumption for winxp 32 bit sounds too good
> to be
>>> true.
>>> Are you sure an ACPI HAL is used in the guest? (can be checked by
>>> looking at the computer node in the device manager).
>>
>>I did a clean install and didn't change it over to StandardPC.  The
> Control
>>Panel says "ACPI Uniprocessor PC" or something like that (recalling
> from
>>memory right now).  As a third datapoint, I am seeing the TPR changing
>>rapidly (which I would expect 0 changes if the ACPI stuff was
> disabled).
> 
> Great, I was a little skeptical (sorry for that) 

No problem.

> since several months
> ago when I tested the in- kernel- apic the idle time was much worse. After
> talking today with Avi, he said his vmentry- exit latency drastically
> improved performance and even the qemu's apic can reach these figures.

Yeah.  At first I was psyched when I saw my code running at 0-1%, but then I 
ran trunk and saw its not all that different :(  The good news is that windows 
is performing pretty well now, regardless of my patch or not.  Thats good for 
KVM all around.

> I'm sure that with load, your implementation will achieve a performance
> advantage.

In theory, we should be able to take advantage of all the light-exit gains we 
have made recently since that is the primary difference for windows+ACPI now.  
This means the APIC change will have an increasing impact as we improve the 
VMEXIT times (both in HW and SW).  So even if it doesn't make a huge diff 
today, it will always be contributing more and more as we push the ball forward 
in other areas.

But I am not disappointed.  My primary motivation is functionality, not 
performance. ;)  I need to be able to inject interrupts to a guest from the 
kernel for the work I am doing.  If I also improve performance, great!



-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [PATCH 00/10] in-kernel APIC v3 (kernel side)

2007-05-14 Thread Dor Laor
>
>> since several months
>> ago when I tested the in- kernel- apic the idle time was much worse.
>After
>> talking today with Avi, he said his vmentry- exit latency drastically
>> improved performance and even the qemu's apic can reach these
figures.
>
>Yeah.  At first I was psyched when I saw my code running at 0-1%, but
then
>I ran trunk and saw its not all that different :(  The good news is
that
>windows is performing pretty well now, regardless of my patch or not.
>Thats good for KVM all around.
>
>> I'm sure that with load, your implementation will achieve a performance
>> advantage.
>
>In theory, we should be able to take advantage of all the light-exit
gains
>we have made recently since that is the primary difference for
windows+ACPI
>now.  This means the APIC change will have an increasing impact as we
>improve the VMEXIT times (both in HW and SW).  So even if it doesn't
make a
>huge diff today, it will always be contributing more and more as we
push
>the ball forward in other areas.
>
>But I am not disappointed.  My primary motivation is functionality, not
>performance. ;)  I need to be able to inject interrupts to a guest from
the
>kernel for the work I am doing.  If I also improve performance, great!
>

Your timing is good for us too since we have our pv network driver that
now works with qemu's interrupts. This means that we use a signal in
order to inject rx packets to the guest. In-kernel apic should improve
that, I'll soon measure the diff although using NAPI might dissolve the
performance gain.
It's good stuff the apic will be in the kernel, once it's there we need
to consider other components too.

-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [PATCH/RFC 6/9] virtual block device driver

2007-05-14 Thread Carsten Otte

Avi Kivity wrote:
> Is this the host file descriptor?  If so, we want to use something more 
> abstract (if the host side is in kernel, there will be no fd, or if the 
> device is implemented using >1 files (or <1 files)).
This is indeed the host file descriptor. Host userland uses sys_open 
to retrieve it. I see the beauty of having the remote side in the 
kernel, however I fail to see why we would want to reinvent the wheel: 
asynchronous IO with O_DIRECT (to avoid host caching) does just what 
we want. System call latency adds to the in-kernel approach here.

> We'll want scatter/gather here.
If you want scatter/gather, you have to do request merging in the 
guest and use the do_request function of the block queue. That is 
because in make_request you only have a single chunk at hand.
With do_request, you would do that request merging twice and get twice 
the block device plug latency for nothing. The host is the better 
place to do IO scheduling, because it can optimize over IO from all 
guest machines.
> 
>> +};
>> +
>> +struct vdisk_iocb_container {
>> +struct iocb iocb;
>> +struct bio *bio;
>> +struct vdisk_device *dev;
>> +int ctx_index;
>> +unsigned long context;
>> +struct list_head list;
>> +};
>> +
>> +// from aio_abi.h
>> +typedef enum io_iocb_cmd {
>> +IO_CMD_PREAD = 0,
>> +IO_CMD_PWRITE = 1,
>> +
>> +IO_CMD_FSYNC = 2,
>> +IO_CMD_FDSYNC = 3,
>> +
>> +IO_CMD_POLL = 5,
>> +IO_CMD_NOOP = 6,
>> +} io_iocb_cmd_t;
>>   
> 
> Our own commands, please.  We need READV, WRITEV, and a barrier for 
> journalling filesystems.  FDSYNC should work as a barrier, but is 
> wasteful.  The FSYNC/FDSYNC distinction is meaningless.  POLL/NOOP are 
> irrelevant.
This matches the api of libaio. If userland translates this into 
struct iocb, this makes sense. The barrier however is a general 
problem with this approach: today, the asynchronous IO userspace api 
does not allow to submit a barrier. Therefore, our make_request 
function in the guest returns -ENOTSUPP in the guest which forces the 
file system to wait for IO completion. This does sacrifice some 
performance. The right thing to do would be to add the possibility to 
submit a barrier to the kernel aio interface.

> We want to amortize the hypercall over multiple bios (but maybe you're 
> doing that -- I'm not 100% up to speed on the block layer)
We don't. We do one per bio, and I agree that this is a major 
disadvantage of this approach. Since IO is slow (compared to 
vmenter/vmexit), it pays back through better IO scheduling. On our 
platform, this approach outperforms the scatter/gather do_request one.

> Any reason not to perform the work directly?
I owe you an answer to this one, I have to revisit our CVS logs to 
find out. We used to call from make_request without workqueue before, 
and I cannot remember why we changed that.

so long,
Carsten

-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [PATCH/RFC 6/9] virtual block device driver

2007-05-14 Thread Carsten Otte
Avi Kivity wrote:
> Oh. Why not use Xen's pending block driver? It probably has everything 
> needed.
We're not too eager to have our own device drivers become the solution 
of choice. I haven't looked at it so far, will do.

so long,
Carsten

-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [PATCH/RFC 4/9] Basic guest virtualdevices infrastructure

2007-05-14 Thread Carsten Otte
Dor Laor wrote:
> I think Avi meant using a virtual bus as an option for HVMs too (windows
> especially). Currently we're using the pci bus. Using a new virtualized
> bus might be a good idea, it's easy & clean for open source. The
> question is whether it makes life easier for HVMs. For instance, on windows we'll
> need PnP support for these devices.
Oh that way around. Thanks for clarification.
As far as I see, a stable hypercall API would also be good for 
maintaining non-GPL HVMs. Probably we should forge the API with 
respect to other HVMs needs then.

so long,
Carsten

-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [PATCH/RFC 7/9] Virtual network guest device driver

2007-05-14 Thread Carsten Otte
Avi Kivity wrote:
> But I agree that the growing code base is a problem. With the block 
> driver we can probably keep the host side in userspace, but to do the 
> same for networking is much more work. I do think (now) that it is doable.
I agree that networking needs to be handled in the host kernel. We go 
out to userspace for signaling at this time, but that's simply broken. 
All our userspace does is do a system call next.

so long,
Carsten

-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [PATCH 05/10] KVM: Adds ability to signal userspace using a file-descriptor

2007-05-14 Thread Gregory Haskins
>>> On Mon, May 14, 2007 at  8:22 AM, in message <[EMAIL PROTECTED]>,
Avi Kivity <[EMAIL PROTECTED]> wrote: 
>> So based on this, I assume eventfd must be in the kernel already?  Cool.
> 
> It is in 2.6.22- rc1. As is the anonymous inodes source which can be used 
> to retire kvmfs (which will probably break the record for shortest- lived 
> filesystem ever).
> 

I just did a search against my kvm.git HEAD and do not see anything related to 
eventfd.  Does this mean I should pull from linus' tree?  If I do this, will it 
still work in your tree?  (Sorry...relative git-newbie here).

-Greg  

-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [PATCH/PFC 0/2] s390 host support

2007-05-14 Thread Carsten Otte
Avi Kivity wrote:
> If the eventfd patchset is merged, then file descriptors will become the 
> standard Linux handle type, and poll (or rather, epoll) will become the 
> standard way of waiting for something to happen.  But of course if you 
> come up with something better we'll use that.
Triggered by this discussion, I have spent quite some time thinking 
about signaling, idle cpus, interrupts, signal processor (read: IPI) 
and such lately. It has become clear to me, that sleeping in userspace 
has been a bad design point I've made. The way kvm deals with this 
(idle cpu thread sleeps interruptible in kernel) is clearly preferable.

Our sie system call has quite a complex userspace interface that 
allows the user to modify various bits and pieces of our virtual cpu 
control block. All this is needed only, because we do a lot of wrong 
things in userspace. Like signal processor (read: IPI). I will go 
ahead and put these things into our kernel module. That should 
simplify our user<->kernel interface a lot.
One problem is, that we need to inject interrupts from userland. This 
requires waking up idle CPUs. I want to try how it comes out with a 
new system call for irqs rather than using tkill(). We could have the 
kernel choose the vcpu that is enabled for this interrupt for example. 
And the kernel can do optimizations like sending irqs to idle cpus 
preferably. The user could supply a CPU mask that specifies what CPUs 
come into question for the irq.

Another neat advantage of moving SIE specifics into the kernel module 
is, that our userspace will be left with device drivers only. We can 
then put those into kvm/qemu or switch to other paravirtual device 
drivers and discard our userspace code.

I believe once we've changed that, merging with kvm on both kernel and 
user side should become easier than it is today.

so long,
Carsten

-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [PATCH/RFC 6/9] virtual block device driver

2007-05-14 Thread Avi Kivity
Carsten Otte wrote:
>
> Avi Kivity wrote:
>> Is this the host file descriptor?  If so, we want to use something 
>> more abstract (if the host side is in kernel, there will be no fd, or 
>> if the device is implemented using >1 files (or <1 files)).
> This is indeed the host file descriptor. Host userland uses sys_open 
> to retrieve it. I see the beauty of having the remote side in the 
> kernel, however I fail to see why we would want to reinvent the wheel: 
> asynchronous IO with O_DIRECT (to avoid host caching) does just what 
> we want.

I don't see an immediate need to put the host-side driver in the kernel, 
but I don't want to embed the host fd (which is an implementation 
detail) into the host/guest ABI.  There may not even be a host fd.

> System call latency adds to the in-kernel approach here.

I don't understand this.

>
>> We'll want scatter/gather here.
> If you want scatter/gather, you have to do request merging in the 
> guest and use the do_request function of the block queue. That is 
> because in make_request you only have a single chunk at hand.
> With do_request, you would do that request merging twice and get twice 
> the block device plug latency for nothing. The host is the better 
> place to do IO scheduling, because it can optimize over IO from all 
> guest machines.

The bio layer already has scatter/gather (basically, a biovec), but the 
aio api (which you copy) doesn't.  The basic request should be a bio, 
not a bio page.

I don't think the guest driver needs to do its own merging.

>>
>>> +};
>>> +
>>> +struct vdisk_iocb_container {
>>> +struct iocb iocb;
>>> +struct bio *bio;
>>> +struct vdisk_device *dev;
>>> +int ctx_index;
>>> +unsigned long context;
>>> +struct list_head list;
>>> +};
>>> +
>>> +// from aio_abi.h
>>> +typedef enum io_iocb_cmd {
>>> +IO_CMD_PREAD = 0,
>>> +IO_CMD_PWRITE = 1,
>>> +
>>> +IO_CMD_FSYNC = 2,
>>> +IO_CMD_FDSYNC = 3,
>>> +
>>> +IO_CMD_POLL = 5,
>>> +IO_CMD_NOOP = 6,
>>> +} io_iocb_cmd_t;
>>>   
>>
>> Our own commands, please.  We need READV, WRITEV, and a barrier for 
>> journalling filesystems.  FDSYNC should work as a barrier, but is 
>> wasteful.  The FSYNC/FDSYNC distinction is meaningless.  POLL/NOOP 
>> are irrelevant.
> This matches the api of libaio. If userland translates this into 
> struct iocb, this makes sense. The barrier however is a general 
> problem with this approach: today, the asynchronous IO userspace api 
> does not allow to submit a barrier. Therefore, our make_request 
> function in the guest returns -ENOTSUPP in the guest which forces the 
> file system to wait for IO completion. This does sacrifice some 
> performance. The right thing to do would be to add the possibility to 
> submit a barrier to the kernel aio interface.

Right.  But the ABI needs to support barriers regardless of host kernel 
support.  When unavailable, barriers can be emulated by waiting for the 
request queue to flush itself.  If we do implement the host side in the 
kernel, then barriers become available.

>
>> We want to amortize the hypercall over multiple bios (but maybe 
>> you're doing that -- I'm not 100% up to speed on the block layer)
> We don't. We do one per bio, and I agree that this is a major 
> disadvantage of this approach. Since IO is slow (compared to 
> vmenter/vmexit), it pays back due to better IO scheduling. On our 
> platform, this approach outperforms the scatter/gather do_request one.

I/O may be slow, but you can have a lot more disks than cpus.

For example, if an I/O takes 1ms, and you have 100 disks, then you can 
issue 100K IOPS.  With one hypercall per request, that's ~50% of a cpu 
(at about 5us per hypercall that goes all the way to userspace).  That's 
not counting the overhead of calling io_submit().


-- 
error compiling committee.c: too many arguments to function


-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [PATCH 05/10] KVM: Adds ability to signal userspace using a file-descriptor

2007-05-14 Thread Avi Kivity
Gregory Haskins wrote:
 On Mon, May 14, 2007 at  8:22 AM, in message <[EMAIL PROTECTED]>,
 
> Avi Kivity <[EMAIL PROTECTED]> wrote: 
>   
>>> So based on this, I assume eventfd must be in the kernel already?  Cool.
>>>   
>> It is in 2.6.22- rc1. As is the anonymous inodes source which can be used 
>> to retire kvmfs (which will probably break the record for shortest- lived 
>> filesystem ever).
>>
>> 
>
> I just did a search against my kvm.git HEAD and do not see anything related 
> to eventfd.  Does this mean I should pull from linus' tree?  If I do this, 
> will it still work in your tree?  (Sorry...relative git-newbie here).
>
>   

kvm.git has eventfd merged.  See 
http://git.kernel.org/gitweb.cgi?p=linux/kernel/git/avi/kvm.git;a=blob;f=fs/eventfd.c;h=480e2b3c4166a85be538d6f2c5edc25eace5ec6f;hb=HEAD
 
.

You were probably hit by the sync delay between master.kernel.org and 
the mirrors, or pulled before I pushed.

-- 
error compiling committee.c: too many arguments to function


-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [PATCH/PFC 0/2] s390 host support

2007-05-14 Thread Avi Kivity
Carsten Otte wrote:
> Avi Kivity wrote:
>> If the eventfd patchset is merged, then file descriptors will become 
>> the standard Linux handle type, and poll (or rather, epoll) will 
>> become the standard way of waiting for something to happen.  But of 
>> course if you come up with something better we'll use that.
> Triggered by this discussion, I have spent quite some time thinking 
> about signaling, idle cpus, interrupts, signal processor (read: IPI) 
> and such lately. It has become clear to me, that sleeping in userspace 
> has been a bad design point I've made. The way kvm deals with this 
> (idle cpu thread sleeps interruptible in kernel) is clearly preferable.

kvm doesn't do this directly.  A hlt instruction (which is used on 
x86 to signal an idle cpu) is trapped and echoed to userspace, which 
then sleeps using select(2).

We thought of having hlt sleep in the kernel, but that meant that we 
would need to specify the exit conditions from sleep (signals, fd 
readiness, aio readiness).

(I think you're comparing to your pthread way of sleeping and waking, 
just making sure we're all on the same page here)



-- 
error compiling committee.c: too many arguments to function


-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


[kvm-devel] [PATCH] lighweight VM Exit (was:RE: guest state leak into host)

2007-05-14 Thread Dong, Eddie
Avi Kivity wrote:
> 
> Some exit handlers (even the #PF handler) can sleep sometimes.  They
> call kvm_arch_ops->vcpu_put(), do some sleepy thing, then call
> kvm_arch_ops->vcpu_load().  The changes in the commit make
> sure that if
> vcpu_put() is called, the lightweight exit is converted to a
> heavyweight exit.  Since such sleeps are rare, this is not expected
> to impact performance. 
> 
> See for example mmu_topup_memory_caches().
> 
> 
OK, how about this patch which further reduces the lightweight VM Exit
MSR save/restore?


thx,eddie

Signed-off-by:  Yaozu(Eddie) Dong [EMAIL PROTECTED]

against ca76d209b88c344fc6a8eac17057c0088a3d6940.



diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h
index 1bbafba..e61a7e6 100644
--- a/drivers/kvm/kvm.h
+++ b/drivers/kvm/kvm.h
@@ -287,6 +287,7 @@ struct kvm_vcpu {
u64 apic_base;
u64 ia32_misc_enable_msr;
int nmsrs;
+   int smsrs_bitmap;
struct vmx_msr_entry *guest_msrs;
struct vmx_msr_entry *host_msrs;
 
@@ -513,6 +514,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32
msr, u64 data);
 
 void fx_init(struct kvm_vcpu *vcpu);
 
+void load_msrs_select(struct vmx_msr_entry *e, int bitmap);
+void save_msrs_select(struct vmx_msr_entry *e, int bitmap);
 void load_msrs(struct vmx_msr_entry *e, int n);
 void save_msrs(struct vmx_msr_entry *e, int n);
 void kvm_resched(struct kvm_vcpu *vcpu);
diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c
index 1288cff..ef96fae 100644
--- a/drivers/kvm/kvm_main.c
+++ b/drivers/kvm/kvm_main.c
@@ -1596,6 +1596,30 @@ void kvm_resched(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_resched);
 
+void load_msrs_select(struct vmx_msr_entry *e, int bitmap)
+{
+   unsigned long nr;
+
+   while (bitmap) {
+   nr = __ffs(bitmap);
+   clear_bit(nr,&bitmap);
+   wrmsrl(e[nr].index, e[nr].data);
+   }
+}
+EXPORT_SYMBOL_GPL(load_msrs_select);
+
+void save_msrs_select(struct vmx_msr_entry *e, int bitmap)
+{
+   unsigned long nr;
+
+   while (bitmap) {
+   nr = __ffs(bitmap);
+   clear_bit(nr,&bitmap);
+   rdmsrl(e[nr].index, e[nr].data);
+   }
+}
+EXPORT_SYMBOL_GPL(save_msrs_select);
+
 void load_msrs(struct vmx_msr_entry *e, int n)
 {
int i;
diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c
index 804a623..67d076c 100644
--- a/drivers/kvm/vmx.c
+++ b/drivers/kvm/vmx.c
@@ -86,15 +86,6 @@ static const u32 vmx_msr_index[] = {
 
 #ifdef CONFIG_X86_64
 static unsigned msr_offset_kernel_gs_base;
-#define NR_64BIT_MSRS 4
-/*
- * avoid save/load MSR_SYSCALL_MASK and MSR_LSTAR by std vt
- * mechanism (cpu bug AA24)
- */
-#define NR_BAD_MSRS 2
-#else
-#define NR_64BIT_MSRS 0
-#define NR_BAD_MSRS 0
 #endif
 
 static inline int is_page_fault(u32 intr_info)
@@ -117,13 +108,23 @@ static inline int is_external_interrupt(u32
intr_info)
== (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
 }
 
-static struct vmx_msr_entry *find_msr_entry(struct kvm_vcpu *vcpu, u32
msr)
+static int __find_msr_index(struct kvm_vcpu *vcpu, u32 msr)
 {
int i;
 
for (i = 0; i < vcpu->nmsrs; ++i)
if (vcpu->guest_msrs[i].index == msr)
-   return &vcpu->guest_msrs[i];
+   return i;
+   return -1;
+}
+
+static struct vmx_msr_entry *find_msr_entry(struct kvm_vcpu *vcpu, u32
msr)
+{
+   int i;
+
+   i = __find_msr_index(vcpu, msr);
+   if (i >= 0) 
+   return &vcpu->guest_msrs[i];
return NULL;
 }
 
@@ -307,9 +308,9 @@ static void vmx_save_host_state(struct kvm_vcpu
*vcpu)
 #ifdef CONFIG_X86_64
if (is_long_mode(vcpu)) {
save_msrs(vcpu->host_msrs + msr_offset_kernel_gs_base,
1);
-   load_msrs(vcpu->guest_msrs, NR_BAD_MSRS);
}
 #endif
+   load_msrs_select(vcpu->guest_msrs, vcpu->smsrs_bitmap);
 }
 
 static void vmx_load_host_state(struct kvm_vcpu *vcpu)
@@ -336,12 +337,8 @@ static void vmx_load_host_state(struct kvm_vcpu
*vcpu)
 
reload_tss();
}
-#ifdef CONFIG_X86_64
-   if (is_long_mode(vcpu)) {
-   save_msrs(vcpu->guest_msrs, NR_BAD_MSRS);
-   load_msrs(vcpu->host_msrs, NR_BAD_MSRS);
-   }
-#endif
+   save_msrs_select(vcpu->guest_msrs, vcpu->smsrs_bitmap);
+   load_msrs_select(vcpu->host_msrs, vcpu->smsrs_bitmap);
 }
 
 /*
@@ -469,35 +466,51 @@ static void vmx_inject_gp(struct kvm_vcpu *vcpu,
unsigned error_code)
  */
 static void setup_msrs(struct kvm_vcpu *vcpu)
 {
-   int nr_skip, nr_good_msrs;
-
-   if (is_long_mode(vcpu))
-   nr_skip = NR_BAD_MSRS;
-   else
-   nr_skip = NR_64BIT_MSRS;
-   nr_good_msrs = vcpu->nmsrs - nr_skip;
+   int index,save_msrs;
 
-   /*
-* MSR_K6_STAR is only needed on long mode guests, and only
-* if efer.sce is enabled.
-*/
-   if (find_msr_entry(vcpu, MSR_K6_STAR)) {
-   --nr_good_msrs;
-#

Re: [kvm-devel] [PATCH] lighweight VM Exit

2007-05-14 Thread Avi Kivity
Dong, Eddie wrote:
> OK, how about this patch which further reduce the light weight VM Exit
> MSR save/restore?
>
>
> diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c
> index 1288cff..ef96fae 100644
> --- a/drivers/kvm/kvm_main.c
> +++ b/drivers/kvm/kvm_main.c
> @@ -1596,6 +1596,30 @@ void kvm_resched(struct kvm_vcpu *vcpu)
>  }
>  EXPORT_SYMBOL_GPL(kvm_resched);
>  
> +void load_msrs_select(struct vmx_msr_entry *e, int bitmap)
> +{
> + unsigned long nr;
> +
> + while (bitmap) {
> + nr = __ffs(bitmap);
> + clear_bit(nr,&bitmap);
> + wrmsrl(e[nr].index, e[nr].data);
> + }
> +}
> +EXPORT_SYMBOL_GPL(load_msrs_select);
> +
> +void save_msrs_select(struct vmx_msr_entry *e, int bitmap)
> +{
> + unsigned long nr;
> +
> + while (bitmap) {
> + nr = __ffs(bitmap);
> + clear_bit(nr,&bitmap);
> + rdmsrl(e[nr].index, e[nr].data);
> + }
> +}
> +EXPORT_SYMBOL_GPL(save_msrs_select);
> +
>   

__clear_bit() is faster here (no LOCK prefix).  But maybe we can avoid 
the entire thing by having a vcpu->active_msr_list (array of struct 
vmx_msr_entry) which is re-constructed every time the mode changes 
(instead of constructing the bitmap).  vmx_get_msr() can first look at 
the active msr list and then at the regular msr list.

>  
>  /*
> @@ -469,35 +466,51 @@ static void vmx_inject_gp(struct kvm_vcpu *vcpu,
> unsigned error_code)
>   */
>  static void setup_msrs(struct kvm_vcpu *vcpu)
>  {
> - int nr_skip, nr_good_msrs;
> -
> - if (is_long_mode(vcpu))
> - nr_skip = NR_BAD_MSRS;
> - else
> - nr_skip = NR_64BIT_MSRS;
> - nr_good_msrs = vcpu->nmsrs - nr_skip;
> + int index,save_msrs;
>   

space after comma

>  
> - /*
> -  * MSR_K6_STAR is only needed on long mode guests, and only
> -  * if efer.sce is enabled.
> -  */
> - if (find_msr_entry(vcpu, MSR_K6_STAR)) {
> - --nr_good_msrs;
> -#ifdef CONFIG_X86_64
> - if (is_long_mode(vcpu) && (vcpu->shadow_efer &
> EFER_SCE))
> - ++nr_good_msrs;
> + vcpu->smsrs_bitmap = 0;
> + if (is_long_mode(vcpu)) {
> + if ((index=__find_msr_index(vcpu, MSR_SYSCALL_MASK)) >=
> 0) {
> + set_bit(index, &vcpu->smsrs_bitmap);
> + }
>   

Assignment outside if (), spaces around =, please.  Single statements 
without {}.

Also __set_bit() applies here.

> + /*
> +  * MSR_K6_STAR is only needed on long mode guests, and
> only
> +  * if efer.sce is enabled.
> +  */
> + if ((index=__find_msr_index(vcpu, MSR_K6_STAR)) >= 0
> +#ifdef X86_64
> + && (vcpu->shadow_efer & EFER_SCE)
>  #endif
> + ) {
> + set_bit(index, &vcpu->smsrs_bitmap);
>   

You're saving MSR_K6_STAR unnecessarily on i386.  Since we don't export 
EFER on i386 (one day we should...), the guest can't use syscall.

> + }
>   }
>  
> + if ((index = __find_msr_index(vcpu, MSR_EFER)) >= 0) {
> + save_msrs = 1;
> + }
> + else {
> + save_msrs = 0;
> + index = 0;
> + }
>   

Why not use hardware autoloading?  Is it slower than software?

Otherwise looks good.  Did you measure performance improvement?  I 
usually use user/test/vmexit.c from kvm-userspace.git.


-- 
error compiling committee.c: too many arguments to function


-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [PATCH 3/8] KVM: Adds ability to preempt an executing VCPU

2007-05-14 Thread Gregory Haskins
>>> On Mon, May 14, 2007 at  5:34 AM, in message <[EMAIL PROTECTED]>,
Avi Kivity <[EMAIL PROTECTED]> wrote: 
> Gregory Haskins wrote:
>> The VCPU executes synchronously w.r.t. userspace today, and therefore
>> interrupt injection is pretty straight forward.  However, we will soon need
>> to be able to inject interrupts asynchronous to the execution of the VCPU
>> due to the introduction of SMP, paravirtualized drivers, and asynchronous
>> hypercalls.  This patch adds support to the interrupt mechanism to force
>> a VCPU to VMEXIT when a new interrupt is pending.
>>
>>   
> 
> Comments below are fairly minor, but worthwhile IMO.
> 
> 
> 
>> Signed- off- by: Gregory Haskins <[EMAIL PROTECTED]>
>> ---
>>
>>  drivers/kvm/kvm.h  |2 ++
>>  drivers/kvm/kvm_main.c |   59 
> +++-
>>  drivers/kvm/svm.c  |   43 +++
>>  drivers/kvm/vmx.c  |   43 +++
>>  4 files changed, 146 insertions(+), 1 deletions(- )
>>
>> diff -- git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h
>> index 059f074..0f6cc32 100644
>> ---  a/drivers/kvm/kvm.h
>> +++ b/drivers/kvm/kvm.h
>> @@ - 329,6 +329,8 @@ struct kvm_vcpu_irq {
>>  struct kvm_irqdevice dev;
>>  int  pending;
>>  int  deferred;
>> +struct task_struct  *task;
>> +int  guest_mode;
>>   
> 
> - >guest_mode can be folded into - >task, by specifying that - >task != 
> NULL is equivalent to - >guest_mode != 0.  This will make the rest of the 
> code easier to read.

The problem with doing it this way is that it's no longer possible to detect the 
optimizing condition of "irq.task != current" when injecting interrupts.  This 
means that userspace will be inadvertently sending itself a signal every time 
it injects interrupts, which IMHO is undesirable.

> 
>>  };
>>  
>>  struct kvm_vcpu {
>> diff -- git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c
>> index 199489b..a160638 100644
>> ---  a/drivers/kvm/kvm_main.c
>> +++ b/drivers/kvm/kvm_main.c
>> @@ - 1868,6 +1868,9 @@ static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, 
> struct kvm_run *kvm_run)
>>  kvm_arch_ops- >decache_regs(vcpu);
>>  }
>>  
>> +vcpu- >irq.task = current;
>> +smp_wmb();
>> +
>>   
> 
> This is best moved where - >guest_mode is set.

I can do this, but it's common to all platforms so I figured it was best to be 
out here?

> 
>> +/*
>>   * This function will be invoked whenever the vcpu- >irq.dev raises its INTR
>>   * line
>>   */
>> @@ - 2318,10 +2335,50 @@ static void kvm_vcpu_intr(struct kvm_irqsink *this,
>>  {
>>  struct kvm_vcpu *vcpu = (struct kvm_vcpu*)this- >private;
>>  unsigned long flags;
>> +int direct_ipi = - 1;
>>  
>>  spin_lock_irqsave(&vcpu- >irq.lock, flags);
>>   
> 
> irqs are always enabled here, so spin_lock_irq() (and a corresponding 
> spin_unlock_irq) is sufficient.

This and the rest of your comments make sense.  Consider them all acked.

> 
>>  static void kvm_vcpu_irqsink_init(struct kvm_vcpu *vcpu)
>> diff -- git a/drivers/kvm/svm.c b/drivers/kvm/svm.c
>> index 4c03881..91546ae 100644
>> ---  a/drivers/kvm/svm.c
>> +++ b/drivers/kvm/svm.c
>> @@ - 1542,11 +1542,40 @@ static int svm_vcpu_run(struct kvm_vcpu *vcpu, 
>> struct 
> kvm_run *kvm_run)
>>  u16 gs_selector;
>>  u16 ldt_selector;
>>  int r;
>> +unsigned long irq_flags;
>>  
>>  again:
>> +/*
>> + * We disable interrupts until the next VMEXIT to eliminate a race
>> + * condition for delivery of virtual interrutps.  Note that this is
>> + * probably not as bad as it sounds, as interrupts will still invoke
>> + * a VMEXIT once transitioned to GUEST mode (and thus exit this lock
>> + * scope) even if they are disabled.
>> + *
>> + * FIXME: Do we need to do anything additional to mask IPI/NMIs?
>>   
> 
> You can remove the FIXME.
> 
>> + */
>> +local_irq_save(irq_flags);
>>   
> 
> Interrupts are always enabled here, so local_irq_disable() suffices.
> 
>> @@ - 1688,6 +1717,13 @@ again:
>>  #endif
>>  : "cc", "memory" );
>>  
>> +/*
>> + * FIXME: We'd like to turn on interrupts ASAP, but is this so early
>> + * that we will mess up the state of the CPU before we fully
>> + * transition from guest to host?
>> + */
>>   
> 
> You can remove the FIXME.  Pre- patch enabled interrupts much earlier.
> 
>> +local_irq_restore(irq_flags);
>> +
>>  if (vcpu- >fpu_active) {
>>  fx_save(vcpu- >guest_fx_image);
>>  fx_restore(vcpu- >host_fx_image);
>> @@ - 1710,6 +1746,13 @@ again:
>>  reload_tss(vcpu);
>>  
>>  /*
>> + * Signal that we have transitioned back to host mode
>> + */
>> +spin_lock_irqsave(&vcpu- >irq.lock, irq_flags);
>> +vcpu- >irq.guest_mode = 0;
>> +spin_unlock_irqrestore(&vcpu- >irq.lock, irq_flags);
>>   
> 
>  >> Don't you need to check interrupts here?
>  > 

Re: [kvm-devel] [PATCH/PFC 0/2] s390 host support

2007-05-14 Thread Carsten Otte
Avi Kivity wrote:
> kvm doesn't do this directly.  A hlt instruction (which is used on 
> x86 to signal an idle cpu) is trapped and echoed to userspace, which 
> then sleeps using select(2).
I've read that part. I still don't like this approach, it just doesn't 
fit our signal processor instruction without interesting race conditions.
Our SIGP instruction does provide a condition code on the source cpu 
which indicates if the interprocessor signal was accepted by the 
target cpu.
- When the target CPU is going idle, but has not yet called signal(), 
how can we figure from kernel space if it has masked this interrupt? 
We would want to figure that quick to be able to reenter VM context on 
the initiating CPU asap.
- Also this requires synchronization, our arch requires there may be 
just one external interrupt pending per target CPU at a given time. 
How do we synchronize if both user and kernel can inject interrupts?

> We thought of having hlt sleep in the kernel, but that meant that we 
> would need to specify the exit conditions from sleep (signals, fd 
> readiness, aio readiness).
Yes, that is required indeed. I think pending signals should make the
syscall exit. AIO translates to SIGIO, and file descriptors should be 
checked by another pthread via poll.

> (I think you're comparing to your pthread way of sleeping and waking, 
> just making sure we're all on the same page here)
Yes we are. Sorry for confusion.

so long,
Carsten

-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [PATCH 8/8] KVM: Adds support for TPR shadowing under VMX processors

2007-05-14 Thread Gregory Haskins
>>> On Mon, May 14, 2007 at  7:09 AM, in message <[EMAIL PROTECTED]>,
Avi Kivity <[EMAIL PROTECTED]> wrote: 
> Gregory Haskins wrote:
>> Signed- off- by: Gregory Haskins <[EMAIL PROTECTED]>
>> ---
>>
>>   
> 
> How was this tested?

It's busted.  Don't use it ;)  It's just for example/comment only.  I will 
exclude it from future submissions.

> 
>> +printk(KERN_WARNING "KVM: Warning -  Host processor does " \
>> +   "not support TPR- shadow\n");
>>   
> 
> KERN_DEBUG.

Ack.




-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [PATCH/PFC 0/2] s390 host support

2007-05-14 Thread Carsten Otte
Carsten Otte wrote:
> - When the target CPU is going idle, but has not yet called signal(), 
> how can we figure from kernel space if it has masked this interrupt?
*Ouch*. Should be select(), not signal().

-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [PATCH] lighweight VM Exit

2007-05-14 Thread Christoph Hellwig
On Mon, May 14, 2007 at 06:14:31PM +0300, Avi Kivity wrote:
> Dong, Eddie wrote:
> > OK, how about this patch which further reduce the light weight VM Exit
> > MSR save/restore?
> >
> >
> > diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c
> > index 1288cff..ef96fae 100644
> > --- a/drivers/kvm/kvm_main.c
> > +++ b/drivers/kvm/kvm_main.c
> > @@ -1596,6 +1596,30 @@ void kvm_resched(struct kvm_vcpu *vcpu)
> >  }
> >  EXPORT_SYMBOL_GPL(kvm_resched);
> >  
> > +void load_msrs_select(struct vmx_msr_entry *e, int bitmap)
> > +{
> > +   unsigned long nr;
> > +
> > +   while (bitmap) {
> > +   nr = __ffs(bitmap);
> > +   clear_bit(nr,&bitmap);
> > +   wrmsrl(e[nr].index, e[nr].data);
> > +   }
> > +}
> > +EXPORT_SYMBOL_GPL(load_msrs_select);

Exported symbols should have names with a meaningful prefix and
a kerneldoc comment describing them.

-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [PATCH 3/8] KVM: Adds ability to preempt an executing VCPU

2007-05-14 Thread Avi Kivity
Gregory Haskins wrote:
>>> index 059f074..0f6cc32 100644
>>> ---  a/drivers/kvm/kvm.h
>>> +++ b/drivers/kvm/kvm.h
>>> @@ - 329,6 +329,8 @@ struct kvm_vcpu_irq {
>>> struct kvm_irqdevice dev;
>>> int  pending;
>>> int  deferred;
>>> +   struct task_struct  *task;
>>> +   int  guest_mode;
>>>   
>>>   
>> - >guest_mode can be folded into - >task, by specifying that - >task != 
>> NULL is equivalent to - >guest_mode != 0.  This will make the rest of the 
>> code easier to read.
>> 
>
> The problem with doing it this way is that its no longer possible to detect 
> the optimizing condition of "irq.task != current" when injecting interrupts.  
> This means that userspace will be inadvertently sending itself a signal every 
> time it injects interrupts, which IMHO is undesirable.
>
>   

I meant keeping ->task and dropping ->guest_mode.  Or did I 
misunderstand something?

>>>  
>>> +   vcpu- >irq.task = current;
>>> +   smp_wmb();
>>> +
>>>   
>>>   
>> This is best moved where - >guest_mode is set.
>> 
>
> I can do this, but its common to all platforms so I figured it was best to be 
> out here?
>
>   

Well, it scatters the logic.  If we can merge guest_mode and task it's 
moot anyway.


-- 
error compiling committee.c: too many arguments to function


-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [PATCH/PFC 0/2] s390 host support

2007-05-14 Thread Avi Kivity
Carsten Otte wrote:
> Avi Kivity wrote:
>> kvm doesn't do this directly.  A hlt instruction (which is used on 
>> x86 to signal an idle cpu) is trapped and echoed to userspace, which 
>> then sleeps using select(2).
> I've read that part. I still don't like this approach, it just doesn't 
> fit our signal processor instruction without interesting race 
> conditions.

x86 and s390 don't have to be the same when hardware differences 
warrant.  But see below.

> Our SIGP instruction does provide a condition code on the source cpu 
> which indicates if the interprocessor signal was accepted by the 
> target cpu.
> - When the target CPU is going idle, but has not yet called signal(), 
> how can we figure from kernel space if it has masked this interrupt? 
> We would want to figure that quick to be able to reenter VM context on 
> the initiating CPU asap.

I don't understand the context here.  Why would the target cpu call 
signal()?

> - Also this requires synchronization, our arch requires there may be 
> just one external interrupt pending per target CPU at a given time. 
> How do we synchronize if both user and kernel can inject interrupts?

With the code that's going into kvm now, userspace posts the interrupt 
to the kernel, and the kernel injects it.  So the kernel is the 
synchronization point (x86 has the same constraint).

>
>> We thought of having hlt sleep in the kernel, but that meant that we 
>> would need to specify the exit conditions from sleep (signals, fd 
>> readiness, aio readiness).
> Yes, that is required indeed. I think pending signals should make the
> syscall exit. AIO translates to SIGIO, and file descriptors should be 
> checked by another pthread via poll.
>

Currently qemu multiplexes fd readiness and vcpu execution on the same 
(and only) thread, but it may make sense to have completions reaped by 
an I/O thread, which then dispatches interrupts to the appropriate vcpu, 
if necessary.  That avoids unnecessary exits, especially if we have 
interrupt mitigation and guest smp.

-- 
error compiling committee.c: too many arguments to function


-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [PATCH/PFC 0/2] s390 host support

2007-05-14 Thread Avi Kivity
Carsten Otte wrote:
> Carsten Otte wrote:
>> - When the target CPU is going idle, but has not yet called signal(), 
>> how can we figure from kernel space if it has masked this interrupt?
> *Ouch*. Should be select(), not signal().

Ah ok.  The kernel signal (or fd readiness) logic takes care of this and 
avoids unnecessary wakeups.

-- 
error compiling committee.c: too many arguments to function


-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] Gentoo livecd amd64 not working?

2007-05-14 Thread Wink Saville
> Please provide a link to the livecd you used.
>
http://bouncer.gentoo.org/fetch/gentoo-2007.0-livecd/amd64/

I'm using bridging to connect guest to host and use the following commands:

1) qemu-img create -f qcow gentoo-amd64.img 20G

2) qemu-system-x86_64 . -hda gentoo-amd64.img -cdrom  \
livecd-amd64-installer-2007.0.iso -boot d -m 1024 \
-net nic,vlan=0 -net tap,vlan=0

3) Enter "gentoo" at boot prompt of guest

4) Execute "Gentoo Linux Installer (GTK+)" by double clicking

5) "Choose your install mode", select "Standard", then NEXT

6) "Partitioning", select "Recommended Layout", then YES to dialog

7) After partitioning is complete then, NEXT

8) "Local mounts", NEXT

9) "Network mount", NEXT (nothing selected)

10) "Stage selection", select "Build stage from files on LiveCD", NEXT

11) "Portage Tree", NEXT

12) Install activity stops with the progress bar:
 "Gentoo Linux Installer - installation Progress"
 "Step:   Unpack stage tarball"
 "Copying dev-lang/perl-5.8.8-r2 (50/116)"

The dmesg data on the host:

[47119.825384] device tap0 entered promiscuous mode
[47119.825454] audit(1179158073.252:6): dev=tap0 prom=256 old_prom=0
auid=4294967295
[47119.857412] br0: port 2(tap0) entering learning state
[47134.821907] br0: topology change detected, propagating
[47134.821914] br0: port 2(tap0) entering forwarding state

Also, the guest is alive I can execute other programs and can
"Cancel" the install. All seems well, just the install stopped.

I will leave the system up and try to provide other information
you may need when I get home tonight. Also, tonight I will try
to use the iso image natively just to be sure it at least works.

Cheers,

Wink Saville

-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [PATCH/RFC 5/9] s390 virtual console for guests

2007-05-14 Thread Christian Bornträger
On Friday 11 May 2007 21:00, Anthony Liguori wrote:
> I think it would be better to use hvc_console as Xen now uses it too.

I just had a look at hvc_console, and indeed this driver looks appropriate for 
us. Looking at the xen-frontend driver (~130 lines of code) and the simple 
interface (get_char and put_char) it should be reasonably easy to convert our 
driver to a hvc_console user.

Christian

-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [PATCH/RFC 5/9] s390 virtual console for guests

2007-05-14 Thread Christian Borntraeger
On Monday 14 May 2007 18:23, Christian Bornträger wrote:
> On Friday 11 May 2007 21:00, Anthony Liguori wrote:
> > I think it would be better to use hvc_console as Xen now uses it too.
> I just had a look at hvc_console, and indeed this driver looks appropriate 

As I started prototyping this frontend I realized that hvc_console requires 
some interfaces, which are not present on s390, e.g. we have no request_irq 
and free_irq. Dont know if hvc_console is still the right way to go for us. 
This needs more thinking. 

Christian

-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [PATCH 05/10] KVM: Adds ability to signal userspace using a file-descriptor

2007-05-14 Thread Davide Libenzi
On Mon, 14 May 2007, Avi Kivity wrote:

> Gregory Haskins wrote:
> > > 
> > > Is having a read() (or a write()) actually necessary?
> > > 
> > 
> > Based on what I know: yes.  It could be a case of ignorance, however ;)
> > 
> > Heres why I think its necessary:  You need poll to simply tell you when
> > something is pending.  You can't clear the pending status in poll because
> > you cannot predict the internal access pattern (e.g. I assume it could be
> > polled multiple times by the kernel without returning immediately to
> > userspace).  Therefore, you need a second method to actually clear the
> > pending "signal", which I use the read() method for.  I can be convinced
> > otherwise, but that was my original thinking.
> >   
> 
> I think you are right, but am cc'ing an expert. Davide, we're using an fd to
> signal something to userspace, but have nothing to actually read() or write().
> Is a read() or write() avoidable?

I don't know exactly what you want to do, but signalfd signal de-queueing 
competes with the standard Linux signal delivery, if signals are not 
blocked.
So if you don't want to read() the signal, you can just leave the signal 
unblocked, and it'll be delivered to the signal handler.
You can even leave the signal blocked and avoid read(), but poll() on the 
signalfd will always return POLLIN if the sigmask includes the pending 
signal.




- Davide



-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [PATCH 05/10] KVM: Adds ability to signal userspace using a file-descriptor

2007-05-14 Thread Avi Kivity
Davide Libenzi wrote:
> On Mon, 14 May 2007, Avi Kivity wrote:
>
>   
>> Gregory Haskins wrote:
>> 
 Is having a read() (or a write()) actually necessary?
 
 
>>> Based on what I know: yes.  It could be a case of ignorance, however ;)
>>>
>>> Heres why I think its necessary:  You need poll to simply tell you when
>>> something is pending.  You can't clear the pending status in poll because
>>> you cannot predict the internal access pattern (e.g. I assume it could be
>>> polled multiple times by the kernel without returning immediately to
>>> userspace).  Therefore, you need a second method to actually clear the
>>> pending "signal", which I use the read() method for.  I can be convinced
>>> otherwise, but that was my original thinking.
>>>   
>>>   
>> I think you are right, but am cc'ing an expert. Davide, we're using an fd to
>> signal something to userspace, but have nothing to actually read() or 
>> write().
>> Is a read() or write() avoidable?
>> 
>
> I don't know exactly what you want to do, but signalfd signal de-queueing 
> competes with the standard Linux signal delivery, if signals are not 
> blocked.
> So if you don't want to read() the signal, you can just leave the signal 
> unblocked, and it'll be delivered to the signal handler.
> You can even leave the signal blocked and avoid read(), but poll() on the 
> signalfd will always return POLLIN if the sigmask includes the pending 
> signal.
>   

This is not about a real signal.  We have an fd (for a pseudo 
filesystem) which wants to indicate its readiness to select(), but which 
doesn't have any real data to produce.  Is it possible to implement this 
without a read() or a write()?

We're also looking at using an eventfd for this, so this may be moot.

-- 
error compiling committee.c: too many arguments to function


-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [PATCH 05/10] KVM: Adds ability to signal userspace using a file-descriptor

2007-05-14 Thread Davide Libenzi
On Mon, 14 May 2007, Avi Kivity wrote:

> Davide Libenzi wrote:
> > On Mon, 14 May 2007, Avi Kivity wrote:
> > 
> >   
> > > Gregory Haskins wrote:
> > > 
> > > > > Is having a read() (or a write()) actually necessary?
> > > > > 
> > > > Based on what I know: yes.  It could be a case of ignorance, however ;)
> > > > 
> > > > Heres why I think its necessary:  You need poll to simply tell you when
> > > > something is pending.  You can't clear the pending status in poll
> > > > because
> > > > you cannot predict the internal access pattern (e.g. I assume it could
> > > > be
> > > > polled multiple times by the kernel without returning immediately to
> > > > userspace).  Therefore, you need a second method to actually clear the
> > > > pending "signal", which I use the read() method for.  I can be convinced
> > > > otherwise, but that was my original thinking.
> > > > 
> > > I think you are right, but am cc'ing an expert. Davide, we're using an fd
> > > to
> > > signal something to userspace, but have nothing to actually read() or
> > > write().
> > > Is a read() or write() avoidable?
> > > 
> > 
> > I don't know exactly what you want to do, but signalfd signal de-queueing
> > competes with the standard Linux signal delivery, if signals are not
> > blocked.
> > So if you don't want to read() the signal, you can just leave the signal
> > unblocked, and it'll be delivered to the signal handler.
> > You can even leave the signal blocked and avoid read(), but poll() on the
> > signalfd will always return POLLIN if the sigmask includes the pending
> > signal.
> >   
> 
> This is not about a real signal.  We have an fd (for a pseudo filesystem)
> which wants to indicate its readiness to select(), but which doesn't have any
> real data to produce.  Is it possible to implement this without a read() or a
> write()?
> 
> We're also looking at using an eventfd for this, so this may be moot.

Do you close the signaled fd after receiving the signal/event? If you 
don't close it, eventfd will always return ready (POLLIN). With the 
signalfd, you can leave the signal un-blocked and the signal will be 
delivered through the standard signal delivery, and the ready condition 
will be cleared.



- Davide



-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [PATCH 05/10] KVM: Adds ability to signal userspace using a file-descriptor

2007-05-14 Thread Gregory Haskins
>>> On Mon, May 14, 2007 at 10:42 AM, in message <[EMAIL PROTECTED]>,
Avi Kivity <[EMAIL PROTECTED]> wrote: 
> Gregory Haskins wrote:
> On Mon, May 14, 2007 at  8:22 AM, in message <[EMAIL PROTECTED]>,
> 
>> Avi Kivity <[EMAIL PROTECTED]> wrote: 
>>   
 So based on this, I assume eventfd must be in the kernel already?  Cool.
   
>>> It is in 2.6.22-  rc1. As is the anonymous inodes source which can be used 
>>> to retire kvmfs (which will probably break the record for shortest-  lived 
>>> filesystem ever).
>>>
>>> 
>>
>> I just did a search against my kvm.git HEAD and do not see anything related 
> to eventfd.  Does this mean I should pull from linus' tree?  If I do this, 
> will it still work in your tree?  (Sorry...relative git- newbie here).
>>
>>   
> 
> kvm.git has eventfd merged.  See 
> http://git.kernel.org/gitweb.cgi?p=linux/kernel/git/avi/kvm.git;a=blob;f=fs/
> eventfd.c;h=480e2b3c4166a85be538d6f2c5edc25eace5ec6f;hb=HEAD 
> .
> 
> You were probably hit by the sync delay between master.kernel.org and 
> the mirrors, or pulled before I pushed.


Ah thanks.  I see it now.  I am working on the new design that uses the 
existing eventfd() and should have a v4 patch for you soon (which includes your 
other feedback).  Stay tuned.

-Greg

-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [PATCH 05/10] KVM: Adds ability to signal userspace using a file-descriptor

2007-05-14 Thread Avi Kivity
Davide Libenzi wrote:
> On Mon, 14 May 2007, Avi Kivity wrote:
>
>   
>> Davide Libenzi wrote:
>> 
>>> On Mon, 14 May 2007, Avi Kivity wrote:
>>>
>>>   
>>>   
 Gregory Haskins wrote:
 
 
>> Is having a read() (or a write()) actually necessary?
>> 
>> 
> Based on what I know: yes.  It could be a case of ignorance, however ;)
>
> Heres why I think its necessary:  You need poll to simply tell you when
> something is pending.  You can't clear the pending status in poll
> because
> you cannot predict the internal access pattern (e.g. I assume it could
> be
> polled multiple times by the kernel without returning immediately to
> userspace).  Therefore, you need a second method to actually clear the
> pending "signal", which I use the read() method for.  I can be convinced
> otherwise, but that was my original thinking.
> 
>   
 I think you are right, but am cc'ing an expert. Davide, we're using an fd
 to
 signal something to userspace, but have nothing to actually read() or
 write().
 Is a read() or write() avoidable?
 
 
>>> I don't know exactly what you want to do, but signalfd signal de-queueing
>>> competes with the standard Linux signal delivery, if signals are not
>>> blocked.
>>> So if you don't want to read() the signal, you can just leave the signal
>>> unblocked, and it'll be delivered to the signal handler.
>>> You can even leave the signal blocked and avoid read(), but poll() on the
>>> signalfd will always return POLLIN if the sigmask includes the pending
>>> signal.
>>>   
>>>   
>> This is not about a real signal.  We have an fd (for a pseudo filesystem)
>> which wants to indicate its readiness to select(), but which doesn't have any
>> real data to produce.  Is it possible to implement this without a read() or a
>> write()?
>>
>> We're also looking at using an eventfd for this, so this may be moot.
>> 
>
> Do you close the signaled fd after receiving the signal/event? If you 
> don't close it, eventfd will always return ready (POLLIN).
>   

We don't.  Anyway, that's what we thought.  Thanks for the confirmation.


-- 
error compiling committee.c: too many arguments to function


-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [PATCH/RFC 5/9] s390 virtual console for guests

2007-05-14 Thread Anthony Liguori
Christian Borntraeger wrote:
> On Monday 14 May 2007 18:23, Christian Bornträger wrote:
>   
>> On Friday 11 May 2007 21:00, Anthony Liguori wrote:
>> 
>>> I think it would be better to use hvc_console as Xen now uses it too.
>>>   
>> I just had a look at hvc_console, and indeed this driver looks appropriate 
>> 
>
> As I started prototyping this frontend I realized that hvc_console requires 
> some interfaces, which are not present on s390, e.g. we have no request_irq 
> and free_irq. Dont know if hvc_console is still the right way to go for us. 
>   

It seems like request_irq is roughly the same as 
register_external_interrupt.  I suspect that you could get away with 
either patching hvc_console to use register_external_interrupt if 
CONFIG_S390 or perhaps providing a common interface.

I suspect that this is going to come up again for sharing other paravirt 
drivers.

Regards,

Anthony Liguori

> This needs more thinking. 
>
> Christian
>
>   


-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [PATCH 3/8] KVM: Adds ability to preempt an executing VCPU

2007-05-14 Thread Gregory Haskins
>>> On Mon, May 14, 2007 at 11:45 AM, in message <[EMAIL PROTECTED]>,
Avi Kivity <[EMAIL PROTECTED]> wrote: 
> Gregory Haskins wrote:
 index 059f074..0f6cc32 100644
 ---   a/drivers/kvm/kvm.h
 +++ b/drivers/kvm/kvm.h
 @@ -  329,6 +329,8 @@ struct kvm_vcpu_irq {
struct kvm_irqdevice dev;
int  pending;
int  deferred;
 +  struct task_struct  *task;
 +  int  guest_mode;
   
   
>>> -  >guest_mode can be folded into -  >task, by specifying that -  >task != 
>>> NULL is equivalent to -  >guest_mode != 0.  This will make the rest of the 
>>> code easier to read.
>>> 
>>
>> The problem with doing it this way is that its no longer possible to detect 
> the optimizing condition of "irq.task != current" when injecting interrupts.  
> This means that userspace will be inadvertently sending itself a signal every 
> time it injects interrupts, which IMHO is undesirable.
>>
>>   
> 
> I meant keeping - >task and dropping - >guest_mode.  Or did I 
> misunderstand something?

Its possible that I am actually misunderstanding you instead, but from my 
perspective those two variables are tracking orthogonal state.  irq.task is 
keeping track of the thread that is running the CPU.  This will tend to get set 
once (on the first entry to kvm_run()) and stay unchanged for the duration of 
the VM.  irq.guest_mode, on the other hand, will track whether the vcpu is in 
(or near) guest mode (to switch between direct_ipi and eventfd wakeup methods).

I like having both states tracked, because it allows me to optimize the vcpu 
interrupt if the context of the injection is the same as the execution.  E.g. 
if the single QEMU thread calls KVM_RUN and then KVM_INTERRUPT, I can skip 
sending an eventfd because I know the irq.task == current and its pointless.  

(Note that in the original designs, irq.task was also used to designate a 
target for send_sig.  Perhaps it is no longer logical to have this scoped to 
the vcpu.irq structure anymore?  E.g. should I make it vcpu.task?)


> 
  
 +  vcpu-  >irq.task = current;
 +  smp_wmb();
 +
   
   
>>> This is best moved where -  >guest_mode is set.
>>> 
>>
>> I can do this, but its common to all platforms so I figured it was best to 
> be out here?
>>
>>   
> 
> Well, it scatters the logic.  If we can merge guest_mode and task it's 
> moot anyway.

Sounds reasonable.  If you convince me to condense this it goes away outright, 
otherwise I will move it together. ;)





-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [PATCH 05/10] KVM: Adds ability to signal userspace using a file-descriptor

2007-05-14 Thread Gregory Haskins
>>> On Mon, May 14, 2007 at  1:23 PM, in message <[EMAIL PROTECTED]>,
Avi Kivity <[EMAIL PROTECTED]> wrote: 
>
>> Do you close the signaled fd after receiving the signal/event? If you 
>> don't close it, eventfd will always return ready (POLLIN).
>>   
> 
> We don't.  Anyway, that's what we thought.  Thanks for the confirmation.
> 

And plus I just finished converting to Davide's eventfd, so its moot ;)

On that topic, I could use some advice:

I was originally planning on adding a new ioctl like KVM_VCPU_CREATE_EVENTFD 
which would allocate a new eventfd and return it.  However, I soon realized 
that the only method to create an eventfd is sys_eventfd(), which is not 
exported by the eventfd.h headerfile (presumably this must be a new system 
call).

So based on that, I figured I would change the model so that the usermode app 
should call the eventfd open() call on its own, and then they could register 
the fd with me.  So KVM_VCPU_CREATE_EVENTFD becomes KVM_VCPU_SET_EVENTFD (where 
-1 "unregisters" it).

Does this sound like a reasonable approach?  If so, how does the usermode app 
actually open the eventfd today?  Is there a new glibc that I need to get the 
new system call?  Or can the app use open() somehow?  If open(), what is the 
path that should be specified?

Conversely, if my first approach was the right one how do I invoke the 
sys_eventfd()?  Is there a way to invoke system calls in kernel mode?  A better 
way?

Any advice appreciated.

-Greg




-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [PATCH 05/10] KVM: Adds ability to signal userspace using a file-descriptor

2007-05-14 Thread Davide Libenzi
On Mon, 14 May 2007, Gregory Haskins wrote:

> >>> On Mon, May 14, 2007 at  1:23 PM, in message <[EMAIL PROTECTED]>,
> Avi Kivity <[EMAIL PROTECTED]> wrote: 
> >
> >> Do you close the signaled fd after receiving the signal/event? If you 
> >> don't close it, eventfd will always return ready (POLLIN).
> >>   
> > 
> > We don't.  Anyway, that's what we thought.  Thanks for the confirmation.
> > 
> 
> And plus I just finished converting to Davide's eventfd, so its moot ;)
> 
> On that topic, I could use some advice:
> 
> I was originally planning on adding a new ioctl like KVM_VCPU_CREATE_EVENTFD 
> which would allocate a new eventfd and return it.  However, I soon realized 
> that the only method to create an eventfd is sys_eventfd(), which is not 
> exported by the eventfd.h headerfile (presumably this must be a new system 
> call).
> 
> So based on that, I figured I would change the model so that the usermode app 
> should call the eventfd open() call on its own, and then they could register 
> the fd with me.  So KVM_VCPU_CREATE_EVENTFD becomes KVM_VCPU_SET_EVENTFD 
> (where -1 "unregisters" it).
> 
> Does this sound like a reasonable approach?  If so, how does the usermode app 
> actually open the eventfd today?  Is there a new glibc that I need to get the 
> new system call?  Or can the app use open() somehow?  If open(), what is the 
> path that should be specified?
> 
> Conversely, if my first approach was the right one how do I invoke the 
> sys_eventfd()?  Is there a way to invoke system calls in kernel mode?  A 
> better way?
> 
> Any advice appreciated.

The eventfd syscall is defined in include/linux/syscalls.h
From userspace, till glibc aligns:

#include <sys/syscall.h>

#ifndef __NR_eventfd
#if defined(__x86_64__)
#define __NR_eventfd 283
#elif defined(__i386__)
#define __NR_eventfd 323
#else
#error Cannot detect your architecture!
#endif
#endif

static int eventfd(int count) {

return syscall(__NR_eventfd, count);
}


If the kernel side receives an fd from userspace, it must use:

file = eventfd_fget(fd);
if (IS_ERR(file))

eventfd_signal(file, 1);
fput(file);



- Davide



-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [PATCH 05/10] KVM: Adds ability to signal userspace using a file-descriptor

2007-05-14 Thread Gregory Haskins
>>> On Mon, May 14, 2007 at  3:12 PM, in message
<[EMAIL PROTECTED]>, Davide Libenzi
<[EMAIL PROTECTED]> wrote: 
> On Mon, 14 May 2007, Gregory Haskins wrote:
> 
>> >>> On Mon, May 14, 2007 at  1:23 PM, in message <[EMAIL PROTECTED]>,
>> Avi Kivity <[EMAIL PROTECTED]> wrote: 
>> >
>> >> Do you close the signaled fd after receiving the signal/event? If you 
>> >> don't close it, eventfd will always return ready (POLLIN).
>> >>   
>> > 
>> > We don't.  Anyway, that's what we thought.  Thanks for the confirmation.
>> > 
>> 
>> And plus I just finished converting to Davide's eventfd, so its moot ;)
>> 
>> On that topic, I could use some advice:
>> 
>> I was originally planning on adding a new ioctl like KVM_VCPU_CREATE_EVENTFD 
> which would allocate a new eventfd and return it.  However, I soon realized 
> that the only method to create an eventfd is sys_eventfd(), which is not 
> exported by the eventfd.h headerfile (presumably this must be a new system 
> call).
>> 
>> So based on that, I figured I would change the model so that the usermode 
> app should call the eventfd open() call on its own, and then they could 
> register the fd with me.  So KVM_VCPU_CREATE_EVENTFD becomes 
> KVM_VCPU_SET_EVENTFD (where - 1 "unregisters" it).
>> 
>> Does this sound like a reasonable approach?  If so, how does the usermode 
> app actually open the eventfd today?  Is there a new glibc that I need to get 
> the new system call?  Or can the app use open() somehow?  If open(), what is 
> the path that should be specified?
>> 
>> Conversely, if my first approach was the right one how do I invoke the 
> sys_eventfd()?  Is there a way to invoke system calls in kernel mode?  A 
> better way?
>> 
>> Any advice appreciated.
> 
> The eventfd syscall is defined in include/linux/syscalls.h
> From userspace, till glibc aligns:
> 
> #include 
> 
> #ifndef __NR_eventfd
> #if defined(__x86_64__)
> #define __NR_eventfd 283
> #elif defined(__i386__)
> #define __NR_eventfd 323
> #else
> #error Cannot detect your architecture!
> #endif
> #endif
> 
> static int eventfd(int count) {
> 
> return syscall(__NR_eventfd, count);
> }
> 
> 
> If the kernel side receives an fd from userspace, it must use:
> 
> file = eventfd_fget(fd);
> if (IS_ERR(file))
> 
> eventfd_signal(file, 1);
> fput(file);
> 
> 
> 
> -  Davide

Thanks Davide,
   That is very helpful.  Is there any reason why we can't export 
eventfd_signal() and eventfd_fget() to modules?

-Greg

-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [PATCH 05/10] KVM: Adds ability to signal userspace using a file-descriptor

2007-05-14 Thread Davide Libenzi
On Mon, 14 May 2007, Gregory Haskins wrote:

> Thanks Davide,
>  That is very helpful.  Is there any reason why we can't export 
>  eventfd_signal() and eventfd_fget() to modules?

I'll push a patch for that.



- Davide



-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [PATCH/RFC 5/9] s390 virtual console for guests

2007-05-14 Thread Arnd Bergmann
On Monday 14 May 2007, Anthony Liguori wrote:
> It seems like request_irq is roughly the same as 
> register_external_interrupt.  I suspect that you could get away with 
> either patching hvc_console to use register_external_interrupt if 
> CONFIG_S390 or perhaps providing a common interface.
> 
> I suspect that this is going to come up again for sharing other paravirt 
> drivers.

request_irq() is not a nice interface for s390, but it will probably make
sense to convert the two existing users of register_external_interrupt to
use that instead, in order to get something that can be shared across
architectures for virtual drivers.

It basically means extending struct ext_int_info_t to include a name and
a void* member that gets passed back to the interrupt handler, and to check
for invalid flags passed to request_irq.

You might want to show these in /proc/interrupts then as well,
as per-interrupt values.

Arnd <><

-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


[kvm-devel] [PATCH 0/9] in-kernel APIC v4 (kernel side)

2007-05-14 Thread Gregory Haskins
This release incorporates v3, plus the following:

1) Avi's review comments have been added
2) Minor cleanup to the interrupt handling code
3) Conversion to eventfd userspace signaling mechanism

Has been tested on 

A) 32 bit Windows XP w/ACPI
B) 64 bit Linux (2.6.16 based)

Userspace will follow after this.

Regards,
-Greg

-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


[kvm-devel] [PATCH 1/9] KVM: Adds support for in-kernel mmio handlers

2007-05-14 Thread Gregory Haskins
Signed-off-by: Gregory Haskins <[EMAIL PROTECTED]>
---

 drivers/kvm/kvm.h  |   60 +++
 drivers/kvm/kvm_main.c |   94 ++--
 2 files changed, 142 insertions(+), 12 deletions(-)

diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h
index 1bbafba..e32f63a 100644
--- a/drivers/kvm/kvm.h
+++ b/drivers/kvm/kvm.h
@@ -256,6 +256,65 @@ struct kvm_stat {
u32 light_exits;
 };
 
+struct kvm_io_device {
+   void (*read)(struct kvm_io_device *this,
+gpa_t addr,
+int len,
+void *val);
+   void (*write)(struct kvm_io_device *this,
+ gpa_t addr,
+ int len,
+ const void *val);
+   int (*in_range)(struct kvm_io_device *this, gpa_t addr);
+   void (*destructor)(struct kvm_io_device *this);
+
+   void *private;
+};
+
+static inline void kvm_iodevice_read(struct kvm_io_device *dev,
+gpa_t addr,
+int len,
+void *val)
+{
+   dev->read(dev, addr, len, val);
+}
+
+static inline void kvm_iodevice_write(struct kvm_io_device *dev,
+ gpa_t addr,
+ int len,
+ const void *val)
+{
+   dev->write(dev, addr, len, val);
+}
+
+static inline int kvm_iodevice_inrange(struct kvm_io_device *dev, gpa_t addr)
+{
+   return dev->in_range(dev, addr);
+}
+
+static inline void kvm_iodevice_destructor(struct kvm_io_device *dev)
+{
+   dev->destructor(dev);
+}
+
+/*
+ * It would be nice to use something smarter than a linear search, TBD...
+ * Thankfully we dont expect many devices to register (famous last words :),
+ * so until then it will suffice.  At least its abstracted so we can change
+ * in one place.
+ */
+struct kvm_io_bus {
+   int   dev_count;
+#define NR_IOBUS_DEVS 6
+   struct kvm_io_device *devs[NR_IOBUS_DEVS];
+};
+
+void kvm_io_bus_init(struct kvm_io_bus *bus);
+void kvm_io_bus_destroy(struct kvm_io_bus *bus);
+struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr);
+void kvm_io_bus_register_dev(struct kvm_io_bus *bus,
+struct kvm_io_device *dev);
+
 struct kvm_vcpu {
struct kvm *kvm;
union {
@@ -375,6 +434,7 @@ struct kvm {
unsigned long rmap_overflow;
struct list_head vm_list;
struct file *filp;
+   struct kvm_io_bus mmio_bus;
 };
 
 struct descriptor_table {
diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c
index 1288cff..6ff30a2 100644
--- a/drivers/kvm/kvm_main.c
+++ b/drivers/kvm/kvm_main.c
@@ -317,6 +317,7 @@ static struct kvm *kvm_create_vm(void)
 
spin_lock_init(&kvm->lock);
INIT_LIST_HEAD(&kvm->active_mmu_pages);
+   kvm_io_bus_init(&kvm->mmio_bus);
for (i = 0; i < KVM_MAX_VCPUS; ++i) {
struct kvm_vcpu *vcpu = &kvm->vcpus[i];
 
@@ -414,6 +415,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
spin_lock(&kvm_lock);
list_del(&kvm->vm_list);
spin_unlock(&kvm_lock);
+   kvm_io_bus_destroy(&kvm->mmio_bus);
kvm_free_vcpus(kvm);
kvm_free_physmem(kvm);
kfree(kvm);
@@ -1037,12 +1039,25 @@ static int emulator_write_std(unsigned long addr,
return X86EMUL_UNHANDLEABLE;
 }
 
+static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
+   gpa_t addr)
+{
+   /*
+* Note that its important to have this wrapper function because
+* in the very near future we will be checking for MMIOs against
+* the LAPIC as well as the general MMIO bus
+*/
+   return kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr);
+}
+
 static int emulator_read_emulated(unsigned long addr,
  void *val,
  unsigned int bytes,
  struct x86_emulate_ctxt *ctxt)
 {
-   struct kvm_vcpu *vcpu = ctxt->vcpu;
+   struct kvm_vcpu  *vcpu = ctxt->vcpu;
+   struct kvm_io_device *mmio_dev;
+   gpa_t gpa;
 
if (vcpu->mmio_read_completed) {
memcpy(val, vcpu->mmio_data, bytes);
@@ -1051,18 +1066,26 @@ static int emulator_read_emulated(unsigned long addr,
} else if (emulator_read_std(addr, val, bytes, ctxt)
   == X86EMUL_CONTINUE)
return X86EMUL_CONTINUE;
-   else {
-   gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
 
-   if (gpa == UNMAPPED_GVA)
-   return X86EMUL_PROPAGATE_FAULT;
-   vcpu->mmio_needed = 1;
-   vcpu->mmio_phys_addr = gpa;
-   vcpu->mmio_size = bytes;
-   vcpu->mmio_is_write = 0;
+   gpa = vcpu->mmu.gva_to_gpa(vc

[kvm-devel] [PATCH 2/9] KVM: VMX - fix interrupt checking on light-exit

2007-05-14 Thread Gregory Haskins
Signed-off-by: Gregory Haskins <[EMAIL PROTECTED]>
---

 drivers/kvm/vmx.c |6 +++---
 1 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c
index 804a623..e696d02 100644
--- a/drivers/kvm/vmx.c
+++ b/drivers/kvm/vmx.c
@@ -1934,13 +1934,13 @@ static int vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
int r;
 
 preempted:
-   if (!vcpu->mmio_read_completed)
-   do_interrupt_requests(vcpu, kvm_run);
-
if (vcpu->guest_debug.enabled)
kvm_guest_debug_pre(vcpu);
 
 again:
+   if (!vcpu->mmio_read_completed)
+   do_interrupt_requests(vcpu, kvm_run);
+
vmx_save_host_state(vcpu);
kvm_load_guest_fpu(vcpu);
 


-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


[kvm-devel] [PATCH 3/9] KVM: Add irqdevice object

2007-05-14 Thread Gregory Haskins
The current code is geared towards using a user-mode (A)PIC.  This patch adds
an "irqdevice" abstraction, and implements a "userint" model to handle the
duties of the original code.  Later, we can develop other irqdevice models
to handle objects like LAPIC, IOAPIC, i8259, etc., as appropriate.

Signed-off-by: Gregory Haskins <[EMAIL PROTECTED]>
---

 drivers/kvm/Makefile|2 
 drivers/kvm/irqdevice.h |  176 +
 drivers/kvm/kvm.h   |  107 ++-
 drivers/kvm/kvm_main.c  |   58 +---
 drivers/kvm/svm.c   |  162 --
 drivers/kvm/userint.c   |  223 +++
 drivers/kvm/vmx.c   |  161 +-
 7 files changed, 782 insertions(+), 107 deletions(-)

diff --git a/drivers/kvm/Makefile b/drivers/kvm/Makefile
index c0a789f..540afbc 100644
--- a/drivers/kvm/Makefile
+++ b/drivers/kvm/Makefile
@@ -2,7 +2,7 @@
 # Makefile for Kernel-based Virtual Machine module
 #
 
-kvm-objs := kvm_main.o mmu.o x86_emulate.o
+kvm-objs := kvm_main.o mmu.o x86_emulate.o userint.o
 obj-$(CONFIG_KVM) += kvm.o
 kvm-intel-objs = vmx.o
 obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
diff --git a/drivers/kvm/irqdevice.h b/drivers/kvm/irqdevice.h
new file mode 100644
index 000..097d179
--- /dev/null
+++ b/drivers/kvm/irqdevice.h
@@ -0,0 +1,176 @@
+/*
+ * Defines an interface for an abstract interrupt controller.  The model
+ * consists of a unit with an arbitrary number of input lines N (IRQ0-(N-1)),
+ * an arbitrary number of output lines (INTR) (LINT, EXTINT, NMI, etc), and
+ * methods for completing an interrupt-acknowledge cycle (INTA).  A particular
+ * implementation of this model will define various policies, such as
+ * irq-to-vector translation, INTA/auto-EOI policy, etc.
+ *
+ * In addition, the INTR callback mechanism allows the unit to be "wired" to
+ * an interruptible source in a very flexible manner. For instance, an
+ * irqdevice could have its INTR wired to a VCPU (ala LAPIC), or another
+ * interrupt controller (ala cascaded i8259s)
+ *
+ * Copyright (C) 2007 Novell
+ *
+ * Authors:
+ *   Gregory Haskins <[EMAIL PROTECTED]>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef __IRQDEVICE_H
+#define __IRQDEVICE_H
+
+struct kvm_irqdevice;
+
+typedef enum {
+   kvm_irqpin_localint,
+   kvm_irqpin_extint,
+   kvm_irqpin_smi,
+   kvm_irqpin_nmi,
+   kvm_irqpin_invalid, /* must always be last */
+} kvm_irqpin_t;
+
+
+struct kvm_irqsink {
+   void (*set_intr)(struct kvm_irqsink *this,
+struct kvm_irqdevice *dev,
+kvm_irqpin_t pin);
+
+   void *private;
+};
+
+#define KVM_IRQACKDATA_VECTOR_VALID   (1 << 0)
+#define KVM_IRQACKDATA_VECTOR_PENDING (1 << 1)
+
+#define KVM_IRQACK_FLAG_PEEK  (1 << 0)
+
+struct kvm_irqack_data {
+   int flags;
+   int vector;
+};
+
+struct kvm_irqdevice {
+   int  (*ack)(struct kvm_irqdevice *this, int flags,
+   struct kvm_irqack_data *data);
+   int  (*set_pin)(struct kvm_irqdevice *this, int pin, int level);
+   void (*destructor)(struct kvm_irqdevice *this);
+
+   void   *private;
+   struct kvm_irqsink  sink;
+};
+
+/**
+ * kvm_irqdevice_init - initialize the kvm_irqdevice for use
+ * @dev: The device
+ *
+ * Description: Initialize the kvm_irqdevice for use.  Should be called before
+ *  calling any derived implementation init functions
+ *
+ * Returns: (void)
+ */
+static inline void kvm_irqdevice_init(struct kvm_irqdevice *dev)
+{
+   memset(dev, 0, sizeof(*dev));
+}
+
+/**
+ * kvm_irqdevice_ack - read and ack the highest priority vector from the device
+ * @dev: The device
+ * @flags: Modifies default behavior
+ *   [ KVM_IRQACK_FLAG_PEEK - Don't ack vector, just check status ]
+ * @data: A pointer to a kvm_irqack_data structure to hold the result
+ *
+ * Description: Read the highest priority pending vector from the device,
+ *  potentially invoking auto-EOI depending on device policy
+ *
+ *  Successful return indicates that the *data* structure is valid
+ *
+ *   data.flags -
+ *  [KVM_IRQACKDATA_VECTOR_VALID - data.vector is valid]
+ *  [KVM_IRQACKDATA_VECTOR_PENDING - more vectors are pending]
+ *
+ * Returns: (int)
+ *   [-1 = failure]
+ *   [ 0 = success]
+ */
+static inline int kvm_irqdevice_ack(struct kvm_irqdevice *dev, int flags,
+   struct kvm_irqack_data *data)
+{
+   return dev->ack(dev, flags, data);
+}
+
+/**
+ * kvm_irqdevice_set_pin - allows the caller to assert/deassert an IRQ
+ * @dev: The device
+ * @pin: The input pin to alter
+ * @level: The value to set (1 = assert, 0 = deassert)
+ *
+ * Description: Allows the caller to assert/deassert an IRQ input pin to 

[kvm-devel] [PATCH 4/9] KVM: Adds ability to preempt an executing VCPU

2007-05-14 Thread Gregory Haskins
The VCPU executes synchronously w.r.t. userspace today, and therefore
interrupt injection is pretty straightforward.  However, we will soon need
to be able to inject interrupts asynchronous to the execution of the VCPU
due to the introduction of SMP, paravirtualized drivers, and asynchronous
hypercalls.  This patch adds support to the interrupt mechanism to force
a VCPU to VMEXIT when a new interrupt is pending.

Signed-off-by: Gregory Haskins <[EMAIL PROTECTED]>
---

 drivers/kvm/kvm.h  |2 ++
 drivers/kvm/kvm_main.c |   64 +---
 drivers/kvm/svm.c  |   50 +++---
 drivers/kvm/vmx.c  |   36 +++
 4 files changed, 145 insertions(+), 7 deletions(-)

diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h
index 41e4eaa..7b5d5e6 100644
--- a/drivers/kvm/kvm.h
+++ b/drivers/kvm/kvm.h
@@ -331,6 +331,8 @@ struct kvm_vcpu_irq {
struct kvm_irqdevice dev;
int  pending;
int  deferred;
+   struct task_struct  *task;
+   int  guest_mode;
 };
 
 struct kvm_vcpu {
diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c
index a2e1e50..cb73763 100644
--- a/drivers/kvm/kvm_main.c
+++ b/drivers/kvm/kvm_main.c
@@ -1891,6 +1891,9 @@ static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, 
struct kvm_run *kvm_run)
kvm_arch_ops->decache_regs(vcpu);
}
 
+   vcpu->irq.task = current;
+   smp_wmb();
+
r = kvm_arch_ops->run(vcpu, kvm_run);
 
 out:
@@ -2332,6 +2335,20 @@ out1:
 }
 
 /*
+ * This function is invoked whenever we want to interrupt a vcpu that is
+ * currently executing in guest-mode.  It currently is a no-op because
+ * the simple delivery of the IPI to execute this function accomplishes our
+ * goal: To cause a VMEXIT.  We pass the vcpu (which contains the
+ * vcpu->irq.task, etc) for future use
+ */
+static void kvm_vcpu_guest_intr(void *info)
+{
+#ifdef NOT_YET
+   struct kvm_vcpu *vcpu = (struct kvm_vcpu*)info;
+#endif
+}
+
+/*
  * This function will be invoked whenever the vcpu->irq.dev raises its INTR
  * line
  */
@@ -2340,11 +2357,50 @@ static void kvm_vcpu_intr(struct kvm_irqsink *this,
  kvm_irqpin_t pin)
 {
struct kvm_vcpu *vcpu = (struct kvm_vcpu*)this->private;
-   unsigned long flags;
+   int direct_ipi = -1;
+
+   spin_lock_irq(&vcpu->irq.lock);
+
+   if (!test_bit(pin, &vcpu->irq.pending)) {
+   /*
+* Record the change..
+*/
+   __set_bit(pin, &vcpu->irq.pending);
 
-   spin_lock_irqsave(&vcpu->irq.lock, flags);
-   __set_bit(pin, &vcpu->irq.pending);
-   spin_unlock_irqrestore(&vcpu->irq.lock, flags);
+   /*
+* then wake up the vcpu (if necessary)
+*/
+   if (vcpu->irq.task && (vcpu->irq.task != current)) {
+   if (vcpu->irq.guest_mode) {
+   /*
+* If we are in guest mode, we can optimize
+* the IPI by executing a function directly
+* on the owning processor.
+*/
+   direct_ipi = task_cpu(vcpu->irq.task);
+   BUG_ON(direct_ipi == smp_processor_id());
+   }
+   }
+   }
+
+   spin_unlock_irq(&vcpu->irq.lock);
+
+   /*
+* we can safely send the IPI outside of the lock-scope because the
+* irq.pending has already been updated.  This code assumes that
+* userspace will not sleep on anything other than HLT instructions.
+* HLT is covered in a race-free way because irq.pending was updated
+* in the critical section, and handle_halt() checks if any
+* interrupts are pending before returning to userspace.
+*
+* If it turns out that userspace can sleep on conditions other than
+* HLT, this code will need to be enhanced to allow the irq.pending
+* flags to be exported to userspace
+*/
+   if (direct_ipi != -1)
+   smp_call_function_single(direct_ipi,
+kvm_vcpu_guest_intr,
+vcpu, 0, 0);
 }
 
 static void kvm_vcpu_irqsink_init(struct kvm_vcpu *vcpu)
diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c
index dd0a149..ab40d93 100644
--- a/drivers/kvm/svm.c
+++ b/drivers/kvm/svm.c
@@ -1544,9 +1544,41 @@ static int svm_vcpu_run(struct kvm_vcpu *vcpu, struct 
kvm_run *kvm_run)
int r;
 
 again:
+   /*
+* We disable interrupts until the next VMEXIT to eliminate a race
+* condition for delivery of virtual interrupts.  Note that this is
+* probably not as bad as it sounds, as interrupts will still invoke
+* a VMEXIT once transiti

[kvm-devel] [PATCH 5/9] KVM: Adds ability to signal userspace using a file-descriptor

2007-05-14 Thread Gregory Haskins
Signed-off-by: Gregory Haskins <[EMAIL PROTECTED]>
---

 drivers/kvm/kvm.h  |1 +
 drivers/kvm/kvm_main.c |   52 ++--
 include/linux/kvm.h|1 +
 3 files changed, 48 insertions(+), 6 deletions(-)

diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h
index 7b5d5e6..f5731c4 100644
--- a/drivers/kvm/kvm.h
+++ b/drivers/kvm/kvm.h
@@ -333,6 +333,7 @@ struct kvm_vcpu_irq {
int  deferred;
struct task_struct  *task;
int  guest_mode;
+   int  eventfd;
 };
 
 struct kvm_vcpu {
diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c
index cb73763..86e4262 100644
--- a/drivers/kvm/kvm_main.c
+++ b/drivers/kvm/kvm_main.c
@@ -40,6 +40,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "x86_emulate.h"
 #include "segment_descriptor.h"
@@ -326,6 +327,7 @@ static struct kvm *kvm_create_vm(void)
memset(&vcpu->irq, 0, sizeof(vcpu->irq));
spin_lock_init(&vcpu->irq.lock);
vcpu->irq.deferred = -1;
+   vcpu->irq.eventfd   = -1;
 
vcpu->cpu = -1;
vcpu->kvm = kvm;
@@ -2358,6 +2360,7 @@ static void kvm_vcpu_intr(struct kvm_irqsink *this,
 {
struct kvm_vcpu *vcpu = (struct kvm_vcpu*)this->private;
int direct_ipi = -1;
+   int eventfd = -1;
 
spin_lock_irq(&vcpu->irq.lock);
 
@@ -2379,7 +2382,14 @@ static void kvm_vcpu_intr(struct kvm_irqsink *this,
 */
direct_ipi = task_cpu(vcpu->irq.task);
BUG_ON(direct_ipi == smp_processor_id());
-   }
+   } else
+   /*
+* otherwise, we must assume that we could be
+* blocked anywhere, including userspace. Send
+* a signal to give everyone a chance to get
+* notification
+*/
+   eventfd = vcpu->irq.eventfd;
}
}
 
@@ -2401,6 +2411,12 @@ static void kvm_vcpu_intr(struct kvm_irqsink *this,
smp_call_function_single(direct_ipi,
 kvm_vcpu_guest_intr,
 vcpu, 0, 0);
+
+   if (eventfd != -1) {
+   struct file *filp = eventfd_fget(eventfd);
+   if (!IS_ERR(filp))
+   eventfd_signal(filp, 1);
+   }
 }
 
 static void kvm_vcpu_irqsink_init(struct kvm_vcpu *vcpu)
@@ -2584,6 +2600,14 @@ static int kvm_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, 
struct kvm_fpu *fpu)
return 0;
 }
 
+static int kvm_vcpu_ioctl_set_eventfd(struct kvm_vcpu *vcpu, int fd)
+{
+   vcpu->irq.eventfd = fd;
+   smp_wmb();
+
+   return 0;
+}
+
 static long kvm_vcpu_ioctl(struct file *filp,
   unsigned int ioctl, unsigned long arg)
 {
@@ -2753,6 +2777,15 @@ static long kvm_vcpu_ioctl(struct file *filp,
r = 0;
break;
}
+   case KVM_SET_EVENTFD: {
+   int eventfd = (long)argp;
+
+   r = kvm_vcpu_ioctl_set_eventfd(vcpu, eventfd);
+   if (r)
+   goto out;
+   r = 0;
+   break;
+   }
default:
;
}
@@ -2937,12 +2970,19 @@ static long kvm_dev_ioctl(struct file *filp,
r = 0;
break;
}
-   case KVM_CHECK_EXTENSION:
-   /*
-* No extensions defined at present.
-*/
-   r = 0;
+   case KVM_CHECK_EXTENSION: {
+   int ext = (long)argp;
+
+   switch (ext) {
+   case KVM_SET_EVENTFD:
+   r = 1;
+   break;
+   default:
+   r = 0;
+   break;
+   }
break;
+   }
case KVM_GET_VCPU_MMAP_SIZE:
r = -EINVAL;
if (arg)
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index e6edca8..f13ec8c 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -300,5 +300,6 @@ struct kvm_signal_mask {
 #define KVM_SET_SIGNAL_MASK   _IOW(KVMIO,  0x8b, struct kvm_signal_mask)
 #define KVM_GET_FPU   _IOR(KVMIO,  0x8c, struct kvm_fpu)
 #define KVM_SET_FPU   _IOW(KVMIO,  0x8d, struct kvm_fpu)
+#define KVM_SET_EVENTFD   _IO(KVMIO,   0x8e)
 
 #endif


-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/

[kvm-devel] [PATCH 6/9] KVM: Add support for in-kernel LAPIC model

2007-05-14 Thread Gregory Haskins
Signed-off-by: Gregory Haskins <[EMAIL PROTECTED]>
---

 drivers/kvm/Makefile   |2 
 drivers/kvm/kernint.c  |  149 +
 drivers/kvm/kvm.h  |   35 +
 drivers/kvm/kvm_main.c |  182 ++
 drivers/kvm/lapic.c| 1421 
 drivers/kvm/svm.c  |   13 
 drivers/kvm/userint.c  |8 
 drivers/kvm/vmx.c  |   16 -
 include/linux/kvm.h|   15 +
 9 files changed, 1801 insertions(+), 40 deletions(-)

diff --git a/drivers/kvm/Makefile b/drivers/kvm/Makefile
index 540afbc..1aad737 100644
--- a/drivers/kvm/Makefile
+++ b/drivers/kvm/Makefile
@@ -2,7 +2,7 @@
 # Makefile for Kernel-based Virtual Machine module
 #
 
-kvm-objs := kvm_main.o mmu.o x86_emulate.o userint.o
+kvm-objs := kvm_main.o mmu.o x86_emulate.o userint.o lapic.o kernint.o
 obj-$(CONFIG_KVM) += kvm.o
 kvm-intel-objs = vmx.o
 obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
diff --git a/drivers/kvm/kernint.c b/drivers/kvm/kernint.c
new file mode 100644
index 000..b5cbcae
--- /dev/null
+++ b/drivers/kvm/kernint.c
@@ -0,0 +1,149 @@
+/*
+ * Kernel Interrupt IRQ device
+ *
+ * Provides a model for connecting in-kernel interrupt resources to a VCPU.
+ *
+ * A typical modern x86 processor has the concept of an internal Local-APIC
+ * and some external signal pins.  The way in which interrupts are injected is
+ * dependent on whether software enables the LAPIC or not.  When enabled,
+ * interrupts are acknowledged through the LAPIC.  Otherwise they are through
+ * an externally connected PIC (typically an i8259 on the BSP)
+ *
+ * Copyright (C) 2007 Novell
+ *
+ * Authors:
+ *   Gregory Haskins <[EMAIL PROTECTED]>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include "kvm.h"
+
+struct kvm_kernint {
+   struct kvm_vcpu  *vcpu;
+   struct kvm_irqdevice *self_irq;
+   struct kvm_irqdevice *ext_irq;
+   struct kvm_irqdevice  apic_irq;
+
+};
+
+static struct kvm_irqdevice *get_irq_dev(struct kvm_kernint *s)
+{
+   struct kvm_irqdevice *dev;
+
+   if (kvm_lapic_enabled(s->vcpu))
+   dev = &s->apic_irq;
+   else
+   dev = s->ext_irq;
+
+   if (!dev)
+   kvm_crash_guest(s->vcpu->kvm);
+
+   return dev;
+}
+
+static int kernint_irqdev_ack(struct kvm_irqdevice *this, int flags,
+ struct kvm_irqack_data *data)
+{
+   struct kvm_kernint *s = (struct kvm_kernint*)this->private;
+
+   return kvm_irqdevice_ack(get_irq_dev(s), flags, data);
+}
+
+static int kernint_irqdev_set_pin(struct kvm_irqdevice *this,
+ int irq, int level)
+{
+   /* no-op */
+   return 0;
+}
+
+static void kernint_irqdev_destructor(struct kvm_irqdevice *this)
+{
+   struct kvm_kernint *s = (struct kvm_kernint*)this->private;
+
+   kvm_irqdevice_destructor(&s->apic_irq);
+   kvm_lapic_destroy(s->vcpu);
+   kfree(s);
+}
+
+static void kvm_apic_intr(struct kvm_irqsink *this,
+ struct kvm_irqdevice *dev,
+ kvm_irqpin_t pin)
+{
+   struct kvm_kernint *s = (struct kvm_kernint*)this->private;
+
+   /*
+* If the LAPIC sent us an interrupt it *must* be enabled,
+* just forward it on to the CPU
+*/
+   kvm_irqdevice_set_intr(s->self_irq, pin);
+}
+
+static void kvm_ext_intr(struct kvm_irqsink *this,
+struct kvm_irqdevice *dev,
+kvm_irqpin_t pin)
+{
+   struct kvm_kernint *s = (struct kvm_kernint*)this->private;
+
+   /*
+* If the EXTINT device sent us an interrupt, forward it to the LINT0
+* pin of the LAPIC
+*/
+   if (pin != kvm_irqpin_localint)
+   return;
+
+   /*
+* "irq 0" = LINT0, 1 = LINT1
+*/
+   kvm_irqdevice_set_pin(&s->apic_irq, 0, 1);
+}
+
+int kvm_kernint_init(struct kvm_vcpu *vcpu)
+{
+   struct kvm_irqdevice *irqdev = &vcpu->irq.dev;
+   struct kvm_kernint *s;
+   struct kvm_irqsink apicsink;
+
+   s = kzalloc(sizeof(*s), GFP_KERNEL);
+   if (!s)
+   return -ENOMEM;
+
+   s->vcpu = vcpu;
+
+   /*
+* Configure the irqdevice interface
+*/
+   irqdev->ack = kernint_irqdev_ack;
+   irqdev->set_pin = kernint_irqdev_set_pin;
+   irqdev->destructor  = kernint_irqdev_destructor;
+
+   irqdev->private = s;
+   s->self_irq = irqdev;
+
+   /*
+* Configure the EXTINT device if this is the BSP processor
+*/
+   if (!vcpu_slot(vcpu)) {
+   struct kvm_irqsink extsink = {
+   .set_intr   = kvm_ext_intr,
+   .private= s
+   };
+   s->ext_irq = &vcpu->kvm->isa_irq;
+   kvm_irqdevice_register_sink(s->ext_irq, &extsink);
+   }
+
+   /*
+* Configure the LAP

[kvm-devel] [PATCH 7/9] KVM: Adds support for real NMI injection on VMX processors

2007-05-14 Thread Gregory Haskins
Signed-off-by: Gregory Haskins <[EMAIL PROTECTED]>
---

 drivers/kvm/vmx.c |   57 -
 drivers/kvm/vmx.h |3 +++
 2 files changed, 55 insertions(+), 5 deletions(-)

diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c
index bfc7b2e..a3656b9 100644
--- a/drivers/kvm/vmx.c
+++ b/drivers/kvm/vmx.c
@@ -1261,7 +1261,14 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu)
   PIN_BASED_VM_EXEC_CONTROL,
   PIN_BASED_EXT_INTR_MASK   /* 20.6.1 */
   | PIN_BASED_NMI_EXITING   /* 20.6.1 */
+  | PIN_BASED_VIRTUAL_NMI   /* 20.6.1 */
);
+
+   if (!(vmcs_read32(PIN_BASED_VM_EXEC_CONTROL) & PIN_BASED_VIRTUAL_NMI))
+   printk(KERN_DEBUG "KVM: Warning - Host processor does " \
+  "not support virtual-NMI injection.  Using IRQ " \
+  "method\n");
+ 
vmcs_write32_fixedbits(MSR_IA32_VMX_PROCBASED_CTLS,
   CPU_BASED_VM_EXEC_CONTROL,
   CPU_BASED_HLT_EXITING /* 20.6.2 */
@@ -1411,6 +1418,37 @@ static void inject_rmode_irq(struct kvm_vcpu *vcpu, int 
irq)
vmcs_writel(GUEST_RSP, (vmcs_readl(GUEST_RSP) & ~0x) | (sp - 6));
 }
 
+static void do_nmi_requests(struct kvm_vcpu *vcpu)
+{
+   int nmi_window = 0;
+
+   BUG_ON(!(test_bit(kvm_irqpin_nmi, &vcpu->irq.pending)));
+
+   nmi_window =
+   (((vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 0xb) == 0)
+&& (vmcs_read32(VM_ENTRY_INTR_INFO_FIELD)
+& INTR_INFO_VALID_MASK));
+
+   if (nmi_window) {
+   if (vcpu->rmode.active)
+   inject_rmode_irq(vcpu, 2);
+   else
+   vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
+2 |
+INTR_TYPE_NMI |
+INTR_INFO_VALID_MASK);
+
+   __clear_bit(kvm_irqpin_nmi, &vcpu->irq.pending);
+   } else {
+   /*
+* NMIs blocked.  Wait for unblock.
+*/
+   u32 cbvec = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+   cbvec |= CPU_BASED_NMI_EXITING;
+   vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cbvec); 
+   }
+}
+
 static void do_intr_requests(struct kvm_vcpu *vcpu,
struct kvm_run *kvm_run,
kvm_irqpin_t pin)
@@ -1443,9 +1481,11 @@ static void do_intr_requests(struct kvm_vcpu *vcpu,
break;
case kvm_irqpin_nmi:
/*
-* FIXME: Someday we will handle this using the
-* specific VMX NMI features.  For now, just inject
-* the NMI as a standard interrupt on vector 2
+* We should only get here if the processor does
+* not support virtual NMIs.  Inject the NMI as a
+* standard interrupt on vector 2.  The implication is
+* that NMIs are going to be subject to RFLAGS.IF
+* masking, unfortunately.
 */
ack.flags |= KVM_IRQACKDATA_VECTOR_VALID;
ack.vector = 2;
@@ -1488,7 +1528,8 @@ static void do_intr_requests(struct kvm_vcpu *vcpu,
 static void clear_pending_controls(struct kvm_vcpu *vcpu)
 {
u32 cbvec = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
-   cbvec &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
+   cbvec &= ~(CPU_BASED_VIRTUAL_INTR_PENDING
+  | CPU_BASED_NMI_EXITING);
vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cbvec);
 }
 
@@ -1505,7 +1546,6 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
switch (pin) {
case kvm_irqpin_localint:
case kvm_irqpin_extint:
-   case kvm_irqpin_nmi:
do_intr_requests(vcpu, kvm_run, pin);
break;
case kvm_irqpin_smi:
@@ -1513,6 +1553,13 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
printk(KERN_WARNING "KVM: dropping unhandled SMI\n");
__clear_bit(pin, &vcpu->irq.pending);
break;
+   case kvm_irqpin_nmi:
+   if (vmcs_read32(PIN_BASED_VM_EXEC_CONTROL)
+   & PIN_BASED_VIRTUAL_NMI)
+   do_nmi_requests(vcpu);
+   else
+   do_intr_requests(vcpu, kvm_run, pin);   
+   break;
case kvm_irqpin_invalid:
/* drop */
break;
diff --git a/drivers/kvm/vmx.h b/drivers/kvm/vmx.h
index d0dc93d..d3fe017 100644
--- a/

[kvm-devel] [PATCH 8/9] KVM: Adds basic plumbing to support TPR shadow features

2007-05-14 Thread Gregory Haskins
Signed-off-by: Gregory Haskins <[EMAIL PROTECTED]>
---

 drivers/kvm/irqdevice.h |3 +++
 drivers/kvm/kvm.h   |1 +
 drivers/kvm/lapic.c |   15 +++
 3 files changed, 19 insertions(+), 0 deletions(-)

diff --git a/drivers/kvm/irqdevice.h b/drivers/kvm/irqdevice.h
index 097d179..173313d 100644
--- a/drivers/kvm/irqdevice.h
+++ b/drivers/kvm/irqdevice.h
@@ -45,12 +45,14 @@ struct kvm_irqsink {
 
 #define KVM_IRQACKDATA_VECTOR_VALID   (1 << 0)
 #define KVM_IRQACKDATA_VECTOR_PENDING (1 << 1)
+#define KVM_IRQACKDATA_NEXT_VALID (1 << 2)
 
 #define KVM_IRQACK_FLAG_PEEK  (1 << 0)
 
 struct kvm_irqack_data {
int flags;
int vector;
+   int next;
 };
 
 struct kvm_irqdevice {
@@ -92,6 +94,7 @@ static inline void kvm_irqdevice_init(struct kvm_irqdevice 
*dev)
  *   data.flags -
  *  [KVM_IRQACKDATA_VECTOR_VALID - data.vector is valid]
  *  [KVM_IRQACKDATA_VECTOR_PENDING - more vectors are pending]
+ *  [KVM_IRQACKDATA_NEXT_VALID - next-vector is valid]
  *
  * Returns: (int)
  *   [-1 = failure]
diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h
index 9d13c45..49ce8c2 100644
--- a/drivers/kvm/kvm.h
+++ b/drivers/kvm/kvm.h
@@ -180,6 +180,7 @@ void kvm_lapic_save(struct kvm_vcpu *vcpu, struct kvm_sregs 
*sregs);
 void kvm_lapic_restore(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
 void kvm_lapic_reset(struct kvm_vcpu *vcpu);
 int  kvm_lapic_enabled(struct kvm_vcpu *vcpu);
+void *kvm_lapic_get_regs(struct kvm_vcpu *vcpu);
 
 /*
  * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level
diff --git a/drivers/kvm/lapic.c b/drivers/kvm/lapic.c
index d62f56f..ec7cb41 100644
--- a/drivers/kvm/lapic.c
+++ b/drivers/kvm/lapic.c
@@ -1149,6 +1149,13 @@ int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
return ret;
 }
 
+void *kvm_lapic_get_regs(struct kvm_vcpu *vcpu)
+{
+   struct kvm_kern_apic *apic = (struct kvm_kern_apic*)vcpu->apic.dev;
+   return apic->regs;
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_get_regs);
+
 /*
  *--
  * timer interface
@@ -1287,6 +1294,14 @@ static int apic_irqdev_ack(struct kvm_irqdevice *this, 
int flags,
 */
if ((irq & 0xf0) > apic_get_reg(apic, APIC_TASKPRI))
data->flags |= KVM_IRQACKDATA_VECTOR_PENDING;
+
+   /*
+* We report the next pending vector here so that the system
+* can assess TPR thresholds for TPR-shadowing purposes
+* (if applicable)
+*/
+   data->next   = irq;
+   data->flags |= KVM_IRQACKDATA_NEXT_VALID;
}
 
  out:


-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


[kvm-devel] [PATCH 9/9] KVM: Add statistics from interrupt subsystem

2007-05-14 Thread Gregory Haskins
Signed-off-by: Gregory Haskins <[EMAIL PROTECTED]>
---

 drivers/kvm/kvm.h  |5 +
 drivers/kvm/kvm_main.c |   17 +++--
 drivers/kvm/vmx.c  |2 ++
 3 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h
index 49ce8c2..b73dd3b 100644
--- a/drivers/kvm/kvm.h
+++ b/drivers/kvm/kvm.h
@@ -276,6 +276,11 @@ struct kvm_stat {
u32 request_irq_exits;
u32 irq_exits;
u32 light_exits;
+   u32 irq_posted;
+   u32 irq_accepted;
+   u32 guest_preempt;
+   u32 apic_mmio;
+   u32 local_mmio;
 };
 
 struct kvm_io_device {
diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c
index c7d0024..efadabd 100644
--- a/drivers/kvm/kvm_main.c
+++ b/drivers/kvm/kvm_main.c
@@ -73,6 +73,11 @@ static struct kvm_stats_debugfs_item {
{ "request_irq", STAT_OFFSET(request_irq_exits) },
{ "irq_exits", STAT_OFFSET(irq_exits) },
{ "light_exits", STAT_OFFSET(light_exits) },
+   { "irq_posted", STAT_OFFSET(irq_posted) },
+   { "irq_accepted", STAT_OFFSET(irq_accepted) },
+   { "guest_preempt", STAT_OFFSET(guest_preempt) },
+   { "apic_mmio", STAT_OFFSET(apic_mmio) },
+   { "local_mmio", STAT_OFFSET(local_mmio) },
{ NULL }
 };
 
@@ -1138,13 +1143,19 @@ static struct kvm_io_device *vcpu_find_mmio_dev(struct 
kvm_vcpu *vcpu,
/*
 * First check if the LAPIC will snarf this request
 */
-   if (dev && dev->in_range(dev, addr))
+   if (dev && dev->in_range(dev, addr)) {
+   ++vcpu->stat.apic_mmio;
return dev;
+   }
 
/*
 * And then fallback to allow any device to participate
 */
-   return kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr);
+   dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr);
+   if (dev)
+   ++vcpu->stat.local_mmio;
+
+   return dev;
 }
 
 static int emulator_read_emulated(unsigned long addr,
@@ -2459,6 +2470,7 @@ static void kvm_vcpu_intr(struct kvm_irqsink *this,
 * Record the change..
 */
__set_bit(pin, &vcpu->irq.pending);
+   ++vcpu->stat.irq_posted;
 
/*
 * then wake up the vcpu (if necessary)
@@ -2472,6 +2484,7 @@ static void kvm_vcpu_intr(struct kvm_irqsink *this,
 */
direct_ipi = task_cpu(vcpu->irq.task);
BUG_ON(direct_ipi == smp_processor_id());
+   ++vcpu->stat.guest_preempt;
} else
/*
 * otherwise, we must assume that we could be
diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c
index a3656b9..1b6c360 100644
--- a/drivers/kvm/vmx.c
+++ b/drivers/kvm/vmx.c
@@ -1506,6 +1506,8 @@ static void do_intr_requests(struct kvm_vcpu *vcpu,
 ack.vector |
 INTR_TYPE_EXT_INTR |
 INTR_INFO_VALID_MASK);
+
+   ++vcpu->stat.irq_accepted;
}
}
 


-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


[kvm-devel] [PATCH 0/5] in-kernel APIC v4 (usermode side)

2007-05-14 Thread Gregory Haskins
Userspace additions to coordinate with v4 kernel patches.  Note that the
eventfd is supported in the user-library, but QEMU doesn't use it yet.

Regards,
-Greg

-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


[kvm-devel] [PATCH 1/5] KVM: Updates for compiling in-kernel APIC support with external-modules

2007-05-14 Thread Gregory Haskins
Signed-off-by: Gregory Haskins <[EMAIL PROTECTED]>
---

 kernel/Kbuild |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/kernel/Kbuild b/kernel/Kbuild
index e9bcda7..103a179 100644
--- a/kernel/Kbuild
+++ b/kernel/Kbuild
@@ -1,5 +1,5 @@
 EXTRA_CFLAGS := -I$(src)/include -include $(src)/external-module-compat.h
 obj-m := kvm.o kvm-intel.o kvm-amd.o
-kvm-objs := kvm_main.o mmu.o x86_emulate.o
+kvm-objs := kvm_main.o mmu.o x86_emulate.o userint.o kernint.o lapic.o
 kvm-intel-objs := vmx.o vmx-debug.o
 kvm-amd-objs := svm.o


-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


[kvm-devel] [PATCH 2/5] KVM-USER: Make the kvm_allowed flag always defined so we dont need #ifdefs

2007-05-14 Thread Gregory Haskins
Non-performance critical code is made more awkward by having to always define
both "#ifdef KVM" and "if (kvm_allowed)".  Define "kvm_allowed = 0" by
default.  Anthony Liguori is credited with the idea.

Signed-off-by: Gregory Haskins <[EMAIL PROTECTED]>
---

 qemu/qemu-kvm.c |9 -
 1 files changed, 8 insertions(+), 1 deletions(-)

diff --git a/qemu/qemu-kvm.c b/qemu/qemu-kvm.c
index 212570a..d4419a3 100644
--- a/qemu/qemu-kvm.c
+++ b/qemu/qemu-kvm.c
@@ -3,6 +3,14 @@
 #include "config-host.h"
 
 #ifdef USE_KVM
+ #define KVM_ALLOWED_DEFAULT 1
+#else
+ #define KVM_ALLOWED_DEFAULT 0
+#endif
+
+int kvm_allowed = KVM_ALLOWED_DEFAULT;
+
+#ifdef USE_KVM
 
 #include "exec.h"
 
@@ -14,7 +22,6 @@
 
 extern void perror(const char *s);
 
-int kvm_allowed = 1;
 kvm_context_t kvm_context;
 static struct kvm_msr_list *kvm_msr_list;
 static int kvm_has_msr_star;


-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


[kvm-devel] [PATCH 3/5] KVM-USER: Add ability to specify APIC emulation type from the command-line

2007-05-14 Thread Gregory Haskins
Signed-off-by: Gregory Haskins <[EMAIL PROTECTED]>
---

 qemu/qemu-kvm.c |1 +
 qemu/vl.c   |5 +
 qemu/vl.h   |1 +
 3 files changed, 7 insertions(+), 0 deletions(-)

diff --git a/qemu/qemu-kvm.c b/qemu/qemu-kvm.c
index d4419a3..faa4684 100644
--- a/qemu/qemu-kvm.c
+++ b/qemu/qemu-kvm.c
@@ -9,6 +9,7 @@
 #endif
 
 int kvm_allowed = KVM_ALLOWED_DEFAULT;
+int kvm_apic_level = 1;
 
 #ifdef USE_KVM
 
diff --git a/qemu/vl.c b/qemu/vl.c
index 7df1c80..986cea4 100644
--- a/qemu/vl.c
+++ b/qemu/vl.c
@@ -6531,6 +6531,7 @@ enum {
 QEMU_OPTION_vnc,
 QEMU_OPTION_no_acpi,
 QEMU_OPTION_no_kvm,
+QEMU_OPTION_kvm_apic,
 QEMU_OPTION_no_reboot,
 QEMU_OPTION_daemonize,
 QEMU_OPTION_option_rom,
@@ -6600,6 +6601,7 @@ const QEMUOption qemu_options[] = {
 #endif
 #ifdef USE_KVM
 { "no-kvm", 0, QEMU_OPTION_no_kvm },
+{ "kvm_apic", HAS_ARG, QEMU_OPTION_kvm_apic },
 #endif
 #if defined(TARGET_PPC) || defined(TARGET_SPARC)
 { "g", 1, QEMU_OPTION_g },
@@ -7309,6 +7311,9 @@ int main(int argc, char **argv)
case QEMU_OPTION_no_kvm:
kvm_allowed = 0;
break;
+   case QEMU_OPTION_kvm_apic:
+   kvm_apic_level = atoi(optarg);
+   break;
 #endif
 case QEMU_OPTION_usb:
 usb_enabled = 1;
diff --git a/qemu/vl.h b/qemu/vl.h
index debd17c..dec410e 100644
--- a/qemu/vl.h
+++ b/qemu/vl.h
@@ -158,6 +158,7 @@ extern int graphic_depth;
 extern const char *keyboard_layout;
 extern int kqemu_allowed;
 extern int kvm_allowed;
+extern int kvm_apic_level;
 extern int win2k_install_hack;
 extern int usb_enabled;
 extern int smp_cpus;


-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


[kvm-devel] [PATCH 4/5] KVM: in-kernel-apic modification to QEMU

2007-05-14 Thread Gregory Haskins
Signed-off-by: Gregory Haskins <[EMAIL PROTECTED]>
---

 qemu/hw/apic.c  |   20 +++-
 qemu/hw/pc.c|   29 -
 qemu/qemu-kvm.c |   49 +++--
 qemu/qemu-kvm.h |2 ++
 qemu/vl.h   |7 ++-
 user/kvmctl.c   |   33 -
 user/kvmctl.h   |   31 ++-
 user/main.c |2 +-
 8 files changed, 141 insertions(+), 32 deletions(-)

diff --git a/qemu/hw/apic.c b/qemu/hw/apic.c
index 0b73233..5665057 100644
--- a/qemu/hw/apic.c
+++ b/qemu/hw/apic.c
@@ -18,6 +18,7 @@
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 #include "vl.h"
+#include "qemu-kvm.h"
 
 //#define DEBUG_APIC
 //#define DEBUG_IOAPIC
@@ -87,6 +88,7 @@ typedef struct APICState {
 } APICState;
 
 struct IOAPICState {
+CPUState *cpu_env;
 uint8_t id;
 uint8_t ioregsel;
 
@@ -888,10 +890,17 @@ static void ioapic_service(IOAPICState *s)
 vector = pic_read_irq(isa_pic);
 else
 vector = entry & 0xff;
-
-apic_get_delivery_bitmask(deliver_bitmask, dest, dest_mode);
-apic_bus_deliver(deliver_bitmask, delivery_mode, 
- vector, polarity, trig_mode);
+ 
+   if (kvm_allowed && kvm_apic_level) {
+   ext_apic_bus_deliver(dest, trig_mode, dest_mode,
+delivery_mode, vector);
+   cpu_interrupt(s->cpu_env, CPU_INTERRUPT_HARD);
+   } else {
+   apic_get_delivery_bitmask(deliver_bitmask, dest,
+ dest_mode);
+   apic_bus_deliver(deliver_bitmask, delivery_mode, 
+vector, polarity, trig_mode);
+   }
 }
 }
 }
@@ -1045,7 +1054,7 @@ static CPUWriteMemoryFunc *ioapic_mem_write[3] = {
 ioapic_mem_writel,
 };
 
-IOAPICState *ioapic_init(void)
+IOAPICState *ioapic_init(CPUState *env)
 {
 IOAPICState *s;
 int io_memory;
@@ -1054,6 +1063,7 @@ IOAPICState *ioapic_init(void)
 if (!s)
 return NULL;
 ioapic_reset(s);
+s->cpu_env = env;
 s->id = last_apic_id++;
 
 io_memory = cpu_register_io_memory(0, ioapic_mem_read, 
diff --git a/qemu/hw/pc.c b/qemu/hw/pc.c
index eda49cf..df51539 100644
--- a/qemu/hw/pc.c
+++ b/qemu/hw/pc.c
@@ -91,16 +91,19 @@ int cpu_get_pic_interrupt(CPUState *env)
 {
 int intno;
 
-intno = apic_get_interrupt(env);
-if (intno >= 0) {
-/* set irq request if a PIC irq is still pending */
-/* XXX: improve that */
-pic_update_irq(isa_pic); 
-return intno;
+if (!use_kernel_apic()) {
+   intno = apic_get_interrupt(env);
+   if (intno >= 0) {
+   /* set irq request if a PIC irq is still pending */
+   /* XXX: improve that */
+   pic_update_irq(isa_pic); 
+   return intno;
+   }
+   
+   /* read the irq from the PIC */
+   if (!apic_accept_pic_intr(env))
+   return -1;
 }
-/* read the irq from the PIC */
-if (!apic_accept_pic_intr(env))
-return -1;
 
 intno = pic_read_irq(isa_pic);
 return intno;
@@ -483,9 +486,9 @@ static void pc_init1(int ram_size, int vga_ram_size, int 
boot_device,
 }
 register_savevm("cpu", i, 4, cpu_save, cpu_load, env);
 qemu_register_reset(main_cpu_reset, env);
-if (pci_enabled) {
-apic_init(env);
-}
+   if (!use_kernel_apic() && pci_enabled) {
+   apic_init(env);
+   }
 }
 
 /* allocate RAM */
@@ -671,7 +674,7 @@ static void pc_init1(int ram_size, int vga_ram_size, int 
boot_device,
 register_ioport_write(0x92, 1, 1, ioport92_write, NULL);
 
 if (pci_enabled) {
-ioapic = ioapic_init();
+ioapic = ioapic_init(env);
 }
 isa_pic = pic_init(pic_irq_request, first_cpu);
 pit = pit_init(0x40, 0);
diff --git a/qemu/qemu-kvm.c b/qemu/qemu-kvm.c
index faa4684..59e79bf 100644
--- a/qemu/qemu-kvm.c
+++ b/qemu/qemu-kvm.c
@@ -235,9 +235,16 @@ static void load_regs(CPUState *env)
 sregs.cr3 = env->cr[3];
 sregs.cr4 = env->cr[4];
 
-sregs.apic_base = cpu_get_apic_base(env);
+if (!kvm_apic_level) {
+   /* These two are no longer used once the in-kernel APIC is enabled */
+   sregs.apic_base = 0;
+   sregs.cr8 = 0;
+} else {
+   sregs.apic_base = cpu_get_apic_base(env);
+   sregs.cr8 = cpu_get_apic_tpr(env);
+}
+
 sregs.efer = env->efer;
-sregs.cr8 = cpu_get_apic_tpr(env);
 
 kvm_set_sregs(kvm_context, 0, &sregs);
 
@@ -329,10 +336,12 @@ static void save_regs(CPUState *env)
 env->cr[3] = sregs.cr3;
 env->cr[4] = sregs.cr4;
 
-cpu_set_apic_base(env, sregs.apic_base);
+if (!kvm_apic_level) {
+   cpu_set_apic_base(env, sregs.

[kvm-devel] [PATCH 5/5] KVM-USER: Add support for listening for kernel-based interrupts

2007-05-14 Thread Gregory Haskins
Signed-off-by: Gregory Haskins <[EMAIL PROTECTED]>
---

 user/kvmctl.c |   32 
 user/kvmctl.h |2 ++
 2 files changed, 34 insertions(+), 0 deletions(-)

diff --git a/user/kvmctl.c b/user/kvmctl.c
index bc6e238..ea86426 100644
--- a/user/kvmctl.c
+++ b/user/kvmctl.c
@@ -22,6 +22,8 @@
 #include 
 #include 
 #include 
+#include 
+
 #include "kvmctl.h"
 
 #define EXPECTED_KVM_API_VERSION 12
@@ -398,6 +400,36 @@ int kvm_get_dirty_pages(kvm_context_t kvm, int slot, void 
*buf)
return kvm_get_map(kvm, KVM_GET_DIRTY_LOG, slot, buf);
 }
 
+#ifndef __NR_eventfd
+#if defined(__x86_64__)
+#define __NR_eventfd 283
+#elif defined(__i386__)
+#define __NR_eventfd 323
+#else
+#error Cannot detect your architecture!
+#endif
+#endif
+
+int kvm_vcpu_create_eventfd(kvm_context_t kvm, int vcpu)
+{
+   int r;
+/*
+* Replace this once the updated glibc comes out
+*/
+   int fd = syscall(__NR_eventfd, 0);
+
+   if (fd < 0)
+   return fd;
+
+   r = ioctl(kvm->vcpu_fd[vcpu], KVM_SET_EVENTFD, fd);
+   if (r < 0) {
+   close(fd);
+   return r;
+   }
+
+   return fd;
+}
+
 int kvm_get_mem_map(kvm_context_t kvm, int slot, void *buf)
 {
 #ifdef KVM_GET_MEM_MAP
diff --git a/user/kvmctl.h b/user/kvmctl.h
index b775347..07cbc08 100644
--- a/user/kvmctl.h
+++ b/user/kvmctl.h
@@ -337,6 +337,8 @@ void kvm_destroy_phys_mem(kvm_context_t, unsigned long 
phys_start,
  unsigned long len);
 int kvm_get_dirty_pages(kvm_context_t, int slot, void *buf);
 
+int kvm_vcpu_create_eventfd(kvm_context_t, int vcpu);
+
 
 /*
  * \brief Create a memory alias


-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


[kvm-devel] SDL problem with 2.6.21/kvm-17 on x86_64?

2007-05-14 Thread Joshua Hoblitt
Hello,

I'm attempting to get kvm-17 to work on a "server" system that has a kernel
built without alsa support and libsdl is built without alsa or oss support.

I get this error when attempting to start qemu:
--
$ sudo /usr/local/kvm/bin/qemu-system-x86_64  . -hda vdisk.img -cdrom 
/data/ipp000.0/jhoblitt/openSUSE-10.2-GM-DVD-x86_64.iso -boot d -m 1024
Could not configure '/dev/rtc' to have a 1024 Hz timer. This is not a fatal
error, but for better emulation accuracy either use a 2.6 host Linux
kernel or type 'echo 1024 > /proc/sys/dev/rtc/max-user-freq' as root.
kvm_create_vm: Invalid argument
Could not create KVM context
$ lsmod | grep kvm
kvm_amd19348  0 
kvm68040  1 kvm_amd
--

This system has 2 x Opteron 2220s on a Tyan S2927.  I'm guessing that this
failure is somehow related to sound, as when I try to start qemu with -no-kvm...

--
 sudo /usr/local/kvm/bin/qemu-system-x86_64  . -hda vdisk.img -cdrom 
/data/ipp000.0/jhoblitt/openSUSE-10.2-GM-DVD-x86_64.iso -boot d -m 1024 -no-kvm
Could not configure '/dev/rtc' to have a 1024 Hz timer. This is not a fatal
error, but for better emulation accuracy either use a 2.6 host Linux kernel or
type 'echo 1024 > /proc/sys/dev/rtc/max-user-freq' as root.
Could not initialize SDL - exiting
--

So is the lack of sound hardware the problem here?  If so, will it be possible
at some point to run without it?

-J

--


pgptqZ1FJw0Bs.pgp
Description: PGP signature
-
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [PATCH] lighweight VM Exit

2007-05-14 Thread Dong, Eddie
Avi Kivity wrote:
> Why not use hardware autoloading?  Is it slower than software?

I believe HW is faster than SW, but the problem is that this kind of
save/restore is only needed for a heavyweight VM Exit in KVM. Since HW
doesn't provide an easy way to bypass these MSR saves/restores for a
lightweight VM Exit, we have to do that in SW.

> 
> Otherwise looks good.  Did you measure performance improvement?  I
> usually use user/test/vmexit.c from kvm-userspace.git.
> 

Yes, I tested a RHEL5 64-bit guest. On my old Pentium 4 platform, I get a
4.9% performance increase using Kernel Builder as the workload. On my 4 core
Clovertown platform (Core 2 Duo), I get a 5.4% performance increase.
The 32-bit guest test didn't show a regression either.
Further improvements can be made based on this patch, such as MSR_EFER
virtualization.

thx,eddie

A slight revision per Christoph's comments.

Signed-off-by:  Yaozu(Eddie) Dong [EMAIL PROTECTED]

against ca76d209b88c344fc6a8eac17057c0088a3d6940.



diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h
index 1bbafba..08dd73f 100644
--- a/drivers/kvm/kvm.h
+++ b/drivers/kvm/kvm.h
@@ -287,6 +287,7 @@ struct kvm_vcpu {
u64 apic_base;
u64 ia32_misc_enable_msr;
int nmsrs;
+   int smsrs_bitmap;
struct vmx_msr_entry *guest_msrs;
struct vmx_msr_entry *host_msrs;
 
diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c
index 1288cff..44d8bc4 100644
--- a/drivers/kvm/kvm_main.c
+++ b/drivers/kvm/kvm_main.c
@@ -1596,21 +1596,27 @@ void kvm_resched(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_resched);
 
-void load_msrs(struct vmx_msr_entry *e, int n)
+void load_msrs(struct vmx_msr_entry *e, int bitmap)
 {
-   int i;
+   unsigned long nr;
 
-   for (i = 0; i < n; ++i)
-   wrmsrl(e[i].index, e[i].data);
+   while (bitmap) {
+   nr = __ffs(bitmap);
+   wrmsrl(e[nr].index, e[nr].data);
+   __clear_bit(nr,&bitmap);
+   }
 }
 EXPORT_SYMBOL_GPL(load_msrs);
 
-void save_msrs(struct vmx_msr_entry *e, int n)
+void save_msrs(struct vmx_msr_entry *e, int bitmap)
 {
-   int i;
+   unsigned long nr;
 
-   for (i = 0; i < n; ++i)
-   rdmsrl(e[i].index, e[i].data);
+   while (bitmap) {
+   nr = __ffs(bitmap);
+   rdmsrl(e[nr].index, e[nr].data);
+   __clear_bit(nr,&bitmap);
+   }
 }
 EXPORT_SYMBOL_GPL(save_msrs);
 
diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c
index 804a623..0c69fe4 100644
--- a/drivers/kvm/vmx.c
+++ b/drivers/kvm/vmx.c
@@ -86,15 +86,6 @@ static const u32 vmx_msr_index[] = {
 
 #ifdef CONFIG_X86_64
 static unsigned msr_offset_kernel_gs_base;
-#define NR_64BIT_MSRS 4
-/*
- * avoid save/load MSR_SYSCALL_MASK and MSR_LSTAR by std vt
- * mechanism (cpu bug AA24)
- */
-#define NR_BAD_MSRS 2
-#else
-#define NR_64BIT_MSRS 0
-#define NR_BAD_MSRS 0
 #endif
 
 static inline int is_page_fault(u32 intr_info)
@@ -117,13 +108,23 @@ static inline int is_external_interrupt(u32
intr_info)
== (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
 }
 
-static struct vmx_msr_entry *find_msr_entry(struct kvm_vcpu *vcpu, u32
msr)
+static int __find_msr_index(struct kvm_vcpu *vcpu, u32 msr)
 {
int i;
 
for (i = 0; i < vcpu->nmsrs; ++i)
if (vcpu->guest_msrs[i].index == msr)
-   return &vcpu->guest_msrs[i];
+   return i;
+   return -1;
+}
+
+static struct vmx_msr_entry *find_msr_entry(struct kvm_vcpu *vcpu, u32
msr)
+{
+   int i;
+
+   i = __find_msr_index(vcpu, msr);
+   if (i >= 0) 
+   return &vcpu->guest_msrs[i];
return NULL;
 }
 
@@ -306,10 +307,10 @@ static void vmx_save_host_state(struct kvm_vcpu
*vcpu)
 
 #ifdef CONFIG_X86_64
if (is_long_mode(vcpu)) {
-   save_msrs(vcpu->host_msrs + msr_offset_kernel_gs_base,
1);
-   load_msrs(vcpu->guest_msrs, NR_BAD_MSRS);
+   save_msrs(vcpu->host_msrs, 1 <<
msr_offset_kernel_gs_base);
}
 #endif
+   load_msrs(vcpu->guest_msrs, vcpu->smsrs_bitmap);
 }
 
 static void vmx_load_host_state(struct kvm_vcpu *vcpu)
@@ -336,12 +337,8 @@ static void vmx_load_host_state(struct kvm_vcpu
*vcpu)
 
reload_tss();
}
-#ifdef CONFIG_X86_64
-   if (is_long_mode(vcpu)) {
-   save_msrs(vcpu->guest_msrs, NR_BAD_MSRS);
-   load_msrs(vcpu->host_msrs, NR_BAD_MSRS);
-   }
-#endif
+   save_msrs(vcpu->guest_msrs, vcpu->smsrs_bitmap);
+   load_msrs(vcpu->host_msrs, vcpu->smsrs_bitmap);
 }
 
 /*
@@ -469,35 +466,51 @@ static void vmx_inject_gp(struct kvm_vcpu *vcpu,
unsigned error_code)
  */
 static void setup_msrs(struct kvm_vcpu *vcpu)
 {
-   int nr_skip, nr_good_msrs;
-
-   if (is_long_mode(vcpu))
-   nr_skip = NR_BAD_MSRS;
-   else
-   nr_skip = NR_64BIT_MSRS;
-   nr_good_msrs = vcpu->nmsrs - nr_skip;
+   int index,save_msrs;
 
-   /*
-