[PATCH 05/20] kvm: Pass CPUState to kvm_init_vcpu()

2013-01-15 Thread Andreas Färber
CPUArchState is no longer needed, and it thereby no longer depends on
NEED_CPU_H.

Signed-off-by: Andreas Färber afaer...@suse.de
---
 cpus.c   |2 +-
 include/sysemu/kvm.h |5 +++--
 kvm-all.c|3 +--
 kvm-stub.c   |2 +-
 4 Dateien geändert, 6 Zeilen hinzugefügt(+), 6 Zeilen entfernt(-)

diff --git a/cpus.c b/cpus.c
index bbb8961..a4390c3 100644
--- a/cpus.c
+++ b/cpus.c
@@ -742,7 +742,7 @@ static void *qemu_kvm_cpu_thread_fn(void *arg)
 cpu-thread_id = qemu_get_thread_id();
 cpu_single_env = env;
 
-r = kvm_init_vcpu(env);
+r = kvm_init_vcpu(cpu);
 if (r  0) {
 fprintf(stderr, kvm_init_vcpu failed: %s\n, strerror(-r));
 exit(1);
diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
index 3db19ff..2fe8f8a 100644
--- a/include/sysemu/kvm.h
+++ b/include/sysemu/kvm.h
@@ -17,6 +17,7 @@
 #include errno.h
 #include config-host.h
 #include qemu/queue.h
+#include qom/cpu.h
 
 #ifdef CONFIG_KVM
 #include linux/kvm.h
@@ -120,9 +121,9 @@ int kvm_has_many_ioeventfds(void);
 int kvm_has_gsi_routing(void);
 int kvm_has_intx_set_mask(void);
 
-#ifdef NEED_CPU_H
-int kvm_init_vcpu(CPUArchState *env);
+int kvm_init_vcpu(CPUState *cpu);
 
+#ifdef NEED_CPU_H
 int kvm_cpu_exec(CPUArchState *env);
 
 #if !defined(CONFIG_USER_ONLY)
diff --git a/kvm-all.c b/kvm-all.c
index 4ba77de..6e2164b 100644
--- a/kvm-all.c
+++ b/kvm-all.c
@@ -214,9 +214,8 @@ static void kvm_reset_vcpu(void *opaque)
 kvm_arch_reset_vcpu(cpu);
 }
 
-int kvm_init_vcpu(CPUArchState *env)
+int kvm_init_vcpu(CPUState *cpu)
 {
-CPUState *cpu = ENV_GET_CPU(env);
 KVMState *s = kvm_state;
 long mmap_size;
 int ret;
diff --git a/kvm-stub.c b/kvm-stub.c
index 81f8967..47f8dca 100644
--- a/kvm-stub.c
+++ b/kvm-stub.c
@@ -24,7 +24,7 @@ bool kvm_irqfds_allowed;
 bool kvm_msi_via_irqfd_allowed;
 bool kvm_gsi_routing_allowed;
 
-int kvm_init_vcpu(CPUArchState *env)
+int kvm_init_vcpu(CPUState *cpu)
 {
 return -ENOSYS;
 }
-- 
1.7.10.4

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 04/20] cpu: Move cpu_index field to CPUState

2013-01-15 Thread Andreas Färber
Note that target-alpha accesses this field from TCG, now using a
negative offset. Therefore the field is placed last in CPUState.

Pass PowerPCCPU to [kvm]ppc_fixup_cpu() to facilitate this change.

Move common parts of mips cpu_state_reset() to mips_cpu_reset().

Acked-by: Richard Henderson r...@twiddle.net (for alpha)
[AF: Rebased onto ppc CPU subclasses and openpic changes]
Signed-off-by: Andreas Färber afaer...@suse.de
---
 cpus.c  |   14 +-
 exec.c  |   13 +++--
 gdbstub.c   |3 ++-
 hw/alpha_typhoon.c  |4 +++-
 hw/arm_gic.c|3 ++-
 hw/arm_mptimer.c|8 +---
 hw/openpic.c|5 -
 hw/ppc/e500.c   |   17 +++--
 hw/ppce500_spin.c   |8 +---
 hw/pxa.h|2 +-
 hw/pxa2xx.c |4 ++--
 hw/pxa2xx_gpio.c|5 +++--
 hw/spapr.c  |   11 ++-
 hw/spapr_hcall.c|4 +++-
 hw/spapr_rtas.c |8 +---
 hw/xics.c   |   22 --
 include/exec/cpu-defs.h |1 -
 include/exec/gdbstub.h  |3 ++-
 include/qom/cpu.h   |2 ++
 kvm-all.c   |2 +-
 monitor.c   |   15 ++-
 target-alpha/translate.c|2 +-
 target-arm/cpu.c|2 +-
 target-arm/helper.c |3 ++-
 target-cris/cpu.c   |2 +-
 target-i386/cpu.c   |7 ---
 target-i386/helper.c|   15 ---
 target-i386/misc_helper.c   |5 -
 target-lm32/cpu.c   |2 +-
 target-m68k/cpu.c   |2 +-
 target-microblaze/cpu.c |2 +-
 target-mips/cpu.c   |8 
 target-mips/translate.c |   17 +++--
 target-openrisc/cpu.c   |2 +-
 target-ppc/kvm.c|   12 +++-
 target-ppc/kvm_ppc.h|4 ++--
 target-ppc/translate_init.c |   10 ++
 target-s390x/cpu.c  |2 +-
 target-sh4/cpu.c|2 +-
 target-sparc/cpu.c  |2 +-
 40 Dateien geändert, 153 Zeilen hinzugefügt(+), 102 Zeilen entfernt(-)

diff --git a/cpus.c b/cpus.c
index d68231a..bbb8961 100644
--- a/cpus.c
+++ b/cpus.c
@@ -390,13 +390,15 @@ void hw_error(const char *fmt, ...)
 {
 va_list ap;
 CPUArchState *env;
+CPUState *cpu;
 
 va_start(ap, fmt);
 fprintf(stderr, qemu: hardware error: );
 vfprintf(stderr, fmt, ap);
 fprintf(stderr, \n);
-for(env = first_cpu; env != NULL; env = env-next_cpu) {
-fprintf(stderr, CPU #%d:\n, env-cpu_index);
+for (env = first_cpu; env != NULL; env = env-next_cpu) {
+cpu = ENV_GET_CPU(env);
+fprintf(stderr, CPU #%d:\n, cpu-cpu_index);
 cpu_dump_state(env, stderr, fprintf, CPU_DUMP_FPU);
 }
 va_end(ap);
@@ -1166,7 +1168,7 @@ void set_numa_modes(void)
 for (env = first_cpu; env != NULL; env = env-next_cpu) {
 cpu = ENV_GET_CPU(env);
 for (i = 0; i  nb_numa_nodes; i++) {
-if (test_bit(env-cpu_index, node_cpumask[i])) {
+if (test_bit(cpu-cpu_index, node_cpumask[i])) {
 cpu-numa_node = i;
 }
 }
@@ -1215,7 +1217,7 @@ CpuInfoList *qmp_query_cpus(Error **errp)
 
 info = g_malloc0(sizeof(*info));
 info-value = g_malloc0(sizeof(*info-value));
-info-value-CPU = env-cpu_index;
+info-value-CPU = cpu-cpu_index;
 info-value-current = (env == first_cpu);
 info-value-halted = env-halted;
 info-value-thread_id = cpu-thread_id;
@@ -1253,6 +1255,7 @@ void qmp_memsave(int64_t addr, int64_t size, const char 
*filename,
 FILE *f;
 uint32_t l;
 CPUArchState *env;
+CPUState *cpu;
 uint8_t buf[1024];
 
 if (!has_cpu) {
@@ -1260,7 +1263,8 @@ void qmp_memsave(int64_t addr, int64_t size, const char 
*filename,
 }
 
 for (env = first_cpu; env; env = env-next_cpu) {
-if (cpu_index == env-cpu_index) {
+cpu = ENV_GET_CPU(env);
+if (cpu_index == cpu-cpu_index) {
 break;
 }
 }
diff --git a/exec.c b/exec.c
index de5b27d..e5265e6 100644
--- a/exec.c
+++ b/exec.c
@@ -247,13 +247,16 @@ static const VMStateDescription vmstate_cpu_common = {
 };
 #endif
 
-CPUArchState *qemu_get_cpu(int cpu)
+CPUArchState *qemu_get_cpu(int index)
 {
 CPUArchState *env = first_cpu;
+CPUState *cpu;
 
 while (env) {
-if (env-cpu_index == cpu)
+cpu = ENV_GET_CPU(env);
+if (cpu-cpu_index == index) {
 break;
+}
 env = env-next_cpu;
 }
 
@@ -276,7 +279,7 @@ void cpu_exec_init(CPUArchState *env)
 penv = (*penv)-next_cpu;
 cpu_index++;
 }
-env-cpu_index = cpu_index;
+cpu-cpu_index = cpu_index;
 cpu-numa_node = 0;
 QTAILQ_INIT(env-breakpoints);
 QTAILQ_INIT(env-watchpoints);
@@ -529,7 

Re: [PATCH v5 07/14] KVM: ARM: World-switch implementation

2013-01-15 Thread Gleb Natapov
On Tue, Jan 08, 2013 at 01:39:24PM -0500, Christoffer Dall wrote:
 Provides complete world-switch implementation to switch to other guests
 running in non-secure modes. Includes Hyp exception handlers that
 capture necessary exception information and stores the information on
 the VCPU and KVM structures.
 
 The following Hyp-ABI is also documented in the code:
 
 Hyp-ABI: Calling HYP-mode functions from host (in SVC mode):
Switching to Hyp mode is done through a simple HVC #0 instruction. The
exception vector code will check that the HVC comes from VMID==0 and if
so will push the necessary state (SPSR, lr_usr) on the Hyp stack.
- r0 contains a pointer to a HYP function
- r1, r2, and r3 contain arguments to the above function.
- The HYP function will be called with its arguments in r0, r1 and r2.
On HYP function return, we return directly to SVC.
 
 A call to a function executing in Hyp mode is performed like the following:
 
 svc code
 ldr r0, =BSYM(my_hyp_fn)
 ldr r1, =my_param
 hvc #0  ; Call my_hyp_fn(my_param) from HYP mode
 svc code
 
 Otherwise, the world-switch is pretty straight-forward. All state that
 can be modified by the guest is first backed up on the Hyp stack and the
 VCPU values is loaded onto the hardware. State, which is not loaded, but
 theoretically modifiable by the guest is protected through the
 virtualiation features to generate a trap and cause software emulation.
 Upon guest returns, all state is restored from hardware onto the VCPU
 struct and the original state is restored from the Hyp-stack onto the
 hardware.
 
 SMP support using the VMPIDR calculated on the basis of the host MPIDR
 and overriding the low bits with KVM vcpu_id contributed by Marc Zyngier.
 
 Reuse of VMIDs has been implemented by Antonios Motakis and adapated from
 a separate patch into the appropriate patches introducing the
 functionality. Note that the VMIDs are stored per VM as required by the ARM
 architecture reference manual.
 
 To support VFP/NEON we trap those instructions using the HPCTR. When
 we trap, we switch the FPU.  After a guest exit, the VFP state is
 returned to the host.  When disabling access to floating point
 instructions, we also mask FPEXC_EN in order to avoid the guest
 receiving Undefined instruction exceptions before we have a chance to
 switch back the floating point state.  We are reusing vfp_hard_struct,
 so we depend on VFPv3 being enabled in the host kernel, if not, we still
 trap cp10 and cp11 in order to inject an undefined instruction exception
 whenever the guest tries to use VFP/NEON. VFP/NEON developed by
 Antionios Motakis and Rusty Russell.
 
 Aborts that are permission faults, and not stage-1 page table walk, do
 not report the faulting address in the HPFAR.  We have to resolve the
 IPA, and store it just like the HPFAR register on the VCPU struct. If
 the IPA cannot be resolved, it means another CPU is playing with the
 page tables, and we simply restart the guest.  This quirk was fixed by
 Marc Zyngier.
 
 Reviewed-by: Marcelo Tosatti mtosa...@redhat.com
 Signed-off-by: Rusty Russell rusty.russ...@linaro.org
 Signed-off-by: Antonios Motakis a.mota...@virtualopensystems.com
 Signed-off-by: Marc Zyngier marc.zyng...@arm.com
 Signed-off-by: Christoffer Dall c.d...@virtualopensystems.com
 ---
  arch/arm/include/asm/kvm_arm.h  |   51 
  arch/arm/include/asm/kvm_host.h |   10 +
  arch/arm/kernel/asm-offsets.c   |   25 ++
  arch/arm/kvm/arm.c  |  187 
  arch/arm/kvm/interrupts.S   |  396 +++
  arch/arm/kvm/interrupts_head.S  |  443 
 +++
  6 files changed, 1108 insertions(+), 4 deletions(-)
  create mode 100644 arch/arm/kvm/interrupts_head.S
 
 diff --git a/arch/arm/include/asm/kvm_arm.h b/arch/arm/include/asm/kvm_arm.h
 index fb22ee8..a3262a2 100644
 --- a/arch/arm/include/asm/kvm_arm.h
 +++ b/arch/arm/include/asm/kvm_arm.h
 @@ -98,6 +98,18 @@
  #define TTBCR_T0SZ   3
  #define HTCR_MASK(TTBCR_T0SZ | TTBCR_IRGN0 | TTBCR_ORGN0 | TTBCR_SH0)
  
 +/* Hyp System Trap Register */
 +#define HSTR_T(x)(1  x)
 +#define HSTR_TTEE(1  16)
 +#define HSTR_TJDBX   (1  17)
 +
 +/* Hyp Coprocessor Trap Register */
 +#define HCPTR_TCP(x) (1  x)
 +#define HCPTR_TCP_MASK   (0x3fff)
 +#define HCPTR_TASE   (1  15)
 +#define HCPTR_TTA(1  20)
 +#define HCPTR_TCPAC  (1  31)
 +
  /* Hyp Debug Configuration Register bits */
  #define HDCR_TDRA(1  11)
  #define HDCR_TDOSA   (1  10)
 @@ -144,6 +156,45 @@
  #else
  #define VTTBR_X  (5 - KVM_T0SZ)
  #endif
 +#define VTTBR_BADDR_SHIFT (VTTBR_X - 1)
 +#define VTTBR_BADDR_MASK  (((1LLU  (40 - VTTBR_X)) - 1)  
 VTTBR_BADDR_SHIFT)
 +#define VTTBR_VMID_SHIFT  (48LLU)
 +#define VTTBR_VMID_MASK(0xffLLU  VTTBR_VMID_SHIFT)
 +
 +/* Hyp Syndrome Register (HSR) bits */
 +#define HSR_EC_SHIFT (26)
 +#define HSR_EC   (0x3fU  

[PATCH 1/2] virtio-scsi: split out request queue set affinity function

2013-01-15 Thread Wanlong Gao
These two patches are based on the multi-queue virtio-scsi patch set.

We set cpu affinity when the num_queues equals to the number
of VCPUs. Split out the set affinity function, this also
fix the bug when CPU IDs are not consecutive.

Signed-off-by: Wanlong Gao gaowanl...@cn.fujitsu.com
---
 drivers/scsi/virtio_scsi.c | 50 --
 1 file changed, 44 insertions(+), 6 deletions(-)

diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c
index 3641d5f..16b0ef2 100644
--- a/drivers/scsi/virtio_scsi.c
+++ b/drivers/scsi/virtio_scsi.c
@@ -106,6 +106,9 @@ struct virtio_scsi {
 
u32 num_queues;
 
+   /* Does the affinity hint is set for virtqueues? */
+   bool affinity_hint_set;
+
struct virtio_scsi_vq ctrl_vq;
struct virtio_scsi_vq event_vq;
struct virtio_scsi_vq req_vqs[];
@@ -701,14 +704,45 @@ static struct scsi_host_template 
virtscsi_host_template_multi = {
  __val, sizeof(__val)); \
})
 
+static void virtscsi_set_affinity(struct virtio_scsi *vscsi, bool affinity)
+{
+   int i;
+   int cpu;
+
+   /* In multiqueue mode, when the number of cpu is equal
+* to the number of request queues, we let the qeueues
+* to be private to one cpu by setting the affinity hint
+* to eliminate the contention.
+*/
+   if ((vscsi-num_queues == 1 ||
+vscsi-num_queues != num_online_cpus())  affinity) {
+   if (vscsi-affinity_hint_set)
+   affinity = false;
+   else
+   return;
+   }
+
+   if (affinity) {
+   i = 0;
+   for_each_online_cpu(cpu) {
+   virtqueue_set_affinity(vscsi-req_vqs[i].vq, cpu);
+   i++;
+   }
+
+   vscsi-affinity_hint_set = true;
+   } else {
+   for (i = 0; i  vscsi-num_queues - VIRTIO_SCSI_VQ_BASE; i++)
+   virtqueue_set_affinity(vscsi-req_vqs[i].vq, -1);
+
+   vscsi-affinity_hint_set = false;
+   }
+}
 
 static void virtscsi_init_vq(struct virtio_scsi_vq *virtscsi_vq,
-struct virtqueue *vq, bool affinity)
+struct virtqueue *vq)
 {
spin_lock_init(virtscsi_vq-vq_lock);
virtscsi_vq-vq = vq;
-   if (affinity)
-   virtqueue_set_affinity(vq, vq-index - VIRTIO_SCSI_VQ_BASE);
 }
 
 static void virtscsi_init_tgt(struct virtio_scsi *vscsi, int i)
@@ -736,6 +770,8 @@ static void virtscsi_remove_vqs(struct virtio_device *vdev)
struct Scsi_Host *sh = virtio_scsi_host(vdev);
struct virtio_scsi *vscsi = shost_priv(sh);
 
+   virtscsi_set_affinity(vscsi, false);
+
/* Stop all the virtqueues. */
vdev-config-reset(vdev);
 
@@ -779,11 +815,13 @@ static int virtscsi_init(struct virtio_device *vdev,
if (err)
return err;
 
-   virtscsi_init_vq(vscsi-ctrl_vq, vqs[0], false);
-   virtscsi_init_vq(vscsi-event_vq, vqs[1], false);
+   virtscsi_init_vq(vscsi-ctrl_vq, vqs[0]);
+   virtscsi_init_vq(vscsi-event_vq, vqs[1]);
for (i = VIRTIO_SCSI_VQ_BASE; i  num_vqs; i++)
virtscsi_init_vq(vscsi-req_vqs[i - VIRTIO_SCSI_VQ_BASE],
-vqs[i], vscsi-num_queues  1);
+vqs[i]);
+
+   virtscsi_set_affinity(vscsi, true);
 
virtscsi_config_set(vdev, cdb_size, VIRTIO_SCSI_CDB_SIZE);
virtscsi_config_set(vdev, sense_size, VIRTIO_SCSI_SENSE_SIZE);
-- 
1.8.1
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH KVM v2 1/4] KVM: fix i8254 IRQ0 to be normally high

2013-01-15 Thread Matthew Ogilvie
On Fri, Jan 11, 2013 at 05:45:28PM +0200, Gleb Natapov wrote:
 On Thu, Jan 10, 2013 at 11:40:07PM -0700, Matthew Ogilvie wrote:
  On Tue, Jan 08, 2013 at 09:31:19AM +0200, Gleb Natapov wrote:
   On Mon, Jan 07, 2013 at 06:17:22PM -0600, mmogi...@miniinfo.net wrote:
On Mon, 7 Jan 2013 11:39:18 +0200, Gleb Natapov g...@redhat.com wrote:
 On Wed, Dec 26, 2012 at 10:39:53PM -0700, Matthew Ogilvie wrote:
 Reading the spec, it is clear that most modes normally leave the IRQ
 output line high, and only pulse it low to generate a leading edge.
 Especially the most commonly used mode 2.
 
 The KVM i8254 model does not try to emulate the duration of the 
 pulse at
 all, so just swap the high/low settings it to leave it high most of
 the time.
 
 This fix is a prerequisite to improving the i8259 model to handle
 the trailing edge of an interupt request as indicated in its spec:
 If it gets a trailing edge of an IRQ line before it starts to service
 the interrupt, the request should be canceled.
 
 See http://bochs.sourceforge.net/techspec/intel-82c54-timer.pdf.gz
 or search the net for 23124406.pdf.
 
 Risks:
 
 There is a risk that migrating a running guest between versions
 with and without this patch will lose or gain a single timer
 interrupt during the migration process.  The only case where
 Can you elaborate on how exactly this can happen? Do not see it.
 

KVM 8254: In the corrected model, when the count expires, the model
briefly pulses output low and then high again, with the low to high
transition being what triggers the interrupt.  In the old model,
when the count expires, the model expects the output line
to already be low, and briefly pulses it high (triggering the
interrupt) and then low again.  But if the line was already
high (because it migrated from the corrected model),
this won't generate a new leading edge (low to high) and won't
trigger a new interrupt (the first post-back-migration pulse turns
into a simple trailing edge instead of a pulse).

Unless there is something I'm missing?

   No, I missed that pic-last_irr/ioapic-irr  will be migrated as 1. But
   this means that next interrupt after migration from new to old will
   always be lost.  What about clearing pit bit from last_irr/irr before
   migration? Should not affect new-new migration and should fix new-old
   one. The only problem is that we may need to consult irq routing table
   to know how pit is connected to ioapic.
  
  We should not clear both last_irr and irr.  That
  cancels the interrupt early if the CPU hasn't started servicing
  it yet:  If the guest CPU has interrupts disabled
  and hasn't begun to service the interrupt, a new-new migration could
  lose the interrupt much earlier in the countdown cycle than it otherwise
  might be lost.
  
 I am talking about last_irr in pic and irr in ioapic. Of course we
 shouldn't clear pic-irr on migration. ioapic uses irr to detect edge.

I probably need to look into the details of how the ioapic is supposed
to work.  Sigh.

 
  Potentially we could clear the last_irr bit only.  This effectively
  makes migration behave like the old unfixed code.  But if this is
  considered acceptable, I would be more inclined to not fix IRQ0 at all,
 If we do not fix IRQ0 next timer interrupt after migration will always be
 lost.

Obviously true if the trailing edge consumption logic for the
IRQ0 line is corrected in the 8259.  But if it isn't:

I probably could have been clearer that we could leave the 8254 unchanged,
and adjust the 8259 fix to leave IRQ0 as-is (with the incorrect handling
of a trailing edge - only other IRQ lines would be fixed), and then
timer interrupts would just work exactly as they do now.
This minimal fix would would probably be the lowest risk of
breaking something that currently works, but I don't know if
people would go for a patch that intentionally leaves in known
breakage in IRQ0...  This is option 1 below.

--

If we cleared last_irr in the qemu model during migration,
we risk getting an EXTRA interrupt when
migrating mode 4 (misremembered as mode 2 in an earlier
email below) from old to new models.  (If the sequence goes: line
is asserted, guest migrates, interrupt is handled [all in less than
a 8254 clock tick (approx 1 MHz)], then the off-by-one edge
in the new model triggers another leading edge...)  In contrast,
this might not be a risk in the KVM model as currently implemented.

Maybe this objection is not important, and you like option 4
listed below.

---

So I'm still not sure what overall fix strategy the main qemu and kvm
maintainers would like the best.  There are several possibilities,
but they all seem to have notable downsides.

Some possible fix strategies:

1. Only fix the IRQ2 (cascade) line in 8259.  Leave trailing edge
   logic for other lines as-is, and don't touch the 8254.  (Or

Re: [PATCH v5 06/14] KVM: ARM: Inject IRQs and FIQs from userspace

2013-01-15 Thread Gleb Natapov
On Tue, Jan 08, 2013 at 01:39:17PM -0500, Christoffer Dall wrote:
 From: Christoffer Dall cd...@cs.columbia.edu
 
 All interrupt injection is now based on the VM ioctl KVM_IRQ_LINE.  This
 works semantically well for the GIC as we in fact raise/lower a line on
 a machine component (the gic).  The IOCTL uses the follwing struct.
 
 struct kvm_irq_level {
   union {
   __u32 irq; /* GSI */
   __s32 status;  /* not used for KVM_IRQ_LEVEL */
   };
   __u32 level;   /* 0 or 1 */
 };
 
 ARM can signal an interrupt either at the CPU level, or at the in-kernel 
 irqchip
CPU level interrupt should use KVM_INTERRUPT instead.

 (GIC), and for in-kernel irqchip can tell the GIC to use PPIs designated for
 specific cpus.  The irq field is interpreted like this:
 
Haven't read about GIC yet. Is PPI an interrupt that device can send
directly to a specific CPU? Can we model that with irq routing like we do
for MSI?

   bits:  | 31 ... 24 | 23  ... 16 | 15...0 |
   field: | irq_type  | vcpu_index |   irq_number   |
 
 The irq_type field has the following values:
 - irq_type[0]: out-of-kernel GIC: irq_number 0 is IRQ, irq_number 1 is FIQ
 - irq_type[1]: in-kernel GIC: SPI, irq_number between 32 and 1019 (incl.)
(the vcpu_index field is ignored)
 - irq_type[2]: in-kernel GIC: PPI, irq_number between 16 and 31 (incl.)
 
 The irq_number thus corresponds to the irq ID in as in the GICv2 specs.
 
 This is documented in Documentation/kvm/api.txt.
 
 Reviewed-by: Marcelo Tosatti mtosa...@redhat.com
 Signed-off-by: Christoffer Dall c.d...@virtualopensystems.com
 ---
  Documentation/virtual/kvm/api.txt |   25 --
  arch/arm/include/asm/kvm_arm.h|1 +
  arch/arm/include/uapi/asm/kvm.h   |   21 
  arch/arm/kvm/arm.c|   65 
 +
  arch/arm/kvm/trace.h  |   25 ++
  include/uapi/linux/kvm.h  |1 +
  6 files changed, 134 insertions(+), 4 deletions(-)
 
 diff --git a/Documentation/virtual/kvm/api.txt 
 b/Documentation/virtual/kvm/api.txt
 index 4237c27..5050492 100644
 --- a/Documentation/virtual/kvm/api.txt
 +++ b/Documentation/virtual/kvm/api.txt
 @@ -615,15 +615,32 @@ created.
  4.25 KVM_IRQ_LINE
  
  Capability: KVM_CAP_IRQCHIP
 -Architectures: x86, ia64
 +Architectures: x86, ia64, arm
  Type: vm ioctl
  Parameters: struct kvm_irq_level
  Returns: 0 on success, -1 on error
  
  Sets the level of a GSI input to the interrupt controller model in the 
 kernel.
 -Requires that an interrupt controller model has been previously created with
 -KVM_CREATE_IRQCHIP.  Note that edge-triggered interrupts require the level
 -to be set to 1 and then back to 0.
 +On some architectures it is required that an interrupt controller model has
 +been previously created with KVM_CREATE_IRQCHIP.  Note that edge-triggered
 +interrupts require the level to be set to 1 and then back to 0.
 +
 +ARM can signal an interrupt either at the CPU level, or at the in-kernel 
 irqchip
 +(GIC), and for in-kernel irqchip can tell the GIC to use PPIs designated for
 +specific cpus.  The irq field is interpreted like this:
 +
 +  bits:  | 31 ... 24 | 23  ... 16 | 15...0 |
 +  field: | irq_type  | vcpu_index | irq_id |
 +
 +The irq_type field has the following values:
 +- irq_type[0]: out-of-kernel GIC: irq_id 0 is IRQ, irq_id 1 is FIQ
 +- irq_type[1]: in-kernel GIC: SPI, irq_id between 32 and 1019 (incl.)
 +   (the vcpu_index field is ignored)
 +- irq_type[2]: in-kernel GIC: PPI, irq_id between 16 and 31 (incl.)
 +
 +(The irq_id field thus corresponds nicely to the IRQ ID in the ARM GIC specs)
 +
 +In both cases, level is used to raise/lower the line.
  
  struct kvm_irq_level {
   union {
 diff --git a/arch/arm/include/asm/kvm_arm.h b/arch/arm/include/asm/kvm_arm.h
 index 613afe2..fb22ee8 100644
 --- a/arch/arm/include/asm/kvm_arm.h
 +++ b/arch/arm/include/asm/kvm_arm.h
 @@ -68,6 +68,7 @@
  #define HCR_GUEST_MASK (HCR_TSC | HCR_TSW | HCR_TWI | HCR_VM | HCR_BSU_IS | \
   HCR_FB | HCR_TAC | HCR_AMO | HCR_IMO | HCR_FMO | \
   HCR_SWIO | HCR_TIDCP)
 +#define HCR_VIRT_EXCP_MASK (HCR_VA | HCR_VI | HCR_VF)
  
  /* Hyp System Control Register (HSCTLR) bits */
  #define HSCTLR_TE(1  30)
 diff --git a/arch/arm/include/uapi/asm/kvm.h b/arch/arm/include/uapi/asm/kvm.h
 index c6298b1..4cf6d8f 100644
 --- a/arch/arm/include/uapi/asm/kvm.h
 +++ b/arch/arm/include/uapi/asm/kvm.h
 @@ -23,6 +23,7 @@
  #include asm/ptrace.h
  
  #define __KVM_HAVE_GUEST_DEBUG
 +#define __KVM_HAVE_IRQ_LINE
  
  #define KVM_REG_SIZE(id) \
   (1U  (((id)  KVM_REG_SIZE_MASK)  KVM_REG_SIZE_SHIFT))
 @@ -103,4 +104,24 @@ struct kvm_arch_memory_slot {
  #define KVM_REG_ARM_CORE (0x0010  KVM_REG_ARM_COPROC_SHIFT)
  #define KVM_REG_ARM_CORE_REG(name)   (offsetof(struct kvm_regs, name) / 4)
  
 +/* 

Re: BSoD, HAL.DLL

2013-01-15 Thread Michael Tokarev

14.01.2013 21:40, Sean Kennedy wrote:

I continue to get bluescreens on an XP virtual machine running on CentOS 6.3 
x86_64 using VirtIO drivers.

QEMU:
qemu-kvm-0.12.1.2-2.295.el6.x86_64



KERNEL:
Linux vmhost1 2.6.32-279.1.1.el6.x86_64 #1 SMP Tue Jul 10 13:47:21 UTC 2012 
x86_64 x86_64 x86_64 GNU/Linux


Please take this to RHEL support.  Their kernel and qemu-kvm are ancient but
very heavily patched, and it's nearly impossible to understand what's going
on there.

Thanks,

/mjt
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] s390: kvm/sigp.c: fix memory leakage

2013-01-15 Thread Cornelia Huck
On Mon, 14 Jan 2013 22:39:54 +0100
Cong Ding ding...@gmail.com wrote:

 the variable inti should be freed in the branch CPUSTAT_STOPPED.
 
 Signed-off-by: Cong Ding ding...@gmail.com
 ---
  arch/s390/kvm/sigp.c |4 +++-
  1 file changed, 3 insertions(+), 1 deletion(-)
 
 diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c
 index 461e841..1c48ab2 100644
 --- a/arch/s390/kvm/sigp.c
 +++ b/arch/s390/kvm/sigp.c
 @@ -137,8 +137,10 @@ static int __inject_sigp_stop(struct 
 kvm_s390_local_interrupt *li, int action)
   inti-type = KVM_S390_SIGP_STOP;
 
   spin_lock_bh(li-lock);
 - if ((atomic_read(li-cpuflags)  CPUSTAT_STOPPED))
 + if ((atomic_read(li-cpuflags)  CPUSTAT_STOPPED)) {
 + kfree(inti);
   goto out;
 + }
   list_add_tail(inti-list, li-list);
   atomic_set(li-active, 1);
   atomic_set_mask(CPUSTAT_STOP_INT, li-cpuflags);

Thanks, applied.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/2] virtio-scsi: reset virtqueue affinity when doing cpu hotplug

2013-01-15 Thread Wanlong Gao

Add hot cpu notifier to reset the request virtqueue affinity
when doing cpu hotplug.

Signed-off-by: Wanlong Gao gaowanl...@cn.fujitsu.com
---
 drivers/scsi/virtio_scsi.c | 30 ++
 1 file changed, 30 insertions(+)

diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c
index 16b0ef2..a3b5f12 100644
--- a/drivers/scsi/virtio_scsi.c
+++ b/drivers/scsi/virtio_scsi.c
@@ -20,6 +20,7 @@
 #include linux/virtio_ids.h
 #include linux/virtio_config.h
 #include linux/virtio_scsi.h
+#include linux/cpu.h
 #include scsi/scsi_host.h
 #include scsi/scsi_device.h
 #include scsi/scsi_cmnd.h
@@ -109,6 +110,9 @@ struct virtio_scsi {
/* Does the affinity hint is set for virtqueues? */
bool affinity_hint_set;
 
+   /* CPU hotplug notifier */
+   struct notifier_block nb;
+
struct virtio_scsi_vq ctrl_vq;
struct virtio_scsi_vq event_vq;
struct virtio_scsi_vq req_vqs[];
@@ -738,6 +742,23 @@ static void virtscsi_set_affinity(struct virtio_scsi 
*vscsi, bool affinity)
}
 }
 
+static int virtscsi_cpu_callback(struct notifier_block *nfb,
+unsigned long action, void *hcpu)
+{
+   struct virtio_scsi *vscsi = container_of(nfb, struct virtio_scsi, nb);
+   switch(action) {
+   case CPU_ONLINE:
+   case CPU_ONLINE_FROZEN:
+   case CPU_DEAD:
+   case CPU_DEAD_FROZEN:
+   virtscsi_set_affinity(vscsi, true);
+   break;
+   default:
+   break;
+   }
+   return NOTIFY_OK;
+}
+
 static void virtscsi_init_vq(struct virtio_scsi_vq *virtscsi_vq,
 struct virtqueue *vq)
 {
@@ -888,6 +909,13 @@ static int __devinit virtscsi_probe(struct virtio_device 
*vdev)
if (err)
goto virtscsi_init_failed;
 
+   vscsi-nb.notifier_call = virtscsi_cpu_callback;
+   err = register_hotcpu_notifier(vscsi-nb);
+   if (err) {
+   pr_err(virtio_scsi: registering cpu notifier failed\n);
+   goto scsi_add_host_failed;
+   }
+
cmd_per_lun = virtscsi_config_get(vdev, cmd_per_lun) ?: 1;
shost-cmd_per_lun = min_t(u32, cmd_per_lun, shost-can_queue);
shost-max_sectors = virtscsi_config_get(vdev, max_sectors) ?: 0x;
@@ -925,6 +953,8 @@ static void __devexit virtscsi_remove(struct virtio_device 
*vdev)
 
scsi_remove_host(shost);
 
+   unregister_hotcpu_notifier(vscsi-nb);
+
virtscsi_remove_vqs(vdev);
scsi_host_put(shost);
 }
-- 
1.8.1
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 00/12] Multiqueue virtio-net

2013-01-15 Thread Jason Wang
On 01/15/2013 03:44 AM, Anthony Liguori wrote:
 Jason Wang jasow...@redhat.com writes:

 Hello all:

 This seires is an update of last version of multiqueue virtio-net support.

 Recently, linux tap gets multiqueue support. This series implements basic
 support for multiqueue tap, nic and vhost. Then use it as an infrastructure 
 to
 enable the multiqueue support for virtio-net.

 Both vhost and userspace multiqueue were implemented for virtio-net, but
 userspace could be get much benefits since dataplane like parallized 
 mechanism
 were not implemented.

 User could start a multiqueue virtio-net card through adding a queues
 parameter to tap.

 ./qemu -netdev tap,id=hn0,queues=2,vhost=on -device virtio-net-pci,netdev=hn0

 Management tools such as libvirt can pass multiple pre-created fds through

 ./qemu -netdev tap,id=hn0,queues=2,fd=X,fd=Y -device
 virtio-net-pci,netdev=hn0
 I'm confused/frightened that this syntax works.  You shouldn't be
 allowed to have two values for the same property.  Better to have a
 syntax like fd[0]=X,fd[1]=Y or something along those lines.

Yes, but this what current a StringList type works for command line.
Some other parameters such as dnssearch, hostfwd and guestfwd have
already worked in this way. Looks like your suggestions need some
extension on QemuOps visitor, maybe we can do this on top.

Thanks

 Regards,

 Anthony Liguori

 You can fetch and try the code from:
 git://github.com/jasowang/qemu.git

 Patch 1 adds a generic method of creating multiqueue taps and implement the
 linux part.
 Patch 2 - 4 introduce some helpers which could be used to refactor the nic
 emulation codes to support multiqueue.
 Patch 5 introduces multiqueue support for qemu networking code: each peers of
 NetClientState were abstracted as a queue. Though this, most of the codes 
 could
 be reusued without change.
 Patch 6 adds basic multiqueue support for vhost which could let vhost just
 handle a subset of all virtqueues.
 Patch 7-8 introduce new helpers of virtio which is needed by multiqueue
 virtio-net.
 Patch 9-12 implement the multiqueue support of virtio-net

 Changes from RFC v2:
 - rebase the codes to latest qemu
 - align the multiqueue virtio-net implementation to virtio spec
 - split the patches into more smaller patches
 - set_link and hotplug support

 Changes from RFC V1:
 - rebase to the latest
 - fix memory leak in parse_netdev
 - fix guest notifiers assignment/de-assignment
 - changes the command lines to:
qemu -netdev tap,queues=2 -device virtio-net-pci,queues=2

 Reference:
 v2: http://lists.gnu.org/archive/html/qemu-devel/2012-06/msg04108.html
 v1: http://comments.gmane.org/gmane.comp.emulators.qemu/100481

 Perf Numbers:

 Two Intel Xeon 5620 with direct connected intel 82599EB
 Host/Guest kernel: David net tree
 vhost enabled

 - lots of improvents of both latency and cpu utilization in request-reponse 
 test
 - get regression of guest sending small packets which because TCP tends to 
 batch
   less when the latency were improved

 1q/2q/4q
 TCP_RR
  size #sessions trans.rate  norm trans.rate  norm trans.rate  norm
 1 1 9393.26   595.64  9408.18   597.34  9375.19   584.12
 1 2072162.1   2214.24 129880.22 2456.13 196949.81 2298.13
 1 50107513.38 2653.99 139721.93 2490.58 259713.82 2873.57
 1 100   126734.63 2676.54 145553.5  2406.63 265252.68 2943
 64 19453.42   632.33  9371.37   616.13  9338.19   615.97
 64 20   70620.03  2093.68 125155.75 2409.15 191239.91 2253.32
 64 50   1069662448.29 146518.67 2514.47 242134.07 2720.91
 64 100  117046.35 2394.56 190153.09 2696.82 238881.29 2704.41
 256 1   8733.29   736.36  8701.07   680.83  8608.92   530.1
 256 20  69279.89  2274.45 115103.07 2299.76 144555.16 1963.53
 256 50  97676.02  2296.09 150719.57 2522.92 254510.5  3028.44
 256 100 150221.55 2949.56 197569.3  2790.92 300695.78 3494.83
 TCP_CRR
  size #sessions trans.rate  norm trans.rate  norm trans.rate  norm
 1 1 2848.37  163.41 2230.39  130.89 2013.09  120.47
 1 2023434.5  562.11 31057.43 531.07 49488.28 564.41
 1 5028514.88 582.17 40494.23 605.92 60113.35 654.97
 1 100   28827.22 584.73 48813.25 661.6  61783.62 676.56
 64 12780.08  159.4  2201.07  127.96 2006.8   117.63
 64 20   23318.51 564.47 30982.44 530.24 49734.95 566.13
 64 50   28585.72 582.54 40576.7  610.08 60167.89 656.56
 64 100  28747.37 584.17 49081.87 667.87 60612.94 662
 256 1   2772.08  160.51 2231.84  131.05 2003.62  113.45
 256 20  23086.35 559.8  30929.09 528.16 48454.9  555.22
 256 50  28354.7  579.85 40578.31 60760261.71 657.87
 256 100 28844.55 585.67 48541.86 659.08 61941.07 676.72
 TCP_STREAM guest receiving
  size #sessions throughput  norm throughput  norm throughput  norm
 1 1 16.27   1.33   16.11.12   16.13   0.99
 1 2 33.04   2.08   32.96   2.19   32.75   1.98
 1 4 66.62   6.83   68.35.56   66.14   2.65
 64 1896.55  56.67  914.02  58.14  898.9   61.56
 64 21830.46 91.02  1812.02 64.59  1835.57 66.26
 64 43626.61 

[PATCH] Bugfix for kvm/s390.

2013-01-15 Thread Cornelia Huck
Hi,

here's another s390 bugfix for kvm-next. Please apply.

Cong Ding (1):
  KVM: s390: kvm/sigp.c: fix memory leakage

 arch/s390/kvm/sigp.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

-- 
1.7.12.4

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] KVM: s390: kvm/sigp.c: fix memory leakage

2013-01-15 Thread Cornelia Huck
From: Cong Ding ding...@gmail.com

the variable inti should be freed in the branch CPUSTAT_STOPPED.

Signed-off-by: Cong Ding ding...@gmail.com
Signed-off-by: Cornelia Huck cornelia.h...@de.ibm.com
---
 arch/s390/kvm/sigp.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c
index 461e841..1c48ab2 100644
--- a/arch/s390/kvm/sigp.c
+++ b/arch/s390/kvm/sigp.c
@@ -137,8 +137,10 @@ static int __inject_sigp_stop(struct 
kvm_s390_local_interrupt *li, int action)
inti-type = KVM_S390_SIGP_STOP;
 
spin_lock_bh(li-lock);
-   if ((atomic_read(li-cpuflags)  CPUSTAT_STOPPED))
+   if ((atomic_read(li-cpuflags)  CPUSTAT_STOPPED)) {
+   kfree(inti);
goto out;
+   }
list_add_tail(inti-list, li-list);
atomic_set(li-active, 1);
atomic_set_mask(CPUSTAT_STOP_INT, li-cpuflags);
-- 
1.7.12.4

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v5 04/12] ARM: KVM: Initial VGIC infrastructure code

2013-01-15 Thread Marc Zyngier
On 14/01/13 21:08, Christoffer Dall wrote:
 On Mon, Jan 14, 2013 at 10:31 AM, Will Deacon will.dea...@arm.com wrote:

 As I mentioned previously, I suspect that this doesn't work with big-endian
 systems. Whilst that's reasonable for the moment, a comment would be useful
 for the unlucky soul that decides to do that work in future (or add
 accessors for mmio-data as I suggested before).

 admittedly this really hurts my brain, but I think there's actually no
 problem with endianness: whatever comes in mmio-data will have native
 endianness and the vgic is always little-endian, so a guest would have
 to make sure to do its own endianness conversion before writing data,
 or did I get this backwards? (some nasty feeling about if the OS is
 compiled in another endianness than the hardware everything may
 break).
 
 Anyhow, I think there's another bug in this code though. Please take a
 look and see if you agree:
 
 commit 3cab2b93a6f6acd3c043e584f23b94ab8f1bbd66
 Author: Christoffer Dall c.d...@virtualopensystems.com
 Date:   Mon Jan 14 15:55:18 2013 -0500
 
 KVM: ARM: Limit vgic read/writes to load/store length
 
 The vgic read/write operations did not consider ldrb/strb masks, and
 would therefore unintentionally overwrite parts of a register.
 
 Consider for example a store of a single byte to a word-aligned address
 of one of the priority registers, that would cause the 3 most
 significant bytes to be overwritten with zeros.
 
 Cc: Marc Zyniger marc.zyng...@arm.com
 Cc: Will Deacon will.dea...@arm.com
 Signed-off-by: Christoffer Dall c.d...@virtualopensystems.com

Acked-by: Marc Zyngier marc.zyng...@arm.com

 
 diff --git a/arch/arm/kvm/vgic.c b/arch/arm/kvm/vgic.c
 index 25daa07..5c1bcf5 100644
 --- a/arch/arm/kvm/vgic.c
 +++ b/arch/arm/kvm/vgic.c
 @@ -233,6 +233,16 @@ static void vgic_cpu_irq_clear(struct kvm_vcpu
 *vcpu, int irq)
 vcpu-arch.vgic_cpu.pending_shared);
  }
 
 +static u32 mmio_data_read(struct kvm_exit_mmio *mmio, u32 mask)
 +{
 + return *((u32 *)mmio-data)  mask;
 +}
 +
 +static void mmio_data_write(struct kvm_exit_mmio *mmio, u32 mask, u32 value)
 +{
 + *((u32 *)mmio-data) = value  mask;
 +}
 +
  /**
   * vgic_reg_access - access vgic register
   * @mmio:   pointer to the data describing the mmio access
 @@ -247,8 +257,8 @@ static void vgic_cpu_irq_clear(struct kvm_vcpu
 *vcpu, int irq)
  static void vgic_reg_access(struct kvm_exit_mmio *mmio, u32 *reg,
   phys_addr_t offset, int mode)
  {
 - int shift = (offset  3) * 8;
 - u32 mask;
 + int word_offset = (offset  3) * 8;
 + u32 mask = (1UL  (mmio-len * 8)) - 1;
   u32 regval;
 
   /*
 @@ -256,7 +266,6 @@ static void vgic_reg_access(struct kvm_exit_mmio
 *mmio, u32 *reg,
* directly (ARM ARM B3.12.7 Prioritization of aborts).
*/
 
 - mask = (~0U)  shift;
   if (reg) {
   regval = *reg;
   } else {
 @@ -265,7 +274,7 @@ static void vgic_reg_access(struct kvm_exit_mmio
 *mmio, u32 *reg,
   }
 
   if (mmio-is_write) {
 - u32 data = (*((u32 *)mmio-data)  mask)  shift;
 + u32 data = mmio_data_read(mmio, mask)  word_offset;
   switch (ACCESS_WRITE_MASK(mode)) {
   case ACCESS_WRITE_IGNORED:
   return;
 @@ -279,7 +288,7 @@ static void vgic_reg_access(struct kvm_exit_mmio
 *mmio, u32 *reg,
   break;
 
   case ACCESS_WRITE_VALUE:
 - regval = (regval  ~(mask  shift)) | data;
 + regval = (regval  ~(mask  word_offset)) | data;
   break;
   }
   *reg = regval;
 @@ -290,7 +299,7 @@ static void vgic_reg_access(struct kvm_exit_mmio
 *mmio, u32 *reg,
   /* fall through */
 
   case ACCESS_READ_VALUE:
 - *((u32 *)mmio-data) = (regval  shift)  mask;
 + mmio_data_write(mmio, mask, regval  word_offset);
   }
   }
  }
 @@ -702,6 +711,12 @@ bool vgic_handle_mmio(struct kvm_vcpu *vcpu,
 struct kvm_run *run,
   (mmio-phys_addr + mmio-len)  (base + KVM_VGIC_V2_DIST_SIZE))
   return false;
 
 + /* We don't support ldrd / strd or ldm / stm to the emulated vgic */
 + if (mmio-len  4) {
 + kvm_inject_dabt(vcpu, mmio-phys_addr);
 + return true;
 + }
 +
   range = find_matching_range(vgic_ranges, mmio, base);
   if (unlikely(!range || !range-handle_mmio)) {
   pr_warn(Unhandled access %d %08llx %d\n,
 --
 
 Thanks,
 -Christoffer
 


-- 
Jazz is not dead. It just smells funny...

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] KVM call for 2013-01-15

2013-01-15 Thread Eduardo Habkost
On Mon, Jan 14, 2013 at 02:26:12PM +0100, Juan Quintela wrote:
 Hi
 
 Please send in any agenda topics that you have.

- CPU hotplug plan/design

-- 
Eduardo
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v5 07/12] ARM: KVM: VGIC virtual CPU interface management

2013-01-15 Thread Marc Zyngier
On 14/01/13 22:02, Christoffer Dall wrote:
 On Mon, Jan 14, 2013 at 10:42 AM, Will Deacon will.dea...@arm.com wrote:
 On Tue, Jan 08, 2013 at 06:42:11PM +, Christoffer Dall wrote:
 From: Marc Zyngier marc.zyng...@arm.com

 Add VGIC virtual CPU interface code, picking pending interrupts
 from the distributor and stashing them in the VGIC control interface
 list registers.

 Signed-off-by: Marc Zyngier marc.zyng...@arm.com
 Signed-off-by: Christoffer Dall c.d...@virtualopensystems.com
 ---
  arch/arm/include/asm/kvm_vgic.h |   30 
  arch/arm/kvm/vgic.c |  327 
 +++
  2 files changed, 356 insertions(+), 1 deletion(-)

 diff --git a/arch/arm/include/asm/kvm_vgic.h 
 b/arch/arm/include/asm/kvm_vgic.h
 index 9ff0d9c..b3133c4 100644
 --- a/arch/arm/include/asm/kvm_vgic.h
 +++ b/arch/arm/include/asm/kvm_vgic.h
 @@ -110,8 +110,33 @@ struct vgic_dist {
  };

  struct vgic_cpu {
 +#ifdef CONFIG_KVM_ARM_VGIC
 +   /* per IRQ to LR mapping */
 +   u8  vgic_irq_lr_map[VGIC_NR_IRQS];
 +
 +   /* Pending interrupts on this VCPU */
 +   DECLARE_BITMAP( pending_percpu, VGIC_NR_PRIVATE_IRQS);
 +   DECLARE_BITMAP( pending_shared, VGIC_NR_SHARED_IRQS);
 +
 +   /* Bitmap of used/free list registers */
 +   DECLARE_BITMAP( lr_used, 64);
 +
 +   /* Number of list registers on this CPU */
 +   int nr_lr;
 +
 +   /* CPU vif control registers for world switch */
 +   u32 vgic_hcr;
 +   u32 vgic_vmcr;
 +   u32 vgic_misr;  /* Saved only */
 +   u32 vgic_eisr[2];   /* Saved only */
 +   u32 vgic_elrsr[2];  /* Saved only */
 +   u32 vgic_apr;
 +   u32 vgic_lr[64];/* A15 has only 4... */

 Have a #define for the maximum number of list registers.

 +#endif
  };

 +#define LR_EMPTY   0xff
 +
  struct kvm;
  struct kvm_vcpu;
  struct kvm_run;
 @@ -119,9 +144,14 @@ struct kvm_exit_mmio;

  #ifdef CONFIG_KVM_ARM_VGIC
  int kvm_vgic_set_addr(struct kvm *kvm, unsigned long type, u64 addr);
 +void kvm_vgic_sync_to_cpu(struct kvm_vcpu *vcpu);
 +void kvm_vgic_sync_from_cpu(struct kvm_vcpu *vcpu);

 Same comment as for the arch timer (flush/sync).

 +int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu);
  bool vgic_handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *run,
   struct kvm_exit_mmio *mmio);

 +#define irqchip_in_kernel(k)   (!!((k)-arch.vgic.vctrl_base))
 +
  #else
  static inline int kvm_vgic_hyp_init(void)
  {
 diff --git a/arch/arm/kvm/vgic.c b/arch/arm/kvm/vgic.c
 index bd2bd7f..58237d5 100644
 --- a/arch/arm/kvm/vgic.c
 +++ b/arch/arm/kvm/vgic.c
 @@ -152,6 +152,34 @@ static int vgic_irq_is_enabled(struct kvm_vcpu *vcpu, 
 int irq)
 return vgic_bitmap_get_irq_val(dist-irq_enabled, vcpu-vcpu_id, 
 irq);
  }

 +static int vgic_irq_is_active(struct kvm_vcpu *vcpu, int irq)
 +{
 +   struct vgic_dist *dist = vcpu-kvm-arch.vgic;
 +
 +   return vgic_bitmap_get_irq_val(dist-irq_active, vcpu-vcpu_id, 
 irq);
 +}
 +
 +static void vgic_irq_set_active(struct kvm_vcpu *vcpu, int irq)
 +{
 +   struct vgic_dist *dist = vcpu-kvm-arch.vgic;
 +
 +   vgic_bitmap_set_irq_val(dist-irq_active, vcpu-vcpu_id, irq, 1);
 +}
 +
 +static void vgic_irq_clear_active(struct kvm_vcpu *vcpu, int irq)
 +{
 +   struct vgic_dist *dist = vcpu-kvm-arch.vgic;
 +
 +   vgic_bitmap_set_irq_val(dist-irq_active, vcpu-vcpu_id, irq, 0);
 +}
 +
 +static int vgic_dist_irq_is_pending(struct kvm_vcpu *vcpu, int irq)
 +{
 +   struct vgic_dist *dist = vcpu-kvm-arch.vgic;
 +
 +   return vgic_bitmap_get_irq_val(dist-irq_state, vcpu-vcpu_id, 
 irq);
 +}
 +
  static void vgic_dist_irq_set(struct kvm_vcpu *vcpu, int irq)
  {
 struct vgic_dist *dist = vcpu-kvm-arch.vgic;
 @@ -711,7 +739,30 @@ static void vgic_dispatch_sgi(struct kvm_vcpu *vcpu, 
 u32 reg)

  static int compute_pending_for_cpu(struct kvm_vcpu *vcpu)
  {
 -   return 0;
 +   struct vgic_dist *dist = vcpu-kvm-arch.vgic;
 +   unsigned long *pending, *enabled, *pend_percpu, *pend_shared;
 +   unsigned long pending_private, pending_shared;
 +   int vcpu_id;
 +
 +   vcpu_id = vcpu-vcpu_id;
 +   pend_percpu = vcpu-arch.vgic_cpu.pending_percpu;
 +   pend_shared = vcpu-arch.vgic_cpu.pending_shared;
 +
 +   pending = vgic_bitmap_get_cpu_map(dist-irq_state, vcpu_id);
 +   enabled = vgic_bitmap_get_cpu_map(dist-irq_enabled, vcpu_id);
 +   bitmap_and(pend_percpu, pending, enabled, VGIC_NR_PRIVATE_IRQS);
 +
 +   pending = vgic_bitmap_get_shared_map(dist-irq_state);
 +   enabled = vgic_bitmap_get_shared_map(dist-irq_enabled);
 +   bitmap_and(pend_shared, pending, enabled, VGIC_NR_SHARED_IRQS);
 +   bitmap_and(pend_shared, pend_shared,
 +  
 vgic_bitmap_get_shared_map(dist-irq_spi_target[vcpu_id]),
 +  VGIC_NR_SHARED_IRQS);
 +
 +   

Re: [PATCH v5 2/4] ARM: KVM: arch_timers: Add guest timer core support

2013-01-15 Thread Marc Zyngier
On 14/01/13 19:19, Christoffer Dall wrote:
 On Mon, Jan 14, 2013 at 10:18 AM, Will Deacon will.dea...@arm.com wrote:
 On Tue, Jan 08, 2013 at 06:43:20PM +, Christoffer Dall wrote:
 From: Marc Zyngier marc.zyng...@arm.com

 Add some the architected timer related infrastructure, and support timer
 interrupt injection, which can happen as a resultof three possible
 events:

 - The virtual timer interrupt has fired while we were still
   executing the guest
 - The timer interrupt hasn't fired, but it expired while we
   were doing the world switch
 - A hrtimer we programmed earlier has fired

 [...]

 +void kvm_timer_sync_to_cpu(struct kvm_vcpu *vcpu)
 +{
 +   struct arch_timer_cpu *timer = vcpu-arch.timer_cpu;
 +
 +   /*
 +* We're about to run this vcpu again, so there is no need to
 +* keep the background timer running, as we're about to
 +* populate the CPU timer again.
 +*/
 +   timer_disarm(timer);
 +}
 +
 +void kvm_timer_sync_from_cpu(struct kvm_vcpu *vcpu)
 +{
 +   struct arch_timer_cpu *timer = vcpu-arch.timer_cpu;
 +   cycle_t cval, now;
 +   u64 ns;
 +
 +   /* Check if the timer is enabled and unmasked first */
 +   if ((timer-cntv_ctl  3) != 1)
 +   return;
 +
 +   cval = timer-cntv_cval;
 +   now = kvm_phys_timer_read() - vcpu-kvm-arch.timer.cntvoff;
 +
 +   BUG_ON(timer_is_armed(timer));
 +
 +   if (cval = now) {
 +   /*
 +* Timer has already expired while we were not
 +* looking. Inject the interrupt and carry on.
 +*/
 +   kvm_timer_inject_irq(vcpu);
 +   return;
 +   }
 +
 +   ns = cyclecounter_cyc2ns(timecounter-cc, cval - now);
 +   timer_arm(timer, ns);
 +}

 Please use flush/sync terminology to match the rest of arch/arm/.

 ok, the following fixes this for both timers and the vgic:
 
 commit 1b68f39459dbc797f6766c103edf2c1053984161
 Author: Christoffer Dall c.d...@virtualopensystems.com
 Date:   Mon Jan 14 14:16:31 2013 -0500
 
 KVM: ARM: vgic: use sync/flush terminology
 
 Use sync/flush for saving state to/from CPUs to be consistent with
 other uses in arch/arm.

Sync and flush on their own are pretty inexpressive. Consider changing
it to {flush,sync}_hwstate, so we're consistent with what VFP does.

M.
-- 
Jazz is not dead. It just smells funny...

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v5 3/4] ARM: KVM: arch_timers: Add timer world switch

2013-01-15 Thread Marc Zyngier
On 14/01/13 22:08, Christoffer Dall wrote:
 On Mon, Jan 14, 2013 at 12:51 PM, Marc Zyngier marc.zyng...@arm.com wrote:
 On 14/01/13 15:21, Will Deacon wrote:
 On Tue, Jan 08, 2013 at 06:43:27PM +, Christoffer Dall wrote:
 From: Marc Zyngier marc.zyng...@arm.com

 Do the necessary save/restore dance for the timers in the world
 switch code. In the process, allow the guest to read the physical
 counter, which is useful for its own clock_event_device.

 [...]

 @@ -476,6 +513,7 @@ vcpu .reqr0  @ vcpu pointer always 
 in r0
   * for the host.
   *
   * Assumes vcpu pointer in vcpu reg
 + * Clobbers r2-r4
   */
  .macro restore_timer_state
  @ Disallow physical timer access for the guest
 @@ -484,6 +522,30 @@ vcpu.reqr0  @ vcpu pointer always 
 in r0
  orr r2, r2, #CNTHCTL_PL1PCTEN
  bic r2, r2, #CNTHCTL_PL1PCEN
  mcr p15, 4, r2, c14, c1, 0  @ CNTHCTL
 +
 +#ifdef CONFIG_KVM_ARM_TIMER
 +ldr r4, [vcpu, #VCPU_KVM]
 +ldr r2, [r4, #KVM_TIMER_ENABLED]
 +cmp r2, #0
 +beq 1f
 +
 +ldr r2, [r4, #KVM_TIMER_CNTVOFF]
 +ldr r3, [r4, #(KVM_TIMER_CNTVOFF + 4)]
 +mcrrp15, 4, r2, r3, c14 @ CNTVOFF
 +isb
 +
 +ldr r4, =VCPU_TIMER_CNTV_CVAL
 +add vcpu, vcpu, r4
 +ldrdr2, r3, [vcpu]
 +sub vcpu, vcpu, r4
 +mcrrp15, 3, r2, r3, c14 @ CNTV_CVAL
 +
 +ldr r2, [vcpu, #VCPU_TIMER_CNTV_CTL]
 +and r2, r2, #3
 +mcr p15, 0, r2, c14, c3, 1  @ CNTV_CTL
 +isb

 How many of these isbs are actually needed, given that we're going to make
 an exception return to the guest? The last one certainly looks redundant and
 I can't see the need for ordering CNTVOFF vs CNTV_CVAL. I can see an
 argument to putting one *before* CNTV_CTL, but you don't have one there!

 CNTVOFF directly influences whether or not CNTV_CVAL will trigger or
 not. Maybe I'm just being paranoid and moving the isb after CNTV_CVAL is
 enough.

 The last one is definitively superfluous.

 
 can't we also get rid of the isb on the return path then?
 
 do you agree with this patch:
 
 diff --git a/arch/arm/kvm/interrupts_head.S b/arch/arm/kvm/interrupts_head.S
 index 57cfa84..7e6eedf 100644
 --- a/arch/arm/kvm/interrupts_head.S
 +++ b/arch/arm/kvm/interrupts_head.S
 @@ -492,7 +492,6 @@ vcpu  .reqr0  @ vcpu pointer always 
 in r0
   str r2, [vcpu, #VCPU_TIMER_CNTV_CTL]
   bic r2, #1  @ Clear ENABLE
   mcr p15, 0, r2, c14, c3, 1  @ CNTV_CTL
 - isb

This isb is required. Otherwise you have no guarantee about how recent
the next CNTV_CVAL read is (could be speculated).

   mrrcp15, 3, r2, r3, c14 @ CNTV_CVAL
   ldr r4, =VCPU_TIMER_CNTV_CVAL
 @@ -532,18 +531,17 @@ vcpu.reqr0  @ vcpu pointer always 
 in r0
   ldr r2, [r4, #KVM_TIMER_CNTVOFF]
   ldr r3, [r4, #(KVM_TIMER_CNTVOFF + 4)]
   mcrrp15, 4, r2, r3, c14 @ CNTVOFF
 - isb
 
   ldr r4, =VCPU_TIMER_CNTV_CVAL
   add vcpu, vcpu, r4
   ldrdr2, r3, [vcpu]
   sub vcpu, vcpu, r4
   mcrrp15, 3, r2, r3, c14 @ CNTV_CVAL
 + isb
 
   ldr r2, [vcpu, #VCPU_TIMER_CNTV_CTL]
   and r2, r2, #3
   mcr p15, 0, r2, c14, c3, 1  @ CNTV_CTL
 - isb
  1:
  #endif
  .endm

This part is OK.

M.
-- 
Jazz is not dead. It just smells funny...

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: FreeBSD-amd64 fails to start with SMP on quemu-kvm

2013-01-15 Thread Artur Samborski

W dniu 13.01.2013 10:23, Gleb Natapov pisze:

On Fri, Jan 11, 2013 at 03:57:05PM +0100, Artur Samborski wrote:

W dniu 09.01.2013 11:15, Gleb Natapov pisze:

On Tue, Jan 08, 2013 at 09:45:35PM +0100, Artur Samborski wrote:

W dniu 08.01.2013 00:00, Marcelo Tosatti pisze:

On Mon, Jan 07, 2013 at 06:13:22PM +0100, Artur Samborski wrote:

Hello,

When i try to run FreeBSD-amd64 on more than 1 vcpu in quemu-kvm
(Fedora Core 17) eg. to run FreeBSD-9.0-RELEASE-amd64 with:

qemu-kvm -m 1024m -cpu host -smp 2 -cdrom
/storage/iso/FreeBSD-9.0-RELEASE-amd64-dvd1.iso

it freezes KVM with:

KVM internal error. Suberror: 1
emulation failure
RAX=80b0d4c0 RBX=0009f000 RCX=c080
RDX=
RSI=d238 RDI= RBP=
RSP=
R8 = R9 = R10=
R11=
R12= R13= R14=
R15=
RIP=0009f076 RFL=00010086 [--S--P-] CPL=0 II=0 A20=1 SMM=0 HLT=0
ES =   f300 DPL=3 DS16 [-WA]
CS =0008   00209900 DPL=0 CS64 [--A]
SS =9f00 0009f000  f300 DPL=3 DS16 [-WA]
DS =0018   00c09300 DPL=0 DS   [-WA]
FS =   f300 DPL=3 DS16 [-WA]
GS =   f300 DPL=3 DS16 [-WA]
LDT=   8200 DPL=0 LDT
TR =   8b00 DPL=0 TSS64-busy
GDT= 0009f080 0020
IDT=  
CR0=8011 CR2= CR3=0009c000 CR4=0030
DR0= DR1= DR2=
DR3=
DR6=0ff0 DR7=0400
EFER=0501
Code=00 00 00 80 0f 22 c0 ea 70 f0 09 00 08 00 48 b8 c0 d4 b0 80
ff ff ff ff ff e0 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
00 99 20 00 ff ff 00 00


Artur,

Can you check whether

https://patchwork-mail.kernel.org/patch/1942681/

fixes your problem



Hi, thanks for the reply.

Unfortunately, the patch does not help. Attempt to start FreeBSD
amd64 via quemu-kvm with -smp parameter greater than 1 fails in
exactly the same way as before.

The patch was applied to the kernel from the 3.6.11-1.fc17.src.rpm package.

Do I need some additional patches?



Can you try queue branch from kvm.git?



Thanks for the advice - I used kernel from kvm.git(queue) and was
able to run FreeBSD-amd guest with SMP on my kvm-host server without
previous problems. Unfortunately, after few hours server was hung
up. Case requires further investigations, but generally speaking we
went forward. Unfortunately, experiments on the server are
difficult, because it is used for everyday work.


Thanks for testing. This hang indeed looks like completely different
problem. What workload the VM was running?



First test:

 - kvm.git kernel
 - 2 kvm guest running:
   - Linux (in idle)
   - Freebsd-amd64 (high load, about 7 -- continuous FreeBSD world and 
kernel build)

 - KVM host hangs after about 5 hours
 - nothing special in system logs
 - message caught on one of the active SSH session:

kernel:[24742.127690] BUG: soft lockup - CPU#2 stuck for 22s! 
[vhost-3686:3700]


Second test:

 - kvm.git kernel
 - 1 kvm guest running:
   - Linux (at the time of hang -- in idle)
   - about 10 minutes before KVM host hangs -- load about 6 (kernel build)
 - in system logs:

BUG: soft lockup - CPU#0 stuck for 22s! [vhost-1771:1800]
Modules linked in: binfmt_misc ip6table_filter ip6_tables ebtable_nat 
ebtables lockd sunrpc nf_conntrack_ipv4 nf_defrag_ipv4 xt_state 
nf_conntrack xt_CHECKSUM iptable_mangle bridge stp llc be2iscsi 
iscsi_boot_sysfs bnx2i cnic uio cxgb4i cxgb4 cxgb3i cxgb3 mdio libcxgbi 
ib_iser bnep bluetooth rfkill rdma_cm ib_addr iw_cm ib_cm ib_sa ib_mad 
ib_core iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi ioatdma 
vhost_net iTCO_wdt iTCO_vendor_support ses lpc_ich tun macvtap macvlan 
mfd_core enclosure bnx2 joydev i7core_edac dca edac_core wmi coretemp 
dcdbas kvm_intel pcspkr crc32c_intel kvm serio_raw acpi_power_meter 
microcode uinput ipmi_devintf ipmi_si ipmi_msghandler megaraid_sas

CPU 0
Pid: 1800, comm: vhost-1771 Not tainted 3.7.0+ #2 Dell Inc. PowerEdge 
R610/086HF8
RIP: 0010:[8152876f]  [8152876f] 
skb_flow_dissect+0xbf/0x3e0

RSP: 0018:88042145dbd8  EFLAGS: 0246
RAX:  RBX: 8807fa489c00 RCX: f7ab0c277df5b6fd
RDX: 880820c59800 RSI: 88042145dc58 RDI: 8807fa489c00
RBP: 88042145dc48 R08: 0404 R09: 0412
R10: 8807fa489c00 R11: 0412 R12: 81522a57
R13:  R14: 81521fdc R15: 88042145db78
FS:  () GS:88083fc0() knlGS:
CS:  0010 DS:  ES:  CR0: 8005003b
CR2: 01c0e9f4 CR3: 000827973000 CR4: 27e0
DR0:  DR1: 

Re: [kvmarm] [PATCH v5 06/14] KVM: ARM: Inject IRQs and FIQs from userspace

2013-01-15 Thread Peter Maydell
On 15 January 2013 09:56, Gleb Natapov g...@redhat.com wrote:
 On Tue, Jan 08, 2013 at 01:39:17PM -0500, Christoffer Dall wrote:
 From: Christoffer Dall cd...@cs.columbia.edu

 All interrupt injection is now based on the VM ioctl KVM_IRQ_LINE.  This
 works semantically well for the GIC as we in fact raise/lower a line on
 a machine component (the gic).  The IOCTL uses the follwing struct.

 struct kvm_irq_level {
   union {
   __u32 irq; /* GSI */
   __s32 status;  /* not used for KVM_IRQ_LEVEL */
   };
   __u32 level;   /* 0 or 1 */
 };

 ARM can signal an interrupt either at the CPU level, or at the in-kernel 
 irqchip
 CPU level interrupt should use KVM_INTERRUPT instead.

No, that would be wrong. KVM_INTERRUPT is for interrupts which must be
delivered synchronously to the CPU. KVM_IRQ_LINE is for interrupts which
can be fed to the kernel asynchronously. It happens that on x86 must be
delivered synchronously and not going to in kernel irqchip are the same, but
this isn't true for other archs. For ARM all our interrupts can be fed
to the kernel
asynchronously, and so we use KVM_IRQ_LINE in all cases.

There was a big discussion thread about this on kvm and qemu-devel last
July (and we cleaned up some of the QEMU code to not smoosh together
all these different concepts under do I have an irqchip or not?).

 (GIC), and for in-kernel irqchip can tell the GIC to use PPIs designated for
 specific cpus.  The irq field is interpreted like this:

 Haven't read about GIC yet. Is PPI an interrupt that device can send
 directly to a specific CPU? Can we model that with irq routing like we do
 for MSI?

There is no routing involved -- you are raising a specific signal
line (which happens to result in prodding a particular CPU), that's all.

-- PMM
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [kvmarm] [PATCH v5 06/14] KVM: ARM: Inject IRQs and FIQs from userspace

2013-01-15 Thread Gleb Natapov
On Tue, Jan 15, 2013 at 12:15:01PM +, Peter Maydell wrote:
 On 15 January 2013 09:56, Gleb Natapov g...@redhat.com wrote:
  On Tue, Jan 08, 2013 at 01:39:17PM -0500, Christoffer Dall wrote:
  From: Christoffer Dall cd...@cs.columbia.edu
 
  All interrupt injection is now based on the VM ioctl KVM_IRQ_LINE.  This
  works semantically well for the GIC as we in fact raise/lower a line on
  a machine component (the gic).  The IOCTL uses the follwing struct.
 
  struct kvm_irq_level {
union {
__u32 irq; /* GSI */
__s32 status;  /* not used for KVM_IRQ_LEVEL */
};
__u32 level;   /* 0 or 1 */
  };
 
  ARM can signal an interrupt either at the CPU level, or at the in-kernel 
  irqchip
  CPU level interrupt should use KVM_INTERRUPT instead.
 
 No, that would be wrong. KVM_INTERRUPT is for interrupts which must be
 delivered synchronously to the CPU. KVM_IRQ_LINE is for interrupts which
 can be fed to the kernel asynchronously. It happens that on x86 must be
 delivered synchronously and not going to in kernel irqchip are the same, 
 but
 this isn't true for other archs. For ARM all our interrupts can be fed
 to the kernel
 asynchronously, and so we use KVM_IRQ_LINE in all cases.
 
I do no quite understand what you mean by synchronously and
asynchronously. The difference between KVM_INTERRUPT and KVM_IRQ_LINE line
is that former is used when destination cpu is known to userspace later
is used when kernel code is involved in figuring out the destination. The
injections themselves are currently synchronous for both of them on x86
and ARM. i.e vcpu is kicked out from guest mode when interrupt need to
be injected into a guest and vcpu state is changed to inject interrupt
during next guest entry. In the near feature x86 will be able to inject
interrupt without kicking vcpu out from the guest mode does ARM plan to
do the same? For GIC interrupts or for IRQ/FIQ or for both?

 There was a big discussion thread about this on kvm and qemu-devel last
 July (and we cleaned up some of the QEMU code to not smoosh together
 all these different concepts under do I have an irqchip or not?).
Do you have a pointer?

 
  (GIC), and for in-kernel irqchip can tell the GIC to use PPIs designated 
  for
  specific cpus.  The irq field is interpreted like this:
 
  Haven't read about GIC yet. Is PPI an interrupt that device can send
  directly to a specific CPU? Can we model that with irq routing like we do
  for MSI?
 
 There is no routing involved -- you are raising a specific signal
 line (which happens to result in prodding a particular CPU), that's all.
 
We call it irq routing, but it is not really a router. It just a
configuration to let KVM know how specific lines are wired. We abuse it
for MSI injection. So instead of encoding destination into kvm_irq_level
you configure irq routing entry with this information and get back a
cookie. You provide the cookie in kvm_irq_level-irq to KVM_IRQ_LEVEL
ioctl. This way you are not limited to 8 bit of cpuid for instance. This
is not efficient if irq routing is very dynamic though.
 
--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v5 13/14] KVM: ARM: Handle I/O aborts

2013-01-15 Thread Gleb Natapov
On Tue, Jan 08, 2013 at 01:40:05PM -0500, Christoffer Dall wrote:
 When the guest accesses I/O memory this will create data abort
 exceptions and they are handled by decoding the HSR information
 (physical address, read/write, length, register) and forwarding reads
 and writes to QEMU which performs the device emulation.
 
 Certain classes of load/store operations do not support the syndrome
 information provided in the HSR and we therefore must be able to fetch
 the offending instruction from guest memory and decode it manually.
 
 We only support instruction decoding for valid reasonable MMIO operations
 where trapping them do not provide sufficient information in the HSR (no
 16-bit Thumb instructions provide register writeback that we care about).
 
 The following instruction types are NOT supported for MMIO operations
 despite the HSR not containing decode info:
  - any Load/Store multiple
  - any load/store exclusive
  - any load/store dual
  - anything with the PC as the dest register
 
 This requires changing the general flow somewhat since new calls to run
 the VCPU must check if there's a pending MMIO load and perform the write
 after userspace has made the data available.
 
 Rusty Russell fixed a horrible race pointed out by Ben Herrenschmidt:
 (1) Guest complicated mmio instruction traps.
 (2) The hardware doesn't tell us enough, so we need to read the actual
 instruction which was being exectuted.
 (3) KVM maps the instruction virtual address to a physical address.
 (4) The guest (SMP) swaps out that page, and fills it with something else.
 (5) We read the physical address, but now that's the wrong thing.
How can this happen?! The guest cannot reuse physical page before it
flushes it from all vcpus tlb cache. For that it needs to send
synchronous IPI to all vcpus and IPI will not be processed by a vcpu
while it does emulation.

 
 Reviewed-by: Marcelo Tosatti mtosa...@redhat.com
 Signed-off-by: Rusty Russell rusty.russ...@linaro.org
 Signed-off-by: Marc Zyngier marc.zyng...@arm.com
 Signed-off-by: Christoffer Dall c.d...@virtualopensystems.com
 ---
  arch/arm/include/asm/kvm_arm.h |3 
  arch/arm/include/asm/kvm_asm.h |2 
  arch/arm/include/asm/kvm_decode.h  |   47 
  arch/arm/include/asm/kvm_emulate.h |8 +
  arch/arm/include/asm/kvm_host.h|7 +
  arch/arm/include/asm/kvm_mmio.h|   51 
  arch/arm/kvm/Makefile  |2 
  arch/arm/kvm/arm.c |   14 +
  arch/arm/kvm/decode.c  |  462 
 
  arch/arm/kvm/emulate.c |  169 +
  arch/arm/kvm/interrupts.S  |   38 +++
  arch/arm/kvm/mmio.c|  154 
  arch/arm/kvm/mmu.c |7 -
  arch/arm/kvm/trace.h   |   21 ++
  14 files changed, 981 insertions(+), 4 deletions(-)
  create mode 100644 arch/arm/include/asm/kvm_decode.h
  create mode 100644 arch/arm/include/asm/kvm_mmio.h
  create mode 100644 arch/arm/kvm/decode.c
  create mode 100644 arch/arm/kvm/mmio.c
 
 diff --git a/arch/arm/include/asm/kvm_arm.h b/arch/arm/include/asm/kvm_arm.h
 index 3ff6f22..151c4ce 100644
 --- a/arch/arm/include/asm/kvm_arm.h
 +++ b/arch/arm/include/asm/kvm_arm.h
 @@ -173,8 +173,11 @@
  #define HSR_ISS  (HSR_IL - 1)
  #define HSR_ISV_SHIFT(24)
  #define HSR_ISV  (1U  HSR_ISV_SHIFT)
 +#define HSR_SRT_SHIFT(16)
 +#define HSR_SRT_MASK (0xf  HSR_SRT_SHIFT)
  #define HSR_FSC  (0x3f)
  #define HSR_FSC_TYPE (0x3c)
 +#define HSR_SSE  (1  21)
  #define HSR_WNR  (1  6)
  #define HSR_CV_SHIFT (24)
  #define HSR_CV   (1U  HSR_CV_SHIFT)
 diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h
 index 5e06e81..58d787b 100644
 --- a/arch/arm/include/asm/kvm_asm.h
 +++ b/arch/arm/include/asm/kvm_asm.h
 @@ -77,6 +77,8 @@ extern void __kvm_flush_vm_context(void);
  extern void __kvm_tlb_flush_vmid(struct kvm *kvm);
  
  extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu);
 +
 +extern u64 __kvm_va_to_pa(struct kvm_vcpu *vcpu, u32 va, bool priv);
  #endif
  
  #endif /* __ARM_KVM_ASM_H__ */
 diff --git a/arch/arm/include/asm/kvm_decode.h 
 b/arch/arm/include/asm/kvm_decode.h
 new file mode 100644
 index 000..3c37cb9
 --- /dev/null
 +++ b/arch/arm/include/asm/kvm_decode.h
 @@ -0,0 +1,47 @@
 +/*
 + * Copyright (C) 2012 - Virtual Open Systems and Columbia University
 + * Author: Christoffer Dall c.d...@virtualopensystems.com
 + *
 + * This program is free software; you can redistribute it and/or modify
 + * it under the terms of the GNU General Public License, version 2, as
 + * published by the Free Software Foundation.
 + *
 + * This program is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 + * GNU General Public License for more details.
 + *
 + * You should 

Re: [PATCH v5 13/14] KVM: ARM: Handle I/O aborts

2013-01-15 Thread Marc Zyngier
On 15/01/13 13:18, Gleb Natapov wrote:
 On Tue, Jan 08, 2013 at 01:40:05PM -0500, Christoffer Dall wrote:
 When the guest accesses I/O memory this will create data abort
 exceptions and they are handled by decoding the HSR information
 (physical address, read/write, length, register) and forwarding reads
 and writes to QEMU which performs the device emulation.

 Certain classes of load/store operations do not support the syndrome
 information provided in the HSR and we therefore must be able to fetch
 the offending instruction from guest memory and decode it manually.

 We only support instruction decoding for valid reasonable MMIO operations
 where trapping them do not provide sufficient information in the HSR (no
 16-bit Thumb instructions provide register writeback that we care about).

 The following instruction types are NOT supported for MMIO operations
 despite the HSR not containing decode info:
  - any Load/Store multiple
  - any load/store exclusive
  - any load/store dual
  - anything with the PC as the dest register

 This requires changing the general flow somewhat since new calls to run
 the VCPU must check if there's a pending MMIO load and perform the write
 after userspace has made the data available.

 Rusty Russell fixed a horrible race pointed out by Ben Herrenschmidt:
 (1) Guest complicated mmio instruction traps.
 (2) The hardware doesn't tell us enough, so we need to read the actual
 instruction which was being exectuted.
 (3) KVM maps the instruction virtual address to a physical address.
 (4) The guest (SMP) swaps out that page, and fills it with something else.
 (5) We read the physical address, but now that's the wrong thing.
 How can this happen?! The guest cannot reuse physical page before it
 flushes it from all vcpus tlb cache. For that it needs to send
 synchronous IPI to all vcpus and IPI will not be processed by a vcpu
 while it does emulation.

I don't know how this works on x86, but a KVM/ARM guest can definitely
handle an IPI.

Furthermore, TLB invalidation doesn't require an IPI on ARMv7 (unless
we're doing some set/way operation which is handled separately).

M.
-- 
Jazz is not dead. It just smells funny...

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v5 03/14] KVM: ARM: Initial skeleton to compile KVM support

2013-01-15 Thread Gleb Natapov
On Mon, Jan 14, 2013 at 05:17:31PM -0500, Christoffer Dall wrote:
 On Mon, Jan 14, 2013 at 1:49 PM, Gleb Natapov g...@redhat.com wrote:
  A couple of general question about ABI. If they were already answered
  just refer me to the previous discussion.
 
  On Tue, Jan 08, 2013 at 01:38:55PM -0500, Christoffer Dall wrote:
  diff --git a/Documentation/virtual/kvm/api.txt 
  b/Documentation/virtual/kvm/api.txt
  index a4df553..4237c27 100644
  --- a/Documentation/virtual/kvm/api.txt
  +++ b/Documentation/virtual/kvm/api.txt
  @@ -293,7 +293,7 @@ kvm_run' (see below).
   4.11 KVM_GET_REGS
 
   Capability: basic
  -Architectures: all
  +Architectures: all except ARM
   Type: vcpu ioctl
   Parameters: struct kvm_regs (out)
   Returns: 0 on success, -1 on error
  @@ -314,7 +314,7 @@ struct kvm_regs {
   4.12 KVM_SET_REGS
 
   Capability: basic
  -Architectures: all
  +Architectures: all except ARM
   Type: vcpu ioctl
   Parameters: struct kvm_regs (in)
   Returns: 0 on success, -1 on error
  @@ -600,7 +600,7 @@ struct kvm_fpu {
   4.24 KVM_CREATE_IRQCHIP
  Why KVM_GET_REGS/KVM_SET_REGS are not usable for arm?
 
 
 We use the ONE_REG API instead and we don't want to support two
 separate APIs to user space.
 
I suppose fetching all registers is not anywhere on a fast path in
userspace :)

 
   Capability: KVM_CAP_IRQCHIP
  -Architectures: x86, ia64
  +Architectures: x86, ia64, ARM
   Type: vm ioctl
   Parameters: none
   Returns: 0 on success, -1 on error
  @@ -608,7 +608,8 @@ Returns: 0 on success, -1 on error
   Creates an interrupt controller model in the kernel.  On x86, creates a 
  virtual
   ioapic, a virtual PIC (two PICs, nested), and sets up future vcpus to 
  have a
   local APIC.  IRQ routing for GSIs 0-15 is set to both PIC and IOAPIC; GSI 
  16-23
  -only go to the IOAPIC.  On ia64, a IOSAPIC is created.
  +only go to the IOAPIC.  On ia64, a IOSAPIC is created. On ARM, a GIC is
  +created.
 
 
   4.25 KVM_IRQ_LINE
  @@ -1775,6 +1776,14 @@ registers, find a list below:
 PPC   | KVM_REG_PPC_VPA_DTL   | 128
 PPC   | KVM_REG_PPC_EPCR   | 32
 
  +ARM registers are mapped using the lower 32 bits.  The upper 16 of that
  +is the register group type, or coprocessor number:
  +
  +ARM core registers have the following id bit patterns:
  +  0x4002  0010 index into the kvm_regs struct:16
  +
  +
  +
   4.69 KVM_GET_ONE_REG
 
   Capability: KVM_CAP_ONE_REG
  @@ -2127,6 +2136,46 @@ written, then `n_invalid' invalid entries, 
  invalidating any previously
   valid entries found.
 
 
  +4.77 KVM_ARM_VCPU_INIT
  +
  +Capability: basic
  +Architectures: arm
  +Type: vcpu ioctl
  +Parameters: struct struct kvm_vcpu_init (in)
  +Returns: 0 on success; -1 on error
  +Errors:
  +  EINVAL:the target is unknown, or the combination of features is 
  invalid.
  +  ENOENT:a features bit specified is unknown.
  +
  +This tells KVM what type of CPU to present to the guest, and what
  +optional features it should have.  This will cause a reset of the cpu
  +registers to their initial values.  If this is not called, KVM_RUN will
  +return ENOEXEC for that vcpu.
  +
  Can different vcpus of the same VM be of different type?
 
 
 In the future yes. For example, if we ever want to virtualize a
 Big.Little system.
 
  +Note that because some registers reflect machine topology, all vcpus
  +should be created before this ioctl is invoked.
  How cpu hot plug suppose to work?
 
 
 Those CPUs would be added from the beginning, but not powered on, and
 would be powered on later on, I suppose.  See
 https://lists.cs.columbia.edu/pipermail/kvmarm/2013-January/004617.html.
 
When we suggested similar hot plug for x86, people started screaming
how they suppose to know when they create a VM how much vcpus they will
need in the future. In short people who are asking for hot plug (on x86
at least) do not like such solution.

 
  +
  +
  +4.78 KVM_GET_REG_LIST
  +
  +Capability: basic
  +Architectures: arm
  +Type: vcpu ioctl
  +Parameters: struct kvm_reg_list (in/out)
  +Returns: 0 on success; -1 on error
  +Errors:
  +  E2BIG: the reg index list is too big to fit in the array specified 
  by
  + the user (the number required will be written into n).
  +
  +struct kvm_reg_list {
  + __u64 n; /* number of registers in reg[] */
  + __u64 reg[0];
  +};
  +
  +This ioctl returns the guest registers that are supported for the
  +KVM_GET_ONE_REG/KVM_SET_ONE_REG calls.
  +
  +
  Doesn't userspace know what registers are supported by each CPU type?
 
 It would know about core registers, but there is a huge space of
 co-processors, and we don't emulate all of them or support
 getting/setting all of them yet. Surely this is something that will
 change over time and we want user space to be able to discover the
 available registers for backwards compatibility, migration, etc.
 
 -Christoffer

--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message 

Re: [PATCH v5 13/14] KVM: ARM: Handle I/O aborts

2013-01-15 Thread Gleb Natapov
On Tue, Jan 15, 2013 at 01:29:40PM +, Marc Zyngier wrote:
 On 15/01/13 13:18, Gleb Natapov wrote:
  On Tue, Jan 08, 2013 at 01:40:05PM -0500, Christoffer Dall wrote:
  When the guest accesses I/O memory this will create data abort
  exceptions and they are handled by decoding the HSR information
  (physical address, read/write, length, register) and forwarding reads
  and writes to QEMU which performs the device emulation.
 
  Certain classes of load/store operations do not support the syndrome
  information provided in the HSR and we therefore must be able to fetch
  the offending instruction from guest memory and decode it manually.
 
  We only support instruction decoding for valid reasonable MMIO operations
  where trapping them do not provide sufficient information in the HSR (no
  16-bit Thumb instructions provide register writeback that we care about).
 
  The following instruction types are NOT supported for MMIO operations
  despite the HSR not containing decode info:
   - any Load/Store multiple
   - any load/store exclusive
   - any load/store dual
   - anything with the PC as the dest register
 
  This requires changing the general flow somewhat since new calls to run
  the VCPU must check if there's a pending MMIO load and perform the write
  after userspace has made the data available.
 
  Rusty Russell fixed a horrible race pointed out by Ben Herrenschmidt:
  (1) Guest complicated mmio instruction traps.
  (2) The hardware doesn't tell us enough, so we need to read the actual
  instruction which was being exectuted.
  (3) KVM maps the instruction virtual address to a physical address.
  (4) The guest (SMP) swaps out that page, and fills it with something else.
  (5) We read the physical address, but now that's the wrong thing.
  How can this happen?! The guest cannot reuse physical page before it
  flushes it from all vcpus tlb cache. For that it needs to send
  synchronous IPI to all vcpus and IPI will not be processed by a vcpu
  while it does emulation.
 
 I don't know how this works on x86, but a KVM/ARM guest can definitely
 handle an IPI.
 
How can a vcpu handle an IPI while it is not in a guest mode?

 Furthermore, TLB invalidation doesn't require an IPI on ARMv7 (unless
 we're doing some set/way operation which is handled separately).
 
What prevents a page to be swapped out while code is fetched from it?
 
--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [kvmarm] [PATCH v5 03/14] KVM: ARM: Initial skeleton to compile KVM support

2013-01-15 Thread Alexander Graf

On 15.01.2013, at 14:32, Gleb Natapov wrote:

 On Mon, Jan 14, 2013 at 05:17:31PM -0500, Christoffer Dall wrote:
 On Mon, Jan 14, 2013 at 1:49 PM, Gleb Natapov g...@redhat.com wrote:
 A couple of general question about ABI. If they were already answered
 just refer me to the previous discussion.
 
 On Tue, Jan 08, 2013 at 01:38:55PM -0500, Christoffer Dall wrote:
 diff --git a/Documentation/virtual/kvm/api.txt 
 b/Documentation/virtual/kvm/api.txt
 index a4df553..4237c27 100644
 --- a/Documentation/virtual/kvm/api.txt
 +++ b/Documentation/virtual/kvm/api.txt
 @@ -293,7 +293,7 @@ kvm_run' (see below).
 4.11 KVM_GET_REGS
 
 Capability: basic
 -Architectures: all
 +Architectures: all except ARM
 Type: vcpu ioctl
 Parameters: struct kvm_regs (out)
 Returns: 0 on success, -1 on error
 @@ -314,7 +314,7 @@ struct kvm_regs {
 4.12 KVM_SET_REGS
 
 Capability: basic
 -Architectures: all
 +Architectures: all except ARM
 Type: vcpu ioctl
 Parameters: struct kvm_regs (in)
 Returns: 0 on success, -1 on error
 @@ -600,7 +600,7 @@ struct kvm_fpu {
 4.24 KVM_CREATE_IRQCHIP
 Why KVM_GET_REGS/KVM_SET_REGS are not usable for arm?
 
 
 We use the ONE_REG API instead and we don't want to support two
 separate APIs to user space.
 
 I suppose fetching all registers is not anywhere on a fast path in
 userspace :)

If it's ever going to be in a fast path, we will add MULTI_REG which will 
feature an array of ONE_REG structs to fetch multiple registers at once.


Alex

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v5 13/14] KVM: ARM: Handle I/O aborts

2013-01-15 Thread Marc Zyngier
On 15/01/13 13:34, Gleb Natapov wrote:
 On Tue, Jan 15, 2013 at 01:29:40PM +, Marc Zyngier wrote:
 On 15/01/13 13:18, Gleb Natapov wrote:
 On Tue, Jan 08, 2013 at 01:40:05PM -0500, Christoffer Dall wrote:
 When the guest accesses I/O memory this will create data abort
 exceptions and they are handled by decoding the HSR information
 (physical address, read/write, length, register) and forwarding reads
 and writes to QEMU which performs the device emulation.

 Certain classes of load/store operations do not support the syndrome
 information provided in the HSR and we therefore must be able to fetch
 the offending instruction from guest memory and decode it manually.

 We only support instruction decoding for valid reasonable MMIO operations
 where trapping them do not provide sufficient information in the HSR (no
 16-bit Thumb instructions provide register writeback that we care about).

 The following instruction types are NOT supported for MMIO operations
 despite the HSR not containing decode info:
  - any Load/Store multiple
  - any load/store exclusive
  - any load/store dual
  - anything with the PC as the dest register

 This requires changing the general flow somewhat since new calls to run
 the VCPU must check if there's a pending MMIO load and perform the write
 after userspace has made the data available.

 Rusty Russell fixed a horrible race pointed out by Ben Herrenschmidt:
 (1) Guest complicated mmio instruction traps.
 (2) The hardware doesn't tell us enough, so we need to read the actual
 instruction which was being exectuted.
 (3) KVM maps the instruction virtual address to a physical address.
 (4) The guest (SMP) swaps out that page, and fills it with something else.
 (5) We read the physical address, but now that's the wrong thing.
 How can this happen?! The guest cannot reuse physical page before it
 flushes it from all vcpus tlb cache. For that it needs to send
 synchronous IPI to all vcpus and IPI will not be processed by a vcpu
 while it does emulation.

 I don't know how this works on x86, but a KVM/ARM guest can definitely
 handle an IPI.

 How can a vcpu handle an IPI while it is not in a guest mode?

I think there is some misunderstanding. A guest IPI is of course handled
while running the guest. You completely lost me here.

 Furthermore, TLB invalidation doesn't require an IPI on ARMv7 (unless
 we're doing some set/way operation which is handled separately).

 What prevents a page to be swapped out while code is fetched from it?

Why would you prevent it? TLB invalidation is broadcast by the HW. If
you swap a page out, you flag the entry as invalid and invalidate the
corresponding TLB. If you hit it, you swap the page back in.

M.
-- 
Jazz is not dead. It just smells funny...

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [kvmarm] [PATCH v5 06/14] KVM: ARM: Inject IRQs and FIQs from userspace

2013-01-15 Thread Peter Maydell
On 15 January 2013 12:52, Gleb Natapov g...@redhat.com wrote:
 On Tue, Jan 15, 2013 at 12:15:01PM +, Peter Maydell wrote:
 On 15 January 2013 09:56, Gleb Natapov g...@redhat.com wrote:
  ARM can signal an interrupt either at the CPU level, or at the in-kernel 
  irqchip
  CPU level interrupt should use KVM_INTERRUPT instead.

 No, that would be wrong. KVM_INTERRUPT is for interrupts which must be
 delivered synchronously to the CPU. KVM_IRQ_LINE is for interrupts which
 can be fed to the kernel asynchronously. It happens that on x86 must be
 delivered synchronously and not going to in kernel irqchip are the same, 
 but
 this isn't true for other archs. For ARM all our interrupts can be fed
 to the kernel asynchronously, and so we use KVM_IRQ_LINE in all
 cases.

 I do no quite understand what you mean by synchronously and
 asynchronously.

Synchronously: the vcpu has to be stopped and userspace then
feeds in the interrupt to be taken when the guest is resumed.
Asynchronously: any old thread can tell the kernel there's an
interrupt, and the guest vcpu then deals with it when needed
(the vcpu thread may leave the guest but doesn't come out of
the host kernel to qemu).

 The difference between KVM_INTERRUPT and KVM_IRQ_LINE line
 is that former is used when destination cpu is known to userspace later
 is used when kernel code is involved in figuring out the destination.

This doesn't match up with Avi's explanation at all.

 The
 injections themselves are currently synchronous for both of them on x86
 and ARM. i.e vcpu is kicked out from guest mode when interrupt need to
 be injected into a guest and vcpu state is changed to inject interrupt
 during next guest entry. In the near feature x86 will be able to inject
 interrupt without kicking vcpu out from the guest mode does ARM plan to
 do the same? For GIC interrupts or for IRQ/FIQ or for both?

 There was a big discussion thread about this on kvm and qemu-devel last
 July (and we cleaned up some of the QEMU code to not smoosh together
 all these different concepts under do I have an irqchip or not?).
 Do you have a pointer?

  http://lists.gnu.org/archive/html/qemu-devel/2012-07/msg02460.html
and there was a later longer (but less clear) thread which included
this mail from Avi:
  http://lists.gnu.org/archive/html/qemu-devel/2012-07/msg02872.html
basically explaining that the reason for the weird synchronous
KVM_INTERRUPT API is that it's emulating a weird synchronous
hardware interface which is specific to x86. ARM doesn't have
a synchronous interface in the same way, so it's much more
straightforward to use KVM_IRQ_LINE.

-- PMM
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v5 13/14] KVM: ARM: Handle I/O aborts

2013-01-15 Thread Gleb Natapov
On Tue, Jan 15, 2013 at 01:46:04PM +, Marc Zyngier wrote:
 On 15/01/13 13:34, Gleb Natapov wrote:
  On Tue, Jan 15, 2013 at 01:29:40PM +, Marc Zyngier wrote:
  On 15/01/13 13:18, Gleb Natapov wrote:
  On Tue, Jan 08, 2013 at 01:40:05PM -0500, Christoffer Dall wrote:
  When the guest accesses I/O memory this will create data abort
  exceptions and they are handled by decoding the HSR information
  (physical address, read/write, length, register) and forwarding reads
  and writes to QEMU which performs the device emulation.
 
  Certain classes of load/store operations do not support the syndrome
  information provided in the HSR and we therefore must be able to fetch
  the offending instruction from guest memory and decode it manually.
 
  We only support instruction decoding for valid reasonable MMIO operations
  where trapping them do not provide sufficient information in the HSR (no
  16-bit Thumb instructions provide register writeback that we care about).
 
  The following instruction types are NOT supported for MMIO operations
  despite the HSR not containing decode info:
   - any Load/Store multiple
   - any load/store exclusive
   - any load/store dual
   - anything with the PC as the dest register
 
  This requires changing the general flow somewhat since new calls to run
  the VCPU must check if there's a pending MMIO load and perform the write
  after userspace has made the data available.
 
  Rusty Russell fixed a horrible race pointed out by Ben Herrenschmidt:
  (1) Guest complicated mmio instruction traps.
  (2) The hardware doesn't tell us enough, so we need to read the actual
  instruction which was being exectuted.
  (3) KVM maps the instruction virtual address to a physical address.
  (4) The guest (SMP) swaps out that page, and fills it with something 
  else.
  (5) We read the physical address, but now that's the wrong thing.
  How can this happen?! The guest cannot reuse physical page before it
  flushes it from all vcpus tlb cache. For that it needs to send
  synchronous IPI to all vcpus and IPI will not be processed by a vcpu
  while it does emulation.
 
  I don't know how this works on x86, but a KVM/ARM guest can definitely
  handle an IPI.
 
  How can a vcpu handle an IPI while it is not in a guest mode?
 
 I think there is some misunderstanding. A guest IPI is of course handled
 while running the guest. You completely lost me here.
You need IPI from one guest vcpu to another to invalidate its TLB on
x86. That prevents the race from happening there.

 
  Furthermore, TLB invalidation doesn't require an IPI on ARMv7 (unless
  we're doing some set/way operation which is handled separately).
 
  What prevents a page to be swapped out while code is fetched from it?
 
 Why would you prevent it? TLB invalidation is broadcast by the HW. If
 you swap a page out, you flag the entry as invalid and invalidate the
 corresponding TLB. If you hit it, you swap the page back in.
 
There is no IPI (or anything that requires response from cpu whose TLB
is invalidated) involved in invalidating remote TLB?


--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v5 07/12] ARM: KVM: VGIC virtual CPU interface management

2013-01-15 Thread Christoffer Dall
On Tue, Jan 15, 2013 at 6:00 AM, Marc Zyngier marc.zyng...@arm.com wrote:
 On 14/01/13 22:02, Christoffer Dall wrote:
 On Mon, Jan 14, 2013 at 10:42 AM, Will Deacon will.dea...@arm.com wrote:
 On Tue, Jan 08, 2013 at 06:42:11PM +, Christoffer Dall wrote:
 From: Marc Zyngier marc.zyng...@arm.com

 Add VGIC virtual CPU interface code, picking pending interrupts
 from the distributor and stashing them in the VGIC control interface
 list registers.

 Signed-off-by: Marc Zyngier marc.zyng...@arm.com
 Signed-off-by: Christoffer Dall c.d...@virtualopensystems.com
 ---
  arch/arm/include/asm/kvm_vgic.h |   30 
  arch/arm/kvm/vgic.c |  327 
 +++
  2 files changed, 356 insertions(+), 1 deletion(-)

 diff --git a/arch/arm/include/asm/kvm_vgic.h 
 b/arch/arm/include/asm/kvm_vgic.h
 index 9ff0d9c..b3133c4 100644
 --- a/arch/arm/include/asm/kvm_vgic.h
 +++ b/arch/arm/include/asm/kvm_vgic.h
 @@ -110,8 +110,33 @@ struct vgic_dist {
  };

  struct vgic_cpu {
 +#ifdef CONFIG_KVM_ARM_VGIC
 +   /* per IRQ to LR mapping */
 +   u8  vgic_irq_lr_map[VGIC_NR_IRQS];
 +
 +   /* Pending interrupts on this VCPU */
 +   DECLARE_BITMAP( pending_percpu, VGIC_NR_PRIVATE_IRQS);
 +   DECLARE_BITMAP( pending_shared, VGIC_NR_SHARED_IRQS);
 +
 +   /* Bitmap of used/free list registers */
 +   DECLARE_BITMAP( lr_used, 64);
 +
 +   /* Number of list registers on this CPU */
 +   int nr_lr;
 +
 +   /* CPU vif control registers for world switch */
 +   u32 vgic_hcr;
 +   u32 vgic_vmcr;
 +   u32 vgic_misr;  /* Saved only */
 +   u32 vgic_eisr[2];   /* Saved only */
 +   u32 vgic_elrsr[2];  /* Saved only */
 +   u32 vgic_apr;
 +   u32 vgic_lr[64];/* A15 has only 4... */

 Have a #define for the maximum number of list registers.

 +#endif
  };

 +#define LR_EMPTY   0xff
 +
  struct kvm;
  struct kvm_vcpu;
  struct kvm_run;
 @@ -119,9 +144,14 @@ struct kvm_exit_mmio;

  #ifdef CONFIG_KVM_ARM_VGIC
  int kvm_vgic_set_addr(struct kvm *kvm, unsigned long type, u64 addr);
 +void kvm_vgic_sync_to_cpu(struct kvm_vcpu *vcpu);
 +void kvm_vgic_sync_from_cpu(struct kvm_vcpu *vcpu);

 Same comment as for the arch timer (flush/sync).

 +int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu);
  bool vgic_handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *run,
   struct kvm_exit_mmio *mmio);

 +#define irqchip_in_kernel(k)   (!!((k)-arch.vgic.vctrl_base))
 +
  #else
  static inline int kvm_vgic_hyp_init(void)
  {
 diff --git a/arch/arm/kvm/vgic.c b/arch/arm/kvm/vgic.c
 index bd2bd7f..58237d5 100644
 --- a/arch/arm/kvm/vgic.c
 +++ b/arch/arm/kvm/vgic.c
 @@ -152,6 +152,34 @@ static int vgic_irq_is_enabled(struct kvm_vcpu *vcpu, 
 int irq)
 return vgic_bitmap_get_irq_val(dist-irq_enabled, vcpu-vcpu_id, 
 irq);
  }

 +static int vgic_irq_is_active(struct kvm_vcpu *vcpu, int irq)
 +{
 +   struct vgic_dist *dist = vcpu-kvm-arch.vgic;
 +
 +   return vgic_bitmap_get_irq_val(dist-irq_active, vcpu-vcpu_id, 
 irq);
 +}
 +
 +static void vgic_irq_set_active(struct kvm_vcpu *vcpu, int irq)
 +{
 +   struct vgic_dist *dist = vcpu-kvm-arch.vgic;
 +
 +   vgic_bitmap_set_irq_val(dist-irq_active, vcpu-vcpu_id, irq, 1);
 +}
 +
 +static void vgic_irq_clear_active(struct kvm_vcpu *vcpu, int irq)
 +{
 +   struct vgic_dist *dist = vcpu-kvm-arch.vgic;
 +
 +   vgic_bitmap_set_irq_val(dist-irq_active, vcpu-vcpu_id, irq, 0);
 +}
 +
 +static int vgic_dist_irq_is_pending(struct kvm_vcpu *vcpu, int irq)
 +{
 +   struct vgic_dist *dist = vcpu-kvm-arch.vgic;
 +
 +   return vgic_bitmap_get_irq_val(dist-irq_state, vcpu-vcpu_id, 
 irq);
 +}
 +
  static void vgic_dist_irq_set(struct kvm_vcpu *vcpu, int irq)
  {
 struct vgic_dist *dist = vcpu-kvm-arch.vgic;
 @@ -711,7 +739,30 @@ static void vgic_dispatch_sgi(struct kvm_vcpu *vcpu, 
 u32 reg)

  static int compute_pending_for_cpu(struct kvm_vcpu *vcpu)
  {
 -   return 0;
 +   struct vgic_dist *dist = vcpu-kvm-arch.vgic;
 +   unsigned long *pending, *enabled, *pend_percpu, *pend_shared;
 +   unsigned long pending_private, pending_shared;
 +   int vcpu_id;
 +
 +   vcpu_id = vcpu-vcpu_id;
 +   pend_percpu = vcpu-arch.vgic_cpu.pending_percpu;
 +   pend_shared = vcpu-arch.vgic_cpu.pending_shared;
 +
 +   pending = vgic_bitmap_get_cpu_map(dist-irq_state, vcpu_id);
 +   enabled = vgic_bitmap_get_cpu_map(dist-irq_enabled, vcpu_id);
 +   bitmap_and(pend_percpu, pending, enabled, VGIC_NR_PRIVATE_IRQS);
 +
 +   pending = vgic_bitmap_get_shared_map(dist-irq_state);
 +   enabled = vgic_bitmap_get_shared_map(dist-irq_enabled);
 +   bitmap_and(pend_shared, pending, enabled, VGIC_NR_SHARED_IRQS);
 +   bitmap_and(pend_shared, pend_shared,
 +  
 

Re: [PATCH v5 2/4] ARM: KVM: arch_timers: Add guest timer core support

2013-01-15 Thread Christoffer Dall
On Tue, Jan 15, 2013 at 6:07 AM, Marc Zyngier marc.zyng...@arm.com wrote:
 On 14/01/13 19:19, Christoffer Dall wrote:
 On Mon, Jan 14, 2013 at 10:18 AM, Will Deacon will.dea...@arm.com wrote:
 On Tue, Jan 08, 2013 at 06:43:20PM +, Christoffer Dall wrote:
 From: Marc Zyngier marc.zyng...@arm.com

 Add some the architected timer related infrastructure, and support timer
 interrupt injection, which can happen as a resultof three possible
 events:

 - The virtual timer interrupt has fired while we were still
   executing the guest
 - The timer interrupt hasn't fired, but it expired while we
   were doing the world switch
 - A hrtimer we programmed earlier has fired

 [...]

 +void kvm_timer_sync_to_cpu(struct kvm_vcpu *vcpu)
 +{
 +   struct arch_timer_cpu *timer = vcpu-arch.timer_cpu;
 +
 +   /*
 +* We're about to run this vcpu again, so there is no need to
 +* keep the background timer running, as we're about to
 +* populate the CPU timer again.
 +*/
 +   timer_disarm(timer);
 +}
 +
 +void kvm_timer_sync_from_cpu(struct kvm_vcpu *vcpu)
 +{
 +   struct arch_timer_cpu *timer = vcpu-arch.timer_cpu;
 +   cycle_t cval, now;
 +   u64 ns;
 +
 +   /* Check if the timer is enabled and unmasked first */
 +   if ((timer-cntv_ctl  3) != 1)
 +   return;
 +
 +   cval = timer-cntv_cval;
 +   now = kvm_phys_timer_read() - vcpu-kvm-arch.timer.cntvoff;
 +
 +   BUG_ON(timer_is_armed(timer));
 +
 +   if (cval = now) {
 +   /*
 +* Timer has already expired while we were not
 +* looking. Inject the interrupt and carry on.
 +*/
 +   kvm_timer_inject_irq(vcpu);
 +   return;
 +   }
 +
 +   ns = cyclecounter_cyc2ns(timecounter-cc, cval - now);
 +   timer_arm(timer, ns);
 +}

 Please use flush/sync terminology to match the rest of arch/arm/.

 ok, the following fixes this for both timers and the vgic:

 commit 1b68f39459dbc797f6766c103edf2c1053984161
 Author: Christoffer Dall c.d...@virtualopensystems.com
 Date:   Mon Jan 14 14:16:31 2013 -0500

 KVM: ARM: vgic: use sync/flush terminology

 Use sync/flush for saving state to/from CPUs to be consistent with
 other uses in arch/arm.

 Sync and flush on their own are pretty inexpressive. Consider changing
 it to {flush,sync}_hwstate, so we're consistent with what VFP does.

sure
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [kvmarm] [PATCH v5 06/14] KVM: ARM: Inject IRQs and FIQs from userspace

2013-01-15 Thread Christoffer Dall
On Tue, Jan 15, 2013 at 9:04 AM, Peter Maydell peter.mayd...@linaro.org wrote:
 On 15 January 2013 12:52, Gleb Natapov g...@redhat.com wrote:
 On Tue, Jan 15, 2013 at 12:15:01PM +, Peter Maydell wrote:
 On 15 January 2013 09:56, Gleb Natapov g...@redhat.com wrote:
  ARM can signal an interrupt either at the CPU level, or at the in-kernel 
  irqchip
  CPU level interrupt should use KVM_INTERRUPT instead.

 No, that would be wrong. KVM_INTERRUPT is for interrupts which must be
 delivered synchronously to the CPU. KVM_IRQ_LINE is for interrupts which
 can be fed to the kernel asynchronously. It happens that on x86 must be
 delivered synchronously and not going to in kernel irqchip are the same, 
 but
 this isn't true for other archs. For ARM all our interrupts can be fed
 to the kernel asynchronously, and so we use KVM_IRQ_LINE in all
 cases.

 I do no quite understand what you mean by synchronously and
 asynchronously.

 Synchronously: the vcpu has to be stopped and userspace then
 feeds in the interrupt to be taken when the guest is resumed.
 Asynchronously: any old thread can tell the kernel there's an
 interrupt, and the guest vcpu then deals with it when needed
 (the vcpu thread may leave the guest but doesn't come out of
 the host kernel to qemu).

 The difference between KVM_INTERRUPT and KVM_IRQ_LINE line
 is that former is used when destination cpu is known to userspace later
 is used when kernel code is involved in figuring out the destination.

 This doesn't match up with Avi's explanation at all.

 The
 injections themselves are currently synchronous for both of them on x86
 and ARM. i.e vcpu is kicked out from guest mode when interrupt need to
 be injected into a guest and vcpu state is changed to inject interrupt
 during next guest entry. In the near feature x86 will be able to inject
 interrupt without kicking vcpu out from the guest mode does ARM plan to
 do the same? For GIC interrupts or for IRQ/FIQ or for both?

 There was a big discussion thread about this on kvm and qemu-devel last
 July (and we cleaned up some of the QEMU code to not smoosh together
 all these different concepts under do I have an irqchip or not?).
 Do you have a pointer?

   http://lists.gnu.org/archive/html/qemu-devel/2012-07/msg02460.html
 and there was a later longer (but less clear) thread which included
 this mail from Avi:
   http://lists.gnu.org/archive/html/qemu-devel/2012-07/msg02872.html
 basically explaining that the reason for the weird synchronous
 KVM_INTERRUPT API is that it's emulating a weird synchronous
 hardware interface which is specific to x86. ARM doesn't have
 a synchronous interface in the same way, so it's much more
 straightforward to use KVM_IRQ_LINE.

Also, this code has been reviewed numerous times by the KVM community
and as Peter points out has also been discussed in detail.

Could we please not change this API in the last second?

-Christoffer
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v5 13/14] KVM: ARM: Handle I/O aborts

2013-01-15 Thread Christoffer Dall
On Tue, Jan 15, 2013 at 9:27 AM, Gleb Natapov g...@redhat.com wrote:
 On Tue, Jan 15, 2013 at 01:46:04PM +, Marc Zyngier wrote:
 On 15/01/13 13:34, Gleb Natapov wrote:
  On Tue, Jan 15, 2013 at 01:29:40PM +, Marc Zyngier wrote:
  On 15/01/13 13:18, Gleb Natapov wrote:
  On Tue, Jan 08, 2013 at 01:40:05PM -0500, Christoffer Dall wrote:
  When the guest accesses I/O memory this will create data abort
  exceptions and they are handled by decoding the HSR information
  (physical address, read/write, length, register) and forwarding reads
  and writes to QEMU which performs the device emulation.
 
  Certain classes of load/store operations do not support the syndrome
  information provided in the HSR and we therefore must be able to fetch
  the offending instruction from guest memory and decode it manually.
 
  We only support instruction decoding for valid reasonable MMIO 
  operations
  where trapping them do not provide sufficient information in the HSR (no
  16-bit Thumb instructions provide register writeback that we care 
  about).
 
  The following instruction types are NOT supported for MMIO operations
  despite the HSR not containing decode info:
   - any Load/Store multiple
   - any load/store exclusive
   - any load/store dual
   - anything with the PC as the dest register
 
  This requires changing the general flow somewhat since new calls to run
  the VCPU must check if there's a pending MMIO load and perform the write
  after userspace has made the data available.
 
  Rusty Russell fixed a horrible race pointed out by Ben Herrenschmidt:
  (1) Guest complicated mmio instruction traps.
  (2) The hardware doesn't tell us enough, so we need to read the actual
  instruction which was being exectuted.
  (3) KVM maps the instruction virtual address to a physical address.
  (4) The guest (SMP) swaps out that page, and fills it with something 
  else.
  (5) We read the physical address, but now that's the wrong thing.
  How can this happen?! The guest cannot reuse physical page before it
  flushes it from all vcpus tlb cache. For that it needs to send
  synchronous IPI to all vcpus and IPI will not be processed by a vcpu
  while it does emulation.
 
  I don't know how this works on x86, but a KVM/ARM guest can definitely
  handle an IPI.
 
  How can a vcpu handle an IPI while it is not in a guest mode?

 I think there is some misunderstanding. A guest IPI is of course handled
 while running the guest. You completely lost me here.
 You need IPI from one guest vcpu to another to invalidate its TLB on
 x86. That prevents the race from happening there.


  Furthermore, TLB invalidation doesn't require an IPI on ARMv7 (unless
  we're doing some set/way operation which is handled separately).
 
  What prevents a page to be swapped out while code is fetched from it?

 Why would you prevent it? TLB invalidation is broadcast by the HW. If
 you swap a page out, you flag the entry as invalid and invalidate the
 corresponding TLB. If you hit it, you swap the page back in.

 There is no IPI (or anything that requires response from cpu whose TLB
 is invalidated) involved in invalidating remote TLB?


no there's not, the hardware broadcasts the TLB invalidate operation.

-Christoffer
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v5 13/14] KVM: ARM: Handle I/O aborts

2013-01-15 Thread Marc Zyngier
On 15/01/13 14:27, Gleb Natapov wrote:
 On Tue, Jan 15, 2013 at 01:46:04PM +, Marc Zyngier wrote:
 On 15/01/13 13:34, Gleb Natapov wrote:
 On Tue, Jan 15, 2013 at 01:29:40PM +, Marc Zyngier wrote:
 On 15/01/13 13:18, Gleb Natapov wrote:
 On Tue, Jan 08, 2013 at 01:40:05PM -0500, Christoffer Dall wrote:
 When the guest accesses I/O memory this will create data abort
 exceptions and they are handled by decoding the HSR information
 (physical address, read/write, length, register) and forwarding reads
 and writes to QEMU which performs the device emulation.

 Certain classes of load/store operations do not support the syndrome
 information provided in the HSR and we therefore must be able to fetch
 the offending instruction from guest memory and decode it manually.

 We only support instruction decoding for valid reasonable MMIO operations
 where trapping them do not provide sufficient information in the HSR (no
 16-bit Thumb instructions provide register writeback that we care about).

 The following instruction types are NOT supported for MMIO operations
 despite the HSR not containing decode info:
  - any Load/Store multiple
  - any load/store exclusive
  - any load/store dual
  - anything with the PC as the dest register

 This requires changing the general flow somewhat since new calls to run
 the VCPU must check if there's a pending MMIO load and perform the write
 after userspace has made the data available.

 Rusty Russell fixed a horrible race pointed out by Ben Herrenschmidt:
 (1) Guest complicated mmio instruction traps.
 (2) The hardware doesn't tell us enough, so we need to read the actual
 instruction which was being exectuted.
 (3) KVM maps the instruction virtual address to a physical address.
 (4) The guest (SMP) swaps out that page, and fills it with something 
 else.
 (5) We read the physical address, but now that's the wrong thing.
 How can this happen?! The guest cannot reuse physical page before it
 flushes it from all vcpus tlb cache. For that it needs to send
 synchronous IPI to all vcpus and IPI will not be processed by a vcpu
 while it does emulation.

 I don't know how this works on x86, but a KVM/ARM guest can definitely
 handle an IPI.

 How can a vcpu handle an IPI while it is not in a guest mode?

 I think there is some misunderstanding. A guest IPI is of course handled
 while running the guest. You completely lost me here.
 You need IPI from one guest vcpu to another to invalidate its TLB on
 x86. That prevents the race from happening there.

We don't need this on ARM (starting with v7, v6 is an entirely different
story, and we do not support KVM on v6).

The TLB is propagated by the HW using the following (pseudocode) sequence:
tlb_invalidate VA
barrier

Leaving the barrier guaranties that all TLB invalidations have been
propagated.


 Furthermore, TLB invalidation doesn't require an IPI on ARMv7 (unless
 we're doing some set/way operation which is handled separately).

 What prevents a page to be swapped out while code is fetched from it?

 Why would you prevent it? TLB invalidation is broadcast by the HW. If
 you swap a page out, you flag the entry as invalid and invalidate the
 corresponding TLB. If you hit it, you swap the page back in.

 There is no IPI (or anything that requires response from cpu whose TLB
 is invalidated) involved in invalidating remote TLB?

No. The above sequence is all you have to do.

This is why the above race is a bit hairy. A vcpu will happily
invalidate TLBs, but as the faulting vcpu already performed the
translation, we're screwed.

Thankfully, this is a case that only matters when we have to emulate an
MMIO operation that is not automatically decoded by the HW. They are
rare (the Linux kernel doesn't use them). In this case, we stop the
world (IPI).

M.
-- 
Jazz is not dead. It just smells funny...

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] KVM call for 2013-01-15

2013-01-15 Thread Andreas Färber
Am 15.01.2013 11:36, schrieb Eduardo Habkost:
 On Mon, Jan 14, 2013 at 02:26:12PM +0100, Juan Quintela wrote:
 Please send in any agenda topics that you have.
 
 - CPU hotplug plan/design

For those that didn't read it, some food for thought:
http://lists.gnu.org/archive/html/qemu-devel/2013-01/msg01219.html

The basic assumptions that I have been operating under so far:
1) CPU hotplug will be x86-specific
2) The CPU is sufficiently different from a device to not use device_add
3) x86 CPU hotplug will not allow mixing different CPU types (Alex)

Previous art seems to be HMP cpu_set. Using any such specific operation
would allow us to abstract the ongoing QOM rework - I am about 2-3
series ahead of what has been posted but not there at QOM'ish hotplug
yet, so am open to any suggestions on the call.

Regards,
Andreas

-- 
SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg, Germany
GF: Jeff Hawn, Jennifer Guild, Felix Imendörffer; HRB 16746 AG Nürnberg
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [kvmarm] [PATCH v5 06/14] KVM: ARM: Inject IRQs and FIQs from userspace

2013-01-15 Thread Gleb Natapov
On Tue, Jan 15, 2013 at 02:04:47PM +, Peter Maydell wrote:
 On 15 January 2013 12:52, Gleb Natapov g...@redhat.com wrote:
  On Tue, Jan 15, 2013 at 12:15:01PM +, Peter Maydell wrote:
  On 15 January 2013 09:56, Gleb Natapov g...@redhat.com wrote:
   ARM can signal an interrupt either at the CPU level, or at the 
   in-kernel irqchip
   CPU level interrupt should use KVM_INTERRUPT instead.
 
  No, that would be wrong. KVM_INTERRUPT is for interrupts which must be
  delivered synchronously to the CPU. KVM_IRQ_LINE is for interrupts which
  can be fed to the kernel asynchronously. It happens that on x86 must be
  delivered synchronously and not going to in kernel irqchip are the 
  same, but
  this isn't true for other archs. For ARM all our interrupts can be fed
  to the kernel asynchronously, and so we use KVM_IRQ_LINE in all
  cases.
 
  I do no quite understand what you mean by synchronously and
  asynchronously.
 
 Synchronously: the vcpu has to be stopped and userspace then
 feeds in the interrupt to be taken when the guest is resumed.
 Asynchronously: any old thread can tell the kernel there's an
 interrupt, and the guest vcpu then deals with it when needed
 (the vcpu thread may leave the guest but doesn't come out of
 the host kernel to qemu).
 
  The difference between KVM_INTERRUPT and KVM_IRQ_LINE line
  is that former is used when destination cpu is known to userspace later
  is used when kernel code is involved in figuring out the destination.
 
 This doesn't match up with Avi's explanation at all.
 
  The
  injections themselves are currently synchronous for both of them on x86
  and ARM. i.e vcpu is kicked out from guest mode when interrupt need to
  be injected into a guest and vcpu state is changed to inject interrupt
  during next guest entry. In the near feature x86 will be able to inject
  interrupt without kicking vcpu out from the guest mode does ARM plan to
  do the same? For GIC interrupts or for IRQ/FIQ or for both?
 
  There was a big discussion thread about this on kvm and qemu-devel last
  July (and we cleaned up some of the QEMU code to not smoosh together
  all these different concepts under do I have an irqchip or not?).
  Do you have a pointer?
 
   http://lists.gnu.org/archive/html/qemu-devel/2012-07/msg02460.html
 and there was a later longer (but less clear) thread which included
 this mail from Avi:
   http://lists.gnu.org/archive/html/qemu-devel/2012-07/msg02872.html
 basically explaining that the reason for the weird synchronous
 KVM_INTERRUPT API is that it's emulating a weird synchronous
 hardware interface which is specific to x86. ARM doesn't have
 a synchronous interface in the same way, so it's much more
 straightforward to use KVM_IRQ_LINE.
 
OK. I see. So basically Avi saw KVM_INTERRUPT as an oddball interface
required only for APIC emulation in userspace. It is used for PIC also,
where this is not strictly needed, but this is for historical reasons
(KVM_IRQ_LINE was introduces late and it is GSI centric on x86).

Thank you for the pointer.

--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v5 13/14] KVM: ARM: Handle I/O aborts

2013-01-15 Thread Gleb Natapov
On Tue, Jan 15, 2013 at 02:48:27PM +, Marc Zyngier wrote:
 On 15/01/13 14:27, Gleb Natapov wrote:
  On Tue, Jan 15, 2013 at 01:46:04PM +, Marc Zyngier wrote:
  On 15/01/13 13:34, Gleb Natapov wrote:
  On Tue, Jan 15, 2013 at 01:29:40PM +, Marc Zyngier wrote:
  On 15/01/13 13:18, Gleb Natapov wrote:
  On Tue, Jan 08, 2013 at 01:40:05PM -0500, Christoffer Dall wrote:
  When the guest accesses I/O memory this will create data abort
  exceptions and they are handled by decoding the HSR information
  (physical address, read/write, length, register) and forwarding reads
  and writes to QEMU which performs the device emulation.
 
  Certain classes of load/store operations do not support the syndrome
  information provided in the HSR and we therefore must be able to fetch
  the offending instruction from guest memory and decode it manually.
 
  We only support instruction decoding for valid reasonable MMIO 
  operations
  where trapping them do not provide sufficient information in the HSR 
  (no
  16-bit Thumb instructions provide register writeback that we care 
  about).
 
  The following instruction types are NOT supported for MMIO operations
  despite the HSR not containing decode info:
   - any Load/Store multiple
   - any load/store exclusive
   - any load/store dual
   - anything with the PC as the dest register
 
  This requires changing the general flow somewhat since new calls to run
  the VCPU must check if there's a pending MMIO load and perform the 
  write
  after userspace has made the data available.
 
  Rusty Russell fixed a horrible race pointed out by Ben Herrenschmidt:
  (1) Guest complicated mmio instruction traps.
  (2) The hardware doesn't tell us enough, so we need to read the actual
  instruction which was being exectuted.
  (3) KVM maps the instruction virtual address to a physical address.
  (4) The guest (SMP) swaps out that page, and fills it with something 
  else.
  (5) We read the physical address, but now that's the wrong thing.
  How can this happen?! The guest cannot reuse physical page before it
  flushes it from all vcpus tlb cache. For that it needs to send
  synchronous IPI to all vcpus and IPI will not be processed by a vcpu
  while it does emulation.
 
  I don't know how this works on x86, but a KVM/ARM guest can definitely
  handle an IPI.
 
  How can a vcpu handle an IPI while it is not in a guest mode?
 
  I think there is some misunderstanding. A guest IPI is of course handled
  while running the guest. You completely lost me here.
  You need IPI from one guest vcpu to another to invalidate its TLB on
  x86. That prevents the race from happening there.
 
 We don't need this on ARM (starting with v7, v6 is an entirely different
 story, and we do not support KVM on v6).
 
 The TLB is propagated by the HW using the following (pseudocode) sequence:
   tlb_invalidate VA
   barrier
 
 Leaving the barrier guaranties that all TLB invalidations have been
 propagated.
 
That explains why __get_user_pages_fast() is missing on ARM :)

 
  Furthermore, TLB invalidation doesn't require an IPI on ARMv7 (unless
  we're doing some set/way operation which is handled separately).
 
  What prevents a page to be swapped out while code is fetched from it?
 
  Why would you prevent it? TLB invalidation is broadcast by the HW. If
  you swap a page out, you flag the entry as invalid and invalidate the
  corresponding TLB. If you hit it, you swap the page back in.
 
  There is no IPI (or anything that requires response from cpu whose TLB
  is invalidated) involved in invalidating remote TLB?
 
 No. The above sequence is all you have to do.
 
 This is why the above race is a bit hairy. A vcpu will happily
 invalidate TLBs, but as the faulting vcpu already performed the
 translation, we're screwed.
 
 Thankfully, this is a case that only matters when we have to emulate an
 MMIO operation that is not automatically decoded by the HW. They are
 rare (the Linux kernel doesn't use them). In this case, we stop the
 world (IPI).
 
Got it. Thanks.

--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [kvmarm] [PATCH v5 03/14] KVM: ARM: Initial skeleton to compile KVM support

2013-01-15 Thread Gleb Natapov
On Tue, Jan 15, 2013 at 02:43:26PM +0100, Alexander Graf wrote:
 
 On 15.01.2013, at 14:32, Gleb Natapov wrote:
 
  On Mon, Jan 14, 2013 at 05:17:31PM -0500, Christoffer Dall wrote:
  On Mon, Jan 14, 2013 at 1:49 PM, Gleb Natapov g...@redhat.com wrote:
  A couple of general question about ABI. If they were already answered
  just refer me to the previous discussion.
  
  On Tue, Jan 08, 2013 at 01:38:55PM -0500, Christoffer Dall wrote:
  diff --git a/Documentation/virtual/kvm/api.txt 
  b/Documentation/virtual/kvm/api.txt
  index a4df553..4237c27 100644
  --- a/Documentation/virtual/kvm/api.txt
  +++ b/Documentation/virtual/kvm/api.txt
  @@ -293,7 +293,7 @@ kvm_run' (see below).
  4.11 KVM_GET_REGS
  
  Capability: basic
  -Architectures: all
  +Architectures: all except ARM
  Type: vcpu ioctl
  Parameters: struct kvm_regs (out)
  Returns: 0 on success, -1 on error
  @@ -314,7 +314,7 @@ struct kvm_regs {
  4.12 KVM_SET_REGS
  
  Capability: basic
  -Architectures: all
  +Architectures: all except ARM
  Type: vcpu ioctl
  Parameters: struct kvm_regs (in)
  Returns: 0 on success, -1 on error
  @@ -600,7 +600,7 @@ struct kvm_fpu {
  4.24 KVM_CREATE_IRQCHIP
  Why KVM_GET_REGS/KVM_SET_REGS are not usable for arm?
  
  
  We use the ONE_REG API instead and we don't want to support two
  separate APIs to user space.
  
  I suppose fetching all registers is not anywhere on a fast path in
  userspace :)
 
 If it's ever going to be in a fast path, we will add MULTI_REG which will 
 feature an array of ONE_REG structs to fetch multiple registers at once.
 
And I hope it will not be. On x86 only the broken vmware PV
interface requires reading registers for emulation.

This reminds me the question I wanted to ask you about ONE_REG
interface. Why architecture is encoded in register name? Is architecture
implied by HW the code is running on anyway?

--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Minutes for KVM call 2013-01-15

2013-01-15 Thread Juan Quintela

* cpu hot plug
  - use qdev propierties conected to a set of socket objects (anthony)
  - cpusets are the wrong interface (anthony)
  - make a link between cpu - socket instead of a propierty?
  - how far are we from being able to describe a cpu with -device?
(didn't heare the answer, andreas?)
  - perhaps the best approach?
  - After soft-freeze, exceptions depend on the maintainer
  - After hard-freeze, no exceptions
  -device don't require a bus, just an implementation detail, we can change that
  - use cpuset as an intermediate step until full vision is implemented
  - several approaches from where we are now, to have something before
we get a full solution


At this point, Andreas agreed to write a better summary of the
discussion and suggestions O:-)

Later, Juan.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [kvmarm] [PATCH v5 03/14] KVM: ARM: Initial skeleton to compile KVM support

2013-01-15 Thread Alexander Graf

On 01/15/2013 04:35 PM, Gleb Natapov wrote:

On Tue, Jan 15, 2013 at 02:43:26PM +0100, Alexander Graf wrote:

On 15.01.2013, at 14:32, Gleb Natapov wrote:


On Mon, Jan 14, 2013 at 05:17:31PM -0500, Christoffer Dall wrote:

On Mon, Jan 14, 2013 at 1:49 PM, Gleb Natapovg...@redhat.com  wrote:

A couple of general question about ABI. If they were already answered
just refer me to the previous discussion.

On Tue, Jan 08, 2013 at 01:38:55PM -0500, Christoffer Dall wrote:

diff --git a/Documentation/virtual/kvm/api.txt 
b/Documentation/virtual/kvm/api.txt
index a4df553..4237c27 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -293,7 +293,7 @@ kvm_run' (see below).
4.11 KVM_GET_REGS

Capability: basic
-Architectures: all
+Architectures: all except ARM
Type: vcpu ioctl
Parameters: struct kvm_regs (out)
Returns: 0 on success, -1 on error
@@ -314,7 +314,7 @@ struct kvm_regs {
4.12 KVM_SET_REGS

Capability: basic
-Architectures: all
+Architectures: all except ARM
Type: vcpu ioctl
Parameters: struct kvm_regs (in)
Returns: 0 on success, -1 on error
@@ -600,7 +600,7 @@ struct kvm_fpu {
4.24 KVM_CREATE_IRQCHIP

Why KVM_GET_REGS/KVM_SET_REGS are not usable for arm?


We use the ONE_REG API instead and we don't want to support two
separate APIs to user space.


I suppose fetching all registers is not anywhere on a fast path in
userspace :)

If it's ever going to be in a fast path, we will add MULTI_REG which will 
feature an array of ONE_REG structs to fetch multiple registers at once.


And I hope it will not be. On x86 only the broken vmware PV
interface requires reading registers for emulation.

This reminds me the question I wanted to ask you about ONE_REG
interface. Why architecture is encoded in register name? Is architecture
implied by HW the code is running on anyway?


It provides for nice sanity checks. Also, 64 bits are quite a lot, so we 
can easily waste a few bits for redundancy.



Alex

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [kvmarm] [PATCH v5 06/14] KVM: ARM: Inject IRQs and FIQs from userspace

2013-01-15 Thread Alexander Graf

On 01/15/2013 04:17 PM, Gleb Natapov wrote:

On Tue, Jan 15, 2013 at 02:04:47PM +, Peter Maydell wrote:

On 15 January 2013 12:52, Gleb Natapovg...@redhat.com  wrote:

On Tue, Jan 15, 2013 at 12:15:01PM +, Peter Maydell wrote:

On 15 January 2013 09:56, Gleb Natapovg...@redhat.com  wrote:

ARM can signal an interrupt either at the CPU level, or at the in-kernel irqchip

CPU level interrupt should use KVM_INTERRUPT instead.

No, that would be wrong. KVM_INTERRUPT is for interrupts which must be
delivered synchronously to the CPU. KVM_IRQ_LINE is for interrupts which
can be fed to the kernel asynchronously. It happens that on x86 must be
delivered synchronously and not going to in kernel irqchip are the same, but
this isn't true for other archs. For ARM all our interrupts can be fed
to the kernel asynchronously, and so we use KVM_IRQ_LINE in all
cases.

I do no quite understand what you mean by synchronously and
asynchronously.

Synchronously: the vcpu has to be stopped and userspace then
feeds in the interrupt to be taken when the guest is resumed.
Asynchronously: any old thread can tell the kernel there's an
interrupt, and the guest vcpu then deals with it when needed
(the vcpu thread may leave the guest but doesn't come out of
the host kernel to qemu).


The difference between KVM_INTERRUPT and KVM_IRQ_LINE line
is that former is used when destination cpu is known to userspace later
is used when kernel code is involved in figuring out the destination.

This doesn't match up with Avi's explanation at all.


The
injections themselves are currently synchronous for both of them on x86
and ARM. i.e vcpu is kicked out from guest mode when interrupt need to
be injected into a guest and vcpu state is changed to inject interrupt
during next guest entry. In the near feature x86 will be able to inject
interrupt without kicking vcpu out from the guest mode does ARM plan to
do the same? For GIC interrupts or for IRQ/FIQ or for both?


There was a big discussion thread about this on kvm and qemu-devel last
July (and we cleaned up some of the QEMU code to not smoosh together
all these different concepts under do I have an irqchip or not?).

Do you have a pointer?

   http://lists.gnu.org/archive/html/qemu-devel/2012-07/msg02460.html
and there was a later longer (but less clear) thread which included
this mail from Avi:
   http://lists.gnu.org/archive/html/qemu-devel/2012-07/msg02872.html
basically explaining that the reason for the weird synchronous
KVM_INTERRUPT API is that it's emulating a weird synchronous
hardware interface which is specific to x86. ARM doesn't have
a synchronous interface in the same way, so it's much more
straightforward to use KVM_IRQ_LINE.


OK. I see. So basically Avi saw KVM_INTERRUPT as an oddball interface
required only for APIC emulation in userspace. It is used for PIC also,
where this is not strictly needed, but this is for historical reasons
(KVM_IRQ_LINE was introduces late and it is GSI centric on x86).

Thank you for the pointer.


Yeah, please keep in mind that KVM_INTERRUPT is not a unified interface 
either. In fact, it is asynchronous on PPC :). And it's called 
KVM_S390_INTERRUPT on s390 and also asynchronous. X86 is the oddball here.


But I don't care whether we call the ioctl to steer CPU interrupt pins 
KVM_INTERRUPT, KVM_S390_INTERRUPT or KVM_IRQ_LINE, as long as the code 
makes it obvious what is happening.



Alex

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: qemu-kvm hangs at start up under 3.8.0-rc3-00074-gb719f43 (works with CONFIG_LOCKDEP)

2013-01-15 Thread Rik van Riel

On 01/14/2013 01:24 PM, Andrew Clayton wrote:

On Mon, 14 Jan 2013 15:27:36 +0200, Gleb Natapov wrote:


On Sun, Jan 13, 2013 at 10:29:58PM +, Andrew Clayton wrote:

When running qemu-kvm under 64but Fedora 16 under current 3.8, it
just hangs at start up. Dong a ps -ef hangs the process at the
point where it would display the qemu process (trying to list the
qemu-kvm /proc pid directory contents just hangs ls).

I also noticed some other weirdness at this point like Firefox
hanging for many seconds at a time and increasing load average.

The qemu command I was trying to run was

$ qemu-kvm -m 512 -smp 2 -vga vmware -k en-gb -drive
file=/home/andrew/machines/qemu/f16-i386.img,if=virtio

Here's the last few lines of a strace on it at start up.

open(/home/andrew/machines/qemu/f16-i386.img,
O_RDWR|O_DSYNC|O_CLOEXEC) = 8 lseek(8, 0,
SEEK_END)   = 9100722176 pread(8,
QFI\373\0\0\0\2\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\20\0\0\0\2\200\0\0\0...,
512, 0) = 512 pread(8,
\200\0\0\0\0\4\0\0\200\0\0\0\0\10\0\0\200\0\0\0\2\210\0\0\200\0\0\0\2\233\0\0...,
512, 65536) = 512 brk(0)  =
0x7faf12db brk(0x7faf12ddd000

It's hanging in that brk syscall. The load average also then starts
to increase.


However. I can make it run fine, if I enable CONFIG_LOCKDEP. But
the only thing in dmesg I get is the usual.

kvm: SMP vm created on host with unstable TSC; guest TSC will not
be reliable

I've attached both working and non-working .configs. The only
difference being the lock checking enabled in config.good.

The most recent kernel I had it working in was 3.7.0

System is a Quad Core Intel running 64bit Fedora 16.


Can you run echo t  /proc/sysrq-trigger and see where it hangs?


Here you go, here's the bash process, qemu and a kvm bit. (From the
above command)

bashS 88013b2b0d00 0  3203   3133 0x
  880114dabe58 0082 800113558065 880114dabfd8
  880114dabfd8 4000 88013b0c5b00 88013b2b0d00
  880114dabd88 8109067d ea0004536670 ea0004536640
Call Trace:
  [8109067d] ? default_wake_function+0xd/0x10
  [8108a315] ? atomic_notifier_call_chain+0x15/0x20
  [8133d84f] ? tty_get_pgrp+0x3f/0x50
  [810819ac] ? pid_vnr+0x2c/0x30
  [8133fe54] ? tty_ioctl+0x7b4/0xbd0
  [8106bf62] ? wait_consider_task+0x102/0xaf0
  [815c00e4] schedule+0x24/0x70
  [8106cb24] do_wait+0x1d4/0x200
  [8106d9cb] sys_wait4+0x9b/0xf0
  [8106b9f0] ? task_stopped_code+0x50/0x50
  [815c1ad2] system_call_fastpath+0x16/0x1b

qemu-kvmD 88011ab8c8b8 0  3345   3203 0x
  880112129cd8 0082 880112129c50 880112129fd8
  880112129fd8 4000 88013b04ce00 880139da1a00
   000280da 880112129d38 810d3300
Call Trace:
  [810d3300] ? __alloc_pages_nodemask+0xf0/0x7c0
  [811273c6] ? touch_atime+0x66/0x170
  [810cdabf] ? generic_file_aio_read+0x5bf/0x730
  [815c00e4] schedule+0x24/0x70
  [815c0cdd] rwsem_down_failed_common+0xbd/0x150
  [815c0da3] rwsem_down_write_failed+0x13/0x15
  [812d1be3] call_rwsem_down_write_failed+0x13/0x20
  [815bf4dd] ? down_write+0x2d/0x34
  [810f0724] vma_adjust+0xe4/0x610
  [810f0fa4] vma_merge+0x1b4/0x270
  [810f1fa6] do_brk+0x196/0x330
  [810f2217] sys_brk+0xd7/0x130
  [815c1ad2] system_call_fastpath+0x16/0x1b


This looks like qemu-kvm getting stuck trying to get the anon_vma lock.

That leads to the obvious question: what is holding the lock, and/or
failed to release it?

Do you have any other (qemu-kvm?) processes on your system that have
any code in the VM (or strace/ptrace/...) in the backtrace, that might
be holding this lock?

Do you have anything in your dmesg showing threads that had a BUG_ON
(and exited) while holding the lock?

--
All rights reversed
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: qemu-kvm hangs at start up under 3.8.0-rc3-00074-gb719f43 (works with CONFIG_LOCKDEP)

2013-01-15 Thread Andrew Clayton
On Tue, 15 Jan 2013 11:48:39 -0500, Rik van Riel wrote:

 On 01/14/2013 01:24 PM, Andrew Clayton wrote:

[snip]

 
  bashS 88013b2b0d00 0  3203   3133 0x
880114dabe58 0082 800113558065
  880114dabfd8 880114dabfd8 4000 88013b0c5b00
  88013b2b0d00 880114dabd88 8109067d ea0004536670
  ea0004536640 Call Trace:
[8109067d] ? default_wake_function+0xd/0x10
[8108a315] ? atomic_notifier_call_chain+0x15/0x20
[8133d84f] ? tty_get_pgrp+0x3f/0x50
[810819ac] ? pid_vnr+0x2c/0x30
[8133fe54] ? tty_ioctl+0x7b4/0xbd0
[8106bf62] ? wait_consider_task+0x102/0xaf0
[815c00e4] schedule+0x24/0x70
[8106cb24] do_wait+0x1d4/0x200
[8106d9cb] sys_wait4+0x9b/0xf0
[8106b9f0] ? task_stopped_code+0x50/0x50
[815c1ad2] system_call_fastpath+0x16/0x1b
 
  qemu-kvmD 88011ab8c8b8 0  3345   3203 0x
880112129cd8 0082 880112129c50
  880112129fd8 880112129fd8 4000 88013b04ce00
  880139da1a00  000280da 880112129d38
  810d3300 Call Trace:
[810d3300] ? __alloc_pages_nodemask+0xf0/0x7c0
[811273c6] ? touch_atime+0x66/0x170
[810cdabf] ? generic_file_aio_read+0x5bf/0x730
[815c00e4] schedule+0x24/0x70
[815c0cdd] rwsem_down_failed_common+0xbd/0x150
[815c0da3] rwsem_down_write_failed+0x13/0x15
[812d1be3] call_rwsem_down_write_failed+0x13/0x20
[815bf4dd] ? down_write+0x2d/0x34
[810f0724] vma_adjust+0xe4/0x610
[810f0fa4] vma_merge+0x1b4/0x270
[810f1fa6] do_brk+0x196/0x330
[810f2217] sys_brk+0xd7/0x130
[815c1ad2] system_call_fastpath+0x16/0x1b
 
 This looks like qemu-kvm getting stuck trying to get the anon_vma
 lock.
 
 That leads to the obvious question: what is holding the lock, and/or
 failed to release it?
 
 Do you have any other (qemu-kvm?) processes on your system that have
 any code in the VM (or strace/ptrace/...) in the backtrace, that might
 be holding this lock?

I don't think so. The above was done having just logged into
gnome-shell and opened up a couple of gnome-terminals.
 
 Do you have anything in your dmesg showing threads that had a BUG_ON
 (and exited) while holding the lock?

I never noticed anything like that.

The interesting thing is that if I use basically the same kernel but
with CONFIG_LOCKDEP enabled, it works fine.

Hmm, I wonder if it's the same as this?
http://lkml.indiana.edu/hypermail/linux/kernel/1301.1/02810.html

When I get home, I'll try a kernel with those two patches reverted.
They seem to be

572043c90db65b45a4efd959db7458edcf6411ad
1b963c81b14509e330e0fe3218b645ece2738dc5

in Linus' tree.

CC'd Jiri as well.

Cheers,
Andrew
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: qemu-kvm hangs at start up under 3.8.0-rc3-00074-gb719f43 (works with CONFIG_LOCKDEP)

2013-01-15 Thread Andrew Clayton
On Tue, 15 Jan 2013 17:17:56 +, Andrew Clayton wrote:

 On Tue, 15 Jan 2013 11:48:39 -0500, Rik van Riel wrote:
 
  On 01/14/2013 01:24 PM, Andrew Clayton wrote:
 
 [snip]
 
  
   bashS 88013b2b0d00 0  3203   3133 0x
 880114dabe58 0082 800113558065
   880114dabfd8 880114dabfd8 4000
   88013b0c5b00 88013b2b0d00 880114dabd88
   8109067d ea0004536670 ea0004536640 Call Trace:
 [8109067d] ? default_wake_function+0xd/0x10
 [8108a315] ? atomic_notifier_call_chain+0x15/0x20
 [8133d84f] ? tty_get_pgrp+0x3f/0x50
 [810819ac] ? pid_vnr+0x2c/0x30
 [8133fe54] ? tty_ioctl+0x7b4/0xbd0
 [8106bf62] ? wait_consider_task+0x102/0xaf0
 [815c00e4] schedule+0x24/0x70
 [8106cb24] do_wait+0x1d4/0x200
 [8106d9cb] sys_wait4+0x9b/0xf0
 [8106b9f0] ? task_stopped_code+0x50/0x50
 [815c1ad2] system_call_fastpath+0x16/0x1b
  
   qemu-kvmD 88011ab8c8b8 0  3345   3203 0x
 880112129cd8 0082 880112129c50
   880112129fd8 880112129fd8 4000
   88013b04ce00 880139da1a00 
   000280da 880112129d38 810d3300 Call Trace:
 [810d3300] ? __alloc_pages_nodemask+0xf0/0x7c0
 [811273c6] ? touch_atime+0x66/0x170
 [810cdabf] ? generic_file_aio_read+0x5bf/0x730
 [815c00e4] schedule+0x24/0x70
 [815c0cdd] rwsem_down_failed_common+0xbd/0x150
 [815c0da3] rwsem_down_write_failed+0x13/0x15
 [812d1be3] call_rwsem_down_write_failed+0x13/0x20
 [815bf4dd] ? down_write+0x2d/0x34
 [810f0724] vma_adjust+0xe4/0x610
 [810f0fa4] vma_merge+0x1b4/0x270
 [810f1fa6] do_brk+0x196/0x330
 [810f2217] sys_brk+0xd7/0x130
 [815c1ad2] system_call_fastpath+0x16/0x1b
  
  This looks like qemu-kvm getting stuck trying to get the anon_vma
  lock.
  
  That leads to the obvious question: what is holding the lock, and/or
  failed to release it?
  
  Do you have any other (qemu-kvm?) processes on your system that have
  any code in the VM (or strace/ptrace/...) in the backtrace, that
  might be holding this lock?
 
 I don't think so. The above was done having just logged into
 gnome-shell and opened up a couple of gnome-terminals.
  
  Do you have anything in your dmesg showing threads that had a BUG_ON
  (and exited) while holding the lock?
 
 I never noticed anything like that.
 
 The interesting thing is that if I use basically the same kernel but
 with CONFIG_LOCKDEP enabled, it works fine.
 
 Hmm, I wonder if it's the same as this?
 http://lkml.indiana.edu/hypermail/linux/kernel/1301.1/02810.html
 
 When I get home, I'll try a kernel with those two patches reverted.
 They seem to be
 
 572043c90db65b45a4efd959db7458edcf6411ad
 1b963c81b14509e330e0fe3218b645ece2738dc5

Just checked the issue is still there in Linus' latest,
3.8.0-rc3-00293-g406089d, it is.

However, building a kernel with those two commits reverted does indeed
get it going again.

Andrew
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: qemu-kvm hangs at start up under 3.8.0-rc3-00074-gb719f43 (works with CONFIG_LOCKDEP)

2013-01-15 Thread Jiri Kosina
On Tue, 15 Jan 2013, Andrew Clayton wrote:

   bashS 88013b2b0d00 0  3203   3133 0x
 880114dabe58 0082 800113558065
   880114dabfd8 880114dabfd8 4000 88013b0c5b00
   88013b2b0d00 880114dabd88 8109067d ea0004536670
   ea0004536640 Call Trace:
 [8109067d] ? default_wake_function+0xd/0x10
 [8108a315] ? atomic_notifier_call_chain+0x15/0x20
 [8133d84f] ? tty_get_pgrp+0x3f/0x50
 [810819ac] ? pid_vnr+0x2c/0x30
 [8133fe54] ? tty_ioctl+0x7b4/0xbd0
 [8106bf62] ? wait_consider_task+0x102/0xaf0
 [815c00e4] schedule+0x24/0x70
 [8106cb24] do_wait+0x1d4/0x200
 [8106d9cb] sys_wait4+0x9b/0xf0
 [8106b9f0] ? task_stopped_code+0x50/0x50
 [815c1ad2] system_call_fastpath+0x16/0x1b
  
   qemu-kvmD 88011ab8c8b8 0  3345   3203 0x
 880112129cd8 0082 880112129c50
   880112129fd8 880112129fd8 4000 88013b04ce00
   880139da1a00  000280da 880112129d38
   810d3300 Call Trace:
 [810d3300] ? __alloc_pages_nodemask+0xf0/0x7c0
 [811273c6] ? touch_atime+0x66/0x170
 [810cdabf] ? generic_file_aio_read+0x5bf/0x730
 [815c00e4] schedule+0x24/0x70
 [815c0cdd] rwsem_down_failed_common+0xbd/0x150
 [815c0da3] rwsem_down_write_failed+0x13/0x15
 [812d1be3] call_rwsem_down_write_failed+0x13/0x20
 [815bf4dd] ? down_write+0x2d/0x34
 [810f0724] vma_adjust+0xe4/0x610
 [810f0fa4] vma_merge+0x1b4/0x270
 [810f1fa6] do_brk+0x196/0x330
 [810f2217] sys_brk+0xd7/0x130
 [815c1ad2] system_call_fastpath+0x16/0x1b
  
  This looks like qemu-kvm getting stuck trying to get the anon_vma
  lock.
  
  That leads to the obvious question: what is holding the lock, and/or
  failed to release it?
  
  Do you have any other (qemu-kvm?) processes on your system that have
  any code in the VM (or strace/ptrace/...) in the backtrace, that might
  be holding this lock?
 
 I don't think so. The above was done having just logged into
 gnome-shell and opened up a couple of gnome-terminals.
  
  Do you have anything in your dmesg showing threads that had a BUG_ON
  (and exited) while holding the lock?
 
 I never noticed anything like that.
 
 The interesting thing is that if I use basically the same kernel but
 with CONFIG_LOCKDEP enabled, it works fine.

Thorough and careful review and analysis revealed that the rootcause very 
likely is that I am a complete nitwit.

Could you please try the patch below and report backt? Thanks.



From: Jiri Kosina jkos...@suse.cz
Subject: [PATCH] lockdep, rwsem: fix down_write_nest_lock() if 
!CONFIG_DEBUG_LOCK_ALLOC

Commit 1b963c81b1 (lockdep, rwsem: provide down_write_nest_lock()) 
contains a bug in a codepath when CONFIG_DEBUG_LOCK_ALLOC is disabled, 
which causes down_read() to be called instead of down_write() by mistake 
on such configurations. Fix that.

Signed-off-by: Jiri Kosina jkos...@suse.cz
---
 include/linux/rwsem.h |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index 413cc11..8da67d6 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -135,7 +135,7 @@ do {
\
 
 #else
 # define down_read_nested(sem, subclass)   down_read(sem)
-# define down_write_nest_lock(sem, nest_lock)  down_read(sem)
+# define down_write_nest_lock(sem, nest_lock)  down_write(sem)
 # define down_write_nested(sem, subclass)  down_write(sem)
 #endif
 
-- 
Jiri Kosina
SUSE Labs
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: qemu-kvm hangs at start up under 3.8.0-rc3-00074-gb719f43 (works with CONFIG_LOCKDEP)

2013-01-15 Thread Andrew Clayton
On Tue, 15 Jan 2013 19:41:32 +0100 (CET), Jiri Kosina wrote:

[snip]

 Could you please try the patch below and report backt? Thanks.
 
 
 
 From: Jiri Kosina jkos...@suse.cz
 Subject: [PATCH] lockdep, rwsem: fix down_write_nest_lock()
 if !CONFIG_DEBUG_LOCK_ALLOC
 
 Commit 1b963c81b1 (lockdep, rwsem: provide down_write_nest_lock()) 
 contains a bug in a codepath when CONFIG_DEBUG_LOCK_ALLOC is
 disabled, which causes down_read() to be called instead of
 down_write() by mistake on such configurations. Fix that.
 
 Signed-off-by: Jiri Kosina jkos...@suse.cz
 ---
  include/linux/rwsem.h |2 +-
  1 files changed, 1 insertions(+), 1 deletions(-)
 
 diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
 index 413cc11..8da67d6 100644
 --- a/include/linux/rwsem.h
 +++ b/include/linux/rwsem.h
 @@ -135,7 +135,7 @@ do
 { \ 
  #else
  # define down_read_nested(sem, subclass)
 down_read(sem) -# define down_write_nest_lock(sem, nest_lock)
 down_read(sem) +# define down_write_nest_lock(sem, nest_lock)
 down_write(sem) # define down_write_nested(sem, subclass)
 down_write(sem) #endif
  

Heh, indeed, that fix's it.

Tested-by: Andrew Clayton and...@digital-domain.net

Cheers,
Andrew
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


KVM: Questions and comments on make_all_cpus_request

2013-01-15 Thread Christoffer Dall
Hi KVM guys,

I've had a bit of challenges figuring out the exact functinality and
synchronization of vcpu-requests and friends.  In lack of a better
method, I wrote some comments as a patch.

I think this code really deserves some explaining, as it is really hard
to understand otherwise.  Unfortunately, I wasn't able to write down
concise and exact comments, but I hope someone else feels up to the
challenge.

Let me know if I just got this completely wrong and upside down.

Thanks,
Christoffer
---
 include/linux/kvm_host.h |4 
 virt/kvm/kvm_main.c  |   29 +++--
 2 files changed, 31 insertions(+), 2 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index cbe0d68..25deef8 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -252,6 +252,10 @@ struct kvm_vcpu {
struct kvm_vcpu_arch arch;
 };
 
+/*
+ * XXX: Could we explain what we're trying to achieve? Is this an
+ * optimization as to not send multiple IPIs?
+ */
 static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu)
 {
return cmpxchg(vcpu-mode, IN_GUEST_MODE, EXITING_GUEST_MODE);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index e45c20c..ccc292d 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -165,7 +165,18 @@ static void ack_flush(void *_completed)
 {
 }
 
-static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
+/**
+ * make_all_cpus_request - place request on vcpus
+ * @kvm: KVM Struct
+ * @req: Request to make on the VCPU
+ *
+ * This function places a request on a VCPU and ensures that the VCPU request
+ * is handled before returning from the function, if the VCPU is in guest
+ * mode (or exiting, or reading shadow page tables?).
+ *
+ * Returns true if at least of the vcpus were sent an IPI and responded to it,
+ */
+static bool make_all_vcpus_request(struct kvm *kvm, unsigned int req)
 {
int i, cpu, me;
cpumask_var_t cpus;
@@ -179,9 +190,19 @@ static bool make_all_cpus_request(struct kvm *kvm, 
unsigned int req)
kvm_make_request(req, vcpu);
cpu = vcpu-cpu;
 
+   /*
+* Is the following really true? Can we have an example of the
+* race that would otherwise happen? Doesn't the make_request
+* pair against the IPI and interrupt disabling, not the mode?
+*/
/* Set -requests bit before we read -mode */
smp_mb();
 
+   /*
+* Set the bit on the CPU mask for all CPUs which are somehow
+* running a guest (IN_GUEST_MODE, EXITING_GUEST_MODE, and
+* READING_SHADOW_PAGE_TABLES).
+*/
if (cpus != NULL  cpu != -1  cpu != me 
  kvm_vcpu_exiting_guest_mode(vcpu) != OUTSIDE_GUEST_MODE)
cpumask_set_cpu(cpu, cpus);
@@ -201,9 +222,13 @@ void kvm_flush_remote_tlbs(struct kvm *kvm)
 {
long dirty_count = kvm-tlbs_dirty;
 
-   smp_mb();
+   smp_mb(); /* TODO: Someone should explain this! */
if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
++kvm-stat.remote_tlb_flush;
+   /*
+* TODO: Someone should explain this, why is it a cmpxchg, what
+* happens if the dirty is different from dirty_count?
+*/
cmpxchg(kvm-tlbs_dirty, dirty_count, 0);
 }
 
-- 
1.7.9.5

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] kvm: close opened file

2013-01-15 Thread Cong Ding
The file should be closed before return.

Signed-off-by: Cong Ding ding...@gmail.com
---
 tools/kvm/builtin-setup.c |6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tools/kvm/builtin-setup.c b/tools/kvm/builtin-setup.c
index c5b0566..8b45c56 100644
--- a/tools/kvm/builtin-setup.c
+++ b/tools/kvm/builtin-setup.c
@@ -159,12 +159,12 @@ static int copy_passwd(const char *guestfs_name)
return -1;
 
ret = fprintf(file, root:x:0:0:root:/root:/bin/sh\n);
-   if (ret  0)
-   return ret;
+   if (ret  0)
+   ret = 0;
 
fclose(file);
 
-   return 0;
+   return ret;
 }
 
 static int make_guestfs_symlink(const char *guestfs_name, const char *path)
-- 
1.7.10.4

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] kvm: remove redundant if condition

2013-01-15 Thread Cong Ding
After we check (state.kcount != 0), state.kcount has to be 0 in all the else
branchs.

Signed-off-by: Cong Ding ding...@gmail.com
---
 tools/kvm/hw/i8042.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/kvm/hw/i8042.c b/tools/kvm/hw/i8042.c
index 9f8be6a..9035732 100644
--- a/tools/kvm/hw/i8042.c
+++ b/tools/kvm/hw/i8042.c
@@ -189,7 +189,7 @@ static u32 kbd_read_data(void)
state.mcount--;
kvm__irq_line(state.kvm, AUX_IRQ, 0);
kbd_update_irq();
-   } else if (state.kcount == 0) {
+   } else {
i = state.kread - 1;
if (i  0)
i = QUEUE_SIZE;
-- 
1.7.10.4

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v5 07/14] KVM: ARM: World-switch implementation

2013-01-15 Thread Christoffer Dall
On Tue, Jan 15, 2013 at 4:43 AM, Gleb Natapov g...@redhat.com wrote:
 On Tue, Jan 08, 2013 at 01:39:24PM -0500, Christoffer Dall wrote:
 Provides complete world-switch implementation to switch to other guests
 running in non-secure modes. Includes Hyp exception handlers that
 capture necessary exception information and stores the information on
 the VCPU and KVM structures.

 The following Hyp-ABI is also documented in the code:

 Hyp-ABI: Calling HYP-mode functions from host (in SVC mode):
Switching to Hyp mode is done through a simple HVC #0 instruction. The
exception vector code will check that the HVC comes from VMID==0 and if
so will push the necessary state (SPSR, lr_usr) on the Hyp stack.
- r0 contains a pointer to a HYP function
- r1, r2, and r3 contain arguments to the above function.
- The HYP function will be called with its arguments in r0, r1 and r2.
On HYP function return, we return directly to SVC.

 A call to a function executing in Hyp mode is performed like the following:

 svc code
 ldr r0, =BSYM(my_hyp_fn)
 ldr r1, =my_param
 hvc #0  ; Call my_hyp_fn(my_param) from HYP mode
 svc code

 Otherwise, the world-switch is pretty straight-forward. All state that
 can be modified by the guest is first backed up on the Hyp stack and the
 VCPU values is loaded onto the hardware. State, which is not loaded, but
 theoretically modifiable by the guest is protected through the
 virtualiation features to generate a trap and cause software emulation.
 Upon guest returns, all state is restored from hardware onto the VCPU
 struct and the original state is restored from the Hyp-stack onto the
 hardware.

 SMP support using the VMPIDR calculated on the basis of the host MPIDR
 and overriding the low bits with KVM vcpu_id contributed by Marc Zyngier.

 Reuse of VMIDs has been implemented by Antonios Motakis and adapated from
 a separate patch into the appropriate patches introducing the
 functionality. Note that the VMIDs are stored per VM as required by the ARM
 architecture reference manual.

 To support VFP/NEON we trap those instructions using the HPCTR. When
 we trap, we switch the FPU.  After a guest exit, the VFP state is
 returned to the host.  When disabling access to floating point
 instructions, we also mask FPEXC_EN in order to avoid the guest
 receiving Undefined instruction exceptions before we have a chance to
 switch back the floating point state.  We are reusing vfp_hard_struct,
 so we depend on VFPv3 being enabled in the host kernel, if not, we still
 trap cp10 and cp11 in order to inject an undefined instruction exception
 whenever the guest tries to use VFP/NEON. VFP/NEON developed by
 Antionios Motakis and Rusty Russell.

 Aborts that are permission faults, and not stage-1 page table walk, do
 not report the faulting address in the HPFAR.  We have to resolve the
 IPA, and store it just like the HPFAR register on the VCPU struct. If
 the IPA cannot be resolved, it means another CPU is playing with the
 page tables, and we simply restart the guest.  This quirk was fixed by
 Marc Zyngier.

 Reviewed-by: Marcelo Tosatti mtosa...@redhat.com
 Signed-off-by: Rusty Russell rusty.russ...@linaro.org
 Signed-off-by: Antonios Motakis a.mota...@virtualopensystems.com
 Signed-off-by: Marc Zyngier marc.zyng...@arm.com
 Signed-off-by: Christoffer Dall c.d...@virtualopensystems.com
 ---
  arch/arm/include/asm/kvm_arm.h  |   51 
  arch/arm/include/asm/kvm_host.h |   10 +
  arch/arm/kernel/asm-offsets.c   |   25 ++
  arch/arm/kvm/arm.c  |  187 
  arch/arm/kvm/interrupts.S   |  396 +++
  arch/arm/kvm/interrupts_head.S  |  443 
 +++
  6 files changed, 1108 insertions(+), 4 deletions(-)
  create mode 100644 arch/arm/kvm/interrupts_head.S

 diff --git a/arch/arm/include/asm/kvm_arm.h b/arch/arm/include/asm/kvm_arm.h
 index fb22ee8..a3262a2 100644
 --- a/arch/arm/include/asm/kvm_arm.h
 +++ b/arch/arm/include/asm/kvm_arm.h
 @@ -98,6 +98,18 @@
  #define TTBCR_T0SZ   3
  #define HTCR_MASK(TTBCR_T0SZ | TTBCR_IRGN0 | TTBCR_ORGN0 | TTBCR_SH0)

 +/* Hyp System Trap Register */
 +#define HSTR_T(x)(1  x)
 +#define HSTR_TTEE(1  16)
 +#define HSTR_TJDBX   (1  17)
 +
 +/* Hyp Coprocessor Trap Register */
 +#define HCPTR_TCP(x) (1  x)
 +#define HCPTR_TCP_MASK   (0x3fff)
 +#define HCPTR_TASE   (1  15)
 +#define HCPTR_TTA(1  20)
 +#define HCPTR_TCPAC  (1  31)
 +
  /* Hyp Debug Configuration Register bits */
  #define HDCR_TDRA(1  11)
  #define HDCR_TDOSA   (1  10)
 @@ -144,6 +156,45 @@
  #else
  #define VTTBR_X  (5 - KVM_T0SZ)
  #endif
 +#define VTTBR_BADDR_SHIFT (VTTBR_X - 1)
 +#define VTTBR_BADDR_MASK  (((1LLU  (40 - VTTBR_X)) - 1)  
 VTTBR_BADDR_SHIFT)
 +#define VTTBR_VMID_SHIFT  (48LLU)
 +#define VTTBR_VMID_MASK(0xffLLU  VTTBR_VMID_SHIFT)
 +
 +/* Hyp Syndrome Register (HSR) bits */
 +#define 

Re: [PATCH 2/2] virtio-scsi: reset virtqueue affinity when doing cpu hotplug

2013-01-15 Thread Rusty Russell
Wanlong Gao gaowanl...@cn.fujitsu.com writes:
 Add hot cpu notifier to reset the request virtqueue affinity
 when doing cpu hotplug.

You need to be careful to get_online_cpus() and put_online_cpus() here,
so CPUs can't go up and down in the middle of operations.

In particular, get_online_cpus()/put_online_cpus() around calls to
virtscsi_set_affinity() (except within notifiers).

 +static int virtscsi_cpu_callback(struct notifier_block *nfb,
 +  unsigned long action, void *hcpu)
 +{
 + struct virtio_scsi *vscsi = container_of(nfb, struct virtio_scsi, nb);
 + switch(action) {
 + case CPU_ONLINE:
 + case CPU_ONLINE_FROZEN:
 + case CPU_DEAD:
 + case CPU_DEAD_FROZEN:
 + virtscsi_set_affinity(vscsi, true);
 + break;
 + default:
 + break;
 + }
 + return NOTIFY_OK;
 +}

You probably want to remove affinities *before* the CPU goes down, and
restore it after the CPU comes up.

So, how about:

switch (action  ~CPU_TASKS_FROZEN) {
case CPU_ONLINE:
case CPU_DOWN_FAILED:
virtscsi_set_affinity(vscsi, true, -1);
break;
case CPU_DOWN_PREPARE:
virtscsi_set_affinity(vscsi, true, (unsigned long)hcpu);
break;
}

The extra argument to virtscsi_set_affinity() is to tell it to ignore a
cpu which seems online (because it's going down).

  static void virtscsi_init_vq(struct virtio_scsi_vq *virtscsi_vq,
struct virtqueue *vq)
  {
 @@ -888,6 +909,13 @@ static int __devinit virtscsi_probe(struct virtio_device 
 *vdev)
   if (err)
   goto virtscsi_init_failed;
  
 + vscsi-nb.notifier_call = virtscsi_cpu_callback;
 + err = register_hotcpu_notifier(vscsi-nb);
 + if (err) {
 + pr_err(virtio_scsi: registering cpu notifier failed\n);
 + goto scsi_add_host_failed;
 + }
 +

You have to clean this up if scsi_add_host() fails, I think.

Cheers,
Rusty.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v5 03/14] KVM: ARM: Initial skeleton to compile KVM support

2013-01-15 Thread Rusty Russell
Christoffer Dall c.d...@virtualopensystems.com writes:

 On Mon, Jan 14, 2013 at 11:24 AM, Russell King - ARM Linux
 li...@arm.linux.org.uk wrote:
 On Tue, Jan 08, 2013 at 01:38:55PM -0500, Christoffer Dall wrote:
 + /* -ENOENT for unknown features, -EINVAL for invalid combinations. */
 + for (i = 0; i  sizeof(init-features)*8; i++) {
 + if (init-features[i / 32]  (1  (i % 32))) {

 Isn't this an open-coded version of test_bit() ?

 indeed, nicely spotted:

BTW, I wrote it that was out of excessive paranoia: it's a userspace
API, and test_bit() won't be right on 64 bit BE systems.

Cheers,
Rusty.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/2] virtio-scsi: reset virtqueue affinity when doing cpu hotplug

2013-01-15 Thread Wanlong Gao
On 01/16/2013 11:31 AM, Rusty Russell wrote:
 Wanlong Gao gaowanl...@cn.fujitsu.com writes:
 Add hot cpu notifier to reset the request virtqueue affinity
 when doing cpu hotplug.
 
 You need to be careful to get_online_cpus() and put_online_cpus() here,
 so CPUs can't go up and down in the middle of operations.
 
 In particular, get_online_cpus()/put_online_cpus() around calls to
 virtscsi_set_affinity() (except within notifiers).

Yes, I'll take care of this, thank you.

 
 +static int virtscsi_cpu_callback(struct notifier_block *nfb,
 + unsigned long action, void *hcpu)
 +{
 +struct virtio_scsi *vscsi = container_of(nfb, struct virtio_scsi, nb);
 +switch(action) {
 +case CPU_ONLINE:
 +case CPU_ONLINE_FROZEN:
 +case CPU_DEAD:
 +case CPU_DEAD_FROZEN:
 +virtscsi_set_affinity(vscsi, true);
 +break;
 +default:
 +break;
 +}
 +return NOTIFY_OK;
 +}
 
 You probably want to remove affinities *before* the CPU goes down, and
 restore it after the CPU comes up.
 
 So, how about:
 
 switch (action  ~CPU_TASKS_FROZEN) {
 case CPU_ONLINE:
 case CPU_DOWN_FAILED:
 virtscsi_set_affinity(vscsi, true, -1);
 break;
 case CPU_DOWN_PREPARE:
 virtscsi_set_affinity(vscsi, true, (unsigned long)hcpu);
 break;
 }
 
 The extra argument to virtscsi_set_affinity() is to tell it to ignore a
 cpu which seems online (because it's going down).

OK, thank you very much for teaching this.

 
  static void virtscsi_init_vq(struct virtio_scsi_vq *virtscsi_vq,
   struct virtqueue *vq)
  {
 @@ -888,6 +909,13 @@ static int __devinit virtscsi_probe(struct 
 virtio_device *vdev)
  if (err)
  goto virtscsi_init_failed;
  
 +vscsi-nb.notifier_call = virtscsi_cpu_callback;
 +err = register_hotcpu_notifier(vscsi-nb);
 +if (err) {
 +pr_err(virtio_scsi: registering cpu notifier failed\n);
 +goto scsi_add_host_failed;
 +}
 +
 
 You have to clean this up if scsi_add_host() fails, I think.

Yeah, will do, thank you very much. 


Regards,
Wanlong Gao

 
 Cheers,
 Rusty.
 --
 To unsubscribe from this list: send the line unsubscribe linux-kernel in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html
 Please read the FAQ at  http://www.tux.org/lkml/
 

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v5 07/14] KVM: ARM: World-switch implementation

2013-01-15 Thread Christoffer Dall
On Tue, Jan 15, 2013 at 9:08 PM, Christoffer Dall
c.d...@virtualopensystems.com wrote:
 On Tue, Jan 15, 2013 at 4:43 AM, Gleb Natapov g...@redhat.com wrote:
 On Tue, Jan 08, 2013 at 01:39:24PM -0500, Christoffer Dall wrote:
 Provides complete world-switch implementation to switch to other guests
 running in non-secure modes. Includes Hyp exception handlers that
 capture necessary exception information and stores the information on
 the VCPU and KVM structures.

 The following Hyp-ABI is also documented in the code:

 Hyp-ABI: Calling HYP-mode functions from host (in SVC mode):
Switching to Hyp mode is done through a simple HVC #0 instruction. The
exception vector code will check that the HVC comes from VMID==0 and if
so will push the necessary state (SPSR, lr_usr) on the Hyp stack.
- r0 contains a pointer to a HYP function
- r1, r2, and r3 contain arguments to the above function.
- The HYP function will be called with its arguments in r0, r1 and r2.
On HYP function return, we return directly to SVC.

 A call to a function executing in Hyp mode is performed like the following:

 svc code
 ldr r0, =BSYM(my_hyp_fn)
 ldr r1, =my_param
 hvc #0  ; Call my_hyp_fn(my_param) from HYP mode
 svc code

 Otherwise, the world-switch is pretty straight-forward. All state that
 can be modified by the guest is first backed up on the Hyp stack and the
 VCPU values is loaded onto the hardware. State, which is not loaded, but
 theoretically modifiable by the guest is protected through the
 virtualiation features to generate a trap and cause software emulation.
 Upon guest returns, all state is restored from hardware onto the VCPU
 struct and the original state is restored from the Hyp-stack onto the
 hardware.

 SMP support using the VMPIDR calculated on the basis of the host MPIDR
 and overriding the low bits with KVM vcpu_id contributed by Marc Zyngier.

 Reuse of VMIDs has been implemented by Antonios Motakis and adapated from
 a separate patch into the appropriate patches introducing the
 functionality. Note that the VMIDs are stored per VM as required by the ARM
 architecture reference manual.

 To support VFP/NEON we trap those instructions using the HPCTR. When
 we trap, we switch the FPU.  After a guest exit, the VFP state is
 returned to the host.  When disabling access to floating point
 instructions, we also mask FPEXC_EN in order to avoid the guest
 receiving Undefined instruction exceptions before we have a chance to
 switch back the floating point state.  We are reusing vfp_hard_struct,
 so we depend on VFPv3 being enabled in the host kernel, if not, we still
 trap cp10 and cp11 in order to inject an undefined instruction exception
 whenever the guest tries to use VFP/NEON. VFP/NEON developed by
 Antionios Motakis and Rusty Russell.

 Aborts that are permission faults, and not stage-1 page table walk, do
 not report the faulting address in the HPFAR.  We have to resolve the
 IPA, and store it just like the HPFAR register on the VCPU struct. If
 the IPA cannot be resolved, it means another CPU is playing with the
 page tables, and we simply restart the guest.  This quirk was fixed by
 Marc Zyngier.

 Reviewed-by: Marcelo Tosatti mtosa...@redhat.com
 Signed-off-by: Rusty Russell rusty.russ...@linaro.org
 Signed-off-by: Antonios Motakis a.mota...@virtualopensystems.com
 Signed-off-by: Marc Zyngier marc.zyng...@arm.com
 Signed-off-by: Christoffer Dall c.d...@virtualopensystems.com
 ---
  arch/arm/include/asm/kvm_arm.h  |   51 
  arch/arm/include/asm/kvm_host.h |   10 +
  arch/arm/kernel/asm-offsets.c   |   25 ++
  arch/arm/kvm/arm.c  |  187 
  arch/arm/kvm/interrupts.S   |  396 +++
  arch/arm/kvm/interrupts_head.S  |  443 
 +++
  6 files changed, 1108 insertions(+), 4 deletions(-)
  create mode 100644 arch/arm/kvm/interrupts_head.S

 diff --git a/arch/arm/include/asm/kvm_arm.h b/arch/arm/include/asm/kvm_arm.h
 index fb22ee8..a3262a2 100644
 --- a/arch/arm/include/asm/kvm_arm.h
 +++ b/arch/arm/include/asm/kvm_arm.h
 @@ -98,6 +98,18 @@
  #define TTBCR_T0SZ   3
  #define HTCR_MASK(TTBCR_T0SZ | TTBCR_IRGN0 | TTBCR_ORGN0 | TTBCR_SH0)

 +/* Hyp System Trap Register */
 +#define HSTR_T(x)(1  x)
 +#define HSTR_TTEE(1  16)
 +#define HSTR_TJDBX   (1  17)
 +
 +/* Hyp Coprocessor Trap Register */
 +#define HCPTR_TCP(x) (1  x)
 +#define HCPTR_TCP_MASK   (0x3fff)
 +#define HCPTR_TASE   (1  15)
 +#define HCPTR_TTA(1  20)
 +#define HCPTR_TCPAC  (1  31)
 +
  /* Hyp Debug Configuration Register bits */
  #define HDCR_TDRA(1  11)
  #define HDCR_TDOSA   (1  10)
 @@ -144,6 +156,45 @@
  #else
  #define VTTBR_X  (5 - KVM_T0SZ)
  #endif
 +#define VTTBR_BADDR_SHIFT (VTTBR_X - 1)
 +#define VTTBR_BADDR_MASK  (((1LLU  (40 - VTTBR_X)) - 1)  
 VTTBR_BADDR_SHIFT)
 +#define VTTBR_VMID_SHIFT  (48LLU)
 +#define VTTBR_VMID_MASK

Re: [Qemu-devel] [RFC PATCH 2/2] virtio-net: introduce a new control to set macaddr

2013-01-15 Thread Amos Kong
On Thu, Jan 10, 2013 at 10:57:05PM +0800, Jason Wang wrote:
 On 01/10/2013 10:45 PM, ak...@redhat.com wrote:
  From: Amos Kong ak...@redhat.com
 
  Currently we write MAC address to pci config space byte by byte,
  this means that we have an intermediate step where mac is wrong.
  This patch introduced a new control command to set MAC address
  in one time.
 
  VIRTIO_NET_F_CTRL_MAC_ADDR is a new feature bit for compatibility.
 
  Signed-off-by: Amos Kong ak...@redhat.com
  ---
   drivers/net/virtio_net.c| 16 +++-
   include/uapi/linux/virtio_net.h |  8 +++-
   2 files changed, 22 insertions(+), 2 deletions(-)
 
  diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
  index 395ab4f..ff22bcd 100644
  --- a/drivers/net/virtio_net.c
  +++ b/drivers/net/virtio_net.c
  @@ -803,13 +803,26 @@ static int virtnet_set_mac_address(struct net_device 
  *dev, void *p)
  struct virtio_device *vdev = vi-vdev;
  int ret;
   
  +   struct scatterlist sg;
  +
  ret = eth_mac_addr(dev, p);
  if (ret)
  return ret;
   
  -   if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC))
  +   if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR)) {
  +   /* Set MAC address by sending vq command */
  +   sg_init_one(sg, dev-dev_addr, dev-addr_len);
  +   virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
  +VIRTIO_NET_CTRL_MAC_ADDR_SET,
  +sg, 1, 0);
  +   return 0;
  +   }
  +
 
 Better to check the return of virtnet_send_command() and give a warn
 like other command. Btw, need we fail back to try the old way then?

Yes, it's necessary to check the return value of
virtnet_send_command().

In fail case, I like to return -EINVAL to userspace, because we don't
only want to set mac successfully, we also want to resolve the
addr inconsistent issue by this feature (vq cmd).

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 1/2] move virtnet_send_command() above virtnet_set_mac_address()

2013-01-15 Thread akong
From: Amos Kong ak...@redhat.com

We will send vq command to set mac address in virtnet_set_mac_address()
a little fix of coding style

Signed-off-by: Amos Kong ak...@redhat.com
---
 drivers/net/virtio_net.c | 89 
 1 file changed, 44 insertions(+), 45 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index a6fcf15..395ab4f 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -753,6 +753,50 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct 
net_device *dev)
return NETDEV_TX_OK;
 }
 
+/*
+ * Send command via the control virtqueue and check status.  Commands
+ * supported by the hypervisor, as indicated by feature bits, should
+ * never fail unless improperly formated.
+ */
+static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd,
+struct scatterlist *data, int out, int in)
+{
+   struct scatterlist *s, sg[VIRTNET_SEND_COMMAND_SG_MAX + 2];
+   struct virtio_net_ctrl_hdr ctrl;
+   virtio_net_ctrl_ack status = ~0;
+   unsigned int tmp;
+   int i;
+
+   /* Caller should know better */
+   BUG_ON(!virtio_has_feature(vi-vdev, VIRTIO_NET_F_CTRL_VQ) ||
+   (out + in  VIRTNET_SEND_COMMAND_SG_MAX));
+
+   out++; /* Add header */
+   in++; /* Add return status */
+
+   ctrl.class = class;
+   ctrl.cmd = cmd;
+
+   sg_init_table(sg, out + in);
+
+   sg_set_buf(sg[0], ctrl, sizeof(ctrl));
+   for_each_sg(data, s, out + in - 2, i)
+   sg_set_buf(sg[i + 1], sg_virt(s), s-length);
+   sg_set_buf(sg[out + in - 1], status, sizeof(status));
+
+   BUG_ON(virtqueue_add_buf(vi-cvq, sg, out, in, vi, GFP_ATOMIC)  0);
+
+   virtqueue_kick(vi-cvq);
+
+   /* Spin for a response, the kick causes an ioport write, trapping
+* into the hypervisor, so the request should be handled immediately.
+*/
+   while (!virtqueue_get_buf(vi-cvq, tmp))
+   cpu_relax();
+
+   return status == VIRTIO_NET_OK;
+}
+
 static int virtnet_set_mac_address(struct net_device *dev, void *p)
 {
struct virtnet_info *vi = netdev_priv(dev);
@@ -819,51 +863,6 @@ static void virtnet_netpoll(struct net_device *dev)
 }
 #endif
 
-/*
- * Send command via the control virtqueue and check status.  Commands
- * supported by the hypervisor, as indicated by feature bits, should
- * never fail unless improperly formated.
- */
-static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd,
-struct scatterlist *data, int out, int in)
-{
-   struct scatterlist *s, sg[VIRTNET_SEND_COMMAND_SG_MAX + 2];
-   struct virtio_net_ctrl_hdr ctrl;
-   virtio_net_ctrl_ack status = ~0;
-   unsigned int tmp;
-   int i;
-
-   /* Caller should know better */
-   BUG_ON(!virtio_has_feature(vi-vdev, VIRTIO_NET_F_CTRL_VQ) ||
-   (out + in  VIRTNET_SEND_COMMAND_SG_MAX));
-
-   out++; /* Add header */
-   in++; /* Add return status */
-
-   ctrl.class = class;
-   ctrl.cmd = cmd;
-
-   sg_init_table(sg, out + in);
-
-   sg_set_buf(sg[0], ctrl, sizeof(ctrl));
-   for_each_sg(data, s, out + in - 2, i)
-   sg_set_buf(sg[i + 1], sg_virt(s), s-length);
-   sg_set_buf(sg[out + in - 1], status, sizeof(status));
-
-   BUG_ON(virtqueue_add_buf(vi-cvq, sg, out, in, vi, GFP_ATOMIC)  0);
-
-   virtqueue_kick(vi-cvq);
-
-   /*
-* Spin for a response, the kick causes an ioport write, trapping
-* into the hypervisor, so the request should be handled immediately.
-*/
-   while (!virtqueue_get_buf(vi-cvq, tmp))
-   cpu_relax();
-
-   return status == VIRTIO_NET_OK;
-}
-
 static void virtnet_ack_link_announce(struct virtnet_info *vi)
 {
rtnl_lock();
-- 
1.7.11.7

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 2/2] virtio-net: introduce a new control to set macaddr

2013-01-15 Thread akong
From: Amos Kong ak...@redhat.com

Currently we write MAC address to pci config space byte by byte,
this means that we have an intermediate step where mac is wrong.
This patch introduced a new control command to set MAC address
in one time.

VIRTIO_NET_F_CTRL_MAC_ADDR is a new feature bit for compatibility.

Signed-off-by: Amos Kong ak...@redhat.com
---
 drivers/net/virtio_net.c| 24 +---
 include/uapi/linux/virtio_net.h |  8 +++-
 2 files changed, 24 insertions(+), 8 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 395ab4f..c8901b6 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -802,16 +802,25 @@ static int virtnet_set_mac_address(struct net_device 
*dev, void *p)
struct virtnet_info *vi = netdev_priv(dev);
struct virtio_device *vdev = vi-vdev;
int ret;
+   struct sockaddr *addr = p;
+   struct scatterlist sg;
 
-   ret = eth_mac_addr(dev, p);
-   if (ret)
-   return ret;
-
-   if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC))
+   if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR)) {
+   sg_init_one(sg, addr-sa_data, dev-addr_len);
+   if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
+ VIRTIO_NET_CTRL_MAC_ADDR_SET,
+ sg, 1, 0)) {
+   dev_warn(vdev-dev,
+Failed to set mac address by vq command.\n);
+   return -EINVAL;
+   }
+   } else if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC)) {
vdev-config-set(vdev, offsetof(struct virtio_net_config, mac),
- dev-dev_addr, dev-addr_len);
+ addr-sa_data, dev-addr_len);
+   }
+   ret = eth_mac_addr(dev, p);
 
-   return 0;
+   return ret;
 }
 
 static struct rtnl_link_stats64 *virtnet_stats(struct net_device *dev,
@@ -1627,6 +1636,7 @@ static unsigned int features[] = {
VIRTIO_NET_F_MRG_RXBUF, VIRTIO_NET_F_STATUS, VIRTIO_NET_F_CTRL_VQ,
VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN,
VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_MQ,
+   VIRTIO_NET_F_CTRL_MAC_ADDR,
 };
 
 static struct virtio_driver virtio_net_driver = {
diff --git a/include/uapi/linux/virtio_net.h b/include/uapi/linux/virtio_net.h
index 848e358..a5a8c88 100644
--- a/include/uapi/linux/virtio_net.h
+++ b/include/uapi/linux/virtio_net.h
@@ -53,6 +53,7 @@
 * network */
 #define VIRTIO_NET_F_MQ22  /* Device supports Receive Flow
 * Steering */
+#define VIRTIO_NET_F_CTRL_MAC_ADDR 23  /* Set MAC address */
 
 #define VIRTIO_NET_S_LINK_UP   1   /* Link is up */
 #define VIRTIO_NET_S_ANNOUNCE  2   /* Announcement is needed */
@@ -127,7 +128,7 @@ typedef __u8 virtio_net_ctrl_ack;
  #define VIRTIO_NET_CTRL_RX_NOBCAST  5
 
 /*
- * Control the MAC filter table.
+ * Control the MAC
  *
  * The MAC filter table is managed by the hypervisor, the guest should
  * assume the size is infinite.  Filtering should be considered
@@ -140,6 +141,10 @@ typedef __u8 virtio_net_ctrl_ack;
  * first sg list contains unicast addresses, the second is for multicast.
  * This functionality is present if the VIRTIO_NET_F_CTRL_RX feature
  * is available.
+ *
+ * The ADDR_SET command requests one out scatterlist, it contains a
+ * 6 bytes MAC address. This functionality is present if the
+ * VIRTIO_NET_F_CTRL_MAC_ADDR feature is available.
  */
 struct virtio_net_ctrl_mac {
__u32 entries;
@@ -148,6 +153,7 @@ struct virtio_net_ctrl_mac {
 
 #define VIRTIO_NET_CTRL_MAC1
  #define VIRTIO_NET_CTRL_MAC_TABLE_SET0
+ #define VIRTIO_NET_CTRL_MAC_ADDR_SET 1
 
 /*
  * Control VLAN filtering
-- 
1.7.11.7

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 0/2] make mac programming for virtio net more robust

2013-01-15 Thread akong
From: Amos Kong ak...@redhat.com

Currenly mac is programmed byte by byte. This means that we
have an intermediate step where mac is wrong. 

Second patch introduced a new vq control command to set mac
address in one time.

V2: check return of sending command, delay eth_mac_addr()

Amos Kong (2):
  move virtnet_send_command() above virtnet_set_mac_address()
  virtio-net: introduce a new control to set macaddr

 drivers/net/virtio_net.c| 113 ++--
 include/uapi/linux/virtio_net.h |   8 ++-
 2 files changed, 68 insertions(+), 53 deletions(-)

-- 
1.7.11.7

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[QEMU PATCH v2] virtio-net: introduce a new macaddr control

2013-01-15 Thread akong
From: Amos Kong ak...@redhat.com

In virtio-net guest driver, currently we write MAC address to
pci config space byte by byte, this means that we have an
intermediate step where mac is wrong. This patch introduced
a new control command to set MAC address in one time.

VIRTIO_NET_F_CTRL_MAC_ADDR is a new feature bit for compatibility.

Signed-off-by: Amos Kong ak...@redhat.com
---
V2: check guest's iov_len before memcpy
---
 hw/virtio-net.c | 10 ++
 hw/virtio-net.h |  9 -
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/hw/virtio-net.c b/hw/virtio-net.c
index dc7c6d6..d05f98f 100644
--- a/hw/virtio-net.c
+++ b/hw/virtio-net.c
@@ -247,6 +247,7 @@ static uint32_t virtio_net_get_features(VirtIODevice *vdev, 
uint32_t features)
 VirtIONet *n = to_virtio_net(vdev);
 
 features |= (1  VIRTIO_NET_F_MAC);
+features |= (1  VIRTIO_NET_F_CTRL_MAC_ADDR);
 
 if (!peer_has_vnet_hdr(n)) {
 features = ~(0x1  VIRTIO_NET_F_CSUM);
@@ -282,6 +283,7 @@ static uint32_t virtio_net_bad_features(VirtIODevice *vdev)
 /* Linux kernel 2.6.25.  It understood MAC (as everyone must),
  * but also these: */
 features |= (1  VIRTIO_NET_F_MAC);
+features |= (1  VIRTIO_NET_F_CTRL_MAC_ADDR);
 features |= (1  VIRTIO_NET_F_CSUM);
 features |= (1  VIRTIO_NET_F_HOST_TSO4);
 features |= (1  VIRTIO_NET_F_HOST_TSO6);
@@ -349,6 +351,14 @@ static int virtio_net_handle_mac(VirtIONet *n, uint8_t cmd,
 {
 struct virtio_net_ctrl_mac mac_data;
 
+if (cmd == VIRTIO_NET_CTRL_MAC_ADDR_SET  elem-out_num == 2 
+elem-out_sg[1].iov_len == ETH_ALEN) {
+/* Set MAC address */
+memcpy(n-mac, elem-out_sg[1].iov_base, elem-out_sg[1].iov_len);
+qemu_format_nic_info_str(n-nic-nc, n-mac);
+return VIRTIO_NET_OK;
+}
+
 if (cmd != VIRTIO_NET_CTRL_MAC_TABLE_SET || elem-out_num != 3 ||
 elem-out_sg[1].iov_len  sizeof(mac_data) ||
 elem-out_sg[2].iov_len  sizeof(mac_data))
diff --git a/hw/virtio-net.h b/hw/virtio-net.h
index d46fb98..9394cc0 100644
--- a/hw/virtio-net.h
+++ b/hw/virtio-net.h
@@ -44,6 +44,8 @@
 #define VIRTIO_NET_F_CTRL_VLAN  19  /* Control channel VLAN filtering */
 #define VIRTIO_NET_F_CTRL_RX_EXTRA 20   /* Extra RX mode control support */
 
+#define VIRTIO_NET_F_CTRL_MAC_ADDR   23 /* Set MAC address */
+
 #define VIRTIO_NET_S_LINK_UP1   /* Link is up */
 
 #define TX_TIMER_INTERVAL 15 /* 150 us */
@@ -106,7 +108,7 @@ typedef uint8_t virtio_net_ctrl_ack;
  #define VIRTIO_NET_CTRL_RX_MODE_NOBCAST  5
 
 /*
- * Control the MAC filter table.
+ * Control the MAC
  *
  * The MAC filter table is managed by the hypervisor, the guest should
  * assume the size is infinite.  Filtering should be considered
@@ -119,6 +121,10 @@ typedef uint8_t virtio_net_ctrl_ack;
  * first sg list contains unicast addresses, the second is for multicast.
  * This functionality is present if the VIRTIO_NET_F_CTRL_RX feature
  * is available.
+ *
+ * The ADDR_SET command requests one out scatterlist, it contains a
+ * 6 bytes MAC address. This functionality is present if the
+ * VIRTIO_NET_F_CTRL_MAC_ADDR feature is available.
  */
 struct virtio_net_ctrl_mac {
 uint32_t entries;
@@ -126,6 +132,7 @@ struct virtio_net_ctrl_mac {
 };
 #define VIRTIO_NET_CTRL_MAC1
  #define VIRTIO_NET_CTRL_MAC_TABLE_SET0
+ #define VIRTIO_NET_CTRL_MAC_ADDR_SET 1
 
 /*
  * Control VLAN filtering
-- 
1.7.11.7

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 2/2] virtio-net: introduce a new control to set macaddr

2013-01-15 Thread Jason Wang
On Wednesday, January 16, 2013 01:57:01 PM ak...@redhat.com wrote:
 From: Amos Kong ak...@redhat.com
 
 Currently we write MAC address to pci config space byte by byte,
 this means that we have an intermediate step where mac is wrong.
 This patch introduced a new control command to set MAC address
 in one time.
 
 VIRTIO_NET_F_CTRL_MAC_ADDR is a new feature bit for compatibility.
 
 Signed-off-by: Amos Kong ak...@redhat.com
 ---
  drivers/net/virtio_net.c| 24 +---
  include/uapi/linux/virtio_net.h |  8 +++-
  2 files changed, 24 insertions(+), 8 deletions(-)
 
 diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
 index 395ab4f..c8901b6 100644
 --- a/drivers/net/virtio_net.c
 +++ b/drivers/net/virtio_net.c
 @@ -802,16 +802,25 @@ static int virtnet_set_mac_address(struct net_device
 *dev, void *p) struct virtnet_info *vi = netdev_priv(dev);
   struct virtio_device *vdev = vi-vdev;
   int ret;
 + struct sockaddr *addr = p;
 + struct scatterlist sg;
 
 - ret = eth_mac_addr(dev, p);
 - if (ret)
 - return ret;
 -
 - if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC))
 + if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR)) {
 + sg_init_one(sg, addr-sa_data, dev-addr_len);
 + if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
 +   VIRTIO_NET_CTRL_MAC_ADDR_SET,
 +   sg, 1, 0)) {
 + dev_warn(vdev-dev,
 +  Failed to set mac address by vq command.\n);
 + return -EINVAL;
 + }
 + } else if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC)) {
   vdev-config-set(vdev, offsetof(struct virtio_net_config, mac),
 -   dev-dev_addr, dev-addr_len);
 +   addr-sa_data, dev-addr_len);
 + }
 + ret = eth_mac_addr(dev, p);
 

The you will the validity check in eth_mac_addr which may result a wrong mac 
address to be set in the hardware (or is there any check in qemu) and a 
inconsistency bettween what kernel assumes and qemu has.

You can take a look at netvsc driver that calls eth_mac_addr() first and 
restore the software mac address when fail to enforce it to hardware.

Thanks
 - return 0;
 + return ret;
  }
 
  static struct rtnl_link_stats64 *virtnet_stats(struct net_device *dev,
 @@ -1627,6 +1636,7 @@ static unsigned int features[] = {
   VIRTIO_NET_F_MRG_RXBUF, VIRTIO_NET_F_STATUS, VIRTIO_NET_F_CTRL_VQ,
   VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN,
   VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_MQ,
 + VIRTIO_NET_F_CTRL_MAC_ADDR,
  };
 
  static struct virtio_driver virtio_net_driver = {
 diff --git a/include/uapi/linux/virtio_net.h
 b/include/uapi/linux/virtio_net.h index 848e358..a5a8c88 100644
 --- a/include/uapi/linux/virtio_net.h
 +++ b/include/uapi/linux/virtio_net.h
 @@ -53,6 +53,7 @@
* network */
  #define VIRTIO_NET_F_MQ  22  /* Device supports Receive Flow
* Steering */
 +#define VIRTIO_NET_F_CTRL_MAC_ADDR 23/* Set MAC address */
 
  #define VIRTIO_NET_S_LINK_UP 1   /* Link is up */
  #define VIRTIO_NET_S_ANNOUNCE2   /* Announcement is needed */
 @@ -127,7 +128,7 @@ typedef __u8 virtio_net_ctrl_ack;
   #define VIRTIO_NET_CTRL_RX_NOBCAST  5
 
  /*
 - * Control the MAC filter table.
 + * Control the MAC
   *
   * The MAC filter table is managed by the hypervisor, the guest should
   * assume the size is infinite.  Filtering should be considered
 @@ -140,6 +141,10 @@ typedef __u8 virtio_net_ctrl_ack;
   * first sg list contains unicast addresses, the second is for multicast.
   * This functionality is present if the VIRTIO_NET_F_CTRL_RX feature
   * is available.
 + *
 + * The ADDR_SET command requests one out scatterlist, it contains a
 + * 6 bytes MAC address. This functionality is present if the
 + * VIRTIO_NET_F_CTRL_MAC_ADDR feature is available.
   */
  struct virtio_net_ctrl_mac {
   __u32 entries;
 @@ -148,6 +153,7 @@ struct virtio_net_ctrl_mac {
 
  #define VIRTIO_NET_CTRL_MAC1
   #define VIRTIO_NET_CTRL_MAC_TABLE_SET0
 + #define VIRTIO_NET_CTRL_MAC_ADDR_SET 1
 
  /*
   * Control VLAN filtering
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [QEMU PATCH v2] virtio-net: introduce a new macaddr control

2013-01-15 Thread Jason Wang
On Wednesday, January 16, 2013 02:16:47 PM ak...@redhat.com wrote:
 From: Amos Kong ak...@redhat.com
 
 In virtio-net guest driver, currently we write MAC address to
 pci config space byte by byte, this means that we have an
 intermediate step where mac is wrong. This patch introduced
 a new control command to set MAC address in one time.
 
 VIRTIO_NET_F_CTRL_MAC_ADDR is a new feature bit for compatibility.
 
 Signed-off-by: Amos Kong ak...@redhat.com
 ---
 V2: check guest's iov_len before memcpy
 ---
  hw/virtio-net.c | 10 ++
  hw/virtio-net.h |  9 -
  2 files changed, 18 insertions(+), 1 deletion(-)
 
 diff --git a/hw/virtio-net.c b/hw/virtio-net.c
 index dc7c6d6..d05f98f 100644
 --- a/hw/virtio-net.c
 +++ b/hw/virtio-net.c
 @@ -247,6 +247,7 @@ static uint32_t virtio_net_get_features(VirtIODevice
 *vdev, uint32_t features) VirtIONet *n = to_virtio_net(vdev);
 
  features |= (1  VIRTIO_NET_F_MAC);
 +features |= (1  VIRTIO_NET_F_CTRL_MAC_ADDR);
 
  if (!peer_has_vnet_hdr(n)) {
  features = ~(0x1  VIRTIO_NET_F_CSUM);
 @@ -282,6 +283,7 @@ static uint32_t virtio_net_bad_features(VirtIODevice
 *vdev) /* Linux kernel 2.6.25.  It understood MAC (as everyone must), * but
 also these: */
  features |= (1  VIRTIO_NET_F_MAC);
 +features |= (1  VIRTIO_NET_F_CTRL_MAC_ADDR);
  features |= (1  VIRTIO_NET_F_CSUM);
  features |= (1  VIRTIO_NET_F_HOST_TSO4);
  features |= (1  VIRTIO_NET_F_HOST_TSO6);
 @@ -349,6 +351,14 @@ static int virtio_net_handle_mac(VirtIONet *n, uint8_t
 cmd, {
  struct virtio_net_ctrl_mac mac_data;
 
 +if (cmd == VIRTIO_NET_CTRL_MAC_ADDR_SET  elem-out_num == 2 
 +elem-out_sg[1].iov_len == ETH_ALEN) {
 +/* Set MAC address */
 +memcpy(n-mac, elem-out_sg[1].iov_base, elem-out_sg[1].iov_len);
 +qemu_format_nic_info_str(n-nic-nc, n-mac);
 +return VIRTIO_NET_OK;
 +}
 +
  if (cmd != VIRTIO_NET_CTRL_MAC_TABLE_SET || elem-out_num != 3 ||
  elem-out_sg[1].iov_len  sizeof(mac_data) ||
  elem-out_sg[2].iov_len  sizeof(mac_data))
 diff --git a/hw/virtio-net.h b/hw/virtio-net.h
 index d46fb98..9394cc0 100644
 --- a/hw/virtio-net.h
 +++ b/hw/virtio-net.h
 @@ -44,6 +44,8 @@
  #define VIRTIO_NET_F_CTRL_VLAN  19  /* Control channel VLAN filtering
 */ #define VIRTIO_NET_F_CTRL_RX_EXTRA 20   /* Extra RX mode control support
 */
 
 +#define VIRTIO_NET_F_CTRL_MAC_ADDR   23 /* Set MAC address */
 +

I wonder whether we need a DEFINE_PROP_BIT to disable and compat this feature. 
Consider we may migrate from a new version to an old version.
  #define VIRTIO_NET_S_LINK_UP1   /* Link is up */
 
  #define TX_TIMER_INTERVAL 15 /* 150 us */
 @@ -106,7 +108,7 @@ typedef uint8_t virtio_net_ctrl_ack;
   #define VIRTIO_NET_CTRL_RX_MODE_NOBCAST  5
 
  /*
 - * Control the MAC filter table.
 + * Control the MAC
   *
   * The MAC filter table is managed by the hypervisor, the guest should
   * assume the size is infinite.  Filtering should be considered
 @@ -119,6 +121,10 @@ typedef uint8_t virtio_net_ctrl_ack;
   * first sg list contains unicast addresses, the second is for multicast.
   * This functionality is present if the VIRTIO_NET_F_CTRL_RX feature
   * is available.
 + *
 + * The ADDR_SET command requests one out scatterlist, it contains a
 + * 6 bytes MAC address. This functionality is present if the
 + * VIRTIO_NET_F_CTRL_MAC_ADDR feature is available.
   */
  struct virtio_net_ctrl_mac {
  uint32_t entries;
 @@ -126,6 +132,7 @@ struct virtio_net_ctrl_mac {
  };
  #define VIRTIO_NET_CTRL_MAC1
   #define VIRTIO_NET_CTRL_MAC_TABLE_SET0
 + #define VIRTIO_NET_CTRL_MAC_ADDR_SET 1
 
  /*
   * Control VLAN filtering
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] virtio-spec: set mac address by a new vq command

2013-01-15 Thread akong
From: Amos Kong ak...@redhat.com

Virtio-net driver currently programs MAC address byte by byte,
this means that we have an intermediate step where mac is wrong.
This patch introduced a new control command to set MAC address
in one time, and added a new feature flag VIRTIO_NET_F_MAC_ADDR
for this feature.

Signed-off-by: Amos Kong ak...@redhat.com
---
 virtio-spec.lyx | 45 -
 1 file changed, 44 insertions(+), 1 deletion(-)

diff --git a/virtio-spec.lyx b/virtio-spec.lyx
index 1ba9992..03f5a49 100644
--- a/virtio-spec.lyx
+++ b/virtio-spec.lyx
@@ -56,6 +56,7 @@
 \html_math_output 0
 \html_css_as_file 0
 \html_be_strict false
+\author -1930653948 Amos Kong ak...@redhat.com
 \author -608949062 Rusty Russell,,, 
 \author -385801441 Cornelia Huck cornelia.h...@de.ibm.com
 \author 1112500848 Rusty Russell ru...@rustcorp.com.au
@@ -4391,6 +4392,14 @@ VIRTIO_NET_F_GUEST_ANNOUNCE(21) Guest can send 
gratuitous packets.
 
 \change_inserted 1986246365 1352742808
 VIRTIO_NET_F_MQ(22) Device supports multiqueue with automatic receive steering.
+\change_inserted -1930653948 1358319033
+
+\end_layout
+
+\begin_layout Description
+
+\change_inserted -1930653948 1358319080
+VIRTIO_NET_F_CTRL_MAC_ADDR(23) Set MAC address.
 \change_unchanged
 
 \end_layout
@@ -5284,7 +5293,11 @@ The class VIRTIO_NET_CTRL_RX has two commands: 
VIRTIO_NET_CTRL_RX_PROMISC
 \end_layout
 
 \begin_layout Subsubsection*
-Setting MAC Address Filtering
+Setting MAC Address
+\change_deleted -1930653948 1358318470
+ Filtering
+\change_unchanged
+
 \end_layout
 
 \begin_layout Standard
@@ -5324,6 +5337,17 @@ struct virtio_net_ctrl_mac {
 \begin_layout Plain Layout
 
  #define VIRTIO_NET_CTRL_MAC_TABLE_SET0 
+\change_inserted -1930653948 1358318313
+
+\end_layout
+
+\begin_layout Plain Layout
+
+\change_inserted -1930653948 1358318331
+
+ #define VIRTIO_NET_CTRL_MAC_ADDR_SET 1
+\change_unchanged
+
 \end_layout
 
 \end_inset
@@ -5349,6 +5373,25 @@ T_CTRL_MAC_TABLE_SET.
  The command-specific-data is two variable length tables of 6-byte MAC 
addresses.
  The first table contains unicast addresses, and the second contains multicast
  addresses.
+\change_inserted -1930653948 1358318545
+
+\end_layout
+
+\begin_layout Standard
+
+\change_inserted -1930653948 1358320004
+The command VIRTIO_NET_CTRL_MAC_ADDR_SET is used to set 
+\begin_inset Quotes eld
+\end_inset
+
+physical
+\begin_inset Quotes erd
+\end_inset
+
+ address of the network card.
+ The command-specific-data is a 6-byte MAC address.
+\change_unchanged
+
 \end_layout
 
 \begin_layout Subsection*
-- 
1.7.11.7

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Higher (quadratical) resolution

2013-01-15 Thread Dennis Böck
Dear KVM-Mailing-List,

since we use quadratically monitors with 2048x2048 pixels and we need to work 
in full-screen mode, I would like to use such a resolution in my VM. I tried it 
with all available graphic cards (-cirrus -std -vmware and -qxl) but it didn't 
work.
Do I understand correctly that according to the KVM-FAQ 
(http://www.linux-kvm.org/page/FAQ#Can_I_have_higher_or_widescreen_resolutions_.28eg_1680_x_1050.29_in_KVM.3F)
 I have to change the KVM-sources and compile my own KVM? Since I have no idea 
how to do it, I would be thankfull to get some instructions for it.
Is it only possible to change the resolution for the -vga graphic card or for 
the -qxl too? (since I would like to use spice instead of vnc in order to 
connect to the VM)

Best regards and thanks in advance
Dennis
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html