[PATCH] KVM: x86: Fix uninitialized return code

2012-12-02 Thread Jan Kiszka
From: Jan Kiszka jan.kis...@siemens.com

This is a regression caused by 18595411a7.

Signed-off-by: Jan Kiszka jan.kis...@siemens.com
---
 arch/x86/kvm/x86.c |1 +
 1 files changed, 1 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b0b8abe..04ced33 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3006,6 +3006,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
break;
}
case KVM_SET_LAPIC: {
+   r = -EFAULT;
if (!vcpu->arch.apic)
goto out;
u.lapic = memdup_user(argp, sizeof(*u.lapic));
-- 
1.7.3.4
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2] KVM: x86: Fix uninitialized return code

2012-12-02 Thread Jan Kiszka
From: Jan Kiszka jan.kis...@siemens.com

This is a regression caused by 18595411a7.

Signed-off-by: Jan Kiszka jan.kis...@siemens.com
---

Sorry, copypasted wrong error code.

 arch/x86/kvm/x86.c |1 +
 1 files changed, 1 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b0b8abe..3bdaf29 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3006,6 +3006,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
break;
}
case KVM_SET_LAPIC: {
+   r = -EINVAL;
if (!vcpu->arch.apic)
goto out;
u.lapic = memdup_user(argp, sizeof(*u.lapic));
-- 
1.7.3.4
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


high host load from vmx_set_supported_cpuid call?

2012-12-02 Thread Nikola Ciprich
Hi,

while trying to find the source of KVM guest slowness, I noticed the following in perf
top:

  326.00 19.6% vmx_set_supported_cpuid
/lib/modules/3.0.53lb6.02/kernel/arch/x86/kvm/kvm-intel.ko
  108.00  6.5% kvm_arch_dev_ioctl 
/lib/modules/3.0.53lb6.02/kernel/arch/x86/kvm/kvm.ko  
  100.00  6.0% tick_dev_program_event 
[kernel.kallsyms] 
   78.00  4.7% __remove_hrtimer   
[kernel.kallsyms] 
   52.00  3.1% acpi_processor_reevaluate_tstate   
/lib/modules/3.0.53lb6.02/kernel/drivers/acpi/processor.ko
   45.00  2.7% find_busiest_group 
[kernel.kallsyms] 
   43.00  2.6% do_raw_spin_lock   
[kernel.kallsyms] 

my question is, is this normal? I tried googling for vmx_set_supported_cpuid 
but am not
really clever about it...

here's snippet from trace-cmd:

version = 6
CPU 11 is empty
cpus=12
qemu-kvm-7766  [000] 235066.551604: kvm_msi_set_irq:  dst 1 vec 51 
(LowPrio|logical|edge|rh)
qemu-kvm-7766  [000] 235066.551606: kvm_apic_accept_irq:  apicid 0 vec 
81 (LowPrio|edge)
qemu-kvm-7767  [001] 235066.551618: kvm_inj_virq: irq 81
qemu-kvm-7767  [001] 235066.551620: kvm_entry:vcpu 0
qemu-kvm-7767  [001] 235066.551625: kvm_exit: reason 
EPT_MISCONFIG rip 0x81023d96 info 0 0
qemu-kvm-7767  [001] 235066.551630: kvm_emulate_insn: [FAILED TO 
PARSE] rip=18446744071578992022 csbase=0 len=2 insn=ARRAY[8b, 00, c9, 89, c0, 
c3,
 0f, 1f, 40, 00, 55, 48, 89, e5, 0f] flags=9 failed=0
qemu-kvm-7767  [001] 235066.551631: vcpu_match_mmio:  gva 
0xc90040f0 gpa 0xfed000f0 Read GPA
qemu-kvm-7767  [001] 235066.551632: kvm_mmio: mmio 
unsatisfied-read len 4 gpa 0xfed000f0 val 0x0
qemu-kvm-7767  [001] 235066.551634: kvm_userspace_exit:   reason 
KVM_EXIT_MMIO (6)
qemu-kvm-7767  [001] 235066.551648: kvm_mmio: mmio read len 
4 gpa 0xfed000f0 val 0xfed25f11
qemu-kvm-7767  [001] 235066.551649: kvm_entry:vcpu 0
qemu-kvm-7767  [001] 235066.551650: kvm_exit: reason 
EPT_MISCONFIG rip 0x81023d96 info 0 0
qemu-kvm-7767  [001] 235066.551652: kvm_emulate_insn: [FAILED TO 
PARSE] rip=18446744071578992022 csbase=0 len=2 insn=ARRAY[8b, 00, c9, 89, c0, 
c3,
 0f, 1f, 40, 00, 55, 48, 89, e5, 0f] flags=9 failed=0
qemu-kvm-7767  [001] 235066.551652: vcpu_match_mmio:  gva 
0xc90040f0 gpa 0xfed000f0 Read GPA
qemu-kvm-7767  [001] 235066.551653: kvm_mmio: mmio 
unsatisfied-read len 4 gpa 0xfed000f0 val 0x0
qemu-kvm-7767  [001] 235066.551653: kvm_userspace_exit:   reason 
KVM_EXIT_MMIO (6)
qemu-kvm-7767  [001] 235066.551656: kvm_mmio: mmio read len 
4 gpa 0xfed000f0 val 0xfed26450
qemu-kvm-7767  [001] 235066.551657: kvm_entry:vcpu 0
qemu-kvm-7767  [001] 235066.551659: kvm_exit: reason 
EPT_MISCONFIG rip 0x81023d96 info 0 0
qemu-kvm-7767  [001] 235066.551660: kvm_emulate_insn: [FAILED TO 
PARSE] rip=18446744071578992022 csbase=0 len=2 insn=ARRAY[8b, 00, c9, 89, c0, 
c3,
 0f, 1f, 40, 00, 55, 48, 89, e5, 0f] flags=9 failed=0
qemu-kvm-7767  [001] 235066.551660: vcpu_match_mmio:  gva 
0xc90040f0 gpa 0xfed000f0 Read GPA
qemu-kvm-7767  [001] 235066.551660: kvm_mmio: mmio 
unsatisfied-read len 4 gpa 0xfed000f0 val 0x0
qemu-kvm-7767  [001] 235066.551661: kvm_userspace_exit:   reason 
KVM_EXIT_MMIO (6)
qemu-kvm-7767  [001] 235066.551663: kvm_mmio: mmio read len 
4 gpa 0xfed000f0 val 0xfed2672b
qemu-kvm-7767  [001] 235066.551664: kvm_entry:vcpu 0
qemu-kvm-7767  [001] 235066.551667: kvm_exit: reason 
APIC_ACCESS rip 0x810219eb info 10b0 0
qemu-kvm-7767  [001] 235066.551667: kvm_apic: apic_write 
APIC_EOI = 0x0
qemu-kvm-7767  [001] 235066.551668: kvm_eoi:  apicid 0 
vector 81
qemu-kvm-7767  [001] 235066.551668: kvm_entry:vcpu 0
qemu-kvm-7767  [001] 235066.551681: kvm_exit: reason 
EPT_MISCONFIG rip 0x81023d96 info 0 0
qemu-kvm-7767  [001] 235066.551682: kvm_emulate_insn: [FAILED TO 
PARSE] rip=18446744071578992022 csbase=0 len=2 insn=ARRAY[8b, 00, c9, 89, c0, 
c3,
 0f, 1f, 40, 00, 55, 48, 89, e5, 0f] flags=9 failed=0
qemu-kvm-7767  [001] 235066.551683: vcpu_match_mmio:  gva 
0xc90040f0 gpa 0xfed000f0 Read GPA
qemu-kvm-7767  [001] 235066.551683: 

Re: KVM VMX: register state after reset violates spec

2012-12-02 Thread Gleb Natapov
On Thu, Nov 29, 2012 at 03:07:38PM +0100, Julian Stecklina wrote:
 Hello,
 
 we have noticed that at least on 3.6.8 with VMX after a VCPU has been
 reset via the INIT-SIPI-SIPI sequence its register state violates
 Intel's specification.
 
 Specifically for our case we see at the end of vmx_vcpu_reset the
 following vcpu state:
 
 regs_avail=ffef regs_dirty=00010010
 EIP= EAX=06e8 EBX=0001 ECX=8001 EDX=0600
 ESI=d238 EDI= EBP= ESP=
 
 although EAX, EBX, ECX, ESI, EDI, EBP, ESP should _all_ be zero. See
 http://download.intel.com/products/processor/manual/253668.pdf section
 9.1.1 (page 9-2).
 
 Shouldn't vmx_vcpu_reset actively clear those registers? And from a
 quick glance at the SVM code the problem might exist there, too.
 
It should, so why not move the fix to kvm_vcpu_reset() so it will work
for both. Also what about R8-R15? Intel SDM says nothing about them in
the section you mention, but in Volume 1 section 3.4.1.1 it says:

 Registers only available in 64-bit mode (R8-R15 and XMM8-XMM15) are
 preserved across transitions from 64-bit mode into compatibility mode
 then back into 64-bit mode. However, values of R8-R15 and XMM8-XMM15
 are undefined after transitions from 64-bit mode through compatibility
 mode to legacy or real mode and then back through compatibility mode to
 64-bit mode.

I take it that they are undefined on the first transition to 64-bit mode
too. The AMD spec says that they should be zeroed on reset, so let's do that.
Also SVM does not set EDX to the correct value on reset. It should be:

 Stepping ID (bits 3:0)—This field identifies the processor-revision level.
 Extended Model (bits 19:16) and Model (bits 7:4)—These fields combine to
   differentiate processor models within a instruction family. For
   example, two processors may share the same microarchitecture but
   differ in their feature set. Such processors are considered different
   models within the same instruction family. This is a split field,
   comprising an extended-model portion in bits 19:16 with a legacy
   portion in bits 7:4
 Extended Family (bits 27:20) and Family (bits 11:8)—These fields combine to
   differentiate processors by their microarchitecture.

 A workaround is to use qemu-kvm with -kvm-no-irqchip.
 
 Julian
 
 --
 To unsubscribe from this list: send the line unsubscribe kvm in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html

--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: high host load from vmx_set_supported_cpuid call?

2012-12-02 Thread Gleb Natapov
On Sun, Dec 02, 2012 at 12:41:37PM +0100, Nikola Ciprich wrote:
 Hi,
 
 while trying to find source of KVM guest slowness, I noticed following in 
 perf top:
 
   326.00 19.6% vmx_set_supported_cpuid
 /lib/modules/3.0.53lb6.02/kernel/arch/x86/kvm/kvm-intel.ko
Something is wrong with your symbols. This function cannot take that much time.
It is three and a half instructions long and should be called only once
during vm startup.


   108.00  6.5% kvm_arch_dev_ioctl 
 /lib/modules/3.0.53lb6.02/kernel/arch/x86/kvm/kvm.ko  
   100.00  6.0% tick_dev_program_event 
 [kernel.kallsyms] 
78.00  4.7% __remove_hrtimer   
 [kernel.kallsyms] 
52.00  3.1% acpi_processor_reevaluate_tstate   
 /lib/modules/3.0.53lb6.02/kernel/drivers/acpi/processor.ko
45.00  2.7% find_busiest_group 
 [kernel.kallsyms] 
43.00  2.6% do_raw_spin_lock   
 [kernel.kallsyms] 
 
 my question is, is this normal? I tried googling for vmx_set_supported_cpuid 
 but am not
 really clever about it...
 
 here's snippet from trace-cmd:
 
 version = 6
 CPU 11 is empty
 cpus=12
 qemu-kvm-7766  [000] 235066.551604: kvm_msi_set_irq:  dst 1 vec 
 51 (LowPrio|logical|edge|rh)
 qemu-kvm-7766  [000] 235066.551606: kvm_apic_accept_irq:  apicid 0 
 vec 81 (LowPrio|edge)
 qemu-kvm-7767  [001] 235066.551618: kvm_inj_virq: irq 81
 qemu-kvm-7767  [001] 235066.551620: kvm_entry:vcpu 0
 qemu-kvm-7767  [001] 235066.551625: kvm_exit: reason 
 EPT_MISCONFIG rip 0x81023d96 info 0 0
 qemu-kvm-7767  [001] 235066.551630: kvm_emulate_insn: [FAILED TO 
 PARSE] rip=18446744071578992022 csbase=0 len=2 insn=ARRAY[8b, 00, c9, 89, c0, 
 c3,
  0f, 1f, 40, 00, 55, 48, 89, e5, 0f] flags=9 failed=0
 qemu-kvm-7767  [001] 235066.551631: vcpu_match_mmio:  gva 
 0xc90040f0 gpa 0xfed000f0 Read GPA
 qemu-kvm-7767  [001] 235066.551632: kvm_mmio: mmio 
 unsatisfied-read len 4 gpa 0xfed000f0 val 0x0
 qemu-kvm-7767  [001] 235066.551634: kvm_userspace_exit:   reason 
 KVM_EXIT_MMIO (6)
 qemu-kvm-7767  [001] 235066.551648: kvm_mmio: mmio read 
 len 4 gpa 0xfed000f0 val 0xfed25f11
 qemu-kvm-7767  [001] 235066.551649: kvm_entry:vcpu 0
 qemu-kvm-7767  [001] 235066.551650: kvm_exit: reason 
 EPT_MISCONFIG rip 0x81023d96 info 0 0
 qemu-kvm-7767  [001] 235066.551652: kvm_emulate_insn: [FAILED TO 
 PARSE] rip=18446744071578992022 csbase=0 len=2 insn=ARRAY[8b, 00, c9, 89, c0, 
 c3,
  0f, 1f, 40, 00, 55, 48, 89, e5, 0f] flags=9 failed=0
 qemu-kvm-7767  [001] 235066.551652: vcpu_match_mmio:  gva 
 0xc90040f0 gpa 0xfed000f0 Read GPA
 qemu-kvm-7767  [001] 235066.551653: kvm_mmio: mmio 
 unsatisfied-read len 4 gpa 0xfed000f0 val 0x0
 qemu-kvm-7767  [001] 235066.551653: kvm_userspace_exit:   reason 
 KVM_EXIT_MMIO (6)
 qemu-kvm-7767  [001] 235066.551656: kvm_mmio: mmio read 
 len 4 gpa 0xfed000f0 val 0xfed26450
 qemu-kvm-7767  [001] 235066.551657: kvm_entry:vcpu 0
 qemu-kvm-7767  [001] 235066.551659: kvm_exit: reason 
 EPT_MISCONFIG rip 0x81023d96 info 0 0
 qemu-kvm-7767  [001] 235066.551660: kvm_emulate_insn: [FAILED TO 
 PARSE] rip=18446744071578992022 csbase=0 len=2 insn=ARRAY[8b, 00, c9, 89, c0, 
 c3,
  0f, 1f, 40, 00, 55, 48, 89, e5, 0f] flags=9 failed=0
 qemu-kvm-7767  [001] 235066.551660: vcpu_match_mmio:  gva 
 0xc90040f0 gpa 0xfed000f0 Read GPA
 qemu-kvm-7767  [001] 235066.551660: kvm_mmio: mmio 
 unsatisfied-read len 4 gpa 0xfed000f0 val 0x0
 qemu-kvm-7767  [001] 235066.551661: kvm_userspace_exit:   reason 
 KVM_EXIT_MMIO (6)
 qemu-kvm-7767  [001] 235066.551663: kvm_mmio: mmio read 
 len 4 gpa 0xfed000f0 val 0xfed2672b
 qemu-kvm-7767  [001] 235066.551664: kvm_entry:vcpu 0
 qemu-kvm-7767  [001] 235066.551667: kvm_exit: reason 
 APIC_ACCESS rip 0x810219eb info 10b0 0
 qemu-kvm-7767  [001] 235066.551667: kvm_apic: apic_write 
 APIC_EOI = 0x0
 qemu-kvm-7767  [001] 235066.551668: kvm_eoi:  apicid 0 
 vector 81
 qemu-kvm-7767  [001] 235066.551668: kvm_entry:vcpu 0
 qemu-kvm-7767  [001] 235066.551681: kvm_exit: reason 
 EPT_MISCONFIG rip 0x81023d96 info 0 0
 qemu-kvm-7767  [001] 235066.551682: 

Re: high host load from vmx_set_supported_cpuid call?

2012-12-02 Thread Nikola Ciprich
Hi Gleb,

 Something wrong with your symbols. This function cannot take that much.
 It is three and a half instruction long and should be called only once
 during vm startup.

well, it didn't make any sense to me, glad I wasn't that wrong :)
how could that be? I guess it could be perf/kernel mismatch right?
I'll try to fix that and see if it helps..

thanks for Your time!
n.


-- 
-
Ing. Nikola CIPRICH
LinuxBox.cz, s.r.o.
28. rijna 168, 709 00 Ostrava

tel.:   +420 591 166 214
fax:+420 596 621 273
mobil:  +420 777 093 799

www.linuxbox.cz

mobil servis: +420 737 238 656
email servis: ser...@linuxbox.cz
-


pgpMhzIV2En3z.pgp
Description: PGP signature


Re: high host load from vmx_set_supported_cpuid call?

2012-12-02 Thread Gleb Natapov
On Sun, Dec 02, 2012 at 03:31:08PM +0100, Nikola Ciprich wrote:
 Hi Gleb,
 
  Something wrong with your symbols. This function cannot take that much.
  It is three and a half instruction long and should be called only once
  during vm startup.
 
 well, it didn't make any sense to me, glad I wasn't that wrong :)
 how could that be? I guess it could be perf/kernel mismatch right?
 I'll try to fix that and see if it helps..
 
More like loaded modules/installed modules mismatch.

--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: high host load from vmx_set_supported_cpuid call?

2012-12-02 Thread Nikola Ciprich
 More like loaded modules/installed modules mismatch.
I see, the problem is, that I've got kvm-kmod compiled separately!
thus kvm*.ko symbols don't match!
I see that kvm-kmod build produces System.map file, I guess I need to
merge it with kernel's System.map?

-- 
-
Ing. Nikola CIPRICH
LinuxBox.cz, s.r.o.
28. rijna 168, 709 00 Ostrava

tel.:   +420 591 166 214
fax:+420 596 621 273
mobil:  +420 777 093 799

www.linuxbox.cz

mobil servis: +420 737 238 656
email servis: ser...@linuxbox.cz
-


pgpWo5au4KaYi.pgp
Description: PGP signature


Re: high host load from vmx_set_supported_cpuid call?

2012-12-02 Thread Gleb Natapov
On Sun, Dec 02, 2012 at 03:51:53PM +0100, Nikola Ciprich wrote:
  More like loaded modules/installed modules mismatch.
 I see, the problem is, that I've got kvm-kmod compiled separately!
 thus kvm*.ko symboles don't match!
 I see that kvm-kmod build produces System.map file, I guess I need to
 merge it with kernel's System.map?
 
I think you need to copy them over old modules in /lib/modules.

--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: high host load from vmx_set_supported_cpuid call?

2012-12-02 Thread Nikola Ciprich
 I think you need to copy them over old modules in /lib/modules.

hmm, that should be OK, new kvm*.ko modules are part of kernel rpm package,
there's no old module there. (I checked by both inspecting kernel pkg and
using modinfo)... Could it be something else?


 
 --
   Gleb.
 

-- 
-
Ing. Nikola CIPRICH
LinuxBox.cz, s.r.o.
28. rijna 168, 709 00 Ostrava

tel.:   +420 591 166 214
fax:+420 596 621 273
mobil:  +420 777 093 799

www.linuxbox.cz

mobil servis: +420 737 238 656
email servis: ser...@linuxbox.cz
-


pgpnVxHTWTEmQ.pgp
Description: PGP signature


Re: high host load from vmx_set_supported_cpuid call?

2012-12-02 Thread Nikola Ciprich
 hmm, that should be OK, new kvm*.ko modules are part of kernel rpm package,
 there's no old module there. (I checked by both inspecting kernel pkg and
 using modinfo)... Could it be something else?

hmm, I can reply to myself this time - perf seems to get the symbols using
/proc/kallsyms and there apparently are symbols from inkernel KVM modules, not
the externally built. I guess the problem with my kernel package is I build
both kernel and external KVM modules and then replace kernel ones causing
kallsyms mismatch..
I'll have to fix this first, reboot host and then see...


 
 
  
  --
  Gleb.
  
 
 -- 
 -
 Ing. Nikola CIPRICH
 LinuxBox.cz, s.r.o.
 28. rijna 168, 709 00 Ostrava
 
 tel.:   +420 591 166 214
 fax:+420 596 621 273
 mobil:  +420 777 093 799
 
 www.linuxbox.cz
 
 mobil servis: +420 737 238 656
 email servis: ser...@linuxbox.cz
 -



-- 
-
Ing. Nikola CIPRICH
LinuxBox.cz, s.r.o.
28. rijna 168, 709 00 Ostrava

tel.:   +420 591 166 214
fax:+420 596 621 273
mobil:  +420 777 093 799

www.linuxbox.cz

mobil servis: +420 737 238 656
email servis: ser...@linuxbox.cz
-


pgpTi86wLYT8l.pgp
Description: PGP signature


Re: high host load from vmx_set_supported_cpuid call?

2012-12-02 Thread Gleb Natapov
On Sun, Dec 02, 2012 at 04:10:10PM +0100, Nikola Ciprich wrote:
  I think you need to copy them over old modules in /lib/modules.
 
 hmm, that should be OK, new kvm*.ko modules are part of kernel rpm package,
 there's no old module there. (I checked by both inspecting kernel pkg and
 using modinfo)... Could it be something else?
 
Probably, I do not know how to debug perf unfortunately.

--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: high host load from vmx_set_supported_cpuid call?

2012-12-02 Thread Gleb Natapov
On Sun, Dec 02, 2012 at 04:26:41PM +0100, Nikola Ciprich wrote:
  hmm, that should be OK, new kvm*.ko modules are part of kernel rpm package,
  there's no old module there. (I checked by both inspecting kernel pkg and
  using modinfo)... Could it be something else?
 
 hmm, I can reply to myself this time - perf seems to get the symbols using
 /proc/kallsyms and there apparently are symbols from inkernel KVM modules, not
 the externally built. I guess the problem with my kernel package is I build
 both kernel and external KVM modules and then replace kernel ones causing
 kallsyms mismatch..
 I'll have to fix this first, reboot host and then see...
 
And you can successfully install external modules while KVM is compiled
in? Strange.

--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2] KVM: x86: Fix uninitialized return code

2012-12-02 Thread Gleb Natapov
On Sun, Dec 02, 2012 at 11:04:14AM +0100, Jan Kiszka wrote:
 From: Jan Kiszka jan.kis...@siemens.com
 
 This is a regression caused by 18595411a7.
 
 Signed-off-by: Jan Kiszka jan.kis...@siemens.com

Applied, thanks.

 ---
 
 Sorry, copypasted wrong error code.
 
  arch/x86/kvm/x86.c |1 +
  1 files changed, 1 insertions(+), 0 deletions(-)
 
 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
 index b0b8abe..3bdaf29 100644
 --- a/arch/x86/kvm/x86.c
 +++ b/arch/x86/kvm/x86.c
 @@ -3006,6 +3006,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
   break;
   }
   case KVM_SET_LAPIC: {
 + r = -EINVAL;
   if (!vcpu-arch.apic)
   goto out;
   u.lapic = memdup_user(argp, sizeof(*u.lapic));
 -- 
 1.7.3.4

--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [net-next rfc v7 2/3] virtio_net: multiqueue support

2012-12-02 Thread Michael S. Tsirkin
On Tue, Nov 27, 2012 at 06:15:59PM +0800, Jason Wang wrote:
 This adds multiqueue support to the virtio_net driver. In multiple queue modes,
 the
 driver expects the number of queue pairs to be equal to the number of vcpus. To
 eliminate the contention between vcpus and virtqueues, per-cpu virtqueue pairs
 were implemented through:
 
 - select the txq based on the smp processor id.
 - smp affinity hint were set to the vcpu that owns the queue pairs.
 
 Signed-off-by: Krishna Kumar krkum...@in.ibm.com
 Signed-off-by: Jason Wang jasow...@redhat.com
 ---
  drivers/net/virtio_net.c|  454 
 ++-
  include/uapi/linux/virtio_net.h |   16 ++
  2 files changed, 371 insertions(+), 99 deletions(-)
 
 diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
 index 7975133..bcaa6e5 100644
 --- a/drivers/net/virtio_net.c
 +++ b/drivers/net/virtio_net.c
 @@ -84,17 +84,25 @@ struct virtnet_info {
   struct virtio_device *vdev;
   struct virtqueue *cvq;
   struct net_device *dev;
 - struct napi_struct napi;
 - struct send_queue sq;
 - struct receive_queue rq;
 + struct send_queue *sq;
 + struct receive_queue *rq;
   unsigned int status;
  
 + /* Max # of queue pairs supported by the device */
 + u16 max_queue_pairs;
 +
 + /* # of queue pairs currently used by the driver */
 + u16 curr_queue_pairs;
 +
   /* I like... big packets and I cannot lie! */
   bool big_packets;
  
   /* Host will merge rx buffers for big packets (shake it! shake it!) */
   bool mergeable_rx_bufs;
  
 + /* Has control virtqueue */
 + bool has_cvq;
 +
   /* enable config space updates */
   bool config_enable;
  
 @@ -126,6 +134,34 @@ struct padded_vnet_hdr {
   char padding[6];
  };
  
 +static const struct ethtool_ops virtnet_ethtool_ops;
 +
 +/*
 + * Converting between virtqueue no. and kernel tx/rx queue no.
 + * 0:rx0 1:tx0 2:cvq 3:rx1 4:tx1 ... 2N+1:rxN 2N+2:txN
 + */

Weird, this isn't what spec v5 says: it says
0:rx0 1:tx0 2:rx1 3:tx1 ... cvq
We can change the spec to match but keeping all rx/tx
together seems a bit prettier?

 +static int vq2txq(struct virtqueue *vq)
 +{
 + int index = virtqueue_get_queue_index(vq);
 + return index == 1 ? 0 : (index - 2) / 2;
 +}
 +
 +static int txq2vq(int txq)
 +{
 + return txq ? 2 * txq + 2 : 1;
 +}
 +
 +static int vq2rxq(struct virtqueue *vq)
 +{
 + int index = virtqueue_get_queue_index(vq);
 + return index ? (index - 1) / 2 : 0;
 +}
 +
 +static int rxq2vq(int rxq)
 +{
 + return rxq ? 2 * rxq + 1 : 0;
 +}
 +
  static inline struct skb_vnet_hdr *skb_vnet_hdr(struct sk_buff *skb)
  {
   return (struct skb_vnet_hdr *)skb-cb;
 @@ -166,7 +202,7 @@ static void skb_xmit_done(struct virtqueue *vq)
   virtqueue_disable_cb(vq);
  
   /* We were probably waiting for more output buffers. */
 - netif_wake_queue(vi-dev);
 + netif_wake_subqueue(vi-dev, vq2txq(vq));
  }
  
  static void set_skb_frag(struct sk_buff *skb, struct page *page,
 @@ -503,7 +539,7 @@ static bool try_fill_recv(struct receive_queue *rq, gfp_t 
 gfp)
  static void skb_recv_done(struct virtqueue *rvq)
  {
   struct virtnet_info *vi = rvq-vdev-priv;
 - struct receive_queue *rq = vi-rq;
 + struct receive_queue *rq = vi-rq[vq2rxq(rvq)];
  
   /* Schedule NAPI, Suppress further interrupts if successful. */
   if (napi_schedule_prep(rq-napi)) {
 @@ -650,7 +686,8 @@ static int xmit_skb(struct send_queue *sq, struct sk_buff 
 *skb)
  static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
  {
   struct virtnet_info *vi = netdev_priv(dev);
 - struct send_queue *sq = vi-sq;
 + int qnum = skb_get_queue_mapping(skb);
 + struct send_queue *sq = vi-sq[qnum];
   int capacity;
  
   /* Free up any pending old buffers before queueing new ones. */
 @@ -664,13 +701,14 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, 
 struct net_device *dev)
   if (likely(capacity == -ENOMEM)) {
   if (net_ratelimit())
   dev_warn(dev-dev,
 -  TX queue failure: out of memory\n);
 +  TXQ (%d) failure: out of memory\n,
 +  qnum);
   } else {
   dev-stats.tx_fifo_errors++;
   if (net_ratelimit())
   dev_warn(dev-dev,
 -  Unexpected TX queue failure: %d\n,
 -  capacity);
 +  Unexpected TXQ (%d) failure: %d\n,
 +  qnum, capacity);
   }
   dev-stats.tx_dropped++;
   kfree_skb(skb);
 @@ -685,12 +723,12 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, 
 struct net_device *dev)
   /* Apparently nice girls don't return 

Re: [net-next rfc v7 3/3] virtio-net: change the number of queues through ethtool

2012-12-02 Thread Michael S. Tsirkin
On Tue, Nov 27, 2012 at 06:16:00PM +0800, Jason Wang wrote:
 This patch implements the {set|get}_channels methods of ethtool to allow the user to
 change the number of queues dynamically when the device is running. This would
 let the user configure it on demand.
 
 Signed-off-by: Jason Wang jasow...@redhat.com
 ---
  drivers/net/virtio_net.c |   41 +
  1 files changed, 41 insertions(+), 0 deletions(-)
 
 diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
 index bcaa6e5..f08ec2a 100644
 --- a/drivers/net/virtio_net.c
 +++ b/drivers/net/virtio_net.c
 @@ -1578,10 +1578,51 @@ static struct virtio_driver virtio_net_driver = {
  #endif
  };
  
 +/* TODO: Eliminate OOO packets during switching */
 +static int virtnet_set_channels(struct net_device *dev,
 + struct ethtool_channels *channels)
 +{
 + struct virtnet_info *vi = netdev_priv(dev);
 + u16 queue_pairs = channels-combined_count;
 +
 + /* We don't support separate rx/tx channels.
 +  * We don't allow setting 'other' channels.
 +  */
 + if (channels-rx_count || channels-tx_count || channels-other_count)
 + return -EINVAL;
 +
 + /* Only two modes were support currently */
 + if (queue_pairs != vi-max_queue_pairs  queue_pairs != 1)
 + return -EINVAL;
 +

Why the limitation?
Also how does userspace discover what the legal values are?

 + vi-curr_queue_pairs = queue_pairs;
 + BUG_ON(virtnet_set_queues(vi));
 +
 + netif_set_real_num_tx_queues(dev, vi-curr_queue_pairs);
 + netif_set_real_num_rx_queues(dev, vi-curr_queue_pairs);
 +
 + return 0;
 +}
 +
 +static void virtnet_get_channels(struct net_device *dev,
 +  struct ethtool_channels *channels)
 +{
 + struct virtnet_info *vi = netdev_priv(dev);
 +
 + channels-combined_count = vi-curr_queue_pairs;
 + channels-max_combined = vi-max_queue_pairs;
 + channels-max_other = 0;
 + channels-rx_count = 0;
 + channels-tx_count = 0;
 + channels-other_count = 0;
 +}
 +
  static const struct ethtool_ops virtnet_ethtool_ops = {
   .get_drvinfo = virtnet_get_drvinfo,
   .get_link = ethtool_op_get_link,
   .get_ringparam = virtnet_get_ringparam,
 + .set_channels = virtnet_set_channels,
 + .get_channels = virtnet_get_channels,
  };
  
  static int __init init(void)
 -- 
 1.7.1
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCHv4] virtio-spec: virtio network device RFS support

2012-12-02 Thread Rusty Russell
Michael S. Tsirkin m...@redhat.com writes:
 Add RFS support to virtio network device.
 Add a new feature flag VIRTIO_NET_F_RFS for this feature, a new
 configuration field max_virtqueue_pairs to detect supported number of
 virtqueues as well as a new command VIRTIO_NET_CTRL_RFS to program
 packet steering for unidirectional protocols.

Hi Michael,

Sorry for the delay, I took last week off.

 - rename multiqueue - rfs this is what we support
 - Be more explicit about what driver should do.
 - Simplify layout making VQs functionality depend on feature.
 - Remove unused commands, only leave in programming # of queues

Thanks: this looks really nice now.  Comments are about the text, not
the ideas.

 + 2N+1: transmitqN.
 + 2N+
 +\change_unchanged
 +2:controlq
  \begin_inset Foot
  status open

Hmm, controlq after xmit queues... a nice improvement.

 +VIRTIO_NET_F_RFS(2) Device supports Receive Flow Steering.

I think readers would prefer numerical order to historical order here,
so perhaps move this up in the list.

 -layout Two configuration fields are currently defined.
 +layout 
 +\change_deleted 1986246365 1352743300
 +Two
 +\change_inserted 1986246365 1352743301
 +Four
 +\change_unchanged
 + configuration fields are currently defined.

two to four?  I only see three?  And you didn't update the structure to
match...

 + Following this, driver should not transmit new packets on virtqueues other
 + than transmitq0 and device will not steer new packets on virtqueues other
 + than receiveq0.

"Following this" is vague.  Better: "After the buffer is consumed by the device".

"Should not" is kind of meaningless.  Let's make it clear: the device will
not steer new packets to RxqN, nor read from TxqN.

You should probably put in a note about the RFS control in the Device
Initialization section, too, ie. if you have negotiated and want to use
more queues, you must initialize them then wait for the ack of the
CTRL_RFS cmd.

Note: the following hunks didn't apply, but I'm not sure why they're in
this anyway...

 @@ -6152,13 +6385,7 @@ Virtqueues 0:receiveq(port0).
  status open
  
  \begin_layout Plain Layout
 -Ports 
 -\change_inserted 1986246365 1347188327
 -1
 -\change_deleted 1986246365 1347188327
 -2
 -\change_unchanged
 - onwards only if VIRTIO_CONSOLE_F_MULTIPORT is set
 +Ports 12 onwards only if VIRTIO_CONSOLE_F_MULTIPORT is set
  \end_layout
  
  \end_inset
 @@ -6185,13 +6412,8 @@ VIRTIO_CONSOLE_F_SIZE
  
  \begin_layout Description
  VIRTIO_CONSOLE_F_MULTIPORT(1) Device has support for multiple ports; 
 configurati
 -on fields nr_ports and max_nr_ports are valid
 -\change_inserted 1986246365 1347188404
 -; if this bit is negotiated,
 -\change_deleted 1986246365 1347188406
 - and
 -\change_unchanged
 - control virtqueues will be used.
 +on fields nr_ports and max_nr_ports are valid; if this bit is negotiated,
 + and control virtqueues will be used.
  \end_layout
  
  \end_deeper
 @@ -6260,8 +6482,7 @@ If the VIRTIO_CONSOLE_F_MULTIPORT feature is 
 negotiated, the driver can
   spawn multiple ports, not all of which may be attached to a console.
   Some could be generic ports.
   In this case, the control virtqueues are enabled and according to the 
 max_nr_po
 -rts configuration-space value, an appropriate number of virtqueues are
 - created.
 +rts configuration-space value, an appropriate number of virtqueues are 
 created.
   A control message indicating the driver is ready is sent to the host.
   The host can then send control messages for adding new ports to the device.
   After creating and initializing each port, a VIRTIO_CONSOLE_PORT_READY
 @@ -6699,14 +6920,9 @@ The driver constructs an array of addresses of memory 
 pages it has previously
  \end_layout
  
  \begin_layout Enumerate
 -If the VIRTIO_BALLOON_F_MUST_TELL_HOST feature is 
 -\change_inserted 1986246365 1347188540
 -negotiated
 -\change_deleted 1986246365 1347188542
 -set
 -\change_unchanged
 -, the guest may not use these requested pages until that descriptor in the
 - deflateq has been used by the device.
 +If the VIRTIO_BALLOON_F_MUST_TELL_HOST feature is negotiatedset, the guest
 + may not use these requested pages until that descriptor in the deflateq
 + has been used by the device.
  \end_layout
  
  \begin_layout Enumerate


Cheers,
Rusty.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Re: Re: Re: Re: Re: Re: Re: [RFC PATCH 0/2] kvm/vmx: Output TSC offset

2012-12-02 Thread Yoshihiro YUNOMAE

Hi Marcelo,


That is, you can't feed distinct instances of guest kernel trace.


I'm not clear on what "distinct instances" means. Is this about SMP or multiple
guests? Would you explain this?


Distinct boot instances. If the guest reboots TSC can be written to.


OK, I understood.
I'll resend a patch series for this feature.

Thanks!

--
Yoshihiro YUNOMAE
Software Platform Research Dept. Linux Technology Center
Hitachi, Ltd., Yokohama Research Laboratory
E-mail: yoshihiro.yunomae...@hitachi.com
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [net-next rfc v7 2/3] virtio_net: multiqueue support

2012-12-02 Thread Rusty Russell
Jason Wang jasow...@redhat.com writes:
 +static const struct ethtool_ops virtnet_ethtool_ops;
 +
 +/*
 + * Converting between virtqueue no. and kernel tx/rx queue no.
 + * 0:rx0 1:tx0 2:cvq 3:rx1 4:tx1 ... 2N+1:rxN 2N+2:txN
 + */
 +static int vq2txq(struct virtqueue *vq)
 +{
 + int index = virtqueue_get_queue_index(vq);
 + return index == 1 ? 0 : (index - 2) / 2;
 +}
 +
 +static int txq2vq(int txq)
 +{
 + return txq ? 2 * txq + 2 : 1;
 +}
 +
 +static int vq2rxq(struct virtqueue *vq)
 +{
 + int index = virtqueue_get_queue_index(vq);
 + return index ? (index - 1) / 2 : 0;
 +}
 +
 +static int rxq2vq(int rxq)
 +{
 + return rxq ? 2 * rxq + 1 : 0;
 +}
 +

I thought MST changed the proposed spec to make the control queue always
the last one, so this logic becomes trivial.

 +static int virtnet_set_queues(struct virtnet_info *vi)
 +{
 + struct scatterlist sg;
 + struct virtio_net_ctrl_rfs s;
 + struct net_device *dev = vi-dev;
 +
 + s.virtqueue_pairs = vi-curr_queue_pairs;
 + sg_init_one(sg, s, sizeof(s));
 +
 + if (!vi-has_cvq)
 + return -EINVAL;
 +
 + if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RFS,
 +   VIRTIO_NET_CTRL_RFS_VQ_PAIRS_SET, sg, 1, 0)){
 + dev_warn(dev-dev, Fail to set the number of queue pairs to
 +   %d\n, vi-curr_queue_pairs);
 + return -EINVAL;
 + }

Where do we check the VIRTIO_NET_F_RFS bit?

  static int virtnet_probe(struct virtio_device *vdev)
  {
 - int err;
 + int i, err;
   struct net_device *dev;
   struct virtnet_info *vi;
 + u16 curr_queue_pairs;
 +
 + /* Find if host supports multiqueue virtio_net device */
 + err = virtio_config_val(vdev, VIRTIO_NET_F_RFS,
 + offsetof(struct virtio_net_config,
 + max_virtqueue_pairs), curr_queue_pairs);
 +
 + /* We need at least 2 queue's */
 + if (err)
 + curr_queue_pairs = 1;

Huh?  Just call this queue_pairs.  It's not curr_ at all...

 + if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ))
 + vi-has_cvq = true;
 +
 + /* Use single tx/rx queue pair as default */
 + vi-curr_queue_pairs = 1;
 + vi-max_queue_pairs = curr_queue_pairs;

See...

Cheers,
Rusty.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [net-next rfc v7 1/3] virtio-net: separate fields of sending/receiving queue from virtnet_info

2012-12-02 Thread Rusty Russell
Jason Wang jasow...@redhat.com writes:
 To support multiqueue transmitq/receiveq, the first step is to separate the
 queue-related structures from virtnet_info. This patch introduces the
 send_queue and receive_queue structures and passes pointers to them as
 parameters to the functions handling sending/receiving.

OK, seems like a straightforward xform: a few nit-picks:

 +/* Internal representation of a receive virtqueue */
 +struct receive_queue {
 + /* Virtqueue associated with this receive_queue */
 + struct virtqueue *vq;
 +
 +struct napi_struct napi;
 +
 +/* Number of input buffers, and max we've ever had. */
 +unsigned int num, max;

Weird whitespace here.

 +
 + /* Work struct for refilling if we run low on memory. */
 + struct delayed_work refill;

I can't really see the justification for a refill per queue.  Just have
one work iterate all the queues if it happens, unless it happens often
(in which case, we need to look harder at this anyway).

  struct virtnet_info {
   struct virtio_device *vdev;
 - struct virtqueue *rvq, *svq, *cvq;
 + struct virtqueue *cvq;
   struct net_device *dev;
   struct napi_struct napi;

You leave napi here, and take it away in the next patch.  I think it's
supposed to go away now.

Cheers,
Rusty.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/2] vfio powerpc: enabled on powernv platform

2012-12-02 Thread Alexey Kardashevskiy
This patch initializes IOMMU groups based on the IOMMU
configuration discovered during the PCI scan on POWERNV
(POWER non virtualized) platform. The IOMMU groups are
to be used later by VFIO driver (PCI pass through).

It also implements an API for mapping/unmapping pages for
guest PCI drivers and providing DMA window properties.
This API is going to be used later by QEMU-VFIO to handle
h_put_tce hypercalls from the KVM guest.

Although this driver has been tested only on the POWERNV
platform, it should work on any platform which supports
TCE tables.

To enable VFIO on POWER, enable SPAPR_TCE_IOMMU config
option and configure VFIO as required.

Cc: David Gibson da...@gibson.dropbear.id.au
Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
 arch/powerpc/include/asm/iommu.h |9 ++
 arch/powerpc/kernel/iommu.c  |  186 ++
 arch/powerpc/platforms/powernv/pci.c |  135 
 drivers/iommu/Kconfig|8 ++
 4 files changed, 338 insertions(+)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index cbfe678..5c7087a 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -76,6 +76,9 @@ struct iommu_table {
struct iommu_pool large_pool;
struct iommu_pool pools[IOMMU_NR_POOLS];
unsigned long *it_map;   /* A simple allocation bitmap for now */
+#ifdef CONFIG_IOMMU_API
+   struct iommu_group *it_group;
+#endif
 };
 
 struct scatterlist;
@@ -147,5 +150,11 @@ static inline void iommu_restore(void)
 }
 #endif
 
+extern long iommu_clear_tces(struct iommu_table *tbl, unsigned long entry,
+   unsigned long pages);
+extern long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
+   uint64_t tce, enum dma_data_direction direction,
+   unsigned long pages);
+
 #endif /* __KERNEL__ */
 #endif /* _ASM_IOMMU_H */
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index ff5a6ce..2738aa4 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -44,6 +44,7 @@
 #include asm/kdump.h
 #include asm/fadump.h
 #include asm/vio.h
+#include asm/tce.h
 
 #define DBG(...)
 
@@ -856,3 +857,188 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t 
size,
free_pages((unsigned long)vaddr, get_order(size));
}
 }
+
+#ifdef CONFIG_IOMMU_API
+/*
+ * SPAPR TCE API
+ */
+
+/*
+ * Returns the number of used IOMMU pages (4K) within
+ * the same system page (4K or 64K).
+ * bitmap_weight is not used as it does not support bigendian maps.
+ */
+static int syspage_weight(unsigned long *map, unsigned long entry)
+{
+   int ret = 0, nbits = PAGE_SIZE/IOMMU_PAGE_SIZE;
+
+   /* Aligns TCE entry number to system page boundary */
+   entry = PAGE_MASK  IOMMU_PAGE_SHIFT;
+
+   /* Count used 4K pages */
+   while (nbits--)
+   ret += (test_bit(entry++, map) == 0) ? 0 : 1;
+
+   return ret;
+}
+
+static void tce_flush(struct iommu_table *tbl)
+{
+   /* Flush/invalidate TLB caches if necessary */
+   if (ppc_md.tce_flush)
+   ppc_md.tce_flush(tbl);
+
+   /* Make sure updates are seen by hardware */
+   mb();
+}
+
+/*
+ * iommu_clear_tces clears tces and returned the number of system pages
+ * which it called put_page() on
+ */
+static long clear_tces_nolock(struct iommu_table *tbl, unsigned long entry,
+   unsigned long pages)
+{
+   int i, retpages = 0;
+   unsigned long oldtce, oldweight;
+   struct page *page;
+
+   for (i = 0; i  pages; ++i) {
+   oldtce = ppc_md.tce_get(tbl, entry + i);
+   ppc_md.tce_free(tbl, entry + i, 1);
+
+   oldweight = syspage_weight(tbl-it_map, entry);
+   __clear_bit(entry - tbl-it_offset, tbl-it_map);
+
+   if (!(oldtce  (TCE_PCI_WRITE | TCE_PCI_READ)))
+   continue;
+
+   page = pfn_to_page(oldtce  PAGE_SHIFT);
+
+   WARN_ON(!page);
+   if (!page)
+   continue;
+
+   if (oldtce  TCE_PCI_WRITE)
+   SetPageDirty(page);
+
+   put_page(page);
+
+   /* That was the last IOMMU page within the system page */
+   if ((oldweight == 1)  !syspage_weight(tbl-it_map, entry))
+   ++retpages;
+   }
+
+   return retpages;
+}
+
+/*
+ * iommu_clear_tces clears tces and returns the number
+ * of released system pages
+ */
+long iommu_clear_tces(struct iommu_table *tbl, unsigned long entry,
+   unsigned long pages)
+{
+   int ret;
+   struct iommu_pool *pool = get_pool(tbl, entry);
+
+   spin_lock((pool-lock));
+   ret = clear_tces_nolock(tbl, entry, pages);
+   tce_flush(tbl);
+   spin_unlock((pool-lock));
+
+   return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_clear_tces);
+
+static int put_tce(struct 

[PATCH 0/2] vfio on power: yet another try

2012-12-02 Thread Alexey Kardashevskiy
The set includes 2 patches.

The first one adds the necessary IOMMU support for VFIO;
the second one adds a SPAPR TCE IOMMU driver to VFIO.

At the moment we have decided to get rid of DMA64 window
properties because we need more API than just 2 properties
(such as dynamic window allocation) but have not decided
about its actual design yet.

Alexey Kardashevskiy (2):
  vfio powerpc: enabled on powernv platform
  vfio powerpc: implemented IOMMU driver for VFIO

 arch/powerpc/include/asm/iommu.h |9 +
 arch/powerpc/kernel/iommu.c  |  186 ++
 arch/powerpc/platforms/powernv/pci.c |  135 +
 drivers/iommu/Kconfig|8 +
 drivers/vfio/Kconfig |6 +
 drivers/vfio/Makefile|1 +
 drivers/vfio/vfio_iommu_spapr_tce.c  |  350 ++
 include/linux/vfio.h |   26 +++
 8 files changed, 721 insertions(+)
 create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c

-- 
1.7.10.4

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/2] vfio powerpc: implemented IOMMU driver for VFIO

2012-12-02 Thread Alexey Kardashevskiy
VFIO implements platform independent stuff such as
a PCI driver, BAR access (via read/write on a file descriptor
or direct mapping when possible) and IRQ signaling.

The platform dependent part includes IOMMU initialization
and handling. This patch implements an IOMMU driver for VFIO
which does mapping/unmapping pages for the guest IO and
provides information about DMA window (required by a POWERPC
guest).

The counterpart in QEMU is required to support this functionality.

Cc: David Gibson da...@gibson.dropbear.id.au
Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
 drivers/vfio/Kconfig|6 +
 drivers/vfio/Makefile   |1 +
 drivers/vfio/vfio_iommu_spapr_tce.c |  350 +++
 include/linux/vfio.h|   26 +++
 4 files changed, 383 insertions(+)
 create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c

diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
index 7cd5dec..b464687 100644
--- a/drivers/vfio/Kconfig
+++ b/drivers/vfio/Kconfig
@@ -3,10 +3,16 @@ config VFIO_IOMMU_TYPE1
depends on VFIO
default n
 
+config VFIO_IOMMU_SPAPR_TCE
+   tristate
+   depends on VFIO  SPAPR_TCE_IOMMU
+   default n
+
 menuconfig VFIO
tristate VFIO Non-Privileged userspace driver framework
depends on IOMMU_API
select VFIO_IOMMU_TYPE1 if X86
+   select VFIO_IOMMU_SPAPR_TCE if PPC_POWERNV
help
  VFIO provides a framework for secure userspace device drivers.
  See Documentation/vfio.txt for more details.
diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
index 2398d4a..72bfabc 100644
--- a/drivers/vfio/Makefile
+++ b/drivers/vfio/Makefile
@@ -1,3 +1,4 @@
 obj-$(CONFIG_VFIO) += vfio.o
 obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
+obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
 obj-$(CONFIG_VFIO_PCI) += pci/
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c 
b/drivers/vfio/vfio_iommu_spapr_tce.c
new file mode 100644
index 000..806ad9f
--- /dev/null
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -0,0 +1,350 @@
+/*
+ * VFIO: IOMMU DMA mapping support for TCE on POWER
+ *
+ * Copyright (C) 2012 IBM Corp.  All rights reserved.
+ * Author: Alexey Kardashevskiy a...@ozlabs.ru
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Derived from original vfio_iommu_type1.c:
+ * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
+ * Author: Alex Williamson alex.william...@redhat.com
+ */
+
+#include linux/module.h
+#include linux/pci.h
+#include linux/slab.h
+#include linux/uaccess.h
+#include linux/err.h
+#include linux/vfio.h
+#include asm/iommu.h
+
+#define DRIVER_VERSION  0.1
+#define DRIVER_AUTHOR   a...@ozlabs.ru
+#define DRIVER_DESC VFIO IOMMU SPAPR TCE
+
+static void tce_iommu_detach_group(void *iommu_data,
+   struct iommu_group *iommu_group);
+
+/*
+ * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
+ */
+
+/*
+ * This code handles mapping and unmapping of user data buffers
+ * into DMA'ble space using the IOMMU
+ */
+
+#define NPAGE_TO_SIZE(npage)   ((size_t)(npage)  PAGE_SHIFT)
+
+struct vwork {
+   struct mm_struct*mm;
+   longnpage;
+   struct work_struct  work;
+};
+
+/* delayed decrement/increment for locked_vm */
+static void lock_acct_bg(struct work_struct *work)
+{
+   struct vwork *vwork = container_of(work, struct vwork, work);
+   struct mm_struct *mm;
+
+   mm = vwork-mm;
+   down_write(mm-mmap_sem);
+   mm-locked_vm += vwork-npage;
+   up_write(mm-mmap_sem);
+   mmput(mm);
+   kfree(vwork);
+}
+
+static void lock_acct(long npage)
+{
+   struct vwork *vwork;
+   struct mm_struct *mm;
+
+   if (!current-mm)
+   return; /* process exited */
+
+   if (down_write_trylock(current-mm-mmap_sem)) {
+   current-mm-locked_vm += npage;
+   up_write(current-mm-mmap_sem);
+   return;
+   }
+
+   /*
+* Couldn't get mmap_sem lock, so must setup to update
+* mm-locked_vm later. If locked_vm were atomic, we
+* wouldn't need this silliness
+*/
+   vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
+   if (!vwork)
+   return;
+   mm = get_task_mm(current);
+   if (!mm) {
+   kfree(vwork);
+   return;
+   }
+   INIT_WORK(vwork-work, lock_acct_bg);
+   vwork-mm = mm;
+   vwork-npage = npage;
+   schedule_work(vwork-work);
+}
+
+/*
+ * The container descriptor supports only a single group per container.
+ * Required by the API as the container is not supplied with the IOMMU group
+ * at the moment of initialization.
+ */
+struct tce_container {
+   struct mutex lock;
+   struct iommu_table *tbl;
+};
+
+static void 

Re: [net-next rfc v7 2/3] virtio_net: multiqueue support

2012-12-02 Thread Jason Wang
On Sunday, December 02, 2012 06:06:31 PM Michael S. Tsirkin wrote:
 On Tue, Nov 27, 2012 at 06:15:59PM +0800, Jason Wang wrote:
  This adds multiqueue support to the virtio_net driver. In multiple queue
  mode, the driver expects the number of queue pairs to be equal to the
  number of vcpus. To eliminate the contention between vcpus and
  virtqueues, per-cpu virtqueue pairs were implemented through:
  
  - select the txq based on the smp processor id.
  - smp affinity hint were set to the vcpu that owns the queue pairs.
  
  Signed-off-by: Krishna Kumar krkum...@in.ibm.com
  Signed-off-by: Jason Wang jasow...@redhat.com
  ---
  
   drivers/net/virtio_net.c|  454
   ++- include/uapi/linux/virtio_net.h
   |   16 ++
   2 files changed, 371 insertions(+), 99 deletions(-)
  
  diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
  index 7975133..bcaa6e5 100644
  --- a/drivers/net/virtio_net.c
  +++ b/drivers/net/virtio_net.c
  @@ -84,17 +84,25 @@ struct virtnet_info {
  
  struct virtio_device *vdev;
  struct virtqueue *cvq;
  struct net_device *dev;
  
  -   struct napi_struct napi;
  -   struct send_queue sq;
  -   struct receive_queue rq;
  +   struct send_queue *sq;
  +   struct receive_queue *rq;
  
  unsigned int status;
  
  +   /* Max # of queue pairs supported by the device */
  +   u16 max_queue_pairs;
  +
  +   /* # of queue pairs currently used by the driver */
  +   u16 curr_queue_pairs;
  +
  
  /* I like... big packets and I cannot lie! */
  bool big_packets;
  
  /* Host will merge rx buffers for big packets (shake it! shake it!) */
  bool mergeable_rx_bufs;
  
  +   /* Has control virtqueue */
  +   bool has_cvq;
  +
  
  /* enable config space updates */
  bool config_enable;
  
  @@ -126,6 +134,34 @@ struct padded_vnet_hdr {
  
  char padding[6];
   
   };
  
  +static const struct ethtool_ops virtnet_ethtool_ops;
  +
  +/*
  + * Converting between virtqueue no. and kernel tx/rx queue no.
  + * 0:rx0 1:tx0 2:cvq 3:rx1 4:tx1 ... 2N+1:rxN 2N+2:txN
  + */
 
 Weird, this isn't what spec v5 says: it says
 0:rx0 1:tx0 2: rx1 3: tx1  vcq
 We can change the spec to match but keeping all rx/tx
 together seems a bit prettier?

Oh, I missed checking this part in v5. Thinking about it: if we change the
location of cvq, we may break support for legacy guests that only support a
single queue. Consider starting a VM with 2 queues but booting a single-queue
legacy guest: it may think vq 2 is the cvq when it is in fact rx1.
 
  +static int vq2txq(struct virtqueue *vq)
  +{
  +   int index = virtqueue_get_queue_index(vq);
  +   return index == 1 ? 0 : (index - 2) / 2;
  +}
  +
  +static int txq2vq(int txq)
  +{
  +   return txq ? 2 * txq + 2 : 1;
  +}
  +
  +static int vq2rxq(struct virtqueue *vq)
  +{
  +   int index = virtqueue_get_queue_index(vq);
  +   return index ? (index - 1) / 2 : 0;
  +}
  +
  +static int rxq2vq(int rxq)
  +{
  +   return rxq ? 2 * rxq + 1 : 0;
  +}
  +
  
   static inline struct skb_vnet_hdr *skb_vnet_hdr(struct sk_buff *skb)
   {
   
  return (struct skb_vnet_hdr *)skb-cb;
  
  @@ -166,7 +202,7 @@ static void skb_xmit_done(struct virtqueue *vq)
  
  virtqueue_disable_cb(vq);
  
  /* We were probably waiting for more output buffers. */
  
  -   netif_wake_queue(vi-dev);
  +   netif_wake_subqueue(vi-dev, vq2txq(vq));
  
   }
   
   static void set_skb_frag(struct sk_buff *skb, struct page *page,
  
  @@ -503,7 +539,7 @@ static bool try_fill_recv(struct receive_queue *rq,
  gfp_t gfp) 
   static void skb_recv_done(struct virtqueue *rvq)
   {
   
  struct virtnet_info *vi = rvq-vdev-priv;
  
  -   struct receive_queue *rq = vi-rq;
  +   struct receive_queue *rq = vi-rq[vq2rxq(rvq)];
  
  /* Schedule NAPI, Suppress further interrupts if successful. */
  if (napi_schedule_prep(rq-napi)) {
  
  @@ -650,7 +686,8 @@ static int xmit_skb(struct send_queue *sq, struct
  sk_buff *skb) 
   static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device
   *dev)
   {
   
  struct virtnet_info *vi = netdev_priv(dev);
  
  -   struct send_queue *sq = vi-sq;
  +   int qnum = skb_get_queue_mapping(skb);
  +   struct send_queue *sq = vi-sq[qnum];
  
  int capacity;
  
  /* Free up any pending old buffers before queueing new ones. */
  
  @@ -664,13 +701,14 @@ static netdev_tx_t start_xmit(struct sk_buff *skb,
  struct net_device *dev) 
  if (likely(capacity == -ENOMEM)) {
  
  if (net_ratelimit())
  
  dev_warn(dev-dev,
  
  -TX queue failure: out of memory\n);
  +TXQ (%d) failure: out of memory\n,
  +qnum);
  
  } else {
  
  dev-stats.tx_fifo_errors++;
  if (net_ratelimit())
  

Re: [net-next rfc v7 2/3] virtio_net: multiqueue support

2012-12-02 Thread Jason Wang
On Monday, December 03, 2012 12:34:08 PM Rusty Russell wrote:
 Jason Wang jasow...@redhat.com writes:
  +static const struct ethtool_ops virtnet_ethtool_ops;
  +
  +/*
  + * Converting between virtqueue no. and kernel tx/rx queue no.
  + * 0:rx0 1:tx0 2:cvq 3:rx1 4:tx1 ... 2N+1:rxN 2N+2:txN
  + */
  +static int vq2txq(struct virtqueue *vq)
  +{
  +   int index = virtqueue_get_queue_index(vq);
  +   return index == 1 ? 0 : (index - 2) / 2;
  +}
  +
  +static int txq2vq(int txq)
  +{
  +   return txq ? 2 * txq + 2 : 1;
  +}
  +
  +static int vq2rxq(struct virtqueue *vq)
  +{
  +   int index = virtqueue_get_queue_index(vq);
  +   return index ? (index - 1) / 2 : 0;
  +}
  +
  +static int rxq2vq(int rxq)
  +{
  +   return rxq ? 2 * rxq + 1 : 0;
  +}
  +
 
 I thought MST changed the proposed spec to make the control queue always
 the last one, so this logic becomes trivial.

But it may break support for legacy guests. If we boot a legacy single-queue
guest on a 2-queue virtio-net device, it may think vq 2 is the cvq when it is
in fact rx1.
 
  +static int virtnet_set_queues(struct virtnet_info *vi)
  +{
  +   struct scatterlist sg;
  +   struct virtio_net_ctrl_rfs s;
  +   struct net_device *dev = vi-dev;
  +
  +   s.virtqueue_pairs = vi-curr_queue_pairs;
  +   sg_init_one(sg, s, sizeof(s));
  +
  +   if (!vi-has_cvq)
  +   return -EINVAL;
  +
  +   if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RFS,
  + VIRTIO_NET_CTRL_RFS_VQ_PAIRS_SET, sg, 1, 0)){
  +   dev_warn(dev-dev, Fail to set the number of queue pairs to
  + %d\n, vi-curr_queue_pairs);
  +   return -EINVAL;
  +   }
 
 Where do we check the VIRTIO_NET_F_RFS bit?

Yes, we need this check. I will let the caller do the check and add a comment
about it in the caller.
 
   static int virtnet_probe(struct virtio_device *vdev)
   {
  
  -   int err;
  +   int i, err;
  
  struct net_device *dev;
  struct virtnet_info *vi;
  
  +   u16 curr_queue_pairs;
  +
  +   /* Find if host supports multiqueue virtio_net device */
  +   err = virtio_config_val(vdev, VIRTIO_NET_F_RFS,
  +   offsetof(struct virtio_net_config,
  +   max_virtqueue_pairs), curr_queue_pairs);
  +
  +   /* We need at least 2 queue's */
  +   if (err)
  +   curr_queue_pairs = 1;
 
 Huh?  Just call this queue_pairs.  It's not curr_ at all...
 
  +   if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ))
  +   vi-has_cvq = true;
  +
  +   /* Use single tx/rx queue pair as default */
  +   vi-curr_queue_pairs = 1;
  +   vi-max_queue_pairs = curr_queue_pairs;
 
 See...

Right, will use max_queue_pairs then.

Thanks
 
 Cheers,
 Rusty.
 --
 To unsubscribe from this list: send the line unsubscribe netdev in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [net-next rfc v7 3/3] virtio-net: change the number of queues through ethtool

2012-12-02 Thread Jason Wang
On Sunday, December 02, 2012 06:09:06 PM Michael S. Tsirkin wrote:
 On Tue, Nov 27, 2012 at 06:16:00PM +0800, Jason Wang wrote:
  This patch implements the {set|get}_channels methods of ethtool to allow the
  user to change the number of queues dynamically while the device is running.
  This lets the user configure it on demand.
  
  Signed-off-by: Jason Wang jasow...@redhat.com
  ---
  
   drivers/net/virtio_net.c |   41 +
   1 files changed, 41 insertions(+), 0 deletions(-)
  
  diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
  index bcaa6e5..f08ec2a 100644
  --- a/drivers/net/virtio_net.c
  +++ b/drivers/net/virtio_net.c
  @@ -1578,10 +1578,51 @@ static struct virtio_driver virtio_net_driver = {
  
   #endif
   };
  
  +/* TODO: Eliminate OOO packets during switching */
  +static int virtnet_set_channels(struct net_device *dev,
  +   struct ethtool_channels *channels)
  +{
  +   struct virtnet_info *vi = netdev_priv(dev);
  +   u16 queue_pairs = channels-combined_count;
  +
  +   /* We don't support separate rx/tx channels.
  +* We don't allow setting 'other' channels.
  +*/
  +   if (channels-rx_count || channels-tx_count || channels-other_count)
  +   return -EINVAL;
  +
  +   /* Only two modes were support currently */
  +   if (queue_pairs != vi-max_queue_pairs  queue_pairs != 1)
  +   return -EINVAL;
  +
 
 Why the limitation?

Not sure values between 1 and max_queue_pairs are useful. But anyway, I can
remove this limitation.
 Also how does userspace discover what the legal values are?

Userspace only checks whether the value is greater than max_queue_pairs.
 
  +   vi-curr_queue_pairs = queue_pairs;
  +   BUG_ON(virtnet_set_queues(vi));
  +
  +   netif_set_real_num_tx_queues(dev, vi-curr_queue_pairs);
  +   netif_set_real_num_rx_queues(dev, vi-curr_queue_pairs);
  +
  +   return 0;
  +}
  +
  +static void virtnet_get_channels(struct net_device *dev,
  +struct ethtool_channels *channels)
  +{
  +   struct virtnet_info *vi = netdev_priv(dev);
  +
  +   channels-combined_count = vi-curr_queue_pairs;
  +   channels-max_combined = vi-max_queue_pairs;
  +   channels-max_other = 0;
  +   channels-rx_count = 0;
  +   channels-tx_count = 0;
  +   channels-other_count = 0;
  +}
  +
  
   static const struct ethtool_ops virtnet_ethtool_ops = {
   
  .get_drvinfo = virtnet_get_drvinfo,
  .get_link = ethtool_op_get_link,
  .get_ringparam = virtnet_get_ringparam,
  
  +   .set_channels = virtnet_set_channels,
  +   .get_channels = virtnet_get_channels,
  
   };
   
   static int __init init(void)
 
 --
 To unsubscribe from this list: send the line unsubscribe kvm in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3 0/4] x86, apicv: Add APIC virtualization support

2012-12-02 Thread Yang Zhang
APIC virtualization is a new feature which can eliminate most VM exits
when a vcpu handles an interrupt:

APIC register virtualization:
APIC read access doesn't cause APIC-access VM exits.
APIC write becomes trap-like.

Virtual interrupt delivery:
Virtual interrupt delivery spares KVM from injecting vAPIC interrupts
manually; injection is fully taken care of by the hardware.

Please refer to Intel SDM volume 3, chapter 29 for more details.

Changes v2 to v3:
 * Drop Posted Interrupt patch from v3.
   According to Gleb's suggestion, we will use a global vector for all VCPUs
   as the notification event vector, so we will rewrite the Posted Interrupt
   patch and resend it later.
 * Use TMR to set the eoi exiting bitmap. We only want to set the eoi exiting
   bitmap for interrupts which are level-triggered or have a notifier in the
   EOI write path, so TMR is enough to distinguish the interrupt trigger mode.
 * Simplify some code according to Gleb's comments.
 * rebased on top of KVM upstream.


Changes v1 to v2:
 * Add Posted Interrupt support in this series patch.
 * Since there is a notifier hook in vAPIC EOI for the PIT interrupt, always
   set the PIT interrupt in the eoi exit bitmap to force a vmexit on EOI for
   that interrupt.
 * Rebased on top of KVM upstream

Yang Zhang (4):
  x86: PIT connects to pin 2 of IOAPIC
  x86, apicv: add APICv register virtualization support
  x86, apicv: add virtual interrupt delivery support
  x86, apicv: add virtual x2apic support

 arch/x86/include/asm/kvm_host.h |4 +
 arch/x86/include/asm/vmx.h  |   13 ++
 arch/x86/kvm/irq.c  |   53 ++---
 arch/x86/kvm/lapic.c|   72 +++--
 arch/x86/kvm/lapic.h|8 ++
 arch/x86/kvm/svm.c  |   19 +++
 arch/x86/kvm/vmx.c  |  232 +--
 arch/x86/kvm/x86.c  |   34 +-
 virt/kvm/ioapic.c   |3 +-
 9 files changed, 398 insertions(+), 40 deletions(-)

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3 2/4] x86, apicv: add APICv register virtualization support

2012-12-02 Thread Yang Zhang
- APIC read doesn't cause VM-Exit
- APIC write becomes trap-like

Signed-off-by: Yang Zhang yang.z.zh...@intel.com
Signed-off-by: Kevin Tian kevin.t...@intel.com
---
 arch/x86/include/asm/vmx.h |2 ++
 arch/x86/kvm/lapic.c   |   16 
 arch/x86/kvm/lapic.h   |2 ++
 arch/x86/kvm/vmx.c |   32 +++-
 4 files changed, 51 insertions(+), 1 deletions(-)

diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 36ec21c..21101b6 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -66,6 +66,7 @@
 #define EXIT_REASON_EPT_MISCONFIG   49
 #define EXIT_REASON_WBINVD  54
 #define EXIT_REASON_XSETBV  55
+#define EXIT_REASON_APIC_WRITE  56
 #define EXIT_REASON_INVPCID 58
 
 #define VMX_EXIT_REASONS \
@@ -141,6 +142,7 @@
 #define SECONDARY_EXEC_ENABLE_VPID  0x0020
 #define SECONDARY_EXEC_WBINVD_EXITING  0x0040
 #define SECONDARY_EXEC_UNRESTRICTED_GUEST  0x0080
+#define SECONDARY_EXEC_APIC_REGISTER_VIRT   0x0100
 #define SECONDARY_EXEC_PAUSE_LOOP_EXITING  0x0400
 #define SECONDARY_EXEC_ENABLE_INVPCID  0x1000
 
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 9392f52..7c96012 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1212,6 +1212,22 @@ void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi);
 
+/* emulate APIC access in a trap manner */
+int kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset)
+{
+   u32 val = 0;
+
+   /* hw has done the conditional check and inst decode */
+   offset = 0xff0;
+   if ((offset != APIC_EOI) 
+apic_reg_read(vcpu-arch.apic, offset, 4, val))
+   return 1;
+
+   /* TODO: optimize to just emulate side effect w/o one more write */
+   return apic_reg_write(vcpu-arch.apic, offset, val);
+}
+EXPORT_SYMBOL_GPL(kvm_apic_write_nodecode);
+
 void kvm_free_lapic(struct kvm_vcpu *vcpu)
 {
struct kvm_lapic *apic = vcpu-arch.apic;
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index e5ebf9f..c42f111 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -64,6 +64,8 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
 u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu);
 void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data);
 
+int kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset);
+
 void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr);
 void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu);
 void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 2fd2046..6a5f651 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -83,6 +83,9 @@ module_param(vmm_exclusive, bool, S_IRUGO);
 static bool __read_mostly fasteoi = 1;
 module_param(fasteoi, bool, S_IRUGO);
 
+static bool __read_mostly enable_apicv_reg;
+module_param(enable_apicv_reg, bool, S_IRUGO);
+
 /*
  * If nested=1, nested virtualization is supported, i.e., guests may use
  * VMX and be a hypervisor for its own guests. If nested=0, guests may not
@@ -761,6 +764,12 @@ static inline bool 
cpu_has_vmx_virtualize_apic_accesses(void)
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
 }
 
+static inline bool cpu_has_vmx_apic_register_virt(void)
+{
+   return vmcs_config.cpu_based_2nd_exec_ctrl 
+   SECONDARY_EXEC_APIC_REGISTER_VIRT;
+}
+
 static inline bool cpu_has_vmx_flexpriority(void)
 {
return cpu_has_vmx_tpr_shadow() 
@@ -2498,7 +2507,8 @@ static __init int setup_vmcs_config(struct vmcs_config 
*vmcs_conf)
SECONDARY_EXEC_UNRESTRICTED_GUEST |
SECONDARY_EXEC_PAUSE_LOOP_EXITING |
SECONDARY_EXEC_RDTSCP |
-   SECONDARY_EXEC_ENABLE_INVPCID;
+   SECONDARY_EXEC_ENABLE_INVPCID |
+   SECONDARY_EXEC_APIC_REGISTER_VIRT;
if (adjust_vmx_controls(min2, opt2,
MSR_IA32_VMX_PROCBASED_CTLS2,
_cpu_based_2nd_exec_control)  0)
@@ -2509,6 +2519,11 @@ static __init int setup_vmcs_config(struct vmcs_config 
*vmcs_conf)
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
_cpu_based_exec_control = ~CPU_BASED_TPR_SHADOW;
 #endif
+
+   if (!(_cpu_based_exec_control  CPU_BASED_TPR_SHADOW))
+   _cpu_based_2nd_exec_control = ~(
+   SECONDARY_EXEC_APIC_REGISTER_VIRT);
+
if (_cpu_based_2nd_exec_control  SECONDARY_EXEC_ENABLE_EPT) {
/* CR3 accesses and invlpg don't need to cause VM Exits when EPT
   enabled */
@@ -2706,6 +2721,9 @@ static __init int hardware_setup(void)
if (!cpu_has_vmx_ple())
ple_gap = 0;
 
+   if 

[PATCH v3 1/4] x86: PIT connects to pin 2 of IOAPIC

2012-12-02 Thread Yang Zhang
When the PIT connects to the IOAPIC, it routes to pin 2, not pin 0.

Signed-off-by: Yang Zhang yang.z.zh...@intel.com
---
 virt/kvm/ioapic.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index cfb7e4d..166c450 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -181,7 +181,7 @@ static int ioapic_deliver(struct kvm_ioapic *ioapic, int 
irq)
 
 #ifdef CONFIG_X86
/* Always delivery PIT interrupt to vcpu 0 */
-   if (irq == 0) {
+   if (irq == 2) {
irqe.dest_mode = 0; /* Physical mode. */
/* need to read apic_id from apic regiest since
 * it can be rewritten */
-- 
1.7.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3 4/4] x86, apicv: add virtual x2apic support

2012-12-02 Thread Yang Zhang
Basically, to benefit from apicv, we need to clear the MSR bitmap for the
corresponding x2apic MSRs:
0x800 - 0x8ff: no read intercept for apicv register virtualization
TPR,EOI,SELF-IPI: no write intercept for virtual interrupt delivery

Signed-off-by: Yang Zhang yang.z.zh...@intel.com
Signed-off-by: Kevin Tian kevin.t...@intel.com
---
 arch/x86/kvm/vmx.c |   64 ++-
 1 files changed, 57 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 909ce90..ac0ebf1 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3737,7 +3737,10 @@ static void free_vpid(struct vcpu_vmx *vmx)
spin_unlock(vmx_vpid_lock);
 }
 
-static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr)
+#define MSR_TYPE_R 1
+#define MSR_TYPE_W 2
+static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
+   u32 msr, int type)
 {
int f = sizeof(unsigned long);
 
@@ -3750,20 +3753,52 @@ static void __vmx_disable_intercept_for_msr(unsigned 
long *msr_bitmap, u32 msr)
 * We can control MSRs 0x-0x1fff and 0xc000-0xc0001fff.
 */
if (msr = 0x1fff) {
-   __clear_bit(msr, msr_bitmap + 0x000 / f); /* read-low */
-   __clear_bit(msr, msr_bitmap + 0x800 / f); /* write-low */
+   if (type  MSR_TYPE_R)
+   /* read-low */
+   __clear_bit(msr, msr_bitmap + 0x000 / f);
+
+   if (type  MSR_TYPE_W)
+   /* write-low */
+   __clear_bit(msr, msr_bitmap + 0x800 / f);
+
} else if ((msr = 0xc000)  (msr = 0xc0001fff)) {
msr = 0x1fff;
-   __clear_bit(msr, msr_bitmap + 0x400 / f); /* read-high */
-   __clear_bit(msr, msr_bitmap + 0xc00 / f); /* write-high */
+   if (type  MSR_TYPE_R)
+   /* read-high */
+   __clear_bit(msr, msr_bitmap + 0x400 / f);
+
+   if (type  MSR_TYPE_W)
+   /* write-high */
+   __clear_bit(msr, msr_bitmap + 0xc00 / f);
+
}
 }
 
 static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
 {
if (!longmode_only)
-   __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, msr);
-   __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, msr);
+   __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy,
+   msr, MSR_TYPE_R | MSR_TYPE_W);
+   __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode,
+   msr, MSR_TYPE_R | MSR_TYPE_W);
+}
+
+static void vmx_disable_intercept_for_msr_read(u32 msr, bool longmode_only)
+{
+   if (!longmode_only)
+   __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy,
+   msr, MSR_TYPE_R);
+   __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode,
+   msr, MSR_TYPE_R);
+}
+
+static void vmx_disable_intercept_for_msr_write(u32 msr, bool longmode_only)
+{
+   if (!longmode_only)
+   __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy,
+   msr, MSR_TYPE_W);
+   __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode,
+   msr, MSR_TYPE_W);
 }
 
 /*
@@ -7585,6 +7620,21 @@ static int __init vmx_init(void)
vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
 
+   if (enable_apicv_reg) {
+   int msr;
+   for (msr = 0x800; msr = 0x8ff; msr++)
+   vmx_disable_intercept_for_msr_read(msr, false);
+   }
+
+   if (enable_apicv_vid) {
+   /* TPR */
+   vmx_disable_intercept_for_msr_write(0x808, false);
+   /* EOI */
+   vmx_disable_intercept_for_msr_write(0x80b, false);
+   /* SELF-IPI */
+   vmx_disable_intercept_for_msr_write(0x83f, false);
+   }
+
if (enable_ept) {
kvm_mmu_set_mask_ptes(0ull,
(enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull,
-- 
1.7.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3 3/4] x86, apicv: add virtual interrupt delivery support

2012-12-02 Thread Yang Zhang
Virtual interrupt delivery removes the need for KVM to inject vAPIC
interrupts manually; injection is fully taken care of by the hardware. This
requires some special awareness in the existing interrupt injection path:

- For a pending interrupt, instead of direct injection, we may need to
  update architecture-specific indicators before resuming the guest.

- A pending interrupt that is masked by the ISR should also be
  considered in the above update action, since the hardware will decide
  when to inject it at the right time. The current has_interrupt and
  get_interrupt only return a valid vector from the injection point of view.

Signed-off-by: Yang Zhang yang.z.zh...@intel.com
Signed-off-by: Kevin Tian kevin.t...@intel.com
---
 arch/x86/include/asm/kvm_host.h |4 +
 arch/x86/include/asm/vmx.h  |   11 +++
 arch/x86/kvm/irq.c  |   53 ++-
 arch/x86/kvm/lapic.c|   56 +---
 arch/x86/kvm/lapic.h|6 ++
 arch/x86/kvm/svm.c  |   19 +
 arch/x86/kvm/vmx.c  |  140 ++-
 arch/x86/kvm/x86.c  |   34 --
 virt/kvm/ioapic.c   |1 +
 9 files changed, 291 insertions(+), 33 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index dc87b65..e5352c8 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -697,6 +697,10 @@ struct kvm_x86_ops {
void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
void (*enable_irq_window)(struct kvm_vcpu *vcpu);
void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
+   int (*has_virtual_interrupt_delivery)(struct kvm_vcpu *vcpu);
+   void (*update_irq)(struct kvm_vcpu *vcpu);
+   void (*set_eoi_exitmap)(struct kvm_vcpu *vcpu, int vector,
+   int trig_mode, int always_set);
int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
int (*get_tdp_level)(void);
u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 21101b6..1003341 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -62,6 +62,7 @@
 #define EXIT_REASON_MCE_DURING_VMENTRY  41
 #define EXIT_REASON_TPR_BELOW_THRESHOLD 43
 #define EXIT_REASON_APIC_ACCESS 44
+#define EXIT_REASON_EOI_INDUCED 45
 #define EXIT_REASON_EPT_VIOLATION   48
 #define EXIT_REASON_EPT_MISCONFIG   49
 #define EXIT_REASON_WBINVD  54
@@ -143,6 +144,7 @@
 #define SECONDARY_EXEC_WBINVD_EXITING  0x0040
 #define SECONDARY_EXEC_UNRESTRICTED_GUEST  0x0080
 #define SECONDARY_EXEC_APIC_REGISTER_VIRT   0x0100
+#define SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY0x0200
 #define SECONDARY_EXEC_PAUSE_LOOP_EXITING  0x0400
 #define SECONDARY_EXEC_ENABLE_INVPCID  0x1000
 
@@ -180,6 +182,7 @@ enum vmcs_field {
GUEST_GS_SELECTOR   = 0x080a,
GUEST_LDTR_SELECTOR = 0x080c,
GUEST_TR_SELECTOR   = 0x080e,
+   GUEST_INTR_STATUS   = 0x0810,
HOST_ES_SELECTOR= 0x0c00,
HOST_CS_SELECTOR= 0x0c02,
HOST_SS_SELECTOR= 0x0c04,
@@ -207,6 +210,14 @@ enum vmcs_field {
APIC_ACCESS_ADDR_HIGH   = 0x2015,
EPT_POINTER = 0x201a,
EPT_POINTER_HIGH= 0x201b,
+   EOI_EXIT_BITMAP0= 0x201c,
+   EOI_EXIT_BITMAP0_HIGH   = 0x201d,
+   EOI_EXIT_BITMAP1= 0x201e,
+   EOI_EXIT_BITMAP1_HIGH   = 0x201f,
+   EOI_EXIT_BITMAP2= 0x2020,
+   EOI_EXIT_BITMAP2_HIGH   = 0x2021,
+   EOI_EXIT_BITMAP3= 0x2022,
+   EOI_EXIT_BITMAP3_HIGH   = 0x2023,
GUEST_PHYSICAL_ADDRESS  = 0x2400,
GUEST_PHYSICAL_ADDRESS_HIGH = 0x2401,
VMCS_LINK_POINTER   = 0x2800,
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 7e06ba1..f782788 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -43,45 +43,64 @@ EXPORT_SYMBOL(kvm_cpu_has_pending_timer);
  */
 int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
 {
-   struct kvm_pic *s;
-
if (!irqchip_in_kernel(v-kvm))
return v-arch.interrupt.pending;
 
-   if (kvm_apic_has_interrupt(v) == -1) {  /* LAPIC */
-   if (kvm_apic_accept_pic_intr(v)) {
-   s = pic_irqchip(v-kvm);/* PIC */
-   return s-output;
-   } else
-   return 0;
-   }
+   if (kvm_apic_has_interrupt(v) == -1) /* LAPIC */
+   return kvm_cpu_has_extint(v); /* non-APIC */
return 1;
 }
 EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt);
 
 /*
+ * check if there is pending