vhost-blk and qemu

2015-11-17 Thread Mohan G via Virtualization
Hi,

I am looking to experiment with the vhost-blk stack. Can someone point me to
the latest code version and the location of the corresponding qemu version?
I am on a CentOS 7 (3.10) kernel. I am hoping that using vhost-blk.ko and the
corresponding qemu version will get me started measuring some numbers.


Regards,
Mohan

[PATCH] paravirt: remove unused pv_apic_ops structure

2015-11-17 Thread Juergen Gross
The only member of that structure is startup_ipi_hook which is always
set to paravirt_nop.
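
For readers unfamiliar with the pattern, a minimal compilable sketch of what
is being removed (illustrative names, not actual kernel code): a hook table
whose only member always points at a no-op, so every call through it is a
wasted indirect call.

/*
 * Sketch only: a paravirt-style table of function pointers with a no-op
 * default. A member that no backend ever overrides adds indirection for
 * nothing, which is why pv_apic_ops can be deleted outright.
 */
#include <stdio.h>

static void paravirt_nop(void)
{
}

struct pv_hook_table {
	void (*startup_ipi_hook)(void);	/* never overridden by any backend */
};

static struct pv_hook_table pv_hooks = {
	.startup_ipi_hook = paravirt_nop,
};

int main(void)
{
	pv_hooks.startup_ipi_hook();	/* always a pointless indirect call */
	printf("hook returned, having done nothing\n");
	return 0;
}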

Signed-off-by: Juergen Gross 
---
 arch/x86/include/asm/paravirt.h   |  9 -
 arch/x86/include/asm/paravirt_types.h | 10 --
 arch/x86/include/asm/smp.h|  3 ---
 arch/x86/kernel/paravirt.c|  8 
 arch/x86/kernel/smpboot.c |  7 ---
 arch/x86/xen/enlighten.c  |  7 ---
 6 files changed, 44 deletions(-)

diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 10d0596..4d7f080 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -285,15 +285,6 @@ static inline void slow_down_io(void)
 #endif
 }
 
-#ifdef CONFIG_SMP
-static inline void startup_ipi_hook(int phys_apicid, unsigned long start_eip,
-   unsigned long start_esp)
-{
-   PVOP_VCALL3(pv_apic_ops.startup_ipi_hook,
-   phys_apicid, start_eip, start_esp);
-}
-#endif
-
 static inline void paravirt_activate_mm(struct mm_struct *prev,
struct mm_struct *next)
 {
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 31247b5..b0e603b 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -215,14 +215,6 @@ struct pv_irq_ops {
 #endif
 };
 
-struct pv_apic_ops {
-#ifdef CONFIG_X86_LOCAL_APIC
-   void (*startup_ipi_hook)(int phys_apicid,
-unsigned long start_eip,
-unsigned long start_esp);
-#endif
-};
-
 struct pv_mmu_ops {
unsigned long (*read_cr2)(void);
void (*write_cr2)(unsigned long);
@@ -354,7 +346,6 @@ struct paravirt_patch_template {
struct pv_time_ops pv_time_ops;
struct pv_cpu_ops pv_cpu_ops;
struct pv_irq_ops pv_irq_ops;
-   struct pv_apic_ops pv_apic_ops;
struct pv_mmu_ops pv_mmu_ops;
struct pv_lock_ops pv_lock_ops;
 };
@@ -364,7 +355,6 @@ extern struct pv_init_ops pv_init_ops;
 extern struct pv_time_ops pv_time_ops;
 extern struct pv_cpu_ops pv_cpu_ops;
 extern struct pv_irq_ops pv_irq_ops;
-extern struct pv_apic_ops pv_apic_ops;
 extern struct pv_mmu_ops pv_mmu_ops;
 extern struct pv_lock_ops pv_lock_ops;
 
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 222a6a3..c16ddf9 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -74,9 +74,6 @@ struct smp_ops {
 extern void set_cpu_sibling_map(int cpu);
 
 #ifdef CONFIG_SMP
-#ifndef CONFIG_PARAVIRT
-#define startup_ipi_hook(phys_apicid, start_eip, start_esp) do { } while (0)
-#endif
 extern struct smp_ops smp_ops;
 
 static inline void smp_send_stop(void)
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index c2130ae..f3b79b2 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -133,7 +133,6 @@ static void *get_call_destination(u8 type)
.pv_time_ops = pv_time_ops,
.pv_cpu_ops = pv_cpu_ops,
.pv_irq_ops = pv_irq_ops,
-   .pv_apic_ops = pv_apic_ops,
.pv_mmu_ops = pv_mmu_ops,
 #ifdef CONFIG_PARAVIRT_SPINLOCKS
.pv_lock_ops = pv_lock_ops,
@@ -403,12 +402,6 @@ NOKPROBE_SYMBOL(native_get_debugreg);
 NOKPROBE_SYMBOL(native_set_debugreg);
 NOKPROBE_SYMBOL(native_load_idt);
 
-struct pv_apic_ops pv_apic_ops = {
-#ifdef CONFIG_X86_LOCAL_APIC
-   .startup_ipi_hook = paravirt_nop,
-#endif
-};
-
 #if defined(CONFIG_X86_32) && !defined(CONFIG_X86_PAE)
 /* 32-bit pagetable entries */
 #define PTE_IDENT  __PV_IS_CALLEE_SAVE(_paravirt_ident_32)
@@ -492,6 +485,5 @@ struct pv_mmu_ops pv_mmu_ops = {
 EXPORT_SYMBOL_GPL(pv_time_ops);
 EXPORT_SYMBOL(pv_cpu_ops);
 EXPORT_SYMBOL(pv_mmu_ops);
-EXPORT_SYMBOL_GPL(pv_apic_ops);
 EXPORT_SYMBOL_GPL(pv_info);
 EXPORT_SYMBOL(pv_irq_ops);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 892ee2e5..4df 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -629,13 +629,6 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned 
long start_eip)
num_starts = 0;
 
/*
-* Paravirt / VMI wants a startup IPI hook here to set up the
-* target processor state.
-*/
-   startup_ipi_hook(phys_apicid, (unsigned long) start_secondary,
-stack_start);
-
-   /*
 * Run STARTUP IPI loop.
 */
pr_debug("#startup loops: %d\n", num_starts);
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 5774800..4334e511 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1265,12 +1265,6 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = 
{
.end_context_switch = xen_end_context_switch,
 };
 
-static const struct pv_apic_ops xen_apic_ops __initconst = {
-#ifdef 

Re: [PATCH] paravirt: remove paravirt ops pmd_update_defer and pte_update_defer

2015-11-17 Thread Juergen Gross
On 17/11/15 15:46, Juergen Gross wrote:
> pte_update_defer can be removed as it is always set to the same
> function as pte_update. So any usage of pte_update_defer() can be
> replaced by pte_update().
> 
> pmd_update_defer is always set to paravirt_nop, so it can just be
> nuked.
> 
> Signed-off-by: Juergen Gross 

Sorry, hit send too early. Please forget this one.


Juergen



[PATCH] paravirt: remove paravirt ops pmd_update_defer and pte_update_defer

2015-11-17 Thread Juergen Gross
pte_update_defer can be removed as it is always set to the same
function as pte_update. So any usage of pte_update_defer() can be
replaced by pte_update().

pmd_update_defer is always set to paravirt_nop, so it can just be
nuked.
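
To illustrate why the replacement is safe, a small standalone sketch
(hypothetical stand-ins, not the kernel's symbols): both ops always resolve
to the same target, so any pte_update_defer() call site can switch to
pte_update() without changing behavior.

#include <assert.h>

static void pte_update_impl(void)
{
}

/* Every backend assigns the same function to both pointers. */
static void (*pte_update_op)(void) = pte_update_impl;
static void (*pte_update_defer_op)(void) = pte_update_impl;

int main(void)
{
	assert(pte_update_op == pte_update_defer_op);
	pte_update_op();	/* drop-in replacement for pte_update_defer_op() */
	return 0;
}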

Signed-off-by: Juergen Gross 
---
 arch/x86/include/asm/paravirt.h   | 12 
 arch/x86/include/asm/paravirt_types.h |  4 
 arch/x86/include/asm/pgtable.h|  9 +
 arch/x86/kernel/paravirt.c|  2 --
 arch/x86/lguest/boot.c|  1 -
 arch/x86/mm/pgtable.c |  3 +--
 arch/x86/xen/mmu.c|  1 -
 7 files changed, 2 insertions(+), 30 deletions(-)

diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 10d0596..10c39b9 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -381,18 +381,6 @@ static inline void pmd_update(struct mm_struct *mm, 
unsigned long addr,
PVOP_VCALL3(pv_mmu_ops.pmd_update, mm, addr, pmdp);
 }
 
-static inline void pte_update_defer(struct mm_struct *mm, unsigned long addr,
-   pte_t *ptep)
-{
-   PVOP_VCALL3(pv_mmu_ops.pte_update_defer, mm, addr, ptep);
-}
-
-static inline void pmd_update_defer(struct mm_struct *mm, unsigned long addr,
-   pmd_t *pmdp)
-{
-   PVOP_VCALL3(pv_mmu_ops.pmd_update_defer, mm, addr, pmdp);
-}
-
 static inline pte_t __pte(pteval_t val)
 {
pteval_t ret;
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 31247b5..274727e 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -274,12 +274,8 @@ struct pv_mmu_ops {
   pmd_t *pmdp, pmd_t pmdval);
void (*pte_update)(struct mm_struct *mm, unsigned long addr,
   pte_t *ptep);
-   void (*pte_update_defer)(struct mm_struct *mm,
-unsigned long addr, pte_t *ptep);
void (*pmd_update)(struct mm_struct *mm, unsigned long addr,
   pmd_t *pmdp);
-   void (*pmd_update_defer)(struct mm_struct *mm,
-unsigned long addr, pmd_t *pmdp);
 
pte_t (*ptep_modify_prot_start)(struct mm_struct *mm, unsigned long 
addr,
pte_t *ptep);
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 6ec0c8b..5126367 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -69,9 +69,7 @@ extern struct mm_struct *pgd_page_get_mm(struct page *page);
 #define pmd_clear(pmd) native_pmd_clear(pmd)
 
 #define pte_update(mm, addr, ptep)  do { } while (0)
-#define pte_update_defer(mm, addr, ptep)do { } while (0)
 #define pmd_update(mm, addr, ptep)  do { } while (0)
-#define pmd_update_defer(mm, addr, ptep)do { } while (0)
 
 #define pgd_val(x) native_pgd_val(x)
 #define __pgd(x)   native_make_pgd(x)
@@ -731,14 +729,9 @@ static inline void native_set_pmd_at(struct mm_struct *mm, 
unsigned long addr,
  * updates should either be sets, clears, or set_pte_atomic for P->P
  * transitions, which means this hook should only be called for user PTEs.
  * This hook implies a P->P protection or access change has taken place, which
- * requires a subsequent TLB flush.  The notification can optionally be delayed
- * until the TLB flush event by using the pte_update_defer form of the
- * interface, but care must be taken to assure that the flush happens while
- * still holding the same page table lock so that the shadow and primary pages
- * do not become out of sync on SMP.
+ * requires a subsequent TLB flush.
  */
 #define pte_update(mm, addr, ptep) do { } while (0)
-#define pte_update_defer(mm, addr, ptep)   do { } while (0)
 #endif
 
 /*
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index c2130ae..3ac7b85 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -444,9 +444,7 @@ struct pv_mmu_ops pv_mmu_ops = {
.set_pmd = native_set_pmd,
.set_pmd_at = native_set_pmd_at,
.pte_update = paravirt_nop,
-   .pte_update_defer = paravirt_nop,
.pmd_update = paravirt_nop,
-   .pmd_update_defer = paravirt_nop,
 
.ptep_modify_prot_start = __ptep_modify_prot_start,
.ptep_modify_prot_commit = __ptep_modify_prot_commit,
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index a0d09f6..a1900d4 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -1472,7 +1472,6 @@ __init void lguest_init(void)
pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mmu_mode;
pv_mmu_ops.lazy_mode.flush = paravirt_flush_lazy_mmu;
pv_mmu_ops.pte_update = lguest_pte_update;
-   pv_mmu_ops.pte_update_defer = lguest_pte_update;
 
 #ifdef CONFIG_X86_LOCAL_APIC
/* APIC 

Re: [PATCH] paravirt: remove unused pv_apic_ops structure

2015-11-17 Thread David Vrabel
On 17/11/15 13:44, Juergen Gross wrote:
> The only member of that structure is startup_ipi_hook which is always
> set to paravirt_nop.

Reviewed-by: David Vrabel 

David



[PATCH] paravirt: remove paravirt ops pmd_update[_defer] and pte_update_defer

2015-11-17 Thread Juergen Gross
pte_update_defer can be removed as it is always set to the same
function as pte_update. So any usage of pte_update_defer() can be
replaced by pte_update().

pmd_update and pmd_update_defer are always set to paravirt_nop, so they
can just be nuked.

Signed-off-by: Juergen Gross 
---
 arch/x86/include/asm/paravirt.h   | 17 -
 arch/x86/include/asm/paravirt_types.h |  6 --
 arch/x86/include/asm/pgtable.h| 15 ++-
 arch/x86/kernel/paravirt.c|  3 ---
 arch/x86/lguest/boot.c|  1 -
 arch/x86/mm/pgtable.c |  7 +--
 arch/x86/xen/mmu.c|  1 -
 7 files changed, 3 insertions(+), 47 deletions(-)

diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 10d0596..398f068 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -375,23 +375,6 @@ static inline void pte_update(struct mm_struct *mm, 
unsigned long addr,
 {
PVOP_VCALL3(pv_mmu_ops.pte_update, mm, addr, ptep);
 }
-static inline void pmd_update(struct mm_struct *mm, unsigned long addr,
- pmd_t *pmdp)
-{
-   PVOP_VCALL3(pv_mmu_ops.pmd_update, mm, addr, pmdp);
-}
-
-static inline void pte_update_defer(struct mm_struct *mm, unsigned long addr,
-   pte_t *ptep)
-{
-   PVOP_VCALL3(pv_mmu_ops.pte_update_defer, mm, addr, ptep);
-}
-
-static inline void pmd_update_defer(struct mm_struct *mm, unsigned long addr,
-   pmd_t *pmdp)
-{
-   PVOP_VCALL3(pv_mmu_ops.pmd_update_defer, mm, addr, pmdp);
-}
 
 static inline pte_t __pte(pteval_t val)
 {
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 31247b5..6418541 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -274,12 +274,6 @@ struct pv_mmu_ops {
   pmd_t *pmdp, pmd_t pmdval);
void (*pte_update)(struct mm_struct *mm, unsigned long addr,
   pte_t *ptep);
-   void (*pte_update_defer)(struct mm_struct *mm,
-unsigned long addr, pte_t *ptep);
-   void (*pmd_update)(struct mm_struct *mm, unsigned long addr,
-  pmd_t *pmdp);
-   void (*pmd_update_defer)(struct mm_struct *mm,
-unsigned long addr, pmd_t *pmdp);
 
pte_t (*ptep_modify_prot_start)(struct mm_struct *mm, unsigned long 
addr,
pte_t *ptep);
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 6ec0c8b..d3eee66 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -69,9 +69,6 @@ extern struct mm_struct *pgd_page_get_mm(struct page *page);
 #define pmd_clear(pmd) native_pmd_clear(pmd)
 
 #define pte_update(mm, addr, ptep)  do { } while (0)
-#define pte_update_defer(mm, addr, ptep)do { } while (0)
-#define pmd_update(mm, addr, ptep)  do { } while (0)
-#define pmd_update_defer(mm, addr, ptep)do { } while (0)
 
 #define pgd_val(x) native_pgd_val(x)
 #define __pgd(x)   native_make_pgd(x)
@@ -731,14 +728,9 @@ static inline void native_set_pmd_at(struct mm_struct *mm, 
unsigned long addr,
  * updates should either be sets, clears, or set_pte_atomic for P->P
  * transitions, which means this hook should only be called for user PTEs.
  * This hook implies a P->P protection or access change has taken place, which
- * requires a subsequent TLB flush.  The notification can optionally be delayed
- * until the TLB flush event by using the pte_update_defer form of the
- * interface, but care must be taken to assure that the flush happens while
- * still holding the same page table lock so that the shadow and primary pages
- * do not become out of sync on SMP.
+ * requires a subsequent TLB flush.
  */
 #define pte_update(mm, addr, ptep) do { } while (0)
-#define pte_update_defer(mm, addr, ptep)   do { } while (0)
 #endif
 
 /*
@@ -830,9 +822,7 @@ static inline int pmd_write(pmd_t pmd)
 static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned 
long addr,
   pmd_t *pmdp)
 {
-   pmd_t pmd = native_pmdp_get_and_clear(pmdp);
-   pmd_update(mm, addr, pmdp);
-   return pmd;
+   return native_pmdp_get_and_clear(pmdp);
 }
 
 #define __HAVE_ARCH_PMDP_SET_WRPROTECT
@@ -840,7 +830,6 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
  unsigned long addr, pmd_t *pmdp)
 {
clear_bit(_PAGE_BIT_RW, (unsigned long *)pmdp);
-   pmd_update(mm, addr, pmdp);
 }
 
 /*
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index c2130ae..f601250 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -444,9 +444,6 @@ struct 

Re: [PATCH net-next RFC V3 0/3] basic busy polling support for vhost_net

2015-11-17 Thread Felipe Franciosi
Hi Jason,

I understand your busy loop timeout is quite conservative at 50us. Did you try 
any other values?

Also, did you measure how polling affects many VMs talking to each other (e.g. 
20 VMs on each host, perhaps with several vNICs each, transmitting to a 
corresponding VM/vNIC pair on another host)?


On a completely separate experiment (busy waiting on storage I/O rings on Xen),
I have observed that bigger timeouts gave bigger benefits. On the other hand,
all cases that contended for CPU were badly hurt by any sort of polling.

The cases that contended for CPU consisted of many VMs generating workload over 
very fast I/O devices (in that case, several NVMe devices on a single host). 
And the metric that got affected was aggregate throughput from all VMs.

The solution was to determine whether to poll depending on the host's overall 
CPU utilisation at that moment. That gave me the best of both worlds as polling 
made everything faster without slowing down any other metric.
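
A hypothetical sketch of that heuristic (names and the 80% threshold are
illustrative, not from any real codebase): busy-poll only while the host
still has CPU headroom.

#include <stdbool.h>
#include <stdint.h>

#define POLL_UTIL_THRESHOLD_PCT	80	/* stop polling above this */

struct host_cpu_sample {
	uint64_t busy_usec;	/* CPU-busy time over the sample window */
	uint64_t total_usec;	/* window length times number of CPUs */
};

static bool should_busy_poll(const struct host_cpu_sample *s)
{
	if (s->total_usec == 0)
		return false;
	/*
	 * Under CPU contention, spinning steals cycles from the workloads
	 * themselves, so fall back to interrupt-driven operation.
	 */
	return s->busy_usec * 100 / s->total_usec < POLL_UTIL_THRESHOLD_PCT;
}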

Thanks,
Felipe



On 12/11/2015 10:20, "kvm-ow...@vger.kernel.org on behalf of Jason Wang" 
 wrote:

>
>
>On 11/12/2015 06:16 PM, Jason Wang wrote:
>> Hi all:
>>
>> This series tries to add basic busy polling for vhost net. The idea is
>> simple: at the end of tx/rx processing, busy poll for a newly added tx
>> descriptor and for the rx receive socket for a while. The maximum amount
>> of time (in us) that may be spent busy polling is specified via ioctl.
>>
>> Tests were done with:
>>
>> - 50 us as busy loop timeout
>> - Netperf 2.6
>> - Two machines with back to back connected ixgbe
>> - Guest with 1 vcpu and 1 queue
>>
>> Results:
>> - For stream workloads, ioexits were reduced dramatically for medium
>>   tx sizes (1024-2048, at most -39%) and for almost all rx (at most
>>   -79%) as a result of polling. This more or less compensates for the
>>   possibly wasted cpu cycles, which is probably why we can still see
>>   some increase in the normalized throughput in some cases.
>> - Tx throughput was increased (at most 105%) except for the huge
>>   write (16384). And we can send more packets in that case (+tpkts
>>   increased).
>> - Very minor rx regression in some cases.
>> - Improvement on TCP_RR (at most 16%).
>
>Forgot to mention, the following test results are, in order:
>
>1) Guest TX
>2) Guest RX
>3) TCP_RR
>
>> size/session/+thu%/+normalize%/+tpkts%/+rpkts%/+ioexits%/
>>64/ 1/   +9%/  -17%/   +5%/  +10%/   -2%
>>64/ 2/   +8%/  -18%/   +6%/  +10%/   -1%
>>64/ 4/   +4%/  -21%/   +6%/  +10%/   -1%
>>64/ 8/   +9%/  -17%/   +6%/   +9%/   -2%
>>   256/ 1/  +20%/   -1%/  +15%/  +11%/   -9%
>>   256/ 2/  +15%/   -6%/  +15%/   +8%/   -8%
>>   256/ 4/  +17%/   -4%/  +16%/   +8%/   -8%
>>   256/ 8/  -61%/  -69%/  +16%/  +10%/  -10%
>>   512/ 1/  +15%/   -3%/  +19%/  +18%/  -11%
>>   512/ 2/  +19%/0%/  +19%/  +13%/  -10%
>>   512/ 4/  +18%/   -2%/  +18%/  +15%/  -10%
>>   512/ 8/  +17%/   -1%/  +18%/  +15%/  -11%
>>  1024/ 1/  +25%/   +4%/  +27%/  +16%/  -21%
>>  1024/ 2/  +28%/   +8%/  +25%/  +15%/  -22%
>>  1024/ 4/  +25%/   +5%/  +25%/  +14%/  -21%
>>  1024/ 8/  +27%/   +7%/  +25%/  +16%/  -21%
>>  2048/ 1/  +32%/  +12%/  +31%/  +22%/  -38%
>>  2048/ 2/  +33%/  +12%/  +30%/  +23%/  -36%
>>  2048/ 4/  +31%/  +10%/  +31%/  +24%/  -37%
>>  2048/ 8/ +105%/  +75%/  +33%/  +23%/  -39%
>> 16384/ 1/0%/  -14%/   +2%/0%/  +19%
>> 16384/ 2/0%/  -13%/  +19%/  -13%/  +17%
>> 16384/ 4/0%/  -12%/   +3%/0%/   +2%
>> 16384/ 8/0%/  -11%/   -2%/   +1%/   +1%
>> size/session/+thu%/+normalize%/+tpkts%/+rpkts%/+ioexits%/
>>64/ 1/   -7%/  -23%/   +4%/   +6%/  -74%
>>64/ 2/   -2%/  -12%/   +2%/   +2%/  -55%
>>64/ 4/   +2%/   -5%/  +10%/   -2%/  -43%
>>64/ 8/   -5%/   -5%/  +11%/  -34%/  -59%
>>   256/ 1/   -6%/  -16%/   +9%/  +11%/  -60%
>>   256/ 2/   +3%/   -4%/   +6%/   -3%/  -28%
>>   256/ 4/0%/   -5%/   -9%/   -9%/  -10%
>>   256/ 8/   -3%/   -6%/  -12%/   -9%/  -40%
>>   512/ 1/   -4%/  -17%/  -10%/  +21%/  -34%
>>   512/ 2/0%/   -9%/  -14%/   -3%/  -30%
>>   512/ 4/0%/   -4%/  -18%/  -12%/   -4%
>>   512/ 8/   -1%/   -4%/   -1%/   -5%/   +4%
>>  1024/ 1/0%/  -16%/  +12%/  +11%/  -10%
>>  1024/ 2/0%/  -11%/0%/   +5%/  -31%
>>  1024/ 4/0%/   -4%/   -7%/   +1%/  -22%
>>  1024/ 8/   -5%/   -6%/  -17%/  -29%/  -79%
>>  2048/ 1/0%/  -16%/   +1%/   +9%/  -10%
>>  2048/ 2/0%/  -12%/   +7%/   +9%/  -26%
>>  2048/ 4/0%/   -7%/   -4%/   +3%/  -64%
>>  2048/ 8/   -1%/   -5%/   -6%/   +4%/  -20%
>> 16384/ 1/0%/  -12%/  +11%/   +7%/  -20%
>> 16384/ 2/0%/   -7%/   +1%/   +5%/  -26%
>> 16384/ 4/0%/   -5%/  +12%/  +22%/  -23%
>> 16384/ 8/0%/   -1%/   -8%/   +5%/   -3%
>> 
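
For reference, a minimal sketch of the bounded busy-poll the quoted cover
letter describes (hypothetical names, not the actual vhost code): after
normal tx/rx processing, spin for at most a configured budget checking for
new work before falling back to notification.

#include <stdbool.h>
#include <stdint.h>

static bool busy_poll_for_work(bool (*work_pending)(void *arg), void *arg,
			       uint64_t (*now_us)(void), uint64_t budget_us)
{
	uint64_t deadline = now_us() + budget_us;	/* e.g. 50 us above */

	while (now_us() < deadline) {
		if (work_pending(arg))
			return true;	/* found work without an ioexit/wakeup */
		/* a cpu_relax()-style pause would go here */
	}
	return false;	/* budget spent; re-enable notification and sleep */
}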

[RFC] kvmtool: add support for modern virtio-pci

2015-11-17 Thread Sasha Levin
This is a first go at adding support for the modern (based on the 1.0 virtio
spec) virtio-pci implementation.

kvmtool makes it simple to add additional transports such as this because of
its layering, so we are able to add it as a 3rd (after legacy virtio-pci and
virtio-mmio) transport layer, and still allow users to choose either the
legacy or the modern implementation (while setting the modern one as the
default).

The changes to the virtio devices are mostly the result of needing to support
>32bit features, and the different initialization method for VQs.

It's worth noting that supporting v1.0 implies any_layout, but some of our
devices made assumptions about the layout - which I've fixed. But it's worth
keeping in mind that some probably went unnoticed.

To sum it up: this is a lightly tested version for feedback about the design
and to weed out major bugs people notice. Feedback is very welcome!
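
As background on the >32bit feature handling mentioned above, a sketch of
how a virtio 1.0 transport exposes 64-bit feature sets through the common
configuration window (field names follow the virtio 1.0 spec layout; the
struct is abbreviated for illustration):

#include <stdint.h>

struct common_cfg_sketch {
	volatile uint32_t device_feature_select;	/* driver writes 0 or 1 */
	volatile uint32_t device_feature;	/* device returns that bank */
	/* ... driver_feature_select, driver_feature, queue fields, etc. */
};

static uint64_t read_device_features(struct common_cfg_sketch *cfg)
{
	uint64_t features;

	cfg->device_feature_select = 0;
	features = cfg->device_feature;		/* low 32 bits */
	cfg->device_feature_select = 1;
	features |= (uint64_t)cfg->device_feature << 32;	/* high 32 bits */
	return features;
}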

Signed-off-by: Sasha Levin 
---
 Makefile  |   1 +
 builtin-run.c |   4 +
 include/kvm/kvm-config.h  |   1 +
 include/kvm/pci.h |   8 +-
 include/kvm/virtio-9p.h   |   2 +-
 include/kvm/{virtio-pci.h => virtio-pci-modern.h} |  23 +-
 include/kvm/virtio-pci.h  |   6 +-
 include/kvm/virtio.h  |  25 +-
 include/linux/virtio_pci.h| 199 +++
 net/uip/core.c|   7 +-
 virtio/9p.c   |  35 +-
 virtio/balloon.c  |  37 +-
 virtio/blk.c  |  50 +-
 virtio/console.c  |  42 +-
 virtio/core.c |  16 +
 virtio/mmio.c |  13 +-
 virtio/net.c  |  59 ++-
 virtio/pci.c  |   4 +-
 virtio/pci_modern.c   | 599 ++
 virtio/rng.c  |  29 +-
 virtio/scsi.c |  36 +-
 x86/include/kvm/kvm-arch.h|   2 +-
 22 files changed, 1109 insertions(+), 89 deletions(-)
 copy include/kvm/{virtio-pci.h => virtio-pci-modern.h} (69%)
 create mode 100644 include/linux/virtio_pci.h
 create mode 100644 virtio/pci_modern.c

diff --git a/Makefile b/Makefile
index 59622c3..13a12f8 100644
--- a/Makefile
+++ b/Makefile
@@ -67,6 +67,7 @@ OBJS  += virtio/net.o
 OBJS   += virtio/rng.o
 OBJS+= virtio/balloon.o
 OBJS   += virtio/pci.o
+OBJS   += virtio/pci_modern.o
 OBJS   += disk/blk.o
 OBJS   += disk/qcow.o
 OBJS   += disk/raw.o
diff --git a/builtin-run.c b/builtin-run.c
index edcaf3e..e133b10 100644
--- a/builtin-run.c
+++ b/builtin-run.c
@@ -128,6 +128,8 @@ void kvm_run_set_wrapper_sandbox(void)
" rootfs"), \
OPT_STRING('\0', "hugetlbfs", &(cfg)->hugetlbfs_path, "path",   \
"Hugetlbfs path"),  \
+   OPT_BOOLEAN('\0', "virtio-legacy", &(cfg)->old_virtio, "Use"\
+   " legacy virtio-pci devices"),  \
\
OPT_GROUP("Kernel options:"),   \
OPT_STRING('k', "kernel", &(cfg)->kernel_filename, "kernel",\
@@ -517,6 +519,8 @@ static struct kvm *kvm_cmd_run_init(int argc, const char 
**argv)
kvm->cfg.vmlinux_filename = find_vmlinux();
kvm->vmlinux = kvm->cfg.vmlinux_filename;
 
+   default_transport = kvm->cfg.old_virtio ? VIRTIO_PCI : VIRTIO_PCI_MODERN;
+
if (kvm->cfg.nrcpus == 0)
kvm->cfg.nrcpus = nr_online_cpus;
 
diff --git a/include/kvm/kvm-config.h b/include/kvm/kvm-config.h
index 386fa8c..b1512a1 100644
--- a/include/kvm/kvm-config.h
+++ b/include/kvm/kvm-config.h
@@ -57,6 +57,7 @@ struct kvm_config {
bool no_dhcp;
bool ioport_debug;
bool mmio_debug;
+   bool old_virtio;
 };
 
 #endif
diff --git a/include/kvm/pci.h b/include/kvm/pci.h
index b0c28a1..19ec56a 100644
--- a/include/kvm/pci.h
+++ b/include/kvm/pci.h
@@ -4,6 +4,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 #include "kvm/devices.h"
@@ -81,7 +82,12 @@ struct pci_device_header {
u8  min_gnt;
u8  max_lat;
struct msix_cap msix;
-   u8  empty[136]; /* Rest of PCI config space */
+   struct virtio_pci_cap common_cap;
+   struct virtio_pci_notify_cap notify_cap;
+   struct virtio_pci_cap isr_cap;
+   struct virtio_pci_cap device_cap;
+   struct virtio_pci_cfg_cap pci_cap;
+   u8  empty[48]; /* Rest of PCI config space */
u32   

[PATCH -kernel] nvme: improve performance for virtual NVMe devices

2015-11-17 Thread Ming Lin
From: Rob Nelson 

This change provides a mechanism to reduce the number of MMIO doorbell
writes for the NVMe driver. When running in a virtualized environment
like QEMU, the cost of an MMIO is quite hefty. The main idea of the
patch is to provide the device two memory locations:
 1) to store the doorbell values so they can be looked up without the
doorbell MMIO write
 2) to store an event index.
I believe the doorbell value is obvious; the event index not so much.
Similar to the virtio specification, the virtual device can tell the
driver (guest OS) not to write MMIO unless it is writing past this
value.

FYI: doorbell values are written by the nvme driver (guest OS) and the
event index is written by the virtual device (host OS).
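
A hedged sketch of the event-index test this implies, analogous to virtio's
vring_need_event(): the driver skips the MMIO doorbell write unless the new
tail moves past the event index the device published. Names are
illustrative, not the patch's actual helpers.

#include <stdbool.h>
#include <stdint.h>

static bool need_mmio_doorbell(uint16_t event_idx, uint16_t new_tail,
			       uint16_t old_tail)
{
	/*
	 * True iff event_idx lies in the half-open window (old_tail,
	 * new_tail], computed with wrap-safe 16-bit arithmetic.
	 */
	return (uint16_t)(new_tail - event_idx - 1) <
	       (uint16_t)(new_tail - old_tail);
}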

The patch implements a new admin command that will communicate where
these two memory locations reside. If the command fails, the nvme
driver will work as before without any optimizations.

Contributions:
  Eric Northup 
  Frank Swiderski 
  Ted Tso 
  Keith Busch 

Just to give an idea of the performance boost with the vendor
extension: running fio [1], with a stock NVMe driver I get about 200K
read IOPs; with my vendor patch I get about 1000K read IOPs. This was
running with a null device, i.e. the backing device simply returned
success on every read IO request.

[1] Running on a 4 core machine:
  fio --time_based --name=benchmark --runtime=30
  --filename=/dev/nvme0n1 --nrfiles=1 --ioengine=libaio --iodepth=32
  --direct=1 --invalidate=1 --verify=0 --verify_fatal=0 --numjobs=4
  --rw=randread --blocksize=4k --randrepeat=false

Signed-off-by: Rob Nelson 
[mlin: port for upstream]
Signed-off-by: Ming Lin 
---
 drivers/nvme/host/Kconfig |   7 +++
 drivers/nvme/host/core.c  |   1 +
 drivers/nvme/host/pci.c   | 147 ++
 include/linux/nvme.h  |  21 +++
 4 files changed, 176 insertions(+)

diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig
index 002a94a..93f9438 100644
--- a/drivers/nvme/host/Kconfig
+++ b/drivers/nvme/host/Kconfig
@@ -8,3 +8,10 @@ config BLK_DEV_NVME
 
  To compile this driver as a module, choose M here: the
  module will be called nvme.
+
+config NVME_VENDOR_EXT_GOOGLE
+   tristate "NVMe Vendor Extension for Improved Virtualization"
+   depends on BLK_DEV_NVME
+   ---help---
+ Google extension to reduce the number of MMIO doorbell
+ writes for the NVMe driver
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 400b1ea..78ac8bb 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -160,6 +160,7 @@ int nvme_submit_sync_cmd(struct request_queue *q, struct 
nvme_command *cmd,
 {
return __nvme_submit_sync_cmd(q, cmd, buffer, bufflen, NULL, 0);
 }
+EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd);
 
 int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd,
void __user *ubuffer, unsigned bufflen,
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 91522bb..93f1f36 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -49,6 +49,9 @@
 #define SQ_SIZE(depth) (depth * sizeof(struct nvme_command))
 #define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion))
 
+/* Google Vendor ID is not in include/linux/pci_ids.h */
+#define PCI_VENDOR_ID_GOOGLE 0x1AE0
+
 static int use_threaded_interrupts;
 module_param(use_threaded_interrupts, int, 0);
 
@@ -106,6 +109,13 @@ struct nvme_dev {
unsigned long flags;
 #define NVME_CTRL_RESETTING0
 
+#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
+   u32 *db_mem;
+   dma_addr_t doorbell;
+   u32 *ei_mem;
+   dma_addr_t eventidx;
+#endif
+
struct nvme_ctrl ctrl;
 };
 
@@ -139,6 +149,12 @@ struct nvme_queue {
u8 cq_phase;
u8 cqe_seen;
struct async_cmd_info cmdinfo;
+#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
+   u32 *sq_doorbell_addr;
+   u32 *sq_eventidx_addr;
+   u32 *cq_doorbell_addr;
+   u32 *cq_eventidx_addr;
+#endif
 };
 
 /*
@@ -176,6 +192,9 @@ static inline void _nvme_check_size(void)
BUILD_BUG_ON(sizeof(struct nvme_id_ns) != 4096);
BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
+#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
+   BUILD_BUG_ON(sizeof(struct nvme_doorbell_memory) != 64);
+#endif
 }
 
 /*
@@ -289,6 +308,51 @@ static void nvme_finish_aen_cmd(struct nvme_dev *dev, 
struct nvme_completion *cq
}
 }
 
+#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
+static int nvme_vendor_memory_size(struct nvme_dev *dev)
+{
+   return ((num_possible_cpus() + 1) * 8 * dev->db_stride);
+}
+
+static int nvme_set_doorbell_memory(struct nvme_dev *dev)
+{
+   struct nvme_command c;
+
+   memset(&c, 0, sizeof(c));
+   

[RFC PATCH 0/2] Google extension to improve qemu-nvme performance

2015-11-17 Thread Ming Lin
Hi Rob & Mihai,

I wrote vhost-nvme patches on top of Christoph's NVMe target.
vhost-nvme still uses mmio, so the guest OS can run an unmodified NVMe
driver. But the tests I have done didn't show competitive performance
compared to virtio-blk/virtio-scsi. The bottleneck is in mmio. Your nvme
vendor extension patches greatly reduce the number of MMIO writes,
so I'd like to push them upstream.

I ported these 2 patches to a newer kernel and qemu.
I used a ram disk as the backend to compare performance.

qemu-nvme: 29MB/s
qemu-nvme+google-ext: 100MB/s
virtio-blk: 174MB/s
virtio-scsi: 118MB/s

I'll show you qemu-vhost-nvme+google-ext numbers later.

root@guest:~# cat test.job 
[global]
bs=4k
ioengine=libaio
iodepth=64
direct=1
runtime=120
time_based
rw=randread
norandommap
group_reporting
gtod_reduce=1
numjobs=2

[job1]
filename=/dev/nvme0n1
#filename=/dev/vdb
#filename=/dev/sda
rw=read

Patches also available at:
kernel:
https://git.kernel.org/cgit/linux/kernel/git/mlin/linux.git/log/?h=nvme-google-ext
qemu:
http://www.minggr.net/cgit/cgit.cgi/qemu/log/?h=nvme-google-ext 

Thanks,
Ming


[PATCH -qemu] nvme: support Google vendor extension

2015-11-17 Thread Ming Lin
From: Mihai Rusu 

This implements the device side for an NVMe vendor extension that
reduces the number of MMIO writes which can result in a very large
performance benefit in virtualized environments.

See the following link for a description of the mechanism and the
kernel NVMe driver changes to support this vendor extension:
http://lists.infradead.org/pipermail/linux-nvme/2014-July/001076.html

On my workstation (3.2GHz Xeon E5-1650), running QEMU:
$ bin/opt/native/x86_64-softmmu/qemu-system-x86_64 \
-enable-kvm -m 2048 -smp 4 \
-drive if=virtio,file=debian.raw,cache=none \
-drive file=nvme.raw,if=none,id=nvme-dev \
-device nvme,drive=nvme-dev,serial=nvme-serial

Using "fio":
vm # fio -time_based --name=benchmark --ioengine=libaio --iodepth=32 \
--numjobs=1 --runtime=30 --blocksize=4k --filename=/dev/nvme0n1 \
--nrfiles=1 --invalidate=1 --verify=0 --direct=1 --rw=randread

I get about 20k IOPs with the original code and about 85k IOPs with
the vendor extension changes applied (and running a vendor extension
supporting 3.14 based guest kernel).

Signed-off-by: Mihai Rusu 
[fixed for a merging into different tree; added VID/DID params]
Signed-off-by: Keith Busch 
[mlin: port for upstream]
Signed-off-by: Ming Lin 
---
 hw/block/nvme.c | 92 ++---
 hw/block/nvme.h | 18 +++
 2 files changed, 106 insertions(+), 4 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 169e4fa..3e1c38d 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -20,6 +20,7 @@
  *  -device nvme,drive=,serial=,id=
  */
 
+#include 
 #include 
 #include 
 #include 
@@ -158,6 +159,14 @@ static uint16_t nvme_dma_read_prp(NvmeCtrl *n, uint8_t 
*ptr, uint32_t len,
 return NVME_SUCCESS;
 }
 
+static void nvme_update_cq_head(NvmeCQueue *cq)
+{
+if (cq->db_addr) {
+pci_dma_read(&cq->ctrl->parent_obj, cq->db_addr,
+ &cq->head, sizeof(cq->head));
+}
+}
+
 static void nvme_post_cqes(void *opaque)
 {
 NvmeCQueue *cq = opaque;
@@ -168,6 +177,8 @@ static void nvme_post_cqes(void *opaque)
 NvmeSQueue *sq;
 hwaddr addr;
 
+nvme_update_cq_head(cq);
+
 if (nvme_cq_full(cq)) {
 break;
 }
@@ -350,6 +361,8 @@ static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, 
uint64_t dma_addr,
QTAILQ_INSERT_TAIL(&(sq->req_list), &sq->io_req[i], entry);
 }
 sq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_process_sq, sq);
+sq->db_addr = 0;
+sq->eventidx_addr = 0;
 
 assert(n->cq[cqid]);
 cq = n->cq[cqid];
@@ -430,6 +443,8 @@ static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, 
uint64_t dma_addr,
 cq->head = cq->tail = 0;
QTAILQ_INIT(&cq->req_list);
QTAILQ_INIT(&cq->sq_list);
+cq->db_addr = 0;
+cq->eventidx_addr = 0;
msix_vector_use(&n->parent_obj, cq->vector);
 n->cq[cqid] = cq;
 cq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_post_cqes, cq);
@@ -528,6 +543,40 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd 
*cmd, NvmeRequest *req)
 return NVME_SUCCESS;
 }
 
+static uint16_t nvme_set_db_memory(NvmeCtrl *n, const NvmeCmd *cmd)
+{
+uint64_t db_addr = le64_to_cpu(cmd->prp1);
+uint64_t eventidx_addr = le64_to_cpu(cmd->prp2);
+int i;
+
+/* Addresses should not be NULL and should be page aligned. */
+if (db_addr == 0 || db_addr & (n->page_size - 1) ||
+eventidx_addr == 0 || eventidx_addr & (n->page_size - 1)) {
+return NVME_INVALID_MEMORY_ADDRESS | NVME_DNR;
+}
+
+/* This assumes all I/O queues are created before this command is handled.
+ * We skip the admin queues. */
+for (i = 1; i < n->num_queues; i++) {
+NvmeSQueue *sq = n->sq[i];
+NvmeCQueue *cq = n->cq[i];
+
+if (sq != NULL) {
+/* Submission queue tail pointer location, 2 * QID * stride. */
+sq->db_addr = db_addr + 2 * i * 4;
+sq->eventidx_addr = eventidx_addr + 2 * i * 4;
+}
+
+if (cq != NULL) {
+/* Completion queue head pointer location, (2 * QID + 1) * stride.
+ */
+cq->db_addr = db_addr + (2 * i + 1) * 4;
+cq->eventidx_addr = eventidx_addr + (2 * i + 1) * 4;
+}
+}
+return NVME_SUCCESS;
+}
+
 static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
 {
 switch (cmd->opcode) {
@@ -545,11 +594,29 @@ static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeCmd *cmd, 
NvmeRequest *req)
 return nvme_set_feature(n, cmd, req);
 case NVME_ADM_CMD_GET_FEATURES:
 return nvme_get_feature(n, cmd, req);
+case NVME_ADM_CMD_SET_DB_MEMORY:
+return nvme_set_db_memory(n, cmd);
 default:
 return NVME_INVALID_OPCODE | NVME_DNR;
 }
 }
 
+static void nvme_update_sq_eventidx(const NvmeSQueue *sq)
+{
+if (sq->eventidx_addr) {
+

Re: [PATCH] virtio_ring: Shadow available ring flags & index

2015-11-17 Thread Venkatesh Srinivas via Virtualization
On Mon, Nov 16, 2015 at 7:46 PM, Xie, Huawei  wrote:

> On 11/14/2015 7:41 AM, Venkatesh Srinivas wrote:
> > On Wed, Nov 11, 2015 at 02:34:33PM +0200, Michael S. Tsirkin wrote:
> >> On Tue, Nov 10, 2015 at 04:21:07PM -0800, Venkatesh Srinivas wrote:
> >>> Improves cacheline transfer flow of available ring header.
> >>>
> >>> Virtqueues are implemented as a pair of rings, one producer->consumer
> >>> avail ring and one consumer->producer used ring; preceding the
> >>> avail ring in memory are two contiguous u16 fields -- avail->flags
> >>> and avail->idx. A producer posts work by writing to avail->idx and
> >>> a consumer reads avail->idx.
> >>>
> >>> The flags and idx fields only need to be written by a producer CPU
> >>> and only read by a consumer CPU; when the producer and consumer are
> >>> running on different CPUs and the virtio_ring code is structured to
> >>> only have source writes/sink reads, we can continuously transfer the
> >>> avail header cacheline between 'M' states between cores. This flow
> >>> optimizes core -> core bandwidth on certain CPUs.
> >>>
> >>> (see: "Software Optimization Guide for AMD Family 15h Processors",
> >>> Section 11.6; similar language appears in the 10h guide and should
> >>> apply to CPUs w/ exclusive caches, using LLC as a transfer cache)
> >>>
> >>> Unfortunately the existing virtio_ring code issued reads to the
> >>> avail->idx and read-modify-writes to avail->flags on the producer.
> >>>
> >>> This change shadows the flags and index fields in producer memory;
> >>> the vring code now reads from the shadows and only ever writes to
> >>> avail->flags and avail->idx, allowing the cacheline to transfer
> >>> core -> core optimally.
> >> Sounds logical, I'll apply this after a  bit of testing
> >> of my own, thanks!
> > Thanks!
>

> Venkatesh:
> Is it that your patch only applies to CPUs w/ exclusive caches?

No --- it applies when the inter-cache coherence flow is optimized by
'M' -> 'M' transfers and when producer reads might interfere w/
consumer prefetchw/reads. The AMD Optimization guides have specific
language on this subject, but other platforms may benefit.
(see Intel #'s below)
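
A minimal sketch of the shadowing idea (assumed names, not the kernel's,
and memory barriers omitted for brevity): the producer keeps private copies
of idx/flags and only ever stores to the shared header, so the cacheline can
migrate core -> core in 'M' state without producer-side reads.

#include <stdint.h>

struct avail_sketch {
	uint16_t flags;
	uint16_t idx;
	uint16_t ring[];
};

struct producer {
	struct avail_sketch *avail;	/* shared with the consumer */
	uint16_t shadow_idx;		/* producer-private copies */
	uint16_t shadow_flags;
};

static void publish(struct producer *p, uint16_t head, uint16_t ring_mask)
{
	p->avail->ring[p->shadow_idx & ring_mask] = head;
	/* The read-modify-write happens on the shadow; the shared line
	 * only ever sees plain stores from this side. */
	p->avail->idx = ++p->shadow_idx;
}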

> Do you have perf data on Intel CPUs?

Good idea -- I ran some tests on a couple of Intel platforms:

(these are perf data from sample runs; for each I ran many runs, the
 numbers were pretty stable except for Haswell-EP cross-socket)

One-socket Intel Xeon W3690 ("Westmere"), 3.46 GHz; core turbo disabled
===
(note -- w/ core turbo disabled, performance is _very_ stable; variance of
 < 0.5% run-to-run; figure of merit is "seconds elapsed" here)

* Producer / consumer bound to Hyperthread pairs:

 Performance counter stats for './vring_bench_noshadow 10':

 343,425,166,916 L1-dcache-loads
  21,393,148 L1-dcache-load-misses #0.01% of all L1-dcache hits
  61,709,640,363 L1-dcache-stores
   5,745,690 L1-dcache-store-misses
  10,186,932,553 L1-dcache-prefetches
   1,491 L1-dcache-prefetch-misses
   121.335699344 seconds time elapsed

 Performance counter stats for './vring_bench_shadow 10':

 334,766,413,861 L1-dcache-loads
  15,787,778 L1-dcache-load-misses #0.00% of all L1-dcache hits
  62,735,792,799 L1-dcache-stores
   3,252,113 L1-dcache-store-misses
   9,018,273,596 L1-dcache-prefetches
 819 L1-dcache-prefetch-misses
   121.206339656 seconds time elapsed

Effectively performance-neutral.

* Producer / consumer bound to separate cores, same socket:

 Performance counter stats for './vring_bench_noshadow 10':

   399,943,384,509 L1-dcache-loads
 8,868,334,693 L1-dcache-load-misses #2.22% of all L1-dcache hits
62,721,376,685 L1-dcache-stores
 2,786,806,982 L1-dcache-store-misses
10,915,046,967 L1-dcache-prefetches
   328,508 L1-dcache-prefetch-misses
 146.585969976 seconds time elapsed

 Performance counter stats for './vring_bench_shadow 10':

   425,123,067,750 L1-dcache-loads 
 6,689,318,709 L1-dcache-load-misses #1.57% of all L1-dcache hits
62,747,525,005 L1-dcache-stores 
 2,496,274,505 L1-dcache-store-misses
 8,627,873,397 L1-dcache-prefetches
   146,729 L1-dcache-prefetch-misses
 142.657327765 seconds time elapsed

2.6% reduction in runtime; note that L1-dcache-load-misses reduced dramatically,
2 Billion(!) L1d misses saved.

Two-socket Intel Sandy Bridge(-EP) Xeon, 2.6 GHz; core turbo disabled
=

* Producer / consumer bound to Hyperthread pairs:

 Performance counter stats for './vring_bench_noshadow 1':

37,129,070,402 L1-dcache-loads
 6,416,246 L1-dcache-load-misses #0.02% of all L1-dcache hits
 6,207,794,675 L1-dcache-stores
 2,800,094 L1-dcache-store-misses
  17.029790809 seconds time elapsed

 Performance counter 
