On Mon, Aug 29, 2016 at 09:17:02AM +0800, Longpeng (Mike) wrote:
> This patch presents virtual L3 cache info for virtual cpus.

Just changing the L3 cache size in the CPUID code will make
guests see a different cache topology after upgrading QEMU (even
on live migration). If you want to change the default, you need
to keep the old values on old machine-types.

In other words, we need to make the cache size configurable, and
set compat_props on PC_COMPAT_2_7.

Other comments below:

> 
> Some software algorithms are based on the hardware's cache info. For example,
> in the x86 Linux kernel, when cpu1 wants to wake up a task on cpu2, cpu1 will
> trigger a resched IPI and tell cpu2 to do the wakeup if they don't share a
> last-level cache (LLC). Conversely, cpu1 will access cpu2's runqueue directly
> if they share the LLC.
> The relevant linux-kernel code is shown below:
> 
>       static void ttwu_queue(struct task_struct *p, int cpu)
>       {
>               struct rq *rq = cpu_rq(cpu);
>               ......
>               if (... && !cpus_share_cache(smp_processor_id(), cpu)) {
>                       ......
>                       ttwu_queue_remote(p, cpu); /* will trigger RES IPI */
>                       return;
>               }
>               ......
>               ttwu_do_activate(rq, p, 0); /* access target's rq directly */
>               ......
>       }
> 
> In real hardware, the cpus on the same socket share the L3 cache, so one won't
> trigger a resched IPI when waking up a task on another. But QEMU doesn't
> present virtual L3 cache info to the VM, so the linux guest will trigger lots
> of RES IPIs under some workloads even if the virtual cpus belong to the same
> virtual socket.
> 
> For KVM, this degrades performance, because there will be lots of vmexits due
> to the guest sending IPIs.
> 
> The workload is SAP HANA's testsuite; we run it for one round (about 40
> minutes) and observe the number of RES IPIs triggered in the (Suse11sp3) guest
> during that period:
> 
>         No-L3           With-L3(applied this patch)
> cpu0: 363890          44582
> cpu1: 373405          43109
> cpu2: 340783          43797
> cpu3: 333854          43409
> cpu4: 327170          40038
> cpu5: 325491          39922
> cpu6: 319129          42391
> cpu7: 306480          41035
> cpu8: 161139          32188
> cpu9: 164649          31024
> cpu10:        149823          30398
> cpu11:        149823          32455
> cpu12:        164830          35143
> cpu13:        172269          35805
> cpu14:        179979          33898
> cpu15:        194505          32754
> avg:  268963.6        40129.8
> 
> The VM's topology is "1*socket 8*cores 2*threads".
> After presenting virtual L3 cache info to the VM, the number of RES IPIs in
> the guest is reduced by 85%.

What happens to overall system performance if the VCPU threads
actually run on separate sockets in the host?

Other questions about the code below:

> 
> Signed-off-by: Longpeng(Mike) <longpe...@huawei.com>
> ---
>  target-i386/cpu.c | 34 +++++++++++++++++++++++++++-------
>  1 file changed, 27 insertions(+), 7 deletions(-)
> 
> diff --git a/target-i386/cpu.c b/target-i386/cpu.c
> index 6a1afab..5a5fd06 100644
> --- a/target-i386/cpu.c
> +++ b/target-i386/cpu.c
> @@ -57,6 +57,7 @@
>  #define CPUID_2_L1D_32KB_8WAY_64B 0x2c
>  #define CPUID_2_L1I_32KB_8WAY_64B 0x30
>  #define CPUID_2_L2_2MB_8WAY_64B   0x7d
> +#define CPUID_2_L3_12MB_24WAY_64B 0xea
> 
> 
>  /* CPUID Leaf 4 constants: */
> @@ -131,11 +132,15 @@
>  #define L2_LINES_PER_TAG       1
>  #define L2_SIZE_KB_AMD       512
> 
> -/* No L3 cache: */
> -#define L3_SIZE_KB             0 /* disabled */
> -#define L3_ASSOCIATIVITY       0 /* disabled */
> -#define L3_LINES_PER_TAG       0 /* disabled */
> -#define L3_LINE_SIZE           0 /* disabled */
> +/* Level 3 unified cache: */
> +#define L3_LINE_SIZE          64
> +#define L3_ASSOCIATIVITY      24
> +#define L3_SETS             8192
> +#define L3_PARTITIONS          1
> +#define L3_DESCRIPTOR CPUID_2_L3_12MB_24WAY_64B
> +/*FIXME: CPUID leaf 0x80000006 is inconsistent with leaves 2 & 4 */

Why are you intentionally introducing a bug?

> +#define L3_LINES_PER_TAG       1
> +#define L3_SIZE_KB_AMD      1024
> 
>  /* TLB definitions: */
> 
> @@ -2328,7 +2333,7 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index,
> uint32_t count,

I believe Thunderbird line wrapping broke the patch. Git can't
apply it.

>          }
>          *eax = 1; /* Number of CPUID[EAX=2] calls required */
>          *ebx = 0;
> -        *ecx = 0;
> +        *ecx = (L3_DESCRIPTOR);
>          *edx = (L1D_DESCRIPTOR << 16) | \
>                 (L1I_DESCRIPTOR <<  8) | \
>                 (L2_DESCRIPTOR);
> @@ -2374,6 +2379,21 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index,
> uint32_t count,
>                  *ecx = L2_SETS - 1;
>                  *edx = CPUID_4_NO_INVD_SHARING;
>                  break;
> +            case 3: /* L3 cache info */
> +                *eax |= CPUID_4_TYPE_UNIFIED | \
> +                        CPUID_4_LEVEL(3) | \
> +                        CPUID_4_SELF_INIT_LEVEL;
> +                /*
> +                * According to qemu's APICIDs generating rule, this can make
> +                * sure vcpus on the same vsocket get the same llc_id.
> +                */
> +                *eax |= (cs->nr_cores * cs->nr_threads - 1) << 14;

The above comment doesn't seem to be true.

For example: if nr_cores=9,nr_threads=3, then:

SMT_Mask_Width = Log2(RoundToNearestPof2(CPUID.1:EBX[23:16]) /
                      ((CPUID.(EAX=4, ECX=0):EAX[31:26]) + 1))
               = Log2(RoundToNearestPof2(nr_cores * nr_threads) /
                      ((nr_cores - 1) + 1))
               = Log2(RoundToNearestPof2(27) / 9)
               = Log2(32 / 9) = Log2(3) = 2
CoreOnly_Mask_Width = Log2(1 + (CPUID.(EAX=4, ECX=0):EAX[31:26]))
                    = Log2(1 + (nr_cores - 1)) = Log2(nr_cores)
                    = Log2(9) = 4
CorePlus_Mask_Width = CoreOnly_Mask_Width + SMT_Mask_Width
                    = 4 + 2 = 6

But:

Cache_Mask_Width[3] = Log2(RoundToNearestPof2(
                               1 + CPUID.(EAX=4, ECX=n):EAX[25:14]))
                    = Log2(RoundToNearestPof2(
                               1 + CPUID.(EAX=4, ECX=3):EAX[25:14]))
                    = Log2(RoundToNearestPof2(
                               1 + (nr_cores * nr_threads - 1)))
                    = Log2(RoundToNearestPof2(nr_cores * nr_threads))
                    = Log2(RoundToNearestPof2(27))
                    = Log2(32) = 5

That means the VCPU at package 1, core 8, thread 2 (APIC ID
1100010b) has Package_ID=1, but its L3 Cache_ID would be 3.


> +                *ebx = (L3_LINE_SIZE - 1) | \
> +                       ((L3_PARTITIONS - 1) << 12) | \
> +                       ((L3_ASSOCIATIVITY - 1) << 22);
> +                *ecx = L3_SETS - 1;
> +                *edx = CPUID_4_INCLUSIVE | CPUID_4_COMPLEX_IDX;
> +                break;
>              default: /* end of info */
>                  *eax = 0;
>                  *ebx = 0;
> @@ -2585,7 +2605,7 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index,
> uint32_t count,
>          *ecx = (L2_SIZE_KB_AMD << 16) | \
>                 (AMD_ENC_ASSOC(L2_ASSOCIATIVITY) << 12) | \
>                 (L2_LINES_PER_TAG << 8) | (L2_LINE_SIZE);
> -        *edx = ((L3_SIZE_KB/512) << 18) | \
> +        *edx = ((L3_SIZE_KB_AMD / 512) << 18) | \
>                 (AMD_ENC_ASSOC(L3_ASSOCIATIVITY) << 12) | \
>                 (L3_LINES_PER_TAG << 8) | (L3_LINE_SIZE);
>          break;
> -- 
> 1.8.3.1
> 

-- 
Eduardo

Reply via email to