On Thu, 8 Sep 2016, Fenghua Yu wrote:

> +#define __DCBM_TABLE_INDEX(x)        (x << 1)
> +#define __ICBM_TABLE_INDEX(x)        ((x << 1) + 1)

This macro mess is completely undocumented.
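If these indices are what I think they are - the data/code mask pairs in
the IA32_L3_QOS_MASK MSR layout with CDP enabled - then a comment along
these lines (my guess, untested sketch) would spare everyone a trip to the
SDM:

        /*
         * With Code/Data Prioritization (CDP) enabled, each CLOSID owns a
         * pair of entries in the CBM table: the data bitmask at the even
         * index (closid * 2) and the code bitmask at the odd index
         * (closid * 2 + 1).
         */
        #define __DCBM_TABLE_INDEX(x)        ((x) << 1)
        #define __ICBM_TABLE_INDEX(x)        (((x) << 1) + 1)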
> +inline int get_dcbm_table_index(int x)

static inline ???

> +{
> +        return DCBM_TABLE_INDEX(x);
> +}
> +
> +inline int get_icbm_table_index(int x)
> +{
> +        return ICBM_TABLE_INDEX(x);
> +}

Why are you introducing these when they are not used at all?

>  /*
>   * cache_alloc_hsw_probe() - Have to probe for Intel haswell server CPUs
>   * as it does not have CPUID enumeration support for Cache allocation.
> @@ -98,41 +123,141 @@ static inline bool cache_alloc_hsw_probe(void)
>
>          wrmsr_safe(MSR_IA32_PQR_ASSOC, l, h_old);
>
> -        boot_cpu_data.x86_cache_max_closid = 4;
> -        boot_cpu_data.x86_cache_max_cbm_len = 20;
> +        boot_cpu_data.x86_l3_max_closid = 4;
> +        boot_cpu_data.x86_l3_max_cbm_len = 20;

So if you actually change the name of the cpudata struct member, then this
would make sense to be split out into a separate patch. But hell, the patch
order of this stuff is an unholy mess anyway.

>          min_bitmask_len = 2;
>
>          return true;
>  }
>
> +u32 max_cbm_len(int level)
> +{
> +        switch (level) {
> +        case CACHE_LEVEL3:
> +                return boot_cpu_data.x86_l3_max_cbm_len;
> +        default:
> +                break;
> +        }
> +
> +        return (u32)~0;
> +}
> +u64 max_cbm(int level)

More functions without users and of course without a proper prefix for
public consumption. Documentation is not available either.

> +{
> +        switch (level) {
> +        case CACHE_LEVEL3:
> +                return (1ULL << boot_cpu_data.x86_l3_max_cbm_len) - 1;

So the above max_cbm_len() returns:

        cpuid_count(0x00000010, 1, &eax, &ebx, &ecx, &edx);
        c->x86_l3_max_cbm_len = eax + 1;

i.e. the content of leaf 10:1 EAX plus 1. According to the SDM:

        EAX 4:0: Length of the capacity bit mask for the corresponding ResID.

So first of all, why is there a "+ 1"? Again, that's not documented in the
cpuid initialization code at all.

So now you take that magic number and return:

        (2 ^ magic) - 1

Cute. To be honest I'm too lazy to read through the SDM and figure it out
myself. This kind of stuff belongs into the code as comments. I seriously
doubt that the function names match the actual meaning.
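If the "+ 1" is there because EAX 4:0 reports the length in minus-one
notation - which is what I suspect - then say so where the value is read.
Untested sketch:

        cpuid_count(0x00000010, 1, &eax, &ebx, &ecx, &edx);
        /*
         * CPUID.(EAX=10H, ECX=1):EAX[4:0] reports the length of the
         * capacity bitmask in minus-one notation, hence the "+ 1".
         */
        c->x86_l3_max_cbm_len = (eax & 0x1f) + 1;

Then max_cbm() returning ((1 << cbm_len) - 1), i.e. a mask with cbm_len
bits set, at least has a chance of being understood.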
> +        default:
> +                break;
> +        }
> +
> +        return (u64)~0;

What kind of return code is this?

> +static inline bool cat_l3_supported(struct cpuinfo_x86 *c)
> +{
> +        if (cpu_has(c, X86_FEATURE_CAT_L3))
> +                return true;
> +
> +        /*
> +         * Probe for Haswell server CPUs.
> +         */
> +        if (c->x86 == 0x6 && c->x86_model == 0x3f)
> +                return cache_alloc_hsw_probe();

Ah, now we have an actual user for that haswell probe thingy ....

> +        return false;
> +}
> +
>  void __intel_rdt_sched_in(void *dummy)

I still have not found a reasonable explanation for this dummy argument.

>  {
>          struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
> +        struct rdtgroup *rdtgrp;
> +        int closid;
> +        int domain;

Sigh.

>          /*
> -         * Currently closid is always 0. When user interface is added,
> -         * closid will come from user interface.
> +         * If this task is assigned to an rdtgroup, use it.
> +         * Else use the group assigned to this cpu.
>           */
> -        if (state->closid == 0)
> +        rdtgrp = current->rdtgroup;
> +        if (!rdtgrp)
> +                rdtgrp = this_cpu_read(cpu_rdtgroup);

This actually makes sense! Thanks for listening!

> +
> +        domain = this_cpu_read(cpu_shared_domain);
> +        closid = rdtgrp->resource.closid[domain];
> +
> +        if (closid == state->closid)
>                  return;
>
> -        wrmsr(MSR_IA32_PQR_ASSOC, state->rmid, 0);
> -        state->closid = 0;
> +        state->closid = closid;
> +        /* Don't really write PQR register in simulation mode. */
> +        if (unlikely(rdt_opts.simulate_cat_l3))
> +                return;
> +
> +        wrmsr(MSR_IA32_PQR_ASSOC, state->rmid, closid);
>  }
>
>  /*
>   * When cdp mode is enabled, refcnt is maintained in the dcache_cbm entry.
>   */
> -static inline void closid_get(u32 closid)
> +inline void closid_get(u32 closid, int domain)

s/static inline/inline/

What the heck? Can you please explain what this is doing?

>  {
> -        struct clos_cbm_table *cct = &cctable[DCBM_TABLE_INDEX(closid)];
> -
>          lockdep_assert_held(&rdtgroup_mutex);
>
> -        cct->clos_refcnt++;
> +        if (cat_l3_enabled) {
> +                int l3_domain;
> +                int dindex;

        int l3_domain, dindex;

> +                l3_domain = shared_domain[domain].l3_domain;
> +                dindex = DCBM_TABLE_INDEX(closid);
> +                l3_cctable[l3_domain][dindex].clos_refcnt++;
> +                if (cdp_enabled) {
> +                        int iindex = ICBM_TABLE_INDEX(closid);

And if you call that variable 'index' instead of 'dindex' then you don't
need an extra one 'iindex'.

> +
> +                        l3_cctable[l3_domain][iindex].clos_refcnt++;

So now you have a separate refcount for the Icache part, but the comment
above the function still says otherwise.

> +                }
> +        }
>  }
>
> -static int closid_alloc(u32 *closid)
> +int closid_alloc(u32 *closid, int domain)
>  {
>          u32 maxid;
>          u32 id;
> @@ -140,105 +265,215 @@ static int closid_alloc(u32 *closid)
>          lockdep_assert_held(&rdtgroup_mutex);
>
>          maxid = cconfig.max_closid;
> -        id = find_first_zero_bit(cconfig.closmap, maxid);
> +        id = find_first_zero_bit((unsigned long *)cconfig.closmap[domain],

Why do you need a typecast here? Get your damned structs straight.

> +                                 maxid);
> +
>          if (id == maxid)
>                  return -ENOSPC;
>
> -        set_bit(id, cconfig.closmap);
> -        closid_get(id);
> +        set_bit(id, (unsigned long *)cconfig.closmap[domain]);
> +        closid_get(id, domain);
>          *closid = id;
> -        cconfig.closids_used++;
>
>          return 0;
>  }
>
> -static inline void closid_free(u32 closid)
> +unsigned int get_domain_num(int level)
>  {
> -        clear_bit(closid, cconfig.closmap);
> -        cctable[DCBM_TABLE_INDEX(closid)].cbm = 0;
> -
> -        if (WARN_ON(!cconfig.closids_used))
> -                return;
> +        if (level == CACHE_LEVEL3)
> +                return cpumask_weight(&rdt_l3_cpumask);

get_domain_num(level) suggests to me that it returns the domain number
corresponding to the level, but it actually returns the number of bits set
in the rdt_l3_cpumask. Very intuitive - NOT!

> +        else
> +                return -EINVAL;

Proper return value for a function returning 'unsigned int' ....

> +}
>
> -        cconfig.closids_used--;
> +int level_to_leaf(int level)
> +{
> +        switch (level) {
> +        case CACHE_LEVEL3:
> +                return 3;
> +        default:
> +                return -EINVAL;
> +        }
>  }
>
> -static void closid_put(u32 closid)
> +void closid_free(u32 closid, int domain, int level)
>  {
> -        struct clos_cbm_table *cct = &cctable[DCBM_TABLE_INDEX(closid)];
> +        struct clos_cbm_table **cctable;
> +        int leaf;
> +        struct cpumask *mask;
> +        int cpu;
> +
> +        if (level == CACHE_LEVEL3)
> +                cctable = l3_cctable;

Oh well. Why does this assignment need to happen here?

> +
> +        clear_bit(closid, (unsigned long *)cconfig.closmap[domain]);
> +
> +        if (level == CACHE_LEVEL3) {

And not here where it actually makes sense?

> +                cctable[domain][closid].cbm = max_cbm(level);
> +                leaf = level_to_leaf(level);
> +                mask = &cache_domains[leaf].shared_cpu_map[domain];
> +                cpu = cpumask_first(mask);
> +                smp_call_function_single(cpu, cbm_update_l3_msr, &closid, 1);

A comment explaining that this must be done on one of the cpus in @domain
would be too helpful.
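Untested sketch, reusing your own variables:

        /*
         * The CBM MSRs are shared by all CPUs in a cache domain, so
         * writing the default mask on any one CPU of @domain updates
         * it for the whole domain.
         */
        cpu = cpumask_first(mask);
        smp_call_function_single(cpu, cbm_update_l3_msr, &closid, 1);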
> +        }
> +}
>
> +static void _closid_put(u32 closid, struct clos_cbm_table *cct,

Please use two underscores so it's obvious.

> +                        int domain, int level)
> +{
>          lockdep_assert_held(&rdtgroup_mutex);
>          if (WARN_ON(!cct->clos_refcnt))
>                  return;
>
>          if (!--cct->clos_refcnt)
> -                closid_free(closid);
> +                closid_free(closid, domain, level);
>  }
>
> -static void msr_cpu_update(void *arg)
> +void closid_put(u32 closid, int domain)
> +{
> +        struct clos_cbm_table *cct;
> +
> +        if (cat_l3_enabled) {
> +                int l3_domain = shared_domain[domain].l3_domain;
> +
> +                cct = &l3_cctable[l3_domain][DCBM_TABLE_INDEX(closid)];
> +                _closid_put(closid, cct, l3_domain, CACHE_LEVEL3);
> +                if (cdp_enabled) {
> +                        cct = &l3_cctable[l3_domain][ICBM_TABLE_INDEX(closid)];
> +                        _closid_put(closid, cct, l3_domain, CACHE_LEVEL3);
> +                }
> +        }
> +}
> +
> +void msr_cpu_update(void *arg)
>  {
>          struct rdt_remote_data *info = arg;
>
> +        if (unlikely(rdt_opts.verbose))
> +                pr_info("Write %lx to msr %x on cpu%d\n",
> +                        (unsigned long)info->val, info->msr,
> +                        smp_processor_id());

That's what DYNAMIC_DEBUG is for.
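I.e. kill the verbose flag and use pr_debug(), which can be flipped on at
runtime. Sketch, assuming info->val stays u64:

        pr_debug("Write %llx to msr %x on cpu%d\n",
                 info->val, info->msr, smp_processor_id());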
> +
> +        if (unlikely(rdt_opts.simulate_cat_l3))
> +                return;

Do we really need all this simulation stuff?

> +
>          wrmsrl(info->msr, info->val);
>  }
>
> +static struct cpumask *rdt_cache_cpumask(int level)
> +{
> +        return &rdt_l3_cpumask;

That's a really useful helper .... Your choice of checking 'level' five
times in a row in various helpers versus returning l3 unconditionally is at
least interesting.

> +int get_cache_leaf(int level, int cpu)
> +{
> +        int index;
> +        struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);

this_cpu_ci is a complete misnomer as it suggests that it's actually the
cacheinfo for 'this cpu', i.e. the cpu on which the code is executing.

> +        struct cacheinfo *this_leaf;
> +        int num_leaves = this_cpu_ci->num_leaves;
> +
> +        for (index = 0; index < num_leaves; index++) {
> +                this_leaf = this_cpu_ci->info_list + index;
> +                if (this_leaf->level == level)
> +                        return index;

The function is misnamed as well. It does not return the cache leaf, it
returns the leaf index .....

> +        }

Why do you have a cacheinfo related function in this RDT code?

> +
> +        return -EINVAL;
> +}
> +
> +static struct cpumask *get_shared_cpu_map(int cpu, int level)
> +{
> +        int index;
> +        struct cacheinfo *leaf;
> +        struct cpu_cacheinfo *cpu_ci = get_cpu_cacheinfo(cpu);
> +
> +        index = get_cache_leaf(level, cpu);
> +        if (index < 0)
> +                return 0;
> +
> +        leaf = cpu_ci->info_list + index;

While here you actually get the leaf.

> +
> +        return &leaf->shared_cpu_map;
>  }

>  static int clear_rdtgroup_cpumask(unsigned int cpu)
> @@ -293,63 +531,270 @@ static int clear_rdtgroup_cpumask(unsigned int cpu)
>
>  static int intel_rdt_offline_cpu(unsigned int cpu)
>  {
> -        int i;
> +        struct cpumask *shared_cpu_map;
> +        int new_cpu;
> +        int l3_domain;
> +        int level;
> +        int leaf;

Sigh. 1/3 of the line space is wasted for single variable declarations.

>          mutex_lock(&rdtgroup_mutex);
> -        if (!cpumask_test_and_clear_cpu(cpu, &rdt_cpumask)) {
> -                mutex_unlock(&rdtgroup_mutex);
> -                return;
> -        }
>
> -        cpumask_and(&tmp_cpumask, topology_core_cpumask(cpu), cpu_online_mask);
> -        cpumask_clear_cpu(cpu, &tmp_cpumask);
> -        i = cpumask_any(&tmp_cpumask);
> +        level = CACHE_LEVEL3;

I have a hard time understanding the value of that 'level' variable, but
well, that's the least of my worries with that code.

> +
> +        l3_domain = per_cpu(cpu_l3_domain, cpu);
> +        leaf = level_to_leaf(level);
> +        shared_cpu_map = &cache_domains[leaf].shared_cpu_map[l3_domain];
>
> -        if (i < nr_cpu_ids)
> -                cpumask_set_cpu(i, &rdt_cpumask);
> +        cpumask_clear_cpu(cpu, &rdt_l3_cpumask);
> +        cpumask_clear_cpu(cpu, shared_cpu_map);
> +        if (cpumask_empty(shared_cpu_map))
> +                goto out;

So what clears @cpu in the rdtgroup cpumask?

> +
> +        new_cpu = cpumask_first(shared_cpu_map);
> +        rdt_cpumask_update(&rdt_l3_cpumask, new_cpu, level);
>
>          clear_rdtgroup_cpumask(cpu);

Can you please use a consistent prefix and naming scheme?

        rdt_cpumask_update()
        clear_rdtgroup_cpumask()

WTF?

> +out:
>          mutex_unlock(&rdtgroup_mutex);
> +        return 0;
> +}
> +
> +/*
> + * Initialize per-cpu cpu_l3_domain.
> + *
> + * cpu_l3_domain numbers are consequtive integer starting from 0.
> + * Sets up 1:1 mapping of cpu id and cpu_l3_domain.
> + */
> +static int __init cpu_cache_domain_init(int level)
> +{
> +        int i, j;
> +        int max_cpu_cache_domain = 0;
> +        int index;
> +        struct cacheinfo *leaf;
> +        int *domain;
> +        struct cpu_cacheinfo *cpu_ci;

Eyes hurt.

> +
> +        for_each_online_cpu(i) {
> +                domain = &per_cpu(cpu_l3_domain, i);

per_cpu_ptr() exists for a reason.
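I.e.:

        domain = per_cpu_ptr(&cpu_l3_domain, i);

instead of taking the address of the per_cpu() lvalue.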
> +                if (*domain == -1) {
> +                        index = get_cache_leaf(level, i);
> +                        if (index < 0)
> +                                return -EINVAL;
> +
> +                        cpu_ci = get_cpu_cacheinfo(i);
> +                        leaf = cpu_ci->info_list + index;
> +                        if (cpumask_empty(&leaf->shared_cpu_map)) {
> +                                WARN(1, "no shared cpu for L2\n");

So L2 is always the right thing for every value of @level? And what the
heck is the value of that WARN? Nothing, because the callchain is already
known. What's worse is that you don't tell about @level, @index, @i (which
should be named @cpu).

> +                                return -EINVAL;
> +                        }
> +
> +                        for_each_cpu(j, &leaf->shared_cpu_map) {

So again, independent of @level you fiddle with cpu_l3_domain. Interesting.

> +                                domain = &per_cpu(cpu_l3_domain, j);
> +                                *domain = max_cpu_cache_domain;
> +                        }
> +                        max_cpu_cache_domain++;

What's the actual meaning of max_cpu_cache_domain?

> +                }
> +        }
> +
> +        return 0;
> +}
> +
> +static int __init rdt_setup(char *str)
> +{
> +        char *tok;
> +
> +        while ((tok = strsep(&str, ",")) != NULL) {
> +                if (!*tok)
> +                        return -EINVAL;
> +
> +                if (strcmp(tok, "simulate_cat_l3") == 0) {
> +                        pr_info("Simulate CAT L3\n");
> +                        rdt_opts.simulate_cat_l3 = true;

So this goes into rdt_opts

> +                } else if (strcmp(tok, "disable_cat_l3") == 0) {
> +                        pr_info("CAT L3 is disabled\n");
> +                        disable_cat_l3 = true;

While this is a distinct control variable. Very consistent.

> +                } else {
> +                        pr_info("Invalid rdt option\n");

Very helpful w/o printing the actual option ....

> +                        return -EINVAL;
> +                }
> +        }
> +
> +        return 0;
> +}
> +__setup("resctrl=", rdt_setup);
> +
> +static inline bool resource_alloc_enabled(void)
> +{
> +        return cat_l3_enabled;
> +}

Oh well.

> +
> +static int shared_domain_init(void)
> +{
> +        int l3_domain_num = get_domain_num(CACHE_LEVEL3);
> +        int size;
> +        int domain;
> +        struct cpumask *cpumask;
> +        struct cpumask *shared_cpu_map;
> +        int cpu;

More random variable declarations.

> +        if (cat_l3_enabled) {
> +                shared_domain_num = l3_domain_num;
> +                cpumask = &rdt_l3_cpumask;
> +        } else
> +                return -EINVAL;

Missing curly braces.

> +
> +        size = shared_domain_num * sizeof(struct shared_domain);
> +        shared_domain = kzalloc(size, GFP_KERNEL);
> +        if (!shared_domain)
> +                return -EINVAL;
> +
> +        domain = 0;
> +        for_each_cpu(cpu, cpumask) {
> +                if (cat_l3_enabled)
> +                        shared_domain[domain].l3_domain =
> +                                per_cpu(cpu_l3_domain, cpu);
> +                else
> +                        shared_domain[domain].l3_domain = -1;
> +
> +                shared_cpu_map = get_shared_cpu_map(cpu, CACHE_LEVEL3);
> +
> +                cpumask_copy(&shared_domain[domain].cpumask, shared_cpu_map);

What's the point of updating the cpumask when the thing is disabled? If
there is a reason then this should be documented in a comment.

> +                domain++;
> +        }
> +        for_each_online_cpu(cpu) {
> +                if (cat_l3_enabled)
> +                        per_cpu(cpu_shared_domain, cpu) =
> +                                per_cpu(cpu_l3_domain, cpu);

More missing curly braces. And using an intermediate variable would remove
these hard-to-read line breaks.

> +        }
> +
> +        return 0;
> +}
> +
> +static int cconfig_init(int maxid)
> +{
> +        int num;
> +        int domain;
> +        unsigned long *closmap_block;
> +        int maxid_size;
> +
> +        maxid_size = BITS_TO_LONGS(maxid);
> +        num = maxid_size * shared_domain_num;
> +        cconfig.closmap = kcalloc(maxid, sizeof(unsigned long *), GFP_KERNEL);

Really intuitive. You calculate num right before allocating the closmap
pointers and then you use it in the next alloc.

> +        if (!cconfig.closmap)
> +                goto out_free;
> +
> +        closmap_block = kcalloc(num, sizeof(unsigned long), GFP_KERNEL);
> +        if (!closmap_block)
> +                goto out_free;
> +
> +        for (domain = 0; domain < shared_domain_num; domain++)
> +                cconfig.closmap[domain] = (unsigned long *)closmap_block +

More random type casting.

> +                        domain * maxid_size;

Why don't you allocate that whole mess in one go?

        unsigned int ptrsize, mapsize, size, d;
        void *p;

        ptrsize = maxid * sizeof(unsigned long *);
        mapsize = BITS_TO_LONGS(maxid) * sizeof(unsigned long);
        size = ptrsize + num_shared_domains * mapsize;

        p = kzalloc(size, GFP_KERNEL);
        if (!p)
                return -ENOMEM;

        cconfig.closmap = p;
        cconfig.max_closid = maxid;
        p += ptrsize;
        for (d = 0; d < num_shared_domains; d++, p += mapsize)
                cconfig.closmap[d] = p;
        return 0;

Would be too simple. Once more.

> +
> +        cconfig.max_closid = maxid;
> +
> +        return 0;
> +out_free:
> +        kfree(cconfig.closmap);
> +        kfree(closmap_block);
> +        return -ENOMEM;
> +}
> +
> +static int __init cat_cache_init(int level, int maxid,
> +                                 struct clos_cbm_table ***cctable)
> +{
> +        int domain_num;
> +        int domain;
> +        int size;
> +        int ret = 0;
> +        struct clos_cbm_table *p;
> +
> +        domain_num = get_domain_num(level);
> +        size = domain_num * sizeof(struct clos_cbm_table *);
> +        *cctable = kzalloc(size, GFP_KERNEL);
> +        if (!*cctable) {
> +                ret = -ENOMEM;
> +                goto out;
> +        }
> +
> +        size = maxid * domain_num * sizeof(struct clos_cbm_table);
> +        p = kzalloc(size, GFP_KERNEL);
> +        if (!p) {
> +                kfree(*cctable);
> +                ret = -ENOMEM;
> +                goto out;
> +        }
> +        for (domain = 0; domain < domain_num; domain++)
> +                (*cctable)[domain] = p + domain * maxid;

Same crap.

> +
> +        ret = cpu_cache_domain_init(level);
> +        if (ret) {
> +                kfree(*cctable);
> +                kfree(p);
> +        }
> +out:
> +        return ret;
>  }
>
>  static int __init intel_rdt_late_init(void)
>  {
>          struct cpuinfo_x86 *c = &boot_cpu_data;
>          u32 maxid;
> -        int err = 0, size, i;
> -
> -        maxid = c->x86_cache_max_closid;
> -
> -        size = maxid * sizeof(struct clos_cbm_table);
> -        cctable = kzalloc(size, GFP_KERNEL);
> -        if (!cctable) {
> -                err = -ENOMEM;
> -                goto out_err;
> +        int i;
> +        int ret;
> +
> +        if (unlikely(disable_cat_l3))

This unlikely() is completely pointless. This is not a hotpath function. It
just makes the code harder to read.

> +                cat_l3_enabled = false;
> +        else if (cat_l3_supported(c))
> +                cat_l3_enabled = true;
> +        else if (rdt_opts.simulate_cat_l3 &&
> +                 get_cache_leaf(CACHE_LEVEL3, 0) >= 0)
> +                cat_l3_enabled = true;
> +        else
> +                cat_l3_enabled = false;

Please move that into resource_alloc_enabled() and make it readable.
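Sketch, assuming this patch handles nothing but L3 CAT:

static bool __init resource_alloc_enabled(struct cpuinfo_x86 *c)
{
        if (disable_cat_l3)
                cat_l3_enabled = false;
        else if (cat_l3_supported(c))
                cat_l3_enabled = true;
        else
                cat_l3_enabled = rdt_opts.simulate_cat_l3 &&
                                 get_cache_leaf(CACHE_LEVEL3, 0) >= 0;

        return cat_l3_enabled;
}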
> +        if (!resource_alloc_enabled())
> +                return -ENODEV;
> +
> +        if (rdt_opts.simulate_cat_l3) {
> +                boot_cpu_data.x86_l3_max_closid = 16;
> +                boot_cpu_data.x86_l3_max_cbm_len = 20;
> +        }
> +        for_each_online_cpu(i) {
> +                rdt_cpumask_update(&rdt_l3_cpumask, i, CACHE_LEVEL3);
>          }
>
> -        size = BITS_TO_LONGS(maxid) * sizeof(long);
> -        cconfig.closmap = kzalloc(size, GFP_KERNEL);
> -        if (!cconfig.closmap) {
> -                kfree(cctable);
> -                err = -ENOMEM;
> -                goto out_err;
> +        maxid = 0;
> +        if (cat_l3_enabled) {
> +                maxid = boot_cpu_data.x86_l3_max_closid;
> +                ret = cat_cache_init(CACHE_LEVEL3, maxid, &l3_cctable);
> +                if (ret)
> +                        cat_l3_enabled = false;
>          }
>
> -        for_each_online_cpu(i)
> -                rdt_cpumask_update(i);
> +        if (!cat_l3_enabled)
> +                return -ENOSPC;

Huh? How do you get here when cat_l3_enabled is false?

> +
> +        ret = shared_domain_init();
> +        if (ret)
> +                return -ENODEV;

Leaks the cctable allocations from cat_cache_init().

> +
> +        ret = cconfig_init(maxid);
> +        if (ret)
> +                return ret;

Leaks more stuff.

>          ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
>                                          "AP_INTEL_RDT_ONLINE",
>                                          intel_rdt_online_cpu, intel_rdt_offline_cpu);

I still have not figured out how that whole init scheme works such that you
can use nocalls() for the hotplug registration.

> -        if (err < 0)
> -                goto out_err;
> +        if (ret < 0)
> +                return ret;

And this as well.

Thanks,

        tglx