On Thu, 8 Sep 2016, Fenghua Yu wrote:

> +#define __DCBM_TABLE_INDEX(x)        (x << 1)
> +#define __ICBM_TABLE_INDEX(x)        ((x << 1) + 1)

This macro mess is completely undocumented.
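If these indices are what I think they are - the data/code mask pairs in
the IA32_L3_QOS_MASK MSR layout with CDP enabled - then a comment along
these lines (my guess, untested sketch) would spare everyone a trip to the
SDM:

        /*
         * With Code/Data Prioritization (CDP) enabled, each CLOSID owns a
         * pair of entries in the CBM table: the data bitmask at the even
         * index (closid * 2) and the code bitmask at the odd index
         * (closid * 2 + 1).
         */
        #define __DCBM_TABLE_INDEX(x)        ((x) << 1)
        #define __ICBM_TABLE_INDEX(x)        (((x) << 1) + 1)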
> +inline int get_dcbm_table_index(int x)

static inline ???

> +{
> +        return DCBM_TABLE_INDEX(x);
> +}
> +
> +inline int get_icbm_table_index(int x)
> +{
> +        return ICBM_TABLE_INDEX(x);
> +}

Why are you introducing these when they are not used at all?

>  /*
>   * cache_alloc_hsw_probe() - Have to probe for Intel haswell server CPUs
>   * as it does not have CPUID enumeration support for Cache allocation.
> @@ -98,41 +123,141 @@ static inline bool cache_alloc_hsw_probe(void)
>
>          wrmsr_safe(MSR_IA32_PQR_ASSOC, l, h_old);
>
> -        boot_cpu_data.x86_cache_max_closid = 4;
> -        boot_cpu_data.x86_cache_max_cbm_len = 20;
> +        boot_cpu_data.x86_l3_max_closid = 4;
> +        boot_cpu_data.x86_l3_max_cbm_len = 20;

So if you actually change the name of the cpudata struct member, then this
would make sense to be split out into a separate patch. But hell, the patch
order of this stuff is an unholy mess anyway.

>          min_bitmask_len = 2;
>
>          return true;
>  }
>
> +u32 max_cbm_len(int level)
> +{
> +        switch (level) {
> +        case CACHE_LEVEL3:
> +                return boot_cpu_data.x86_l3_max_cbm_len;
> +        default:
> +                break;
> +        }
> +
> +        return (u32)~0;
> +}
> +u64 max_cbm(int level)

More functions without users and of course without a proper prefix for
public consumption. Documentation is not available either.

> +{
> +        switch (level) {
> +        case CACHE_LEVEL3:
> +                return (1ULL << boot_cpu_data.x86_l3_max_cbm_len) - 1;

So the above max_cbm_len() returns:

        cpuid_count(0x00000010, 1, &eax, &ebx, &ecx, &edx);
        c->x86_l3_max_cbm_len = eax + 1;

i.e. the content of leaf 10:1 EAX plus 1. According to the SDM:

        EAX 4:0: Length of the capacity bit mask for the corresponding ResID.

So first of all, why is there a "+ 1"? Again, that's not documented in the
cpuid initialization code at all.

So now you take that magic number and return:

        (2 ^ magic) - 1

Cute. To be honest I'm too lazy to read through the SDM and figure it out
myself. This kind of stuff belongs into the code as comments. I seriously
doubt that the function names match the actual meaning.
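If the "+ 1" is there because EAX 4:0 reports the length in minus-one
notation - which is what I suspect - then say so where the value is read.
Untested sketch:

        cpuid_count(0x00000010, 1, &eax, &ebx, &ecx, &edx);
        /*
         * CPUID.(EAX=10H, ECX=1):EAX[4:0] reports the length of the
         * capacity bitmask in minus-one notation, hence the "+ 1".
         */
        c->x86_l3_max_cbm_len = (eax & 0x1f) + 1;

Then max_cbm() returning ((1 << cbm_len) - 1), i.e. a mask with cbm_len
bits set, at least has a chance of being understood.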
> +        default:
> +                break;
> +        }
> +
> +        return (u64)~0;

What kind of return code is this?

> +static inline bool cat_l3_supported(struct cpuinfo_x86 *c)
> +{
> +        if (cpu_has(c, X86_FEATURE_CAT_L3))
> +                return true;
> +
> +        /*
> +         * Probe for Haswell server CPUs.
> +         */
> +        if (c->x86 == 0x6 && c->x86_model == 0x3f)
> +                return cache_alloc_hsw_probe();

Ah, now we have an actual user for that haswell probe thingy ....

> +        return false;
> +}
> +
>  void __intel_rdt_sched_in(void *dummy)

I still have not found a reasonable explanation for this dummy argument.

>  {
>          struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
> +        struct rdtgroup *rdtgrp;
> +        int closid;
> +        int domain;

Sigh.

>          /*
> -         * Currently closid is always 0. When user interface is added,
> -         * closid will come from user interface.
> +         * If this task is assigned to an rdtgroup, use it.
> +         * Else use the group assigned to this cpu.
>           */
> -        if (state->closid == 0)
> +        rdtgrp = current->rdtgroup;
> +        if (!rdtgrp)
> +                rdtgrp = this_cpu_read(cpu_rdtgroup);

This actually makes sense! Thanks for listening!

> +
> +        domain = this_cpu_read(cpu_shared_domain);
> +        closid = rdtgrp->resource.closid[domain];
> +
> +        if (closid == state->closid)
>                  return;
>
> -        wrmsr(MSR_IA32_PQR_ASSOC, state->rmid, 0);
> -        state->closid = 0;
> +        state->closid = closid;
> +        /* Don't really write PQR register in simulation mode. */
> +        if (unlikely(rdt_opts.simulate_cat_l3))
> +                return;
> +
> +        wrmsr(MSR_IA32_PQR_ASSOC, state->rmid, closid);
>  }
>
>  /*
>   * When cdp mode is enabled, refcnt is maintained in the dcache_cbm entry.
>   */
> -static inline void closid_get(u32 closid)
> +inline void closid_get(u32 closid, int domain)

s/static inline/inline/

What the heck? Can you please explain what this is doing?

>  {
> -        struct clos_cbm_table *cct = &cctable[DCBM_TABLE_INDEX(closid)];
> -
>          lockdep_assert_held(&rdtgroup_mutex);
>
> -        cct->clos_refcnt++;
> +        if (cat_l3_enabled) {
> +                int l3_domain;
> +                int dindex;

        int l3_domain, dindex;

> +                l3_domain = shared_domain[domain].l3_domain;
> +                dindex = DCBM_TABLE_INDEX(closid);
> +                l3_cctable[l3_domain][dindex].clos_refcnt++;
> +                if (cdp_enabled) {
> +                        int iindex = ICBM_TABLE_INDEX(closid);

And if you call that variable 'index' instead of 'dindex' then you don't
need an extra one 'iindex'.

> +
> +                        l3_cctable[l3_domain][iindex].clos_refcnt++;

So now you have a separate refcount for the Icache part, but the comment
above the function still says otherwise.

> +                }
> +        }
>  }
>
> -static int closid_alloc(u32 *closid)
> +int closid_alloc(u32 *closid, int domain)
>  {
>          u32 maxid;
>          u32 id;
> @@ -140,105 +265,215 @@ static int closid_alloc(u32 *closid)
>          lockdep_assert_held(&rdtgroup_mutex);
>
>          maxid = cconfig.max_closid;
> -        id = find_first_zero_bit(cconfig.closmap, maxid);
> +        id = find_first_zero_bit((unsigned long *)cconfig.closmap[domain],

Why do you need a typecast here? Get your damned structs straight.

> +                                 maxid);
> +
>          if (id == maxid)
>                  return -ENOSPC;
>
> -        set_bit(id, cconfig.closmap);
> -        closid_get(id);
> +        set_bit(id, (unsigned long *)cconfig.closmap[domain]);
> +        closid_get(id, domain);
>          *closid = id;
> -        cconfig.closids_used++;
>
>          return 0;
>  }
>
> -static inline void closid_free(u32 closid)
> +unsigned int get_domain_num(int level)
>  {
> -        clear_bit(closid, cconfig.closmap);
> -        cctable[DCBM_TABLE_INDEX(closid)].cbm = 0;
> -
> -        if (WARN_ON(!cconfig.closids_used))
> -                return;
> +        if (level == CACHE_LEVEL3)
> +                return cpumask_weight(&rdt_l3_cpumask);

get_domain_num(level) suggests to me that it returns the domain number
corresponding to the level, but it actually returns the number of bits set
in the rdt_l3_cpumask. Very intuitive - NOT!

> +        else
> +                return -EINVAL;

Proper return value for a function returning 'unsigned int' ....

> +}
>
> -        cconfig.closids_used--;
> +int level_to_leaf(int level)
> +{
> +        switch (level) {
> +        case CACHE_LEVEL3:
> +                return 3;
> +        default:
> +                return -EINVAL;
> +        }
>  }
>
> -static void closid_put(u32 closid)
> +void closid_free(u32 closid, int domain, int level)
>  {
> -        struct clos_cbm_table *cct = &cctable[DCBM_TABLE_INDEX(closid)];
> +        struct clos_cbm_table **cctable;
> +        int leaf;
> +        struct cpumask *mask;
> +        int cpu;
> +
> +        if (level == CACHE_LEVEL3)
> +                cctable = l3_cctable;

Oh well. Why does this assignment need to happen here?

> +
> +        clear_bit(closid, (unsigned long *)cconfig.closmap[domain]);
> +
> +        if (level == CACHE_LEVEL3) {

And not here where it actually makes sense?

> +                cctable[domain][closid].cbm = max_cbm(level);
> +                leaf = level_to_leaf(level);
> +                mask = &cache_domains[leaf].shared_cpu_map[domain];
> +                cpu = cpumask_first(mask);
> +                smp_call_function_single(cpu, cbm_update_l3_msr, &closid, 1);

A comment explaining that this must be done on one of the cpus in @domain
would be too helpful.
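Untested sketch, reusing your own variables:

        /*
         * The CBM MSRs are shared by all CPUs in a cache domain, so
         * writing the default mask on any one CPU of @domain updates
         * it for the whole domain.
         */
        cpu = cpumask_first(mask);
        smp_call_function_single(cpu, cbm_update_l3_msr, &closid, 1);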
> +        }
> +}
>
> +static void _closid_put(u32 closid, struct clos_cbm_table *cct,

Please use two underscores so it's obvious.

> +                        int domain, int level)
> +{
>          lockdep_assert_held(&rdtgroup_mutex);
>          if (WARN_ON(!cct->clos_refcnt))
>                  return;
>
>          if (!--cct->clos_refcnt)
> -                closid_free(closid);
> +                closid_free(closid, domain, level);
>  }
>
> -static void msr_cpu_update(void *arg)
> +void closid_put(u32 closid, int domain)
> +{
> +        struct clos_cbm_table *cct;
> +
> +        if (cat_l3_enabled) {
> +                int l3_domain = shared_domain[domain].l3_domain;
> +
> +                cct = &l3_cctable[l3_domain][DCBM_TABLE_INDEX(closid)];
> +                _closid_put(closid, cct, l3_domain, CACHE_LEVEL3);
> +                if (cdp_enabled) {
> +                        cct = &l3_cctable[l3_domain][ICBM_TABLE_INDEX(closid)];
> +                        _closid_put(closid, cct, l3_domain, CACHE_LEVEL3);
> +                }
> +        }
> +}
> +
> +void msr_cpu_update(void *arg)
>  {
>          struct rdt_remote_data *info = arg;
>
> +        if (unlikely(rdt_opts.verbose))
> +                pr_info("Write %lx to msr %x on cpu%d\n",
> +                        (unsigned long)info->val, info->msr,
> +                        smp_processor_id());

That's what DYNAMIC_DEBUG is for.
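I.e. kill the verbose flag and use pr_debug(), which can be flipped on at
runtime. Sketch, assuming info->val stays u64:

        pr_debug("Write %llx to msr %x on cpu%d\n",
                 info->val, info->msr, smp_processor_id());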
> +
> +        if (unlikely(rdt_opts.simulate_cat_l3))
> +                return;

Do we really need all this simulation stuff?

> +
>          wrmsrl(info->msr, info->val);
>  }
>
> +static struct cpumask *rdt_cache_cpumask(int level)
> +{
> +        return &rdt_l3_cpumask;

That's a really useful helper .... Your choice of checking 'level' five
times in a row in various helpers versus returning l3 unconditionally is at
least interesting.

> +int get_cache_leaf(int level, int cpu)
> +{
> +        int index;
> +        struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);

this_cpu_ci is a complete misnomer as it suggests that it's actually the
cacheinfo for 'this cpu', i.e. the cpu on which the code is executing.

> +        struct cacheinfo *this_leaf;
> +        int num_leaves = this_cpu_ci->num_leaves;
> +
> +        for (index = 0; index < num_leaves; index++) {
> +                this_leaf = this_cpu_ci->info_list + index;
> +                if (this_leaf->level == level)
> +                        return index;

The function is misnamed as well. It does not return the cache leaf, it
returns the leaf index .....

> +        }

Why do you have a cacheinfo related function in this RDT code?

> +
> +        return -EINVAL;
> +}
> +
> +static struct cpumask *get_shared_cpu_map(int cpu, int level)
> +{
> +        int index;
> +        struct cacheinfo *leaf;
> +        struct cpu_cacheinfo *cpu_ci = get_cpu_cacheinfo(cpu);
> +
> +        index = get_cache_leaf(level, cpu);
> +        if (index < 0)
> +                return 0;
> +
> +        leaf = cpu_ci->info_list + index;

While here you actually get the leaf.

> +
> +        return &leaf->shared_cpu_map;
>  }

>  static int clear_rdtgroup_cpumask(unsigned int cpu)
> @@ -293,63 +531,270 @@ static int clear_rdtgroup_cpumask(unsigned int cpu)
>
>  static int intel_rdt_offline_cpu(unsigned int cpu)
>  {
> -        int i;
> +        struct cpumask *shared_cpu_map;
> +        int new_cpu;
> +        int l3_domain;
> +        int level;
> +        int leaf;

Sigh. 1/3 of the line space is wasted for single variable declarations.

>          mutex_lock(&rdtgroup_mutex);
> -        if (!cpumask_test_and_clear_cpu(cpu, &rdt_cpumask)) {
> -                mutex_unlock(&rdtgroup_mutex);
> -                return;
> -        }
>
> -        cpumask_and(&tmp_cpumask, topology_core_cpumask(cpu), cpu_online_mask);
> -        cpumask_clear_cpu(cpu, &tmp_cpumask);
> -        i = cpumask_any(&tmp_cpumask);
> +        level = CACHE_LEVEL3;

I have a hard time understanding the value of that 'level' variable, but
well, that's the least of my worries with that code.

> +
> +        l3_domain = per_cpu(cpu_l3_domain, cpu);
> +        leaf = level_to_leaf(level);
> +        shared_cpu_map = &cache_domains[leaf].shared_cpu_map[l3_domain];
>
> -        if (i < nr_cpu_ids)
> -                cpumask_set_cpu(i, &rdt_cpumask);
> +        cpumask_clear_cpu(cpu, &rdt_l3_cpumask);
> +        cpumask_clear_cpu(cpu, shared_cpu_map);
> +        if (cpumask_empty(shared_cpu_map))
> +                goto out;

So what clears @cpu in the rdtgroup cpumask?

> +
> +        new_cpu = cpumask_first(shared_cpu_map);
> +        rdt_cpumask_update(&rdt_l3_cpumask, new_cpu, level);
>
>          clear_rdtgroup_cpumask(cpu);

Can you please use a consistent prefix and naming scheme?

        rdt_cpumask_update()
        clear_rdtgroup_cpumask()

WTF?

> +out:
>          mutex_unlock(&rdtgroup_mutex);
> +        return 0;
> +}
> +
> +/*
> + * Initialize per-cpu cpu_l3_domain.
> + *
> + * cpu_l3_domain numbers are consequtive integer starting from 0.
> + * Sets up 1:1 mapping of cpu id and cpu_l3_domain.
> + */
> +static int __init cpu_cache_domain_init(int level)
> +{
> +        int i, j;
> +        int max_cpu_cache_domain = 0;
> +        int index;
> +        struct cacheinfo *leaf;
> +        int *domain;
> +        struct cpu_cacheinfo *cpu_ci;

Eyes hurt.

> +
> +        for_each_online_cpu(i) {
> +                domain = &per_cpu(cpu_l3_domain, i);

per_cpu_ptr() exists for a reason.
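I.e.:

        domain = per_cpu_ptr(&cpu_l3_domain, i);

instead of taking the address of the per_cpu() lvalue.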
> +                if (*domain == -1) {
> +                        index = get_cache_leaf(level, i);
> +                        if (index < 0)
> +                                return -EINVAL;
> +
> +                        cpu_ci = get_cpu_cacheinfo(i);
> +                        leaf = cpu_ci->info_list + index;
> +                        if (cpumask_empty(&leaf->shared_cpu_map)) {
> +                                WARN(1, "no shared cpu for L2\n");

So L2 is always the right thing for every value of @level? And what the
heck is the value of that WARN? Nothing, because the callchain is already
known. What's worse is that you don't tell about @level, @index, @i (which
should be named @cpu).

> +                                return -EINVAL;
> +                        }
> +
> +                        for_each_cpu(j, &leaf->shared_cpu_map) {

So again, independent of @level you fiddle with cpu_l3_domain. Interesting.

> +                                domain = &per_cpu(cpu_l3_domain, j);
> +                                *domain = max_cpu_cache_domain;
> +                        }
> +                        max_cpu_cache_domain++;

What's the actual meaning of max_cpu_cache_domain?

> +                }
> +        }
> +
> +        return 0;
> +}
> +
> +static int __init rdt_setup(char *str)
> +{
> +        char *tok;
> +
> +        while ((tok = strsep(&str, ",")) != NULL) {
> +                if (!*tok)
> +                        return -EINVAL;
> +
> +                if (strcmp(tok, "simulate_cat_l3") == 0) {
> +                        pr_info("Simulate CAT L3\n");
> +                        rdt_opts.simulate_cat_l3 = true;

So this goes into rdt_opts

> +                } else if (strcmp(tok, "disable_cat_l3") == 0) {
> +                        pr_info("CAT L3 is disabled\n");
> +                        disable_cat_l3 = true;

While this is a distinct control variable. Very consistent.

> +                } else {
> +                        pr_info("Invalid rdt option\n");

Very helpful w/o printing the actual option ....

> +                        return -EINVAL;
> +                }
> +        }
> +
> +        return 0;
> +}
> +__setup("resctrl=", rdt_setup);
> +
> +static inline bool resource_alloc_enabled(void)
> +{
> +        return cat_l3_enabled;
> +}

Oh well.

> +
> +static int shared_domain_init(void)
> +{
> +        int l3_domain_num = get_domain_num(CACHE_LEVEL3);
> +        int size;
> +        int domain;
> +        struct cpumask *cpumask;
> +        struct cpumask *shared_cpu_map;
> +        int cpu;

More random variable declarations.

> +        if (cat_l3_enabled) {
> +                shared_domain_num = l3_domain_num;
> +                cpumask = &rdt_l3_cpumask;
> +        } else
> +                return -EINVAL;

Missing curly braces.

> +
> +        size = shared_domain_num * sizeof(struct shared_domain);
> +        shared_domain = kzalloc(size, GFP_KERNEL);
> +        if (!shared_domain)
> +                return -EINVAL;
> +
> +        domain = 0;
> +        for_each_cpu(cpu, cpumask) {
> +                if (cat_l3_enabled)
> +                        shared_domain[domain].l3_domain =
> +                                per_cpu(cpu_l3_domain, cpu);
> +                else
> +                        shared_domain[domain].l3_domain = -1;
> +
> +                shared_cpu_map = get_shared_cpu_map(cpu, CACHE_LEVEL3);
> +
> +                cpumask_copy(&shared_domain[domain].cpumask, shared_cpu_map);

What's the point of updating the cpumask when the thing is disabled? If
there is a reason then this should be documented in a comment.

> +                domain++;
> +        }
> +        for_each_online_cpu(cpu) {
> +                if (cat_l3_enabled)
> +                        per_cpu(cpu_shared_domain, cpu) =
> +                                per_cpu(cpu_l3_domain, cpu);

More missing curly braces. And using an intermediate variable would remove
these hard-to-read line breaks.

> +        }
> +
> +        return 0;
> +}
> +
> +static int cconfig_init(int maxid)
> +{
> +        int num;
> +        int domain;
> +        unsigned long *closmap_block;
> +        int maxid_size;
> +
> +        maxid_size = BITS_TO_LONGS(maxid);
> +        num = maxid_size * shared_domain_num;
> +        cconfig.closmap = kcalloc(maxid, sizeof(unsigned long *), GFP_KERNEL);

Really intuitive. You calculate num right before allocating the closmap
pointers and then you use it in the next alloc.

> +        if (!cconfig.closmap)
> +                goto out_free;
> +
> +        closmap_block = kcalloc(num, sizeof(unsigned long), GFP_KERNEL);
> +        if (!closmap_block)
> +                goto out_free;
> +
> +        for (domain = 0; domain < shared_domain_num; domain++)
> +                cconfig.closmap[domain] = (unsigned long *)closmap_block +

More random type casting.

> +                        domain * maxid_size;

Why don't you allocate that whole mess in one go?

        unsigned int ptrsize, mapsize, size, d;
        void *p;

        ptrsize = maxid * sizeof(unsigned long *);
        mapsize = BITS_TO_LONGS(maxid) * sizeof(unsigned long);
        size = ptrsize + num_shared_domains * mapsize;

        p = kzalloc(size, GFP_KERNEL);
        if (!p)
                return -ENOMEM;

        cconfig.closmap = p;
        cconfig.max_closid = maxid;
        p += ptrsize;
        for (d = 0; d < num_shared_domains; d++, p += mapsize)
                cconfig.closmap[d] = p;
        return 0;

Would be too simple. Once more.

> +
> +        cconfig.max_closid = maxid;
> +
> +        return 0;
> +out_free:
> +        kfree(cconfig.closmap);
> +        kfree(closmap_block);
> +        return -ENOMEM;
> +}
> +
> +static int __init cat_cache_init(int level, int maxid,
> +                                 struct clos_cbm_table ***cctable)
> +{
> +        int domain_num;
> +        int domain;
> +        int size;
> +        int ret = 0;
> +        struct clos_cbm_table *p;
> +
> +        domain_num = get_domain_num(level);
> +        size = domain_num * sizeof(struct clos_cbm_table *);
> +        *cctable = kzalloc(size, GFP_KERNEL);
> +        if (!*cctable) {
> +                ret = -ENOMEM;
> +                goto out;
> +        }
> +
> +        size = maxid * domain_num * sizeof(struct clos_cbm_table);
> +        p = kzalloc(size, GFP_KERNEL);
> +        if (!p) {
> +                kfree(*cctable);
> +                ret = -ENOMEM;
> +                goto out;
> +        }
> +        for (domain = 0; domain < domain_num; domain++)
> +                (*cctable)[domain] = p + domain * maxid;

Same crap.

> +
> +        ret = cpu_cache_domain_init(level);
> +        if (ret) {
> +                kfree(*cctable);
> +                kfree(p);
> +        }
> +out:
> +        return ret;
>  }
>
>  static int __init intel_rdt_late_init(void)
>  {
>          struct cpuinfo_x86 *c = &boot_cpu_data;
>          u32 maxid;
> -        int err = 0, size, i;
> -
> -        maxid = c->x86_cache_max_closid;
> -
> -        size = maxid * sizeof(struct clos_cbm_table);
> -        cctable = kzalloc(size, GFP_KERNEL);
> -        if (!cctable) {
> -                err = -ENOMEM;
> -                goto out_err;
> +        int i;
> +        int ret;
> +
> +        if (unlikely(disable_cat_l3))

This unlikely() is completely pointless. This is not a hotpath function. It
just makes the code harder to read.

> +                cat_l3_enabled = false;
> +        else if (cat_l3_supported(c))
> +                cat_l3_enabled = true;
> +        else if (rdt_opts.simulate_cat_l3 &&
> +                 get_cache_leaf(CACHE_LEVEL3, 0) >= 0)
> +                cat_l3_enabled = true;
> +        else
> +                cat_l3_enabled = false;

Please move that into resource_alloc_enabled() and make it readable.
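Sketch, assuming this patch handles nothing but L3 CAT:

static bool __init resource_alloc_enabled(struct cpuinfo_x86 *c)
{
        if (disable_cat_l3)
                cat_l3_enabled = false;
        else if (cat_l3_supported(c))
                cat_l3_enabled = true;
        else
                cat_l3_enabled = rdt_opts.simulate_cat_l3 &&
                                 get_cache_leaf(CACHE_LEVEL3, 0) >= 0;

        return cat_l3_enabled;
}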
> +        if (!resource_alloc_enabled())
> +                return -ENODEV;
> +
> +        if (rdt_opts.simulate_cat_l3) {
> +                boot_cpu_data.x86_l3_max_closid = 16;
> +                boot_cpu_data.x86_l3_max_cbm_len = 20;
> +        }
> +        for_each_online_cpu(i) {
> +                rdt_cpumask_update(&rdt_l3_cpumask, i, CACHE_LEVEL3);
>          }
>
> -        size = BITS_TO_LONGS(maxid) * sizeof(long);
> -        cconfig.closmap = kzalloc(size, GFP_KERNEL);
> -        if (!cconfig.closmap) {
> -                kfree(cctable);
> -                err = -ENOMEM;
> -                goto out_err;
> +        maxid = 0;
> +        if (cat_l3_enabled) {
> +                maxid = boot_cpu_data.x86_l3_max_closid;
> +                ret = cat_cache_init(CACHE_LEVEL3, maxid, &l3_cctable);
> +                if (ret)
> +                        cat_l3_enabled = false;
>          }
>
> -        for_each_online_cpu(i)
> -                rdt_cpumask_update(i);
> +        if (!cat_l3_enabled)
> +                return -ENOSPC;

Huh? How do you get here when cat_l3_enabled is false?

> +
> +        ret = shared_domain_init();
> +        if (ret)
> +                return -ENODEV;

Leaks the cctable allocations from cat_cache_init().

> +
> +        ret = cconfig_init(maxid);
> +        if (ret)
> +                return ret;

Leaks more stuff.

>          ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
>                                          "AP_INTEL_RDT_ONLINE",
>                                          intel_rdt_online_cpu, intel_rdt_offline_cpu);

I still have not figured out how that whole init scheme works such that you
can use nocalls() for the hotplug registration.

> -        if (err < 0)
> -                goto out_err;
> +        if (ret < 0)
> +                return ret;

And this as well.

Thanks,

        tglx