To avoid IPIs from IRQ-disabled contexts, the occupancy for an RMID in a remote package (a package other than the one the current CPU belongs to) is obtained from a cache that is periodically updated. This removes the need for an IPI when reading occupancy for a task event, which was the reason for adding the problematic pmu::count and dummy perf_event_read() in the previous CQM version.
The occupancy of all active prmids is updated every __rmid_timed_update_period ms. To avoid holding raw_spin_locks on the prmid hierarchy for too long, the raw rmids to be read are copied to a temporary array list. The array list is then consumed to perform the wrmsrl and rdmsrl on each RMID required to read its llc_occupancy. This decoupling of traversing the RMID hierarchy from reading occupancy is especially useful due to the high latency of the wrmsrl and rdmsrl for the llc_occupancy event (thousands of cycles on my test machine). To avoid unnecessary memory allocations, the objects used to temporarily store RMIDs are pooled in a per-package list and allocated on demand. The infrastructure introduced in this patch will be used in future patches in this series to perform reads on subtrees of a prmid hierarchy. Reviewed-by: Stephane Eranian <eran...@google.com> Signed-off-by: David Carrillo-Cisneros <davi...@google.com> --- arch/x86/events/intel/cqm.c | 251 +++++++++++++++++++++++++++++++++++++++++++- arch/x86/events/intel/cqm.h | 36 +++++++ 2 files changed, 286 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/intel/cqm.c b/arch/x86/events/intel/cqm.c index 31f0fd6..904f2d3 100644 --- a/arch/x86/events/intel/cqm.c +++ b/arch/x86/events/intel/cqm.c @@ -189,6 +189,8 @@ static inline bool __valid_pkg_id(u16 pkg_id) return pkg_id < PQR_MAX_NR_PKGS; } +static int anode_pool__alloc_one(u16 pkg_id); + /* Init cqm pkg_data for @cpu 's package. 
*/ static int pkg_data_init_cpu(int cpu) { @@ -241,11 +243,19 @@ static int pkg_data_init_cpu(int cpu) mutex_init(&pkg_data->pkg_data_mutex); raw_spin_lock_init(&pkg_data->pkg_data_lock); + INIT_LIST_HEAD(&pkg_data->anode_pool_head); + raw_spin_lock_init(&pkg_data->anode_pool_lock); + INIT_DELAYED_WORK( &pkg_data->rotation_work, intel_cqm_rmid_rotation_work); /* XXX: Chose randomly*/ pkg_data->rotation_cpu = cpu; + INIT_DELAYED_WORK( + &pkg_data->timed_update_work, intel_cqm_timed_update_work); + /* XXX: Chose randomly*/ + pkg_data->timed_update_cpu = cpu; + cqm_pkgs_data[pkg_id] = pkg_data; return 0; } @@ -744,6 +754,189 @@ static void monr_dealloc(struct monr *monr) } /* + * Logic for reading sets of rmids into per-package lists. + * This package lists can be used to update occupancies without + * holding locks in the hierarchies of pmonrs. + * @pool: free pool. + */ +struct astack { + struct list_head pool; + struct list_head items; + int top_idx; + int max_idx; + u16 pkg_id; +}; + +static void astack__init(struct astack *astack, int max_idx, u16 pkg_id) +{ + INIT_LIST_HEAD(&astack->items); + INIT_LIST_HEAD(&astack->pool); + astack->top_idx = -1; + astack->max_idx = max_idx; + astack->pkg_id = pkg_id; +} + +/* Try to enlarge astack->pool with a anode from this pkgs pool. 
*/ +static int astack__try_add_pool(struct astack *astack) +{ + unsigned long flags; + int ret = -1; + struct pkg_data *pkg_data = cqm_pkgs_data[astack->pkg_id]; + + raw_spin_lock_irqsave(&pkg_data->anode_pool_lock, flags); + + if (!list_empty(&pkg_data->anode_pool_head)) { + list_move_tail(pkg_data->anode_pool_head.prev, &astack->pool); + ret = 0; + } + + raw_spin_unlock_irqrestore(&pkg_data->anode_pool_lock, flags); + return ret; +} + +static int astack__push(struct astack *astack) +{ + if (!list_empty(&astack->items) && astack->top_idx < astack->max_idx) { + astack->top_idx++; + return 0; + } + + if (list_empty(&astack->pool) && astack__try_add_pool(astack)) + return -1; + list_move_tail(astack->pool.prev, &astack->items); + astack->top_idx = 0; + return 0; +} + +/* Must be non-empty */ +# define __astack__top(astack_, member_) \ + list_last_entry(&(astack_)->items, \ + struct anode, entry)->member_[(astack_)->top_idx] + +static void astack__clear(struct astack *astack) +{ + list_splice_tail_init(&astack->items, &astack->pool); + astack->top_idx = -1; +} + +/* Put back into pkg_data's pool. 
*/ +static void astack__release(struct astack *astack) +{ + unsigned long flags; + struct pkg_data *pkg_data = cqm_pkgs_data[astack->pkg_id]; + + astack__clear(astack); + raw_spin_lock_irqsave(&pkg_data->anode_pool_lock, flags); + list_splice_tail_init(&astack->pool, &pkg_data->anode_pool_head); + raw_spin_unlock_irqrestore(&pkg_data->anode_pool_lock, flags); +} + +static int anode_pool__alloc_one(u16 pkg_id) +{ + unsigned long flags; + struct anode *anode; + struct pkg_data *pkg_data = cqm_pkgs_data[pkg_id]; + + anode = kmalloc_node(sizeof(struct anode), GFP_KERNEL, + cpu_to_node(pkg_data->rotation_cpu)); + if (!anode) + return -ENOMEM; + raw_spin_lock_irqsave(&pkg_data->anode_pool_lock, flags); + list_add_tail(&anode->entry, &pkg_data->anode_pool_head); + raw_spin_unlock_irqrestore(&pkg_data->anode_pool_lock, flags); + return 0; +} + +static int astack__end(struct astack *astack, struct anode *anode, int idx) +{ + return list_is_last(&anode->entry, &astack->items) && + idx > astack->top_idx; +} + +static int __rmid_fn__cqm_prmid_update(struct prmid *prmid, u64 *val) +{ + int ret = cqm_prmid_update(prmid); + + if (ret >= 0) + *val = atomic64_read(&prmid->last_read_value); + return ret; +} + +/* Apply function to all elements in all nodes. + * On error returns first error in read, zero otherwise. + */ +static int astack__rmids_sum_apply( + struct astack *astack, + u16 pkg_id, int (*fn)(struct prmid *, u64 *), u64 *total) +{ + struct prmid *prmid; + struct anode *anode; + u32 rmid; + int i, ret, first_error = 0; + u64 count; + *total = 0; + + list_for_each_entry(anode, &astack->items, entry) { + for (i = 0; i <= astack->max_idx; i++) { + /* node in tail only has astack->top_idx elements. 
*/ + if (astack__end(astack, anode, i)) + break; + rmid = anode->rmids[i]; + prmid = cqm_pkgs_data[pkg_id]->prmids_by_rmid[rmid]; + WARN_ON_ONCE(!prmid); + ret = fn(prmid, &count); + if (ret < 0) { + if (!first_error) + first_error = ret; + continue; + } + *total += count; + } + } + return first_error; +} + +/* Does not need mutex since protected by locks when transversing + * astate_pmonrs_lru and updating atomic prmids. + */ +static int update_rmids_in_astate_pmonrs_lru(u16 pkg_id) +{ + struct astack astack; + struct pkg_data *pkg_data; + struct pmonr *pmonr; + int ret = 0; + unsigned long flags; + u64 count; + + astack__init(&astack, NR_RMIDS_PER_NODE - 1, pkg_id); + pkg_data = cqm_pkgs_data[pkg_id]; + +retry: + if (ret) { + anode_pool__alloc_one(pkg_id); + ret = 0; + } + raw_spin_lock_irqsave_nested(&pkg_data->pkg_data_lock, flags, pkg_id); + list_for_each_entry(pmonr, + &pkg_data->astate_pmonrs_lru, rotation_entry) { + ret = astack__push(&astack); + if (ret) + break; + __astack__top(&astack, rmids) = pmonr->prmid->rmid; + } + raw_spin_unlock_irqrestore(&pkg_data->pkg_data_lock, flags); + if (ret) { + astack__clear(&astack); + goto retry; + } + /* count is not used. */ + ret = astack__rmids_sum_apply(&astack, pkg_id, + &__rmid_fn__cqm_prmid_update, &count); + astack__release(&astack); + return ret; +} + +/* * Wrappers for monr manipulation in events. * */ @@ -1532,6 +1725,17 @@ exit: mutex_unlock(&pkg_data->pkg_data_mutex); } +static void +__intel_cqm_timed_update(u16 pkg_id) +{ + int ret; + + mutex_lock_nested(&cqm_pkgs_data[pkg_id]->pkg_data_mutex, pkg_id); + ret = update_rmids_in_astate_pmonrs_lru(pkg_id); + mutex_unlock(&cqm_pkgs_data[pkg_id]->pkg_data_mutex); + WARN_ON_ONCE(ret); +} + static struct pmu intel_cqm_pmu; /* Rotation only needs to be run when there is any pmonr in (I)state. 
*/ @@ -1554,6 +1758,22 @@ static bool intel_cqm_need_rotation(u16 pkg_id) return need_rot; } +static bool intel_cqm_need_timed_update(u16 pkg_id) +{ + + struct pkg_data *pkg_data; + bool need_update; + + pkg_data = cqm_pkgs_data[pkg_id]; + + mutex_lock_nested(&pkg_data->pkg_data_mutex, pkg_id); + /* Update is needed if prmids if there is any active prmid. */ + need_update = !list_empty(&pkg_data->active_prmids_pool); + mutex_unlock(&pkg_data->pkg_data_mutex); + + return need_update; +} + /* * Schedule rotation in one package. */ @@ -1568,6 +1788,19 @@ static void __intel_cqm_schedule_rotation_for_pkg(u16 pkg_id) pkg_data->rotation_cpu, &pkg_data->rotation_work, delay); } +static void __intel_cqm_schedule_timed_update_for_pkg(u16 pkg_id) +{ + struct pkg_data *pkg_data; + unsigned long delay; + + delay = msecs_to_jiffies(__rmid_timed_update_period); + pkg_data = cqm_pkgs_data[pkg_id]; + schedule_delayed_work_on( + pkg_data->timed_update_cpu, + &pkg_data->timed_update_work, delay); +} + + /* * Schedule rotation and rmid's timed update in all packages. * Reescheduling will stop when no longer needed. 
@@ -1576,8 +1809,10 @@ static void intel_cqm_schedule_work_all_pkgs(void) { int pkg_id; - cqm_pkg_id_for_each_online(pkg_id) + cqm_pkg_id_for_each_online(pkg_id) { __intel_cqm_schedule_rotation_for_pkg(pkg_id); + __intel_cqm_schedule_timed_update_for_pkg(pkg_id); + } } static void intel_cqm_rmid_rotation_work(struct work_struct *work) @@ -1598,6 +1833,20 @@ static void intel_cqm_rmid_rotation_work(struct work_struct *work) __intel_cqm_schedule_rotation_for_pkg(pkg_id); } +static void intel_cqm_timed_update_work(struct work_struct *work) +{ + struct pkg_data *pkg_data = container_of( + to_delayed_work(work), struct pkg_data, timed_update_work); + u16 pkg_id = topology_physical_package_id(pkg_data->timed_update_cpu); + + WARN_ON_ONCE(pkg_data != cqm_pkgs_data[pkg_id]); + + __intel_cqm_timed_update(pkg_id); + + if (intel_cqm_need_timed_update(pkg_id)) + __intel_cqm_schedule_timed_update_for_pkg(pkg_id); +} + /* * Find a group and setup RMID. * diff --git a/arch/x86/events/intel/cqm.h b/arch/x86/events/intel/cqm.h index b0e1698..25646a2 100644 --- a/arch/x86/events/intel/cqm.h +++ b/arch/x86/events/intel/cqm.h @@ -45,6 +45,10 @@ static unsigned int __rmid_min_update_time = RMID_DEFAULT_MIN_UPDATE_TIME; static inline int cqm_prmid_update(struct prmid *prmid); +#define RMID_DEFAULT_TIMED_UPDATE_PERIOD 100 /* ms */ +static unsigned int __rmid_timed_update_period = + RMID_DEFAULT_TIMED_UPDATE_PERIOD; + /* * union prmid_summary: Machine-size summary of a pmonr's prmid state. * @value: One word accesor. @@ -211,6 +215,21 @@ struct pmonr { atomic64_t prmid_summary_atomic; }; +/* Store all RMIDs that can fit in a anode while keeping sizeof(struct anode) + * within one cache line (for performance). + */ +#define NR_TYPE_PER_NODE(__type) ((SMP_CACHE_BYTES - (int)sizeof(struct list_head)) / \ + (int)sizeof(__type)) + +#define NR_RMIDS_PER_NODE NR_TYPE_PER_NODE(u32) + +/* struct anode: Node of an array list used to temporarily store RMIDs. 
*/ +struct anode { + /* Last valid RMID is RMID_INVALID */ + u32 rmids[NR_RMIDS_PER_NODE]; + struct list_head entry; +}; + /* * struct pkg_data: Per-package CQM data. * @max_rmid: Max rmid valid for cpus in this package. @@ -239,6 +258,14 @@ struct pmonr { * @rotation_cpu: CPU to run @rotation_work on, it must be in the * package associated to this instance of pkg_data. * @rotation_work: Task that performs rotation of prmids. + * @timed_update_work: Task that performs periodic updates of values + * for active rmids. These values are used when + * inter-package event read is not available due to + * irqs disabled contexts. + * @timed_update_cpu: CPU to run @timed_update_work on, it must be a + * cpu in this package. + * @anode_pool_head: Pool of unused anodes. + * @anode_pool_lock: Protect @anode_pool_head. */ struct pkg_data { u32 max_rmid; @@ -268,6 +295,13 @@ struct pkg_data { struct delayed_work rotation_work; int rotation_cpu; + + struct delayed_work timed_update_work; + int timed_update_cpu; + + /* Pool of unused rmid_list_nodes and its lock */ + struct list_head anode_pool_head; + raw_spinlock_t anode_pool_lock; }; /* @@ -438,6 +472,8 @@ static inline int monr_hrchy_count_held_raw_spin_locks(void) */ static void intel_cqm_rmid_rotation_work(struct work_struct *work); +static void intel_cqm_timed_update_work(struct work_struct *work); + /* * Service Level Objectives (SLO) for the rotation logic. * -- 2.8.0.rc3.226.g39d4020