From: Sukadev Bhattiprolu <suka...@linux.vnet.ibm.com>
Date: Thu Feb  5 20:56:20 EST 2015 -0300
Subject: [RFC][PATCH] perf: Implement read_group() PMU operation

This is a lightly tested, exploratory patch to allow PMUs to return
several counters at once. Appreciate any comments :-)

Unlike normal hardware PMCs, the 24x7 counters[1] in Power8 are stored
in memory and accessed via a hypervisor call (HCALL).  A major aspect
of the HCALL is that it allows retrieving _SEVERAL_ counters at once
(unlike regular PMCs, which are read one at a time).

This patch implements a ->read_group() PMU operation that tries to
take advantage of this ability to read several counters at once.  A
PMU that implements the ->read_group() operation would allow users
to retrieve several counters at once and get a more consistent
snapshot.

NOTE:   This patch has a TODO in h_24x7_event_read_group() in that it
        still does multiple HCALLS. I think that can be optimized 
        independently, once the pmu->read_group() interface itself is
        finalized.

Appreciate comments on the ->read_group interface and on how best to manage
the interfaces between the core and PMU layers - eg: Ok for hv-24x7 PMU to
walk the ->sibling_list ?

[1] Some notes about 24x7 counters:

        Power8 supports 24x7 counters which differ from traditional PMCs
        in several ways:

        - The 24x7 counters are always on and counting. Rather than
          start/stop the PMCs, we read/report the _change_ in values
          in the counters during the execution of the workload.

        - The 24x7 counters are not tied to a task context (they are
          always on).

        - Rather than reading the event counts from registers, we make
          a hypervisor call (HCALL) to retrieve counts. The HCALL allows
          retrieving a large number of counters in a single call.

        - These counters don't generate interrupts when they overflow (so
          sampling does not apply to these counters).
---     

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 1d36314..b69fbdf 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -232,6 +232,13 @@ struct pmu {
        void (*read)                    (struct perf_event *event);
 
        /*
+        * Read a group of counters.
+        */
+       int (*read_group)               (struct perf_event *event,
+                                               u64 *values,
+                                               int ncounters);
+
+       /*
         * Group events scheduling is treated as a transaction, add
         * group events as a whole and perform one schedulability test.
         * If the test fails, roll back the whole group
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 934687f..026a9d0 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3549,10 +3549,43 @@ static int perf_event_read_group(struct perf_event 
*event,
        struct perf_event *leader = event->group_leader, *sub;
        int n = 0, size = 0, ret = -EFAULT;
        struct perf_event_context *ctx = leader->ctx;
+       u64 *valuesp;
        u64 values[5];
+       int use_group_read;
        u64 count, enabled, running;
+       struct pmu *pmu = event->pmu;
+
+       /*
+        * If PMU supports group read and group read is requested,
+        * allocate memory before taking the mutex.
+        */
+       use_group_read = 0;
+       if ((read_format & PERF_FORMAT_GROUP) && pmu->read_group) {
+               use_group_read++;
+       }
+
+       if (use_group_read) {
+               valuesp = kzalloc(leader->nr_siblings * sizeof(u64), 
GFP_KERNEL);
+               if (!valuesp)
+                       return -ENOMEM;
+       }
 
        mutex_lock(&ctx->mutex);
+
+       if (use_group_read) {
+               ret = pmu->read_group(leader, valuesp, leader->nr_siblings);
+               if (ret >= 0) {
+                       size = ret * sizeof(u64);
+
+                       ret = size;
+                       if (copy_to_user(buf, valuesp, size))
+                               ret = -EFAULT;
+               }
+
+               kfree(valuesp);
+               goto unlock;
+       }
+
        count = perf_event_read_value(leader, &enabled, &running);
 
        values[n++] = 1 + leader->nr_siblings;
diff --git a/arch/powerpc/perf/hv-24x7.c b/arch/powerpc/perf/hv-24x7.c
index 9445a82..cd48cf0 100644
--- a/arch/powerpc/perf/hv-24x7.c
+++ b/arch/powerpc/perf/hv-24x7.c
@@ -1071,12 +1071,33 @@ static int h_24x7_event_init(struct perf_event *event)
        struct hv_perf_caps caps;
        unsigned domain;
        unsigned long hret;
+       u64 read_format, inv_flags;
        u64 ct;
 
        /* Not our event */
        if (event->attr.type != event->pmu->type)
                return -ENOENT;
 
+       /*
+        * We don't support enabled/running times with PERF_FORMAT_GROUP.
+        * The ->read_group() operation is intended to be used in continuous
+        * monitoring mode, so these time values are not important at least
+        * for now.
+        *
+        * Not sure if the PERF_FORMAT_ID is useful. Block it for now.
+        */
+       read_format = event->attr.read_format;
+       inv_flags = PERF_FORMAT_TOTAL_TIME_ENABLED;
+       inv_flags |= PERF_FORMAT_TOTAL_TIME_RUNNING;
+       inv_flags |= PERF_FORMAT_ID;
+
+       if ((read_format & PERF_FORMAT_GROUP) && (read_format & inv_flags)) {
+               pr_devel("%s(): Invalid flags: rf 0x%llx, invf 0x%llx\n",
+                               __func__, (unsigned long long)read_format,
+                               (unsigned long long)inv_flags);
+               return -EINVAL;
+       }
+
        /* Unused areas must be 0 */
        if (event_get_reserved1(event) ||
            event_get_reserved2(event) ||
@@ -1181,6 +1202,50 @@ static int h_24x7_event_add(struct perf_event *event, 
int flags)
        return 0;
 }
 
+/*
+ * Store counts of ACTIVE events in @leader's group in @values (leader first;
+ * room for 1 + nr_siblings). Returns values stored, -EINVAL if too small.
+ */
+static int h_24x7_event_read_group(struct perf_event *leader, u64 *values,
+                               int ncounters)
+{
+       struct perf_event *sub;
+       unsigned long flags;
+       int n = 0;
+
+       BUG_ON(!(leader->attr.read_format & PERF_FORMAT_GROUP));
+
+       /*
+        * sys_perf_event_open() for now prevents inheritance with
+        * PERF_FORMAT_GROUP. Ensure that hasn't changed.
+        */
+       BUG_ON(!list_empty(&leader->child_list));
+
+       /* Up to 1 + nr_siblings values are stored: the leader and siblings. */
+       if (ncounters < 1 + leader->nr_siblings) {
+               pr_devel("%s(): Insufficient buffer : ns %d, nc %d\n",
+                               __func__, leader->nr_siblings, ncounters);
+               return -EINVAL;
+       }
+
+       /* NOTE(review): perf core takes ctx->lock with IRQs off; match it. */
+       raw_spin_lock_irqsave(&leader->ctx->lock, flags);
+       if (leader->state == PERF_EVENT_STATE_ACTIVE) {
+               h_24x7_event_update(leader);
+               values[n++] = local64_read(&leader->count);
+       }
+
+       /* TODO: one HCALL per event for now; soon several events per HCALL. */
+       list_for_each_entry(sub, &leader->sibling_list, group_entry) {
+               if (sub->state != PERF_EVENT_STATE_ACTIVE)
+                       continue;
+               h_24x7_event_update(sub);
+               values[n++] = local64_read(&sub->count);
+       }
+       raw_spin_unlock_irqrestore(&leader->ctx->lock, flags);
+       return n;
+}
+
 static struct pmu h_24x7_pmu = {
        .task_ctx_nr = perf_invalid_context,
 
@@ -1192,6 +1257,7 @@ static struct pmu h_24x7_pmu = {
        .start       = h_24x7_event_start,
        .stop        = h_24x7_event_stop,
        .read        = h_24x7_event_update,
+       .read_group  = h_24x7_event_read_group,
 };
 
 static int hv_24x7_init(void)

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to