Re: [RFCv5, 18/46] arm: topology: Define TC2 energy and provide it to the scheduler

2015-08-20 Thread Dietmar Eggemann

Hi Leo,

On 08/17/2015 02:19 AM, Leo Yan wrote:

[...]


>> diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
>> index b35d3e5..bbe20c7 100644
>> --- a/arch/arm/kernel/topology.c
>> +++ b/arch/arm/kernel/topology.c
>> @@ -274,6 +274,119 @@ void store_cpu_topology(unsigned int cpuid)
>>   cpu_topology[cpuid].socket_id, mpidr);
>>  }
>> 
>> +/*
>> + * ARM TC2 specific energy cost model data. There are no unit requirements for
>> + * the data. Data can be normalized to any reference point, but the
>> + * normalization must be consistent. That is, one bogo-joule/watt must be the
>> + * same quantity for all data, but we don't care what it is.
>> + */
>> +static struct idle_state idle_states_cluster_a7[] = {
>> +  { .power = 25 }, /* WFI */


> This state is confusing. Does this state correspond to all CPUs having been
> powered off while the L2 cache RAM array and the SCU are still powered on?

This is what we refer to as 'active idle'. All cpus of the cluster are
in WFI but the cluster is not in cluster-sleep yet. We measure the
corresponding energy value by disabling the 'cluster-sleep-[b,l]' state
and letting the cpus do nothing for a specific time period.
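
As a rough illustration of that setup (not the actual test harness), the
per-state 'disable' attribute in cpuidle sysfs can be used to keep the
cluster out of cluster-sleep; the state index 1 for 'cluster-sleep-[b,l]'
and the helper below are assumptions for this sketch:

#include <stdio.h>

/* Write 0/1 to the standard cpuidle per-state 'disable' attribute. */
static int set_idle_state_disabled(int cpu, int state, int disabled)
{
	char path[128];
	FILE *f;

	snprintf(path, sizeof(path),
		 "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/disable",
		 cpu, state);
	f = fopen(path, "w");
	if (!f)
		return -1;
	fprintf(f, "%d\n", disabled);
	return fclose(f);
}

/* e.g. call set_idle_state_disabled(cpu, 1, 1) for each cpu of the cluster */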



>> +  { .power = 10 }, /* cluster-sleep-l */


> Does this status mean all CPUs and the cluster have been powered off? If so,
> it should have no power consumption anymore...


The cluster is in cluster-sleep but there is still some peripheral
related to the cluster active, which explains this power value. We
calculated it from the pre/post energy value diff (by reading the
vexpress energy counter for this cluster) and the time period we were
idling on this cluster.
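
Expressed as a minimal sketch (the names are made up; it is just the
pre/post counter diff divided by the idle period):

/*
 * energy_before/energy_after: two readings of the per-cluster energy
 * counter; seconds: length of the idle period. The result is a power
 * value on the same bogo scale as the counter readings.
 */
static double idle_power(double energy_before, double energy_after,
			 double seconds)
{
	return (energy_after - energy_before) / seconds;
}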





>> + };
>> +
>> +static struct idle_state idle_states_cluster_a15[] = {
>> +  { .power = 70 }, /* WFI */
>> +  { .power = 25 }, /* cluster-sleep-b */
>> + };
>> +
>> +static struct capacity_state cap_states_cluster_a7[] = {
>> + /* Cluster only power */
>> +  { .cap =  150, .power = 2967, }, /*  350 MHz */


> For the cluster level's capacity, does it mean we need to run a benchmark on
> all CPUs within the cluster?


We run an 'always running thread per cpu' workload on {n, n-1, ..., 1}
cpus of a cluster (hotplugging out the other cpus) for a specific time
period. Then we calculate the cluster power value by extrapolating from
the power values for the {n, n-1, ..., 1} test runs and use the delta
between an n and an n+1 test run as the core power value.
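
A minimal sketch of that extrapolation (illustrative only; the linear
model P(n) = P_cluster + n * P_core and the helper below are assumptions,
not part of the patch):

/*
 * p[i] is the measured power with (i + 1) cpus of the cluster running
 * the always-running-thread workload, i = 0 .. n_runs - 1 (n_runs >= 2).
 * The average delta between consecutive runs approximates the per-core
 * power; extrapolating back to zero running cpus gives the cluster-only
 * power.
 */
static void split_cluster_core_power(const double *p, int n_runs,
				     double *p_core, double *p_cluster)
{
	double delta_sum = 0.0;
	int i;

	for (i = 1; i < n_runs; i++)
		delta_sum += p[i] - p[i - 1];

	*p_core = delta_sum / (n_runs - 1);
	*p_cluster = p[0] - *p_core;	/* p[0] was measured with one cpu */
}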


[...]


>> +static struct idle_state idle_states_core_a7[] = {
>> +  { .power = 0 }, /* WFI */


> Should there be two idle states at the CPU level (WFI and CPU power-off)?


The ARM TC2 platform has only two idle states; there is no 'cpu power off':

# cat /sys/devices/system/cpu/cpu[0,2]/cpuidle/state*/name
WFI
cluster-sleep-b
WFI
cluster-sleep-l

[...]

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFCv5, 18/46] arm: topology: Define TC2 energy and provide it to the scheduler

2015-08-17 Thread Leo Yan
Hi Morten,

On Tue, Jul 07, 2015 at 07:24:01PM +0100, Morten Rasmussen wrote:
> From: Dietmar Eggemann 
> 
> This patch is only here to be able to test provisioning of energy related
> data from an arch topology shim layer to the scheduler. Since there is no
> code today which deals with extracting energy related data from the dtb or
> acpi and processing it in the topology shim layer, the content of the
> sched_group_energy structures as well as the idle_state and capacity_state
> arrays are hard-coded here.
> 
> This patch defines the sched_group_energy structure as well as the
> idle_state and capacity_state arrays for the cluster (relates to sched
> groups (sgs) in DIE sched domain level) and for the core (relates to sgs
> in MC sd level) for a Cortex A7 as well as for a Cortex A15.
> It further provides related implementations of the sched_domain_energy_f
> functions (cpu_cluster_energy() and cpu_core_energy()).
> 
> To be able to propagate this information from the topology shim layer to
> the scheduler, the elements of the arm_topology[] table have been
> provisioned with the appropriate sched_domain_energy_f functions.
> 
> cc: Russell King 
> 
> Signed-off-by: Dietmar Eggemann 
> 
> ---
> arch/arm/kernel/topology.c | 118 +++--
>  1 file changed, 115 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
> index b35d3e5..bbe20c7 100644
> --- a/arch/arm/kernel/topology.c
> +++ b/arch/arm/kernel/topology.c
> @@ -274,6 +274,119 @@ void store_cpu_topology(unsigned int cpuid)
>   cpu_topology[cpuid].socket_id, mpidr);
>  }
>  
> +/*
> + * ARM TC2 specific energy cost model data. There are no unit requirements for
> + * the data. Data can be normalized to any reference point, but the
> + * normalization must be consistent. That is, one bogo-joule/watt must be the
> + * same quantity for all data, but we don't care what it is.
> + */
> +static struct idle_state idle_states_cluster_a7[] = {
> +  { .power = 25 }, /* WFI */

This state is confusing. Does this state correspond to all CPUs having been
powered off while the L2 cache RAM array and the SCU are still powered on?

> +  { .power = 10 }, /* cluster-sleep-l */

Does this status mean all CPUs and the cluster have been powered off? If so,
it should have no power consumption anymore...

> + };
> +
> +static struct idle_state idle_states_cluster_a15[] = {
> +  { .power = 70 }, /* WFI */
> +  { .power = 25 }, /* cluster-sleep-b */
> + };
> +
> +static struct capacity_state cap_states_cluster_a7[] = {
> + /* Cluster only power */
> +  { .cap =  150, .power = 2967, }, /*  350 MHz */

For the cluster level's capacity, does it mean we need to run a benchmark on
all CPUs within the cluster?

> +  { .cap =  172, .power = 2792, }, /*  400 MHz */
> +  { .cap =  215, .power = 2810, }, /*  500 MHz */
> +  { .cap =  258, .power = 2815, }, /*  600 MHz */
> +  { .cap =  301, .power = 2919, }, /*  700 MHz */
> +  { .cap =  344, .power = 2847, }, /*  800 MHz */
> +  { .cap =  387, .power = 3917, }, /*  900 MHz */
> +  { .cap =  430, .power = 4905, }, /* 1000 MHz */
> + };
> +
> +static struct capacity_state cap_states_cluster_a15[] = {
> + /* Cluster only power */
> +  { .cap =  426, .power =  7920, }, /*  500 MHz */
> +  { .cap =  512, .power =  8165, }, /*  600 MHz */
> +  { .cap =  597, .power =  8172, }, /*  700 MHz */
> +  { .cap =  682, .power =  8195, }, /*  800 MHz */
> +  { .cap =  768, .power =  8265, }, /*  900 MHz */
> +  { .cap =  853, .power =  8446, }, /* 1000 MHz */
> +  { .cap =  938, .power = 11426, }, /* 1100 MHz */
> +  { .cap = 1024, .power = 15200, }, /* 1200 MHz */
> + };
> +
> +static struct sched_group_energy energy_cluster_a7 = {
> +	.nr_idle_states	= ARRAY_SIZE(idle_states_cluster_a7),
> +	.idle_states	= idle_states_cluster_a7,
> +	.nr_cap_states	= ARRAY_SIZE(cap_states_cluster_a7),
> +	.cap_states	= cap_states_cluster_a7,
> +};
> +
> +static struct sched_group_energy energy_cluster_a15 = {
> +	.nr_idle_states	= ARRAY_SIZE(idle_states_cluster_a15),
> +	.idle_states	= idle_states_cluster_a15,
> +	.nr_cap_states	= ARRAY_SIZE(cap_states_cluster_a15),
> +	.cap_states	= cap_states_cluster_a15,
> +};
> +
> +static struct idle_state idle_states_core_a7[] = {
> +  { .power = 0 }, /* WFI */

Should there be two idle states at the CPU level (WFI and CPU power-off)?

> + };
> +
> +static struct idle_state idle_states_core_a15[] = {
> +  { .power = 0 }, /* WFI */
> + };
> +
> +static struct capacity_state cap_states_core_a7[] = {
> + /* Power per cpu */
> +  { .cap =  150, .power =  187, }, /*  350 MHz */
> +  { .cap =  172, .power =  275, }, /*  400 MHz */
> +  { .cap =  215, .power =  334, }, /*  500 MHz */
> +  { .cap =  258, .power =  407, }, /*  600 MHz */
> +  { .cap =  301, .power =  447, }, /*  700 MHz */
