Re: [PATCH] cpuidle/powernv : init all present cpus for deep states

2018-05-23 Thread Akshay Adiga
On Wed, May 16, 2018 at 05:32:14PM +0530, Akshay Adiga wrote:
> Init all present cpus for deep states instead of "all possible" cpus.
> Init fails if the possible cpu is gaurded. Resulting in making only
> non-deep states available for cpuidle/hotplug.
> 
> Signed-off-by: Akshay Adiga 
> ---
>  arch/powerpc/platforms/powernv/idle.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/powernv/idle.c 
> b/arch/powerpc/platforms/powernv/idle.c
> index 1f12ab1..1c5d067 100644
> --- a/arch/powerpc/platforms/powernv/idle.c
> +++ b/arch/powerpc/platforms/powernv/idle.c
> @@ -79,7 +79,7 @@ static int pnv_save_sprs_for_deep_states(void)
>   uint64_t msr_val = MSR_IDLE;
>   uint64_t psscr_val = pnv_deepest_stop_psscr_val;
> 
> - for_each_possible_cpu(cpu) {
> + for_each_present_cpu(cpu) {
>   uint64_t pir = get_hard_smp_processor_id(cpu);
>   uint64_t hsprg0_val = (uint64_t)paca_ptrs[cpu];
> 
> @@ -814,7 +814,7 @@ static int __init pnv_init_idle_states(void)
>   int cpu;
> 
>   pr_info("powernv: idle: Saving PACA pointers of all CPUs in 
> their thread sibling PACA\n");
> - for_each_possible_cpu(cpu) {
> + for_each_present_cpu(cpu) {
>   int base_cpu = cpu_first_thread_sibling(cpu);
>   int idx = cpu_thread_in_core(cpu);
>   int i;
> -- 
> 2.5.5
>

Missed CC'ing to  m...@ellerman.id.au 



Re: [PATCH] cpufreq: powernv: fix missing check of return value in init_powernv_pstates()

2019-02-17 Thread Akshay Adiga
On Sat, Feb 16, 2019 at 12:06:23PM -0500, Yangtao Li wrote:
> kmalloc() could fail, so insert a check of its return value. And
> if it fails, returns -ENOMEM.
> 
> And remove (struct pstate_idx_revmap_data *) to fix coccinelle WARNING
> by the way.
> 
> WARNING: casting value returned by memory allocation function to (struct
> pstate_idx_revmap_data *) is useless.
> 
> Signed-off-by: Yangtao Li 
> ---
>  drivers/cpufreq/powernv-cpufreq.c | 10 +++---
>  1 file changed, 7 insertions(+), 3 deletions(-)
> 
Looks good to me. Thanks for fixing this.



[PATCH 1/3] powernv/cpuidle: Parse dt idle properties into global structure

2018-06-18 Thread Akshay Adiga
Device-tree parsing happens in twice, once while deciding idle state to
be used for hotplug and once during cpuidle init. Hence, parsing the
device tree and caching it will reduce code duplication. Parsing code
has been moved to pnv_parse_cpuidle_dt() from pnv_probe_idle_states().

Setting up things so that number of available idle states can be
accessible to cpuidle-powernv driver. Hence adding nr_pnv_idle_states to
track number of idle states.

Signed-off-by: Akshay Adiga 
---
 arch/powerpc/include/asm/cpuidle.h|  14 +++
 arch/powerpc/platforms/powernv/idle.c | 197 --
 2 files changed, 152 insertions(+), 59 deletions(-)

diff --git a/arch/powerpc/include/asm/cpuidle.h 
b/arch/powerpc/include/asm/cpuidle.h
index e210a83..55ee7e3 100644
--- a/arch/powerpc/include/asm/cpuidle.h
+++ b/arch/powerpc/include/asm/cpuidle.h
@@ -79,6 +79,20 @@ struct stop_sprs {
u64 mmcra;
 };
 
+#define PNV_IDLE_NAME_LEN16
+struct pnv_idle_states_t {
+   char name[PNV_IDLE_NAME_LEN];
+   u32 latency_ns;
+   u32 residency_ns;
+   /*
+* Register value/mask used to select different idle states.
+* PMICR in POWER8 and PSSCR in POWER9
+*/
+   u64 pm_ctrl_reg_val;
+   u64 pm_ctrl_reg_mask;
+   u32 flags;
+};
+
 extern u32 pnv_fastsleep_workaround_at_entry[];
 extern u32 pnv_fastsleep_workaround_at_exit[];
 
diff --git a/arch/powerpc/platforms/powernv/idle.c 
b/arch/powerpc/platforms/powernv/idle.c
index 1c5d067..07be984 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -36,6 +36,8 @@
 #define P9_STOP_SPR_PSSCR  855
 
 static u32 supported_cpuidle_states;
+struct pnv_idle_states_t *pnv_idle_states;
+int nr_pnv_idle_states;
 
 /*
  * The default stop state that will be used by ppc_md.power_save
@@ -625,45 +627,8 @@ int validate_psscr_val_mask(u64 *psscr_val, u64 
*psscr_mask, u32 flags)
 static int __init pnv_power9_idle_init(struct device_node *np, u32 *flags,
int dt_idle_states)
 {
-   u64 *psscr_val = NULL;
-   u64 *psscr_mask = NULL;
-   u32 *residency_ns = NULL;
u64 max_residency_ns = 0;
-   int rc = 0, i;
-
-   psscr_val = kcalloc(dt_idle_states, sizeof(*psscr_val), GFP_KERNEL);
-   psscr_mask = kcalloc(dt_idle_states, sizeof(*psscr_mask), GFP_KERNEL);
-   residency_ns = kcalloc(dt_idle_states, sizeof(*residency_ns),
-  GFP_KERNEL);
-
-   if (!psscr_val || !psscr_mask || !residency_ns) {
-   rc = -1;
-   goto out;
-   }
-
-   if (of_property_read_u64_array(np,
-   "ibm,cpu-idle-state-psscr",
-   psscr_val, dt_idle_states)) {
-   pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-psscr in 
DT\n");
-   rc = -1;
-   goto out;
-   }
-
-   if (of_property_read_u64_array(np,
-  "ibm,cpu-idle-state-psscr-mask",
-  psscr_mask, dt_idle_states)) {
-   pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-psscr-mask 
in DT\n");
-   rc = -1;
-   goto out;
-   }
-
-   if (of_property_read_u32_array(np,
-  "ibm,cpu-idle-state-residency-ns",
-   residency_ns, dt_idle_states)) {
-   pr_warn("cpuidle-powernv: missing 
ibm,cpu-idle-state-residency-ns in DT\n");
-   rc = -1;
-   goto out;
-   }
+   int i;
 
/*
 * Set pnv_first_deep_stop_state, pnv_deepest_stop_psscr_{val,mask},
@@ -681,31 +646,33 @@ static int __init pnv_power9_idle_init(struct device_node 
*np, u32 *flags,
pnv_first_deep_stop_state = MAX_STOP_STATE;
for (i = 0; i < dt_idle_states; i++) {
int err;
-   u64 psscr_rl = psscr_val[i] & PSSCR_RL_MASK;
+   struct pnv_idle_states_t *state = &pnv_idle_states[i];
+   u64 psscr_rl = state->pm_ctrl_reg_val & PSSCR_RL_MASK;
 
-   if ((flags[i] & OPAL_PM_LOSE_FULL_CONTEXT) &&
-(pnv_first_deep_stop_state > psscr_rl))
+   if ((state->flags & OPAL_PM_LOSE_FULL_CONTEXT) &&
+   (pnv_first_deep_stop_state > psscr_rl))
pnv_first_deep_stop_state = psscr_rl;
 
-   err = validate_psscr_val_mask(&psscr_val[i], &psscr_mask[i],
- flags[i]);
+   err = validate_psscr_val_mask(&state->pm_ctrl_reg_val,
+ &state->pm_ctrl_reg_mask,
+ state->flags);
if (err) {
-   report_invalid_psscr

[PATCH 3/3] powernv/cpuidle: Use parsed device tree values for cpuidle_init

2018-06-18 Thread Akshay Adiga
Export pnv_idle_states and nr_pnv_idle_states so that its accessible to
cpuidle driver. Use properties from pnv_idle_states structure for powernv
cpuidle_init.

Signed-off-by: Akshay Adiga 
---
 arch/powerpc/include/asm/cpuidle.h |  2 ++
 drivers/cpuidle/cpuidle-powernv.c  | 49 +++---
 2 files changed, 32 insertions(+), 19 deletions(-)

diff --git a/arch/powerpc/include/asm/cpuidle.h 
b/arch/powerpc/include/asm/cpuidle.h
index 55ee7e3..1446747 100644
--- a/arch/powerpc/include/asm/cpuidle.h
+++ b/arch/powerpc/include/asm/cpuidle.h
@@ -93,6 +93,8 @@ struct pnv_idle_states_t {
u32 flags;
 };
 
+extern struct pnv_idle_states_t *pnv_idle_states;
+extern int nr_pnv_idle_states;
 extern u32 pnv_fastsleep_workaround_at_entry[];
 extern u32 pnv_fastsleep_workaround_at_exit[];
 
diff --git a/drivers/cpuidle/cpuidle-powernv.c 
b/drivers/cpuidle/cpuidle-powernv.c
index d29e4f0..de8ba26 100644
--- a/drivers/cpuidle/cpuidle-powernv.c
+++ b/drivers/cpuidle/cpuidle-powernv.c
@@ -285,6 +285,11 @@ static int powernv_add_idle_states(void)
goto out;
}
 
+   if (nr_pnv_idle_states <= 0) {
+   pr_warn("opal: No idle states found\n");
+   goto out;
+   }
+
/* Read values of any property to determine the num of idle states */
dt_idle_states = of_property_count_u32_elems(power_mgt, 
"ibm,cpu-idle-state-flags");
if (dt_idle_states < 0) {
@@ -338,7 +343,7 @@ static int powernv_add_idle_states(void)
 * If the idle states use stop instruction, probe for psscr values
 * and psscr mask which are necessary to specify required stop level.
 */
-   has_stop_states = (flags[0] &
+   has_stop_states = (pnv_idle_states[0].flags &
   (OPAL_PM_STOP_INST_FAST | OPAL_PM_STOP_INST_DEEP));
if (has_stop_states) {
count = of_property_count_u64_elems(power_mgt,
@@ -387,51 +392,55 @@ static int powernv_add_idle_states(void)
residency_ns, dt_idle_states);
}
 
-   for (i = 0; i < dt_idle_states; i++) {
+   for (i = 0; i < nr_pnv_idle_states; i++) {
unsigned int exit_latency, target_residency;
bool stops_timebase = false;
+   struct pnv_idle_states_t *state = &pnv_idle_states[i];
 
/*
 * Skip the platform idle state whose flag isn't in
-* the supported_cpuidle_states flag mask.
+* the supported_pnv_idle_states flag mask.
 */
-   if ((flags[i] & supported_flags) != flags[i])
+   if ((state->flags & supported_flags) !=
+   state->flags)
continue;
/*
 * If an idle state has exit latency beyond
 * POWERNV_THRESHOLD_LATENCY_NS then don't use it
 * in cpu-idle.
 */
-   if (latency_ns[i] > POWERNV_THRESHOLD_LATENCY_NS)
+   if (state->latency_ns > POWERNV_THRESHOLD_LATENCY_NS)
continue;
/*
 * Firmware passes residency and latency values in ns.
 * cpuidle expects it in us.
 */
-   exit_latency = DIV_ROUND_UP(latency_ns[i], 1000);
+   exit_latency = DIV_ROUND_UP(state->latency_ns, 1000);
if (!rc)
-   target_residency = DIV_ROUND_UP(residency_ns[i], 1000);
+   target_residency = DIV_ROUND_UP(state->residency_ns, 
1000);
else
target_residency = 0;
 
if (has_stop_states) {
-   int err = validate_psscr_val_mask(&psscr_val[i],
- &psscr_mask[i],
- flags[i]);
+   int err;
+   err = validate_psscr_val_mask(&state->pm_ctrl_reg_val,
+ &state->pm_ctrl_reg_mask,
+ state->flags);
if (err) {
-   report_invalid_psscr_val(psscr_val[i], err);
+   report_invalid_psscr_val(state->pm_ctrl_reg_val,
+err);
continue;
}
}
 
-   if (flags[i] & OPAL_PM_TIMEBASE_STOP)
+   if (state->flags & OPAL_PM_TIMEBASE_STOP)
stops_timebase = true;
 
/*
 * For nap and fastsleep, use default target_residency
 * values if f/w does

[PATCH] cpuidle/powernv : init all present cpus for deep states

2018-05-16 Thread Akshay Adiga
Init all present cpus for deep states instead of "all possible" cpus.
Init fails if the possible cpu is gaurded. Resulting in making only
non-deep states available for cpuidle/hotplug.

Signed-off-by: Akshay Adiga 
---
 arch/powerpc/platforms/powernv/idle.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/idle.c 
b/arch/powerpc/platforms/powernv/idle.c
index 1f12ab1..1c5d067 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -79,7 +79,7 @@ static int pnv_save_sprs_for_deep_states(void)
uint64_t msr_val = MSR_IDLE;
uint64_t psscr_val = pnv_deepest_stop_psscr_val;
 
-   for_each_possible_cpu(cpu) {
+   for_each_present_cpu(cpu) {
uint64_t pir = get_hard_smp_processor_id(cpu);
uint64_t hsprg0_val = (uint64_t)paca_ptrs[cpu];
 
@@ -814,7 +814,7 @@ static int __init pnv_init_idle_states(void)
int cpu;
 
pr_info("powernv: idle: Saving PACA pointers of all CPUs in 
their thread sibling PACA\n");
-   for_each_possible_cpu(cpu) {
+   for_each_present_cpu(cpu) {
int base_cpu = cpu_first_thread_sibling(cpu);
int idx = cpu_thread_in_core(cpu);
int i;
-- 
2.5.5



Re: [PATCH] cpuidle/powernv : init all present cpus for deep states

2018-05-16 Thread Akshay Adiga
Yes this needs to be sent to  stable. 

Fixes: d405a98c ("powerpc/powernv: Move cpuidle related code from setup.c
to new file")



[RFC PATCH v2 1/3] cpuidle/powernv: Add support for states with ibm,cpuidle-state-v1

2018-10-11 Thread Akshay Adiga
This patch adds support for new device-tree format for idle state
description.

Previously if a older kernel runs on a newer firmware, it may enable
all available states irrespective of its capability of handling it.
New device tree format adds a compatible flag, so that only kernel
which has the capability to handle the version of stop state will enable
it.

Older kernel will still see stop0 and stop0_lite in older format and we
will depricate it after some time.

1) Idea is to bump up the version in firmware if we find a bug or
regression in stop states. A fix will be provided in linux which would
now know about the bumped up version of stop states, where as kernel
without fixes would ignore the states.

2) Slowly deprecate cpuidle /cpuhotplug threshold which is hard-coded
into cpuidle-powernv driver. Instead use compatible strings to indicate
if idle state is suitable for cpuidle and hotplug.

New idle state device tree format :
   power-mgt {
...
 ibm,enabled-stop-levels = <0xec00>;
 ibm,cpu-idle-state-psscr-mask = <0x0 0x3003ff 0x0 0x3003ff>;
 ibm,cpu-idle-state-latencies-ns = <0x3e8 0x7d0>;
 ibm,cpu-idle-state-psscr = <0x0 0x330 0x0 0x300330>;
 ibm,cpu-idle-state-flags = <0x10 0x101000>;
 ibm,cpu-idle-state-residency-ns = <0x2710 0x4e20>;
 ibm,idle-states {
 stop4 {
 flags = <0x207000>;
 compatible = "ibm,state-v1",
  "opal-supported";
 type = "cpuidle";
 psscr-mask = <0x0 0x3003ff>;
 handle = <0x102>;
 latency-ns = <0x186a0>;
 residency-ns = <0x989680>;
 psscr = <0x0 0x300374>;
  };
...
stop11 {
 ...
 compatible = "ibm,state-v1",
  "opal-supported";
 type = "cpuoffline";
 ...
  };
 };
type strings :
"cpuidle" : indicates it should be used by cpuidle-driver
"cpuoffline" : indicates it should be used by hotplug driver

compatible strings :
"ibm,state-v1" : kernel checks if it knows about this version
"opal-supported" : indicates kernel can fall back to use opal
   for stop-transitions

Signed-off-by: Akshay Adiga 
---

Changes from v1 :
 - Code is rebased on Nick Piggin's v4 patch "powerpc/64s: reimplement book3s
   idle code in C"
 - Moved "cpuidle" and "cpuoffline" as seperate property called
   "type"
 

 arch/powerpc/include/asm/cpuidle.h|   9 ++
 arch/powerpc/platforms/powernv/idle.c | 132 +-
 drivers/cpuidle/cpuidle-powernv.c |  31 --
 3 files changed, 160 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/include/asm/cpuidle.h 
b/arch/powerpc/include/asm/cpuidle.h
index 9844b3ded187..e920a15e797f 100644
--- a/arch/powerpc/include/asm/cpuidle.h
+++ b/arch/powerpc/include/asm/cpuidle.h
@@ -70,14 +70,23 @@
 
 #ifndef __ASSEMBLY__
 
+enum idle_state_type_t {
+   CPUIDLE_TYPE,
+   CPUOFFLINE_TYPE
+};
+
+#define POWERNV_THRESHOLD_LATENCY_NS 20
+#define PNV_VER_NAME_LEN32
 #define PNV_IDLE_NAME_LEN16
 struct pnv_idle_states_t {
char name[PNV_IDLE_NAME_LEN];
+   char version[PNV_VER_NAME_LEN];
u32 latency_ns;
u32 residency_ns;
u64 psscr_val;
u64 psscr_mask;
u32 flags;
+   enum idle_state_type_t type;
bool valid;
 };
 
diff --git a/arch/powerpc/platforms/powernv/idle.c 
b/arch/powerpc/platforms/powernv/idle.c
index 96186af9e953..755918402591 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -54,6 +54,20 @@ static bool default_stop_found;
 static u64 pnv_first_tb_loss_level = MAX_STOP_STATE + 1;
 static u64 pnv_first_hv_loss_level = MAX_STOP_STATE + 1;
 
+
+static int parse_dt_v1(struct device_node *np);
+struct stop_version_t {
+   const char name[PNV_VER_NAME_LEN];
+   int (*parser_fn)(struct device_node *np);
+};
+struct stop_version_t known_versions[] = {
+   {
+   .name =  "ibm,state-v1",
+   .parser_fn = parse_dt_v1,
+   }
+   };
+const int nr_known_versions = 1;
+
 /*
  * psscr value and mask of the deepest stop idle state.
  * Used when a cpu is offlined.
@@ -1195,6 +1209,77 @@ static void __init pnv_probe_idle_states(void)
supported_cpuidle_states |= pnv_idle_states[i].flags;
 }
 
+static int parse_dt_v1(struct device_node *dt_node)
+{
+   const char *temp_

[RFC PATCH v2 0/3] New device-tree format and Opal based idle save-restore

2018-10-11 Thread Akshay Adiga
Previously if a older kernel runs on a newer firmware, it may enable
all available states irrespective of its capability of handling it.
New device tree format adds a compatible flag, so that only kernel
which has the capability to handle the version of stop state will enable
it.

Older kernel will still see stop0 and stop0_lite in older format and we
will depricate it after some time.

1) Idea is to bump up the version string in firmware if we find a bug or
regression in stop states. A fix will be provided in linux which would
now know about the bumped up version of stop states, where as kernel
without fixes would ignore the states.

2) Slowly deprecate cpuidle/cpuhotplug threshold which is hard-coded
into cpuidle-powernv driver. Instead use compatible strings to indicate
if idle state is suitable for cpuidle and hotplug.

New idle state device tree format :
   power-mgt {
...
 ibm,enabled-stop-levels = <0xec00>;
 ibm,cpu-idle-state-psscr-mask = <0x0 0x3003ff 0x0 0x3003ff>;
 ibm,cpu-idle-state-latencies-ns = <0x3e8 0x7d0>;
 ibm,cpu-idle-state-psscr = <0x0 0x330 0x0 0x300330>;
 ibm,cpu-idle-state-flags = <0x10 0x101000>;
 ibm,cpu-idle-state-residency-ns = <0x2710 0x4e20>;
 ibm,idle-states {
 stop4 {
 flags = <0x207000>;
 compatible = "ibm,state-v1",
  "opal-support";
 type = "cpuidle";
 psscr-mask = <0x0 0x3003ff>;
 handle = <0x102>;
 latency-ns = <0x186a0>;
 residency-ns = <0x989680>;
 psscr = <0x0 0x300374>;
  };
...
stop11 {
 ...
 compatible = "ibm,state-v1",
  "opal-support";
 type = "cpuoffline";
 ...
  };
 };

High-level parsing algorithm :

Say Known version string = "ibm,state-v1"

for each stop state node in device tree:
if (compatible has known version string)
kernel takes care of stop-transitions
else if (compatible has "opal-support")
OPAL takes care of stop-transitions
else
Skip All deeper states

When a state does not have both version support and opal support,
Its possible to exit from a shallower state. Hence skipping all
deeper states.

OPAL support for idle states


With this patch series, all the states that loose hypervisor state
will be handled through opal_call.

Patch 3 adds support for Saving/restoring of SPRs and resync-timebase
in OPAL. Also all the decision making such as identifying first thread
in the core and taking locks before restoring, etc are implemented in
OPAL.

How does it work ?
---

Consider a case that stop4 has a bug. We take the following steps to
mitigate the problem.

1) Change compatible string for stop4 in OPAL to "ibm-state-v2" and
remove "opal-supported". ship the new firmware.
The kernel ignores stop4 and all deeper states. But we will still have
shallower states. Prevents from completely disabling stop states.

2) Implement workaround in OPAL and add "opal-supported". Ship new firmware
The kernel uses opal for stop-transtion , which has workaround implemented.
We get stop4 and deeper states working without kernel changes and backports.
(and considerably less time)

3) Implement workaround in kernel and add "ibm-state-v2" as known versions
The kernel will now be able to handle stop4 and deeper states.

Changes from v1 :
 - Code is rebased on Nick Piggin's v4 patch "powerpc/64s: reimplement book3s
   idle code in C"
http://patchwork.ozlabs.org/patch/969596/
 - All the states that loses hypervisor states will be handled by OPAL
 - All the decision making such as identifying first thread in
   the core and taking locks before restoring in such cases have also been
   moved to OPAL


Abhishek Goel (1):
  cpuidle/powernv: save-restore sprs in opal

Akshay Adiga (2):
  cpuidle/powernv: Add support for states with ibm,cpuidle-state-v1
  powernv/cpuidle: Pass pointers instead of values to  stop loop

 arch/powerpc/include/asm/cpuidle.h|   9 +
 arch/powerpc/include/asm/opal-api.h   |   4 +-
 arch/powerpc/include/asm/opal.h   |   3 +
 arch/powerpc/include/asm/processor.h  |   8 +-
 arch/powerpc/kernel/idle_book3s.S |   6 +-
 arch/powerpc/platforms/powernv/idle.c | 247 ++
 .../powerpc/platforms/powernv/opal-wrappers.S |   2 +
 drivers/cpuidle/cpuidle-powernv.c |  46 ++--
 8 files changed, 251 insertions(+), 74 deletions(-)

-- 
2.17.1



[RFC PATCH v2 3/3] cpuidle/powernv: save-restore sprs in opal

2018-10-11 Thread Akshay Adiga
From: Abhishek Goel 

This patch moves the saving and restoring of sprs for P9 cpuidle
from kernel to opal.
In an attempt to make the powernv idle code backward compatible,
and to some extent forward compatible, add support for pre-stop entry
and post-stop exit actions in OPAL. If a kernel knows about this
opal call, then just a firmware supporting newer hardware is required,
instead of waiting for kernel updates.

Signed-off-by: Abhishek Goel 
Signed-off-by: Akshay Adiga 
---
Changes from v1 :
 - Code is rebased on Nick Piggin's v4 patch "powerpc/64s: reimplement book3s
   idle code in C"
 - Set a global variable "request_opal_call" to indicate that deep
   states should make opal_call.
 - All the states that loses hypervisor states will be handled by OPAL
 - All the decision making such as identifying first thread in
   the core and taking locks before restoring in such cases have also been
   moved to OPAL
 arch/powerpc/include/asm/opal-api.h   |  4 +-
 arch/powerpc/include/asm/opal.h   |  3 +
 arch/powerpc/include/asm/processor.h  |  3 +-
 arch/powerpc/kernel/idle_book3s.S |  6 +-
 arch/powerpc/platforms/powernv/idle.c | 88 +--
 .../powerpc/platforms/powernv/opal-wrappers.S |  2 +
 6 files changed, 77 insertions(+), 29 deletions(-)

diff --git a/arch/powerpc/include/asm/opal-api.h 
b/arch/powerpc/include/asm/opal-api.h
index 8365353330b4..93ea1f79e295 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -210,7 +210,9 @@
 #define OPAL_PCI_GET_PBCQ_TUNNEL_BAR   164
 #define OPAL_PCI_SET_PBCQ_TUNNEL_BAR   165
 #defineOPAL_NX_COPROC_INIT 167
-#define OPAL_LAST  167
+#define OPAL_IDLE_SAVE 170
+#define OPAL_IDLE_RESTORE  171
+#define OPAL_LAST  171
 
 #define QUIESCE_HOLD   1 /* Spin all calls at entry */
 #define QUIESCE_REJECT 2 /* Fail all calls with OPAL_BUSY */
diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index ff3866473afe..26995e16171e 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -356,6 +356,9 @@ extern int opal_handle_hmi_exception(struct pt_regs *regs);
 extern void opal_shutdown(void);
 extern int opal_resync_timebase(void);
 
+extern int opal_cpuidle_save(u64 psscr);
+extern int opal_cpuidle_restore(u64 psscr, u64 srr1);
+
 extern void opal_lpc_init(void);
 
 extern void opal_kmsg_init(void);
diff --git a/arch/powerpc/include/asm/processor.h 
b/arch/powerpc/include/asm/processor.h
index 822d3236ad7f..26fa6c1836f4 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -510,7 +510,8 @@ static inline unsigned long get_clean_sp(unsigned long sp, 
int is_32)
 
 /* asm stubs */
 extern unsigned long isa300_idle_stop_noloss(unsigned long psscr_val);
-extern unsigned long isa300_idle_stop_mayloss(unsigned long psscr_val);
+extern unsigned long isa300_idle_stop_mayloss(unsigned long psscr_val,
+   bool request_opal_call);
 extern unsigned long isa206_idle_insn_mayloss(unsigned long type);
 
 extern unsigned long cpuidle_disable;
diff --git a/arch/powerpc/kernel/idle_book3s.S 
b/arch/powerpc/kernel/idle_book3s.S
index ffdee1ab4388..a2014d152035 100644
--- a/arch/powerpc/kernel/idle_book3s.S
+++ b/arch/powerpc/kernel/idle_book3s.S
@@ -52,14 +52,16 @@ _GLOBAL(isa300_idle_stop_noloss)
 _GLOBAL(isa300_idle_stop_mayloss)
mtspr   SPRN_PSSCR,r3
std r1,PACAR1(r13)
-   mflrr4
+   mflrr7
mfcrr5
/* use stack red zone rather than a new frame */
addir6,r1,-INT_FRAME_SIZE
SAVE_GPR(2, r6)
SAVE_NVGPRS(r6)
-   std r4,_LINK(r6)
+   std r7,_LINK(r6)
std r5,_CCR(r6)
+   cmpwi   r4,0
+   bne opal_cpuidle_save
PPC_STOP
b   .   /* catch bugs */
 
diff --git a/arch/powerpc/platforms/powernv/idle.c 
b/arch/powerpc/platforms/powernv/idle.c
index 681a23a066bb..bcfe08022e65 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -171,6 +171,7 @@ static void pnv_fastsleep_workaround_apply(void *info)
 
 static bool power7_fastsleep_workaround_entry = true;
 static bool power7_fastsleep_workaround_exit = true;
+static bool request_opal_call = false;
 
 /*
  * Used to store fastsleep workaround state
@@ -604,6 +605,7 @@ static unsigned long power9_idle_stop(unsigned long psscr, 
bool mmu_on)
unsigned long mmcr0 = 0;
struct p9_sprs sprs;
bool sprs_saved = false;
+   bool is_hv_loss = false;
 
memset(&sprs, 0, sizeof(sprs));
 
@@ -648,7 +650,9 @@ static unsigned long power9_idle_stop(unsigned long psscr, 
bool mmu_on)
  */
  

[RFC PATCH v2 2/3] powernv/cpuidle: Pass pointers instead of values to stop loop

2018-10-11 Thread Akshay Adiga
Passing pointer to the pnv_idle_state instead of psscr value and mask.
This helps us to pass more information to the stop loop. This will help to
figure out the method to enter/exit idle state.

Signed-off-by: Akshay Adiga 

---
Changes from v1 :
 - Code is rebased on Nick Piggin's v4 patch "powerpc/64s: reimplement book3s
   idle code in C"

 arch/powerpc/include/asm/processor.h  |  5 ++-
 arch/powerpc/platforms/powernv/idle.c | 47 ++-
 drivers/cpuidle/cpuidle-powernv.c | 15 +++--
 3 files changed, 24 insertions(+), 43 deletions(-)

diff --git a/arch/powerpc/include/asm/processor.h 
b/arch/powerpc/include/asm/processor.h
index 936795acba48..822d3236ad7f 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -43,6 +43,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /* We do _not_ want to define new machine types at all, those must die
  * in favor of using the device-tree
@@ -518,9 +519,7 @@ enum idle_boot_override {IDLE_NO_OVERRIDE = 0, 
IDLE_POWERSAVE_OFF};
 extern int powersave_nap;  /* set if nap mode can be used in idle loop */
 
 extern void power7_idle_type(unsigned long type);
-extern void power9_idle_type(unsigned long stop_psscr_val,
- unsigned long stop_psscr_mask);
-
+extern void power9_idle_type(struct pnv_idle_states_t *state);
 extern void flush_instruction_cache(void);
 extern void hard_reset_now(void);
 extern void poweroff_now(void);
diff --git a/arch/powerpc/platforms/powernv/idle.c 
b/arch/powerpc/platforms/powernv/idle.c
index 755918402591..681a23a066bb 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -44,8 +44,7 @@ int nr_pnv_idle_states;
  * The default stop state that will be used by ppc_md.power_save
  * function on platforms that support stop instruction.
  */
-static u64 pnv_default_stop_val;
-static u64 pnv_default_stop_mask;
+struct pnv_idle_states_t *pnv_default_state;
 static bool default_stop_found;
 
 /*
@@ -72,9 +71,7 @@ const int nr_known_versions = 1;
  * psscr value and mask of the deepest stop idle state.
  * Used when a cpu is offlined.
  */
-static u64 pnv_deepest_stop_psscr_val;
-static u64 pnv_deepest_stop_psscr_mask;
-static u64 pnv_deepest_stop_flag;
+static struct pnv_idle_states_t *pnv_deepest_state;
 static bool deepest_stop_found;
 
 static unsigned long power7_offline_type;
@@ -96,7 +93,7 @@ static int pnv_save_sprs_for_deep_states(void)
uint64_t hid5_val   = mfspr(SPRN_HID5);
uint64_t hmeer_val  = mfspr(SPRN_HMEER);
uint64_t msr_val = MSR_IDLE;
-   uint64_t psscr_val = pnv_deepest_stop_psscr_val;
+   uint64_t psscr_val = pnv_deepest_state->psscr_val;
 
for_each_present_cpu(cpu) {
uint64_t pir = get_hard_smp_processor_id(cpu);
@@ -820,17 +817,15 @@ static unsigned long power9_offline_stop(unsigned long 
psscr)
return srr1;
 }
 
-static unsigned long __power9_idle_type(unsigned long stop_psscr_val,
- unsigned long stop_psscr_mask)
+static unsigned long __power9_idle_type(struct pnv_idle_states_t *state)
 {
unsigned long psscr;
unsigned long srr1;
 
if (!prep_irq_for_idle_irqsoff())
return 0;
-
psscr = mfspr(SPRN_PSSCR);
-   psscr = (psscr & ~stop_psscr_mask) | stop_psscr_val;
+   psscr = (psscr & ~state->psscr_mask) | state->psscr_val;
 
__ppc64_runlatch_off();
srr1 = power9_idle_stop(psscr, true);
@@ -841,12 +836,10 @@ static unsigned long __power9_idle_type(unsigned long 
stop_psscr_val,
return srr1;
 }
 
-void power9_idle_type(unsigned long stop_psscr_val,
- unsigned long stop_psscr_mask)
+void power9_idle_type(struct pnv_idle_states_t *state)
 {
unsigned long srr1;
-
-   srr1 = __power9_idle_type(stop_psscr_val, stop_psscr_mask);
+   srr1 = __power9_idle_type(state);
irq_set_pending_from_srr1(srr1);
 }
 
@@ -855,7 +848,7 @@ void power9_idle_type(unsigned long stop_psscr_val,
  */
 void power9_idle(void)
 {
-   power9_idle_type(pnv_default_stop_val, pnv_default_stop_mask);
+   power9_idle_type(pnv_default_state);
 }
 
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
@@ -974,8 +967,8 @@ unsigned long pnv_cpu_offline(unsigned int cpu)
unsigned long psscr;
 
psscr = mfspr(SPRN_PSSCR);
-   psscr = (psscr & ~pnv_deepest_stop_psscr_mask) |
-   pnv_deepest_stop_psscr_val;
+   psscr = (psscr & ~pnv_deepest_state->psscr_mask) |
+   pnv_deepest_state->psscr_val;
srr1 = power9_offline_stop(psscr);
} else if (cpu_has_feature(CPU_FTR_ARCH_206) && power7_offline_type) {
srr1 = power7_offline();
@@ -1123,16 +1116,13 @@ static void 

[PATCH] Work around for enabling CONFIG_CMDLINE on ppc64le

2016-09-21 Thread Akshay Adiga
Observed that boot arguments (passed as CONFIG_CMDLINE)  are not being
picked up by kernel while using gcc-ppc64-linux-gnu v5.4.0 and v6.1.1.
While it works as expected with v5.3.1 .

Found that in init/main.c in  setup_command_line() the pointers passed to
strcpy() is messed up.

source for setup_command_line from init/main.c:
void setup_command_line(char *command_line)
{
saved_command_line =
memblock_virt_alloc(strlen(boot_command_line) + 1, 0);
initcall_command_line =
memblock_virt_alloc(strlen(boot_command_line) + 1, 0);
static_command_line = memblock_virt_alloc(strlen(command_line) + 1, 0);
strcpy(saved_command_line, boot_command_line);
strcpy(static_command_line, command_line);
}

Following is the asm dump for strcpy:

char *strcpy(char *dest, const char *src)
{
c0161408:   ff ff 84 38 addir4,r4,-1
c016140c:   ff ff 43 39 addir10,r3,-1
char *tmp = dest;

while ((*dest++ = *src++) != '\0')
c0161410:   01 00 24 8d lbzur9,1(r4)
c0161414:   00 00 a9 2f cmpdi   cr7,r9,0
c0161418:   01 00 2a 9d stbur9,1(r10)
c016141c:   f4 ff 9e 40 bne cr7,c0161410

/* nothing */;
return tmp;
}

Following are the asm dump for the working and non working binaries which
concluded that the argument for the second strcpy() is not loaded into r3 and
is getting clobbered with the return value of previous strcpy().

Not Working asm dump :

c03308d8:   38 c4 6a f8 std r3,-15304(r10)
strcpy(saved_command_line, boot_command_line);
c03308dc:   06 00 62 3c addis   r3,r2,6
c03308e0:   28 c4 63 e8 ld  r3,-15320(r3)
c03308e4:   25 0b e3 4b bl  c0161408

c03308e8:   00 00 00 60 nop
strcpy(static_command_line, command_line);
c03308ec:   78 f3 c4 7f mr  r4,r30
c03308f0:   19 0b e3 4b bl  c0161408

c03308f4:   00 00 00 60 nop

Working asm dump :

c03308d4:   38 c4 c3 fb std r30,-15304(r3)
strcpy(saved_command_line, boot_command_line);
c03308d8:   06 00 62 3c addis   r3,r2,6
c03308dc:   28 c4 63 e8 ld  r3,-15320(r3)
c03308e0:   6d 08 e3 4b bl  c016114c

c03308e4:   00 00 00 60 nop
strcpy(static_command_line, command_line);
c03308e8:   78 eb a4 7f mr  r4,r29
c03308ec:   78 f3 c3 7f mr  r3,r30
c03308f0:   5d 08 e3 4b bl  c016114c

c03308f4:   00 00 00 60 nop

The problem goes away when compiler optimization is restricted to -O1.

Reported-by: Madhavan Srinivasan 
Signed-off-by: Akshay Adiga 
---
 init/main.c | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/init/main.c b/init/main.c
index a8a58e2..4259c42 100644
--- a/init/main.c
+++ b/init/main.c
@@ -358,7 +358,13 @@ static inline void smp_prepare_cpus(unsigned int maxcpus) 
{ }
  * parsing is performed in place, and we should allow a component to
  * store reference of name/value for future reference.
  */
-static void __init setup_command_line(char *command_line)
+static void __init
+#ifdef CONFIG_PPC64
+   #if  GCC_VERSION > 50301
+   __attribute__((optimize("-O1")))
+   #endif
+#endif
+   setup_command_line(char *command_line)
 {
saved_command_line =
memblock_virt_alloc(strlen(boot_command_line) + 1, 0);
-- 
2.5.5



Re: [PATCH] Work around for enabling CONFIG_CMDLINE on ppc64le

2016-09-22 Thread Akshay Adiga

Hi Michael,

Anton found this bug and raised it against gcc v7.0 and a fix is available
 in upstream gcc.

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=71709

Currently, gcc v5.4.0  and v6.1.1 shipped with Ubuntu 16.04 and 16.10  
respectively,
 are hitting this problem.

I have also raised bug against Ubuntu for fixing gcc for 16.04.

https://bugzilla.linux.ibm.com/show_bug.cgi?id=146668


On 09/22/2016 03:51 PM, Michael Ellerman wrote:

Akshay Adiga  writes:


Observed that boot arguments (passed as CONFIG_CMDLINE)  are not being
picked up by kernel while using gcc-ppc64-linux-gnu v5.4.0 and v6.1.1.
While it works as expected with v5.3.1 .

Found that in init/main.c in  setup_command_line() the pointers passed to
strcpy() is messed up.

Hi Akshay,

Thanks for debugging this.


The problem goes away when compiler optimization is restricted to -O1.
diff --git a/init/main.c b/init/main.c
index a8a58e2..4259c42 100644
--- a/init/main.c
+++ b/init/main.c
@@ -358,7 +358,13 @@ static inline void smp_prepare_cpus(unsigned int maxcpus) 
{ }
   * parsing is performed in place, and we should allow a component to
   * store reference of name/value for future reference.
   */
-static void __init setup_command_line(char *command_line)
+static void __init
+#ifdef CONFIG_PPC64
+   #if  GCC_VERSION > 50301
+   __attribute__((optimize("-O1")))
+   #endif
+#endif
+   setup_command_line(char *command_line)
  {
saved_command_line =
memblock_virt_alloc(strlen(boot_command_line) + 1, 0);

But I can't merge that patch.

Our options are one or both of:
  - get GCC fixed and backport the fix to the compilers we care about.
  - blacklist the broken compiler versions.

Is there a GCC bug filed for this?

cheers





[PATCH v3 2/2] cpufreq: powernv: Use PMCR to verify global and local pstate

2016-11-08 Thread Akshay Adiga
As fast_switch() may get called with interrupt disable mode, we cannot
hold a mutex to update the global_pstate_info. So currently, fast_switch()
does not update the global_pstate_info and it will end up with stale data
whenever pstate is updated through fast_switch().

As the gpstate_timer can fire after fast_switch() has updated the pstates,
the timer handler cannot rely on the cached values of local and global
pstate and needs to read it from the PMCR.

Only gpstate_timer_handler() is affected by the stale cached pstate data
beacause either fast_switch() or target_index() routines will be called
for a given govenor, but gpstate_timer can fire after the governor has
changed to schedutil.

Signed-off-by: Akshay Adiga 
Reviewed-by: Gautham R. Shenoy 
Acked-by: Viresh Kumar 
---
Changes from v2 :
- Added generic macros GET_LPSTATE and GET_GPSTATE
 instead of making it specific to PMCR.
Changes from v1 :
- Corrected Commit message
- Type cast pstate values read from PMCR to type s8
- Added Macros to get local and global pstates from PMCR

 drivers/cpufreq/powernv-cpufreq.c | 36 ++--
 1 file changed, 26 insertions(+), 10 deletions(-)

diff --git a/drivers/cpufreq/powernv-cpufreq.c 
b/drivers/cpufreq/powernv-cpufreq.c
index 4a4380d..c82304b 100644
--- a/drivers/cpufreq/powernv-cpufreq.c
+++ b/drivers/cpufreq/powernv-cpufreq.c
@@ -42,6 +42,10 @@
 #define PMSR_PSAFE_ENABLE  (1UL << 30)
 #define PMSR_SPR_EM_DISABLE(1UL << 31)
 #define PMSR_MAX(x)((x >> 32) & 0xFF)
+#define LPSTATE_SHIFT  48
+#define GPSTATE_SHIFT  56
+#define GET_LPSTATE(x) (((x) >> LPSTATE_SHIFT) & 0xFF)
+#define GET_GPSTATE(x) (((x) >> GPSTATE_SHIFT) & 0xFF)
 
 #define MAX_RAMP_DOWN_TIME 5120
 /*
@@ -592,7 +596,8 @@ void gpstate_timer_handler(unsigned long data)
 {
struct cpufreq_policy *policy = (struct cpufreq_policy *)data;
struct global_pstate_info *gpstates = policy->driver_data;
-   int gpstate_idx;
+   int gpstate_idx, lpstate_idx;
+   unsigned long val;
unsigned int time_diff = jiffies_to_msecs(jiffies)
- gpstates->last_sampled_time;
struct powernv_smp_call_data freq_data;
@@ -600,21 +605,36 @@ void gpstate_timer_handler(unsigned long data)
if (!spin_trylock(&gpstates->gpstate_lock))
return;
 
+   /*
+* If PMCR was last updated was using fast_swtich then
+* We may have wrong in gpstate->last_lpstate_idx
+* value. Hence, read from PMCR to get correct data.
+*/
+   val = get_pmspr(SPRN_PMCR);
+   freq_data.gpstate_id = (s8)GET_GPSTATE(val);
+   freq_data.pstate_id = (s8)GET_LPSTATE(val);
+   if (freq_data.gpstate_id  == freq_data.pstate_id) {
+   reset_gpstates(policy);
+   spin_unlock(&gpstates->gpstate_lock);
+   return;
+   }
+
gpstates->last_sampled_time += time_diff;
gpstates->elapsed_time += time_diff;
-   freq_data.pstate_id = idx_to_pstate(gpstates->last_lpstate_idx);
 
-   if ((gpstates->last_gpstate_idx == gpstates->last_lpstate_idx) ||
-   (gpstates->elapsed_time > MAX_RAMP_DOWN_TIME)) {
+   if (gpstates->elapsed_time > MAX_RAMP_DOWN_TIME) {
gpstate_idx = pstate_to_idx(freq_data.pstate_id);
reset_gpstates(policy);
gpstates->highest_lpstate_idx = gpstate_idx;
} else {
+   lpstate_idx = pstate_to_idx(freq_data.pstate_id);
gpstate_idx = calc_global_pstate(gpstates->elapsed_time,
 gpstates->highest_lpstate_idx,
-gpstates->last_lpstate_idx);
+lpstate_idx);
}
-
+   freq_data.gpstate_id = idx_to_pstate(gpstate_idx);
+   gpstates->last_gpstate_idx = gpstate_idx;
+   gpstates->last_lpstate_idx = lpstate_idx;
/*
 * If local pstate is equal to global pstate, rampdown is over
 * So timer is not required to be queued.
@@ -622,10 +642,6 @@ void gpstate_timer_handler(unsigned long data)
if (gpstate_idx != gpstates->last_lpstate_idx)
queue_gpstate_timer(gpstates);
 
-   freq_data.gpstate_id = idx_to_pstate(gpstate_idx);
-   gpstates->last_gpstate_idx = pstate_to_idx(freq_data.gpstate_id);
-   gpstates->last_lpstate_idx = pstate_to_idx(freq_data.pstate_id);
-
spin_unlock(&gpstates->gpstate_lock);
 
/* Timer may get migrated to a different cpu on cpu hot unplug */
-- 
2.5.5



[PATCH v3 1/2] cpufreq: powernv: Adding fast_switch for schedutil

2016-11-08 Thread Akshay Adiga
Adding fast_switch which does light weight operation to set the desired
pstate. Both global and local pstates are set to the same desired pstate.

Signed-off-by: Akshay Adiga 
Reviewed-by: Gautham R. Shenoy 
Acked-by: Viresh Kumar 
---
Changes from v2 :
- No changes.
Changes from v1 :
- Removed unnecessary check for index out of bound.

 drivers/cpufreq/powernv-cpufreq.c | 20 +++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/drivers/cpufreq/powernv-cpufreq.c 
b/drivers/cpufreq/powernv-cpufreq.c
index d3ffde8..4a4380d 100644
--- a/drivers/cpufreq/powernv-cpufreq.c
+++ b/drivers/cpufreq/powernv-cpufreq.c
@@ -752,9 +752,12 @@ static int powernv_cpufreq_cpu_init(struct cpufreq_policy 
*policy)
spin_lock_init(&gpstates->gpstate_lock);
ret = cpufreq_table_validate_and_show(policy, powernv_freqs);
 
-   if (ret < 0)
+   if (ret < 0) {
kfree(policy->driver_data);
+   return ret;
+   }
 
+   policy->fast_switch_possible = true;
return ret;
 }
 
@@ -897,6 +900,20 @@ static void powernv_cpufreq_stop_cpu(struct cpufreq_policy 
*policy)
del_timer_sync(&gpstates->timer);
 }
 
+static unsigned int powernv_fast_switch(struct cpufreq_policy *policy,
+   unsigned int target_freq)
+{
+   int index;
+   struct powernv_smp_call_data freq_data;
+
+   index = cpufreq_table_find_index_dl(policy, target_freq);
+   freq_data.pstate_id = powernv_freqs[index].driver_data;
+   freq_data.gpstate_id = powernv_freqs[index].driver_data;
+   set_pstate(&freq_data);
+
+   return powernv_freqs[index].frequency;
+}
+
 static struct cpufreq_driver powernv_cpufreq_driver = {
.name   = "powernv-cpufreq",
.flags  = CPUFREQ_CONST_LOOPS,
@@ -904,6 +921,7 @@ static struct cpufreq_driver powernv_cpufreq_driver = {
.exit   = powernv_cpufreq_cpu_exit,
.verify = cpufreq_generic_frequency_table_verify,
.target_index   = powernv_cpufreq_target_index,
+   .fast_switch= powernv_fast_switch,
.get= powernv_cpufreq_get,
.stop_cpu   = powernv_cpufreq_stop_cpu,
.attr   = powernv_cpu_freq_attr,
-- 
2.5.5



Re: [PATCH v2 2/2] cpufreq: powernv: Use PMCR to verify global and local pstate

2016-11-08 Thread Akshay Adiga

Thanks gautham for the review.

Good point, I have made the macros more generic in the next version as 
you have mentioned.


I will post a separate patch to set pstates using these macros. :)

On 11/08/2016 09:10 AM, Gautham R Shenoy wrote:

On Mon, Nov 07, 2016 at 01:09:09PM +0530, Akshay Adiga wrote:

As fast_switch() may get called with interrupt disable mode, we cannot
hold a mutex to update the global_pstate_info. So currently, fast_switch()
does not update the global_pstate_info and it will end up with stale data
whenever pstate is updated through fast_switch().

As the gpstate_timer can fire after fast_switch() has updated the pstates,
the timer handler cannot rely on the cached values of local and global
pstate and needs to read it from the PMCR.

Only gpstate_timer_handler() is affected by the stale cached pstate data
beacause either fast_switch() or target_index() routines will be called
for a given govenor, but gpstate_timer can fire after the governor has
changed to schedutil.


Signed-off-by: Akshay Adiga 
---

Changes from v1 :
- Corrected Commit message
- Type cast pstate values read from PMCR to type s8
- Added Macros to get local and global pstates from PMCR

Thanks for this. Could you also send a (separate patch) to set the
local and global pstates to PMCR in set_pstate?



  drivers/cpufreq/powernv-cpufreq.c | 34 --
  1 file changed, 24 insertions(+), 10 deletions(-)

diff --git a/drivers/cpufreq/powernv-cpufreq.c 
b/drivers/cpufreq/powernv-cpufreq.c
index 4a4380d..bf4bc585 100644
--- a/drivers/cpufreq/powernv-cpufreq.c
+++ b/drivers/cpufreq/powernv-cpufreq.c
@@ -42,6 +42,8 @@
  #define PMSR_PSAFE_ENABLE (1UL << 30)
  #define PMSR_SPR_EM_DISABLE   (1UL << 31)
  #define PMSR_MAX(x)   ((x >> 32) & 0xFF)
+#define PMCR_LPSTATE(x)(((x) >> 48) & 0xFF)
+#define PMCR_GPSTATE(x)(((x) >> 56) & 0xFF)

You define:
#define LPSTATE_SHIFT48
#define GPSTATE_SHIFT56

since we can use this in the set_variants.

Moreover, the LPSTATE, GPSTATE retreival is applicable to both PMCR and PMSR. So
could you rename these functions to GET_LPSTATE, GET_GPSTATE.

Similarly, we might want to have a SET_LPSTATE, SET_GPSTATE and fix
the hard coded values that we have in set_pstate.



  #define MAX_RAMP_DOWN_TIME5120
  /*
@@ -592,7 +594,8 @@ void gpstate_timer_handler(unsigned long data)
  {
struct cpufreq_policy *policy = (struct cpufreq_policy *)data;
struct global_pstate_info *gpstates = policy->driver_data;
-   int gpstate_idx;
+   int gpstate_idx, lpstate_idx;
+   unsigned long val;
unsigned int time_diff = jiffies_to_msecs(jiffies)
- gpstates->last_sampled_time;
struct powernv_smp_call_data freq_data;
@@ -600,21 +603,36 @@ void gpstate_timer_handler(unsigned long data)
if (!spin_trylock(&gpstates->gpstate_lock))
return;

+   /*
+* If PMCR was last updated was using fast_swtich then
+* We may have wrong in gpstate->last_lpstate_idx
+* value. Hence, read from PMCR to get correct data.
+*/
+   val = get_pmspr(SPRN_PMCR);
+   freq_data.gpstate_id = (s8)PMCR_GPSTATE(val);
+   freq_data.pstate_id = (s8)PMCR_LPSTATE(val);
+   if (freq_data.gpstate_id  == freq_data.pstate_id) {
+   reset_gpstates(policy);
+   spin_unlock(&gpstates->gpstate_lock);
+   return;
+   }
+
gpstates->last_sampled_time += time_diff;
gpstates->elapsed_time += time_diff;
-   freq_data.pstate_id = idx_to_pstate(gpstates->last_lpstate_idx);

-   if ((gpstates->last_gpstate_idx == gpstates->last_lpstate_idx) ||
-   (gpstates->elapsed_time > MAX_RAMP_DOWN_TIME)) {
+   if (gpstates->elapsed_time > MAX_RAMP_DOWN_TIME) {
gpstate_idx = pstate_to_idx(freq_data.pstate_id);
reset_gpstates(policy);
gpstates->highest_lpstate_idx = gpstate_idx;
} else {
+   lpstate_idx = pstate_to_idx(freq_data.pstate_id);
gpstate_idx = calc_global_pstate(gpstates->elapsed_time,
 gpstates->highest_lpstate_idx,
-gpstates->last_lpstate_idx);
+lpstate_idx);
}
-
+   freq_data.gpstate_id = idx_to_pstate(gpstate_idx);
+   gpstates->last_gpstate_idx = gpstate_idx;
+   gpstates->last_lpstate_idx = lpstate_idx;
/*
 * If local pstate is equal to global pstate, rampdown is over
 * So timer is not required to be queued.
@@ -622,10 +640,6 @@ void gpstate_timer_handler(unsigned long data)
if (gpstate_idx != gpstates->last_lpstate_idx)
queue_gpstat

[PATCH] cpupower: Warn if values are truncated in frequency-info

2016-11-08 Thread Akshay Adiga
frequency-info currently rounds the frequnecy values to 2 decimal places
while printing "available frequency steps". Hence frequencies with 3rd
decimal place being greater than 5 will be rounded to next higher number
like 2.227GHz will be rounded to 2.23Ghz. On setting this frequency,
using "cpupower frequency-set -f " ,if the value does not match,
then it will set to the next lowest frequency greater than provided
frequency value (according to the userspace governor).

Hence adding a warning if there are any cases where the displayed
frequency cannot be used directly to set a perticular frequency, and
insist the user to use the "-n" option.

Simple usecase where it give counter intuitive results :
bash# cpupower frequency-info
...
  available frequency steps: 2.43 GHz, 2.39 GHz, 2.36 GHz, 2.33 GHz,
2.29 GHz, 2.26 GHz, 2.23 GHz, 2.19 GHz, 2.16 GHz, 2.13 GHz

bash# cpupower frequency-info -n
...
available frequency steps: 2.427000 GHz, 2.394000 GHz, 2.36 GHz,
2.327000 GHz, 2.294000 GHz, 2.261000 GHz, 2.227000 GHz, 2.194000 GHz,
2.161000 GHz, 2.128000 GHz

bash# cpupower frequency-set -f 2.23Ghz

bash# cpupower frequency-info
...
 current CPU frequency: 2.26 GHz (asserted by call to hardware)

Signed-off-by: Akshay Adiga 
---
 tools/power/cpupower/utils/cpufreq-info.c | 13 -
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/tools/power/cpupower/utils/cpufreq-info.c 
b/tools/power/cpupower/utils/cpufreq-info.c
index 590d12a..4f13b06 100644
--- a/tools/power/cpupower/utils/cpufreq-info.c
+++ b/tools/power/cpupower/utils/cpufreq-info.c
@@ -85,6 +85,7 @@ static void proc_cpufreq_output(void)
 }
 
 static int no_rounding;
+static int freq_info_truncated;
 static void print_speed(unsigned long speed)
 {
unsigned long tmp;
@@ -103,8 +104,10 @@ static void print_speed(unsigned long speed)
} else {
if (speed > 100) {
tmp = speed%1;
-   if (tmp >= 5000)
+   if (tmp >= 5000) {
+   freq_info_truncated = 1;
speed += 1;
+   }
printf("%u.%02u GHz", ((unsigned int) speed/100),
((unsigned int) (speed%100)/1));
} else if (speed > 10) {
@@ -243,6 +246,7 @@ static int get_boost_mode(unsigned int cpu)
printf(_("%.0f MHz max turbo 1 active cores\n"),
   ratio * bclk);
}
+
return 0;
 }
 
@@ -641,5 +645,12 @@ int cmd_freq_info(int argc, char **argv)
return ret;
printf("\n");
}
+
+   if (freq_info_truncated) {
+   printf("  Warning! Frequency values shown are rounded off\n");
+   printf("  To set frequency use frequency values provided\n");
+   printf("  by \"cpupower frequency-info -n \"\n");
+   }
+
return ret;
 }
-- 
2.5.5



[PATCH] cpufreq: powernv: Fix crash in gpstate_timer_handler

2016-08-04 Thread Akshay Adiga
'commit 09ca4c9b5958 ("cpufreq: powernv: Replacing pstate_id with
frequency table index")' changes calc_global_pstate() to use
cpufreq_table index instead of pstate_id.

But in gpstate_timer_handler() pstate_id was being passed instead
of cpufreq_table index, which caused the index_to_pstate() to access
out of bound indices, leading to this crash.

Adding sanity check for index and pstate, to ensure only valid pstate
and index values are returned.

Call Trace:
[c0078d66b130] [c011d224] __free_irq+0x234/0x360
(unreliable)
[c0078d66b1c0] [c011d44c] free_irq+0x6c/0xa0
[c0078d66b1f0] [c006c4f8] opal_event_shutdown+0x88/0xd0
[c0078d66b230] [c0067a4c] opal_shutdown+0x1c/0x90
[c0078d66b260] [c0063a00] pnv_shutdown+0x20/0x40
[c0078d66b280] [c0021538] machine_restart+0x38/0x90
[c78d66b310] [c0965ea0] panic+0x284/0x300
[c0078d66b3a0] [c001f508] die+0x388/0x450
[c0078d66b430] [c0045a50] bad_page_fault+0xd0/0x140
[c0078d66b4a0] [c0008964] handle_page_fault+0x2c/0x30
   interrupt: 300 at gpstate_timer_handler+0x150/0x260
LR = gpstate_timer_handler+0x130/0x260
[c0078d66b7f0] [c0132b58] call_timer_fn+0x58/0x1c0
[c0078d66b880] [c0132e20] expire_timers+0x130/0x1d0
[c0078d66b8f0] [c0133068] run_timer_softirq+0x1a8/0x230
[c0078d66b980] [c00b535c] __do_softirq+0x18c/0x400
[c0078d66ba70] [c00b5828] irq_exit+0xc8/0x100
[c0078d66ba90] [c001e214] timer_interrupt+0xa4/0xe0
[c0078d66bac0] [c00027d0] decrementer_common+0x150/0x180
   interrupt: 901 at arch_local_irq_restore+0x74/0x90
  0] [c0106b34] call_cpuidle+0x44/0x90
[c0078d66be50] [c010708c] cpu_startup_entry+0x38c/0x460
[c0078d66bf20] [c003d930] start_secondary+0x330/0x380
[c0078d66bf90] [c0008e6c] start_secondary_prolog+0x10/0x14

Fixes: 08d27eb ("cpufreq: powernv: Replacing pstate_id with
frequency table index")
Reported-by: Madhavan Srinivasan 
Signed-off-by: Akshay Adiga 
---
 drivers/cpufreq/powernv-cpufreq.c | 21 -
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/drivers/cpufreq/powernv-cpufreq.c 
b/drivers/cpufreq/powernv-cpufreq.c
index 87796e0..d3ffde8 100644
--- a/drivers/cpufreq/powernv-cpufreq.c
+++ b/drivers/cpufreq/powernv-cpufreq.c
@@ -145,11 +145,30 @@ static struct powernv_pstate_info {
 /* Use following macros for conversions between pstate_id and index */
 static inline int idx_to_pstate(unsigned int i)
 {
+   if (unlikely(i >= powernv_pstate_info.nr_pstates)) {
+   pr_warn_once("index %u is out of bound\n", i);
+   return powernv_freqs[powernv_pstate_info.nominal].driver_data;
+   }
+
return powernv_freqs[i].driver_data;
 }
 
 static inline unsigned int pstate_to_idx(int pstate)
 {
+   int min = powernv_freqs[powernv_pstate_info.min].driver_data;
+   int max = powernv_freqs[powernv_pstate_info.max].driver_data;
+
+   if (min > 0) {
+   if (unlikely((pstate < max) || (pstate > min))) {
+   pr_warn_once("pstate %d is out of bound\n", pstate);
+   return powernv_pstate_info.nominal;
+   }
+   } else {
+   if (unlikely((pstate > max) || (pstate < min))) {
+   pr_warn_once("pstate %d is out of bound\n", pstate);
+   return powernv_pstate_info.nominal;
+   }
+   }
/*
 * abs() is deliberately used so that is works with
 * both monotonically increasing and decreasing
@@ -593,7 +612,7 @@ void gpstate_timer_handler(unsigned long data)
} else {
gpstate_idx = calc_global_pstate(gpstates->elapsed_time,
 gpstates->highest_lpstate_idx,
-freq_data.pstate_id);
+gpstates->last_lpstate_idx);
}
 
/*
-- 
2.1.4



Re: [PATCH v2] cpufreq: powernv: Replacing pstate_id with frequency table index

2016-07-06 Thread Akshay Adiga



On 06/30/2016 11:53 AM, Akshay Adiga wrote:

Refactoring code to use frequency table index instead of pstate_id.
This abstraction will make the code independent of the pstate values.

- No functional changes
- The highest frequency is at frequency table index 0 and the frequency
   decreases as the index increases.
- Macros pstates_to_idx() and idx_to_pstate() can be used for conversion
   between pstate_id and index.
- powernv_pstate_info now contains frequency table index to min, max and
   nominal frequency (instead of pstate_ids)
- global_pstate_info new stores index values instead pstate ids.
- variables renamed as *_idx which now store index instead of pstate

Signed-off-by: Akshay Adiga 
Reviewed-by: Gautham R. Shenoy 
---
Changes from v1:
   - changed macro names from get_pstate()/ get_index() to
idx_to_pstate()/ pstate_to_idx()
   - Renamed variables that store index instead of pstate_id to *_idx
   - Retained previous printk's

v1 : http://marc.info/?l=linux-pm&m=146677701501225&w=1

  drivers/cpufreq/powernv-cpufreq.c | 177 ++
  1 file changed, 102 insertions(+), 75 deletions(-)

diff --git a/drivers/cpufreq/powernv-cpufreq.c 
b/drivers/cpufreq/powernv-cpufreq.c
index b29c5c2..72c91d8 100644
--- a/drivers/cpufreq/powernv-cpufreq.c
+++ b/drivers/cpufreq/powernv-cpufreq.c
@@ -64,12 +64,14 @@
  /**
   * struct global_pstate_info -Per policy data structure to maintain 
history of
   *global pstates
- * @highest_lpstate:   The local pstate from which we are ramping down
+ * @highest_lpstate_idx:   The local pstate index from which we are
+ * ramping down
   * @elapsed_time: Time in ms spent in ramping down from
- * highest_lpstate
+ * highest_lpstate_idx
   * @last_sampled_time:Time from boot in ms when global 
pstates were
   *last set
- * @last_lpstate,last_gpstate: Last set values for local and global pstates
+ * @last_lpstate_idx,  Last set value of local pstate and global
+ * last_gpstate_idxpstate in terms of cpufreq table index
   * @timer:Is used for ramping down if cpu goes idle for
   *a long time with global pstate held high
   * @gpstate_lock: A spinlock to maintain synchronization between
@@ -77,11 +79,11 @@
   *governer's target_index calls
   */
  struct global_pstate_info {
-   int highest_lpstate;
+   int highest_lpstate_idx;
unsigned int elapsed_time;
unsigned int last_sampled_time;
-   int last_lpstate;
-   int last_gpstate;
+   int last_lpstate_idx;
+   int last_gpstate_idx;
spinlock_t gpstate_lock;
struct timer_list timer;
  };
@@ -124,29 +126,47 @@ static int nr_chips;
  static DEFINE_PER_CPU(struct chip *, chip_info);
  
  /*

- * Note: The set of pstates consists of contiguous integers, the
- * smallest of which is indicated by powernv_pstate_info.min, the
- * largest of which is indicated by powernv_pstate_info.max.
+ * Note:
+ * The set of pstates consists of contiguous integers.
+ * powernv_pstate_info stores the index of the frequency table for
+ * max, min and nominal frequencies. It also stores number of
+ * available frequencies.
   *
- * The nominal pstate is the highest non-turbo pstate in this
- * platform. This is indicated by powernv_pstate_info.nominal.
+ * powernv_pstate_info.nominal indicates the index to the highest
+ * non-turbo frequency.
   */
  static struct powernv_pstate_info {
-   int min;
-   int max;
-   int nominal;
-   int nr_pstates;
+   unsigned int min;
+   unsigned int max;
+   unsigned int nominal;
+   unsigned int nr_pstates;
  } powernv_pstate_info;
  
+/* Use following macros for conversions between pstate_id and index */

+static inline int idx_to_pstate(unsigned int i)
+{
+   return powernv_freqs[i].driver_data;
+}
+
+static inline unsigned int pstate_to_idx(int pstate)
+{
+   /*
+* abs() is deliberately used so that is works with
+* both monotonically increasing and decreasing
+* pstate values
+*/
+   return abs(pstate - idx_to_pstate(powernv_pstate_info.max));
+}
+
  static inline void reset_gpstates(struct cpufreq_policy *policy)
  {
struct global_pstate_info *gpstates = policy->driver_data;
  
-	gpstates->highest_lpstate = 0;

+   gpstates->highest_lpstate_idx = 0;
gpstates->elapsed_time = 0;
gpstates->last_sampled_time = 0;
-   gpstates->last_lpstate = 0;
-   gpstates->last_gpstate = 0;
+   gpstates->last_lpstate_idx = 0;
+   gpstates->last_gpstate_idx = 0;
  }
  
  /*

@@ -156,9 +176,10 @@ static inline void reset_gpstates(struct cpufreq_policy 
*policy)
  static int init_powernv

Re: linux-next: build warning after merge of the pm tree

2016-11-14 Thread Akshay Adiga

Hi Rafeal,

Good catch, I need to set lpstate_idx = gpstate_idx.
I will send a patch which fixes this commit.

Thanks Stephen for reporting it. :)

Regards
Akshay Adiga

On 11/14/2016 05:33 AM, Rafael J. Wysocki wrote:

On Monday, November 14, 2016 10:40:09 AM Stephen Rothwell wrote:
> Hi Rafael,

Hi Stephen,

> After merging the pm tree, today's linux-next build (powerpc
> ppc64_defconfig) produced this warning:
>
> drivers/cpufreq/powernv-cpufreq.c: In function 'gpstate_timer_handler':
> drivers/cpufreq/powernv-cpufreq.c:642:5: warning: 'lpstate_idx' may
be used uninitialized in this function [-Wmaybe-uninitialized]
>   if (gpstate_idx != gpstates->last_lpstate_idx)
>  ^
>
> Introduced by commit
>
>   20b15b766354 ("cpufreq: powernv: Use PMCR to verify global and
local pstate")

Thanks for the notice!

Akshay, any thoughts?

Thanks,
Rafael





[PATCH -next] powernv: cpufreq: Fix uninitialized lpstate_idx in gpstates_timer_handler

2016-11-14 Thread Akshay Adiga
lpstate_idx remains uninitialized in the case when elapsed_time
is greater than MAX_RAMP_DOWN_TIME. At the end of rampdown
global pstate should be equal to local pstate.

Fixes: 20b15b766354 ("cpufreq: powernv: Use PMCR to verify global and
localpstate")
Reported-by: Stephen Rothwell 
Signed-off-by: Akshay Adiga 
---
 drivers/cpufreq/powernv-cpufreq.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/cpufreq/powernv-cpufreq.c 
b/drivers/cpufreq/powernv-cpufreq.c
index c82304b..c5c5bc3 100644
--- a/drivers/cpufreq/powernv-cpufreq.c
+++ b/drivers/cpufreq/powernv-cpufreq.c
@@ -624,6 +624,7 @@ void gpstate_timer_handler(unsigned long data)
 
if (gpstates->elapsed_time > MAX_RAMP_DOWN_TIME) {
gpstate_idx = pstate_to_idx(freq_data.pstate_id);
+   lpstate_idx = gpstate_idx;
reset_gpstates(policy);
gpstates->highest_lpstate_idx = gpstate_idx;
} else {
-- 
2.5.5



[PATCH] cpufreq: powernv: Replacing pstate_id with frequency table index

2016-06-24 Thread Akshay Adiga
Refactoring code to use frequency table index instead of pstate_id.
This abstraction will make the code independent of the pstate values.

- No functional changes
- The highest frequency is at frequency table index 0 and the frequency
  decreases as the index increases.
- Macros get_index() and get_pstate() can be used for conversion between
  pstate_id and index.
- powernv_pstate_info now contains frequency table index to min, max and
  nominal frequency (instead of pstate_ids)

Signed-off-by: Akshay Adiga 
---
 drivers/cpufreq/powernv-cpufreq.c | 107 ++
 1 file changed, 61 insertions(+), 46 deletions(-)

diff --git a/drivers/cpufreq/powernv-cpufreq.c 
b/drivers/cpufreq/powernv-cpufreq.c
index b29c5c2..f6ce6f0 100644
--- a/drivers/cpufreq/powernv-cpufreq.c
+++ b/drivers/cpufreq/powernv-cpufreq.c
@@ -43,6 +43,7 @@
 #define PMSR_SPR_EM_DISABLE(1UL << 31)
 #define PMSR_MAX(x)((x >> 32) & 0xFF)
 
+
 #define MAX_RAMP_DOWN_TIME 5120
 /*
  * On an idle system we want the global pstate to ramp-down from max value to
@@ -124,20 +125,29 @@ static int nr_chips;
 static DEFINE_PER_CPU(struct chip *, chip_info);
 
 /*
- * Note: The set of pstates consists of contiguous integers, the
- * smallest of which is indicated by powernv_pstate_info.min, the
- * largest of which is indicated by powernv_pstate_info.max.
+ * Note: The set of pstates consists of contiguous integers,
+ *
+ * powernv_pstate_info stores the index of the frequency table
+ *  instead of pstate itself for each of the pstates referred
  *
  * The nominal pstate is the highest non-turbo pstate in this
  * platform. This is indicated by powernv_pstate_info.nominal.
  */
 static struct powernv_pstate_info {
-   int min;
-   int max;
-   int nominal;
-   int nr_pstates;
+   unsigned int min;
+   unsigned int max;
+   unsigned int nominal;
+   unsigned int nr_pstates;
 } powernv_pstate_info;
 
+/* Use following macros for conversions between pstate_id and index */
+static inline int get_pstate(unsigned int i) {
+   return powernv_freqs[i].driver_data;
+}
+static inline unsigned int get_index(int pstate) {
+   return abs(pstate - get_pstate(powernv_pstate_info.max));
+}
+
 static inline void reset_gpstates(struct cpufreq_policy *policy)
 {
struct global_pstate_info *gpstates = policy->driver_data;
@@ -208,23 +218,28 @@ static int init_powernv_pstates(void)
return -ENODEV;
}
 
+   powernv_pstate_info.nr_pstates = nr_pstates;
pr_debug("NR PStates %d\n", nr_pstates);
for (i = 0; i < nr_pstates; i++) {
u32 id = be32_to_cpu(pstate_ids[i]);
u32 freq = be32_to_cpu(pstate_freqs[i]);
 
-   pr_debug("PState id %d freq %d MHz\n", id, freq);
powernv_freqs[i].frequency = freq * 1000; /* kHz */
powernv_freqs[i].driver_data = id;
+
+   if (id == pstate_max)
+   powernv_pstate_info.max = i;
+   else if (id == pstate_nominal)
+   powernv_pstate_info.nominal = i;
+   else if (id == pstate_min)
+   powernv_pstate_info.min = i;
}
+
+   pr_info("pstate_info: min %d nominal %d max %d\n",
+   powernv_pstate_info.min, powernv_pstate_info.nominal,
+   powernv_pstate_info.max);
/* End of list marker entry */
powernv_freqs[i].frequency = CPUFREQ_TABLE_END;
-
-   powernv_pstate_info.min = pstate_min;
-   powernv_pstate_info.max = pstate_max;
-   powernv_pstate_info.nominal = pstate_nominal;
-   powernv_pstate_info.nr_pstates = nr_pstates;
-
return 0;
 }
 
@@ -233,15 +248,15 @@ static unsigned int pstate_id_to_freq(int pstate_id)
 {
int i;
 
-   i = powernv_pstate_info.max - pstate_id;
+   i = get_index(pstate_id);
if (i >= powernv_pstate_info.nr_pstates || i < 0) {
pr_warn("PState id %d outside of PState table, "
"reporting nominal id %d instead\n",
pstate_id, powernv_pstate_info.nominal);
-   i = powernv_pstate_info.max - powernv_pstate_info.nominal;
+   i = powernv_pstate_info.nominal;
}
 
-   return powernv_freqs[i].frequency;
+   return get_pstate(i);
 }
 
 /*
@@ -252,7 +267,7 @@ static ssize_t cpuinfo_nominal_freq_show(struct 
cpufreq_policy *policy,
char *buf)
 {
return sprintf(buf, "%u\n",
-   pstate_id_to_freq(powernv_pstate_info.nominal));
+   get_pstate(powernv_pstate_info.nominal));
 }
 
 struct freq_attr cpufreq_freq_attr_cpuinfo_nominal_freq =
@@ -426,7 +441,7 @@ static void set_pstate(void *data)
  */
 static inline unsigned int get_nominal_index(void)

[PATCH v2] cpufreq: powernv: Replacing pstate_id with frequency table index

2016-06-29 Thread Akshay Adiga
Refactoring code to use frequency table index instead of pstate_id.
This abstraction will make the code independent of the pstate values.

- No functional changes
- The highest frequency is at frequency table index 0 and the frequency
  decreases as the index increases.
- Macros pstates_to_idx() and idx_to_pstate() can be used for conversion
  between pstate_id and index.
- powernv_pstate_info now contains frequency table index to min, max and
  nominal frequency (instead of pstate_ids)
- global_pstate_info new stores index values instead pstate ids.
- variables renamed as *_idx which now store index instead of pstate

Signed-off-by: Akshay Adiga 
Reviewed-by: Gautham R. Shenoy 
---
Changes from v1:
  - changed macro names from get_pstate()/ get_index() to 
idx_to_pstate()/ pstate_to_idx()
  - Renamed variables that store index instead of pstate_id to *_idx
  - Retained previous printk's 

v1 : http://marc.info/?l=linux-pm&m=146677701501225&w=1

 drivers/cpufreq/powernv-cpufreq.c | 177 ++
 1 file changed, 102 insertions(+), 75 deletions(-)

diff --git a/drivers/cpufreq/powernv-cpufreq.c 
b/drivers/cpufreq/powernv-cpufreq.c
index b29c5c2..72c91d8 100644
--- a/drivers/cpufreq/powernv-cpufreq.c
+++ b/drivers/cpufreq/powernv-cpufreq.c
@@ -64,12 +64,14 @@
 /**
  * struct global_pstate_info - Per policy data structure to maintain history of
  * global pstates
- * @highest_lpstate:   The local pstate from which we are ramping down
+ * @highest_lpstate_idx:   The local pstate index from which we are
+ * ramping down
  * @elapsed_time:  Time in ms spent in ramping down from
- * highest_lpstate
+ * highest_lpstate_idx
  * @last_sampled_time: Time from boot in ms when global pstates were
  * last set
- * @last_lpstate,last_gpstate: Last set values for local and global pstates
+ * @last_lpstate_idx,  Last set value of local pstate and global
+ * last_gpstate_idxpstate in terms of cpufreq table index
  * @timer: Is used for ramping down if cpu goes idle for
  * a long time with global pstate held high
  * @gpstate_lock:  A spinlock to maintain synchronization between
@@ -77,11 +79,11 @@
  * governer's target_index calls
  */
 struct global_pstate_info {
-   int highest_lpstate;
+   int highest_lpstate_idx;
unsigned int elapsed_time;
unsigned int last_sampled_time;
-   int last_lpstate;
-   int last_gpstate;
+   int last_lpstate_idx;
+   int last_gpstate_idx;
spinlock_t gpstate_lock;
struct timer_list timer;
 };
@@ -124,29 +126,47 @@ static int nr_chips;
 static DEFINE_PER_CPU(struct chip *, chip_info);
 
 /*
- * Note: The set of pstates consists of contiguous integers, the
- * smallest of which is indicated by powernv_pstate_info.min, the
- * largest of which is indicated by powernv_pstate_info.max.
+ * Note:
+ * The set of pstates consists of contiguous integers.
+ * powernv_pstate_info stores the index of the frequency table for
+ * max, min and nominal frequencies. It also stores number of
+ * available frequencies.
  *
- * The nominal pstate is the highest non-turbo pstate in this
- * platform. This is indicated by powernv_pstate_info.nominal.
+ * powernv_pstate_info.nominal indicates the index to the highest
+ * non-turbo frequency.
  */
 static struct powernv_pstate_info {
-   int min;
-   int max;
-   int nominal;
-   int nr_pstates;
+   unsigned int min;
+   unsigned int max;
+   unsigned int nominal;
+   unsigned int nr_pstates;
 } powernv_pstate_info;
 
+/* Use following macros for conversions between pstate_id and index */
+static inline int idx_to_pstate(unsigned int i)
+{
+   return powernv_freqs[i].driver_data;
+}
+
+static inline unsigned int pstate_to_idx(int pstate)
+{
+   /*
+* abs() is deliberately used so that is works with
+* both monotonically increasing and decreasing
+* pstate values
+*/
+   return abs(pstate - idx_to_pstate(powernv_pstate_info.max));
+}
+
 static inline void reset_gpstates(struct cpufreq_policy *policy)
 {
struct global_pstate_info *gpstates = policy->driver_data;
 
-   gpstates->highest_lpstate = 0;
+   gpstates->highest_lpstate_idx = 0;
gpstates->elapsed_time = 0;
gpstates->last_sampled_time = 0;
-   gpstates->last_lpstate = 0;
-   gpstates->last_gpstate = 0;
+   gpstates->last_lpstate_idx = 0;
+   gpstates->last_gpstate_idx = 0;
 }
 
 /*
@@ -156,9 +176,10 @@ static inline void reset_gpstates(struct cpufreq_policy 
*policy)
 static int init_powernv_pstates(void)
 {
struct device_node *power_mgt;
-   int i, pstate_min, pstate_max, 

Re: [PATCH] cpufreq: powernv: Replacing pstate_id with frequency table index

2016-06-27 Thread Akshay Adiga

Hi viresh,

My apologies. I realize that i have messed it up a quite a few places. Surely 
with the checkpatch as well. I will send a v2 with corrections.

On 06/27/2016 12:00 PM, Viresh Kumar wrote:


Hi Akshay,

Did you try running checkpatch for this?

On 24-06-16, 19:33, Akshay Adiga wrote:

diff --git a/drivers/cpufreq/powernv-cpufreq.c 
b/drivers/cpufreq/powernv-cpufreq.c
index b29c5c2..f6ce6f0 100644
--- a/drivers/cpufreq/powernv-cpufreq.c
+++ b/drivers/cpufreq/powernv-cpufreq.c
@@ -43,6 +43,7 @@
  #define PMSR_SPR_EM_DISABLE   (1UL << 31)
  #define PMSR_MAX(x)   ((x >> 32) & 0xFF)
  
+

?


  #define MAX_RAMP_DOWN_TIME5120
  /*
   * On an idle system we want the global pstate to ramp-down from max value to
@@ -124,20 +125,29 @@ static int nr_chips;
  static DEFINE_PER_CPU(struct chip *, chip_info);
  
  /*

- * Note: The set of pstates consists of contiguous integers, the
- * smallest of which is indicated by powernv_pstate_info.min, the
- * largest of which is indicated by powernv_pstate_info.max.
+ * Note: The set of pstates consists of contiguous integers,
+ *
+ * powernv_pstate_info stores the index of the frequency table
+ *  instead of pstate itself for each of the pstates referred
   *
   * The nominal pstate is the highest non-turbo pstate in this
   * platform. This is indicated by powernv_pstate_info.nominal.
   */
  static struct powernv_pstate_info {
-   int min;
-   int max;
-   int nominal;
-   int nr_pstates;
+   unsigned int min;
+   unsigned int max;
+   unsigned int nominal;
+   unsigned int nr_pstates;
  } powernv_pstate_info;
  
+/* Use following macros for conversions between pstate_id and index */

+static inline int get_pstate(unsigned int i) {

Read coding-styles please on how to write functions.


+   return powernv_freqs[i].driver_data;
+}

Add a blank line here please.


+static inline unsigned int get_index(int pstate) {
+   return abs(pstate - get_pstate(powernv_pstate_info.max));
+}
+
  static inline void reset_gpstates(struct cpufreq_policy *policy)
  {
struct global_pstate_info *gpstates = policy->driver_data;
@@ -208,23 +218,28 @@ static int init_powernv_pstates(void)
return -ENODEV;
}
  
+	powernv_pstate_info.nr_pstates = nr_pstates;

pr_debug("NR PStates %d\n", nr_pstates);
for (i = 0; i < nr_pstates; i++) {
u32 id = be32_to_cpu(pstate_ids[i]);
u32 freq = be32_to_cpu(pstate_freqs[i]);
  
-		pr_debug("PState id %d freq %d MHz\n", id, freq);

?


powernv_freqs[i].frequency = freq * 1000; /* kHz */
powernv_freqs[i].driver_data = id;

Will it be possible for Shilpa who was earlier on this to review this patch? As
we don't really have great knowledge of the internals of this driver.





Re: [PATCH] Work around for enabling CONFIG_CMDLINE on ppc64le

2016-09-27 Thread Akshay Adiga

Hi Michael,

Here is the link to the bug raised on launchpad.
https://bugs.launchpad.net/ubuntu/+source/gcc-5/+bug/1628207


On 09/23/2016 09:45 AM, Akshay Adiga wrote:

Hi Michael,

Anton found this bug and raised it against gcc v7.0 and a fix is 
available

 in upstream gcc.

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=71709

Currently, gcc v5.4.0  and v6.1.1 shipped with Ubuntu 16.04 and 16.10  
respectively,

 are hitting this problem.

I have also raised bug against Ubuntu for fixing gcc for 16.04.

https://bugzilla.linux.ibm.com/show_bug.cgi?id=146668


On 09/22/2016 03:51 PM, Michael Ellerman wrote:

Akshay Adiga  writes:


Observed that boot arguments (passed as CONFIG_CMDLINE)  are not being
picked up by kernel while using gcc-ppc64-linux-gnu v5.4.0 and v6.1.1.
While it works as expected with v5.3.1 .

Found that in init/main.c in  setup_command_line() the pointers 
passed to

strcpy() is messed up.

Hi Akshay,

Thanks for debugging this.


The problem goes away when compiler optimization is restricted to -O1.
diff --git a/init/main.c b/init/main.c
index a8a58e2..4259c42 100644
--- a/init/main.c
+++ b/init/main.c
@@ -358,7 +358,13 @@ static inline void smp_prepare_cpus(unsigned 
int maxcpus) { }

   * parsing is performed in place, and we should allow a component to
   * store reference of name/value for future reference.
   */
-static void __init setup_command_line(char *command_line)
+static void __init
+#ifdef CONFIG_PPC64
+#if  GCC_VERSION > 50301
+__attribute__((optimize("-O1")))
+#endif
+#endif
+setup_command_line(char *command_line)
  {
  saved_command_line =
  memblock_virt_alloc(strlen(boot_command_line) + 1, 0);

But I can't merge that patch.

Our options are one or both of:
  - get GCC fixed and backport the fix to the compilers we care about.
  - blacklist the broken compiler versions.

Is there a GCC bug filed for this?

cheers







Re: [PATCH-next v2 0/2] cpufreq: powernv: Fixes for Global pstate management

2016-05-09 Thread Akshay Adiga

On 05/03/2016 08:49 PM, Akshay Adiga wrote:


Fixes are based on patch https://patchwork.ozlabs.org/patch/612058/ which
is in Rafael's linux-next.

- Patch [1] fixes WARN_ON in powernv_target_index()
- Patch [2] Deleting any pending timer to saves an unnecessary irq call
  in powernv_target_index()

Akshay Adiga (2):
   cpufreq: powernv: Move smp_call_function_any() out of irq safe block
   cpufreq: powernv: del_timer_sync when global and local pstate are
 equal

  drivers/cpufreq/powernv-cpufreq.c | 11 +++
  1 file changed, 7 insertions(+), 4 deletions(-)


Does this look good ? Anything i should do ?


Regards
Akshay Adiga



[PATCH -next 2/2] cpufreq: powernv: del_timer_sync when global and local pstate are equal

2016-05-03 Thread Akshay Adiga
Deleting pending gpstates->timer for the policy when global and local pstate
are equal while executing target_index(). This saves an unnecessary
irq call.

Signed-off-by: Akshay Adiga 
---
Patch is based on Rafael's linux-next
 drivers/cpufreq/powernv-cpufreq.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/cpufreq/powernv-cpufreq.c 
b/drivers/cpufreq/powernv-cpufreq.c
index 1f0e20c..54c4536 100644
--- a/drivers/cpufreq/powernv-cpufreq.c
+++ b/drivers/cpufreq/powernv-cpufreq.c
@@ -647,6 +647,8 @@ static int powernv_cpufreq_target_index(struct 
cpufreq_policy *policy,
 */
if (gpstate_id != freq_data.pstate_id)
queue_gpstate_timer(gpstates);
+   else
+   del_timer_sync(&gpstates->timer);
 
 gpstates_done:
freq_data.gpstate_id = gpstate_id;
-- 
2.5.5



[PATCH -next 0/2] cpufreq: powernv: Fixes for Global pstate management

2016-05-03 Thread Akshay Adiga
Fixes are based on patch https://patchwork.ozlabs.org/patch/612058/ which is
in Rafael's linux-next.

- Patch [1] fixes WARN_ON in powernv_target_index()
- Patch [2] Deleting any pending timer to saves an unnecessary irq call
 in powernv_target_index()


Akshay Adiga (2):
  cpufreq: powernv: Move smp_call_function_any() out of irq safe block
  cpufreq: powernv: del_timer_sync when global and local pstate are
equal

 drivers/cpufreq/powernv-cpufreq.c | 11 +++
 1 file changed, 7 insertions(+), 4 deletions(-)

-- 
2.5.5



[PATCH -next 1/2] cpufreq: powernv: Move smp_call_function_any() out of irq safe block

2016-05-03 Thread Akshay Adiga
Fixing a WARN_ON caused by smp_call_function_any() when irq is disabled,
because of changes made in the patch
('cpufreq: powernv: Ramp-down global pstate slower than local-pstate')
https://patchwork.ozlabs.org/patch/612058/

 WARNING: CPU: 0 PID: 4 at kernel/smp.c:291
smp_call_function_single+0x170/0x180

 Call Trace:
 [c007f648f9f0] [c007f648fa90] 0xc007f648fa90 (unreliable)
 [c007f648fa30] [c01430e0] smp_call_function_any+0x170/0x1c0
 [c007f648fa90] [c07b4b00]
powernv_cpufreq_target_index+0xe0/0x250
 [c007f648fb00] [c07ac9dc]
__cpufreq_driver_target+0x20c/0x3d0
 [c007f648fbc0] [c07b1b4c] od_dbs_timer+0xcc/0x260
 [c007f648fc10] [c07b3024] dbs_work_handler+0x54/0xa0
 [c007f648fc50] [c00c49a8] process_one_work+0x1d8/0x590
 [c007f648fce0] [c00c4e08] worker_thread+0xa8/0x660
 [c007f648fd80] [c00cca88] kthread+0x108/0x130
 [c007f648fe30] [c00095e8] ret_from_kernel_thread+0x5c/0x74

Moving smp_call_function_any() out of the critical section and changing
irq safe spinlocks to normal spinlocks.

Reported-by: Abdul Haleem 
Signed-off-by: Akshay Adiga 
---
Patch is based on Rafael's linux-next
 drivers/cpufreq/powernv-cpufreq.c | 9 +
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/drivers/cpufreq/powernv-cpufreq.c 
b/drivers/cpufreq/powernv-cpufreq.c
index 144c732..1f0e20c 100644
--- a/drivers/cpufreq/powernv-cpufreq.c
+++ b/drivers/cpufreq/powernv-cpufreq.c
@@ -581,9 +581,10 @@ void gpstate_timer_handler(unsigned long data)
gpstates->last_gpstate = freq_data.gpstate_id;
gpstates->last_lpstate = freq_data.pstate_id;
 
+   spin_unlock(&gpstates->gpstate_lock);
+
/* Timer may get migrated to a different cpu on cpu hot unplug */
smp_call_function_any(policy->cpus, set_pstate, &freq_data, 1);
-   spin_unlock(&gpstates->gpstate_lock);
 }
 
 /*
@@ -596,7 +597,6 @@ static int powernv_cpufreq_target_index(struct 
cpufreq_policy *policy,
 {
struct powernv_smp_call_data freq_data;
unsigned int cur_msec, gpstate_id;
-   unsigned long flags;
struct global_pstate_info *gpstates = policy->driver_data;
 
if (unlikely(rebooting) && new_index != get_nominal_index())
@@ -607,7 +607,7 @@ static int powernv_cpufreq_target_index(struct 
cpufreq_policy *policy,
 
cur_msec = jiffies_to_msecs(get_jiffies_64());
 
-   spin_lock_irqsave(&gpstates->gpstate_lock, flags);
+   spin_lock(&gpstates->gpstate_lock);
freq_data.pstate_id = powernv_freqs[new_index].driver_data;
 
if (!gpstates->last_sampled_time) {
@@ -654,13 +654,14 @@ gpstates_done:
gpstates->last_gpstate = freq_data.gpstate_id;
gpstates->last_lpstate = freq_data.pstate_id;
 
+   spin_unlock(&gpstates->gpstate_lock);
+
/*
 * Use smp_call_function to send IPI and execute the
 * mtspr on target CPU.  We could do that without IPI
 * if current CPU is within policy->cpus (core)
 */
smp_call_function_any(policy->cpus, set_pstate, &freq_data, 1);
-   spin_unlock_irqrestore(&gpstates->gpstate_lock, flags);
return 0;
 }
 
-- 
2.5.5



Re: [PATCH -next 1/2] cpufreq: powernv: Move smp_call_function_any() out of irq safe block

2016-05-03 Thread Akshay Adiga


Hi Viresh,

On 05/03/2016 05:19 PM, Viresh Kumar wrote:


On 03-05-16, 15:10, Akshay Adiga wrote:

Fixing a WARN_ON caused by smp_call_function_any() when irq is disabled,
because of changes made in the patch
('cpufreq: powernv: Ramp-down global pstate slower than local-pstate')
https://patchwork.ozlabs.org/patch/612058/

  WARNING: CPU: 0 PID: 4 at kernel/smp.c:291
smp_call_function_single+0x170/0x180

  Call Trace:
  [c007f648f9f0] [c007f648fa90] 0xc007f648fa90 (unreliable)
  [c007f648fa30] [c01430e0] smp_call_function_any+0x170/0x1c0
  [c007f648fa90] [c07b4b00]
powernv_cpufreq_target_index+0xe0/0x250
  [c007f648fb00] [c07ac9dc]
__cpufreq_driver_target+0x20c/0x3d0
  [c007f648fbc0] [c07b1b4c] od_dbs_timer+0xcc/0x260
  [c007f648fc10] [c07b3024] dbs_work_handler+0x54/0xa0
  [c007f648fc50] [c00c49a8] process_one_work+0x1d8/0x590
  [c007f648fce0] [c00c4e08] worker_thread+0xa8/0x660
  [c007f648fd80] [c00cca88] kthread+0x108/0x130
  [c007f648fe30] [c00095e8] ret_from_kernel_thread+0x5c/0x74

Moving smp_call_function_any() out of the critical section and changing
irq safe spinlocks to normal spinlocks.

Reported-by: Abdul Haleem 
Signed-off-by: Akshay Adiga 
---
Patch is based on Rafael's linux-next
  drivers/cpufreq/powernv-cpufreq.c | 9 +
  1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/drivers/cpufreq/powernv-cpufreq.c 
b/drivers/cpufreq/powernv-cpufreq.c
index 144c732..1f0e20c 100644
--- a/drivers/cpufreq/powernv-cpufreq.c
+++ b/drivers/cpufreq/powernv-cpufreq.c
@@ -581,9 +581,10 @@ void gpstate_timer_handler(unsigned long data)
gpstates->last_gpstate = freq_data.gpstate_id;
gpstates->last_lpstate = freq_data.pstate_id;
  
+	spin_unlock(&gpstates->gpstate_lock);

+
/* Timer may get migrated to a different cpu on cpu hot unplug */
smp_call_function_any(policy->cpus, set_pstate, &freq_data, 1);
-   spin_unlock(&gpstates->gpstate_lock);
  }
  
  /*

@@ -596,7 +597,6 @@ static int powernv_cpufreq_target_index(struct 
cpufreq_policy *policy,
  {
struct powernv_smp_call_data freq_data;
unsigned int cur_msec, gpstate_id;
-   unsigned long flags;
struct global_pstate_info *gpstates = policy->driver_data;
  
  	if (unlikely(rebooting) && new_index != get_nominal_index())

@@ -607,7 +607,7 @@ static int powernv_cpufreq_target_index(struct 
cpufreq_policy *policy,
  
  	cur_msec = jiffies_to_msecs(get_jiffies_64());
  
-	spin_lock_irqsave(&gpstates->gpstate_lock, flags);

+   spin_lock(&gpstates->gpstate_lock);

You don't necessarily have to write 'what you are doing' in the commit log, but
tell us why you are doing that.

Please explain, why is this changed and why will things continue to work
without this.


Thanks for reviewing. I have tried to convey that in the first line of commit 
message,

"WARN_ON caused by smp_call_function_any() when irq is disabled,
because of changes made in the patch"

I see, i have not explained why i am changing irq safe spinlock to normal 
spinlock.
will add some explanation.



[PATCH-next v2 0/2] cpufreq: powernv: Fixes for Global pstate management

2016-05-03 Thread Akshay Adiga
Fixes are based on patch https://patchwork.ozlabs.org/patch/612058/ which
is in Rafael's linux-next.

- Patch [1] fixes WARN_ON in powernv_target_index()
- Patch [2] Deleting any pending timer to saves an unnecessary irq call
 in powernv_target_index()

Akshay Adiga (2):
  cpufreq: powernv: Move smp_call_function_any() out of irq safe block
  cpufreq: powernv: del_timer_sync when global and local pstate are
equal

 drivers/cpufreq/powernv-cpufreq.c | 11 +++
 1 file changed, 7 insertions(+), 4 deletions(-)

-- 
2.5.5



[PATCH-next v2 1/2] cpufreq: powernv: Move smp_call_function_any() out of irq safe block

2016-05-03 Thread Akshay Adiga
Fix a WARN_ON caused by smp_call_function_any() when irq is disabled,
because of changes made in the patch ('cpufreq: powernv: Ramp-down
 global pstate slower than local-pstate')
https://patchwork.ozlabs.org/patch/612058/

 WARNING: CPU: 0 PID: 4 at kernel/smp.c:291
smp_call_function_single+0x170/0x180

 Call Trace:
 [c007f648f9f0] [c007f648fa90] 0xc007f648fa90 (unreliable)
 [c007f648fa30] [c01430e0] smp_call_function_any+0x170/0x1c0
 [c007f648fa90] [c07b4b00]
powernv_cpufreq_target_index+0xe0/0x250
 [c007f648fb00] [c07ac9dc]
__cpufreq_driver_target+0x20c/0x3d0
 [c007f648fbc0] [c07b1b4c] od_dbs_timer+0xcc/0x260
 [c007f648fc10] [c07b3024] dbs_work_handler+0x54/0xa0
 [c007f648fc50] [c00c49a8] process_one_work+0x1d8/0x590
 [c007f648fce0] [c00c4e08] worker_thread+0xa8/0x660
 [c007f648fd80] [c00cca88] kthread+0x108/0x130
 [c007f648fe30] [c00095e8] ret_from_kernel_thread+0x5c/0x74

- Calling smp_call_function_any() with interrupt disabled (through
 spin_lock_irqsave) could cause a deadlock, as smp_call_function_any()
 relies on the IPI to complete. This is detected in the
 smp_call_function_any() call and hence the WARN_ON.

- As the spinlock (gpstates->lock) is only used to synchronize access of
 global_pstate_info  between timer irq handler and target_index calls. And
 the timer irq handler just try_locks() hence it would not cause a
 deadlock. Hence could do without making spinlocks irq safe.

- As the smp_call_function_any() is a blocking call and does not access
 global_pstates_info, it could reduce the critcal section by moving
 smp_call_function_any() after giving up the lock.

Reported-by: Abdul Haleem 
Signed-off-by: Akshay Adiga 
---
Patch is based on Rafael's linux-next
 drivers/cpufreq/powernv-cpufreq.c | 9 +
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/drivers/cpufreq/powernv-cpufreq.c 
b/drivers/cpufreq/powernv-cpufreq.c
index 144c732..1f0e20c 100644
--- a/drivers/cpufreq/powernv-cpufreq.c
+++ b/drivers/cpufreq/powernv-cpufreq.c
@@ -581,9 +581,10 @@ void gpstate_timer_handler(unsigned long data)
gpstates->last_gpstate = freq_data.gpstate_id;
gpstates->last_lpstate = freq_data.pstate_id;
 
+   spin_unlock(&gpstates->gpstate_lock);
+
/* Timer may get migrated to a different cpu on cpu hot unplug */
smp_call_function_any(policy->cpus, set_pstate, &freq_data, 1);
-   spin_unlock(&gpstates->gpstate_lock);
 }
 
 /*
@@ -596,7 +597,6 @@ static int powernv_cpufreq_target_index(struct 
cpufreq_policy *policy,
 {
struct powernv_smp_call_data freq_data;
unsigned int cur_msec, gpstate_id;
-   unsigned long flags;
struct global_pstate_info *gpstates = policy->driver_data;
 
if (unlikely(rebooting) && new_index != get_nominal_index())
@@ -607,7 +607,7 @@ static int powernv_cpufreq_target_index(struct 
cpufreq_policy *policy,
 
cur_msec = jiffies_to_msecs(get_jiffies_64());
 
-   spin_lock_irqsave(&gpstates->gpstate_lock, flags);
+   spin_lock(&gpstates->gpstate_lock);
freq_data.pstate_id = powernv_freqs[new_index].driver_data;
 
if (!gpstates->last_sampled_time) {
@@ -654,13 +654,14 @@ gpstates_done:
gpstates->last_gpstate = freq_data.gpstate_id;
gpstates->last_lpstate = freq_data.pstate_id;
 
+   spin_unlock(&gpstates->gpstate_lock);
+
/*
 * Use smp_call_function to send IPI and execute the
 * mtspr on target CPU.  We could do that without IPI
 * if current CPU is within policy->cpus (core)
 */
smp_call_function_any(policy->cpus, set_pstate, &freq_data, 1);
-   spin_unlock_irqrestore(&gpstates->gpstate_lock, flags);
return 0;
 }
 
-- 
2.5.5



[PATCH-next v2 2/2] cpufreq: powernv: del_timer_sync when global and local pstate are equal

2016-05-03 Thread Akshay Adiga
When global and local pstate are equal in a powernv_target_index() call,
we don't queue a timer. But we may have timer already queued for future.
This could cause the timer to fire one additional time for no use.

Signed-off-by: Akshay Adiga 
---
Patch is based on Rafael's linux-next
 drivers/cpufreq/powernv-cpufreq.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/cpufreq/powernv-cpufreq.c 
b/drivers/cpufreq/powernv-cpufreq.c
index 1f0e20c..54c4536 100644
--- a/drivers/cpufreq/powernv-cpufreq.c
+++ b/drivers/cpufreq/powernv-cpufreq.c
@@ -647,6 +647,8 @@ static int powernv_cpufreq_target_index(struct 
cpufreq_policy *policy,
 */
if (gpstate_id != freq_data.pstate_id)
queue_gpstate_timer(gpstates);
+   else
+   del_timer_sync(&gpstates->timer);
 
 gpstates_done:
freq_data.gpstate_id = gpstate_id;
-- 
2.5.5



Re: [PATCH v2 2/2] cpufreq: powernv: Ramp-down global pstate slower than local-pstate

2016-04-19 Thread Akshay Adiga

Hi Viresh,

On 04/18/2016 03:48 PM, Viresh Kumar wrote:

On 15-04-16, 11:58, Akshay Adiga wrote:

  static int powernv_cpufreq_reboot_notifier(struct notifier_block *nb,
-   unsigned long action, void *unused)
+  unsigned long action, void *unused)

Unrelated change.. better don't add such changes..


Posting out v3 with out this unrelated change.


  {
int cpu;
struct cpufreq_policy cpu_policy;
@@ -603,15 +843,18 @@ static struct notifier_block powernv_cpufreq_opal_nb = {
  static void powernv_cpufreq_stop_cpu(struct cpufreq_policy *policy)
  {
struct powernv_smp_call_data freq_data;
-
+   struct global_pstate_info *gpstates = policy->driver_data;

You removed a blank line here and I feel the code looks better with
that.


freq_data.pstate_id = powernv_pstate_info.min;
+   freq_data.gpstate_id = powernv_pstate_info.min;
smp_call_function_single(policy->cpu, set_pstate, &freq_data, 1);
+   del_timer_sync(&gpstates->timer);
  }
  
  static struct cpufreq_driver powernv_cpufreq_driver = {

.name   = "powernv-cpufreq",
.flags  = CPUFREQ_CONST_LOOPS,
.init   = powernv_cpufreq_cpu_init,
+   .exit   = powernv_cpufreq_cpu_exit,
.verify = cpufreq_generic_frequency_table_verify,
.target_index   = powernv_cpufreq_target_index,
.get= powernv_cpufreq_get,

None of the above comments are mandatory for you to fix..

Acked-by: Viresh Kumar 


Thanks for Ack  :)



[PATCH v3 1/2] cpufreq: powernv: Remove flag use-case of policy->driver_data

2016-04-19 Thread Akshay Adiga
From: Shilpasri G Bhat 

commit 1b0289848d5d ("cpufreq: powernv: Add sysfs attributes to show
throttle stats") used policy->driver_data as a flag for one-time creation
of throttle sysfs files. Instead of this use 'kernfs_find_and_get()' to
check if the attribute already exists. This is required as
policy->driver_data is used for other purposes in the later patch.

Signed-off-by: Shilpasri G Bhat 
Signed-off-by: Akshay Adiga 
Acked-by: Viresh Kumar 
---
 drivers/cpufreq/powernv-cpufreq.c | 11 +--
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/drivers/cpufreq/powernv-cpufreq.c 
b/drivers/cpufreq/powernv-cpufreq.c
index 39ac78c..e2e2219 100644
--- a/drivers/cpufreq/powernv-cpufreq.c
+++ b/drivers/cpufreq/powernv-cpufreq.c
@@ -455,13 +455,15 @@ static int powernv_cpufreq_target_index(struct 
cpufreq_policy *policy,
 static int powernv_cpufreq_cpu_init(struct cpufreq_policy *policy)
 {
int base, i;
+   struct kernfs_node *kn;
 
base = cpu_first_thread_sibling(policy->cpu);
 
for (i = 0; i < threads_per_core; i++)
cpumask_set_cpu(base + i, policy->cpus);
 
-   if (!policy->driver_data) {
+   kn = kernfs_find_and_get(policy->kobj.sd, throttle_attr_grp.name);
+   if (!kn) {
int ret;
 
ret = sysfs_create_group(&policy->kobj, &throttle_attr_grp);
@@ -470,11 +472,8 @@ static int powernv_cpufreq_cpu_init(struct cpufreq_policy 
*policy)
policy->cpu);
return ret;
}
-   /*
-* policy->driver_data is used as a flag for one-time
-* creation of throttle sysfs files.
-*/
-   policy->driver_data = policy;
+   } else {
+   kernfs_put(kn);
}
return cpufreq_table_validate_and_show(policy, powernv_freqs);
 }
-- 
2.5.5



[PATCH v3 2/2] cpufreq: powernv: Ramp-down global pstate slower than local-pstate

2016-04-19 Thread Akshay Adiga
iterations )
---
op/sOperation   with patch  without patch   %change
---
15000   Read61480.6 50261.4 22.32
15000   cleanup 215.2   293.6   -26.70
15000   update  25666.2 25163.8 2.00

25000   Read32626.2 89525.4 -63.56
25000   cleanup 292.2   263.0   11.10
25000   update  32293.4 90255.0 -64.22

35000   Read34783.0 33119.0 5.02
35000   cleanup 321.2   395.8   -18.8
35000   update  36047.0 38747.8 -6.97

4   Read38562.2 42357.4 -8.96
4   cleanup 371.8   384.6   -3.33
4   update  27861.4 41547.8 -32.94

45000   Read42271.0 88120.6 -52.03
45000   cleanup 263.6   383.0   -31.17
45000   update  29755.8 81359.0 -63.43

(test without target op/s)
47659   Read83061.4 136440.6-39.12
47659   cleanup 195.8   193.8   1.03
47659   update  73429.4 124971.8-41.24

Signed-off-by: Akshay Adiga 
Reviewed-by: Gautham R. Shenoy 
Acked-by: Viresh Kumar 
---
 drivers/cpufreq/powernv-cpufreq.c | 258 --
 1 file changed, 251 insertions(+), 7 deletions(-)

diff --git a/drivers/cpufreq/powernv-cpufreq.c 
b/drivers/cpufreq/powernv-cpufreq.c
index e2e2219..144c732 100644
--- a/drivers/cpufreq/powernv-cpufreq.c
+++ b/drivers/cpufreq/powernv-cpufreq.c
@@ -36,12 +36,56 @@
 #include 
 #include  /* Required for cpu_sibling_mask() in UP configs */
 #include 
+#include 
 
 #define POWERNV_MAX_PSTATES256
 #define PMSR_PSAFE_ENABLE  (1UL << 30)
 #define PMSR_SPR_EM_DISABLE(1UL << 31)
 #define PMSR_MAX(x)((x >> 32) & 0xFF)
 
+#define MAX_RAMP_DOWN_TIME 5120
+/*
+ * On an idle system we want the global pstate to ramp-down from max value to
+ * min over a span of ~5 secs. Also we want it to initially ramp-down slowly 
and
+ * then ramp-down rapidly later on.
+ *
+ * This gives a percentage rampdown for time elapsed in milliseconds.
+ * ramp_down_percentage = ((ms * ms) >> 18)
+ * ~= 3.8 * (sec * sec)
+ *
+ * At 0 ms ramp_down_percent = 0
+ * At 5120 ms  ramp_down_percent = 100
+ */
+#define ramp_down_percent(time)((time * time) >> 18)
+
+/* Interval after which the timer is queued to bring down global pstate */
+#define GPSTATE_TIMER_INTERVAL 2000
+
+/**
+ * struct global_pstate_info - Per policy data structure to maintain history of
+ * global pstates
+ * @highest_lpstate:   The local pstate from which we are ramping down
+ * @elapsed_time:  Time in ms spent in ramping down from
+ * highest_lpstate
+ * @last_sampled_time: Time from boot in ms when global pstates were
+ * last set
+ * @last_lpstate,last_gpstate: Last set values for local and global pstates
+ * @timer: Is used for ramping down if cpu goes idle for
+ * a long time with global pstate held high
+ * @gpstate_lock:  A spinlock to maintain synchronization between
+ * routines called by the timer handler and
+ * governer's target_index calls
+ */
+struct global_pstate_info {
+   int highest_lpstate;
+   unsigned int elapsed_time;
+   unsigned int last_sampled_time;
+   int last_lpstate;
+   int last_gpstate;
+   spinlock_t gpstate_lock;
+   struct timer_list timer;
+};
+
 static struct cpufreq_frequency_table powernv_freqs[POWERNV_MAX_PSTATES+1];
 static bool rebooting, throttled, occ_reset;
 
@@ -94,6 +138,17 @@ static struct powernv_pstate_info {
int nr_pstates;
 } powernv_pstate_info;
 
+static inline void reset_gpstates(struct cpufreq_policy *policy)
+{
+   struct global_pstate_info *gpstates = policy->driver_data;
+
+   gpstates->highest_lpstate = 0;
+   gpstates->elapsed_time = 0;
+   gpstates->last_sampled_time = 0;
+   gpstates->last_lpstate = 0;
+   gpstates->last_gpstate = 0;
+}
+
 /*
  * Initialize the freq table based on data obtained
  * from the firmware passed via device-tree
@@ -285,6 +340,7 @@ static inline void set_pmspr(unsigned long sprn, unsigned 
long val)
 struct powernv_smp_call_data {
unsigned int freq;
int pstate_id;
+   int gpstate_id;
 };
 
 /*
@@ -343,19 +399,21 @@ static unsigned int powernv_cpufreq_get(unsigned int cpu)
  * (struct powernv_smp_call_data *) and the pstate_id which needs to be set
  * on th

[PATCH v3 0/2] cpufreq: powernv: Ramp-down global pstate slower than local-pstate

2016-04-19 Thread Akshay Adiga
The frequency transition latency from pmin to pmax is observed to be in few
millisecond granurality. And it usually happens to take a performance penalty
during sudden frequency rampup requests.

This patch set solves this problem by using a chip-level entity called "global
pstates". Global pstate manages elements across other dependent core chiplets.
Typically, the element that needs to be managed is the voltage setting.
So by holding global pstates higher than local pstate for some amount of time
( ~5 seconds) the subsequent rampups could be made faster.

(1/2) patch removes the flag from cpufreq_policy->driver_data, so that it can
be used for tracking global pstates.

(2/2) patch adds code for global pstate management.
- The iozone results with this patchset, shows improvements in almost all cases.
- YCSB workload on redis with various  target operations per second shows 
better MaxLatency with this patch.

Changes from v1:
- Fixed coding style
- Added a routine to reset global_pstate_info instead of hacky memset
- Handled case where cpufreq_table_validate_and_show() fails
- changed int queue_gpstate_timer() to void queue_gpstate_timer()

Changes from v2:
- dropped the unreated change. 

Akshay Adiga (1):
  cpufreq: powernv: Ramp-down global pstate slower than local-pstate

Shilpasri G Bhat (1):
  cpufreq: powernv: Remove flag use-case of policy->driver_data

 drivers/cpufreq/powernv-cpufreq.c | 269 --
 1 file changed, 256 insertions(+), 13 deletions(-)

-- 
2.5.5



[PATCH v2 2/2] cpufreq: powernv: Ramp-down global pstate slower than local-pstate

2016-04-14 Thread Akshay Adiga
iterations )
---
op/sOperation   with patch  without patch   %change
---
15000   Read61480.6 50261.4 22.32
15000   cleanup 215.2   293.6   -26.70
15000   update  25666.2 25163.8 2.00

25000   Read32626.2 89525.4 -63.56
25000   cleanup 292.2   263.0   11.10
25000   update  32293.4 90255.0 -64.22

35000   Read34783.0 33119.0 5.02
35000   cleanup 321.2   395.8   -18.8
35000   update  36047.0 38747.8 -6.97

4   Read38562.2 42357.4 -8.96
4   cleanup 371.8   384.6   -3.33
4   update  27861.4 41547.8 -32.94

45000   Read42271.0 88120.6 -52.03
45000   cleanup 263.6   383.0   -31.17
45000   update  29755.8 81359.0 -63.43

(test without target op/s)
47659   Read83061.4 136440.6-39.12
47659   cleanup 195.8   193.8   1.03
47659   update  73429.4 124971.8-41.24

Signed-off-by: Akshay Adiga 
Reviewed-by: Gautham R. Shenoy 
---
 drivers/cpufreq/powernv-cpufreq.c | 261 --
 1 file changed, 252 insertions(+), 9 deletions(-)

diff --git a/drivers/cpufreq/powernv-cpufreq.c 
b/drivers/cpufreq/powernv-cpufreq.c
index e2e2219..78388c0 100644
--- a/drivers/cpufreq/powernv-cpufreq.c
+++ b/drivers/cpufreq/powernv-cpufreq.c
@@ -36,12 +36,56 @@
 #include 
 #include  /* Required for cpu_sibling_mask() in UP configs */
 #include 
+#include 
 
 #define POWERNV_MAX_PSTATES256
 #define PMSR_PSAFE_ENABLE  (1UL << 30)
 #define PMSR_SPR_EM_DISABLE(1UL << 31)
 #define PMSR_MAX(x)((x >> 32) & 0xFF)
 
+#define MAX_RAMP_DOWN_TIME 5120
+/*
+ * On an idle system we want the global pstate to ramp-down from max value to
+ * min over a span of ~5 secs. Also we want it to initially ramp-down slowly 
and
+ * then ramp-down rapidly later on.
+ *
+ * This gives a percentage rampdown for time elapsed in milliseconds.
+ * ramp_down_percentage = ((ms * ms) >> 18)
+ * ~= 3.8 * (sec * sec)
+ *
+ * At 0 ms ramp_down_percent = 0
+ * At 5120 ms  ramp_down_percent = 100
+ */
+#define ramp_down_percent(time)((time * time) >> 18)
+
+/* Interval after which the timer is queued to bring down global pstate */
+#define GPSTATE_TIMER_INTERVAL 2000
+
+/**
+ * struct global_pstate_info - Per policy data structure to maintain history of
+ * global pstates
+ * @highest_lpstate:   The local pstate from which we are ramping down
+ * @elapsed_time:  Time in ms spent in ramping down from
+ * highest_lpstate
+ * @last_sampled_time: Time from boot in ms when global pstates were
+ * last set
+ * @last_lpstate,last_gpstate: Last set values for local and global pstates
+ * @timer: Is used for ramping down if cpu goes idle for
+ * a long time with global pstate held high
+ * @gpstate_lock:  A spinlock to maintain synchronization between
+ * routines called by the timer handler and
+ * governer's target_index calls
+ */
+struct global_pstate_info {
+   int highest_lpstate;
+   unsigned int elapsed_time;
+   unsigned int last_sampled_time;
+   int last_lpstate;
+   int last_gpstate;
+   spinlock_t gpstate_lock;
+   struct timer_list timer;
+};
+
 static struct cpufreq_frequency_table powernv_freqs[POWERNV_MAX_PSTATES+1];
 static bool rebooting, throttled, occ_reset;
 
@@ -94,6 +138,17 @@ static struct powernv_pstate_info {
int nr_pstates;
 } powernv_pstate_info;
 
+static inline void reset_gpstates(struct cpufreq_policy *policy)
+{
+   struct global_pstate_info *gpstates = policy->driver_data;
+
+   gpstates->highest_lpstate = 0;
+   gpstates->elapsed_time = 0;
+   gpstates->last_sampled_time = 0;
+   gpstates->last_lpstate = 0;
+   gpstates->last_gpstate = 0;
+}
+
 /*
  * Initialize the freq table based on data obtained
  * from the firmware passed via device-tree
@@ -285,6 +340,7 @@ static inline void set_pmspr(unsigned long sprn, unsigned 
long val)
 struct powernv_smp_call_data {
unsigned int freq;
int pstate_id;
+   int gpstate_id;
 };
 
 /*
@@ -343,19 +399,21 @@ static unsigned int powernv_cpufreq_get(unsigned int cpu)
  * (struct powernv_smp_call_data *) and the pstate_id which needs to be set
  * on this CPU should be present in

[PATCH v2 0/2] cpufreq: powernv: Ramp-down global pstate slower than local-pstate

2016-04-14 Thread Akshay Adiga
The frequency transition latency from pmin to pmax is observed to be in few
millisecond granurality. And it usually happens to take a performance penalty
during sudden frequency rampup requests.

This patch set solves this problem by using a chip-level entity called "global
 pstates". Global pstate manages elements across other dependent core chiplets.
Typically, the element that needs to be managed is the voltage setting.
So by holding global pstates higher than local pstate for some amount of time
( ~5 seconds) the subsequent rampups could be made faster.

(1/2) patch removes the flag from cpufreq_policy->driver_data, so that it can
be used for tracking global pstates.

(2/2) patch adds code for global pstate management.

- The iozone results with this patchset, shows improvements in almost all cases.
- YCSB workload on redis with various  target operations per second shows 
better MaxLatency with this patch.

Changes from v1:
- Fixed coding style
- Added a routine to reset global_pstate_info instead of hacky memset
- Handled case when cpufreq_table_validate_and_show() fails
- changed int queue_gpstate_timer() to void queue_gpstate_timer()


Akshay Adiga (1):
  cpufreq: powernv: Ramp-down global pstate slower than local-pstate

Shilpasri G Bhat (1):
  cpufreq: powernv: Remove flag use-case of policy->driver_data

 drivers/cpufreq/powernv-cpufreq.c | 272 +++---
 1 file changed, 257 insertions(+), 15 deletions(-)

-- 
2.5.5



[PATCH v2 1/2] cpufreq: powernv: Remove flag use-case of policy->driver_data

2016-04-14 Thread Akshay Adiga
From: Shilpasri G Bhat 

commit 1b0289848d5d ("cpufreq: powernv: Add sysfs attributes to show
throttle stats") used policy->driver_data as a flag for one-time creation
of throttle sysfs files. Instead of this use 'kernfs_find_and_get()' to
check if the attribute already exists. This is required as
policy->driver_data is used for other purposes in the later patch.

Signed-off-by: Shilpasri G Bhat 
Signed-off-by: Akshay Adiga 
---
 drivers/cpufreq/powernv-cpufreq.c | 11 +--
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/drivers/cpufreq/powernv-cpufreq.c 
b/drivers/cpufreq/powernv-cpufreq.c
index 39ac78c..e2e2219 100644
--- a/drivers/cpufreq/powernv-cpufreq.c
+++ b/drivers/cpufreq/powernv-cpufreq.c
@@ -455,13 +455,15 @@ static int powernv_cpufreq_target_index(struct 
cpufreq_policy *policy,
 static int powernv_cpufreq_cpu_init(struct cpufreq_policy *policy)
 {
int base, i;
+   struct kernfs_node *kn;
 
base = cpu_first_thread_sibling(policy->cpu);
 
for (i = 0; i < threads_per_core; i++)
cpumask_set_cpu(base + i, policy->cpus);
 
-   if (!policy->driver_data) {
+   kn = kernfs_find_and_get(policy->kobj.sd, throttle_attr_grp.name);
+   if (!kn) {
int ret;
 
ret = sysfs_create_group(&policy->kobj, &throttle_attr_grp);
@@ -470,11 +472,8 @@ static int powernv_cpufreq_cpu_init(struct cpufreq_policy 
*policy)
policy->cpu);
return ret;
}
-   /*
-* policy->driver_data is used as a flag for one-time
-* creation of throttle sysfs files.
-*/
-   policy->driver_data = policy;
+   } else {
+   kernfs_put(kn);
}
return cpufreq_table_validate_and_show(policy, powernv_freqs);
 }
-- 
2.5.5



Re: [PATCH] cpuidle/powernv : Restore different PSSCR for idle and hotplug

2018-02-28 Thread Akshay Adiga
On Mon, Feb 26, 2018 at 03:47:12PM +1100, Stewart Smith wrote:
> Akshay Adiga  writes:
> > commit 1e1601b38e6e ("powerpc/powernv/idle: Restore SPRs for deep idle
> > states via stop API.") uses stop-api provided by the firmware to restore
> > PSSCR. PSSCR restore is required for handling special wakeup. When special
> > wakeup is completed, the core enters stop state based on restored PSSCR.
> >
> > Currently PSSCR is restored to deepest available stop state, causing
> > a idle cpu to enter deeper stop state on a special wakeup, which causes
> > the cpu to hang on wakeup.
> >
> > A "sensors" command which reads temperature (through DTS sensors) on idle
> > cpu can trigger special wakeup.
> >
> > Failed Scenario :
> > Request restore of PSSCR with RL = 11
> > cpu enters idle state (stop5)
> >   user triggers "sensors" command
> >Assert special wakeup on cpu
> >  Restores PSSCR with RL = 11  < Done by firmware
> >   Read DTS sensor
> >Deassert special wakeup
> >   cpu enters idle state (stop11) <-- Instead of stop5
> >
> > Cpu hang is caused because cpu ended up in a deeper state than it requested
> >
> > This patch fixes instability caused by special wakeup when stop11 is
> > enabled. Requests restore of PSSCR to deepest stop state used by cpuidle.
> > Only when offlining cpu, request restore of PSSCR to deepest stop state.
> > On onlining cpu, request restore of PSSCR to deepest stop state used by
> > cpuidle.
> >
> > Fixes : 1e1601b38e6e ("powerpc/powernv/idle: Restore SPRs for deep idle
> > states via stop API.")
> 
> This should CC stable ?
> 
> We'll need this to enable stop11 in firmware and not break things, right?

Yes I will resend and CC it to stable.
> 
> -- 
> Stewart Smith
> OPAL Architect, IBM.
> 



[RESEND][PATCH] cpuidle/powernv : Restore different PSSCR for idle and hotplug

2018-02-28 Thread Akshay Adiga
commit 1e1601b38e6e ("powerpc/powernv/idle: Restore SPRs for deep idle
states via stop API.") uses stop-api provided by the firmware to restore
PSSCR. PSSCR restore is required for handling special wakeup. When special
wakeup is completed, the core enters stop state based on restored PSSCR.

Currently PSSCR is restored to deepest available stop state, causing
a idle cpu to enter deeper stop state on a special wakeup, which causes
the cpu to hang on wakeup.

A "sensors" command which reads temperature (through DTS sensors) on idle
cpu can trigger special wakeup.

Failed Scenario :
Request restore of PSSCR with RL = 11
cpu enters idle state (stop5)
  user triggers "sensors" command
   Assert special wakeup on cpu
 Restores PSSCR with RL = 11  < Done by firmware
  Read DTS sensor
   Deassert special wakeup
  cpu enters idle state (stop11) <-- Instead of stop5

Cpu hang is caused because cpu ended up in a deeper state than it requested

This patch fixes instability caused by special wakeup when stop11 is
enabled. Requests restore of PSSCR to deepest stop state used by cpuidle.
Only when offlining cpu, request restore of PSSCR to deepest stop state.
On onlining cpu, request restore of PSSCR to deepest stop state used by
cpuidle.

Cc:  # v4.14+
Fixes : 1e1601b38e6e ("powerpc/powernv/idle: Restore SPRs for deep idle
states via stop API.")
Reported-by: Pridhiviraj Paidipeddi 
Signed-off-by: Akshay Adiga 
---
 arch/powerpc/include/asm/cpuidle.h|  2 ++
 arch/powerpc/platforms/powernv/idle.c | 46 ---
 drivers/cpuidle/cpuidle-powernv.c |  1 -
 3 files changed, 45 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/include/asm/cpuidle.h 
b/arch/powerpc/include/asm/cpuidle.h
index e210a83..f52e9f1 100644
--- a/arch/powerpc/include/asm/cpuidle.h
+++ b/arch/powerpc/include/asm/cpuidle.h
@@ -67,6 +67,8 @@
 #define ERR_EC_ESL_MISMATCH-1
 #define ERR_DEEP_STATE_ESL_MISMATCH-2
 
+#define POWERNV_THRESHOLD_LATENCY_NS 20
+
 #ifndef __ASSEMBLY__
 /* Additional SPRs that need to be saved/restored during stop */
 struct stop_sprs {
diff --git a/arch/powerpc/platforms/powernv/idle.c 
b/arch/powerpc/platforms/powernv/idle.c
index 443d5ca..4b0c7d24 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -56,8 +56,11 @@ u64 pnv_first_deep_stop_state = MAX_STOP_STATE;
  */
 static u64 pnv_deepest_stop_psscr_val;
 static u64 pnv_deepest_stop_psscr_mask;
+static u64 pnv_deepest_cpuidle_psscr_val;
+static u64 pnv_deepest_cpuidle_psscr_mask;
 static u64 pnv_deepest_stop_flag;
 static bool deepest_stop_found;
+static bool deepest_cpuidle_found;
 
 static int pnv_save_sprs_for_deep_states(void)
 {
@@ -76,7 +79,14 @@ static int pnv_save_sprs_for_deep_states(void)
uint64_t hid5_val = mfspr(SPRN_HID5);
uint64_t hmeer_val = mfspr(SPRN_HMEER);
uint64_t msr_val = MSR_IDLE;
-   uint64_t psscr_val = pnv_deepest_stop_psscr_val;
+
+   /*
+* Pick deepest cpuidle psscr as the value to be
+* restored through wakeup engine.
+* We will request a deeper state to be restored
+* in hotplug path
+*/
+   uint64_t psscr_val = pnv_deepest_cpuidle_psscr_val;
 
for_each_possible_cpu(cpu) {
uint64_t pir = get_hard_smp_processor_id(cpu);
@@ -409,7 +419,7 @@ static void pnv_program_cpu_hotplug_lpcr(unsigned int cpu, 
u64 lpcr_val)
  */
 unsigned long pnv_cpu_offline(unsigned int cpu)
 {
-   unsigned long srr1;
+   u64 srr1;
u32 idle_states = pnv_get_supported_cpuidle_states();
u64 lpcr_val;
 
@@ -429,12 +439,18 @@ unsigned long pnv_cpu_offline(unsigned int cpu)
__ppc64_runlatch_off();
 
if (cpu_has_feature(CPU_FTR_ARCH_300) && deepest_stop_found) {
-   unsigned long psscr;
+   u64 psscr;
+   u64 pir = get_hard_smp_processor_id(cpu);
 
psscr = mfspr(SPRN_PSSCR);
psscr = (psscr & ~pnv_deepest_stop_psscr_mask) |
pnv_deepest_stop_psscr_val;
+   if (pnv_deepest_stop_psscr_val != pnv_deepest_cpuidle_psscr_val)
+   opal_slw_set_reg(pir, P9_STOP_SPR_PSSCR, psscr);
srr1 = power9_idle_stop(psscr);
+   psscr = (psscr & ~pnv_deepest_cpuidle_psscr_mask) |
+   pnv_deepest_cpuidle_psscr_val;
+   opal_slw_set_reg(pir, P9_STOP_SPR_PSSCR, psscr);
 
} else if ((idle_states & OPAL_PM_WINKLE_ENABLED) &&
   (idle_states & OPAL_PM_LOSE_FULL_CONTEXT)) {
@@ -555,6 +571,7 @@ static int __init pnv_power9_idle_init(struct device_node 
*np, u32 *flags,
u64 *psscr_val = NULL;
u64 *psscr_mask = NULL;
u32 *residency_ns = NULL;
+   u32 *latency_ns = NULL;
u64 max_re

[PATCH] powernv:idle: Clear r12 on wakeup from stop lite

2017-06-27 Thread Akshay Adiga
pnv_wakeup_noloss expects R12 to contain SRR1 value to determine if
the wakeup reason is an HMI in CHECK_HMI_INTERRUPT.

When we wakeup with ESL=0, SRR1 will not contain the wakeup reason, so
there is no point setting R12 to SRR1.

However, we don't set R12 at all and R12 contains garbage, and still
being used to check HMI assuming that it had SRR1. causing the
OPAL msglog to be filled with the following print:
HMI: Received HMI interrupt: HMER = 0x0040

This patch clears R12 after waking up from stop with ESL=EC=0, so that
we don't accidentally enter the HMI handler in pnv_wakeup_noloss if
the R12[42:45] corresponds to HMI as wakeup reason.

Bug existed prior to "commit 9d29250136f6 ("powerpc/64s/idle: Avoid SRR
usage in idle sleep/wake paths")  but was never hit in practice

Signed-off-by: Akshay Adiga 
Fixes: 9d29250136f6 ("powerpc/64s/idle: Avoid SRR usage in idle
sleep/wake paths")
---
 arch/powerpc/kernel/idle_book3s.S | 15 +++
 1 file changed, 15 insertions(+)

diff --git a/arch/powerpc/kernel/idle_book3s.S 
b/arch/powerpc/kernel/idle_book3s.S
index 1ea14b9..34794fd 100644
--- a/arch/powerpc/kernel/idle_book3s.S
+++ b/arch/powerpc/kernel/idle_book3s.S
@@ -256,6 +256,21 @@ power_enter_stop:
bne  .Lhandle_esl_ec_set
IDLE_STATE_ENTER_SEQ(PPC_STOP)
li  r3,0  /* Since we didn't lose state, return 0 */
+   /*
+* pnv_wakeup_noloss expects R12 to contain SRR1 value
+* to determine if the wakeup reason is an HMI in
+* CHECK_HMI_INTERRUPT.
+*
+* However, when we wakeup with ESL=0,
+* SRR1 will not contain the wakeup reason,
+* so there is no point setting R12 to SRR1.
+*
+* Further, we clear R12 here, so that we
+* don't accidentally enter the HMI
+* in pnv_wakeup_noloss if the
+* R12[42:45] == WAKE_HMI.
+*/
+   li  r12, 0
b   pnv_wakeup_noloss
 
 .Lhandle_esl_ec_set:
-- 
2.5.5



Re: [PATCH] powerpc/powernv: Clear LPCR[PECE1] via stop-api only for deep state offline

2017-09-19 Thread Akshay Adiga

Hi Michael,

Any comments on this patch ?

On 09/06/2017 02:32 PM, pavrampu wrote:

On 2017-08-31 17:17, Gautham R. Shenoy wrote:
> From: "Gautham R. Shenoy" 
>
> commit 24be85a23d1f ("powerpc/powernv: Clear PECE1 in LPCR via
> stop-api only on Hotplug") clears the PECE1 bit of the LPCR via
> stop-api during CPU-Hotplug to prevent wakeup due to a decrementer on
> an offlined CPU which is in a deep stop state.
>
> In the case where the stop-api support is found to be lacking, the
> commit 785a12afdb4a ("powerpc/powernv/idle: Disable LOSE_FULL_CONTEXT
> states when stop-api fails") disables deep states that lose hypervisor
> context. Thus in this case, the offlined CPU will be put to some
> shallow idle state.
>
> However, we currently unconditionally clear the PECE1 in LPCR via
> stop-api during CPU-Hotplug even when deep states are disabled due to
> stop-api failure.
>
> Fix this by clearing PECE1 of LPCR via stop-api during CPU-Hotplug
> *only* when the offlined CPU will be put to a deep state that loses
> hypervisor context.
>
> Fixes: commit 24be85a23d1f ("powerpc/powernv: Clear PECE1 in LPCR via
> stop-api only on Hotplug")
>
> Reported-by: Pavithra Prakash 
> Signed-off-by: Gautham R. Shenoy 

Tested-by: Pavithra Prakash 

> ---
>  arch/powerpc/platforms/powernv/idle.c | 8 +++-
>  1 file changed, 7 insertions(+), 1 deletion(-)
>
> diff --git a/arch/powerpc/platforms/powernv/idle.c
> b/arch/powerpc/platforms/powernv/idle.c
> index 9f59041..23f8fba 100644
> --- a/arch/powerpc/platforms/powernv/idle.c
> +++ b/arch/powerpc/platforms/powernv/idle.c
> @@ -393,7 +393,13 @@ static void pnv_program_cpu_hotplug_lpcr(unsigned
> int cpu, u64 lpcr_val)
>  u64 pir = get_hard_smp_processor_id(cpu);
>
>  mtspr(SPRN_LPCR, lpcr_val);
> -opal_slw_set_reg(pir, SPRN_LPCR, lpcr_val);
> +
> +/*
> + * Program the LPCR via stop-api only for deepest stop state
> + * can lose hypervisor context.
> + */
> +if (supported_cpuidle_states & OPAL_PM_LOSE_FULL_CONTEXT)
> +opal_slw_set_reg(pir, SPRN_LPCR, lpcr_val);
>  }
>
>  /*




Re: [PATCH] powerpc/powernv: Clear LPCR[PECE1] via stop-api only for deep state offline

2017-09-01 Thread Akshay Adiga


On 08/31/2017 05:37 PM, Nicholas Piggin wrote:

On Thu, 31 Aug 2017 17:17:41 +0530
"Gautham R. Shenoy"  wrote:

> From: "Gautham R. Shenoy" 
>
> commit 24be85a23d1f ("powerpc/powernv: Clear PECE1 in LPCR via
> stop-api only on Hotplug") clears the PECE1 bit of the LPCR via
> stop-api during CPU-Hotplug to prevent wakeup due to a decrementer on
> an offlined CPU which is in a deep stop state.
>
> In the case where the stop-api support is found to be lacking, the
> commit 785a12afdb4a ("powerpc/powernv/idle: Disable LOSE_FULL_CONTEXT
> states when stop-api fails") disables deep states that lose hypervisor
> context. Thus in this case, the offlined CPU will be put to some
> shallow idle state.
>
> However, we currently unconditionally clear the PECE1 in LPCR via
> stop-api during CPU-Hotplug even when deep states are disabled due to
> stop-api failure.
>
> Fix this by clearing PECE1 of LPCR via stop-api during CPU-Hotplug
> *only* when the offlined CPU will be put to a deep state that loses
> hypervisor context.

This looks okay to me. The bug is due to calling opal_slw_set_reg when
firmware has not enabled that feature, right?

Yes,

In the case where the stop-api support is found to be lacking, the 
commit 785a12afdb4a ("powerpc/powernv/idle: Disable LOSE_FULL_CONTEXT 
states when stop-api fails") disables deep states that lose hypervisor 
context. Thus in this case, the offlined CPU will be put to some shallow 
idle state.


If a shallow state ( < stop4 ) is being chosen for cpu hotplug, then :
1) this opal call is not required.
2) may not be supported.

Hence  should call opal_slw_set_reg() only if a deep state chosen for 
cpu hotplug.


>
> Fixes: commit 24be85a23d1f ("powerpc/powernv: Clear PECE1 in LPCR via
> stop-api only on Hotplug")
>
> Reported-by: Pavithra Prakash 
> Signed-off-by: Gautham R. Shenoy 
> ---
>  arch/powerpc/platforms/powernv/idle.c | 8 +++-
>  1 file changed, 7 insertions(+), 1 deletion(-)
>
> diff --git a/arch/powerpc/platforms/powernv/idle.c
b/arch/powerpc/platforms/powernv/idle.c
> index 9f59041..23f8fba 100644
> --- a/arch/powerpc/platforms/powernv/idle.c
> +++ b/arch/powerpc/platforms/powernv/idle.c
> @@ -393,7 +393,13 @@ static void
pnv_program_cpu_hotplug_lpcr(unsigned int cpu, u64 lpcr_val)
>  u64 pir = get_hard_smp_processor_id(cpu);
>
>  mtspr(SPRN_LPCR, lpcr_val);
> -opal_slw_set_reg(pir, SPRN_LPCR, lpcr_val);
> +
> +/*
> + * Program the LPCR via stop-api only for deepest stop state
> + * can lose hypervisor context.
> + */
> +if (supported_cpuidle_states & OPAL_PM_LOSE_FULL_CONTEXT)
> +opal_slw_set_reg(pir, SPRN_LPCR, lpcr_val);
>  }
>
>  /*





[PATCH] cpuidle/powernv : Restore different PSSCR for idle and hotplug

2018-02-19 Thread Akshay Adiga
commit 1e1601b38e6e ("powerpc/powernv/idle: Restore SPRs for deep idle
states via stop API.") uses stop-api provided by the firmware to restore
PSSCR. PSSCR restore is required for handling special wakeup. When special
wakeup is completed, the core enters stop state based on restored PSSCR.

Currently PSSCR is restored to deepest available stop state, causing
a idle cpu to enter deeper stop state on a special wakeup, which causes
the cpu to hang on wakeup.

A "sensors" command which reads temperature (through DTS sensors) on idle
cpu can trigger special wakeup.

Failed Scenario :
Request restore of PSSCR with RL = 11
cpu enters idle state (stop5)
  user triggers "sensors" command
   Assert special wakeup on cpu
 Restores PSSCR with RL = 11  < Done by firmware
  Read DTS sensor
   Deassert special wakeup
  cpu enters idle state (stop11) <-- Instead of stop5

Cpu hang is caused because cpu ended up in a deeper state than it requested

This patch fixes instability caused by special wakeup when stop11 is
enabled. Requests restore of PSSCR to deepest stop state used by cpuidle.
Only when offlining cpu, request restore of PSSCR to deepest stop state.
On onlining cpu, request restore of PSSCR to deepest stop state used by
cpuidle.

Fixes : 1e1601b38e6e ("powerpc/powernv/idle: Restore SPRs for deep idle
states via stop API.")
Reported-by: Pridhiviraj Paidipeddi 
Signed-off-by: Akshay Adiga 
---
 arch/powerpc/include/asm/cpuidle.h|  2 ++
 arch/powerpc/platforms/powernv/idle.c | 46 ---
 drivers/cpuidle/cpuidle-powernv.c |  1 -
 3 files changed, 45 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/include/asm/cpuidle.h 
b/arch/powerpc/include/asm/cpuidle.h
index e210a83..f52e9f1 100644
--- a/arch/powerpc/include/asm/cpuidle.h
+++ b/arch/powerpc/include/asm/cpuidle.h
@@ -67,6 +67,8 @@
 #define ERR_EC_ESL_MISMATCH-1
 #define ERR_DEEP_STATE_ESL_MISMATCH-2
 
+#define POWERNV_THRESHOLD_LATENCY_NS 20
+
 #ifndef __ASSEMBLY__
 /* Additional SPRs that need to be saved/restored during stop */
 struct stop_sprs {
diff --git a/arch/powerpc/platforms/powernv/idle.c 
b/arch/powerpc/platforms/powernv/idle.c
index 443d5ca..4b0c7d24 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -56,8 +56,11 @@ u64 pnv_first_deep_stop_state = MAX_STOP_STATE;
  */
 static u64 pnv_deepest_stop_psscr_val;
 static u64 pnv_deepest_stop_psscr_mask;
+static u64 pnv_deepest_cpuidle_psscr_val;
+static u64 pnv_deepest_cpuidle_psscr_mask;
 static u64 pnv_deepest_stop_flag;
 static bool deepest_stop_found;
+static bool deepest_cpuidle_found;
 
 static int pnv_save_sprs_for_deep_states(void)
 {
@@ -76,7 +79,14 @@ static int pnv_save_sprs_for_deep_states(void)
uint64_t hid5_val = mfspr(SPRN_HID5);
uint64_t hmeer_val = mfspr(SPRN_HMEER);
uint64_t msr_val = MSR_IDLE;
-   uint64_t psscr_val = pnv_deepest_stop_psscr_val;
+
+   /*
+* Pick deepest cpuidle psscr as the value to be
+* restored through wakeup engine.
+* We will request a deeper state to be restored
+* in hotplug path
+*/
+   uint64_t psscr_val = pnv_deepest_cpuidle_psscr_val;
 
for_each_possible_cpu(cpu) {
uint64_t pir = get_hard_smp_processor_id(cpu);
@@ -409,7 +419,7 @@ static void pnv_program_cpu_hotplug_lpcr(unsigned int cpu, 
u64 lpcr_val)
  */
 unsigned long pnv_cpu_offline(unsigned int cpu)
 {
-   unsigned long srr1;
+   u64 srr1;
u32 idle_states = pnv_get_supported_cpuidle_states();
u64 lpcr_val;
 
@@ -429,12 +439,18 @@ unsigned long pnv_cpu_offline(unsigned int cpu)
__ppc64_runlatch_off();
 
if (cpu_has_feature(CPU_FTR_ARCH_300) && deepest_stop_found) {
-   unsigned long psscr;
+   u64 psscr;
+   u64 pir = get_hard_smp_processor_id(cpu);
 
psscr = mfspr(SPRN_PSSCR);
psscr = (psscr & ~pnv_deepest_stop_psscr_mask) |
pnv_deepest_stop_psscr_val;
+   if (pnv_deepest_stop_psscr_val != pnv_deepest_cpuidle_psscr_val)
+   opal_slw_set_reg(pir, P9_STOP_SPR_PSSCR, psscr);
srr1 = power9_idle_stop(psscr);
+   psscr = (psscr & ~pnv_deepest_cpuidle_psscr_mask) |
+   pnv_deepest_cpuidle_psscr_val;
+   opal_slw_set_reg(pir, P9_STOP_SPR_PSSCR, psscr);
 
} else if ((idle_states & OPAL_PM_WINKLE_ENABLED) &&
   (idle_states & OPAL_PM_LOSE_FULL_CONTEXT)) {
@@ -555,6 +571,7 @@ static int __init pnv_power9_idle_init(struct device_node 
*np, u32 *flags,
u64 *psscr_val = NULL;
u64 *psscr_mask = NULL;
u32 *residency_ns = NULL;
+   u32 *latency_ns = NULL;
u64 max_residency_ns = 0;
int 

[PATCH 2/2] cpufreq: powernv: Use PMSR to verify global and local pstate

2016-11-03 Thread Akshay Adiga
As fast_switch may get called in interrupt disable mode, it does not
update the global_pstate_info data structure. Hence the global_pstate_info
has stale data whenever pstate is updated through fast_swtich().

So the gpstate_timer can fire after a fast_switch() call has update
the pstates to a different value. Hence the timer handler cannot rely
on the cached values of local and global pstate and needs to read it
from the PMSR. 

Signed-off-by: Akshay Adiga 

---
 drivers/cpufreq/powernv-cpufreq.c | 32 ++--
 1 file changed, 22 insertions(+), 10 deletions(-)

diff --git a/drivers/cpufreq/powernv-cpufreq.c 
b/drivers/cpufreq/powernv-cpufreq.c
index 09a0496..57713b5 100644
--- a/drivers/cpufreq/powernv-cpufreq.c
+++ b/drivers/cpufreq/powernv-cpufreq.c
@@ -592,7 +592,8 @@ void gpstate_timer_handler(unsigned long data)
 {
struct cpufreq_policy *policy = (struct cpufreq_policy *)data;
struct global_pstate_info *gpstates = policy->driver_data;
-   int gpstate_idx;
+   int gpstate_idx, lpstate_idx;
+   unsigned long val;
unsigned int time_diff = jiffies_to_msecs(jiffies)
- gpstates->last_sampled_time;
struct powernv_smp_call_data freq_data;
@@ -600,21 +601,36 @@ void gpstate_timer_handler(unsigned long data)
if (!spin_trylock(&gpstates->gpstate_lock))
return;
 
+   /*
+* If PMCR was last updated was using fast_swtich then
+* We may have wrong in gpstate->last_lpstate_idx
+* value. Hence, read from PMCR to get correct data.
+*/
+   val = get_pmspr(SPRN_PMCR);
+   freq_data.gpstate_id = (val >> (56)) & 0xFF;
+   freq_data.pstate_id = (val >> (48)) & 0xFF;
+   if (freq_data.gpstate_id  == freq_data.pstate_id) {
+   reset_gpstates(policy);
+   spin_unlock(&gpstates->gpstate_lock);
+   return;
+   }
+
gpstates->last_sampled_time += time_diff;
gpstates->elapsed_time += time_diff;
-   freq_data.pstate_id = idx_to_pstate(gpstates->last_lpstate_idx);
 
-   if ((gpstates->last_gpstate_idx == gpstates->last_lpstate_idx) ||
-   (gpstates->elapsed_time > MAX_RAMP_DOWN_TIME)) {
+   if (gpstates->elapsed_time > MAX_RAMP_DOWN_TIME) {
gpstate_idx = pstate_to_idx(freq_data.pstate_id);
reset_gpstates(policy);
gpstates->highest_lpstate_idx = gpstate_idx;
} else {
+   lpstate_idx = pstate_to_idx(freq_data.pstate_id);
gpstate_idx = calc_global_pstate(gpstates->elapsed_time,
 gpstates->highest_lpstate_idx,
-gpstates->last_lpstate_idx);
+lpstate_idx);
}
-
+   freq_data.gpstate_id = idx_to_pstate(gpstate_idx);
+   gpstates->last_gpstate_idx = gpstate_idx;
+   gpstates->last_lpstate_idx = lpstate_idx;
/*
 * If local pstate is equal to global pstate, rampdown is over
 * So timer is not required to be queued.
@@ -622,10 +638,6 @@ void gpstate_timer_handler(unsigned long data)
if (gpstate_idx != gpstates->last_lpstate_idx)
queue_gpstate_timer(gpstates);
 
-   freq_data.gpstate_id = idx_to_pstate(gpstate_idx);
-   gpstates->last_gpstate_idx = pstate_to_idx(freq_data.gpstate_id);
-   gpstates->last_lpstate_idx = pstate_to_idx(freq_data.pstate_id);
-
spin_unlock(&gpstates->gpstate_lock);
 
/* Timer may get migrated to a different cpu on cpu hot unplug */
-- 
2.7.4



[PATCH 1/2] cpufreq: powernv: Adding fast_switch for schedutil

2016-11-03 Thread Akshay Adiga
Adding fast_switch which does light weight operation to
set the desired pstate.

Signed-off-by: Akshay Adiga 
---
 drivers/cpufreq/powernv-cpufreq.c | 22 +-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/drivers/cpufreq/powernv-cpufreq.c 
b/drivers/cpufreq/powernv-cpufreq.c
index d3ffde8..09a0496 100644
--- a/drivers/cpufreq/powernv-cpufreq.c
+++ b/drivers/cpufreq/powernv-cpufreq.c
@@ -752,9 +752,12 @@ static int powernv_cpufreq_cpu_init(struct cpufreq_policy 
*policy)
spin_lock_init(&gpstates->gpstate_lock);
ret = cpufreq_table_validate_and_show(policy, powernv_freqs);
 
-   if (ret < 0)
+   if (ret < 0) {
kfree(policy->driver_data);
+   return ret;
+   }
 
+   policy->fast_switch_possible = true;
return ret;
 }
 
@@ -897,6 +900,22 @@ static void powernv_cpufreq_stop_cpu(struct cpufreq_policy 
*policy)
del_timer_sync(&gpstates->timer);
 }
 
+static unsigned int powernv_fast_switch(struct cpufreq_policy *policy,
+   unsigned int target_freq)
+{
+   int index;
+   struct powernv_smp_call_data freq_data;
+
+   index = cpufreq_table_find_index_dl(policy, target_freq);
+   if (index < 0 || index >= powernv_pstate_info.nr_pstates)
+   return CPUFREQ_ENTRY_INVALID;
+   freq_data.pstate_id = powernv_freqs[index].driver_data;
+   freq_data.gpstate_id = powernv_freqs[index].driver_data;
+   set_pstate(&freq_data);
+
+   return powernv_freqs[index].frequency;
+}
+
 static struct cpufreq_driver powernv_cpufreq_driver = {
.name   = "powernv-cpufreq",
.flags  = CPUFREQ_CONST_LOOPS,
@@ -904,6 +923,7 @@ static struct cpufreq_driver powernv_cpufreq_driver = {
.exit   = powernv_cpufreq_cpu_exit,
.verify = cpufreq_generic_frequency_table_verify,
.target_index   = powernv_cpufreq_target_index,
+   .fast_switch= powernv_fast_switch,
.get= powernv_cpufreq_get,
.stop_cpu   = powernv_cpufreq_stop_cpu,
.attr   = powernv_cpu_freq_attr,
-- 
2.7.4



Re: [PATCH 2/2] cpufreq: powernv: Use PMSR to verify global and local pstate

2016-11-06 Thread Akshay Adiga

Thanks Viresh for taking a look at it.

I will make the mentioned changes in the next version of the patch and
will add Shilpa and Gautham to the mail chain.

Regards

Akshay Adiga


On 11/04/2016 12:11 PM, Viresh Kumar wrote:

On 04-11-16, 10:57, Akshay Adiga wrote:

As fast_switch may get called in interrupt disable mode, it does not

s/in interrupt disable mode/with interrupts disabled
s/it does/it may


update the global_pstate_info data structure. Hence the global_pstate_info
has stale data whenever pstate is updated through fast_swtich().

s/has/may have
s/swtich/switch


So the gpstate_timer can fire after a fast_switch() call has update

s/So the/The
s/a fast_swtich() call has update/the fast_switch() call has updated


the pstates to a different value. Hence the timer handler cannot rely
on the cached values of local and global pstate and needs to read it
from the PMSR.

Signed-off-by: Akshay Adiga 

---
  drivers/cpufreq/powernv-cpufreq.c | 32 ++--
  1 file changed, 22 insertions(+), 10 deletions(-)

I am not the best guy to judge the code changes here. Can you please include
Shilpa and Gautham to the mail chain and get there feedback.








[PATCH v2 1/2] cpufreq: powernv: Adding fast_switch for schedutil

2016-11-06 Thread Akshay Adiga
Adding fast_switch which does light weight operation to set the desired
pstate. Both global and local pstates are set to the same desired pstate.

Signed-off-by: Akshay Adiga 
---
Changes from v1 :
- Removed unnecessary check for index out of bound.

 drivers/cpufreq/powernv-cpufreq.c | 20 +++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/drivers/cpufreq/powernv-cpufreq.c 
b/drivers/cpufreq/powernv-cpufreq.c
index d3ffde8..4a4380d 100644
--- a/drivers/cpufreq/powernv-cpufreq.c
+++ b/drivers/cpufreq/powernv-cpufreq.c
@@ -752,9 +752,12 @@ static int powernv_cpufreq_cpu_init(struct cpufreq_policy 
*policy)
spin_lock_init(&gpstates->gpstate_lock);
ret = cpufreq_table_validate_and_show(policy, powernv_freqs);
 
-   if (ret < 0)
+   if (ret < 0) {
kfree(policy->driver_data);
+   return ret;
+   }
 
+   policy->fast_switch_possible = true;
return ret;
 }
 
@@ -897,6 +900,20 @@ static void powernv_cpufreq_stop_cpu(struct cpufreq_policy 
*policy)
del_timer_sync(&gpstates->timer);
 }
 
+static unsigned int powernv_fast_switch(struct cpufreq_policy *policy,
+   unsigned int target_freq)
+{
+   int index;
+   struct powernv_smp_call_data freq_data;
+
+   index = cpufreq_table_find_index_dl(policy, target_freq);
+   freq_data.pstate_id = powernv_freqs[index].driver_data;
+   freq_data.gpstate_id = powernv_freqs[index].driver_data;
+   set_pstate(&freq_data);
+
+   return powernv_freqs[index].frequency;
+}
+
 static struct cpufreq_driver powernv_cpufreq_driver = {
.name   = "powernv-cpufreq",
.flags  = CPUFREQ_CONST_LOOPS,
@@ -904,6 +921,7 @@ static struct cpufreq_driver powernv_cpufreq_driver = {
.exit   = powernv_cpufreq_cpu_exit,
.verify = cpufreq_generic_frequency_table_verify,
.target_index   = powernv_cpufreq_target_index,
+   .fast_switch= powernv_fast_switch,
.get= powernv_cpufreq_get,
.stop_cpu   = powernv_cpufreq_stop_cpu,
.attr   = powernv_cpu_freq_attr,
-- 
2.5.5



[PATCH v2 2/2] cpufreq: powernv: Use PMCR to verify global and local pstate

2016-11-06 Thread Akshay Adiga
As fast_switch() may get called with interrupt disable mode, we cannot
hold a mutex to update the global_pstate_info. So currently, fast_switch()
does not update the global_pstate_info and it will end up with stale data
whenever pstate is updated through fast_switch().

As the gpstate_timer can fire after fast_switch() has updated the pstates,
the timer handler cannot rely on the cached values of local and global
pstate and needs to read it from the PMCR.

Only gpstate_timer_handler() is affected by the stale cached pstate data
beacause either fast_switch() or target_index() routines will be called
for a given govenor, but gpstate_timer can fire after the governor has
changed to schedutil.


Signed-off-by: Akshay Adiga 
---

Changes from v1 :
- Corrected Commit message
- Type cast pstate values read from PMCR to type s8
- Added Macros to get local and global pstates from PMCR


 drivers/cpufreq/powernv-cpufreq.c | 34 --
 1 file changed, 24 insertions(+), 10 deletions(-)

diff --git a/drivers/cpufreq/powernv-cpufreq.c 
b/drivers/cpufreq/powernv-cpufreq.c
index 4a4380d..bf4bc585 100644
--- a/drivers/cpufreq/powernv-cpufreq.c
+++ b/drivers/cpufreq/powernv-cpufreq.c
@@ -42,6 +42,8 @@
 #define PMSR_PSAFE_ENABLE  (1UL << 30)
 #define PMSR_SPR_EM_DISABLE(1UL << 31)
 #define PMSR_MAX(x)((x >> 32) & 0xFF)
+#define PMCR_LPSTATE(x)(((x) >> 48) & 0xFF)
+#define PMCR_GPSTATE(x)(((x) >> 56) & 0xFF)
 
 #define MAX_RAMP_DOWN_TIME 5120
 /*
@@ -592,7 +594,8 @@ void gpstate_timer_handler(unsigned long data)
 {
struct cpufreq_policy *policy = (struct cpufreq_policy *)data;
struct global_pstate_info *gpstates = policy->driver_data;
-   int gpstate_idx;
+   int gpstate_idx, lpstate_idx;
+   unsigned long val;
unsigned int time_diff = jiffies_to_msecs(jiffies)
- gpstates->last_sampled_time;
struct powernv_smp_call_data freq_data;
@@ -600,21 +603,36 @@ void gpstate_timer_handler(unsigned long data)
if (!spin_trylock(&gpstates->gpstate_lock))
return;
 
+   /*
+* If PMCR was last updated was using fast_swtich then
+* We may have wrong in gpstate->last_lpstate_idx
+* value. Hence, read from PMCR to get correct data.
+*/
+   val = get_pmspr(SPRN_PMCR);
+   freq_data.gpstate_id = (s8)PMCR_GPSTATE(val);
+   freq_data.pstate_id = (s8)PMCR_LPSTATE(val);
+   if (freq_data.gpstate_id  == freq_data.pstate_id) {
+   reset_gpstates(policy);
+   spin_unlock(&gpstates->gpstate_lock);
+   return;
+   }
+
gpstates->last_sampled_time += time_diff;
gpstates->elapsed_time += time_diff;
-   freq_data.pstate_id = idx_to_pstate(gpstates->last_lpstate_idx);
 
-   if ((gpstates->last_gpstate_idx == gpstates->last_lpstate_idx) ||
-   (gpstates->elapsed_time > MAX_RAMP_DOWN_TIME)) {
+   if (gpstates->elapsed_time > MAX_RAMP_DOWN_TIME) {
gpstate_idx = pstate_to_idx(freq_data.pstate_id);
reset_gpstates(policy);
gpstates->highest_lpstate_idx = gpstate_idx;
} else {
+   lpstate_idx = pstate_to_idx(freq_data.pstate_id);
gpstate_idx = calc_global_pstate(gpstates->elapsed_time,
 gpstates->highest_lpstate_idx,
-gpstates->last_lpstate_idx);
+lpstate_idx);
}
-
+   freq_data.gpstate_id = idx_to_pstate(gpstate_idx);
+   gpstates->last_gpstate_idx = gpstate_idx;
+   gpstates->last_lpstate_idx = lpstate_idx;
/*
 * If local pstate is equal to global pstate, rampdown is over
 * So timer is not required to be queued.
@@ -622,10 +640,6 @@ void gpstate_timer_handler(unsigned long data)
if (gpstate_idx != gpstates->last_lpstate_idx)
queue_gpstate_timer(gpstates);
 
-   freq_data.gpstate_id = idx_to_pstate(gpstate_idx);
-   gpstates->last_gpstate_idx = pstate_to_idx(freq_data.gpstate_id);
-   gpstates->last_lpstate_idx = pstate_to_idx(freq_data.pstate_id);
-
spin_unlock(&gpstates->gpstate_lock);
 
/* Timer may get migrated to a different cpu on cpu hot unplug */
-- 
2.5.5



Re: [PATCH 1/2] cpufreq: powernv: Adding fast_switch for schedutil

2016-11-06 Thread Akshay Adiga

Thanks Viresh for taking a look at it.

I will make the mentioned changes in the next version of the patch.


Regards

Akshay Adiga


On 11/04/2016 12:03 PM, Viresh Kumar wrote:

On 04-11-16, 10:57, Akshay Adiga wrote:

Adding fast_switch which does light weight operation to
set the desired pstate.

Signed-off-by: Akshay Adiga 
---
  drivers/cpufreq/powernv-cpufreq.c | 22 +-
  1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/drivers/cpufreq/powernv-cpufreq.c 
b/drivers/cpufreq/powernv-cpufreq.c
index d3ffde8..09a0496 100644
--- a/drivers/cpufreq/powernv-cpufreq.c
+++ b/drivers/cpufreq/powernv-cpufreq.c
@@ -752,9 +752,12 @@ static int powernv_cpufreq_cpu_init(struct cpufreq_policy 
*policy)
spin_lock_init(&gpstates->gpstate_lock);
ret = cpufreq_table_validate_and_show(policy, powernv_freqs);
  
-	if (ret < 0)

+   if (ret < 0) {
kfree(policy->driver_data);
+   return ret;
+   }
  
+	policy->fast_switch_possible = true;

return ret;
  }
  
@@ -897,6 +900,22 @@ static void powernv_cpufreq_stop_cpu(struct cpufreq_policy *policy)

del_timer_sync(&gpstates->timer);
  }
  
+static unsigned int powernv_fast_switch(struct cpufreq_policy *policy,

+   unsigned int target_freq)
+{
+   int index;
+   struct powernv_smp_call_data freq_data;
+
+   index = cpufreq_table_find_index_dl(policy, target_freq);
+   if (index < 0 || index >= powernv_pstate_info.nr_pstates)
+   return CPUFREQ_ENTRY_INVALID;

I don't think such a check is required at all. It wouldn't happen without a BUG
in kernel.

+   freq_data.pstate_id = powernv_freqs[index].driver_data;
+   freq_data.gpstate_id = powernv_freqs[index].driver_data;
+   set_pstate(&freq_data);
+
+   return powernv_freqs[index].frequency;
+}
+
  static struct cpufreq_driver powernv_cpufreq_driver = {
.name   = "powernv-cpufreq",
.flags  = CPUFREQ_CONST_LOOPS,
@@ -904,6 +923,7 @@ static struct cpufreq_driver powernv_cpufreq_driver = {
.exit   = powernv_cpufreq_cpu_exit,
.verify = cpufreq_generic_frequency_table_verify,
.target_index   = powernv_cpufreq_target_index,
+   .fast_switch= powernv_fast_switch,
.get= powernv_cpufreq_get,
.stop_cpu   = powernv_cpufreq_stop_cpu,
.attr   = powernv_cpu_freq_attr,
--
2.7.4