[PATCH] Perf: Calling available function for stats printing

2024-06-27 Thread Abhishek Dubey
When dump_trace is set, just use the existing stats_print() function to
print the event statistics.
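For reference, the helper being reused looks roughly like this (a sketch of
stats_print() as it already exists in tools/perf/builtin-report.c; the exact
signature and body in the tree may differ slightly):

static int stats_print(struct report *rep)
{
	struct perf_session *session = rep->session;

	perf_session__fprintf_nr_events(session, stdout, rep->skip_empty);
	evlist__fprintf_nr_events(session->evlist, stdout, rep->skip_empty);
	return 0;
}

So the dump_trace path below simply reuses it instead of open-coding the two
fprintf calls.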

Signed-off-by: Abhishek Dubey 
---
 tools/perf/builtin-report.c | 5 +
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 69618fb0110b..8678eebc49e6 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -1089,10 +1089,7 @@ static int __cmd_report(struct report *rep)
perf_session__fprintf_dsos(session, stdout);
 
if (dump_trace) {
-   perf_session__fprintf_nr_events(session, stdout,
-   rep->skip_empty);
-   evlist__fprintf_nr_events(session->evlist, stdout,
- rep->skip_empty);
+   stats_print(rep);
return 0;
}
}
-- 
2.44.0



[PATCH v3] PowerPC: Replace kretprobe with rethook

2024-06-27 Thread Abhishek Dubey
This is an adaptation of commit f3a112c0c40d ("x86,rethook,kprobes:
Replace kretprobe with rethook on x86") to PowerPC.

Replace the kretprobe code with rethook on Power. With this patch,
kretprobe on Power uses the rethook machinery instead of the
kretprobe-specific trampoline code.

Reference to other archs:
commit b57c2f124098 ("riscv: add riscv rethook implementation")
commit 7b0a096436c2 ("LoongArch: Replace kretprobe with rethook")

Signed-off-by: Abhishek Dubey 
---

Changes in v3:
* Fixing of the return address moved to arch_rethook_fixup_return()
* Addressed other minor comments
* show_stack() changes to print the correct function name will go in a separate patch
---
 arch/powerpc/Kconfig |  1 +
 arch/powerpc/kernel/Makefile |  1 +
 arch/powerpc/kernel/kprobes.c| 65 +--
 arch/powerpc/kernel/optprobes.c  |  2 +-
 arch/powerpc/kernel/rethook.c| 77 
 arch/powerpc/kernel/stacktrace.c |  6 ++-
 6 files changed, 85 insertions(+), 67 deletions(-)
 create mode 100644 arch/powerpc/kernel/rethook.c

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index c88c6d46a5bc..fa0b1ab3f935 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -270,6 +270,7 @@ config PPC
select HAVE_PERF_EVENTS_NMI if PPC64
select HAVE_PERF_REGS
select HAVE_PERF_USER_STACK_DUMP
+   select HAVE_RETHOOK
select HAVE_REGS_AND_STACK_ACCESS_API
select HAVE_RELIABLE_STACKTRACE
select HAVE_RSEQ
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 8585d03c02d3..7dd1b523b17f 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -140,6 +140,7 @@ obj-$(CONFIG_KPROBES)   += kprobes.o
 obj-$(CONFIG_OPTPROBES)+= optprobes.o optprobes_head.o
 obj-$(CONFIG_KPROBES_ON_FTRACE)+= kprobes-ftrace.o
 obj-$(CONFIG_UPROBES)  += uprobes.o
+obj-$(CONFIG_RETHOOK)   += rethook.o
 obj-$(CONFIG_PPC_UDBG_16550)   += legacy_serial.o udbg_16550.o
 obj-$(CONFIG_SWIOTLB)  += dma-swiotlb.o
 obj-$(CONFIG_ARCH_HAS_DMA_SET_MASK) += dma-mask.o
diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c
index 14c5ddec3056..f8aa91bc3b17 100644
--- a/arch/powerpc/kernel/kprobes.c
+++ b/arch/powerpc/kernel/kprobes.c
@@ -228,16 +228,6 @@ static nokprobe_inline void set_current_kprobe(struct 
kprobe *p, struct pt_regs
kcb->kprobe_saved_msr = regs->msr;
 }
 
-void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs 
*regs)
-{
-   ri->ret_addr = (kprobe_opcode_t *)regs->link;
-   ri->fp = NULL;
-
-   /* Replace the return addr with trampoline addr */
-   regs->link = (unsigned long)__kretprobe_trampoline;
-}
-NOKPROBE_SYMBOL(arch_prepare_kretprobe);
-
 static int try_to_emulate(struct kprobe *p, struct pt_regs *regs)
 {
int ret;
@@ -394,49 +384,6 @@ int kprobe_handler(struct pt_regs *regs)
 }
 NOKPROBE_SYMBOL(kprobe_handler);
 
-/*
- * Function return probe trampoline:
- * - init_kprobes() establishes a probepoint here
- * - When the probed function returns, this probe
- * causes the handlers to fire
- */
-asm(".global __kretprobe_trampoline\n"
-   ".type __kretprobe_trampoline, @function\n"
-   "__kretprobe_trampoline:\n"
-   "nop\n"
-   "blr\n"
-   ".size __kretprobe_trampoline, .-__kretprobe_trampoline\n");
-
-/*
- * Called when the probe at kretprobe trampoline is hit
- */
-static int trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
-{
-   unsigned long orig_ret_address;
-
-   orig_ret_address = __kretprobe_trampoline_handler(regs, NULL);
-   /*
-* We get here through one of two paths:
-* 1. by taking a trap -> kprobe_handler() -> here
-* 2. by optprobe branch -> optimized_callback() -> opt_pre_handler() 
-> here
-*
-* When going back through (1), we need regs->nip to be setup properly
-* as it is used to determine the return address from the trap.
-* For (2), since nip is not honoured with optprobes, we instead setup
-* the link register properly so that the subsequent 'blr' in
-* __kretprobe_trampoline jumps back to the right instruction.
-*
-* For nip, we should set the address to the previous instruction since
-* we end up emulating it in kprobe_handler(), which increments the nip
-* again.
-*/
-   regs_set_return_ip(regs, orig_ret_address - 4);
-   regs->link = orig_ret_address;
-
-   return 0;
-}
-NOKPROBE_SYMBOL(trampoline_probe_handler);
-
 /*
  * Called after single-stepping.  p->addr is the address of the
  * instruction whose first byte has been replaced by the "breakpoint"
@@ -539,19 +486,9 @@ int kprobe_fault_handler(struct p
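Since the new arch/powerpc/kernel/rethook.c is not shown above, here is a
rough sketch of its core pieces, modeled on the removed kretprobe code and
on the generic rethook arch hooks (illustrative only, not the literal patch
contents; the trampoline asm and arch_rethook_trampoline_callback() glue are
omitted):

/* Sketch, not the actual patch: arch/powerpc/kernel/rethook.c */
#include <linux/kprobes.h>
#include <linux/rethook.h>

void arch_rethook_trampoline(void);	/* asm trampoline, defined elsewhere */

/* Called by the rethook core when a probed function is entered. */
void arch_rethook_prepare(struct rethook_node *rh, struct pt_regs *regs, bool mcount)
{
	rh->ret_addr = regs->link;
	rh->frame = 0;

	/* Replace the return address with the trampoline address. */
	regs->link = (unsigned long)arch_rethook_trampoline;
}
NOKPROBE_SYMBOL(arch_rethook_prepare);

/* Called by the rethook core to restore the original return address. */
void arch_rethook_fixup_return(struct pt_regs *regs, unsigned long correct_ret_addr)
{
	/*
	 * Emulating the trampoline instruction in kprobe_handler() will
	 * increment nip again, so point nip at the previous instruction
	 * and fix up the link register, as the removed
	 * trampoline_probe_handler() above did.
	 */
	regs_set_return_ip(regs, correct_ret_addr - 4);
	regs->link = correct_ret_addr;
}
NOKPROBE_SYMBOL(arch_rethook_fixup_return);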

[PATCH v2] PowerPC: Replace kretprobe with rethook

2024-06-10 Thread Abhishek Dubey
This is an adaptation of commit f3a112c0c40d ("x86,rethook,kprobes:
Replace kretprobe with rethook on x86") to PowerPC.

Replace the kretprobe code with rethook on Power. With this patch,
kretprobe on Power uses the rethook machinery instead of the
kretprobe-specific trampoline code.

Reference to other archs:
commit b57c2f124098 ("riscv: add riscv rethook implementation")
commit 7b0a096436c2 ("LoongArch: Replace kretprobe with rethook")

Signed-off-by: Abhishek Dubey 
---
 arch/powerpc/Kconfig |  1 +
 arch/powerpc/kernel/Makefile |  1 +
 arch/powerpc/kernel/kprobes.c| 65 +
 arch/powerpc/kernel/optprobes.c  |  2 +-
 arch/powerpc/kernel/rethook.c| 71 
 arch/powerpc/kernel/stacktrace.c | 10 +++--
 6 files changed, 81 insertions(+), 69 deletions(-)
 create mode 100644 arch/powerpc/kernel/rethook.c

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index c88c6d46a5bc..fa0b1ab3f935 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -270,6 +270,7 @@ config PPC
select HAVE_PERF_EVENTS_NMI if PPC64
select HAVE_PERF_REGS
select HAVE_PERF_USER_STACK_DUMP
+   select HAVE_RETHOOK
select HAVE_REGS_AND_STACK_ACCESS_API
select HAVE_RELIABLE_STACKTRACE
select HAVE_RSEQ
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 8585d03c02d3..7dd1b523b17f 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -140,6 +140,7 @@ obj-$(CONFIG_KPROBES)   += kprobes.o
 obj-$(CONFIG_OPTPROBES)+= optprobes.o optprobes_head.o
 obj-$(CONFIG_KPROBES_ON_FTRACE)+= kprobes-ftrace.o
 obj-$(CONFIG_UPROBES)  += uprobes.o
+obj-$(CONFIG_RETHOOK)   += rethook.o
 obj-$(CONFIG_PPC_UDBG_16550)   += legacy_serial.o udbg_16550.o
 obj-$(CONFIG_SWIOTLB)  += dma-swiotlb.o
 obj-$(CONFIG_ARCH_HAS_DMA_SET_MASK) += dma-mask.o
diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c
index 14c5ddec3056..f8aa91bc3b17 100644
--- a/arch/powerpc/kernel/kprobes.c
+++ b/arch/powerpc/kernel/kprobes.c
@@ -228,16 +228,6 @@ static nokprobe_inline void set_current_kprobe(struct 
kprobe *p, struct pt_regs
kcb->kprobe_saved_msr = regs->msr;
 }
 
-void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs 
*regs)
-{
-   ri->ret_addr = (kprobe_opcode_t *)regs->link;
-   ri->fp = NULL;
-
-   /* Replace the return addr with trampoline addr */
-   regs->link = (unsigned long)__kretprobe_trampoline;
-}
-NOKPROBE_SYMBOL(arch_prepare_kretprobe);
-
 static int try_to_emulate(struct kprobe *p, struct pt_regs *regs)
 {
int ret;
@@ -394,49 +384,6 @@ int kprobe_handler(struct pt_regs *regs)
 }
 NOKPROBE_SYMBOL(kprobe_handler);
 
-/*
- * Function return probe trampoline:
- * - init_kprobes() establishes a probepoint here
- * - When the probed function returns, this probe
- * causes the handlers to fire
- */
-asm(".global __kretprobe_trampoline\n"
-   ".type __kretprobe_trampoline, @function\n"
-   "__kretprobe_trampoline:\n"
-   "nop\n"
-   "blr\n"
-   ".size __kretprobe_trampoline, .-__kretprobe_trampoline\n");
-
-/*
- * Called when the probe at kretprobe trampoline is hit
- */
-static int trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
-{
-   unsigned long orig_ret_address;
-
-   orig_ret_address = __kretprobe_trampoline_handler(regs, NULL);
-   /*
-* We get here through one of two paths:
-* 1. by taking a trap -> kprobe_handler() -> here
-* 2. by optprobe branch -> optimized_callback() -> opt_pre_handler() 
-> here
-*
-* When going back through (1), we need regs->nip to be setup properly
-* as it is used to determine the return address from the trap.
-* For (2), since nip is not honoured with optprobes, we instead setup
-* the link register properly so that the subsequent 'blr' in
-* __kretprobe_trampoline jumps back to the right instruction.
-*
-* For nip, we should set the address to the previous instruction since
-* we end up emulating it in kprobe_handler(), which increments the nip
-* again.
-*/
-   regs_set_return_ip(regs, orig_ret_address - 4);
-   regs->link = orig_ret_address;
-
-   return 0;
-}
-NOKPROBE_SYMBOL(trampoline_probe_handler);
-
 /*
  * Called after single-stepping.  p->addr is the address of the
  * instruction whose first byte has been replaced by the "breakpoint"
@@ -539,19 +486,9 @@ int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
 }
 NOKPROBE_SYMBOL(kprobe_fault_handler);
 
-static struct kprobe trampoline_p = {
-   .addr = (kprobe_opcode_t *) &__kretprobe_trampoline,
-   .pre_ha

[PATCH] Perf: Calling available function for stats printing

2024-05-16 Thread Abhishek Dubey
When dump_trace is set, use the existing stats_print() function to print
the event statistics.

Signed-off-by: Abhishek Dubey 
---
 tools/perf/builtin-report.c | 5 +
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index dcd93ee5fc24..3cabd5b0bfec 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -1088,10 +1088,7 @@ static int __cmd_report(struct report *rep)
perf_session__fprintf_dsos(session, stdout);
 
if (dump_trace) {
-   perf_session__fprintf_nr_events(session, stdout,
-   rep->skip_empty);
-   evlist__fprintf_nr_events(session->evlist, stdout,
- rep->skip_empty);
+   stats_print(rep);
return 0;
}
}
-- 
2.44.0



[PATCH] PowerPC: Replace kretprobe with rethook

2024-05-16 Thread Abhishek Dubey
This is an adaptation of commit f3a112c0c40d ("x86,rethook,kprobes:
Replace kretprobe with rethook on x86") to Power.

Replace the kretprobe code with rethook on Power. With this patch,
kretprobe on Power uses the rethook machinery instead of the
kretprobe-specific trampoline code.

Reference to other archs:
commit b57c2f124098 ("riscv: add riscv rethook implementation")
commit 7b0a096436c2 ("LoongArch: Replace kretprobe with rethook")

Signed-off-by: Abhishek Dubey 
---
 arch/powerpc/Kconfig |  1 +
 arch/powerpc/kernel/Makefile |  1 +
 arch/powerpc/kernel/kprobes.c| 65 +
 arch/powerpc/kernel/optprobes.c  |  2 +-
 arch/powerpc/kernel/rethook.c| 71 
 arch/powerpc/kernel/stacktrace.c |  6 +--
 include/linux/rethook.h  |  1 -
 7 files changed, 78 insertions(+), 69 deletions(-)
 create mode 100644 arch/powerpc/kernel/rethook.c

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 1c4be3373686..108de491965a 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -268,6 +268,7 @@ config PPC
select HAVE_PERF_EVENTS_NMI if PPC64
select HAVE_PERF_REGS
select HAVE_PERF_USER_STACK_DUMP
+   select HAVE_RETHOOK
select HAVE_REGS_AND_STACK_ACCESS_API
select HAVE_RELIABLE_STACKTRACE
select HAVE_RSEQ
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index d3282fbea4f2..181d764be3a6 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -142,6 +142,7 @@ obj-$(CONFIG_KPROBES)   += kprobes.o
 obj-$(CONFIG_OPTPROBES)+= optprobes.o optprobes_head.o
 obj-$(CONFIG_KPROBES_ON_FTRACE)+= kprobes-ftrace.o
 obj-$(CONFIG_UPROBES)  += uprobes.o
+obj-$(CONFIG_RETHOOK)   += rethook.o
 obj-$(CONFIG_PPC_UDBG_16550)   += legacy_serial.o udbg_16550.o
 obj-$(CONFIG_SWIOTLB)  += dma-swiotlb.o
 obj-$(CONFIG_ARCH_HAS_DMA_SET_MASK) += dma-mask.o
diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c
index bbca90a5e2ec..614bb68ad0e6 100644
--- a/arch/powerpc/kernel/kprobes.c
+++ b/arch/powerpc/kernel/kprobes.c
@@ -248,16 +248,6 @@ static nokprobe_inline void set_current_kprobe(struct 
kprobe *p, struct pt_regs
kcb->kprobe_saved_msr = regs->msr;
 }
 
-void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs 
*regs)
-{
-   ri->ret_addr = (kprobe_opcode_t *)regs->link;
-   ri->fp = NULL;
-
-   /* Replace the return addr with trampoline addr */
-   regs->link = (unsigned long)__kretprobe_trampoline;
-}
-NOKPROBE_SYMBOL(arch_prepare_kretprobe);
-
 static int try_to_emulate(struct kprobe *p, struct pt_regs *regs)
 {
int ret;
@@ -414,49 +404,6 @@ int kprobe_handler(struct pt_regs *regs)
 }
 NOKPROBE_SYMBOL(kprobe_handler);
 
-/*
- * Function return probe trampoline:
- * - init_kprobes() establishes a probepoint here
- * - When the probed function returns, this probe
- * causes the handlers to fire
- */
-asm(".global __kretprobe_trampoline\n"
-   ".type __kretprobe_trampoline, @function\n"
-   "__kretprobe_trampoline:\n"
-   "nop\n"
-   "blr\n"
-   ".size __kretprobe_trampoline, .-__kretprobe_trampoline\n");
-
-/*
- * Called when the probe at kretprobe trampoline is hit
- */
-static int trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
-{
-   unsigned long orig_ret_address;
-
-   orig_ret_address = __kretprobe_trampoline_handler(regs, NULL);
-   /*
-* We get here through one of two paths:
-* 1. by taking a trap -> kprobe_handler() -> here
-* 2. by optprobe branch -> optimized_callback() -> opt_pre_handler() 
-> here
-*
-* When going back through (1), we need regs->nip to be setup properly
-* as it is used to determine the return address from the trap.
-* For (2), since nip is not honoured with optprobes, we instead setup
-* the link register properly so that the subsequent 'blr' in
-* __kretprobe_trampoline jumps back to the right instruction.
-*
-* For nip, we should set the address to the previous instruction since
-* we end up emulating it in kprobe_handler(), which increments the nip
-* again.
-*/
-   regs_set_return_ip(regs, orig_ret_address - 4);
-   regs->link = orig_ret_address;
-
-   return 0;
-}
-NOKPROBE_SYMBOL(trampoline_probe_handler);
-
 /*
  * Called after single-stepping.  p->addr is the address of the
  * instruction whose first byte has been replaced by the "breakpoint"
@@ -559,19 +506,9 @@ int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
 }
 NOKPROBE_SYMBOL(kprobe_fault_handler);
 
-static struct kprobe trampoline_p = {
-   .addr = (kprobe_opcode_t *) &__kr

[RFC v2 2/2] powerpc/powernv : Introduce capability for firmware-enabled-stop

2020-07-06 Thread Abhishek Goel
This patch introduces the capability for firmware to handle the stop
states instead of the kernel. A bit is set based on discovery of the
feature, which also transfers the responsibility for handling the stop
states.

If the kernel does not know about the stop version, it can fall back to
OPAL for idle stop support when the firmware-stop-supported property is
present.

Earlier part of this patch was posted in this series :
https://lkml.org/lkml/2020/3/4/589

Signed-off-by: Abhishek Goel 
Signed-off-by: Pratik Rajesh Sampat 
---
v1->v2 : Combined patches 2 and 3 from the previous iteration and rebased.

 arch/powerpc/include/asm/processor.h  | 18 ++
 arch/powerpc/kernel/dt_cpu_ftrs.c | 13 +
 arch/powerpc/platforms/powernv/idle.c | 20 
 drivers/cpuidle/cpuidle-powernv.c |  3 ++-
 4 files changed, 49 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/include/asm/processor.h 
b/arch/powerpc/include/asm/processor.h
index bfa336fbcfeb..b8de7146387c 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -428,6 +428,24 @@ extern void power4_idle_nap(void);
 extern unsigned long cpuidle_disable;
 enum idle_boot_override {IDLE_NO_OVERRIDE = 0, IDLE_POWERSAVE_OFF};
 
+#define STOP_ENABLE0x0001
+#define FIRMWARE_STOP_ENABLE   0x0010
+
+#define STOP_VERSION_P9   0x1
+
+/*
+ * Classify the dependencies of the stop states
+ * @idle_stop: function handler to handle the quirk stop version
+ * @cpuidle_prop: Signify support for stop states through kernel and/or 
firmware
+ * @stop_version: Classify quirk versions for stop states
+ */
+typedef struct {
+   unsigned long (*idle_stop)(unsigned long psscr, bool mmu_on);
+   uint8_t cpuidle_prop;
+   uint8_t stop_version;
+} stop_deps_t;
+extern stop_deps_t stop_dep;
+
 extern int powersave_nap;  /* set if nap mode can be used in idle loop */
 
 extern void power7_idle_type(unsigned long type);
diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c 
b/arch/powerpc/kernel/dt_cpu_ftrs.c
index 36bc0d5c4f3a..737686fae3c7 100644
--- a/arch/powerpc/kernel/dt_cpu_ftrs.c
+++ b/arch/powerpc/kernel/dt_cpu_ftrs.c
@@ -291,6 +291,15 @@ static int __init feat_enable_idle_stop(struct 
dt_cpu_feature *f)
lpcr |=  LPCR_PECE1;
lpcr |=  LPCR_PECE2;
mtspr(SPRN_LPCR, lpcr);
+   stop_dep.cpuidle_prop |= STOP_ENABLE;
+   stop_dep.stop_version = STOP_VERSION_P9;
+
+   return 1;
+}
+
+static int __init feat_enable_firmware_stop(struct dt_cpu_feature *f)
+{
+   stop_dep.cpuidle_prop |= FIRMWARE_STOP_ENABLE;
 
return 1;
 }
@@ -589,6 +598,7 @@ static struct dt_cpu_feature_match __initdata
{"idle-nap", feat_enable_idle_nap, 0},
{"alignment-interrupt-dsisr", feat_enable_align_dsisr, 0},
{"idle-stop", feat_enable_idle_stop, 0},
+   {"firmware-stop-supported", feat_enable_firmware_stop, 0},
{"machine-check-power8", feat_enable_mce_power8, 0},
{"performance-monitor-power8", feat_enable_pmu_power8, 0},
{"data-stream-control-register", feat_enable_dscr, CPU_FTR_DSCR},
@@ -656,6 +666,9 @@ static void __init cpufeatures_setup_start(u32 isa)
}
 }
 
+stop_deps_t stop_dep = {NULL, 0x0, 0x0};
+EXPORT_SYMBOL(stop_dep);
+
 static bool __init cpufeatures_process_feature(struct dt_cpu_feature *f)
 {
const struct dt_cpu_feature_match *m;
diff --git a/arch/powerpc/platforms/powernv/idle.c 
b/arch/powerpc/platforms/powernv/idle.c
index 3afd4293f729..3602950f6c08 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -824,7 +824,7 @@ static unsigned long power9_offline_stop(unsigned long 
psscr)
 
 #ifndef CONFIG_KVM_BOOK3S_HV_POSSIBLE
__ppc64_runlatch_off();
-   srr1 = power9_idle_stop(psscr, true);
+   srr1 = stop_dep.idle_stop(psscr, true);
__ppc64_runlatch_on();
 #else
/*
@@ -840,7 +840,7 @@ static unsigned long power9_offline_stop(unsigned long 
psscr)
local_paca->kvm_hstate.hwthread_state = KVM_HWTHREAD_IN_IDLE;
 
__ppc64_runlatch_off();
-   srr1 = power9_idle_stop(psscr, false);
+   srr1 = stop_dep.idle_stop(psscr, true);
__ppc64_runlatch_on();
 
local_paca->kvm_hstate.hwthread_state = KVM_HWTHREAD_IN_KERNEL;
@@ -868,7 +868,7 @@ void power9_idle_type(unsigned long stop_psscr_val,
psscr = (psscr & ~stop_psscr_mask) | stop_psscr_val;
 
__ppc64_runlatch_off();
-   srr1 = power9_idle_stop(psscr, true);
+   srr1 = stop_dep.idle_stop(psscr, true);
__ppc64_runlatch_on();
 
fini_irq_for_idle_irqsoff();
@@ -1365,8 +1365,20 @@ static int __init pnv_init_idle_states(void)
nr_pnv_idle_states = 0;
supported_cpuidle_states = 0;
 
-   if (cpuidle_disable != IDLE_NO_OVERRIDE)
+   if (cpuidle_disable != IDLE_NO_OVERRIDE ||
+   !(stop_dep.cpuidle_prop
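For reference, the handler selection this patch sets up in
pnv_init_idle_states() is roughly the following (a sketch; the kernel's
native stop handling is preferred, with OPAL used as the fallback):

	if (cpuidle_disable != IDLE_NO_OVERRIDE ||
	    !(stop_dep.cpuidle_prop & STOP_ENABLE))
		goto out;

	/* Prefer the kernel's native stop handling... */
	if (stop_dep.stop_version & STOP_VERSION_P9) {
		stop_dep.idle_stop = power9_idle_stop;
	/* ...otherwise fall back to OPAL if firmware advertises support. */
	} else if (stop_dep.cpuidle_prop & FIRMWARE_STOP_ENABLE) {
		stop_dep.idle_stop = power9_firmware_idle_stop;
	} else {
		stop_dep.idle_stop = NULL;	/* no usable stop support */
		goto out;
	}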

[PATCH] cpuidle/powernv : Remove dead code block

2020-07-05 Thread Abhishek Goel
Commit 1961acad2f88559c2cdd2ef67c58c3627f1f6e54 removed the only usage
of validate_dt_prop_sizes(). This patch removes the now-unused
function.

Signed-off-by: Abhishek Goel 
---
 drivers/cpuidle/cpuidle-powernv.c | 14 --
 1 file changed, 14 deletions(-)

diff --git a/drivers/cpuidle/cpuidle-powernv.c 
b/drivers/cpuidle/cpuidle-powernv.c
index 1b299e801f74..addaa6e6718b 100644
--- a/drivers/cpuidle/cpuidle-powernv.c
+++ b/drivers/cpuidle/cpuidle-powernv.c
@@ -244,20 +244,6 @@ static inline void add_powernv_state(int index, const char 
*name,
stop_psscr_table[index].mask = psscr_mask;
 }
 
-/*
- * Returns 0 if prop1_len == prop2_len. Else returns -1
- */
-static inline int validate_dt_prop_sizes(const char *prop1, int prop1_len,
-const char *prop2, int prop2_len)
-{
-   if (prop1_len == prop2_len)
-   return 0;
-
-   pr_warn("cpuidle-powernv: array sizes don't match for %s and %s\n",
-   prop1, prop2);
-   return -1;
-}
-
 extern u32 pnv_get_supported_cpuidle_states(void);
 static int powernv_add_idle_states(void)
 {
-- 
2.17.1



[RFC v2 1/2] powerpc/powernv : Add support for pre-entry and post-exit of stop state using OPAL V4 OS services

2020-07-05 Thread Abhishek Goel
This patch provides the kernel framework for OPAL support of save/restore
of SPRs in the idle stop loop. OPAL support for stop states is needed to
selectively enable stop states or to quickly introduce a quirk in case
a buggy stop state is present.

We make an OPAL call from the kernel if firmware stop support for stop
states is present and enabled. All the quirks for stop-state pre-entry
are handled inside OPAL. A call is then made from OPAL back into the
kernel, where we execute stop after saving the NVGPRs.
After waking up at the 0x100 vector in the kernel, we re-enter OPAL.
Any quirks in the post-exit path are then handled in OPAL, from where
we return back to the kernel.
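A minimal sketch of the registration side, using the opal_register_os_ops()
interface and the opal_os_ops structure added below (the function and field
names come from the patch; the call site and the physical-address conversion
here are assumptions):

/* Sketch (assumed): hand the kernel's idle entry point to OPAL. */
static struct opal_os_ops opal_os_ops;

static int __init opal_register_idle_ops(void)
{
	/* OPAL will call os_idle_stop(srr1_addr, psscr) to enter stop for us. */
	opal_os_ops.os_idle_stop = cpu_to_be64(__pa(os_idle_stop));

	/* e.g. called during OPAL init; the exact hook point is an assumption. */
	return opal_register_os_ops(&opal_os_ops, sizeof(opal_os_ops));
}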

Signed-off-by: Abhishek Goel 
---
v1->v2 : Rebased the patch on Nick's Opal V4 OS patchset

 arch/powerpc/include/asm/opal-api.h|  4 +++-
 arch/powerpc/include/asm/opal.h|  1 +
 arch/powerpc/platforms/powernv/idle.c  | 12 
 arch/powerpc/platforms/powernv/opal-call.c |  1 +
 arch/powerpc/platforms/powernv/opal.c  | 15 +++
 5 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/opal-api.h 
b/arch/powerpc/include/asm/opal-api.h
index 97c5e5423827..437b6937685d 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -219,7 +219,8 @@
 #define OPAL_REPORT_TRAP   183
 #define OPAL_FIND_VM_AREA  184
 #define OPAL_REGISTER_OS_OPS   185
-#define OPAL_LAST  185
+#define OPAL_CPU_IDLE  186
+#define OPAL_LAST  186
 
 #define QUIESCE_HOLD   1 /* Spin all calls at entry */
 #define QUIESCE_REJECT 2 /* Fail all calls with OPAL_BUSY */
@@ -1207,6 +1208,7 @@ struct opal_os_ops {
__be64  os_printf; /* void printf(int32_t level, const char *str) */
__be64  os_vm_map; /* int64_t os_vm_map(uint64_t ea, uint64_t pa, 
uint64_t flags) */
__be64  os_vm_unmap; /* void os_vm_unmap(uint64_t ea) */
+   __be64  os_idle_stop; /* void os_idle_stop(uint64_t srr1_addr, uint64_t 
psscr) */
 };
 
 #endif /* __ASSEMBLY__ */
diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index 09985b7718b3..1774c056acb8 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -407,6 +407,7 @@ void opal_sensor_groups_init(void);
 
 int64_t opal_find_vm_area(uint64_t addr, struct opal_vm_area *opal_vm_area);
 int64_t opal_register_os_ops(struct opal_os_ops *ops, uint64_t size);
+int64_t opal_cpu_idle(uint64_t srr1_addr, uint64_t psscr);
 
 #endif /* __ASSEMBLY__ */
 
diff --git a/arch/powerpc/platforms/powernv/idle.c 
b/arch/powerpc/platforms/powernv/idle.c
index 78599bca66c2..3afd4293f729 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -805,6 +805,18 @@ static unsigned long power9_idle_stop(unsigned long psscr, 
bool mmu_on)
return srr1;
 }
 
+static unsigned long power9_firmware_idle_stop(unsigned long psscr, bool 
mmu_on)
+{
+   unsigned long srr1;
+   int rc;
+
+   rc = opal_cpu_idle(cpu_to_be64(__pa(&srr1)), (uint64_t) psscr);
+
+   if (mmu_on)
+   mtmsr(MSR_KERNEL);
+   return srr1;
+}
+
 #ifdef CONFIG_HOTPLUG_CPU
 static unsigned long power9_offline_stop(unsigned long psscr)
 {
diff --git a/arch/powerpc/platforms/powernv/opal-call.c 
b/arch/powerpc/platforms/powernv/opal-call.c
index 11f419e76059..79076ca2de03 100644
--- a/arch/powerpc/platforms/powernv/opal-call.c
+++ b/arch/powerpc/platforms/powernv/opal-call.c
@@ -351,3 +351,4 @@ OPAL_CALL(opal_sym_to_addr, 
OPAL_SYM_TO_ADDR);
 OPAL_CALL(opal_report_trap,OPAL_REPORT_TRAP);
 OPAL_CALL(opal_find_vm_area,   OPAL_FIND_VM_AREA);
 OPAL_CALL(opal_register_os_ops,OPAL_REGISTER_OS_OPS);
+OPAL_CALL(opal_cpu_idle,   OPAL_CPU_IDLE);
diff --git a/arch/powerpc/platforms/powernv/opal.c 
b/arch/powerpc/platforms/powernv/opal.c
index 93b9afaf33b3..1fbf7065f918 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -1150,6 +1150,20 @@ static void os_vm_unmap(uint64_t ea)
local_flush_tlb_mm(mm);
 }
 
+int64_t os_idle_stop(uint64_t srr1_addr, uint64_t psscr)
+{
+   /*
+* For lite state which does not lose even GPRS we call
+* idle_stop_noloss while for all other states we call
+* idle_stop_mayloss. Saving and restoration of other additional
+* SPRs if required is handled in OPAL. All the quirks are also
+* handled in OPAL.
+*/
+   if (!(psscr & (PSSCR_EC|PSSCR_ESL)))
+   return isa300_idle_stop_noloss(psscr);
+   return isa300_idle_stop_mayloss(psscr);
+}
+
 static int __init opal_init_mm(void)
 {
struct mm_struct *mm;
@@ -1231,6 +1245,7 @@ static int __init opal_init_ear

Re: [RFC 1/3] powernv/cpuidle : Support for pre-entry and post exit of stop state in firmware

2020-04-29 Thread Abhishek

Hi Nick,

Have you posted the kernel side of the "OPAL v4" patchset?
I could only find the OPAL patchset.

Thanks,
Abhishek

On 04/28/2020 06:38 AM, Nicholas Piggin wrote:

Thanks for picking this up and pushing it along. I do plan to come back
and take another look at it all, but what we do need to do first is get
a coherent approach to this proposed new calling convention and OS ops.

It's fine to work on this in the meantime, but to start merging things
my idea is:

- OPAL must leave r13-r15 untouched for the OS.
- OS ops are made available only for a "v4" OS that uses the new
   calling convention, including kernel stack.
- OS ops baseline (all OSes must provide) will be console / printk
   facility, trap handling and crash/symbol decoding on behalf of OPAL,
   and runtime virtual memory.

Other OS ops features can be added in the versioned structure, including
this.

I'm trying to get back to cleaning these things up and start getting
them merged now. Any comments or review on those would be helpful.

Thanks,
Nick





[RFC 3/3] powernv/cpuidle : Introduce capability for firmware-enabled-stop

2020-04-26 Thread Abhishek Goel
This patch introduces the capability for firmware to handle the stop
states instead of the kernel. A bit is set based on discovery of the
feature, which also transfers the responsibility for handling the stop
states.

If the kernel does not know about the stop version, it can fall back to
OPAL for idle stop support when the firmware-stop-supported property is
present.

Earlier this patch was posted as part of this series :
https://lkml.org/lkml/2020/3/4/589

Signed-off-by: Pratik Rajesh Sampat 
Signed-off-by: Abhishek Goel 
---

 v1->v2: This patch is newly added in this series.

 arch/powerpc/include/asm/processor.h  |  1 +
 arch/powerpc/kernel/dt_cpu_ftrs.c |  8 
 arch/powerpc/platforms/powernv/idle.c | 27 ---
 3 files changed, 25 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/include/asm/processor.h 
b/arch/powerpc/include/asm/processor.h
index 66fa20476d0e..d5c6532b11ef 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -430,6 +430,7 @@ extern unsigned long cpuidle_disable;
 enum idle_boot_override {IDLE_NO_OVERRIDE = 0, IDLE_POWERSAVE_OFF};
 
 #define STOP_ENABLE0x0001
+#define FIRMWARE_STOP_ENABLE   0x0010
 
 #define STOP_VERSION_P9   0x1
 
diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c 
b/arch/powerpc/kernel/dt_cpu_ftrs.c
index db1a525e090d..ff4a87b79702 100644
--- a/arch/powerpc/kernel/dt_cpu_ftrs.c
+++ b/arch/powerpc/kernel/dt_cpu_ftrs.c
@@ -298,6 +298,13 @@ static int __init feat_enable_idle_stop(struct 
dt_cpu_feature *f)
return 1;
 }
 
+static int __init feat_enable_firmware_stop(struct dt_cpu_feature *f)
+{
+   stop_dep.cpuidle_prop |= FIRMWARE_STOP_ENABLE;
+
+   return 1;
+}
+
 static int __init feat_enable_mmu_hash(struct dt_cpu_feature *f)
 {
u64 lpcr;
@@ -592,6 +599,7 @@ static struct dt_cpu_feature_match __initdata
{"idle-nap", feat_enable_idle_nap, 0},
{"alignment-interrupt-dsisr", feat_enable_align_dsisr, 0},
{"idle-stop", feat_enable_idle_stop, 0},
+   {"firmware-stop-supported", feat_enable_firmware_stop, 0},
{"machine-check-power8", feat_enable_mce_power8, 0},
{"performance-monitor-power8", feat_enable_pmu_power8, 0},
{"data-stream-control-register", feat_enable_dscr, CPU_FTR_DSCR},
diff --git a/arch/powerpc/platforms/powernv/idle.c 
b/arch/powerpc/platforms/powernv/idle.c
index 538f0842ac3f..0de5de81902e 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -633,16 +633,6 @@ static unsigned long power9_idle_stop(unsigned long psscr, 
bool mmu_on)
unsigned long mmcr0 = 0;
struct p9_sprs sprs = {}; /* avoid false used-uninitialised */
bool sprs_saved = false;
-   int rc = 0;
-
-   /*
-* Kernel takes decision whether to make OPAL call or not. This logic
-* will be combined with the logic for BE opal to take decision.
-*/
-   if (firmware_stop_supported) {
-   rc = opal_cpu_idle(cpu_to_be64(__pa(&srr1)), (uint64_t) psscr);
-   goto out;
-   }
 
if (!(psscr & (PSSCR_EC|PSSCR_ESL))) {
/* EC=ESL=0 case */
@@ -835,6 +825,19 @@ static unsigned long power9_idle_stop(unsigned long psscr, 
bool mmu_on)
return srr1;
 }
 
+static unsigned long power9_firmware_idle_stop(unsigned long psscr, bool 
mmu_on)
+{
+   unsigned long srr1;
+   int rc;
+
+   rc = opal_cpu_idle(cpu_to_be64(__pa(&srr1)), (uint64_t) psscr);
+
+   if (mmu_on)
+   mtmsr(MSR_KERNEL);
+   return srr1;
+
+}
+
 #ifdef CONFIG_HOTPLUG_CPU
 static unsigned long power9_offline_stop(unsigned long psscr)
 {
@@ -1394,9 +1397,11 @@ static int __init pnv_init_idle_states(void)
!(stop_dep.cpuidle_prop & STOP_ENABLE))
goto out;
 
-   /* Check for supported version in kernel */
+   /* Check for supported version in kernel, or fall back to firmware */
if (stop_dep.stop_version & STOP_VERSION_P9) {
stop_dep.idle_stop = power9_idle_stop;
+   } else if (stop_dep.cpuidle_prop & FIRMWARE_STOP_ENABLE) {
+   stop_dep.idle_stop = power9_firmware_idle_stop;
} else {
stop_dep.idle_stop = NULL;
goto out;
-- 
2.17.1



[RFC 2/3] powernv/cpuidle : Interface for an idle-stop dependency structure

2020-04-26 Thread Abhishek Goel
This patch introduces the idea of having a dependency structure for
idle-stop. The structure encapsulates the following:
1. Bitmask for the version of idle-stop
2. Bitmask for properties like ENABLE/DISABLE
3. Function pointer which handles how stop must be invoked

This patch lays a foundation for other idle-stop versions to be added
and handled cleanly based on their specified requirements.
Currently it handles the existing "idle-stop" version by setting the
discovery bits and the function pointer.
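The intent is that a future stop version only needs a new version bit and a
handler wired into the same structure, for example (hypothetical names, not
part of this series):

#define STOP_VERSION_P10	0x2	/* hypothetical future version bit */

	/* in pnv_init_idle_states(), alongside the existing check */
	if (stop_dep.stop_version & STOP_VERSION_P9)
		stop_dep.idle_stop = power9_idle_stop;
	else if (stop_dep.stop_version & STOP_VERSION_P10)
		stop_dep.idle_stop = power10_idle_stop;	/* hypothetical */
	else
		stop_dep.idle_stop = NULL;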

Earlier this patch was posted as part of this series :
https://lkml.org/lkml/2020/3/4/589

Signed-off-by: Pratik Rajesh Sampat 
Signed-off-by: Abhishek Goel 
---

 v1->v2: This patch is newly added in this series.

 arch/powerpc/include/asm/processor.h  | 17 +
 arch/powerpc/kernel/dt_cpu_ftrs.c |  5 +
 arch/powerpc/platforms/powernv/idle.c | 18 ++
 drivers/cpuidle/cpuidle-powernv.c |  3 ++-
 4 files changed, 38 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/include/asm/processor.h 
b/arch/powerpc/include/asm/processor.h
index eedcbfb9a6ff..66fa20476d0e 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -429,6 +429,23 @@ extern void power4_idle_nap(void);
 extern unsigned long cpuidle_disable;
 enum idle_boot_override {IDLE_NO_OVERRIDE = 0, IDLE_POWERSAVE_OFF};
 
+#define STOP_ENABLE0x0001
+
+#define STOP_VERSION_P9   0x1
+
+/*
+ * Classify the dependencies of the stop states
+ * @idle_stop: function handler to handle the quirk stop version
+ * @cpuidle_prop: Signify support for stop states through kernel and/or 
firmware
+ * @stop_version: Classify quirk versions for stop states
+ */
+typedef struct {
+   unsigned long (*idle_stop)(unsigned long psscr, bool mmu_on);
+   uint8_t cpuidle_prop;
+   uint8_t stop_version;
+} stop_deps_t;
+extern stop_deps_t stop_dep;
+
 extern int powersave_nap;  /* set if nap mode can be used in idle loop */
 
 extern void power7_idle_type(unsigned long type);
diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c 
b/arch/powerpc/kernel/dt_cpu_ftrs.c
index 182b4047c1ef..db1a525e090d 100644
--- a/arch/powerpc/kernel/dt_cpu_ftrs.c
+++ b/arch/powerpc/kernel/dt_cpu_ftrs.c
@@ -292,6 +292,8 @@ static int __init feat_enable_idle_stop(struct 
dt_cpu_feature *f)
lpcr |=  LPCR_PECE1;
lpcr |=  LPCR_PECE2;
mtspr(SPRN_LPCR, lpcr);
+   stop_dep.cpuidle_prop |= STOP_ENABLE;
+   stop_dep.stop_version = STOP_VERSION_P9;
 
return 1;
 }
@@ -657,6 +659,9 @@ static void __init cpufeatures_setup_start(u32 isa)
}
 }
 
+stop_deps_t stop_dep = {NULL, 0x0, 0x0};
+EXPORT_SYMBOL(stop_dep);
+
 static bool __init cpufeatures_process_feature(struct dt_cpu_feature *f)
 {
const struct dt_cpu_feature_match *m;
diff --git a/arch/powerpc/platforms/powernv/idle.c 
b/arch/powerpc/platforms/powernv/idle.c
index 1841027b25c5..538f0842ac3f 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -842,7 +842,7 @@ static unsigned long power9_offline_stop(unsigned long 
psscr)
 
 #ifndef CONFIG_KVM_BOOK3S_HV_POSSIBLE
__ppc64_runlatch_off();
-   srr1 = power9_idle_stop(psscr, true);
+   srr1 = stop_dep.idle_stop(psscr, true);
__ppc64_runlatch_on();
 #else
/*
@@ -858,7 +858,7 @@ static unsigned long power9_offline_stop(unsigned long 
psscr)
local_paca->kvm_hstate.hwthread_state = KVM_HWTHREAD_IN_IDLE;
 
__ppc64_runlatch_off();
-   srr1 = power9_idle_stop(psscr, false);
+   srr1 = stop_dep.idle_stop(psscr, true);
__ppc64_runlatch_on();
 
local_paca->kvm_hstate.hwthread_state = KVM_HWTHREAD_IN_KERNEL;
@@ -886,7 +886,7 @@ void power9_idle_type(unsigned long stop_psscr_val,
psscr = (psscr & ~stop_psscr_mask) | stop_psscr_val;
 
__ppc64_runlatch_off();
-   srr1 = power9_idle_stop(psscr, true);
+   srr1 = stop_dep.idle_stop(psscr, true);
__ppc64_runlatch_on();
 
fini_irq_for_idle_irqsoff();
@@ -1390,8 +1390,18 @@ static int __init pnv_init_idle_states(void)
nr_pnv_idle_states = 0;
supported_cpuidle_states = 0;
 
-   if (cpuidle_disable != IDLE_NO_OVERRIDE)
+   if (cpuidle_disable != IDLE_NO_OVERRIDE ||
+   !(stop_dep.cpuidle_prop & STOP_ENABLE))
goto out;
+
+   /* Check for supported version in kernel */
+   if (stop_dep.stop_version & STOP_VERSION_P9) {
+   stop_dep.idle_stop = power9_idle_stop;
+   } else {
+   stop_dep.idle_stop = NULL;
+   goto out;
+   }
+
rc = pnv_parse_cpuidle_dt();
if (rc)
return rc;
diff --git a/drivers/cpuidle/cpuidle-powernv.c 
b/drivers/cpuidle/cpuidle-powernv.c
index 1b299e801f74..7430a8edf5c9 100644
--- a/drivers/cpuidle/cpuidle-powernv.c
+++ b/drivers/cpuidle/cpuidle-powernv.c
@@ -371,7 +371,8 @@ sta

[RFC 1/3] powernv/cpuidle : Support for pre-entry and post exit of stop state in firmware

2020-04-26 Thread Abhishek Goel
This patch provides the kernel framework for OPAL support of save/restore
of SPRs in the idle stop loop. OPAL support for stop states is needed to
selectively enable stop states or to quickly introduce a quirk in case
a buggy stop state is present.

We make an OPAL call from the kernel if firmware stop support for stop
states is present and enabled. All the quirks for stop-state pre-entry
are handled inside OPAL. A call is then made from OPAL back into the
kernel, where we execute stop after saving the NVGPRs.
After waking up at the 0x100 vector in the kernel, we re-enter OPAL.
Any quirks in the post-exit path are then handled in OPAL, from where
we return back to the kernel.
For deep stop states in which additional SPRs are lost, the saving and
restoration will be done in OPAL.

This idea was first proposed by Nick here:
https://patchwork.ozlabs.org/patch/1208159/

The corresponding skiboot patch for this kernel patch is here:
https://patchwork.ozlabs.org/project/skiboot/list/?series=172831

When OPAL calls back into the kernel, r13 is clobbered. So, to access
the PACA we need to restore it from HSPRG0. In the future we can
handle this in OPAL, as in:
https://patchwork.ozlabs.org/patch/1245275/

Signed-off-by: Abhishek Goel 
Signed-off-by: Nicholas Piggin 
---

 v1->v2 : No change in this patch.

 arch/powerpc/include/asm/opal-api.h|  8 -
 arch/powerpc/include/asm/opal.h|  3 ++
 arch/powerpc/kernel/idle_book3s.S  |  5 +++
 arch/powerpc/platforms/powernv/idle.c  | 37 ++
 arch/powerpc/platforms/powernv/opal-call.c |  2 ++
 5 files changed, 54 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/opal-api.h 
b/arch/powerpc/include/asm/opal-api.h
index c1f25a760eb1..a2c782c99c9e 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -214,7 +214,9 @@
 #define OPAL_SECVAR_GET176
 #define OPAL_SECVAR_GET_NEXT   177
 #define OPAL_SECVAR_ENQUEUE_UPDATE 178
-#define OPAL_LAST  178
+#define OPAL_REGISTER_OS_OPS   181
+#define OPAL_CPU_IDLE  182
+#define OPAL_LAST  182
 
 #define QUIESCE_HOLD   1 /* Spin all calls at entry */
 #define QUIESCE_REJECT 2 /* Fail all calls with OPAL_BUSY */
@@ -1181,6 +1183,10 @@ struct opal_mpipl_fadump {
struct  opal_mpipl_region region[];
 } __packed;
 
+struct opal_os_ops {
+   __be64 os_idle_stop;
+};
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* __OPAL_API_H */
diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index 9986ac34b8e2..3c340bc4df8e 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -400,6 +400,9 @@ void opal_powercap_init(void);
 void opal_psr_init(void);
 void opal_sensor_groups_init(void);
 
+extern int64_t opal_register_os_ops(struct opal_os_ops *os_ops);
+extern int64_t opal_cpu_idle(__be64 srr1_addr, uint64_t psscr);
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* _ASM_POWERPC_OPAL_H */
diff --git a/arch/powerpc/kernel/idle_book3s.S 
b/arch/powerpc/kernel/idle_book3s.S
index 22f249b6f58d..8d287d1d06c0 100644
--- a/arch/powerpc/kernel/idle_book3s.S
+++ b/arch/powerpc/kernel/idle_book3s.S
@@ -49,6 +49,8 @@ _GLOBAL(isa300_idle_stop_noloss)
  */
 _GLOBAL(isa300_idle_stop_mayloss)
mtspr   SPRN_PSSCR,r3
+   mr  r6, r13
+   mfspr   r13, SPRN_HSPRG0
std r1,PACAR1(r13)
mflrr4
mfcrr5
@@ -74,6 +76,7 @@ _GLOBAL(isa300_idle_stop_mayloss)
std r31,-8*18(r1)
std r4,-8*19(r1)
std r5,-8*20(r1)
+   std r6,-8*21(r1)
/* 168 bytes */
PPC_STOP
b   .   /* catch bugs */
@@ -91,8 +94,10 @@ _GLOBAL(idle_return_gpr_loss)
ld  r1,PACAR1(r13)
ld  r4,-8*19(r1)
ld  r5,-8*20(r1)
+   ld  r6,-8*21(r1)
mtlrr4
mtcrr5
+   mr  r13,r6
/*
 * KVM nap requires r2 to be saved, rather than just restoring it
 * from PACATOC. This could be avoided for that less common case
diff --git a/arch/powerpc/platforms/powernv/idle.c 
b/arch/powerpc/platforms/powernv/idle.c
index 78599bca66c2..1841027b25c5 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -35,6 +35,7 @@
 static u32 supported_cpuidle_states;
 struct pnv_idle_states_t *pnv_idle_states;
 int nr_pnv_idle_states;
+static bool firmware_stop_supported;
 
 /*
  * The default stop state that will be used by ppc_md.power_save
@@ -602,6 +603,25 @@ struct p9_sprs {
u64 uamor;
 };
 
+/*
+ * This function is called from OPAL if firmware support for stop
+ * states is present and enabled. It provides a fallback for idle
+ * stop states via OPAL.
+ */
+static uint64_t os_idle_stop(uint64_t psscr, bool save_gprs)
+{
+   /*
+* For l

[RFC] cpuidle/powernv : Support for pre-entry and post exit of stop state in firmware

2020-04-03 Thread Abhishek Goel
This patch provides the kernel framework for OPAL support of save/restore
of SPRs in the idle stop loop. OPAL support for stop states is needed to
selectively enable stop states or to quickly introduce a quirk in case
a buggy stop state is present.

We make an OPAL call from the kernel if firmware stop support for stop
states is present and enabled. All the quirks for stop-state pre-entry
are handled inside OPAL. A call is then made from OPAL back into the
kernel, where we execute stop after saving the NVGPRs.
After waking up at the 0x100 vector in the kernel, we re-enter OPAL.
Any quirks in the post-exit path are then handled in OPAL, from where
we return back to the kernel.
For deep stop states in which additional SPRs are lost, the saving and
restoration will be done in OPAL.

This idea was first proposed by Nick here:
https://patchwork.ozlabs.org/patch/1208159/

The corresponding skiboot patch for this kernel patch is here:
https://patchwork.ozlabs.org/patch/1265959/

When OPAL calls back into the kernel, r13 is clobbered. So, to access
the PACA we need to restore it from HSPRG0. In the future we can
handle this in OPAL, as in:
https://patchwork.ozlabs.org/patch/1245275/

Signed-off-by: Abhishek Goel 
Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/opal-api.h|  8 -
 arch/powerpc/include/asm/opal.h|  3 ++
 arch/powerpc/kernel/idle_book3s.S  |  5 +++
 arch/powerpc/platforms/powernv/idle.c  | 37 ++
 arch/powerpc/platforms/powernv/opal-call.c |  2 ++
 5 files changed, 54 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/opal-api.h 
b/arch/powerpc/include/asm/opal-api.h
index c1f25a760eb1..a2c782c99c9e 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -214,7 +214,9 @@
 #define OPAL_SECVAR_GET176
 #define OPAL_SECVAR_GET_NEXT   177
 #define OPAL_SECVAR_ENQUEUE_UPDATE 178
-#define OPAL_LAST  178
+#define OPAL_REGISTER_OS_OPS   181
+#define OPAL_CPU_IDLE  182
+#define OPAL_LAST  182
 
 #define QUIESCE_HOLD   1 /* Spin all calls at entry */
 #define QUIESCE_REJECT 2 /* Fail all calls with OPAL_BUSY */
@@ -1181,6 +1183,10 @@ struct opal_mpipl_fadump {
struct  opal_mpipl_region region[];
 } __packed;
 
+struct opal_os_ops {
+   __be64 os_idle_stop;
+};
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* __OPAL_API_H */
diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index 9986ac34b8e2..3c340bc4df8e 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -400,6 +400,9 @@ void opal_powercap_init(void);
 void opal_psr_init(void);
 void opal_sensor_groups_init(void);
 
+extern int64_t opal_register_os_ops(struct opal_os_ops *os_ops);
+extern int64_t opal_cpu_idle(__be64 srr1_addr, uint64_t psscr);
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* _ASM_POWERPC_OPAL_H */
diff --git a/arch/powerpc/kernel/idle_book3s.S 
b/arch/powerpc/kernel/idle_book3s.S
index 22f249b6f58d..8d287d1d06c0 100644
--- a/arch/powerpc/kernel/idle_book3s.S
+++ b/arch/powerpc/kernel/idle_book3s.S
@@ -49,6 +49,8 @@ _GLOBAL(isa300_idle_stop_noloss)
  */
 _GLOBAL(isa300_idle_stop_mayloss)
mtspr   SPRN_PSSCR,r3
+   mr  r6, r13
+   mfspr   r13, SPRN_HSPRG0
std r1,PACAR1(r13)
mflrr4
mfcrr5
@@ -74,6 +76,7 @@ _GLOBAL(isa300_idle_stop_mayloss)
std r31,-8*18(r1)
std r4,-8*19(r1)
std r5,-8*20(r1)
+   std r6,-8*21(r1)
/* 168 bytes */
PPC_STOP
b   .   /* catch bugs */
@@ -91,8 +94,10 @@ _GLOBAL(idle_return_gpr_loss)
ld  r1,PACAR1(r13)
ld  r4,-8*19(r1)
ld  r5,-8*20(r1)
+   ld  r6,-8*21(r1)
mtlrr4
mtcrr5
+   mr  r13,r6
/*
 * KVM nap requires r2 to be saved, rather than just restoring it
 * from PACATOC. This could be avoided for that less common case
diff --git a/arch/powerpc/platforms/powernv/idle.c 
b/arch/powerpc/platforms/powernv/idle.c
index 78599bca66c2..1841027b25c5 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -35,6 +35,7 @@
 static u32 supported_cpuidle_states;
 struct pnv_idle_states_t *pnv_idle_states;
 int nr_pnv_idle_states;
+static bool firmware_stop_supported;
 
 /*
  * The default stop state that will be used by ppc_md.power_save
@@ -602,6 +603,25 @@ struct p9_sprs {
u64 uamor;
 };
 
+/*
+ * This function is called from OPAL if firmware support for stop
+ * states is present and enabled. It provides a fallback for idle
+ * stop states via OPAL.
+ */
+static uint64_t os_idle_stop(uint64_t psscr, bool save_gprs)
+{
+   /*
+* For lite state which does not lose even GPRS we call

[RFC v5 3/3] cpuidle-powernv : Recompute the idle-state timeouts when state usage is enabled/disabled

2019-10-03 Thread Abhishek Goel
The disable callback can be used to compute the timeout for other states
whenever a state is enabled or disabled. We store the computed timeout
in "timeout", defined in the cpuidle state structure. Thus, we compute
the timeout only when some state is enabled or disabled, and not every
time in the fast idle path.
We also use the computed timeout to derive the timeout for snooze, thus
getting rid of get_snooze_timeout() for the snooze loop.

Signed-off-by: Abhishek Goel 
---
 drivers/cpuidle/cpuidle-powernv.c | 35 +++
 include/linux/cpuidle.h   |  1 +
 2 files changed, 13 insertions(+), 23 deletions(-)

diff --git a/drivers/cpuidle/cpuidle-powernv.c 
b/drivers/cpuidle/cpuidle-powernv.c
index d7686ce6e..a75226f52 100644
--- a/drivers/cpuidle/cpuidle-powernv.c
+++ b/drivers/cpuidle/cpuidle-powernv.c
@@ -45,7 +45,6 @@ struct stop_psscr_table {
 static struct stop_psscr_table stop_psscr_table[CPUIDLE_STATE_MAX] 
__read_mostly;
 
 static u64 default_snooze_timeout __read_mostly;
-static bool snooze_timeout_en __read_mostly;
 
 static u64 forced_wakeup_timeout(struct cpuidle_device *dev,
 struct cpuidle_driver *drv,
@@ -67,26 +66,13 @@ static u64 forced_wakeup_timeout(struct cpuidle_device *dev,
return 0;
 }
 
-static u64 get_snooze_timeout(struct cpuidle_device *dev,
- struct cpuidle_driver *drv,
- int index)
+static void pnv_disable_callback(struct cpuidle_device *dev,
+struct cpuidle_driver *drv)
 {
int i;
 
-   if (unlikely(!snooze_timeout_en))
-   return default_snooze_timeout;
-
-   for (i = index + 1; i < drv->state_count; i++) {
-   struct cpuidle_state *s = &drv->states[i];
-   struct cpuidle_state_usage *su = &dev->states_usage[i];
-
-   if (s->disabled || su->disable)
-   continue;
-
-   return s->target_residency * tb_ticks_per_usec;
-   }
-
-   return default_snooze_timeout;
+   for (i = 0; i < drv->state_count; i++)
+   drv->states[i].timeout = forced_wakeup_timeout(dev, drv, i);
 }
 
 static int snooze_loop(struct cpuidle_device *dev,
@@ -94,16 +80,20 @@ static int snooze_loop(struct cpuidle_device *dev,
int index)
 {
u64 snooze_exit_time;
+   u64 snooze_timeout = drv->states[index].timeout;
+
+   if (!snooze_timeout)
+   snooze_timeout = default_snooze_timeout;
 
set_thread_flag(TIF_POLLING_NRFLAG);
 
local_irq_enable();
 
-   snooze_exit_time = get_tb() + get_snooze_timeout(dev, drv, index);
+   snooze_exit_time = get_tb() + snooze_timeout;
ppc64_runlatch_off();
HMT_very_low();
while (!need_resched()) {
-   if (likely(snooze_timeout_en) && get_tb() > snooze_exit_time) {
+   if (get_tb() > snooze_exit_time) {
/*
 * Task has not woken up but we are exiting the polling
 * loop anyway. Require a barrier after polling is
@@ -168,7 +158,7 @@ static int stop_loop(struct cpuidle_device *dev,
u64 timeout_tb;
bool forced_wakeup = false;
 
-   timeout_tb = forced_wakeup_timeout(dev, drv, index);
+   timeout_tb = drv->states[index].timeout;
 
if (timeout_tb) {
/* Ensure that the timeout is at least one microsecond
@@ -263,6 +253,7 @@ static int powernv_cpuidle_driver_init(void)
 */
 
drv->cpumask = (struct cpumask *)cpu_present_mask;
+   drv->disable_callback = pnv_disable_callback;
 
return 0;
 }
@@ -422,8 +413,6 @@ static int powernv_idle_probe(void)
/* Device tree can indicate more idle states */
max_idle_state = powernv_add_idle_states();
default_snooze_timeout = TICK_USEC * tb_ticks_per_usec;
-   if (max_idle_state > 1)
-   snooze_timeout_en = true;
} else
return -ENODEV;
 
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index 1729a497b..64195861b 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -50,6 +50,7 @@ struct cpuidle_state {
int power_usage; /* in mW */
unsigned inttarget_residency; /* in US */
booldisabled; /* disabled on all CPUs */
+   unsigned long long timeout; /* timeout for exiting out of a state */
 
int (*enter)(struct cpuidle_device *dev,
struct cpuidle_driver *drv,
-- 
2.17.1



[RFC v5 2/3] cpuidle : Add callback whenever a state usage is enabled/disabled

2019-10-03 Thread Abhishek Goel
To force wakeup of a CPU, we need to compute the timeout in the fast idle
path, since a state may be enabled or disabled, but there was no
feedback to the driver when a state was enabled or disabled.
This patch adds a callback that is invoked whenever a state_usage
records a store to the disable attribute.

Signed-off-by: Abhishek Goel 
---
 drivers/cpuidle/sysfs.c | 15 ++-
 include/linux/cpuidle.h |  3 +++
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/drivers/cpuidle/sysfs.c b/drivers/cpuidle/sysfs.c
index 2bb2683b4..6c9bf2f7b 100644
--- a/drivers/cpuidle/sysfs.c
+++ b/drivers/cpuidle/sysfs.c
@@ -418,8 +418,21 @@ static ssize_t cpuidle_state_store(struct kobject *kobj, 
struct attribute *attr,
struct cpuidle_state_attr *cattr = attr_to_stateattr(attr);
struct cpuidle_device *dev = kobj_to_device(kobj);
 
-   if (cattr->store)
+   if (cattr->store) {
ret = cattr->store(state, state_usage, buf, size);
+   if (ret == size &&
+   strncmp(cattr->attr.name, "disable",
+   strlen("disable"))) {
+   struct kobject *cpuidle_kobj = kobj->parent;
+   struct cpuidle_device *dev =
+   to_cpuidle_device(cpuidle_kobj);
+   struct cpuidle_driver *drv =
+   cpuidle_get_cpu_driver(dev);
+
+   if (drv->disable_callback)
+   drv->disable_callback(dev, drv);
+   }
+   }
 
/* reset poll time cache */
dev->poll_limit_ns = 0;
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index 4b6b5bea8..1729a497b 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -122,6 +122,9 @@ struct cpuidle_driver {
/* the driver handles the cpus in cpumask */
struct cpumask  *cpumask;
 
+   void (*disable_callback)(struct cpuidle_device *dev,
+struct cpuidle_driver *drv);
+
/* preferred governor to switch at register time */
const char  *governor;
 };
-- 
2.17.1



[PATCH v5 1/3] cpuidle-powernv : forced wakeup for stop states

2019-10-03 Thread Abhishek Goel
Currently, the cpuidle governors determine what idle state an idling CPU
should enter based on heuristics that depend on the idle history of
that CPU. Given that no predictive heuristic is perfect, there are cases
where the governor predicts a shallow idle state, hoping that the CPU will
be busy soon. However, if no new workload is scheduled on that CPU in the
near future, the CPU may remain stuck in the shallow state.

This is problematic when the predicted state in the aforementioned
scenario is a shallow stop state on a tickless system, as we might get
stuck in shallow states for hours in the absence of ticks or interrupts.

To address this, we forcefully wake up the CPU by setting the
decrementer. The decrementer is set to a value that corresponds to the
residency of the next available state, thus arming a timer that will
forcefully wake up the CPU. A few such iterations will essentially train
the governor to select a deeper state for that CPU, as the timer here
corresponds to the next available cpuidle state's residency. Thus, the
CPU will eventually end up in the deepest possible state.
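In the driver, the new helpers bracket the stop entry roughly as follows (a
sketch assembled from this patch and patch 3/3 of the series, not the
verbatim hunk):

static int stop_loop(struct cpuidle_device *dev,
		     struct cpuidle_driver *drv,
		     int index)
{
	u64 timeout_tb = forced_wakeup_timeout(dev, drv, index);
	bool forced_wakeup = false;

	/* Only reprogrammed if it would fire before the next timer event. */
	if (timeout_tb)
		forced_wakeup = try_set_dec_before_idle(timeout_tb);

	power9_idle_type(stop_psscr_table[index].val,
			 stop_psscr_table[index].mask);

	if (forced_wakeup)
		try_reset_dec_after_idle();

	return index;
}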

Signed-off-by: Abhishek Goel 
---

Auto-promotion
  v1 : started as auto promotion logic for cpuidle states in generic
driver
  v2 : Removed timeout_needed and rebased the code to upstream kernel
Forced-wakeup
  v1 : New patch with name of forced wakeup started
  v2 : Extending the forced wakeup logic for all states. Setting the
decrementer instead of queuing up a hrtimer to implement the logic.
  v3 : Cleanly handle setting/resetting of decrementer so as to not break
irq work
  v4 : Changed type and name of set/reset decrementer function
   Handled irq_work_pending in try_set_dec_before_idle
  v5 : Removed forced wakeup for last stop state by changing the
   checking condition of timeout_tb

 arch/powerpc/include/asm/time.h   |  2 ++
 arch/powerpc/kernel/time.c| 43 +++
 drivers/cpuidle/cpuidle-powernv.c | 40 
 3 files changed, 85 insertions(+)

diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h
index 08dbe3e68..06a6a2314 100644
--- a/arch/powerpc/include/asm/time.h
+++ b/arch/powerpc/include/asm/time.h
@@ -184,6 +184,8 @@ static inline unsigned long tb_ticks_since(unsigned long 
tstamp)
 extern u64 mulhdu(u64, u64);
 #endif
 
+extern bool try_set_dec_before_idle(u64 timeout);
+extern void try_reset_dec_after_idle(void);
 extern void div128_by_32(u64 dividend_high, u64 dividend_low,
 unsigned divisor, struct div_result *dr);
 
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 694522308..d004c0d8e 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -576,6 +576,49 @@ void arch_irq_work_raise(void)
 
 #endif /* CONFIG_IRQ_WORK */
 
+/*
+ * This function tries setting decrementer before entering into idle.
+ * Returns true if we have reprogrammed the decrementer for idle.
+ * Returns false if the decrementer is unchanged.
+ */
+bool try_set_dec_before_idle(u64 timeout)
+{
+   u64 *next_tb = this_cpu_ptr(&decrementers_next_tb);
+   u64 now = get_tb_or_rtc();
+
+   if (now + timeout > *next_tb)
+   return false;
+
+   set_dec(timeout);
+   if (test_irq_work_pending())
+   set_dec(1);
+
+   return true;
+}
+
+/*
+ * This function gets called if we have set decrementer before
+ * entering into idle. It tries to reset/restore the decrementer
+ * to its original value.
+ */
+void try_reset_dec_after_idle(void)
+{
+   u64 now;
+   u64 *next_tb;
+
+   if (test_irq_work_pending())
+   return;
+
+   now = get_tb_or_rtc();
+   next_tb = this_cpu_ptr(&decrementers_next_tb);
+   if (now >= *next_tb)
+   return;
+
+   set_dec(*next_tb - now);
+   if (test_irq_work_pending())
+   set_dec(1);
+}
+
 /*
  * timer_interrupt - gets called when the decrementer overflows,
  * with interrupts disabled.
diff --git a/drivers/cpuidle/cpuidle-powernv.c 
b/drivers/cpuidle/cpuidle-powernv.c
index 84b1ebe21..d7686ce6e 100644
--- a/drivers/cpuidle/cpuidle-powernv.c
+++ b/drivers/cpuidle/cpuidle-powernv.c
@@ -21,6 +21,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /*
  * Expose only those Hardware idle states via the cpuidle framework
@@ -46,6 +47,26 @@ static struct stop_psscr_table 
stop_psscr_table[CPUIDLE_STATE_MAX] __read_mostly
 static u64 default_snooze_timeout __read_mostly;
 static bool snooze_timeout_en __read_mostly;
 
+static u64 forced_wakeup_timeout(struct cpuidle_device *dev,
+struct cpuidle_driver *drv,
+int index)
+{
+   int i;
+
+   for (i = index + 1; i < drv->state_count; i++) {
+   struct cpuidle_state *s = &drv->states[i];
+   struct cpuidle_state_usage *su = &dev->states_usage[i];
+
+   if (s->disabled || su->disable)
+   continue;

[PATCH v5 0/3] Forced-wakeup for stop states on Powernv

2019-10-03 Thread Abhishek Goel
Currently, the cpuidle governors determine what idle state an idling CPU
should enter based on heuristics that depend on the idle history of
that CPU. Given that no predictive heuristic is perfect, there are cases
where the governor predicts a shallow idle state, hoping that the CPU will
be busy soon. However, if no new workload is scheduled on that CPU in the
near future, the CPU will end up stuck in the shallow state.

Motivation
--
In the case of POWER, this is problematic when the predicted state in the
aforementioned scenario is a shallow stop state on a tickless system, as
we might get stuck in shallow states even for hours, in the absence of
ticks or interrupts.

To address this, we forcefully wake up the CPU by setting the decrementer.
The decrementer is set to a value that corresponds to the residency of
the next available state, thus arming a timer that will forcefully wake
up the CPU. A few such iterations will essentially train the governor to
select a deeper state for that CPU, as the timer here corresponds to the
next available cpuidle state's residency. Thus, the CPU will eventually
end up in the deepest possible state and we won't get stuck in a shallow
state for a long duration.

Experiment
--
For earlier versions when this feature was meat to be only for shallow lite
states, I performed experiments for three scenarios to collect some data.

case 1 :
Without this patch and without tick retained, i.e. in a upstream kernel,
It would spend more than even a second to get out of stop0_lite.

case 2 : With tick retained in a upstream kernel -

Generally, we have a sched tick at 4ms (CONFIG_HZ = 250). Ideally I expected
it to take 8 sched ticks to get out of stop0_lite. Experimentally, the
observation was:

==========================================
sample    min     max      99th percentile
20        4ms     12ms     4ms
==========================================

It would take at least one sched tick to get out of stop0_lite.

case 3 : With this patch (not stopping the tick, but explicitly queuing a
         timer)


==========================================
sample    min      max       99th percentile
20        144us    192us     144us
==========================================


Description of current implementation
-

We calculate the timeout for the current idle state as the residency value
of the next available idle state. If the decrementer is set to fire later
than this timeout, we update the decrementer value with the residency of
the next available idle state, essentially training the governor to select
the next available deeper state until we reach the deepest state. Hence, we
won't get stuck unnecessarily in shallow states for long durations.
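
Putting the pieces together, the flow looks roughly like the sketch below.
The helper names (the precomputed per-state timeout, try_set_dec_before_idle
and try_reset_dec_after_idle) come from the patches in this series; the
surrounding driver code is abbreviated, so treat this as an illustration
rather than the literal stop_loop() from the diffs.

/* Illustrative sketch only: the real stop_loop() also ensures the timeout
 * exceeds the current decrementer value by at least a microsecond. */
static int stop_loop(struct cpuidle_device *dev,
		     struct cpuidle_driver *drv,
		     int index)
{
	/* Timeout precomputed by the disable callback: residency-based value
	 * of the next enabled (deeper) state, 0 if we are in the deepest. */
	u64 timeout_tb = drv->states[index].timeout;
	bool forced_wakeup = false;

	if (timeout_tb)
		forced_wakeup = try_set_dec_before_idle(timeout_tb);

	power9_idle_type(stop_psscr_table[index].val,
			 stop_psscr_table[index].mask);

	if (forced_wakeup)
		try_reset_dec_after_idle();

	return index;
}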


v1 of auto-promotion : https://lkml.org/lkml/2019/3/22/58 This patch was
implemented only for shallow lite state in generic cpuidle driver.

v2 : Removed timeout_needed and rebased to current
upstream kernel

Then,
v1 of forced-wakeup : Moved the code to cpuidle powernv driver and started
as forced wakeup instead of auto-promotion

v2 : Extended the forced wakeup logic for all states.
Setting the decrementer instead of queuing up a hrtimer to implement the
logic.

v3 : 1) Cleanly handle setting the decrementer after exiting out of stop
   states.
 2) Added a disable_callback feature to compute the timeout whenever a
state is enabled or disabled, instead of computing it every time in the
fast idle path.
 3) Use disable callback to recompute timeout whenever state usage
is changed for a state. Also, cleaned up the get_snooze_timeout
function.

v4 :Changed the type and name of set/reset decrementer function.
Handled irq work pending in try_set_dec_before_idle.
No change in patch 2 and 3.

v5 : Removed forced wakeup for the last state. We don't want to wake up
unnecessarily when already in the deepest state. It was a mistake in
previous patches that was found out in recent experiments.
No change in patch 2 and 3.

Abhishek Goel (3):
  cpuidle-powernv : forced wakeup for stop states
  cpuidle : Add callback whenever a state usage is enabled/disabled
  cpuidle-powernv : Recompute the idle-state timeouts when state usage
is enabled/disabled

 arch/powerpc/include/asm/time.h   |  2 ++
 arch/powerpc/kernel/time.c| 43 
 drivers/cpuidle/cpuidle-powernv.c | 55 +++
 drivers/cpuidle/sysfs.c   | 15 -
 include/linux/cpuidle.h   |  4 +++
 5 files changed, 105 insertions(+), 14 deletions(-)

-- 
2.17.1



[RFC 3/3] cpuidle/powernv : Add flags to identify stop state type

2019-08-23 Thread Abhishek Goel
Removed the threshold latency which was being used to decide if a state
is of cpuidle type or not. This decision can now be taken using flags, as
this information is encapsulated in state->flags and is read from the
idle device tree.

Signed-off-by: Abhishek Goel 
---
 arch/powerpc/include/asm/opal-api.h |  7 +++
 drivers/cpuidle/cpuidle-powernv.c   | 16 +---
 2 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/include/asm/opal-api.h 
b/arch/powerpc/include/asm/opal-api.h
index 383242eb0dea..b9068fee21d8 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -233,6 +233,13 @@
 #define OPAL_PM_STOP_INST_FAST 0x0010
 #define OPAL_PM_STOP_INST_DEEP 0x0020
 
+/*
+ * Flags for stop states to distinguish between cpuidle and
+ * cpuoffline type of states.
+ */
+#define OPAL_PM_STOP_CPUIDLE   0x0100
+#define OPAL_PM_STOP_CPUHOTPLUG0x0200
+
 /*
  * OPAL_CONFIG_CPU_IDLE_STATE parameters
  */
diff --git a/drivers/cpuidle/cpuidle-powernv.c 
b/drivers/cpuidle/cpuidle-powernv.c
index 1b6c84d4ac77..1a33a548b769 100644
--- a/drivers/cpuidle/cpuidle-powernv.c
+++ b/drivers/cpuidle/cpuidle-powernv.c
@@ -270,8 +270,13 @@ static int powernv_add_idle_states(void)
goto out;
}
 
-   /* TODO: Count only states which are eligible for cpuidle */
-   dt_idle_states = nr_pnv_idle_states;
+   /* Count only cpuidle states*/
+   for (i = 0; i < nr_pnv_idle_states; i++) {
+   if (pnv_idle_states[i].flags & OPAL_PM_STOP_CPUIDLE)
+   dt_idle_states++;
+   }
+   pr_info("idle states in dt = %d , states with idle flag = %d",
+   nr_pnv_idle_states, dt_idle_states);
 
/*
 * Since snooze is used as first idle state, max idle states allowed is
@@ -300,11 +305,8 @@ static int powernv_add_idle_states(void)
continue;
}
 
-   /*
-* If an idle state has exit latency beyond
-* POWERNV_THRESHOLD_LATENCY_NS then don't use it in cpu-idle.
-*/
-   if (state->latency_ns > POWERNV_THRESHOLD_LATENCY_NS) {
+   /* Check whether a state is of cpuidle type */
+   if ((state->flags & OPAL_PM_STOP_CPUIDLE) != state->flags) {
pr_info("State %d is not idletype\n", i);
continue;
}
-- 
2.17.1



[RFC 2/3] cpuidle/powernv: Add support for versioned stop states

2019-08-23 Thread Abhishek Goel
The new device tree format adds a compatible string which carries a version
corresponding to every state, so that only a kernel which has the capability
to handle that version of the stop state will enable it. The drawback of the
array-based DT node is that versioning of idle states is not possible.

Older kernels will still see stop0 and stop0_lite in the older format, and
we will deprecate it after some time.

Consider a case where stop4 has a bug. We take the following steps to
mitigate the problem.

1) Change the compatible string for stop4 in OPAL to "stop4,v2" from
"stop4,v1", i.e. basically bump up the previous version, and ship the
new firmware.
2) The kernel will ignore stop4 as it won't be able to recognize this
new version. The kernel will also ignore all the deeper states, because it
is possible that a CPU has requested a deeper state but was never able to
enter it. But we will still have the shallower states that come before
stop4. This thus prevents stop states from being disabled completely.

The Linux kernel can now look at the version string and decide if it has the
ability to handle that idle state. Henceforth, if the kernel does not know
about a version, it will skip that state and all the deeper states.

Once the workarounds are implemented in the kernel, we can bump up
the known version in the kernel for that state, so that support can be
enabled once again.
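
As a rough illustration of the matching described above, the kernel can keep
a table of the stop-state versions it knows how to handle and accept a
device-tree state only if its compatible string is found there. The
stop_version_t type appears in the idle.c diff below; the table contents,
the helper name and the shallow/deep handler names here are hypothetical.

/* Illustrative sketch with hypothetical table contents. */
static const struct stop_version_t known_versions[] = {
	{ "stop0_lite,v1", power9_stop_lite },    /* handler from the diff below */
	{ "stop4,v1",      power9_stop_shallow }, /* assumed handler name */
	{ "stop11,v1",     power9_stop_deep },    /* assumed handler name */
};

static bool pnv_match_stop_version(struct pnv_idle_states_t *state)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(known_versions); i++) {
		if (!strcmp(state->compat_version,
			    known_versions[i].compat_version)) {
			state->stop_handle = known_versions[i].stop_handle;
			return true;
		}
	}

	/* Unknown version: the caller skips this state and all deeper ones. */
	return false;
}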

New Device-tree :

Final output
   power-mgt {
...
 ibm,enabled-stop-levels = <0xec00>;
 ibm,cpu-idle-state-psscr-mask = <0x0 0x3003ff 0x0 0x3003ff>;
 ibm,cpu-idle-state-latencies-ns = <0x3e8 0x7d0>;
 ibm,cpu-idle-state-psscr = <0x0 0x330 0x0 0x300330>;
 ibm,cpu-idle-state-flags = <0x10 0x101000>;
 ibm,cpu-idle-state-residency-ns = <0x2710 0x4e20>;
 ibm,idle-states {
 stop4 {
 flags = <0x207000>;
 compatible = "stop4,v1",
 psscr-mask = <0x0 0x3003ff>;
 latency-ns = <0x186a0>;
 residency-ns = <0x989680>;
 psscr = <0x0 0x300374>;
 ...
  };
...
stop11 {
 ...
 compatible = "stop11,v1",
 ...
  };
 };

Since the state pointer is now passed to the stop loop, I have separated the
idle fast path for lite, shallow and deep states. This improves the
readability and the future maintainability of the code. The stop handler
corresponding to each state can be called directly, since the state pointer
is being passed now.
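
For illustration, the dispatch this enables looks roughly like the sketch
below; the exact wrapper in the patch may differ, and the lite/shallow/deep
handler split follows the description above.

/* Illustrative sketch only: each state carries the stop handler matched
 * for it, so idle entry becomes a direct call through the state pointer. */
static unsigned long power9_idle_stop(struct pnv_idle_states_t *state,
				      bool mmu_on)
{
	return state->stop_handle(state, mmu_on);
}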

Signed-off-by: Abhishek Goel 
---
 arch/powerpc/include/asm/cpuidle.h|   8 +-
 arch/powerpc/platforms/powernv/idle.c | 331 +++---
 2 files changed, 299 insertions(+), 40 deletions(-)

diff --git a/arch/powerpc/include/asm/cpuidle.h 
b/arch/powerpc/include/asm/cpuidle.h
index 9844b3ded187..5eb9a98fcb86 100644
--- a/arch/powerpc/include/asm/cpuidle.h
+++ b/arch/powerpc/include/asm/cpuidle.h
@@ -70,15 +70,19 @@
 
 #ifndef __ASSEMBLY__
 
-#define PNV_IDLE_NAME_LEN16
+#define PNV_VERSION_LEN25
+#define PNV_IDLE_NAME_LEN  16
 struct pnv_idle_states_t {
char name[PNV_IDLE_NAME_LEN];
+   char compat_version[PNV_VERSION_LEN];
u32 latency_ns;
u32 residency_ns;
u64 psscr_val;
u64 psscr_mask;
-   u32 flags;
+   u64 flags;
bool valid;
+   unsigned long (*stop_handle)(struct pnv_idle_states_t *state,
+bool mmu_on);
 };
 
 extern struct pnv_idle_states_t *pnv_idle_states;
diff --git a/arch/powerpc/platforms/powernv/idle.c 
b/arch/powerpc/platforms/powernv/idle.c
index db810c0a16e1..7398a66e1ddb 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -49,6 +49,12 @@ static bool default_stop_found;
 static u64 pnv_first_tb_loss_level = MAX_STOP_STATE + 1;
 static u64 pnv_first_spr_loss_level = MAX_STOP_STATE + 1;
 
+struct stop_version_t {
+   const char compat_version[PNV_VERSION_LEN];
+   unsigned long (*stop_handle)(struct pnv_idle_states_t *state,
+bool mmu_on);
+};
+
 /*
  * psscr value and mask of the deepest stop idle state.
  * Used when a cpu is offlined.
@@ -599,40 +605,152 @@ struct p9_sprs {
u64 uamor;
 };
 
-static unsigned long power9_idle_stop(unsigned long psscr, bool mmu_on)
+/* Splitting the previous power9_idle_stop into three functions
+ * to separately handle lite, shallow and deep states.
+ */
+
+static unsigned long power9_stop_lite(struct pnv_idle_states_t *state,
+ bool mmu_on)
 {
-   int cpu = raw_smp_processor_id();
-   int first = cpu_first_thread_sib

[RFC 1/3] cpuidle/powernv : Pass state pointer instead of values to stop loop

2019-08-23 Thread Abhishek Goel
Instead of passing psscr_val and psscr_mask, we now pass the state pointer
to the stop loop. This helps to figure out the method to enter/exit the idle
state. Removed psscr_mask and psscr_val from the driver idle code. The same
has also been done in the platform idle code.
Also added some cleanups and debugging info.

Signed-off-by: Abhishek Goel 
---
 arch/powerpc/include/asm/processor.h  |  5 +-
 arch/powerpc/platforms/powernv/idle.c | 50 ---
 drivers/cpuidle/cpuidle-powernv.c | 69 +--
 3 files changed, 55 insertions(+), 69 deletions(-)

diff --git a/arch/powerpc/include/asm/processor.h 
b/arch/powerpc/include/asm/processor.h
index a9993e7a443b..855666e8d271 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -39,6 +39,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /* We do _not_ want to define new machine types at all, those must die
  * in favor of using the device-tree
@@ -419,9 +420,7 @@ enum idle_boot_override {IDLE_NO_OVERRIDE = 0, 
IDLE_POWERSAVE_OFF};
 extern int powersave_nap;  /* set if nap mode can be used in idle loop */
 
 extern void power7_idle_type(unsigned long type);
-extern void power9_idle_type(unsigned long stop_psscr_val,
- unsigned long stop_psscr_mask);
-
+extern void power9_idle_type(struct pnv_idle_states_t *state);
 extern void flush_instruction_cache(void);
 extern void hard_reset_now(void);
 extern void poweroff_now(void);
diff --git a/arch/powerpc/platforms/powernv/idle.c 
b/arch/powerpc/platforms/powernv/idle.c
index 09f49eed7fb8..db810c0a16e1 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -40,8 +40,7 @@ int nr_pnv_idle_states;
  * The default stop state that will be used by ppc_md.power_save
  * function on platforms that support stop instruction.
  */
-static u64 pnv_default_stop_val;
-static u64 pnv_default_stop_mask;
+struct pnv_idle_states_t *pnv_default_state;
 static bool default_stop_found;
 
 /*
@@ -54,9 +53,7 @@ static u64 pnv_first_spr_loss_level = MAX_STOP_STATE + 1;
  * psscr value and mask of the deepest stop idle state.
  * Used when a cpu is offlined.
  */
-static u64 pnv_deepest_stop_psscr_val;
-static u64 pnv_deepest_stop_psscr_mask;
-static u64 pnv_deepest_stop_flag;
+static struct pnv_idle_states_t *pnv_deepest_state;
 static bool deepest_stop_found;
 
 static unsigned long power7_offline_type;
@@ -78,7 +75,7 @@ static int pnv_save_sprs_for_deep_states(void)
uint64_t hid5_val   = mfspr(SPRN_HID5);
uint64_t hmeer_val  = mfspr(SPRN_HMEER);
uint64_t msr_val = MSR_IDLE;
-   uint64_t psscr_val = pnv_deepest_stop_psscr_val;
+   uint64_t psscr_val = pnv_deepest_state->psscr_val;
 
for_each_present_cpu(cpu) {
uint64_t pir = get_hard_smp_processor_id(cpu);
@@ -804,9 +801,13 @@ static unsigned long power9_idle_stop(unsigned long psscr, 
bool mmu_on)
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
-static unsigned long power9_offline_stop(unsigned long psscr)
+static unsigned long power9_offline_stop(struct pnv_idle_states_t *state)
 {
unsigned long srr1;
+   unsigned long psscr;
+
+   psscr = mfspr(SPRN_PSSCR);
+   psscr = (psscr & ~state->psscr_mask) | state->psscr_val;
 
 #ifndef CONFIG_KVM_BOOK3S_HV_POSSIBLE
__ppc64_runlatch_off();
@@ -841,8 +842,7 @@ static unsigned long power9_offline_stop(unsigned long 
psscr)
 }
 #endif
 
-void power9_idle_type(unsigned long stop_psscr_val,
- unsigned long stop_psscr_mask)
+void power9_idle_type(struct pnv_idle_states_t *state)
 {
unsigned long psscr;
unsigned long srr1;
@@ -851,7 +851,7 @@ void power9_idle_type(unsigned long stop_psscr_val,
return;
 
psscr = mfspr(SPRN_PSSCR);
-   psscr = (psscr & ~stop_psscr_mask) | stop_psscr_val;
+   psscr = (psscr & ~state->psscr_mask) | state->psscr_val;
 
__ppc64_runlatch_off();
srr1 = power9_idle_stop(psscr, true);
@@ -867,7 +867,7 @@ void power9_idle_type(unsigned long stop_psscr_val,
  */
 void power9_idle(void)
 {
-   power9_idle_type(pnv_default_stop_val, pnv_default_stop_mask);
+   power9_idle_type(pnv_default_state);
 }
 
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
@@ -970,12 +970,7 @@ unsigned long pnv_cpu_offline(unsigned int cpu)
__ppc64_runlatch_off();
 
if (cpu_has_feature(CPU_FTR_ARCH_300) && deepest_stop_found) {
-   unsigned long psscr;
-
-   psscr = mfspr(SPRN_PSSCR);
-   psscr = (psscr & ~pnv_deepest_stop_psscr_mask) |
-   pnv_deepest_stop_psscr_val;
-   srr1 = power9_offline_stop(psscr);
+   srr1 = power9_offline_stop(pnv_deepest_state);
} else if (cpu_has_feature(CPU_FTR_ARCH_206) && power7_offline_type) {
srr1 = power7_offline();
} el

[RFC 0/3] New idle device-tree format and support for versioned stop state

2019-08-23 Thread Abhishek Goel
Background
--

Previously, if an older kernel ran on newer firmware, it could enable
all available states irrespective of its capability to handle them.
Consider a case where some stop state has a bug: we would end up disabling
all the stop states. This patch series introduces selective control to
solve this problem.

The previous version of these patches can be found at:
https://lkml.org/lkml/2018/10/11/544
That series, however, also had patches for OPAL save-restore support,
which I am now decoupling and will take up separately.
I have posted the corresponding skiboot patches for this kernel patchset
here : https://patchwork.ozlabs.org/cover/1144587/

What's new?


Add stop states under ibm,idle-states in addition to the current
array-based device tree properties.

The new device tree format adds a compatible string which carries a version
corresponding to every state, so that only a kernel which has the capability
to handle that version of the stop state will enable it. The drawback of the
array-based DT node is that versioning of idle states is not possible.

Older kernels will still see stop0 and stop0_lite in the older format, and
we will deprecate it after some time.

Consider a case where stop4 has a bug. We take the following steps to
mitigate the problem.

1) Change the compatible string for stop4 in OPAL to "stop4,v2" from
"stop4,v1", i.e. basically bump up the previous version, and ship the
new firmware.

2) The kernel will ignore stop4 as it won't be able to recognize this
new version. The kernel will also ignore all the deeper states, because it
is possible that a CPU has requested a deeper state but was never able to
enter it. But we will still have the shallower states that come before
stop4. This thus prevents stop states from being disabled completely.

The Linux kernel can now look at the version string and decide if it has the
ability to handle that idle state. Henceforth, if the kernel does not know
about a version, it will skip that state and all the deeper states.

Once the workarounds are implemented in the kernel, we can bump up
the known version in the kernel for that state, so that support can be
enabled once again.

New Device-tree :

Final output
   power-mgt {
...
 ibm,enabled-stop-levels = <0xec00>;
 ibm,cpu-idle-state-psscr-mask = <0x0 0x3003ff 0x0 0x3003ff>;
 ibm,cpu-idle-state-latencies-ns = <0x3e8 0x7d0>;
 ibm,cpu-idle-state-psscr = <0x0 0x330 0x0 0x300330>;
 ibm,cpu-idle-state-flags = <0x10 0x101000>;
 ibm,cpu-idle-state-residency-ns = <0x2710 0x4e20>;
 ibm,idle-states {
 stop4 {
 flags = <0x207000>;
 compatible = "stop4,v1",
 psscr-mask = <0x0 0x3003ff>;
 latency-ns = <0x186a0>;
 residency-ns = <0x989680>;
 psscr = <0x0 0x300374>;
 ...
  };
...
stop11 {
 ...
     compatible = "stop11,v1",
 ...
  };
 };


Abhishek Goel (3):
  cpuidle/powernv : Pass state pointer instead of values to stop loop
  cpuidle/powernv: Add support for versioned stop states
  cpuidle/powernv : Add flags to identify stop state type

 arch/powerpc/include/asm/cpuidle.h|   8 +-
 arch/powerpc/include/asm/opal-api.h   |   7 +
 arch/powerpc/include/asm/processor.h  |   5 +-
 arch/powerpc/platforms/powernv/idle.c | 371 +-
 drivers/cpuidle/cpuidle-powernv.c |  81 +++---
 5 files changed, 363 insertions(+), 109 deletions(-)

-- 
2.17.1



[RFC v4 3/3] cpuidle-powernv : Recompute the idle-state timeouts when state usage is enabled/disabled

2019-07-12 Thread Abhishek Goel
The disable callback can be used to compute the timeout for other states
whenever a state is enabled or disabled. We store the computed timeout
in "timeout", defined in the cpuidle state structure. So we compute the
timeout only when some state is enabled or disabled, and not every time in
the fast idle path.
We also use the computed timeout to get the timeout for snooze, thus getting
rid of get_snooze_timeout for the snooze loop.

Signed-off-by: Abhishek Goel 
---
 drivers/cpuidle/cpuidle-powernv.c | 35 +++
 include/linux/cpuidle.h   |  1 +
 2 files changed, 13 insertions(+), 23 deletions(-)

diff --git a/drivers/cpuidle/cpuidle-powernv.c 
b/drivers/cpuidle/cpuidle-powernv.c
index 17e20e408ffe..29add322d0c4 100644
--- a/drivers/cpuidle/cpuidle-powernv.c
+++ b/drivers/cpuidle/cpuidle-powernv.c
@@ -45,7 +45,6 @@ struct stop_psscr_table {
 static struct stop_psscr_table stop_psscr_table[CPUIDLE_STATE_MAX] 
__read_mostly;
 
 static u64 default_snooze_timeout __read_mostly;
-static bool snooze_timeout_en __read_mostly;
 
 static u64 forced_wakeup_timeout(struct cpuidle_device *dev,
 struct cpuidle_driver *drv,
@@ -67,26 +66,13 @@ static u64 forced_wakeup_timeout(struct cpuidle_device *dev,
return 0;
 }
 
-static u64 get_snooze_timeout(struct cpuidle_device *dev,
- struct cpuidle_driver *drv,
- int index)
+static void pnv_disable_callback(struct cpuidle_device *dev,
+struct cpuidle_driver *drv)
 {
int i;
 
-   if (unlikely(!snooze_timeout_en))
-   return default_snooze_timeout;
-
-   for (i = index + 1; i < drv->state_count; i++) {
-   struct cpuidle_state *s = &drv->states[i];
-   struct cpuidle_state_usage *su = &dev->states_usage[i];
-
-   if (s->disabled || su->disable)
-   continue;
-
-   return s->target_residency * tb_ticks_per_usec;
-   }
-
-   return default_snooze_timeout;
+   for (i = 0; i < drv->state_count; i++)
+   drv->states[i].timeout = forced_wakeup_timeout(dev, drv, i);
 }
 
 static int snooze_loop(struct cpuidle_device *dev,
@@ -94,16 +80,20 @@ static int snooze_loop(struct cpuidle_device *dev,
int index)
 {
u64 snooze_exit_time;
+   u64 snooze_timeout = drv->states[index].timeout;
+
+   if (!snooze_timeout)
+   snooze_timeout = default_snooze_timeout;
 
set_thread_flag(TIF_POLLING_NRFLAG);
 
local_irq_enable();
 
-   snooze_exit_time = get_tb() + get_snooze_timeout(dev, drv, index);
+   snooze_exit_time = get_tb() + snooze_timeout;
ppc64_runlatch_off();
HMT_very_low();
while (!need_resched()) {
-   if (likely(snooze_timeout_en) && get_tb() > snooze_exit_time) {
+   if (get_tb() > snooze_exit_time) {
/*
 * Task has not woken up but we are exiting the polling
 * loop anyway. Require a barrier after polling is
@@ -168,7 +158,7 @@ static int stop_loop(struct cpuidle_device *dev,
u64 timeout_tb;
bool forced_wakeup = false;
 
-   timeout_tb = forced_wakeup_timeout(dev, drv, index);
+   timeout_tb = drv->states[index].timeout;
 
/* Ensure that the timeout is at least one microsecond
 * greater than current decrement value. Else, we will
@@ -263,6 +253,7 @@ static int powernv_cpuidle_driver_init(void)
 */
 
drv->cpumask = (struct cpumask *)cpu_present_mask;
+   drv->disable_callback = pnv_disable_callback;
 
return 0;
 }
@@ -422,8 +413,6 @@ static int powernv_idle_probe(void)
/* Device tree can indicate more idle states */
max_idle_state = powernv_add_idle_states();
default_snooze_timeout = TICK_USEC * tb_ticks_per_usec;
-   if (max_idle_state > 1)
-   snooze_timeout_en = true;
} else
return -ENODEV;
 
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index 8a0e54bd0d5d..31662b657b9c 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -50,6 +50,7 @@ struct cpuidle_state {
int power_usage; /* in mW */
unsigned inttarget_residency; /* in US */
booldisabled; /* disabled on all CPUs */
+   unsigned long long timeout; /* timeout for exiting out of a state */
 
int (*enter)(struct cpuidle_device *dev,
struct cpuidle_driver *drv,
-- 
2.17.1



[RFC v4 2/3] cpuidle : Add callback whenever a state usage is enabled/disabled

2019-07-12 Thread Abhishek Goel
To force wake up a CPU, we need to compute the timeout in the fast idle
path, as a state may be enabled or disabled, but there was no feedback to
the driver when a state is enabled or disabled.
This patch adds a callback whenever a state_usage records a store for the
disable attribute.

Signed-off-by: Abhishek Goel 
---
 drivers/cpuidle/sysfs.c | 15 ++-
 include/linux/cpuidle.h |  4 
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/drivers/cpuidle/sysfs.c b/drivers/cpuidle/sysfs.c
index eb20adb5de23..141671a53967 100644
--- a/drivers/cpuidle/sysfs.c
+++ b/drivers/cpuidle/sysfs.c
@@ -415,8 +415,21 @@ static ssize_t cpuidle_state_store(struct kobject *kobj, 
struct attribute *attr,
struct cpuidle_state_usage *state_usage = kobj_to_state_usage(kobj);
struct cpuidle_state_attr *cattr = attr_to_stateattr(attr);
 
-   if (cattr->store)
+   if (cattr->store) {
ret = cattr->store(state, state_usage, buf, size);
+   if (ret == size &&
+   strncmp(cattr->attr.name, "disable",
+   strlen("disable"))) {
+   struct kobject *cpuidle_kobj = kobj->parent;
+   struct cpuidle_device *dev =
+   to_cpuidle_device(cpuidle_kobj);
+   struct cpuidle_driver *drv =
+   cpuidle_get_cpu_driver(dev);
+
+   if (drv->disable_callback)
+   drv->disable_callback(dev, drv);
+   }
+   }
 
return ret;
 }
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index bb9a0db89f1a..8a0e54bd0d5d 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -119,6 +119,10 @@ struct cpuidle_driver {
 
/* the driver handles the cpus in cpumask */
struct cpumask  *cpumask;
+
+   void (*disable_callback)(struct cpuidle_device *dev,
+   struct cpuidle_driver *drv);
+
 };
 
 #ifdef CONFIG_CPU_IDLE
-- 
2.17.1



[PATCH v4 1/3] cpuidle-powernv : forced wakeup for stop states

2019-07-12 Thread Abhishek Goel
Currently, the cpuidle governors determine what idle state an idling CPU
should enter into based on heuristics that depend on the idle history on
that CPU. Given that no predictive heuristic is perfect, there are cases
where the governor predicts a shallow idle state, hoping that the CPU will
be busy soon. However, if no new workload is scheduled on that CPU in the
near future, the CPU may end up in the shallow state.

This is problematic when the predicted state in the aforementioned
scenario is a shallow stop state on a tickless system, as we might get
stuck in shallow states for hours, in the absence of ticks or interrupts.

To address this, we forcefully wake up the CPU by setting the
decrementer. The decrementer is set to a value that corresponds to the
residency of the next available state, thus arming a timer that will
forcefully wake up the CPU. A few such iterations will essentially train the
governor to select a deeper state for that CPU, as the timer here
corresponds to the next available cpuidle state's residency. Thus, the CPU
will eventually end up in the deepest possible state.

Signed-off-by: Abhishek Goel 
---

Auto-promotion
  v1 : started as auto promotion logic for cpuidle states in generic
driver
  v2 : Removed timeout_needed and rebased the code to upstream kernel
Forced-wakeup
  v1 : New patch with name of forced wakeup started
  v2 : Extending the forced wakeup logic for all states. Setting the
decrementer instead of queuing up a hrtimer to implement the logic.
  v3 : Cleanly handle setting/resetting of decrementer so as to not break
irq work
  v4 : Changed type and name of set/reset decrementer function
   Handled irq_work_pending in try_set_dec_before_idle

 arch/powerpc/include/asm/time.h   |  2 ++
 arch/powerpc/kernel/time.c| 43 +++
 drivers/cpuidle/cpuidle-powernv.c | 40 
 3 files changed, 85 insertions(+)

diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h
index 54f4ec1f9fab..294a472ce161 100644
--- a/arch/powerpc/include/asm/time.h
+++ b/arch/powerpc/include/asm/time.h
@@ -188,6 +188,8 @@ static inline unsigned long tb_ticks_since(unsigned long 
tstamp)
 extern u64 mulhdu(u64, u64);
 #endif
 
+extern bool try_set_dec_before_idle(u64 timeout);
+extern void try_reset_dec_after_idle(void);
 extern void div128_by_32(u64 dividend_high, u64 dividend_low,
 unsigned divisor, struct div_result *dr);
 
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 694522308cd5..d004c0d8e099 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -576,6 +576,49 @@ void arch_irq_work_raise(void)
 
 #endif /* CONFIG_IRQ_WORK */
 
+/*
+ * This function tries setting decrementer before entering into idle.
+ * Returns true if we have reprogrammed the decrementer for idle.
+ * Returns false if the decrementer is unchanged.
+ */
+bool try_set_dec_before_idle(u64 timeout)
+{
+   u64 *next_tb = this_cpu_ptr(&decrementers_next_tb);
+   u64 now = get_tb_or_rtc();
+
+   if (now + timeout > *next_tb)
+   return false;
+
+   set_dec(timeout);
+   if (test_irq_work_pending())
+   set_dec(1);
+
+   return true;
+}
+
+/*
+ * This function gets called if we have set decrementer before
+ * entering into idle. It tries to reset/restore the decrementer
+ * to its original value.
+ */
+void try_reset_dec_after_idle(void)
+{
+   u64 now;
+   u64 *next_tb;
+
+   if (test_irq_work_pending())
+   return;
+
+   now = get_tb_or_rtc();
+   next_tb = this_cpu_ptr(&decrementers_next_tb);
+   if (now >= *next_tb)
+   return;
+
+   set_dec(*next_tb - now);
+   if (test_irq_work_pending())
+   set_dec(1);
+}
+
 /*
  * timer_interrupt - gets called when the decrementer overflows,
  * with interrupts disabled.
diff --git a/drivers/cpuidle/cpuidle-powernv.c 
b/drivers/cpuidle/cpuidle-powernv.c
index 84b1ebe212b3..17e20e408ffe 100644
--- a/drivers/cpuidle/cpuidle-powernv.c
+++ b/drivers/cpuidle/cpuidle-powernv.c
@@ -21,6 +21,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /*
  * Expose only those Hardware idle states via the cpuidle framework
@@ -46,6 +47,26 @@ static struct stop_psscr_table 
stop_psscr_table[CPUIDLE_STATE_MAX] __read_mostly
 static u64 default_snooze_timeout __read_mostly;
 static bool snooze_timeout_en __read_mostly;
 
+static u64 forced_wakeup_timeout(struct cpuidle_device *dev,
+struct cpuidle_driver *drv,
+int index)
+{
+   int i;
+
+   for (i = index + 1; i < drv->state_count; i++) {
+   struct cpuidle_state *s = &drv->states[i];
+   struct cpuidle_state_usage *su = &dev->states_usage[i];
+
+   if (s->disabled || su->disable)
+   continue;
+
+   return (s->target_residency + 2 * s->exit_latency) *
+  

[PATCH v4 0/3] Forced-wakeup for stop states on Powernv

2019-07-12 Thread Abhishek Goel
Currently, the cpuidle governors determine what idle state an idling CPU
should enter into based on heuristics that depend on the idle history on
that CPU. Given that no predictive heuristic is perfect, there are cases
where the governor predicts a shallow idle state, hoping that the CPU will
be busy soon. However, if no new workload is scheduled on that CPU in the
near future, the CPU will end up in the shallow state.

Motivation
--
In the case of POWER, this is problematic when the predicted state in the
aforementioned scenario is a shallow stop state on a tickless system, as
we might get stuck in shallow states even for hours, in the absence of
ticks or interrupts.

To address this, we forcefully wake up the CPU by setting the decrementer.
The decrementer is set to a value that corresponds to the residency of
the next available state, thus arming a timer that will forcefully wake
up the CPU. A few such iterations will essentially train the governor to
select a deeper state for that CPU, as the timer here corresponds to the
next available cpuidle state's residency. Thus, the CPU will eventually
end up in the deepest possible state and we won't get stuck in a shallow
state for a long duration.

Experiment
--
For earlier versions, when this feature was meant to be only for shallow lite
states, I performed experiments for three scenarios to collect some data.

case 1 :
Without this patch and without the tick retained, i.e. in an upstream
kernel, it would take more than a second to get out of stop0_lite.

case 2 : With the tick retained in an upstream kernel -

Generally, we have a sched tick at 4ms (CONFIG_HZ = 250). Ideally I expected
it to take 8 sched ticks to get out of stop0_lite. Experimentally, the
observation was:

==========================================
sample    min     max      99th percentile
20        4ms     12ms     4ms
==========================================

It would take at least one sched tick to get out of stop0_lite.

case 3 : With this patch (not stopping the tick, but explicitly queuing a
         timer)


==========================================
sample    min      max       99th percentile
20        144us    192us     144us
==========================================


Description of current implementation
-

We calculate the timeout for the current idle state as the residency value
of the next available idle state. If the decrementer is set to fire later
than this timeout, we update the decrementer value with the residency of
the next available idle state, essentially training the governor to select
the next available deeper state until we reach the deepest state. Hence, we
won't get stuck unnecessarily in shallow states for long durations.


v1 of auto-promotion : https://lkml.org/lkml/2019/3/22/58 This patch was
implemented only for shallow lite state in generic cpuidle driver.

v2 : Removed timeout_needed and rebased to current
upstream kernel

Then,
v1 of forced-wakeup : Moved the code to cpuidle powernv driver and started
as forced wakeup instead of auto-promotion

v2 : Extended the forced wakeup logic for all states.
Setting the decrementer instead of queuing up a hrtimer to implement the
logic.

v3 : 1) Cleanly handle setting the decrementer after exiting out of stop
   states.
 2) Added a disable_callback feature to compute the timeout whenever a
state is enabled or disabled, instead of computing it every time in the
fast idle path.
 3) Use disable callback to recompute timeout whenever state usage
is changed for a state. Also, cleaned up the get_snooze_timeout
function.

v4 :Changed the type and name of set/reset decrementer function.
Handled irq work pending in try_set_dec_before_idle.
No change in patch 2 and 3.

Abhishek Goel (3):
  cpuidle-powernv : forced wakeup for stop states
  cpuidle : Add callback whenever a state usage is enabled/disabled
  cpuidle-powernv : Recompute the idle-state timeouts when state usage
is enabled/disabled

 arch/powerpc/include/asm/time.h   |  2 ++
 arch/powerpc/kernel/time.c| 43 
 drivers/cpuidle/cpuidle-powernv.c | 55 +++
 drivers/cpuidle/sysfs.c   | 15 -
 include/linux/cpuidle.h   |  5 +++
 5 files changed, 106 insertions(+), 14 deletions(-)

-- 
2.17.1



Re: [PATCH v3 1/3] cpuidle-powernv : forced wakeup for stop states

2019-07-08 Thread Abhishek

Hi Nick,

Will post next version with the changes you have suggested.
There is a comment below.

On 07/07/2019 03:43 PM, Nicholas Piggin wrote:

Abhishek Goel's on July 4, 2019 7:18 pm:

Currently, the cpuidle governors determine what idle state a idling CPU
should enter into based on heuristics that depend on the idle history on
that CPU. Given that no predictive heuristic is perfect, there are cases
where the governor predicts a shallow idle state, hoping that the CPU will
be busy soon. However, if no new workload is scheduled on that CPU in the
near future, the CPU may end up in the shallow state.

This is problematic, when the predicted state in the aforementioned
scenario is a shallow stop state on a tickless system. As we might get
stuck into shallow states for hours, in absence of ticks or interrupts.

To address this, We forcefully wakeup the cpu by setting the
decrementer. The decrementer is set to a value that corresponds with the
residency of the next available state. Thus firing up a timer that will
forcefully wakeup the cpu. Few such iterations will essentially train the
governor to select a deeper state for that cpu, as the timer here
corresponds to the next available cpuidle state residency. Thus, cpu will
eventually end up in the deepest possible state.

Signed-off-by: Abhishek Goel 
---

Auto-promotion
  v1 : started as auto promotion logic for cpuidle states in generic
driver
  v2 : Removed timeout_needed and rebased the code to upstream kernel
Forced-wakeup
  v1 : New patch with name of forced wakeup started
  v2 : Extending the forced wakeup logic for all states. Setting the
decrementer instead of queuing up a hrtimer to implement the logic.
  v3 : Cleanly handle setting/resetting of decrementer so as to not break
irq work

  arch/powerpc/include/asm/time.h   |  2 ++
  arch/powerpc/kernel/time.c| 40 +++
  drivers/cpuidle/cpuidle-powernv.c | 32 +
  3 files changed, 74 insertions(+)

diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h
index 54f4ec1f9..a3bd4f3c0 100644
--- a/arch/powerpc/include/asm/time.h
+++ b/arch/powerpc/include/asm/time.h
@@ -188,6 +188,8 @@ static inline unsigned long tb_ticks_since(unsigned long 
tstamp)
  extern u64 mulhdu(u64, u64);
  #endif
  
+extern int set_dec_before_idle(u64 timeout);

+extern void reset_dec_after_idle(void);
  extern void div128_by_32(u64 dividend_high, u64 dividend_low,
 unsigned divisor, struct div_result *dr);
  
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c

index 694522308..814de3469 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -576,6 +576,46 @@ void arch_irq_work_raise(void)
  
  #endif /* CONFIG_IRQ_WORK */
  
+/*

+ * Returns 1 if we have reprogrammed the decrementer for idle.
+ * Returns 0 if the decrementer is unchanged.
+ */
+int set_dec_before_idle(u64 timeout)
+{
+   u64 *next_tb = this_cpu_ptr(&decrementers_next_tb);
+   u64 now = get_tb_or_rtc();
+
+   /*
+* Ensure that the timeout is at least one microsecond
+* before the current decrement value. Else, we will
+* unnecesarily wakeup again within a microsecond.
+*/
+   if (now + timeout + 512 > *next_tb)

I would pass this 512 in as a parameter and put the comment in the
idle code. Timer code does not know/care.

Maybe return bool and call it try_set_dec_before_idle.

+   return 0;
+
+   set_dec(timeout);

This needs to have

   if (test_irq_work_pending())
   set_dec(1);

here AFAIKS


+
+   return 1;
+}
+
+void reset_dec_after_idle(void)
+{
+   u64 now;
+   u64 *next_tb;
+
+   if (test_irq_work_pending())
+   return;
+
+   now = get_tb_or_rtc();
+   next_tb = this_cpu_ptr(&decrementers_next_tb);
+   if (now >= *next_tb)
+   return;

Are you sure it's okay to escape early in this case?


Yeah, it looks safe. In power9_idle_type, we call irq_set_pending_from_srr1,
which sets irq_happened. If the reason is IRQ_DEC, then in __check_irq_replay,
decrementer_check_overflow will be called, which will set the DEC to a
positive valid value.
Also, we typically disable MSR EE before entering stop. And if a decrementer
wakes us up, the check for pending interrupts is done before we enable EE.
And we finally reset the DEC to a positive value before we set EE=1.

Thanks,
Nick



Thanks,
Abhishek



[RFC v3 3/3] cpuidle-powernv : Recompute the idle-state timeouts when state usage is enabled/disabled

2019-07-04 Thread Abhishek Goel
The disable callback can be used to compute the timeout for other states
whenever a state is enabled or disabled. We store the computed timeout
in "timeout", defined in the cpuidle state structure. So we compute the
timeout only when some state is enabled or disabled, and not every time in
the fast idle path.
We also use the computed timeout to get the timeout for snooze, thus getting
rid of get_snooze_timeout for the snooze loop.

Signed-off-by: Abhishek Goel 
---
 drivers/cpuidle/cpuidle-powernv.c | 35 +++
 include/linux/cpuidle.h   |  1 +
 2 files changed, 13 insertions(+), 23 deletions(-)

diff --git a/drivers/cpuidle/cpuidle-powernv.c 
b/drivers/cpuidle/cpuidle-powernv.c
index f51478460..7350f404a 100644
--- a/drivers/cpuidle/cpuidle-powernv.c
+++ b/drivers/cpuidle/cpuidle-powernv.c
@@ -45,7 +45,6 @@ struct stop_psscr_table {
 static struct stop_psscr_table stop_psscr_table[CPUIDLE_STATE_MAX] 
__read_mostly;
 
 static u64 default_snooze_timeout __read_mostly;
-static bool snooze_timeout_en __read_mostly;
 
 static u64 forced_wakeup_timeout(struct cpuidle_device *dev,
 struct cpuidle_driver *drv,
@@ -67,26 +66,13 @@ static u64 forced_wakeup_timeout(struct cpuidle_device *dev,
return 0;
 }
 
-static u64 get_snooze_timeout(struct cpuidle_device *dev,
- struct cpuidle_driver *drv,
- int index)
+static void pnv_disable_callback(struct cpuidle_device *dev,
+struct cpuidle_driver *drv)
 {
int i;
 
-   if (unlikely(!snooze_timeout_en))
-   return default_snooze_timeout;
-
-   for (i = index + 1; i < drv->state_count; i++) {
-   struct cpuidle_state *s = &drv->states[i];
-   struct cpuidle_state_usage *su = &dev->states_usage[i];
-
-   if (s->disabled || su->disable)
-   continue;
-
-   return s->target_residency * tb_ticks_per_usec;
-   }
-
-   return default_snooze_timeout;
+   for (i = 0; i < drv->state_count; i++)
+   drv->states[i].timeout = forced_wakeup_timeout(dev, drv, i);
 }
 
 static int snooze_loop(struct cpuidle_device *dev,
@@ -94,16 +80,20 @@ static int snooze_loop(struct cpuidle_device *dev,
int index)
 {
u64 snooze_exit_time;
+   u64 snooze_timeout = drv->states[index].timeout;
+
+   if (!snooze_timeout)
+   snooze_timeout = default_snooze_timeout;
 
set_thread_flag(TIF_POLLING_NRFLAG);
 
local_irq_enable();
 
-   snooze_exit_time = get_tb() + get_snooze_timeout(dev, drv, index);
+   snooze_exit_time = get_tb() + snooze_timeout;
ppc64_runlatch_off();
HMT_very_low();
while (!need_resched()) {
-   if (likely(snooze_timeout_en) && get_tb() > snooze_exit_time) {
+   if (get_tb() > snooze_exit_time) {
/*
 * Task has not woken up but we are exiting the polling
 * loop anyway. Require a barrier after polling is
@@ -168,7 +158,7 @@ static int stop_loop(struct cpuidle_device *dev,
u64 timeout_tb;
int forced_wakeup = 0;
 
-   timeout_tb = forced_wakeup_timeout(dev, drv, index);
+   timeout_tb = drv->states[index].timeout;
if (timeout_tb)
forced_wakeup = set_dec_before_idle(timeout_tb);
 
@@ -255,6 +245,7 @@ static int powernv_cpuidle_driver_init(void)
 */
 
drv->cpumask = (struct cpumask *)cpu_present_mask;
+   drv->disable_callback = pnv_disable_callback;
 
return 0;
 }
@@ -414,8 +405,6 @@ static int powernv_idle_probe(void)
/* Device tree can indicate more idle states */
max_idle_state = powernv_add_idle_states();
default_snooze_timeout = TICK_USEC * tb_ticks_per_usec;
-   if (max_idle_state > 1)
-   snooze_timeout_en = true;
} else
return -ENODEV;
 
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index 8a0e54bd0..31662b657 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -50,6 +50,7 @@ struct cpuidle_state {
int power_usage; /* in mW */
unsigned inttarget_residency; /* in US */
booldisabled; /* disabled on all CPUs */
+   unsigned long long timeout; /* timeout for exiting out of a state */
 
int (*enter)(struct cpuidle_device *dev,
struct cpuidle_driver *drv,
-- 
2.17.1



[RFC v3 2/3] cpuidle : Add callback whenever a state usage is enabled/disabled

2019-07-04 Thread Abhishek Goel
To force wake up a CPU, we need to compute the timeout in the fast idle
path, as a state may be enabled or disabled, but there was no feedback to
the driver when a state is enabled or disabled.
This patch adds a callback whenever a state_usage records a store for the
disable attribute.

Signed-off-by: Abhishek Goel 
---
 drivers/cpuidle/sysfs.c | 15 ++-
 include/linux/cpuidle.h |  4 
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/drivers/cpuidle/sysfs.c b/drivers/cpuidle/sysfs.c
index eb20adb5d..141671a53 100644
--- a/drivers/cpuidle/sysfs.c
+++ b/drivers/cpuidle/sysfs.c
@@ -415,8 +415,21 @@ static ssize_t cpuidle_state_store(struct kobject *kobj, 
struct attribute *attr,
struct cpuidle_state_usage *state_usage = kobj_to_state_usage(kobj);
struct cpuidle_state_attr *cattr = attr_to_stateattr(attr);
 
-   if (cattr->store)
+   if (cattr->store) {
ret = cattr->store(state, state_usage, buf, size);
+   if (ret == size &&
+   strncmp(cattr->attr.name, "disable",
+   strlen("disable"))) {
+   struct kobject *cpuidle_kobj = kobj->parent;
+   struct cpuidle_device *dev =
+   to_cpuidle_device(cpuidle_kobj);
+   struct cpuidle_driver *drv =
+   cpuidle_get_cpu_driver(dev);
+
+   if (drv->disable_callback)
+   drv->disable_callback(dev, drv);
+   }
+   }
 
return ret;
 }
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index bb9a0db89..8a0e54bd0 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -119,6 +119,10 @@ struct cpuidle_driver {
 
/* the driver handles the cpus in cpumask */
struct cpumask  *cpumask;
+
+   void (*disable_callback)(struct cpuidle_device *dev,
+   struct cpuidle_driver *drv);
+
 };
 
 #ifdef CONFIG_CPU_IDLE
-- 
2.17.1



[PATCH v3 1/3] cpuidle-powernv : forced wakeup for stop states

2019-07-04 Thread Abhishek Goel
Currently, the cpuidle governors determine what idle state an idling CPU
should enter into based on heuristics that depend on the idle history on
that CPU. Given that no predictive heuristic is perfect, there are cases
where the governor predicts a shallow idle state, hoping that the CPU will
be busy soon. However, if no new workload is scheduled on that CPU in the
near future, the CPU may end up in the shallow state.

This is problematic when the predicted state in the aforementioned
scenario is a shallow stop state on a tickless system, as we might get
stuck in shallow states for hours, in the absence of ticks or interrupts.

To address this, we forcefully wake up the CPU by setting the
decrementer. The decrementer is set to a value that corresponds to the
residency of the next available state, thus arming a timer that will
forcefully wake up the CPU. A few such iterations will essentially train the
governor to select a deeper state for that CPU, as the timer here
corresponds to the next available cpuidle state's residency. Thus, the CPU
will eventually end up in the deepest possible state.

Signed-off-by: Abhishek Goel 
---

Auto-promotion
 v1 : started as auto promotion logic for cpuidle states in generic
driver
 v2 : Removed timeout_needed and rebased the code to upstream kernel
Forced-wakeup
 v1 : New patch with name of forced wakeup started
 v2 : Extending the forced wakeup logic for all states. Setting the
decrementer instead of queuing up a hrtimer to implement the logic.
 v3 : Cleanly handle setting/resetting of decrementer so as to not break
irq work 

 arch/powerpc/include/asm/time.h   |  2 ++
 arch/powerpc/kernel/time.c| 40 +++
 drivers/cpuidle/cpuidle-powernv.c | 32 +
 3 files changed, 74 insertions(+)

diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h
index 54f4ec1f9..a3bd4f3c0 100644
--- a/arch/powerpc/include/asm/time.h
+++ b/arch/powerpc/include/asm/time.h
@@ -188,6 +188,8 @@ static inline unsigned long tb_ticks_since(unsigned long 
tstamp)
 extern u64 mulhdu(u64, u64);
 #endif
 
+extern int set_dec_before_idle(u64 timeout);
+extern void reset_dec_after_idle(void);
 extern void div128_by_32(u64 dividend_high, u64 dividend_low,
 unsigned divisor, struct div_result *dr);
 
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 694522308..814de3469 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -576,6 +576,46 @@ void arch_irq_work_raise(void)
 
 #endif /* CONFIG_IRQ_WORK */
 
+/*
+ * Returns 1 if we have reprogrammed the decrementer for idle.
+ * Returns 0 if the decrementer is unchanged.
+ */
+int set_dec_before_idle(u64 timeout)
+{
+   u64 *next_tb = this_cpu_ptr(&decrementers_next_tb);
+   u64 now = get_tb_or_rtc();
+
+   /*
+* Ensure that the timeout is at least one microsecond
+* before the current decrement value. Else, we will
+* unnecessarily wakeup again within a microsecond.
+*/
+   if (now + timeout + 512 > *next_tb)
+   return 0;
+
+   set_dec(timeout);
+
+   return 1;
+}
+
+void reset_dec_after_idle(void)
+{
+   u64 now;
+   u64 *next_tb;
+
+   if (test_irq_work_pending())
+   return;
+
+   now = get_tb_or_rtc();
+   next_tb = this_cpu_ptr(&decrementers_next_tb);
+   if (now >= *next_tb)
+   return;
+
+   set_dec(*next_tb - now);
+   if (test_irq_work_pending())
+   set_dec(1);
+}
+
 /*
  * timer_interrupt - gets called when the decrementer overflows,
  * with interrupts disabled.
diff --git a/drivers/cpuidle/cpuidle-powernv.c 
b/drivers/cpuidle/cpuidle-powernv.c
index 84b1ebe21..f51478460 100644
--- a/drivers/cpuidle/cpuidle-powernv.c
+++ b/drivers/cpuidle/cpuidle-powernv.c
@@ -21,6 +21,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /*
  * Expose only those Hardware idle states via the cpuidle framework
@@ -46,6 +47,26 @@ static struct stop_psscr_table 
stop_psscr_table[CPUIDLE_STATE_MAX] __read_mostly
 static u64 default_snooze_timeout __read_mostly;
 static bool snooze_timeout_en __read_mostly;
 
+static u64 forced_wakeup_timeout(struct cpuidle_device *dev,
+struct cpuidle_driver *drv,
+int index)
+{
+   int i;
+
+   for (i = index + 1; i < drv->state_count; i++) {
+   struct cpuidle_state *s = &drv->states[i];
+   struct cpuidle_state_usage *su = &dev->states_usage[i];
+
+   if (s->disabled || su->disable)
+   continue;
+
+   return (s->target_residency + 2 * s->exit_latency) *
+   tb_ticks_per_usec;
+   }
+
+   return 0;
+}
+
 static u64 get_snooze_timeout(struct cpuidle_device *dev,
  struct cpuidle_driver *drv,
  int index)
@@ -144,8 +165,19 @@ static int stop_l

[PATCH v3 0/3] Forced-wakeup for stop states on Powernv

2019-07-04 Thread Abhishek Goel
Currently, the cpuidle governors determine what idle state an idling CPU
should enter into based on heuristics that depend on the idle history on
that CPU. Given that no predictive heuristic is perfect, there are cases
where the governor predicts a shallow idle state, hoping that the CPU will
be busy soon. However, if no new workload is scheduled on that CPU in the
near future, the CPU will end up in the shallow state.

Motivation
--
In the case of POWER, this is problematic when the predicted state in the
aforementioned scenario is a shallow stop state on a tickless system, as
we might get stuck in shallow states even for hours, in the absence of
ticks or interrupts.

To address this, we forcefully wake up the CPU by setting the decrementer.
The decrementer is set to a value that corresponds to the residency of
the next available state, thus arming a timer that will forcefully wake
up the CPU. A few such iterations will essentially train the governor to
select a deeper state for that CPU, as the timer here corresponds to the
next available cpuidle state's residency. Thus, the CPU will eventually
end up in the deepest possible state and we won't get stuck in a shallow
state for a long duration.

Experiment
--
For earlier versions, when this feature was meant to be only for shallow lite
states, I performed experiments for three scenarios to collect some data.

case 1 :
Without this patch and without the tick retained, i.e. in an upstream
kernel, it would take more than a second to get out of stop0_lite.

case 2 : With the tick retained in an upstream kernel -

Generally, we have a sched tick at 4ms (CONFIG_HZ = 250). Ideally I expected
it to take 8 sched ticks to get out of stop0_lite. Experimentally, the
observation was:

==========================================
sample    min     max      99th percentile
20        4ms     12ms     4ms
==========================================

It would take at least one sched tick to get out of stop0_lite.

case 3 : With this patch (not stopping the tick, but explicitly queuing a
         timer)


==========================================
sample    min      max       99th percentile
20        144us    192us     144us
==========================================


Description of current implementation
-

We calculate the timeout for the current idle state as the residency value
of the next available idle state. If the decrementer is set to fire later
than this timeout, we update the decrementer value with the residency of
the next available idle state, essentially training the governor to select
the next available deeper state until we reach the deepest state. Hence, we
won't get stuck unnecessarily in shallow states for long durations.


v1 of auto-promotion : https://lkml.org/lkml/2019/3/22/58 This patch was
implemented only for shallow lite state in generic cpuidle driver.

v2 : Removed timeout_needed and rebased to current
upstream kernel

Then,
v1 of forced-wakeup : Moved the code to cpuidle powernv driver and started
as forced wakeup instead of auto-promotion

v2 : Extended the forced wakeup logic for all states.
Setting the decrementer instead of queuing up a hrtimer to implement the
logic.

v3 : 1) Cleanly handle setting the decrementer after exiting out of stop
   states.
 2) Added a disable_callback feature to compute the timeout whenever a
state is enabled or disabled, instead of computing it every time in the
fast idle path.
 3) Use disable callback to recompute timeout whenever state usage
is changed for a state. Also, cleaned up the get_snooze_timeout
function.


Abhishek Goel (3):
  cpuidle-powernv : forced wakeup for stop states
  cpuidle : Add callback whenever a state usage is enabled/disabled
  cpuidle-powernv : Recompute the idle-state timeouts when state usage
is enabled/disabled

 arch/powerpc/include/asm/time.h   |  2 ++
 arch/powerpc/kernel/time.c| 40 ++
 drivers/cpuidle/cpuidle-powernv.c | 47 ++-
 drivers/cpuidle/sysfs.c   | 15 +-
 include/linux/cpuidle.h   |  5 
 5 files changed, 95 insertions(+), 14 deletions(-)

-- 
2.17.1



Re: [PATCH v2 1/1] cpuidle-powernv : forced wakeup for stop states

2019-06-26 Thread Abhishek

Hi Nick,


On 06/19/2019 03:39 PM, Nicholas Piggin wrote:

Abhishek's on June 19, 2019 7:08 pm:

Hi Nick,

Thanks for the review. Some replies below.

On 06/19/2019 09:53 AM, Nicholas Piggin wrote:

Abhishek Goel's on June 17, 2019 7:56 pm:

Currently, the cpuidle governors determine what idle state a idling CPU
should enter into based on heuristics that depend on the idle history on
that CPU. Given that no predictive heuristic is perfect, there are cases
where the governor predicts a shallow idle state, hoping that the CPU will
be busy soon. However, if no new workload is scheduled on that CPU in the
near future, the CPU may end up in the shallow state.

This is problematic, when the predicted state in the aforementioned
scenario is a shallow stop state on a tickless system. As we might get
stuck into shallow states for hours, in absence of ticks or interrupts.

To address this, We forcefully wakeup the cpu by setting the
decrementer. The decrementer is set to a value that corresponds with the
residency of the next available state. Thus firing up a timer that will
forcefully wakeup the cpu. Few such iterations will essentially train the
governor to select a deeper state for that cpu, as the timer here
corresponds to the next available cpuidle state residency. Thus, cpu will
eventually end up in the deepest possible state.

Signed-off-by: Abhishek Goel 
---

Auto-promotion
   v1 : started as auto promotion logic for cpuidle states in generic
driver
   v2 : Removed timeout_needed and rebased the code to upstream kernel
Forced-wakeup
   v1 : New patch with name of forced wakeup started
   v2 : Extending the forced wakeup logic for all states. Setting the
decrementer instead of queuing up a hrtimer to implement the logic.

   drivers/cpuidle/cpuidle-powernv.c | 38 +++
   1 file changed, 38 insertions(+)

diff --git a/drivers/cpuidle/cpuidle-powernv.c 
b/drivers/cpuidle/cpuidle-powernv.c
index 84b1ebe212b3..bc9ca18ae7e3 100644
--- a/drivers/cpuidle/cpuidle-powernv.c
+++ b/drivers/cpuidle/cpuidle-powernv.c
@@ -46,6 +46,26 @@ static struct stop_psscr_table 
stop_psscr_table[CPUIDLE_STATE_MAX] __read_mostly
   static u64 default_snooze_timeout __read_mostly;
   static bool snooze_timeout_en __read_mostly;
   
+static u64 forced_wakeup_timeout(struct cpuidle_device *dev,

+struct cpuidle_driver *drv,
+int index)
+{
+   int i;
+
+   for (i = index + 1; i < drv->state_count; i++) {
+   struct cpuidle_state *s = &drv->states[i];
+   struct cpuidle_state_usage *su = &dev->states_usage[i];
+
+   if (s->disabled || su->disable)
+   continue;
+
+   return (s->target_residency + 2 * s->exit_latency) *
+   tb_ticks_per_usec;
+   }
+
+   return 0;
+}

It would be nice to not have this kind of loop iteration in the
idle fast path. Can we add a flag or something to the idle state?

Currently, we do not have any callback notification or some feedback that
notifies the driver everytime some state is enabled/disabled. So we have
to parse everytime to get the next enabled state.

Ahh, that's why you're doing that.


Are you suggesting to
add something like next_enabled_state in cpuidle state structure itself
which will be updated when a state is enabled or disabled?

Hmm, I guess it normally should not iterate over more than one state
unless some idle states are disabled.

What would have been nice is for each state to just have its own timeout
field with the ticks already calculated, if that could be updated when
a state is enabled or disabled. How hard is that to add to the
cpuidle core?


I have implemented a prototype which does what you have asked for. It adds
a disable_callback which will update the timeout whenever a state is
enabled or disabled. But it would mean adding some code to cpuidle.h and
cpuidle/sysfs.c.

If that is not an issue, should I go ahead and post it?

+
   static u64 get_snooze_timeout(struct cpuidle_device *dev,
  struct cpuidle_driver *drv,
  int index)
@@ -144,8 +164,26 @@ static int stop_loop(struct cpuidle_device *dev,
 struct cpuidle_driver *drv,
 int index)
   {
+   u64 dec_expiry_tb, dec, timeout_tb, forced_wakeup;
+
+   dec = mfspr(SPRN_DEC);
+   timeout_tb = forced_wakeup_timeout(dev, drv, index);
+   forced_wakeup = 0;
+
+   if (timeout_tb && timeout_tb < dec) {
+   forced_wakeup = 1;
+   dec_expiry_tb = mftb() + dec;
+   }

The compiler probably can't optimise away the SPR manipulations so try
to avoid them if possible.

Are you suggesting something like set_dec_before_idle?(in line with
what you have suggested to do after idle, reset_dec_after_idle)

I should have been clear, I meant don't mfspr(SPRN_DEC) until you
have tested timeou

Re: [PATCH v2 1/1] cpuidle-powernv : forced wakeup for stop states

2019-06-19 Thread Abhishek

Hi Nick,

Thanks for the review. Some replies below.

On 06/19/2019 09:53 AM, Nicholas Piggin wrote:

Abhishek Goel's on June 17, 2019 7:56 pm:

Currently, the cpuidle governors determine what idle state a idling CPU
should enter into based on heuristics that depend on the idle history on
that CPU. Given that no predictive heuristic is perfect, there are cases
where the governor predicts a shallow idle state, hoping that the CPU will
be busy soon. However, if no new workload is scheduled on that CPU in the
near future, the CPU may end up in the shallow state.

This is problematic, when the predicted state in the aforementioned
scenario is a shallow stop state on a tickless system. As we might get
stuck into shallow states for hours, in absence of ticks or interrupts.

To address this, We forcefully wakeup the cpu by setting the
decrementer. The decrementer is set to a value that corresponds with the
residency of the next available state. Thus firing up a timer that will
forcefully wakeup the cpu. Few such iterations will essentially train the
governor to select a deeper state for that cpu, as the timer here
corresponds to the next available cpuidle state residency. Thus, cpu will
eventually end up in the deepest possible state.

Signed-off-by: Abhishek Goel 
---

Auto-promotion
  v1 : started as auto promotion logic for cpuidle states in generic
driver
  v2 : Removed timeout_needed and rebased the code to upstream kernel
Forced-wakeup
  v1 : New patch with name of forced wakeup started
  v2 : Extending the forced wakeup logic for all states. Setting the
decrementer instead of queuing up a hrtimer to implement the logic.

  drivers/cpuidle/cpuidle-powernv.c | 38 +++
  1 file changed, 38 insertions(+)

diff --git a/drivers/cpuidle/cpuidle-powernv.c 
b/drivers/cpuidle/cpuidle-powernv.c
index 84b1ebe212b3..bc9ca18ae7e3 100644
--- a/drivers/cpuidle/cpuidle-powernv.c
+++ b/drivers/cpuidle/cpuidle-powernv.c
@@ -46,6 +46,26 @@ static struct stop_psscr_table 
stop_psscr_table[CPUIDLE_STATE_MAX] __read_mostly
  static u64 default_snooze_timeout __read_mostly;
  static bool snooze_timeout_en __read_mostly;
  
+static u64 forced_wakeup_timeout(struct cpuidle_device *dev,
+				 struct cpuidle_driver *drv,
+				 int index)
+{
+	int i;
+
+	for (i = index + 1; i < drv->state_count; i++) {
+		struct cpuidle_state *s = &drv->states[i];
+		struct cpuidle_state_usage *su = &dev->states_usage[i];
+
+		if (s->disabled || su->disable)
+			continue;
+
+		return (s->target_residency + 2 * s->exit_latency) *
+			tb_ticks_per_usec;
+	}
+
+	return 0;
+}

It would be nice to not have this kind of loop iteration in the
idle fast path. Can we add a flag or something to the idle state?

Currently, we do not have any callback or other feedback that notifies the
driver every time a state is enabled or disabled. So we have to walk the
states every time to find the next enabled one. Are you suggesting to add
something like next_enabled_state in the cpuidle state structure itself,
which would be updated when a state is enabled or disabled?

+
  static u64 get_snooze_timeout(struct cpuidle_device *dev,
  struct cpuidle_driver *drv,
  int index)
@@ -144,8 +164,26 @@ static int stop_loop(struct cpuidle_device *dev,
 struct cpuidle_driver *drv,
 int index)
  {
+   u64 dec_expiry_tb, dec, timeout_tb, forced_wakeup;
+
+   dec = mfspr(SPRN_DEC);
+   timeout_tb = forced_wakeup_timeout(dev, drv, index);
+   forced_wakeup = 0;
+
+   if (timeout_tb && timeout_tb < dec) {
+   forced_wakeup = 1;
+   dec_expiry_tb = mftb() + dec;
+   }

The compiler probably can't optimise away the SPR manipulations so try
to avoid them if possible.

Are you suggesting something like set_dec_before_idle? (In line with
what you have suggested to do after idle, reset_dec_after_idle.)



+
+   if (forced_wakeup)
+   mtspr(SPRN_DEC, timeout_tb);

This should just be put in the above 'if'.

Fair point.



+
power9_idle_type(stop_psscr_table[index].val,
 stop_psscr_table[index].mask);
+
+   if (forced_wakeup)
+   mtspr(SPRN_DEC, dec_expiry_tb - mftb());

This will sometimes go negative and result in another timer interrupt.

It also breaks irq work (which can be set here by machine check I
believe.

May need to implement some timer code to do this for you.

static void reset_dec_after_idle(void)
{
	u64 now;
	u64 *next_tb;

	if (test_irq_work_pending())
		return;
	now = mftb();
	next_tb = this_cpu_ptr(&decrementers_next_tb);

	if (now >= *next_tb)
		return;
	set_dec(*next_tb - now);
}
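
For reference, a minimal sketch of what a matching entry-side helper might
look like, pairing with the reset_dec_after_idle() snippet above. The name
set_dec_before_idle() and the guard against pushing the wakeup past the
already-programmed decrementer expiry are assumptions of this sketch, not
code from this thread:

/* Sketch only: arm the decrementer for a forced wakeup before entering
 * idle, without delaying a timer that is already due sooner.
 */
static void set_dec_before_idle(u64 timeout_tb)
{
	u64 *next_tb = this_cpu_ptr(&decrementers_next_tb);
	u64 now = mftb();

	/* The existing timer expiry is already earlier than our timeout. */
	if (now + timeout_tb > *next_tb)
		return;

	set_dec(timeout_tb);
}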

[PATCH v2 1/1] cpuidle-powernv : forced wakeup for stop states

2019-06-17 Thread Abhishek Goel
Currently, the cpuidle governors determine what idle state an idling CPU
should enter into based on heuristics that depend on the idle history on
that CPU. Given that no predictive heuristic is perfect, there are cases
where the governor predicts a shallow idle state, hoping that the CPU will
be busy soon. However, if no new workload is scheduled on that CPU in the
near future, the CPU may end up in the shallow state.

This is problematic when the predicted state in the aforementioned
scenario is a shallow stop state on a tickless system, as we might get
stuck in shallow states for hours in the absence of ticks or interrupts.

To address this, we forcefully wake up the CPU by setting the decrementer.
The decrementer is set to a value that corresponds to the residency of the
next available state, effectively arming a timer that will forcefully wake
up the CPU. A few such iterations will essentially train the governor to
select a deeper state for that CPU, as the timer here corresponds to the
next available cpuidle state's residency. Thus, the CPU will eventually
end up in the deepest possible state.

Signed-off-by: Abhishek Goel 
---

Auto-promotion
 v1 : started as auto promotion logic for cpuidle states in generic
driver
 v2 : Removed timeout_needed and rebased the code to upstream kernel
Forced-wakeup
 v1 : New patch with name of forced wakeup started
 v2 : Extending the forced wakeup logic for all states. Setting the
decrementer instead of queuing up a hrtimer to implement the logic.

 drivers/cpuidle/cpuidle-powernv.c | 38 +++
 1 file changed, 38 insertions(+)

diff --git a/drivers/cpuidle/cpuidle-powernv.c 
b/drivers/cpuidle/cpuidle-powernv.c
index 84b1ebe212b3..bc9ca18ae7e3 100644
--- a/drivers/cpuidle/cpuidle-powernv.c
+++ b/drivers/cpuidle/cpuidle-powernv.c
@@ -46,6 +46,26 @@ static struct stop_psscr_table 
stop_psscr_table[CPUIDLE_STATE_MAX] __read_mostly
 static u64 default_snooze_timeout __read_mostly;
 static bool snooze_timeout_en __read_mostly;
 
+static u64 forced_wakeup_timeout(struct cpuidle_device *dev,
+				 struct cpuidle_driver *drv,
+				 int index)
+{
+	int i;
+
+	for (i = index + 1; i < drv->state_count; i++) {
+		struct cpuidle_state *s = &drv->states[i];
+		struct cpuidle_state_usage *su = &dev->states_usage[i];
+
+		if (s->disabled || su->disable)
+			continue;
+
+		return (s->target_residency + 2 * s->exit_latency) *
+			tb_ticks_per_usec;
+	}
+
+	return 0;
+}
+
 static u64 get_snooze_timeout(struct cpuidle_device *dev,
  struct cpuidle_driver *drv,
  int index)
@@ -144,8 +164,26 @@ static int stop_loop(struct cpuidle_device *dev,
 struct cpuidle_driver *drv,
 int index)
 {
+   u64 dec_expiry_tb, dec, timeout_tb, forced_wakeup;
+
+   dec = mfspr(SPRN_DEC);
+   timeout_tb = forced_wakeup_timeout(dev, drv, index);
+   forced_wakeup = 0;
+
+   if (timeout_tb && timeout_tb < dec) {
+   forced_wakeup = 1;
+   dec_expiry_tb = mftb() + dec;
+   }
+
+   if (forced_wakeup)
+   mtspr(SPRN_DEC, timeout_tb);
+
power9_idle_type(stop_psscr_table[index].val,
 stop_psscr_table[index].mask);
+
+   if (forced_wakeup)
+   mtspr(SPRN_DEC, dec_expiry_tb - mftb());
+
return index;
 }
 
-- 
2.17.1



[PATCH v2 0/1] Forced-wakeup for stop states on Powernv

2019-06-17 Thread Abhishek Goel
Currently, the cpuidle governors determine what idle state an idling CPU
should enter into based on heuristics that depend on the idle history on
that CPU. Given that no predictive heuristic is perfect, there are cases
where the governor predicts a shallow idle state, hoping that the CPU will
be busy soon. However, if no new workload is scheduled on that CPU in the
near future, the CPU will end up in the shallow state.

Motivation
--
In the case of POWER, this is problematic when the predicted state in the
aforementioned scenario is a shallow stop state on a tickless system, as
we might get stuck in shallow states even for hours, in the absence of
ticks or interrupts.

To address this, we forcefully wake up the CPU by setting the decrementer.
The decrementer is set to a value that corresponds to the residency of the
next available state, effectively arming a timer that will forcefully wake
up the CPU. A few such iterations will essentially train the governor to
select a deeper state for that CPU, as the timer here corresponds to the
next available cpuidle state's residency. Thus, the CPU will eventually
end up in the deepest possible state and we won't get stuck in a shallow
state for a long duration.

Experiment
--
For earlier versions, when this feature was meant to be only for shallow lite
states, I performed experiments for three scenarios to collect some data.

case 1 :
Without this patch and without tick retained, i.e. in an upstream kernel,
it would take even more than a second to get out of stop0_lite.

case 2 : With tick retained in an upstream kernel -

Generally, we have a sched tick at 4ms (CONF_HZ = 250). Ideally I expected
it to take 8 sched ticks to get out of stop0_lite. Experimentally, the
observation was

=====================================
sample    min    max     99percentile
20        4ms    12ms    4ms
=====================================

It would take at least one sched tick to get out of stop0_lite.

case 3 :  With this patch (not stopping tick, but explicitly queuing a
          timer)

=====================================
sample    min     max     99percentile
20        144us   192us   144us
=====================================


Description of current implementation
-

We calculate the timeout for the current idle state as the residency value
of the next available idle state. If the decrementer is set to expire later
than this timeout, we update the decrementer value with the residency of the
next available idle state. Thus, we essentially train the governor to select
the next available deeper state until we reach the deepest state. Hence, we
won't get stuck unnecessarily in shallow states for long durations.


v1 of auto-promotion : https://lkml.org/lkml/2019/3/22/58 This patch was
implemented only for shallow lite state in generic cpuidle driver.

v2 of auto-promotion : Removed timeout_needed and rebased to current
upstream kernel

Then, 
v1 of forced-wakeup : Moved the code to cpuidle powernv driver and started
as forced wakeup instead of auto-promotion

v2 of forced-wakeup : Extended the forced wakeup logic for all states.
Setting the decrementer instead of queuing up a hrtimer to implement the
logic.

Abhishek Goel (1):
  cpuidle-powernv : forced wakeup for stop states

 drivers/cpuidle/cpuidle-powernv.c | 38 +++
 1 file changed, 38 insertions(+)

-- 
2.17.1



Re: [PATCH 0/1] Forced-wakeup for stop lite states on Powernv

2019-05-13 Thread Abhishek

On 05/08/2019 10:29 AM, Nicholas Piggin wrote:

Abhishek Goel's on April 22, 2019 4:32 pm:

Currently, the cpuidle governors determine what idle state a idling CPU
should enter into based on heuristics that depend on the idle history on
that CPU. Given that no predictive heuristic is perfect, there are cases
where the governor predicts a shallow idle state, hoping that the CPU will
be busy soon. However, if no new workload is scheduled on that CPU in the
near future, the CPU will end up in the shallow state.

Motivation
--
In case of POWER, this is problematic, when the predicted state in the
aforementioned scenario is a lite stop state, as such lite states will
inhibit SMT folding, thereby depriving the other threads in the core from
using the core resources.

So we do not want to get stuck in such states for a longer duration. To
address this, the cpuidle core can queue a timer to correspond with the
residency value of the next available state. This timer will forcefully
wake up the CPU. A few such iterations will essentially train the governor
to select a deeper state for that CPU, as the timer here corresponds to the
next available cpuidle state's residency. The CPU will be kicked out of the
lite state and end up in a non-lite state.

Experiment
--
I performed experiments for three scenarios to collect some data.

case 1 :
Without this patch and without tick retained, i.e. in a upstream kernel,
It would spend more than even a second to get out of stop0_lite.

case 2 : With tick retained in a upstream kernel -

Generally, we have a sched tick at 4ms (CONF_HZ = 250). Ideally I expected
it to take 8 sched ticks to get out of stop0_lite. Experimentally, the
observation was

=====================================
sample    min    max     99percentile
20        4ms    12ms    4ms
=====================================

It would take at least one sched tick to get out of stop0_lite.

case 3 :  With this patch (not stopping tick, but explicitly queuing a
          timer)

=====================================
sample    min     max     99percentile
20        144us   192us   144us
=====================================


In this patch, we queue a timer just before entering into a stop0_lite
state. The timer fires at (residency of next available state + 2 * exit
latency of next available state). Let's say the next state (stop0) is
available, which has a residency of 20us; we should then get out in as low
as (20+2*2)*8 [based on the formula (residency + 2 x latency) * history
length] microseconds = 192us. Ideally we would expect 8 iterations; it was
observed to get out in 6-7 iterations. Even if, say, stop2 is the next
available state (stop0 and stop1 both unavailable), it would take
(100+2*10)*8 = 960us to get into stop2.
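
To make the arithmetic above explicit, the expected worst-case exit time is
the per-iteration timeout multiplied by the governor's history length
(8 here):

	T_exit ~= (t_residency + 2 * t_exit_latency) * history_length
	stop0 next:  (20us  + 2*2us)  * 8 = 192us
	stop2 next:  (100us + 2*10us) * 8 = 960us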

So, we are able to get out of stop0_lite generally in 150us (with this
patch) as compared to 4ms (with tick retained). As stated earlier, we do not
want to get stuck in stop0_lite as it inhibits SMT folding for the other
sibling threads, depriving them of core resources. The current patch is
using forced-wakeup only for stop0_lite, as it gives a performance benefit
(the primary reason) along with lowering power consumption. We may extend
this model to other states in the future.

I still have to wonder, between our snooze loop and stop0, what does
stop0_lite buy us.

That said, the problem you're solving here is a generic one that all
stop states have, I think. Doesn't the same thing apply going from
stop0 to stop5? You might underestimate the sleep time and lose power
savings and therefore performance there too. Shouldn't we make it
generic for all stop states?

Thanks,
Nick


When a cpu is in snooze, it takes both space and time of the core. When in
stop0_lite, it frees up time but still takes space. When it is in stop0 or
deeper, it frees up both space and time slice of the core.

In stop0_lite, the cpu doesn't free up the core resources and thus inhibits
thread folding. When a cpu goes to stop0, it will free up the core
resources, thus increasing the single-thread performance of the other
sibling thread.

Hence, we do not want to get stuck in stop0_lite for a long duration, and
want to quickly move on to the next state. If we get stuck in any other
state we would possibly be losing out on power saving, but will still be
able to gain the performance benefits for the other sibling threads.


Thanks,
Abhishek



[PATCH 1/1] cpuidle-powernv : forced wakeup for stop lite states

2019-04-22 Thread Abhishek Goel
Currently, the cpuidle governors determine what idle state an idling CPU
should enter into based on heuristics that depend on the idle history on
that CPU. Given that no predictive heuristic is perfect, there are cases
where the governor predicts a shallow idle state, hoping that the CPU will
be busy soon. However, if no new workload is scheduled on that CPU in the
near future, the CPU will end up in the shallow state.

In the case of POWER, this is problematic when the predicted state in the
aforementioned scenario is a lite stop state, as such lite states will
inhibit SMT folding, thereby depriving the other threads in the core of
the core resources.

So we do not want to get stuck in such states for a longer duration. To
address this, the cpuidle core can queue a timer to correspond with the
residency value of the next available state. This timer will forcefully
wake up the CPU. A few such iterations will essentially train the governor
to select a deeper state for that CPU, as the timer here corresponds to the
next available cpuidle state's residency. The CPU will be kicked out of the
lite state and end up in a non-lite state.

Signed-off-by: Abhishek Goel 
---
 arch/powerpc/include/asm/opal-api.h |  1 +
 drivers/cpuidle/cpuidle-powernv.c   | 71 -
 2 files changed, 71 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/opal-api.h 
b/arch/powerpc/include/asm/opal-api.h
index 870fb7b23..735dec731 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -226,6 +226,7 @@
  */
 
 #define OPAL_PM_TIMEBASE_STOP  0x0002
+#define OPAL_PM_LOSE_USER_CONTEXT  0x1000
 #define OPAL_PM_LOSE_HYP_CONTEXT   0x2000
 #define OPAL_PM_LOSE_FULL_CONTEXT  0x4000
 #define OPAL_PM_NAP_ENABLED0x0001
diff --git a/drivers/cpuidle/cpuidle-powernv.c 
b/drivers/cpuidle/cpuidle-powernv.c
index 84b1ebe21..30b877962 100644
--- a/drivers/cpuidle/cpuidle-powernv.c
+++ b/drivers/cpuidle/cpuidle-powernv.c
@@ -15,6 +15,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -43,6 +44,40 @@ struct stop_psscr_table {
 
 static struct stop_psscr_table stop_psscr_table[CPUIDLE_STATE_MAX] 
__read_mostly;
 
+DEFINE_PER_CPU(struct hrtimer, forced_wakeup_timer);
+
+static int forced_wakeup_time_compute(struct cpuidle_device *dev,
+ struct cpuidle_driver *drv,
+ int index)
+{
+   int i, timeout_us = 0;
+
+   for (i = index + 1; i < drv->state_count; i++) {
+   if (drv->states[i].disabled || dev->states_usage[i].disable)
+   continue;
+   timeout_us = drv->states[i].target_residency +
+   2 * drv->states[i].exit_latency;
+   break;
+   }
+
+   return timeout_us;
+}
+
+enum hrtimer_restart forced_wakeup_hrtimer_callback(struct hrtimer *hrtimer)
+{
+   return HRTIMER_NORESTART;
+}
+
+static void forced_wakeup_timer_init(int cpu, struct cpuidle_driver *drv)
+{
+   struct hrtimer *cpu_forced_wakeup_timer = &per_cpu(forced_wakeup_timer,
+   cpu);
+
+   hrtimer_init(cpu_forced_wakeup_timer, CLOCK_MONOTONIC,
+   HRTIMER_MODE_REL);
+   cpu_forced_wakeup_timer->function = forced_wakeup_hrtimer_callback;
+}
+
 static u64 default_snooze_timeout __read_mostly;
 static bool snooze_timeout_en __read_mostly;
 
@@ -103,6 +138,28 @@ static int snooze_loop(struct cpuidle_device *dev,
return index;
 }
 
+static int stop_lite_loop(struct cpuidle_device *dev,
+ struct cpuidle_driver *drv,
+ int index)
+{
+   int timeout_us;
+   struct hrtimer *this_timer = &per_cpu(forced_wakeup_timer, dev->cpu);
+
+   timeout_us = forced_wakeup_time_compute(dev, drv, index);
+
+   if (timeout_us > 0)
+   hrtimer_start(this_timer, ns_to_ktime(timeout_us * 1000),
+   HRTIMER_MODE_REL_PINNED);
+
+   power9_idle_type(stop_psscr_table[index].val,
+stop_psscr_table[index].mask);
+
+   if (unlikely(hrtimer_is_queued(this_timer)))
+   hrtimer_cancel(this_timer);
+
+   return index;
+}
+
 static int nap_loop(struct cpuidle_device *dev,
struct cpuidle_driver *drv,
int index)
@@ -190,7 +247,7 @@ static int powernv_cpuidle_cpu_dead(unsigned int cpu)
  */
 static int powernv_cpuidle_driver_init(void)
 {
-   int idle_state;
+   int idle_state, cpu;
	struct cpuidle_driver *drv = &powernv_idle_driver;
 
drv->state_count = 0;
@@ -224,6 +281,9 @@ static int powernv_cpuidle_driver_init(void)
 
drv->cpumask = (struct cpumask *)cpu_present_mask;
 
+   for_each_cpu(cpu, drv->cpumask)
+		forced_wakeup_timer_init(cpu, drv);

[PATCH 0/1] Forced-wakeup for stop lite states on Powernv

2019-04-22 Thread Abhishek Goel
Currently, the cpuidle governors determine what idle state a idling CPU
should enter into based on heuristics that depend on the idle history on
that CPU. Given that no predictive heuristic is perfect, there are cases
where the governor predicts a shallow idle state, hoping that the CPU will
be busy soon. However, if no new workload is scheduled on that CPU in the
near future, the CPU will end up in the shallow state.

Motivation
--
In case of POWER, this is problematic, when the predicted state in the
aforementioned scenario is a lite stop state, as such lite states will
inhibit SMT folding, thereby depriving the other threads in the core from
using the core resources.

So we do not want to get stuck in such states for a longer duration. To
address this, the cpuidle core can queue a timer to correspond with the
residency value of the next available state. This timer will forcefully
wake up the CPU. A few such iterations will essentially train the governor
to select a deeper state for that CPU, as the timer here corresponds to the
next available cpuidle state's residency. The CPU will be kicked out of the
lite state and end up in a non-lite state.

Experiment
--
I performed experiments for three scenarios to collect some data.

case 1 :
Without this patch and without tick retained, i.e. in a upstream kernel,
It would spend more than even a second to get out of stop0_lite.

case 2 : With tick retained in a upstream kernel -

Generally, we have a sched tick at 4ms (CONF_HZ = 250). Ideally I expected
it to take 8 sched ticks to get out of stop0_lite. Experimentally, the
observation was

=====================================
sample    min    max     99percentile
20        4ms    12ms    4ms
=====================================

It would take at least one sched tick to get out of stop0_lite.

case 3 :  With this patch (not stopping tick, but explicitly queuing a
          timer)

=====================================
sample    min     max     99percentile
20        144us   192us   144us
=====================================


In this patch, we queue a timer just before entering into a stop0_lite
state. The timer fires at (residency of next available state + exit latency
of next available state * 2). Let's say if next state(stop0) is available
which has residency of 20us, it should get out in as low as (20+2*2)*8
[Based on the formula (residency + 2xlatency)*history length] microseconds
= 192us. Ideally we would expect 8 iterations, it was observed to get out
in 6-7 iterations. Even if let's say stop2 is next available state(stop0
and stop1 both are unavailable), it would take (100+2*10)*8 = 960us to get
into stop2.

So, we are able to get out of stop0_lite generally in 150us (with this
patch) as compared to 4ms (with tick retained). As stated earlier, we do not
want to get stuck in stop0_lite as it inhibits SMT folding for the other
sibling threads, depriving them of core resources. The current patch is
using forced-wakeup only for stop0_lite, as it gives a performance benefit
(the primary reason) along with lowering power consumption. We may extend
this model to other states in the future.

Abhishek Goel (1):
  cpuidle-powernv : forced wakeup for stop lite states

 arch/powerpc/include/asm/opal-api.h |  1 +
 drivers/cpuidle/cpuidle-powernv.c   | 71 -
 2 files changed, 71 insertions(+), 1 deletion(-)

-- 
2.17.1



Re: [PATCH v2 1/2] cpuidle : auto-promotion for cpuidle states

2019-04-14 Thread Abhishek

Hi Rafael,

Thanks for the Review. Few inline replies below.


On 04/09/2019 03:31 PM, Rafael J. Wysocki wrote:

On Fri, Apr 5, 2019 at 11:17 AM Abhishek Goel
 wrote:

Currently, the cpuidle governors (menu /ladder) determine what idle state

There are three governors in 5.1-rc.


an idling CPU should enter into based on heuristics that depend on the
idle history on that CPU. Given that no predictive heuristic is perfect,
there are cases where the governor predicts a shallow idle state, hoping
that the CPU will be busy soon. However, if no new workload is scheduled
on that CPU in the near future, the CPU will end up in the shallow state.

In case of POWER, this is problematic, when the predicted state in the
aforementioned scenario is a lite stop state, as such lite states will
inhibit SMT folding, thereby depriving the other threads in the core from
using the core resources.

To address this, such lite states need to be autopromoted.

I don't quite agree with this statement and it doesn't even match what
the patch does AFAICS.  "Autopromotion" would be going from the given
state to a deeper one without running state selection in between, but
that's not what's going on here.

Thinking to call it "timed-exit". Is that good?

The cpuidle-core can queue timer to correspond with the residency value of the 
next
available state. Thus leading to auto-promotion to a deeper idle state as
soon as possible.

No, it doesn't automatically cause a deeper state to be used next
time.  It simply kicks the CPU out of the idle state and one more
iteration of the idle loop runs on it.  Whether or not a deeper state
will be selected in that iteration depends on the governor
computations carried out in it.

I did not mean that next state is chosen automatically. I should have been
more descriptive here instead of just using "as soon as possible"

Now, this appears to be almost analogous to the "polling" state used
on x86 which uses the next idle state's target residency as a timeout.

While generally I'm not a big fan of setting up timers in the idle
loop (it sort of feels like pulling your own hair in order to get
yourself out of a swamp), if idle states like these are there in your
platform, setting up a timer to get out of them in the driver's
->enter() routine might not be particularly objectionable.  Doing that
in the core is a whole different story, though.

Generally, this adds quite a bit of complexity (on the "ugly" side of
things IMO) to the core to cover a corner case present in one
platform, while IMO it can be covered in the driver for that platform
directly.

As of now, since this code doesn't add any benefit to the other platforms,
I will post a patch with this implementation covered in platform-specific
driver code. You are right that all the information needed for this
implementation is also available there in the platform driver code, so we
should be good to go.



Re: [PATCH v2 1/2] cpuidle : auto-promotion for cpuidle states

2019-04-09 Thread Abhishek

On 04/09/2019 03:00 PM, Rafael J. Wysocki wrote:

On Tue, Apr 9, 2019 at 11:29 AM Abhishek  wrote:

Hi Daniel,

Thanks for such a descriptive review. I will include all the suggestions
made in my next iteration.

Please give me some time to send comments before that.

Sure, I will wait for your review.



Re: [PATCH v2 1/2] cpuidle : auto-promotion for cpuidle states

2019-04-09 Thread Abhishek

Hi Daniel,

On 04/08/2019 07:55 PM, Daniel Axtens wrote:

Hi,

Sorry, just realised another thing I wanted to ask:


@@ -442,6 +442,26 @@ static int menu_select(struct cpuidle_driver *drv, struct 
cpuidle_device *dev,
}
}


+#ifdef CPUIDLE_FLAG_AUTO_PROMOTION

Why is this based on CPUIDLE_FLAG_ rather than CONFIG_CPU_IDLE_? Won't
this always be true, given that the flag is defined regardless of the
config option in the header?

Yeah, You are right. This should have been CONFIG_CPU_IDLE_AUTO_PROMOTION.

--Abhishek



Re: [PATCH v2 1/2] cpuidle : auto-promotion for cpuidle states

2019-04-09 Thread Abhishek

Hi Daniel,

Thanks for such a descriptive review. I will include all the suggestions 
made in my next iteration.


--Abhishek

On 04/08/2019 07:42 PM, Daniel Axtens wrote:

Hi Abhishek,


Currently, the cpuidle governors (menu /ladder) determine what idle state
an idling CPU should enter into based on heuristics that depend on the
idle history on that CPU. Given that no predictive heuristic is perfect,
there are cases where the governor predicts a shallow idle state, hoping
that the CPU will be busy soon. However, if no new workload is scheduled
on that CPU in the near future, the CPU will end up in the shallow state.

In case of POWER, this is problematic, when the predicted state in the
aforementioned scenario is a lite stop state, as such lite states will
inhibit SMT folding, thereby depriving the other threads in the core from
using the core resources.

To address this, such lite states need to be autopromoted. The cpuidle-
core can queue timer to correspond with the residency value of the next
available state. Thus leading to auto-promotion to a deeper idle state as
soon as possible.


This sounds sensible to me, although I'm not really qualified to offer a
full power-management opinion on it. I have some general code questions
and comments, however, which are below:


Signed-off-by: Abhishek Goel 
---

v1->v2 : Removed timeout_needed and rebased to current upstream kernel

  drivers/cpuidle/cpuidle.c  | 68 +-
  drivers/cpuidle/governors/ladder.c |  3 +-
  drivers/cpuidle/governors/menu.c   | 22 +-
  include/linux/cpuidle.h| 10 -
  4 files changed, 99 insertions(+), 4 deletions(-)

diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index 7f108309e..11ce43f19 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -36,6 +36,11 @@ static int enabled_devices;
  static int off __read_mostly;
  static int initialized __read_mostly;
  
+struct auto_promotion {

+   struct hrtimer  hrtimer;
+   unsigned long   timeout_us;
+};
+
  int cpuidle_disabled(void)
  {
return off;
@@ -188,6 +193,54 @@ int cpuidle_enter_s2idle(struct cpuidle_driver *drv, 
struct cpuidle_device *dev)
  }
  #endif /* CONFIG_SUSPEND */
  
+enum hrtimer_restart auto_promotion_hrtimer_callback(struct hrtimer *hrtimer)

+{
+   return HRTIMER_NORESTART;
+}
+
+#ifdef CONFIG_CPU_IDLE_AUTO_PROMOTION

As far as I can tell, this config flag isn't defined until the next
patch, making this dead code for now. Is this intentional?


+DEFINE_PER_CPU(struct auto_promotion, ap);

A quick grep suggests that most per-cpu variable have more descriptive
names, perhaps this one should too.


+
+static void cpuidle_auto_promotion_start(int cpu, struct cpuidle_state *state)
+{
+   struct auto_promotion *this_ap = &per_cpu(ap, cpu);
+
+   if (state->flags & CPUIDLE_FLAG_AUTO_PROMOTION)
+   hrtimer_start(&this_ap->hrtimer, ns_to_ktime(this_ap->timeout_us
+   * 1000), HRTIMER_MODE_REL_PINNED);

Would it be clearer to have both sides of the multiplication on the same
line? i.e.
+   hrtimer_start(&this_ap->hrtimer,
+ ns_to_ktime(this_ap->timeout_us * 1000),
+ HRTIMER_MODE_REL_PINNED);


+}
+
+static void cpuidle_auto_promotion_cancel(int cpu)
+{
+   struct hrtimer *hrtimer;
+
+   hrtimer = &per_cpu(ap, cpu).hrtimer;
+   if (hrtimer_is_queued(hrtimer))
+   hrtimer_cancel(hrtimer);
+}
+
+static void cpuidle_auto_promotion_update(int cpu, unsigned long timeout)
+{
+   per_cpu(ap, cpu).timeout_us = timeout;
+}
+
+static void cpuidle_auto_promotion_init(int cpu, struct cpuidle_driver *drv)
+{
+   struct auto_promotion *this_ap = &per_cpu(ap, cpu);
+
+   hrtimer_init(&this_ap->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+   this_ap->hrtimer.function = auto_promotion_hrtimer_callback;
+}
+#else
+static inline void cpuidle_auto_promotion_start(int cpu, struct cpuidle_state
+   *state) { }
+static inline void cpuidle_auto_promotion_cancel(int cpu) { }
+static inline void cpuidle_auto_promotion_update(int cpu, unsigned long
+   timeout) { }
+static inline void cpuidle_auto_promotion_init(int cpu, struct cpuidle_driver
+   *drv) { }

Several of these have the type, then a line break, and then the name
(unsigned long\n  timeout). This is a bit harder to read, they should
probably all be on the same line.


+#endif
+
  /**
   * cpuidle_enter_state - enter the state and update stats
   * @dev: cpuidle device for this cpu
@@ -225,12 +278,17 @@ int cpuidle_enter_state(struct cpuidle_device *dev, 
struct cpuidle_driver *drv,
trace_cpu_idle_rcuidle(index, dev->cpu);
time_start = ns_to_ktime(local_clock());
  
+	cpuidle_auto_promotion_start(dev->cpu, target_state);

[PATCH v2 2/2] cpuidle : Add auto-promotion flag to cpuidle flags

2019-04-05 Thread Abhishek Goel
This patch sets up flags for the state which needs to be auto-promoted. On
POWERNV system, only lite states need to be autopromoted. We identify lite
states by those which do not lose user context. That information has been
used to set the flag for lite states.

Signed-off-by: Abhishek Goel 
---
 arch/powerpc/include/asm/opal-api.h |  1 +
 drivers/cpuidle/Kconfig |  4 
 drivers/cpuidle/cpuidle-powernv.c   | 13 +++--
 3 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/opal-api.h 
b/arch/powerpc/include/asm/opal-api.h
index 870fb7b23..735dec731 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -226,6 +226,7 @@
  */
 
 #define OPAL_PM_TIMEBASE_STOP  0x0002
+#define OPAL_PM_LOSE_USER_CONTEXT  0x1000
 #define OPAL_PM_LOSE_HYP_CONTEXT   0x2000
 #define OPAL_PM_LOSE_FULL_CONTEXT  0x4000
 #define OPAL_PM_NAP_ENABLED0x0001
diff --git a/drivers/cpuidle/Kconfig b/drivers/cpuidle/Kconfig
index 8caccbbd7..9b8e9b96a 100644
--- a/drivers/cpuidle/Kconfig
+++ b/drivers/cpuidle/Kconfig
@@ -35,6 +35,10 @@ config CPU_IDLE_GOV_TEO
 config DT_IDLE_STATES
bool
 
+config CPU_IDLE_AUTO_PROMOTION
+   bool
+   default y if PPC_POWERNV
+
 menu "ARM CPU Idle Drivers"
 depends on ARM || ARM64
 source "drivers/cpuidle/Kconfig.arm"
diff --git a/drivers/cpuidle/cpuidle-powernv.c 
b/drivers/cpuidle/cpuidle-powernv.c
index 84b1ebe21..0dd767270 100644
--- a/drivers/cpuidle/cpuidle-powernv.c
+++ b/drivers/cpuidle/cpuidle-powernv.c
@@ -299,6 +299,7 @@ static int powernv_add_idle_states(void)
for (i = 0; i < dt_idle_states; i++) {
unsigned int exit_latency, target_residency;
bool stops_timebase = false;
+   bool lose_user_context = false;
+   struct pnv_idle_states_t *state = &pnv_idle_states[i];
 
/*
@@ -324,6 +325,9 @@ static int powernv_add_idle_states(void)
if (has_stop_states && !(state->valid))
continue;
 
+   if (state->flags & OPAL_PM_LOSE_USER_CONTEXT)
+   lose_user_context = true;
+
if (state->flags & OPAL_PM_TIMEBASE_STOP)
stops_timebase = true;
 
@@ -332,12 +336,17 @@ static int powernv_add_idle_states(void)
add_powernv_state(nr_idle_states, "Nap",
  CPUIDLE_FLAG_NONE, nap_loop,
  target_residency, exit_latency, 0, 0);
+   } else if (has_stop_states && !lose_user_context) {
+   add_powernv_state(nr_idle_states, state->name,
+ CPUIDLE_FLAG_AUTO_PROMOTION,
+ stop_loop, target_residency,
+ exit_latency, state->psscr_val,
+ state->psscr_mask);
} else if (has_stop_states && !stops_timebase) {
add_powernv_state(nr_idle_states, state->name,
  CPUIDLE_FLAG_NONE, stop_loop,
  target_residency, exit_latency,
- state->psscr_val,
- state->psscr_mask);
+ state->psscr_val, state->psscr_mask);
}
 
/*
-- 
2.17.1



[PATCH v2 1/2] cpuidle : auto-promotion for cpuidle states

2019-04-05 Thread Abhishek Goel
Currently, the cpuidle governors (menu /ladder) determine what idle state
an idling CPU should enter into based on heuristics that depend on the
idle history on that CPU. Given that no predictive heuristic is perfect,
there are cases where the governor predicts a shallow idle state, hoping
that the CPU will be busy soon. However, if no new workload is scheduled
on that CPU in the near future, the CPU will end up in the shallow state.

In case of POWER, this is problematic, when the predicted state in the
aforementioned scenario is a lite stop state, as such lite states will
inhibit SMT folding, thereby depriving the other threads in the core from
using the core resources.

To address this, such lite states need to be autopromoted. The cpuidle-
core can queue timer to correspond with the residency value of the next
available state. Thus leading to auto-promotion to a deeper idle state as
soon as possible.

Signed-off-by: Abhishek Goel 
---

v1->v2 : Removed timeout_needed and rebased to current upstream kernel

 drivers/cpuidle/cpuidle.c  | 68 +-
 drivers/cpuidle/governors/ladder.c |  3 +-
 drivers/cpuidle/governors/menu.c   | 22 +-
 include/linux/cpuidle.h| 10 -
 4 files changed, 99 insertions(+), 4 deletions(-)

diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index 7f108309e..11ce43f19 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -36,6 +36,11 @@ static int enabled_devices;
 static int off __read_mostly;
 static int initialized __read_mostly;
 
+struct auto_promotion {
+   struct hrtimer  hrtimer;
+   unsigned long   timeout_us;
+};
+
 int cpuidle_disabled(void)
 {
return off;
@@ -188,6 +193,54 @@ int cpuidle_enter_s2idle(struct cpuidle_driver *drv, 
struct cpuidle_device *dev)
 }
 #endif /* CONFIG_SUSPEND */
 
+enum hrtimer_restart auto_promotion_hrtimer_callback(struct hrtimer *hrtimer)
+{
+   return HRTIMER_NORESTART;
+}
+
+#ifdef CONFIG_CPU_IDLE_AUTO_PROMOTION
+DEFINE_PER_CPU(struct auto_promotion, ap);
+
+static void cpuidle_auto_promotion_start(int cpu, struct cpuidle_state *state)
+{
+   struct auto_promotion *this_ap = &per_cpu(ap, cpu);
+
+   if (state->flags & CPUIDLE_FLAG_AUTO_PROMOTION)
+   hrtimer_start(&this_ap->hrtimer, ns_to_ktime(this_ap->timeout_us
+   * 1000), HRTIMER_MODE_REL_PINNED);
+}
+
+static void cpuidle_auto_promotion_cancel(int cpu)
+{
+   struct hrtimer *hrtimer;
+
+   hrtimer = &per_cpu(ap, cpu).hrtimer;
+   if (hrtimer_is_queued(hrtimer))
+   hrtimer_cancel(hrtimer);
+}
+
+static void cpuidle_auto_promotion_update(int cpu, unsigned long timeout)
+{
+   per_cpu(ap, cpu).timeout_us = timeout;
+}
+
+static void cpuidle_auto_promotion_init(int cpu, struct cpuidle_driver *drv)
+{
+   struct auto_promotion *this_ap = &per_cpu(ap, cpu);
+
+   hrtimer_init(&this_ap->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+   this_ap->hrtimer.function = auto_promotion_hrtimer_callback;
+}
+#else
+static inline void cpuidle_auto_promotion_start(int cpu, struct cpuidle_state
+   *state) { }
+static inline void cpuidle_auto_promotion_cancel(int cpu) { }
+static inline void cpuidle_auto_promotion_update(int cpu, unsigned long
+   timeout) { }
+static inline void cpuidle_auto_promotion_init(int cpu, struct cpuidle_driver
+   *drv) { }
+#endif
+
 /**
  * cpuidle_enter_state - enter the state and update stats
  * @dev: cpuidle device for this cpu
@@ -225,12 +278,17 @@ int cpuidle_enter_state(struct cpuidle_device *dev, 
struct cpuidle_driver *drv,
trace_cpu_idle_rcuidle(index, dev->cpu);
time_start = ns_to_ktime(local_clock());
 
+   cpuidle_auto_promotion_start(dev->cpu, target_state);
+
stop_critical_timings();
entered_state = target_state->enter(dev, drv, index);
start_critical_timings();
 
sched_clock_idle_wakeup_event();
time_end = ns_to_ktime(local_clock());
+
+   cpuidle_auto_promotion_cancel(dev->cpu);
+
trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu);
 
/* The cpu is no longer idle or about to enter idle. */
@@ -312,7 +370,13 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct 
cpuidle_driver *drv,
 int cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
   bool *stop_tick)
 {
-   return cpuidle_curr_governor->select(drv, dev, stop_tick);
+   unsigned long timeout_us, ret;
+
+   timeout_us = UINT_MAX;
+   ret = cpuidle_curr_governor->select(drv, dev, stop_tick, &timeout_us);
+   cpuidle_auto_promotion_update(dev->cpu, timeout_us);
+
+   return ret;
 }
 
 /**
@@ -658,6 +722,8 @@ int cpuidle_register(struct cpuidle_driver *drv,
	device = &per_cpu(cpuidle_dev, cpu);

[PATCH v2 0/2] Auto-promotion logic for cpuidle states

2019-04-05 Thread Abhishek Goel
Currently, the cpuidle governors (menu/ladder) determine what idle state a
idling CPU should enter into based on heuristics that depend on the idle
history on that CPU. Given that no predictive heuristic is perfect, there
are cases where the governor predicts a shallow idle state, hoping that
the CPU will be busy soon. However, if no new workload is scheduled on
that CPU in the near future, the CPU will end up in the shallow state.

Motivation
--
In case of POWER, this is problematic, when the predicted state in the
aforementioned scenario is a lite stop state, as such lite states will
inhibit SMT folding, thereby depriving the other threads in the core from
using the core resources.

To address this, such lite states need to be autopromoted. The cpuidle-core
can queue timer to correspond with the residency value of the next
available state. Thus leading to auto-promotion to a deeper idle state as
soon as possible.

Experiment
--
I performed experiments for three scenarios to collect some data.

case 1 :
Without this patch and without tick retained, i.e. in a upstream kernel,
It would spend more than even a second to get out of stop0_lite.

case 2 : With tick retained(as suggested) -

Generally, we have a sched tick at 4ms (CONF_HZ = 250). Ideally I expected
it to take 8 sched ticks to get out of stop0_lite. Experimentally, the
observation was

=====================================
sample    min    max     99percentile
20        4ms    12ms    4ms
=====================================
*ms = milliseconds

It would take at least one sched tick to get out of stop0_lite.

case 3 :  With this patch (not stopping tick, but explicitly queuing a
          timer)

=====================================
sample    min     max     99percentile
20        144us   192us   144us
=====================================

*us = microseconds

In this patch, we queue a timer just before entering into a stop0_lite
state. The timer fires at (residency of next available state + exit latency
of next available state * 2). Let's say if next state(stop0) is available
which has residency of 20us, it should get out in as low as (20+2*2)*8
[Based on the formula (residency + 2xlatency)*history length] microseconds
= 192us. Ideally we would expect 8 iterations, it was observed to get out
in 6-7 iterations. Even if let's say stop2 is next available state(stop0
and stop1 both are unavailable), it would take (100+2*10)*8 = 960us to get
into stop2.

So, we are able to get out of stop0_lite generally in 150us (with this
patch) as compared to 4ms (with tick retained). As stated earlier, we do not
want to get stuck in stop0_lite as it inhibits SMT folding for the other
sibling threads, depriving them of core resources. The current patch is
using auto-promotion only for stop0_lite, as it gives a performance benefit
(the primary reason) along with lowering power consumption. We may extend
this model to other states in the future.

Abhishek Goel (2):
  cpuidle : auto-promotion for cpuidle states
  cpuidle : Add auto-promotion flag to cpuidle flags

 arch/powerpc/include/asm/opal-api.h |  1 +
 drivers/cpuidle/Kconfig |  4 ++
 drivers/cpuidle/cpuidle-powernv.c   | 13 +-
 drivers/cpuidle/cpuidle.c   | 68 -
 drivers/cpuidle/governors/ladder.c  |  3 +-
 drivers/cpuidle/governors/menu.c| 22 +-
 include/linux/cpuidle.h | 10 -
 7 files changed, 115 insertions(+), 6 deletions(-)

-- 
2.17.1



Re: [PATCH 1/2] cpuidle : auto-promotion for cpuidle states

2019-04-04 Thread Abhishek




On 04/04/2019 03:51 PM, Daniel Lezcano wrote:

Hi Abhishek,

thanks for taking the time to test the different scenario and give us
the numbers.

On 01/04/2019 07:11, Abhishek wrote:


On 03/22/2019 06:56 PM, Daniel Lezcano wrote:

On 22/03/2019 10:45, Rafael J. Wysocki wrote:

On Fri, Mar 22, 2019 at 8:31 AM Abhishek Goel
 wrote:

Currently, the cpuidle governors (menu /ladder) determine what idle
state
an idling CPU should enter into based on heuristics that depend on the
idle history on that CPU. Given that no predictive heuristic is
perfect,
there are cases where the governor predicts a shallow idle state,
hoping
that the CPU will be busy soon. However, if no new workload is
scheduled
on that CPU in the near future, the CPU will end up in the shallow
state.

In case of POWER, this is problematic, when the predicted state in the
aforementioned scenario is a lite stop state, as such lite states will
inhibit SMT folding, thereby depriving the other threads in the core
from
using the core resources.

I can understand an idle state can prevent other threads to use the core
resources. But why a deeper idle state does not prevent this also?



To address this, such lite states need to be autopromoted. The cpuidle-
core can queue timer to correspond with the residency value of the next
available state. Thus leading to auto-promotion to a deeper idle
state as
soon as possible.

Isn't the tick stopping avoidance sufficient for that?

I was about to ask the same :)





Thanks for the review.
I performed experiments for three scenarios to collect some data.

case 1 :
Without this patch and without tick retained, i.e. in a upstream kernel,
It would spend more than even a second to get out of stop0_lite.

case 2 : With tick retained(as suggested) -

Generally, we have a sched tick at 4ms(CONF_HZ = 250). Ideally I expected
it to take 8 sched tick to get out of stop0_lite. Experimentally,
observation was

===
min            max            99percentile
4ms            12ms          4ms
===
*ms = milliseconds

It would take at least one sched tick to get out of stop0_lite.

case 3 :  With this patch (not stopping tick, but explicitly queuing a
          timer)

min            max              99.5percentile
===
144us       192us              144us
===
*us = microseconds

In this patch, we queue a timer just before entering into a stop0_lite
state. The timer fires at (residency of next available state + exit
latency of next available state * 2).

So for the context, we have a similar issue but from the power
management point of view where a CPU can stay in a shallow state for a
long period, thus consuming a lot of energy.

The window was reduced by preventing stopping the tick when a shallow
state is selected. Unfortunately, if the tick is stopped and we
exit/enter again and we select a shallow state, the situation is the same.

A solution was previously proposed with a timer some years ago, like
this patch does, and merged but there were complains about bad
performance impact, so it has been reverted.


Let's say if next state(stop0) is available which has residency of 20us, it
should get out in as low as (20+2*2)*8 [Based on the formula (residency +
2xlatency)*history length] microseconds = 192us. Ideally we would expect 8
iterations, it was observed to get out in 6-7 iterations.

Can you explain the formula? I don't get the rational. Why using the
exit latency and why multiply it by 2?

Why the timer is not set to the next state's target residency value ?

The idea behind multiplying by 2 is: entry latency + exit latency = 2 * exit
latency, i.e., using exit latency = entry latency. So in effect, we are using
target residency + 2 * exit latency for the timeout of the timer. Latency is
generally <= 10% of residency. I have tried to be conservative by including
the latency factor in the computation of the timeout. Thus, this formula will
give a slightly greater value compared to directly using the residency of the
target state.
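
Put as an equation, approximating entry latency by exit latency, the
per-iteration timeout is:

	timeout = t_residency + t_entry + t_exit ~= t_residency + 2 * t_exit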

--Abhishek



Re: [PATCH 1/2] cpuidle : auto-promotion for cpuidle states

2019-03-31 Thread Abhishek




On 03/22/2019 06:56 PM, Daniel Lezcano wrote:

On 22/03/2019 10:45, Rafael J. Wysocki wrote:

On Fri, Mar 22, 2019 at 8:31 AM Abhishek Goel
 wrote:

Currently, the cpuidle governors (menu /ladder) determine what idle state
an idling CPU should enter into based on heuristics that depend on the
idle history on that CPU. Given that no predictive heuristic is perfect,
there are cases where the governor predicts a shallow idle state, hoping
that the CPU will be busy soon. However, if no new workload is scheduled
on that CPU in the near future, the CPU will end up in the shallow state.

In case of POWER, this is problematic, when the predicted state in the
aforementioned scenario is a lite stop state, as such lite states will
inhibit SMT folding, thereby depriving the other threads in the core from
using the core resources.

To address this, such lite states need to be autopromoted. The cpuidle-
core can queue timer to correspond with the residency value of the next
available state. Thus leading to auto-promotion to a deeper idle state as
soon as possible.

Isn't the tick stopping avoidance sufficient for that?

I was about to ask the same :)





Thanks for the review.
I performed experiments for three scenarios to collect some data.

case 1 :
Without this patch and without tick retained, i.e. in a upstream kernel,
It would spend more than even a second to get out of stop0_lite.

case 2 : With tick retained(as suggested) -

Generally, we have a sched tick at 4ms(CONF_HZ = 250). Ideally I expected
it to take 8 sched tick to get out of stop0_lite. Experimentally, 
observation was


===
min            max            99percentile
4ms            12ms          4ms
===
*ms = milliseconds

It would take at least one sched tick to get out of stop0_lite.

case 3 :  With this patch (not stopping tick, but explicitly queuing a
          timer)


min            max              99.5percentile
===
144us       192us              144us
===
*us = microseconds

In this patch, we queue a timer just before entering into a stop0_lite
state. The timer fires at (residency of next available state + exit
latency of next available state * 2).
Let's say if next state(stop0) is available which has residency of 20us, it
should get out in as low as (20+2*2)*8 [Based on the formula (residency +
2xlatency)*history length] microseconds = 192us. Ideally we would expect 8
iterations, it was observed to get out in 6-7 iterations.
Even if let's say stop2 is next available state(stop0 and stop1 both are
unavailable), it would take (100+2*10)*8 = 960us to get into stop2.

So, we are able to get out of stop0_lite generally in 150us (with this
patch) as compared to 4ms (with tick retained). As stated earlier, we do
not want to get stuck in stop0_lite as it inhibits SMT folding for the
other sibling threads, depriving them of core resources. The current patch
is using auto-promotion only for stop0_lite, as it gives a performance
benefit (the primary reason) along with lowering power consumption. We may
extend this model to other states in the future.

--Abhishek



Re: [PATCH 0/2] Auto-promotion logic for cpuidle states

2019-03-22 Thread Abhishek

Please ignore this set as this is incomplete. I have resent the patches.

--Abhishek

On 03/22/2019 11:55 AM, Abhishek Goel wrote:

Currently, the cpuidle governors (menu/ladder) determine what idle state a
idling CPU should enter into based on heuristics that depend on the idle
history on that CPU. Given that no predictive heuristic is perfect, there
are cases where the governor predicts a shallow idle state, hoping that
the CPU will be busy soon. However, if no new workload is scheduled on
that CPU in the near future, the CPU will end up in the shallow state.

Motivation
--
In case of POWER, this is problematic, when the predicted state in the
aforementioned scenario is a lite stop state, as such lite states will
inhibit SMT folding, thereby depriving the other threads in the core from
using the core resources.

To address this, such lite states need to be autopromoted. The cpuidle-core
can queue timer to correspond with the residency value of the next
available state. Thus leading to auto-promotion to a deeper idle state as
soon as possible.

Experiment
--
Without this patch -
It was seen that for an idle system, a cpu may remain in stop0_lite for a
few seconds and then directly go to a deeper state such as stop2.

With this patch -
A cpu will not remain in stop0_lite for more than the residency of the next
available state, and thus it will go to a deeper state in a conservative
fashion. Using this, we may spend even less than 20 milliseconds if
subsequent stop states are enabled. In the worst case, we may end up
spending more than a second, as was the case without this patch. The
worst case will occur in the scenario when no other shallow states are
enabled, and only deep states are available for auto-promotion.

Abhishek Goel (2):
   cpuidle : auto-promotion for cpuidle states
   cpuidle : Add auto-promotion flag to cpuidle flags

  arch/powerpc/include/asm/opal-api.h |  1 +
  drivers/cpuidle/Kconfig |  4 
  drivers/cpuidle/cpuidle-powernv.c   | 13 +++--
  drivers/cpuidle/cpuidle.c   |  3 ---
  4 files changed, 16 insertions(+), 5 deletions(-)





[PATCH 2/2] cpuidle : Add auto-promotion flag to cpuidle flags

2019-03-22 Thread Abhishek Goel
This patch sets up flags for the state which needs to be auto-promoted.
For powernv systems, lite states do not even lose user context. That
information has been used to set the flag for lite states.

Signed-off-by: Abhishek Goel 
---
 arch/powerpc/include/asm/opal-api.h |  1 +
 drivers/cpuidle/Kconfig |  4 
 drivers/cpuidle/cpuidle-powernv.c   | 13 +++--
 3 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/opal-api.h 
b/arch/powerpc/include/asm/opal-api.h
index 870fb7b23..735dec731 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -226,6 +226,7 @@
  */
 
 #define OPAL_PM_TIMEBASE_STOP  0x0002
+#define OPAL_PM_LOSE_USER_CONTEXT  0x1000
 #define OPAL_PM_LOSE_HYP_CONTEXT   0x2000
 #define OPAL_PM_LOSE_FULL_CONTEXT  0x4000
 #define OPAL_PM_NAP_ENABLED0x0001
diff --git a/drivers/cpuidle/Kconfig b/drivers/cpuidle/Kconfig
index 7e48eb5bf..0ece62684 100644
--- a/drivers/cpuidle/Kconfig
+++ b/drivers/cpuidle/Kconfig
@@ -26,6 +26,10 @@ config CPU_IDLE_GOV_MENU
 config DT_IDLE_STATES
bool
 
+config CPU_IDLE_AUTO_PROMOTION
+   bool
+   default y if PPC_POWERNV
+
 menu "ARM CPU Idle Drivers"
 depends on ARM || ARM64
 source "drivers/cpuidle/Kconfig.arm"
diff --git a/drivers/cpuidle/cpuidle-powernv.c 
b/drivers/cpuidle/cpuidle-powernv.c
index 84b1ebe21..e351f5f9c 100644
--- a/drivers/cpuidle/cpuidle-powernv.c
+++ b/drivers/cpuidle/cpuidle-powernv.c
@@ -299,6 +299,7 @@ static int powernv_add_idle_states(void)
for (i = 0; i < dt_idle_states; i++) {
unsigned int exit_latency, target_residency;
bool stops_timebase = false;
+   bool lose_user_context = false;
+   struct pnv_idle_states_t *state = &pnv_idle_states[i];
 
/*
@@ -324,6 +325,9 @@ static int powernv_add_idle_states(void)
if (has_stop_states && !(state->valid))
continue;
 
+   if (state->flags & OPAL_PM_LOSE_USER_CONTEXT)
+   lose_user_context = true;
+
if (state->flags & OPAL_PM_TIMEBASE_STOP)
stops_timebase = true;
 
@@ -332,12 +336,17 @@ static int powernv_add_idle_states(void)
add_powernv_state(nr_idle_states, "Nap",
  CPUIDLE_FLAG_NONE, nap_loop,
  target_residency, exit_latency, 0, 0);
+   } else if (has_stop_states & !lose_user_context) {
+   add_powernv_state(nr_idle_states, state->name,
+ CPUIDLE_FLAG_AUTO_PROMOTION,
+ stop_loop, target_residency,
+ exit_latency, state->psscr_val,
+ state->psscr_mask);
} else if (has_stop_states && !stops_timebase) {
add_powernv_state(nr_idle_states, state->name,
  CPUIDLE_FLAG_NONE, stop_loop,
  target_residency, exit_latency,
- state->psscr_val,
- state->psscr_mask);
+ state->psscr_val, state->psscr_mask);
}
 
/*
-- 
2.17.1



[PATCH 1/2] cpuidle : auto-promotion for cpuidle states

2019-03-22 Thread Abhishek Goel
Currently, the cpuidle governors (menu /ladder) determine what idle state
an idling CPU should enter into based on heuristics that depend on the
idle history on that CPU. Given that no predictive heuristic is perfect,
there are cases where the governor predicts a shallow idle state, hoping
that the CPU will be busy soon. However, if no new workload is scheduled
on that CPU in the near future, the CPU will end up in the shallow state.

In case of POWER, this is problematic, when the predicted state in the
aforementioned scenario is a lite stop state, as such lite states will
inhibit SMT folding, thereby depriving the other threads in the core from
using the core resources.

To address this, such lite states need to be autopromoted. The cpuidle-
core can queue timer to correspond with the residency value of the next
available state. Thus leading to auto-promotion to a deeper idle state as
soon as possible.

Signed-off-by: Abhishek Goel 
---
 drivers/cpuidle/cpuidle.c  | 79 +-
 drivers/cpuidle/governors/ladder.c |  3 +-
 drivers/cpuidle/governors/menu.c   | 23 -
 include/linux/cpuidle.h| 12 +++--
 4 files changed, 111 insertions(+), 6 deletions(-)

diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index 7f108309e..c4d1c1b38 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -36,6 +36,12 @@ static int enabled_devices;
 static int off __read_mostly;
 static int initialized __read_mostly;
 
+struct auto_promotion {
+   struct hrtimer  hrtimer;
+   int timeout;
+   booltimeout_needed;
+};
+
 int cpuidle_disabled(void)
 {
return off;
@@ -188,6 +194,64 @@ int cpuidle_enter_s2idle(struct cpuidle_driver *drv, 
struct cpuidle_device *dev)
 }
 #endif /* CONFIG_SUSPEND */
 
+enum hrtimer_restart auto_promotion_hrtimer_callback(struct hrtimer *hrtimer)
+{
+   return HRTIMER_NORESTART;
+}
+
+#ifdef CONFIG_CPU_IDLE_AUTO_PROMOTION
+DEFINE_PER_CPU(struct auto_promotion, ap);
+
+static void cpuidle_auto_promotion_start(struct cpuidle_state *state, int cpu)
+{
+   struct auto_promotion *this_ap = &per_cpu(ap, cpu);
+
+   if (this_ap->timeout_needed && (state->flags &
+   CPUIDLE_FLAG_AUTO_PROMOTION))
+   hrtimer_start(&this_ap->hrtimer, ns_to_ktime(this_ap->timeout
+   * 1000), HRTIMER_MODE_REL_PINNED);
+}
+
+static void cpuidle_auto_promotion_cancel(int cpu)
+{
+   struct hrtimer *hrtimer;
+
+   hrtimer = &per_cpu(ap, cpu).hrtimer;
+   if (hrtimer_is_queued(hrtimer))
+   hrtimer_cancel(hrtimer);
+}
+
+static void cpuidle_auto_promotion_update(int time, int cpu)
+{
+   per_cpu(ap, cpu).timeout = time;
+}
+
+static void cpuidle_auto_promotion_init(struct cpuidle_driver *drv, int cpu)
+{
+   int i;
+   struct auto_promotion *this_ap = &per_cpu(ap, cpu);
+
+   this_ap->timeout_needed = 0;
+
+   for (i = 0; i < drv->state_count; i++) {
+   if (drv->states[i].flags & CPUIDLE_FLAG_AUTO_PROMOTION) {
+   this_ap->timeout_needed = 1;
+   break;
+   }
+   }
+
+   hrtimer_init(&this_ap->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+   this_ap->hrtimer.function = auto_promotion_hrtimer_callback;
+}
+#else
+static inline void cpuidle_auto_promotion_start(struct cpuidle_state *state,
+   int cpu) { }
+static inline void cpuidle_auto_promotion_cancel(int cpu) { }
+static inline void cpuidle_auto_promotion_update(int timeout, int cpu) { }
+static inline void cpuidle_auto_promotion_init(struct cpuidle_driver *drv,
+   int cpu) { }
+#endif
+
 /**
  * cpuidle_enter_state - enter the state and update stats
  * @dev: cpuidle device for this cpu
@@ -225,12 +289,17 @@ int cpuidle_enter_state(struct cpuidle_device *dev, 
struct cpuidle_driver *drv,
trace_cpu_idle_rcuidle(index, dev->cpu);
time_start = ns_to_ktime(local_clock());
 
+   cpuidle_auto_promotion_start(target_state, dev->cpu);
+
stop_critical_timings();
entered_state = target_state->enter(dev, drv, index);
start_critical_timings();
 
sched_clock_idle_wakeup_event();
time_end = ns_to_ktime(local_clock());
+
+   cpuidle_auto_promotion_cancel(dev->cpu);
+
trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu);
 
/* The cpu is no longer idle or about to enter idle. */
@@ -312,7 +381,13 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct 
cpuidle_driver *drv,
 int cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
   bool *stop_tick)
 {
-   return cpuidle_curr_governor->select(drv, dev, stop_tick);
+   int timeout, ret;
+
+   timeout = INT_MAX;
+   ret = cpuidle_curr_governor->select
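
The remaining cpuidle.c and menu.c changes are truncated in this archive
copy. Purely as a rough sketch, not the patch's actual code, the
governor-side timeout described in the changelog could be derived as the
target residency of the next enabled, deeper state; the helper name below
is an assumption:

static int auto_promotion_timeout_us(struct cpuidle_driver *drv,
				     struct cpuidle_device *dev, int index)
{
	int i;

	/* The first usable deeper state bounds the stay in a lite state. */
	for (i = index + 1; i < drv->state_count; i++) {
		if (dev->states_usage[i].disable)
			continue;
		return drv->states[i].target_residency;
	}
	return INT_MAX;	/* nothing deeper is enabled: never auto-promote */
}
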

[PATCH 0/2] Auto-promotion logic for cpuidle states

2019-03-22 Thread Abhishek Goel
Currently, the cpuidle governors (menu/ladder) determine what idle state an
idling CPU should enter into based on heuristics that depend on the idle
history on that CPU. Given that no predictive heuristic is perfect, there
are cases where the governor predicts a shallow idle state, hoping that
the CPU will be busy soon. However, if no new workload is scheduled on
that CPU in the near future, the CPU will end up in the shallow state.

Motivation
--
In case of POWER, this is problematic, when the predicted state in the
aforementioned scenario is a lite stop state, as such lite states will
inhibit SMT folding, thereby depriving the other threads in the core from
using the core resources.

To address this, such lite states need to be auto-promoted. The cpuidle core
can queue a timer whose expiry corresponds to the residency value of the next
available state, leading to auto-promotion to a deeper idle state as soon as
possible.

Experiment
--
Without this patch -
It was seen that on an idle system, a CPU may remain in stop0_lite for a few
seconds and then go directly to a deeper state such as stop2.

With this patch -
A CPU will not remain in stop0_lite for longer than the residency of the next
available state, and thus it will go to a deeper state in a conservative
fashion. Using this, we may spend even less than 20 milliseconds if
subsequent stop states are enabled. In the worst case, we may end up
spending more than a second, as was the case without this patch. The worst
case occurs when no other shallow states are enabled and only deep states
are available for auto-promotion.
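
For instance, if the next enabled state has a target residency of 20
milliseconds, the auto-promotion timer bounds the stay in stop0_lite to
roughly those 20 milliseconds; if only a deep state with a residency on the
order of a second is enabled, the bound grows to about a second, which is
still finite, unlike the unbounded stay seen without the patch. (These
residency values are illustrative, not measured.)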

Abhishek Goel (2):
  cpuidle : auto-promotion for cpuidle states
  cpuidle : Add auto-promotion flag to cpuidle flags

 arch/powerpc/include/asm/opal-api.h |  1 +
 drivers/cpuidle/Kconfig |  4 ++
 drivers/cpuidle/cpuidle-powernv.c   | 13 -
 drivers/cpuidle/cpuidle.c   | 79 -
 drivers/cpuidle/governors/ladder.c  |  3 +-
 drivers/cpuidle/governors/menu.c| 23 -
 include/linux/cpuidle.h | 12 +++--
 7 files changed, 127 insertions(+), 8 deletions(-)

-- 
2.17.1



[PATCH 2/2] cpuidle : Add auto-promotion flag to cpuidle flags

2019-03-22 Thread Abhishek Goel
This patch sets up the flag for states which need to be auto-promoted.
On powernv systems, lite states do not even lose user context; that
information is used to set the flag for lite states.
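
Stated as a sketch of that rule (the actual change is the
powernv_add_idle_states() hunk below; "cpuidle_flags" is only an
illustrative local variable):

	/* A state that keeps user context is "lite": tag it for auto-promotion. */
	if (!(state->flags & OPAL_PM_LOSE_USER_CONTEXT))
		cpuidle_flags |= CPUIDLE_FLAG_AUTO_PROMOTION;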

Signed-off-by: Abhishek Goel 
---
 arch/powerpc/include/asm/opal-api.h |  1 +
 drivers/cpuidle/Kconfig |  4 
 drivers/cpuidle/cpuidle-powernv.c   | 13 +++--
 3 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/opal-api.h 
b/arch/powerpc/include/asm/opal-api.h
index 870fb7b23..735dec731 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -226,6 +226,7 @@
  */
 
 #define OPAL_PM_TIMEBASE_STOP  0x0002
+#define OPAL_PM_LOSE_USER_CONTEXT  0x1000
 #define OPAL_PM_LOSE_HYP_CONTEXT   0x2000
 #define OPAL_PM_LOSE_FULL_CONTEXT  0x4000
 #define OPAL_PM_NAP_ENABLED0x0001
diff --git a/drivers/cpuidle/Kconfig b/drivers/cpuidle/Kconfig
index 7e48eb5bf..0ece62684 100644
--- a/drivers/cpuidle/Kconfig
+++ b/drivers/cpuidle/Kconfig
@@ -26,6 +26,10 @@ config CPU_IDLE_GOV_MENU
 config DT_IDLE_STATES
bool
 
+config CPU_IDLE_AUTO_PROMOTION
+   bool
+   default y if PPC_POWERNV
+
 menu "ARM CPU Idle Drivers"
 depends on ARM || ARM64
 source "drivers/cpuidle/Kconfig.arm"
diff --git a/drivers/cpuidle/cpuidle-powernv.c 
b/drivers/cpuidle/cpuidle-powernv.c
index 84b1ebe21..e351f5f9c 100644
--- a/drivers/cpuidle/cpuidle-powernv.c
+++ b/drivers/cpuidle/cpuidle-powernv.c
@@ -299,6 +299,7 @@ static int powernv_add_idle_states(void)
for (i = 0; i < dt_idle_states; i++) {
unsigned int exit_latency, target_residency;
bool stops_timebase = false;
+   bool lose_user_context = false;
		struct pnv_idle_states_t *state = &pnv_idle_states[i];
 
/*
@@ -324,6 +325,9 @@ static int powernv_add_idle_states(void)
if (has_stop_states && !(state->valid))
continue;
 
+   if (state->flags & OPAL_PM_LOSE_USER_CONTEXT)
+   lose_user_context = true;
+
if (state->flags & OPAL_PM_TIMEBASE_STOP)
stops_timebase = true;
 
@@ -332,12 +336,17 @@ static int powernv_add_idle_states(void)
add_powernv_state(nr_idle_states, "Nap",
  CPUIDLE_FLAG_NONE, nap_loop,
  target_residency, exit_latency, 0, 0);
+   } else if (has_stop_states && !lose_user_context) {
+   add_powernv_state(nr_idle_states, state->name,
+ CPUIDLE_FLAG_AUTO_PROMOTION,
+ stop_loop, target_residency,
+ exit_latency, state->psscr_val,
+ state->psscr_mask);
} else if (has_stop_states && !stops_timebase) {
add_powernv_state(nr_idle_states, state->name,
  CPUIDLE_FLAG_NONE, stop_loop,
  target_residency, exit_latency,
- state->psscr_val,
- state->psscr_mask);
+ state->psscr_val, state->psscr_mask);
}
 
/*
-- 
2.17.1



[PATCH 1/2] cpuidle : auto-promotion for cpuidle states

2019-03-22 Thread Abhishek Goel
Currently, the cpuidle governors (menu/ladder) determine what idle state
an idling CPU should enter into based on heuristics that depend on the
idle history on that CPU. Given that no predictive heuristic is perfect,
there are cases where the governor predicts a shallow idle state, hoping
that the CPU will be busy soon. However, if no new workload is scheduled
on that CPU in the near future, the CPU will end up in the shallow state.

In case of POWER, this is problematic, when the predicted state in the
aforementioned scenario is a lite stop state, as such lite states will
inhibit SMT folding, thereby depriving the other threads in the core from
using the core resources.

To address this, such lite states need to be auto-promoted. The cpuidle
core can queue a timer whose expiry corresponds to the residency value of
the next available state, leading to auto-promotion to a deeper idle state
as soon as possible.

Signed-off-by: Abhishek Goel 
---
 drivers/cpuidle/cpuidle.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index 2406e2655..c4d1c1b38 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -584,11 +584,8 @@ static void __cpuidle_unregister_device(struct 
cpuidle_device *dev)
 
 static void __cpuidle_device_init(struct cpuidle_device *dev)
 {
-   int i;
memset(dev->states_usage, 0, sizeof(dev->states_usage));
dev->last_residency = 0;
-   for (i = 0; i < CPUIDLE_STATE_MAX; i++)
-   dev->states_usage[i].disable = true;
 }
 
 /**
-- 
2.17.1



[RFC 1/1] cpuidle : Move saving and restoring of sprs to opal

2018-07-23 Thread Abhishek Goel
This patch moves the saving and restoring of SPRs for P9 cpuidle
from the kernel to OPAL. It still uses the existing code to detect
the first thread in the core.
In an attempt to make the powernv idle code backward compatible,
and to some extent forward compatible, add support for pre-stop entry
and post-stop exit actions in OPAL. If a kernel knows about this
OPAL call, then only a firmware supporting newer hardware is required,
instead of waiting for kernel updates.

Signed-off-by: Abhishek Goel 
---

Link to the Skiboot patch corresponding to this patch:
http://patchwork.ozlabs.org/patch/947568/

 arch/powerpc/include/asm/cpuidle.h|  10 --
 arch/powerpc/include/asm/opal-api.h   |   4 +-
 arch/powerpc/include/asm/opal.h   |   3 +
 arch/powerpc/include/asm/paca.h   |   5 +-
 arch/powerpc/kernel/asm-offsets.c |  10 +-
 arch/powerpc/kernel/idle_book3s.S | 130 ++
 arch/powerpc/platforms/powernv/idle.c |   4 +
 .../powerpc/platforms/powernv/opal-wrappers.S |   2 +
 arch/powerpc/xmon/xmon.c  |  12 +-
 9 files changed, 61 insertions(+), 119 deletions(-)

diff --git a/arch/powerpc/include/asm/cpuidle.h 
b/arch/powerpc/include/asm/cpuidle.h
index e210a83eb196..c10f47af9a55 100644
--- a/arch/powerpc/include/asm/cpuidle.h
+++ b/arch/powerpc/include/asm/cpuidle.h
@@ -68,16 +68,6 @@
 #define ERR_DEEP_STATE_ESL_MISMATCH-2
 
 #ifndef __ASSEMBLY__
-/* Additional SPRs that need to be saved/restored during stop */
-struct stop_sprs {
-   u64 pid;
-   u64 ldbar;
-   u64 fscr;
-   u64 hfscr;
-   u64 mmcr1;
-   u64 mmcr2;
-   u64 mmcra;
-};
 
 extern u32 pnv_fastsleep_workaround_at_entry[];
 extern u32 pnv_fastsleep_workaround_at_exit[];
diff --git a/arch/powerpc/include/asm/opal-api.h 
b/arch/powerpc/include/asm/opal-api.h
index 3bab299eda49..6792a737bc9a 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -208,7 +208,9 @@
 #define OPAL_SENSOR_READ_U64   162
 #define OPAL_PCI_GET_PBCQ_TUNNEL_BAR   164
 #define OPAL_PCI_SET_PBCQ_TUNNEL_BAR   165
-#define OPAL_LAST  165
+#define OPAL_IDLE_SAVE 168
+#define OPAL_IDLE_RESTORE  169
+#define OPAL_LAST  169
 
 #define QUIESCE_HOLD   1 /* Spin all calls at entry */
 #define QUIESCE_REJECT 2 /* Fail all calls with OPAL_BUSY */
diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index e1b2910c6e81..12d57aeacde2 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -356,6 +356,9 @@ extern void opal_kmsg_init(void);
 
 extern int opal_event_request(unsigned int opal_event_nr);
 
+extern int opal_cpuidle_save(u64 *stop_sprs, int scope, u64 psscr);
+extern int opal_cpuidle_restore(u64 *stop_sprs, int scope, u64 psscr, u64 
srr1);
+
 struct opal_sg_list *opal_vmalloc_to_sg_list(void *vmalloc_addr,
 unsigned long vmalloc_size);
 void opal_free_sg_list(struct opal_sg_list *sg);
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index 6d34bd71139d..765524e76beb 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -195,11 +195,12 @@ struct paca_struct {
/* The PSSCR value that the kernel requested before going to stop */
u64 requested_psscr;
 
+   u64 wakeup_psscr;
/*
-* Save area for additional SPRs that need to be
+* Save area for SPRs that need to be
 * saved/restored during cpuidle stop.
 */
-   struct stop_sprs stop_sprs;
+   u64 *opal_stop_sprs;
 #endif
 
 #ifdef CONFIG_PPC_BOOK3S_64
diff --git a/arch/powerpc/kernel/asm-offsets.c 
b/arch/powerpc/kernel/asm-offsets.c
index 0a0544335950..65a3d8582017 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -769,14 +769,8 @@ int main(void)
OFFSET(PACA_SIBLING_PACA_PTRS, paca_struct, thread_sibling_pacas);
OFFSET(PACA_REQ_PSSCR, paca_struct, requested_psscr);
OFFSET(PACA_DONT_STOP, paca_struct, dont_stop);
-#define STOP_SPR(x, f) OFFSET(x, paca_struct, stop_sprs.f)
-   STOP_SPR(STOP_PID, pid);
-   STOP_SPR(STOP_LDBAR, ldbar);
-   STOP_SPR(STOP_FSCR, fscr);
-   STOP_SPR(STOP_HFSCR, hfscr);
-   STOP_SPR(STOP_MMCR1, mmcr1);
-   STOP_SPR(STOP_MMCR2, mmcr2);
-   STOP_SPR(STOP_MMCRA, mmcra);
+   OFFSET(PACA_WAKEUP_PSSCR, paca_struct, wakeup_psscr);
+   OFFSET(STOP_SPRS, paca_struct, opal_stop_sprs);
 #endif
 
DEFINE(PPC_DBELL_SERVER, PPC_DBELL_SERVER);
diff --git a/arch/powerpc/kernel/idle_book3s.S 
b/arch/powerpc/kernel/idle_book3s.S
index e734f6e45abc..66fc955abee3 100644
--- a/arch/powerpc/kernel/idle_book3s.S
+++ b/arch/powerpc/kernel/idle_book3s.S
@@ -45,6 +45,9 @@
 
 #define
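
The idle_book3s.S hunk is cut off here. Only as a conceptual C sketch of
the flow the RFC describes (the real implementation is in assembly;
enter_stop() is a placeholder and the scope argument value is an
assumption, both invented for illustration):

static u64 stop_with_opal_sprs(u64 psscr)
{
	u64 srr1;

	/* Ask firmware to save the SPRs this stop level may lose. */
	opal_cpuidle_save(local_paca->opal_stop_sprs, 0 /* scope: assumed */, psscr);

	srr1 = enter_stop(psscr);	/* placeholder for the actual stop entry */

	/* Firmware restores state according to the wakeup level reported in SRR1. */
	opal_cpuidle_restore(local_paca->opal_stop_sprs, 0 /* scope: assumed */,
			     psscr, srr1);

	return srr1;
}
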

Re: [PATCH v2] cpuidle/powernv : Add Description for cpuidle state

2018-06-05 Thread Abhishek




On 06/04/2018 05:15 PM, Akshay Adiga wrote:

On Mon, Jun 04, 2018 at 07:04:14PM +1000, Benjamin Herrenschmidt wrote:

Is this a new property ? I'm not fan of adding yet another of those
silly arrays.

I would say this is the right time now to switch over to a node per
state instead, as we discussed with Vaidy.


It is not a new property. The name was being used for the description, as
the description was not present in the device tree. A skiboot patch adding
descriptions to the device tree has been posted. This patch reads those
descriptions instead of copying the name itself into the description, and
we fall back to reading the name into the description so as not to break
compatibility with older firmware.


Thanks
Abhishek


I posted  the node based device tree here :
skiboot patch :  https://patchwork.ozlabs.org/patch/923120/
kernel patch : https://lkml.org/lkml/2018/5/30/1146

Do you have any inputs for this design ?


Additionally, while doing that, we can provide the versioning mechanism
I proposed so we can deal with state specific issues and erratas.

Cheers,
Ben.





[PATCH v2] cpuidle/powernv : Add Description for cpuidle state

2018-06-04 Thread Abhishek Goel
Names of cpuidle states were being used as descriptions of states
on POWER, as no descriptions were added in the device tree. This patch
reads the descriptions for idle states, which have now been added in
the device tree.
The description for idle states in case of POWER can be printed
using "cpupower monitor -l" or "cpupower idle-info".

Signed-off-by: Abhishek Goel 
---

The skiboot patch which adds description for idle states in device tree
can be found here: https://patchwork.ozlabs.org/patch/924879/

 drivers/cpuidle/cpuidle-powernv.c | 19 +++
 include/linux/cpuidle.h   |  2 +-
 2 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/drivers/cpuidle/cpuidle-powernv.c 
b/drivers/cpuidle/cpuidle-powernv.c
index 1a8234e706bc..08d8b0953a14 100644
--- a/drivers/cpuidle/cpuidle-powernv.c
+++ b/drivers/cpuidle/cpuidle-powernv.c
@@ -133,7 +133,7 @@ static int stop_loop(struct cpuidle_device *dev,
 static struct cpuidle_state powernv_states[CPUIDLE_STATE_MAX] = {
{ /* Snooze */
.name = "snooze",
-   .desc = "snooze",
+   .desc = "idle polling state",
.exit_latency = 0,
.target_residency = 0,
.enter = snooze_loop },
@@ -206,6 +206,7 @@ static int powernv_cpuidle_driver_init(void)
 }
 
 static inline void add_powernv_state(int index, const char *name,
+const char *desc,
 unsigned int flags,
 int (*idle_fn)(struct cpuidle_device *,
struct cpuidle_driver *,
@@ -215,7 +216,7 @@ static inline void add_powernv_state(int index, const char 
*name,
 u64 psscr_val, u64 psscr_mask)
 {
strlcpy(powernv_states[index].name, name, CPUIDLE_NAME_LEN);
-   strlcpy(powernv_states[index].desc, name, CPUIDLE_NAME_LEN);
+   strlcpy(powernv_states[index].desc, desc, CPUIDLE_DESC_LEN);
powernv_states[index].flags = flags;
powernv_states[index].target_residency = target_residency;
powernv_states[index].exit_latency = exit_latency;
@@ -250,6 +251,7 @@ static int powernv_add_idle_states(void)
u64 psscr_val[CPUIDLE_STATE_MAX];
u64 psscr_mask[CPUIDLE_STATE_MAX];
const char *names[CPUIDLE_STATE_MAX];
+   const char *descs[CPUIDLE_STATE_MAX];
u32 has_stop_states = 0;
int i, rc;
u32 supported_flags = pnv_get_supported_cpuidle_states();
@@ -311,6 +313,13 @@ static int powernv_add_idle_states(void)
pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-names in 
DT\n");
goto out;
}
+   if (of_property_read_string_array(power_mgt,
+   "ibm,cpu-idle-state-descs", descs, dt_idle_states) < 0) {
+   of_property_read_string_array(power_mgt,
+   "ibm,cpu-idle-state-names", descs, 
dt_idle_states);
+   pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-descs in 
DT\n"
+   "Name will be used for description\n");
+   }
 
/*
 * If the idle states use stop instruction, probe for psscr values
@@ -414,10 +423,11 @@ static int powernv_add_idle_states(void)
target_residency = 100;
/* Add NAP state */
add_powernv_state(nr_idle_states, "Nap",
+ "stop processor execution",
  CPUIDLE_FLAG_NONE, nap_loop,
  target_residency, exit_latency, 0, 0);
} else if (has_stop_states && !stops_timebase) {
-   add_powernv_state(nr_idle_states, names[i],
+   add_powernv_state(nr_idle_states, names[i], descs[i],
  CPUIDLE_FLAG_NONE, stop_loop,
  target_residency, exit_latency,
  psscr_val[i], psscr_mask[i]);
@@ -434,11 +444,12 @@ static int powernv_add_idle_states(void)
target_residency = 30;
/* Add FASTSLEEP state */
add_powernv_state(nr_idle_states, "FastSleep",
+ "Core and L2 clock gating",
  CPUIDLE_FLAG_TIMER_STOP,
  fastsleep_loop,
  target_residency, exit_latency, 0, 0);
} else if (has_stop_states && stops_timebase) {
-   add_powernv_state(nr_idle_states, names

[PATCH] cpuidle/powernv : Add Description for cpuidle state

2018-05-28 Thread Abhishek Goel
Names of cpuidle states were being used as descriptions of states
on POWER, as no descriptions were added in the device tree. This patch
reads the descriptions for idle states, which have now been added in
the device tree.
The description for idle states in case of POWER can be printed
using "cpupower monitor -l" or "cpupower idle-info".

Signed-off-by: Abhishek Goel 
---

The skiboot patch which adds description for idle states in device tree
can be found here: https://patchwork.ozlabs.org/patch/921637/

 drivers/cpuidle/cpuidle-powernv.c | 17 +
 include/linux/cpuidle.h   |  2 +-
 2 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/drivers/cpuidle/cpuidle-powernv.c 
b/drivers/cpuidle/cpuidle-powernv.c
index 1a8234e706bc..d3915a965128 100644
--- a/drivers/cpuidle/cpuidle-powernv.c
+++ b/drivers/cpuidle/cpuidle-powernv.c
@@ -133,7 +133,7 @@ static int stop_loop(struct cpuidle_device *dev,
 static struct cpuidle_state powernv_states[CPUIDLE_STATE_MAX] = {
{ /* Snooze */
.name = "snooze",
-   .desc = "snooze",
+   .desc = "idle polling state",
.exit_latency = 0,
.target_residency = 0,
.enter = snooze_loop },
@@ -206,6 +206,7 @@ static int powernv_cpuidle_driver_init(void)
 }
 
 static inline void add_powernv_state(int index, const char *name,
+const char *desc,
 unsigned int flags,
 int (*idle_fn)(struct cpuidle_device *,
struct cpuidle_driver *,
@@ -215,7 +216,7 @@ static inline void add_powernv_state(int index, const char 
*name,
 u64 psscr_val, u64 psscr_mask)
 {
strlcpy(powernv_states[index].name, name, CPUIDLE_NAME_LEN);
-   strlcpy(powernv_states[index].desc, name, CPUIDLE_NAME_LEN);
+   strlcpy(powernv_states[index].desc, desc, CPUIDLE_DESC_LEN);
powernv_states[index].flags = flags;
powernv_states[index].target_residency = target_residency;
powernv_states[index].exit_latency = exit_latency;
@@ -250,6 +251,7 @@ static int powernv_add_idle_states(void)
u64 psscr_val[CPUIDLE_STATE_MAX];
u64 psscr_mask[CPUIDLE_STATE_MAX];
const char *names[CPUIDLE_STATE_MAX];
+   const char *descs[CPUIDLE_STATE_MAX];
u32 has_stop_states = 0;
int i, rc;
u32 supported_flags = pnv_get_supported_cpuidle_states();
@@ -311,6 +313,11 @@ static int powernv_add_idle_states(void)
pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-names in 
DT\n");
goto out;
}
+   if (of_property_read_string_array(power_mgt,
+   "ibm,cpu-idle-state-descs", descs, dt_idle_states) < 0) {
+   pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-descs in 
DT\n");
+   goto out;
+   }
 
/*
 * If the idle states use stop instruction, probe for psscr values
@@ -414,10 +421,11 @@ static int powernv_add_idle_states(void)
target_residency = 100;
/* Add NAP state */
add_powernv_state(nr_idle_states, "Nap",
+ "stop processor execution",
  CPUIDLE_FLAG_NONE, nap_loop,
  target_residency, exit_latency, 0, 0);
} else if (has_stop_states && !stops_timebase) {
-   add_powernv_state(nr_idle_states, names[i],
+   add_powernv_state(nr_idle_states, names[i], descs[i],
  CPUIDLE_FLAG_NONE, stop_loop,
  target_residency, exit_latency,
  psscr_val[i], psscr_mask[i]);
@@ -434,11 +442,12 @@ static int powernv_add_idle_states(void)
target_residency = 30;
/* Add FASTSLEEP state */
add_powernv_state(nr_idle_states, "FastSleep",
+ "Core and L2 clock gating",
  CPUIDLE_FLAG_TIMER_STOP,
  fastsleep_loop,
  target_residency, exit_latency, 0, 0);
} else if (has_stop_states && stops_timebase) {
-   add_powernv_state(nr_idle_states, names[i],
+   add_powernv_state(nr_idle_states, names[i], descs[i],
  CPUIDLE_FLAG_TIMER_STOP, stop_loop,
  target_residency, exit_latency,
   

Re: [PATCH v2] cpufreq: powernv: Add support of frequency domain

2018-01-22 Thread Abhishek



On 12/20/2017 12:20 PM, Viresh Kumar wrote:

On 20-12-17, 12:12, Abhishek Goel wrote:

diff --git a/drivers/cpufreq/powernv-cpufreq.c 
b/drivers/cpufreq/powernv-cpufreq.c
index b6d7c4c..fd642bc 100644
--- a/drivers/cpufreq/powernv-cpufreq.c
+++ b/drivers/cpufreq/powernv-cpufreq.c
@@ -37,6 +37,7 @@
  #include  /* Required for cpu_sibling_mask() in UP configs */
  #include 
  #include 
+#include 
  
  #define POWERNV_MAX_PSTATES	256

  #define PMSR_PSAFE_ENABLE (1UL << 30)
@@ -130,6 +131,9 @@ static struct chip {
  static int nr_chips;
  static DEFINE_PER_CPU(struct chip *, chip_info);
  
+static u32 freq_domain_indicator;

+static u32 flag;

I wouldn't name it as flag, its unreadable. Maybe its better to name
it based on the quirk you are trying to workaround with ?


+
  /*
   * Note:
   * The set of pstates consists of contiguous integers.
@@ -194,6 +198,38 @@ static inline void reset_gpstates(struct cpufreq_policy 
*policy)
gpstates->last_gpstate_idx = 0;
  }
  
+#define SIZE NR_CPUS

+#define ORDER_FREQ_MAP ilog2(SIZE)
+
+static DEFINE_HASHTABLE(freq_domain_map, ORDER_FREQ_MAP);
+
+struct hashmap {
+   cpumask_t mask;
+   int chip_id;
+   u32 pir_key;
+   struct hlist_node hash_node;
+};
+
+static void insert(u32 key, int cpu)
+{
+   struct hashmap *data;
+
+   hash_for_each_possible(freq_domain_map, data, hash_node, key%SIZE) {
+   if (data->chip_id == cpu_to_chip_id(cpu) &&
+   data->pir_key == key) {
+   cpumask_set_cpu(cpu, &data->mask);
+   return;
+   }
+   }
+
+   data = kzalloc(sizeof(*data), GFP_KERNEL);
+   hash_add(freq_domain_map, &data->hash_node, key%SIZE);
+   cpumask_set_cpu(cpu, &data->mask);
+   data->chip_id = cpu_to_chip_id(cpu);
+   data->pir_key = key;
+
+}
+
  /*
   * Initialize the freq table based on data obtained
   * from the firmware passed via device-tree
@@ -206,7 +242,9 @@ static int init_powernv_pstates(void)
u32 len_ids, len_freqs;
u32 pstate_min, pstate_max, pstate_nominal;
u32 pstate_turbo, pstate_ultra_turbo;
+   u32 key;
  
+	flag = 0;

Isn't flag already 0 (global-uninitialized) ?


power_mgt = of_find_node_by_path("/ibm,opal/power-mgt");
if (!power_mgt) {
pr_warn("power-mgt node not found\n");
@@ -229,6 +267,17 @@ static int init_powernv_pstates(void)
return -ENODEV;
}
  
+	if (of_device_is_compatible(power_mgt, "freq-domain-v1") &&

+   of_property_read_u32(power_mgt, "ibm,freq-domain-indicator",
+   &freq_domain_indicator)) {
+   pr_warn("ibm,freq-domain-indicator not found\n");
+   freq_domain_indicator = 0;

You shouldn't be required to set it to 0 here.


+   }
+
+   if (of_device_is_compatible(power_mgt, "P9-occ-quirk")) {
+   flag = 1;
+   }

Remove {} and a better name like p9_occ_quirk would be good for flag.
Also making it a bool may be better ?


+
if (of_property_read_u32(power_mgt, "ibm,pstate-ultra-turbo",
 &pstate_ultra_turbo)) {
powernv_pstate_info.wof_enabled = false;
@@ -249,6 +298,7 @@ static int init_powernv_pstates(void)
  next:
pr_info("cpufreq pstate min %d nominal %d max %d\n", pstate_min,
pstate_nominal, pstate_max);
+   pr_info("frequency domain indicator %d", freq_domain_indicator);
pr_info("Workload Optimized Frequency is %s in the platform\n",
(powernv_pstate_info.wof_enabled) ? "enabled" : "disabled");
  
@@ -276,6 +326,15 @@ static int init_powernv_pstates(void)

return -ENODEV;
}
  
+	if (freq_domain_indicator) {

+   hash_init(freq_domain_map);
+   for_each_possible_cpu(i) {
+   key = ((u32) get_hard_smp_processor_id(i) &
+   freq_domain_indicator);

Maybe break it like:

key = (u32) get_hard_smp_processor_id(i);
 key &= freq_domain_indicator;

to make it easily readable ?


+   insert(key, i);
+   }
+   }
+
powernv_pstate_info.nr_pstates = nr_pstates;
pr_debug("NR PStates %d\n", nr_pstates);
for (i = 0; i < nr_pstates; i++) {
@@ -693,6 +752,7 @@ static int powernv_cpufreq_target_index(struct 
cpufreq_policy *policy,
  {
struct powernv_smp_call_data freq_data;
unsigned int cur_msec, gpstate_idx;
+

:(


struct global_pstate_info *gpstates = policy->driver_data;
  
  	if (unlikely(rebooting) && new_index != get_nominal_index())

@@ -760,25 +820,55 @@ static int powernv_cpufreq_target_index(struct 
cpufreq_policy *policy,
  

[PATCH v3] cpufreq: powernv: Add support of frequency domain

2018-01-22 Thread Abhishek Goel
Frequency-domain indicates group of CPUs that would share same frequency.
It is detected using device-tree node "frequency-domain-indicator".
frequency-domain-indicator is a bitmask which will have different value
depending upon the generation of the processor.

CPUs of the same chip for which the result of a bitwise AND between
their PIR and the frequency-domain-indicator is the same share the same
frequency.

In this patch, we define hash-table indexed by the aforementioned
bitwise ANDed value to store the cpumask of the CPUs sharing the same
frequency domain. Further, the cpufreq policy will be created per
frequency-domain

So for POWER9, a cpufreq policy is created per quad while for POWER8 it
is created per core. Governor decides frequency for each policy but
multiple cores may come under same policy. In such case frequency needs
to be set on each core sharing that policy.
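
As a small restatement of the grouping rule in code (freq_domain_indicator
is read from the "ibm,freq-domain-indicator" property; the mask value used
in the comment below is hypothetical, for illustration only):

static u32 freq_domain_key(int cpu)
{
	/*
	 * E.g. with an indicator of 0xFFF0, PIRs 0x0008 and 0x000C both give
	 * key 0x0000 and fall in one domain, while PIR 0x0010 gives key
	 * 0x0010 and starts a new one.
	 */
	return (u32)get_hard_smp_processor_id(cpu) & freq_domain_indicator;
}

CPUs of the same chip whose keys match are collected into one cpumask in
the hash table and end up under one cpufreq policy.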

Signed-off-by: Abhishek Goel <hunt...@linux.vnet.ibm.com>
---

Skiboot patch required for the corresponding device-tree changes have been
posted here : http://patchwork.ozlabs.org/patch/862256/

 drivers/cpufreq/powernv-cpufreq.c | 104 ++
 1 file changed, 95 insertions(+), 9 deletions(-)

diff --git a/drivers/cpufreq/powernv-cpufreq.c 
b/drivers/cpufreq/powernv-cpufreq.c
index b6d7c4c..aab23a4 100644
--- a/drivers/cpufreq/powernv-cpufreq.c
+++ b/drivers/cpufreq/powernv-cpufreq.c
@@ -37,6 +37,7 @@
 #include  /* Required for cpu_sibling_mask() in UP configs */
 #include 
 #include 
+#include 
 
 #define POWERNV_MAX_PSTATES256
 #define PMSR_PSAFE_ENABLE  (1UL << 30)
@@ -130,6 +131,9 @@ enum throttle_reason_type {
 static int nr_chips;
 static DEFINE_PER_CPU(struct chip *, chip_info);
 
+static u32 freq_domain_indicator;
+static bool p9_occ_quirk;
+
 /*
  * Note:
  * The set of pstates consists of contiguous integers.
@@ -194,6 +198,38 @@ static inline void reset_gpstates(struct cpufreq_policy 
*policy)
gpstates->last_gpstate_idx = 0;
 }
 
+#define SIZE NR_CPUS
+#define ORDER_FREQ_MAP ilog2(SIZE)
+
+static DEFINE_HASHTABLE(freq_domain_map, ORDER_FREQ_MAP);
+
+struct hashmap {
+   cpumask_t mask;
+   int chip_id;
+   u32 pir_key;
+   struct hlist_node hash_node;
+};
+
+static void insert(u32 key, int cpu)
+{
+   struct hashmap *data;
+
+   hash_for_each_possible(freq_domain_map, data, hash_node, key%SIZE) {
+   if (data->chip_id == cpu_to_chip_id(cpu) &&
+   data->pir_key == key) {
+   cpumask_set_cpu(cpu, &data->mask);
+   return;
+   }
+   }
+
+   data = kzalloc(sizeof(*data), GFP_KERNEL);
+   hash_add(freq_domain_map, &data->hash_node, key%SIZE);
+   cpumask_set_cpu(cpu, &data->mask);
+   data->chip_id = cpu_to_chip_id(cpu);
+   data->pir_key = key;
+
+}
+
 /*
  * Initialize the freq table based on data obtained
  * from the firmware passed via device-tree
@@ -206,6 +242,7 @@ static int init_powernv_pstates(void)
u32 len_ids, len_freqs;
u32 pstate_min, pstate_max, pstate_nominal;
u32 pstate_turbo, pstate_ultra_turbo;
+   u32 key;
 
power_mgt = of_find_node_by_path("/ibm,opal/power-mgt");
if (!power_mgt) {
@@ -246,9 +283,18 @@ static int init_powernv_pstates(void)
else
powernv_pstate_info.wof_enabled = true;
 
+   if (of_device_is_compatible(power_mgt, "freq-domain-v1") &&
+   of_property_read_u32(power_mgt, "ibm,freq-domain-indicator",
+   &freq_domain_indicator))
+   pr_warn("ibm,freq-domain-indicator not found\n");
+
+if (of_device_is_compatible(power_mgt, "p9-occ-quirk"))
+   p9_occ_quirk = true;
+
 next:
pr_info("cpufreq pstate min %d nominal %d max %d\n", pstate_min,
pstate_nominal, pstate_max);
+   pr_info("frequency domain indicator %d", freq_domain_indicator);
pr_info("Workload Optimized Frequency is %s in the platform\n",
(powernv_pstate_info.wof_enabled) ? "enabled" : "disabled");
 
@@ -276,6 +322,15 @@ static int init_powernv_pstates(void)
return -ENODEV;
}
 
+   if (freq_domain_indicator) {
+   hash_init(freq_domain_map);
+   for_each_possible_cpu(i) {
+   key = ((u32) get_hard_smp_processor_id(i) &
+   freq_domain_indicator);
+   insert(key, i);
+   }
+   }
+
powernv_pstate_info.nr_pstates = nr_pstates;
pr_debug("NR PStates %d\n", nr_pstates);
for (i = 0; i < nr_pstates; i++) {
@@ -760,25 +815,56 @@ static int powernv_cpufreq_target_index(struct 
cpufreq_policy *policy,
 
	spin_unlock(&gpstates->gpstate_lock);
 
- 

[PATCH v2] cpufreq: powernv: Add support of frequency domain

2017-12-19 Thread Abhishek Goel
Frequency-domain indicates group of CPUs that would share same frequency.
It is detected using device-tree node "frequency-domain-indicator".
frequency-domain-indicator is a bitmask which will have different value
depending upon the generation of the processor.

CPUs of the same chip for which the result of a bitwise AND between
their PIR and the frequency-domain-indicator is the same share the same
frequency.

In this patch, we define hash-table indexed by the aforementioned
bitwise ANDed value to store the cpumask of the CPUs sharing the same
frequency domain. Further, the cpufreq policy will be created per
frequency-domain

So for POWER9, a cpufreq policy is created per quad while for POWER8 it
is created per core. Governor decides frequency for each policy but
multiple cores may come under same policy. In such case frequency needs
to be set on each core sharing that policy.

Signed-off-by: Abhishek Goel <hunt...@linux.vnet.ibm.com>
---
 drivers/cpufreq/powernv-cpufreq.c | 110 ++
 1 file changed, 100 insertions(+), 10 deletions(-)

diff --git a/drivers/cpufreq/powernv-cpufreq.c 
b/drivers/cpufreq/powernv-cpufreq.c
index b6d7c4c..fd642bc 100644
--- a/drivers/cpufreq/powernv-cpufreq.c
+++ b/drivers/cpufreq/powernv-cpufreq.c
@@ -37,6 +37,7 @@
 #include  /* Required for cpu_sibling_mask() in UP configs */
 #include 
 #include 
+#include 
 
 #define POWERNV_MAX_PSTATES256
 #define PMSR_PSAFE_ENABLE  (1UL << 30)
@@ -130,6 +131,9 @@ static struct chip {
 static int nr_chips;
 static DEFINE_PER_CPU(struct chip *, chip_info);
 
+static u32 freq_domain_indicator;
+static u32 flag;
+
 /*
  * Note:
  * The set of pstates consists of contiguous integers.
@@ -194,6 +198,38 @@ static inline void reset_gpstates(struct cpufreq_policy 
*policy)
gpstates->last_gpstate_idx = 0;
 }
 
+#define SIZE NR_CPUS
+#define ORDER_FREQ_MAP ilog2(SIZE)
+
+static DEFINE_HASHTABLE(freq_domain_map, ORDER_FREQ_MAP);
+
+struct hashmap {
+   cpumask_t mask;
+   int chip_id;
+   u32 pir_key;
+   struct hlist_node hash_node;
+};
+
+static void insert(u32 key, int cpu)
+{
+   struct hashmap *data;
+
+   hash_for_each_possible(freq_domain_map, data, hash_node, key%SIZE) {
+   if (data->chip_id == cpu_to_chip_id(cpu) &&
+   data->pir_key == key) {
+   cpumask_set_cpu(cpu, &data->mask);
+   return;
+   }
+   }
+
+   data = kzalloc(sizeof(*data), GFP_KERNEL);
+   hash_add(freq_domain_map, &data->hash_node, key%SIZE);
+   cpumask_set_cpu(cpu, &data->mask);
+   data->chip_id = cpu_to_chip_id(cpu);
+   data->pir_key = key;
+
+}
+
 /*
  * Initialize the freq table based on data obtained
  * from the firmware passed via device-tree
@@ -206,7 +242,9 @@ static int init_powernv_pstates(void)
u32 len_ids, len_freqs;
u32 pstate_min, pstate_max, pstate_nominal;
u32 pstate_turbo, pstate_ultra_turbo;
+   u32 key;
 
+   flag = 0;
power_mgt = of_find_node_by_path("/ibm,opal/power-mgt");
if (!power_mgt) {
pr_warn("power-mgt node not found\n");
@@ -229,6 +267,17 @@ static int init_powernv_pstates(void)
return -ENODEV;
}
 
+   if (of_device_is_compatible(power_mgt, "freq-domain-v1") &&
+   of_property_read_u32(power_mgt, "ibm,freq-domain-indicator",
+   &freq_domain_indicator)) {
+   pr_warn("ibm,freq-domain-indicator not found\n");
+   freq_domain_indicator = 0;
+   }
+
+   if (of_device_is_compatible(power_mgt, "P9-occ-quirk")) {
+   flag = 1;
+   }
+
if (of_property_read_u32(power_mgt, "ibm,pstate-ultra-turbo",
 &pstate_ultra_turbo)) {
powernv_pstate_info.wof_enabled = false;
@@ -249,6 +298,7 @@ static int init_powernv_pstates(void)
 next:
pr_info("cpufreq pstate min %d nominal %d max %d\n", pstate_min,
pstate_nominal, pstate_max);
+   pr_info("frequency domain indicator %d", freq_domain_indicator);
pr_info("Workload Optimized Frequency is %s in the platform\n",
(powernv_pstate_info.wof_enabled) ? "enabled" : "disabled");
 
@@ -276,6 +326,15 @@ static int init_powernv_pstates(void)
return -ENODEV;
}
 
+   if (freq_domain_indicator) {
+   hash_init(freq_domain_map);
+   for_each_possible_cpu(i) {
+   key = ((u32) get_hard_smp_processor_id(i) &
+   freq_domain_indicator);
+   insert(key, i);
+   }
+   }
+
powernv_pstate_info.nr_pstates = nr_pstates;
pr_debug("NR PStates %d\n", n

Re: [PATCH] cpufreq: powernv: Add support of frequency domain

2017-12-17 Thread Abhishek

On 12/14/2017 10:12 AM, Viresh Kumar wrote:

+ Gautham,

@Gautham: Can you please help reviewing this one ?

On 13-12-17, 13:49, Abhishek Goel wrote:

@@ -693,6 +746,8 @@ static int powernv_cpufreq_target_index(struct 
cpufreq_policy *policy,
  {
struct powernv_smp_call_data freq_data;
unsigned int cur_msec, gpstate_idx;
+   cpumask_t temp;
+   u32 cpu;
struct global_pstate_info *gpstates = policy->driver_data;
  
  	if (unlikely(rebooting) && new_index != get_nominal_index())

@@ -761,24 +816,48 @@ static int powernv_cpufreq_target_index(struct 
cpufreq_policy *policy,
	spin_unlock(&gpstates->gpstate_lock);
  
  	/*

-* Use smp_call_function to send IPI and execute the
-* mtspr on target CPU.  We could do that without IPI
-* if current CPU is within policy->cpus (core)
+* Use smp_call_function to send IPI and execute the mtspr on CPU.
+* This needs to be done on every core of the policy

Why on each CPU ?
We need to do it in this way as the current implementation takes the max 
of the PMSR of the cores. Thus, when the frequency is required to be 
ramped up, it suffices to write to just the local PMSR, but when the 
frequency is to be ramped down, if we don't send the IPI it breaks the 
compatibility with P8.



 */
-   smp_call_function_any(policy->cpus, set_pstate, &freq_data, 1);
+   cpumask_copy(&temp, policy->cpus);
+
+   while (!cpumask_empty(&temp)) {
+   cpu = cpumask_first(&temp);
+   smp_call_function_any(cpu_sibling_mask(cpu),
+   set_pstate, &freq_data, 1);
+   cpumask_andnot(&temp, &temp, cpu_sibling_mask(cpu));
+   }
+
return 0;
  }




[PATCH] cpufreq: powernv: Add support of frequency domain

2017-12-13 Thread Abhishek Goel
Frequency-domain indicates group of CPUs that would share same frequency.
It is detected using device-tree node "frequency-domain-indicator".
frequency-domain-indicator is a bitmask which will have different value
depending upon the generation of the processor.

CPUs of the same chip for which the result of a bitwise AND between
their PIR and the frequency-domain-indicator is the same share the same
frequency.

In this patch, we define hash-table indexed by the aforementioned
bitwise ANDed value to store the cpumask of the CPUs sharing the same
frequency domain. Further, the cpufreq policy will be created per
frequency-domain

So for POWER9, a cpufreq policy is created per quad while for POWER8 it
is created per core. Governor decides frequency for each policy but
multiple cores may come under same policy. In such case frequency needs
to be set on each core sharing that policy.

Signed-off-by: Abhishek Goel <hunt...@linux.vnet.ibm.com>
---
 drivers/cpufreq/powernv-cpufreq.c | 95 +++
 1 file changed, 87 insertions(+), 8 deletions(-)

diff --git a/drivers/cpufreq/powernv-cpufreq.c 
b/drivers/cpufreq/powernv-cpufreq.c
index b6d7c4c..9384110 100644
--- a/drivers/cpufreq/powernv-cpufreq.c
+++ b/drivers/cpufreq/powernv-cpufreq.c
@@ -37,6 +37,7 @@
 #include  /* Required for cpu_sibling_mask() in UP configs */
 #include 
 #include 
+#include 
 
 #define POWERNV_MAX_PSTATES256
 #define PMSR_PSAFE_ENABLE  (1UL << 30)
@@ -130,6 +131,8 @@ static struct chip {
 static int nr_chips;
 static DEFINE_PER_CPU(struct chip *, chip_info);
 
+static u32 freq_domain_indicator;
+
 /*
  * Note:
  * The set of pstates consists of contiguous integers.
@@ -194,6 +197,38 @@ static inline void reset_gpstates(struct cpufreq_policy 
*policy)
gpstates->last_gpstate_idx = 0;
 }
 
+#define SIZE NR_CPUS
+#define ORDER_FREQ_MAP ilog2(SIZE)
+
+static DEFINE_HASHTABLE(freq_domain_map, ORDER_FREQ_MAP);
+
+struct hashmap {
+   cpumask_t mask;
+   int chip_id;
+   u32 pir_key;
+   struct hlist_node hash_node;
+};
+
+static void insert(u32 key, int cpu)
+{
+   struct hashmap *data;
+
+   hash_for_each_possible(freq_domain_map, data, hash_node, key%SIZE) {
+   if (data->chip_id == cpu_to_chip_id(cpu) &&
+   data->pir_key == key) {
+   cpumask_set_cpu(cpu, &data->mask);
+   return;
+   }
+   }
+
+   data = kzalloc(sizeof(*data), GFP_KERNEL);
+   hash_add(freq_domain_map, &data->hash_node, key%SIZE);
+   cpumask_set_cpu(cpu, &data->mask);
+   data->chip_id = cpu_to_chip_id(cpu);
+   data->pir_key = key;
+
+}
+
 /*
  * Initialize the freq table based on data obtained
  * from the firmware passed via device-tree
@@ -206,6 +241,7 @@ static int init_powernv_pstates(void)
u32 len_ids, len_freqs;
u32 pstate_min, pstate_max, pstate_nominal;
u32 pstate_turbo, pstate_ultra_turbo;
+   u32 key;
 
power_mgt = of_find_node_by_path("/ibm,opal/power-mgt");
if (!power_mgt) {
@@ -229,6 +265,13 @@ static int init_powernv_pstates(void)
return -ENODEV;
}
 
+   if (of_device_is_compatible(power_mgt, "freq-domain-v1") &&
+   of_property_read_u32(power_mgt, "ibm,freq-domain-indicator",
+   &freq_domain_indicator)) {
+   pr_warn("ibm,freq-domain-indicator not found\n");
+   freq_domain_indicator = 0;
+   }
+
if (of_property_read_u32(power_mgt, "ibm,pstate-ultra-turbo",
 &pstate_ultra_turbo)) {
powernv_pstate_info.wof_enabled = false;
@@ -249,6 +292,7 @@ static int init_powernv_pstates(void)
 next:
pr_info("cpufreq pstate min %d nominal %d max %d\n", pstate_min,
pstate_nominal, pstate_max);
+   pr_info("frequency domain indicator %d", freq_domain_indicator);
pr_info("Workload Optimized Frequency is %s in the platform\n",
(powernv_pstate_info.wof_enabled) ? "enabled" : "disabled");
 
@@ -276,6 +320,15 @@ static int init_powernv_pstates(void)
return -ENODEV;
}
 
+   if (freq_domain_indicator) {
+   hash_init(freq_domain_map);
+   for_each_possible_cpu(i) {
+   key = ((u32) get_hard_smp_processor_id(i) &
+   freq_domain_indicator);
+   insert(key, i);
+   }
+   }
+
powernv_pstate_info.nr_pstates = nr_pstates;
pr_debug("NR PStates %d\n", nr_pstates);
for (i = 0; i < nr_pstates; i++) {
@@ -693,6 +746,8 @@ static int powernv_cpufreq_target_index(struct 
cpufreq_policy *policy,
 {
struct powernv_smp_call_data freq_data;
unsigned in