[PATCH] Perf: Calling available function for stats printing
For printing dump_trace, just use existing stats_print() function. Signed-off-by: Abhishek Dubey --- tools/perf/builtin-report.c | 5 + 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 69618fb0110b..8678eebc49e6 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -1089,10 +1089,7 @@ static int __cmd_report(struct report *rep) perf_session__fprintf_dsos(session, stdout); if (dump_trace) { - perf_session__fprintf_nr_events(session, stdout, - rep->skip_empty); - evlist__fprintf_nr_events(session->evlist, stdout, - rep->skip_empty); + stats_print(rep); return 0; } } -- 2.44.0
[PATCH v3] PowerPC: Replace kretprobe with rethook
This is an adaptation of commit f3a112c0c40d ("x86,rethook,kprobes: Replace kretprobe with rethook on x86") to PowerPC. Replaces the kretprobe code with rethook on Power. With this patch, kretprobe on Power uses the rethook instead of kretprobe specific trampoline code. Reference to other archs: commit b57c2f124098 ("riscv: add riscv rethook implementation") commit 7b0a096436c2 ("LoongArch: Replace kretprobe with rethook") Signed-off-by: Abhishek Dubey --- Changes in v3: * Fixing return address moved to arch_rethook_fixup_return() * Addressed other minor comments * show_stack mods to show correct fn name to go in separate patch --- arch/powerpc/Kconfig | 1 + arch/powerpc/kernel/Makefile | 1 + arch/powerpc/kernel/kprobes.c| 65 +-- arch/powerpc/kernel/optprobes.c | 2 +- arch/powerpc/kernel/rethook.c| 77 arch/powerpc/kernel/stacktrace.c | 6 ++- 6 files changed, 85 insertions(+), 67 deletions(-) create mode 100644 arch/powerpc/kernel/rethook.c diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index c88c6d46a5bc..fa0b1ab3f935 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -270,6 +270,7 @@ config PPC select HAVE_PERF_EVENTS_NMI if PPC64 select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP + select HAVE_RETHOOK select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_RELIABLE_STACKTRACE select HAVE_RSEQ diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 8585d03c02d3..7dd1b523b17f 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -140,6 +140,7 @@ obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_OPTPROBES)+= optprobes.o optprobes_head.o obj-$(CONFIG_KPROBES_ON_FTRACE)+= kprobes-ftrace.o obj-$(CONFIG_UPROBES) += uprobes.o +obj-$(CONFIG_RETHOOK) += rethook.o obj-$(CONFIG_PPC_UDBG_16550) += legacy_serial.o udbg_16550.o obj-$(CONFIG_SWIOTLB) += dma-swiotlb.o obj-$(CONFIG_ARCH_HAS_DMA_SET_MASK) += dma-mask.o diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c index 14c5ddec3056..f8aa91bc3b17 100644 --- a/arch/powerpc/kernel/kprobes.c +++ b/arch/powerpc/kernel/kprobes.c @@ -228,16 +228,6 @@ static nokprobe_inline void set_current_kprobe(struct kprobe *p, struct pt_regs kcb->kprobe_saved_msr = regs->msr; } -void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs) -{ - ri->ret_addr = (kprobe_opcode_t *)regs->link; - ri->fp = NULL; - - /* Replace the return addr with trampoline addr */ - regs->link = (unsigned long)__kretprobe_trampoline; -} -NOKPROBE_SYMBOL(arch_prepare_kretprobe); - static int try_to_emulate(struct kprobe *p, struct pt_regs *regs) { int ret; @@ -394,49 +384,6 @@ int kprobe_handler(struct pt_regs *regs) } NOKPROBE_SYMBOL(kprobe_handler); -/* - * Function return probe trampoline: - * - init_kprobes() establishes a probepoint here - * - When the probed function returns, this probe - * causes the handlers to fire - */ -asm(".global __kretprobe_trampoline\n" - ".type __kretprobe_trampoline, @function\n" - "__kretprobe_trampoline:\n" - "nop\n" - "blr\n" - ".size __kretprobe_trampoline, .-__kretprobe_trampoline\n"); - -/* - * Called when the probe at kretprobe trampoline is hit - */ -static int trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) -{ - unsigned long orig_ret_address; - - orig_ret_address = __kretprobe_trampoline_handler(regs, NULL); - /* -* We get here through one of two paths: -* 1. by taking a trap -> kprobe_handler() -> here -* 2. by optprobe branch -> optimized_callback() -> opt_pre_handler() -> here -* -* When going back through (1), we need regs->nip to be setup properly -* as it is used to determine the return address from the trap. -* For (2), since nip is not honoured with optprobes, we instead setup -* the link register properly so that the subsequent 'blr' in -* __kretprobe_trampoline jumps back to the right instruction. -* -* For nip, we should set the address to the previous instruction since -* we end up emulating it in kprobe_handler(), which increments the nip -* again. -*/ - regs_set_return_ip(regs, orig_ret_address - 4); - regs->link = orig_ret_address; - - return 0; -} -NOKPROBE_SYMBOL(trampoline_probe_handler); - /* * Called after single-stepping. p->addr is the address of the * instruction whose first byte has been replaced by the "breakpoint" @@ -539,19 +486,9 @@ int kprobe_fault_handler(struct p
[PATCH v2] PowerPC: Replace kretprobe with rethook
This is an adaptation of commit f3a112c0c40d ("x86,rethook,kprobes: Replace kretprobe with rethook on x86") to PowerPC. Replaces the kretprobe code with rethook on Power. With this patch, kretprobe on Power uses the rethook instead of kretprobe specific trampoline code. Reference to other archs: commit b57c2f124098 ("riscv: add riscv rethook implementation") commit 7b0a096436c2 ("LoongArch: Replace kretprobe with rethook") Signed-off-by: Abhishek Dubey --- arch/powerpc/Kconfig | 1 + arch/powerpc/kernel/Makefile | 1 + arch/powerpc/kernel/kprobes.c| 65 + arch/powerpc/kernel/optprobes.c | 2 +- arch/powerpc/kernel/rethook.c| 71 arch/powerpc/kernel/stacktrace.c | 10 +++-- 6 files changed, 81 insertions(+), 69 deletions(-) create mode 100644 arch/powerpc/kernel/rethook.c diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index c88c6d46a5bc..fa0b1ab3f935 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -270,6 +270,7 @@ config PPC select HAVE_PERF_EVENTS_NMI if PPC64 select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP + select HAVE_RETHOOK select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_RELIABLE_STACKTRACE select HAVE_RSEQ diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 8585d03c02d3..7dd1b523b17f 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -140,6 +140,7 @@ obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_OPTPROBES)+= optprobes.o optprobes_head.o obj-$(CONFIG_KPROBES_ON_FTRACE)+= kprobes-ftrace.o obj-$(CONFIG_UPROBES) += uprobes.o +obj-$(CONFIG_RETHOOK) += rethook.o obj-$(CONFIG_PPC_UDBG_16550) += legacy_serial.o udbg_16550.o obj-$(CONFIG_SWIOTLB) += dma-swiotlb.o obj-$(CONFIG_ARCH_HAS_DMA_SET_MASK) += dma-mask.o diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c index 14c5ddec3056..f8aa91bc3b17 100644 --- a/arch/powerpc/kernel/kprobes.c +++ b/arch/powerpc/kernel/kprobes.c @@ -228,16 +228,6 @@ static nokprobe_inline void set_current_kprobe(struct kprobe *p, struct pt_regs kcb->kprobe_saved_msr = regs->msr; } -void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs) -{ - ri->ret_addr = (kprobe_opcode_t *)regs->link; - ri->fp = NULL; - - /* Replace the return addr with trampoline addr */ - regs->link = (unsigned long)__kretprobe_trampoline; -} -NOKPROBE_SYMBOL(arch_prepare_kretprobe); - static int try_to_emulate(struct kprobe *p, struct pt_regs *regs) { int ret; @@ -394,49 +384,6 @@ int kprobe_handler(struct pt_regs *regs) } NOKPROBE_SYMBOL(kprobe_handler); -/* - * Function return probe trampoline: - * - init_kprobes() establishes a probepoint here - * - When the probed function returns, this probe - * causes the handlers to fire - */ -asm(".global __kretprobe_trampoline\n" - ".type __kretprobe_trampoline, @function\n" - "__kretprobe_trampoline:\n" - "nop\n" - "blr\n" - ".size __kretprobe_trampoline, .-__kretprobe_trampoline\n"); - -/* - * Called when the probe at kretprobe trampoline is hit - */ -static int trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) -{ - unsigned long orig_ret_address; - - orig_ret_address = __kretprobe_trampoline_handler(regs, NULL); - /* -* We get here through one of two paths: -* 1. by taking a trap -> kprobe_handler() -> here -* 2. by optprobe branch -> optimized_callback() -> opt_pre_handler() -> here -* -* When going back through (1), we need regs->nip to be setup properly -* as it is used to determine the return address from the trap. -* For (2), since nip is not honoured with optprobes, we instead setup -* the link register properly so that the subsequent 'blr' in -* __kretprobe_trampoline jumps back to the right instruction. -* -* For nip, we should set the address to the previous instruction since -* we end up emulating it in kprobe_handler(), which increments the nip -* again. -*/ - regs_set_return_ip(regs, orig_ret_address - 4); - regs->link = orig_ret_address; - - return 0; -} -NOKPROBE_SYMBOL(trampoline_probe_handler); - /* * Called after single-stepping. p->addr is the address of the * instruction whose first byte has been replaced by the "breakpoint" @@ -539,19 +486,9 @@ int kprobe_fault_handler(struct pt_regs *regs, int trapnr) } NOKPROBE_SYMBOL(kprobe_fault_handler); -static struct kprobe trampoline_p = { - .addr = (kprobe_opcode_t *) &__kretprobe_trampoline, - .pre_ha
[PATCH] Perf: Calling available function for stats printing
For printing dump_trace, use existing stats_print() function. Signed-off-by: Abhishek Dubey --- tools/perf/builtin-report.c | 5 + 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index dcd93ee5fc24..3cabd5b0bfec 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -1088,10 +1088,7 @@ static int __cmd_report(struct report *rep) perf_session__fprintf_dsos(session, stdout); if (dump_trace) { - perf_session__fprintf_nr_events(session, stdout, - rep->skip_empty); - evlist__fprintf_nr_events(session->evlist, stdout, - rep->skip_empty); + stats_print(rep); return 0; } } -- 2.44.0
[PATCH] PowerPC: Replace kretprobe with rethook
This is an adaptation of commit f3a112c0c40d ("x86,rethook,kprobes: Replace kretprobe with rethook on x86") to Power. Replaces the kretprobe code with rethook on Power. With this patch, kretprobe on Power uses the rethook instead of kretprobe specific trampoline code. Reference to other archs: commit b57c2f124098 ("riscv: add riscv rethook implementation") commit 7b0a096436c2 ("LoongArch: Replace kretprobe with rethook") Signed-off-by: Abhishek Dubey --- arch/powerpc/Kconfig | 1 + arch/powerpc/kernel/Makefile | 1 + arch/powerpc/kernel/kprobes.c| 65 + arch/powerpc/kernel/optprobes.c | 2 +- arch/powerpc/kernel/rethook.c| 71 arch/powerpc/kernel/stacktrace.c | 6 +-- include/linux/rethook.h | 1 - 7 files changed, 78 insertions(+), 69 deletions(-) create mode 100644 arch/powerpc/kernel/rethook.c diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 1c4be3373686..108de491965a 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -268,6 +268,7 @@ config PPC select HAVE_PERF_EVENTS_NMI if PPC64 select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP + select HAVE_RETHOOK select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_RELIABLE_STACKTRACE select HAVE_RSEQ diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index d3282fbea4f2..181d764be3a6 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -142,6 +142,7 @@ obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_OPTPROBES)+= optprobes.o optprobes_head.o obj-$(CONFIG_KPROBES_ON_FTRACE)+= kprobes-ftrace.o obj-$(CONFIG_UPROBES) += uprobes.o +obj-$(CONFIG_RETHOOK) += rethook.o obj-$(CONFIG_PPC_UDBG_16550) += legacy_serial.o udbg_16550.o obj-$(CONFIG_SWIOTLB) += dma-swiotlb.o obj-$(CONFIG_ARCH_HAS_DMA_SET_MASK) += dma-mask.o diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c index bbca90a5e2ec..614bb68ad0e6 100644 --- a/arch/powerpc/kernel/kprobes.c +++ b/arch/powerpc/kernel/kprobes.c @@ -248,16 +248,6 @@ static nokprobe_inline void set_current_kprobe(struct kprobe *p, struct pt_regs kcb->kprobe_saved_msr = regs->msr; } -void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs) -{ - ri->ret_addr = (kprobe_opcode_t *)regs->link; - ri->fp = NULL; - - /* Replace the return addr with trampoline addr */ - regs->link = (unsigned long)__kretprobe_trampoline; -} -NOKPROBE_SYMBOL(arch_prepare_kretprobe); - static int try_to_emulate(struct kprobe *p, struct pt_regs *regs) { int ret; @@ -414,49 +404,6 @@ int kprobe_handler(struct pt_regs *regs) } NOKPROBE_SYMBOL(kprobe_handler); -/* - * Function return probe trampoline: - * - init_kprobes() establishes a probepoint here - * - When the probed function returns, this probe - * causes the handlers to fire - */ -asm(".global __kretprobe_trampoline\n" - ".type __kretprobe_trampoline, @function\n" - "__kretprobe_trampoline:\n" - "nop\n" - "blr\n" - ".size __kretprobe_trampoline, .-__kretprobe_trampoline\n"); - -/* - * Called when the probe at kretprobe trampoline is hit - */ -static int trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) -{ - unsigned long orig_ret_address; - - orig_ret_address = __kretprobe_trampoline_handler(regs, NULL); - /* -* We get here through one of two paths: -* 1. by taking a trap -> kprobe_handler() -> here -* 2. by optprobe branch -> optimized_callback() -> opt_pre_handler() -> here -* -* When going back through (1), we need regs->nip to be setup properly -* as it is used to determine the return address from the trap. -* For (2), since nip is not honoured with optprobes, we instead setup -* the link register properly so that the subsequent 'blr' in -* __kretprobe_trampoline jumps back to the right instruction. -* -* For nip, we should set the address to the previous instruction since -* we end up emulating it in kprobe_handler(), which increments the nip -* again. -*/ - regs_set_return_ip(regs, orig_ret_address - 4); - regs->link = orig_ret_address; - - return 0; -} -NOKPROBE_SYMBOL(trampoline_probe_handler); - /* * Called after single-stepping. p->addr is the address of the * instruction whose first byte has been replaced by the "breakpoint" @@ -559,19 +506,9 @@ int kprobe_fault_handler(struct pt_regs *regs, int trapnr) } NOKPROBE_SYMBOL(kprobe_fault_handler); -static struct kprobe trampoline_p = { - .addr = (kprobe_opcode_t *) &__kr
[RFC v2 2/2] powerpc/powernv : Introduce capability for firmware-enabled-stop
This patch introduces the capability for firmware to handle the stop states instead. A bit is set based on the discovery of the feature and correspondingly also the responsibility to handle the stop states. If Kernel does not know about stop version, it can fallback to opal for idle stop support if firmware-stop-supported property is present. Earlier part of this patch was posted in this series : https://lkml.org/lkml/2020/3/4/589 Signed-off-by: Abhishek Goel Signed-off-by: Pratik Rajesh Sampat --- v1->v2 : Combined patch 2 and 3 from previous iteration and rebased it. arch/powerpc/include/asm/processor.h | 18 ++ arch/powerpc/kernel/dt_cpu_ftrs.c | 13 + arch/powerpc/platforms/powernv/idle.c | 20 drivers/cpuidle/cpuidle-powernv.c | 3 ++- 4 files changed, 49 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index bfa336fbcfeb..b8de7146387c 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -428,6 +428,24 @@ extern void power4_idle_nap(void); extern unsigned long cpuidle_disable; enum idle_boot_override {IDLE_NO_OVERRIDE = 0, IDLE_POWERSAVE_OFF}; +#define STOP_ENABLE0x0001 +#define FIRMWARE_STOP_ENABLE 0x0010 + +#define STOP_VERSION_P9 0x1 + +/* + * Classify the dependencies of the stop states + * @idle_stop: function handler to handle the quirk stop version + * @cpuidle_prop: Signify support for stop states through kernel and/or firmware + * @stop_version: Classify quirk versions for stop states + */ +typedef struct { + unsigned long (*idle_stop)(unsigned long psscr, bool mmu_on); + uint8_t cpuidle_prop; + uint8_t stop_version; +} stop_deps_t; +extern stop_deps_t stop_dep; + extern int powersave_nap; /* set if nap mode can be used in idle loop */ extern void power7_idle_type(unsigned long type); diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c index 36bc0d5c4f3a..737686fae3c7 100644 --- a/arch/powerpc/kernel/dt_cpu_ftrs.c +++ b/arch/powerpc/kernel/dt_cpu_ftrs.c @@ -291,6 +291,15 @@ static int __init feat_enable_idle_stop(struct dt_cpu_feature *f) lpcr |= LPCR_PECE1; lpcr |= LPCR_PECE2; mtspr(SPRN_LPCR, lpcr); + stop_dep.cpuidle_prop |= STOP_ENABLE; + stop_dep.stop_version = STOP_VERSION_P9; + + return 1; +} + +static int __init feat_enable_firmware_stop(struct dt_cpu_feature *f) +{ + stop_dep.cpuidle_prop |= FIRMWARE_STOP_ENABLE; return 1; } @@ -589,6 +598,7 @@ static struct dt_cpu_feature_match __initdata {"idle-nap", feat_enable_idle_nap, 0}, {"alignment-interrupt-dsisr", feat_enable_align_dsisr, 0}, {"idle-stop", feat_enable_idle_stop, 0}, + {"firmware-stop-supported", feat_enable_firmware_stop, 0}, {"machine-check-power8", feat_enable_mce_power8, 0}, {"performance-monitor-power8", feat_enable_pmu_power8, 0}, {"data-stream-control-register", feat_enable_dscr, CPU_FTR_DSCR}, @@ -656,6 +666,9 @@ static void __init cpufeatures_setup_start(u32 isa) } } +stop_deps_t stop_dep = {NULL, 0x0, 0x0}; +EXPORT_SYMBOL(stop_dep); + static bool __init cpufeatures_process_feature(struct dt_cpu_feature *f) { const struct dt_cpu_feature_match *m; diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index 3afd4293f729..3602950f6c08 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -824,7 +824,7 @@ static unsigned long power9_offline_stop(unsigned long psscr) #ifndef CONFIG_KVM_BOOK3S_HV_POSSIBLE __ppc64_runlatch_off(); - srr1 = power9_idle_stop(psscr, true); + srr1 = stop_dep.idle_stop(psscr, true); __ppc64_runlatch_on(); #else /* @@ -840,7 +840,7 @@ static unsigned long power9_offline_stop(unsigned long psscr) local_paca->kvm_hstate.hwthread_state = KVM_HWTHREAD_IN_IDLE; __ppc64_runlatch_off(); - srr1 = power9_idle_stop(psscr, false); + srr1 = stop_dep.idle_stop(psscr, true); __ppc64_runlatch_on(); local_paca->kvm_hstate.hwthread_state = KVM_HWTHREAD_IN_KERNEL; @@ -868,7 +868,7 @@ void power9_idle_type(unsigned long stop_psscr_val, psscr = (psscr & ~stop_psscr_mask) | stop_psscr_val; __ppc64_runlatch_off(); - srr1 = power9_idle_stop(psscr, true); + srr1 = stop_dep.idle_stop(psscr, true); __ppc64_runlatch_on(); fini_irq_for_idle_irqsoff(); @@ -1365,8 +1365,20 @@ static int __init pnv_init_idle_states(void) nr_pnv_idle_states = 0; supported_cpuidle_states = 0; - if (cpuidle_disable != IDLE_NO_OVERRIDE) + if (cpuidle_disable != IDLE_NO_OVERRIDE || + !(stop_dep.cpuidle_prop
[PATCH] cpuidle/powernv : Remove dead code block
Commit 1961acad2f88559c2cdd2ef67c58c3627f1f6e54 removes usage of function "validate_dt_prop_sizes". This patch removes this unused function. Signed-off-by: Abhishek Goel --- drivers/cpuidle/cpuidle-powernv.c | 14 -- 1 file changed, 14 deletions(-) diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c index 1b299e801f74..addaa6e6718b 100644 --- a/drivers/cpuidle/cpuidle-powernv.c +++ b/drivers/cpuidle/cpuidle-powernv.c @@ -244,20 +244,6 @@ static inline void add_powernv_state(int index, const char *name, stop_psscr_table[index].mask = psscr_mask; } -/* - * Returns 0 if prop1_len == prop2_len. Else returns -1 - */ -static inline int validate_dt_prop_sizes(const char *prop1, int prop1_len, -const char *prop2, int prop2_len) -{ - if (prop1_len == prop2_len) - return 0; - - pr_warn("cpuidle-powernv: array sizes don't match for %s and %s\n", - prop1, prop2); - return -1; -} - extern u32 pnv_get_supported_cpuidle_states(void); static int powernv_add_idle_states(void) { -- 2.17.1
[RFC v2 1/2] powerpc/powernv : Add support for pre-entry and post-exit of stop state using OPAL V4 OS services
This patch provides kernel framework fro opal support of save restore of sprs in idle stop loop. Opal support for stop states is needed to selectively enable stop states or to introduce a quirk quickly in case a buggy stop state is present. We make a opal call from kernel if firmware-stop-support for stop states is present and enabled. All the quirks for pre-entry of stop state is handled inside opal. A call from opal is made into kernel where we execute stop afer saving of NVGPRs. After waking up from 0x100 vector in kernel, we enter back into opal. All the quirks in post exit path, if any, are then handled in opal, from where we return successfully back to kernel. Signed-off-by: Abhishek Goel --- v1->v2 : Rebased the patch on Nick's Opal V4 OS patchset arch/powerpc/include/asm/opal-api.h| 4 +++- arch/powerpc/include/asm/opal.h| 1 + arch/powerpc/platforms/powernv/idle.c | 12 arch/powerpc/platforms/powernv/opal-call.c | 1 + arch/powerpc/platforms/powernv/opal.c | 15 +++ 5 files changed, 32 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h index 97c5e5423827..437b6937685d 100644 --- a/arch/powerpc/include/asm/opal-api.h +++ b/arch/powerpc/include/asm/opal-api.h @@ -219,7 +219,8 @@ #define OPAL_REPORT_TRAP 183 #define OPAL_FIND_VM_AREA 184 #define OPAL_REGISTER_OS_OPS 185 -#define OPAL_LAST 185 +#define OPAL_CPU_IDLE 186 +#define OPAL_LAST 186 #define QUIESCE_HOLD 1 /* Spin all calls at entry */ #define QUIESCE_REJECT 2 /* Fail all calls with OPAL_BUSY */ @@ -1207,6 +1208,7 @@ struct opal_os_ops { __be64 os_printf; /* void printf(int32_t level, const char *str) */ __be64 os_vm_map; /* int64_t os_vm_map(uint64_t ea, uint64_t pa, uint64_t flags) */ __be64 os_vm_unmap; /* void os_vm_unmap(uint64_t ea) */ + __be64 os_idle_stop; /* void os_idle_stop(uint64_t srr1_addr, uint64_t psscr) */ }; #endif /* __ASSEMBLY__ */ diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h index 09985b7718b3..1774c056acb8 100644 --- a/arch/powerpc/include/asm/opal.h +++ b/arch/powerpc/include/asm/opal.h @@ -407,6 +407,7 @@ void opal_sensor_groups_init(void); int64_t opal_find_vm_area(uint64_t addr, struct opal_vm_area *opal_vm_area); int64_t opal_register_os_ops(struct opal_os_ops *ops, uint64_t size); +int64_t opal_cpu_idle(uint64_t srr1_addr, uint64_t psscr); #endif /* __ASSEMBLY__ */ diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index 78599bca66c2..3afd4293f729 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -805,6 +805,18 @@ static unsigned long power9_idle_stop(unsigned long psscr, bool mmu_on) return srr1; } +static unsigned long power9_firmware_idle_stop(unsigned long psscr, bool mmu_on) +{ + unsigned long srr1; + int rc; + + rc = opal_cpu_idle(cpu_to_be64(), (uint64_t) psscr); + + if (mmu_on) + mtmsr(MSR_KERNEL); + return srr1; +} + #ifdef CONFIG_HOTPLUG_CPU static unsigned long power9_offline_stop(unsigned long psscr) { diff --git a/arch/powerpc/platforms/powernv/opal-call.c b/arch/powerpc/platforms/powernv/opal-call.c index 11f419e76059..79076ca2de03 100644 --- a/arch/powerpc/platforms/powernv/opal-call.c +++ b/arch/powerpc/platforms/powernv/opal-call.c @@ -351,3 +351,4 @@ OPAL_CALL(opal_sym_to_addr, OPAL_SYM_TO_ADDR); OPAL_CALL(opal_report_trap,OPAL_REPORT_TRAP); OPAL_CALL(opal_find_vm_area, OPAL_FIND_VM_AREA); OPAL_CALL(opal_register_os_ops,OPAL_REGISTER_OS_OPS); +OPAL_CALL(opal_cpu_idle, OPAL_CPU_IDLE); diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index 93b9afaf33b3..1fbf7065f918 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -1150,6 +1150,20 @@ static void os_vm_unmap(uint64_t ea) local_flush_tlb_mm(mm); } +int64_t os_idle_stop(uint64_t srr1_addr, uint64_t psscr) +{ + /* +* For lite state which does not lose even GPRS we call +* idle_stop_noloss while for all other states we call +* idle_stop_mayloss. Saving and restoration of other additional +* SPRs if required is handled in OPAL. All the quirks are also +* handled in OPAL. +*/ + if (!(psscr & (PSSCR_EC|PSSCR_ESL))) + return isa300_idle_stop_noloss(psscr); + return isa300_idle_stop_mayloss(psscr); +} + static int __init opal_init_mm(void) { struct mm_struct *mm; @@ -1231,6 +1245,7 @@ static int __init opal_init_ear
Re: [RFC 1/3] powernv/cpuidle : Support for pre-entry and post exit of stop state in firmware
Hi Nick, Have you posted out the kernel side of "opal v4" patchset? I could only find the opal patchset. Thanks, Abhishek On 04/28/2020 06:38 AM, Nicholas Piggin wrote: Thanks for picking this up and pushing it along. I do plan to come back and take another look at it all, but what we do need to do first is get a coherent approach to this proposed new calling convention and OS ops. It's fine to work on this in the meantime, but to start merging things my idea is: - OPAL must leave r13-r15 untouched for the OS. - OS ops are made available only for a "v4" OS that uses the new calling convention, including kernel stack. - OS ops baseline (all OSes must provide) will be console / printk facility, trap handling and crash/symbol decoding on behalf of OPAL, and runtime virtual memory. Other OS ops features can be added in the versioned structure, including this. I'm trying to get back to cleaning these things up and start getting them merged now. Any comments or review on those would be helpful. Thanks, Nick
[RFC 3/3] powernv/cpuidle : Introduce capability for firmware-enabled-stop
This patch introduces the capability for firmware to handle the stop states instead. A bit is set based on the discovery of the feature and correspondingly also the responsibility to handle the stop states. If Kernel does not know about stop version, it can fallback to opal for idle stop support if firmware-stop-supported property is present. Earlier this patch was posted as part of this series : https://lkml.org/lkml/2020/3/4/589 Signed-off-by: Pratik Rajesh Sampat Signed-off-by: Abhishek Goel --- v1->v2: This patch is newly added in this series. arch/powerpc/include/asm/processor.h | 1 + arch/powerpc/kernel/dt_cpu_ftrs.c | 8 arch/powerpc/platforms/powernv/idle.c | 27 --- 3 files changed, 25 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index 66fa20476d0e..d5c6532b11ef 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -430,6 +430,7 @@ extern unsigned long cpuidle_disable; enum idle_boot_override {IDLE_NO_OVERRIDE = 0, IDLE_POWERSAVE_OFF}; #define STOP_ENABLE0x0001 +#define FIRMWARE_STOP_ENABLE 0x0010 #define STOP_VERSION_P9 0x1 diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c index db1a525e090d..ff4a87b79702 100644 --- a/arch/powerpc/kernel/dt_cpu_ftrs.c +++ b/arch/powerpc/kernel/dt_cpu_ftrs.c @@ -298,6 +298,13 @@ static int __init feat_enable_idle_stop(struct dt_cpu_feature *f) return 1; } +static int __init feat_enable_firmware_stop(struct dt_cpu_feature *f) +{ + stop_dep.cpuidle_prop |= FIRMWARE_STOP_ENABLE; + + return 1; +} + static int __init feat_enable_mmu_hash(struct dt_cpu_feature *f) { u64 lpcr; @@ -592,6 +599,7 @@ static struct dt_cpu_feature_match __initdata {"idle-nap", feat_enable_idle_nap, 0}, {"alignment-interrupt-dsisr", feat_enable_align_dsisr, 0}, {"idle-stop", feat_enable_idle_stop, 0}, + {"firmware-stop-supported", feat_enable_firmware_stop, 0}, {"machine-check-power8", feat_enable_mce_power8, 0}, {"performance-monitor-power8", feat_enable_pmu_power8, 0}, {"data-stream-control-register", feat_enable_dscr, CPU_FTR_DSCR}, diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index 538f0842ac3f..0de5de81902e 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -633,16 +633,6 @@ static unsigned long power9_idle_stop(unsigned long psscr, bool mmu_on) unsigned long mmcr0 = 0; struct p9_sprs sprs = {}; /* avoid false used-uninitialised */ bool sprs_saved = false; - int rc = 0; - - /* -* Kernel takes decision whether to make OPAL call or not. This logic -* will be combined with the logic for BE opal to take decision. -*/ - if (firmware_stop_supported) { - rc = opal_cpu_idle(cpu_to_be64(__pa()), (uint64_t) psscr); - goto out; - } if (!(psscr & (PSSCR_EC|PSSCR_ESL))) { /* EC=ESL=0 case */ @@ -835,6 +825,19 @@ static unsigned long power9_idle_stop(unsigned long psscr, bool mmu_on) return srr1; } +static unsigned long power9_firmware_idle_stop(unsigned long psscr, bool mmu_on) +{ + unsigned long srr1; + int rc; + + rc = opal_cpu_idle(cpu_to_be64(__pa()), (uint64_t) psscr); + + if (mmu_on) + mtmsr(MSR_KERNEL); + return srr1; + +} + #ifdef CONFIG_HOTPLUG_CPU static unsigned long power9_offline_stop(unsigned long psscr) { @@ -1394,9 +1397,11 @@ static int __init pnv_init_idle_states(void) !(stop_dep.cpuidle_prop & STOP_ENABLE)) goto out; - /* Check for supported version in kernel */ + /* Check for supported version in kernel or fallback to kernel*/ if (stop_dep.stop_version & STOP_VERSION_P9) { stop_dep.idle_stop = power9_idle_stop; + } else if (stop_dep.cpuidle_prop & FIRMWARE_STOP_ENABLE) { + stop_dep.idle_stop = power9_firmware_idle_stop; } else { stop_dep.idle_stop = NULL; goto out; -- 2.17.1
[RFC 2/3] powernv/cpuidle : Interface for an idle-stop dependency structure
This patch introduces the idea of having a dependency structure for idle-stop. The structure encapsulates the following: 1. Bitmask for version of idle-stop 2. Bitmask for propterties like ENABLE/DISABLE 3. Function pointer which helps handle how the stop must be invoked This patch lays a foundation for other idle-stop versions to be added and handled cleanly based on their specified requirments. Currently it handles the existing "idle-stop" version by setting the discovery bits and the function pointer. Earlier this patch was posted as part of this series : https://lkml.org/lkml/2020/3/4/589 Signed-off-by: Pratik Rajesh Sampat Signed-off-by: Abhishek Goel --- v1->v2: This patch is newly added in this series. arch/powerpc/include/asm/processor.h | 17 + arch/powerpc/kernel/dt_cpu_ftrs.c | 5 + arch/powerpc/platforms/powernv/idle.c | 18 ++ drivers/cpuidle/cpuidle-powernv.c | 3 ++- 4 files changed, 38 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index eedcbfb9a6ff..66fa20476d0e 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -429,6 +429,23 @@ extern void power4_idle_nap(void); extern unsigned long cpuidle_disable; enum idle_boot_override {IDLE_NO_OVERRIDE = 0, IDLE_POWERSAVE_OFF}; +#define STOP_ENABLE0x0001 + +#define STOP_VERSION_P9 0x1 + +/* + * Classify the dependencies of the stop states + * @idle_stop: function handler to handle the quirk stop version + * @cpuidle_prop: Signify support for stop states through kernel and/or firmware + * @stop_version: Classify quirk versions for stop states + */ +typedef struct { + unsigned long (*idle_stop)(unsigned long psscr, bool mmu_on); + uint8_t cpuidle_prop; + uint8_t stop_version; +} stop_deps_t; +extern stop_deps_t stop_dep; + extern int powersave_nap; /* set if nap mode can be used in idle loop */ extern void power7_idle_type(unsigned long type); diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c index 182b4047c1ef..db1a525e090d 100644 --- a/arch/powerpc/kernel/dt_cpu_ftrs.c +++ b/arch/powerpc/kernel/dt_cpu_ftrs.c @@ -292,6 +292,8 @@ static int __init feat_enable_idle_stop(struct dt_cpu_feature *f) lpcr |= LPCR_PECE1; lpcr |= LPCR_PECE2; mtspr(SPRN_LPCR, lpcr); + stop_dep.cpuidle_prop |= STOP_ENABLE; + stop_dep.stop_version = STOP_VERSION_P9; return 1; } @@ -657,6 +659,9 @@ static void __init cpufeatures_setup_start(u32 isa) } } +stop_deps_t stop_dep = {NULL, 0x0, 0x0}; +EXPORT_SYMBOL(stop_dep); + static bool __init cpufeatures_process_feature(struct dt_cpu_feature *f) { const struct dt_cpu_feature_match *m; diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index 1841027b25c5..538f0842ac3f 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -842,7 +842,7 @@ static unsigned long power9_offline_stop(unsigned long psscr) #ifndef CONFIG_KVM_BOOK3S_HV_POSSIBLE __ppc64_runlatch_off(); - srr1 = power9_idle_stop(psscr, true); + srr1 = stop_dep.idle_stop(psscr, true); __ppc64_runlatch_on(); #else /* @@ -858,7 +858,7 @@ static unsigned long power9_offline_stop(unsigned long psscr) local_paca->kvm_hstate.hwthread_state = KVM_HWTHREAD_IN_IDLE; __ppc64_runlatch_off(); - srr1 = power9_idle_stop(psscr, false); + srr1 = stop_dep.idle_stop(psscr, true); __ppc64_runlatch_on(); local_paca->kvm_hstate.hwthread_state = KVM_HWTHREAD_IN_KERNEL; @@ -886,7 +886,7 @@ void power9_idle_type(unsigned long stop_psscr_val, psscr = (psscr & ~stop_psscr_mask) | stop_psscr_val; __ppc64_runlatch_off(); - srr1 = power9_idle_stop(psscr, true); + srr1 = stop_dep.idle_stop(psscr, true); __ppc64_runlatch_on(); fini_irq_for_idle_irqsoff(); @@ -1390,8 +1390,18 @@ static int __init pnv_init_idle_states(void) nr_pnv_idle_states = 0; supported_cpuidle_states = 0; - if (cpuidle_disable != IDLE_NO_OVERRIDE) + if (cpuidle_disable != IDLE_NO_OVERRIDE || + !(stop_dep.cpuidle_prop & STOP_ENABLE)) goto out; + + /* Check for supported version in kernel */ + if (stop_dep.stop_version & STOP_VERSION_P9) { + stop_dep.idle_stop = power9_idle_stop; + } else { + stop_dep.idle_stop = NULL; + goto out; + } + rc = pnv_parse_cpuidle_dt(); if (rc) return rc; diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c index 1b299e801f74..7430a8edf5c9 100644 --- a/drivers/cpuidle/cpuidle-powernv.c +++ b/drivers/cpuidle/cpuidle-powernv.c @@ -371,7 +371,8 @@ sta
[RFC 1/3] powernv/cpuidle : Support for pre-entry and post exit of stop state in firmware
This patch provides kernel framework fro opal support of save restore of sprs in idle stop loop. Opal support for stop states is needed to selectively enable stop states or to introduce a quirk quickly in case a buggy stop state is present. We make a opal call from kernel if firmware-stop-support for stop states is present and enabled. All the quirks for pre-entry of stop state is handled inside opal. A call from opal is made into kernel where we execute stop afer saving of NVGPRs. After waking up from 0x100 vector in kernel, we enter back into opal. All the quirks in post exit path, if any, are then handled in opal, from where we return successfully back to kernel. For deep stop states in which additional SPRs are lost, saving and restoration will be done in OPAL. This idea was first proposed by Nick here: https://patchwork.ozlabs.org/patch/1208159/ The corresponding skiboot patch for this kernel patch is here: https://patchwork.ozlabs.org/project/skiboot/list/?series=172831 When we callback from OPAL into kernel, r13 is clobbered. So, to access PACA we need to restore it from HSPRGO. In future we can handle this into OPAL as in here: https://patchwork.ozlabs.org/patch/1245275/ Signed-off-by: Abhishek Goel Signed-off-by: Nicholas Piggin --- v1->v2 : No change in this patch. arch/powerpc/include/asm/opal-api.h| 8 - arch/powerpc/include/asm/opal.h| 3 ++ arch/powerpc/kernel/idle_book3s.S | 5 +++ arch/powerpc/platforms/powernv/idle.c | 37 ++ arch/powerpc/platforms/powernv/opal-call.c | 2 ++ 5 files changed, 54 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h index c1f25a760eb1..a2c782c99c9e 100644 --- a/arch/powerpc/include/asm/opal-api.h +++ b/arch/powerpc/include/asm/opal-api.h @@ -214,7 +214,9 @@ #define OPAL_SECVAR_GET176 #define OPAL_SECVAR_GET_NEXT 177 #define OPAL_SECVAR_ENQUEUE_UPDATE 178 -#define OPAL_LAST 178 +#define OPAL_REGISTER_OS_OPS 181 +#define OPAL_CPU_IDLE 182 +#define OPAL_LAST 182 #define QUIESCE_HOLD 1 /* Spin all calls at entry */ #define QUIESCE_REJECT 2 /* Fail all calls with OPAL_BUSY */ @@ -1181,6 +1183,10 @@ struct opal_mpipl_fadump { struct opal_mpipl_region region[]; } __packed; +struct opal_os_ops { + __be64 os_idle_stop; +}; + #endif /* __ASSEMBLY__ */ #endif /* __OPAL_API_H */ diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h index 9986ac34b8e2..3c340bc4df8e 100644 --- a/arch/powerpc/include/asm/opal.h +++ b/arch/powerpc/include/asm/opal.h @@ -400,6 +400,9 @@ void opal_powercap_init(void); void opal_psr_init(void); void opal_sensor_groups_init(void); +extern int64_t opal_register_os_ops(struct opal_os_ops *os_ops); +extern int64_t opal_cpu_idle(__be64 srr1_addr, uint64_t psscr); + #endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_OPAL_H */ diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S index 22f249b6f58d..8d287d1d06c0 100644 --- a/arch/powerpc/kernel/idle_book3s.S +++ b/arch/powerpc/kernel/idle_book3s.S @@ -49,6 +49,8 @@ _GLOBAL(isa300_idle_stop_noloss) */ _GLOBAL(isa300_idle_stop_mayloss) mtspr SPRN_PSSCR,r3 + mr r6, r13 + mfspr r13, SPRN_HSPRG0 std r1,PACAR1(r13) mflrr4 mfcrr5 @@ -74,6 +76,7 @@ _GLOBAL(isa300_idle_stop_mayloss) std r31,-8*18(r1) std r4,-8*19(r1) std r5,-8*20(r1) + std r6,-8*21(r1) /* 168 bytes */ PPC_STOP b . /* catch bugs */ @@ -91,8 +94,10 @@ _GLOBAL(idle_return_gpr_loss) ld r1,PACAR1(r13) ld r4,-8*19(r1) ld r5,-8*20(r1) + ld r6,-8*21(r1) mtlrr4 mtcrr5 + mr r13,r6 /* * KVM nap requires r2 to be saved, rather than just restoring it * from PACATOC. This could be avoided for that less common case diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index 78599bca66c2..1841027b25c5 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -35,6 +35,7 @@ static u32 supported_cpuidle_states; struct pnv_idle_states_t *pnv_idle_states; int nr_pnv_idle_states; +static bool firmware_stop_supported; /* * The default stop state that will be used by ppc_md.power_save @@ -602,6 +603,25 @@ struct p9_sprs { u64 uamor; }; +/* + * This function is called from OPAL if firmware support for stop + * states is present and enabled. It provides a fallback for idle + * stop states via OPAL. + */ +static uint64_t os_idle_stop(uint64_t psscr, bool save_gprs) +{ + /* +* For l
[RFC] cpuidle/powernv : Support for pre-entry and post exit of stop state in firmware
This patch provides kernel framework fro opal support of save restore of sprs in idle stop loop. Opal support for stop states is needed to selectively enable stop states or to introduce a quirk quickly in case a buggy stop state is present. We make a opal call from kernel if firmware-stop-support for stop states is present and enabled. All the quirks for pre-entry of stop state is handled inside opal. A call from opal is made into kernel where we execute stop afer saving of NVGPRs. After waking up from 0x100 vector in kernel, we enter back into opal. All the quirks in post exit path, if any, are then handled in opal, from where we return successfully back to kernel. For deep stop states in which additional SPRs are lost, saving and restoration will be done in OPAL. This idea was first proposed by Nick here: https://patchwork.ozlabs.org/patch/1208159/ The corresponding skiboot patch for this kernel patch is here: https://patchwork.ozlabs.org/patch/1265959/ When we callback from OPAL into kernel, r13 is clobbered. So, to access PACA we need to restore it from HSPRGO. In future we can handle this into OPAL as in here: https://patchwork.ozlabs.org/patch/1245275/ Signed-off-by: Abhishek Goel Signed-off-by: Nicholas Piggin --- arch/powerpc/include/asm/opal-api.h| 8 - arch/powerpc/include/asm/opal.h| 3 ++ arch/powerpc/kernel/idle_book3s.S | 5 +++ arch/powerpc/platforms/powernv/idle.c | 37 ++ arch/powerpc/platforms/powernv/opal-call.c | 2 ++ 5 files changed, 54 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h index c1f25a760eb1..a2c782c99c9e 100644 --- a/arch/powerpc/include/asm/opal-api.h +++ b/arch/powerpc/include/asm/opal-api.h @@ -214,7 +214,9 @@ #define OPAL_SECVAR_GET176 #define OPAL_SECVAR_GET_NEXT 177 #define OPAL_SECVAR_ENQUEUE_UPDATE 178 -#define OPAL_LAST 178 +#define OPAL_REGISTER_OS_OPS 181 +#define OPAL_CPU_IDLE 182 +#define OPAL_LAST 182 #define QUIESCE_HOLD 1 /* Spin all calls at entry */ #define QUIESCE_REJECT 2 /* Fail all calls with OPAL_BUSY */ @@ -1181,6 +1183,10 @@ struct opal_mpipl_fadump { struct opal_mpipl_region region[]; } __packed; +struct opal_os_ops { + __be64 os_idle_stop; +}; + #endif /* __ASSEMBLY__ */ #endif /* __OPAL_API_H */ diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h index 9986ac34b8e2..3c340bc4df8e 100644 --- a/arch/powerpc/include/asm/opal.h +++ b/arch/powerpc/include/asm/opal.h @@ -400,6 +400,9 @@ void opal_powercap_init(void); void opal_psr_init(void); void opal_sensor_groups_init(void); +extern int64_t opal_register_os_ops(struct opal_os_ops *os_ops); +extern int64_t opal_cpu_idle(__be64 srr1_addr, uint64_t psscr); + #endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_OPAL_H */ diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S index 22f249b6f58d..8d287d1d06c0 100644 --- a/arch/powerpc/kernel/idle_book3s.S +++ b/arch/powerpc/kernel/idle_book3s.S @@ -49,6 +49,8 @@ _GLOBAL(isa300_idle_stop_noloss) */ _GLOBAL(isa300_idle_stop_mayloss) mtspr SPRN_PSSCR,r3 + mr r6, r13 + mfspr r13, SPRN_HSPRG0 std r1,PACAR1(r13) mflrr4 mfcrr5 @@ -74,6 +76,7 @@ _GLOBAL(isa300_idle_stop_mayloss) std r31,-8*18(r1) std r4,-8*19(r1) std r5,-8*20(r1) + std r6,-8*21(r1) /* 168 bytes */ PPC_STOP b . /* catch bugs */ @@ -91,8 +94,10 @@ _GLOBAL(idle_return_gpr_loss) ld r1,PACAR1(r13) ld r4,-8*19(r1) ld r5,-8*20(r1) + ld r6,-8*21(r1) mtlrr4 mtcrr5 + mr r13,r6 /* * KVM nap requires r2 to be saved, rather than just restoring it * from PACATOC. This could be avoided for that less common case diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index 78599bca66c2..1841027b25c5 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -35,6 +35,7 @@ static u32 supported_cpuidle_states; struct pnv_idle_states_t *pnv_idle_states; int nr_pnv_idle_states; +static bool firmware_stop_supported; /* * The default stop state that will be used by ppc_md.power_save @@ -602,6 +603,25 @@ struct p9_sprs { u64 uamor; }; +/* + * This function is called from OPAL if firmware support for stop + * states is present and enabled. It provides a fallback for idle + * stop states via OPAL. + */ +static uint64_t os_idle_stop(uint64_t psscr, bool save_gprs) +{ + /* +* For lite state which does not lose even GPRS we call
[RFC v5 3/3] cpuidle-powernv : Recompute the idle-state timeouts when state usage is enabled/disabled
The disable callback can be used to compute timeout for other states whenever a state is enabled or disabled. We store the computed timeout in "timeout" defined in cpuidle state strucure. So, we compute timeout only when some state is enabled or disabled and not every time in the fast idle path. We also use the computed timeout to get timeout for snooze, thus getting rid of get_snooze_timeout for snooze loop. Signed-off-by: Abhishek Goel --- drivers/cpuidle/cpuidle-powernv.c | 35 +++ include/linux/cpuidle.h | 1 + 2 files changed, 13 insertions(+), 23 deletions(-) diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c index d7686ce6e..a75226f52 100644 --- a/drivers/cpuidle/cpuidle-powernv.c +++ b/drivers/cpuidle/cpuidle-powernv.c @@ -45,7 +45,6 @@ struct stop_psscr_table { static struct stop_psscr_table stop_psscr_table[CPUIDLE_STATE_MAX] __read_mostly; static u64 default_snooze_timeout __read_mostly; -static bool snooze_timeout_en __read_mostly; static u64 forced_wakeup_timeout(struct cpuidle_device *dev, struct cpuidle_driver *drv, @@ -67,26 +66,13 @@ static u64 forced_wakeup_timeout(struct cpuidle_device *dev, return 0; } -static u64 get_snooze_timeout(struct cpuidle_device *dev, - struct cpuidle_driver *drv, - int index) +static void pnv_disable_callback(struct cpuidle_device *dev, +struct cpuidle_driver *drv) { int i; - if (unlikely(!snooze_timeout_en)) - return default_snooze_timeout; - - for (i = index + 1; i < drv->state_count; i++) { - struct cpuidle_state *s = >states[i]; - struct cpuidle_state_usage *su = >states_usage[i]; - - if (s->disabled || su->disable) - continue; - - return s->target_residency * tb_ticks_per_usec; - } - - return default_snooze_timeout; + for (i = 0; i < drv->state_count; i++) + drv->states[i].timeout = forced_wakeup_timeout(dev, drv, i); } static int snooze_loop(struct cpuidle_device *dev, @@ -94,16 +80,20 @@ static int snooze_loop(struct cpuidle_device *dev, int index) { u64 snooze_exit_time; + u64 snooze_timeout = drv->states[index].timeout; + + if (!snooze_timeout) + snooze_timeout = default_snooze_timeout; set_thread_flag(TIF_POLLING_NRFLAG); local_irq_enable(); - snooze_exit_time = get_tb() + get_snooze_timeout(dev, drv, index); + snooze_exit_time = get_tb() + snooze_timeout; ppc64_runlatch_off(); HMT_very_low(); while (!need_resched()) { - if (likely(snooze_timeout_en) && get_tb() > snooze_exit_time) { + if (get_tb() > snooze_exit_time) { /* * Task has not woken up but we are exiting the polling * loop anyway. Require a barrier after polling is @@ -168,7 +158,7 @@ static int stop_loop(struct cpuidle_device *dev, u64 timeout_tb; bool forced_wakeup = false; - timeout_tb = forced_wakeup_timeout(dev, drv, index); + timeout_tb = drv->states[index].timeout; if (timeout_tb) { /* Ensure that the timeout is at least one microsecond @@ -263,6 +253,7 @@ static int powernv_cpuidle_driver_init(void) */ drv->cpumask = (struct cpumask *)cpu_present_mask; + drv->disable_callback = pnv_disable_callback; return 0; } @@ -422,8 +413,6 @@ static int powernv_idle_probe(void) /* Device tree can indicate more idle states */ max_idle_state = powernv_add_idle_states(); default_snooze_timeout = TICK_USEC * tb_ticks_per_usec; - if (max_idle_state > 1) - snooze_timeout_en = true; } else return -ENODEV; diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index 1729a497b..64195861b 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -50,6 +50,7 @@ struct cpuidle_state { int power_usage; /* in mW */ unsigned inttarget_residency; /* in US */ booldisabled; /* disabled on all CPUs */ + unsigned long long timeout; /* timeout for exiting out of a state */ int (*enter)(struct cpuidle_device *dev, struct cpuidle_driver *drv, -- 2.17.1
[RFC v5 2/3] cpuidle : Add callback whenever a state usage is enabled/disabled
To force wakeup a cpu, we need to compute the timeout in the fast idle path as a state may be enabled or disabled but there did not exist a feedback to driver when a state is enabled or disabled. This patch adds a callback whenever a state_usage records a store for disable attribute Signed-off-by: Abhishek Goel --- drivers/cpuidle/sysfs.c | 15 ++- include/linux/cpuidle.h | 3 +++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/drivers/cpuidle/sysfs.c b/drivers/cpuidle/sysfs.c index 2bb2683b4..6c9bf2f7b 100644 --- a/drivers/cpuidle/sysfs.c +++ b/drivers/cpuidle/sysfs.c @@ -418,8 +418,21 @@ static ssize_t cpuidle_state_store(struct kobject *kobj, struct attribute *attr, struct cpuidle_state_attr *cattr = attr_to_stateattr(attr); struct cpuidle_device *dev = kobj_to_device(kobj); - if (cattr->store) + if (cattr->store) { ret = cattr->store(state, state_usage, buf, size); + if (ret == size && + strncmp(cattr->attr.name, "disable", + strlen("disable"))) { + struct kobject *cpuidle_kobj = kobj->parent; + struct cpuidle_device *dev = + to_cpuidle_device(cpuidle_kobj); + struct cpuidle_driver *drv = + cpuidle_get_cpu_driver(dev); + + if (drv->disable_callback) + drv->disable_callback(dev, drv); + } + } /* reset poll time cache */ dev->poll_limit_ns = 0; diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index 4b6b5bea8..1729a497b 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -122,6 +122,9 @@ struct cpuidle_driver { /* the driver handles the cpus in cpumask */ struct cpumask *cpumask; + void (*disable_callback)(struct cpuidle_device *dev, +struct cpuidle_driver *drv); + /* preferred governor to switch at register time */ const char *governor; }; -- 2.17.1
[PATCH v5 1/3] cpuidle-powernv : forced wakeup for stop states
Currently, the cpuidle governors determine what idle state a idling CPU should enter into based on heuristics that depend on the idle history on that CPU. Given that no predictive heuristic is perfect, there are cases where the governor predicts a shallow idle state, hoping that the CPU will be busy soon. However, if no new workload is scheduled on that CPU in the near future, the CPU may end up in the shallow state. This is problematic, when the predicted state in the aforementioned scenario is a shallow stop state on a tickless system. As we might get stuck into shallow states for hours, in absence of ticks or interrupts. To address this, We forcefully wakeup the cpu by setting the decrementer. The decrementer is set to a value that corresponds with the residency of the next available state. Thus firing up a timer that will forcefully wakeup the cpu. Few such iterations will essentially train the governor to select a deeper state for that cpu, as the timer here corresponds to the next available cpuidle state residency. Thus, cpu will eventually end up in the deepest possible state. Signed-off-by: Abhishek Goel --- Auto-promotion v1 : started as auto promotion logic for cpuidle states in generic driver v2 : Removed timeout_needed and rebased the code to upstream kernel Forced-wakeup v1 : New patch with name of forced wakeup started v2 : Extending the forced wakeup logic for all states. Setting the decrementer instead of queuing up a hrtimer to implement the logic. v3 : Cleanly handle setting/resetting of decrementer so as to not break irq work v4 : Changed type and name of set/reset decrementer fucntion Handled irq_work_pending in try_set_dec_before_idle v5 : Removed forced wakeup for last stop state by changing the checking conditon of timeout_tb arch/powerpc/include/asm/time.h | 2 ++ arch/powerpc/kernel/time.c| 43 +++ drivers/cpuidle/cpuidle-powernv.c | 40 3 files changed, 85 insertions(+) diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h index 08dbe3e68..06a6a2314 100644 --- a/arch/powerpc/include/asm/time.h +++ b/arch/powerpc/include/asm/time.h @@ -184,6 +184,8 @@ static inline unsigned long tb_ticks_since(unsigned long tstamp) extern u64 mulhdu(u64, u64); #endif +extern bool try_set_dec_before_idle(u64 timeout); +extern void try_reset_dec_after_idle(void); extern void div128_by_32(u64 dividend_high, u64 dividend_low, unsigned divisor, struct div_result *dr); diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 694522308..d004c0d8e 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -576,6 +576,49 @@ void arch_irq_work_raise(void) #endif /* CONFIG_IRQ_WORK */ +/* + * This function tries setting decrementer before entering into idle. + * Returns true if we have reprogrammed the decrementer for idle. + * Returns false if the decrementer is unchanged. + */ +bool try_set_dec_before_idle(u64 timeout) +{ + u64 *next_tb = this_cpu_ptr(_next_tb); + u64 now = get_tb_or_rtc(); + + if (now + timeout > *next_tb) + return false; + + set_dec(timeout); + if (test_irq_work_pending()) + set_dec(1); + + return true; +} + +/* + * This function gets called if we have set decrementer before + * entering into idle. It tries to reset/restore the decrementer + * to its original value. + */ +void try_reset_dec_after_idle(void) +{ + u64 now; + u64 *next_tb; + + if (test_irq_work_pending()) + return; + + now = get_tb_or_rtc(); + next_tb = this_cpu_ptr(_next_tb); + if (now >= *next_tb) + return; + + set_dec(*next_tb - now); + if (test_irq_work_pending()) + set_dec(1); +} + /* * timer_interrupt - gets called when the decrementer overflows, * with interrupts disabled. diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c index 84b1ebe21..d7686ce6e 100644 --- a/drivers/cpuidle/cpuidle-powernv.c +++ b/drivers/cpuidle/cpuidle-powernv.c @@ -21,6 +21,7 @@ #include #include #include +#include /* * Expose only those Hardware idle states via the cpuidle framework @@ -46,6 +47,26 @@ static struct stop_psscr_table stop_psscr_table[CPUIDLE_STATE_MAX] __read_mostly static u64 default_snooze_timeout __read_mostly; static bool snooze_timeout_en __read_mostly; +static u64 forced_wakeup_timeout(struct cpuidle_device *dev, +struct cpuidle_driver *drv, +int index) +{ + int i; + + for (i = index + 1; i < drv->state_count; i++) { + struct cpuidle_state *s = >states[i]; + struct cpuidle_state_usage *su = >states_usage[i]; + + if (s->disabled || su->disable) + continue;
[PATCH v5 0/3] Forced-wakeup for stop states on Powernv
Currently, the cpuidle governors determine what idle state a idling CPU should enter into based on heuristics that depend on the idle history on that CPU. Given that no predictive heuristic is perfect, there are cases where the governor predicts a shallow idle state, hoping that the CPU will be busy soon. However, if no new workload is scheduled on that CPU in the near future, the CPU will end up in the shallow state. Motivation -- In case of POWER, this is problematic, when the predicted state in the aforementioned scenario is a shallow stop state on a tickless system. As we might get stuck into shallow states even for hours, in absence of ticks or interrupts. To address this, We forcefully wakeup the cpu by setting the decrementer. The decrementer is set to a value that corresponds with the residency of the next available state. Thus firing up a timer that will forcefully wakeup the cpu. Few such iterations will essentially train the governor to select a deeper state for that cpu, as the timer here corresponds to the next available cpuidle state residency. Thus, cpu will eventually end up in the deepest possible state and we won't get stuck in a shallow state for long duration. Experiment -- For earlier versions when this feature was meat to be only for shallow lite states, I performed experiments for three scenarios to collect some data. case 1 : Without this patch and without tick retained, i.e. in a upstream kernel, It would spend more than even a second to get out of stop0_lite. case 2 : With tick retained in a upstream kernel - Generally, we have a sched tick at 4ms(CONF_HZ = 250). Ideally I expected it to take 8 sched tick to get out of stop0_lite. Experimentally, observation was = sample minmax 99percentile 20 4ms12ms 4ms = It would take atleast one sched tick to get out of stop0_lite. case 2 : With this patch (not stopping tick, but explicitly queuing a timer) sample min max 99percentile 20 144us 192us 144us Description of current implementation - We calculate timeout for the current idle state as the residency value of the next available idle state. If the decrementer is set to be greater than this timeout, we update the decrementer value with the residency of next available idle state. Thus, essentially training the governor to select the next available deeper state until we reach the deepest state. Hence, we won't get stuck unnecessarily in shallow states for longer duration. v1 of auto-promotion : https://lkml.org/lkml/2019/3/22/58 This patch was implemented only for shallow lite state in generic cpuidle driver. v2 : Removed timeout_needed and rebased to current upstream kernel Then, v1 of forced-wakeup : Moved the code to cpuidle powernv driver and started as forced wakeup instead of auto-promotion v2 : Extended the forced wakeup logic for all states. Setting the decrementer instead of queuing up a hrtimer to implement the logic. v3 : 1) Cleanly handle setting the decrementer after exiting out of stop states. 2) Added a disable_callback feature to compute timeout whenever a state is enbaled or disabled instead of computing everytime in fast idle path. 3) Use disable callback to recompute timeout whenever state usage is changed for a state. Also, cleaned up the get_snooze_timeout function. v4 :Changed the type and name of set/reset decrementer function. Handled irq work pending in try_set_dec_before_idle. No change in patch 2 and 3. v5 :Removed forced wakeup for the last state. We dont want to wakeup unnecessarily when already in deepest state. It was a mistake in previous patches that was found out in recent experiments. No change in patch 2 and 3. Abhishek Goel (3): cpuidle-powernv : forced wakeup for stop states cpuidle : Add callback whenever a state usage is enabled/disabled cpuidle-powernv : Recompute the idle-state timeouts when state usage is enabled/disabled arch/powerpc/include/asm/time.h | 2 ++ arch/powerpc/kernel/time.c| 43 drivers/cpuidle/cpuidle-powernv.c | 55 +++ drivers/cpuidle/sysfs.c | 15 - include/linux/cpuidle.h | 4 +++ 5 files changed, 105 insertions(+), 14 deletions(-) -- 2.17.1
[RFC 3/3] cpuidle/powernv : Add flags to identify stop state type
Removed threshold latency which was being used to decide if a state is cpuidle type or not. This decision can be taken using flags, as this information has been encapsulated in the state->flags and being read from idle device-tree. Signed-off-by: Abhishek Goel --- arch/powerpc/include/asm/opal-api.h | 7 +++ drivers/cpuidle/cpuidle-powernv.c | 16 +--- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h index 383242eb0dea..b9068fee21d8 100644 --- a/arch/powerpc/include/asm/opal-api.h +++ b/arch/powerpc/include/asm/opal-api.h @@ -233,6 +233,13 @@ #define OPAL_PM_STOP_INST_FAST 0x0010 #define OPAL_PM_STOP_INST_DEEP 0x0020 +/* + * Flags for stop states to distinguish between cpuidle and + * cpuoffline type of states. + */ +#define OPAL_PM_STOP_CPUIDLE 0x0100 +#define OPAL_PM_STOP_CPUHOTPLUG0x0200 + /* * OPAL_CONFIG_CPU_IDLE_STATE parameters */ diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c index 1b6c84d4ac77..1a33a548b769 100644 --- a/drivers/cpuidle/cpuidle-powernv.c +++ b/drivers/cpuidle/cpuidle-powernv.c @@ -270,8 +270,13 @@ static int powernv_add_idle_states(void) goto out; } - /* TODO: Count only states which are eligible for cpuidle */ - dt_idle_states = nr_pnv_idle_states; + /* Count only cpuidle states*/ + for (i = 0; i < nr_pnv_idle_states; i++) { + if (pnv_idle_states[i].flags & OPAL_PM_STOP_CPUIDLE) + dt_idle_states++; + } + pr_info("idle states in dt = %d , states with idle flag = %d", + nr_pnv_idle_states, dt_idle_states); /* * Since snooze is used as first idle state, max idle states allowed is @@ -300,11 +305,8 @@ static int powernv_add_idle_states(void) continue; } - /* -* If an idle state has exit latency beyond -* POWERNV_THRESHOLD_LATENCY_NS then don't use it in cpu-idle. -*/ - if (state->latency_ns > POWERNV_THRESHOLD_LATENCY_NS) { + /* Check whether a state is of cpuidle type */ + if ((state->flags & OPAL_PM_STOP_CPUIDLE) != state->flags) { pr_info("State %d is not idletype\n", i); continue; } -- 2.17.1
[RFC 2/3] cpuidle/powernv: Add support for versioned stop states
New device tree format adds a compatible flag which has version corresponding to every state, so that only kernel which has the capability to handle the version of stop state will enable it. Drawback of the array based dt node is that versioning of idle states is not possible. Older kernel will still see stop0 and stop0_lite in older format and we will deprecate it after some time. Consider a case that stop4 has a bug. We take the following steps to mitigate the problem. 1) Change compatible string for stop4 in OPAL to "stop4,v2" from "stop4,v1", i.e. basicallly bump up the previous version and ship the new firmware. 2) The kernel will ignore stop4 as it won't be able to recognize this new version. Kernel will also ignore all the deeper states because its possible that a cpu have requested for a deeper state but was never able to enter into it. But we will still have shallower states that are there before stop 4. This, thus prevents from completely disabling stop states. Linux kernel can now look at the version string and decide if it has the ability to handle that idle state. Henceforth, if kernel does not know about a version, it will skip that state and all the deeper state. Once when the workaround are implemented into the kernel, we can bump up the known version in kernel for that state, so that support can be enabled once again in kernel. New Device-tree : Final output power-mgt { ... ibm,enabled-stop-levels = <0xec00>; ibm,cpu-idle-state-psscr-mask = <0x0 0x3003ff 0x0 0x3003ff>; ibm,cpu-idle-state-latencies-ns = <0x3e8 0x7d0>; ibm,cpu-idle-state-psscr = <0x0 0x330 0x0 0x300330>; ibm,cpu-idle-state-flags = <0x10 0x101000>; ibm,cpu-idle-state-residency-ns = <0x2710 0x4e20>; ibm,idle-states { stop4 { flags = <0x207000>; compatible = "stop4,v1", psscr-mask = <0x0 0x3003ff>; latency-ns = <0x186a0>; residency-ns = <0x989680>; psscr = <0x0 0x300374>; ... }; ... stop11 { ... compatible = "stop11,v1", ... }; }; Since state pointer is being passed to stop loop, I have separated idle fast path for lite, shallow and deep states. This improves the readability of the code and also future maintainability of the code. Stop handle corresponding to each state can be called directly since state pointer is being passed now. Signed-off-by: Abhishek Goel --- arch/powerpc/include/asm/cpuidle.h| 8 +- arch/powerpc/platforms/powernv/idle.c | 331 +++--- 2 files changed, 299 insertions(+), 40 deletions(-) diff --git a/arch/powerpc/include/asm/cpuidle.h b/arch/powerpc/include/asm/cpuidle.h index 9844b3ded187..5eb9a98fcb86 100644 --- a/arch/powerpc/include/asm/cpuidle.h +++ b/arch/powerpc/include/asm/cpuidle.h @@ -70,15 +70,19 @@ #ifndef __ASSEMBLY__ -#define PNV_IDLE_NAME_LEN16 +#define PNV_VERSION_LEN25 +#define PNV_IDLE_NAME_LEN 16 struct pnv_idle_states_t { char name[PNV_IDLE_NAME_LEN]; + char compat_version[PNV_VERSION_LEN]; u32 latency_ns; u32 residency_ns; u64 psscr_val; u64 psscr_mask; - u32 flags; + u64 flags; bool valid; + unsigned long (*stop_handle)(struct pnv_idle_states_t *state, +bool mmu_on); }; extern struct pnv_idle_states_t *pnv_idle_states; diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index db810c0a16e1..7398a66e1ddb 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -49,6 +49,12 @@ static bool default_stop_found; static u64 pnv_first_tb_loss_level = MAX_STOP_STATE + 1; static u64 pnv_first_spr_loss_level = MAX_STOP_STATE + 1; +struct stop_version_t { + const char compat_version[PNV_VERSION_LEN]; + unsigned long (*stop_handle)(struct pnv_idle_states_t *state, +bool mmu_on); +}; + /* * psscr value and mask of the deepest stop idle state. * Used when a cpu is offlined. @@ -599,40 +605,152 @@ struct p9_sprs { u64 uamor; }; -static unsigned long power9_idle_stop(unsigned long psscr, bool mmu_on) +/* Splitting the previous power9_idle_stop into three functions + * to separately handle lite, shallow and deep states. + */ + +static unsigned long power9_stop_lite(struct pnv_idle_states_t *state, + bool mmu_on) { - int cpu = raw_smp_processor_id(); - int first = cpu_first_thread_sib
[RFC 1/3] cpuidle/powernv : Pass state pointer instead of values to stop loop
Instead of passing psscr_val and psscr_mask, we will pass state pointer to stop loop. This will help to figure out the method to enter/exit idle state. Removed psscr_mask and psscr_val from driver idle code. Same has also been done in platform idle code. Also, added some cleanups and debugging info. Signed-off-by: Abhishek Goel --- arch/powerpc/include/asm/processor.h | 5 +- arch/powerpc/platforms/powernv/idle.c | 50 --- drivers/cpuidle/cpuidle-powernv.c | 69 +-- 3 files changed, 55 insertions(+), 69 deletions(-) diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index a9993e7a443b..855666e8d271 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -39,6 +39,7 @@ #include #include #include +#include /* We do _not_ want to define new machine types at all, those must die * in favor of using the device-tree @@ -419,9 +420,7 @@ enum idle_boot_override {IDLE_NO_OVERRIDE = 0, IDLE_POWERSAVE_OFF}; extern int powersave_nap; /* set if nap mode can be used in idle loop */ extern void power7_idle_type(unsigned long type); -extern void power9_idle_type(unsigned long stop_psscr_val, - unsigned long stop_psscr_mask); - +extern void power9_idle_type(struct pnv_idle_states_t *state); extern void flush_instruction_cache(void); extern void hard_reset_now(void); extern void poweroff_now(void); diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index 09f49eed7fb8..db810c0a16e1 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -40,8 +40,7 @@ int nr_pnv_idle_states; * The default stop state that will be used by ppc_md.power_save * function on platforms that support stop instruction. */ -static u64 pnv_default_stop_val; -static u64 pnv_default_stop_mask; +struct pnv_idle_states_t *pnv_default_state; static bool default_stop_found; /* @@ -54,9 +53,7 @@ static u64 pnv_first_spr_loss_level = MAX_STOP_STATE + 1; * psscr value and mask of the deepest stop idle state. * Used when a cpu is offlined. */ -static u64 pnv_deepest_stop_psscr_val; -static u64 pnv_deepest_stop_psscr_mask; -static u64 pnv_deepest_stop_flag; +static struct pnv_idle_states_t *pnv_deepest_state; static bool deepest_stop_found; static unsigned long power7_offline_type; @@ -78,7 +75,7 @@ static int pnv_save_sprs_for_deep_states(void) uint64_t hid5_val = mfspr(SPRN_HID5); uint64_t hmeer_val = mfspr(SPRN_HMEER); uint64_t msr_val = MSR_IDLE; - uint64_t psscr_val = pnv_deepest_stop_psscr_val; + uint64_t psscr_val = pnv_deepest_state->psscr_val; for_each_present_cpu(cpu) { uint64_t pir = get_hard_smp_processor_id(cpu); @@ -804,9 +801,13 @@ static unsigned long power9_idle_stop(unsigned long psscr, bool mmu_on) } #ifdef CONFIG_HOTPLUG_CPU -static unsigned long power9_offline_stop(unsigned long psscr) +static unsigned long power9_offline_stop(struct pnv_idle_states_t *state) { unsigned long srr1; + unsigned long psscr; + + psscr = mfspr(SPRN_PSSCR); + psscr = (psscr & ~state->psscr_mask) | state->psscr_val; #ifndef CONFIG_KVM_BOOK3S_HV_POSSIBLE __ppc64_runlatch_off(); @@ -841,8 +842,7 @@ static unsigned long power9_offline_stop(unsigned long psscr) } #endif -void power9_idle_type(unsigned long stop_psscr_val, - unsigned long stop_psscr_mask) +void power9_idle_type(struct pnv_idle_states_t *state) { unsigned long psscr; unsigned long srr1; @@ -851,7 +851,7 @@ void power9_idle_type(unsigned long stop_psscr_val, return; psscr = mfspr(SPRN_PSSCR); - psscr = (psscr & ~stop_psscr_mask) | stop_psscr_val; + psscr = (psscr & ~state->psscr_mask) | state->psscr_val; __ppc64_runlatch_off(); srr1 = power9_idle_stop(psscr, true); @@ -867,7 +867,7 @@ void power9_idle_type(unsigned long stop_psscr_val, */ void power9_idle(void) { - power9_idle_type(pnv_default_stop_val, pnv_default_stop_mask); + power9_idle_type(pnv_default_state); } #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE @@ -970,12 +970,7 @@ unsigned long pnv_cpu_offline(unsigned int cpu) __ppc64_runlatch_off(); if (cpu_has_feature(CPU_FTR_ARCH_300) && deepest_stop_found) { - unsigned long psscr; - - psscr = mfspr(SPRN_PSSCR); - psscr = (psscr & ~pnv_deepest_stop_psscr_mask) | - pnv_deepest_stop_psscr_val; - srr1 = power9_offline_stop(psscr); + srr1 = power9_offline_stop(pnv_deepest_state); } else if (cpu_has_feature(CPU_FTR_ARCH_206) && power7_offline_type) { srr1 = power7_offline(); } el
[RFC 0/3] New idle device-tree format and support for versioned stop state
Background -- Previously if a older kernel runs on a newer firmware, it may enable all available states irrespective of its capability of handling it. Consider a case that some stop state has a bug, we end up disabling all the stop states. This patch introduces selective control to solve this problem. Previous version of these patches can be found at: https://lkml.org/lkml/2018/10/11/544 These patch however also had patches for support of opal save-restore which now I am decoupling and will take them seperately. I have posted the corresponding skiboot patches for this kernel patchset here : https://patchwork.ozlabs.org/cover/1144587/ What's new? Add stop states under ibm,idle-states in addition to the current array based device tree properties. New device tree format adds a compatible flag which has version corresponding to every state, so that only kernel which has the capability to handle the version of stop state will enable it. Drawback of the array based dt node is that versioning of idle states is not possible. Older kernel will still see stop0 and stop0_lite in older format and we will deprecate it after some time. Consider a case that stop4 has a bug. We take the following steps to mitigate the problem. 1) Change compatible string for stop4 in OPAL to "stop4,v2" from "stop4,v1", i.e. basicallly bump up the previous version and ship the new firmware. 2) The kernel will ignore stop4 as it won't be able to recognize this new version. Kernel will also ignore all the deeper states because its possible that a cpu have requested for a deeper state but was never able to enter into it. But we will still have shallower states that are there before stop 4. This, thus prevents from completely disabling stop states. Linux kernel can now look at the version string and decide if it has the ability to handle that idle state. Henceforth, if kernel does not know about a version, it will skip that state and all the deeper state. Once when the workaround are implemented into the kernel, we can bump up the known version in kernel for that state, so that support can be enabled once again in kernel. New Device-tree : Final output power-mgt { ... ibm,enabled-stop-levels = <0xec00>; ibm,cpu-idle-state-psscr-mask = <0x0 0x3003ff 0x0 0x3003ff>; ibm,cpu-idle-state-latencies-ns = <0x3e8 0x7d0>; ibm,cpu-idle-state-psscr = <0x0 0x330 0x0 0x300330>; ibm,cpu-idle-state-flags = <0x10 0x101000>; ibm,cpu-idle-state-residency-ns = <0x2710 0x4e20>; ibm,idle-states { stop4 { flags = <0x207000>; compatible = "stop4,v1", psscr-mask = <0x0 0x3003ff>; latency-ns = <0x186a0>; residency-ns = <0x989680>; psscr = <0x0 0x300374>; ... }; ... stop11 { ... compatible = "stop11,v1", ... }; }; Abhishek Goel (3): cpuidle/powernv : Pass state pointer instead of values to stop loop cpuidle/powernv: Add support for versioned stop states cpuidle/powernv : Add flags to identify stop state type arch/powerpc/include/asm/cpuidle.h| 8 +- arch/powerpc/include/asm/opal-api.h | 7 + arch/powerpc/include/asm/processor.h | 5 +- arch/powerpc/platforms/powernv/idle.c | 371 +- drivers/cpuidle/cpuidle-powernv.c | 81 +++--- 5 files changed, 363 insertions(+), 109 deletions(-) -- 2.17.1
[RFC v4 3/3] cpuidle-powernv : Recompute the idle-state timeouts when state usage is enabled/disabled
The disable callback can be used to compute timeout for other states whenever a state is enabled or disabled. We store the computed timeout in "timeout" defined in cpuidle state strucure. So, we compute timeout only when some state is enabled or disabled and not every time in the fast idle path. We also use the computed timeout to get timeout for snooze, thus getting rid of get_snooze_timeout for snooze loop. Signed-off-by: Abhishek Goel --- drivers/cpuidle/cpuidle-powernv.c | 35 +++ include/linux/cpuidle.h | 1 + 2 files changed, 13 insertions(+), 23 deletions(-) diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c index 17e20e408ffe..29add322d0c4 100644 --- a/drivers/cpuidle/cpuidle-powernv.c +++ b/drivers/cpuidle/cpuidle-powernv.c @@ -45,7 +45,6 @@ struct stop_psscr_table { static struct stop_psscr_table stop_psscr_table[CPUIDLE_STATE_MAX] __read_mostly; static u64 default_snooze_timeout __read_mostly; -static bool snooze_timeout_en __read_mostly; static u64 forced_wakeup_timeout(struct cpuidle_device *dev, struct cpuidle_driver *drv, @@ -67,26 +66,13 @@ static u64 forced_wakeup_timeout(struct cpuidle_device *dev, return 0; } -static u64 get_snooze_timeout(struct cpuidle_device *dev, - struct cpuidle_driver *drv, - int index) +static void pnv_disable_callback(struct cpuidle_device *dev, +struct cpuidle_driver *drv) { int i; - if (unlikely(!snooze_timeout_en)) - return default_snooze_timeout; - - for (i = index + 1; i < drv->state_count; i++) { - struct cpuidle_state *s = >states[i]; - struct cpuidle_state_usage *su = >states_usage[i]; - - if (s->disabled || su->disable) - continue; - - return s->target_residency * tb_ticks_per_usec; - } - - return default_snooze_timeout; + for (i = 0; i < drv->state_count; i++) + drv->states[i].timeout = forced_wakeup_timeout(dev, drv, i); } static int snooze_loop(struct cpuidle_device *dev, @@ -94,16 +80,20 @@ static int snooze_loop(struct cpuidle_device *dev, int index) { u64 snooze_exit_time; + u64 snooze_timeout = drv->states[index].timeout; + + if (!snooze_timeout) + snooze_timeout = default_snooze_timeout; set_thread_flag(TIF_POLLING_NRFLAG); local_irq_enable(); - snooze_exit_time = get_tb() + get_snooze_timeout(dev, drv, index); + snooze_exit_time = get_tb() + snooze_timeout; ppc64_runlatch_off(); HMT_very_low(); while (!need_resched()) { - if (likely(snooze_timeout_en) && get_tb() > snooze_exit_time) { + if (get_tb() > snooze_exit_time) { /* * Task has not woken up but we are exiting the polling * loop anyway. Require a barrier after polling is @@ -168,7 +158,7 @@ static int stop_loop(struct cpuidle_device *dev, u64 timeout_tb; bool forced_wakeup = false; - timeout_tb = forced_wakeup_timeout(dev, drv, index); + timeout_tb = drv->states[index].timeout; /* Ensure that the timeout is at least one microsecond * greater than current decrement value. Else, we will @@ -263,6 +253,7 @@ static int powernv_cpuidle_driver_init(void) */ drv->cpumask = (struct cpumask *)cpu_present_mask; + drv->disable_callback = pnv_disable_callback; return 0; } @@ -422,8 +413,6 @@ static int powernv_idle_probe(void) /* Device tree can indicate more idle states */ max_idle_state = powernv_add_idle_states(); default_snooze_timeout = TICK_USEC * tb_ticks_per_usec; - if (max_idle_state > 1) - snooze_timeout_en = true; } else return -ENODEV; diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index 8a0e54bd0d5d..31662b657b9c 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -50,6 +50,7 @@ struct cpuidle_state { int power_usage; /* in mW */ unsigned inttarget_residency; /* in US */ booldisabled; /* disabled on all CPUs */ + unsigned long long timeout; /* timeout for exiting out of a state */ int (*enter)(struct cpuidle_device *dev, struct cpuidle_driver *drv, -- 2.17.1
[RFC v4 2/3] cpuidle : Add callback whenever a state usage is enabled/disabled
To force wakeup a cpu, we need to compute the timeout in the fast idle path as a state may be enabled or disabled but there did not exist a feedback to driver when a state is enabled or disabled. This patch adds a callback whenever a state_usage records a store for disable attribute. Signed-off-by: Abhishek Goel --- drivers/cpuidle/sysfs.c | 15 ++- include/linux/cpuidle.h | 4 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/drivers/cpuidle/sysfs.c b/drivers/cpuidle/sysfs.c index eb20adb5de23..141671a53967 100644 --- a/drivers/cpuidle/sysfs.c +++ b/drivers/cpuidle/sysfs.c @@ -415,8 +415,21 @@ static ssize_t cpuidle_state_store(struct kobject *kobj, struct attribute *attr, struct cpuidle_state_usage *state_usage = kobj_to_state_usage(kobj); struct cpuidle_state_attr *cattr = attr_to_stateattr(attr); - if (cattr->store) + if (cattr->store) { ret = cattr->store(state, state_usage, buf, size); + if (ret == size && + strncmp(cattr->attr.name, "disable", + strlen("disable"))) { + struct kobject *cpuidle_kobj = kobj->parent; + struct cpuidle_device *dev = + to_cpuidle_device(cpuidle_kobj); + struct cpuidle_driver *drv = + cpuidle_get_cpu_driver(dev); + + if (drv->disable_callback) + drv->disable_callback(dev, drv); + } + } return ret; } diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index bb9a0db89f1a..8a0e54bd0d5d 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -119,6 +119,10 @@ struct cpuidle_driver { /* the driver handles the cpus in cpumask */ struct cpumask *cpumask; + + void (*disable_callback)(struct cpuidle_device *dev, + struct cpuidle_driver *drv); + }; #ifdef CONFIG_CPU_IDLE -- 2.17.1
[PATCH v4 1/3] cpuidle-powernv : forced wakeup for stop states
Currently, the cpuidle governors determine what idle state a idling CPU should enter into based on heuristics that depend on the idle history on that CPU. Given that no predictive heuristic is perfect, there are cases where the governor predicts a shallow idle state, hoping that the CPU will be busy soon. However, if no new workload is scheduled on that CPU in the near future, the CPU may end up in the shallow state. This is problematic, when the predicted state in the aforementioned scenario is a shallow stop state on a tickless system. As we might get stuck into shallow states for hours, in absence of ticks or interrupts. To address this, We forcefully wakeup the cpu by setting the decrementer. The decrementer is set to a value that corresponds with the residency of the next available state. Thus firing up a timer that will forcefully wakeup the cpu. Few such iterations will essentially train the governor to select a deeper state for that cpu, as the timer here corresponds to the next available cpuidle state residency. Thus, cpu will eventually end up in the deepest possible state. Signed-off-by: Abhishek Goel --- Auto-promotion v1 : started as auto promotion logic for cpuidle states in generic driver v2 : Removed timeout_needed and rebased the code to upstream kernel Forced-wakeup v1 : New patch with name of forced wakeup started v2 : Extending the forced wakeup logic for all states. Setting the decrementer instead of queuing up a hrtimer to implement the logic. v3 : Cleanly handle setting/resetting of decrementer so as to not break irq work v4 : Changed type and name of set/reset decrementer fucntion Handled irq_work_pending in try_set_dec_before_idle arch/powerpc/include/asm/time.h | 2 ++ arch/powerpc/kernel/time.c| 43 +++ drivers/cpuidle/cpuidle-powernv.c | 40 3 files changed, 85 insertions(+) diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h index 54f4ec1f9fab..294a472ce161 100644 --- a/arch/powerpc/include/asm/time.h +++ b/arch/powerpc/include/asm/time.h @@ -188,6 +188,8 @@ static inline unsigned long tb_ticks_since(unsigned long tstamp) extern u64 mulhdu(u64, u64); #endif +extern bool try_set_dec_before_idle(u64 timeout); +extern void try_reset_dec_after_idle(void); extern void div128_by_32(u64 dividend_high, u64 dividend_low, unsigned divisor, struct div_result *dr); diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 694522308cd5..d004c0d8e099 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -576,6 +576,49 @@ void arch_irq_work_raise(void) #endif /* CONFIG_IRQ_WORK */ +/* + * This function tries setting decrementer before entering into idle. + * Returns true if we have reprogrammed the decrementer for idle. + * Returns false if the decrementer is unchanged. + */ +bool try_set_dec_before_idle(u64 timeout) +{ + u64 *next_tb = this_cpu_ptr(_next_tb); + u64 now = get_tb_or_rtc(); + + if (now + timeout > *next_tb) + return false; + + set_dec(timeout); + if (test_irq_work_pending()) + set_dec(1); + + return true; +} + +/* + * This function gets called if we have set decrementer before + * entering into idle. It tries to reset/restore the decrementer + * to its original value. + */ +void try_reset_dec_after_idle(void) +{ + u64 now; + u64 *next_tb; + + if (test_irq_work_pending()) + return; + + now = get_tb_or_rtc(); + next_tb = this_cpu_ptr(_next_tb); + if (now >= *next_tb) + return; + + set_dec(*next_tb - now); + if (test_irq_work_pending()) + set_dec(1); +} + /* * timer_interrupt - gets called when the decrementer overflows, * with interrupts disabled. diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c index 84b1ebe212b3..17e20e408ffe 100644 --- a/drivers/cpuidle/cpuidle-powernv.c +++ b/drivers/cpuidle/cpuidle-powernv.c @@ -21,6 +21,7 @@ #include #include #include +#include /* * Expose only those Hardware idle states via the cpuidle framework @@ -46,6 +47,26 @@ static struct stop_psscr_table stop_psscr_table[CPUIDLE_STATE_MAX] __read_mostly static u64 default_snooze_timeout __read_mostly; static bool snooze_timeout_en __read_mostly; +static u64 forced_wakeup_timeout(struct cpuidle_device *dev, +struct cpuidle_driver *drv, +int index) +{ + int i; + + for (i = index + 1; i < drv->state_count; i++) { + struct cpuidle_state *s = >states[i]; + struct cpuidle_state_usage *su = >states_usage[i]; + + if (s->disabled || su->disable) + continue; + + return (s->target_residency + 2 * s->exit_latency) * +
[PATCH v4 0/3] Forced-wakeup for stop states on Powernv
Currently, the cpuidle governors determine what idle state a idling CPU should enter into based on heuristics that depend on the idle history on that CPU. Given that no predictive heuristic is perfect, there are cases where the governor predicts a shallow idle state, hoping that the CPU will be busy soon. However, if no new workload is scheduled on that CPU in the near future, the CPU will end up in the shallow state. Motivation -- In case of POWER, this is problematic, when the predicted state in the aforementioned scenario is a shallow stop state on a tickless system. As we might get stuck into shallow states even for hours, in absence of ticks or interrupts. To address this, We forcefully wakeup the cpu by setting the decrementer. The decrementer is set to a value that corresponds with the residency of the next available state. Thus firing up a timer that will forcefully wakeup the cpu. Few such iterations will essentially train the governor to select a deeper state for that cpu, as the timer here corresponds to the next available cpuidle state residency. Thus, cpu will eventually end up in the deepest possible state and we won't get stuck in a shallow state for long duration. Experiment -- For earlier versions when this feature was meat to be only for shallow lite states, I performed experiments for three scenarios to collect some data. case 1 : Without this patch and without tick retained, i.e. in a upstream kernel, It would spend more than even a second to get out of stop0_lite. case 2 : With tick retained in a upstream kernel - Generally, we have a sched tick at 4ms(CONF_HZ = 250). Ideally I expected it to take 8 sched tick to get out of stop0_lite. Experimentally, observation was = sample minmax 99percentile 20 4ms12ms 4ms = It would take atleast one sched tick to get out of stop0_lite. case 2 : With this patch (not stopping tick, but explicitly queuing a timer) sample min max 99percentile 20 144us 192us 144us Description of current implementation - We calculate timeout for the current idle state as the residency value of the next available idle state. If the decrementer is set to be greater than this timeout, we update the decrementer value with the residency of next available idle state. Thus, essentially training the governor to select the next available deeper state until we reach the deepest state. Hence, we won't get stuck unnecessarily in shallow states for longer duration. v1 of auto-promotion : https://lkml.org/lkml/2019/3/22/58 This patch was implemented only for shallow lite state in generic cpuidle driver. v2 : Removed timeout_needed and rebased to current upstream kernel Then, v1 of forced-wakeup : Moved the code to cpuidle powernv driver and started as forced wakeup instead of auto-promotion v2 : Extended the forced wakeup logic for all states. Setting the decrementer instead of queuing up a hrtimer to implement the logic. v3 : 1) Cleanly handle setting the decrementer after exiting out of stop states. 2) Added a disable_callback feature to compute timeout whenever a state is enbaled or disabled instead of computing everytime in fast idle path. 3) Use disable callback to recompute timeout whenever state usage is changed for a state. Also, cleaned up the get_snooze_timeout function. v4 :Changed the type and name of set/reset decrementer function. Handled irq work pending in try_set_dec_before_idle. No change in patch 2 and 3. Abhishek Goel (3): cpuidle-powernv : forced wakeup for stop states cpuidle : Add callback whenever a state usage is enabled/disabled cpuidle-powernv : Recompute the idle-state timeouts when state usage is enabled/disabled arch/powerpc/include/asm/time.h | 2 ++ arch/powerpc/kernel/time.c| 43 drivers/cpuidle/cpuidle-powernv.c | 55 +++ drivers/cpuidle/sysfs.c | 15 - include/linux/cpuidle.h | 5 +++ 5 files changed, 106 insertions(+), 14 deletions(-) -- 2.17.1
Re: [PATCH v3 1/3] cpuidle-powernv : forced wakeup for stop states
Hi Nick, Will post next version with the changes you have suggested. There is a comment below. On 07/07/2019 03:43 PM, Nicholas Piggin wrote: Abhishek Goel's on July 4, 2019 7:18 pm: Currently, the cpuidle governors determine what idle state a idling CPU should enter into based on heuristics that depend on the idle history on that CPU. Given that no predictive heuristic is perfect, there are cases where the governor predicts a shallow idle state, hoping that the CPU will be busy soon. However, if no new workload is scheduled on that CPU in the near future, the CPU may end up in the shallow state. This is problematic, when the predicted state in the aforementioned scenario is a shallow stop state on a tickless system. As we might get stuck into shallow states for hours, in absence of ticks or interrupts. To address this, We forcefully wakeup the cpu by setting the decrementer. The decrementer is set to a value that corresponds with the residency of the next available state. Thus firing up a timer that will forcefully wakeup the cpu. Few such iterations will essentially train the governor to select a deeper state for that cpu, as the timer here corresponds to the next available cpuidle state residency. Thus, cpu will eventually end up in the deepest possible state. Signed-off-by: Abhishek Goel --- Auto-promotion v1 : started as auto promotion logic for cpuidle states in generic driver v2 : Removed timeout_needed and rebased the code to upstream kernel Forced-wakeup v1 : New patch with name of forced wakeup started v2 : Extending the forced wakeup logic for all states. Setting the decrementer instead of queuing up a hrtimer to implement the logic. v3 : Cleanly handle setting/resetting of decrementer so as to not break irq work arch/powerpc/include/asm/time.h | 2 ++ arch/powerpc/kernel/time.c| 40 +++ drivers/cpuidle/cpuidle-powernv.c | 32 + 3 files changed, 74 insertions(+) diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h index 54f4ec1f9..a3bd4f3c0 100644 --- a/arch/powerpc/include/asm/time.h +++ b/arch/powerpc/include/asm/time.h @@ -188,6 +188,8 @@ static inline unsigned long tb_ticks_since(unsigned long tstamp) extern u64 mulhdu(u64, u64); #endif +extern int set_dec_before_idle(u64 timeout); +extern void reset_dec_after_idle(void); extern void div128_by_32(u64 dividend_high, u64 dividend_low, unsigned divisor, struct div_result *dr); diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 694522308..814de3469 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -576,6 +576,46 @@ void arch_irq_work_raise(void) #endif /* CONFIG_IRQ_WORK */ +/* + * Returns 1 if we have reprogrammed the decrementer for idle. + * Returns 0 if the decrementer is unchanged. + */ +int set_dec_before_idle(u64 timeout) +{ + u64 *next_tb = this_cpu_ptr(_next_tb); + u64 now = get_tb_or_rtc(); + + /* +* Ensure that the timeout is at least one microsecond +* before the current decrement value. Else, we will +* unnecesarily wakeup again within a microsecond. +*/ + if (now + timeout + 512 > *next_tb) I would pass this 512 in as a parameter and put the comment in the idle code. Timer code does not know/care. Maybe return bool and call it try_set_dec_before_idle. + return 0; + + set_dec(timeout); This needs to have if (test_irq_work_pending()) set_dec(1); here AFAIKS + + return 1; +} + +void reset_dec_after_idle(void) +{ + u64 now; + u64 *next_tb; + + if (test_irq_work_pending()) + return; + + now = get_tb_or_rtc(); + next_tb = this_cpu_ptr(_next_tb); + if (now >= *next_tb) + return; Are you sure it's okay to escape early in this case? Yeah, It looks safe. In power9_idle_type, we call irq_set_pending_from_srr1 which sets the irq_happened. If reason is IRQ_DEC, in __check_irq_replay, decrementer_check_overflow will be called which will set dec to a positive valid value. Also, we typically disable MSR EE before entering stop. And if a decrementer wakes us up, before we enable EE, check for pending interrupt will be done. And we finally reset dec to a positive value before we set EE=1. Thanks, Nick Thanks, Abhishek
[RFC v3 3/3] cpuidle-powernv : Recompute the idle-state timeouts when state usage is enabled/disabled
The disable callback can be used to compute timeout for other states whenever a state is enabled or disabled. We store the computed timeout in "timeout" defined in cpuidle state strucure. So, we compute timeout only when some state is enabled or disabled and not every time in the fast idle path. We also use the computed timeout to get timeout for snooze, thus getting rid of get_snooze_timeout for snooze loop. Signed-off-by: Abhishek Goel --- drivers/cpuidle/cpuidle-powernv.c | 35 +++ include/linux/cpuidle.h | 1 + 2 files changed, 13 insertions(+), 23 deletions(-) diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c index f51478460..7350f404a 100644 --- a/drivers/cpuidle/cpuidle-powernv.c +++ b/drivers/cpuidle/cpuidle-powernv.c @@ -45,7 +45,6 @@ struct stop_psscr_table { static struct stop_psscr_table stop_psscr_table[CPUIDLE_STATE_MAX] __read_mostly; static u64 default_snooze_timeout __read_mostly; -static bool snooze_timeout_en __read_mostly; static u64 forced_wakeup_timeout(struct cpuidle_device *dev, struct cpuidle_driver *drv, @@ -67,26 +66,13 @@ static u64 forced_wakeup_timeout(struct cpuidle_device *dev, return 0; } -static u64 get_snooze_timeout(struct cpuidle_device *dev, - struct cpuidle_driver *drv, - int index) +static void pnv_disable_callback(struct cpuidle_device *dev, +struct cpuidle_driver *drv) { int i; - if (unlikely(!snooze_timeout_en)) - return default_snooze_timeout; - - for (i = index + 1; i < drv->state_count; i++) { - struct cpuidle_state *s = >states[i]; - struct cpuidle_state_usage *su = >states_usage[i]; - - if (s->disabled || su->disable) - continue; - - return s->target_residency * tb_ticks_per_usec; - } - - return default_snooze_timeout; + for (i = 0; i < drv->state_count; i++) + drv->states[i].timeout = forced_wakeup_timeout(dev, drv, i); } static int snooze_loop(struct cpuidle_device *dev, @@ -94,16 +80,20 @@ static int snooze_loop(struct cpuidle_device *dev, int index) { u64 snooze_exit_time; + u64 snooze_timeout = drv->states[index].timeout; + + if (!snooze_timeout) + snooze_timeout = default_snooze_timeout; set_thread_flag(TIF_POLLING_NRFLAG); local_irq_enable(); - snooze_exit_time = get_tb() + get_snooze_timeout(dev, drv, index); + snooze_exit_time = get_tb() + snooze_timeout; ppc64_runlatch_off(); HMT_very_low(); while (!need_resched()) { - if (likely(snooze_timeout_en) && get_tb() > snooze_exit_time) { + if (get_tb() > snooze_exit_time) { /* * Task has not woken up but we are exiting the polling * loop anyway. Require a barrier after polling is @@ -168,7 +158,7 @@ static int stop_loop(struct cpuidle_device *dev, u64 timeout_tb; int forced_wakeup = 0; - timeout_tb = forced_wakeup_timeout(dev, drv, index); + timeout_tb = drv->states[index].timeout; if (timeout_tb) forced_wakeup = set_dec_before_idle(timeout_tb); @@ -255,6 +245,7 @@ static int powernv_cpuidle_driver_init(void) */ drv->cpumask = (struct cpumask *)cpu_present_mask; + drv->disable_callback = pnv_disable_callback; return 0; } @@ -414,8 +405,6 @@ static int powernv_idle_probe(void) /* Device tree can indicate more idle states */ max_idle_state = powernv_add_idle_states(); default_snooze_timeout = TICK_USEC * tb_ticks_per_usec; - if (max_idle_state > 1) - snooze_timeout_en = true; } else return -ENODEV; diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index 8a0e54bd0..31662b657 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -50,6 +50,7 @@ struct cpuidle_state { int power_usage; /* in mW */ unsigned inttarget_residency; /* in US */ booldisabled; /* disabled on all CPUs */ + unsigned long long timeout; /* timeout for exiting out of a state */ int (*enter)(struct cpuidle_device *dev, struct cpuidle_driver *drv, -- 2.17.1
[RFC v3 2/3] cpuidle : Add callback whenever a state usage is enabled/disabled
To force wakeup a cpu, we need to compute the timeout in the fast idle path as a state may be enabled or disabled but there did not exist a feedback to driver when a state is enabled or disabled. This patch adds a callback whenever a state_usage records a store for disable attribute. Signed-off-by: Abhishek Goel --- drivers/cpuidle/sysfs.c | 15 ++- include/linux/cpuidle.h | 4 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/drivers/cpuidle/sysfs.c b/drivers/cpuidle/sysfs.c index eb20adb5d..141671a53 100644 --- a/drivers/cpuidle/sysfs.c +++ b/drivers/cpuidle/sysfs.c @@ -415,8 +415,21 @@ static ssize_t cpuidle_state_store(struct kobject *kobj, struct attribute *attr, struct cpuidle_state_usage *state_usage = kobj_to_state_usage(kobj); struct cpuidle_state_attr *cattr = attr_to_stateattr(attr); - if (cattr->store) + if (cattr->store) { ret = cattr->store(state, state_usage, buf, size); + if (ret == size && + strncmp(cattr->attr.name, "disable", + strlen("disable"))) { + struct kobject *cpuidle_kobj = kobj->parent; + struct cpuidle_device *dev = + to_cpuidle_device(cpuidle_kobj); + struct cpuidle_driver *drv = + cpuidle_get_cpu_driver(dev); + + if (drv->disable_callback) + drv->disable_callback(dev, drv); + } + } return ret; } diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index bb9a0db89..8a0e54bd0 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -119,6 +119,10 @@ struct cpuidle_driver { /* the driver handles the cpus in cpumask */ struct cpumask *cpumask; + + void (*disable_callback)(struct cpuidle_device *dev, + struct cpuidle_driver *drv); + }; #ifdef CONFIG_CPU_IDLE -- 2.17.1
[PATCH v3 1/3] cpuidle-powernv : forced wakeup for stop states
Currently, the cpuidle governors determine what idle state a idling CPU should enter into based on heuristics that depend on the idle history on that CPU. Given that no predictive heuristic is perfect, there are cases where the governor predicts a shallow idle state, hoping that the CPU will be busy soon. However, if no new workload is scheduled on that CPU in the near future, the CPU may end up in the shallow state. This is problematic, when the predicted state in the aforementioned scenario is a shallow stop state on a tickless system. As we might get stuck into shallow states for hours, in absence of ticks or interrupts. To address this, We forcefully wakeup the cpu by setting the decrementer. The decrementer is set to a value that corresponds with the residency of the next available state. Thus firing up a timer that will forcefully wakeup the cpu. Few such iterations will essentially train the governor to select a deeper state for that cpu, as the timer here corresponds to the next available cpuidle state residency. Thus, cpu will eventually end up in the deepest possible state. Signed-off-by: Abhishek Goel --- Auto-promotion v1 : started as auto promotion logic for cpuidle states in generic driver v2 : Removed timeout_needed and rebased the code to upstream kernel Forced-wakeup v1 : New patch with name of forced wakeup started v2 : Extending the forced wakeup logic for all states. Setting the decrementer instead of queuing up a hrtimer to implement the logic. v3 : Cleanly handle setting/resetting of decrementer so as to not break irq work arch/powerpc/include/asm/time.h | 2 ++ arch/powerpc/kernel/time.c| 40 +++ drivers/cpuidle/cpuidle-powernv.c | 32 + 3 files changed, 74 insertions(+) diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h index 54f4ec1f9..a3bd4f3c0 100644 --- a/arch/powerpc/include/asm/time.h +++ b/arch/powerpc/include/asm/time.h @@ -188,6 +188,8 @@ static inline unsigned long tb_ticks_since(unsigned long tstamp) extern u64 mulhdu(u64, u64); #endif +extern int set_dec_before_idle(u64 timeout); +extern void reset_dec_after_idle(void); extern void div128_by_32(u64 dividend_high, u64 dividend_low, unsigned divisor, struct div_result *dr); diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 694522308..814de3469 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -576,6 +576,46 @@ void arch_irq_work_raise(void) #endif /* CONFIG_IRQ_WORK */ +/* + * Returns 1 if we have reprogrammed the decrementer for idle. + * Returns 0 if the decrementer is unchanged. + */ +int set_dec_before_idle(u64 timeout) +{ + u64 *next_tb = this_cpu_ptr(_next_tb); + u64 now = get_tb_or_rtc(); + + /* +* Ensure that the timeout is at least one microsecond +* before the current decrement value. Else, we will +* unnecesarily wakeup again within a microsecond. +*/ + if (now + timeout + 512 > *next_tb) + return 0; + + set_dec(timeout); + + return 1; +} + +void reset_dec_after_idle(void) +{ + u64 now; + u64 *next_tb; + + if (test_irq_work_pending()) + return; + + now = get_tb_or_rtc(); + next_tb = this_cpu_ptr(_next_tb); + if (now >= *next_tb) + return; + + set_dec(*next_tb - now); + if (test_irq_work_pending()) + set_dec(1); +} + /* * timer_interrupt - gets called when the decrementer overflows, * with interrupts disabled. diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c index 84b1ebe21..f51478460 100644 --- a/drivers/cpuidle/cpuidle-powernv.c +++ b/drivers/cpuidle/cpuidle-powernv.c @@ -21,6 +21,7 @@ #include #include #include +#include /* * Expose only those Hardware idle states via the cpuidle framework @@ -46,6 +47,26 @@ static struct stop_psscr_table stop_psscr_table[CPUIDLE_STATE_MAX] __read_mostly static u64 default_snooze_timeout __read_mostly; static bool snooze_timeout_en __read_mostly; +static u64 forced_wakeup_timeout(struct cpuidle_device *dev, +struct cpuidle_driver *drv, +int index) +{ + int i; + + for (i = index + 1; i < drv->state_count; i++) { + struct cpuidle_state *s = >states[i]; + struct cpuidle_state_usage *su = >states_usage[i]; + + if (s->disabled || su->disable) + continue; + + return (s->target_residency + 2 * s->exit_latency) * + tb_ticks_per_usec; + } + + return 0; +} + static u64 get_snooze_timeout(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index) @@ -144,8 +165,19 @@ static int stop_l
[PATCH v3 0/3] Forced-wakeup for stop states on Powernv
Currently, the cpuidle governors determine what idle state a idling CPU should enter into based on heuristics that depend on the idle history on that CPU. Given that no predictive heuristic is perfect, there are cases where the governor predicts a shallow idle state, hoping that the CPU will be busy soon. However, if no new workload is scheduled on that CPU in the near future, the CPU will end up in the shallow state. Motivation -- In case of POWER, this is problematic, when the predicted state in the aforementioned scenario is a shallow stop state on a tickless system. As we might get stuck into shallow states even for hours, in absence of ticks or interrupts. To address this, We forcefully wakeup the cpu by setting the decrementer. The decrementer is set to a value that corresponds with the residency of the next available state. Thus firing up a timer that will forcefully wakeup the cpu. Few such iterations will essentially train the governor to select a deeper state for that cpu, as the timer here corresponds to the next available cpuidle state residency. Thus, cpu will eventually end up in the deepest possible state and we won't get stuck in a shallow state for long duration. Experiment -- For earlier versions when this feature was meat to be only for shallow lite states, I performed experiments for three scenarios to collect some data. case 1 : Without this patch and without tick retained, i.e. in a upstream kernel, It would spend more than even a second to get out of stop0_lite. case 2 : With tick retained in a upstream kernel - Generally, we have a sched tick at 4ms(CONF_HZ = 250). Ideally I expected it to take 8 sched tick to get out of stop0_lite. Experimentally, observation was = sample minmax 99percentile 20 4ms12ms 4ms = It would take atleast one sched tick to get out of stop0_lite. case 2 : With this patch (not stopping tick, but explicitly queuing a timer) sample min max 99percentile 20 144us 192us 144us Description of current implementation - We calculate timeout for the current idle state as the residency value of the next available idle state. If the decrementer is set to be greater than this timeout, we update the decrementer value with the residency of next available idle state. Thus, essentially training the governor to select the next available deeper state until we reach the deepest state. Hence, we won't get stuck unnecessarily in shallow states for longer duration. v1 of auto-promotion : https://lkml.org/lkml/2019/3/22/58 This patch was implemented only for shallow lite state in generic cpuidle driver. v2 : Removed timeout_needed and rebased to current upstream kernel Then, v1 of forced-wakeup : Moved the code to cpuidle powernv driver and started as forced wakeup instead of auto-promotion v2 : Extended the forced wakeup logic for all states. Setting the decrementer instead of queuing up a hrtimer to implement the logic. v3 : 1) Cleanly handle setting the decrementer after exiting out of stop states. 2) Added a disable_callback feature to compute timeout whenever a state is enbaled or disabled instead of computing everytime in fast idle path. 3) Use disable callback to recompute timeout whenever state usage is changed for a state. Also, cleaned up the get_snooze_timeout function. Abhishek Goel (3): cpuidle-powernv : forced wakeup for stop states cpuidle : Add callback whenever a state usage is enabled/disabled cpuidle-powernv : Recompute the idle-state timeouts when state usage is enabled/disabled arch/powerpc/include/asm/time.h | 2 ++ arch/powerpc/kernel/time.c| 40 ++ drivers/cpuidle/cpuidle-powernv.c | 47 ++- drivers/cpuidle/sysfs.c | 15 +- include/linux/cpuidle.h | 5 5 files changed, 95 insertions(+), 14 deletions(-) -- 2.17.1
Re: [PATCH v2 1/1] cpuidle-powernv : forced wakeup for stop states
Hi Nick, On 06/19/2019 03:39 PM, Nicholas Piggin wrote: Abhishek's on June 19, 2019 7:08 pm: Hi Nick, Thanks for the review. Some replies below. On 06/19/2019 09:53 AM, Nicholas Piggin wrote: Abhishek Goel's on June 17, 2019 7:56 pm: Currently, the cpuidle governors determine what idle state a idling CPU should enter into based on heuristics that depend on the idle history on that CPU. Given that no predictive heuristic is perfect, there are cases where the governor predicts a shallow idle state, hoping that the CPU will be busy soon. However, if no new workload is scheduled on that CPU in the near future, the CPU may end up in the shallow state. This is problematic, when the predicted state in the aforementioned scenario is a shallow stop state on a tickless system. As we might get stuck into shallow states for hours, in absence of ticks or interrupts. To address this, We forcefully wakeup the cpu by setting the decrementer. The decrementer is set to a value that corresponds with the residency of the next available state. Thus firing up a timer that will forcefully wakeup the cpu. Few such iterations will essentially train the governor to select a deeper state for that cpu, as the timer here corresponds to the next available cpuidle state residency. Thus, cpu will eventually end up in the deepest possible state. Signed-off-by: Abhishek Goel --- Auto-promotion v1 : started as auto promotion logic for cpuidle states in generic driver v2 : Removed timeout_needed and rebased the code to upstream kernel Forced-wakeup v1 : New patch with name of forced wakeup started v2 : Extending the forced wakeup logic for all states. Setting the decrementer instead of queuing up a hrtimer to implement the logic. drivers/cpuidle/cpuidle-powernv.c | 38 +++ 1 file changed, 38 insertions(+) diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c index 84b1ebe212b3..bc9ca18ae7e3 100644 --- a/drivers/cpuidle/cpuidle-powernv.c +++ b/drivers/cpuidle/cpuidle-powernv.c @@ -46,6 +46,26 @@ static struct stop_psscr_table stop_psscr_table[CPUIDLE_STATE_MAX] __read_mostly static u64 default_snooze_timeout __read_mostly; static bool snooze_timeout_en __read_mostly; +static u64 forced_wakeup_timeout(struct cpuidle_device *dev, +struct cpuidle_driver *drv, +int index) +{ + int i; + + for (i = index + 1; i < drv->state_count; i++) { + struct cpuidle_state *s = >states[i]; + struct cpuidle_state_usage *su = >states_usage[i]; + + if (s->disabled || su->disable) + continue; + + return (s->target_residency + 2 * s->exit_latency) * + tb_ticks_per_usec; + } + + return 0; +} It would be nice to not have this kind of loop iteration in the idle fast path. Can we add a flag or something to the idle state? Currently, we do not have any callback notification or some feedback that notifies the driver everytime some state is enabled/disabled. So we have to parse everytime to get the next enabled state. Ahh, that's why you're doing that. Are you suggesting to add something like next_enabled_state in cpuidle state structure itself which will be updated when a state is enabled or disabled? Hmm, I guess it normally should not iterate over more than one state unless some idle states are disabled. What would have been nice is each state just have its own timeout field with ticks already calculated, if that could be updated when a state is enabled or disabled. How hard is that to add to the cpuidle core? I have implemented a prototype which does what you have asked for. Added a disable_callback which will update timeout whenever a state is enabled or disabled. But It would mean adding some code to cpuidle.h and cpuidle/sysfs.c. If that is not an issue, should I go ahead and post it? + static u64 get_snooze_timeout(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index) @@ -144,8 +164,26 @@ static int stop_loop(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index) { + u64 dec_expiry_tb, dec, timeout_tb, forced_wakeup; + + dec = mfspr(SPRN_DEC); + timeout_tb = forced_wakeup_timeout(dev, drv, index); + forced_wakeup = 0; + + if (timeout_tb && timeout_tb < dec) { + forced_wakeup = 1; + dec_expiry_tb = mftb() + dec; + } The compiler probably can't optimise away the SPR manipulations so try to avoid them if possible. Are you suggesting something like set_dec_before_idle?(in line with what you have suggested to do after idle, reset_dec_after_idle) I should have been clear, I meant don't mfspr(SPRN_DEC) until you have tested timeou
Re: [PATCH v2 1/1] cpuidle-powernv : forced wakeup for stop states
Hi Nick, Thanks for the review. Some replies below. On 06/19/2019 09:53 AM, Nicholas Piggin wrote: Abhishek Goel's on June 17, 2019 7:56 pm: Currently, the cpuidle governors determine what idle state a idling CPU should enter into based on heuristics that depend on the idle history on that CPU. Given that no predictive heuristic is perfect, there are cases where the governor predicts a shallow idle state, hoping that the CPU will be busy soon. However, if no new workload is scheduled on that CPU in the near future, the CPU may end up in the shallow state. This is problematic, when the predicted state in the aforementioned scenario is a shallow stop state on a tickless system. As we might get stuck into shallow states for hours, in absence of ticks or interrupts. To address this, We forcefully wakeup the cpu by setting the decrementer. The decrementer is set to a value that corresponds with the residency of the next available state. Thus firing up a timer that will forcefully wakeup the cpu. Few such iterations will essentially train the governor to select a deeper state for that cpu, as the timer here corresponds to the next available cpuidle state residency. Thus, cpu will eventually end up in the deepest possible state. Signed-off-by: Abhishek Goel --- Auto-promotion v1 : started as auto promotion logic for cpuidle states in generic driver v2 : Removed timeout_needed and rebased the code to upstream kernel Forced-wakeup v1 : New patch with name of forced wakeup started v2 : Extending the forced wakeup logic for all states. Setting the decrementer instead of queuing up a hrtimer to implement the logic. drivers/cpuidle/cpuidle-powernv.c | 38 +++ 1 file changed, 38 insertions(+) diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c index 84b1ebe212b3..bc9ca18ae7e3 100644 --- a/drivers/cpuidle/cpuidle-powernv.c +++ b/drivers/cpuidle/cpuidle-powernv.c @@ -46,6 +46,26 @@ static struct stop_psscr_table stop_psscr_table[CPUIDLE_STATE_MAX] __read_mostly static u64 default_snooze_timeout __read_mostly; static bool snooze_timeout_en __read_mostly; +static u64 forced_wakeup_timeout(struct cpuidle_device *dev, +struct cpuidle_driver *drv, +int index) +{ + int i; + + for (i = index + 1; i < drv->state_count; i++) { + struct cpuidle_state *s = >states[i]; + struct cpuidle_state_usage *su = >states_usage[i]; + + if (s->disabled || su->disable) + continue; + + return (s->target_residency + 2 * s->exit_latency) * + tb_ticks_per_usec; + } + + return 0; +} It would be nice to not have this kind of loop iteration in the idle fast path. Can we add a flag or something to the idle state? Currently, we do not have any callback notification or some feedback that notifies the driver everytime some state is enabled/disabled. So we have to parse everytime to get the next enabled state. Are you suggesting to add something like next_enabled_state in cpuidle state structure itself which will be updated when a state is enabled or disabled? + static u64 get_snooze_timeout(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index) @@ -144,8 +164,26 @@ static int stop_loop(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index) { + u64 dec_expiry_tb, dec, timeout_tb, forced_wakeup; + + dec = mfspr(SPRN_DEC); + timeout_tb = forced_wakeup_timeout(dev, drv, index); + forced_wakeup = 0; + + if (timeout_tb && timeout_tb < dec) { + forced_wakeup = 1; + dec_expiry_tb = mftb() + dec; + } The compiler probably can't optimise away the SPR manipulations so try to avoid them if possible. Are you suggesting something like set_dec_before_idle?(in line with what you have suggested to do after idle, reset_dec_after_idle) + + if (forced_wakeup) + mtspr(SPRN_DEC, timeout_tb); This should just be put in the above 'if'. Fair point. + power9_idle_type(stop_psscr_table[index].val, stop_psscr_table[index].mask); + + if (forced_wakeup) + mtspr(SPRN_DEC, dec_expiry_tb - mftb()); This will sometimes go negative and result in another timer interrupt. It also breaks irq work (which can be set here by machine check I believe. May need to implement some timer code to do this for you. static void reset_dec_after_idle(void) { u64 now; u64 *next_tb; if (test_irq_work_pending()) return; now = mftb; next_tb = this_cpu_ptr(_next_tb); if (now >= *next_tb) return; set_dec(*next_tb - n
[PATCH v2 1/1] cpuidle-powernv : forced wakeup for stop states
Currently, the cpuidle governors determine what idle state a idling CPU should enter into based on heuristics that depend on the idle history on that CPU. Given that no predictive heuristic is perfect, there are cases where the governor predicts a shallow idle state, hoping that the CPU will be busy soon. However, if no new workload is scheduled on that CPU in the near future, the CPU may end up in the shallow state. This is problematic, when the predicted state in the aforementioned scenario is a shallow stop state on a tickless system. As we might get stuck into shallow states for hours, in absence of ticks or interrupts. To address this, We forcefully wakeup the cpu by setting the decrementer. The decrementer is set to a value that corresponds with the residency of the next available state. Thus firing up a timer that will forcefully wakeup the cpu. Few such iterations will essentially train the governor to select a deeper state for that cpu, as the timer here corresponds to the next available cpuidle state residency. Thus, cpu will eventually end up in the deepest possible state. Signed-off-by: Abhishek Goel --- Auto-promotion v1 : started as auto promotion logic for cpuidle states in generic driver v2 : Removed timeout_needed and rebased the code to upstream kernel Forced-wakeup v1 : New patch with name of forced wakeup started v2 : Extending the forced wakeup logic for all states. Setting the decrementer instead of queuing up a hrtimer to implement the logic. drivers/cpuidle/cpuidle-powernv.c | 38 +++ 1 file changed, 38 insertions(+) diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c index 84b1ebe212b3..bc9ca18ae7e3 100644 --- a/drivers/cpuidle/cpuidle-powernv.c +++ b/drivers/cpuidle/cpuidle-powernv.c @@ -46,6 +46,26 @@ static struct stop_psscr_table stop_psscr_table[CPUIDLE_STATE_MAX] __read_mostly static u64 default_snooze_timeout __read_mostly; static bool snooze_timeout_en __read_mostly; +static u64 forced_wakeup_timeout(struct cpuidle_device *dev, +struct cpuidle_driver *drv, +int index) +{ + int i; + + for (i = index + 1; i < drv->state_count; i++) { + struct cpuidle_state *s = >states[i]; + struct cpuidle_state_usage *su = >states_usage[i]; + + if (s->disabled || su->disable) + continue; + + return (s->target_residency + 2 * s->exit_latency) * + tb_ticks_per_usec; + } + + return 0; +} + static u64 get_snooze_timeout(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index) @@ -144,8 +164,26 @@ static int stop_loop(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index) { + u64 dec_expiry_tb, dec, timeout_tb, forced_wakeup; + + dec = mfspr(SPRN_DEC); + timeout_tb = forced_wakeup_timeout(dev, drv, index); + forced_wakeup = 0; + + if (timeout_tb && timeout_tb < dec) { + forced_wakeup = 1; + dec_expiry_tb = mftb() + dec; + } + + if (forced_wakeup) + mtspr(SPRN_DEC, timeout_tb); + power9_idle_type(stop_psscr_table[index].val, stop_psscr_table[index].mask); + + if (forced_wakeup) + mtspr(SPRN_DEC, dec_expiry_tb - mftb()); + return index; } -- 2.17.1
[PATCH v2 0/1] Forced-wakeup for stop states on Powernv
Currently, the cpuidle governors determine what idle state a idling CPU should enter into based on heuristics that depend on the idle history on that CPU. Given that no predictive heuristic is perfect, there are cases where the governor predicts a shallow idle state, hoping that the CPU will be busy soon. However, if no new workload is scheduled on that CPU in the near future, the CPU will end up in the shallow state. Motivation -- In case of POWER, this is problematic, when the predicted state in the aforementioned scenario is a shallow stop state on a tickless system. As we might get stuck into shallow states even for hours, in absence of ticks or interrupts. To address this, We forcefully wakeup the cpu by setting the decrementer. The decrementer is set to a value that corresponds with the residency of the next available state. Thus firing up a timer that will forcefully wakeup the cpu. Few such iterations will essentially train the governor to select a deeper state for that cpu, as the timer here corresponds to the next available cpuidle state residency. Thus, cpu will eventually end up in the deepest possible state and we won't get stuck in a shallow state for long duration. Experiment -- For earlier versions when this feature was meat to be only for shallow lite states, I performed experiments for three scenarios to collect some data. case 1 : Without this patch and without tick retained, i.e. in a upstream kernel, It would spend more than even a second to get out of stop0_lite. case 2 : With tick retained in a upstream kernel - Generally, we have a sched tick at 4ms(CONF_HZ = 250). Ideally I expected it to take 8 sched tick to get out of stop0_lite. Experimentally, observation was = sample minmax 99percentile 20 4ms12ms 4ms = It would take atleast one sched tick to get out of stop0_lite. case 2 : With this patch (not stopping tick, but explicitly queuing a timer) sample min max 99percentile 20 144us 192us 144us Description of current implementation - We calculate timeout for the current idle state as the residency value of the next available idle state. If the decrementer is set to be greater than this timeout, we update the decrementer value with the residency of next available idle state. Thus, essentially training the governor to select the next available deeper state until we reach the deepest state. Hence, we won't get stuck unnecessarily in shallow states for longer duration. v1 of auto-promotion : https://lkml.org/lkml/2019/3/22/58 This patch was implemented only for shallow lite state in generic cpuidle driver. v2 of auto-promotion : Removed timeout_needed and rebased to current upstream kernel Then, v1 of forced-wakeup : Moved the code to cpuidle powernv driver and started as forced wakeup instead of auto-promotion v2 of forced-wakeup : Extended the forced wakeup logic for all states. Setting the decrementer instead of queuing up a hrtimer to implement the logic. Abhishek Goel (1): cpuidle-powernv : forced wakeup for stop states drivers/cpuidle/cpuidle-powernv.c | 38 +++ 1 file changed, 38 insertions(+) -- 2.17.1
Re: [PATCH 0/1] Forced-wakeup for stop lite states on Powernv
On 05/08/2019 10:29 AM, Nicholas Piggin wrote: Abhishek Goel's on April 22, 2019 4:32 pm: Currently, the cpuidle governors determine what idle state a idling CPU should enter into based on heuristics that depend on the idle history on that CPU. Given that no predictive heuristic is perfect, there are cases where the governor predicts a shallow idle state, hoping that the CPU will be busy soon. However, if no new workload is scheduled on that CPU in the near future, the CPU will end up in the shallow state. Motivation -- In case of POWER, this is problematic, when the predicted state in the aforementioned scenario is a lite stop state, as such lite states will inhibit SMT folding, thereby depriving the other threads in the core from using the core resources. So we do not want to get stucked in such states for longer duration. To address this, the cpuidle-core can queue timer to correspond with the residency value of the next available state. This timer will forcefully wakeup the cpu. Few such iterations will essentially train the governor to select a deeper state for that cpu, as the timer here corresponds to the next available cpuidle state residency. Cpu will be kicked out of the lite state and end up in a non-lite state. Experiment -- I performed experiments for three scenarios to collect some data. case 1 : Without this patch and without tick retained, i.e. in a upstream kernel, It would spend more than even a second to get out of stop0_lite. case 2 : With tick retained in a upstream kernel - Generally, we have a sched tick at 4ms(CONF_HZ = 250). Ideally I expected it to take 8 sched tick to get out of stop0_lite. Experimentally, observation was = sample minmax 99percentile 20 4ms12ms 4ms = It would take atleast one sched tick to get out of stop0_lite. case 2 : With this patch (not stopping tick, but explicitly queuing a timer) sample min max 99percentile 20 144us 192us 144us In this patch, we queue a timer just before entering into a stop0_lite state. The timer fires at (residency of next available state + exit latency of next available state * 2). Let's say if next state(stop0) is available which has residency of 20us, it should get out in as low as (20+2*2)*8 [Based on the forumla (residency + 2xlatency)*history length] microseconds = 192us. Ideally we would expect 8 iterations, it was observed to get out in 6-7 iterations. Even if let's say stop2 is next available state(stop0 and stop1 both are unavailable), it would take (100+2*10)*8 = 960us to get into stop2. So, We are able to get out of stop0_lite generally in 150us(with this patch) as compared to 4ms(with tick retained). As stated earlier, we do not want to get stuck into stop0_lite as it inhibits SMT folding for other sibling threads, depriving them of core resources. Current patch is using forced-wakeup only for stop0_lite, as it gives performance benefit(primary reason) along with lowering down power consumption. We may extend this model for other states in future. I still have to wonder, between our snooze loop and stop0, what does stop0_lite buy us. That said, the problem you're solving here is a generic one that all stop states have, I think. Doesn't the same thing apply going from stop0 to stop5? You might under estimate the sleep time and lose power savings and therefore performance there too. Shouldn't we make it generic for all stop states? Thanks, Nick When a cpu is in snooze, it takes both space and time of core. When in stop0_lite, it free up time but it still takes space. When it is in stop0 or deeper, it free up both space and time slice of core. In stop0_lite, cpu doesn't free up the core resources and thus inhibits thread folding. When a cpu goes to stop0, it will free up the core resources thus increasing the single thread performance of other sibling thread. Hence, we do not want to get stuck in stop0_lite for long duration, and want to quickly move onto the next state. If we get stuck in any other state we would possibly be losing on to power saving, but will still be able to gain the performance benefits for other sibling threads. Thanks, Abhishek
[PATCH 1/1] cpuidle-powernv : forced wakeup for stop lite states
Currently, the cpuidle governors determine what idle state a idling CPU should enter into based on heuristics that depend on the idle history on that CPU. Given that no predictive heuristic is perfect, there are cases where the governor predicts a shallow idle state, hoping that the CPU will be busy soon. However, if no new workload is scheduled on that CPU in the near future, the CPU will end up in the shallow state. In case of POWER, this is problematic, when the predicted state in the aforementioned scenario is a lite stop state, as such lite states will inhibit SMT folding, thereby depriving the other threads in the core from using the core resources. So we do not want to get stucked in such states for longer duration. To address this, the cpuidle-core can queue timer to correspond with the residency value of the next available state. This timer will forcefully wakeup the cpu. Few such iterations will essentially train the governor to select a deeper state for that cpu, as the timer here corresponds to the next available cpuidle state residency. Cpu will be kicked out of the lite state and end up in a non-lite state. Signed-off-by: Abhishek Goel --- arch/powerpc/include/asm/opal-api.h | 1 + drivers/cpuidle/cpuidle-powernv.c | 71 - 2 files changed, 71 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h index 870fb7b23..735dec731 100644 --- a/arch/powerpc/include/asm/opal-api.h +++ b/arch/powerpc/include/asm/opal-api.h @@ -226,6 +226,7 @@ */ #define OPAL_PM_TIMEBASE_STOP 0x0002 +#define OPAL_PM_LOSE_USER_CONTEXT 0x1000 #define OPAL_PM_LOSE_HYP_CONTEXT 0x2000 #define OPAL_PM_LOSE_FULL_CONTEXT 0x4000 #define OPAL_PM_NAP_ENABLED0x0001 diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c index 84b1ebe21..30b877962 100644 --- a/drivers/cpuidle/cpuidle-powernv.c +++ b/drivers/cpuidle/cpuidle-powernv.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -43,6 +44,40 @@ struct stop_psscr_table { static struct stop_psscr_table stop_psscr_table[CPUIDLE_STATE_MAX] __read_mostly; +DEFINE_PER_CPU(struct hrtimer, forced_wakeup_timer); + +static int forced_wakeup_time_compute(struct cpuidle_device *dev, + struct cpuidle_driver *drv, + int index) +{ + int i, timeout_us = 0; + + for (i = index + 1; i < drv->state_count; i++) { + if (drv->states[i].disabled || dev->states_usage[i].disable) + continue; + timeout_us = drv->states[i].target_residency + + 2 * drv->states[i].exit_latency; + break; + } + + return timeout_us; +} + +enum hrtimer_restart forced_wakeup_hrtimer_callback(struct hrtimer *hrtimer) +{ + return HRTIMER_NORESTART; +} + +static void forced_wakeup_timer_init(int cpu, struct cpuidle_driver *drv) +{ + struct hrtimer *cpu_forced_wakeup_timer = _cpu(forced_wakeup_timer, + cpu); + + hrtimer_init(cpu_forced_wakeup_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); + cpu_forced_wakeup_timer->function = forced_wakeup_hrtimer_callback; +} + static u64 default_snooze_timeout __read_mostly; static bool snooze_timeout_en __read_mostly; @@ -103,6 +138,28 @@ static int snooze_loop(struct cpuidle_device *dev, return index; } +static int stop_lite_loop(struct cpuidle_device *dev, + struct cpuidle_driver *drv, + int index) +{ + int timeout_us; + struct hrtimer *this_timer = _cpu(forced_wakeup_timer, dev->cpu); + + timeout_us = forced_wakeup_time_compute(dev, drv, index); + + if (timeout_us > 0) + hrtimer_start(this_timer, ns_to_ktime(timeout_us * 1000), + HRTIMER_MODE_REL_PINNED); + + power9_idle_type(stop_psscr_table[index].val, +stop_psscr_table[index].mask); + + if (unlikely(hrtimer_is_queued(this_timer))) + hrtimer_cancel(this_timer); + + return index; +} + static int nap_loop(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index) @@ -190,7 +247,7 @@ static int powernv_cpuidle_cpu_dead(unsigned int cpu) */ static int powernv_cpuidle_driver_init(void) { - int idle_state; + int idle_state, cpu; struct cpuidle_driver *drv = _idle_driver; drv->state_count = 0; @@ -224,6 +281,9 @@ static int powernv_cpuidle_driver_init(void) drv->cpumask = (struct cpumask *)cpu_present_mask; + for_each_cpu(cpu, drv->cpumask) + forced_wakeup_time
[PATCH 0/1] Forced-wakeup for stop lite states on Powernv
Currently, the cpuidle governors determine what idle state a idling CPU should enter into based on heuristics that depend on the idle history on that CPU. Given that no predictive heuristic is perfect, there are cases where the governor predicts a shallow idle state, hoping that the CPU will be busy soon. However, if no new workload is scheduled on that CPU in the near future, the CPU will end up in the shallow state. Motivation -- In case of POWER, this is problematic, when the predicted state in the aforementioned scenario is a lite stop state, as such lite states will inhibit SMT folding, thereby depriving the other threads in the core from using the core resources. So we do not want to get stucked in such states for longer duration. To address this, the cpuidle-core can queue timer to correspond with the residency value of the next available state. This timer will forcefully wakeup the cpu. Few such iterations will essentially train the governor to select a deeper state for that cpu, as the timer here corresponds to the next available cpuidle state residency. Cpu will be kicked out of the lite state and end up in a non-lite state. Experiment -- I performed experiments for three scenarios to collect some data. case 1 : Without this patch and without tick retained, i.e. in a upstream kernel, It would spend more than even a second to get out of stop0_lite. case 2 : With tick retained in a upstream kernel - Generally, we have a sched tick at 4ms(CONF_HZ = 250). Ideally I expected it to take 8 sched tick to get out of stop0_lite. Experimentally, observation was = sample minmax 99percentile 20 4ms12ms 4ms = It would take atleast one sched tick to get out of stop0_lite. case 2 : With this patch (not stopping tick, but explicitly queuing a timer) sample min max 99percentile 20 144us 192us 144us In this patch, we queue a timer just before entering into a stop0_lite state. The timer fires at (residency of next available state + exit latency of next available state * 2). Let's say if next state(stop0) is available which has residency of 20us, it should get out in as low as (20+2*2)*8 [Based on the forumla (residency + 2xlatency)*history length] microseconds = 192us. Ideally we would expect 8 iterations, it was observed to get out in 6-7 iterations. Even if let's say stop2 is next available state(stop0 and stop1 both are unavailable), it would take (100+2*10)*8 = 960us to get into stop2. So, We are able to get out of stop0_lite generally in 150us(with this patch) as compared to 4ms(with tick retained). As stated earlier, we do not want to get stuck into stop0_lite as it inhibits SMT folding for other sibling threads, depriving them of core resources. Current patch is using forced-wakeup only for stop0_lite, as it gives performance benefit(primary reason) along with lowering down power consumption. We may extend this model for other states in future. Abhishek Goel (1): cpuidle-powernv : forced wakeup for stop lite states arch/powerpc/include/asm/opal-api.h | 1 + drivers/cpuidle/cpuidle-powernv.c | 71 - 2 files changed, 71 insertions(+), 1 deletion(-) -- 2.17.1
Re: [PATCH v2 1/2] cpuidle : auto-promotion for cpuidle states
Hi Rafael, Thanks for the Review. Few inline replies below. On 04/09/2019 03:31 PM, Rafael J. Wysocki wrote: On Fri, Apr 5, 2019 at 11:17 AM Abhishek Goel wrote: Currently, the cpuidle governors (menu /ladder) determine what idle state There are three governors in 5.1-rc. an idling CPU should enter into based on heuristics that depend on the idle history on that CPU. Given that no predictive heuristic is perfect, there are cases where the governor predicts a shallow idle state, hoping that the CPU will be busy soon. However, if no new workload is scheduled on that CPU in the near future, the CPU will end up in the shallow state. In case of POWER, this is problematic, when the predicted state in the aforementioned scenario is a lite stop state, as such lite states will inhibit SMT folding, thereby depriving the other threads in the core from using the core resources. To address this, such lite states need to be autopromoted. I don't quite agree with this statement and it doesn't even match what the patch does AFAICS. "Autopromotion" would be going from the given state to a deeper one without running state selection in between, but that's not what's going on here. Thinking to call it "timed-exit". Is that good? The cpuidle-core can queue timer to correspond with the residency value of the next available state. Thus leading to auto-promotion to a deeper idle state as soon as possible. No, it doesn't automatically cause a deeper state to be used next time. It simply kicks the CPU out of the idle state and one more iteration of the idle loop runs on it. Whether or not a deeper state will be selected in that iteration depends on the governor computations carried out in it. I did not mean that next state is chosen automatically. I should have been more descriptive here instead of just using "as soon as possible" Now, this appears to be almost analogous to the "polling" state used on x86 which uses the next idle state's target residency as a timeout. While generally I'm not a big fan of setting up timers in the idle loop (it sort of feels like pulling your own hair in order to get yourself out of a swamp), if idle states like these are there in your platform, setting up a timer to get out of them in the driver's ->enter() routine might not be particularly objectionable. Doing that in the core is a whole different story, though. Generally, this adds quite a bit of complexity (on the "ugly" side of things IMO) to the core to cover a corner case present in one platform, while IMO it can be covered in the driver for that platform directly. As of now, since this code doesn't add any benefit to the other platform, I will post a patch with this implementation covered in platform-specific driver code. You are right that all the information needed for this implementation are also available there in platform driver code, so we should be good to go.
Re: [PATCH v2 1/2] cpuidle : auto-promotion for cpuidle states
On 04/09/2019 03:00 PM, Rafael J. Wysocki wrote: On Tue, Apr 9, 2019 at 11:29 AM Abhishek wrote: Hi Daniel, Thanks for such a descriptive review. I will include all the suggestions made in my next iteration. Please give me some time to send comments before that. Sure, I will wait for your review.
Re: [PATCH v2 1/2] cpuidle : auto-promotion for cpuidle states
Hi Daniel, On 04/08/2019 07:55 PM, Daniel Axtens wrote: Hi, Sorry, just realised another thing I wanted to ask: @@ -442,6 +442,26 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, } } +#ifdef CPUIDLE_FLAG_AUTO_PROMOTION Why is this based on CPUIDLE_FLAG_ rather than CONFIG_CPU_IDLE_? Won't this always be true, given that the flag is defined regardless of the config option in the header? Yeah, You are right. This should have been CONFIG_CPU_IDLE_AUTO_PROMOTION. --Abhishek
Re: [PATCH v2 1/2] cpuidle : auto-promotion for cpuidle states
Hi Daniel, Thanks for such a descriptive review. I will include all the suggestions made in my next iteration. --Abhishek On 04/08/2019 07:42 PM, Daniel Axtens wrote: Hi Abhishek, Currently, the cpuidle governors (menu /ladder) determine what idle state an idling CPU should enter into based on heuristics that depend on the idle history on that CPU. Given that no predictive heuristic is perfect, there are cases where the governor predicts a shallow idle state, hoping that the CPU will be busy soon. However, if no new workload is scheduled on that CPU in the near future, the CPU will end up in the shallow state. In case of POWER, this is problematic, when the predicted state in the aforementioned scenario is a lite stop state, as such lite states will inhibit SMT folding, thereby depriving the other threads in the core from using the core resources. To address this, such lite states need to be autopromoted. The cpuidle- core can queue timer to correspond with the residency value of the next available state. Thus leading to auto-promotion to a deeper idle state as soon as possible. This sounds sensible to me, although I'm not really qualified to offer a full power-management opinion on it. I have some general code questions and comments, however, which are below: Signed-off-by: Abhishek Goel --- v1->v2 : Removed timeout_needed and rebased to current upstream kernel drivers/cpuidle/cpuidle.c | 68 +- drivers/cpuidle/governors/ladder.c | 3 +- drivers/cpuidle/governors/menu.c | 22 +- include/linux/cpuidle.h| 10 - 4 files changed, 99 insertions(+), 4 deletions(-) diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index 7f108309e..11ce43f19 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -36,6 +36,11 @@ static int enabled_devices; static int off __read_mostly; static int initialized __read_mostly; +struct auto_promotion { + struct hrtimer hrtimer; + unsigned long timeout_us; +}; + int cpuidle_disabled(void) { return off; @@ -188,6 +193,54 @@ int cpuidle_enter_s2idle(struct cpuidle_driver *drv, struct cpuidle_device *dev) } #endif /* CONFIG_SUSPEND */ +enum hrtimer_restart auto_promotion_hrtimer_callback(struct hrtimer *hrtimer) +{ + return HRTIMER_NORESTART; +} + +#ifdef CONFIG_CPU_IDLE_AUTO_PROMOTION As far as I can tell, this config flag isn't defined until the next patch, making this dead code for now. Is this intentional? +DEFINE_PER_CPU(struct auto_promotion, ap); A quick grep suggests that most per-cpu variable have more descriptive names, perhaps this one should too. + +static void cpuidle_auto_promotion_start(int cpu, struct cpuidle_state *state) +{ + struct auto_promotion *this_ap = _cpu(ap, cpu); + + if (state->flags & CPUIDLE_FLAG_AUTO_PROMOTION) + hrtimer_start(_ap->hrtimer, ns_to_ktime(this_ap->timeout_us + * 1000), HRTIMER_MODE_REL_PINNED); Would it be clearer to have both sides of the multiplication on the same line? i.e. + hrtimer_start(_ap->hrtimer, + ns_to_ktime(this_ap->timeout_us * 1000), + HRTIMER_MODE_REL_PINNED); +} + +static void cpuidle_auto_promotion_cancel(int cpu) +{ + struct hrtimer *hrtimer; + + hrtimer = _cpu(ap, cpu).hrtimer; + if (hrtimer_is_queued(hrtimer)) + hrtimer_cancel(hrtimer); +} + +static void cpuidle_auto_promotion_update(int cpu, unsigned long timeout) +{ + per_cpu(ap, cpu).timeout_us = timeout; +} + +static void cpuidle_auto_promotion_init(int cpu, struct cpuidle_driver *drv) +{ + struct auto_promotion *this_ap = _cpu(ap, cpu); + + hrtimer_init(_ap->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + this_ap->hrtimer.function = auto_promotion_hrtimer_callback; +} +#else +static inline void cpuidle_auto_promotion_start(int cpu, struct cpuidle_state + *state) { } +static inline void cpuidle_auto_promotion_cancel(int cpu) { } +static inline void cpuidle_auto_promotion_update(int cpu, unsigned long + timeout) { } +static inline void cpuidle_auto_promotion_init(int cpu, struct cpuidle_driver + *drv) { } Several of these have the type, then a line break, and then the name (unsigned long\n timeout). This is a bit harder to read, they should probably all be on the same line. +#endif + /** * cpuidle_enter_state - enter the state and update stats * @dev: cpuidle device for this cpu @@ -225,12 +278,17 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv, trace_cpu_idle_rcuidle(index, dev->cpu); time_start = ns_to_ktime(local_clock()); + cpuidle_auto_promoti
[PATCH v2 2/2] cpuidle : Add auto-promotion flag to cpuidle flags
This patch sets up flags for the state which needs to be auto-promoted. On POWERNV system, only lite states need to be autopromoted. We identify lite states by those which do not lose user context. That information has been used to set the flag for lite states. Signed-off-by: Abhishek Goel --- arch/powerpc/include/asm/opal-api.h | 1 + drivers/cpuidle/Kconfig | 4 drivers/cpuidle/cpuidle-powernv.c | 13 +++-- 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h index 870fb7b23..735dec731 100644 --- a/arch/powerpc/include/asm/opal-api.h +++ b/arch/powerpc/include/asm/opal-api.h @@ -226,6 +226,7 @@ */ #define OPAL_PM_TIMEBASE_STOP 0x0002 +#define OPAL_PM_LOSE_USER_CONTEXT 0x1000 #define OPAL_PM_LOSE_HYP_CONTEXT 0x2000 #define OPAL_PM_LOSE_FULL_CONTEXT 0x4000 #define OPAL_PM_NAP_ENABLED0x0001 diff --git a/drivers/cpuidle/Kconfig b/drivers/cpuidle/Kconfig index 8caccbbd7..9b8e9b96a 100644 --- a/drivers/cpuidle/Kconfig +++ b/drivers/cpuidle/Kconfig @@ -35,6 +35,10 @@ config CPU_IDLE_GOV_TEO config DT_IDLE_STATES bool +config CPU_IDLE_AUTO_PROMOTION + bool + default y if PPC_POWERNV + menu "ARM CPU Idle Drivers" depends on ARM || ARM64 source "drivers/cpuidle/Kconfig.arm" diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c index 84b1ebe21..0dd767270 100644 --- a/drivers/cpuidle/cpuidle-powernv.c +++ b/drivers/cpuidle/cpuidle-powernv.c @@ -299,6 +299,7 @@ static int powernv_add_idle_states(void) for (i = 0; i < dt_idle_states; i++) { unsigned int exit_latency, target_residency; bool stops_timebase = false; + bool lose_user_context = false; struct pnv_idle_states_t *state = _idle_states[i]; /* @@ -324,6 +325,9 @@ static int powernv_add_idle_states(void) if (has_stop_states && !(state->valid)) continue; + if (state->flags & OPAL_PM_LOSE_USER_CONTEXT) + lose_user_context = true; + if (state->flags & OPAL_PM_TIMEBASE_STOP) stops_timebase = true; @@ -332,12 +336,17 @@ static int powernv_add_idle_states(void) add_powernv_state(nr_idle_states, "Nap", CPUIDLE_FLAG_NONE, nap_loop, target_residency, exit_latency, 0, 0); + } else if (has_stop_states && !lose_user_context) { + add_powernv_state(nr_idle_states, state->name, + CPUIDLE_FLAG_AUTO_PROMOTION, + stop_loop, target_residency, + exit_latency, state->psscr_val, + state->psscr_mask); } else if (has_stop_states && !stops_timebase) { add_powernv_state(nr_idle_states, state->name, CPUIDLE_FLAG_NONE, stop_loop, target_residency, exit_latency, - state->psscr_val, - state->psscr_mask); + state->psscr_val, state->psscr_mask); } /* -- 2.17.1
[PATCH v2 1/2] cpuidle : auto-promotion for cpuidle states
Currently, the cpuidle governors (menu /ladder) determine what idle state an idling CPU should enter into based on heuristics that depend on the idle history on that CPU. Given that no predictive heuristic is perfect, there are cases where the governor predicts a shallow idle state, hoping that the CPU will be busy soon. However, if no new workload is scheduled on that CPU in the near future, the CPU will end up in the shallow state. In case of POWER, this is problematic, when the predicted state in the aforementioned scenario is a lite stop state, as such lite states will inhibit SMT folding, thereby depriving the other threads in the core from using the core resources. To address this, such lite states need to be autopromoted. The cpuidle- core can queue timer to correspond with the residency value of the next available state. Thus leading to auto-promotion to a deeper idle state as soon as possible. Signed-off-by: Abhishek Goel --- v1->v2 : Removed timeout_needed and rebased to current upstream kernel drivers/cpuidle/cpuidle.c | 68 +- drivers/cpuidle/governors/ladder.c | 3 +- drivers/cpuidle/governors/menu.c | 22 +- include/linux/cpuidle.h| 10 - 4 files changed, 99 insertions(+), 4 deletions(-) diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index 7f108309e..11ce43f19 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -36,6 +36,11 @@ static int enabled_devices; static int off __read_mostly; static int initialized __read_mostly; +struct auto_promotion { + struct hrtimer hrtimer; + unsigned long timeout_us; +}; + int cpuidle_disabled(void) { return off; @@ -188,6 +193,54 @@ int cpuidle_enter_s2idle(struct cpuidle_driver *drv, struct cpuidle_device *dev) } #endif /* CONFIG_SUSPEND */ +enum hrtimer_restart auto_promotion_hrtimer_callback(struct hrtimer *hrtimer) +{ + return HRTIMER_NORESTART; +} + +#ifdef CONFIG_CPU_IDLE_AUTO_PROMOTION +DEFINE_PER_CPU(struct auto_promotion, ap); + +static void cpuidle_auto_promotion_start(int cpu, struct cpuidle_state *state) +{ + struct auto_promotion *this_ap = _cpu(ap, cpu); + + if (state->flags & CPUIDLE_FLAG_AUTO_PROMOTION) + hrtimer_start(_ap->hrtimer, ns_to_ktime(this_ap->timeout_us + * 1000), HRTIMER_MODE_REL_PINNED); +} + +static void cpuidle_auto_promotion_cancel(int cpu) +{ + struct hrtimer *hrtimer; + + hrtimer = _cpu(ap, cpu).hrtimer; + if (hrtimer_is_queued(hrtimer)) + hrtimer_cancel(hrtimer); +} + +static void cpuidle_auto_promotion_update(int cpu, unsigned long timeout) +{ + per_cpu(ap, cpu).timeout_us = timeout; +} + +static void cpuidle_auto_promotion_init(int cpu, struct cpuidle_driver *drv) +{ + struct auto_promotion *this_ap = _cpu(ap, cpu); + + hrtimer_init(_ap->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + this_ap->hrtimer.function = auto_promotion_hrtimer_callback; +} +#else +static inline void cpuidle_auto_promotion_start(int cpu, struct cpuidle_state + *state) { } +static inline void cpuidle_auto_promotion_cancel(int cpu) { } +static inline void cpuidle_auto_promotion_update(int cpu, unsigned long + timeout) { } +static inline void cpuidle_auto_promotion_init(int cpu, struct cpuidle_driver + *drv) { } +#endif + /** * cpuidle_enter_state - enter the state and update stats * @dev: cpuidle device for this cpu @@ -225,12 +278,17 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv, trace_cpu_idle_rcuidle(index, dev->cpu); time_start = ns_to_ktime(local_clock()); + cpuidle_auto_promotion_start(dev->cpu, target_state); + stop_critical_timings(); entered_state = target_state->enter(dev, drv, index); start_critical_timings(); sched_clock_idle_wakeup_event(); time_end = ns_to_ktime(local_clock()); + + cpuidle_auto_promotion_cancel(dev->cpu); + trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu); /* The cpu is no longer idle or about to enter idle. */ @@ -312,7 +370,13 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv, int cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, bool *stop_tick) { - return cpuidle_curr_governor->select(drv, dev, stop_tick); + unsigned long timeout_us, ret; + + timeout_us = UINT_MAX; + ret = cpuidle_curr_governor->select(drv, dev, stop_tick, _us); + cpuidle_auto_promotion_update(dev->cpu, timeout_us); + + return ret; } /** @@ -658,6 +722,8 @@ int cpuidle_register(struct cpuidle_driver *drv, device = _cpu(cpuidl
[PATCH v2 0/2] Auto-promotion logic for cpuidle states
Currently, the cpuidle governors (menu/ladder) determine what idle state a idling CPU should enter into based on heuristics that depend on the idle history on that CPU. Given that no predictive heuristic is perfect, there are cases where the governor predicts a shallow idle state, hoping that the CPU will be busy soon. However, if no new workload is scheduled on that CPU in the near future, the CPU will end up in the shallow state. Motivation -- In case of POWER, this is problematic, when the predicted state in the aforementioned scenario is a lite stop state, as such lite states will inhibit SMT folding, thereby depriving the other threads in the core from using the core resources. To address this, such lite states need to be autopromoted. The cpuidle-core can queue timer to correspond with the residency value of the next available state. Thus leading to auto-promotion to a deeper idle state as soon as possible. Experiment -- I performed experiments for three scenarios to collect some data. case 1 : Without this patch and without tick retained, i.e. in a upstream kernel, It would spend more than even a second to get out of stop0_lite. case 2 : With tick retained(as suggested) - Generally, we have a sched tick at 4ms(CONF_HZ = 250). Ideally I expected it to take 8 sched tick to get out of stop0_lite. Experimentally, observation was = sample minmax 99percentile 20 4ms12ms 4ms = *ms = milliseconds It would take atleast one sched tick to get out of stop0_lite. case 2 : With this patch (not stopping tick, but explicitly queuing a timer) sample min max 99percentile 20 144us 192us 144us *us = microseconds In this patch, we queue a timer just before entering into a stop0_lite state. The timer fires at (residency of next available state + exit latency of next available state * 2). Let's say if next state(stop0) is available which has residency of 20us, it should get out in as low as (20+2*2)*8 [Based on the forumla (residency + 2xlatency)*history length] microseconds = 192us. Ideally we would expect 8 iterations, it was observed to get out in 6-7 iterations. Even if let's say stop2 is next available state(stop0 and stop1 both are unavailable), it would take (100+2*10)*8 = 960us to get into stop2. So, We are able to get out of stop0_lite generally in 150us(with this patch) as compared to 4ms(with tick retained). As stated earlier, we do not want to get stuck into stop0_lite as it inhibits SMT folding for other sibling threads, depriving them of core resources. Current patch is using auto-promotion only for stop0_lite, as it gives performance benefit(primary reason) along with lowering down power consumption. We may extend this model for other states in future. Abhishek Goel (2): cpuidle : auto-promotion for cpuidle states cpuidle : Add auto-promotion flag to cpuidle flags arch/powerpc/include/asm/opal-api.h | 1 + drivers/cpuidle/Kconfig | 4 ++ drivers/cpuidle/cpuidle-powernv.c | 13 +- drivers/cpuidle/cpuidle.c | 68 - drivers/cpuidle/governors/ladder.c | 3 +- drivers/cpuidle/governors/menu.c| 22 +- include/linux/cpuidle.h | 10 - 7 files changed, 115 insertions(+), 6 deletions(-) -- 2.17.1
Re: [PATCH 1/2] cpuidle : auto-promotion for cpuidle states
On 04/04/2019 03:51 PM, Daniel Lezcano wrote: Hi Abhishek, thanks for taking the time to test the different scenario and give us the numbers. On 01/04/2019 07:11, Abhishek wrote: On 03/22/2019 06:56 PM, Daniel Lezcano wrote: On 22/03/2019 10:45, Rafael J. Wysocki wrote: On Fri, Mar 22, 2019 at 8:31 AM Abhishek Goel wrote: Currently, the cpuidle governors (menu /ladder) determine what idle state an idling CPU should enter into based on heuristics that depend on the idle history on that CPU. Given that no predictive heuristic is perfect, there are cases where the governor predicts a shallow idle state, hoping that the CPU will be busy soon. However, if no new workload is scheduled on that CPU in the near future, the CPU will end up in the shallow state. In case of POWER, this is problematic, when the predicted state in the aforementioned scenario is a lite stop state, as such lite states will inhibit SMT folding, thereby depriving the other threads in the core from using the core resources. I can understand an idle state can prevent other threads to use the core resources. But why a deeper idle state does not prevent this also? To address this, such lite states need to be autopromoted. The cpuidle- core can queue timer to correspond with the residency value of the next available state. Thus leading to auto-promotion to a deeper idle state as soon as possible. Isn't the tick stopping avoidance sufficient for that? I was about to ask the same :) Thanks for the review. I performed experiments for three scenarios to collect some data. case 1 : Without this patch and without tick retained, i.e. in a upstream kernel, It would spend more than even a second to get out of stop0_lite. case 2 : With tick retained(as suggested) - Generally, we have a sched tick at 4ms(CONF_HZ = 250). Ideally I expected it to take 8 sched tick to get out of stop0_lite. Experimentally, observation was === min         max         99percentile 4ms         12ms       4ms === *ms = milliseconds It would take atleast one sched tick to get out of stop0_lite. case 2 : With this patch (not stopping tick, but explicitly queuing a timer) min         max          99.5percentile === 144us     192us          144us === *us = microseconds In this patch, we queue a timer just before entering into a stop0_lite state. The timer fires at (residency of next available state + exit latency of next available state * 2). So for the context, we have a similar issue but from the power management point of view where a CPU can stay in a shallow state for a long period, thus consuming a lot of energy. The window was reduced by preventing stopping the tick when a shallow state is selected. Unfortunately, if the tick is stopped and we exit/enter again and we select a shallow state, the situation is the same. A solution was previously proposed with a timer some years ago, like this patch does, and merged but there were complains about bad performance impact, so it has been reverted. Let's say if next state(stop0) is available which has residency of 20us, it should get out in as low as (20+2*2)*8 [Based on the forumla (residency + 2xlatency)*history length] microseconds = 192us. Ideally we would expect 8 iterations, it was observed to get out in 6-7 iterations. Can you explain the formula? I don't get the rational. Why using the exit latency and why multiply it by 2? Why the timer is not set to the next state's target residency value ? The idea behind multiplying by 2 is, entry latency + exit latency = 2* exit latency, i.e., using exit latency = entry latency So in effect, we are using target residency + 2 * exit latency for timeout of timer. Latency is generally <=10% of residency. I have tried to be conservative by including latency factor in computation for timeout. Thus, this formula will give slightly greater value compared to directly using residency of target state. --Abhishek
Re: [PATCH 1/2] cpuidle : auto-promotion for cpuidle states
On 03/22/2019 06:56 PM, Daniel Lezcano wrote: On 22/03/2019 10:45, Rafael J. Wysocki wrote: On Fri, Mar 22, 2019 at 8:31 AM Abhishek Goel wrote: Currently, the cpuidle governors (menu /ladder) determine what idle state an idling CPU should enter into based on heuristics that depend on the idle history on that CPU. Given that no predictive heuristic is perfect, there are cases where the governor predicts a shallow idle state, hoping that the CPU will be busy soon. However, if no new workload is scheduled on that CPU in the near future, the CPU will end up in the shallow state. In case of POWER, this is problematic, when the predicted state in the aforementioned scenario is a lite stop state, as such lite states will inhibit SMT folding, thereby depriving the other threads in the core from using the core resources. To address this, such lite states need to be autopromoted. The cpuidle- core can queue timer to correspond with the residency value of the next available state. Thus leading to auto-promotion to a deeper idle state as soon as possible. Isn't the tick stopping avoidance sufficient for that? I was about to ask the same :) Thanks for the review. I performed experiments for three scenarios to collect some data. case 1 : Without this patch and without tick retained, i.e. in a upstream kernel, It would spend more than even a second to get out of stop0_lite. case 2 : With tick retained(as suggested) - Generally, we have a sched tick at 4ms(CONF_HZ = 250). Ideally I expected it to take 8 sched tick to get out of stop0_lite. Experimentally, observation was === min         max         99percentile 4ms         12ms       4ms === *ms = milliseconds It would take atleast one sched tick to get out of stop0_lite. case 2 : With this patch (not stopping tick, but explicitly queuing a timer) min         max          99.5percentile === 144us     192us          144us === *us = microseconds In this patch, we queue a timer just before entering into a stop0_lite state. The timer fires at (residency of next available state + exit latency of next available state * 2). Let's say if next state(stop0) is available which has residency of 20us, it should get out in as low as (20+2*2)*8 [Based on the forumla (residency + 2xlatency)*history length] microseconds = 192us. Ideally we would expect 8 iterations, it was observed to get out in 6-7 iterations. Even if let's say stop2 is next available state(stop0 and stop1 both are unavailable), it would take (100+2*10)*8 = 960us to get into stop2. So, We are able to get out of stop0_lite generally in 150us(with this patch) as compared to 4ms(with tick retained). As stated earlier, we do not want to get stuck into stop0_lite as it inhibits SMT folding for other sibling threads, depriving them of core resources. Current patch is using auto-promotion only for stop0_lite, as it gives performance benefit(primary reason) along with lowering down power consumption. We may extend this model for other states in future. --Abhishek
Re: [PATCH 0/2] Auto-promotion logic for cpuidle states
Please ignore this set as this is incomplete. I have resent the patches. --Abhishek On 03/22/2019 11:55 AM, Abhishek Goel wrote: Currently, the cpuidle governors (menu/ladder) determine what idle state a idling CPU should enter into based on heuristics that depend on the idle history on that CPU. Given that no predictive heuristic is perfect, there are cases where the governor predicts a shallow idle state, hoping that the CPU will be busy soon. However, if no new workload is scheduled on that CPU in the near future, the CPU will end up in the shallow state. Motivation -- In case of POWER, this is problematic, when the predicted state in the aforementioned scenario is a lite stop state, as such lite states will inhibit SMT folding, thereby depriving the other threads in the core from using the core resources. To address this, such lite states need to be autopromoted. The cpuidle-core can queue timer to correspond with the residency value of the next available state. Thus leading to auto-promotion to a deeper idle state as soon as possible. Experiment -- Without this patch - It was seen that for a idle system, a cpu may remain in stop0_lite for few seconds and then directly goes to a deeper state such as stop2. With this patch - A cpu will not remain in stop0_lite for more than the residency of next available state, and thus it will go to a deeper state in conservative fashion. Using this, we may spent even less than 20 milliseconds if susbsequent stop states are enabled. In the worst case, we may end up spending more than a second, as was the case without this patch. The worst case will occur in the scenario when no other shallow states are enbaled, and only deep states are available for auto-promotion. Abhishek Goel (2): cpuidle : auto-promotion for cpuidle states cpuidle : Add auto-promotion flag to cpuidle flags arch/powerpc/include/asm/opal-api.h | 1 + drivers/cpuidle/Kconfig | 4 drivers/cpuidle/cpuidle-powernv.c | 13 +++-- drivers/cpuidle/cpuidle.c | 3 --- 4 files changed, 16 insertions(+), 5 deletions(-)
[PATCH 2/2] cpuidle : Add auto-promotion flag to cpuidle flags
This patch sets up flags for the state which needs to be auto-promoted. For powernv systems, lite states do not even lose user context. That information has been used to set the flag for lite states. Signed-off-by: Abhishek Goel --- arch/powerpc/include/asm/opal-api.h | 1 + drivers/cpuidle/Kconfig | 4 drivers/cpuidle/cpuidle-powernv.c | 13 +++-- 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h index 870fb7b23..735dec731 100644 --- a/arch/powerpc/include/asm/opal-api.h +++ b/arch/powerpc/include/asm/opal-api.h @@ -226,6 +226,7 @@ */ #define OPAL_PM_TIMEBASE_STOP 0x0002 +#define OPAL_PM_LOSE_USER_CONTEXT 0x1000 #define OPAL_PM_LOSE_HYP_CONTEXT 0x2000 #define OPAL_PM_LOSE_FULL_CONTEXT 0x4000 #define OPAL_PM_NAP_ENABLED0x0001 diff --git a/drivers/cpuidle/Kconfig b/drivers/cpuidle/Kconfig index 7e48eb5bf..0ece62684 100644 --- a/drivers/cpuidle/Kconfig +++ b/drivers/cpuidle/Kconfig @@ -26,6 +26,10 @@ config CPU_IDLE_GOV_MENU config DT_IDLE_STATES bool +config CPU_IDLE_AUTO_PROMOTION + bool + default y if PPC_POWERNV + menu "ARM CPU Idle Drivers" depends on ARM || ARM64 source "drivers/cpuidle/Kconfig.arm" diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c index 84b1ebe21..e351f5f9c 100644 --- a/drivers/cpuidle/cpuidle-powernv.c +++ b/drivers/cpuidle/cpuidle-powernv.c @@ -299,6 +299,7 @@ static int powernv_add_idle_states(void) for (i = 0; i < dt_idle_states; i++) { unsigned int exit_latency, target_residency; bool stops_timebase = false; + bool lose_user_context = false; struct pnv_idle_states_t *state = _idle_states[i]; /* @@ -324,6 +325,9 @@ static int powernv_add_idle_states(void) if (has_stop_states && !(state->valid)) continue; + if (state->flags & OPAL_PM_LOSE_USER_CONTEXT) + lose_user_context = true; + if (state->flags & OPAL_PM_TIMEBASE_STOP) stops_timebase = true; @@ -332,12 +336,17 @@ static int powernv_add_idle_states(void) add_powernv_state(nr_idle_states, "Nap", CPUIDLE_FLAG_NONE, nap_loop, target_residency, exit_latency, 0, 0); + } else if (has_stop_states & !lose_user_context) { + add_powernv_state(nr_idle_states, state->name, + CPUIDLE_FLAG_AUTO_PROMOTION, + stop_loop, target_residency, + exit_latency, state->psscr_val, + state->psscr_mask); } else if (has_stop_states && !stops_timebase) { add_powernv_state(nr_idle_states, state->name, CPUIDLE_FLAG_NONE, stop_loop, target_residency, exit_latency, - state->psscr_val, - state->psscr_mask); + state->psscr_val, state->psscr_mask); } /* -- 2.17.1
[PATCH 1/2] cpuidle : auto-promotion for cpuidle states
Currently, the cpuidle governors (menu /ladder) determine what idle state an idling CPU should enter into based on heuristics that depend on the idle history on that CPU. Given that no predictive heuristic is perfect, there are cases where the governor predicts a shallow idle state, hoping that the CPU will be busy soon. However, if no new workload is scheduled on that CPU in the near future, the CPU will end up in the shallow state. In case of POWER, this is problematic, when the predicted state in the aforementioned scenario is a lite stop state, as such lite states will inhibit SMT folding, thereby depriving the other threads in the core from using the core resources. To address this, such lite states need to be autopromoted. The cpuidle- core can queue timer to correspond with the residency value of the next available state. Thus leading to auto-promotion to a deeper idle state as soon as possible. Signed-off-by: Abhishek Goel --- drivers/cpuidle/cpuidle.c | 79 +- drivers/cpuidle/governors/ladder.c | 3 +- drivers/cpuidle/governors/menu.c | 23 - include/linux/cpuidle.h| 12 +++-- 4 files changed, 111 insertions(+), 6 deletions(-) diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index 7f108309e..c4d1c1b38 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -36,6 +36,12 @@ static int enabled_devices; static int off __read_mostly; static int initialized __read_mostly; +struct auto_promotion { + struct hrtimer hrtimer; + int timeout; + booltimeout_needed; +}; + int cpuidle_disabled(void) { return off; @@ -188,6 +194,64 @@ int cpuidle_enter_s2idle(struct cpuidle_driver *drv, struct cpuidle_device *dev) } #endif /* CONFIG_SUSPEND */ +enum hrtimer_restart auto_promotion_hrtimer_callback(struct hrtimer *hrtimer) +{ + return HRTIMER_NORESTART; +} + +#ifdef CONFIG_CPU_IDLE_AUTO_PROMOTION +DEFINE_PER_CPU(struct auto_promotion, ap); + +static void cpuidle_auto_promotion_start(struct cpuidle_state *state, int cpu) +{ + struct auto_promotion *this_ap = _cpu(ap, cpu); + + if (this_ap->timeout_needed && (state->flags & + CPUIDLE_FLAG_AUTO_PROMOTION)) + hrtimer_start(_ap->hrtimer, ns_to_ktime(this_ap->timeout + * 1000), HRTIMER_MODE_REL_PINNED); +} + +static void cpuidle_auto_promotion_cancel(int cpu) +{ + struct hrtimer *hrtimer; + + hrtimer = _cpu(ap, cpu).hrtimer; + if (hrtimer_is_queued(hrtimer)) + hrtimer_cancel(hrtimer); +} + +static void cpuidle_auto_promotion_update(int time, int cpu) +{ + per_cpu(ap, cpu).timeout = time; +} + +static void cpuidle_auto_promotion_init(struct cpuidle_driver *drv, int cpu) +{ + int i; + struct auto_promotion *this_ap = _cpu(ap, cpu); + + this_ap->timeout_needed = 0; + + for (i = 0; i < drv->state_count; i++) { + if (drv->states[i].flags & CPUIDLE_FLAG_AUTO_PROMOTION) { + this_ap->timeout_needed = 1; + break; + } + } + + hrtimer_init(_ap->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + this_ap->hrtimer.function = auto_promotion_hrtimer_callback; +} +#else +static inline void cpuidle_auto_promotion_start(struct cpuidle_state *state, + int cpu) { } +static inline void cpuidle_auto_promotion_cancel(int cpu) { } +static inline void cpuidle_auto_promotion_update(int timeout, int cpu) { } +static inline void cpuidle_auto_promotion_init(struct cpuidle_driver *drv, + int cpu) { } +#endif + /** * cpuidle_enter_state - enter the state and update stats * @dev: cpuidle device for this cpu @@ -225,12 +289,17 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv, trace_cpu_idle_rcuidle(index, dev->cpu); time_start = ns_to_ktime(local_clock()); + cpuidle_auto_promotion_start(target_state, dev->cpu); + stop_critical_timings(); entered_state = target_state->enter(dev, drv, index); start_critical_timings(); sched_clock_idle_wakeup_event(); time_end = ns_to_ktime(local_clock()); + + cpuidle_auto_promotion_cancel(dev->cpu); + trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu); /* The cpu is no longer idle or about to enter idle. */ @@ -312,7 +381,13 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv, int cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, bool *stop_tick) { - return cpuidle_curr_governor->select(drv, dev, stop_tick); + int timeout, ret; + + timeout = INT_MAX; + ret = cpuidle_curr_governor->select
[PATCH 0/2] Auto-promotion logic for cpuidle states
Currently, the cpuidle governors (menu/ladder) determine what idle state a idling CPU should enter into based on heuristics that depend on the idle history on that CPU. Given that no predictive heuristic is perfect, there are cases where the governor predicts a shallow idle state, hoping that the CPU will be busy soon. However, if no new workload is scheduled on that CPU in the near future, the CPU will end up in the shallow state. Motivation -- In case of POWER, this is problematic, when the predicted state in the aforementioned scenario is a lite stop state, as such lite states will inhibit SMT folding, thereby depriving the other threads in the core from using the core resources. To address this, such lite states need to be autopromoted. The cpuidle-core can queue timer to correspond with the residency value of the next available state. Thus leading to auto-promotion to a deeper idle state as soon as possible. Experiment -- Without this patch - It was seen that for a idle system, a cpu may remain in stop0_lite for few seconds and then directly goes to a deeper state such as stop2. With this patch - A cpu will not remain in stop0_lite for more than the residency of next available state, and thus it will go to a deeper state in conservative fashion. Using this, we may spent even less than 20 milliseconds if susbsequent stop states are enabled. In the worst case, we may end up spending more than a second, as was the case without this patch. The worst case will occur in the scenario when no other shallow states are enbaled, and only deep states are available for auto-promotion. Abhishek Goel (2): cpuidle : auto-promotion for cpuidle states cpuidle : Add auto-promotion flag to cpuidle flags arch/powerpc/include/asm/opal-api.h | 1 + drivers/cpuidle/Kconfig | 4 ++ drivers/cpuidle/cpuidle-powernv.c | 13 - drivers/cpuidle/cpuidle.c | 79 - drivers/cpuidle/governors/ladder.c | 3 +- drivers/cpuidle/governors/menu.c| 23 - include/linux/cpuidle.h | 12 +++-- 7 files changed, 127 insertions(+), 8 deletions(-) -- 2.17.1
[PATCH 0/2] Auto-promotion logic for cpuidle states
Currently, the cpuidle governors (menu/ladder) determine what idle state a idling CPU should enter into based on heuristics that depend on the idle history on that CPU. Given that no predictive heuristic is perfect, there are cases where the governor predicts a shallow idle state, hoping that the CPU will be busy soon. However, if no new workload is scheduled on that CPU in the near future, the CPU will end up in the shallow state. Motivation -- In case of POWER, this is problematic, when the predicted state in the aforementioned scenario is a lite stop state, as such lite states will inhibit SMT folding, thereby depriving the other threads in the core from using the core resources. To address this, such lite states need to be autopromoted. The cpuidle-core can queue timer to correspond with the residency value of the next available state. Thus leading to auto-promotion to a deeper idle state as soon as possible. Experiment -- Without this patch - It was seen that for a idle system, a cpu may remain in stop0_lite for few seconds and then directly goes to a deeper state such as stop2. With this patch - A cpu will not remain in stop0_lite for more than the residency of next available state, and thus it will go to a deeper state in conservative fashion. Using this, we may spent even less than 20 milliseconds if susbsequent stop states are enabled. In the worst case, we may end up spending more than a second, as was the case without this patch. The worst case will occur in the scenario when no other shallow states are enbaled, and only deep states are available for auto-promotion. Abhishek Goel (2): cpuidle : auto-promotion for cpuidle states cpuidle : Add auto-promotion flag to cpuidle flags arch/powerpc/include/asm/opal-api.h | 1 + drivers/cpuidle/Kconfig | 4 drivers/cpuidle/cpuidle-powernv.c | 13 +++-- drivers/cpuidle/cpuidle.c | 3 --- 4 files changed, 16 insertions(+), 5 deletions(-) -- 2.17.1
[PATCH 2/2] cpuidle : Add auto-promotion flag to cpuidle flags
This patch sets up flags for the state which needs to be auto-promoted. For powernv systems, lite states do not even lose user context. That information has been used to set the flag for lite states. Signed-off-by: Abhishek Goel --- arch/powerpc/include/asm/opal-api.h | 1 + drivers/cpuidle/Kconfig | 4 drivers/cpuidle/cpuidle-powernv.c | 13 +++-- 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h index 870fb7b23..735dec731 100644 --- a/arch/powerpc/include/asm/opal-api.h +++ b/arch/powerpc/include/asm/opal-api.h @@ -226,6 +226,7 @@ */ #define OPAL_PM_TIMEBASE_STOP 0x0002 +#define OPAL_PM_LOSE_USER_CONTEXT 0x1000 #define OPAL_PM_LOSE_HYP_CONTEXT 0x2000 #define OPAL_PM_LOSE_FULL_CONTEXT 0x4000 #define OPAL_PM_NAP_ENABLED0x0001 diff --git a/drivers/cpuidle/Kconfig b/drivers/cpuidle/Kconfig index 7e48eb5bf..0ece62684 100644 --- a/drivers/cpuidle/Kconfig +++ b/drivers/cpuidle/Kconfig @@ -26,6 +26,10 @@ config CPU_IDLE_GOV_MENU config DT_IDLE_STATES bool +config CPU_IDLE_AUTO_PROMOTION + bool + default y if PPC_POWERNV + menu "ARM CPU Idle Drivers" depends on ARM || ARM64 source "drivers/cpuidle/Kconfig.arm" diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c index 84b1ebe21..e351f5f9c 100644 --- a/drivers/cpuidle/cpuidle-powernv.c +++ b/drivers/cpuidle/cpuidle-powernv.c @@ -299,6 +299,7 @@ static int powernv_add_idle_states(void) for (i = 0; i < dt_idle_states; i++) { unsigned int exit_latency, target_residency; bool stops_timebase = false; + bool lose_user_context = false; struct pnv_idle_states_t *state = _idle_states[i]; /* @@ -324,6 +325,9 @@ static int powernv_add_idle_states(void) if (has_stop_states && !(state->valid)) continue; + if (state->flags & OPAL_PM_LOSE_USER_CONTEXT) + lose_user_context = true; + if (state->flags & OPAL_PM_TIMEBASE_STOP) stops_timebase = true; @@ -332,12 +336,17 @@ static int powernv_add_idle_states(void) add_powernv_state(nr_idle_states, "Nap", CPUIDLE_FLAG_NONE, nap_loop, target_residency, exit_latency, 0, 0); + } else if (has_stop_states & !lose_user_context) { + add_powernv_state(nr_idle_states, state->name, + CPUIDLE_FLAG_AUTO_PROMOTION, + stop_loop, target_residency, + exit_latency, state->psscr_val, + state->psscr_mask); } else if (has_stop_states && !stops_timebase) { add_powernv_state(nr_idle_states, state->name, CPUIDLE_FLAG_NONE, stop_loop, target_residency, exit_latency, - state->psscr_val, - state->psscr_mask); + state->psscr_val, state->psscr_mask); } /* -- 2.17.1
[PATCH 1/2] cpuidle : auto-promotion for cpuidle states
Currently, the cpuidle governors (menu /ladder) determine what idle state an idling CPU should enter into based on heuristics that depend on the idle history on that CPU. Given that no predictive heuristic is perfect, there are cases where the governor predicts a shallow idle state, hoping that the CPU will be busy soon. However, if no new workload is scheduled on that CPU in the near future, the CPU will end up in the shallow state. In case of POWER, this is problematic, when the predicted state in the aforementioned scenario is a lite stop state, as such lite states will inhibit SMT folding, thereby depriving the other threads in the core from using the core resources. To address this, such lite states need to be autopromoted. The cpuidle- core can queue timer to correspond with the residency value of the next available state. Thus leading to auto-promotion to a deeper idle state as soon as possible. Signed-off-by: Abhishek Goel --- drivers/cpuidle/cpuidle.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index 2406e2655..c4d1c1b38 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -584,11 +584,8 @@ static void __cpuidle_unregister_device(struct cpuidle_device *dev) static void __cpuidle_device_init(struct cpuidle_device *dev) { - int i; memset(dev->states_usage, 0, sizeof(dev->states_usage)); dev->last_residency = 0; - for (i = 0; i < CPUIDLE_STATE_MAX; i++) - dev->states_usage[i].disable = true; } /** -- 2.17.1
[RFC 1/1] cpuidle : Move saving and restoring of sprs to opal
This patch moves the saving and restoring of sprs for P9 cpuidle from kernel to opal. This patch still uses existing code to detect first thread in core. In an attempt to make the powernv idle code backward compatible, and to some extent forward compatible, add support for pre-stop entry and post-stop exit actions in OPAL. If a kernel knows about this opal call, then just a firmware supporting newer hardware is required, instead of waiting for kernel updates. Signed-off-by: Abhishek Goel --- Link to the Skiboot patch corresponding to this patch: http://patchwork.ozlabs.org/patch/947568/ arch/powerpc/include/asm/cpuidle.h| 10 -- arch/powerpc/include/asm/opal-api.h | 4 +- arch/powerpc/include/asm/opal.h | 3 + arch/powerpc/include/asm/paca.h | 5 +- arch/powerpc/kernel/asm-offsets.c | 10 +- arch/powerpc/kernel/idle_book3s.S | 130 ++ arch/powerpc/platforms/powernv/idle.c | 4 + .../powerpc/platforms/powernv/opal-wrappers.S | 2 + arch/powerpc/xmon/xmon.c | 12 +- 9 files changed, 61 insertions(+), 119 deletions(-) diff --git a/arch/powerpc/include/asm/cpuidle.h b/arch/powerpc/include/asm/cpuidle.h index e210a83eb196..c10f47af9a55 100644 --- a/arch/powerpc/include/asm/cpuidle.h +++ b/arch/powerpc/include/asm/cpuidle.h @@ -68,16 +68,6 @@ #define ERR_DEEP_STATE_ESL_MISMATCH-2 #ifndef __ASSEMBLY__ -/* Additional SPRs that need to be saved/restored during stop */ -struct stop_sprs { - u64 pid; - u64 ldbar; - u64 fscr; - u64 hfscr; - u64 mmcr1; - u64 mmcr2; - u64 mmcra; -}; extern u32 pnv_fastsleep_workaround_at_entry[]; extern u32 pnv_fastsleep_workaround_at_exit[]; diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h index 3bab299eda49..6792a737bc9a 100644 --- a/arch/powerpc/include/asm/opal-api.h +++ b/arch/powerpc/include/asm/opal-api.h @@ -208,7 +208,9 @@ #define OPAL_SENSOR_READ_U64 162 #define OPAL_PCI_GET_PBCQ_TUNNEL_BAR 164 #define OPAL_PCI_SET_PBCQ_TUNNEL_BAR 165 -#define OPAL_LAST 165 +#define OPAL_IDLE_SAVE 168 +#define OPAL_IDLE_RESTORE 169 +#define OPAL_LAST 169 #define QUIESCE_HOLD 1 /* Spin all calls at entry */ #define QUIESCE_REJECT 2 /* Fail all calls with OPAL_BUSY */ diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h index e1b2910c6e81..12d57aeacde2 100644 --- a/arch/powerpc/include/asm/opal.h +++ b/arch/powerpc/include/asm/opal.h @@ -356,6 +356,9 @@ extern void opal_kmsg_init(void); extern int opal_event_request(unsigned int opal_event_nr); +extern int opal_cpuidle_save(u64 *stop_sprs, int scope, u64 psscr); +extern int opal_cpuidle_restore(u64 *stop_sprs, int scope, u64 psscr, u64 srr1); + struct opal_sg_list *opal_vmalloc_to_sg_list(void *vmalloc_addr, unsigned long vmalloc_size); void opal_free_sg_list(struct opal_sg_list *sg); diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index 6d34bd71139d..765524e76beb 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -195,11 +195,12 @@ struct paca_struct { /* The PSSCR value that the kernel requested before going to stop */ u64 requested_psscr; + u64 wakeup_psscr; /* -* Save area for additional SPRs that need to be +* Save area for SPRs that need to be * saved/restored during cpuidle stop. */ - struct stop_sprs stop_sprs; + u64 *opal_stop_sprs; #endif #ifdef CONFIG_PPC_BOOK3S_64 diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 0a0544335950..65a3d8582017 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -769,14 +769,8 @@ int main(void) OFFSET(PACA_SIBLING_PACA_PTRS, paca_struct, thread_sibling_pacas); OFFSET(PACA_REQ_PSSCR, paca_struct, requested_psscr); OFFSET(PACA_DONT_STOP, paca_struct, dont_stop); -#define STOP_SPR(x, f) OFFSET(x, paca_struct, stop_sprs.f) - STOP_SPR(STOP_PID, pid); - STOP_SPR(STOP_LDBAR, ldbar); - STOP_SPR(STOP_FSCR, fscr); - STOP_SPR(STOP_HFSCR, hfscr); - STOP_SPR(STOP_MMCR1, mmcr1); - STOP_SPR(STOP_MMCR2, mmcr2); - STOP_SPR(STOP_MMCRA, mmcra); + OFFSET(PACA_WAKEUP_PSSCR, paca_struct, wakeup_psscr); + OFFSET(STOP_SPRS, paca_struct, opal_stop_sprs); #endif DEFINE(PPC_DBELL_SERVER, PPC_DBELL_SERVER); diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S index e734f6e45abc..66fc955abee3 100644 --- a/arch/powerpc/kernel/idle_book3s.S +++ b/arch/powerpc/kernel/idle_book3s.S @@ -45,6 +45,9 @@ #define
Re: [PATCH v2] cpuidle/powernv : Add Description for cpuidle state
On 06/04/2018 05:15 PM, Akshay Adiga wrote: On Mon, Jun 04, 2018 at 07:04:14PM +1000, Benjamin Herrenschmidt wrote: Is this a new property ? I'm not fan of adding yet another of those silly arrays. I would say this is the right time now to switch over to a node per state instead, as we discussed with Vaidy. It is not a new property. Name was being used for description as description was not present in device tree. A skiboot patch adding description to device tree have been posted. This patch reads those description instead of copying name itself into description. And we fall back to reading name into description to not break the comaptibility with older firmware. Thanks Abhishek I posted the node based device tree here : skiboot patch : https://patchwork.ozlabs.org/patch/923120/ kernel patch : https://lkml.org/lkml/2018/5/30/1146 Do you have any inputs for this design ? Additionally, while doing that, we can provide the versioning mechanism I proposed so we can deal with state specific issues and erratas. Cheers, Ben.
[PATCH v2] cpuidle/powernv : Add Description for cpuidle state
Names of cpuidle states were being used for description of states in POWER as no descriptions were added in device tree. This patch reads description for idle states which have been added in device tree. The description for idle states in case of POWER can be printed using "cpupower monitor -l" or "cpupower idle-info". Signed-off-by: Abhishek Goel --- The skiboot patch which adds description for idle states in device tree can be found here: https://patchwork.ozlabs.org/patch/924879/ drivers/cpuidle/cpuidle-powernv.c | 19 +++ include/linux/cpuidle.h | 2 +- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c index 1a8234e706bc..08d8b0953a14 100644 --- a/drivers/cpuidle/cpuidle-powernv.c +++ b/drivers/cpuidle/cpuidle-powernv.c @@ -133,7 +133,7 @@ static int stop_loop(struct cpuidle_device *dev, static struct cpuidle_state powernv_states[CPUIDLE_STATE_MAX] = { { /* Snooze */ .name = "snooze", - .desc = "snooze", + .desc = "idle polling state", .exit_latency = 0, .target_residency = 0, .enter = snooze_loop }, @@ -206,6 +206,7 @@ static int powernv_cpuidle_driver_init(void) } static inline void add_powernv_state(int index, const char *name, +const char *desc, unsigned int flags, int (*idle_fn)(struct cpuidle_device *, struct cpuidle_driver *, @@ -215,7 +216,7 @@ static inline void add_powernv_state(int index, const char *name, u64 psscr_val, u64 psscr_mask) { strlcpy(powernv_states[index].name, name, CPUIDLE_NAME_LEN); - strlcpy(powernv_states[index].desc, name, CPUIDLE_NAME_LEN); + strlcpy(powernv_states[index].desc, desc, CPUIDLE_DESC_LEN); powernv_states[index].flags = flags; powernv_states[index].target_residency = target_residency; powernv_states[index].exit_latency = exit_latency; @@ -250,6 +251,7 @@ static int powernv_add_idle_states(void) u64 psscr_val[CPUIDLE_STATE_MAX]; u64 psscr_mask[CPUIDLE_STATE_MAX]; const char *names[CPUIDLE_STATE_MAX]; + const char *descs[CPUIDLE_STATE_MAX]; u32 has_stop_states = 0; int i, rc; u32 supported_flags = pnv_get_supported_cpuidle_states(); @@ -311,6 +313,13 @@ static int powernv_add_idle_states(void) pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-names in DT\n"); goto out; } + if (of_property_read_string_array(power_mgt, + "ibm,cpu-idle-state-descs", descs, dt_idle_states) < 0) { + of_property_read_string_array(power_mgt, + "ibm,cpu-idle-state-names", descs, dt_idle_states); + pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-descs in DT\n" + "Name will be used for description\n"); + } /* * If the idle states use stop instruction, probe for psscr values @@ -414,10 +423,11 @@ static int powernv_add_idle_states(void) target_residency = 100; /* Add NAP state */ add_powernv_state(nr_idle_states, "Nap", + "stop processor execution", CPUIDLE_FLAG_NONE, nap_loop, target_residency, exit_latency, 0, 0); } else if (has_stop_states && !stops_timebase) { - add_powernv_state(nr_idle_states, names[i], + add_powernv_state(nr_idle_states, names[i], descs[i], CPUIDLE_FLAG_NONE, stop_loop, target_residency, exit_latency, psscr_val[i], psscr_mask[i]); @@ -434,11 +444,12 @@ static int powernv_add_idle_states(void) target_residency = 30; /* Add FASTSLEEP state */ add_powernv_state(nr_idle_states, "FastSleep", + "Core and L2 clock gating", CPUIDLE_FLAG_TIMER_STOP, fastsleep_loop, target_residency, exit_latency, 0, 0); } else if (has_stop_states && stops_timebase) { - add_powernv_state(nr_idle_states, names
[PATCH] cpuidle/powernv : Add Description for cpuidle state
Names of cpuidle states were being used for description of states in POWER as no descriptions were added in device tree. This patch reads description for idle states which have been added in device tree. The description for idle states in case of POWER can be printed using "cpupower monitor -l" or "cpupower idle-info". Signed-off-by: Abhishek Goel --- The skiboot patch which adds description for idle states in device tree can be found here: https://patchwork.ozlabs.org/patch/921637/ drivers/cpuidle/cpuidle-powernv.c | 17 + include/linux/cpuidle.h | 2 +- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c index 1a8234e706bc..d3915a965128 100644 --- a/drivers/cpuidle/cpuidle-powernv.c +++ b/drivers/cpuidle/cpuidle-powernv.c @@ -133,7 +133,7 @@ static int stop_loop(struct cpuidle_device *dev, static struct cpuidle_state powernv_states[CPUIDLE_STATE_MAX] = { { /* Snooze */ .name = "snooze", - .desc = "snooze", + .desc = "idle polling state", .exit_latency = 0, .target_residency = 0, .enter = snooze_loop }, @@ -206,6 +206,7 @@ static int powernv_cpuidle_driver_init(void) } static inline void add_powernv_state(int index, const char *name, +const char *desc, unsigned int flags, int (*idle_fn)(struct cpuidle_device *, struct cpuidle_driver *, @@ -215,7 +216,7 @@ static inline void add_powernv_state(int index, const char *name, u64 psscr_val, u64 psscr_mask) { strlcpy(powernv_states[index].name, name, CPUIDLE_NAME_LEN); - strlcpy(powernv_states[index].desc, name, CPUIDLE_NAME_LEN); + strlcpy(powernv_states[index].desc, desc, CPUIDLE_DESC_LEN); powernv_states[index].flags = flags; powernv_states[index].target_residency = target_residency; powernv_states[index].exit_latency = exit_latency; @@ -250,6 +251,7 @@ static int powernv_add_idle_states(void) u64 psscr_val[CPUIDLE_STATE_MAX]; u64 psscr_mask[CPUIDLE_STATE_MAX]; const char *names[CPUIDLE_STATE_MAX]; + const char *descs[CPUIDLE_STATE_MAX]; u32 has_stop_states = 0; int i, rc; u32 supported_flags = pnv_get_supported_cpuidle_states(); @@ -311,6 +313,11 @@ static int powernv_add_idle_states(void) pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-names in DT\n"); goto out; } + if (of_property_read_string_array(power_mgt, + "ibm,cpu-idle-state-descs", descs, dt_idle_states) < 0) { + pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-descs in DT\n"); + goto out; + } /* * If the idle states use stop instruction, probe for psscr values @@ -414,10 +421,11 @@ static int powernv_add_idle_states(void) target_residency = 100; /* Add NAP state */ add_powernv_state(nr_idle_states, "Nap", + "stop processor execution", CPUIDLE_FLAG_NONE, nap_loop, target_residency, exit_latency, 0, 0); } else if (has_stop_states && !stops_timebase) { - add_powernv_state(nr_idle_states, names[i], + add_powernv_state(nr_idle_states, names[i], descs[i], CPUIDLE_FLAG_NONE, stop_loop, target_residency, exit_latency, psscr_val[i], psscr_mask[i]); @@ -434,11 +442,12 @@ static int powernv_add_idle_states(void) target_residency = 30; /* Add FASTSLEEP state */ add_powernv_state(nr_idle_states, "FastSleep", + "Core and L2 clock gating", CPUIDLE_FLAG_TIMER_STOP, fastsleep_loop, target_residency, exit_latency, 0, 0); } else if (has_stop_states && stops_timebase) { - add_powernv_state(nr_idle_states, names[i], + add_powernv_state(nr_idle_states, names[i], descs[i], CPUIDLE_FLAG_TIMER_STOP, stop_loop, target_residency, exit_latency,
Re: [PATCH v2] cpufreq: powernv: Add support of frequency domain
On 12/20/2017 12:20 PM, Viresh Kumar wrote: On 20-12-17, 12:12, Abhishek Goel wrote: diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index b6d7c4c..fd642bc 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -37,6 +37,7 @@ #include /* Required for cpu_sibling_mask() in UP configs */ #include #include +#include #define POWERNV_MAX_PSTATES 256 #define PMSR_PSAFE_ENABLE (1UL << 30) @@ -130,6 +131,9 @@ static struct chip { static int nr_chips; static DEFINE_PER_CPU(struct chip *, chip_info); +static u32 freq_domain_indicator; +static u32 flag; I wouldn't name it as flag, its unreadable. Maybe its better to name it based on the quirk you are trying to workaround with ? + /* * Note: * The set of pstates consists of contiguous integers. @@ -194,6 +198,38 @@ static inline void reset_gpstates(struct cpufreq_policy *policy) gpstates->last_gpstate_idx = 0; } +#define SIZE NR_CPUS +#define ORDER_FREQ_MAP ilog2(SIZE) + +static DEFINE_HASHTABLE(freq_domain_map, ORDER_FREQ_MAP); + +struct hashmap { + cpumask_t mask; + int chip_id; + u32 pir_key; + struct hlist_node hash_node; +}; + +static void insert(u32 key, int cpu) +{ + struct hashmap *data; + + hash_for_each_possible(freq_domain_map, data, hash_node, key%SIZE) { + if (data->chip_id == cpu_to_chip_id(cpu) && + data->pir_key == key) { + cpumask_set_cpu(cpu, >mask); + return; + } + } + + data = kzalloc(sizeof(*data), GFP_KERNEL); + hash_add(freq_domain_map, >hash_node, key%SIZE); + cpumask_set_cpu(cpu, >mask); + data->chip_id = cpu_to_chip_id(cpu); + data->pir_key = key; + +} + /* * Initialize the freq table based on data obtained * from the firmware passed via device-tree @@ -206,7 +242,9 @@ static int init_powernv_pstates(void) u32 len_ids, len_freqs; u32 pstate_min, pstate_max, pstate_nominal; u32 pstate_turbo, pstate_ultra_turbo; + u32 key; + flag = 0; Isn't flag already 0 (global-uninitialized) ? power_mgt = of_find_node_by_path("/ibm,opal/power-mgt"); if (!power_mgt) { pr_warn("power-mgt node not found\n"); @@ -229,6 +267,17 @@ static int init_powernv_pstates(void) return -ENODEV; } + if (of_device_is_compatible(power_mgt, "freq-domain-v1") && + of_property_read_u32(power_mgt, "ibm,freq-domain-indicator", +_domain_indicator)) { + pr_warn("ibm,freq-domain-indicator not found\n"); + freq_domain_indicator = 0; You shouldn't be required to set it to 0 here. + } + + if (of_device_is_compatible(power_mgt, "P9-occ-quirk")) { + flag = 1; + } Remove {} and a better name like p9_occ_quirk would be good for flag. Also making it a bool may be better ? + if (of_property_read_u32(power_mgt, "ibm,pstate-ultra-turbo", _ultra_turbo)) { powernv_pstate_info.wof_enabled = false; @@ -249,6 +298,7 @@ static int init_powernv_pstates(void) next: pr_info("cpufreq pstate min %d nominal %d max %d\n", pstate_min, pstate_nominal, pstate_max); + pr_info("frequency domain indicator %d", freq_domain_indicator); pr_info("Workload Optimized Frequency is %s in the platform\n", (powernv_pstate_info.wof_enabled) ? "enabled" : "disabled"); @@ -276,6 +326,15 @@ static int init_powernv_pstates(void) return -ENODEV; } + if (freq_domain_indicator) { + hash_init(freq_domain_map); + for_each_possible_cpu(i) { + key = ((u32) get_hard_smp_processor_id(i) & + freq_domain_indicator); Maybe break it like: key = (u32) get_hard_smp_processor_id(i); key &= freq_domain_indicator; to make it easily readable ? + insert(key, i); + } + } + powernv_pstate_info.nr_pstates = nr_pstates; pr_debug("NR PStates %d\n", nr_pstates); for (i = 0; i < nr_pstates; i++) { @@ -693,6 +752,7 @@ static int powernv_cpufreq_target_index(struct cpufreq_policy *policy, { struct powernv_smp_call_data freq_data; unsigned int cur_msec, gpstate_idx; + :( struct global_pstate_info *gpstates = policy->driver_data; if (unlikely(rebooting) && new_index != get_nominal_index()) @@ -760,25 +820,55 @@ static int powernv_cpufreq_target_index(struct cpufreq_policy *policy,
[PATCH v3] cpufreq: powernv: Add support of frequency domain
Frequency-domain indicates group of CPUs that would share same frequency. It is detected using device-tree node "frequency-domain-indicator". frequency-domain-indicator is a bitmask which will have different value depending upon the generation of the processor. CPUs of the same chip for which the result of a bitwise AND between their PIR and the frequency-domain-indicator is the same share the same frequency. In this patch, we define hash-table indexed by the aforementioned bitwise ANDed value to store the cpumask of the CPUs sharing the same frequency domain. Further, the cpufreq policy will be created per frequency-domain So for POWER9, a cpufreq policy is created per quad while for POWER8 it is created per core. Governor decides frequency for each policy but multiple cores may come under same policy. In such case frequency needs to be set on each core sharing that policy. Signed-off-by: Abhishek Goel <hunt...@linux.vnet.ibm.com> --- Skiboot patch required for the corresponding device-tree changes have been posted here : http://patchwork.ozlabs.org/patch/862256/ drivers/cpufreq/powernv-cpufreq.c | 104 ++ 1 file changed, 95 insertions(+), 9 deletions(-) diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index b6d7c4c..aab23a4 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -37,6 +37,7 @@ #include /* Required for cpu_sibling_mask() in UP configs */ #include #include +#include #define POWERNV_MAX_PSTATES256 #define PMSR_PSAFE_ENABLE (1UL << 30) @@ -130,6 +131,9 @@ enum throttle_reason_type { static int nr_chips; static DEFINE_PER_CPU(struct chip *, chip_info); +static u32 freq_domain_indicator; +static bool p9_occ_quirk; + /* * Note: * The set of pstates consists of contiguous integers. @@ -194,6 +198,38 @@ static inline void reset_gpstates(struct cpufreq_policy *policy) gpstates->last_gpstate_idx = 0; } +#define SIZE NR_CPUS +#define ORDER_FREQ_MAP ilog2(SIZE) + +static DEFINE_HASHTABLE(freq_domain_map, ORDER_FREQ_MAP); + +struct hashmap { + cpumask_t mask; + int chip_id; + u32 pir_key; + struct hlist_node hash_node; +}; + +static void insert(u32 key, int cpu) +{ + struct hashmap *data; + + hash_for_each_possible(freq_domain_map, data, hash_node, key%SIZE) { + if (data->chip_id == cpu_to_chip_id(cpu) && + data->pir_key == key) { + cpumask_set_cpu(cpu, >mask); + return; + } + } + + data = kzalloc(sizeof(*data), GFP_KERNEL); + hash_add(freq_domain_map, >hash_node, key%SIZE); + cpumask_set_cpu(cpu, >mask); + data->chip_id = cpu_to_chip_id(cpu); + data->pir_key = key; + +} + /* * Initialize the freq table based on data obtained * from the firmware passed via device-tree @@ -206,6 +242,7 @@ static int init_powernv_pstates(void) u32 len_ids, len_freqs; u32 pstate_min, pstate_max, pstate_nominal; u32 pstate_turbo, pstate_ultra_turbo; + u32 key; power_mgt = of_find_node_by_path("/ibm,opal/power-mgt"); if (!power_mgt) { @@ -246,9 +283,18 @@ static int init_powernv_pstates(void) else powernv_pstate_info.wof_enabled = true; + if (of_device_is_compatible(power_mgt, "freq-domain-v1") && + of_property_read_u32(power_mgt, "ibm,freq-domain-indicator", + _domain_indicator)) + pr_warn("ibm,freq-domain-indicator not found\n"); + +if (of_device_is_compatible(power_mgt, "p9-occ-quirk")) + p9_occ_quirk = true; + next: pr_info("cpufreq pstate min %d nominal %d max %d\n", pstate_min, pstate_nominal, pstate_max); + pr_info("frequency domain indicator %d", freq_domain_indicator); pr_info("Workload Optimized Frequency is %s in the platform\n", (powernv_pstate_info.wof_enabled) ? "enabled" : "disabled"); @@ -276,6 +322,15 @@ static int init_powernv_pstates(void) return -ENODEV; } + if (freq_domain_indicator) { + hash_init(freq_domain_map); + for_each_possible_cpu(i) { + key = ((u32) get_hard_smp_processor_id(i) & + freq_domain_indicator); + insert(key, i); + } + } + powernv_pstate_info.nr_pstates = nr_pstates; pr_debug("NR PStates %d\n", nr_pstates); for (i = 0; i < nr_pstates; i++) { @@ -760,25 +815,56 @@ static int powernv_cpufreq_target_index(struct cpufreq_policy *policy, spin_unlock(>gpstate_lock); -
[PATCH v2] cpufreq: powernv: Add support of frequency domain
Frequency-domain indicates group of CPUs that would share same frequency. It is detected using device-tree node "frequency-domain-indicator". frequency-domain-indicator is a bitmask which will have different value depending upon the generation of the processor. CPUs of the same chip for which the result of a bitwise AND between their PIR and the frequency-domain-indicator is the same share the same frequency. In this patch, we define hash-table indexed by the aforementioned bitwise ANDed value to store the cpumask of the CPUs sharing the same frequency domain. Further, the cpufreq policy will be created per frequency-domain So for POWER9, a cpufreq policy is created per quad while for POWER8 it is created per core. Governor decides frequency for each policy but multiple cores may come under same policy. In such case frequency needs to be set on each core sharing that policy. Signed-off-by: Abhishek Goel <hunt...@linux.vnet.ibm.com> --- drivers/cpufreq/powernv-cpufreq.c | 110 ++ 1 file changed, 100 insertions(+), 10 deletions(-) diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index b6d7c4c..fd642bc 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -37,6 +37,7 @@ #include /* Required for cpu_sibling_mask() in UP configs */ #include #include +#include #define POWERNV_MAX_PSTATES256 #define PMSR_PSAFE_ENABLE (1UL << 30) @@ -130,6 +131,9 @@ static struct chip { static int nr_chips; static DEFINE_PER_CPU(struct chip *, chip_info); +static u32 freq_domain_indicator; +static u32 flag; + /* * Note: * The set of pstates consists of contiguous integers. @@ -194,6 +198,38 @@ static inline void reset_gpstates(struct cpufreq_policy *policy) gpstates->last_gpstate_idx = 0; } +#define SIZE NR_CPUS +#define ORDER_FREQ_MAP ilog2(SIZE) + +static DEFINE_HASHTABLE(freq_domain_map, ORDER_FREQ_MAP); + +struct hashmap { + cpumask_t mask; + int chip_id; + u32 pir_key; + struct hlist_node hash_node; +}; + +static void insert(u32 key, int cpu) +{ + struct hashmap *data; + + hash_for_each_possible(freq_domain_map, data, hash_node, key%SIZE) { + if (data->chip_id == cpu_to_chip_id(cpu) && + data->pir_key == key) { + cpumask_set_cpu(cpu, >mask); + return; + } + } + + data = kzalloc(sizeof(*data), GFP_KERNEL); + hash_add(freq_domain_map, >hash_node, key%SIZE); + cpumask_set_cpu(cpu, >mask); + data->chip_id = cpu_to_chip_id(cpu); + data->pir_key = key; + +} + /* * Initialize the freq table based on data obtained * from the firmware passed via device-tree @@ -206,7 +242,9 @@ static int init_powernv_pstates(void) u32 len_ids, len_freqs; u32 pstate_min, pstate_max, pstate_nominal; u32 pstate_turbo, pstate_ultra_turbo; + u32 key; + flag = 0; power_mgt = of_find_node_by_path("/ibm,opal/power-mgt"); if (!power_mgt) { pr_warn("power-mgt node not found\n"); @@ -229,6 +267,17 @@ static int init_powernv_pstates(void) return -ENODEV; } + if (of_device_is_compatible(power_mgt, "freq-domain-v1") && + of_property_read_u32(power_mgt, "ibm,freq-domain-indicator", +_domain_indicator)) { + pr_warn("ibm,freq-domain-indicator not found\n"); + freq_domain_indicator = 0; + } + + if (of_device_is_compatible(power_mgt, "P9-occ-quirk")) { + flag = 1; + } + if (of_property_read_u32(power_mgt, "ibm,pstate-ultra-turbo", _ultra_turbo)) { powernv_pstate_info.wof_enabled = false; @@ -249,6 +298,7 @@ static int init_powernv_pstates(void) next: pr_info("cpufreq pstate min %d nominal %d max %d\n", pstate_min, pstate_nominal, pstate_max); + pr_info("frequency domain indicator %d", freq_domain_indicator); pr_info("Workload Optimized Frequency is %s in the platform\n", (powernv_pstate_info.wof_enabled) ? "enabled" : "disabled"); @@ -276,6 +326,15 @@ static int init_powernv_pstates(void) return -ENODEV; } + if (freq_domain_indicator) { + hash_init(freq_domain_map); + for_each_possible_cpu(i) { + key = ((u32) get_hard_smp_processor_id(i) & + freq_domain_indicator); + insert(key, i); + } + } + powernv_pstate_info.nr_pstates = nr_pstates; pr_debug("NR PStates %d\n", n
Re: [PATCH] cpufreq: powernv: Add support of frequency domain
On 12/14/2017 10:12 AM, Viresh Kumar wrote: + Gautham, @Gautham: Can you please help reviewing this one ? On 13-12-17, 13:49, Abhishek Goel wrote: @@ -693,6 +746,8 @@ static int powernv_cpufreq_target_index(struct cpufreq_policy *policy, { struct powernv_smp_call_data freq_data; unsigned int cur_msec, gpstate_idx; + cpumask_t temp; + u32 cpu; struct global_pstate_info *gpstates = policy->driver_data; if (unlikely(rebooting) && new_index != get_nominal_index()) @@ -761,24 +816,48 @@ static int powernv_cpufreq_target_index(struct cpufreq_policy *policy, spin_unlock(>gpstate_lock); /* -* Use smp_call_function to send IPI and execute the -* mtspr on target CPU. We could do that without IPI -* if current CPU is within policy->cpus (core) +* Use smp_call_function to send IPI and execute the mtspr on CPU. +* This needs to be done on every core of the policy Why on each CPU ? We need to do it in this way as the current implementation takes the max of the PMSR of the cores. Thus, when the frequency is required to be ramped up, it suffices to write to just the local PMSR, but when the frequency is to be ramped down, if we don't send the IPI it breaks the compatibility with P8. */ - smp_call_function_any(policy->cpus, set_pstate, _data, 1); + cpumask_copy(, policy->cpus); + + while (!cpumask_empty()) { + cpu = cpumask_first(); + smp_call_function_any(cpu_sibling_mask(cpu), + set_pstate, _data, 1); + cpumask_andnot(, , cpu_sibling_mask(cpu)); + } + return 0; }
[PATCH] cpufreq: powernv: Add support of frequency domain
Frequency-domain indicates group of CPUs that would share same frequency. It is detected using device-tree node "frequency-domain-indicator". frequency-domain-indicator is a bitmask which will have different value depending upon the generation of the processor. CPUs of the same chip for which the result of a bitwise AND between their PIR and the frequency-domain-indicator is the same share the same frequency. In this patch, we define hash-table indexed by the aforementioned bitwise ANDed value to store the cpumask of the CPUs sharing the same frequency domain. Further, the cpufreq policy will be created per frequency-domain So for POWER9, a cpufreq policy is created per quad while for POWER8 it is created per core. Governor decides frequency for each policy but multiple cores may come under same policy. In such case frequency needs to be set on each core sharing that policy. Signed-off-by: Abhishek Goel <hunt...@linux.vnet.ibm.com> --- drivers/cpufreq/powernv-cpufreq.c | 95 +++ 1 file changed, 87 insertions(+), 8 deletions(-) diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index b6d7c4c..9384110 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -37,6 +37,7 @@ #include /* Required for cpu_sibling_mask() in UP configs */ #include #include +#include #define POWERNV_MAX_PSTATES256 #define PMSR_PSAFE_ENABLE (1UL << 30) @@ -130,6 +131,8 @@ static struct chip { static int nr_chips; static DEFINE_PER_CPU(struct chip *, chip_info); +static u32 freq_domain_indicator; + /* * Note: * The set of pstates consists of contiguous integers. @@ -194,6 +197,38 @@ static inline void reset_gpstates(struct cpufreq_policy *policy) gpstates->last_gpstate_idx = 0; } +#define SIZE NR_CPUS +#define ORDER_FREQ_MAP ilog2(SIZE) + +static DEFINE_HASHTABLE(freq_domain_map, ORDER_FREQ_MAP); + +struct hashmap { + cpumask_t mask; + int chip_id; + u32 pir_key; + struct hlist_node hash_node; +}; + +static void insert(u32 key, int cpu) +{ + struct hashmap *data; + + hash_for_each_possible(freq_domain_map, data, hash_node, key%SIZE) { + if (data->chip_id == cpu_to_chip_id(cpu) && + data->pir_key == key) { + cpumask_set_cpu(cpu, >mask); + return; + } + } + + data = kzalloc(sizeof(*data), GFP_KERNEL); + hash_add(freq_domain_map, >hash_node, key%SIZE); + cpumask_set_cpu(cpu, >mask); + data->chip_id = cpu_to_chip_id(cpu); + data->pir_key = key; + +} + /* * Initialize the freq table based on data obtained * from the firmware passed via device-tree @@ -206,6 +241,7 @@ static int init_powernv_pstates(void) u32 len_ids, len_freqs; u32 pstate_min, pstate_max, pstate_nominal; u32 pstate_turbo, pstate_ultra_turbo; + u32 key; power_mgt = of_find_node_by_path("/ibm,opal/power-mgt"); if (!power_mgt) { @@ -229,6 +265,13 @@ static int init_powernv_pstates(void) return -ENODEV; } + if (of_device_is_compatible(power_mgt, "freq-domain-v1") && + of_property_read_u32(power_mgt, "ibm,freq-domain-indicator", +_domain_indicator)) { + pr_warn("ibm,freq-domain-indicator not found\n"); + freq_domain_indicator = 0; + } + if (of_property_read_u32(power_mgt, "ibm,pstate-ultra-turbo", _ultra_turbo)) { powernv_pstate_info.wof_enabled = false; @@ -249,6 +292,7 @@ static int init_powernv_pstates(void) next: pr_info("cpufreq pstate min %d nominal %d max %d\n", pstate_min, pstate_nominal, pstate_max); + pr_info("frequency domain indicator %d", freq_domain_indicator); pr_info("Workload Optimized Frequency is %s in the platform\n", (powernv_pstate_info.wof_enabled) ? "enabled" : "disabled"); @@ -276,6 +320,15 @@ static int init_powernv_pstates(void) return -ENODEV; } + if (freq_domain_indicator) { + hash_init(freq_domain_map); + for_each_possible_cpu(i) { + key = ((u32) get_hard_smp_processor_id(i) & + freq_domain_indicator); + insert(key, i); + } + } + powernv_pstate_info.nr_pstates = nr_pstates; pr_debug("NR PStates %d\n", nr_pstates); for (i = 0; i < nr_pstates; i++) { @@ -693,6 +746,8 @@ static int powernv_cpufreq_target_index(struct cpufreq_policy *policy, { struct powernv_smp_call_data freq_data; unsigned in