[PATCH V5 8/8] cpuidle/powernv: Parse device tree to setup idle states

2014-01-15 Thread Preeti U Murthy
Add deep idle states such as nap and fast sleep to the cpuidle state table
only if they are discovered from the device tree during cpuidle initialization.

Signed-off-by: Preeti U Murthy 
---

 drivers/cpuidle/cpuidle-powernv.c |   81 +
 1 file changed, 64 insertions(+), 17 deletions(-)

diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c
index e3aa62f..b01987d 100644
--- a/drivers/cpuidle/cpuidle-powernv.c
+++ b/drivers/cpuidle/cpuidle-powernv.c
@@ -12,10 +12,17 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
 
+/* Flags and constants used in PowerNV platform */
+
+#define MAX_POWERNV_IDLE_STATES	8
+#define IDLE_USE_INST_NAP	0x0001 /* Use nap instruction */
+#define IDLE_USE_INST_SLEEP	0x0002 /* Use sleep instruction */
+
 struct cpuidle_driver powernv_idle_driver = {
.name = "powernv_idle",
.owner= THIS_MODULE,
@@ -84,7 +91,7 @@ static int fastsleep_loop(struct cpuidle_device *dev,
 /*
  * States for dedicated partition case.
  */
-static struct cpuidle_state powernv_states[] = {
+static struct cpuidle_state powernv_states[MAX_POWERNV_IDLE_STATES] = {
{ /* Snooze */
.name = "snooze",
.desc = "snooze",
@@ -92,20 +99,6 @@ static struct cpuidle_state powernv_states[] = {
.exit_latency = 0,
.target_residency = 0,
.enter = &snooze_loop },
-   { /* NAP */
-   .name = "NAP",
-   .desc = "NAP",
-   .flags = CPUIDLE_FLAG_TIME_VALID,
-   .exit_latency = 10,
-   .target_residency = 100,
-   .enter = &nap_loop },
-{ /* Fastsleep */
-   .name = "fastsleep",
-   .desc = "fastsleep",
-   .flags = CPUIDLE_FLAG_TIME_VALID,
-   .exit_latency = 10,
-   .target_residency = 100,
-   .enter = &fastsleep_loop },
 };
 
 static int powernv_cpuidle_add_cpu_notifier(struct notifier_block *n,
@@ -166,19 +159,73 @@ static int powernv_cpuidle_driver_init(void)
return 0;
 }
 
+static int powernv_add_idle_states(void)
+{
+   struct device_node *power_mgt;
+   struct property *prop;
+   int nr_idle_states = 1; /* Snooze */
+   int dt_idle_states;
+   u32 *flags;
+   int i;
+
+   /* Currently we have snooze statically defined */
+
+   power_mgt = of_find_node_by_path("/ibm,opal/power-mgt");
+   if (!power_mgt) {
+   pr_warn("opal: PowerMgmt Node not found\n");
+   return nr_idle_states;
+   }
+
+   prop = of_find_property(power_mgt, "ibm,cpu-idle-state-flags", NULL);
+   if (!prop) {
+   pr_warn("DT-PowerMgmt: missing ibm,cpu-idle-state-flags\n");
+   return nr_idle_states;
+   }
+
+   dt_idle_states = prop->length / sizeof(u32);
+   flags = (u32 *) prop->value;
+
+   for (i = 0; i < dt_idle_states; i++) {
+
+   if (flags[i] & IDLE_USE_INST_NAP) {
+   /* Add NAP state */
+   strcpy(powernv_states[nr_idle_states].name, "Nap");
+   strcpy(powernv_states[nr_idle_states].desc, "Nap");
+   powernv_states[nr_idle_states].flags = CPUIDLE_FLAG_TIME_VALID;
+   powernv_states[nr_idle_states].exit_latency = 10;
+   powernv_states[nr_idle_states].target_residency = 100;
+   powernv_states[nr_idle_states].enter = &nap_loop;
+   nr_idle_states++;
+   }
+
+   if (flags[i] & IDLE_USE_INST_SLEEP) {
+   /* Add FASTSLEEP state */
+   strcpy(powernv_states[nr_idle_states].name, "FastSleep");
+   strcpy(powernv_states[nr_idle_states].desc, "FastSleep");
+   powernv_states[nr_idle_states].flags = CPUIDLE_FLAG_TIME_VALID;
+   powernv_states[nr_idle_states].exit_latency = 300;
+   powernv_states[nr_idle_states].target_residency = 100;
+   powernv_states[nr_idle_states].enter = &fastsleep_loop;
+   nr_idle_states++;
+   }
+   }
+
+   return nr_idle_states;
+}
+
 /*
  * powernv_idle_probe()
  * Choose state table for shared versus dedicated partition
  */
 static int powernv_idle_probe(void)
 {
-
if (cpuidle_disable != IDLE_NO_OVERRIDE)
return -ENODEV;
 
if (firmware_has_feature(FW_FEATURE_OPALv3)) {
cpuidle_state_table = powernv_states;
-   max_idle_state = ARRAY_SIZE(powernv_states);
+   /* Device tree can indicate mor

[PATCH V5 6/8] time/cpuidle: Support in tick broadcast framework in the absence of external clock device

2014-01-15 Thread Preeti U Murthy
On some architectures, the local timers stop in certain deep CPU idle states.
An external clock device is then used to wake up these CPUs. The kernel support
for such wakeups is provided by the tick broadcast framework, which uses the
external clock device as the wakeup source.

However, not all architecture implementations provide such an external clock
device; some PowerPC ones, for instance, do not. This patch adds support in the
broadcast framework to handle the wakeup of CPUs in deep idle states on such
systems by queuing, on one of the CPUs, a hrtimer that is meant to handle the
wakeup of CPUs in deep idle states. This CPU is identified as the bc_cpu.

Each time the hrtimer expires, it is reprogrammed for the next wakeup of the
CPUs in deep idle state after handling broadcast. However, when a CPU is about
to enter a deep idle state with its wakeup time earlier than the time at which
the hrtimer is currently programmed, it *becomes the new bc_cpu* and restarts
the hrtimer on itself. This way the job of doing broadcast is handed around to
the CPUs that ask for the earliest wakeup just before entering a deep idle
state. This is consistent with what happens when an external clock device is
present: the SMP affinity of that clock device is set to the CPU with the
earliest wakeup.

The important point here is that the bc_cpu cannot enter a deep idle state,
since it has a hrtimer queued to wake up the other CPUs in deep idle and hence
cannot have its local timer stopped. Therefore, for such a CPU the
BROADCAST_ENTER notification has to fail, implying that it cannot enter a deep
idle state. On architectures where an external clock device is present, all
CPUs can enter deep idle.

During hotplug of the bc_cpu, the job of doing broadcast is assigned to the
first CPU in the broadcast mask. This newly nominated bc_cpu is woken up by
an IPI so that it can queue the above mentioned hrtimer on itself.
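
For illustration only, not part of the patch: the ENTER-side decision described
above can be summarized roughly by the sketch below. It reuses the bc_cpu,
bc_hrtimer and bc_next_wakeup variables introduced by this patch, but the
function name, the -EBUSY return value, and the omission of locking and of the
broadcast mask handling are this example's own.

static int broadcast_enter_sketch(int cpu, ktime_t next_event)
{
	if (cpu == bc_cpu)
		return -EBUSY;	/* bc_cpu must keep its local timer running */

	if (ktime_to_ns(next_event) < ktime_to_ns(bc_next_wakeup)) {
		/* Earliest wakeup so far: take over broadcast duty... */
		bc_cpu = cpu;
		bc_next_wakeup = next_event;
		hrtimer_start(bc_hrtimer, bc_next_wakeup,
			      HRTIMER_MODE_ABS_PINNED);
		/* ...and consequently refuse deep idle for this CPU too */
		return -EBUSY;
	}

	/* Wakeup is later than the queued hrtimer: deep idle is safe */
	return 0;
}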

Signed-off-by: Preeti U Murthy 
---

 include/linux/clockchips.h   |4 -
 kernel/time/clockevents.c|9 +-
 kernel/time/tick-broadcast.c |  192 ++
 kernel/time/tick-internal.h  |8 +-
 4 files changed, 186 insertions(+), 27 deletions(-)

diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h
index 493aa02..bbda37b 100644
--- a/include/linux/clockchips.h
+++ b/include/linux/clockchips.h
@@ -186,9 +186,9 @@ static inline int tick_check_broadcast_expired(void) { 
return 0; }
 #endif
 
 #ifdef CONFIG_GENERIC_CLOCKEVENTS
-extern void clockevents_notify(unsigned long reason, void *arg);
+extern int clockevents_notify(unsigned long reason, void *arg);
 #else
-static inline void clockevents_notify(unsigned long reason, void *arg) {}
+static inline int clockevents_notify(unsigned long reason, void *arg) {}
 #endif
 
 #else /* CONFIG_GENERIC_CLOCKEVENTS_BUILD */
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 086ad60..d61404e 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -524,12 +524,13 @@ void clockevents_resume(void)
 #ifdef CONFIG_GENERIC_CLOCKEVENTS
 /**
  * clockevents_notify - notification about relevant events
+ * Returns non zero on error.
  */
-void clockevents_notify(unsigned long reason, void *arg)
+int clockevents_notify(unsigned long reason, void *arg)
 {
struct clock_event_device *dev, *tmp;
unsigned long flags;
-   int cpu;
+   int cpu, ret = 0;
 
raw_spin_lock_irqsave(&clockevents_lock, flags);
 
@@ -542,11 +543,12 @@ void clockevents_notify(unsigned long reason, void *arg)
 
case CLOCK_EVT_NOTIFY_BROADCAST_ENTER:
case CLOCK_EVT_NOTIFY_BROADCAST_EXIT:
-   tick_broadcast_oneshot_control(reason);
+   ret = tick_broadcast_oneshot_control(reason);
break;
 
case CLOCK_EVT_NOTIFY_CPU_DYING:
tick_handover_do_timer(arg);
+   tick_handover_broadcast_cpu(arg);
break;
 
case CLOCK_EVT_NOTIFY_SUSPEND:
@@ -585,6 +587,7 @@ void clockevents_notify(unsigned long reason, void *arg)
break;
}
raw_spin_unlock_irqrestore(&clockevents_lock, flags);
+   return ret;
 }
 EXPORT_SYMBOL_GPL(clockevents_notify);
 
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 9532690..1c23912 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -20,6 +20,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "tick-internal.h"
 
@@ -35,6 +36,15 @@ static cpumask_var_t tmpmask;
 static DEFINE_RAW_SPINLOCK(tick_broadcast_lock);
 static int tick_broadcast_force;
 
+/*
+ * Helper variables for handling broadcast in the absence of a
+ * tick_broadcast_device.
+ * */
+static struct hrtimer *bc_hrtimer;
+static int bc_cpu = -1;
+static ktime_t bc_next_wakeup;
+static int hrtimer_initialized = 0;
+
 #ifdef CONFIG_TICK_ONESHOT
 static void tick_broadcast_clear_oneshot(int cpu);
 #else
@@ -528,6 +53

[PATCH V5 7/8] cpuidle/powernv: Add "Fast-Sleep" CPU idle state

2014-01-15 Thread Preeti U Murthy
Fast sleep is one of the deep idle states on Power8 in which local timers of
CPUs stop. On PowerPC we do not have an external clock device which can
handle wakeup of such CPUs. Now that we have the support in the tick broadcast
framework for archs that do not sport such a device and the low level support
for fast sleep, enable it in the cpuidle framework on PowerNV.

Signed-off-by: Preeti U Murthy 
---

 arch/powerpc/Kconfig  |2 ++
 arch/powerpc/kernel/time.c|2 +-
 drivers/cpuidle/cpuidle-powernv.c |   39 +
 3 files changed, 42 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index b44b52c..cafa788 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -129,6 +129,8 @@ config PPC
select GENERIC_CMOS_UPDATE
select GENERIC_TIME_VSYSCALL_OLD
select GENERIC_CLOCKEVENTS
+   select GENERIC_CLOCKEVENTS_BROADCAST
+   select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
select GENERIC_STRNCPY_FROM_USER
select GENERIC_STRNLEN_USER
select HAVE_MOD_ARCH_SPECIFIC
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 42cb603..d9efd93 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -106,7 +106,7 @@ struct clock_event_device decrementer_clockevent = {
.irq= 0,
.set_next_event = decrementer_set_next_event,
.set_mode   = decrementer_set_mode,
-   .features   = CLOCK_EVT_FEAT_ONESHOT,
+   .features   = CLOCK_EVT_FEAT_ONESHOT | CLOCK_EVT_FEAT_C3STOP,
 };
 EXPORT_SYMBOL(decrementer_clockevent);
 
diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c
index 78fd174..e3aa62f 100644
--- a/drivers/cpuidle/cpuidle-powernv.c
+++ b/drivers/cpuidle/cpuidle-powernv.c
@@ -11,6 +11,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -49,6 +50,37 @@ static int nap_loop(struct cpuidle_device *dev,
return index;
 }
 
+static int fastsleep_loop(struct cpuidle_device *dev,
+   struct cpuidle_driver *drv,
+   int index)
+{
+   int cpu = dev->cpu;
+   unsigned long old_lpcr = mfspr(SPRN_LPCR);
+   unsigned long new_lpcr;
+
+   new_lpcr = old_lpcr;
+   new_lpcr &= ~(LPCR_MER | LPCR_PECE); /* lpcr[mer] must be 0 */
+
+   /* exit powersave upon external interrupt, but not decrementer
+* interrupt, Emulate sleep.
+*/
+   new_lpcr |= LPCR_PECE0;
+
+   if (clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu)) {
+   new_lpcr |= LPCR_PECE1;
+   mtspr(SPRN_LPCR, new_lpcr);
+   power7_nap();
+   } else {
+   mtspr(SPRN_LPCR, new_lpcr);
+   power7_sleep();
+   clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
+   }
+
+   mtspr(SPRN_LPCR, old_lpcr);
+
+   return index;
+}
+
 /*
  * States for dedicated partition case.
  */
@@ -67,6 +99,13 @@ static struct cpuidle_state powernv_states[] = {
.exit_latency = 10,
.target_residency = 100,
.enter = &nap_loop },
+{ /* Fastsleep */
+   .name = "fastsleep",
+   .desc = "fastsleep",
+   .flags = CPUIDLE_FLAG_TIME_VALID,
+   .exit_latency = 10,
+   .target_residency = 100,
+   .enter = &fastsleep_loop },
 };
 
 static int powernv_cpuidle_add_cpu_notifier(struct notifier_block *n,



[PATCH V5 5/8] powermgt: Add OPAL call to resync timebase on wakeup

2014-01-15 Thread Preeti U Murthy
From: Vaidyanathan Srinivasan 

During "Fast-sleep" and deeper power savings state, decrementer and
timebase could be stopped making it out of sync with rest
of the cores in the system.

Add a firmware call to request platform to resync timebase
using low level platform methods.
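
At the C level the intent amounts roughly to the sketch below; this helper is
hypothetical and not part of the patch, which performs the resync in assembly
in power7_wakeup_tb_loss. It only assumes the opal_resync_timebase() prototype
added here.

#include <linux/printk.h>
#include <asm/opal.h>

static void resync_timebase_after_deep_sleep(void)
{
	int rc = opal_resync_timebase();

	if (rc)
		pr_warn("OPAL timebase resync failed, rc = %d\n", rc);
}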

Signed-off-by: Vaidyanathan Srinivasan 
Signed-off-by: Preeti U. Murthy 
---

 arch/powerpc/include/asm/opal.h|2 ++
 arch/powerpc/kernel/exceptions-64s.S   |2 +-
 arch/powerpc/kernel/idle_power7.S  |   27 
 arch/powerpc/platforms/powernv/opal-wrappers.S |1 +
 4 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index 033c06b..a662d06 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -132,6 +132,7 @@ extern int opal_enter_rtas(struct rtas_args *args,
 #define OPAL_FLASH_VALIDATE	76
 #define OPAL_FLASH_MANAGE  77
 #define OPAL_FLASH_UPDATE  78
+#define OPAL_RESYNC_TIMEBASE   79
 
 #ifndef __ASSEMBLY__
 
@@ -763,6 +764,7 @@ extern void opal_flash_init(void);
 extern int opal_machine_check(struct pt_regs *regs);
 
 extern void opal_shutdown(void);
+extern int opal_resync_timebase(void);
 
 extern void opal_lpc_init(void);
 
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index b8139fb..91e6417 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -145,7 +145,7 @@ BEGIN_FTR_SECTION
 
/* Fast Sleep wakeup on PowerNV */
 8: GET_PACA(r13)
-   b   .power7_wakeup_loss
+   b   .power7_wakeup_tb_loss
 
 9:
 END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
diff --git a/arch/powerpc/kernel/idle_power7.S b/arch/powerpc/kernel/idle_power7.S
index e4bbca2..34c71e8 100644
--- a/arch/powerpc/kernel/idle_power7.S
+++ b/arch/powerpc/kernel/idle_power7.S
@@ -17,6 +17,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #undef DEBUG
 
@@ -124,6 +125,32 @@ _GLOBAL(power7_sleep)
b   power7_powersave_common
/* No return */
 
+_GLOBAL(power7_wakeup_tb_loss)
+   ld  r2,PACATOC(r13);
+   ld  r1,PACAR1(r13)
+
+   /* Time base re-sync */
+   li  r0,OPAL_RESYNC_TIMEBASE
+   LOAD_REG_ADDR(r11,opal);
+   ld  r12,8(r11);
+   ld  r2,0(r11);
+   mtctr   r12
+   bctrl
+
+   /* TODO: Check r3 for failure */
+
+   REST_NVGPRS(r1)
+   REST_GPR(2, r1)
+   ld  r3,_CCR(r1)
+   ld  r4,_MSR(r1)
+   ld  r5,_NIP(r1)
+   addi	r1,r1,INT_FRAME_SIZE
+   mtcr	r3
+   mfspr   r3,SPRN_SRR1	/* Return SRR1 */
+   mtspr   SPRN_SRR1,r4
+   mtspr   SPRN_SRR0,r5
+   rfid
+
 _GLOBAL(power7_wakeup_loss)
ld  r1,PACAR1(r13)
REST_NVGPRS(r1)
diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S
index e780650..ddfe95a 100644
--- a/arch/powerpc/platforms/powernv/opal-wrappers.S
+++ b/arch/powerpc/platforms/powernv/opal-wrappers.S
@@ -126,3 +126,4 @@ OPAL_CALL(opal_return_cpu,		OPAL_RETURN_CPU);
 OPAL_CALL(opal_validate_flash, OPAL_FLASH_VALIDATE);
 OPAL_CALL(opal_manage_flash,   OPAL_FLASH_MANAGE);
 OPAL_CALL(opal_update_flash,   OPAL_FLASH_UPDATE);
+OPAL_CALL(opal_resync_timebase,OPAL_RESYNC_TIMEBASE);



[PATCH V5 4/8] powernv/cpuidle: Add context management for Fast Sleep

2014-01-15 Thread Preeti U Murthy
From: Vaidyanathan Srinivasan 

Before adding Fast-Sleep into the cpuidle framework, some low level
support needs to be added to enable it. This includes saving and restoring
certain registers on entry to and exit from this state, just as we do for
the NAP idle state.

Signed-off-by: Vaidyanathan Srinivasan 
[Changelog modified by Preeti U. Murthy ]
Signed-off-by: Preeti U. Murthy 
---

 arch/powerpc/include/asm/processor.h |1 +
 arch/powerpc/kernel/exceptions-64s.S |   10 -
 arch/powerpc/kernel/idle_power7.S|   63 --
 3 files changed, 53 insertions(+), 21 deletions(-)

diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h
index 027fefd..22e547a 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -444,6 +444,7 @@ enum idle_boot_override {IDLE_NO_OVERRIDE = 0, 
IDLE_POWERSAVE_OFF};
 
 extern int powersave_nap;  /* set if nap mode can be used in idle loop */
 extern void power7_nap(void);
+extern void power7_sleep(void);
 extern void flush_instruction_cache(void);
 extern void hard_reset_now(void);
 extern void poweroff_now(void);
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 9f905e4..b8139fb 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -121,9 +121,10 @@ BEGIN_FTR_SECTION
cmpwi   cr1,r13,2
/* Total loss of HV state is fatal, we could try to use the
 * PIR to locate a PACA, then use an emergency stack etc...
-* but for now, let's just stay stuck here
+* OPAL v3 based powernv platforms have new idle states
+* which fall in this category.
 */
-   bgt cr1,.
+   bgt cr1,8f
GET_PACA(r13)
 
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
@@ -141,6 +142,11 @@ BEGIN_FTR_SECTION
beq cr1,2f
b   .power7_wakeup_noloss
 2: b   .power7_wakeup_loss
+
+   /* Fast Sleep wakeup on PowerNV */
+8: GET_PACA(r13)
+   b   .power7_wakeup_loss
+
 9:
 END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
 #endif /* CONFIG_PPC_P7_NAP */
diff --git a/arch/powerpc/kernel/idle_power7.S b/arch/powerpc/kernel/idle_power7.S
index 847e40e..e4bbca2 100644
--- a/arch/powerpc/kernel/idle_power7.S
+++ b/arch/powerpc/kernel/idle_power7.S
@@ -20,17 +20,27 @@
 
 #undef DEBUG
 
-   .text
+/* Idle state entry routines */
 
-_GLOBAL(power7_idle)
-   /* Now check if user or arch enabled NAP mode */
-   LOAD_REG_ADDRBASE(r3,powersave_nap)
-   lwz r4,ADDROFF(powersave_nap)(r3)
-   cmpwi   0,r4,0
-   beqlr
-   /* fall through */
+#define IDLE_STATE_ENTER_SEQ(IDLE_INST)	\
+   /* Magic NAP/SLEEP/WINKLE mode enter sequence */\
+   std r0,0(r1);   \
+   ptesync;\
+   ld  r0,0(r1);   \
+1: cmp cr0,r0,r0;  \
+   bne 1b; \
+   IDLE_INST;  \
+   b   .
 
-_GLOBAL(power7_nap)
+   .text
+
+/*
+ * Pass requested state in r3:
+ * 0 - nap
+ * 1 - sleep
+ */
+_GLOBAL(power7_powersave_common)
+   /* Use r3 to pass state nap/sleep/winkle */
/* NAP is a state loss, we create a regs frame on the
 * stack, fill it up with the state we care about and
 * stick a pointer to it in PACAR1. We really only
@@ -79,8 +89,8 @@ _GLOBAL(power7_nap)
/* Continue saving state */
SAVE_GPR(2, r1)
SAVE_NVGPRS(r1)
-   mfcr	r3
-   std r3,_CCR(r1)
+   mfcr	r4
+   std r4,_CCR(r1)
std r9,_MSR(r1)
std r1,PACAR1(r13)
 
@@ -89,15 +99,30 @@ _GLOBAL(power7_nap)
li  r4,KVM_HWTHREAD_IN_NAP
stb r4,HSTATE_HWTHREAD_STATE(r13)
 #endif
+   cmpwi   cr0,r3,1
+   beq 2f
+   IDLE_STATE_ENTER_SEQ(PPC_NAP)
+   /* No return */
+2: IDLE_STATE_ENTER_SEQ(PPC_SLEEP)
+   /* No return */
 
-   /* Magic NAP mode enter sequence */
-   std r0,0(r1)
-   ptesync
-   ld  r0,0(r1)
-1: cmp cr0,r0,r0
-   bne 1b
-   PPC_NAP
-   b   .
+_GLOBAL(power7_idle)
+   /* Now check if user or arch enabled NAP mode */
+   LOAD_REG_ADDRBASE(r3,powersave_nap)
+   lwz r4,ADDROFF(powersave_nap)(r3)
+   cmpwi   0,r4,0
+   beqlr
+   /* fall through */
+
+_GLOBAL(power7_nap)
+   li  r3,0
+   b   power7_powersave_common
+   /* No return */
+
+_GLOBAL(power7_sleep)
+   li  r3,1
+   b   power7_powersave_common
+   /* No return */
 
 _GLOBAL(power7_wakeup_loss)
ld  r1,PACAR1(r13)


[PATCH V5 3/8] cpuidle/ppc: Split timer_interrupt() into timer handling and interrupt handling routines

2014-01-15 Thread Preeti U Murthy
Split timer_interrupt(), the local timer interrupt handler on ppc, into the
code that runs during regular interrupt handling and __timer_interrupt(),
which takes care of running local timers and collecting time-related stats.

This will enable callers interested only in running expired local timers to
directly call into __timer_interrupt(). One of the use cases of this is the
tick broadcast IPI handling, in which the sleeping CPUs need to handle the
local timers that have expired.

Signed-off-by: Preeti U Murthy 
---

 arch/powerpc/kernel/time.c |   73 +---
 1 file changed, 41 insertions(+), 32 deletions(-)

diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 42269c7..42cb603 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -478,6 +478,42 @@ void arch_irq_work_raise(void)
 
 #endif /* CONFIG_IRQ_WORK */
 
+static void __timer_interrupt(void)
+{
+   struct pt_regs *regs = get_irq_regs();
+   u64 *next_tb = &__get_cpu_var(decrementers_next_tb);
+   struct clock_event_device *evt = &__get_cpu_var(decrementers);
+   u64 now;
+
+   __get_cpu_var(irq_stat).timer_irqs++;
+   trace_timer_interrupt_entry(regs);
+
+   if (test_irq_work_pending()) {
+   clear_irq_work_pending();
+   irq_work_run();
+   }
+
+   now = get_tb_or_rtc();
+   if (now >= *next_tb) {
+   *next_tb = ~(u64)0;
+   if (evt->event_handler)
+   evt->event_handler(evt);
+   } else {
+   now = *next_tb - now;
+   if (now <= DECREMENTER_MAX)
+   set_dec((int)now);
+   }
+
+#ifdef CONFIG_PPC64
+   /* collect purr register values often, for accurate calculations */
+   if (firmware_has_feature(FW_FEATURE_SPLPAR)) {
+   struct cpu_usage *cu = &__get_cpu_var(cpu_usage_array);
+   cu->current_tb = mfspr(SPRN_PURR);
+   }
+#endif
+   trace_timer_interrupt_exit(regs);
+}
+
 /*
  * timer_interrupt - gets called when the decrementer overflows,
  * with interrupts disabled.
@@ -486,8 +522,6 @@ void timer_interrupt(struct pt_regs * regs)
 {
struct pt_regs *old_regs;
u64 *next_tb = &__get_cpu_var(decrementers_next_tb);
-   struct clock_event_device *evt = &__get_cpu_var(decrementers);
-   u64 now;
 
/* Ensure a positive value is written to the decrementer, or else
 * some CPUs will continue to take decrementer exceptions.
@@ -510,8 +544,6 @@ void timer_interrupt(struct pt_regs * regs)
 */
may_hard_irq_enable();
 
-   __get_cpu_var(irq_stat).timer_irqs++;
-
 #if defined(CONFIG_PPC32) && defined(CONFIG_PMAC)
if (atomic_read(&ppc_n_lost_interrupts) != 0)
do_IRQ(regs);
@@ -520,34 +552,7 @@ void timer_interrupt(struct pt_regs * regs)
old_regs = set_irq_regs(regs);
irq_enter();
 
-   trace_timer_interrupt_entry(regs);
-
-   if (test_irq_work_pending()) {
-   clear_irq_work_pending();
-   irq_work_run();
-   }
-
-   now = get_tb_or_rtc();
-   if (now >= *next_tb) {
-   *next_tb = ~(u64)0;
-   if (evt->event_handler)
-   evt->event_handler(evt);
-   } else {
-   now = *next_tb - now;
-   if (now <= DECREMENTER_MAX)
-   set_dec((int)now);
-   }
-
-#ifdef CONFIG_PPC64
-   /* collect purr register values often, for accurate calculations */
-   if (firmware_has_feature(FW_FEATURE_SPLPAR)) {
-   struct cpu_usage *cu = &__get_cpu_var(cpu_usage_array);
-   cu->current_tb = mfspr(SPRN_PURR);
-   }
-#endif
-
-   trace_timer_interrupt_exit(regs);
-
+   __timer_interrupt();
irq_exit();
set_irq_regs(old_regs);
 }
@@ -816,6 +821,10 @@ static void decrementer_set_mode(enum clock_event_mode 
mode,
 /* Interrupt handler for the timer broadcast IPI */
 void tick_broadcast_ipi_handler(void)
 {
+   u64 *next_tb = &__get_cpu_var(decrementers_next_tb);
+
+   *next_tb = get_tb_or_rtc();
+   __timer_interrupt();
 }
 
 static void register_decrementer_clockevent(int cpu)



[PATCH V5 2/8] powerpc: Implement tick broadcast IPI as a fixed IPI message

2014-01-15 Thread Preeti U Murthy
From: Srivatsa S. Bhat 

For scalability and performance reasons, we want the tick broadcast IPIs
to be handled as efficiently as possible. Fixed IPI messages
are one of the most efficient mechanisms available - they are faster than
the smp_call_function mechanism because the IPI handlers are fixed and hence
they don't involve costly operations such as adding IPI handlers to the target
CPU's function queue, acquiring locks for synchronization etc.

Luckily we have an unused IPI message slot, so use that to implement
tick broadcast IPIs efficiently.

Signed-off-by: Srivatsa S. Bhat 
[Functions renamed to tick_broadcast* and Changelog modified by
 Preeti U. Murthy]
Signed-off-by: Preeti U. Murthy 
Acked-by: Geoff Levand  [For the PS3 part]
---

 arch/powerpc/include/asm/smp.h  |2 +-
 arch/powerpc/include/asm/time.h |1 +
 arch/powerpc/kernel/smp.c   |   19 +++
 arch/powerpc/kernel/time.c  |5 +
 arch/powerpc/platforms/cell/interrupt.c |2 +-
 arch/powerpc/platforms/ps3/smp.c|2 +-
 6 files changed, 24 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
index 9f7356b..ff51046 100644
--- a/arch/powerpc/include/asm/smp.h
+++ b/arch/powerpc/include/asm/smp.h
@@ -120,7 +120,7 @@ extern int cpu_to_core_id(int cpu);
  * in /proc/interrupts will be wrong!!! --Troy */
 #define PPC_MSG_CALL_FUNCTION   0
 #define PPC_MSG_RESCHEDULE  1
-#define PPC_MSG_UNUSED 2
+#define PPC_MSG_TICK_BROADCAST 2
 #define PPC_MSG_DEBUGGER_BREAK  3
 
 /* for irq controllers that have dedicated ipis per message (4) */
diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h
index c1f2676..1d428e6 100644
--- a/arch/powerpc/include/asm/time.h
+++ b/arch/powerpc/include/asm/time.h
@@ -28,6 +28,7 @@ extern struct clock_event_device decrementer_clockevent;
 struct rtc_time;
 extern void to_tm(int tim, struct rtc_time * tm);
 extern void GregorianDay(struct rtc_time *tm);
+extern void tick_broadcast_ipi_handler(void);
 
 extern void generic_calibrate_decr(void);
 
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index c2bd8d6..c77c6d7 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -35,6 +35,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -145,9 +146,9 @@ static irqreturn_t reschedule_action(int irq, void *data)
return IRQ_HANDLED;
 }
 
-static irqreturn_t unused_action(int irq, void *data)
+static irqreturn_t tick_broadcast_ipi_action(int irq, void *data)
 {
-   /* This slot is unused and hence available for use, if needed */
+   tick_broadcast_ipi_handler();
return IRQ_HANDLED;
 }
 
@@ -168,14 +169,14 @@ static irqreturn_t debug_ipi_action(int irq, void *data)
 static irq_handler_t smp_ipi_action[] = {
[PPC_MSG_CALL_FUNCTION] =  call_function_action,
[PPC_MSG_RESCHEDULE] = reschedule_action,
-   [PPC_MSG_UNUSED] = unused_action,
+   [PPC_MSG_TICK_BROADCAST] = tick_broadcast_ipi_action,
[PPC_MSG_DEBUGGER_BREAK] = debug_ipi_action,
 };
 
 const char *smp_ipi_name[] = {
[PPC_MSG_CALL_FUNCTION] =  "ipi call function",
[PPC_MSG_RESCHEDULE] = "ipi reschedule",
-   [PPC_MSG_UNUSED] = "ipi unused",
+   [PPC_MSG_TICK_BROADCAST] = "ipi tick-broadcast",
[PPC_MSG_DEBUGGER_BREAK] = "ipi debugger",
 };
 
@@ -251,6 +252,8 @@ irqreturn_t smp_ipi_demux(void)
generic_smp_call_function_interrupt();
if (all & IPI_MESSAGE(PPC_MSG_RESCHEDULE))
scheduler_ipi();
+   if (all & IPI_MESSAGE(PPC_MSG_TICK_BROADCAST))
+   tick_broadcast_ipi_handler();
if (all & IPI_MESSAGE(PPC_MSG_DEBUGGER_BREAK))
debug_ipi_action(0, NULL);
} while (info->messages);
@@ -289,6 +292,14 @@ void arch_send_call_function_ipi_mask(const struct cpumask 
*mask)
do_message_pass(cpu, PPC_MSG_CALL_FUNCTION);
 }
 
+void tick_broadcast(const struct cpumask *mask)
+{
+   unsigned int cpu;
+
+   for_each_cpu(cpu, mask)
+   do_message_pass(cpu, PPC_MSG_TICK_BROADCAST);
+}
+
 #if defined(CONFIG_DEBUGGER) || defined(CONFIG_KEXEC)
 void smp_send_debugger_break(void)
 {
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index b3b1441..42269c7 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -813,6 +813,11 @@ static void decrementer_set_mode(enum clock_event_mode 
mode,
decrementer_set_next_event(DECREMENTER_MAX, dev);
 }
 
+/* Interrupt handler for the timer broadcast IPI */
+void tick_broadcast_ipi_handler(void)
+{
+}
+
 static void register_decrementer_clockevent(int cpu)
 {
struct clock_event_device *dec = &per_cpu(decrementers, cpu);
diff

[PATCH V5 1/8] powerpc: Free up the slot of PPC_MSG_CALL_FUNC_SINGLE IPI message

2014-01-15 Thread Preeti U Murthy
From: Srivatsa S. Bhat 

The IPI handlers for both PPC_MSG_CALL_FUNC and PPC_MSG_CALL_FUNC_SINGLE map
to a common implementation - generic_smp_call_function_single_interrupt(). So,
we can consolidate them and save one of the IPI message slots, (which are
precious on powerpc, since only 4 of those slots are available).

So, implement the functionality of PPC_MSG_CALL_FUNC_SINGLE using
PPC_MSG_CALL_FUNC itself and release its IPI message slot, so that it can be
used for something else in the future, if desired.

Signed-off-by: Srivatsa S. Bhat 
Signed-off-by: Preeti U. Murthy 
Acked-by: Geoff Levand  [For the PS3 part]
---

 arch/powerpc/include/asm/smp.h  |2 +-
 arch/powerpc/kernel/smp.c   |   12 +---
 arch/powerpc/platforms/cell/interrupt.c |2 +-
 arch/powerpc/platforms/ps3/smp.c|2 +-
 4 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
index 084e080..9f7356b 100644
--- a/arch/powerpc/include/asm/smp.h
+++ b/arch/powerpc/include/asm/smp.h
@@ -120,7 +120,7 @@ extern int cpu_to_core_id(int cpu);
  * in /proc/interrupts will be wrong!!! --Troy */
 #define PPC_MSG_CALL_FUNCTION   0
 #define PPC_MSG_RESCHEDULE  1
-#define PPC_MSG_CALL_FUNC_SINGLE   2
+#define PPC_MSG_UNUSED 2
 #define PPC_MSG_DEBUGGER_BREAK  3
 
 /* for irq controllers that have dedicated ipis per message (4) */
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index a3b64f3..c2bd8d6 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -145,9 +145,9 @@ static irqreturn_t reschedule_action(int irq, void *data)
return IRQ_HANDLED;
 }
 
-static irqreturn_t call_function_single_action(int irq, void *data)
+static irqreturn_t unused_action(int irq, void *data)
 {
-   generic_smp_call_function_single_interrupt();
+   /* This slot is unused and hence available for use, if needed */
return IRQ_HANDLED;
 }
 
@@ -168,14 +168,14 @@ static irqreturn_t debug_ipi_action(int irq, void *data)
 static irq_handler_t smp_ipi_action[] = {
[PPC_MSG_CALL_FUNCTION] =  call_function_action,
[PPC_MSG_RESCHEDULE] = reschedule_action,
-   [PPC_MSG_CALL_FUNC_SINGLE] = call_function_single_action,
+   [PPC_MSG_UNUSED] = unused_action,
[PPC_MSG_DEBUGGER_BREAK] = debug_ipi_action,
 };
 
 const char *smp_ipi_name[] = {
[PPC_MSG_CALL_FUNCTION] =  "ipi call function",
[PPC_MSG_RESCHEDULE] = "ipi reschedule",
-   [PPC_MSG_CALL_FUNC_SINGLE] = "ipi call function single",
+   [PPC_MSG_UNUSED] = "ipi unused",
[PPC_MSG_DEBUGGER_BREAK] = "ipi debugger",
 };
 
@@ -251,8 +251,6 @@ irqreturn_t smp_ipi_demux(void)
generic_smp_call_function_interrupt();
if (all & IPI_MESSAGE(PPC_MSG_RESCHEDULE))
scheduler_ipi();
-   if (all & IPI_MESSAGE(PPC_MSG_CALL_FUNC_SINGLE))
-   generic_smp_call_function_single_interrupt();
if (all & IPI_MESSAGE(PPC_MSG_DEBUGGER_BREAK))
debug_ipi_action(0, NULL);
} while (info->messages);
@@ -280,7 +278,7 @@ EXPORT_SYMBOL_GPL(smp_send_reschedule);
 
 void arch_send_call_function_single_ipi(int cpu)
 {
-   do_message_pass(cpu, PPC_MSG_CALL_FUNC_SINGLE);
+   do_message_pass(cpu, PPC_MSG_CALL_FUNCTION);
 }
 
 void arch_send_call_function_ipi_mask(const struct cpumask *mask)
diff --git a/arch/powerpc/platforms/cell/interrupt.c b/arch/powerpc/platforms/cell/interrupt.c
index 2d42f3b..adf3726 100644
--- a/arch/powerpc/platforms/cell/interrupt.c
+++ b/arch/powerpc/platforms/cell/interrupt.c
@@ -215,7 +215,7 @@ void iic_request_IPIs(void)
 {
iic_request_ipi(PPC_MSG_CALL_FUNCTION);
iic_request_ipi(PPC_MSG_RESCHEDULE);
-   iic_request_ipi(PPC_MSG_CALL_FUNC_SINGLE);
+   iic_request_ipi(PPC_MSG_UNUSED);
iic_request_ipi(PPC_MSG_DEBUGGER_BREAK);
 }
 
diff --git a/arch/powerpc/platforms/ps3/smp.c b/arch/powerpc/platforms/ps3/smp.c
index 4b35166..00d1a7c 100644
--- a/arch/powerpc/platforms/ps3/smp.c
+++ b/arch/powerpc/platforms/ps3/smp.c
@@ -76,7 +76,7 @@ static int __init ps3_smp_probe(void)
 
BUILD_BUG_ON(PPC_MSG_CALL_FUNCTION!= 0);
BUILD_BUG_ON(PPC_MSG_RESCHEDULE   != 1);
-   BUILD_BUG_ON(PPC_MSG_CALL_FUNC_SINGLE != 2);
+   BUILD_BUG_ON(PPC_MSG_UNUSED   != 2);
BUILD_BUG_ON(PPC_MSG_DEBUGGER_BREAK   != 3);
 
for (i = 0; i < MSG_COUNT; i++) {



[PATCH V5 0/8] cpuidle/ppc: Enable deep idle states on PowerNV

2014-01-15 Thread Preeti U Murthy
ted broadcast cpu on
hotplug of the old one, instead of smp_call_function_single(). This is because
interrupts are disabled at this point and we should not be using
smp_call_function_single() or its children in this context to send an IPI.

6. Move GENERIC_CLOCKEVENTS_BROADCAST to arch/powerpc/Kconfig.

7. Fix coding style issues.


Changes in V2:
-
https://lkml.org/lkml/2013/8/14/239

1. Dynamically pick a broadcast CPU, instead of having a dedicated one.
2. Remove the constraint of having to disable tickless idle on the broadcast
CPU by queueing a hrtimer dedicated to do broadcast.



V1 posting: https://lkml.org/lkml/2013/7/25/740.

1. Added the infrastructure to wakeup CPUs in deep idle states in which the
local timers stop.
---

Preeti U Murthy (4):
  cpuidle/ppc: Split timer_interrupt() into timer handling and interrupt handling routines
  time/cpuidle: Support in tick broadcast framework in the absence of external clock device
  cpuidle/powernv: Add "Fast-Sleep" CPU idle state
  cpuidle/powernv: Parse device tree to setup idle states

Srivatsa S. Bhat (2):
  powerpc: Free up the slot of PPC_MSG_CALL_FUNC_SINGLE IPI message
  powerpc: Implement tick broadcast IPI as a fixed IPI message

Vaidyanathan Srinivasan (2):
  powernv/cpuidle: Add context management for Fast Sleep
  powermgt: Add OPAL call to resync timebase on wakeup


 arch/powerpc/Kconfig   |2 
 arch/powerpc/include/asm/opal.h|2 
 arch/powerpc/include/asm/processor.h   |1 
 arch/powerpc/include/asm/smp.h |2 
 arch/powerpc/include/asm/time.h|1 
 arch/powerpc/kernel/exceptions-64s.S   |   10 +
 arch/powerpc/kernel/idle_power7.S  |   90 +--
 arch/powerpc/kernel/smp.c  |   23 ++-
 arch/powerpc/kernel/time.c |   80 ++
 arch/powerpc/platforms/cell/interrupt.c|2 
 arch/powerpc/platforms/powernv/opal-wrappers.S |1 
 arch/powerpc/platforms/ps3/smp.c   |2 
 drivers/cpuidle/cpuidle-powernv.c  |  106 -
 include/linux/clockchips.h |4 -
 kernel/time/clockevents.c  |9 +
 kernel/time/tick-broadcast.c   |  192 ++--
 kernel/time/tick-internal.h|8 +
 17 files changed, 434 insertions(+), 101 deletions(-)

-- 



Re: Disable sleep states on P7+

2014-01-14 Thread Preeti U Murthy
Hi Steven,

On 01/14/2014 08:06 PM, Steven Pratt wrote:
> I am looking for info on when and how we are able to disable power saving 
> features of current (P7, P7+) chips in order to reduce latency. This is often 
> done in latency sensitive applications when power consumption is not an 
> issue. On Intel boxes we can disable P-state frequency changes as well as 
> disabling C-State or sleep state changes. In fact we can control how deep a 
> sleep the processor can go into.  I know we have control Dynamic Processor 
> Scaling and Idle Power Savings, but what states do these really affect?  Can 
> I really disable Nap mode of a processor? If so how?  Can I disable even the 
> lightest winkle mode?  Looking for current information (read RHEL 6 and 
> SLES11), future changes are interesting.
> 
> Steve

I can answer this question with respect to cpuidle on PowerNV platforms.

1. In order to disable cpuidle state management altogether, one can pass
the powersave=off kernel command line parameter at boot. This will ensure
that each time a CPU has nothing to do, it simply drops to low thread
priority, which lowers power consumption to some extent but is not expected
to noticeably hurt application latency.

2. In order to control exactly which cpuidle states idle CPUs may enter at
runtime, one can use the per-state sysfs files
/sys/devices/system/cpu/cpuX/cpuidle/stateX/disable to selectively disable
any state.
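
As a concrete example (a hypothetical helper, nothing shipped with the
kernel), driving that sysfs knob from a small C program could look like:

#include <stdio.h>

/* Write 1 (disable) or 0 (enable) to a per-cpu, per-state "disable" file */
static int set_cpuidle_state_disable(int cpu, int state, int disable)
{
	char path[128];
	FILE *f;

	snprintf(path, sizeof(path),
		 "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/disable",
		 cpu, state);
	f = fopen(path, "w");
	if (!f)
		return -1;
	fprintf(f, "%d\n", disable);
	return fclose(f);
}

For instance, set_cpuidle_state_disable(0, 2, 1) would disable state2 for
cpu0, subject to the governor behaviour described below.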

However, if one is using the menu cpuidle governor, disabling an idle state
does not disable the idle states deeper than it; they remain active unless
they are specifically disabled. What this means is that one cannot cap the
depth of the idle states available to a CPU, although one can control the
exact set of idle states available to it.

But if the ladder governor is used, one can control the depth of the idle
states that a CPU can enter. The governor can be chosen by echoing either
menu or ladder to /sys/devices/system/cpu/cpuidle/current_governor_ro. The
cpuidle governor decides which idle state a CPU should enter based on its
idle history. The governor used by most archs is the menu governor.

Hence any of nap/sleep/winkle can be disabled. The code which enables the
above mentioned functionality on powernv is yet to go upstream, although the
equivalent is already upstream for the pseries platform and can be used there
to enable/disable its idle states.

Today on powernv the default idle state, nap, is entered all the time. One
can disable it by echoing 0 to /proc/sys/kernel/powersave_nap, in which case
the CPU just drops to low thread priority.

Thanks

Regards
Preeti U Murthy

> 


Re: [PATCH] cpuidle/menu: Fail cpuidle_idle_call() if no idle state is acceptable

2014-01-14 Thread Preeti U Murthy
On 01/14/2014 01:07 PM, Srivatsa S. Bhat wrote:
> On 01/14/2014 12:30 PM, Srivatsa S. Bhat wrote:
>> On 01/14/2014 11:35 AM, Preeti U Murthy wrote:
>>> On PowerPC, in a particular test scenario, all the cpu idle states were 
>>> disabled.
>>> Inspite of this it was observed that the idle state count of the shallowest
>>> idle state, snooze, was increasing.
>>>
>>> This is because the governor returns the idle state index as 0 even in
>>> scenarios when no idle state can be chosen. These scenarios could be when 
>>> the
>>> latency requirement is 0 or as mentioned above when the user wants to 
>>> disable
>>> certain cpu idle states at runtime. In the latter case, its possible that no
>>> cpu idle state is valid because the suitable states were disabled
>>> and the rest did not match the menu governor criteria to be chosen as the
>>> next idle state.
>>>
>>> This patch adds the code to indicate that a valid cpu idle state could not 
>>> be
>>> chosen by the menu governor and reports back to arch so that it can take 
>>> some
>>> default action.
>>>
>>
>> That sounds fair enough. However, the "default" action of pseries idle loop
>> (pseries_lpar_idle()) surprises me. It enters Cede, which is _deeper_ than 
>> doing
>> a snooze! IOW, a user might "disable" cpuidle or set the 
>> PM_QOS_CPU_DMA_LATENCY
>> to 0 hoping to prevent the CPUs from going to deep idle states, but then the
>> machine would still end up going to Cede, even though that wont get reflected
>> in the idle state counts. IMHO that scenario needs some thought as well...
>>
> 
> I checked the git history and found that the default idle was changed (on 
> purpose)
> to cede the processor, in order to speed up booting.. Hmm..
> 
> commit 363edbe2614aa90df706c0f19ccfa2a6c06af0be
> Author: Vaidyanathan Srinivasan 
> Date:   Fri Sep 6 00:25:06 2013 +0530
> 
> powerpc: Default arch idle could cede processor on pseries

This issue is not powerpc specific, as I observed after digging a bit into
the default idle routines of the common archs. The way archs perceive a
failed call into the cpuidle framework today is that the cpuidle backend
driver failed to *function* for some reason (as mentioned in the above
commit: either the cpuidle driver is not registered or it does not work on
some specific platforms), and that therefore the arch should decide on an
idle state itself. They therefore end up choosing a convenient idle state,
which could very well be one of the idle states in the cpuidle state table.

The archs do not see a failed call to the cpuidle driver as "the cpuidle
driver says no idle state can be entered now because there are strict latency
requirements or the idle states are disabled". IOW, the call to the cpuidle
driver is currently gated on whether a cpuidle driver exists, rather than on
whether it agrees to entry into any of the idle states.

This patch brings in the need for the archs to incorporate an additional
check: "did cpuidle_idle_call() fail because it did not find it wise to enter
any of the idle states?" In that case they should simply exit without taking
any *default action*.
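
A minimal sketch of the check being argued for, purely illustrative:
arch_default_idle() below is a stand-in for whatever default an architecture
uses today, and the irq-enable detail discussed earlier in this thread is
ignored.

static void arch_idle_sketch(void)
{
	int ret = cpuidle_idle_call();

	if (!ret)
		return;		/* cpuidle entered and exited an idle state */

	if (ret == -EINVAL)
		return;		/* governor refused every state: no default action */

	arch_default_idle();	/* cpuidle itself is unavailable: fall back */
}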

Need to give this some thought and reconsider the patch.

Regards
Preeti U Murthy
> 
> 
> Regards,
> Srivatsa S. Bhat
> 



Re: [PATCH] cpuidle/menu: Fail cpuidle_idle_call() if no idle state is acceptable

2014-01-14 Thread Preeti U Murthy
Hi Srivatsa,

On 01/14/2014 12:30 PM, Srivatsa S. Bhat wrote:
> On 01/14/2014 11:35 AM, Preeti U Murthy wrote:
>> On PowerPC, in a particular test scenario, all the cpu idle states were 
>> disabled.
>> Inspite of this it was observed that the idle state count of the shallowest
>> idle state, snooze, was increasing.
>>
>> This is because the governor returns the idle state index as 0 even in
>> scenarios when no idle state can be chosen. These scenarios could be when the
>> latency requirement is 0 or as mentioned above when the user wants to disable
>> certain cpu idle states at runtime. In the latter case, its possible that no
>> cpu idle state is valid because the suitable states were disabled
>> and the rest did not match the menu governor criteria to be chosen as the
>> next idle state.
>>
>> This patch adds the code to indicate that a valid cpu idle state could not be
>> chosen by the menu governor and reports back to arch so that it can take some
>> default action.
>>
> 
> That sounds fair enough. However, the "default" action of pseries idle loop
> (pseries_lpar_idle()) surprises me. It enters Cede, which is _deeper_ than 
> doing
> a snooze! IOW, a user might "disable" cpuidle or set the 
> PM_QOS_CPU_DMA_LATENCY
> to 0 hoping to prevent the CPUs from going to deep idle states, but then the
> machine would still end up going to Cede, even though that wont get reflected
> in the idle state counts. IMHO that scenario needs some thought as well...

Yes I did see this, but since the patch intends to only communicate
whether the cpuidle governor was successful in choosing an idle state on
its part, I wished to address the default action of pseries idle loop
separately. You are right we will need to understand the patch which
introduced this action. I will take a look at it.

> 
>> Signed-off-by: Preeti U Murthy 
>> ---
>>
>>  drivers/cpuidle/cpuidle.c|6 +-
>>  drivers/cpuidle/governors/menu.c |7 ---
>>  2 files changed, 9 insertions(+), 4 deletions(-)
>>
>> diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
>> index a55e68f..5bf06bb 100644
>> --- a/drivers/cpuidle/cpuidle.c
>> +++ b/drivers/cpuidle/cpuidle.c
>> @@ -131,8 +131,9 @@ int cpuidle_idle_call(void)
>>
>>  /* ask the governor for the next state */
>>  next_state = cpuidle_curr_governor->select(drv, dev);
>> +
>> +dev->last_residency = 0;
>>  if (need_resched()) {
>> -dev->last_residency = 0;
>>  /* give the governor an opportunity to reflect on the outcome */
>>  if (cpuidle_curr_governor->reflect)
>>  cpuidle_curr_governor->reflect(dev, next_state);
> 
> The comments on top of the .reflect() routines of the governors say that the
> second parameter is the index of the actual state entered. But after this 
> patch,
> next_state can be negative, indicating an invalid index. So those comments 
> need
> to be updated accordingly.

Right, I will take care of the comment in the next post.
> 
>> @@ -140,6 +141,9 @@ int cpuidle_idle_call(void)
>>  return 0;
>>  }
>>
>> +if (next_state < 0)
>> +return -EINVAL;
> 
> The exit path above (due to need_resched) returns with irqs enabled, but the 
> new
> one you are adding (next_state < 0) returns with irqs disabled. This is 
> correct,
> because in the latter case, "idle" is still in progress and the arch will 
> choose
> a default handler to execute (unlike the former case where "idle" is over and
> hence its time to enable interrupts).

Correct.
> 
> IMHO it would be good to add comments around this code to explain this subtle
> difference. We can never be too careful with these things... ;-)

Ok, will do so.
> 
>> +
>>  trace_cpu_idle_rcuidle(next_state, dev->cpu);
>>
>>  broadcast = !!(drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP);
>> diff --git a/drivers/cpuidle/governors/menu.c 
>> b/drivers/cpuidle/governors/menu.c
>> index cf7f2f0..6921543 100644
>> --- a/drivers/cpuidle/governors/menu.c
>> +++ b/drivers/cpuidle/governors/menu.c
>> @@ -283,6 +283,7 @@ again:
>>   * menu_select - selects the next idle state to enter
>>   * @drv: cpuidle driver containing state data
>>   * @dev: the CPU
>> + * Returns -1 when no idle state is suitable
>>   */
>>  static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device 
>> *dev)
>>  {
>> @@ -292,17 +293,17 @@ static int menu_select(struct cpuidle_d

[PATCH] cpuidle/menu: Fail cpuidle_idle_call() if no idle state is acceptable

2014-01-13 Thread Preeti U Murthy
On PowerPC, in a particular test scenario, all the cpu idle states were
disabled. In spite of this it was observed that the idle state count of the
shallowest idle state, snooze, was increasing.

This is because the governor returns the idle state index as 0 even in
scenarios when no idle state can be chosen. These scenarios could be when the
latency requirement is 0 or, as mentioned above, when the user wants to
disable certain cpu idle states at runtime. In the latter case, it's possible
that no cpu idle state is valid because the suitable states were disabled and
the rest did not match the menu governor criteria to be chosen as the next
idle state.

This patch adds the code to indicate that a valid cpu idle state could not be
chosen by the menu governor and reports back to arch so that it can take some
default action.

Signed-off-by: Preeti U Murthy 
---

 drivers/cpuidle/cpuidle.c|6 +-
 drivers/cpuidle/governors/menu.c |7 ---
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index a55e68f..5bf06bb 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -131,8 +131,9 @@ int cpuidle_idle_call(void)
 
/* ask the governor for the next state */
next_state = cpuidle_curr_governor->select(drv, dev);
+
+   dev->last_residency = 0;
if (need_resched()) {
-   dev->last_residency = 0;
/* give the governor an opportunity to reflect on the outcome */
if (cpuidle_curr_governor->reflect)
cpuidle_curr_governor->reflect(dev, next_state);
@@ -140,6 +141,9 @@ int cpuidle_idle_call(void)
return 0;
}
 
+   if (next_state < 0)
+   return -EINVAL;
+
trace_cpu_idle_rcuidle(next_state, dev->cpu);
 
broadcast = !!(drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP);
diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index cf7f2f0..6921543 100644
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -283,6 +283,7 @@ again:
  * menu_select - selects the next idle state to enter
  * @drv: cpuidle driver containing state data
  * @dev: the CPU
+ * Returns -1 when no idle state is suitable
  */
 static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
 {
@@ -292,17 +293,17 @@ static int menu_select(struct cpuidle_driver *drv, struct 
cpuidle_device *dev)
int multiplier;
struct timespec t;
 
-   if (data->needs_update) {
+   if (data->last_state_idx >= 0 && data->needs_update) {
menu_update(drv, dev);
data->needs_update = 0;
}
 
-   data->last_state_idx = 0;
+   data->last_state_idx = -1;
data->exit_us = 0;
 
/* Special case when user has set very strict latency requirement */
if (unlikely(latency_req == 0))
-   return 0;
+   return data->last_state_idx;
 
/* determine the expected residency time, round up */
t = ktime_to_timespec(tick_nohz_get_sleep_length());



Re: [PATCH] pseries/cpuidle: Remove redundant call to ppc64_runlatch_off() in cpu idle routines

2014-01-12 Thread Preeti U Murthy
Hi Mikey

I have the patch with the changelog according to your suggestion below.

Thanks

On 01/13/2014 11:08 AM, Michael Ellerman wrote:
> On Thu, 2014-01-09 at 10:35 +0530, Preeti U Murthy wrote:
>> Commit fbd7740fdfdf9475f switched pseries cpu idle handling from complete 
>> idle
>> loops to ppc_md.powersave functions. Earlier to this switch,
>> ppc64_runlatch_off() had to be called in each of the idle routines. But after
>> the switch this call is handled in arch_cpu_idle(),just before the call
>> to ppc_md.powersave, where platform specific idle routines are called.
>>
>> As a consequence, the call to ppc64_runlatch_off() got duplicated in the
>> arch_cpu_idle() routine as well as in the some of the idle routines in
>> pseries and commit fbd7740fdfdf9475f missed to get rid of these redundant
>> calls. These calls were carried over subsequent enhancements to the pseries
>> cpuidle routines. This patch takes care of eliminating this redundancy.
> 
> It's "obvious" that turning the runlatch off multiple times is harmless,
> although it adds extra overhead, but please spell that out in the changelog.
> 
> cheers
> 
> 

pseries/cpuidle: Remove redundant call to ppc64_runlatch_off() in cpu idle 
routines

From: Preeti U Murthy 

Commit fbd7740fdfdf9475f (powerpc: Simplify pSeries idle loop) switched
pseries cpu idle handling from complete idle loops to ppc_md.powersave
functions. Prior to this switch, ppc64_runlatch_off() had to be called in
each of the idle routines. But after the switch, this call is handled in
arch_cpu_idle(), just before the call to ppc_md.powersave, where platform
specific idle routines are called.

As a consequence, the call to ppc64_runlatch_off() got duplicated in the
arch_cpu_idle() routine as well as in some of the idle routines in pseries,
and commit fbd7740fdfdf9475f failed to remove these redundant calls. These
calls were then carried over by subsequent enhancements to the pseries
cpuidle routines.

Although multiple calls to ppc64_runlatch_off() are harmless, they do add
some overhead. Besides that, these calls could also give the impression that
it is *necessary* to call ppc64_runlatch_off() multiple times, when that is
not the case. Hence this patch eliminates this redundancy.

Signed-off-by: Preeti U Murthy 
---
 arch/powerpc/platforms/pseries/processor_idle.c |3 ---
 1 file changed, 3 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/processor_idle.c b/arch/powerpc/platforms/pseries/processor_idle.c
index a166e38..09e4f56 100644
--- a/arch/powerpc/platforms/pseries/processor_idle.c
+++ b/arch/powerpc/platforms/pseries/processor_idle.c
@@ -17,7 +17,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 
 struct cpuidle_driver pseries_idle_driver = {
@@ -63,7 +62,6 @@ static int snooze_loop(struct cpuidle_device *dev,
set_thread_flag(TIF_POLLING_NRFLAG);
 
while ((!need_resched()) && cpu_online(cpu)) {
-   ppc64_runlatch_off();
HMT_low();
HMT_very_low();
}
@@ -103,7 +101,6 @@ static int dedicated_cede_loop(struct cpuidle_device *dev,
idle_loop_prolog(&in_purr);
get_lppaca()->donate_dedicated_cpu = 1;
 
-   ppc64_runlatch_off();
HMT_medium();
check_and_cede_processor();


Regards
Preeti U Murthy 



Re: [PATCH] pseries/cpuidle: Remove redundant call to ppc64_runlatch_off() in cpu idle routines

2014-01-12 Thread Preeti U Murthy
Hi Deepthi,

On 01/13/2014 09:27 AM, Deepthi Dharwar wrote:
> On 01/09/2014 10:35 AM, Preeti U Murthy wrote:
>> Commit fbd7740fdfdf9475f switched pseries cpu idle handling from complete 
>> idle
>> loops to ppc_md.powersave functions. Earlier to this switch,
>> ppc64_runlatch_off() had to be called in each of the idle routines. But after
>> the switch this call is handled in arch_cpu_idle(),just before the call
>> to ppc_md.powersave, where platform specific idle routines are called.
>>
>> As a consequence, the call to ppc64_runlatch_off() got duplicated in the
>> arch_cpu_idle() routine as well as in the some of the idle routines in
>> pseries and commit fbd7740fdfdf9475f missed to get rid of these redundant
>> calls. These calls were carried over subsequent enhancements to the pseries
>> cpuidle routines. This patch takes care of eliminating this redundancy.
>>
>> Signed-off-by: Preeti U Murthy 
>> ---
> 
> Acked-by: Deepthi Dharwar 
> 
> Preeti, I will include this patch as part of the pseries cpuidle driver
> clean-ups series which I have undertaken.

Yes that would be great, thanks!

Regards
Preeti U Murthy
> 
> Regards,
> Deepthi
> 
>>  arch/powerpc/platforms/pseries/processor_idle.c |3 ---
>>  1 file changed, 3 deletions(-)
>>
>> diff --git a/arch/powerpc/platforms/pseries/processor_idle.c 
>> b/arch/powerpc/platforms/pseries/processor_idle.c
>> index a166e38..09e4f56 100644
>> --- a/arch/powerpc/platforms/pseries/processor_idle.c
>> +++ b/arch/powerpc/platforms/pseries/processor_idle.c
>> @@ -17,7 +17,6 @@
>>  #include 
>>  #include 
>>  #include 
>> -#include 
>>  #include 
>>
>>  struct cpuidle_driver pseries_idle_driver = {
>> @@ -63,7 +62,6 @@ static int snooze_loop(struct cpuidle_device *dev,
>>  set_thread_flag(TIF_POLLING_NRFLAG);
>>
>>  while ((!need_resched()) && cpu_online(cpu)) {
>> -ppc64_runlatch_off();
>>  HMT_low();
>>  HMT_very_low();
>>  }
>> @@ -103,7 +101,6 @@ static int dedicated_cede_loop(struct cpuidle_device 
>> *dev,
>>  idle_loop_prolog(&in_purr);
>>  get_lppaca()->donate_dedicated_cpu = 1;
>>
>> -ppc64_runlatch_off();
>>  HMT_medium();
>>  check_and_cede_processor();
>>
>>
> 



[PATCH] pseries/cpuidle: Remove redundant call to ppc64_runlatch_off() in cpu idle routines

2014-01-08 Thread Preeti U Murthy
Commit fbd7740fdfdf9475f switched pseries cpu idle handling from complete
idle loops to ppc_md.powersave functions. Prior to this switch,
ppc64_runlatch_off() had to be called in each of the idle routines. But after
the switch this call is handled in arch_cpu_idle(), just before the call to
ppc_md.powersave, where platform specific idle routines are called.

As a consequence, the call to ppc64_runlatch_off() got duplicated in the
arch_cpu_idle() routine as well as in some of the idle routines in pseries,
and commit fbd7740fdfdf9475f failed to remove these redundant calls. These
calls were then carried over by subsequent enhancements to the pseries
cpuidle routines. This patch eliminates this redundancy.

Signed-off-by: Preeti U Murthy 
---

 arch/powerpc/platforms/pseries/processor_idle.c |3 ---
 1 file changed, 3 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/processor_idle.c b/arch/powerpc/platforms/pseries/processor_idle.c
index a166e38..09e4f56 100644
--- a/arch/powerpc/platforms/pseries/processor_idle.c
+++ b/arch/powerpc/platforms/pseries/processor_idle.c
@@ -17,7 +17,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 
 struct cpuidle_driver pseries_idle_driver = {
@@ -63,7 +62,6 @@ static int snooze_loop(struct cpuidle_device *dev,
set_thread_flag(TIF_POLLING_NRFLAG);
 
while ((!need_resched()) && cpu_online(cpu)) {
-   ppc64_runlatch_off();
HMT_low();
HMT_very_low();
}
@@ -103,7 +101,6 @@ static int dedicated_cede_loop(struct cpuidle_device *dev,
idle_loop_prolog(&in_purr);
get_lppaca()->donate_dedicated_cpu = 1;
 
-   ppc64_runlatch_off();
HMT_medium();
check_and_cede_processor();
 



[PATCH V2] time/cpuidle: Support in tick broadcast framework for archs without external clock device

2013-12-31 Thread Preeti U Murthy
On some architectures, in certain CPU deep idle states the local timers stop.
An external clock device is used to wake up these CPUs. The kernel support for
the wakeup of these CPUs is provided by the tick broadcast framework, which
uses the external clock device as the wakeup source.

However, not all implementations provide such an external clock device; some
PowerPC ones, for example, do not. This patch includes support in the
broadcast framework to handle the wakeup of CPUs in deep idle states on such
systems by queuing a hrtimer on one of the CPUs, which is dedicated to waking
up the CPUs in deep idle. This CPU is identified as the bc_cpu.

Each time the hrtimer expires, it is reprogrammed for the next wakeup of the
CPUs in deep idle state after handling broadcast. However when a CPU is about
to enter  deep idle state with its wakeup time earlier than the time at which
the hrtimer is currently programmed, it *becomes the new bc_cpu* and restarts
the hrtimer on itself. This way the job of doing broadcast is handed around to
the CPUs that ask for the earliest wakeup just before entering deep idle
state. This is consistent with what happens in cases where an external clock
device is present. The smp affinity of this clock device is set to the CPU
with the earliest wakeup.

The important point here is that the bc_cpu cannot enter deep idle state
since it has a hrtimer queued to wakeup the other CPUs in deep idle. Hence it
cannot have its local timer stopped. Therefore for such a CPU, the
BROADCAST_ENTER notification has to fail implying that it cannot enter deep
idle state. On architectures where an external clock device is present, all
CPUs can enter deep idle.

During hotplug of the bc_cpu, the job of doing a broadcast is assigned to the
first cpu in the broadcast mask. This newly nominated bc_cpu is woken up by
an IPI so as to queue the above mentioned hrtimer on itself.
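
The handover rule described above amounts to roughly the following (a minimal
sketch with a made-up helper name; in the patch the actual decision is made
under tick_broadcast_lock in tick-broadcast.c, using the bc_cpu, bc_hrtimer
and bc_next_wakeup variables added below):

/*
 * Sketch only: a CPU about to enter deep idle takes over broadcast duty
 * if its wakeup is due before the currently programmed broadcast expiry.
 * The new bc_cpu then stays out of deep idle (its BROADCAST_ENTER fails).
 */
static void maybe_become_bc_cpu(int cpu, ktime_t my_next_wakeup)
{
        if (bc_cpu == -1 ||
            ktime_to_ns(my_next_wakeup) < ktime_to_ns(bc_next_wakeup)) {
                bc_cpu = cpu;
                bc_next_wakeup = my_next_wakeup;
                hrtimer_start(bc_hrtimer, bc_next_wakeup,
                              HRTIMER_MODE_ABS_PINNED);
        }
}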

Changes from V1: https://lkml.org/lkml/2013/12/12/687

Today, if idle states exist in which the local timers of CPUs stop and
there is no external clock device to handle their wakeups, the kernel switches
the tick mode to periodic so as to prevent the CPUs from entering such idle
states altogether. Therefore, include an additional check consistent with this
patch: if an external clock device does not exist, queue a hrtimer to handle
wakeups; only if this also fails, switch the tick mode to periodic.

Signed-off-by: Preeti U Murthy 
---

 include/linux/clockchips.h   |4 -
 kernel/time/clockevents.c|8 +-
 kernel/time/tick-broadcast.c |  180 ++
 kernel/time/tick-internal.h  |8 +-
 4 files changed, 173 insertions(+), 27 deletions(-)

diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h
index 493aa02..bbda37b 100644
--- a/include/linux/clockchips.h
+++ b/include/linux/clockchips.h
@@ -186,9 +186,9 @@ static inline int tick_check_broadcast_expired(void) { return 0; }
 #endif
 
 #ifdef CONFIG_GENERIC_CLOCKEVENTS
-extern void clockevents_notify(unsigned long reason, void *arg);
+extern int clockevents_notify(unsigned long reason, void *arg);
 #else
-static inline void clockevents_notify(unsigned long reason, void *arg) {}
+static inline int clockevents_notify(unsigned long reason, void *arg) { return 0; }
 #endif
 
 #else /* CONFIG_GENERIC_CLOCKEVENTS_BUILD */
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 086ad60..bbbd671 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -525,11 +525,11 @@ void clockevents_resume(void)
 /**
  * clockevents_notify - notification about relevant events
  */
-void clockevents_notify(unsigned long reason, void *arg)
+int clockevents_notify(unsigned long reason, void *arg)
 {
struct clock_event_device *dev, *tmp;
unsigned long flags;
-   int cpu;
+   int cpu, ret = 0;
 
raw_spin_lock_irqsave(&clockevents_lock, flags);
 
@@ -542,11 +542,12 @@ void clockevents_notify(unsigned long reason, void *arg)
 
case CLOCK_EVT_NOTIFY_BROADCAST_ENTER:
case CLOCK_EVT_NOTIFY_BROADCAST_EXIT:
-   tick_broadcast_oneshot_control(reason);
+   ret = tick_broadcast_oneshot_control(reason);
break;
 
case CLOCK_EVT_NOTIFY_CPU_DYING:
tick_handover_do_timer(arg);
+   tick_handover_bc_cpu(arg);
break;
 
case CLOCK_EVT_NOTIFY_SUSPEND:
@@ -585,6 +586,7 @@ void clockevents_notify(unsigned long reason, void *arg)
break;
}
raw_spin_unlock_irqrestore(&clockevents_lock, flags);
+   return ret;
 }
 EXPORT_SYMBOL_GPL(clockevents_notify);
 
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 9532690..1755984 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -20,6 +20,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "tick-internal.h"
 
@@ -35,6 +36,11 @@ static cpumask_var_

Re: [RFC PATCH] time: Support in tick broadcast framework for archs without an external wakeup source

2013-12-15 Thread Preeti U Murthy
Hi,

The patch had some compile time fixes to be done. It was accidentally mailed
out before doing so. Below is the right patch. Apologies for the same.

Thanks

Regards
Preeti U Murthy

-

time: Support in tick broadcast framework for archs without an external wakeup 
source

From: Preeti U Murthy 

On some architectures, in certain CPU deep idle states the local timers stop.
An external clock device is used to wake up these CPUs. The kernel support for
the wakeup of these CPUs is provided by the tick broadcast framework, which
uses the external clock device as the wakeup source.

However on architectures like PowerPC there is no external clock device. This
patch includes support in the broadcast framework to handle the wakeup of the
CPUs in deep idle states on such architectures by queuing a hrtimer on one of
the CPUs, meant to handle the wakeup of CPUs in deep idle states. This CPU is
identified as the bc_cpu.

Each time the hrtimer expires, it is reprogrammed for the next wakeup of the
CPUs in deep idle state after handling broadcast. However when a CPU is about
to enter  deep idle state with its wakeup time earlier than the time at which
the hrtimer is currently programmed, it *becomes the new bc_cpu* and restarts
the hrtimer on itself. This way the job of doing broadcast is handed around to
the CPUs that ask for the earliest wakeup just before entering deep idle
state. This is consistent with what happens in cases where an external clock
device is present. The smp affinity of this clock device is set to the CPU
with the earliest wakeup.

The important point here is that the bc_cpu cannot enter deep idle state
since it has a hrtimer queued to wakeup the other CPUs in deep idle. Hence it
cannot have its local timer stopped. Therefore for such a CPU, the
BROADCAST_ENTER notification has to fail implying that it cannot enter deep
idle state. On architectures where an external clock device is present, all
CPUs can enter deep idle.

During hotplug of the bc_cpu, the job of doing a broadcast is assigned to the
first cpu in the broadcast mask. This newly nominated bc_cpu is woken up by
an IPI so as to queue the above mentioned hrtimer on itself.

This patch is compile tested only.

Signed-off-by: Preeti U Murthy 
---
 include/linux/clockchips.h   |4 +
 kernel/time/clockevents.c|8 +-
 kernel/time/tick-broadcast.c |  161 +-
 kernel/time/tick-internal.h  |8 +-
 4 files changed, 155 insertions(+), 26 deletions(-)

diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h
index 493aa02..bbda37b 100644
--- a/include/linux/clockchips.h
+++ b/include/linux/clockchips.h
@@ -186,9 +186,9 @@ static inline int tick_check_broadcast_expired(void) { return 0; }
 #endif
 
 #ifdef CONFIG_GENERIC_CLOCKEVENTS
-extern void clockevents_notify(unsigned long reason, void *arg);
+extern int clockevents_notify(unsigned long reason, void *arg);
 #else
-static inline void clockevents_notify(unsigned long reason, void *arg) {}
+static inline int clockevents_notify(unsigned long reason, void *arg) { return 0; }
 #endif
 
 #else /* CONFIG_GENERIC_CLOCKEVENTS_BUILD */
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 086ad60..bbbd671 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -525,11 +525,11 @@ void clockevents_resume(void)
 /**
  * clockevents_notify - notification about relevant events
  */
-void clockevents_notify(unsigned long reason, void *arg)
+int clockevents_notify(unsigned long reason, void *arg)
 {
struct clock_event_device *dev, *tmp;
unsigned long flags;
-   int cpu;
+   int cpu, ret = 0;
 
raw_spin_lock_irqsave(&clockevents_lock, flags);
 
@@ -542,11 +542,12 @@ void clockevents_notify(unsigned long reason, void *arg)
 
case CLOCK_EVT_NOTIFY_BROADCAST_ENTER:
case CLOCK_EVT_NOTIFY_BROADCAST_EXIT:
-   tick_broadcast_oneshot_control(reason);
+   ret = tick_broadcast_oneshot_control(reason);
break;
 
case CLOCK_EVT_NOTIFY_CPU_DYING:
tick_handover_do_timer(arg);
+   tick_handover_bc_cpu(arg);
break;
 
case CLOCK_EVT_NOTIFY_SUSPEND:
@@ -585,6 +586,7 @@ void clockevents_notify(unsigned long reason, void *arg)
break;
}
raw_spin_unlock_irqrestore(&clockevents_lock, flags);
+   return ret;
 }
 EXPORT_SYMBOL_GPL(clockevents_notify);
 
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 9532690..4ba8abb 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -20,6 +20,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "tick-internal.h"
 
@@ -35,6 +36,10 @@ static cpumask_var_t tmpmask;
 static DEFINE_RAW_SPINLOCK(tick_broadcast_lock);
 static int tick_broadcast_force;
 
+static struct hrtimer *bc_hr

Re: [RFC PATCH] time: Support in tick broadcast framework for archs without an external wakeup source

2013-12-12 Thread Preeti U Murthy
Hi Ben,

On 12/13/2013 10:47 AM, Benjamin Herrenschmidt wrote:
> On Fri, 2013-12-13 at 09:49 +0530, Preeti U Murthy wrote:
>> On some architectures, in certain CPU deep idle states the local timers stop.
>> An external clock device is used to wakeup these CPUs. The kernel support 
>> for the
>> wakeup of these CPUs is provided by the tick broadcast framework by using the
>> external clock device as the wakeup source.
> 
>> However on architectures like PowerPC there is no external clock device. 
> 
> Minor nit ...
> 
> I wouldn't make this an architectural statement. Some PowerPC's do have
> external clock devices (for example the old MPIC interrupt controller
> had timers). In fact, if we really need it, I'm sure we *could* find
> something somewhere in P8 that could act as a timer, probably hijacking
> a bit of the OCC or similar but at this stage, that's not on the radar.
> 
> So make it an implementation statement. "However, not all
> implementations, such as some PowerPC ones, provide such an external
> timer ...".

Thanks for this information. I will update this going ahead.

Regards
Preeti U Murthy



[RFC PATCH] time: Support in tick broadcast framework for archs without an external wakeup source

2013-12-12 Thread Preeti U Murthy
On some architectures, in certain CPU deep idle states the local timers stop.
An external clock device is used to wake up these CPUs. The kernel support for
the wakeup of these CPUs is provided by the tick broadcast framework, which
uses the external clock device as the wakeup source.

However on architectures like PowerPC there is no external clock device. This
patch includes support in the broadcast framework to handle the wakeup of the
CPUs in deep idle states on such architectures by queuing a hrtimer on one of
the CPUs, meant to handle the wakeup of CPUs in deep idle states. This CPU is
identified as the bc_cpu.

Each time the hrtimer expires, it is reprogrammed for the next wakeup of the
CPUs in deep idle state after handling broadcast. However when a CPU is about
to enter  deep idle state with its wakeup time earlier than the time at which
the hrtimer is currently programmed, it *becomes the new bc_cpu* and restarts
the hrtimer on itself. This way the job of doing broadcast is handed around to
the CPUs that ask for the earliest wakeup just before entering deep idle
state. This is consistent with what happens in cases where an external clock
device is present. The smp affinity of this clock device is set to the CPU
with the earliest wakeup.

The important point here is that the bc_cpu cannot enter deep idle state
since it has a hrtimer queued to wakeup the other CPUs in deep idle. Hence it
cannot have its local timer stopped. Therefore for such a CPU, the
BROADCAST_ENTER notification has to fail implying that it cannot enter deep
idle state. On architectures where an external clock device is present, all
CPUs can enter deep idle.

During hotplug of the bc_cpu, the job of doing a broadcast is assigned to the
first cpu in the broadcast mask. This newly nominated bc_cpu is woken up by
an IPI so as to queue the above mentioned hrtimer on itself.

This patch is compile tested only.

Signed-off-by: Preeti U Murthy 
---

 include/linux/clockchips.h   |4 +
 kernel/time/clockevents.c|8 +-
 kernel/time/tick-broadcast.c |  157 ++
 kernel/time/tick-internal.h  |8 +-
 4 files changed, 153 insertions(+), 24 deletions(-)

diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h
index 493aa02..bbda37b 100644
--- a/include/linux/clockchips.h
+++ b/include/linux/clockchips.h
@@ -186,9 +186,9 @@ static inline int tick_check_broadcast_expired(void) { return 0; }
 #endif
 
 #ifdef CONFIG_GENERIC_CLOCKEVENTS
-extern void clockevents_notify(unsigned long reason, void *arg);
+extern int clockevents_notify(unsigned long reason, void *arg);
 #else
-static inline void clockevents_notify(unsigned long reason, void *arg) {}
+static inline int clockevents_notify(unsigned long reason, void *arg) { return 0; }
 #endif
 
 #else /* CONFIG_GENERIC_CLOCKEVENTS_BUILD */
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 086ad60..bbbd671 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -525,11 +525,11 @@ void clockevents_resume(void)
 /**
  * clockevents_notify - notification about relevant events
  */
-void clockevents_notify(unsigned long reason, void *arg)
+int clockevents_notify(unsigned long reason, void *arg)
 {
struct clock_event_device *dev, *tmp;
unsigned long flags;
-   int cpu;
+   int cpu, ret = 0;
 
raw_spin_lock_irqsave(&clockevents_lock, flags);
 
@@ -542,11 +542,12 @@ void clockevents_notify(unsigned long reason, void *arg)
 
case CLOCK_EVT_NOTIFY_BROADCAST_ENTER:
case CLOCK_EVT_NOTIFY_BROADCAST_EXIT:
-   tick_broadcast_oneshot_control(reason);
+   ret = tick_broadcast_oneshot_control(reason);
break;
 
case CLOCK_EVT_NOTIFY_CPU_DYING:
tick_handover_do_timer(arg);
+   tick_handover_bc_cpu(arg);
break;
 
case CLOCK_EVT_NOTIFY_SUSPEND:
@@ -585,6 +586,7 @@ void clockevents_notify(unsigned long reason, void *arg)
break;
}
raw_spin_unlock_irqrestore(&clockevents_lock, flags);
+   return ret;
 }
 EXPORT_SYMBOL_GPL(clockevents_notify);
 
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 9532690..f90b865 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -20,6 +20,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "tick-internal.h"
 
@@ -35,6 +36,10 @@ static cpumask_var_t tmpmask;
 static DEFINE_RAW_SPINLOCK(tick_broadcast_lock);
 static int tick_broadcast_force;
 
+static struct hrtimer *bc_hrtimer;
+static int bc_cpu = -1;
+static ktime_t bc_next_wakeup;
+
 #ifdef CONFIG_TICK_ONESHOT
 static void tick_broadcast_clear_oneshot(int cpu);
 #else
@@ -528,6 +533,20 @@ static int tick_broadcast_set_event(struct clock_event_device *bc, int cpu,
return ret;
 }
 
+static void tick_broadcast_set_next_wakeup(int cpu, ktime_t expires, int force)
+{
+   struct 

Re: [PATCH V4 6/9] cpuidle/ppc: Add basic infrastructure to enable the broadcast framework on ppc

2013-12-02 Thread Preeti U Murthy
Hi Thomas,

On 11/29/2013 05:28 PM, Thomas Gleixner wrote:
> On Fri, 29 Nov 2013, Preeti U Murthy wrote:
>> diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
>> index b44b52c..cafa788 100644
>> --- a/arch/powerpc/Kconfig
>> +++ b/arch/powerpc/Kconfig
>> @@ -129,6 +129,8 @@ config PPC
>>  select GENERIC_CMOS_UPDATE
>>  select GENERIC_TIME_VSYSCALL_OLD
>>  select GENERIC_CLOCKEVENTS
>> +select GENERIC_CLOCKEVENTS_BROADCAST
>> +select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
> 
> What's the point of this config switch? It's nowhere used.

When broadcast IPIs are to be sent, either the "broadcast" method
associated with the local timers is used, or an arch-specific function
tick_broadcast() is invoked. For the latter to be invoked, the
ARCH_HAS_TICK_BROADCAST config option needs to be set. On PowerPC, the
broadcast method is not associated with the local timer, hence we invoke
tick_broadcast(). This function has been added in [PATCH 2/9].
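
Roughly, that choice amounts to the following (a sketch with a made-up wrapper
name; the real selection happens inside kernel/time/tick-broadcast.c):

/* Sketch only: how a broadcast wakeup gets delivered */
static void dispatch_broadcast(struct clock_event_device *evt,
                               const struct cpumask *mask)
{
        if (evt->broadcast)
                evt->broadcast(mask);   /* ->broadcast method of the tick device */
        else
                tick_broadcast(mask);   /* arch hook, needs ARCH_HAS_TICK_BROADCAST */
}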
> 
>> +static int broadcast_set_next_event(unsigned long evt,
>> +struct clock_event_device *dev)
>> +{
>> +return 0;
>> +}
>> +
>> +static void broadcast_set_mode(enum clock_event_mode mode,
>> + struct clock_event_device *dev)
>> +{
>> +if (mode != CLOCK_EVT_MODE_ONESHOT)
>> +broadcast_set_next_event(DECREMENTER_MAX, dev);
> 
> What's the point of calling an empty function?  

You are right, this should have remained a dummy function like
broadcast_set_next_event() as per the design of this patchset.
> 
>> +}
>> +
>>  static void decrementer_set_mode(enum clock_event_mode mode,
>>   struct clock_event_device *dev)
>>  {
>> @@ -840,6 +869,19 @@ static void register_decrementer_clockevent(int cpu)
>>  clockevents_register_device(dec);
>>  }
>>  
>> +static void register_broadcast_clockevent(int cpu)
>> +{
>> +struct clock_event_device *bc_evt = &bc_timer;
>> +
>> +*bc_evt = broadcast_clockevent;
>> +bc_evt->cpumask = cpu_possible_mask;
>> +
>> +printk_once(KERN_DEBUG "clockevent: %s mult[%x] shift[%d] cpu[%d]\n",
>> +bc_evt->name, bc_evt->mult, bc_evt->shift, cpu);
>> +
>> +clockevents_register_device(bc_evt);
>> +}
>> +
>>  static void __init init_decrementer_clockevent(void)
>>  {
>>  int cpu = smp_processor_id();
>> @@ -854,6 +896,19 @@ static void __init init_decrementer_clockevent(void)
>>  register_decrementer_clockevent(cpu);
>>  }
>>  
>> +static void __init init_broadcast_clockevent(void)
>> +{
>> +int cpu = smp_processor_id();
>> +
>> +clockevents_calc_mult_shift(&broadcast_clockevent, ppc_tb_freq, 4);
>> +
>> +broadcast_clockevent.max_delta_ns =
>> +clockevent_delta2ns(DECREMENTER_MAX, &broadcast_clockevent);
>> +broadcast_clockevent.min_delta_ns =
>> +clockevent_delta2ns(2, &broadcast_clockevent);
> 
> clockevents_config()

Right, I will change this to call clockevents_config(). I see that this
needs to be done during the initialization of the decrementer as well.
Will do the same.

Thank you

Regards
Preeti U Murthy
> 
> Thanks,
> 
>   tglx
> 



Re: [PATCH V4 7/9] cpuidle/powernv: Add "Fast-Sleep" CPU idle state

2013-12-02 Thread Preeti U Murthy
Hi Thomas,

On 11/29/2013 08:09 PM, Thomas Gleixner wrote:
> On Fri, 29 Nov 2013, Preeti U Murthy wrote:
>> +static enum hrtimer_restart handle_broadcast(struct hrtimer *hrtimer)
>> +{
>> +struct clock_event_device *bc_evt = &bc_timer;
>> +ktime_t interval, next_bc_tick, now;
>> +
>> +now = ktime_get();
>> +
>> +if (!restart_broadcast(bc_evt))
>> +return HRTIMER_NORESTART;
>> +
>> +interval = ktime_sub(bc_evt->next_event, now);
>> +next_bc_tick = get_next_bc_tick();
> 
> So you're seriously using a hrtimer to poll in HZ frequency for
> updates of bc->next_event?
> 
> To be honest, this design sucks.
> 
> First of all, why is this a PPC specific feature? There are probably
> other architectures which could make use of this. So this should be
> implemented in the core code to begin with.
> 
> And a lot of the things you need for this are already available in the
> core in one form or the other.
> 
> For a start you can stick the broadcast hrtimer to the cpu which does
> the timekeeping. The handover in the hotplug case is handled there as
> well as is the handover for the NOHZ case.
> 
> This needs to be extended for this hrtimer broadcast thingy to work,
> but it shouldn't be that hard to do so.
> 
> Now for the polling. That's a complete trainwreck.
> 
> This can be solved via the broadcast IPI as well. When a CPU which
> goes down into deep idle sets the broadcast to expire earlier than the
> active value it can denote that and send the timer broadcast IPI over
> to the CPU which has the honour of dealing with this.
> 
> This supports HIGHRES and NO_HZ if done right, without polling at
> all. So you can even let the last CPU which handles the broadcast
> hrtimer go for a long sleep, just not in the deepest idle state.

Thank you for the review. The above points are all valid. I will rework
the design to:

1. Eliminate the concept of a broadcast CPU and integrate its
functionality in the timekeeping CPU.

2. Avoid polling by using IPIs to communicate the next wakeup of the
CPUs in deep idle state so as to reprogram the broadcast hrtimer.

3. Make this feature generic and not arch-specific.

Regards
Preeti U Murthy
> 
> Thanks,
> 
>   tglx



Re: 3.13 Oops on ppc64_cpu --smt=off

2013-12-02 Thread Preeti U Murthy
Hi,

On 12/02/2013 03:27 PM, Alexander Graf wrote:
> 
> On 02.12.2013, at 05:01, Preeti U Murthy  wrote:
> 
>> Hi,
>>
>> On 11/30/2013 11:15 PM, Alexander Graf wrote:
>>> Hi Ben,
>>>
>>> With current linus master (3.13-rc2+) I'm facing an interesting issue with
>>> SMT disabling on p7. When I trigger the cpu offlining it works as expected,
>>> but after a few seconds the machine goes into an oops as you can see below.
>>>
>>> It looks like a null pointer dereference.
>>
>> tip/sched/urgent has the below fix. Can you please apply the following it and
>> check if the issue gets resolved?  A similar issue was reported earlier as
> 
> I've disabled NO_HZ now on that machine which also "fixed" it for me. 
> Unfortunately I can't reboot that box for at least the next week now to test 
> whether the patch does fix the issue.

Commit 37dc6b50cee9, which caused this regression, is related to NO_HZ:
it decides when to kick nohz idle balancing.

Regards
Preeti U Murthy
> 
> 
> Alex
> 



Re: 3.13 Oops on ppc64_cpu --smt=off

2013-12-01 Thread Preeti U Murthy
Hi,

On 11/30/2013 11:15 PM, Alexander Graf wrote:
> Hi Ben,
> 
> With current linus master (3.13-rc2+) I'm facing an interesting issue with
> SMT disabling on p7. When I trigger the cpu offlining it works as expected,
> but after a few seconds the machine goes into an oops as you can see below.
> 
> It looks like a null pointer dereference.

tip/sched/urgent has the below fix. Can you please apply it and check if the
issue gets resolved? A similar issue was reported earlier as well, and it
pointed to commit 37dc65. I believe the problem that you report also stems
from the regression caused by the same commit.

Thanks

Regards
Preeti U Murthy

---
commit 42eb088ed246a5a817bb45a8b32fe234cf1c0f8b
Author: Peter Zijlstra 
Date:   Tue Nov 19 16:41:49 2013 +0100

sched: Avoid NULL dereference on sd_busy

Commit 37dc6b50cee9 ("sched: Remove unnecessary iteration over sched
domains to update nr_busy_cpus") forgot to clear 'sd_busy' under some
conditions leading to a possible NULL deref in set_cpu_sd_state_idle().

    Reported-by: Anton Blanchard 
Cc: Preeti U Murthy 
Signed-off-by: Peter Zijlstra 
Link: 
http://lkml.kernel.org/r/20131118113701.gf3...@twins.programming.kicks-ass.net
Signed-off-by: Ingo Molnar 

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c1808606ee5f..a1591ca7eb5a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4910,8 +4910,9 @@ static void update_top_cache_domain(int cpu)
if (sd) {
id = cpumask_first(sched_domain_span(sd));
size = cpumask_weight(sched_domain_span(sd));
-   rcu_assign_pointer(per_cpu(sd_busy, cpu), sd->parent);
+   sd = sd->parent; /* sd_busy */
}
+   rcu_assign_pointer(per_cpu(sd_busy, cpu), sd);

rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
per_cpu(sd_llc_size, cpu) = size;


> 
> 
> Alex
> 
> ($ ppc64_cpu --smt=off)
> kvm: disabling virtualization on CPU1
> kvm: disabling virtualization on CPU2
> kvm: disabling virtualization on CPU3
> kvm: disabling virtualization on CPU5
> kvm: disabling virtualization on CPU6
> kvm: disabling virtualization on CPU7
> kvm: disabling virtualization on CPU9
> kvm: disabling virtualization on CPU10
> kvm: disabling virtualization on CPU11
> kvm: disabling virtualization on CPU13
> kvm: disabling virtualization on CPU14
> kvm: disabling virtualization on CPU15
> kvm: disabling virtualization on CPU17
> kvm: disabling virtualization on CPU18
> kvm: disabling virtualization on CPU19
> kvm: disabling virtualization on CPU21
> kvm: disabling virtualization on CPU22
> kvm: disabling virtualization on CPU23
> kvm: disabling virtualization on CPU25
> kvm: disabling virtualization on CPU26
> kvm: disabling virtualization on CPU27
> kvm: disabling virtualization on CPU29
> kvm: disabling virtualization on CPU30
> kvm: disabling virtualization on CPU31
> kvm: disabling virtualization on CPU33
> kvm: disabling virtualization on CPU34
> kvm: disabling virtualization on CPU35
> kvm: disabling virtualization on CPU37
> kvm: disabling virtualization on CPU38
> kvm: disabling virtualization on CPU39
> kvm: disabling virtualization on CPU41
> kvm: disabling virtualization on CPU42
> kvm: disabling virtualization on CPU43
> kvm: disabling virtualization on CPU45
> kvm: disabling virtualization on CPU46
> kvm: disabling virtualization on CPU47
> kvm: disabling virtualization on CPU49
> kvm: disabling virtualization on CPU50
> kvm: disabling virtualization on CPU51
> kvm: disabling virtualization on CPU53
> kvm: disabling virtualization on CPU54
> kvm: disabling virtualization on CPU55
> kvm: disabling virtualization on CPU57
> kvm: disabling virtualization on CPU58
> kvm: disabling virtualization on CPU59
> kvm: disabling virtualization on CPU61
> kvm: disabling virtualization on CPU62
> kvm: disabling virtualization on CPU63
> Unable to handle kernel paging request for data at address 0x0010
> Faulting instruction address: 0xc0124188
> Oops: Kernel access of bad area, sig: 11 [#1]
> SMP NR_CPUS=1024 NUMA PowerNV
> Modules linked in: iptable_filter ip_tables x_tables nfsv3 nfs_acl nfs 
> fscache lockd sunrpc autofs4 binfmt_misc af_packet fuse loop dm_mod ohci_pci 
> ohci_hcd ehci_pci ehci_hcd e1000e usbcore sr_mod cdrom ses enclosure 
> rtc_generic usb_common ptp sg pps_core sd_mod crc_t10dif crct10dif_common 
> scsi_dh_hp_sw scsi_dh_alua scsi_dh_emc scsi_dh_rdac scsi_dh virtio_pci 
> virtio_console virtio_blk virtio virtio_ring ipr libata scsi_mod
> CPU: 56 PID: 0 Comm: swapper/56 Not tainted 3.13.0-rc2-0.g01695c8-default+ #1
> task: c007f28b5180 ti: c007f28c8000 task.ti: c007f28c8000
> NIP: c0124188 LR

[PATCH V4 9/9] cpuidle/powernv: Parse device tree to setup idle states

2013-11-29 Thread Preeti U Murthy
Add deep idle states such as nap and fast sleep to the cpuidle state table
only if they are discovered from the device tree during cpuidle initialization.

Signed-off-by: Preeti U. Murthy 
---

 drivers/cpuidle/cpuidle-powerpc-book3s.c |   81 --
 1 file changed, 64 insertions(+), 17 deletions(-)

diff --git a/drivers/cpuidle/cpuidle-powerpc-book3s.c 
b/drivers/cpuidle/cpuidle-powerpc-book3s.c
index 59cd529..b80ee9b 100644
--- a/drivers/cpuidle/cpuidle-powerpc-book3s.c
+++ b/drivers/cpuidle/cpuidle-powerpc-book3s.c
@@ -18,6 +18,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -27,6 +28,12 @@
 #include 
 #include 
 
+/* Flags and constants used in PowerNV platform */
+
+#define MAX_POWERNV_IDLE_STATES8
+#define IDLE_USE_INST_NAP  0x0001 /* Use nap instruction */
+#define IDLE_USE_INST_SLEEP0x0002 /* Use sleep instruction */
+
 struct cpuidle_driver powerpc_book3s_idle_driver = {
.name = "powerpc_book3s_idle",
.owner= THIS_MODULE,
@@ -327,7 +334,7 @@ static struct cpuidle_state shared_states[] = {
.enter = &shared_cede_loop },
 };
 
-static struct cpuidle_state powernv_states[] = {
+static struct cpuidle_state powernv_states[MAX_POWERNV_IDLE_STATES] = {
{ /* Snooze */
.name = "snooze",
.desc = "snooze",
@@ -335,20 +342,6 @@ static struct cpuidle_state powernv_states[] = {
.exit_latency = 0,
.target_residency = 0,
.enter = &snooze_loop },
-   { /* NAP */
-   .name = "NAP",
-   .desc = "NAP",
-   .flags = CPUIDLE_FLAG_TIME_VALID,
-   .exit_latency = 10,
-   .target_residency = 100,
-   .enter = &nap_loop },
-{ /* Fastsleep */
-   .name = "fastsleep",
-   .desc = "fastsleep",
-   .flags = CPUIDLE_FLAG_TIME_VALID,
-   .exit_latency = 10,
-   .target_residency = 100,
-   .enter = &fastsleep_loop },
 };
 
 void update_smt_snooze_delay(int cpu, int residency)
@@ -418,6 +411,60 @@ static struct notifier_block setup_hotplug_notifier = {
.notifier_call = powerpc_book3s_cpuidle_add_cpu_notifier,
 };
 
+static int powernv_add_idle_states(void)
+{
+   struct device_node *power_mgt;
+   struct property *prop;
+   int nr_idle_states = 1; /* Snooze */
+   int dt_idle_states;
+   u32 *flags;
+   int i;
+
+   /* Currently we have snooze statically defined */
+
+   power_mgt = of_find_node_by_path("/ibm,opal/power-mgt");
+   if (!power_mgt) {
+   pr_warn("opal: PowerMgmt Node not found\n");
+   return nr_idle_states;
+   }
+
+   prop = of_find_property(power_mgt, "ibm,cpu-idle-state-flags", NULL);
+   if (!prop) {
+   pr_warn("DT-PowerMgmt: missing ibm,cpu-idle-state-flags\n");
+   return nr_idle_states;
+   }
+
+   dt_idle_states = prop->length / sizeof(u32);
+   flags = (u32 *) prop->value;
+
+   for (i = 0; i < dt_idle_states; i++) {
+
+   if (flags[i] & IDLE_USE_INST_NAP) {
+   /* Add NAP state */
+   strcpy(powernv_states[nr_idle_states].name, "Nap");
+   strcpy(powernv_states[nr_idle_states].desc, "Nap");
+   powernv_states[nr_idle_states].flags = 
CPUIDLE_FLAG_TIME_VALID;
+   powernv_states[nr_idle_states].exit_latency = 10;
+   powernv_states[nr_idle_states].target_residency = 100;
+   powernv_states[nr_idle_states].enter = &nap_loop;
+   nr_idle_states++;
+   }
+
+   if (flags[i] & IDLE_USE_INST_SLEEP) {
+   /* Add FASTSLEEP state */
+   strcpy(powernv_states[nr_idle_states].name, 
"FastSleep");
+   strcpy(powernv_states[nr_idle_states].desc, 
"FastSleep");
+   powernv_states[nr_idle_states].flags = 
CPUIDLE_FLAG_TIME_VALID;
+   powernv_states[nr_idle_states].exit_latency = 300;
+   powernv_states[nr_idle_states].target_residency = 
100;
+   powernv_states[nr_idle_states].enter = &fastsleep_loop;
+   nr_idle_states++;
+   }
+   }
+
+   return nr_idle_states;
+}
+
 /*
  * powerpc_book3s_cpuidle_driver_init()
  */
@@ -448,7 +495,6 @@ static int powerpc_book3s_cpuidle_driver_init(void)
  */
 static int powerpc_book3s_idle_probe(void)
 {
-
if (cpuidle_disable != IDLE_NO_OVERRIDE)
return -ENODEV;
 
@@ -463,7 +509,8 @@ static int powerpc_book3s_id

[PATCH V4 8/9] cpuidle/ppc: Nominate new broadcast cpu on hotplug of the old

2013-11-29 Thread Preeti U Murthy
On hotplug of the broadcast cpu, cancel the hrtimer queued to do
broadcast and nominate a new broadcast cpu.

We choose the new broadcast cpu as one of the cpus in deep idle and send an
IPI to wake it up so that it can take over the duty of broadcast. The new
broadcast cpu needs to find out whether it was woken up to resume broadcast;
if so, it needs to restart the broadcast hrtimer on itself.

It's possible that the old broadcast cpu was hotplugged out when the broadcast
hrtimer was about to fire on it. Therefore the newly nominated broadcast cpu
should set the broadcast hrtimer on itself to expire immediately, so as to not
miss wakeups under such scenarios.

Signed-off-by: Preeti U Murthy 
---

 arch/powerpc/include/asm/time.h  |1 +
 arch/powerpc/kernel/time.c   |1 +
 drivers/cpuidle/cpuidle-powerpc-book3s.c |   22 ++
 3 files changed, 24 insertions(+)

diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h
index a6604b7..e24ebb4 100644
--- a/arch/powerpc/include/asm/time.h
+++ b/arch/powerpc/include/asm/time.h
@@ -31,6 +31,7 @@ struct rtc_time;
 extern void to_tm(int tim, struct rtc_time * tm);
 extern void GregorianDay(struct rtc_time *tm);
 extern void tick_broadcast_ipi_handler(void);
+extern void broadcast_irq_entry(void);
 
 extern void generic_calibrate_decr(void);
 
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index f0603a0..021a5c5 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -852,6 +852,7 @@ void tick_broadcast_ipi_handler(void)
 {
u64 *next_tb = &__get_cpu_var(decrementers_next_tb);
 
+   broadcast_irq_entry();
*next_tb = get_tb_or_rtc();
__timer_interrupt();
 }
diff --git a/drivers/cpuidle/cpuidle-powerpc-book3s.c 
b/drivers/cpuidle/cpuidle-powerpc-book3s.c
index 649c330..59cd529 100644
--- a/drivers/cpuidle/cpuidle-powerpc-book3s.c
+++ b/drivers/cpuidle/cpuidle-powerpc-book3s.c
@@ -288,6 +288,12 @@ static int fastsleep_loop(struct cpuidle_device *dev,
return index;
 }
 
+void broadcast_irq_entry(void)
+{
+   if (smp_processor_id() == bc_cpu)
+   hrtimer_start(bc_hrtimer, ns_to_ktime(0), HRTIMER_MODE_REL_PINNED);
+}
+
 /*
  * States for dedicated partition case.
  */
@@ -366,6 +372,7 @@ static int powerpc_book3s_cpuidle_add_cpu_notifier(struct notifier_block *n,
unsigned long action, void *hcpu)
 {
int hotcpu = (unsigned long)hcpu;
+   unsigned long flags;
struct cpuidle_device *dev =
per_cpu(cpuidle_devices, hotcpu);
 
@@ -378,6 +385,21 @@ static int powerpc_book3s_cpuidle_add_cpu_notifier(struct notifier_block *n,
cpuidle_resume_and_unlock();
break;
 
+   case CPU_DYING:
+   case CPU_DYING_FROZEN:
+   spin_lock_irqsave(&fastsleep_idle_lock, flags);
+   if (hotcpu == bc_cpu) {
+   bc_cpu = -1;
+   hrtimer_cancel(bc_hrtimer);
+   if (!cpumask_empty(tick_get_broadcast_oneshot_mask())) {
+   bc_cpu = cpumask_first(
+   tick_get_broadcast_oneshot_mask());
+   tick_broadcast(cpumask_of(bc_cpu));
+   }
+   }
+   spin_unlock_irqrestore(&fastsleep_idle_lock, flags);
+   break;
+
case CPU_DEAD:
case CPU_DEAD_FROZEN:
cpuidle_pause_and_lock();



[PATCH V4 6/9] cpuidle/ppc: Add basic infrastructure to enable the broadcast framework on ppc

2013-11-29 Thread Preeti U Murthy
On ppc there are certain deep CPU idle states in which the local timers stop. 
One such
idle state on Power8 is "Fast-Sleep". However we do not have an external timer
to wake up these CPUs. Hence we prevent one of the CPUs from entering
Fast-Sleep so that it can wakeup the remaining CPUs in this state.

However we would still rely on the broadcast framework[1] in the kernel to keep
track of the CPUs in deep idle and the time at which to wake them up. To enable
this framework, we need to register a clock device that does not stop in deep
idle states. Without such a device, the broadcast framework does not take any
action when CPUs enter and exit deep idle states, since it believes there is no
clock device capable of waking up the CPUs in deep idle.

A local timer does not satisfy this condition, hence we introduce a pseudo
clock device called broadcast_clockevent and register it with the broadcast
framework. This tricks the broadcast framework into believing that we have an
external timer to wake up the CPUs. The device is not programmable; it merely
enables us to make use of the broadcast framework.

[1]http://lwn.net/Articles/574591/

Signed-off-by: Preeti U Murthy 
---

 arch/powerpc/Kconfig|2 +
 arch/powerpc/include/asm/time.h |1 +
 arch/powerpc/kernel/time.c  |   58 ++-
 3 files changed, 60 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index b44b52c..cafa788 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -129,6 +129,8 @@ config PPC
select GENERIC_CMOS_UPDATE
select GENERIC_TIME_VSYSCALL_OLD
select GENERIC_CLOCKEVENTS
+   select GENERIC_CLOCKEVENTS_BROADCAST
+   select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
select GENERIC_STRNCPY_FROM_USER
select GENERIC_STRNLEN_USER
select HAVE_MOD_ARCH_SPECIFIC
diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h
index 1d428e6..4057425 100644
--- a/arch/powerpc/include/asm/time.h
+++ b/arch/powerpc/include/asm/time.h
@@ -24,6 +24,7 @@ extern unsigned long tb_ticks_per_jiffy;
 extern unsigned long tb_ticks_per_usec;
 extern unsigned long tb_ticks_per_sec;
 extern struct clock_event_device decrementer_clockevent;
+extern struct clock_event_device broadcast_clockevent;
 
 struct rtc_time;
 extern void to_tm(int tim, struct rtc_time * tm);
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 42cb603..d2e582b 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -42,6 +42,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -97,6 +98,10 @@ static struct clocksource clocksource_timebase = {
 
 static int decrementer_set_next_event(unsigned long evt,
  struct clock_event_device *dev);
+static int broadcast_set_next_event(unsigned long evt,
+ struct clock_event_device *dev);
+static void broadcast_set_mode(enum clock_event_mode mode,
+struct clock_event_device *dev);
 static void decrementer_set_mode(enum clock_event_mode mode,
 struct clock_event_device *dev);
 
@@ -106,12 +111,23 @@ struct clock_event_device decrementer_clockevent = {
.irq= 0,
.set_next_event = decrementer_set_next_event,
.set_mode   = decrementer_set_mode,
-   .features   = CLOCK_EVT_FEAT_ONESHOT,
+   .features   = CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_ONESHOT,
 };
 EXPORT_SYMBOL(decrementer_clockevent);
 
+struct clock_event_device broadcast_clockevent = {
+   .name   = "broadcast",
+   .rating = 200,
+   .irq= 0,
+   .set_next_event = broadcast_set_next_event,
+   .set_mode   = broadcast_set_mode,
+   .features   = CLOCK_EVT_FEAT_ONESHOT,
+};
+EXPORT_SYMBOL(broadcast_clockevent);
+
 DEFINE_PER_CPU(u64, decrementers_next_tb);
 static DEFINE_PER_CPU(struct clock_event_device, decrementers);
+static struct clock_event_device bc_timer;
 
 #define XSEC_PER_SEC (1024*1024)
 
@@ -811,6 +827,19 @@ static int decrementer_set_next_event(unsigned long evt,
return 0;
 }
 
+static int broadcast_set_next_event(unsigned long evt,
+   struct clock_event_device *dev)
+{
+   return 0;
+}
+
+static void broadcast_set_mode(enum clock_event_mode mode,
+struct clock_event_device *dev)
+{
+   if (mode != CLOCK_EVT_MODE_ONESHOT)
+   broadcast_set_next_event(DECREMENTER_MAX, dev);
+}
+
 static void decrementer_set_mode(enum clock_event_mode mode,
 struct clock_event_device *dev)
 {
@@ -840,6 +869,19 @@ static void register_decrementer_clockevent(int cpu)
clockevents_register_device(dec

[PATCH V4 7/9] cpuidle/powernv: Add "Fast-Sleep" CPU idle state

2013-11-29 Thread Preeti U Murthy
Fast sleep is one of the deep idle states on Power8 in which local timers of
CPUs stop. Now that the basic support for fast sleep has been added,
enable it in the cpuidle framework on PowerNV.

On ppc, since we do not have an external device that can wakeup cpus in deep
idle, the local timer of one of the CPUs needs to be nominated to do this job.
This cpu is called the broadcast cpu/bc_cpu. Only if the bc_cpu is nominated
will the remaining cpus be allowed to enter deep idle state after notifying
the broadcast framework. The bc_cpu is not allowed to enter deep idle state.

The bc_cpu queues a hrtimer onto itself to handle the wakeup of CPUs in deep
idle state. The hrtimer handler calls into the broadcast framework, which takes
care of sending IPIs to all those CPUs in deep idle whose wakeup times have
expired. On each expiry of the hrtimer, it is reprogrammed to the earlier of
the next wakeup time of the cpus in deep idle and a safety period, so as to not
miss any wakeups. This safety period is currently maintained at a jiffy.

But having a dedicated bc_cpu would mean overloading just one cpu with the
broadcast work which could hinder its performance apart from leading to thermal
imbalance on the chip. Therefore the first CPU that enters deep idle state is
the bc_cpu. It gets unassigned when there are no more CPUs in deep idle to be
woken up. This state remains until such a time that a CPU enters the
deep idle state again to be nominated as the bc_cpu and the cycle repeats.

Protect the regions of nomination and de-nomination of the broadcast CPU, as
well as the check for its existence, with a lock to ensure synchronization
between them.
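
The re-arming policy described above amounts to roughly the following (a
minimal sketch with placeholder names; in the patch it is implemented by the
broadcast hrtimer handler):

/*
 * Sketch only: re-arm the broadcast hrtimer to the earlier of the next
 * deep-idle wakeup and a one-jiffy safety period.
 */
static ktime_t next_bc_expiry(ktime_t now, ktime_t next_deep_idle_wakeup)
{
        ktime_t safety = ktime_add_ns(now, TICK_NSEC);  /* one jiffy from now */

        if (ktime_to_ns(next_deep_idle_wakeup) &&
            ktime_to_ns(next_deep_idle_wakeup) < ktime_to_ns(safety))
                return next_deep_idle_wakeup;
        return safety;
}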

Signed-off-by: Preeti U Murthy 
---

 arch/powerpc/include/asm/time.h  |1 
 arch/powerpc/kernel/time.c   |2 
 drivers/cpuidle/cpuidle-powerpc-book3s.c |  152 ++
 3 files changed, 154 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h
index 4057425..a6604b7 100644
--- a/arch/powerpc/include/asm/time.h
+++ b/arch/powerpc/include/asm/time.h
@@ -25,6 +25,7 @@ extern unsigned long tb_ticks_per_usec;
 extern unsigned long tb_ticks_per_sec;
 extern struct clock_event_device decrementer_clockevent;
 extern struct clock_event_device broadcast_clockevent;
+extern struct clock_event_device bc_timer;
 
 struct rtc_time;
 extern void to_tm(int tim, struct rtc_time * tm);
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index d2e582b..f0603a0 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -127,7 +127,7 @@ EXPORT_SYMBOL(broadcast_clockevent);
 
 DEFINE_PER_CPU(u64, decrementers_next_tb);
 static DEFINE_PER_CPU(struct clock_event_device, decrementers);
-static struct clock_event_device bc_timer;
+struct clock_event_device bc_timer;
 
 #define XSEC_PER_SEC (1024*1024)
 
diff --git a/drivers/cpuidle/cpuidle-powerpc-book3s.c 
b/drivers/cpuidle/cpuidle-powerpc-book3s.c
index 25e8a99..649c330 100644
--- a/drivers/cpuidle/cpuidle-powerpc-book3s.c
+++ b/drivers/cpuidle/cpuidle-powerpc-book3s.c
@@ -12,12 +12,19 @@
 #include 
 #include 
 #include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
 
 #include 
 #include 
 #include 
 #include 
 #include 
+#include 
 #include 
 
 struct cpuidle_driver powerpc_book3s_idle_driver = {
@@ -28,6 +35,26 @@ struct cpuidle_driver powerpc_book3s_idle_driver = {
 static int max_idle_state;
 static struct cpuidle_state *cpuidle_state_table;
 
+static int bc_cpu = -1;
+static struct hrtimer *bc_hrtimer;
+static int bc_hrtimer_initialized = 0;
+
+/*
+ * Bits to indicate if a cpu can enter deep idle where local timer gets
+ * switched off.
+ * BROADCAST_CPU_PRESENT : Enter deep idle since bc_cpu is assigned
+ * BROADCAST_CPU_SELF   : Do not enter deep idle since you are bc_cpu
+ * BROADCAST_CPU_ABSENT : Do not enter deep idle since there is no bc_cpu,
+ *hence nominate yourself as bc_cpu
+ * BROADCAST_CPU_ERROR :  Do not enter deep idle since there is no bc_cpu
+ *and the broadcast hrtimer could not be initialized.
+ */
+enum broadcast_cpu_status {
+   BROADCAST_CPU_PRESENT,
+   BROADCAST_CPU_SELF,
+   BROADCAST_CPU_ERROR,
+};
+
 static inline void idle_loop_prolog(unsigned long *in_purr)
 {
*in_purr = mfspr(SPRN_PURR);
@@ -48,6 +75,8 @@ static inline void idle_loop_epilog(unsigned long in_purr)
get_lppaca()->idle = 0;
 }
 
+static DEFINE_SPINLOCK(fastsleep_idle_lock);
+
 static int snooze_loop(struct cpuidle_device *dev,
struct cpuidle_driver *drv,
int index)
@@ -143,6 +172,122 @@ static int nap_loop(struct cpuidle_device *dev,
return index;
 }
 
+/* Functions supporting broadcasting in fastsleep */
+static ktime_t get_next_bc_tick(void)
+{
+   u64 next_bc_ns;
+
+   next_bc_ns = (tb_ticks_per_jiffy / tb_ticks_per_usec) * 1000;
+   return ns_to_kt

[PATCH V4 5/9] powermgt: Add OPAL call to resync timebase on wakeup

2013-11-29 Thread Preeti U Murthy
From: Vaidyanathan Srinivasan 

During "Fast-sleep" and deeper power savings state, decrementer and
timebase could be stopped making it out of sync with rest
of the cores in the system.

Add a firmware call to request platform to resync timebase
using low level platform methods.
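
Conceptually, the wakeup path added below does the equivalent of the following
C (a sketch only; the actual implementation is the assembly added to
idle_power7.S):

/* Sketch only: Fast-sleep wakeup with timebase loss */
static void fastsleep_wakeup_sketch(void)
{
        /* Ask firmware to bring the timebase back in sync with other cores */
        int rc = opal_resync_timebase();

        (void)rc;       /* failure handling is still a TODO in the assembly */

        /* ...then restore non-volatile GPRs, CR, MSR and NIP, and rfid back */
}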

Signed-off-by: Vaidyanathan Srinivasan 
Signed-off-by: Preeti U. Murthy 
---

 arch/powerpc/include/asm/opal.h|2 ++
 arch/powerpc/kernel/exceptions-64s.S   |2 +-
 arch/powerpc/kernel/idle_power7.S  |   27 
 arch/powerpc/platforms/powernv/opal-wrappers.S |1 +
 4 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index 033c06b..a662d06 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -132,6 +132,7 @@ extern int opal_enter_rtas(struct rtas_args *args,
 #define OPAL_FLASH_VALIDATE76
 #define OPAL_FLASH_MANAGE  77
 #define OPAL_FLASH_UPDATE  78
+#define OPAL_RESYNC_TIMEBASE   79
 
 #ifndef __ASSEMBLY__
 
@@ -763,6 +764,7 @@ extern void opal_flash_init(void);
 extern int opal_machine_check(struct pt_regs *regs);
 
 extern void opal_shutdown(void);
+extern int opal_resync_timebase(void);
 
 extern void opal_lpc_init(void);
 
diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index b8139fb..91e6417 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -145,7 +145,7 @@ BEGIN_FTR_SECTION
 
/* Fast Sleep wakeup on PowerNV */
 8: GET_PACA(r13)
-   b   .power7_wakeup_loss
+   b   .power7_wakeup_tb_loss
 
 9:
 END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
diff --git a/arch/powerpc/kernel/idle_power7.S 
b/arch/powerpc/kernel/idle_power7.S
index e4bbca2..34c71e8 100644
--- a/arch/powerpc/kernel/idle_power7.S
+++ b/arch/powerpc/kernel/idle_power7.S
@@ -17,6 +17,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #undef DEBUG
 
@@ -124,6 +125,32 @@ _GLOBAL(power7_sleep)
b   power7_powersave_common
/* No return */
 
+_GLOBAL(power7_wakeup_tb_loss)
+   ld  r2,PACATOC(r13);
+   ld  r1,PACAR1(r13)
+
+   /* Time base re-sync */
+   li  r0,OPAL_RESYNC_TIMEBASE
+   LOAD_REG_ADDR(r11,opal);
+   ld  r12,8(r11);
+   ld  r2,0(r11);
+   mtctr   r12
+   bctrl
+
+   /* TODO: Check r3 for failure */
+
+   REST_NVGPRS(r1)
+   REST_GPR(2, r1)
+   ld  r3,_CCR(r1)
+   ld  r4,_MSR(r1)
+   ld  r5,_NIP(r1)
+   addir1,r1,INT_FRAME_SIZE
+   mtcrr3
+   mfspr   r3,SPRN_SRR1/* Return SRR1 */
+   mtspr   SPRN_SRR1,r4
+   mtspr   SPRN_SRR0,r5
+   rfid
+
 _GLOBAL(power7_wakeup_loss)
ld  r1,PACAR1(r13)
REST_NVGPRS(r1)
diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S 
b/arch/powerpc/platforms/powernv/opal-wrappers.S
index e780650..ddfe95a 100644
--- a/arch/powerpc/platforms/powernv/opal-wrappers.S
+++ b/arch/powerpc/platforms/powernv/opal-wrappers.S
@@ -126,3 +126,4 @@ OPAL_CALL(opal_return_cpu,  OPAL_RETURN_CPU);
 OPAL_CALL(opal_validate_flash, OPAL_FLASH_VALIDATE);
 OPAL_CALL(opal_manage_flash,   OPAL_FLASH_MANAGE);
 OPAL_CALL(opal_update_flash,   OPAL_FLASH_UPDATE);
+OPAL_CALL(opal_resync_timebase,OPAL_RESYNC_TIMEBASE);



[PATCH V4 4/9] powernv/cpuidle: Add context management for Fast Sleep

2013-11-29 Thread Preeti U Murthy
From: Vaidyanathan Srinivasan 

Before adding Fast-Sleep into the cpuidle framework, some low level
support needs to be added to enable it. This includes saving and
restoring certain registers at entry to and exit from this state,
just as we do for the NAP idle state.

Signed-off-by: Vaidyanathan Srinivasan 
[Changelog modified by Preeti U. Murthy ]
Signed-off-by: Preeti U. Murthy 
---

 arch/powerpc/include/asm/processor.h |1 +
 arch/powerpc/kernel/exceptions-64s.S |   10 -
 arch/powerpc/kernel/idle_power7.S|   63 --
 3 files changed, 53 insertions(+), 21 deletions(-)

diff --git a/arch/powerpc/include/asm/processor.h 
b/arch/powerpc/include/asm/processor.h
index 4f7b047..d7633d0 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -444,6 +444,7 @@ enum idle_boot_override {IDLE_NO_OVERRIDE = 0, 
IDLE_POWERSAVE_OFF};
 
 extern int powersave_nap;  /* set if nap mode can be used in idle loop */
 extern void power7_nap(void);
+extern void power7_sleep(void);
 
 #ifdef CONFIG_CPU_IDLE_POWERPC_BOOK3S
 extern void update_smt_snooze_delay(int cpu, int residency);
diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index 9f905e4..b8139fb 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -121,9 +121,10 @@ BEGIN_FTR_SECTION
cmpwi   cr1,r13,2
/* Total loss of HV state is fatal, we could try to use the
 * PIR to locate a PACA, then use an emergency stack etc...
-* but for now, let's just stay stuck here
+* OPAL v3 based powernv platforms have new idle states
+* which fall in this catagory.
 */
-   bgt cr1,.
+   bgt cr1,8f
GET_PACA(r13)
 
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
@@ -141,6 +142,11 @@ BEGIN_FTR_SECTION
beq cr1,2f
b   .power7_wakeup_noloss
 2: b   .power7_wakeup_loss
+
+   /* Fast Sleep wakeup on PowerNV */
+8: GET_PACA(r13)
+   b   .power7_wakeup_loss
+
 9:
 END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
 #endif /* CONFIG_PPC_P7_NAP */
diff --git a/arch/powerpc/kernel/idle_power7.S 
b/arch/powerpc/kernel/idle_power7.S
index 847e40e..e4bbca2 100644
--- a/arch/powerpc/kernel/idle_power7.S
+++ b/arch/powerpc/kernel/idle_power7.S
@@ -20,17 +20,27 @@
 
 #undef DEBUG
 
-   .text
+/* Idle state entry routines */
 
-_GLOBAL(power7_idle)
-   /* Now check if user or arch enabled NAP mode */
-   LOAD_REG_ADDRBASE(r3,powersave_nap)
-   lwz r4,ADDROFF(powersave_nap)(r3)
-   cmpwi   0,r4,0
-   beqlr
-   /* fall through */
+#defineIDLE_STATE_ENTER_SEQ(IDLE_INST) \
+   /* Magic NAP/SLEEP/WINKLE mode enter sequence */\
+   std r0,0(r1);   \
+   ptesync;\
+   ld  r0,0(r1);   \
+1: cmp cr0,r0,r0;  \
+   bne 1b; \
+   IDLE_INST;  \
+   b   .
 
-_GLOBAL(power7_nap)
+   .text
+
+/*
+ * Pass requested state in r3:
+ * 0 - nap
+ * 1 - sleep
+ */
+_GLOBAL(power7_powersave_common)
+   /* Use r3 to pass state nap/sleep/winkle */
/* NAP is a state loss, we create a regs frame on the
 * stack, fill it up with the state we care about and
 * stick a pointer to it in PACAR1. We really only
@@ -79,8 +89,8 @@ _GLOBAL(power7_nap)
/* Continue saving state */
SAVE_GPR(2, r1)
SAVE_NVGPRS(r1)
-   mfcrr3
-   std r3,_CCR(r1)
+   mfcrr4
+   std r4,_CCR(r1)
std r9,_MSR(r1)
std r1,PACAR1(r13)
 
@@ -89,15 +99,30 @@ _GLOBAL(power7_nap)
li  r4,KVM_HWTHREAD_IN_NAP
stb r4,HSTATE_HWTHREAD_STATE(r13)
 #endif
+   cmpwi   cr0,r3,1
+   beq 2f
+   IDLE_STATE_ENTER_SEQ(PPC_NAP)
+   /* No return */
+2: IDLE_STATE_ENTER_SEQ(PPC_SLEEP)
+   /* No return */
 
-   /* Magic NAP mode enter sequence */
-   std r0,0(r1)
-   ptesync
-   ld  r0,0(r1)
-1: cmp cr0,r0,r0
-   bne 1b
-   PPC_NAP
-   b   .
+_GLOBAL(power7_idle)
+   /* Now check if user or arch enabled NAP mode */
+   LOAD_REG_ADDRBASE(r3,powersave_nap)
+   lwz r4,ADDROFF(powersave_nap)(r3)
+   cmpwi   0,r4,0
+   beqlr
+   /* fall through */
+
+_GLOBAL(power7_nap)
+   li  r3,0
+   b   power7_powersave_common
+   /* No return */
+
+_GLOBAL(power7_sleep)
+   li  r3,1
+   b   power7_powersave_common
+   /* No return */
 
 _GLOBAL(power7_wakeup_loss)
ld  r1,PACAR1(r13)


[PATCH V4 3/9] cpuidle/ppc: Split timer_interrupt() into timer handling and interrupt handling routines

2013-11-29 Thread Preeti U Murthy
Split timer_interrupt(), which is the local timer interrupt handler on ppc,
into routines called during regular interrupt handling and __timer_interrupt(),
which takes care of running local timers and collecting time-related stats.

This will enable callers interested only in running expired local timers to
directly call into __timer_interrupt(). One of the use cases of this is the
tick broadcast IPI handling, in which the sleeping CPUs need to handle the
local timers that have expired.

Signed-off-by: Preeti U Murthy 
---

 arch/powerpc/kernel/time.c |   73 +---
 1 file changed, 41 insertions(+), 32 deletions(-)

diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 42269c7..42cb603 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -478,6 +478,42 @@ void arch_irq_work_raise(void)
 
 #endif /* CONFIG_IRQ_WORK */
 
+static void __timer_interrupt(void)
+{
+   struct pt_regs *regs = get_irq_regs();
+   u64 *next_tb = &__get_cpu_var(decrementers_next_tb);
+   struct clock_event_device *evt = &__get_cpu_var(decrementers);
+   u64 now;
+
+   __get_cpu_var(irq_stat).timer_irqs++;
+   trace_timer_interrupt_entry(regs);
+
+   if (test_irq_work_pending()) {
+   clear_irq_work_pending();
+   irq_work_run();
+   }
+
+   now = get_tb_or_rtc();
+   if (now >= *next_tb) {
+   *next_tb = ~(u64)0;
+   if (evt->event_handler)
+   evt->event_handler(evt);
+   } else {
+   now = *next_tb - now;
+   if (now <= DECREMENTER_MAX)
+   set_dec((int)now);
+   }
+
+#ifdef CONFIG_PPC64
+   /* collect purr register values often, for accurate calculations */
+   if (firmware_has_feature(FW_FEATURE_SPLPAR)) {
+   struct cpu_usage *cu = &__get_cpu_var(cpu_usage_array);
+   cu->current_tb = mfspr(SPRN_PURR);
+   }
+#endif
+   trace_timer_interrupt_exit(regs);
+}
+
 /*
  * timer_interrupt - gets called when the decrementer overflows,
  * with interrupts disabled.
@@ -486,8 +522,6 @@ void timer_interrupt(struct pt_regs * regs)
 {
struct pt_regs *old_regs;
u64 *next_tb = &__get_cpu_var(decrementers_next_tb);
-   struct clock_event_device *evt = &__get_cpu_var(decrementers);
-   u64 now;
 
/* Ensure a positive value is written to the decrementer, or else
 * some CPUs will continue to take decrementer exceptions.
@@ -510,8 +544,6 @@ void timer_interrupt(struct pt_regs * regs)
 */
may_hard_irq_enable();
 
-   __get_cpu_var(irq_stat).timer_irqs++;
-
 #if defined(CONFIG_PPC32) && defined(CONFIG_PMAC)
if (atomic_read(&ppc_n_lost_interrupts) != 0)
do_IRQ(regs);
@@ -520,34 +552,7 @@ void timer_interrupt(struct pt_regs * regs)
old_regs = set_irq_regs(regs);
irq_enter();
 
-   trace_timer_interrupt_entry(regs);
-
-   if (test_irq_work_pending()) {
-   clear_irq_work_pending();
-   irq_work_run();
-   }
-
-   now = get_tb_or_rtc();
-   if (now >= *next_tb) {
-   *next_tb = ~(u64)0;
-   if (evt->event_handler)
-   evt->event_handler(evt);
-   } else {
-   now = *next_tb - now;
-   if (now <= DECREMENTER_MAX)
-   set_dec((int)now);
-   }
-
-#ifdef CONFIG_PPC64
-   /* collect purr register values often, for accurate calculations */
-   if (firmware_has_feature(FW_FEATURE_SPLPAR)) {
-   struct cpu_usage *cu = &__get_cpu_var(cpu_usage_array);
-   cu->current_tb = mfspr(SPRN_PURR);
-   }
-#endif
-
-   trace_timer_interrupt_exit(regs);
-
+   __timer_interrupt();
irq_exit();
set_irq_regs(old_regs);
 }
@@ -816,6 +821,10 @@ static void decrementer_set_mode(enum clock_event_mode mode,
 /* Interrupt handler for the timer broadcast IPI */
 void tick_broadcast_ipi_handler(void)
 {
+   u64 *next_tb = &__get_cpu_var(decrementers_next_tb);
+
+   *next_tb = get_tb_or_rtc();
+   __timer_interrupt();
 }
 
 static void register_decrementer_clockevent(int cpu)



[PATCH V4 2/9] powerpc: Implement tick broadcast IPI as a fixed IPI message

2013-11-29 Thread Preeti U Murthy
From: Srivatsa S. Bhat 

For scalability and performance reasons, we want the tick broadcast IPIs
to be handled as efficiently as possible. Fixed IPI messages
are one of the most efficient mechanisms available - they are faster than
the smp_call_function mechanism because the IPI handlers are fixed and hence
they don't involve costly operations such as adding IPI handlers to the target
CPU's function queue, acquiring locks for synchronization etc.

Luckily we have an unused IPI message slot, so use that to implement
tick broadcast IPIs efficiently.

Signed-off-by: Srivatsa S. Bhat 
[Functions renamed to tick_broadcast* and Changelog modified by
 Preeti U. Murthy]
Signed-off-by: Preeti U. Murthy 
Acked-by: Geoff Levand  [For the PS3 part]
---

 arch/powerpc/include/asm/smp.h  |2 +-
 arch/powerpc/include/asm/time.h |1 +
 arch/powerpc/kernel/smp.c   |   19 +++
 arch/powerpc/kernel/time.c  |5 +
 arch/powerpc/platforms/cell/interrupt.c |2 +-
 arch/powerpc/platforms/ps3/smp.c|2 +-
 6 files changed, 24 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
index 9f7356b..ff51046 100644
--- a/arch/powerpc/include/asm/smp.h
+++ b/arch/powerpc/include/asm/smp.h
@@ -120,7 +120,7 @@ extern int cpu_to_core_id(int cpu);
  * in /proc/interrupts will be wrong!!! --Troy */
 #define PPC_MSG_CALL_FUNCTION   0
 #define PPC_MSG_RESCHEDULE  1
-#define PPC_MSG_UNUSED 2
+#define PPC_MSG_TICK_BROADCAST 2
 #define PPC_MSG_DEBUGGER_BREAK  3
 
 /* for irq controllers that have dedicated ipis per message (4) */
diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h
index c1f2676..1d428e6 100644
--- a/arch/powerpc/include/asm/time.h
+++ b/arch/powerpc/include/asm/time.h
@@ -28,6 +28,7 @@ extern struct clock_event_device decrementer_clockevent;
 struct rtc_time;
 extern void to_tm(int tim, struct rtc_time * tm);
 extern void GregorianDay(struct rtc_time *tm);
+extern void tick_broadcast_ipi_handler(void);
 
 extern void generic_calibrate_decr(void);
 
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index c2bd8d6..c77c6d7 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -35,6 +35,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -145,9 +146,9 @@ static irqreturn_t reschedule_action(int irq, void *data)
return IRQ_HANDLED;
 }
 
-static irqreturn_t unused_action(int irq, void *data)
+static irqreturn_t tick_broadcast_ipi_action(int irq, void *data)
 {
-   /* This slot is unused and hence available for use, if needed */
+   tick_broadcast_ipi_handler();
return IRQ_HANDLED;
 }
 
@@ -168,14 +169,14 @@ static irqreturn_t debug_ipi_action(int irq, void *data)
 static irq_handler_t smp_ipi_action[] = {
[PPC_MSG_CALL_FUNCTION] =  call_function_action,
[PPC_MSG_RESCHEDULE] = reschedule_action,
-   [PPC_MSG_UNUSED] = unused_action,
+   [PPC_MSG_TICK_BROADCAST] = tick_broadcast_ipi_action,
[PPC_MSG_DEBUGGER_BREAK] = debug_ipi_action,
 };
 
 const char *smp_ipi_name[] = {
[PPC_MSG_CALL_FUNCTION] =  "ipi call function",
[PPC_MSG_RESCHEDULE] = "ipi reschedule",
-   [PPC_MSG_UNUSED] = "ipi unused",
+   [PPC_MSG_TICK_BROADCAST] = "ipi tick-broadcast",
[PPC_MSG_DEBUGGER_BREAK] = "ipi debugger",
 };
 
@@ -251,6 +252,8 @@ irqreturn_t smp_ipi_demux(void)
generic_smp_call_function_interrupt();
if (all & IPI_MESSAGE(PPC_MSG_RESCHEDULE))
scheduler_ipi();
+   if (all & IPI_MESSAGE(PPC_MSG_TICK_BROADCAST))
+   tick_broadcast_ipi_handler();
if (all & IPI_MESSAGE(PPC_MSG_DEBUGGER_BREAK))
debug_ipi_action(0, NULL);
} while (info->messages);
@@ -289,6 +292,14 @@ void arch_send_call_function_ipi_mask(const struct cpumask *mask)
do_message_pass(cpu, PPC_MSG_CALL_FUNCTION);
 }
 
+void tick_broadcast(const struct cpumask *mask)
+{
+   unsigned int cpu;
+
+   for_each_cpu(cpu, mask)
+   do_message_pass(cpu, PPC_MSG_TICK_BROADCAST);
+}
+
 #if defined(CONFIG_DEBUGGER) || defined(CONFIG_KEXEC)
 void smp_send_debugger_break(void)
 {
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index b3b1441..42269c7 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -813,6 +813,11 @@ static void decrementer_set_mode(enum clock_event_mode mode,
decrementer_set_next_event(DECREMENTER_MAX, dev);
 }
 
+/* Interrupt handler for the timer broadcast IPI */
+void tick_broadcast_ipi_handler(void)
+{
+}
+
 static void register_decrementer_clockevent(int cpu)
 {
struct clock_event_device *dec = &per_cpu(decrementers, cpu);
diff

[PATCH V4 1/9] powerpc: Free up the slot of PPC_MSG_CALL_FUNC_SINGLE IPI message

2013-11-29 Thread Preeti U Murthy
From: Srivatsa S. Bhat 

The IPI handlers for both PPC_MSG_CALL_FUNC and PPC_MSG_CALL_FUNC_SINGLE map
to a common implementation - generic_smp_call_function_single_interrupt(). So,
we can consolidate them and save one of the IPI message slots, (which are
precious on powerpc, since only 4 of those slots are available).

So, implement the functionality of PPC_MSG_CALL_FUNC_SINGLE using
PPC_MSG_CALL_FUNC itself and release its IPI message slot, so that it can be
used for something else in the future, if desired.
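
A minimal sketch of what the consolidation relies on (illustrative only, based
on the statement above that both handlers share one implementation, and on the
hunks below):

        /* after this patch a "single" cross-call is just a call-function IPI */
        void arch_send_call_function_single_ipi(int cpu)
        {
                do_message_pass(cpu, PPC_MSG_CALL_FUNCTION);
        }

        /* ...and the demux side already runs the common handler, so nothing
         * is lost by dropping the dedicated message: */
        if (all & IPI_MESSAGE(PPC_MSG_CALL_FUNCTION))
                generic_smp_call_function_interrupt();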

Signed-off-by: Srivatsa S. Bhat 
Signed-off-by: Preeti U. Murthy 
Acked-by: Geoff Levand  [For the PS3 part]
---

 arch/powerpc/include/asm/smp.h  |2 +-
 arch/powerpc/kernel/smp.c   |   12 +---
 arch/powerpc/platforms/cell/interrupt.c |2 +-
 arch/powerpc/platforms/ps3/smp.c|2 +-
 4 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
index 084e080..9f7356b 100644
--- a/arch/powerpc/include/asm/smp.h
+++ b/arch/powerpc/include/asm/smp.h
@@ -120,7 +120,7 @@ extern int cpu_to_core_id(int cpu);
  * in /proc/interrupts will be wrong!!! --Troy */
 #define PPC_MSG_CALL_FUNCTION   0
 #define PPC_MSG_RESCHEDULE  1
-#define PPC_MSG_CALL_FUNC_SINGLE   2
+#define PPC_MSG_UNUSED 2
 #define PPC_MSG_DEBUGGER_BREAK  3
 
 /* for irq controllers that have dedicated ipis per message (4) */
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index a3b64f3..c2bd8d6 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -145,9 +145,9 @@ static irqreturn_t reschedule_action(int irq, void *data)
return IRQ_HANDLED;
 }
 
-static irqreturn_t call_function_single_action(int irq, void *data)
+static irqreturn_t unused_action(int irq, void *data)
 {
-   generic_smp_call_function_single_interrupt();
+   /* This slot is unused and hence available for use, if needed */
return IRQ_HANDLED;
 }
 
@@ -168,14 +168,14 @@ static irqreturn_t debug_ipi_action(int irq, void *data)
 static irq_handler_t smp_ipi_action[] = {
[PPC_MSG_CALL_FUNCTION] =  call_function_action,
[PPC_MSG_RESCHEDULE] = reschedule_action,
-   [PPC_MSG_CALL_FUNC_SINGLE] = call_function_single_action,
+   [PPC_MSG_UNUSED] = unused_action,
[PPC_MSG_DEBUGGER_BREAK] = debug_ipi_action,
 };
 
 const char *smp_ipi_name[] = {
[PPC_MSG_CALL_FUNCTION] =  "ipi call function",
[PPC_MSG_RESCHEDULE] = "ipi reschedule",
-   [PPC_MSG_CALL_FUNC_SINGLE] = "ipi call function single",
+   [PPC_MSG_UNUSED] = "ipi unused",
[PPC_MSG_DEBUGGER_BREAK] = "ipi debugger",
 };
 
@@ -251,8 +251,6 @@ irqreturn_t smp_ipi_demux(void)
generic_smp_call_function_interrupt();
if (all & IPI_MESSAGE(PPC_MSG_RESCHEDULE))
scheduler_ipi();
-   if (all & IPI_MESSAGE(PPC_MSG_CALL_FUNC_SINGLE))
-   generic_smp_call_function_single_interrupt();
if (all & IPI_MESSAGE(PPC_MSG_DEBUGGER_BREAK))
debug_ipi_action(0, NULL);
} while (info->messages);
@@ -280,7 +278,7 @@ EXPORT_SYMBOL_GPL(smp_send_reschedule);
 
 void arch_send_call_function_single_ipi(int cpu)
 {
-   do_message_pass(cpu, PPC_MSG_CALL_FUNC_SINGLE);
+   do_message_pass(cpu, PPC_MSG_CALL_FUNCTION);
 }
 
 void arch_send_call_function_ipi_mask(const struct cpumask *mask)
diff --git a/arch/powerpc/platforms/cell/interrupt.c b/arch/powerpc/platforms/cell/interrupt.c
index 2d42f3b..adf3726 100644
--- a/arch/powerpc/platforms/cell/interrupt.c
+++ b/arch/powerpc/platforms/cell/interrupt.c
@@ -215,7 +215,7 @@ void iic_request_IPIs(void)
 {
iic_request_ipi(PPC_MSG_CALL_FUNCTION);
iic_request_ipi(PPC_MSG_RESCHEDULE);
-   iic_request_ipi(PPC_MSG_CALL_FUNC_SINGLE);
+   iic_request_ipi(PPC_MSG_UNUSED);
iic_request_ipi(PPC_MSG_DEBUGGER_BREAK);
 }
 
diff --git a/arch/powerpc/platforms/ps3/smp.c b/arch/powerpc/platforms/ps3/smp.c
index 4b35166..00d1a7c 100644
--- a/arch/powerpc/platforms/ps3/smp.c
+++ b/arch/powerpc/platforms/ps3/smp.c
@@ -76,7 +76,7 @@ static int __init ps3_smp_probe(void)
 
BUILD_BUG_ON(PPC_MSG_CALL_FUNCTION!= 0);
BUILD_BUG_ON(PPC_MSG_RESCHEDULE   != 1);
-   BUILD_BUG_ON(PPC_MSG_CALL_FUNC_SINGLE != 2);
+   BUILD_BUG_ON(PPC_MSG_UNUSED   != 2);
BUILD_BUG_ON(PPC_MSG_DEBUGGER_BREAK   != 3);
 
for (i = 0; i < MSG_COUNT; i++) {



[PATCH V4 0/9] cpuidle/ppc: Enable deep idle states on PowerNV

2013-11-29 Thread Preeti U Murthy
On PowerPC, when CPUs enter certain deep idle states, the local timers stop
and the time base can go out of sync with the rest of the cores in the system.

This patchset adds support to wake up CPUs in such idle states by broadcasting
IPIs to them at their next timer events. We refer to these IPIs as tick
broadcast IPIs throughout this patchset.

The patchset also includes resyncing of time base with the rest of the cores
in the system as soon as the CPUs wake up from deep idle states.

"Fast-Sleep" is a deep idle state on Power8 in which the above mentioned 
challenges exist. With the required support for deep idle states thus in
place, the patchset adds Fast-Sleep into cpuidle. Fast-Sleep can yield us
significantly more power savings than the idle states that we have in cpuidle
so far.

This patchset is based on mainline-3.13-rc1 and the cpuidle driver for power
posted by Deepthi Dharwar: https://lkml.org/lkml/2013/11/11/29

Changes in V4:

1. Add Fast Sleep CPU idle state on PowerNV.

2. Add the required context management for Fast Sleep and the call to OPAL
to synchronize time base after wakeup from fast sleep.

3. Add parsing of CPU idle states from the device tree to populate the cpuidle
state table.

4. Rename ambiguous functions in the code around waking up of CPUs from fast
sleep.

5. Fix a bug in re-programming of the hrtimer that is queued to wake up the
CPUs in fast sleep, and modify the changelogs.

6. Add the ARCH_HAS_TICK_BROADCAST option. This signifies that we have an
arch-specific function to perform the broadcast.

Changes in V3:
http://thread.gmane.org/gmane.linux.power-management.general/38113

1. Fix the way in which a broadcast ipi is handled on the idling cpus. Timer
handling on a broadcast ipi is being done now without missing out any timer
stats generation.

2. Fix a bug in the programming of the hrtimer meant to do broadcast. Program
it to trigger at the earlier of a "broadcast period", and the next wakeup
event. By introducing the "broadcast period" as the maximum period after
which the broadcast hrtimer can fire, we ensure that we do not miss
wakeups in corner cases.

3. On hotplug of a broadcast cpu, trigger the hrtimer meant to do broadcast
to fire immediately on the new broadcast cpu. This ensures we do not miss a
broadcast that is pending in the near future.

4. Change the type of allocation from GFP_KERNEL to GFP_NOWAIT while
initializing bc_hrtimer since we are in an atomic context and cannot sleep.

5. Use the broadcast ipi to wake up the newly nominated broadcast cpu on
hotplug of the old one, instead of smp_call_function_single(). This is because
interrupts are disabled at this point and we should not be using
smp_call_function_single() or its children in this context to send an ipi.

6. Move GENERIC_CLOCKEVENTS_BROADCAST to arch/powerpc/Kconfig.

7. Fix coding style issues.

Changes in V2: https://lkml.org/lkml/2013/8/14/239

1. Dynamically pick a broadcast CPU, instead of having a dedicated one.
2. Remove the constraint of having to disable tickless idle on the broadcast
CPU by queueing a hrtimer dedicated to do broadcast.

V1 posting: https://lkml.org/lkml/2013/7/25/740.

1. Added the infrastructure to wakeup CPUs in deep idle states in which the
local timers stop.

---

Preeti U Murthy (5):
  cpuidle/ppc: Split timer_interrupt() into timer handling and interrupt handling routines
  cpuidle/ppc: Add basic infrastructure to enable the broadcast framework on ppc
  cpuidle/powernv: Add "Fast-Sleep" CPU idle state
  cpuidle/ppc: Nominate new broadcast cpu on hotplug of the old
  cpuidle/powernv: Parse device tree to setup idle states

Srivatsa S. Bhat (2):
  powerpc: Free up the slot of PPC_MSG_CALL_FUNC_SINGLE IPI message
  powerpc: Implement tick broadcast IPI as a fixed IPI message

Vaidyanathan Srinivasan (2):
  powernv/cpuidle: Add context management for Fast Sleep
  powermgt: Add OPAL call to resync timebase on wakeup


 arch/powerpc/Kconfig   |2 
 arch/powerpc/include/asm/opal.h|2 
 arch/powerpc/include/asm/processor.h   |1 
 arch/powerpc/include/asm/smp.h |2 
 arch/powerpc/include/asm/time.h|4 
 arch/powerpc/kernel/exceptions-64s.S   |   10 +
 arch/powerpc/kernel/idle_power7.S  |   90 +++--
 arch/powerpc/kernel/smp.c  |   23 ++
 arch/powerpc/kernel/time.c |  137 ++
 arch/powerpc/platforms/cell/interrupt.c|2 
 arch/powerpc/platforms/powernv/opal-wrappers.S |1 
 arch/powerpc/platforms/ps3/smp.c   |2 
 drivers/cpuidle/cpuidle-powerpc-book3s.c   |  241 +++-
 13 files changed, 443 insertions(+), 74 deletions(-)

-- 



Re: [PATCH V2 2/2] sched: Remove un-necessary iteration over sched domains to update nr_busy_cpus

2013-10-30 Thread Preeti U Murthy
Hi Kamalesh,

On 10/30/2013 02:53 PM, Kamalesh Babulal wrote:
> Hi Preeti,
> 
>> nr_busy_cpus parameter is used by nohz_kick_needed() to find out the number
>> of busy cpus in a sched domain which has SD_SHARE_PKG_RESOURCES flag set.
>> Therefore instead of updating nr_busy_cpus at every level of sched domain,
>> since it is irrelevant, we can update this parameter only at the parent
>> domain of the sd which has this flag set. Introduce a per-cpu parameter
>> sd_busy which represents this parent domain.
>>
>> In nohz_kick_needed() we directly query the nr_busy_cpus parameter
>> associated with the groups of sd_busy.
>>
>> By associating sd_busy with the highest domain which has
>> SD_SHARE_PKG_RESOURCES flag set, we cover all lower level domains which could
>> have this flag set and trigger nohz_idle_balancing if any of the levels have
>> more than one busy cpu.
>>
>> sd_busy is irrelevant for asymmetric load balancing. However sd_asym has been
>> introduced to represent the highest sched domain which has SD_ASYM_PACKING 
>> flag set
>> so that it can be queried directly when required.
>>
>> While we are at it, we might as well change the nohz_idle parameter to be
>> updated at the sd_busy domain level alone and not the base domain level of a 
>> CPU.
>> This will unify the concept of busy cpus at just one level of sched domain
>> where it is currently used.
>>
>> Signed-off-by: Preeti U Murthy
>> ---
>>  kernel/sched/core.c  |6 ++
>>  kernel/sched/fair.c  |   38 --
>>  kernel/sched/sched.h |2 ++
>>  3 files changed, 28 insertions(+), 18 deletions(-)
>>
>> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
>> index c06b8d3..e6a6244 100644
>> --- a/kernel/sched/core.c
>> +++ b/kernel/sched/core.c
>> @@ -5271,6 +5271,8 @@ DEFINE_PER_CPU(struct sched_domain *, sd_llc);
>>  DEFINE_PER_CPU(int, sd_llc_size);
>>  DEFINE_PER_CPU(int, sd_llc_id);
>>  DEFINE_PER_CPU(struct sched_domain *, sd_numa);
>> +DEFINE_PER_CPU(struct sched_domain *, sd_busy);
>> +DEFINE_PER_CPU(struct sched_domain *, sd_asym);
>>
>>  static void update_top_cache_domain(int cpu)
>>  {
>> @@ -5282,6 +5284,7 @@ static void update_top_cache_domain(int cpu)
>>  if (sd) {
>>  id = cpumask_first(sched_domain_span(sd));
>>  size = cpumask_weight(sched_domain_span(sd));
>> +rcu_assign_pointer(per_cpu(sd_busy, cpu), sd->parent);
>>  }
> 
> 
> consider a machine with single socket, dual core with HT enabled. The top most
> domain is also the highest domain with SD_SHARE_PKG_RESOURCES flag set,
> i.e MC domain (the machine toplogy consist of SIBLING and MC domain).
> 
> # lstopo-no-graphics --no-bridges --no-io
> Machine (7869MB) + Socket L#0 + L3 L#0 (3072KB)
>   L2 L#0 (256KB) + L1d L#0 (32KB) + L1i L#0 (32KB) + Core L#0
> PU L#0 (P#0)
> PU L#1 (P#1)
>   L2 L#1 (256KB) + L1d L#1 (32KB) + L1i L#1 (32KB) + Core L#1
> PU L#2 (P#2)
> PU L#3 (P#3)
> 
> With this approach parent of MC domain is NULL and given that sd_busy is NULL,
> nr_busy_cpus of sched domain sd_busy will never be incremented/decremented.
> Resulting is nohz_kick_needed returning 0.

Right and it *should* return 0. There is no sibling domain that can
offload tasks from it. Therefore there is no point kicking nohz idle
balance.

Regards
Preeti U Murthy
> 
> Thanks,
> Kamalesh.
> 



Re: [PATCH V2 2/2] sched: Remove un-necessary iteration over sched domains to update nr_busy_cpus

2013-10-29 Thread Preeti U Murthy
The changelog missed mentioning the introduction of the sd_asym per-cpu sched
domain. Apologies for this. The patch with the changelog including mention of
sd_asym is pasted below.

Regards
Preeti U Murthy

---

sched: Remove un-necessary iteration over sched domains to update nr_busy_cpus

From: Preeti U Murthy 

nr_busy_cpus parameter is used by nohz_kick_needed() to find out the number
of busy cpus in a sched domain which has SD_SHARE_PKG_RESOURCES flag set.
Therefore instead of updating nr_busy_cpus at every level of sched domain,
since it is irrelevant, we can update this parameter only at the parent
domain of the sd which has this flag set. Introduce a per-cpu parameter
sd_busy which represents this parent domain.

In nohz_kick_needed() we directly query the nr_busy_cpus parameter
associated with the groups of sd_busy.

By associating sd_busy with the highest domain which has
SD_SHARE_PKG_RESOURCES flag set, we cover all lower level domains which could
have this flag set and trigger nohz_idle_balancing if any of the levels have
more than one busy cpu.

sd_busy is irrelevant for asymmetric load balancing. However sd_asym has been
introduced to represent the highest sched domain which has the SD_ASYM_PACKING
flag set, so that it can be queried directly when required.

While we are at it, we might as well change the nohz_idle parameter to be
updated at the sd_busy domain level alone and not the base domain level of a
CPU. This will unify the concept of busy cpus at just one level of sched
domain where it is currently used.

Signed-off-by: Preeti U Murthy
---
 kernel/sched/core.c  |6 ++
 kernel/sched/fair.c  |   38 --
 kernel/sched/sched.h |2 ++
 3 files changed, 28 insertions(+), 18 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c06b8d3..e6a6244 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5271,6 +5271,8 @@ DEFINE_PER_CPU(struct sched_domain *, sd_llc);
 DEFINE_PER_CPU(int, sd_llc_size);
 DEFINE_PER_CPU(int, sd_llc_id);
 DEFINE_PER_CPU(struct sched_domain *, sd_numa);
+DEFINE_PER_CPU(struct sched_domain *, sd_busy);
+DEFINE_PER_CPU(struct sched_domain *, sd_asym);
 
 static void update_top_cache_domain(int cpu)
 {
@@ -5282,6 +5284,7 @@ static void update_top_cache_domain(int cpu)
if (sd) {
id = cpumask_first(sched_domain_span(sd));
size = cpumask_weight(sched_domain_span(sd));
+   rcu_assign_pointer(per_cpu(sd_busy, cpu), sd->parent);
}
 
rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
@@ -5290,6 +5293,9 @@ static void update_top_cache_domain(int cpu)
 
sd = lowest_flag_domain(cpu, SD_NUMA);
rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
+
+   sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
+   rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);
 }
 
 /*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e9c9549..8602b2c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6515,16 +6515,16 @@ static inline void nohz_balance_exit_idle(int cpu)
 static inline void set_cpu_sd_state_busy(void)
 {
struct sched_domain *sd;
+   int cpu = smp_processor_id();
 
rcu_read_lock();
-   sd = rcu_dereference_check_sched_domain(this_rq()->sd);
+   sd = rcu_dereference(per_cpu(sd_busy, cpu));
 
if (!sd || !sd->nohz_idle)
goto unlock;
sd->nohz_idle = 0;
 
-   for (; sd; sd = sd->parent)
-   atomic_inc(&sd->groups->sgp->nr_busy_cpus);
+   atomic_inc(&sd->groups->sgp->nr_busy_cpus);
 unlock:
rcu_read_unlock();
 }
@@ -6532,16 +6532,16 @@ unlock:
 void set_cpu_sd_state_idle(void)
 {
struct sched_domain *sd;
+   int cpu = smp_processor_id();
 
rcu_read_lock();
-   sd = rcu_dereference_check_sched_domain(this_rq()->sd);
+   sd = rcu_dereference(per_cpu(sd_busy, cpu));
 
if (!sd || sd->nohz_idle)
goto unlock;
sd->nohz_idle = 1;
 
-   for (; sd; sd = sd->parent)
-   atomic_dec(&sd->groups->sgp->nr_busy_cpus);
+   atomic_dec(&sd->groups->sgp->nr_busy_cpus);
 unlock:
rcu_read_unlock();
 }
@@ -6748,6 +6748,8 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
 {
unsigned long now = jiffies;
struct sched_domain *sd;
+   struct sched_group_power *sgp;
+   int nr_busy;
 
if (unlikely(idle_cpu(cpu)))
return 0;
@@ -6773,22 +6775,22 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
goto need_kick;
 
rcu_read_lock();
-   for_each_domain(cpu, sd) {
-   struct sched_group *sg = sd->groups;
-   struct sched_group_power *sgp = sg->sgp;
-   int nr_busy = atomic_read(&sgp->nr_busy_cpus);
+   sd = rcu_dereference(per

[PATCH V2 2/2] sched: Remove un-necessary iteration over sched domains to update nr_busy_cpus

2013-10-29 Thread Preeti U Murthy
nr_busy_cpus parameter is used by nohz_kick_needed() to find out the number
of busy cpus in a sched domain which has SD_SHARE_PKG_RESOURCES flag set.
Therefore instead of updating nr_busy_cpus at every level of sched domain,
since it is irrelevant, we can update this parameter only at the parent
domain of the sd which has this flag set. Introduce a per-cpu parameter
sd_busy which represents this parent domain.

In nohz_kick_needed() we directly query the nr_busy_cpus parameter
associated with the groups of sd_busy.

By associating sd_busy with the highest domain which has
SD_SHARE_PKG_RESOURCES flag set, we cover all lower level domains which could
have this flag set and trigger nohz_idle_balancing if any of the levels have
more than one busy cpu.

sd_busy is irrelevant for asymmetric load balancing.

While we are at it, we might as well change the nohz_idle parameter to be
updated at the sd_busy domain level alone and not the base domain level of a
CPU. This will unify the concept of busy cpus at just one level of sched
domain where it is currently used.

Signed-off-by: Preeti U Murthy
---

 kernel/sched/core.c  |6 ++
 kernel/sched/fair.c  |   38 --
 kernel/sched/sched.h |2 ++
 3 files changed, 28 insertions(+), 18 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c06b8d3..e6a6244 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5271,6 +5271,8 @@ DEFINE_PER_CPU(struct sched_domain *, sd_llc);
 DEFINE_PER_CPU(int, sd_llc_size);
 DEFINE_PER_CPU(int, sd_llc_id);
 DEFINE_PER_CPU(struct sched_domain *, sd_numa);
+DEFINE_PER_CPU(struct sched_domain *, sd_busy);
+DEFINE_PER_CPU(struct sched_domain *, sd_asym);
 
 static void update_top_cache_domain(int cpu)
 {
@@ -5282,6 +5284,7 @@ static void update_top_cache_domain(int cpu)
if (sd) {
id = cpumask_first(sched_domain_span(sd));
size = cpumask_weight(sched_domain_span(sd));
+   rcu_assign_pointer(per_cpu(sd_busy, cpu), sd->parent);
}
 
rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
@@ -5290,6 +5293,9 @@ static void update_top_cache_domain(int cpu)
 
sd = lowest_flag_domain(cpu, SD_NUMA);
rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
+
+   sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
+   rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);
 }
 
 /*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e9c9549..8602b2c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6515,16 +6515,16 @@ static inline void nohz_balance_exit_idle(int cpu)
 static inline void set_cpu_sd_state_busy(void)
 {
struct sched_domain *sd;
+   int cpu = smp_processor_id();
 
rcu_read_lock();
-   sd = rcu_dereference_check_sched_domain(this_rq()->sd);
+   sd = rcu_dereference(per_cpu(sd_busy, cpu));
 
if (!sd || !sd->nohz_idle)
goto unlock;
sd->nohz_idle = 0;
 
-   for (; sd; sd = sd->parent)
-   atomic_inc(&sd->groups->sgp->nr_busy_cpus);
+   atomic_inc(&sd->groups->sgp->nr_busy_cpus);
 unlock:
rcu_read_unlock();
 }
@@ -6532,16 +6532,16 @@ unlock:
 void set_cpu_sd_state_idle(void)
 {
struct sched_domain *sd;
+   int cpu = smp_processor_id();
 
rcu_read_lock();
-   sd = rcu_dereference_check_sched_domain(this_rq()->sd);
+   sd = rcu_dereference(per_cpu(sd_busy, cpu));
 
if (!sd || sd->nohz_idle)
goto unlock;
sd->nohz_idle = 1;
 
-   for (; sd; sd = sd->parent)
-   atomic_dec(&sd->groups->sgp->nr_busy_cpus);
+   atomic_dec(&sd->groups->sgp->nr_busy_cpus);
 unlock:
rcu_read_unlock();
 }
@@ -6748,6 +6748,8 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
 {
unsigned long now = jiffies;
struct sched_domain *sd;
+   struct sched_group_power *sgp;
+   int nr_busy;
 
if (unlikely(idle_cpu(cpu)))
return 0;
@@ -6773,22 +6775,22 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
goto need_kick;
 
rcu_read_lock();
-   for_each_domain(cpu, sd) {
-   struct sched_group *sg = sd->groups;
-   struct sched_group_power *sgp = sg->sgp;
-   int nr_busy = atomic_read(&sgp->nr_busy_cpus);
+   sd = rcu_dereference(per_cpu(sd_busy, cpu));
 
-   if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1)
-   goto need_kick_unlock;
+   if (sd) {
+   sgp = sd->groups->sgp;
+   nr_busy = atomic_read(&sgp->nr_busy_cpus);
 
-   if (sd->flags & SD_ASYM_PACKING
-   && (cpumask_first_and(nohz.idle_cpus_mask,
- sched_domain_span(sd)) < cpu))
+   

[PATCH V2 1/2] sched: Fix asymmetric scheduling for POWER7

2013-10-29 Thread Preeti U Murthy
From: Vaidyanathan Srinivasan 

Asymmetric scheduling within a core is a scheduler loadbalancing
feature that is triggered when SD_ASYM_PACKING flag is set.  The goal
for the load balancer is to move tasks to lower order idle SMT threads
within a core on a POWER7 system.

In nohz_kick_needed(), we intend to check if our sched domain (core)
is completely busy or we have idle cpu.

The following check for SD_ASYM_PACKING:

(cpumask_first_and(nohz.idle_cpus_mask, sched_domain_span(sd)) < cpu)

already covers the case of checking if the domain has an idle cpu,
because cpumask_first_and() will not yield any set bits if this domain
has no idle cpu.

Hence, nr_busy check against group weight can be removed.
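
To make the argument concrete, here is the check in question with the relevant
return convention spelled out (a sketch, not part of the patch; it relies only
on cpumask_first_and() returning >= nr_cpu_ids when the two masks share no bit):

        /* If no CPU in this domain is idle, cpumask_first_and() returns
         * nr_cpu_ids (or larger), which can never be smaller than a valid
         * cpu number, so the condition below already fails for a fully busy
         * domain and the separate "nr_busy != sg->group_weight" test is
         * redundant. */
        if (sd->flags & SD_ASYM_PACKING
            && (cpumask_first_and(nohz.idle_cpus_mask,
                                  sched_domain_span(sd)) < cpu))
                goto need_kick_unlock;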

Reported-by: Michael Neuling 
Signed-off-by: Vaidyanathan Srinivasan 
Signed-off-by: Preeti U Murthy 
Tested-by: Michael Neuling 
---

 kernel/sched/fair.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 813dd61..e9c9549 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6781,7 +6781,7 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1)
goto need_kick_unlock;
 
-   if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight
+   if (sd->flags & SD_ASYM_PACKING
&& (cpumask_first_and(nohz.idle_cpus_mask,
  sched_domain_span(sd)) < cpu))
goto need_kick_unlock;



[PATCH V2 0/2] sched: Cleanups,fixes in nohz_kick_needed()

2013-10-29 Thread Preeti U Murthy
Changes from V1:https://lkml.org/lkml/2013/10/21/248

1. Swapped the order of PATCH1 and PATCH2 in V1 so as to not mess with the
nr_busy_cpus parameter computation during asymmetric balancing, while fixing
it.

2. nohz_busy_cpus parameter is to be updated and queried at only one level of
the sched domain-sd_busy where it is relevant.

3. Introduce sd_asym to represent the sched domain where asymmetric load
balancing has to be done.
---

Preeti U Murthy (1):
  sched: Remove un-necessary iteration over sched domains to update 
nr_busy_cpus

Vaidyanathan Srinivasan (1):
  sched: Fix asymmetric scheduling for POWER7


 kernel/sched/core.c  |6 ++
 kernel/sched/fair.c  |   38 --
 kernel/sched/sched.h |2 ++
 3 files changed, 28 insertions(+), 18 deletions(-)



Re: [PATCH 3/3] sched: Aggressive balance in domains whose groups share package resources

2013-10-28 Thread Preeti U Murthy
Hi Peter,

On 10/28/2013 09:23 PM, Peter Zijlstra wrote:
> On Mon, Oct 21, 2013 at 05:15:02PM +0530, Vaidyanathan Srinivasan wrote:
>> From: Preeti U Murthy 
>>
>> The current logic in load balance is such that after picking the
>> busiest group, the load is attempted to be moved from the busiest cpu
>> in that group to the dst_cpu. If the load cannot be moved from the
>> busiest cpu to dst_cpu due to either tsk_cpus_allowed mask or cache
>> hot tasks, then the dst_cpu is changed to be another idle cpu within
>> the dst->grpmask. If even then, the load cannot be moved from the
>> busiest cpu, then the source group is changed. The next busiest group
>> is found and the above steps are repeated.
>>
>> However if the cpus in the group share package resources, then when
>> a load movement from the busiest cpu in this group fails as above,
>> instead of finding the next busiest group to move load from, find the
>> next busiest cpu *within the same group* from which to move load away.
>> By doing so, a conscious effort is made during load balancing to keep
>> just one cpu busy as much as possible within domains that have
>> SHARED_PKG_RESOURCES flag set unless under scenarios of high load.
>> Having multiple cpus busy within a domain which share package resource
>> could lead to a performance hit.
>>
>> A similar scenario arises in active load balancing as well. When the
>> current task on the busiest cpu cannot be moved away due to task
>> pinning, currently no more attempts at load balancing is made.
> 
>> This
>> patch checks if the balancing is being done on a group whose cpus
>> share package resources. If so, then check if the load balancing can
>> be done for other cpus in the same group.
> 
> So I absolutely hate this patch... Also I'm not convinced I actually
> understand the explanation above.
> 
> Furthermore; there is nothing special about spreading tasks for
> SHARED_PKG_RESOURCES and special casing that one case is just wrong.
> 
> If anything it should be keyed off of SD_PREFER_SIBLING and or
> cpu_power.

At a SIBLING level, which has SHARED_PKG_RESOURCES set, cpu_power in
fact takes care of ensuring that the scheduler mostly spreads the load
when there is more than one running task by nominating the group as
busy. But the issue that this patch is bringing to the front is a bit
different; its not during the time of this nomination, its at the time
of load balancing. It is explained below.

So metrics like cpu_power and flags like SD_PREFER_SIBLING ensure that
we spread the load by nominating such groups as busiest in
update_sg_lb_stats() and update_sd_lb_stats(). So "nominating a group"
as busiest by virtue of cpu_power or flags is taken care of.

However, in load_balance(), if the imbalance cannot be offset by moving
load from the busiest_cpu in the busiest_group, then today we do not try
the *next busiest cpu in the group*; instead we try the next busiest_group.
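
A condensed view of that flow, for reference (a simplified sketch of
load_balance() in this kernel, not a verbatim quote of the source):

        group = find_busiest_group(&env);          /* nominated via cpu_power/flags */
        busiest = find_busiest_queue(&env, group);
        cur_ld_moved = move_tasks(&env);           /* pull from busiest to dst_cpu */
        if (!cur_ld_moved) {
                /* today: vary dst_cpu within the destination group, and if
                 * that also fails move on to the *next busiest group*;
                 * another cpu inside the same busiest group is never tried,
                 * which is the gap discussed here. */
        }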

So the effort we put into nominating this group as busiest (because its
group power and flags do not favour tasks on it) pays off only if the
busiest cpu in that group cooperates in offloading tasks. Should we not be
trying our best to move load from any other cpu in this group?

This patch identifies one such situation, which led to too many tasks on
a core and got me to ponder over this question. I agree that the fix in
this patch is not right. But I thought this would open up discussion
around the above question. It's true that iterating over all the cpus in
a group during the actual load balance is too much of an overhead, but
isn't there a balance we can strike during load balance iterations for
such groups which have limited cpu power?
> 
Thanks

Regards
Preeti U Murthy



Re: [PATCH 1/3] sched: Fix nohz_kick_needed to consider the nr_busy of the parent domain's group

2013-10-28 Thread Preeti U Murthy
Hi Peter,

On 10/28/2013 07:20 PM, Peter Zijlstra wrote:
> On Thu, Oct 24, 2013 at 01:37:38PM +0530, Preeti U Murthy wrote:
>>  kernel/sched/core.c  |5 +
>>  kernel/sched/fair.c  |   38 --
>>  kernel/sched/sched.h |1 +
>>  3 files changed, 26 insertions(+), 18 deletions(-)
>>
>> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
>> index c06b8d3..c540392 100644
>> --- a/kernel/sched/core.c
>> +++ b/kernel/sched/core.c
>> @@ -5271,6 +5271,7 @@ DEFINE_PER_CPU(struct sched_domain *, sd_llc);
>>  DEFINE_PER_CPU(int, sd_llc_size);
>>  DEFINE_PER_CPU(int, sd_llc_id);
>>  DEFINE_PER_CPU(struct sched_domain *, sd_numa);
>> +DEFINE_PER_CPU(struct sched_domain *, sd_busy);
>>  
>>  static void update_top_cache_domain(int cpu)
>>  {
>> @@ -5290,6 +5291,10 @@ static void update_top_cache_domain(int cpu)
>>  
>>  sd = lowest_flag_domain(cpu, SD_NUMA);
>>  rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
>> +
>> +sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
>> +if (sd)
>> +rcu_assign_pointer(per_cpu(sd_busy, cpu), sd->parent);
>>  }
>>  
>>  /*
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index e9c9549..f66cfd9 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -6515,16 +6515,16 @@ static inline void nohz_balance_exit_idle(int cpu)
>>  static inline void set_cpu_sd_state_busy(void)
>>  {
>>  struct sched_domain *sd;
>> +int cpu = smp_processor_id();
>>  
>>  rcu_read_lock();
>> +sd = rcu_dereference(per_cpu(sd_busy, cpu));
>>  
>>  if (!sd || !sd->nohz_idle)
>>  goto unlock;
>>  sd->nohz_idle = 0;
>>  
>> +atomic_inc(&sd->groups->sgp->nr_busy_cpus);
>>  unlock:
>>  rcu_read_unlock();
>>  }
>> @@ -6532,16 +6532,16 @@ unlock:
>>  void set_cpu_sd_state_idle(void)
>>  {
>>  struct sched_domain *sd;
>> +int cpu = smp_processor_id();
>>  
>>  rcu_read_lock();
>> +sd = rcu_dereference(per_cpu(sd_busy, cpu));
>>  
>>  if (!sd || sd->nohz_idle)
>>  goto unlock;
>>  sd->nohz_idle = 1;
>>  
>> +atomic_dec(&sd->groups->sgp->nr_busy_cpus);
>>  unlock:
>>  rcu_read_unlock();
>>  }
> 
> Oh nice, that gets rid of the multiple atomics, and it nicely splits
> this nohz logic into per topology groups -- now if only we could split
> the rest too :-)

I am sorry, I don't get you here. By the 'rest', do you refer to
nohz_kick_needed() as below? Or am I missing something?

> 
>> @@ -6748,6 +6748,8 @@ static inline int nohz_kick_needed(struct rq *rq, int 
>> cpu)
>>  {
>>  unsigned long now = jiffies;
>>  struct sched_domain *sd;
>> +struct sched_group_power *sgp;
>> +int nr_busy;
>>  
>>  if (unlikely(idle_cpu(cpu)))
>>  return 0;
>> @@ -6773,22 +6775,22 @@ static inline int nohz_kick_needed(struct rq *rq, 
>> int cpu)
>>  goto need_kick;
>>  
>>  rcu_read_lock();
>> +sd = rcu_dereference(per_cpu(sd_busy, cpu));
>>  
>> +if (sd) {
>> +sgp = sd->groups->sgp;
>> +nr_busy = atomic_read(&sgp->nr_busy_cpus);
>>  
>> +if (nr_busy > 1)
>>  goto need_kick_unlock;
>>  }
> 
> OK, so far so good.
> 
>> +
>> +sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
>> +
>> +if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
>> +  sched_domain_span(sd)) < cpu))
>> +goto need_kick_unlock;
>> +
>>      rcu_read_unlock();
>>  return 0;
> 
> This again is a bit sad; most archs will not have SD_ASYM_PACKING set at
> all; this means that they all will do a complete (and pointless) sched
> domain tree walk here.

There will not be a 'complete' sched domain tree walk right? The
iteration will break at the first level of the sched domain for those
archs which do not have SD_ASYM_PACKING set at all.

But it is true that doing a sched domain tree walk regularly is a bad
idea, might as well update the domain with SD_ASYM_PACKING flag set once
and query this domain when required.

I will send out the patch with sd_asym domain introduced rather than the
above.
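
A sketch of what that would look like (matching the sd_asym hunks already
posted in V2 of this series, shown here only for context):

        DEFINE_PER_CPU(struct sched_domain *, sd_asym);

        /* update_top_cache_domain(): cache the domain once per topology update */
        sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
        rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);

        /* nohz_kick_needed(): query it directly, no per-tick domain walk */
        sd = rcu_dereference(per_cpu(sd_asym, cpu));
        if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
                                     sched_domain_span(sd)) < cpu))
                goto need_kick_unlock;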

Thanks

Regards
Preeti U Murthy

> 
> It would be much better to also introduce sd_asym and do the analogous
> thing to the new sd_busy.
> 



Re: [PATCH 3/3] sched: Aggressive balance in domains whose groups share package resources

2013-10-25 Thread Preeti U Murthy
Hi Peter,

On 10/23/2013 03:53 AM, Peter Zijlstra wrote:
> On Mon, Oct 21, 2013 at 05:15:02PM +0530, Vaidyanathan Srinivasan wrote:
>>  kernel/sched/fair.c |   18 ++
>>  1 file changed, 18 insertions(+)
>>
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 828ed97..bbcd96b 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -5165,6 +5165,8 @@ static int load_balance(int this_cpu, struct rq 
>> *this_rq,
>>  {
>>  int ld_moved, cur_ld_moved, active_balance = 0;
>>  struct sched_group *group;
>> +struct sched_domain *child;
>> +int share_pkg_res = 0;
>>  struct rq *busiest;
>>  unsigned long flags;
>>  struct cpumask *cpus = __get_cpu_var(load_balance_mask);
>> @@ -5190,6 +5192,10 @@ static int load_balance(int this_cpu, struct rq 
>> *this_rq,
>>  
>>  schedstat_inc(sd, lb_count[idle]);
>>  
>> +child = sd->child;
>> +if (child && child->flags & SD_SHARE_PKG_RESOURCES)
>> +share_pkg_res = 1;
>> +
>>  redo:
>>  if (!should_we_balance(&env)) {
>>  *continue_balancing = 0;
>> @@ -5202,6 +5208,7 @@ redo:
>>  goto out_balanced;
>>  }
>>  
>> +redo_grp:
>>  busiest = find_busiest_queue(&env, group);
>>  if (!busiest) {
>>  schedstat_inc(sd, lb_nobusyq[idle]);
>> @@ -5292,6 +5299,11 @@ more_balance:
>>  if (!cpumask_empty(cpus)) {
>>  env.loop = 0;
>>  env.loop_break = sched_nr_migrate_break;
>> +if (share_pkg_res &&
>> +cpumask_intersects(cpus,
>> +to_cpumask(group->cpumask)))
> 
> sched_group_cpus()
> 
>> +goto redo_grp;
>> +
>>  goto redo;
>>  }
>>  goto out_balanced;
>> @@ -5318,9 +5330,15 @@ more_balance:
>>   */
>>  if (!cpumask_test_cpu(this_cpu,
>>  tsk_cpus_allowed(busiest->curr))) {
>> +cpumask_clear_cpu(cpu_of(busiest), cpus);
>>  raw_spin_unlock_irqrestore(&busiest->lock,
>>  flags);
>>  env.flags |= LBF_ALL_PINNED;
>> +if (share_pkg_res &&
>> +cpumask_intersects(cpus,
>> +        to_cpumask(group->cpumask)))
>> +goto redo_grp;
>> +
>>  goto out_one_pinned;
>>  }
> 
> Man this retry logic is getting annoying.. isn't there anything saner we
> can do?

Maybe we can do this just at the SIBLINGS level? Having the hyper
threads busy due to the scenario described in the changelog is bad for
performance.

Regards
Preeti U Murthy


Re: [PATCH 1/3] sched: Fix nohz_kick_needed to consider the nr_busy of the parent domain's group

2013-10-24 Thread Preeti U Murthy
Hi Vincent,

I have addressed your comments and below is the fresh patch. This patch
applies on PATCH 2/3 posted in this thread.

Regards
Preeti U Murthy


sched: Remove un-necessary iterations over sched domains to update/query nr_busy_cpus

From: Preeti U Murthy 

nr_busy_cpus parameter is used by nohz_kick_needed() to find out the number
of busy cpus in a sched domain which has SD_SHARE_PKG_RESOURCES flag set.
Therefore instead of updating nr_busy_cpus at every level of sched domain,
since it is irrelevant, we can update this parameter only at the parent
domain of the sd which has this flag set. Introduce a per-cpu parameter
sd_busy which represents this parent domain.

In nohz_kick_needed() we directly query the nr_busy_cpus parameter
associated with the groups of sd_busy.

By associating sd_busy with the highest domain which has
SD_SHARE_PKG_RESOURCES flag set, we cover all lower level domains which could
have this flag set and trigger nohz_idle_balancing if any of the levels have
more than one busy cpu.

sd_busy is irrelevant for asymmetric load balancing.

While we are at it, we might as well change the nohz_idle parameter to be
updated at the sd_busy domain level alone and not the base domain level of a
CPU. This will unify the concept of busy cpus at just one level of sched
domain where it is currently used.

Signed-off-by: Preeti U Murthy
---
 kernel/sched/core.c  |5 +
 kernel/sched/fair.c  |   38 --
 kernel/sched/sched.h |1 +
 3 files changed, 26 insertions(+), 18 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c06b8d3..c540392 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5271,6 +5271,7 @@ DEFINE_PER_CPU(struct sched_domain *, sd_llc);
 DEFINE_PER_CPU(int, sd_llc_size);
 DEFINE_PER_CPU(int, sd_llc_id);
 DEFINE_PER_CPU(struct sched_domain *, sd_numa);
+DEFINE_PER_CPU(struct sched_domain *, sd_busy);
 
 static void update_top_cache_domain(int cpu)
 {
@@ -5290,6 +5291,10 @@ static void update_top_cache_domain(int cpu)
 
sd = lowest_flag_domain(cpu, SD_NUMA);
rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
+
+   sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
+   if (sd)
+   rcu_assign_pointer(per_cpu(sd_busy, cpu), sd->parent);
 }
 
 /*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e9c9549..f66cfd9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6515,16 +6515,16 @@ static inline void nohz_balance_exit_idle(int cpu)
 static inline void set_cpu_sd_state_busy(void)
 {
struct sched_domain *sd;
+   int cpu = smp_processor_id();
 
rcu_read_lock();
-   sd = rcu_dereference_check_sched_domain(this_rq()->sd);
+   sd = rcu_dereference(per_cpu(sd_busy, cpu));
 
if (!sd || !sd->nohz_idle)
goto unlock;
sd->nohz_idle = 0;
 
-   for (; sd; sd = sd->parent)
-   atomic_inc(&sd->groups->sgp->nr_busy_cpus);
+   atomic_inc(&sd->groups->sgp->nr_busy_cpus);
 unlock:
rcu_read_unlock();
 }
@@ -6532,16 +6532,16 @@ unlock:
 void set_cpu_sd_state_idle(void)
 {
struct sched_domain *sd;
+   int cpu = smp_processor_id();
 
rcu_read_lock();
-   sd = rcu_dereference_check_sched_domain(this_rq()->sd);
+   sd = rcu_dereference(per_cpu(sd_busy, cpu));
 
if (!sd || sd->nohz_idle)
goto unlock;
sd->nohz_idle = 1;
 
-   for (; sd; sd = sd->parent)
-   atomic_dec(&sd->groups->sgp->nr_busy_cpus);
+   atomic_dec(&sd->groups->sgp->nr_busy_cpus);
 unlock:
rcu_read_unlock();
 }
@@ -6748,6 +6748,8 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
 {
unsigned long now = jiffies;
struct sched_domain *sd;
+   struct sched_group_power *sgp;
+   int nr_busy;
 
if (unlikely(idle_cpu(cpu)))
return 0;
@@ -6773,22 +6775,22 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
goto need_kick;
 
rcu_read_lock();
-   for_each_domain(cpu, sd) {
-   struct sched_group *sg = sd->groups;
-   struct sched_group_power *sgp = sg->sgp;
-   int nr_busy = atomic_read(&sgp->nr_busy_cpus);
+   sd = rcu_dereference(per_cpu(sd_busy, cpu));
 
-   if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1)
-   goto need_kick_unlock;
+   if (sd) {
+   sgp = sd->groups->sgp;
+   nr_busy = atomic_read(&sgp->nr_busy_cpus);
 
-   if (sd->flags & SD_ASYM_PACKING
-   && (cpumask_first_and(nohz.idle_cpus_mask,
- sched_domain_span(sd)) < cpu))
+   if (nr_busy > 1)
goto need_kick_unlock;
-
- 

Re: [PATCH 3/3] sched: Aggressive balance in domains whose groups share package resources

2013-10-23 Thread Preeti U Murthy
Hi Peter,

On 10/23/2013 03:53 AM, Peter Zijlstra wrote:
> On Mon, Oct 21, 2013 at 05:15:02PM +0530, Vaidyanathan Srinivasan wrote:
>>  kernel/sched/fair.c |   18 ++
>>  1 file changed, 18 insertions(+)
>>
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 828ed97..bbcd96b 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -5165,6 +5165,8 @@ static int load_balance(int this_cpu, struct rq 
>> *this_rq,
>>  {
>>  int ld_moved, cur_ld_moved, active_balance = 0;
>>  struct sched_group *group;
>> +struct sched_domain *child;
>> +int share_pkg_res = 0;
>>  struct rq *busiest;
>>  unsigned long flags;
>>  struct cpumask *cpus = __get_cpu_var(load_balance_mask);
>> @@ -5190,6 +5192,10 @@ static int load_balance(int this_cpu, struct rq 
>> *this_rq,
>>  
>>  schedstat_inc(sd, lb_count[idle]);
>>  
>> +child = sd->child;
>> +if (child && child->flags & SD_SHARE_PKG_RESOURCES)
>> +share_pkg_res = 1;
>> +
>>  redo:
>>  if (!should_we_balance(&env)) {
>>  *continue_balancing = 0;
>> @@ -5202,6 +5208,7 @@ redo:
>>  goto out_balanced;
>>  }
>>  
>> +redo_grp:
>>  busiest = find_busiest_queue(&env, group);
>>  if (!busiest) {
>>  schedstat_inc(sd, lb_nobusyq[idle]);
>> @@ -5292,6 +5299,11 @@ more_balance:
>>  if (!cpumask_empty(cpus)) {
>>  env.loop = 0;
>>  env.loop_break = sched_nr_migrate_break;
>> +if (share_pkg_res &&
>> +cpumask_intersects(cpus,
>> +to_cpumask(group->cpumask)))
> 
> sched_group_cpus()
> 
>> +goto redo_grp;
>> +
>>  goto redo;
>>  }
>>  goto out_balanced;
>> @@ -5318,9 +5330,15 @@ more_balance:
>>   */
>>  if (!cpumask_test_cpu(this_cpu,
>>  tsk_cpus_allowed(busiest->curr))) {
>> +cpumask_clear_cpu(cpu_of(busiest), cpus);
>>  raw_spin_unlock_irqrestore(&busiest->lock,
>>  flags);
>>  env.flags |= LBF_ALL_PINNED;
>> +if (share_pkg_res &&
>> +        cpumask_intersects(cpus,
>> +to_cpumask(group->cpumask)))
>> +goto redo_grp;
>> +
>>  goto out_one_pinned;
>>  }
> 
> Man this retry logic is getting annoying.. isn't there anything saner we
> can do?

Let me give this a thought and get back.

Regards
Preeti U Murthy
> 



Re: [PATCH 1/3] sched: Fix nohz_kick_needed to consider the nr_busy of the parent domain's group

2013-10-23 Thread Preeti U Murthy
Hi Peter

On 10/23/2013 03:41 AM, Peter Zijlstra wrote:
> This nohz stuff really needs to be re-thought and made more scalable --
> its a royal pain :/

Why not do something like the below instead? It does the following.

This patch introduces sd_busy just like your suggested patch, except that
it points to the parent of the highest level sched domain which has the
SD_SHARE_PKG_RESOURCES set and initializes it in update_top_cache_domain(). 
This is the sched domain that is relevant in nohz_kick_needed().

set_cpu_sd_state_busy(), set_cpu_sd_state_idle() and nohz_kick_needed() query
and update *only* this sched domain (sd_busy) for nr_busy_cpus. They are the
only users of this parameter. While we are at it, we might as well change
the nohz_idle parameter to be updated at the sd_busy domain level alone and
not the base domain level of a CPU. This will unify the concept of busy cpus
at just one level of sched domain.

There is no need to iterate through all levels of sched domains of a cpu to
update nr_busy_cpus since it is irrelevant at all other sched domains except
at sd_busy level.

De-couple asymmetric load balancing from the nr_busy parameter which the
PATCH 2/3 anyway does. sd_busy therefore is irrelevant for asymmetric load
balancing.

Regards
Preeti U Murthy
START_PATCH---

sched: Fix nohz_kick_needed()

---
 kernel/sched/core.c  |4 
 kernel/sched/fair.c  |   40 ++--
 kernel/sched/sched.h |1 +
 3 files changed, 27 insertions(+), 18 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c06b8d3..c1dd11c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5271,6 +5271,7 @@ DEFINE_PER_CPU(struct sched_domain *, sd_llc);
 DEFINE_PER_CPU(int, sd_llc_size);
 DEFINE_PER_CPU(int, sd_llc_id);
 DEFINE_PER_CPU(struct sched_domain *, sd_numa);
+DEFINE_PER_CPU(struct sched_domain *, sd_busy);
 
 static void update_top_cache_domain(int cpu)
 {
@@ -5290,6 +5291,9 @@ static void update_top_cache_domain(int cpu)
 
sd = lowest_flag_domain(cpu, SD_NUMA);
rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
+
+   sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES)->parent;
+   rcu_assign_pointer(per_cpu(sd_busy, cpu), sd);
 }
 
 /*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 813dd61..71e6f14 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6515,16 +6515,16 @@ static inline void nohz_balance_exit_idle(int cpu)
 static inline void set_cpu_sd_state_busy(void)
 {
struct sched_domain *sd;
+   int cpu = smp_processor_id();
 
rcu_read_lock();
-   sd = rcu_dereference_check_sched_domain(this_rq()->sd);
+   sd = per_cpu(sd_busy, cpu);
 
if (!sd || !sd->nohz_idle)
goto unlock;
sd->nohz_idle = 0;
 
-   for (; sd; sd = sd->parent)
-   atomic_inc(&sd->groups->sgp->nr_busy_cpus);
+   atomic_inc(&sd->groups->sgp->nr_busy_cpus);
 unlock:
rcu_read_unlock();
 }
@@ -6532,16 +6532,16 @@ unlock:
 void set_cpu_sd_state_idle(void)
 {
struct sched_domain *sd;
+   int cpu = smp_processor_id();
 
rcu_read_lock();
-   sd = rcu_dereference_check_sched_domain(this_rq()->sd);
+   sd = per_cpu(sd_busy, cpu);
 
if (!sd || sd->nohz_idle)
goto unlock;
sd->nohz_idle = 1;
 
-   for (; sd; sd = sd->parent)
-   atomic_dec(&sd->groups->sgp->nr_busy_cpus);
+   atomic_dec(&sd->groups->sgp->nr_busy_cpus);
 unlock:
rcu_read_unlock();
 }
@@ -6748,6 +6748,9 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
 {
unsigned long now = jiffies;
struct sched_domain *sd;
+   struct sched_group *sg;
+   struct sched_group_power *sgp;
+   int nr_busy;
 
if (unlikely(idle_cpu(cpu)))
return 0;
@@ -6773,22 +6776,23 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
goto need_kick;
 
rcu_read_lock();
-   for_each_domain(cpu, sd) {
-   struct sched_group *sg = sd->groups;
-   struct sched_group_power *sgp = sg->sgp;
-   int nr_busy = atomic_read(&sgp->nr_busy_cpus);
+   sd = per_cpu(sd_busy, cpu);
 
-   if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1)
-   goto need_kick_unlock;
+   if (sd) {
+   sg = sd->groups;
+   sgp = sg->sgp;
+   nr_busy = atomic_read(&sgp->nr_busy_cpus);
 
-   if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight
-   && (cpumask_first_and(nohz.idle_cpus_mask,
- sched_domain_span(sd)) < cpu))
+   if (nr_busy > 1)

Re: [PATCH 1/3] sched: Fix nohz_kick_needed to consider the nr_busy of the parent domain's group

2013-10-22 Thread Preeti U Murthy
On 10/23/2013 09:30 AM, Preeti U Murthy wrote:
> Hi Peter,
> 
> On 10/23/2013 03:41 AM, Peter Zijlstra wrote:
>> On Mon, Oct 21, 2013 at 05:14:42PM +0530, Vaidyanathan Srinivasan wrote:
>>>  kernel/sched/fair.c |   19 +--
>>>  1 file changed, 13 insertions(+), 6 deletions(-)
>>>
>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>> index 7c70201..12f0eab 100644
>>> --- a/kernel/sched/fair.c
>>> +++ b/kernel/sched/fair.c
>>> @@ -5807,12 +5807,19 @@ static inline int nohz_kick_needed(struct rq *rq, 
>>> int cpu)
>>>  
>>> rcu_read_lock();
>>> for_each_domain(cpu, sd) {
>>> +   struct sched_domain *sd_parent = sd->parent;
>>> +   struct sched_group *sg;
>>> +   struct sched_group_power *sgp;
>>> +   int nr_busy;
>>> +
>>> +   if (sd_parent) {
>>> +   sg = sd_parent->groups;
>>> +   sgp = sg->sgp;
>>> +   nr_busy = atomic_read(&sgp->nr_busy_cpus);
>>> +
>>> +   if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1)
>>> +   goto need_kick_unlock;
>>> +   }
>>>  
>>> if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight
>>> && (cpumask_first_and(nohz.idle_cpus_mask,
>>>
>>
>> Almost I'd say; what happens on !sd_parent && SD_ASYM_PACKING ?
> 
> You are right, sorry about this. The idea was to correct the nr_busy
> computation before the patch that would remove its usage in the second
> patch. But that would mean the condition nr_busy != sg->group_weight
> would be invalid with this patch. The second patch needs to go first to
> avoid this confusion.
> 
>>
>> Also, this made me look at the nr_busy stuff again, and somehow that
>> entire thing makes me a little sad.
>>
>> Can't we do something like the below and cut that nr_busy sd iteration
>> short?
> 
> We can surely cut the nr_busy sd iteration but not like what is done
> with this patch. You stop the nr_busy computation at the sched domain
> that has the flag SD_SHARE_PKG_RESOURCES set. But nohz_kick_needed()
> would want to know the nr_busy for one level above this.
>Consider a core. Assume it is the highest domain with this flag set.
> The nr_busy of its groups, which are logical threads are set to 1/0
> each. But nohz_kick_needed() would like to know the sum of the nr_busy
> parameter of all the groups, i.e. the threads in a core before it
> decides if it can kick nohz_idle balancing. The information about the
> individual group's nr_busy is of no relevance here.
> 
> Thats why the above patch tries to get the
> sd->parent->groups->sgp->nr_busy_cpus. This will translate rightly to
> the core's busy cpus in this example. But the below patch stops before
> updating this parameter at the sd->parent level, where sd is the highest
> level sched domain with the SD_SHARE_PKG_RESOURCES flag set.
> 
> But we can get around all this confusion if we can move the nr_busy
> parameter to be included in the sched_domain structure rather than the
> sched_groups_power structure. Anyway the only place where nr_busy is
> used, that is at nohz_kick_needed(), is done to know the total number of
> busy cpus at a sched domain level which has the SD_SHARE_PKG_RESOURCES
> set and not at a sched group level.
> 
> So why not move nr_busy to struct sched_domain  and having the below
> patch which just updates this parameter for the sched domain, sd_busy ?

Oh this can't be done :( Domain structures are per cpu!

Regards
Preeti U Murthy



Re: [PATCH 1/3] sched: Fix nohz_kick_needed to consider the nr_busy of the parent domain's group

2013-10-22 Thread Preeti U Murthy
Hi Peter,

On 10/23/2013 03:41 AM, Peter Zijlstra wrote:
> On Mon, Oct 21, 2013 at 05:14:42PM +0530, Vaidyanathan Srinivasan wrote:
>>  kernel/sched/fair.c |   19 +--
>>  1 file changed, 13 insertions(+), 6 deletions(-)
>>
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 7c70201..12f0eab 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -5807,12 +5807,19 @@ static inline int nohz_kick_needed(struct rq *rq, 
>> int cpu)
>>  
>>  rcu_read_lock();
>>  for_each_domain(cpu, sd) {
>> +struct sched_domain *sd_parent = sd->parent;
>> +struct sched_group *sg;
>> +struct sched_group_power *sgp;
>> +int nr_busy;
>> +
>> +if (sd_parent) {
>> +sg = sd_parent->groups;
>> +sgp = sg->sgp;
>> +nr_busy = atomic_read(&sgp->nr_busy_cpus);
>> +
>> +if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1)
>> +goto need_kick_unlock;
>> +}
>>  
>>  if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight
>>  && (cpumask_first_and(nohz.idle_cpus_mask,
>>
> 
> Almost I'd say; what happens on !sd_parent && SD_ASYM_PACKING ?

You are right, sorry about this. The idea was to correct the nr_busy
computation before the patch that would remove its usage in the second
patch. But that would mean the condition nr_busy != sg->group_weight
would be invalid with this patch. The second patch needs to go first to
avoid this confusion.

> 
> Also, this made me look at the nr_busy stuff again, and somehow that
> entire thing makes me a little sad.
> 
> Can't we do something like the below and cut that nr_busy sd iteration
> short?

We can surely cut the nr_busy sd iteration but not like what is done
with this patch. You stop the nr_busy computation at the sched domain
that has the flag SD_SHARE_PKG_RESOURCES set. But nohz_kick_needed()
would want to know the nr_busy for one level above this.
   Consider a core. Assume it is the highest domain with this flag set.
The nr_busy of its groups, which are logical threads are set to 1/0
each. But nohz_kick_needed() would like to know the sum of the nr_busy
parameter of all the groups, i.e. the threads in a core before it
decides if it can kick nohz_idle balancing. The information about the
individual group's nr_busy is of no relevance here.
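
As a small illustration of the example above (a hypothetical 2-thread core with
one busy thread; at the SIBLING level each group is a single thread, so the
per-group counters read 1 and 0, while the core-wide count lives one level up):

        nr_busy = atomic_read(&sd->parent->groups->sgp->nr_busy_cpus); /* == 1 */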

That's why the above patch tries to get
sd->parent->groups->sgp->nr_busy_cpus. This will translate correctly to
the core's busy cpus in this example. But the below patch stops before
updating this parameter at the sd->parent level, where sd is the highest
level sched domain with the SD_SHARE_PKG_RESOURCES flag set.

But we can get around all this confusion if we can move the nr_busy
parameter to be included in the sched_domain structure rather than the
sched_groups_power structure. Anyway the only place where nr_busy is
used, that is at nohz_kick_needed(), is done to know the total number of
busy cpus at a sched domain level which has the SD_SHARE_PKG_RESOURCES
set and not at a sched group level.

So why not move nr_busy to struct sched_domain and have the patch below
just update this parameter for the sched domain, sd_busy?
This will avoid iterating through all the levels of sched domains and
should resolve the scalability issue. We also don't need to get to
sd->parent to get the nr_busy parameter for the sake of nohz_kick_needed().

What do you think?

Regards
Preeti U Murthy
> 
> This nohz stuff really needs to be re-thought and made more scalable --
> its a royal pain :/
> 
> 
>  kernel/sched/core.c  |  4 
>  kernel/sched/fair.c  | 21 +++--
>  kernel/sched/sched.h |  5 ++---
>  3 files changed, 21 insertions(+), 9 deletions(-)
> 
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index c06b8d3..89db8dc 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -5271,6 +5271,7 @@ DEFINE_PER_CPU(struct sched_domain *, sd_llc);
>  DEFINE_PER_CPU(int, sd_llc_size);
>  DEFINE_PER_CPU(int, sd_llc_id);
>  DEFINE_PER_CPU(struct sched_domain *, sd_numa);
> +DEFINE_PER_CPU(struct sched_domain *, sd_busy);
> 
>  static void update_top_cache_domain(int cpu)
>  {
> @@ -5290,6 +5291,9 @@ static void update_top_cache_domain(int cpu)
> 
>   sd = lowest_flag_domain(cpu, SD_NUMA);
>   rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
> +
> + sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING);
> + rcu_assign_pointer(per_cpu(sd_busy, cp

Re: [PATCH 1/3] sched: Fix nohz_kick_needed to consider the nr_busy of the parent domain's group

2013-10-22 Thread Preeti U Murthy
Hi Kamalesh,

On 10/22/2013 08:05 PM, Kamalesh Babulal wrote:
> * Vaidyanathan Srinivasan  [2013-10-21 17:14:42]:
> 
>>  for_each_domain(cpu, sd) {
>> -struct sched_group *sg = sd->groups;
>> -struct sched_group_power *sgp = sg->sgp;
>> -int nr_busy = atomic_read(&sgp->nr_busy_cpus);
>> -
>> -if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1)
>> -goto need_kick_unlock;
>> +struct sched_domain *sd_parent = sd->parent;
>> +struct sched_group *sg;
>> +struct sched_group_power *sgp;
>> +int nr_busy;
>> +
>> +if (sd_parent) {
>> +sg = sd_parent->groups;
>> +sgp = sg->sgp;
>> +nr_busy = atomic_read(&sgp->nr_busy_cpus);
>> +
>> +if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1)
>> +goto need_kick_unlock;
>> +}
>>
>>  if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight
>>  && (cpumask_first_and(nohz.idle_cpus_mask,
> 
> CC'ing Suresh Siddha and Vincent Guittot
> 
> Please correct me, If my understanding of idle balancing is wrong.
> With proposed approach will not idle load balancer kick in, even if
> there are busy cpus across groups or if there are 2 busy cpus which
> are spread across sockets.

Yes load balancing will happen on busy cpus periodically.

Wrt idle balancing there are two points here. One, when a CPU is just
about to go idle, it will enter idle_balance(), and trigger load
balancing with itself being the destination CPU to begin with. It will
load balance at every level of the sched domain that it belongs to. If
it manages to pull tasks, good, else it will enter an idle state.

nohz_idle_balancing is triggered by a busy cpu at every tick if it has
more than one task in its runqueue or if it belongs to a group that
shares the package resources and has more than one cpu busy. By
"nohz_idle_balance triggered", it means the busy cpu will send an ipi to
the ilb_cpu to do load balancing on behalf of the idle cpus in the
nohz mask.
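
Roughly, the two trigger conditions described above look like this (a
simplified sketch, not the exact mainline code; the helper name is made
up):

static bool busy_cpu_should_kick_nohz(struct rq *rq, int cpu)
{
	struct sched_domain *sd;
	bool kick = false;

	/* 1. This busy cpu has more than one runnable task. */
	if (rq->nr_running >= 2)
		return true;

	/* 2. It sits in a resource-sharing domain (e.g. the SMT threads of
	 *    a core) which has more than one busy cpu. */
	rcu_read_lock();
	for_each_domain(cpu, sd) {
		if ((sd->flags & SD_SHARE_PKG_RESOURCES) &&
		    atomic_read(&sd->groups->sgp->nr_busy_cpus) > 1) {
			kick = true;
			break;
		}
	}
	rcu_read_unlock();

	return kick;
}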

So to answer your question wrt this patch, if there is one busy cpu with
say 2 tasks in one socket and another busy cpu with 1 task on another
socket, the former busy cpu can kick nohz_idle_balance since it has more
than one task in its runqueue. An idle cpu in either socket could be
woken up to balance tasks with it.

The usual idle load balancer that runs on a CPU about to become idle
could pull from either cpu depending on who is more busy as it begins to
load balance across all levels of sched domain that it belongs to.
> 
> Consider 2 socket machine with 4 processors each (MC and NUMA domains).
> If the machine is partial loaded such that cpus 0,4,5,6,7 are busy, then too
> nohz balancing is triggered because with this approach
> (NUMA)->groups->sgp->nr_busy_cpus is taken in account for nohz kick, while
> iterating over MC domain.

For the example that you mention, you will have a CPU domain and a NUMA
domain. When the sockets are NUMA nodes, each socket will belong to a
CPU domain. If the sockets are non-numa nodes, then the domain
encompassing both the nodes will be a CPU domain, possibly with each
socket being an MC domain.
> 
> Isn't idle load balancer not suppose kick in, even in the case of two busy
> cpu's in a dual-core single socket system

nohz_idle_balancing is a special case. It is triggered when the
conditions mentioned in nohz_kick_needed() are true. A CPU just about to
go idle will trigger load balancing without any pre-conditions.

In a single socket machine, there will be a CPU domain encompassing the
socket and the MC domain will encompass a core. nohz_idle load balancer
will kick in if both the threads in the core have tasks running on them.
This is fair enough because the threads share the resources of the core.

Regards
Preeti U Murthy
> 
> Thanks,
> Kamalesh.
> 



[PATCH V3 5/6] cpuidle/ppc: Introduce the deep idle state in which the local timers stop

2013-09-10 Thread Preeti U Murthy
Now that we have the basic infrastructure setup to make use of the broadcast
framework, introduce the deep idle state in which cpus need to avail the
functionality provided by this infrastructure to wake them up at their
expired timer events. On ppc this deep idle state is called sleep.
In this patch however, we introduce longnap, which emulates the sleep
state by disabling timer interrupts. This is until such time that sleep
support is made available in the kernel.

Since on ppc we do not have an external device that can wake up cpus in deep
idle, the local timer of one of the cpus needs to be nominated to do this job.
This cpu is called the broadcast cpu/bc_cpu. Only if the bc_cpu is nominated
will the remaining cpus be allowed to enter deep idle state after notifying
the broadcast framework about their next timer event. The bc_cpu is not allowed
to enter deep idle state.

The first cpu that enters longnap is made the bc_cpu. It queues an hrtimer onto
itself which expires after a broadcast period. The job of this
hrtimer is to call into the broadcast framework[1] using the pseudo clock device
that we have initialized, in which the cpus whose wakeup times
have expired are sent an ipi.
On each expiry of the hrtimer, it is reprogrammed to fire at the earlier of the
next pending timer event of the cpus in deep idle and the broadcast period, so
as to not miss any wakeups.

The broadcast period is nothing but the max duration until which the
bc_cpu need not concern itself with checking for expired timer events on cpus
in deep idle. The broadcast period is set to a jiffy in this patch for debug
purposes. Ideally it needn't be smaller than the target_residency of the deep
idle state.

But having a dedicated bc_cpu would mean overloading just one cpu with the
broadcast work, which could hinder its performance apart from leading to
thermal imbalance on the chip. Therefore unassign the bc_cpu when there are no
more cpus in deep idle to be woken up. The bc_cpu is left unassigned until such
a time that a cpu enters longnap to be nominated as the bc_cpu, and the above
cycle repeats.

Protect the regions of nomination, de-nomination and the check for existence
of the broadcast cpu with a lock to ensure synchronization between them.

[1] tick_handle_oneshot_broadcast() or tick_handle_periodic_broadcast().
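
For clarity, the entry path described above boils down to something like the
sketch below (illustrative only; the lock, helpers and state variables are the
ones used in this series, and details such as the LPCR setup and error handling
are omitted):

static int longnap_loop_sketch(struct cpuidle_device *dev,
			       struct cpuidle_driver *drv, int index)
{
	int cpu = dev->cpu;
	unsigned long flags;

	spin_lock_irqsave(&longnap_idle_lock, flags);
	if (bc_cpu == -1) {
		/* No broadcast cpu yet: nominate ourselves, queue the
		 * broadcast hrtimer and stay out of deep idle. */
		bc_cpu = cpu;
		hrtimer_start(bc_hrtimer,
			      ns_to_ktime(TICK_NSEC), /* one broadcast period */
			      HRTIMER_MODE_REL_PINNED);
		spin_unlock_irqrestore(&longnap_idle_lock, flags);
		power7_nap();	/* shallow nap, decrementer stays on */
	} else if (cpu != bc_cpu) {
		spin_unlock_irqrestore(&longnap_idle_lock, flags);
		/* A bc_cpu exists: hand our next timer event over to the
		 * broadcast framework and enter deep idle. */
		clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);
		power7_nap();
		clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
	} else {
		/* We are the bc_cpu: not allowed to enter deep idle. */
		spin_unlock_irqrestore(&longnap_idle_lock, flags);
		power7_nap();
	}

	return index;
}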

Signed-off-by: Preeti U Murthy 
---

 arch/powerpc/include/asm/time.h |1 
 arch/powerpc/kernel/time.c  |2 
 drivers/cpuidle/cpuidle-ibm-power.c |  150 +++
 3 files changed, 152 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h
index 264dc96..38341fa 100644
--- a/arch/powerpc/include/asm/time.h
+++ b/arch/powerpc/include/asm/time.h
@@ -25,6 +25,7 @@ extern unsigned long tb_ticks_per_usec;
 extern unsigned long tb_ticks_per_sec;
 extern struct clock_event_device decrementer_clockevent;
 extern struct clock_event_device broadcast_clockevent;
+extern struct clock_event_device bc_timer;
 
 struct rtc_time;
 extern void to_tm(int tim, struct rtc_time * tm);
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index bda78bb..44a76de 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -129,7 +129,7 @@ EXPORT_SYMBOL(broadcast_clockevent);
 
 DEFINE_PER_CPU(u64, decrementers_next_tb);
 static DEFINE_PER_CPU(struct clock_event_device, decrementers);
-static struct clock_event_device bc_timer;
+struct clock_event_device bc_timer;
 
 #define XSEC_PER_SEC (1024*1024)
 
diff --git a/drivers/cpuidle/cpuidle-ibm-power.c 
b/drivers/cpuidle/cpuidle-ibm-power.c
index f8905c3..ae47a0a 100644
--- a/drivers/cpuidle/cpuidle-ibm-power.c
+++ b/drivers/cpuidle/cpuidle-ibm-power.c
@@ -12,12 +12,19 @@
 #include 
 #include 
 #include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
 
 #include 
 #include 
 #include 
 #include 
 #include 
+#include 
 #include 
 
 struct cpuidle_driver power_idle_driver = {
@@ -28,6 +35,26 @@ struct cpuidle_driver power_idle_driver = {
 static int max_idle_state;
 static struct cpuidle_state *cpuidle_state_table;
 
+static int bc_cpu = -1;
+static struct hrtimer *bc_hrtimer;
+static int bc_hrtimer_initialized = 0;
+
+/*
+ * Bits to indicate if a cpu can enter deep idle where local timer gets
+ * switched off.
+ * BROADCAST_CPU_PRESENT : Enter deep idle since bc_cpu is assigned
+ * BROADCAST_CPU_SELF   : Do not enter deep idle since you are bc_cpu
+ * BROADCAST_CPU_ABSENT : Do not enter deep idle since there is no bc_cpu,
+ *hence nominate yourself as bc_cpu
+ * BROADCAST_CPU_ERROR :  Do not enter deep idle since there is no bc_cpu
+ *and the broadcast hrtimer could not be initialized.
+ */
+enum broadcast_cpu_status {
+   BROADCAST_CPU_PRESENT,
+   BROADCAST_CPU_SELF,
+   BROADCAST_CPU_ERROR,
+};
+
 static inline void idle_loop_prolog(unsigned long *in_purr)
 {
*in_purr = 

[PATCH V3 6/6] cpuidle/ppc: Nominate new broadcast cpu on hotplug of the old

2013-09-10 Thread Preeti U Murthy
On hotplug of the broadcast cpu, cancel the hrtimer queued to do
broadcast and nominate a new broadcast cpu to be the first cpu in the
broadcast mask which includes all the cpus that have notified the broadcast
framework about entering deep idle state.

Since the new broadcast cpu is one of the cpus in deep idle, send an ipi to
wake it up to continue the duty of broadcast. The new broadcast cpu needs to
find out if it woke up to resume broadcast. If so it needs to restart the
broadcast hrtimer on itself.

It's possible that the old broadcast cpu was hotplugged out when the broadcast
hrtimer was about to fire on it. Therefore the newly nominated broadcast cpu
should set the broadcast hrtimer on itself to expire immediately so as to not
miss wakeups under such scenarios.

Signed-off-by: Preeti U Murthy 
---

 arch/powerpc/include/asm/time.h |1 +
 arch/powerpc/kernel/time.c  |1 +
 drivers/cpuidle/cpuidle-ibm-power.c |   22 ++
 3 files changed, 24 insertions(+)

diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h
index 38341fa..3bc0205 100644
--- a/arch/powerpc/include/asm/time.h
+++ b/arch/powerpc/include/asm/time.h
@@ -31,6 +31,7 @@ struct rtc_time;
 extern void to_tm(int tim, struct rtc_time * tm);
 extern void GregorianDay(struct rtc_time *tm);
 extern void decrementer_timer_interrupt(void);
+extern void broadcast_irq_entry(void);
 
 extern void generic_calibrate_decr(void);
 
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 44a76de..0ac2e11 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -853,6 +853,7 @@ void decrementer_timer_interrupt(void)
 {
u64 *next_tb = &__get_cpu_var(decrementers_next_tb);
 
+   broadcast_irq_entry();
*next_tb = get_tb_or_rtc();
__timer_interrupt();
 }
diff --git a/drivers/cpuidle/cpuidle-ibm-power.c 
b/drivers/cpuidle/cpuidle-ibm-power.c
index ae47a0a..580ea04 100644
--- a/drivers/cpuidle/cpuidle-ibm-power.c
+++ b/drivers/cpuidle/cpuidle-ibm-power.c
@@ -282,6 +282,12 @@ static int longnap_loop(struct cpuidle_device *dev,
return index;
 }
 
+void broadcast_irq_entry(void)
+{
+   if (smp_processor_id() == bc_cpu)
+   hrtimer_start(bc_hrtimer, ns_to_ktime(0), HRTIMER_MODE_REL_PINNED);
+}
+
 /*
  * States for dedicated partition case.
  */
@@ -360,6 +366,7 @@ static int power_cpuidle_add_cpu_notifier(struct 
notifier_block *n,
unsigned long action, void *hcpu)
 {
int hotcpu = (unsigned long)hcpu;
+   unsigned long flags;
struct cpuidle_device *dev =
per_cpu(cpuidle_devices, hotcpu);
 
@@ -372,6 +379,21 @@ static int power_cpuidle_add_cpu_notifier(struct 
notifier_block *n,
cpuidle_resume_and_unlock();
break;
 
+   case CPU_DYING:
+   case CPU_DYING_FROZEN:
+   spin_lock_irqsave(&longnap_idle_lock, flags);
+   if (hotcpu == bc_cpu) {
+   bc_cpu = -1;
+   hrtimer_cancel(bc_hrtimer);
+   if (!cpumask_empty(tick_get_broadcast_oneshot_mask())) {
+   bc_cpu = cpumask_first(tick_get_broadcast_oneshot_mask());
+   arch_send_tick_broadcast(cpumask_of(bc_cpu));
+   }
+   }
+   spin_unlock_irqrestore(&longnap_idle_lock, flags);
+   break;
+
case CPU_DEAD:
case CPU_DEAD_FROZEN:
cpuidle_pause_and_lock();



[PATCH V3 4/6] cpuidle/ppc: Add basic infrastructure to support the broadcast framework on ppc

2013-09-10 Thread Preeti U Murthy
The broadcast framework in the kernel expects an external clock device which
will continue functioning in deep idle states also. This ability is specified
by the "non-existence" of the feature C3STOP. This is the device that it relies
upon to wake up cpus in deep idle states whose local timers/clock devices get
switched off.

On ppc we do not have such an external device. Therefore we introduce a
pseudo clock device, which has the features of this external clock device
called the broadcast_clockevent. Having such a device qualifies the cpus to
enter and exit deep idle states from the point of view of the broadcast
framework, because there is an external device to wake them up.
Specifically the broadcast framework uses this device's event
handler and next_event members in its functioning. On ppc we use this
device as the gateway into the broadcast framework and *not* as a
timer. An explicit timer infrastructure will be developed in the following
patches to keep track of when to wake up cpus in deep idle.

Since this device is a pseudo device, it can be safely assumed to work for
all cpus. Therefore its cpumask is set to cpu_possible_mask. Also due to the
same reason, the set_next_event() routine associated with this device is a
nop.

The broadcast framework relies on a broadcast functionality being made
available in the .broadcast member of the local clock devices on all cpus.
This function is called upon by the broadcast framework on one of the nominated
cpus, to send ipis to all the cpus in deep idle at their expired timer events.
This patch also initializes the .broadcast member of the decrementer whose
job is to send the broadcast ipis.

When cpus inform the broadcast framework that they are entering deep idle,
their local timers are put in shutdown mode. On ppc, this means setting
decrementers_next_tb and programming the decrementer to DECREMENTER_MAX.
On being woken up by the broadcast ipi, these cpus call __timer_interrupt(),
which runs the local timers only if decrementers_next_tb has expired.
  Therefore on being woken up from the broadcast ipi, set decrementers_next_tb
to now before calling __timer_interrupt().
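
For illustration, a sketch of what such a pseudo broadcast clock event device
looks like (the actual definition in the patch body below is truncated in this
archive, so take the exact field values here as indicative only; the cpumask is
set to cpu_possible_mask at registration time as explained above):

/* Pseudo device: there is no hardware to program, so this is a nop. */
static int broadcast_set_next_event(unsigned long evt,
				    struct clock_event_device *dev)
{
	return 0;
}

struct clock_event_device broadcast_clockevent = {
	.name		= "broadcast",
	.rating		= 200,
	.irq		= 0,
	.set_next_event	= broadcast_set_next_event,
	.set_mode	= broadcast_set_mode,
	.features	= CLOCK_EVT_FEAT_ONESHOT,
};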

Signed-off-by: Preeti U Murthy 
---

 arch/powerpc/Kconfig|1 +
 arch/powerpc/include/asm/time.h |1 +
 arch/powerpc/kernel/time.c  |   69 ++-
 3 files changed, 70 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index dbd9d3c..550fc04 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -130,6 +130,7 @@ config PPC
select GENERIC_CMOS_UPDATE
select GENERIC_TIME_VSYSCALL_OLD
select GENERIC_CLOCKEVENTS
+   select GENERIC_CLOCKEVENTS_BROADCAST
select GENERIC_STRNCPY_FROM_USER
select GENERIC_STRNLEN_USER
select HAVE_MOD_ARCH_SPECIFIC
diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h
index 4e35282..264dc96 100644
--- a/arch/powerpc/include/asm/time.h
+++ b/arch/powerpc/include/asm/time.h
@@ -24,6 +24,7 @@ extern unsigned long tb_ticks_per_jiffy;
 extern unsigned long tb_ticks_per_usec;
 extern unsigned long tb_ticks_per_sec;
 extern struct clock_event_device decrementer_clockevent;
+extern struct clock_event_device broadcast_clockevent;
 
 struct rtc_time;
 extern void to_tm(int tim, struct rtc_time * tm);
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index eb48291..bda78bb 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -42,6 +42,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -97,8 +98,13 @@ static struct clocksource clocksource_timebase = {
 
 static int decrementer_set_next_event(unsigned long evt,
  struct clock_event_device *dev);
+static int broadcast_set_next_event(unsigned long evt,
+ struct clock_event_device *dev);
+static void broadcast_set_mode(enum clock_event_mode mode,
+struct clock_event_device *dev);
 static void decrementer_set_mode(enum clock_event_mode mode,
 struct clock_event_device *dev);
+static void decrementer_timer_broadcast(const struct cpumask *mask);
 
 struct clock_event_device decrementer_clockevent = {
.name   = "decrementer",
@@ -106,12 +112,24 @@ struct clock_event_device decrementer_clockevent = {
.irq= 0,
.set_next_event = decrementer_set_next_event,
.set_mode   = decrementer_set_mode,
-   .features   = CLOCK_EVT_FEAT_ONESHOT,
+   .broadcast  = decrementer_timer_broadcast,
+   .features   = CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_ONESHOT,
 };
 EXPORT_SYMBOL(decrementer_clockevent);
 
+struct clock_event_device broadcast_clockevent = {
+   .name   = "broadcast",
+   .rating

[PATCH V3 2/6] powerpc: Implement broadcast timer interrupt as an IPI message

2013-09-10 Thread Preeti U Murthy
From: Srivatsa S. Bhat 

For scalability and performance reasons, we want the broadcast IPIs
to be handled as efficiently as possible. Fixed IPI messages
are one of the most efficient mechanisms available - they are faster
than the smp_call_function mechanism because the IPI handlers are fixed
and hence they don't involve costly operations such as adding IPI handlers
to the target CPU's function queue, acquiring locks for synchronization etc.

Luckily we have an unused IPI message slot, so use that to implement
broadcast timer interrupts efficiently.

Signed-off-by: Srivatsa S. Bhat 
[Changelog modified by pre...@linux.vnet.ibm.com]
Signed-off-by: Preeti U Murthy 
---

 arch/powerpc/include/asm/smp.h  |3 ++-
 arch/powerpc/include/asm/time.h |1 +
 arch/powerpc/kernel/smp.c   |   19 +++
 arch/powerpc/kernel/time.c  |4 
 arch/powerpc/platforms/cell/interrupt.c |2 +-
 arch/powerpc/platforms/ps3/smp.c|2 +-
 scripts/kconfig/streamline_config.pl|0 
 7 files changed, 24 insertions(+), 7 deletions(-)
 mode change 100644 => 100755 scripts/kconfig/streamline_config.pl

diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
index a632b6e..22f6d63 100644
--- a/arch/powerpc/include/asm/smp.h
+++ b/arch/powerpc/include/asm/smp.h
@@ -117,7 +117,7 @@ extern int cpu_to_core_id(int cpu);
  *
  * Make sure this matches openpic_request_IPIs in open_pic.c, or what shows up
  * in /proc/interrupts will be wrong!!! --Troy */
-#define PPC_MSG_UNUSED 0
+#define PPC_MSG_TIMER  0
 #define PPC_MSG_RESCHEDULE  1
 #define PPC_MSG_CALL_FUNC_SINGLE   2
 #define PPC_MSG_DEBUGGER_BREAK  3
@@ -194,6 +194,7 @@ extern struct smp_ops_t *smp_ops;
 
 extern void arch_send_call_function_single_ipi(int cpu);
 extern void arch_send_call_function_ipi_mask(const struct cpumask *mask);
+extern void arch_send_tick_broadcast(const struct cpumask *mask);
 
 /* Definitions relative to the secondary CPU spin loop
  * and entry point. Not all of them exist on both 32 and
diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h
index c1f2676..4e35282 100644
--- a/arch/powerpc/include/asm/time.h
+++ b/arch/powerpc/include/asm/time.h
@@ -28,6 +28,7 @@ extern struct clock_event_device decrementer_clockevent;
 struct rtc_time;
 extern void to_tm(int tim, struct rtc_time * tm);
 extern void GregorianDay(struct rtc_time *tm);
+extern void decrementer_timer_interrupt(void);
 
 extern void generic_calibrate_decr(void);
 
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index bc41e9f..d3b7014 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -35,6 +35,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -111,9 +112,9 @@ int smp_generic_kick_cpu(int nr)
 }
 #endif /* CONFIG_PPC64 */
 
-static irqreturn_t unused_action(int irq, void *data)
+static irqreturn_t timer_action(int irq, void *data)
 {
-   /* This slot is unused and hence available for use, if needed */
+   decrementer_timer_interrupt();
return IRQ_HANDLED;
 }
 
@@ -144,14 +145,14 @@ static irqreturn_t debug_ipi_action(int irq, void *data)
 }
 
 static irq_handler_t smp_ipi_action[] = {
-   [PPC_MSG_UNUSED] =  unused_action, /* Slot available for future use */
+   [PPC_MSG_TIMER] =  timer_action,
[PPC_MSG_RESCHEDULE] = reschedule_action,
[PPC_MSG_CALL_FUNC_SINGLE] = call_function_single_action,
[PPC_MSG_DEBUGGER_BREAK] = debug_ipi_action,
 };
 
 const char *smp_ipi_name[] = {
-   [PPC_MSG_UNUSED] =  "ipi unused",
+   [PPC_MSG_TIMER] =  "ipi timer",
[PPC_MSG_RESCHEDULE] = "ipi reschedule",
[PPC_MSG_CALL_FUNC_SINGLE] = "ipi call function single",
[PPC_MSG_DEBUGGER_BREAK] = "ipi debugger",
@@ -221,6 +222,8 @@ irqreturn_t smp_ipi_demux(void)
all = xchg(&info->messages, 0);
 
 #ifdef __BIG_ENDIAN
+   if (all & (1 << (24 - 8 * PPC_MSG_TIMER)))
+   decrementer_timer_interrupt();
if (all & (1 << (24 - 8 * PPC_MSG_RESCHEDULE)))
scheduler_ipi();
if (all & (1 << (24 - 8 * PPC_MSG_CALL_FUNC_SINGLE)))
@@ -266,6 +269,14 @@ void arch_send_call_function_ipi_mask(const struct cpumask 
*mask)
do_message_pass(cpu, PPC_MSG_CALL_FUNC_SINGLE);
 }
 
+void arch_send_tick_broadcast(const struct cpumask *mask)
+{
+   unsigned int cpu;
+
+   for_each_cpu(cpu, mask)
+   do_message_pass(cpu, PPC_MSG_TIMER);
+}
+
 #if defined(CONFIG_DEBUGGER) || defined(CONFIG_KEXEC)
 void smp_send_debugger_break(void)
 {
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 65ab9e9..0dfa0c5 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/pow

[PATCH V3 3/6] cpuidle/ppc: Split timer_interrupt() into timer handling and interrupt handling routines

2013-09-10 Thread Preeti U Murthy
On PowerPC, when CPUs enter deep idle states, their local timers get
switched off. The local timer is called the decrementer. An external clock
device needs to be programmed to wake them up at their next timer event.
On PowerPC, we do not have an external device equivalent to HPET,
which is currently used on architectures like x86 under the same scenario.
Instead we assign the local timer of one of the CPUs to do this job.

On expiry of this timer, the broadcast framework today has the infrastructure
to send ipis to all such CPUs whose local timers have expired.

When such an ipi is received, the cpus in deep idle should handle their
expired timers. It should be as though they were woken up from a
timer interrupt itself. Hence this external ipi serves as an emulated timer
interrupt for the cpus in deep idle.

Therefore ideally on ppc, these cpus should call timer_interrupt() which
is the interrupt handler for a decrementer interrupt. But timer_interrupt()
also contains routines which are usually performed in an interrupt handler.
These are not required to be done in this scenario as the external interrupt
handler takes care of them.

Therefore split up timer_interrupt() into routines performed during regular
interrupt handling and __timer_interrupt(), which takes care of running local
timers and collecting time related stats. Now on a broadcast ipi, call
__timer_interrupt().

Signed-off-by: Preeti U Murthy 
---

 arch/powerpc/kernel/time.c |   69 
 1 file changed, 37 insertions(+), 32 deletions(-)

diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 0dfa0c5..eb48291 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -478,6 +478,42 @@ void arch_irq_work_raise(void)
 
 #endif /* CONFIG_IRQ_WORK */
 
+static void __timer_interrupt(void)
+{
+   struct pt_regs *regs = get_irq_regs();
+   u64 *next_tb = &__get_cpu_var(decrementers_next_tb);
+   struct clock_event_device *evt = &__get_cpu_var(decrementers);
+   u64 now;
+
+   __get_cpu_var(irq_stat).timer_irqs++;
+   trace_timer_interrupt_entry(regs);
+
+   if (test_irq_work_pending()) {
+   clear_irq_work_pending();
+   irq_work_run();
+   }
+
+   now = get_tb_or_rtc();
+   if (now >= *next_tb) {
+   *next_tb = ~(u64)0;
+   if (evt->event_handler)
+   evt->event_handler(evt);
+   } else {
+   now = *next_tb - now;
+   if (now <= DECREMENTER_MAX)
+   set_dec((int)now);
+   }
+
+#ifdef CONFIG_PPC64
+   /* collect purr register values often, for accurate calculations */
+   if (firmware_has_feature(FW_FEATURE_SPLPAR)) {
+   struct cpu_usage *cu = &__get_cpu_var(cpu_usage_array);
+   cu->current_tb = mfspr(SPRN_PURR);
+   }
+#endif
+   trace_timer_interrupt_exit(regs);
+}
+
 /*
  * timer_interrupt - gets called when the decrementer overflows,
  * with interrupts disabled.
@@ -486,8 +522,6 @@ void timer_interrupt(struct pt_regs * regs)
 {
struct pt_regs *old_regs;
u64 *next_tb = &__get_cpu_var(decrementers_next_tb);
-   struct clock_event_device *evt = &__get_cpu_var(decrementers);
-   u64 now;
 
/* Ensure a positive value is written to the decrementer, or else
 * some CPUs will continue to take decrementer exceptions.
@@ -510,8 +544,6 @@ void timer_interrupt(struct pt_regs * regs)
 */
may_hard_irq_enable();
 
-   __get_cpu_var(irq_stat).timer_irqs++;
-
 #if defined(CONFIG_PPC32) && defined(CONFIG_PMAC)
if (atomic_read(&ppc_n_lost_interrupts) != 0)
do_IRQ(regs);
@@ -520,34 +552,7 @@ void timer_interrupt(struct pt_regs * regs)
old_regs = set_irq_regs(regs);
irq_enter();
 
-   trace_timer_interrupt_entry(regs);
-
-   if (test_irq_work_pending()) {
-   clear_irq_work_pending();
-   irq_work_run();
-   }
-
-   now = get_tb_or_rtc();
-   if (now >= *next_tb) {
-   *next_tb = ~(u64)0;
-   if (evt->event_handler)
-   evt->event_handler(evt);
-   } else {
-   now = *next_tb - now;
-   if (now <= DECREMENTER_MAX)
-   set_dec((int)now);
-   }
-
-#ifdef CONFIG_PPC64
-   /* collect purr register values often, for accurate calculations */
-   if (firmware_has_feature(FW_FEATURE_SPLPAR)) {
-   struct cpu_usage *cu = &__get_cpu_var(cpu_usage_array);
-   cu->current_tb = mfspr(SPRN_PURR);
-   }
-#endif
-
-   trace_timer_interrupt_exit(regs);
-
+   __timer_interrupt();
irq_exit();
set_irq_regs(old_regs);
 }



[PATCH V3 0/6] cpuidle/ppc: Enable broadcast support for deep idle states

2013-09-10 Thread Preeti U Murthy
On PowerPC, when CPUs enter deep idle states, their local timers get
switched off. An external clock device needs to be programmed to wake them
up at their next timer event.
On PowerPC, we do not have an external device equivalent to HPET,
which is currently used on architectures like x86 under the same scenario.
Instead we assign the local timer of one of the CPUs to do this job.

This patchset is an attempt to hook onto the existing timer broadcast
framework in the kernel by using the local timer of one of the CPUs to do the
job of the external clock device.

On expiry of this device, the broadcast framework today has the infrastructure
to send ipis to all such CPUs whose local timers have expired. Hence the term
"broadcast" and the ipi sent is called the broadcast ipi.

This patch series is ported ontop of 3.11-rc7 + the cpuidle driver backend
for power posted by Deepthi Dharwar recently.
http://comments.gmane.org/gmane.linux.ports.ppc.embedded/63556

Changes in V3:

1. Fix the way in which a broadcast ipi is handled on the idling cpus. Timer
handling on a broadcast ipi is being done now without missing out any timer
stats generation.

2. Fix a bug in the programming of the hrtimer meant to do broadcast. Program
it to trigger at the earlier of a "broadcast period", and the next wakeup
event. By introducing the "broadcast period" as the maximum period after
which the broadcast hrtimer can fire, we ensure that we do not miss
wakeups in corner cases.

3. On hotplug of a broadcast cpu, trigger the hrtimer meant to do broadcast
to fire immediately on the new broadcast cpu. This will ensure we do not miss
doing a broadcast pending in the nearest future.

4. Change the type of allocation from GFP_KERNEL to GFP_NOWAIT while
initializing bc_hrtimer since we are in an atomic context and cannot sleep.

5. Use the broadcast ipi to wakeup the newly nominated broadcast cpu on
hotplug of the old instead of smp_call_function_single(). This is because we
are interrupt disabled at this point and should not be using
smp_call_function_single or its children in this context to send an ipi.

6. Move GENERIC_CLOCKEVENTS_BROADCAST to arch/powerpc/Kconfig.

7. Fix coding style issues.

Changes in V2: https://lkml.org/lkml/2013/8/14/239

1. Dynamically pick a broadcast CPU, instead of having a dedicated one.
2. Remove the constraint of having to disable tickless idle on the broadcast
CPU by queueing a hrtimer dedicated to do broadcast.

V1 posting: https://lkml.org/lkml/2013/7/25/740.

The patchset has been tested for stability in idle and during multi threaded
ebizzy runs.

Many thanks to Ben H, Frederic Weisbecker, Li Yang, Srivatsa S. Bhat and
Vaidyanathan Srinivasan for all their comments and suggestions so far.

---

Preeti U Murthy (4):
  cpuidle/ppc: Split timer_interrupt() into timer handling and interrupt 
handling routines
  cpuidle/ppc: Add basic infrastructure to support the broadcast framework 
on ppc
  cpuidle/ppc: Introduce the deep idle state in which the local timers stop
  cpuidle/ppc: Nominate new broadcast cpu on hotplug of the old

Srivatsa S. Bhat (2):
  powerpc: Free up the IPI message slot of ipi call function 
(PPC_MSG_CALL_FUNC)
  powerpc: Implement broadcast timer interrupt as an IPI message


 arch/powerpc/Kconfig|1 
 arch/powerpc/include/asm/smp.h  |3 -
 arch/powerpc/include/asm/time.h |4 +
 arch/powerpc/kernel/smp.c   |   23 +++-
 arch/powerpc/kernel/time.c  |  143 --
 arch/powerpc/platforms/cell/interrupt.c |2 
 arch/powerpc/platforms/ps3/smp.c|2 
 drivers/cpuidle/cpuidle-ibm-power.c |  172 +++
 scripts/kconfig/streamline_config.pl|0 
 9 files changed, 307 insertions(+), 43 deletions(-)
 mode change 100644 => 100755 scripts/kconfig/streamline_config.pl



[PATCH V3 1/6] powerpc: Free up the IPI message slot of ipi call function (PPC_MSG_CALL_FUNC)

2013-09-10 Thread Preeti U Murthy
From: Srivatsa S. Bhat 

The IPI handlers for both PPC_MSG_CALL_FUNC and PPC_MSG_CALL_FUNC_SINGLE map to
a common implementation - generic_smp_call_function_single_interrupt(). So, we
can consolidate them and save one of the IPI message slots, (which are precious,
since only 4 of those slots are available).

So, implement the functionality of PPC_MSG_CALL_FUNC using
PPC_MSG_CALL_FUNC_SINGLE itself and release its IPI message slot, so that it
can be used for something else in the future, if desired.

Signed-off-by: Srivatsa S. Bhat 
Signed-off-by: Preeti U Murthy 
---

 arch/powerpc/include/asm/smp.h  |2 +-
 arch/powerpc/kernel/smp.c   |   12 +---
 arch/powerpc/platforms/cell/interrupt.c |2 +-
 arch/powerpc/platforms/ps3/smp.c|2 +-
 4 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
index 48cfc85..a632b6e 100644
--- a/arch/powerpc/include/asm/smp.h
+++ b/arch/powerpc/include/asm/smp.h
@@ -117,7 +117,7 @@ extern int cpu_to_core_id(int cpu);
  *
  * Make sure this matches openpic_request_IPIs in open_pic.c, or what shows up
  * in /proc/interrupts will be wrong!!! --Troy */
-#define PPC_MSG_CALL_FUNCTION   0
+#define PPC_MSG_UNUSED 0
 #define PPC_MSG_RESCHEDULE  1
 #define PPC_MSG_CALL_FUNC_SINGLE   2
 #define PPC_MSG_DEBUGGER_BREAK  3
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 38b0ba6..bc41e9f 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -111,9 +111,9 @@ int smp_generic_kick_cpu(int nr)
 }
 #endif /* CONFIG_PPC64 */
 
-static irqreturn_t call_function_action(int irq, void *data)
+static irqreturn_t unused_action(int irq, void *data)
 {
-   generic_smp_call_function_interrupt();
+   /* This slot is unused and hence available for use, if needed */
return IRQ_HANDLED;
 }
 
@@ -144,14 +144,14 @@ static irqreturn_t debug_ipi_action(int irq, void *data)
 }
 
 static irq_handler_t smp_ipi_action[] = {
-   [PPC_MSG_CALL_FUNCTION] =  call_function_action,
+   [PPC_MSG_UNUSED] =  unused_action, /* Slot available for future use */
[PPC_MSG_RESCHEDULE] = reschedule_action,
[PPC_MSG_CALL_FUNC_SINGLE] = call_function_single_action,
[PPC_MSG_DEBUGGER_BREAK] = debug_ipi_action,
 };
 
 const char *smp_ipi_name[] = {
-   [PPC_MSG_CALL_FUNCTION] =  "ipi call function",
+   [PPC_MSG_UNUSED] =  "ipi unused",
[PPC_MSG_RESCHEDULE] = "ipi reschedule",
[PPC_MSG_CALL_FUNC_SINGLE] = "ipi call function single",
[PPC_MSG_DEBUGGER_BREAK] = "ipi debugger",
@@ -221,8 +221,6 @@ irqreturn_t smp_ipi_demux(void)
all = xchg(&info->messages, 0);
 
 #ifdef __BIG_ENDIAN
-   if (all & (1 << (24 - 8 * PPC_MSG_CALL_FUNCTION)))
-   generic_smp_call_function_interrupt();
if (all & (1 << (24 - 8 * PPC_MSG_RESCHEDULE)))
scheduler_ipi();
if (all & (1 << (24 - 8 * PPC_MSG_CALL_FUNC_SINGLE)))
@@ -265,7 +263,7 @@ void arch_send_call_function_ipi_mask(const struct cpumask 
*mask)
unsigned int cpu;
 
for_each_cpu(cpu, mask)
-   do_message_pass(cpu, PPC_MSG_CALL_FUNCTION);
+   do_message_pass(cpu, PPC_MSG_CALL_FUNC_SINGLE);
 }
 
 #if defined(CONFIG_DEBUGGER) || defined(CONFIG_KEXEC)
diff --git a/arch/powerpc/platforms/cell/interrupt.c 
b/arch/powerpc/platforms/cell/interrupt.c
index 2d42f3b..28166e4 100644
--- a/arch/powerpc/platforms/cell/interrupt.c
+++ b/arch/powerpc/platforms/cell/interrupt.c
@@ -213,7 +213,7 @@ static void iic_request_ipi(int msg)
 
 void iic_request_IPIs(void)
 {
-   iic_request_ipi(PPC_MSG_CALL_FUNCTION);
+   iic_request_ipi(PPC_MSG_UNUSED);
iic_request_ipi(PPC_MSG_RESCHEDULE);
iic_request_ipi(PPC_MSG_CALL_FUNC_SINGLE);
iic_request_ipi(PPC_MSG_DEBUGGER_BREAK);
diff --git a/arch/powerpc/platforms/ps3/smp.c b/arch/powerpc/platforms/ps3/smp.c
index 4b35166..488f069 100644
--- a/arch/powerpc/platforms/ps3/smp.c
+++ b/arch/powerpc/platforms/ps3/smp.c
@@ -74,7 +74,7 @@ static int __init ps3_smp_probe(void)
* to index needs to be setup.
*/
 
-   BUILD_BUG_ON(PPC_MSG_CALL_FUNCTION!= 0);
+   BUILD_BUG_ON(PPC_MSG_UNUSED   != 0);
BUILD_BUG_ON(PPC_MSG_RESCHEDULE   != 1);
BUILD_BUG_ON(PPC_MSG_CALL_FUNC_SINGLE != 2);
BUILD_BUG_ON(PPC_MSG_DEBUGGER_BREAK   != 3);



Re: [RFC V2 PATCH 4/6] cpuidle/ppc: Add longnap state to the idle states on powernv

2013-08-22 Thread Preeti U Murthy
Hi Ben,

On 08/22/2013 08:58 AM, Benjamin Herrenschmidt wrote:
> On Wed, 2013-08-14 at 17:26 +0530, Preeti U Murthy wrote:
>> This patch hooks into the existing broadcast framework along with the support
>> that this patchset introduces for ppc, and the cpuidle driver backend
>> for powernv(posted out by Deepthi 
>> Dharwar:https://lkml.org/lkml/2013/7/23/128)
>> to add sleep state as one of the deep idle states, in which the decrementer
>> is switched off.
>>
>> However in this patch, we only emulate sleep by going into a state which does
>> a nap with the decrementer interrupts disabled, termed as longnap. This 
>> enables
>> focus on the timer broadcast framework for ppc in this series of patches ,
>> which is required as a first step to enable sleep on ppc.
> 
> This is only for debug / proof of concept right ? We should use a real
> sleep here.
> 
> If we need to know whether the FW supports it (PORE etc...) we shall add
> a device-tree property from the FW to indicate that fact.

We also need the sleep support right? The context management, I mean.
Yes it is a debug patch, so as to first ensure that we have the hook up
to the broadcast framework done right.

Regards
Preeti U Murthy



Re: [RFC V2 PATCH 3/6] cpuidle/ppc: Add timer offload framework to support deep idle states

2013-08-22 Thread Preeti U Murthy
Hi Ben,

On 08/22/2013 08:57 AM, Benjamin Herrenschmidt wrote:
> On Wed, 2013-08-14 at 17:26 +0530, Preeti U Murthy wrote:
> 
>>  static irqreturn_t timer_action(int irq, void *data)
>>  {
>> -timer_interrupt();
>> +decrementer_timer_interrupt();
>>  return IRQ_HANDLED;
>>  }
> 
> I don't completely understand what you are doing here, but ...
> 
>> @@ -223,7 +223,7 @@ irqreturn_t smp_ipi_demux(void)
>>  
>>  #ifdef __BIG_ENDIAN
>>  if (all & (1 << (24 - 8 * PPC_MSG_TIMER)))
>> -timer_interrupt();
>> +decrementer_timer_interrupt();
> 
> Why call this decrementer_* since it's specifically *not* the
> decrementer ?
>
> Makes more sense to be called broadcast_timer_interrupt() no ?

A broadcast IPI is meant to trigger timer interrupt handling on the
target CPUs. In deep idle states, even though the local timers of CPUs
become non-functional, it should not make a difference to them because
of the broadcast framework's help.

The broadcast framework is meant to hide the fact that the CPUs in deep
idle states require external help to wakeup on their expired timer
events. So the CPUs should wake up to find themselves handling the
timers under such a scenario, although they woke up from an IPI.

This whole idea gets conveyed by naming the handler of the broadcast IPI
to decrementer_timer_interrupt().

That said, ideally it should have been called timer_interrupt(). But
since we already have the timer interrupt handler with the same name,
and also we cannot call it directly for reasons mentioned in the reply
to your review on PATCH 2/6, I named it decrementer_timer_interrupt() to
come close to conveying the idea. This calls only the timer interrupt
handler there.

> 
>>  if (all & (1 << (24 - 8 * PPC_MSG_RESCHEDULE)))
>>  scheduler_ipi();
>>  if (all & (1 << (24 - 8 * PPC_MSG_CALL_FUNC_SINGLE)))
>> diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
>> index 65ab9e9..7e858e1 100644
>> --- a/arch/powerpc/kernel/time.c
>> +++ b/arch/powerpc/kernel/time.c
>> @@ -42,6 +42,7 @@
>>  #include 
>>  #include 
>>  #include 
>> +#include 
>>  #include 
>>  #include 
>>  #include 
>> @@ -97,8 +98,11 @@ static struct clocksource clocksource_timebase = {
>>  
>>  static int decrementer_set_next_event(unsigned long evt,
>>struct clock_event_device *dev);
>> +static int broadcast_set_next_event(unsigned long evt,
>> +  struct clock_event_device *dev);
>>  static void decrementer_set_mode(enum clock_event_mode mode,
>>   struct clock_event_device *dev);
>> +static void decrementer_timer_broadcast(const struct cpumask *mask);
>>  
>>  struct clock_event_device decrementer_clockevent = {
>>  .name   = "decrementer",
>> @@ -106,13 +110,26 @@ struct clock_event_device decrementer_clockevent = {
>>  .irq= 0,
>>  .set_next_event = decrementer_set_next_event,
>>  .set_mode   = decrementer_set_mode,
>> -.features   = CLOCK_EVT_FEAT_ONESHOT,
>> +.broadcast  = decrementer_timer_broadcast,
>> +.features   = CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_ONESHOT,
>>  };
>>  EXPORT_SYMBOL(decrementer_clockevent);
>>  
>> +struct clock_event_device broadcast_clockevent = {
>> +.name   = "broadcast",
>> +.rating = 200,
>> +.irq= 0,
>> +.set_next_event = broadcast_set_next_event,
>> +.set_mode   = decrementer_set_mode,
> 
> Same here, why "decrementer" ? This event device is *not* the
> decrementer right ?

You are right. In this case it should have been broadcast_set_mode. This
is because this function is associated with the sender (the broadcast cpu).

> 
> Also, pardon my ignorance, by why do we need a separate
> clock_event_device ? Ie what does that do ? I am not familiar with the
> broadcast scheme and what .broadcast do in the "decrementer" one, so
> you need to provide me at least with better explanations.

The short answer to why we need an additional clock event device is that,
in this patchset, we try to integrate support for deep sleep states on
power with the broadcast framework as it exists in the kernel today,
without any changes to that framework, adapting to it as best we can. Let
me now elaborate.

The broadcast framework kicks in if there is a broadcast clock event
device. We can double up the local timer/decrementer of a

Re: [RFC V2 PATCH 2/6] powerpc: Implement broadcast timer interrupt as an IPI message

2013-08-21 Thread Preeti U Murthy
Hi Ben

On 08/22/2013 08:40 AM, Benjamin Herrenschmidt wrote:
> On Wed, 2013-08-14 at 17:26 +0530, Preeti U Murthy wrote:
>> -static irqreturn_t unused_action(int irq, void *data)
>> +static irqreturn_t timer_action(int irq, void *data)
>>  {
>> -   /* This slot is unused and hence available for use, if needed
>> */
>> +   timer_interrupt();
>> return IRQ_HANDLED;
>>  }
>>  
> 
> That means we'll do irq_enter/irq_exit twice no ? And things like
> may_hard_irq_enable() are also already done by do_IRQ so you
> don't need timer_interrupt() to do it again.
> 
> We probably are better off breaking timer_interrupt in two:
> 
> void __timer_interrupt(struct pt_regs * regs)
> 
> Does the current stuff between irq_enter and irq_exit, timer_interrupt
> does the remaining around it and calls __timer_interrupt.
> 
> Then from timer_action, you call __timer_interrupt()

We actually tried out this approach. The implementation was to have a
set_dec(0) in timer_action(). This would ensure that we actually do
get a timer interrupt.

But the problem with either this approach or the one that you
suggest,i.e. calling __timer_interrupt is in the following flow.

do_IRQ() -> irq_exit() -> tick_irq_exit() -> tick_nohz_irq_exit()
-> tick_nohz_stop_sched_tick()

The problem lies in the function tick_nohz_stop_sched_tick(). This
function checks for the next timer interrupt pending on this cpu, and
programs the decrementer's next event to the time of that interrupt, which
is of course > now.

As a result, when in timer_action() above we call
__timer_interrupt() or try to trigger a timer interrupt through
set_dec(0), the condition if (now >= *next_tb) in timer_interrupt() or
__timer_interrupt() will fail, and the timer interrupt event handler
will not be called.

---> if (now >= *next_tb) {
 *next_tb = ~(u64)0;
 if (evt->event_handler)
evt->event_handler(evt);
  } else {

The broadcast IPI is meant to make the target CPU of this IPI believe
that it woke up from a timer interrupt, and not from an IPI. (The reason
for this I will explain in the reply to the next patch.) The target CPU
should then ideally do what it would have done had it received a real
timer interrupt: call the timer interrupt event handler.

But due to the above code flow this does not happen.
Hence, as the next patch PATCH 3/6 shows, we simply call the event
handler of a timer interrupt without this explicit now >= *next_tb check.
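
For reference, the IPI-side handler then reduces to something like this (it
mirrors the decrementer_timer_interrupt()/__timer_interrupt() split that the
V3 posting of this series adopts; treat it as a sketch rather than the final
code):

/* Run on a cpu woken from deep idle by the broadcast IPI. */
void decrementer_timer_interrupt(void)
{
	u64 *next_tb = &__get_cpu_var(decrementers_next_tb);

	/* We know this wakeup is for an expired timer, so make the
	 * now >= *next_tb check pass and run the timer event handler. */
	*next_tb = get_tb_or_rtc();
	__timer_interrupt();
}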

This problem arises only in the implementation of this patchset, because
a timer interrupt is pseudo triggered from an IPI. So the effects of the
IPI handler will be felt on the timer interrupt handler triggered from
this IPI, like above.

Regards
Preeti U Murthy



[RFC V2 PATCH 6/6] cpuidle/ppc : Queue a hrtimer on bc_cpu, explicitly to do broadcast handling

2013-08-14 Thread Preeti U Murthy
In the current design we were depending on the timer interrupt on the
bc_cpu to trigger broadcast handling. In tickless idle, timer interrupts
could be many ticks away which could result in missed wakeups on CPUs in deep
idle states. Disabling tickless idle on the bc_cpu is not good for
powersavings.

Therefore queue an hrtimer specifically for broadcast handling. When the
broadcast CPU is chosen, it schedules this hrtimer to fire after a jiffy.
This is meant to initiate broadcast handling. For each expiration of
this hrtimer thereon, it is reprogrammed to fire at the time the next broadcast
handling has to be done. But if there is no pending broadcast handling to be
done in the future, the broadcast cpu is invalidated and the hrtimer is
cancelled. The above cycle repeats when the next CPU attempts to enter the
sleep state.

Of course the time at which the hrtimer fires initially could be scheduled at
time=target_residency of the deep idle state instead of a jiffy, since CPUs
going into deep idle states will not have their next wakeup event before the
target_residency time of the deep idle state.
  But this patchset is based on longnap, which is used to mimic sleep
but is actually nap with decrementer interrupts disabled. Therefore its
target_residency is the same as nap's. The CPUs going into longnap will
probably need to be woken up sooner than they would have been, had they gone
into sleep. Hence the initial scheduling of the hrtimer is held at a jiffy as
of now.

There is one other significant point. On CPU hotplug, the hrtimer on the
broadcast CPU is cancelled, the bc_cpu entry is invalidated, a new
broadcast cpu is chosen as before, and an IPI is sent to it. However instead
of using a broadcast IPI to wake it up, use smp_call_function_single(),
because apart from just wakeup, the new broadcast CPU has to restart
the hrtimer on itself so as to continue broadcast handling.
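
As an illustration of the cycle described above, the broadcast hrtimer handler
can look roughly like this (bc_timer, bc_cpu and get_next_bc_tick() are the
names used in this series; the body is a simplification, not the exact patch
code):

static enum hrtimer_restart handle_broadcast(struct hrtimer *hrtimer)
{
	struct clock_event_device *bc_evt = &bc_timer;

	/* Let the broadcast framework IPI every cpu whose timer expired. */
	if (bc_evt->event_handler)
		bc_evt->event_handler(bc_evt);

	/* Nobody left in deep idle: give up the broadcast duty. */
	if (cpumask_empty(tick_get_broadcast_oneshot_mask())) {
		bc_cpu = -1;
		return HRTIMER_NORESTART;
	}

	/* Re-arm for the next broadcast period returned by
	 * get_next_bc_tick(). */
	hrtimer_forward_now(hrtimer, get_next_bc_tick());
	return HRTIMER_RESTART;
}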

Signed-off-by: Preeti U Murthy
---

 arch/powerpc/include/asm/time.h |5 ++
 arch/powerpc/kernel/time.c  |   47 ---
 arch/powerpc/platforms/powernv/processor_idle.c |   38 ++-
 3 files changed, 73 insertions(+), 17 deletions(-)

diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h
index 92260c9..b9a60eb 100644
--- a/arch/powerpc/include/asm/time.h
+++ b/arch/powerpc/include/asm/time.h
@@ -16,6 +16,7 @@
 #ifdef __KERNEL__
 #include 
 #include 
+#include 
 
 #include 
 
@@ -26,6 +27,7 @@ extern unsigned long tb_ticks_per_sec;
 extern struct clock_event_device decrementer_clockevent;
 extern struct clock_event_device broadcast_clockevent;
 extern struct clock_event_device bc_timer;
+extern struct hrtimer *bc_hrtimer;
 
 struct rtc_time;
 extern void to_tm(int tim, struct rtc_time * tm);
@@ -35,7 +37,10 @@ extern void decrementer_timer_interrupt(void);
 extern void generic_calibrate_decr(void);
 
 extern void set_dec_cpu6(unsigned int val);
+extern ktime_t get_next_bc_tick(void);
+extern enum hrtimer_restart handle_broadcast(struct hrtimer *hrtimer);
 extern int bc_cpu;
+extern int bc_hrtimer_initialized;
 
 /* Some sane defaults: 125 MHz timebase, 1GHz processor */
 extern unsigned long ppc_proc_freq;
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index a19c8ca..1a64d58 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -43,6 +43,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 #include 
 #include 
 #include 
@@ -128,6 +130,8 @@ EXPORT_SYMBOL(broadcast_clockevent);
 DEFINE_PER_CPU(u64, decrementers_next_tb);
 static DEFINE_PER_CPU(struct clock_event_device, decrementers);
 struct clock_event_device bc_timer;
+struct hrtimer *bc_hrtimer;
+int bc_hrtimer_initialized = 0;
 
 int bc_cpu = -1;
 #define XSEC_PER_SEC (1024*1024)
@@ -504,8 +508,6 @@ void timer_interrupt(struct pt_regs * regs)
struct pt_regs *old_regs;
u64 *next_tb = &__get_cpu_var(decrementers_next_tb);
struct clock_event_device *evt = &__get_cpu_var(decrementers);
-   struct clock_event_device *bc_evt = &bc_timer;
-   int cpu = smp_processor_id();
u64 now;
 
/* Ensure a positive value is written to the decrementer, or else
@@ -551,10 +553,6 @@ void timer_interrupt(struct pt_regs * regs)
*next_tb = ~(u64)0;
if (evt->event_handler)
evt->event_handler(evt);
-   if (cpu == bc_cpu && bc_evt->event_handler) {
-   bc_evt->event_handler(bc_evt);
-   }
-
} else {
now = *next_tb - now;
if (now <= DECREMENTER_MAX)
@@ -864,6 +862,42 @@ static void decrementer_timer_broadcast(const struct 
cpumask *mask)
arch_send_tick_broadcast(mask);
 }
 
+ktime_t get_next_bc_tick(void)
+{
+   u64 next_bc_ns;
+
+   next_bc_ns = (tb_ticks_per_jiffy / tb_ticks_per_usec) * 1000;
+   return ns_to_ktime(next

[RFC V2 PATCH 5/6] cpuidle/ppc: Enable dynamic movement of the broadcast functionality across CPUs

2013-08-14 Thread Preeti U Murthy
In the current design of the timer offload framework for powerpc, there is a
dedicated broadcast CPU, which is the boot CPU. But this is not good because:

a. It disallows this CPU from being hotplugged out.

b. Overburdening this CPU with the broadcast duty can take
a toll on the performance, which could worsen if this CPU
is already too busy.

c. This could lead to thermal or power imbalance within the chip.

To overcome the above constraints, float the broadcast duty around the CPUs.
The current design proposes to choose the first CPU that attempts to go into a
deep idle state to be the broadcast CPU/bc_cpu. It is disallowed from entering
a deep idle state.

Let the broadcast CPU be invalidated when there are no more CPUs in
the broadcast mask. Until this point the rest of the CPUs attempting to enter
deep idle will be allowed to do so, to be woken up by the broadcast CPU.
Hence the set and unset of the bc_cpu variable is done only by the broadcast
CPU.

Protect the region of all the above activity with a lock in order to avoid
race conditions between readers and writers of the bc_cpu
entry and the broadcast cpus mask. One typical scenario could be:

CPUA                            CPUB

Reads bc_cpu, sees it exists    Is the bc_cpu, finds the broadcast mask
                                empty, and invalidates the bc_cpu.

Enters itself into
the broadcast mask.

Thus, CPUA would go into deep idle when broadcast handling is inactive.

The broadcast clockevent device is now one single pseudo device capable of
working for all possible cpus (instead of being per-cpu like it was before;
there is no point in having it per-cpu), due to the dynamic movement of the
broadcast CPU. This is a pseudo device and the dynamic movement of bc_cpu will
therefore not affect its functioning. The broadcast clockevent device's event
handler will be called by the bc_cpu in each of its timer interrupts.

This patchset adds hotplug notifiers to change the bc_cpu in case it goes
offline. In this case choose the first cpu in the broadcast mask to be the
next bc_cpu and send it an IPI to wake it up, so that it begins handling
broadcast events thereon. This IPI is the same as the broadcast IPI.
   The idea being that the intention of both these scenarios (hotplug and
actual broadcast wakeup) is to wake up a CPU in the broadcast mask, except
that they are for different reasons.

Signed-off-by: Preeti U Murthy 
---

 arch/powerpc/include/asm/time.h |1 
 arch/powerpc/kernel/time.c  |   10 ++--
 arch/powerpc/platforms/powernv/processor_idle.c |   56 +++
 3 files changed, 53 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h
index 936be0d..92260c9 100644
--- a/arch/powerpc/include/asm/time.h
+++ b/arch/powerpc/include/asm/time.h
@@ -25,6 +25,7 @@ extern unsigned long tb_ticks_per_usec;
 extern unsigned long tb_ticks_per_sec;
 extern struct clock_event_device decrementer_clockevent;
 extern struct clock_event_device broadcast_clockevent;
+extern struct clock_event_device bc_timer;
 
 struct rtc_time;
 extern void to_tm(int tim, struct rtc_time * tm);
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 7e858e1..a19c8ca 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -127,9 +127,9 @@ EXPORT_SYMBOL(broadcast_clockevent);
 
 DEFINE_PER_CPU(u64, decrementers_next_tb);
 static DEFINE_PER_CPU(struct clock_event_device, decrementers);
-static DEFINE_PER_CPU(struct clock_event_device, bc_timer);
+struct clock_event_device bc_timer;
 
-int bc_cpu;
+int bc_cpu = -1;
 #define XSEC_PER_SEC (1024*1024)
 
 #ifdef CONFIG_PPC64
@@ -504,7 +504,7 @@ void timer_interrupt(struct pt_regs * regs)
struct pt_regs *old_regs;
u64 *next_tb = &__get_cpu_var(decrementers_next_tb);
struct clock_event_device *evt = &__get_cpu_var(decrementers);
-   struct clock_event_device *bc_evt = &__get_cpu_var(bc_timer);
+   struct clock_event_device *bc_evt = &bc_timer;
int cpu = smp_processor_id();
u64 now;
 
@@ -879,10 +879,10 @@ static void register_decrementer_clockevent(int cpu)
 
 static void register_broadcast_clockevent(int cpu)
 {
-   struct clock_event_device *bc_evt = &per_cpu(bc_timer, cpu);
+   struct clock_event_device *bc_evt = &bc_timer;
 
*bc_evt = broadcast_clockevent;
-   bc_evt->cpumask = cpumask_of(cpu);
+   bc_evt->cpumask = cpu_possible_mask;
 
printk_once(KERN_DEBUG "clockevent: %s mult[%x] shift[%d] cpu[%d]\n",
bc_evt->name, bc_evt->mult, bc_evt->shift, cpu);
diff --git a/arch/powerpc/platforms/powernv/processor_idle.c 
b/arch/powerpc/platforms/powernv/processor_idle.c
index 9aca502..9554da6 100644
--- a/arch/powerpc/platforms/powernv/processor_idle.c
+++ b/arch/powerpc/platforms/powernv/processor_idle.c
@@ -10,6 +10,8 @@
 #include 
 

[RFC V2 PATCH 4/6] cpuidle/ppc: Add longnap state to the idle states on powernv

2013-08-14 Thread Preeti U Murthy
This patch hooks into the existing broadcast framework along with the support
that this patchset introduces for ppc, and the cpuidle driver backend
for powernv(posted out by Deepthi Dharwar:https://lkml.org/lkml/2013/7/23/128)
to add sleep state as one of the deep idle states, in which the decrementer
is switched off.

However in this patch, we only emulate sleep by going into a state which does
a nap with the decrementer interrupts disabled, termed as longnap. This enables
focus on the timer broadcast framework for ppc in this series of patches,
which is required as a first step to enable sleep on ppc.

Signed-off-by: Preeti U Murthy 
---

 arch/powerpc/platforms/powernv/processor_idle.c |   48 +++
 1 file changed, 47 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/powernv/processor_idle.c 
b/arch/powerpc/platforms/powernv/processor_idle.c
index f43ad91a..9aca502 100644
--- a/arch/powerpc/platforms/powernv/processor_idle.c
+++ b/arch/powerpc/platforms/powernv/processor_idle.c
@@ -9,16 +9,18 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
+#include 
 
 struct cpuidle_driver powernv_idle_driver = {
.name = "powernv_idle",
.owner =THIS_MODULE,
 };
 
-#define MAX_IDLE_STATE_COUNT   2
+#define MAX_IDLE_STATE_COUNT   3
 
 static int max_idle_state = MAX_IDLE_STATE_COUNT - 1;
 static struct cpuidle_device __percpu *powernv_cpuidle_devices;
@@ -54,6 +56,43 @@ static int nap_loop(struct cpuidle_device *dev,
return index;
 }
 
+/* Emulate sleep, with long nap.
+ * During sleep, the core does not receive decrementer interrupts.
+ * Emulate sleep using long nap with decrementers interrupts disabled.
+ * This is an initial prototype to test the timer offload framework for ppc.
+ * We will eventually introduce the sleep state once the timer offload
+ * framework for ppc is stable.
+ */
+static int longnap_loop(struct cpuidle_device *dev,
+   struct cpuidle_driver *drv,
+   int index)
+{
+   int cpu = dev->cpu;
+
+   unsigned long lpcr = mfspr(SPRN_LPCR);
+
+   lpcr &= ~(LPCR_MER | LPCR_PECE); /* lpcr[mer] must be 0 */
+
+   /* exit powersave upon external interrupt, but not decrementer
+* interrupt, Emulate sleep.
+*/
+   lpcr |= LPCR_PECE0;
+
+   if (cpu != bc_cpu) {
+   mtspr(SPRN_LPCR, lpcr);
+   clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);
+   power7_nap();
+   clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
+   } else {
+   /* Wakeup on a decrementer interrupt, Do a nap */
+   lpcr |= LPCR_PECE1;
+   mtspr(SPRN_LPCR, lpcr);
+   power7_nap();
+   }
+
+   return index;
+}
+
 /*
  * States for dedicated partition case.
  */
@@ -72,6 +111,13 @@ static struct cpuidle_state 
powernv_states[MAX_IDLE_STATE_COUNT] = {
.exit_latency = 10,
.target_residency = 100,
.enter = &nap_loop },
+{ /* LongNap */
+   .name = "LongNap",
+   .desc = "LongNap",
+   .flags = CPUIDLE_FLAG_TIME_VALID,
+   .exit_latency = 10,
+   .target_residency = 100,
+   .enter = &longnap_loop },
 };
 
 static int powernv_cpuidle_add_cpu_notifier(struct notifier_block *n,



[RFC V2 PATCH 3/6] cpuidle/ppc: Add timer offload framework to support deep idle states

2013-08-14 Thread Preeti U Murthy
On ppc, in deep idle states, the local clock event device of CPUs gets
switched off. On PowerPC, the local clock event device is called the
decrementer. Make use of the broadcast framework to issue interrupts to
cpus in deep idle states on their timer events, except that on ppc, we
do not have an external device such as HPET, but we use the decrementer
of one of the CPUs itself as the broadcast device.

Instantiate two different clock event devices, one representing the
decrementer and another representing the broadcast device for each cpu.
The cpu which registers its broadcast device will be responsible for
performing the function of issuing timer interrupts to CPUs in deep idle
states, and is referred to as the broadcast cpu/bc_cpu in the changelogs of this
patchset for convenience. Such a CPU is not allowed to enter deep idle
states, where the decrementer is switched off.

For now, only the boot cpu's broadcast device gets registered as a clock event
device along with the decrementer. Hence this is the broadcast cpu.

On the broadcast cpu, on each timer interrupt, the broadcast handler is
called in addition to the regular local timer event handler. We deliberately
avoid programming the decrementer specifically for each broadcast event, for
performance and scalability reasons. Say cpuX goes to a deep idle state. It
would have to ask the broadcast CPU to reprogram its (the broadcast CPU's)
decrementer for the next local timer event of cpuX, and it can do so only by
sending an IPI to the broadcast CPU. With many more cpus going into deep
idle, this model of sending IPIs each time would become a performance
bottleneck and may not scale well.

Apart from this there is no change in the way broadcast is handled today. On
a broadcast ipi the event handler for a timer interrupt is called on the cpu
in deep idle state to handle the local events.

The current design and implementation of the timer offload framework supports
the ONESHOT tick mode but not the PERIODIC mode.
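
As a rough illustration of the per-tick check described above (a simplified
sketch only, not the exact code in this series; bc_mask and the helper name
are made up here, and the generic tick-broadcast code does the real
bookkeeping):

	/*
	 * Conceptual sketch: runs from the broadcast cpu's timer interrupt.
	 * bc_mask holds the CPUs currently in deep idle; dev->next_event is
	 * the next timer event each of them is waiting for. The on-stack
	 * cpumask is for brevity only.
	 */
	static void broadcast_check_expired(void)
	{
		struct cpumask to_wake;
		ktime_t now = ktime_get();
		int cpu;

		cpumask_clear(&to_wake);
		for_each_cpu(cpu, &bc_mask) {
			struct clock_event_device *dev = &per_cpu(decrementers, cpu);

			if (dev->next_event.tv64 <= now.tv64)
				cpumask_set_cpu(cpu, &to_wake);
		}

		if (!cpumask_empty(&to_wake))
			arch_send_tick_broadcast(&to_wake);	/* PPC_MSG_TIMER IPI */
	}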

Signed-off-by: Preeti U. Murthy 
---

 arch/powerpc/include/asm/time.h|3 +
 arch/powerpc/kernel/smp.c  |4 +-
 arch/powerpc/kernel/time.c |   81 
 arch/powerpc/platforms/powernv/Kconfig |1 
 4 files changed, 86 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h
index c1f2676..936be0d 100644
--- a/arch/powerpc/include/asm/time.h
+++ b/arch/powerpc/include/asm/time.h
@@ -24,14 +24,17 @@ extern unsigned long tb_ticks_per_jiffy;
 extern unsigned long tb_ticks_per_usec;
 extern unsigned long tb_ticks_per_sec;
 extern struct clock_event_device decrementer_clockevent;
+extern struct clock_event_device broadcast_clockevent;
 
 struct rtc_time;
 extern void to_tm(int tim, struct rtc_time * tm);
 extern void GregorianDay(struct rtc_time *tm);
+extern void decrementer_timer_interrupt(void);
 
 extern void generic_calibrate_decr(void);
 
 extern void set_dec_cpu6(unsigned int val);
+extern int bc_cpu;
 
 /* Some sane defaults: 125 MHz timebase, 1GHz processor */
 extern unsigned long ppc_proc_freq;
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 6a68ca4..d3b7014 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -114,7 +114,7 @@ int smp_generic_kick_cpu(int nr)
 
 static irqreturn_t timer_action(int irq, void *data)
 {
-   timer_interrupt();
+   decrementer_timer_interrupt();
return IRQ_HANDLED;
 }
 
@@ -223,7 +223,7 @@ irqreturn_t smp_ipi_demux(void)
 
 #ifdef __BIG_ENDIAN
if (all & (1 << (24 - 8 * PPC_MSG_TIMER)))
-   timer_interrupt();
+   decrementer_timer_interrupt();
if (all & (1 << (24 - 8 * PPC_MSG_RESCHEDULE)))
scheduler_ipi();
if (all & (1 << (24 - 8 * PPC_MSG_CALL_FUNC_SINGLE)))
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 65ab9e9..7e858e1 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -42,6 +42,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -97,8 +98,11 @@ static struct clocksource clocksource_timebase = {
 
 static int decrementer_set_next_event(unsigned long evt,
  struct clock_event_device *dev);
+static int broadcast_set_next_event(unsigned long evt,
+ struct clock_event_device *dev);
 static void decrementer_set_mode(enum clock_event_mode mode,
 struct clock_event_device *dev);
+static void decrementer_timer_broadcast(const struct cpumask *mask);
 
 struct clock_event_device decrementer_clockevent = {
.name   = "decrementer",
@@ -106,13 +110,26 @@ struct clock_event_device decrementer_clockevent = {
.irq= 0,
.set_next_eve

[RFC V2 PATCH 2/6] powerpc: Implement broadcast timer interrupt as an IPI message

2013-08-14 Thread Preeti U Murthy
From: Srivatsa S. Bhat 

For scalability and performance reasons, we want the broadcast timer
interrupts to be handled as efficiently as possible. Fixed IPI messages
are one of the most efficient mechanisms available - they are faster
than the smp_call_function mechanism because the IPI handlers are fixed
and hence they don't involve costly operations such as adding IPI handlers
to the target CPU's function queue, acquiring locks for synchronization etc.

Luckily we have an unused IPI message slot, so use that to implement
broadcast timer interrupts efficiently.

Signed-off-by: Srivatsa S. Bhat 
Signed-off-by: Preeti U Murthy 
---

 arch/powerpc/include/asm/smp.h  |3 ++-
 arch/powerpc/kernel/smp.c   |   19 +++
 arch/powerpc/platforms/cell/interrupt.c |2 +-
 arch/powerpc/platforms/ps3/smp.c|2 +-
 4 files changed, 19 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
index 51bf017..d877b69 100644
--- a/arch/powerpc/include/asm/smp.h
+++ b/arch/powerpc/include/asm/smp.h
@@ -117,7 +117,7 @@ extern int cpu_to_core_id(int cpu);
  *
  * Make sure this matches openpic_request_IPIs in open_pic.c, or what shows up
  * in /proc/interrupts will be wrong!!! --Troy */
-#define PPC_MSG_UNUSED 0
+#define PPC_MSG_TIMER  0
 #define PPC_MSG_RESCHEDULE  1
 #define PPC_MSG_CALL_FUNC_SINGLE   2
 #define PPC_MSG_DEBUGGER_BREAK  3
@@ -190,6 +190,7 @@ extern struct smp_ops_t *smp_ops;
 
 extern void arch_send_call_function_single_ipi(int cpu);
 extern void arch_send_call_function_ipi_mask(const struct cpumask *mask);
+extern void arch_send_tick_broadcast(const struct cpumask *mask);
 
 /* Definitions relative to the secondary CPU spin loop
  * and entry point. Not all of them exist on both 32 and
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index bc41e9f..6a68ca4 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -35,6 +35,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -111,9 +112,9 @@ int smp_generic_kick_cpu(int nr)
 }
 #endif /* CONFIG_PPC64 */
 
-static irqreturn_t unused_action(int irq, void *data)
+static irqreturn_t timer_action(int irq, void *data)
 {
-   /* This slot is unused and hence available for use, if needed */
+   timer_interrupt();
return IRQ_HANDLED;
 }
 
@@ -144,14 +145,14 @@ static irqreturn_t debug_ipi_action(int irq, void *data)
 }
 
 static irq_handler_t smp_ipi_action[] = {
-   [PPC_MSG_UNUSED] =  unused_action, /* Slot available for future use */
+   [PPC_MSG_TIMER] =  timer_action,
[PPC_MSG_RESCHEDULE] = reschedule_action,
[PPC_MSG_CALL_FUNC_SINGLE] = call_function_single_action,
[PPC_MSG_DEBUGGER_BREAK] = debug_ipi_action,
 };
 
 const char *smp_ipi_name[] = {
-   [PPC_MSG_UNUSED] =  "ipi unused",
+   [PPC_MSG_TIMER] =  "ipi timer",
[PPC_MSG_RESCHEDULE] = "ipi reschedule",
[PPC_MSG_CALL_FUNC_SINGLE] = "ipi call function single",
[PPC_MSG_DEBUGGER_BREAK] = "ipi debugger",
@@ -221,6 +222,8 @@ irqreturn_t smp_ipi_demux(void)
all = xchg(&info->messages, 0);
 
 #ifdef __BIG_ENDIAN
+   if (all & (1 << (24 - 8 * PPC_MSG_TIMER)))
+   timer_interrupt();
if (all & (1 << (24 - 8 * PPC_MSG_RESCHEDULE)))
scheduler_ipi();
if (all & (1 << (24 - 8 * PPC_MSG_CALL_FUNC_SINGLE)))
@@ -266,6 +269,14 @@ void arch_send_call_function_ipi_mask(const struct cpumask 
*mask)
do_message_pass(cpu, PPC_MSG_CALL_FUNC_SINGLE);
 }
 
+void arch_send_tick_broadcast(const struct cpumask *mask)
+{
+   unsigned int cpu;
+
+   for_each_cpu(cpu, mask)
+   do_message_pass(cpu, PPC_MSG_TIMER);
+}
+
 #if defined(CONFIG_DEBUGGER) || defined(CONFIG_KEXEC)
 void smp_send_debugger_break(void)
 {
diff --git a/arch/powerpc/platforms/cell/interrupt.c 
b/arch/powerpc/platforms/cell/interrupt.c
index 28166e4..1359113 100644
--- a/arch/powerpc/platforms/cell/interrupt.c
+++ b/arch/powerpc/platforms/cell/interrupt.c
@@ -213,7 +213,7 @@ static void iic_request_ipi(int msg)
 
 void iic_request_IPIs(void)
 {
-   iic_request_ipi(PPC_MSG_UNUSED);
+   iic_request_ipi(PPC_MSG_TIMER);
iic_request_ipi(PPC_MSG_RESCHEDULE);
iic_request_ipi(PPC_MSG_CALL_FUNC_SINGLE);
iic_request_ipi(PPC_MSG_DEBUGGER_BREAK);
diff --git a/arch/powerpc/platforms/ps3/smp.c b/arch/powerpc/platforms/ps3/smp.c
index 488f069..5cb742a 100644
--- a/arch/powerpc/platforms/ps3/smp.c
+++ b/arch/powerpc/platforms/ps3/smp.c
@@ -74,7 +74,7 @@ static int __init ps3_smp_probe(void)
* to index needs to be setup.
*/
 
-   BUILD_BUG_ON(PPC_MSG_UNUSED   != 0);
+   BUILD_

[RFC V2 PATCH 1/6] powerpc: Free up the IPI message slot of ipi call function (PPC_MSG_CALL_FUNC)

2013-08-14 Thread Preeti U Murthy
From: Srivatsa S. Bhat 

The IPI handlers for both PPC_MSG_CALL_FUNC and PPC_MSG_CALL_FUNC_SINGLE
map to a common implementation - generic_smp_call_function_single_interrupt().
So, we can consolidate them and save one of the IPI message slots, (which are
precious, since only 4 of those slots are available).

So, implement the functionality of PPC_MSG_CALL_FUNC using
PPC_MSG_CALL_FUNC_SINGLE itself and release its IPI message slot, so that it
can be used for something else in the future, if desired.

Signed-off-by: Srivatsa S. Bhat 
Signed-off-by: Preeti U Murthy 
---

 arch/powerpc/include/asm/smp.h  |2 +-
 arch/powerpc/kernel/smp.c   |   12 +---
 arch/powerpc/platforms/cell/interrupt.c |2 +-
 arch/powerpc/platforms/ps3/smp.c|2 +-
 4 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
index ffbaabe..51bf017 100644
--- a/arch/powerpc/include/asm/smp.h
+++ b/arch/powerpc/include/asm/smp.h
@@ -117,7 +117,7 @@ extern int cpu_to_core_id(int cpu);
  *
  * Make sure this matches openpic_request_IPIs in open_pic.c, or what shows up
  * in /proc/interrupts will be wrong!!! --Troy */
-#define PPC_MSG_CALL_FUNCTION   0
+#define PPC_MSG_UNUSED 0
 #define PPC_MSG_RESCHEDULE  1
 #define PPC_MSG_CALL_FUNC_SINGLE   2
 #define PPC_MSG_DEBUGGER_BREAK  3
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 38b0ba6..bc41e9f 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -111,9 +111,9 @@ int smp_generic_kick_cpu(int nr)
 }
 #endif /* CONFIG_PPC64 */
 
-static irqreturn_t call_function_action(int irq, void *data)
+static irqreturn_t unused_action(int irq, void *data)
 {
-   generic_smp_call_function_interrupt();
+   /* This slot is unused and hence available for use, if needed */
return IRQ_HANDLED;
 }
 
@@ -144,14 +144,14 @@ static irqreturn_t debug_ipi_action(int irq, void *data)
 }
 
 static irq_handler_t smp_ipi_action[] = {
-   [PPC_MSG_CALL_FUNCTION] =  call_function_action,
+   [PPC_MSG_UNUSED] =  unused_action, /* Slot available for future use */
[PPC_MSG_RESCHEDULE] = reschedule_action,
[PPC_MSG_CALL_FUNC_SINGLE] = call_function_single_action,
[PPC_MSG_DEBUGGER_BREAK] = debug_ipi_action,
 };
 
 const char *smp_ipi_name[] = {
-   [PPC_MSG_CALL_FUNCTION] =  "ipi call function",
+   [PPC_MSG_UNUSED] =  "ipi unused",
[PPC_MSG_RESCHEDULE] = "ipi reschedule",
[PPC_MSG_CALL_FUNC_SINGLE] = "ipi call function single",
[PPC_MSG_DEBUGGER_BREAK] = "ipi debugger",
@@ -221,8 +221,6 @@ irqreturn_t smp_ipi_demux(void)
all = xchg(&info->messages, 0);
 
 #ifdef __BIG_ENDIAN
-   if (all & (1 << (24 - 8 * PPC_MSG_CALL_FUNCTION)))
-   generic_smp_call_function_interrupt();
if (all & (1 << (24 - 8 * PPC_MSG_RESCHEDULE)))
scheduler_ipi();
if (all & (1 << (24 - 8 * PPC_MSG_CALL_FUNC_SINGLE)))
@@ -265,7 +263,7 @@ void arch_send_call_function_ipi_mask(const struct cpumask 
*mask)
unsigned int cpu;
 
for_each_cpu(cpu, mask)
-   do_message_pass(cpu, PPC_MSG_CALL_FUNCTION);
+   do_message_pass(cpu, PPC_MSG_CALL_FUNC_SINGLE);
 }
 
 #if defined(CONFIG_DEBUGGER) || defined(CONFIG_KEXEC)
diff --git a/arch/powerpc/platforms/cell/interrupt.c 
b/arch/powerpc/platforms/cell/interrupt.c
index 2d42f3b..28166e4 100644
--- a/arch/powerpc/platforms/cell/interrupt.c
+++ b/arch/powerpc/platforms/cell/interrupt.c
@@ -213,7 +213,7 @@ static void iic_request_ipi(int msg)
 
 void iic_request_IPIs(void)
 {
-   iic_request_ipi(PPC_MSG_CALL_FUNCTION);
+   iic_request_ipi(PPC_MSG_UNUSED);
iic_request_ipi(PPC_MSG_RESCHEDULE);
iic_request_ipi(PPC_MSG_CALL_FUNC_SINGLE);
iic_request_ipi(PPC_MSG_DEBUGGER_BREAK);
diff --git a/arch/powerpc/platforms/ps3/smp.c b/arch/powerpc/platforms/ps3/smp.c
index 4b35166..488f069 100644
--- a/arch/powerpc/platforms/ps3/smp.c
+++ b/arch/powerpc/platforms/ps3/smp.c
@@ -74,7 +74,7 @@ static int __init ps3_smp_probe(void)
* to index needs to be setup.
*/
 
-   BUILD_BUG_ON(PPC_MSG_CALL_FUNCTION!= 0);
+   BUILD_BUG_ON(PPC_MSG_UNUSED   != 0);
BUILD_BUG_ON(PPC_MSG_RESCHEDULE   != 1);
BUILD_BUG_ON(PPC_MSG_CALL_FUNC_SINGLE != 2);
BUILD_BUG_ON(PPC_MSG_DEBUGGER_BREAK   != 3);



[RFC V2 PATCH 0/6] cpuidle/ppc: Timer offload framework to support deep idle states

2013-08-14 Thread Preeti U Murthy
On PowerPC, when CPUs enter deep idle states, their local timers are
switched off. The responsibility of waking them up at their next timer event
needs to be handed over to an external device. On PowerPC, we do not have an
external device equivalent to the HPET, which is what architectures like x86
currently use for this. Instead we assign the local timer of one of the CPUs
to do this job.

This patchset is an attempt to make use of the existing timer broadcast
framework in the kernel to meet the above requirement, except that the tick
broadcast device is the local timer of the boot CPU.

This patch series is ported on top of 3.11-rc1 plus the cpuidle driver backend
for powernv posted by Deepthi Dharwar recently. NOHZ_FULL is disabled for all
testing purposes.
  The current design and implementation supports the ONESHOT tick mode.
It does not yet support the PERIODIC tick mode.

The discussion around V1 of this patchset can be found here:
https://lkml.org/lkml/2013/7/25/740.

Changes since V1:
1. Dynamically pick a broadcast CPU, instead of having a dedicated one.
2. Remove the constraint of having to disable tickless idle on the broadcast 
CPU.

Thanks to Ben H, Frederic Weisbecker, Li Yang and Vaidyanathan Srinivasan for
all their comments and suggestions on the V1 of this patchset.

Patch[1/6], Patch[2/6]: optimize the broadcast mechanism on ppc.
Patch[3/6]: Introduces the core of the timer offload framework on powerpc.
Patch[4/6]: Add a deep idle state to the cpuidle state table on powernv
Patch[5/6]: Dynamically pick a broadcast CPU
Patch[6/6]: Remove the constraint of having to disable tickless idle on the
broadcast cpu, by queueing a hrtimer exclusively to do broadcast handling.

---

Preeti U Murthy (4):
  cpuidle/ppc: Add timer offload framework to support deep idle states
  cpuidle/ppc: Add longnap state to the idle states on powernv
  cpuidle/ppc: Enable dynamic movement of the broadcast functionality 
across CPUs
  cpuidle/ppc : Queue a hrtimer on bc_cpu, explicitly to do broadcast 
handling

Srivatsa S. Bhat (2):
  powerpc: Free up the IPI message slot of ipi call function 
(PPC_MSG_CALL_FUNC)
  powerpc: Implement broadcast timer interrupt as an IPI message


 arch/powerpc/include/asm/smp.h  |3 -
 arch/powerpc/include/asm/time.h |9 ++
 arch/powerpc/kernel/smp.c   |   23 +++--
 arch/powerpc/kernel/time.c  |  114 +++
 arch/powerpc/platforms/cell/interrupt.c |2 
 arch/powerpc/platforms/powernv/Kconfig  |1 
 arch/powerpc/platforms/powernv/processor_idle.c |  104 +
 arch/powerpc/platforms/ps3/smp.c|2 
 8 files changed, 246 insertions(+), 12 deletions(-)

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [linux-pm] [PATCH 1/3] cpuidle/powernv: cpuidle backend driver for powernv

2013-08-02 Thread Preeti U Murthy
Hi Daniel,

On 07/27/2013 10:57 AM, Daniel Lezcano wrote:
> On 07/23/2013 11:01 AM, Deepthi Dharwar wrote:
>> This patch implements a back-end cpuidle driver for
>> powernv calling power7_nap and snooze idle states.
>> This can be extended by adding more idle states
>> in the future to the existing framework.
>>
>> Signed-off-by: Deepthi Dharwar 
>> ---
>>  arch/powerpc/platforms/powernv/Kconfig  |9 +
>>  arch/powerpc/platforms/powernv/Makefile |1 
>>  arch/powerpc/platforms/powernv/processor_idle.c |  239 
>> +++
>>  3 files changed, 249 insertions(+)
>>  create mode 100644 arch/powerpc/platforms/powernv/processor_idle.c
>>
>> diff --git a/arch/powerpc/platforms/powernv/processor_idle.c 
>> b/arch/powerpc/platforms/powernv/processor_idle.c
>> new file mode 100644
>> index 000..f43ad91a
>> --- /dev/null
>> +++ b/arch/powerpc/platforms/powernv/processor_idle.c
>> @@ -0,0 +1,239 @@
>> +/*
>> + *  processor_idle - idle state cpuidle driver.
>> + */
>> +
>> +#include 
>> +#include 
>> +#include 
>> +#include 
>> +#include 
>> +#include 
>> +#include 
>> +
>> +#include 
>> +#include 
>> +
>> +struct cpuidle_driver powernv_idle_driver = {
>> +.name = "powernv_idle",
>> +.owner =THIS_MODULE,
>> +};
>> +
>> +#define MAX_IDLE_STATE_COUNT2
>> +
>> +static int max_idle_state = MAX_IDLE_STATE_COUNT - 1;
>> +static struct cpuidle_device __percpu *powernv_cpuidle_devices;
>> +static struct cpuidle_state *cpuidle_state_table;
>> +
>> +static int snooze_loop(struct cpuidle_device *dev,
>> +struct cpuidle_driver *drv,
>> +int index)
>> +{
>> +int cpu = dev->cpu;
>> +
>> +local_irq_enable();
>> +set_thread_flag(TIF_POLLING_NRFLAG);
>> +
>> +while ((!need_resched()) && cpu_online(cpu)) {
>> +ppc64_runlatch_off();
>> +HMT_very_low();
>> +}
> 
> Why are you using the cpu_online test here ?
> 
>> +
>> +HMT_medium();
>> +clear_thread_flag(TIF_POLLING_NRFLAG);
>> +smp_mb();
>> +return index;
>> +}
>> +
>> +
>> +static int nap_loop(struct cpuidle_device *dev,
>> +struct cpuidle_driver *drv,
>> +int index)
>> +{
>> +ppc64_runlatch_off();
>> +power7_idle();
>> +return index;
>> +}
>> +
>> +/*
>> + * States for dedicated partition case.
>> + */
>> +static struct cpuidle_state powernv_states[MAX_IDLE_STATE_COUNT] = {
>> +{ /* Snooze */
>> +.name = "snooze",
>> +.desc = "snooze",
>> +.flags = CPUIDLE_FLAG_TIME_VALID,
>> +.exit_latency = 0,
>> +.target_residency = 0,
>> +.enter = &snooze_loop },
>> + { /* Nap */
>> +.name = "Nap",
>> +.desc = "Nap",
>> +.flags = CPUIDLE_FLAG_TIME_VALID,
>> +.exit_latency = 10,
>> +.target_residency = 100,
>> +.enter = &nap_loop },
>> +};
>> +
>> +static int powernv_cpuidle_add_cpu_notifier(struct notifier_block *n,
>> +unsigned long action, void *hcpu)
>> +{
>> +int hotcpu = (unsigned long)hcpu;
>> +struct cpuidle_device *dev =
>> +per_cpu_ptr(powernv_cpuidle_devices, hotcpu);
>> +
>> +if (dev && cpuidle_get_driver()) {
>> +switch (action) {
>> +case CPU_ONLINE:
>> +case CPU_ONLINE_FROZEN:
>> +cpuidle_pause_and_lock();
>> +cpuidle_enable_device(dev);
>> +cpuidle_resume_and_unlock();
>> +break;
>> +
>> +        case CPU_DEAD:
>> +case CPU_DEAD_FROZEN:
>> +cpuidle_pause_and_lock();
>> +cpuidle_disable_device(dev);
>> +cpuidle_resume_and_unlock();
>> +break;
>> +
>> +default:
>> +return NOTIFY_DONE;
>> +}
>> +}
>> +return NOTIFY_OK;
>> +}
>> +
>> +static struct notifier_block setup_hotplug_notifier = {
>> +.notifier_call = powernv_cpuidle_add_cpu_notifier,
>> +};
> 
> This is duplicated code with the pseries cpuidle driver and IMHO it
> should be moved to the cpuidle framework.
> 

Will this not require a cleanup of the hotplug cpuidle notifiers from
other architectures into the cpuidle framework as well?

Regards
Preeti U Murthy



Re: [PATCH V2 4/6] cpuidle/pseries: Move the pseries_idle backend driver to sysdev.

2013-07-31 Thread Preeti U Murthy
Hi Dongsheng,

On 07/31/2013 11:16 AM, Wang Dongsheng-B40534 wrote:
> Hi Preeti,
> 
>> -Original Message-----
>> From: Preeti U Murthy [mailto:pre...@linux.vnet.ibm.com]
>> Sent: Wednesday, July 31, 2013 12:00 PM
>> To: Wang Dongsheng-B40534
>> Cc: Deepthi Dharwar; b...@kernel.crashing.org; daniel.lezc...@linaro.org;
>> linux-ker...@vger.kernel.org; mich...@ellerman.id.au;
>> srivatsa.b...@linux.vnet.ibm.com; sva...@linux.vnet.ibm.com; linuxppc-
>> d...@lists.ozlabs.org; r...@sisk.pl; linux...@vger.kernel.org
>> Subject: Re: [PATCH V2 4/6] cpuidle/pseries: Move the pseries_idle
>> backend driver to sysdev.
>>
>> Hi Dongsheng,
>>
>> On 07/31/2013 08:52 AM, Wang Dongsheng-B40534 wrote:
>>>
>>>
>>>> -Original Message-
>>>> From: Deepthi Dharwar [mailto:deep...@linux.vnet.ibm.com]
>>>> Sent: Wednesday, July 31, 2013 10:59 AM
>>>> To: b...@kernel.crashing.org; daniel.lezc...@linaro.org; linux-
>>>> ker...@vger.kernel.org; mich...@ellerman.id.au;
>>>> srivatsa.b...@linux.vnet.ibm.com; pre...@linux.vnet.ibm.com;
>>>> sva...@linux.vnet.ibm.com; linuxppc-dev@lists.ozlabs.org
>>>> Cc: r...@sisk.pl; Wang Dongsheng-B40534; linux...@vger.kernel.org
>>>> Subject: [PATCH V2 4/6] cpuidle/pseries: Move the pseries_idle
>>>> backend driver to sysdev.
>>>>
>>>> Move pseries_idle backend driver code to arch/powerpc/sysdev so that
>>>> the code can be used for a common driver for powernv and pseries.
>>>> This removes a lot of code duplicacy.
>>>>
>>> Why not drivers/cpuidle/?
>>>
>>> I think it should be moved to drivers/cpuidle.
>>
>> Please take a look at what the cpuidle under drivers has to provide.
>> cpuidle has two parts to it. The front end and the back end. The front
>> end constitutes the cpuidle governors, registering of arch specific
>> cpuidle drivers, disabling and enabling of cpuidle feature. It is this
>> front end code which is present under drivers/cpuidle.
>>
>> The arch specific cpuidle drivers which decide what needs to be done to
>> enter a specific idle state chosen by the cpuidle governor is what
>> constitutes the back end of cpuidle. This will not be in drivers/cpuidle
>> but in an arch/ specific code.
>>
>> The cpuidle under drivers/cpuidle drives the idle power management, but
>> the low level handling of the entry into idle states should be taken care
>> of by the architecture.
>>
>> Your recent patch :
>> cpuidle: add freescale e500 family porcessors idle support IMO should
>> hook onto the backend cpuidle driver that this patchset provides.
>>
> Sorry, I don't think so; the cpuidle framework is already quite generic.
> Here we just need to define the states and their handling. I wonder whether
> we need this extra layer.
> 
> If your handler is platform dependent, it should be in arch/platform.
> 
> If it is only needed for some platforms, and the operations of those
> platforms can be multiplexed, why can it not be put into drivers/cpuidle as
> a driver?
> 
> If it is a general driver, I think we can put the common operations into
> drivers/cpuidle and keep the platform specific code in
> arch/powerpc/platforms.
> 
> This patch includes both the front end and the back end, not just the back
> end.
> 
> This patch includes too many states and handler functions for different
> platforms. Those states and handlers should belong to their own platforms;
> this is not a general way of doing it. If Deepthi is going to do a general
> powerpc cpuidle driver, I think it cannot just use macros to distinguish
> platforms. The front end code (driver registration) would be better moved
> to drivers/cpuidle, with the low power states and their handling kept in
> arch/powerpc/platforms/**, because different platforms have different low
> power states and different ways of handling them.
> The front end can provide some general methods that register into a general
> powerpc cpuidle driver.


As Daniel pointed out, with a call to cpuidle_register(), we can get the
cpuidle_driver and cpuidle_device registered through the generic cpuidle
framework. Hence we can get rid of the powerpc_idle_devices_init() routine.

We can have the hotplug notifier in the generic cpuidle framework as
well. The rest of the patchset however should be arch specific IMO.
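
Roughly, the registration then reduces to something like this inside the
driver's init routine (just a sketch of the idea, not a tested patch):

	int retval;

	/* cpuidle_register() sets up the cpuidle_driver and the per-cpu
	 * cpuidle_device structures in one call, so the open-coded
	 * powerpc_idle_devices_init() can be dropped.
	 */
	retval = cpuidle_register(&powernv_idle_driver, NULL);
	if (retval)
		return retval;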

Regards
Preeti U Murthy

> 
> -dongsheng
> 



Re: [PATCH V2 4/6] cpuidle/pseries: Move the pseries_idle backend driver to sysdev.

2013-07-30 Thread Preeti U Murthy
Hi Dongsheng,

On 07/31/2013 08:52 AM, Wang Dongsheng-B40534 wrote:
> 
> 
>> -Original Message-
>> From: Deepthi Dharwar [mailto:deep...@linux.vnet.ibm.com]
>> Sent: Wednesday, July 31, 2013 10:59 AM
>> To: b...@kernel.crashing.org; daniel.lezc...@linaro.org; linux-
>> ker...@vger.kernel.org; mich...@ellerman.id.au;
>> srivatsa.b...@linux.vnet.ibm.com; pre...@linux.vnet.ibm.com;
>> sva...@linux.vnet.ibm.com; linuxppc-dev@lists.ozlabs.org
>> Cc: r...@sisk.pl; Wang Dongsheng-B40534; linux...@vger.kernel.org
>> Subject: [PATCH V2 4/6] cpuidle/pseries: Move the pseries_idle backend
>> driver to sysdev.
>>
>> Move pseries_idle backend driver code to arch/powerpc/sysdev
>> so that the code can be used for a common driver for powernv
>> and pseries. This removes a lot of code duplicacy.
>>
> Why not drivers/cpuidle/?
> 
> I think it should be moved to drivers/cpuidle.

Please take a look at what the cpuidle code under drivers/ has to provide.
cpuidle has two parts to it: the front end and the back end. The front
end comprises the cpuidle governors, the registration of arch specific
cpuidle drivers, and the enabling and disabling of the cpuidle feature. It
is this front end code which is present under drivers/cpuidle.

The arch specific cpuidle drivers which decide what needs to be done to
enter a specific idle state chosen by the cpuidle governor is what
constitutes the back end of cpuidle. This will not be in drivers/cpuidle
but in an arch/ specific code.

The cpuidle under drivers/cpuidle drives the idle power management, but
the low level handling of the entry into idle states should be taken
care of by the architecture.
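
To make the split concrete, the hand-off between the two halves looks roughly
like the following (a simplified sketch; the real call chain lives in
drivers/cpuidle/cpuidle.c):

	/* Simplified: what the generic idle entry path effectively does. */
	static int cpuidle_enter_chosen_state(struct cpuidle_driver *drv,
					      struct cpuidle_device *dev)
	{
		int index;

		/* Front end (drivers/cpuidle): the governor picks a state index. */
		index = cpuidle_curr_governor->select(drv, dev);
		if (index < 0)
			return index;

		/* Back end (arch code): the state's enter() hook, e.g. nap_loop(),
		 * actually puts the CPU into that idle state.
		 */
		return drv->states[index].enter(dev, drv, index);
	}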

Your recent patch :
cpuidle: add freescale e500 family porcessors idle support IMO should
hook onto the backend cpuidle driver that this patchset provides.

Regards
Preeti U Murthy



Re: [RFC PATCH 4/5] cpuidle/ppc: CPU goes tickless if there are no arch-specific constraints

2013-07-29 Thread Preeti U Murthy
Hi,

On 07/29/2013 10:58 AM, Vaidyanathan Srinivasan wrote:
> * Preeti U Murthy  [2013-07-27 13:20:37]:
> 
>> Hi Ben,
>>
>> On 07/27/2013 12:00 PM, Benjamin Herrenschmidt wrote:
>>> On Fri, 2013-07-26 at 08:09 +0530, Preeti U Murthy wrote:
>>>> *The lapic of a broadcast CPU is active always*. Say CPUX, wants the
>>>> broadcast CPU to wake it up at timeX.  Since we cannot program the lapic
>>>> of a remote CPU, CPUX will need to send an IPI to the broadcast CPU,
>>>> asking it to program its lapic to fire at timeX so as to wake up CPUX.
>>>> *With multiple CPUs the overhead of sending IPI, could result in
>>>> performance bottlenecks and may not scale well.*
>>>>
>>>> Hence the workaround is that the broadcast CPU on each of its timer
>>>> interrupt checks if any of the next timer event of a CPU in deep idle
>>>> state has expired, which can very well be found from dev->next_event of
>>>> that CPU. For example the timeX that has been mentioned above has
>>>> expired. If so the broadcast handler is called to send an IPI to the
>>>> idling CPU to wake it up.
>>>>
>>>> *If the broadcast CPU, is in tickless idle, its timer interrupt could be
>>>> many ticks away. It could miss waking up a CPU in deep idle*, if its
>>>> wakeup is much before this timer interrupt of the broadcast CPU. But
>>>> without tickless idle, atleast at each period we are assured of a timer
>>>> interrupt. At which time broadcast handling is done as stated in the
>>>> previous paragraph and we will not miss wakeup of CPUs in deep idle states.
>>>
>>> But that means a great loss of power saving on the broadcast CPU when the 
>>> machine
>>> is basically completely idle. We might be able to come up with some thing 
>>> better.
>>>
>>> (Note : I do no know the timer offload code if it exists already, I'm 
>>> describing
>>> how things could happen "out of the blue" without any knowledge of 
>>> pre-existing
>>> framework here)
>>>
>>> We can know when the broadcast CPU expects to wake up next. When a CPU goes 
>>> to
>>> a deep sleep state, it can then
>>>
>>>  - Indicate to the broadcast CPU when it intends to be woken up by queuing
>>> itself into an ordered queue (ordered by target wakeup time). (OPTIMISATION:
>>> Play with the locality of that: have one queue (and one "broadcast CPU") per
>>> chip or per node instead of a global one to limit cache bouncing).
>>>
>>>  - Check if that happens before the broadcast CPU intended wake time (we
>>> need statistics to see how often that happens), and in that case send an IPI
>>> to wake it up now. When the broadcast CPU goes to sleep, it limits its sleep
>>> time to the min of it's intended sleep time and the new sleeper time.
>>> (OPTIMISATION: Dynamically chose a broadcast CPU based on closest expiry ?)
>>>
>>>  - We can probably limit spurrious wakeups a *LOT* by aligning that target 
>>> time
>>> to a global jiffy boundary, meaning that several CPUs going to idle are 
>>> likely
>>> to be choosing the same. Or maybe better, an adaptative alignment by 
>>> essentially
>>> getting more coarse grained as we go further in the future
>>>
>>>  - When the "broadcast" CPU goes to sleep, it can play the same game of 
>>> alignment.
>>>
>>> I don't like the concept of a dedicated broadcast CPU however. I'd rather 
>>> have a
>>> general queue (or per node) of sleepers needing a wakeup and more/less 
>>> dynamically
>>> pick a waker to be the last man standing, but it does make things a bit more
>>> tricky with tickless scheduler (non-idle).
>>>
>>> Still, I wonder if we could just have some algorithm to actually pick wakers
>>> more dynamically based on who ever has the closest "next wakeup" planned,
>>> that sort of thing. A fixed broadcaster will create an imbalance in
>>> power/thermal within the chip in addition to needing to be moved around on
>>> hotplug etc...
>>
>> Thank you for having listed out the above suggestions. Below, I will
>> bring out some ideas about how the concerns that you have raised can be
>> addressed in the increasing order of priority.
>>
>> - To begin with, I think we can have the following model to have the
>> responsibility of the broadcast CPU float a

Re: [RFC PATCH 4/5] cpuidle/ppc: CPU goes tickless if there are no arch-specific constraints

2013-07-27 Thread Preeti U Murthy
Hi Ben,

On 07/27/2013 12:00 PM, Benjamin Herrenschmidt wrote:
> On Fri, 2013-07-26 at 08:09 +0530, Preeti U Murthy wrote:
>> *The lapic of a broadcast CPU is active always*. Say CPUX, wants the
>> broadcast CPU to wake it up at timeX.  Since we cannot program the lapic
>> of a remote CPU, CPUX will need to send an IPI to the broadcast CPU,
>> asking it to program its lapic to fire at timeX so as to wake up CPUX.
>> *With multiple CPUs the overhead of sending IPI, could result in
>> performance bottlenecks and may not scale well.*
>>
>> Hence the workaround is that the broadcast CPU on each of its timer
>> interrupt checks if any of the next timer event of a CPU in deep idle
>> state has expired, which can very well be found from dev->next_event of
>> that CPU. For example the timeX that has been mentioned above has
>> expired. If so the broadcast handler is called to send an IPI to the
>> idling CPU to wake it up.
>>
>> *If the broadcast CPU, is in tickless idle, its timer interrupt could be
>> many ticks away. It could miss waking up a CPU in deep idle*, if its
>> wakeup is much before this timer interrupt of the broadcast CPU. But
>> without tickless idle, atleast at each period we are assured of a timer
>> interrupt. At which time broadcast handling is done as stated in the
>> previous paragraph and we will not miss wakeup of CPUs in deep idle states.
> 
> But that means a great loss of power saving on the broadcast CPU when the 
> machine
> is basically completely idle. We might be able to come up with some thing 
> better.
> 
> (Note : I do no know the timer offload code if it exists already, I'm 
> describing
> how things could happen "out of the blue" without any knowledge of 
> pre-existing
> framework here)
> 
> We can know when the broadcast CPU expects to wake up next. When a CPU goes to
> a deep sleep state, it can then
> 
>  - Indicate to the broadcast CPU when it intends to be woken up by queuing
> itself into an ordered queue (ordered by target wakeup time). (OPTIMISATION:
> Play with the locality of that: have one queue (and one "broadcast CPU") per
> chip or per node instead of a global one to limit cache bouncing).
> 
>  - Check if that happens before the broadcast CPU intended wake time (we
> need statistics to see how often that happens), and in that case send an IPI
> to wake it up now. When the broadcast CPU goes to sleep, it limits its sleep
> time to the min of it's intended sleep time and the new sleeper time.
> (OPTIMISATION: Dynamically chose a broadcast CPU based on closest expiry ?)
> 
>  - We can probably limit spurrious wakeups a *LOT* by aligning that target 
> time
> to a global jiffy boundary, meaning that several CPUs going to idle are likely
> to be choosing the same. Or maybe better, an adaptative alignment by 
> essentially
> getting more coarse grained as we go further in the future
> 
>  - When the "broadcast" CPU goes to sleep, it can play the same game of 
> alignment.
> 
> I don't like the concept of a dedicated broadcast CPU however. I'd rather 
> have a
> general queue (or per node) of sleepers needing a wakeup and more/less 
> dynamically
> pick a waker to be the last man standing, but it does make things a bit more
> tricky with tickless scheduler (non-idle).
> 
> Still, I wonder if we could just have some algorithm to actually pick wakers
> more dynamically based on who ever has the closest "next wakeup" planned,
> that sort of thing. A fixed broadcaster will create an imbalance in
> power/thermal within the chip in addition to needing to be moved around on
> hotplug etc...

Thank you for having listed out the above suggestions. Below, I will
bring out some ideas about how the concerns that you have raised can be
addressed, in increasing order of priority.

- To begin with, I think we can have the following model, in which the
responsibility of the broadcast CPU floats around certain CPUs, i.e. we do
not have a dedicated broadcast CPU. I will refer to the broadcast CPU as the
bc_cpu henceforth for convenience.

1. The first CPU that intends to enter a deep sleep state becomes the bc_cpu.

2. Every other CPU that intends to enter a deep idle state adds itself to a
mask, say the bc_mask (which is already being done today), after checking
that a bc_cpu has been assigned.

3. The bc_cpu should not enter tickless idle, until step 5a holds true.

4. So on every timer interrupt, which is at least once every period, it
checks the bc_mask to see if any CPUs need to be woken up.

5. The bc cpu should not enter tickless idle *until* it is de-nominated
as the bc_cpu. The de-nomination occurs when:
  a. In one of its timer interrupts, it does broad

Re: [RFC PATCH 0/5] cpuidle/ppc: Timer offload framework to support deep idle states

2013-07-26 Thread Preeti U Murthy
Hi Li,

On 07/26/2013 03:35 PM, Li Yang-R58472 wrote:
> 
> 
>> -Original Message-
>> From: linux-pm-ow...@vger.kernel.org [mailto:linux-pm-
>> ow...@vger.kernel.org] On Behalf Of Preeti U Murthy
>> Sent: Thursday, July 25, 2013 5:03 PM
>> To: b...@kernel.crashing.org; paul.gortma...@windriver.com;
>> pau...@samba.org; sha...@linux.vnet.ibm.com; ga...@kernel.crashing.org;
>> fweis...@gmail.com; paul...@linux.vnet.ibm.com; mich...@ellerman.id.au;
>> a...@arndb.de; linux...@vger.kernel.org; rost...@goodmis.org; r...@sisk.pl;
>> john.stu...@linaro.org; t...@linutronix.de; Zhao Chenhui-B35336;
>> deep...@linux.vnet.ibm.com; ge...@infradead.org; linux-
>> ker...@vger.kernel.org; srivatsa.b...@linux.vnet.ibm.com;
>> schwidef...@de.ibm.com; sva...@linux.vnet.ibm.com; linuxppc-
>> d...@lists.ozlabs.org
>> Subject: [RFC PATCH 0/5] cpuidle/ppc: Timer offload framework to support
>> deep idle states
>>
>> On PowerPC, when CPUs enter deep idle states, their local timers are
>> switched off. The responsibility of waking them up at their next timer
>> event, needs to be handed over to an external device. On PowerPC, we do
>> not have an external device equivalent to HPET, which is currently done
>> on architectures like x86. Instead we assign the local timer of one of
>> the CPUs to do this job.
> 
> I don't know much about the deep idle states.  But is it necessary to turn 
> off the timer in these states?  I mean the extra work needed on the booting 
> CPU and re-syncing Time Base after wakeup might defeat the power consumption 
> benefit of turning off the timer.

The idle state that a CPU should enter is chosen by the cpuidle governor.
The target residency and the wakeup latency of the idle states, among other
factors, are what the governor considers before choosing an idle state for a
CPU. So deep idle states, which have a higher target residency and wakeup
latency but yield huge power savings, are usually targeted by the governor
when there are large periods of idle, i.e. when CPUs tend to remain idle for
long stretches.

Depending on the workload running, large periods of idle could be rare, in
which case deep idle states are not even entered. But if the workload is
woken up only occasionally to do work, with large periods of idle in between,
deep idle states are targeted. In such a scenario, the power savings that
such idle states yield are significant enough to offset the overhead of the
factors that you mention, which will only need to be paid occasionally.

Hence, for an almost completely idle system, having such deep idle states
yields enormous power savings.
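
In other words, the selection criterion is roughly the following (a
simplified sketch of the idea only, not the actual menu governor code):

	/* Pick the deepest state whose target residency fits the predicted
	 * idle duration and whose exit latency fits the latency requirement.
	 */
	static int pick_idle_state(struct cpuidle_driver *drv,
				   unsigned int predicted_us,
				   unsigned int latency_req_us)
	{
		int i, chosen = 0;	/* state 0: shallowest, e.g. snooze */

		for (i = 1; i < drv->state_count; i++) {
			struct cpuidle_state *s = &drv->states[i];

			if (s->target_residency <= predicted_us &&
			    s->exit_latency <= latency_req_us)
				chosen = i;	/* a deeper state still fits */
		}
		return chosen;
	}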

> 
> Btw, the Open PIC has an internal timer.  Can we use it as the wakeup event?

I am not aware of such a device on PowerPC. Even if there is one, it is
important to investigate what happens to it in deep idle states. If it
is intended to be switched off in one of the deep idle states, again the
idea that this patchset brings in has to be the fall-back solution.

My point is, it is good to have the timer offload framework
implementation because we cannot continue to rely on external devices
being there to aid us in situations when the local timer devices of CPUs
have to be turned off in deep idle states.

> 
> Regards,
> Leo

Regards
Preeti U Murthy



[Resend RFC PATCH 4/5] cpuidle/ppc: CPU goes tickless if there are no arch-specific constraints

2013-07-25 Thread Preeti U Murthy
In the current design of the timer offload framework, the broadcast cpu should
*not* go into tickless idle, so as to avoid missed wakeups on CPUs in deep
idle states.

Since we prevent CPUs entering deep idle states from programming the
decrementer of the broadcast cpu for their respective next local events, for
reasons mentioned in PATCH[3/5], the broadcast CPU instead checks on each of
its timer interrupts, which are programmed for its own local events, whether
there are any CPUs that need to be woken up.

With tickless idle, the broadcast CPU might not have a timer interrupt
pending until after many ticks, which can result in missed wakeups on CPUs
in deep idle states. By disabling tickless idle, in the worst case, the
tick_sched hrtimer will trigger a timer interrupt every period.

However the current setup of tickless idle does not let us make the choice
of tickless on individual cpus. NOHZ_MODE_INACTIVE, which disables tickless
idle, is a system wide setting. Hence we resort to an arch specific call to
check whether a cpu can go into tickless idle.

Signed-off-by: Preeti U Murthy 
---

 arch/powerpc/kernel/time.c |5 +
 kernel/time/tick-sched.c   |7 +++
 2 files changed, 12 insertions(+)

diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 7e858e1..916c32f 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -864,6 +864,11 @@ static void decrementer_timer_broadcast(const struct 
cpumask *mask)
arch_send_tick_broadcast(mask);
 }
 
+int arch_can_stop_idle_tick(int cpu)
+{
+   return cpu != bc_cpu;
+}
+
 static void register_decrementer_clockevent(int cpu)
 {
struct clock_event_device *dec = &per_cpu(decrementers, cpu);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 6960172..e9ffa84 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -700,8 +700,15 @@ static void tick_nohz_full_stop_tick(struct tick_sched *ts)
 #endif
 }
 
+int __weak arch_can_stop_idle_tick(int cpu)
+{
+   return 1;
+}
+
 static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
 {
+   if (!arch_can_stop_idle_tick(cpu))
+   return false;
/*
 * If this cpu is offline and it is the one which updates
 * jiffies, then give up the assignment and let it be taken by



[Resend RFC PATCH 1/5] powerpc: Free up the IPI message slot of ipi call function (PPC_MSG_CALL_FUNC)

2013-07-25 Thread Preeti U Murthy
From: Srivatsa S. Bhat 

The IPI handlers for both PPC_MSG_CALL_FUNC and PPC_MSG_CALL_FUNC_SINGLE
map to a common implementation - generic_smp_call_function_single_interrupt().
So, we can consolidate them and save one of the IPI message slots, (which are
precious, since only 4 of those slots are available).

So, implement the functionality of PPC_MSG_CALL_FUNC using
PPC_MSG_CALL_FUNC_SINGLE itself and release its IPI message slot, so that it
can be used for something else in the future, if desired.

Signed-off-by: Srivatsa S. Bhat 
Signed-off-by: Preeti U Murthy 
---

 arch/powerpc/include/asm/smp.h  |2 +-
 arch/powerpc/kernel/smp.c   |   12 +---
 arch/powerpc/platforms/cell/interrupt.c |2 +-
 arch/powerpc/platforms/ps3/smp.c|2 +-
 4 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
index ffbaabe..51bf017 100644
--- a/arch/powerpc/include/asm/smp.h
+++ b/arch/powerpc/include/asm/smp.h
@@ -117,7 +117,7 @@ extern int cpu_to_core_id(int cpu);
  *
  * Make sure this matches openpic_request_IPIs in open_pic.c, or what shows up
  * in /proc/interrupts will be wrong!!! --Troy */
-#define PPC_MSG_CALL_FUNCTION   0
+#define PPC_MSG_UNUSED 0
 #define PPC_MSG_RESCHEDULE  1
 #define PPC_MSG_CALL_FUNC_SINGLE   2
 #define PPC_MSG_DEBUGGER_BREAK  3
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 38b0ba6..bc41e9f 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -111,9 +111,9 @@ int smp_generic_kick_cpu(int nr)
 }
 #endif /* CONFIG_PPC64 */
 
-static irqreturn_t call_function_action(int irq, void *data)
+static irqreturn_t unused_action(int irq, void *data)
 {
-   generic_smp_call_function_interrupt();
+   /* This slot is unused and hence available for use, if needed */
return IRQ_HANDLED;
 }
 
@@ -144,14 +144,14 @@ static irqreturn_t debug_ipi_action(int irq, void *data)
 }
 
 static irq_handler_t smp_ipi_action[] = {
-   [PPC_MSG_CALL_FUNCTION] =  call_function_action,
+   [PPC_MSG_UNUSED] =  unused_action, /* Slot available for future use */
[PPC_MSG_RESCHEDULE] = reschedule_action,
[PPC_MSG_CALL_FUNC_SINGLE] = call_function_single_action,
[PPC_MSG_DEBUGGER_BREAK] = debug_ipi_action,
 };
 
 const char *smp_ipi_name[] = {
-   [PPC_MSG_CALL_FUNCTION] =  "ipi call function",
+   [PPC_MSG_UNUSED] =  "ipi unused",
[PPC_MSG_RESCHEDULE] = "ipi reschedule",
[PPC_MSG_CALL_FUNC_SINGLE] = "ipi call function single",
[PPC_MSG_DEBUGGER_BREAK] = "ipi debugger",
@@ -221,8 +221,6 @@ irqreturn_t smp_ipi_demux(void)
all = xchg(&info->messages, 0);
 
 #ifdef __BIG_ENDIAN
-   if (all & (1 << (24 - 8 * PPC_MSG_CALL_FUNCTION)))
-   generic_smp_call_function_interrupt();
if (all & (1 << (24 - 8 * PPC_MSG_RESCHEDULE)))
scheduler_ipi();
if (all & (1 << (24 - 8 * PPC_MSG_CALL_FUNC_SINGLE)))
@@ -265,7 +263,7 @@ void arch_send_call_function_ipi_mask(const struct cpumask 
*mask)
unsigned int cpu;
 
for_each_cpu(cpu, mask)
-   do_message_pass(cpu, PPC_MSG_CALL_FUNCTION);
+   do_message_pass(cpu, PPC_MSG_CALL_FUNC_SINGLE);
 }
 
 #if defined(CONFIG_DEBUGGER) || defined(CONFIG_KEXEC)
diff --git a/arch/powerpc/platforms/cell/interrupt.c 
b/arch/powerpc/platforms/cell/interrupt.c
index 2d42f3b..28166e4 100644
--- a/arch/powerpc/platforms/cell/interrupt.c
+++ b/arch/powerpc/platforms/cell/interrupt.c
@@ -213,7 +213,7 @@ static void iic_request_ipi(int msg)
 
 void iic_request_IPIs(void)
 {
-   iic_request_ipi(PPC_MSG_CALL_FUNCTION);
+   iic_request_ipi(PPC_MSG_UNUSED);
iic_request_ipi(PPC_MSG_RESCHEDULE);
iic_request_ipi(PPC_MSG_CALL_FUNC_SINGLE);
iic_request_ipi(PPC_MSG_DEBUGGER_BREAK);
diff --git a/arch/powerpc/platforms/ps3/smp.c b/arch/powerpc/platforms/ps3/smp.c
index 4b35166..488f069 100644
--- a/arch/powerpc/platforms/ps3/smp.c
+++ b/arch/powerpc/platforms/ps3/smp.c
@@ -74,7 +74,7 @@ static int __init ps3_smp_probe(void)
* to index needs to be setup.
*/
 
-   BUILD_BUG_ON(PPC_MSG_CALL_FUNCTION!= 0);
+   BUILD_BUG_ON(PPC_MSG_UNUSED   != 0);
BUILD_BUG_ON(PPC_MSG_RESCHEDULE   != 1);
BUILD_BUG_ON(PPC_MSG_CALL_FUNC_SINGLE != 2);
BUILD_BUG_ON(PPC_MSG_DEBUGGER_BREAK   != 3);



[Resend RFC PATCH 5/5] cpuidle/ppc: Add longnap state to the idle states on powernv

2013-07-25 Thread Preeti U Murthy
This patch hooks into the existing broadcast framework, along with the
support that this patchset introduces for ppc, and the cpuidle driver backend
for powernv (posted out recently by Deepthi Dharwar), to add the sleep state
as one of the deep idle states, in which the decrementer is switched off.

However, in this patch we only emulate sleep by going into a state which does
a nap with decrementer interrupts disabled, termed longnap. This keeps the
focus of this series on the timer broadcast framework for ppc, which is
required as a first step to enable sleep on ppc.

Signed-off-by: Preeti U Murthy 
---

 arch/powerpc/platforms/powernv/processor_idle.c |   48 +++
 1 file changed, 47 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/powernv/processor_idle.c 
b/arch/powerpc/platforms/powernv/processor_idle.c
index f43ad91a..9aca502 100644
--- a/arch/powerpc/platforms/powernv/processor_idle.c
+++ b/arch/powerpc/platforms/powernv/processor_idle.c
@@ -9,16 +9,18 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
+#include 
 
 struct cpuidle_driver powernv_idle_driver = {
.name = "powernv_idle",
.owner =THIS_MODULE,
 };
 
-#define MAX_IDLE_STATE_COUNT   2
+#define MAX_IDLE_STATE_COUNT   3
 
 static int max_idle_state = MAX_IDLE_STATE_COUNT - 1;
 static struct cpuidle_device __percpu *powernv_cpuidle_devices;
@@ -54,6 +56,43 @@ static int nap_loop(struct cpuidle_device *dev,
return index;
 }
 
+/* Emulate sleep, with long nap.
+ * During sleep, the core does not receive decrementer interrupts.
+ * Emulate sleep using long nap with decrementer interrupts disabled.
+ * This is an initial prototype to test the timer offload framework for ppc.
+ * We will eventually introduce the sleep state once the timer offload 
framework
+ * for ppc is stable.
+ */
+static int longnap_loop(struct cpuidle_device *dev,
+   struct cpuidle_driver *drv,
+   int index)
+{
+   int cpu = dev->cpu;
+
+   unsigned long lpcr = mfspr(SPRN_LPCR);
+
+   lpcr &= ~(LPCR_MER | LPCR_PECE); /* lpcr[mer] must be 0 */
+
+   /* exit powersave upon external interrupt, but not decrementer
+* interrupt, Emulate sleep.
+*/
+   lpcr |= LPCR_PECE0;
+
+   if (cpu != bc_cpu) {
+   mtspr(SPRN_LPCR, lpcr);
+   clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);
+   power7_nap();
+   clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
+   } else {
+   /* Wakeup on a decrementer interrupt, Do a nap */
+   lpcr |= LPCR_PECE1;
+   mtspr(SPRN_LPCR, lpcr);
+   power7_nap();
+   }
+
+   return index;
+}
+
 /*
  * States for dedicated partition case.
  */
@@ -72,6 +111,13 @@ static struct cpuidle_state 
powernv_states[MAX_IDLE_STATE_COUNT] = {
.exit_latency = 10,
.target_residency = 100,
.enter = &nap_loop },
+{ /* LongNap */
+   .name = "LongNap",
+   .desc = "LongNap",
+   .flags = CPUIDLE_FLAG_TIME_VALID,
+   .exit_latency = 10,
+   .target_residency = 100,
+   .enter = &longnap_loop },
 };
 
 static int powernv_cpuidle_add_cpu_notifier(struct notifier_block *n,



[Resend RFC PATCH 3/5] cpuidle/ppc: Add timer offload framework to support deep idle states

2013-07-25 Thread Preeti U Murthy
On ppc, in deep idle states, the local clock event device of CPUs gets
switched off. On PowerPC, the local clock event device is called the
decrementer. Make use of the broadcast framework to issue interrupts to
cpus in deep idle states on their timer events, except that on ppc, we
do not have an external device such as HPET, but we use the decrementer
of one of the CPUs itself as the broadcast device.

Instantiate two different clock event devices, one representing the
decrementer and another representing the broadcast device for each cpu.
The cpu which registers its broadcast device will be responsible for
performing the function of issuing timer interrupts to CPUs in deep idle
states, and is referred to as the broadcast cpu in the changelogs of this
patchset for convenience. Such a CPU is not allowed to enter deep idle
states, where the decrementer is switched off.

For now, only the boot cpu's broadcast device gets registered as a clock event
device along with the decrementer. Hence this is the broadcast cpu.

On the broadcast cpu, on each timer interrupt, the broadcast handler is
called in addition to the regular local timer event handler. We deliberately
avoid programming the decrementer specifically for each broadcast event, for
performance and scalability reasons. Say cpuX goes to a deep idle state. It
would have to ask the broadcast CPU to reprogram its (the broadcast CPU's)
decrementer for the next local timer event of cpuX, and it can do so only by
sending an IPI to the broadcast CPU. With many more cpus going into deep
idle, this model of sending IPIs each time would become a performance
bottleneck and may not scale well.

Apart from this there is no change in the way broadcast is handled today. On
a broadcast ipi the event handler for a timer interrupt is called on the cpu
in deep idle state to handle the local events.

The current design and implementation of the timer offload framework supports
the ONESHOT tick mode but not the PERIODIC mode.

Signed-off-by: Preeti U. Murthy 
---

 arch/powerpc/include/asm/time.h|3 +
 arch/powerpc/kernel/smp.c  |4 +-
 arch/powerpc/kernel/time.c |   81 
 arch/powerpc/platforms/powernv/Kconfig |1 
 4 files changed, 86 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h
index c1f2676..936be0d 100644
--- a/arch/powerpc/include/asm/time.h
+++ b/arch/powerpc/include/asm/time.h
@@ -24,14 +24,17 @@ extern unsigned long tb_ticks_per_jiffy;
 extern unsigned long tb_ticks_per_usec;
 extern unsigned long tb_ticks_per_sec;
 extern struct clock_event_device decrementer_clockevent;
+extern struct clock_event_device broadcast_clockevent;
 
 struct rtc_time;
 extern void to_tm(int tim, struct rtc_time * tm);
 extern void GregorianDay(struct rtc_time *tm);
+extern void decrementer_timer_interrupt(void);
 
 extern void generic_calibrate_decr(void);
 
 extern void set_dec_cpu6(unsigned int val);
+extern int bc_cpu;
 
 /* Some sane defaults: 125 MHz timebase, 1GHz processor */
 extern unsigned long ppc_proc_freq;
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 6a68ca4..d3b7014 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -114,7 +114,7 @@ int smp_generic_kick_cpu(int nr)
 
 static irqreturn_t timer_action(int irq, void *data)
 {
-   timer_interrupt();
+   decrementer_timer_interrupt();
return IRQ_HANDLED;
 }
 
@@ -223,7 +223,7 @@ irqreturn_t smp_ipi_demux(void)
 
 #ifdef __BIG_ENDIAN
if (all & (1 << (24 - 8 * PPC_MSG_TIMER)))
-   timer_interrupt();
+   decrementer_timer_interrupt();
if (all & (1 << (24 - 8 * PPC_MSG_RESCHEDULE)))
scheduler_ipi();
if (all & (1 << (24 - 8 * PPC_MSG_CALL_FUNC_SINGLE)))
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 65ab9e9..7e858e1 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -42,6 +42,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -97,8 +98,11 @@ static struct clocksource clocksource_timebase = {
 
 static int decrementer_set_next_event(unsigned long evt,
  struct clock_event_device *dev);
+static int broadcast_set_next_event(unsigned long evt,
+ struct clock_event_device *dev);
 static void decrementer_set_mode(enum clock_event_mode mode,
 struct clock_event_device *dev);
+static void decrementer_timer_broadcast(const struct cpumask *mask);
 
 struct clock_event_device decrementer_clockevent = {
.name   = "decrementer",
@@ -106,13 +110,26 @@ struct clock_event_device decrementer_clockevent = {
.irq= 0,
.set_next_event = decrementer_set_next_eve

[Resend RFC PATCH 2/5] powerpc: Implement broadcast timer interrupt as an IPI message

2013-07-25 Thread Preeti U Murthy
From: Srivatsa S. Bhat 

For scalability and performance reasons, we want the broadcast timer
interrupts to be handled as efficiently as possible. Fixed IPI messages
are one of the most efficient mechanisms available - they are faster
than the smp_call_function mechanism because the IPI handlers are fixed
and hence they don't involve costly operations such as adding IPI handlers
to the target CPU's function queue, acquiring locks for synchronization etc.

Luckily we have an unused IPI message slot, so use that to implement
broadcast timer interrupts efficiently.

Signed-off-by: Srivatsa S. Bhat 
Signed-off-by: Preeti U Murthy 
---

 arch/powerpc/include/asm/smp.h  |3 ++-
 arch/powerpc/kernel/smp.c   |   19 +++
 arch/powerpc/platforms/cell/interrupt.c |2 +-
 arch/powerpc/platforms/ps3/smp.c|2 +-
 4 files changed, 19 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
index 51bf017..d877b69 100644
--- a/arch/powerpc/include/asm/smp.h
+++ b/arch/powerpc/include/asm/smp.h
@@ -117,7 +117,7 @@ extern int cpu_to_core_id(int cpu);
  *
  * Make sure this matches openpic_request_IPIs in open_pic.c, or what shows up
  * in /proc/interrupts will be wrong!!! --Troy */
-#define PPC_MSG_UNUSED 0
+#define PPC_MSG_TIMER  0
 #define PPC_MSG_RESCHEDULE  1
 #define PPC_MSG_CALL_FUNC_SINGLE   2
 #define PPC_MSG_DEBUGGER_BREAK  3
@@ -190,6 +190,7 @@ extern struct smp_ops_t *smp_ops;
 
 extern void arch_send_call_function_single_ipi(int cpu);
 extern void arch_send_call_function_ipi_mask(const struct cpumask *mask);
+extern void arch_send_tick_broadcast(const struct cpumask *mask);
 
 /* Definitions relative to the secondary CPU spin loop
  * and entry point. Not all of them exist on both 32 and
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index bc41e9f..6a68ca4 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -35,6 +35,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -111,9 +112,9 @@ int smp_generic_kick_cpu(int nr)
 }
 #endif /* CONFIG_PPC64 */
 
-static irqreturn_t unused_action(int irq, void *data)
+static irqreturn_t timer_action(int irq, void *data)
 {
-   /* This slot is unused and hence available for use, if needed */
+   timer_interrupt();
return IRQ_HANDLED;
 }
 
@@ -144,14 +145,14 @@ static irqreturn_t debug_ipi_action(int irq, void *data)
 }
 
 static irq_handler_t smp_ipi_action[] = {
-   [PPC_MSG_UNUSED] =  unused_action, /* Slot available for future use */
+   [PPC_MSG_TIMER] =  timer_action,
[PPC_MSG_RESCHEDULE] = reschedule_action,
[PPC_MSG_CALL_FUNC_SINGLE] = call_function_single_action,
[PPC_MSG_DEBUGGER_BREAK] = debug_ipi_action,
 };
 
 const char *smp_ipi_name[] = {
-   [PPC_MSG_UNUSED] =  "ipi unused",
+   [PPC_MSG_TIMER] =  "ipi timer",
[PPC_MSG_RESCHEDULE] = "ipi reschedule",
[PPC_MSG_CALL_FUNC_SINGLE] = "ipi call function single",
[PPC_MSG_DEBUGGER_BREAK] = "ipi debugger",
@@ -221,6 +222,8 @@ irqreturn_t smp_ipi_demux(void)
all = xchg(&info->messages, 0);
 
 #ifdef __BIG_ENDIAN
+   if (all & (1 << (24 - 8 * PPC_MSG_TIMER)))
+   timer_interrupt();
if (all & (1 << (24 - 8 * PPC_MSG_RESCHEDULE)))
scheduler_ipi();
if (all & (1 << (24 - 8 * PPC_MSG_CALL_FUNC_SINGLE)))
@@ -266,6 +269,14 @@ void arch_send_call_function_ipi_mask(const struct cpumask *mask)
do_message_pass(cpu, PPC_MSG_CALL_FUNC_SINGLE);
 }
 
+void arch_send_tick_broadcast(const struct cpumask *mask)
+{
+   unsigned int cpu;
+
+   for_each_cpu(cpu, mask)
+   do_message_pass(cpu, PPC_MSG_TIMER);
+}
+
 #if defined(CONFIG_DEBUGGER) || defined(CONFIG_KEXEC)
 void smp_send_debugger_break(void)
 {
diff --git a/arch/powerpc/platforms/cell/interrupt.c b/arch/powerpc/platforms/cell/interrupt.c
index 28166e4..1359113 100644
--- a/arch/powerpc/platforms/cell/interrupt.c
+++ b/arch/powerpc/platforms/cell/interrupt.c
@@ -213,7 +213,7 @@ static void iic_request_ipi(int msg)
 
 void iic_request_IPIs(void)
 {
-   iic_request_ipi(PPC_MSG_UNUSED);
+   iic_request_ipi(PPC_MSG_TIMER);
iic_request_ipi(PPC_MSG_RESCHEDULE);
iic_request_ipi(PPC_MSG_CALL_FUNC_SINGLE);
iic_request_ipi(PPC_MSG_DEBUGGER_BREAK);
diff --git a/arch/powerpc/platforms/ps3/smp.c b/arch/powerpc/platforms/ps3/smp.c
index 488f069..5cb742a 100644
--- a/arch/powerpc/platforms/ps3/smp.c
+++ b/arch/powerpc/platforms/ps3/smp.c
@@ -74,7 +74,7 @@ static int __init ps3_smp_probe(void)
* to index needs to be setup.
*/
 
-   BUILD_BUG_ON(PPC_MSG_UNUSED   != 0);
+   BUILD_

[Resend RFC PATCH 0/5] cpuidle/ppc: Timer offload framework to support deep idle states

2013-07-25 Thread Preeti U Murthy
On PowerPC, when CPUs enter deep idle states, their local timers are
switched off. The responsibility of waking them up at their next timer event
needs to be handed over to an external device. On PowerPC, we do not have an
external device equivalent to the HPET, which is what architectures like x86
currently use for this. Instead we assign the local timer of one of the CPUs
to do this job.

This patchset is an attempt to make use of the existing timer broadcast
framework in the kernel to meet the above requirement, except that the tick
broadcast device is the local timer of the boot CPU.
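
In rough pseudocode, the flow on a CPU about to enter a deep idle state looks
like the sketch below; the function name around the nap is an illustrative
placeholder, while the clockevents_notify() calls are the standard broadcast
enter/exit notifications used later in this series:

/* Illustrative sketch only; enter_deep_idle() is a made-up placeholder. */
static void deep_idle_entry_sketch(int cpu)
{
        /* Hand this CPU's next timer event over to the broadcast CPU. */
        clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);

        enter_deep_idle();      /* local timer (decrementer) is off in here */

        /* Woken by an IPI from the broadcast CPU (the boot CPU's timer). */
        clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
}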

This patch series is ported on top of 3.11-rc1 + the cpuidle driver backend
for powernv posted by Deepthi Dharwar recently. The current design and
implementation supports the ONESHOT tick mode. It does not yet support
the PERIODIC tick mode. This patch is tested with NOHZ_FULL off.

Patch[1/5], Patch[2/5]: optimize the broadcast mechanism on ppc.
Patch[3/5]: Introduces the core of the timer offload framework on powerpc.
Patch[4/5]: The cpu doing the broadcast should not go into tickless idle.
Patch[5/5]: Add a deep idle state to the cpuidle state table on powernv.

Patch[5/5] is the patch that ultimately makes use of the timer offload
framework that the patches Patch[1/5] to Patch[4/5] build.

This patch series is being resent to clarify certain ambiguities in the patch
descriptions from the previous post. Discussion around this:
https://lkml.org/lkml/2013/7/25/754

---

Preeti U Murthy (3):
  cpuidle/ppc: Add timer offload framework to support deep idle states
  cpuidle/ppc: CPU goes tickless if there are no arch-specific constraints
  cpuidle/ppc: Add longnap state to the idle states on powernv

Srivatsa S. Bhat (2):
  powerpc: Free up the IPI message slot of ipi call function (PPC_MSG_CALL_FUNC)
  powerpc: Implement broadcast timer interrupt as an IPI message


 arch/powerpc/include/asm/smp.h  |3 +
 arch/powerpc/include/asm/time.h |3 +
 arch/powerpc/kernel/smp.c   |   23 --
 arch/powerpc/kernel/time.c  |   86 +++
 arch/powerpc/platforms/cell/interrupt.c |2 -
 arch/powerpc/platforms/powernv/Kconfig  |1 
 arch/powerpc/platforms/powernv/processor_idle.c |   48 +
 arch/powerpc/platforms/ps3/smp.c|2 -
 kernel/time/tick-sched.c|7 ++
 9 files changed, 163 insertions(+), 12 deletions(-)



Re: [RFC PATCH 4/5] cpuidle/ppc: CPU goes tickless if there are no arch-specific constraints

2013-07-25 Thread Preeti U Murthy
Hi Frederic,

I apologise for the confusion. As Paul pointed out, the usage of the term
lapic is perhaps causing a large amount of confusion. So please see the
clarification below; maybe it will help answer your question.

On 07/26/2013 08:09 AM, Preeti U Murthy wrote:
> Hi Frederic,
> 
> On 07/25/2013 07:00 PM, Frederic Weisbecker wrote:
>> On Thu, Jul 25, 2013 at 02:33:02PM +0530, Preeti U Murthy wrote:
>>> In the current design of timer offload framework, the broadcast cpu should
>>> *not* go into tickless idle so as to avoid missed wakeups on CPUs in deep 
>>> idle states.
>>>
>>> Since we prevent the CPUs entering deep idle states from programming the 
>>> lapic of the
>>> broadcast cpu for their respective next local events for reasons mentioned 
>>> in
>>> PATCH[3/5], the broadcast CPU checks if there are any CPUs to be woken up 
>>> during
>>> each of its timer interrupt programmed to its local events.
>>>
>>> With tickless idle, the broadcast CPU might not get a timer interrupt till 
>>> after
>>> many ticks which can result in missed wakeups on CPUs in deep idle states. 
>>> By
>>> disabling tickless idle, worst case, the tick_sched hrtimer will trigger a
>>> timer interrupt every period to check for broadcast.
>>>
>>> However the current setup of tickless idle does not let us make the choice
>>> of tickless on individual cpus. NOHZ_MODE_INACTIVE which disables tickless 
>>> idle,
>>> is a system wide setting. Hence resort to an arch specific call to check if 
>>> a cpu
>>> can go into tickless idle.
>>
>> Hi Preeti,
>>
>> I'm not exactly sure why you can't enter the broadcast CPU in dynticks idle 
>> mode.
>> I read in the previous patch that's because in dynticks idle mode the 
>> broadcast
>> CPU deactivates its lapic so it doesn't receive the IPI. But may be I 
>> misunderstood.
>> Anyway that's not good for powersaving.

Firstly, when CPUs enter deep idle states, their local clock event
devices get switched off. In the case of powerpc, the local clock event
device is the decrementer. Hence such CPUs *do not get timer interrupts*
but are still *capable of taking IPIs.*

So we need to ensure that some other CPU, in this case the broadcast
CPU, makes note of when the timer interrupt of the CPU in such deep idle
states is to trigger, and at that moment issues an IPI to that CPU.

*The broadcast CPU, however, should have its decrementer always active*,
meaning it is disallowed from entering deep idle states, where the
decrementer switches off, precisely because the other idling CPUs bank
on it for the above-mentioned reason.

> *The lapic of a broadcast CPU is active always*. Say CPUX, wants the
> broadcast CPU to wake it up at timeX.  Since we cannot program the lapic
> of a remote CPU, CPUX will need to send an IPI to the broadcast CPU,
> asking it to program its lapic to fire at timeX so as to wake up CPUX.
> *With multiple CPUs the overhead of sending IPI, could result in
> performance bottlenecks and may not scale well.*

Rewording the above: the decrementer of the broadcast CPU is always
active. Since we cannot program the clock event device of a remote CPU,
CPUX will need to send an IPI to the broadcast CPU (which the broadcast
CPU is very well capable of receiving), asking it to program its
decrementer to fire at timeX so as to wake up CPUX.
*With multiple CPUs the overhead of sending IPIs could result in
performance bottlenecks and may not scale well.*
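
In other words, instead of being programmed remotely, the broadcast CPU
simply piggybacks a check on each of its own timer interrupts. Roughly, in
illustrative pseudocode (the names deep_idle_cpus and bc_next_event are made
up here, not the actual implementation):

/* Illustrative sketch only; deep_idle_cpus and bc_next_event are made-up names. */
static void broadcast_check_sketch(void)
{
        ktime_t now = ktime_get();
        struct cpumask to_wake;
        int cpu;

        cpumask_clear(&to_wake);

        /* Scan the CPUs whose decrementers are off and whose recorded
         * next event has already passed. */
        for_each_cpu(cpu, &deep_idle_cpus)
                if (ktime_compare(per_cpu(bc_next_event, cpu), now) <= 0)
                        cpumask_set_cpu(cpu, &to_wake);

        /* Kick them with the timer IPI introduced in PATCH[2/5]. */
        if (!cpumask_empty(&to_wake))
                arch_send_tick_broadcast(&to_wake);
}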

> 
> Hence the workaround is that the broadcast CPU on each of its timer
> interrupt checks if any of the next timer event of a CPU in deep idle
> state has expired, which can very well be found from dev->next_event of
> that CPU. For example the timeX that has been mentioned above has
> expired. If so the broadcast handler is called to send an IPI to the
> idling CPU to wake it up.
> 
> *If the broadcast CPU, is in tickless idle, its timer interrupt could be
> many ticks away. It could miss waking up a CPU in deep idle*, if its
> wakeup is much before this timer interrupt of the broadcast CPU. But
> without tickless idle, atleast at each period we are assured of a timer
> interrupt. At which time broadcast handling is done as stated in the
> previous paragraph and we will not miss wakeup of CPUs in deep idle states.
> 
> Yeah it is true that not allowing the broadcast CPU to enter tickless
> idle is bad for power savings, but for the use case that we are aiming
> at in this patch series, the current approach seems to be the best, with
> minimal trade-offs in performance, power savings, scalability and no
> change in the broadcast framework that exists today in the kernel.
> 

Regards
Preeti U Murthy



Re: [RFC PATCH 4/5] cpuidle/ppc: CPU goes tickless if there are no arch-specific constraints

2013-07-25 Thread Preeti U Murthy
Hi Paul,

On 07/26/2013 08:49 AM, Paul Mackerras wrote:
> On Fri, Jul 26, 2013 at 08:09:23AM +0530, Preeti U Murthy wrote:
>> Hi Frederic,
>>
>> On 07/25/2013 07:00 PM, Frederic Weisbecker wrote:
>>> Hi Preeti,
>>>
>>> I'm not exactly sure why you can't enter the broadcast CPU in dynticks idle 
>>> mode.
>>> I read in the previous patch that's because in dynticks idle mode the 
>>> broadcast
>>> CPU deactivates its lapic so it doesn't receive the IPI. But may be I 
>>> misunderstood.
>>> Anyway that's not good for powersaving.
>>
>> Let me elaborate. The CPUs in deep idle states have their lapics
>> deactivated. This means the next timer event which would typically have
>> been taken care of by a lapic firing at the appropriate moment does not
>> get taken care of in deep idle states, due to the lapic being switched off.
> 
> I really don't think it's helpful to use the term "lapic" in
> connection with Power systems.  There is nothing that is called a
> "lapic" in a Power machine.  The nearest equivalent of the LAPIC on
> x86 machines is the ICP, the interrupt-controller presentation
> element, of which there is one per CPU thread.
> 
> However, I don't believe the ICP gets disabled in deep sleep modes.
> What does get disabled is the "decrementer", which is a register that
> normally counts down (at 512MHz) and generates an exception when it is
> negative.  The decrementer *is* part of the CPU core, unlike the ICP.
> That's why we can still get IPIs but not timer interrupts.
> 
> Please reword your patch description to not use the term "lapic",
> which is not defined in the Power context and is therefore just
> causing confusion.

Noted. Thank you :) I will probably send out a fresh patchset with the
appropriate changelog to avoid this confusion?
> 
> Paul.
> 
Regards
Preeti U murthy



Re: [RFC PATCH 4/5] cpuidle/ppc: CPU goes tickless if there are no arch-specific constraints

2013-07-25 Thread Preeti U Murthy
Hi Frederic,

On 07/25/2013 07:00 PM, Frederic Weisbecker wrote:
> On Thu, Jul 25, 2013 at 02:33:02PM +0530, Preeti U Murthy wrote:
>> In the current design of timer offload framework, the broadcast cpu should
>> *not* go into tickless idle so as to avoid missed wakeups on CPUs in deep 
>> idle states.
>>
>> Since we prevent the CPUs entering deep idle states from programming the 
>> lapic of the
>> broadcast cpu for their respective next local events for reasons mentioned in
>> PATCH[3/5], the broadcast CPU checks if there are any CPUs to be woken up 
>> during
>> each of its timer interrupt programmed to its local events.
>>
>> With tickless idle, the broadcast CPU might not get a timer interrupt till 
>> after
>> many ticks which can result in missed wakeups on CPUs in deep idle states. By
>> disabling tickless idle, worst case, the tick_sched hrtimer will trigger a
>> timer interrupt every period to check for broadcast.
>>
>> However the current setup of tickless idle does not let us make the choice
>> of tickless on individual cpus. NOHZ_MODE_INACTIVE which disables tickless 
>> idle,
>> is a system wide setting. Hence resort to an arch specific call to check if 
>> a cpu
>> can go into tickless idle.
> 
> Hi Preeti,
> 
> I'm not exactly sure why you can't enter the broadcast CPU in dynticks idle 
> mode.
> I read in the previous patch that's because in dynticks idle mode the 
> broadcast
> CPU deactivates its lapic so it doesn't receive the IPI. But may be I 
> misunderstood.
> Anyway that's not good for powersaving.
> 
> Also when an arch wants to prevent a CPU from entering dynticks idle mode, it 
> typically
> use arch_needs_cpu(). May be that could fit for you as well?

Yes this will suit our requirement perfectly. I will note down this
change for the next version of this patchset. Thank you very much for
pointing this out :)

Regards
Preeti U Murthy



Re: [RFC PATCH 4/5] cpuidle/ppc: CPU goes tickless if there are no arch-specific constraints

2013-07-25 Thread Preeti U Murthy
Hi Frederic,

On 07/25/2013 07:00 PM, Frederic Weisbecker wrote:
> On Thu, Jul 25, 2013 at 02:33:02PM +0530, Preeti U Murthy wrote:
>> In the current design of timer offload framework, the broadcast cpu should
>> *not* go into tickless idle so as to avoid missed wakeups on CPUs in deep 
>> idle states.
>>
>> Since we prevent the CPUs entering deep idle states from programming the 
>> lapic of the
>> broadcast cpu for their respective next local events for reasons mentioned in
>> PATCH[3/5], the broadcast CPU checks if there are any CPUs to be woken up 
>> during
>> each of its timer interrupt programmed to its local events.
>>
>> With tickless idle, the broadcast CPU might not get a timer interrupt till 
>> after
>> many ticks which can result in missed wakeups on CPUs in deep idle states. By
>> disabling tickless idle, worst case, the tick_sched hrtimer will trigger a
>> timer interrupt every period to check for broadcast.
>>
>> However the current setup of tickless idle does not let us make the choice
>> of tickless on individual cpus. NOHZ_MODE_INACTIVE which disables tickless 
>> idle,
>> is a system wide setting. Hence resort to an arch specific call to check if 
>> a cpu
>> can go into tickless idle.
> 
> Hi Preeti,
> 
> I'm not exactly sure why you can't enter the broadcast CPU in dynticks idle 
> mode.
> I read in the previous patch that's because in dynticks idle mode the 
> broadcast
> CPU deactivates its lapic so it doesn't receive the IPI. But may be I 
> misunderstood.
> Anyway that's not good for powersaving.

Let me elaborate. The CPUs in deep idle states have their lapics
deactivated. This means the next timer event, which would typically have
been taken care of by the lapic firing at the appropriate moment, does not
get taken care of in deep idle states, since the lapic is switched off.

Hence such CPUs offload their next timer event to the broadcast CPU,
which should *not* enter deep idle states. The broadcast CPU has the
responsibility of waking the CPUs in deep idle states.

*The lapic of a broadcast CPU is always active*. Say CPUX wants the
broadcast CPU to wake it up at timeX. Since we cannot program the lapic
of a remote CPU, CPUX will need to send an IPI to the broadcast CPU,
asking it to program its lapic to fire at timeX so as to wake up CPUX.
*With multiple CPUs the overhead of sending IPIs could result in
performance bottlenecks and may not scale well.*

Hence the workaround is that the broadcast CPU, on each of its timer
interrupts, checks if the next timer event of any CPU in a deep idle
state has expired, which can very well be found from dev->next_event of
that CPU. For example, the timeX mentioned above has expired. If so,
the broadcast handler is called to send an IPI to the idling CPU to
wake it up.

*If the broadcast CPU is in tickless idle, its timer interrupt could be
many ticks away. It could miss waking up a CPU in deep idle*, if that
CPU's wakeup is due well before this timer interrupt of the broadcast CPU.
But without tickless idle, at least at each period we are assured of a
timer interrupt, at which time broadcast handling is done as stated in the
previous paragraph and we will not miss wakeups of CPUs in deep idle states.

Yeah it is true that not allowing the broadcast CPU to enter tickless
idle is bad for power savings, but for the use case that we are aiming
at in this patch series, the current approach seems to be the best, with
minimal trade-offs in performance, power savings, scalability and no
change in the broadcast framework that exists today in the kernel.

> 
> Also when an arch wants to prevent a CPU from entering dynticks idle mode, it 
> typically
> use arch_needs_cpu(). May be that could fit for you as well?

Oh ok thanks :) I will look into this and get back on if we can use it.

Regards
Preeti U Murthy



[RFC PATCH 5/5] cpuidle/ppc: Add longnap state to the idle states on powernv

2013-07-25 Thread Preeti U Murthy
This patch hooks into the existing broadcast framework, with the support that
this patchset introduces for ppc, and the cpuidle driver backend for powernv
(posted recently by Deepthi Dharwar) to add the sleep state as one of the deep
idle states, in which the decrementer is switched off.

However in this patch, we only emulate sleep by going into a state which does
a nap with the decrementer interrupts disabled, termed longnap. This keeps the
focus of this series of patches on the timer broadcast framework for ppc,
which is required as a first step to enable sleep on ppc.

Signed-off-by: Preeti U Murthy 
---

 arch/powerpc/platforms/powernv/processor_idle.c |   48 +++
 1 file changed, 47 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/powernv/processor_idle.c b/arch/powerpc/platforms/powernv/processor_idle.c
index f43ad91a..9aca502 100644
--- a/arch/powerpc/platforms/powernv/processor_idle.c
+++ b/arch/powerpc/platforms/powernv/processor_idle.c
@@ -9,16 +9,18 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
+#include 
 
 struct cpuidle_driver powernv_idle_driver = {
.name = "powernv_idle",
.owner =THIS_MODULE,
 };
 
-#define MAX_IDLE_STATE_COUNT   2
+#define MAX_IDLE_STATE_COUNT   3
 
 static int max_idle_state = MAX_IDLE_STATE_COUNT - 1;
 static struct cpuidle_device __percpu *powernv_cpuidle_devices;
@@ -54,6 +56,43 @@ static int nap_loop(struct cpuidle_device *dev,
return index;
 }
 
+/* Emulate sleep, with long nap.
+ * During sleep, the core does not receive decrementer interrupts.
+ * Emulate sleep using long nap with decrementer interrupts disabled.
+ * This is an initial prototype to test the timer offload framework for ppc.
+ * We will eventually introduce the sleep state once the timer offload
+ * framework for ppc is stable.
+ */
+static int longnap_loop(struct cpuidle_device *dev,
+   struct cpuidle_driver *drv,
+   int index)
+{
+   int cpu = dev->cpu;
+
+   unsigned long lpcr = mfspr(SPRN_LPCR);
+
+   lpcr &= ~(LPCR_MER | LPCR_PECE); /* lpcr[mer] must be 0 */
+
+   /* exit powersave upon external interrupt, but not decrementer
+* interrupt, Emulate sleep.
+*/
+   lpcr |= LPCR_PECE0;
+
+   if (cpu != bc_cpu) {
+   mtspr(SPRN_LPCR, lpcr);
+   clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);
+   power7_nap();
+   clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
+   } else {
+   /* Wakeup on a decrementer interrupt, Do a nap */
+   lpcr |= LPCR_PECE1;
+   mtspr(SPRN_LPCR, lpcr);
+   power7_nap();
+   }
+
+   return index;
+}
+
 /*
  * States for dedicated partition case.
  */
@@ -72,6 +111,13 @@ static struct cpuidle_state powernv_states[MAX_IDLE_STATE_COUNT] = {
.exit_latency = 10,
.target_residency = 100,
.enter = &nap_loop },
+{ /* LongNap */
+   .name = "LongNap",
+   .desc = "LongNap",
+   .flags = CPUIDLE_FLAG_TIME_VALID,
+   .exit_latency = 10,
+   .target_residency = 100,
+   .enter = &longnap_loop },
 };
 
 static int powernv_cpuidle_add_cpu_notifier(struct notifier_block *n,



[RFC PATCH 4/5] cpuidle/ppc: CPU goes tickless if there are no arch-specific constraints

2013-07-25 Thread Preeti U Murthy
In the current design of the timer offload framework, the broadcast cpu should
*not* go into tickless idle so as to avoid missed wakeups on CPUs in deep idle
states.

Since we prevent the CPUs entering deep idle states from programming the lapic
of the broadcast cpu for their respective next local events, for reasons
mentioned in PATCH[3/5], the broadcast CPU checks if there are any CPUs to be
woken up during each of its timer interrupts programmed to its local events.

With tickless idle, the broadcast CPU might not get a timer interrupt till
after many ticks, which can result in missed wakeups on CPUs in deep idle
states. By disabling tickless idle, worst case, the tick_sched hrtimer will
trigger a timer interrupt every period to check for broadcast.

However the current setup of tickless idle does not let us make the choice of
tickless on individual cpus. NOHZ_MODE_INACTIVE, which disables tickless idle,
is a system wide setting. Hence resort to an arch specific call to check if a
cpu can go into tickless idle.
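
The hook relies on the usual weak-symbol override pattern: the generic timer
code carries a default that always allows the tick to be stopped, and the
architecture provides a strong definition that keeps the tick alive on the
broadcast cpu, as the two hunks below do. Put side by side, the pattern is
simply:

/* Generic side (kernel/time/tick-sched.c): overridable default. */
int __weak arch_can_stop_idle_tick(int cpu)
{
        return 1;
}

/* Arch side (arch/powerpc/kernel/time.c): keep the tick on the broadcast cpu. */
int arch_can_stop_idle_tick(int cpu)
{
        return cpu != bc_cpu;
}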

Signed-off-by: Preeti U Murthy 
---

 arch/powerpc/kernel/time.c |5 +
 kernel/time/tick-sched.c   |7 +++
 2 files changed, 12 insertions(+)

diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 8ed0fb3..68a636f 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -862,6 +862,11 @@ static void decrementer_timer_broadcast(const struct cpumask *mask)
arch_send_tick_broadcast(mask);
 }
 
+int arch_can_stop_idle_tick(int cpu)
+{
+   return cpu != bc_cpu;
+}
+
 static void register_decrementer_clockevent(int cpu)
 {
struct clock_event_device *dec = &per_cpu(decrementers, cpu);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 6960172..e9ffa84 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -700,8 +700,15 @@ static void tick_nohz_full_stop_tick(struct tick_sched *ts)
 #endif
 }
 
+int __weak arch_can_stop_idle_tick(int cpu)
+{
+   return 1;
+}
+
 static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
 {
+   if (!arch_can_stop_idle_tick(cpu))
+   return false;
/*
 * If this cpu is offline and it is the one which updates
 * jiffies, then give up the assignment and let it be taken by



[RFC PATCH 3/5] cpuidle/ppc: Add timer offload framework to support deep idle states

2013-07-25 Thread Preeti U Murthy
On ppc, in deep idle states, the lapic of the cpus gets switched off.
Hence make use of the broadcast framework to wake up cpus in sleep state,
except that on ppc, we do not have an external device such as the HPET, but
we use the lapic of a cpu itself as the broadcast device.

Instantiate two different clock event devices, one representing the
lapic and another representing the broadcast device, for each cpu.
The cpu whose broadcast device gets registered is forbidden from entering
deep idle states; it will be referred to as the broadcast cpu in the
changelogs of this patchset for convenience.

For now, only the boot cpu's broadcast device gets registered as a clock event
device along with the lapic. Hence this is the broadcast cpu.

On the broadcast cpu, on each timer interrupt, apart from the regular lapic
event handler the broadcast handler is also called. We avoid the overhead of
programming the lapic for a broadcast event specifically. The reason is to
prevent multiple cpus from sending IPIs to program the lapic of the broadcast
cpu for their next local event each time they go into a deep idle state.

Apart from this there is no change in the way broadcast is handled today. On
a broadcast ipi, the event handler for a timer interrupt is called on the cpu
in deep idle state to handle its local events.

The current design and implementation of the timer offload framework supports
the ONESHOT tick mode but not the PERIODIC mode.
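
Put as a rough sketch, the broadcast-IPI receive side described above just
runs the target CPU's own decrementer event handler; the sketch below assumes
the per-cpu decrementers clock event devices that time.c already keeps, and is
illustrative only, not the exact hunk:

/* Illustrative sketch of the broadcast-IPI receive path; not the exact patch. */
void decrementer_timer_interrupt_sketch(void)
{
        struct clock_event_device *evt = &per_cpu(decrementers, smp_processor_id());

        /* Called from the PPC_MSG_TIMER IPI: run this CPU's local timer
         * event handler, just as a real decrementer interrupt would. */
        if (evt->event_handler)
                evt->event_handler(evt);
}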

Signed-off-by: Preeti U. Murthy 
---

 arch/powerpc/include/asm/time.h|3 +
 arch/powerpc/kernel/smp.c  |4 +-
 arch/powerpc/kernel/time.c |   79 
 arch/powerpc/platforms/powernv/Kconfig |1 
 4 files changed, 84 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h
index c1f2676..936be0d 100644
--- a/arch/powerpc/include/asm/time.h
+++ b/arch/powerpc/include/asm/time.h
@@ -24,14 +24,17 @@ extern unsigned long tb_ticks_per_jiffy;
 extern unsigned long tb_ticks_per_usec;
 extern unsigned long tb_ticks_per_sec;
 extern struct clock_event_device decrementer_clockevent;
+extern struct clock_event_device broadcast_clockevent;
 
 struct rtc_time;
 extern void to_tm(int tim, struct rtc_time * tm);
 extern void GregorianDay(struct rtc_time *tm);
+extern void decrementer_timer_interrupt(void);
 
 extern void generic_calibrate_decr(void);
 
 extern void set_dec_cpu6(unsigned int val);
+extern int bc_cpu;
 
 /* Some sane defaults: 125 MHz timebase, 1GHz processor */
 extern unsigned long ppc_proc_freq;
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 6a68ca4..d3b7014 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -114,7 +114,7 @@ int smp_generic_kick_cpu(int nr)
 
 static irqreturn_t timer_action(int irq, void *data)
 {
-   timer_interrupt();
+   decrementer_timer_interrupt();
return IRQ_HANDLED;
 }
 
@@ -223,7 +223,7 @@ irqreturn_t smp_ipi_demux(void)
 
 #ifdef __BIG_ENDIAN
if (all & (1 << (24 - 8 * PPC_MSG_TIMER)))
-   timer_interrupt();
+   decrementer_timer_interrupt();
if (all & (1 << (24 - 8 * PPC_MSG_RESCHEDULE)))
scheduler_ipi();
if (all & (1 << (24 - 8 * PPC_MSG_CALL_FUNC_SINGLE)))
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 65ab9e9..8ed0fb3 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -42,6 +42,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -97,8 +98,11 @@ static struct clocksource clocksource_timebase = {
 
 static int decrementer_set_next_event(unsigned long evt,
  struct clock_event_device *dev);
+static int broadcast_set_next_event(unsigned long evt,
+ struct clock_event_device *dev);
 static void decrementer_set_mode(enum clock_event_mode mode,
 struct clock_event_device *dev);
+static void decrementer_timer_broadcast(const struct cpumask *mask);
 
 struct clock_event_device decrementer_clockevent = {
.name   = "decrementer",
@@ -106,13 +110,26 @@ struct clock_event_device decrementer_clockevent = {
.irq= 0,
.set_next_event = decrementer_set_next_event,
.set_mode   = decrementer_set_mode,
-   .features   = CLOCK_EVT_FEAT_ONESHOT,
+   .broadcast  = decrementer_timer_broadcast,
+   .features   = CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_ONESHOT,
 };
 EXPORT_SYMBOL(decrementer_clockevent);
 
+struct clock_event_device broadcast_clockevent = {
+   .name   = "broadcast",
+   .rating = 200,
+   .irq= 0,
+   .set_next_event = broadcast_set_next_event,
+   .set_mode   = decrementer_set_mode,
+   .feat

[RFC PATCH 2/5] powerpc: Implement broadcast timer interrupt as an IPI message

2013-07-25 Thread Preeti U Murthy
From: Srivatsa S. Bhat 

For scalability and performance reasons, we want the broadcast timer
interrupts to be handled as efficiently as possible. Fixed IPI messages
are one of the most efficient mechanisms available - they are faster
than the smp_call_function mechanism because the IPI handlers are fixed
and hence they don't involve costly operations such as adding IPI handlers
to the target CPU's function queue, acquiring locks for synchronization etc.

Luckily we have an unused IPI message slot, so use that to implement
broadcast timer interrupts efficiently.

Signed-off-by: Srivatsa S. Bhat 
Signed-off-by: Preeti U Murthy 
---

 arch/powerpc/include/asm/smp.h  |3 ++-
 arch/powerpc/kernel/smp.c   |   19 +++
 arch/powerpc/platforms/cell/interrupt.c |2 +-
 arch/powerpc/platforms/ps3/smp.c|2 +-
 4 files changed, 19 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
index 51bf017..d877b69 100644
--- a/arch/powerpc/include/asm/smp.h
+++ b/arch/powerpc/include/asm/smp.h
@@ -117,7 +117,7 @@ extern int cpu_to_core_id(int cpu);
  *
  * Make sure this matches openpic_request_IPIs in open_pic.c, or what shows up
  * in /proc/interrupts will be wrong!!! --Troy */
-#define PPC_MSG_UNUSED 0
+#define PPC_MSG_TIMER  0
 #define PPC_MSG_RESCHEDULE  1
 #define PPC_MSG_CALL_FUNC_SINGLE   2
 #define PPC_MSG_DEBUGGER_BREAK  3
@@ -190,6 +190,7 @@ extern struct smp_ops_t *smp_ops;
 
 extern void arch_send_call_function_single_ipi(int cpu);
 extern void arch_send_call_function_ipi_mask(const struct cpumask *mask);
+extern void arch_send_tick_broadcast(const struct cpumask *mask);
 
 /* Definitions relative to the secondary CPU spin loop
  * and entry point. Not all of them exist on both 32 and
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index bc41e9f..6a68ca4 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -35,6 +35,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -111,9 +112,9 @@ int smp_generic_kick_cpu(int nr)
 }
 #endif /* CONFIG_PPC64 */
 
-static irqreturn_t unused_action(int irq, void *data)
+static irqreturn_t timer_action(int irq, void *data)
 {
-   /* This slot is unused and hence available for use, if needed */
+   timer_interrupt();
return IRQ_HANDLED;
 }
 
@@ -144,14 +145,14 @@ static irqreturn_t debug_ipi_action(int irq, void *data)
 }
 
 static irq_handler_t smp_ipi_action[] = {
-   [PPC_MSG_UNUSED] =  unused_action, /* Slot available for future use */
+   [PPC_MSG_TIMER] =  timer_action,
[PPC_MSG_RESCHEDULE] = reschedule_action,
[PPC_MSG_CALL_FUNC_SINGLE] = call_function_single_action,
[PPC_MSG_DEBUGGER_BREAK] = debug_ipi_action,
 };
 
 const char *smp_ipi_name[] = {
-   [PPC_MSG_UNUSED] =  "ipi unused",
+   [PPC_MSG_TIMER] =  "ipi timer",
[PPC_MSG_RESCHEDULE] = "ipi reschedule",
[PPC_MSG_CALL_FUNC_SINGLE] = "ipi call function single",
[PPC_MSG_DEBUGGER_BREAK] = "ipi debugger",
@@ -221,6 +222,8 @@ irqreturn_t smp_ipi_demux(void)
all = xchg(&info->messages, 0);
 
 #ifdef __BIG_ENDIAN
+   if (all & (1 << (24 - 8 * PPC_MSG_TIMER)))
+   timer_interrupt();
if (all & (1 << (24 - 8 * PPC_MSG_RESCHEDULE)))
scheduler_ipi();
if (all & (1 << (24 - 8 * PPC_MSG_CALL_FUNC_SINGLE)))
@@ -266,6 +269,14 @@ void arch_send_call_function_ipi_mask(const struct cpumask *mask)
do_message_pass(cpu, PPC_MSG_CALL_FUNC_SINGLE);
 }
 
+void arch_send_tick_broadcast(const struct cpumask *mask)
+{
+   unsigned int cpu;
+
+   for_each_cpu(cpu, mask)
+   do_message_pass(cpu, PPC_MSG_TIMER);
+}
+
 #if defined(CONFIG_DEBUGGER) || defined(CONFIG_KEXEC)
 void smp_send_debugger_break(void)
 {
diff --git a/arch/powerpc/platforms/cell/interrupt.c b/arch/powerpc/platforms/cell/interrupt.c
index 28166e4..1359113 100644
--- a/arch/powerpc/platforms/cell/interrupt.c
+++ b/arch/powerpc/platforms/cell/interrupt.c
@@ -213,7 +213,7 @@ static void iic_request_ipi(int msg)
 
 void iic_request_IPIs(void)
 {
-   iic_request_ipi(PPC_MSG_UNUSED);
+   iic_request_ipi(PPC_MSG_TIMER);
iic_request_ipi(PPC_MSG_RESCHEDULE);
iic_request_ipi(PPC_MSG_CALL_FUNC_SINGLE);
iic_request_ipi(PPC_MSG_DEBUGGER_BREAK);
diff --git a/arch/powerpc/platforms/ps3/smp.c b/arch/powerpc/platforms/ps3/smp.c
index 488f069..5cb742a 100644
--- a/arch/powerpc/platforms/ps3/smp.c
+++ b/arch/powerpc/platforms/ps3/smp.c
@@ -74,7 +74,7 @@ static int __init ps3_smp_probe(void)
* to index needs to be setup.
*/
 
-   BUILD_BUG_ON(PPC_MSG_UNUSED   != 0);
+   BUILD_

[RFC PATCH 0/5] cpuidle/ppc: Timer offload framework to support deep idle states

2013-07-25 Thread Preeti U Murthy
On PowerPC, when CPUs enter deep idle states, their local timers are
switched off. The responsibility of waking them up at their next timer event
needs to be handed over to an external device. On PowerPC, we do not have an
external device equivalent to the HPET, which is what architectures like x86
currently use for this. Instead we assign the local timer of one of the CPUs
to do this job.

This patchset is an attempt to make use of the existing timer broadcast
framework in the kernel to meet the above requirement, except that the tick
broadcast device is the local timer of the boot CPU.

This patch series is ported on top of 3.11-rc1 + the cpuidle driver backend
for powernv posted by Deepthi Dharwar recently. The current design and
implementation supports the ONESHOT tick mode. It does not yet support
the PERIODIC tick mode. This patch is tested with NOHZ_FULL off.

Patch[1/5], Patch[2/5]: optimize the broadcast mechanism on ppc.
Patch[3/5]: Introduces the core of the timer offload framework on powerpc.
Patch[4/5]: The cpu doing the broadcast should not go into tickless idle.
Patch[5/5]: Add a deep idle state to the cpuidle state table on powernv.

Patch[5/5] is the patch that ultimately makes use of the timer offload
framework that the patches Patch[1/5] to Patch[4/5] build.

---

Preeti U Murthy (3):
  cpuidle/ppc: Add timer offload framework to support deep idle states
  cpuidle/ppc: CPU goes tickless if there are no arch-specific constraints
  cpuidle/ppc: Add longnap state to the idle states on powernv

Srivatsa S. Bhat (2):
  powerpc: Free up the IPI message slot of ipi call function (PPC_MSG_CALL_FUNC)
  powerpc: Implement broadcast timer interrupt as an IPI message


 arch/powerpc/include/asm/smp.h  |3 +
 arch/powerpc/include/asm/time.h |3 +
 arch/powerpc/kernel/smp.c   |   23 --
 arch/powerpc/kernel/time.c  |   84 +++
 arch/powerpc/platforms/cell/interrupt.c |2 -
 arch/powerpc/platforms/powernv/Kconfig  |1 
 arch/powerpc/platforms/powernv/processor_idle.c |   48 +
 arch/powerpc/platforms/ps3/smp.c|2 -
 kernel/time/tick-sched.c|7 ++
 9 files changed, 161 insertions(+), 12 deletions(-)

-- 
Signature



[RFC PATCH 1/5] powerpc: Free up the IPI message slot of ipi call function (PPC_MSG_CALL_FUNC)

2013-07-25 Thread Preeti U Murthy
From: Srivatsa S. Bhat 

The IPI handlers for both PPC_MSG_CALL_FUNC and PPC_MSG_CALL_FUNC_SINGLE
map to a common implementation - generic_smp_call_function_single_interrupt().
So, we can consolidate them and save one of the IPI message slots (which are
precious, since only 4 of those slots are available).

So, implement the functionality of PPC_MSG_CALL_FUNC using
PPC_MSG_CALL_FUNC_SINGLE itself and release its IPI message slot, so that it
can be used for something else in the future, if desired.

Signed-off-by: Srivatsa S. Bhat 
Signed-off-by: Preeti U Murthy 
---

 arch/powerpc/include/asm/smp.h  |2 +-
 arch/powerpc/kernel/smp.c   |   12 +---
 arch/powerpc/platforms/cell/interrupt.c |2 +-
 arch/powerpc/platforms/ps3/smp.c|2 +-
 4 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
index ffbaabe..51bf017 100644
--- a/arch/powerpc/include/asm/smp.h
+++ b/arch/powerpc/include/asm/smp.h
@@ -117,7 +117,7 @@ extern int cpu_to_core_id(int cpu);
  *
  * Make sure this matches openpic_request_IPIs in open_pic.c, or what shows up
  * in /proc/interrupts will be wrong!!! --Troy */
-#define PPC_MSG_CALL_FUNCTION   0
+#define PPC_MSG_UNUSED 0
 #define PPC_MSG_RESCHEDULE  1
 #define PPC_MSG_CALL_FUNC_SINGLE   2
 #define PPC_MSG_DEBUGGER_BREAK  3
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 38b0ba6..bc41e9f 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -111,9 +111,9 @@ int smp_generic_kick_cpu(int nr)
 }
 #endif /* CONFIG_PPC64 */
 
-static irqreturn_t call_function_action(int irq, void *data)
+static irqreturn_t unused_action(int irq, void *data)
 {
-   generic_smp_call_function_interrupt();
+   /* This slot is unused and hence available for use, if needed */
return IRQ_HANDLED;
 }
 
@@ -144,14 +144,14 @@ static irqreturn_t debug_ipi_action(int irq, void *data)
 }
 
 static irq_handler_t smp_ipi_action[] = {
-   [PPC_MSG_CALL_FUNCTION] =  call_function_action,
+   [PPC_MSG_UNUSED] =  unused_action, /* Slot available for future use */
[PPC_MSG_RESCHEDULE] = reschedule_action,
[PPC_MSG_CALL_FUNC_SINGLE] = call_function_single_action,
[PPC_MSG_DEBUGGER_BREAK] = debug_ipi_action,
 };
 
 const char *smp_ipi_name[] = {
-   [PPC_MSG_CALL_FUNCTION] =  "ipi call function",
+   [PPC_MSG_UNUSED] =  "ipi unused",
[PPC_MSG_RESCHEDULE] = "ipi reschedule",
[PPC_MSG_CALL_FUNC_SINGLE] = "ipi call function single",
[PPC_MSG_DEBUGGER_BREAK] = "ipi debugger",
@@ -221,8 +221,6 @@ irqreturn_t smp_ipi_demux(void)
all = xchg(&info->messages, 0);
 
 #ifdef __BIG_ENDIAN
-   if (all & (1 << (24 - 8 * PPC_MSG_CALL_FUNCTION)))
-   generic_smp_call_function_interrupt();
if (all & (1 << (24 - 8 * PPC_MSG_RESCHEDULE)))
scheduler_ipi();
if (all & (1 << (24 - 8 * PPC_MSG_CALL_FUNC_SINGLE)))
@@ -265,7 +263,7 @@ void arch_send_call_function_ipi_mask(const struct cpumask *mask)
unsigned int cpu;
 
for_each_cpu(cpu, mask)
-   do_message_pass(cpu, PPC_MSG_CALL_FUNCTION);
+   do_message_pass(cpu, PPC_MSG_CALL_FUNC_SINGLE);
 }
 
 #if defined(CONFIG_DEBUGGER) || defined(CONFIG_KEXEC)
diff --git a/arch/powerpc/platforms/cell/interrupt.c b/arch/powerpc/platforms/cell/interrupt.c
index 2d42f3b..28166e4 100644
--- a/arch/powerpc/platforms/cell/interrupt.c
+++ b/arch/powerpc/platforms/cell/interrupt.c
@@ -213,7 +213,7 @@ static void iic_request_ipi(int msg)
 
 void iic_request_IPIs(void)
 {
-   iic_request_ipi(PPC_MSG_CALL_FUNCTION);
+   iic_request_ipi(PPC_MSG_UNUSED);
iic_request_ipi(PPC_MSG_RESCHEDULE);
iic_request_ipi(PPC_MSG_CALL_FUNC_SINGLE);
iic_request_ipi(PPC_MSG_DEBUGGER_BREAK);
diff --git a/arch/powerpc/platforms/ps3/smp.c b/arch/powerpc/platforms/ps3/smp.c
index 4b35166..488f069 100644
--- a/arch/powerpc/platforms/ps3/smp.c
+++ b/arch/powerpc/platforms/ps3/smp.c
@@ -74,7 +74,7 @@ static int __init ps3_smp_probe(void)
* to index needs to be setup.
*/
 
-   BUILD_BUG_ON(PPC_MSG_CALL_FUNCTION!= 0);
+   BUILD_BUG_ON(PPC_MSG_UNUSED   != 0);
BUILD_BUG_ON(PPC_MSG_RESCHEDULE   != 1);
BUILD_BUG_ON(PPC_MSG_CALL_FUNC_SINGLE != 2);
BUILD_BUG_ON(PPC_MSG_DEBUGGER_BREAK   != 3);


