[RFC PATCH] smack: fix access permissions for keyring
Function smack_key_permission() only issues smack requests for the following operations: - KEY_NEED_READ (issues MAY_READ) - KEY_NEED_WRITE (issues MAY_WRITE) - KEY_NEED_LINK (issues MAY_WRITE) - KEY_NEED_SETATTR (issues MAY_WRITE) A blank smack request is issued in all other cases, resulting in smack access being granted if there is any rule defined between subject and object, or denied with -EACCES otherwise. Request MAY_READ access for KEY_NEED_SEARCH and KEY_NEED_VIEW. Fix the logic in the unlikely case when both MAY_READ and MAY_WRITE are needed. Validate access permission field for valid contents. Signed-off-by: Zoran Markovic Cc: Casey Schaufler Cc: James Morris Cc: "Serge E. Hallyn" --- security/smack/smack_lsm.c | 12 +--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 340fc30..77e405f 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -4326,6 +4326,12 @@ static int smack_key_permission(key_ref_t key_ref, int request = 0; int rc; + /* +* Validate requested permissions +*/ + if (perm & ~KEY_NEED_ALL) + return -EINVAL; + keyp = key_ref_to_ptr(key_ref); if (keyp == NULL) return -EINVAL; @@ -4349,10 +4355,10 @@ static int smack_key_permission(key_ref_t key_ref, ad.a.u.key_struct.key = keyp->serial; ad.a.u.key_struct.key_desc = keyp->description; #endif - if (perm & KEY_NEED_READ) - request = MAY_READ; + if (perm & (KEY_NEED_READ | KEY_NEED_SEARCH | KEY_NEED_VIEW)) + request |= MAY_READ; if (perm & (KEY_NEED_WRITE | KEY_NEED_LINK | KEY_NEED_SETATTR)) - request = MAY_WRITE; + request |= MAY_WRITE; rc = smk_access(tkp, keyp->security, request, &ad); rc = smk_bu_note("key access", tkp, keyp->security, request, rc); return rc; -- 2.7.4
[RFC PATCHv2 2/4] clk: mdm9615: Add EBI2 clock
Add definition of EBI2 clock used by MDM9615 NAND controller. Cc: Andy Gross Cc: David Brown Cc: Michael Turquette Cc: Stephen Boyd Cc: Rob Herring Cc: Mark Rutland Cc: Neil Armstrong Cc: linux-arm-...@vger.kernel.org Cc: linux-...@vger.kernel.org Cc: linux-...@vger.kernel.org Cc: devicet...@vger.kernel.org Signed-off-by: Zoran Markovic --- drivers/clk/qcom/gcc-mdm9615.c | 30 ++ include/dt-bindings/clock/qcom,gcc-mdm9615.h |3 +++ 2 files changed, 33 insertions(+) diff --git a/drivers/clk/qcom/gcc-mdm9615.c b/drivers/clk/qcom/gcc-mdm9615.c index 581a17f..e9e98b1 100644 --- a/drivers/clk/qcom/gcc-mdm9615.c +++ b/drivers/clk/qcom/gcc-mdm9615.c @@ -1563,6 +1563,34 @@ enum { }, }; +static struct clk_branch ebi2_clk = { + .hwcg_reg = 0x2664, + .hwcg_bit = 6, + .halt_reg = 0x2fcc, + .halt_bit = 23, + .clkr = { + .enable_reg = 0x2664, + .enable_mask = BIT(6) | BIT(4), + .hw.init = &(struct clk_init_data){ + .name = "ebi2_clk", + .ops = &clk_branch_ops, + }, + }, +}; + +static struct clk_branch ebi2_aon_clk = { + .halt_reg = 0x2fcc, + .halt_bit = 23, + .clkr = { + .enable_reg = 0x2664, + .enable_mask = BIT(8), + .hw.init = &(struct clk_init_data){ + .name = "ebi2_aon_clk", + .ops = &clk_branch_ops, + }, + }, +}; + static struct clk_hw *gcc_mdm9615_hws[] = { &cxo.hw, }; @@ -1637,6 +1665,8 @@ enum { [PMIC_ARB1_H_CLK] = &pmic_arb1_h_clk.clkr, [PMIC_SSBI2_CLK] = &pmic_ssbi2_clk.clkr, [RPM_MSG_RAM_H_CLK] = &rpm_msg_ram_h_clk.clkr, + [EBI2_CLK] = &ebi2_clk.clkr, + [EBI2_AON_CLK] = &ebi2_aon_clk.clkr, }; static const struct qcom_reset_map gcc_mdm9615_resets[] = { diff --git a/include/dt-bindings/clock/qcom,gcc-mdm9615.h b/include/dt-bindings/clock/qcom,gcc-mdm9615.h index 9ab2c40..57cdca6 100644 --- a/include/dt-bindings/clock/qcom,gcc-mdm9615.h +++ b/include/dt-bindings/clock/qcom,gcc-mdm9615.h @@ -323,5 +323,8 @@ #define CE3_H_CLK 305 #define USB_HS1_SYSTEM_CLK_SRC 306 #define USB_HS1_SYSTEM_CLK 307 +#define EBI2_CLK 308 +#define EBI2_AON_CLK 309 + #endif -- 1.7.9.5
[RFC PATCH 0/4] Enable NAND on Sierra Wireless WP8548 board
Enable NAND flash on Sierra Wireless's WP8548 module used on MangOH Green board. The patch set consists of device tree descriptions for ADM DMA engine, NAND controller and NAND flash partitioned for Sierra Wireless Legato framework, as well as definition of EBI2 clock used by NAND controller. This patch set depends on Andy Gross's driver for ADM DMA engine: https://lwn.net/Articles/636881/ Zoran Markovic (4): dt-bindings: mdm9615: Add ADM DMA engine clk: mdm9615: Add EBI2 clock dt-bindings: mdm9615: Add NAND controller dt-bindings: wp8548: Add on-board NAND flash arch/arm/boot/dts/qcom-mdm9615-wp8548.dtsi | 50 ++ arch/arm/boot/dts/qcom-mdm9615.dtsi | 35 +- drivers/clk/qcom/gcc-mdm9615.c | 30 include/dt-bindings/clock/qcom,gcc-mdm9615.h |3 ++ 4 files changed, 117 insertions(+), 1 deletion(-) -- 1.7.9.5
[RFC PATCH 3/4] dt-bindings: mdm9615: Add NAND controller
Add dt description of NAND controller on MDM9615. Signed-off-by: Zoran Markovic --- arch/arm/boot/dts/qcom-mdm9615.dtsi | 16 1 file changed, 16 insertions(+) diff --git a/arch/arm/boot/dts/qcom-mdm9615.dtsi b/arch/arm/boot/dts/qcom-mdm9615.dtsi index fbc7d68..6d42ff3 100644 --- a/arch/arm/boot/dts/qcom-mdm9615.dtsi +++ b/arch/arm/boot/dts/qcom-mdm9615.dtsi @@ -373,6 +373,22 @@ qcom,ee = <0>; }; + nand0: nand@1b40 { + compatible = "qcom,ipq806x-nand"; + reg = <0x1b40 0x800>; + clocks = <&gcc EBI2_CLK>, +<&gcc EBI2_AON_CLK>; + clock-names = "core", "aon"; + + dmas = <&adm_dma 3>; + dma-names = "rxtx"; + qcom,cmd-crci = <15>; + qcom,data-crci = <3>; + + #address-cells = <1>; + #size-cells = <0>; + }; + amba { compatible = "arm,amba-bus"; #address-cells = <1>; -- 1.7.9.5
[RFC PATCH 1/4] dt-bindings: mdm9615: Add ADM DMA engine
Add configuration for ADM DMA engine on MDM9615, used by the EBI2 NAND controller. This commit requires the ADM DMA patches from Andy Gross: https://lwn.net/Articles/636881/ Signed-off-by: Zoran Markovic --- arch/arm/boot/dts/qcom-mdm9615.dtsi | 19 ++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/arch/arm/boot/dts/qcom-mdm9615.dtsi b/arch/arm/boot/dts/qcom-mdm9615.dtsi index 5ae4ec5..fbc7d68 100644 --- a/arch/arm/boot/dts/qcom-mdm9615.dtsi +++ b/arch/arm/boot/dts/qcom-mdm9615.dtsi @@ -336,7 +336,24 @@ }; }; - sdcc1bam: dma@12182000{ + adm_dma: dma@1830 { + compatible = "qcom,adm"; + reg = <0x1830 0x10>; + interrupts = <0 170 0>; + #dma-cells = <1>; + + clocks = <&gcc ADM0_CLK>, <&gcc ADM0_PBUS_CLK>; + clock-names = "core", "iface"; + + resets = <&gcc ADM0_RESET>, +<&gcc ADM0_C0_RESET>, +<&gcc ADM0_C1_RESET>, +<&gcc ADM0_C2_RESET>; + reset-names = "clk", "c0", "c1", "c2"; + qcom,ee = <0>; + }; + + sdcc1bam:dma@12182000{ compatible = "qcom,bam-v1.3.0"; reg = <0x12182000 0x8000>; interrupts = ; -- 1.7.9.5
[RFC PATCH 3/4] dt-bindings: mdm9615: Add NAND controller
Add dt description of NAND controller on MDM9615. Cc: Andy Gross Cc: David Brown Cc: Rob Herring Cc: Mark Rutland Cc: Russell King Cc: linux-arm-...@vger.kernel.org Cc: linux-...@vger.kernel.org Cc: devicet...@vger.kernel.org Cc: linux-arm-ker...@lists.infradead.org Signed-off-by: Zoran Markovic --- arch/arm/boot/dts/qcom-mdm9615.dtsi | 16 1 file changed, 16 insertions(+) diff --git a/arch/arm/boot/dts/qcom-mdm9615.dtsi b/arch/arm/boot/dts/qcom-mdm9615.dtsi index fbc7d68..6d42ff3 100644 --- a/arch/arm/boot/dts/qcom-mdm9615.dtsi +++ b/arch/arm/boot/dts/qcom-mdm9615.dtsi @@ -373,6 +373,22 @@ qcom,ee = <0>; }; + nand0: nand@1b40 { + compatible = "qcom,ipq806x-nand"; + reg = <0x1b40 0x800>; + clocks = <&gcc EBI2_CLK>, +<&gcc EBI2_AON_CLK>; + clock-names = "core", "aon"; + + dmas = <&adm_dma 3>; + dma-names = "rxtx"; + qcom,cmd-crci = <15>; + qcom,data-crci = <3>; + + #address-cells = <1>; + #size-cells = <0>; + }; + amba { compatible = "arm,amba-bus"; #address-cells = <1>; -- 1.7.9.5
[RFC PATCH 2/4] clk: mdm9615: Add EBI2 clock
Add definition of EBI2 clock used by MDM9615 NAND controller. Cc: Andy Gross Cc: David Brown Cc: Michael Turquette Cc: Stephen Boyd Cc: Rob Herring Cc: Mark Rutland Cc: Neil Armstrong Cc: linux-arm-...@vger.kernel.org Cc: linux-...@vger.kernel.org Cc: linux-...@vger.kernel.org Cc: devicet...@vger.kernel.org Signed-off-by: Zoran Markovic --- drivers/clk/qcom/gcc-mdm9615.c | 30 ++ include/dt-bindings/clock/qcom,gcc-mdm9615.h |3 +++ 2 files changed, 33 insertions(+) diff --git a/drivers/clk/qcom/gcc-mdm9615.c b/drivers/clk/qcom/gcc-mdm9615.c index 581a17f..e9e98b1 100644 --- a/drivers/clk/qcom/gcc-mdm9615.c +++ b/drivers/clk/qcom/gcc-mdm9615.c @@ -1563,6 +1563,34 @@ enum { }, }; +static struct clk_branch ebi2_clk = { + .hwcg_reg = 0x2664, + .hwcg_bit = 6, + .halt_reg = 0x2fcc, + .halt_bit = 23, + .clkr = { + .enable_reg = 0x2664, + .enable_mask = BIT(6)|BIT(4), + .hw.init = &(struct clk_init_data){ + .name = "ebi2_clk", + .ops = &clk_branch_ops, + }, + }, +}; + +static struct clk_branch ebi2_aon_clk = { + .halt_reg = 0x2fcc, + .halt_bit = 23, + .clkr = { + .enable_reg = 0x2664, + .enable_mask = BIT(8), + .hw.init = &(struct clk_init_data){ + .name = "ebi2_always_on_clk", + .ops = &clk_branch_ops, + }, + }, +}; + static struct clk_hw *gcc_mdm9615_hws[] = { &cxo.hw, }; @@ -1637,6 +1665,8 @@ enum { [PMIC_ARB1_H_CLK] = &pmic_arb1_h_clk.clkr, [PMIC_SSBI2_CLK] = &pmic_ssbi2_clk.clkr, [RPM_MSG_RAM_H_CLK] = &rpm_msg_ram_h_clk.clkr, + [EBI2_CLK] = &ebi2_clk.clkr, + [EBI2_AON_CLK] = &ebi2_aon_clk.clkr, }; static const struct qcom_reset_map gcc_mdm9615_resets[] = { diff --git a/include/dt-bindings/clock/qcom,gcc-mdm9615.h b/include/dt-bindings/clock/qcom,gcc-mdm9615.h index 9ab2c40..57cdca6 100644 --- a/include/dt-bindings/clock/qcom,gcc-mdm9615.h +++ b/include/dt-bindings/clock/qcom,gcc-mdm9615.h @@ -323,5 +323,8 @@ #define CE3_H_CLK 305 #define USB_HS1_SYSTEM_CLK_SRC 306 #define USB_HS1_SYSTEM_CLK 307 +#define EBI2_CLK 309 +#define EBI2_AON_CLK 310 + #endif -- 1.7.9.5
[RFC PATCH 4/4] dt-bindings: wp8548: Add on-board NAND flash
Add description of NAND flash on Sierra Wireless WP8548 module (and MangOH board). Signed-off-by: Zoran Markovic --- arch/arm/boot/dts/qcom-mdm9615-wp8548.dtsi | 50 1 file changed, 50 insertions(+) diff --git a/arch/arm/boot/dts/qcom-mdm9615-wp8548.dtsi b/arch/arm/boot/dts/qcom-mdm9615-wp8548.dtsi index 7869898..a4d1158 100644 --- a/arch/arm/boot/dts/qcom-mdm9615-wp8548.dtsi +++ b/arch/arm/boot/dts/qcom-mdm9615-wp8548.dtsi @@ -54,6 +54,56 @@ }; }; +&nand0 { + nandcs@0 { + compatible = "qcom,nandcs"; + reg = <0>; + + linux,mtd-name = "micron,mt29f4g08"; + #address-cells = <1>; + #size-cells = <0>; + nand-ecc-strength = <4>; + nand-ecc-step-size = <512>; + + partitions { + compatible = "fixed-partitions"; + #address-cells = <1>; + #size-cells = <1>; + + bootloader@0x051c { + reg = <0x51c 0x10>; + read-only; + }; + + kernel@0x052c { + reg = <0x52c 0x140>; + read-only; + }; + + rootfs@0x066c { + reg = <0x66c 0x314>; + read-only; + }; + + user0@0x0980 { + reg = <0x980 0x278>; + }; + + user1@0x0bf8 { + reg = <0xbf8 0x8B8>; + }; + + user2@0x14b0 { + reg = <0x14b0 0x50>; + }; + + user3@0x1500 { + reg = <0x1500 0x20>; + }; + }; + }; +}; + &msmgpio { pinctrl-0 = <&reset_out_pins>; pinctrl-names = "default"; -- 1.7.9.5
[RFC PATCH 0/4] Enable NAND on Sierra Wireless WP8548 board
Enable NAND flash on Sierra Wireless's WP8548 module used on MangOH Green board. The patch set consists of device tree descriptions for ADM DMA engine, NAND controller and NAND flash partitioned for Sierra Wireless Legato framework, as well as definition of EBI2 clock used by NAND controller. This patch set depends on Andy Gross's driver for ADM DMA engine: https://lwn.net/Articles/636881/ Zoran Markovic (4): dt-bindings: mdm9615: Add ADM DMA engine clk: mdm9615: Add EBI2 clock dt-bindings: mdm9615: Add NAND controller dt-bindings: wp8548: Add on-board NAND flash arch/arm/boot/dts/qcom-mdm9615-wp8548.dtsi | 50 ++ arch/arm/boot/dts/qcom-mdm9615.dtsi | 35 +- drivers/clk/qcom/gcc-mdm9615.c | 30 include/dt-bindings/clock/qcom,gcc-mdm9615.h |3 ++ 4 files changed, 117 insertions(+), 1 deletion(-) -- 1.7.9.5
[RFC PATCH 2/4] clk: mdm9615: Add EBI2 clock
Add definition of EBI2 clock used by MDM9615 NAND controller. Signed-off-by: Zoran Markovic --- drivers/clk/qcom/gcc-mdm9615.c | 30 ++ include/dt-bindings/clock/qcom,gcc-mdm9615.h |3 +++ 2 files changed, 33 insertions(+) diff --git a/drivers/clk/qcom/gcc-mdm9615.c b/drivers/clk/qcom/gcc-mdm9615.c index 581a17f..e9e98b1 100644 --- a/drivers/clk/qcom/gcc-mdm9615.c +++ b/drivers/clk/qcom/gcc-mdm9615.c @@ -1563,6 +1563,34 @@ enum { }, }; +static struct clk_branch ebi2_clk = { + .hwcg_reg = 0x2664, + .hwcg_bit = 6, + .halt_reg = 0x2fcc, + .halt_bit = 23, + .clkr = { + .enable_reg = 0x2664, + .enable_mask = BIT(6)|BIT(4), + .hw.init = &(struct clk_init_data){ + .name = "ebi2_clk", + .ops = &clk_branch_ops, + }, + }, +}; + +static struct clk_branch ebi2_aon_clk = { + .halt_reg = 0x2fcc, + .halt_bit = 23, + .clkr = { + .enable_reg = 0x2664, + .enable_mask = BIT(8), + .hw.init = &(struct clk_init_data){ + .name = "ebi2_always_on_clk", + .ops = &clk_branch_ops, + }, + }, +}; + static struct clk_hw *gcc_mdm9615_hws[] = { &cxo.hw, }; @@ -1637,6 +1665,8 @@ enum { [PMIC_ARB1_H_CLK] = &pmic_arb1_h_clk.clkr, [PMIC_SSBI2_CLK] = &pmic_ssbi2_clk.clkr, [RPM_MSG_RAM_H_CLK] = &rpm_msg_ram_h_clk.clkr, + [EBI2_CLK] = &ebi2_clk.clkr, + [EBI2_AON_CLK] = &ebi2_aon_clk.clkr, }; static const struct qcom_reset_map gcc_mdm9615_resets[] = { diff --git a/include/dt-bindings/clock/qcom,gcc-mdm9615.h b/include/dt-bindings/clock/qcom,gcc-mdm9615.h index 9ab2c40..57cdca6 100644 --- a/include/dt-bindings/clock/qcom,gcc-mdm9615.h +++ b/include/dt-bindings/clock/qcom,gcc-mdm9615.h @@ -323,5 +323,8 @@ #define CE3_H_CLK 305 #define USB_HS1_SYSTEM_CLK_SRC 306 #define USB_HS1_SYSTEM_CLK 307 +#define EBI2_CLK 309 +#define EBI2_AON_CLK 310 + #endif -- 1.7.9.5
[RFC PATCH 1/4] dt-bindings: mdm9615: Add ADM DMA engine
Add configuration for ADM DMA engine on MDM9615, used by the EBI2 NAND controller. This commit requires the ADM DMA patches from Andy Gross: https://lkml.org/lkml/2015/3/17/19 Cc: Andy Gross Cc: David Brown Cc: Rob Herring Cc: Mark Rutland Cc: Russell King Cc: linux-arm-...@vger.kernel.org Cc: linux-...@vger.kernel.org Cc: devicet...@vger.kernel.org Cc: linux-arm-ker...@lists.infradead.org Signed-off-by: Zoran Markovic --- arch/arm/boot/dts/qcom-mdm9615.dtsi | 19 ++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/arch/arm/boot/dts/qcom-mdm9615.dtsi b/arch/arm/boot/dts/qcom-mdm9615.dtsi index 5ae4ec5..fbc7d68 100644 --- a/arch/arm/boot/dts/qcom-mdm9615.dtsi +++ b/arch/arm/boot/dts/qcom-mdm9615.dtsi @@ -336,7 +336,24 @@ }; }; - sdcc1bam: dma@12182000{ + adm_dma: dma@1830 { + compatible = "qcom,adm"; + reg = <0x1830 0x10>; + interrupts = <0 170 0>; + #dma-cells = <1>; + + clocks = <&gcc ADM0_CLK>, <&gcc ADM0_PBUS_CLK>; + clock-names = "core", "iface"; + + resets = <&gcc ADM0_RESET>, +<&gcc ADM0_C0_RESET>, +<&gcc ADM0_C1_RESET>, +<&gcc ADM0_C2_RESET>; + reset-names = "clk", "c0", "c1", "c2"; + qcom,ee = <0>; + }; + + sdcc1bam:dma@12182000{ compatible = "qcom,bam-v1.3.0"; reg = <0x12182000 0x8000>; interrupts = ; -- 1.7.9.5
[RFC PATCH 4/4] dt-bindings: wp8548: Add on-board NAND flash
Add description of NAND flash on Sierra Wireless WP8548 module (and MangOH board). Cc: Andy Gross Cc: David Brown Cc: Rob Herring Cc: Mark Rutland Cc: Russell King Cc: linux-arm-...@vger.kernel.org Cc: linux-...@vger.kernel.org Cc: devicet...@vger.kernel.org Cc: linux-arm-ker...@lists.infradead.org Signed-off-by: Zoran Markovic --- arch/arm/boot/dts/qcom-mdm9615-wp8548.dtsi | 50 1 file changed, 50 insertions(+) diff --git a/arch/arm/boot/dts/qcom-mdm9615-wp8548.dtsi b/arch/arm/boot/dts/qcom-mdm9615-wp8548.dtsi index 7869898..a4d1158 100644 --- a/arch/arm/boot/dts/qcom-mdm9615-wp8548.dtsi +++ b/arch/arm/boot/dts/qcom-mdm9615-wp8548.dtsi @@ -54,6 +54,56 @@ }; }; +&nand0 { + nandcs@0 { + compatible = "qcom,nandcs"; + reg = <0>; + + linux,mtd-name = "micron,mt29f4g08"; + #address-cells = <1>; + #size-cells = <0>; + nand-ecc-strength = <4>; + nand-ecc-step-size = <512>; + + partitions { + compatible = "fixed-partitions"; + #address-cells = <1>; + #size-cells = <1>; + + bootloader@0x051c { + reg = <0x51c 0x10>; + read-only; + }; + + kernel@0x052c { + reg = <0x52c 0x140>; + read-only; + }; + + rootfs@0x066c { + reg = <0x66c 0x314>; + read-only; + }; + + user0@0x0980 { + reg = <0x980 0x278>; + }; + + user1@0x0bf8 { + reg = <0xbf8 0x8B8>; + }; + + user2@0x14b0 { + reg = <0x14b0 0x50>; + }; + + user3@0x1500 { + reg = <0x1500 0x20>; + }; + }; + }; +}; + &msmgpio { pinctrl-0 = <&reset_out_pins>; pinctrl-names = "default"; -- 1.7.9.5
[RFC PATCH 2/2] sched: Add documentation for idlestat scheduler benchmarking tool
This patch documents the proposed functionality of idlestat tool and states its intended use for scheduler benchmarking. The documentation file describes the design of the tool, what kernel functionality it relies upon, and what information is contained in the output report. It also contains a simple linear model for estimating CPU power consumption during idlestat run. Idlestat focuses itself on CPU and cluster power states in precise intervals in time. This is of particular use when the benchmarked process is a load synthesis tool: idlestat could focus its acquisition period to a particular sub-period in the load sequence. Output results from idlestat can be applied to a power model in order to estimate the power consumption of CPUs and clusters during the benchmark interval. Initial measurements on ARM Versatile Express TC2 platform show a model error of ~2.6% for the linear power model described in the documentation. Cc: Rob Landley Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Daniel Lezcano Signed-off-by: Zoran Markovic --- Documentation/scheduler/idlestat.txt | 79 ++ 1 file changed, 79 insertions(+) create mode 100644 Documentation/scheduler/idlestat.txt diff --git a/Documentation/scheduler/idlestat.txt b/Documentation/scheduler/idlestat.txt new file mode 100644 index 000..8e6b695 --- /dev/null +++ b/Documentation/scheduler/idlestat.txt @@ -0,0 +1,79 @@ +This document captures the desired operation of the idlestat tool. + +With the advent of battery-powered Linux devices, it became important to add +a power-aware component to the existing CFS scheduler solution. Future +developments in this field need to be benchmarked using a simple tool that +monitors power parameters during system runs and provides sufficient info for +developers to assess how changes to scheduler code affected CPU power +consumption. The idlestat tool attempts to capture this. 
+ +Idlestat uses kernel's FTRACE function to monitor and capture C-state and +P-state transitions of CPUs over a time interval. It extracts the following +information from trace file: + - Times when CPUs entered and exited a certain C-state + - Times when CPUs entered and exited a certain P-state + - Raised IRQs + +Following a successful run, idlestat calculates and reports the following +information: + - Total, average, minimum and maximum time spent in each C-state, + per-CPU. + - Total, average, minimum and maximum time spent in each P-state, + per-CPU. + - Total, average, minimum and maximum time during which all CPUs in + a cluster were in the same C-state, per-cluster. + - Number of times a certain IRQ caused a CPU to exit idle state, + per-CPU and per-IRQ. + +The tool parses sysfs entries to determine the CPU/cluster topology, as well +as supported C-states and P-states per CPU. It is unaware of CPU/cluster power +consumption in each C-state and P-state, but if these parameters are +externally known, a ballpark estimate of the energy consumed during idlestat +run can be calculated as follows: + +energy = sum_per_cpu(PCi*(TCi-TCCi)) + sum_per_cluster(PCCi*TCCi) + +sum_per_cpu(PPi*TPi) + +where: +PCi- is the power consumption of CPU in Ci power state +TCi- is the total time the CPU has spent in Ci power state +PCCi - is the power consumption of cluster in Ci power state +TCCi - is the total time the cluster has spent in Ci power state +PPi- is the power consumption of CPU in Pi power state +TPi- is the total time the CPU has spent in Pi power state + +Below is an example report of one idlestat run on a dual-core system: +clusterA@state hits total(us) avg(us) min(us) max(us) + C1 108215879554.00 543.35 0.0023163.00 + C2 0 0.000.00 0.000.00 + C3 78 2929290.0037555.00 0.00101441.00 + cpu0@statehits total(us) avg(us) min(us) max(us) + C1 6744 6407808.00 950.15 0.0023194.00 + C2 3 8819.00 2939.67 549.00 5310.00 + C3 75 2960110.0039468.13 213.00 101441.00 + 350 1047 
204490.00 195.31 0.004578.00 + 700 5628 396247.00 70.41 0.001465.00 + 920 0 0.000.00 0.000.00 + cpu0 wakeups namecount + irq109 ehci_hcd:usb1 1727 + irq029 twd 4524 + irq069 gp_timer60 + irq115 mmc07 + irq044 DMA 3 + cpu1@statehits total(us) avg(us) min(us) max(us) + C1 6544 6398931.00 977.83 0.0036255.00 + C2 1 1129.00 1129.00 1129.00 1129.00 + C3 77 29552
[RFC PATCH 0/2] sched: proposal for idlestat scheduler benchmarking tool
Conclusions from Energy Aware Scheduling sessions at the latest Kernel Summit identified a need for tools that would assess power consumption of the system. These tools would be used to prove efficiency of scheduler patches by comparing power consumption before and after they were applied. Attached is the proposal for the idlestat tool. The purpose of this patch is to solicit feedback on the tool's features, possible enhancements, etc. Source code and sample idlestat report are provided for reference. Please review and provide comments in anticipation of further development. Regards, Zoran Zoran Markovic (2): power: Add idlestat tool for benchmarking energy-aware scheduler sched: Add documentation for idlestat scheduler benchmarking tool Documentation/scheduler/idlestat.txt | 79 +++ tools/power/idlestat/.gitignore | 50 ++ tools/power/idlestat/Makefile| 34 + tools/power/idlestat/idlestat.c | 1229 ++ tools/power/idlestat/idlestat.h | 106 +++ tools/power/idlestat/list.h | 588 tools/power/idlestat/topology.c | 503 ++ tools/power/idlestat/topology.h | 77 +++ tools/power/idlestat/trace.c | 87 +++ tools/power/idlestat/trace.h | 43 ++ tools/power/idlestat/utils.c | 115 tools/power/idlestat/utils.h | 35 + 12 files changed, 2946 insertions(+) create mode 100644 Documentation/scheduler/idlestat.txt create mode 100644 tools/power/idlestat/.gitignore create mode 100644 tools/power/idlestat/Makefile create mode 100644 tools/power/idlestat/idlestat.c create mode 100644 tools/power/idlestat/idlestat.h create mode 100644 tools/power/idlestat/list.h create mode 100644 tools/power/idlestat/topology.c create mode 100644 tools/power/idlestat/topology.h create mode 100644 tools/power/idlestat/trace.c create mode 100644 tools/power/idlestat/trace.h create mode 100644 tools/power/idlestat/utils.c create mode 100644 tools/power/idlestat/utils.h -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More
majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC PATCHv2] usb: move hub init and LED blink work to power efficient workqueue
I believe there may still be use cases where you want to wake up the same CPU that scheduled the work. Thanks for the Ack. Can you please queue this for 3.14? Regards, Zoran On 2 February 2014 08:10, Alan Stern wrote: > On Sat, 1 Feb 2014, Zoran Markovic wrote: > >> From: Shaibal Dutta >> >> Allow the scheduler to select the best CPU to handle hub initialization >> and LED blinking work. This extends idle residency times on idle CPUs >> and conserves power. >> >> This functionality is enabled when CONFIG_WQ_POWER_EFFICIENT is selected. >> >> Cc: Greg Kroah-Hartman >> Cc: Alan Stern >> Cc: Sarah Sharp >> Cc: Xenia Ragiadakou >> Cc: Julius Werner >> Cc: Krzysztof Mazur >> Cc: Matthias Beyer >> Cc: Dan Williams >> Cc: Mathias Nyman >> Cc: Thomas Pugliese >> Signed-off-by: Shaibal Dutta >> [zoran.marko...@linaro.org: Rebased to latest kernel. Added commit message. >> Changed reference from system to power efficient workqueue for LEDs in >> check_highspeed() and hub_port_connect_change().] >> Signed-off-by: Zoran Markovic > > Acked-by: Alan Stern > > Is there some reason why schedule_delayed_work() doesn't use the > power-efficient work queue by default? > > Alan Stern > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[RFC PATCHv2] usb: move hub init and LED blink work to power efficient workqueue
From: Shaibal Dutta Allow the scheduler to select the best CPU to handle hub initialization and LED blinking work. This extends idle residency times on idle CPUs and conserves power. This functionality is enabled when CONFIG_WQ_POWER_EFFICIENT is selected. Cc: Greg Kroah-Hartman Cc: Alan Stern Cc: Sarah Sharp Cc: Xenia Ragiadakou Cc: Julius Werner Cc: Krzysztof Mazur Cc: Matthias Beyer Cc: Dan Williams Cc: Mathias Nyman Cc: Thomas Pugliese Signed-off-by: Shaibal Dutta [zoran.marko...@linaro.org: Rebased to latest kernel. Added commit message. Changed reference from system to power efficient workqueue for LEDs in check_highspeed() and hub_port_connect_change().] Signed-off-by: Zoran Markovic --- drivers/usb/core/hub.c | 19 +-- 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c index babba88..e11a7e9 100644 --- a/drivers/usb/core/hub.c +++ b/drivers/usb/core/hub.c @@ -504,7 +504,8 @@ static void led_work (struct work_struct *work) changed++; } if (changed) - schedule_delayed_work(&hub->leds, LED_CYCLE_PERIOD); + queue_delayed_work(system_power_efficient_wq, + &hub->leds, LED_CYCLE_PERIOD); } /* use a short timeout for hub/port status fetches */ @@ -1046,7 +1047,8 @@ static void hub_activate(struct usb_hub *hub, enum hub_activation_type type) if (type == HUB_INIT) { delay = hub_power_on(hub, false); PREPARE_DELAYED_WORK(&hub->init_work, hub_init_func2); - schedule_delayed_work(&hub->init_work, + queue_delayed_work(system_power_efficient_wq, + &hub->init_work, msecs_to_jiffies(delay)); /* Suppress autosuspend until init is done */ @@ -1200,7 +1202,8 @@ static void hub_activate(struct usb_hub *hub, enum hub_activation_type type) /* Don't do a long sleep inside a workqueue routine */ if (type == HUB_INIT2) { PREPARE_DELAYED_WORK(&hub->init_work, hub_init_func3); - schedule_delayed_work(&hub->init_work, + queue_delayed_work(system_power_efficient_wq, + &hub->init_work, msecs_to_jiffies(delay)); return; /* Continues
at init3: below */ } else { @@ -1214,7 +1217,8 @@ static void hub_activate(struct usb_hub *hub, enum hub_activation_type type) if (status < 0) dev_err(hub->intfdev, "activate --> %d\n", status); if (hub->has_indicators && blinkenlights) - schedule_delayed_work(&hub->leds, LED_CYCLE_PERIOD); + queue_delayed_work(system_power_efficient_wq, + &hub->leds, LED_CYCLE_PERIOD); /* Scan all ports that need attention */ kick_khubd(hub); @@ -4316,7 +4320,8 @@ check_highspeed (struct usb_hub *hub, struct usb_device *udev, int port1) /* hub LEDs are probably harder to miss than syslog */ if (hub->has_indicators) { hub->indicator[port1-1] = INDICATOR_GREEN_BLINK; - schedule_delayed_work (&hub->leds, 0); + queue_delayed_work(system_power_efficient_wq, + &hub->leds, 0); } } kfree(qual); @@ -4545,7 +4550,9 @@ static void hub_port_connect_change(struct usb_hub *hub, int port1, if (hub->has_indicators) { hub->indicator[port1-1] = INDICATOR_AMBER_BLINK; - schedule_delayed_work (&hub->leds, 0); + queue_delayed_work( + system_power_efficient_wq, + &hub->leds, 0); } status = -ENOTCONN; /* Don't retry */ goto loop_disable; -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC PATCH] thermal: add generic cpu hotplug cooling device
Hi Eduardo, The merge window for 3.14 is now open and I'm wondering if you had a chance to look at these numbers? Thanks, Zoran On 30 December 2013 12:48, Zoran Markovic wrote: > Eduardo, > >>> What is the workload you're running besides the proprietary heater code? > I re-did experiments from Linaro's site pointed by Amit while > profiling _cpu_down() and _cpu_up() times: >>> [1] https://wiki.linaro.org/WorkingGroups/PowerManagement/Archives/Hotplug > > I am attaching a spreadsheet with some results and graphs: > > Sheet 1 (thermal_ramp) has three plots. Topmost is an unbound thermal > ramp that levels off at ~48C. Middle plot is a thermal ramp with cpu > hotplug kicking in as a cooling device at 38C. Bottom plot is a > thermal ramp with cpu hotplug kicking in at 38C and cpufreq kicking in > at 40C. One interesting thing to note is that the middle plot slowly > drifts towards 40C even though cooling is set to 38C. I attribute this > to the logic of step-wise governor combined with polling mode: if > temperature is dropping above trip point, cooling is reduced. Adding > another cooling device at 40C as a back-stop seems to keep temperature > in check. In all cases running code was ARM's max_power test that > maximizes CPU usage, as evidenced by results of 'top': > PID USER PR NI VIRT RES SHR S %CPU %MEMTIME+ COMMAND >33 root 20 0 000 R 100.0 0.0 45:46.43 thread1 >32 root 20 0 000 R 91.4 0.0 44:48.14 thread0 > 1344 root 20 0 000 R 8.6 0.0 0:03.64 kworker/u4:1 > 1380 root 20 0 2476 996 712 R 0.3 0.1 0:00.07 top > > Sheet 2 (idle) has two plots. Top one represents latency of > _cpu_down() while gradually adding instances of cyclictest process, > from 0 to 10; 20 samples were captured in each case. Bottom one > represents latency of _cpu_up() in the same test. Other than running > cyclictest, the system was mostly idle. > > Sheet 3 (max_power) repeated the same test as in sheet 2, but it was > running ARM's max_power test in the background. 
> > A quick look at the latency graphs shows that loading the system > causes a stochastic - but not deterministic - component added to > latencies. Minimum latency times appear unchanged. > >> - Homogeneous dual core Cortex-A9 environment. >> - They go up to 48C when fully loaded. Can you explain where is your >> sensor location? Gradient to hotspot, etc? 48C at A9s or board temperature? > Thermal sensor is located at L2 cache, with gradient to sensor likely > smaller than sensor inaccuracy. > >> - This code looks promising on embedded dual core system. However, it >> does not necessarily mean it works fine on, say server side. How about a >> system with 8/16/32 cores? How about a more heterogeneous workload? Not >> to talk about heterogeneous cores. I think in more complicated scenarios >> the data you provided above might even change. The difference between >> your minimum and maximum shutdown/startup times are quite considerable, >> so I am assuming your variance is not negligible, imagine if we scale >> this up, what happens? > Agreed that this is difficult to characterize across all platform > types. Maybe other list members could comment on the behaviour on their > platforms? Passing in a cpu mask defines CPUs that contribute to > cooling of a single zone, so there is some flexibility in defining > cooling strategy. Hopefully this is good enough for a start... > >> >> - The other point is that this type of cooling device must be taken in >> very sensible way. Shutting down circuitry may not be the best strategy >> for thermal. In fact, if you think about it, given you have a workload >> well balanced between, say, two cores, as same of your environment, >> turning one off it means you need to deal the very same load in only one >> CPU. In other words, turning off circuitry means, from thermal standpoint >> that you are increasing your heat/area ratio. Sometimes, you actually >> want to increase this ratio in order to properly cool down your system. 
> In this particular test case since both CPUs are fully loaded, > temperature is reduced at the expense of parallelism (i.e. execution > time), so overall heat/area is still reduced. If particular areas are > heat-sensitive, then it makes sense to define a separate thermal zone > (and sensor) for each of them. Just a thought. > > Looking forward to further discussion. > > Regards, > Zoran -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[RFC PATCH] firmware: use power efficient workqueue for unloading and aborting fw load
From: Shaibal Dutta Allow the scheduler to select the most appropriate CPU for running the firmware load timeout routine and delayed routine for firmware unload. This extends idle residency times and conserves power. This functionality is enabled when CONFIG_WQ_POWER_EFFICIENT is selected. Cc: Ming Lei Cc: Greg Kroah-Hartman Signed-off-by: Shaibal Dutta [zoran.marko...@linaro.org: Rebased to latest kernel, added commit message. Fixed code alignment.] Signed-off-by: Zoran Markovic --- drivers/base/firmware_class.c |7 --- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/base/firmware_class.c b/drivers/base/firmware_class.c index 8a97ddf..ae34219 100644 --- a/drivers/base/firmware_class.c +++ b/drivers/base/firmware_class.c @@ -900,7 +900,8 @@ static int _request_firmware_load(struct firmware_priv *fw_priv, dev_set_uevent_suppress(f_dev, false); dev_dbg(f_dev, "firmware: requesting %s\n", buf->fw_id); if (timeout != MAX_SCHEDULE_TIMEOUT) - schedule_delayed_work(&fw_priv->timeout_work, timeout); + queue_delayed_work(system_power_efficient_wq, + &fw_priv->timeout_work, timeout); kobject_uevent(&fw_priv->dev.kobj, KOBJ_ADD); } @@ -1570,8 +1571,8 @@ static void device_uncache_fw_images_work(struct work_struct *work) */ static void device_uncache_fw_images_delay(unsigned long delay) { - schedule_delayed_work(&fw_cache.work, - msecs_to_jiffies(delay)); + queue_delayed_work(system_power_efficient_wq, &fw_cache.work, + msecs_to_jiffies(delay)); } static int fw_pm_notify(struct notifier_block *notify_block, -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[RFC PATCH] mmc: move clock gating work to power efficient workqueue
From: Shaibal Dutta Instead of binding the clock gating work to the CPU that scheduled it, allow the scheduler to select the best CPU to handle it. This extends idle residency times and conserves power. This functionality is enabled when CONFIG_WQ_POWER_EFFICIENT is selected. Cc: Chris Ball Cc: Guennadi Liakhovetski Cc: Ulf Hansson Cc: H Hartley Sweeten Cc: Andrew Morton Cc: Simon Baatz Cc: Laurent Pinchart Cc: Tejun Heo Signed-off-by: Shaibal Dutta [zoran.marko...@linaro.org: Rebased to latest kernel. Added commit message. Fixed code alignment.] Signed-off-by: Zoran Markovic --- drivers/mmc/core/host.c |5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/mmc/core/host.c b/drivers/mmc/core/host.c index 49bc403..a787f1b 100644 --- a/drivers/mmc/core/host.c +++ b/drivers/mmc/core/host.c @@ -207,8 +207,9 @@ void mmc_host_clk_release(struct mmc_host *host) host->clk_requests--; if (mmc_host_may_gate_card(host->card) && !host->clk_requests) - schedule_delayed_work(&host->clk_gate_work, - msecs_to_jiffies(host->clkgate_delay)); + queue_delayed_work(system_power_efficient_wq, + &host->clk_gate_work, + msecs_to_jiffies(host->clkgate_delay)); spin_unlock_irqrestore(&host->clk_lock, flags); } -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[RFC PATCH] usb: move hub init and LED blink work to power efficient workqueue
From: Shaibal Dutta Allow the scheduler to select the best CPU to handle hub initialization and LED blinking work. This extends idle residency times on idle CPUs and conserves power. This functionality is enabled when CONFIG_WQ_POWER_EFFICIENT is selected. Cc: Greg Kroah-Hartman Cc: Alan Stern Cc: Sarah Sharp Cc: Xenia Ragiadakou Cc: Julius Werner Cc: Krzysztof Mazur Cc: Matthias Beyer Cc: Dan Williams Cc: Mathias Nyman Cc: Thomas Pugliese Signed-off-by: Shaibal Dutta [zoran.marko...@linaro.org: Rebased to latest kernel. Added commit message. Changed reference from system to power efficient workqueue for LEDs in check_highspeed() and hub_port_connect_change().] Signed-off-by: Zoran Markovic --- drivers/usb/core/hub.c | 23 +++ 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c index babba88..ae07ffe 100644 --- a/drivers/usb/core/hub.c +++ b/drivers/usb/core/hub.c @@ -504,7 +504,8 @@ static void led_work (struct work_struct *work) changed++; } if (changed) - schedule_delayed_work(&hub->leds, LED_CYCLE_PERIOD); + queue_delayed_work(system_power_efficient_wq, + &hub->leds, LED_CYCLE_PERIOD); } /* use a short timeout for hub/port status fetches */ @@ -1046,8 +1047,9 @@ static void hub_activate(struct usb_hub *hub, enum hub_activation_type type) if (type == HUB_INIT) { delay = hub_power_on(hub, false); PREPARE_DELAYED_WORK(&hub->init_work, hub_init_func2); - schedule_delayed_work(&hub->init_work, - msecs_to_jiffies(delay)); + queue_delayed_work(system_power_efficient_wq, + &hub->init_work, + msecs_to_jiffies(delay)); /* Suppress autosuspend until init is done */ usb_autopm_get_interface_no_resume( @@ -1200,8 +1202,9 @@ static void hub_activate(struct usb_hub *hub, enum hub_activation_type type) /* Don't do a long sleep inside a workqueue routine */ if (type == HUB_INIT2) { PREPARE_DELAYED_WORK(&hub->init_work, hub_init_func3); - schedule_delayed_work(&hub->init_work, - msecs_to_jiffies(delay)); + 
queue_delayed_work(system_power_efficient_wq, + &hub->init_work, + msecs_to_jiffies(delay)); return; /* Continues at init3: below */ } else { msleep(delay); @@ -1214,7 +1217,8 @@ static void hub_activate(struct usb_hub *hub, enum hub_activation_type type) if (status < 0) dev_err(hub->intfdev, "activate --> %d\n", status); if (hub->has_indicators && blinkenlights) - schedule_delayed_work(&hub->leds, LED_CYCLE_PERIOD); + queue_delayed_work(system_power_efficient_wq, + &hub->leds, LED_CYCLE_PERIOD); /* Scan all ports that need attention */ kick_khubd(hub); @@ -4316,7 +4320,8 @@ check_highspeed (struct usb_hub *hub, struct usb_device *udev, int port1) /* hub LEDs are probably harder to miss than syslog */ if (hub->has_indicators) { hub->indicator[port1-1] = INDICATOR_GREEN_BLINK; - schedule_delayed_work (&hub->leds, 0); + queue_delayed_work(system_power_efficient_wq, + &hub->leds, 0); } } kfree(qual); @@ -4545,7 +4550,9 @@ static void hub_port_connect_change(struct usb_hub *hub, int port1, if (hub->has_indicators) { hub->indicator[port1-1] = INDICATOR_AMBER_BLINK; - schedule_delayed_work (&hub->leds, 0); + queue_delayed_work( + system_power_efficient_wq, + &hub->leds, 0); } status = -ENOTCONN; /* Don't retry */ goto loop_disable; -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[RFC PATCH] power: move pm_qos update timeout handler to power-efficient workqueue
From: Shaibal Dutta To avoid waking up idle CPUs, allow the scheduler to select the best CPU to handle pm_qos update timeouts. This extends idle residency times and conserves power. This functionality is enabled when CONFIG_WQ_POWER_EFFICIENT is selected. Cc: Pavel Machek Cc: "Rafael J. Wysocki" Cc: Len Brown Signed-off-by: Shaibal Dutta [zoran.marko...@linaro.org: Rebased to latest kernel. Fixed code alignment. Added commit message.] Signed-off-by: Zoran Markovic --- kernel/power/qos.c |3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 8dff9b4..5e35a3a 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -405,7 +405,8 @@ void pm_qos_update_request_timeout(struct pm_qos_request *req, s32 new_value, pm_qos_array[req->pm_qos_class]->constraints, &req->node, PM_QOS_UPDATE_REQ, new_value); - schedule_delayed_work(&req->work, usecs_to_jiffies(timeout_us)); + queue_delayed_work(system_power_efficient_wq, + &req->work, usecs_to_jiffies(timeout_us)); } /** -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC PATCH] rcu: move SRCU grace period work to power efficient workqueue
Signed-off-by: Zoran Markovic On 31 January 2014 11:53, Zoran Markovic wrote: > From: Shaibal Dutta > > For better use of CPU idle time, allow the scheduler to select the CPU > on which the SRCU grace period work would be scheduled. This improves > idle residency time and conserves power. > > This functionality is enabled when CONFIG_WQ_POWER_EFFICIENT is selected. > > Cc: Lai Jiangshan > Cc: "Paul E. McKenney" > Cc: Dipankar Sarma > Signed-off-by: Shaibal Dutta > [zoran.marko...@linaro.org: Rebased to latest kernel version. Added commit > message. Fixed code alignment.] > --- > kernel/rcu/srcu.c |5 +++-- > 1 file changed, 3 insertions(+), 2 deletions(-) > > diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c > index 3318d82..a1ebe6d 100644 > --- a/kernel/rcu/srcu.c > +++ b/kernel/rcu/srcu.c > @@ -398,7 +398,7 @@ void call_srcu(struct srcu_struct *sp, struct rcu_head > *head, > rcu_batch_queue(&sp->batch_queue, head); > if (!sp->running) { > sp->running = true; > - schedule_delayed_work(&sp->work, 0); > + queue_delayed_work(system_power_efficient_wq, &sp->work, 0); > } > spin_unlock_irqrestore(&sp->queue_lock, flags); > } > @@ -674,7 +674,8 @@ static void srcu_reschedule(struct srcu_struct *sp) > } > > if (pending) > - schedule_delayed_work(&sp->work, SRCU_INTERVAL); > + queue_delayed_work(system_power_efficient_wq, > + &sp->work, SRCU_INTERVAL); > } > > /* > -- > 1.7.9.5 > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[RFC PATCH] rcu: move SRCU grace period work to power efficient workqueue
From: Shaibal Dutta For better use of CPU idle time, allow the scheduler to select the CPU on which the SRCU grace period work would be scheduled. This improves idle residency time and conserves power. This functionality is enabled when CONFIG_WQ_POWER_EFFICIENT is selected. Cc: Lai Jiangshan Cc: "Paul E. McKenney" Cc: Dipankar Sarma Signed-off-by: Shaibal Dutta [zoran.marko...@linaro.org: Rebased to latest kernel version. Added commit message. Fixed code alignment.] --- kernel/rcu/srcu.c |5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index 3318d82..a1ebe6d 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -398,7 +398,7 @@ void call_srcu(struct srcu_struct *sp, struct rcu_head *head, rcu_batch_queue(&sp->batch_queue, head); if (!sp->running) { sp->running = true; - schedule_delayed_work(&sp->work, 0); + queue_delayed_work(system_power_efficient_wq, &sp->work, 0); } spin_unlock_irqrestore(&sp->queue_lock, flags); } @@ -674,7 +674,8 @@ static void srcu_reschedule(struct srcu_struct *sp) } if (pending) - schedule_delayed_work(&sp->work, SRCU_INTERVAL); + queue_delayed_work(system_power_efficient_wq, + &sp->work, SRCU_INTERVAL); } /* -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[RFC PATCH] timekeeping: move clock sync work to power efficient workqueue
From: Shaibal Dutta For better use of CPU idle time, allow the scheduler to select the CPU on which the CMOS clock sync work would be scheduled. This improves idle residency time and conserves power. This functionality is enabled when CONFIG_WQ_POWER_EFFICIENT is selected. Cc: John Stultz Cc: Thomas Gleixner Signed-off-by: Shaibal Dutta [zoran.marko...@linaro.org: Added commit message. Aligned code.] Signed-off-by: Zoran Markovic --- kernel/time/ntp.c |5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index af8d1d4..419a52c 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -514,12 +514,13 @@ static void sync_cmos_clock(struct work_struct *work) next.tv_sec++; next.tv_nsec -= NSEC_PER_SEC; } - schedule_delayed_work(&sync_cmos_work, timespec_to_jiffies(&next)); + queue_delayed_work(system_power_efficient_wq, + &sync_cmos_work, timespec_to_jiffies(&next)); } void ntp_notify_cmos_timer(void) { - schedule_delayed_work(&sync_cmos_work, 0); + queue_delayed_work(system_power_efficient_wq, &sync_cmos_work, 0); } #else -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[RFC PATCHv2] net: core: move core networking work to power efficient workqueue
From: Shaibal Dutta This patch moves the following work to the power efficient workqueue: - Transmit work of netpoll - Destination cache garbage collector work - Link watch event handler work In general, assignment of CPUs to pending work could be deferred to the scheduler in order to extend idle residency time and improve power efficiency. I would value community's opinion on the migration of this work to the power efficient workqueue, with an emphasis on migration of netpoll's transmit work. This functionality is enabled when CONFIG_WQ_POWER_EFFICIENT is selected. Cc: "David S. Miller" Cc: Jiri Pirko Cc: YOSHIFUJI Hideaki Cc: Eric Dumazet Cc: Julian Anastasov Cc: Flavio Leitner Cc: Neil Horman Cc: Patrick McHardy Cc: John Fastabend Cc: Amerigo Wang Cc: Joe Perches Cc: Jason Wang Cc: Antonio Quartulli Cc: Simon Horman Cc: Nikolay Aleksandrov Signed-off-by: Shaibal Dutta [zoran.marko...@linaro.org: Rebased to latest kernel version. Edited calls to mod_delayed_work to reference power efficient workqueue. Added commit message. Fixed code alignment.] 
Signed-off-by: Zoran Markovic --- net/core/dst.c|5 +++-- net/core/link_watch.c |5 +++-- net/core/netpoll.c|6 -- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/net/core/dst.c b/net/core/dst.c index ca4231e..57fba10 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -135,7 +135,8 @@ loop: */ if (expires > 4*HZ) expires = round_jiffies_relative(expires); - schedule_delayed_work(&dst_gc_work, expires); + queue_delayed_work(system_power_efficient_wq, + &dst_gc_work, expires); } spin_unlock_bh(&dst_garbage.lock); @@ -223,7 +224,7 @@ void __dst_free(struct dst_entry *dst) if (dst_garbage.timer_inc > DST_GC_INC) { dst_garbage.timer_inc = DST_GC_INC; dst_garbage.timer_expires = DST_GC_MIN; - mod_delayed_work(system_wq, &dst_gc_work, + mod_delayed_work(system_power_efficient_wq, &dst_gc_work, dst_garbage.timer_expires); } spin_unlock_bh(&dst_garbage.lock); diff --git a/net/core/link_watch.c b/net/core/link_watch.c index 9c3a839..6899935 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -135,9 +135,10 @@ static void linkwatch_schedule_work(int urgent) * override the existing timer. 
*/ if (test_bit(LW_URGENT, &linkwatch_flags)) - mod_delayed_work(system_wq, &linkwatch_work, 0); + mod_delayed_work(system_power_efficient_wq, &linkwatch_work, 0); else - schedule_delayed_work(&linkwatch_work, delay); + queue_delayed_work(system_power_efficient_wq, + &linkwatch_work, delay); } diff --git a/net/core/netpoll.c b/net/core/netpoll.c index c03f3de..6685938 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -101,7 +101,8 @@ static void queue_process(struct work_struct *work) __netif_tx_unlock(txq); local_irq_restore(flags); - schedule_delayed_work(&npinfo->tx_work, HZ/10); + queue_delayed_work(system_power_efficient_wq, + &npinfo->tx_work, HZ/10); return; } __netif_tx_unlock(txq); @@ -423,7 +424,8 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, if (status != NETDEV_TX_OK) { skb_queue_tail(&npinfo->txq, skb); - schedule_delayed_work(&npinfo->tx_work,0); + queue_delayed_work(system_power_efficient_wq, + &npinfo->tx_work, 0); } } EXPORT_SYMBOL(netpoll_send_skb_on_dev); -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[RFC PATCHv2] net: ipv4: move inetpeer garbage collector work to power efficient workqueue
From: Shaibal Dutta Garbage collector work does not have to be bound to the CPU that scheduled it. By moving work to the power-efficient workqueue, the selection of CPU executing the work is left to the scheduler. This extends idle residency times and conserves power. This functionality is enabled when CONFIG_WQ_POWER_EFFICIENT is selected. Cc: "David S. Miller" Cc: Alexey Kuznetsov Cc: James Morris Cc: Hideaki YOSHIFUJI Cc: Patrick McHardy Signed-off-by: Shaibal Dutta [zoran.marko...@linaro.org: Rebased to latest kernel version. Added commit message. Fixed code alignment.] Signed-off-by: Zoran Markovic --- net/ipv4/inetpeer.c |6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index 48f4244..7e3da6c6 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -161,7 +161,8 @@ static void inetpeer_gc_worker(struct work_struct *work) list_splice(&list, &gc_list); spin_unlock_bh(&gc_lock); - schedule_delayed_work(&gc_work, gc_delay); + queue_delayed_work(system_power_efficient_wq, + &gc_work, gc_delay); } /* Called from ip_output.c:ip_init */ @@ -576,7 +577,8 @@ static void inetpeer_inval_rcu(struct rcu_head *head) list_add_tail(&p->gc_list, &gc_list); spin_unlock_bh(&gc_lock); - schedule_delayed_work(&gc_work, gc_delay); + queue_delayed_work(system_power_efficient_wq, + &gc_work, gc_delay); } void inetpeer_invalidate_tree(struct inet_peer_base *base) -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[RFC PATCH] net: core: move core networking work to power efficient workqueue
From: Shaibal Dutta This patch moves the following work to the power efficient workqueue: - Transmit work of netpoll - Destination cache garbage collector work - Link watch event handler work In general, assignment of CPUs to pending work could be deferred to the scheduler in order to extend idle residency time and improve power efficiency. I would value community's opinion on the migration of this work to the power efficient workqueue, with an emphasis on migration of netpoll's transmit work. This functionality is enabled when CONFIG_WQ_POWER_EFFICIENT is selected. Cc: "David S. Miller" Cc: Jiri Pirko Cc: YOSHIFUJI Hideaki Cc: Eric Dumazet Cc: Julian Anastasov Cc: Flavio Leitner Cc: Neil Horman Cc: Patrick McHardy Cc: John Fastabend Cc: Amerigo Wang Cc: Joe Perches Cc: Jason Wang Cc: Antonio Quartulli Cc: Simon Horman Cc: Nikolay Aleksandrov Signed-off-by: Shaibal Dutta [zoran.marko...@linaro.org: Rebased to latest kernel version. Edited calls to mod_delayed_work to reference power efficient workqueue. Added commit message.] 
Signed-off-by: Zoran Markovic --- net/core/dst.c|5 +++-- net/core/link_watch.c |5 +++-- net/core/netpoll.c|6 -- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/net/core/dst.c b/net/core/dst.c index ca4231e..cc28352 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -135,7 +135,8 @@ loop: */ if (expires > 4*HZ) expires = round_jiffies_relative(expires); - schedule_delayed_work(&dst_gc_work, expires); + queue_delayed_work(system_power_efficient_wq, + &dst_gc_work, expires); } spin_unlock_bh(&dst_garbage.lock); @@ -223,7 +224,7 @@ void __dst_free(struct dst_entry *dst) if (dst_garbage.timer_inc > DST_GC_INC) { dst_garbage.timer_inc = DST_GC_INC; dst_garbage.timer_expires = DST_GC_MIN; - mod_delayed_work(system_wq, &dst_gc_work, + mod_delayed_work(system_power_efficient_wq, &dst_gc_work, dst_garbage.timer_expires); } spin_unlock_bh(&dst_garbage.lock); diff --git a/net/core/link_watch.c b/net/core/link_watch.c index 9c3a839..0ae3994 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -135,9 +135,10 @@ static void linkwatch_schedule_work(int urgent) * override the existing timer. 
*/ if (test_bit(LW_URGENT, &linkwatch_flags)) - mod_delayed_work(system_wq, &linkwatch_work, 0); + mod_delayed_work(system_power_efficient_wq, &linkwatch_work, 0); else - schedule_delayed_work(&linkwatch_work, delay); + queue_delayed_work(system_power_efficient_wq, + &linkwatch_work, delay); } diff --git a/net/core/netpoll.c b/net/core/netpoll.c index c03f3de..2c8f839 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -101,7 +101,8 @@ static void queue_process(struct work_struct *work) __netif_tx_unlock(txq); local_irq_restore(flags); - schedule_delayed_work(&npinfo->tx_work, HZ/10); + queue_delayed_work(system_power_efficient_wq, + &npinfo->tx_work, HZ/10); return; } __netif_tx_unlock(txq); @@ -423,7 +424,8 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, if (status != NETDEV_TX_OK) { skb_queue_tail(&npinfo->txq, skb); - schedule_delayed_work(&npinfo->tx_work,0); + queue_delayed_work(system_power_efficient_wq, + &npinfo->tx_work, 0); } } EXPORT_SYMBOL(netpoll_send_skb_on_dev); -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[RFC PATCH] net: ipv4: move inetpeer garbage collector work to power efficient workqueue
From: Shaibal Dutta Garbage collector work does not have to be bound to the CPU that scheduled it. By moving work to the power-efficient workqueue, the selection of CPU executing the work is left to the scheduler. This extends idle residency times and conserves power. This functionality is enabled when CONFIG_WQ_POWER_EFFICIENT is selected. Cc: "David S. Miller" Cc: Alexey Kuznetsov Cc: James Morris Cc: Hideaki YOSHIFUJI Cc: Patrick McHardy Signed-off-by: Shaibal Dutta [zoran.marko...@linaro.org: Rebased to latest kernel version. Added commit message.] Signed-off-by: Zoran Markovic --- net/ipv4/inetpeer.c |6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index 48f4244..87155aa 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -161,7 +161,8 @@ static void inetpeer_gc_worker(struct work_struct *work) list_splice(&list, &gc_list); spin_unlock_bh(&gc_lock); - schedule_delayed_work(&gc_work, gc_delay); + queue_delayed_work(system_power_efficient_wq, + &gc_work, gc_delay); } /* Called from ip_output.c:ip_init */ @@ -576,7 +577,8 @@ static void inetpeer_inval_rcu(struct rcu_head *head) list_add_tail(&p->gc_list, &gc_list); spin_unlock_bh(&gc_lock); - schedule_delayed_work(&gc_work, gc_delay); + queue_delayed_work(system_power_efficient_wq, + &gc_work, gc_delay); } void inetpeer_invalidate_tree(struct inet_peer_base *base) -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[RFC PATCH] net: wireless: move regulatory timeout work to power efficient workqueue
From: Shaibal Dutta For better use of CPU idle time, allow the scheduler to select the CPU on which the timeout work of regulatory settings would be executed. This extends CPU idle residency time and saves power. This functionality is enabled when CONFIG_WQ_POWER_EFFICIENT is selected. Cc: Johannes Berg Cc: "John W. Linville" Cc: "David S. Miller" Signed-off-by: Shaibal Dutta [zoran.marko...@linaro.org: Rebased to latest kernel. Added commit message.] Signed-off-by: Zoran Markovic --- net/wireless/reg.c |9 ++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 9b897fc..6e21011 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -1703,7 +1703,8 @@ static void reg_process_hint(struct regulatory_request *reg_request) if (treatment == REG_REQ_OK || treatment == REG_REQ_ALREADY_SET) return; - schedule_delayed_work(&reg_timeout, msecs_to_jiffies(3142)); + queue_delayed_work(system_power_efficient_wq, + &reg_timeout, msecs_to_jiffies(3142)); return; case NL80211_REGDOM_SET_BY_DRIVER: treatment = reg_process_hint_driver(wiphy, reg_request); @@ -2294,7 +2295,8 @@ static int reg_set_rd_driver(const struct ieee80211_regdomain *rd, request_wiphy = wiphy_idx_to_wiphy(driver_request->wiphy_idx); if (!request_wiphy) { - schedule_delayed_work(&reg_timeout, 0); + queue_delayed_work(system_power_efficient_wq, + &reg_timeout, 0); return -ENODEV; } @@ -2354,7 +2356,8 @@ static int reg_set_rd_country_ie(const struct ieee80211_regdomain *rd, request_wiphy = wiphy_idx_to_wiphy(country_ie_request->wiphy_idx); if (!request_wiphy) { - schedule_delayed_work(&reg_timeout, 0); + queue_delayed_work(system_power_efficient_wq, + &reg_timeout, 0); return -ENODEV; } -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[RFC PATCH] net: rfkill: move poll work to power efficient workqueue
From: Shaibal Dutta This patch moves the rfkill poll_work to the power efficient workqueue. This work does not have to be bound to the CPU that scheduled it, hence the selection of CPU that executes it would be left to the scheduler. Net result is that CPU idle times would be extended, resulting in power savings. This behaviour is enabled when CONFIG_WQ_POWER_EFFICIENT is selected. Cc: Johannes Berg Cc: "John W. Linville" Cc: "David S. Miller" Signed-off-by: Shaibal Dutta [zoran.marko...@linaro.org: Rebased to latest kernel, added commit message. Fixed workqueue selection after suspend/resume cycle.] Signed-off-by: Zoran Markovic --- net/rfkill/core.c |9 ++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/net/rfkill/core.c b/net/rfkill/core.c index ed7e0b4..b3b16c0 100644 --- a/net/rfkill/core.c +++ b/net/rfkill/core.c @@ -789,7 +789,8 @@ void rfkill_resume_polling(struct rfkill *rfkill) if (!rfkill->ops->poll) return; - schedule_work(&rfkill->poll_work.work); + queue_delayed_work(system_power_efficient_wq, + &rfkill->poll_work, 0); } EXPORT_SYMBOL(rfkill_resume_polling); @@ -894,7 +895,8 @@ static void rfkill_poll(struct work_struct *work) */ rfkill->ops->poll(rfkill, rfkill->data); - schedule_delayed_work(&rfkill->poll_work, + queue_delayed_work(system_power_efficient_wq, + &rfkill->poll_work, round_jiffies_relative(POLL_INTERVAL)); } @@ -958,7 +960,8 @@ int __must_check rfkill_register(struct rfkill *rfkill) INIT_WORK(&rfkill->sync_work, rfkill_sync_work); if (rfkill->ops->poll) - schedule_delayed_work(&rfkill->poll_work, + queue_delayed_work(system_power_efficient_wq, + &rfkill->poll_work, round_jiffies_relative(POLL_INTERVAL)); if (!rfkill->persistent || rfkill_epo_lock_active) { -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC PATCH] thermal: add generic cpu hotplug cooling device
Hi Eduardo, > Yeah, I would like to see it. But what I was more interested in seeing > is how long does it take to offline a CPU? > I profiled this over 70 shutdown/startup cycles of CPU1 on Capri-AP (Cortex-A9x2) board and I get: shutdown: 1445usec (average), 3159usec (maximum), 834usec (minimum) startup: 707usec (average), 3159usec (maximum), 327usec (minimum) It's using a 32KHz clock so time resolution is ~30usec. Regards, Zoran >>>>> On 20 September 2013 15:15, Zoran Markovic >>>>> wrote: >>>>>> This patch implements a generic CPU hotplug cooling device. The >>>>>> implementation scales down the number of running CPUs when temperature >>>>>> increases through a thermal trip point and prevents booting CPUs >>>>>> until thermal conditions are restored. Upon restoration, the action >>>>>> of starting up a CPU is left to another entity (e.g. CPU offline >>>>>> governor, for which a patch is in the works). >>>>>> >>>>>> In the past two years, ARM considerably reduced the time required for >>>>>> CPUs to boot and shutdown; this time is now measured in microseconds. >>>>>> This patch is predominantly intended for ARM big.LITTLE architectures >>>>>> where big cores are expected to have a much bigger impact on thermal >>>>>> budget than little cores, resulting in fast temperature ramps to a trip >>>>>> point, i.e. thermal runaways. Switching off the big core(s) may be one >>>>>> of the recovery mechanisms to restore system temperature, but the actual >>>>>> strategy is left to the thermal governor. >>>>>> >>>>>> The assumption is that CPU shutdown/startup is a rare event, so no >>>>>> attempt was made to make the code atomic, i.e. the code evidently races >>>>>> with CPU hotplug driver. The set_cur_state() function offlines CPUs >>>>>> iteratively one at a time, checking the cooling state before each CPU >>>>>> shutdown. A hotplug notifier callback validates any CPU boot requests >>>>>> against current cooling state and approves/denies accordingly. 
This >>>>>> mechanism guarantees that the desired cooling state could be reached in a >>>>>> maximum of d-c iterations, where d and c are the "desired" and "current" >>>>>> cooling states expressed in the number of offline CPUs. >>>>>> >>>>>> Credits to Amit Daniel Kachhap for initial attempt to upstream this >>>>>> feature. >>>>>> >>>>>> Cc: Zhang Rui >>>>>> Cc: Eduardo Valentin >>>>>> Cc: Rob Landley >>>>>> Cc: Amit Daniel Kachhap >>>>>> Cc: Andrew Morton >>>>>> Cc: Durgadoss R >>>>>> Cc: Christian Daudt >>>>>> Cc: James King >>>>>> Signed-off-by: Zoran Markovic >>>>>> --- >>>>>> Documentation/thermal/cpu-cooling-api.txt | 17 ++ >>>>>> drivers/thermal/Kconfig | 10 + >>>>>> drivers/thermal/Makefile |1 + >>>>>> drivers/thermal/cpu_hotplug.c | 362 >>>>>> + >>>>>> include/linux/cpuhp_cooling.h | 57 + >>>>>> 5 files changed, 447 insertions(+) >>>>>> create mode 100644 drivers/thermal/cpu_hotplug.c >>>>>> create mode 100644 include/linux/cpuhp_cooling.h >>>>>> >>>>>> diff --git a/Documentation/thermal/cpu-cooling-api.txt >>>>>> b/Documentation/thermal/cpu-cooling-api.txt >>>>>> index fca24c9..2f94f68 100644 >>>>>> --- a/Documentation/thermal/cpu-cooling-api.txt >>>>>> +++ b/Documentation/thermal/cpu-cooling-api.txt >>>>>> @@ -30,3 +30,20 @@ the user. The registration APIs returns the cooling >>>>>> device pointer. >>>>>> This interface function unregisters the "thermal-cpufreq-%x" >>>>>> cooling device. >>>>>> >>>>>> cdev: Cooling device pointer which has to be unregistered. >>>>>> + >>>>>> +1.2 cpu hotplug registration/unregistration APIs >>>>>> +1.2.1 struct thermal_cooling_device *cpuhp_cooling_register( >>>>>> + st
Re: [RFC PATCH] thermal: add generic cpu hotplug cooling device
Hi Eduardo, I have some graphs created for Broadcom's Capri (Cortex-A9x2) device. I do a full temperature ramp using ARM-proprietary test, which heats it up to ~48C. By hot-unplugging CPU1 I can cool it down to ~40C within seconds. Let me know if you'd like to see the graphs. Regards, Zoran On 29 November 2013 06:08, Eduardo Valentin wrote: > Hello Zoran, > > On 27-11-2013 17:56, Zoran Markovic wrote: >> Pinging again... Does anyone have any opinion on this feature? > > Sorry for not answering you. Yes there is interest in such work. > Besides, your patch is not the very first attempt to do so. If I > remember correctly, when Amit D. K was originally sending the current > cpu cooling device, it included a hotplug part. That is why it was named > cpucooling and not cpufreqcooling. Anyways, the major concerns by that > time was the latencies to off line a CPU, mainly due to task and > structure migration. > > Thus the question is, have you measure the behavior of your system when > using this cooling device? Does it present any cooling effectiveness > during high system load scenarios for instance? In case you have data, > would you be able to share them? > > >> Thanks, >> Zoran >> >> On 4 October 2013 15:52, Zoran Markovic wrote: >>> Any comments on this proposed feature and implementation? Apparently >>> it's also useful for server systems. > > > > >>> Thanks, >>> Zoran >>> >>> On 20 September 2013 15:15, Zoran Markovic >>> wrote: >>>> This patch implements a generic CPU hotplug cooling device. The >>>> implementation scales down the number of running CPUs when temperature >>>> increases through a thermal trip point and prevents booting CPUs >>>> until thermal conditions are restored. Upon restoration, the action >>>> of starting up a CPU is left to another entity (e.g. CPU offline >>>> governor, for which a patch is in the works). 
>>>> >>>> In the past two years, ARM considerably reduced the time required for >>>> CPUs to boot and shutdown; this time is now measured in microseconds. >>>> This patch is predominantly intended for ARM big.LITTLE architectures >>>> where big cores are expected to have a much bigger impact on thermal >>>> budget than little cores, resulting in fast temperature ramps to a trip >>>> point, i.e. thermal runaways. Switching off the big core(s) may be one >>>> of the recovery mechanisms to restore system temperature, but the actual >>>> strategy is left to the thermal governor. >>>> >>>> The assumption is that CPU shutdown/startup is a rare event, so no >>>> attempt was made to make the code atomic, i.e. the code evidently races >>>> with CPU hotplug driver. The set_cur_state() function offlines CPUs >>>> iteratively one at a time, checking the cooling state before each CPU >>>> shutdown. A hotplug notifier callback validates any CPU boot requests >>>> against current cooling state and approves/denies accordingly. This >>>> mechanism guarantees that the desired cooling state could be reached in a >>>> maximum of d-c iterations, where d and c are the "desired" and "current" >>>> cooling states expressed in the number of offline CPUs. >>>> >>>> Credits to Amit Daniel Kachhap for initial attempt to upstream this >>>> feature. 
>>>> >>>> Cc: Zhang Rui >>>> Cc: Eduardo Valentin >>>> Cc: Rob Landley >>>> Cc: Amit Daniel Kachhap >>>> Cc: Andrew Morton >>>> Cc: Durgadoss R >>>> Cc: Christian Daudt >>>> Cc: James King >>>> Signed-off-by: Zoran Markovic >>>> --- >>>> Documentation/thermal/cpu-cooling-api.txt | 17 ++ >>>> drivers/thermal/Kconfig | 10 + >>>> drivers/thermal/Makefile |1 + >>>> drivers/thermal/cpu_hotplug.c | 362 >>>> + >>>> include/linux/cpuhp_cooling.h | 57 + >>>> 5 files changed, 447 insertions(+) >>>> create mode 100644 drivers/thermal/cpu_hotplug.c >>>> create mode 100644 include/linux/cpuhp_cooling.h >>>> >>>> diff --git a/Documentation/thermal/cpu-cooling-api.txt >>>> b/Documentation/thermal/cpu-cooling-api.txt >>>> index fca24c9..2f94f68 100644 >>>> --- a/Documentation/thermal/cpu-cooling-api.txt >>>> +++ b/Docume
Re: [RFC PATCH] thermal: add generic cpu hotplug cooling device
Pinging again... Does anyone have any opinion on this feature? Thanks, Zoran On 4 October 2013 15:52, Zoran Markovic wrote: > Any comments on this proposed feature and implementation? Apparently > it's also useful for server systems. > Thanks, > Zoran > > On 20 September 2013 15:15, Zoran Markovic wrote: >> This patch implements a generic CPU hotplug cooling device. The >> implementation scales down the number of running CPUs when temperature >> increases through a thermal trip point and prevents booting CPUs >> until thermal conditions are restored. Upon restoration, the action >> of starting up a CPU is left to another entity (e.g. CPU offline >> governor, for which a patch is in the works). >> >> In the past two years, ARM considerably reduced the time required for >> CPUs to boot and shutdown; this time is now measured in microseconds. >> This patch is predominantly intended for ARM big.LITTLE architectures >> where big cores are expected to have a much bigger impact on thermal >> budget than little cores, resulting in fast temperature ramps to a trip >> point, i.e. thermal runaways. Switching off the big core(s) may be one >> of the recovery mechanisms to restore system temperature, but the actual >> strategy is left to the thermal governor. >> >> The assumption is that CPU shutdown/startup is a rare event, so no >> attempt was made to make the code atomic, i.e. the code evidently races >> with CPU hotplug driver. The set_cur_state() function offlines CPUs >> iteratively one at a time, checking the cooling state before each CPU >> shutdown. A hotplug notifier callback validates any CPU boot requests >> against current cooling state and approves/denies accordingly. This >> mechanism guarantees that the desired cooling state could be reached in a >> maximum of d-c iterations, where d and c are the "desired" and "current" >> cooling states expressed in the number of offline CPUs. >> >> Credits to Amit Daniel Kachhap for initial attempt to upstream this feature. 
>> >> Cc: Zhang Rui >> Cc: Eduardo Valentin >> Cc: Rob Landley >> Cc: Amit Daniel Kachhap >> Cc: Andrew Morton >> Cc: Durgadoss R >> Cc: Christian Daudt >> Cc: James King >> Signed-off-by: Zoran Markovic >> --- >> Documentation/thermal/cpu-cooling-api.txt | 17 ++ >> drivers/thermal/Kconfig | 10 + >> drivers/thermal/Makefile |1 + >> drivers/thermal/cpu_hotplug.c | 362 >> + >> include/linux/cpuhp_cooling.h | 57 + >> 5 files changed, 447 insertions(+) >> create mode 100644 drivers/thermal/cpu_hotplug.c >> create mode 100644 include/linux/cpuhp_cooling.h >> >> diff --git a/Documentation/thermal/cpu-cooling-api.txt >> b/Documentation/thermal/cpu-cooling-api.txt >> index fca24c9..2f94f68 100644 >> --- a/Documentation/thermal/cpu-cooling-api.txt >> +++ b/Documentation/thermal/cpu-cooling-api.txt >> @@ -30,3 +30,20 @@ the user. The registration APIs returns the cooling >> device pointer. >> This interface function unregisters the "thermal-cpufreq-%x" cooling >> device. >> >> cdev: Cooling device pointer which has to be unregistered. >> + >> +1.2 cpu hotplug registration/unregistration APIs >> +1.2.1 struct thermal_cooling_device *cpuhp_cooling_register( >> + struct cpumask *cpus, const char *ext) >> + >> +This function creates and registers a cpu hotplug cooling device with >> +the name "cpu-hotplug-%s". >> + >> +cpus: cpumask of cpu cores participating in cooling. >> +ext: instance-specific name of device >> + >> +1.2.2 void cpuhotplug_cooling_unregister(struct thermal_cooling_device >> *cdev) >> + >> +This function unregisters and frees the cpu hotplug cooling device cdev. >> + >> +cdev: Pointer to cooling device to unregister. >> + >> diff --git a/drivers/thermal/Kconfig b/drivers/thermal/Kconfig >> index 52b6ed7..3509100 100644 >> --- a/drivers/thermal/Kconfig >> +++ b/drivers/thermal/Kconfig >> @@ -79,6 +79,16 @@ config CPU_THERMAL >> >> If you want this support, you should say Y here. 
>> >> +config CPU_THERMAL_HOTPLUG >> + bool "Generic CPU hotplug cooling" >> + depends on HOTPLUG_CPU >> + help >> + Shutdown CPUs to prevent the device from overheating. This feature >> + uses generic CPU hot-unplug capabilities to control de
[PATCHv6] drivers: power: Detect device suspend/resume lockup and log event in pstore.
From: Benoit Goby Rather than hard-lock the kernel, dump the suspend/resume thread stack and panic() to capture a message in pstore when a driver takes too long to suspend/resume. Default suspend/resume watchdog timeout is set to 12 seconds to be longer than the usbhid 10 second timeout, but could be changed at compile time. Exclude from the watchdog the time spent waiting for children that are resumed asynchronously and time every device, whether or not they resumed synchronously. This patch is targeted for mobile devices where a suspend/resume lockup could cause a system reboot. Information about failing device can be retrieved in subsequent boot session by mounting pstore and inspecting the log. Laptops with EFI-enabled pstore could also benefit from this feature. The hardware watchdog timer is likely suspended during this time and couldn't be relied upon. The soft-lockup detector would eventually tell that tasks are not scheduled, but would provide little context as to why. The patch hence uses system timer and assumes it is still active while the devices are suspended/resumed. This feature can be enabled/disabled during kernel configuration. Cc: Android Kernel Team Cc: Colin Cross Cc: Todd Poynor Cc: San Mehat Cc: Benoit Goby Cc: John Stultz Cc: Pavel Machek Cc: Rafael J. Wysocki Cc: Len Brown Cc: Greg Kroah-Hartman Original-author: San Mehat Signed-off-by: Benoit Goby [zoran.marko...@linaro.org: Changed printk(KERN_EMERG,...) to pr_emerg(...), tweaked commit message. Moved call to dpm_wd_set() before device_lock() in device_resume(). Minor changes to add compile-time inclusion of the feature. Renamed 'dpm_wd...' to 'dpm_watchdog...'. Fixed compile errors/warnings for x86_64 and s390.] 
Signed-off-by: Zoran Markovic --- drivers/base/power/main.c | 72 + kernel/power/Kconfig | 16 ++ 2 files changed, 88 insertions(+) diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 9f098a8..f2633da 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -30,6 +30,8 @@ #include #include #include +#include + #include "../base.h" #include "power.h" @@ -390,6 +392,70 @@ static int dpm_run_callback(pm_callback_t cb, struct device *dev, return error; } +#ifdef CONFIG_DPM_WATCHDOG +struct dpm_watchdog { + struct device *dev; + struct task_struct *tsk; + struct timer_list timer; +}; + +#define DECLARE_DPM_WATCHDOG_ON_STACK(wd) \ + struct dpm_watchdog wd + +/** + * dpm_watchdog_handler - Driver suspend / resume watchdog handler. + * + * Called when a driver has timed out suspending or resuming. + * There's not much we can do here to recover so panic() to + * capture a crash-dump in pstore. + */ +static void dpm_watchdog_handler(unsigned long data) +{ + struct dpm_watchdog *wd = (void *)data; + + dev_emerg(wd->dev, " DPM device timeout \n"); + show_stack(wd->tsk, NULL); + panic("%s %s: unrecoverable failure\n", + dev_driver_string(wd->dev), dev_name(wd->dev)); +} + +/** + * dpm_watchdog_set - Enable pm watchdog for given device. + * @wd: Watchdog. Must be allocated on the stack. + * @dev: Device to handle. + */ +static void dpm_watchdog_set(struct dpm_watchdog *wd, struct device *dev) +{ + struct timer_list *timer = &wd->timer; + + wd->dev = dev; + wd->tsk = current; + + init_timer_on_stack(timer); + /* use same timeout value for both suspend and resume */ + timer->expires = jiffies + HZ * CONFIG_DPM_WATCHDOG_TIMEOUT; + timer->function = dpm_watchdog_handler; + timer->data = (unsigned long)wd; + add_timer(timer); +} + +/** + * dpm_watchdog_clear - Disable suspend/resume watchdog. + * @wd: Watchdog to disable. 
+ */ +static void dpm_watchdog_clear(struct dpm_watchdog *wd) +{ + struct timer_list *timer = &wd->timer; + + del_timer_sync(timer); + destroy_timer_on_stack(timer); +} +#else +#define DECLARE_DPM_WATCHDOG_ON_STACK(wd) +#define dpm_watchdog_set(x, y) +#define dpm_watchdog_clear(x) +#endif + /*- Resume routines -*/ /** @@ -576,6 +642,7 @@ static int device_resume(struct device *dev, pm_message_t state, bool async) pm_callback_t callback = NULL; char *info = NULL; int error = 0; + DECLARE_DPM_WATCHDOG_ON_STACK(wd); TRACE_DEVICE(dev); TRACE_RESUME(0); @@ -584,6 +651,7 @@ static int device_resume(struct device *dev, pm_message_t state, bool async) goto Complete; dpm_wait(dev->parent, async); + dpm_watchdog_set(&wd, dev); device_lock(dev); /* @@ -642,6 +710,7 @@ static int device_resume(struct device *dev, pm_message_t state, bool async) Unlock:
Re: [RFC PATCH] timekeeping: Correct run-time detection of real-time clock.
Hi Feng, Looking at the OMAP implementation, persistent_clock is updated on every read of the 32K counter. If the read doesn't happen often enough to accurately update persistent_clock, then the 32K counter would fail the definition of a persistent clock and some other timekeeping source should be used. Regards, Zoran On 12 October 2013 00:48, Feng Tang wrote: > Hi Zoran, > > Thanks for the patch! (This reply may be too late :)) > > One question just for curiosity: for the counter_32K timer, it's running > at 32K Hz and has one 32b counter. I understand it is only for suspend > time calculation use, but the wrap time for it is about > 4G/32K ~= 128K seconds ~= 35 hours > What if one suspend time is longer than that? > > - Feng > > On Fri, May 17, 2013 at 11:24:05AM -0700, Zoran Markovic wrote: >> Since commit <31ade30692dc9680bfc95700d794818fa3f754ac>, timekeeping_init() >> checks for presence of persistent clock by attempting to read a non-zero >> time value from real-time clock. This is an issue on platforms where >> persistent_clock (instead of a RTC) is implemented as a free-running counter >> starting from zero on each boot and running during suspend. Examples are some >> ARM platforms (e.g. PandaBoard). An attempt to read such a clock during >> timekeeping_init() may return zero value and falsely declare persistent clock >> as missing. Additionally, in the above case suspend times may be accounted >> twice (once from timekeeping_resume() and once from rtc_resume()), resulting >> in a gradual drift of system time. >> >> This patch does a run-time correction of the issue by doing the same check >> during timekeeping_suspend(). >> >> A better long-term solution would have to return error when trying to read >> non-existing clock and zero when trying to read an uninitialized clock, but >> that would require changing all persistent_clock implementations. >> >> This patch addresses the immediate breakage, for now. 
>> >> Cc: John Stultz >> Cc: Thomas Gleixner >> Cc: Feng Tang >> Cc: sta...@vger.kernel.org >> Signed-off-by: Zoran Markovic >> --- >> kernel/time/timekeeping.c |8 >> 1 file changed, 8 insertions(+) >> >> diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c >> index 98cd470..baeeb5c 100644 >> --- a/kernel/time/timekeeping.c >> +++ b/kernel/time/timekeeping.c >> @@ -975,6 +975,14 @@ static int timekeeping_suspend(void) >> >> read_persistent_clock(&timekeeping_suspend_time); >> >> + /* >> + * On some systems the persistent_clock can not be detected at >> + * timekeeping_init by its return value, so if we see a valid >> + * value returned, update the persistent_clock_exists flag. >> + */ >> + if (timekeeping_suspend_time.tv_sec || >> timekeeping_suspend_time.tv_nsec) >> + persistent_clock_exist = true; >> + >> raw_spin_lock_irqsave(&timekeeper_lock, flags); >> write_seqcount_begin(&timekeeper_seq); >> timekeeping_forward_now(tk); >> -- >> 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCHv5] drivers: power: Detect device suspend/resume lockup and log event in pstore.
From: Benoit Goby Rather than hard-lock the kernel, dump the suspend/resume thread stack and panic() to capture a message in pstore when a driver takes too long to suspend/resume. Default suspend/resume watchdog timeout is set to 12 seconds to be longer than the usbhid 10 second timeout, but could be changed at compile time. Exclude from the watchdog the time spent waiting for children that are resumed asynchronously and time every device, whether or not they resumed synchronously. This patch is targeted for mobile devices where a suspend/resume lockup could cause a system reboot. Information about failing device can be retrieved in subsequent boot session by mounting pstore and inspecting the log. Laptops with EFI-enabled pstore could also benefit from this feature. The hardware watchdog timer is likely suspended during this time and couldn't be relied upon. The soft-lockup detector would eventually tell that tasks are not scheduled, but would provide little context as to why. The patch hence uses system timer and assumes it is still active while the devices are suspended/resumed. This feature can be enabled/disabled during kernel configuration. Cc: Android Kernel Team Cc: Colin Cross Cc: Todd Poynor Cc: San Mehat Cc: Benoit Goby Cc: John Stultz Cc: Pavel Machek Cc: Rafael J. Wysocki Cc: Len Brown Cc: Greg Kroah-Hartman Original-author: San Mehat Signed-off-by: Benoit Goby [zoran.marko...@linaro.org: Changed printk(KERN_EMERG,...) to pr_emerg(...), tweaked commit message. Moved call to dpm_wd_set() before device_lock() in device_resume(). Minor changes to add compile-time inclusion of the feature. Renamed 'dpm_wd...' to 'dpm_watchdog...'.] 
Signed-off-by: Zoran Markovic --- drivers/base/power/main.c | 68 + kernel/power/Kconfig | 16 +++ 2 files changed, 84 insertions(+) diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 9f098a8..06fbc62 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -30,6 +30,8 @@ #include #include #include +#include + #include "../base.h" #include "power.h" @@ -55,6 +57,12 @@ struct suspend_stats suspend_stats; static DEFINE_MUTEX(dpm_list_mtx); static pm_message_t pm_transition; +struct dpm_watchdog { + struct device *dev; + struct task_struct *tsk; + struct timer_list timer; +}; + static int async_error; static char *pm_verb(int event) @@ -390,6 +398,60 @@ static int dpm_run_callback(pm_callback_t cb, struct device *dev, return error; } +#ifdef CONFIG_DPM_WATCHDOG +/** + * dpm_watchdog_handler - Driver suspend / resume watchdog handler. + * + * Called when a driver has timed out suspending or resuming. + * There's not much we can do here to recover so panic() to + * capture a crash-dump in pstore. + */ +static void dpm_watchdog_handler(unsigned long data) +{ + struct dpm_watchdog *wd = (void *)data; + + dev_emerg(wd->dev, " DPM device timeout \n"); + show_stack(wd->tsk, NULL); + panic("%s %s: unrecoverable failure\n", + dev_driver_string(wd->dev), dev_name(wd->dev)); +} + +/** + * dpm_watchdog_set - Enable pm watchdog for given device. + * @wd: Watchdog. Must be allocated on the stack. + * @dev: Device to handle. + */ +static void dpm_watchdog_set(struct dpm_watchdog *wd, struct device *dev) +{ + struct timer_list *timer = &wd->timer; + + wd->dev = dev; + wd->tsk = get_current(); + + init_timer_on_stack(timer); + /* use same timeout value for both suspend and resume */ + timer->expires = jiffies + HZ * CONFIG_DPM_WATCHDOG_TIMEOUT; + timer->function = dpm_watchdog_handler; + timer->data = (unsigned long)wd; + add_timer(timer); +} + +/** + * dpm_watchdog_clear - Disable suspend/resume watchdog. + * @wd: Watchdog to disable. 
+ */ +static void dpm_watchdog_clear(struct dpm_watchdog *wd) +{ + struct timer_list *timer = &wd->timer; + + del_timer_sync(timer); + destroy_timer_on_stack(timer); +} +#else +#define dpm_watchdog_set(x, y) +#define dpm_watchdog_clear(x) +#endif + /*- Resume routines -*/ /** @@ -576,6 +638,7 @@ static int device_resume(struct device *dev, pm_message_t state, bool async) pm_callback_t callback = NULL; char *info = NULL; int error = 0; + struct dpm_watchdog wd; TRACE_DEVICE(dev); TRACE_RESUME(0); @@ -584,6 +647,7 @@ static int device_resume(struct device *dev, pm_message_t state, bool async) goto Complete; dpm_wait(dev->parent, async); + dpm_watchdog_set(&wd, dev); device_lock(dev); /* @@ -642,6 +706,7 @@ static int device_resume(struct device *dev, pm_message_t state, bool a
Re: [RFC PATCHv4] drivers: power: Detect device suspend/resume lockup and log event in pstore.
Hi Rafael, Just wondering if you would like anything changed in this patch in order to get it into 3.13. I'd prefer not to miss yet another merge window... Thanks, Zoran On 25 September 2013 15:31, Zoran Markovic wrote: > From: Benoit Goby > > Rather than hard-lock the kernel, dump the suspend/resume thread stack and > panic() to capture a message in pstore when a driver takes too long to > suspend/resume. Default suspend/resume watchdog timeout is set to 12 > seconds to be longer than the usbhid 10 second timeout, but could be > changed at compile time. > > Exclude from the watchdog the time spent waiting for children that > are resumed asynchronously and time every device, whether or not they > resumed synchronously. > > This patch is targeted for mobile devices where a suspend/resume lockup > could cause a system reboot. Information about failing device can be > retrieved in subsequent boot session by mounting pstore and inspecting > the log. Laptops with EFI-enabled pstore could also benefit from > this feature. > > The hardware watchdog timer is likely suspended during this time and > couldn't be relied upon. The soft-lockup detector would eventually tell > that tasks are not scheduled, but would provide little context as to why. > The patch hence uses system timer and assumes it is still active while the > devices are suspended/resumed. > > This feature can be enabled/disabled during kernel configuration. > > Cc: Android Kernel Team > Cc: Colin Cross > Cc: Todd Poynor > Cc: San Mehat > Cc: Benoit Goby > Cc: John Stultz > Cc: Pavel Machek > Cc: Rafael J. Wysocki > Cc: Len Brown > Cc: Greg Kroah-Hartman > Original-author: San Mehat > Signed-off-by: Benoit Goby > [zoran.marko...@linaro.org: Changed printk(KERN_EMERG,...) to pr_emerg(...), > tweaked commit message. Moved call to dpm_wd_set() before device_lock() in > device_resume(). Minor changes to add compile-time inclusion of the feature.] 
> Signed-off-by: Zoran Markovic > --- > drivers/base/power/main.c | 68 > + > kernel/power/Kconfig | 16 +++ > 2 files changed, 84 insertions(+) > > diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c > index 9f098a8..9b7e6b6 100644 > --- a/drivers/base/power/main.c > +++ b/drivers/base/power/main.c > @@ -30,6 +30,8 @@ > #include > #include > #include > +#include > + > #include "../base.h" > #include "power.h" > > @@ -55,6 +57,12 @@ struct suspend_stats suspend_stats; > static DEFINE_MUTEX(dpm_list_mtx); > static pm_message_t pm_transition; > > +struct dpm_watchdog { > + struct device *dev; > + struct task_struct *tsk; > + struct timer_list timer; > +}; > + > static int async_error; > > static char *pm_verb(int event) > @@ -390,6 +398,60 @@ static int dpm_run_callback(pm_callback_t cb, struct > device *dev, > return error; > } > > +#ifdef CONFIG_DPM_WD > +/** > + * dpm_wd_handler - Driver suspend / resume watchdog handler. > + * > + * Called when a driver has timed out suspending or resuming. > + * There's not much we can do here to recover so panic() to > + * capture a crash-dump in pstore. > + */ > +static void dpm_wd_handler(unsigned long data) > +{ > + struct dpm_watchdog *wd = (void *)data; > + > + dev_emerg(wd->dev, " DPM device timeout \n"); > + show_stack(wd->tsk, NULL); > + panic("%s %s: unrecoverable failure\n", > + dev_driver_string(wd->dev), dev_name(wd->dev)); > +} > + > +/** > + * dpm_wd_set - Enable pm watchdog for given device. > + * @wd: Watchdog. Must be allocated on the stack. > + * @dev: Device to handle. 
> + */ > +static void dpm_wd_set(struct dpm_watchdog *wd, struct device *dev) > +{ > + struct timer_list *timer = &wd->timer; > + > + wd->dev = dev; > + wd->tsk = get_current(); > + > + init_timer_on_stack(timer); > + /* use same timeout value for both suspend and resume */ > + timer->expires = jiffies + HZ * CONFIG_DPM_WD_TIMEOUT; > + timer->function = dpm_wd_handler; > + timer->data = (unsigned long)wd; > + add_timer(timer); > +} > + > +/** > + * dpm_wd_clear - Disable suspend/resume watchdog. > + * @wd: Watchdog to disable. > + */ > +static void dpm_wd_clear(struct dpm_watchdog *wd) > +{ > + struct timer_list *timer = &wd->timer; > + > + del_timer_sync(timer); > + destroy_timer_on_stack(timer); > +} > +#else > +#define dpm_wd_set(x, y) > +#
Re: [RFC PATCH] thermal: add generic cpu hotplug cooling device
Any comments on this proposed feature and implementation? Apparently it's also useful for server systems. Thanks, Zoran On 20 September 2013 15:15, Zoran Markovic wrote: > This patch implements a generic CPU hotplug cooling device. The > implementation scales down the number of running CPUs when temperature > increases through a thermal trip point and prevents booting CPUs > until thermal conditions are restored. Upon restoration, the action > of starting up a CPU is left to another entity (e.g. CPU offline > governor, for which a patch is in the works). > > In the past two years, ARM considerably reduced the time required for > CPUs to boot and shutdown; this time is now measured in microseconds. > This patch is predominantly intended for ARM big.LITTLE architectures > where big cores are expected to have a much bigger impact on thermal > budget than little cores, resulting in fast temperature ramps to a trip > point, i.e. thermal runaways. Switching off the big core(s) may be one > of the recovery mechanisms to restore system temperature, but the actual > strategy is left to the thermal governor. > > The assumption is that CPU shutdown/startup is a rare event, so no > attempt was made to make the code atomic, i.e. the code evidently races > with CPU hotplug driver. The set_cur_state() function offlines CPUs > iteratively one at a time, checking the cooling state before each CPU > shutdown. A hotplug notifier callback validates any CPU boot requests > against current cooling state and approves/denies accordingly. This > mechanism guarantees that the desired cooling state could be reached in a > maximum of d-c iterations, where d and c are the "desired" and "current" > cooling states expressed in the number of offline CPUs. > > Credits to Amit Daniel Kachhap for initial attempt to upstream this feature. 
> > Cc: Zhang Rui > Cc: Eduardo Valentin > Cc: Rob Landley > Cc: Amit Daniel Kachhap > Cc: Andrew Morton > Cc: Durgadoss R > Cc: Christian Daudt > Cc: James King > Signed-off-by: Zoran Markovic > --- > Documentation/thermal/cpu-cooling-api.txt | 17 ++ > drivers/thermal/Kconfig | 10 + > drivers/thermal/Makefile |1 + > drivers/thermal/cpu_hotplug.c | 362 > + > include/linux/cpuhp_cooling.h | 57 + > 5 files changed, 447 insertions(+) > create mode 100644 drivers/thermal/cpu_hotplug.c > create mode 100644 include/linux/cpuhp_cooling.h > > diff --git a/Documentation/thermal/cpu-cooling-api.txt > b/Documentation/thermal/cpu-cooling-api.txt > index fca24c9..2f94f68 100644 > --- a/Documentation/thermal/cpu-cooling-api.txt > +++ b/Documentation/thermal/cpu-cooling-api.txt > @@ -30,3 +30,20 @@ the user. The registration APIs returns the cooling device > pointer. > This interface function unregisters the "thermal-cpufreq-%x" cooling > device. > > cdev: Cooling device pointer which has to be unregistered. > + > +1.2 cpu hotplug registration/unregistration APIs > +1.2.1 struct thermal_cooling_device *cpuhp_cooling_register( > + struct cpumask *cpus, const char *ext) > + > +This function creates and registers a cpu hotplug cooling device with > +the name "cpu-hotplug-%s". > + > +cpus: cpumask of cpu cores participating in cooling. > +ext: instance-specific name of device > + > +1.2.2 void cpuhotplug_cooling_unregister(struct thermal_cooling_device *cdev) > + > +This function unregisters and frees the cpu hotplug cooling device cdev. > + > +cdev: Pointer to cooling device to unregister. > + > diff --git a/drivers/thermal/Kconfig b/drivers/thermal/Kconfig > index 52b6ed7..3509100 100644 > --- a/drivers/thermal/Kconfig > +++ b/drivers/thermal/Kconfig > @@ -79,6 +79,16 @@ config CPU_THERMAL > > If you want this support, you should say Y here. 
> > +config CPU_THERMAL_HOTPLUG > + bool "Generic CPU hotplug cooling" > + depends on HOTPLUG_CPU > + help > + Shutdown CPUs to prevent the device from overheating. This feature > + uses generic CPU hot-unplug capabilities to control device > + temperature. When the temperature increases over a trip point, a > + random subset of CPUs is shut down to reach the desired cooling > + state. > + > config THERMAL_EMULATION > bool "Thermal emulation mode support" > help > diff --git a/drivers/thermal/Makefile b/drivers/thermal/Makefile > index 5ee0db0..0bd08be 100644 > --- a/drivers/thermal/Makefile > +++ b/drivers/thermal/Makefile > @@ -12,6 +12,7 @@ thermal_sys-$(CONFIG_THERMA
[RFC PATCHv4] drivers: power: Detect device suspend/resume lockup and log event in pstore.
From: Benoit Goby Rather than hard-lock the kernel, dump the suspend/resume thread stack and panic() to capture a message in pstore when a driver takes too long to suspend/resume. Default suspend/resume watchdog timeout is set to 12 seconds to be longer than the usbhid 10 second timeout, but could be changed at compile time. Exclude from the watchdog the time spent waiting for children that are resumed asynchronously and time every device, whether or not they resumed synchronously. This patch is targeted for mobile devices where a suspend/resume lockup could cause a system reboot. Information about failing device can be retrieved in subsequent boot session by mounting pstore and inspecting the log. Laptops with EFI-enabled pstore could also benefit from this feature. The hardware watchdog timer is likely suspended during this time and couldn't be relied upon. The soft-lockup detector would eventually tell that tasks are not scheduled, but would provide little context as to why. The patch hence uses system timer and assumes it is still active while the devices are suspended/resumed. This feature can be enabled/disabled during kernel configuration. Cc: Android Kernel Team Cc: Colin Cross Cc: Todd Poynor Cc: San Mehat Cc: Benoit Goby Cc: John Stultz Cc: Pavel Machek Cc: Rafael J. Wysocki Cc: Len Brown Cc: Greg Kroah-Hartman Original-author: San Mehat Signed-off-by: Benoit Goby [zoran.marko...@linaro.org: Changed printk(KERN_EMERG,...) to pr_emerg(...), tweaked commit message. Moved call to dpm_wd_set() before device_lock() in device_resume(). Minor changes to add compile-time inclusion of the feature.] 
Signed-off-by: Zoran Markovic --- drivers/base/power/main.c | 68 + kernel/power/Kconfig | 16 +++ 2 files changed, 84 insertions(+) diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 9f098a8..9b7e6b6 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -30,6 +30,8 @@ #include #include #include +#include + #include "../base.h" #include "power.h" @@ -55,6 +57,12 @@ struct suspend_stats suspend_stats; static DEFINE_MUTEX(dpm_list_mtx); static pm_message_t pm_transition; +struct dpm_watchdog { + struct device *dev; + struct task_struct *tsk; + struct timer_list timer; +}; + static int async_error; static char *pm_verb(int event) @@ -390,6 +398,60 @@ static int dpm_run_callback(pm_callback_t cb, struct device *dev, return error; } +#ifdef CONFIG_DPM_WD +/** + * dpm_wd_handler - Driver suspend / resume watchdog handler. + * + * Called when a driver has timed out suspending or resuming. + * There's not much we can do here to recover so panic() to + * capture a crash-dump in pstore. + */ +static void dpm_wd_handler(unsigned long data) +{ + struct dpm_watchdog *wd = (void *)data; + + dev_emerg(wd->dev, " DPM device timeout \n"); + show_stack(wd->tsk, NULL); + panic("%s %s: unrecoverable failure\n", + dev_driver_string(wd->dev), dev_name(wd->dev)); +} + +/** + * dpm_wd_set - Enable pm watchdog for given device. + * @wd: Watchdog. Must be allocated on the stack. + * @dev: Device to handle. + */ +static void dpm_wd_set(struct dpm_watchdog *wd, struct device *dev) +{ + struct timer_list *timer = &wd->timer; + + wd->dev = dev; + wd->tsk = get_current(); + + init_timer_on_stack(timer); + /* use same timeout value for both suspend and resume */ + timer->expires = jiffies + HZ * CONFIG_DPM_WD_TIMEOUT; + timer->function = dpm_wd_handler; + timer->data = (unsigned long)wd; + add_timer(timer); +} + +/** + * dpm_wd_clear - Disable suspend/resume watchdog. + * @wd: Watchdog to disable. 
+ */ +static void dpm_wd_clear(struct dpm_watchdog *wd) +{ + struct timer_list *timer = &wd->timer; + + del_timer_sync(timer); + destroy_timer_on_stack(timer); +} +#else +#define dpm_wd_set(x, y) +#define dpm_wd_clear(x) +#endif + /*- Resume routines -*/ /** @@ -576,6 +638,7 @@ static int device_resume(struct device *dev, pm_message_t state, bool async) pm_callback_t callback = NULL; char *info = NULL; int error = 0; + struct dpm_watchdog wd; TRACE_DEVICE(dev); TRACE_RESUME(0); @@ -584,6 +647,7 @@ static int device_resume(struct device *dev, pm_message_t state, bool async) goto Complete; dpm_wait(dev->parent, async); + dpm_wd_set(&wd, dev); device_lock(dev); /* @@ -642,6 +706,7 @@ static int device_resume(struct device *dev, pm_message_t state, bool async) Unlock: device_unlock(dev); + dpm_wd_clear(&wd); Complete: complete_all(&dev->power.compl
[RFC PATCH] thermal: add generic cpu hotplug cooling device
This patch implements a generic CPU hotplug cooling device. The implementation scales down the number of running CPUs when the temperature rises past a thermal trip point and prevents booting CPUs until thermal conditions are restored. Upon restoration, the action of starting up a CPU is left to another entity (e.g. a CPU offline governor, for which a patch is in the works). In the past two years, ARM has considerably reduced the time required for CPUs to boot and shut down; this time is now measured in microseconds. This patch is predominantly intended for ARM big.LITTLE architectures where big cores are expected to have a much bigger impact on the thermal budget than little cores, resulting in fast temperature ramps to a trip point, i.e. thermal runaways. Switching off the big core(s) may be one of the recovery mechanisms to restore system temperature, but the actual strategy is left to the thermal governor. The assumption is that CPU shutdown/startup is a rare event, so no attempt was made to make the code atomic, i.e. the code evidently races with the CPU hotplug driver. The set_cur_state() function offlines CPUs iteratively, one at a time, checking the cooling state before each CPU shutdown. A hotplug notifier callback validates any CPU boot request against the current cooling state and approves or denies it accordingly. This mechanism guarantees that the desired cooling state can be reached in at most d-c iterations, where d and c are the "desired" and "current" cooling states expressed as the number of offline CPUs. Credit to Amit Daniel Kachhap for the initial attempt to upstream this feature. 
Cc: Zhang Rui Cc: Eduardo Valentin Cc: Rob Landley Cc: Amit Daniel Kachhap Cc: Andrew Morton Cc: Durgadoss R Cc: Christian Daudt Cc: James King Signed-off-by: Zoran Markovic --- Documentation/thermal/cpu-cooling-api.txt | 17 ++ drivers/thermal/Kconfig | 10 + drivers/thermal/Makefile |1 + drivers/thermal/cpu_hotplug.c | 362 + include/linux/cpuhp_cooling.h | 57 + 5 files changed, 447 insertions(+) create mode 100644 drivers/thermal/cpu_hotplug.c create mode 100644 include/linux/cpuhp_cooling.h diff --git a/Documentation/thermal/cpu-cooling-api.txt b/Documentation/thermal/cpu-cooling-api.txt index fca24c9..2f94f68 100644 --- a/Documentation/thermal/cpu-cooling-api.txt +++ b/Documentation/thermal/cpu-cooling-api.txt @@ -30,3 +30,20 @@ the user. The registration APIs returns the cooling device pointer. This interface function unregisters the "thermal-cpufreq-%x" cooling device. cdev: Cooling device pointer which has to be unregistered. + +1.2 cpu hotplug registration/unregistration APIs +1.2.1 struct thermal_cooling_device *cpuhp_cooling_register( + struct cpumask *cpus, const char *ext) + +This function creates and registers a cpu hotplug cooling device with +the name "cpu-hotplug-%s". + +cpus: cpumask of cpu cores participating in cooling. +ext: instance-specific name of device + +1.2.2 void cpuhotplug_cooling_unregister(struct thermal_cooling_device *cdev) + +This function unregisters and frees the cpu hotplug cooling device cdev. + +cdev: Pointer to cooling device to unregister. + diff --git a/drivers/thermal/Kconfig b/drivers/thermal/Kconfig index 52b6ed7..3509100 100644 --- a/drivers/thermal/Kconfig +++ b/drivers/thermal/Kconfig @@ -79,6 +79,16 @@ config CPU_THERMAL If you want this support, you should say Y here. +config CPU_THERMAL_HOTPLUG + bool "Generic CPU hotplug cooling" + depends on HOTPLUG_CPU + help + Shutdown CPUs to prevent the device from overheating. This feature + uses generic CPU hot-unplug capabilities to control device + temperature. 
When the temperature increases over a trip point, a + random subset of CPUs is shut down to reach the desired cooling + state. + config THERMAL_EMULATION bool "Thermal emulation mode support" help diff --git a/drivers/thermal/Makefile b/drivers/thermal/Makefile index 5ee0db0..0bd08be 100644 --- a/drivers/thermal/Makefile +++ b/drivers/thermal/Makefile @@ -12,6 +12,7 @@ thermal_sys-$(CONFIG_THERMAL_GOV_USER_SPACE) += user_space.o # cpufreq cooling thermal_sys-$(CONFIG_CPU_THERMAL) += cpu_cooling.o +thermal_sys-$(CONFIG_CPU_THERMAL_HOTPLUG) += cpu_hotplug.o # platform thermal drivers obj-$(CONFIG_SPEAR_THERMAL)+= spear_thermal.o diff --git a/drivers/thermal/cpu_hotplug.c b/drivers/thermal/cpu_hotplug.c new file mode 100644 index 000..8c3021e --- /dev/null +++ b/drivers/thermal/cpu_hotplug.c @@ -0,0 +1,362 @@ +/* + * drivers/thermal/cpu_hotplug.c + * + * Copyright (C) 2013 Broadcom Corporation Ltd. + * Copyright (C) 2013 Zoran Markovic + * + * ~~ + * This pro
Re: [RFC PATCH] mmc: Enable wakeup_sources for mmc core
Hi Ulf, Thanks for reviewing this; it was very helpful! > 1. mmc_detect_change does obviously not have to be run the same number > of times as the mmc_rescan function. In other words, the calls to > __pm_stay_awake is not paired with __pm_relay, I suppose this does not > matter? It shouldn't, since a single __pm_relax() would cancel all previous calls to __pm_stay_awake() on the same wakeup source. What is important is that mmc_rescan() is scheduled after __pm_stay_awake() to make sure the wakeup source is eventually released. > 2. mmc_detect_change can for example be called while the device > suspend sequence is progressing. At this point the rescan work is > disabled, thus __pm_relax will not be called, until the next rescan > work as executed which is after the complete resume cycle > (mmc_pm_notify:PM_POST_SUSPEND). Is that an issue? Once started, mmc_detect_change() should run uninterrupted until it calls __pm_stay_awake(), which should abort any pending suspend request. The abort sequence should restart the rescan work, so __pm_relax() eventually gets called. >> /* If there is a non-removable card registered, only scan once */ >> - if ((host->caps & MMC_CAP_NONREMOVABLE) && host->rescan_entered) >> + if ((host->caps & MMC_CAP_NONREMOVABLE) && host->rescan_entered) { >> + __pm_relax(host->ws); > > By calling __pm_relax here, this indicates to me that is seems like > you might have prevented, even for a very small timeslot, with a > MMC_CAP_NONREMOVABLE card/host from the system to suspend. > > For sure, you must not prevent the suspend even for small timeslots, > when MMC_CAP_NONREMOVABLE is set. I agree. It appears that the corresponding __pm_stay_awake() is called indiscriminately on system resume regardless of card type, so this needs to be fixed. 
>> mmc_release_host(host); >> >> out: >> - if (host->caps & MMC_CAP_NEEDS_POLL) >> + if (extend_wakeup) >> + /* extra 1/2 second should be enough, hopefully */ >> + __pm_wakeup_event(host->ws, MSEC_PER_SEC/2); >> + else >> + __pm_relax(host->ws); >> + >> + if (host->caps & MMC_CAP_NEEDS_POLL) { >> + __pm_stay_awake(host->ws); > > This does not make sense. > > So when using polling mode to detect card insert/remove, you will > prevent suspend forever? Maybe I missed a point somewhere? > >> mmc_schedule_delayed_work(&host->detect, HZ); >> + } >> } You are right, and I find it interesting that the same wake_lock() call exists in the Android kernel. Would someone from the Android team be able to comment? >> /* clear pm flags now and let card drivers set them as needed */ >> @@ -2628,7 +2645,8 @@ int mmc_suspend_host(struct mmc_host *host) >> { > > This function has become deprecated. You need to rebase this patch and > please do not add some new code in here. > If suspend is now initiated from the bus level, will there be a host-level suspend/resume function at all? I need to know where this code should move in the next revision of patch... Regards, Zoran -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC PATCHv3] drivers: power: Detect device suspend/resume lockup and log event in pstore.
> Is there any practical reason why it should go into the next release? Android folks find this useful, albeit a debug feature. Zoran -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC PATCHv3] drivers: power: Detect device suspend/resume lockup and log event in pstore.
Hi Rafael, > It doesn't look too bad from a quick look, but there's a couple of things > I don't like in it still (relatively minor). If there are things you would like changed in this patch, please let me know. It would be nice to catch the 3.12 merge window. Thanks, - Zoran -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC PATCHv3] drivers: power: Detect device suspend/resume lockup and log event in pstore.
Rafael, I haven't seen any other proposals/alternatives on how to do this. Is there anything else we should do to get this upstream? I believe this is a valuable debug feature for Android and it now explicitly depends on pstore... Thanks, Zoran On 30 July 2013 13:38, Zoran Markovic wrote: > From: Benoit Goby > > Rather than hard-lock the kernel, dump the suspend/resume thread stack and > panic() to capture a message in pstore when a driver takes too long to > suspend/resume. Default suspend/resume watchdog timeout is set to 12 > seconds to be longer than the usbhid 10 second timeout, but could be > changed at compile time. > > Exclude from the watchdog the time spent waiting for children that > are resumed asynchronously and time every device, whether or not they > resumed synchronously. > > This patch is targeted for mobile devices where a suspend/resume lockup > could cause a system reboot. Information about failing device can be > retrieved in subsequent boot session by mounting pstore and inspecting > the log. > > The hardware watchdog timer is likely suspended during this time and > couldn't be relied upon. The soft-lockup detector would eventually tell > that tasks are not scheduled, but would provide little context as to why. > The patch hence uses system timer and assumes it is still active while the > devices are suspended/resumed. > > This feature can be enabled/disabled during kernel configuration. > > Cc: Android Kernel Team > Cc: Colin Cross > Cc: Todd Poynor > Cc: San Mehat > Cc: Benoit Goby > Cc: John Stultz > Cc: Pavel Machek > Cc: Rafael J. Wysocki > Cc: Len Brown > Cc: Greg Kroah-Hartman > Original-author: San Mehat > Signed-off-by: Benoit Goby > [zoran.marko...@linaro.org: Changed printk(KERN_EMERG,...) to pr_emerg(...), > tweaked commit message. Minor changes to add compile-time inclusion of > the feature.] 
> Signed-off-by: Zoran Markovic > --- > v3: > * Added explicit dependency on pstore > * Collapsed recovery options to system panic only > * Logged driver string in panic message > > drivers/base/power/main.c | 70 > + > kernel/power/Kconfig | 16 +++ > 2 files changed, 86 insertions(+) > > diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c > index 5a9b656..c19aec0 100644 > --- a/drivers/base/power/main.c > +++ b/drivers/base/power/main.c > @@ -29,6 +29,8 @@ > #include > #include > #include > +#include > + > #include "../base.h" > #include "power.h" > > @@ -54,6 +56,12 @@ struct suspend_stats suspend_stats; > static DEFINE_MUTEX(dpm_list_mtx); > static pm_message_t pm_transition; > > +struct dpm_watchdog { > + struct device *dev; > + struct task_struct *tsk; > + struct timer_list timer; > +}; > + > static int async_error; > > /** > @@ -384,6 +392,60 @@ static int dpm_run_callback(pm_callback_t cb, struct > device *dev, > return error; > } > > +#ifdef CONFIG_DPM_WD > +/** > + * dpm_wd_handler - Driver suspend / resume watchdog handler. > + * > + * Called when a driver has timed out suspending or resuming. > + * There's not much we can do here to recover so panic() to > + * capture a crash-dump in pstore. > + */ > +static void dpm_wd_handler(unsigned long data) > +{ > + struct dpm_watchdog *wd = (void *)data; > + > + dev_emerg(wd->dev, " DPM device timeout \n"); > + show_stack(wd->tsk, NULL); > + panic("%s %s: unrecoverable failure\n", > + dev_driver_string(wd->dev), dev_name(wd->dev)); > +} > + > +/** > + * dpm_wd_set - Enable pm watchdog for given device. > + * @wd: Watchdog. Must be allocated on the stack. > + * @dev: Device to handle. 
> + */ > +static void dpm_wd_set(struct dpm_watchdog *wd, struct device *dev) > +{ > + struct timer_list *timer = &wd->timer; > + > + wd->dev = dev; > + wd->tsk = get_current(); > + > + init_timer_on_stack(timer); > + /* use same timeout value for both suspend and resume */ > + timer->expires = jiffies + HZ * CONFIG_DPM_WD_TIMEOUT; > + timer->function = dpm_wd_handler; > + timer->data = (unsigned long)wd; > + add_timer(timer); > +} > + > +/** > + * dpm_wd_clear - Disable suspend/resume watchdog. > + * @wd: Watchdog to disable. > + */ > +static void dpm_wd_clear(struct dpm_watchdog *wd) > +{ > + struct timer_list *timer = &wd->timer; > + > + del_timer_sync(timer); > + destroy_timer_on_stack(timer);
Re: [RFC PATCH] pm: prevent suspend until power supply events are processed
Any opinions on this patch? Regards, Zoran On 2 August 2013 13:38, Zoran Markovic wrote: > This patch, originally authored by Arve Hjonnevag and Todd Poynor, > prevents the system from entering suspend mode until the power > supply plug, unplug, or any other change of state event is fully > processed. This guarantees that the screen lights up and displays > the battery charging state. The implementation uses the power > supply wakeup_source object. > > Cc: Anton Vorontsov > Cc: David Woodhouse > Cc: Arve Hjonnevag > Cc: Todd Poynor > Cc: John Stultz > Signed-off-by: Zoran Markovic > --- > drivers/power/power_supply_core.c | 37 > +++-- > include/linux/power_supply.h |2 ++ > 2 files changed, 33 insertions(+), 6 deletions(-) > > diff --git a/drivers/power/power_supply_core.c > b/drivers/power/power_supply_core.c > index 3b2d5df..e68d598 100644 > --- a/drivers/power/power_supply_core.c > +++ b/drivers/power/power_supply_core.c > @@ -67,23 +67,41 @@ static int __power_supply_changed_work(struct device > *dev, void *data) > > static void power_supply_changed_work(struct work_struct *work) > { > + unsigned long flags; > struct power_supply *psy = container_of(work, struct power_supply, > changed_work); > > dev_dbg(psy->dev, "%s\n", __func__); > > - class_for_each_device(power_supply_class, NULL, psy, > - __power_supply_changed_work); > - > - power_supply_update_leds(psy); > - > - kobject_uevent(&psy->dev->kobj, KOBJ_CHANGE); > + spin_lock_irqsave(&psy->changed_lock, flags); > + if (psy->changed) { > + psy->changed = false; > + spin_unlock_irqrestore(&psy->changed_lock, flags); > + class_for_each_device(power_supply_class, NULL, psy, > + __power_supply_changed_work); > + power_supply_update_leds(psy); > + kobject_uevent(&psy->dev->kobj, KOBJ_CHANGE); > + spin_lock_irqsave(&psy->changed_lock, flags); > + } > + /* dependent power supplies (e.g. 
battery) may have changed > +* state as a result of this event, so poll again and hold > +* the wakeup_source until all events are processed. > +*/ > + if (!psy->changed) > + pm_relax(psy->dev); > + spin_unlock_irqrestore(&psy->changed_lock, flags); > } > > void power_supply_changed(struct power_supply *psy) > { > + unsigned long flags; > + > dev_dbg(psy->dev, "%s\n", __func__); > > + spin_lock_irqsave(&psy->changed_lock, flags); > + psy->changed = true; > + pm_stay_awake(psy->dev); > + spin_unlock_irqrestore(&psy->changed_lock, flags); > schedule_work(&psy->changed_work); > } > EXPORT_SYMBOL_GPL(power_supply_changed); > @@ -500,6 +518,11 @@ int power_supply_register(struct device *parent, struct > power_supply *psy) > goto check_supplies_failed; > } > > + spin_lock_init(&psy->changed_lock); > + rc = device_init_wakeup(dev, true); > + if (rc) > + goto wakeup_init_failed; > + > rc = kobject_set_name(&dev->kobj, "%s", psy->name); > if (rc) > goto kobject_set_name_failed; > @@ -529,6 +552,7 @@ create_triggers_failed: > register_cooler_failed: > psy_unregister_thermal(psy); > register_thermal_failed: > +wakeup_init_failed: > device_del(dev); > kobject_set_name_failed: > device_add_failed: > @@ -546,6 +570,7 @@ void power_supply_unregister(struct power_supply *psy) > power_supply_remove_triggers(psy); > psy_unregister_cooler(psy); > psy_unregister_thermal(psy); > + device_init_wakeup(psy->dev, false); > device_unregister(psy->dev); > } > EXPORT_SYMBOL_GPL(power_supply_unregister); > diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h > index 804b906..253d412 100644 > --- a/include/linux/power_supply.h > +++ b/include/linux/power_supply.h > @@ -194,6 +194,8 @@ struct power_supply { > /* private */ > struct device *dev; > struct work_struct changed_work; > + spinlock_t changed_lock; > + bool changed; > #ifdef CONFIG_THERMAL > struct thermal_zone_device *tzd; > struct thermal_cooling_device *tcd; > -- > 1.7.9.5 > -- To unsubscribe from this list: send 
the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC PATCH] mmc: Enable wakeup_sources for mmc core
Ulf, > I got confirmation from Broadcom that all cell phone reference designs > have card insert/removal configured as a wakeup IRQ. Unless our > customers change that - which I doubt - this results in a considerable > number of products implementing this feature. > > Please let me know how you wish to proceed. I think this patch would be useful for all mobile applications. What are the chances of getting this in the next kernel version? Thanks, Zoran -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[RFC PATCH] pm: prevent suspend until power supply events are processed
This patch, originally authored by Arve Hjonnevag and Todd Poynor, prevents the system from entering suspend mode until the power supply plug, unplug, or any other change of state event is fully processed. This guarantees that the screen lights up and displays the battery charging state. The implementation uses the power supply wakeup_source object. Cc: Anton Vorontsov Cc: David Woodhouse Cc: Arve Hjonnevag Cc: Todd Poynor Cc: John Stultz Signed-off-by: Zoran Markovic --- drivers/power/power_supply_core.c | 37 +++-- include/linux/power_supply.h |2 ++ 2 files changed, 33 insertions(+), 6 deletions(-) diff --git a/drivers/power/power_supply_core.c b/drivers/power/power_supply_core.c index 3b2d5df..e68d598 100644 --- a/drivers/power/power_supply_core.c +++ b/drivers/power/power_supply_core.c @@ -67,23 +67,41 @@ static int __power_supply_changed_work(struct device *dev, void *data) static void power_supply_changed_work(struct work_struct *work) { + unsigned long flags; struct power_supply *psy = container_of(work, struct power_supply, changed_work); dev_dbg(psy->dev, "%s\n", __func__); - class_for_each_device(power_supply_class, NULL, psy, - __power_supply_changed_work); - - power_supply_update_leds(psy); - - kobject_uevent(&psy->dev->kobj, KOBJ_CHANGE); + spin_lock_irqsave(&psy->changed_lock, flags); + if (psy->changed) { + psy->changed = false; + spin_unlock_irqrestore(&psy->changed_lock, flags); + class_for_each_device(power_supply_class, NULL, psy, + __power_supply_changed_work); + power_supply_update_leds(psy); + kobject_uevent(&psy->dev->kobj, KOBJ_CHANGE); + spin_lock_irqsave(&psy->changed_lock, flags); + } + /* dependent power supplies (e.g. battery) may have changed +* state as a result of this event, so poll again and hold +* the wakeup_source until all events are processed. 
+*/ + if (!psy->changed) + pm_relax(psy->dev); + spin_unlock_irqrestore(&psy->changed_lock, flags); } void power_supply_changed(struct power_supply *psy) { + unsigned long flags; + dev_dbg(psy->dev, "%s\n", __func__); + spin_lock_irqsave(&psy->changed_lock, flags); + psy->changed = true; + pm_stay_awake(psy->dev); + spin_unlock_irqrestore(&psy->changed_lock, flags); schedule_work(&psy->changed_work); } EXPORT_SYMBOL_GPL(power_supply_changed); @@ -500,6 +518,11 @@ int power_supply_register(struct device *parent, struct power_supply *psy) goto check_supplies_failed; } + spin_lock_init(&psy->changed_lock); + rc = device_init_wakeup(dev, true); + if (rc) + goto wakeup_init_failed; + rc = kobject_set_name(&dev->kobj, "%s", psy->name); if (rc) goto kobject_set_name_failed; @@ -529,6 +552,7 @@ create_triggers_failed: register_cooler_failed: psy_unregister_thermal(psy); register_thermal_failed: +wakeup_init_failed: device_del(dev); kobject_set_name_failed: device_add_failed: @@ -546,6 +570,7 @@ void power_supply_unregister(struct power_supply *psy) power_supply_remove_triggers(psy); psy_unregister_cooler(psy); psy_unregister_thermal(psy); + device_init_wakeup(psy->dev, false); device_unregister(psy->dev); } EXPORT_SYMBOL_GPL(power_supply_unregister); diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index 804b906..253d412 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -194,6 +194,8 @@ struct power_supply { /* private */ struct device *dev; struct work_struct changed_work; + spinlock_t changed_lock; + bool changed; #ifdef CONFIG_THERMAL struct thermal_zone_device *tzd; struct thermal_cooling_device *tcd; -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC PATCH] mmc: Enable wakeup_sources for mmc core
Ulf, I got confirmation from Broadcom that all cell phone reference designs have card insert/removal configured as a wakeup IRQ. Unless our customers change that - which I doubt - this results in a considerable number of products implementing this feature. Please let me know how you wish to proceed. Cheers, Zoran On 26 June 2013 13:57, Ulf Hansson wrote: > On 24 June 2013 21:58, Zoran Markovic wrote: >>>> This patch is ported from the Android common tree, so you've probably >>>> been using it. >>> >>> We removed more or less all Android code in the mmc subsystem, since >>> it just didn't work. :-) >>> >>> The "deferred resume" was very useful though, so after some rework we >>> kept it and could then improve the system resume time significantly. >> >> For what it's worth, I fixed one bug I noticed in the Android kernel: >> if a system has a non-removable MMC device, a suspend/resume cycle on >> this device would hold a wake lock forever. This was a visible issue >> on the panda board I am using. >> >> If there are doubts on whether or not the system should stay awake >> during a MMC mount, we have the option to make the calls to >> wakeup_source_register/unregister configurable. Skipping these calls >> would leave the .ws field NULL, in which case >> __pm_stay_awake/__pm_relax/__pm_wakeup_event would do nothing. > > Even if we make this feature configurable, I can't see any host driver > that would benefit from it as of today. The reason is simply that host > drivers do not configure it's card detect irq as a wakeup irq. Myself > is also having quite hard to see the benefit of doing that, but I > don't know all the use cases. > > Let's see if we can get someone else to provide input... > >> >> Thoughts? 
>> >> - Zoran > > > Kind regards > Ulf Hansson -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[RFC PATCHv3] drivers: power: Detect device suspend/resume lockup and log event in pstore.
From: Benoit Goby Rather than hard-lock the kernel, dump the suspend/resume thread stack and panic() to capture a message in pstore when a driver takes too long to suspend/resume. Default suspend/resume watchdog timeout is set to 12 seconds to be longer than the usbhid 10 second timeout, but could be changed at compile time. Exclude from the watchdog the time spent waiting for children that are resumed asynchronously and time every device, whether or not they resumed synchronously. This patch is targeted for mobile devices where a suspend/resume lockup could cause a system reboot. Information about failing device can be retrieved in subsequent boot session by mounting pstore and inspecting the log. The hardware watchdog timer is likely suspended during this time and couldn't be relied upon. The soft-lockup detector would eventually tell that tasks are not scheduled, but would provide little context as to why. The patch hence uses system timer and assumes it is still active while the devices are suspended/resumed. This feature can be enabled/disabled during kernel configuration. Cc: Android Kernel Team Cc: Colin Cross Cc: Todd Poynor Cc: San Mehat Cc: Benoit Goby Cc: John Stultz Cc: Pavel Machek Cc: Rafael J. Wysocki Cc: Len Brown Cc: Greg Kroah-Hartman Original-author: San Mehat Signed-off-by: Benoit Goby [zoran.marko...@linaro.org: Changed printk(KERN_EMERG,...) to pr_emerg(...), tweaked commit message. Minor changes to add compile-time inclusion of the feature.] 
Signed-off-by: Zoran Markovic --- v3: * Added explicit dependency on pstore * Collapsed recovery options to system panic only * Logged driver string in panic message drivers/base/power/main.c | 70 + kernel/power/Kconfig | 16 +++ 2 files changed, 86 insertions(+) diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 5a9b656..c19aec0 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -29,6 +29,8 @@ #include #include #include +#include + #include "../base.h" #include "power.h" @@ -54,6 +56,12 @@ struct suspend_stats suspend_stats; static DEFINE_MUTEX(dpm_list_mtx); static pm_message_t pm_transition; +struct dpm_watchdog { + struct device *dev; + struct task_struct *tsk; + struct timer_list timer; +}; + static int async_error; /** @@ -384,6 +392,60 @@ static int dpm_run_callback(pm_callback_t cb, struct device *dev, return error; } +#ifdef CONFIG_DPM_WD +/** + * dpm_wd_handler - Driver suspend / resume watchdog handler. + * + * Called when a driver has timed out suspending or resuming. + * There's not much we can do here to recover so panic() to + * capture a crash-dump in pstore. + */ +static void dpm_wd_handler(unsigned long data) +{ + struct dpm_watchdog *wd = (void *)data; + + dev_emerg(wd->dev, " DPM device timeout \n"); + show_stack(wd->tsk, NULL); + panic("%s %s: unrecoverable failure\n", + dev_driver_string(wd->dev), dev_name(wd->dev)); +} + +/** + * dpm_wd_set - Enable pm watchdog for given device. + * @wd: Watchdog. Must be allocated on the stack. + * @dev: Device to handle. 
+ */ +static void dpm_wd_set(struct dpm_watchdog *wd, struct device *dev) +{ + struct timer_list *timer = &wd->timer; + + wd->dev = dev; + wd->tsk = get_current(); + + init_timer_on_stack(timer); + /* use same timeout value for both suspend and resume */ + timer->expires = jiffies + HZ * CONFIG_DPM_WD_TIMEOUT; + timer->function = dpm_wd_handler; + timer->data = (unsigned long)wd; + add_timer(timer); +} + +/** + * dpm_wd_clear - Disable suspend/resume watchdog. + * @wd: Watchdog to disable. + */ +static void dpm_wd_clear(struct dpm_watchdog *wd) +{ + struct timer_list *timer = &wd->timer; + + del_timer_sync(timer); + destroy_timer_on_stack(timer); +} +#else +#define dpm_wd_set(x, y) +#define dpm_wd_clear(x) +#endif + /*- Resume routines -*/ /** @@ -570,6 +632,7 @@ static int device_resume(struct device *dev, pm_message_t state, bool async) pm_callback_t callback = NULL; char *info = NULL; int error = 0; + struct dpm_watchdog wd; TRACE_DEVICE(dev); TRACE_RESUME(0); @@ -585,6 +648,7 @@ static int device_resume(struct device *dev, pm_message_t state, bool async) * a resumed device, even if the device hasn't been completed yet. */ dev->power.is_prepared = false; + dpm_wd_set(&wd, dev); if (!dev->power.is_suspended) goto Unlock; @@ -636,6 +700,7 @@ static int device_resume(struct device *dev, pm_message_t state, bool async) Unlock: device_unlock(dev); + dpm_wd_clear(&wd); Co
[RFC PATCH] rtc: keep system awake until all expired RTC timers are handled
Current implementation of RTC interface allows for system suspend to occur in the following cases: (a) if a timer is set in the past and rtc_timer_do_work() is scheduled to handle it, and (b) if rtc_timer_do_work() is called to handle expired timers whose handlers implement a preemption point. A pending suspend request may be honoured in the above cases causing timer handling to be delayed until after the next resume. This is undesirable since timer handlers may have time-critical code to execute. This patch makes sure that the system stays awake until all expired timers are handled. Note that all calls to pm_stay_awake() are eventually paired with the single pm_relax() call in rtc_timer_do_work(), which is launched using schedule_work(). Cc: Alessandro Zummo Cc: John Stultz Cc: Arve Hjonnevag Cc: Todd Poynor Signed-off-by: Zoran Markovic --- drivers/rtc/interface.c | 13 + 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/rtc/interface.c b/drivers/rtc/interface.c index 42bd57d..dace26e 100644 --- a/drivers/rtc/interface.c +++ b/drivers/rtc/interface.c @@ -72,6 +72,7 @@ int rtc_set_time(struct rtc_device *rtc, struct rtc_time *tm) } else err = -EINVAL; + pm_stay_awake(rtc->dev.parent); mutex_unlock(&rtc->ops_lock); /* A timer might have just expired */ schedule_work(&rtc->irqwork); @@ -113,6 +114,7 @@ int rtc_set_mmss(struct rtc_device *rtc, unsigned long secs) else err = -EINVAL; + pm_stay_awake(rtc->dev.parent); mutex_unlock(&rtc->ops_lock); /* A timer might have just expired */ schedule_work(&rtc->irqwork); @@ -771,9 +773,10 @@ static int rtc_timer_enqueue(struct rtc_device *rtc, struct rtc_timer *timer) alarm.time = rtc_ktime_to_tm(timer->node.expires); alarm.enabled = 1; err = __rtc_set_alarm(rtc, &alarm); - if (err == -ETIME) + if (err == -ETIME) { + pm_stay_awake(rtc->dev.parent); schedule_work(&rtc->irqwork); - else if (err) { + } else if (err) { timerqueue_del(&rtc->timerqueue, &timer->node); timer->enabled = 0; return err; @@ -818,8 
+821,10 @@ static void rtc_timer_remove(struct rtc_device *rtc, struct rtc_timer *timer) alarm.time = rtc_ktime_to_tm(next->expires); alarm.enabled = 1; err = __rtc_set_alarm(rtc, &alarm); - if (err == -ETIME) + if (err == -ETIME) { + pm_stay_awake(rtc->dev.parent); schedule_work(&rtc->irqwork); + } } } @@ -845,7 +850,6 @@ void rtc_timer_do_work(struct work_struct *work) mutex_lock(&rtc->ops_lock); again: - pm_relax(rtc->dev.parent); __rtc_read_time(rtc, &tm); now = rtc_tm_to_ktime(tm); while ((next = timerqueue_getnext(&rtc->timerqueue))) { @@ -880,6 +884,7 @@ again: } else rtc_alarm_disable(rtc); + pm_relax(rtc->dev.parent); mutex_unlock(&rtc->ops_lock); } -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC PATCH] mmc: Enable wakeup_sources for mmc core
>> This patch is ported from the Android common tree, so you've probably >> been using it. > > We removed more or less all Android code in the mmc subsystem, since > it just didn't work. :-) > > The "deferred resume" was very useful though, so after some rework we > kept it and could then improve the system resume time significantly. For what it's worth, I fixed one bug I noticed in the Android kernel: if a system has a non-removable MMC device, a suspend/resume cycle on this device would hold a wake lock forever. This was a visible issue on the panda board I am using. If there are doubts on whether or not the system should stay awake during a MMC mount, we have the option to make the calls to wakeup_source_register/unregister configurable. Skipping these calls would leave the .ws field NULL, in which case __pm_stay_awake/__pm_relax/__pm_wakeup_event would do nothing. Thoughts? - Zoran -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC PATCH] mmc: Enable wakeup_sources for mmc core
> I am not sure I understand why this patch is needed. When a new card > is inserted/removed and the upper levels gets notification about the > new card, triggering the mounting/un-mounting of the file system, why > should it be the lowest layer (mmc) that prevents the platform from > enter suspend/sleep? Why do we need to prevent it at all? > > Note that notifier handling in mmc_pm_notify, was if I remember > correctly, not completely developed when the original version of this > patch was being discussed. mmc_pm_notify prevents cards from being > inserted/removed in the middle of suspend->resume sequence, is that > not enough? I will try to speak on behalf of the original implementers in a hope they would provide the original motivation for the patch. My understanding is that any preemption in the procedure could be an opportunity to suspend, as there may be a suspend request racing with this code. This is why the calls to __pm_stay_awake() and queue_delayed_work() are so tightly coupled. It would be up to the delayed work procedure (mmc_rescan()) to decide whether or not it is safe to suspend. If there are no changes in the MMC state or all changes can be handled by mmc_rescan(), it is safe to call __pm_relax(). Otherwise, userland may take over processing of this event, and this is why the awake state needs to be extended by 1/2 second. Regards, Zoran -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[RFC PATCH] mmc: Enable wakeup_sources for mmc core
This is a reworked implementation of wakelocks for the MMC core from the Android kernel, originally authored by Colin Cross and San Mehat. The patch makes sure that whenever an MMC device is inserted/removed, the system stays awake until it's reconfigured for the new state. It is assumed that 1/2 second is sufficient for the system to start the configuration action for the newly detected MMC device, which might include e.g. mounting the hosted file system(s). The implementation uses wakeup_sources instead of wake_locks. Feedback on the approach is greatly appreciated, in particular for the 1/2 second extension period. Cc: San Mehat Cc: Colin Cross Cc: John Stultz Cc: Chris Ball Cc: Ulf Hansson Cc: Johan Rudholm Cc: Jaehoon Chung Cc: Konstantin Dorfman Cc: Guennadi Liakhovetski Cc: Tejun Heo Cc: Andrew Morton Signed-off-by: John Stultz [: tweaked commit message, reworked to use wakeup_source_register/unregister instead of wakeup_source_init/trash, added the missing __pm_relax() for non-removable devices in mmc_rescan().] 
Signed-off-by: Zoran Markovic --- drivers/mmc/core/core.c | 31 +-- drivers/mmc/core/host.c |7 +++ include/linux/mmc/host.h |2 ++ 3 files changed, 34 insertions(+), 6 deletions(-) diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c index c40396f..d5230c7 100644 --- a/drivers/mmc/core/core.c +++ b/drivers/mmc/core/core.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -1656,6 +1657,7 @@ void mmc_detect_change(struct mmc_host *host, unsigned long delay) spin_unlock_irqrestore(&host->lock, flags); #endif host->detect_change = 1; + __pm_stay_awake(host->ws); mmc_schedule_delayed_work(&host->detect, delay); } @@ -2351,13 +2353,16 @@ void mmc_rescan(struct work_struct *work) struct mmc_host *host = container_of(work, struct mmc_host, detect.work); int i; + bool extend_wakeup = false; if (host->rescan_disable) return; /* If there is a non-removable card registered, only scan once */ - if ((host->caps & MMC_CAP_NONREMOVABLE) && host->rescan_entered) + if ((host->caps & MMC_CAP_NONREMOVABLE) && host->rescan_entered) { + __pm_relax(host->ws); return; + } host->rescan_entered = 1; mmc_bus_get(host); @@ -2400,16 +2405,27 @@ void mmc_rescan(struct work_struct *work) mmc_claim_host(host); for (i = 0; i < ARRAY_SIZE(freqs); i++) { - if (!mmc_rescan_try_freq(host, max(freqs[i], host->f_min))) + if (!mmc_rescan_try_freq(host, max(freqs[i], host->f_min))) { + /* stay awake extra time to process detected device */ + extend_wakeup = true; break; + } if (freqs[i] <= host->f_min) break; } mmc_release_host(host); out: - if (host->caps & MMC_CAP_NEEDS_POLL) + if (extend_wakeup) + /* extra 1/2 second should be enough, hopefully */ + __pm_wakeup_event(host->ws, MSEC_PER_SEC/2); + else + __pm_relax(host->ws); + + if (host->caps & MMC_CAP_NEEDS_POLL) { + __pm_stay_awake(host->ws); mmc_schedule_delayed_work(&host->detect, HZ); + } } void mmc_start_host(struct mmc_host *host) @@ -2433,7 +2449,8 @@ void mmc_stop_host(struct mmc_host *host) #endif 
host->rescan_disable = 1; - cancel_delayed_work_sync(&host->detect); + if (cancel_delayed_work_sync(&host->detect)) + __pm_relax(host->ws); mmc_flush_scheduled_work(); /* clear pm flags now and let card drivers set them as needed */ @@ -2628,7 +2645,8 @@ int mmc_suspend_host(struct mmc_host *host) { int err = 0; - cancel_delayed_work(&host->detect); + if (cancel_delayed_work(&host->detect)) + __pm_relax(host->ws); mmc_flush_scheduled_work(); mmc_bus_get(host); @@ -2741,7 +2759,8 @@ int mmc_pm_notify(struct notifier_block *notify_block, spin_lock_irqsave(&host->lock, flags); host->rescan_disable = 1; spin_unlock_irqrestore(&host->lock, flags); - cancel_delayed_work_sync(&host->detect); + if (cancel_delayed_work_sync(&host->detect)) + __pm_relax(host->ws); if (!host->bus_ops || host->bus_ops->suspend) break; diff --git a/drivers/mmc/core/host.c b/drivers/mmc/core/host.c index 2a3593d..3cbb3d7 100644 --- a/drivers/mmc/core/host.c +++ b/drivers/mmc/core/host.c @@
Re: [RFC PATCHv2 1/2] drivers: power: Add watchdog timer to catch drivers which lockup during suspend/resume.
Rafael, >>> We could do cancel_work_sync() as a recovery, but that call blocks until the >>> running async task is flushed, which might never happen. So doing a panic() >>> is pretty much the only option for recovering. >> >> Well, its usefulness is quite limited, then. That said I'm still not >> convinced >> that this actually is the case. > > It does block in my environment, AFAICS. Looking a bit further in the > code, it looks like dpm_suspend() does an async_synchronize_full() > which would wait for all async tasks to complete. This is a > show-stopper because (under the circumstances) the assumption that > every async suspend routine eventually completes doesn't hold. > > We could possibly select which async tasks to wait for, but this would > add unnecessary complexity to a feature targeted for debugging. It > seems that this approach - although sounding reasonable - needs to > wait until we have a mechanism to cancel an async task. It looks like implementing the proposal for an async suspend + wait_for_completion_timeout is quite complex due to the above limitations. How do we proceed from here? We have the following options: 1. Give up on the idea of having a suspend/resume watchdog. 2. Use the timer implementation (with possible modifications). 3. Wait for the implementation of (or implement) killing of an already running async work. Are there any other ideas floating around? Thanks, Zoran -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC PATCHv2 1/2] drivers: power: Add watchdog timer to catch drivers which lockup during suspend/resume.
>> We could do cancel_work_sync() as a recovery, but that call blocks until the >> running async task is flushed, which might never happen. So doing a panic() >> is pretty much the only option for recovering. > > Well, its usefulness is quite limited, then. That said I'm still not > convinced > that this actually is the case. It does block in my environment, AFAICS. Looking a bit further in the code, it looks like dpm_suspend() does an async_synchronize_full() which would wait for all async tasks to complete. This is a show-stopper because (under the circumstances) the assumption that every async suspend routine eventually completes doesn't hold. We could possibly select which async tasks to wait for, but this would add unnecessary complexity to a feature targeted for debugging. It seems that this approach - although sounding reasonable - needs to wait until we have a mechanism to cancel an async task. Regards, Zoran -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC PATCHv2 1/2] drivers: power: Add watchdog timer to catch drivers which lockup during suspend/resume.
> What about this: > - Add one more list_head to struct dev_pm_info. > - Make dpm_prepare() create a new list for the next steps instead of moving >devices out of dpm_list. > - Start an async work to carry out dpm_suspend() and make the main thread >do wait_for_completion_timeout() for every device in dpm_list (in the >reverse order). > - If it times out, mark the device in question as unusable, possibly resume >the already suspended devices (except for descendants of the failed one) >and abort the suspend. Return a specific error code to user space so that >it knows what happened. [You can make this step configurable to BUG() >instead of doing all those things if you think that will be more useful for >platforms you care about.] > - Disable future suspends. > And analogously for resume. > > That should allow people to investigate what happened on a system that > (hopefully) is not completely dead and you still can have your "reboot if > suspend hangs" feature if you like. I looked into implementing this. The problem that I encountered is that there is no reliable way of canceling an async task, and hence the asynchronous __device_suspend() would be left racing with a recovery from a suspend timeout. We could do cancel_work_sync() as a recovery, but that call blocks until the running async task is flushed, which might never happen. So doing a panic() is pretty much the only option for recovering. - Zoran -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[RFC PATCHv2] power: Add option to log time spent in suspend
From: Colin Cross Below is a patch from the Android kernel that maintains a histogram of suspend times. Please review and provide feedback. Statistics on the time spent in suspend are kept in /sys/kernel/debug/sleep_time. Cc: Android Kernel Team Cc: Colin Cross Cc: Todd Poynor Cc: San Mehat Cc: Benoit Goby Cc: John Stultz Cc: Thomas Gleixner Signed-off-by: Colin Cross Signed-off-by: Todd Poynor [zoran.marko...@linaro.org: Re-formatted suspend time table to better fit expected values. Moved accounting of suspend time into timekeeping core. Removed CONFIG_SUSPEND_TIME flag and made the feature conditional on CONFIG_DEBUG_FS. Changed the file name to sleep_time to better fit terminology in timekeeping core. Changed seq_printf to seq_puts. Tweaked commit message] Signed-off-by: Zoran Markovic --- kernel/time/Makefile |1 + kernel/time/timekeeping.c |2 + kernel/time/timekeeping_debug.c| 72 kernel/time/timekeeping_internal.h | 14 +++ 4 files changed, 89 insertions(+) create mode 100644 kernel/time/timekeeping_debug.c create mode 100644 kernel/time/timekeeping_internal.h diff --git a/kernel/time/Makefile b/kernel/time/Makefile index ff7d9d2..d52ac8b 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -7,3 +7,4 @@ obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += tick-broadcast.o obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o obj-$(CONFIG_TIMER_STATS) += timer_stats.o +obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index baeeb5c..e7e2f05 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -25,6 +25,7 @@ #include "tick-internal.h" #include "ntp_internal.h" +#include "timekeeping_internal.h" static struct timekeeper timekeeper; static DEFINE_RAW_SPINLOCK(timekeeper_lock); @@ -841,6 +842,7 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk, tk_xtime_add(tk, delta); tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, 
*delta)); tk_set_sleep_time(tk, timespec_add(tk->total_sleep_time, *delta)); + tk_debug_account_sleep_time(delta); } /** diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c new file mode 100644 index 000..802433a --- /dev/null +++ b/kernel/time/timekeeping_debug.c @@ -0,0 +1,72 @@ +/* + * debugfs file to track time spent in suspend + * + * Copyright (c) 2011, Google, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#include +#include +#include +#include +#include +#include + +static unsigned int sleep_time_bin[32] = {0}; + +static int tk_debug_show_sleep_time(struct seq_file *s, void *data) +{ + unsigned int bin; + seq_puts(s, " time (secs)count\n"); + seq_puts(s, "--\n"); + for (bin = 0; bin < 32; bin++) { + if (sleep_time_bin[bin] == 0) + continue; + seq_printf(s, "%10u - %-10u %4u\n", + bin ? 
1 << (bin - 1) : 0, 1 << bin, + sleep_time_bin[bin]); + } + return 0; +} + +static int tk_debug_sleep_time_open(struct inode *inode, struct file *file) +{ + return single_open(file, tk_debug_show_sleep_time, NULL); +} + +static const struct file_operations tk_debug_sleep_time_fops = { + .open = tk_debug_sleep_time_open, + .read = seq_read, + .llseek = seq_lseek, + .release= single_release, +}; + +static int __init tk_debug_sleep_time_init(void) +{ + struct dentry *d; + + d = debugfs_create_file("sleep_time", 0444, NULL, NULL, + &tk_debug_sleep_time_fops); + if (!d) { + pr_err("Failed to create sleep_time debug file\n"); + return -ENOMEM; + } + + return 0; +} +late_initcall(tk_debug_sleep_time_init); + +void tk_debug_account_sleep_time(struct timespec *t) +{ + sleep_time_bin[fls(t->tv_sec)]++; +} + diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h new file mode 100644 index 000..13323ea --- /dev/null +++ b/kernel/time/timekeeping_internal.h @@ -0,0 +1
[RFC PATCH] timekeeping: Correct run-time detection of real-time clock.
Since commit <31ade30692dc9680bfc95700d794818fa3f754ac>, timekeeping_init() checks for presence of persistent clock by attempting to read a non-zero time value from real-time clock. This is an issue on platforms where persistent_clock (instead of a RTC) is implemented as a free-running counter starting from zero on each boot and running during suspend. Examples are some ARM platforms (e.g. PandaBoard). An attempt to read such a clock during timekeeping_init() may return zero value and falsely declare persistent clock as missing. Additionally, in the above case suspend times may be accounted twice (once from timekeeping_resume() and once from rtc_resume()), resulting in a gradual drift of system time. This patch does a run-time correction of the issue by doing the same check during timekeeping_suspend(). A better long-term solution would have to return error when trying to read non-existing clock and zero when trying to read an uninitialized clock, but that would require changing all persistent_clock implementations. This patch addresses the immediate breakage, for now. Cc: John Stultz Cc: Thomas Gleixner Cc: Feng Tang Cc: sta...@vger.kernel.org Signed-off-by: Zoran Markovic --- kernel/time/timekeeping.c |8 1 file changed, 8 insertions(+) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 98cd470..baeeb5c 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -975,6 +975,14 @@ static int timekeeping_suspend(void) read_persistent_clock(&timekeeping_suspend_time); + /* +* On some systems the persistent_clock can not be detected at +* timekeeping_init by its return value, so if we see a valid +* value returned, update the persistent_clock_exists flag. 
+*/ + if (timekeeping_suspend_time.tv_sec || timekeeping_suspend_time.tv_nsec) + persistent_clock_exist = true; + raw_spin_lock_irqsave(&timekeeper_lock, flags); write_seqcount_begin(&timekeeper_seq); timekeeping_forward_now(tk); -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[RFC PATCHv2 1/2] drivers: power: Add watchdog timer to catch drivers which lockup during suspend/resume.
From: Benoit Goby Below is a patch from android kernel that detects a driver suspend/resume lockup and captures dump in the kernel log. Please review and provide comments. Rather than hard-lock the kernel, dump the suspend/resume thread stack and BUG() when a driver takes too long to suspend/resume. The timeout is set to 12 seconds to be longer than the usbhid 10 second timeout. Exclude from the watchdog the time spent waiting for children that are resumed asynchronously and time every device, whether or not they resumed synchronously. This patch is targeted for mobile devices where a suspend/resume lockup could cause a system reboot and catch user's attention. Information about failing device can later be retrieved from captured log in subsequent boot session. The hardware watchdog timer is likely suspended during this time and couldn't be relied upon. The soft-lockup detector would eventually tell that tasks are not scheduled, but would provide little context as to why. The patch hence uses system timer and assumes it is still active while the devices are suspended/resumed. Cc: Android Kernel Team Cc: Colin Cross Cc: Todd Poynor Cc: San Mehat Cc: Benoit Goby Cc: John Stultz Cc: Pavel Machek Cc: Rafael J. Wysocki Cc: Len Brown Cc: Greg Kroah-Hartman Original-author: San Mehat Signed-off-by: Benoit Goby [zoran.marko...@linaro.org: Changed printk(KERN_EMERG,...) to pr_emerg(...), tweaked commit message. Minor changes to merge code into kernel tip.] 
Signed-off-by: Zoran Markovic --- drivers/base/power/main.c | 66 + 1 file changed, 66 insertions(+) diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 5a9b656..a6a02c0 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -29,6 +29,8 @@ #include #include #include +#include + #include "../base.h" #include "power.h" @@ -54,6 +56,12 @@ struct suspend_stats suspend_stats; static DEFINE_MUTEX(dpm_list_mtx); static pm_message_t pm_transition; +struct dpm_watchdog { + struct device *dev; + struct task_struct *tsk; + struct timer_list timer; +}; + static int async_error; /** @@ -384,6 +392,56 @@ static int dpm_run_callback(pm_callback_t cb, struct device *dev, return error; } +/** + * dpm_wd_handler - Driver suspend / resume watchdog handler. + * + * Called when a driver has timed out suspending or resuming. + * There's not much we can do here to recover so BUG() out for + * a crash-dump + */ +static void dpm_wd_handler(unsigned long data) +{ + struct dpm_watchdog *wd = (void *)data; + struct device *dev = wd->dev; + struct task_struct *tsk = wd->tsk; + + dev_emerg(dev, " DPM device timeout \n"); + show_stack(tsk, NULL); + + BUG(); +} + +/** + * dpm_wd_set - Enable pm watchdog for given device. + * @wd: Watchdog. Must be allocated on the stack. + * @dev: Device to handle. + */ +static void dpm_wd_set(struct dpm_watchdog *wd, struct device *dev) +{ + struct timer_list *timer = &wd->timer; + + wd->dev = dev; + wd->tsk = get_current(); + + init_timer_on_stack(timer); + timer->expires = jiffies + HZ * 12; + timer->function = dpm_wd_handler; + timer->data = (unsigned long)wd; + add_timer(timer); +} + +/** + * dpm_wd_clear - Disable pm watchdog. + * @wd: Watchdog to disable. 
+ */ +static void dpm_wd_clear(struct dpm_watchdog *wd) +{ + struct timer_list *timer = &wd->timer; + + del_timer_sync(timer); + destroy_timer_on_stack(timer); +} + /*- Resume routines -*/ /** @@ -570,6 +628,7 @@ static int device_resume(struct device *dev, pm_message_t state, bool async) pm_callback_t callback = NULL; char *info = NULL; int error = 0; + struct dpm_watchdog wd; TRACE_DEVICE(dev); TRACE_RESUME(0); @@ -585,6 +644,7 @@ static int device_resume(struct device *dev, pm_message_t state, bool async) * a resumed device, even if the device hasn't been completed yet. */ dev->power.is_prepared = false; + dpm_wd_set(&wd, dev); if (!dev->power.is_suspended) goto Unlock; @@ -636,6 +696,7 @@ static int device_resume(struct device *dev, pm_message_t state, bool async) Unlock: device_unlock(dev); + dpm_wd_clear(&wd); Complete: complete_all(&dev->power.completion); @@ -1053,6 +1114,7 @@ static int __device_suspend(struct device *dev, pm_message_t state, bool async) pm_callback_t callback = NULL; char *info = NULL; int error = 0; + struct dpm_watchdog wd; dpm_wait_for_children(dev, async); @@ -1076,6 +1138,8 @@ static int __device_suspend(struct device *dev, pm_message_t state
[RFC PATCHv2 2/2] PM: compile-time configuration of device suspend/resume watchdogs.
Power management debug option to configure device suspend/resume watchdogs. Available options are: 1. Enable/disable the feature. 2. Select triggered watchdog action between: - system panic (default) - dump stacktrace - log event 3. Select timeout value for the watchdog(s). Minor changes were made to watchdog code to accommodate this feature. Cc: Android Kernel Team Cc: Colin Cross Cc: Todd Poynor Cc: San Mehat Cc: Benoit Goby Cc: John Stultz Cc: Pavel Machek Cc: Rafael J. Wysocki Cc: Len Brown Cc: Greg Kroah-Hartman Signed-off-by: Zoran Markovic --- drivers/base/power/main.c | 37 ++ kernel/power/Kconfig | 48 + 2 files changed, 77 insertions(+), 8 deletions(-) diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index a6a02c0..8e0bb33 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -392,6 +392,26 @@ static int dpm_run_callback(pm_callback_t cb, struct device *dev, return error; } +#ifdef CONFIG_DPM_WD +/** + * dpm_wd_action - recovery from suspend/resume watchdog timeout + * @wd: Watchdog. Must be allocated on the stack. + */ +#if defined(CONFIG_DPM_WD_ACTION_STACKTRACE) +static inline void dpm_wd_action(struct dpm_watchdog *wd) +{ + show_stack(wd->tsk, NULL); +} +#elif defined(CONFIG_DPM_WD_ACTION_PANIC) +static inline void dpm_wd_action(struct dpm_watchdog *wd) +{ + panic("%s: unrecoverable failure\n", dev_name(wd->dev)); +} +#else /* CONFIG_DPM_WD_ACTION_LOG */ +/* event already logged in dpm_wd_handler() */ +#define dpm_wd_action(x) +#endif + /** * dpm_wd_handler - Driver suspend / resume watchdog handler. 
* @@ -402,13 +422,9 @@ static int dpm_run_callback(pm_callback_t cb, struct device *dev, static void dpm_wd_handler(unsigned long data) { struct dpm_watchdog *wd = (void *)data; - struct device *dev = wd->dev; - struct task_struct *tsk = wd->tsk; - - dev_emerg(dev, " DPM device timeout \n"); - show_stack(tsk, NULL); - BUG(); + dev_emerg(wd->dev, " DPM device timeout \n"); + dpm_wd_action(wd); } /** @@ -424,14 +440,15 @@ static void dpm_wd_set(struct dpm_watchdog *wd, struct device *dev) wd->tsk = get_current(); init_timer_on_stack(timer); - timer->expires = jiffies + HZ * 12; + /* use same timeout value for both suspend and resume */ + timer->expires = jiffies + HZ * CONFIG_DPM_WD_TIMEOUT; timer->function = dpm_wd_handler; timer->data = (unsigned long)wd; add_timer(timer); } /** - * dpm_wd_clear - Disable pm watchdog. + * dpm_wd_clear - Disable suspend/resume watchdog. * @wd: Watchdog to disable. */ static void dpm_wd_clear(struct dpm_watchdog *wd) @@ -441,6 +458,10 @@ static void dpm_wd_clear(struct dpm_watchdog *wd) del_timer_sync(timer); destroy_timer_on_stack(timer); } +#else +#define dpm_wd_set(x, y) +#define dpm_wd_clear(x) +#endif /*- Resume routines -*/ diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index edc8bdd..339caa1 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -179,6 +179,54 @@ config PM_SLEEP_DEBUG def_bool y depends on PM_DEBUG && PM_SLEEP +config DPM_WD + bool "Device suspend/resume watchdog" + depends on PM_DEBUG + ---help--- + Sets up a watchdog timer to capture drivers that are + locked up attempting to suspend/resume a device. + A detected lockup causes a configurable watchdog action, + such as logging the event, dumping the stack trace or + kernel panic. + +choice + prompt "Watchdog recovery action" + default DPM_WD_ACTION_PANIC + depends on DPM_WD + ---help--- + Select recovery action triggered by suspend/resume watchdog. 
+ +config DPM_WD_ACTION_PANIC + bool "System panic" + ---help--- + When selected, a lockup during device's suspend or + resume would cause a system panic. This would immediately + capture user's attention. Panic message can be observed in + subsequent boot session using pstore. + +config DPM_WD_ACTION_STACKTRACE + bool "Dump stack" + ---help--- + When selected, a lockup during device's suspend or + resume would cause the caller's stack to be + captured in the system log. The stack trace shows + which driver call caused a lockup. + +config DPM_WD_ACTION_LOG + bool "Log event" + ---help--- + When selected, a lockup during device's suspend or + resume would cause the watchdog timeout event to be + logged in the system log. + +endchoice + +conf
[RFC PATCHv2 0/2] power: device suspend/resume watchdog
Hi all, Attached are two patches addressing comments on the implementation of device suspend (and resume) watchdogs from the Android kernel. I have squashed changes for the suspend and resume watchdogs as they address pretty much the same functionality, and also added compile-time configurability of the watchdogs. Please be so kind as to review them and comment on whether they are ready for upstreaming. Best regards, Zoran Cc: Android Kernel Team Cc: Colin Cross Cc: Todd Poynor Cc: San Mehat Cc: Benoit Goby Cc: John Stultz Cc: Pavel Machek Cc: Rafael J. Wysocki Cc: Len Brown Cc: Greg Kroah-Hartman Benoit Goby (1): drivers: power: Add watchdog timer to catch drivers which lockup during suspend/resume. Zoran Markovic (1): PM: compile-time configuration of device suspend/resume watchdogs. drivers/base/power/main.c | 87 + kernel/power/Kconfig | 48 + 2 files changed, 135 insertions(+) -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[RFC PATCH] power: Add option to log time spent in suspend
From: Colin Cross Below is a patch from android kernel that maintains a histogram of suspend times. Please review and provide feedback. Prints the time spent in suspend in the kernel log, and keeps statistics on the time spent in suspend in /sys/kernel/debug/suspend_time Cc: Android Kernel Team Cc: Colin Cross Cc: Todd Poynor Cc: San Mehat Cc: Benoit Goby Cc: John Stultz Cc: Pavel Machek Cc: Rafael J. Wysocki Cc: Len Brown Signed-off-by: Colin Cross Signed-off-by: Todd Poynor [zoran.marko...@linaro.org: Re-formatted suspend time table to better fit expected values, tweaked commit message] Signed-off-by: Zoran Markovic --- kernel/power/Kconfig|7 +++ kernel/power/Makefile |1 + kernel/power/suspend_time.c | 111 +++ 3 files changed, 119 insertions(+) create mode 100644 kernel/power/suspend_time.c diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 5dfdc9e..edc8bdd 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -274,3 +274,10 @@ config PM_GENERIC_DOMAINS_RUNTIME config CPU_PM bool depends on SUSPEND || CPU_IDLE + +config SUSPEND_TIME + bool "Log time spent in suspend" + ---help--- + Prints the time spent in suspend in the kernel log, and + keeps statistics on the time spent in suspend in + /sys/kernel/debug/suspend_time diff --git a/kernel/power/Makefile b/kernel/power/Makefile index 29472bf..578e20e 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -11,5 +11,6 @@ obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \ block_io.o obj-$(CONFIG_PM_AUTOSLEEP) += autosleep.o obj-$(CONFIG_PM_WAKELOCKS) += wakelock.o +obj-$(CONFIG_SUSPEND_TIME) += suspend_time.o obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o diff --git a/kernel/power/suspend_time.c b/kernel/power/suspend_time.c new file mode 100644 index 000..a613ede --- /dev/null +++ b/kernel/power/suspend_time.c @@ -0,0 +1,111 @@ +/* + * debugfs file to track time spent in suspend + * + * Copyright (c) 2011, Google, Inc. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#include +#include +#include +#include +#include +#include +#include + +static struct timespec suspend_time_before; +static unsigned int time_in_suspend_bins[32]; + +#ifdef CONFIG_DEBUG_FS +static int suspend_time_debug_show(struct seq_file *s, void *data) +{ + unsigned int bin; + seq_printf(s, " time (secs)count\n"); + seq_printf(s, "--\n"); + for (bin = 0; bin < 32; bin++) { + if (time_in_suspend_bins[bin] == 0) + continue; + seq_printf(s, "%10u - %-10u %4u\n", + bin ? 1 << (bin - 1) : 0, 1 << bin, + time_in_suspend_bins[bin]); + } + return 0; +} + +static int suspend_time_debug_open(struct inode *inode, struct file *file) +{ + return single_open(file, suspend_time_debug_show, NULL); +} + +static const struct file_operations suspend_time_debug_fops = { + .open = suspend_time_debug_open, + .read = seq_read, + .llseek = seq_lseek, + .release= single_release, +}; + +static int __init suspend_time_debug_init(void) +{ + struct dentry *d; + + d = debugfs_create_file("suspend_time", 0444, NULL, NULL, + &suspend_time_debug_fops); + if (!d) { + pr_err("Failed to create suspend_time debug file\n"); + return -ENOMEM; + } + + return 0; +} + +late_initcall(suspend_time_debug_init); +#endif + +static int suspend_time_syscore_suspend(void) +{ + read_persistent_clock(&suspend_time_before); + + return 0; +} + +static void suspend_time_syscore_resume(void) +{ + struct timespec after; + + read_persistent_clock(&after); + + after = timespec_sub(after, 
suspend_time_before); + + time_in_suspend_bins[fls(after.tv_sec)]++; + + pr_info("Suspended for %lu.%03lu seconds\n", after.tv_sec, + after.tv_nsec / NSEC_PER_MSEC); +} + +static struct syscore_ops suspend_time_syscore_ops = { + .suspend = suspend_time_syscore_suspend, + .resume = suspend_time_syscore_resume, +}; + +static int suspend_time_syscore_init(void)
[RFC PATCH] drivers: power: Add watchdog timer to catch drivers which lock up during suspend.
From: Benoit Goby Below is a patch from the Android kernel that detects a driver suspend lockup and captures a dump in the kernel log. Please review and provide comments. Rather than hard-lock the kernel, dump the suspend thread stack and BUG() when a driver takes too long to suspend. The timeout is set to 12 seconds to be longer than the usbhid 10-second timeout. Exclude from the watchdog the time spent waiting for children that are resumed asynchronously and time every device, whether or not they resumed synchronously. Cc: Android Kernel Team Cc: Colin Cross Cc: Todd Poynor Cc: San Mehat Cc: Benoit Goby Cc: John Stultz Cc: Pavel Machek Cc: Rafael J. Wysocki Cc: Len Brown Cc: Greg Kroah-Hartman Original-author: San Mehat Signed-off-by: Benoit Goby [zoran.marko...@linaro.org: Changed printk(KERN_EMERG,...) to pr_emerg(...), tweaked commit message.] Signed-off-by: Zoran Markovic --- drivers/base/power/main.c | 45 + 1 file changed, 45 insertions(+) diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 15beb50..eb70c0e 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -29,6 +29,8 @@ #include #include #include +#include + #include "../base.h" #include "power.h" @@ -54,6 +56,12 @@ struct suspend_stats suspend_stats; static DEFINE_MUTEX(dpm_list_mtx); static pm_message_t pm_transition; +static void dpm_drv_timeout(unsigned long data); +struct dpm_drv_wd_data { + struct device *dev; + struct task_struct *tsk; +}; + static int async_error; /** @@ -663,6 +671,30 @@ static bool is_async(struct device *dev) } /** + * dpm_drv_timeout - Driver suspend / resume watchdog handler + * @data: struct device which timed out + * + * Called when a driver has timed out suspending or resuming.
+ * There's not much we can do here to recover so + * BUG() out for a crash-dump + * + */ +static void dpm_drv_timeout(unsigned long data) +{ + struct dpm_drv_wd_data *wd_data = (void *)data; + struct device *dev = wd_data->dev; + struct task_struct *tsk = wd_data->tsk; + + pr_emerg(" DPM device timeout: %s (%s)\n", dev_name(dev), + (dev->driver ? dev->driver->name : "no driver")); + + pr_emerg("dpm suspend stack:\n"); + show_stack(tsk, NULL); + + BUG(); +} + +/** * dpm_resume - Execute "resume" callbacks for non-sysdev devices. * @state: PM transition of the system being carried out. * @@ -1053,6 +1085,8 @@ static int __device_suspend(struct device *dev, pm_message_t state, bool async) pm_callback_t callback = NULL; char *info = NULL; int error = 0; + struct timer_list timer; + struct dpm_drv_wd_data data; dpm_wait_for_children(dev, async); @@ -1076,6 +1110,14 @@ static int __device_suspend(struct device *dev, pm_message_t state, bool async) if (dev->power.syscore) goto Complete; + data.dev = dev; + data.tsk = get_current(); + init_timer_on_stack(&timer); + timer.expires = jiffies + HZ * 12; + timer.function = dpm_drv_timeout; + timer.data = (unsigned long)&data; + add_timer(&timer); + device_lock(dev); if (dev->pm_domain) { @@ -1131,6 +1173,9 @@ static int __device_suspend(struct device *dev, pm_message_t state, bool async) device_unlock(dev); + del_timer_sync(&timer); + destroy_timer_on_stack(&timer); + Complete: complete_all(&dev->power.completion); if (error) -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/