from:"Dirk Brandewie"

Re: [PATCH 1/2 V6] intel_pstate: skip this driver if Sun server has _PPC method

2014-12-01 Thread Dirk Brandewie


On 11/30/2014 06:32 PM, Ethan Zhao wrote:

Oracle Sun X86 servers have dynamic power capping capability that works via
ACPI _PPC method etc, so skip loading this driver if Sun server has ACPI _PPC
enabled.

Signed-off-by: Ethan Zhao 
Signed-off-by: Dirk Brandewie 
Tested-by: Linda Knippers 


In the future you should not add other people's Signed-off-by or Tested-by
tags unless they have explicitly told you that you can do so. Other than that I
am fine with this patch.

--Dirk

---
   v2: fix break HP Proliant issue.
   v3: expand the hardware vendor list.
   v4: refine code.
   v5v6: change enum PCC to PPC.

  drivers/cpufreq/intel_pstate.c | 45 ++
  1 file changed, 41 insertions(+), 4 deletions(-)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 27bb6d3..1bb62ca 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -943,15 +943,46 @@ static bool intel_pstate_no_acpi_pss(void)
return true;
  }

+static bool intel_pstate_has_acpi_ppc(void)
+{
+   int i;
+
+   for_each_possible_cpu(i) {
+   struct acpi_processor *pr = per_cpu(processors, i);
+
+   if (!pr)
+   continue;
+   if (acpi_has_method(pr->handle, "_PPC"))
+   return true;
+   }
+   return false;
+}
+
+enum {
+   PSS,
+   PPC,
+};
+
  struct hw_vendor_info {
u16  valid;
char oem_id[ACPI_OEM_ID_SIZE];
char oem_table_id[ACPI_OEM_TABLE_ID_SIZE];
+   int  oem_pwr_table;
  };

  /* Hardware vendor-specific info that has its own power management modes */
  static struct hw_vendor_info vendor_info[] = {
-   {1, "HP", "ProLiant"},
+   {1, "HP", "ProLiant", PSS},
+   {1, "ORACLE", "X4-2", PPC},
+   {1, "ORACLE", "X4-2L   ", PPC},
+   {1, "ORACLE", "X4-2B   ", PPC},
+   {1, "ORACLE", "X3-2", PPC},
+   {1, "ORACLE", "X3-2L   ", PPC},
+   {1, "ORACLE", "X3-2B   ", PPC},
+   {1, "ORACLE", "X4470M2 ", PPC},
+   {1, "ORACLE", "X4270M3 ", PPC},
+   {1, "ORACLE", "X4270M2 ", PPC},
+   {1, "ORACLE", "X4170M2 ", PPC},
{0, "", ""},
  };

@@ -966,15 +997,21 @@ static bool intel_pstate_platform_pwr_mgmt_exists(void)

for (v_info = vendor_info; v_info->valid; v_info++) {
if (!strncmp(hdr.oem_id, v_info->oem_id, ACPI_OEM_ID_SIZE) &&
-   !strncmp(hdr.oem_table_id, v_info->oem_table_id, 
ACPI_OEM_TABLE_ID_SIZE) &&
-   intel_pstate_no_acpi_pss())
-   return true;
+   !strncmp(hdr.oem_table_id, v_info->oem_table_id,
+   ACPI_OEM_TABLE_ID_SIZE))
+   switch (v_info->oem_pwr_table) {
+   case PSS:
+   return intel_pstate_no_acpi_pss();
+   case PPC:
+   return intel_pstate_has_acpi_ppc();
+   }
}

return false;
  }
  #else /* CONFIG_ACPI not enabled */
  static inline bool intel_pstate_platform_pwr_mgmt_exists(void) { return 
false; }
+static inline bool intel_pstate_has_acpi_ppc(void) { return false; }
  #endif /* CONFIG_ACPI */

  static int __init intel_pstate_init(void)



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 1/2 V6] intel_pstate: skip this driver if Sun server has _PPC method

2014-12-01 Thread Dirk Brandewie


On 11/30/2014 06:32 PM, Ethan Zhao wrote:

Oracle Sun X86 servers have dynamic power capping capability that works via
ACPI _PPC method etc, so skip loading this driver if Sun server has ACPI _PPC
enabled.

Signed-off-by: Ethan Zhao ethan.z...@oracle.com
Signed-off-by: Dirk Brandewie dirk.brande...@gmail.com
Tested-by: Linda Knippers linda.knipp...@hp.com


In the future you should not add other people's Signed-off-by or Tested-by
tags unless they have explicitly told you that you can do so. Other than that I
am fine with this patch.

--Dirk

---
   v2: fix break HP Proliant issue.
   v3: expand the hardware vendor list.
   v4: refine code.
   v5v6: change enum PCC to PPC.

  drivers/cpufreq/intel_pstate.c | 45 ++
  1 file changed, 41 insertions(+), 4 deletions(-)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 27bb6d3..1bb62ca 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -943,15 +943,46 @@ static bool intel_pstate_no_acpi_pss(void)
return true;
  }

+static bool intel_pstate_has_acpi_ppc(void)
+{
+   int i;
+
+   for_each_possible_cpu(i) {
+   struct acpi_processor *pr = per_cpu(processors, i);
+
+   if (!pr)
+   continue;
+   if (acpi_has_method(pr->handle, "_PPC"))
+   return true;
+   }
+   return false;
+}
+
+enum {
+   PSS,
+   PPC,
+};
+
  struct hw_vendor_info {
u16  valid;
char oem_id[ACPI_OEM_ID_SIZE];
char oem_table_id[ACPI_OEM_TABLE_ID_SIZE];
+   int  oem_pwr_table;
  };

  /* Hardware vendor-specific info that has its own power management modes */
  static struct hw_vendor_info vendor_info[] = {
-   {1, HP, ProLiant},
+   {1, HP, ProLiant, PSS},
+   {1, ORACLE, X4-2, PPC},
+   {1, ORACLE, X4-2L   , PPC},
+   {1, ORACLE, X4-2B   , PPC},
+   {1, ORACLE, X3-2, PPC},
+   {1, ORACLE, X3-2L   , PPC},
+   {1, ORACLE, X3-2B   , PPC},
+   {1, ORACLE, X4470M2 , PPC},
+   {1, ORACLE, X4270M3 , PPC},
+   {1, ORACLE, X4270M2 , PPC},
+   {1, ORACLE, X4170M2 , PPC},
{0, , },
  };

@@ -966,15 +997,21 @@ static bool intel_pstate_platform_pwr_mgmt_exists(void)

for (v_info = vendor_info; v_info->valid; v_info++) {
if (!strncmp(hdr.oem_id, v_info->oem_id, ACPI_OEM_ID_SIZE) &&
-   !strncmp(hdr.oem_table_id, v_info->oem_table_id, 
ACPI_OEM_TABLE_ID_SIZE) &&
-   intel_pstate_no_acpi_pss())
-   return true;
+   !strncmp(hdr.oem_table_id, v_info->oem_table_id,
+   ACPI_OEM_TABLE_ID_SIZE))
+   switch (v_info->oem_pwr_table) {
+   case PSS:
+   return intel_pstate_no_acpi_pss();
+   case PPC:
+   return intel_pstate_has_acpi_ppc();
+   }
}

return false;
  }
  #else /* CONFIG_ACPI not enabled */
  static inline bool intel_pstate_platform_pwr_mgmt_exists(void) { return 
false; }
+static inline bool intel_pstate_has_acpi_ppc(void) { return false; }
  #endif /* CONFIG_ACPI */

  static int __init intel_pstate_init(void)



--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 1/2 v3] intel_pstate: skip this driver if Sun server has _PPC method

2014-11-25 Thread Dirk Brandewie


On 11/24/2014 08:59 PM, Ethan Zhao wrote:

Oracle Sun X86 servers have dynamic power capping capability that works via
ACPI _PPC method etc, so skip loading this driver if Sun server has ACPI _PPC
enabled.



How about this patch? only compile tested.

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 3468387..db7b8b2 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -1025,15 +1025,46 @@ static bool intel_pstate_no_acpi_pss(void)
return true;
 }

+static bool intel_pstate_has_acpi_ppc(void)
+{
+   int i;
+
+   for_each_possible_cpu(i) {
+   struct acpi_processor *pr = per_cpu(processors, i);
+
+   if (!pr)
+   continue;
+   if (acpi_has_method(pr->handle, "_PPC"))
+   return true;
+   }
+   return false;
+}
+
+enum {
+   PSS,
+   PCC,
+};
+
 struct hw_vendor_info {
u16  valid;
char oem_id[ACPI_OEM_ID_SIZE];
char oem_table_id[ACPI_OEM_TABLE_ID_SIZE];
+   int  oem_pwr_table;
 };

 /* Hardware vendor-specific info that has its own power management modes */
 static struct hw_vendor_info vendor_info[] = {
-   {1, "HP", "ProLiant"},
+   {1, "HP", "ProLiant", PSS},
+   {1, "ORACLE", "X4-2", PCC},
+   {1, "ORACLE", "X4-2L   ", PCC},
+   {1, "ORACLE", "X4-2B   ", PCC},
+   {1, "ORACLE", "X3-2", PCC},
+   {1, "ORACLE", "X3-2L   ", PCC},
+   {1, "ORACLE", "X3-2B   ", PCC},
+   {1, "ORACLE", "X4470M2 ", PCC},
+   {1, "ORACLE", "X4270M3 ", PCC},
+   {1, "ORACLE", "X4270M2 ", PCC},
+   {1, "ORACLE", "X4170M2 ", PCC},
{0, "", ""},
 };

@@ -1057,15 +1088,20 @@ static bool intel_pstate_platform_pwr_mgmt_exists(void)

for (v_info = vendor_info; v_info->valid; v_info++) {
if (!strncmp(hdr.oem_id, v_info->oem_id, ACPI_OEM_ID_SIZE) &&
-   !strncmp(hdr.oem_table_id, v_info->oem_table_id, 
ACPI_OEM_TABLE_ID_SIZE) &&
-   intel_pstate_no_acpi_pss())
-   return true;
+   !strncmp(hdr.oem_table_id, v_info->oem_table_id, 
ACPI_OEM_TABLE_ID_SIZE))
+   switch (v_info->oem_pwr_table) {
+   case PSS:
+   return intel_pstate_no_acpi_pss();
+   case PCC:
+   return intel_pstate_has_acpi_ppc();
+   }
}

return false;
 }
 #else /* CONFIG_ACPI not enabled */
 static inline bool intel_pstate_platform_pwr_mgmt_exists(void) { return false; 
}
+static inline bool intel_pstate_has_acpi_ppc(void) { return false; }
 #endif /* CONFIG_ACPI */

 static int __init intel_pstate_init(void)


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 2/2 v3] intel_pstate: add kernel parameter to enable loading on Sun X86 servers.

2014-11-25 Thread Dirk Brandewie


On 11/24/2014 08:59 PM, Ethan Zhao wrote:

To force loading on Oracle Sun X86 servers, provide one kernel command line
parameter

   intel_pstate = onora

For those who are aware of the risk of doing so.

Signed-off-by: Ethan Zhao 
---
  v2: change to hardware vendor specific naming parameter.

  Documentation/kernel-parameters.txt | 3 +++
  drivers/cpufreq/intel_pstate.c  | 6 +-
  2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/Documentation/kernel-parameters.txt 
b/Documentation/kernel-parameters.txt
index 479f332..e4b1b81 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1446,6 +1446,9 @@ bytes respectively. Such letter suffixes can also be 
entirely omitted.
   disable
 Do not enable intel_pstate as the default
 scaling driver for the supported processors
+  onora
+Enable loading intel_pstate on Oracle Sun Servers(X86).
+only for those who be aware of the risk.


What are the risks?  What is the behaviour if platform power management is
enabled and intel_pstate is trying to control P state selection as well?

If intel_pstate will be able to successfully control P state selection
with platform power management enabled then how about the name "oracle_force"?
Also the documentation should say what the risks are.



intremap=   [X86-64, Intel-IOMMU]
on  enable Interrupt Remapping (default)
diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index fa67fb3..e49b050 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -866,6 +866,7 @@ static struct cpufreq_driver intel_pstate_driver = {
  };

  static int __initdata no_load;
+static unsigned int  load_on_sun;

  static int intel_pstate_msrs_not_valid(void)
  {
@@ -1005,7 +1006,8 @@ static bool intel_pstate_platform_pwr_mgmt_exists(void)
if (!strncmp(hdr.oem_id, v_info->oem_id, ACPI_OEM_ID_SIZE) &&
!strncmp(hdr.oem_table_id, v_info->oem_table_id,
ACPI_OEM_TABLE_ID_SIZE) &&
-   intel_pstate_has_acpi_ppc())
+   intel_pstate_has_acpi_ppc() &&
+   !load_on_sun)
return true;
}

@@ -1080,6 +1082,8 @@ static int __init intel_pstate_setup(char *str)

if (!strcmp(str, "disable"))
no_load = 1;
+   if (!strcmp(str, "onora"))
+   load_on_sun = 1;
return 0;
  }
  early_param("intel_pstate", intel_pstate_setup);



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 2/2 v3] intel_pstate: add kernel parameter to enable loading on Sun X86 servers.

2014-11-25 Thread Dirk Brandewie


On 11/24/2014 08:59 PM, Ethan Zhao wrote:

To force loading on Oracle Sun X86 servers, provide one kernel command line
parameter

   intel_pstate = onora

For those who are aware of the risk of doing so.

Signed-off-by: Ethan Zhao ethan.z...@oracle.com
---
  v2: change to hardware vendor specific naming parameter.

  Documentation/kernel-parameters.txt | 3 +++
  drivers/cpufreq/intel_pstate.c  | 6 +-
  2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/Documentation/kernel-parameters.txt 
b/Documentation/kernel-parameters.txt
index 479f332..e4b1b81 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1446,6 +1446,9 @@ bytes respectively. Such letter suffixes can also be 
entirely omitted.
   disable
 Do not enable intel_pstate as the default
 scaling driver for the supported processors
+  onora
+Enable loading intel_pstate on Oracle Sun Servers(X86).
+only for those who be aware of the risk.


What are the risks?  What is the behaviour if platform power management is
enabled and intel_pstate is trying to control P state selection as well?

If intel_pstate will be able to successfully control P state selection
with platform power management enabled then how about the name "oracle_force"?
Also the documentation should say what the risks are.



intremap=   [X86-64, Intel-IOMMU]
on  enable Interrupt Remapping (default)
diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index fa67fb3..e49b050 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -866,6 +866,7 @@ static struct cpufreq_driver intel_pstate_driver = {
  };

  static int __initdata no_load;
+static unsigned int  load_on_sun;

  static int intel_pstate_msrs_not_valid(void)
  {
@@ -1005,7 +1006,8 @@ static bool intel_pstate_platform_pwr_mgmt_exists(void)
if (!strncmp(hdr.oem_id, v_info-oem_id, ACPI_OEM_ID_SIZE) 
!strncmp(hdr.oem_table_id, v_info-oem_table_id,
ACPI_OEM_TABLE_ID_SIZE) 
-   intel_pstate_has_acpi_ppc())
+   intel_pstate_has_acpi_ppc() 
+   !load_on_sun)
return true;
}

@@ -1080,6 +1082,8 @@ static int __init intel_pstate_setup(char *str)

if (!strcmp(str, disable))
no_load = 1;
+   if (!strcmp(str, onora))
+   load_on_sun = 1;
return 0;
  }
  early_param(intel_pstate, intel_pstate_setup);



--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 1/2 v3] intel_pstate: skip this driver if Sun server has _PPC method

2014-11-25 Thread Dirk Brandewie


On 11/24/2014 08:59 PM, Ethan Zhao wrote:

Oracle Sun X86 servers have dynamic power capping capability that works via
ACPI _PPC method etc, so skip loading this driver if Sun server has ACPI _PPC
enabled.



How about this patch? only compile tested.

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 3468387..db7b8b2 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -1025,15 +1025,46 @@ static bool intel_pstate_no_acpi_pss(void)
return true;
 }

+static bool intel_pstate_has_acpi_ppc(void)
+{
+   int i;
+
+   for_each_possible_cpu(i) {
+   struct acpi_processor *pr = per_cpu(processors, i);
+
+   if (!pr)
+   continue;
+   if (acpi_has_method(pr-handle, _PPC))
+   return true;
+   }
+   return false;
+}
+
+enum {
+   PSS,
+   PCC,
+};
+
 struct hw_vendor_info {
u16  valid;
char oem_id[ACPI_OEM_ID_SIZE];
char oem_table_id[ACPI_OEM_TABLE_ID_SIZE];
+   int  oem_pwr_table;
 };

 /* Hardware vendor-specific info that has its own power management modes */
 static struct hw_vendor_info vendor_info[] = {
-   {1, HP, ProLiant},
+   {1, HP, ProLiant, PSS},
+   {1, ORACLE, X4-2, PCC},
+   {1, ORACLE, X4-2L   , PCC},
+   {1, ORACLE, X4-2B   , PCC},
+   {1, ORACLE, X3-2, PCC},
+   {1, ORACLE, X3-2L   , PCC},
+   {1, ORACLE, X3-2B   , PCC},
+   {1, ORACLE, X4470M2 , PCC},
+   {1, ORACLE, X4270M3 , PCC},
+   {1, ORACLE, X4270M2 , PCC},
+   {1, ORACLE, X4170M2 , PCC},
{0, , },
 };

@@ -1057,15 +1088,20 @@ static bool intel_pstate_platform_pwr_mgmt_exists(void)

for (v_info = vendor_info; v_info-valid; v_info++) {
if (!strncmp(hdr.oem_id, v_info-oem_id, ACPI_OEM_ID_SIZE) 
-   !strncmp(hdr.oem_table_id, v_info-oem_table_id, 
ACPI_OEM_TABLE_ID_SIZE) 
-   intel_pstate_no_acpi_pss())
-   return true;
+   !strncmp(hdr.oem_table_id, v_info-oem_table_id, 
ACPI_OEM_TABLE_ID_SIZE))
+   switch (v_info-oem_pwr_table) {
+   case PSS:
+   return intel_pstate_no_acpi_pss();
+   case PCC:
+   return intel_pstate_has_acpi_ppc();
+   }
}

return false;
 }
 #else /* CONFIG_ACPI not enabled */
 static inline bool intel_pstate_platform_pwr_mgmt_exists(void) { return false; 
}
+static inline bool intel_pstate_has_acpi_ppc(void) { return false; }
 #endif /* CONFIG_ACPI */

 static int __init intel_pstate_init(void)


--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 1/3] intel_pstate: skip the driver if Sun server has ACPI _PPC method

2014-11-20 Thread Dirk Brandewie


On 11/19/2014 12:22 PM, Linda Knippers wrote:

On 11/18/2014 3:37 AM, Ethan Zhao wrote:

Oracle Sun X86 servers have dynamic power capping capability that works via
ACPI _PPC method etc, so skip loading this driver if Sun server has ACPI _PPC
enabled.

Signed-off-by: Ethan Zhao 
---
  drivers/cpufreq/intel_pstate.c | 20 
  1 file changed, 20 insertions(+)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 27bb6d3..5498eb0 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -943,6 +943,21 @@ static bool intel_pstate_no_acpi_pss(void)
return true;
  }

+static bool intel_pstate_has_acpi_ppc(void)
+{
+   int i;
+
+   for_each_possible_cpu(i) {
+   struct acpi_processor *pr = per_cpu(processors, i);
+
+   if (!pr)
+   continue;
+   if (acpi_has_method(pr->handle, "_PPC"))
+   return true;
+   }
+   return false;
+}
+
  struct hw_vendor_info {
u16  valid;
char oem_id[ACPI_OEM_ID_SIZE];
@@ -952,6 +967,7 @@ struct hw_vendor_info {
  /* Hardware vendor-specific info that has its own power management modes */
  static struct hw_vendor_info vendor_info[] = {
{1, "HP", "ProLiant"},
+   {1, "ORACLE", ""},
{0, "", ""},
  };

@@ -969,12 +985,16 @@ static bool intel_pstate_platform_pwr_mgmt_exists(void)
!strncmp(hdr.oem_table_id, v_info->oem_table_id, 
ACPI_OEM_TABLE_ID_SIZE) &&
intel_pstate_no_acpi_pss())
return true;
+   if (!strncmp(hdr.oem_id, v_info->oem_id, ACPI_OEM_ID_SIZE) &&
+   intel_pstate_has_acpi_ppc())


We need to try this on a few platforms to make sure this patch doesn't break the
HP platforms that may or may not need this driver, depending on the BIOS 
settings.



It looks like HP systems would get swept up in this check too if they have _PPC

What about extending the hw_vendor_info struct to include whether _PSS or
_PPC should be done for the platform since it appears that oracle and HP
have implemented similar functionality using two different methods.



I don't suppose you tested on a ProLiant too?

-- ljk


+   return true;
}

return false;
  }
  #else /* CONFIG_ACPI not enabled */
  static inline bool intel_pstate_platform_pwr_mgmt_exists(void) { return 
false; }
+static inline bool intel_pstate_has_acpi_ppc(void) { return false; }
  #endif /* CONFIG_ACPI */

  static int __init intel_pstate_init(void)





--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 1/3] intel_pstate: skip the driver if Sun server has ACPI _PPC method

2014-11-20 Thread Dirk Brandewie


On 11/19/2014 12:22 PM, Linda Knippers wrote:

On 11/18/2014 3:37 AM, Ethan Zhao wrote:

Oracle Sun X86 servers have dynamic power capping capability that works via
ACPI _PPC method etc, so skip loading this driver if Sun server has ACPI _PPC
enabled.

Signed-off-by: Ethan Zhao ethan.z...@oracle.com
---
  drivers/cpufreq/intel_pstate.c | 20 
  1 file changed, 20 insertions(+)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 27bb6d3..5498eb0 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -943,6 +943,21 @@ static bool intel_pstate_no_acpi_pss(void)
return true;
  }

+static bool intel_pstate_has_acpi_ppc(void)
+{
+   int i;
+
+   for_each_possible_cpu(i) {
+   struct acpi_processor *pr = per_cpu(processors, i);
+
+   if (!pr)
+   continue;
+   if (acpi_has_method(pr-handle, _PPC))
+   return true;
+   }
+   return false;
+}
+
  struct hw_vendor_info {
u16  valid;
char oem_id[ACPI_OEM_ID_SIZE];
@@ -952,6 +967,7 @@ struct hw_vendor_info {
  /* Hardware vendor-specific info that has its own power management modes */
  static struct hw_vendor_info vendor_info[] = {
{1, HP, ProLiant},
+   {1, ORACLE, },
{0, , },
  };

@@ -969,12 +985,16 @@ static bool intel_pstate_platform_pwr_mgmt_exists(void)
!strncmp(hdr.oem_table_id, v_info-oem_table_id, 
ACPI_OEM_TABLE_ID_SIZE) 
intel_pstate_no_acpi_pss())
return true;
+   if (!strncmp(hdr.oem_id, v_info-oem_id, ACPI_OEM_ID_SIZE) 
+   intel_pstate_has_acpi_ppc())


We need to try this on a few platforms to make sure this patch doesn't break the
HP platforms that may or may not need this driver, depending on the BIOS 
settings.



It looks like HP systems would get swept up in this check too if they have _PPC

What about extending the hw_vendor_info struct to include whether _PSS or
_PPC should be done for the platform since it appears that oracle and HP
have implemented similar functionality using two different methods.



I don't suppose you tested on a ProLiant too?

-- ljk


+   return true;
}

return false;
  }
  #else /* CONFIG_ACPI not enabled */
  static inline bool intel_pstate_platform_pwr_mgmt_exists(void) { return 
false; }
+static inline bool intel_pstate_has_acpi_ppc(void) { return false; }
  #endif /* CONFIG_ACPI */

  static int __init intel_pstate_init(void)





--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 1/3] intel_pstate: skip the driver if Sun server has ACPI _PPC method

2014-11-19 Thread Dirk Brandewie


On 11/18/2014 12:37 AM, Ethan Zhao wrote:

Oracle Sun X86 servers have dynamic power capping capability that works via
ACPI _PPC method etc, so skip loading this driver if Sun server has ACPI _PPC
enabled.

Signed-off-by: Ethan Zhao 
---
  drivers/cpufreq/intel_pstate.c | 20 
  1 file changed, 20 insertions(+)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 27bb6d3..5498eb0 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -943,6 +943,21 @@ static bool intel_pstate_no_acpi_pss(void)
return true;
  }

+static bool intel_pstate_has_acpi_ppc(void)
+{
+   int i;
+
+   for_each_possible_cpu(i) {
+   struct acpi_processor *pr = per_cpu(processors, i);
+
+   if (!pr)
+   continue;
+   if (acpi_has_method(pr->handle, "_PPC"))
+   return true;
+   }
+   return false;
+}
+
  struct hw_vendor_info {
u16  valid;
char oem_id[ACPI_OEM_ID_SIZE];
@@ -952,6 +967,7 @@ struct hw_vendor_info {
  /* Hardware vendor-specific info that has its own power management modes */
  static struct hw_vendor_info vendor_info[] = {
{1, "HP", "ProLiant"},
+   {1, "ORACLE", ""},
{0, "", ""},
  };



Does this apply to ALL oracle systems?

Is the presence or absence of the _PPC method configurable in the oracle BIOS?


@@ -969,12 +985,16 @@ static bool intel_pstate_platform_pwr_mgmt_exists(void)
!strncmp(hdr.oem_table_id, v_info->oem_table_id, 
ACPI_OEM_TABLE_ID_SIZE) &&
intel_pstate_no_acpi_pss())
return true;
+   if (!strncmp(hdr.oem_id, v_info->oem_id, ACPI_OEM_ID_SIZE) &&
+   intel_pstate_has_acpi_ppc())
+   return true;
}

return false;
  }
  #else /* CONFIG_ACPI not enabled */
  static inline bool intel_pstate_platform_pwr_mgmt_exists(void) { return 
false; }
+static inline bool intel_pstate_has_acpi_ppc(void) { return false; }
  #endif /* CONFIG_ACPI */

  static int __init intel_pstate_init(void)



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 1/3] intel_pstate: skip the driver if Sun server has ACPI _PPC method

2014-11-19 Thread Dirk Brandewie


On 11/18/2014 12:37 AM, Ethan Zhao wrote:

Oracle Sun X86 servers have dynamic power capping capability that works via
ACPI _PPC method etc, so skip loading this driver if Sun server has ACPI _PPC
enabled.

Signed-off-by: Ethan Zhao ethan.z...@oracle.com
---
  drivers/cpufreq/intel_pstate.c | 20 
  1 file changed, 20 insertions(+)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 27bb6d3..5498eb0 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -943,6 +943,21 @@ static bool intel_pstate_no_acpi_pss(void)
return true;
  }

+static bool intel_pstate_has_acpi_ppc(void)
+{
+   int i;
+
+   for_each_possible_cpu(i) {
+   struct acpi_processor *pr = per_cpu(processors, i);
+
+   if (!pr)
+   continue;
+   if (acpi_has_method(pr-handle, _PPC))
+   return true;
+   }
+   return false;
+}
+
  struct hw_vendor_info {
u16  valid;
char oem_id[ACPI_OEM_ID_SIZE];
@@ -952,6 +967,7 @@ struct hw_vendor_info {
  /* Hardware vendor-specific info that has its own power management modes */
  static struct hw_vendor_info vendor_info[] = {
{1, HP, ProLiant},
+   {1, ORACLE, },
{0, , },
  };



Does this apply to ALL oracle systems?

Is the presence or absence of the _PPC method configurable in the oracle BIOS?


@@ -969,12 +985,16 @@ static bool intel_pstate_platform_pwr_mgmt_exists(void)
!strncmp(hdr.oem_table_id, v_info-oem_table_id, 
ACPI_OEM_TABLE_ID_SIZE) 
intel_pstate_no_acpi_pss())
return true;
+   if (!strncmp(hdr.oem_id, v_info-oem_id, ACPI_OEM_ID_SIZE) 
+   intel_pstate_has_acpi_ppc())
+   return true;
}

return false;
  }
  #else /* CONFIG_ACPI not enabled */
  static inline bool intel_pstate_platform_pwr_mgmt_exists(void) { return 
false; }
+static inline bool intel_pstate_has_acpi_ppc(void) { return false; }
  #endif /* CONFIG_ACPI */

  static int __init intel_pstate_init(void)



--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 1/2] x86: Add support for Intel HWP feature detection.

2014-11-06 Thread dirk . brandewie

From: Dirk Brandewie 

Add support of Hardware Managed Performance States (HWP) described in Volume 3
section 14.4 of the SDM.

One bit CPUID.06H:EAX[bit 7] expresses the presence of the HWP feature on
the processor. The remaining bits CPUID.06H:EAX[bit 8-11] denote the
presence of various HWP features.

Cc: x...@kernel.org
Signed-off-by: Dirk Brandewie 
---
 arch/x86/include/asm/cpufeature.h | 5 +
 arch/x86/kernel/cpu/scattered.c   | 5 +
 2 files changed, 10 insertions(+)

diff --git a/arch/x86/include/asm/cpufeature.h 
b/arch/x86/include/asm/cpufeature.h
index 0bb1335..aede2c3 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -189,6 +189,11 @@
 #define X86_FEATURE_DTHERM ( 7*32+ 7) /* Digital Thermal Sensor */
 #define X86_FEATURE_HW_PSTATE  ( 7*32+ 8) /* AMD HW-PState */
 #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
+#define X86_FEATURE_HWP( 7*32+ 10) /* "hwp" Intel HWP */
+#define X86_FEATURE_HWP_NOITFY ( 7*32+ 11) /* Intel HWP_NOTIFY */
+#define X86_FEATURE_HWP_ACT_WINDOW ( 7*32+ 12) /* Intel HWP_ACT_WINDOW */
+#define X86_FEATURE_HWP_EPP( 7*32+13) /* Intel HWP_EPP */
+#define X86_FEATURE_HWP_PKG_REQ ( 7*32+14) /* Intel HWP_PKG_REQ */
 
 /* Virtualization flags: Linux defined, word 8 */
 #define X86_FEATURE_TPR_SHADOW  ( 8*32+ 0) /* Intel TPR Shadow */
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index 4a8013d..6063909 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -36,6 +36,11 @@ void init_scattered_cpuid_features(struct cpuinfo_x86 *c)
{ X86_FEATURE_ARAT, CR_EAX, 2, 0x0006, 0 },
{ X86_FEATURE_PLN,  CR_EAX, 4, 0x0006, 0 },
{ X86_FEATURE_PTS,  CR_EAX, 6, 0x0006, 0 },
+   { X86_FEATURE_HWP,  CR_EAX, 7, 0x0006, 0 },
+   { X86_FEATURE_HWP_NOITFY,   CR_EAX, 8, 0x0006, 0 },
+   { X86_FEATURE_HWP_ACT_WINDOW,   CR_EAX, 9, 0x0006, 0 },
+   { X86_FEATURE_HWP_EPP,  CR_EAX,10, 0x0006, 0 },
+   { X86_FEATURE_HWP_PKG_REQ,  CR_EAX,11, 0x0006, 0 },
{ X86_FEATURE_APERFMPERF,   CR_ECX, 0, 0x0006, 0 },
{ X86_FEATURE_EPB,  CR_ECX, 3, 0x0006, 0 },
{ X86_FEATURE_HW_PSTATE,CR_EDX, 7, 0x8007, 0 },
-- 
1.9.3

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 2/2] intel_pstate: Add support for HWP

2014-11-06 Thread dirk . brandewie

From: Dirk Brandewie 

Add support of Hardware Managed Performance States (HWP) described in Volume 3
section 14.4 of the SDM.

With HWP enabled intel_pstate will no longer be responsible for selecting P
states for the processor. intel_pstate will continue to register to
the cpufreq core as the scaling driver for CPUs implementing
HWP. In HWP mode intel_pstate provides three functions: reporting
frequency to the cpufreq core, support for the set_policy() interface
from the core, and maintaining the intel_pstate sysfs interface in
/sys/devices/system/cpu/intel_pstate.  User preferences expressed via
the set_policy() interface or the sysfs interface are forwarded to the
CPU via the HWP MSR interface.

Signed-off-by: Dirk Brandewie 
---
 Documentation/cpu-freq/intel-pstate.txt |  37 
 Documentation/kernel-parameters.txt |   3 +
 arch/x86/include/uapi/asm/msr-index.h   |  41 +
 drivers/cpufreq/intel_pstate.c  | 102 +++-
 4 files changed, 168 insertions(+), 15 deletions(-)

diff --git a/Documentation/cpu-freq/intel-pstate.txt 
b/Documentation/cpu-freq/intel-pstate.txt
index a69ffe1..765d7fc 100644
--- a/Documentation/cpu-freq/intel-pstate.txt
+++ b/Documentation/cpu-freq/intel-pstate.txt
@@ -1,17 +1,28 @@
 Intel P-state driver
 
 
-This driver implements a scaling driver with an internal governor for
-Intel Core processors.  The driver follows the same model as the
-Transmeta scaling driver (longrun.c) and implements the setpolicy()
-instead of target().  Scaling drivers that implement setpolicy() are
-assumed to implement internal governors by the cpufreq core. All the
-logic for selecting the current P state is contained within the
-driver; no external governor is used by the cpufreq core.
-
-Intel SandyBridge+ processors are supported.
-
-New sysfs files for controlling P state selection have been added to
+This driver provides an interface to control the P state selection for
+SandyBridge+ Intel processors.  The driver can operate in two different
+modes based on the processor model: legacy mode and Hardware P state (HWP)
+mode.
+
+In legacy mode the driver implements a scaling driver with an internal
+governor for Intel Core processors.  The driver follows the same model
+as the Transmeta scaling driver (longrun.c) and implements the
+setpolicy() instead of target().  Scaling drivers that implement
+setpolicy() are assumed to implement internal governors by the cpufreq
+core. All the logic for selecting the current P state is contained
+within the driver; no external governor is used by the cpufreq core.
+
+In HWP mode P state selection is implemented in the processor
+itself. The driver provides the interfaces between the cpufreq core and
+the processor to control P state selection based on user preferences
+and reporting frequency to the cpufreq core.  In this mode the
+internal governor code is disabled.
+
+In addition to the interfaces provided by the cpufreq core for
+controlling frequency the driver provides sysfs files for
+controlling P state selection. These files have been added to
 /sys/devices/system/cpu/intel_pstate/
 
   max_perf_pct: limits the maximum P state that will be requested by
@@ -33,7 +44,9 @@ frequency is fiction for Intel Core processors. Even if the 
scaling
 driver selects a single P state the actual frequency the processor
 will run at is selected by the processor itself.
 
-New debugfs files have also been added to /sys/kernel/debug/pstate_snb/
+For legacy mode debugfs files have also been added to allow tuning of
+the internal governor algorithm. These files are located at
+/sys/kernel/debug/pstate_snb/. These files are NOT present in HWP mode.
 
   deadband
   d_gain_pct
diff --git a/Documentation/kernel-parameters.txt 
b/Documentation/kernel-parameters.txt
index 4c81a86..571f9d2 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1446,6 +1446,9 @@ bytes respectively. Such letter suffixes can also be 
entirely omitted.
   disable
 Do not enable intel_pstate as the default
 scaling driver for the supported processors
+  no_hwp
+Do not enable hardware P state control (HWP)
+if available.   
 
intremap=   [X86-64, Intel-IOMMU]
on  enable Interrupt Remapping (default)
diff --git a/arch/x86/include/uapi/asm/msr-index.h 
b/arch/x86/include/uapi/asm/msr-index.h
index e21331c..86dda97 100644
--- a/arch/x86/include/uapi/asm/msr-index.h
+++ b/arch/x86/include/uapi/asm/msr-index.h
@@ -152,6 +152,45 @@
 #define MSR_CC6_DEMOTION_POLICY_CONFIG 0x0668
 #define MSR_MC6_DEMOTION_POLICY_CONFIG 0x0669
 
+/* Hardware P state interface */
+#define MSR_PPERF  0x064e
+#define MSR_PERF_LIMIT_REASONS 0x064f
+#define MSR_PM_ENABLE

[PATCH 0/2] intel_pstate: Add support for hardware managed P states (HWP)

2014-11-06 Thread dirk . brandewie

From: Dirk Brandewie 

This patch set adds support for HWP. When HWP is enabled the CPU will
do P state autonomously and intel_pstate simply provides an interface
to forward user preferences to the CPU while maintaining the
interfaces required by the cpufreq core. 

Dirk Brandewie (2):
  x86: Add support for Intel HWP feature detection.
  intel_pstate: Add support for HWP

 Documentation/cpu-freq/intel-pstate.txt |  37 
 Documentation/kernel-parameters.txt |   3 +
 arch/x86/include/asm/cpufeature.h   |   5 ++
 arch/x86/include/uapi/asm/msr-index.h   |  41 +
 arch/x86/kernel/cpu/scattered.c |   5 ++
 drivers/cpufreq/intel_pstate.c  | 102 +++-
 6 files changed, 178 insertions(+), 15 deletions(-)

-- 
1.9.3

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 0/2] intel_pstate: Add support for hardware managed P states (HWP)

2014-11-06 Thread dirk . brandewie

From: Dirk Brandewie dirk.j.brande...@intel.com

This patch set adds support for HWP. When HWP is enabled the CPU will
do P state autonomously and intel_pstate simply provides an interface
to forward user preferences to the CPU while maintaining the
interfaces required by the cpufreq core. 

Dirk Brandewie (2):
  x86: Add support for Intel HWP feature detection.
  intel_pstate: Add support for HWP

 Documentation/cpu-freq/intel-pstate.txt |  37 
 Documentation/kernel-parameters.txt |   3 +
 arch/x86/include/asm/cpufeature.h   |   5 ++
 arch/x86/include/uapi/asm/msr-index.h   |  41 +
 arch/x86/kernel/cpu/scattered.c |   5 ++
 drivers/cpufreq/intel_pstate.c  | 102 +++-
 6 files changed, 178 insertions(+), 15 deletions(-)

-- 
1.9.3

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 2/2] intel_pstate: Add support for HWP

2014-11-06 Thread dirk . brandewie

From: Dirk Brandewie dirk.j.brande...@intel.com

Add support of Hardware Managed Performance States (HWP) described in Volume 3
section 14.4 of the SDM.

With HWP enabled intel_pstate will no longer be responsible for selecting P
states for the processor. intel_pstate will continue to register to
the cpufreq core as the scaling driver for CPUs implementing
HWP. In HWP mode intel_pstate provides three functions: reporting
frequency to the cpufreq core, support for the set_policy() interface
from the core and maintaining the intel_pstate sysfs interface in
/sys/devices/system/cpu/intel_pstate.  User preferences expressed via
the set_policy() interface or the sysfs interface are forwarded to the
CPU via the HWP MSR interface.

Signed-off-by: Dirk Brandewie dirk.j.brande...@intel.com
---
 Documentation/cpu-freq/intel-pstate.txt |  37 
 Documentation/kernel-parameters.txt |   3 +
 arch/x86/include/uapi/asm/msr-index.h   |  41 +
 drivers/cpufreq/intel_pstate.c  | 102 +++-
 4 files changed, 168 insertions(+), 15 deletions(-)

diff --git a/Documentation/cpu-freq/intel-pstate.txt 
b/Documentation/cpu-freq/intel-pstate.txt
index a69ffe1..765d7fc 100644
--- a/Documentation/cpu-freq/intel-pstate.txt
+++ b/Documentation/cpu-freq/intel-pstate.txt
@@ -1,17 +1,28 @@
 Intel P-state driver
 
 
-This driver implements a scaling driver with an internal governor for
-Intel Core processors.  The driver follows the same model as the
-Transmeta scaling driver (longrun.c) and implements the setpolicy()
-instead of target().  Scaling drivers that implement setpolicy() are
-assumed to implement internal governors by the cpufreq core. All the
-logic for selecting the current P state is contained within the
-driver; no external governor is used by the cpufreq core.
-
-Intel SandyBridge+ processors are supported.
-
-New sysfs files for controlling P state selection have been added to
+This driver provides an interface to control the P state selection for
+SandyBridge+ Intel processors.  The driver can operate in two different
+modes based on the processor model: legacy mode and Hardware P state
+(HWP) mode.
+
+In legacy mode the driver implements a scaling driver with an internal
+governor for Intel Core processors.  The driver follows the same model
+as the Transmeta scaling driver (longrun.c) and implements the
+setpolicy() instead of target().  Scaling drivers that implement
+setpolicy() are assumed to implement internal governors by the cpufreq
+core. All the logic for selecting the current P state is contained
+within the driver; no external governor is used by the cpufreq core.
+
+In HWP mode P state selection is implemented in the processor
+itself. The driver provides the interfaces between the cpufreq core and
+the processor to control P state selection based on user preferences
+and reporting frequency to the cpufreq core.  In this mode the
+internal governor code is disabled.
+
+In addition to the interfaces provided by the cpufreq core for
+controlling frequency the driver provides sysfs files for
+controlling P state selection. These files have been added to
 /sys/devices/system/cpu/intel_pstate/
 
   max_perf_pct: limits the maximum P state that will be requested by
@@ -33,7 +44,9 @@ frequency is fiction for Intel Core processors. Even if the 
scaling
 driver selects a single P state the actual frequency the processor
 will run at is selected by the processor itself.
 
-New debugfs files have also been added to /sys/kernel/debug/pstate_snb/
+For legacy mode debugfs files have also been added to allow tuning of
+the internal governor algorithm. These files are located at
+/sys/kernel/debug/pstate_snb/. These files are NOT present in HWP mode.
 
   deadband
   d_gain_pct
diff --git a/Documentation/kernel-parameters.txt 
b/Documentation/kernel-parameters.txt
index 4c81a86..571f9d2 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1446,6 +1446,9 @@ bytes respectively. Such letter suffixes can also be 
entirely omitted.
   disable
 Do not enable intel_pstate as the default
 scaling driver for the supported processors
+  no_hwp
+Do not enable hardware P state control (HWP)
+if available.   
 
intremap=   [X86-64, Intel-IOMMU]
on  enable Interrupt Remapping (default)
diff --git a/arch/x86/include/uapi/asm/msr-index.h 
b/arch/x86/include/uapi/asm/msr-index.h
index e21331c..86dda97 100644
--- a/arch/x86/include/uapi/asm/msr-index.h
+++ b/arch/x86/include/uapi/asm/msr-index.h
@@ -152,6 +152,45 @@
 #define MSR_CC6_DEMOTION_POLICY_CONFIG 0x0668
 #define MSR_MC6_DEMOTION_POLICY_CONFIG 0x0669
 
+/* Hardware P state interface */
+#define MSR_PPERF  0x064e
+#define MSR_PERF_LIMIT_REASONS

[PATCH 1/2] x86: Add support for Intel HWP feature detection.

2014-11-06 Thread dirk . brandewie

From: Dirk Brandewie dirk.j.brande...@intel.com

Add support of Hardware Managed Performance States (HWP) described in Volume 3
section 14.4 of the SDM.

One bit CPUID.06H:EAX[bit 7] expresses the presence of the HWP feature on
the processor. The remaining bits CPUID.06H:EAX[bit 8-11] denote the
presence of various HWP features.

Cc: x...@kernel.org
Signed-off-by: Dirk Brandewie dirk.j.brande...@intel.com
---
 arch/x86/include/asm/cpufeature.h | 5 +
 arch/x86/kernel/cpu/scattered.c   | 5 +
 2 files changed, 10 insertions(+)

diff --git a/arch/x86/include/asm/cpufeature.h 
b/arch/x86/include/asm/cpufeature.h
index 0bb1335..aede2c3 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -189,6 +189,11 @@
 #define X86_FEATURE_DTHERM ( 7*32+ 7) /* Digital Thermal Sensor */
 #define X86_FEATURE_HW_PSTATE  ( 7*32+ 8) /* AMD HW-PState */
 #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
+#define X86_FEATURE_HWP( 7*32+ 10) /* hwp Intel HWP */
+#define X86_FEATURE_HWP_NOITFY ( 7*32+ 11) /* Intel HWP_NOTIFY */
+#define X86_FEATURE_HWP_ACT_WINDOW ( 7*32+ 12) /* Intel HWP_ACT_WINDOW */
+#define X86_FEATURE_HWP_EPP( 7*32+13) /* Intel HWP_EPP */
+#define X86_FEATURE_HWP_PKG_REQ ( 7*32+14) /* Intel HWP_PKG_REQ */
 
 /* Virtualization flags: Linux defined, word 8 */
 #define X86_FEATURE_TPR_SHADOW  ( 8*32+ 0) /* Intel TPR Shadow */
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index 4a8013d..6063909 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -36,6 +36,11 @@ void init_scattered_cpuid_features(struct cpuinfo_x86 *c)
{ X86_FEATURE_ARAT, CR_EAX, 2, 0x0006, 0 },
{ X86_FEATURE_PLN,  CR_EAX, 4, 0x0006, 0 },
{ X86_FEATURE_PTS,  CR_EAX, 6, 0x0006, 0 },
+   { X86_FEATURE_HWP,  CR_EAX, 7, 0x0006, 0 },
+   { X86_FEATURE_HWP_NOITFY,   CR_EAX, 8, 0x0006, 0 },
+   { X86_FEATURE_HWP_ACT_WINDOW,   CR_EAX, 9, 0x0006, 0 },
+   { X86_FEATURE_HWP_EPP,  CR_EAX,10, 0x0006, 0 },
+   { X86_FEATURE_HWP_PKG_REQ,  CR_EAX,11, 0x0006, 0 },
{ X86_FEATURE_APERFMPERF,   CR_ECX, 0, 0x0006, 0 },
{ X86_FEATURE_EPB,  CR_ECX, 3, 0x0006, 0 },
{ X86_FEATURE_HW_PSTATE,CR_EDX, 7, 0x8007, 0 },
-- 
1.9.3

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v2] cpufreq: allow powersave governor as the default without expert mode

2014-10-31 Thread Dirk Brandewie


On 10/30/2014 02:18 PM, Rafael J. Wysocki wrote:

On Thursday, October 16, 2014 07:37:11 AM James Geboski wrote:

The intel_pstate driver only supports the performance and the powersave
governors. With the performance governor ensuring the highest possible
performance settings, userspace tools fail to make any lasting changes.
In order to allow userspace tools to make modifications to the settings,
the powersave governor must be in use. This makes having the powersave
governor as the default convenient for systems where the intel_pstate
driver is being employed. Having to enable expert mode in the kernel
configuration is just a headache for such a trivial task.

This patch applies to all kernel versions 2.6.38 or greater after the
migration from CONFIG_EMBEDDED to CONFIG_EXPERT (6a108a14fa35). Most
importantly, this applies to kernel versions 3.9 or greater when the
intel_pstate driver was introduced.

Signed-off-by: James Geboski 
Acked-by: Viresh Kumar 


Dirk, any objections?


No objection.



---
ChangeLog v2:
   - Acked-by: Viresh Kumar 
---
  drivers/cpufreq/Kconfig | 1 -
  1 file changed, 1 deletion(-)

diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig
index 3489f8f..73df7db 100644
--- a/drivers/cpufreq/Kconfig
+++ b/drivers/cpufreq/Kconfig
@@ -63,7 +63,6 @@ config CPU_FREQ_DEFAULT_GOV_PERFORMANCE

  config CPU_FREQ_DEFAULT_GOV_POWERSAVE
bool "powersave"
-   depends on EXPERT
select CPU_FREQ_GOV_POWERSAVE
help
  Use the CPUFreq governor 'powersave' as default. This sets





--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v2] cpufreq: allow powersave governor as the default without expert mode

2014-10-31 Thread Dirk Brandewie


On 10/30/2014 02:18 PM, Rafael J. Wysocki wrote:

On Thursday, October 16, 2014 07:37:11 AM James Geboski wrote:

The intel_pstate driver only supports the performance and the powersave
governors. With the performance governor ensuring the highest possible
performance settings, userspace tools fail to make any lasting changes.
In order to allow userspace tools to make modifications to the settings,
the powersave governor must be in use. This makes having the powersave
governor as the default convenient for systems where the intel_pstate
driver is being employed. Having to enable expert mode in the kernel
configuration is just a headache for such a trivial task.

This patch applies to all kernel versions 2.6.38 or greater after the
migration from CONFIG_EMBEDDED to CONFIG_EXPERT (6a108a14fa35). Most
importantly, this applies to kernel versions 3.9 or greater when the
intel_pstate driver was introduced.

Signed-off-by: James Geboski jgebo...@gmail.com
Acked-by: Viresh Kumar viresh.ku...@linaro.org


Dirk, any objections?


No objection.



---
ChangeLog v2:
   - Acked-by: Viresh Kumar viresh.ku...@linaro.org
---
  drivers/cpufreq/Kconfig | 1 -
  1 file changed, 1 deletion(-)

diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig
index 3489f8f..73df7db 100644
--- a/drivers/cpufreq/Kconfig
+++ b/drivers/cpufreq/Kconfig
@@ -63,7 +63,6 @@ config CPU_FREQ_DEFAULT_GOV_PERFORMANCE

  config CPU_FREQ_DEFAULT_GOV_POWERSAVE
bool powersave
-   depends on EXPERT
select CPU_FREQ_GOV_POWERSAVE
help
  Use the CPUFreq governor 'powersave' as default. This sets





--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] cpufreq: intel_pstate: Fix setting max_perf_pct in performance policy

2014-10-16 Thread Dirk Brandewie


On 10/15/2014 04:16 PM, Pali Rohár wrote:

Code which changes policy to powersave also changes max_policy_pct based on
max_freq. Code which changes max_perf_pct has an upper limit based on the value
of max_policy_pct. When policy is changing from powersave back to performance
then max_policy_pct is not changed. This means that setting max_perf_pct to
high values is not possible if max_freq was too low in powersave policy.

Test case:

$ cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_min_freq
80
$ cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq
330
$ cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor
performance
$ cat /sys/devices/system/cpu/intel_pstate/max_perf_pct
100

$ echo powersave > /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor
$ echo 80 > /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq
$ echo 20 > /sys/devices/system/cpu/intel_pstate/max_perf_pct

$ cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor
powersave
$ cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq
80
$ cat /sys/devices/system/cpu/intel_pstate/max_perf_pct
20

$ echo performance > /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor
$ echo 330 > /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq
$ echo 100 > /sys/devices/system/cpu/intel_pstate/max_perf_pct

$ cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor
performance
$ cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq
330
$ cat /sys/devices/system/cpu/intel_pstate/max_perf_pct
24

And now intel_pstate driver allows to set maximal value for max_perf_pct based
on max_policy_pct which is 24 for previous powersave max_freq 80.

This patch will set default value for max_policy_pct when setting policy to
performance so it will allow to set also max value for max_perf_pct.

Signed-off-by: Pali Rohár 


Acked-by: Dirk Brandewie 


Cc: sta...@vger.kernel.org
---
  drivers/cpufreq/intel_pstate.c |1 +
  1 file changed, 1 insertion(+)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 0668b38..7547ab5 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -714,6 +714,7 @@ static int intel_pstate_set_policy(struct cpufreq_policy 
*policy)
if (policy->policy == CPUFREQ_POLICY_PERFORMANCE) {
limits.min_perf_pct = 100;
limits.min_perf = int_tofp(1);
+   limits.max_policy_pct = 100;
limits.max_perf_pct = 100;
limits.max_perf = int_tofp(1);
limits.no_turbo = limits.turbo_disabled;



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] cpufreq: intel_pstate: Fix setting max_perf_pct in performance policy

2014-10-16 Thread Dirk Brandewie


On 10/15/2014 04:16 PM, Pali Rohár wrote:

Code which changes policy to powersave also changes max_policy_pct based on
max_freq. Code which changes max_perf_pct has an upper limit based on the value
of max_policy_pct. When policy is changing from powersave back to performance
then max_policy_pct is not changed. This means that setting max_perf_pct to
high values is not possible if max_freq was too low in powersave policy.

Test case:

$ cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_min_freq
80
$ cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq
330
$ cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor
performance
$ cat /sys/devices/system/cpu/intel_pstate/max_perf_pct
100

$ echo powersave > /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor
$ echo 80 > /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq
$ echo 20 > /sys/devices/system/cpu/intel_pstate/max_perf_pct

$ cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor
powersave
$ cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq
80
$ cat /sys/devices/system/cpu/intel_pstate/max_perf_pct
20

$ echo performance > /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor
$ echo 330 > /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq
$ echo 100 > /sys/devices/system/cpu/intel_pstate/max_perf_pct

$ cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor
performance
$ cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq
330
$ cat /sys/devices/system/cpu/intel_pstate/max_perf_pct
24

And now intel_pstate driver allows to set maximal value for max_perf_pct based
on max_policy_pct which is 24 for previous powersave max_freq 80.

This patch will set default value for max_policy_pct when setting policy to
performance so it will allow to set also max value for max_perf_pct.

Signed-off-by: Pali Rohár pali.ro...@gmail.com


Acked-by: Dirk Brandewie dirk.j.brande...@intel.com


Cc: sta...@vger.kernel.org
---
  drivers/cpufreq/intel_pstate.c |1 +
  1 file changed, 1 insertion(+)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 0668b38..7547ab5 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -714,6 +714,7 @@ static int intel_pstate_set_policy(struct cpufreq_policy 
*policy)
if (policy-policy == CPUFREQ_POLICY_PERFORMANCE) {
limits.min_perf_pct = 100;
limits.min_perf = int_tofp(1);
+   limits.max_policy_pct = 100;
limits.max_perf_pct = 100;
limits.max_perf = int_tofp(1);
limits.no_turbo = limits.turbo_disabled;



--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] intel_pstate: track and export frequency residency stats via sysfs.

2014-09-11 Thread Dirk Brandewie


On 09/10/2014 06:04 PM, Sameer Nanda wrote:

On Wed, Sep 10, 2014 at 5:04 PM, Rafael J. Wysocki  wrote:

On Wednesday, September 10, 2014 04:39:05 PM Anup Chenthamarakshan wrote:

On Thu, Sep 11, 2014 at 12:49:48AM +0200, Rafael J. Wysocki wrote:

On Wednesday, September 10, 2014 03:15:08 PM Anup Chenthamarakshan wrote:


Tools like powertop and turbostat are not present by default on all systems,
so it is not always possible to use them :(


Which systems are you referring to in particular?


We're testing on Chrome OS devices (Chromebooks).


How big of a deal is it to install the tools mentioned above on such a system?

At least turbostat is shipped with the kernel source.


Given the web browser based front end of Chrome OS, installing these
tools will only get us so far -- if the system is in developer mode,
the tools are accessible but when the system is in normal (verified
boot mode) these tools cannot be launched directly.

We are in the process of switching Chrome OS x86 kernels from ondemand
governor to intel_pstate.  When debugging power consumption issues,
losing the ability to easily get CPU frequency related information as
a side-effect of this switch is less than ideal.

We are happy to spin this patch to expose aperf/mperf based CPU
frequency information if you think that is the better route to take
longer term.


You can get the frequency as measured by intel_pstate from /proc/cpuinfo
or /sys/devices/system/cpu/cpu[n]/cpufreq/cpuinfo_cur_freq but this is only
for the most recent sample on cpu[n]

Reading MSR 0x199 at some reasonable rate will let you graph what request
is being made on each core.





--
I speak only for myself.
Rafael J. Wysocki, Intel Open Source Technology Center.






--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] intel_pstate: track and export frequency residency stats via sysfs.

2014-09-11 Thread Dirk Brandewie


On 09/10/2014 06:04 PM, Sameer Nanda wrote:

On Wed, Sep 10, 2014 at 5:04 PM, Rafael J. Wysocki r...@rjwysocki.net wrote:

On Wednesday, September 10, 2014 04:39:05 PM Anup Chenthamarakshan wrote:

On Thu, Sep 11, 2014 at 12:49:48AM +0200, Rafael J. Wysocki wrote:

On Wednesday, September 10, 2014 03:15:08 PM Anup Chenthamarakshan wrote:


Tools like powertop and turbostat are not present by default on all systems,
so it is not always possible to use them :(


Which systems are you referring to in particular?


We're testing on Chrome OS devices (Chromebooks).


How big of a deal is it to install the tools mentioned above on such a system?

At least turbostat is shipped with the kernel source.


Given the web browser based front end of Chrome OS, installing these
tools will only get us so far -- if the system is in developer mode,
the tools are accessible but when the system is in normal (verified
boot mode) these tools cannot be launched directly.

We are in the process of switching Chrome OS x86 kernels from ondemand
governor to intel_pstate.  When debugging power consumption issues,
losing the ability to easily get CPU frequency related information as
a side-effect of this switch is less than ideal.

We are happy to spin this patch to expose aperf/mperf based CPU
frequency information if you think that is the better route to take
longer term.


You can get the frequency as measured by intel_pstate from /proc/cpuinfo
or /sys/devices/system/cpu/cpu[n]/cpufreq/cpuinfo_cur_freq but this is only
for the most recent sample on cpu[n]

Reading MSR 0x199 at some reasonable rate will let you graph what request
is being made on each core.





--
I speak only for myself.
Rafael J. Wysocki, Intel Open Source Technology Center.






--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [RFC v1 0/6] CPPC as a PID backend

2014-09-10 Thread Dirk Brandewie


On 09/10/2014 09:11 AM, Ashwin Chaugule wrote:

On 10 September 2014 11:44, Dirk Brandewie  wrote:

Hi Ashwin,


Hi Dirk,



I think the CPPC based driver should be a separate driver.

We made the conscious decision to not use any of the ACPI mechanisms
to enumerate or control P state selection.  Experience over the years
has shown that the quality/accuracy of the BIOS/ACPI implementations
vary widely across OEM's and platform types from a single OEM. Features
that always work on a server platform from a given OEM may not work or
provide bad information on client platforms for example.

Another reason for doing intel_pstate was to be able to land intel specific
features and fixes without breaking other architectures as the power
management capabilities of the platform evolve. As processors that support
Hardware P states (HWP) as described in section 14.4 of the current SDM
come into the market intel_pstate will change to not doing much other
than enabling HWP and providing an interface to forward user configuration
requests to the processor if the user chooses to enable HWP otherwise the
current mechanisms will be used.  This is why the intel_pstate sysfs
interface is the way it is to be able to map cleanly to HWP and provide
an abstract interface going forward.

Having separate drivers allows the system integrator/user to select the
most appropriate mechanism for their system.

--Dirk


With the current split I think you will still be able to maintain
Intel specific changes for the future in the backend driver. The PID
algorithm seems platform independent anyway and the PID knobs are
exported to userspace for platform specific tuning. The Intel backend
driver should be unaffected by the CPPC (ACPI) backend. We can also
make them mutually exclusive at runtime.


We could make it runtime selectable whether to use CPPC or the
native mechanisms for P state enumeration and selection but we would
get into an awful black/white list situation that would not make
anyone happy.

Using CPPC on Intel platforms implies using HWP which is already
planned for in intel_pstate.  I am not aware of any effort to support
CPPC on Intel platforms that do not support HWP.  For Intel platforms
using CPPC is NOT needed or desirable IMHO.  We had many conversations
over many months while CPPC was being defined and made the decision
to not use this mechanism on Intel Linux platforms.

For other platforms that plan on conforming to ACPI 5.x with respect
to P state enumeration and selection I would like to leave it to them
to herd all the cats at the OEMs to get CPPC correct on all their platforms.



Or are you suggesting using PID + CPPC as another driver? IIUC, that
would lead to a lot of redundancy.



The redundancy is actually pretty small IMHO if you take out the
enumeration/init code the code shared at runtime is pretty small
sample/calc_busy/PID.



Cheers,
Ashwin



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] intel_pstate: track and export frequency residency stats via sysfs.

2014-09-10 Thread Dirk Brandewie


On 09/09/2014 04:22 PM, Anup Chenthamarakshan wrote:

On Tue, Sep 09, 2014 at 08:15:13AM -0700, Dirk Brandewie wrote:

On 09/08/2014 05:10 PM, Anup Chenthamarakshan wrote:

Exported stats appear in
/devices/system/cpu/intel_pstate/time_in_state as follows:

## CPU 0
40 3647
50 24342
60 144150
70 202469
## CPU 1
40 4813
50 22628
60 149564
70 211885
80 173890

Signed-off-by: Anup Chenthamarakshan 


What is this information being used for?


I'm using P-state residency information in power consumption tests to calculate
proportion of time spent in each P-state across all processors (one global set
of percentages, corresponding to each P-state). This is used to validate new
changes from the power perspective. Essentially, sanity checks to flag changes
with large difference in P-state residency.

So far, we've been using the data exported by acpi-cpufreq to track this.



Tracking the current P state request for each core is only part of the
story.  The processor aggregates the requests from all cores and then decides
what frequency the package will run at, this evaluation happens at ~1ms time
frame.  If a core is idle then it loses its vote for what the package frequency will
be and its frequency will be zero even though it may have been requesting
a high P state when it went idle.  Tracking the residency of the requested
P state doesn't provide much useful information other than ensuring that the
requests are changing over time IMHO.


This is exactly why we're trying to track it.


My point is that you are tracking the residency of the request and not
the P state the package was running at.  On a lightly loaded system
it is not unusual for a core that was very busy and requesting a high
P state to go idle for several seconds.  In this case that core would
lose its vote for the package P state but the stats would show that
the P state was high for a very long time when its real frequency
was zero.

There are a couple of ways to get what I consider better information
about what is actually going on.

  The current turbostat provides C state residency and calculates the
  average/effective frequency of the core over its sample time.
  Turbostat will also measure the power consumption from the CPU point
  of view if your processor supports the RAPL registers.

  Reading MSR 0x198 MSR_IA32_PERF_STATUS will tell you what the core
  would run at if it were not idle; this reflects the decision that the
  package made based on current requests.

  Using perf to collect power:pstate_sample event will give information
  about each sample on the core and give you timestamps to detect idle
  times.

  Using perf to collect power:cpu_frequency will show when the P state
  request was changed on each core and is triggered by intel_pstate and
  acpi_cpufreq.

  Powertop collects the same information as turbostat and a bunch of
  other information useful in seeing where you could be burning power
  for no good reason.

For getting an idea of real power turbostat is the easiest to use and
is available on most systems.  Using perf will give you a very fine grained
view of what is going on as well as point to the culprit for bad
behaviour in most cases.





This interface will not be supportable with upcoming processors using
hardware P states as documented in volume 3 of the current SDM Section 14.4
http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-manual-325462.pdf
The OS will have no way of knowing what the P state requests for a
given core are.


Will there be any means to determine the proportion of time spent in different
HWP-states when HWP gets enabled (maybe at a package level)?


Not that I am aware of :-(  There is MSR_PPERF section 14.4.5.1 that will give
the CPUs view of the amount of productive work/scalability of the current load.

--Dirk
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [RFC v1 0/6] CPPC as a PID backend

2014-09-10 Thread Dirk Brandewie

Hi Ashwin,

I think the CPPC based driver should be a separate driver.

We made the conscious decision to not use any of the ACPI mechanisms
to enumerate or control P state selection. Experience over the years
has shown that the quality/accuracy of the BIOS/ACPI implementations
vary widely across OEM's and platform types from a single OEM. Features
that always work on a server platform from a given OEM may not work or
provide bad information on client platforms for example.

Another reason for doing intel_pstate was to be able to land intel specific
features and fixes without breaking other architectures as the power
management capabilities of the platform evolve. As processors that support
Hardware P states (HWP) as described in section 14.4 of the current SDM
come into the market intel_pstate will change to not doing much other
than enabling HWP and providing an interface to forward user configuration
requests to the processor if the user chooses to enable HWP otherwise the
current mechanisms will be used. This is why the intel_pstate sysfs
interface is the way it is to be able to map cleanly to HWP and provide
an abstract interface going forward.

Having separate drivers allows the system integrator/user to select the
most appropriate mechanism for their system.

--Dirk

On 09/09/2014 03:12 PM, Ashwin Chaugule wrote:

This patchset introduces CPPC(Collaborative Processor Performance Control) as a
backend
to the PID governor. The PID governor from intel_pstate.c maps cleanly onto
some CPPC
interfaces.
e.g. The CPU performance requests are made on a continuous scale as against
discrete pstate
levels. The CPU performance feedback over an interval is gauged using platform
specific
counters which are also described by CPPC.

Although CPPC describes several other registers to provide more hints to the
platform,
Linux as of today does not have the infrastructure to make use of those
registers.
Some of the CPPC specific information could be made available from the
scheduler as
part of the CPUfreq and Scheduler integration work. Until then PID can be used
as the
front end for CPPC.

Beyond code restructuring and renaming, this patchset does not change the logic
from the
intel_pstate.c driver. Kernel compilation times were compared with the original
intel_pstate.c,
intel backend(intel_pid_ctrl.c) and the CPPC backend and no significant
overheads were noticed.

Testing was performed on a Thinkpad X240 laptop.

PID_CTRL + INTEL_PSTATE:
===
real5m37.742s
user18m42.575s
sys 1m0.521s

PID_CTRL + CPPC_PID_CTRL:

real5m48.321s
user18m24.487s
sys 0m59.327s

ORIGINAL INTEL_PSTATE:
==
real5m40.642s
user18m37.411s
sys 1m0.185s

The complete patchset including the PCC hacks used for testing is available in
[4].

Changes since V0: [1]
- Split intel_pstate.c into a generic PID governor and platform specific
backend.
- Add CPPC accessors as PID backend.

CPPC:

CPPC (Collaborative Processor Performance Control) is a new way to control CPU
performance using an abstract continuous scale as against a discretized P-state
scale
which is tied to CPU frequency only. It is defined in the ACPI 5.0+ spec. In
brief,
the basic operation involves:
- OS makes a CPU performance request. (Can provide min and max tolerable bounds)

- Platform (such as BMC) is free to optimize request within requested bounds
depending
on power/thermal budgets etc.

- Platform conveys its decision back to OS

The communication between OS and platform occurs through another medium called
(PCC)
Platform communication Channel. This is a generic mailbox like mechanism which
includes
doorbell semantics to indicate register updates. The PCC driver is being
discussed in a
separate patchset [3] and is not included here, since CPPC is only one client
of PCC.

Finer details about the PCC and CPPC spec are available in the latest ACPI 5.1
specification.[2]

[1] - http://lwn.net/Articles/608715/
[2] - http://www.uefi.org/sites/default/files/resources/ACPI_5_1release.pdf
[3] - http://comments.gmane.org/gmane.linux.acpi.devel/70299
[4] -
http://git.linaro.org/people/ashwin.chaugule/leg-kernel.git/shortlog/refs/heads/cppc-pid-no_freq_domain

Ashwin Chaugule (6):
PID Controller governor
PID: Move Turbo detection into backend driver
PID: Move Baytrail specific accessors into backend driver
PID: Add new function pointers to read multiple registers
PID: Rename counters to make them more generic
PID: Add CPPC (Collaborative Processor Performance) backend driver

Re: [RFC v1 0/6] CPPC as a PID backend

2014-09-10 Thread Dirk Brandewie

Hi Ashwin,

I think the CPPC based driver should be a separate driver.

Having separate drivers allows the system integrator/user to select the
most appropriate mechanism for their system.

--Dirk

On 09/09/2014 03:12 PM, Ashwin Chaugule wrote:

Testing was performed on a Thinkpad X240 laptop.

PID_CTRL + INTEL_PSTATE:
===
real5m37.742s
user18m42.575s
sys 1m0.521s

PID_CTRL + CPPC_PID_CTRL:

real5m48.321s
user18m24.487s
sys 0m59.327s

ORIGINAL INTEL_PSTATE:
==
real5m40.642s
user18m37.411s
sys 1m0.185s

The complete patchset including the PCC hacks used for testing is available in
[4].

Changes since V0: [1]
- Split intel_pstate.c into a generic PID governor and platform specific
backend.
- Add CPPC accessors as PID backend.

CPPC:

- Platform (such as BMC) is free to optimize request within requested bounds
depending
on power/thermal budgets etc.

- Platform conveys its decision back to OS

Finer details about the PCC and CPPC spec are available in the latest ACPI 5.1
specification.[2]

Re: [PATCH] intel_pstate: track and export frequency residency stats via sysfs.

2014-09-10 Thread Dirk Brandewie


On 09/09/2014 04:22 PM, Anup Chenthamarakshan wrote:

On Tue, Sep 09, 2014 at 08:15:13AM -0700, Dirk Brandewie wrote:

On 09/08/2014 05:10 PM, Anup Chenthamarakshan wrote:

Exported stats appear in
sysfs/devices/system/cpu/intel_pstate/time_in_state as follows:

## CPU 0
40 3647
50 24342
60 144150
70 202469
## CPU 1
40 4813
50 22628
60 149564
70 211885
80 173890

Signed-off-by: Anup Chenthamarakshan an...@chromium.org


What is this information being used for?


I'm using P-state residency information in power consumption tests to calculate
proportion of time spent in each P-state across all processors (one global set
of percentages, corresponding to each P-state). This is used to validate new
changes from the power perspective. Essentially, sanity checks to flag changes
with large difference in P-state residency.

So far, we've been using the data exported by acpi-cpufreq to track this.



Tracking the current P state request for each core is only part of the
story.  The processor aggregates the requests from all cores and then decides
what frequency the package will run at, this evaluation happens at ~1ms time
frame.  If a core is idle then it loses its vote for what the package frequency
will be and its frequency will be zero even though it may have been requesting
a high P state when it went idle.  Tracking the residency of the requested
P state doesn't provide much useful information other than ensuring that the
requests are changing over time IMHO.


This is exactly why we're trying to track it.


My point is that you are tracking the residency of the request and not
the P state the package was running at.  On a lightly loaded system
it is not unusual for a core that was very busy and requesting a high
P state to go idle for several seconds.  In this case that core would
lose its vote for the package P state but the stats would show that
the P state was high for a very long time when its real frequency
was zero.

There are a couple of ways to get what I consider better information
about what is actually going on.

  The current turbostat provides C state residency and calculates the
  average/effective frequency of the core over its sample time.
  Turbostat will also measure the power consumption from the CPU point
  of view if your processor supports the RAPL registers.

  Reading MSR 0x198 MSR_IA32_PERF_STATUS will tell you what the core
  would run at if it were not idle; this reflects the decision that the
  package made based on current requests.

  Using perf to collect power:pstate_sample event will give information
  about each sample on the core and give you timestamps to detect idle
  times.

  Using perf to collect power:cpu_frequency will show when the P state
  request was changed on each core and is triggered by intel_pstate and
  acpi_cpufreq.

  Powertop collects the same information as turbostat and a bunch of
  other information useful in seeing where you could be burning power
  for no good reason.

For getting an idea of real power turbostat is the easiest to use and
is available on most systems.  Using perf will give you a very fine grained
view of what is going on as well as point to the culprit for bad
behaviour in most cases.





This interface will not be supportable with upcoming processors using
hardware P states as documented in volume 3 of the current SDM Section 14.4
http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-manual-325462.pdf
The OS will have no way of knowing what the P state requests for a
given core are.


Will there be any means to determine the proportion of time spent in different
HWP-states when HWP gets enabled (maybe at a package level)?


Not that I am aware of :-(  There is MSR_PPERF section 14.4.5.1 that will give
the CPUs view of the amount of productive work/scalability of the current load.

--Dirk
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [RFC v1 0/6] CPPC as a PID backend

2014-09-10 Thread Dirk Brandewie


On 09/10/2014 09:11 AM, Ashwin Chaugule wrote:

On 10 September 2014 11:44, Dirk Brandewie dirk.brande...@gmail.com wrote:

Hi Ashwin,


Hi Dirk,



I think the CPPC based driver should be a separate driver.

We made the conscious decision to not use any of the ACPI mechanisms
to enumerate or control P state selection.  Experience over the years
has shown that the quality/accuracy of the BIOS/ACPI implementations
vary widely across OEM's and platform types from a single OEM. Features
that always work on a server platform from a given OEM may not work or
provide bad information on client platforms for example.

Another reason for doing intel_pstate was to be able to land intel specific
features and fixes without breaking other architectures as the power
management capabilities of the platform evolve. As processors that support
Hardware P states (HWP) as described in section 14.4 of the current SDM
come into the market intel_pstate will change to not doing much other
than enabling HWP and providing an interface to forward user configuration
requests to the processor if the user chooses to enable HWP otherwise the
current mechanisms will be used.  This is why the intel_pstate sysfs
interface is the way it is to be able to map cleanly to HWP and provide
an abstract interface going forward.

Having separate drivers allows the system integrator/user to select the
most appropriate mechanism for their system.

--Dirk


With the current split I think you will still be able to maintain
Intel specific changes for the future in the backend driver. The PID
algorithm seems platform independent anyway and the PID knobs are
exported to userspace for platform specific tuning. The Intel backend
driver should be unaffected by the CPPC (ACPI) backend. We can also
make them mutually exclusive at runtime.


We could make it runtime selectable whether to use CPPC or the
native mechanisms for P state enumeration and selection but we would
get into an awful black/white list situation that would not make
anyone happy.

Using CPPC on Intel platforms implies using HWP which is already
planned for in intel_pstate.  I am not aware of any effort to support
CPPC on Intel platforms that do not support HWP.  For Intel platforms
using CPPC is NOT needed or desirable IMHO.  We had many conversations
over many months while CPPC was being defined and made the decision
to not use this mechanism on Intel Linux platforms.

For other platforms that plan on conforming to ACPI 5.x with respect
to P state enumeration and selection I would like to leave it to them
to herd all the cats at the OEMs to get CPPC correct on all their platforms.



Or are you suggesting using PID + CPPC as another driver? IIUC, that
would lead to a lot of redundancy.



The redundancy is actually pretty small IMHO. If you take out the
enumeration/init code, the code shared at runtime is pretty small:
sample/calc_busy/PID.



Cheers,
Ashwin



--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] intel_pstate: track and export frequency residency stats via sysfs.

2014-09-09 Thread Dirk Brandewie

On 09/08/2014 05:10 PM, Anup Chenthamarakshan wrote:
> Exported stats appear in
> /devices/system/cpu/intel_pstate/time_in_state as follows:
> 
> ## CPU 0
> 40 3647
> 50 24342
> 60 144150
> 70 202469
> ## CPU 1
> 40 4813
> 50 22628
> 60 149564
> 70 211885
> 80 173890
> 
> Signed-off-by: Anup Chenthamarakshan 

What is this information being used for?

Tracking the current P state request for each core is only part of the 
story.  The processor aggregates the requests from all cores and then decides
what frequency the package will run at, this evaluation happens at ~1ms time
frame.  If a core is idle then it loses its vote for what the package frequency
will be and its frequency will be zero even though it may have been requesting
a high P state when it went idle.  Tracking the residency of the requested
P state doesn't provide much useful information other than ensuring that the
requests are changing over time IMHO.

This interface will not be supportable with upcoming processors using
hardware P states as documented in volume 3 of the current SDM Section 14.4
http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-manual-325462.pdf
The OS will have no way of knowing what the P state requests for a
given core are.

--Dirk 
> ---
>   drivers/cpufreq/intel_pstate.c | 77 
> --
>   1 file changed, 74 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
> index 0668b38..7be89bd 100644
> --- a/drivers/cpufreq/intel_pstate.c
> +++ b/drivers/cpufreq/intel_pstate.c
> @@ -84,6 +84,11 @@ struct _pid {
>   int32_t last_err;
>   };
>   
> +struct pstate_stat {
> + int pstate;
> + u64 time;
> +};
> +
>   struct cpudata {
>   int cpu;
>   
> @@ -97,6 +102,9 @@ struct cpudata {
>   u64 prev_aperf;
>   u64 prev_mperf;
>   struct sample sample;
> +
> + struct pstate_stat *stat;
> + u64 last_updated;
>   };
>   
>   static struct cpudata **all_cpu_data;
> @@ -218,6 +226,18 @@ static inline void intel_pstate_reset_all_pid(void)
>   }
>   }
>   
> +static void intel_pstate_account_time_to_current_pstate(struct cpudata *cpu)
> +{
> + /* Handle the initial call from intel_pstate_init_cpu */
> + if (likely(cpu->stat)) {
> + u64 now = jiffies;
> + int index = cpu->pstate.current_pstate - cpu->pstate.min_pstate;
> +
> + cpu->stat[index].time += now - cpu->last_updated;
> + cpu->last_updated = now;
> + }
> +}
> +
>   /** debugfs begin /
>   static int pid_param_set(void *data, u64 val)
>   {
> @@ -323,6 +343,40 @@ static ssize_t store_min_perf_pct(struct kobject *a, 
> struct attribute *b,
>   return count;
>   }
>   
> +static ssize_t show_time_in_state(struct kobject *kobj, struct attribute 
> *attr,
> + char *buf)
> +{
> + unsigned int cpu;
> + struct cpudata *cpudata;
> + int i, len = 0, total_states;
> +
> + for_each_online_cpu(cpu) {
> + if (!all_cpu_data[cpu])
> + continue;
> +
> + cpudata = all_cpu_data[cpu];
> + len += snprintf(buf + len, PAGE_SIZE - len, "## CPU %d\n", cpu);
> + if (len >= PAGE_SIZE)
> + return len;
> +
> + total_states = cpudata->pstate.turbo_pstate -
> + cpudata->pstate.min_pstate + 1;
> +
> + intel_pstate_account_time_to_current_pstate(cpudata);
> +
> + for (i = 0; i < total_states; i++) {
> + len += snprintf(buf + len, PAGE_SIZE - len, "%d %llu\n",
> + cpudata->stat[i].pstate * 10,
> + cpudata->stat[i].time);
> +
> + if (len >= PAGE_SIZE)
> + return len;
> + }
> + }
> +
> + return len;
> +}
> +
>   show_one(no_turbo, no_turbo);
>   show_one(max_perf_pct, max_perf_pct);
>   show_one(min_perf_pct, min_perf_pct);
> @@ -331,10 +385,13 @@ define_one_global_rw(no_turbo);
>   define_one_global_rw(max_perf_pct);
>   define_one_global_rw(min_perf_pct);
>   
> +define_one_global_ro(time_in_state);
> +
>   static struct attribute *intel_pstate_attributes[] = {
>   _turbo.attr,
>   _perf_pct.attr,
>   _perf_pct.attr,
> + _in_state.attr,
>   NULL
>   };
>   
> @@ -525,9 +582,11 @@ static void intel_pstate_set_pstate(struct cpudata *cpu, 
> int pstate)
>   
>   trace_cpu_frequency(pstate * 10, cpu->cpu);
>   
> - cpu->pstate.current_pstate = pstate;
> -
>   pstate_funcs.set(cpu, pstate);
> +
> + intel_pstate_account_time_to_current_pstate(cpu);
> +
> + cpu->pstate.current_pstate = pstate;
>   }
>   
>   static void intel_pstate_get_cpu_pstates(struct cpudata *cpu)
> @@ -751,6

Re: [PATCH] intel_pstate: track and export frequency residency stats via sysfs.

2014-09-09 Thread Dirk Brandewie

On 09/08/2014 05:10 PM, Anup Chenthamarakshan wrote:
 Exported stats appear in
 sysfs/devices/system/cpu/intel_pstate/time_in_state as follows:
 
 ## CPU 0
 40 3647
 50 24342
 60 144150
 70 202469
 ## CPU 1
 40 4813
 50 22628
 60 149564
 70 211885
 80 173890
 
 Signed-off-by: Anup Chenthamarakshan an...@chromium.org

What is this information being used for?

Tracking the current P state request for each core is only part of the 
story.  The processor aggregates the requests from all cores and then decides
what frequency the package will run at, this evaluation happens at ~1ms time
frame.  If a core is idle then it loses its vote for what the package frequency
will be and its frequency will be zero even though it may have been requesting
a high P state when it went idle.  Tracking the residency of the requested
P state doesn't provide much useful information other than ensuring that the
requests are changing over time IMHO.

This interface will not be supportable with upcoming processors using
hardware P states as documented in volume 3 of the current SDM Section 14.4
http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-manual-325462.pdf
The OS will have no way of knowing what the P state requests for a
given core are.

--Dirk 
 ---
   drivers/cpufreq/intel_pstate.c | 77 
 --
   1 file changed, 74 insertions(+), 3 deletions(-)
 
 diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
 index 0668b38..7be89bd 100644
 --- a/drivers/cpufreq/intel_pstate.c
 +++ b/drivers/cpufreq/intel_pstate.c
 @@ -84,6 +84,11 @@ struct _pid {
   int32_t last_err;
   };
   
 +struct pstate_stat {
 + int pstate;
 + u64 time;
 +};
 +
   struct cpudata {
   int cpu;
   
 @@ -97,6 +102,9 @@ struct cpudata {
   u64 prev_aperf;
   u64 prev_mperf;
   struct sample sample;
 +
 + struct pstate_stat *stat;
 + u64 last_updated;
   };
   
   static struct cpudata **all_cpu_data;
 @@ -218,6 +226,18 @@ static inline void intel_pstate_reset_all_pid(void)
   }
   }
   
 +static void intel_pstate_account_time_to_current_pstate(struct cpudata *cpu)
 +{
 + /* Handle the initial call from intel_pstate_init_cpu */
 + if (likely(cpu-stat)) {
 + u64 now = jiffies;
 + int index = cpu-pstate.current_pstate - cpu-pstate.min_pstate;
 +
 + cpu-stat[index].time += now - cpu-last_updated;
 + cpu-last_updated = now;
 + }
 +}
 +
   /** debugfs begin /
   static int pid_param_set(void *data, u64 val)
   {
 @@ -323,6 +343,40 @@ static ssize_t store_min_perf_pct(struct kobject *a, 
 struct attribute *b,
   return count;
   }
   
 +static ssize_t show_time_in_state(struct kobject *kobj, struct attribute 
 *attr,
 + char *buf)
 +{
 + unsigned int cpu;
 + struct cpudata *cpudata;
 + int i, len = 0, total_states;
 +
 + for_each_online_cpu(cpu) {
 + if (!all_cpu_data[cpu])
 + continue;
 +
 + cpudata = all_cpu_data[cpu];
 + len += snprintf(buf + len, PAGE_SIZE - len, ## CPU %d\n, cpu);
 + if (len = PAGE_SIZE)
 + return len;
 +
 + total_states = cpudata-pstate.turbo_pstate -
 + cpudata-pstate.min_pstate + 1;
 +
 + intel_pstate_account_time_to_current_pstate(cpudata);
 +
 + for (i = 0; i  total_states; i++) {
 + len += snprintf(buf + len, PAGE_SIZE - len, %d %llu\n,
 + cpudata-stat[i].pstate * 10,
 + cpudata-stat[i].time);
 +
 + if (len = PAGE_SIZE)
 + return len;
 + }
 + }
 +
 + return len;
 +}
 +
   show_one(no_turbo, no_turbo);
   show_one(max_perf_pct, max_perf_pct);
   show_one(min_perf_pct, min_perf_pct);
 @@ -331,10 +385,13 @@ define_one_global_rw(no_turbo);
   define_one_global_rw(max_perf_pct);
   define_one_global_rw(min_perf_pct);
   
 +define_one_global_ro(time_in_state);
 +
   static struct attribute *intel_pstate_attributes[] = {
   no_turbo.attr,
   max_perf_pct.attr,
   min_perf_pct.attr,
 + time_in_state.attr,
   NULL
   };
   
 @@ -525,9 +582,11 @@ static void intel_pstate_set_pstate(struct cpudata *cpu, 
 int pstate)
   
   trace_cpu_frequency(pstate * 10, cpu-cpu);
   
 - cpu-pstate.current_pstate = pstate;
 -
   pstate_funcs.set(cpu, pstate);
 +
 + intel_pstate_account_time_to_current_pstate(cpu);
 +
 + cpu-pstate.current_pstate = pstate;
   }
   
   static void intel_pstate_get_cpu_pstates(struct cpudata *cpu)
 @@ -751,6 +810,7 @@ static void intel_pstate_stop_cpu(struct cpufreq_policy 
 *policy)

Re: [PATCH] intel_pstate: Turn per cpu printk into pr_debug

2014-08-27 Thread Dirk Brandewie


On 08/27/2014 10:17 AM, Andi Kleen wrote:

From: Andi Kleen 

On larger systems intel_pstate currently spams the boot up
log with its "Intel pstate controlling ..." message for each CPU.
It's the only subsystem that prints a message for each
CPU.

Turn the message into a pr_debug.

Signed-off-by: Andi Kleen 


Acked-by: Dirk Brandewie 


---
  drivers/cpufreq/intel_pstate.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index c5eac94..17be734 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -688,7 +688,7 @@ static int intel_pstate_init_cpu(unsigned int cpunum)

add_timer_on(>timer, cpunum);

-   pr_info("Intel pstate controlling: cpu %d\n", cpunum);
+   pr_debug("Intel pstate controlling: cpu %d\n", cpunum);

return 0;
  }



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] cpufreq: intel_pstate: Add CPU ID for Braswell processor

2014-08-27 Thread Dirk Brandewie


On 08/22/2014 03:19 AM, Viresh Kumar wrote:

On 22 August 2014 15:35, Mika Westerberg
 wrote:

This is pretty much the same as Intel Baytrail, only the CPU ID is
different. Add the new ID to the supported CPU list.

Signed-off-by: Mika Westerberg 
Cc: Dirk Brandewie 


Dirk might be away on holidays..


Yes Sorry

Acked-by: Dirk Brandewie 




---
  drivers/cpufreq/intel_pstate.c | 1 +
  1 file changed, 1 insertion(+)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index c5eac949760d..a3cf8994160b 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -660,6 +660,7 @@ static const struct x86_cpu_id intel_pstate_cpu_ids[] = {
 ICPU(0x3f, core_params),
 ICPU(0x45, core_params),
 ICPU(0x46, core_params),
+   ICPU(0x4c, byt_params),
 ICPU(0x4f, core_params),
 ICPU(0x56, core_params),
 {}


Acked-by: Viresh Kumar 
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] cpufreq: intel_pstate: Add CPU ID for Braswell processor

2014-08-27 Thread Dirk Brandewie


On 08/22/2014 03:19 AM, Viresh Kumar wrote:

On 22 August 2014 15:35, Mika Westerberg
mika.westerb...@linux.intel.com wrote:

This is pretty much the same as Intel Baytrail, only the CPU ID is
different. Add the new ID to the supported CPU list.

Signed-off-by: Mika Westerberg mika.westerb...@linux.intel.com
Cc: Dirk Brandewie dirk.j.brande...@intel.com


Dirk might be away on holidays..


Yes Sorry

Acked-by: Dirk Brandewie dirk.j.brande...@intel.com




---
  drivers/cpufreq/intel_pstate.c | 1 +
  1 file changed, 1 insertion(+)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index c5eac949760d..a3cf8994160b 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -660,6 +660,7 @@ static const struct x86_cpu_id intel_pstate_cpu_ids[] = {
 ICPU(0x3f, core_params),
 ICPU(0x45, core_params),
 ICPU(0x46, core_params),
+   ICPU(0x4c, byt_params),
 ICPU(0x4f, core_params),
 ICPU(0x56, core_params),
 {}


Acked-by: Viresh Kumar viresh.ku...@linaro.org
--
To unsubscribe from this list: send the line unsubscribe linux-pm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html



--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] intel_pstate: Turn per cpu printk into pr_debug

2014-08-27 Thread Dirk Brandewie


On 08/27/2014 10:17 AM, Andi Kleen wrote:

From: Andi Kleen a...@linux.intel.com

On larger systems intel_pstate currently spams the boot up
log with its "Intel pstate controlling ..." message for each CPU.
It's the only subsystem that prints a message for each
CPU.

Turn the message into a pr_debug.

Signed-off-by: Andi Kleen a...@linux.intel.com


Acked-by: Dirk Brandewie dirk.j.brande...@intel.com


---
  drivers/cpufreq/intel_pstate.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index c5eac94..17be734 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -688,7 +688,7 @@ static int intel_pstate_init_cpu(unsigned int cpunum)

add_timer_on(cpu-timer, cpunum);

-   pr_info(Intel pstate controlling: cpu %d\n, cpunum);
+   pr_debug(Intel pstate controlling: cpu %d\n, cpunum);

return 0;
  }



--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v3 1/2] cpufreq: Don't destroy/realloc policy/sysfs on hotplug/suspend

2014-07-16 Thread Dirk Brandewie


On 07/15/2014 03:47 PM, Saravana Kannan wrote:

The CPUfreq core moves the cpufreq policy ownership between CPUs when CPUs
within a cluster (CPUs sharing same policy) go ONLINE/OFFLINE. When moving
policy ownership between CPUs, it also moves the cpufreq sysfs directory
between CPUs and also fixes up the symlinks of the other CPUs in the
cluster.

Also, when all the CPUs in a cluster go OFFLINE, all the sysfs nodes and
directories are deleted, the kobject is released and the policy is freed.
And when the first CPU in a cluster comes up, the policy is reallocated and
initialized, kobject is acquired, the sysfs nodes are created or symlinked,
etc.

All these steps end up creating unnecessarily complicated code and locking.
There's no real benefit to adding/removing/moving the sysfs nodes and the
policy between CPUs. Other per CPU sysfs directories like power and cpuidle
are left alone during hotplug. So there's some precedent for what this
patch is trying to do.

This patch simplifies a lot of the code and locking by removing the
adding/removing/moving of policy/sysfs/kobj and just leaves the cpufreq
directory and policy in place irrespective of whether the CPUs are
ONLINE/OFFLINE.

Leaving the policy, sysfs and kobject in place also brings these additional
benefits:
* Faster suspend/resume
* Faster hotplug
* Sysfs file permissions maintained across hotplug
* Policy settings and governor tunables maintained across hotplug
* Cpufreq stats would be maintained across hotplug for all CPUs and can be
   queried even after CPU goes OFFLINE

Tested-by: Stephen Boyd 
Signed-off-by: Saravana Kannan 
---
  drivers/cpufreq/cpufreq.c | 388 +-
  1 file changed, 107 insertions(+), 281 deletions(-)

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 62259d2..a0a2ec2 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -37,7 +37,6 @@
   */
  static struct cpufreq_driver *cpufreq_driver;
  static DEFINE_PER_CPU(struct cpufreq_policy *, cpufreq_cpu_data);
-static DEFINE_PER_CPU(struct cpufreq_policy *, cpufreq_cpu_data_fallback);
  static DEFINE_RWLOCK(cpufreq_driver_lock);
  DEFINE_MUTEX(cpufreq_governor_lock);
  static LIST_HEAD(cpufreq_policy_list);
@@ -859,34 +858,41 @@ void cpufreq_sysfs_remove_file(const struct attribute 
*attr)
  }
  EXPORT_SYMBOL(cpufreq_sysfs_remove_file);

-/* symlink affected CPUs */
-static int cpufreq_add_dev_symlink(struct cpufreq_policy *policy)
+/* symlink related CPUs */
+static int cpufreq_dev_symlink(struct cpufreq_policy *policy, bool add)
  {
-   unsigned int j;
+   unsigned int j, first_cpu = cpumask_first(policy->related_cpus);
int ret = 0;

-   for_each_cpu(j, policy->cpus) {
+   for_each_cpu(j, policy->related_cpus) {
struct device *cpu_dev;

-   if (j == policy->cpu)
+   if (j == first_cpu)
continue;

-   pr_debug("Adding link for CPU: %u\n", j);
cpu_dev = get_cpu_device(j);
-   ret = sysfs_create_link(_dev->kobj, >kobj,
-   "cpufreq");
+   if (add)
+   ret = sysfs_create_link(_dev->kobj, >kobj,
+   "cpufreq");
+   else
+   sysfs_remove_link(_dev->kobj, "cpufreq");
+
if (ret)
break;
}
return ret;
  }

-static int cpufreq_add_dev_interface(struct cpufreq_policy *policy,
-struct device *dev)
+static int cpufreq_add_dev_interface(struct cpufreq_policy *policy)
  {
struct freq_attr **drv_attr;
+   struct device *dev;
int ret = 0;

+   dev = get_cpu_device(cpumask_first(policy->related_cpus));
+   if (!dev)
+   return -EINVAL;
+
/* prepare interface data */
ret = kobject_init_and_add(>kobj, _cpufreq,
   >kobj, "cpufreq");
@@ -917,7 +923,7 @@ static int cpufreq_add_dev_interface(struct cpufreq_policy 
*policy,
goto err_out_kobj_put;
}

-   ret = cpufreq_add_dev_symlink(policy);
+   ret = cpufreq_dev_symlink(policy, true);
if (ret)
goto err_out_kobj_put;

@@ -961,60 +967,58 @@ static void cpufreq_init_policy(struct cpufreq_policy 
*policy)
  }

  #ifdef CONFIG_HOTPLUG_CPU
-static int cpufreq_add_policy_cpu(struct cpufreq_policy *policy,
- unsigned int cpu, struct device *dev)
+static int cpufreq_change_policy_cpus(struct cpufreq_policy *policy,
+ unsigned int cpu, bool add)
  {
int ret = 0;
-   unsigned long flags;
+   unsigned int cpus, pcpu;

-   if (has_target()) {
+   down_write(>rwsem);
+
+   cpus = !cpumask_empty(policy->cpus);
+   if (has_target() && cpus) {
ret =

Re: [PATCH v3 1/2] cpufreq: Don't destroy/realloc policy/sysfs on hotplug/suspend

2014-07-16 Thread Dirk Brandewie


On 07/15/2014 03:47 PM, Saravana Kannan wrote:

The CPUfreq core moves the cpufreq policy ownership between CPUs when CPUs
within a cluster (CPUs sharing same policy) go ONLINE/OFFLINE. When moving
policy ownership between CPUs, it also moves the cpufreq sysfs directory
between CPUs and also fixes up the symlinks of the other CPUs in the
cluster.

Also, when all the CPUs in a cluster go OFFLINE, all the sysfs nodes and
directories are deleted, the kobject is released and the policy is freed.
And when the first CPU in a cluster comes up, the policy is reallocated and
initialized, kobject is acquired, the sysfs nodes are created or symlinked,
etc.

All these steps end up creating unnecessarily complicated code and locking.
There's no real benefit to adding/removing/moving the sysfs nodes and the
policy between CPUs. Other per CPU sysfs directories like power and cpuidle
are left alone during hotplug. So there's some precedent for what this
patch is trying to do.

This patch simplifies a lot of the code and locking by removing the
adding/removing/moving of policy/sysfs/kobj and just leaves the cpufreq
directory and policy in place irrespective of whether the CPUs are
ONLINE/OFFLINE.

Leaving the policy, sysfs and kobject in place also brings these additional
benefits:
* Faster suspend/resume
* Faster hotplug
* Sysfs file permissions maintained across hotplug
* Policy settings and governor tunables maintained across hotplug
* Cpufreq stats would be maintained across hotplug for all CPUs and can be
   queried even after CPU goes OFFLINE

Tested-by: Stephen Boyd sb...@codeaurora.org
Signed-off-by: Saravana Kannan skan...@codeaurora.org
---
  drivers/cpufreq/cpufreq.c | 388 +-
  1 file changed, 107 insertions(+), 281 deletions(-)

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 62259d2..a0a2ec2 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -37,7 +37,6 @@
   */
  static struct cpufreq_driver *cpufreq_driver;
  static DEFINE_PER_CPU(struct cpufreq_policy *, cpufreq_cpu_data);
-static DEFINE_PER_CPU(struct cpufreq_policy *, cpufreq_cpu_data_fallback);
  static DEFINE_RWLOCK(cpufreq_driver_lock);
  DEFINE_MUTEX(cpufreq_governor_lock);
  static LIST_HEAD(cpufreq_policy_list);
@@ -859,34 +858,41 @@ void cpufreq_sysfs_remove_file(const struct attribute 
*attr)
  }
  EXPORT_SYMBOL(cpufreq_sysfs_remove_file);

-/* symlink affected CPUs */
-static int cpufreq_add_dev_symlink(struct cpufreq_policy *policy)
+/* symlink related CPUs */
+static int cpufreq_dev_symlink(struct cpufreq_policy *policy, bool add)
  {
-   unsigned int j;
+   unsigned int j, first_cpu = cpumask_first(policy->related_cpus);
int ret = 0;

-   for_each_cpu(j, policy->cpus) {
+   for_each_cpu(j, policy->related_cpus) {
struct device *cpu_dev;

-   if (j == policy->cpu)
+   if (j == first_cpu)
continue;

-   pr_debug("Adding link for CPU: %u\n", j);
cpu_dev = get_cpu_device(j);
-   ret = sysfs_create_link(&cpu_dev->kobj, &policy->kobj,
-   "cpufreq");
+   if (add)
+   ret = sysfs_create_link(&cpu_dev->kobj, &policy->kobj,
+   "cpufreq");
+   else
+   sysfs_remove_link(&cpu_dev->kobj, "cpufreq");
+
if (ret)
break;
}
return ret;
  }

-static int cpufreq_add_dev_interface(struct cpufreq_policy *policy,
-struct device *dev)
+static int cpufreq_add_dev_interface(struct cpufreq_policy *policy)
  {
struct freq_attr **drv_attr;
+   struct device *dev;
int ret = 0;

+   dev = get_cpu_device(cpumask_first(policy->related_cpus));
+   if (!dev)
+   return -EINVAL;
+
/* prepare interface data */
ret = kobject_init_and_add(&policy->kobj, &ktype_cpufreq,
   &dev->kobj, "cpufreq");
@@ -917,7 +923,7 @@ static int cpufreq_add_dev_interface(struct cpufreq_policy 
*policy,
goto err_out_kobj_put;
}

-   ret = cpufreq_add_dev_symlink(policy);
+   ret = cpufreq_dev_symlink(policy, true);
if (ret)
goto err_out_kobj_put;

@@ -961,60 +967,58 @@ static void cpufreq_init_policy(struct cpufreq_policy 
*policy)
  }

  #ifdef CONFIG_HOTPLUG_CPU
-static int cpufreq_add_policy_cpu(struct cpufreq_policy *policy,
- unsigned int cpu, struct device *dev)
+static int cpufreq_change_policy_cpus(struct cpufreq_policy *policy,
+ unsigned int cpu, bool add)
  {
int ret = 0;
-   unsigned long flags;
+   unsigned int cpus, pcpu;

-   if (has_target()) {
+   down_write(&policy->rwsem);
+
+   cpus = !cpumask_empty(policy->cpus);
+   if

Re: [PATCH] cpufreq: intel_pstate: Fix rounding of core_pct

2014-06-13 Thread Dirk Brandewie


On 06/12/2014 01:03 PM, Rafael J. Wysocki wrote:

On Thursday, June 12, 2014 05:35:59 PM Stratos Karafotis wrote:

On 12/06/2014 12:15 πμ, Doug Smythies wrote:



-Original Message-
From: Stratos Karafotis [mailto:strat...@semaphore.gr]
Sent: June-11-2014 13:20
To: Doug Smythies
Cc: linux...@vger.kernel.org; linux-kernel@vger.kernel.org; r...@rjwysocki.net; 
viresh.ku...@linaro.org; dirk.j.brande...@intel.com
Subject: Re: [PATCH] cpufreq: intel_pstate: Fix rounding of core_pct

On 2014.06.11 13:20 Stratos Karafotis wrote:

On 11/06/2014 06:02 μμ, Doug Smythies wrote:


On 2104.06.11 07:08 Stratos Karafotis wrote:

On 11/06/2014 04:41 μμ, Doug Smythies wrote:

No.



The intent was only ever to round properly the pseudo floating point result of 
the divide.
It was much more important (ugh, well 4 times more) when FRACBITS was still 6, 
which also got changed to 8 in a recent patch.



Are you sure?

Yes.


This rounding was very recently added.
As far as I can understand, I don't see the meaning of this rounding, as is.
Even if FRAC_BITS was 6, I think it would have almost no improvement in
calculations.


Note: I had not seen this e-mail when I wrote a few minutes ago:

You may be correct.
If Dirk agrees, I will re-analyse the entire driver for rounding effects soon.
When FRACBITS was 6 there were subtle cases where the driver would get stuck, 
and not make a final pstate change, with the default PID gains.
Other things have changed, and the analysis needs to be re-done.




Could you please elaborate a little bit more what we need these 2 lines below?



Sorry for being MIA on this thread I have been up to my eyeballs.


if ((rem << 1) >= int_tofp(sample->mperf))
core_pct += 1;


The rounding should have been
   core_pct += (1 << (FRAC_BITS-1));
Since core_pct is in fixed point notation at this point, this adds .5 to
core_pct to round up.

As Stratos pointed out, the current code only adds 1/256 to core_pct.

Since core_pct_busy stays in fixed point throughout the rest of the
calculations and we only do the rounding when the PID is returning an
int, I think we can safely remove these two lines.







Because nothing is mentioned for them in commit's changelog.
Do we need to round core_pct or not?
Because if we try to round it, I think this patch should work.


As mentioned originally, they are there just to round the pseudo floating 
number, not the integer portion only.
Let us bring back the very numbers you originally gave and work through it.

aperf = 5024
mperf = 10619

core_pct = 47.31142292%
or 47 and 79.724267 256ths
or to the closest kept fractional part 47 and 80 256ths
or 12112 as a pseudo float.
The maximum error with this rounding will be 1 part in 512 and symmetric 
instead of the 1 part in 256 always in one direction without.

Now if FRACBITS was still 6:
core_pct = 47.31142292%
or 47 and 19.931 64ths
or to the closest kept fractional part 47 and 20 64ths
or 3028 as a pseudo float.
The maximum error with this rounding will be 1 part in 128 and symmetric 
instead of the 1 part in 64 (1.6% !!!) always in one direction without.

Hope this helps.



Yes, it helps. Thanks a lot!

But please note that the maximum error without this rounding will be 1.6% _only_
in fractional part. In the real number it will be much smaller:

47.19 instead of 47.20

And using FRAC_BITS 8:

47.79 instead of 47.80

This is a 0.0002% difference. I can't see how this can affect the 
calculations
even with FRAC_BITS 6.

Another thing is that this algorithm generally is used to round to the
nearest integer. I'm not sure if it's valid to apply it for the rounding of
the fractional part of fixed point number.


Depending on the original reason, it may or may not be.

In theory, it may help reduce numerical drift resulting from rounding always in
one direction only, but I'm not really sure if that matters here.

Doug seems to have carried out full analysis, though.

Rafael

--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] cpufreq: intel_pstate: Fix rounding of core_pct

2014-06-13 Thread Dirk Brandewie


On 06/12/2014 01:03 PM, Rafael J. Wysocki wrote:

On Thursday, June 12, 2014 05:35:59 PM Stratos Karafotis wrote:

On 12/06/2014 12:15 πμ, Doug Smythies wrote:



-Original Message-
From: Stratos Karafotis [mailto:strat...@semaphore.gr]
Sent: June-11-2014 13:20
To: Doug Smythies
Cc: linux...@vger.kernel.org; linux-kernel@vger.kernel.org; r...@rjwysocki.net; 
viresh.ku...@linaro.org; dirk.j.brande...@intel.com
Subject: Re: [PATCH] cpufreq: intel_pstate: Fix rounding of core_pct

On 2014.06.11 13:20 Stratos Karafotis wrote:

On 11/06/2014 06:02 μμ, Doug Smythies wrote:


On 2104.06.11 07:08 Stratos Karafotis wrote:

On 11/06/2014 04:41 μμ, Doug Smythies wrote:

No.



The intent was only ever to round properly the pseudo floating point result of 
the divide.
It was much more important (ugh, well 4 times more) when FRACBITS was still 6, 
which also got changed to 8 in a recent patch.



Are you sure?

Yes.


This rounding was very recently added.
As far as I can understand, I don't see the meaning of this rounding, as is.
Even if FRAC_BITS was 6, I think it would have almost no improvement in
calculations.


Note: I had not seen this e-mail when I wrote a few minutes ago:

You may be correct.
If Dirk agrees, I will re-analyse the entire driver for rounding effects soon.
When FRACBITS was 6 there were subtle cases where the driver would get stuck, 
and not make a final pstate change, with the default PID gains.
Other things have changed, and the analysis needs to be re-done.




Could you please elaborate a little bit more what we need these 2 lines below?



Sorry for being MIA on this thread I have been up to my eyeballs.


if ((rem << 1) >= int_tofp(sample->mperf))
core_pct += 1;


The rounding should have been
   core_pct += (1 << (FRAC_BITS-1));
Since core_pct is in fixed point notation at this point, this adds .5 to
core_pct to round up.

As Stratos pointed out, the current code only adds 1/256 to core_pct.

Since core_pct_busy stays in fixed point throughout the rest of the
calculations and we only do the rounding when the PID is returning an
int, I think we can safely remove these two lines.







Because nothing is mentioned for them in commit's changelog.
Do we need to round core_pct or not?
Because if we try to round it, I think this patch should work.


As mentioned originally, they are there just to round the pseudo floating 
number, not the integer portion only.
Let us bring back the very numbers you originally gave and work through it.

aperf = 5024
mperf = 10619

core_pct = 47.31142292%
or 47 and 79.724267 256ths
or to the closest kept fractional part 47 and 80 256ths
or 12112 as a pseudo float.
The maximum error with this rounding will be 1 part in 512 and symmetric 
instead of the 1 part in 256 always in one direction without.

Now if FRACBITS was still 6:
core_pct = 47.31142292%
or 47 and 19.931 64ths
or to the closest kept fractional part 47 and 20 64ths
or 3028 as a pseudo float.
The maximum error with this rounding will be 1 part in 128 and symmetric 
instead of the 1 part in 64 (1.6% !!!) always in one direction without.

Hope this helps.



Yes, it helps. Thanks a lot!

But please note that the maximum error without this rounding will be 1.6% _only_
in fractional part. In the real number it will be much smaller:

47.19 instead of 47.20

And using FRAC_BITS 8:

47.79 instead of 47.80

This is a 0.0002% difference. I can't see how this can affect the 
calculations
even with FRAC_BITS 6.

Another thing is that this algorithm generally is used to round to the
nearest integer. I'm not sure if it's valid to apply it for the rounding of
the fractional part of fixed point number.


Depending on the original reason, it may or may not be.

In theory, it may help reduce numerical drift resulting from rounding always in
one direction only, but I'm not really sure if that matters here.

Doug seems to have carried out full analysis, though.

Rafael

--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

1 2 3 4 >

1 - 100 of 376 matches

Mail list logo