Re: [PATCH 1/2 V6] intel_pstate: skip this driver if Sun server has _PPC method

2014-12-01 Thread Dirk Brandewie

On 11/30/2014 06:32 PM, Ethan Zhao wrote:

Oracle Sun X86 servers have dynamic power capping capability that works via
ACPI _PPC method etc, so skip loading this driver if Sun server has ACPI _PPC
enabled.

Signed-off-by: Ethan Zhao 
Signed-off-by: Dirk Brandewie 
Tested-by: Linda Knippers 


In the future you should not add other people's Signed-off-by or Tested-by
tags unless they have explicitly told you that you can do so. Other than that I
am fine with this patch.

--Dirk

---
   v2: fix break HP Proliant issue.
   v3: expand the hardware vendor list.
   v4: refine code.
   v5v6: change enum PCC to PPC.

  drivers/cpufreq/intel_pstate.c | 45 ++
  1 file changed, 41 insertions(+), 4 deletions(-)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 27bb6d3..1bb62ca 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -943,15 +943,46 @@ static bool intel_pstate_no_acpi_pss(void)
return true;
  }

+static bool intel_pstate_has_acpi_ppc(void)
+{
+   int i;
+
+   for_each_possible_cpu(i) {
+   struct acpi_processor *pr = per_cpu(processors, i);
+
+   if (!pr)
+   continue;
+   if (acpi_has_method(pr->handle, "_PPC"))
+   return true;
+   }
+   return false;
+}
+
+enum {
+   PSS,
+   PPC,
+};
+
  struct hw_vendor_info {
u16  valid;
char oem_id[ACPI_OEM_ID_SIZE];
char oem_table_id[ACPI_OEM_TABLE_ID_SIZE];
+   int  oem_pwr_table;
  };

  /* Hardware vendor-specific info that has its own power management modes */
  static struct hw_vendor_info vendor_info[] = {
-   {1, "HP", "ProLiant"},
+   {1, "HP", "ProLiant", PSS},
+   {1, "ORACLE", "X4-2", PPC},
+   {1, "ORACLE", "X4-2L   ", PPC},
+   {1, "ORACLE", "X4-2B   ", PPC},
+   {1, "ORACLE", "X3-2", PPC},
+   {1, "ORACLE", "X3-2L   ", PPC},
+   {1, "ORACLE", "X3-2B   ", PPC},
+   {1, "ORACLE", "X4470M2 ", PPC},
+   {1, "ORACLE", "X4270M3 ", PPC},
+   {1, "ORACLE", "X4270M2 ", PPC},
+   {1, "ORACLE", "X4170M2 ", PPC},
{0, "", ""},
  };

@@ -966,15 +997,21 @@ static bool intel_pstate_platform_pwr_mgmt_exists(void)

for (v_info = vendor_info; v_info->valid; v_info++) {
if (!strncmp(hdr.oem_id, v_info->oem_id, ACPI_OEM_ID_SIZE) &&
-   !strncmp(hdr.oem_table_id, v_info->oem_table_id, 
ACPI_OEM_TABLE_ID_SIZE) &&
-   intel_pstate_no_acpi_pss())
-   return true;
+   !strncmp(hdr.oem_table_id, v_info->oem_table_id,
+   ACPI_OEM_TABLE_ID_SIZE))
+   switch (v_info->oem_pwr_table) {
+   case PSS:
+   return intel_pstate_no_acpi_pss();
+   case PPC:
+   return intel_pstate_has_acpi_ppc();
+   }
}

return false;
  }
  #else /* CONFIG_ACPI not enabled */
  static inline bool intel_pstate_platform_pwr_mgmt_exists(void) { return 
false; }
+static inline bool intel_pstate_has_acpi_ppc(void) { return false; }
  #endif /* CONFIG_ACPI */

  static int __init intel_pstate_init(void)



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 1/2 V6] intel_pstate: skip this driver if Sun server has _PPC method

2014-12-01 Thread Dirk Brandewie

On 11/30/2014 06:32 PM, Ethan Zhao wrote:

Oracle Sun X86 servers have dynamic power capping capability that works via
ACPI _PPC method etc, so skip loading this driver if Sun server has ACPI _PPC
enabled.

Signed-off-by: Ethan Zhao <ethan.z...@oracle.com>
Signed-off-by: Dirk Brandewie <dirk.brande...@gmail.com>
Tested-by: Linda Knippers <linda.knipp...@hp.com>


In the future you should not add other people's Signed-off-by or Tested-by
tags unless they have explicitly told you that you can do so. Other than that I
am fine with this patch.

--Dirk

---
   v2: fix break HP Proliant issue.
   v3: expand the hardware vendor list.
   v4: refine code.
   v5v6: change enum PCC to PPC.

  drivers/cpufreq/intel_pstate.c | 45 ++
  1 file changed, 41 insertions(+), 4 deletions(-)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 27bb6d3..1bb62ca 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -943,15 +943,46 @@ static bool intel_pstate_no_acpi_pss(void)
return true;
  }

+static bool intel_pstate_has_acpi_ppc(void)
+{
+   int i;
+
+   for_each_possible_cpu(i) {
+   struct acpi_processor *pr = per_cpu(processors, i);
+
+   if (!pr)
+   continue;
+   if (acpi_has_method(pr->handle, "_PPC"))
+   return true;
+   }
+   return false;
+}
+
+enum {
+   PSS,
+   PPC,
+};
+
  struct hw_vendor_info {
u16  valid;
char oem_id[ACPI_OEM_ID_SIZE];
char oem_table_id[ACPI_OEM_TABLE_ID_SIZE];
+   int  oem_pwr_table;
  };

  /* Hardware vendor-specific info that has its own power management modes */
  static struct hw_vendor_info vendor_info[] = {
-   {1, "HP", "ProLiant"},
+   {1, "HP", "ProLiant", PSS},
+   {1, "ORACLE", "X4-2", PPC},
+   {1, "ORACLE", "X4-2L   ", PPC},
+   {1, "ORACLE", "X4-2B   ", PPC},
+   {1, "ORACLE", "X3-2", PPC},
+   {1, "ORACLE", "X3-2L   ", PPC},
+   {1, "ORACLE", "X3-2B   ", PPC},
+   {1, "ORACLE", "X4470M2 ", PPC},
+   {1, "ORACLE", "X4270M3 ", PPC},
+   {1, "ORACLE", "X4270M2 ", PPC},
+   {1, "ORACLE", "X4170M2 ", PPC},
{0, "", ""},
  };

@@ -966,15 +997,21 @@ static bool intel_pstate_platform_pwr_mgmt_exists(void)

for (v_info = vendor_info; v_info->valid; v_info++) {
if (!strncmp(hdr.oem_id, v_info->oem_id, ACPI_OEM_ID_SIZE) &&
-   !strncmp(hdr.oem_table_id, v_info->oem_table_id, 
ACPI_OEM_TABLE_ID_SIZE) &&
-   intel_pstate_no_acpi_pss())
-   return true;
+   !strncmp(hdr.oem_table_id, v_info->oem_table_id,
+   ACPI_OEM_TABLE_ID_SIZE))
+   switch (v_info->oem_pwr_table) {
+   case PSS:
+   return intel_pstate_no_acpi_pss();
+   case PPC:
+   return intel_pstate_has_acpi_ppc();
+   }
}

return false;
  }
  #else /* CONFIG_ACPI not enabled */
  static inline bool intel_pstate_platform_pwr_mgmt_exists(void) { return 
false; }
+static inline bool intel_pstate_has_acpi_ppc(void) { return false; }
  #endif /* CONFIG_ACPI */

  static int __init intel_pstate_init(void)



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 1/2 v3] intel_pstate: skip this driver if Sun server has _PPC method

2014-11-25 Thread Dirk Brandewie

On 11/24/2014 08:59 PM, Ethan Zhao wrote:

Oracle Sun X86 servers have dynamic power capping capability that works via
ACPI _PPC method etc, so skip loading this driver if Sun server has ACPI _PPC
enabled.



How about this patch? only compile tested.

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 3468387..db7b8b2 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -1025,15 +1025,46 @@ static bool intel_pstate_no_acpi_pss(void)
return true;
 }

+static bool intel_pstate_has_acpi_ppc(void)
+{
+   int i;
+
+   for_each_possible_cpu(i) {
+   struct acpi_processor *pr = per_cpu(processors, i);
+
+   if (!pr)
+   continue;
+   if (acpi_has_method(pr->handle, "_PPC"))
+   return true;
+   }
+   return false;
+}
+
+enum {
+   PSS,
+   PCC,
+};
+
 struct hw_vendor_info {
u16  valid;
char oem_id[ACPI_OEM_ID_SIZE];
char oem_table_id[ACPI_OEM_TABLE_ID_SIZE];
+   int  oem_pwr_table;
 };

 /* Hardware vendor-specific info that has its own power management modes */
 static struct hw_vendor_info vendor_info[] = {
-   {1, "HP", "ProLiant"},
+   {1, "HP", "ProLiant", PSS},
+   {1, "ORACLE", "X4-2", PCC},
+   {1, "ORACLE", "X4-2L   ", PCC},
+   {1, "ORACLE", "X4-2B   ", PCC},
+   {1, "ORACLE", "X3-2", PCC},
+   {1, "ORACLE", "X3-2L   ", PCC},
+   {1, "ORACLE", "X3-2B   ", PCC},
+   {1, "ORACLE", "X4470M2 ", PCC},
+   {1, "ORACLE", "X4270M3 ", PCC},
+   {1, "ORACLE", "X4270M2 ", PCC},
+   {1, "ORACLE", "X4170M2 ", PCC},
{0, "", ""},
 };

@@ -1057,15 +1088,20 @@ static bool intel_pstate_platform_pwr_mgmt_exists(void)

for (v_info = vendor_info; v_info->valid; v_info++) {
if (!strncmp(hdr.oem_id, v_info->oem_id, ACPI_OEM_ID_SIZE) &&
-   !strncmp(hdr.oem_table_id, v_info->oem_table_id, 
ACPI_OEM_TABLE_ID_SIZE) &&
-   intel_pstate_no_acpi_pss())
-   return true;
+   !strncmp(hdr.oem_table_id, v_info->oem_table_id, 
ACPI_OEM_TABLE_ID_SIZE))
+   switch (v_info->oem_pwr_table) {
+   case PSS:
+   return intel_pstate_no_acpi_pss();
+   case PCC:
+   return intel_pstate_has_acpi_ppc();
+   }
}

return false;
 }
 #else /* CONFIG_ACPI not enabled */
 static inline bool intel_pstate_platform_pwr_mgmt_exists(void) { return false; 
}
+static inline bool intel_pstate_has_acpi_ppc(void) { return false; }
 #endif /* CONFIG_ACPI */

 static int __init intel_pstate_init(void)


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 2/2 v3] intel_pstate: add kernel parameter to enable loading on Sun X86 servers.

2014-11-25 Thread Dirk Brandewie

On 11/24/2014 08:59 PM, Ethan Zhao wrote:

To force loading on Oracle Sun X86 servers, provide one kernel command line
parameter

   intel_pstate = onora

This is for those who are aware of the risks of doing so.

Signed-off-by: Ethan Zhao 
---
  v2: change to hardware vendor specific naming parameter.

  Documentation/kernel-parameters.txt | 3 +++
  drivers/cpufreq/intel_pstate.c  | 6 +-
  2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/Documentation/kernel-parameters.txt 
b/Documentation/kernel-parameters.txt
index 479f332..e4b1b81 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1446,6 +1446,9 @@ bytes respectively. Such letter suffixes can also be 
entirely omitted.
   disable
 Do not enable intel_pstate as the default
 scaling driver for the supported processors
+  onora
+Enable loading intel_pstate on Oracle Sun Servers(X86).
+only for those who be aware of the risk.


What are the risks?  What is the behaviour if platform power management is
enabled and intel_pstate is trying to control P state selection as well?

If intel_pstate will be able to successfully control P state selection
with platform power management enabled then how about the name "oracle_force"?
Also the documentation should say what the risks are.



intremap=   [X86-64, Intel-IOMMU]
on  enable Interrupt Remapping (default)
diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index fa67fb3..e49b050 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -866,6 +866,7 @@ static struct cpufreq_driver intel_pstate_driver = {
  };

  static int __initdata no_load;
+static unsigned int  load_on_sun;

  static int intel_pstate_msrs_not_valid(void)
  {
@@ -1005,7 +1006,8 @@ static bool intel_pstate_platform_pwr_mgmt_exists(void)
if (!strncmp(hdr.oem_id, v_info->oem_id, ACPI_OEM_ID_SIZE) &&
!strncmp(hdr.oem_table_id, v_info->oem_table_id,
ACPI_OEM_TABLE_ID_SIZE) &&
-   intel_pstate_has_acpi_ppc())
+   intel_pstate_has_acpi_ppc() &&
+   !load_on_sun)
return true;
}

@@ -1080,6 +1082,8 @@ static int __init intel_pstate_setup(char *str)

if (!strcmp(str, "disable"))
no_load = 1;
+   if (!strcmp(str, "onora"))
+   load_on_sun = 1;
return 0;
  }
  early_param("intel_pstate", intel_pstate_setup);



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 2/2 v3] intel_pstate: add kernel parameter to enable loading on Sun X86 servers.

2014-11-25 Thread Dirk Brandewie

On 11/24/2014 08:59 PM, Ethan Zhao wrote:

To force loading on Oracle Sun X86 servers, provide one kernel command line
parameter

   intel_pstate = onora

This is for those who are aware of the risks of doing so.

Signed-off-by: Ethan Zhao <ethan.z...@oracle.com>
---
  v2: change to hardware vendor specific naming parameter.

  Documentation/kernel-parameters.txt | 3 +++
  drivers/cpufreq/intel_pstate.c  | 6 +-
  2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/Documentation/kernel-parameters.txt 
b/Documentation/kernel-parameters.txt
index 479f332..e4b1b81 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1446,6 +1446,9 @@ bytes respectively. Such letter suffixes can also be 
entirely omitted.
   disable
 Do not enable intel_pstate as the default
 scaling driver for the supported processors
+  onora
+Enable loading intel_pstate on Oracle Sun Servers(X86).
+only for those who be aware of the risk.


What are the risks?  What is the behaviour if platform power management is
enabled and intel_pstate is trying to control P state selection as well?

If intel_pstate will be able to successfully control P state selection
with platform power management enabled then how about the name "oracle_force"?
Also the documentation should say what the risks are.



intremap=   [X86-64, Intel-IOMMU]
on  enable Interrupt Remapping (default)
diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index fa67fb3..e49b050 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -866,6 +866,7 @@ static struct cpufreq_driver intel_pstate_driver = {
  };

  static int __initdata no_load;
+static unsigned int  load_on_sun;

  static int intel_pstate_msrs_not_valid(void)
  {
@@ -1005,7 +1006,8 @@ static bool intel_pstate_platform_pwr_mgmt_exists(void)
if (!strncmp(hdr.oem_id, v_info->oem_id, ACPI_OEM_ID_SIZE) &&
!strncmp(hdr.oem_table_id, v_info->oem_table_id,
ACPI_OEM_TABLE_ID_SIZE) &&
-   intel_pstate_has_acpi_ppc())
+   intel_pstate_has_acpi_ppc() &&
+   !load_on_sun)
return true;
}

@@ -1080,6 +1082,8 @@ static int __init intel_pstate_setup(char *str)

if (!strcmp(str, "disable"))
no_load = 1;
+   if (!strcmp(str, "onora"))
+   load_on_sun = 1;
return 0;
  }
  early_param("intel_pstate", intel_pstate_setup);



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 1/2 v3] intel_pstate: skip this driver if Sun server has _PPC method

2014-11-25 Thread Dirk Brandewie

On 11/24/2014 08:59 PM, Ethan Zhao wrote:

Oracle Sun X86 servers have dynamic power capping capability that works via
ACPI _PPC method etc, so skip loading this driver if Sun server has ACPI _PPC
enabled.



How about this patch? only compile tested.

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 3468387..db7b8b2 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -1025,15 +1025,46 @@ static bool intel_pstate_no_acpi_pss(void)
return true;
 }

+static bool intel_pstate_has_acpi_ppc(void)
+{
+   int i;
+
+   for_each_possible_cpu(i) {
+   struct acpi_processor *pr = per_cpu(processors, i);
+
+   if (!pr)
+   continue;
+   if (acpi_has_method(pr->handle, "_PPC"))
+   return true;
+   }
+   return false;
+}
+
+enum {
+   PSS,
+   PCC,
+};
+
 struct hw_vendor_info {
u16  valid;
char oem_id[ACPI_OEM_ID_SIZE];
char oem_table_id[ACPI_OEM_TABLE_ID_SIZE];
+   int  oem_pwr_table;
 };

 /* Hardware vendor-specific info that has its own power management modes */
 static struct hw_vendor_info vendor_info[] = {
-   {1, "HP", "ProLiant"},
+   {1, "HP", "ProLiant", PSS},
+   {1, "ORACLE", "X4-2", PCC},
+   {1, "ORACLE", "X4-2L   ", PCC},
+   {1, "ORACLE", "X4-2B   ", PCC},
+   {1, "ORACLE", "X3-2", PCC},
+   {1, "ORACLE", "X3-2L   ", PCC},
+   {1, "ORACLE", "X3-2B   ", PCC},
+   {1, "ORACLE", "X4470M2 ", PCC},
+   {1, "ORACLE", "X4270M3 ", PCC},
+   {1, "ORACLE", "X4270M2 ", PCC},
+   {1, "ORACLE", "X4170M2 ", PCC},
{0, "", ""},
 };

@@ -1057,15 +1088,20 @@ static bool intel_pstate_platform_pwr_mgmt_exists(void)

for (v_info = vendor_info; v_info->valid; v_info++) {
if (!strncmp(hdr.oem_id, v_info->oem_id, ACPI_OEM_ID_SIZE) &&
-   !strncmp(hdr.oem_table_id, v_info->oem_table_id, 
ACPI_OEM_TABLE_ID_SIZE) &&
-   intel_pstate_no_acpi_pss())
-   return true;
+   !strncmp(hdr.oem_table_id, v_info->oem_table_id, 
ACPI_OEM_TABLE_ID_SIZE))
+   switch (v_info->oem_pwr_table) {
+   case PSS:
+   return intel_pstate_no_acpi_pss();
+   case PCC:
+   return intel_pstate_has_acpi_ppc();
+   }
}

return false;
 }
 #else /* CONFIG_ACPI not enabled */
 static inline bool intel_pstate_platform_pwr_mgmt_exists(void) { return false; 
}
+static inline bool intel_pstate_has_acpi_ppc(void) { return false; }
 #endif /* CONFIG_ACPI */

 static int __init intel_pstate_init(void)


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 1/3] intel_pstate: skip the driver if Sun server has ACPI _PPC method

2014-11-20 Thread Dirk Brandewie

On 11/19/2014 12:22 PM, Linda Knippers wrote:

On 11/18/2014 3:37 AM, Ethan Zhao wrote:

Oracle Sun X86 servers have dynamic power capping capability that works via
ACPI _PPC method etc, so skip loading this driver if Sun server has ACPI _PPC
enabled.

Signed-off-by: Ethan Zhao 
---
  drivers/cpufreq/intel_pstate.c | 20 
  1 file changed, 20 insertions(+)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 27bb6d3..5498eb0 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -943,6 +943,21 @@ static bool intel_pstate_no_acpi_pss(void)
return true;
  }

+static bool intel_pstate_has_acpi_ppc(void)
+{
+   int i;
+
+   for_each_possible_cpu(i) {
+   struct acpi_processor *pr = per_cpu(processors, i);
+
+   if (!pr)
+   continue;
+   if (acpi_has_method(pr->handle, "_PPC"))
+   return true;
+   }
+   return false;
+}
+
  struct hw_vendor_info {
u16  valid;
char oem_id[ACPI_OEM_ID_SIZE];
@@ -952,6 +967,7 @@ struct hw_vendor_info {
  /* Hardware vendor-specific info that has its own power management modes */
  static struct hw_vendor_info vendor_info[] = {
{1, "HP", "ProLiant"},
+   {1, "ORACLE", ""},
{0, "", ""},
  };

@@ -969,12 +985,16 @@ static bool intel_pstate_platform_pwr_mgmt_exists(void)
!strncmp(hdr.oem_table_id, v_info->oem_table_id, 
ACPI_OEM_TABLE_ID_SIZE) &&
intel_pstate_no_acpi_pss())
return true;
+   if (!strncmp(hdr.oem_id, v_info->oem_id, ACPI_OEM_ID_SIZE) &&
+   intel_pstate_has_acpi_ppc())


We need to try this on a few platforms to make sure this patch doesn't break the
HP platforms that may or may not need this driver, depending on the BIOS 
settings.



It looks like HP systems would get swept up in this check too if they have _PPC

What about extending the hw_vendor_info struct to include whether _PSS or
_PPC should be done for the platform since it appears that oracle and HP
have implemented similar functionality using two different methods.



I don't suppose you tested on a ProLiant too?

-- ljk


+   return true;
}

return false;
  }
  #else /* CONFIG_ACPI not enabled */
  static inline bool intel_pstate_platform_pwr_mgmt_exists(void) { return 
false; }
+static inline bool intel_pstate_has_acpi_ppc(void) { return false; }
  #endif /* CONFIG_ACPI */

  static int __init intel_pstate_init(void)





--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 1/3] intel_pstate: skip the driver if Sun server has ACPI _PPC method

2014-11-20 Thread Dirk Brandewie

On 11/19/2014 12:22 PM, Linda Knippers wrote:

On 11/18/2014 3:37 AM, Ethan Zhao wrote:

Oracle Sun X86 servers have dynamic power capping capability that works via
ACPI _PPC method etc, so skip loading this driver if Sun server has ACPI _PPC
enabled.

Signed-off-by: Ethan Zhao <ethan.z...@oracle.com>
---
  drivers/cpufreq/intel_pstate.c | 20 
  1 file changed, 20 insertions(+)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 27bb6d3..5498eb0 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -943,6 +943,21 @@ static bool intel_pstate_no_acpi_pss(void)
return true;
  }

+static bool intel_pstate_has_acpi_ppc(void)
+{
+   int i;
+
+   for_each_possible_cpu(i) {
+   struct acpi_processor *pr = per_cpu(processors, i);
+
+   if (!pr)
+   continue;
+   if (acpi_has_method(pr->handle, "_PPC"))
+   return true;
+   }
+   return false;
+}
+
  struct hw_vendor_info {
u16  valid;
char oem_id[ACPI_OEM_ID_SIZE];
@@ -952,6 +967,7 @@ struct hw_vendor_info {
  /* Hardware vendor-specific info that has its own power management modes */
  static struct hw_vendor_info vendor_info[] = {
{1, "HP", "ProLiant"},
+   {1, "ORACLE", ""},
{0, "", ""},
  };

@@ -969,12 +985,16 @@ static bool intel_pstate_platform_pwr_mgmt_exists(void)
!strncmp(hdr.oem_table_id, v_info->oem_table_id, 
ACPI_OEM_TABLE_ID_SIZE) &&
intel_pstate_no_acpi_pss())
return true;
+   if (!strncmp(hdr.oem_id, v_info->oem_id, ACPI_OEM_ID_SIZE) &&
+   intel_pstate_has_acpi_ppc())


We need to try this on a few platforms to make sure this patch doesn't break the
HP platforms that may or may not need this driver, depending on the BIOS 
settings.



It looks like HP systems would get swept up in this check too if they have _PPC

What about extending the hw_vendor_info struct to include whether _PSS or
_PPC should be done for the platform since it appears that oracle and HP
have implemented similar functionality using two different methods.



I don't suppose you tested on a ProLiant too?

-- ljk


+   return true;
}

return false;
  }
  #else /* CONFIG_ACPI not enabled */
  static inline bool intel_pstate_platform_pwr_mgmt_exists(void) { return 
false; }
+static inline bool intel_pstate_has_acpi_ppc(void) { return false; }
  #endif /* CONFIG_ACPI */

  static int __init intel_pstate_init(void)





--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 1/3] intel_pstate: skip the driver if Sun server has ACPI _PPC method

2014-11-19 Thread Dirk Brandewie

On 11/18/2014 12:37 AM, Ethan Zhao wrote:

Oracle Sun X86 servers have dynamic power capping capability that works via
ACPI _PPC method etc, so skip loading this driver if Sun server has ACPI _PPC
enabled.

Signed-off-by: Ethan Zhao 
---
  drivers/cpufreq/intel_pstate.c | 20 
  1 file changed, 20 insertions(+)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 27bb6d3..5498eb0 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -943,6 +943,21 @@ static bool intel_pstate_no_acpi_pss(void)
return true;
  }

+static bool intel_pstate_has_acpi_ppc(void)
+{
+   int i;
+
+   for_each_possible_cpu(i) {
+   struct acpi_processor *pr = per_cpu(processors, i);
+
+   if (!pr)
+   continue;
+   if (acpi_has_method(pr->handle, "_PPC"))
+   return true;
+   }
+   return false;
+}
+
  struct hw_vendor_info {
u16  valid;
char oem_id[ACPI_OEM_ID_SIZE];
@@ -952,6 +967,7 @@ struct hw_vendor_info {
  /* Hardware vendor-specific info that has its own power management modes */
  static struct hw_vendor_info vendor_info[] = {
{1, "HP", "ProLiant"},
+   {1, "ORACLE", ""},
{0, "", ""},
  };



Does this apply to ALL oracle systems?

Is the presence or absence of the _PPC method configurable in the oracle BIOS?


@@ -969,12 +985,16 @@ static bool intel_pstate_platform_pwr_mgmt_exists(void)
!strncmp(hdr.oem_table_id, v_info->oem_table_id, 
ACPI_OEM_TABLE_ID_SIZE) &&
intel_pstate_no_acpi_pss())
return true;
+   if (!strncmp(hdr.oem_id, v_info->oem_id, ACPI_OEM_ID_SIZE) &&
+   intel_pstate_has_acpi_ppc())
+   return true;
}

return false;
  }
  #else /* CONFIG_ACPI not enabled */
  static inline bool intel_pstate_platform_pwr_mgmt_exists(void) { return 
false; }
+static inline bool intel_pstate_has_acpi_ppc(void) { return false; }
  #endif /* CONFIG_ACPI */

  static int __init intel_pstate_init(void)



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 1/3] intel_pstate: skip the driver if Sun server has ACPI _PPC method

2014-11-19 Thread Dirk Brandewie

On 11/18/2014 12:37 AM, Ethan Zhao wrote:

Oracle Sun X86 servers have dynamic power capping capability that works via
ACPI _PPC method etc, so skip loading this driver if Sun server has ACPI _PPC
enabled.

Signed-off-by: Ethan Zhao <ethan.z...@oracle.com>
---
  drivers/cpufreq/intel_pstate.c | 20 
  1 file changed, 20 insertions(+)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 27bb6d3..5498eb0 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -943,6 +943,21 @@ static bool intel_pstate_no_acpi_pss(void)
return true;
  }

+static bool intel_pstate_has_acpi_ppc(void)
+{
+   int i;
+
+   for_each_possible_cpu(i) {
+   struct acpi_processor *pr = per_cpu(processors, i);
+
+   if (!pr)
+   continue;
+   if (acpi_has_method(pr->handle, "_PPC"))
+   return true;
+   }
+   return false;
+}
+
  struct hw_vendor_info {
u16  valid;
char oem_id[ACPI_OEM_ID_SIZE];
@@ -952,6 +967,7 @@ struct hw_vendor_info {
  /* Hardware vendor-specific info that has its own power management modes */
  static struct hw_vendor_info vendor_info[] = {
{1, "HP", "ProLiant"},
+   {1, "ORACLE", ""},
{0, "", ""},
  };



Does this apply to ALL oracle systems?

Is the presence or absence of the _PPC method configurable in the oracle BIOS?


@@ -969,12 +985,16 @@ static bool intel_pstate_platform_pwr_mgmt_exists(void)
!strncmp(hdr.oem_table_id, v_info->oem_table_id, 
ACPI_OEM_TABLE_ID_SIZE) &&
intel_pstate_no_acpi_pss())
return true;
+   if (!strncmp(hdr.oem_id, v_info->oem_id, ACPI_OEM_ID_SIZE) &&
+   intel_pstate_has_acpi_ppc())
+   return true;
}

return false;
  }
  #else /* CONFIG_ACPI not enabled */
  static inline bool intel_pstate_platform_pwr_mgmt_exists(void) { return 
false; }
+static inline bool intel_pstate_has_acpi_ppc(void) { return false; }
  #endif /* CONFIG_ACPI */

  static int __init intel_pstate_init(void)



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 1/2] x86: Add support for Intel HWP feature detection.

2014-11-06 Thread dirk . brandewie
From: Dirk Brandewie 

Add support of Hardware Managed Performance States (HWP) described in Volume 3
section 14.4 of the SDM.

One bit CPUID.06H:EAX[bit 7] expresses the presence of the HWP feature on
the processor. The remaining bits CPUID.06H:EAX[bit 8-11] denote the
presence of various HWP features.

Cc: x...@kernel.org
Signed-off-by: Dirk Brandewie 
---
 arch/x86/include/asm/cpufeature.h | 5 +
 arch/x86/kernel/cpu/scattered.c   | 5 +
 2 files changed, 10 insertions(+)

diff --git a/arch/x86/include/asm/cpufeature.h 
b/arch/x86/include/asm/cpufeature.h
index 0bb1335..aede2c3 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -189,6 +189,11 @@
 #define X86_FEATURE_DTHERM ( 7*32+ 7) /* Digital Thermal Sensor */
 #define X86_FEATURE_HW_PSTATE  ( 7*32+ 8) /* AMD HW-PState */
 #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
+#define X86_FEATURE_HWP( 7*32+ 10) /* "hwp" Intel HWP */
+#define X86_FEATURE_HWP_NOITFY ( 7*32+ 11) /* Intel HWP_NOTIFY */
+#define X86_FEATURE_HWP_ACT_WINDOW ( 7*32+ 12) /* Intel HWP_ACT_WINDOW */
+#define X86_FEATURE_HWP_EPP( 7*32+13) /* Intel HWP_EPP */
+#define X86_FEATURE_HWP_PKG_REQ ( 7*32+14) /* Intel HWP_PKG_REQ */
 
 /* Virtualization flags: Linux defined, word 8 */
 #define X86_FEATURE_TPR_SHADOW  ( 8*32+ 0) /* Intel TPR Shadow */
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index 4a8013d..6063909 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -36,6 +36,11 @@ void init_scattered_cpuid_features(struct cpuinfo_x86 *c)
{ X86_FEATURE_ARAT, CR_EAX, 2, 0x0006, 0 },
{ X86_FEATURE_PLN,  CR_EAX, 4, 0x0006, 0 },
{ X86_FEATURE_PTS,  CR_EAX, 6, 0x0006, 0 },
+   { X86_FEATURE_HWP,  CR_EAX, 7, 0x0006, 0 },
+   { X86_FEATURE_HWP_NOITFY,   CR_EAX, 8, 0x0006, 0 },
+   { X86_FEATURE_HWP_ACT_WINDOW,   CR_EAX, 9, 0x0006, 0 },
+   { X86_FEATURE_HWP_EPP,  CR_EAX,10, 0x0006, 0 },
+   { X86_FEATURE_HWP_PKG_REQ,  CR_EAX,11, 0x0006, 0 },
{ X86_FEATURE_APERFMPERF,   CR_ECX, 0, 0x0006, 0 },
{ X86_FEATURE_EPB,  CR_ECX, 3, 0x0006, 0 },
{ X86_FEATURE_HW_PSTATE,CR_EDX, 7, 0x8007, 0 },
-- 
1.9.3

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 2/2] intel_pstate: Add support for HWP

2014-11-06 Thread dirk . brandewie
From: Dirk Brandewie 

Add support of Hardware Managed Performance States (HWP) described in Volume 3
section 14.4 of the SDM.

With HWP enabled intel_pstate will no longer be responsible for selecting P
states for the processor. intel_pstate will continue to register to
the cpufreq core as the scaling driver for CPUs implementing
HWP. In HWP mode intel_pstate provides three functions: reporting
frequency to the cpufreq core, support for the set_policy() interface
from the core, and maintaining the intel_pstate sysfs interface in
/sys/devices/system/cpu/intel_pstate.  User preferences expressed via
the set_policy() interface or the sysfs interface are forwarded to the
CPU via the HWP MSR interface.

Signed-off-by: Dirk Brandewie 
---
 Documentation/cpu-freq/intel-pstate.txt |  37 
 Documentation/kernel-parameters.txt |   3 +
 arch/x86/include/uapi/asm/msr-index.h   |  41 +
 drivers/cpufreq/intel_pstate.c  | 102 +++-
 4 files changed, 168 insertions(+), 15 deletions(-)

diff --git a/Documentation/cpu-freq/intel-pstate.txt 
b/Documentation/cpu-freq/intel-pstate.txt
index a69ffe1..765d7fc 100644
--- a/Documentation/cpu-freq/intel-pstate.txt
+++ b/Documentation/cpu-freq/intel-pstate.txt
@@ -1,17 +1,28 @@
 Intel P-state driver
 
 
-This driver implements a scaling driver with an internal governor for
-Intel Core processors.  The driver follows the same model as the
-Transmeta scaling driver (longrun.c) and implements the setpolicy()
-instead of target().  Scaling drivers that implement setpolicy() are
-assumed to implement internal governors by the cpufreq core. All the
-logic for selecting the current P state is contained within the
-driver; no external governor is used by the cpufreq core.
-
-Intel SandyBridge+ processors are supported.
-
-New sysfs files for controlling P state selection have been added to
+This driver provides an interface to control the P state selection for
+SandyBridge+ Intel processors.  The driver can operate in two different
+modes based on the processor model: legacy mode and Hardware P state (HWP)
+mode.
+
+In legacy mode the driver implements a scaling driver with an internal
+governor for Intel Core processors.  The driver follows the same model
+as the Transmeta scaling driver (longrun.c) and implements the
+setpolicy() instead of target().  Scaling drivers that implement
+setpolicy() are assumed to implement internal governors by the cpufreq
+core. All the logic for selecting the current P state is contained
+within the driver; no external governor is used by the cpufreq core.
+
+In HWP mode P state selection is implemented in the processor
+itself. The driver provides the interfaces between the cpufreq core and
+the processor to control P state selection based on user preferences
+and reporting frequency to the cpufreq core.  In this mode the
+internal governor code is disabled.
+
+In addition to the interfaces provided by the cpufreq core for
+controlling frequency the driver provides sysfs files for
+controlling P state selection. These files have been added to
 /sys/devices/system/cpu/intel_pstate/
 
   max_perf_pct: limits the maximum P state that will be requested by
@@ -33,7 +44,9 @@ frequency is fiction for Intel Core processors. Even if the 
scaling
 driver selects a single P state the actual frequency the processor
 will run at is selected by the processor itself.
 
-New debugfs files have also been added to /sys/kernel/debug/pstate_snb/
+For legacy mode debugfs files have also been added to allow tuning of
+the internal governor algorithm. These files are located at
+/sys/kernel/debug/pstate_snb/. These files are NOT present in HWP mode.
 
   deadband
   d_gain_pct
diff --git a/Documentation/kernel-parameters.txt 
b/Documentation/kernel-parameters.txt
index 4c81a86..571f9d2 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1446,6 +1446,9 @@ bytes respectively. Such letter suffixes can also be 
entirely omitted.
   disable
 Do not enable intel_pstate as the default
 scaling driver for the supported processors
+  no_hwp
+Do not enable hardware P state control (HWP)
+if available.   
 
intremap=   [X86-64, Intel-IOMMU]
on  enable Interrupt Remapping (default)
diff --git a/arch/x86/include/uapi/asm/msr-index.h 
b/arch/x86/include/uapi/asm/msr-index.h
index e21331c..86dda97 100644
--- a/arch/x86/include/uapi/asm/msr-index.h
+++ b/arch/x86/include/uapi/asm/msr-index.h
@@ -152,6 +152,45 @@
 #define MSR_CC6_DEMOTION_POLICY_CONFIG 0x0668
 #define MSR_MC6_DEMOTION_POLICY_CONFIG 0x0669
 
+/* Hardware P state interface */
+#define MSR_PPERF  0x064e
+#define MSR_PERF_LIMIT_REASONS 0x064f
+#define MSR_PM_ENABLE

[PATCH 0/2] intel_pstate: Add support for hardware managed P states (HWP)

2014-11-06 Thread dirk . brandewie
From: Dirk Brandewie 

This patch set adds support for HWP. When HWP is enabled the CPU will
do P state selection autonomously and intel_pstate simply provides an interface
to forward user preferences to the CPU while maintaining the
interfaces required by the cpufreq core. 

Dirk Brandewie (2):
  x86: Add support for Intel HWP feature detection.
  intel_pstate: Add support for HWP

 Documentation/cpu-freq/intel-pstate.txt |  37 
 Documentation/kernel-parameters.txt |   3 +
 arch/x86/include/asm/cpufeature.h   |   5 ++
 arch/x86/include/uapi/asm/msr-index.h   |  41 +
 arch/x86/kernel/cpu/scattered.c |   5 ++
 drivers/cpufreq/intel_pstate.c  | 102 +++-
 6 files changed, 178 insertions(+), 15 deletions(-)

-- 
1.9.3

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 0/2] intel_pstate: Add support for hardware managed P states (HWP)

2014-11-06 Thread dirk . brandewie
From: Dirk Brandewie dirk.j.brande...@intel.com

This patch set adds support for HWP. When HWP is enabled the CPU will
do P state selection autonomously and intel_pstate simply provides an interface
to forward user preferences to the CPU while maintaining the
interfaces required by the cpufreq core. 

Dirk Brandewie (2):
  x86: Add support for Intel HWP feature detection.
  intel_pstate: Add support for HWP

 Documentation/cpu-freq/intel-pstate.txt |  37 
 Documentation/kernel-parameters.txt |   3 +
 arch/x86/include/asm/cpufeature.h   |   5 ++
 arch/x86/include/uapi/asm/msr-index.h   |  41 +
 arch/x86/kernel/cpu/scattered.c |   5 ++
 drivers/cpufreq/intel_pstate.c  | 102 +++-
 6 files changed, 178 insertions(+), 15 deletions(-)

-- 
1.9.3

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 2/2] intel_pstate: Add support for HWP

2014-11-06 Thread dirk . brandewie
From: Dirk Brandewie dirk.j.brande...@intel.com

Add support of Hardware Managed Performance States (HWP) described in Volume 3
section 14.4 of the SDM.

With HWP enabled intel_pstate will no longer be responsible for selecting P
states for the processor. intel_pstate will continue to register to
the cpufreq core as the scaling driver for CPUs implementing
HWP. In HWP mode intel_pstate provides three functions: reporting
frequency to the cpufreq core, support for the set_policy() interface
from the core and maintaining the intel_pstate sysfs interface in
/sys/devices/system/cpu/intel_pstate.  User preferences expressed via
the set_policy() interface or the sysfs interface are forwarded to the
CPU via the HWP MSR interface.

Signed-off-by: Dirk Brandewie dirk.j.brande...@intel.com
---
 Documentation/cpu-freq/intel-pstate.txt |  37 
 Documentation/kernel-parameters.txt |   3 +
 arch/x86/include/uapi/asm/msr-index.h   |  41 +
 drivers/cpufreq/intel_pstate.c  | 102 +++-
 4 files changed, 168 insertions(+), 15 deletions(-)

diff --git a/Documentation/cpu-freq/intel-pstate.txt 
b/Documentation/cpu-freq/intel-pstate.txt
index a69ffe1..765d7fc 100644
--- a/Documentation/cpu-freq/intel-pstate.txt
+++ b/Documentation/cpu-freq/intel-pstate.txt
@@ -1,17 +1,28 @@
 Intel P-state driver
 
 
-This driver implements a scaling driver with an internal governor for
-Intel Core processors.  The driver follows the same model as the
-Transmeta scaling driver (longrun.c) and implements the setpolicy()
-instead of target().  Scaling drivers that implement setpolicy() are
-assumed to implement internal governors by the cpufreq core. All the
-logic for selecting the current P state is contained within the
-driver; no external governor is used by the cpufreq core.
-
-Intel SandyBridge+ processors are supported.
-
-New sysfs files for controlling P state selection have been added to
+This driver provides an interface to control the P state selection for
+SandyBridge+ Intel processors.  The driver can operate in two different
+modes based on the processor model: legacy and Hardware P state (HWP)
+mode.
+
+In legacy mode the driver implements a scaling driver with an internal
+governor for Intel Core processors.  The driver follows the same model
+as the Transmeta scaling driver (longrun.c) and implements the
+setpolicy() instead of target().  Scaling drivers that implement
+setpolicy() are assumed to implement internal governors by the cpufreq
+core. All the logic for selecting the current P state is contained
+within the driver; no external governor is used by the cpufreq core.
+
+In HWP mode P state selection is implemented in the processor
+itself. The driver provides the interfaces between the cpufreq core and
+the processor to control P state selection based on user preferences
+and reporting frequency to the cpufreq core.  In this mode the
+internal governor code is disabled.
+
+In addition to the interfaces provided by the cpufreq core for
+controlling frequency the driver provides sysfs files for
+controlling P state selection. These files have been added to
 /sys/devices/system/cpu/intel_pstate/
 
   max_perf_pct: limits the maximum P state that will be requested by
@@ -33,7 +44,9 @@ frequency is fiction for Intel Core processors. Even if the 
scaling
 driver selects a single P state the actual frequency the processor
 will run at is selected by the processor itself.
 
-New debugfs files have also been added to /sys/kernel/debug/pstate_snb/
+For legacy mode debugfs files have also been added to allow tuning of
+the internal governor algorithm. These files are located at
+/sys/kernel/debug/pstate_snb/. These files are NOT present in HWP mode.
 
   deadband
   d_gain_pct
diff --git a/Documentation/kernel-parameters.txt 
b/Documentation/kernel-parameters.txt
index 4c81a86..571f9d2 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1446,6 +1446,9 @@ bytes respectively. Such letter suffixes can also be 
entirely omitted.
   disable
 Do not enable intel_pstate as the default
 scaling driver for the supported processors
+  no_hwp
+Do not enable hardware P state control (HWP)
+if available.   
 
intremap=   [X86-64, Intel-IOMMU]
on  enable Interrupt Remapping (default)
diff --git a/arch/x86/include/uapi/asm/msr-index.h 
b/arch/x86/include/uapi/asm/msr-index.h
index e21331c..86dda97 100644
--- a/arch/x86/include/uapi/asm/msr-index.h
+++ b/arch/x86/include/uapi/asm/msr-index.h
@@ -152,6 +152,45 @@
 #define MSR_CC6_DEMOTION_POLICY_CONFIG 0x0668
 #define MSR_MC6_DEMOTION_POLICY_CONFIG 0x0669
 
+/* Hardware P state interface */
+#define MSR_PPERF  0x064e
+#define MSR_PERF_LIMIT_REASONS

[PATCH 1/2] x86: Add support for Intel HWP feature detection.

2014-11-06 Thread dirk . brandewie
From: Dirk Brandewie dirk.j.brande...@intel.com

Add support of Hardware Managed Performance States (HWP) described in Volume 3
section 14.4 of the SDM.

One bit CPUID.06H:EAX[bit 7] expresses the presence of the HWP feature on
the processor. The remaining bits CPUID.06H:EAX[bit 8-11] denote the
presence of various HWP features.

Cc: x...@kernel.org
Signed-off-by: Dirk Brandewie dirk.j.brande...@intel.com
---
 arch/x86/include/asm/cpufeature.h | 5 +
 arch/x86/kernel/cpu/scattered.c   | 5 +
 2 files changed, 10 insertions(+)

diff --git a/arch/x86/include/asm/cpufeature.h 
b/arch/x86/include/asm/cpufeature.h
index 0bb1335..aede2c3 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -189,6 +189,11 @@
 #define X86_FEATURE_DTHERM ( 7*32+ 7) /* Digital Thermal Sensor */
 #define X86_FEATURE_HW_PSTATE  ( 7*32+ 8) /* AMD HW-PState */
 #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
+#define X86_FEATURE_HWP( 7*32+ 10) /* hwp Intel HWP */
+#define X86_FEATURE_HWP_NOITFY ( 7*32+ 11) /* Intel HWP_NOTIFY */
+#define X86_FEATURE_HWP_ACT_WINDOW ( 7*32+ 12) /* Intel HWP_ACT_WINDOW */
+#define X86_FEATURE_HWP_EPP( 7*32+13) /* Intel HWP_EPP */
+#define X86_FEATURE_HWP_PKG_REQ ( 7*32+14) /* Intel HWP_PKG_REQ */
 
 /* Virtualization flags: Linux defined, word 8 */
 #define X86_FEATURE_TPR_SHADOW  ( 8*32+ 0) /* Intel TPR Shadow */
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index 4a8013d..6063909 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -36,6 +36,11 @@ void init_scattered_cpuid_features(struct cpuinfo_x86 *c)
{ X86_FEATURE_ARAT, CR_EAX, 2, 0x0006, 0 },
{ X86_FEATURE_PLN,  CR_EAX, 4, 0x0006, 0 },
{ X86_FEATURE_PTS,  CR_EAX, 6, 0x0006, 0 },
+   { X86_FEATURE_HWP,  CR_EAX, 7, 0x0006, 0 },
+   { X86_FEATURE_HWP_NOITFY,   CR_EAX, 8, 0x0006, 0 },
+   { X86_FEATURE_HWP_ACT_WINDOW,   CR_EAX, 9, 0x0006, 0 },
+   { X86_FEATURE_HWP_EPP,  CR_EAX,10, 0x0006, 0 },
+   { X86_FEATURE_HWP_PKG_REQ,  CR_EAX,11, 0x0006, 0 },
{ X86_FEATURE_APERFMPERF,   CR_ECX, 0, 0x0006, 0 },
{ X86_FEATURE_EPB,  CR_ECX, 3, 0x0006, 0 },
{ X86_FEATURE_HW_PSTATE,CR_EDX, 7, 0x8007, 0 },
-- 
1.9.3

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v2] cpufreq: allow powersave governor as the default without expert mode

2014-10-31 Thread Dirk Brandewie

On 10/30/2014 02:18 PM, Rafael J. Wysocki wrote:

On Thursday, October 16, 2014 07:37:11 AM James Geboski wrote:

The intel_pstate driver only supports the performance and the powersave
governors. With the performance governor ensuring the highest possible
performance settings, userspace tools fail to make any lasting changes.
In order to allow userspace tools to make modifications to the settings,
the powersave governor must be in use. This makes having the powersave
governor as the default convenient for systems where the intel_pstate
driver is being employed. Having to enable expert mode in the kernel
configuration is just a headache for such a trivial task.

This patch applies to all kernel versions 2.6.38 or greater after the
migration from CONFIG_EMBEDDED to CONFIG_EXPERT (6a108a14fa35). Most
importantly, this applies to kernel versions 3.9 or greater when the
intel_pstate driver was introduced.

Signed-off-by: James Geboski 
Acked-by: Viresh Kumar 


Dirk, any objections?


No objection.



---
ChangeLog v2:
   - Acked-by: Viresh Kumar 
---
  drivers/cpufreq/Kconfig | 1 -
  1 file changed, 1 deletion(-)

diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig
index 3489f8f..73df7db 100644
--- a/drivers/cpufreq/Kconfig
+++ b/drivers/cpufreq/Kconfig
@@ -63,7 +63,6 @@ config CPU_FREQ_DEFAULT_GOV_PERFORMANCE

  config CPU_FREQ_DEFAULT_GOV_POWERSAVE
bool "powersave"
-   depends on EXPERT
select CPU_FREQ_GOV_POWERSAVE
help
  Use the CPUFreq governor 'powersave' as default. This sets





--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v2] cpufreq: allow powersave governor as the default without expert mode

2014-10-31 Thread Dirk Brandewie

On 10/30/2014 02:18 PM, Rafael J. Wysocki wrote:

On Thursday, October 16, 2014 07:37:11 AM James Geboski wrote:

The intel_pstate driver only supports the performance and the powersave
governors. With the performance governor ensuring the highest possible
performance settings, userspace tools fail to make any lasting changes.
In order to allow userspace tools to make modifications to the settings,
the powersave governor must be in use. This makes having the powersave
governor as the default convenient for systems where the intel_pstate
driver is being employed. Having to enable expert mode in the kernel
configuration is just a headache for such a trivial task.

This patch applies to all kernel versions 2.6.38 or greater after the
migration from CONFIG_EMBEDDED to CONFIG_EXPERT (6a108a14fa35). Most
importantly, this applies to kernel versions 3.9 or greater when the
intel_pstate driver was introduced.

Signed-off-by: James Geboski jgebo...@gmail.com
Acked-by: Viresh Kumar viresh.ku...@linaro.org


Dirk, any objections?


No objection.



---
ChangeLog v2:
   - Acked-by: Viresh Kumar viresh.ku...@linaro.org
---
  drivers/cpufreq/Kconfig | 1 -
  1 file changed, 1 deletion(-)

diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig
index 3489f8f..73df7db 100644
--- a/drivers/cpufreq/Kconfig
+++ b/drivers/cpufreq/Kconfig
@@ -63,7 +63,6 @@ config CPU_FREQ_DEFAULT_GOV_PERFORMANCE

  config CPU_FREQ_DEFAULT_GOV_POWERSAVE
bool "powersave"
-   depends on EXPERT
select CPU_FREQ_GOV_POWERSAVE
help
  Use the CPUFreq governor 'powersave' as default. This sets





--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] cpufreq: intel_pstate: Fix setting max_perf_pct in performance policy

2014-10-16 Thread Dirk Brandewie

On 10/15/2014 04:16 PM, Pali Rohár wrote:

Code which changes policy to powersave changes also max_policy_pct based on
max_freq. Code which change max_perf_pct has upper limit base on value
max_policy_pct. When policy is changing from powersave back to performance
then max_policy_pct is not changed. Which means that changing max_perf_pct is
not possible to high values if max_freq was too low in powersave policy.

Test case:

$ cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_min_freq
80
$ cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq
330
$ cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor
performance
$ cat /sys/devices/system/cpu/intel_pstate/max_perf_pct
100

$ echo powersave > /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor
$ echo 80 > /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq
$ echo 20 > /sys/devices/system/cpu/intel_pstate/max_perf_pct

$ cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor
powersave
$ cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq
80
$ cat /sys/devices/system/cpu/intel_pstate/max_perf_pct
20

$ echo performance > /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor
$ echo 330 > /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq
$ echo 100 > /sys/devices/system/cpu/intel_pstate/max_perf_pct

$ cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor
performance
$ cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq
330
$ cat /sys/devices/system/cpu/intel_pstate/max_perf_pct
24

And now intel_pstate driver allows to set maximal value for max_perf_pct based
on max_policy_pct which is 24 for previous powersave max_freq 80.

This patch will set default value for max_policy_pct when setting policy to
performance so it will allow to set also max value for max_perf_pct.

Signed-off-by: Pali Rohár 


Acked-by: Dirk Brandewie 


Cc: sta...@vger.kernel.org
---
  drivers/cpufreq/intel_pstate.c |1 +
  1 file changed, 1 insertion(+)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 0668b38..7547ab5 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -714,6 +714,7 @@ static int intel_pstate_set_policy(struct cpufreq_policy 
*policy)
if (policy->policy == CPUFREQ_POLICY_PERFORMANCE) {
limits.min_perf_pct = 100;
limits.min_perf = int_tofp(1);
+   limits.max_policy_pct = 100;
limits.max_perf_pct = 100;
limits.max_perf = int_tofp(1);
limits.no_turbo = limits.turbo_disabled;



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] cpufreq: intel_pstate: Fix setting max_perf_pct in performance policy

2014-10-16 Thread Dirk Brandewie

On 10/15/2014 04:16 PM, Pali Rohár wrote:

Code which changes policy to powersave changes also max_policy_pct based on
max_freq. Code which change max_perf_pct has upper limit base on value
max_policy_pct. When policy is changing from powersave back to performance
then max_policy_pct is not changed. Which means that changing max_perf_pct is
not possible to high values if max_freq was too low in powersave policy.

Test case:

$ cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_min_freq
80
$ cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq
330
$ cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor
performance
$ cat /sys/devices/system/cpu/intel_pstate/max_perf_pct
100

$ echo powersave > /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor
$ echo 80 > /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq
$ echo 20 > /sys/devices/system/cpu/intel_pstate/max_perf_pct

$ cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor
powersave
$ cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq
80
$ cat /sys/devices/system/cpu/intel_pstate/max_perf_pct
20

$ echo performance > /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor
$ echo 330 > /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq
$ echo 100 > /sys/devices/system/cpu/intel_pstate/max_perf_pct

$ cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor
performance
$ cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq
330
$ cat /sys/devices/system/cpu/intel_pstate/max_perf_pct
24

And now intel_pstate driver allows to set maximal value for max_perf_pct based
on max_policy_pct which is 24 for previous powersave max_freq 80.

This patch will set default value for max_policy_pct when setting policy to
performance so it will allow to set also max value for max_perf_pct.

Signed-off-by: Pali Rohár pali.ro...@gmail.com


Acked-by: Dirk Brandewie dirk.j.brande...@intel.com


Cc: sta...@vger.kernel.org
---
  drivers/cpufreq/intel_pstate.c |1 +
  1 file changed, 1 insertion(+)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 0668b38..7547ab5 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -714,6 +714,7 @@ static int intel_pstate_set_policy(struct cpufreq_policy 
*policy)
if (policy->policy == CPUFREQ_POLICY_PERFORMANCE) {
limits.min_perf_pct = 100;
limits.min_perf = int_tofp(1);
+   limits.max_policy_pct = 100;
limits.max_perf_pct = 100;
limits.max_perf = int_tofp(1);
limits.no_turbo = limits.turbo_disabled;



--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] intel_pstate: track and export frequency residency stats via sysfs.

2014-09-11 Thread Dirk Brandewie

On 09/10/2014 06:04 PM, Sameer Nanda wrote:

On Wed, Sep 10, 2014 at 5:04 PM, Rafael J. Wysocki  wrote:

On Wednesday, September 10, 2014 04:39:05 PM Anup Chenthamarakshan wrote:

On Thu, Sep 11, 2014 at 12:49:48AM +0200, Rafael J. Wysocki wrote:

On Wednesday, September 10, 2014 03:15:08 PM Anup Chenthamarakshan wrote:


Tools like powertop and turbostat are not present by default on all systems,
so it is not always possible to use them :(


Which systems are you referring to in particular?


We're testing on Chrome OS devices (Chromebooks).


How big of a deal is it to install the tools mentioned above on such a system?

At least turbostat is shipped with the kernel source.


Given the web browser based front end of Chrome OS, installing these
tools will only get us so far -- if the system is in developer mode,
the tools are accessible but when the system is in normal (verified
boot mode) these tools cannot be launched directly.

We are in the process of switching Chrome OS x86 kernels from ondemand
governor to intel_pstate.  When debugging power consumption issues,
losing the ability to easily get CPU frequency related information as
a side-effect of this switch is less than ideal.

We are happy to spin this patch to expose aperf/mperf based CPU
frequency information if you think that is the better route to take
longer term.


You can get the frequency as measured by intel_pstate from /proc/cpuinfo
or /sys/devices/system/cpu/cpu[n]/cpufreq/cpuinfo_cur_freq but this is only
for the most recent sample on cpu[n]

reading MSR 0x199 at some reasonable rate will let you graph what request
is being made on each core.





--
I speak only for myself.
Rafael J. Wysocki, Intel Open Source Technology Center.






--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] intel_pstate: track and export frequency residency stats via sysfs.

2014-09-11 Thread Dirk Brandewie

On 09/10/2014 06:04 PM, Sameer Nanda wrote:

On Wed, Sep 10, 2014 at 5:04 PM, Rafael J. Wysocki r...@rjwysocki.net wrote:

On Wednesday, September 10, 2014 04:39:05 PM Anup Chenthamarakshan wrote:

On Thu, Sep 11, 2014 at 12:49:48AM +0200, Rafael J. Wysocki wrote:

On Wednesday, September 10, 2014 03:15:08 PM Anup Chenthamarakshan wrote:


Tools like powertop and turbostat are not present by default on all systems,
so it is not always possible to use them :(


Which systems are you referring to in particular?


We're testing on Chrome OS devices (Chromebooks).


How big of a deal is it to install the tools mentioned above on such a system?

At least turbostat is shipped with the kernel source.


Given the web browser based front end of Chrome OS, installing these
tools will only get us so far -- if the system is in developer mode,
the tools are accessible but when the system is in normal (verified
boot mode) these tools cannot be launched directly.

We are in the process of switching Chrome OS x86 kernels from ondemand
governor to intel_pstate.  When debugging power consumption issues,
losing the ability to easily get CPU frequency related information as
a side-effect of this switch is less than ideal.

We are happy to spin this patch to expose aperf/mperf based CPU
frequency information if you think that is the better route to take
longer term.


You can get the frequency as measured by intel_pstate from /proc/cpuinfo
or /sys/devices/system/cpu/cpu[n]/cpufreq/cpuinfo_cur_freq but this is only
for the most recent sample on cpu[n]

reading MSR 0x199 at some reasonable rate will let you graph what request
is being made on each core.





--
I speak only for myself.
Rafael J. Wysocki, Intel Open Source Technology Center.






--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC v1 0/6] CPPC as a PID backend

2014-09-10 Thread Dirk Brandewie

On 09/10/2014 09:11 AM, Ashwin Chaugule wrote:

On 10 September 2014 11:44, Dirk Brandewie  wrote:

Hi Ashwin,


Hi Dirk,



I think the CPPC based driver should be a separate driver.

We made the conscious decision to not use any of the ACPI mechanisms
to enumerate or control P state selection.  Experience over the years
has shown that the quality/accuracy of the BIOS/ACPI implementations
vary widely across OEM's and platform types from a single OEM. Features
that always work on a server platform from a given OEM may not work or
provide bad information on client platforms for example.

Another reason for doing intel_pstate was to be able to land intel specific
features and fixes without breaking other architectures as the power
management capabilities of the platform evolve. As processors that support
Hardware P states (HWP) as described in section 14.4 of the current SDM
come into the market intel_pstate will change to not doing much other
than enabling HWP and providing an interface to forward user configuration
requests to the processor if the user chooses to enable HWP otherwise the
current mechanisms will be used.  This is why the intel_pstate sysfs
interface is the way it is to be able to map cleanly to HWP and provide
an abstract interface going forward.

Having separate drivers allows the system integrator/user to select the
most appropriate mechanism for their system.

--Dirk


With the current split I think you will still be able to maintain
Intel specific changes for the future in the backend driver. The PID
algorithm seems platform independent anyway and the PID knobs are
exported to userspace for platform specific tuning. The Intel backend
driver should be unaffected by the CPPC (ACPI) backend. We can also
make them mutually exclusive at runtime.


We could make it runtime selectable whether to use CPPC or the
native mechanisms for P state enumeration and selection but we would
get into an awful black/white list situation that would not make
anyone happy.

Using CPPC on Intel platforms implies using HWP which is already
planned for in intel_pstate.  I am not aware of any effort to support
CPPC on Intel platforms that do not support HWP.  For Intel platforms
using CPPC is NOT needed or desirable IMHO.  We had many conversations
over many months while CPPC was being defined and made the decision
to not use this mechanism on Intel Linux platforms.

For other platforms that plan on conforming to ACPI 5.x with respect
to P state enumeration and selection I would like to leave it to them
to herd all the cats at the OEMs to get CPPC correct on all their platforms.



Or are you suggesting using PID + CPPC as another driver? IIUC, that
would lead to a lot of redundancy.



The redundancy is actually pretty small IMHO if you take out the
enumeration/init code the code shared at runtime is pretty small
sample/calc_busy/PID.



Cheers,
Ashwin



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] intel_pstate: track and export frequency residency stats via sysfs.

2014-09-10 Thread Dirk Brandewie

On 09/09/2014 04:22 PM, Anup Chenthamarakshan wrote:

On Tue, Sep 09, 2014 at 08:15:13AM -0700, Dirk Brandewie wrote:

On 09/08/2014 05:10 PM, Anup Chenthamarakshan wrote:

Exported stats appear in
/devices/system/cpu/intel_pstate/time_in_state as follows:

## CPU 0
40 3647
50 24342
60 144150
70 202469
## CPU 1
40 4813
50 22628
60 149564
70 211885
80 173890

Signed-off-by: Anup Chenthamarakshan 


What is this information being used for?


I'm using P-state residency information in power consumption tests to calculate
proportion of time spent in each P-state across all processors (one global set
of percentages, corresponding to each P-state). This is used to validate new
changes from the power perspective. Essentially, sanity checks to flag changes
with large difference in P-state residency.

So far, we've been using the data exported by acpi-cpufreq to track this.



Tracking the current P state request for each core is only part of the
story.  The processor aggregates the requests from all cores and then decides
what frequency the package will run at, this evaluation happens at ~1ms time
frame.  If a core is idle then it loses its vote for what the package frequency
will be and its frequency will be zero even though it may have been requesting
a high P state when it went idle.  Tracking the residency of the requested
P state doesn't provide much useful information other than ensuring that the
requests are changing over time IMHO.


This is exactly why we're trying to track it.


My point is that you are tracking the residency of the request and not
the P state the package was running at.  On a lightly loaded system
it is not unusual for a core that was very busy and requesting a high
P state to go idle for several seconds.  In this case that core would
lose its vote for the package P state but the stats would show that
the P state was high for a very long time when its real frequency
was zero.

There are a couple of ways to get what I consider better information
about what is actually going on.

  The current turbostat provides C state residency and calculates the
  average/effective frequency of the core over its sample time.
  Turbostat will also measure the power consumption from the CPU point
  of view if your processor supports the RAPL registers.

  Reading MSR 0x198 MSR_IA32_PERF_STATUS will tell you what the core
  would run at if it were not idle, this reflects the decision that the
  package made based on current requests.

  Using perf to collect power:pstate_sample event will give information
  about each sample on the core and give you timestamps to detect idle
  times.

  Using perf to collect power:cpu_frequency will show when the P state
  request was changed on each core and is triggered by intel_pstate and
  acpi_cpufreq.

  Powertop collects the same information as turbostat and a bunch of
  other information useful in seeing where you could be burning power
  for no good reason.

For getting an idea of real power turbostat is the easiest to use and
is available on most systems.  Using perf will give you a very fine grained
view of what is going on as well as point to the culprit for bad
behaviour in most cases.





This interface will not be supportable with upcoming processors using
hardware P states as documented in volume 3 of the current SDM Section 14.4
http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-manual-325462.pdf
The OS will have no way of knowing what the P state requests are for a
given core.


Will there be any means to determine the proportion of time spent in different
HWP-states when HWP gets enabled (maybe at a package level)?


Not that I am aware of :-(  There is MSR_PPERF section 14.4.5.1 that will give
the CPUs view of the amount of productive work/scalability of the current load.

--Dirk
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC v1 0/6] CPPC as a PID backend

2014-09-10 Thread Dirk Brandewie

Hi Ashwin,

I think the CPPC based driver should be a separate driver.

We made the conscious decision to not use any of the ACPI mechanisms
to enumerate or control P state selection.  Experience over the years
has shown that the quality/accuracy of the BIOS/ACPI implementations
vary widely across OEM's and platform types from a single OEM. Features
that always work on a server platform from a given OEM may not work or
provide bad information on client platforms for example.

Another reason for doing intel_pstate was to be able to land intel specific
features and fixes without breaking other architectures as the power
management capabilities of the platform evolve. As processors that support
Hardware P states (HWP) as described in section 14.4 of the current SDM
come into the market intel_pstate will change to not doing much other
than enabling HWP and providing an interface to forward user configuration
requests to the processor if the user chooses to enable HWP otherwise the
current mechanisms will be used.  This is why the intel_pstate sysfs
interface is the way it is to be able to map cleanly to HWP and provide
an abstract interface going forward.

Having separate drivers allows the system integrator/user to select the
most appropriate mechanism for their system.

--Dirk

On 09/09/2014 03:12 PM, Ashwin Chaugule wrote:

This patchset introduces CPPC(Collaborative Processor Performance Control) as a 
backend
to the PID governor. The PID governor from intel_pstate.c maps cleanly onto 
some CPPC
interfaces.
e.g. The CPU performance requests are made on a continuous scale as against 
discrete pstate
levels. The CPU performance feedback over an interval is gauged using platform 
specific
counters which are also described by CPPC.

Although CPPC describes several other registers to provide more hints to the 
platform,
Linux as of today does not have the infrastructure to make use of those 
registers.
Some of the CPPC specific information could be made available from the 
scheduler as
part of the CPUfreq and Scheduler integration work. Until then PID can be used 
as the
front end for CPPC.

Beyond code restructuring and renaming, this patchset does not change the logic 
from the
intel_pstate.c driver. Kernel compilation times were compared with the original 
intel_pstate.c,
intel backend(intel_pid_ctrl.c) and the CPPC backend and no significant 
overheads were noticed.

Testing was performed on a Thinkpad X240 laptop.

PID_CTRL + INTEL_PSTATE:
===
real    5m37.742s
user    18m42.575s
sys     1m0.521s


PID_CTRL + CPPC_PID_CTRL:

real    5m48.321s
user    18m24.487s
sys     0m59.327s


ORIGINAL INTEL_PSTATE:
==
real    5m40.642s
user    18m37.411s
sys     1m0.185s


The complete patchset including the PCC hacks used for testing is available in 
[4].


Changes since V0: [1]
-   Split intel_pstate.c into a generic PID governor and platform specific 
backend.
-   Add CPPC accessors as PID backend.


CPPC:


CPPC (Collaborative Processor Performance Control) is a new way to control CPU
performance using an abstract continuous scale as against a discretized P-state 
scale
which is tied to CPU frequency only. It is defined in the ACPI 5.0+ spec. In 
brief,
the basic operation involves:
- OS makes a CPU performance request. (Can provide min and max tolerable bounds)

- Platform (such as BMC) is free to optimize request within requested bounds 
depending
on power/thermal budgets etc.

- Platform conveys its decision back to OS

The communication between OS and platform occurs through another medium called 
(PCC)
Platform communication Channel. This is a generic mailbox like mechanism which 
includes
doorbell semantics to indicate register updates. The PCC driver is being 
discussed in a
separate patchset [3] and is not included here, since CPPC is only one client 
of PCC.

Finer details about the PCC and CPPC spec are available in the latest ACPI 5.1
specification.[2]

[1] - http://lwn.net/Articles/608715/
[2] - http://www.uefi.org/sites/default/files/resources/ACPI_5_1release.pdf
[3] - http://comments.gmane.org/gmane.linux.acpi.devel/70299
[4] - 
http://git.linaro.org/people/ashwin.chaugule/leg-kernel.git/shortlog/refs/heads/cppc-pid-no_freq_domain



Ashwin Chaugule (6):
   PID Controller governor
   PID: Move Turbo detection into backend driver
   PID: Move Baytrail specific accessors into backend driver
   PID: Add new function pointers to read multiple registers
   PID: Rename counters to make them more generic
   PID: Add CPPC (Collaborative Processor Performance) backend driver

  Documentation/cpu-freq/intel-pstate.txt |   43 --
  Documentation/cpu-freq/pid_ctrl.txt |   41 ++
  drivers/cpufreq/Kconfig |   19 +
  drivers/cpufreq/Kconfig.x86 |2 +-
  drivers/cpufreq/Makefile|4 +-
  drivers/cpufreq/cppc_pid_ctrl.c |  406 +
  drivers/cpufreq/intel_pid_ctrl.c

Re: [RFC v1 0/6] CPPC as a PID backend

2014-09-10 Thread Dirk Brandewie

Hi Ashwin,

I think the CPPC based driver should be a separate driver.

We made the conscious decision to not use any of the ACPI mechanisms
to enumerate or control P state selection.  Experience over the years
has shown that the quality/accuracy of the BIOS/ACPI implementations
varies widely across OEMs and platform types from a single OEM. Features
that always work on a server platform from a given OEM may not work or
provide bad information on client platforms for example.

Another reason for doing intel_pstate was to be able to land intel specific
features and fixes without breaking other architectures as the power
management capabilities of the platform evolve. As processors that support
Hardware P states (HWP) as described in section 14.4 of the current SDM
come into the market intel_pstate will change to not doing much other
than enabling HWP and providing an interface to forward user configuration
requests to the processor if the user chooses to enable HWP otherwise the
current mechanisms will be used.  This is why the intel_pstate sysfs
interface is the way it is to be able to map cleanly to HWP and provide
an abstract interface going forward.

Having separate drivers allows the system integrator/user to select the
most appropriate mechanism for their system.

--Dirk

On 09/09/2014 03:12 PM, Ashwin Chaugule wrote:

This patchset introduces CPPC(Collaborative Processor Performance Control) as a 
backend
to the PID governor. The PID governor from intel_pstate.c maps cleanly onto 
some CPPC
interfaces.
e.g. The CPU performance requests are made on a continuous scale as against 
discrete pstate
levels. The CPU performance feedback over an interval is gauged using platform 
specific
counters which are also described by CPPC.

Although CPPC describes several other registers to provide more hints to the 
platform,
Linux as of today does not have the infrastructure to make use of those 
registers.
Some of the CPPC specific information could be made available from the 
scheduler as
part of the CPUfreq and Scheduler integration work. Until then PID can be used 
as the
front end for CPPC.

Beyond code restructuring and renaming, this patchset does not change the logic 
from the
intel_pstate.c driver. Kernel compilation times were compared with the original 
intel_pstate.c,
intel backend(intel_pid_ctrl.c) and the CPPC backend and no significant 
overheads were noticed.

Testing was performed on a Thinkpad X240 laptop.

PID_CTRL + INTEL_PSTATE:
===
real    5m37.742s
user    18m42.575s
sys     1m0.521s


PID_CTRL + CPPC_PID_CTRL:

real    5m48.321s
user    18m24.487s
sys     0m59.327s


ORIGINAL INTEL_PSTATE:
==
real    5m40.642s
user    18m37.411s
sys     1m0.185s


The complete patchset including the PCC hacks used for testing is available in 
[4].


Changes since V0: [1]
-   Split intel_pstate.c into a generic PID governor and platform specific 
backend.
-   Add CPPC accessors as PID backend.


CPPC:


CPPC (Collaborative Processor Performance Control) is a new way to control CPU
performance using an abstract continuous scale as against a discretized P-state 
scale
which is tied to CPU frequency only. It is defined in the ACPI 5.0+ spec. In 
brief,
the basic operation involves:
- OS makes a CPU performance request. (Can provide min and max tolerable bounds)

- Platform (such as BMC) is free to optimize request within requested bounds 
depending
on power/thermal budgets etc.

- Platform conveys its decision back to OS

The communication between OS and platform occurs through another medium called 
(PCC)
Platform communication Channel. This is a generic mailbox like mechanism which 
includes
doorbell semantics to indicate register updates. The PCC driver is being 
discussed in a
separate patchset [3] and is not included here, since CPPC is only one client 
of PCC.

Finer details about the PCC and CPPC spec are available in the latest ACPI 5.1
specification.[2]

[1] - http://lwn.net/Articles/608715/
[2] - http://www.uefi.org/sites/default/files/resources/ACPI_5_1release.pdf
[3] - http://comments.gmane.org/gmane.linux.acpi.devel/70299
[4] - 
http://git.linaro.org/people/ashwin.chaugule/leg-kernel.git/shortlog/refs/heads/cppc-pid-no_freq_domain



Ashwin Chaugule (6):
   PID Controller governor
   PID: Move Turbo detection into backend driver
   PID: Move Baytrail specific accessors into backend driver
   PID: Add new function pointers to read multiple registers
   PID: Rename counters to make them more generic
   PID: Add CPPC (Collaborative Processor Performance) backend driver

  Documentation/cpu-freq/intel-pstate.txt |   43 --
  Documentation/cpu-freq/pid_ctrl.txt |   41 ++
  drivers/cpufreq/Kconfig |   19 +
  drivers/cpufreq/Kconfig.x86 |2 +-
  drivers/cpufreq/Makefile|4 +-
  drivers/cpufreq/cppc_pid_ctrl.c |  406 +
  drivers/cpufreq/intel_pid_ctrl.c

Re: [PATCH] intel_pstate: track and export frequency residency stats via sysfs.

2014-09-10 Thread Dirk Brandewie

On 09/09/2014 04:22 PM, Anup Chenthamarakshan wrote:

On Tue, Sep 09, 2014 at 08:15:13AM -0700, Dirk Brandewie wrote:

On 09/08/2014 05:10 PM, Anup Chenthamarakshan wrote:

Exported stats appear in
sysfs/devices/system/cpu/intel_pstate/time_in_state as follows:

## CPU 0
40 3647
50 24342
60 144150
70 202469
## CPU 1
40 4813
50 22628
60 149564
70 211885
80 173890

Signed-off-by: Anup Chenthamarakshan an...@chromium.org


What is this information being used for?


I'm using P-state residency information in power consumption tests to calculate
proportion of time spent in each P-state across all processors (one global set
of percentages, corresponding to each P-state). This is used to validate new
changes from the power perspective. Essentially, sanity checks to flag changes
with large difference in P-state residency.

So far, we've been using the data exported by acpi-cpufreq to track this.



Tracking the current P state request for each core is only part of the
story.  The processor aggregates the requests from all cores and then decides
what frequency the package will run at, this evaluation happens at ~1ms time
frame.  If a core is idle then it loses its vote for what the package frequency will
be and its frequency will be zero even though it may have been requesting
a high P state when it went idle.  Tracking the residency of the requested
P state doesn't provide much useful information other than ensuring that the
requests are changing over time IMHO.


This is exactly why we're trying to track it.


My point is that you are tracking the residency of the request and not
the P state the package was running at.  On a lightly loaded system
it is not unusual for a core that was very busy and requesting a high
P state to go idle for several seconds.  In this case that core would
lose its vote for the package P state but the stats would show that
the P state was high for a very long time when its real frequency
was zero.

There are a couple of ways to get what I consider better information
about what is actually going on.

  The current turbostat provides C state residency and calculates the
  average/effective frequency of the core over its sample time.
  Turbostat will also measure the power consumption from the CPU point
  of view if your processor supports the RAPL registers.

  Reading MSR 0x198 MSR_IA32_PERF_STATUS will tell you what the core
  would run at if it were not idle, this reflects the decision that the
  package made based on current requests.

  Using perf to collect power:pstate_sample event will give information
  about each sample on the core and give you timestamps to detect idle
  times.

  Using perf to collect power:cpu_frequency will show when the P state
  request was changed on each core and is triggered by intel_pstate and
  acpi_cpufreq.

  Powertop collects the same information as turbostat and a bunch of
  other information useful in seeing where you could be burning power
  for no good reason.

For getting an idea of real power turbostat is the easiest to use and
is available on most systems.  Using perf will give you a very fine grained
view of what is going on as well as point to the culprit for bad
behaviour in most cases.





This interface will not be supportable with upcoming processors using
hardware P states as documented in volume 3 of the current SDM Section 14.4
http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-manual-325462.pdf
The OS will have no way of knowing what the P state requests for a
given core are.


Will there be any means to determine the proportion of time spent in different
HWP-states when HWP gets enabled (maybe at a package level)?


Not that I am aware of :-(  There is MSR_PPERF section 14.4.5.1 that will give
the CPU's view of the amount of productive work/scalability of the current load.

--Dirk
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC v1 0/6] CPPC as a PID backend

2014-09-10 Thread Dirk Brandewie

On 09/10/2014 09:11 AM, Ashwin Chaugule wrote:

On 10 September 2014 11:44, Dirk Brandewie dirk.brande...@gmail.com wrote:

Hi Ashwin,


Hi Dirk,



I think the CPPC based driver should be a separate driver.

We made the conscious decision to not use any of the ACPI mechanisms
to enumerate or control P state selection.  Experience over the years
has shown that the quality/accuracy of the BIOS/ACPI implementations
varies widely across OEMs and platform types from a single OEM. Features
that always work on a server platform from a given OEM may not work or
provide bad information on client platforms for example.

Another reason for doing intel_pstate was to be able to land intel specific
features and fixes without breaking other architectures as the power
management capabilities of the platform evolve. As processors that support
Hardware P states (HWP) as described in section 14.4 of the current SDM
come into the market intel_pstate will change to not doing much other
than enabling HWP and providing an interface to forward user configuration
requests to the processor if the user chooses to enable HWP otherwise the
current mechanisms will be used.  This is why the intel_pstate sysfs
interface is the way it is to be able to map cleanly to HWP and provide
an abstract interface going forward.

Having separate drivers allows the system integrator/user to select the
most appropriate mechanism for their system.

--Dirk


With the current split I think you will still be able to maintain
Intel specific changes for the future in the backend driver. The PID
algorithm seems platform independent anyway and the PID knobs are
exported to userspace for platform specific tuning. The Intel backend
driver should be unaffected by the CPPC (ACPI) backend. We can also
make them mutually exclusive at runtime.


We could make it runtime selectable whether to use CPPC or the
native mechanisms for P state enumeration and selection but we would
get into an awful black/white list situation that would not make
anyone happy.

Using CPPC on Intel platforms implies using HWP which is already
planned for in intel_pstate.  I am not aware of any effort to support
CPPC on Intel platforms that do not support HWP.  For Intel platforms
using CPPC is NOT needed or desirable IMHO.  We had many conversations
over many months while CPPC was being defined and made the decision
to not use this mechanism on Intel Linux platforms.

For other platforms that plan on conforming to ACPI 5.x with respect
to P state enumeration and selection I would like to leave it to them
to herd all the cats at the OEMs to get CPPC correct on all their platforms.



Or are you suggesting using PID + CPPC as another driver? IIUC, that
would lead to a lot of redundancy.



The redundancy is actually pretty small IMHO if you take out the
enumeration/init code the code shared at runtime is pretty small
sample/calc_busy/PID.



Cheers,
Ashwin



--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] intel_pstate: track and export frequency residency stats via sysfs.

2014-09-09 Thread Dirk Brandewie
On 09/08/2014 05:10 PM, Anup Chenthamarakshan wrote:
> Exported stats appear in
> /devices/system/cpu/intel_pstate/time_in_state as follows:
> 
> ## CPU 0
> 40 3647
> 50 24342
> 60 144150
> 70 202469
> ## CPU 1
> 40 4813
> 50 22628
> 60 149564
> 70 211885
> 80 173890
> 
> Signed-off-by: Anup Chenthamarakshan 

What is this information being used for?

Tracking the current P state request for each core is only part of the 
story.  The processor aggregates the requests from all cores and then decides
what frequency the package will run at, this evaluation happens at ~1ms time
frame.  If a core is idle then it loses its vote for what the package frequency will
be and its frequency will be zero even though it may have been requesting
a high P state when it went idle.  Tracking the residency of the requested
P state doesn't provide much useful information other than ensuring that the 
requests are changing over time IMHO.

This interface will not be supportable with upcoming processors using
hardware P states as documented in volume 3 of the current SDM Section 14.4
http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-manual-325462.pdf
The OS will have no way of knowing what the P state requests for a
given core are.

--Dirk 
> ---
>   drivers/cpufreq/intel_pstate.c | 77 
> --
>   1 file changed, 74 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
> index 0668b38..7be89bd 100644
> --- a/drivers/cpufreq/intel_pstate.c
> +++ b/drivers/cpufreq/intel_pstate.c
> @@ -84,6 +84,11 @@ struct _pid {
>   int32_t last_err;
>   };
>   
> +struct pstate_stat {
> + int pstate;
> + u64 time;
> +};
> +
>   struct cpudata {
>   int cpu;
>   
> @@ -97,6 +102,9 @@ struct cpudata {
>   u64 prev_aperf;
>   u64 prev_mperf;
>   struct sample sample;
> +
> + struct pstate_stat *stat;
> + u64 last_updated;
>   };
>   
>   static struct cpudata **all_cpu_data;
> @@ -218,6 +226,18 @@ static inline void intel_pstate_reset_all_pid(void)
>   }
>   }
>   
> +static void intel_pstate_account_time_to_current_pstate(struct cpudata *cpu)
> +{
> + /* Handle the initial call from intel_pstate_init_cpu */
> + if (likely(cpu->stat)) {
> + u64 now = jiffies;
> + int index = cpu->pstate.current_pstate - cpu->pstate.min_pstate;
> +
> + cpu->stat[index].time += now - cpu->last_updated;
> + cpu->last_updated = now;
> + }
> +}
> +
>   /** debugfs begin /
>   static int pid_param_set(void *data, u64 val)
>   {
> @@ -323,6 +343,40 @@ static ssize_t store_min_perf_pct(struct kobject *a, 
> struct attribute *b,
>   return count;
>   }
>   
> +static ssize_t show_time_in_state(struct kobject *kobj, struct attribute 
> *attr,
> + char *buf)
> +{
> + unsigned int cpu;
> + struct cpudata *cpudata;
> + int i, len = 0, total_states;
> +
> + for_each_online_cpu(cpu) {
> + if (!all_cpu_data[cpu])
> + continue;
> +
> + cpudata = all_cpu_data[cpu];
> + len += snprintf(buf + len, PAGE_SIZE - len, "## CPU %d\n", cpu);
> + if (len >= PAGE_SIZE)
> + return len;
> +
> + total_states = cpudata->pstate.turbo_pstate -
> + cpudata->pstate.min_pstate + 1;
> +
> + intel_pstate_account_time_to_current_pstate(cpudata);
> +
> + for (i = 0; i < total_states; i++) {
> + len += snprintf(buf + len, PAGE_SIZE - len, "%d %llu\n",
> + cpudata->stat[i].pstate * 10,
> + cpudata->stat[i].time);
> +
> + if (len >= PAGE_SIZE)
> + return len;
> + }
> + }
> +
> + return len;
> +}
> +
>   show_one(no_turbo, no_turbo);
>   show_one(max_perf_pct, max_perf_pct);
>   show_one(min_perf_pct, min_perf_pct);
> @@ -331,10 +385,13 @@ define_one_global_rw(no_turbo);
>   define_one_global_rw(max_perf_pct);
>   define_one_global_rw(min_perf_pct);
>   
> +define_one_global_ro(time_in_state);
> +
>   static struct attribute *intel_pstate_attributes[] = {
>   &no_turbo.attr,
>   &max_perf_pct.attr,
>   &min_perf_pct.attr,
> + &time_in_state.attr,
>   NULL
>   };
>   
> @@ -525,9 +582,11 @@ static void intel_pstate_set_pstate(struct cpudata *cpu, 
> int pstate)
>   
>   trace_cpu_frequency(pstate * 10, cpu->cpu);
>   
> - cpu->pstate.current_pstate = pstate;
> -
>   pstate_funcs.set(cpu, pstate);
> +
> + intel_pstate_account_time_to_current_pstate(cpu);
> +
> + cpu->pstate.current_pstate = pstate;
>   }
>   
>   static void intel_pstate_get_cpu_pstates(struct cpudata *cpu)
> @@ -751,6 

Re: [PATCH] intel_pstate: track and export frequency residency stats via sysfs.

2014-09-09 Thread Dirk Brandewie
On 09/08/2014 05:10 PM, Anup Chenthamarakshan wrote:
 Exported stats appear in
 sysfs/devices/system/cpu/intel_pstate/time_in_state as follows:
 
 ## CPU 0
 40 3647
 50 24342
 60 144150
 70 202469
 ## CPU 1
 40 4813
 50 22628
 60 149564
 70 211885
 80 173890
 
 Signed-off-by: Anup Chenthamarakshan an...@chromium.org

What is this information being used for?

Tracking the current P state request for each core is only part of the 
story.  The processor aggregates the requests from all cores and then decides
what frequency the package will run at, this evaluation happens at ~1ms time
frame.  If a core is idle then it loses its vote for what the package frequency will
be and its frequency will be zero even though it may have been requesting
a high P state when it went idle.  Tracking the residency of the requested
P state doesn't provide much useful information other than ensuring that the 
requests are changing over time IMHO.

This interface will not be supportable with upcoming processors using
hardware P states as documented in volume 3 of the current SDM Section 14.4
http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-manual-325462.pdf
The OS will have no way of knowing what the P state requests for a
given core are.

--Dirk 
 ---
   drivers/cpufreq/intel_pstate.c | 77 
 --
   1 file changed, 74 insertions(+), 3 deletions(-)
 
 diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
 index 0668b38..7be89bd 100644
 --- a/drivers/cpufreq/intel_pstate.c
 +++ b/drivers/cpufreq/intel_pstate.c
 @@ -84,6 +84,11 @@ struct _pid {
   int32_t last_err;
   };
   
 +struct pstate_stat {
 + int pstate;
 + u64 time;
 +};
 +
   struct cpudata {
   int cpu;
   
 @@ -97,6 +102,9 @@ struct cpudata {
   u64 prev_aperf;
   u64 prev_mperf;
   struct sample sample;
 +
 + struct pstate_stat *stat;
 + u64 last_updated;
   };
   
   static struct cpudata **all_cpu_data;
 @@ -218,6 +226,18 @@ static inline void intel_pstate_reset_all_pid(void)
   }
   }
   
 +static void intel_pstate_account_time_to_current_pstate(struct cpudata *cpu)
 +{
 + /* Handle the initial call from intel_pstate_init_cpu */
 + if (likely(cpu-stat)) {
 + u64 now = jiffies;
 + int index = cpu-pstate.current_pstate - cpu-pstate.min_pstate;
 +
 + cpu-stat[index].time += now - cpu-last_updated;
 + cpu-last_updated = now;
 + }
 +}
 +
   /** debugfs begin /
   static int pid_param_set(void *data, u64 val)
   {
 @@ -323,6 +343,40 @@ static ssize_t store_min_perf_pct(struct kobject *a, 
 struct attribute *b,
   return count;
   }
   
 +static ssize_t show_time_in_state(struct kobject *kobj, struct attribute 
 *attr,
 + char *buf)
 +{
 + unsigned int cpu;
 + struct cpudata *cpudata;
 + int i, len = 0, total_states;
 +
 + for_each_online_cpu(cpu) {
 + if (!all_cpu_data[cpu])
 + continue;
 +
 + cpudata = all_cpu_data[cpu];
 + len += snprintf(buf + len, PAGE_SIZE - len, ## CPU %d\n, cpu);
 + if (len = PAGE_SIZE)
 + return len;
 +
 + total_states = cpudata-pstate.turbo_pstate -
 + cpudata-pstate.min_pstate + 1;
 +
 + intel_pstate_account_time_to_current_pstate(cpudata);
 +
 + for (i = 0; i  total_states; i++) {
 + len += snprintf(buf + len, PAGE_SIZE - len, %d %llu\n,
 + cpudata-stat[i].pstate * 10,
 + cpudata-stat[i].time);
 +
 + if (len = PAGE_SIZE)
 + return len;
 + }
 + }
 +
 + return len;
 +}
 +
   show_one(no_turbo, no_turbo);
   show_one(max_perf_pct, max_perf_pct);
   show_one(min_perf_pct, min_perf_pct);
 @@ -331,10 +385,13 @@ define_one_global_rw(no_turbo);
   define_one_global_rw(max_perf_pct);
   define_one_global_rw(min_perf_pct);
   
 +define_one_global_ro(time_in_state);
 +
   static struct attribute *intel_pstate_attributes[] = {
   no_turbo.attr,
   max_perf_pct.attr,
   min_perf_pct.attr,
 + time_in_state.attr,
   NULL
   };
   
 @@ -525,9 +582,11 @@ static void intel_pstate_set_pstate(struct cpudata *cpu, 
 int pstate)
   
   trace_cpu_frequency(pstate * 10, cpu-cpu);
   
 - cpu-pstate.current_pstate = pstate;
 -
   pstate_funcs.set(cpu, pstate);
 +
 + intel_pstate_account_time_to_current_pstate(cpu);
 +
 + cpu-pstate.current_pstate = pstate;
   }
   
   static void intel_pstate_get_cpu_pstates(struct cpudata *cpu)
 @@ -751,6 +810,7 @@ static void intel_pstate_stop_cpu(struct cpufreq_policy 
 *policy)
   
   

Re: [PATCH] intel_pstate: Turn per cpu printk into pr_debug

2014-08-27 Thread Dirk Brandewie

On 08/27/2014 10:17 AM, Andi Kleen wrote:

From: Andi Kleen 

On larger systems intel_pstate currently spams the boot up
log with its "Intel pstate controlling ..." message for each CPU.
It's the only subsystem that prints a message for each
CPU.

Turn the message into a pr_debug.

Signed-off-by: Andi Kleen 


Acked-by: Dirk Brandewie 


---
  drivers/cpufreq/intel_pstate.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index c5eac94..17be734 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -688,7 +688,7 @@ static int intel_pstate_init_cpu(unsigned int cpunum)

add_timer_on(&cpu->timer, cpunum);

-   pr_info("Intel pstate controlling: cpu %d\n", cpunum);
+   pr_debug("Intel pstate controlling: cpu %d\n", cpunum);

return 0;
  }



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] cpufreq: intel_pstate: Add CPU ID for Braswell processor

2014-08-27 Thread Dirk Brandewie

On 08/22/2014 03:19 AM, Viresh Kumar wrote:

On 22 August 2014 15:35, Mika Westerberg
 wrote:

This is pretty much the same as Intel Baytrail, only the CPU ID is
different. Add the new ID to the supported CPU list.

Signed-off-by: Mika Westerberg 
Cc: Dirk Brandewie 


Dirk might be away on holidays..


Yes Sorry

Acked-by: Dirk Brandewie 




---
  drivers/cpufreq/intel_pstate.c | 1 +
  1 file changed, 1 insertion(+)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index c5eac949760d..a3cf8994160b 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -660,6 +660,7 @@ static const struct x86_cpu_id intel_pstate_cpu_ids[] = {
 ICPU(0x3f, core_params),
 ICPU(0x45, core_params),
 ICPU(0x46, core_params),
+   ICPU(0x4c, byt_params),
 ICPU(0x4f, core_params),
 ICPU(0x56, core_params),
 {}


Acked-by: Viresh Kumar 
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] cpufreq: intel_pstate: Add CPU ID for Braswell processor

2014-08-27 Thread Dirk Brandewie

On 08/22/2014 03:19 AM, Viresh Kumar wrote:

On 22 August 2014 15:35, Mika Westerberg
mika.westerb...@linux.intel.com wrote:

This is pretty much the same as Intel Baytrail, only the CPU ID is
different. Add the new ID to the supported CPU list.

Signed-off-by: Mika Westerberg mika.westerb...@linux.intel.com
Cc: Dirk Brandewie dirk.j.brande...@intel.com


Dirk might be away on holidays..


Yes Sorry

Acked-by: Dirk Brandewie dirk.j.brande...@intel.com




---
  drivers/cpufreq/intel_pstate.c | 1 +
  1 file changed, 1 insertion(+)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index c5eac949760d..a3cf8994160b 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -660,6 +660,7 @@ static const struct x86_cpu_id intel_pstate_cpu_ids[] = {
 ICPU(0x3f, core_params),
 ICPU(0x45, core_params),
 ICPU(0x46, core_params),
+   ICPU(0x4c, byt_params),
 ICPU(0x4f, core_params),
 ICPU(0x56, core_params),
 {}


Acked-by: Viresh Kumar viresh.ku...@linaro.org
--
To unsubscribe from this list: send the line unsubscribe linux-pm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html



--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] intel_pstate: Turn per cpu printk into pr_debug

2014-08-27 Thread Dirk Brandewie

On 08/27/2014 10:17 AM, Andi Kleen wrote:

From: Andi Kleen a...@linux.intel.com

On larger systems intel_pstate currently spams the boot up
log with its Intel pstate controlling ... message for each CPU.
It's the only subsystem that prints a message for each
CPU.

Turn the message into a pr_debug.

Signed-off-by: Andi Kleen a...@linux.intel.com


Acked-by: Dirk Brandewie dirk.j.brande...@intel.com


---
  drivers/cpufreq/intel_pstate.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index c5eac94..17be734 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -688,7 +688,7 @@ static int intel_pstate_init_cpu(unsigned int cpunum)

add_timer_on(cpu-timer, cpunum);

-   pr_info(Intel pstate controlling: cpu %d\n, cpunum);
+   pr_debug(Intel pstate controlling: cpu %d\n, cpunum);

return 0;
  }



--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v3 1/2] cpufreq: Don't destroy/realloc policy/sysfs on hotplug/suspend

2014-07-16 Thread Dirk Brandewie

On 07/15/2014 03:47 PM, Saravana Kannan wrote:

The CPUfreq core moves the cpufreq policy ownership between CPUs when CPUs
within a cluster (CPUs sharing same policy) go ONLINE/OFFLINE. When moving
policy ownership between CPUs, it also moves the cpufreq sysfs directory
between CPUs and also fixes up the symlinks of the other CPUs in the
cluster.

Also, when all the CPUs in a cluster go OFFLINE, all the sysfs nodes and
directories are deleted, the kobject is released and the policy is freed.
And when the first CPU in a cluster comes up, the policy is reallocated and
initialized, kobject is acquired, the sysfs nodes are created or symlinked,
etc.

All these steps end up creating unnecessarily complicated code and locking.
There's no real benefit to adding/removing/moving the sysfs nodes and the
policy between CPUs. Other per CPU sysfs directories like power and cpuidle
are left alone during hotplug. So there's some precedent for what this
patch is trying to do.

This patch simplifies a lot of the code and locking by removing the
adding/removing/moving of policy/sysfs/kobj and just leaves the cpufreq
directory and policy in place irrespective of whether the CPUs are
ONLINE/OFFLINE.

Leaving the policy, sysfs and kobject in place also brings these additional
benefits:
* Faster suspend/resume
* Faster hotplug
* Sysfs file permissions maintained across hotplug
* Policy settings and governor tunables maintained across hotplug
* Cpufreq stats would be maintained across hotplug for all CPUs and can be
   queried even after CPU goes OFFLINE

Tested-by: Stephen Boyd 
Signed-off-by: Saravana Kannan 
---
  drivers/cpufreq/cpufreq.c | 388 +-
  1 file changed, 107 insertions(+), 281 deletions(-)

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 62259d2..a0a2ec2 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -37,7 +37,6 @@
   */
  static struct cpufreq_driver *cpufreq_driver;
  static DEFINE_PER_CPU(struct cpufreq_policy *, cpufreq_cpu_data);
-static DEFINE_PER_CPU(struct cpufreq_policy *, cpufreq_cpu_data_fallback);
  static DEFINE_RWLOCK(cpufreq_driver_lock);
  DEFINE_MUTEX(cpufreq_governor_lock);
  static LIST_HEAD(cpufreq_policy_list);
@@ -859,34 +858,41 @@ void cpufreq_sysfs_remove_file(const struct attribute 
*attr)
  }
  EXPORT_SYMBOL(cpufreq_sysfs_remove_file);

-/* symlink affected CPUs */
-static int cpufreq_add_dev_symlink(struct cpufreq_policy *policy)
+/* symlink related CPUs */
+static int cpufreq_dev_symlink(struct cpufreq_policy *policy, bool add)
  {
-   unsigned int j;
+   unsigned int j, first_cpu = cpumask_first(policy->related_cpus);
int ret = 0;

-   for_each_cpu(j, policy->cpus) {
+   for_each_cpu(j, policy->related_cpus) {
struct device *cpu_dev;

-   if (j == policy->cpu)
+   if (j == first_cpu)
continue;

-   pr_debug("Adding link for CPU: %u\n", j);
cpu_dev = get_cpu_device(j);
-   ret = sysfs_create_link(&cpu_dev->kobj, &policy->kobj,
-   "cpufreq");
+   if (add)
+   ret = sysfs_create_link(&cpu_dev->kobj, &policy->kobj,
+   "cpufreq");
+   else
+   sysfs_remove_link(&cpu_dev->kobj, "cpufreq");
+
if (ret)
break;
}
return ret;
  }

-static int cpufreq_add_dev_interface(struct cpufreq_policy *policy,
-struct device *dev)
+static int cpufreq_add_dev_interface(struct cpufreq_policy *policy)
  {
struct freq_attr **drv_attr;
+   struct device *dev;
int ret = 0;

+   dev = get_cpu_device(cpumask_first(policy->related_cpus));
+   if (!dev)
+   return -EINVAL;
+
/* prepare interface data */
ret = kobject_init_and_add(&policy->kobj, &ktype_cpufreq,
   &dev->kobj, "cpufreq");
@@ -917,7 +923,7 @@ static int cpufreq_add_dev_interface(struct cpufreq_policy 
*policy,
goto err_out_kobj_put;
}

-   ret = cpufreq_add_dev_symlink(policy);
+   ret = cpufreq_dev_symlink(policy, true);
if (ret)
goto err_out_kobj_put;

@@ -961,60 +967,58 @@ static void cpufreq_init_policy(struct cpufreq_policy 
*policy)
  }

  #ifdef CONFIG_HOTPLUG_CPU
-static int cpufreq_add_policy_cpu(struct cpufreq_policy *policy,
- unsigned int cpu, struct device *dev)
+static int cpufreq_change_policy_cpus(struct cpufreq_policy *policy,
+ unsigned int cpu, bool add)
  {
int ret = 0;
-   unsigned long flags;
+   unsigned int cpus, pcpu;

-   if (has_target()) {
+   down_write(&policy->rwsem);
+
+   cpus = !cpumask_empty(policy->cpus);
+   if (has_target() && cpus) {
ret = 

Re: [PATCH v3 1/2] cpufreq: Don't destroy/realloc policy/sysfs on hotplug/suspend

2014-07-16 Thread Dirk Brandewie

On 07/15/2014 03:47 PM, Saravana Kannan wrote:

The CPUfreq core moves the cpufreq policy ownership between CPUs when CPUs
within a cluster (CPUs sharing same policy) go ONLINE/OFFLINE. When moving
policy ownership between CPUs, it also moves the cpufreq sysfs directory
between CPUs and also fixes up the symlinks of the other CPUs in the
cluster.

Also, when all the CPUs in a cluster go OFFLINE, all the sysfs nodes and
directories are deleted, the kobject is released and the policy is freed.
And when the first CPU in a cluster comes up, the policy is reallocated and
initialized, kobject is acquired, the sysfs nodes are created or symlinked,
etc.

All these steps end up creating unnecessarily complicated code and locking.
There's no real benefit to adding/removing/moving the sysfs nodes and the
policy between CPUs. Other per CPU sysfs directories like power and cpuidle
are left alone during hotplug. So there's some precedent for what this
patch is trying to do.

This patch simplifies a lot of the code and locking by removing the
adding/removing/moving of policy/sysfs/kobj and just leaves the cpufreq
directory and policy in place irrespective of whether the CPUs are
ONLINE/OFFLINE.

Leaving the policy, sysfs and kobject in place also brings these additional
benefits:
* Faster suspend/resume
* Faster hotplug
* Sysfs file permissions maintained across hotplug
* Policy settings and governor tunables maintained across hotplug
* Cpufreq stats would be maintained across hotplug for all CPUs and can be
   queried even after CPU goes OFFLINE

Tested-by: Stephen Boyd sb...@codeaurora.org
Signed-off-by: Saravana Kannan skan...@codeaurora.org
---
  drivers/cpufreq/cpufreq.c | 388 +-
  1 file changed, 107 insertions(+), 281 deletions(-)

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 62259d2..a0a2ec2 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -37,7 +37,6 @@
   */
  static struct cpufreq_driver *cpufreq_driver;
  static DEFINE_PER_CPU(struct cpufreq_policy *, cpufreq_cpu_data);
-static DEFINE_PER_CPU(struct cpufreq_policy *, cpufreq_cpu_data_fallback);
  static DEFINE_RWLOCK(cpufreq_driver_lock);
  DEFINE_MUTEX(cpufreq_governor_lock);
  static LIST_HEAD(cpufreq_policy_list);
@@ -859,34 +858,41 @@ void cpufreq_sysfs_remove_file(const struct attribute 
*attr)
  }
  EXPORT_SYMBOL(cpufreq_sysfs_remove_file);

-/* symlink affected CPUs */
-static int cpufreq_add_dev_symlink(struct cpufreq_policy *policy)
+/* symlink related CPUs */
+static int cpufreq_dev_symlink(struct cpufreq_policy *policy, bool add)
  {
-   unsigned int j;
+   unsigned int j, first_cpu = cpumask_first(policy->related_cpus);
int ret = 0;

-   for_each_cpu(j, policy->cpus) {
+   for_each_cpu(j, policy->related_cpus) {
struct device *cpu_dev;

-   if (j == policy->cpu)
+   if (j == first_cpu)
continue;

-   pr_debug("Adding link for CPU: %u\n", j);
cpu_dev = get_cpu_device(j);
-   ret = sysfs_create_link(&cpu_dev->kobj, &policy->kobj,
-   "cpufreq");
+   if (add)
+   ret = sysfs_create_link(&cpu_dev->kobj, &policy->kobj,
+   "cpufreq");
+   else
+   sysfs_remove_link(&cpu_dev->kobj, "cpufreq");
+
if (ret)
break;
}
return ret;
  }

-static int cpufreq_add_dev_interface(struct cpufreq_policy *policy,
-struct device *dev)
+static int cpufreq_add_dev_interface(struct cpufreq_policy *policy)
  {
struct freq_attr **drv_attr;
+   struct device *dev;
int ret = 0;

+   dev = get_cpu_device(cpumask_first(policy->related_cpus));
+   if (!dev)
+   return -EINVAL;
+
/* prepare interface data */
ret = kobject_init_and_add(&policy->kobj, &ktype_cpufreq,
   &dev->kobj, "cpufreq");
@@ -917,7 +923,7 @@ static int cpufreq_add_dev_interface(struct cpufreq_policy 
*policy,
goto err_out_kobj_put;
}

-   ret = cpufreq_add_dev_symlink(policy);
+   ret = cpufreq_dev_symlink(policy, true);
if (ret)
goto err_out_kobj_put;

@@ -961,60 +967,58 @@ static void cpufreq_init_policy(struct cpufreq_policy 
*policy)
  }

  #ifdef CONFIG_HOTPLUG_CPU
-static int cpufreq_add_policy_cpu(struct cpufreq_policy *policy,
- unsigned int cpu, struct device *dev)
+static int cpufreq_change_policy_cpus(struct cpufreq_policy *policy,
+ unsigned int cpu, bool add)
  {
int ret = 0;
-   unsigned long flags;
+   unsigned int cpus, pcpu;

-   if (has_target()) {
+   down_write(&policy->rwsem);
+
+   cpus = !cpumask_empty(policy->cpus);
+   if 

Re: [PATCH] cpufreq: intel_pstate: Fix rounding of core_pct

2014-06-13 Thread Dirk Brandewie

On 06/12/2014 01:03 PM, Rafael J. Wysocki wrote:

On Thursday, June 12, 2014 05:35:59 PM Stratos Karafotis wrote:

On 12/06/2014 12:15 πμ, Doug Smythies wrote:



-Original Message-
From: Stratos Karafotis [mailto:strat...@semaphore.gr]
Sent: June-11-2014 13:20
To: Doug Smythies
Cc: linux...@vger.kernel.org; linux-kernel@vger.kernel.org; r...@rjwysocki.net; 
viresh.ku...@linaro.org; dirk.j.brande...@intel.com
Subject: Re: [PATCH] cpufreq: intel_pstate: Fix rounding of core_pct

On 2014.06.11 13:20 Stratos Karafotis wrote:

On 11/06/2014 06:02 μμ, Doug Smythies wrote:


On 2014.06.11 07:08 Stratos Karafotis wrote:

On 11/06/2014 04:41 μμ, Doug Smythies wrote:

No.



The intent was only ever to round properly the pseudo floating point result of 
the divide.
It was much more important (ugh, well 4 times more) when FRACBITS was still 6, 
which also got changed to 8 in a recent patch.



Are you sure?

Yes.


This rounding was very recently added.
As far as I can understand, I don't see the meaning of this rounding, as is.
Even if FRAC_BITS was 6, I think it would have almost no improvement in
calculations.


Note: I had not seen this e-mail when I wrote a few minutes ago:

You may be correct.
If Dirk agrees, I will re-analyse the entire driver for rounding effects soon.
When FRACBITS was 6 there were subtle cases where the driver would get stuck, 
and not make a final pstate change, with the default PID gains.
Other things have changed, and the analysis needs to be re-done.




Could you please elaborate a little bit more what we need these 2 lines below?



Sorry for being MIA on this thread; I have been up to my eyeballs.


if ((rem << 1) >= int_tofp(sample->mperf))
core_pct += 1;


The rounding should have been
   core_pct += (1 << (FRAC_BITS-1));
Since core_pct is in fixed-point notation at this point, this adds .5 to
core_pct to round up.

As Stratos pointed out, the current code only adds 1/256 to core_pct.

Since core_pct_busy stays in fixed point throughout the rest of the
calculations and we only do the rounding when the PID is returning an
int, I think we can safely remove these two lines.







Because nothing is mentioned for them in commit's changelog.
Do we need to round core_pct or not?
Because if we try to round it, I think this patch should work.


As mentioned originally, they are there just to round the pseudo floating 
number, not the integer portion only.
Let us bring back the very numbers you originally gave and work through it.

aperf = 5024
mperf = 10619

core_pct = 47.31142292%
or 47 and 79.724267 256ths
or to the closest kept fractional part 47 and 80 256ths
or 12112 as a pseudo float.
The maximum error with this rounding will be 1 part in 512 and symmetric 
instead of the 1 part in 256 always in one direction without.

Now if FRACBITS was still 6:
core_pct = 47.31142292%
or 47 and 19.931 64ths
or to the closest kept fractional part 47 and 20 64ths
or 3028 as a pseudo float.
The maximum error with this rounding will be 1 part in 128 and symmetric 
instead of the 1 part in 64 (1.6% !!!) always in one direction without.

Hope this helps.



Yes, it helps. Thanks a lot!

But please note that the maximum error without this rounding will be 1.6% _only_
in fractional part. In the real number it will be much smaller:

47.19 instead of 47.20

And using FRAC_BITS 8:

47.79 instead of 47.80

This is a 0.0002% difference. I can't see how this can affect the 
calculations
even with FRAC_BITS 6.

Another thing is that this algorithm generally is used to round to the
nearest integer. I'm not sure if it's valid to apply it for the rounding of
the fractional part of fixed point number.


Depending on the original reason, it may or may not be.

In theory, it may help reduce numerical drift resulting from rounding always in
one direction only, but I'm not really sure if that matters here.

Doug seems to have carried out full analysis, though.

Rafael

--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] cpufreq: intel_pstate: Fix rounding of core_pct

2014-06-13 Thread Dirk Brandewie

On 06/12/2014 01:03 PM, Rafael J. Wysocki wrote:

On Thursday, June 12, 2014 05:35:59 PM Stratos Karafotis wrote:

On 12/06/2014 12:15 πμ, Doug Smythies wrote:



-Original Message-
From: Stratos Karafotis [mailto:strat...@semaphore.gr]
Sent: June-11-2014 13:20
To: Doug Smythies
Cc: linux...@vger.kernel.org; linux-kernel@vger.kernel.org; r...@rjwysocki.net; 
viresh.ku...@linaro.org; dirk.j.brande...@intel.com
Subject: Re: [PATCH] cpufreq: intel_pstate: Fix rounding of core_pct

On 2014.06.11 13:20 Stratos Karafotis wrote:

On 11/06/2014 06:02 μμ, Doug Smythies wrote:


On 2014.06.11 07:08 Stratos Karafotis wrote:

On 11/06/2014 04:41 μμ, Doug Smythies wrote:

No.



The intent was only ever to round properly the pseudo floating point result of 
the divide.
It was much more important (ugh, well 4 times more) when FRACBITS was still 6, 
which also got changed to 8 in a recent patch.



Are you sure?

Yes.


This rounding was very recently added.
As far as I can understand, I don't see the meaning of this rounding, as is.
Even if FRAC_BITS was 6, I think it would have almost no improvement in
calculations.


Note: I had not seen this e-mail when I wrote a few minutes ago:

You may be correct.
If Dirk agrees, I will re-analyse the entire driver for rounding effects soon.
When FRACBITS was 6 there were subtle cases where the driver would get stuck, 
and not make a final pstate change, with the default PID gains.
Other things have changed, and the analysis needs to be re-done.




Could you please elaborate a little bit more what we need these 2 lines below?



Sorry for being MIA on this thread; I have been up to my eyeballs.


if ((rem << 1) >= int_tofp(sample->mperf))
core_pct += 1;


The rounding should have been
   core_pct += (1 << (FRAC_BITS-1));
Since core_pct is in fixed-point notation at this point, this adds .5 to
core_pct to round up.

As Stratos pointed out, the current code only adds 1/256 to core_pct.

Since core_pct_busy stays in fixed point throughout the rest of the
calculations and we only do the rounding when the PID is returning an
int, I think we can safely remove these two lines.







Because nothing is mentioned for them in commit's changelog.
Do we need to round core_pct or not?
Because if we try to round it, I think this patch should work.


As mentioned originally, they are there just to round the pseudo floating 
number, not the integer portion only.
Let us bring back the very numbers you originally gave and work through it.

aperf = 5024
mperf = 10619

core_pct = 47.31142292%
or 47 and 79.724267 256ths
or to the closest kept fractional part 47 and 80 256ths
or 12112 as a pseudo float.
The maximum error with this rounding will be 1 part in 512 and symmetric 
instead of the 1 part in 256 always in one direction without.

Now if FRACBITS was still 6:
core_pct = 47.31142292%
or 47 and 19.931 64ths
or to the closest kept fractional part 47 and 20 64ths
or 3028 as a pseudo float.
The maximum error with this rounding will be 1 part in 128 and symmetric 
instead of the 1 part in 64 (1.6% !!!) always in one direction without.

Hope this helps.



Yes, it helps. Thanks a lot!

But please note that the maximum error without this rounding will be 1.6% _only_
in fractional part. In the real number it will be much smaller:

47.19 instead of 47.20

And using FRAC_BITS 8:

47.79 instead of 47.80

This is a 0.0002% difference. I can't see how this can affect the 
calculations
even with FRAC_BITS 6.

Another thing is that this algorithm generally is used to round to the
nearest integer. I'm not sure if it's valid to apply it for the rounding of
the fractional part of fixed point number.


Depending on the original reason, it may or may not be.

In theory, it may help reduce numerical drift resulting from rounding always in
one direction only, but I'm not really sure if that matters here.

Doug seems to have carried out full analysis, though.

Rafael

--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 6/7] cpufreq: intel_pstate: Trivial code cleanup

2014-06-10 Thread Dirk Brandewie

On 06/10/2014 08:31 AM, Rafael J. Wysocki wrote:

On Tuesday, June 10, 2014 08:12:48 AM Dirk Brandewie wrote:

On 06/09/2014 02:01 PM, Stratos Karafotis wrote:

Remove unnecessary blank lines.
Remove unnecessary parentheses.
Remove unnecessary braces.
Put the code in one line where possible.
Add blank lines after variable declarations.
Alignment to open parenthesis.



I don't have an issue with this patch in general, but I would rather
the cleanup be done when there is a functional change in the given
hunk of code; otherwise you are setting up a fence for stable/backporters
of functional changes in the future.


I actually prefer separate cleanups so as to avoid doing multiple things
in one patch.

Rafael


I don't have strong feelings either way; I was just trying to be kind
to the maintainers of distro kernels.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 7/7] cpufreq: intel_pstate: Make intel_pstate_kobject local

2014-06-10 Thread Dirk Brandewie

On 06/09/2014 02:01 PM, Stratos Karafotis wrote:

Since we never remove sysfs entry, we can make the intel_pstate_kobject
local.

Signed-off-by: Stratos Karafotis 

Acked-by: Dirk Brandewie 


  drivers/cpufreq/intel_pstate.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index fa44f0f..9533fff 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -387,10 +387,10 @@ static struct attribute *intel_pstate_attributes[] = {
  static struct attribute_group intel_pstate_attr_group = {
.attrs = intel_pstate_attributes,
  };
-static struct kobject *intel_pstate_kobject;

  static void intel_pstate_sysfs_expose_params(void)
  {
+   struct kobject *intel_pstate_kobject;
int rc;

intel_pstate_kobject = kobject_create_and_add("intel_pstate",



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 1/7] cpufreq: intel_pstate: Remove duplicate CPU ID check

2014-06-10 Thread Dirk Brandewie

On 06/09/2014 10:21 PM, Viresh Kumar wrote:

On 10 June 2014 02:30, Stratos Karafotis  wrote:

We check the CPU ID during driver init. There is no need
to do it again per logical CPU initialization.

So, remove the duplicate check.

Signed-off-by: Stratos Karafotis 
---
  drivers/cpufreq/intel_pstate.c | 6 --
  1 file changed, 6 deletions(-)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index aebd457..4e7f492 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -691,14 +691,8 @@ MODULE_DEVICE_TABLE(x86cpu, intel_pstate_cpu_ids);

  static int intel_pstate_init_cpu(unsigned int cpunum)
  {
-
-   const struct x86_cpu_id *id;
 struct cpudata *cpu;

-   id = x86_match_cpu(intel_pstate_cpu_ids);
-   if (!id)
-   return -ENODEV;
-
 all_cpu_data[cpunum] = kzalloc(sizeof(struct cpudata), GFP_KERNEL);
 if (!all_cpu_data[cpunum])
 return -ENOMEM;


Acked-by: Viresh Kumar 


Acked-by: Dirk Brandewie 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 4/7] cpufreq: intel_pstate: Simplify code in intel_pstate_adjust_busy_pstate

2014-06-10 Thread Dirk Brandewie

On 06/10/2014 07:51 AM, Stratos Karafotis wrote:

On 10/06/2014 08:27 πμ, Viresh Kumar wrote:

On 10 June 2014 02:30, Stratos Karafotis  wrote:

Simplify the code by removing the inline functions
pstate_increase and pstate_decrease and use directly the
intel_pstate_set_pstate.



Doesn't apply without your scaled_busy change. Spin this patch without
the scaled_busy change and explain the change more fully in the
commit message to cover Viresh's question, and I am good with this change.



Signed-off-by: Stratos Karafotis 
---
  drivers/cpufreq/intel_pstate.c | 26 +++---
  1 file changed, 3 insertions(+), 23 deletions(-)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 3a49269..26a0262 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -588,21 +588,6 @@ static void intel_pstate_set_pstate(struct cpudata *cpu, 
int pstate)
 pstate_funcs.set(cpu, pstate);
  }

-static inline void intel_pstate_pstate_increase(struct cpudata *cpu, int steps)
-{
-   int target;
-   target = cpu->pstate.current_pstate + steps;
-
-   intel_pstate_set_pstate(cpu, target);
-}
-
-static inline void intel_pstate_pstate_decrease(struct cpudata *cpu, int steps)
-{
-   int target;
-   target = cpu->pstate.current_pstate - steps;
-   intel_pstate_set_pstate(cpu, target);
-}
-
  static void intel_pstate_get_cpu_pstates(struct cpudata *cpu)
  {
 cpu->pstate.min_pstate = pstate_funcs.get_min();
@@ -695,20 +680,15 @@ static inline void intel_pstate_calc_scaled_busy(struct 
cpudata *cpu)
  static inline void intel_pstate_adjust_busy_pstate(struct cpudata *cpu)
  {
 struct _pid *pid;
-   signed int ctl = 0;
-   int steps;
+   signed int ctl;

 pid = &cpu->pid;
 intel_pstate_calc_scaled_busy(cpu);

 ctl = pid_calc(pid, cpu->sample.busy_scaled);

-   steps = abs(ctl);
-
-   if (ctl < 0)
-   intel_pstate_pstate_increase(cpu, steps);
-   else
-   intel_pstate_pstate_decrease(cpu, steps);
+   /* Negative values of ctl increase the pstate and vice versa */
+   intel_pstate_set_pstate(cpu, cpu->pstate.current_pstate - ctl);
  }


I am not very good at this driver but there is some obvious functional
change here. Earlier we used to pass
'cpu->pstate.current_pstate {-|+} steps' and now you are doing '-ctl' only



The original code is:   

if (ctl < 0)
intel_pstate_pstate_increase(cpu, steps);
else
intel_pstate_pstate_decrease(cpu, steps);

Without inlines functions intel_pstate_pstate_increase() and
intel_pstate_pstate_decrease() we get:

if (ctl < 0)
intel_pstate_set_pstate(cpu, cpu->pstate.current_pstate + 
steps);
else
intel_pstate_set_pstate(cpu, cpu->pstate.current_pstate - 
steps);


But steps = abs(ctl), so:

if (ctl < 0)
intel_pstate_set_pstate(cpu, cpu->pstate.current_pstate + 
abs(ctl));
else
intel_pstate_set_pstate(cpu, cpu->pstate.current_pstate - 
abs(ctl));

By definition, abs(ctl) = ctl if ctl >= 0, -ctl if ctl < 0. Thus:

if (ctl < 0)
intel_pstate_set_pstate(cpu, cpu->pstate.current_pstate + 
(-ctl));
else
intel_pstate_set_pstate(cpu, cpu->pstate.current_pstate - ctl);

And:
if (ctl < 0)
intel_pstate_set_pstate(cpu, cpu->pstate.current_pstate - ctl);
else
intel_pstate_set_pstate(cpu, cpu->pstate.current_pstate - ctl);

Finally remove the unnecessary if statement.
intel_pstate_set_pstate(cpu, cpu->pstate.current_pstate - ctl);

So, this is equivalent with the original code.

Thanks,
Stratos
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 3/7] cpufreq: intel_pstate: Add debugfs file stats

2014-06-10 Thread Dirk Brandewie
On 06/10/2014 09:21 AM, Stratos Karafotis wrote:
> On 10/06/2014 06:47 μμ, Dirk Brandewie wrote:
>> On 06/09/2014 02:00 PM, Stratos Karafotis wrote:
>>> Add stats file in debugfs under driver's parent directory
>>> (pstate_snb) which counts the time in nsecs per requested
>>> P state and the number of times the specific state
>>> was requested.
>>>
>>> The file presents the statistics per logical CPU in the
>>> following format. The time is displayed in msecs:
>>>
>>
>> NAK
>>
>> This adds significantly to the memory footprint to gather information
>> that is available by post processing the perf tracepoint information.
>> The increase isn't horrible on single socket desktop processor machines
>> but gets big with server class machines.  One vendor I have talked to 
>> considers
>> a machine with 1024 cpus to be a SMALL machine.
>>
> 
> If I am not wrong the sizeof pstate_stat is 20B. On my CPU with 20 P states, 
> we
> need 400B per logical CPU (3200B total in my desktop) plus 64B for stats 
> pointers.
> 
> In your example this would need about 400KB - 500KB?
> Is it too much for a 1024-CPU system?

For something that will likely not be used IMO yes.

> 
> I think it's a useful piece of info that we can have it directly without
> post processing tracepoint.
> Is it acceptable to conditionally compile it with a new CONFIG option?


I can see where the information could be useful but the set of people
that would find it useful is very small.  Having information about residency 
since boot is interesting but just barely.  This file will encourage people
to build tools/scripts that rely on this file and they will complain bitterly
if/when it changes or goes away, so you would be creating a de facto ABI in
debugfs.


This functionality will *not* be supportable in upcoming processors where HWP
is being used.  See section 14.4 of the current SDM vol. 3 
http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-system-programming-manual-325384.pdf


> 
> 
> Thanks,
> Stratos
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 2/7] cpufreq: intel_pstate: Avoid duplicate call of intel_pstate_get_scaled_busy

2014-06-10 Thread Dirk Brandewie

On 06/09/2014 02:00 PM, Stratos Karafotis wrote:

Store the busy_scaled value to avoid a duplicate call of
intel_pstate_get_scaled_busy on every sampling interval.



The second call *only* happens if the tracepoint is being used; otherwise
the whole function call to trace_pstate_sample() is a noop.

This makes the code less readable IMHO; the reader is left wondering
how cpu->sample.busy_scaled was set in intel_pstate_adjust_busy_pstate().



Also, rename the function to intel_pstate_calc_scaled_busy.

Signed-off-by: Stratos Karafotis 
---
  drivers/cpufreq/intel_pstate.c | 12 ++--
  1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 4e7f492..31e2ae5 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -55,6 +55,7 @@ static inline int32_t div_fp(int32_t x, int32_t y)

  struct sample {
int32_t core_pct_busy;
+   int32_t busy_scaled;
u64 aperf;
u64 mperf;
int freq;
@@ -604,7 +605,7 @@ static inline void intel_pstate_set_sample_time(struct 
cpudata *cpu)
mod_timer_pinned(&cpu->timer, jiffies + delay);
  }

-static inline int32_t intel_pstate_get_scaled_busy(struct cpudata *cpu)
+static inline void intel_pstate_calc_scaled_busy(struct cpudata *cpu)
  {
int32_t core_busy, max_pstate, current_pstate, sample_ratio;
u32 duration_us;
@@ -624,20 +625,19 @@ static inline int32_t intel_pstate_get_scaled_busy(struct 
cpudata *cpu)
core_busy = mul_fp(core_busy, sample_ratio);
}

-   return core_busy;
+   cpu->sample.busy_scaled = core_busy;
  }

  static inline void intel_pstate_adjust_busy_pstate(struct cpudata *cpu)
  {
-   int32_t busy_scaled;
struct _pid *pid;
signed int ctl = 0;
int steps;

pid = &cpu->pid;
-   busy_scaled = intel_pstate_get_scaled_busy(cpu);
+   intel_pstate_calc_scaled_busy(cpu);

-   ctl = pid_calc(pid, busy_scaled);
+   ctl = pid_calc(pid, cpu->sample.busy_scaled);

steps = abs(ctl);

@@ -659,7 +659,7 @@ static void intel_pstate_timer_func(unsigned long __data)
intel_pstate_adjust_busy_pstate(cpu);

trace_pstate_sample(fp_toint(sample->core_pct_busy),
-   fp_toint(intel_pstate_get_scaled_busy(cpu)),
+   fp_toint(sample->busy_scaled),
cpu->pstate.current_pstate,
sample->mperf,
sample->aperf,



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 3/7] cpufreq: intel_pstate: Add debugfs file stats

2014-06-10 Thread Dirk Brandewie

On 06/09/2014 02:00 PM, Stratos Karafotis wrote:

Add stats file in debugfs under driver's parent directory
(pstate_snb) which counts the time in nsecs per requested
P state and the number of times the specific state
was requested.

The file presents the statistics per logical CPU in the
following format. The time is displayed in msecs:



NAK

This adds significantly to the memory footprint to gather information
that is available by post processing the perf tracepoint information.
The increase isn't horrible on single socket desktop processor machines
but gets big with server class machines.  One vendor I have talked to considers
a machine with 1024 cpus to be a SMALL machine.



CPU0
P-state        Time     Count
     16     4882777     23632
     17       21210       174
     18      549781      3300
     19       51171       461
     20       35487       394
     21       18173       219
     22       13752       258
     23        6048       172
     24        7754       177
     25        4587       151
     26        5465       162
     27        1432        47
     28         863        54
     29        1448        50
     30        1030        47
     31        1472        62
     32        2221        68
     33        1869        60
     34        2140        70
     39       85446      3803

...

The file can be used for debugging but also for monitoring
various system workloads.

Also, make the debugfs_parent local as we never remove
the driver's debugfs files.

Signed-off-by: Stratos Karafotis 
---
  drivers/cpufreq/intel_pstate.c | 80 +-
  1 file changed, 79 insertions(+), 1 deletion(-)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 31e2ae5..3a49269 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -86,6 +86,12 @@ struct _pid {
int32_t last_err;
  };

+struct pstate_stat {
+   int pstate;
+   u64 time;
+   u64 count;
+};
+
  struct cpudata {
int cpu;

@@ -99,6 +105,7 @@ struct cpudata {
u64 prev_aperf;
u64 prev_mperf;
struct sample sample;
+   struct pstate_stat *stats;
  };

  static struct cpudata **all_cpu_data;
@@ -256,9 +263,59 @@ static struct pid_param pid_files[] = {
{NULL, NULL}
  };

-static struct dentry *debugfs_parent;
+static inline unsigned int stats_state_index(struct cpudata *cpu, int pstate)
+{
+   if (pstate <= cpu->pstate.max_pstate)
+   return pstate - cpu->pstate.min_pstate;
+   else
+   return cpu->pstate.max_pstate - cpu->pstate.min_pstate + 1;
+}
+
+static int stats_debug_show(struct seq_file *m, void *unused)
+{
+   struct cpudata *cpu;
+   int i, j, cnt;
+
+   get_online_cpus();
+   for_each_online_cpu(i) {
+   if (all_cpu_data[i])
+   cpu = all_cpu_data[i];
+   else
+   continue;
+
+   seq_printf(m, "CPU%u\n", i);
+   seq_puts(m, "P-state        Time     Count\n");
+
+   cnt = cpu->pstate.max_pstate - cpu->pstate.min_pstate + 2;
+   for (j = 0; j < cnt; j++)
+   seq_printf(m, "%7u %11llu %9llu\n",
+  cpu->stats[j].pstate,
+  cpu->stats[j].time / USEC_PER_MSEC,
+  cpu->stats[j].count);
+
+   seq_puts(m, "\n");
+   }
+   put_online_cpus();
+
+   return 0;
+}
+
+static int stats_debug_open(struct inode *inode, struct file *file)
+{
+   return single_open(file, stats_debug_show, inode->i_private);
+}
+
+static const struct file_operations fops_stats_pstate = {
+   .open   = stats_debug_open,
+   .read   = seq_read,
+   .llseek = seq_lseek,
+   .release= single_release,
+   .owner  = THIS_MODULE,
+};
+
  static void intel_pstate_debug_expose_params(void)
  {
+   struct dentry *debugfs_parent;
int i = 0;

debugfs_parent = debugfs_create_dir("pstate_snb", NULL);
@@ -270,6 +327,8 @@ static void intel_pstate_debug_expose_params(void)
&fops_pid_param);
i++;
}
+   debugfs_create_file("stats", S_IRUSR | S_IRGRP, debugfs_parent, NULL,
+   &fops_stats_pstate);
  }

  /** debugfs end /
@@ -610,6 +669,7 @@ static inline void intel_pstate_calc_scaled_busy(struct 
cpudata *cpu)
int32_t core_busy, max_pstate, current_pstate, sample_ratio;
u32 duration_us;
u32 sample_time;
+   unsigned int i;

core_busy = cpu->sample.core_pct_busy;
max_pstate = int_tofp(cpu->pstate.max_pstate);
@@ -626,6 +686,10 @@ static inline void intel_pstate_calc_scaled_busy(struct 
cpudata *cpu)
}

cpu->sample.busy_scaled = core_busy;
+
+   i = 

Re: [PATCH 6/7] cpufreq: intel_pstate: Trivial code cleanup

2014-06-10 Thread Dirk Brandewie

On 06/09/2014 02:01 PM, Stratos Karafotis wrote:

Remove unnecessary blank lines.
Remove unnecessary parentheses.
Remove unnecessary braces.
Put the code in one line where possible.
Add blank lines after variable declarations.
Alignment to open parenthesis.



I don't have an issue with this patch in general but I would rather
the cleanup be done when there is a functional change in the given
hunk of code otherwise you are setting up a fence for stable/backporters
of functional changes in the future.



Signed-off-by: Stratos Karafotis 
---
  drivers/cpufreq/intel_pstate.c | 96 --
  1 file changed, 45 insertions(+), 51 deletions(-)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index d4f0518..fa44f0f 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -142,7 +142,7 @@ static struct perf_limits limits = {
  };

  static inline void pid_reset(struct _pid *pid, int setpoint, int busy,
-   int deadband, int integral) {
+int deadband, int integral) {
pid->setpoint = setpoint;
pid->deadband  = deadband;
pid->integral  = int_tofp(integral);
@@ -161,7 +161,6 @@ static inline void pid_i_gain_set(struct _pid *pid, int 
percent)

  static inline void pid_d_gain_set(struct _pid *pid, int percent)
  {
-
pid->d_gain = div_fp(int_tofp(percent), int_tofp(100));
  }

@@ -192,9 +191,9 @@ static signed int pid_calc(struct _pid *pid, int32_t busy)

result = pterm + mul_fp(pid->integral, pid->i_gain) + dterm;
if (result >= 0)
-   result = result + (1 << (FRAC_BITS-1));
+   result += 1 << (FRAC_BITS-1);
else
-   result = result - (1 << (FRAC_BITS-1));
+   result -= 1 << (FRAC_BITS-1);
return (signed int)fp_toint(result);
  }

@@ -204,20 +203,16 @@ static inline void intel_pstate_busy_pid_reset(struct 
cpudata *cpu)
pid_d_gain_set(>pid, pid_params.d_gain_pct);
pid_i_gain_set(>pid, pid_params.i_gain_pct);

-   pid_reset(>pid,
-   pid_params.setpoint,
-   100,
-   pid_params.deadband,
-   0);
+   pid_reset(>pid, pid_params.setpoint, 100, pid_params.deadband, 0);
  }

  static inline void intel_pstate_reset_all_pid(void)
  {
unsigned int cpu;
-   for_each_online_cpu(cpu) {
+
+   for_each_online_cpu(cpu)
if (all_cpu_data[cpu])
intel_pstate_busy_pid_reset(all_cpu_data[cpu]);
-   }
  }

  /** debugfs begin /
@@ -227,13 +222,13 @@ static int pid_param_set(void *data, u64 val)
intel_pstate_reset_all_pid();
return 0;
  }
+
  static int pid_param_get(void *data, u64 *val)
  {
*val = *(u32 *)data;
return 0;
  }
-DEFINE_SIMPLE_ATTRIBUTE(fops_pid_param, pid_param_get,
-   pid_param_set, "%llu\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_pid_param, pid_param_get, pid_param_set, 
"%llu\n");

  struct pid_param {
char *name;
@@ -310,8 +305,8 @@ static void intel_pstate_debug_expose_params(void)
return;
while (pid_files[i].name) {
debugfs_create_file(pid_files[i].name, 0660,
-   debugfs_parent, pid_files[i].value,
-   _pid_param);
+   debugfs_parent, pid_files[i].value,
+   _pid_param);
i++;
}
debugfs_create_file("stats", S_IRUSR | S_IRGRP, debugfs_parent, NULL,
@@ -329,10 +324,11 @@ static void intel_pstate_debug_expose_params(void)
}

  static ssize_t store_no_turbo(struct kobject *a, struct attribute *b,
-   const char *buf, size_t count)
+ const char *buf, size_t count)
  {
unsigned int input;
int ret;
+
ret = sscanf(buf, "%u", );
if (ret != 1)
return -EINVAL;
@@ -342,10 +338,11 @@ static ssize_t store_no_turbo(struct kobject *a, struct 
attribute *b,
  }

  static ssize_t store_max_perf_pct(struct kobject *a, struct attribute *b,
-   const char *buf, size_t count)
+ const char *buf, size_t count)
  {
unsigned int input;
int ret;
+
ret = sscanf(buf, "%u", );
if (ret != 1)
return -EINVAL;
@@ -353,14 +350,16 @@ static ssize_t store_max_perf_pct(struct kobject *a, 
struct attribute *b,
limits.max_sysfs_pct = clamp_t(int, input, 0 , 100);
limits.max_perf_pct = min(limits.max_policy_pct, limits.max_sysfs_pct);
limits.max_perf = div_fp(int_tofp(limits.max_perf_pct), int_tofp(100));
+
return count;
  }

  static ssize_t store_min_perf_pct(struct kobject *a, struct attribute *b,
-   const char *buf, 

Re: [PATCH 6/7] cpufreq: intel_pstate: Trivial code cleanup

2014-06-10 Thread Dirk Brandewie

On 06/09/2014 02:01 PM, Stratos Karafotis wrote:

Remove unnecessary blank lines.
Remove unnecessary parentheses.
Remove unnecessary braces.
Put the code in one line where possible.
Add blank lines after variable declarations.
Alignment to open parenthesis.



I don't have an issue with this patch in general but I would rather
the cleanup be done when there is a functional change in the given
hunk of code otherwise you are setting up a fence for stable/backporters
of functional changes in the future.



Signed-off-by: Stratos Karafotis strat...@semaphore.gr
---
  drivers/cpufreq/intel_pstate.c | 96 --
  1 file changed, 45 insertions(+), 51 deletions(-)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index d4f0518..fa44f0f 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -142,7 +142,7 @@ static struct perf_limits limits = {
  };

  static inline void pid_reset(struct _pid *pid, int setpoint, int busy,
-   int deadband, int integral) {
+int deadband, int integral) {
pid-setpoint = setpoint;
pid-deadband  = deadband;
pid-integral  = int_tofp(integral);
@@ -161,7 +161,6 @@ static inline void pid_i_gain_set(struct _pid *pid, int 
percent)

  static inline void pid_d_gain_set(struct _pid *pid, int percent)
  {
-
pid-d_gain = div_fp(int_tofp(percent), int_tofp(100));
  }

@@ -192,9 +191,9 @@ static signed int pid_calc(struct _pid *pid, int32_t busy)

result = pterm + mul_fp(pid-integral, pid-i_gain) + dterm;
if (result = 0)
-   result = result + (1  (FRAC_BITS-1));
+   result += 1  (FRAC_BITS-1);
else
-   result = result - (1  (FRAC_BITS-1));
+   result -= 1  (FRAC_BITS-1);
return (signed int)fp_toint(result);
  }

@@ -204,20 +203,16 @@ static inline void intel_pstate_busy_pid_reset(struct 
cpudata *cpu)
pid_d_gain_set(cpu-pid, pid_params.d_gain_pct);
pid_i_gain_set(cpu-pid, pid_params.i_gain_pct);

-   pid_reset(cpu-pid,
-   pid_params.setpoint,
-   100,
-   pid_params.deadband,
-   0);
+   pid_reset(cpu-pid, pid_params.setpoint, 100, pid_params.deadband, 0);
  }

  static inline void intel_pstate_reset_all_pid(void)
  {
unsigned int cpu;
-   for_each_online_cpu(cpu) {
+
+   for_each_online_cpu(cpu)
if (all_cpu_data[cpu])
intel_pstate_busy_pid_reset(all_cpu_data[cpu]);
-   }
  }

  /** debugfs begin /
@@ -227,13 +222,13 @@ static int pid_param_set(void *data, u64 val)
intel_pstate_reset_all_pid();
return 0;
  }
+
  static int pid_param_get(void *data, u64 *val)
  {
*val = *(u32 *)data;
return 0;
  }
-DEFINE_SIMPLE_ATTRIBUTE(fops_pid_param, pid_param_get,
-   pid_param_set, %llu\n);
+DEFINE_SIMPLE_ATTRIBUTE(fops_pid_param, pid_param_get, pid_param_set, 
%llu\n);

  struct pid_param {
char *name;
@@ -310,8 +305,8 @@ static void intel_pstate_debug_expose_params(void)
return;
while (pid_files[i].name) {
debugfs_create_file(pid_files[i].name, 0660,
-   debugfs_parent, pid_files[i].value,
-   fops_pid_param);
+   debugfs_parent, pid_files[i].value,
+   fops_pid_param);
i++;
}
debugfs_create_file(stats, S_IRUSR | S_IRGRP, debugfs_parent, NULL,
@@ -329,10 +324,11 @@ static void intel_pstate_debug_expose_params(void)
}

  static ssize_t store_no_turbo(struct kobject *a, struct attribute *b,
-   const char *buf, size_t count)
+ const char *buf, size_t count)
  {
unsigned int input;
int ret;
+
ret = sscanf(buf, %u, input);
if (ret != 1)
return -EINVAL;
@@ -342,10 +338,11 @@ static ssize_t store_no_turbo(struct kobject *a, struct 
attribute *b,
  }

  static ssize_t store_max_perf_pct(struct kobject *a, struct attribute *b,
-   const char *buf, size_t count)
+ const char *buf, size_t count)
  {
unsigned int input;
int ret;
+
ret = sscanf(buf, %u, input);
if (ret != 1)
return -EINVAL;
@@ -353,14 +350,16 @@ static ssize_t store_max_perf_pct(struct kobject *a, 
struct attribute *b,
limits.max_sysfs_pct = clamp_t(int, input, 0 , 100);
limits.max_perf_pct = min(limits.max_policy_pct, limits.max_sysfs_pct);
limits.max_perf = div_fp(int_tofp(limits.max_perf_pct), int_tofp(100));
+
return count;
  }

  static ssize_t store_min_perf_pct(struct kobject *a, struct attribute *b,
-

Re: [PATCH 3/7] cpufreq: intel_pstate: Add debugfs file stats

2014-06-10 Thread Dirk Brandewie

On 06/09/2014 02:00 PM, Stratos Karafotis wrote:

Add stats file in debugfs under driver's parent directory
(pstate_snb) which counts the time in nsecs per requested
P state and the number of times the specific state
was requested.

The file presents the statistics per logical CPU in the
following format. The time is displayed in msecs:



NAK

This adds significantly to the memory footprint to gather information
that is available by post processing the perf tracepoint information.
The increase isn't horrible on single socket desktop processor machines
but gets big with server class machines.  One vendor I have talked to considers
a machine with 1024 cpus to be a SMALL machine.



CPU0
P-stateTime Count
  16 4882777 23632
  17   21210   174
  18  549781  3300
  19   51171   461
  20   35487   394
  21   18173   219
  22   13752   258
  236048   172
  247754   177
  254587   151
  265465   162
  27143247
  28 86354
  29144850
  30103047
  31147262
  32222168
  33186960
  34214070
  39   85446  3803

...

The file can be used for debugging but also for monitoring
various system workloads.

Also, make the debugfs_parent local as we never remove
the driver's debugfs files.

Signed-off-by: Stratos Karafotis strat...@semaphore.gr
---
  drivers/cpufreq/intel_pstate.c | 80 +-
  1 file changed, 79 insertions(+), 1 deletion(-)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 31e2ae5..3a49269 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -86,6 +86,12 @@ struct _pid {
int32_t last_err;
  };

+struct pstate_stat {
+   int pstate;
+   u64 time;
+   u64 count;
+};
+
  struct cpudata {
int cpu;

@@ -99,6 +105,7 @@ struct cpudata {
u64 prev_aperf;
u64 prev_mperf;
struct sample sample;
+   struct pstate_stat *stats;
  };

  static struct cpudata **all_cpu_data;
@@ -256,9 +263,59 @@ static struct pid_param pid_files[] = {
{NULL, NULL}
  };

-static struct dentry *debugfs_parent;
+static inline unsigned int stats_state_index(struct cpudata *cpu, int pstate)
+{
+   if (pstate = cpu-pstate.max_pstate)
+   return pstate - cpu-pstate.min_pstate;
+   else
+   return cpu-pstate.max_pstate - cpu-pstate.min_pstate + 1;
+}
+
+static int stats_debug_show(struct seq_file *m, void *unused)
+{
+   struct cpudata *cpu;
+   int i, j, cnt;
+
+   get_online_cpus();
+   for_each_online_cpu(i) {
+   if (all_cpu_data[i])
+   cpu = all_cpu_data[i];
+   else
+   continue;
+
+   seq_printf(m, CPU%u\n, i);
+   seq_puts(m, P-stateTime Count\n);
+
+   cnt = cpu-pstate.max_pstate - cpu-pstate.min_pstate + 2;
+   for (j = 0; j  cnt; j++)
+   seq_printf(m, %7u %11llu %9llu\n,
+  cpu-stats[j].pstate,
+  cpu-stats[j].time / USEC_PER_MSEC,
+  cpu-stats[j].count);
+
+   seq_puts(m, \n);
+   }
+   put_online_cpus();
+
+   return 0;
+}
+
+static int stats_debug_open(struct inode *inode, struct file *file)
+{
+   return single_open(file, stats_debug_show, inode-i_private);
+}
+
+static const struct file_operations fops_stats_pstate = {
+   .open   = stats_debug_open,
+   .read   = seq_read,
+   .llseek = seq_lseek,
+   .release= single_release,
+   .owner  = THIS_MODULE,
+};
+
  static void intel_pstate_debug_expose_params(void)
  {
+   struct dentry *debugfs_parent;
int i = 0;

debugfs_parent = debugfs_create_dir(pstate_snb, NULL);
@@ -270,6 +327,8 @@ static void intel_pstate_debug_expose_params(void)
fops_pid_param);
i++;
}
+   debugfs_create_file(stats, S_IRUSR | S_IRGRP, debugfs_parent, NULL,
+   fops_stats_pstate);
  }

  /** debugfs end /
@@ -610,6 +669,7 @@ static inline void intel_pstate_calc_scaled_busy(struct 
cpudata *cpu)
int32_t core_busy, max_pstate, current_pstate, sample_ratio;
u32 duration_us;
u32 sample_time;
+   unsigned int i;

core_busy = cpu-sample.core_pct_busy;
max_pstate = int_tofp(cpu-pstate.max_pstate);
@@ -626,6 +686,10 @@ static inline void intel_pstate_calc_scaled_busy(struct 
cpudata *cpu)
}

cpu-sample.busy_scaled = core_busy;
+
+   i = 

Re: [PATCH 2/7] cpufreq: intel_pstate: Avoid duplicate call of intel_pstate_get_scaled_busy

2014-06-10 Thread Dirk Brandewie

On 06/09/2014 02:00 PM, Stratos Karafotis wrote:

Store the busy_scaled value to avoid a duplicate call of
intel_pstate_get_scaled_busy on every sampling interval.



The second call *only* happens if the tracepoint is being used; otherwise
the whole function call to trace_pstate_sample() is a noop.

This makes the code less readable IMHO; the reader is left wondering
how cpu->sample.busy_scaled was set in intel_pstate_adjust_busy_pstate().



Also, rename the function to intel_pstate_calc_scaled_busy.

Signed-off-by: Stratos Karafotis strat...@semaphore.gr
---
  drivers/cpufreq/intel_pstate.c | 12 ++--
  1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 4e7f492..31e2ae5 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -55,6 +55,7 @@ static inline int32_t div_fp(int32_t x, int32_t y)

  struct sample {
int32_t core_pct_busy;
+   int32_t busy_scaled;
u64 aperf;
u64 mperf;
int freq;
@@ -604,7 +605,7 @@ static inline void intel_pstate_set_sample_time(struct 
cpudata *cpu)
mod_timer_pinned(cpu-timer, jiffies + delay);
  }

-static inline int32_t intel_pstate_get_scaled_busy(struct cpudata *cpu)
+static inline void intel_pstate_calc_scaled_busy(struct cpudata *cpu)
  {
int32_t core_busy, max_pstate, current_pstate, sample_ratio;
u32 duration_us;
@@ -624,20 +625,19 @@ static inline int32_t intel_pstate_get_scaled_busy(struct 
cpudata *cpu)
core_busy = mul_fp(core_busy, sample_ratio);
}

-   return core_busy;
+   cpu-sample.busy_scaled = core_busy;
  }

  static inline void intel_pstate_adjust_busy_pstate(struct cpudata *cpu)
  {
-   int32_t busy_scaled;
struct _pid *pid;
signed int ctl = 0;
int steps;

pid = cpu-pid;
-   busy_scaled = intel_pstate_get_scaled_busy(cpu);
+   intel_pstate_calc_scaled_busy(cpu);

-   ctl = pid_calc(pid, busy_scaled);
+   ctl = pid_calc(pid, cpu-sample.busy_scaled);

steps = abs(ctl);

@@ -659,7 +659,7 @@ static void intel_pstate_timer_func(unsigned long __data)
intel_pstate_adjust_busy_pstate(cpu);

trace_pstate_sample(fp_toint(sample-core_pct_busy),
-   fp_toint(intel_pstate_get_scaled_busy(cpu)),
+   fp_toint(sample-busy_scaled),
cpu-pstate.current_pstate,
sample-mperf,
sample-aperf,



--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 3/7] cpufreq: intel_pstate: Add debugfs file stats

2014-06-10 Thread Dirk Brandewie
On 06/10/2014 09:21 AM, Stratos Karafotis wrote:
 On 10/06/2014 06:47 μμ, Dirk Brandewie wrote:
 On 06/09/2014 02:00 PM, Stratos Karafotis wrote:
 Add stats file in debugfs under driver's parent directory
 (pstate_snb) which counts the time in nsecs per requested
 P state and the number of times the specific state
 was requested.

 The file presents the statistics per logical CPU in the
 following format. The time is displayed in msecs:


 NAK

 This adds significantly to the memory footprint to gather information
 that is available by post processing the perf tracepoint information.
 The increase isn't horrible on single socket desktop processor machines
 but gets big with server class machines.  One vendor I have talked to 
 considers
 a machine with 1024 cpus to be a SMALL machine.

 
 If I am not wrong the sizeof pstate_stat is 20B. On my CPU with 20 P states, 
 we
 need 400B per logical CPU (3200B total in my desktop) plus 64B for stats 
 pointers.
 
 In your example this would need about 400KB - 500KB?
 Is it too much for a 1024-CPU system?

For something that will likely not be used IMO yes.

 
 I think it's a useful piece of info that we can have it directly without
 post processing tracepoint.
 Is it acceptable to conditionally compile it with a new CONFIG option?


I can see where the information could be useful, but the set of people
that would find it useful is very small.  Having information about residency
since boot is interesting, but just barely.  This file will encourage people
to build tools/scripts that rely on this file, and they will complain bitterly
if/when it changes or goes away, so you would be creating a de facto ABI in
debugfs.


This functionality will *not* be supportable in upcoming processors where HWP
is being used.  See section 14.4 of the current SDM vol. 3 
http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-system-programming-manual-325384.pdf


 
 
 Thanks,
 Stratos
 

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 4/7] cpufreq: intel_pstate: Simplify code in intel_pstate_adjust_busy_pstate

2014-06-10 Thread Dirk Brandewie

On 06/10/2014 07:51 AM, Stratos Karafotis wrote:

On 10/06/2014 08:27 πμ, Viresh Kumar wrote:

On 10 June 2014 02:30, Stratos Karafotis strat...@semaphore.gr wrote:

Simplify the code by removing the inline functions
pstate_increase and pstate_decrease and use directly the
intel_pstate_set_pstate.



Doesn't apply without your scaled_busy change; spin this patch without
the scaled_busy change and explain the change more fully in the
commit message to cover Viresh's question, and I am good with this change.



Signed-off-by: Stratos Karafotis strat...@semaphore.gr
---
  drivers/cpufreq/intel_pstate.c | 26 +++---
  1 file changed, 3 insertions(+), 23 deletions(-)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 3a49269..26a0262 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -588,21 +588,6 @@ static void intel_pstate_set_pstate(struct cpudata *cpu, 
int pstate)
 pstate_funcs.set(cpu, pstate);
  }

-static inline void intel_pstate_pstate_increase(struct cpudata *cpu, int steps)
-{
-   int target;
-   target = cpu-pstate.current_pstate + steps;
-
-   intel_pstate_set_pstate(cpu, target);
-}
-
-static inline void intel_pstate_pstate_decrease(struct cpudata *cpu, int steps)
-{
-   int target;
-   target = cpu-pstate.current_pstate - steps;
-   intel_pstate_set_pstate(cpu, target);
-}
-
  static void intel_pstate_get_cpu_pstates(struct cpudata *cpu)
  {
 cpu-pstate.min_pstate = pstate_funcs.get_min();
@@ -695,20 +680,15 @@ static inline void intel_pstate_calc_scaled_busy(struct 
cpudata *cpu)
  static inline void intel_pstate_adjust_busy_pstate(struct cpudata *cpu)
  {
 struct _pid *pid;
-   signed int ctl = 0;
-   int steps;
+   signed int ctl;

 pid = cpu-pid;
 intel_pstate_calc_scaled_busy(cpu);

 ctl = pid_calc(pid, cpu-sample.busy_scaled);

-   steps = abs(ctl);
-
-   if (ctl  0)
-   intel_pstate_pstate_increase(cpu, steps);
-   else
-   intel_pstate_pstate_decrease(cpu, steps);
+   /* Negative values of ctl increase the pstate and vice versa */
+   intel_pstate_set_pstate(cpu, cpu-pstate.current_pstate - ctl);
  }


I am not very good at this driver but there is some obvious functional
change here. Earlier we used to pass
'cpu->pstate.current_pstate {-|+} steps' and now you are doing '-ctl' only



The original code is:   

if (ctl  0)
intel_pstate_pstate_increase(cpu, steps);
else
intel_pstate_pstate_decrease(cpu, steps);

Without the inline functions intel_pstate_pstate_increase() and
intel_pstate_pstate_decrease() we get:

if (ctl  0)
intel_pstate_set_pstate(cpu, cpu-pstate.current_pstate + 
steps);
else
intel_pstate_set_pstate(cpu, cpu-pstate.current_pstate - 
steps);


But steps = abs(ctl), so:

if (ctl  0)
intel_pstate_set_pstate(cpu, cpu-pstate.current_pstate + 
abs(ctl));
else
intel_pstate_set_pstate(cpu, cpu-pstate.current_pstate - 
abs(ctl));

By definition, abs(ctl) = ctl if ctl >= 0, -ctl if ctl < 0. Thus:

if (ctl  0)
intel_pstate_set_pstate(cpu, cpu-pstate.current_pstate + 
(-ctl));
else
intel_pstate_set_pstate(cpu, cpu-pstate.current_pstate - ctl);

And:
if (ctl  0)
intel_pstate_set_pstate(cpu, cpu-pstate.current_pstate - ctl);
else
intel_pstate_set_pstate(cpu, cpu-pstate.current_pstate - ctl);

Finally remove the unnecessary if statement.
intel_pstate_set_pstate(cpu, cpu-pstate.current_pstate - ctl);

So, this is equivalent to the original code.

Thanks,
Stratos
--
To unsubscribe from this list: send the line unsubscribe linux-pm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html



--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 1/7] cpufreq: intel_pstate: Remove duplicate CPU ID check

2014-06-10 Thread Dirk Brandewie

On 06/09/2014 10:21 PM, Viresh Kumar wrote:

On 10 June 2014 02:30, Stratos Karafotis strat...@semaphore.gr wrote:

We check the CPU ID during driver init. There is no need
to do it again per logical CPU initialization.

So, remove the duplicate check.

Signed-off-by: Stratos Karafotis strat...@semaphore.gr
---
  drivers/cpufreq/intel_pstate.c | 6 --
  1 file changed, 6 deletions(-)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index aebd457..4e7f492 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -691,14 +691,8 @@ MODULE_DEVICE_TABLE(x86cpu, intel_pstate_cpu_ids);

  static int intel_pstate_init_cpu(unsigned int cpunum)
  {
-
-   const struct x86_cpu_id *id;
 struct cpudata *cpu;

-   id = x86_match_cpu(intel_pstate_cpu_ids);
-   if (!id)
-   return -ENODEV;
-
 all_cpu_data[cpunum] = kzalloc(sizeof(struct cpudata), GFP_KERNEL);
 if (!all_cpu_data[cpunum])
 return -ENOMEM;


Acked-by: Viresh Kumar viresh.ku...@linaro.org


Acked-by: Dirk Brandewie dirk.j.brande...@intel.com

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 7/7] cpufreq: intel_pstate: Make intel_pstate_kobject local

2014-06-10 Thread Dirk Brandewie

On 06/09/2014 02:01 PM, Stratos Karafotis wrote:

Since we never remove sysfs entry, we can make the intel_pstate_kobject
local.

Signed-off-by: Stratos Karafotis strat...@semaphore.gr

Acked-by: Dirk Brandewie dirk.j.brande...@intel.com


  drivers/cpufreq/intel_pstate.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index fa44f0f..9533fff 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -387,10 +387,10 @@ static struct attribute *intel_pstate_attributes[] = {
  static struct attribute_group intel_pstate_attr_group = {
.attrs = intel_pstate_attributes,
  };
-static struct kobject *intel_pstate_kobject;

  static void intel_pstate_sysfs_expose_params(void)
  {
+   struct kobject *intel_pstate_kobject;
int rc;

intel_pstate_kobject = kobject_create_and_add(intel_pstate,



--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 6/7] cpufreq: intel_pstate: Trivial code cleanup

2014-06-10 Thread Dirk Brandewie

On 06/10/2014 08:31 AM, Rafael J. Wysocki wrote:

On Tuesday, June 10, 2014 08:12:48 AM Dirk Brandewie wrote:

On 06/09/2014 02:01 PM, Stratos Karafotis wrote:

Remove unnecessary blank lines.
Remove unnecessary parentheses.
Remove unnecessary braces.
Put the code in one line where possible.
Add blank lines after variable declarations.
Alignment to open parenthesis.



I don't have an issue with this patch in general but I would rather
the cleanup be done when there is a functional change in the given
hunk of code otherwise you are setting up a fence for stable/backporters
of functional changes in the future.


I actually prefer separate cleanups so as to avoid doing multiple things
in one patch.

Rafael


I don't have strong feelings either way; I was just trying to be kind
to the maintainers of distro kernels.

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [REGRESSION] 3.15: Seems to turbo mode Intel Sandybridge Dual Core without need, overheating CPU

2014-06-09 Thread Dirk Brandewie

On 06/09/2014 03:02 PM, Martin Steigerwald wrote:

Am Montag, 9. Juni 2014, 23:41:40 schrieb Martin Steigerwald:

Am Montag, 9. Juni 2014, 23:33:43 schrieb Martin Steigerwald:

Hi!

Added linux-pm to Cc. Also, a reboot seems to fix up the condition:

merkaba:~> grep . /sys/devices/system/cpu/cpu[0-3]/cpufreq/cpuinfo_cur_freq
/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_cur_freq:830957
/sys/devices/system/cpu/cpu1/cpufreq/cpuinfo_cur_freq:819628
/sys/devices/system/cpu/cpu2/cpufreq/cpuinfo_cur_freq:80
/sys/devices/system/cpu/cpu3/cpufreq/cpuinfo_cur_freq:813476
merkaba:~> sensors
acpitz-virtual-0
Adapter: Virtual device
temp1:+71.0°C  (crit = +98.0°C)

coretemp-isa-
Adapter: ISA adapter
Physical id 0:  +71.0°C  (high = +86.0°C, crit = +100.0°C)
Core 0: +70.0°C  (high = +86.0°C, crit = +100.0°C)
Core 1: +71.0°C  (high = +86.0°C, crit = +100.0°C)

thinkpad-isa-
Adapter: ISA adapter
fan1:3137 R


Still hot in here and after reboot and login into KDE session there is quite
some CPU activity for a while.

But way better than before.

I can test whether this also happens with ACPI cpufreq driver.

I think I didn't see this with 3.14.


It's not just me:

Please change intel_pstate default to disable
https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1188647


Way better with ACPI cpufreq driver and ondemand governor:

merkaba:~> grep . /sys/devices/system/cpu/cpu[0-3]/cpufreq/cpuinfo_cur_freq
/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_cur_freq:2501000
/sys/devices/system/cpu/cpu1/cpufreq/cpuinfo_cur_freq:80
/sys/devices/system/cpu/cpu2/cpufreq/cpuinfo_cur_freq:80
/sys/devices/system/cpu/cpu3/cpufreq/cpuinfo_cur_freq:80
merkaba:~> grep . /sys/devices/system/cpu/cpu[0-3]/cpufreq/cpuinfo_cur_freq
/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_cur_freq:160
/sys/devices/system/cpu/cpu1/cpufreq/cpuinfo_cur_freq:250
/sys/devices/system/cpu/cpu2/cpufreq/cpuinfo_cur_freq:80
/sys/devices/system/cpu/cpu3/cpufreq/cpuinfo_cur_freq:160
merkaba:~> grep . /sys/devices/system/cpu/cpu[0-3]/cpufreq/cpuinfo_cur_freq
/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_cur_freq:180
/sys/devices/system/cpu/cpu1/cpufreq/cpuinfo_cur_freq:2501000
/sys/devices/system/cpu/cpu2/cpufreq/cpuinfo_cur_freq:250
/sys/devices/system/cpu/cpu3/cpufreq/cpuinfo_cur_freq:80
merkaba:~> grep . /sys/devices/system/cpu/cpu[0-3]/cpufreq/cpuinfo_cur_freq
/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_cur_freq:80
/sys/devices/system/cpu/cpu1/cpufreq/cpuinfo_cur_freq:2501000
/sys/devices/system/cpu/cpu2/cpufreq/cpuinfo_cur_freq:80
/sys/devices/system/cpu/cpu3/cpufreq/cpuinfo_cur_freq:180
merkaba:~> grep . /sys/devices/system/cpu/cpu[0-3]/cpufreq/cpuinfo_cur_freq
/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_cur_freq:120
/sys/devices/system/cpu/cpu1/cpufreq/cpuinfo_cur_freq:2501000
/sys/devices/system/cpu/cpu2/cpufreq/cpuinfo_cur_freq:80
/sys/devices/system/cpu/cpu3/cpufreq/cpuinfo_cur_freq:80


This is ondemand/acpi_cpufreq telling a well-documented lie.  They tell you
what P state was requested, not the frequency the core is running at.

/sys/devices/system/cpu/cpu1/cpufreq/cpuinfo_cur_freq:2501000
is code for "select the highest turbo P state" in ACPI frequency table
terms.  With the HW coordination on the chip, all the cores will actually be
running at the frequency of the highest requested P state.

intel_pstate returns the measured/actual frequency the core ran at during
the most recent sample that the driver took.

Something is convincing intel_pstate and ondemand that one (or more) cores
are very busy.


merkaba:~> sensors
acpitz-virtual-0
Adapter: Virtual device
temp1:+83.0°C  (crit = +98.0°C)

coretemp-isa-
Adapter: ISA adapter
Physical id 0:  +84.0°C  (high = +86.0°C, crit = +100.0°C)
Core 0: +83.0°C  (high = +86.0°C, crit = +100.0°C)
Core 1: +84.0°C  (high = +86.0°C, crit = +100.0°C)

thinkpad-isa-
Adapter: ISA adapter
fan1:3586 RPM



It's still hot in this room, but this definitely looks saner.

Thanks,
Martin





Am Montag, 9. Juni 2014, 23:24:54 schrieb Martin Steigerwald:

Hi!

I get:

Jun  9 22:41:32 merkaba kernel: [39978.006479] CPU0: Package temperature/speed 
normal
Jun  9 22:41:32 merkaba kernel: [39978.006481] CPU3: Package temperature/speed 
normal
Jun  9 22:41:32 merkaba kernel: [39978.006482] CPU2: Package temperature/speed 
normal
Jun  9 22:41:32 merkaba kernel: [39978.006487] CPU1: Package temperature/speed 
normal
Jun  9 22:44:02 merkaba kernel: [40127.673372] CPU2: Core temperature above 
threshold, cpu clock throttled (total events = 56554)
Jun  9 22:44:02 merkaba kernel: [40127.673383] CPU3: Core temperature above 
threshold, cpu clock throttled (total events = 56554)
Jun  9 22:44:02 merkaba kernel: [40127.674313] CPU3: Core temperature/speed 
normal
Jun  9 22:44:02 merkaba kernel: [40127.674352] CPU2: Core temperature/speed 
normal
Jun  9 22:45:21 merkaba kernel: [40207.302287] mce: [Hardware 

Re: [REGRESSION] 3.15: Seems to turbo mode Intel Sandybridge Dual Core without need, overheating CPU

2014-06-09 Thread Dirk Brandewie


Hi Martin,

Can you send the output of:
   turbostat sleep 10
and
   for i in 0 1 2 3; do rdmsr  -p $i -u -f15:8 0x198; done

For the normal and bad case please.

--Dirk

On 06/09/2014 02:33 PM, Martin Steigerwald wrote:

Hi!

Added linux-pm to Cc. Also, a reboot seems to fix up the condition:

merkaba:~> grep . /sys/devices/system/cpu/cpu[0-3]/cpufreq/cpuinfo_cur_freq
/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_cur_freq:830957
/sys/devices/system/cpu/cpu1/cpufreq/cpuinfo_cur_freq:819628
/sys/devices/system/cpu/cpu2/cpufreq/cpuinfo_cur_freq:80
/sys/devices/system/cpu/cpu3/cpufreq/cpuinfo_cur_freq:813476
merkaba:~> sensors
acpitz-virtual-0
Adapter: Virtual device
temp1:+71.0°C  (crit = +98.0°C)

coretemp-isa-
Adapter: ISA adapter
Physical id 0:  +71.0°C  (high = +86.0°C, crit = +100.0°C)
Core 0: +70.0°C  (high = +86.0°C, crit = +100.0°C)
Core 1: +71.0°C  (high = +86.0°C, crit = +100.0°C)

thinkpad-isa-
Adapter: ISA adapter
fan1:3137 R


Still hot in here and after reboot and login into KDE session there is quite
some CPU activity for a while.

But way better than before.

I can test whether this also happens with ACPI cpufreq driver.

I think I didn't see this with 3.14.



Am Montag, 9. Juni 2014, 23:24:54 schrieb Martin Steigerwald:

Hi!

I get:

Jun  9 22:41:32 merkaba kernel: [39978.006479] CPU0: Package temperature/speed 
normal
Jun  9 22:41:32 merkaba kernel: [39978.006481] CPU3: Package temperature/speed 
normal
Jun  9 22:41:32 merkaba kernel: [39978.006482] CPU2: Package temperature/speed 
normal
Jun  9 22:41:32 merkaba kernel: [39978.006487] CPU1: Package temperature/speed 
normal
Jun  9 22:44:02 merkaba kernel: [40127.673372] CPU2: Core temperature above 
threshold, cpu clock throttled (total events = 56554)
Jun  9 22:44:02 merkaba kernel: [40127.673383] CPU3: Core temperature above 
threshold, cpu clock throttled (total events = 56554)
Jun  9 22:44:02 merkaba kernel: [40127.674313] CPU3: Core temperature/speed 
normal
Jun  9 22:44:02 merkaba kernel: [40127.674352] CPU2: Core temperature/speed 
normal
Jun  9 22:45:21 merkaba kernel: [40207.302287] mce: [Hardware Error]: Machine 
check events logged
Jun  9 22:46:32 merkaba kernel: [40278.054568] CPU0: Package temperature/speed 
normal
Jun  9 22:46:32 merkaba kernel: [40278.054572] CPU3: Package temperature/speed 
normal
Jun  9 22:46:32 merkaba kernel: [40278.054574] CPU2: Package temperature/speed 
normal
Jun  9 22:46:32 merkaba kernel: [40278.054578] CPU1: Package temperature/speed 
normal
Jun  9 22:48:06 merkaba kernel: [40371.570654] perf interrupt took too long (19348 
> 17857), lowering kernel.perf_event_max_sample_rate to 7000
Jun  9 22:51:32 merkaba kernel: [40578.103629] CPU3: Package temperature/speed 
normal
Jun  9 22:51:32 merkaba kernel: [40578.103633] CPU0: Package temperature/speed 
normal
Jun  9 22:51:32 merkaba kernel: [40578.103638] CPU2: Package temperature/speed 
normal
Jun  9 22:51:32 merkaba kernel: [40578.103639] CPU1: Package temperature/speed 
normal
Jun  9 22:56:32 merkaba kernel: [40878.174734] CPU1: Package temperature above 
threshold, cpu clock throttled (total events = 152620)
Jun  9 22:56:32 merkaba kernel: [40878.174737] CPU0: Package temperature above 
threshold, cpu clock throttled (total events = 152620)
Jun  9 22:56:32 merkaba kernel: [40878.174742] CPU3: Package temperature above 
threshold, cpu clock throttled (total events = 152620)
Jun  9 22:56:32 merkaba kernel: [40878.174744] CPU2: Package temperature above 
threshold, cpu clock throttled (total events = 152620)
Jun  9 22:56:32 merkaba kernel: [40878.176744] CPU3: Package temperature/speed 
normal
Jun  9 22:56:32 merkaba kernel: [40878.176746] CPU2: Package temperature/speed 
normal
Jun  9 22:56:32 merkaba kernel: [40878.176748] CPU1: Package temperature/speed 
normal
Jun  9 22:56:32 merkaba kernel: [40878.176749] CPU0: Package temperature/speed 
normal
Jun  9 22:59:11 merkaba kernel: [41037.278705] CPU3: Core temperature/speed 
normal
Jun  9 22:59:11 merkaba kernel: [41037.278707] CPU2: Core temperature/speed 
normal
Jun  9 23:01:32 merkaba kernel: [41178.225837] CPU2: Package temperature above 
threshold, cpu clock throttled (total events = 177343)
Jun  9 23:01:32 merkaba kernel: [41178.225841] CPU0: Package temperature above 
threshold, cpu clock throttled (total events = 177343)
Jun  9 23:01:32 merkaba kernel: [41178.225843] CPU3: Package temperature above 
threshold, cpu clock throttled (total events = 177343)
Jun  9 23:01:32 merkaba kernel: [41178.225845] CPU1: Package temperature above 
threshold, cpu clock throttled (total events = 177343)
Jun  9 23:01:32 merkaba kernel: [41178.237850] CPU1: Package temperature/speed 
normal
Jun  9 23:01:32 merkaba kernel: [41178.237853] CPU2: Package temperature/speed 
normal
Jun  9 23:01:32 merkaba kernel: [41178.237855] CPU0: Package temperature/speed 
normal
Jun  9 23:01:32 merkaba kernel: [41178.237856] CPU3: Package temperature/speed 
normal
Jun  9 23:01:36 merkaba 

Re: [REGRESSION] 3.15: Seems to turbo mode Intel Sandybridge Dual Core without need, overheating CPU

2014-06-09 Thread Dirk Brandewie


Hi Martin,

Can you send the output of:
   turbostat sleep 10
and
   for i in 0 1 2 3; do rdmsr  -p $i -u -f15:8 0x198; done

For the normal and bad case please.

--Dirk

On 06/09/2014 02:33 PM, Martin Steigerwald wrote:

Hi!

Added linux-pm to Cc. Also, rebooting seems to fix up the condition:

merkaba:~> grep . /sys/devices/system/cpu/cpu[0-3]/cpufreq/cpuinfo_cur_freq
/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_cur_freq:830957
/sys/devices/system/cpu/cpu1/cpufreq/cpuinfo_cur_freq:819628
/sys/devices/system/cpu/cpu2/cpufreq/cpuinfo_cur_freq:80
/sys/devices/system/cpu/cpu3/cpufreq/cpuinfo_cur_freq:813476
merkaba:~> sensors
acpitz-virtual-0
Adapter: Virtual device
temp1:+71.0°C  (crit = +98.0°C)

coretemp-isa-
Adapter: ISA adapter
Physical id 0:  +71.0°C  (high = +86.0°C, crit = +100.0°C)
Core 0: +70.0°C  (high = +86.0°C, crit = +100.0°C)
Core 1: +71.0°C  (high = +86.0°C, crit = +100.0°C)

thinkpad-isa-
Adapter: ISA adapter
fan1:3137 R


Still hot in here and after reboot and login into KDE session there is quite
some CPU activity for a while.

But way better than before.

I can test whether this also happens with ACPI cpufreq driver.

I think I didn't see this with 3.14.



Am Montag, 9. Juni 2014, 23:24:54 schrieb Martin Steigerwald:

Hi!

I get:

Jun  9 22:41:32 merkaba kernel: [39978.006479] CPU0: Package temperature/speed 
normal
Jun  9 22:41:32 merkaba kernel: [39978.006481] CPU3: Package temperature/speed 
normal
Jun  9 22:41:32 merkaba kernel: [39978.006482] CPU2: Package temperature/speed 
normal
Jun  9 22:41:32 merkaba kernel: [39978.006487] CPU1: Package temperature/speed 
normal
Jun  9 22:44:02 merkaba kernel: [40127.673372] CPU2: Core temperature above 
threshold, cpu clock throttled (total events = 56554)
Jun  9 22:44:02 merkaba kernel: [40127.673383] CPU3: Core temperature above 
threshold, cpu clock throttled (total events = 56554)
Jun  9 22:44:02 merkaba kernel: [40127.674313] CPU3: Core temperature/speed 
normal
Jun  9 22:44:02 merkaba kernel: [40127.674352] CPU2: Core temperature/speed 
normal
Jun  9 22:45:21 merkaba kernel: [40207.302287] mce: [Hardware Error]: Machine 
check events logged
Jun  9 22:46:32 merkaba kernel: [40278.054568] CPU0: Package temperature/speed 
normal
Jun  9 22:46:32 merkaba kernel: [40278.054572] CPU3: Package temperature/speed 
normal
Jun  9 22:46:32 merkaba kernel: [40278.054574] CPU2: Package temperature/speed 
normal
Jun  9 22:46:32 merkaba kernel: [40278.054578] CPU1: Package temperature/speed 
normal
Jun  9 22:48:06 merkaba kernel: [40371.570654] perf interrupt took too long (19348 
> 17857), lowering kernel.perf_event_max_sample_rate to 7000
Jun  9 22:51:32 merkaba kernel: [40578.103629] CPU3: Package temperature/speed 
normal
Jun  9 22:51:32 merkaba kernel: [40578.103633] CPU0: Package temperature/speed 
normal
Jun  9 22:51:32 merkaba kernel: [40578.103638] CPU2: Package temperature/speed 
normal
Jun  9 22:51:32 merkaba kernel: [40578.103639] CPU1: Package temperature/speed 
normal
Jun  9 22:56:32 merkaba kernel: [40878.174734] CPU1: Package temperature above 
threshold, cpu clock throttled (total events = 152620)
Jun  9 22:56:32 merkaba kernel: [40878.174737] CPU0: Package temperature above 
threshold, cpu clock throttled (total events = 152620)
Jun  9 22:56:32 merkaba kernel: [40878.174742] CPU3: Package temperature above 
threshold, cpu clock throttled (total events = 152620)
Jun  9 22:56:32 merkaba kernel: [40878.174744] CPU2: Package temperature above 
threshold, cpu clock throttled (total events = 152620)
Jun  9 22:56:32 merkaba kernel: [40878.176744] CPU3: Package temperature/speed 
normal
Jun  9 22:56:32 merkaba kernel: [40878.176746] CPU2: Package temperature/speed 
normal
Jun  9 22:56:32 merkaba kernel: [40878.176748] CPU1: Package temperature/speed 
normal
Jun  9 22:56:32 merkaba kernel: [40878.176749] CPU0: Package temperature/speed 
normal
Jun  9 22:59:11 merkaba kernel: [41037.278705] CPU3: Core temperature/speed 
normal
Jun  9 22:59:11 merkaba kernel: [41037.278707] CPU2: Core temperature/speed 
normal
Jun  9 23:01:32 merkaba kernel: [41178.225837] CPU2: Package temperature above 
threshold, cpu clock throttled (total events = 177343)
Jun  9 23:01:32 merkaba kernel: [41178.225841] CPU0: Package temperature above 
threshold, cpu clock throttled (total events = 177343)
Jun  9 23:01:32 merkaba kernel: [41178.225843] CPU3: Package temperature above 
threshold, cpu clock throttled (total events = 177343)
Jun  9 23:01:32 merkaba kernel: [41178.225845] CPU1: Package temperature above 
threshold, cpu clock throttled (total events = 177343)
Jun  9 23:01:32 merkaba kernel: [41178.237850] CPU1: Package temperature/speed 
normal
Jun  9 23:01:32 merkaba kernel: [41178.237853] CPU2: Package temperature/speed 
normal
Jun  9 23:01:32 merkaba kernel: [41178.237855] CPU0: Package temperature/speed 
normal
Jun  9 23:01:32 merkaba kernel: [41178.237856] CPU3: Package temperature/speed 
normal
Jun  9 23:01:36 merkaba 

Re: [REGRESSION] 3.15: Seems to turbo mode Intel Sandybridge Dual Core without need, overheating CPU

2014-06-09 Thread Dirk Brandewie

On 06/09/2014 03:02 PM, Martin Steigerwald wrote:

Am Montag, 9. Juni 2014, 23:41:40 schrieb Martin Steigerwald:

Am Montag, 9. Juni 2014, 23:33:43 schrieb Martin Steigerwald:

Hi!

Added linux-pm to Cc. Also, rebooting seems to fix up the condition:

merkaba:~> grep . /sys/devices/system/cpu/cpu[0-3]/cpufreq/cpuinfo_cur_freq
/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_cur_freq:830957
/sys/devices/system/cpu/cpu1/cpufreq/cpuinfo_cur_freq:819628
/sys/devices/system/cpu/cpu2/cpufreq/cpuinfo_cur_freq:80
/sys/devices/system/cpu/cpu3/cpufreq/cpuinfo_cur_freq:813476
merkaba:~> sensors
acpitz-virtual-0
Adapter: Virtual device
temp1:+71.0°C  (crit = +98.0°C)

coretemp-isa-
Adapter: ISA adapter
Physical id 0:  +71.0°C  (high = +86.0°C, crit = +100.0°C)
Core 0: +70.0°C  (high = +86.0°C, crit = +100.0°C)
Core 1: +71.0°C  (high = +86.0°C, crit = +100.0°C)

thinkpad-isa-
Adapter: ISA adapter
fan1:3137 R


Still hot in here and after reboot and login into KDE session there is quite
some CPU activity for a while.

But way better than before.

I can test whether this also happens with ACPI cpufreq driver.

I think I didn't see this with 3.14.


It's not just me:

Please change intel_pstate default to disable
https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1188647


Way better with ACPI cpufreq driver and ondemand governor:

merkaba:~> grep . /sys/devices/system/cpu/cpu[0-3]/cpufreq/cpuinfo_cur_freq
/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_cur_freq:2501000
/sys/devices/system/cpu/cpu1/cpufreq/cpuinfo_cur_freq:80
/sys/devices/system/cpu/cpu2/cpufreq/cpuinfo_cur_freq:80
/sys/devices/system/cpu/cpu3/cpufreq/cpuinfo_cur_freq:80
merkaba:~> grep . /sys/devices/system/cpu/cpu[0-3]/cpufreq/cpuinfo_cur_freq
/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_cur_freq:160
/sys/devices/system/cpu/cpu1/cpufreq/cpuinfo_cur_freq:250
/sys/devices/system/cpu/cpu2/cpufreq/cpuinfo_cur_freq:80
/sys/devices/system/cpu/cpu3/cpufreq/cpuinfo_cur_freq:160
merkaba:~> grep . /sys/devices/system/cpu/cpu[0-3]/cpufreq/cpuinfo_cur_freq
/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_cur_freq:180
/sys/devices/system/cpu/cpu1/cpufreq/cpuinfo_cur_freq:2501000
/sys/devices/system/cpu/cpu2/cpufreq/cpuinfo_cur_freq:250
/sys/devices/system/cpu/cpu3/cpufreq/cpuinfo_cur_freq:80
merkaba:~> grep . /sys/devices/system/cpu/cpu[0-3]/cpufreq/cpuinfo_cur_freq
/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_cur_freq:80
/sys/devices/system/cpu/cpu1/cpufreq/cpuinfo_cur_freq:2501000
/sys/devices/system/cpu/cpu2/cpufreq/cpuinfo_cur_freq:80
/sys/devices/system/cpu/cpu3/cpufreq/cpuinfo_cur_freq:180
merkaba:~> grep . /sys/devices/system/cpu/cpu[0-3]/cpufreq/cpuinfo_cur_freq
/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_cur_freq:120
/sys/devices/system/cpu/cpu1/cpufreq/cpuinfo_cur_freq:2501000
/sys/devices/system/cpu/cpu2/cpufreq/cpuinfo_cur_freq:80
/sys/devices/system/cpu/cpu3/cpufreq/cpuinfo_cur_freq:80


This is ondemand/acpi_cpufreq telling a well documented lie.  They tell you
what P state was requested not the frequency the core is running at.

/sys/devices/system/cpu/cpu1/cpufreq/cpuinfo_cur_freq:2501000
is code for select the highest turbo P state in ACPI frequency table
terms.  With the HW coordination on the chip all the cores will actually be
running at the frequency of the highest requested P state.

intel_pstate returns the measured/actual frequency the core ran at during
the most recent sample that the driver took.

Something is convincing intel_pstate and ondemand that one (or more) of the
cores is very busy.


merkaba:~> sensors
acpitz-virtual-0
Adapter: Virtual device
temp1:+83.0°C  (crit = +98.0°C)

coretemp-isa-
Adapter: ISA adapter
Physical id 0:  +84.0°C  (high = +86.0°C, crit = +100.0°C)
Core 0: +83.0°C  (high = +86.0°C, crit = +100.0°C)
Core 1: +84.0°C  (high = +86.0°C, crit = +100.0°C)

thinkpad-isa-
Adapter: ISA adapter
fan1:3586 RPM



It's still hot in this room, but this definitely looks saner.

Thanks,
Martin





Am Montag, 9. Juni 2014, 23:24:54 schrieb Martin Steigerwald:

Hi!

I get:

Jun  9 22:41:32 merkaba kernel: [39978.006479] CPU0: Package temperature/speed 
normal
Jun  9 22:41:32 merkaba kernel: [39978.006481] CPU3: Package temperature/speed 
normal
Jun  9 22:41:32 merkaba kernel: [39978.006482] CPU2: Package temperature/speed 
normal
Jun  9 22:41:32 merkaba kernel: [39978.006487] CPU1: Package temperature/speed 
normal
Jun  9 22:44:02 merkaba kernel: [40127.673372] CPU2: Core temperature above 
threshold, cpu clock throttled (total events = 56554)
Jun  9 22:44:02 merkaba kernel: [40127.673383] CPU3: Core temperature above 
threshold, cpu clock throttled (total events = 56554)
Jun  9 22:44:02 merkaba kernel: [40127.674313] CPU3: Core temperature/speed 
normal
Jun  9 22:44:02 merkaba kernel: [40127.674352] CPU2: Core temperature/speed 
normal
Jun  9 22:45:21 merkaba kernel: [40207.302287] mce: [Hardware Error]: 

Re: [PATCH] Documentation, intel_pstate: Add a description of the intel_pstate internal governors [v2]

2014-06-05 Thread Dirk Brandewie

On 06/05/2014 08:07 AM, Prarit Bhargava wrote:

The current documentation is incomplete wrt the intel_pstate internal
governors.  The confusion comes from the general use internal governors
which also use the names performance and powersave.  This patch
differentiates between the two sets of governors.

Cc: Viresh Kumar 
Cc: Dirk Brandewie 
Cc: Randy Dunlap 
Cc: Russell King 
Cc: Jesper Nilsson 
Cc: Viresh Kumar 
Cc: "David S. Miller" 
Cc: Ramkumar Ramachandra 
Cc: "Rafael J. Wysocki" 
Cc: linux-...@vger.kernel.org
Signed-off-by: Prarit Bhargava 


Acked-by: Dirk Brandewie 


[v2]: text update
---
  Documentation/cpu-freq/governors.txt|2 +-
  Documentation/cpu-freq/intel-pstate.txt |8 
  2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/Documentation/cpu-freq/governors.txt 
b/Documentation/cpu-freq/governors.txt
index 77ec215..c15aa75 100644
--- a/Documentation/cpu-freq/governors.txt
+++ b/Documentation/cpu-freq/governors.txt
@@ -36,7 +36,7 @@ Contents:
  1. What Is A CPUFreq Governor?
  ==

-Most cpufreq drivers (in fact, all except one, longrun) or even most
+Most cpufreq drivers (except the intel_pstate and longrun) or even most
  cpu frequency scaling algorithms only offer the CPU to be set to one
  frequency. In order to offer dynamic frequency scaling, the cpufreq
  core must be able to tell these drivers of a "target frequency". So
diff --git a/Documentation/cpu-freq/intel-pstate.txt 
b/Documentation/cpu-freq/intel-pstate.txt
index e742d21..2edd8d9 100644
--- a/Documentation/cpu-freq/intel-pstate.txt
+++ b/Documentation/cpu-freq/intel-pstate.txt
@@ -9,6 +9,14 @@ assumed to implement internal governors by the cpufreq core. 
All the
  logic for selecting the current P state is contained within the
  driver; no external governor is used by the cpufreq core.

+The Intel P-state driver has two internal governors, performance and
+powersave.  These governors differ from the generally used governors of the
+same name in the kernel.  The internal performance governor sets the
+max_perf_pct and min_perf_pct to 100; that is, the governor selects the
+highest available P state to maximize the performance of the core.  The
+internal powersave governor, selects the appropriate P state based on the
+current load on the CPU.
+
  Intel SandyBridge+ processors are supported.

  New sysfs files for controlling P state selection have been added to



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC PATCH 06/16] arm: topology: Define TC2 sched energy and provide it to scheduler

2014-06-05 Thread Dirk Brandewie

On 06/04/2014 11:52 PM, Peter Zijlstra wrote:

On Wed, Jun 04, 2014 at 11:56:55PM +0200, Rafael J. Wysocki wrote:

On Wednesday, June 04, 2014 07:27:12 PM Peter Zijlstra wrote:



Well, we eventually want to go there I think. Although we still needed
to come up with something for Intel, because I'm not at all sure how all
that works.


Do you mean power numbers or how P-states work on Intel in general?


P-states, I'm still not at all sure how all that works on Intel and what
we can sanely do with them.

Supposedly Intel has a means of setting P-states (there's a driver after
all), but then is completely free to totally ignore it and do something
entirely different anyhow.


You can request a P state per core but the package does coordination at
a package level for the P state that will be used based on all requests.
This is due to the fact that most SKUs have a single VR and PLL. So
the highest P state wins.  When a core goes idle it loses its vote
for the current package P state and that core's clock is turned off.



And while APERF/MPERF allows observing what it did, it's, afaik, nigh on
impossible to predict wtf it's going to do, and therefore any such energy
computation is going to be a PRNG at best.

Now, given all that I'm not sure what we need that P-state driver for,
so supposedly I'm missing something.


intel_pstate tries to keep the core P state as low as possible to satisfy
the given load, so when various cores go idle the package P state can be
as low as possible.  The big power win is a core going idle.



Ideally Len (or someone equally in-the-know) would explain to me how
exactly all that works and what we can rely upon. All I've gotten so far
is, you can't rely on anything, and magik. Which is entirely useless.


The only thing you can rely on is that you will get "at least" the P state
requested in the presence of hardware coordination.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC PATCH 06/16] arm: topology: Define TC2 sched energy and provide it to scheduler

2014-06-05 Thread Dirk Brandewie

On 06/04/2014 11:52 PM, Peter Zijlstra wrote:

On Wed, Jun 04, 2014 at 11:56:55PM +0200, Rafael J. Wysocki wrote:

On Wednesday, June 04, 2014 07:27:12 PM Peter Zijlstra wrote:



Well, we eventually want to go there I think. Although we still needed
to come up with something for Intel, because I'm not at all sure how all
that works.


Do you mean power numbers or how P-states work on Intel in general?


P-states, I'm still not at all sure how all that works on Intel and what
we can sanely do with them.

Supposedly Intel has a means of setting P-states (there's a driver after
all), but then is completely free to totally ignore it and do something
entirely different anyhow.


You can request a P state per core but the package does coordination at
a package level for the P state that will be used based on all requests.
This is due to the fact that most SKUs have a single VR and PLL. So
the highest P state wins.  When a core goes idle it loses its vote
for the current package P state and that core's clock is turned off.



And while APERF/MPERF allows observing what it did, it's, afaik, nigh on
impossible to predict wtf it's going to do, and therefore any such energy
computation is going to be a PRNG at best.

Now, given all that I'm not sure what we need that P-state driver for,
so supposedly I'm missing something.


intel_pstate tries to keep the core P state as low as possible to satisfy
the given load, so when various cores go idle the package P state can be
as low as possible.  The big power win is a core going idle.



Ideally Len (or someone equally in-the-know) would explain to me how
exactly all that works and what we can rely upon. All I've gotten so far
is, you can't rely on anything, and magik. Which is entirely useless.


The only thing you can rely on is that you will get "at least" the P state
requested in the presence of hardware coordination.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] Documentation, intel_pstate: Add a description of the intel_pstate internal governors [v2]

2014-06-05 Thread Dirk Brandewie

On 06/05/2014 08:07 AM, Prarit Bhargava wrote:

The current documentation is incomplete wrt the intel_pstate internal
governors.  The confusion comes from the general use internal governors
which also use the names performance and powersave.  This patch
differentiates between the two sets of governors.

Cc: Viresh Kumar <viresh.ku...@linaro.org>
Cc: Dirk Brandewie <dirk.brande...@gmail.com>
Cc: Randy Dunlap <rdun...@infradead.org>
Cc: Russell King <li...@arm.linux.org.uk>
Cc: Jesper Nilsson <jesper.nils...@axis.com>
Cc: Viresh Kumar <viresh.ku...@linaro.org>
Cc: "David S. Miller" <da...@davemloft.net>
Cc: Ramkumar Ramachandra <artag...@gmail.com>
Cc: "Rafael J. Wysocki" <r...@rjwysocki.net>
Cc: linux-...@vger.kernel.org
Signed-off-by: Prarit Bhargava <pra...@redhat.com>


Acked-by: Dirk Brandewie <dirk.j.brande...@intel.com>


[v2]: text update
---
  Documentation/cpu-freq/governors.txt|2 +-
  Documentation/cpu-freq/intel-pstate.txt |8 
  2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/Documentation/cpu-freq/governors.txt 
b/Documentation/cpu-freq/governors.txt
index 77ec215..c15aa75 100644
--- a/Documentation/cpu-freq/governors.txt
+++ b/Documentation/cpu-freq/governors.txt
@@ -36,7 +36,7 @@ Contents:
  1. What Is A CPUFreq Governor?
  ==

-Most cpufreq drivers (in fact, all except one, longrun) or even most
+Most cpufreq drivers (except the intel_pstate and longrun) or even most
  cpu frequency scaling algorithms only offer the CPU to be set to one
  frequency. In order to offer dynamic frequency scaling, the cpufreq
  core must be able to tell these drivers of a "target frequency". So
diff --git a/Documentation/cpu-freq/intel-pstate.txt 
b/Documentation/cpu-freq/intel-pstate.txt
index e742d21..2edd8d9 100644
--- a/Documentation/cpu-freq/intel-pstate.txt
+++ b/Documentation/cpu-freq/intel-pstate.txt
@@ -9,6 +9,14 @@ assumed to implement internal governors by the cpufreq core. 
All the
  logic for selecting the current P state is contained within the
  driver; no external governor is used by the cpufreq core.

+The Intel P-state driver has two internal governors, performance and
+powersave.  These governors differ from the generally used governors of the
+same name in the kernel.  The internal performance governor sets the
+max_perf_pct and min_perf_pct to 100; that is, the governor selects the
+highest available P state to maximize the performance of the core.  The
+internal powersave governor, selects the appropriate P state based on the
+current load on the CPU.
+
  Intel SandyBridge+ processors are supported.

  New sysfs files for controlling P state selection have been added to



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC PATCH] crypto: crc32c-pclmul - Use pmovzxdq to shrink K_table

2014-05-30 Thread Dirk Brandewie

On 05/30/2014 01:07 PM, Tim Chen wrote:

On Fri, 2014-05-30 at 12:38 -0700, Dirk Brandewie wrote:


Dirk,

Thanks for checking things out.

I tested on a Haswell system, and I see that the frequency
can dip below the max even when I set the min_perf_pct to 100.
Let me know if you want to log on to my system and check if
there's something I missed. It is odd that the package 1's
cores are at a much higher frequency and close to
max than package 0, once min_perf_pct is set to 100.



Can you run turbostat for a few samples? It reports an average over the sample
time.



Here it is.



You have me at a loss here. I can come in on Monday if you are around, and
we can try to figure out what is happening.

--Dirk

Tim

Package Core CPU Avg_MHz   %Busy Bzy_MHz TSC_MHz SMI  CPU%c1  
CPU%c3  CPU%c6  CPU%c7 CoreTmp  PkgTmp Pkg%pc2 Pkg%pc3 Pkg%pc6 Pkg%pc7 PkgWatt 
RAMWatt   PKG_%   RAM_%
-   -   -   00.0220482594   00.23
0.00   99.750.00  33  425.930.00   91.520.00   23.22
4.150.120.00
0   0   0   10.0619972594   00.16
0.00   99.780.00  32  427.920.00   91.550.00   16.88
1.950.060.00
0   0  28   00.0113382594   00.21
0   1   1   00.0216962594   00.11
0.00   99.870.00  33
0   1  29   00.0114552594   00.11
0   2   2   00.0116182594   00.07
0.00   99.910.00  30
0   2  30   00.0115132594   00.07
0   3   3   00.0117242594   00.08
0.00   99.910.00  31
0   3  31   00.0114472594   00.08
0   4   4   00.0117692594   00.06
0.00   99.920.00  32
0   4  32   00.0114832594   00.06
0   5   5   00.0116702594   00.07
0.00   99.920.00  29
0   5  33   00.0115152594   00.07
0   6   6   00.0116002594   00.07
0.00   99.920.00  33
0   6  34   00.0114122594   00.07
0   8   7   00.0115882594   00.07
0.00   99.920.00  30
0   8  35   00.0114322594   00.07
0   9   8   00.0116622594   00.11
0.00   99.880.00  32
0   9  36   00.0216582594   00.10
0  10   9   00.0115702594   00.07
0.00   99.910.00  32
0  10  37   00.0114682594   00.07
0  11  10   00.0116802594   00.07
0.00   99.920.00  31
0  11  38   00.0115112594   00.07
0  12  11   00.0116902594   00.08
0.00   99.910.00  30
0  12  39   00.0115602594   00.08
0  13  12   00.0216042594   00.11
0.00   99.870.00  29
0  13  40   00.0214362594   00.11
0  14  13   00.0216202594   00.09
0.00   99.890.00  29
0  14  41   00.0214402594   00.09
1   0  14   00.0316662594   00.16
0.00   99.820.00  28  363.940.00   91.500.006.34
2.200.060.00
1   0  42   30.0832632594   00.11
1   1  15   00.0121942594   00.09
0.00   99.900.00  30
1   1  43   00.0123582594   00.09
1   2  16   00.0126502594   00.08
0.00   99.910.00  28
1   2  44   00.0120322594   00.08
1   3  17   10.0323052594   04.11
0.00   95.860.00  30
1   3  45   00.0122902594   04.13
1   4  18   00.0123622594   00.09
0.00   99.900.00  28
1   4  46   00.0123252594   00.09
1   5  19   00.0123742594   00.07
0.00   99.920.00  30
1   5  47   00.0124422594   00.07
1   6  20   00.0124762594   00.08
0.00   99.910.00  30
1   6  48   00.01

Re: [RFC PATCH] crypto: crc32c-pclmul - Use pmovzxdq to shrink K_table

2014-05-30 Thread Dirk Brandewie

On 05/30/2014 12:32 PM, Tim Chen wrote:

On Fri, 2014-05-30 at 11:45 -0700, Dirk Brandewie wrote:



With turbostat from rc7.
[root@echolake turbostat]# ./turbostat
 Core CPU Avg_MHz   %Busy Bzy_MHz TSC_MHz SMI  CPU%c1  CPU%c3  
CPU%c6  CPU%c7 CoreTmp  PkgTmp Pkg%pc2 Pkg%pc3 Pkg%pc6 Pkg%pc7 PkgWatt CorWatt 
GFXWatt
-   -   10.0811783492   00.120.08
0.01   99.71  29  29   99.230.000.000.002.180.00
0.00
0   0   20.1911893492   00.220.30
0.00   99.29  29  29   99.240.000.000.002.180.00
0.00
0   4   10.1212533492   00.29
1   1   00.0310653492   00.030.00
0.00   99.93  23
1   5   00.0111043492   00.05
2   2   00.0212753492   00.220.00
0.03   99.73  24
2   6   20.1812203492   00.06
3   3   00.01 9923492   00.070.00
0.01   99.90  23
3   7   00.05 9153492   00.04
 Core CPU Avg_MHz   %Busy Bzy_MHz TSC_MHz SMI  CPU%c1  CPU%c3  
CPU%c6  CPU%c7 CoreTmp  PkgTmp Pkg%pc2 Pkg%pc3 Pkg%pc6 Pkg%pc7 PkgWatt CorWatt 
GFXWatt
-   -   10.0610343492   00.095.15
0.00   94.70  28  28   99.490.000.000.002.480.01
0.00
0   0   10.0910663492   00.170.01
0.00   99.73  28  28   99.490.000.000.002.480.01
0.00
0   4   10.1210363492   00.14
1   1   00.0410093492   00.05   20.59
0.00   79.32  24
1   5   00.02 9223492   00.07
2   2   00.03 9243492   00.150.00
0.00   99.82  25
2   6   10.1211173492   00.06
3   3   00.01 9113492   00.040.01
0.00   99.94  22
3   7   00.03 8563492   00.02
 Core CPU Avg_MHz   %Busy Bzy_MHz TSC_MHz SMI  CPU%c1  CPU%c3  
CPU%c6  CPU%c7 CoreTmp  PkgTmp Pkg%pc2 Pkg%pc3 Pkg%pc6 Pkg%pc7 PkgWatt CorWatt 
GFXWatt
-   -   10.08 8893492   00.120.03
0.06   99.71  29  29   99.320.000.000.002.210.00
0.00
0   0   10.11 8673492   00.200.02
0.22   99.44  29  29   99.320.000.000.002.210.00
0.00
0   4   10.14 9073492   00.17
1   1   10.12 8093492   00.040.11
0.01   99.73  24
1   5   00.01 7983492   00.14
2   2   00.03 8633492   00.180.00
0.01   99.78  24
2   6   10.1410133492   00.07
3   3   00.02 8533492   00.090.00
0.00   99.89  23
3   7   10.06 8153492   00.05
^C
[root@echolake turbostat]# echo 100 > 
/sys/devices/system/cpu/intel_pstate/min_perf_pct
[root@echolake turbostat]# ./turbostat
 Core CPU Avg_MHz   %Busy Bzy_MHz TSC_MHz SMI  CPU%c1  CPU%c3  
CPU%c6  CPU%c7 CoreTmp  PkgTmp Pkg%pc2 Pkg%pc3 Pkg%pc6 Pkg%pc7 PkgWatt CorWatt 
GFXWatt
-   -   10.0334893492   02.430.01
0.00   97.53  30  30   90.200.000.000.002.850.06
0.00
0   0   10.0434703492   00.090.00
0.00   99.88  30  30   90.200.000.000.002.850.06
0.00
0   4   20.0634923492   00.07
1   1   10.0234953492   00.050.03
0.00   99.90  25
1   5   00.0034943492   00.07
2   2   00.0134923492   09.530.00
0.01   90.45  25
2   6   10.0434923492   09.50
3   3   10.0334923492   00.050.01
0.00   99.91  23
3   7   10.0234933492   00.06
 Core CPU Avg_MHz   %Busy Bzy_MHz TSC_MHz SMI  CPU%c1  CPU%c3  
CPU%c6  CPU%c7 CoreTmp  PkgTmp Pkg%pc2 Pkg%pc3 Pkg%pc6 Pkg%pc7 PkgWatt CorWatt 
GFXWatt
-   -   10.0234923492   04.930.00
0.00   95.04  30  30   80.190.000.000.003.540.10
0.00
0   0   10.0234913492   00.080.01
0.00   99.89  30  30   80.190.000.000.003.540

Re: [RFC PATCH] crypto: crc32c-pclmul - Use pmovzxdq to shrink K_table

2014-05-30 Thread Dirk Brandewie
On 05/30/2014 10:56 AM, Tim Chen wrote:
> On Thu, 2014-05-29 at 21:16 -0400, Dave Jones wrote:
>> On Thu, May 29, 2014 at 06:07:16PM -0700, Tim Chen wrote:
>>   > On Thu, 2014-05-29 at 19:54 -0400, George Spelvin wrote:
>>   > > Sorry for the delay; my Ivy Bridge test machine isn't in my
>>   > > office and getting to the console to tweak the BIOS is a
>>   > > bit of a bother.
>>   > >
>>   > > Anyway, i7-4930K, turbo boost & hyperthreading disabled,
>>   > > $ cat /sys/devices/system/cpu/cpu?/cpufreq/scaling_governor
>>   > > performance
>>   > > performance
>>   > > performance
>>   > > performance
>>   > > performance
>>   > > performance
>>   > >
>>   > > Oddly, though, CPU speed still seems to be fluctuating:
>>   > > $ grep MHz /proc/cpuinfo
>>   > > cpu MHz : 1255.875
>>   > > cpu MHz : 3168.375
>>   > > cpu MHz : 3062.125
>>   > > cpu MHz : 1468.375
>>   > > cpu MHz : 1309.000
>>   > > cpu MHz : 2212.125
>>   > > $ grep MHz /proc/cpuinfo
>>   > > cpu MHz : 1255.875
>>   > > cpu MHz : 2690.250
>>   > > cpu MHz : 1255.875
>>   > > cpu MHz : 2530.875
>>   > > cpu MHz : 2212.125
>>   > > cpu MHz : 1521.500
>>   >
>>   > This is odd.  On my Ivy Bridge system the CPU speed from /proc/cpuinfo
>>   > is at max freq once I set the performance governor.
>>   > The numbers above almost look like
>>   > the cpu frequency is fluctuating and an average is taken.
>>   > What version of the kernel are you running?  Is
>>   > CONFIG_CPU_FREQ_GOV_PERFORMANCE compiled in?
>>   >
>>   > Does /sys/devices/system/cpu/cpu?/cpufreq/scaling_cur_freq
>>   > also change?
>>   >
>>   > Can you check what are the available governors in your system
>>   > and available frequencies?
>>   >
>>   > cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_available_governors
>>   > cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_available_frequencies
>>   >
>>   > If userspace governor is available, you can try set the governor
>>   > to userspace, then pin frequency to 3400 MHz (assuming that's your
>>   > max) with command like:
>>   
>> intel_pstate overrides any governor choice you make through sysfs.
>>
>>  Dave
>>
> 
> Dirk,
> 
> Wonder if this is the right behavior for intel_pstate that when I set the
> governor to performance, intel_pstate driver still adjusts
> the cpu frequencies around?

No, the value returned is a measured/delivered frequency instead of the P state
requested which is what the other governors return.

> 
> Turbostat also confirms that the frequencies are not at max,
> even though the max_perf_pct and min_perf_pct are both set at 100.
> 

I calculate frequency the same way turbostat does but my samples are a *lot* 
shorter.
 

> I ran on my HSW system with 3.15-rc7 kernel and see similar
> issue that George reported.
> 
> It is really a pain when we need to do performance benchmarking and
> need to have a constant cpu frequency.
> 

With turbostat from rc7.
[root@echolake turbostat]# ./turbostat 
Core CPU Avg_MHz   %Busy Bzy_MHz TSC_MHz SMI  CPU%c1  CPU%c3  
CPU%c6  CPU%c7 CoreTmp  PkgTmp Pkg%pc2 Pkg%pc3 Pkg%pc6 Pkg%pc7 PkgWatt CorWatt 
GFXWatt 
   -   -   10.0811783492   00.120.08
0.01   99.71  29  29   99.230.000.000.002.180.00
0.00
   0   0   20.1911893492   00.220.30
0.00   99.29  29  29   99.240.000.000.002.180.00
0.00
   0   4   10.1212533492   00.29
   1   1   00.0310653492   00.030.00
0.00   99.93  23
   1   5   00.0111043492   00.05
   2   2   00.0212753492   00.220.00
0.03   99.73  24
   2   6   20.1812203492   00.06
   3   3   00.01 9923492   00.070.00
0.01   99.90  23
   3   7   00.05 9153492   00.04
Core CPU Avg_MHz   %Busy Bzy_MHz TSC_MHz SMI  CPU%c1  CPU%c3  
CPU%c6  CPU%c7 CoreTmp  PkgTmp Pkg%pc2 Pkg%pc3 Pkg%pc6 Pkg%pc7 PkgWatt CorWatt 
GFXWatt 
   -   -   10.0610343492   00.095.15
0.00   94.70  28  28   99.490.000.000.002.480.01
0.00
   0   0   10.0910663492   00.170.01
0.00   99.73  28  28   99.490.000.000.002.480.01
0.00
   0   4   10.1210363492   00.14
   1   1   00.0410093492   00.05   20.59
0.00   79.32  24
   1   5   00.02 9223492   00.07
   2   2   00.03 9243492   00.150.00
0.00   99.82  25
   2   6   10.1211173492   00.06
   3   3   0

Re: [RFC PATCH] crypto: crc32c-pclmul - Use pmovzxdq to shrink K_table

2014-05-30 Thread Dirk Brandewie
On 05/30/2014 10:56 AM, Tim Chen wrote:
 On Thu, 2014-05-29 at 21:16 -0400, Dave Jones wrote:
 On Thu, May 29, 2014 at 06:07:16PM -0700, Tim Chen wrote:
On Thu, 2014-05-29 at 19:54 -0400, George Spelvin wrote:
 Sorry for the delay; my Ivy Bridge test machine isn't in my
 office and getting to the console to tweak the BIOS is a
 bit of a bother.

 Anyway, i7-4930K, turbo boost & hyperthreading disabled,
 $ cat /sys/devices/system/cpu/cpu?/cpufreq/scaling_governor
 performance
 performance
 performance
 performance
 performance
 performance

 Oddly, though, CPU speed still seems to be fluctuating:
 $ grep MHz /proc/cpuinfo
 cpu MHz : 1255.875
 cpu MHz : 3168.375
 cpu MHz : 3062.125
 cpu MHz : 1468.375
 cpu MHz : 1309.000
 cpu MHz : 2212.125
 $ grep MHz /proc/cpuinfo
 cpu MHz : 1255.875
 cpu MHz : 2690.250
 cpu MHz : 1255.875
 cpu MHz : 2530.875
 cpu MHz : 2212.125
 cpu MHz : 1521.500
   
This is odd.  On my Ivy Bridge system the CPU speed from /proc/cpuinfo
is at max freq once I set the performance governor.
The numbers above almost look like
the cpu frequency is fluctuating and an average is taken.
What version of the kernel are you running?  Is
CONFIG_CPU_FREQ_GOV_PERFORMANCE compiled in?
   
Does /sys/devices/system/cpu/cpu?/cpufreq/scaling_cur_freq
also changes?
   
Can you check what are the available governors in your system
and available frequencies?
   
cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_available_governors
cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_available_frequencies
   
If userspace governor is available, you can try set the governor
to userspace, then pin frequency to 3400 MHz (assuming that's your
max) with command like:
   
 intel_pstate overrides any governor choice you make through sysfs.

  Dave

 
 Dirk,
 
 Wonder if this is the right behavior for intel_pstate that when I set the
 governor to performance, intel_pstate driver still adjusts
 the cpu frequencies around?

No, the value returned is a measured/delivered frequency instead of the P state
requested which is what the other governors return.

 
 Turbostat also confirms that the frequencies are not at max,
 even though the max_perf_pct and min_perf_pct are both set at 100.
 

I calculate frequency the same way turbostat does but my samples are a *lot* 
shorter.
 

 I ran on my HSW system with 3.15-rc7 kernel and see similar
 issue that George reported.
 
 It is really a pain when we need to do performance benchmarking and
 need to have a constant cpu frequency.
 

With turbostat from rc7.
[root@echolake turbostat]# ./turbostat 
Core CPU Avg_MHz   %Busy Bzy_MHz TSC_MHz SMI  CPU%c1  CPU%c3  
CPU%c6  CPU%c7 CoreTmp  PkgTmp Pkg%pc2 Pkg%pc3 Pkg%pc6 Pkg%pc7 PkgWatt CorWatt 
GFXWatt 
   -   -   10.0811783492   00.120.08
0.01   99.71  29  29   99.230.000.000.002.180.00
0.00
   0   0   20.1911893492   00.220.30
0.00   99.29  29  29   99.240.000.000.002.180.00
0.00
   0   4   10.1212533492   00.29
   1   1   00.0310653492   00.030.00
0.00   99.93  23
   1   5   00.0111043492   00.05
   2   2   00.0212753492   00.220.00
0.03   99.73  24
   2   6   20.1812203492   00.06
   3   3   00.01 9923492   00.070.00
0.01   99.90  23
   3   7   00.05 9153492   00.04
Core CPU Avg_MHz   %Busy Bzy_MHz TSC_MHz SMI  CPU%c1  CPU%c3  
CPU%c6  CPU%c7 CoreTmp  PkgTmp Pkg%pc2 Pkg%pc3 Pkg%pc6 Pkg%pc7 PkgWatt CorWatt 
GFXWatt 
   -   -   10.0610343492   00.095.15
0.00   94.70  28  28   99.490.000.000.002.480.01
0.00
   0   0   10.0910663492   00.170.01
0.00   99.73  28  28   99.490.000.000.002.480.01
0.00
   0   4   10.1210363492   00.14
   1   1   00.0410093492   00.05   20.59
0.00   79.32  24
   1   5   00.02 9223492   00.07
   2   2   00.03 9243492   00.150.00
0.00   99.82  25
   2   6   10.1211173492   00.06
   3   3   00.01 9113492   00.040.01
0.00   99.94  22
   3   7   00.03 8563492   00.02
Core CPU Avg_MHz   %Busy Bzy_MHz TSC_MHz SMI  CPU%c1  

Re: [RFC PATCH] crypto: crc32c-pclmul - Use pmovzxdq to shrink K_table

2014-05-30 Thread Dirk Brandewie

On 05/30/2014 12:32 PM, Tim Chen wrote:

On Fri, 2014-05-30 at 11:45 -0700, Dirk Brandewie wrote:



With turbostat from rc7.
[root@echolake turbostat]# ./turbostat
 Core CPU Avg_MHz   %Busy Bzy_MHz TSC_MHz SMI  CPU%c1  CPU%c3  
CPU%c6  CPU%c7 CoreTmp  PkgTmp Pkg%pc2 Pkg%pc3 Pkg%pc6 Pkg%pc7 PkgWatt CorWatt 
GFXWatt
-   -   10.0811783492   00.120.08
0.01   99.71  29  29   99.230.000.000.002.180.00
0.00
0   0   20.1911893492   00.220.30
0.00   99.29  29  29   99.240.000.000.002.180.00
0.00
0   4   10.1212533492   00.29
1   1   00.0310653492   00.030.00
0.00   99.93  23
1   5   00.0111043492   00.05
2   2   00.0212753492   00.220.00
0.03   99.73  24
2   6   20.1812203492   00.06
3   3   00.01 9923492   00.070.00
0.01   99.90  23
3   7   00.05 9153492   00.04
 Core CPU Avg_MHz   %Busy Bzy_MHz TSC_MHz SMI  CPU%c1  CPU%c3  
CPU%c6  CPU%c7 CoreTmp  PkgTmp Pkg%pc2 Pkg%pc3 Pkg%pc6 Pkg%pc7 PkgWatt CorWatt 
GFXWatt
-   -   10.0610343492   00.095.15
0.00   94.70  28  28   99.490.000.000.002.480.01
0.00
0   0   10.0910663492   00.170.01
0.00   99.73  28  28   99.490.000.000.002.480.01
0.00
0   4   10.1210363492   00.14
1   1   00.0410093492   00.05   20.59
0.00   79.32  24
1   5   00.02 9223492   00.07
2   2   00.03 9243492   00.150.00
0.00   99.82  25
2   6   10.1211173492   00.06
3   3   00.01 9113492   00.040.01
0.00   99.94  22
3   7   00.03 8563492   00.02
 Core CPU Avg_MHz   %Busy Bzy_MHz TSC_MHz SMI  CPU%c1  CPU%c3  
CPU%c6  CPU%c7 CoreTmp  PkgTmp Pkg%pc2 Pkg%pc3 Pkg%pc6 Pkg%pc7 PkgWatt CorWatt 
GFXWatt
-   -   10.08 8893492   00.120.03
0.06   99.71  29  29   99.320.000.000.002.210.00
0.00
0   0   10.11 8673492   00.200.02
0.22   99.44  29  29   99.320.000.000.002.210.00
0.00
0   4   10.14 9073492   00.17
1   1   10.12 8093492   00.040.11
0.01   99.73  24
1   5   00.01 7983492   00.14
2   2   00.03 8633492   00.180.00
0.01   99.78  24
2   6   10.1410133492   00.07
3   3   00.02 8533492   00.090.00
0.00   99.89  23
3   7   10.06 8153492   00.05
^C
[root@echolake turbostat]# echo 100 > /sys/devices/system/cpu/intel_pstate/min_perf_pct
[root@echolake turbostat]# ./turbostat
 Core CPU Avg_MHz   %Busy Bzy_MHz TSC_MHz SMI  CPU%c1  CPU%c3  
CPU%c6  CPU%c7 CoreTmp  PkgTmp Pkg%pc2 Pkg%pc3 Pkg%pc6 Pkg%pc7 PkgWatt CorWatt 
GFXWatt
-   -   10.0334893492   02.430.01
0.00   97.53  30  30   90.200.000.000.002.850.06
0.00
0   0   10.0434703492   00.090.00
0.00   99.88  30  30   90.200.000.000.002.850.06
0.00
0   4   20.0634923492   00.07
1   1   10.0234953492   00.050.03
0.00   99.90  25
1   5   00.0034943492   00.07
2   2   00.0134923492   09.530.00
0.01   90.45  25
2   6   10.0434923492   09.50
3   3   10.0334923492   00.050.01
0.00   99.91  23
3   7   10.0234933492   00.06
 Core CPU Avg_MHz   %Busy Bzy_MHz TSC_MHz SMI  CPU%c1  CPU%c3  
CPU%c6  CPU%c7 CoreTmp  PkgTmp Pkg%pc2 Pkg%pc3 Pkg%pc6 Pkg%pc7 PkgWatt CorWatt 
GFXWatt
-   -   10.0234923492   04.930.00
0.00   95.04  30  30   80.190.000.000.003.540.10
0.00
0   0   10.0234913492   00.080.01
0.00   99.89  30  30   80.190.000.000.003.540.10

Re: [RFC PATCH] crypto: crc32c-pclmul - Use pmovzxdq to shrink K_table

2014-05-30 Thread Dirk Brandewie

On 05/30/2014 01:07 PM, Tim Chen wrote:

On Fri, 2014-05-30 at 12:38 -0700, Dirk Brandewie wrote:


Dirk,

Thanks for checking things out.

I tested on a Haswell system, and I see that the frequency
can dip below the max even when I set the min_perf_pct to 100.
Let me know if you want to log on to my system and check if
there's something I missed. It is odd that the package 1's
cores are at a much higher frequency and close to
max than package 0, once min_perf_pct is set to 100.



Can you run turbostat for a few samples? It reports an average over the sample
time.



Here it is.



You have me at a loss here I can come in on Monday if you are around and
we can try to figure out what is happening.

--Dirk

Tim

Package Core CPU Avg_MHz   %Busy Bzy_MHz TSC_MHz SMI  CPU%c1  
CPU%c3  CPU%c6  CPU%c7 CoreTmp  PkgTmp Pkg%pc2 Pkg%pc3 Pkg%pc6 Pkg%pc7 PkgWatt 
RAMWatt   PKG_%   RAM_%
-   -   -   00.0220482594   00.23
0.00   99.750.00  33  425.930.00   91.520.00   23.22
4.150.120.00
0   0   0   10.0619972594   00.16
0.00   99.780.00  32  427.920.00   91.550.00   16.88
1.950.060.00
0   0  28   00.0113382594   00.21
0   1   1   00.0216962594   00.11
0.00   99.870.00  33
0   1  29   00.0114552594   00.11
0   2   2   00.0116182594   00.07
0.00   99.910.00  30
0   2  30   00.0115132594   00.07
0   3   3   00.0117242594   00.08
0.00   99.910.00  31
0   3  31   00.0114472594   00.08
0   4   4   00.0117692594   00.06
0.00   99.920.00  32
0   4  32   00.0114832594   00.06
0   5   5   00.0116702594   00.07
0.00   99.920.00  29
0   5  33   00.0115152594   00.07
0   6   6   00.0116002594   00.07
0.00   99.920.00  33
0   6  34   00.0114122594   00.07
0   8   7   00.0115882594   00.07
0.00   99.920.00  30
0   8  35   00.0114322594   00.07
0   9   8   00.0116622594   00.11
0.00   99.880.00  32
0   9  36   00.0216582594   00.10
0  10   9   00.0115702594   00.07
0.00   99.910.00  32
0  10  37   00.0114682594   00.07
0  11  10   00.0116802594   00.07
0.00   99.920.00  31
0  11  38   00.0115112594   00.07
0  12  11   00.0116902594   00.08
0.00   99.910.00  30
0  12  39   00.0115602594   00.08
0  13  12   00.0216042594   00.11
0.00   99.870.00  29
0  13  40   00.0214362594   00.11
0  14  13   00.0216202594   00.09
0.00   99.890.00  29
0  14  41   00.0214402594   00.09
1   0  14   00.0316662594   00.16
0.00   99.820.00  28  363.940.00   91.500.006.34
2.200.060.00
1   0  42   30.0832632594   00.11
1   1  15   00.0121942594   00.09
0.00   99.900.00  30
1   1  43   00.0123582594   00.09
1   2  16   00.0126502594   00.08
0.00   99.910.00  28
1   2  44   00.0120322594   00.08
1   3  17   10.0323052594   04.11
0.00   95.860.00  30
1   3  45   00.0122902594   04.13
1   4  18   00.0123622594   00.09
0.00   99.900.00  28
1   4  46   00.0123252594   00.09
1   5  19   00.0123742594   00.07
0.00   99.920.00  30
1   5  47   00.0124422594   00.07
1   6  20   00.0124762594   00.08
0.00   99.910.00  30
1   6  48   00.01

Re: [PATCH] cpufreq: intel_pstate: Remove unused member name of cpudata

2014-05-21 Thread Dirk Brandewie

On 05/20/2014 11:12 AM, Stratos Karafotis wrote:

Although, a value is assigned to member name of struct cpudata,
it is never used.

We can safely remove it.

Signed-off-by: Stratos Karafotis 


Acked-by: Dirk Brandewie 

---
  drivers/cpufreq/intel_pstate.c | 4 
  1 file changed, 4 deletions(-)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 24a534a..a6d5afa 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -88,8 +88,6 @@ struct _pid {
  struct cpudata {
int cpu;

-   char name[64];
-
struct timer_list timer;

struct pstate_data pstate;
@@ -544,8 +542,6 @@ static inline void intel_pstate_pstate_decrease(struct 
cpudata *cpu, int steps)

  static void intel_pstate_get_cpu_pstates(struct cpudata *cpu)
  {
-   sprintf(cpu->name, "Intel 2nd generation core");
-
cpu->pstate.min_pstate = pstate_funcs.get_min();
cpu->pstate.max_pstate = pstate_funcs.get_max();
cpu->pstate.turbo_pstate = pstate_funcs.get_turbo();



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] cpufreq: intel_pstate: Remove unused member name of cpudata

2014-05-21 Thread Dirk Brandewie

On 05/20/2014 11:12 AM, Stratos Karafotis wrote:

Although, a value is assigned to member name of struct cpudata,
it is never used.

We can safely remove it.

Signed-off-by: Stratos Karafotis strat...@semaphore.gr


Acked-by: Dirk Brandewie dirk.j.brande...@intel.com

---
  drivers/cpufreq/intel_pstate.c | 4 
  1 file changed, 4 deletions(-)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 24a534a..a6d5afa 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -88,8 +88,6 @@ struct _pid {
  struct cpudata {
int cpu;

-   char name[64];
-
struct timer_list timer;

struct pstate_data pstate;
@@ -544,8 +542,6 @@ static inline void intel_pstate_pstate_decrease(struct 
cpudata *cpu, int steps)

  static void intel_pstate_get_cpu_pstates(struct cpudata *cpu)
  {
-   sprintf(cpu->name, "Intel 2nd generation core");
-
cpu->pstate.min_pstate = pstate_funcs.get_min();
cpu->pstate.max_pstate = pstate_funcs.get_max();
cpu->pstate.turbo_pstate = pstate_funcs.get_turbo();



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 4/5] intel_pstate: Remove C0 tracking

2014-05-12 Thread Dirk Brandewie

On 05/12/2014 05:16 AM, Rafael J. Wysocki wrote:

On Monday, May 12, 2014 05:27:25 AM Stratos Karafotis wrote:

Hi,



[cut]



With this patch, my CPU (Core i7-3770 @ 3.90GHz) seems to never use lowest
frequencies. Even on an idle system I get always ~2GHz. Normally,
on an idle system it used to be 1.6GHz.
On very small loads (mp3 decoding) the CPU goes up to 2.7 GHz (it used to
be 1.6GHz)

Reverting, this patch on my local build, the problem is resolved.


Dirk, seriously, I can't regard this as a fix that can go into -rc6.



Ok I will resubmit after more testing


Which of the other patch in the series are must-go for 3.15?  [1-2/5] I guess?
And do we need [2/5] it in -stable too?


1/5  is for stable it fixes a random MCE on baytrail.
2/5  is for stable; it should have gone with the stop_cpu patch
5/5  can go too since it is just adding CPU IDs






--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 4/5] intel_pstate: Remove C0 tracking

2014-05-12 Thread Dirk Brandewie

On 05/12/2014 05:16 AM, Rafael J. Wysocki wrote:

On Monday, May 12, 2014 05:27:25 AM Stratos Karafotis wrote:

Hi,



[cut]



With this patch, my CPU (Core i7-3770 @ 3.90GHz) seems to never use lowest
frequencies. Even on an idle system I get always ~2GHz. Normally,
on an idle system it used to be 1.6GHz.
On very small loads (mp3 decoding) the CPU goes up to 2.7 GHz (it used to
be 1.6GHz)

Reverting, this patch on my local build, the problem is resolved.


Dirk, seriously, I can't regard this as a fix that can go into -rc6.



Ok I will resubmit after more testing


Which of the other patch in the series are must-go for 3.15?  [1-2/5] I guess?
And do we need [2/5] it in -stable too?


1/5  is for stable it fixes a random MCE on baytrail.
2/5  is for stable; it should have gone with the stop_cpu patch
5/5  can go too since it is just adding CPU IDs






--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 0/5] updates for intel_pstate

2014-05-08 Thread Dirk Brandewie

On 05/08/2014 01:30 PM, Rafael J. Wysocki wrote:

On Thursday, May 08, 2014 12:57:22 PM dirk.brande...@gmail.com wrote:

From: Dirk Brandewie 

Patches 1-4 are bugfixes.  Patch 4 specifically removes the wreckage
caused by C0 tracking change.


I would appreciate sending such stuff before the -rc4 time frame.


Agreed.

Patch 1 is a resend the others were sent as soon as they were ready/tested




Patch 5 Adds new CPU IDs for the Broadwell processors.

Dirk Brandewie (5):
   intel_pstate: Set turbo VID for Baytrail
   intel_pstate: remove setting P state to MAX on init
   intel_pstate: Fix fixed point rounding macro
   intel_pstate: Remove C0 tracking
   intel_pstate: Add CPU IDs for Broadwell processors

  drivers/cpufreq/intel_pstate.c | 52 +-
  1 file changed, 21 insertions(+), 31 deletions(-)


I'll queue them up for 3.15-rc6, but that's a bit too late.

Next time I may just wait for the next merge window, fixes or not.


OK the reporters of the issues already have the fixes in hand.






--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC PATCH] cpufreq: intel_pstate: Change the calculation of next pstate

2014-05-08 Thread Dirk Brandewie
On 05/05/2014 04:57 PM, Stratos Karafotis wrote:
> Currently the driver calculates the next pstate proportional to
> core_busy factor, scaled by the ratio max_pstate / current_pstate.
> 
> Using the scaled load (core_busy) to calculate the next pstate
> is not always correct, because there are cases that the load is
> independent from current pstate. For example, a tight 'for' loop
> through many sampling intervals will cause a load of 100% in
> every pstate.
> 
> So, change the above method and calculate the next pstate with
> the assumption that the next pstate should not depend on the
> current pstate. The next pstate should only be proportional
> to measured load. Use the linear function to calculate the load:
> 
> Next P-state = A + B * load
> 
> where A = min_state and B = (max_pstate - min_pstate) / 100
> If turbo is enabled the B = (turbo_pstate - min_pstate) / 100
> The load is calculated using the kernel time functions.
> 

This will hurt your power numbers under "normal" conditions where you
are not running a performance workload. Consider the following:

   1. The system is idle, all core at min P state and utilization is low say < 
10%
   2. You run something that drives the load as seen by the kernel to 100%
  which is scaled by the current P state.

This would cause the P state to go from min -> max in one step.  Which is
what you want if you are only looking at a single core.  But this will also
drag every core in the package to the max P state as well.  This would be fine
if the power vs frequency curve was linear all the cores would finish
their work faster and go idle sooner (race to halt) and maybe spend
more time in a deeper C state which dwarfs the amount of power we can
save by controlling P states. Unfortunately this is *not* the case, 
power vs frequency curve is non-linear and gets very steep in the turbo
range.  If it were linear there would be no reason to have P state
control you could select the highest P state and walk away.

Being conservative on the way up and aggressive on the way down gives you
the best power efficiency on non-benchmark loads.  Most benchmarks
are pretty useless for measuring power efficiency (unless they were
designed for it) since they are measuring how fast something can be
done which is measuring the efficiency at max performance.

The performance issues you pointed out were caused by commit
fcb6a15c intel_pstate: Take core C0 time into account for core busy calculation
and the ensuing problems it caused. These have been fixed in the patch set

   https://lkml.org/lkml/2014/5/8/574

The performance comparison between before/after this patch set, your patch
and ondemand/acpi_cpufreq is available at:
http://openbenchmarking.org/result/1405085-PL-C0200965993
ffmpeg was added to the set of benchmarks because there was a regression
reported against this benchmark as well.
https://bugzilla.kernel.org/show_bug.cgi?id=75121

--Dirk



 
   
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 2/5] intel_pstate: remove setting P state to MAX on init

2014-05-08 Thread dirk . brandewie
From: Dirk Brandewie 

Setting the P state of the core to max at init time is a hold over
from early implementation of intel_pstate where intel_pstate disabled
cpufreq and loaded VERY early in the boot sequence.  This was to
ensure that intel_pstate did not affect boot time. This is not needed
now that intel_pstate is a cpufreq driver.

Removing this covers the case where a CPU has gone through a manual
CPU offline/online cycle and the P state is set to MAX on init and the
CPU immediately goes idle.  Due to HW coordination the P state request
on the idle CPU will drag all cores to MAX P state until the load is
reevaluated when the core goes non-idle.

Reported-by: Patrick Marlier 

Signed-off-by: Dirk Brandewie 
---
 drivers/cpufreq/intel_pstate.c | 13 +
 1 file changed, 1 insertion(+), 12 deletions(-)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 39c4f85..eab8ccf 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -554,12 +554,7 @@ static void intel_pstate_get_cpu_pstates(struct cpudata 
*cpu)
 
if (pstate_funcs.get_vid)
pstate_funcs.get_vid(cpu);
-
-   /*
-* goto max pstate so we don't slow up boot if we are built-in if we are
-* a module we will take care of it during normal operation
-*/
-   intel_pstate_set_pstate(cpu, cpu->pstate.max_pstate);
+   intel_pstate_set_pstate(cpu, cpu->pstate.min_pstate);
 }
 
 static inline void intel_pstate_calc_busy(struct cpudata *cpu,
@@ -704,11 +699,6 @@ static int intel_pstate_init_cpu(unsigned int cpunum)
cpu = all_cpu_data[cpunum];
 
intel_pstate_get_cpu_pstates(cpu);
-   if (!cpu->pstate.current_pstate) {
-   all_cpu_data[cpunum] = NULL;
-   kfree(cpu);
-   return -ENODATA;
-   }
 
cpu->cpu = cpunum;
 
@@ -719,7 +709,6 @@ static int intel_pstate_init_cpu(unsigned int cpunum)
cpu->timer.expires = jiffies + HZ/100;
intel_pstate_busy_pid_reset(cpu);
intel_pstate_sample(cpu);
-   intel_pstate_set_pstate(cpu, cpu->pstate.max_pstate);
 
add_timer_on(&cpu->timer, cpunum);
 
-- 
1.9.0

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 4/5] intel_pstate: Remove C0 tracking

2014-05-08 Thread dirk . brandewie
From: Dirk Brandewie 

Commit fcb6a15c intel_pstate: Take core C0 time into account for core busy
introduced a regression referenced below.  The issue with "lockup"
after suspend that this commit was addressing is now dealt with in the
suspend path.

References:
   https://bugzilla.kernel.org/show_bug.cgi?id=66581
   https://bugzilla.kernel.org/show_bug.cgi?id=75121

Reported-by: Doug Smythies 
Signed-off-by: Dirk Brandewie 
---
 drivers/cpufreq/intel_pstate.c | 13 +
 1 file changed, 1 insertion(+), 12 deletions(-)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index bb20881..4c26faf 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -59,7 +59,6 @@ struct sample {
int32_t core_pct_busy;
u64 aperf;
u64 mperf;
-   unsigned long long tsc;
int freq;
 };
 
@@ -100,7 +99,6 @@ struct cpudata {
 
u64 prev_aperf;
u64 prev_mperf;
-   unsigned long long prev_tsc;
struct sample sample;
 };
 
@@ -561,46 +559,37 @@ static inline void intel_pstate_calc_busy(struct cpudata 
*cpu,
struct sample *sample)
 {
int32_t core_pct;
-   int32_t c0_pct;
 
core_pct = div_fp(int_tofp((sample->aperf)),
int_tofp((sample->mperf)));
core_pct = mul_fp(core_pct, int_tofp(100));
FP_ROUNDUP(core_pct);
 
-   c0_pct = div_fp(int_tofp(sample->mperf), int_tofp(sample->tsc));
-
sample->freq = fp_toint(
mul_fp(int_tofp(cpu->pstate.max_pstate * 1000), core_pct));
 
-   sample->core_pct_busy = mul_fp(core_pct, c0_pct);
+   sample->core_pct_busy = core_pct;
 }
 
 static inline void intel_pstate_sample(struct cpudata *cpu)
 {
u64 aperf, mperf;
-   unsigned long long tsc;
 
rdmsrl(MSR_IA32_APERF, aperf);
rdmsrl(MSR_IA32_MPERF, mperf);
-   tsc = native_read_tsc();
 
aperf = aperf >> FRAC_BITS;
mperf = mperf >> FRAC_BITS;
-   tsc = tsc >> FRAC_BITS;
 
cpu->sample.aperf = aperf;
cpu->sample.mperf = mperf;
-   cpu->sample.tsc = tsc;
cpu->sample.aperf -= cpu->prev_aperf;
cpu->sample.mperf -= cpu->prev_mperf;
-   cpu->sample.tsc -= cpu->prev_tsc;
 
intel_pstate_calc_busy(cpu, &cpu->sample);
 
cpu->prev_aperf = aperf;
cpu->prev_mperf = mperf;
-   cpu->prev_tsc = tsc;
 }
 
 static inline void intel_pstate_set_sample_time(struct cpudata *cpu)
-- 
1.9.0

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 0/5] updates for intel_pstate

2014-05-08 Thread dirk . brandewie
From: Dirk Brandewie 

Patches 1-4 are bugfixes.  Patch 4 specifically removes the wreckage
caused by C0 tracking change. 

Patch 5 Adds new CPU IDs for the Broadwell processors.

Dirk Brandewie (5):
  intel_pstate: Set turbo VID for Baytrail
  intel_pstate: remove setting P state to MAX on init
  intel_pstate: Fix fixed point rounding macro
  intel_pstate: Remove C0 tracking
  intel_pstate: Add CPU IDs for Broadwell processors

 drivers/cpufreq/intel_pstate.c | 52 +-
 1 file changed, 21 insertions(+), 31 deletions(-)

-- 
1.9.0

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 5/5] intel_pstate: Add CPU IDs for Broadwell processors

2014-05-08 Thread dirk . brandewie
From: Dirk Brandewie 

Add support for the Broadwell processors.

Signed-off-by: Dirk Brandewie 
---
 drivers/cpufreq/intel_pstate.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 4c26faf..6ef6f7f 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -663,10 +663,13 @@ static const struct x86_cpu_id intel_pstate_cpu_ids[] = {
ICPU(0x37, byt_params),
ICPU(0x3a, core_params),
ICPU(0x3c, core_params),
+   ICPU(0x3d, core_params),
ICPU(0x3e, core_params),
ICPU(0x3f, core_params),
ICPU(0x45, core_params),
ICPU(0x46, core_params),
+   ICPU(0x4f, core_params),
+   ICPU(0x56, core_params),
{}
 };
 MODULE_DEVICE_TABLE(x86cpu, intel_pstate_cpu_ids);
-- 
1.9.0

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 3/5] intel_pstate: Fix fixed point rounding macro

2014-05-08 Thread dirk . brandewie
From: Dirk Brandewie 

Change the FP_ROUNDUP macro to add 0.5 in fixed point representation
instead of 1.0

Signed-off-by: Dirk Brandewie 
---
 drivers/cpufreq/intel_pstate.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index eab8ccf..bb20881 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -43,7 +43,7 @@
 #define FRAC_BITS 6
 #define int_tofp(X) ((int64_t)(X) << FRAC_BITS)
 #define fp_toint(X) ((X) >> FRAC_BITS)
-#define FP_ROUNDUP(X) ((X) += 1 << FRAC_BITS)
+#define FP_ROUNDUP(X) ((X) += 1 << (FRAC_BITS-1))
 
 static inline int32_t mul_fp(int32_t x, int32_t y)
 {
-- 
1.9.0

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 3/5] intel_pstate: Fix fixed point rounding macro

2014-05-08 Thread dirk . brandewie
From: Dirk Brandewie dirk.j.brande...@intel.com

Change the FP_ROUNDUP macro to add 0.5 in fixed point representation
instead of 1.0

Signed-off-by: Dirk Brandewie dirk.j.brande...@intel.com
---
 drivers/cpufreq/intel_pstate.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index eab8ccf..bb20881 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -43,7 +43,7 @@
 #define FRAC_BITS 6
 #define int_tofp(X) ((int64_t)(X) << FRAC_BITS)
 #define fp_toint(X) ((X) >> FRAC_BITS)
-#define FP_ROUNDUP(X) ((X) += 1 << FRAC_BITS)
+#define FP_ROUNDUP(X) ((X) += 1 << (FRAC_BITS-1))
 
 static inline int32_t mul_fp(int32_t x, int32_t y)
 {
-- 
1.9.0

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 5/5] intel_pstate: Add CPU IDs for Broadwell processors

2014-05-08 Thread dirk . brandewie
From: Dirk Brandewie dirk.j.brande...@intel.com

Add support for the Broadwell processors.

Signed-off-by: Dirk Brandewie dirk.j.brande...@intel.com
---
 drivers/cpufreq/intel_pstate.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 4c26faf..6ef6f7f 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -663,10 +663,13 @@ static const struct x86_cpu_id intel_pstate_cpu_ids[] = {
ICPU(0x37, byt_params),
ICPU(0x3a, core_params),
ICPU(0x3c, core_params),
+   ICPU(0x3d, core_params),
ICPU(0x3e, core_params),
ICPU(0x3f, core_params),
ICPU(0x45, core_params),
ICPU(0x46, core_params),
+   ICPU(0x4f, core_params),
+   ICPU(0x56, core_params),
{}
 };
 MODULE_DEVICE_TABLE(x86cpu, intel_pstate_cpu_ids);
-- 
1.9.0

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 0/5] updates for intel_pstate

2014-05-08 Thread dirk . brandewie
From: Dirk Brandewie dirk.j.brande...@intel.com

Patches 1-4 are bugfixes.  Patch 4 specifically removes the wreckage
caused by C0 tracking change. 

Patch 5 Adds new CPU IDs for the Broadwell processors.

Dirk Brandewie (5):
  intel_pstate: Set turbo VID for Baytrail
  intel_pstate: remove setting P state to MAX on init
  intel_pstate: Fix fixed point rounding macro
  intel_pstate: Remove C0 tracking
  intel_pstate: Add CPU IDs for Broadwell processors

 drivers/cpufreq/intel_pstate.c | 52 +-
 1 file changed, 21 insertions(+), 31 deletions(-)

-- 
1.9.0

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 4/5] intel_pstate: Remove C0 tracking

2014-05-08 Thread dirk . brandewie
From: Dirk Brandewie dirk.j.brande...@intel.com

Commit fcb6a15c "intel_pstate: Take core C0 time into account for core busy"
introduced a regression referenced below.  The issue with lockup
after suspend that this commit was addressing is now dealt with in the
suspend path.

References:
   https://bugzilla.kernel.org/show_bug.cgi?id=66581
   https://bugzilla.kernel.org/show_bug.cgi?id=75121

Reported-by: Doug Smythies dsmyth...@telus.net
Signed-off-by: Dirk Brandewie dirk.j.brande...@intel.com
---
 drivers/cpufreq/intel_pstate.c | 13 +
 1 file changed, 1 insertion(+), 12 deletions(-)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index bb20881..4c26faf 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -59,7 +59,6 @@ struct sample {
int32_t core_pct_busy;
u64 aperf;
u64 mperf;
-   unsigned long long tsc;
int freq;
 };
 
@@ -100,7 +99,6 @@ struct cpudata {
 
u64 prev_aperf;
u64 prev_mperf;
-   unsigned long long prev_tsc;
struct sample sample;
 };
 
@@ -561,46 +559,37 @@ static inline void intel_pstate_calc_busy(struct cpudata 
*cpu,
struct sample *sample)
 {
int32_t core_pct;
-   int32_t c0_pct;
 
core_pct = div_fp(int_tofp((sample->aperf)),
int_tofp((sample->mperf)));
core_pct = mul_fp(core_pct, int_tofp(100));
FP_ROUNDUP(core_pct);
 
-   c0_pct = div_fp(int_tofp(sample->mperf), int_tofp(sample->tsc));
-
sample->freq = fp_toint(
mul_fp(int_tofp(cpu->pstate.max_pstate * 1000), core_pct));
 
-   sample->core_pct_busy = mul_fp(core_pct, c0_pct);
+   sample->core_pct_busy = core_pct;
 }
 
 static inline void intel_pstate_sample(struct cpudata *cpu)
 {
u64 aperf, mperf;
-   unsigned long long tsc;
 
rdmsrl(MSR_IA32_APERF, aperf);
rdmsrl(MSR_IA32_MPERF, mperf);
-   tsc = native_read_tsc();
 
aperf = aperf >> FRAC_BITS;
mperf = mperf >> FRAC_BITS;
-   tsc = tsc >> FRAC_BITS;
 
cpu->sample.aperf = aperf;
cpu->sample.mperf = mperf;
-   cpu->sample.tsc = tsc;
cpu->sample.aperf -= cpu->prev_aperf;
cpu->sample.mperf -= cpu->prev_mperf;
-   cpu->sample.tsc -= cpu->prev_tsc;
 
intel_pstate_calc_busy(cpu, &cpu->sample);
 
cpu->prev_aperf = aperf;
cpu->prev_mperf = mperf;
-   cpu->prev_tsc = tsc;
 }
 
 static inline void intel_pstate_set_sample_time(struct cpudata *cpu)
-- 
1.9.0

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 2/5] intel_pstate: remove setting P state to MAX on init

2014-05-08 Thread dirk . brandewie
From: Dirk Brandewie dirk.j.brande...@intel.com

Setting the P state of the core to max at init time is a holdover
from the early implementation of intel_pstate where intel_pstate disabled
cpufreq and loaded VERY early in the boot sequence.  This was to
ensure that intel_pstate did not affect boot time. This is not needed
now that intel_pstate is a cpufreq driver.

Removing this covers the case where a CPU has gone through a manual
CPU offline/online cycle and the P state is set to MAX on init and the
CPU immediately goes idle.  Due to HW coordination the P state request
on the idle CPU will drag all cores to MAX P state until the load is
reevaluated when the core goes non-idle.

Reported-by: Patrick Marlier patrick.marl...@gmail.com

Signed-off-by: Dirk Brandewie dirk.j.brande...@intel.com
---
 drivers/cpufreq/intel_pstate.c | 13 +
 1 file changed, 1 insertion(+), 12 deletions(-)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 39c4f85..eab8ccf 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -554,12 +554,7 @@ static void intel_pstate_get_cpu_pstates(struct cpudata 
*cpu)
 
if (pstate_funcs.get_vid)
pstate_funcs.get_vid(cpu);
-
-   /*
-* goto max pstate so we don't slow up boot if we are built-in if we are
-* a module we will take care of it during normal operation
-*/
-   intel_pstate_set_pstate(cpu, cpu->pstate.max_pstate);
+   intel_pstate_set_pstate(cpu, cpu->pstate.min_pstate);
 }
 
 static inline void intel_pstate_calc_busy(struct cpudata *cpu,
@@ -704,11 +699,6 @@ static int intel_pstate_init_cpu(unsigned int cpunum)
cpu = all_cpu_data[cpunum];
 
intel_pstate_get_cpu_pstates(cpu);
-   if (!cpu->pstate.current_pstate) {
-   all_cpu_data[cpunum] = NULL;
-   kfree(cpu);
-   return -ENODATA;
-   }
 
cpu->cpu = cpunum;
 
@@ -719,7 +709,6 @@ static int intel_pstate_init_cpu(unsigned int cpunum)
cpu->timer.expires = jiffies + HZ/100;
intel_pstate_busy_pid_reset(cpu);
intel_pstate_sample(cpu);
-   intel_pstate_set_pstate(cpu, cpu->pstate.max_pstate);
 
add_timer_on(&cpu->timer, cpunum);
 
-- 
1.9.0

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC PATCH] cpufreq: intel_pstate: Change the calculation of next pstate

2014-05-08 Thread Dirk Brandewie
On 05/05/2014 04:57 PM, Stratos Karafotis wrote:
 Currently the driver calculates the next pstate proportional to
 core_busy factor, scaled by the ratio max_pstate / current_pstate.
 
 Using the scaled load (core_busy) to calculate the next pstate
 is not always correct, because there are cases that the load is
 independent from current pstate. For example, a tight 'for' loop
 through many sampling intervals will cause a load of 100% in
 every pstate.
 
 So, change the above method and calculate the next pstate with
 the assumption that the next pstate should not depend on the
 current pstate. The next pstate should only be proportional
 to measured load. Use the linear function to calculate the load:
 
 Next P-state = A + B * load
 
 where A = min_state and B = (max_pstate - min_pstate) / 100
 If turbo is enabled the B = (turbo_pstate - min_pstate) / 100
 The load is calculated using the kernel time functions.
 

This will hurt your power numbers under normal conditions where you
are not running a performance workload. Consider the following:

   1. The system is idle, all cores are at the min P state and utilization is low, say < 10%
   2. You run something that drives the load as seen by the kernel to 100%,
      which is scaled by the current P state.

This would cause the P state to go from min -> max in one step, which is
what you want if you are only looking at a single core.  But this will also
drag every core in the package to the max P state as well.  This would be fine
if the power vs frequency curve was linear: all the cores would finish
their work faster and go idle sooner (race to halt) and maybe spend
more time in a deeper C state, which dwarfs the amount of power we can
save by controlling P states. Unfortunately this is *not* the case; the
power vs frequency curve is non-linear and gets very steep in the turbo
range.  If it were linear there would be no reason to have P state
control; you could select the highest P state and walk away.

Being conservative on the way up and aggressive on the way down gives you
the best power efficiency on non-benchmark loads.  Most benchmarks
are pretty useless for measuring power efficiency (unless they were
designed for it) since they are measuring how fast something can be
done which is measuring the efficiency at max performance.

The performance issues you pointed out were caused by commit
fcb6a15c "intel_pstate: Take core C0 time into account for core busy calculation"
and the ensuing problems it caused. These have been fixed in the patch set

   https://lkml.org/lkml/2014/5/8/574

The performance comparison between before/after this patch set, your patch
and ondemand/acpi_cpufreq is available at:
http://openbenchmarking.org/result/1405085-PL-C0200965993
ffmpeg was added to the set of benchmarks because there was a regression
reported against this benchmark as well.
https://bugzilla.kernel.org/show_bug.cgi?id=75121

--Dirk



 
   
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 0/5] updates for intel_pstate

2014-05-08 Thread Dirk Brandewie

On 05/08/2014 01:30 PM, Rafael J. Wysocki wrote:

On Thursday, May 08, 2014 12:57:22 PM dirk.brande...@gmail.com wrote:

From: Dirk Brandewie dirk.j.brande...@intel.com

Patches 1-4 are bugfixes.  Patch 4 specifically removes the wreckage
caused by C0 tracking change.


I would appreciate sending such stuff before the -rc4 time frame.


Agreed.

Patch 1 is a resend; the others were sent as soon as they were ready/tested.




Patch 5 Adds new CPU IDs for the Broadwell processors.

Dirk Brandewie (5):
   intel_pstate: Set turbo VID for Baytrail
   intel_pstate: remove setting P state to MAX on init
   intel_pstate: Fix fixed point rounding macro
   intel_pstate: Remove C0 tracking
   intel_pstate: Add CPU IDs for Broadwell processors

  drivers/cpufreq/intel_pstate.c | 52 +-
  1 file changed, 21 insertions(+), 31 deletions(-)


I'll queue them up for 3.15-rc6, but that's a bit too late.

Next time I may just wait for the next merge window, fixes or not.


OK the reporters of the issues already have the fixes in hand.






--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: Performance regression in v3.14

2014-05-07 Thread Dirk Brandewie

On 05/06/2014 10:40 PM, Viresh Kumar wrote:

Cc'ing Dirk who is taking care of intel-pstate driver.



Thanks Viresh I had seen this thread.

I am looking into it

--Dirk

On 6 May 2014 22:05, Johan Hovold  wrote:

After updating my main system from v3.13 to v3.14.2, I found that the
git bash-completion was extremely sluggish. Completing a file name would
take roughly six rather than one second on this Haswell machine
(i7-4770). (Other things, such as git rebase, also felt slower, but
the completion issue was much more obvious and easy to measure).

I managed to reproduce the problem using the following minimal construct

 cat dmesg.repeat | while read x; do true; done

where dmesg.repeat is simply dmesg concatenated together to an
equivalent number of lines as produced by git ls-files in the
kernel-source tree root (45k), and where the actual processing of each
line has been removed.

Most of the time I get:

 $ time cat dmesg.repeat | while read x; do true; done

 real0m6.091s
 user0m3.674s
 sys 0m2.447s

but sometimes it only takes one second.

 $ time cat dmesg.repeat | while read x; do true; done

 real0m1.100s
 user0m0.544s
 sys 0m0.570s

I don't seem to be able to reproduce the problem on 3.13 where the pipe
always takes about one second to finish.

Taking all but one core offline seems to make the problem go away, and so
does using the performance rather than powersave governor of the
intel_pstate cpufreq driver (on at least one of two online cores).

Moving the mouse cursor makes the loop finish faster, and so does
switching to another terminal to print cpufreq/cpuinfo_cur_freq which
was around cpuinfo_min_freq several times (when tracing, see below).

I could not reproduce the problem when using perf record, but I can get
function-profile traces using ftrace (in which case the loop takes about
60 seconds instead of six seconds to finish).

Comparing the traces I see a lot of functions taking ten times longer to
finish, but I guess that's expected if this is indeed a cpufreq issue.

Since this is my main machine (and only multi-core machine at the
moment) I'm not able to bisect this myself. And for the same reason I
have not verified that the problem persists in v3.15-rc.

I don't see any cpufreq patches in the v3.14.3 stable queue nor anything
obviously related and marked for stable in v3.15-rc.

Any ideas about what might be going on?


I tried to take a look at the diff for cpufreq between 3.13 and 3.14.2 and
couldn't pinpoint any change which might cause it. Don't have a clue
of what's going on. I don't know how to help you on this.

Normally I test my stuff on an ARM board and I don't remember facing
any such behavior there. There might be something wrong with intel-pstate
as well..

Also, can you try to use acpi-cpufreq instead? And see how that is behaving?

--
viresh



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: Performance regression in v3.14

2014-05-07 Thread Dirk Brandewie

On 05/06/2014 10:40 PM, Viresh Kumar wrote:

Cc'ing Dirk who is taking care of intel-pstate driver.



Thanks Viresh I had seen this thread.

I am looking into it

--Dirk

On 6 May 2014 22:05, Johan Hovold jhov...@gmail.com wrote:

After updating my main system from v3.13 to v3.14.2, I found that the
git bash-completion was extremely sluggish. Completing a file name would
take roughly six rather than one second on this Haswell machine
(i7-4770). (Other things, such as git rebase, also felt slower, but
the completion issue was much more obvious and easy to measure).

I managed to reproduce the problem using the following minimal construct

 cat dmesg.repeat | while read x; do true; done

where dmesg.repeat is simply dmesg concatenated together to an
equivalent number of lines as produced by git ls-files in the
kernel-source tree root (45k), and where the actual processing of each
line has been removed.

Most of the time I get:

 $ time cat dmesg.repeat | while read x; do true; done

 real0m6.091s
 user0m3.674s
 sys 0m2.447s

but sometimes it only takes one second.

 $ time cat dmesg.repeat | while read x; do true; done

 real0m1.100s
 user0m0.544s
 sys 0m0.570s

I don't seem to be able to reproduce the problem on 3.13 where the pipe
always takes about one second to finish.

Taking all but one core offline seems to make the problem go away, and so
does using the performance rather than powersave governor of the
intel_pstate cpufreq driver (on at least one of two online cores).

Moving the mouse cursor makes the loop finish faster, and so does
switching to another terminal to print cpufreq/cpuinfo_cur_freq which
was around cpuinfo_min_freq several times (when tracing, see below).

I could not reproduce the problem when using perf record, but I can get
function-profile traces using ftrace (in which case the loop takes about
60 seconds instead of six seconds to finish).

Comparing the traces I see a lot of functions taking ten times longer to
finish, but I guess that's expected if this is indeed a cpufreq issue.

Since this is my main machine (and only multi-core machine at the
moment) I'm not able to bisect this myself. And for the same reason I
have not verified that the problem persists in v3.15-rc.

I don't see any cpufreq patches in the v3.14.3 stable queue nor anything
obviously related and marked for stable in v3.15-rc.

Any ideas about what might be going on?


I tried to take a look at the diff for cpufreq between 3.13 and 3.14.2 and
couldn't pinpoint any change which might cause it. Don't have a clue
of what's going on. I don't know how to help you on this.

Normally I test my stuff on an ARM board and I don't remember facing
any such behavior there. There might be something wrong with intel-pstate
as well..

Also, can you try to use acpi-cpufreq instead? And see how that is behaving?

--
viresh



--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v2] cpufreq: intel_pstate: Change the calculation of next pstate

2014-05-01 Thread Dirk Brandewie

On 05/01/2014 04:18 PM, Rafael J. Wysocki wrote:

On Thursday, May 01, 2014 02:30:42 PM Dirk Brandewie wrote:

On 05/01/2014 02:00 PM, Stratos Karafotis wrote:

Currently the driver calculates the next pstate proportional to
core_busy factor, scaled by the ratio max_pstate / current_pstate.

Using the scaled load (core_busy) to calculate the next pstate
is not always correct, because there are cases that the load is
independent from current pstate. For example, a tight 'for' loop
through many sampling intervals will cause a load of 100% in
every pstate.

So, change the above method and calculate the next pstate with
the assumption that the next pstate should not depend on the
current pstate. The next pstate should only be directly
proportional to measured load.

Tested on Intel i7-3770 CPU @ 3.40GHz.
Phoronix benchmark of Linux Kernel Compilation 3.1 test shows an
increase ~1.5% in performance. Below the test results using turbostat
(5 iterations):

Without patch:

Ph. avg TimeTotal time  PkgWatt Total Energy
79.63   266.416 57.74   15382.85984
79.63   265.609 57.87   15370.79283
79.57   266.994 57.54   15362.83476
79.53   265.304 57.83   15342.53032
79.71   265.977 57.76   15362.83152
avg 79.61   266.06  57.74   15364.36985

With patch:

Ph. avg TimeTotal time  PkgWatt Total Energy
78.23   258.826 59.14   15306.96964
78.41   259.110 59.15   15326.35650
78.40   258.530 59.26   15320.48780
78.46   258.673 59.20   15313.44160
78.19   259.075 59.16   15326.87700
avg 78.34   258.842 59.18   15318.82650

The total test time reduced by ~2.6%, while the total energy
consumption during a test iteration reduced by ~0.35%

Signed-off-by: Stratos Karafotis 
---

Changes v1 -> v2
- Enhance change log as Rafael and Viresh suggested


   drivers/cpufreq/intel_pstate.c | 15 +++
   1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 0999673..8e309db 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -608,28 +608,27 @@ static inline void intel_pstate_set_sample_time(struct 
cpudata *cpu)
mod_timer_pinned(&cpu->timer, jiffies + delay);
   }

-static inline int32_t intel_pstate_get_scaled_busy(struct cpudata *cpu)
+static inline int32_t intel_pstate_get_busy(struct cpudata *cpu)
   {
-   int32_t core_busy, max_pstate, current_pstate;
+   int32_t core_busy, max_pstate;

core_busy = cpu->sample.core_pct_busy;
max_pstate = int_tofp(cpu->pstate.max_pstate);
-   current_pstate = int_tofp(cpu->pstate.current_pstate);
-   core_busy = mul_fp(core_busy, div_fp(max_pstate, current_pstate));
+   core_busy = mul_fp(core_busy, max_pstate);


NAK,  The goal of this code is to find out how busy the core is at the current
P state. This change will return a value WAY too high.

Assume core_busy is 100 and the max non-turbo P state is 34 (3.4GHz) this code
would return a busy value of 3400. The PID  is trying to keep the busy value
at the setpoint any value of ~3% will drive the P state to the highest turbo
P state in this example.


Well, the problem is that the numbers above indicate an improvement in energy
efficiency as a result of this patch and we need to explain that result.


The performance governor is the best option for this workload.

This change will give you the highest turbo for all but very idle workloads.

Let's say you have a processor with a max P state of 3.4GHz.  The current P state
is 1.6 GHz, so if the processor was 100% in C0 the core_busy value would be
47%.  This number scaled would be 100%.  With the change above the PID would be
reacting to a load of 1598%.  APERF/MPERF gives you the percent of the entire
core; scaling it lets you find out how busy you are within the current P state.

--Dirk



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v2] cpufreq: intel_pstate: Change the calculation of next pstate

2014-05-01 Thread Dirk Brandewie

On 05/01/2014 02:00 PM, Stratos Karafotis wrote:

Currently the driver calculates the next pstate proportional to
core_busy factor, scaled by the ratio max_pstate / current_pstate.

Using the scaled load (core_busy) to calculate the next pstate
is not always correct, because there are cases that the load is
independent from current pstate. For example, a tight 'for' loop
through many sampling intervals will cause a load of 100% in
every pstate.

So, change the above method and calculate the next pstate with
the assumption that the next pstate should not depend on the
current pstate. The next pstate should only be directly
proportional to measured load.

Tested on Intel i7-3770 CPU @ 3.40GHz.
Phoronix benchmark of Linux Kernel Compilation 3.1 test shows an
increase ~1.5% in performance. Below the test results using turbostat
(5 iterations):

Without patch:

Ph. avg TimeTotal time  PkgWatt Total Energy
79.63   266.416 57.74   15382.85984
79.63   265.609 57.87   15370.79283
79.57   266.994 57.54   15362.83476
79.53   265.304 57.83   15342.53032
79.71   265.977 57.76   15362.83152
avg 79.61   266.06  57.74   15364.36985

With patch:

Ph. avg TimeTotal time  PkgWatt Total Energy
78.23   258.826 59.14   15306.96964
78.41   259.110 59.15   15326.35650
78.40   258.530 59.26   15320.48780
78.46   258.673 59.20   15313.44160
78.19   259.075 59.16   15326.87700
avg 78.34   258.842 59.18   15318.82650

The total test time reduced by ~2.6%, while the total energy
consumption during a test iteration reduced by ~0.35%

Signed-off-by: Stratos Karafotis 
---

Changes v1 -> v2
- Enhance change log as Rafael and Viresh suggested


  drivers/cpufreq/intel_pstate.c | 15 +++
  1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 0999673..8e309db 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -608,28 +608,27 @@ static inline void intel_pstate_set_sample_time(struct 
cpudata *cpu)
mod_timer_pinned(&cpu->timer, jiffies + delay);
  }

-static inline int32_t intel_pstate_get_scaled_busy(struct cpudata *cpu)
+static inline int32_t intel_pstate_get_busy(struct cpudata *cpu)
  {
-   int32_t core_busy, max_pstate, current_pstate;
+   int32_t core_busy, max_pstate;

core_busy = cpu->sample.core_pct_busy;
max_pstate = int_tofp(cpu->pstate.max_pstate);
-   current_pstate = int_tofp(cpu->pstate.current_pstate);
-   core_busy = mul_fp(core_busy, div_fp(max_pstate, current_pstate));
+   core_busy = mul_fp(core_busy, max_pstate);


NAK,  The goal of this code is to find out how busy the core is at the current
P state. This change will return a value WAY too high.

Assume core_busy is 100 and the max non-turbo P state is 34 (3.4GHz) this code
would return a busy value of 3400. The PID  is trying to keep the busy value
at the setpoint any value of ~3% will drive the P state to the highest turbo
P state in this example.



return FP_ROUNDUP(core_busy);
  }

  static inline void intel_pstate_adjust_busy_pstate(struct cpudata *cpu)
  {
-   int32_t busy_scaled;
+   int32_t busy;
struct _pid *pid;
signed int ctl = 0;
int steps;

pid = &cpu->pid;
-   busy_scaled = intel_pstate_get_scaled_busy(cpu);
+   busy = intel_pstate_get_busy(cpu);

-   ctl = pid_calc(pid, busy_scaled);
+   ctl = pid_calc(pid, busy);

steps = abs(ctl);

@@ -651,7 +650,7 @@ static void intel_pstate_timer_func(unsigned long __data)
intel_pstate_adjust_busy_pstate(cpu);

trace_pstate_sample(fp_toint(sample->core_pct_busy),
-   fp_toint(intel_pstate_get_scaled_busy(cpu)),
+   fp_toint(intel_pstate_get_busy(cpu)),
cpu->pstate.current_pstate,
sample->mperf,
sample->aperf,



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] cpufreq: intel_pstate: Change the calculation of next pstate

2014-05-01 Thread Dirk Brandewie

On 04/29/2014 02:52 PM, Rafael J. Wysocki wrote:

On Tuesday, April 29, 2014 07:34:46 PM Stratos Karafotis wrote:

On 29/04/2014 07:58 πμ, Viresh Kumar wrote:

Cc'd Dirk,

On 28 April 2014 03:42, Stratos Karafotis  wrote:

Currently the driver calculates the next pstate proportional to
core_busy factor and reverse proportional to current pstate.

Change the above method and calculate the next pstate independently
of current pstate.


We must mention why the change is required.



Hi Viresh,

Actually, I can't say that it's required. :)
I just believe that calculation of next p-state should be independent
from current one. In my opinion we can't scale the load across different
p-states, because it's not always equivalent.

For example suppose a load of 100% because of a tight for loop in the
current p-state. It will be also a 100% load in any other p-state.
It will be wrong if we scale the load in the calculation formula
according to the current p-state.

I included the test results in the change log to point out an improvement
because of this patch.

I will enrich more the change log as you suggested.


Please do so.

Also, we need to take your patch to our power lab and see if we can reproduce
your results in other workloads.

And I'm waiting for the intel_pstate developer Dirk Brandewie to comment.


Sorry I just returned from dealing with a family emergency and am digging
out of my inbox.

I will run this patch through some tests.




Thanks!



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] cpufreq: intel_pstate: Remove sample parameter in intel_pstate_calc_busy

2014-05-01 Thread Dirk Brandewie

On 04/29/2014 10:53 AM, Stratos Karafotis wrote:

Since commit d37e2b7644 ("intel_pstate: remove unneeded sample buffers")
we use only one sample. So, there is no need to pass the sample
pointer to intel_pstate_calc_busy. Instead, get the pointer from
cpudata. Also, remove the unused SAMPLE_COUNT macro.

While at it, reformat the first line in this function.

Signed-off-by: Stratos Karafotis 


Acked-by: Dirk Brandewie 


---
  drivers/cpufreq/intel_pstate.c | 11 ---
  1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 95b3958..8192ff1 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -32,8 +32,6 @@
  #include <asm/msr.h>
  #include <asm/cpu_device_id.h>

-#define SAMPLE_COUNT   3
-
  #define BYT_RATIOS0x66a
  #define BYT_VIDS  0x66b
  #define BYT_TURBO_RATIOS  0x66c
@@ -553,14 +551,13 @@ static void intel_pstate_get_cpu_pstates(struct cpudata 
*cpu)
intel_pstate_set_pstate(cpu, cpu->pstate.max_pstate);
  }

-static inline void intel_pstate_calc_busy(struct cpudata *cpu,
-   struct sample *sample)
+static inline void intel_pstate_calc_busy(struct cpudata *cpu)
  {
+   struct sample *sample = &cpu->sample;
int32_t core_pct;
int32_t c0_pct;

-   core_pct = div_fp(int_tofp((sample->aperf)),
-   int_tofp((sample->mperf)));
+   core_pct = div_fp(int_tofp(sample->aperf), int_tofp(sample->mperf));
core_pct = mul_fp(core_pct, int_tofp(100));
FP_ROUNDUP(core_pct);

@@ -595,7 +592,7 @@ static inline void intel_pstate_sample(struct cpudata *cpu)
cpu->sample.mperf -= cpu->prev_mperf;
cpu->sample.tsc -= cpu->prev_tsc;

-   intel_pstate_calc_busy(cpu, &cpu->sample);
+   intel_pstate_calc_busy(cpu);

cpu->prev_aperf = aperf;
cpu->prev_mperf = mperf;



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] cpufreq: intel_pstate: Remove sample parameter in intel_pstate_calc_busy

2014-05-01 Thread Dirk Brandewie

On 04/29/2014 10:53 AM, Stratos Karafotis wrote:

Since commit d37e2b7644 ("intel_pstate: remove unneeded sample buffers")
we use only one sample. So, there is no need to pass the sample
pointer to intel_pstate_calc_busy. Instead, get the pointer from
cpudata. Also, remove the unused SAMPLE_COUNT macro.

While at it, reformat the first line in this function.

Signed-off-by: Stratos Karafotis strat...@semaphore.gr


Acked-by: Dirk Brandewie dirk.j.brande...@intel.com


---
  drivers/cpufreq/intel_pstate.c | 11 ---
  1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 95b3958..8192ff1 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -32,8 +32,6 @@
  #include <asm/msr.h>
  #include <asm/cpu_device_id.h>

-#define SAMPLE_COUNT   3
-
  #define BYT_RATIOS0x66a
  #define BYT_VIDS  0x66b
  #define BYT_TURBO_RATIOS  0x66c
@@ -553,14 +551,13 @@ static void intel_pstate_get_cpu_pstates(struct cpudata 
*cpu)
intel_pstate_set_pstate(cpu, cpu->pstate.max_pstate);
  }

-static inline void intel_pstate_calc_busy(struct cpudata *cpu,
-   struct sample *sample)
+static inline void intel_pstate_calc_busy(struct cpudata *cpu)
  {
+   struct sample *sample = &cpu->sample;
int32_t core_pct;
int32_t c0_pct;

-   core_pct = div_fp(int_tofp((sample->aperf)),
-   int_tofp((sample->mperf)));
+   core_pct = div_fp(int_tofp(sample->aperf), int_tofp(sample->mperf));
core_pct = mul_fp(core_pct, int_tofp(100));
FP_ROUNDUP(core_pct);

@@ -595,7 +592,7 @@ static inline void intel_pstate_sample(struct cpudata *cpu)
cpu->sample.mperf -= cpu->prev_mperf;
cpu->sample.tsc -= cpu->prev_tsc;

-   intel_pstate_calc_busy(cpu, &cpu->sample);
+   intel_pstate_calc_busy(cpu);

cpu->prev_aperf = aperf;
cpu->prev_mperf = mperf;



--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] cpufreq: intel_pstate: Change the calculation of next pstate

2014-05-01 Thread Dirk Brandewie

On 04/29/2014 02:52 PM, Rafael J. Wysocki wrote:

On Tuesday, April 29, 2014 07:34:46 PM Stratos Karafotis wrote:

On 29/04/2014 07:58 πμ, Viresh Kumar wrote:

Cc'd Dirk,

On 28 April 2014 03:42, Stratos Karafotis strat...@semaphore.gr wrote:

Currently the driver calculates the next pstate proportional to
core_busy factor and reverse proportional to current pstate.

Change the above method and calculate the next pstate independently
of current pstate.


We must mention why the change is required.



Hi Viresh,

Actually, I can't say that it's required. :)
I just believe that calculation of next p-state should be independent
from current one. In my opinion we can't scale the load across different
p-states, because it's not always equivalent.

For example suppose a load of 100% because of a tight for loop in the
current p-state. It will be also a 100% load in any other p-state.
It will be wrong if we scale the load in the calculation formula
according to the current p-state.

I included the test results in the change log to point out an improvement
because of this patch.

I will expand the change log as you suggested.


Please do so.

Also, we need to take your patch to our power lab and see if we can reproduce
your results in other workloads.

And I'm waiting for the intel_pstate developer Dirk Brandewie to comment.


Sorry I just returned from dealing with a family emergency and am digging
out of my inbox.

I will run this patch through some tests.




Thanks!



--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v2] cpufreq: intel_pstate: Change the calculation of next pstate

2014-05-01 Thread Dirk Brandewie

On 05/01/2014 02:00 PM, Stratos Karafotis wrote:

Currently the driver calculates the next pstate proportional to
core_busy factor, scaled by the ratio max_pstate / current_pstate.

Using the scaled load (core_busy) to calculate the next pstate
is not always correct, because there are cases that the load is
independent from current pstate. For example, a tight 'for' loop
through many sampling intervals will cause a load of 100% in
every pstate.

So, change the above method and calculate the next pstate with
the assumption that the next pstate should not depend on the
current pstate. The next pstate should only be directly
proportional to measured load.

Tested on Intel i7-3770 CPU @ 3.40GHz.
Phoronix benchmark of Linux Kernel Compilation 3.1 test shows an
increase ~1.5% in performance. Below the test results using turbostat
(5 iterations):

Without patch:

Ph. avg Time    Total time  PkgWatt Total Energy
79.63   266.416 57.74   15382.85984
79.63   265.609 57.87   15370.79283
79.57   266.994 57.54   15362.83476
79.53   265.304 57.83   15342.53032
79.71   265.977 57.76   15362.83152
avg 79.61   266.06  57.74   15364.36985

With patch:

Ph. avg Time    Total time  PkgWatt Total Energy
78.23   258.826 59.14   15306.96964
78.41   259.110 59.15   15326.35650
78.40   258.530 59.26   15320.48780
78.46   258.673 59.20   15313.44160
78.19   259.075 59.16   15326.87700
avg 78.34   258.842 59.18   15318.82650

The total test time reduced by ~2.6%, while the total energy
consumption during a test iteration reduced by ~0.35%

Signed-off-by: Stratos Karafotis strat...@semaphore.gr
---

Changes v1 - v2
- Enhance change log as Rafael and Viresh suggested


  drivers/cpufreq/intel_pstate.c | 15 +++
  1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 0999673..8e309db 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -608,28 +608,27 @@ static inline void intel_pstate_set_sample_time(struct 
cpudata *cpu)
mod_timer_pinned(cpu-timer, jiffies + delay);
  }

-static inline int32_t intel_pstate_get_scaled_busy(struct cpudata *cpu)
+static inline int32_t intel_pstate_get_busy(struct cpudata *cpu)
  {
-   int32_t core_busy, max_pstate, current_pstate;
+   int32_t core_busy, max_pstate;

core_busy = cpu-sample.core_pct_busy;
max_pstate = int_tofp(cpu-pstate.max_pstate);
-   current_pstate = int_tofp(cpu-pstate.current_pstate);
-   core_busy = mul_fp(core_busy, div_fp(max_pstate, current_pstate));
+   core_busy = mul_fp(core_busy, max_pstate);


NAK. The goal of this code is to find out how busy the core is at the current
P state. This change will return a value WAY too high.

Assume core_busy is 100 and the max non-turbo P state is 34 (3.4GHz): this code
would return a busy value of 3400. The PID is trying to keep the busy value
at the setpoint, so any core_busy value above ~3% will drive the P state to the
highest turbo P state in this example.



return FP_ROUNDUP(core_busy);
  }

  static inline void intel_pstate_adjust_busy_pstate(struct cpudata *cpu)
  {
-   int32_t busy_scaled;
+   int32_t busy;
struct _pid *pid;
signed int ctl = 0;
int steps;

pid = cpu-pid;
-   busy_scaled = intel_pstate_get_scaled_busy(cpu);
+   busy = intel_pstate_get_busy(cpu);

-   ctl = pid_calc(pid, busy_scaled);
+   ctl = pid_calc(pid, busy);

steps = abs(ctl);

@@ -651,7 +650,7 @@ static void intel_pstate_timer_func(unsigned long __data)
intel_pstate_adjust_busy_pstate(cpu);

trace_pstate_sample(fp_toint(sample-core_pct_busy),
-   fp_toint(intel_pstate_get_scaled_busy(cpu)),
+   fp_toint(intel_pstate_get_busy(cpu)),
cpu-pstate.current_pstate,
sample-mperf,
sample-aperf,



--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v2] cpufreq: intel_pstate: Change the calculation of next pstate

2014-05-01 Thread Dirk Brandewie

On 05/01/2014 04:18 PM, Rafael J. Wysocki wrote:

On Thursday, May 01, 2014 02:30:42 PM Dirk Brandewie wrote:

On 05/01/2014 02:00 PM, Stratos Karafotis wrote:

Currently the driver calculates the next pstate proportional to
core_busy factor, scaled by the ratio max_pstate / current_pstate.

Using the scaled load (core_busy) to calculate the next pstate
is not always correct, because there are cases that the load is
independent from current pstate. For example, a tight 'for' loop
through many sampling intervals will cause a load of 100% in
every pstate.

So, change the above method and calculate the next pstate with
the assumption that the next pstate should not depend on the
current pstate. The next pstate should only be directly
proportional to measured load.

Tested on Intel i7-3770 CPU @ 3.40GHz.
Phoronix benchmark of Linux Kernel Compilation 3.1 test shows an
increase ~1.5% in performance. Below the test results using turbostat
(5 iterations):

Without patch:

Ph. avg Time    Total time  PkgWatt Total Energy
79.63   266.416 57.74   15382.85984
79.63   265.609 57.87   15370.79283
79.57   266.994 57.54   15362.83476
79.53   265.304 57.83   15342.53032
79.71   265.977 57.76   15362.83152
avg 79.61   266.06  57.74   15364.36985

With patch:

Ph. avg Time    Total time  PkgWatt Total Energy
78.23   258.826 59.14   15306.96964
78.41   259.110 59.15   15326.35650
78.40   258.530 59.26   15320.48780
78.46   258.673 59.20   15313.44160
78.19   259.075 59.16   15326.87700
avg 78.34   258.842 59.18   15318.82650

The total test time reduced by ~2.6%, while the total energy
consumption during a test iteration reduced by ~0.35%

Signed-off-by: Stratos Karafotis strat...@semaphore.gr
---

Changes v1 - v2
- Enhance change log as Rafael and Viresh suggested


   drivers/cpufreq/intel_pstate.c | 15 +++
   1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 0999673..8e309db 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -608,28 +608,27 @@ static inline void intel_pstate_set_sample_time(struct 
cpudata *cpu)
mod_timer_pinned(cpu-timer, jiffies + delay);
   }

-static inline int32_t intel_pstate_get_scaled_busy(struct cpudata *cpu)
+static inline int32_t intel_pstate_get_busy(struct cpudata *cpu)
   {
-   int32_t core_busy, max_pstate, current_pstate;
+   int32_t core_busy, max_pstate;

core_busy = cpu-sample.core_pct_busy;
max_pstate = int_tofp(cpu-pstate.max_pstate);
-   current_pstate = int_tofp(cpu-pstate.current_pstate);
-   core_busy = mul_fp(core_busy, div_fp(max_pstate, current_pstate));
+   core_busy = mul_fp(core_busy, max_pstate);


NAK. The goal of this code is to find out how busy the core is at the current
P state. This change will return a value WAY too high.

Assume core_busy is 100 and the max non-turbo P state is 34 (3.4GHz): this code
would return a busy value of 3400. The PID is trying to keep the busy value
at the setpoint, so any core_busy value above ~3% will drive the P state to the
highest turbo P state in this example.


Well, the problem is that the numbers above indicate an improvement in energy
efficiency as a result of this patch and we need to explain that result.


The performance governor is the best option for this workload.

This change will give you the highest turbo for all but very idle workloads.

Let's say you have a processor with a max P state of 3.4GHz. The current P state
is 1.6GHz, so if the processor was 100% in C0 the core_busy value would be
47%. This number scaled would be 100%. With the change above the PID would be
reacting to a load of 1598%. APERF/MPERF gives you the percentage of the entire
core; scaling it lets you find out how busy you are within the current P state.

--Dirk



--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] intel_pstate: Use del_timer_sync in intel_pstate_cpu_stop

2014-03-24 Thread dirk . brandewie
From: Dirk Brandewie 

Ensure that no timer callback is running since we are about to free
the timer structure.  We cannot guarantee that the call back is called
on the CPU where the timer is running.

Reported-by: Thomas Gleixner 
Signed-off-by: Dirk Brandewie 
Cc: Thomas Gleixner 
Cc: "Rafael J. Wysocki" 
Cc: cpufreq 
---
 drivers/cpufreq/intel_pstate.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index e9092fd..a4a9ba5 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -780,7 +780,7 @@ static int intel_pstate_cpu_stop(struct cpufreq_policy 
*policy)
 
pr_info("intel_pstate CPU %d exiting\n", cpu_num);
 
-   del_timer(_cpu_data[cpu_num]->timer);
+   del_timer_sync(_cpu_data[cpu_num]->timer);
intel_pstate_set_pstate(cpu, cpu->pstate.min_pstate);
kfree(all_cpu_data[cpu_num]);
all_cpu_data[cpu_num] = NULL;
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch 15/16] cpufreq: intel-pstate: Use del_timer_sync in intel_pstate_cpu_exit()

2014-03-24 Thread Dirk Brandewie

Hi Thomas,
On 03/23/2014 06:56 PM, Rafael J. Wysocki wrote:

On Sunday, March 23, 2014 03:09:32 PM Thomas Gleixner wrote:

We are about to free the data structure. Make sure no timer callback
is running. I might be paranoid, but the ->exit callback can be
invoked from so many places, that it is not entirely clear whether
del_timer is always called on the cpu on which it is enqueued.

While looking through the call sites I noticed, that
cpufreq_init_policy() can fail and invoke cpufreq_driver->exit() but
it does not return the failure and the callsite happily proceeds.



The call to del_timer() has been moved to a new callback in the material
in Rafael's pull request for v3.15.

I will send a patch adding this change to the v3.15 material.

--Dirk

Signed-off-by: Thomas Gleixner 
Cc: "Rafael J. Wysocki" 
Cc: cpufreq 
Cc: pm 


Dirk?


---

  drivers/cpufreq/intel_pstate.c |2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

Index: tip/drivers/cpufreq/intel_pstate.c
===
--- tip.orig/drivers/cpufreq/intel_pstate.c
+++ tip/drivers/cpufreq/intel_pstate.c
@@ -777,7 +777,7 @@ static int intel_pstate_cpu_exit(struct
  {
int cpu = policy->cpu;

-   del_timer(_cpu_data[cpu]->timer);
+   del_timer_sync(_cpu_data[cpu]->timer);
kfree(all_cpu_data[cpu]);
all_cpu_data[cpu] = NULL;
return 0;






--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch 15/16] cpufreq: intel-pstate: Use del_timer_sync in intel_pstate_cpu_exit()

2014-03-24 Thread Dirk Brandewie

Hi Thomas,
On 03/23/2014 06:56 PM, Rafael J. Wysocki wrote:

On Sunday, March 23, 2014 03:09:32 PM Thomas Gleixner wrote:

We are about to free the data structure. Make sure no timer callback
is running. I might be paranoid, but the ->exit callback can be
invoked from so many places, that it is not entirely clear whether
del_timer is always called on the cpu on which it is enqueued.

While looking through the call sites I noticed, that
cpufreq_init_policy() can fail and invoke cpufreq_driver->exit() but
it does not return the failure and the callsite happily proceeds.



The call to del_timer() has been moved to a new callback in the material
in Rafael's pull request for v3.15.

I will send a patch adding this change to the v3.15 material.

--Dirk

Signed-off-by: Thomas Gleixner t...@linutronix.de
Cc: Rafael J. Wysocki r...@rjwysocki.net
Cc: cpufreq cpuf...@vger.kernel.org
Cc: pm linux...@vger.kernel.org


Dirk?


---

  drivers/cpufreq/intel_pstate.c |2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

Index: tip/drivers/cpufreq/intel_pstate.c
===
--- tip.orig/drivers/cpufreq/intel_pstate.c
+++ tip/drivers/cpufreq/intel_pstate.c
@@ -777,7 +777,7 @@ static int intel_pstate_cpu_exit(struct
  {
int cpu = policy-cpu;

-   del_timer(all_cpu_data[cpu]-timer);
+   del_timer_sync(all_cpu_data[cpu]-timer);
kfree(all_cpu_data[cpu]);
all_cpu_data[cpu] = NULL;
return 0;






--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] intel_pstate: Use del_timer_sync in intel_pstate_cpu_stop

2014-03-24 Thread dirk . brandewie
From: Dirk Brandewie dirk.j.brande...@intel.com

Ensure that no timer callback is running since we are about to free
the timer structure.  We cannot guarantee that the call back is called
on the CPU where the timer is running.

Reported-by: Thomas Gleixner t...@linutronix.de
Signed-off-by: Dirk Brandewie dirk.j.brande...@intel.com
Cc: Thomas Gleixner t...@linutronix.de
Cc: Rafael J. Wysocki r...@rjwysocki.net
Cc: cpufreq cpuf...@vger.kernel.org
---
 drivers/cpufreq/intel_pstate.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index e9092fd..a4a9ba5 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -780,7 +780,7 @@ static int intel_pstate_cpu_stop(struct cpufreq_policy 
*policy)
 
pr_info(intel_pstate CPU %d exiting\n, cpu_num);
 
-   del_timer(all_cpu_data[cpu_num]-timer);
+   del_timer_sync(all_cpu_data[cpu_num]-timer);
intel_pstate_set_pstate(cpu, cpu-pstate.min_pstate);
kfree(all_cpu_data[cpu_num]);
all_cpu_data[cpu_num] = NULL;
-- 
1.8.3.1

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


  1   2   3   4   >