[PATCH v2 4/8] cpupower: Remove unused pscur variable.

2021-01-25 Thread Nathan Fontenot
The pscur variable is set but not used; just remove it.

This may have previously been set to validate the MSR_AMD_PSTATE_STATUS
MSR. With the addition of the CPUPOWER_CAP_AMD_HW_PSTATE cap flag this
is no longer needed since the cpuid bit to enable this cap flag also
validates that the MSR_AMD_PSTATE_STATUS MSR is present.

Signed-off-by: Nathan Fontenot 
---
 tools/power/cpupower/utils/helpers/amd.c |9 +
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/tools/power/cpupower/utils/helpers/amd.c 
b/tools/power/cpupower/utils/helpers/amd.c
index 8b69c7ff639a..fc2ac1e6bfb2 100644
--- a/tools/power/cpupower/utils/helpers/amd.c
+++ b/tools/power/cpupower/utils/helpers/amd.c
@@ -90,7 +90,7 @@ static int get_cof(int family, union core_pstate pstate)
 int decode_pstates(unsigned int cpu, unsigned int cpu_family,
   int boost_states, unsigned long *pstates, int *no)
 {
-   int i, psmax, pscur;
+   int i, psmax;
union core_pstate pstate;
unsigned long long val;
 
@@ -104,13 +104,6 @@ int decode_pstates(unsigned int cpu, unsigned int 
cpu_family,
return -1;
 
psmax = (val >> 4) & 0x7;
-
-   if (read_msr(cpu, MSR_AMD_PSTATE_STATUS, ))
-   return -1;
-
-   pscur = val & 0x7;
-
-   pscur += boost_states;
psmax += boost_states;
for (i = 0; i <= psmax; i++) {
if (i >= MAX_HW_PSTATES) {



[PATCH v2 6/8] cpupower: Condense pstate enabled bit checks in decode_pstates()

2021-01-25 Thread Nathan Fontenot
The enabled bit (bit 63) is common for all families so we can remove
the multiple enabled checks based on family and have a common check
for HW pstate enabled.

Signed-off-by: Nathan Fontenot 
---
 tools/power/cpupower/utils/helpers/amd.c |6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tools/power/cpupower/utils/helpers/amd.c 
b/tools/power/cpupower/utils/helpers/amd.c
index b4731daa6820..216240e2b771 100644
--- a/tools/power/cpupower/utils/helpers/amd.c
+++ b/tools/power/cpupower/utils/helpers/amd.c
@@ -113,9 +113,9 @@ int decode_pstates(unsigned int cpu, unsigned int 
cpu_family,
}
if (read_msr(cpu, MSR_AMD_PSTATE + i, ))
return -1;
-   if ((cpu_family == 0x17) && (!pstate.pstatedef.en))
-   continue;
-   else if (!pstate.pstate.en)
+
+   /* The enabled bit (bit 63) is common for all families */
+   if (!pstate.pstatedef.en)
continue;
 
pstates[i] = get_cof(cpu_family, pstate);



[PATCH v2 3/8] cpupower: Add CPUPOWER_CAP_AMD_HW_PSTATE cpuid caps flag

2021-01-25 Thread Nathan Fontenot
Add a check in get_cpu_info() for the ability to read frequencies
from hardware and set the CPUPOWER_CAP_AMD_HW_PSTATE cpuid flag.
The cpuid flag is set when CPUID_80000007_EDX[7] is set,
which is all families >= 10h. The check excludes family 14h
because HW pstate reporting was not implemented on family 14h.

This is intended to reduce family checks in the main code paths.

Signed-off-by: Nathan Fontenot 
---

Updates for v2: Update and add back previously removed comment.
---
 tools/power/cpupower/utils/helpers/amd.c |9 -
 tools/power/cpupower/utils/helpers/cpuid.c   |   12 +---
 tools/power/cpupower/utils/helpers/helpers.h |1 +
 3 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/tools/power/cpupower/utils/helpers/amd.c 
b/tools/power/cpupower/utils/helpers/amd.c
index 34368436bbd6..8b69c7ff639a 100644
--- a/tools/power/cpupower/utils/helpers/amd.c
+++ b/tools/power/cpupower/utils/helpers/amd.c
@@ -94,11 +94,10 @@ int decode_pstates(unsigned int cpu, unsigned int 
cpu_family,
union core_pstate pstate;
unsigned long long val;
 
-   /* Only read out frequencies from HW when CPU might be boostable
-  to keep the code as short and clean as possible.
-  Otherwise frequencies are exported via ACPI tables.
-   */
-   if (cpu_family < 0x10 || cpu_family == 0x14)
+   /* Only read out frequencies from HW if HW Pstate is supported,
+* otherwise frequencies are exported via ACPI tables.
+*/
+   if (!(cpupower_cpu_info.caps & CPUPOWER_CAP_AMD_HW_PSTATE))
return -1;
 
if (read_msr(cpu, MSR_AMD_PSTATE_LIMIT, ))
diff --git a/tools/power/cpupower/utils/helpers/cpuid.c 
b/tools/power/cpupower/utils/helpers/cpuid.c
index f9a66a430b72..d577220a193b 100644
--- a/tools/power/cpupower/utils/helpers/cpuid.c
+++ b/tools/power/cpupower/utils/helpers/cpuid.c
@@ -128,9 +128,15 @@ int get_cpu_info(struct cpupower_cpu_info *cpu_info)
/* AMD or Hygon Boost state enable/disable register */
if (cpu_info->vendor == X86_VENDOR_AMD ||
cpu_info->vendor == X86_VENDOR_HYGON) {
-   if (ext_cpuid_level >= 0x8007 &&
-   (cpuid_edx(0x8007) & (1 << 9)))
-   cpu_info->caps |= CPUPOWER_CAP_AMD_CPB;
+   if (ext_cpuid_level >= 0x8007) {
+   if (cpuid_edx(0x8007) & (1 << 9))
+   cpu_info->caps |= CPUPOWER_CAP_AMD_CPB;
+
+   if ((cpuid_edx(0x8007) & (1 << 7)) &&
+   cpu_info->family != 0x14)
+   /* HW pstate was not implemented in family 0x14 
*/
+   cpu_info->caps |= CPUPOWER_CAP_AMD_HW_PSTATE;
+   }
 
if (ext_cpuid_level >= 0x8008 &&
cpuid_ebx(0x8008) & (1 << 4))
diff --git a/tools/power/cpupower/utils/helpers/helpers.h 
b/tools/power/cpupower/utils/helpers/helpers.h
index a84f85a9dbd2..5f61eefff5b2 100644
--- a/tools/power/cpupower/utils/helpers/helpers.h
+++ b/tools/power/cpupower/utils/helpers/helpers.h
@@ -70,6 +70,7 @@ enum cpupower_cpu_vendor {X86_VENDOR_UNKNOWN = 0, 
X86_VENDOR_INTEL,
 #define CPUPOWER_CAP_IS_SNB0x0020
 #define CPUPOWER_CAP_INTEL_IDA 0x0040
 #define CPUPOWER_CAP_AMD_RDPRU 0x0080
+#define CPUPOWER_CAP_AMD_HW_PSTATE 0x0100
 
 #define CPUPOWER_AMD_CPBDIS0x0200
 



[PATCH v2 5/8] cpupower: Update family checks when decoding HW pstates

2021-01-25 Thread Nathan Fontenot
The family checks in get_cof() and get_did() need to use the
correct MSR format depending on the family. Add a cpupower
capability for using the pstatedef (family 17h and newer) to
control this instead of direct family checks.

Signed-off-by: Nathan Fontenot 
---
 tools/power/cpupower/utils/helpers/amd.c |8 
 tools/power/cpupower/utils/helpers/cpuid.c   |6 +-
 tools/power/cpupower/utils/helpers/helpers.h |1 +
 3 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/tools/power/cpupower/utils/helpers/amd.c 
b/tools/power/cpupower/utils/helpers/amd.c
index fc2ac1e6bfb2..b4731daa6820 100644
--- a/tools/power/cpupower/utils/helpers/amd.c
+++ b/tools/power/cpupower/utils/helpers/amd.c
@@ -45,10 +45,10 @@ static int get_did(int family, union core_pstate pstate)
 {
int t;
 
-   if (family == 0x12)
-   t = pstate.val & 0xf;
-   else if (family == 0x17 || family == 0x18)
+   if (cpupower_cpu_info.caps & CPUPOWER_CAP_AMD_PSTATEDEF)
t = pstate.pstatedef.did;
+   else if (family == 0x12)
+   t = pstate.val & 0xf;
else
t = pstate.pstate.did;
 
@@ -61,7 +61,7 @@ static int get_cof(int family, union core_pstate pstate)
int fid, did, cof;
 
did = get_did(family, pstate);
-   if (family == 0x17 || family == 0x18) {
+   if (cpupower_cpu_info.caps & CPUPOWER_CAP_AMD_PSTATEDEF) {
fid = pstate.pstatedef.fid;
cof = 200 * fid / did;
} else {
diff --git a/tools/power/cpupower/utils/helpers/cpuid.c 
b/tools/power/cpupower/utils/helpers/cpuid.c
index d577220a193b..db2e88ceb67b 100644
--- a/tools/power/cpupower/utils/helpers/cpuid.c
+++ b/tools/power/cpupower/utils/helpers/cpuid.c
@@ -133,9 +133,13 @@ int get_cpu_info(struct cpupower_cpu_info *cpu_info)
cpu_info->caps |= CPUPOWER_CAP_AMD_CPB;
 
if ((cpuid_edx(0x8007) & (1 << 7)) &&
-   cpu_info->family != 0x14)
+   cpu_info->family != 0x14) {
/* HW pstate was not implemented in family 0x14 
*/
cpu_info->caps |= CPUPOWER_CAP_AMD_HW_PSTATE;
+
+   if (cpu_info->family >= 0x17)
+   cpu_info->caps |= 
CPUPOWER_CAP_AMD_PSTATEDEF;
+   }
}
 
if (ext_cpuid_level >= 0x8008 &&
diff --git a/tools/power/cpupower/utils/helpers/helpers.h 
b/tools/power/cpupower/utils/helpers/helpers.h
index 5f61eefff5b2..e4dc44ced770 100644
--- a/tools/power/cpupower/utils/helpers/helpers.h
+++ b/tools/power/cpupower/utils/helpers/helpers.h
@@ -71,6 +71,7 @@ enum cpupower_cpu_vendor {X86_VENDOR_UNKNOWN = 0, 
X86_VENDOR_INTEL,
 #define CPUPOWER_CAP_INTEL_IDA 0x0040
 #define CPUPOWER_CAP_AMD_RDPRU 0x0080
 #define CPUPOWER_CAP_AMD_HW_PSTATE 0x0100
+#define CPUPOWER_CAP_AMD_PSTATEDEF 0x0200
 
 #define CPUPOWER_AMD_CPBDIS0x0200
 



[PATCH v2 7/8] cpupower: Remove family arg to decode_pstates()

2021-01-25 Thread Nathan Fontenot
The decode_pstates() routine no longer uses the CPU family and
the called routines (get_cof() and get_did()) can grab the family
from the global cpupower_cpu_info struct. This update removes
passing the family arg to all these routines.

Signed-off-by: Nathan Fontenot 
---
 tools/power/cpupower/utils/cpufreq-info.c|3 +--
 tools/power/cpupower/utils/helpers/amd.c |   19 +--
 tools/power/cpupower/utils/helpers/helpers.h |9 -
 3 files changed, 14 insertions(+), 17 deletions(-)

diff --git a/tools/power/cpupower/utils/cpufreq-info.c 
b/tools/power/cpupower/utils/cpufreq-info.c
index 6efc0f6b1b11..f9895e31ff5a 100644
--- a/tools/power/cpupower/utils/cpufreq-info.c
+++ b/tools/power/cpupower/utils/cpufreq-info.c
@@ -186,8 +186,7 @@ static int get_boost_mode_x86(unsigned int cpu)
if ((cpupower_cpu_info.vendor == X86_VENDOR_AMD &&
 cpupower_cpu_info.family >= 0x10) ||
 cpupower_cpu_info.vendor == X86_VENDOR_HYGON) {
-   ret = decode_pstates(cpu, cpupower_cpu_info.family, b_states,
-pstates, _no);
+   ret = decode_pstates(cpu, b_states, pstates, _no);
if (ret)
return ret;
 
diff --git a/tools/power/cpupower/utils/helpers/amd.c 
b/tools/power/cpupower/utils/helpers/amd.c
index 216240e2b771..97f2c857048e 100644
--- a/tools/power/cpupower/utils/helpers/amd.c
+++ b/tools/power/cpupower/utils/helpers/amd.c
@@ -41,13 +41,13 @@ union core_pstate {
unsigned long long val;
 };
 
-static int get_did(int family, union core_pstate pstate)
+static int get_did(union core_pstate pstate)
 {
int t;
 
if (cpupower_cpu_info.caps & CPUPOWER_CAP_AMD_PSTATEDEF)
t = pstate.pstatedef.did;
-   else if (family == 0x12)
+   else if (cpupower_cpu_info.family == 0x12)
t = pstate.val & 0xf;
else
t = pstate.pstate.did;
@@ -55,19 +55,19 @@ static int get_did(int family, union core_pstate pstate)
return t;
 }
 
-static int get_cof(int family, union core_pstate pstate)
+static int get_cof(union core_pstate pstate)
 {
int t;
int fid, did, cof;
 
-   did = get_did(family, pstate);
+   did = get_did(pstate);
if (cpupower_cpu_info.caps & CPUPOWER_CAP_AMD_PSTATEDEF) {
fid = pstate.pstatedef.fid;
cof = 200 * fid / did;
} else {
t = 0x10;
fid = pstate.pstate.fid;
-   if (family == 0x11)
+   if (cpupower_cpu_info.family == 0x11)
t = 0x8;
cof = (100 * (fid + t)) >> did;
}
@@ -76,8 +76,7 @@ static int get_cof(int family, union core_pstate pstate)
 
 /* Needs:
  * cpu  -> the cpu that gets evaluated
- * cpu_family   -> The cpu's family (0x10, 0x12,...)
- * boots_states -> how much boost states the machines support
+ * boost_states -> how much boost states the machines support
  *
  * Fills up:
  * pstates -> a pointer to an array of size MAX_HW_PSTATES
@@ -87,8 +86,8 @@ static int get_cof(int family, union core_pstate pstate)
  *
  * returns zero on success, -1 on failure
  */
-int decode_pstates(unsigned int cpu, unsigned int cpu_family,
-  int boost_states, unsigned long *pstates, int *no)
+int decode_pstates(unsigned int cpu, int boost_states,
+  unsigned long *pstates, int *no)
 {
int i, psmax;
union core_pstate pstate;
@@ -118,7 +117,7 @@ int decode_pstates(unsigned int cpu, unsigned int 
cpu_family,
if (!pstate.pstatedef.en)
continue;
 
-   pstates[i] = get_cof(cpu_family, pstate);
+   pstates[i] = get_cof(pstate);
}
*no = i;
return 0;
diff --git a/tools/power/cpupower/utils/helpers/helpers.h 
b/tools/power/cpupower/utils/helpers/helpers.h
index e4dc44ced770..8a0c11c6ec63 100644
--- a/tools/power/cpupower/utils/helpers/helpers.h
+++ b/tools/power/cpupower/utils/helpers/helpers.h
@@ -127,8 +127,8 @@ extern struct pci_dev *pci_slot_func_init(struct pci_access 
**pacc,
 
 /* AMD HW pstate decoding **/
 
-extern int decode_pstates(unsigned int cpu, unsigned int cpu_family,
- int boost_states, unsigned long *pstates, int *no);
+extern int decode_pstates(unsigned int cpu, int boost_states,
+ unsigned long *pstates, int *no);
 
 /* AMD HW pstate decoding **/
 
@@ -145,9 +145,8 @@ unsigned int cpuid_edx(unsigned int op);
 /* cpuid and cpuinfo helpers  **/
 /* X86 ONLY /
 #else
-static inline int decode_pstates(unsigned int cpu, unsigned int cpu_family,
-int boost_states, unsigned long *pstates,
-int *no)
+static inl

[PATCH v2 8/8] cpupower: Add cpuid cap flag for MSR_AMD_HWCR support

2021-01-25 Thread Nathan Fontenot
Remove the family check for accessing the MSR_AMD_HWCR MSR and replace
it with a cpupower cap flag.

This update also allows for the removal of the local cpupower_cpu_info
variable in cpufreq_has_boost_support() since we no longer need it to
check the family.

Signed-off-by: Nathan Fontenot 
---
 tools/power/cpupower/utils/helpers/cpuid.c   |6 +-
 tools/power/cpupower/utils/helpers/helpers.h |1 +
 tools/power/cpupower/utils/helpers/misc.c|7 +--
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/tools/power/cpupower/utils/helpers/cpuid.c 
b/tools/power/cpupower/utils/helpers/cpuid.c
index db2e88ceb67b..72eb43593180 100644
--- a/tools/power/cpupower/utils/helpers/cpuid.c
+++ b/tools/power/cpupower/utils/helpers/cpuid.c
@@ -129,9 +129,13 @@ int get_cpu_info(struct cpupower_cpu_info *cpu_info)
if (cpu_info->vendor == X86_VENDOR_AMD ||
cpu_info->vendor == X86_VENDOR_HYGON) {
if (ext_cpuid_level >= 0x8007) {
-   if (cpuid_edx(0x8007) & (1 << 9))
+   if (cpuid_edx(0x8007) & (1 << 9)) {
cpu_info->caps |= CPUPOWER_CAP_AMD_CPB;
 
+   if (cpu_info->family >= 0x17)
+   cpu_info->caps |= 
CPUPOWER_CAP_AMD_CPB_MSR;
+   }
+
if ((cpuid_edx(0x8007) & (1 << 7)) &&
cpu_info->family != 0x14) {
/* HW pstate was not implemented in family 0x14 
*/
diff --git a/tools/power/cpupower/utils/helpers/helpers.h 
b/tools/power/cpupower/utils/helpers/helpers.h
index 8a0c11c6ec63..33ffacee7fcb 100644
--- a/tools/power/cpupower/utils/helpers/helpers.h
+++ b/tools/power/cpupower/utils/helpers/helpers.h
@@ -72,6 +72,7 @@ enum cpupower_cpu_vendor {X86_VENDOR_UNKNOWN = 0, 
X86_VENDOR_INTEL,
 #define CPUPOWER_CAP_AMD_RDPRU 0x0080
 #define CPUPOWER_CAP_AMD_HW_PSTATE 0x0100
 #define CPUPOWER_CAP_AMD_PSTATEDEF 0x0200
+#define CPUPOWER_CAP_AMD_CPB_MSR   0x0400
 
 #define CPUPOWER_AMD_CPBDIS0x0200
 
diff --git a/tools/power/cpupower/utils/helpers/misc.c 
b/tools/power/cpupower/utils/helpers/misc.c
index f9bcce9c72d5..fc6e34511721 100644
--- a/tools/power/cpupower/utils/helpers/misc.c
+++ b/tools/power/cpupower/utils/helpers/misc.c
@@ -16,16 +16,11 @@
 int cpufreq_has_boost_support(unsigned int cpu, int *support, int *active,
int *states)
 {
-   struct cpupower_cpu_info cpu_info;
int ret;
unsigned long long val;
 
*support = *active = *states = 0;
 
-   ret = get_cpu_info(_info);
-   if (ret)
-   return ret;
-
if (cpupower_cpu_info.caps & CPUPOWER_CAP_AMD_CPB) {
*support = 1;
 
@@ -34,7 +29,7 @@ int cpufreq_has_boost_support(unsigned int cpu, int *support, 
int *active,
 * has Hardware determined variable increments instead.
 */
 
-   if (cpu_info.family == 0x17 || cpu_info.family == 0x18) {
+   if (cpupower_cpu_info.caps & CPUPOWER_CAP_AMD_CPB_MSR) {
if (!read_msr(cpu, MSR_AMD_HWCR, )) {
if (!(val & CPUPOWER_AMD_CPBDIS))
*active = 1;



[PATCH v2 2/8] cpupower: Correct macro name for CPB caps flag

2021-01-25 Thread Nathan Fontenot
From: Robert Richter 

The name is Core Performance Boost (CPB) for the cpuid flag. Correct
cpuid caps flag to use this name (instead of CBP).

Signed-off-by: Robert Richter 
Signed-off-by: Nathan Fontenot 
---
 tools/power/cpupower/utils/helpers/cpuid.c   |2 +-
 tools/power/cpupower/utils/helpers/helpers.h |2 +-
 tools/power/cpupower/utils/helpers/misc.c|2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tools/power/cpupower/utils/helpers/cpuid.c 
b/tools/power/cpupower/utils/helpers/cpuid.c
index 73bfafc60e9b..f9a66a430b72 100644
--- a/tools/power/cpupower/utils/helpers/cpuid.c
+++ b/tools/power/cpupower/utils/helpers/cpuid.c
@@ -130,7 +130,7 @@ int get_cpu_info(struct cpupower_cpu_info *cpu_info)
cpu_info->vendor == X86_VENDOR_HYGON) {
if (ext_cpuid_level >= 0x8007 &&
(cpuid_edx(0x8007) & (1 << 9)))
-   cpu_info->caps |= CPUPOWER_CAP_AMD_CBP;
+   cpu_info->caps |= CPUPOWER_CAP_AMD_CPB;
 
if (ext_cpuid_level >= 0x8008 &&
cpuid_ebx(0x8008) & (1 << 4))
diff --git a/tools/power/cpupower/utils/helpers/helpers.h 
b/tools/power/cpupower/utils/helpers/helpers.h
index 0642e60a6ce1..a84f85a9dbd2 100644
--- a/tools/power/cpupower/utils/helpers/helpers.h
+++ b/tools/power/cpupower/utils/helpers/helpers.h
@@ -64,7 +64,7 @@ enum cpupower_cpu_vendor {X86_VENDOR_UNKNOWN = 0, 
X86_VENDOR_INTEL,
 
 #define CPUPOWER_CAP_INV_TSC   0x0001
 #define CPUPOWER_CAP_APERF 0x0002
-#define CPUPOWER_CAP_AMD_CBP   0x0004
+#define CPUPOWER_CAP_AMD_CPB   0x0004
 #define CPUPOWER_CAP_PERF_BIAS 0x0008
 #define CPUPOWER_CAP_HAS_TURBO_RATIO   0x0010
 #define CPUPOWER_CAP_IS_SNB0x0020
diff --git a/tools/power/cpupower/utils/helpers/misc.c 
b/tools/power/cpupower/utils/helpers/misc.c
index 650b9a9a6584..f9bcce9c72d5 100644
--- a/tools/power/cpupower/utils/helpers/misc.c
+++ b/tools/power/cpupower/utils/helpers/misc.c
@@ -26,7 +26,7 @@ int cpufreq_has_boost_support(unsigned int cpu, int *support, 
int *active,
if (ret)
return ret;
 
-   if (cpupower_cpu_info.caps & CPUPOWER_CAP_AMD_CBP) {
+   if (cpupower_cpu_info.caps & CPUPOWER_CAP_AMD_CPB) {
*support = 1;
 
/* AMD Family 0x17 does not utilize PCI D18F4 like prior



[PATCH v2 0/8] cpupower: Updates and cleanup to support AMD Family 0x19

2021-01-25 Thread Nathan Fontenot
Updates to the cpupower command to add support for AMD family 0x19
and cleanup the code to remove many of the family checks to hopefully
make any future family updates easier.

The first couple of patches are simple updates to rename the structs
in the msr_pstate union to better reflect current support and correcting
the name of the CPUPOWER_CAP_AMD_CPB cpuid cap flag.

Patches 3, 5, and 8 update the family checks to either replace
them with a new cpuid cap flag based off of cpuid checks or check for
family >= 0x17 where removing the direct family check isn't possible.

The remaining patches are cleanups to remove unneeded extra enabled bit
checking, remove passing no longer used variables, and remove unused
variables in decode_pstates().
---

Updates for v2:
- Patch 1/8: Add links to AMD PPR and BKDG to commit message.
- Patch 3/8: Update and add back removed comment.

---

Nathan Fontenot (7):
  cpupower: Update msr_pstate union struct naming
  cpupower: Add CPUPOWER_CAP_AMD_HW_PSTATE cpuid caps flag
  cpupower: Remove unused pscur variable.
  cpupower: Update family checks when decoding HW pstates
  cpupower: Condense pstate enabled bit checks in decode_pstates()
  cpupower: Remove family arg to decode_pstates()
  cpupower: Add cpuid cap flag for MSR_AMD_HWCR support

Robert Richter (1):
  cpupower: Correct macro name for CPB caps flag


 tools/power/cpupower/utils/cpufreq-info.c|3 -
 tools/power/cpupower/utils/helpers/amd.c |   65 --
 tools/power/cpupower/utils/helpers/cpuid.c   |   20 +++-
 tools/power/cpupower/utils/helpers/helpers.h |   14 +++---
 tools/power/cpupower/utils/helpers/misc.c|9 +---
 5 files changed, 57 insertions(+), 54 deletions(-)

--
Nathan Fontenot


[PATCH v2 1/8] cpupower: Update msr_pstate union struct naming

2021-01-25 Thread Nathan Fontenot
The msr_pstate union struct named fam17h_bits is misleading since
this is the struct to use for all families >= 0x17, not just
for family 0x17. Rename the bits structs to be 'pstate' (for pre
family 17h CPUs) and 'pstatedef' (for CPUs since fam 17h) to align
closer with PPR/BKDG (1) naming.

There are no functional changes as part of this update.

1: AMD Processor Programming Reference (PPR) and BIOS and
Kernel Developer's Guide (BKDG) available at:
http://developer.amd.com/resources/developer-guides-manuals

Signed-off-by: Nathan Fontenot 
---

Updates for v2: Add links to PPR/BKDG in commit message
---
 tools/power/cpupower/utils/helpers/amd.c |   26 ++
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/tools/power/cpupower/utils/helpers/amd.c 
b/tools/power/cpupower/utils/helpers/amd.c
index 7c4f83a8c973..34368436bbd6 100644
--- a/tools/power/cpupower/utils/helpers/amd.c
+++ b/tools/power/cpupower/utils/helpers/amd.c
@@ -13,7 +13,8 @@
 #define MSR_AMD_PSTATE 0xc0010064
 #define MSR_AMD_PSTATE_LIMIT   0xc0010061
 
-union msr_pstate {
+union core_pstate {
+   /* pre fam 17h: */
struct {
unsigned fid:6;
unsigned did:3;
@@ -26,7 +27,8 @@ union msr_pstate {
unsigned idddiv:2;
unsigned res3:21;
unsigned en:1;
-   } bits;
+   } pstate;
+   /* since fam 17h: */
struct {
unsigned fid:8;
unsigned did:6;
@@ -35,36 +37,36 @@ union msr_pstate {
unsigned idddiv:2;
unsigned res1:31;
unsigned en:1;
-   } fam17h_bits;
+   } pstatedef;
unsigned long long val;
 };
 
-static int get_did(int family, union msr_pstate pstate)
+static int get_did(int family, union core_pstate pstate)
 {
int t;
 
if (family == 0x12)
t = pstate.val & 0xf;
else if (family == 0x17 || family == 0x18)
-   t = pstate.fam17h_bits.did;
+   t = pstate.pstatedef.did;
else
-   t = pstate.bits.did;
+   t = pstate.pstate.did;
 
return t;
 }
 
-static int get_cof(int family, union msr_pstate pstate)
+static int get_cof(int family, union core_pstate pstate)
 {
int t;
int fid, did, cof;
 
did = get_did(family, pstate);
if (family == 0x17 || family == 0x18) {
-   fid = pstate.fam17h_bits.fid;
+   fid = pstate.pstatedef.fid;
cof = 200 * fid / did;
} else {
t = 0x10;
-   fid = pstate.bits.fid;
+   fid = pstate.pstate.fid;
if (family == 0x11)
t = 0x8;
cof = (100 * (fid + t)) >> did;
@@ -89,7 +91,7 @@ int decode_pstates(unsigned int cpu, unsigned int cpu_family,
   int boost_states, unsigned long *pstates, int *no)
 {
int i, psmax, pscur;
-   union msr_pstate pstate;
+   union core_pstate pstate;
unsigned long long val;
 
/* Only read out frequencies from HW when CPU might be boostable
@@ -119,9 +121,9 @@ int decode_pstates(unsigned int cpu, unsigned int 
cpu_family,
}
if (read_msr(cpu, MSR_AMD_PSTATE + i, ))
return -1;
-   if ((cpu_family == 0x17) && (!pstate.fam17h_bits.en))
+   if ((cpu_family == 0x17) && (!pstate.pstatedef.en))
continue;
-   else if (!pstate.bits.en)
+   else if (!pstate.pstate.en)
continue;
 
pstates[i] = get_cof(cpu_family, pstate);



[PATCH 4/8] cpupower: Remove unused pscur variable.

2021-01-22 Thread Nathan Fontenot
The pscur variable is set but not used; just remove it.

This may have previously been set to validate the MSR_AMD_PSTATE_STATUS
MSR. With the addition of the CPUPOWER_CAP_AMD_HW_PSTATE cap flag this
is no longer needed since the cpuid bit to enable this cap flag also
validates that the MSR_AMD_PSTATE_STATUS MSR is present.

Signed-off-by: Nathan Fontenot 
---
 tools/power/cpupower/utils/helpers/amd.c |9 +
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/tools/power/cpupower/utils/helpers/amd.c 
b/tools/power/cpupower/utils/helpers/amd.c
index 496844a20fe2..bd4db2e9a8a0 100644
--- a/tools/power/cpupower/utils/helpers/amd.c
+++ b/tools/power/cpupower/utils/helpers/amd.c
@@ -90,7 +90,7 @@ static int get_cof(int family, union core_pstate pstate)
 int decode_pstates(unsigned int cpu, unsigned int cpu_family,
   int boost_states, unsigned long *pstates, int *no)
 {
-   int i, psmax, pscur;
+   int i, psmax;
union core_pstate pstate;
unsigned long long val;
 
@@ -101,13 +101,6 @@ int decode_pstates(unsigned int cpu, unsigned int 
cpu_family,
return -1;
 
psmax = (val >> 4) & 0x7;
-
-   if (read_msr(cpu, MSR_AMD_PSTATE_STATUS, ))
-   return -1;
-
-   pscur = val & 0x7;
-
-   pscur += boost_states;
psmax += boost_states;
for (i = 0; i <= psmax; i++) {
if (i >= MAX_HW_PSTATES) {



[PATCH 8/8] cpupower: Add cpuid cap flag for MSR_AMD_HWCR support

2021-01-22 Thread Nathan Fontenot
Remove the family check for accessing the MSR_AMD_HWCR MSR and replace
it with a cpupower cap flag.

This update also allows for the removal of the local cpupower_cpu_info
variable in cpufreq_has_boost_support() since we no longer need it to
check the family.

Signed-off-by: Nathan Fontenot 
---
 tools/power/cpupower/utils/helpers/cpuid.c   |6 +-
 tools/power/cpupower/utils/helpers/helpers.h |1 +
 tools/power/cpupower/utils/helpers/misc.c|7 +--
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/tools/power/cpupower/utils/helpers/cpuid.c 
b/tools/power/cpupower/utils/helpers/cpuid.c
index db2e88ceb67b..72eb43593180 100644
--- a/tools/power/cpupower/utils/helpers/cpuid.c
+++ b/tools/power/cpupower/utils/helpers/cpuid.c
@@ -129,9 +129,13 @@ int get_cpu_info(struct cpupower_cpu_info *cpu_info)
if (cpu_info->vendor == X86_VENDOR_AMD ||
cpu_info->vendor == X86_VENDOR_HYGON) {
if (ext_cpuid_level >= 0x8007) {
-   if (cpuid_edx(0x8007) & (1 << 9))
+   if (cpuid_edx(0x8007) & (1 << 9)) {
cpu_info->caps |= CPUPOWER_CAP_AMD_CPB;
 
+   if (cpu_info->family >= 0x17)
+   cpu_info->caps |= 
CPUPOWER_CAP_AMD_CPB_MSR;
+   }
+
if ((cpuid_edx(0x8007) & (1 << 7)) &&
cpu_info->family != 0x14) {
/* HW pstate was not implemented in family 0x14 
*/
diff --git a/tools/power/cpupower/utils/helpers/helpers.h 
b/tools/power/cpupower/utils/helpers/helpers.h
index 8a0c11c6ec63..33ffacee7fcb 100644
--- a/tools/power/cpupower/utils/helpers/helpers.h
+++ b/tools/power/cpupower/utils/helpers/helpers.h
@@ -72,6 +72,7 @@ enum cpupower_cpu_vendor {X86_VENDOR_UNKNOWN = 0, 
X86_VENDOR_INTEL,
 #define CPUPOWER_CAP_AMD_RDPRU 0x0080
 #define CPUPOWER_CAP_AMD_HW_PSTATE 0x0100
 #define CPUPOWER_CAP_AMD_PSTATEDEF 0x0200
+#define CPUPOWER_CAP_AMD_CPB_MSR   0x0400
 
 #define CPUPOWER_AMD_CPBDIS0x0200
 
diff --git a/tools/power/cpupower/utils/helpers/misc.c 
b/tools/power/cpupower/utils/helpers/misc.c
index f9bcce9c72d5..fc6e34511721 100644
--- a/tools/power/cpupower/utils/helpers/misc.c
+++ b/tools/power/cpupower/utils/helpers/misc.c
@@ -16,16 +16,11 @@
 int cpufreq_has_boost_support(unsigned int cpu, int *support, int *active,
int *states)
 {
-   struct cpupower_cpu_info cpu_info;
int ret;
unsigned long long val;
 
*support = *active = *states = 0;
 
-   ret = get_cpu_info(_info);
-   if (ret)
-   return ret;
-
if (cpupower_cpu_info.caps & CPUPOWER_CAP_AMD_CPB) {
*support = 1;
 
@@ -34,7 +29,7 @@ int cpufreq_has_boost_support(unsigned int cpu, int *support, 
int *active,
 * has Hardware determined variable increments instead.
 */
 
-   if (cpu_info.family == 0x17 || cpu_info.family == 0x18) {
+   if (cpupower_cpu_info.caps & CPUPOWER_CAP_AMD_CPB_MSR) {
if (!read_msr(cpu, MSR_AMD_HWCR, )) {
if (!(val & CPUPOWER_AMD_CPBDIS))
*active = 1;



[PATCH 7/8] cpupower: Remove family arg to decode_pstates()

2021-01-22 Thread Nathan Fontenot
The decode_pstates() routine no longer uses the CPU family and
the called routines (get_cof() and get_did()) can grab the family
from the global cpupower_cpu_info struct. This update removes
passing the family arg to all these routines.

Signed-off-by: Nathan Fontenot 
---
 tools/power/cpupower/utils/cpufreq-info.c|3 +--
 tools/power/cpupower/utils/helpers/amd.c |   19 +--
 tools/power/cpupower/utils/helpers/helpers.h |9 -
 3 files changed, 14 insertions(+), 17 deletions(-)

diff --git a/tools/power/cpupower/utils/cpufreq-info.c 
b/tools/power/cpupower/utils/cpufreq-info.c
index 6efc0f6b1b11..f9895e31ff5a 100644
--- a/tools/power/cpupower/utils/cpufreq-info.c
+++ b/tools/power/cpupower/utils/cpufreq-info.c
@@ -186,8 +186,7 @@ static int get_boost_mode_x86(unsigned int cpu)
if ((cpupower_cpu_info.vendor == X86_VENDOR_AMD &&
 cpupower_cpu_info.family >= 0x10) ||
 cpupower_cpu_info.vendor == X86_VENDOR_HYGON) {
-   ret = decode_pstates(cpu, cpupower_cpu_info.family, b_states,
-pstates, _no);
+   ret = decode_pstates(cpu, b_states, pstates, _no);
if (ret)
return ret;
 
diff --git a/tools/power/cpupower/utils/helpers/amd.c 
b/tools/power/cpupower/utils/helpers/amd.c
index 20694c3f367b..01bb85121216 100644
--- a/tools/power/cpupower/utils/helpers/amd.c
+++ b/tools/power/cpupower/utils/helpers/amd.c
@@ -41,13 +41,13 @@ union core_pstate {
unsigned long long val;
 };
 
-static int get_did(int family, union core_pstate pstate)
+static int get_did(union core_pstate pstate)
 {
int t;
 
if (cpupower_cpu_info.caps & CPUPOWER_CAP_AMD_PSTATEDEF)
t = pstate.pstatedef.did;
-   else if (family == 0x12)
+   else if (cpupower_cpu_info.family == 0x12)
t = pstate.val & 0xf;
else
t = pstate.pstate.did;
@@ -55,19 +55,19 @@ static int get_did(int family, union core_pstate pstate)
return t;
 }
 
-static int get_cof(int family, union core_pstate pstate)
+static int get_cof(union core_pstate pstate)
 {
int t;
int fid, did, cof;
 
-   did = get_did(family, pstate);
+   did = get_did(pstate);
if (cpupower_cpu_info.caps & CPUPOWER_CAP_AMD_PSTATEDEF) {
fid = pstate.pstatedef.fid;
cof = 200 * fid / did;
} else {
t = 0x10;
fid = pstate.pstate.fid;
-   if (family == 0x11)
+   if (cpupower_cpu_info.family == 0x11)
t = 0x8;
cof = (100 * (fid + t)) >> did;
}
@@ -76,8 +76,7 @@ static int get_cof(int family, union core_pstate pstate)
 
 /* Needs:
  * cpu  -> the cpu that gets evaluated
- * cpu_family   -> The cpu's family (0x10, 0x12,...)
- * boots_states -> how much boost states the machines support
+ * boost_states -> how much boost states the machines support
  *
  * Fills up:
  * pstates -> a pointer to an array of size MAX_HW_PSTATES
@@ -87,8 +86,8 @@ static int get_cof(int family, union core_pstate pstate)
  *
  * returns zero on success, -1 on failure
  */
-int decode_pstates(unsigned int cpu, unsigned int cpu_family,
-  int boost_states, unsigned long *pstates, int *no)
+int decode_pstates(unsigned int cpu, int boost_states,
+  unsigned long *pstates, int *no)
 {
int i, psmax;
union core_pstate pstate;
@@ -115,7 +114,7 @@ int decode_pstates(unsigned int cpu, unsigned int 
cpu_family,
if (!pstate.pstatedef.en)
continue;
 
-   pstates[i] = get_cof(cpu_family, pstate);
+   pstates[i] = get_cof(pstate);
}
*no = i;
return 0;
diff --git a/tools/power/cpupower/utils/helpers/helpers.h 
b/tools/power/cpupower/utils/helpers/helpers.h
index e4dc44ced770..8a0c11c6ec63 100644
--- a/tools/power/cpupower/utils/helpers/helpers.h
+++ b/tools/power/cpupower/utils/helpers/helpers.h
@@ -127,8 +127,8 @@ extern struct pci_dev *pci_slot_func_init(struct pci_access 
**pacc,
 
 /* AMD HW pstate decoding **/
 
-extern int decode_pstates(unsigned int cpu, unsigned int cpu_family,
- int boost_states, unsigned long *pstates, int *no);
+extern int decode_pstates(unsigned int cpu, int boost_states,
+ unsigned long *pstates, int *no);
 
 /* AMD HW pstate decoding **/
 
@@ -145,9 +145,8 @@ unsigned int cpuid_edx(unsigned int op);
 /* cpuid and cpuinfo helpers  **/
 /* X86 ONLY /
 #else
-static inline int decode_pstates(unsigned int cpu, unsigned int cpu_family,
-int boost_states, unsigned long *pstates,
-int *no)
+static inl

[PATCH 6/8] cpupower: Condense pstate enabled bit checks in decode_pstates()

2021-01-22 Thread Nathan Fontenot
The enabled bit (bit 63) is common for all families so we can remove
the multiple enabled checks based on family and have a common check
for HW pstate enabled.

Signed-off-by: Nathan Fontenot 
---
 tools/power/cpupower/utils/helpers/amd.c |6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tools/power/cpupower/utils/helpers/amd.c 
b/tools/power/cpupower/utils/helpers/amd.c
index 519a21e92666..20694c3f367b 100644
--- a/tools/power/cpupower/utils/helpers/amd.c
+++ b/tools/power/cpupower/utils/helpers/amd.c
@@ -110,9 +110,9 @@ int decode_pstates(unsigned int cpu, unsigned int 
cpu_family,
}
if (read_msr(cpu, MSR_AMD_PSTATE + i, ))
return -1;
-   if ((cpu_family == 0x17) && (!pstate.pstatedef.en))
-   continue;
-   else if (!pstate.pstate.en)
+
+   /* The enabled bit (bit 63) is common for all families */
+   if (!pstate.pstatedef.en)
continue;
 
pstates[i] = get_cof(cpu_family, pstate);



[PATCH 2/8] cpupower: Correct macro name for CPB caps flag

2021-01-22 Thread Nathan Fontenot
From: Robert Richter 

The name is Core Performance Boost (CPB) for the cpuid flag. Correct
cpuid caps flag to use this name (instead of CBP).

Signed-off-by: Robert Richter 
Signed-off-by: Nathan Fontenot 
---
 tools/power/cpupower/utils/helpers/cpuid.c   |2 +-
 tools/power/cpupower/utils/helpers/helpers.h |2 +-
 tools/power/cpupower/utils/helpers/misc.c|2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tools/power/cpupower/utils/helpers/cpuid.c 
b/tools/power/cpupower/utils/helpers/cpuid.c
index 73bfafc60e9b..f9a66a430b72 100644
--- a/tools/power/cpupower/utils/helpers/cpuid.c
+++ b/tools/power/cpupower/utils/helpers/cpuid.c
@@ -130,7 +130,7 @@ int get_cpu_info(struct cpupower_cpu_info *cpu_info)
cpu_info->vendor == X86_VENDOR_HYGON) {
if (ext_cpuid_level >= 0x8007 &&
(cpuid_edx(0x8007) & (1 << 9)))
-   cpu_info->caps |= CPUPOWER_CAP_AMD_CBP;
+   cpu_info->caps |= CPUPOWER_CAP_AMD_CPB;
 
if (ext_cpuid_level >= 0x8008 &&
cpuid_ebx(0x8008) & (1 << 4))
diff --git a/tools/power/cpupower/utils/helpers/helpers.h 
b/tools/power/cpupower/utils/helpers/helpers.h
index 0642e60a6ce1..a84f85a9dbd2 100644
--- a/tools/power/cpupower/utils/helpers/helpers.h
+++ b/tools/power/cpupower/utils/helpers/helpers.h
@@ -64,7 +64,7 @@ enum cpupower_cpu_vendor {X86_VENDOR_UNKNOWN = 0, 
X86_VENDOR_INTEL,
 
 #define CPUPOWER_CAP_INV_TSC   0x0001
 #define CPUPOWER_CAP_APERF 0x0002
-#define CPUPOWER_CAP_AMD_CBP   0x0004
+#define CPUPOWER_CAP_AMD_CPB   0x0004
 #define CPUPOWER_CAP_PERF_BIAS 0x0008
 #define CPUPOWER_CAP_HAS_TURBO_RATIO   0x0010
 #define CPUPOWER_CAP_IS_SNB0x0020
diff --git a/tools/power/cpupower/utils/helpers/misc.c 
b/tools/power/cpupower/utils/helpers/misc.c
index 650b9a9a6584..f9bcce9c72d5 100644
--- a/tools/power/cpupower/utils/helpers/misc.c
+++ b/tools/power/cpupower/utils/helpers/misc.c
@@ -26,7 +26,7 @@ int cpufreq_has_boost_support(unsigned int cpu, int *support, 
int *active,
if (ret)
return ret;
 
-   if (cpupower_cpu_info.caps & CPUPOWER_CAP_AMD_CBP) {
+   if (cpupower_cpu_info.caps & CPUPOWER_CAP_AMD_CPB) {
*support = 1;
 
/* AMD Family 0x17 does not utilize PCI D18F4 like prior



[PATCH 5/8] cpupower: Update family checks when decoding HW pstates

2021-01-22 Thread Nathan Fontenot
The family checks in get_cof() and get_did() need to use the
correct MSR format depending on the family. Add a cpupower
capability for using the pstatedef (family 17h and newer) to
control this instead of direct family checks.

Signed-off-by: Nathan Fontenot 
---
 tools/power/cpupower/utils/helpers/amd.c |8 
 tools/power/cpupower/utils/helpers/cpuid.c   |6 +-
 tools/power/cpupower/utils/helpers/helpers.h |1 +
 3 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/tools/power/cpupower/utils/helpers/amd.c 
b/tools/power/cpupower/utils/helpers/amd.c
index bd4db2e9a8a0..519a21e92666 100644
--- a/tools/power/cpupower/utils/helpers/amd.c
+++ b/tools/power/cpupower/utils/helpers/amd.c
@@ -45,10 +45,10 @@ static int get_did(int family, union core_pstate pstate)
 {
int t;
 
-   if (family == 0x12)
-   t = pstate.val & 0xf;
-   else if (family == 0x17 || family == 0x18)
+   if (cpupower_cpu_info.caps & CPUPOWER_CAP_AMD_PSTATEDEF)
t = pstate.pstatedef.did;
+   else if (family == 0x12)
+   t = pstate.val & 0xf;
else
t = pstate.pstate.did;
 
@@ -61,7 +61,7 @@ static int get_cof(int family, union core_pstate pstate)
int fid, did, cof;
 
did = get_did(family, pstate);
-   if (family == 0x17 || family == 0x18) {
+   if (cpupower_cpu_info.caps & CPUPOWER_CAP_AMD_PSTATEDEF) {
fid = pstate.pstatedef.fid;
cof = 200 * fid / did;
} else {
diff --git a/tools/power/cpupower/utils/helpers/cpuid.c 
b/tools/power/cpupower/utils/helpers/cpuid.c
index d577220a193b..db2e88ceb67b 100644
--- a/tools/power/cpupower/utils/helpers/cpuid.c
+++ b/tools/power/cpupower/utils/helpers/cpuid.c
@@ -133,9 +133,13 @@ int get_cpu_info(struct cpupower_cpu_info *cpu_info)
cpu_info->caps |= CPUPOWER_CAP_AMD_CPB;
 
if ((cpuid_edx(0x8007) & (1 << 7)) &&
-   cpu_info->family != 0x14)
+   cpu_info->family != 0x14) {
/* HW pstate was not implemented in family 0x14 
*/
cpu_info->caps |= CPUPOWER_CAP_AMD_HW_PSTATE;
+
+   if (cpu_info->family >= 0x17)
+   cpu_info->caps |= 
CPUPOWER_CAP_AMD_PSTATEDEF;
+   }
}
 
if (ext_cpuid_level >= 0x8008 &&
diff --git a/tools/power/cpupower/utils/helpers/helpers.h 
b/tools/power/cpupower/utils/helpers/helpers.h
index 5f61eefff5b2..e4dc44ced770 100644
--- a/tools/power/cpupower/utils/helpers/helpers.h
+++ b/tools/power/cpupower/utils/helpers/helpers.h
@@ -71,6 +71,7 @@ enum cpupower_cpu_vendor {X86_VENDOR_UNKNOWN = 0, 
X86_VENDOR_INTEL,
 #define CPUPOWER_CAP_INTEL_IDA 0x0040
 #define CPUPOWER_CAP_AMD_RDPRU 0x0080
 #define CPUPOWER_CAP_AMD_HW_PSTATE 0x0100
+#define CPUPOWER_CAP_AMD_PSTATEDEF 0x0200
 
 #define CPUPOWER_AMD_CPBDIS0x0200
 



[PATCH 3/8] cpupower: Add CPUPOWER_CAP_AMD_HW_PSTATE cpuid caps flag

2021-01-22 Thread Nathan Fontenot
Add a check in get_cpu_info() for the ability to read frequencies
from hardware and set the CPUPOWER_CAP_AMD_HW_PSTATE cpuid flag.
The cpuid flag is set when CPUID_80000007_EDX[7] is set,
which is all families >= 10h. The check excludes family 14h
because HW pstate reporting was not implemented on family 14h.

This is intended to reduce family checks in the main code paths.

Signed-off-by: Nathan Fontenot 
---
 tools/power/cpupower/utils/helpers/amd.c |6 +-
 tools/power/cpupower/utils/helpers/cpuid.c   |   12 +---
 tools/power/cpupower/utils/helpers/helpers.h |1 +
 3 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/tools/power/cpupower/utils/helpers/amd.c 
b/tools/power/cpupower/utils/helpers/amd.c
index 34368436bbd6..496844a20fe2 100644
--- a/tools/power/cpupower/utils/helpers/amd.c
+++ b/tools/power/cpupower/utils/helpers/amd.c
@@ -94,11 +94,7 @@ int decode_pstates(unsigned int cpu, unsigned int cpu_family,
union core_pstate pstate;
unsigned long long val;
 
-   /* Only read out frequencies from HW when CPU might be boostable
-  to keep the code as short and clean as possible.
-  Otherwise frequencies are exported via ACPI tables.
-   */
-   if (cpu_family < 0x10 || cpu_family == 0x14)
+   if (!(cpupower_cpu_info.caps & CPUPOWER_CAP_AMD_HW_PSTATE))
return -1;
 
if (read_msr(cpu, MSR_AMD_PSTATE_LIMIT, ))
diff --git a/tools/power/cpupower/utils/helpers/cpuid.c 
b/tools/power/cpupower/utils/helpers/cpuid.c
index f9a66a430b72..d577220a193b 100644
--- a/tools/power/cpupower/utils/helpers/cpuid.c
+++ b/tools/power/cpupower/utils/helpers/cpuid.c
@@ -128,9 +128,15 @@ int get_cpu_info(struct cpupower_cpu_info *cpu_info)
/* AMD or Hygon Boost state enable/disable register */
if (cpu_info->vendor == X86_VENDOR_AMD ||
cpu_info->vendor == X86_VENDOR_HYGON) {
-   if (ext_cpuid_level >= 0x8007 &&
-   (cpuid_edx(0x8007) & (1 << 9)))
-   cpu_info->caps |= CPUPOWER_CAP_AMD_CPB;
+   if (ext_cpuid_level >= 0x8007) {
+   if (cpuid_edx(0x8007) & (1 << 9))
+   cpu_info->caps |= CPUPOWER_CAP_AMD_CPB;
+
+   if ((cpuid_edx(0x8007) & (1 << 7)) &&
+   cpu_info->family != 0x14)
+   /* HW pstate was not implemented in family 0x14 
*/
+   cpu_info->caps |= CPUPOWER_CAP_AMD_HW_PSTATE;
+   }
 
if (ext_cpuid_level >= 0x8008 &&
cpuid_ebx(0x8008) & (1 << 4))
diff --git a/tools/power/cpupower/utils/helpers/helpers.h 
b/tools/power/cpupower/utils/helpers/helpers.h
index a84f85a9dbd2..5f61eefff5b2 100644
--- a/tools/power/cpupower/utils/helpers/helpers.h
+++ b/tools/power/cpupower/utils/helpers/helpers.h
@@ -70,6 +70,7 @@ enum cpupower_cpu_vendor {X86_VENDOR_UNKNOWN = 0, 
X86_VENDOR_INTEL,
 #define CPUPOWER_CAP_IS_SNB0x0020
 #define CPUPOWER_CAP_INTEL_IDA 0x0040
 #define CPUPOWER_CAP_AMD_RDPRU 0x0080
+#define CPUPOWER_CAP_AMD_HW_PSTATE 0x0100
 
 #define CPUPOWER_AMD_CPBDIS0x0200
 



[PATCH 0/8] cpupower: Updates and cleanup to support AMD Family 0x19

2021-01-22 Thread Nathan Fontenot
Updates to the cpupower command to add support for AMD family 0x19
and clean up the code to remove many of the family checks to hopefully
make any future family updates easier.

The first couple of patches are simple updates to rename the structs
in the msr_pstate union to better reflect current support and correcting
the name of the CPUPOWER_CAP_AMD_CPB cpuid cap flag.

Patches 3, 5, and 8 update the family checks to either replace
them with a new cpuid cap flag based off of cpuid checks or check for
family >= 0x17 where removing the direct family check isn't possible.

The remaining patches are cleanups to remove unneeded extra enabled bit
checking, remove passing no longer used variables, and remove unused
variables in decode_pstates().

---

Nathan Fontenot (7):
  cpupower: Update msr_pstate union struct naming
  cpupower: Add CPUPOWER_CAP_AMD_HW_PSTATE cpuid caps flag
  cpupower: Remove unused pscur variable.
  cpupower: Update family checks when decoding HW pstates
  cpupower: Condense pstate enabled bit checks in decode_pstates()
  cpupower: Remove family arg to decode_pstates()
  cpupower: Add cpuid cap flag for MSR_AMD_HWCR support

Robert Richter (1):
  cpupower: Correct macro name for CPB caps flag


 tools/power/cpupower/utils/cpufreq-info.c|3 -
 tools/power/cpupower/utils/helpers/amd.c |   62 +++---
 tools/power/cpupower/utils/helpers/cpuid.c   |   20 +++-
 tools/power/cpupower/utils/helpers/helpers.h |   14 +++---
 tools/power/cpupower/utils/helpers/misc.c|9 +---
 5 files changed, 54 insertions(+), 54 deletions(-)

--
Nathan Fontenot


[PATCH 1/8] cpupower: Update msr_pstate union struct naming

2021-01-22 Thread Nathan Fontenot
The msr_pstate union struct named fam17h_bits is misleading since
this is the struct to use for all families >= 0x17, not just
for family 0x17. Rename the bits structs to be 'pstate' (for pre
family 17h CPUs) and 'pstatedef' (for CPUs since fam 17h) to align
closer with PPR/BKDG naming.

There are no functional changes as part of this update.

Signed-off-by: Nathan Fontenot 
---
 tools/power/cpupower/utils/helpers/amd.c |   26 ++
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/tools/power/cpupower/utils/helpers/amd.c 
b/tools/power/cpupower/utils/helpers/amd.c
index 7c4f83a8c973..34368436bbd6 100644
--- a/tools/power/cpupower/utils/helpers/amd.c
+++ b/tools/power/cpupower/utils/helpers/amd.c
@@ -13,7 +13,8 @@
 #define MSR_AMD_PSTATE 0xc0010064
 #define MSR_AMD_PSTATE_LIMIT   0xc0010061
 
-union msr_pstate {
+union core_pstate {
+   /* pre fam 17h: */
struct {
unsigned fid:6;
unsigned did:3;
@@ -26,7 +27,8 @@ union msr_pstate {
unsigned idddiv:2;
unsigned res3:21;
unsigned en:1;
-   } bits;
+   } pstate;
+   /* since fam 17h: */
struct {
unsigned fid:8;
unsigned did:6;
@@ -35,36 +37,36 @@ union msr_pstate {
unsigned idddiv:2;
unsigned res1:31;
unsigned en:1;
-   } fam17h_bits;
+   } pstatedef;
unsigned long long val;
 };
 
-static int get_did(int family, union msr_pstate pstate)
+static int get_did(int family, union core_pstate pstate)
 {
int t;
 
if (family == 0x12)
t = pstate.val & 0xf;
else if (family == 0x17 || family == 0x18)
-   t = pstate.fam17h_bits.did;
+   t = pstate.pstatedef.did;
else
-   t = pstate.bits.did;
+   t = pstate.pstate.did;
 
return t;
 }
 
-static int get_cof(int family, union msr_pstate pstate)
+static int get_cof(int family, union core_pstate pstate)
 {
int t;
int fid, did, cof;
 
did = get_did(family, pstate);
if (family == 0x17 || family == 0x18) {
-   fid = pstate.fam17h_bits.fid;
+   fid = pstate.pstatedef.fid;
cof = 200 * fid / did;
} else {
t = 0x10;
-   fid = pstate.bits.fid;
+   fid = pstate.pstate.fid;
if (family == 0x11)
t = 0x8;
cof = (100 * (fid + t)) >> did;
@@ -89,7 +91,7 @@ int decode_pstates(unsigned int cpu, unsigned int cpu_family,
   int boost_states, unsigned long *pstates, int *no)
 {
int i, psmax, pscur;
-   union msr_pstate pstate;
+   union core_pstate pstate;
unsigned long long val;
 
/* Only read out frequencies from HW when CPU might be boostable
@@ -119,9 +121,9 @@ int decode_pstates(unsigned int cpu, unsigned int 
cpu_family,
}
if (read_msr(cpu, MSR_AMD_PSTATE + i, ))
return -1;
-   if ((cpu_family == 0x17) && (!pstate.fam17h_bits.en))
+   if ((cpu_family == 0x17) && (!pstate.pstatedef.en))
continue;
-   else if (!pstate.bits.en)
+   else if (!pstate.pstate.en)
continue;
 
pstates[i] = get_cof(cpu_family, pstate);



[tip: sched/core] x86, sched: Calculate frequency invariance for AMD systems

2020-12-11 Thread tip-bot2 for Nathan Fontenot
The following commit has been merged into the sched/core branch of tip:

Commit-ID: 41ea667227bad5c247d76e6605054e96e4d95f51
Gitweb:
https://git.kernel.org/tip/41ea667227bad5c247d76e6605054e96e4d95f51
Author:Nathan Fontenot 
AuthorDate:Thu, 12 Nov 2020 19:26:12 +01:00
Committer: Ingo Molnar 
CommitterDate: Fri, 11 Dec 2020 10:26:00 +01:00

x86, sched: Calculate frequency invariance for AMD systems

This is the first pass in creating the ability to calculate the
frequency invariance on AMD systems. This approach uses the CPPC
highest performance and nominal performance values that range from
0 - 255 instead of a high and base frequency. This is because we do
not have the ability on AMD to get a highest frequency value.

On AMD systems the highest performance and nominal performance
values do correspond to the highest and base frequencies for the system
so using them should produce an appropriate ratio but some tweaking
is likely necessary.

Due to CPPC being initialized later in boot than when the frequency
invariant calculation is currently made, I had to create a callback
from the CPPC init code to do the calculation after we have CPPC
data.

Special thanks to "kernel test robot" for reporting that
compilation of drivers/acpi/cppc_acpi.c is conditional on
CONFIG_ACPI_CPPC_LIB, not just CONFIG_ACPI.

[ ggherdov...@suse.cz: made safe under CPU hotplug, edited changelog. ]

Signed-off-by: Nathan Fontenot 
Signed-off-by: Giovanni Gherdovich 
Signed-off-by: Peter Zijlstra (Intel) 
Signed-off-by: Ingo Molnar 
Link: https://lkml.kernel.org/r/20201112182614.10700-2-ggherdov...@suse.cz
---
 arch/x86/include/asm/topology.h |  5 ++-
 arch/x86/kernel/smpboot.c   | 76 +---
 drivers/acpi/cppc_acpi.c|  7 +++-
 3 files changed, 83 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index f423457..488a8e8 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -218,4 +218,9 @@ static inline void arch_set_max_freq_ratio(bool 
turbo_disabled)
 }
 #endif
 
+#ifdef CONFIG_ACPI_CPPC_LIB
+void init_freq_invariance_cppc(void);
+#define init_freq_invariance_cppc init_freq_invariance_cppc
+#endif
+
 #endif /* _ASM_X86_TOPOLOGY_H */
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index de776b2..a4ab5cf 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -82,6 +82,10 @@
 #include 
 #include 
 
+#ifdef CONFIG_ACPI_CPPC_LIB
+#include 
+#endif
+
 /* representing HT siblings of each logical CPU */
 DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map);
 EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
@@ -148,7 +152,7 @@ static inline void smpboot_restore_warm_reset_vector(void)
*((volatile u32 *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0;
 }
 
-static void init_freq_invariance(bool secondary);
+static void init_freq_invariance(bool secondary, bool cppc_ready);
 
 /*
  * Report back to the Boot Processor during boot time or to the caller 
processor
@@ -186,7 +190,7 @@ static void smp_callin(void)
 */
set_cpu_sibling_map(raw_smp_processor_id());
 
-   init_freq_invariance(true);
+   init_freq_invariance(true, false);
 
/*
 * Get our bogomips.
@@ -1340,7 +1344,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
set_sched_topology(x86_topology);
 
set_cpu_sibling_map(0);
-   init_freq_invariance(false);
+   init_freq_invariance(false, false);
smp_sanity_check();
 
switch (apic_intr_mode) {
@@ -2027,6 +2031,46 @@ out:
return true;
 }
 
+#ifdef CONFIG_ACPI_CPPC_LIB
+static bool amd_set_max_freq_ratio(void)
+{
+   struct cppc_perf_caps perf_caps;
+   u64 highest_perf, nominal_perf;
+   u64 perf_ratio;
+   int rc;
+
+   rc = cppc_get_perf_caps(0, _caps);
+   if (rc) {
+   pr_debug("Could not retrieve perf counters (%d)\n", rc);
+   return false;
+   }
+
+   highest_perf = perf_caps.highest_perf;
+   nominal_perf = perf_caps.nominal_perf;
+
+   if (!highest_perf || !nominal_perf) {
+   pr_debug("Could not retrieve highest or nominal performance\n");
+   return false;
+   }
+
+   perf_ratio = div_u64(highest_perf * SCHED_CAPACITY_SCALE, nominal_perf);
+   if (!perf_ratio) {
+   pr_debug("Non-zero highest/nominal perf values led to a 0 
ratio\n");
+   return false;
+   }
+
+   arch_turbo_freq_ratio = perf_ratio;
+   arch_set_max_freq_ratio(false);
+
+   return true;
+}
+#else
+static bool amd_set_max_freq_ratio(void)
+{
+   return false;
+}
+#endif
+
 static void init_counter_refs(void)
 {
u64 aperf, mperf;
@@ -2038,7 +2082,7 @@ static void init_counter_refs(void)
this_cpu_write(arch_prev_mperf, mperf);
 }
 
-static void init_freq_invariance(bool secondary

[tip: sched/core] x86, sched: Calculate frequency invariance for AMD systems

2020-12-03 Thread tip-bot2 for Nathan Fontenot
The following commit has been merged into the sched/core branch of tip:

Commit-ID: 0edb0fb35fa687e633322d23e5f44b7cfd21a5c5
Gitweb:
https://git.kernel.org/tip/0edb0fb35fa687e633322d23e5f44b7cfd21a5c5
Author:Nathan Fontenot 
AuthorDate:Thu, 12 Nov 2020 19:26:12 +01:00
Committer: Peter Zijlstra 
CommitterDate: Thu, 03 Dec 2020 10:00:34 +01:00

x86, sched: Calculate frequency invariance for AMD systems

This is the first pass in creating the ability to calculate the
frequency invariance on AMD systems. This approach uses the CPPC
highest performance and nominal performance values that range from
0 - 255 instead of a high and base frequency. This is because we do
not have the ability on AMD to get a highest frequency value.

On AMD systems the highest performance and nominal performance
values do correspond to the highest and base frequencies for the system
so using them should produce an appropriate ratio but some tweaking
is likely necessary.

Due to CPPC being initialized later in boot than when the frequency
invariant calculation is currently made, I had to create a callback
from the CPPC init code to do the calculation after we have CPPC
data.

Special thanks to "kernel test robot" for reporting that
compilation of drivers/acpi/cppc_acpi.c is conditional on
CONFIG_ACPI_CPPC_LIB, not just CONFIG_ACPI.

[ ggherdov...@suse.cz: made safe under CPU hotplug, edited changelog ]
Signed-off-by: Giovanni Gherdovich 
Signed-off-by: Nathan Fontenot 
Signed-off-by: Peter Zijlstra (Intel) 
Link: https://lkml.kernel.org/r/20201112182614.10700-2-ggherdov...@suse.cz
---
 arch/x86/include/asm/topology.h |  5 ++-
 arch/x86/kernel/smpboot.c   | 76 +---
 drivers/acpi/cppc_acpi.c|  7 +++-
 3 files changed, 83 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index f423457..488a8e8 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -218,4 +218,9 @@ static inline void arch_set_max_freq_ratio(bool 
turbo_disabled)
 }
 #endif
 
+#ifdef CONFIG_ACPI_CPPC_LIB
+void init_freq_invariance_cppc(void);
+#define init_freq_invariance_cppc init_freq_invariance_cppc
+#endif
+
 #endif /* _ASM_X86_TOPOLOGY_H */
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index de776b2..a4ab5cf 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -82,6 +82,10 @@
 #include 
 #include 
 
+#ifdef CONFIG_ACPI_CPPC_LIB
+#include 
+#endif
+
 /* representing HT siblings of each logical CPU */
 DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map);
 EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
@@ -148,7 +152,7 @@ static inline void smpboot_restore_warm_reset_vector(void)
*((volatile u32 *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0;
 }
 
-static void init_freq_invariance(bool secondary);
+static void init_freq_invariance(bool secondary, bool cppc_ready);
 
 /*
  * Report back to the Boot Processor during boot time or to the caller 
processor
@@ -186,7 +190,7 @@ static void smp_callin(void)
 */
set_cpu_sibling_map(raw_smp_processor_id());
 
-   init_freq_invariance(true);
+   init_freq_invariance(true, false);
 
/*
 * Get our bogomips.
@@ -1340,7 +1344,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
set_sched_topology(x86_topology);
 
set_cpu_sibling_map(0);
-   init_freq_invariance(false);
+   init_freq_invariance(false, false);
smp_sanity_check();
 
switch (apic_intr_mode) {
@@ -2027,6 +2031,46 @@ out:
return true;
 }
 
+#ifdef CONFIG_ACPI_CPPC_LIB
+static bool amd_set_max_freq_ratio(void)
+{
+   struct cppc_perf_caps perf_caps;
+   u64 highest_perf, nominal_perf;
+   u64 perf_ratio;
+   int rc;
+
+   rc = cppc_get_perf_caps(0, _caps);
+   if (rc) {
+   pr_debug("Could not retrieve perf counters (%d)\n", rc);
+   return false;
+   }
+
+   highest_perf = perf_caps.highest_perf;
+   nominal_perf = perf_caps.nominal_perf;
+
+   if (!highest_perf || !nominal_perf) {
+   pr_debug("Could not retrieve highest or nominal performance\n");
+   return false;
+   }
+
+   perf_ratio = div_u64(highest_perf * SCHED_CAPACITY_SCALE, nominal_perf);
+   if (!perf_ratio) {
+   pr_debug("Non-zero highest/nominal perf values led to a 0 
ratio\n");
+   return false;
+   }
+
+   arch_turbo_freq_ratio = perf_ratio;
+   arch_set_max_freq_ratio(false);
+
+   return true;
+}
+#else
+static bool amd_set_max_freq_ratio(void)
+{
+   return false;
+}
+#endif
+
 static void init_counter_refs(void)
 {
u64 aperf, mperf;
@@ -2038,7 +2082,7 @@ static void init_counter_refs(void)
this_cpu_write(arch_prev_mperf, mperf);
 }
 
-static void init_freq_invariance(bool secondary)
+static void init_freq_in

Re: [PATCH 1/2] powerpc/nodes: Ensure enough nodes avail for operations

2017-10-17 Thread Nathan Fontenot


On 10/17/2017 12:22 PM, Michael Bringmann wrote:
> 
> 
> On 10/17/2017 12:02 PM, Nathan Fontenot wrote:
>> On 10/17/2017 11:14 AM, Michael Bringmann wrote:
>>> See below.
>>>
>>> On 10/16/2017 07:33 AM, Michael Ellerman wrote:
>>>> Michael Bringmann <m...@linux.vnet.ibm.com> writes:
>>>>
>>>>> powerpc/nodes: On systems like PowerPC which allow 'hot-add' of CPU
>>>>
>>>> This is a powerpc-only patch, so saying "systems like PowerPC" is
>>>> confusing. What you should be saying is "On pseries systems".
>>>>
>>>>> or memory resources, it may occur that the new resources are to be
>>>>> inserted into nodes that were not used for these resources at bootup.
>>>>> In the kernel, any node that is used must be defined and initialized
>>>>> at boot.
>>>>>
>>>>> This patch extracts the value of the lowest domain level (number of
>>>>> allocable resources) from the "rtas" device tree property
>>>>> "ibm,current-associativity-domains" or the device tree property
>>>>
>>>> What is current associativity domains? I've not heard of it, where is it
>>>> documented, and what does it mean.
>>>>
>>>> Why would you use the "current" set vs the "max"? I thought the whole point
>>>> was to discover the maximum possible set of nodes that could be
>>>> hotplugged.
>>>>
>>>>> "ibm,max-associativity-domains" to use as the maximum number of nodes
>>>>> to setup as possibly available in the system.  This new setting will
>>>>> override the instruction,
>>>>>
>>>>> nodes_and(node_possible_map, node_possible_map, node_online_map);
>>>>>
>>>>> presently seen in the function arch/powerpc/mm/numa.c:initmem_init().
>>>>>
>>>>> If the property is not present at boot, no operation will be performed
>>>>> to define or enable additional nodes.
>>>>>
>>>>> Signed-off-by: Michael Bringmann <m...@linux.vnet.ibm.com>
>>>>> ---
>>>>>  arch/powerpc/mm/numa.c |   47 
>>>>> +++
>>>>>  1 file changed, 47 insertions(+)
>>>>>
>>>>> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
>>>>> index ec098b3..b385cd0 100644
>>>>> --- a/arch/powerpc/mm/numa.c
>>>>> +++ b/arch/powerpc/mm/numa.c
>>>>> @@ -892,6 +892,51 @@ static void __init setup_node_data(int nid, u64 
>>>>> start_pfn, u64 end_pfn)
>>>>>   NODE_DATA(nid)->node_spanned_pages = spanned_pages;
>>>>>  }
>>>>>  
>>>>> +static void __init node_associativity_setup(void)
>>>>
>>>> This should really be called "find_possible_nodes()" or something more
>>>> descriptive.
>>>
>>> Okay.
>>>>
>>>>> +{
>>>>> + struct device_node *rtas;
>>>>> +
>>>>> + rtas = of_find_node_by_path("/rtas");
>>>>> + if (rtas) {
>>>>
>>>> If you just short-circuit that return the whole function body can be
>>>> de-indented, making it significantly more readable.
>>>>
>>>> ie:
>>>> +  rtas = of_find_node_by_path("/rtas");
>>>> +  if (!rtas)
>>>> +  return;
>>>
>>> Okay.
>>>
>>>>
>>>>> + const __be32 *prop;
>>>>> + u32 len, entries, numnodes, i;
>>>>> +
>>>>> + prop = of_get_property(rtas,
>>>>> + "ibm,current-associativity-domains", 
>>>>> );
>>>>
>>>> Please don't use of_get_property() in new code, we have much better
>>>> accessors these days, which do better error checking and handle the
>>>> endian conversions for you.
>>>>
>>>> In this case you'd use eg:
>>>>
>>>>u32 entries;
>>>>rc = of_property_read_u32(rtas, "ibm,current-associativity-domains", 
>>>> );
>>>
>>> The property 'ibm,current-associativity-domains' has the same format as the 
>>> property
>>> 'ibm,max-associativity-domains' i.e. it is an integer array.  Th

Re: [PATCH 1/2] powerpc/nodes: Ensure enough nodes avail for operations

2017-10-17 Thread Nathan Fontenot


On 10/17/2017 12:22 PM, Michael Bringmann wrote:
> 
> 
> On 10/17/2017 12:02 PM, Nathan Fontenot wrote:
>> On 10/17/2017 11:14 AM, Michael Bringmann wrote:
>>> See below.
>>>
>>> On 10/16/2017 07:33 AM, Michael Ellerman wrote:
>>>> Michael Bringmann  writes:
>>>>
>>>>> powerpc/nodes: On systems like PowerPC which allow 'hot-add' of CPU
>>>>
>>>> This is a powerpc-only patch, so saying "systems like PowerPC" is
>>>> confusing. What you should be saying is "On pseries systems".
>>>>
>>>>> or memory resources, it may occur that the new resources are to be
>>>>> inserted into nodes that were not used for these resources at bootup.
>>>>> In the kernel, any node that is used must be defined and initialized
>>>>> at boot.
>>>>>
>>>>> This patch extracts the value of the lowest domain level (number of
>>>>> allocable resources) from the "rtas" device tree property
>>>>> "ibm,current-associativity-domains" or the device tree property
>>>>
>>>> What is current associativity domains? I've not heard of it, where is it
>>>> documented, and what does it mean.
>>>>
>>>> Why would you use the "current" set vs the "max"? I thought the whole point
>>>> was to discover the maximum possible set of nodes that could be
>>>> hotplugged.
>>>>
>>>>> "ibm,max-associativity-domains" to use as the maximum number of nodes
>>>>> to setup as possibly available in the system.  This new setting will
>>>>> override the instruction,
>>>>>
>>>>> nodes_and(node_possible_map, node_possible_map, node_online_map);
>>>>>
>>>>> presently seen in the function arch/powerpc/mm/numa.c:initmem_init().
>>>>>
>>>>> If the property is not present at boot, no operation will be performed
>>>>> to define or enable additional nodes.
>>>>>
>>>>> Signed-off-by: Michael Bringmann 
>>>>> ---
>>>>>  arch/powerpc/mm/numa.c |   47 
>>>>> +++
>>>>>  1 file changed, 47 insertions(+)
>>>>>
>>>>> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
>>>>> index ec098b3..b385cd0 100644
>>>>> --- a/arch/powerpc/mm/numa.c
>>>>> +++ b/arch/powerpc/mm/numa.c
>>>>> @@ -892,6 +892,51 @@ static void __init setup_node_data(int nid, u64 
>>>>> start_pfn, u64 end_pfn)
>>>>>   NODE_DATA(nid)->node_spanned_pages = spanned_pages;
>>>>>  }
>>>>>  
>>>>> +static void __init node_associativity_setup(void)
>>>>
>>>> This should really be called "find_possible_nodes()" or something more
>>>> descriptive.
>>>
>>> Okay.
>>>>
>>>>> +{
>>>>> + struct device_node *rtas;
>>>>> +
>>>>> + rtas = of_find_node_by_path("/rtas");
>>>>> + if (rtas) {
>>>>
>>>> If you just short-circuit that return the whole function body can be
>>>> de-indented, making it significantly more readable.
>>>>
>>>> ie:
>>>> +  rtas = of_find_node_by_path("/rtas");
>>>> +  if (!rtas)
>>>> +  return;
>>>
>>> Okay.
>>>
>>>>
>>>>> + const __be32 *prop;
>>>>> + u32 len, entries, numnodes, i;
>>>>> +
>>>>> + prop = of_get_property(rtas,
>>>>> + "ibm,current-associativity-domains", 
>>>>> );
>>>>
>>>> Please don't use of_get_property() in new code, we have much better
>>>> accessors these days, which do better error checking and handle the
>>>> endian conversions for you.
>>>>
>>>> In this case you'd use eg:
>>>>
>>>>u32 entries;
>>>>rc = of_property_read_u32(rtas, "ibm,current-associativity-domains", 
>>>> );
>>>
>>> The property 'ibm,current-associativity-domains' has the same format as the 
>>> property
>>> 'ibm,max-associativity-domains' i.e. it is an integer array.  The accessor 
>>> of_property_read_u32,
>>

Re: [PATCH 1/2] powerpc/nodes: Ensure enough nodes avail for operations

2017-10-17 Thread Nathan Fontenot
On 10/17/2017 11:14 AM, Michael Bringmann wrote:
> See below.
> 
> On 10/16/2017 07:33 AM, Michael Ellerman wrote:
>> Michael Bringmann  writes:
>>
>>> powerpc/nodes: On systems like PowerPC which allow 'hot-add' of CPU
>>
>> This is a powerpc-only patch, so saying "systems like PowerPC" is
>> confusing. What you should be saying is "On pseries systems".
>>
>>> or memory resources, it may occur that the new resources are to be
>>> inserted into nodes that were not used for these resources at bootup.
>>> In the kernel, any node that is used must be defined and initialized
>>> at boot.
>>>
>>> This patch extracts the value of the lowest domain level (number of
>>> allocable resources) from the "rtas" device tree property
>>> "ibm,current-associativity-domains" or the device tree property
>>
>> What is current associativity domains? I've not heard of it, where is it
>> documented, and what does it mean.
>>
>> Why would you use the "current" set vs the "max"? I thought the whole point
>> was to discover the maximum possible set of nodes that could be
>> hotplugged.
>>
>>> "ibm,max-associativity-domains" to use as the maximum number of nodes
>>> to setup as possibly available in the system.  This new setting will
>>> override the instruction,
>>>
>>> nodes_and(node_possible_map, node_possible_map, node_online_map);
>>>
>>> presently seen in the function arch/powerpc/mm/numa.c:initmem_init().
>>>
>>> If the property is not present at boot, no operation will be performed
>>> to define or enable additional nodes.
>>>
>>> Signed-off-by: Michael Bringmann 
>>> ---
>>>  arch/powerpc/mm/numa.c |   47 
>>> +++
>>>  1 file changed, 47 insertions(+)
>>>
>>> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
>>> index ec098b3..b385cd0 100644
>>> --- a/arch/powerpc/mm/numa.c
>>> +++ b/arch/powerpc/mm/numa.c
>>> @@ -892,6 +892,51 @@ static void __init setup_node_data(int nid, u64 
>>> start_pfn, u64 end_pfn)
>>> NODE_DATA(nid)->node_spanned_pages = spanned_pages;
>>>  }
>>>  
>>> +static void __init node_associativity_setup(void)
>>
>> This should really be called "find_possible_nodes()" or something more
>> descriptive.
> 
> Okay.
>>
>>> +{
>>> +   struct device_node *rtas;
>>> +
>>> +   rtas = of_find_node_by_path("/rtas");
>>> +   if (rtas) {
>>
>> If you just short-circuit that return the whole function body can be
>> de-indented, making it significantly more readable.
>>
>> ie:
>> +rtas = of_find_node_by_path("/rtas");
>> +if (!rtas)
>> +return;
> 
> Okay.
> 
>>
>>> +   const __be32 *prop;
>>> +   u32 len, entries, numnodes, i;
>>> +
>>> +   prop = of_get_property(rtas,
>>> +   "ibm,current-associativity-domains", 
>>> );
>>
>> Please don't use of_get_property() in new code, we have much better
>> accessors these days, which do better error checking and handle the
>> endian conversions for you.
>>
>> In this case you'd use eg:
>>
>>  u32 entries;
>>  rc = of_property_read_u32(rtas, "ibm,current-associativity-domains", 
>> );
> 
> The property 'ibm,current-associativity-domains' has the same format as the
> property 'ibm,max-associativity-domains', i.e. it is an integer array.  The
> accessor of_property_read_u32, however, expects it to be an integer singleton
> value.  Instead, it needs:

I think for this case where the property is an array of values you could use
of_property_count_elems_of_size() to get the number of elements in the array
and then use of_property_read_u32_array() to read the array.

-Nathan

> 
>>
>>> +   if (!prop || len < sizeof(unsigned int)) {
>>> +   prop = of_get_property(rtas,
>>> +   "ibm,max-associativity-domains", );
> if (!prop || len < sizeof(unsigned int))
>>> +   goto endit;
>>> +   }
>>> +
>>> +   entries = of_read_number(prop++, 1);
>>> +
>>> +   if (len < (entries * sizeof(unsigned int)))
>>> +   goto endit;
>>> +
>>> +   if ((0 <= min_common_depth) && (min_common_depth <= 
>>> (entries-1)))
>>> +   entries = min_common_depth;
>>> +   else
>>> +   entries -= 1;
>>  ^
>> You can't just guess that will be the right entry.
>>
>> If min_common_depth is < 0 the function should have just returned
>> immediately at the top.
> 
> Okay.
> 
>>
>> If min_common_depth is outside the range of the property that's a buggy
>> device tree, you should print a warning and return.
>>
>>> +   numnodes = of_read_number([entries], 1);
>>
>>  u32 num_nodes;
>>  rc = of_property_read_u32_index(rtas, 
>> "ibm,current-associativity-domains", min_common_depth, _nodes);
>>> +
>>> +   printk(KERN_INFO "numa: Nodes = %d (mcd = %d)\n", numnodes,
>>> +   

Re: [PATCH 1/2] powerpc/nodes: Ensure enough nodes avail for operations

2017-10-17 Thread Nathan Fontenot
On 10/17/2017 11:14 AM, Michael Bringmann wrote:
> See below.
> 
> On 10/16/2017 07:33 AM, Michael Ellerman wrote:
>> Michael Bringmann  writes:
>>
>>> powerpc/nodes: On systems like PowerPC which allow 'hot-add' of CPU
>>
>> This is a powerpc-only patch, so saying "systems like PowerPC" is
>> confusing. What you should be saying is "On pseries systems".
>>
>>> or memory resources, it may occur that the new resources are to be
>>> inserted into nodes that were not used for these resources at bootup.
>>> In the kernel, any node that is used must be defined and initialized
>>> at boot.
>>>
>>> This patch extracts the value of the lowest domain level (number of
>>> allocable resources) from the "rtas" device tree property
>>> "ibm,current-associativity-domains" or the device tree property
>>
>> What is current associativity domains? I've not heard of it, where is it
>> documented, and what does it mean.
>>
>> Why would you use the "current" set vs the "max"? I thought the whole point
>> was to discover the maximum possible set of nodes that could be
>> hotplugged.
>>
>>> "ibm,max-associativity-domains" to use as the maximum number of nodes
>>> to setup as possibly available in the system.  This new setting will
>>> override the instruction,
>>>
>>> nodes_and(node_possible_map, node_possible_map, node_online_map);
>>>
>>> presently seen in the function arch/powerpc/mm/numa.c:initmem_init().
>>>
>>> If the property is not present at boot, no operation will be performed
>>> to define or enable additional nodes.
>>>
>>> Signed-off-by: Michael Bringmann 
>>> ---
>>>  arch/powerpc/mm/numa.c |   47 
>>> +++
>>>  1 file changed, 47 insertions(+)
>>>
>>> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
>>> index ec098b3..b385cd0 100644
>>> --- a/arch/powerpc/mm/numa.c
>>> +++ b/arch/powerpc/mm/numa.c
>>> @@ -892,6 +892,51 @@ static void __init setup_node_data(int nid, u64 
>>> start_pfn, u64 end_pfn)
>>> NODE_DATA(nid)->node_spanned_pages = spanned_pages;
>>>  }
>>>  
>>> +static void __init node_associativity_setup(void)
>>
>> This should really be called "find_possible_nodes()" or something more
>> descriptive.
> 
> Okay.
>>
>>> +{
>>> +   struct device_node *rtas;
>>> +
>>> +   rtas = of_find_node_by_path("/rtas");
>>> +   if (rtas) {
>>
>> If you just short-circuit that return the whole function body can be
>> de-indented, making it significantly more readable.
>>
>> ie:
>> +rtas = of_find_node_by_path("/rtas");
>> +if (!rtas)
>> +return;
> 
> Okay.
> 
>>
>>> +   const __be32 *prop;
>>> +   u32 len, entries, numnodes, i;
>>> +
>>> +   prop = of_get_property(rtas,
>>> +   "ibm,current-associativity-domains", 
>>> );
>>
>> Please don't use of_get_property() in new code, we have much better
>> accessors these days, which do better error checking and handle the
>> endian conversions for you.
>>
>> In this case you'd use eg:
>>
>>  u32 entries;
>>  rc = of_property_read_u32(rtas, "ibm,current-associativity-domains", 
>> );
> 
> The property 'ibm,current-associativity-domains' has the same format as the
> property 'ibm,max-associativity-domains', i.e. it is an integer array.  The
> accessor of_property_read_u32, however, expects it to be an integer singleton
> value.  Instead, it needs:

I think for this case where the property is an array of values you could use
of_property_count_elems_of_size() to get the number of elements in the array
and then use of_property_read_u32_array() to read the array.

-Nathan

> 
>>
>>> +   if (!prop || len < sizeof(unsigned int)) {
>>> +   prop = of_get_property(rtas,
>>> +   "ibm,max-associativity-domains", );
> if (!prop || len < sizeof(unsigned int))
>>> +   goto endit;
>>> +   }
>>> +
>>> +   entries = of_read_number(prop++, 1);
>>> +
>>> +   if (len < (entries * sizeof(unsigned int)))
>>> +   goto endit;
>>> +
>>> +   if ((0 <= min_common_depth) && (min_common_depth <= 
>>> (entries-1)))
>>> +   entries = min_common_depth;
>>> +   else
>>> +   entries -= 1;
>>  ^
>> You can't just guess that will be the right entry.
>>
>> If min_common_depth is < 0 the function should have just returned
>> immediately at the top.
> 
> Okay.
> 
>>
>> If min_common_depth is outside the range of the property that's a buggy
>> device tree, you should print a warning and return.
>>
>>> +   numnodes = of_read_number([entries], 1);
>>
>>  u32 num_nodes;
>>  rc = of_property_read_u32_index(rtas, 
>> "ibm,current-associativity-domains", min_common_depth, _nodes);
>>> +
>>> +   printk(KERN_INFO "numa: Nodes = %d (mcd = %d)\n", numnodes,
>>> +   min_common_depth);
>>> +
>>> +   for (i = 

Re: [PATCH V13 4/4] powerpc/vphn: Fix numa update end-loop bug

2017-09-07 Thread Nathan Fontenot
On 09/06/2017 05:03 PM, Michael Bringmann wrote:
> 
> 
> On 09/06/2017 09:45 AM, Nathan Fontenot wrote:
>> On 09/01/2017 10:48 AM, Michael Bringmann wrote:
>>> powerpc/vphn: On Power systems with shared configurations of CPUs
>>> and memory, there are some issues with the association of additional
>>> CPUs and memory to nodes when hot-adding resources.  This patch
>>> fixes an end-of-updates processing problem observed occasionally
>>> in numa_update_cpu_topology().
>>>
>>> Signed-off-by: Michael Bringmann <m...@linux.vnet.ibm.com>
>>> ---
>>>  arch/powerpc/mm/numa.c |7 +++
>>>  1 file changed, 7 insertions(+)
>>>
>>> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
>>> index 3a5b334..fccf23f 100644
>>> --- a/arch/powerpc/mm/numa.c
>>> +++ b/arch/powerpc/mm/numa.c
>>> @@ -1410,6 +1410,13 @@ int numa_update_cpu_topology(bool cpus_locked)
>>> cpu = cpu_last_thread_sibling(cpu);
>>> }
>>>
>>> +   /*
>>> +* Prevent processing of 'updates' from overflowing array
>>> +* in cases where last entry filled in a 'next' pointer.
>>> +*/
>>> +   if (i)
>>> +   updates[i-1].next = NULL;
>>> +
>>
>> This really looks like the bug is in the code above this where we
>> fill in the updates array for each of the sibling cpus. The code
>> there assumes that if the current update entry is not the end that
>> there will be more updates and blindly sets the next pointer.
>>
>> Perhaps the fix is to correct the logic in that code that sets the next
>> pointers. Set the ud pointer to NULL before the outer for_each_cpu() loop.
>> Then in the inner for_each_cpu(sibling,...) loop update the ud->next
>> pointer as the first operation.
>>
>>  for_each_cpu(sibling, cpu_sibling_mask(cpu)) {
>>  if (ud)
>>  ud->next = &updates[i];
>>  ...
>>  }
>>
>> Obviously untested, but I think this would prevent setting the next
>> pointer in the last update entry that is filled out erroneously.
> 
> The above fragment looks to skip initialization of the 'next' pointer
> in the first element of the 'updates' array.  That would abort subsequent
> evaluation of the array too soon, I believe.  I would like to take another
> look to see whether the current check 'if (i < weight) ud->next = &updates[i];'
> is having problems due to i being 0-relative and weight being 1-relative.

Another thing to keep in mind is that cpus can be skipped by checks earlier
in the loop. There is no guarantee that we will add 'weight' elements to
the ud list.

-Nathan
 
> 
>>   
>> -Nathan
> 
> Michael
> 
>>
>>> pr_debug("Topology update for the following CPUs:\n");
>>> if (cpumask_weight(_cpus)) {
>>> for (ud = &updates[0]; ud; ud = ud->next) {
>>>
>>
> 



Re: [PATCH V13 4/4] powerpc/vphn: Fix numa update end-loop bug

2017-09-07 Thread Nathan Fontenot
On 09/06/2017 05:03 PM, Michael Bringmann wrote:
> 
> 
> On 09/06/2017 09:45 AM, Nathan Fontenot wrote:
>> On 09/01/2017 10:48 AM, Michael Bringmann wrote:
>>> powerpc/vphn: On Power systems with shared configurations of CPUs
>>> and memory, there are some issues with the association of additional
>>> CPUs and memory to nodes when hot-adding resources.  This patch
>>> fixes an end-of-updates processing problem observed occasionally
>>> in numa_update_cpu_topology().
>>>
>>> Signed-off-by: Michael Bringmann 
>>> ---
>>>  arch/powerpc/mm/numa.c |7 +++
>>>  1 file changed, 7 insertions(+)
>>>
>>> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
>>> index 3a5b334..fccf23f 100644
>>> --- a/arch/powerpc/mm/numa.c
>>> +++ b/arch/powerpc/mm/numa.c
>>> @@ -1410,6 +1410,13 @@ int numa_update_cpu_topology(bool cpus_locked)
>>> cpu = cpu_last_thread_sibling(cpu);
>>> }
>>>
>>> +   /*
>>> +* Prevent processing of 'updates' from overflowing array
>>> +* in cases where last entry filled in a 'next' pointer.
>>> +*/
>>> +   if (i)
>>> +   updates[i-1].next = NULL;
>>> +
>>
>> This really looks like the bug is in the code above this where we
>> fill in the updates array for each of the sibling cpus. The code
>> there assumes that if the current update entry is not the end that
>> there will be more updates and blindly sets the next pointer.
>>
>> Perhaps the fix is to correct the logic in that code that sets the next
>> pointers. Set the ud pointer to NULL before the outer for_each_cpu() loop.
>> Then in the inner for_each_cpu(sibling,...) loop update the ud->next
>> pointer as the first operation.
>>
>>  for_each_cpu(sibling, cpu_sibling_mask(cpu)) {
>>  if (ud)
>>  ud->next = &updates[i];
>>  ...
>>  }
>>
>> Obviously untested, but I think this would prevent setting the next
>> pointer in the last update entry that is filled out erroneously.
> 
> The above fragment looks to skip initialization of the 'next' pointer
> in the first element of the 'updates' array.  That would abort subsequent
> evaluation of the array too soon, I believe.  I would like to take another
> look to see whether the current check 'if (i < weight) ud->next = &updates[i];'
> is having problems due to i being 0-relative and weight being 1-relative.

Another thing to keep in mind is that cpus can be skipped by checks earlier
in the loop. There is no guarantee that we will add 'weight' elements to
the ud list.

-Nathan
 
> 
>>   
>> -Nathan
> 
> Michael
> 
>>
>>> pr_debug("Topology update for the following CPUs:\n");
>>> if (cpumask_weight(_cpus)) {
>>> for (ud = &updates[0]; ud; ud = ud->next) {
>>>
>>
> 



Re: [PATCH V13 4/4] powerpc/vphn: Fix numa update end-loop bug

2017-09-06 Thread Nathan Fontenot
On 09/01/2017 10:48 AM, Michael Bringmann wrote:
> powerpc/vphn: On Power systems with shared configurations of CPUs
> and memory, there are some issues with the association of additional
> CPUs and memory to nodes when hot-adding resources.  This patch
> fixes an end-of-updates processing problem observed occasionally
> in numa_update_cpu_topology().
> 
> Signed-off-by: Michael Bringmann 
> ---
>  arch/powerpc/mm/numa.c |7 +++
>  1 file changed, 7 insertions(+)
> 
> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> index 3a5b334..fccf23f 100644
> --- a/arch/powerpc/mm/numa.c
> +++ b/arch/powerpc/mm/numa.c
> @@ -1410,6 +1410,13 @@ int numa_update_cpu_topology(bool cpus_locked)
>   cpu = cpu_last_thread_sibling(cpu);
>   }
> 
> + /*
> +  * Prevent processing of 'updates' from overflowing array
> +  * in cases where last entry filled in a 'next' pointer.
> +  */
> + if (i)
> + updates[i-1].next = NULL;
> +

This really looks like the bug is in the code above this where we
fill in the updates array for each of the sibling cpus. The code
there assumes that if the current update entry is not the end that
there will be more updates and blindly sets the next pointer.

Perhaps the fix is to correct the logic in that code that sets the next
pointers. Set the ud pointer to NULL before the outer for_each_cpu() loop.
Then in the inner for_each_cpu(sibling,...) loop update the ud->next
pointer as the first operation.

for_each_cpu(sibling, cpu_sibling_mask(cpu)) {
if (ud)
ud->next = &updates[i];
...
}

Obviously untested, but I think this would prevent setting the next
pointer in the last update entry that is filled out erroneously.
  
-Nathan

>   pr_debug("Topology update for the following CPUs:\n");
>   if (cpumask_weight(_cpus)) {
>   for (ud = &updates[0]; ud; ud = ud->next) {
> 



Re: [PATCH V13 4/4] powerpc/vphn: Fix numa update end-loop bug

2017-09-06 Thread Nathan Fontenot
On 09/01/2017 10:48 AM, Michael Bringmann wrote:
> powerpc/vphn: On Power systems with shared configurations of CPUs
> and memory, there are some issues with the association of additional
> CPUs and memory to nodes when hot-adding resources.  This patch
> fixes an end-of-updates processing problem observed occasionally
> in numa_update_cpu_topology().
> 
> Signed-off-by: Michael Bringmann 
> ---
>  arch/powerpc/mm/numa.c |7 +++
>  1 file changed, 7 insertions(+)
> 
> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> index 3a5b334..fccf23f 100644
> --- a/arch/powerpc/mm/numa.c
> +++ b/arch/powerpc/mm/numa.c
> @@ -1410,6 +1410,13 @@ int numa_update_cpu_topology(bool cpus_locked)
>   cpu = cpu_last_thread_sibling(cpu);
>   }
> 
> + /*
> +  * Prevent processing of 'updates' from overflowing array
> +  * in cases where last entry filled in a 'next' pointer.
> +  */
> + if (i)
> + updates[i-1].next = NULL;
> +

This really looks like the bug is in the code above this where we
fill in the updates array for each of the sibling cpus. The code
there assumes that if the current update entry is not the end that
there will be more updates and blindly sets the next pointer.

Perhaps the fix is to correct the logic in that code that sets the next
pointers. Set the ud pointer to NULL before the outer for_each_cpu() loop.
Then in the inner for_each_cpu(sibling,...) loop update the ud->next
pointer as the first operation.

for_each_cpu(sibling, cpu_sibling_mask(cpu)) {
if (ud)
ud->next = &updates[i];
...
}

Obviously untested, but I think this would prevent setting the next
pointer in the last update entry that is filled out erroneously.
  
-Nathan

>   pr_debug("Topology update for the following CPUs:\n");
>   if (cpumask_weight(_cpus)) {
>   for (ud = &updates[0]; ud; ud = ud->next) {
> 



Re: [PATCH V13 3/4] powerpc/hotplug: Improve responsiveness of hotplug change

2017-09-06 Thread Nathan Fontenot
On 09/01/2017 10:48 AM, Michael Bringmann wrote:
> powerpc/hotplug: On Power systems with shared configurations of CPUs
> and memory, there are some issues with the association of additional
> CPUs and memory to nodes when hot-adding resources.  During hotplug
> CPU operations, this patch resets the timer on topology update work
> function to a small value to better ensure that the CPU topology is
> detected and configured sooner.

Looking through the changes you've made here I don't see where the
topology timeout ever gets set to the default timeout. When calculating
the next timeout you use topology_timer_secs which is initialized to 1, so
the timer pops every second after initialization. Then after a dlpar cpu
operation the timer is set to pop every second. There is no place that I
see where the timeout is set to the default 60 seconds.

> 
> Signed-off-by: Michael Bringmann 
> ---
>  arch/powerpc/include/asm/topology.h  |8 
>  arch/powerpc/mm/numa.c   |   21 -
>  arch/powerpc/platforms/pseries/hotplug-cpu.c |2 ++
>  3 files changed, 30 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/include/asm/topology.h 
> b/arch/powerpc/include/asm/topology.h
> index dc4e159..beb9bca 100644
> --- a/arch/powerpc/include/asm/topology.h
> +++ b/arch/powerpc/include/asm/topology.h
> @@ -98,6 +98,14 @@ static inline int prrn_is_enabled(void)
>  }
>  #endif /* CONFIG_NUMA && CONFIG_PPC_SPLPAR */
> 
> +#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_NEED_MULTIPLE_NODES)
> +#if defined(CONFIG_PPC_SPLPAR)
> +extern int timed_topology_update(int nsecs);
> +#else
> +#define  timed_topology_update(nsecs)
> +#endif /* CONFIG_PPC_SPLPAR */
> +#endif /* CONFIG_HOTPLUG_CPU || CONFIG_NEED_MULTIPLE_NODES */
> +
>  #include 
> 
>  #ifdef CONFIG_SMP
> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> index c08d736..3a5b334 100644
> --- a/arch/powerpc/mm/numa.c
> +++ b/arch/powerpc/mm/numa.c
> @@ -1148,15 +1148,34 @@ struct topology_update_data {
>   int new_nid;
>  };
> 
> +#define TOPOLOGY_DEF_TIMER_SECS  60
> +
>  static u8 vphn_cpu_change_counts[NR_CPUS][MAX_DISTANCE_REF_POINTS];
>  static cpumask_t cpu_associativity_changes_mask;
>  static int vphn_enabled;
>  static int prrn_enabled;
>  static void reset_topology_timer(void);
> +static int topology_timer_secs = 1;
>  static int topology_inited;
>  static int topology_update_needed;
> 
>  /*
> + * Change polling interval for associativity changes.
> + */
> +int timed_topology_update(int nsecs)
> +{
> + if (nsecs > 0)
> + topology_timer_secs = nsecs;
> + else
> + topology_timer_secs = TOPOLOGY_DEF_TIMER_SECS;
> +
> + if (vphn_enabled)
> + reset_topology_timer();

Should this whole thing be wrapped by if (vphn_enabled) ?

-Nathan

> +
> + return 0;
> +}
> +
> +/*
>   * Store the current values of the associativity change counters in the
>   * hypervisor.
>   */
> @@ -1489,7 +1508,7 @@ static void topology_timer_fn(unsigned long ignored)
>  static void reset_topology_timer(void)
>  {
>   topology_timer.data = 0;
> - topology_timer.expires = jiffies + 60 * HZ;
> + topology_timer.expires = jiffies + topology_timer_secs * HZ;
>   mod_timer(&topology_timer, topology_timer.expires);
>  }
> 
> diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c 
> b/arch/powerpc/platforms/pseries/hotplug-cpu.c
> index 6afd1ef..5a7fb1e 100644
> --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c
> +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c
> @@ -356,6 +356,7 @@ static int dlpar_online_cpu(struct device_node *dn)
>   BUG_ON(get_cpu_current_state(cpu)
>   != CPU_STATE_OFFLINE);
>   cpu_maps_update_done();
> + timed_topology_update(1);
>   rc = device_online(get_cpu_device(cpu));
>   if (rc)
>   goto out;
> @@ -522,6 +523,7 @@ static int dlpar_offline_cpu(struct device_node *dn)
>   set_preferred_offline_state(cpu,
>   CPU_STATE_OFFLINE);
>   cpu_maps_update_done();
> + timed_topology_update(1);
>   rc = device_offline(get_cpu_device(cpu));
>   if (rc)
>   goto out;
> 



Re: [PATCH V13 3/4] powerpc/hotplug: Improve responsiveness of hotplug change

2017-09-06 Thread Nathan Fontenot
On 09/01/2017 10:48 AM, Michael Bringmann wrote:
> powerpc/hotplug: On Power systems with shared configurations of CPUs
> and memory, there are some issues with the association of additional
> CPUs and memory to nodes when hot-adding resources.  During hotplug
> CPU operations, this patch resets the timer on topology update work
> function to a small value to better ensure that the CPU topology is
> detected and configured sooner.

Looking through the changes you've made here I don't see where the
topology timeout ever gets set to the default timeout. When calculating
the next timeout you use topology_timer_secs which is initialized to 1, so
the timer pops every second after initialization. Then after a dlpar cpu
operation the timer is set to pop every second. There is no place that I
see where the timeout is set to the default 60 seconds.

> 
> Signed-off-by: Michael Bringmann 
> ---
>  arch/powerpc/include/asm/topology.h  |8 
>  arch/powerpc/mm/numa.c   |   21 -
>  arch/powerpc/platforms/pseries/hotplug-cpu.c |2 ++
>  3 files changed, 30 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/include/asm/topology.h 
> b/arch/powerpc/include/asm/topology.h
> index dc4e159..beb9bca 100644
> --- a/arch/powerpc/include/asm/topology.h
> +++ b/arch/powerpc/include/asm/topology.h
> @@ -98,6 +98,14 @@ static inline int prrn_is_enabled(void)
>  }
>  #endif /* CONFIG_NUMA && CONFIG_PPC_SPLPAR */
> 
> +#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_NEED_MULTIPLE_NODES)
> +#if defined(CONFIG_PPC_SPLPAR)
> +extern int timed_topology_update(int nsecs);
> +#else
> +#define  timed_topology_update(nsecs)
> +#endif /* CONFIG_PPC_SPLPAR */
> +#endif /* CONFIG_HOTPLUG_CPU || CONFIG_NEED_MULTIPLE_NODES */
> +
>  #include 
> 
>  #ifdef CONFIG_SMP
> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> index c08d736..3a5b334 100644
> --- a/arch/powerpc/mm/numa.c
> +++ b/arch/powerpc/mm/numa.c
> @@ -1148,15 +1148,34 @@ struct topology_update_data {
>   int new_nid;
>  };
> 
> +#define TOPOLOGY_DEF_TIMER_SECS  60
> +
>  static u8 vphn_cpu_change_counts[NR_CPUS][MAX_DISTANCE_REF_POINTS];
>  static cpumask_t cpu_associativity_changes_mask;
>  static int vphn_enabled;
>  static int prrn_enabled;
>  static void reset_topology_timer(void);
> +static int topology_timer_secs = 1;
>  static int topology_inited;
>  static int topology_update_needed;
> 
>  /*
> + * Change polling interval for associativity changes.
> + */
> +int timed_topology_update(int nsecs)
> +{
> + if (nsecs > 0)
> + topology_timer_secs = nsecs;
> + else
> + topology_timer_secs = TOPOLOGY_DEF_TIMER_SECS;
> +
> + if (vphn_enabled)
> + reset_topology_timer();

Should this whole thing be wrapped by if (vphn_enabled) ?

-Nathan

> +
> + return 0;
> +}
> +
> +/*
>   * Store the current values of the associativity change counters in the
>   * hypervisor.
>   */
> @@ -1489,7 +1508,7 @@ static void topology_timer_fn(unsigned long ignored)
>  static void reset_topology_timer(void)
>  {
>   topology_timer.data = 0;
> - topology_timer.expires = jiffies + 60 * HZ;
> + topology_timer.expires = jiffies + topology_timer_secs * HZ;
>   mod_timer(&topology_timer, topology_timer.expires);
>  }
> 
> diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c 
> b/arch/powerpc/platforms/pseries/hotplug-cpu.c
> index 6afd1ef..5a7fb1e 100644
> --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c
> +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c
> @@ -356,6 +356,7 @@ static int dlpar_online_cpu(struct device_node *dn)
>   BUG_ON(get_cpu_current_state(cpu)
>   != CPU_STATE_OFFLINE);
>   cpu_maps_update_done();
> + timed_topology_update(1);
>   rc = device_online(get_cpu_device(cpu));
>   if (rc)
>   goto out;
> @@ -522,6 +523,7 @@ static int dlpar_offline_cpu(struct device_node *dn)
>   set_preferred_offline_state(cpu,
>   CPU_STATE_OFFLINE);
>   cpu_maps_update_done();
> + timed_topology_update(1);
>   rc = device_offline(get_cpu_device(cpu));
>   if (rc)
>   goto out;
> 



Re: [PATCH V13 2/4] powerpc/vphn: Improve recognition of PRRN/VPHN

2017-09-06 Thread Nathan Fontenot


On 09/01/2017 10:48 AM, Michael Bringmann wrote:
> powerpc/vphn: On Power systems with shared configurations of CPUs
> and memory, there are some issues with the association of additional
> CPUs and memory to nodes when hot-adding resources.  This patch
> updates the initialization checks to independently recognize PRRN
> or VPHN support.
> 
> Signed-off-by: Michael Bringmann 
> ---
> Changes in V13:
>   -- Split patch to improve review
> ---
>  arch/powerpc/mm/numa.c |6 --
>  1 file changed, 4 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> index 312f6ee..c08d736 100644
> --- a/arch/powerpc/mm/numa.c
> +++ b/arch/powerpc/mm/numa.c
> @@ -1543,7 +1543,8 @@ int start_topology_update(void)
>   rc = of_reconfig_notifier_register(_update_nb);
>  #endif
>   }
> - } else if (firmware_has_feature(FW_FEATURE_VPHN) &&
> + }
> + if (firmware_has_feature(FW_FEATURE_VPHN) &&
>  lppaca_shared_proc(get_lppaca())) {
>   if (!vphn_enabled) {
>   vphn_enabled = 1;

In patch 1/4, you removed the setting of prrn_enabled and vphn_enabled
to 0. It seems like that update would be part of this patch.

-Nathan

> @@ -1568,7 +1569,8 @@ int stop_topology_update(void)
>  #ifdef CONFIG_SMP
>   rc = of_reconfig_notifier_unregister(_update_nb);
>  #endif
> - } else if (vphn_enabled) {
> + }
> + if (vphn_enabled) {
>   vphn_enabled = 0;
>   rc = del_timer_sync(&topology_timer);
>   }
> 



Re: [PATCH V13 2/4] powerpc/vphn: Improve recognition of PRRN/VPHN

2017-09-06 Thread Nathan Fontenot


On 09/01/2017 10:48 AM, Michael Bringmann wrote:
> powerpc/vphn: On Power systems with shared configurations of CPUs
> and memory, there are some issues with the association of additional
> CPUs and memory to nodes when hot-adding resources.  This patch
> updates the initialization checks to independently recognize PRRN
> or VPHN support.
> 
> Signed-off-by: Michael Bringmann 
> ---
> Changes in V13:
>   -- Split patch to improve review
> ---
>  arch/powerpc/mm/numa.c |6 --
>  1 file changed, 4 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> index 312f6ee..c08d736 100644
> --- a/arch/powerpc/mm/numa.c
> +++ b/arch/powerpc/mm/numa.c
> @@ -1543,7 +1543,8 @@ int start_topology_update(void)
>   rc = of_reconfig_notifier_register(_update_nb);
>  #endif
>   }
> - } else if (firmware_has_feature(FW_FEATURE_VPHN) &&
> + }
> + if (firmware_has_feature(FW_FEATURE_VPHN) &&
>  lppaca_shared_proc(get_lppaca())) {
>   if (!vphn_enabled) {
>   vphn_enabled = 1;

In patch 1/4, you removed the setting of prrn_enabled and vphn_enabled
to 0. It seems like that update would be part of this patch.

-Nathan

> @@ -1568,7 +1569,8 @@ int stop_topology_update(void)
>  #ifdef CONFIG_SMP
>   rc = of_reconfig_notifier_unregister(_update_nb);
>  #endif
> - } else if (vphn_enabled) {
> + }
> + if (vphn_enabled) {
>   vphn_enabled = 0;
>   rc = del_timer_sync(&topology_timer);
>   }
> 



Re: [PATCH V13 1/4] powerpc/vphn: Update CPU topology when VPHN enabled

2017-09-06 Thread Nathan Fontenot
On 09/01/2017 10:48 AM, Michael Bringmann wrote:
> powerpc/vphn: On Power systems with shared configurations of CPUs
> and memory, there are some issues with the association of additional
> CPUs and memory to nodes when hot-adding resources.  This patch
> corrects the currently broken capability to set the topology for
> shared CPUs in LPARs.  At boot time for shared CPU lpars, the
> topology for each CPU was being set to node zero.  Now when
> numa_update_cpu_topology() is called appropriately, the Virtual
> Processor Home Node (VPHN) capabilities information provided by the
> pHyp allows the appropriate node in the shared configuration to be
> selected for the CPU.
> 
> Signed-off-by: Michael Bringmann 
> ---
> Changes in V13:
>   -- Split patch for improved review
> ---
>  arch/powerpc/mm/numa.c |   31 ---
>  1 file changed, 28 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> index b95c584..312f6ee 100644
> --- a/arch/powerpc/mm/numa.c
> +++ b/arch/powerpc/mm/numa.c
> @@ -1153,6 +1153,8 @@ struct topology_update_data {
>  static int vphn_enabled;
>  static int prrn_enabled;
>  static void reset_topology_timer(void);
> +static int topology_inited;
> +static int topology_update_needed;
> 
>  /*
>   * Store the current values of the associativity change counters in the
> @@ -1246,6 +1248,11 @@ static long vphn_get_associativity(unsigned long cpu,
>   "hcall_vphn() experienced a hardware fault "
>   "preventing VPHN. Disabling polling...\n");
>   stop_topology_update();
> + break;
> + case H_SUCCESS:
> + dbg("VPHN hcall succeeded. Reset polling...\n");
> + timed_topology_update(0);
> + break;
>   }
> 
>   return rc;
> @@ -1323,8 +1330,11 @@ int numa_update_cpu_topology(bool cpus_locked)
>   struct device *dev;
>   int weight, new_nid, i = 0;
> 
> - if (!prrn_enabled && !vphn_enabled)
> + if (!prrn_enabled && !vphn_enabled) {
> + if (!topology_inited)
> + topology_update_needed = 1;
>   return 0;
> + }
> 
>   weight = cpumask_weight(_associativity_changes_mask);
>   if (!weight)
> @@ -1363,6 +1373,8 @@ int numa_update_cpu_topology(bool cpus_locked)
>   cpumask_andnot(_associativity_changes_mask,
>   _associativity_changes_mask,
>   cpu_sibling_mask(cpu));
> + pr_info("Assoc chg gives same node %d for cpu%d\n",
> + new_nid, cpu);

As mentioned previously, this should either be removed or changed to a
debug statement.

>   cpu = cpu_last_thread_sibling(cpu);
>   continue;
>   }
> @@ -1433,6 +1445,7 @@ int numa_update_cpu_topology(bool cpus_locked)
> 
>  out:
>   kfree(updates);
> + topology_update_needed = 0;
>   return changed;
>  }
> 
> @@ -1453,6 +1466,13 @@ static void topology_schedule_update(void)
>   schedule_work(_work);
>  }
> 
> +void shared_topology_update(void)
> +{
> + if (firmware_has_feature(FW_FEATURE_VPHN) &&
> +lppaca_shared_proc(get_lppaca()))

Could we just check vphn_enabled here? The init routine will set this to true
only if the feature is supported and we are using shared processors.

Also, this routine seems a bit odd. The only place it is called from is an
init routine, topology_update_init. Is there a reason this check couldn't just
be in that routine? Alternatively, it seems like you could just call
topology_schedule_update directly from start_topology_update when vphn is
initialized.

-Nathan

> + topology_schedule_update();
> +}
> +
>  static void topology_timer_fn(unsigned long ignored)
>  {
>   if (prrn_enabled && cpumask_weight(_associativity_changes_mask))
> @@ -1519,7 +1539,6 @@ int start_topology_update(void)
>   if (firmware_has_feature(FW_FEATURE_PRRN)) {
>   if (!prrn_enabled) {
>   prrn_enabled = 1;
> - vphn_enabled = 0;
>  #ifdef CONFIG_SMP
>   rc = of_reconfig_notifier_register(_update_nb);
>  #endif
> @@ -1527,7 +1546,6 @@ int start_topology_update(void)
>   } else if (firmware_has_feature(FW_FEATURE_VPHN) &&
>  lppaca_shared_proc(get_lppaca())) {
>   if (!vphn_enabled) {
> - prrn_enabled = 0;
>   vphn_enabled = 1;
>   setup_cpu_associativity_change_counters();
>   init_timer_deferrable(_timer);
> @@ -1613,9 +1631,16 @@ static int topology_update_init(void)
>   if (topology_updates_enabled)
>   start_topology_update();
> 
> + shared_topology_update();
> +
>   if (!proc_create("powerpc/topology_updates", 0644, NULL, _ops))
>   

Re: [PATCH V13 1/4] powerpc/vphn: Update CPU topology when VPHN enabled

2017-09-06 Thread Nathan Fontenot
On 09/01/2017 10:48 AM, Michael Bringmann wrote:
> powerpc/vphn: On Power systems with shared configurations of CPUs
> and memory, there are some issues with the association of additional
> CPUs and memory to nodes when hot-adding resources.  This patch
> corrects the currently broken capability to set the topology for
> shared CPUs in LPARs.  At boot time for shared CPU lpars, the
> topology for each CPU was being set to node zero.  Now when
> numa_update_cpu_topology() is called appropriately, the Virtual
> Processor Home Node (VPHN) capabilities information provided by the
> pHyp allows the appropriate node in the shared configuration to be
> selected for the CPU.
> 
> Signed-off-by: Michael Bringmann 
> ---
> Changes in V13:
>   -- Split patch for improved review
> ---
>  arch/powerpc/mm/numa.c |   31 ---
>  1 file changed, 28 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> index b95c584..312f6ee 100644
> --- a/arch/powerpc/mm/numa.c
> +++ b/arch/powerpc/mm/numa.c
> @@ -1153,6 +1153,8 @@ struct topology_update_data {
>  static int vphn_enabled;
>  static int prrn_enabled;
>  static void reset_topology_timer(void);
> +static int topology_inited;
> +static int topology_update_needed;
> 
>  /*
>   * Store the current values of the associativity change counters in the
> @@ -1246,6 +1248,11 @@ static long vphn_get_associativity(unsigned long cpu,
>   "hcall_vphn() experienced a hardware fault "
>   "preventing VPHN. Disabling polling...\n");
>   stop_topology_update();
> + break;
> + case H_SUCCESS:
> + dbg("VPHN hcall succeeded. Reset polling...\n");
> + timed_topology_update(0);
> + break;
>   }
> 
>   return rc;
> @@ -1323,8 +1330,11 @@ int numa_update_cpu_topology(bool cpus_locked)
>   struct device *dev;
>   int weight, new_nid, i = 0;
> 
> - if (!prrn_enabled && !vphn_enabled)
> + if (!prrn_enabled && !vphn_enabled) {
> + if (!topology_inited)
> + topology_update_needed = 1;
>   return 0;
> + }
> 
>   weight = cpumask_weight(&cpu_associativity_changes_mask);
>   if (!weight)
> @@ -1363,6 +1373,8 @@ int numa_update_cpu_topology(bool cpus_locked)
>   cpumask_andnot(&cpu_associativity_changes_mask,
>   &cpu_associativity_changes_mask,
>   cpu_sibling_mask(cpu));
> + pr_info("Assoc chg gives same node %d for cpu%d\n",
> + new_nid, cpu);

As mentioned previously, this should either be removed or changed to a
debug statement.

>   cpu = cpu_last_thread_sibling(cpu);
>   continue;
>   }
> @@ -1433,6 +1445,7 @@ int numa_update_cpu_topology(bool cpus_locked)
> 
>  out:
>   kfree(updates);
> + topology_update_needed = 0;
>   return changed;
>  }
> 
> @@ -1453,6 +1466,13 @@ static void topology_schedule_update(void)
>   schedule_work(_work);
>  }
> 
> +void shared_topology_update(void)
> +{
> + if (firmware_has_feature(FW_FEATURE_VPHN) &&
> +lppaca_shared_proc(get_lppaca()))

Could we just check vphn_enabled here? The init routine will set this to true
only if the feature is supported and we are using shared processors.

Also, this routine seems a bit odd. The only place it is called from is an
init routine, topology_update_init. Is there a reason this check couldn't just
be in that routine? Alternatively, it seems like you could just call
topology_schedule_update directly from start_topology_update when vphn is
initialized.

-Nathan

> + topology_schedule_update();
> +}
> +
>  static void topology_timer_fn(unsigned long ignored)
>  {
>   if (prrn_enabled && cpumask_weight(&cpu_associativity_changes_mask))
> @@ -1519,7 +1539,6 @@ int start_topology_update(void)
>   if (firmware_has_feature(FW_FEATURE_PRRN)) {
>   if (!prrn_enabled) {
>   prrn_enabled = 1;
> - vphn_enabled = 0;
>  #ifdef CONFIG_SMP
>   rc = of_reconfig_notifier_register(_update_nb);
>  #endif
> @@ -1527,7 +1546,6 @@ int start_topology_update(void)
>   } else if (firmware_has_feature(FW_FEATURE_VPHN) &&
>  lppaca_shared_proc(get_lppaca())) {
>   if (!vphn_enabled) {
> - prrn_enabled = 0;
>   vphn_enabled = 1;
>   setup_cpu_associativity_change_counters();
>   init_timer_deferrable(_timer);
> @@ -1613,9 +1631,16 @@ static int topology_update_init(void)
>   if (topology_updates_enabled)
>   start_topology_update();
> 
> + shared_topology_update();
> +
>   if (!proc_create("powerpc/topology_updates", 0644, NULL, _ops))
>   return -ENOMEM;
> 
> + 

Re: [PATCH V10 1/2] powerpc/numa: Update CPU topology when VPHN enabled

2017-08-28 Thread Nathan Fontenot
On 08/24/2017 05:07 PM, Michael Bringmann wrote:
> 
> powerpc/numa: Correct the currently broken capability to set the
> topology for shared CPUs in LPARs.  At boot time for shared CPU
> lpars, the topology for each shared CPU is set to node zero, however,
> this is now updated correctly using the Virtual Processor Home Node
> (VPHN) capabilities information provided by the pHyp.
> 
> Also, update initialization checks for device-tree attributes to
> independently recognize PRRN or VPHN usage.
> 
> Finally, try to distinguish the VPHN code from the NUMA code better,
> and move relevant functions to another file.

You need to split the move of the vphn code to a different file into
a separate patch. With this all in one patch it is really difficult
to distinguish what pieces are code changes and what is just moving
code around.

-Nathan

> 
> Signed-off-by: Michael Bringmann 
> ---
> Changes in V10:
>   -- Reorganize VPHN code to distinguish it from NUMA processing
> ---
>  arch/powerpc/include/asm/topology.h  |8 
>  arch/powerpc/mm/numa.c   |  503 --
>  arch/powerpc/mm/vphn.c   |  586 
> ++
>  arch/powerpc/mm/vphn.h   |4 
>  arch/powerpc/platforms/pseries/hotplug-cpu.c |2 
>  5 files changed, 609 insertions(+), 494 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/topology.h 
> b/arch/powerpc/include/asm/topology.h
> index dc4e159..600e1c6 100644
> --- a/arch/powerpc/include/asm/topology.h
> +++ b/arch/powerpc/include/asm/topology.h
> @@ -98,6 +98,14 @@ static inline int prrn_is_enabled(void)
>  }
>  #endif /* CONFIG_NUMA && CONFIG_PPC_SPLPAR */
> 
> +#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_NEED_MULTIPLE_NODES)
> +#if defined(CONFIG_PPC_SPLPAR)
> +extern int timed_topology_update(int nsecs);
> +#else
> +#define  timed_topology_update(nsecs)0
> +#endif /* CONFIG_PPC_SPLPAR */
> +#endif /* CONFIG_HOTPLUG_CPU || CONFIG_NEED_MULTIPLE_NODES */
> +
>  #include 
> 
>  #ifdef CONFIG_SMP
> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> index b95c584..73427e290 100644
> --- a/arch/powerpc/mm/numa.c
> +++ b/arch/powerpc/mm/numa.c
> @@ -29,6 +29,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -41,8 +42,12 @@
>  #include 
>  #include 
> 
> +#include "vphn.h"
> +
>  static int numa_enabled = 1;
> 
> +bool topology_updates_enabled = true;
> +
>  static char *cmdline __initdata;
> 
>  static int numa_debug;
> @@ -60,8 +65,7 @@
>  static int n_mem_addr_cells, n_mem_size_cells;
>  static int form1_affinity;
> 
> -#define MAX_DISTANCE_REF_POINTS 4
> -static int distance_ref_points_depth;
> +int distance_ref_points_depth;
>  static const __be32 *distance_ref_points;
>  static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS];
> 
> @@ -142,12 +146,12 @@ static void reset_numa_cpu_lookup_table(void)
>   numa_cpu_lookup_table[cpu] = -1;
>  }
> 
> -static void update_numa_cpu_lookup_table(unsigned int cpu, int node)
> +void update_numa_cpu_lookup_table(unsigned int cpu, int node)
>  {
>   numa_cpu_lookup_table[cpu] = node;
>  }
> 
> -static void map_cpu_to_node(int cpu, int node)
> +void map_cpu_to_node(int cpu, int node)
>  {
>   update_numa_cpu_lookup_table(cpu, node);
> 
> @@ -158,7 +162,7 @@ static void map_cpu_to_node(int cpu, int node)
>  }
> 
>  #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PPC_SPLPAR)
> -static void unmap_cpu_from_node(unsigned long cpu)
> +void unmap_cpu_from_node(unsigned long cpu)
>  {
>   int node = numa_cpu_lookup_table[cpu];
> 
> @@ -233,7 +237,7 @@ static void initialize_distance_lookup_table(int nid,
>  /* Returns nid in the range [0..MAX_NUMNODES-1], or -1 if no useful numa
>   * info is found.
>   */
> -static int associativity_to_nid(const __be32 *associativity)
> +int associativity_to_nid(const __be32 *associativity)
>  {
>   int nid = -1;
> 
> @@ -957,8 +961,6 @@ static int __init early_numa(char *p)
>  }
>  early_param("numa", early_numa);
> 
> -static bool topology_updates_enabled = true;
> -
>  static int __init early_topology_updates(char *p)
>  {
>   if (!p)
> @@ -1135,488 +1137,3 @@ u64 memory_hotplug_max(void)
>  return max(hot_add_drconf_memory_max(), memblock_end_of_DRAM());
>  }
>  #endif /* CONFIG_MEMORY_HOTPLUG */
> -
> -/* Virtual Processor Home Node (VPHN) support */
> -#ifdef CONFIG_PPC_SPLPAR
> -
> -#include "vphn.h"
> -
> -struct topology_update_data {
> - struct topology_update_data *next;
> - unsigned int cpu;
> - int old_nid;
> - int new_nid;
> -};
> -
> -static u8 vphn_cpu_change_counts[NR_CPUS][MAX_DISTANCE_REF_POINTS];
> -static cpumask_t cpu_associativity_changes_mask;
> -static int vphn_enabled;
> -static int prrn_enabled;
> -static void reset_topology_timer(void);
> -
> -/*
> - * Store the current values of the associativity change counters in the
> - * 

Re: [PATCH V10 1/2] powerpc/numa: Update CPU topology when VPHN enabled

2017-08-28 Thread Nathan Fontenot
On 08/24/2017 05:07 PM, Michael Bringmann wrote:
> 
> powerpc/numa: Correct the currently broken capability to set the
> topology for shared CPUs in LPARs.  At boot time for shared CPU
> lpars, the topology for each shared CPU is set to node zero, however,
> this is now updated correctly using the Virtual Processor Home Node
> (VPHN) capabilities information provided by the pHyp.
> 
> Also, update initialization checks for device-tree attributes to
> independently recognize PRRN or VPHN usage.
> 
> Finally, try to distinguish the VPHN code from the NUMA code better,
> and move relevant functions to another file.

You need to split the move of the vphn code to a different file into
a separate patch. With this all in one patch it is really difficult
to distinguish what pieces are code changes and what is just moving
code around.

-Nathan

> 
> Signed-off-by: Michael Bringmann 
> ---
> Changes in V10:
>   -- Reorganize VPHN code to distinguish it from NUMA processing
> ---
>  arch/powerpc/include/asm/topology.h  |8 
>  arch/powerpc/mm/numa.c   |  503 --
>  arch/powerpc/mm/vphn.c   |  586 
> ++
>  arch/powerpc/mm/vphn.h   |4 
>  arch/powerpc/platforms/pseries/hotplug-cpu.c |2 
>  5 files changed, 609 insertions(+), 494 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/topology.h 
> b/arch/powerpc/include/asm/topology.h
> index dc4e159..600e1c6 100644
> --- a/arch/powerpc/include/asm/topology.h
> +++ b/arch/powerpc/include/asm/topology.h
> @@ -98,6 +98,14 @@ static inline int prrn_is_enabled(void)
>  }
>  #endif /* CONFIG_NUMA && CONFIG_PPC_SPLPAR */
> 
> +#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_NEED_MULTIPLE_NODES)
> +#if defined(CONFIG_PPC_SPLPAR)
> +extern int timed_topology_update(int nsecs);
> +#else
> +#define  timed_topology_update(nsecs)0
> +#endif /* CONFIG_PPC_SPLPAR */
> +#endif /* CONFIG_HOTPLUG_CPU || CONFIG_NEED_MULTIPLE_NODES */
> +
>  #include 
> 
>  #ifdef CONFIG_SMP
> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> index b95c584..73427e290 100644
> --- a/arch/powerpc/mm/numa.c
> +++ b/arch/powerpc/mm/numa.c
> @@ -29,6 +29,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -41,8 +42,12 @@
>  #include 
>  #include 
> 
> +#include "vphn.h"
> +
>  static int numa_enabled = 1;
> 
> +bool topology_updates_enabled = true;
> +
>  static char *cmdline __initdata;
> 
>  static int numa_debug;
> @@ -60,8 +65,7 @@
>  static int n_mem_addr_cells, n_mem_size_cells;
>  static int form1_affinity;
> 
> -#define MAX_DISTANCE_REF_POINTS 4
> -static int distance_ref_points_depth;
> +int distance_ref_points_depth;
>  static const __be32 *distance_ref_points;
>  static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS];
> 
> @@ -142,12 +146,12 @@ static void reset_numa_cpu_lookup_table(void)
>   numa_cpu_lookup_table[cpu] = -1;
>  }
> 
> -static void update_numa_cpu_lookup_table(unsigned int cpu, int node)
> +void update_numa_cpu_lookup_table(unsigned int cpu, int node)
>  {
>   numa_cpu_lookup_table[cpu] = node;
>  }
> 
> -static void map_cpu_to_node(int cpu, int node)
> +void map_cpu_to_node(int cpu, int node)
>  {
>   update_numa_cpu_lookup_table(cpu, node);
> 
> @@ -158,7 +162,7 @@ static void map_cpu_to_node(int cpu, int node)
>  }
> 
>  #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PPC_SPLPAR)
> -static void unmap_cpu_from_node(unsigned long cpu)
> +void unmap_cpu_from_node(unsigned long cpu)
>  {
>   int node = numa_cpu_lookup_table[cpu];
> 
> @@ -233,7 +237,7 @@ static void initialize_distance_lookup_table(int nid,
>  /* Returns nid in the range [0..MAX_NUMNODES-1], or -1 if no useful numa
>   * info is found.
>   */
> -static int associativity_to_nid(const __be32 *associativity)
> +int associativity_to_nid(const __be32 *associativity)
>  {
>   int nid = -1;
> 
> @@ -957,8 +961,6 @@ static int __init early_numa(char *p)
>  }
>  early_param("numa", early_numa);
> 
> -static bool topology_updates_enabled = true;
> -
>  static int __init early_topology_updates(char *p)
>  {
>   if (!p)
> @@ -1135,488 +1137,3 @@ u64 memory_hotplug_max(void)
>  return max(hot_add_drconf_memory_max(), memblock_end_of_DRAM());
>  }
>  #endif /* CONFIG_MEMORY_HOTPLUG */
> -
> -/* Virtual Processor Home Node (VPHN) support */
> -#ifdef CONFIG_PPC_SPLPAR
> -
> -#include "vphn.h"
> -
> -struct topology_update_data {
> - struct topology_update_data *next;
> - unsigned int cpu;
> - int old_nid;
> - int new_nid;
> -};
> -
> -static u8 vphn_cpu_change_counts[NR_CPUS][MAX_DISTANCE_REF_POINTS];
> -static cpumask_t cpu_associativity_changes_mask;
> -static int vphn_enabled;
> -static int prrn_enabled;
> -static void reset_topology_timer(void);
> -
> -/*
> - * Store the current values of the associativity change counters in the
> - * hypervisor.
> - */
> 

Re: [PATCH V9 2/2] powerpc/nodes: Ensure enough nodes avail for operations

2017-08-23 Thread Nathan Fontenot
On 08/21/2017 04:44 PM, Michael Bringmann wrote:
> To: linuxppc-...@lists.ozlabs.org
> 
> From: Michael Bringmann <m...@linux.vnet.ibm.com>
> 
> To: linux-kernel@vger.kernel.org
> Cc: Michael Ellerman <m...@ellerman.id.au>
> Cc: Michael Bringmann <m...@linux.vnet.ibm.com>
> Cc: John Allen <jal...@linux.vnet.ibm.com>
> Cc: Nathan Fontenot <nf...@linux.vnet.ibm.com>
> Subject: [PATCH V9 2/2] powerpc/nodes: Ensure enough nodes avail for 
> operations
> 
> powerpc/nodes: On systems like PowerPC which allow 'hot-add' of CPU
> or memory resources, it may occur that the new resources are to be
> inserted into nodes that were not used for these resources at bootup.
> In the kernel, any node that is used must be defined and initialized
> at boot.
> 
> This patch extracts the value of the lowest domain level (number of
> allocable resources) from the "rtas" device tree property
> "ibm,max-associativity-domains" to use as the maximum number of nodes
> to setup as possibly available in the system.  This new setting will
> override the instruction,
> 
> nodes_and(node_possible_map, node_possible_map, node_online_map);
> 
> presently seen in the function arch/powerpc/mm/numa.c:initmem_init().
> 
> If the property is not present at boot, no operation will be performed
> to define or enable additional nodes.
> 
> Signed-off-by: Michael Bringmann <m...@linux.vnet.ibm.com>
> ---
>  arch/powerpc/mm/numa.c |   44 
>  1 file changed, 44 insertions(+)
> 
> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> index 3fd4536..3ae6510 100644
> --- a/arch/powerpc/mm/numa.c
> +++ b/arch/powerpc/mm/numa.c
> @@ -893,6 +893,48 @@ static void __init setup_node_data(int nid, u64 
> start_pfn, u64 end_pfn)
>   NODE_DATA(nid)->node_spanned_pages = spanned_pages;
>  }
> 
> +static void __init node_associativity_setup(void)
> +{
> + struct device_node *rtas;
> + printk(KERN_INFO "%s:%d\n", __FUNCTION__, __LINE__);

Is there a reason we need to have all these KERN_INFO printk's?

This looks like debug statements that accidentally were left in.

> +
> + rtas = of_find_node_by_path("/rtas");
> + if (rtas) {
> + const __be32 *prop;
> + u32 len, entries, levelval, i;
> + printk(KERN_INFO "%s:%d\n", __FUNCTION__, __LINE__);
> +
> + prop = of_get_property(rtas, "ibm,max-associativity-domains",
> &len);

You could put the of_node_put() call here after getting the property and get
rid of all the goto's.

> + if (!prop || len < sizeof(unsigned int)) {
> + printk(KERN_INFO "%s:%d\n", __FUNCTION__, __LINE__);
> + goto endit;
> + }
> +
> + entries = of_read_number(prop++, 1);
> +
> + if (len < (entries * sizeof(unsigned int))) {
> + printk(KERN_INFO "%s:%d\n", __FUNCTION__, __LINE__);
> + goto endit;
> + }
> +
> + for (i = 0; i < entries; i++)
> + levelval = of_read_number(prop++, 1);

Couldn't you just read the last entry instead of doing a loop reading each
entry until you get to the last one?

-Nathan

> +
> + printk(KERN_INFO "Numa nodes avail: %d (%d) \n", (int) 
> levelval, (int) entries);
> +
> + for (i = 0; i < levelval; i++) {
> + if (!node_possible(i)) {
> + setup_node_data(i, 0, 0);
> + node_set(i, node_possible_map);
> + }
> + }
> + }
> +
> +endit:
> + if (rtas)
> + of_node_put(rtas);
> +}
> +
>  void __init initmem_init(void)
>  {
>   int nid, cpu;
> @@ -912,6 +954,8 @@ void __init initmem_init(void)
>*/
>   nodes_and(node_possible_map, node_possible_map, node_online_map);
> 
> + node_associativity_setup();
> +
>   for_each_online_node(nid) {
>   unsigned long start_pfn, end_pfn;
> 



Re: [PATCH V9 2/2] powerpc/nodes: Ensure enough nodes avail for operations

2017-08-23 Thread Nathan Fontenot
On 08/21/2017 04:44 PM, Michael Bringmann wrote:
> To: linuxppc-...@lists.ozlabs.org
> 
> From: Michael Bringmann 
> 
> To: linux-kernel@vger.kernel.org
> Cc: Michael Ellerman 
> Cc: Michael Bringmann 
> Cc: John Allen 
> Cc: Nathan Fontenot 
> Subject: [PATCH V9 2/2] powerpc/nodes: Ensure enough nodes avail for 
> operations
> 
> powerpc/nodes: On systems like PowerPC which allow 'hot-add' of CPU
> or memory resources, it may occur that the new resources are to be
> inserted into nodes that were not used for these resources at bootup.
> In the kernel, any node that is used must be defined and initialized
> at boot.
> 
> This patch extracts the value of the lowest domain level (number of
> allocable resources) from the "rtas" device tree property
> "ibm,max-associativity-domains" to use as the maximum number of nodes
> to setup as possibly available in the system.  This new setting will
> override the instruction,
> 
> nodes_and(node_possible_map, node_possible_map, node_online_map);
> 
> presently seen in the function arch/powerpc/mm/numa.c:initmem_init().
> 
> If the property is not present at boot, no operation will be performed
> to define or enable additional nodes.
> 
> Signed-off-by: Michael Bringmann 
> ---
>  arch/powerpc/mm/numa.c |   44 
>  1 file changed, 44 insertions(+)
> 
> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> index 3fd4536..3ae6510 100644
> --- a/arch/powerpc/mm/numa.c
> +++ b/arch/powerpc/mm/numa.c
> @@ -893,6 +893,48 @@ static void __init setup_node_data(int nid, u64 
> start_pfn, u64 end_pfn)
>   NODE_DATA(nid)->node_spanned_pages = spanned_pages;
>  }
> 
> +static void __init node_associativity_setup(void)
> +{
> + struct device_node *rtas;
> + printk(KERN_INFO "%s:%d\n", __FUNCTION__, __LINE__);

Is there a reason we need to have all these KERN_INFO printk's?

This looks like debug statements that accidentally were left in.

> +
> + rtas = of_find_node_by_path("/rtas");
> + if (rtas) {
> + const __be32 *prop;
> + u32 len, entries, levelval, i;
> + printk(KERN_INFO "%s:%d\n", __FUNCTION__, __LINE__);
> +
> + prop = of_get_property(rtas, "ibm,max-associativity-domains",
> &len);

You could put the of_node_put() call here after getting the property and get
rid of all the goto's.

> + if (!prop || len < sizeof(unsigned int)) {
> + printk(KERN_INFO "%s:%d\n", __FUNCTION__, __LINE__);
> + goto endit;
> + }
> +
> + entries = of_read_number(prop++, 1);
> +
> + if (len < (entries * sizeof(unsigned int))) {
> + printk(KERN_INFO "%s:%d\n", __FUNCTION__, __LINE__);
> + goto endit;
> + }
> +
> + for (i = 0; i < entries; i++)
> + levelval = of_read_number(prop++, 1);

Couldn't you just read the last entry instead of doing a loop reading each
entry until you get to the last one?

-Nathan

> +
> + printk(KERN_INFO "Numa nodes avail: %d (%d) \n", (int) 
> levelval, (int) entries);
> +
> + for (i = 0; i < levelval; i++) {
> + if (!node_possible(i)) {
> + setup_node_data(i, 0, 0);
> + node_set(i, node_possible_map);
> + }
> + }
> + }
> +
> +endit:
> + if (rtas)
> + of_node_put(rtas);
> +}
> +
>  void __init initmem_init(void)
>  {
>   int nid, cpu;
> @@ -912,6 +954,8 @@ void __init initmem_init(void)
>*/
>   nodes_and(node_possible_map, node_possible_map, node_online_map);
> 
> + node_associativity_setup();
> +
>   for_each_online_node(nid) {
>   unsigned long start_pfn, end_pfn;
> 



Re: [PATCH V9 1/2] powerpc/numa: Update CPU topology when VPHN enabled

2017-08-23 Thread Nathan Fontenot
On 08/23/2017 06:41 AM, Michael Ellerman wrote:
> Michael Bringmann  writes:
> 
>> powerpc/numa: Correct the currently broken capability to set the
>> topology for shared CPUs in LPARs.  At boot time for shared CPU
>> lpars, the topology for each shared CPU is set to node zero, however,
>> this is now updated correctly using the Virtual Processor Home Node
>> (VPHN) capabilities information provided by the pHyp.
>>
>> Also, update initialization checks for device-tree attributes to
>> independently recognize PRRN or VPHN usage.
> 
> Did you ever do anything to address Nathan's comments on v4 ?
> 
>   http://patchwork.ozlabs.org/patch/767587/

Looking at this patch I do not see that VPHN is always enabled.

> 
> 
> Also your change log doesn't describe anything about what the patch does
> and why it is the correct fix for the problem.
> 
> When a DLPAR happens you modify the VPHN timer to run in 1 nsec, but you
> don't wait for it. Why would we not just run the logic synchronously?
> 
> It also seems to make VPHN and PRRN no longer exclusive, which looking
> at PAPR seems like it might be correct, but is also a major change so
> please justify it in detail.

This is correct, they are not exclusive. When we first added PRRN support
we mistakenly thought they were exclusive which is why the code currently
only starts PRRN, or VPHN if PRRN is not present.

> 
> Comments below.
> 
> 
>> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
>> index b95c584..3fd4536 100644
>> --- a/arch/powerpc/mm/numa.c
>> +++ b/arch/powerpc/mm/numa.c
>> @@ -906,7 +907,7 @@ void __init initmem_init(void)
>>  
>>  /*
>>   * Reduce the possible NUMA nodes to the online NUMA nodes,
>> - * since we do not support node hotplug. This ensures that  we
>> + * since we do not support node hotplug. This ensures that we
> 
> Please do whitespace/spelling changes in a separate patch.
> 
>>   * lower the maximum NUMA node ID to what is actually present.
>>   */
>>  nodes_and(node_possible_map, node_possible_map, node_online_map);
>> @@ -1148,11 +1149,32 @@ struct topology_update_data {
>>  int new_nid;
>>  };
>>  
>> +#define TOPOLOGY_DEF_TIMER_SECS 60
>> +
>>  static u8 vphn_cpu_change_counts[NR_CPUS][MAX_DISTANCE_REF_POINTS];
>>  static cpumask_t cpu_associativity_changes_mask;
>>  static int vphn_enabled;
>>  static int prrn_enabled;
>>  static void reset_topology_timer(void);
>> +static int topology_timer_secs = TOPOLOGY_DEF_TIMER_SECS;
>> +static int topology_inited;
>> +static int topology_update_needed;
> 
> None of this code should be in numa.c. Which is not your fault but I'm
> inclined to move it before we make it worse.

Agreed. Perhaps this should all be in mm/vphn.c

-Nathan

> 
>> +
>> +/*
>> + * Change polling interval for associativity changes.
>> + */
>> +int timed_topology_update(int nsecs)
>> +{
>> +if (nsecs > 0)
>> +topology_timer_secs = nsecs;
>> +else
>> +topology_timer_secs = TOPOLOGY_DEF_TIMER_SECS;
>> +
>> +if (vphn_enabled)
>> +reset_topology_timer();
>> +
>> +return 0;
>> +}
>>  
>>  /*
>>   * Store the current values of the associativity change counters in the
>> @@ -1246,6 +1268,12 @@ static long vphn_get_associativity(unsigned long cpu,
>>  "hcall_vphn() experienced a hardware fault "
>>  "preventing VPHN. Disabling polling...\n");
>>  stop_topology_update();
>> +break;
>> +case H_SUCCESS:
>> +printk(KERN_INFO
>> +"VPHN hcall succeeded. Reset polling...\n");
> 
> We don't need that to hit everyone's console once a minute. Remove it or
> pr_debug() if you like.
> 
>> @@ -1363,6 +1394,8 @@ int numa_update_cpu_topology(bool cpus_locked)
>>  cpumask_andnot(&cpu_associativity_changes_mask,
>>  &cpu_associativity_changes_mask,
>>  cpu_sibling_mask(cpu));
>> +pr_info("Assoc chg gives same node %d for cpu%d\n",
>> +new_nid, cpu);
> 
> No thanks.
> 
>> @@ -1379,6 +1412,9 @@ int numa_update_cpu_topology(bool cpus_locked)
>>  cpu = cpu_last_thread_sibling(cpu);
>>  }
>>  
>> +if (i)
>> +updates[i-1].next = NULL;
> 
> ???
> 
>> @@ -1453,6 +1490,14 @@ static void topology_schedule_update(void)
>>  schedule_work(_work);
>>  }
>>  
>> +void shared_topology_update(void)
>> +{
>> +if (firmware_has_feature(FW_FEATURE_VPHN) &&
>> +   lppaca_shared_proc(get_lppaca()))
>> +topology_schedule_update();
>> +}
>> +EXPORT_SYMBOL(shared_topology_update);
> 
> There's no reason for that to be exported AFAICS.
> 
>> diff --git a/arch/powerpc/platforms/pseries/dlpar.c 
>> b/arch/powerpc/platforms/pseries/dlpar.c
>> index 3918769..ba9a4a0 100644
>> --- a/arch/powerpc/platforms/pseries/dlpar.c
>> +++ 

Re: [PATCH V9 1/2] powerpc/numa: Update CPU topology when VPHN enabled

2017-08-23 Thread Nathan Fontenot
On 08/23/2017 06:41 AM, Michael Ellerman wrote:
> Michael Bringmann  writes:
> 
>> powerpc/numa: Correct the currently broken capability to set the
>> topology for shared CPUs in LPARs.  At boot time for shared CPU
>> lpars, the topology for each shared CPU is set to node zero, however,
>> this is now updated correctly using the Virtual Processor Home Node
>> (VPHN) capabilities information provided by the pHyp.
>>
>> Also, update initialization checks for device-tree attributes to
>> independently recognize PRRN or VPHN usage.
> 
> Did you ever do anything to address Nathan's comments on v4 ?
> 
>   http://patchwork.ozlabs.org/patch/767587/

Looking at this patch I do not see that VPHN is always enabled.

> 
> 
> Also your change log doesn't describe anything about what the patch does
> and why it is the correct fix for the problem.
> 
> When a DLPAR happens you modify the VPHN timer to run in 1 nsec, but you
> don't wait for it. Why would we not just run the logic synchronously?
> 
> It also seems to make VPHN and PRRN no longer exclusive, which looking
> at PAPR seems like it might be correct, but is also a major change so
> please justify it in detail.

This is correct, they are not exclusive. When we first added PRRN support
we mistakenly thought they were exclusive which is why the code currently
only starts PRRN, or VPHN if PRRN is not present.

> 
> Comments below.
> 
> 
>> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
>> index b95c584..3fd4536 100644
>> --- a/arch/powerpc/mm/numa.c
>> +++ b/arch/powerpc/mm/numa.c
>> @@ -906,7 +907,7 @@ void __init initmem_init(void)
>>  
>>  /*
>>   * Reduce the possible NUMA nodes to the online NUMA nodes,
>> - * since we do not support node hotplug. This ensures that  we
>> + * since we do not support node hotplug. This ensures that we
> 
> Please do whitespace/spelling changes in a separate patch.
> 
>>   * lower the maximum NUMA node ID to what is actually present.
>>   */
>>  nodes_and(node_possible_map, node_possible_map, node_online_map);
>> @@ -1148,11 +1149,32 @@ struct topology_update_data {
>>  int new_nid;
>>  };
>>  
>> +#define TOPOLOGY_DEF_TIMER_SECS 60
>> +
>>  static u8 vphn_cpu_change_counts[NR_CPUS][MAX_DISTANCE_REF_POINTS];
>>  static cpumask_t cpu_associativity_changes_mask;
>>  static int vphn_enabled;
>>  static int prrn_enabled;
>>  static void reset_topology_timer(void);
>> +static int topology_timer_secs = TOPOLOGY_DEF_TIMER_SECS;
>> +static int topology_inited;
>> +static int topology_update_needed;
> 
> None of this code should be in numa.c. Which is not your fault but I'm
> inclined to move it before we make it worse.

Agreed. Perhaps this should all be in mm/vphn.c

-Nathan

> 
>> +
>> +/*
>> + * Change polling interval for associativity changes.
>> + */
>> +int timed_topology_update(int nsecs)
>> +{
>> +if (nsecs > 0)
>> +topology_timer_secs = nsecs;
>> +else
>> +topology_timer_secs = TOPOLOGY_DEF_TIMER_SECS;
>> +
>> +if (vphn_enabled)
>> +reset_topology_timer();
>> +
>> +return 0;
>> +}
>>  
>>  /*
>>   * Store the current values of the associativity change counters in the
>> @@ -1246,6 +1268,12 @@ static long vphn_get_associativity(unsigned long cpu,
>>  "hcall_vphn() experienced a hardware fault "
>>  "preventing VPHN. Disabling polling...\n");
>>  stop_topology_update();
>> +break;
>> +case H_SUCCESS:
>> +printk(KERN_INFO
>> +"VPHN hcall succeeded. Reset polling...\n");
> 
> We don't need that to hit everyone's console once a minute. Remove it or
> pr_debug() if you like.
> 
>> @@ -1363,6 +1394,8 @@ int numa_update_cpu_topology(bool cpus_locked)
>>  cpumask_andnot(&cpu_associativity_changes_mask,
>>  &cpu_associativity_changes_mask,
>>  cpu_sibling_mask(cpu));
>> +pr_info("Assoc chg gives same node %d for cpu%d\n",
>> +new_nid, cpu);
> 
> No thanks.
> 
>> @@ -1379,6 +1412,9 @@ int numa_update_cpu_topology(bool cpus_locked)
>>  cpu = cpu_last_thread_sibling(cpu);
>>  }
>>  
>> +if (i)
>> +updates[i-1].next = NULL;
> 
> ???
> 
>> @@ -1453,6 +1490,14 @@ static void topology_schedule_update(void)
>>  schedule_work(_work);
>>  }
>>  
>> +void shared_topology_update(void)
>> +{
>> +if (firmware_has_feature(FW_FEATURE_VPHN) &&
>> +   lppaca_shared_proc(get_lppaca()))
>> +topology_schedule_update();
>> +}
>> +EXPORT_SYMBOL(shared_topology_update);
> 
> There's no reason for that to be exported AFAICS.
> 
>> diff --git a/arch/powerpc/platforms/pseries/dlpar.c 
>> b/arch/powerpc/platforms/pseries/dlpar.c
>> index 3918769..ba9a4a0 100644
>> --- a/arch/powerpc/platforms/pseries/dlpar.c
>> +++ b/arch/powerpc/platforms/pseries/dlpar.c

Re: [PATCH v6] workqueue: Fix edge cases for calc of pool's cpumask

2017-07-27 Thread Nathan Fontenot
On 07/27/2017 01:15 PM, Michael Bringmann wrote:
> 
> On NUMA systems with dynamic processors, the content of the cpumask
> may change over time.  As new processors are added via DLPAR operations,
> workqueues are created for them.  Depending upon the order in which CPUs
> are added/removed, we may run into problems with the content of the
> cpumask used by the workqueues.  This patch deals with situations where
> the online cpumask for a node is a proper superset of possible cpumask
> for the node.  It also deals with edge cases where the order in which
> CPUs are removed/added from the online cpumask may leave the set for a
> node empty, and require execution by CPUs on another node.
> 
> In these and other cases, the patch attempts to ensure that a valid,
> usable cpumask is used to set up newly created pools for workqueues.
> This patch provides a fix for NUMA systems which can add/subtract
> processors dynamically.  The patch is expected to be an intermediate
> one while developers search for any underlying issues.
> 
> [With additions to the patch provided by Tejun Heo]
> 
> Signed-off-by: Michael Bringmann 
> ---
> Changes in V6:
>   -- Update descriptive text
> ---
>  kernel/workqueue.c |7 +++
>  1 file changed, 7 insertions(+)
> 
> diff --git a/kernel/workqueue.c b/kernel/workqueue.c
> index c74bf39..6b6d540 100644
> --- a/kernel/workqueue.c
> +++ b/kernel/workqueue.c
> @@ -3577,6 +3577,13 @@ static bool wq_calc_node_cpumask(const struct 
> workqueue_attrs *attrs, int node,
> 
>   /* yeap, return possible CPUs in @node that @attrs wants */
>   cpumask_and(cpumask, attrs->cpumask, wq_numa_possible_cpumask[node]);
> +
> + if (cpumask_empty(cpumask)) {
> + pr_warn_once("WARNING: workqueue cpumask: onl intersect > "
  ^^^
This message doesn't seem right, or am I missing something? What does "onl" mean here?

-Nathan

> + "possible intersect\n");
> + return false;
> + }
> +
>   return !cpumask_equal(cpumask, attrs->cpumask);
> 
>  use_dfl:
> 



Re: [PATCH v6] workqueue: Fix edge cases for calc of pool's cpumask

2017-07-27 Thread Nathan Fontenot
On 07/27/2017 01:15 PM, Michael Bringmann wrote:
> 
> On NUMA systems with dynamic processors, the content of the cpumask
> may change over time.  As new processors are added via DLPAR operations,
> workqueues are created for them.  Depending upon the order in which CPUs
> are added/removed, we may run into problems with the content of the
> cpumask used by the workqueues.  This patch deals with situations where
> the online cpumask for a node is a proper superset of possible cpumask
> for the node.  It also deals with edge cases where the order in which
> CPUs are removed/added from the online cpumask may leave the set for a
> node empty, and require execution by CPUs on another node.
> 
> In these and other cases, the patch attempts to ensure that a valid,
> usable cpumask is used to set up newly created pools for workqueues.
> This patch provides a fix for NUMA systems which can add/subtract
> processors dynamically.  The patch is expected to be an intermediate
> one while developers search for any underlying issues.
> 
> [With additions to the patch provided by Tejun Heo]
> 
> Signed-off-by: Michael Bringmann 
> ---
> Changes in V6:
>   -- Update descriptive text
> ---
>  kernel/workqueue.c |7 +++
>  1 file changed, 7 insertions(+)
> 
> diff --git a/kernel/workqueue.c b/kernel/workqueue.c
> index c74bf39..6b6d540 100644
> --- a/kernel/workqueue.c
> +++ b/kernel/workqueue.c
> @@ -3577,6 +3577,13 @@ static bool wq_calc_node_cpumask(const struct 
> workqueue_attrs *attrs, int node,
> 
>   /* yeap, return possible CPUs in @node that @attrs wants */
>   cpumask_and(cpumask, attrs->cpumask, wq_numa_possible_cpumask[node]);
> +
> + if (cpumask_empty(cpumask)) {
> + pr_warn_once("WARNING: workqueue cpumask: onl intersect > "
  ^^^
This message doesn't seem right, or I am missing something, "onl"?

-Nathan

> + "possible intersect\n");
> + return false;
> + }
> +
>   return !cpumask_equal(cpumask, attrs->cpumask);
> 
>  use_dfl:
> 



Re: [PATCH V4 1/2] powerpc/numa: Update CPU topology when VPHN enabled

2017-05-30 Thread Nathan Fontenot
On 05/26/2017 04:29 PM, Michael Bringmann wrote:
> 
> powerpc/numa: Correct the currently broken capability to set the
> topology for shared CPUs in LPARs.  At boot time for shared CPU
> lpars, the topology for each shared CPU is set to node zero, however,
> this is now updated correctly using the Virtual Processor Home Node
> (VPHN) capabilities information provided by the pHyp. The VPHN handling
> in Linux is disabled, if PRRN handling is present.

I'm still not sure this is what we want. Looking at the topology updating
code, we only enable VPHN if PRRN is not present.

My understanding of the current situation is that the node for partitions
with shared cpus is not set. The reason for this is that the device tree
presented to a partition using shared cpus at boot puts all cpus in node
zero and then uses the VPHN capability to inform the partition which node
each cpu really belongs to.

Additionally, I think this is how DLPAR of shared cpu partitions works.
After the cpu is DLPAR added we should get a VPHN notification to inform
us of the true node that the cpu belongs to.

When the PRRN capability was introduced it was thought to be a follow-on
to the VPHN capability and so the code to start topology updating only
enables VPHN if PRRN is not present. I think what we need to do is always
enable VPHN for shared cpu partitions.

-Nathan

> 
> Signed-off-by: Michael Bringmann 
> ---
> Changes in V4:
>  -- Fix conditional compile bug.
> ---
>  arch/powerpc/mm/numa.c |   19 ++-
>  arch/powerpc/platforms/pseries/dlpar.c |2 ++
>  2 files changed, 20 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> index 371792e..afcee3f 100644
> --- a/arch/powerpc/mm/numa.c
> +++ b/arch/powerpc/mm/numa.c
> @@ -29,6 +29,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -1153,6 +1154,8 @@ struct topology_update_data {
>  static int vphn_enabled;
>  static int prrn_enabled;
>  static void reset_topology_timer(void);
> +static int topology_inited;
> +static int topology_update_needed;
> 
>  /*
>   * Store the current values of the associativity change counters in the
> @@ -1321,8 +1324,11 @@ int arch_update_cpu_topology(void)
>   struct device *dev;
>   int weight, new_nid, i = 0;
> 
> - if (!prrn_enabled && !vphn_enabled)
> + if (!prrn_enabled && !vphn_enabled) {
> + if (!topology_inited)
> + topology_update_needed = 1;
>   return 0;
> + }
> 
>   weight = cpumask_weight(_associativity_changes_mask);
>   if (!weight)
> @@ -1361,6 +1367,8 @@ int arch_update_cpu_topology(void)
>   cpumask_andnot(_associativity_changes_mask,
>   _associativity_changes_mask,
>   cpu_sibling_mask(cpu));
> + pr_info("Assoc chg gives same node %d for cpu%d\n",
> + new_nid, cpu);
>   cpu = cpu_last_thread_sibling(cpu);
>   continue;
>   }
> @@ -1377,6 +1385,9 @@ int arch_update_cpu_topology(void)
>   cpu = cpu_last_thread_sibling(cpu);
>   }
> 
> + if (i)
> + updates[i-1].next = NULL;
> +
>   pr_debug("Topology update for the following CPUs:\n");
>   if (cpumask_weight(_cpus)) {
>   for (ud = &updates[0]; ud; ud = ud->next) {
> @@ -1423,6 +1434,7 @@ int arch_update_cpu_topology(void)
> 
>  out:
>   kfree(updates);
> + topology_update_needed = 0;
>   return changed;
>  }
> 
> @@ -1600,6 +1612,11 @@ static int topology_update_init(void)
>   if (!proc_create("powerpc/topology_updates", 0644, NULL, _ops))
>   return -ENOMEM;
> 
> + topology_inited = 1;
> + if (topology_update_needed)
> + bitmap_fill(cpumask_bits(_associativity_changes_mask),
> + nr_cpumask_bits);
> +
>   return 0;
>  }
>  device_initcall(topology_update_init);
> diff --git a/arch/powerpc/platforms/pseries/dlpar.c 
> b/arch/powerpc/platforms/pseries/dlpar.c
> index bda18d8..5106263 100644
> --- a/arch/powerpc/platforms/pseries/dlpar.c
> +++ b/arch/powerpc/platforms/pseries/dlpar.c
> @@ -592,6 +592,8 @@ static ssize_t dlpar_show(struct class *class, struct 
> class_attribute *attr,
> 
>  static int __init pseries_dlpar_init(void)
>  {
> + arch_update_cpu_topology();
> +
>   pseries_hp_wq = alloc_workqueue("pseries hotplug workqueue",
>   WQ_UNBOUND, 1);
>   return sysfs_create_file(kernel_kobj, _attr_dlpar.attr);
> 



Re: [PATCH V4 1/2] powerpc/numa: Update CPU topology when VPHN enabled

2017-05-30 Thread Nathan Fontenot
On 05/26/2017 04:29 PM, Michael Bringmann wrote:
> 
> powerpc/numa: Correct the currently broken capability to set the
> topology for shared CPUs in LPARs.  At boot time for shared CPU
> lpars, the topology for each shared CPU is set to node zero, however,
> this is now updated correctly using the Virtual Processor Home Node
> (VPHN) capabilities information provided by the pHyp. The VPHN handling
> in Linux is disabled, if PRRN handling is present.

I'm still not sure this is what we want. Looking at the topology updating
code, we only enable VPHN if PRRN is not present.

My understanding of the current situation is that the node for partitions
with shared cpus is not set. The reason for this is that the device tree
presented to a partition using shared cpus at boot puts all cpus in node
zero and then uses the VPHN capability to inform the partition which node
each cpu really belongs to.

Additionally, I think this is how DLPAR of shared cpu partitions works.
After the cpu is DLPAR added we should get a VPHN notification to inform
us of the true node that the cpu belongs to.

When the PRRN capability was introduced it was thought to be a follow-on
to the VPHN capability and so the code to start topology updating only
enables VPHN if PRRN is not present. I think what we need to do is always
enable VPHN for shared cpu partitions.

-Nathan

> 
> Signed-off-by: Michael Bringmann 
> ---
> Changes in V4:
>  -- Fix conditional compile bug.
> ---
>  arch/powerpc/mm/numa.c |   19 ++-
>  arch/powerpc/platforms/pseries/dlpar.c |2 ++
>  2 files changed, 20 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> index 371792e..afcee3f 100644
> --- a/arch/powerpc/mm/numa.c
> +++ b/arch/powerpc/mm/numa.c
> @@ -29,6 +29,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -1153,6 +1154,8 @@ struct topology_update_data {
>  static int vphn_enabled;
>  static int prrn_enabled;
>  static void reset_topology_timer(void);
> +static int topology_inited;
> +static int topology_update_needed;
> 
>  /*
>   * Store the current values of the associativity change counters in the
> @@ -1321,8 +1324,11 @@ int arch_update_cpu_topology(void)
>   struct device *dev;
>   int weight, new_nid, i = 0;
> 
> - if (!prrn_enabled && !vphn_enabled)
> + if (!prrn_enabled && !vphn_enabled) {
> + if (!topology_inited)
> + topology_update_needed = 1;
>   return 0;
> + }
> 
>   weight = cpumask_weight(_associativity_changes_mask);
>   if (!weight)
> @@ -1361,6 +1367,8 @@ int arch_update_cpu_topology(void)
>   cpumask_andnot(_associativity_changes_mask,
>   _associativity_changes_mask,
>   cpu_sibling_mask(cpu));
> + pr_info("Assoc chg gives same node %d for cpu%d\n",
> + new_nid, cpu);
>   cpu = cpu_last_thread_sibling(cpu);
>   continue;
>   }
> @@ -1377,6 +1385,9 @@ int arch_update_cpu_topology(void)
>   cpu = cpu_last_thread_sibling(cpu);
>   }
> 
> + if (i)
> + updates[i-1].next = NULL;
> +
>   pr_debug("Topology update for the following CPUs:\n");
>   if (cpumask_weight(_cpus)) {
>   for (ud = &updates[0]; ud; ud = ud->next) {
> @@ -1423,6 +1434,7 @@ int arch_update_cpu_topology(void)
> 
>  out:
>   kfree(updates);
> + topology_update_needed = 0;
>   return changed;
>  }
> 
> @@ -1600,6 +1612,11 @@ static int topology_update_init(void)
>   if (!proc_create("powerpc/topology_updates", 0644, NULL, _ops))
>   return -ENOMEM;
> 
> + topology_inited = 1;
> + if (topology_update_needed)
> + bitmap_fill(cpumask_bits(_associativity_changes_mask),
> + nr_cpumask_bits);
> +
>   return 0;
>  }
>  device_initcall(topology_update_init);
> diff --git a/arch/powerpc/platforms/pseries/dlpar.c 
> b/arch/powerpc/platforms/pseries/dlpar.c
> index bda18d8..5106263 100644
> --- a/arch/powerpc/platforms/pseries/dlpar.c
> +++ b/arch/powerpc/platforms/pseries/dlpar.c
> @@ -592,6 +592,8 @@ static ssize_t dlpar_show(struct class *class, struct 
> class_attribute *attr,
> 
>  static int __init pseries_dlpar_init(void)
>  {
> + arch_update_cpu_topology();
> +
>   pseries_hp_wq = alloc_workqueue("pseries hotplug workqueue",
>   WQ_UNBOUND, 1);
>   return sysfs_create_file(kernel_kobj, _attr_dlpar.attr);
> 



Re: [PATCH] powerpc/hotplug-mem: Fix aa_index match bug for hotplug

2017-05-30 Thread Nathan Fontenot
On 05/30/2017 06:41 AM, Michael Ellerman wrote:
> Michael Bringmann  writes:
> 
>> When adding or removing memory, the aa_index (affinity value) for the
>> memblock must also be converted to match the endianness of the rest
>> of the 'ibm,dynamic-memory' property.  Otherwise, subsequent retrieval
>> of the attribute will likely lead to non-existent nodes, followed by
>> using the default node in the code inappropriately.
>>
>> Signed-off-by: Michael Bringmann 
>>
>> diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c 
>> b/arch/powerpc/platforms/pseries/hotplug-memory.c
>> index e104c71..1fb162b 100644
>> --- a/arch/powerpc/platforms/pseries/hotplug-memory.c
>> +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
>> @@ -124,6 +124,7 @@ static struct property 
>> *dlpar_clone_drconf_property(struct device_node *dn)
>>  for (i = 0; i < num_lmbs; i++) {
>>  lmbs[i].base_addr = be64_to_cpu(lmbs[i].base_addr);
>>  lmbs[i].drc_index = be32_to_cpu(lmbs[i].drc_index);
>> +lmbs[i].aa_index = be32_to_cpu(lmbs[i].aa_index);
>>  lmbs[i].flags = be32_to_cpu(lmbs[i].flags);
>>  }
>>  
> 
> AFAICS this code was added in commit 5f97b2a0d176 ("powerpc/pseries:
> Implement memory hotplug add in the kernel").
> 
> So this should probably be marked:
> 
>   Fixes: 5f97b2a0d176 ("powerpc/pseries: Implement memory hotplug add in the 
> kernel")
> 
> And it seems like a bug we'd want fixed in stable, so:
> 
>   Cc: sta...@vger.kernel.org # v4.1+
> 
> 
> Am I right?

Yes, that is correct.

-Nathan



Re: [PATCH] powerpc/hotplug-mem: Fix aa_index match bug for hotplug

2017-05-30 Thread Nathan Fontenot
On 05/30/2017 06:41 AM, Michael Ellerman wrote:
> Michael Bringmann  writes:
> 
>> When adding or removing memory, the aa_index (affinity value) for the
>> memblock must also be converted to match the endianness of the rest
>> of the 'ibm,dynamic-memory' property.  Otherwise, subsequent retrieval
>> of the attribute will likely lead to non-existent nodes, followed by
>> using the default node in the code inappropriately.
>>
>> Signed-off-by: Michael Bringmann 
>>
>> diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c 
>> b/arch/powerpc/platforms/pseries/hotplug-memory.c
>> index e104c71..1fb162b 100644
>> --- a/arch/powerpc/platforms/pseries/hotplug-memory.c
>> +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
>> @@ -124,6 +124,7 @@ static struct property 
>> *dlpar_clone_drconf_property(struct device_node *dn)
>>  for (i = 0; i < num_lmbs; i++) {
>>  lmbs[i].base_addr = be64_to_cpu(lmbs[i].base_addr);
>>  lmbs[i].drc_index = be32_to_cpu(lmbs[i].drc_index);
>> +lmbs[i].aa_index = be32_to_cpu(lmbs[i].aa_index);
>>  lmbs[i].flags = be32_to_cpu(lmbs[i].flags);
>>  }
>>  
> 
> AFAICS this code was added in commit 5f97b2a0d176 ("powerpc/pseries:
> Implement memory hotplug add in the kernel").
> 
> So this should probably be marked:
> 
>   Fixes: 5f97b2a0d176 ("powerpc/pseries: Implement memory hotplug add in the 
> kernel")
> 
> And it seems like a bug we'd want fixed in stable, so:
> 
>   Cc: sta...@vger.kernel.org # v4.1+
> 
> 
> Am I right?

Yes, that is correct.

-Nathan



Re: [PATCH] powerpc/pseries: fix spelling mistake: "Attemping" -> "Attempting"

2016-10-25 Thread Nathan Fontenot
On 10/24/2016 05:02 PM, Colin King wrote:
> From: Colin Ian King <colin.k...@canonical.com>
> 
> trivial fix to spelling mistake in pr_debug message
> 
> Signed-off-by: Colin Ian King <colin.k...@canonical.com>

Reviewed-by: Nathan Fontenot <nf...@linux.vnet.ibm.com>

> ---
>  arch/powerpc/platforms/pseries/hotplug-cpu.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c 
> b/arch/powerpc/platforms/pseries/hotplug-cpu.c
> index a1b63e0..c8929cb 100644
> --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c
> +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c
> @@ -553,7 +553,7 @@ static ssize_t dlpar_cpu_remove(struct device_node *dn, 
> u32 drc_index)
>  {
>   int rc;
> 
> - pr_debug("Attemping to remove CPU %s, drc index: %x\n",
> + pr_debug("Attempting to remove CPU %s, drc index: %x\n",
>dn->name, drc_index);
> 
>   rc = dlpar_offline_cpu(dn);
> 



Re: [PATCH] powerpc/pseries: fix spelling mistake: "Attemping" -> "Attempting"

2016-10-25 Thread Nathan Fontenot
On 10/24/2016 05:02 PM, Colin King wrote:
> From: Colin Ian King 
> 
> trivial fix to spelling mistake in pr_debug message
> 
> Signed-off-by: Colin Ian King 

Reviewed-by: Nathan Fontenot 

> ---
>  arch/powerpc/platforms/pseries/hotplug-cpu.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c 
> b/arch/powerpc/platforms/pseries/hotplug-cpu.c
> index a1b63e0..c8929cb 100644
> --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c
> +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c
> @@ -553,7 +553,7 @@ static ssize_t dlpar_cpu_remove(struct device_node *dn, 
> u32 drc_index)
>  {
>   int rc;
> 
> - pr_debug("Attemping to remove CPU %s, drc index: %x\n",
> + pr_debug("Attempting to remove CPU %s, drc index: %x\n",
>dn->name, drc_index);
> 
>   rc = dlpar_offline_cpu(dn);
> 



Re: [PATCH 0/4] powerpc/mm: movable hotplug memory nodes

2016-08-10 Thread Nathan Fontenot
On 08/10/2016 05:30 AM, Michael Ellerman wrote:
> Reza Arbab  writes:
> 
>> These changes enable onlining memory into ZONE_MOVABLE on power, and the
>> creation of discrete nodes of movable memory.
>>
>> Node hotplug is not supported on power [1].
> 
> But maybe it should be?
> 
Yes, it should be supported.

I have briefly looked into this recently only to find
this will not be a simple update.

-Nathan 



Re: [PATCH 0/4] powerpc/mm: movable hotplug memory nodes

2016-08-10 Thread Nathan Fontenot
On 08/10/2016 05:30 AM, Michael Ellerman wrote:
> Reza Arbab  writes:
> 
>> These changes enable onlining memory into ZONE_MOVABLE on power, and the
>> creation of discrete nodes of movable memory.
>>
>> Node hotplug is not supported on power [1].
> 
> But maybe it should be?
> 
Yes, it should be supported.

I have briefly looked into this recently only to find
this will not be a simple update.

-Nathan 



Re: [Patch v2] rpaphp: fix slot registration for multiple slots under a PHB

2016-07-19 Thread Nathan Fontenot
On 07/11/2016 05:16 PM, Tyrel Datwyler wrote:
> PowerVM seems to only ever provide a single hotplug slot per PHB.
> The underlying slot hotplug registration code assumed multiple slots,
> but the actual implementation is broken for multiple slots. This went
> unnoticed for years due to the nature of PowerVM as mentioned
> previously. Under qemu/kvm the hotplug slot model aligns more with
> x86 where multiple slots are presented under a single PHB. As seen
> in the following each additional slot after the first fails to
> register due to each slot always being compared against the first
> child node of the PHB in the device tree.
> 
> [6.492291] rpaphp: RPA HOT Plug PCI Controller Driver version: 0.1
> [6.492569] rpaphp: Slot [Slot 0] registered
> [6.492577] rpaphp: pci_hp_register failed with error -16
> [6.493082] rpaphp: pci_hp_register failed with error -16
> [6.493138] rpaphp: pci_hp_register failed with error -16
> [6.493161] rpaphp: pci_hp_register failed with error -16
> 
> The registration logic is fixed so that each slot is compared
> against the existing child devices of the PHB in the device tree to
> determine present slots vs empty slots.
> 
> [   38.481750] rpaphp: RPA HOT Plug PCI Controller Driver version: 0.1
> [   38.482004] rpaphp: Slot [C0] registered
> [   38.482127] rpaphp: Slot [C1] registered
> [   38.482241] rpaphp: Slot [C2] registered
> [   38.482356] rpaphp: Slot [C3] registered
> [   38.482495] rpaphp: Slot [C4] registered
> 
> Signed-off-by: Tyrel Datwyler <tyr...@linux.vnet.ibm.com>

Reviewed-by: Nathan Fontenot <nf...@linux.vnet.ibm.com>

> ---
> 
> Changes in v2: corrected ibm,my-drc-index property name
> 
> ---
>  drivers/pci/hotplug/rpaphp_slot.c | 17 -
>  1 file changed, 12 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/pci/hotplug/rpaphp_slot.c 
> b/drivers/pci/hotplug/rpaphp_slot.c
> index 6937c72..388c4d8 100644
> --- a/drivers/pci/hotplug/rpaphp_slot.c
> +++ b/drivers/pci/hotplug/rpaphp_slot.c
> @@ -117,8 +117,10 @@ EXPORT_SYMBOL_GPL(rpaphp_deregister_slot);
>  int rpaphp_register_slot(struct slot *slot)
>  {
>   struct hotplug_slot *php_slot = slot->hotplug_slot;
> + struct device_node *child;
> + u32 my_index;
>   int retval;
> - int slotno;
> + int slotno = -1;
> 
>   dbg("%s registering slot:path[%s] index[%x], name[%s] pdomain[%x] 
> type[%d]\n",
>   __func__, slot->dn->full_name, slot->index, slot->name,
> @@ -130,10 +132,15 @@ int rpaphp_register_slot(struct slot *slot)
>   return -EAGAIN;
>   }
> 
> - if (slot->dn->child)
> - slotno = PCI_SLOT(PCI_DN(slot->dn->child)->devfn);
> - else
> - slotno = -1;
> + for_each_child_of_node(slot->dn, child) {
> + retval = of_property_read_u32(child, "ibm,my-drc-index", 
> &my_index);
> + if (my_index == slot->index) {
> + slotno = PCI_SLOT(PCI_DN(child)->devfn);
> + of_node_put(child);
> + break;
> + }
> + }
> +
>   retval = pci_hp_register(php_slot, slot->bus, slotno, slot->name);
>   if (retval) {
>   err("pci_hp_register failed with error %d\n", retval);
> 



Re: [Patch v2] rpaphp: fix slot registration for multiple slots under a PHB

2016-07-19 Thread Nathan Fontenot
On 07/11/2016 05:16 PM, Tyrel Datwyler wrote:
> PowerVM seems to only ever provide a single hotplug slot per PHB.
> The underlying slot hotplug registration code assumed multiple slots,
> but the actual implementation is broken for multiple slots. This went
> unnoticed for years due to the nature of PowerVM as mentioned
> previously. Under qemu/kvm the hotplug slot model aligns more with
> x86 where multiple slots are presented under a single PHB. As seen
> in the following each additional slot after the first fails to
> register due to each slot always being compared against the first
> child node of the PHB in the device tree.
> 
> [6.492291] rpaphp: RPA HOT Plug PCI Controller Driver version: 0.1
> [6.492569] rpaphp: Slot [Slot 0] registered
> [6.492577] rpaphp: pci_hp_register failed with error -16
> [6.493082] rpaphp: pci_hp_register failed with error -16
> [6.493138] rpaphp: pci_hp_register failed with error -16
> [6.493161] rpaphp: pci_hp_register failed with error -16
> 
> The registration logic is fixed so that each slot is compared
> against the existing child devices of the PHB in the device tree to
> determine present slots vs empty slots.
> 
> [   38.481750] rpaphp: RPA HOT Plug PCI Controller Driver version: 0.1
> [   38.482004] rpaphp: Slot [C0] registered
> [   38.482127] rpaphp: Slot [C1] registered
> [   38.482241] rpaphp: Slot [C2] registered
> [   38.482356] rpaphp: Slot [C3] registered
> [   38.482495] rpaphp: Slot [C4] registered
> 
> Signed-off-by: Tyrel Datwyler 

Reviewed-by: Nathan Fontenot 

> ---
> 
> Changes in v2: corrected ibm,my-drc-index property name
> 
> ---
>  drivers/pci/hotplug/rpaphp_slot.c | 17 -
>  1 file changed, 12 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/pci/hotplug/rpaphp_slot.c 
> b/drivers/pci/hotplug/rpaphp_slot.c
> index 6937c72..388c4d8 100644
> --- a/drivers/pci/hotplug/rpaphp_slot.c
> +++ b/drivers/pci/hotplug/rpaphp_slot.c
> @@ -117,8 +117,10 @@ EXPORT_SYMBOL_GPL(rpaphp_deregister_slot);
>  int rpaphp_register_slot(struct slot *slot)
>  {
>   struct hotplug_slot *php_slot = slot->hotplug_slot;
> + struct device_node *child;
> + u32 my_index;
>   int retval;
> - int slotno;
> + int slotno = -1;
> 
>   dbg("%s registering slot:path[%s] index[%x], name[%s] pdomain[%x] 
> type[%d]\n",
>   __func__, slot->dn->full_name, slot->index, slot->name,
> @@ -130,10 +132,15 @@ int rpaphp_register_slot(struct slot *slot)
>   return -EAGAIN;
>   }
> 
> - if (slot->dn->child)
> - slotno = PCI_SLOT(PCI_DN(slot->dn->child)->devfn);
> - else
> - slotno = -1;
> + for_each_child_of_node(slot->dn, child) {
> + retval = of_property_read_u32(child, "ibm,my-drc-index", 
> &my_index);
> + if (my_index == slot->index) {
> + slotno = PCI_SLOT(PCI_DN(child)->devfn);
> + of_node_put(child);
> + break;
> + }
> + }
> +
>   retval = pci_hp_register(php_slot, slot->bus, slotno, slot->name);
>   if (retval) {
>   err("pci_hp_register failed with error %d\n", retval);
> 



Re: [PATCH] rpaphp: fix slot registration for multiple slots under a PHB

2016-07-08 Thread Nathan Fontenot
On 07/08/2016 06:19 PM, Tyrel Datwyler wrote:
> PowerVM seems to only ever provide a single hotplug slot per PHB.
> The underlying slot hotplug registration code assumed multiple slots,
> but the actual implementation is broken for multiple slots. This went
> unnoticed for years due to the nature of PowerVM as mentioned
> previously. Under qemu/kvm the hotplug slot model aligns more with
> x86 where multiple slots are presented under a single PHB. As seen
> in the following each additional slot after the first fails to
> register due to each slot always being compared against the first
> child node of the PHB in the device tree.
> 
> [6.492291] rpaphp: RPA HOT Plug PCI Controller Driver version: 0.1
> [6.492569] rpaphp: Slot [Slot 0] registered
> [6.492577] rpaphp: pci_hp_register failed with error -16
> [6.493082] rpaphp: pci_hp_register failed with error -16
> [6.493138] rpaphp: pci_hp_register failed with error -16
> [6.493161] rpaphp: pci_hp_register failed with error -16
> 
> The registration logic is fixed so that each slot is compared
> against the existing child devices of the PHB in the device tree to
> determine present slots vs empty slots.
> 
> [   38.481750] rpaphp: RPA HOT Plug PCI Controller Driver version: 0.1
> [   38.482004] rpaphp: Slot [C0] registered
> [   38.482127] rpaphp: Slot [C1] registered
> [   38.482241] rpaphp: Slot [C2] registered
> [   38.482356] rpaphp: Slot [C3] registered
> [   38.482495] rpaphp: Slot [C4] registered
> 
> Signed-off-by: Tyrel Datwyler 
> ---
>  drivers/pci/hotplug/rpaphp_slot.c | 17 -
>  1 file changed, 12 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/pci/hotplug/rpaphp_slot.c 
> b/drivers/pci/hotplug/rpaphp_slot.c
> index 6937c72..c90fa8d 100644
> --- a/drivers/pci/hotplug/rpaphp_slot.c
> +++ b/drivers/pci/hotplug/rpaphp_slot.c
> @@ -117,8 +117,10 @@ EXPORT_SYMBOL_GPL(rpaphp_deregister_slot);
>  int rpaphp_register_slot(struct slot *slot)
>  {
>   struct hotplug_slot *php_slot = slot->hotplug_slot;
> + struct device_node *child;
> + u32 my_index;
>   int retval;
> - int slotno;
> + int slotno = -1;
> 
>   dbg("%s registering slot:path[%s] index[%x], name[%s] pdomain[%x] 
> type[%d]\n",
>   __func__, slot->dn->full_name, slot->index, slot->name,
> @@ -130,10 +132,15 @@ int rpaphp_register_slot(struct slot *slot)
>   return -EAGAIN;
>   }
> 
> - if (slot->dn->child)
> - slotno = PCI_SLOT(PCI_DN(slot->dn->child)->devfn);
> - else
> - slotno = -1;
> + for_each_child_of_node(slot->dn, child) {
> + retval = of_property_read_u32(child, "my,ibm-drc-index", 
> &my_index);

Shouldn't this be reading ibm,my-drc-index instead of my,ibm-drc-index?

-Nathan

> + if (my_index == slot->index) {
> + slotno = PCI_SLOT(PCI_DN(child)->devfn);
> + of_node_put(child);
> + break;
> + }
> + }
> +
>   retval = pci_hp_register(php_slot, slot->bus, slotno, slot->name);
>   if (retval) {
>   err("pci_hp_register failed with error %d\n", retval);
> 



Re: [PATCH] rpaphp: fix slot registration for multiple slots under a PHB

2016-07-08 Thread Nathan Fontenot
On 07/08/2016 06:19 PM, Tyrel Datwyler wrote:
> PowerVM seems to only ever provide a single hotplug slot per PHB.
> The underlying slot hotplug registration code assumed multiple slots,
> but the actual implementation is broken for multiple slots. This went
> unnoticed for years due to the nature of PowerVM as mentioned
> previously. Under qemu/kvm the hotplug slot model aligns more with
> x86 where multiple slots are presented under a single PHB. As seen
> in the following each additional slot after the first fails to
> register due to each slot always being compared against the first
> child node of the PHB in the device tree.
> 
> [6.492291] rpaphp: RPA HOT Plug PCI Controller Driver version: 0.1
> [6.492569] rpaphp: Slot [Slot 0] registered
> [6.492577] rpaphp: pci_hp_register failed with error -16
> [6.493082] rpaphp: pci_hp_register failed with error -16
> [6.493138] rpaphp: pci_hp_register failed with error -16
> [6.493161] rpaphp: pci_hp_register failed with error -16
> 
> The registration logic is fixed so that each slot is compared
> against the existing child devices of the PHB in the device tree to
> determine present slots vs empty slots.
> 
> [   38.481750] rpaphp: RPA HOT Plug PCI Controller Driver version: 0.1
> [   38.482004] rpaphp: Slot [C0] registered
> [   38.482127] rpaphp: Slot [C1] registered
> [   38.482241] rpaphp: Slot [C2] registered
> [   38.482356] rpaphp: Slot [C3] registered
> [   38.482495] rpaphp: Slot [C4] registered
> 
> Signed-off-by: Tyrel Datwyler 
> ---
>  drivers/pci/hotplug/rpaphp_slot.c | 17 -
>  1 file changed, 12 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/pci/hotplug/rpaphp_slot.c 
> b/drivers/pci/hotplug/rpaphp_slot.c
> index 6937c72..c90fa8d 100644
> --- a/drivers/pci/hotplug/rpaphp_slot.c
> +++ b/drivers/pci/hotplug/rpaphp_slot.c
> @@ -117,8 +117,10 @@ EXPORT_SYMBOL_GPL(rpaphp_deregister_slot);
>  int rpaphp_register_slot(struct slot *slot)
>  {
>   struct hotplug_slot *php_slot = slot->hotplug_slot;
> + struct device_node *child;
> + u32 my_index;
>   int retval;
> - int slotno;
> + int slotno = -1;
> 
>   dbg("%s registering slot:path[%s] index[%x], name[%s] pdomain[%x] 
> type[%d]\n",
>   __func__, slot->dn->full_name, slot->index, slot->name,
> @@ -130,10 +132,15 @@ int rpaphp_register_slot(struct slot *slot)
>   return -EAGAIN;
>   }
> 
> - if (slot->dn->child)
> - slotno = PCI_SLOT(PCI_DN(slot->dn->child)->devfn);
> - else
> - slotno = -1;
> + for_each_child_of_node(slot->dn, child) {
> + retval = of_property_read_u32(child, "my,ibm-drc-index", 
> &my_index);

Shouldn't this be reading ibm,my-drc-index instead of my,ibm-drc-index?

-Nathan

> + if (my_index == slot->index) {
> + slotno = PCI_SLOT(PCI_DN(child)->devfn);
> + of_node_put(child);
> + break;
> + }
> + }
> +
>   retval = pci_hp_register(php_slot, slot->bus, slotno, slot->name);
>   if (retval) {
>   err("pci_hp_register failed with error %d\n", retval);
> 



Re: [PATCH 2/3] powerpc: make kernel/nvram_64.c explicitly non-modular

2016-03-28 Thread Nathan Fontenot
On 03/27/2016 05:08 PM, Paul Gortmaker wrote:
> The Makefile/Kconfig currently controlling compilation of this code is:
> 
> obj-$(CONFIG_PPC64) += setup_64.o sys_ppc32.o \
>signal_64.o ptrace32.o \
>paca.o nvram_64.o firmware.o
> 
> arch/powerpc/platforms/Kconfig.cputype:config PPC64
> arch/powerpc/platforms/Kconfig.cputype: bool "64-bit kernel"
> 
> ...meaning that it currently is not being built as a module by anyone.
> 
> Let's remove the modular code that is essentially orphaned, so that
> when reading the driver there is no doubt it is builtin-only.
> 
> Since module_init translates to device_initcall in the non-modular
> case, the init ordering remains unchanged with this commit.
> 
> We don't replace module.h with init.h since the file already has that.
> 
> We delete the MODULE_LICENSE tag since that information is already
> contained at the top of the file in the comments.
> 
> Cc: Benjamin Herrenschmidt <b...@kernel.crashing.org>
> Cc: Paul Mackerras <pau...@samba.org>
> Cc: Michael Ellerman <m...@ellerman.id.au>
> Cc: Hari Bathini <hbath...@linux.vnet.ibm.com>
> Cc: Nathan Fontenot <nf...@linux.vnet.ibm.com>
> Cc: Andrzej Hajda <a.ha...@samsung.com>
> Cc: Anton Blanchard <an...@samba.org>
> Cc: linuxppc-...@lists.ozlabs.org
> Signed-off-by: Paul Gortmaker <paul.gortma...@windriver.com>
> ---

I think at some point in the past we thought this may be useful as a module
but I'm not sure it has ever been used that way.

Reviewed-by: Nathan Fontenot <nf...@linux.vnet.ibm.com>

>  arch/powerpc/kernel/nvram_64.c | 12 +---
>  1 file changed, 1 insertion(+), 11 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c
> index 0cab9e8c3794..856f9a7944cd 100644
> --- a/arch/powerpc/kernel/nvram_64.c
> +++ b/arch/powerpc/kernel/nvram_64.c
> @@ -15,8 +15,6 @@
>   *   parsing code.
>   */
> 
> -#include 
> -
>  #include 
>  #include 
>  #include 
> @@ -1231,12 +1229,4 @@ static int __init nvram_init(void)
>   
>   return rc;
>  }
> -
> -static void __exit nvram_cleanup(void)
> -{
> -misc_deregister( _dev );
> -}
> -
> -module_init(nvram_init);
> -module_exit(nvram_cleanup);
> -MODULE_LICENSE("GPL");
> +device_initcall(nvram_init);
> 



Re: [PATCH 2/3] powerpc: make kernel/nvram_64.c explicitly non-modular

2016-03-28 Thread Nathan Fontenot
On 03/27/2016 05:08 PM, Paul Gortmaker wrote:
> The Makefile/Kconfig currently controlling compilation of this code is:
> 
> obj-$(CONFIG_PPC64) += setup_64.o sys_ppc32.o \
>signal_64.o ptrace32.o \
>paca.o nvram_64.o firmware.o
> 
> arch/powerpc/platforms/Kconfig.cputype:config PPC64
> arch/powerpc/platforms/Kconfig.cputype: bool "64-bit kernel"
> 
> ...meaning that it currently is not being built as a module by anyone.
> 
> Let's remove the modular code that is essentially orphaned, so that
> when reading the driver there is no doubt it is builtin-only.
> 
> Since module_init translates to device_initcall in the non-modular
> case, the init ordering remains unchanged with this commit.
> 
> We don't replace module.h with init.h since the file already has that.
> 
> We delete the MODULE_LICENSE tag since that information is already
> contained at the top of the file in the comments.
> 
> Cc: Benjamin Herrenschmidt 
> Cc: Paul Mackerras 
> Cc: Michael Ellerman 
> Cc: Hari Bathini 
> Cc: Nathan Fontenot 
> Cc: Andrzej Hajda 
> Cc: Anton Blanchard 
> Cc: linuxppc-...@lists.ozlabs.org
> Signed-off-by: Paul Gortmaker 
> ---

I think at some point in the past we thought this may be useful as a module
but I'm not sure it has ever been used that way.

Reviewed-by: Nathan Fontenot 

>  arch/powerpc/kernel/nvram_64.c | 12 +---
>  1 file changed, 1 insertion(+), 11 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c
> index 0cab9e8c3794..856f9a7944cd 100644
> --- a/arch/powerpc/kernel/nvram_64.c
> +++ b/arch/powerpc/kernel/nvram_64.c
> @@ -15,8 +15,6 @@
>   *   parsing code.
>   */
> 
> -#include 
> -
>  #include 
>  #include 
>  #include 
> @@ -1231,12 +1229,4 @@ static int __init nvram_init(void)
>   
>   return rc;
>  }
> -
> -static void __exit nvram_cleanup(void)
> -{
> -misc_deregister( _dev );
> -}
> -
> -module_init(nvram_init);
> -module_exit(nvram_cleanup);
> -MODULE_LICENSE("GPL");
> +device_initcall(nvram_init);
> 



Re: [PATCH] powerpc/nvram: Fix a memory leak in err path

2015-12-09 Thread Nathan Fontenot
On 12/09/2015 04:00 AM, xinhui wrote:
> 
> If kmemdup fails, we need to kfree *buff* first and then return -ENOMEM.
> Otherwise there is a memory leak.
> 
> Signed-off-by: Pan Xinhui 

Reviewed-by: Nathan Fontenot 

> ---
>  arch/powerpc/kernel/nvram_64.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c
> index 32e2652..21a278b7 100644
> --- a/arch/powerpc/kernel/nvram_64.c
> +++ b/arch/powerpc/kernel/nvram_64.c
> @@ -542,9 +542,9 @@ static ssize_t nvram_pstore_read(u64 *id, enum 
> pstore_type_id *type,
>   time->tv_nsec = 0;
>   }
>   *buf = kmemdup(buff + hdr_size, length, GFP_KERNEL);
> + kfree(buff);
>   if (*buf == NULL)
>   return -ENOMEM;
> - kfree(buff);
>  
>   if (err_type == ERR_TYPE_KERNEL_PANIC_GZ)
>   *compressed = true;
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] powerpc/nvram: Fix a memory leak in err path

2015-12-09 Thread Nathan Fontenot
On 12/09/2015 04:00 AM, xinhui wrote:
> 
> If kmemdup fails, we need to kfree *buff* first and then return -ENOMEM.
> Otherwise there is a memory leak.
> 
> Signed-off-by: Pan Xinhui <xinhui@linux.vnet.ibm.com>

Reviewed-by: Nathan Fontenot <nf...@linux.vnet.ibm.com>

> ---
>  arch/powerpc/kernel/nvram_64.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c
> index 32e2652..21a278b7 100644
> --- a/arch/powerpc/kernel/nvram_64.c
> +++ b/arch/powerpc/kernel/nvram_64.c
> @@ -542,9 +542,9 @@ static ssize_t nvram_pstore_read(u64 *id, enum 
> pstore_type_id *type,
>   time->tv_nsec = 0;
>   }
>   *buf = kmemdup(buff + hdr_size, length, GFP_KERNEL);
> + kfree(buff);
>   if (*buf == NULL)
>   return -ENOMEM;
> - kfree(buff);
>  
>   if (err_type == ERR_TYPE_KERNEL_PANIC_GZ)
>   *compressed = true;
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v2] memory-hotplug: Fix kernel warning during memory hotplug on ppc64

2015-11-09 Thread Nathan Fontenot
On 11/03/2015 11:21 AM, John Allen wrote:
> This patch fixes a bug where a kernel warning is triggered when performing
> a memory hotplug on ppc64. This warning may also occur on any architecture
> that has multiple sections per memory block.
> 
> [   78.300767] [ cut here ]
> [   78.300768] WARNING: at ../drivers/base/memory.c:210
> [   78.300769] Modules linked in: rpadlpar_io(X) rpaphp(X) tcp_diag udp_diag 
> inet_diag unix_diag af_packet_diag netlink_diag af_packet xfs libcrc32c 
> ibmveth(X) rtc_generic btrfs xor raid6_pq xts gf128mul dm_crypt sd_mod sr_mod 
> cdrom crc_t10dif ibmvscsi(X) scsi_transport_srp scsi_tgt dm_mod sg scsi_mod 
> autofs4
> [   78.300789] Supported: Yes, External
> [   78.300791] CPU: 1 PID: 3090 Comm: systemd-udevd Tainted: G  X 
> 3.12.45-1-default #1
> [   78.300793] task: c004d7d1d970 ti: c004d7b9 task.ti: 
> c004d7b9
> [   78.300794] NIP: c04fcff8 LR: c04fda84 CTR: 
> 
> [   78.300795] REGS: c004d7b93930 TRAP: 0700   Tainted: G  X  
> (3.12.45-1-default)
> [   78.300796] MSR: 80029033   CR: 24088848  
> XER: 
> [   78.300800] CFAR: c04fcf98 SOFTE: 1
> GPR00: 0537 c004d7b93bb0 c0e7f200 00053000
> GPR04: 1000 0001 c0e0f200 
> GPR08:  0001 0537 014dc000
> GPR12: 00054000 ce7f0900 10041040 
> GPR16: 0100206f0010 1003ff78 1006c824 100410b0
> GPR20: 1003ff90 1006c00c 01002073cd20 0100206f0760
> GPR24: 0100206f85a0 c076d950 c004ef7c95e0 c004d7b93e00
> GPR28: c004de601738 0001 c1218f80 003f
> [   78.300818] NIP [c04fcff8] memory_block_action+0x258/0x2e0
> [   78.300820] LR [c04fda84] memory_subsys_online+0x54/0x100
> [   78.300821] Call Trace:
> [   78.300822] [c004d7b93bb0] [c9071ce0] 0xc9071ce0 
> (unreliable)
> [   78.300824] [c004d7b93c40] [c04fda84] 
> memory_subsys_online+0x54/0x100
> [   78.300826] [c004d7b93c70] [c04df784] device_online+0xb4/0x120
> [   78.300828] [c004d7b93cb0] [c04fd738] 
> store_mem_state+0x88/0x220
> [   78.300830] [c004d7b93cf0] [c04db448] dev_attr_store+0x68/0xa0
> [   78.300833] [c004d7b93d30] [c031f938] 
> sysfs_write_file+0xf8/0x1d0
> [   78.300835] [c004d7b93d90] [c027d29c] vfs_write+0xec/0x250
> [   78.300837] [c004d7b93de0] [c027dfdc] SyS_write+0x6c/0xf0
> [   78.300839] [c004d7b93e30] [c000a17c] syscall_exit+0x0/0x7c
> [   78.300840] Instruction dump:
> [   78.300841] 780a0560 79482ea4 7ce94214 2fa7 41de0014 7d09402a 396b4000 
> 7907ffe3
> [   78.300844] 4082ff54 3cc2fff9 8926b83a 69290001 <0b09> 2fa9 
> 40de006c 3860fff0
> [   78.300847] ---[ end trace dfec8da06ebbc762 ]---
> 
> The warning is triggered because there is a udev rule that automatically
> tries to online memory after it has been added. The udev rule varies from
> distro to distro, but will generally look something like:
> 
> SUBSYSTEM=="memory", ACTION=="add", ATTR{state}=="offline", 
> ATTR{state}="online"
> 
> On any architecture that uses memory_probe_store to reserve memory,
> this can interrupt the memory reservation process. This patch modifies
> memory_probe_store to take the hotplug sysfs lock to prevent the online
> of added memory before the completion of the probe.
> 
> Signed-off-by: John Allen 

Reviewed-by: Nathan Fontenot 

> ---
> v2: Move call to unlock_device_hotplug under "out" label
> 
> diff --git a/drivers/base/memory.c b/drivers/base/memory.c
> index bece691..7c50415 100644
> --- a/drivers/base/memory.c
> +++ b/drivers/base/memory.c
> @@ -422,6 +422,10 @@ memory_probe_store(struct device *dev, struct 
> device_attribute *attr,
>   if (phys_addr & ((pages_per_block << PAGE_SHIFT) - 1))
>   return -EINVAL;
> 
> + ret = lock_device_hotplug_sysfs();
> + if (ret)
> + return ret;
> +
>   for (i = 0; i < sections_per_block; i++) {
>   nid = memory_add_physaddr_to_nid(phys_addr);
>   ret = add_memory(nid, phys_addr,
> @@ -434,6 +438,7 @@ memory_probe_store(struct device *dev, struct 
> device_attribute *attr,
> 
>   ret = count;
>  out:
> + unlock_device_hotplug();
>   return ret;
>  }
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v2] memory-hotplug: Fix kernel warning during memory hotplug on ppc64

2015-11-09 Thread Nathan Fontenot
On 11/03/2015 11:21 AM, John Allen wrote:
> This patch fixes a bug where a kernel warning is triggered when performing
> a memory hotplug on ppc64. This warning may also occur on any architecture
> that has multiple sections per memory block.
> 
> [   78.300767] [ cut here ]
> [   78.300768] WARNING: at ../drivers/base/memory.c:210
> [   78.300769] Modules linked in: rpadlpar_io(X) rpaphp(X) tcp_diag udp_diag 
> inet_diag unix_diag af_packet_diag netlink_diag af_packet xfs libcrc32c 
> ibmveth(X) rtc_generic btrfs xor raid6_pq xts gf128mul dm_crypt sd_mod sr_mod 
> cdrom crc_t10dif ibmvscsi(X) scsi_transport_srp scsi_tgt dm_mod sg scsi_mod 
> autofs4
> [   78.300789] Supported: Yes, External
> [   78.300791] CPU: 1 PID: 3090 Comm: systemd-udevd Tainted: G  X 
> 3.12.45-1-default #1
> [   78.300793] task: c004d7d1d970 ti: c004d7b9 task.ti: 
> c004d7b9
> [   78.300794] NIP: c04fcff8 LR: c04fda84 CTR: 
> 
> [   78.300795] REGS: c004d7b93930 TRAP: 0700   Tainted: G  X  
> (3.12.45-1-default)
> [   78.300796] MSR: 80029033 <SF,EE,ME,IR,DR,RI,LE>  CR: 24088848  
> XER: 
> [   78.300800] CFAR: c04fcf98 SOFTE: 1
> GPR00: 0537 c004d7b93bb0 c0e7f200 00053000
> GPR04: 1000 0001 c0e0f200 
> GPR08:  0001 0537 014dc000
> GPR12: 00054000 ce7f0900 10041040 
> GPR16: 0100206f0010 1003ff78 1006c824 100410b0
> GPR20: 1003ff90 1006c00c 01002073cd20 0100206f0760
> GPR24: 0100206f85a0 c076d950 c004ef7c95e0 c004d7b93e00
> GPR28: c004de601738 0001 c1218f80 003f
> [   78.300818] NIP [c04fcff8] memory_block_action+0x258/0x2e0
> [   78.300820] LR [c04fda84] memory_subsys_online+0x54/0x100
> [   78.300821] Call Trace:
> [   78.300822] [c004d7b93bb0] [c9071ce0] 0xc9071ce0 
> (unreliable)
> [   78.300824] [c004d7b93c40] [c04fda84] 
> memory_subsys_online+0x54/0x100
> [   78.300826] [c004d7b93c70] [c04df784] device_online+0xb4/0x120
> [   78.300828] [c004d7b93cb0] [c04fd738] 
> store_mem_state+0x88/0x220
> [   78.300830] [c004d7b93cf0] [c04db448] dev_attr_store+0x68/0xa0
> [   78.300833] [c004d7b93d30] [c031f938] 
> sysfs_write_file+0xf8/0x1d0
> [   78.300835] [c004d7b93d90] [c027d29c] vfs_write+0xec/0x250
> [   78.300837] [c004d7b93de0] [c027dfdc] SyS_write+0x6c/0xf0
> [   78.300839] [c004d7b93e30] [c000a17c] syscall_exit+0x0/0x7c
> [   78.300840] Instruction dump:
> [   78.300841] 780a0560 79482ea4 7ce94214 2fa7 41de0014 7d09402a 396b4000 
> 7907ffe3
> [   78.300844] 4082ff54 3cc2fff9 8926b83a 69290001 <0b09> 2fa9 
> 40de006c 3860fff0
> [   78.300847] ---[ end trace dfec8da06ebbc762 ]---
> 
> The warning is triggered because there is a udev rule that automatically
> tries to online memory after it has been added. The udev rule varies from
> distro to distro, but will generally look something like:
> 
> SUBSYSTEM=="memory", ACTION=="add", ATTR{state}=="offline", 
> ATTR{state}="online"
> 
> On any architecture that uses memory_probe_store to reserve memory,
> this can interrupt the memory reservation process. This patch modifies
> memory_probe_store to take the hotplug sysfs lock to prevent the online
> of added memory before the completion of the probe.
> 
> Signed-off-by: John Allen <jal...@linux.vnet.ibm.com>

Reviewed-by: Nathan Fontenot <nf...@linux.vnet.ibm.com>

> ---
> v2: Move call to unlock_device_hotplug under "out" label
> 
> diff --git a/drivers/base/memory.c b/drivers/base/memory.c
> index bece691..7c50415 100644
> --- a/drivers/base/memory.c
> +++ b/drivers/base/memory.c
> @@ -422,6 +422,10 @@ memory_probe_store(struct device *dev, struct 
> device_attribute *attr,
>   if (phys_addr & ((pages_per_block << PAGE_SHIFT) - 1))
>   return -EINVAL;
> 
> + ret = lock_device_hotplug_sysfs();
> + if (ret)
> + return ret;
> +
>   for (i = 0; i < sections_per_block; i++) {
>   nid = memory_add_physaddr_to_nid(phys_addr);
>   ret = add_memory(nid, phys_addr,
> @@ -434,6 +438,7 @@ memory_probe_store(struct device *dev, struct 
> device_attribute *attr,
> 
>   ret = count;
>  out:
> + unlock_device_hotplug();
>   return ret;
>  }
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC PATCH] driver: base: memory: Maintain correct mem->end_section_nr when memory block is partially filled

2015-08-14 Thread Nathan Fontenot
On 08/13/2015 04:17 AM, Bharata B Rao wrote:
> Last section of memory block is always initialized to
> 
> mem->start_section_nr + sections_per_block - 1
> 
> which will not be true for a section that doesn't contain sections_per_block
> sections due to the memory size specified. This causes the following
> kernel crash when memory blocks under a node are registered during reboot
> that follows a memory hotplug operation on pseries guest.
> 
> Unable to handle kernel paging request for data at address 0xf03f0020
> Faulting instruction address: 0xc07657cc
> Oops: Kernel access of bad area, sig: 11 [#1]
> SMP NR_CPUS=1024 NUMA pSeries
> 
> Modules linked in:
> 
> CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.2.0-rc6+ #48
> task: c000ba3c ti: c0013c58 task.ti: c0013c58
> NIP: c07657cc LR: c0592dbc CTR: 0400
> REGS: c0013c5836f0 TRAP: 0300   Not tainted  (4.2.0-rc6+)
> MSR: 80009032  MSR: 80009032 
> <>  CR: 4848  XER: 
>   CR: 4848  XER: 
> CFAR: 3fff990f50ec CFAR: 3fff990f50ec DAR: f03f0020 DSISR: 
> 4000 DAR: f03f0020 DSISR: 4000 SOFTE: 1 SOFTE: 1
> GPR00: c0592dbc c0592dbc c0013c583970 c0013c583970 
> c14f0300 c14f0300 003f 003f
> GPR04:   c000f43b2900 c000f43b2900 
> c000ba324668 c000ba324668 0001 0001
> GPR08: c1540300 c1540300 f000 f000 
> f03f f03f 0001 0001
> GPR12: 2484 2484 cff2 cff2 
> c000b5b0 c000b5b0  
> GPR16:     
>    
> GPR20:     
>    
> GPR24: c188c380 c188c380   
> 00014000 00014000 c18b54e8 c18b54e8
> GPR28: c0013c06e800 c0013c06e800   
>   fc00 fc00
> 
> NIP [c07657cc] .get_nid_for_pfn+0x2c/0x60
> LR [c0592dbc] .register_mem_sect_under_node+0x8c/0x150
> Call Trace:
> [c0013c583970] [c056e44c] .put_device+0x2c/0x50
> [c0013c5839f0] [c0592dbc] .register_mem_sect_under_node+0x8c/0x150
> [c0013c583a80] [c05932b4] .register_one_node+0x2c4/0x380
> [c0013c583b30] [c0c882b8] .topology_init+0x44/0x1e0
> [c0013c583bf0] [c000ad30] .do_one_initcall+0x110/0x270
> [c0013c583ce0] [c0c845d4] .kernel_init_freeable+0x278/0x360
> [c0013c583db0] [c000b5d4] .kernel_init+0x24/0x130
> [c0013c583e30] [c00094e8] .ret_from_kernel_thread+0x58/0x70
> 
> Fix this by updating the memory block to always contain the right
> number of sections instead of assuming sections_per_block.
> 
> Signed-off-by: Bharata B Rao 
> Cc: Nathan Fontenot 
> ---
>  drivers/base/memory.c | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/drivers/base/memory.c b/drivers/base/memory.c
> index 2804aed..7f3ce2e 100644
> --- a/drivers/base/memory.c
> +++ b/drivers/base/memory.c
> @@ -645,6 +645,7 @@ static int add_memory_block(int base_section_nr)
>   if (ret)
>   return ret;
>   mem->section_count = section_count;
> +mem->end_section_nr = mem->start_section_nr + section_count -1;

I think this change may be correct but makes me wonder if we need to update
code elsewhere. There are places (at least in drivers/base/memory.c) that assume
a memory block contains sections_per_block sections.

Also, I think you may need to cc GregKH for this patch.

-Nathan
 
>   return 0;
>  }
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC PATCH] driver: base: memory: Maintain correct mem->end_section_nr when memory block is partially filled

2015-08-14 Thread Nathan Fontenot
On 08/13/2015 04:17 AM, Bharata B Rao wrote:
 Last section of memory block is always initialized to
 
 mem->start_section_nr + sections_per_block - 1
 
 which will not be true for a section that doesn't contain sections_per_block
 sections due to the memory size specified. This causes the following
 kernel crash when memory blocks under a node are registered during reboot
 that follows a memory hotplug operation on pseries guest.
 
 Unable to handle kernel paging request for data at address 0xf03f0020
 Faulting instruction address: 0xc07657cc
 Oops: Kernel access of bad area, sig: 11 [#1]
 SMP NR_CPUS=1024 NUMA pSeries
 
 Modules linked in:
 
 CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.2.0-rc6+ #48
 task: c000ba3c ti: c0013c58 task.ti: c0013c58
 NIP: c07657cc LR: c0592dbc CTR: 0400
 REGS: c0013c5836f0 TRAP: 0300   Not tainted  (4.2.0-rc6+)
 MSR: 80009032  MSR: 80009032 
 SFSF,EE,EE,ME,ME,IR,IR,DR,DR,RI,RI  CR: 4848  XER: 
   CR: 4848  XER: 
 CFAR: 3fff990f50ec CFAR: 3fff990f50ec DAR: f03f0020 DSISR: 
 4000 DAR: f03f0020 DSISR: 4000 SOFTE: 1 SOFTE: 1
 GPR00: c0592dbc c0592dbc c0013c583970 c0013c583970 
 c14f0300 c14f0300 003f 003f
 GPR04:   c000f43b2900 c000f43b2900 
 c000ba324668 c000ba324668 0001 0001
 GPR08: c1540300 c1540300 f000 f000 
 f03f f03f 0001 0001
 GPR12: 2484 2484 cff2 cff2 
 c000b5b0 c000b5b0  
 GPR16:     
    
 GPR20:     
    
 GPR24: c188c380 c188c380   
 00014000 00014000 c18b54e8 c18b54e8
 GPR28: c0013c06e800 c0013c06e800   
   fc00 fc00
 
 NIP [c07657cc] .get_nid_for_pfn+0x2c/0x60
 LR [c0592dbc] .register_mem_sect_under_node+0x8c/0x150
 Call Trace:
 [c0013c583970] [c056e44c] .put_device+0x2c/0x50
 [c0013c5839f0] [c0592dbc] .register_mem_sect_under_node+0x8c/0x150
 [c0013c583a80] [c05932b4] .register_one_node+0x2c4/0x380
 [c0013c583b30] [c0c882b8] .topology_init+0x44/0x1e0
 [c0013c583bf0] [c000ad30] .do_one_initcall+0x110/0x270
 [c0013c583ce0] [c0c845d4] .kernel_init_freeable+0x278/0x360
 [c0013c583db0] [c000b5d4] .kernel_init+0x24/0x130
 [c0013c583e30] [c00094e8] .ret_from_kernel_thread+0x58/0x70
 
 Fix this by updating the memory block to always contain the right
 number of sections instead of assuming sections_per_block.
 
 Signed-off-by: Bharata B Rao bhar...@linux.vnet.ibm.com
 Cc: Nathan Fontenot nf...@linux.vnet.ibm.com
 ---
  drivers/base/memory.c | 1 +
  1 file changed, 1 insertion(+)
 
 diff --git a/drivers/base/memory.c b/drivers/base/memory.c
 index 2804aed..7f3ce2e 100644
 --- a/drivers/base/memory.c
 +++ b/drivers/base/memory.c
 @@ -645,6 +645,7 @@ static int add_memory_block(int base_section_nr)
   if (ret)
   return ret;
   mem-section_count = section_count;
 +mem-end_section_nr = mem-start_section_nr + section_count -1;

I think this change may be correct but makes me wonder if we need to update
code elsewhere. There are places (at least in drivers/base/memory.c) that assume
a memory block contains sections_per_block sections.

Also, I think you may need to cc GregKH for this patch.

-Nathan
 
   return 0;
  }
 

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 06/31] arch/powerpc/pseries: use kmemdup rather than duplicating its implementation

2015-08-07 Thread Nathan Fontenot
On 08/07/2015 02:59 AM, Andrzej Hajda wrote:
> The patch was generated using fixed coccinelle semantic patch
> scripts/coccinelle/api/memdup.cocci [1].
> 
> [1]: http://permalink.gmane.org/gmane.linux.kernel/2014320
> 
> Signed-off-by: Andrzej Hajda 

Reviewed-by: Nathan Fontenot 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 05/31] arch/powerpc/nvram: use kmemdup rather than duplicating its implementation

2015-08-07 Thread Nathan Fontenot
On 08/07/2015 02:59 AM, Andrzej Hajda wrote:
> The patch was generated using fixed coccinelle semantic patch
> scripts/coccinelle/api/memdup.cocci [1].
> 
> [1]: http://permalink.gmane.org/gmane.linux.kernel/2014320
> 
> Signed-off-by: Andrzej Hajda 

Reviewed-by: Nathan Fontenot 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 05/31] arch/powerpc/nvram: use kmemdup rather than duplicating its implementation

2015-08-07 Thread Nathan Fontenot
On 08/07/2015 02:59 AM, Andrzej Hajda wrote:
 The patch was generated using fixed coccinelle semantic patch
 scripts/coccinelle/api/memdup.cocci [1].
 
 [1]: http://permalink.gmane.org/gmane.linux.kernel/2014320
 
 Signed-off-by: Andrzej Hajda a.ha...@samsung.com

Reviewed-by: Nathan Fontenot nf...@linux.vnet.ibm.com

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 06/31] arch/powerpc/pseries: use kmemdup rather than duplicating its implementation

2015-08-07 Thread Nathan Fontenot
On 08/07/2015 02:59 AM, Andrzej Hajda wrote:
 The patch was generated using fixed coccinelle semantic patch
 scripts/coccinelle/api/memdup.cocci [1].
 
 [1]: http://permalink.gmane.org/gmane.linux.kernel/2014320
 
 Signed-off-by: Andrzej Hajda a.ha...@samsung.com

Reviewed-by: Nathan Fontenot nf...@linux.vnet.ibm.com

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: powerpc,numa: Memory hotplug to memory-less nodes ?

2015-06-24 Thread Nathan Fontenot
On 06/23/2015 11:01 PM, Bharata B Rao wrote:
> So will it be correct to say that memory hotplug to memory-less node
> isn't supported by PowerPC kernel ? Should I enforce the same in QEMU
> for PowerKVM ?
>

I'm not sure if that is correct. It appears that we initialize all online
nodes, even those without spanned_pages, at boot time. This occurs
in setup_node_data() called from initmem_init().

Looking at this I would think that we could add memory to any online node
even if it does not have any spanned_pages. I think an interesting test
we be to check for the node being online instead of checking to see if
it has any memory.

-Nathan

> On Mon, Jun 22, 2015 at 10:18 AM, Bharata B Rao  wrote:
>> Hi,
>>
>> While developing memory hotplug support in QEMU for PowerKVM, I
>> realized that guest kernel has specific checks to prevent hot addition
>> of memory to a memory-less node.
>>
>> I am referring to arch/powerpc/mm/numa.c:hot_add_scn_to_nid() which
>> has explicit checks to ensure that it returns a nid that has some
>> memory (NODE_DATA(nid)->node_spanned_pages) even when user wants to
>> hotplug to a node that currently has zero memory.
>>
>> Is this limitation by design ?
>>
>> Regards,
>> Bharata.
>> --
>> http://raobharata.wordpress.com/
> 
> 
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: powerpc,numa: Memory hotplug to memory-less nodes ?

2015-06-24 Thread Nathan Fontenot
On 06/23/2015 11:01 PM, Bharata B Rao wrote:
 So will it be correct to say that memory hotplug to memory-less node
 isn't supported by PowerPC kernel ? Should I enforce the same in QEMU
 for PowerKVM ?


I'm not sure if that is correct. It appears that we initialize all online
nodes, even those without spanned_pages, at boot time. This occurs
in setup_node_data() called from initmem_init().

Looking at this I would think that we could add memory to any online node
even if it does not have any spanned_pages. I think an interesting test
would be to check for the node being online instead of checking to see if
it has any memory.

-Nathan

 On Mon, Jun 22, 2015 at 10:18 AM, Bharata B Rao bharata@gmail.com wrote:
 Hi,

 While developing memory hotplug support in QEMU for PowerKVM, I
 realized that guest kernel has specific checks to prevent hot addition
 of memory to a memory-less node.

 I am referring to arch/powerpc/mm/numa.c:hot_add_scn_to_nid() which
 has explicit checks to ensure that it returns a nid that has some
 memory (NODE_DATA(nid)->node_spanned_pages) even when user wants to
 hotplug to a node that currently has zero memory.

 Is this limitation by design ?

 Regards,
 Bharata.
 --
 http://raobharata.wordpress.com/
 
 
 

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: of/dynamic: Fix test for PPC_PSERIES

2015-06-04 Thread Nathan Fontenot
On 06/04/2015 05:57 AM, Michael Ellerman wrote:
> On Thu, 2015-04-06 at 09:34:41 UTC, Geert Uytterhoeven wrote:
>> "IS_ENABLED(PPC_PSERIES)" always evaluates to false, as IS_ENABLED() is
>> supposed to be used with the full Kconfig symbol name, including the
>> "CONFIG_" prefix.
>>
>> Add the missing "CONFIG_" prefix to fix this.
>>
>> Fixes: a25095d451ece23b ("of: Move dynamic node fixups out of powerpc and 
>> into common code")
>> Signed-off-by: Geert Uytterhoeven 
>> ---
> 
>> Did this bug cause any breakage?
>> If yes, the fix should go to stable (for v3.17 and later).
> 
> Yikes. Not that I've heard of. But it's reasonably new so possibly it's not 
> hit
> distros that folks tend to run on those machines.

I think we do have some distros that have picked this up.

> 
> I'm also not clear how it would break, it could be subtle and we've not 
> noticed.
> 

The only place I can find that this might cause an issue is during device tree
updating that pseries does after a live migration or suspend/resume. When
removing or updating a device tree node we look up the node by ibm,phandle and
without this patch we wouldn't find these nodes.

I have not seen any issues because of this but I think pushing this to stable
would be good.

-Nathan

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: of/dynamic: Fix test for PPC_PSERIES

2015-06-04 Thread Nathan Fontenot
On 06/04/2015 05:57 AM, Michael Ellerman wrote:
 On Thu, 2015-04-06 at 09:34:41 UTC, Geert Uytterhoeven wrote:
 "IS_ENABLED(PPC_PSERIES)" always evaluates to false, as IS_ENABLED() is
 supposed to be used with the full Kconfig symbol name, including the
 "CONFIG_" prefix.

 Add the missing "CONFIG_" prefix to fix this.

 Fixes: a25095d451ece23b (of: Move dynamic node fixups out of powerpc and 
 into common code)
 Signed-off-by: Geert Uytterhoeven geert+rene...@glider.be
 ---
 
 Did this bug cause any breakage?
 If yes, the fix should go to stable (for v3.17 and later).
 
 Yikes. Not that I've heard of. But it's reasonably new so possibly it's not 
 hit
 distros that folks tend to run on those machines.

I think we do have some distros that have picked this up.

 
 I'm also not clear how it would break, it could be subtle and we've not 
 noticed.
 

The only place I can find that this might cause an issue is during device tree
updating that pseries does after a live migration or suspend/resume. When
removing or updating a device tree node we look up the node by ibm,phandle and
without this patch we wouldn't find these nodes.

I have not seen any issues because of this but I think pushing this to stable
would be good.

-Nathan

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v2 07/14] of/reconfig: Always use the same structure for notifiers

2014-11-25 Thread Nathan Fontenot
On 11/25/2014 05:07 PM, Benjamin Herrenschmidt wrote:
> On Mon, 2014-11-24 at 22:33 +, Grant Likely wrote:
>> The OF_RECONFIG notifier callback uses a different structure depending
>> on whether it is a node change or a property change. This is silly, and
>> not very safe. Rework the code to use the same data structure regardless
>> of the type of notifier.
> 
> I feel pretty good about this one except...
> 
>> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
>> index b9d1dfdbe5bb..9fe6002c1d5a 100644
>> --- a/arch/powerpc/mm/numa.c
>> +++ b/arch/powerpc/mm/numa.c
>> @@ -1711,12 +1711,11 @@ static void stage_topology_update(int core_id)
>>  static int dt_update_callback(struct notifier_block *nb,
>>  unsigned long action, void *data)
>>  {
>> -struct of_prop_reconfig *update;
>> +struct of_reconfig_data *update = data;
>>  int rc = NOTIFY_DONE;
>>  
>>  switch (action) {
>>  case OF_RECONFIG_UPDATE_PROPERTY:
>> -update = (struct of_prop_reconfig *)data;
> 
> Should we assert/bug on !update->dn / update->prop ?
> 
> (Same for the rest of the patch)
> 
> Or do you reckon it's pointless ?
> 

I'm not sure it's worth it, if those are NULL pointers the drivers/of
code would have tried to use them before invoking the notifier chain.
We won't make it this far if they're NULL.

Otherwise the patch looks good to me,

Reviewed-by: Nathan Fontenot 

-Nathan

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v2 07/14] of/reconfig: Always use the same structure for notifiers

2014-11-25 Thread Nathan Fontenot
On 11/25/2014 05:07 PM, Benjamin Herrenschmidt wrote:
 On Mon, 2014-11-24 at 22:33 +, Grant Likely wrote:
 The OF_RECONFIG notifier callback uses a different structure depending
 on whether it is a node change or a property change. This is silly, and
 not very safe. Rework the code to use the same data structure regardless
 of the type of notifier.
 
 I feel pretty good about this one except...
 
 diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
 index b9d1dfdbe5bb..9fe6002c1d5a 100644
 --- a/arch/powerpc/mm/numa.c
 +++ b/arch/powerpc/mm/numa.c
 @@ -1711,12 +1711,11 @@ static void stage_topology_update(int core_id)
  static int dt_update_callback(struct notifier_block *nb,
  unsigned long action, void *data)
  {
 -struct of_prop_reconfig *update;
 +struct of_reconfig_data *update = data;
  int rc = NOTIFY_DONE;
  
  switch (action) {
  case OF_RECONFIG_UPDATE_PROPERTY:
 -update = (struct of_prop_reconfig *)data;
 
 Should we assert/bug on !update->dn / update->prop ?
 
 (Same for the rest of the patch)
 
 Or do you reckon it's pointless ?
 

I'm not sure it's worth it, if those are NULL pointers the drivers/of
code would have tried to use them before invoking the notifier chain.
We won't make it this far if they're NULL.

Otherwise the patch looks good to me,

Reviewed-by: Nathan Fontenot nf...@linux.vnet.ibm.com

-Nathan

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] powerpc: use device_online/offline() instead of cpu_up/down()

2014-11-03 Thread Nathan Fontenot
On 10/31/2014 02:41 PM, Dan Streetman wrote:
> In powerpc pseries platform dlpar operations, use device_online() and
> device_offline() instead of cpu_up() and cpu_down().
> 
> Calling cpu_up/down directly does not update the cpu device offline
> field, which is used to online/offline a cpu from sysfs.  Calling
> device_online/offline instead keeps the sysfs cpu online value correct.
> The hotplug lock, which is required to be held when calling
> device_online/offline, is already held when dlpar_online/offline_cpu
> are called, since they are called only from cpu_probe|release_store.
> 
> This patch fixes errors on PowerVM systems that have cpu(s) added/removed
> using dlpar operations; without this patch, the
> /sys/devices/system/cpu/cpuN/online nodes do not correctly show the
> online state of added/removed cpus.
> 
> Signed-off-by: Dan Streetman 
> Cc: Nathan Fontenot 

Acked-by: Nathan Fontenot 

> ---
> 
> Previous discussion for this:
> https://lkml.org/lkml/2014/10/29/839
> 
>  arch/powerpc/platforms/pseries/dlpar.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/pseries/dlpar.c 
> b/arch/powerpc/platforms/pseries/dlpar.c
> index 6ad83bd..c22bb1b 100644
> --- a/arch/powerpc/platforms/pseries/dlpar.c
> +++ b/arch/powerpc/platforms/pseries/dlpar.c
> @@ -382,7 +382,7 @@ static int dlpar_online_cpu(struct device_node *dn)
>   BUG_ON(get_cpu_current_state(cpu)
>   != CPU_STATE_OFFLINE);
>   cpu_maps_update_done();
> - rc = cpu_up(cpu);
> + rc = device_online(get_cpu_device(cpu));
>   if (rc)
>   goto out;
>   cpu_maps_update_begin();
> @@ -467,7 +467,7 @@ static int dlpar_offline_cpu(struct device_node *dn)
>   if (get_cpu_current_state(cpu) == CPU_STATE_ONLINE) {
>   set_preferred_offline_state(cpu, 
> CPU_STATE_OFFLINE);
>   cpu_maps_update_done();
> - rc = cpu_down(cpu);
> + rc = device_offline(get_cpu_device(cpu));
>   if (rc)
>   goto out;
>   cpu_maps_update_begin();
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] powerpc: use device_online/offline() instead of cpu_up/down()

2014-11-03 Thread Nathan Fontenot
On 10/31/2014 02:41 PM, Dan Streetman wrote:
 In powerpc pseries platform dlpar operations, use device_online() and
 device_offline() instead of cpu_up() and cpu_down().
 
 Calling cpu_up/down directly does not update the cpu device offline
 field, which is used to online/offline a cpu from sysfs.  Calling
 device_online/offline instead keeps the sysfs cpu online value correct.
 The hotplug lock, which is required to be held when calling
 device_online/offline, is already held when dlpar_online/offline_cpu
 are called, since they are called only from cpu_probe|release_store.
 
 This patch fixes errors on PowerVM systems that have cpu(s) added/removed
 using dlpar operations; without this patch, the
 /sys/devices/system/cpu/cpuN/online nodes do not correctly show the
 online state of added/removed cpus.
 
 Signed-off-by: Dan Streetman ddstr...@ieee.org
 Cc: Nathan Fontenot nf...@linux.vnet.ibm.com

Acked-by: Nathan Fontenot nf...@linux.vnet.ibm.com

 ---
 
 Previous discussion for this:
 https://lkml.org/lkml/2014/10/29/839
 
  arch/powerpc/platforms/pseries/dlpar.c | 4 ++--
  1 file changed, 2 insertions(+), 2 deletions(-)
 
 diff --git a/arch/powerpc/platforms/pseries/dlpar.c 
 b/arch/powerpc/platforms/pseries/dlpar.c
 index 6ad83bd..c22bb1b 100644
 --- a/arch/powerpc/platforms/pseries/dlpar.c
 +++ b/arch/powerpc/platforms/pseries/dlpar.c
 @@ -382,7 +382,7 @@ static int dlpar_online_cpu(struct device_node *dn)
   BUG_ON(get_cpu_current_state(cpu)
   != CPU_STATE_OFFLINE);
   cpu_maps_update_done();
 - rc = cpu_up(cpu);
 + rc = device_online(get_cpu_device(cpu));
   if (rc)
   goto out;
   cpu_maps_update_begin();
 @@ -467,7 +467,7 @@ static int dlpar_offline_cpu(struct device_node *dn)
   if (get_cpu_current_state(cpu) == CPU_STATE_ONLINE) {
   set_preferred_offline_state(cpu, 
 CPU_STATE_OFFLINE);
   cpu_maps_update_done();
 - rc = cpu_down(cpu);
 + rc = device_offline(get_cpu_device(cpu));
   if (rc)
   goto out;
   cpu_maps_update_begin();
 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] MAINTAINERS: nx-842 driver maintainer change

2014-10-17 Thread Nathan Fontenot
On 10/17/2014 06:19 PM, Dan Streetman wrote:
> Change maintainer of nx-842 compression coprocessor driver
> to Dan Streetman.
> 
> Signed-off-by: Dan Streetman 
> Cc: Nathan Fontenot 

Acked-by: Nathan Fontenot 

> ---
>  MAINTAINERS | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index b0f17d5..3cc3e41 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -4605,7 +4605,7 @@ S:  Supported
>  F:   drivers/crypto/nx/
> 
>  IBM Power 842 compression accelerator
> -M:   Nathan Fontenot 
> +M:   Dan Streetman 
>  S:   Supported
>  F:   drivers/crypto/nx/nx-842.c
>  F:   include/linux/nx842.h
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] MAINTAINERS: nx-842 driver maintainer change

2014-10-17 Thread Nathan Fontenot
On 10/17/2014 06:19 PM, Dan Streetman wrote:
 Change maintainer of nx-842 compression coprocessor driver
 to Dan Streetman.
 
 Signed-off-by: Dan Streetman ddstr...@ieee.org
 Cc: Nathan Fontenot nf...@linux.vnet.ibm.com

Acked-by: Nathan Fontenot nf...@linux.vnet.ibm.com

 ---
  MAINTAINERS | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)
 
 diff --git a/MAINTAINERS b/MAINTAINERS
 index b0f17d5..3cc3e41 100644
 --- a/MAINTAINERS
 +++ b/MAINTAINERS
 @@ -4605,7 +4605,7 @@ S:  Supported
  F:   drivers/crypto/nx/
 
  IBM Power 842 compression accelerator
 -M:   Nathan Fontenot nf...@linux.vnet.ibm.com
 +M:   Dan Streetman ddstr...@us.ibm.com
  S:   Supported
  F:   drivers/crypto/nx/nx-842.c
  F:   include/linux/nx842.h
 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] pseries: Make CPU hotplug path endian safe

2014-09-05 Thread Nathan Fontenot
On 09/05/2014 04:16 AM, bharata@gmail.com wrote:
> From: Bharata B Rao 
> 
> - ibm,rtas-configure-connector should treat the RTAS data as big endian.
> - Treat ibm,ppc-interrupt-server#s as big-endian when setting
>   smp_processor_id during hotplug.
> 
> Signed-off-by: Bharata B Rao 
> ---
>  arch/powerpc/platforms/pseries/dlpar.c   | 10 +-
>  arch/powerpc/platforms/pseries/hotplug-cpu.c |  4 ++--
>  2 files changed, 7 insertions(+), 7 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/pseries/dlpar.c 
> b/arch/powerpc/platforms/pseries/dlpar.c
> index 2d0b4d6..dc55f9c 100644
> --- a/arch/powerpc/platforms/pseries/dlpar.c
> +++ b/arch/powerpc/platforms/pseries/dlpar.c
> @@ -48,11 +48,11 @@ static struct property *dlpar_parse_cc_property(struct 
> cc_workarea *ccwa)
>   if (!prop)
>   return NULL;
>  
> - name = (char *)ccwa + ccwa->name_offset;
> + name = (char *)ccwa + be32_to_cpu(ccwa->name_offset);
>   prop->name = kstrdup(name, GFP_KERNEL);
>  
> - prop->length = ccwa->prop_length;
> - value = (char *)ccwa + ccwa->prop_offset;
> + prop->length = be32_to_cpu(ccwa->prop_length);
> + value = (char *)ccwa + be32_to_cpu(ccwa->prop_offset);
>   prop->value = kmemdup(value, prop->length, GFP_KERNEL);
>   if (!prop->value) {
>   dlpar_free_cc_property(prop);
> @@ -78,7 +78,7 @@ static struct device_node *dlpar_parse_cc_node(struct 
> cc_workarea *ccwa,
>   if (!dn)
>   return NULL;
>  
> - name = (char *)ccwa + ccwa->name_offset;
> + name = (char *)ccwa + be32_to_cpu(ccwa->name_offset);
>   dn->full_name = kasprintf(GFP_KERNEL, "%s/%s", path, name);
>   if (!dn->full_name) {
>   kfree(dn);
> @@ -148,7 +148,7 @@ struct device_node *dlpar_configure_connector(u32 
> drc_index,
>   return NULL;
>  
>   ccwa = (struct cc_workarea *)_buf[0];
> - ccwa->drc_index = drc_index;
> + ccwa->drc_index = cpu_to_be32(drc_index);

I need to look at this some more but I think this may cause an issue for
partition migration. If I am following the code correctly, starting in
pseries_devicetree_update(), the drc_index value passed to 
dlpar_configure_connector is pulled directly out of a buffer we get from
firmware. This would mean the drc_index value is already in BE format.

Whereas for cpu hotplug the drc_index value is passed in from userspace
via the cpu probe interface in sysfs. I assume that you are seeing the
drc_index value getting passed in in LE format.

-Nathan

>   ccwa->zero = 0;
>  
>   do {
> diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c 
> b/arch/powerpc/platforms/pseries/hotplug-cpu.c
> index 20d6297..447f8c6 100644
> --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c
> +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c
> @@ -247,7 +247,7 @@ static int pseries_add_processor(struct device_node *np)
>   unsigned int cpu;
>   cpumask_var_t candidate_mask, tmp;
>   int err = -ENOSPC, len, nthreads, i;
> - const u32 *intserv;
> + const __be32 *intserv;
>  
>   intserv = of_get_property(np, "ibm,ppc-interrupt-server#s", );
>   if (!intserv)
> @@ -293,7 +293,7 @@ static int pseries_add_processor(struct device_node *np)
>   for_each_cpu(cpu, tmp) {
>   BUG_ON(cpu_present(cpu));
>   set_cpu_present(cpu, true);
> - set_hard_smp_processor_id(cpu, *intserv++);
> + set_hard_smp_processor_id(cpu, be32_to_cpu(*intserv++));
>   }
>   err = 0;
>  out_unlock:
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] pseries: Make CPU hotplug path endian safe

2014-09-05 Thread Nathan Fontenot
On 09/05/2014 04:16 AM, bharata@gmail.com wrote:
 From: Bharata B Rao bhar...@linux.vnet.ibm.com
 
 - ibm,rtas-configure-connector should treat the RTAS data as big endian.
 - Treat ibm,ppc-interrupt-server#s as big-endian when setting
   smp_processor_id during hotplug.
 
 Signed-off-by: Bharata B Rao bhar...@linux.vnet.ibm.com
 ---
  arch/powerpc/platforms/pseries/dlpar.c   | 10 +-
  arch/powerpc/platforms/pseries/hotplug-cpu.c |  4 ++--
  2 files changed, 7 insertions(+), 7 deletions(-)
 
 diff --git a/arch/powerpc/platforms/pseries/dlpar.c 
 b/arch/powerpc/platforms/pseries/dlpar.c
 index 2d0b4d6..dc55f9c 100644
 --- a/arch/powerpc/platforms/pseries/dlpar.c
 +++ b/arch/powerpc/platforms/pseries/dlpar.c
 @@ -48,11 +48,11 @@ static struct property *dlpar_parse_cc_property(struct 
 cc_workarea *ccwa)
   if (!prop)
   return NULL;
  
 - name = (char *)ccwa + ccwa-name_offset;
 + name = (char *)ccwa + be32_to_cpu(ccwa-name_offset);
   prop-name = kstrdup(name, GFP_KERNEL);
  
 - prop-length = ccwa-prop_length;
 - value = (char *)ccwa + ccwa-prop_offset;
 + prop-length = be32_to_cpu(ccwa-prop_length);
 + value = (char *)ccwa + be32_to_cpu(ccwa-prop_offset);
   prop-value = kmemdup(value, prop-length, GFP_KERNEL);
   if (!prop-value) {
   dlpar_free_cc_property(prop);
 @@ -78,7 +78,7 @@ static struct device_node *dlpar_parse_cc_node(struct 
 cc_workarea *ccwa,
   if (!dn)
   return NULL;
  
 - name = (char *)ccwa + ccwa-name_offset;
 + name = (char *)ccwa + be32_to_cpu(ccwa-name_offset);
   dn-full_name = kasprintf(GFP_KERNEL, %s/%s, path, name);
   if (!dn-full_name) {
   kfree(dn);
 @@ -148,7 +148,7 @@ struct device_node *dlpar_configure_connector(u32 
 drc_index,
   return NULL;
  
   ccwa = (struct cc_workarea *)data_buf[0];
 - ccwa-drc_index = drc_index;
 + ccwa-drc_index = cpu_to_be32(drc_index);

I need to look at this some more but I think this may cause an issue for
partition migration. If I am following the code correctly, starting in
pseries_devicetree_update(), the drc_index value passed to 
dlpar_configure_connector is pulled directly out of a buffer we get from
firmware. This would mean the drc_index value is already in BE format.

Whereas for cpu hotplug the drc_index value is passed in from userspace
via the cpu probe interface in sysfs. I assume that you are seeing the
drc_index value getting passed in in LE format.

-Nathan

   ccwa-zero = 0;
  
   do {
 diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c 
 b/arch/powerpc/platforms/pseries/hotplug-cpu.c
 index 20d6297..447f8c6 100644
 --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c
 +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c
 @@ -247,7 +247,7 @@ static int pseries_add_processor(struct device_node *np)
   unsigned int cpu;
   cpumask_var_t candidate_mask, tmp;
   int err = -ENOSPC, len, nthreads, i;
 - const u32 *intserv;
 + const __be32 *intserv;
  
   intserv = of_get_property(np, ibm,ppc-interrupt-server#s, len);
   if (!intserv)
 @@ -293,7 +293,7 @@ static int pseries_add_processor(struct device_node *np)
   for_each_cpu(cpu, tmp) {
   BUG_ON(cpu_present(cpu));
   set_cpu_present(cpu, true);
 - set_hard_smp_processor_id(cpu, *intserv++);
 + set_hard_smp_processor_id(cpu, be32_to_cpu(*intserv++));
   }
   err = 0;
  out_unlock:
 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 00/10] of: Core devicetree changeset support

2014-07-25 Thread Nathan Fontenot
On 07/23/2014 06:44 PM, Grant Likely wrote:
> Hi all,
> 
> This is a rollup of all the patches that I queued up today for
> linux-next for the devicetree changeset and overlay work. I'm reposting
> these patches because some have been significantly changed since their
> first posting. None of the code is actually live yet, but the test cases
> are there.
> 
> There is one significant functional change which may affect pseries. The
> OF_DYNAMIC code has been changed to emit notifiers after applying a
> change to the tree instead of before. I think I've got all the users
> taken care of correctly, but I'd like to see some testing. I don't have
> access to pseries (and I can't get the QEMU pseries model to boot reliably)
> 
> Nathan/Tyrel, can you give it a spin? The changes can be found in my git tree:

Grant,

I was able to do some sniff testing by adding/removing memory, cpus, and pci
devices and it appears that all device tree nodes and properties were updated
properly.

I would still like to try doing live partition migration where we can update
many pieces of the device tree. I'll be out next week, will try to get
on that when I return.

-Nathan

> 
>   git://git.secretlab.ca/git/linux devicetree/next
> 
> Grant Likely (8):
>   of/platform: Fix of_platform_device_destroy iteration of devices
>   of: Move CONFIG_OF_DYNAMIC code into a separate file
>   of: Make devicetree sysfs update functions consistent.
>   of: Make sure attached nodes don't carry along extra children
>   of: Move dynamic node fixups out of powerpc and into common code
>   of: Reorder device tree changes and notifiers
>   of: Add todo tasklist for Devicetree
>   Merge branch 'devicetree/next-overlay' into devicetree/next
> 
> Pantelis Antoniou (4):
>   of: rename of_aliases_mutex to just of_mutex
>   OF: Utility helper functions for dynamic nodes
>   of: Create unlocked versions of node and property add/remove functions
>   of: Transactional DT support.
> 
>  Documentation/devicetree/changesets.txt |  40 ++
>  Documentation/devicetree/todo.txt   |  11 +
>  arch/powerpc/kernel/prom.c  |  70 ---
>  arch/powerpc/platforms/pseries/hotplug-memory.c |   2 +-
>  drivers/crypto/nx/nx-842.c  |  30 +-
>  drivers/of/Makefile |   1 +
>  drivers/of/base.c   | 423 ---
>  drivers/of/device.c |   4 +-
>  drivers/of/dynamic.c| 660 
> 
>  drivers/of/of_private.h |  59 ++-
>  drivers/of/platform.c   |  32 +-
>  drivers/of/selftest.c   |  79 +++
>  drivers/of/testcase-data/testcases.dtsi |  10 +
>  include/linux/of.h  |  80 ++-
>  include/linux/of_platform.h |   7 +-
>  15 files changed, 1065 insertions(+), 443 deletions(-)
>  create mode 100644 Documentation/devicetree/changesets.txt
>  create mode 100644 Documentation/devicetree/todo.txt
>  create mode 100644 drivers/of/dynamic.c
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 00/10] of: Core devicetree changeset support

2014-07-25 Thread Nathan Fontenot
On 07/23/2014 06:44 PM, Grant Likely wrote:
 Hi all,
 
 This is a rollup of all the patches that I queued up today for
 linux-next for the devicetree changeset and overlay work. I'm reposting
 these patches because some have been significantly changed since their
 first posting. None of the code is actually live yet, but the test cases
 are there.
 
 There is one significant functional change which may affect pseries. The
 OF_DYNAMIC code has been changed to emit notifiers after applying a
 change to the tree instead of before. I think I've got all the users
 taken care of correctly, but I'd like to see some testing. I don't have
 access to pseries (and I can't get the QEMU pseries model to boot reliably)
 
 Nathan/Tyrel, can you give it a spin? The changes can be found in my git tree:

Grant,

I was able to do some sniff testing by adding/removing memory, cpus, and pci
devices and it appears that all device tree nodes and properties were updated
properly.

I would still like to try doing live partition migration where we can update
many pieces of the device tree. I'll be out next week, will try to get
on that when I return.

-Nathan

 
   git://git.secretlab.ca/git/linux devicetree/next
 
 Grant Likely (8):
   of/platform: Fix of_platform_device_destroy iteration of devices
   of: Move CONFIG_OF_DYNAMIC code into a separate file
   of: Make devicetree sysfs update functions consistent.
   of: Make sure attached nodes don't carry along extra children
   of: Move dynamic node fixups out of powerpc and into common code
   of: Reorder device tree changes and notifiers
   of: Add todo tasklist for Devicetree
   Merge branch 'devicetree/next-overlay' into devicetree/next
 
 Pantelis Antoniou (4):
   of: rename of_aliases_mutex to just of_mutex
   OF: Utility helper functions for dynamic nodes
   of: Create unlocked versions of node and property add/remove functions
   of: Transactional DT support.
 
  Documentation/devicetree/changesets.txt |  40 ++
  Documentation/devicetree/todo.txt   |  11 +
  arch/powerpc/kernel/prom.c  |  70 ---
  arch/powerpc/platforms/pseries/hotplug-memory.c |   2 +-
  drivers/crypto/nx/nx-842.c  |  30 +-
  drivers/of/Makefile |   1 +
  drivers/of/base.c   | 423 ---
  drivers/of/device.c |   4 +-
  drivers/of/dynamic.c| 660 
 
  drivers/of/of_private.h |  59 ++-
  drivers/of/platform.c   |  32 +-
  drivers/of/selftest.c   |  79 +++
  drivers/of/testcase-data/testcases.dtsi |  10 +
  include/linux/of.h  |  80 ++-
  include/linux/of_platform.h |   7 +-
  15 files changed, 1065 insertions(+), 443 deletions(-)
  create mode 100644 Documentation/devicetree/changesets.txt
  create mode 100644 Documentation/devicetree/todo.txt
  create mode 100644 drivers/of/dynamic.c
 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 1/1] drivers/crypto/nx/nx-842.c: use PAGE_ALIGNED instead of IS_ALIGNED(PAGE_SIZE

2014-06-16 Thread Nathan Fontenot
On 06/14/2014 05:09 PM, Fabian Frederick wrote:
> use mm.h definition
> 
> Cc: Nathan Fontenot 
> Cc: Marcelo Henrique Cerri 
> Signed-off-by: Fabian Frederick 

Acked-by: Nathan Fontenot 

> ---
>  drivers/crypto/nx/nx-842.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/crypto/nx/nx-842.c b/drivers/crypto/nx/nx-842.c
> index 502edf0..923c4b6 100644
> --- a/drivers/crypto/nx/nx-842.c
> +++ b/drivers/crypto/nx/nx-842.c
> @@ -350,7 +350,7 @@ int nx842_compress(const unsigned char *in, unsigned int 
> inlen,
>* the alignment is guaranteed.
>   */
>   inbuf = (unsigned long)in;
> - if (!IS_ALIGNED(inbuf, PAGE_SIZE) || inlen != PAGE_SIZE)
> + if (!PAGE_ALIGNED(inbuf) || inlen != PAGE_SIZE)
>   return -EINVAL;
> 
>   rcu_read_lock();
> @@ -545,7 +545,7 @@ int nx842_decompress(const unsigned char *in, unsigned 
> int inlen,
> 
>   /* Ensure page alignment and size */
>   outbuf = (unsigned long)out;
> - if (!IS_ALIGNED(outbuf, PAGE_SIZE) || *outlen != PAGE_SIZE)
> + if (!PAGE_ALIGNED(outbuf) || *outlen != PAGE_SIZE)
>   return -EINVAL;
> 
>   rcu_read_lock();
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 1/1] drivers/crypto/nx/nx-842.c: use PAGE_ALIGNED instead of IS_ALIGNED(PAGE_SIZE

2014-06-16 Thread Nathan Fontenot
On 06/14/2014 05:09 PM, Fabian Frederick wrote:
 use mm.h definition
 
 Cc: Nathan Fontenot nf...@linux.vnet.ibm.com
 Cc: Marcelo Henrique Cerri mhce...@linux.vnet.ibm.com
 Signed-off-by: Fabian Frederick f...@skynet.be

Acked-by: Nathan Fontenot nf...@linux.vnet.ibm.com

 ---
  drivers/crypto/nx/nx-842.c | 4 ++--
  1 file changed, 2 insertions(+), 2 deletions(-)
 
 diff --git a/drivers/crypto/nx/nx-842.c b/drivers/crypto/nx/nx-842.c
 index 502edf0..923c4b6 100644
 --- a/drivers/crypto/nx/nx-842.c
 +++ b/drivers/crypto/nx/nx-842.c
 @@ -350,7 +350,7 @@ int nx842_compress(const unsigned char *in, unsigned int 
 inlen,
* the alignment is guaranteed.
   */
   inbuf = (unsigned long)in;
 - if (!IS_ALIGNED(inbuf, PAGE_SIZE) || inlen != PAGE_SIZE)
 + if (!PAGE_ALIGNED(inbuf) || inlen != PAGE_SIZE)
   return -EINVAL;
 
   rcu_read_lock();
 @@ -545,7 +545,7 @@ int nx842_decompress(const unsigned char *in, unsigned 
 int inlen,
 
   /* Ensure page alignment and size */
   outbuf = (unsigned long)out;
 - if (!IS_ALIGNED(outbuf, PAGE_SIZE) || *outlen != PAGE_SIZE)
 + if (!PAGE_ALIGNED(outbuf) || *outlen != PAGE_SIZE)
   return -EINVAL;
 
   rcu_read_lock();
 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC PATCH] memory driver: make phys_index/end_phys_index reflect the start/end section number

2014-04-11 Thread Nathan Fontenot
On 04/09/2014 11:17 PM, Li Zhong wrote:
> On Wed, 2014-04-09 at 12:39 -0500, Nathan Fontenot wrote:
>> On 04/08/2014 02:47 PM, Dave Hansen wrote:
>>>
>>> That document really needs to be updated to stop referring to sections
>>> (at least in the descriptions of the user interface).  We can not change
>>> the units of phys_index/end_phys_index without also changing
>>> block_size_bytes.
>>>
>>
>> Here is a first pass at updating the documentation.
>>
>> I have tried to update the documentation to refer to memory blocks instead
>> of memory sections where appropriate and added a paragraph to explain
> >> that memory blocks are made of memory sections.
>>
>> Thoughts?
> 
> If we all agree to hide the information about sections, then I think we
> also need to update the section id's used for phys_index/end_phys_index,
> something like following on top of yours?
> 
> --
> diff --git a/Documentation/memory-hotplug.txt 
> b/Documentation/memory-hotplug.txt
> index 92d15e2..9fbb025 100644
> --- a/Documentation/memory-hotplug.txt
> +++ b/Documentation/memory-hotplug.txt
> @@ -138,10 +138,7 @@ is described under /sys/devices/system/memory as
>  /sys/devices/system/memory/memoryXXX
>  (XXX is the memory block id.)
> 
> -Now, XXX is defined as (start_address_of_section / section_size) of the first
> -section contained in the memory block.  The files 'phys_index' and
> -'end_phys_index' under each directory report the beginning and end section 
> id's
> -for the memory block covered by the sysfs directory.  It is expected that all
> +For the memory block covered by the sysfs directory.  It is expected that all
>  memory sections in this range are present and no memory holes exist in the
>  range. Currently there is no way to determine if there is a memory hole, but
>  the existence of one should not affect the hotplug capabilities of the memory
> @@ -155,16 +152,14 @@ This device covers address range [0x1 ... 
> 0x14000)
>  Under each memory block, you can see 4 or 5 files, the end_phys_index file
>  being a recent addition and not present on older kernels.
> 
> -/sys/devices/system/memory/memoryXXX/start_phys_index
> +/sys/devices/system/memory/memoryXXX/phys_index
>  /sys/devices/system/memory/memoryXXX/end_phys_index
>  /sys/devices/system/memory/memoryXXX/phys_device
>  /sys/devices/system/memory/memoryXXX/state
>  /sys/devices/system/memory/memoryXXX/removable
> 
> -'phys_index'  : read-only and contains section id of the first section
> - in the memory block, same as XXX.
> -'end_phys_index'  : read-only and contains section id of the last section
> - in the memory block.
> +'phys_index'  : read-only and contains memory block id, same as XXX.
> +'end_phys_index'  : read-only and contains memory block id, same as XXX.
>  'state'   : read-write
>  at read:  contains online/offline state of memory.
>  at write: user can specify "online_kernel",
> --
> 
> Not sure whether it is proper to remove end_phys_index, too? 

If we are going to leave the code as it is today such that the start_phys_index
and end_phys_index files both contain the same value I don't see why we should
not do this.

Li Zhong, unless anyone has objections, can you submit a patch to update the
files in sysfs and the documentation?

-Nathan

> 
> Thanks, 
> Zhong
> 
> 
> 
> 
>>
>> -Nathan
>> ---
>>  Documentation/memory-hotplug.txt |  113 
>> ---
>>  1 file changed, 59 insertions(+), 54 deletions(-)
>>
>> Index: linux/Documentation/memory-hotplug.txt
>> ===
>> --- linux.orig/Documentation/memory-hotplug.txt
>> +++ linux/Documentation/memory-hotplug.txt
>> @@ -88,16 +88,21 @@ phase by hand.
>>
>>  1.3. Unit of Memory online/offline operation
>>  
>> -Memory hotplug uses SPARSEMEM memory model. SPARSEMEM divides the whole 
>> memory
>> -into chunks of the same size. The chunk is called a "section". The size of
>> -a section is architecture dependent. For example, power uses 16MiB, ia64 
>> uses
>> -1GiB. The unit of online/offline operation is "one section". (see Section 
>> 3.)
>> +Memory hotplug uses SPARSEMEM memory model which allows memory to be divided
>> +into chunks of the same size. These chunks are called "sections". The size 
>> of
>> +a memory section is architecture dependent. For example, power uses 16MiB, 
>> ia64
>> +uses 1GiB.
>> +
>> +Memor

Re: [RFC PATCH] memory driver: make phys_index/end_phys_index reflect the start/end section number

2014-04-11 Thread Nathan Fontenot
On 04/09/2014 11:17 PM, Li Zhong wrote:
 On Wed, 2014-04-09 at 12:39 -0500, Nathan Fontenot wrote:
 On 04/08/2014 02:47 PM, Dave Hansen wrote:

 That document really needs to be updated to stop referring to sections
 (at least in the descriptions of the user interface).  We can not change
 the units of phys_index/end_phys_index without also changing
 block_size_bytes.


 Here is a first pass at updating the documentation.

 I have tried to update the documentation to refer to memory blocks instead
 of memory sections where appropriate and added a paragraph to explain
 that memory blocks are made of memory sections.

 Thoughts?
 
 If we all agree to hide the information about sections, then I think we
 also need to update the section id's used for phys_index/end_phys_index,
 something like following on top of yours?
 
 --
 diff --git a/Documentation/memory-hotplug.txt 
 b/Documentation/memory-hotplug.txt
 index 92d15e2..9fbb025 100644
 --- a/Documentation/memory-hotplug.txt
 +++ b/Documentation/memory-hotplug.txt
 @@ -138,10 +138,7 @@ is described under /sys/devices/system/memory as
  /sys/devices/system/memory/memoryXXX
  (XXX is the memory block id.)
 
 -Now, XXX is defined as (start_address_of_section / section_size) of the first
 -section contained in the memory block.  The files 'phys_index' and
 -'end_phys_index' under each directory report the beginning and end section 
 id's
 -for the memory block covered by the sysfs directory.  It is expected that all
 +For the memory block covered by the sysfs directory.  It is expected that all
  memory sections in this range are present and no memory holes exist in the
  range. Currently there is no way to determine if there is a memory hole, but
  the existence of one should not affect the hotplug capabilities of the memory
 @@ -155,16 +152,14 @@ This device covers address range [0x1 ... 
 0x14000)
  Under each memory block, you can see 4 or 5 files, the end_phys_index file
  being a recent addition and not present on older kernels.
 
 -/sys/devices/system/memory/memoryXXX/start_phys_index
 +/sys/devices/system/memory/memoryXXX/phys_index
  /sys/devices/system/memory/memoryXXX/end_phys_index
  /sys/devices/system/memory/memoryXXX/phys_device
  /sys/devices/system/memory/memoryXXX/state
  /sys/devices/system/memory/memoryXXX/removable
 
 -'phys_index'  : read-only and contains section id of the first section
 - in the memory block, same as XXX.
 -'end_phys_index'  : read-only and contains section id of the last section
 - in the memory block.
 +'phys_index'  : read-only and contains memory block id, same as XXX.
 +'end_phys_index'  : read-only and contains memory block id, same as XXX.
  'state'   : read-write
  at read:  contains online/offline state of memory.
  at write: user can specify online_kernel,
 --
 
 Not sure whether it is proper to remove end_phys_index, too? 

If we are going to leave the code as it is today such that the start_phys_index
and end_phys_index files both contain the same value I don't see why we should
not do this.

Li Zhong, unless anyone has objections, can you submit a patch to update the
files in sysfs and the documentation?

-Nathan

 
 Thanks, 
 Zhong
 
 
 
 

 -Nathan
 ---
  Documentation/memory-hotplug.txt |  113 
 ---
  1 file changed, 59 insertions(+), 54 deletions(-)

 Index: linux/Documentation/memory-hotplug.txt
 ===
 --- linux.orig/Documentation/memory-hotplug.txt
 +++ linux/Documentation/memory-hotplug.txt
 @@ -88,16 +88,21 @@ phase by hand.

  1.3. Unit of Memory online/offline operation
  
 -Memory hotplug uses SPARSEMEM memory model. SPARSEMEM divides the whole 
 memory
 -into chunks of the same size. The chunk is called a section. The size of
 -a section is architecture dependent. For example, power uses 16MiB, ia64 
 uses
 -1GiB. The unit of online/offline operation is one section. (see Section 
 3.)
 +Memory hotplug uses SPARSEMEM memory model which allows memory to be divided
 +into chunks of the same size. These chunks are called sections. The size 
 of
 +a memory section is architecture dependent. For example, power uses 16MiB, 
 ia64
 +uses 1GiB.
 +
 +Memory sections are combined into chunks referred to as memory blocks. The
 +size of a memory block is architecture dependent and represents the logical
 +unit upon which memory online/offline operations are to be performed. The
 +default size of a memory block is the same as memory section size unless an
 +architecture specifies otherwise. (see Section 3.)

 -To determine the size of sections, please read this file:
 +To determine the size (in bytes) of a memory block please read this file:

  /sys/devices/system/memory/block_size_bytes

 -This file shows the size of sections in byte.

  ---
  2. Kernel Configuration
 @@ -123,14

[PATCH] Update Maintainers for IBM Power 842, vscsi, and vfc drivers

2014-04-09 Thread Nathan Fontenot
Update the MAINTAINERS file to indicate the current maintainers
for the IBM Power 842 Compression driver, IBM Power Virtual SCSI
driver and the IBM Power Virtual FC Driver.

Signed-off-by: Nathan Fontenot 
---
 MAINTAINERS |   16 +++-
 1 file changed, 11 insertions(+), 5 deletions(-)

Index: linux/MAINTAINERS
===
--- linux.orig/MAINTAINERS  2014-04-08 14:14:57.0 -0500
+++ linux/MAINTAINERS   2014-04-08 14:25:29.0 -0500
@@ -4358,7 +4358,7 @@
 F: drivers/crypto/nx/
 
 IBM Power 842 compression accelerator
-M: Robert Jennings 
+M: Nathan Fontenot 
 S: Supported
 F: drivers/crypto/nx/nx-842.c
 F: include/linux/nx842.h
@@ -4374,12 +4374,18 @@
 S: Supported
 F: drivers/net/ethernet/ibm/ibmveth.*
 
-IBM Power Virtual SCSI/FC Device Drivers
-M: Robert Jennings 
+IBM Power Virtual SCSI Device Drivers
+M: Nathan Fontenot 
 L: linux-s...@vger.kernel.org
 S: Supported
-F: drivers/scsi/ibmvscsi/
-X: drivers/scsi/ibmvscsi/ibmvstgt.c
+F: drivers/scsi/ibmvscsi/ibmvscsi*
+F: drivers/scsi/ibmvscsi/viosrp.h
+
+IBM Power Virtual FC Device Drivers
+M: Brian King 
+L: linux-s...@vger.kernel.org
+S: Supported
+F: drivers/scsi/ibmvscsi/ibmvfc*
 
 IBM ServeRAID RAID DRIVER
 P: Jack Hammer

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC PATCH] memory driver: make phys_index/end_phys_index reflect the start/end section number

2014-04-09 Thread Nathan Fontenot
On 04/08/2014 02:47 PM, Dave Hansen wrote:
> 
> That document really needs to be updated to stop referring to sections
> (at least in the descriptions of the user interface).  We can not change
> the units of phys_index/end_phys_index without also changing
> block_size_bytes.
> 

Here is a first pass at updating the documentation.

I have tried to update the documentation to refer to memory blocks instead
of memory sections where appropriate and added a paragraph to explain
that memory blocks are made of memory sections.

Thoughts?

-Nathan
---
 Documentation/memory-hotplug.txt |  113 ---
 1 file changed, 59 insertions(+), 54 deletions(-)

Index: linux/Documentation/memory-hotplug.txt
===
--- linux.orig/Documentation/memory-hotplug.txt
+++ linux/Documentation/memory-hotplug.txt
@@ -88,16 +88,21 @@ phase by hand.
 
 1.3. Unit of Memory online/offline operation
 
-Memory hotplug uses SPARSEMEM memory model. SPARSEMEM divides the whole memory
-into chunks of the same size. The chunk is called a "section". The size of
-a section is architecture dependent. For example, power uses 16MiB, ia64 uses
-1GiB. The unit of online/offline operation is "one section". (see Section 3.)
+Memory hotplug uses SPARSEMEM memory model which allows memory to be divided
+into chunks of the same size. These chunks are called "sections". The size of
+a memory section is architecture dependent. For example, power uses 16MiB, ia64
+uses 1GiB.
+
+Memory sections are combined into chunks referred to as "memory blocks". The
+size of a memory block is architecture dependent and represents the logical
+unit upon which memory online/offline operations are to be performed. The
+default size of a memory block is the same as memory section size unless an
+architecture specifies otherwise. (see Section 3.)
 
-To determine the size of sections, please read this file:
+To determine the size (in bytes) of a memory block please read this file:
 
 /sys/devices/system/memory/block_size_bytes
 
-This file shows the size of sections in byte.
 
 ---
 2. Kernel Configuration
@@ -123,14 +128,15 @@ config options.
 (CONFIG_ACPI_CONTAINER).
 This option can be kernel module too.
 
+
 
-4 sysfs files for memory hotplug
+3 sysfs files for memory hotplug
 
-All sections have their device information in sysfs.  Each section is part of
-a memory block under /sys/devices/system/memory as
+All memory blocks have their device information in sysfs.  Each memory block
+is described under /sys/devices/system/memory as
 
 /sys/devices/system/memory/memoryXXX
-(XXX is the section id.)
+(XXX is the memory block id.)
 
 Now, XXX is defined as (start_address_of_section / section_size) of the first
 section contained in the memory block.  The files 'phys_index' and
@@ -141,13 +147,13 @@ range. Currently there is no way to dete
 the existence of one should not affect the hotplug capabilities of the memory
 block.
 
-For example, assume 1GiB section size. A device for a memory starting at
+For example, assume 1GiB memory block size. A device for a memory starting at
 0x1 is /sys/device/system/memory/memory4
 (0x1 / 1Gib = 4)
 This device covers address range [0x1 ... 0x14000)
 
-Under each section, you can see 4 or 5 files, the end_phys_index file being
-a recent addition and not present on older kernels.
+Under each memory block, you can see 4 or 5 files, the end_phys_index file
+being a recent addition and not present on older kernels.
 
 /sys/devices/system/memory/memoryXXX/start_phys_index
 /sys/devices/system/memory/memoryXXX/end_phys_index
@@ -185,6 +191,7 @@ For example:
 A backlink will also be created:
 /sys/devices/system/memory/memory9/node0 -> ../../node/node0
 
+
 
 4. Physical memory hot-add phase
 
@@ -227,11 +234,10 @@ You can tell the physical address of new
 
 % echo start_address_of_new_memory > /sys/devices/system/memory/probe
 
-Then, [start_address_of_new_memory, start_address_of_new_memory + section_size)
-memory range is hot-added. In this case, hotplug script is not called (in
-current implementation). You'll have to online memory by yourself.
-Please see "How to online memory" in this text.
-
+Then, [start_address_of_new_memory, start_address_of_new_memory +
+memory_block_size) memory range is hot-added. In this case, hotplug script is
+not called (in current implementation). You'll have to online memory by
+yourself.  Please see "How to online memory" in this text.
 
 
 --
@@ -240,36 +246,36 @@ Please see "How to online memory" in thi
 
 5.1. State of memory
 
-To see (online/offline) state of memory section, read 'state' file.
+To see (online/offline) state of a memory block, read 'state' file.
 
 % cat 

Re: [RFC PATCH] memory driver: make phys_index/end_phys_index reflect the start/end section number

2014-04-09 Thread Nathan Fontenot
On 04/08/2014 02:47 PM, Dave Hansen wrote:
> On 04/08/2014 11:23 AM, Nathan Fontenot wrote:
>> On 04/08/2014 11:13 AM, Dave Hansen wrote:
>>> On 04/08/2014 01:27 AM, Li Zhong wrote:
>>>> If Dave and others don't have further objections, it seems this small
>>>> userspace incompatibility could be accepted by most of us, and I don't
>>>> need to make a version 2. 
>>>
>>> Let me ask another question then.  What are the units of
>>> phys_index/end_phys_index?  How do we expose those units to userspace?
>>>
>>
>> The documentation for these files just states that the files contain
>> the first and last section id of memory in the memory block for
>> phys_index and end_phys_index respectively.
>>
>> I'm not sure the values have ever been units of anything, at least not
>> that I remember.
> 
> <sigh>
> 
> There are two units.  SECTION_SIZE, which is completely internal to the
> kernel, and block_size_bytes which used to be the same as SECTION_SIZE,
> but is not now.  Which one of those two is phys_index/end_phys_index in,
> and if it is in terms of SECTION_SIZE like this patch proposes, how do
> we tell userspace how large SECTION_SIZE is?
> 
> block_size_bytes is supposed to tell you how large the sections are.  In
> the case where we lumped a bunch of sections together, we also bumped up
> block_size_bytes.  That's why we currently divide the *ACTUAL* section
> number in phys_index/end_phys_index by block_size_bytes.
> 
> That document really needs to be updated to stop referring to sections
> (at least in the descriptions of the user interface).  We can not change
> the units of phys_index/end_phys_index without also changing
> block_size_bytes.
> 

Re-reading the documentation. You're correct, it needs help.

-Nathan

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC PATCH] memory driver: make phys_index/end_phys_index reflect the start/end section number

2014-04-09 Thread Nathan Fontenot
On 04/09/2014 10:49 AM, Dave Hansen wrote:
> On 04/09/2014 02:20 AM, Li Zhong wrote:
>> Or do you mean we don't need to expose any information related to
>> SECTION to userspace? 
> 
> Right, we don't need to expose sections themselves to userspace.  Do we?
>

No. The layout in sysfs is based on block_size_bytes so I do not see any
need to expose sections to userspace.

-Nathan
 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC PATCH] memory driver: make phys_index/end_phys_index reflect the start/end section number

2014-04-09 Thread Nathan Fontenot
On 04/09/2014 10:49 AM, Dave Hansen wrote:
> On 04/09/2014 02:20 AM, Li Zhong wrote:
>> Or do you mean we don't need to expose any information related to
>> SECTION to userspace? 
> 
> Right, we don't need to expose sections themselves to userspace.  Do we?


No. The layout in sysfs is based on block_size_bytes so I do not see any
need to expose sections to userspace.

-Nathan
 

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC PATCH] memory driver: make phys_index/end_phys_index reflect the start/end section number

2014-04-09 Thread Nathan Fontenot
On 04/08/2014 02:47 PM, Dave Hansen wrote:
> On 04/08/2014 11:23 AM, Nathan Fontenot wrote:
>> On 04/08/2014 11:13 AM, Dave Hansen wrote:
>>> On 04/08/2014 01:27 AM, Li Zhong wrote:
>>>> If Dave and others don't have further objections, it seems this small
>>>> userspace incompatibility could be accepted by most of us, and I don't
>>>> need to make a version 2. 
>>>
>>> Let me ask another question then.  What are the units of
>>> phys_index/end_phys_index?  How do we expose those units to userspace?
>>>
>>
>> The documentation for these files just states that the files contain
>> the first and last section id of memory in the memory block for
>> phys_index and end_phys_index respectively.
>>
>> I'm not sure the values have ever been units of anything, at least not
>> that I remember.
> 
> <sigh>
> 
> There are two units.  SECTION_SIZE, which is completely internal to the
> kernel, and block_size_bytes which used to be the same as SECTION_SIZE,
> but is not now.  Which one of those two is phys_index/end_phys_index in,
> and if it is in terms of SECTION_SIZE like this patch proposes, how do
> we tell userspace how large SECTION_SIZE is?
> 
> block_size_bytes is supposed to tell you how large the sections are.  In
> the case where we lumped a bunch of sections together, we also bumped up
> block_size_bytes.  That's why we currently divide the *ACTUAL* section
> number in phys_index/end_phys_index by block_size_bytes.
> 
> That document really needs to be updated to stop referring to sections
> (at least in the descriptions of the user interface).  We can not change
> the units of phys_index/end_phys_index without also changing
> block_size_bytes.
> 

Re-reading the documentation. You're correct, it needs help.

-Nathan

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC PATCH] memory driver: make phys_index/end_phys_index reflect the start/end section number

2014-04-09 Thread Nathan Fontenot
On 04/08/2014 02:47 PM, Dave Hansen wrote:
 
> That document really needs to be updated to stop referring to sections
> (at least in the descriptions of the user interface).  We can not change
> the units of phys_index/end_phys_index without also changing
> block_size_bytes.
 

Here is a first pass at updating the documentation.

I have tried to update the documentation to refer to memory blocks instead
of memory sections where appropriate and added a paragraph to explain
that memory blocks are made of memory sections.

Thoughts?

-Nathan
---
 Documentation/memory-hotplug.txt |  113 ---
 1 file changed, 59 insertions(+), 54 deletions(-)

Index: linux/Documentation/memory-hotplug.txt
===
--- linux.orig/Documentation/memory-hotplug.txt
+++ linux/Documentation/memory-hotplug.txt
@@ -88,16 +88,21 @@ phase by hand.
 
 1.3. Unit of Memory online/offline operation
 
-Memory hotplug uses SPARSEMEM memory model. SPARSEMEM divides the whole memory
-into chunks of the same size. The chunk is called a section. The size of
-a section is architecture dependent. For example, power uses 16MiB, ia64 uses
-1GiB. The unit of online/offline operation is one section. (see Section 3.)
+Memory hotplug uses SPARSEMEM memory model which allows memory to be divided
+into chunks of the same size. These chunks are called sections. The size of
+a memory section is architecture dependent. For example, power uses 16MiB, ia64
+uses 1GiB.
+
+Memory sections are combined into chunks referred to as memory blocks. The
+size of a memory block is architecture dependent and represents the logical
+unit upon which memory online/offline operations are to be performed. The
+default size of a memory block is the same as memory section size unless an
+architecture specifies otherwise. (see Section 3.)
 
-To determine the size of sections, please read this file:
+To determine the size (in bytes) of a memory block please read this file:
 
 /sys/devices/system/memory/block_size_bytes
 
-This file shows the size of sections in byte.
 
 ---
 2. Kernel Configuration
@@ -123,14 +128,15 @@ config options.
 (CONFIG_ACPI_CONTAINER).
 This option can be kernel module too.
 
+
 
-4 sysfs files for memory hotplug
+3 sysfs files for memory hotplug
 
-All sections have their device information in sysfs.  Each section is part of
-a memory block under /sys/devices/system/memory as
+All memory blocks have their device information in sysfs.  Each memory block
+is described under /sys/devices/system/memory as
 
 /sys/devices/system/memory/memoryXXX
-(XXX is the section id.)
+(XXX is the memory block id.)
 
 Now, XXX is defined as (start_address_of_section / section_size) of the first
 section contained in the memory block.  The files 'phys_index' and
@@ -141,13 +147,13 @@ range. Currently there is no way to dete
 the existence of one should not affect the hotplug capabilities of the memory
 block.
 
-For example, assume 1GiB section size. A device for a memory starting at
+For example, assume 1GiB memory block size. A device for a memory starting at
 0x1 is /sys/device/system/memory/memory4
 (0x1 / 1Gib = 4)
 This device covers address range [0x1 ... 0x14000)
 
-Under each section, you can see 4 or 5 files, the end_phys_index file being
-a recent addition and not present on older kernels.
+Under each memory block, you can see 4 or 5 files, the end_phys_index file
+being a recent addition and not present on older kernels.
 
 /sys/devices/system/memory/memoryXXX/start_phys_index
 /sys/devices/system/memory/memoryXXX/end_phys_index
@@ -185,6 +191,7 @@ For example:
 A backlink will also be created:
 /sys/devices/system/memory/memory9/node0 -> ../../node/node0
 
+
 
 4. Physical memory hot-add phase
 
@@ -227,11 +234,10 @@ You can tell the physical address of new
 
 % echo start_address_of_new_memory > /sys/devices/system/memory/probe
 
-Then, [start_address_of_new_memory, start_address_of_new_memory + section_size)
-memory range is hot-added. In this case, hotplug script is not called (in
-current implementation). You'll have to online memory by yourself.
-Please see How to online memory in this text.
-
+Then, [start_address_of_new_memory, start_address_of_new_memory +
+memory_block_size) memory range is hot-added. In this case, hotplug script is
+not called (in current implementation). You'll have to online memory by
+yourself.  Please see How to online memory in this text.
 
 
 --
@@ -240,36 +246,36 @@ Please see How to online memory in thi
 
 5.1. State of memory
 
-To see (online/offline) state of memory section, read 'state' file.
+To see (online/offline) state of a memory block, read 'state' file.
 
 % cat 

[PATCH] Update Maintainers for IBM Power 842, vscsi, and vfc drivers

2014-04-09 Thread Nathan Fontenot
Update the MAINTAINERS file to indicate the current maintainers
for the IBM Power 842 Compression driver, IBM Power Virtual SCSI
driver and the IBM Power Virtual FC Driver.

Signed-off-by: Nathan Fontenot <nf...@linux.vnet.ibm.com>
---
 MAINTAINERS |   16 +++-
 1 file changed, 11 insertions(+), 5 deletions(-)

Index: linux/MAINTAINERS
===
--- linux.orig/MAINTAINERS  2014-04-08 14:14:57.0 -0500
+++ linux/MAINTAINERS   2014-04-08 14:25:29.0 -0500
@@ -4358,7 +4358,7 @@
 F: drivers/crypto/nx/
 
 IBM Power 842 compression accelerator
-M: Robert Jennings r...@linux.vnet.ibm.com
+M: Nathan Fontenot nf...@linux.vnet.ibm.com
 S: Supported
 F: drivers/crypto/nx/nx-842.c
 F: include/linux/nx842.h
@@ -4374,12 +4374,18 @@
 S: Supported
 F: drivers/net/ethernet/ibm/ibmveth.*
 
-IBM Power Virtual SCSI/FC Device Drivers
-M: Robert Jennings r...@linux.vnet.ibm.com
+IBM Power Virtual SCSI Device Drivers
+M: Nathan Fontenot nf...@linux.vnet.ibm.com
 L: linux-s...@vger.kernel.org
 S: Supported
-F: drivers/scsi/ibmvscsi/
-X: drivers/scsi/ibmvscsi/ibmvstgt.c
+F: drivers/scsi/ibmvscsi/ibmvscsi*
+F: drivers/scsi/ibmvscsi/viosrp.h
+
+IBM Power Virtual FC Device Drivers
+M: Brian King brk...@linux.vnet.ibm.com
+L: linux-s...@vger.kernel.org
+S: Supported
+F: drivers/scsi/ibmvscsi/ibmvfc*
 
 IBM ServeRAID RAID DRIVER
 P: Jack Hammer

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC PATCH] memory driver: make phys_index/end_phys_index reflect the start/end section number

2014-04-08 Thread Nathan Fontenot
On 04/08/2014 11:13 AM, Dave Hansen wrote:
> On 04/08/2014 01:27 AM, Li Zhong wrote:
>> If Dave and others don't have further objections, it seems this small
>> userspace incompatibility could be accepted by most of us, and I don't
>> need to make a version 2. 
> 
> Let me ask another question then.  What are the units of
> phys_index/end_phys_index?  How do we expose those units to userspace?
>

The documentation for these files just states that the files contain
the first and last section id of memory in the memory block for
phys_index and end_phys_index respectively.

I'm not sure the values have ever been units of anything, at least not
that I remember.

-Nathan
 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC PATCH] memory driver: make phys_index/end_phys_index reflect the start/end section number

2014-04-08 Thread Nathan Fontenot
On 04/08/2014 11:13 AM, Dave Hansen wrote:
> On 04/08/2014 01:27 AM, Li Zhong wrote:
>> If Dave and others don't have further objections, it seems this small
>> userspace incompatibility could be accepted by most of us, and I don't
>> need to make a version 2. 
> 
> Let me ask another question then.  What are the units of
> phys_index/end_phys_index?  How do we expose those units to userspace?


The documentation for these files just states that the files contain
the first and last section id of memory in the memory block for
phys_index and end_phys_index respectively.

I'm not sure the values have ever been units of anything, at least not
that I remember.

-Nathan
 

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v2] powerpc/le: enable RTAS events support

2014-04-07 Thread Nathan Fontenot
On 04/04/2014 02:35 AM, Greg Kurz wrote:
> The current kernel code assumes big endian and parses RTAS events all
> wrong. The most visible effect is that we cannot honor EPOW events,
> meaning, for example, we cannot shut down a guest properly from the
> hypervisor.
> 
> This new patch is largely inspired by Nathan's work: we get rid of all
> the bit fields in the RTAS event structures (even the unused ones, for
> consistency). We also introduce endian safe accessors for the fields used
> by the kernel (trivial rtas_error_type() accessor added for consistency).
> 
> Cc: Nathan Fontenot 
> Signed-off-by: Greg Kurz 

Looks good, thanks for getting this done Greg.
-Nathan

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v2] powerpc/le: enable RTAS events support

2014-04-07 Thread Nathan Fontenot
On 04/04/2014 02:35 AM, Greg Kurz wrote:
> The current kernel code assumes big endian and parses RTAS events all
> wrong. The most visible effect is that we cannot honor EPOW events,
> meaning, for example, we cannot shut down a guest properly from the
> hypervisor.
> 
> This new patch is largely inspired by Nathan's work: we get rid of all
> the bit fields in the RTAS event structures (even the unused ones, for
> consistency). We also introduce endian safe accessors for the fields used
> by the kernel (trivial rtas_error_type() accessor added for consistency).
> 
> Cc: Nathan Fontenot <nf...@linux.vnet.ibm.com>
> Signed-off-by: Greg Kurz <gk...@linux.vnet.ibm.com>

Looks good, thanks for getting this done Greg.
-Nathan

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC PATCH] powerpc/le: enable RTAS events support

2014-03-31 Thread Nathan Fontenot
This is the patch that I worked up at the same time as Greg, the
biggest difference being that I took the approach of doing and's,
and shifting as opposed to re-defining the bit fields for LE.

One other difference is that I left out defines for bits in the
error log structures that we currently do not use. I did leave the
comments in the structs describing the bit layout for future reference
but did not feel we needed to provide a define for all of them.

NOTE: This patch has not been tested.

-Nathan

---
 arch/powerpc/include/asm/rtas.h   |   92 +++-
 arch/powerpc/kernel/rtas.c|   24 ++--
 arch/powerpc/kernel/rtasd.c   |   11 ++--
 arch/powerpc/platforms/pseries/mobility.c |2 +-
 arch/powerpc/platforms/pseries/ras.c  |   18 --
 5 files changed, 97 insertions(+), 50 deletions(-)

diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h
index a0e1add..6efa1b6 100644
--- a/arch/powerpc/include/asm/rtas.h
+++ b/arch/powerpc/include/asm/rtas.h
@@ -150,19 +150,45 @@ struct rtas_suspend_me_data {
 #define RTAS_VECTOR_EXTERNAL_INTERRUPT 0x500
 
 struct rtas_error_log {
-   unsigned long version:8;/* Architectural version */
-   unsigned long severity:3;   /* Severity level of error */
-   unsigned long disposition:2;/* Degree of recovery */
-   unsigned long extended:1;   /* extended log present? */
-   unsigned long /* reserved */ :2;/* Reserved for future use */
-   unsigned long initiator:4;  /* Initiator of event */
-   unsigned long target:4; /* Target of failed operation */
-   unsigned long type:8;   /* General event or error*/
-   unsigned long extended_log_length:32;   /* length in bytes */
-   unsigned char buffer[1];/* Start of extended log */
+   /* Byte 0 */
+   uint8_t version;/* Architectural version */
+
+   /* Byte 1 */
+   uint8_t severity;
+   /* 
+* XXX  3: Severity level of error
+*XX2: Degree of recovery
+*  X   1: Extended log present?
+*   XX 2: Reserved
+*/
+   
+   /* Byte 2 */
+   uint8_t :8;
+   /* 
+*  4: Initiator of event
+*  4: Target of failed operation
+*/
+   uint8_t type;   /* General event or error*/
+   uint32_textended_log_length;/* length in bytes */
+   unsigned char   buffer[1];  /* Start of extended log */
/* Variable length.  */
 };
 
+static inline uint8_t rtas_error_severity(struct rtas_error_log *elog)
+{
+   return (elog->severity & 0xE0) >> 5;
+}
+
+static inline uint8_t rtas_error_disposition(struct rtas_error_log *elog)
+{
+   return (elog->severity & 0x18) >> 3;
+}
+
+static inline uint8_t rtas_error_extended(struct rtas_error_log *elog)
+{
+   return elog->severity & 0x04;
+}
+
 #define RTAS_V6EXT_LOG_FORMAT_EVENT_LOG14
 
 #define RTAS_V6EXT_COMPANY_ID_IBM  (('I' << 24) | ('B' << 16) | ('M' << 8))
@@ -172,34 +198,40 @@ struct rtas_error_log {
  */
 struct rtas_ext_event_log_v6 {
/* Byte 0 */
-   uint32_t log_valid:1;   /* 1:Log valid */
-   uint32_t unrecoverable_error:1; /* 1:Unrecoverable error */
-   uint32_t recoverable_error:1;   /* 1:recoverable (correctable   */
-   /*   or successfully retried)   */
-   uint32_t degraded_operation:1;  /* 1:Unrecoverable err, bypassed*/
-   /*   - degraded operation (e.g. */
-   /*   CPU or mem taken off-line) */
-   uint32_t predictive_error:1;
-   uint32_t new_log:1; /* 1:"New" log (Always 1 for*/
-   /*   data returned from RTAS*/
-   uint32_t big_endian:1;  /* 1: Big endian */
-   uint32_t :1;/* reserved */
+   uint8_t :8;
+   /* 
+* X1: Log valid
+*  X   1: Unrecoverable error
+*   X  1: Recoverable (correctable or successfully retried)
+*X 1: Unrecoverable err, bypassed - degraded operation
+* (e.g. CPU or mem taken off-line)
+* X1: Predictive error
+*  X   1: "New" log (Always 1 for data returned from RTAS)
+*   X  1: Big endian
+*X 1: reserved
+*/
+   
/* Byte 1 */
-   uint32_t :8;/* reserved */
+   uint8_t :8; /* reserved */
/* Byte 2 */
-   uint32_t powerpc_format:1;  /* Set to 1 (indicating log is  */
-

Re: [RFC PATCH] powerpc/le: enable RTAS events support

2014-03-31 Thread Nathan Fontenot
This is the patch that I worked up at the same time as Greg, the
biggest difference being that I took the approach of doing and's,
and shifting as opposed to re-defining the bit fields for LE.

One other difference is that I left out defines for bits in the
error log structures that we currently do not use. I did leave the
comments in the structs describing the bit layout for future reference
but did not feel we needed to provide a define for all of them.

NOTE: This patch has not been tested.

-Nathan

---
 arch/powerpc/include/asm/rtas.h   |   92 +++-
 arch/powerpc/kernel/rtas.c|   24 ++--
 arch/powerpc/kernel/rtasd.c   |   11 ++--
 arch/powerpc/platforms/pseries/mobility.c |2 +-
 arch/powerpc/platforms/pseries/ras.c  |   18 --
 5 files changed, 97 insertions(+), 50 deletions(-)

diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h
index a0e1add..6efa1b6 100644
--- a/arch/powerpc/include/asm/rtas.h
+++ b/arch/powerpc/include/asm/rtas.h
@@ -150,19 +150,45 @@ struct rtas_suspend_me_data {
 #define RTAS_VECTOR_EXTERNAL_INTERRUPT 0x500
 
 struct rtas_error_log {
-   unsigned long version:8;/* Architectural version */
-   unsigned long severity:3;   /* Severity level of error */
-   unsigned long disposition:2;/* Degree of recovery */
-   unsigned long extended:1;   /* extended log present? */
-   unsigned long /* reserved */ :2;/* Reserved for future use */
-   unsigned long initiator:4;  /* Initiator of event */
-   unsigned long target:4; /* Target of failed operation */
-   unsigned long type:8;   /* General event or error*/
-   unsigned long extended_log_length:32;   /* length in bytes */
-   unsigned char buffer[1];/* Start of extended log */
+   /* Byte 0 */
+   uint8_t version;/* Architectural version */
+
+   /* Byte 1 */
+   uint8_t severity;
+   /* 
+* XXX  3: Severity level of error
+*XX2: Degree of recovery
+*  X   1: Extended log present?
+*   XX 2: Reserved
+*/
+   
+   /* Byte 2 */
+   uint8_t :8;
+   /* 
+*  4: Initiator of event
+*  4: Target of failed operation
+*/
+   uint8_t type;   /* General event or error*/
+   uint32_textended_log_length;/* length in bytes */
+   unsigned char   buffer[1];  /* Start of extended log */
/* Variable length.  */
 };
 
+static inline uint8_t rtas_error_severity(struct rtas_error_log *elog)
+{
+   return (elog->severity & 0xE0) >> 5;
+}
+
+static inline uint8_t rtas_error_disposition(struct rtas_error_log *elog)
+{
+   return (elog->severity & 0x18) >> 3;
+}
+
+static inline uint8_t rtas_error_extended(struct rtas_error_log *elog)
+{
+   return elog->severity & 0x04;
+}
+
 #define RTAS_V6EXT_LOG_FORMAT_EVENT_LOG14
 
 #define RTAS_V6EXT_COMPANY_ID_IBM  (('I' << 24) | ('B' << 16) | ('M' << 8))
@@ -172,34 +198,40 @@ struct rtas_error_log {
  */
 struct rtas_ext_event_log_v6 {
/* Byte 0 */
-   uint32_t log_valid:1;   /* 1:Log valid */
-   uint32_t unrecoverable_error:1; /* 1:Unrecoverable error */
-   uint32_t recoverable_error:1;   /* 1:recoverable (correctable   */
-   /*   or successfully retried)   */
-   uint32_t degraded_operation:1;  /* 1:Unrecoverable err, bypassed*/
-   /*   - degraded operation (e.g. */
-   /*   CPU or mem taken off-line) */
-   uint32_t predictive_error:1;
-   uint32_t new_log:1; /* 1:New log (Always 1 for*/
-   /*   data returned from RTAS*/
-   uint32_t big_endian:1;  /* 1: Big endian */
-   uint32_t :1;/* reserved */
+   uint8_t :8;
+   /*
+    * X          1: Log valid
+    *  X         1: Unrecoverable error
+    *   X        1: Recoverable (correctable or successfully retried)
+    *    X       1: Unrecoverable err, bypassed - degraded operation
+    *               (e.g. CPU or mem taken off-line)
+    *     X      1: Predictive error
+    *      X     1: New log (Always 1 for data returned from RTAS)
+    *       X    1: Big endian
+    *        X   1: reserved
+    */
+   
/* Byte 1 */
-   uint32_t :8;/* reserved */
+   uint8_t :8; /* reserved */
/* Byte 2 */
-   uint32_t powerpc_format:1;  /* Set to 1 (indicating log is  */
-

Re: [PATCH] powerpc/le: enable RTAS events support

2014-03-28 Thread Nathan Fontenot
Greg,

There is one more place that needs fixing up: mobility_rtas_call() and
handle_rtas_event() in arch/powerpc/platforms/pseries/mobility.c.

This relates to rtas event handling for PRRN notifications. We need to
convert the scope variable (PRRN notifications re-use the extended log
length field) when it is read from the rtas event in handle_rtas_event(),
then convert it back to big endian in mobility_rtas_call().

The double conversion seems yucky, but mobility_rtas_call() can be called
from other places where the scope variable is not originally big endian.

-Nathan

On 03/28/2014 02:33 AM, Greg Kurz wrote:
> The current kernel code assumes big endian and parses RTAS events all
> wrong. The most visible effect is that we cannot honor EPOW events,
> meaning, for example, we cannot shut down a guest properly from the
> hypervisor.
> 
> This patch fixes that.
> 
> Signed-off-by: Greg Kurz 
> ---
>  arch/powerpc/include/asm/rtas.h  |   46 
> ++
>  arch/powerpc/kernel/rtas.c   |   11 
>  arch/powerpc/kernel/rtasd.c  |8 --
>  arch/powerpc/platforms/pseries/ras.c |3 +-
>  4 files changed, 59 insertions(+), 9 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h
> index 9bd52c6..8bb99d0 100644
> --- a/arch/powerpc/include/asm/rtas.h
> +++ b/arch/powerpc/include/asm/rtas.h
> @@ -150,15 +150,37 @@ struct rtas_suspend_me_data {
>  #define RTAS_VECTOR_EXTERNAL_INTERRUPT   0x500
>  
>  struct rtas_error_log {
> +#ifdef __BIG_ENDIAN__
> + /* Byte 0 */
>   unsigned long version:8;/* Architectural version */
> + /* Byte 1 */
>   unsigned long severity:3;   /* Severity level of error */
>   unsigned long disposition:2;/* Degree of recovery */
>   unsigned long extended:1;   /* extended log present? */
>   unsigned long /* reserved */ :2;/* Reserved for future use */
> + /* Byte 2 */
>   unsigned long initiator:4;  /* Initiator of event */
>   unsigned long target:4; /* Target of failed operation */
> + /* Byte 3 */
>   unsigned long type:8;   /* General event or error*/
> + /* Byte 4 */
>   unsigned long extended_log_length:32;   /* length in bytes */
> +#else
> + /* Byte 0 */
> + unsigned long version:8;
> + /* Byte 1 */
> + unsigned long :2;
> + unsigned long extended:1;
> + unsigned long disposition:2;
> + unsigned long severity:3;
> + unsigned long target:4;
> + /* Byte 2 */
> + unsigned long initiator:4;
> + /* Byte 3 */
> + unsigned long type:8;
> + /* Byte 4 */
> + unsigned long extended_log_length:32;
> +#endif
>   unsigned char buffer[1];/* Start of extended log */
>   /* Variable length.  */
>  };
> @@ -171,6 +193,7 @@ struct rtas_error_log {
>   * from "buffer" field of struct rtas_error_log defined above.
>   */
>  struct rtas_ext_event_log_v6 {
> +#ifdef __BIG_ENDIAN__
>   /* Byte 0 */
>   uint32_t log_valid:1;   /* 1:Log valid */
>   uint32_t unrecoverable_error:1; /* 1:Unrecoverable error */
> @@ -200,6 +223,29 @@ struct rtas_ext_event_log_v6 {
>   uint32_t company_id;/* Company ID of the company*/
>   /* that defines the format for  */
>   /* the vendor specific log type */
> +#else
> + /* Byte 0 */
> + uint32_t :1;
> + uint32_t big_endian:1;
> + uint32_t new_log:1;
> + uint32_t predictive_error:1;
> + uint32_t degraded_operation:1;
> + uint32_t recoverable_error:1;
> + uint32_t unrecoverable_error:1;
> + uint32_t log_valid:1;
> + /* Byte 1 */
> + uint32_t :8;
> + /* Byte 2 */
> + uint32_t log_format:4;
> + uint32_t :3;
> + uint32_t powerpc_format:1;
> + /* Byte 3 */
> + uint32_t :8;
> + /* Byte 4-11 */
> + uint8_t reserved[8];
> + /* Byte 12-15 */
> + uint32_t company_id;
> +#endif
>   /* Byte 16-end of log */
>   uint8_t vendor_log[1];  /* Start of vendor specific log */
>   /* Variable length. */
> diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
> index f386296..e18ab12 100644
> --- a/arch/powerpc/kernel/rtas.c
> +++ b/arch/powerpc/kernel/rtas.c
> @@ -993,21 +993,22 @@ struct pseries_errorlog *get_pseries_errorlog(struct 
> rtas_error_log *log,
>   (struct rtas_ext_event_log_v6 *)log->buffer;
>   struct pseries_errorlog *sect;
>   unsigned char *p, *log_end;
> + uint32_t extended_log_length = be32_to_cpu(log->extended_log_length);
>  
>   /* Check that we understand the format */
> - if (log->extended_log_length < sizeof(struct rtas_ext_event_log_v6) ||
> + if (extended_log_length < sizeof(struct 

Re: [PATCH] powerpc/le: enable RTAS events support

2014-03-28 Thread Nathan Fontenot
Greg,

There is one more place that needs fixing up: mobility_rtas_call() and
handle_rtas_event() in arch/powerpc/platforms/pseries/mobility.c.

This relates to rtas event handling for PRRN notifications. We need to
convert the scope variable (PRRN notifications re-use the extended log
length field) when it is read from the rtas event in handle_rtas_event(),
then convert it back to big endian in mobility_rtas_call().

The double conversion seems yucky, but mobility_rtas_call() can be called
from other places where the scope variable is not originally big endian.

-Nathan

On 03/28/2014 02:33 AM, Greg Kurz wrote:
 The current kernel code assumes big endian and parses RTAS events all
 wrong. The most visible effect is that we cannot honor EPOW events,
 meaning, for example, we cannot shut down a guest properly from the
 hypervisor.
 
 This patch fixes that.
 
 Signed-off-by: Greg Kurz gk...@linux.vnet.ibm.com
 ---
  arch/powerpc/include/asm/rtas.h  |   46 
 ++
  arch/powerpc/kernel/rtas.c   |   11 
  arch/powerpc/kernel/rtasd.c  |8 --
  arch/powerpc/platforms/pseries/ras.c |3 +-
  4 files changed, 59 insertions(+), 9 deletions(-)
 
 diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h
 index 9bd52c6..8bb99d0 100644
 --- a/arch/powerpc/include/asm/rtas.h
 +++ b/arch/powerpc/include/asm/rtas.h
 @@ -150,15 +150,37 @@ struct rtas_suspend_me_data {
  #define RTAS_VECTOR_EXTERNAL_INTERRUPT   0x500
  
  struct rtas_error_log {
 +#ifdef __BIG_ENDIAN__
 + /* Byte 0 */
   unsigned long version:8;/* Architectural version */
 + /* Byte 1 */
   unsigned long severity:3;   /* Severity level of error */
   unsigned long disposition:2;/* Degree of recovery */
   unsigned long extended:1;   /* extended log present? */
   unsigned long /* reserved */ :2;/* Reserved for future use */
 + /* Byte 2 */
   unsigned long initiator:4;  /* Initiator of event */
   unsigned long target:4; /* Target of failed operation */
 + /* Byte 3 */
   unsigned long type:8;   /* General event or error*/
 + /* Byte 4 */
   unsigned long extended_log_length:32;   /* length in bytes */
 +#else
 + /* Byte 0 */
 + unsigned long version:8;
 + /* Byte 1 */
 + unsigned long :2;
 + unsigned long extended:1;
 + unsigned long disposition:2;
 + unsigned long severity:3;
 + unsigned long target:4;
 + /* Byte 2 */
 + unsigned long initiator:4;
 + /* Byte 3 */
 + unsigned long type:8;
 + /* Byte 4 */
 + unsigned long extended_log_length:32;
 +#endif
   unsigned char buffer[1];/* Start of extended log */
   /* Variable length.  */
  };
 @@ -171,6 +193,7 @@ struct rtas_error_log {
   * from "buffer" field of struct rtas_error_log defined above.
   */
  struct rtas_ext_event_log_v6 {
 +#ifdef __BIG_ENDIAN__
   /* Byte 0 */
   uint32_t log_valid:1;   /* 1:Log valid */
   uint32_t unrecoverable_error:1; /* 1:Unrecoverable error */
 @@ -200,6 +223,29 @@ struct rtas_ext_event_log_v6 {
   uint32_t company_id;/* Company ID of the company*/
   /* that defines the format for  */
   /* the vendor specific log type */
 +#else
 + /* Byte 0 */
 + uint32_t :1;
 + uint32_t big_endian:1;
 + uint32_t new_log:1;
 + uint32_t predictive_error:1;
 + uint32_t degraded_operation:1;
 + uint32_t recoverable_error:1;
 + uint32_t unrecoverable_error:1;
 + uint32_t log_valid:1;
 + /* Byte 1 */
 + uint32_t :8;
 + /* Byte 2 */
 + uint32_t log_format:4;
 + uint32_t :3;
 + uint32_t powerpc_format:1;
 + /* Byte 3 */
 + uint32_t :8;
 + /* Byte 4-11 */
 + uint8_t reserved[8];
 + /* Byte 12-15 */
 + uint32_t company_id;
 +#endif
   /* Byte 16-end of log */
   uint8_t vendor_log[1];  /* Start of vendor specific log */
   /* Variable length. */
 diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
 index f386296..e18ab12 100644
 --- a/arch/powerpc/kernel/rtas.c
 +++ b/arch/powerpc/kernel/rtas.c
 @@ -993,21 +993,22 @@ struct pseries_errorlog *get_pseries_errorlog(struct 
 rtas_error_log *log,
   (struct rtas_ext_event_log_v6 *)log->buffer;
   struct pseries_errorlog *sect;
   unsigned char *p, *log_end;
 + uint32_t extended_log_length = be32_to_cpu(log->extended_log_length);
  
   /* Check that we understand the format */
 - if (log->extended_log_length < sizeof(struct rtas_ext_event_log_v6) ||
 + if (extended_log_length < sizeof(struct rtas_ext_event_log_v6) ||
   ext_log->log_format != RTAS_V6EXT_LOG_FORMAT_EVENT_LOG ||
 -   

  1   2   >