[PATCH 2/5] perf, amd: Generalize northbridge constraints code for family 15h

2012-11-26 Thread Jacob Shin
From: Robert Richter 

Generalize northbridge constraints code for family 10h so that later
we can reuse the same code path with other AMD processor families that
have the same northbridge event constraints.
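[Illustrative aside, not part of this patch: once the NB logic is factored out like this, a caller can hand in a family-specific constraint mask instead of &unconstrained. A minimal sketch of what such a wrapper could look like -- the function name is hypothetical, and the amd_nb_event_constraint pointer is only introduced by a later patch in this series (it is assumed NULL on family 10h):

	/* sketch only: pass a family-specific NB constraint when one exists */
	static struct event_constraint *
	f15h_nb_get_event_constraints(struct cpu_hw_events *cpuc,
				      struct perf_event *event)
	{
		if (!(amd_has_nb(cpuc) && amd_is_nb_event(&event->hw)))
			return &unconstrained;

		return __amd_get_nb_event_constraints(cpuc, event,
				amd_nb_event_constraint ?: &unconstrained);
	}
]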

Signed-off-by: Robert Richter 
Signed-off-by: Jacob Shin 
---
 arch/x86/kernel/cpu/perf_event_amd.c |   43 --
 1 file changed, 25 insertions(+), 18 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_amd.c 
b/arch/x86/kernel/cpu/perf_event_amd.c
index d60c5c7..04ef43f 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -188,20 +188,13 @@ static inline int amd_has_nb(struct cpu_hw_events *cpuc)
return nb && nb->nb_id != -1;
 }
 
-static void amd_put_event_constraints(struct cpu_hw_events *cpuc,
- struct perf_event *event)
+static void __amd_put_nb_event_constraints(struct cpu_hw_events *cpuc,
+  struct perf_event *event)
 {
-   struct hw_perf_event *hwc = &event->hw;
struct amd_nb *nb = cpuc->amd_nb;
int i;
 
/*
-* only care about NB events
-*/
-   if (!(amd_has_nb(cpuc) && amd_is_nb_event(hwc)))
-   return;
-
-   /*
 * need to scan whole list because event may not have
 * been assigned during scheduling
 *
@@ -247,12 +240,13 @@ static void amd_put_event_constraints(struct 
cpu_hw_events *cpuc,
   *
   * Given that resources are allocated (cmpxchg), they must be
   * eventually freed for others to use. This is accomplished by
-  * calling amd_put_event_constraints().
+  * calling __amd_put_nb_event_constraints()
   *
   * Non NB events are not impacted by this restriction.
   */
 static struct event_constraint *
-amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
+__amd_get_nb_event_constraints(struct cpu_hw_events *cpuc, struct perf_event 
*event,
+  struct event_constraint *c)
 {
struct hw_perf_event *hwc = &event->hw;
struct amd_nb *nb = cpuc->amd_nb;
@@ -260,12 +254,6 @@ amd_get_event_constraints(struct cpu_hw_events *cpuc, 
struct perf_event *event)
int idx, new = -1;
 
/*
-* if not NB event or no NB, then no constraints
-*/
-   if (!(amd_has_nb(cpuc) && amd_is_nb_event(hwc)))
-   return &unconstrained;
-
-   /*
 * detect if already present, if so reuse
 *
 * cannot merge with actual allocation
@@ -275,7 +263,7 @@ amd_get_event_constraints(struct cpu_hw_events *cpuc, 
struct perf_event *event)
 * because of successive calls to x86_schedule_events() from
 * hw_perf_group_sched_in() without hw_perf_enable()
 */
-   for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+   for_each_set_bit(idx, c->idxmsk, X86_PMC_IDX_MAX) {
if (new == -1 || hwc->idx == idx)
/* assign free slot, prefer hwc->idx */
old = cmpxchg(nb->owners + idx, NULL, event);
@@ -391,6 +379,25 @@ static void amd_pmu_cpu_dead(int cpu)
}
 }
 
+static struct event_constraint *
+amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
+{
+   /*
+* if not NB event or no NB, then no constraints
+*/
+   if (!(amd_has_nb(cpuc) && amd_is_nb_event(&event->hw)))
+   return &unconstrained;
+
+   return __amd_get_nb_event_constraints(cpuc, event, &unconstrained);
+}
+
+static void amd_put_event_constraints(struct cpu_hw_events *cpuc,
+ struct perf_event *event)
+{
+   if (amd_has_nb(cpuc) && amd_is_nb_event(&event->hw))
+   __amd_put_nb_event_constraints(cpuc, event);
+}
+
 PMU_FORMAT_ATTR(event, "config:0-7,32-35");
 PMU_FORMAT_ATTR(umask, "config:8-15"   );
 PMU_FORMAT_ATTR(edge,  "config:18" );
-- 
1.7.9.5




[PATCH V3 0/5] perf, amd: Enable AMD family 15h northbridge counters

2012-11-26 Thread Jacob Shin
The following patchset enables 4 additional performance counters in
AMD family 15h processors that count northbridge events -- such as
the number of DRAM accesses.

This patchset is based on previous work done by Robert Richter
 :

https://lkml.org/lkml/2012/6/19/324

The main differences are:

* The northbridge counters are indexed contiguously right above the
  core performance counters (see the illustrative layout sketch after
  this list).

* MSR address offset calculations are moved to architecture specific
  files.

* Interrupts are set up to be delivered only to a single core.
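As a rough illustration of the first two points (not taken from the patches themselves, and assuming the 6 core + 4 NB counters described above), the resulting counter layout is:

	/*
	 * Illustrative sketch only:
	 *
	 *   index 0..5 : core counters, PERF_CTL at 0xc0010200 + 2*index
	 *   index 6..9 : NB counters,   NB_PERF_CTL at 0xc0010240 + 2*(index - 6)
	 *
	 * (the matching PERF_CTR is always PERF_CTL + 1, and the RDPMC ECX
	 * value of a counter is simply its index)
	 */
	static unsigned int f15h_perf_ctl_addr(int index)
	{
		if (index < 6)				/* core counters */
			return 0xc0010200 + 2 * index;
		return 0xc0010240 + 2 * (index - 6);	/* NB counters */
	}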

V3:
Addressed the following feedback/comments from Robert's review
* https://lkml.org/lkml/2012/11/16/484
* https://lkml.org/lkml/2012/11/26/162

V2:
Separate out Robert's patches, and add properly ordered certificates of
origin.

Jacob Shin (3):
  perf, x86: Move MSR address offset calculation to architecture
specific files
  perf, amd: Enable northbridge performance counters on AMD family 15h
  perf, amd: Use proper naming scheme for AMD bit field definitions

Robert Richter (2):
  perf, amd: Rework northbridge event constraints handler
  perf, amd: Generalize northbridge constraints code for family 15h

 arch/x86/include/asm/cpufeature.h|2 +
 arch/x86/include/asm/msr-index.h |2 +
 arch/x86/include/asm/perf_event.h|   13 +-
 arch/x86/kernel/cpu/perf_event.h |   21 +--
 arch/x86/kernel/cpu/perf_event_amd.c |  269 --
 5 files changed, 211 insertions(+), 96 deletions(-)

-- 
1.7.9.5




[PATCH 3/5] perf, x86: Move MSR address offset calculation to architecture specific files

2012-11-26 Thread Jacob Shin
Move the counter-index-to-MSR-address-offset calculation into
architecture specific files. This prepares the way for perf_event_amd
to support counter addresses that are not contiguous -- for example,
AMD Family 15h processors have 6 core performance counters starting at
0xc0010200 and 4 northbridge performance counters starting at
0xc0010240.
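A quick worked example (illustrative only; it relies on setup_perfctr_core() elsewhere in this file switching the bases to MSR_F15H_PERF_CTL/MSR_F15H_PERF_CTR on CPUs with the core counter extensions):

	/*
	 * CPU with core performance counter extensions, index 3:
	 *   offset              = 3 << 1         = 6
	 *   x86_pmu_config_addr = 0xc0010200 + 6  = 0xc0010206
	 *   x86_pmu_event_addr  = 0xc0010201 + 6  = 0xc0010207
	 *
	 * Legacy CPU (bases stay at MSR_K7_EVNTSEL0/MSR_K7_PERFCTR0), index 3:
	 *   offset              = 3
	 *   x86_pmu_config_addr = 0xc0010000 + 3  = 0xc0010003
	 *   x86_pmu_event_addr  = 0xc0010004 + 3  = 0xc0010007
	 */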

Signed-off-by: Jacob Shin 
---
 arch/x86/kernel/cpu/perf_event.h |   21 +---
 arch/x86/kernel/cpu/perf_event_amd.c |   35 ++
 2 files changed, 40 insertions(+), 16 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 271d257..aacf025 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -325,6 +325,7 @@ struct x86_pmu {
int (*schedule_events)(struct cpu_hw_events *cpuc, int n, 
int *assign);
unsignedeventsel;
unsignedperfctr;
+   int (*addr_offset)(int index);
u64 (*event_map)(int);
int max_events;
int num_counters;
@@ -444,28 +445,16 @@ extern u64 __read_mostly hw_cache_extra_regs
 
 u64 x86_perf_event_update(struct perf_event *event);
 
-static inline int x86_pmu_addr_offset(int index)
-{
-   int offset;
-
-   /* offset = X86_FEATURE_PERFCTR_CORE ? index << 1 : index */
-   alternative_io(ASM_NOP2,
-  "shll $1, %%eax",
-  X86_FEATURE_PERFCTR_CORE,
-  "=a" (offset),
-  "a"  (index));
-
-   return offset;
-}
-
 static inline unsigned int x86_pmu_config_addr(int index)
 {
-   return x86_pmu.eventsel + x86_pmu_addr_offset(index);
+   return x86_pmu.eventsel +
+   (x86_pmu.addr_offset ? x86_pmu.addr_offset(index) : index);
 }
 
 static inline unsigned int x86_pmu_event_addr(int index)
 {
-   return x86_pmu.perfctr + x86_pmu_addr_offset(index);
+   return x86_pmu.perfctr +
+   (x86_pmu.addr_offset ? x86_pmu.addr_offset(index) : index);
 }
 
 int x86_setup_perfctr(struct perf_event *event);
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c 
b/arch/x86/kernel/cpu/perf_event_amd.c
index 04ef43f..d6e3337 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -132,6 +132,40 @@ static u64 amd_pmu_event_map(int hw_event)
return amd_perfmon_event_map[hw_event];
 }
 
+/*
+ * Previously calculated offsets
+ */
+static unsigned int addr_offsets[X86_PMC_IDX_MAX] __read_mostly;
+
+/*
+ * Legacy CPUs:
+ *   4 counters starting at 0xc001 each offset by 1
+ *
+ * CPUs with core performance counter extensions:
+ *   6 counters starting at 0xc0010200 each offset by 2
+ */
+static inline int amd_pmu_addr_offset(int index)
+{
+   int offset;
+
+   if (!index)
+   return index;
+
+   offset = addr_offsets[index];
+
+   if (offset)
+   return offset;
+
+   if (!cpu_has_perfctr_core)
+   offset = index;
+   else
+   offset = index << 1;
+
+   addr_offsets[index] = offset;
+
+   return offset;
+}
+
 static int amd_pmu_hw_config(struct perf_event *event)
 {
int ret;
@@ -570,6 +604,7 @@ static __initconst const struct x86_pmu amd_pmu = {
.schedule_events= x86_schedule_events,
.eventsel   = MSR_K7_EVNTSEL0,
.perfctr= MSR_K7_PERFCTR0,
+   .addr_offset= amd_pmu_addr_offset,
.event_map  = amd_pmu_event_map,
.max_events = ARRAY_SIZE(amd_perfmon_event_map),
.num_counters   = AMD64_NUM_COUNTERS,
-- 
1.7.9.5




[PATCH 4/5] perf, amd: Enable northbridge performance counters on AMD family 15h

2012-11-26 Thread Jacob Shin
On AMD family 15h processors, there are 4 new performance counters
(in addition to 6 core performance counters) that can be used for
counting northbridge events (i.e. DRAM accesses). Their bit fields are
almost identical to the core performance counters. However, unlike the
core performance counters, these MSRs are shared between multiple
cores (that share the same northbridge). We will reuse the same code
path as existing family 10h northbridge event constraints handler
logic to enforce this sharing.

These new counters are indexed contiguously right above the existing
core performance counters, and their indexes correspond to RDPMC ECX
values.
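[The full amd_pmu_addr_offset() hunk is truncated in this archive, but the idea for the NB addresses can be sketched as follows -- illustrative only, using the MSR_F15H_NB_PERF_CTL and AMD64_NUM_COUNTERS_CORE definitions from this series, and a hypothetical helper name:

	static inline unsigned int f15h_nb_config_addr(int index)
	{
		/* distance from the core PERF_CTL base to the NB PERF_CTL base */
		unsigned int nb_base = MSR_F15H_NB_PERF_CTL - MSR_F15H_PERF_CTL;

		/* NB counters sit right above the 6 core counters */
		return MSR_F15H_PERF_CTL + nb_base +
		       ((index - AMD64_NUM_COUNTERS_CORE) << 1);
	}
]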

Signed-off-by: Jacob Shin 
---
 arch/x86/include/asm/cpufeature.h|2 +
 arch/x86/include/asm/msr-index.h |2 +
 arch/x86/include/asm/perf_event.h|9 +++
 arch/x86/kernel/cpu/perf_event_amd.c |  131 --
 4 files changed, 122 insertions(+), 22 deletions(-)

diff --git a/arch/x86/include/asm/cpufeature.h 
b/arch/x86/include/asm/cpufeature.h
index 8c297aa..b05c722 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -167,6 +167,7 @@
 #define X86_FEATURE_TBM(6*32+21) /* trailing bit manipulations 
*/
 #define X86_FEATURE_TOPOEXT(6*32+22) /* topology extensions CPUID leafs */
 #define X86_FEATURE_PERFCTR_CORE (6*32+23) /* core performance counter 
extensions */
+#define X86_FEATURE_PERFCTR_NB  (6*32+24) /* NB performance counter 
extensions */
 
 /*
  * Auxiliary flags: Linux defined - For features scattered in various
@@ -308,6 +309,7 @@ extern const char * const x86_power_flags[32];
 #define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR)
 #define cpu_has_pclmulqdq  boot_cpu_has(X86_FEATURE_PCLMULQDQ)
 #define cpu_has_perfctr_core   boot_cpu_has(X86_FEATURE_PERFCTR_CORE)
+#define cpu_has_perfctr_nb boot_cpu_has(X86_FEATURE_PERFCTR_NB)
 #define cpu_has_cx8boot_cpu_has(X86_FEATURE_CX8)
 #define cpu_has_cx16   boot_cpu_has(X86_FEATURE_CX16)
 #define cpu_has_eager_fpu  boot_cpu_has(X86_FEATURE_EAGER_FPU)
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 7f0edce..e67ff1e 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -157,6 +157,8 @@
 /* Fam 15h MSRs */
 #define MSR_F15H_PERF_CTL  0xc0010200
 #define MSR_F15H_PERF_CTR  0xc0010201
+#define MSR_F15H_NB_PERF_CTL   0xc0010240
+#define MSR_F15H_NB_PERF_CTR   0xc0010241
 
 /* Fam 10h MSRs */
 #define MSR_FAM10H_MMIO_CONF_BASE  0xc0010058
diff --git a/arch/x86/include/asm/perf_event.h 
b/arch/x86/include/asm/perf_event.h
index 4fabcdf..63aba8f 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -29,9 +29,14 @@
 #define ARCH_PERFMON_EVENTSEL_INV  (1ULL << 23)
 #define ARCH_PERFMON_EVENTSEL_CMASK0xFF00ULL
 
+#define AMD64_EVENTSEL_INT_CORE_ENABLE (1ULL << 36)
 #define AMD_PERFMON_EVENTSEL_GUESTONLY (1ULL << 40)
 #define AMD_PERFMON_EVENTSEL_HOSTONLY  (1ULL << 41)
 
+#define AMD64_EVENTSEL_INT_CORE_SEL_SHIFT  37
+#define AMD64_EVENTSEL_INT_CORE_SEL_MASK   \
+   (0xFULL << AMD64_EVENTSEL_INT_CORE_SEL_SHIFT)
+
 #define AMD64_EVENTSEL_EVENT   \
(ARCH_PERFMON_EVENTSEL_EVENT | (0x0FULL << 32))
 #define INTEL_ARCH_EVENT_MASK  \
@@ -46,8 +51,12 @@
 #define AMD64_RAW_EVENT_MASK   \
(X86_RAW_EVENT_MASK  |  \
 AMD64_EVENTSEL_EVENT)
+#define AMD64_RAW_EVENT_MASK_NB\
+   (AMD64_EVENTSEL_EVENT|  \
+ARCH_PERFMON_EVENTSEL_UMASK)
 #define AMD64_NUM_COUNTERS 4
 #define AMD64_NUM_COUNTERS_CORE6
+#define AMD64_NUM_COUNTERS_NB  4
 
 #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL  0x3c
 #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK(0x00 << 8)
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c 
b/arch/x86/kernel/cpu/perf_event_amd.c
index d6e3337..5e3a6a3 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -132,6 +132,9 @@ static u64 amd_pmu_event_map(int hw_event)
return amd_perfmon_event_map[hw_event];
 }
 
+static int num_core_counters;
+static struct event_constraint *amd_nb_event_constraint;
+
 /*
  * Previously calculated offsets
  */
@@ -143,6 +146,10 @@ static unsigned int addr_offsets[X86_PMC_IDX_MAX] 
__read_mostly;
  *
  * CPUs with core performance counter extensions:
  *   6 counters starting at 0xc0010200 each offset by 2
+ *
+ * CPUs with north bridge performance counter extensions:
+ *   4 additional counters starting at 0xc0010240 each offset by 2
+ *   (indexed right above either one of the above core counters)
  */
 static inline int a

[PATCH 1/5] perf, amd: Rework northbridge event constraints handler

2012-11-26 Thread Jacob Shin
From: Robert Richter 

Code simplification. No functional changes.

Signed-off-by: Robert Richter 
Signed-off-by: Jacob Shin 
---
 arch/x86/kernel/cpu/perf_event_amd.c |   68 +-
 1 file changed, 26 insertions(+), 42 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_amd.c 
b/arch/x86/kernel/cpu/perf_event_amd.c
index 4528ae7..d60c5c7 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -256,9 +256,8 @@ amd_get_event_constraints(struct cpu_hw_events *cpuc, 
struct perf_event *event)
 {
struct hw_perf_event *hwc = &event->hw;
struct amd_nb *nb = cpuc->amd_nb;
-   struct perf_event *old = NULL;
-   int max = x86_pmu.num_counters;
-   int i, j, k = -1;
+   struct perf_event *old;
+   int idx, new = -1;
 
/*
 * if not NB event or no NB, then no constraints
@@ -276,48 +275,33 @@ amd_get_event_constraints(struct cpu_hw_events *cpuc, 
struct perf_event *event)
 * because of successive calls to x86_schedule_events() from
 * hw_perf_group_sched_in() without hw_perf_enable()
 */
-   for (i = 0; i < max; i++) {
-   /*
-* keep track of first free slot
-*/
-   if (k == -1 && !nb->owners[i])
-   k = i;
+   for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+   if (new == -1 || hwc->idx == idx)
+   /* assign free slot, prefer hwc->idx */
+   old = cmpxchg(nb->owners + idx, NULL, event);
+   else if (nb->owners[idx] == event)
+   /* event already present */
+   old = event;
+   else
+   continue;
+
+   if (old && old != event)
+   continue;
+
+   /* reassign to this slot */
+   if (new != -1)
+   cmpxchg(nb->owners + new, event, NULL);
+   new = idx;
 
/* already present, reuse */
-   if (nb->owners[i] == event)
-   goto done;
-   }
-   /*
-* not present, so grab a new slot
-* starting either at:
-*/
-   if (hwc->idx != -1) {
-   /* previous assignment */
-   i = hwc->idx;
-   } else if (k != -1) {
-   /* start from free slot found */
-   i = k;
-   } else {
-   /*
-* event not found, no slot found in
-* first pass, try again from the
-* beginning
-*/
-   i = 0;
-   }
-   j = i;
-   do {
-   old = cmpxchg(nb->owners+i, NULL, event);
-   if (!old)
+   if (old == event)
break;
-   if (++i == max)
-   i = 0;
-   } while (i != j);
-done:
-   if (!old)
-   return &nb->event_constraints[i];
-
-   return &emptyconstraint;
+   }
+
+   if (new == -1)
+   return &emptyconstraint;
+
+   return &nb->event_constraints[new];
 }
 
 static struct amd_nb *amd_alloc_nb(int cpu)
-- 
1.7.9.5




[PATCH 5/5] perf, amd: Use proper naming scheme for AMD bit field definitions

2012-11-26 Thread Jacob Shin
Update these AMD bit field names to be consistent with naming
convention followed by the rest of the file.

Signed-off-by: Jacob Shin 
---
 arch/x86/include/asm/perf_event.h|4 ++--
 arch/x86/kernel/cpu/perf_event_amd.c |8 
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/perf_event.h 
b/arch/x86/include/asm/perf_event.h
index 63aba8f..57cb634 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -30,8 +30,8 @@
 #define ARCH_PERFMON_EVENTSEL_CMASK0xFF00ULL
 
 #define AMD64_EVENTSEL_INT_CORE_ENABLE (1ULL << 36)
-#define AMD_PERFMON_EVENTSEL_GUESTONLY (1ULL << 40)
-#define AMD_PERFMON_EVENTSEL_HOSTONLY  (1ULL << 41)
+#define AMD64_EVENTSEL_GUESTONLY   (1ULL << 40)
+#define AMD64_EVENTSEL_HOSTONLY(1ULL << 41)
 
 #define AMD64_EVENTSEL_INT_CORE_SEL_SHIFT  37
 #define AMD64_EVENTSEL_INT_CORE_SEL_MASK   \
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c 
b/arch/x86/kernel/cpu/perf_event_amd.c
index 5e3a6a3..0faa35f 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -188,9 +188,9 @@ static int amd_core_hw_config(struct perf_event *event)
event->hw.config &= ~(ARCH_PERFMON_EVENTSEL_USR |
  ARCH_PERFMON_EVENTSEL_OS);
else if (event->attr.exclude_host)
-   event->hw.config |= AMD_PERFMON_EVENTSEL_GUESTONLY;
+   event->hw.config |= AMD64_EVENTSEL_GUESTONLY;
else if (event->attr.exclude_guest)
-   event->hw.config |= AMD_PERFMON_EVENTSEL_HOSTONLY;
+   event->hw.config |= AMD64_EVENTSEL_HOSTONLY;
 
return 0;
 }
@@ -429,7 +429,7 @@ static void amd_pmu_cpu_starting(int cpu)
struct amd_nb *nb;
int i, nb_id;
 
-   cpuc->perf_ctr_virt_mask = AMD_PERFMON_EVENTSEL_HOSTONLY;
+   cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY;
 
if (boot_cpu_data.x86_max_cores < 2)
return;
@@ -782,7 +782,7 @@ void amd_pmu_disable_virt(void)
 * SVM is disabled the Guest-only bits still gets set and the counter
 * will not count anything.
 */
-   cpuc->perf_ctr_virt_mask = AMD_PERFMON_EVENTSEL_HOSTONLY;
+   cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY;
 
/* Reload all events */
x86_pmu_disable_all();
-- 
1.7.9.5




Re: [PATCH 4/5] perf, amd: Enable northbridge performance counters on AMD family 15h

2012-11-27 Thread Jacob Shin
On Tue, Nov 27, 2012 at 01:10:51PM +0100, Robert Richter wrote:
> One minor comment:
> 
> On 26.11.12 16:48:30, Jacob Shin wrote:
> >  __init int amd_pmu_init(void)
> >  {
> > /* Performance-monitoring supported from K7 and later: */
> > @@ -666,6 +749,10 @@ __init int amd_pmu_init(void)
> > setup_event_constraints();
> > setup_perfctr_core();
> >  
> > +   num_core_counters = x86_pmu.num_counters;
> 
> I would better move this to setup_perfctr_nb().

Okay, see revised patch 4/5 below

> 
> > +
> > +   setup_perfctr_nb();
> > +
> > /* Events are common for all AMDs */
> > memcpy(hw_cache_event_ids, amd_hw_cache_event_ids,
> >sizeof(hw_cache_event_ids));
> 
> Otherwise the whole patch set looks good.
> 
> Acked-by: Robert Richter 

Great! Thanks for taking the time!

--

From 0d813d2c29740315b5c90e4e32336c492afe Mon Sep 17 00:00:00 2001
From: Jacob Shin 
Date: Mon, 26 Nov 2012 14:38:55 -0600
Subject: [PATCH 1/1] perf, amd: Enable northbridge performance counters on
 AMD family 15h

On AMD family 15h processors, there are 4 new performance counters
(in addition to 6 core performance counters) that can be used for
counting northbridge events (i.e. DRAM accesses). Their bit fields are
almost identical to the core performance counters. However, unlike the
core performance counters, these MSRs are shared between multiple
cores (that share the same northbridge). We will reuse the same code
path as existing family 10h northbridge event constraints handler
logic to enforce this sharing.

These new counters are indexed contiguously right above the existing
core performance counters, and their indexes correspond to RDPMC ECX
values.

Signed-off-by: Jacob Shin 
---
 arch/x86/include/asm/cpufeature.h|2 +
 arch/x86/include/asm/msr-index.h |2 +
 arch/x86/include/asm/perf_event.h|9 +++
 arch/x86/kernel/cpu/perf_event_amd.c |  130 --
 4 files changed, 121 insertions(+), 22 deletions(-)

diff --git a/arch/x86/include/asm/cpufeature.h 
b/arch/x86/include/asm/cpufeature.h
index 8c297aa..b05c722 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -167,6 +167,7 @@
 #define X86_FEATURE_TBM(6*32+21) /* trailing bit manipulations 
*/
 #define X86_FEATURE_TOPOEXT(6*32+22) /* topology extensions CPUID leafs */
 #define X86_FEATURE_PERFCTR_CORE (6*32+23) /* core performance counter 
extensions */
+#define X86_FEATURE_PERFCTR_NB  (6*32+24) /* NB performance counter 
extensions */
 
 /*
  * Auxiliary flags: Linux defined - For features scattered in various
@@ -308,6 +309,7 @@ extern const char * const x86_power_flags[32];
 #define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR)
 #define cpu_has_pclmulqdq  boot_cpu_has(X86_FEATURE_PCLMULQDQ)
 #define cpu_has_perfctr_core   boot_cpu_has(X86_FEATURE_PERFCTR_CORE)
+#define cpu_has_perfctr_nb boot_cpu_has(X86_FEATURE_PERFCTR_NB)
 #define cpu_has_cx8boot_cpu_has(X86_FEATURE_CX8)
 #define cpu_has_cx16   boot_cpu_has(X86_FEATURE_CX16)
 #define cpu_has_eager_fpu  boot_cpu_has(X86_FEATURE_EAGER_FPU)
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 7f0edce..e67ff1e 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -157,6 +157,8 @@
 /* Fam 15h MSRs */
 #define MSR_F15H_PERF_CTL  0xc0010200
 #define MSR_F15H_PERF_CTR  0xc0010201
+#define MSR_F15H_NB_PERF_CTL   0xc0010240
+#define MSR_F15H_NB_PERF_CTR   0xc0010241
 
 /* Fam 10h MSRs */
 #define MSR_FAM10H_MMIO_CONF_BASE  0xc0010058
diff --git a/arch/x86/include/asm/perf_event.h 
b/arch/x86/include/asm/perf_event.h
index 4fabcdf..63aba8f 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -29,9 +29,14 @@
 #define ARCH_PERFMON_EVENTSEL_INV  (1ULL << 23)
 #define ARCH_PERFMON_EVENTSEL_CMASK0xFF00ULL
 
+#define AMD64_EVENTSEL_INT_CORE_ENABLE (1ULL << 36)
 #define AMD_PERFMON_EVENTSEL_GUESTONLY (1ULL << 40)
 #define AMD_PERFMON_EVENTSEL_HOSTONLY  (1ULL << 41)
 
+#define AMD64_EVENTSEL_INT_CORE_SEL_SHIFT  37
+#define AMD64_EVENTSEL_INT_CORE_SEL_MASK   \
+   (0xFULL << AMD64_EVENTSEL_INT_CORE_SEL_SHIFT)
+
 #define AMD64_EVENTSEL_EVENT   \
(ARCH_PERFMON_EVENTSEL_EVENT | (0x0FULL << 32))
 #define INTEL_ARCH_EVENT_MASK  \
@@ -46,8 +51,12 @@
 #define AMD64_RAW_EVENT_MASK   \
(X86_RAW_EVENT_MASK  |  \
 AMD64_EVENTSEL_EVENT)
+#define AMD64_RAW_EVENT_MASK_NB\
+   (AMD64_EVENTSEL_EVENT|  \
+ARCH_PERFMON_EVENTSEL_UMASK)
 #define AMD64_NUM_COUNTERS

Re: [PATCH 4/4] perf, amd: Enable northbridge performance counters on AMD family 15h

2012-11-28 Thread Jacob Shin
Robert,

On Fri, Nov 16, 2012 at 08:32:24PM +0100, Robert Richter wrote:
> On 16.11.12 13:00:30, Jacob Shin wrote:
> > On Fri, Nov 16, 2012 at 07:43:44PM +0100, Robert Richter wrote:
> > > On 15.11.12 15:31:53, Jacob Shin wrote:
> > > > @@ -323,6 +368,16 @@ __amd_get_nb_event_constraints(struct 
> > > > cpu_hw_events *cpuc, struct perf_event *ev
> > > > if (new == -1)
> > > > return &emptyconstraint;
> > > >  
> > > > +   /* set up interrupts to be delivered only to this core */
> > > > +   if (cpu_has_perfctr_nb) {
> > > > +   struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
> > > > +
> > > > +   hwc->config |= AMD_PERFMON_EVENTSEL_INT_CORE_ENABLE;
> > > > +   hwc->config &= ~AMD_PERFMON_EVENTSEL_INT_CORE_SEL_MASK;
> > > > +   hwc->config |= (0ULL | (c->cpu_core_id)) <<
> > > > +   AMD_PERFMON_EVENTSEL_INT_CORE_SEL_SHIFT;
> > > > +   }
> > > 
> > > Looks like a hack to me. The constaints handler is only supposed to
> > > determine constraints and not to touch anything in the event's
> > > structure. This should be done later when setting up hwc->config in
> > > amd_nb_event_config() or so.
> > 
> > Hm.. is the hwc->config called after constraints have been set up
> > already? If so, I'll change it ..
> 
> Should be, since the hw register can be setup only after the counter
> is selected.

Ahh .. looking at this further, it looks like ->config is called
before constraints are set up (before we know what cpu we are going to
run on).

Sorry for not seeing this sooner, but it really looks like the event
constraints function is the right time to set up the INT_CORE_SEL bits.
Are you okay with this?

> > > I also do not think that smp_processor_id() is the right thing to do
> > > here. Since cpu_hw_events is per-cpu the cpu is already selected.
> > 
> > Yeah, I could not figure out how to get the cpu number from cpuc. Is
> > there a container_of kind of thing that I can do to get the cpu number
> > ?
> 
> At some point event->cpu is assigned, I think.

Furthermore, event->cpu can only be used if the --cpu flag is specified
from userland; otherwise event->cpu is 0x. And we do not know, until the
schedule happens, what cpu we are going to be running on.

I tried to figure out if there was a way to get from cpu_hw_events to
a cpu number, but I didn't see any obvious ways. The cpu_hw_events is
derived from __get_cpu_var from the schedule function that calls the
constraints, so smp_processor_id seems okay to use here.
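[For reference, a minimal sketch of the setup being discussed, kept in the NB constraints handler and using smp_processor_id() as argued above, with the AMD64_EVENTSEL_INT_CORE_* names from the posted patch 4/5; this is an illustration of the approach, not the exact hunk:

	/* route the NB counter interrupt to the core scheduling the event */
	if (cpu_has_perfctr_nb) {
		struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());

		hwc->config |= AMD64_EVENTSEL_INT_CORE_ENABLE;
		hwc->config &= ~AMD64_EVENTSEL_INT_CORE_SEL_MASK;
		hwc->config |= (u64)c->cpu_core_id <<
			       AMD64_EVENTSEL_INT_CORE_SEL_SHIFT;
	}
]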

..

So I'll have to change things back, unless do you have any other
ideas ?

Thanks,

-Jacob

> 
> -Robert
> 



[PATCH V6 0/6] perf, amd: Enable AMD family 15h northbridge counters

2013-02-06 Thread Jacob Shin
The following patchset enables 4 additional performance counters in
AMD family 15h processors that count northbridge events -- such as
number of DRAM accesses.

This patchset is based on previous work done by Robert Richter
 :

https://lkml.org/lkml/2012/6/19/324

The main differences are:

* The northbridge counters are indexed contiguously right above the
  core performance counters.

* MSR address offset calculations are moved to architecture specific
  files.

* Interrupts are set up to be delivered only to a single core.

V6:
Revised per feedback from Stephane Eranian. Updated to only allow
counting mode on northbridge counters.

V5:
Rebased against latest tip

V4:
* Moved the interrupt core select setup back to the event constraints
  function, since at ->hw_config time we do not yet know which CPU the
  event will run on.
* Tested and made minor revisions to make sure that the patchset is
  compatible with upcoming AMD Family 16h processors, and will support
  core and NB counters without any further patches.

V3:
Addressed the following feedback/comments from Robert's review
* https://lkml.org/lkml/2012/11/16/484
* https://lkml.org/lkml/2012/11/26/162

V2:
Separate out Robert's patches, and add properly ordered certificates of
origin.

Jacob Shin (4):
  perf, amd: Use proper naming scheme for AMD bit field definitions
  perf, x86: Move MSR address offset calculation to architecture
specific files
  perf, x86: Allow for architecture specific RDPMC indexes
  perf, amd: Enable northbridge performance counters on AMD family 15h

Robert Richter (2):
  perf, amd: Rework northbridge event constraints handler
  perf, amd: Generalize northbridge constraints code for family 15h

 arch/x86/include/asm/cpufeature.h |2 +
 arch/x86/include/asm/perf_event.h |   13 +-
 arch/x86/include/uapi/asm/msr-index.h |2 +
 arch/x86/kernel/cpu/perf_event.c  |2 +-
 arch/x86/kernel/cpu/perf_event.h  |   25 +--
 arch/x86/kernel/cpu/perf_event_amd.c  |  322 +
 6 files changed, 272 insertions(+), 94 deletions(-)

-- 
1.7.9.5




[PATCH 5/6] perf, x86: Allow for architecture specific RDPMC indexes

2013-02-06 Thread Jacob Shin
Similar to config_base and event_base, allow architecture specific
RDPMC ECX values.
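[For context, not part of this patch: the ECX value supplied by this hook is what a direct RDPMC of the counter would use. A hypothetical user-space helper for reading such a counter might look like:

	static inline unsigned long long rdpmc_read(unsigned int ecx)
	{
		unsigned int lo, hi;

		/* ecx = architecture-specific counter index
		 * (for AMD, simply the counter index, per amd_pmu_rdpmc_index) */
		asm volatile("rdpmc" : "=a" (lo), "=d" (hi) : "c" (ecx));
		return ((unsigned long long)hi << 32) | lo;
	}
]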

Signed-off-by: Jacob Shin 
Acked-by: Stephane Eranian 
---
 arch/x86/kernel/cpu/perf_event.c |2 +-
 arch/x86/kernel/cpu/perf_event.h |6 ++
 arch/x86/kernel/cpu/perf_event_amd.c |6 ++
 3 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index c6ef37a..5ed7a4c 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -829,7 +829,7 @@ static inline void x86_assign_hw_event(struct perf_event 
*event,
} else {
hwc->config_base = x86_pmu_config_addr(hwc->idx);
hwc->event_base  = x86_pmu_event_addr(hwc->idx);
-   hwc->event_base_rdpmc = hwc->idx;
+   hwc->event_base_rdpmc = x86_pmu_rdpmc_index(hwc->idx);
}
 }
 
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index c455cba..1a2ea03 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -352,6 +352,7 @@ struct x86_pmu {
unsignedeventsel;
unsignedperfctr;
int (*addr_offset)(int index, bool eventsel);
+   int (*rdpmc_index)(int index);
u64 (*event_map)(int);
int max_events;
int num_counters;
@@ -510,6 +511,11 @@ static inline unsigned int x86_pmu_event_addr(int index)
  x86_pmu.addr_offset(index, false) : index);
 }
 
+static inline int x86_pmu_rdpmc_index(int index)
+{
+   return x86_pmu.rdpmc_index ? x86_pmu.rdpmc_index(index) : index;
+}
+
 int x86_setup_perfctr(struct perf_event *event);
 
 int x86_pmu_hw_config(struct perf_event *event);
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c 
b/arch/x86/kernel/cpu/perf_event_amd.c
index b60f31c..05462f0 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -173,6 +173,11 @@ static inline int amd_pmu_addr_offset(int index, bool 
eventsel)
return offset;
 }
 
+static inline int amd_pmu_rdpmc_index(int index)
+{
+   return index;
+}
+
 static int amd_pmu_hw_config(struct perf_event *event)
 {
int ret;
@@ -620,6 +625,7 @@ static __initconst const struct x86_pmu amd_pmu = {
.eventsel   = MSR_K7_EVNTSEL0,
.perfctr= MSR_K7_PERFCTR0,
.addr_offset= amd_pmu_addr_offset,
+   .rdpmc_index= amd_pmu_rdpmc_index,
.event_map  = amd_pmu_event_map,
.max_events = ARRAY_SIZE(amd_perfmon_event_map),
.num_counters   = AMD64_NUM_COUNTERS,
-- 
1.7.9.5




[PATCH 4/6] perf, x86: Move MSR address offset calculation to architecture specific files

2013-02-06 Thread Jacob Shin
Move the counter-index-to-MSR-address-offset calculation into
architecture specific files. This prepares the way for perf_event_amd
to support counter addresses that are not contiguous -- for example,
AMD Family 15h processors have 6 core performance counters starting at
0xc0010200 and 4 northbridge performance counters starting at
0xc0010240.

Signed-off-by: Jacob Shin 
---
 arch/x86/kernel/cpu/perf_event.h |   21 -
 arch/x86/kernel/cpu/perf_event_amd.c |   42 ++
 2 files changed, 47 insertions(+), 16 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 158f46b..c455cba 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -351,6 +351,7 @@ struct x86_pmu {
int (*schedule_events)(struct cpu_hw_events *cpuc, int n, 
int *assign);
unsignedeventsel;
unsignedperfctr;
+   int (*addr_offset)(int index, bool eventsel);
u64 (*event_map)(int);
int max_events;
int num_counters;
@@ -497,28 +498,16 @@ extern u64 __read_mostly hw_cache_extra_regs
 
 u64 x86_perf_event_update(struct perf_event *event);
 
-static inline int x86_pmu_addr_offset(int index)
-{
-   int offset;
-
-   /* offset = X86_FEATURE_PERFCTR_CORE ? index << 1 : index */
-   alternative_io(ASM_NOP2,
-  "shll $1, %%eax",
-  X86_FEATURE_PERFCTR_CORE,
-  "=a" (offset),
-  "a"  (index));
-
-   return offset;
-}
-
 static inline unsigned int x86_pmu_config_addr(int index)
 {
-   return x86_pmu.eventsel + x86_pmu_addr_offset(index);
+   return x86_pmu.eventsel + (x86_pmu.addr_offset ?
+  x86_pmu.addr_offset(index, true) : index);
 }
 
 static inline unsigned int x86_pmu_event_addr(int index)
 {
-   return x86_pmu.perfctr + x86_pmu_addr_offset(index);
+   return x86_pmu.perfctr + (x86_pmu.addr_offset ?
+ x86_pmu.addr_offset(index, false) : index);
 }
 
 int x86_setup_perfctr(struct perf_event *event);
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c 
b/arch/x86/kernel/cpu/perf_event_amd.c
index aea8c20..b60f31c 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -132,6 +132,47 @@ static u64 amd_pmu_event_map(int hw_event)
return amd_perfmon_event_map[hw_event];
 }
 
+/*
+ * Previously calculated offsets
+ */
+static unsigned int event_offsets[X86_PMC_IDX_MAX] __read_mostly;
+static unsigned int count_offsets[X86_PMC_IDX_MAX] __read_mostly;
+
+/*
+ * Legacy CPUs:
+ *   4 counters starting at 0xc001 each offset by 1
+ *
+ * CPUs with core performance counter extensions:
+ *   6 counters starting at 0xc0010200 each offset by 2
+ */
+static inline int amd_pmu_addr_offset(int index, bool eventsel)
+{
+   int offset;
+
+   if (!index)
+   return index;
+
+   if (eventsel)
+   offset = event_offsets[index];
+   else
+   offset = count_offsets[index];
+
+   if (offset)
+   return offset;
+
+   if (!cpu_has_perfctr_core)
+   offset = index;
+   else
+   offset = index << 1;
+
+   if (eventsel)
+   event_offsets[index] = offset;
+   else
+   count_offsets[index] = offset;
+
+   return offset;
+}
+
 static int amd_pmu_hw_config(struct perf_event *event)
 {
int ret;
@@ -578,6 +619,7 @@ static __initconst const struct x86_pmu amd_pmu = {
.schedule_events= x86_schedule_events,
.eventsel   = MSR_K7_EVNTSEL0,
.perfctr= MSR_K7_PERFCTR0,
+   .addr_offset= amd_pmu_addr_offset,
.event_map  = amd_pmu_event_map,
.max_events = ARRAY_SIZE(amd_perfmon_event_map),
.num_counters   = AMD64_NUM_COUNTERS,
-- 
1.7.9.5




[PATCH 2/6] perf, amd: Generalize northbridge constraints code for family 15h

2013-02-06 Thread Jacob Shin
From: Robert Richter 

Generalize northbridge constraints code for family 10h so that later
we can reuse the same code path with other AMD processor families that
have the same northbridge event constraints.

Signed-off-by: Robert Richter 
Signed-off-by: Jacob Shin 
---
 arch/x86/kernel/cpu/perf_event_amd.c |   43 --
 1 file changed, 25 insertions(+), 18 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_amd.c 
b/arch/x86/kernel/cpu/perf_event_amd.c
index e7963c7..f8c9dfb 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -188,20 +188,13 @@ static inline int amd_has_nb(struct cpu_hw_events *cpuc)
return nb && nb->nb_id != -1;
 }
 
-static void amd_put_event_constraints(struct cpu_hw_events *cpuc,
- struct perf_event *event)
+static void __amd_put_nb_event_constraints(struct cpu_hw_events *cpuc,
+  struct perf_event *event)
 {
-   struct hw_perf_event *hwc = &event->hw;
struct amd_nb *nb = cpuc->amd_nb;
int i;
 
/*
-* only care about NB events
-*/
-   if (!(amd_has_nb(cpuc) && amd_is_nb_event(hwc)))
-   return;
-
-   /*
 * need to scan whole list because event may not have
 * been assigned during scheduling
 *
@@ -247,12 +240,13 @@ static void amd_put_event_constraints(struct 
cpu_hw_events *cpuc,
   *
   * Given that resources are allocated (cmpxchg), they must be
   * eventually freed for others to use. This is accomplished by
-  * calling amd_put_event_constraints().
+  * calling __amd_put_nb_event_constraints()
   *
   * Non NB events are not impacted by this restriction.
   */
 static struct event_constraint *
-amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
+__amd_get_nb_event_constraints(struct cpu_hw_events *cpuc, struct perf_event 
*event,
+  struct event_constraint *c)
 {
struct hw_perf_event *hwc = &event->hw;
struct amd_nb *nb = cpuc->amd_nb;
@@ -260,12 +254,6 @@ amd_get_event_constraints(struct cpu_hw_events *cpuc, 
struct perf_event *event)
int idx, new = -1;
 
/*
-* if not NB event or no NB, then no constraints
-*/
-   if (!(amd_has_nb(cpuc) && amd_is_nb_event(hwc)))
-   return &unconstrained;
-
-   /*
 * detect if already present, if so reuse
 *
 * cannot merge with actual allocation
@@ -275,7 +263,7 @@ amd_get_event_constraints(struct cpu_hw_events *cpuc, 
struct perf_event *event)
 * because of successive calls to x86_schedule_events() from
 * hw_perf_group_sched_in() without hw_perf_enable()
 */
-   for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+   for_each_set_bit(idx, c->idxmsk, x86_pmu.num_counters) {
if (new == -1 || hwc->idx == idx)
/* assign free slot, prefer hwc->idx */
old = cmpxchg(nb->owners + idx, NULL, event);
@@ -391,6 +379,25 @@ static void amd_pmu_cpu_dead(int cpu)
}
 }
 
+static struct event_constraint *
+amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
+{
+   /*
+* if not NB event or no NB, then no constraints
+*/
+   if (!(amd_has_nb(cpuc) && amd_is_nb_event(&event->hw)))
+   return &unconstrained;
+
+   return __amd_get_nb_event_constraints(cpuc, event, &unconstrained);
+}
+
+static void amd_put_event_constraints(struct cpu_hw_events *cpuc,
+ struct perf_event *event)
+{
+   if (amd_has_nb(cpuc) && amd_is_nb_event(&event->hw))
+   __amd_put_nb_event_constraints(cpuc, event);
+}
+
 PMU_FORMAT_ATTR(event, "config:0-7,32-35");
 PMU_FORMAT_ATTR(umask, "config:8-15"   );
 PMU_FORMAT_ATTR(edge,  "config:18" );
-- 
1.7.9.5




[PATCH 6/6] perf, amd: Enable northbridge performance counters on AMD family 15h

2013-02-06 Thread Jacob Shin
On AMD family 15h processors, there are 4 new performance counters
(in addition to 6 core performance counters) that can be used for
counting northbridge events (i.e. DRAM accesses). Their bit fields are
almost identical to the core performance counters. However, unlike the
core performance counters, these MSRs are shared between multiple
cores (that share the same northbridge). We will reuse the same code
path as existing family 10h northbridge event constraints handler
logic to enforce this sharing.

Signed-off-by: Jacob Shin 
---
 arch/x86/include/asm/cpufeature.h |2 +
 arch/x86/include/asm/perf_event.h |9 ++
 arch/x86/include/uapi/asm/msr-index.h |2 +
 arch/x86/kernel/cpu/perf_event_amd.c  |  171 +
 4 files changed, 164 insertions(+), 20 deletions(-)

diff --git a/arch/x86/include/asm/cpufeature.h 
b/arch/x86/include/asm/cpufeature.h
index 2d9075e..93fe929 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -167,6 +167,7 @@
 #define X86_FEATURE_TBM(6*32+21) /* trailing bit manipulations 
*/
 #define X86_FEATURE_TOPOEXT(6*32+22) /* topology extensions CPUID leafs */
 #define X86_FEATURE_PERFCTR_CORE (6*32+23) /* core performance counter 
extensions */
+#define X86_FEATURE_PERFCTR_NB  (6*32+24) /* NB performance counter extensions 
*/
 
 /*
  * Auxiliary flags: Linux defined - For features scattered in various
@@ -309,6 +310,7 @@ extern const char * const x86_power_flags[32];
 #define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR)
 #define cpu_has_pclmulqdq  boot_cpu_has(X86_FEATURE_PCLMULQDQ)
 #define cpu_has_perfctr_core   boot_cpu_has(X86_FEATURE_PERFCTR_CORE)
+#define cpu_has_perfctr_nb boot_cpu_has(X86_FEATURE_PERFCTR_NB)
 #define cpu_has_cx8boot_cpu_has(X86_FEATURE_CX8)
 #define cpu_has_cx16   boot_cpu_has(X86_FEATURE_CX16)
 #define cpu_has_eager_fpu  boot_cpu_has(X86_FEATURE_EAGER_FPU)
diff --git a/arch/x86/include/asm/perf_event.h 
b/arch/x86/include/asm/perf_event.h
index 2234eaaec..57cb634 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -29,9 +29,14 @@
 #define ARCH_PERFMON_EVENTSEL_INV  (1ULL << 23)
 #define ARCH_PERFMON_EVENTSEL_CMASK0xFF00ULL
 
+#define AMD64_EVENTSEL_INT_CORE_ENABLE (1ULL << 36)
 #define AMD64_EVENTSEL_GUESTONLY   (1ULL << 40)
 #define AMD64_EVENTSEL_HOSTONLY(1ULL << 41)
 
+#define AMD64_EVENTSEL_INT_CORE_SEL_SHIFT  37
+#define AMD64_EVENTSEL_INT_CORE_SEL_MASK   \
+   (0xFULL << AMD64_EVENTSEL_INT_CORE_SEL_SHIFT)
+
 #define AMD64_EVENTSEL_EVENT   \
(ARCH_PERFMON_EVENTSEL_EVENT | (0x0FULL << 32))
 #define INTEL_ARCH_EVENT_MASK  \
@@ -46,8 +51,12 @@
 #define AMD64_RAW_EVENT_MASK   \
(X86_RAW_EVENT_MASK  |  \
 AMD64_EVENTSEL_EVENT)
+#define AMD64_RAW_EVENT_MASK_NB\
+   (AMD64_EVENTSEL_EVENT|  \
+ARCH_PERFMON_EVENTSEL_UMASK)
 #define AMD64_NUM_COUNTERS 4
 #define AMD64_NUM_COUNTERS_CORE6
+#define AMD64_NUM_COUNTERS_NB  4
 
 #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL  0x3c
 #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK(0x00 << 8)
diff --git a/arch/x86/include/uapi/asm/msr-index.h 
b/arch/x86/include/uapi/asm/msr-index.h
index 1031604..27c05d2 100644
--- a/arch/x86/include/uapi/asm/msr-index.h
+++ b/arch/x86/include/uapi/asm/msr-index.h
@@ -195,6 +195,8 @@
 /* Fam 15h MSRs */
 #define MSR_F15H_PERF_CTL  0xc0010200
 #define MSR_F15H_PERF_CTR  0xc0010201
+#define MSR_F15H_NB_PERF_CTL   0xc0010240
+#define MSR_F15H_NB_PERF_CTR   0xc0010241
 
 /* Fam 10h MSRs */
 #define MSR_FAM10H_MMIO_CONF_BASE  0xc0010058
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c 
b/arch/x86/kernel/cpu/perf_event_amd.c
index 05462f0..dfdab42 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -132,11 +132,14 @@ static u64 amd_pmu_event_map(int hw_event)
return amd_perfmon_event_map[hw_event];
 }
 
+static struct event_constraint *amd_nb_event_constraint;
+
 /*
  * Previously calculated offsets
  */
 static unsigned int event_offsets[X86_PMC_IDX_MAX] __read_mostly;
 static unsigned int count_offsets[X86_PMC_IDX_MAX] __read_mostly;
+static unsigned int rdpmc_indexes[X86_PMC_IDX_MAX] __read_mostly;
 
 /*
  * Legacy CPUs:
@@ -144,10 +147,14 @@ static unsigned int count_offsets[X86_PMC_IDX_MAX] 
__read_mostly;
  *
  * CPUs with core performance counter extensions:
  *   6 counters starting at 0xc0010200 each offset by 2
+ *
+ * CPUs with north bridge performance counter extensions:
+ *   4 additional counters starting at 0xc0010240 each offset by 2
+ *   (i

[PATCH 1/6] perf, amd: Rework northbridge event constraints handler

2013-02-06 Thread Jacob Shin
From: Robert Richter 

Code simplification. No functional changes.

Signed-off-by: Robert Richter 
Signed-off-by: Jacob Shin 
Acked-by: Stephane Eranian 
---
 arch/x86/kernel/cpu/perf_event_amd.c |   68 +-
 1 file changed, 26 insertions(+), 42 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_amd.c 
b/arch/x86/kernel/cpu/perf_event_amd.c
index c93bc4e..e7963c7 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -256,9 +256,8 @@ amd_get_event_constraints(struct cpu_hw_events *cpuc, 
struct perf_event *event)
 {
struct hw_perf_event *hwc = &event->hw;
struct amd_nb *nb = cpuc->amd_nb;
-   struct perf_event *old = NULL;
-   int max = x86_pmu.num_counters;
-   int i, j, k = -1;
+   struct perf_event *old;
+   int idx, new = -1;
 
/*
 * if not NB event or no NB, then no constraints
@@ -276,48 +275,33 @@ amd_get_event_constraints(struct cpu_hw_events *cpuc, 
struct perf_event *event)
 * because of successive calls to x86_schedule_events() from
 * hw_perf_group_sched_in() without hw_perf_enable()
 */
-   for (i = 0; i < max; i++) {
-   /*
-* keep track of first free slot
-*/
-   if (k == -1 && !nb->owners[i])
-   k = i;
+   for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+   if (new == -1 || hwc->idx == idx)
+   /* assign free slot, prefer hwc->idx */
+   old = cmpxchg(nb->owners + idx, NULL, event);
+   else if (nb->owners[idx] == event)
+   /* event already present */
+   old = event;
+   else
+   continue;
+
+   if (old && old != event)
+   continue;
+
+   /* reassign to this slot */
+   if (new != -1)
+   cmpxchg(nb->owners + new, event, NULL);
+   new = idx;
 
/* already present, reuse */
-   if (nb->owners[i] == event)
-   goto done;
-   }
-   /*
-* not present, so grab a new slot
-* starting either at:
-*/
-   if (hwc->idx != -1) {
-   /* previous assignment */
-   i = hwc->idx;
-   } else if (k != -1) {
-   /* start from free slot found */
-   i = k;
-   } else {
-   /*
-* event not found, no slot found in
-* first pass, try again from the
-* beginning
-*/
-   i = 0;
-   }
-   j = i;
-   do {
-   old = cmpxchg(nb->owners+i, NULL, event);
-   if (!old)
+   if (old == event)
break;
-   if (++i == max)
-   i = 0;
-   } while (i != j);
-done:
-   if (!old)
-   return &nb->event_constraints[i];
-
-   return &emptyconstraint;
+   }
+
+   if (new == -1)
+   return &emptyconstraint;
+
+   return &nb->event_constraints[new];
 }
 
 static struct amd_nb *amd_alloc_nb(int cpu)
-- 
1.7.9.5




[PATCH 3/6] perf, amd: Use proper naming scheme for AMD bit field definitions

2013-02-06 Thread Jacob Shin
Update these AMD bit field names to be consistent with naming
convention followed by the rest of the file.

Signed-off-by: Jacob Shin 
Acked-by: Stephane Eranian 
---
 arch/x86/include/asm/perf_event.h|4 ++--
 arch/x86/kernel/cpu/perf_event_amd.c |8 
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/perf_event.h 
b/arch/x86/include/asm/perf_event.h
index 4fabcdf..2234eaaec 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -29,8 +29,8 @@
 #define ARCH_PERFMON_EVENTSEL_INV  (1ULL << 23)
 #define ARCH_PERFMON_EVENTSEL_CMASK0xFF00ULL
 
-#define AMD_PERFMON_EVENTSEL_GUESTONLY (1ULL << 40)
-#define AMD_PERFMON_EVENTSEL_HOSTONLY  (1ULL << 41)
+#define AMD64_EVENTSEL_GUESTONLY   (1ULL << 40)
+#define AMD64_EVENTSEL_HOSTONLY(1ULL << 41)
 
 #define AMD64_EVENTSEL_EVENT   \
(ARCH_PERFMON_EVENTSEL_EVENT | (0x0FULL << 32))
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c 
b/arch/x86/kernel/cpu/perf_event_amd.c
index f8c9dfb..aea8c20 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -156,9 +156,9 @@ static int amd_pmu_hw_config(struct perf_event *event)
event->hw.config &= ~(ARCH_PERFMON_EVENTSEL_USR |
  ARCH_PERFMON_EVENTSEL_OS);
else if (event->attr.exclude_host)
-   event->hw.config |= AMD_PERFMON_EVENTSEL_GUESTONLY;
+   event->hw.config |= AMD64_EVENTSEL_GUESTONLY;
else if (event->attr.exclude_guest)
-   event->hw.config |= AMD_PERFMON_EVENTSEL_HOSTONLY;
+   event->hw.config |= AMD64_EVENTSEL_HOSTONLY;
 
if (event->attr.type != PERF_TYPE_RAW)
return 0;
@@ -336,7 +336,7 @@ static void amd_pmu_cpu_starting(int cpu)
struct amd_nb *nb;
int i, nb_id;
 
-   cpuc->perf_ctr_virt_mask = AMD_PERFMON_EVENTSEL_HOSTONLY;
+   cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY;
 
if (boot_cpu_data.x86_max_cores < 2)
return;
@@ -669,7 +669,7 @@ void amd_pmu_disable_virt(void)
 * SVM is disabled the Guest-only bits still gets set and the counter
 * will not count anything.
 */
-   cpuc->perf_ctr_virt_mask = AMD_PERFMON_EVENTSEL_HOSTONLY;
+   cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY;
 
/* Reload all events */
x86_pmu_disable_all();
-- 
1.7.9.5




Re: [PATCH V6 0/6] perf, amd: Enable AMD family 15h northbridge counters

2013-02-06 Thread Jacob Shin
On Wed, Feb 06, 2013 at 11:26:23AM -0600, Jacob Shin wrote:
> The following patchset enables 4 additional performance counters in
> AMD family 15h processors that count northbridge events -- such as
> number of DRAM accesses.
> 

Here is the libpfm4 counterpart,

Thanks!

From acbc2e6f66dc131658a0fa1283d830327a44919f Mon Sep 17 00:00:00 2001
From: Jacob Shin 
Date: Thu, 31 Jan 2013 14:34:06 -0600
Subject: [PATCH V2] Add AMD Family 15h northbridge performance events

libpfm4 side support for the following Linux kernel patchset:
  http://lkml.org/lkml/2013/1/10/450

Reference -- BIOS and Kernel Developer Guide (BKDG) for AMD Family 15h
 Models 00h-0Fh Processors:
  http://support.amd.com/us/Processor_TechDocs/42301_15h_Mod_00h-0Fh_BKDG.pdf
---
 lib/events/amd64_events_fam15h.h | 1128 ++
 1 file changed, 1128 insertions(+)

diff --git a/lib/events/amd64_events_fam15h.h b/lib/events/amd64_events_fam15h.h
index 7f654e8..8700ab2 100644
--- a/lib/events/amd64_events_fam15h.h
+++ b/lib/events/amd64_events_fam15h.h
@@ -752,6 +752,910 @@ static const amd64_umask_t 
amd64_fam15h_l2_prefetcher_trigger_events[]={
},
 };
 
+static const amd64_umask_t amd64_fam15h_dram_accesses[]={
+   { .uname = "DCT0_PAGE_HIT",
+ .udesc = "DCT0 Page hit",
+ .ucode = 0x1,
+   },
+   { .uname = "DCT0_PAGE_MISS",
+ .udesc = "DCT0 Page Miss",
+ .ucode = 0x2,
+   },
+   { .uname = "DCT0_PAGE_CONFLICT",
+ .udesc = "DCT0 Page Conflict",
+ .ucode = 0x4,
+   },
+   { .uname = "DCT1_PAGE_HIT",
+ .udesc = "DCT1 Page hit",
+ .ucode = 0x8,
+   },
+   { .uname = "DCT1_PAGE_MISS",
+ .udesc = "DCT1 Page Miss",
+ .ucode = 0x10,
+   },
+   { .uname = "DCT1_PAGE_CONFLICT",
+ .udesc = "DCT1 Page Conflict",
+ .ucode = 0x20,
+   },
+   { .uname  = "ALL",
+ .udesc  = "All sub-events selected",
+ .ucode = 0x3f,
+ .uflags= AMD64_FL_NCOMBO | AMD64_FL_DFL,
+   },
+};
+
+static const amd64_umask_t 
amd64_fam15h_dram_controller_page_table_overflows[]={
+   { .uname = "DCT0_PAGE_TABLE_OVERFLOW",
+ .udesc = "DCT0 Page Table Overflow",
+ .ucode = 0x1,
+   },
+   { .uname = "DCT1_PAGE_TABLE_OVERFLOW",
+ .udesc = "DCT1 Page Table Overflow",
+ .ucode = 0x2,
+   },
+   { .uname  = "ALL",
+ .udesc  = "All sub-events selected",
+ .ucode  = 0x3,
+ .uflags = AMD64_FL_NCOMBO | AMD64_FL_DFL,
+   },
+};
+
+static const amd64_umask_t 
amd64_fam15h_memory_controller_dram_command_slots_missed[]={
+   { .uname = "DCT0_COMMAND_SLOTS_MISSED",
+ .udesc = "DCT0 Command Slots Missed (in MemClks)",
+ .ucode = 0x1,
+   },
+   { .uname = "DCT1_COMMAND_SLOTS_MISSED",
+ .udesc = "DCT1 Command Slots Missed (in MemClks)",
+ .ucode = 0x2,
+   },
+   { .uname  = "ALL",
+ .udesc  = "All sub-events selected",
+ .ucode  = 0x3,
+ .uflags = AMD64_FL_NCOMBO | AMD64_FL_DFL,
+   },
+};
+
+static const amd64_umask_t amd64_fam15h_memory_controller_turnarounds[]={
+   { .uname = "DCT0_DIMM_TURNAROUND",
+ .udesc = "DCT0 DIMM (chip select) turnaround",
+ .ucode = 0x1,
+   },
+   { .uname = "DCT0_READ_WRITE_TURNAROUND",
+ .udesc = "DCT0 Read to write turnaround",
+ .ucode = 0x2,
+   },
+   { .uname = "DCT0_WRITE_READ_TURNAROUND",
+ .udesc = "DCT0 Write to read turnaround",
+ .ucode = 0x4,
+   },
+   { .uname = "DCT1_DIMM_TURNAROUND",
+ .udesc = "DCT1 DIMM (chip select) turnaround",
+ .ucode = 0x8,
+   },
+   { .uname = "DCT1_READ_WRITE_TURNAROUND",
+ .udesc = "DCT1 Read to write turnaround",
+ .ucode = 0x10,
+   },
+   { .uname = "DCT1_WRITE_READ_TURNAROUND",
+ .udesc = "DCT1 Write to read turnaround",
+ .ucode = 0x20,
+   },
+   { .uname  = "ALL",
+ .udesc  = "All sub-events selected",
+ .ucode  = 0x3f,
+ .uflags = AMD64_FL_NCOMBO | AMD64_FL_DFL,
+   },
+};
+
+static const amd64_umask_t 
amd64_fam15h_memory_controller_bypass_counter_saturation[]={
+   { .uname = "MEMORY_CONTROLLER_HIGH_PRIORITY_BYPASS",
+ .udesc = "Memory controller high priority bypass",
+ .ucode = 0x1,
+   },
+   { .uname = "MEMORY_CONTROLLER_MEDIUM_PRIORITY_BYPASS",
+ .udesc = "Memory controller medium priority bypass",
+ .ucode = 0x2,
+   },
+   { .uname = "DCT0_DCQ_BYPASS",
+ .udesc = "DCT0 DCQ bypass",
+ .ucode = 0x4,
+   },
+   { .uname = "DCT1_DCQ_BYPASS",
+ .udesc = "DCT1 DCQ bypass",
+ .ucode = 0x8,
+   },
+   { .uname  = "ALL",
+ .udesc  = "All sub-events selected",

Re: [PATCH 6/6] perf, amd: Enable northbridge performance counters on AMD family 15h

2013-02-07 Thread Jacob Shin
On Wed, Feb 06, 2013 at 11:26:29AM -0600, Jacob Shin wrote:
> On AMD family 15h processors, there are 4 new performance counters
> (in addition to 6 core performance counters) that can be used for
> counting northbridge events (i.e. DRAM accesses). Their bit fields are
> almost identical to the core performance counters. However, unlike the
> core performance counters, these MSRs are shared between multiple
> cores (that share the same northbridge). We will reuse the same code
> path as existing family 10h northbridge event constraints handler
> logic to enforce this sharing.
> 
> Signed-off-by: Jacob Shin 

Hi Ingo, could you please apply this one to tip as well? I received
tip-bot emails for all other patches in this series except for this
last one, 6/6.

Or was that intentional? If so, what other changes are required/
recommended?

Thanks!

-Jacob

> ---
>  arch/x86/include/asm/cpufeature.h |2 +
>  arch/x86/include/asm/perf_event.h |9 ++
>  arch/x86/include/uapi/asm/msr-index.h |2 +
>  arch/x86/kernel/cpu/perf_event_amd.c  |  171 
> +
>  4 files changed, 164 insertions(+), 20 deletions(-)
> 
> diff --git a/arch/x86/include/asm/cpufeature.h 
> b/arch/x86/include/asm/cpufeature.h
> index 2d9075e..93fe929 100644
> --- a/arch/x86/include/asm/cpufeature.h
> +++ b/arch/x86/include/asm/cpufeature.h
> @@ -167,6 +167,7 @@
>  #define X86_FEATURE_TBM  (6*32+21) /* trailing bit manipulations 
> */
>  #define X86_FEATURE_TOPOEXT  (6*32+22) /* topology extensions CPUID leafs */
>  #define X86_FEATURE_PERFCTR_CORE (6*32+23) /* core performance counter 
> extensions */
> +#define X86_FEATURE_PERFCTR_NB  (6*32+24) /* NB performance counter 
> extensions */
>  
>  /*
>   * Auxiliary flags: Linux defined - For features scattered in various
> @@ -309,6 +310,7 @@ extern const char * const x86_power_flags[32];
>  #define cpu_has_hypervisor   boot_cpu_has(X86_FEATURE_HYPERVISOR)
>  #define cpu_has_pclmulqdqboot_cpu_has(X86_FEATURE_PCLMULQDQ)
>  #define cpu_has_perfctr_core boot_cpu_has(X86_FEATURE_PERFCTR_CORE)
> +#define cpu_has_perfctr_nb   boot_cpu_has(X86_FEATURE_PERFCTR_NB)
>  #define cpu_has_cx8  boot_cpu_has(X86_FEATURE_CX8)
>  #define cpu_has_cx16 boot_cpu_has(X86_FEATURE_CX16)
>  #define cpu_has_eager_fpuboot_cpu_has(X86_FEATURE_EAGER_FPU)
> diff --git a/arch/x86/include/asm/perf_event.h 
> b/arch/x86/include/asm/perf_event.h
> index 2234eaaec..57cb634 100644
> --- a/arch/x86/include/asm/perf_event.h
> +++ b/arch/x86/include/asm/perf_event.h
> @@ -29,9 +29,14 @@
>  #define ARCH_PERFMON_EVENTSEL_INV(1ULL << 23)
>  #define ARCH_PERFMON_EVENTSEL_CMASK  0xFF00ULL
>  
> +#define AMD64_EVENTSEL_INT_CORE_ENABLE   (1ULL << 36)
>  #define AMD64_EVENTSEL_GUESTONLY (1ULL << 40)
>  #define AMD64_EVENTSEL_HOSTONLY  (1ULL << 41)
>  
> +#define AMD64_EVENTSEL_INT_CORE_SEL_SHIFT37
> +#define AMD64_EVENTSEL_INT_CORE_SEL_MASK \
> + (0xFULL << AMD64_EVENTSEL_INT_CORE_SEL_SHIFT)
> +
>  #define AMD64_EVENTSEL_EVENT \
>   (ARCH_PERFMON_EVENTSEL_EVENT | (0x0FULL << 32))
>  #define INTEL_ARCH_EVENT_MASK\
> @@ -46,8 +51,12 @@
>  #define AMD64_RAW_EVENT_MASK \
>   (X86_RAW_EVENT_MASK  |  \
>AMD64_EVENTSEL_EVENT)
> +#define AMD64_RAW_EVENT_MASK_NB  \
> + (AMD64_EVENTSEL_EVENT|  \
> +  ARCH_PERFMON_EVENTSEL_UMASK)
>  #define AMD64_NUM_COUNTERS   4
>  #define AMD64_NUM_COUNTERS_CORE  6
> +#define AMD64_NUM_COUNTERS_NB4
>  
>  #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL0x3c
>  #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK  (0x00 << 8)
> diff --git a/arch/x86/include/uapi/asm/msr-index.h 
> b/arch/x86/include/uapi/asm/msr-index.h
> index 1031604..27c05d2 100644
> --- a/arch/x86/include/uapi/asm/msr-index.h
> +++ b/arch/x86/include/uapi/asm/msr-index.h
> @@ -195,6 +195,8 @@
>  /* Fam 15h MSRs */
>  #define MSR_F15H_PERF_CTL0xc0010200
>  #define MSR_F15H_PERF_CTR0xc0010201
> +#define MSR_F15H_NB_PERF_CTL 0xc0010240
> +#define MSR_F15H_NB_PERF_CTR 0xc0010241
>  
>  /* Fam 10h MSRs */
>  #define MSR_FAM10H_MMIO_CONF_BASE0xc0010058
> diff --git a/arch/x86/kernel/cpu/perf_event_amd.c 
> b/arch/x86/kernel/cpu/perf_event_amd.c
> index 05462f0..dfdab42 100644
> --- a/arch/x86/kernel/cpu/perf_event_amd.c
> +++ b/arch/x86/kernel/cpu/perf_event_amd.c
> @@ -132,11 +132,14 @@ static 

Re: [PATCH 6/6] perf, amd: Enable northbridge performance counters on AMD family 15h

2013-02-11 Thread Jacob Shin
On Fri, Feb 08, 2013 at 12:16:28PM +0100, Stephane Eranian wrote:
> On Wed, Feb 6, 2013 at 6:26 PM, Jacob Shin  wrote:
> > On AMD family 15h processors, there are 4 new performance counters
> > (in addition to 6 core performance counters) that can be used for
> > counting northbridge events (i.e. DRAM accesses). Their bit fields are
> > almost identical to the core performance counters. However, unlike the
> > core performance counters, these MSRs are shared between multiple
> > cores (that share the same northbridge). We will reuse the same code
> > path as existing family 10h northbridge event constraints handler
> > logic to enforce this sharing.
> >
> > Signed-off-by: Jacob Shin 
> 
> Works for me.
> 
> I simply regret that the design decision ties uncore with core
> even though hardware-wise they are separate. If I recall the earlier
> discussion the motivation was to limit code duplication. That's true
> but that's at the expense of isolation. For instance, now if the core
> PMU is overcommitted, but not the uncore, then uncore still goes
> thru event rescheduling for nothing.
> 
> But what matters at this point, is that there is coverage
> for uncore, so we can get some bandwidth measurements
> out. So i recommend we merge this in. Thanks.
> 
> Acked-by: Stephane Eranian 

Stephane, thank you for your time reviewing/testing the patchset.

Ingo, could you please commit this patch 6/6 to tip?

Thank you,

-Jacob

> 
> > ---
> >  arch/x86/include/asm/cpufeature.h |2 +
> >  arch/x86/include/asm/perf_event.h |9 ++
> >  arch/x86/include/uapi/asm/msr-index.h |2 +
> >  arch/x86/kernel/cpu/perf_event_amd.c  |  171 
> > +
> >  4 files changed, 164 insertions(+), 20 deletions(-)
> >
> > diff --git a/arch/x86/include/asm/cpufeature.h 
> > b/arch/x86/include/asm/cpufeature.h
> > index 2d9075e..93fe929 100644
> > --- a/arch/x86/include/asm/cpufeature.h
> > +++ b/arch/x86/include/asm/cpufeature.h
> > @@ -167,6 +167,7 @@
> >  #define X86_FEATURE_TBM(6*32+21) /* trailing bit 
> > manipulations */
> >  #define X86_FEATURE_TOPOEXT(6*32+22) /* topology extensions CPUID 
> > leafs */
> >  #define X86_FEATURE_PERFCTR_CORE (6*32+23) /* core performance counter 
> > extensions */
> > +#define X86_FEATURE_PERFCTR_NB  (6*32+24) /* NB performance counter 
> > extensions */
> >
> >  /*
> >   * Auxiliary flags: Linux defined - For features scattered in various
> > @@ -309,6 +310,7 @@ extern const char * const x86_power_flags[32];
> >  #define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR)
> >  #define cpu_has_pclmulqdq  boot_cpu_has(X86_FEATURE_PCLMULQDQ)
> >  #define cpu_has_perfctr_core   boot_cpu_has(X86_FEATURE_PERFCTR_CORE)
> > +#define cpu_has_perfctr_nb boot_cpu_has(X86_FEATURE_PERFCTR_NB)
> >  #define cpu_has_cx8boot_cpu_has(X86_FEATURE_CX8)
> >  #define cpu_has_cx16   boot_cpu_has(X86_FEATURE_CX16)
> >  #define cpu_has_eager_fpu  boot_cpu_has(X86_FEATURE_EAGER_FPU)
> > diff --git a/arch/x86/include/asm/perf_event.h 
> > b/arch/x86/include/asm/perf_event.h
> > index 2234eaaec..57cb634 100644
> > --- a/arch/x86/include/asm/perf_event.h
> > +++ b/arch/x86/include/asm/perf_event.h
> > @@ -29,9 +29,14 @@
> >  #define ARCH_PERFMON_EVENTSEL_INV  (1ULL << 23)
> >  #define ARCH_PERFMON_EVENTSEL_CMASK  0xFF000000ULL
> >
> > +#define AMD64_EVENTSEL_INT_CORE_ENABLE (1ULL << 36)
> >  #define AMD64_EVENTSEL_GUESTONLY   (1ULL << 40)
> >  #define AMD64_EVENTSEL_HOSTONLY(1ULL << 41)
> >
> > +#define AMD64_EVENTSEL_INT_CORE_SEL_SHIFT  37
> > +#define AMD64_EVENTSEL_INT_CORE_SEL_MASK   \
> > +   (0xFULL << AMD64_EVENTSEL_INT_CORE_SEL_SHIFT)
> > +
> >  #define AMD64_EVENTSEL_EVENT   \
> > (ARCH_PERFMON_EVENTSEL_EVENT | (0x0FULL << 32))
> >  #define INTEL_ARCH_EVENT_MASK  \
> > @@ -46,8 +51,12 @@
> >  #define AMD64_RAW_EVENT_MASK   \
> > (X86_RAW_EVENT_MASK  |  \
> >  AMD64_EVENTSEL_EVENT)
> > +#define AMD64_RAW_EVENT_MASK_NB\
> > +   (AMD64_EVENTSEL_EVENT|  \
> > +ARCH_PERFMON_EVENTSEL_UMASK)
> >  #define AMD64_NUM_COUNTERS 4
> >  #define AMD64_NUM_COUNTERS_CORE6
> > +#define AMD64_NUM_COUNTERS_NB  4
&

Re: [PATCH V2 2/2] cpufreq: AMD "frequency sensitivity feedback" powersave bias for ondemand governor

2013-04-02 Thread Jacob Shin
On Tue, Apr 02, 2013 at 01:40:13PM +0200, Thomas Renninger wrote:
> On Thursday, March 28, 2013 01:24:17 PM Jacob Shin wrote:
> > Future AMD processors, starting with Family 16h, can provide software
> > with feedback on how the workload may respond to frequency change --
> > memory-bound workloads will not benefit from higher frequency, where
> > as compute-bound workloads will. This patch enables this "frequency
> > sensitivity feedback" to aid the ondemand governor to make better
> > frequency change decisions by hooking into the powersave bias.
> If I read this correctly, nothing changes even if the driver is loaded,
> unless user modifies:
> /sys/devices/system/cpu/cpufreq/ondemand/powersave_bias
> is this correct?

Hi, yes that is correct.

> 
> I wonder who should modify:
> /sys/devices/system/cpu/cpufreq/ondemand/powersave_bias
> Even cpupower is not aware of this very specific tunable.

Hmm .. I had thought that end users or distros would already know about
it / use it, since the powersave_bias sysfs tunable already exists.

I guess not?
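
(For illustration, here is a minimal program that pokes that tunable --
assuming the global, non-per-policy ondemand sysfs layout; the exact
path can differ depending on kernel version and governor setup.)

/* Illustrative only: set ondemand powersave_bias to 10% (value 100). */
#include <stdio.h>

int main(void)
{
	const char *path =
		"/sys/devices/system/cpu/cpufreq/ondemand/powersave_bias";
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return 1;
	}
	fprintf(f, "100\n");	/* 100 == shave 10% off the target frequency */
	fclose(f);
	return 0;
}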

> 
> Also, are you sure cpufreq subsystem will be the only user
> of this one?
> Or could cpuidle or others also make use of this somewhen in the future?

I think so, since this register deals with how the workload is
affected by frequency changes -- cpufreq.

> 
> Then this could more be done like:
> drivers/cpufreq/mperf.c
> And scheduler, cpuidle, cpufreq or whatever could use this as well.
> 
> Just some thinking:
> I wonder how one could check/verify that the right thing is done
> (by CPU and kernel). Ideally it would be nice to have the CPU register
> appended to a cpufreq or cpuidle event trace.
> But this very (AMD or X86 only?) specific data would not look nice there.
> An arch placeholder value would be needed or similar?
> 
> ...
> > +}
> > +
> > +static int __init amd_freq_sensitivity_init(void)
> > +{
> > +   int i;
> > +   u32 eax, edx, dummy;
> > +
> > +   if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
> > +   return -ENODEV;
> > +
> > +   cpuid(0x80000007, &eax, &dummy, &dummy, &edx);
> If this really should be a separate module:
> Does/will Intel have the same (feature/cpuid bit)?
> Anyway, this should get a general AMD or X86 CPU capability flag.

As far as I know, this is AMD specific. Yes, I'll add the AMD vendor
check.

> 
> Then you can also autoload this driver similar to how it's done in acpi-
> cpufreq:
> static const struct x86_cpu_id acpi_cpufreq_ids[] = {
> X86_FEATURE_MATCH(X86_FEATURE_ACPI),
> X86_FEATURE_MATCH(X86_FEATURE_HW_PSTATE),
> {}
> };
> MODULE_DEVICE_TABLE(x86cpu, acpi_cpufreq_ids);

Okay.
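
(Roughly along these lines -- a sketch only, assuming the CPUID bit ends
up as a scattered x86 feature flag; X86_FEATURE_PROC_FEEDBACK is the
name the later V3 of this series uses.)

#include <linux/module.h>
#include <asm/cpu_device_id.h>

static const struct x86_cpu_id amd_freq_sensitivity_ids[] = {
	X86_FEATURE_MATCH(X86_FEATURE_PROC_FEEDBACK),
	{}
};
MODULE_DEVICE_TABLE(x86cpu, amd_freq_sensitivity_ids);

static int __init amd_freq_sensitivity_init(void)
{
	/* autoloaded via the device table; still bail out on other CPUs */
	if (!x86_match_cpu(amd_freq_sensitivity_ids))
		return -ENODEV;

	/* ... read the sensitivity MSRs and hook into ondemand here ... */
	return 0;
}
module_init(amd_freq_sensitivity_init);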

Thanks.

> 
>Thomas
> 



Re: [PATCH V2 1/2] cpufreq: ondemand: allow custom powersave_bias_target function to be registered

2013-04-02 Thread Jacob Shin
On Tue, Apr 02, 2013 at 02:43:32PM +0200, Borislav Petkov wrote:
> On Thu, Mar 28, 2013 at 01:24:16PM -0500, Jacob Shin wrote:
> > This allows for another [arch specific] driver to hook into existing
> > powersave bias function of the ondemand governor. i.e. This allows AMD
> > specific powersave bias function (in a separate AMD specific driver)
> > to aid ondemand governor's frequency transition deicisions.
> > 
> > Signed-off-by: Jacob Shin 
> > ---
> >  drivers/cpufreq/cpufreq_governor.h |3 +++
> >  drivers/cpufreq/cpufreq_ondemand.c |   22 +++---
> >  2 files changed, 22 insertions(+), 3 deletions(-)
> > 
> > diff --git a/drivers/cpufreq/cpufreq_governor.h 
> > b/drivers/cpufreq/cpufreq_governor.h
> > index c83cabf..4b6808f 100644
> > --- a/drivers/cpufreq/cpufreq_governor.h
> > +++ b/drivers/cpufreq/cpufreq_governor.h
> > @@ -262,4 +262,7 @@ bool need_load_eval(struct cpu_dbs_common_info *cdbs,
> > unsigned int sampling_rate);
> >  int cpufreq_governor_dbs(struct cpufreq_policy *policy,
> > struct common_dbs_data *cdata, unsigned int event);
> > +void od_register_powersave_bias_function(unsigned int (*f)
> > +   (struct cpufreq_policy *, unsigned int, unsigned int));
> > +void od_unregister_powersave_bias_function(void);
> 
> We generally call those a "callback" or a "handler". I.e.,
> od_register_powersave_bias_handler or something.

Okay, will change.

> 
> >  #endif /* _CPUFREQ_GOVERNER_H */
> > diff --git a/drivers/cpufreq/cpufreq_ondemand.c 
> > b/drivers/cpufreq/cpufreq_ondemand.c
> > index 15e80ee..36f0798 100644
> > --- a/drivers/cpufreq/cpufreq_ondemand.c
> > +++ b/drivers/cpufreq/cpufreq_ondemand.c
> > @@ -40,6 +40,8 @@
> >  
> >  static DEFINE_PER_CPU(struct od_cpu_dbs_info_s, od_cpu_dbs_info);
> >  
> > +static struct od_ops od_ops;
> > +
> >  #ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND
> >  static struct cpufreq_governor cpufreq_gov_ondemand;
> >  #endif
> > @@ -145,7 +147,8 @@ static void dbs_freq_increase(struct cpufreq_policy *p, 
> > unsigned int freq)
> > struct od_dbs_tuners *od_tuners = dbs_data->tuners;
> >  
> > if (od_tuners->powersave_bias)
> > -   freq = powersave_bias_target(p, freq, CPUFREQ_RELATION_H);
> > +   freq = od_ops.powersave_bias_target(p, freq,
> > +   CPUFREQ_RELATION_H);
> > else if (p->cur == p->max)
> > return;
> >  
> > @@ -206,8 +209,8 @@ static void od_check_cpu(int cpu, unsigned int 
> > load_freq)
> > __cpufreq_driver_target(policy, freq_next,
> > CPUFREQ_RELATION_L);
> > } else {
> > -   int freq = powersave_bias_target(policy, freq_next,
> > -   CPUFREQ_RELATION_L);
> > +   int freq = od_ops.powersave_bias_target(policy,
> > +   freq_next, CPUFREQ_RELATION_L);
> > __cpufreq_driver_target(policy, freq,
> > CPUFREQ_RELATION_L);
> > }
> > @@ -565,6 +568,19 @@ static struct common_dbs_data od_dbs_cdata = {
> > .exit = od_exit,
> >  };
> >  
> > +void od_register_powersave_bias_function(unsigned int (*f)
> > +   (struct cpufreq_policy *, unsigned int, unsigned int))
> > +{
> > +   od_ops.powersave_bias_target = f;
> > +}
> > +EXPORT_SYMBOL_GPL(od_register_powersave_bias_function);
> > +
> > +void od_unregister_powersave_bias_function(void)
> > +{
> > +   od_ops.powersave_bias_target = powersave_bias_target;
> 
> This is very confusing: we have ->powersave_bias_target and the default
> powersave_bias_target in the ondemand governor. Can we call the default
> one generic_powersave_bias_target or default_* or whatever.

Okay, will do.

Thanks!

> 
> Thanks.
> 
> -- 
> Regards/Gruss,
> Boris.
> 
> Sent from a fat crate under my desk. Formatting is fine.
> --
> 



Re: [PATCH V2 2/2] cpufreq: AMD "frequency sensitivity feedback" powersave bias for ondemand governor

2013-04-02 Thread Jacob Shin
On Tue, Apr 02, 2013 at 03:42:55PM +0200, Borislav Petkov wrote:
> On Thu, Mar 28, 2013 at 01:24:17PM -0500, Jacob Shin wrote:
> > Future AMD processors, starting with Family 16h, can provide software
> > with feedback on how the workload may respond to frequency change --
> > memory-bound workloads will not benefit from higher frequency, where
> > as compute-bound workloads will. This patch enables this "frequency
> > sensitivity feedback" to aid the ondemand governor to make better
> > frequency change decisions by hooking into the powersave bias.
> > 
> > Signed-off-by: Jacob Shin 
> > ---
> >  arch/x86/include/uapi/asm/msr-index.h  |1 +
> >  drivers/cpufreq/Kconfig.x86|   10 +++
> >  drivers/cpufreq/Makefile   |1 +
> >  drivers/cpufreq/amd_freq_sensitivity.c |  147 
> > 
> >  4 files changed, 159 insertions(+)
> >  create mode 100644 drivers/cpufreq/amd_freq_sensitivity.c
> > 
> > diff --git a/arch/x86/include/uapi/asm/msr-index.h 
> > b/arch/x86/include/uapi/asm/msr-index.h
> > index 7a060f4..b2e6c49 100644
> > --- a/arch/x86/include/uapi/asm/msr-index.h
> > +++ b/arch/x86/include/uapi/asm/msr-index.h
> > @@ -173,6 +173,7 @@
> >  #define MSR_AMD64_TSC_RATIO0xc0000104
> >  #define MSR_AMD64_NB_CFG   0xc001001f
> >  #define MSR_AMD64_PATCH_LOADER 0xc0010020
> > +#define MSR_AMD64_FREQ_SENSITIVITY 0xc0010080
> >  #define MSR_AMD64_OSVW_ID_LENGTH   0xc0010140
> >  #define MSR_AMD64_OSVW_STATUS  0xc0010141
> >  #define MSR_AMD64_DC_CFG   0xc0011022
> 
> My guess is, this MSR won't be used outside of cpufreq so you probably
> want to define it there, in amd_freq_sensitivity.c
> 
> > diff --git a/drivers/cpufreq/Kconfig.x86 b/drivers/cpufreq/Kconfig.x86
> > index d7dc0ed..6c714b0 100644
> > --- a/drivers/cpufreq/Kconfig.x86
> > +++ b/drivers/cpufreq/Kconfig.x86
> > @@ -129,6 +129,16 @@ config X86_POWERNOW_K8
> >  
> >   For details, take a look at .
> >  
> > +config X86_AMD_FREQ_SENSITIVITY
> > +   tristate "AMD 'frequency sensitivity feedback' powersave bias"
> 
> Why in ' '? Isn't that the final name?

You are right, it does not need to be in quotes. I had first written
this as its own governor, and I was mimicking the Kconfig entries of
'ondemand', 'performance' .. and so on.

> 
> > +   depends on CPU_FREQ_GOV_ONDEMAND && X86_ACPI_CPUFREQ
> 
> depends on CPU_SUP_AMD
> 
> > +   help
> > + This adds support for 'frequency sensitivity feedback' feature on
> > + supported AMD processors, which hooks into the ondemand governor's
> > + powersave bias to influence frequency change decisions.
> 
> Your description about the feature in the 0/2 message is much better
> than this one here. How about adding it here too?
> 
> > +
> > + If in doubt, say N.
> > +
> >  config X86_GX_SUSPMOD
> > tristate "Cyrix MediaGX/NatSemi Geode Suspend Modulation"
> > depends on X86_32 && PCI
> > diff --git a/drivers/cpufreq/Makefile b/drivers/cpufreq/Makefile
> > index 863fd18..01dfdaf 100644
> > --- a/drivers/cpufreq/Makefile
> > +++ b/drivers/cpufreq/Makefile
> > @@ -41,6 +41,7 @@ obj-$(CONFIG_X86_SPEEDSTEP_CENTRINO)  += 
> > speedstep-centrino.o
> >  obj-$(CONFIG_X86_P4_CLOCKMOD)  += p4-clockmod.o
> >  obj-$(CONFIG_X86_CPUFREQ_NFORCE2)  += cpufreq-nforce2.o
> >  obj-$(CONFIG_X86_INTEL_PSTATE) += intel_pstate.o
> > +obj-$(CONFIG_X86_AMD_FREQ_SENSITIVITY) += amd_freq_sensitivity.o
> >  
> >  
> > ##
> >  # ARM SoC drivers
> > diff --git a/drivers/cpufreq/amd_freq_sensitivity.c 
> > b/drivers/cpufreq/amd_freq_sensitivity.c
> > new file mode 100644
> > index 000..997feb0
> > --- /dev/null
> > +++ b/drivers/cpufreq/amd_freq_sensitivity.c
> > @@ -0,0 +1,147 @@
> > +/*
> > + * amd_freq_sensitivity.c: AMD "frequency sensitivity feedback" powersave 
> > bias
> > + * for ondemand governor.
> > + *
> > + * Copyright (C) 2013 Advanced Micro Devices, Inc.
> 
> You probably want to leave an email address in here for contacting you
> when it is b0rked. :-)
> 
> > + *
> > + * This program is free software; you can redistribute it and/or modify
> > + * it under the terms of the GNU General Public License version 

Re: [PATCH 0/3] perf, amd: Support for Family 16h L2I Performance Counters

2013-04-02 Thread Jacob Shin
On Wed, Mar 27, 2013 at 06:07:01PM -0500, Jacob Shin wrote:
> Upcoming AMD Family 16h Processors provide 4 new performance counters
> to count L2 related events. Similar to northbridge counters, these new
> counters are shared across multiple CPUs that share the same L2 cache.
> This patchset adds support for these new counters and enforces sharing
> by leveraging the existing sharing logic used for the northbridge
> counters.

Ingo, please consider committing to perf/core for 3.10. This patchset
is very similar to our northbridge counter support that went into 3.9:
https://lkml.org/lkml/2013/2/18/81

This series adds support for yet another set of new counters.

Thank you,

> 
> Jacob Shin (3):
>   perf, amd: Further generalize NB event constraints handling logic
>   perf, x86: Allow for multiple kfree_on_online pointers
>   perf, amd: Enable L2I performance counters on AMD Family 16h
> 
>  arch/x86/include/asm/cpufeature.h  |2 +
>  arch/x86/include/asm/perf_event.h  |4 +
>  arch/x86/include/uapi/asm/msr-index.h  |4 +
>  arch/x86/kernel/cpu/perf_event.c   |7 +-
>  arch/x86/kernel/cpu/perf_event.h   |   11 +-
>  arch/x86/kernel/cpu/perf_event_amd.c   |  227 
> +---
>  arch/x86/kernel/cpu/perf_event_intel.c |2 +-
>  7 files changed, 199 insertions(+), 58 deletions(-)
> 
> -- 
> 1.7.9.5
> 



[PATCH V3 0/2] cpufreq: ondemand: add AMD specific powersave bias

2013-04-02 Thread Jacob Shin
This patchset adds an AMD-specific powersave bias function to the ondemand
governor, which can be used to help the ondemand governor make more
power-conscious frequency change decisions based on feedback from hardware
(available on AMD Family 16h and above).

Hardware feedback tells software how "sensitive" to frequency changes the
workloads are. CPU-bound workloads will be more sensitive -- they will
perform better as frequency increases. Memory/IO-bound workloads will be less
sensitive -- they will not necessarily perform better as frequency increases.

This patchset was compared against the ondemand governor without powersave
bias and did not show any performance degradation on CPU-bound workloads such
as kernbench and unixbench, while saving power on memory-bound workloads such
as stream.

V3:
* Added the CPUID bit to cpufeature.h
* Added MODULE_DEVICE_TABLE to autoload this driver.
* Other small changes per feedback from:
  https://lkml.org/lkml/2013/4/2/349

V2:
* Added proper include files to amd_freq_sensitivity.c
* Only register powersave_bias_target function pointer and not the entire
  od_ops.

Jacob Shin (2):
  cpufreq: ondemand: allow custom powersave_bias_target handler to be
registered
  cpufreq: AMD "frequency sensitivity feedback" powersave bias for
ondemand governor

 arch/x86/include/asm/cpufeature.h  |1 +
 arch/x86/kernel/cpu/scattered.c|3 +-
 drivers/cpufreq/Kconfig.x86|   17 
 drivers/cpufreq/Makefile   |1 +
 drivers/cpufreq/amd_freq_sensitivity.c |  150 
 drivers/cpufreq/cpufreq_governor.h |3 +
 drivers/cpufreq/cpufreq_ondemand.c |   32 +--
 7 files changed, 198 insertions(+), 9 deletions(-)
 create mode 100644 drivers/cpufreq/amd_freq_sensitivity.c

-- 
1.7.9.5




[PATCH V3 1/2] cpufreq: ondemand: allow custom powersave_bias_target handler to be registered

2013-04-02 Thread Jacob Shin
This allows for another [arch specific] driver to hook into existing
powersave bias function of the ondemand governor. i.e. This allows AMD
specific powersave bias function (in a separate AMD specific driver)
to aid ondemand governor's frequency transition decisions.

Signed-off-by: Jacob Shin 
---
 drivers/cpufreq/cpufreq_governor.h |3 +++
 drivers/cpufreq/cpufreq_ondemand.c |   32 
 2 files changed, 27 insertions(+), 8 deletions(-)

diff --git a/drivers/cpufreq/cpufreq_governor.h 
b/drivers/cpufreq/cpufreq_governor.h
index 6593769..f52bf17 100644
--- a/drivers/cpufreq/cpufreq_governor.h
+++ b/drivers/cpufreq/cpufreq_governor.h
@@ -263,4 +263,7 @@ int cpufreq_governor_dbs(struct cpufreq_policy *policy,
struct common_dbs_data *cdata, unsigned int event);
 void gov_queue_work(struct dbs_data *dbs_data, struct cpufreq_policy *policy,
unsigned int delay, bool all_cpus);
+void od_register_powersave_bias_handler(unsigned int (*f)
+   (struct cpufreq_policy *, unsigned int, unsigned int));
+void od_unregister_powersave_bias_handler(void);
 #endif /* _CPUFREQ_GOVERNOR_H */
diff --git a/drivers/cpufreq/cpufreq_ondemand.c 
b/drivers/cpufreq/cpufreq_ondemand.c
index 1471478..e43611d 100644
--- a/drivers/cpufreq/cpufreq_ondemand.c
+++ b/drivers/cpufreq/cpufreq_ondemand.c
@@ -40,6 +40,8 @@
 
 static DEFINE_PER_CPU(struct od_cpu_dbs_info_s, od_cpu_dbs_info);
 
+static struct od_ops od_ops;
+
 #ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND
 static struct cpufreq_governor cpufreq_gov_ondemand;
 #endif
@@ -80,7 +82,7 @@ static int should_io_be_busy(void)
  * Returns the freq_hi to be used right now and will set freq_hi_jiffies,
  * freq_lo, and freq_lo_jiffies in percpu area for averaging freqs.
  */
-static unsigned int powersave_bias_target(struct cpufreq_policy *policy,
+static unsigned int generic_powersave_bias_target(struct cpufreq_policy 
*policy,
unsigned int freq_next, unsigned int relation)
 {
unsigned int freq_req, freq_reduc, freq_avg;
@@ -145,7 +147,8 @@ static void dbs_freq_increase(struct cpufreq_policy *p, 
unsigned int freq)
struct od_dbs_tuners *od_tuners = dbs_data->tuners;
 
if (od_tuners->powersave_bias)
-   freq = powersave_bias_target(p, freq, CPUFREQ_RELATION_H);
+   freq = od_ops.powersave_bias_target(p, freq,
+   CPUFREQ_RELATION_H);
else if (p->cur == p->max)
return;
 
@@ -205,12 +208,12 @@ static void od_check_cpu(int cpu, unsigned int load_freq)
if (!od_tuners->powersave_bias) {
__cpufreq_driver_target(policy, freq_next,
CPUFREQ_RELATION_L);
-   } else {
-   int freq = powersave_bias_target(policy, freq_next,
-   CPUFREQ_RELATION_L);
-   __cpufreq_driver_target(policy, freq,
-   CPUFREQ_RELATION_L);
+   return;
}
+
+   freq_next = od_ops.powersave_bias_target(policy, freq_next,
+   CPUFREQ_RELATION_L);
+   __cpufreq_driver_target(policy, freq_next, CPUFREQ_RELATION_L);
}
 }
 
@@ -557,7 +560,7 @@ define_get_cpu_dbs_routines(od_cpu_dbs_info);
 
 static struct od_ops od_ops = {
.powersave_bias_init_cpu = ondemand_powersave_bias_init_cpu,
-   .powersave_bias_target = powersave_bias_target,
+   .powersave_bias_target = generic_powersave_bias_target,
.freq_increase = dbs_freq_increase,
 };
 
@@ -574,6 +577,19 @@ static struct common_dbs_data od_dbs_cdata = {
.exit = od_exit,
 };
 
+void od_register_powersave_bias_handler(unsigned int (*f)
+   (struct cpufreq_policy *, unsigned int, unsigned int))
+{
+   od_ops.powersave_bias_target = f;
+}
+EXPORT_SYMBOL_GPL(od_register_powersave_bias_handler);
+
+void od_unregister_powersave_bias_handler(void)
+{
+   od_ops.powersave_bias_target = generic_powersave_bias_target;
+}
+EXPORT_SYMBOL_GPL(od_unregister_powersave_bias_handler);
+
 static int od_cpufreq_governor_dbs(struct cpufreq_policy *policy,
unsigned int event)
 {
-- 
1.7.9.5




[PATCH V3 2/2] cpufreq: AMD "frequency sensitivity feedback" powersave bias for ondemand governor

2013-04-02 Thread Jacob Shin
Future AMD processors, starting with Family 16h, can provide software
with feedback on how the workload may respond to frequency change --
memory-bound workloads will not benefit from higher frequency, whereas
compute-bound workloads will. This patch enables this "frequency
sensitivity feedback" to aid the ondemand governor to make better
frequency change decisions by hooking into the powersave bias.

Signed-off-by: Jacob Shin 
---
 arch/x86/include/asm/cpufeature.h  |1 +
 arch/x86/kernel/cpu/scattered.c|3 +-
 drivers/cpufreq/Kconfig.x86|   17 
 drivers/cpufreq/Makefile   |1 +
 drivers/cpufreq/amd_freq_sensitivity.c |  150 
 5 files changed, 171 insertions(+), 1 deletion(-)
 create mode 100644 drivers/cpufreq/amd_freq_sensitivity.c

diff --git a/arch/x86/include/asm/cpufeature.h 
b/arch/x86/include/asm/cpufeature.h
index 93fe929..9e22520 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -182,6 +182,7 @@
 #define X86_FEATURE_PTS(7*32+ 6) /* Intel Package Thermal 
Status */
 #define X86_FEATURE_DTHERM (7*32+ 7) /* Digital Thermal Sensor */
 #define X86_FEATURE_HW_PSTATE  (7*32+ 8) /* AMD HW-PState */
+#define X86_FEATURE_PROC_FEEDBACK (7*32+ 9) /* AMD ProcFeedbackInterface */
 
 /* Virtualization flags: Linux defined, word 8 */
 #define X86_FEATURE_TPR_SHADOW  (8*32+ 0) /* Intel TPR Shadow */
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index ee8e9ab..d92b5da 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -39,8 +39,9 @@ void __cpuinit init_scattered_cpuid_features(struct 
cpuinfo_x86 *c)
{ X86_FEATURE_APERFMPERF,   CR_ECX, 0, 0x00000006, 0 },
{ X86_FEATURE_EPB,  CR_ECX, 3, 0x00000006, 0 },
{ X86_FEATURE_XSAVEOPT, CR_EAX, 0, 0x0000000d, 1 },
-   { X86_FEATURE_CPB,  CR_EDX, 9, 0x80000007, 0 },
{ X86_FEATURE_HW_PSTATE,CR_EDX, 7, 0x80000007, 0 },
+   { X86_FEATURE_CPB,  CR_EDX, 9, 0x80000007, 0 },
+   { X86_FEATURE_PROC_FEEDBACK,CR_EDX,11, 0x80000007, 0 },
{ X86_FEATURE_NPT,  CR_EDX, 0, 0x8000000a, 0 },
{ X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 },
{ X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a, 0 },
diff --git a/drivers/cpufreq/Kconfig.x86 b/drivers/cpufreq/Kconfig.x86
index d7dc0ed..018fced 100644
--- a/drivers/cpufreq/Kconfig.x86
+++ b/drivers/cpufreq/Kconfig.x86
@@ -129,6 +129,23 @@ config X86_POWERNOW_K8
 
  For details, take a look at .
 
+config X86_AMD_FREQ_SENSITIVITY
+   tristate "AMD frequency sensitivity feedback powersave bias"
+   depends on CPU_FREQ_GOV_ONDEMAND && X86_ACPI_CPUFREQ && CPU_SUP_AMD
+   help
+ This adds AMD specific powersave bias function to the ondemand
+ governor; which can be used to help ondemand governor make more power
+ conscious frequency change decisions based on feedback from hardware
+ (availble on AMD Family 16h and above).
+
+ Hardware feedback tells software how "sensitive" to frequency changes
+ the CPUs' workloads are. CPU-bound workloads will be more sensitive
+ -- they will perform better as frequency increases. Memory/IO-bound
+ workloads will be less sensitive -- they will not necessarily perform
+ better as frequnecy increases.
+
+ If in doubt, say N.
+
 config X86_GX_SUSPMOD
tristate "Cyrix MediaGX/NatSemi Geode Suspend Modulation"
depends on X86_32 && PCI
diff --git a/drivers/cpufreq/Makefile b/drivers/cpufreq/Makefile
index 863fd18..01dfdaf 100644
--- a/drivers/cpufreq/Makefile
+++ b/drivers/cpufreq/Makefile
@@ -41,6 +41,7 @@ obj-$(CONFIG_X86_SPEEDSTEP_CENTRINO)  += speedstep-centrino.o
 obj-$(CONFIG_X86_P4_CLOCKMOD)  += p4-clockmod.o
 obj-$(CONFIG_X86_CPUFREQ_NFORCE2)  += cpufreq-nforce2.o
 obj-$(CONFIG_X86_INTEL_PSTATE) += intel_pstate.o
+obj-$(CONFIG_X86_AMD_FREQ_SENSITIVITY) += amd_freq_sensitivity.o
 
 
##
 # ARM SoC drivers
diff --git a/drivers/cpufreq/amd_freq_sensitivity.c 
b/drivers/cpufreq/amd_freq_sensitivity.c
new file mode 100644
index 000..e3e62d2
--- /dev/null
+++ b/drivers/cpufreq/amd_freq_sensitivity.c
@@ -0,0 +1,150 @@
+/*
+ * amd_freq_sensitivity.c: AMD frequency sensitivity feedback powersave bias
+ * for the ondemand governor.
+ *
+ * Copyright (C) 2013 Advanced Micro Devices, Inc.
+ *
+ * Author: Jacob Shin 
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free S

Re: [PATCH V3 2/2] cpufreq: AMD "frequency sensitivity feedback" powersave bias for ondemand governor

2013-04-02 Thread Jacob Shin
On Tue, Apr 02, 2013 at 09:23:52PM +0200, Borislav Petkov wrote:
> On Tue, Apr 02, 2013 at 01:11:44PM -0500, Jacob Shin wrote:
> > Future AMD processors, starting with Family 16h, can provide software
> > with feedback on how the workload may respond to frequency change --
> > memory-bound workloads will not benefit from higher frequency, where
> > as compute-bound workloads will. This patch enables this "frequency
> > sensitivity feedback" to aid the ondemand governor to make better
> > frequency change decisions by hooking into the powersave bias.
> > 
> > Signed-off-by: Jacob Shin 
> > ---
> 
> [ … ]
> 
> > --- a/drivers/cpufreq/Kconfig.x86
> > +++ b/drivers/cpufreq/Kconfig.x86
> > @@ -129,6 +129,23 @@ config X86_POWERNOW_K8
> >  
> >   For details, take a look at .
> >  
> > +config X86_AMD_FREQ_SENSITIVITY
> 
> /me is turning on his spell checker...

Yikes, sorry about that (*ashamed*), will remember to run spellcheck
next time.

> 
> > +   tristate "AMD frequency sensitivity feedback powersave bias"
> > +   depends on CPU_FREQ_GOV_ONDEMAND && X86_ACPI_CPUFREQ && CPU_SUP_AMD
> > +   help
> > + This adds AMD specific powersave bias function to the ondemand
> 
>   AMD-specific
> 
> > + governor; which can be used to help ondemand governor make more power
> 
> "... governor, which allows it to make more power-conscious frequency
> change decisions based on ..."
> 
> > + conscious frequency change decisions based on feedback from hardware
> > + (availble on AMD Family 16h and above).
> 
> s/availble/available/
> 
> > +
> > + Hardware feedback tells software how "sensitive" to frequency changes
> > + the CPUs' workloads are. CPU-bound workloads will be more sensitive
> > + -- they will perform better as frequency increases. Memory/IO-bound
> > + workloads will be less sensitive -- they will not necessarily perform
> > + better as frequnecy increases.
> 
> s/frequnecy/frequency/
> 
> > +
> > + If in doubt, say N.
> > +
> >  config X86_GX_SUSPMOD
> > tristate "Cyrix MediaGX/NatSemi Geode Suspend Modulation"
> > depends on X86_32 && PCI
> > diff --git a/drivers/cpufreq/Makefile b/drivers/cpufreq/Makefile
> > index 863fd18..01dfdaf 100644
> > --- a/drivers/cpufreq/Makefile
> > +++ b/drivers/cpufreq/Makefile
> > @@ -41,6 +41,7 @@ obj-$(CONFIG_X86_SPEEDSTEP_CENTRINO)  += 
> > speedstep-centrino.o
> >  obj-$(CONFIG_X86_P4_CLOCKMOD)  += p4-clockmod.o
> >  obj-$(CONFIG_X86_CPUFREQ_NFORCE2)  += cpufreq-nforce2.o
> >  obj-$(CONFIG_X86_INTEL_PSTATE) += intel_pstate.o
> > +obj-$(CONFIG_X86_AMD_FREQ_SENSITIVITY) += amd_freq_sensitivity.o
> >  
> >  
> > ##
> >  # ARM SoC drivers
> > diff --git a/drivers/cpufreq/amd_freq_sensitivity.c 
> > b/drivers/cpufreq/amd_freq_sensitivity.c
> > new file mode 100644
> > index 000..e3e62d2
> > --- /dev/null
> > +++ b/drivers/cpufreq/amd_freq_sensitivity.c
> > @@ -0,0 +1,150 @@
> > +/*
> > + * amd_freq_sensitivity.c: AMD frequency sensitivity feedback powersave 
> > bias
> > + * for the ondemand governor.
> > + *
> > + * Copyright (C) 2013 Advanced Micro Devices, Inc.
> > + *
> > + * Author: Jacob Shin 
> > + *
> > + * This program is free software; you can redistribute it and/or modify
> > + * it under the terms of the GNU General Public License version 2 as
> > + * published by the Free Software Foundation.
> > + */
> > +
> > +#include 
> > +#include 
> > +#include 
> > +#include 
> > +#include 
> > +#include 
> > +
> > +#include 
> > +#include 
> > +
> > +#include "cpufreq_governor.h"
> > +
> > +#define MSR_AMD64_FREQ_SENSITIVITY_ACTUAL  0xc0010080
> > +#define MSR_AMD64_FREQ_SENSITIVITY_REFERENCE   0xc0010081
> > +#define CLASS_CODE_SHIFT   56
> > +#define CLASS_CODE_CORE_FREQ_SENSITIVITY   0x01
> > +#define POWERSAVE_BIAS_MAX 1000
> > +
> > +struct cpu_data_t {
> > +   u64 actual;
> > +   u64 reference;
> > +   unsigned int freq_prev;
> > +};
> > +
> > +static DEFINE_PER_CPU(struct cpu_data_t, cpu_data);
> > +
> > +static unsigned int amd_powersave_bias_target(struct cpufreq_policy 
> 

Re: [PATCH V3 2/2] cpufreq: AMD "frequency sensitivity feedback" powersave bias for ondemand governor

2013-04-03 Thread Jacob Shin
On Tue, Apr 02, 2013 at 11:01:24PM +0200, Borislav Petkov wrote:
> On Tue, Apr 02, 2013 at 10:51:51PM +0200, Thomas Renninger wrote:
> > powersave_bias is undocumented in Documentation/cpu-freq/...
> > I guess its use-case is for people who want to get some percent more
> > power savings out of their laptop and do not care of the one or other
> > percent performance.
> > In fact I would like to get rid of this extra code and I expect nobody 
> > would 
> > miss it.
> > I might miss a configuration tool where someone went through the code,
> > documented things and allows users to set powersave_bias values through
> > some /etc/* config files.
> > Yep, if you want anyone to make use of this, it should better get
> > embedded in more general, at least general ondemand code.
> 
> Yeah, it all sounds like we want to enable this by default on systems
> which support it. Maybe with an off-switch for people who want plain
> ondemand decisions.
> 
> The remaining systems with ripped out powersave_bias would get plain
> ondemand governor decisions. Provided, of course, nobody uses
> powersave_bias and the functionality doesn't make any sense anyway.

Rafael, any thoughts on removing powersave_bias altogether?

If we remove it, then is it acceptable to add an alternate callback/
handler registration to the ondemand governor to account for hardware
feedback?

Or, if we don't want to remove powersave_bias,

Then Thomas, Boris, would it be acceptable to enable the frequency
feedback feature by default with a sane powersave_bias tunable value?
And also add proper documentation for both vanilla powersave_bias and
powersave_bias with AMD frequency sensitivity loaded to
Documentation/cpu-freq/ondemand?

> 
> Thanks.
> 
> -- 
> Regards/Gruss,
> Boris.
> 
> Sent from a fat crate under my desk. Formatting is fine.
> --
> 



Re: [PATCH V3 2/2] cpufreq: AMD "frequency sensitivity feedback" powersave bias for ondemand governor

2013-04-03 Thread Jacob Shin
On Wed, Apr 03, 2013 at 07:04:56PM +0200, Borislav Petkov wrote:
> On Wed, Apr 03, 2013 at 11:53:24AM -0500, Jacob Shin wrote:
> > Then Thomas, Boris, would it be acceptable if enable the frequency
> > feedback feature by default with a sane powersave_bias tunable value?
> > And also add proper documentation for both vanila powersave_bias
> > and powersave_bias with AMD frequency sensitivity loaded to
> > Documentation/cpu-freq/ondemand ?
> 
> Yeah, this was what I was proposing, basically. The only question here
> is, would anyone want to disable freq decisions on systems with hw
> feedback? If yes, then you'd need to be able to disable the feedback
> thing, maybe have a magic value for powersave_bias...

Writing 0 to powersave_bias or unloading the AMD driver could do that.

When the AMD driver loads, it will give a sane default value to
powersave_bias to enable it; when it unloads, it will put it back to 0.
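
i.e. roughly like this (a sketch against the handler-registration
interface the V4 patches below end up adding; amd_powersave_bias_target()
is the driver's own callback and 400 is the default threshold described
in the documentation patch):

#include <linux/module.h>
#include "cpufreq_governor.h"

static unsigned int amd_powersave_bias_target(struct cpufreq_policy *policy,
					      unsigned int freq_next,
					      unsigned int relation);

static int __init amd_freq_sensitivity_init(void)
{
	/* ... feature detection / MSR sanity checks ... */
	od_register_powersave_bias_handler(amd_powersave_bias_target, 400);
	return 0;
}
module_init(amd_freq_sensitivity_init);

static void __exit amd_freq_sensitivity_exit(void)
{
	/* drops back to the generic bias function and powersave_bias = 0 */
	od_unregister_powersave_bias_handler();
}
module_exit(amd_freq_sensitivity_exit);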

> 
> Thanks.
> 
> -- 
> Regards/Gruss,
> Boris.
> 
> Sent from a fat crate under my desk. Formatting is fine.
> --



[PATCH V4 1/2] cpufreq: ondemand: allow custom powersave_bias_target handler to be registered

2013-04-04 Thread Jacob Shin
This allows for another [arch specific] driver to hook into existing
powersave bias function of the ondemand governor. i.e. This allows AMD
specific powersave bias function (in a separate AMD specific driver)
to aid ondemand governor's frequency transition decisions.

Signed-off-by: Jacob Shin 
---
 drivers/cpufreq/cpufreq_governor.h |4 +++
 drivers/cpufreq/cpufreq_ondemand.c |   56 ++--
 2 files changed, 52 insertions(+), 8 deletions(-)

diff --git a/drivers/cpufreq/cpufreq_governor.h 
b/drivers/cpufreq/cpufreq_governor.h
index 6593769..8ac3353 100644
--- a/drivers/cpufreq/cpufreq_governor.h
+++ b/drivers/cpufreq/cpufreq_governor.h
@@ -263,4 +263,8 @@ int cpufreq_governor_dbs(struct cpufreq_policy *policy,
struct common_dbs_data *cdata, unsigned int event);
 void gov_queue_work(struct dbs_data *dbs_data, struct cpufreq_policy *policy,
unsigned int delay, bool all_cpus);
+void od_register_powersave_bias_handler(unsigned int (*f)
+   (struct cpufreq_policy *, unsigned int, unsigned int),
+   unsigned int powersave_bias);
+void od_unregister_powersave_bias_handler(void);
 #endif /* _CPUFREQ_GOVERNOR_H */
diff --git a/drivers/cpufreq/cpufreq_ondemand.c 
b/drivers/cpufreq/cpufreq_ondemand.c
index 1471478..e5d1e8c 100644
--- a/drivers/cpufreq/cpufreq_ondemand.c
+++ b/drivers/cpufreq/cpufreq_ondemand.c
@@ -40,6 +40,8 @@
 
 static DEFINE_PER_CPU(struct od_cpu_dbs_info_s, od_cpu_dbs_info);
 
+static struct od_ops od_ops;
+
 #ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND
 static struct cpufreq_governor cpufreq_gov_ondemand;
 #endif
@@ -80,7 +82,7 @@ static int should_io_be_busy(void)
  * Returns the freq_hi to be used right now and will set freq_hi_jiffies,
  * freq_lo, and freq_lo_jiffies in percpu area for averaging freqs.
  */
-static unsigned int powersave_bias_target(struct cpufreq_policy *policy,
+static unsigned int generic_powersave_bias_target(struct cpufreq_policy 
*policy,
unsigned int freq_next, unsigned int relation)
 {
unsigned int freq_req, freq_reduc, freq_avg;
@@ -145,7 +147,8 @@ static void dbs_freq_increase(struct cpufreq_policy *p, 
unsigned int freq)
struct od_dbs_tuners *od_tuners = dbs_data->tuners;
 
if (od_tuners->powersave_bias)
-   freq = powersave_bias_target(p, freq, CPUFREQ_RELATION_H);
+   freq = od_ops.powersave_bias_target(p, freq,
+   CPUFREQ_RELATION_H);
else if (p->cur == p->max)
return;
 
@@ -205,12 +208,12 @@ static void od_check_cpu(int cpu, unsigned int load_freq)
if (!od_tuners->powersave_bias) {
__cpufreq_driver_target(policy, freq_next,
CPUFREQ_RELATION_L);
-   } else {
-   int freq = powersave_bias_target(policy, freq_next,
-   CPUFREQ_RELATION_L);
-   __cpufreq_driver_target(policy, freq,
-   CPUFREQ_RELATION_L);
+   return;
}
+
+   freq_next = od_ops.powersave_bias_target(policy, freq_next,
+   CPUFREQ_RELATION_L);
+   __cpufreq_driver_target(policy, freq_next, CPUFREQ_RELATION_L);
}
 }
 
@@ -557,7 +560,7 @@ define_get_cpu_dbs_routines(od_cpu_dbs_info);
 
 static struct od_ops od_ops = {
.powersave_bias_init_cpu = ondemand_powersave_bias_init_cpu,
-   .powersave_bias_target = powersave_bias_target,
+   .powersave_bias_target = generic_powersave_bias_target,
.freq_increase = dbs_freq_increase,
 };
 
@@ -574,6 +577,43 @@ static struct common_dbs_data od_dbs_cdata = {
.exit = od_exit,
 };
 
+static void od_set_powersave_bias(unsigned int powersave_bias)
+{
+   unsigned int cpu;
+   struct od_dbs_tuners *od_tuners;
+
+   if (!have_governor_per_policy()) {
+   od_tuners = od_dbs_cdata.gdbs_data->tuners;
+   od_tuners->powersave_bias = powersave_bias;
+   return;
+   }
+
+   for_each_online_cpu(cpu) {
+   struct cpufreq_policy *policy;
+   struct dbs_data *dbs_data;
+   policy = per_cpu(od_cpu_dbs_info, cpu).cdbs.cur_policy;
+   dbs_data = policy->governor_data;
+   od_tuners = dbs_data->tuners;
+   od_tuners->powersave_bias = powersave_bias;
+   }
+}
+
+void od_register_powersave_bias_handler(unsigned int (*f)
+   (struct cpufreq_policy *, unsigned int, unsigned int),
+   unsigned int powersave_bias)
+{
+   od_ops.powersave_bias_target = f;
+   od_set_powersave_bias(powersave_bias);
+}
+EXPORT_SYMBOL_GPL(od_register_powersave_bias_handler);
+
+void od_unregister_powersave_bias_handler(void)
+{
+   od_ops.powersave_bias_tar

[PATCH V4 2/2] cpufreq: AMD "frequency sensitivity feedback" powersave bias for ondemand governor

2013-04-04 Thread Jacob Shin
Future AMD processors, starting with Family 16h, can provide software
with feedback on how the workload may respond to frequency change --
memory-bound workloads will not benefit from higher frequency, whereas
compute-bound workloads will. This patch enables this "frequency
sensitivity feedback" to aid the ondemand governor to make better
frequency change decisions by hooking into the powersave bias.

Signed-off-by: Jacob Shin 
---
 Documentation/cpu-freq/governors.txt   |   21 +
 arch/x86/include/asm/cpufeature.h  |1 +
 arch/x86/kernel/cpu/scattered.c|3 +-
 drivers/cpufreq/Kconfig.x86|   17 
 drivers/cpufreq/Makefile   |1 +
 drivers/cpufreq/amd_freq_sensitivity.c |  148 
 6 files changed, 190 insertions(+), 1 deletion(-)
 create mode 100644 drivers/cpufreq/amd_freq_sensitivity.c

diff --git a/Documentation/cpu-freq/governors.txt 
b/Documentation/cpu-freq/governors.txt
index 4dfed30..66f9cc3 100644
--- a/Documentation/cpu-freq/governors.txt
+++ b/Documentation/cpu-freq/governors.txt
@@ -167,6 +167,27 @@ of load evaluation and helping the CPU stay at its top 
speed when truly
 busy, rather than shifting back and forth in speed. This tunable has no
 effect on behavior at lower speeds/lower CPU loads.
 
+powersave_bias: this parameter takes a value between 0 to 1000. It
+defines the percentage (times 10) value of the target frequency that
+will be shaved off of the target. For example, when set to 100 -- 10%,
+when ondemand governor would have targeted 1000 MHz, it will target
+1000 MHz - (10% of 1000 MHz) = 900 MHz instead. This is set to 0
+(disabled) by default.
+When AMD frequency sensitivity powersave bias driver --
+drivers/cpufreq/amd_freq_sensitivity.c is loaded, this parameter
+defines the workload frequency sensitivity threshold in which a lower
+frequency is chosen instead of ondemand governor's original target.
+The frequency sensitivity is a hardware reported (on AMD Family 16h
+Processors and above) value between 0 to 100% that tells software how
+the performance of the workload running on a CPU will change when
+frequency changes. A workload with sensitivity of 0% (memory/IO-bound)
+will not perform any better on higher core frequency, whereas a
+workload with sensitivity of 100% (CPU-bound) will perform better
+higher the frequency. When the driver is loaded, this is set to 400
+by default -- for CPUs running workloads with sensitivity value below
+40%, a lower frequency is chosen. Unloading the driver or writing 0
+will disable this feature.
+
 
 2.5 Conservative
 
diff --git a/arch/x86/include/asm/cpufeature.h 
b/arch/x86/include/asm/cpufeature.h
index 93fe929..9e22520 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -182,6 +182,7 @@
 #define X86_FEATURE_PTS(7*32+ 6) /* Intel Package Thermal 
Status */
 #define X86_FEATURE_DTHERM (7*32+ 7) /* Digital Thermal Sensor */
 #define X86_FEATURE_HW_PSTATE  (7*32+ 8) /* AMD HW-PState */
+#define X86_FEATURE_PROC_FEEDBACK (7*32+ 9) /* AMD ProcFeedbackInterface */
 
 /* Virtualization flags: Linux defined, word 8 */
 #define X86_FEATURE_TPR_SHADOW  (8*32+ 0) /* Intel TPR Shadow */
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index ee8e9ab..d92b5da 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -39,8 +39,9 @@ void __cpuinit init_scattered_cpuid_features(struct 
cpuinfo_x86 *c)
{ X86_FEATURE_APERFMPERF,   CR_ECX, 0, 0x00000006, 0 },
{ X86_FEATURE_EPB,  CR_ECX, 3, 0x00000006, 0 },
{ X86_FEATURE_XSAVEOPT, CR_EAX, 0, 0x0000000d, 1 },
-   { X86_FEATURE_CPB,  CR_EDX, 9, 0x80000007, 0 },
{ X86_FEATURE_HW_PSTATE,CR_EDX, 7, 0x80000007, 0 },
+   { X86_FEATURE_CPB,  CR_EDX, 9, 0x80000007, 0 },
+   { X86_FEATURE_PROC_FEEDBACK,CR_EDX,11, 0x80000007, 0 },
{ X86_FEATURE_NPT,  CR_EDX, 0, 0x8000000a, 0 },
{ X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 },
{ X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a, 0 },
diff --git a/drivers/cpufreq/Kconfig.x86 b/drivers/cpufreq/Kconfig.x86
index d7dc0ed..2b8a8c3 100644
--- a/drivers/cpufreq/Kconfig.x86
+++ b/drivers/cpufreq/Kconfig.x86
@@ -129,6 +129,23 @@ config X86_POWERNOW_K8
 
  For details, take a look at .
 
+config X86_AMD_FREQ_SENSITIVITY
+   tristate "AMD frequency sensitivity feedback powersave bias"
+   depends on CPU_FREQ_GOV_ONDEMAND && X86_ACPI_CPUFREQ && CPU_SUP_AMD
+   help
+ This adds AMD-specific powersave bias function to the ondemand
+ governor, which allows it to make more power-conscious frequency
+ change decisions based on feedback from hardware (avail

[PATCH V4 0/2] cpufreq: ondemand: add AMD specific powersave bias

2013-04-04 Thread Jacob Shin
This patchset adds an AMD-specific powersave bias function to the ondemand
governor, which can be used to help the ondemand governor make more
power-conscious frequency change decisions based on feedback from hardware
(available on AMD Family 16h and above).

Hardware feedback tells software how "sensitive" to frequency changes the
workloads are. CPU-bound workloads will be more sensitive -- they will
perform better as frequency increases. Memory/IO-bound workloads will be less
sensitive -- they will not necessarily perform better as frequency increases.

This patchset was compared against the ondemand governor without powersave
bias and did not show any performance degradation on CPU-bound workloads such
as kernbench and unixbench, while saving power on memory-bound workloads such
as stream.
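
To make the mechanism concrete, here is a simplified model of the
decision (illustration only, not the driver code; the MSR numbers and
the 0..1000 scale come from the patches, and the exact arithmetic in
the real driver differs):

/* Illustrative model of the AMD powersave bias decision. */
#define MSR_AMD64_FREQ_SENSITIVITY_ACTUAL	0xc0010080
#define MSR_AMD64_FREQ_SENSITIVITY_REFERENCE	0xc0010081
#define POWERSAVE_BIAS_MAX			1000

struct sample {
	unsigned long long actual;	/* from ..._ACTUAL */
	unsigned long long reference;	/* from ..._REFERENCE */
};

/* 0 == fully memory/IO-bound, 1000 == fully CPU-bound */
static unsigned int sensitivity(const struct sample *prev,
				const struct sample *now)
{
	unsigned long long d_actual = now->actual - prev->actual;
	unsigned long long d_reference = now->reference - prev->reference;

	if (!d_reference)
		return 0;
	return (unsigned int)(POWERSAVE_BIAS_MAX * d_actual / d_reference);
}

/*
 * With the default powersave_bias of 400, workloads reporting less than
 * 40% sensitivity are held at a lower frequency than plain ondemand
 * would have picked.
 */
static unsigned int pick_freq(unsigned int ondemand_target,
			      unsigned int lower_freq,
			      unsigned int sens,
			      unsigned int powersave_bias)
{
	return (sens < powersave_bias) ? lower_freq : ondemand_target;
}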

V4:
* Added proper documentation to Documentation/cpu-freq/
* Revised so that when this driver loads, the feature is enabled by
  default with a sane tunable value.

V3:
* Added the CPUID bit to cpufeature.h
* Added MODULE_DEVICE_TABLE to autoload this driver.
* Other small changes per feedback from:
  https://lkml.org/lkml/2013/4/2/349

V2:
* Added proper include files to amd_freq_sensitivity.c
* Only register powersave_bias_target function pointer and not the entire
  od_ops.

Jacob Shin (2):
  cpufreq: ondemand: allow custom powersave_bias_target handler to be
registered
  cpufreq: AMD "frequency sensitivity feedback" powersave bias for
ondemand governor

 Documentation/cpu-freq/governors.txt   |   21 +
 arch/x86/include/asm/cpufeature.h  |1 +
 arch/x86/kernel/cpu/scattered.c|3 +-
 drivers/cpufreq/Kconfig.x86|   17 
 drivers/cpufreq/Makefile   |1 +
 drivers/cpufreq/amd_freq_sensitivity.c |  148 
 drivers/cpufreq/cpufreq_governor.h |4 +
 drivers/cpufreq/cpufreq_ondemand.c |   56 ++--
 8 files changed, 242 insertions(+), 9 deletions(-)
 create mode 100644 drivers/cpufreq/amd_freq_sensitivity.c

-- 
1.7.9.5




Re: [PATCH V4 1/2] cpufreq: ondemand: allow custom powersave_bias_target handler to be registered

2013-04-04 Thread Jacob Shin
On Thu, Apr 04, 2013 at 10:06:35PM +0530, Viresh Kumar wrote:
> On 4 April 2013 21:49, Jacob Shin  wrote:
> > diff --git a/drivers/cpufreq/cpufreq_ondemand.c 
> > b/drivers/cpufreq/cpufreq_ondemand.c
> 
> > +static void od_set_powersave_bias(unsigned int powersave_bias)
> > +{
> > +   unsigned int cpu;
> > +   struct od_dbs_tuners *od_tuners;
> > +
> > +   if (!have_governor_per_policy()) {
> > +   od_tuners = od_dbs_cdata.gdbs_data->tuners;
> > +   od_tuners->powersave_bias = powersave_bias;
> > +   return;
> > +   }
> > +
> > +   for_each_online_cpu(cpu) {
> > +   struct cpufreq_policy *policy;
> > +   struct dbs_data *dbs_data;
> > +   policy = per_cpu(od_cpu_dbs_info, cpu).cdbs.cur_policy;
> > +   dbs_data = policy->governor_data;
> > +   od_tuners = dbs_data->tuners;
> > +   od_tuners->powersave_bias = powersave_bias;
> > +   }
> 
> You can keep only the for_each_online_cpu() loop and remove the other
> one. And in that one also, you don't have to do this for every cpu...
> 
> something like this will help you...
> 
> cpus_processed = NULL;
> 
> for_each_online_cpu(cpu) {
> if cpu-is-present-in cpus_processed
> continue;
> 
> cpu-set-mask(cpus_processed, policy->cpus);
> 
> }
> 
> Syntax is poor, please choose the correct one.

Ah okay, thanks for the hint, here:

>From 59728d09d0dc5403c9bb0238336ecb367c04694f Mon Sep 17 00:00:00 2001
From: Jacob Shin 
Date: Tue, 2 Apr 2013 09:56:56 -0500
Subject: [PATCH 1/2] cpufreq: ondemand: allow custom powersave_bias_target
 handler to be registered

This allows for another [arch specific] driver to hook into existing
powersave bias function of the ondemand governor. i.e. This allows AMD
specific powersave bias function (in a separate AMD specific driver)
to aid ondemand governor's frequency transition decisions.

Signed-off-by: Jacob Shin 
---
 drivers/cpufreq/cpufreq_governor.h |4 +++
 drivers/cpufreq/cpufreq_ondemand.c |   58 +++-
 2 files changed, 54 insertions(+), 8 deletions(-)

diff --git a/drivers/cpufreq/cpufreq_governor.h 
b/drivers/cpufreq/cpufreq_governor.h
index 6593769..8ac3353 100644
--- a/drivers/cpufreq/cpufreq_governor.h
+++ b/drivers/cpufreq/cpufreq_governor.h
@@ -263,4 +263,8 @@ int cpufreq_governor_dbs(struct cpufreq_policy *policy,
struct common_dbs_data *cdata, unsigned int event);
 void gov_queue_work(struct dbs_data *dbs_data, struct cpufreq_policy *policy,
unsigned int delay, bool all_cpus);
+void od_register_powersave_bias_handler(unsigned int (*f)
+   (struct cpufreq_policy *, unsigned int, unsigned int),
+   unsigned int powersave_bias);
+void od_unregister_powersave_bias_handler(void);
 #endif /* _CPUFREQ_GOVERNOR_H */
diff --git a/drivers/cpufreq/cpufreq_ondemand.c 
b/drivers/cpufreq/cpufreq_ondemand.c
index 1471478..80fb624 100644
--- a/drivers/cpufreq/cpufreq_ondemand.c
+++ b/drivers/cpufreq/cpufreq_ondemand.c
@@ -40,6 +40,8 @@
 
 static DEFINE_PER_CPU(struct od_cpu_dbs_info_s, od_cpu_dbs_info);
 
+static struct od_ops od_ops;
+
 #ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND
 static struct cpufreq_governor cpufreq_gov_ondemand;
 #endif
@@ -80,7 +82,7 @@ static int should_io_be_busy(void)
  * Returns the freq_hi to be used right now and will set freq_hi_jiffies,
  * freq_lo, and freq_lo_jiffies in percpu area for averaging freqs.
  */
-static unsigned int powersave_bias_target(struct cpufreq_policy *policy,
+static unsigned int generic_powersave_bias_target(struct cpufreq_policy 
*policy,
unsigned int freq_next, unsigned int relation)
 {
unsigned int freq_req, freq_reduc, freq_avg;
@@ -145,7 +147,8 @@ static void dbs_freq_increase(struct cpufreq_policy *p, 
unsigned int freq)
struct od_dbs_tuners *od_tuners = dbs_data->tuners;
 
if (od_tuners->powersave_bias)
-   freq = powersave_bias_target(p, freq, CPUFREQ_RELATION_H);
+   freq = od_ops.powersave_bias_target(p, freq,
+   CPUFREQ_RELATION_H);
else if (p->cur == p->max)
return;
 
@@ -205,12 +208,12 @@ static void od_check_cpu(int cpu, unsigned int load_freq)
if (!od_tuners->powersave_bias) {
__cpufreq_driver_target(policy, freq_next,
CPUFREQ_RELATION_L);
-   } else {
-   int freq = powersave_bias_target(policy, freq_next,
-   CPUFREQ_RELATION_L);
-   __cpufreq_driver_target(policy, freq,
- 

Re: [PATCH V4 1/2] cpufreq: ondemand: allow custom powersave_bias_target handler to be registered

2013-04-04 Thread Jacob Shin
On Thu, Apr 04, 2013 at 09:12:25PM +0200, Borislav Petkov wrote:
> On Thu, Apr 04, 2013 at 12:18:04PM -0500, Jacob Shin wrote:
> > @@ -574,6 +577,45 @@ static struct common_dbs_data od_dbs_cdata = {
> > .exit = od_exit,
> >  };
> >  
> > +static void od_set_powersave_bias(unsigned int powersave_bias)
> > +{
> > +   struct cpufreq_policy *policy;
> > +   struct dbs_data *dbs_data;
> > +   struct od_dbs_tuners *od_tuners;
> > +   unsigned int cpu;
> > +   cpumask_t done;
> > +
> > +   cpumask_clear(&done);
> > +
> 
> get_online_cpus();
> 
> > +   for_each_online_cpu(cpu) {
> > +   if (cpumask_test_cpu(cpu, &done))
> > +   continue;
> > +
> > +   policy = per_cpu(od_cpu_dbs_info, cpu).cdbs.cur_policy;
> > +   dbs_data = policy->governor_data;
> > +   od_tuners = dbs_data->tuners;
> > +   od_tuners->powersave_bias = powersave_bias;
> > +
> > +   cpumask_or(&done, &done, policy->cpus);
> > +   }
> 
> put_online_cpus();
> 
> -- 
> Regards/Gruss,
> Boris.

Ah, okay .. here is the fixup:

>From 7236287faa1a499686c9aac1d3f3f224516a7bbf Mon Sep 17 00:00:00 2001
From: Jacob Shin 
Date: Tue, 2 Apr 2013 09:56:56 -0500
Subject: [PATCH 1/2] cpufreq: ondemand: allow custom powersave_bias_target
 handler to be registered

This allows for another [arch specific] driver to hook into existing
powersave bias function of the ondemand governor. i.e. This allows AMD
specific powersave bias function (in a separate AMD specific driver)
to aid ondemand governor's frequency transition decisions.

Signed-off-by: Jacob Shin 
---
 drivers/cpufreq/cpufreq_governor.h |4 +++
 drivers/cpufreq/cpufreq_ondemand.c |   61 +++-
 2 files changed, 57 insertions(+), 8 deletions(-)

diff --git a/drivers/cpufreq/cpufreq_governor.h 
b/drivers/cpufreq/cpufreq_governor.h
index 6593769..8ac3353 100644
--- a/drivers/cpufreq/cpufreq_governor.h
+++ b/drivers/cpufreq/cpufreq_governor.h
@@ -263,4 +263,8 @@ int cpufreq_governor_dbs(struct cpufreq_policy *policy,
struct common_dbs_data *cdata, unsigned int event);
 void gov_queue_work(struct dbs_data *dbs_data, struct cpufreq_policy *policy,
unsigned int delay, bool all_cpus);
+void od_register_powersave_bias_handler(unsigned int (*f)
+   (struct cpufreq_policy *, unsigned int, unsigned int),
+   unsigned int powersave_bias);
+void od_unregister_powersave_bias_handler(void);
 #endif /* _CPUFREQ_GOVERNOR_H */
diff --git a/drivers/cpufreq/cpufreq_ondemand.c 
b/drivers/cpufreq/cpufreq_ondemand.c
index 1471478..b0ffef9 100644
--- a/drivers/cpufreq/cpufreq_ondemand.c
+++ b/drivers/cpufreq/cpufreq_ondemand.c
@@ -24,6 +24,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "cpufreq_governor.h"
 
@@ -40,6 +41,8 @@
 
 static DEFINE_PER_CPU(struct od_cpu_dbs_info_s, od_cpu_dbs_info);
 
+static struct od_ops od_ops;
+
 #ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND
 static struct cpufreq_governor cpufreq_gov_ondemand;
 #endif
@@ -80,7 +83,7 @@ static int should_io_be_busy(void)
  * Returns the freq_hi to be used right now and will set freq_hi_jiffies,
  * freq_lo, and freq_lo_jiffies in percpu area for averaging freqs.
  */
-static unsigned int powersave_bias_target(struct cpufreq_policy *policy,
+static unsigned int generic_powersave_bias_target(struct cpufreq_policy 
*policy,
unsigned int freq_next, unsigned int relation)
 {
unsigned int freq_req, freq_reduc, freq_avg;
@@ -145,7 +148,8 @@ static void dbs_freq_increase(struct cpufreq_policy *p, 
unsigned int freq)
struct od_dbs_tuners *od_tuners = dbs_data->tuners;
 
if (od_tuners->powersave_bias)
-   freq = powersave_bias_target(p, freq, CPUFREQ_RELATION_H);
+   freq = od_ops.powersave_bias_target(p, freq,
+   CPUFREQ_RELATION_H);
else if (p->cur == p->max)
return;
 
@@ -205,12 +209,12 @@ static void od_check_cpu(int cpu, unsigned int load_freq)
if (!od_tuners->powersave_bias) {
__cpufreq_driver_target(policy, freq_next,
CPUFREQ_RELATION_L);
-   } else {
-   int freq = powersave_bias_target(policy, freq_next,
-   CPUFREQ_RELATION_L);
-   __cpufreq_driver_target(policy, freq,
-   CPUFREQ_RELATION_L);
+   return;
}
+
+   freq_next = od_ops.powersave_bias_target(policy, freq_next,
+   CPUFREQ_RELATION_L);
+   __cpufreq_driver_target(policy, freq_n

Re: [PATCH V4 2/2] cpufreq: AMD "frequency sensitivity feedback" powersave bias for ondemand governor

2013-04-04 Thread Jacob Shin
On Thu, Apr 04, 2013 at 09:23:23PM +0200, Borislav Petkov wrote:
> On Thu, Apr 04, 2013 at 11:19:04AM -0500, Jacob Shin wrote:
> > Future AMD processors, starting with Family 16h, can provide software
> > with feedback on how the workload may respond to frequency change --
> > memory-bound workloads will not benefit from higher frequency, where
> > as compute-bound workloads will. This patch enables this "frequency
> > sensitivity feedback" to aid the ondemand governor to make better
> > frequency change decisions by hooking into the powersave bias.
> > 
> > Signed-off-by: Jacob Shin 
> 
> Looks good to me.
> 
> Acked-by: Borislav Petkov 

Rafael, got acks from both Boris and Thomas. Please commit to
linux-next when you get the chance.

Thanks,

-Jacob

> 
> -- 
> Regards/Gruss,
> Boris.
> 
> Sent from a fat crate under my desk. Formatting is fine.
> --
> 



Re: [PATCH RESEND V5 0/6] perf, amd: Enable AMD family 15h northbridge counters

2013-01-24 Thread Jacob Shin
On Thu, Jan 24, 2013 at 02:31:59PM +0100, Stephane Eranian wrote:
> On Thu, Jan 10, 2013 at 8:50 PM, Jacob Shin  wrote:
> > The following patchset enables 4 additional performance counters in
> > AMD family 15h processors that count northbridge events -- such as
> > number of DRAM accesses.
> >
> In order for me to test this patch set more thoroughly it would help if you
> could also provide me a patch to add the Fam15h uncore events to libpfm4.
> In the past, Robert Richter took care of this. I hope you can fill his role 
> for
> this. So please, if you could send me the patch quickly, that would help
> the review of your patch.

Hi Stephane,

Here is the corresponding libpfm4 patch. Thank you for taking the time
to review the patchset. I hope this helps .. If we can get AMD related
perf kernel side patchsets to upstream, I will be more than happy to
support AMD related libpfm4 efforts going forward.

Thanks!

>From 47d3267dfa24b9071c76f4a22bd059b0e4032002 Mon Sep 17 00:00:00 2001
From: Jacob Shin 
Date: Thu, 24 Jan 2013 15:37:37 -0600
Subject: [PATCH 1/1] Add AMD Family 15h northbridge performance events

libpfm4 side support for the following Linux kernel patchset:
  http://lkml.org/lkml/2013/1/10/450

Reference -- BIOS and Kernel Developer Guide (BKDG) for AMD Family 15h
 Models 00h-0Fh Processors:
  http://support.amd.com/us/Processor_TechDocs/42301_15h_Mod_00h-0Fh_BKDG.pdf

Signed-off-by: Jacob Shin 
---
 lib/events/amd64_events_fam15h.h |  155 ++
 1 file changed, 155 insertions(+)

diff --git a/lib/events/amd64_events_fam15h.h b/lib/events/amd64_events_fam15h.h
index 7f654e8..0276782 100644
--- a/lib/events/amd64_events_fam15h.h
+++ b/lib/events/amd64_events_fam15h.h
@@ -752,6 +752,126 @@ static const amd64_umask_t 
amd64_fam15h_l2_prefetcher_trigger_events[]={
},
 };
 
+static const amd64_umask_t amd64_fam15h_dram_accesses[]={
+   { .uname = "DCT0_PAGE_HIT",
+ .udesc = "DCT0 Page hit",
+ .ucode = 0x1,
+   },
+   { .uname = "DCT0_PAGE_MISS",
+ .udesc = "DCT0 Page Miss",
+ .ucode = 0x2,
+   },
+   { .uname = "DCT0_PAGE_CONFLICT",
+ .udesc = "DCT0 Page Conflict",
+ .ucode = 0x4,
+   },
+   { .uname = "DCT1_PAGE_HIT",
+ .udesc = "DCT1 Page hit",
+ .ucode = 0x8,
+   },
+   { .uname = "DCT1_PAGE_MISS",
+ .udesc = "DCT1 Page Miss",
+ .ucode = 0x10,
+   },
+   { .uname = "DCT1_PAGE_CONFLICT",
+ .udesc = "DCT1 Page Conflict",
+ .ucode = 0x20,
+   },
+   { .uname  = "ALL",
+ .udesc  = "All sub-events selected",
+ .ucode  = 0x3f,
+ .uflags = AMD64_FL_NCOMBO | AMD64_FL_DFL,
+   },
+};
+
+static const amd64_umask_t 
amd64_fam15h_dram_controller_page_table_overflows[]={
+   { .uname = "DCT0_PAGE_TABLE_OVERFLOW",
+ .udesc = "DCT0 Page Table Overflow",
+ .ucode = 0x1,
+   },
+   { .uname = "DCT1_PAGE_TABLE_OVERFLOW",
+ .udesc = "DCT1 Page Table Overflow",
+ .ucode = 0x2,
+   },
+   { .uname  = "ALL",
+ .udesc  = "All sub-events selected",
+ .ucode  = 0x3,
+ .uflags = AMD64_FL_NCOMBO | AMD64_FL_DFL,
+   },
+};
+
+static const amd64_umask_t 
amd64_fam15h_memory_controller_dram_command_slots_missed[]={
+   { .uname = "DCT0_COMMAND_SLOTS_MISSED",
+ .udesc = "DCT0 Command Slots Missed (in MemClks)",
+ .ucode = 0x1,
+   },
+   { .uname = "DCT1_COMMAND_SLOTS_MISSED",
+ .udesc = "DCT1 Command Slots Missed (in MemClks)",
+ .ucode = 0x2,
+   },
+   { .uname  = "ALL",
+ .udesc  = "All sub-events selected",
+ .ucode  = 0x3,
+ .uflags = AMD64_FL_NCOMBO | AMD64_FL_DFL,
+   },
+};
+
+static const amd64_umask_t amd64_fam15h_memory_controller_turnarounds[]={
+   { .uname = "DCT0_DIMM_TURNAROUND",
+ .udesc = "DCT0 DIMM (chip select) turnaround",
+ .ucode = 0x1,
+   },
+   { .uname = "DCT0_READ_TO_WRITE_TURNAROUND",
+ .udesc = "DCT0 Read to write turnaround",
+ .ucode = 0x2,
+   },
+   { .uname = "DCT0_WRITE_TO_READ_TURNAROUND",
+ .udesc = "DCT0 Write to read turnaround",
+ .ucode = 0x4,
+   },
+   { .uname = "DCT1_DIMM_TURNAROUND",
+ .udesc = "DCT1 DIMM (chip select) turnaround",
+ .ucode = 0x8,
+   },
+   { .uname = "DCT1_READ_TO_WRITE_TURNAROUND",
+ .udesc = "DCT1 Read to write turnaround",
+ .ucode = 0x10,
+   },
+   { .uname = "DCT1_WRITE_TO_READ_TURNAROUND",
+ .udesc = "DCT1 Write to read turnaround",
+ .ucode = 0x20,
+   },
+   { .uname  = "ALL",
+ .udesc  = "All sub-events selected",
+ .ucode  = 0x3f,
+ .uflags = AMD64_FL_NCOMBO | AMD64_FL_DFL,
+   },
+};
+
+static const 

Re: [PATCH 04/35] x86: Clean up e820 add kernel range

2013-01-24 Thread Jacob Shin
On Thu, Jan 24, 2013 at 12:19:45PM -0800, Yinghai Lu wrote:
> Separate it out to another function instead.
> 
> Also add support for case when memmap=xxM$yyM is used without exactmap.
> Need to remove reserved range at first before we add E820_RAM
> range, otherwise added E820_RAM range will be ignored.
> 
> Signed-off-by: Yinghai Lu 
> Cc: Jacob Shin 

Acked-by: Jacob Shin 

Thanks,

> ---
>  arch/x86/kernel/setup.c |   36 ++--
>  1 file changed, 22 insertions(+), 14 deletions(-)
> 
> diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
> index b23362f..2242356 100644
> --- a/arch/x86/kernel/setup.c
> +++ b/arch/x86/kernel/setup.c
> @@ -702,6 +702,27 @@ static void __init trim_bios_range(void)
>   sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
>  }
>  
> +/* called before trim_bios_range() to spare extra sanitize */
> +static void __init e820_add_kernel_range(void)
> +{
> + u64 start = __pa_symbol(_text);
> + u64 size = __pa_symbol(_end) - start;
> +
> + /*
> +  * Complain if .text .data and .bss are not marked as E820_RAM and
> +  * attempt to fix it by adding the range. We may have a confused BIOS,
> +  * or the user may have used memmap=exactmap or memmap=xxM$yyM to
> +  * exclude kernel range. If we really are running on top non-RAM,
> +  * we will crash later anyways.
> +  */
> + if (e820_all_mapped(start, start + size, E820_RAM))
> + return;
> +
> + pr_warn(".text .data .bss are not marked as E820_RAM!\n");
> + e820_remove_range(start, size, E820_RAM, 0);
> + e820_add_region(start, size, E820_RAM);
> +}
> +
>  static int __init parse_reservelow(char *p)
>  {
>   unsigned long long size;
> @@ -897,20 +918,7 @@ void __init setup_arch(char **cmdline_p)
>   insert_resource(&iomem_resource, &data_resource);
>   insert_resource(&iomem_resource, &bss_resource);
>  
> - /*
> -  * Complain if .text .data and .bss are not marked as E820_RAM and
> -  * attempt to fix it by adding the range. We may have a confused BIOS,
> -  * or the user may have incorrectly supplied it via memmap=exactmap. If
> -  * we really are running on top non-RAM, we will crash later anyways.
> -  */
> - if (!e820_all_mapped(code_resource.start, __pa(__brk_limit), E820_RAM)) 
> {
> - pr_warn(".text .data .bss are not marked as E820_RAM!\n");
> -
> - e820_add_region(code_resource.start,
> - __pa(__brk_limit) - code_resource.start + 1,
> - E820_RAM);
> - }
> -
> + e820_add_kernel_range();
>   trim_bios_range();
>  #ifdef CONFIG_X86_32
>   if (ppro_with_ram_bug()) {
> -- 
> 1.7.10.4
> 
> 



Re: [perfmon2] [PATCH RESEND V5 0/6] perf, amd: Enable AMD family 15h northbridge counters

2013-01-25 Thread Jacob Shin
On Fri, Jan 25, 2013 at 10:42:57AM +0100, Stephane Eranian wrote:
> Hi Jacob,
> 
> I will apply this patch to libpfm4.
> But I have a question. Why aren't the other uncore
> events included here as well? I am talking about
> the events listed in BKDG sections 3.16.2 to 3.16.6?
> Are  those NOT supported by your kernel patchset?

Oh, you are right, they are supported. I'm not sure why I overlooked
them. I will send out a V2 that includes all of those events as well.

Sorry about that.

-Jacob

> 
> 
> On Thu, Jan 24, 2013 at 11:06 PM, Jacob Shin  wrote:
> > On Thu, Jan 24, 2013 at 02:31:59PM +0100, Stephane Eranian wrote:
> >> On Thu, Jan 10, 2013 at 8:50 PM, Jacob Shin  wrote:
> >> > The following patchset enables 4 additional performance counters in
> >> > AMD family 15h processors that count northbridge events -- such as
> >> > number of DRAM accesses.
> >> >
> >> In order for me to test this patch set more thoroughly it would help if you
> >> could also provide me a patch to add the Fam15h uncore events to libpfm4.
> >> In the past, Robert Richter took care of this. I hope you can fill his 
> >> role for
> >> this. So please, if you could send me the patch quickly, that would help
> >> the review of your patch.
> >
> > Hi Stephane,
> >
> > Here is the corresponding libpfm4 patch. Thank you for taking the time
> > to review the patchset. I hope this helps .. If we can get AMD related
> > perf kernel side patchsets to upstream, I will be more than happy to
> > support AMD related libpfm4 efforts going forward.
> >
> > Thanks!
> >
> > >From 47d3267dfa24b9071c76f4a22bd059b0e4032002 Mon Sep 17 00:00:00 2001
> > From: Jacob Shin 
> > Date: Thu, 24 Jan 2013 15:37:37 -0600
> > Subject: [PATCH 1/1] Add AMD Family 15h northbridge performance events
> >
> > libpfm4 side support for the following Linux kernel patchset:
> >   http://lkml.org/lkml/2013/1/10/450
> >
> > Reference -- BIOS and Kernel Developer Guide (BKDG) for AMD Family 15h
> >  Models 00h-0Fh Processors:
> >   
> > http://support.amd.com/us/Processor_TechDocs/42301_15h_Mod_00h-0Fh_BKDG.pdf
> >
> > Signed-off-by: Jacob Shin 
> > ---
> >  lib/events/amd64_events_fam15h.h |  155 
> > ++
> >  1 file changed, 155 insertions(+)
> >
> > diff --git a/lib/events/amd64_events_fam15h.h 
> > b/lib/events/amd64_events_fam15h.h
> > index 7f654e8..0276782 100644
> > --- a/lib/events/amd64_events_fam15h.h
> > +++ b/lib/events/amd64_events_fam15h.h
> > @@ -752,6 +752,126 @@ static const amd64_umask_t 
> > amd64_fam15h_l2_prefetcher_trigger_events[]={
> > },
> >  };
> >
> > +static const amd64_umask_t amd64_fam15h_dram_accesses[]={
> > +   { .uname = "DCT0_PAGE_HIT",
> > + .udesc = "DCT0 Page hit",
> > + .ucode = 0x1,
> > +   },
> > +   { .uname = "DCT0_PAGE_MISS",
> > + .udesc = "DCT0 Page Miss",
> > + .ucode = 0x2,
> > +   },
> > +   { .uname = "DCT0_PAGE_CONFLICT",
> > + .udesc = "DCT0 Page Conflict",
> > + .ucode = 0x4,
> > +   },
> > +   { .uname = "DCT1_PAGE_HIT",
> > + .udesc = "DCT1 Page hit",
> > + .ucode = 0x8,
> > +   },
> > +   { .uname = "DCT1_PAGE_MISS",
> > + .udesc = "DCT1 Page Miss",
> > + .ucode = 0x10,
> > +   },
> > +   { .uname = "DCT1_PAGE_CONFLICT",
> > + .udesc = "DCT1 Page Conflict",
> > + .ucode = 0x20,
> > +   },
> > +   { .uname  = "ALL",
> > + .udesc  = "All sub-events selected",
> > + .ucode  = 0x3f,
> > + .uflags = AMD64_FL_NCOMBO | AMD64_FL_DFL,
> > +   },
> > +};
> > +
> > +static const amd64_umask_t 
> > amd64_fam15h_dram_controller_page_table_overflows[]={
> > +   { .uname = "DCT0_PAGE_TABLE_OVERFLOW",
> > + .udesc = "DCT0 Page Table Overflow",
> > + .ucode = 0x1,
> > +   },
> > +   { .uname = "DCT1_PAGE_TABLE_OVERFLOW",
> > + .udesc = "DCT1 Page Table Overflow",
> > + .ucode = 0x2,
> > +   },
> > +   { .uname  = "ALL",
> > + .udesc  = "All sub-events selected",
> > + .ucode  = 0x3,
> > + .uflags = AMD64_FL_NCOMBO | AMD64_FL_DFL,
> > +   },
> > +};
> > +
> > +static const amd64_umask_t 
> > amd64_fam15h_memory_control

Re: [PATCH RESEND V5 2/6] perf, amd: Generalize northbridge constraints code for family 15h

2013-01-25 Thread Jacob Shin
On Fri, Jan 25, 2013 at 12:07:40PM +0100, Stephane Eranian wrote:
> On Thu, Jan 10, 2013 at 8:50 PM, Jacob Shin  wrote:
> > From: Robert Richter 
> >
> > Generalize northbridge constraints code for family 10h so that later
> > we can reuse the same code path with other AMD processor families that
> > have the same northbridge event constraints.
> >
> > Signed-off-by: Robert Richter 
> > Signed-off-by: Jacob Shin 
> > ---
> >  arch/x86/kernel/cpu/perf_event_amd.c |   43 
> > --
> >  1 file changed, 25 insertions(+), 18 deletions(-)
> >
> > diff --git a/arch/x86/kernel/cpu/perf_event_amd.c 
> > b/arch/x86/kernel/cpu/perf_event_amd.c
> > index e7963c7..9541fe5 100644
> > --- a/arch/x86/kernel/cpu/perf_event_amd.c
> > +++ b/arch/x86/kernel/cpu/perf_event_amd.c
> > @@ -188,20 +188,13 @@ static inline int amd_has_nb(struct cpu_hw_events 
> > *cpuc)
> > return nb && nb->nb_id != -1;
> >  }
> >
> > -static void amd_put_event_constraints(struct cpu_hw_events *cpuc,
> > - struct perf_event *event)
> > +static void __amd_put_nb_event_constraints(struct cpu_hw_events *cpuc,
> > +  struct perf_event *event)
> >  {
> > -   struct hw_perf_event *hwc = &event->hw;
> > struct amd_nb *nb = cpuc->amd_nb;
> > int i;
> >
> > /*
> > -* only care about NB events
> > -*/
> > -   if (!(amd_has_nb(cpuc) && amd_is_nb_event(hwc)))
> > -   return;
> > -
> > -   /*
> >  * need to scan whole list because event may not have
> >  * been assigned during scheduling
> >  *
> > @@ -247,12 +240,13 @@ static void amd_put_event_constraints(struct 
> > cpu_hw_events *cpuc,
> >*
> >* Given that resources are allocated (cmpxchg), they must be
> >* eventually freed for others to use. This is accomplished by
> > -  * calling amd_put_event_constraints().
> > +  * calling __amd_put_nb_event_constraints()
> >*
> >* Non NB events are not impacted by this restriction.
> >*/
> >  static struct event_constraint *
> > -amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event 
> > *event)
> > +__amd_get_nb_event_constraints(struct cpu_hw_events *cpuc, struct 
> > perf_event *event,
> > +  struct event_constraint *c)
> >  {
> > struct hw_perf_event *hwc = &event->hw;
> > struct amd_nb *nb = cpuc->amd_nb;
> > @@ -260,12 +254,6 @@ amd_get_event_constraints(struct cpu_hw_events *cpuc, 
> > struct perf_event *event)
> > int idx, new = -1;
> >
> > /*
> > -* if not NB event or no NB, then no constraints
> > -*/
> > -   if (!(amd_has_nb(cpuc) && amd_is_nb_event(hwc)))
> > -   return &unconstrained;
> > -
> > -   /*
> >  * detect if already present, if so reuse
> >  *
> >  * cannot merge with actual allocation
> > @@ -275,7 +263,7 @@ amd_get_event_constraints(struct cpu_hw_events *cpuc, 
> > struct perf_event *event)
> >  * because of successive calls to x86_schedule_events() from
> >  * hw_perf_group_sched_in() without hw_perf_enable()
> >  */
> > -   for (idx = 0; idx < x86_pmu.num_counters; idx++) {
> > +   for_each_set_bit(idx, c->idxmsk, X86_PMC_IDX_MAX) {
> 
> So here you're using   X86_PMC_IDX_MAX but in
> __amd_put_nb_event_constraints() you're using
> x86_pmu.num_counters.
> 
> There is implicit assumption in the AMD code the counters index
> namespace is contiguous. That
> means the uncore counters show up right after the core counters. On
> Fam15h, that would be NB
> counters start at index 6, on Fam10h at index 4. In that case, the
> constraint mask cannot have bits set
> beyond num_counters, so why not use that limit in
> amd_get_event_constraints()? It would significantly
> cut down on the number of iterations in the loop from 64 down to 10 on Fam15h.

Yes, you are right, I will change that in V6.

Thanks,

> 
> 
> > if (new == -1 || hwc->idx == idx)
> > /* assign free slot, prefer hwc->idx */
> > old = cmpxchg(nb->owners + idx, NULL, event);
> > @@ -391,6 +379,25 @@ static void amd_pmu_cpu_dead(int cpu)
> > }
> >
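For reference, the V6 change agreed on above would make the allocation loop
in __amd_get_nb_event_constraints() look roughly like this (a sketch based on
the discussion, not the final committed code):

        for_each_set_bit(idx, c->idxmsk, x86_pmu.num_counters) {
                if (new == -1 || hwc->idx == idx)
                        /* assign free slot, prefer hwc->idx */
                        old = cmpxchg(nb->owners + idx, NULL, event);
                else if (nb->owners[idx] == event)
                        /* event already present */
                        old = event;
                else
                        continue;

                if (old && old != event)
                        continue;

                /* reassign to this slot */
                if (new != -1)
                        cmpxchg(nb->owners + new, event, NULL);
                new = idx;

                /* already present, reuse */
                if (old == event)
                        break;
        }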

Re: [PATCH RESEND V5 4/6] perf, x86: Move MSR address offset calculation to architecture specific files

2013-01-25 Thread Jacob Shin
On Fri, Jan 25, 2013 at 12:15:37PM +0100, Stephane Eranian wrote:
> On Thu, Jan 10, 2013 at 8:50 PM, Jacob Shin  wrote:
> > Move counter index to MSR address offset calculation to architecture
> > specific files. This prepares the way for perf_event_amd to enable
> > counter addresses that are not contiguous -- for example AMD Family
> > 15h processors have 6 core performance counters starting at 0xc0010200
> > and 4 northbridge performance counters starting at 0xc0010240.
> >
> > Signed-off-by: Jacob Shin 
> > ---
> >  arch/x86/kernel/cpu/perf_event.h |   21 -
> >  arch/x86/kernel/cpu/perf_event_amd.c |   42 
> > ++
> >  2 files changed, 47 insertions(+), 16 deletions(-)
> >
> > diff --git a/arch/x86/kernel/cpu/perf_event.h 
> > b/arch/x86/kernel/cpu/perf_event.h
> > index 115c1ea..4440218 100644
> > --- a/arch/x86/kernel/cpu/perf_event.h
> > +++ b/arch/x86/kernel/cpu/perf_event.h
> > @@ -325,6 +325,7 @@ struct x86_pmu {
> > int (*schedule_events)(struct cpu_hw_events *cpuc, int 
> > n, int *assign);
> > unsignedeventsel;
> > unsignedperfctr;
> > +   int (*addr_offset)(int index, int eventsel);
> > u64 (*event_map)(int);
> > int max_events;
> > int num_counters;
> > @@ -446,28 +447,16 @@ extern u64 __read_mostly hw_cache_extra_regs
> >
> >  u64 x86_perf_event_update(struct perf_event *event);
> >
> > -static inline int x86_pmu_addr_offset(int index)
> > -{
> > -   int offset;
> > -
> > -   /* offset = X86_FEATURE_PERFCTR_CORE ? index << 1 : index */
> > -   alternative_io(ASM_NOP2,
> > -  "shll $1, %%eax",
> > -  X86_FEATURE_PERFCTR_CORE,
> > -  "=a" (offset),
> > -  "a"  (index));
> > -
> > -   return offset;
> > -}
> > -
> >  static inline unsigned int x86_pmu_config_addr(int index)
> >  {
> > -   return x86_pmu.eventsel + x86_pmu_addr_offset(index);
> > +   return x86_pmu.eventsel +
> > +   (x86_pmu.addr_offset ? x86_pmu.addr_offset(index, 1) : 
> > index);
> >  }
> >
> >  static inline unsigned int x86_pmu_event_addr(int index)
> >  {
> > -   return x86_pmu.perfctr + x86_pmu_addr_offset(index);
> > +   return x86_pmu.perfctr +
> > +   (x86_pmu.addr_offset ? x86_pmu.addr_offset(index, 0) : 
> > index);
> >  }
> Would be better to use a constant name instead of 1 and 0 to name a event_sel
> vs. a counter. It would help the reader understand what this is about
> as that may
> be useful for other processors as well.

Yes will do .. or should I use bool instead? Which would be preferred?

> 
> >  int x86_setup_perfctr(struct perf_event *event);
> > diff --git a/arch/x86/kernel/cpu/perf_event_amd.c 
> > b/arch/x86/kernel/cpu/perf_event_amd.c
> > index 0c2cc51..ef1df38 100644
> > --- a/arch/x86/kernel/cpu/perf_event_amd.c
> > +++ b/arch/x86/kernel/cpu/perf_event_amd.c
> > @@ -132,6 +132,47 @@ static u64 amd_pmu_event_map(int hw_event)
> > return amd_perfmon_event_map[hw_event];
> >  }
> >
> > +/*
> > + * Previously calculated offsets
> > + */
> > +static unsigned int event_offsets[X86_PMC_IDX_MAX] __read_mostly;
> > +static unsigned int count_offsets[X86_PMC_IDX_MAX] __read_mostly;
> > +
> > +/*
> > + * Legacy CPUs:
> > + *   4 counters starting at 0xc001 each offset by 1
> > + *
> > + * CPUs with core performance counter extensions:
> > + *   6 counters starting at 0xc0010200 each offset by 2
> > + */
> > +static inline int amd_pmu_addr_offset(int index, int eventsel)
> > +{
> > +   int offset;
> > +
> > +   if (!index)
> > +   return index;
> > +
> > +   if (eventsel)
> > +   offset = event_offsets[index];
> > +   else
> > +   offset = count_offsets[index];
> > +
> > +   if (offset)
> > +   return offset;
> > +
> > +   if (!cpu_has_perfctr_core)
> > +   offset = index;
> > +   else
> > +   offset = index << 1;
> > +
> > +   if (eventsel)
> > +   event_offsets[index] = offset;
> > +   else
> > +   count_offsets[index] = offset;
> > +
> > +   return
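For reference, the bool variant being discussed would look roughly like the
sketch below (illustration only, not the committed code; it reuses the
event_offsets/count_offsets caches from the patch, so call sites would read
as addr_offset(index, true) for an event select register and
addr_offset(index, false) for a counter register):

        static inline int amd_pmu_addr_offset(int index, bool eventsel)
        {
                int offset;

                if (!index)
                        return index;

                offset = eventsel ? event_offsets[index] : count_offsets[index];
                if (offset)
                        return offset;

                /* legacy counters: offset by 1; perfctr-core counters: by 2 */
                offset = cpu_has_perfctr_core ? index << 1 : index;

                if (eventsel)
                        event_offsets[index] = offset;
                else
                        count_offsets[index] = offset;

                return offset;
        }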

Re: [PATCH 3/5] x86: Only direct map addresses that are marked as E820_RAM

2012-08-28 Thread Jacob Shin
On Fri, Aug 24, 2012 at 09:54:04PM -0700, Yinghai Lu wrote:
> On Fri, Aug 24, 2012 at 9:24 PM, Jacob Shin  wrote:
> > On Fri, Aug 24, 2012 at 06:07:01PM -0700, Yinghai Lu wrote:
> >> On Fri, Aug 24, 2012 at 4:55 PM, Jacob Shin  wrote:
> >>
> >> looks like you could avoid add pfn_mapped[] array.
> >>
> >> pfn_range_is_mapped() should be
> >> check max_low_pfn_mapped, max_pfn_mapped with
> >> e820_all_mapped(start, end, E820_RAM).
> >
> > Hmm .. I guess that could work .. but what about EFI code that keys off of
> > EFI memory map? Does the EFI code update e820 and mark as E820_RAM whatever
> > ranges that it calls init_memory_mapping on (via efi_ioremap?)
> 
> they are converted to e820 memmap before init_memory_mapping is called.

Yinghai, looking into this further on my EFI enabled machine, there is a
memory range where:

- e820 marks it as E820_RESERVED
- EFI memory map marks it as EFI_RUNTIME_SERVICES_DATA

During EFI init, the range is added (redundantly) to e820 as E820_RESERVED,
but during efi_enter_virtual_mode, direct mappings are created for the
range with a call to efi_ioremap.

Another such region is EFI_RUNTIME_SERVICES_CODE which is marked as
ACPI NVS.

So these are not E820_RAM, but direct mapped by EFI code path .. what do
we do here? I think we should just stick with keeping the pfn_mapped[]
array .. no?

-Jacob

> 
> Thanks
> 
> Yinghai
> 



[PATCH 1/6] x86, mm: Add page_size_mask()

2012-08-29 Thread Jacob Shin
From: Yinghai Lu 

Detect whether we need to use 1G or 2M pages and store the result in
page_size_mask.

Only probe this once.

Suggested-by: Ingo Molnar 
Signed-off-by: Yinghai Lu 
---
 arch/x86/include/asm/pgtable.h |1 +
 arch/x86/kernel/setup.c|1 +
 arch/x86/mm/init.c |   66 +++-
 3 files changed, 33 insertions(+), 35 deletions(-)

diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 49afb3f..e47e4db 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -597,6 +597,7 @@ static inline int pgd_none(pgd_t pgd)
 #ifndef __ASSEMBLY__
 
 extern int direct_gbpages;
+void probe_page_size_mask(void);
 
 /* local pte updates need not use xchg for locking */
 static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep)
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index f4b9b80..d6e8c03 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -912,6 +912,7 @@ void __init setup_arch(char **cmdline_p)
setup_real_mode();
 
init_gbpages();
+   probe_page_size_mask();
 
/* max_pfn_mapped is updated here */
max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<> PUD_SHIFT;
tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
 
-   if (use_gbpages) {
+   if (page_size_mask & (1 << PG_LEVEL_1G)) {
unsigned long extra;
 
extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
@@ -54,7 +56,7 @@ static void __init find_early_table_space(struct map_range 
*mr, unsigned long en
 
tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
 
-   if (use_pse) {
+   if (page_size_mask & (1 << PG_LEVEL_2M)) {
unsigned long extra;
 
extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
@@ -90,6 +92,30 @@ static void __init find_early_table_space(struct map_range 
*mr, unsigned long en
(pgt_buf_top << PAGE_SHIFT) - 1);
 }
 
+void probe_page_size_mask(void)
+{
+#if !defined(CONFIG_DEBUG_PAGEALLOC) && !defined(CONFIG_KMEMCHECK)
+   /*
+* For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
+* This will simplify cpa(), which otherwise needs to support splitting
+* large pages into small in interrupt context, etc.
+*/
+   if (direct_gbpages)
+   page_size_mask |= 1 << PG_LEVEL_1G;
+   if (cpu_has_pse)
+   page_size_mask |= 1 << PG_LEVEL_2M;
+#endif
+
+   /* Enable PSE if available */
+   if (cpu_has_pse)
+   set_in_cr4(X86_CR4_PSE);
+
+   /* Enable PGE if available */
+   if (cpu_has_pge) {
+   set_in_cr4(X86_CR4_PGE);
+   __supported_pte_mask |= _PAGE_GLOBAL;
+   }
+}
 void __init native_pagetable_reserve(u64 start, u64 end)
 {
memblock_reserve(start, end - start);
@@ -125,45 +151,15 @@ static int __meminit save_mr(struct map_range *mr, int 
nr_range,
 unsigned long __init_refok init_memory_mapping(unsigned long start,
   unsigned long end)
 {
-   unsigned long page_size_mask = 0;
unsigned long start_pfn, end_pfn;
unsigned long ret = 0;
unsigned long pos;
-
struct map_range mr[NR_RANGE_MR];
int nr_range, i;
-   int use_pse, use_gbpages;
 
printk(KERN_INFO "init_memory_mapping: [mem %#010lx-%#010lx]\n",
   start, end - 1);
 
-#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK)
-   /*
-* For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
-* This will simplify cpa(), which otherwise needs to support splitting
-* large pages into small in interrupt context, etc.
-*/
-   use_pse = use_gbpages = 0;
-#else
-   use_pse = cpu_has_pse;
-   use_gbpages = direct_gbpages;
-#endif
-
-   /* Enable PSE if available */
-   if (cpu_has_pse)
-   set_in_cr4(X86_CR4_PSE);
-
-   /* Enable PGE if available */
-   if (cpu_has_pge) {
-   set_in_cr4(X86_CR4_PGE);
-   __supported_pte_mask |= _PAGE_GLOBAL;
-   }
-
-   if (use_gbpages)
-   page_size_mask |= 1 << PG_LEVEL_1G;
-   if (use_pse)
-   page_size_mask |= 1 << PG_LEVEL_2M;
-
memset(mr, 0, sizeof(mr));
nr_range = 0;
 
@@ -267,7 +263,7 @@ unsigned long __init_refok init_memory_mapping(unsigned 
long start,
 * nodes are discovered.
 */
if (!after_bootmem)
-   find_early_table_space(&mr[0], end, use_pse, use_gbpages);
+   find_early_table_space(&mr[0], end);
 
for (i = 0; i < nr_range; i++)
ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,
-- 
1.7.9.5



[PATCH 2/6] x86, mm: Split out split_mem_range

2012-08-29 Thread Jacob Shin
From: Yinghai Lu 

from init_memory_mapping, to make init_memory_mapping more readable.

Suggested-by: Ingo Molnar 
Signed-off-by: Yinghai Lu 
---
 arch/x86/mm/init.c |   42 ++
 1 file changed, 26 insertions(+), 16 deletions(-)

diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 838e9bc..41e615b 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -143,25 +143,13 @@ static int __meminit save_mr(struct map_range *mr, int 
nr_range,
return nr_range;
 }
 
-/*
- * Setup the direct mapping of the physical memory at PAGE_OFFSET.
- * This runs before bootmem is initialized and gets pages directly from
- * the physical memory. To access them they are temporarily mapped.
- */
-unsigned long __init_refok init_memory_mapping(unsigned long start,
-  unsigned long end)
+static int __meminit split_mem_range(struct map_range *mr, int nr_range,
+unsigned long start,
+unsigned long end)
 {
unsigned long start_pfn, end_pfn;
-   unsigned long ret = 0;
unsigned long pos;
-   struct map_range mr[NR_RANGE_MR];
-   int nr_range, i;
-
-   printk(KERN_INFO "init_memory_mapping: [mem %#010lx-%#010lx]\n",
-  start, end - 1);
-
-   memset(mr, 0, sizeof(mr));
-   nr_range = 0;
+   int i;
 
/* head if not big page alignment ? */
start_pfn = start >> PAGE_SHIFT;
@@ -255,6 +243,28 @@ unsigned long __init_refok init_memory_mapping(unsigned 
long start,
(mr[i].page_size_mask & (1

[PATCH 3/6] x86/mm: find_early_table_space based on memory ranges that are being mapped

2012-08-29 Thread Jacob Shin
Current logic finds enough space for direct mapping page tables from 0
to end. Instead, we only need to find enough space to cover mr[0].start
to mr[nr_range - 1].end -- the range that is actually being mapped by
init_memory_mapping().
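(As a rough sanity check -- not kernel code and not part of the patch -- the
per-range arithmetic can be reproduced in userspace; the 4 GiB range below is
made up, sizeof(pud_t) and sizeof(pmd_t) are taken as 8 bytes, and the small
pte term for unaligned head/tail pages is ignored:)

#include <stdio.h>

#define PAGE_SIZE (1ULL << 12)
#define PMD_SIZE  (1ULL << 21)
#define PUD_SIZE  (1ULL << 30)

static unsigned long long roundup_sz(unsigned long long x, unsigned long long to)
{
        return (x + to - 1) / to * to;
}

int main(void)
{
        /* hypothetical single map_range: 4 GiB of RAM mapped with 2M pages */
        unsigned long long range = 4ULL << 30;
        unsigned long long puds = (range + PUD_SIZE - 1) / PUD_SIZE;
        unsigned long long pmds = (range + PMD_SIZE - 1) / PMD_SIZE;
        unsigned long long tables = roundup_sz(puds * 8, PAGE_SIZE) +
                                    roundup_sz(pmds * 8, PAGE_SIZE);

        /* prints: puds=4 pmds=2048 -> ~20 KiB of page tables */
        printf("puds=%llu pmds=%llu -> ~%llu KiB of page tables\n",
               puds, pmds, tables >> 10);
        return 0;
}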

This patch also reportedly fixes suspend/resume issue reported in:

https://lkml.org/lkml/2012/8/11/83

Signed-off-by: Jacob Shin 
---
 arch/x86/mm/init.c |   62 +---
 1 file changed, 35 insertions(+), 27 deletions(-)

diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 41e615b..916b15b 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -37,40 +37,48 @@ struct map_range {
 
 static int page_size_mask;
 
-static void __init find_early_table_space(struct map_range *mr,
- unsigned long end)
+/*
+ * First calculate space needed for kernel direct mapping page tables to cover
+ * mr[0].start to mr[nr_range - 1].end, while accounting for possible 2M and 
1GB
+ * pages. Then find enough contiguous space for those page tables.
+ */
+static void __init find_early_table_space(struct map_range *mr, int nr_range)
 {
-   unsigned long puds, pmds, ptes, tables, start = 0, good_end = end;
+   int i;
+   unsigned long puds = 0, pmds = 0, ptes = 0, tables;
+   unsigned long start = 0, good_end;
phys_addr_t base;
 
-   puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
-   tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
-
-   if (page_size_mask & (1 << PG_LEVEL_1G)) {
-   unsigned long extra;
+   for (i = 0; i < nr_range; i++) {
+   unsigned long range, extra;
 
-   extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
-   pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
-   } else
-   pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
+   range = mr[i].end - mr[i].start;
+   puds += (range + PUD_SIZE - 1) >> PUD_SHIFT;
 
-   tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
+   if (mr[i].page_size_mask & (1 << PG_LEVEL_1G)) {
+   extra = range - ((range >> PUD_SHIFT) << PUD_SHIFT);
+   pmds += (extra + PMD_SIZE - 1) >> PMD_SHIFT;
+   } else {
+   pmds += (range + PMD_SIZE - 1) >> PMD_SHIFT;
+   }
 
-   if (page_size_mask & (1 << PG_LEVEL_2M)) {
-   unsigned long extra;
-
-   extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
+   if (mr[i].page_size_mask & (1 << PG_LEVEL_2M)) {
+   extra = range - ((range >> PMD_SHIFT) << PMD_SHIFT);
 #ifdef CONFIG_X86_32
-   extra += PMD_SIZE;
+   extra += PMD_SIZE;
 #endif
-   /* The first 2/4M doesn't use large pages. */
-   if (mr->start < PMD_SIZE)
-   extra += mr->end - mr->start;
-
-   ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
-   } else
-   ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
+   /* The first 2/4M doesn't use large pages. */
+   if (mr[i].start < PMD_SIZE)
+   extra += range;
+
+   ptes += (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
+   } else {
+   ptes += (range + PAGE_SIZE - 1) >> PAGE_SHIFT;
+   }
+   }
 
+   tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
+   tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
 
 #ifdef CONFIG_X86_32
@@ -88,7 +96,7 @@ static void __init find_early_table_space(struct map_range 
*mr,
pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT);
 
printk(KERN_DEBUG "kernel direct mapping tables up to %#lx @ [mem 
%#010lx-%#010lx]\n",
-   end - 1, pgt_buf_start << PAGE_SHIFT,
+   mr[nr_range - 1].end - 1, pgt_buf_start << PAGE_SHIFT,
(pgt_buf_top << PAGE_SHIFT) - 1);
 }
 
@@ -273,7 +281,7 @@ unsigned long __init_refok init_memory_mapping(unsigned 
long start,
 * nodes are discovered.
 */
if (!after_bootmem)
-   find_early_table_space(&mr[0], end);
+   find_early_table_space(mr, nr_range);
 
for (i = 0; i < nr_range; i++)
ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,
-- 
1.7.9.5




[PATCH 4/6] x86: Only direct map addresses that are marked as E820_RAM

2012-08-29 Thread Jacob Shin
Currently direct mappings are created for [ 0 to max_low_pfn<<PAGE_SHIFT )
and [ 4GB to max_pfn<<PAGE_SHIFT ), which may include regions that are not
backed by actual DRAM. This is fine for holes under 4GB which are covered
by fixed and variable range MTRRs to be UC. However, we run into trouble
on higher memory addresses which cannot be covered by MTRRs.

This patch iterates through e820 and only direct maps ranges that are
marked as E820_RAM, and keeps track of those pfn ranges. Depending on
the alignment of E820 ranges, this may possibly result in using smaller
size (i.e. 4K instead of 2M or 1G) page tables.

Signed-off-by: Jacob Shin 
---
 arch/x86/include/asm/page_types.h |9 
 arch/x86/kernel/setup.c   |  100 +++--
 arch/x86/mm/init.c|2 +
 arch/x86/mm/init_64.c |6 +--
 4 files changed, 99 insertions(+), 18 deletions(-)

diff --git a/arch/x86/include/asm/page_types.h 
b/arch/x86/include/asm/page_types.h
index e21fdd1..409047a 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -3,6 +3,7 @@
 
 #include 
 #include 
+#include 
 
 /* PAGE_SHIFT determines the page size */
 #define PAGE_SHIFT 12
@@ -40,12 +41,20 @@
 #endif /* CONFIG_X86_64 */
 
 #ifndef __ASSEMBLY__
+#include 
 
 extern int devmem_is_allowed(unsigned long pagenr);
 
 extern unsigned long max_low_pfn_mapped;
 extern unsigned long max_pfn_mapped;
 
+extern struct range pfn_mapped[E820_X_MAX];
+extern int nr_pfn_mapped;
+
+extern void add_pfn_range_mapped(unsigned long start_pfn, unsigned long 
end_pfn);
+extern bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long 
end_pfn);
+extern bool pfn_is_mapped(unsigned long pfn);
+
 static inline phys_addr_t get_max_mapped(void)
 {
return (phys_addr_t)max_pfn_mapped << PAGE_SHIFT;
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index d6e8c03..a2e392e 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -115,13 +115,47 @@
 #include 
 
 /*
- * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
- * The direct mapping extends to max_pfn_mapped, so that we can directly access
- * apertures, ACPI and other tables without having to play with fixmaps.
+ * max_low_pfn_mapped: highest direct mapped pfn under 4GB
+ * max_pfn_mapped: highest direct mapped pfn over 4GB
+ *
+ * The direct mapping only covers E820_RAM regions, so the ranges and gaps are
+ * represented by pfn_mapped
  */
 unsigned long max_low_pfn_mapped;
 unsigned long max_pfn_mapped;
 
+struct range pfn_mapped[E820_X_MAX];
+int nr_pfn_mapped;
+
+void add_pfn_range_mapped(unsigned long start_pfn, unsigned long end_pfn)
+{
+   nr_pfn_mapped = add_range_with_merge(pfn_mapped, E820_X_MAX,
+nr_pfn_mapped, start_pfn, end_pfn);
+   nr_pfn_mapped = clean_sort_range(pfn_mapped, E820_X_MAX);
+
+   max_pfn_mapped = max(max_pfn_mapped, end_pfn);
+
+   if (end_pfn <= (1UL << (32 - PAGE_SHIFT)))
+   max_low_pfn_mapped = max(max_low_pfn_mapped, end_pfn);
+}
+
+bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn)
+{
+   int i;
+
+   for (i = 0; i < nr_pfn_mapped; i++)
+   if ((start_pfn >= pfn_mapped[i].start) &&
+   (end_pfn <= pfn_mapped[i].end))
+   return true;
+
+   return false;
+}
+
+bool pfn_is_mapped(unsigned long pfn)
+{
+   return pfn_range_is_mapped(pfn, pfn + 1);
+}
+
 #ifdef CONFIG_DMI
 RESERVE_BRK(dmi_alloc, 65536);
 #endif
@@ -296,6 +330,54 @@ static void __init cleanup_highmap(void)
 }
 #endif
 
+/*
+ * Iterate through E820 memory map and create direct mappings for only E820_RAM
+ * regions. We cannot simply create direct mappings for all pfns from
+ * [0 to max_low_pfn) and [4GB to max_pfn) because of possible memory holes in
+ * high addresses that cannot be marked as UC by fixed/variable range MTRRs.
+ * Depending on the alignment of E820 ranges, this may possibly result in using
+ * smaller size (i.e. 4K instead of 2M or 1G) page tables.
+ */
+static void __init init_direct_mapping(void)
+{
+   int i;
+
+   /* the ISA range is always mapped regardless of memory holes */
+   init_memory_mapping(0, ISA_END_ADDRESS);
+
+   for (i = 0; i < e820.nr_map; i++) {
+   struct e820entry *ei = &e820.map[i];
+   u64 start = ei->addr;
+   u64 end = ei->addr + ei->size;
+
+   /* we only map E820_RAM */
+   if (ei->type != E820_RAM)
+   continue;
+
+   if (end <= ISA_END_ADDRESS)
+   continue;
+
+   if (start < ISA_END_ADDRESS)
+   start = ISA_END_ADDRESS;
+#ifdef CONFIG_X86_32
+   /* on 32 bit, we only map up to max_low_pfn */
+   if ((start >> PAGE_SHIFT) >= max_low_pfn)
+   continue;
+
+   if ((end >> PAGE_SHIFT) > max_low_pfn)
+   end = max_low_pfn << PAGE_SHIFT;
+#endif
+   init_memory_mapping(start, end);
+   }
+
+#ifdef CONFIG_X86_64
+   if (max_pfn > max_low_pfn) {
+   /* can we preseve max_low_pfn ?*/
+   max_low_pfn = max_pfn;
+   }
+#endif
+}
+
 static void __init reserve_brk(void)
 {
if (_brk_end > _brk_start)
@@ -914,18 +996,8 @@ void __init setup_arch(char **cmdline_p)
init_gbpages();
probe_page_size_mask();
 
-   /* max_pfn_mapped is updated here */
-   

[PATCH 5/6] x86: Fixup code testing if a pfn is direct mapped

2012-08-29 Thread Jacob Shin
Update code that previously assumed pfns [ 0 - max_low_pfn_mapped ) and
[ 4GB - max_pfn_mapped ) were always direct mapped, to now look up
pfn_mapped ranges instead.

Signed-off-by: Jacob Shin 
---
 arch/x86/kernel/cpu/amd.c   |6 +-
 arch/x86/platform/efi/efi.c |8 
 2 files changed, 5 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 9d92e19..554ccfc 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -677,11 +677,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
 */
if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) {
printk(KERN_DEBUG "tseg: %010llx\n", tseg);
-   if ((tseg>>PMD_SHIFT) <
-   (max_low_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) ||
-   ((tseg>>PMD_SHIFT) <
-   (max_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) &&
-   (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT
+   if (pfn_is_mapped(tseg))
set_memory_4k((unsigned long)__va(tseg), 1);
}
}
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 92660eda..f1facde 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -776,7 +776,7 @@ void __init efi_enter_virtual_mode(void)
efi_memory_desc_t *md, *prev_md = NULL;
efi_status_t status;
unsigned long size;
-   u64 end, systab, addr, npages, end_pfn;
+   u64 end, systab, addr, npages, start_pfn, end_pfn;
void *p, *va, *new_memmap = NULL;
int count = 0;
 
@@ -827,10 +827,10 @@ void __init efi_enter_virtual_mode(void)
size = md->num_pages << EFI_PAGE_SHIFT;
end = md->phys_addr + size;
 
+   start_pfn = PFN_DOWN(md->phys_addr);
end_pfn = PFN_UP(end);
-   if (end_pfn <= max_low_pfn_mapped
-   || (end_pfn > (1UL << (32 - PAGE_SHIFT))
-   && end_pfn <= max_pfn_mapped))
+
+   if (pfn_range_is_mapped(start_pfn, end_pfn))
va = __va(md->phys_addr);
else
va = efi_ioremap(md->phys_addr, size, md->type);
-- 
1.7.9.5




[PATCH V5 0/6] x86: Create direct mappings for E820_RAM only

2012-08-29 Thread Jacob Shin
This is the 5th revision of the patchset, which aims to create direct
mappings only for E820_RAM memory ranges. The problem description and
justification can be found in patch 4/6.

Previous discussion history can be found in the following threads:

* https://lkml.org/lkml/2012/8/24/474
* https://lkml.org/lkml/2012/8/22/680
* https://lkml.org/lkml/2012/8/13/512
* https://lkml.org/lkml/2012/8/9/536
* https://lkml.org/lkml/2011/10/20/323

Jacob Shin (4):
  x86/mm: find_early_table_space based on memory ranges that are being
mapped
  x86: Only direct map addresses that are marked as E820_RAM
  x86: Fixup code testing if a pfn is direct mapped
  x86: if kernel .text .data .bss are not marked as E820_RAM, complain
and fix

Yinghai Lu (2):
  x86, mm: Add page_size_mask()
  x86, mm: Split out split_mem_range

 arch/x86/include/asm/page_types.h |9 +++
 arch/x86/include/asm/pgtable.h|1 +
 arch/x86/kernel/cpu/amd.c |6 +-
 arch/x86/kernel/setup.c   |  115 ++
 arch/x86/mm/init.c|  162 -
 arch/x86/mm/init_64.c |6 +-
 arch/x86/platform/efi/efi.c   |8 +-
 7 files changed, 207 insertions(+), 100 deletions(-)

-- 
1.7.9.5




[PATCH 6/6] x86: if kernel .text .data .bss are not marked as E820_RAM, complain and fix

2012-08-29 Thread Jacob Shin
There could be cases where user supplied memmap=exactmap memory
mappings do not mark the region where the kernel .text .data and
.bss reside as E820_RAM, as reported here:

https://lkml.org/lkml/2012/8/14/86

Handle it by complaining, and adding the range back into the e820.
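For example (a made-up command line, not the one from the report above),
clearing the map with exactmap and adding back only ranges that miss the
kernel image would hit this case:

  memmap=exactmap memmap=640K@0 memmap=512M@2G

With this patch the kernel warns and adds the kernel's own range back as
E820_RAM.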

Signed-off-by: Jacob Shin 
---
 arch/x86/kernel/setup.c |   14 ++
 1 file changed, 14 insertions(+)

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index a2e392e..68f82d2 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -913,6 +913,20 @@ void __init setup_arch(char **cmdline_p)
insert_resource(&iomem_resource, &data_resource);
insert_resource(&iomem_resource, &bss_resource);
 
+   /*
+* Complain if .text .data and .bss are not marked as E820_RAM and
+* attempt to fix it by adding the range. We may have a confused BIOS,
+* or the user may have incorrectly supplied it via memmap=exactmap. If
+* we really are running on top non-RAM, we will crash later anyways.
+*/
+   if (!e820_all_mapped(code_resource.start, __pa(__brk_limit), E820_RAM)) 
{
+   pr_warn(".text .data .bss are not marked as E820_RAM!\n");
+
+   e820_add_region(code_resource.start,
+   __pa(__brk_limit) - code_resource.start + 1,
+   E820_RAM);
+   }
+
trim_bios_range();
 #ifdef CONFIG_X86_32
if (ppro_with_ram_bug()) {
-- 
1.7.9.5




Re: [PATCH 4/6] x86: Only direct map addresses that are marked as E820_RAM

2012-08-29 Thread Jacob Shin
On Wed, Aug 29, 2012 at 02:17:51PM -0700, Yinghai Lu wrote:
> On Wed, Aug 29, 2012 at 12:04 PM, Jacob Shin  wrote:
> > Currently direct mappings are created for [ 0 to max_low_pfn<<PAGE_SHIFT )
> > and [ 4GB to max_pfn<<PAGE_SHIFT ), which may include regions that are not
> > backed by actual DRAM. This is fine for holes under 4GB which are covered
> > by fixed and variable range MTRRs to be UC. However, we run into trouble
> > on higher memory addresses which cannot be covered by MTRRs.
> >
> > Our system with 1TB of RAM has an e820 that looks like this:
> >
> >  BIOS-e820: [mem 0x-0x000983ff] usable
> >  BIOS-e820: [mem 0x00098400-0x0009] reserved
> >  BIOS-e820: [mem 0x000d-0x000f] reserved
> >  BIOS-e820: [mem 0x0010-0xc7eb] usable
> >  BIOS-e820: [mem 0xc7ec-0xc7ed7fff] ACPI data
> >  BIOS-e820: [mem 0xc7ed8000-0xc7ed9fff] ACPI NVS
> >  BIOS-e820: [mem 0xc7eda000-0xc7ff] reserved
> >  BIOS-e820: [mem 0xfec0-0xfec0] reserved
> >  BIOS-e820: [mem 0xfee0-0xfee00fff] reserved
> >  BIOS-e820: [mem 0xfff0-0x] reserved
> >  BIOS-e820: [mem 0x0001-0x00e037ff] usable
> >  BIOS-e820: [mem 0x00e03800-0x00fc] reserved
> >  BIOS-e820: [mem 0x0100-0x011ffeff] usable
> >
> > and so direct mappings are created for huge memory hole between
> > 0x00e03800 to 0x0100. Even though the kernel never
> > generates memory accesses in that region, since the page tables mark
> > them incorrectly as being WB, our (AMD) processor ends up causing a MCE
> > while doing some memory bookkeeping/optimizations around that area.
> >
> > This patch iterates through e820 and only direct maps ranges that are
> > marked as E820_RAM, and keeps track of those pfn ranges. Depending on
> > the alignment of E820 ranges, this may possibly result in using smaller
> > size (i.e. 4K instead of 2M or 1G) page tables.
> >
> > Signed-off-by: Jacob Shin 
> > ---
> >  arch/x86/include/asm/page_types.h |9 
> >  arch/x86/kernel/setup.c   |  100 
> > +++--
> >  arch/x86/mm/init.c|2 +
> >  arch/x86/mm/init_64.c |6 +--
> >  4 files changed, 99 insertions(+), 18 deletions(-)
> >
> > diff --git a/arch/x86/include/asm/page_types.h 
> > b/arch/x86/include/asm/page_types.h
> > index e21fdd1..409047a 100644
> > --- a/arch/x86/include/asm/page_types.h
> > +++ b/arch/x86/include/asm/page_types.h
> > @@ -3,6 +3,7 @@
> >
> >  #include 
> >  #include 
> > +#include 
> >
> >  /* PAGE_SHIFT determines the page size */
> >  #define PAGE_SHIFT 12
> > @@ -40,12 +41,20 @@
> >  #endif /* CONFIG_X86_64 */
> >
> >  #ifndef __ASSEMBLY__
> > +#include 
> >
> >  extern int devmem_is_allowed(unsigned long pagenr);
> >
> >  extern unsigned long max_low_pfn_mapped;
> >  extern unsigned long max_pfn_mapped;
> >
> > +extern struct range pfn_mapped[E820_X_MAX];
> > +extern int nr_pfn_mapped;
> > +
> > +extern void add_pfn_range_mapped(unsigned long start_pfn, unsigned long 
> > end_pfn);
> > +extern bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long 
> > end_pfn);
> > +extern bool pfn_is_mapped(unsigned long pfn);
> > +
> >  static inline phys_addr_t get_max_mapped(void)
> >  {
> > return (phys_addr_t)max_pfn_mapped << PAGE_SHIFT;
> > diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
> > index d6e8c03..a2e392e 100644
> > --- a/arch/x86/kernel/setup.c
> > +++ b/arch/x86/kernel/setup.c
> > @@ -115,13 +115,47 @@
> >  #include 
> >
> >  /*
> > - * end_pfn only includes RAM, while max_pfn_mapped includes all e820 
> > entries.
> > - * The direct mapping extends to max_pfn_mapped, so that we can directly 
> > access
> > - * apertures, ACPI and other tables without having to play with fixmaps.
> > + * max_low_pfn_mapped: highest direct mapped pfn under 4GB
> > + * max_pfn_mapped: highest direct mapped pfn over 4GB
> > + *
> > + * The direct mapping only covers E820_RAM regions, so the ranges and gaps 
> > are
> > + * represented by pfn_mapped
> >   */
> >  unsigned long max_low_pfn_mapped;
> >  unsigned long max_pfn_mapped;
> >
> > +struct range pfn_mapped[E820_X_MAX];
> > +int nr_pfn_mapped;
> 
> change to static?

Hm .. 

Re: [PATCH 0/8] x86, mm: init_memory_mapping cleanup

2012-08-30 Thread Jacob Shin
On Thu, Aug 30, 2012 at 04:06:07PM -0700, Yinghai Lu wrote:
> Only create mapping for E820_RAM and E820_RESERVED_KERN.
> 
> Also separate find_early_page_table out from init_memory_mapping.
> 
> Jacob Shin (3):
>   x86: if kernel .text .data .bss are not marked as E820_RAM, complain
> and fix
>   x86: Fixup code testing if a pfn is direct mapped
>   x86: Only direct map addresses that are marked as E820_RAM
> 
> Yinghai Lu (5):
>   x86, mm: Add global page_size_mask
>   x86, mm: Split out split_mem_range
>   x86, mm: Moving init_memory_mapping calling
>   x86, mm: Revert back good_end setting for 64bit
>   x86, mm: Find early page table only one time
> 
>  arch/x86/include/asm/init.h   |1 -
>  arch/x86/include/asm/page_types.h |3 +
>  arch/x86/include/asm/pgtable.h|1 +
>  arch/x86/kernel/cpu/amd.c |8 +-
>  arch/x86/kernel/setup.c   |   34 ---
>  arch/x86/mm/init.c|  225 
> ++---
>  arch/x86/mm/init_64.c |6 +-
>  arch/x86/platform/efi/efi.c   |8 +-
>  8 files changed, 191 insertions(+), 95 deletions(-)
> 
> -- 
> 1.7.7
> 
> 

I'll be out of office tomorrow, and Monday is a holiday, so I'll test it
on our machines on Tuesday,

Thanks,

-Jacob



Re: [PATCH -v3 00/14] x86, mm: init_memory_mapping cleanup

2012-09-13 Thread Jacob Shin
On Wed, Sep 05, 2012 at 03:08:15PM -0500, Jacob Shin wrote:
> On Tue, Sep 04, 2012 at 10:46:17PM -0700, Yinghai Lu wrote:
> > Only create mapping for E820_RAM and E820_RESERVED_KERN.
> > 
> > Separate calculate_table_space_size and find_early_page_table out from
> > init_memory_mapping.
> > 
> > For all ranges, will allocate page table one time, but init mapping
> > only for E820 RAM and E820_RESERVED_KERN.
> > 
> > Could be found at:
> > git://git.kernel.org/pub/scm/linux/kernel/git/yinghai/linux-yinghai.git 
> > for-x86-mm
> > 
> > Thanks
> > Yinghai
> > 
> > 
> > Jacob Shin (4):
> >   x86: if kernel .text .data .bss are not marked as E820_RAM, complain and 
> > fix
> >   x86: Fixup code testing if a pfn is direct mapped
> >   x86: Only direct map addresses that are marked as E820_RAM
> >   x86/mm: calculate_table_space_size based on memory ranges that are being 
> > mapped
> > 
> > Yinghai Lu (10):
> >   x86, mm: Add global page_size_mask
> >   x86, mm: Split out split_mem_range
> >   x86, mm: Moving init_memory_mapping calling
> >   x86, mm: Revert back good_end setting for 64bit
> >   x86, mm: Find early page table only one time
> >   x86, mm: Separate out calculate_table_space_size()
> >   x86, mm: Move down two calculate_table_space_size down.
> >   x86, mm: set memblock initial limit to 1M
> >   x86, mm: Use func pointer to table size calculation and mapping
> >   x86, mm: Map ISA area with connected ram range at the same time
> > 
> >  arch/x86/include/asm/init.h   |1 -
> >  arch/x86/include/asm/page_types.h |2 +
> >  arch/x86/include/asm/pgtable.h|1 +
> >  arch/x86/kernel/cpu/amd.c |8 +-
> >  arch/x86/kernel/setup.c   |   36 +++--
> >  arch/x86/mm/init.c|  357 
> > +
> >  arch/x86/mm/init_64.c |6 +-
> >  arch/x86/platform/efi/efi.c   |8 +-
> >  8 files changed, 280 insertions(+), 139 deletions(-)
> > 
> > -- 
> > 1.7.7
> > 
> > 
> 
> Tested -v3 on our (AMD) machines and everything looks good.

Hi, hpa, wondering if this version finally looks okay to you for 3.7 ?

Thanks,

-Jacob



Re: [PATCH 04/13] x86, mm: Revert back good_end setting for 64bit

2012-10-03 Thread Jacob Shin
On Mon, Oct 01, 2012 at 12:00:26PM +0100, Stefano Stabellini wrote:
> On Sun, 30 Sep 2012, Yinghai Lu wrote:
> > After
> > 
> > | commit 8548c84da2f47e71bbbe300f55edb768492575f7
> > | Author: Takashi Iwai 
> > | Date:   Sun Oct 23 23:19:12 2011 +0200
> > |
> > |x86: Fix S4 regression
> > |
> > |Commit 4b239f458 ("x86-64, mm: Put early page table high") causes a S4
> > |regression since 2.6.39, namely the machine reboots occasionally at S4
> > |resume.  It doesn't happen always, overall rate is about 1/20.  But,
> > |like other bugs, once when this happens, it continues to happen.
> > |
> > |This patch fixes the problem by essentially reverting the memory
> > |assignment in the older way.
> > 
> > Have some page table around 512M again, that will prevent kdump to find 512M
> > under 768M.
> > 
> > We need revert that reverting, so we could put page table high again for 
> > 64bit.
> > 
> > Takashi agreed that S4 regression could be something else.
> > 
> > https://lkml.org/lkml/2012/6/15/182
> > 
> > Signed-off-by: Yinghai Lu 
> > ---
> >  arch/x86/mm/init.c |2 +-
> >  1 files changed, 1 insertions(+), 1 deletions(-)
> > 
> > diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
> > index 9f69180..aadb154 100644
> > --- a/arch/x86/mm/init.c
> > +++ b/arch/x86/mm/init.c
> > @@ -76,8 +76,8 @@ static void __init find_early_table_space(struct 
> > map_range *mr,
> >  #ifdef CONFIG_X86_32
> > /* for fixmap */
> > tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE);
> > -#endif
> > good_end = max_pfn_mapped << PAGE_SHIFT;
> > +#endif
> >  
> > base = memblock_find_in_range(start, good_end, tables, PAGE_SIZE);
> > if (!base)
> 
> Isn't this going to cause init_memory_mapping to allocate pagetable
> pages from memory not yet mapped?
> Last time I spoke with HPA and Thomas about this, they seem to agree
> that it isn't a very good idea.
> Also, it is proven to cause a certain amount of headaches on Xen,
> see commit d8aa5ec3382e6a545b8f25178d1e0992d4927f19.
> 

Any comments, thoughts? hpa? Yinghai?

So it seems that during init_memory_mapping Xen needs to modify page table 
bits and the memory where the page tables live needs to be direct mapped at
that time.

Since we now call init_memory_mapping for every E820_RAM range sequentially,
the only way to satisfy Xen is to do the find_early_page_table_space step
(good_end needs to be within memory already mapped at the time) for every
init_memory_mapping call.

What do you think Yinghai?
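In rough pseudo-code (hypothetical helper signature, just to illustrate the
idea above):

        int i;
        u64 good_end;

        for (i = 0; i < e820.nr_map; i++) {
                struct e820entry *ei = &e820.map[i];
                u64 start = ei->addr, end = ei->addr + ei->size;

                if (ei->type != E820_RAM)
                        continue;

                /* only allocate page tables from memory already mapped */
                good_end = max_pfn_mapped << PAGE_SHIFT;
                find_early_page_table_space(start, end, good_end); /* hypothetical */
                init_memory_mapping(start, end);
        }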



Re: BUG: 1bbbbe7 (x86: Exclude E820_RESERVED regions...) PANIC on boot

2012-10-24 Thread Jacob Shin
On Mon, Oct 22, 2012 at 04:35:18PM -0700, Yinghai Lu wrote:
> On Mon, Oct 22, 2012 at 2:27 PM, H. Peter Anvin  wrote:
> >>
> >> We never know that bios guys will not let bios produce crazy e820 map.
> >>
> >
> > Yeah, well, that just *will* happen... that's a given.
> >
> > We can trim those ranges, though.  Who cares if we lose some RAM.
> >
> 
> please check attached two patches that handle partial pages for 3.7.
> 
> and you still need patch in
>https://lkml.org/lkml/2012/8/24/469
> 
> to address early page table size calculation problem for Tom Rini

Acked-by: Jacob Shin 

hpa, we need this patch: https://lkml.org/lkml/2012/8/24/469 and the above
2 from Yinghai to handle corner case E820 layouts.

I got an email from Greg KH that 1bbbbe779aabe1f0768c2bf8f8c0a5583679b54a is
queued for stable, so these need to go to stable as well.

Thanks,

-Jacob

> 
> Thanks
> 
> Yinghai






Re: BUG: 1bbbbe7 (x86: Exclude E820_RESERVED regions...) PANIC on boot

2012-10-24 Thread Jacob Shin
On Wed, Oct 24, 2012 at 11:53:16AM -0700, H. Peter Anvin wrote:
> On 10/24/2012 09:48 AM, Jacob Shin wrote:
> > 
> > hpa, we need this patch: https://lkml.org/lkml/2012/8/24/469 and the above
> > 2 from Yinghai to handle corner case E820 layouts.
> > 
> 
> I can apply Yinghai's patches, but the above patch no longer applies.
> Could you refresh it on top of tip:x86/u, please?

Sorry about that, it applied to Linus's 3.7-rc2 so I just assumed .. :-(

>From 7d2a67f6b435ede202bdf5d1982f9b5af90cce34 Mon Sep 17 00:00:00 2001
From: Jacob Shin 
Date: Wed, 24 Oct 2012 14:24:44 -0500
Subject: [PATCH] x86/mm: find_early_table_space based on ranges that are
 actually being mapped

Current logic finds enough space for direct mapping page tables from 0
to end. Instead, we only need to find enough space to cover mr[0].start
to mr[nr_range - 1].end -- the range that is actually being mapped by
init_memory_mapping().

This is needed after 1bbbbe779aabe1f0768c2bf8f8c0a5583679b54a, to address
the panic reported here:

  https://lkml.org/lkml/2012/10/20/160
  https://lkml.org/lkml/2012/10/21/157

Signed-off-by: Jacob Shin 
Tested-by: Tom Rini 

---
 arch/x86/mm/init.c |   70 ++--
 1 file changed, 41 insertions(+), 29 deletions(-)

diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 8653b3a..bc287d6 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -29,36 +29,54 @@ int direct_gbpages
 #endif
 ;
 
-static void __init find_early_table_space(unsigned long end, int use_pse,
- int use_gbpages)
+struct map_range {
+   unsigned long start;
+   unsigned long end;
+   unsigned page_size_mask;
+};
+
+/*
+ * First calculate space needed for kernel direct mapping page tables to cover
+ * mr[0].start to mr[nr_range - 1].end, while accounting for possible 2M and 
1GB
+ * pages. Then find enough contiguous space for those page tables.
+ */
+static void __init find_early_table_space(struct map_range *mr, int nr_range)
 {
-   unsigned long puds, pmds, ptes, tables, start = 0, good_end = end;
+   int i;
+   unsigned long puds = 0, pmds = 0, ptes = 0, tables;
+   unsigned long start = 0, good_end;
phys_addr_t base;
 
-   puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
-   tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
+   for (i = 0; i < nr_range; i++) {
+   unsigned long range, extra;
 
-   if (use_gbpages) {
-   unsigned long extra;
+   range = mr[i].end - mr[i].start;
+   puds += (range + PUD_SIZE - 1) >> PUD_SHIFT;
 
-   extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
-   pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
-   } else
-   pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
-
-   tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
+   if (mr[i].page_size_mask & (1 << PG_LEVEL_1G)) {
+   extra = range - ((range >> PUD_SHIFT) << PUD_SHIFT);
+   pmds += (extra + PMD_SIZE - 1) >> PMD_SHIFT;
+   } else {
+   pmds += (range + PMD_SIZE - 1) >> PMD_SHIFT;
+   }
 
-   if (use_pse) {
-   unsigned long extra;
-
-   extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
+   if (mr[i].page_size_mask & (1 << PG_LEVEL_2M)) {
+   extra = range - ((range >> PMD_SHIFT) << PMD_SHIFT);
 #ifdef CONFIG_X86_32
-   extra += PMD_SIZE;
+   extra += PMD_SIZE;
 #endif
-   ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
-   } else
-   ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
+   /* The first 2/4M doesn't use large pages. */
+   if (mr[i].start < PMD_SIZE)
+   extra += range;
+
+   ptes += (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
+   } else {
+   ptes += (range + PAGE_SIZE - 1) >> PAGE_SHIFT;
+   }
+   }
 
+   tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
+   tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
 
 #ifdef CONFIG_X86_32
@@ -76,7 +94,7 @@ static void __init find_early_table_space(unsigned long end, 
int use_pse,
pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT);
 
printk(KERN_DEBUG "kernel direct mapping tables up to %#lx @ [mem 
%#010lx-%#010lx]\n",
-   end - 1, pgt_buf_start << PAGE_SHIFT,
+   mr[nr_range - 1].end - 1, pgt_buf_start << PAGE_SHIFT,
(pgt_buf_top << PAGE_SHIFT) - 1);
 }
 
@@ -85,12 +103,6 @@ void __init native_pagetable_re

Re: [tip:x86/urgent] x86, mm: Find_early_table_space based on ranges that are actually being mapped

2012-10-25 Thread Jacob Shin
On Thu, Oct 25, 2012 at 07:33:32AM -0700, Yinghai Lu wrote:
> On Thu, Oct 25, 2012 at 12:55 AM, Ingo Molnar  wrote:
> >> > -   ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
> >> > +   /* The first 2/4M doesn't use large pages. */
> >> > +   if (mr[i].start < PMD_SIZE)
> >> > +   extra += range;
> >>
> >> those three lines should be added back.
> 
> missed "not" ...
> 
> >>
> >> it just get reverted in 7b16bbf9
> >
> > Could you please send a delta patch against tip:x86/urgent?
> 
> please check attached one.

Acked-by: Jacob Shin 

Sorry about that, I just retrofitted the patch and didn't notice that those
lines had been reverted out.

Thanks!

> 
> Thanks
> 
> Yinghai





Re: Regression from 3.4.9 to 3.4.16 "stable" kernel

2012-10-29 Thread Jacob Shin
On Mon, Oct 29, 2012 at 02:40:58PM +, Ben Hutchings wrote:
> On Mon, 2012-10-29 at 10:22 -0400, Mark Lord wrote:
> > On 12-10-29 02:46 AM, Willy Tarreau wrote:
> > > On Mon, Oct 29, 2012 at 12:03:55AM -0400, Mark Lord wrote:
> > >> My server here runs the 3.4.xx series of "stable" kernels.
> > >> Until today, it was running 3.4.9.
> > >> Today I tried to upgrade it to 3.4.16.
> > >> It hangs in setup.c.
> > >>
> > >> I've isolated the fault down to this specific change
> > >> that was made between 3.4.9 and 3.4.16.
> > >> Reverting this change allows the system to boot/run normally again.
> > >>
> > >>
> > >> --- linux-3.4.9/arch/x86/kernel/setup.c  2012-08-15 11:17:17.0 
> > >> -0400
> > >> +++ linux-3.4.16/arch/x86/kernel/setup.c 2012-10-28 13:36:33.0 
> > >> -0400
> > >> @@ -927,8 +927,21 @@
> > >>
> > >>  #ifdef CONFIG_X86_64
> > >>  if (max_pfn > max_low_pfn) {
> > >> -max_pfn_mapped = init_memory_mapping(1UL<<32,
> > >> - 
> > >> max_pfn<<PAGE_SHIFT);
> > >> +int i;
> > >> +for (i = 0; i < e820.nr_map; i++) {
> > >> +struct e820entry *ei = &e820.map[i];
> > >> +
> > >> +if (ei->addr + ei->size <= 1UL << 32)
> > >> +continue;
> > >> +
> > >> +if (ei->type == E820_RESERVED)
> > >> +continue;
> > >> +
> > >> +max_pfn_mapped = init_memory_mapping(
> > >> +ei->addr < 1UL << 32 ? 1UL << 32 : 
> > >> ei->addr,
> > >> +ei->addr + ei->size);
> > >> +}
> > >> +
> > >>  /* can we preseve max_low_pfn ?*/
> > >>  max_low_pfn = max_pfn;
> > >>  }
> > > 
> > > For the record, it is this commit introduced in 3.4.16 :
> > > 
> > > commit efd5fa0c1a1d1b46846ea6e8d1a783d0d8a6a721
> > > Author: Jacob Shin 
> > > Date:   Thu Oct 20 16:15:26 2011 -0500
> > > 
> > > x86: Exclude E820_RESERVED regions and memory holes above 4 GB from 
> > > direct mapping.
> > > 
> > > commit 1e779aabe1f0768c2bf8f8c0a5583679b54a upstream.
> > > 
> > > On systems with very large memory (1 TB in our case), BIOS may report 
> > > a
> > > reserved region or a hole in the E820 map, even above the 4 GB range. 
> > > Exclude
> > > these from the direct mapping.
> > > 
> > > [ hpa: this should be done not just for > 4 GB but for everything 
> > > above the legacy
> > >   region (1 MB), at the very least.  That, however, turns out to 
> > > require significant
> > >   restructuring.  That work is well underway, but is not suitable for 
> > > rc/stable. ]
> > > 
> > > Signed-off-by: Jacob Shin 
> > > Link: 
> > > http://lkml.kernel.org/r/1319145326-13902-1-git-send-email-jacob.s...@amd.com
> > > Signed-off-by: H. Peter Anvin 
> > > Signed-off-by: Greg Kroah-Hartman 
> > > 
> > > Willy
> > 
> > 
> > Thanks, Willy.
> > 
> > I've also now downloaded linux-3.7.0-rc3, and it boots/runs without need 
> > for patching.
> > So there's a fix somewhere in between that perhaps could also get 
> > backported to -stable.
> 
> Might well be:
> 
> commit 1f2ff682ac951ed82cc043cf140d2851084512df
> Author: Yinghai Lu 
> Date:   Mon Oct 22 16:35:18 2012 -0700
> 
> x86, mm: Use memblock memory loop instead of e820_RAM
> 
> However I'm not sure that this loop is correct either.  Yinghai, does
> your version definitely iterate in increasing pfn order?  If not then
> the max_pfn_mapped assignment must be conditional.

Hi, I believe these two commits in mainline should fix Alexander's failing
machine:

844ab6f993b1d32eb40512503d35ff6ad0c57030
f82f64dd9f485e13f29f369772d4a0e868e5633a

This thread has some more details:

https://lkml.org/lkml/2012/10/21/157

Sorry, and thanks!

> 
> Ben.
> 
> -- 
> Ben Hutchings
> Humans are not rational beings; they are rationalising beings.





Re: Regression from 3.4.9 to 3.4.16 "stable" kernel

2012-10-29 Thread Jacob Shin
On Mon, Oct 29, 2012 at 09:58:23AM -0700, Greg Kroah-Hartman wrote:
> On Mon, Oct 29, 2012 at 09:47:22AM -0500, Jacob Shin wrote:
> > On Mon, Oct 29, 2012 at 02:40:58PM +, Ben Hutchings wrote:
> > > On Mon, 2012-10-29 at 10:22 -0400, Mark Lord wrote:
> > > > On 12-10-29 02:46 AM, Willy Tarreau wrote:
> > > > > On Mon, Oct 29, 2012 at 12:03:55AM -0400, Mark Lord wrote:
> > > > >> My server here runs the 3.4.xx series of "stable" kernels.
> > > > >> Until today, it was running 3.4.9.
> > > > >> Today I tried to upgrade it to 3.4.16.
> > > > >> It hangs in setup.c.
> > > > >>
> > > > >> I've isolated the fault down to this specific change
> > > > >> that was made between 3.4.9 and 3.4.16.
> > > > >> Reverting this change allows the system to boot/run normally again.
> > > > >>
> > > > >>
> > > > >> --- linux-3.4.9/arch/x86/kernel/setup.c  2012-08-15 
> > > > >> 11:17:17.0 -0400
> > > > >> +++ linux-3.4.16/arch/x86/kernel/setup.c 2012-10-28 
> > > > >> 13:36:33.0 -0400
> > > > >> @@ -927,8 +927,21 @@
> > > > >>
> > > > >>  #ifdef CONFIG_X86_64
> > > > >>  if (max_pfn > max_low_pfn) {
> > > > >> -max_pfn_mapped = init_memory_mapping(1UL<<32,
> > > > >> - 
> > > > >> max_pfn<<PAGE_SHIFT);
> > > > >> +int i;
> > > > >> +for (i = 0; i < e820.nr_map; i++) {
> > > > >> +struct e820entry *ei = &e820.map[i];
> > > > >> +
> > > > >> +if (ei->addr + ei->size <= 1UL << 32)
> > > > >> +continue;
> > > > >> +
> > > > >> +if (ei->type == E820_RESERVED)
> > > > >> +continue;
> > > > >> +
> > > > >> +max_pfn_mapped = init_memory_mapping(
> > > > >> +ei->addr < 1UL << 32 ? 1UL << 32 : 
> > > > >> ei->addr,
> > > > >> +ei->addr + ei->size);
> > > > >> +}
> > > > >> +
> > > > >>  /* can we preseve max_low_pfn ?*/
> > > > >>  max_low_pfn = max_pfn;
> > > > >>  }
> > > > > 
> > > > > For the record, it is this commit introduced in 3.4.16 :
> > > > > 
> > > > > commit efd5fa0c1a1d1b46846ea6e8d1a783d0d8a6a721
> > > > > Author: Jacob Shin 
> > > > > Date:   Thu Oct 20 16:15:26 2011 -0500
> > > > > 
> > > > > x86: Exclude E820_RESERVED regions and memory holes above 4 GB 
> > > > > from direct mapping.
> > > > > 
> > > > > commit 1e779aabe1f0768c2bf8f8c0a5583679b54a upstream.
> > > > > 
> > > > > On systems with very large memory (1 TB in our case), BIOS may 
> > > > > report a
> > > > > reserved region or a hole in the E820 map, even above the 4 GB 
> > > > > range. Exclude
> > > > > these from the direct mapping.
> > > > > 
> > > > > [ hpa: this should be done not just for > 4 GB but for everything 
> > > > > above the legacy
> > > > >   region (1 MB), at the very least.  That, however, turns out to 
> > > > > require significant
> > > > >   restructuring.  That work is well underway, but is not suitable 
> > > > > for rc/stable. ]
> > > > > 
> > > > > Signed-off-by: Jacob Shin 
> > > > > Link: 
> > > > > http://lkml.kernel.org/r/1319145326-13902-1-git-send-email-jacob.s...@amd.com
> > > > > Signed-off-by: H. Peter Anvin 
> > > > > Signed-off-by: Greg Kroah-Hartman 
> > > > > 
> > > > > Willy
> > > > 
> > > > 
> > > > Thanks, Willy.
> > > > 
> > > > I've also now downloaded linux-3.7.0-rc3, and it boots/runs without 
> > > > need for patching.
> > > > 

Re: [PATCH -v3 00/14] x86, mm: init_memory_mapping cleanup

2012-09-05 Thread Jacob Shin
On Tue, Sep 04, 2012 at 10:46:17PM -0700, Yinghai Lu wrote:
> Only create mapping for E820_820 and E820_RESERVED_KERN.
> 
> Seperate calculate_table_space_size and find_early_page_table out with
> init_memory_mapping.
> 
> For all ranges, will allocate page table one time, but init mapping
> only for E820 RAM and E820_RESERVED_KERN.
> 
> Could be found at:
> git://git.kernel.org/pub/scm/linux/kernel/git/yinghai/linux-yinghai.git 
> for-x86-mm
> 
> Thanks
> Yinghai
> 
> 
> Jacob Shin (4):
>   x86: if kernel .text .data .bss are not marked as E820_RAM, complain and fix
>   x86: Fixup code testing if a pfn is direct mapped
>   x86: Only direct map addresses that are marked as E820_RAM
>   x86/mm: calculate_table_space_size based on memory ranges that are being 
> mapped
> 
> Yinghai Lu (10):
>   x86, mm: Add global page_size_mask
>   x86, mm: Split out split_mem_range
>   x86, mm: Moving init_memory_mapping calling
>   x86, mm: Revert back good_end setting for 64bit
>   x86, mm: Find early page table only one time
>   x86, mm: Separate out calculate_table_space_size()
>   x86, mm: Move down two calculate_table_space_size down.
>   x86, mm: set memblock initial limit to 1M
>   x86, mm: Use func pointer to table size calculation and mapping
>   x86, mm: Map ISA area with connected ram range at the same time
> 
>  arch/x86/include/asm/init.h   |1 -
>  arch/x86/include/asm/page_types.h |2 +
>  arch/x86/include/asm/pgtable.h|1 +
>  arch/x86/kernel/cpu/amd.c |8 +-
>  arch/x86/kernel/setup.c   |   36 +++--
>  arch/x86/mm/init.c|  357 
> +
>  arch/x86/mm/init_64.c |6 +-
>  arch/x86/platform/efi/efi.c   |8 +-
>  8 files changed, 280 insertions(+), 139 deletions(-)
> 
> -- 
> 1.7.7
> 
> 

Tested -v3 on our (AMD) machines and everything looks good.

Thanks,

-Jacob



Re: [PATCH -v3 14/14] x86, mm: Map ISA area with connected ram range at the same time

2012-09-06 Thread Jacob Shin
On Thu, Sep 06, 2012 at 10:22:19AM +0300, Pekka Enberg wrote:
> On Wed, Sep 5, 2012 at 1:02 AM, Pekka Enberg  wrote:
> > > How significant is the speed gain? The "isa_done" flag makes code flow
> > > more difficult to follow.
> 
> On Wed, 5 Sep 2012, Yinghai Lu wrote:
> > Not really much.
> > 
> > when booting system:
> > memmap=16m$128m memmap=16m$512m memmap=16m$256m memmap=16m$768m 
> > memmap=16m$1024m
> > 
> > with the patch
> > [0.00] init_memory_mapping: [mem 0x-0x07ff]
> > [0.00]  [mem 0x-0x07ff] page 2M
> > [0.00] init_memory_mapping: [mem 0x0900-0x0fff]
> > [0.00]  [mem 0x0900-0x0fff] page 2M
> > [0.00] init_memory_mapping: [mem 0x1100-0x1fff]
> > [0.00]  [mem 0x1100-0x1fff] page 2M
> > [0.00] init_memory_mapping: [mem 0x2100-0x2fff]
> > [0.00]  [mem 0x2100-0x2fff] page 2M
> > [0.00] init_memory_mapping: [mem 0x3100-0x3fff]
> > [0.00]  [mem 0x3100-0x3fff] page 2M
> > [0.00] init_memory_mapping: [mem 0x4100-0x7fffdfff]
> > [0.00]  [mem 0x4100-0x7fdf] page 2M
> > [0.00]  [mem 0x7fe0-0x7fffdfff] page 4k
> > 
> > otherwise will have
> > 
> > [0.00] init_memory_mapping: [mem 0x-0x000f]
> > [0.00]  [mem 0x-0x000f] page 4k
> > [0.00] init_memory_mapping: [mem 0x0010-0x07ff]
> > [0.00]  [mem 0x0010-0x001f] page 4k
> > [0.00]  [mem 0x0020-0x07ff] page 2M
> > [0.00] init_memory_mapping: [mem 0x0900-0x0fff]
> > [0.00]  [mem 0x0900-0x0fff] page 2M
> > [0.00] init_memory_mapping: [mem 0x1100-0x1fff]
> > [0.00]  [mem 0x1100-0x1fff] page 2M
> > [0.00] init_memory_mapping: [mem 0x2100-0x2fff]
> > [0.00]  [mem 0x2100-0x2fff] page 2M
> > [0.00] init_memory_mapping: [mem 0x3100-0x3fff]
> > [0.00]  [mem 0x3100-0x3fff] page 2M
> > [0.00] init_memory_mapping: [mem 0x4100-0x7fffdfff]
> > [0.00]  [mem 0x4100-0x7fdf] page 2M
> > [0.00]  [mem 0x7fe0-0x7fffdfff] page 4k
> 
> OK. Is there any other reason than performance to do this?

May be minor, but ..

The first range [mem 0x00000000-0x07ffffff] is covered entirely by 2M
page tables, instead of some 4K + some 2M.

-Jacob

> 
>   Pekka
> 



[PATCH V4 0/6] perf, amd: Enable AMD family 15h northbridge counters

2012-12-05 Thread Jacob Shin
The following patchset enables 4 additional performance counters in
AMD family 15h processors that count northbridge events -- such as
number of DRAM accesses.

This patchset is based on previous work done by Robert Richter
 :

https://lkml.org/lkml/2012/6/19/324

The main differences are:

* The northbridge counters are indexed contiguously right above the
  core performance counters.

* MSR address offset calculations are moved to architecture specific
  files (a sketch of the resulting counter-to-MSR mapping follows below).

* Interrupts are set up to be delivered only to a single core.
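
For illustration, here is a minimal standalone sketch of the resulting
contiguous-index-to-MSR mapping (the helper below is made up for the example
and is not the kernel code; the MSR bases are the ones referenced by this
patchset):

#include <stdio.h>

/* MSR bases referenced by this patchset */
#define MSR_K7_EVNTSEL0      0xc0010000 /* legacy: 4 counters, offset by 1 */
#define MSR_F15H_PERF_CTL    0xc0010200 /* core perfctr ext: 6 counters, offset by 2 */
#define MSR_F15H_NB_PERF_CTL 0xc0010240 /* NB perfctr ext: 4 counters, offset by 2 */

/*
 * Illustrative only: on family 15h the NB counters are indexed right
 * above the 6 core counters, so indexes 0..5 are core counters and
 * 6..9 are northbridge counters.
 */
static unsigned int eventsel_msr(int index, int has_perfctr_core, int has_perfctr_nb)
{
	if (!has_perfctr_core)
		return MSR_K7_EVNTSEL0 + index;
	if (has_perfctr_nb && index >= 6)
		return MSR_F15H_NB_PERF_CTL + ((index - 6) << 1);
	return MSR_F15H_PERF_CTL + (index << 1);
}

int main(void)
{
	int i;

	for (i = 0; i < 10; i++)
		printf("counter %d -> event select MSR 0x%x\n", i, eventsel_msr(i, 1, 1));
	return 0;
}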

V4:
* Moved interrupt core select set up back to event constraints
  function, since during ->hw_config time we do not yet know which
  CPU the event will run on.
* Tested on and made minor revisions to make sure that the patchset is
  compatible with upcoming AMD Family 16h processors, and will support
  core and NB counters without any further patches.

V3:
Addressed the following feedback/comments from Robert's review
* https://lkml.org/lkml/2012/11/16/484
* https://lkml.org/lkml/2012/11/26/162

V2:
Separate out Robert's patches, and add properly ordered certificates of
origin.

Jacob Shin (4):
  perf, amd: Use proper naming scheme for AMD bit field definitions
  perf, x86: Move MSR address offset calculation to architecture
specific files
  perf, x86: Allow for architecture specific RDPMC indexes
  perf, amd: Enable northbridge performance counters on AMD family 15h

Robert Richter (2):
  perf, amd: Rework northbridge event constraints handler
  perf, amd: Generalize northbridge constraints code for family 15h

 arch/x86/include/asm/cpufeature.h|2 +
 arch/x86/include/asm/msr-index.h |2 +
 arch/x86/include/asm/perf_event.h|   13 +-
 arch/x86/kernel/cpu/perf_event.c |2 +-
 arch/x86/kernel/cpu/perf_event.h |   25 ++-
 arch/x86/kernel/cpu/perf_event_amd.c |  318 ++
 6 files changed, 268 insertions(+), 94 deletions(-)

-- 
1.7.9.5




[PATCH 2/6] perf, amd: Generalize northbridge constraints code for family 15h

2012-12-05 Thread Jacob Shin
From: Robert Richter 

Generalize northbridge constraints code for family 10h so that later
we can reuse the same code path with other AMD processor families that
have the same northbridge event constraints.

Signed-off-by: Robert Richter 
Signed-off-by: Jacob Shin 
---
 arch/x86/kernel/cpu/perf_event_amd.c |   43 --
 1 file changed, 25 insertions(+), 18 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_amd.c 
b/arch/x86/kernel/cpu/perf_event_amd.c
index e7963c7..9541fe5 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -188,20 +188,13 @@ static inline int amd_has_nb(struct cpu_hw_events *cpuc)
return nb && nb->nb_id != -1;
 }
 
-static void amd_put_event_constraints(struct cpu_hw_events *cpuc,
- struct perf_event *event)
+static void __amd_put_nb_event_constraints(struct cpu_hw_events *cpuc,
+  struct perf_event *event)
 {
-   struct hw_perf_event *hwc = &event->hw;
struct amd_nb *nb = cpuc->amd_nb;
int i;
 
/*
-* only care about NB events
-*/
-   if (!(amd_has_nb(cpuc) && amd_is_nb_event(hwc)))
-   return;
-
-   /*
 * need to scan whole list because event may not have
 * been assigned during scheduling
 *
@@ -247,12 +240,13 @@ static void amd_put_event_constraints(struct 
cpu_hw_events *cpuc,
   *
   * Given that resources are allocated (cmpxchg), they must be
   * eventually freed for others to use. This is accomplished by
-  * calling amd_put_event_constraints().
+  * calling __amd_put_nb_event_constraints()
   *
   * Non NB events are not impacted by this restriction.
   */
 static struct event_constraint *
-amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
+__amd_get_nb_event_constraints(struct cpu_hw_events *cpuc, struct perf_event 
*event,
+  struct event_constraint *c)
 {
struct hw_perf_event *hwc = &event->hw;
struct amd_nb *nb = cpuc->amd_nb;
@@ -260,12 +254,6 @@ amd_get_event_constraints(struct cpu_hw_events *cpuc, 
struct perf_event *event)
int idx, new = -1;
 
/*
-* if not NB event or no NB, then no constraints
-*/
-   if (!(amd_has_nb(cpuc) && amd_is_nb_event(hwc)))
-   return &unconstrained;
-
-   /*
 * detect if already present, if so reuse
 *
 * cannot merge with actual allocation
@@ -275,7 +263,7 @@ amd_get_event_constraints(struct cpu_hw_events *cpuc, 
struct perf_event *event)
 * because of successive calls to x86_schedule_events() from
 * hw_perf_group_sched_in() without hw_perf_enable()
 */
-   for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+   for_each_set_bit(idx, c->idxmsk, X86_PMC_IDX_MAX) {
if (new == -1 || hwc->idx == idx)
/* assign free slot, prefer hwc->idx */
old = cmpxchg(nb->owners + idx, NULL, event);
@@ -391,6 +379,25 @@ static void amd_pmu_cpu_dead(int cpu)
}
 }
 
+static struct event_constraint *
+amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
+{
+   /*
+* if not NB event or no NB, then no constraints
+*/
+   if (!(amd_has_nb(cpuc) && amd_is_nb_event(&event->hw)))
+   return &unconstrained;
+
+   return __amd_get_nb_event_constraints(cpuc, event, &unconstrained);
+}
+
+static void amd_put_event_constraints(struct cpu_hw_events *cpuc,
+ struct perf_event *event)
+{
+   if (amd_has_nb(cpuc) && amd_is_nb_event(&event->hw))
+   __amd_put_nb_event_constraints(cpuc, event);
+}
+
 PMU_FORMAT_ATTR(event, "config:0-7,32-35");
 PMU_FORMAT_ATTR(umask, "config:8-15"   );
 PMU_FORMAT_ATTR(edge,  "config:18" );
-- 
1.7.9.5




[PATCH 1/6] perf, amd: Rework northbridge event constraints handler

2012-12-05 Thread Jacob Shin
From: Robert Richter 

Code simplification. No functional changes.

Signed-off-by: Robert Richter 
Signed-off-by: Jacob Shin 
---
 arch/x86/kernel/cpu/perf_event_amd.c |   68 +-
 1 file changed, 26 insertions(+), 42 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_amd.c 
b/arch/x86/kernel/cpu/perf_event_amd.c
index c93bc4e..e7963c7 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -256,9 +256,8 @@ amd_get_event_constraints(struct cpu_hw_events *cpuc, 
struct perf_event *event)
 {
struct hw_perf_event *hwc = &event->hw;
struct amd_nb *nb = cpuc->amd_nb;
-   struct perf_event *old = NULL;
-   int max = x86_pmu.num_counters;
-   int i, j, k = -1;
+   struct perf_event *old;
+   int idx, new = -1;
 
/*
 * if not NB event or no NB, then no constraints
@@ -276,48 +275,33 @@ amd_get_event_constraints(struct cpu_hw_events *cpuc, 
struct perf_event *event)
 * because of successive calls to x86_schedule_events() from
 * hw_perf_group_sched_in() without hw_perf_enable()
 */
-   for (i = 0; i < max; i++) {
-   /*
-* keep track of first free slot
-*/
-   if (k == -1 && !nb->owners[i])
-   k = i;
+   for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+   if (new == -1 || hwc->idx == idx)
+   /* assign free slot, prefer hwc->idx */
+   old = cmpxchg(nb->owners + idx, NULL, event);
+   else if (nb->owners[idx] == event)
+   /* event already present */
+   old = event;
+   else
+   continue;
+
+   if (old && old != event)
+   continue;
+
+   /* reassign to this slot */
+   if (new != -1)
+   cmpxchg(nb->owners + new, event, NULL);
+   new = idx;
 
/* already present, reuse */
-   if (nb->owners[i] == event)
-   goto done;
-   }
-   /*
-* not present, so grab a new slot
-* starting either at:
-*/
-   if (hwc->idx != -1) {
-   /* previous assignment */
-   i = hwc->idx;
-   } else if (k != -1) {
-   /* start from free slot found */
-   i = k;
-   } else {
-   /*
-* event not found, no slot found in
-* first pass, try again from the
-* beginning
-*/
-   i = 0;
-   }
-   j = i;
-   do {
-   old = cmpxchg(nb->owners+i, NULL, event);
-   if (!old)
+   if (old == event)
break;
-   if (++i == max)
-   i = 0;
-   } while (i != j);
-done:
-   if (!old)
-   return &nb->event_constraints[i];
-
-   return &emptyconstraint;
+   }
+
+   if (new == -1)
+   return &emptyconstraint;
+
+   return &nb->event_constraints[new];
 }
 
 static struct amd_nb *amd_alloc_nb(int cpu)
-- 
1.7.9.5




[PATCH 4/6] perf, x86: Move MSR address offset calculation to architecture specific files

2012-12-05 Thread Jacob Shin
Move the counter-index-to-MSR-address-offset calculation to architecture
specific files. This prepares the way for perf_event_amd to enable
counter addresses that are not contiguous -- for example AMD Family
15h processors have 6 core performance counters starting at 0xc0010200
and 4 northbridge performance counters starting at 0xc0010240.
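
As a worked example (illustrative, using the MSR bases named in this series):
on a CPU with the core performance counter extensions, counter index 3 has
offset 3 << 1 = 6, so its event select register is MSR_F15H_PERF_CTL
(0xc0010200) + 6 = 0xc0010206 and its counter register is MSR_F15H_PERF_CTR
(0xc0010201) + 6 = 0xc0010207. On a legacy CPU the offset is simply the
index, giving 0xc0010000 + 3 = 0xc0010003 and 0xc0010004 + 3 = 0xc0010007.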

Signed-off-by: Jacob Shin 
---
 arch/x86/kernel/cpu/perf_event.h |   21 -
 arch/x86/kernel/cpu/perf_event_amd.c |   42 ++
 2 files changed, 47 insertions(+), 16 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 115c1ea..015826e 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -325,6 +325,7 @@ struct x86_pmu {
int (*schedule_events)(struct cpu_hw_events *cpuc, int n, 
int *assign);
unsignedeventsel;
unsignedperfctr;
+   int (*addr_offset)(int index, int eventsel);
u64 (*event_map)(int);
int max_events;
int num_counters;
@@ -446,28 +447,16 @@ extern u64 __read_mostly hw_cache_extra_regs
 
 u64 x86_perf_event_update(struct perf_event *event);
 
-static inline int x86_pmu_addr_offset(int index)
-{
-   int offset;
-
-   /* offset = X86_FEATURE_PERFCTR_CORE ? index << 1 : index */
-   alternative_io(ASM_NOP2,
-  "shll $1, %%eax",
-  X86_FEATURE_PERFCTR_CORE,
-  "=a" (offset),
-  "a"  (index));
-
-   return offset;
-}
-
 static inline unsigned int x86_pmu_config_addr(int index)
 {
-   return x86_pmu.eventsel + x86_pmu_addr_offset(index);
+   return x86_pmu.eventsel +
+   (x86_pmu.addr_offset ? x86_pmu.addr_offset(index, 1) : index);
 }
 
 static inline unsigned int x86_pmu_event_addr(int index)
 {
-   return x86_pmu.perfctr + x86_pmu_addr_offset(index);
+   return x86_pmu.perfctr +
+   (x86_pmu.addr_offset ? x86_pmu.addr_offset(index, 0) : index);
 }
 
 int x86_setup_perfctr(struct perf_event *event);
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c 
b/arch/x86/kernel/cpu/perf_event_amd.c
index 0c2cc51..ef1df38 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -132,6 +132,47 @@ static u64 amd_pmu_event_map(int hw_event)
return amd_perfmon_event_map[hw_event];
 }
 
+/*
+ * Previously calculated offsets
+ */
+static unsigned int event_offsets[X86_PMC_IDX_MAX] __read_mostly;
+static unsigned int count_offsets[X86_PMC_IDX_MAX] __read_mostly;
+
+/*
+ * Legacy CPUs:
+ *   4 counters starting at 0xc0010000 each offset by 1
+ *
+ * CPUs with core performance counter extensions:
+ *   6 counters starting at 0xc0010200 each offset by 2
+ */
+static inline int amd_pmu_addr_offset(int index, int eventsel)
+{
+   int offset;
+
+   if (!index)
+   return index;
+
+   if (eventsel)
+   offset = event_offsets[index];
+   else
+   offset = count_offsets[index];
+
+   if (offset)
+   return offset;
+
+   if (!cpu_has_perfctr_core)
+   offset = index;
+   else
+   offset = index << 1;
+
+   if (eventsel)
+   event_offsets[index] = offset;
+   else
+   count_offsets[index] = offset;
+
+   return offset;
+}
+
 static int amd_pmu_hw_config(struct perf_event *event)
 {
int ret;
@@ -578,6 +619,7 @@ static __initconst const struct x86_pmu amd_pmu = {
.schedule_events= x86_schedule_events,
.eventsel   = MSR_K7_EVNTSEL0,
.perfctr= MSR_K7_PERFCTR0,
+   .addr_offset= amd_pmu_addr_offset,
.event_map  = amd_pmu_event_map,
.max_events = ARRAY_SIZE(amd_perfmon_event_map),
.num_counters   = AMD64_NUM_COUNTERS,
-- 
1.7.9.5




[PATCH 5/6] perf, x86: Allow for architecture specific RDPMC indexes

2012-12-05 Thread Jacob Shin
Similar to config_base and event_base, allow architecture specific
RDPMC ECX values.
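
For context, user space reads a counter with the rdpmc instruction, whose ECX
operand selects the counter -- which is why the index may need to be
architecture specific. A minimal sketch of such a read (illustrative, not part
of this patch; whether rdpmc is usable from user space and which indexes are
valid depends on the CPU and kernel configuration):

#include <stdint.h>

/*
 * Illustrative only: read the performance counter selected by 'index'
 * (the ECX value) with the rdpmc instruction.
 */
static inline uint64_t read_pmc(uint32_t index)
{
	uint32_t lo, hi;

	asm volatile("rdpmc" : "=a" (lo), "=d" (hi) : "c" (index));
	return ((uint64_t)hi << 32) | lo;
}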

Signed-off-by: Jacob Shin 
---
 arch/x86/kernel/cpu/perf_event.c |2 +-
 arch/x86/kernel/cpu/perf_event.h |6 ++
 arch/x86/kernel/cpu/perf_event_amd.c |6 ++
 3 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 4428fd1..b63982b 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -835,7 +835,7 @@ static inline void x86_assign_hw_event(struct perf_event 
*event,
} else {
hwc->config_base = x86_pmu_config_addr(hwc->idx);
hwc->event_base  = x86_pmu_event_addr(hwc->idx);
-   hwc->event_base_rdpmc = hwc->idx;
+   hwc->event_base_rdpmc = x86_pmu_rdpmc_index(hwc->idx);
}
 }
 
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 015826e..4a26fb1 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -326,6 +326,7 @@ struct x86_pmu {
unsignedeventsel;
unsignedperfctr;
int (*addr_offset)(int index, int eventsel);
+   int (*rdpmc_index)(int index);
u64 (*event_map)(int);
int max_events;
int num_counters;
@@ -459,6 +460,11 @@ static inline unsigned int x86_pmu_event_addr(int index)
(x86_pmu.addr_offset ? x86_pmu.addr_offset(index, 0) : index);
 }
 
+static inline int x86_pmu_rdpmc_index(int index)
+{
+   return x86_pmu.rdpmc_index ? x86_pmu.rdpmc_index(index) : index;
+}
+
 int x86_setup_perfctr(struct perf_event *event);
 
 int x86_pmu_hw_config(struct perf_event *event);
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c 
b/arch/x86/kernel/cpu/perf_event_amd.c
index ef1df38..faf9072 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -173,6 +173,11 @@ static inline int amd_pmu_addr_offset(int index, int 
eventsel)
return offset;
 }
 
+static inline int amd_pmu_rdpmc_index(int index)
+{
+   return index;
+}
+
 static int amd_pmu_hw_config(struct perf_event *event)
 {
int ret;
@@ -620,6 +625,7 @@ static __initconst const struct x86_pmu amd_pmu = {
.eventsel   = MSR_K7_EVNTSEL0,
.perfctr= MSR_K7_PERFCTR0,
.addr_offset= amd_pmu_addr_offset,
+   .rdpmc_index= amd_pmu_rdpmc_index,
.event_map  = amd_pmu_event_map,
.max_events = ARRAY_SIZE(amd_perfmon_event_map),
.num_counters   = AMD64_NUM_COUNTERS,
-- 
1.7.9.5




[PATCH 3/6] perf, amd: Use proper naming scheme for AMD bit field definitions

2012-12-05 Thread Jacob Shin
Update these AMD bit field names to be consistent with the naming
convention followed by the rest of the file.

Signed-off-by: Jacob Shin 
---
 arch/x86/include/asm/perf_event.h|4 ++--
 arch/x86/kernel/cpu/perf_event_amd.c |8 
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/perf_event.h 
b/arch/x86/include/asm/perf_event.h
index 4fabcdf..2234eaaec 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -29,8 +29,8 @@
 #define ARCH_PERFMON_EVENTSEL_INV  (1ULL << 23)
 #define ARCH_PERFMON_EVENTSEL_CMASK0xFF00ULL
 
-#define AMD_PERFMON_EVENTSEL_GUESTONLY (1ULL << 40)
-#define AMD_PERFMON_EVENTSEL_HOSTONLY  (1ULL << 41)
+#define AMD64_EVENTSEL_GUESTONLY   (1ULL << 40)
+#define AMD64_EVENTSEL_HOSTONLY(1ULL << 41)
 
 #define AMD64_EVENTSEL_EVENT   \
(ARCH_PERFMON_EVENTSEL_EVENT | (0x0FULL << 32))
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c 
b/arch/x86/kernel/cpu/perf_event_amd.c
index 9541fe5..0c2cc51 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -156,9 +156,9 @@ static int amd_pmu_hw_config(struct perf_event *event)
event->hw.config &= ~(ARCH_PERFMON_EVENTSEL_USR |
  ARCH_PERFMON_EVENTSEL_OS);
else if (event->attr.exclude_host)
-   event->hw.config |= AMD_PERFMON_EVENTSEL_GUESTONLY;
+   event->hw.config |= AMD64_EVENTSEL_GUESTONLY;
else if (event->attr.exclude_guest)
-   event->hw.config |= AMD_PERFMON_EVENTSEL_HOSTONLY;
+   event->hw.config |= AMD64_EVENTSEL_HOSTONLY;
 
if (event->attr.type != PERF_TYPE_RAW)
return 0;
@@ -336,7 +336,7 @@ static void amd_pmu_cpu_starting(int cpu)
struct amd_nb *nb;
int i, nb_id;
 
-   cpuc->perf_ctr_virt_mask = AMD_PERFMON_EVENTSEL_HOSTONLY;
+   cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY;
 
if (boot_cpu_data.x86_max_cores < 2)
return;
@@ -669,7 +669,7 @@ void amd_pmu_disable_virt(void)
 * SVM is disabled the Guest-only bits still gets set and the counter
 * will not count anything.
 */
-   cpuc->perf_ctr_virt_mask = AMD_PERFMON_EVENTSEL_HOSTONLY;
+   cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY;
 
/* Reload all events */
x86_pmu_disable_all();
-- 
1.7.9.5




[PATCH 6/6] perf, amd: Enable northbridge performance counters on AMD family 15h

2012-12-05 Thread Jacob Shin
On AMD family 15h processors, there are 4 new performance counters
(in addition to 6 core performance counters) that can be used for
counting northbridge events (i.e. DRAM accesses). Their bit fields are
almost identical to the core performance counters. However, unlike the
core performance counters, these MSRs are shared between multiple
cores (that share the same northbridge). We will reuse the same code
path as existing family 10h northbridge event constraints handler
logic to enforce this sharing.
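
A minimal sketch of the sharing scheme being reused (illustrative; the real
logic is the __amd_get_nb_event_constraints() path from the earlier patches
in this series, and the struct and helper below are made up for the example):
all cores on a node see one shared owners[] array, and a core claims a
counter slot with an atomic compare-and-swap so that two cores never program
the same shared counter.

#include <stddef.h>

#define NUM_COUNTERS_NB 4

struct nb_counters {
	void *owners[NUM_COUNTERS_NB];	/* shared by all cores on this northbridge */
};

/*
 * Illustrative only: claim a free NB counter slot for 'event', or
 * return -1 if every slot is owned by some other event.
 */
static int claim_nb_counter(struct nb_counters *nb, void *event)
{
	int i;

	for (i = 0; i < NUM_COUNTERS_NB; i++) {
		void *expected = NULL;

		if (nb->owners[i] == event)
			return i;	/* already ours, reuse it */
		if (__atomic_compare_exchange_n(&nb->owners[i], &expected, event,
						0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST))
			return i;	/* grabbed a free slot */
	}
	return -1;
}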

Signed-off-by: Jacob Shin 
---
 arch/x86/include/asm/cpufeature.h|2 +
 arch/x86/include/asm/msr-index.h |2 +
 arch/x86/include/asm/perf_event.h|9 ++
 arch/x86/kernel/cpu/perf_event_amd.c |  167 ++
 4 files changed, 160 insertions(+), 20 deletions(-)

diff --git a/arch/x86/include/asm/cpufeature.h 
b/arch/x86/include/asm/cpufeature.h
index c22a492..b4cd472 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -167,6 +167,7 @@
 #define X86_FEATURE_TBM(6*32+21) /* trailing bit manipulations 
*/
 #define X86_FEATURE_TOPOEXT(6*32+22) /* topology extensions CPUID leafs */
 #define X86_FEATURE_PERFCTR_CORE (6*32+23) /* core performance counter 
extensions */
+#define X86_FEATURE_PERFCTR_NB  (6*32+24) /* NB performance counter extensions 
*/
 
 /*
  * Auxiliary flags: Linux defined - For features scattered in various
@@ -308,6 +309,7 @@ extern const char * const x86_power_flags[32];
 #define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR)
 #define cpu_has_pclmulqdq  boot_cpu_has(X86_FEATURE_PCLMULQDQ)
 #define cpu_has_perfctr_core   boot_cpu_has(X86_FEATURE_PERFCTR_CORE)
+#define cpu_has_perfctr_nb boot_cpu_has(X86_FEATURE_PERFCTR_NB)
 #define cpu_has_cx8boot_cpu_has(X86_FEATURE_CX8)
 #define cpu_has_cx16   boot_cpu_has(X86_FEATURE_CX16)
 #define cpu_has_eager_fpu  boot_cpu_has(X86_FEATURE_EAGER_FPU)
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index e400cdb..736fbf6 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -157,6 +157,8 @@
 /* Fam 15h MSRs */
 #define MSR_F15H_PERF_CTL  0xc0010200
 #define MSR_F15H_PERF_CTR  0xc0010201
+#define MSR_F15H_NB_PERF_CTL   0xc0010240
+#define MSR_F15H_NB_PERF_CTR   0xc0010241
 
 /* Fam 10h MSRs */
 #define MSR_FAM10H_MMIO_CONF_BASE  0xc0010058
diff --git a/arch/x86/include/asm/perf_event.h 
b/arch/x86/include/asm/perf_event.h
index 2234eaaec..57cb634 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -29,9 +29,14 @@
 #define ARCH_PERFMON_EVENTSEL_INV  (1ULL << 23)
 #define ARCH_PERFMON_EVENTSEL_CMASK0xFF00ULL
 
+#define AMD64_EVENTSEL_INT_CORE_ENABLE (1ULL << 36)
 #define AMD64_EVENTSEL_GUESTONLY   (1ULL << 40)
 #define AMD64_EVENTSEL_HOSTONLY(1ULL << 41)
 
+#define AMD64_EVENTSEL_INT_CORE_SEL_SHIFT  37
+#define AMD64_EVENTSEL_INT_CORE_SEL_MASK   \
+   (0xFULL << AMD64_EVENTSEL_INT_CORE_SEL_SHIFT)
+
 #define AMD64_EVENTSEL_EVENT   \
(ARCH_PERFMON_EVENTSEL_EVENT | (0x0FULL << 32))
 #define INTEL_ARCH_EVENT_MASK  \
@@ -46,8 +51,12 @@
 #define AMD64_RAW_EVENT_MASK   \
(X86_RAW_EVENT_MASK  |  \
 AMD64_EVENTSEL_EVENT)
+#define AMD64_RAW_EVENT_MASK_NB\
+   (AMD64_EVENTSEL_EVENT|  \
+ARCH_PERFMON_EVENTSEL_UMASK)
 #define AMD64_NUM_COUNTERS 4
 #define AMD64_NUM_COUNTERS_CORE6
+#define AMD64_NUM_COUNTERS_NB  4
 
 #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL  0x3c
 #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK(0x00 << 8)
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c 
b/arch/x86/kernel/cpu/perf_event_amd.c
index faf9072..1a80e05 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -132,11 +132,14 @@ static u64 amd_pmu_event_map(int hw_event)
return amd_perfmon_event_map[hw_event];
 }
 
+static struct event_constraint *amd_nb_event_constraint;
+
 /*
  * Previously calculated offsets
  */
 static unsigned int event_offsets[X86_PMC_IDX_MAX] __read_mostly;
 static unsigned int count_offsets[X86_PMC_IDX_MAX] __read_mostly;
+static unsigned int rdpmc_indexes[X86_PMC_IDX_MAX] __read_mostly;
 
 /*
  * Legacy CPUs:
@@ -144,10 +147,14 @@ static unsigned int count_offsets[X86_PMC_IDX_MAX] 
__read_mostly;
  *
  * CPUs with core performance counter extensions:
  *   6 counters starting at 0xc0010200 each offset by 2
+ *
+ * CPUs with north bridge performance counter extensions:
+ *   4 additional counters starting at 0xc0010240 each offset by 2
+ *   (indexed right a

Re: [PATCH -v3 0/7] x86: Use BRK to pre mapping page table to make xen happy

2012-10-18 Thread Jacob Shin
On Thu, Oct 18, 2012 at 05:17:28PM +0100, Stefano Stabellini wrote:
> On Thu, 11 Oct 2012, Yinghai Lu wrote:
> > On Wed, Oct 10, 2012 at 9:40 AM, Stefano Stabellini
> >  wrote:
> > >
> > > So you are missing the Xen patches entirely in this iteration of the
> > > series?
> > 
> > please check updated for-x86-mm branch.
> > 
> > [PATCH -v4 00/15] x86: Use BRK to pre mapping page table to make xen happy
> > 
> > on top of current linus/master and tip/x86/mm2, but please zap last
> > patch in that branch.
> > 
> > 1. use brk to mapping first PMD_SIZE range.
> > 2. top down to initialize page table range by range.
> > 3. get rid of calculate page table, and find_early_page_table.
> > 4. remove early_ioremap in page table accessing.
> > 
> > v2: changes, update xen interface about pagetable_reserve, so not
> >use pgt_buf_* in xen code directly.
> > v3: use range top-down to initialize page table, so will not use
> >calculating/find early table anymore.
> >also reorder the patches sequence.
> > v4: add mapping_mark_page_ro to fix xen, also move pgt_buf_* to init.c
> > and merge alloc_low_page()
> > 
> > could be found at:
> > 
> > git://git.kernel.org/pub/scm/linux/kernel/git/yinghai/linux-yinghai.git
> > for-x86-mm
> 
> I find that patch series are easier to review than having to checkout
> your code and read the commit messages. Please post your patch series to
> the LKML next time.
> 
> In any case, regarding "x86, xen: Add xen_mapping_mark_page_ro": please
> take Peter's feedback into account; mark_page_ro is not a good name for
> a pvops.
> Also I don't believe that this call is actually needed, see below.
> 
> Regarding "x86, mm: setup page table in top-down": if you mark the
> pagetable page RO in alloc_low_page, won't the entire thing crash as
> soon as you try to write to it? You are supposed to mark it RO after
> filling up the pagetable page and before hooking it into the live
> pagetable.
> However contrary to my expectations, I did a quick test and it seems to
> be working, that is probably due to a bug: maybe __pa or lookup_address
> don't work correctly when called so early?

Hi Yinghai, I just tried it and dom0 died during init_memory_mapping, here
is the Oops snippet, full logs are attached:

e820: last_pfn = 0x22f000 max_arch_pfn = 0x4
e820: last_pfn = 0xcff00 max_arch_pfn = 0x4
initial memory mapped: [mem 0x-0x022a]
Base memory trampoline at [88096000] 96000 size 24576
init_memory_mapping: [mem 0x-0x000f]
 [mem 0x-0x000f] page 4k
init_memory_mapping: [mem 0x21fe0-0x21fe77fff]
 [mem 0x21fe0-0x21fe77fff] page 4k
init_memory_mapping: [mem 0x21c00-0x21fdf]
 [mem 0x21c00-0x21fdf] page 4k
init_memory_mapping: [mem 0x2-0x21bff]
 [mem 0x2-0x21bff] page 4k
init_memory_mapping: [mem 0x0010-0xcec6dfff]
 [mem 0x0010-0xcec6dfff] page 4k
init_memory_mapping: [mem 0xcf5f4000-0xcf5f4fff]
 [mem 0xcf5f4000-0xcf5f4fff] page 4k
init_memory_mapping: [mem 0xcf7fb000-0xcfc19fff]
 [mem 0xcf7fb000-0xcfc19fff] page 4k
init_memory_mapping: [mem 0xcfef4000-0xcfef]
 [mem 0xcfef4000-0xcfef] page 4k
init_memory_mapping: [mem 0x11000-0x1]
 [mem 0x11000-0x1] page 4k
PGD 0 
Oops: 0003 [#1] SMP 
Modules linked in:
CPU 0 
Pid: 0, comm: swapper Not tainted 3.6.0+ #3 AMD Pike/Pike
RIP: e030:[]  [] xen_set_pte_init+0x1/0x9
RSP: e02b:81c01ae8  EFLAGS: 00010086
RAX: 8001913d1063 RBX: 88021f63b1c8 RCX: 8163
RDX: 01ff RSI: 8001913d1063 RDI: 88021f63b1c8
RBP: 81c01af8 R08:  R09: 
R10:  R11:  R12: 00011b039000
R13: 003a R14: 00011b03a000 R15: 00011b039000
FS:  () GS:81cbe000() knlGS:
CS:  e033 DS:  ES:  CR0: 80050033
CR2:  CR3: 01c0b000 CR4: 0660
DR0:  DR1:  DR2: 
DR3:  DR6:  DR7: 
Process swapper (pid: 0, threadinfo 81c0, task 81c13410)
Stack:
 81c01af8 810330f5 81c01b58 816aa9f3
 8163 88021f63c000 0002 00011b039000
 81c01b38  00011b00 88021f7146c0
Call Trace:
 [] ? set_pte+0xb/0xd
 [] phys_pte_init+0xd4/0x106
 [] phys_pmd_init+0x1bb/0x215
 [] phys_pud_init+0x1b9/0x218
 [] kernel_physical_mapping_init+0xad/0x14a
 [] init_memory_mapping+0x275/0x303
 [] init_range_memory_mapping+0x8b/0xc8
 [] init_mem_mapping+0xf2/0x162
 [] setup_arch+0x682/0xaac
 [] ? printk+0x48/0x4a
 [] start_kernel+0x8e/0x3d8
 [] x86_64_start_reservations+0xae/0xb2
 [] xen_start_kernel+0x63d/0x63f
Code: 00 00 48 c7 c7 f2 a8 aa 81 e8 ee 61 36 ff c7 05 59 10 06 00 01 00 00 00 
5d c3 55 48 89 f7 48 89 e5 e8 95 cf 32 ff 31 c0 5d c3 55 <48> 89 37 48 89 

Re: [PATCH -v3 0/7] x86: Use BRK to pre mapping page table to make xen happy

2012-10-18 Thread Jacob Shin
On Thu, Oct 18, 2012 at 01:40:21PM -0700, Yinghai Lu wrote:
> On Thu, Oct 18, 2012 at 9:26 AM, Jacob Shin  wrote:
> 
> i tested dom0 conf on 32 bit  and 64 bit, they are all working.
> 
> and just now I tried with mem=8944m, and they are still working.
> 
> anyway, can you please updated -v5 for-x86-mm branch ?
> I removed mark_page_ro workaround according to stefano

Just tested -v5 with Xen, and Dom0 no longer dies. Also tried it on HVM DomU,
and it also boots fine.

Also tested on our 1TB machine, and it looks good; only E820_RAM ranges are mapped:

[0.00] e820: BIOS-provided physical RAM map:
[0.00] BIOS-e820: [mem 0x-0x000983ff] usable
[0.00] BIOS-e820: [mem 0x00098400-0x0009] reserved
[0.00] BIOS-e820: [mem 0x000d2000-0x000f] reserved
[0.00] BIOS-e820: [mem 0x0010-0xc7eb] usable
[0.00] BIOS-e820: [mem 0xc7ec-0xc7ed7fff] ACPI data
[0.00] BIOS-e820: [mem 0xc7ed8000-0xc7ed9fff] ACPI NVS
[0.00] BIOS-e820: [mem 0xc7eda000-0xc7ff] reserved
[0.00] BIOS-e820: [mem 0xfec0-0xfec0] reserved
[0.00] BIOS-e820: [mem 0xfee0-0xfee00fff] reserved
[0.00] BIOS-e820: [mem 0xfff0-0x] reserved
[0.00] BIOS-e820: [mem 0x0001-0x00e037ff] usable
[0.00] BIOS-e820: [mem 0x00e03800-0x00ff] reserved
[0.00] BIOS-e820: [mem 0x0100-0x011ffeff] usable

...

[0.00] e820: last_pfn = 0xc7ec0 max_arch_pfn = 0x4
[0.00] initial memory mapped: [mem 0x-0x1fff]
[0.00] Base memory trampoline at [88092000] 92000 size 24576
[0.00] Using GB pages for direct mapping
[0.00] init_memory_mapping: [mem 0x-0x000f]
[0.00]  [mem 0x-0x000f] page 4k
[0.00] init_memory_mapping: [mem 0x11ffee0-0x11ffeff]
[0.00]  [mem 0x11ffee0-0x11ffeff] page 2M
[0.00] init_memory_mapping: [mem 0x11ffc00-0x11ffedf]
[0.00]  [mem 0x11ffc00-0x11ffedf] page 2M
[0.00] init_memory_mapping: [mem 0x11f8000-0x11ffbff]
[0.00]  [mem 0x11f8000-0x11fbfff] page 1G
[0.00]  [mem 0x11fc000-0x11ffbff] page 2M
[0.00] init_memory_mapping: [mem 0x110-0x11f7fff]
[0.00]  [mem 0x110-0x11f7fff] page 1G
[0.00] init_memory_mapping: [mem 0x0010-0xc7eb]
[0.00]  [mem 0x0010-0x001f] page 4k
[0.00]  [mem 0x0020-0x3fff] page 2M
[0.00]  [mem 0x4000-0xbfff] page 1G
[0.00]  [mem 0xc000-0xc7df] page 2M
[0.00]  [mem 0xc7e0-0xc7eb] page 4k
[0.00] init_memory_mapping: [mem 0x1-0xe037ff]
[0.00]  [mem 0x1-0xdf] page 1G
[0.00]  [mem 0xe0-0xe037ff] page 2M
[0.00] init_memory_mapping: [mem 0x100-0x10f]
[0.00]  [mem 0x100-0x10f] page 1G

Thanks!!

> 
> Thanks
> 
> Yinghai
> 



Re: BUG: 1bbbbe7 (x86: Exclude E820_RESERVED regions...) PANIC on boot

2012-10-20 Thread Jacob Shin
On Sat, Oct 20, 2012 at 09:01:43PM -0700, Yinghai Lu wrote:
> On Sat, Oct 20, 2012 at 5:17 PM, Tom Rini  wrote:
> > On 10/20/12 17:11, Shin, Jacob wrote:
> >> Hi could you please attach the dmesg output? Before rc2 is fine as well.
> >> I would like to see the E820 table. Thank you,
> >
> > dmesg is quite long so I've put it on pastebin: http://pastebin.com/4eSPEAvB
> >
> > --
> 
> [0.00] BIOS-e820: [mem 0x00011000-0x00042fff] usable
> 
> pre-calculate table size is too small, so it crashes.

Right,

I think just this one patch 3/6 on top of -rc2 should work:

https://lkml.org/lkml/2012/8/29/223

That would be a simpler path for 3.7,

Thanks!

> 
> can you please try
> 
>   git://git.kernel.org/pub/scm/linux/kernel/git/yinghai/linux-yinghai.git
> for-x86-mm
> 
> and post bootlog?
> 
> Thanks
> 
> Yinghai
> 



Re: BUG: 1bbbbe7 (x86: Exclude E820_RESERVED regions...) PANIC on boot

2012-10-21 Thread Jacob Shin
On Sun, Oct 21, 2012 at 10:51:35AM -0700, Tom Rini wrote:
> On 10/20/12 21:18, Jacob Shin wrote:
> > On Sat, Oct 20, 2012 at 09:01:43PM -0700, Yinghai Lu wrote:
> >> On Sat, Oct 20, 2012 at 5:17 PM, Tom Rini  wrote:
> >>> On 10/20/12 17:11, Shin, Jacob wrote:
> >>>> Hi could you please attach the dmesg output? Before rc2 is fine as well.
> >>>> I would like to see the E820 table. Thank you,
> >>>
> >>> dmesg is quite long so I've put it on pastebin: 
> >>> http://pastebin.com/4eSPEAvB
> >>>
> >>> --
> >>
> >> [0.00] BIOS-e820: [mem 0x00011000-0x00042fff] 
> >> usable
> >>
> >> pre-calculate table size is too small, so it crashes.
> > 
> > Right,
> > 
> > I think just this one patch 3/6 on top of -rc2 should work:
> > 
> > https://lkml.org/lkml/2012/8/29/223
> > 
> > That would be a simpler path for 3.7,
> 
> It doesn't apply easily (for me) on top of 3.7-rc2 however.  Happy to
> test a patch on top of 3.7-rc2 when you're able to.

Ah, sorry, this one should apply on top of 3.7-rc2:

https://lkml.org/lkml/2012/8/24/469

Could you try that? Just that single patch, not the whole patchset.

Thanks!

-Jacob

> 
> -- 
> Tom
> 



Re: BUG: 1bbbbe7 (x86: Exclude E820_RESERVED regions...) PANIC on boot

2012-10-22 Thread Jacob Shin
On Sun, Oct 21, 2012 at 02:23:58PM -0700, Tom Rini wrote:
> On 10/21/12 14:06, Jacob Shin wrote:
> > Ah, sorry, this one should apply on top of 3.7-rc2:
> > 
> > https://lkml.org/lkml/2012/8/24/469
> > 
> > Could you try that? Just that single patch, not the whole patchset.
> 
> That fixes it, replied with a note and Tested-by, thanks!

Thanks for testing!

hpa, so sorry, but it looks like we need one more patch [PATCH 2/5] x86:
find_early_table_space based on memory ranges that are being mapped:

  https://lkml.org/lkml/2012/8/24/469

on top of this, because find_early_table_space calculation does not come out
correctly for this particular E820 table that Tom has:

  http://pastebin.com/4eSPEAvB

The reason we hit this now and never hit it before is that the start used to
be hard coded to 1UL<<32.

Thanks,

-Jacob

> 
> -- 
> Tom
> 
> 



Re: BUG: 1bbbbe7 (x86: Exclude E820_RESERVED regions...) PANIC on boot

2012-10-22 Thread Jacob Shin
On Mon, Oct 22, 2012 at 11:05:29AM -0700, Yinghai Lu wrote:
> On Mon, Oct 22, 2012 at 7:40 AM, Jacob Shin  wrote:
> > On Sun, Oct 21, 2012 at 02:23:58PM -0700, Tom Rini wrote:
> >> On 10/21/12 14:06, Jacob Shin wrote:
> >> > Ah, sorry, this one should apply on top of 3.7-rc2:
> >> >
> >> > https://lkml.org/lkml/2012/8/24/469
> >> >
> >> > Could you try that? Just that single patch, not the whole patchset.
> >>
> >> That fixes it, replied with a note and Tested-by, thanks!
> >
> > Thanks for testing!
> >
> > hpa, so sorry, but it looks like we need one more patch [PATCH 2/5] x86:
> > find_early_table_space based on memory ranges that are being mapped:
> >
> >   https://lkml.org/lkml/2012/8/24/469
> >
> > on top of this, because find_early_table_space calculation does not come out
> > correctly for this particular E820 table that Tom has:
> >
> >   http://pastebin.com/4eSPEAvB
> >
> > The reason why we hit this now, and never hit it before is because before 
> > the
> > start was hard coded to 1UL<<32.
> >
> 
> I'm afraid that  we may need add more patches to make v3.7 really
> handle every corner case.
> 
> During testing, I found more problem:
> 1. E820_RAM and E820_RESEVED_KERN
>EFI change some E820_RAM to E820_RESREVED_KERN to cover
>efi setup_data. and will pass to e820_saved, to next kexec-ed kernel.
>   So we can use E820_RAM to loop it, and should still E820_RAM and
> E820_RESERVED_KERN combined.
>   otherwise will render page table with small pages, or every some partial
>   is not covered.
>   So i change to for_each_mem_pfn_range(), we fill the memblock with
>   E820_RAM and E820_RESERVED_KERN, and memblock will merge
>   range together, that will make mapping still use big page size.

Does EFI do this to memory above 4G? All the EFI BIOSes we have in house
looked to be touching only memory under 4G.

> 
> 2. partial page:
>E820 or user could pass memmap that  is not page aligned.
>old cold will guarded by max_low_pfn and max_pfn. so the end partial
>page will be trimmed down, and memblock can one use it.
>middle partial page will still get covered by directly mapping, and
> memblock still can use them.
>Now we will not map middle partial page and memblock still try to use it
> we could get panic when accessing those pages.
> 
> So I would suggest to just revert that temporary patch at this time,
> and later come out one complete patch for stable kernels.

Hm okay, I was hoping not, but if it has to be ..

> 
> Thanks
> 
> Yinghai
> 



Re: [PATCH V4 0/6] perf, amd: Enable AMD family 15h northbridge counters

2012-12-10 Thread Jacob Shin
On Wed, Dec 05, 2012 at 05:04:12PM -0600, Jacob Shin wrote:
> The following patchset enables 4 additional performance counters in
> AMD family 15h processors that count northbridge events -- such as
> number of DRAM accesses.
> 
> This patchset is based on previous work done by Robert Richter
>  :
> 
> https://lkml.org/lkml/2012/6/19/324
> 
> The main differences are:
> 
> * The northbridge counters are indexed contiguously right above the
>   core performance counters.
> 
> * MSR address offset calculations are moved to architecture specific
>   files.
> 
> * Interrups are set up to be delivered only to a single core.
> 
> V4:
> * Moved interrupt core select set up back to event constraints
>   function, sicne during ->hw_config time we do not yet know on which
>   CPU the the event will run on.
> * Tested on and made minor revisions to make sure that the patchset is
>   compatible with upcoming AMD Family 16h processors, and will support
>   core and NB counters without any further patches.
> 
> V3:
> Addressed the following feedback/comments from Robert's review
> * https://lkml.org/lkml/2012/11/16/484
> * https://lkml.org/lkml/2012/11/26/162
> 
> V2:
> Separate out Robert's patches, and add properly ordered certificate of
> origins.
> 
> Jacob Shin (4):
>   perf, amd: Use proper naming scheme for AMD bit field definitions
>   perf, x86: Move MSR address offset calculation to architecture
> specific files
>   perf, x86: Allow for architecture specific RDPMC indexes
>   perf, amd: Enable northbridge performance counters on AMD family 15h
> 
> Robert Richter (2):
>   perf, amd: Rework northbridge event constraints handler
>   perf, amd: Generalize northbridge constraints code for family 15h
> 
>  arch/x86/include/asm/cpufeature.h|2 +
>  arch/x86/include/asm/msr-index.h |2 +
>  arch/x86/include/asm/perf_event.h|   13 +-
>  arch/x86/kernel/cpu/perf_event.c |2 +-
>  arch/x86/kernel/cpu/perf_event.h |   25 ++-
>  arch/x86/kernel/cpu/perf_event_amd.c |  318 
> ++
>  6 files changed, 268 insertions(+), 94 deletions(-)
> 
> -- 

Ping? Any comments/feedback? If things look okay, could you
please commit to tip perf/core?

Thank you,

-Jacob



Re: [PATCH 0/4] perf, amd: Enable AMD family 15h northbridge counters

2012-11-12 Thread Jacob Shin
On Mon, Nov 12, 2012 at 03:22:33PM +0100, Robert Richter wrote:
> Stephane,
> 
> On 12.11.12 13:24:38, Stephane Eranian wrote:
> > Anybody from AMD or formerly @ AMD care to submit a libpfm4 patch
> > to add the Fam15th NB events?
> > 
> > I'd like to avoid having to type them in manually.
> 
> Suravee may probably help you here.

Suravee is out of the office until the end of the month.

Is this necessary ASAP for this perf patchset to go upstream? If so,
I'll commit to getting a patch out for libpfm4 soon.

Otherwise, I'll ask Suravee to work on it when he gets back.

Thanks,

-Jacob

> 
> HTH
> 
> -Robert
> 



Re: [PATCH 0/4] x86, cacheinfo: Use AMD topology extensions

2012-11-13 Thread Jacob Shin
On Wed, Nov 07, 2012 at 10:48:35AM +0100, H. Peter Anvin wrote:
> Too many of us at LCE right now...

Hi, pinging once again: could you take a look at the patchset when
you get the chance, and if there are no problems, commit it into tip?

Thanks!

-Jacob


> 
> Jacob Shin  wrote:
> 
> >On Fri, Oct 19, 2012 at 10:55:19AM +0200, Andreas Herrmann wrote:
> >> Hi,
> >> 
> >> Following patches modify cachinfo code to make use of AMD's topology
> >> extension CPUID functions. Thus (hopefully) we can avoid CPU specific
> >> modifications whenever cache topology changes.
> >> 
> >> Please apply.
> >
> >Acked-by: Jacob Shin 
> >
> >
> >
> >Ping ?
> >
> >Any feedback ? If not could we get it into tip ?
> >
> >Thank you,
> >
> >> 
> >> 
> >> Thanks,
> >> 
> >> Andreas
> >> 
> >> 
> >> 
> >> 
> 
> -- 
> Sent from my mobile phone. Please excuse brevity and lack of formatting.
> 



Re: [PATCH 0/4] x86, cacheinfo: Use AMD topology extensions

2012-11-06 Thread Jacob Shin
On Fri, Oct 19, 2012 at 10:55:19AM +0200, Andreas Herrmann wrote:
> Hi,
> 
> Following patches modify cachinfo code to make use of AMD's topology
> extension CPUID functions. Thus (hopefully) we can avoid CPU specific
> modifications whenever cache topology changes.
> 
> Please apply.

Acked-by: Jacob Shin 



Ping ?

Any feedback ? If not could we get it into tip ?

Thank you,

> 
> 
> Thanks,
> 
> Andreas
> 
> 
> 
> 



[PATCH 1/4] perf, amd: Simplify northbridge event constraints handler

2012-11-09 Thread Jacob Shin
From: Robert Richter 

Code simplification, there is no functional change.

Signed-off-by: Robert Richter 
Signed-off-by: Jacob Shin 
---
 arch/x86/kernel/cpu/perf_event_amd.c |   68 +-
 1 file changed, 26 insertions(+), 42 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_amd.c 
b/arch/x86/kernel/cpu/perf_event_amd.c
index 4528ae7..d60c5c7 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -256,9 +256,8 @@ amd_get_event_constraints(struct cpu_hw_events *cpuc, 
struct perf_event *event)
 {
struct hw_perf_event *hwc = &event->hw;
struct amd_nb *nb = cpuc->amd_nb;
-   struct perf_event *old = NULL;
-   int max = x86_pmu.num_counters;
-   int i, j, k = -1;
+   struct perf_event *old;
+   int idx, new = -1;
 
/*
 * if not NB event or no NB, then no constraints
@@ -276,48 +275,33 @@ amd_get_event_constraints(struct cpu_hw_events *cpuc, 
struct perf_event *event)
 * because of successive calls to x86_schedule_events() from
 * hw_perf_group_sched_in() without hw_perf_enable()
 */
-   for (i = 0; i < max; i++) {
-   /*
-* keep track of first free slot
-*/
-   if (k == -1 && !nb->owners[i])
-   k = i;
+   for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+   if (new == -1 || hwc->idx == idx)
+   /* assign free slot, prefer hwc->idx */
+   old = cmpxchg(nb->owners + idx, NULL, event);
+   else if (nb->owners[idx] == event)
+   /* event already present */
+   old = event;
+   else
+   continue;
+
+   if (old && old != event)
+   continue;
+
+   /* reassign to this slot */
+   if (new != -1)
+   cmpxchg(nb->owners + new, event, NULL);
+   new = idx;
 
/* already present, reuse */
-   if (nb->owners[i] == event)
-   goto done;
-   }
-   /*
-* not present, so grab a new slot
-* starting either at:
-*/
-   if (hwc->idx != -1) {
-   /* previous assignment */
-   i = hwc->idx;
-   } else if (k != -1) {
-   /* start from free slot found */
-   i = k;
-   } else {
-   /*
-* event not found, no slot found in
-* first pass, try again from the
-* beginning
-*/
-   i = 0;
-   }
-   j = i;
-   do {
-   old = cmpxchg(nb->owners+i, NULL, event);
-   if (!old)
+   if (old == event)
break;
-   if (++i == max)
-   i = 0;
-   } while (i != j);
-done:
-   if (!old)
-   return &nb->event_constraints[i];
-
-   return &emptyconstraint;
+   }
+
+   if (new == -1)
+   return &emptyconstraint;
+
+   return &nb->event_constraints[new];
 }
 
 static struct amd_nb *amd_alloc_nb(int cpu)
-- 
1.7.9.5


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 0/4] perf, amd: Enable AMD family 15h northbridge counters

2012-11-09 Thread Jacob Shin
The following patchset enables 4 additional performance counters in
AMD family 15h processors that count northbridge events -- such as
DRAM accesses.

This patchset is based on previous work done by Robert Richter
 :

https://lkml.org/lkml/2012/6/19/324

The main differences are:

- The northbridge counters are indexed contiguously right above the
  core performance counters.

- MSR address offset calculations are moved to architecture specific
  files.

- Interrupts are set up to be delivered only to a single core.
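
For illustration, here is a minimal sketch of the contiguous indexing
described above (this is not code from the series; the helper and macro
names are made up, and the MSR bases are the ones quoted in patches 3/4
and 4/4):

#define F15H_CORE_PERF_CTL_BASE	0xc0010200	/* 6 core counters, MSR pairs 2 apart */
#define F15H_NB_PERF_CTL_BASE	0xc0010240	/* 4 NB counters, MSR pairs 2 apart */
#define F15H_NUM_CORE_COUNTERS	6

/* Hypothetical helper: map a contiguous counter index to its control MSR. */
static unsigned int f15h_eventsel_addr(int index)
{
	if (index < F15H_NUM_CORE_COUNTERS)
		return F15H_CORE_PERF_CTL_BASE + (index << 1);

	/* NB counters are indexed right above the core counters */
	return F15H_NB_PERF_CTL_BASE + ((index - F15H_NUM_CORE_COUNTERS) << 1);
}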

Jacob Shin (3):
  perf, amd: Refactor northbridge event constraints handler for code
sharing
  perf, x86: Move MSR address offset calculation to architecture
specific files
  perf, amd: Enable northbridge performance counters on AMD family 15h

Robert Richter (1):
  perf, amd: Simplify northbridge event constraints handler

 arch/x86/include/asm/cpufeature.h|2 +
 arch/x86/include/asm/msr-index.h |2 +
 arch/x86/include/asm/perf_event.h|6 +
 arch/x86/kernel/cpu/perf_event.h |   21 +--
 arch/x86/kernel/cpu/perf_event_amd.c |  279 +++---
 5 files changed, 207 insertions(+), 103 deletions(-)

-- 
1.7.9.5


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 3/4] perf, x86: Move MSR address offset calculation to architecture specific files

2012-11-09 Thread Jacob Shin
Move the counter-index-to-MSR-address-offset calculation into architecture
specific files. This prepares the way for perf_event_amd to enable
counter addresses that are not contiguous -- for example AMD Family
15h processors have 6 core performance counters starting at 0xc0010200
and 4 northbridge performance counters starting at 0xc0010240.

Signed-off-by: Jacob Shin 
---
 arch/x86/kernel/cpu/perf_event.h |   21 +---
 arch/x86/kernel/cpu/perf_event_amd.c |   36 ++
 2 files changed, 41 insertions(+), 16 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 271d257..aacf025 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -325,6 +325,7 @@ struct x86_pmu {
int (*schedule_events)(struct cpu_hw_events *cpuc, int n, 
int *assign);
unsignedeventsel;
unsignedperfctr;
+   int (*addr_offset)(int index);
u64 (*event_map)(int);
int max_events;
int num_counters;
@@ -444,28 +445,16 @@ extern u64 __read_mostly hw_cache_extra_regs
 
 u64 x86_perf_event_update(struct perf_event *event);
 
-static inline int x86_pmu_addr_offset(int index)
-{
-   int offset;
-
-   /* offset = X86_FEATURE_PERFCTR_CORE ? index << 1 : index */
-   alternative_io(ASM_NOP2,
-  "shll $1, %%eax",
-  X86_FEATURE_PERFCTR_CORE,
-  "=a" (offset),
-  "a"  (index));
-
-   return offset;
-}
-
 static inline unsigned int x86_pmu_config_addr(int index)
 {
-   return x86_pmu.eventsel + x86_pmu_addr_offset(index);
+   return x86_pmu.eventsel +
+   (x86_pmu.addr_offset ? x86_pmu.addr_offset(index) : index);
 }
 
 static inline unsigned int x86_pmu_event_addr(int index)
 {
-   return x86_pmu.perfctr + x86_pmu_addr_offset(index);
+   return x86_pmu.perfctr +
+   (x86_pmu.addr_offset ? x86_pmu.addr_offset(index) : index);
 }
 
 int x86_setup_perfctr(struct perf_event *event);
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c 
b/arch/x86/kernel/cpu/perf_event_amd.c
index d17debd..078beb5 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -132,6 +132,41 @@ static u64 amd_pmu_event_map(int hw_event)
return amd_perfmon_event_map[hw_event];
 }
 
+/*
+ * Previously calculated offsets
+ */
+static unsigned int addr_offsets[X86_PMC_IDX_MAX] __read_mostly;
+
+/*
+ * Legacy CPUs:
+ *   4 counters starting at 0xc001 each offset by 1
+ *
+ * CPUs with core performance counter extensions:
+ *   6 counters starting at 0xc0010200 each offset by 2
+ */
+static inline int amd_pmu_addr_offset(int index)
+{
+   int offset;
+
+   if (!index)
+   return index;
+
+   offset = addr_offsets[index];
+
+   if (offset)
+   return offset;
+
+   if (!cpu_has_perfctr_core) {
+   offset = index;
+   } else {
+   offset = index << 1;
+   }
+
+   addr_offsets[index] = offset;
+
+   return offset;
+}
+
 static int amd_pmu_hw_config(struct perf_event *event)
 {
int ret;
@@ -570,6 +605,7 @@ static __initconst const struct x86_pmu amd_pmu = {
.schedule_events= x86_schedule_events,
.eventsel   = MSR_K7_EVNTSEL0,
.perfctr= MSR_K7_PERFCTR0,
+   .addr_offset= amd_pmu_addr_offset,
.event_map  = amd_pmu_event_map,
.max_events = ARRAY_SIZE(amd_perfmon_event_map),
.num_counters   = AMD64_NUM_COUNTERS,
-- 
1.7.9.5


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 2/4] perf, amd: Refactor northbridge event constraints handler for code sharing

2012-11-09 Thread Jacob Shin
Break out and generalize the family 10h northbridge event constraints code
so that later we can reuse the same code path with other AMD processor
families that have the same northbridge event constraints.

Based on previous patch by Robert Richter 

Signed-off-by: Jacob Shin 
Signed-off-by: Robert Richter 
---
 arch/x86/kernel/cpu/perf_event_amd.c |   43 --
 1 file changed, 25 insertions(+), 18 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_amd.c 
b/arch/x86/kernel/cpu/perf_event_amd.c
index d60c5c7..d17debd 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -188,20 +188,13 @@ static inline int amd_has_nb(struct cpu_hw_events *cpuc)
return nb && nb->nb_id != -1;
 }
 
-static void amd_put_event_constraints(struct cpu_hw_events *cpuc,
- struct perf_event *event)
+static void __amd_put_nb_event_constraints(struct cpu_hw_events *cpuc,
+  struct perf_event *event)
 {
-   struct hw_perf_event *hwc = &event->hw;
struct amd_nb *nb = cpuc->amd_nb;
int i;
 
/*
-* only care about NB events
-*/
-   if (!(amd_has_nb(cpuc) && amd_is_nb_event(hwc)))
-   return;
-
-   /*
 * need to scan whole list because event may not have
 * been assigned during scheduling
 *
@@ -247,12 +240,13 @@ static void amd_put_event_constraints(struct 
cpu_hw_events *cpuc,
   *
   * Given that resources are allocated (cmpxchg), they must be
   * eventually freed for others to use. This is accomplished by
-  * calling amd_put_event_constraints().
+  * calling __amd_put_nb_event_constraints()
   *
   * Non NB events are not impacted by this restriction.
   */
 static struct event_constraint *
-amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
+__amd_get_nb_event_constraints(struct cpu_hw_events *cpuc, struct perf_event 
*event,
+  struct event_constraint *c)
 {
struct hw_perf_event *hwc = &event->hw;
struct amd_nb *nb = cpuc->amd_nb;
@@ -260,12 +254,6 @@ amd_get_event_constraints(struct cpu_hw_events *cpuc, 
struct perf_event *event)
int idx, new = -1;
 
/*
-* if not NB event or no NB, then no constraints
-*/
-   if (!(amd_has_nb(cpuc) && amd_is_nb_event(hwc)))
-   return &unconstrained;
-
-   /*
 * detect if already present, if so reuse
 *
 * cannot merge with actual allocation
@@ -275,7 +263,7 @@ amd_get_event_constraints(struct cpu_hw_events *cpuc, 
struct perf_event *event)
 * because of successive calls to x86_schedule_events() from
 * hw_perf_group_sched_in() without hw_perf_enable()
 */
-   for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+   for_each_set_bit(idx, c->idxmsk, X86_PMC_IDX_MAX) {
if (new == -1 || hwc->idx == idx)
/* assign free slot, prefer hwc->idx */
old = cmpxchg(nb->owners + idx, NULL, event);
@@ -391,6 +379,25 @@ static void amd_pmu_cpu_dead(int cpu)
}
 }
 
+static struct event_constraint *
+amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
+{
+   /*
+* if not NB event or no NB, then no constraints
+*/
+   if (!(amd_has_nb(cpuc) && amd_is_nb_event(&event->hw)))
+   return &unconstrained;
+
+   return __amd_get_nb_event_constraints(cpuc, event, &unconstrained);
+}
+
+static void amd_put_event_constraints(struct cpu_hw_events *cpuc,
+ struct perf_event *event)
+{
+   if (amd_has_nb(cpuc) && amd_is_nb_event(&event->hw))
+   __amd_put_nb_event_constraints(cpuc, event);
+}
+
 PMU_FORMAT_ATTR(event, "config:0-7,32-35");
 PMU_FORMAT_ATTR(umask, "config:8-15"   );
 PMU_FORMAT_ATTR(edge,  "config:18" );
-- 
1.7.9.5


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 4/4] perf, amd: Enable northbridge performance counters on AMD family 15h

2012-11-09 Thread Jacob Shin
On AMD family 15h processors, there are 4 new performance counters
(in addition to 6 core performance counters) that can be used for
counting northbridge events (e.g. DRAM accesses). Their bit fields are
almost identical to those of the core performance counters. However, the same
set of MSRs is shared between multiple cores (that share the same
northbridge). We will reuse the same code path as the existing family 10h
northbridge event constraints handler logic to enforce this sharing.

Based on previous patch by Robert Richter 

Signed-off-by: Jacob Shin 
Signed-off-by: Robert Richter 
---
 arch/x86/include/asm/cpufeature.h|2 +
 arch/x86/include/asm/msr-index.h |2 +
 arch/x86/include/asm/perf_event.h|6 ++
 arch/x86/kernel/cpu/perf_event_amd.c |  142 ++
 4 files changed, 120 insertions(+), 32 deletions(-)

diff --git a/arch/x86/include/asm/cpufeature.h 
b/arch/x86/include/asm/cpufeature.h
index 8c297aa..17f75b8 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -167,6 +167,7 @@
 #define X86_FEATURE_TBM(6*32+21) /* trailing bit manipulations 
*/
 #define X86_FEATURE_TOPOEXT(6*32+22) /* topology extensions CPUID leafs */
 #define X86_FEATURE_PERFCTR_CORE (6*32+23) /* core performance counter 
extensions */
+#define X86_FEATURE_PERFCTR_NB (6*32+24) /* nb performance counter extensions 
*/
 
 /*
  * Auxiliary flags: Linux defined - For features scattered in various
@@ -308,6 +309,7 @@ extern const char * const x86_power_flags[32];
 #define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR)
 #define cpu_has_pclmulqdq  boot_cpu_has(X86_FEATURE_PCLMULQDQ)
 #define cpu_has_perfctr_core   boot_cpu_has(X86_FEATURE_PERFCTR_CORE)
+#define cpu_has_perfctr_nb boot_cpu_has(X86_FEATURE_PERFCTR_NB)
 #define cpu_has_cx8boot_cpu_has(X86_FEATURE_CX8)
 #define cpu_has_cx16   boot_cpu_has(X86_FEATURE_CX16)
 #define cpu_has_eager_fpu  boot_cpu_has(X86_FEATURE_EAGER_FPU)
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 7f0edce..e67ff1e 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -157,6 +157,8 @@
 /* Fam 15h MSRs */
 #define MSR_F15H_PERF_CTL  0xc0010200
 #define MSR_F15H_PERF_CTR  0xc0010201
+#define MSR_F15H_NB_PERF_CTL   0xc0010240
+#define MSR_F15H_NB_PERF_CTR   0xc0010241
 
 /* Fam 10h MSRs */
 #define MSR_FAM10H_MMIO_CONF_BASE  0xc0010058
diff --git a/arch/x86/include/asm/perf_event.h 
b/arch/x86/include/asm/perf_event.h
index 4fabcdf..75e039c 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -29,6 +29,8 @@
 #define ARCH_PERFMON_EVENTSEL_INV  (1ULL << 23)
 #define ARCH_PERFMON_EVENTSEL_CMASK0xFF00ULL
 
+#define AMD_PERFMON_EVENTSEL_INT_CORE_ENABLE   (1ULL << 36)
+#define AMD_PERFMON_EVENTSEL_INT_CORE_SEL_MASK (0x0FULL << 37)
 #define AMD_PERFMON_EVENTSEL_GUESTONLY (1ULL << 40)
 #define AMD_PERFMON_EVENTSEL_HOSTONLY  (1ULL << 41)
 
@@ -46,8 +48,12 @@
 #define AMD64_RAW_EVENT_MASK   \
(X86_RAW_EVENT_MASK  |  \
 AMD64_EVENTSEL_EVENT)
+#define AMD64_NB_EVENT_MASK\
+   (AMD64_EVENTSEL_EVENT|  \
+ARCH_PERFMON_EVENTSEL_UMASK)
 #define AMD64_NUM_COUNTERS 4
 #define AMD64_NUM_COUNTERS_CORE6
+#define AMD64_NUM_COUNTERS_NB  4
 
 #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL  0x3c
 #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK(0x00 << 8)
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c 
b/arch/x86/kernel/cpu/perf_event_amd.c
index 078beb5..adf4026 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -143,10 +143,15 @@ static unsigned int addr_offsets[X86_PMC_IDX_MAX] 
__read_mostly;
  *
  * CPUs with core performance counter extensions:
  *   6 counters starting at 0xc0010200 each offset by 2
+ *
+ * CPUs with north bridge performance counter extensions:
+ *   4 additional counters starting at 0xc0010240 each offset by 2
+ *   (indexed right above either one of the above core counters)
  */
 static inline int amd_pmu_addr_offset(int index)
 {
int offset;
+   int ncore;
 
if (!index)
return index;
@@ -158,8 +163,17 @@ static inline int amd_pmu_addr_offset(int index)
 
if (!cpu_has_perfctr_core) {
offset = index;
+   ncore = AMD64_NUM_COUNTERS;
} else {
offset = index << 1;
+   ncore = AMD64_NUM_COUNTERS_CORE;
+   }
+
+   /* find offset of NB counters with respect to x86_pmu.eventsel */
+   if (cpu_has_perfctr_nb) {
+   if (index >= nc

Re: [PATCH 0/4] perf, amd: Enable AMD family 15h northbridge counters

2012-11-11 Thread Jacob Shin
On Sat, Nov 10, 2012 at 12:50:27PM +0100, Robert Richter wrote:
> On 09.11.12 19:01:34, Jacob Shin wrote:
> > The following patchset enables 4 additional performance counters in
> > AMD family 15h processors that count northbridge events -- such as
> > DRAM accesses.
> > 
> > This patchset is based on previous work done by Robert Richter
> >  :
> > 
> > https://lkml.org/lkml/2012/6/19/324
> 
> The original patch set of this is here (a rebased version):
> 
>  
> http://git.kernel.org/?p=linux/kernel/git/rric/oprofile.git;a=shortlog;h=refs/heads/perf-nb
> 
> This code was tested in detail.
> 
> > The main differences are:
> > 
> > - The northbridge counters are indexed contiguously right above the
> >   core performance counters.
> > 
> > - MSR address offset calculations are moved to architecture specific
> >   files.
> > 
> > - Interrupts are set up to be delivered only to a single core.
> 
> So I rather suggest to make delta patches on top of my patches.

Okay, if we have to, I can rework my patches on top of that, as long
as the end result looks something like what I'm suggesting above. Because
in an upcoming processor family there are no core performance counter
extensions, but we do have northbridge performance counters. Meaning
the counter address base would be c001 and the northbridge counters
live at c0010240, being 0x240 apart; we could make counter masks work,
but that means testing an awful lot of 0's for every address offset
calculation.
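
To put rough numbers on that concern, here is a purely hypothetical
sketch (not from any posted patch; the family, helper name and NB
counter spacing are assumptions) of what the offset calculation would
look like on a part with no core counter extensions but with
northbridge counters 0x240 above the legacy base:

static inline int hypothetical_addr_offset(int index)
{
	if (index < 4)		/* 4 legacy core counters, 1 MSR apart */
		return index;

	/* first NB counter sits 0x240 above the legacy base, pairs 2 apart */
	return 0x240 + ((index - 4) << 1);
}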

> 
> Peter's main concerns were that my patch set is not in the
> Intel-uncore style. I started reworking this but was not able to
> finish my work. This concerns still exist.

Right, I considered this too, and still, I agree with you Robert that
it makes more sense to just extend AMD's x86 PMU.

1. Because the hardware interface -- the register bit fields -- is almost
   identical

2. Because the interrupt delivery mechanism is also identical --
   delivered via the same APIC interrupt vector.

I think my proposed patchset on top of the current Linus tree is pretty
minimal, and it is isolated to AMD, so it should be easier to swallow.

Peter, could you take a look at the patchset and let me know if you still
prefer an Intel uncore-like implementation?

> 
> Due to the current situation I would rather prefer to create a
> tip:perf/amd-nb branch that includes my patches and then add all
> further necessary steps for mainline acceptance on top of it.

Okay, Peter, let me know if this is the route to go, and I'll generate
my patchset on top of that.

Thanks,

-Jacob

> 
> Thanks,
> 
> -Robert
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 2/2] MCE, AMD: MCE decoding support for AMD Family 16h

2012-12-18 Thread Jacob Shin
On Tue, Dec 18, 2012 at 06:19:15PM +0100, Borislav Petkov wrote:
> On Mon, Dec 17, 2012 at 01:39:48PM -0600, Jacob Shin wrote:
> > Add MCE decoding logic for AMD Family 16h processors.
> > 
> > Signed-off-by: Jacob Shin 
> > ---
> >  drivers/edac/mce_amd.c |  120 
> > ++--
> >  drivers/edac/mce_amd.h |6 +++
> >  2 files changed, 122 insertions(+), 4 deletions(-)
> > 
> > diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c
> > index 84320f9..7d2d037 100644
> > --- a/drivers/edac/mce_amd.c
> > +++ b/drivers/edac/mce_amd.c
> > @@ -64,6 +64,10 @@ EXPORT_SYMBOL_GPL(to_msgs);
> >  const char * const ii_msgs[] = { "MEM", "RESV", "IO", "GEN" };
> >  EXPORT_SYMBOL_GPL(ii_msgs);
> >  
> > +/* internal error type */
> > +const char * const uu_msgs[] = { "RESV", "RESV", "HWA", "RESV" };
> > +EXPORT_SYMBOL_GPL(uu_msgs);
> 
> Seems like those aren't used anywhere?
> 
> >  static const char * const f15h_mc1_mce_desc[] = {
> > "UC during a demand linefill from L2",
> > "Parity error during data load from IC",
> > @@ -275,6 +279,23 @@ static bool f15h_mc0_mce(u16 ec, u8 xec)
> > return ret;
> >  }
> >  
> > +static bool f16h_mc0_mce(u16 ec, u8 xec)
> > +{
> > +   u8 r4 = R4(ec);
> > +
> > +   if (MEM_ERROR(ec) && TT(ec) == TT_DATA && LL(ec) == LL_L1 &&
> > +   (r4 == R4_DRD || r4 == R4_DWR)) {
> > +
> > +   pr_cont("%s parity error due to %s.\n",
> > +   (xec == 0x0? "Data" : "Tag"),
> > +   (r4  == R4_DRD ? "load" : "store"));
> > +
> > +   return true;
> > +   }
> > +
> > +   return f14h_mc0_mce(ec, xec);
> 
> Looks like this could be merged with f14h_mc0_mce no? You can call the
> function then cat_mc0_mce (for all the *cat cores) and assign it to
> fam_ops->mc0_mce in the f14h and f16h case.

Okay

> 
> > +}
> > +
> >  static void decode_mc0_mce(struct mce *m)
> >  {
> > u16 ec = EC(m->status);
> > @@ -379,6 +400,36 @@ static bool f15h_mc1_mce(u16 ec, u8 xec)
> > return ret;
> >  }
> >  
> > +static bool f16h_mc1_mce(u16 ec, u8 xec)
> > +{
> > +   u8 r4= R4(ec);
> > +   bool ret = true;
> > +
> > +   if (MEM_ERROR(ec)) {
> > +   if (TT(ec) != TT_INSTR)
> > +   ret = false;
> > +
> > +   else if (r4 == R4_IRD)
> > +   pr_cont("%s array parity error for a tag hit.\n",
> > +   (xec == 0x0 ? "Data" : "Tag"));
> > +
> > +   else if (r4 == R4_SNOOP)
> > +   pr_cont("Tag error during snoop/victimization.\n");
> > +
> > +   else if (xec == 0x0)
> > +   pr_cont("Tag parity error from victim castout.\n");
> > +
> > +   else if (xec == 0x2)
> > +   pr_cont("Microcode patch RAM parity error.\n");
> 
> Also no need for a family-special function - just rename f14h_mc1_mce
> to cat_mc1_mce() as above and add a special case like this as the last
> else-branch of the if conditional there:

Okay

> 
> + if (boot_cpu_data.x86 == 0x16) {
> + if (LL(ec) == LL_LG && xec == 2)
> + pr_cont("Microcode patch RAM parity error.\n");
> + else
> + pr_cont("IC Tag parity error from victim 
> castout.\n");
> + return true;
> + }
> 
> > +
> > +   else
> > +   ret = false;
> > +   } else
> > +   ret = false;
> > +
> > +   return ret;
> > +}
> > +
> >  static void decode_mc1_mce(struct mce *m)
> >  {
> > u16 ec = EC(m->status);
> > @@ -469,6 +520,48 @@ static bool f15h_mc2_mce(u16 ec, u8 xec)
> > return ret;
> >  }
> >  
> > +static bool f16h_mc2_mce(u16 ec, u8 xec)
> > +{
> > +   u8 r4= R4(ec);
> > +   bool ret = true;
> > +
> > +   if (MEM_ERROR(ec) && TT(ec) == TT_GEN && LL(ec) == LL_L2) {
> 
> You can exit early here:
> 
>   if (!MEM_ERROR(ec))
>   return false;
> 
> Also, no need to test for TT and LL - we're relying on th

Re: [PATCH 2/2] MCE, AMD: MCE decoding support for AMD Family 16h

2012-12-18 Thread Jacob Shin
On Tue, Dec 18, 2012 at 11:30:24AM -0600, Jacob Shin wrote:
> On Tue, Dec 18, 2012 at 06:19:15PM +0100, Borislav Petkov wrote:
> > On Mon, Dec 17, 2012 at 01:39:48PM -0600, Jacob Shin wrote:

> > > +/* internal error type */
> > > +const char * const uu_msgs[] = { "RESV", "RESV", "HWA", "RESV" };
> > > +EXPORT_SYMBOL_GPL(uu_msgs);
> > 
> > Seems like those aren't used anywhere?

> > > @@ -634,6 +727,10 @@ static void decode_mc6_mce(struct mce *m)
> > >  
> > >  static inline void amd_decode_err_code(u16 ec)
> > >  {
> > > + if (INT_ERROR(ec)) {
> > > + pr_emerg(HW_ERR "internal: %s\n", LL_MSG(ec));
> > > + return;
> > > + }
> > 
> > Is this correct? I'm just confirming because I don't have the internal
> > info anymore.
> > 
> > Uuh, hold on, maybe those otherwise unused uu_msgs above were meant to
> > be used here instead of the LL_MSG? IOW,
> 
> > 
> > pr_emerg(HW_ERR "internal: %s\n", UU_MSG(ec));
> > 
> > Right?
> 
> Ah, yes that's right, sorry about the typo. It looks like:
> 
> Error Code    Error Code Type          Description
>  01UU         Internal Unclassified    UU = Internal Error Type
> 
> And the UU encoding is as is in the mce_amd.h file

I think I meant to say as in uu_msgs[] above. HWA stands for
"Hardware Assertion" ..

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 1/2] MCE, AMD: Make MC2 decoding part of amd_decoder_ops as well

2012-12-18 Thread Jacob Shin
Currently only AMD Family 15h processors have special handling for MC2
errors. Since the upcoming Family 16h will also need unique handling,
let's make MC2 handling part of amd_decoder_ops.

Signed-off-by: Jacob Shin 
---
 drivers/edac/mce_amd.c |   56 +---
 drivers/edac/mce_amd.h |1 +
 2 files changed, 30 insertions(+), 27 deletions(-)

diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c
index ad63757..e4752be 100644
--- a/drivers/edac/mce_amd.c
+++ b/drivers/edac/mce_amd.c
@@ -399,12 +399,9 @@ static void decode_mc1_mce(struct mce *m)
pr_emerg(HW_ERR "Corrupted MC1 MCE info?\n");
 }
 
-static void decode_mc2_mce(struct mce *m)
+static bool k8_mc2_mce(u16 ec, u8 xec)
 {
-   u16 ec = EC(m->status);
-   u8 xec = XEC(m->status, xec_mask);
-
-   pr_emerg(HW_ERR "MC2 Error");
+   bool ret = true;
 
if (xec == 0x1)
pr_cont(" in the write data buffers.\n");
@@ -429,24 +426,18 @@ static void decode_mc2_mce(struct mce *m)
pr_cont(": %s parity/ECC error during data "
"access from L2.\n", R4_MSG(ec));
else
-   goto wrong_mc2_mce;
+   ret = false;
} else
-   goto wrong_mc2_mce;
+   ret = false;
} else
-   goto wrong_mc2_mce;
-
-   return;
+   ret = false;
 
- wrong_mc2_mce:
-   pr_emerg(HW_ERR "Corrupted MC2 MCE info?\n");
+   return ret;
 }
 
-static void decode_f15_mc2_mce(struct mce *m)
+static bool f15h_mc2_mce(u16 ec, u8 xec)
 {
-   u16 ec = EC(m->status);
-   u8 xec = XEC(m->status, xec_mask);
-
-   pr_emerg(HW_ERR "MC2 Error: ");
+   bool ret = true;
 
if (TLB_ERROR(ec)) {
if (xec == 0x0)
@@ -454,10 +445,10 @@ static void decode_f15_mc2_mce(struct mce *m)
else if (xec == 0x1)
pr_cont("Poison data provided for TLB fill.\n");
else
-   goto wrong_f15_mc2_mce;
+   ret = false;
} else if (BUS_ERROR(ec)) {
if (xec > 2)
-   goto wrong_f15_mc2_mce;
+   ret = false;
 
pr_cont("Error during attempted NB data read.\n");
} else if (MEM_ERROR(ec)) {
@@ -471,14 +462,22 @@ static void decode_f15_mc2_mce(struct mce *m)
break;
 
default:
-   goto wrong_f15_mc2_mce;
+   ret = false;
}
}
 
-   return;
+   return ret;
+}
+
+static void decode_mc2_mce(struct mce *m)
+{
+   u16 ec = EC(m->status);
+   u8 xec = XEC(m->status, xec_mask);
 
- wrong_f15_mc2_mce:
-   pr_emerg(HW_ERR "Corrupted MC2 MCE info?\n");
+   pr_emerg(HW_ERR "MC2 Error: ");
+
+   if (!fam_ops->mc2_mce(ec, xec))
+   pr_cont(HW_ERR "Corrupted MC2 MCE info?\n");
 }
 
 static void decode_mc3_mce(struct mce *m)
@@ -702,10 +701,7 @@ int amd_decode_mce(struct notifier_block *nb, unsigned 
long val, void *data)
break;
 
case 2:
-   if (c->x86 == 0x15)
-   decode_f15_mc2_mce(m);
-   else
-   decode_mc2_mce(m);
+   decode_mc2_mce(m);
break;
 
case 3:
@@ -783,33 +779,39 @@ static int __init mce_amd_init(void)
case 0xf:
fam_ops->mc0_mce = k8_mc0_mce;
fam_ops->mc1_mce = k8_mc1_mce;
+   fam_ops->mc2_mce = k8_mc2_mce;
break;
 
case 0x10:
fam_ops->mc0_mce = f10h_mc0_mce;
fam_ops->mc1_mce = k8_mc1_mce;
+   fam_ops->mc2_mce = k8_mc2_mce;
break;
 
case 0x11:
fam_ops->mc0_mce = k8_mc0_mce;
fam_ops->mc1_mce = k8_mc1_mce;
+   fam_ops->mc2_mce = k8_mc2_mce;
break;
 
case 0x12:
fam_ops->mc0_mce = f12h_mc0_mce;
fam_ops->mc1_mce = k8_mc1_mce;
+   fam_ops->mc2_mce = k8_mc2_mce;
break;
 
case 0x14:
nb_err_cpumask  = 0x3;
fam_ops->mc0_mce = f14h_mc0_mce;
fam_ops->mc1_mce = f14h_mc1_mce;
+   fam_ops->mc2_mce = k8_mc2_mce;
break;
 
case 0x15:
xec_mask = 0x1f;
fam_ops->mc0_mce = f15h_mc0_mce;
fam_ops->mc1_mce = f15h_mc1_mce;
+   fam_ops->mc2_mce = f15h_mc2_mce;
break;
 
default:
diff --git a/drivers/edac/mce_amd.h 

[PATCH V2 0/2] MCE, AMD: MCE decoding support for AMD Family 16h

2012-12-18 Thread Jacob Shin
The following patchset enables MCE decoding support for AMD Family 16h
processors.

Changes in V2:
* Changed if/else style and pr_cont usage per feedback from:
  https://lkml.org/lkml/2012/12/17/365

* Merged f14h and f16h mc0 and mc2 decoding into common function per
  feedback from: https://lkml.org/lkml/2012/12/18/269

* Fixed typo in INT_ERROR decoding.

Jacob Shin (2):
  MCE, AMD: Make MC2 decoding part of amd_decoder_ops as well
  MCE, AMD: MCE decoding support for AMD Family 16h

 drivers/edac/mce_amd.c |  139 +++-
 drivers/edac/mce_amd.h |4 ++
 2 files changed, 105 insertions(+), 38 deletions(-)

-- 
1.7.9.5


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 2/2] MCE, AMD: MCE decoding support for AMD Family 16h

2012-12-18 Thread Jacob Shin
Add MCE decoding logic for AMD Family 16h processors.

Signed-off-by: Jacob Shin 
---
 drivers/edac/mce_amd.c |   83 +---
 drivers/edac/mce_amd.h |3 ++
 2 files changed, 75 insertions(+), 11 deletions(-)

diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c
index e4752be..ce4c8b7 100644
--- a/drivers/edac/mce_amd.c
+++ b/drivers/edac/mce_amd.c
@@ -64,6 +64,10 @@ EXPORT_SYMBOL_GPL(to_msgs);
 const char * const ii_msgs[] = { "MEM", "RESV", "IO", "GEN" };
 EXPORT_SYMBOL_GPL(ii_msgs);
 
+/* internal error type */
+const char * const uu_msgs[] = { "RESV", "RESV", "HWA", "RESV" };
+EXPORT_SYMBOL_GPL(uu_msgs);
+
 static const char * const f15h_mc1_mce_desc[] = {
"UC during a demand linefill from L2",
"Parity error during data load from IC",
@@ -176,7 +180,7 @@ static bool k8_mc0_mce(u16 ec, u8 xec)
return f10h_mc0_mce(ec, xec);
 }
 
-static bool f14h_mc0_mce(u16 ec, u8 xec)
+static bool cat_mc0_mce(u16 ec, u8 xec)
 {
u8 r4= R4(ec);
bool ret = true;
@@ -330,22 +334,27 @@ static bool k8_mc1_mce(u16 ec, u8 xec)
return ret;
 }
 
-static bool f14h_mc1_mce(u16 ec, u8 xec)
+static bool cat_mc1_mce(u16 ec, u8 xec)
 {
u8 r4= R4(ec);
bool ret = true;
 
if (MEM_ERROR(ec)) {
-   if (TT(ec) != 0 || LL(ec) != 1)
+   if (TT(ec) != TT_INSTR)
ret = false;
-
-   if (r4 == R4_IRD)
+   else if (r4 == R4_IRD)
pr_cont("Data/tag array parity error for a tag hit.\n");
else if (r4 == R4_SNOOP)
pr_cont("Tag error during snoop/victimization.\n");
+   else if (xec == 0x0)
+   pr_cont("Tag parity error from victim castout.\n");
+   else if (xec == 0x2)
+   pr_cont("Microcode patch RAM parity error.\n");
else
ret = false;
-   }
+   } else
+   ret = false;
+
return ret;
 }
 
@@ -469,6 +478,47 @@ static bool f15h_mc2_mce(u16 ec, u8 xec)
return ret;
 }
 
+static bool f16h_mc2_mce(u16 ec, u8 xec)
+{
+   u8 r4 = R4(ec);
+
+   if (!MEM_ERROR(ec))
+   return false;
+
+   switch (xec) {
+   case 0x04 ... 0x05:
+   pr_cont("%cBUFF parity error.\n", (r4 == R4_RD) ? 'I' : 'O');
+   break;
+
+   case 0x09 ... 0x0b:
+   case 0x0d ... 0x0f:
+   pr_cont("ECC error in L2 tag (%s).\n",
+   ((r4 == R4_GEN)   ? "BankReq" :
+   ((r4 == R4_SNOOP) ? "Prb" : "Fill")));
+   break;
+
+   case 0x10 ... 0x19:
+   case 0x1b:
+   pr_cont("ECC error in L2 data array (%s).\n",
+   (((r4 == R4_RD) && !(xec & 0x3)) ? "Hit"  :
+   ((r4 == R4_GEN)   ? "Attr" :
+   ((r4 == R4_EVICT) ? "Vict" : "Fill"))));
+   break;
+
+   case 0x1c ... 0x1d:
+   case 0x1f:
+   pr_cont("Parity error in L2 attribute bits (%s).\n",
+   ((r4 == R4_RD)  ? "Hit"  :
+   ((r4 == R4_GEN) ? "Attr" : "Fill")));
+   break;
+
+   default:
+   return false;
+   }
+
+   return true;
+}
+
 static void decode_mc2_mce(struct mce *m)
 {
u16 ec = EC(m->status);
@@ -546,7 +596,7 @@ static void decode_mc4_mce(struct mce *m)
return;
 
case 0x19:
-   if (boot_cpu_data.x86 == 0x15)
+   if (boot_cpu_data.x86 == 0x15 || boot_cpu_data.x86 == 0x16)
pr_cont("Compute Unit Data Error.\n");
else
goto wrong_mc4_mce;
@@ -632,6 +682,10 @@ static void decode_mc6_mce(struct mce *m)
 
 static inline void amd_decode_err_code(u16 ec)
 {
+   if (INT_ERROR(ec)) {
+   pr_emerg(HW_ERR "internal: %s\n", UU_MSG(ec));
+   return;
+   }
 
pr_emerg(HW_ERR "cache level: %s", LL_MSG(ec));
 
@@ -736,7 +790,7 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long 
val, void *data)
((m->status & MCI_STATUS_PCC)   ? "PCC"   : "-"),
((m->status & MCI_STATUS_ADDRV) ? "AddrV" : "-"));
 
-   if (c->x86 == 0x15)
+   if (c->x86 == 0x15 || c->x86 == 0x16)
pr_cont("|%s|%s",
((m->status & MCI_STATUS_DEFERRED) ? "Deferred" : "-"),
((m

Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread Jacob Shin
On Wed, Dec 19, 2012 at 09:37:51PM +0100, Borislav Petkov wrote:
> On Sat, Dec 15, 2012 at 03:17:05PM -0800, H. Peter Anvin wrote:
> > On 12/15/2012 03:15 PM, Yinghai Lu wrote:
> > >>
> > >>That is for the kernel region itself (that code is actually unchanged from
> > >>the current code), and yes, we could cap that one to _end if there are
> > >>systems which have bugs in that area.  The dynamic page tables map 1G
> > >>aligned at a time.
> > >
> > >dynamic should be 2M too.
> > >
> > >AMD system:
> > >
> > >http://git.kernel.org/?p=linux/kernel/git/tip/tip.git;a=commitdiff;h=66520ebc2df3fe52eb4792f8101fac573b766baf
> > >
> > >  BIOS-e820: [mem 0x0001-0x00e037ff] usable
> > >  BIOS-e820: [mem 0x00e03800-0x00fc] reserved
> > >  BIOS-e820: [mem 0x0100-0x011ffeff] usable
> > >
> > >the hole is not 1G aligned.
> > >
> > >or HT region is from e04000 ?
> > >
> > 
> > The HT region starts at 0xfd -- after that reserved region,
> > so I have no idea what that particular system is trying to do or
> > what its requirements are (nor what its MTRR setup is, since you
> > didn't post it.)
> 
> This is something that Jacob should be able to answer since he's been
> dealing with the 1T support.
> 
> Jacob, how is the HT hole marked on AMD? I know hazily that we do say
> "all memory regions cacheable by default if not explicitly marked" but
> we need to exclude the HT hole from that, right?
> 
> So how are we doing that, MTRRs?

The HT hole is architectural -- it is documented in the manuals somewhere --
and spans 0xfd ~ 0x100. The CPU cannot generate memory reads/writes in
that region.

On that particular system, there is 1TB of total RAM, and since
we do not want to lose memory around the HT hole, the BIOS has
programmed the DRAM controller to move the last 128 GB of memory
above the HT region. There are 8 memory nodes; the last DRAM
address of the 7th node is 0xe03800, then there is a hole, and the
last memory node starts at 1TB.

The MTRRs only cover memory under 4GB, and do not cover the HT hole.

Yinghai's mm patchset, which only direct-maps regions backed by RAM,
solves our memory hole problem around the HT area.
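
Conceptually, that approach can be pictured as the following fragment (a
sketch only, not Yinghai's actual code; it just shows the idea of
building the direct mapping per usable E820 range instead of blanket
mapping 4GB ~ top of memory):

static void __init direct_map_ram_only(void)
{
	int i;

	for (i = 0; i < e820.nr_map; i++) {
		struct e820entry *ei = &e820.map[i];

		if (ei->type != E820_RAM)
			continue;

		/* holes, like the one below the HT region, are never mapped */
		init_memory_mapping(ei->addr, ei->addr + ei->size);
	}
}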

I've tested Yinghai's patchset (several of the early versions)
successfully on the 1TB system above. I'll try the latest tip/mm2
again sometime later today, but I'm pretty sure it should be fine.

Thanks,

-Jacob

> 
> Thanks.
> 
> -- 
> Regards/Gruss,
> Boris.
> 
> Sent from a fat crate under my desk. Formatting is fine.
> --
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread Jacob Shin
On Wed, Dec 19, 2012 at 01:48:33PM -0800, H. Peter Anvin wrote:
> There are a few very serious problems we need to figure out related to 
> generalizing very early boot.  If this range gets mapped, will the CPU treat 
> it as WB?  If so, with what consequences for either the HT region or the hole 
> below it?

Hm .. I guess I need to read the whole email thread .. but if you can
explain it in short, what are the problems?

Yes, the CPU treats it as WB because the region is under TOM2, so by
default it is WB; and also, when you create the direct mapping page tables,
the PATs mark them as WB.

What we have seen is that even though the kernel never generates memory
accesses in the hole (since E820 says that it is not RAM), when the kernel
reads/writes memory near the hole, the CPU was prefetching into the
hole because the PATs say that it is WB. This resulted in an MCE because
there is no physical RAM there.

-Jacob

> 
> Jacob Shin  wrote:
> 
> >On Wed, Dec 19, 2012 at 09:37:51PM +0100, Borislav Petkov wrote:
> >> On Sat, Dec 15, 2012 at 03:17:05PM -0800, H. Peter Anvin wrote:
> >> > On 12/15/2012 03:15 PM, Yinghai Lu wrote:
> >> > >>
> >> > >>That is for the kernel region itself (that code is actually
> >unchanged from
> >> > >>the current code), and yes, we could cap that one to _end if
> >there are
> >> > >>systems which have bugs in that area.  The dynamic page tables
> >map 1G
> >> > >>aligned at a time.
> >> > >
> >> > >dynamic should be 2M too.
> >> > >
> >> > >AMD system:
> >> > >
> >> >
> >>http://git.kernel.org/?p=linux/kernel/git/tip/tip.git;a=commitdiff;h=66520ebc2df3fe52eb4792f8101fac573b766baf
> >> > >
> >> > >  BIOS-e820: [mem 0x0001-0x00e037ff] usable
> >> > >  BIOS-e820: [mem 0x00e03800-0x00fc] reserved
> >> > >  BIOS-e820: [mem 0x0100-0x011ffeff] usable
> >> > >
> >> > >the hole is not 1G aligned.
> >> > >
> >> > >or HT region is from e04000 ?
> >> > >
> >> > 
> >> > The HT region starts at 0xfd -- after that reserved region,
> >> > so I have no idea what that particular system is trying to do or
> >> > what its requirements are (nor what its MTRR setup is, since you
> >> > didn't post it.)
> >> 
> >> This is something that Jacob should be able to answer since he's been
> >> dealing with the 1T support.
> >> 
> >> Jacob, how is the HT hole marked on AMD? I know hazily that we do say
> >> "all memory regions cacheable by default if not explicitly marked"
> >but
> >> we need to exclude the HT hole from that, right?
> >> 
> >> So how are we doing that, MTRRs?
> >
> >HT hole is architectural, I guess in manuals somewhere and is:
> >0xfd ~ 0x100. CPU cannot generate memory read/write in
> >that region.
> >
> >On that above particular system, there is 1TB of total RAM, and since
> >we do not want to loose memory around the HT hole, what BIOS has done
> >is programmed the DRAM controller to move the last 128 GB of memory
> >to above the HT region. There are 8 memory nodes, the last DRAM
> >address of the 7th node is 0xe03800. Then there is a hole and the
> >first address of the last memory node starts at 1TB.
> >
> >MTRRs only cover under 4GB, and does not cover the HT hole.
> >
> >Yinghai's mm patchset to only direct map regions backed by RAM solves
> >our memory hole around HT area.
> >
> >I've tested Yinghai's patchset (several of early versions)
> >successfully on our above 1TB system. I'll try the latest tip/mm2
> >again sometime later today, but I'm pretty sure it should be fine.
> >
> >Thanks,
> >
> >-Jacob
> >
> >> 
> >> Thanks.
> >> 
> >> -- 
> >> Regards/Gruss,
> >> Boris.
> >> 
> >> Sent from a fat crate under my desk. Formatting is fine.
> >> --
> >> 
> 
> -- 
> Sent from my mobile phone. Please excuse brevity and lack of formatting.
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread Jacob Shin
On Wed, Dec 19, 2012 at 02:25:44PM -0800, H. Peter Anvin wrote:
> On 12/19/2012 02:05 PM, Jacob Shin wrote:
> >On Wed, Dec 19, 2012 at 01:48:33PM -0800, H. Peter Anvin wrote:
> >>There are a few very serious problems we need to figure out related to 
> >>generalizing very early boot.  If this range gets mapped, will the CPU 
> >>treat it as WB?  If so, with what consequences for either the HT region or 
> >>the hole below it?
> >
> >Hm .. I guess I need to read the whole email thread .. but if you can
> >explain it in short, what are the problems?
> >
> >Yes the CPU treats it as WB because the region is under TOM2, so by
> >default it is WB, and also when you create direct mapping page tables,
> >the PATs mark them as WB.
> >
> >What we have seen is that even though the kernel never generate memory
> >accesses in the hole (since E820 says that it is not RAM) when kernel
> >read/writes memory near the hole, the CPU was prefetching into the
> >hole because PATs say that it is WB. This resulted in MCE because
> >there is no physical RAM there.
> >
> 
> IOW, epic f*ckup.
> 
> The problem is that before we have awareness of the memory map, we
> need to map things in order to access them.  This is a big problem
> and right now there are ridiculous heuristics.  I have been working
> on mapping on demand, but there are concerns about the boundaries
> (i.e. what happens if the mapping spill over into a pit like this.)
> 
> This kind of stuff is really not acceptable.  A region which will
> cause malfunction if prefetched should not be WB in the MTRR system
> (I include TOM* in that.)  The real question is what we can do to
> mitigate the damage.

Well, really the problem is with any memory hole above 4GB that is too
big to be covered by variable range MTRRs as UC. Because the kernel
used to simply do init_memory_mapping for 4GB ~ top of memory,
any memory hole above 4GB is marked as WB in the PATs.

How is this handled on Intel architectures? If there are memory holes
that are too big to be covered by variable range MTRRs as UC, are
there other MTRR-like CPU registers that the BIOS programs?


Thanks,

-Jacob

> 
>   -hpa
> 
> -- 
> H. Peter Anvin, Intel Open Source Technology Center
> I work for Intel.  I don't speak on their behalf.
> 
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread Jacob Shin
On Wed, Dec 19, 2012 at 11:51:55PM +0100, Borislav Petkov wrote:
> On Wed, Dec 19, 2012 at 02:25:44PM -0800, H. Peter Anvin wrote:
> > The real question is what we can do to mitigate the damage.
> 
> Let's try the first thing that comes to mind: waste a variable MTRR on
> it:
> 
> [0.00] MTRR variable ranges enabled:
> [0.00]   0 base  mask 8000 write-back
> [0.00]   1 base 8000 mask C000 write-back
> [0.00]   2 base C000 mask F000 write-back
> [0.00]   3 base 0001 mask  write-back
> [0.00]   4 base 0002 mask E000 write-back
> [0.00]   5 base 00022000 mask F000 write-back
> [0.00]   6 disabled
> [0.00]   7 disabled
> 
> one of those last two. This is a small box though so I'm guessing on 1T
> boxes those last two won't be disabled. Jacob?

I can check, but right, they might be used up. But even if we had slots
available, the memory range that needs to be covered is at a large
enough address, and aligned in such a way, that you cannot cover it with
variable range MTRRs.

> 
> -- 
> Regards/Gruss,
> Boris.
> 
> Sent from a fat crate under my desk. Formatting is fine.
> --
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread Jacob Shin
On Thu, Dec 20, 2012 at 12:03:29AM +0100, Borislav Petkov wrote:
> On Wed, Dec 19, 2012 at 04:59:41PM -0600, Jacob Shin wrote:
> > I can check but right, they might be used up. But even if we had slots
> > available, the memory range that needs to be covered is in large
> > enough address and aligned in such a way that you cannot cover it with
> > variable range MTRRs.
> 
> Actually, if I'm not mistaken, you only need to cover the HT hole with
> one MTRR - the rest remains WB. And in order the mask bits to work, we
> could make it a little bigger - we waste some memory but that's nothing
> in comparison to the MCE.

Actually, every memory hole above 4GB and under TOM2 needs to be marked
as UC if the kernel just blanket calls init_memory_mapping from 4GB
to the top of memory.

Right, we would be losing memory, and I think that depending on the
alignment of the boundary and how many MTRRs you have available to use,
significant chunks of memory could be lost. I need to go refresh on
how variable range MTRRs are programmed; it has been a while.
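
For reference, here are the variable range MTRR semantics that matter
(generic x86 behaviour, ignoring the valid bit and the low address bits;
this is a sketch, not code from any patch): each MTRR can only describe
a power-of-two sized, size-aligned block, because membership is a pure
base/mask comparison:

/* does the variable MTRR described by (phys_base, phys_mask) claim addr? */
static inline int mtrr_range_contains(unsigned long long phys_base,
				      unsigned long long phys_mask,
				      unsigned long long addr)
{
	return (addr & phys_mask) == (phys_base & phys_mask);
}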

> 
> You might need to talk to hw guys about the feasibility of this deal
> though.
> 
> Thanks.
> 
> -- 
> Regards/Gruss,
> Boris.
> 
> Sent from a fat crate under my desk. Formatting is fine.
> --
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread Jacob Shin
On Wed, Dec 19, 2012 at 03:22:13PM -0800, H. Peter Anvin wrote:
> On 12/19/2012 03:03 PM, Borislav Petkov wrote:
> > On Wed, Dec 19, 2012 at 04:59:41PM -0600, Jacob Shin wrote:
> >> I can check but right, they might be used up. But even if we had slots
> >> available, the memory range that needs to be covered is in large
> >> enough address and aligned in such a way that you cannot cover it with
> >> variable range MTRRs.
> > 
> > Actually, if I'm not mistaken, you only need to cover the HT hole with
> > one MTRR - the rest remains WB. And in order the mask bits to work, we
> > could make it a little bigger - we waste some memory but that's nothing
> > in comparison to the MCE.
> > 
> > You might need to talk to hw guys about the feasibility of this deal
> > though.
> > 
> 
> Just make the hole a bit bigger, so it starts at 0xfc, then you
> only need one MTRR.  This is the correct BIOS-level fix, and it really
> needs to happen.
> 
> Do these systems actually exist in the field or are they engineering
> prototypes?  In the latter case, we might be done at that point.

Yes, HP is shipping (or will ship soon) such systems.

> 
> Really, though, AMD should have added a TOM3 for memory above the 1T
> mark since they should have been able to see a 1T hole coming from the
> design of HyperTransport.  This would be the correct hardware-level fix,
> but I don't expect that to happen.
> 

I'll feed this conversation back to our hardware folks, but yes we
still need to handle today's systems.

> Now, calming down a little bit, we are definitely dealing with BIOS
> engineers and so f*ckups are going to happen, again and again.  The
> question is what to do about it.
> 
> The only truly "safe" option is to limit early mappings to 4K pages.
> This is highly undesirable for a bunch of reasons.  Reducing mapping
> granularity to 2M rather than 1G (what Yinghai is proposing) does reduce
> the exposure somewhat; it would be interesting to gather trap statistics
> and try to get a feel for if this actually changes the boot time
> measurably or not.
> 
> The other bit is that building the real kernel page tables iteratively
> (ignoring the early page tables here) is safer, since the real page
> table builder is fully aware of the memory map.  This means any
> "spillover" from the early page tables gets minimized to regions where
> there are data objects that have to be accessed early.  Since Yinghai
> already had iterative page table building working, I don't see any
> reason to not use that capability.

Yes, I'll test again with the latest, but Yinghai's patchset, which maps
only RAM from the top down, solved our problem.

Thanks,

> 
> Thoughts?
> 
>   -hpa
> 
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread Jacob Shin
On Wed, Dec 19, 2012 at 03:50:14PM -0800, H. Peter Anvin wrote:
> On 12/19/2012 03:40 PM, Jacob Shin wrote:
> >>
> >>Just make the hole a bit bigger, so it starts at 0xfc, then you
> >>only need one MTRR.  This is the correct BIOS-level fix, and it really
> >>needs to happen.
> >>
> >>Do these systems actually exist in the field or are they engineering
> >>prototypes?  In the latter case, we might be done at that point.
> >
> >Yes, HP is shipping (or will ship soon) such systems.
> >
> 
> Can you get them to fix the BIOS first, or at least ship a BIOS
> update?  Otherwise there will be a probabilistic failure, and it
> sounds like it is your (AMD's) fault.
> 
> >>The other bit is that building the real kernel page tables iteratively
> >>(ignoring the early page tables here) is safer, since the real page
> >>table builder is fully aware of the memory map.  This means any
> >>"spillover" from the early page tables gets minimized to regions where
> >>there are data objects that have to be accessed early.  Since Yinghai
> >>already had iterative page table building working, I don't see any
> >>reason to not use that capability.
> >
> >Yes, I'll test again with latest, but Yinghai's patchset mapping only
> >RAM from top down solved our problem.
> 
> Please don't make me go Steve Ballmer on you.
> 
> We're talking about two different things... the early page tables
> versus the permanent page tables.  The permanent page tables we can
> handle because the page table creation at that point is aware of the
> memory map.

Ah okay,

> 
> The early page tables are what is used before we get to that point.
> Creating them on demand means that if there are no early-needed data
> structures near the hole, there will be no access and everything
> will be okay, but as the early page table creation *is not and
> cannot be* aware of the memory map.  Right now that simply cannot
> happen, because all such data structures are confined to 32-bit
> addresses, however *THAT WILL CHANGE AND WILL CHANGE SOON*, exactly
> because these kinds of large-memory system needs that to happen.
> You may start seeing failures at that time, and there isn't a huge
> lot we can do about it.
> 
> We are trying to discuss mitigation strategies with you, but you
> haven't really given us any useful information, e.g. what happens
> near the various boundaries of the hole, what could trigger
> prefeching into the range, and what it would take to fix the BIOSes.

From what I remember, accessing memory around the memory hole (not
just the HT hole, but e03800 ~ 100 on the system mentioned above)
generated prefetches because the memory hole was marked as WB in the PAT.

I'll take a look at the system again, try the blanket MTRR covering
0xe0 ~ 1TB, and talk to our BIOS guys.

> 
>   -hpa
> 
> -- 
> H. Peter Anvin, Intel Open Source Technology Center
> I work for Intel.  I don't speak on their behalf.
> 
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread Jacob Shin
On Wed, Dec 19, 2012 at 04:24:09PM -0800, H. Peter Anvin wrote:
> On 12/19/2012 04:07 PM, Jacob Shin wrote:
> > 
> > From what I remember, accessing memory around the memory hole (not
> > just the HT hole, but e03800 ~ 100 on our mentioned system
> > ) generated prefetches because the memory hole was marked as WB in PAT.
> > 
> > I'll take a look at the system again, try the blanket MTRR covering
> > 0xe0 ~ 1TB, and talk to our BIOS guys.
> > 
> 
> Yes, but do they all #MC (as opposed to, say, fetching all FFs)?

Yes, MCE every time and it was fatal.

> 
>   -hpa
> 
> 
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread Jacob Shin
On Wed, Dec 19, 2012 at 06:37:45PM -0800, H. Peter Anvin wrote:
> On 12/19/2012 04:29 PM, Jacob Shin wrote:
> > On Wed, Dec 19, 2012 at 04:24:09PM -0800, H. Peter Anvin wrote:
> >> On 12/19/2012 04:07 PM, Jacob Shin wrote:
> >>>
> >>> From what I remember, accessing memory around the memory hole (not
> >>> just the HT hole, but e03800 ~ 100 on our mentioned system
> >>> ) generated prefetches because the memory hole was marked as WB in PAT.
> >>>
> >>> I'll take a look at the system again, try the blanket MTRR covering
> >>> 0xe0 ~ 1TB, and talk to our BIOS guys.
> >>>
> >>
> >> Yes, but do they all #MC (as opposed to, say, fetching all FFs)?
> > 
> > Yes, MCE every time and it was fatal.
> > 
> 
> OK, one more question... there is something odd with the memory ranges here:
> 
>  BIOS-e820: [mem 0x0001-0x00e037ff] usable
>  BIOS-e820: [mem 0x00e03800-0x00fc] reserved
>  BIOS-e820: [mem 0x0100-0x011ffeff] usable
> 
> The first usable range here is 4G to 896G + 896M which is an awfully
> strange number.  Similarly, the second range is 1T to 1T + 128G - 16M.
> The little fiddly bits imply that there is either overshoot of some sort
> going on -- possibly reserved memory -- or these are fairly arbitrary
> sizes that don't match any physical bank sizes in which case it should
> be possible to shuffle it differently...

Not exactly sure why the weird boundaries; I'll have to ask the BIOS
side folks to be sure. But if I were to guess ..

Here is the NUMA spew out. Physically there is 128 GB connected to
each memory controller node. The PCI MMIO region starts at 0xc800.
4 GB - 0xc800 = 0x380 (896 MB). So we lose 896 MB to the PCI
MMIO hole, and the first node ends at 128 GB + 896 MB so it can reach all
128 GB off of the first memory controller -- hence the weird 896 MB
offset.

[0.00] SRAT: Node 0 PXM 0 0-a
[0.00] SRAT: Node 0 PXM 0 10-c800
[0.00] SRAT: Node 0 PXM 0 1-203800
[0.00] SRAT: Node 1 PXM 1 203800-403800
[0.00] SRAT: Node 2 PXM 2 403800-603800
[0.00] SRAT: Node 3 PXM 3 603800-803800
[0.00] SRAT: Node 4 PXM 4 803800-a03800
[0.00] SRAT: Node 5 PXM 5 a03800-c03800
[0.00] SRAT: Node 6 PXM 6 c03800-e03800
[0.00] SRAT: Node 7 PXM 7 100-11fff00
[0.00] NUMA: Initialized distance table, cnt=8
[0.00] NUMA: Node 0 [0,a) + [10,c800) -> [0,c800)
[0.00] NUMA: Node 0 [0,c800) + [1,203800) -> 
[0,203800)
[0.00] Initmem setup node 0 -00203800
[0.00]   NODE_DATA [002037ff5000 - 002037ff]
[0.00] Initmem setup node 1 00203800-00403800
[0.00]   NODE_DATA [004037ff5000 - 004037ff]
[0.00] Initmem setup node 2 00403800-00603800
[0.00]   NODE_DATA [006037ff5000 - 006037ff]
[0.00] Initmem setup node 3 00603800-00803800
[0.00]   NODE_DATA [008037ff5000 - 008037ff]
[0.00] Initmem setup node 4 00803800-00a03800
[0.00]   NODE_DATA [00a037ff5000 - 00a037ff]
[0.00] Initmem setup node 5 00a03800-00c03800
[0.00]   NODE_DATA [00c037ff5000 - 00c037ff]
[0.00] Initmem setup node 6 00c03800-00e03800
[0.00]   NODE_DATA [00e037ff2000 - 00e037ffcfff]
[0.00] Initmem setup node 7 0100-011fff00
[0.00]   NODE_DATA [011ffeff1000 - 011ffeffbfff]
[0.00] Zone PFN ranges:
[0.00]   DMA  0x0010 -> 0x1000
[0.00]   DMA320x1000 -> 0x0010
[0.00]   Normal   0x0010 -> 0x11fff000
[0.00] Movable zone start PFN for each node
[0.00] early_node_map[10] active PFN ranges
[0.00] 0: 0x0010 -> 0x0099
[0.00] 0: 0x0100 -> 0x000c7ec0
[0.00] 0: 0x0010 -> 0x02038000
[0.00] 1: 0x02038000 -> 0x04038000
[0.00] 2: 0x04038000 -> 0x06038000
[0.00] 3: 0x06038000 -> 0x08038000
[0.00] 4: 0x08038000 -> 0x0a038000
[0.00] 5: 0x0a038000 -> 0x0c038000
[0.00] 6: 0x0c038000 -> 0x0e038000
[0.00] 7: 0x1000 -> 0x11fff000
[0.00] On node 0 totalpages: 33553993
[0.00]   DMA zone: 56 pages used for memmap
[0.00]   DMA zone: 5 pages reserved
[0.00]   DMA zone: 3916 pages, LIFO batch:0
[0.00]   DMA32 zone: 14280 pages used for memmap
[0.00

Re: AMD microcode site (amd64.org) down for a while now

2012-12-19 Thread Jacob Shin
On Wed, Dec 19, 2012 at 11:11:09PM -0200, Henrique de Moraes Holschuh wrote:
> Jacob,
> 
> Since you seem to be dealing with AMD microcode, do you know anything about
> the amd64.org demise?  Where do we get the microcode update data, now?

Hi,

Yes, the server amd64.org was hosted on is down at the moment, sorry
about that. We are hoping to get it back up soon. But eventually we
are planning to push the microcode patch update files to the kernel.org
linux-firmware git repository.

Thanks,

-Jacob

> 
> -- 
>   "One disk to rule them all, One disk to find them. One disk to bring
>   them all and in the darkness grind them. In the Land of Redmond
>   where the shadows lie." -- The Silicon Valley Tarot
>   Henrique Holschuh
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 3/4] perf, x86: Move MSR address offset calculation to architecture specific files

2012-11-15 Thread Jacob Shin
Move the counter-index-to-MSR-address-offset calculation into architecture
specific files. This prepares the way for perf_event_amd to enable
counter addresses that are not contiguous -- for example AMD Family
15h processors have 6 core performance counters starting at 0xc0010200
and 4 northbridge performance counters starting at 0xc0010240.

Signed-off-by: Jacob Shin 
---
 arch/x86/kernel/cpu/perf_event.h |   21 +---
 arch/x86/kernel/cpu/perf_event_amd.c |   35 ++
 2 files changed, 40 insertions(+), 16 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 271d257..aacf025 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -325,6 +325,7 @@ struct x86_pmu {
int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign);
unsignedeventsel;
unsignedperfctr;
+   int (*addr_offset)(int index);
u64 (*event_map)(int);
int max_events;
int num_counters;
@@ -444,28 +445,16 @@ extern u64 __read_mostly hw_cache_extra_regs
 
 u64 x86_perf_event_update(struct perf_event *event);
 
-static inline int x86_pmu_addr_offset(int index)
-{
-   int offset;
-
-   /* offset = X86_FEATURE_PERFCTR_CORE ? index << 1 : index */
-   alternative_io(ASM_NOP2,
-  "shll $1, %%eax",
-  X86_FEATURE_PERFCTR_CORE,
-  "=a" (offset),
-  "a"  (index));
-
-   return offset;
-}
-
 static inline unsigned int x86_pmu_config_addr(int index)
 {
-   return x86_pmu.eventsel + x86_pmu_addr_offset(index);
+   return x86_pmu.eventsel +
+   (x86_pmu.addr_offset ? x86_pmu.addr_offset(index) : index);
 }
 
 static inline unsigned int x86_pmu_event_addr(int index)
 {
-   return x86_pmu.perfctr + x86_pmu_addr_offset(index);
+   return x86_pmu.perfctr +
+   (x86_pmu.addr_offset ? x86_pmu.addr_offset(index) : index);
 }
 
 int x86_setup_perfctr(struct perf_event *event);
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index 04ef43f..d6e3337 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -132,6 +132,40 @@ static u64 amd_pmu_event_map(int hw_event)
return amd_perfmon_event_map[hw_event];
 }
 
+/*
+ * Previously calculated offsets
+ */
+static unsigned int addr_offsets[X86_PMC_IDX_MAX] __read_mostly;
+
+/*
+ * Legacy CPUs:
+ *   4 counters starting at 0xc0010000 each offset by 1
+ *
+ * CPUs with core performance counter extensions:
+ *   6 counters starting at 0xc0010200 each offset by 2
+ */
+static inline int amd_pmu_addr_offset(int index)
+{
+   int offset;
+
+   if (!index)
+   return index;
+
+   offset = addr_offsets[index];
+
+   if (offset)
+   return offset;
+
+   if (!cpu_has_perfctr_core)
+   offset = index;
+   else
+   offset = index << 1;
+
+   addr_offsets[index] = offset;
+
+   return offset;
+}
+
 static int amd_pmu_hw_config(struct perf_event *event)
 {
int ret;
@@ -570,6 +604,7 @@ static __initconst const struct x86_pmu amd_pmu = {
.schedule_events= x86_schedule_events,
.eventsel   = MSR_K7_EVNTSEL0,
.perfctr= MSR_K7_PERFCTR0,
+   .addr_offset= amd_pmu_addr_offset,
.event_map  = amd_pmu_event_map,
.max_events = ARRAY_SIZE(amd_perfmon_event_map),
.num_counters   = AMD64_NUM_COUNTERS,
-- 
1.7.9.5


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 1/4] perf, amd: Rework northbridge event constraints handler

2012-11-15 Thread Jacob Shin
From: Robert Richter 

Code simplification. No functional changes.

Signed-off-by: Robert Richter 
Signed-off-by: Jacob Shin 
---
 arch/x86/kernel/cpu/perf_event_amd.c |   68 +-
 1 file changed, 26 insertions(+), 42 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index 4528ae7..d60c5c7 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -256,9 +256,8 @@ amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
 {
struct hw_perf_event *hwc = &event->hw;
struct amd_nb *nb = cpuc->amd_nb;
-   struct perf_event *old = NULL;
-   int max = x86_pmu.num_counters;
-   int i, j, k = -1;
+   struct perf_event *old;
+   int idx, new = -1;
 
/*
 * if not NB event or no NB, then no constraints
@@ -276,48 +275,33 @@ amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
 * because of successive calls to x86_schedule_events() from
 * hw_perf_group_sched_in() without hw_perf_enable()
 */
-   for (i = 0; i < max; i++) {
-   /*
-* keep track of first free slot
-*/
-   if (k == -1 && !nb->owners[i])
-   k = i;
+   for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+   if (new == -1 || hwc->idx == idx)
+   /* assign free slot, prefer hwc->idx */
+   old = cmpxchg(nb->owners + idx, NULL, event);
+   else if (nb->owners[idx] == event)
+   /* event already present */
+   old = event;
+   else
+   continue;
+
+   if (old && old != event)
+   continue;
+
+   /* reassign to this slot */
+   if (new != -1)
+   cmpxchg(nb->owners + new, event, NULL);
+   new = idx;
 
/* already present, reuse */
-   if (nb->owners[i] == event)
-   goto done;
-   }
-   /*
-* not present, so grab a new slot
-* starting either at:
-*/
-   if (hwc->idx != -1) {
-   /* previous assignment */
-   i = hwc->idx;
-   } else if (k != -1) {
-   /* start from free slot found */
-   i = k;
-   } else {
-   /*
-* event not found, no slot found in
-* first pass, try again from the
-* beginning
-*/
-   i = 0;
-   }
-   j = i;
-   do {
-   old = cmpxchg(nb->owners+i, NULL, event);
-   if (!old)
+   if (old == event)
break;
-   if (++i == max)
-   i = 0;
-   } while (i != j);
-done:
-   if (!old)
-   return &nb->event_constraints[i];
-
-   return &emptyconstraint;
+   }
+
+   if (new == -1)
+   return &emptyconstraint;
+
+   return &nb->event_constraints[new];
 }
 
 static struct amd_nb *amd_alloc_nb(int cpu)
-- 
1.7.9.5
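
For readers new to this code path, the rewritten loop boils down to the
following idea: every shared northbridge counter has an owner slot, and an
event claims a slot by atomically replacing NULL with itself, preferring the
counter index it used last time. A minimal user-space sketch of that protocol
(names are illustrative, not the kernel's; GCC's __sync builtin stands in for
the kernel's cmpxchg):

#include <stdio.h>

#define NUM_NB_COUNTERS 4

struct event { int id; int idx; };             /* idx: last counter used, -1 if none */

static struct event *owners[NUM_NB_COUNTERS];  /* shared owner slot per NB counter */

/* try to claim a counter slot for 'ev'; return the slot index or -1 */
static int claim_slot(struct event *ev)
{
	struct event *old = NULL;
	int idx, new = -1;

	for (idx = 0; idx < NUM_NB_COUNTERS; idx++) {
		if (new == -1 || ev->idx == idx)
			/* grab a free slot, preferring the previous assignment */
			old = __sync_val_compare_and_swap(&owners[idx], NULL, ev);
		else if (owners[idx] == ev)
			/* we already own this slot */
			old = ev;
		else
			continue;

		if (old && old != ev)
			continue;	/* somebody else owns it, keep looking */

		/* move our claim to this slot, releasing any earlier one */
		if (new != -1)
			__sync_val_compare_and_swap(&owners[new], ev, NULL);
		new = idx;

		if (old == ev)
			break;		/* already present, reuse */
	}
	return new;
}

int main(void)
{
	struct event a = { .id = 1, .idx = -1 };
	struct event b = { .id = 2, .idx = -1 };

	printf("a -> slot %d\n", a.idx = claim_slot(&a));
	printf("b -> slot %d\n", b.idx = claim_slot(&b));
	printf("a again -> slot %d\n", claim_slot(&a));	/* reuses its slot */
	return 0;
}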


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH V2 0/4] perf, amd: Enable AMD family 15h northbridge counters

2012-11-15 Thread Jacob Shin
The following patchset enables 4 additional performance counters in
AMD family 15h processors that count northbridge events -- such as
the number of DRAM accesses.

This patchset is based on top of previous work done by Robert Richter
 :

https://lkml.org/lkml/2012/6/19/324

The main differences are:

- The northbridge counters are indexed contiguously right above the
  core performance counters (see the sketch after this list).

- MSR address offset calculations are moved to architecture specific
  files.

- Interrupts are set up to be delivered only to a single core.
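
A quick sketch of the resulting counter index space on family 15h, assuming
the MSR bases quoted in patch 3/4 (0xc0010200/0xc0010201 for the six core
counter control/count pairs, 0xc0010240/0xc0010241 for the four northbridge
pairs, each pair two MSRs apart). This only illustrates the intended layout;
it is not code from the series:

#include <stdio.h>

#define NUM_COUNTERS_CORE 6	/* core counters, indices 0..5 */
#define NUM_COUNTERS_NB   4	/* NB counters, indices 6..9 */

/* offset of counter 'index' from the 0xc0010200 base, in MSR units */
static int fam15h_addr_offset(int index)
{
	if (index >= NUM_COUNTERS_CORE)
		/* NB counters live 0x40 MSRs above the core block */
		return 0x40 + ((index - NUM_COUNTERS_CORE) << 1);
	return index << 1;
}

int main(void)
{
	for (int i = 0; i < NUM_COUNTERS_CORE + NUM_COUNTERS_NB; i++)
		printf("index %d: evsel %#x, ctr %#x\n", i,
		       0xc0010200 + fam15h_addr_offset(i),
		       0xc0010201 + fam15h_addr_offset(i));
	return 0;
}

Indices 0..5 land at 0xc0010200/0xc0010201 through 0xc001020a/0xc001020b, and
indices 6..9 at 0xc0010240/0xc0010241 through 0xc0010246/0xc0010247: a
contiguous index space over non-contiguous MSR addresses.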

V2:
Separate out Robert's patches, and add properly ordered certificates of
origin.

Jacob Shin (2):
  perf, x86: Move MSR address offset calculation to architecture
specific files
  perf, amd: Enable northbridge performance counters on AMD family 15h

Robert Richter (2):
  perf, amd: Rework northbridge event constraints handler
  perf, amd: Generalize northbridge constraints code for family 15h

 arch/x86/include/asm/cpufeature.h|2 +
 arch/x86/include/asm/msr-index.h |2 +
 arch/x86/include/asm/perf_event.h|6 +
 arch/x86/kernel/cpu/perf_event.h |   21 +--
 arch/x86/kernel/cpu/perf_event_amd.c |  246 --
 5 files changed, 187 insertions(+), 90 deletions(-)

-- 
1.7.9.5


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 2/4] perf, amd: Generalize northbridge constraints code for family 15h

2012-11-15 Thread Jacob Shin
From: Robert Richter 

Generalize northbridge constraints code for family 10h so that later
we can reuse the same code path with other AMD processor families that
have the same northbridge event constraints.

Signed-off-by: Robert Richter 
Signed-off-by: Jacob Shin 
---
 arch/x86/kernel/cpu/perf_event_amd.c |   43 --
 1 file changed, 25 insertions(+), 18 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index d60c5c7..04ef43f 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -188,20 +188,13 @@ static inline int amd_has_nb(struct cpu_hw_events *cpuc)
return nb && nb->nb_id != -1;
 }
 
-static void amd_put_event_constraints(struct cpu_hw_events *cpuc,
- struct perf_event *event)
+static void __amd_put_nb_event_constraints(struct cpu_hw_events *cpuc,
+  struct perf_event *event)
 {
-   struct hw_perf_event *hwc = &event->hw;
struct amd_nb *nb = cpuc->amd_nb;
int i;
 
/*
-* only care about NB events
-*/
-   if (!(amd_has_nb(cpuc) && amd_is_nb_event(hwc)))
-   return;
-
-   /*
 * need to scan whole list because event may not have
 * been assigned during scheduling
 *
@@ -247,12 +240,13 @@ static void amd_put_event_constraints(struct cpu_hw_events *cpuc,
   *
   * Given that resources are allocated (cmpxchg), they must be
   * eventually freed for others to use. This is accomplished by
-  * calling amd_put_event_constraints().
+  * calling __amd_put_nb_event_constraints()
   *
   * Non NB events are not impacted by this restriction.
   */
 static struct event_constraint *
-amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
+__amd_get_nb_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
+  struct event_constraint *c)
 {
struct hw_perf_event *hwc = &event->hw;
struct amd_nb *nb = cpuc->amd_nb;
@@ -260,12 +254,6 @@ amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
int idx, new = -1;
 
/*
-* if not NB event or no NB, then no constraints
-*/
-   if (!(amd_has_nb(cpuc) && amd_is_nb_event(hwc)))
-   return &unconstrained;
-
-   /*
 * detect if already present, if so reuse
 *
 * cannot merge with actual allocation
@@ -275,7 +263,7 @@ amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
 * because of successive calls to x86_schedule_events() from
 * hw_perf_group_sched_in() without hw_perf_enable()
 */
-   for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+   for_each_set_bit(idx, c->idxmsk, X86_PMC_IDX_MAX) {
if (new == -1 || hwc->idx == idx)
/* assign free slot, prefer hwc->idx */
old = cmpxchg(nb->owners + idx, NULL, event);
@@ -391,6 +379,25 @@ static void amd_pmu_cpu_dead(int cpu)
}
 }
 
+static struct event_constraint *
+amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
+{
+   /*
+* if not NB event or no NB, then no constraints
+*/
+   if (!(amd_has_nb(cpuc) && amd_is_nb_event(&event->hw)))
+   return &unconstrained;
+
+   return __amd_get_nb_event_constraints(cpuc, event, &unconstrained);
+}
+
+static void amd_put_event_constraints(struct cpu_hw_events *cpuc,
+ struct perf_event *event)
+{
+   if (amd_has_nb(cpuc) && amd_is_nb_event(&event->hw))
+   __amd_put_nb_event_constraints(cpuc, event);
+}
+
 PMU_FORMAT_ATTR(event, "config:0-7,32-35");
 PMU_FORMAT_ATTR(umask, "config:8-15"   );
 PMU_FORMAT_ATTR(edge,  "config:18" );
-- 
1.7.9.5
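
The key generalization above is that the slot scan now walks only the counter
indices allowed by the constraint's index mask instead of hard-coding
0..num_counters. A tiny user-space sketch of that idiom (a plain loop standing
in for the kernel's for_each_set_bit(); the mask value is made up for the
example):

#include <stdio.h>

#define X86_PMC_IDX_MAX 64

int main(void)
{
	/* pretend constraint: only counters 0-3 may host NB events */
	unsigned long long idxmsk = 0xfULL;

	/* equivalent of: for_each_set_bit(idx, idxmsk, X86_PMC_IDX_MAX) */
	for (int idx = 0; idx < X86_PMC_IDX_MAX; idx++) {
		if (!(idxmsk & (1ULL << idx)))
			continue;
		printf("would try to claim NB counter slot %d\n", idx);
	}
	return 0;
}

Passing &unconstrained (all counters set) preserves today's behaviour for
family 10h, while a later patch in the series can pass a narrower constraint
for the dedicated family 15h northbridge counters.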


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

