[PATCH V4 12/23] perf/core: Support a REMOVE transaction

2019-03-26 Thread kan . liang
From: Andi Kleen 

The TopDown events can be collected per thread/process on Icelake. To
use TopDown through RDPMC in applications, the metrics and slots MSR
values have to be saved/restored during context switching.
It is useful to have a remove transaction when the counter is
unscheduled, so that the values can be saved correctly.
Add a remove transaction to the perf core.

Signed-off-by: Andi Kleen 
Signed-off-by: Kan Liang 
---

No changes since V3.

 arch/x86/events/core.c | 3 +--
 include/linux/perf_event.h | 1 +
 kernel/events/core.c   | 5 +
 3 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 9378c6b2128f..9c14b4b3e457 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -1912,8 +1912,7 @@ static inline void x86_pmu_read(struct perf_event *event)
  * Set the flag to make pmu::enable() not perform the
  * schedulability test, it will be performed at commit time
  *
- * We only support PERF_PMU_TXN_ADD transactions. Save the
- * transaction flags but otherwise ignore non-PERF_PMU_TXN_ADD
+ * Save the transaction flags and ignore non-PERF_PMU_TXN_ADD
  * transactions.
  */
 static void x86_pmu_start_txn(struct pmu *pmu, unsigned int txn_flags)
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index e47ef764f613..fb258c171b2c 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -233,6 +233,7 @@ struct perf_event;
  */
 #define PERF_PMU_TXN_ADD  0x1  /* txn to add/schedule event on PMU */
 #define PERF_PMU_TXN_READ 0x2  /* txn to read event group from PMU */
+#define PERF_PMU_TXN_REMOVE 0x4/* txn to remove event on PMU */
 
 /**
  * pmu::capabilities flags
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 1032a16bd186..dea8cfe2a891 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2032,6 +2032,7 @@ group_sched_out(struct perf_event *group_event,
struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx)
 {
+   struct pmu *pmu = ctx->pmu;
struct perf_event *event;
 
if (group_event->state != PERF_EVENT_STATE_ACTIVE)
@@ -2039,6 +2040,8 @@ group_sched_out(struct perf_event *group_event,
 
perf_pmu_disable(ctx->pmu);
 
+   pmu->start_txn(pmu, PERF_PMU_TXN_REMOVE);
+
event_sched_out(group_event, cpuctx, ctx);
 
/*
@@ -2051,6 +2054,8 @@ group_sched_out(struct perf_event *group_event,
 
if (group_event->attr.exclusive)
cpuctx->exclusive = 0;
+
+   pmu->commit_txn(pmu);
 }
 
 #define DETACH_GROUP   0x01UL
-- 
2.17.1



[PATCH V4 14/23] perf/x86/intel: Support overflows on SLOTS

2019-03-26 Thread kan . liang
From: Andi Kleen 

The internal counters used for the metrics can overflow. If this happens
an overflow is triggered on the SLOTS fixed counter. Add special code
that resets all the slave metric counters in this case.

Signed-off-by: Andi Kleen 
Signed-off-by: Kan Liang 
---

No changes since V3.

 arch/x86/events/intel/core.c | 23 +++
 1 file changed, 23 insertions(+)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 5c8f0df137bc..2da822414627 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2231,12 +2231,35 @@ static void intel_pmu_add_event(struct perf_event 
*event)
intel_pmu_lbr_add(event);
 }
 
+/* When SLOTS overflowed update all the active topdown-* events */
+static void intel_pmu_update_metrics(struct perf_event *event)
+{
+   struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+   int idx;
+   u64 slots_events;
+
+   slots_events = *(u64 *)cpuc->enabled_events & INTEL_PMC_MSK_ANY_SLOTS;
+
+   for_each_set_bit(idx, (unsigned long *)&slots_events, 64) {
+   struct perf_event *ev = cpuc->events[idx];
+
+   if (ev == event)
+   continue;
+   x86_perf_event_update(event);
+   }
+}
+
 /*
  * Save and restart an expired event. Called by NMI contexts,
  * so it has to be careful about preempting normal event ops:
  */
 int intel_pmu_save_and_restart(struct perf_event *event)
 {
+   struct hw_perf_event *hwc = &event->hw;
+
+   if (unlikely(hwc->reg_idx == INTEL_PMC_IDX_FIXED_SLOTS))
+   intel_pmu_update_metrics(event);
+
x86_perf_event_update(event);
/*
 * For a checkpointed counter always reset back to 0.  This
-- 
2.17.1



[PATCH V4 16/23] perf/x86/intel: Set correct weight for topdown subevent counters

2019-03-26 Thread kan . liang
From: Andi Kleen 

The top down sub event counters are mapped to a fixed counter,
but should have the normal weight for the scheduler.
So special case this.

Signed-off-by: Andi Kleen 
Signed-off-by: Kan Liang 
---

No changes since V3.

 arch/x86/events/intel/core.c | 9 +
 1 file changed, 9 insertions(+)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 6a8a221dc188..31e4e283e7c5 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -5081,6 +5081,15 @@ __init int intel_pmu_init(void)
 * counter, so do not extend mask to generic counters
 */
for_each_event_constraint(c, x86_pmu.event_constraints) {
+   /*
+* Don't limit the event mask for topdown sub event
+* counters.
+*/
+   if (x86_pmu.num_counters_fixed >= 3 &&
+   c->idxmsk64 & INTEL_PMC_MSK_ANY_SLOTS) {
+   c->weight = hweight64(c->idxmsk64);
+   continue;
+   }
if (c->cmask == FIXED_EVENT_FLAGS
&& c->idxmsk64 != INTEL_PMC_MSK_FIXED_REF_CYCLES) {
c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 
1;
-- 
2.17.1



[PATCH V4 18/23] perf/x86/intel: Disable sampling read slots and topdown

2019-03-26 Thread kan . liang
From: Kan Liang 

To get a correct PERF_METRICS value, fixed counter 3 must start from
0. This causes problems for sampling reads of slots and topdown events.
For example,
perf record -e '{slots, topdown-retiring}:S'
The slots would not overflow if it starts from 0.

Add specific validate_group() support to reject the case and error out
for Icelake.

Signed-off-by: Kan Liang 
---

No changes since V3.

 arch/x86/events/core.c   |  2 ++
 arch/x86/events/intel/core.c | 20 
 arch/x86/events/perf_event.h |  2 ++
 3 files changed, 24 insertions(+)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 7d4d56f76436..b9bee53e53d8 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2122,6 +2122,8 @@ static int validate_group(struct perf_event *event)
 
ret = x86_pmu.schedule_events(fake_cpuc, n, NULL);
 
+   if (x86_pmu.validate_group)
+   ret = x86_pmu.validate_group(fake_cpuc, n);
 out:
free_fake_cpuc(fake_cpuc);
return ret;
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index b08e361fc718..ef6045544628 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -4331,6 +4331,25 @@ static int icl_set_period(struct perf_event *event)
return 1;
 }
 
+static int icl_validate_group(struct cpu_hw_events *cpuc, int n)
+{
+   bool has_sampling_slots = false, has_metrics = false;
+   struct perf_event *e;
+   int i;
+
+   for (i = 0; i < n; i++) {
+   e = cpuc->event_list[i];
+   if (is_slots_event(e) && is_sampling_event(e))
+   has_sampling_slots = true;
+
+   if (is_perf_metrics_event(e))
+   has_metrics = true;
+   }
+   if (unlikely(has_sampling_slots && has_metrics))
+   return -EINVAL;
+   return 0;
+}
+
 EVENT_ATTR_STR(mem-loads,  mem_ld_hsw, "event=0xcd,umask=0x1,ldlat=3");
 EVENT_ATTR_STR(mem-stores, mem_st_hsw, "event=0xd0,umask=0x82")
 
@@ -5037,6 +5056,7 @@ __init int intel_pmu_init(void)
x86_pmu.has_metric = x86_pmu.intel_cap.perf_metrics;
x86_pmu.metric_update_event = icl_metric_update_event;
x86_pmu.set_period = icl_set_period;
+   x86_pmu.validate_group = icl_validate_group;
pr_cont("Icelake events, ");
name = "icelake";
break;
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 6500463bda85..077d44a96d31 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -631,6 +631,8 @@ struct x86_pmu {
u64 (*limit_period)(struct perf_event *event, u64 l);
int (*set_period)(struct perf_event *event);
 
+   int (*validate_group)(struct cpu_hw_events *cpuc, int n);
+
/* PMI handler bits */
unsigned intlate_ack:1,
counter_freezing:1;
-- 
2.17.1



[PATCH V4 11/23] perf/x86/intel/uncore: Add Intel Icelake uncore support

2019-03-26 Thread kan . liang
From: Kan Liang 

Add Intel Icelake uncore support,
 - The init code is based on Skylake
 - Add new pci id for IMC
 - New MSR address for CBOX
 - Get CBOX# from CNL_UNC_CBO_CONFIG MSR directly
 - Create a new PMU for fixed clocktick counter

Signed-off-by: Kan Liang 
---

No changes since V3.

 arch/x86/events/intel/uncore.c |  6 ++
 arch/x86/events/intel/uncore.h |  1 +
 arch/x86/events/intel/uncore_snb.c | 91 ++
 3 files changed, 98 insertions(+)

diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index 9fe64c01a2e5..fc40a1473058 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -1367,6 +1367,11 @@ static const struct intel_uncore_init_fun 
skx_uncore_init __initconst = {
.pci_init = skx_uncore_pci_init,
 };
 
+static const struct intel_uncore_init_fun icl_uncore_init __initconst = {
+   .cpu_init = icl_uncore_cpu_init,
+   .pci_init = skl_uncore_pci_init,
+};
+
 static const struct x86_cpu_id intel_uncore_match[] __initconst = {
X86_UNCORE_MODEL_MATCH(INTEL_FAM6_NEHALEM_EP, nhm_uncore_init),
X86_UNCORE_MODEL_MATCH(INTEL_FAM6_NEHALEM,nhm_uncore_init),
@@ -1393,6 +1398,7 @@ static const struct x86_cpu_id intel_uncore_match[] 
__initconst = {
X86_UNCORE_MODEL_MATCH(INTEL_FAM6_SKYLAKE_X,  skx_uncore_init),
X86_UNCORE_MODEL_MATCH(INTEL_FAM6_KABYLAKE_MOBILE, skl_uncore_init),
X86_UNCORE_MODEL_MATCH(INTEL_FAM6_KABYLAKE_DESKTOP, skl_uncore_init),
+   X86_UNCORE_MODEL_MATCH(INTEL_FAM6_ICELAKE_MOBILE, icl_uncore_init),
{},
 };
 
diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h
index 853a49a8ccf6..79eb2e21e4f0 100644
--- a/arch/x86/events/intel/uncore.h
+++ b/arch/x86/events/intel/uncore.h
@@ -512,6 +512,7 @@ int skl_uncore_pci_init(void);
 void snb_uncore_cpu_init(void);
 void nhm_uncore_cpu_init(void);
 void skl_uncore_cpu_init(void);
+void icl_uncore_cpu_init(void);
 int snb_pci2phy_map_init(int devid);
 
 /* uncore_snbep.c */
diff --git a/arch/x86/events/intel/uncore_snb.c 
b/arch/x86/events/intel/uncore_snb.c
index 13493f43b247..f8431819b3e1 100644
--- a/arch/x86/events/intel/uncore_snb.c
+++ b/arch/x86/events/intel/uncore_snb.c
@@ -34,6 +34,8 @@
 #define PCI_DEVICE_ID_INTEL_CFL_4S_S_IMC   0x3e33
 #define PCI_DEVICE_ID_INTEL_CFL_6S_S_IMC   0x3eca
 #define PCI_DEVICE_ID_INTEL_CFL_8S_S_IMC   0x3e32
+#define PCI_DEVICE_ID_INTEL_ICL_U_IMC  0x8a02
+#define PCI_DEVICE_ID_INTEL_ICL_U2_IMC 0x8a12
 
 /* SNB event control */
 #define SNB_UNC_CTL_EV_SEL_MASK0x00ff
@@ -93,6 +95,12 @@
 #define SKL_UNC_PERF_GLOBAL_CTL0xe01
 #define SKL_UNC_GLOBAL_CTL_CORE_ALL((1 << 5) - 1)
 
+/* ICL Cbo register */
+#define ICL_UNC_CBO_CONFIG 0x396
+#define ICL_UNC_NUM_CBO_MASK   0xf
+#define ICL_UNC_CBO_0_PER_CTR0 0x702
+#define ICL_UNC_CBO_MSR_OFFSET 0x8
+
 DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7");
 DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");
 DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18");
@@ -280,6 +288,70 @@ void skl_uncore_cpu_init(void)
snb_uncore_arb.ops = _uncore_msr_ops;
 }
 
+static struct intel_uncore_type icl_uncore_cbox = {
+   .name   = "cbox",
+   .num_counters   = 4,
+   .perf_ctr_bits  = 44,
+   .perf_ctr   = ICL_UNC_CBO_0_PER_CTR0,
+   .event_ctl  = SNB_UNC_CBO_0_PERFEVTSEL0,
+   .event_mask = SNB_UNC_RAW_EVENT_MASK,
+   .msr_offset = ICL_UNC_CBO_MSR_OFFSET,
+   .ops= _uncore_msr_ops,
+   .format_group   = _uncore_format_group,
+};
+
+static struct uncore_event_desc icl_uncore_events[] = {
+   INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff"),
+   { /* end: all zeroes */ },
+};
+
+static struct attribute *icl_uncore_clock_formats_attr[] = {
+   _attr_event.attr,
+   NULL,
+};
+
+static struct attribute_group icl_uncore_clock_format_group = {
+   .name = "format",
+   .attrs = icl_uncore_clock_formats_attr,
+};
+
+static struct intel_uncore_type icl_uncore_clockbox = {
+   .name   = "clock",
+   .num_counters   = 1,
+   .num_boxes  = 1,
+   .fixed_ctr_bits = 48,
+   .fixed_ctr  = SNB_UNC_FIXED_CTR,
+   .fixed_ctl  = SNB_UNC_FIXED_CTR_CTRL,
+   .single_fixed   = 1,
+   .event_mask = SNB_UNC_CTL_EV_SEL_MASK,
+   .format_group   = _uncore_clock_format_group,
+   .ops= _uncore_msr_ops,
+   .event_descs= icl_uncore_events,
+};
+
+static struct intel_uncore_type *icl_msr_uncores[] = {
+   _uncore_cbox,
+   _uncore_arb,
+   _uncore_clockbox,
+   NULL,
+};
+
+static int icl_get_cbox_num(void)
+{
+   u64 num_boxes;
+
+   rdmsrl(ICL_UNC_CBO_CONFIG, num_boxes);
+
+   return num_boxes & ICL_UNC_NUM_CBO_MASK;
+}
+
+void 

[PATCH V4 19/23] perf/x86/intel: Support CPUID 10.ECX to disable fixed counters

2019-03-26 Thread kan . liang
From: Andi Kleen 

Icelake supports a new CPUID 10.ECX leaf to indicate that some fixed
counters are not supported.  This extends the previous count to a bitmap,
which allows disabling even lower-numbered counters.

It's a nop on Icelake (all fixed counters are supported), but let's
implement it here.  This adds the necessary checks. In theory it could
be used today by a Hypervisor.

For disabled counters disable any constraint events. I reuse the
existing intel_ctrl variable to remember which counters are disabled.
All code that reads all counters is fixed to check this extra bitmask.

Signed-off-by: Andi Kleen 
Signed-off-by: Kan Liang 
---

No changes since V3.

 arch/x86/events/core.c   |  8 +++-
 arch/x86/events/intel/core.c | 22 +++---
 arch/x86/events/perf_event.h |  6 ++
 3 files changed, 28 insertions(+), 8 deletions(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index b9bee53e53d8..12d7d591843e 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -225,6 +225,8 @@ static bool check_hw_exists(void)
if (ret)
goto msr_fail;
for (i = 0; i < x86_pmu.num_counters_fixed; i++) {
+   if (fixed_counter_disabled(i))
+   continue;
if (val & (0x03 << i*4)) {
bios_fail = 1;
val_fail = val;
@@ -1372,6 +1374,8 @@ void perf_event_print_debug(void)
cpu, idx, prev_left);
}
for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
+   if (fixed_counter_disabled(idx))
+   continue;
rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
 
pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
@@ -1887,7 +1891,9 @@ static int __init init_hw_perf_events(void)
pr_info("... generic registers:  %d\n", x86_pmu.num_counters);
pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask);
pr_info("... max period: %016Lx\n", x86_pmu.max_period);
-   pr_info("... fixed-purpose events:   %d\n", 
x86_pmu.num_counters_fixed);
+   pr_info("... fixed-purpose events:   %lu\n",
+   hweight64((((1ULL << x86_pmu.num_counters_fixed) - 1)
+   << INTEL_PMC_IDX_FIXED) & 
x86_pmu.intel_ctrl));
pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl);
 
/*
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index ef6045544628..a4b7711ef0ee 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2311,8 +2311,11 @@ static void intel_pmu_reset(void)
wrmsrl_safe(x86_pmu_config_addr(idx), 0ull);
wrmsrl_safe(x86_pmu_event_addr(idx),  0ull);
}
-   for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++)
+   for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
+   if (fixed_counter_disabled(idx))
+   continue;
wrmsrl_safe(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
+   }
 
if (ds)
ds->bts_index = ds->bts_buffer_base;
@@ -4551,7 +4554,7 @@ __init int intel_pmu_init(void)
union cpuid10_eax eax;
union cpuid10_ebx ebx;
struct event_constraint *c;
-   unsigned int unused;
+   unsigned int fixed_mask;
struct extra_reg *er;
int version, i;
char *name;
@@ -4572,9 +4575,11 @@ __init int intel_pmu_init(void)
 * Check whether the Architectural PerfMon supports
 * Branch Misses Retired hw_event or not.
 */
-   cpuid(10, &eax.full, &ebx.full, &unused, &edx.full);
+   cpuid(10, &eax.full, &ebx.full, &fixed_mask, &edx.full);
if (eax.split.mask_length < ARCH_PERFMON_EVENTS_COUNT)
return -ENODEV;
+   if (!fixed_mask)
+   fixed_mask = -1;
 
version = eax.split.version_id;
if (version < 2)
@@ -5104,7 +5109,8 @@ __init int intel_pmu_init(void)
}
 
x86_pmu.intel_ctrl |=
-   ((1LL << x86_pmu.num_counters_fixed)-1) << INTEL_PMC_IDX_FIXED;
+   (((1LL << x86_pmu.num_counters_fixed)-1) & (u64)fixed_mask)
+   << INTEL_PMC_IDX_FIXED;
 
if (x86_pmu.event_constraints) {
/*
@@ -5121,9 +5127,11 @@ __init int intel_pmu_init(void)
c->weight = hweight64(c->idxmsk64);
continue;
}
-   if (c->cmask == FIXED_EVENT_FLAGS
-   && c->idxmsk64 != INTEL_PMC_MSK_FIXED_REF_CYCLES) {
-   c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 
1;
+   if (c->cmask == FIXED_EVENT_FLAGS)  {
+   if (c->idxmsk64 != 
INTEL_PMC_MSK_FIXED_REF_CYCLES)
+   c->idxmsk64 |= (1ULL 

[PATCH V4 20/23] perf, tools: Add support for recording and printing XMM registers

2019-03-26 Thread kan . liang
From: Andi Kleen 

Newer kernel code can collect XMM registers in some cases.
Add support for perf script to dump them, and support
for the register parser in perf record -I ... to configure them.
For now they are just printed in hex, could potentially add
other formats too.

Signed-off-by: Andi Kleen 
Signed-off-by: Kan Liang 
---

Changes since V3:
- Update perf_regs.h accordingly

 tools/arch/x86/include/uapi/asm/perf_regs.h | 23 ++-
 tools/perf/arch/x86/include/perf_regs.h | 25 +++--
 tools/perf/arch/x86/util/perf_regs.c| 16 +
 tools/perf/util/perf_regs.h |  1 +
 4 files changed, 62 insertions(+), 3 deletions(-)

diff --git a/tools/arch/x86/include/uapi/asm/perf_regs.h 
b/tools/arch/x86/include/uapi/asm/perf_regs.h
index f3329cabce5c..ac67bbea10ca 100644
--- a/tools/arch/x86/include/uapi/asm/perf_regs.h
+++ b/tools/arch/x86/include/uapi/asm/perf_regs.h
@@ -27,8 +27,29 @@ enum perf_event_x86_regs {
PERF_REG_X86_R13,
PERF_REG_X86_R14,
PERF_REG_X86_R15,
-
+   /* These are the limits for the GPRs. */
PERF_REG_X86_32_MAX = PERF_REG_X86_GS + 1,
PERF_REG_X86_64_MAX = PERF_REG_X86_R15 + 1,
+
+   /* These all need two bits set because they are 128bit */
+   PERF_REG_X86_XMM0  = 32,
+   PERF_REG_X86_XMM1  = 34,
+   PERF_REG_X86_XMM2  = 36,
+   PERF_REG_X86_XMM3  = 38,
+   PERF_REG_X86_XMM4  = 40,
+   PERF_REG_X86_XMM5  = 42,
+   PERF_REG_X86_XMM6  = 44,
+   PERF_REG_X86_XMM7  = 46,
+   PERF_REG_X86_XMM8  = 48,
+   PERF_REG_X86_XMM9  = 50,
+   PERF_REG_X86_XMM10 = 52,
+   PERF_REG_X86_XMM11 = 54,
+   PERF_REG_X86_XMM12 = 56,
+   PERF_REG_X86_XMM13 = 58,
+   PERF_REG_X86_XMM14 = 60,
+   PERF_REG_X86_XMM15 = 62,
+
+   /* These include both GPRs and XMMX registers */
+   PERF_REG_X86_XMM_MAX = PERF_REG_X86_XMM15 + 2,
 };
 #endif /* _ASM_X86_PERF_REGS_H */
diff --git a/tools/perf/arch/x86/include/perf_regs.h 
b/tools/perf/arch/x86/include/perf_regs.h
index 7f6d538f8a89..b7321337d100 100644
--- a/tools/perf/arch/x86/include/perf_regs.h
+++ b/tools/perf/arch/x86/include/perf_regs.h
@@ -8,9 +8,9 @@
 
 void perf_regs_load(u64 *regs);
 
+#define PERF_REGS_MAX PERF_REG_X86_XMM_MAX
 #ifndef HAVE_ARCH_X86_64_SUPPORT
 #define PERF_REGS_MASK ((1ULL << PERF_REG_X86_32_MAX) - 1)
-#define PERF_REGS_MAX PERF_REG_X86_32_MAX
 #define PERF_SAMPLE_REGS_ABI PERF_SAMPLE_REGS_ABI_32
 #else
 #define REG_NOSUPPORT ((1ULL << PERF_REG_X86_DS) | \
@@ -18,7 +18,6 @@ void perf_regs_load(u64 *regs);
   (1ULL << PERF_REG_X86_FS) | \
   (1ULL << PERF_REG_X86_GS))
 #define PERF_REGS_MASK (((1ULL << PERF_REG_X86_64_MAX) - 1) & ~REG_NOSUPPORT)
-#define PERF_REGS_MAX PERF_REG_X86_64_MAX
 #define PERF_SAMPLE_REGS_ABI PERF_SAMPLE_REGS_ABI_64
 #endif
 #define PERF_REG_IP PERF_REG_X86_IP
@@ -77,6 +76,28 @@ static inline const char *perf_reg_name(int id)
case PERF_REG_X86_R15:
return "R15";
 #endif /* HAVE_ARCH_X86_64_SUPPORT */
+
+#define XMM(x) \
+   case PERF_REG_X86_XMM ## x: \
+   case PERF_REG_X86_XMM ## x + 1: \
+   return "XMM" #x;
+   XMM(0)
+   XMM(1)
+   XMM(2)
+   XMM(3)
+   XMM(4)
+   XMM(5)
+   XMM(6)
+   XMM(7)
+   XMM(8)
+   XMM(9)
+   XMM(10)
+   XMM(11)
+   XMM(12)
+   XMM(13)
+   XMM(14)
+   XMM(15)
+#undef XMM
default:
return NULL;
}
diff --git a/tools/perf/arch/x86/util/perf_regs.c 
b/tools/perf/arch/x86/util/perf_regs.c
index fead6b3b4206..71d7604dbf0b 100644
--- a/tools/perf/arch/x86/util/perf_regs.c
+++ b/tools/perf/arch/x86/util/perf_regs.c
@@ -31,6 +31,22 @@ const struct sample_reg sample_reg_masks[] = {
SMPL_REG(R14, PERF_REG_X86_R14),
SMPL_REG(R15, PERF_REG_X86_R15),
 #endif
+   SMPL_REG2(XMM0, PERF_REG_X86_XMM0),
+   SMPL_REG2(XMM1, PERF_REG_X86_XMM1),
+   SMPL_REG2(XMM2, PERF_REG_X86_XMM2),
+   SMPL_REG2(XMM3, PERF_REG_X86_XMM3),
+   SMPL_REG2(XMM4, PERF_REG_X86_XMM4),
+   SMPL_REG2(XMM5, PERF_REG_X86_XMM5),
+   SMPL_REG2(XMM6, PERF_REG_X86_XMM6),
+   SMPL_REG2(XMM7, PERF_REG_X86_XMM7),
+   SMPL_REG2(XMM8, PERF_REG_X86_XMM8),
+   SMPL_REG2(XMM9, PERF_REG_X86_XMM9),
+   SMPL_REG2(XMM10, PERF_REG_X86_XMM10),
+   SMPL_REG2(XMM11, PERF_REG_X86_XMM11),
+   SMPL_REG2(XMM12, PERF_REG_X86_XMM12),
+   SMPL_REG2(XMM13, PERF_REG_X86_XMM13),
+   SMPL_REG2(XMM14, PERF_REG_X86_XMM14),
+   SMPL_REG2(XMM15, PERF_REG_X86_XMM15),
SMPL_REG_END
 };
 
diff --git a/tools/perf/util/perf_regs.h b/tools/perf/util/perf_regs.h
index c9319f8d17a6..1a15a4bfc28d 100644
--- a/tools/perf/util/perf_regs.h
+++ b/tools/perf/util/perf_regs.h
@@ -12,6 +12,7 @@ struct sample_reg {
uint64_t mask;
 };
 #define SMPL_REG(n, b) { .name = #n, .mask = 1ULL << (b) }
+#define SMPL_REG2(n, b) { .name = #n, 

[PATCH V4 17/23] perf/x86/intel: Export new top down events for Icelake

2019-03-26 Thread kan . liang
From: Andi Kleen 

Export new top down events for perf that map to the sub metrics
in the metrics register, and another for the new slots fixed counter.
This makes the new fixed counters in Icelake visible to the perf
user tools.

Signed-off-by: Andi Kleen 
Signed-off-by: Kan Liang 
---

No changes since V3.

 arch/x86/events/intel/core.c | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 31e4e283e7c5..b08e361fc718 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -320,6 +320,12 @@ EVENT_ATTR_STR_HT(topdown-recovery-bubbles, 
td_recovery_bubbles,
 EVENT_ATTR_STR_HT(topdown-recovery-bubbles.scale, td_recovery_bubbles_scale,
"4", "2");
 
+EVENT_ATTR_STR(slots,  slots,  "event=0x00,umask=0x4");
+EVENT_ATTR_STR(topdown-retiring,   td_retiring,"event=0xff,umask=0x1");
+EVENT_ATTR_STR(topdown-bad-spec,   td_bad_spec,"event=0xff,umask=0x2");
+EVENT_ATTR_STR(topdown-fe-bound,   td_fe_bound,"event=0xff,umask=0x3");
+EVENT_ATTR_STR(topdown-be-bound,   td_be_bound,"event=0xff,umask=0x4");
+
 static struct attribute *snb_events_attrs[] = {
EVENT_PTR(td_slots_issued),
EVENT_PTR(td_slots_retired),
@@ -4383,6 +4389,11 @@ EVENT_ATTR_STR(el-capacity-write, el_capacity_write, 
"event=0x54,umask=0x2");
 static struct attribute *icl_events_attrs[] = {
EVENT_PTR(mem_ld_hsw),
EVENT_PTR(mem_st_hsw),
+   EVENT_PTR(slots),
+   EVENT_PTR(td_retiring),
+   EVENT_PTR(td_bad_spec),
+   EVENT_PTR(td_fe_bound),
+   EVENT_PTR(td_be_bound),
NULL,
 };
 
-- 
2.17.1



[PATCH V4 21/23] perf, tools, stat: Support new per thread TopDown metrics

2019-03-26 Thread kan . liang
From: Andi Kleen 

Icelake has support for reporting per thread TopDown metrics.
These are reported differently than the previous TopDown support,
each metric is standalone, but scaled to pipeline "slots".
We don't need to do anything special for HyperThreading anymore.
Teach perf stat --topdown to handle these new metrics and
print them in the same way as the previous TopDown metrics.
The restriction of only being able to report information per core is
gone.

Signed-off-by: Andi Kleen 
Signed-off-by: Kan Liang 
---

No changes since V3.

 tools/perf/Documentation/perf-stat.txt |  9 ++-
 tools/perf/builtin-stat.c  | 24 +++
 tools/perf/util/stat-shadow.c  | 89 ++
 tools/perf/util/stat.c |  4 ++
 tools/perf/util/stat.h |  8 +++
 5 files changed, 132 insertions(+), 2 deletions(-)

diff --git a/tools/perf/Documentation/perf-stat.txt 
b/tools/perf/Documentation/perf-stat.txt
index 4bc2085e5197..f4469751ef0a 100644
--- a/tools/perf/Documentation/perf-stat.txt
+++ b/tools/perf/Documentation/perf-stat.txt
@@ -266,8 +266,13 @@ if the workload is actually bound by the CPU and not by 
something else.
 For best results it is usually a good idea to use it with interval
 mode like -I 1000, as the bottleneck of workloads can change often.
 
-The top down metrics are collected per core instead of per
-CPU thread. Per core mode is automatically enabled
+This enables --metric-only, unless overridden with --no-metric-only.
+
+The following restrictions only apply to older Intel CPUs and Atom,
+on newer CPUs (IceLake and later) TopDown can be collected for any thread:
+
+The top down metrics are collected per core instead of per CPU thread.
+Per core mode is automatically enabled
 and -a (global monitoring) is needed, requiring root rights or
 perf.perf_event_paranoid=-1.
 
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 7b8f09b0b8bf..5068396c241b 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -123,6 +123,14 @@ static const char * topdown_attrs[] = {
NULL,
 };
 
+static const char *topdown_metric_attrs[] = {
+   "topdown-retiring",
+   "topdown-bad-spec",
+   "topdown-fe-bound",
+   "topdown-be-bound",
+   NULL,
+};
+
 static const char *smi_cost_attrs = {
"{"
"msr/aperf/,"
@@ -1215,6 +1223,21 @@ static int add_default_attributes(void)
char *str = NULL;
bool warn = false;
 
+   if (topdown_filter_events(topdown_metric_attrs, &str, 1) < 0) {
+   pr_err("Out of memory\n");
+   return -1;
+   }
+   if (topdown_metric_attrs[0] && str) {
+   if (!stat_config.interval) {
+   fprintf(stat_config.output,
+   "Topdown accuracy may decreases when 
measuring long period.\n"
+   "Please print the result regularly, 
e.g. -I1000\n");
+   }
+   goto setup_metrics;
+   }
+
+   str = NULL;
+
if (stat_config.aggr_mode != AGGR_GLOBAL &&
stat_config.aggr_mode != AGGR_CORE) {
pr_err("top down event configuration requires 
--per-core mode\n");
@@ -1236,6 +1259,7 @@ static int add_default_attributes(void)
if (topdown_attrs[0] && str) {
if (warn)
arch_topdown_group_warn();
+setup_metrics:
err = parse_events(evsel_list, str, &errinfo);
if (err) {
fprintf(stderr,
diff --git a/tools/perf/util/stat-shadow.c b/tools/perf/util/stat-shadow.c
index 83d8094be4fe..6b6ffd64a4c4 100644
--- a/tools/perf/util/stat-shadow.c
+++ b/tools/perf/util/stat-shadow.c
@@ -238,6 +238,18 @@ void perf_stat__update_shadow_stats(struct perf_evsel 
*counter, u64 count,
else if (perf_stat_evsel__is(counter, TOPDOWN_RECOVERY_BUBBLES))
update_runtime_stat(st, STAT_TOPDOWN_RECOVERY_BUBBLES,
ctx, cpu, count);
+   else if (perf_stat_evsel__is(counter, TOPDOWN_RETIRING))
+   update_runtime_stat(st, STAT_TOPDOWN_RETIRING,
+   ctx, cpu, count);
+   else if (perf_stat_evsel__is(counter, TOPDOWN_BAD_SPEC))
+   update_runtime_stat(st, STAT_TOPDOWN_BAD_SPEC,
+   ctx, cpu, count);
+   else if (perf_stat_evsel__is(counter, TOPDOWN_FE_BOUND))
+   update_runtime_stat(st, STAT_TOPDOWN_FE_BOUND,
+   ctx, cpu, count);
+   else if (perf_stat_evsel__is(counter, TOPDOWN_BE_BOUND))
+   update_runtime_stat(st, STAT_TOPDOWN_BE_BOUND,
+   ctx, cpu, count);
else if (perf_evsel__match(counter, 

[PATCH V4 13/23] perf/x86/intel: Basic support for metrics counters

2019-03-26 Thread kan . liang
From: Andi Kleen 

Metrics counters (hardware counters containing multiple metrics)
are modelled as separate registers for each sub-event, with an
extra reg being used for coordinating access to the underlying
register in the scheduler.

This patch adds the basic infrastructure to separate the scheduler
register indexes from the actual hardware register indexes. In
most cases the MSR address is already used correctly, but for
code using indexes we need a separate reg_idx field in the event
to indicate the correct underlying register.

Signed-off-by: Andi Kleen 
Signed-off-by: Kan Liang 
---

No changes since V3.

 arch/x86/events/core.c| 18 --
 arch/x86/events/intel/core.c  | 29 -
 arch/x86/events/perf_event.h  | 15 +++
 arch/x86/include/asm/msr-index.h  |  1 +
 arch/x86/include/asm/perf_event.h | 30 ++
 include/linux/perf_event.h|  1 +
 6 files changed, 83 insertions(+), 11 deletions(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 9c14b4b3e457..d24f8d009529 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -1006,16 +1006,30 @@ static inline void x86_assign_hw_event(struct 
perf_event *event,
struct hw_perf_event *hwc = &event->hw;
 
hwc->idx = cpuc->assign[i];
+   hwc->reg_idx = hwc->idx;
hwc->last_cpu = smp_processor_id();
hwc->last_tag = ++cpuc->tags[i];
 
+   /*
+* Metrics counters use different indexes in the scheduler
+* versus the hardware.
+*
+* Map metrics to fixed counter 3 (which is the base count),
+* but the update event callback reads the extra metric register
+* and converts to the right metric.
+*/
+   if (is_metric_idx(hwc->idx))
+   hwc->reg_idx = INTEL_PMC_IDX_FIXED_SLOTS;
+
if (hwc->idx == INTEL_PMC_IDX_FIXED_BTS) {
hwc->config_base = 0;
hwc->event_base = 0;
} else if (hwc->idx >= INTEL_PMC_IDX_FIXED) {
hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
-   hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - 
INTEL_PMC_IDX_FIXED);
-   hwc->event_base_rdpmc = (hwc->idx - INTEL_PMC_IDX_FIXED) | 
1<<30;
+   hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 +
+   (hwc->reg_idx - INTEL_PMC_IDX_FIXED);
+   hwc->event_base_rdpmc = (hwc->reg_idx - INTEL_PMC_IDX_FIXED)
+   | 1<<30;
} else {
hwc->config_base = x86_pmu_config_addr(hwc->idx);
hwc->event_base  = x86_pmu_event_addr(hwc->idx);
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index ef95d73ef4f0..5c8f0df137bc 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2090,7 +2090,7 @@ static inline void intel_pmu_ack_status(u64 ack)
 
 static void intel_pmu_disable_fixed(struct hw_perf_event *hwc)
 {
-   int idx = hwc->idx - INTEL_PMC_IDX_FIXED;
+   int idx = hwc->reg_idx - INTEL_PMC_IDX_FIXED;
u64 ctrl_val, mask;
 
mask = 0xfULL << (idx * 4);
@@ -2116,9 +2116,19 @@ static void intel_pmu_disable_event(struct perf_event 
*event)
return;
}
 
-   cpuc->intel_ctrl_guest_mask &= ~(1ull << hwc->idx);
-   cpuc->intel_ctrl_host_mask &= ~(1ull << hwc->idx);
-   cpuc->intel_cp_status &= ~(1ull << hwc->idx);
+   __clear_bit(hwc->idx, cpuc->enabled_events);
+
+   /*
+* When any other slots sharing event is still enabled,
+* cancel the disabling.
+*/
+   if (is_any_slots_idx(hwc->idx) &&
+   (*(u64 *)&cpuc->enabled_events & INTEL_PMC_MSK_ANY_SLOTS))
+   return;
+
+   cpuc->intel_ctrl_guest_mask &= ~(1ull << hwc->reg_idx);
+   cpuc->intel_ctrl_host_mask &= ~(1ull << hwc->reg_idx);
+   cpuc->intel_cp_status &= ~(1ull << hwc->reg_idx);
 
if (unlikely(event->attr.precise_ip))
intel_pmu_pebs_disable(event);
@@ -2150,7 +2160,7 @@ static void intel_pmu_read_event(struct perf_event *event)
 static void intel_pmu_enable_fixed(struct perf_event *event)
 {
struct hw_perf_event *hwc = &event->hw;
-   int idx = hwc->idx - INTEL_PMC_IDX_FIXED;
+   int idx = hwc->reg_idx - INTEL_PMC_IDX_FIXED;
u64 ctrl_val, mask, bits = 0;
 
/*
@@ -2194,18 +2204,19 @@ static void intel_pmu_enable_event(struct perf_event 
*event)
}
 
if (event->attr.exclude_host)
-   cpuc->intel_ctrl_guest_mask |= (1ull << hwc->idx);
+   cpuc->intel_ctrl_guest_mask |= (1ull << hwc->reg_idx);
if (event->attr.exclude_guest)
-   cpuc->intel_ctrl_host_mask |= (1ull << hwc->idx);
+   cpuc->intel_ctrl_host_mask |= (1ull << hwc->reg_idx);
 
if (unlikely(event_is_checkpointed(event)))
-   cpuc->intel_cp_status |= (1ull << hwc->idx);
+   

[PATCH V4 09/23] perf/x86/intel/rapl: Add Icelake support

2019-03-26 Thread kan . liang
From: Kan Liang 

Icelake supports the same RAPL counters as Skylake.

Signed-off-by: Kan Liang 
---

No changes since V3.

 arch/x86/events/intel/rapl.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c
index 94dc564146ca..37ebf6fc5415 100644
--- a/arch/x86/events/intel/rapl.c
+++ b/arch/x86/events/intel/rapl.c
@@ -775,6 +775,8 @@ static const struct x86_cpu_id rapl_cpu_match[] __initconst 
= {
X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT_X, hsw_rapl_init),
 
X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT_PLUS, hsw_rapl_init),
+
+   X86_RAPL_MODEL_MATCH(INTEL_FAM6_ICELAKE_MOBILE,  skl_rapl_init),
{},
 };
 
-- 
2.17.1



[PATCH V4 23/23] perf vendor events intel: Add JSON files for Icelake

2019-03-26 Thread kan . liang
From: Kan Liang 

Add V1 event list for Icelake.

Signed-off-by: Kan Liang 
---

No changes since V3.

 .../pmu-events/arch/x86/icelake/cache.json| 552 +++
 .../arch/x86/icelake/floating-point.json  |  90 ++
 .../pmu-events/arch/x86/icelake/frontend.json | 424 +
 .../pmu-events/arch/x86/icelake/memory.json   | 410 
 .../pmu-events/arch/x86/icelake/other.json| 133 +++
 .../pmu-events/arch/x86/icelake/pipeline.json | 892 ++
 .../arch/x86/icelake/virtual-memory.json  | 236 +
 tools/perf/pmu-events/arch/x86/mapfile.csv|   1 +
 8 files changed, 2738 insertions(+)
 create mode 100644 tools/perf/pmu-events/arch/x86/icelake/cache.json
 create mode 100644 tools/perf/pmu-events/arch/x86/icelake/floating-point.json
 create mode 100644 tools/perf/pmu-events/arch/x86/icelake/frontend.json
 create mode 100644 tools/perf/pmu-events/arch/x86/icelake/memory.json
 create mode 100644 tools/perf/pmu-events/arch/x86/icelake/other.json
 create mode 100644 tools/perf/pmu-events/arch/x86/icelake/pipeline.json
 create mode 100644 tools/perf/pmu-events/arch/x86/icelake/virtual-memory.json

diff --git a/tools/perf/pmu-events/arch/x86/icelake/cache.json 
b/tools/perf/pmu-events/arch/x86/icelake/cache.json
new file mode 100644
index 000000000000..3529fc338c17
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/icelake/cache.json
@@ -0,0 +1,552 @@
+[
+{
+"CollectPEBSRecord": "2",
+"PublicDescription": "Counts the number of demand Data Read requests 
that miss L2 cache. Only not rejected loads are counted.",
+"EventCode": "0x24",
+"Counter": "0,1,2,3",
+"UMask": "0x21",
+"PEBScounters": "0,1,2,3",
+"EventName": "L2_RQSTS.DEMAND_DATA_RD_MISS",
+"SampleAfterValue": "23",
+"BriefDescription": "Demand Data Read miss L2, no rejects"
+},
+{
+"CollectPEBSRecord": "2",
+"PublicDescription": "Counts the RFO (Read-for-Ownership) requests 
that miss L2 cache.",
+"EventCode": "0x24",
+"Counter": "0,1,2,3",
+"UMask": "0x22",
+"PEBScounters": "0,1,2,3",
+"EventName": "L2_RQSTS.RFO_MISS",
+"SampleAfterValue": "23",
+"BriefDescription": "RFO requests that miss L2 cache"
+},
+{
+"CollectPEBSRecord": "2",
+"PublicDescription": "Counts L2 cache misses when fetching 
instructions.",
+"EventCode": "0x24",
+"Counter": "0,1,2,3",
+"UMask": "0x24",
+"PEBScounters": "0,1,2,3",
+"EventName": "L2_RQSTS.CODE_RD_MISS",
+"SampleAfterValue": "23",
+"BriefDescription": "L2 cache misses when fetching instructions"
+},
+{
+"CollectPEBSRecord": "2",
+"PublicDescription": "Counts demand requests that miss L2 cache.",
+"EventCode": "0x24",
+"Counter": "0,1,2,3",
+"UMask": "0x27",
+"PEBScounters": "0,1,2,3",
+"EventName": "L2_RQSTS.ALL_DEMAND_MISS",
+"SampleAfterValue": "23",
+"BriefDescription": "Demand requests that miss L2 cache"
+},
+{
+"CollectPEBSRecord": "2",
+"PublicDescription": "Counts Software prefetch requests that miss the 
L2 cache. This event accounts for PREFETCHNTA and PREFETCHT0/1/2 instructions.",
+"EventCode": "0x24",
+"Counter": "0,1,2,3",
+"UMask": "0x28",
+"PEBScounters": "0,1,2,3",
+"EventName": "L2_RQSTS.SWPF_MISS",
+"SampleAfterValue": "23",
+"BriefDescription": "SW prefetch requests that miss L2 cache."
+},
+{
+"CollectPEBSRecord": "2",
+"PublicDescription": "Counts the number of demand Data Read requests 
initiated by load instructions that hit L2 cache.",
+"EventCode": "0x24",
+"Counter": "0,1,2,3",
+"UMask": "0xc1",
+"PEBScounters": "0,1,2,3",
+"EventName": "L2_RQSTS.DEMAND_DATA_RD_HIT",
+"SampleAfterValue": "23",
+"BriefDescription": "Demand Data Read requests that hit L2 cache"
+},
+{
+"CollectPEBSRecord": "2",
+"PublicDescription": "Counts the RFO (Read-for-Ownership) requests 
that hit L2 cache.",
+"EventCode": "0x24",
+"Counter": "0,1,2,3",
+"UMask": "0xc2",
+"PEBScounters": "0,1,2,3",
+"EventName": "L2_RQSTS.RFO_HIT",
+"SampleAfterValue": "23",
+"BriefDescription": "RFO requests that hit L2 cache"
+},
+{
+"CollectPEBSRecord": "2",
+"PublicDescription": "Counts L2 cache hits when fetching instructions, 
code reads.",
+"EventCode": "0x24",
+"Counter": "0,1,2,3",
+"UMask": "0xc4",
+"PEBScounters": "0,1,2,3",
+"EventName": "L2_RQSTS.CODE_RD_HIT",
+"SampleAfterValue": "23",
+"BriefDescription": "L2 cache hits when fetching instructions, code 
reads."
+},
+{
+"CollectPEBSRecord": "2",
+

[PATCH V4 08/23] perf/x86/intel/cstate: Add Icelake support

2019-03-26 Thread kan . liang
From: Kan Liang 

Icelake uses the same C-state residency events as Sandy Bridge.

Signed-off-by: Kan Liang 
---

No changes since V3.

 arch/x86/events/intel/cstate.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c
index 94a4b7fc75d0..dd5658ec31d5 100644
--- a/arch/x86/events/intel/cstate.c
+++ b/arch/x86/events/intel/cstate.c
@@ -578,6 +578,8 @@ static const struct x86_cpu_id intel_cstates_match[] 
__initconst = {
X86_CSTATES_MODEL(INTEL_FAM6_ATOM_GOLDMONT_X, glm_cstates),
 
X86_CSTATES_MODEL(INTEL_FAM6_ATOM_GOLDMONT_PLUS, glm_cstates),
+
+   X86_CSTATES_MODEL(INTEL_FAM6_ICELAKE_MOBILE, snb_cstates),
{ },
 };
 MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match);
-- 
2.17.1



[PATCH V4 01/23] perf/x86: Support outputting XMM registers

2019-03-26 Thread kan . liang
From: Kan Liang 

Starting from Icelake, XMM registers can be collected in PEBS record.
But the current code only outputs the pt_regs.

Add a new struct x86_perf_regs for both pt_regs and xmm_regs.
XMM registers are 128 bit. To simplify the code, they are handled like
two different registers, which means setting two bits in the register
bitmap. This also allows sampling only the lower 64 bits of an XMM register.

The index of XMM registers starts from 32. There are 16 XMM registers.
So all of the reserved space for regs is used. Remove REG_RESERVED.

Add PERF_REG_X86_XMM_MAX, which stands for the max number of all x86
regs including both GPRs and XMM.

XMM is not supported on all platforms. Add has_xmm_regs to indicate
whether the platform supports it. Also add checks in x86_pmu_hw_config() to reject
invalid config of regs_user and regs_intr.

Add REG_NOSUPPORT for 32bit to exclude unsupported registers.

Originally-by: Andi Kleen 
Suggested-by: Peter Zijlstra (Intel) 
Signed-off-by: Kan Liang 
---

Changes since V3:
- Keep the old names for GPRs. Rename PERF_REG_X86_MAX to
  PERF_REG_X86_XMM_MAX
- Remove unnecessary REG_RESERVED
- Add REG_NOSUPPORT for 32bit

 arch/x86/events/core.c| 10 ++
 arch/x86/events/perf_event.h  |  2 ++
 arch/x86/include/asm/perf_event.h |  5 +
 arch/x86/include/uapi/asm/perf_regs.h | 23 ++-
 arch/x86/kernel/perf_regs.c   | 27 ---
 5 files changed, 59 insertions(+), 8 deletions(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index e2b1447192a8..9378c6b2128f 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -560,6 +560,16 @@ int x86_pmu_hw_config(struct perf_event *event)
return -EINVAL;
}
 
+   if (event->attr.sample_regs_user & ~PEBS_REGS)
+   return -EINVAL;
+   /*
+* Besides the general purpose registers, XMM registers may
+* be collected in PEBS on some platforms, e.g. Icelake
+*/
+   if ((event->attr.sample_regs_intr & ~PEBS_REGS) &&
+   (!x86_pmu.has_xmm_regs || !event->attr.precise_ip))
+   return -EINVAL;
+
return x86_setup_perfctr(event);
 }
 
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index a75955741c50..6428941a5073 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -657,6 +657,8 @@ struct x86_pmu {
 * Check period value for PERF_EVENT_IOC_PERIOD ioctl.
 */
int (*check_period) (struct perf_event *event, u64 period);
+
+   unsigned int has_xmm_regs : 1; /* support XMM regs */
 };
 
 struct x86_perf_task_context {
diff --git a/arch/x86/include/asm/perf_event.h 
b/arch/x86/include/asm/perf_event.h
index 8bdf74902293..d9f5bbe44b3c 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -248,6 +248,11 @@ extern void perf_events_lapic_init(void);
 #define PERF_EFLAGS_VM (1UL << 5)
 
 struct pt_regs;
+struct x86_perf_regs {
+   struct pt_regs  regs;
+   u64 *xmm_regs;
+};
+
 extern unsigned long perf_instruction_pointer(struct pt_regs *regs);
 extern unsigned long perf_misc_flags(struct pt_regs *regs);
 #define perf_misc_flags(regs)  perf_misc_flags(regs)
diff --git a/arch/x86/include/uapi/asm/perf_regs.h 
b/arch/x86/include/uapi/asm/perf_regs.h
index f3329cabce5c..ac67bbea10ca 100644
--- a/arch/x86/include/uapi/asm/perf_regs.h
+++ b/arch/x86/include/uapi/asm/perf_regs.h
@@ -27,8 +27,29 @@ enum perf_event_x86_regs {
PERF_REG_X86_R13,
PERF_REG_X86_R14,
PERF_REG_X86_R15,
-
+   /* These are the limits for the GPRs. */
PERF_REG_X86_32_MAX = PERF_REG_X86_GS + 1,
PERF_REG_X86_64_MAX = PERF_REG_X86_R15 + 1,
+
+   /* These all need two bits set because they are 128bit */
+   PERF_REG_X86_XMM0  = 32,
+   PERF_REG_X86_XMM1  = 34,
+   PERF_REG_X86_XMM2  = 36,
+   PERF_REG_X86_XMM3  = 38,
+   PERF_REG_X86_XMM4  = 40,
+   PERF_REG_X86_XMM5  = 42,
+   PERF_REG_X86_XMM6  = 44,
+   PERF_REG_X86_XMM7  = 46,
+   PERF_REG_X86_XMM8  = 48,
+   PERF_REG_X86_XMM9  = 50,
+   PERF_REG_X86_XMM10 = 52,
+   PERF_REG_X86_XMM11 = 54,
+   PERF_REG_X86_XMM12 = 56,
+   PERF_REG_X86_XMM13 = 58,
+   PERF_REG_X86_XMM14 = 60,
+   PERF_REG_X86_XMM15 = 62,
+
+   /* These include both GPRs and XMM registers */
+   PERF_REG_X86_XMM_MAX = PERF_REG_X86_XMM15 + 2,
 };
 #endif /* _ASM_X86_PERF_REGS_H */
diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c
index c06c4c16c6b6..07c30ee17425 100644
--- a/arch/x86/kernel/perf_regs.c
+++ b/arch/x86/kernel/perf_regs.c
@@ -59,18 +59,34 @@ static unsigned int pt_regs_offset[PERF_REG_X86_MAX] = {
 
 u64 perf_reg_value(struct pt_regs *regs, int idx)
 {
+   struct x86_perf_regs *perf_regs;
+
+   if (idx >= PERF_REG_X86_XMM0 && idx < PERF_REG_X86_XMM_MAX) {
+   perf_regs = 

[PATCH V4 00/23] perf: Add Icelake support

2019-03-26 Thread kan . liang
From: Kan Liang 

The patch series intends to add Icelake support for Linux perf.

PATCH 1-18: Kernel patches to support Icelake.
 - 1-5: Support adaptive PEBS feature
 - 6-7: Enable core support with some new features, e.g. 8 generic
   counters, new event constraints, a new fixed counter.
 - 8-11: Enable cstate, rapl, msr and uncore support on Icelake
 - 12-18: Support hardware Metrics counters and SLOT fixed counter for
   Topdown events.
 - 19: Support CPUID 10.ECX to disable fixed counters

PATCH 20-23: Perf tool patches to support XMM, Topdown and event list.

Changes since V3:
- Keep the old names for GPRs. Rename PERF_REG_X86_MAX to
  PERF_REG_X86_XMM_MAX
- Remove unnecessary REG_RESERVED
- Add REG_NOSUPPORT for 32bit

Changes since V2:
- Make the setup_pebs_sample_data() a function pointer argument
- Use cpuc->pebs_record_size unconditionally
- Add comments for EVENT_CONSTRAINT_RANGE
- Correct the Author of "perf/x86: Support constraint ranges"

Changes since V1:
- Avoid the interface changes for perf_reg_value() and
  perf_output_sample_regs().
- Remove the extra_regs in struct perf_sample_data.
- Add struct x86_perf_regs
- Add has_xmm_regs to indicate the specific platform which support XMM
  registers collection.
- Add check in x86_pmu_hw_config() to reject invalid config of regs_user
  and regs_intr.
- Rename intel_hsw_weight and intel_hsw_transaction
- Add missed inline for intel_get_tsx_transaction()
- Add new patch to extract code of event update in short period
- Code rebase on top of c634dc6bdede
- Rename @d to pebs_data_cfg
- Make pebs_update_adaptive_cfg readable
- Clear pebs_data_cfg and pebs_record_size for first PEBS in add
- Don't clear ICL_EVENTSEL_ADAPTIVE. Rely on MSR_PEBS_CFG settings
- Change PEBS record parsing order (bug fix)
- Support struct x86_perf_regs
- make get_pebs_status generic
- specific intel_pmu_drain_pebs_icl()
- Use cpuc->pebs_record_size to replace format_size
- Use 'size' to replace 'range_end' for constraint ranges
- Add x86_pmu.has_xmm_regs = true;
- Add more explanation in change log of REMOVE transaction
- Make perf_regs.h consistent between kernel and user space

Andi Kleen (11):
  perf/x86/intel: Extract memory code PEBS parser for reuse
  perf/x86/lbr: Avoid reading the LBRs when adaptive PEBS handles them
  perf/core: Support a REMOVE transaction
  perf/x86/intel: Basic support for metrics counters
  perf/x86/intel: Support overflows on SLOTS
  perf/x86/intel: Set correct weight for topdown subevent counters
  perf/x86/intel: Export new top down events for Icelake
  perf/x86/intel: Support CPUID 10.ECX to disable fixed counters
  perf, tools: Add support for recording and printing XMM registers
  perf, tools, stat: Support new per thread TopDown metrics
  perf, tools: Add documentation for topdown metrics

Kan Liang (11):
  perf/x86: Support outputting XMM registers
  perf/x86/intel/ds: Extract code of event update in short period
  perf/x86/intel: Support adaptive PEBSv4
  perf/x86/intel: Add Icelake support
  perf/x86/intel/cstate: Add Icelake support
  perf/x86/intel/rapl: Add Icelake support
  perf/x86/msr: Add Icelake support
  perf/x86/intel/uncore: Add Intel Icelake uncore support
  perf/x86/intel: Support hardware TopDown metrics
  perf/x86/intel: Disable sampling read slots and topdown
  perf vendor events intel: Add JSON files for Icelake

Peter Zijlstra (1):
  perf/x86: Support constraint ranges

 arch/x86/events/core.c|  81 +-
 arch/x86/events/intel/core.c  | 422 -
 arch/x86/events/intel/cstate.c|   2 +
 arch/x86/events/intel/ds.c| 496 --
 arch/x86/events/intel/lbr.c   |  35 +-
 arch/x86/events/intel/rapl.c  |   2 +
 arch/x86/events/intel/uncore.c|   6 +
 arch/x86/events/intel/uncore.h|   1 +
 arch/x86/events/intel/uncore_snb.c|  91 ++
 arch/x86/events/msr.c |   1 +
 arch/x86/events/perf_event.h  |  93 +-
 arch/x86/include/asm/intel_ds.h   |   2 +-
 arch/x86/include/asm/msr-index.h  |   4 +
 arch/x86/include/asm/perf_event.h |  79 +-
 arch/x86/include/uapi/asm/perf_regs.h |  23 +-
 arch/x86/kernel/perf_regs.c   |  27 +-
 include/linux/perf_event.h|   7 +
 kernel/events/core.c  |   5 +
 tools/arch/x86/include/uapi/asm/perf_regs.h   |  23 +-
 tools/perf/Documentation/perf-stat.txt|   9 +-
 tools/perf/Documentation/topdown.txt  | 223 +
 tools/perf/arch/x86/include/perf_regs.h   |  25 +-
 tools/perf/arch/x86/util/perf_regs.c  |  16 +
 tools/perf/builtin-stat.c |  24 +
 .../pmu-events/arch/x86/icelake/cache.json| 552 +++
 .../arch/x86/icelake/floating-point.json  |  90 ++
 .../pmu-events/arch/x86/icelake/frontend.json | 424 +
 

[PATCH V4 05/23] perf/x86/lbr: Avoid reading the LBRs when adaptive PEBS handles them

2019-03-26 Thread kan . liang
From: Andi Kleen 

With adaptive PEBS the CPU can directly supply the LBR information,
so we don't need to read it again. But the LBRs still need to be
enabled. Add a special count to the cpuc that distinguishes these
two cases, and avoid reading the LBRs unnecessarily when PEBS is
active.

Signed-off-by: Andi Kleen 
Signed-off-by: Kan Liang 
---

No changes since V3.

 arch/x86/events/intel/lbr.c  | 13 -
 arch/x86/events/perf_event.h |  1 +
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index 07b7175fc378..6f814a27416b 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -488,6 +488,8 @@ void intel_pmu_lbr_add(struct perf_event *event)
 * be 'new'. Conversely, a new event can get installed through the
 * context switch path for the first time.
 */
+   if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip > 0)
+   cpuc->lbr_pebs_users++;
perf_sched_cb_inc(event->ctx->pmu);
if (!cpuc->lbr_users++ && !event->total_time_running)
intel_pmu_lbr_reset();
@@ -507,8 +509,11 @@ void intel_pmu_lbr_del(struct perf_event *event)
task_ctx->lbr_callstack_users--;
}
 
+   if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip > 0)
+   cpuc->lbr_pebs_users--;
cpuc->lbr_users--;
WARN_ON_ONCE(cpuc->lbr_users < 0);
+   WARN_ON_ONCE(cpuc->lbr_pebs_users < 0);
perf_sched_cb_dec(event->ctx->pmu);
 }
 
@@ -658,7 +663,13 @@ void intel_pmu_lbr_read(void)
 {
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 
-   if (!cpuc->lbr_users)
+   /*
+* Don't read when all LBRs users are using adaptive PEBS.
+*
+* This could be smarter and actually check the event,
+* but this simple approach seems to work for now.
+*/
+   if (!cpuc->lbr_users || cpuc->lbr_users == cpuc->lbr_pebs_users)
return;
 
if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32)
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 4ca20078ca92..f2351e47de3d 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -216,6 +216,7 @@ struct cpu_hw_events {
 * Intel LBR bits
 */
int lbr_users;
+   int lbr_pebs_users;
struct perf_branch_stack lbr_stack;
struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES];
struct er_account   *lbr_sel;
-- 
2.17.1



[tip:x86/cleanups] x86/platform/uv: Fix missing checks of kcalloc() return values

2019-03-26 Thread tip-bot for Kangjie Lu
Commit-ID:  766460852cfaeca4042e5f3aeb9616b3689147bc
Gitweb: https://git.kernel.org/tip/766460852cfaeca4042e5f3aeb9616b3689147bc
Author: Kangjie Lu 
AuthorDate: Mon, 25 Mar 2019 15:29:22 -0500
Committer:  Borislav Petkov 
CommitDate: Tue, 26 Mar 2019 17:01:30 +0100

x86/platform/uv: Fix missing checks of kcalloc() return values

Handle potential errors returned from kcalloc().

 [ bp: rewrite commit message. ]

Signed-off-by: Kangjie Lu 
Signed-off-by: Borislav Petkov 
Cc: Andrew Banman 
Cc: Andy Shevchenko 
Cc: Colin Ian King 
Cc: Darren Hart 
Cc: "Gustavo A. R. Silva" 
Cc: "H. Peter Anvin" 
Cc: Ingo Molnar 
Cc: Kees Cook 
Cc: Mike Travis 
Cc: Nicolai Stange 
Cc: pakki...@umn.edu
Cc: platform-driver-...@vger.kernel.org
Cc: Thomas Gleixner 
Cc: Varsha Rao 
Cc: x86-ml 
Link: https://lkml.kernel.org/r/20190325202924.4624-1-k...@umn.edu
---
 arch/x86/platform/uv/tlb_uv.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index 2c53b0f19329..1297e185b8c8 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -2133,14 +2133,19 @@ static int __init summarize_uvhub_sockets(int nuvhubs,
  */
 static int __init init_per_cpu(int nuvhubs, int base_part_pnode)
 {
-   unsigned char *uvhub_mask;
struct uvhub_desc *uvhub_descs;
+   unsigned char *uvhub_mask = NULL;
 
if (is_uv3_hub() || is_uv2_hub() || is_uv1_hub())
timeout_us = calculate_destination_timeout();
 
uvhub_descs = kcalloc(nuvhubs, sizeof(struct uvhub_desc), GFP_KERNEL);
+   if (!uvhub_descs)
+   goto fail;
+
uvhub_mask = kzalloc((nuvhubs+7)/8, GFP_KERNEL);
+   if (!uvhub_mask)
+   goto fail;
 
if (get_cpu_topology(base_part_pnode, uvhub_descs, uvhub_mask))
goto fail;


Re: [PATCH v2 1/2] phy: qcom-qmp: Add msm8998 PCIe QMP PHY support

2019-03-26 Thread Marc Gonzalez
On 26/03/2019 14:24, Marc Gonzalez wrote:

> +static const struct qmp_phy_cfg msm8998_pciephy_cfg = {
> + .type   = PHY_TYPE_PCIE,
> + .nlanes = 1,
> +
> + .serdes_tbl = msm8998_pcie_serdes_tbl,
> + .serdes_tbl_num = ARRAY_SIZE(msm8998_pcie_serdes_tbl),
> + .tx_tbl = msm8998_pcie_tx_tbl,
> + .tx_tbl_num = ARRAY_SIZE(msm8998_pcie_tx_tbl),
> + .rx_tbl = msm8998_pcie_rx_tbl,
> + .rx_tbl_num = ARRAY_SIZE(msm8998_pcie_rx_tbl),
> + .pcs_tbl= msm8998_pcie_pcs_tbl,
> + .pcs_tbl_num= ARRAY_SIZE(msm8998_pcie_pcs_tbl),
> + .clk_list   = msm8996_phy_clk_l,
> + .num_clks   = ARRAY_SIZE(msm8996_phy_clk_l),
> + .reset_list = msm8996_pciephy_reset_l,
> + .num_resets = ARRAY_SIZE(msm8996_pciephy_reset_l),

Looking more closely at the code downstream, it looks like the
reset situation is slightly different. Let me spin a v3.

Regards.


Re: [PATCH v2] x86/syscalls: Mark expected switch fall-throughs

2019-03-26 Thread Thomas Gleixner
On Tue, 26 Mar 2019, Oleg Nesterov wrote:
> On 03/23, Thomas Gleixner wrote:
> >
> > On Thu, 28 Feb 2019, Gustavo A. R. Silva wrote:
> > 
> > >  arch/x86/include/asm/syscall.h | 28 
> > >  1 file changed, 28 insertions(+)
> > 
> > Second thoughts. So this adds 28 /* fall through */ comments. Now I
> > appreciate the effort, but can we pretty please look at the code in
> > question and figure out whether the implementation makes sense in the first
> > place before adding falltrough comments blindly?
> > 
> > The whole exercise can be simplified. Untested patch below.
> > 
> > Looking at that stuff makes me wonder about two things:
> > 
> >  1) The third argument of get/set(), i.e. the argument offset, is 0 on all
> > call sites. Do we need it at all?
> 
> Probably "maxargs" can be removed too, Steven sent the patches a long ago, see
> https://lore.kernel.org/lkml/20161107212634.529267...@goodmis.org/

Indeed. We should resurrect them.

> >  2) syscall_set_arguments() has been introduced in 2008 and we still have
> > no caller. Instead of polishing it, can it be removed completely or are
> > there plans to actually use it?
> 
> I think it can die.

Good. Removed code is the least buggy code :)

Gustavo, it would be really appreciated if you could take care of that,
unless Steven wants to polish his old set up himself. If you have no
cycles, please let us know.

Thanks,

tglx



Re: [PATCH 2/4] pid: add pidctl()

2019-03-26 Thread Joel Fernandes
On Mon, Mar 25, 2019 at 07:18:42PM +0100, Jann Horn wrote:
> On Mon, Mar 25, 2019 at 5:21 PM Christian Brauner  
> wrote:
> > The pidctl() syscalls builds on, extends, and improves translate_pid() [4].
> > I quote Konstantins original patchset first that has already been acked and
> > picked up by Eric before and whose functionality is preserved in this
> > syscall:
> [...]
> > +
> > +static struct pid_namespace *get_pid_ns_by_fd(int fd)
> > +{
> > +   struct pid_namespace *pidns = ERR_PTR(-EINVAL);
> > +
> > +   if (fd >= 0) {
> > +#ifdef CONFIG_PID_NS
> > +   struct ns_common *ns;
> > +   struct file *file = proc_ns_fget(fd);
> > +   if (IS_ERR(file))
> > +   return ERR_CAST(file);
> > +
> > +   ns = get_proc_ns(file_inode(file));
> > +   if (ns->ops->type == CLONE_NEWPID)
> > +   pidns = get_pid_ns(
> > +   container_of(ns, struct pid_namespace, ns));
> 
> This increments the refcount of the pidns...
> 
> > +
> > +   fput(file);
> > +#endif
> > +   } else {
> > +   pidns = task_active_pid_ns(current);
> 
> ... but this doesn't. That's pretty subtle; could you please put a
> comment on top of this function that points this out? Or even better,
> change the function to always take a reference, so that the caller
> doesn't have to worry about figuring this out.
> 
> > +   }
> > +
> > +   return pidns;
> > +}
> [...]
> > +SYSCALL_DEFINE5(pidctl, unsigned int, cmd, pid_t, pid, int, source, int, 
> > target,
> > +   unsigned int, flags)
> > +{
> > +   struct pid_namespace *source_ns = NULL, *target_ns = NULL;
> > +   struct pid *struct_pid;
> > +   pid_t result;
> > +
> > +   switch (cmd) {
> > +   case PIDCMD_QUERY_PIDNS:
> > +   if (pid != 0)
> > +   return -EINVAL;
> > +   pid = 1;
> > +   /* fall through */
> > +   case PIDCMD_QUERY_PID:
> > +   if (flags != 0)
> > +   return -EINVAL;
> > +   break;
> > +   case PIDCMD_GET_PIDFD:
> > +   if (flags & ~PIDCTL_CLOEXEC)
> > +   return -EINVAL;
> > +   break;
> > +   default:
> > +   return -EOPNOTSUPP;
> > +   }
> > +
> > +   source_ns = get_pid_ns_by_fd(source);
> > +   result = PTR_ERR(source_ns);
> 
> I very much dislike using PTR_ERR() on pointers before checking
> whether they contain an error value or not. I understand that the
> result of this won't actually be used, but it still seems weird to
> have what is essentially a cast of a potentially valid pointer to a
> potentially smaller integer here.
> 
> Could you maybe move the PTR_ERR() into the error branch? Like so:
> 
> if (IS_ERR(source_ns)) {
>   result = PTR_ERR(source_ns);
>   goto err_source;
> }

FWIW, thought of mentioning that once the get_pid_ns_by_fd can be modified to
always take a reference on the ns, a further simplification here could be:

if (IS_ERR(source_ns)) {
result = PTR_ERR(source_ns);
source_ns = NULL;
goto error;
}

if (IS_ERR(target_ns)) {
result = PTR_ERR(target_ns);
target_ns = NULL;
goto error;
}

And the error path can be simplified as well, which also avoids the
"if (target)" issues Jann mentioned in the error path:

error:
if (source_ns)
put_pid_ns(source_ns);
if (target_ns)
put_pid_ns(target_ns);
return result;

 
> > +   if (IS_ERR(source_ns))
> > +   goto err_source;
> > +
> > +   target_ns = get_pid_ns_by_fd(target);
> > +   result = PTR_ERR(target_ns);
> > +   if (IS_ERR(target_ns))
> > +   goto err_target;
> > +
> > +   if (cmd == PIDCMD_QUERY_PIDNS) {
> > +   result = pidns_related(source_ns, target_ns);
> > +   } else {
> > +   rcu_read_lock();
> > +   struct_pid = find_pid_ns(pid, source_ns);
> 
> find_pid_ns() doesn't take a reference on its return value, the return
> value is only pinned into memory by the RCU read-side critical
> section...
> 
> > +   result = struct_pid ? pid_nr_ns(struct_pid, target_ns) : 
> > -ESRCH;
> > +   rcu_read_unlock();
> 
> ... which ends here, making struct_pid a dangling pointer...
> 
> > +
> > +   if (cmd == PIDCMD_GET_PIDFD) {
> > +   int cloexec = (flags & PIDCTL_CLOEXEC) ? O_CLOEXEC 
> > : 0;
> > +   if (result > 0)
> > +   result = pidfd_create_fd(struct_pid, 
> > cloexec);
> 
> ... and then here you continue to use struct_pid. That seems bogus.

Absolutely.

> > +   else if (result == 0)
> > +   result = -ENOENT;
> 
> You don't need to have flags for this for new syscalls, you can just
> make everything 

Re: [PATCH] efi: Downgrade "EFI_MEMMAP is not enabled" message

2019-03-26 Thread Takashi Iwai
On Tue, 26 Mar 2019 17:04:30 +0100,
Ard Biesheuvel wrote:
> 
> On Tue, 26 Mar 2019 at 16:25, Takashi Iwai  wrote:
> >
> > On Fri, 01 Mar 2019 16:27:24 +0100,
> > Takashi Iwai wrote:
> > >
> > > On Fri, 01 Mar 2019 15:57:03 +0100,
> > > Ard Biesheuvel wrote:
> > > >
> > > > On Fri, 1 Mar 2019 at 15:14, Takashi Iwai  wrote:
> > > > >
> > > > > On Fri, 01 Mar 2019 15:02:23 +0100,
> > > > > Ard Biesheuvel wrote:
> > > > > >
> > > > > > On Fri, 1 Mar 2019 at 15:01, Takashi Iwai  wrote:
> > > > > > >
> > > > > > > On Fri, 01 Mar 2019 14:53:39 +0100,
> > > > > > > Ard Biesheuvel wrote:
> > > > > > > >
> > > > > > > > On Fri, 1 Mar 2019 at 14:40, Takashi Iwai  wrote:
> > > > > > > > >
> > > > > > > > > Since 38ac0287b7f4 ("fbdev/efifb: Honour UEFI memory map 
> > > > > > > > > attributes
> > > > > > > > > when mapping the FB"), efifb_probe() checks its memory range 
> > > > > > > > > via
> > > > > > > > > efi_mem_desc_lookup(), and this leads to a spurious error 
> > > > > > > > > message
> > > > > > > > > "EFI_MEMMAP is not enabled" at every boot on KVM.  This is 
> > > > > > > > > quite
> > > > > > > > > annoying since the error message appears even if you set 
> > > > > > > > > "quiet" boot
> > > > > > > > > option.
> > > > > > > > >
> > > > > > > > > Actually there are only a few places that call 
> > > > > > > > > efi_mem_desc_lookup()
> > > > > > > > > function, and the other callers do give the explicit error 
> > > > > > > > > messages
> > > > > > > > > when the function returns an error in anyway.  That is, the 
> > > > > > > > > error
> > > > > > > > > message in the function is more or less moot.
> > > > > > > > >
> > > > > > > > > So let's downgrade the error message for stop annoying users.
> > > > > > > > >
> > > > > > > > > Fixes: 38ac0287b7f4 ("fbdev/efifb: Honour UEFI memory map 
> > > > > > > > > attributes when mapping the FB")
> > > > > > > > > Bugzilla: https://bugzilla.suse.com/show_bug.cgi?id=1127339
> > > > > > > > > Signed-off-by: Takashi Iwai 
> > > > > > > > > ---
> > > > > > > > >  drivers/firmware/efi/efi.c | 2 +-
> > > > > > > > >  1 file changed, 1 insertion(+), 1 deletion(-)
> > > > > > > > >
> > > > > > > > > diff --git a/drivers/firmware/efi/efi.c 
> > > > > > > > > b/drivers/firmware/efi/efi.c
> > > > > > > > > index 55b77c576c42..50ac33097458 100644
> > > > > > > > > --- a/drivers/firmware/efi/efi.c
> > > > > > > > > +++ b/drivers/firmware/efi/efi.c
> > > > > > > > > @@ -409,7 +409,7 @@ int efi_mem_desc_lookup(u64 phys_addr, 
> > > > > > > > > efi_memory_desc_t *out_md)
> > > > > > > > > efi_memory_desc_t *md;
> > > > > > > > >
> > > > > > > > > if (!efi_enabled(EFI_MEMMAP)) {
> > > > > > > > > -   pr_err_once("EFI_MEMMAP is not enabled.\n");
> > > > > > > > > +   pr_debug("EFI_MEMMAP is not enabled.\n");
> > > > > > > > > return -EINVAL;
> > > > > > > > > }
> > > > > > > > >
> > > > > > > >
> > > > > > > > efifb_probe() only calls efi_mem_desc_lookup() if
> > > > > > > > screen_info.orig_video_isVGA == VIDEO_TYPE_EFI, which only gets
> > > > > > > > assigned on a EFI boot.
> > > > > > > >
> > > > > > > > So even though I don't object to the patch as is, I would like 
> > > > > > > > to
> > > > > > > > understand where this error message is coming from, given that 
> > > > > > > > it
> > > > > > > > means that you are running on a UEFI system without the EFI 
> > > > > > > > memory
> > > > > > > > map.
> > > > > > > >
> > > > > > > > Is this system booting via GRUB in EFI mode?
> > > > > > >
> > > > > > > No, it's booted in legacy boot mode.  But the primary fb is 
> > > > > > > efifb, and
> > > > > > > that's why the message appears.
> > > > > > >
> > > > > >
> > > > > > So how are we ending up with
> > > > > >
> > > > > > screen_info.orig_video_isVGA == VIDEO_TYPE_EFI
> > > > > >
> > > > > > ??
> > > > >
> > > > > Ah, sorry, my description was too ambiguous.
> > > > >
> > > > > Actually our GRUB2 default setup boots the Linux kernel with linuxefi.
> > > > > What I meant was that I invoked qemu-kvm without any -bios option, so
> > > > > it's no EFI BIOS.
> > > > >
> > > >
> > > > Some from the link here
> > > >
> > > > https://openqa.opensuse.org/tests/864184/file/journal_check-full_journal.log
> > > >
> > > > I got
> > > >
> > > > Feb 27 13:13:41 linux-e2c3 kernel: efifb: probing for efifb
> > > > Feb 27 13:13:41 linux-e2c3 kernel: efi: EFI_MEMMAP is not enabled.
> > > > Feb 27 13:13:41 linux-e2c3 kernel: fbcon: Taking over console
> > > > Feb 27 13:13:41 linux-e2c3 kernel: efifb: No BGRT, not showing boot 
> > > > graphics
> > > > Feb 27 13:13:41 linux-e2c3 kernel: efifb: framebuffer at 0xfc00,
> > > > using 1408k, total 1408k
> > > > Feb 27 13:13:41 linux-e2c3 kernel: efifb: mode is 800x600x24,
> > > > linelength=2400, pages=1
> > > > Feb 27 13:13:41 linux-e2c3 kernel: efifb: scrolling: redraw
> > > > Feb 27 13:13:41 linux-e2c3 kernel: efifb: Truecolor: size=0:8:8:8,
> > > > shift=0:16:8:0
> > > > Feb 27 

Re: [PATCH v3] kmemleaak: survive in a low-memory situation

2019-03-26 Thread Catalin Marinas
On Tue, Mar 26, 2019 at 11:43:38AM -0400, Qian Cai wrote:
> Kmemleak could quickly fail to allocate an object structure and then
> disable itself in a low-memory situation. For example, running a mmap()
> workload triggering swapping and OOM. This is especially problematic for
> running things like LTP testsuite where one OOM test case would disable
> the whole kmemleak and render the rest of test cases without kmemleak
> watching for leaking.
> 
> Kmemleak allocation could fail even though the tracked memory is
> succeeded. Hence, it could still try to start a direct reclaim if it is
> not executed in an atomic context (spinlock, irq-handler etc), or a
> high-priority allocation in an atomic context as a last-ditch effort.
> Since kmemleak is a debug feature, it is unlikely to be used in
> production that memory resources is scarce where direct reclaim or
> high-priority atomic allocations should not be granted lightly.
> 
> Unless there is a brave soul to reimplement the kmemleak to embed it's
> metadata into the tracked memory itself in a foreseeable future, this
> provides a good balance between enabling kmemleak in a low-memory
> situation and not introducing too much hackiness into the existing
> code for now.

Embedding the metadata would help with the slab allocations (though not
with vmalloc) but it comes with its own potential issues. There are some
bits of kmemleak that rely on deferred freeing of metadata for RCU
traversal, so this wouldn't go well with embedding it.

I wonder whether we'd be better off to replace the metadata allocator
with gen_pool. This way we'd also get rid of early logging/replaying of
the memory allocations since we can populate the gen_pool early with a
static buffer.

-- 
Catalin


Re: [PATCH v3] kmemleaak: survive in a low-memory situation

2019-03-26 Thread Matthew Wilcox
On Tue, Mar 26, 2019 at 11:43:38AM -0400, Qian Cai wrote:
> Unless there is a brave soul to reimplement the kmemleak to embed its
> metadata into the tracked memory itself in a foreseeable future, this
> provides a good balance between enabling kmemleak in a low-memory
> situation and not introducing too much hackiness into the existing
> code for now.

I don't understand kmemleak.  Kirill pointed me at this a few days ago:

https://gist.github.com/kiryl/3225e235fea390aa2e49bf625bbe83ec

It's caused by the XArray allocating memory using GFP_NOWAIT | __GFP_NOWARN.
kmemleak then decides it needs to allocate memory to track this memory.
So it calls kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp));

#define gfp_kmemleak_mask(gfp)  (((gfp) & (GFP_KERNEL | GFP_ATOMIC)) | \
 __GFP_NORETRY | __GFP_NOMEMALLOC | \
 __GFP_NOWARN | __GFP_NOFAIL)

then the page allocator gets to see GFP_NOFAIL | GFP_NOWAIT and gets angry.

But I don't understand why kmemleak needs to mess with the GFP flags at
all.  Just allocate using the same flags as the caller, and fail the original
allocation if the kmemleak allocation fails.  Like this:

+++ b/mm/slab.h
@@ -435,12 +435,22 @@ static inline void slab_post_alloc_hook(struct kmem_cache 
*s, gfp_t flags,
for (i = 0; i < size; i++) {
p[i] = kasan_slab_alloc(s, p[i], flags);
/* As p[i] might get tagged, call kmemleak hook after KASAN. */
-   kmemleak_alloc_recursive(p[i], s->object_size, 1,
-s->flags, flags);
+   if (kmemleak_alloc_recursive(p[i], s->object_size, 1,
+s->flags, flags))
+   goto fail;
}
 
if (memcg_kmem_enabled())
memcg_kmem_put_cache(s);
+   return;
+
+fail:
+   while (i > 0) {
+   kasan_blah(...);
+   kmemleak_blah();
+   i--;
+   }
+   free_blah(p);
+   *p = NULL;
 }
 
 #ifndef CONFIG_SLOB


and if we had something like this, we wouldn't need kmemleak to have this
self-disabling or must-succeed property.


Re: [PATCH 2/3] genirq/timings: Add array suffix computation code

2019-03-26 Thread Thomas Gleixner
Daniel,

On Tue, 26 Mar 2019, Daniel Lezcano wrote:
> >> +/*
> >> + * Exponential moving average computation
> >> + */
> >> +static int irq_timings_ema_new(s64 value, s64 ema_old)
> > 
> > There is a mixed bag of s64/u64 all over this code. Please stay
> > consistent. We had enough sign confusion bugs in the past.
> 
> Right.
> 
> I have a question, ema_old and value will be always u64 type and the
> function irq_timings_ema_new() will return an u64 ...
> 
> > value = (value - ema_old) * EMA_ALPHA_VAL;
> > return ema_old + value >> EMA_ALPHA_SHIFT;
> 
> ... how can I deal with the operations above when value < ema_old ?
> 
> Shall I use an intermediate s64 ?
> 
> eg:
> 
>   s64 aux = (value - ema_old) * EMA_ALPHA_VAL;
>   return ema_old + aux >> EMA_ALPHA_SHIFT;
> ?

That should work if ema_old is not ever having the topmost bit set :)

Thanks,

tglx


Re: [ANNOUNCE] v5.0.3-rt1

2019-03-26 Thread John Ogness
Hi Julien,

On 2019-03-26, Julien Grall  wrote:
>>> [...]
>>> [1.169151] 002: Serial: AMBA PL011 UART driver
>>> [1.254891] 002: 7ff8.uart: ttyAMA0 at MMIO 0x7ff8 (irq = 32, 
>>> base_baud = 0) is a PL011 rev3
>>> [1.255007] 002: printk: console [ttyAMA0] enabled
>>
>> The ttyAMA drivers do not have support for atomic printing, so it is
>> not the new atomic feature that is causing the mangling. For your
>> setup, all printk console printing is being handled within a specific
>> context, the printk kernel thread.
>
> This series is somehow making it worse when using ttyAMA0. I haven't seen
> any mangling with 4.19-rt.

I will setup some tests using systemd on 4.19 and 5.0 to see if I can
see what is going on. It seems there may be some synchronization missing
between the printk kernel thread and /dev/console.

John Ogness


Re: [PATCH] rcutorture: Select from only online CPUs

2019-03-26 Thread Paul E. McKenney
On Mon, Mar 25, 2019 at 06:40:17PM -0400, Joel Fernandes wrote:
> On Mon, Mar 25, 2019 at 12:42 PM Paul E. McKenney  
> wrote:
> >
> > On Mon, Mar 25, 2019 at 12:33:37PM -0400, Joel Fernandes wrote:
> > > On Mon, Mar 25, 2019 at 11:02 AM Paul E. McKenney  
> > > wrote:
> > > >
> > > > On Fri, Mar 22, 2019 at 11:46:19PM -0400, Joel Fernandes (Google) wrote:
> > > > > The rcutorture jitter.sh script selects a random CPU but does not 
> > > > > check
> > > > > if it is offline or online. This leads to taskset errors many times. 
> > > > > On
> > > > > my machine, hyper threading is disabled so half the cores are offline
> > > > > causing taskset errors a lot of times. Let us fix this by checking 
> > > > > from
> > > > > only the online CPUs on the system.
> > > > >
> > > > > Signed-off-by: Joel Fernandes (Google) 
> > > >
> > > > Good catch!
> > > >
> > > > Please see below for one suggestion for simplification.
> > > >
> > > > Thanx, Paul
> > > >
> > > > > ---
> > > > >  tools/testing/selftests/rcutorture/bin/jitter.sh | 11 ++-
> > > > >  1 file changed, 10 insertions(+), 1 deletion(-)
> > > > >
> > > > > diff --git a/tools/testing/selftests/rcutorture/bin/jitter.sh 
> > > > > b/tools/testing/selftests/rcutorture/bin/jitter.sh
> > > > > index 3633828375e3..53bf9d99b5cd 100755
> > > > > --- a/tools/testing/selftests/rcutorture/bin/jitter.sh
> > > > > +++ b/tools/testing/selftests/rcutorture/bin/jitter.sh
> > > > > @@ -47,10 +47,19 @@ do
> > > > >   exit 0;
> > > > >   fi
> > > > >
> > > > > - # Set affinity to randomly selected CPU
> > > > > + # Set affinity to randomly selected online CPU
> > > > >   cpus=`ls /sys/devices/system/cpu/*/online |
> > > >
> > > > cpus=`grep 1 /sys/devices/system/cpu/*/online |
> > >
> > > Yes, this is better. Lets do it this way :)
> > >
> > > > >   sed -e 's,/[^/]*$,,' -e 's/^[^0-9]*//' |
> > > > >   grep -v '^0*$'`
> > > >
> > > > Of course, now I have no idea why I excluded CPU 0...  :-/
> > >
> > > Yes, I was wondering as well about that :-)
> >
> > Please feel free to try including CPU 0 and running the set of single-CPU
> > rcutorture scenarios.  ;-)
> 
> Will do and then will update the patch by adding the CPU back, if all
> is well. Thanks.

And rcutorture doesn't like the rcu_is_cpu_rrupt_from_idle() patch on
scenarios SRCU-P, TASKS01, and TREE05, which are the Tree RCU scenarios
that enable CONFIG_PROVE_RCU.  The compiler error is:

kernel/rcu/tree.c:391:2: error: implicit declaration of function 
‘_this_cpu_read’ [-Werror=implicit-function-declaration]

My guess is that the initial underscore needs to go.  I will drop
these two patches in favor of an update from you.  ;-)

Thanx, Paul



Re: [PATCH v2] PCI: al: Add Amazon Annapurna Labs PCIe host controller driver

2019-03-26 Thread Lorenzo Pieralisi
On Tue, Mar 26, 2019 at 01:24:41PM +, David Woodhouse wrote:
> On Tue, 2019-03-26 at 12:17 +, Lorenzo Pieralisi wrote:
> > [+Zhou, Gustavo]
> > 
> > On Tue, Mar 26, 2019 at 12:00:55PM +0200, Jonathan Chocron wrote:
> > > Adding support for Amazon's Annapurna Labs PCIe driver.
> > > The HW controller is based on DesignWare's IP.
> > > 
> > > The HW doesn't support accessing the Root Port's config space via
> > > ECAM, so we obtain its base address via an AMZN0001 device.
> > > 
> > > Furthermore, the DesignWare PCIe controller doesn't filter out
> > > config transactions sent to devices 1 and up on its bus, so they
> > > are filtered by the driver.
> > > All subordinate buses do support ECAM access.
> > > 
> > > Implementing specific PCI config access functions involves:
> > >  - Adding an init function to obtain the Root Port's base address
> > >from an AMZN0001 device.
> > >  - Adding a new entry in the mcfg quirk array
> > > 
> > > Co-developed-by: Vladimir Aerov 
> > > Signed-off-by: Jonathan Chocron 
> > > Signed-off-by: Vladimir Aerov 
> > > Reviewed-by: Benjamin Herrenschmidt 
> > > Reviewed-by: David Woodhouse 
> > 
> > Review tags should be given on public mailing lists for public
> > review and I have not seen them (they were already there in v1) so
> > you should drop them.
> 
> We did that internally. You really don't want me telling engineers to
> post to the list *first* without running things by me to get the basics
> right. Not to start with, at least.

Hi David,

I am obviously in favour of internal review and I do not question it was
carried out internally, I just kindly ask developers to drop review tags
given internally when going to public mailing lists - I understand it is
churn for you but I prefer them to be given explicitly.

Thanks !
Lorenzo

> Reviewed-by: David Woodhouse 
> 
> 
> > > Changes from v1:
> > >   - Fix commit message comments (incl. using AMZN0001
> > > instead of PNP0C02)
> > >   - Use the usual multi-line comment style
> > > 
> > >  MAINTAINERS  |  6 +++
> > >  drivers/acpi/pci_mcfg.c  | 12 +
> > >  drivers/pci/controller/dwc/Makefile  |  1 +
> > >  drivers/pci/controller/dwc/pcie-al.c | 93 
> > > 
> > >  include/linux/pci-ecam.h |  1 +
> > >  5 files changed, 113 insertions(+)
> > >  create mode 100644 drivers/pci/controller/dwc/pcie-al.c
> > > 
> > > diff --git a/MAINTAINERS b/MAINTAINERS
> > > index 32d76a90..7a17017f9f82 100644
> > > --- a/MAINTAINERS
> > > +++ b/MAINTAINERS
> > > @@ -11769,6 +11769,12 @@ T:   git 
> > > git://git.kernel.org/pub/scm/linux/kernel/git/lpieralisi/pci.git/
> > >  S:   Supported
> > >  F:   drivers/pci/controller/
> > >  
> > > +PCIE DRIVER FOR ANNAPURNA LABS
> > > +M:   Jonathan Chocron 
> > > +L:   linux-...@vger.kernel.org
> > > +S:   Maintained
> > > +F:   drivers/pci/controller/dwc/pcie-al.c
> > 
> > I do not think we need a maintainer file for that see below, and
> > actually this quirk should be handled by DWC maintainers since it is a
> > DWC quirk, not a platform one.
> 
> Many of the others already have this, it seems.
> 
> It's also fine to drop it, and include it when we add the rest of the
> Alpine SOC support and a MAINTAINERS entry for that.
> 




Re: [PATCH v3] kmemleaak: survive in a low-memory situation

2019-03-26 Thread Christopher Lameter
On Tue, 26 Mar 2019, Qian Cai wrote:

> + if (!object) {
> + /*
> +  * The tracked memory was allocated successful, if the kmemleak
> +  * object failed to allocate for some reasons, it ends up with
> +  * the whole kmemleak disabled, so let it success at all cost.

"let it succeed at all costs"

> +  */
> + gfp = (in_atomic() || irqs_disabled()) ? GFP_ATOMIC :
> +gfp_kmemleak_mask(gfp) | __GFP_DIRECT_RECLAIM;
> + object = kmem_cache_alloc(object_cache, gfp);
> + }
> +
>   if (!object) {

If the alloc must succeed then this check is no longer necessary.


Re: [PATCH 09/10] ALSA: pcm: Add snd_pcm_ops for snd_pcm_link()

2019-03-26 Thread Takashi Iwai
On Tue, 26 Mar 2019 16:16:54 +0100,
Timo Wischer wrote:
> 
> On 3/26/19 15:23, Takashi Iwai wrote:
> > On Tue, 26 Mar 2019 12:25:37 +0100,
> > Timo Wischer wrote:
> >> On 3/26/19 09:35, Takashi Iwai wrote:
> >>
> >>  On Tue, 26 Mar 2019 08:49:33 +0100,
> >>   wrote:
> >>   From: Timo Wischer 
> >>   snd_pcm_link() can be called by the user as long
> >> as the device is not
> >>  yet started. Therefore currently a driver which wants to iterate 
> >> over
> >>  the linked substreams has to do this at the start trigger. But 
> >> the start
> >>  trigger should not block for a long time. Therefore there is no 
> >> callback
> >>  which can be used to iterate over the linked substreams without 
> >> delaying
> >>  the start trigger.
> >>  This patch introduces a new callback function which will be 
> >> called after
> >>  the linked substream list was updated by snd_pcm_link(). This 
> >> callback
> >>  function is allowed to block for a longer time without 
> >> interfering the
> >>  synchronized start up of linked substreams.
> >>   Signed-off-by: Timo Wischer
> >> 
> >>   Well, the idea appears interesting, but I'm afraid
> >> that the
> >>  implementation is still racy.  The place you're calling the new
> >>  callback isn't protected, hence the stream can be triggered while
> >>  calling it.  That is, even during operating your loopback link_changed
> >>  callback, another thread is able to start the stream.
> >>  Hi Takashi,
> >>
> >> As far as I got you mean the following scenario:
> >>
> >>* snd_pcm_link() is called for a HW sound card
> >>+ loopback_snd_timer_link_changed()
> > The start may happen at this point.
> 
> In this case the last link status will be used and aloop will print a
> warning "Another sound timer was requested but at least one device is
> already running...".
> 
> Without this patch set a similar issue already exists. When calling
> snd_pcm_start() before snd_pcm_link() was done the additional device
> linked by the snd_pcm_link() will not be started.
> Therefore the application has already to take care about the order of
> the calls.

Yes, but it doesn't matter for now, just because other drivers care
about the PCM links only in the trigger callback.  Now you're trying to add
something new but in an incomplete manner.

> 
> >
> >>+ loopback_snd_timer_open()
> >>+ spin_lock_irqsave(&dpcm->cable->lock, flags);
> >>* snd_pcm_start() called for aloop sound card
> >>+ loopback_trigger()
> >>+ spin_lock(&cable->lock) -> has to wait till 
> >> loopback_snd_timer_open()
> >>  calls spin_unlock_irqrestore()
> >>
> >> So far snd_pcm_start() has to wait for loopback_snd_timer_open().
> >>
> >>* loopback_snd_timer_open() will continue with
> >>+ dpcm->cable->snd_timer.instance = NULL;
> >>+ spin_unlock_irqrestore()
> >>* loopback_trigger() can enter the lock
> >>+ loopback_snd_timer_start() will fail with -EINVAL due to
> >>  (loopback_trigger == NULL)
> >>
> >> At least this will not result into memory corruption due to race or any 
> >> other
> >> weird behavior.
> > I don't expect the memory corruption, but my point is that dealing
> > with linked streams is still tricky.  It was considered for the
> > lightweight coupled start/stop operation, and something intensively
> > depending on the linked status was out of the original design...
> >
> >> But my expectation is that snd_pcm_link(hw, aloop) or snd_pcm_link(aloop, 
> >> hw)
> >> is only called by the application calling snd_pcm_start(aloop)
> >> because the same aloop device cannot be opened by multiple applications at 
> >> the
> >> same time.
> >>
> >> Do you see an use case where one application would call snd_pcm_start() in
> >> parallel with snd_pcm_link() (somehow configuring the device)?
> > It's not about the actual application usages but rather against the
> > malicious attacks.  Especially aloop is a virtual device that is
> > available all over the place, it may be deployed / attacked easily.
> The attack we are identifying here can only be done by the application
> opening the aloop device.
> An application allowed to open the aloop device is anyway able to
> manipulate the audio streaming.

Right, and if such a racy access may lead to driver misbehavior,
it's a big concern.  The proposed callback usage is racy, so some
other implementation might be broken easily in future.

> Or do you see an attack which would influence any other device/stream
> not opened by this application?
>
> >> May be we should add an additional synchronization mechanism in 
> >> pcm_native.c
> >> to avoid call of snd_pcm_link() in parallel with snd_pcm_start().
> > If it really matters...  Honestly speaking, I'm not fully convinced
> > whether we want to deal with this using the PCM link mechanism.
> >
> > What's 

Re: [PATCH RESEND v3] tpm: fix an invalid condition in tpm_common_poll

2019-03-26 Thread Tadeusz Struk
Hi Jarkko,
On 3/25/19 7:09 AM, Jarkko Sakkinen wrote:
> It is still missing the comment I asked to add. Otherwise, it is good.
> 

Sorry, I didn't see your email with the suggestion earlier.
To be honest I'm not sure if this comment adds much value, or if it is
even correct. The poll doesn't "succeed" or "fail". It just returns
a mask indicating if there is any data to read or if the user can write.

Isn't the commit message + 'git blame' enough to remember why it was
done this way?

Thanks,
-- 
Tadeusz


[PATCH v1 4/4] tests: add pidctl() tests

2019-03-26 Thread Christian Brauner
This adds test cases for all three subcommands and verifies that they
succeed and fail as expected.
Additionally, the tests verify that pidctl() pidfds are correctly useable
with pidfd_send_signal().

Signed-off-by: Christian Brauner 
Cc: Arnd Bergmann 
Cc: "Eric W. Biederman" 
Cc: Kees Cook 
Cc: Alexey Dobriyan 
Cc: Thomas Gleixner 
Cc: Jann Horn 
Cc: "Michael Kerrisk (man-pages)" 
Cc: Konstantin Khlebnikov 
Cc: Jonathan Kowalski 
Cc: "Dmitry V. Levin" 
Cc: Andy Lutomirsky 
Cc: Andrew Morton 
Cc: Oleg Nesterov 
Cc: Nagarathnam Muthusamy 
Cc: Aleksa Sarai 
Cc: Al Viro 
---
/* changelog */
v1:
- Christian Brauner :
  - adapt to changing pidfds to CLOEXEC by default
---
 tools/testing/selftests/pidfd/Makefile  |   2 +-
 tools/testing/selftests/pidfd/pidctl_test.c | 537 
 2 files changed, 538 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/pidfd/pidctl_test.c

diff --git a/tools/testing/selftests/pidfd/Makefile 
b/tools/testing/selftests/pidfd/Makefile
index deaf8073bc06..29dfa29b3afa 100644
--- a/tools/testing/selftests/pidfd/Makefile
+++ b/tools/testing/selftests/pidfd/Makefile
@@ -1,6 +1,6 @@
 CFLAGS += -g -I../../../../usr/include/
 
-TEST_GEN_PROGS := pidfd_test
+TEST_GEN_PROGS := pidfd_test pidctl_test
 
 include ../lib.mk
 
diff --git a/tools/testing/selftests/pidfd/pidctl_test.c 
b/tools/testing/selftests/pidfd/pidctl_test.c
new file mode 100644
index ..a39d3cd81089
--- /dev/null
+++ b/tools/testing/selftests/pidfd/pidctl_test.c
@@ -0,0 +1,537 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#define _GNU_SOURCE
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "../kselftest.h"
+
+static int parent_pidns_fd = -1;
+static pid_t parent_pidns_pid = 0;
+
+static int child_pidns_fd = -1;
+static pid_t child_pidns_pid = 0;
+
+static int cousin_pidns_fd = -1;
+static pid_t cousin_pidns_pid = 0;
+
+static bool pidns_supported = false;
+
+static inline int sys_pidfd_send_signal(int pidfd, int sig, siginfo_t *info,
+   unsigned int flags)
+{
+   return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags);
+}
+
+static inline int sys_pidctl(unsigned int cmd, pid_t pid, int source,
+int target, unsigned int flags)
+{
+   return syscall(__NR_pidctl, cmd, pid, source, target, flags);
+}
+
+struct cr_clone_arg {
+   char stack[128] __attribute__((aligned(16)));
+   char stack_ptr[0];
+};
+
+static int child_pidns_creator(void *args)
+{
+   (void)prctl(PR_SET_PDEATHSIG, SIGKILL);
+   while (1)
+   sleep(5);
+
+   exit(0);
+}
+
+static int prepare_pid_namespaces(void)
+{
+   char path[512];
+   struct cr_clone_arg ca;
+   pid_t pid;
+
+   parent_pidns_fd = open("/proc/self/ns/pid", O_RDONLY | O_CLOEXEC);
+   if (parent_pidns_fd < 0) {
+   ksft_print_msg("failed to open current pid namespace");
+   return -1;
+   }
+   parent_pidns_pid = getpid();
+
+   pid = clone(child_pidns_creator, ca.stack_ptr, CLONE_NEWPID | SIGCHLD,
+   NULL);
+   if (pid < 0) {
+   ksft_print_msg("failed to clone child-pidns process in new pid 
namespace");
+   return -1;
+   }
+
+   snprintf(path, sizeof(path), "/proc/%d/ns/pid", pid);
+
+   child_pidns_fd = open(path, O_RDONLY | O_CLOEXEC);
+   if (child_pidns_fd < 0) {
+   ksft_print_msg("failed to open pid namespace");
+   return -1;
+   }
+   child_pidns_pid = pid;
+
+   pid = clone(child_pidns_creator, ca.stack_ptr, CLONE_NEWPID | SIGCHLD,
+   NULL);
+   if (pid < 0) {
+   ksft_print_msg("failed to clone cousin-pidns process in new pid 
namespace");
+   return -1;
+   }
+
+   snprintf(path, sizeof(path), "/proc/%d/ns/pid", pid);
+
+   cousin_pidns_fd = open(path, O_RDONLY | O_CLOEXEC);
+   if (cousin_pidns_fd < 0) {
+   ksft_print_msg("failed to open cousin pid namespace");
+   return -1;
+   }
+   cousin_pidns_pid = pid;
+
+   return 0;
+}
+
+static int test_pidcmd_query_pid(void)
+{
+   const char *test_name = "pidctl PIDCMD_QUERY_PID";
+   pid_t pid, self;
+   int parent_pidns_fd2;
+
+   self = getpid();
+
+   pid = sys_pidctl(PIDCMD_QUERY_PID, self, -1, -1, 1);
+   if (pid >= 0) {
+   ksft_print_msg("%s test %d: managed to pass invalid flag\n",
+  test_name, ksft_test_num());
+   return -1;
+   }
+
+   pid = sys_pidctl(PIDCMD_QUERY_PID, self, -1, -1, 0);
+   if (!pid || (pid != self)) {
+   ksft_print_msg("%s test %d: argument pid %d, translated pid 
%d\n",
+  test_name, ksft_test_num(), self, pid);
+   return -1;
+  

[PATCH v1 3/4] signal: support pidctl() with pidfd_send_signal()

2019-03-26 Thread Christian Brauner
Let pidfd_send_signal() use pidfds retrieved via pidctl(). With this patch
pidfd_send_signal() becomes independent of procfs. This fulfills the
request made when we merged the pidfd_send_signal() patchset. The
pidfd_send_signal() syscall is now always available allowing for it to be
used by users without procfs mounted or even users without procfs support
compiled into the kernel.

Signed-off-by: Christian Brauner 
Reviewed-by: David Howells 
Acked-by: Serge Hallyn 
Cc: Arnd Bergmann 
Cc: "Eric W. Biederman" 
Cc: Kees Cook 
Cc: Alexey Dobriyan 
Cc: Thomas Gleixner 
Cc: Jann Horn 
Cc: Konstantin Khlebnikov 
Cc: Jonathan Kowalski 
Cc: "Dmitry V. Levin" 
Cc: Andy Lutomirsky 
Cc: Andrew Morton 
Cc: Oleg Nesterov 
Cc: Nagarathnam Muthusamy 
Cc: Aleksa Sarai 
Cc: Al Viro 
---
/* changelog */
v1:
- Jann Horn  in [1]:
  - make access_pidfd_pidns() more readable
---
 kernel/signal.c | 29 -
 kernel/sys_ni.c |  3 ---
 2 files changed, 12 insertions(+), 20 deletions(-)

diff --git a/kernel/signal.c b/kernel/signal.c
index b7953934aa99..7bdeda8333c8 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -3513,26 +3513,14 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig)
return kill_something_info(sig, &info, pid);
 }
 
-#ifdef CONFIG_PROC_FS
 /*
  * Verify that the signaler and signalee either are in the same pid namespace
  * or that the signaler's pid namespace is an ancestor of the signalee's pid
  * namespace.
  */
-static bool access_pidfd_pidns(struct pid *pid)
+static inline bool access_pidfd_pidns(struct pid *pid)
 {
-   struct pid_namespace *active = task_active_pid_ns(current);
-   struct pid_namespace *p = ns_of_pid(pid);
-
-   for (;;) {
-   if (!p)
-   return false;
-   if (p == active)
-   break;
-   p = p->parent;
-   }
-
-   return true;
+   return pidnscmp(task_active_pid_ns(current), ns_of_pid(pid)) >= 0;
 }
 
 static int copy_siginfo_from_user_any(kernel_siginfo_t *kinfo, siginfo_t *info)
@@ -3550,6 +3538,14 @@ static int copy_siginfo_from_user_any(kernel_siginfo_t 
*kinfo, siginfo_t *info)
return copy_siginfo_from_user(kinfo, info);
 }
 
+static struct pid *pidfd_to_pid(const struct file *file)
+{
+   if (file->f_op == &pidfd_fops)
+   return file->private_data;
+
+   return tgid_pidfd_to_pid(file);
+}
+
 /**
  * sys_pidfd_send_signal - send a signal to a process through a task file
  *  descriptor
@@ -3581,12 +3577,12 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
if (flags)
return -EINVAL;
 
-   f = fdget_raw(pidfd);
+   f = fdget(pidfd);
if (!f.file)
return -EBADF;
 
/* Is this a pidfd? */
-   pid = tgid_pidfd_to_pid(f.file);
+   pid = pidfd_to_pid(f.file);
if (IS_ERR(pid)) {
ret = PTR_ERR(pid);
goto err;
@@ -3625,7 +3621,6 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
fdput(f);
return ret;
 }
-#endif /* CONFIG_PROC_FS */
 
 static int
 do_send_specific(pid_t tgid, pid_t pid, int sig, struct kernel_siginfo *info)
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index d21f4befaea4..4d9ae5ea6caf 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -167,9 +167,6 @@ COND_SYSCALL(syslog);
 
 /* kernel/sched/core.c */
 
-/* kernel/signal.c */
-COND_SYSCALL(pidfd_send_signal);
-
 /* kernel/sys.c */
 COND_SYSCALL(setregid);
 COND_SYSCALL(setgid);
-- 
2.21.0



[PATCH v1 1/4] Make anon_inodes unconditional

2019-03-26 Thread Christian Brauner
From: David Howells 

Make the anon_inodes facility unconditional so that it can be used by core
VFS code and the pidctl() syscall.

Signed-off-by: David Howells 
Signed-off-by: Al Viro 
[christ...@brauner.io: adapt commit message to mention pidctl()]
Signed-off-by: Christian Brauner 
---
 arch/arm/kvm/Kconfig   |  1 -
 arch/arm64/kvm/Kconfig |  1 -
 arch/mips/kvm/Kconfig  |  1 -
 arch/powerpc/kvm/Kconfig   |  1 -
 arch/s390/kvm/Kconfig  |  1 -
 arch/x86/Kconfig   |  1 -
 arch/x86/kvm/Kconfig   |  1 -
 drivers/base/Kconfig   |  1 -
 drivers/char/tpm/Kconfig   |  1 -
 drivers/dma-buf/Kconfig|  1 -
 drivers/gpio/Kconfig   |  1 -
 drivers/iio/Kconfig|  1 -
 drivers/infiniband/Kconfig |  1 -
 drivers/vfio/Kconfig   |  1 -
 fs/Makefile|  2 +-
 fs/notify/fanotify/Kconfig |  1 -
 fs/notify/inotify/Kconfig  |  1 -
 init/Kconfig   | 10 --
 18 files changed, 1 insertion(+), 27 deletions(-)

diff --git a/arch/arm/kvm/Kconfig b/arch/arm/kvm/Kconfig
index 3f5320f46de2..f591026347a5 100644
--- a/arch/arm/kvm/Kconfig
+++ b/arch/arm/kvm/Kconfig
@@ -22,7 +22,6 @@ config KVM
bool "Kernel-based Virtual Machine (KVM) support"
depends on MMU && OF
select PREEMPT_NOTIFIERS
-   select ANON_INODES
select ARM_GIC
select ARM_GIC_V3
select ARM_GIC_V3_ITS
diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig
index a3f85624313e..a67121d419a2 100644
--- a/arch/arm64/kvm/Kconfig
+++ b/arch/arm64/kvm/Kconfig
@@ -23,7 +23,6 @@ config KVM
depends on OF
select MMU_NOTIFIER
select PREEMPT_NOTIFIERS
-   select ANON_INODES
select HAVE_KVM_CPU_RELAX_INTERCEPT
select HAVE_KVM_ARCH_TLB_FLUSH_ALL
select KVM_MMIO
diff --git a/arch/mips/kvm/Kconfig b/arch/mips/kvm/Kconfig
index 4528bc9c3cb1..eac25aef21e0 100644
--- a/arch/mips/kvm/Kconfig
+++ b/arch/mips/kvm/Kconfig
@@ -21,7 +21,6 @@ config KVM
depends on MIPS_FP_SUPPORT
select EXPORT_UASM
select PREEMPT_NOTIFIERS
-   select ANON_INODES
select KVM_GENERIC_DIRTYLOG_READ_PROTECT
select HAVE_KVM_VCPU_ASYNC_IOCTL
select KVM_MMIO
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index bfdde04e4905..f53997a8ca62 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -20,7 +20,6 @@ if VIRTUALIZATION
 config KVM
bool
select PREEMPT_NOTIFIERS
-   select ANON_INODES
select HAVE_KVM_EVENTFD
select HAVE_KVM_VCPU_ASYNC_IOCTL
select SRCU
diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig
index 767453faacfc..1816ee48eadd 100644
--- a/arch/s390/kvm/Kconfig
+++ b/arch/s390/kvm/Kconfig
@@ -21,7 +21,6 @@ config KVM
prompt "Kernel-based Virtual Machine (KVM) support"
depends on HAVE_KVM
select PREEMPT_NOTIFIERS
-   select ANON_INODES
select HAVE_KVM_CPU_RELAX_INTERCEPT
select HAVE_KVM_VCPU_ASYNC_IOCTL
select HAVE_KVM_EVENTFD
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index c1f9b3cf437c..18f2c954464e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -44,7 +44,6 @@ config X86
#
select ACPI_LEGACY_TABLES_LOOKUPif ACPI
select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI
-   select ANON_INODES
select ARCH_32BIT_OFF_T if X86_32
select ARCH_CLOCKSOURCE_DATA
select ARCH_CLOCKSOURCE_INIT
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 72fa955f4a15..fc042419e670 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -27,7 +27,6 @@ config KVM
depends on X86_LOCAL_APIC
select PREEMPT_NOTIFIERS
select MMU_NOTIFIER
-   select ANON_INODES
select HAVE_KVM_IRQCHIP
select HAVE_KVM_IRQFD
select IRQ_BYPASS_MANAGER
diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig
index 059700ea3521..03f067da12ee 100644
--- a/drivers/base/Kconfig
+++ b/drivers/base/Kconfig
@@ -174,7 +174,6 @@ source "drivers/base/regmap/Kconfig"
 config DMA_SHARED_BUFFER
bool
default n
-   select ANON_INODES
select IRQ_WORK
help
  This option enables the framework for buffer-sharing between
diff --git a/drivers/char/tpm/Kconfig b/drivers/char/tpm/Kconfig
index 536e55d3919f..f3e4bc490cf0 100644
--- a/drivers/char/tpm/Kconfig
+++ b/drivers/char/tpm/Kconfig
@@ -157,7 +157,6 @@ config TCG_CRB
 config TCG_VTPM_PROXY
tristate "VTPM Proxy Interface"
depends on TCG_TPM
-   select ANON_INODES
---help---
  This driver proxies for an emulated TPM (vTPM) running in userspace.
  A device /dev/vtpmx is provided that creates a device pair
diff --git a/drivers/dma-buf/Kconfig b/drivers/dma-buf/Kconfig
index 2e5a0faa2cb1..3fc9c2efc583 100644
--- a/drivers/dma-buf/Kconfig
+++ b/drivers/dma-buf/Kconfig
@@ -3,7 +3,6 @@ menu "DMABUF options"
 config SYNC_FILE

Re: [PATCH 3.18 132/134] rcu: Do RCU GP kthread self-wakeup from softirq and interrupt

2019-03-26 Thread Paul E. McKenney
On Tue, Mar 26, 2019 at 08:43:45AM +, He, Bo wrote:
> Hi, Paul:
>   I have tried on my PC and not hit any hang issue with RCU torture test 
> for one hour, the configurations are like:
> OS: ubuntu 16.04
> kernel: 3.18.136 + 3.18 rcu patch
> CPU:  Intel(R) Xeon(R) CPU E3-1225 V2 @ 3.20GHz

Sounds good, please proceed!

Thanx, Paul

> -Original Message-
> From: Paul E. McKenney  
> Sent: Tuesday, March 26, 2019 12:00 AM
> To: Greg Kroah-Hartman 
> Cc: He, Bo ; linux-kernel@vger.kernel.org; 
> sta...@vger.kernel.org; Zhang, Jun ; Xiao, Jin 
> ; Bai, Jie A 
> Subject: Re: [PATCH 3.18 132/134] rcu: Do RCU GP kthread self-wakeup from 
> softirq and interrupt
> 
> On Sat, Mar 23, 2019 at 07:33:15AM +0100, Greg Kroah-Hartman wrote:
> > On Fri, Mar 22, 2019 at 04:00:17PM +, He, Bo wrote:
> > > Hi, Greg:
> > >   Can you hold on the 3.18-stable branch, it seems there are some issue, 
> > > please see the comments from Paul:
> > > 
> > > Comments from Paul:
> > > I subjected all of the others to light rcutorture testing, which 
> > > they passed.  This v3.18 patch hung, however.  Trying it again with 
> > > stock
> > > v3.18 got the same hang, so I believe we can exonerate the patch and give 
> > > it a good firm "maybe" on 3.18.
> > > 
> > > Worth paying special attention to further test results from 3.18.x, 
> > > though!
> > 
> > Ok, I've dropped this from the 3.18.y queue now, thanks.
> 
> Bo, if you know of a "y" for 3.18.y that would likely pass rcutorture 
> testing, please let me know.
> 
>   Thanx, Paul
> 



[PATCH v1 2/4] pid: add pidctl()

2019-03-26 Thread Christian Brauner
The pidctl() syscall builds on, extends, and improves translate_pid() [4].
I quote Konstantin's original patchset first that has already been acked and
picked up by Eric before and whose functionality is preserved in this
syscall:

"Each process have different pids, one for each pid namespace it belongs.
 When interaction happens within single pid-ns translation isn't required.
 More complicated scenarios needs special handling.

 For example:
 - reading pid-files or logs written inside container with pid namespace
 - writing logs with internal pids outside container for pushing them into
 - attaching with ptrace to tasks from different pid namespace

 Generally speaking, any cross pid-ns API with pids needs translation.

 Currently there are several interfaces that could be used here:

 Pid namespaces are identified by device and inode of /proc/[pid]/ns/pid.

 Pids for nested pid namespaces are shown in file /proc/[pid]/status.
 In some cases pid translation could be easily done using this information.
 Backward translation requires scanning all tasks and becomes really
 complicated for deeper namespace nesting.

 Unix socket automatically translates pid attached to SCM_CREDENTIALS.
 This requires CAP_SYS_ADMIN for sending arbitrary pids and entering
 into pid namespace, this expose process and could be insecure."

The original patchset allowed two distinct operations implicitly:
- discovering whether pid namespaces (pidns) have a parent-child
  relationship
- translating a pid from a source pidns into a target pidns

Both tasks are accomplished in the original patchset by passing a pid
along. If the pid argument is passed as 1 the relationship between two pid
namespaces can be discovered.
The syscall will gain a lot clearer syntax and will be easier to use for
userspace if the task it is asked to perform is passed through a
command argument. Additionally, it allows us to remove an intrinsic race
caused by using the pid argument as a way to discover the relationship
between pid namespaces.
This patch introduces three commands:

/* PIDCMD_QUERY_PID */
PIDCMD_QUERY_PID allows to translate a pid between pid namespaces.
Given a source pid namespace fd return the pid of the process in the target
namespace:
1. pidctl(PIDCMD_QUERY_PID, pid, source_fd, -1, 0)
  - retrieve pidns identified by source_fd
  - retrieve struct pid identified by pid in pidns identified by source_fd
  - retrieve callers pidns
  - return pid in callers pidns

2. pidctl(PIDCMD_QUERY_PID, pid, -1, target_fd, 0)
  - retrieve callers pidns
  - retrieve struct pid identified by pid in callers pidns
  - retrieve pidns identified by target_fd
  - return pid in pidns identified by target_fd

3. pidctl(PIDCMD_QUERY_PID, 1, source_fd, -1, 0)
  - retrieve pidns identified by source_fd
  - retrieve struct pid identified by init task in pidns identified by source_fd
  - retrieve callers pidns
  - return pid of init task of pidns identified by source_fd in callers pidns

4. pidctl(PIDCMD_QUERY_PID, pid, source_fd, target_fd, 0)
  - retrieve pidns identified by source_fd
  - retrieve struct pid identified by pid in pidns identified by source_fd
  - retrieve pidns identified by target_fd
  - check whether struct pid can be found in pidns identified by target_fd
  - return pid in pidns identified by target_fd

/* PIDCMD_QUERY_PIDNS */
PIDCMD_QUERY_PIDNS allows to determine the relationship between pid
namespaces.
In the original version of the patchset passing pid as 1 would allow to
determine the relationship between the pid namespaces. This is inherently
racy. If pid 1 inside a pid namespace has died it would report false
negatives. For example, if pid 1 inside of the target pid namespace already
died, it would report that the target pid namespace cannot be reached from
the source pid namespace because it couldn't find the pid inside of the
target pid namespace and thus falsely report to the user that the two pid
namespaces are not related. This problem is simple to avoid. In the new
version we simply walk the list of ancestors and check whether the
namespaces are related to each other. By doing it this way we can reliably
report what the relationship between two pid namespace file descriptors
looks like.

1. pidctl(PIDCMD_QUERY_PIDNS, 0, ns_fd1, ns_fd2, 0) == 0
   - pidns_of(ns_fd1) and pidns_of(ns_fd2) are unrelated to each other

2. pidctl(PIDCMD_QUERY_PIDNS, 0, ns_fd1, ns_fd2, 0) == 1
   - pidns_of(ns_fd1) == pidns_of(ns_fd2)

3. pidctl(PIDCMD_QUERY_PIDNS, 0, ns_fd1, ns_fd2, 0) == 2
   - pidns_of(ns_fd1) is ancestor of pidns_of(ns_fd2)

4. pidctl(PIDCMD_QUERY_PIDNS, 0, ns_fd1, ns_fd2, 0) == 3
   - pidns_of(ns_fd2) is ancestor of pidns_of(ns_fd1)

These two commands - PIDCMD_QUERY_PID and PIDCMD_QUERY_PIDNS - cover and
improve the functionality expressed implicitly in translate_pid() before.

/* PIDCMD_GET_PIDFD */
This command allows to retrieve file descriptors for processes and removes
the dependency of pidfds and thereby the pidfd_send_signal() 

Re: [PATCH] EDAC/amd64: Use maximum channel count for the EDAC channel layer size

2019-03-26 Thread Borislav Petkov
On Mon, Mar 25, 2019 at 08:33:30PM +, Ghannam, Yazen wrote:
> From: Yazen Ghannam 
> 
> The AMD64 EDAC module currently hardcodes the EDAC channel layer size
> (count) to two. Future AMD systems may have more channels than this.
> 
> Set the EDAC channel layer size equal to the maximum number of channels
> possible for the system. On Family 17h and later, this is set in the
> num_umcs variable. Older systems will continue to use two as the
> default.
> 
> Signed-off-by: Yazen Ghannam 
> ---
>  drivers/edac/amd64_edac.c | 8 +++-
>  1 file changed, 7 insertions(+), 1 deletion(-)

Ok, whole pile here:

https://git.kernel.org/pub/scm/linux/kernel/git/bp/bp.git/log/?h=edac-for-5.2-amd64

Please run it to check all is still good.

Thx.

-- 
Regards/Gruss,
Boris.

Good mailing practices for 400: avoid top-posting and trim the reply.


[PATCH v1 0/4] pid: add pidctl()

2019-03-26 Thread Christian Brauner
This is v1 of this patchset with various minor fixes which are listed in
the individual commits. Notably, pidfds are now O_CLOEXEC by default.

The pidctl() syscall builds on, extends, and improves translate_pid()
[4] and serves as the natural connection between the pid-based and the
pidfd-based api.

I quote Konstantin's original patchset first that has already been acked
and picked up by Eric before and whose functionality is preserved in
this syscall. Multiple people have asked when this patchset will be sent
in for merging (cf. [1], [2]). It has recently been revived by
Nagarathnam Muthusamy from Oracle [3].

The intention of the original translate_pid() syscall was twofold:
1. Provide translation of pids between pid namespaces especially for the
   case of deeply nested pid namespaces.
   The most obvious use-case is strace which has been waiting for this
   feature for a while.
2. Provide implicit pid namespace introspection

Both functionalities are preserved. The latter task has been improved
upon though. In the original version of the patchset passing pid as 1
would allow to determine the relationship between the pid namespaces.
This is inherently racy. If pid 1 inside a pid namespace has died it
would report false negatives. For example, if pid 1 inside of the target
pid namespace already died, it would report that the target pid
namespace cannot be reached from the source pid namespace because it
couldn't find the pid inside of the target pid namespace and thus
falsely report to the user that the two pid namespaces are not related.
This problem is simple to avoid. In the new version we simply walk the
list of ancestors and check whether the namespaces are related to each
other. By doing it this way we can reliably report what the relationship
between two pid namespace file descriptors looks like.

Additionally, this syscall has been extended to allow the retrieval of
pidfds independent of procfs. These pidfds can e.g. be used with the new
pidfd_send_signal() syscall we recently merged. The ability to retrieve
pidfds independent of procfs had already been requested in the
pidfd_send_signal patchset by e.g. Andrew [4] and later again by Alexey
[5]. A use-case where a kernel is compiled without procfs but where
pidfds are still useful has been outlined by Andy in [6]. Regular
anon-inode based file descriptors are used that stash a reference to
struct pid in file->private_data and drop that reference on close.

With this pidctl() has three closely related functionalities that
provide a natural connection between the pid-based and the pidfd-based
api. To clarify the semantics and to make it easier for userspace to use
the syscall it has a command argument and three commands clearly
reflecting the functionalities (PIDCMD_QUERY_PID, PIDCMD_QUERY_PIDNS,
PIDCMD_GET_PIDFD).

Embedding the retrieval of pidfds into this syscall has two main
advantages:
- pidctl provides a natural and clean connection between the traditional
  pid-based and the newer pidfd-based process API
- allows the retrieval of pidfds for other pid namespaces while
  enforcing that
  - the caller must have been given access to two file descriptors
referring to target and source pid namespace
  - the source pid namespace must be an ancestor of the target pid
namespace
  - the pid must be translatable from the source pid namespace into the
target pid namespace

Note that this patchset also includes Al's and David's commit to make anon
inodes unconditional. The original intention is to make it possible to use
anon inodes in core vfs functions. pidctl() has the same requirement so
David suggested I send this in alongside this patch. Both are informed of
this.

The syscall comes with extensive testing for all functionalities.

/* References */
[1]: 
https://lore.kernel.org/lkml/37b17950-b130-7933-99a1-4846c61c8...@oracle.com/
[2]: https://lore.kernel.org/lkml/20181109034919.ga21...@altlinux.org/
[3]: 
https://lore.kernel.org/lkml/37b17950-b130-7933-99a1-4846c61c8...@oracle.com/
[4]: 3eb39f47934f9d5a3027fe00d906a45fe3a15fad
[5]: https://lore.kernel.org/lkml/20190320203910.GA2842@avx2/
[6]: 
https://lore.kernel.org/lkml/CALCETrXO=V=+qedldvpf8ecglzib9botrufe0v-u-tuzoeo...@mail.gmail.com/

Thanks!
Christian

Christian Brauner (3):
  pid: add pidctl()
  signal: support pidctl() with pidfd_send_signal()
  tests: add pidctl() tests

David Howells (1):
  Make anon_inodes unconditional

 arch/arm/kvm/Kconfig|   1 -
 arch/arm64/kvm/Kconfig  |   1 -
 arch/mips/kvm/Kconfig   |   1 -
 arch/powerpc/kvm/Kconfig|   1 -
 arch/s390/kvm/Kconfig   |   1 -
 arch/x86/Kconfig|   1 -
 arch/x86/entry/syscalls/syscall_32.tbl  |   1 +
 arch/x86/entry/syscalls/syscall_64.tbl  |   1 +
 arch/x86/kvm/Kconfig|   1 -
 drivers/base/Kconfig|   1 -
 drivers/char/tpm/Kconfig   

Re: [PATCH v2] drivers: infiniband: Kconfig: pedantic formatting

2019-03-26 Thread Jason Gunthorpe
On Wed, Mar 06, 2019 at 11:08:45PM +0100, Enrico Weigelt, metux IT consult 
wrote:
> Formatting of Kconfig files doesn't look so pretty, so just
> take damp cloth and clean it up.
> 
> Signed-off-by: Enrico Weigelt, metux IT consult 
> ---
>  drivers/infiniband/hw/bnxt_re/Kconfig | 10 +-
>  drivers/infiniband/ulp/iser/Kconfig   |  4 ++--
>  2 files changed, 7 insertions(+), 7 deletions(-)

Applied to for-next thanks

Jason


Re: [PATCH 5/5] lib/vsprintf: Add %pfw conversion specifier for printing fwnode names

2019-03-26 Thread Petr Mladek
On Tue 2019-03-26 16:30:21, Andy Shevchenko wrote:
> On Tue, Mar 26, 2019 at 04:12:43PM +0200, Sakari Ailus wrote:
> > On Tue, Mar 26, 2019 at 04:06:33PM +0200, Heikki Krogerus wrote:
> > > On Tue, Mar 26, 2019 at 03:13:53PM +0200, Andy Shevchenko wrote:
> 
> > > > > > Do we support swnode here?
> > > > > 
> > > > > Good question. The swnodes have no hierarchy at the moment (they're 
> > > > > only
> > > > > created for a struct device as a parent) and they do not have 
> > > > > human-readable
> > > > > names. So I'd say it's not relevant right now. Should these two 
> > > > > change,
> > > > > support for swnode could (and should) be added later on.
> > > > 
> > > > Heikki, what do you think about this?
> > > 
> > > Well, the swnodes do have hierarchy. That was kind of the whole point
> > > of introducing them. They now can also be named using "name" property.
> > > See commit 344798206f171c5abea7ab1f9762fa526d7f539d.
> > 
> > Right; I saw the function after initially replying to Andy but I missed
> > where the node name came from. :-) Now I know...
> > 
> > I can add support for swnode, too, if you like.
> 
> Definitely!

It might really make sense to obsolete %pOF and handle all three
(OF, ACPI, Software) nodes using the same %pfw modifiers.

If I get it correctly, we could distinguish them by
fwnode->ops, see is_of_node(), is_acpi_static_node(),
is_software_node().

Best Regards,
Petr


Re: [PATCH v4 1/2] x86/MCE: Add function to allow filtering of MCA errors

2019-03-26 Thread Borislav Petkov
On Tue, Mar 26, 2019 at 11:41:05AM +, Ghannam, Yazen wrote:
> They don't apply cleanly to v4.14 anymore because of the recent header change.
> 
> I figured they would need to be fixed up and submitted separately to older 
> stable
> versions. Is that okay?

Ah yes, right.

Thx.

-- 
Regards/Gruss,
Boris.

Good mailing practices for 400: avoid top-posting and trim the reply.


[PATCH 1/8] tools/perf,tools/lib/traceevent: Make traceevent APIs more consistent

2019-03-26 Thread Tzvetomir Stoyanov
Rename some traceevent APIs for consistency:

tep_pid_is_registered() to tep_is_pid_registered()
tep_file_bigendian() to tep_is_file_bigendian()
  to make the names and return values consistent with other tep_is_... APIs

tep_data_lat_fmt() to tep_data_latency_format()
  to make the name more descriptive

tep_host_bigendian() to tep_is_bigendian()
tep_set_host_bigendian() to tep_set_local_bigendian()
tep_is_host_bigendian() to tep_is_local_bigendian()
  "host" can be confused with VMs, and "local" is about the local
  machine. All tep_is_..._bigendian(struct tep_handle *tep) APIs return
  the saved data in the tep handle, while tep_is_bigendian() returns
  the running machine's endianness.

All tep_is_... functions are modified to return bool value, instead of int.

Signed-off-by: Tzvetomir Stoyanov 
---
 tools/lib/traceevent/event-parse-api.c | 44 +-
 tools/lib/traceevent/event-parse.c | 26 +++
 tools/lib/traceevent/event-parse.h | 16 +-
 tools/lib/traceevent/plugin_kvm.c  |  4 +--
 tools/perf/util/trace-event-read.c |  2 +-
 tools/perf/util/trace-event.c  |  4 +--
 6 files changed, 48 insertions(+), 48 deletions(-)

diff --git a/tools/lib/traceevent/event-parse-api.c 
b/tools/lib/traceevent/event-parse-api.c
index 2ac8b44854ce..46670bb87051 100644
--- a/tools/lib/traceevent/event-parse-api.c
+++ b/tools/lib/traceevent/event-parse-api.c
@@ -88,7 +88,7 @@ void tep_clear_flag(struct tep_handle *tep, enum tep_flag 
flag)
 bool tep_test_flag(struct tep_handle *tep, enum tep_flag flag)
 {
if (tep)
-   return (tep->flags & flag);
+   return !!(tep->flags & flag);
return false;
 }
 
@@ -155,10 +155,10 @@ int tep_get_header_page_size(struct tep_handle *pevent)
 }
 
 /**
- * tep_get_header_timestamp_size - get size of the time stamp in the header 
page
+ * tep_get_header_timestamp_size - get size of the timestamp in the header page
  * @tep: a handle to the tep_handle
  *
- * This returns size of the time stamp in the header page
+ * This returns size of the timestamp in the header page
  * If @tep is NULL, 0 is returned.
  */
 int tep_get_header_timestamp_size(struct tep_handle *tep)
@@ -249,17 +249,17 @@ void tep_set_page_size(struct tep_handle *pevent, int 
_page_size)
 }
 
 /**
- * tep_file_bigendian - get if the file is in big endian order
+ * tep_is_file_bigendian - return the endian of the file
  * @pevent: a handle to the tep_handle
  *
- * This returns if the file is in big endian order
- * If @pevent is NULL, 0 is returned.
+ * This returns true if the file is in big endian order
+ * If @pevent is NULL, false is returned.
  */
-int tep_file_bigendian(struct tep_handle *pevent)
+bool tep_is_file_bigendian(struct tep_handle *pevent)
 {
if (pevent)
-   return pevent->file_bigendian;
-   return 0;
+   return (pevent->file_bigendian == TEP_BIG_ENDIAN);
+   return false;
 }
 
 /**
@@ -276,27 +276,27 @@ void tep_set_file_bigendian(struct tep_handle *pevent, 
enum tep_endian endian)
 }
 
 /**
- * tep_is_host_bigendian - get if the order of the current host is big endian
+ * tep_is_local_bigendian - return the endian of the saved local machine
  * @pevent: a handle to the tep_handle
  *
- * This gets if the order of the current host is big endian
- * If @pevent is NULL, 0 is returned.
+ * This returns true if the saved local machine in @pevent is big endian.
+ * If @pevent is NULL, false is returned.
  */
-int tep_is_host_bigendian(struct tep_handle *pevent)
+bool tep_is_local_bigendian(struct tep_handle *pevent)
 {
if (pevent)
-   return pevent->host_bigendian;
+   return (pevent->host_bigendian == TEP_BIG_ENDIAN);
return 0;
 }
 
 /**
- * tep_set_host_bigendian - set the order of the local host
+ * tep_set_local_bigendian - set the stored local machine endian order
  * @pevent: a handle to the tep_handle
  * @endian: non zero, if the local host has big endian order
  *
- * This sets the order of the local host
+ * This sets the endian order for the local machine.
  */
-void tep_set_host_bigendian(struct tep_handle *pevent, enum tep_endian endian)
+void tep_set_local_bigendian(struct tep_handle *pevent, enum tep_endian endian)
 {
if (pevent)
pevent->host_bigendian = endian;
@@ -306,14 +306,14 @@ void tep_set_host_bigendian(struct tep_handle *pevent, 
enum tep_endian endian)
  * tep_is_latency_format - get if the latency output format is configured
  * @pevent: a handle to the tep_handle
  *
- * This gets if the latency output format is configured
- * If @pevent is NULL, 0 is returned.
+ * This returns true if the latency output format is configured
+ * If @pevent is NULL, false is returned.
  */
-int tep_is_latency_format(struct tep_handle *pevent)
+bool tep_is_latency_format(struct tep_handle *pevent)
 {
if (pevent)
-   return pevent->latency_format;
-   return 0;
+   

[PATCH 6/8] tools/perf,tools/lib/traceevent: rename "pevent" member of struct tep_event_filter to "tep"

2019-03-26 Thread Tzvetomir Stoyanov
This patch renames "pevent" member of the struct tep_event_filter to "tep".
This makes the struct consistent with the chosen naming convention:
tep (trace event parser), instead of the old pevent.

Signed-off-by: Tzvetomir Stoyanov 
---
 tools/lib/traceevent/event-parse.h  |  2 +-
 tools/lib/traceevent/parse-filter.c | 14 +++---
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/tools/lib/traceevent/event-parse.h 
b/tools/lib/traceevent/event-parse.h
index d98362ff4bb1..17fb8d2dd081 100644
--- a/tools/lib/traceevent/event-parse.h
+++ b/tools/lib/traceevent/event-parse.h
@@ -718,7 +718,7 @@ struct tep_filter_type {
 #define TEP_FILTER_ERROR_BUFSZ  1024
 
 struct tep_event_filter {
-   struct tep_handle   *pevent;
+   struct tep_handle   *tep;
int filters;
struct tep_filter_type  *event_filters;
char                    error_buffer[TEP_FILTER_ERROR_BUFSZ];
diff --git a/tools/lib/traceevent/parse-filter.c 
b/tools/lib/traceevent/parse-filter.c
index 231e9cae5729..1d2f1343090d 100644
--- a/tools/lib/traceevent/parse-filter.c
+++ b/tools/lib/traceevent/parse-filter.c
@@ -154,7 +154,7 @@ add_filter_type(struct tep_event_filter *filter, int id)
 
filter_type = &filter->event_filters[i];
filter_type->event_id = id;
-   filter_type->event = tep_find_event(filter->pevent, id);
+   filter_type->event = tep_find_event(filter->tep, id);
filter_type->filter = NULL;
 
filter->filters++;
@@ -175,7 +175,7 @@ struct tep_event_filter *tep_filter_alloc(struct tep_handle 
*tep)
return NULL;
 
memset(filter, 0, sizeof(*filter));
-   filter->pevent = tep;
+   filter->tep = tep;
tep_ref(tep);
 
return filter;
@@ -1257,7 +1257,7 @@ static void filter_init_error_buf(struct tep_event_filter 
*filter)
 enum tep_errno tep_filter_add_filter_str(struct tep_event_filter *filter,
 const char *filter_str)
 {
-   struct tep_handle *pevent = filter->pevent;
+   struct tep_handle *pevent = filter->tep;
struct event_list *event;
struct event_list *events = NULL;
const char *filter_start;
@@ -1380,7 +1380,7 @@ int tep_filter_strerror(struct tep_event_filter *filter, 
enum tep_errno err,
return 0;
}
 
-   return tep_strerror(filter->pevent, err, buf, buflen);
+   return tep_strerror(filter->tep, err, buf, buflen);
 }
 
 /**
@@ -1443,7 +1443,7 @@ void tep_filter_reset(struct tep_event_filter *filter)
 
 void tep_filter_free(struct tep_event_filter *filter)
 {
-   tep_unref(filter->pevent);
+   tep_unref(filter->tep);
 
tep_filter_reset(filter);
 
@@ -1465,7 +1465,7 @@ static int copy_filter_type(struct tep_event_filter 
*filter,
/* Can't assume that the pevent's are the same */
sys = filter_type->event->system;
name = filter_type->event->name;
-   event = tep_find_event_by_name(filter->pevent, sys, name);
+   event = tep_find_event_by_name(filter->tep, sys, name);
if (!event)
return -1;
 
@@ -1875,7 +1875,7 @@ int tep_event_filtered(struct tep_event_filter *filter, 
int event_id)
 enum tep_errno tep_filter_match(struct tep_event_filter *filter,
struct tep_record *record)
 {
-   struct tep_handle *pevent = filter->pevent;
+   struct tep_handle *pevent = filter->tep;
struct tep_filter_type *filter_type;
int event_id;
int ret;
-- 
2.20.1



[PATCH 8/8] tools/lib/traceevent: remove call to exit() from tep_filter_add_filter_str()

2019-03-26 Thread Tzvetomir Stoyanov
This patch removes call to exit() from tep_filter_add_filter_str(). A library
function should not force the application to exit. In the current implementation
tep_filter_add_filter_str() calls exit() when a special "test_filters" mode is
set, used only for debugging purposes. When this mode is set and a filter is
added - its string is printed to the console and exit() is called. This patch
changes the logic - when in "test_filters" mode, the filter string is still
printed, but the exit() is not called. It is up to the application to track when
"test_filters" mode is set and to call exit, if needed.

Signed-off-by: Tzvetomir Stoyanov 
---
 tools/lib/traceevent/parse-filter.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/tools/lib/traceevent/parse-filter.c 
b/tools/lib/traceevent/parse-filter.c
index 298694356277..552592d153fb 100644
--- a/tools/lib/traceevent/parse-filter.c
+++ b/tools/lib/traceevent/parse-filter.c
@@ -1346,9 +1346,6 @@ enum tep_errno tep_filter_add_filter_str(struct 
tep_event_filter *filter,
 
free_events(events);
 
-   if (rtn >= 0 && tep->test_filters)
-   exit(0);
-
return rtn;
 }
 
-- 
2.20.1



[PATCH 5/8] tools/perf,tools/lib/traceevent: rename "pevent" member of struct tep_event to "tep"

2019-03-26 Thread Tzvetomir Stoyanov
This patch renames "pevent" member of the struct tep_event to "tep". This makes
the struct consistent with the chosen naming convention:
tep (trace event parser), instead of the old pevent.

Signed-off-by: Tzvetomir Stoyanov 
---
 tools/lib/traceevent/event-parse.c| 32 +--
 tools/lib/traceevent/event-parse.h|  2 +-
 tools/lib/traceevent/parse-filter.c   |  6 ++--
 tools/lib/traceevent/plugin_function.c|  2 +-
 tools/lib/traceevent/plugin_kmem.c|  4 +--
 tools/lib/traceevent/plugin_kvm.c |  4 +--
 tools/lib/traceevent/plugin_sched_switch.c|  2 +-
 tools/perf/builtin-kmem.c |  2 +-
 tools/perf/util/data-convert-bt.c |  4 +--
 tools/perf/util/python.c  |  2 +-
 .../util/scripting-engines/trace-event-perl.c |  2 +-
 .../scripting-engines/trace-event-python.c|  2 +-
 tools/perf/util/trace-event-parse.c   |  2 +-
 13 files changed, 33 insertions(+), 33 deletions(-)

diff --git a/tools/lib/traceevent/event-parse.c 
b/tools/lib/traceevent/event-parse.c
index 6265d3bb72a1..de17fe9f0f62 100644
--- a/tools/lib/traceevent/event-parse.c
+++ b/tools/lib/traceevent/event-parse.c
@@ -804,7 +804,7 @@ static int add_event(struct tep_handle *pevent, struct 
tep_event *event)
pevent->events[i] = event;
pevent->nr_events++;
 
-   event->pevent = pevent;
+   event->tep = pevent;
 
return 0;
 }
@@ -1656,8 +1656,8 @@ static int event_read_fields(struct tep_event *event, 
struct tep_format_field **
else if (field->flags & TEP_FIELD_IS_STRING)
field->elementsize = 1;
else if (field->flags & TEP_FIELD_IS_LONG)
-   field->elementsize = event->pevent ?
-event->pevent->long_size :
+   field->elementsize = event->tep ?
+event->tep->long_size :
 sizeof(long);
} else
field->elementsize = field->size;
@@ -3075,7 +3075,7 @@ process_function(struct tep_event *event, struct 
tep_print_arg *arg,
return process_dynamic_array_len(event, arg, tok);
}
 
-   func = find_func_handler(event->pevent, token);
+   func = find_func_handler(event->tep, token);
if (func) {
free_token(token);
return process_func_handler(event, func, arg, tok);
@@ -3405,7 +3405,7 @@ int tep_read_number_field(struct tep_format_field *field, 
const void *data,
case 2:
case 4:
case 8:
-   *value = tep_read_number(field->event->pevent,
+   *value = tep_read_number(field->event->tep,
 data + field->offset, field->size);
return 0;
default:
@@ -3566,7 +3566,7 @@ tep_find_event_by_name(struct tep_handle *tep,
 static unsigned long long
 eval_num_arg(void *data, int size, struct tep_event *event, struct 
tep_print_arg *arg)
 {
-   struct tep_handle *pevent = event->pevent;
+   struct tep_handle *pevent = event->tep;
unsigned long long val = 0;
unsigned long long left, right;
struct tep_print_arg *typearg = NULL;
@@ -3907,7 +3907,7 @@ static void print_str_arg(struct trace_seq *s, void 
*data, int size,
  struct tep_event *event, const char *format,
  int len_arg, struct tep_print_arg *arg)
 {
-   struct tep_handle *pevent = event->pevent;
+   struct tep_handle *pevent = event->tep;
struct tep_print_flag_sym *flag;
struct tep_format_field *field;
struct printk_map *printk;
@@ -4256,7 +4256,7 @@ static void free_args(struct tep_print_arg *args)
 
 static struct tep_print_arg *make_bprint_args(char *fmt, void *data, int size, 
struct tep_event *event)
 {
-   struct tep_handle *pevent = event->pevent;
+   struct tep_handle *pevent = event->tep;
struct tep_format_field *field, *ip_field;
struct tep_print_arg *args, *arg, **next;
unsigned long long ip, val;
@@ -4433,7 +4433,7 @@ static char *
 get_bprint_format(void *data, int size __maybe_unused,
  struct tep_event *event)
 {
-   struct tep_handle *pevent = event->pevent;
+   struct tep_handle *pevent = event->tep;
unsigned long long addr;
struct tep_format_field *field;
struct printk_map *printk;
@@ -4834,7 +4834,7 @@ void tep_print_field(struct trace_seq *s, void *data,
 {
unsigned long long val;
unsigned int offset, len, i;
-   struct tep_handle *pevent = field->event->pevent;
+   struct tep_handle *pevent = field->event->tep;
 
if (field->flags & TEP_FIELD_IS_ARRAY) {
offset = field->offset;
@@ -4909,7 +4909,7 @@ 

[PATCH 3/8] tools/lib/traceevent: Remove tep filter trivial APIs

2019-03-26 Thread Tzvetomir Stoyanov
This patch removes trivial filter tep APIs:
 enum tep_filter_trivial_type
 tep_filter_event_has_trivial()
 tep_update_trivial()
 tep_filter_clear_trivial()

Trivial filters are an optimization, used only in the first
version of KernelShark. The API is deprecated; the next KernelShark
release does not use it.

Signed-off-by: Tzvetomir Stoyanov 
---
 tools/lib/traceevent/event-parse.h  |  16 ---
 tools/lib/traceevent/parse-filter.c | 169 
 2 files changed, 185 deletions(-)

diff --git a/tools/lib/traceevent/event-parse.h 
b/tools/lib/traceevent/event-parse.h
index dcf0385684b8..02932d3d2632 100644
--- a/tools/lib/traceevent/event-parse.h
+++ b/tools/lib/traceevent/event-parse.h
@@ -732,12 +732,6 @@ struct tep_event_filter *tep_filter_alloc(struct 
tep_handle *pevent);
 #define FILTER_MISS            TEP_ERRNO__FILTER_MISS
 #define FILTER_MATCH   TEP_ERRNO__FILTER_MATCH
 
-enum tep_filter_trivial_type {
-   TEP_FILTER_TRIVIAL_FALSE,
-   TEP_FILTER_TRIVIAL_TRUE,
-   TEP_FILTER_TRIVIAL_BOTH,
-};
-
 enum tep_errno tep_filter_add_filter_str(struct tep_event_filter *filter,
 const char *filter_str);
 
@@ -752,9 +746,6 @@ int tep_event_filtered(struct tep_event_filter *filter,
 
 void tep_filter_reset(struct tep_event_filter *filter);
 
-int tep_filter_clear_trivial(struct tep_event_filter *filter,
-enum tep_filter_trivial_type type);
-
 void tep_filter_free(struct tep_event_filter *filter);
 
 char *tep_filter_make_string(struct tep_event_filter *filter, int event_id);
@@ -762,15 +753,8 @@ char *tep_filter_make_string(struct tep_event_filter 
*filter, int event_id);
 int tep_filter_remove_event(struct tep_event_filter *filter,
int event_id);
 
-int tep_filter_event_has_trivial(struct tep_event_filter *filter,
-int event_id,
-enum tep_filter_trivial_type type);
-
 int tep_filter_copy(struct tep_event_filter *dest, struct tep_event_filter 
*source);
 
-int tep_update_trivial(struct tep_event_filter *dest, struct tep_event_filter 
*source,
-   enum tep_filter_trivial_type type);
-
 int tep_filter_compare(struct tep_event_filter *filter1, struct 
tep_event_filter *filter2);
 
 #endif /* _PARSE_EVENTS_H */
diff --git a/tools/lib/traceevent/parse-filter.c 
b/tools/lib/traceevent/parse-filter.c
index cb5ce66dab6e..4ffd8b25a852 100644
--- a/tools/lib/traceevent/parse-filter.c
+++ b/tools/lib/traceevent/parse-filter.c
@@ -1522,167 +1522,6 @@ int tep_filter_copy(struct tep_event_filter *dest, 
struct tep_event_filter *sour
return ret;
 }
 
-
-/**
- * tep_update_trivial - update the trivial filters with the given filter
- * @dest - the filter to update
- * @source - the filter as the source of the update
- * @type - the type of trivial filter to update.
- *
- * Scan dest for trivial events matching @type to replace with the source.
- *
- * Returns 0 on success and -1 if there was a problem updating, but
- *   events may have still been updated on error.
- */
-int tep_update_trivial(struct tep_event_filter *dest, struct tep_event_filter 
*source,
-  enum tep_filter_trivial_type type)
-{
-   struct tep_handle *src_pevent;
-   struct tep_handle *dest_pevent;
-   struct tep_event *event;
-   struct tep_filter_type *filter_type;
-   struct tep_filter_arg *arg;
-   char *str;
-   int i;
-
-   src_pevent = source->pevent;
-   dest_pevent = dest->pevent;
-
-   /* Do nothing if either of the filters has nothing to filter */
-   if (!dest->filters || !source->filters)
-   return 0;
-
-   for (i = 0; i < dest->filters; i++) {
-   filter_type = &dest->event_filters[i];
-   arg = filter_type->filter;
-   if (arg->type != TEP_FILTER_ARG_BOOLEAN)
-   continue;
-   if ((arg->boolean.value && type == TEP_FILTER_TRIVIAL_FALSE) ||
-   (!arg->boolean.value && type == TEP_FILTER_TRIVIAL_TRUE))
-   continue;
-
-   event = filter_type->event;
-
-   if (src_pevent != dest_pevent) {
-   /* do a look up */
-   event = tep_find_event_by_name(src_pevent,
-  event->system,
-  event->name);
-   if (!event)
-   return -1;
-   }
-
-   str = tep_filter_make_string(source, event->id);
-   if (!str)
-   continue;
-
-   /* Don't bother if the filter is trivial too */
-   if (strcmp(str, "TRUE") != 0 && strcmp(str, "FALSE") != 0)
-   filter_event(dest, event, str, NULL);
-   free(str);
-   }
-   return 0;
-}
-
-/**
- * 

[PATCH 4/8] tools/lib/traceevent: rename input arguments of libtraceevent APIs from pevent to tep

2019-03-26 Thread Tzvetomir Stoyanov
This patch renames "struct tep_handle *pevent" input arguments of libtraceevent
APIs to "struct tep_handle *tep". This makes the API consistent with the chosen
naming convention: tep (trace event parser), instead of the old pevent.

Signed-off-by: Tzvetomir Stoyanov 
---
 tools/lib/traceevent/event-parse-api.c | 132 +++---
 tools/lib/traceevent/event-parse-local.h   |   6 +-
 tools/lib/traceevent/event-parse.c | 475 ++---
 tools/lib/traceevent/event-parse.h | 120 +++---
 tools/lib/traceevent/event-plugin.c|   8 +-
 tools/lib/traceevent/parse-filter.c|   8 +-
 tools/lib/traceevent/plugin_cfg80211.c |   8 +-
 tools/lib/traceevent/plugin_function.c |   8 +-
 tools/lib/traceevent/plugin_hrtimer.c  |  12 +-
 tools/lib/traceevent/plugin_jbd2.c |  12 +-
 tools/lib/traceevent/plugin_kmem.c |  28 +-
 tools/lib/traceevent/plugin_kvm.c  |  44 +-
 tools/lib/traceevent/plugin_mac80211.c |   8 +-
 tools/lib/traceevent/plugin_sched_switch.c |  16 +-
 tools/lib/traceevent/plugin_scsi.c |   8 +-
 tools/lib/traceevent/plugin_xen.c  |   8 +-
 16 files changed, 450 insertions(+), 451 deletions(-)

diff --git a/tools/lib/traceevent/event-parse-api.c 
b/tools/lib/traceevent/event-parse-api.c
index 1fe284b1fac8..70b9bdb246fe 100644
--- a/tools/lib/traceevent/event-parse-api.c
+++ b/tools/lib/traceevent/event-parse-api.c
@@ -92,11 +92,11 @@ bool tep_test_flag(struct tep_handle *tep, enum tep_flag 
flag)
return false;
 }
 
-unsigned short tep_data2host2(struct tep_handle *pevent, unsigned short data)
+unsigned short tep_data2host2(struct tep_handle *tep, unsigned short data)
 {
unsigned short swap;
 
-   if (!pevent || pevent->host_bigendian == pevent->file_bigendian)
+   if (!tep || tep->host_bigendian == tep->file_bigendian)
return data;
 
swap = ((data & 0xffULL) << 8) |
@@ -105,11 +105,11 @@ unsigned short tep_data2host2(struct tep_handle *pevent, 
unsigned short data)
return swap;
 }
 
-unsigned int tep_data2host4(struct tep_handle *pevent, unsigned int data)
+unsigned int tep_data2host4(struct tep_handle *tep, unsigned int data)
 {
unsigned int swap;
 
-   if (!pevent || pevent->host_bigendian == pevent->file_bigendian)
+   if (!tep || tep->host_bigendian == tep->file_bigendian)
return data;
 
swap = ((data & 0xffULL) << 24) |
@@ -121,11 +121,11 @@ unsigned int tep_data2host4(struct tep_handle *pevent, 
unsigned int data)
 }
 
 unsigned long long
-tep_data2host8(struct tep_handle *pevent, unsigned long long data)
+tep_data2host8(struct tep_handle *tep, unsigned long long data)
 {
unsigned long long swap;
 
-   if (!pevent || pevent->host_bigendian == pevent->file_bigendian)
+   if (!tep || tep->host_bigendian == tep->file_bigendian)
return data;
 
swap = ((data & 0xffULL) << 56) |
@@ -142,15 +142,15 @@ tep_data2host8(struct tep_handle *pevent, unsigned long 
long data)
 
 /**
  * tep_get_header_page_size - get size of the header page
- * @pevent: a handle to the tep_handle
+ * @tep: a handle to the tep_handle
  *
  * This returns size of the header page
- * If @pevent is NULL, 0 is returned.
+ * If @tep is NULL, 0 is returned.
  */
-int tep_get_header_page_size(struct tep_handle *pevent)
+int tep_get_header_page_size(struct tep_handle *tep)
 {
-   if (pevent)
-   return pevent->header_page_size_size;
+   if (tep)
+   return tep->header_page_size_size;
return 0;
 }
 
@@ -170,163 +170,163 @@ int tep_get_header_timestamp_size(struct tep_handle 
*tep)
 
 /**
  * tep_get_cpus - get the number of CPUs
- * @pevent: a handle to the tep_handle
+ * @tep: a handle to the tep_handle
  *
  * This returns the number of CPUs
- * If @pevent is NULL, 0 is returned.
+ * If @tep is NULL, 0 is returned.
  */
-int tep_get_cpus(struct tep_handle *pevent)
+int tep_get_cpus(struct tep_handle *tep)
 {
-   if (pevent)
-   return pevent->cpus;
+   if (tep)
+   return tep->cpus;
return 0;
 }
 
 /**
  * tep_set_cpus - set the number of CPUs
- * @pevent: a handle to the tep_handle
+ * @tep: a handle to the tep_handle
  *
  * This sets the number of CPUs
  */
-void tep_set_cpus(struct tep_handle *pevent, int cpus)
+void tep_set_cpus(struct tep_handle *tep, int cpus)
 {
-   if (pevent)
-   pevent->cpus = cpus;
+   if (tep)
+   tep->cpus = cpus;
 }
 
 /**
  * tep_get_long_size - get the size of a long integer on the traced machine
- * @pevent: a handle to the tep_handle
+ * @tep: a handle to the tep_handle
  *
  * This returns the size of a long integer on the traced machine
- * If @pevent is NULL, 0 is returned.
+ * If @tep is NULL, 0 is returned.
  */
-int tep_get_long_size(struct tep_handle *pevent)
+int tep_get_long_size(struct tep_handle *tep)
 {
-   if (pevent)
-   return 

[PATCH v3] kmemleak: survive in a low-memory situation

2019-03-26 Thread Qian Cai
Kmemleak could quickly fail to allocate an object structure and then
disable itself in a low-memory situation. For example, running a mmap()
workload triggering swapping and OOM. This is especially problematic for
running things like LTP testsuite where one OOM test case would disable
the whole kmemleak and render the rest of test cases without kmemleak
watching for leaking.

The kmemleak object allocation could fail even though the allocation of
the tracked memory succeeded. Hence, it could still try to start a direct
reclaim if it is not executed in an atomic context (spinlock, irq-handler
etc), or a high-priority allocation in an atomic context as a last-ditch
effort. Since kmemleak is a debug feature, it is unlikely to be used in
production, where memory resources are scarce and direct reclaim or
high-priority atomic allocations should not be granted lightly.

Unless there is a brave soul to reimplement the kmemleak to embed its
metadata into the tracked memory itself in a foreseeable future, this
provides a good balance between enabling kmemleak in a low-memory
situation and not introducing too much hackiness into the existing
code for now.

Signed-off-by: Qian Cai 
---

v3: Update the commit log.
Simplify the code inspired by graph_trace_open() from ftrace.
v2: Remove the needless checking for NULL objects in slab_post_alloc_hook()
per Catalin.

 mm/kmemleak.c | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index a2d894d3de07..239927166894 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -581,6 +581,17 @@ static struct kmemleak_object *create_object(unsigned long 
ptr, size_t size,
unsigned long untagged_ptr;
 
object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp));
+   if (!object) {
+   /*
+* The tracked memory was allocated successful, if the kmemleak
+* object failed to allocate for some reasons, it ends up with
+* the whole kmemleak disabled, so let it success at all cost.
+*/
+   gfp = (in_atomic() || irqs_disabled()) ? GFP_ATOMIC :
+  gfp_kmemleak_mask(gfp) | __GFP_DIRECT_RECLAIM;
+   object = kmem_cache_alloc(object_cache, gfp);
+   }
+
if (!object) {
pr_warn("Cannot allocate a kmemleak_object structure\n");
kmemleak_disable();
-- 
2.17.2 (Apple Git-113)



[PATCH 7/8] tools/lib/traceevent: rename input arguments and local variables of libtraceevent from pevent to tep

2019-03-26 Thread Tzvetomir Stoyanov
This patch renames from "pevent" to "tep":
 - all "pevent" input arguments of libtraceevent internal functions.
 - all local "pevent" variables of libtraceevent.
This makes the implementation consistent with the chosen naming convention,
tep (trace event parser), and will avoid any confusion with the old pevent name

Signed-off-by: Tzvetomir Stoyanov 
---
 tools/lib/traceevent/event-parse.c | 306 -
 tools/lib/traceevent/event-plugin.c|  24 +-
 tools/lib/traceevent/parse-filter.c|  26 +--
 tools/lib/traceevent/plugin_function.c |   6 +-
 4 files changed, 181 insertions(+), 181 deletions(-)

diff --git a/tools/lib/traceevent/event-parse.c 
b/tools/lib/traceevent/event-parse.c
index de17fe9f0f62..d9cbf172d99f 100644
--- a/tools/lib/traceevent/event-parse.c
+++ b/tools/lib/traceevent/event-parse.c
@@ -148,14 +148,14 @@ struct cmdline_list {
int pid;
 };
 
-static int cmdline_init(struct tep_handle *pevent)
+static int cmdline_init(struct tep_handle *tep)
 {
-   struct cmdline_list *cmdlist = pevent->cmdlist;
+   struct cmdline_list *cmdlist = tep->cmdlist;
struct cmdline_list *item;
struct tep_cmdline *cmdlines;
int i;
 
-   cmdlines = malloc(sizeof(*cmdlines) * pevent->cmdline_count);
+   cmdlines = malloc(sizeof(*cmdlines) * tep->cmdline_count);
if (!cmdlines)
return -1;
 
@@ -169,15 +169,15 @@ static int cmdline_init(struct tep_handle *pevent)
free(item);
}
 
-   qsort(cmdlines, pevent->cmdline_count, sizeof(*cmdlines), cmdline_cmp);
+   qsort(cmdlines, tep->cmdline_count, sizeof(*cmdlines), cmdline_cmp);
 
-   pevent->cmdlines = cmdlines;
-   pevent->cmdlist = NULL;
+   tep->cmdlines = cmdlines;
+   tep->cmdlist = NULL;
 
return 0;
 }
 
-static const char *find_cmdline(struct tep_handle *pevent, int pid)
+static const char *find_cmdline(struct tep_handle *tep, int pid)
 {
const struct tep_cmdline *comm;
struct tep_cmdline key;
@@ -185,13 +185,13 @@ static const char *find_cmdline(struct tep_handle 
*pevent, int pid)
if (!pid)
return "";
 
-   if (!pevent->cmdlines && cmdline_init(pevent))
+   if (!tep->cmdlines && cmdline_init(tep))
return "";
 
key.pid = pid;
 
-   comm = bsearch(, pevent->cmdlines, pevent->cmdline_count,
-  sizeof(*pevent->cmdlines), cmdline_cmp);
+   comm = bsearch(, tep->cmdlines, tep->cmdline_count,
+  sizeof(*tep->cmdlines), cmdline_cmp);
 
if (comm)
return comm->comm;
@@ -232,10 +232,10 @@ bool tep_is_pid_registered(struct tep_handle *tep, int 
pid)
  * we must add this pid. This is much slower than when cmdlines
  * are added before the array is initialized.
  */
-static int add_new_comm(struct tep_handle *pevent,
+static int add_new_comm(struct tep_handle *tep,
const char *comm, int pid, bool override)
 {
-   struct tep_cmdline *cmdlines = pevent->cmdlines;
+   struct tep_cmdline *cmdlines = tep->cmdlines;
struct tep_cmdline *cmdline;
struct tep_cmdline key;
char *new_comm;
@@ -246,8 +246,8 @@ static int add_new_comm(struct tep_handle *pevent,
/* avoid duplicates */
key.pid = pid;
 
-   cmdline = bsearch(, pevent->cmdlines, pevent->cmdline_count,
-  sizeof(*pevent->cmdlines), cmdline_cmp);
+   cmdline = bsearch(, tep->cmdlines, tep->cmdline_count,
+ sizeof(*tep->cmdlines), cmdline_cmp);
if (cmdline) {
if (!override) {
errno = EEXIST;
@@ -264,26 +264,26 @@ static int add_new_comm(struct tep_handle *pevent,
return 0;
}
 
-   cmdlines = realloc(cmdlines, sizeof(*cmdlines) * (pevent->cmdline_count 
+ 1));
+   cmdlines = realloc(cmdlines, sizeof(*cmdlines) * (tep->cmdline_count + 
1));
if (!cmdlines) {
errno = ENOMEM;
return -1;
}
 
-   cmdlines[pevent->cmdline_count].comm = strdup(comm);
-   if (!cmdlines[pevent->cmdline_count].comm) {
+   cmdlines[tep->cmdline_count].comm = strdup(comm);
+   if (!cmdlines[tep->cmdline_count].comm) {
free(cmdlines);
errno = ENOMEM;
return -1;
}
 
-   cmdlines[pevent->cmdline_count].pid = pid;
+   cmdlines[tep->cmdline_count].pid = pid;

-   if (cmdlines[pevent->cmdline_count].comm)
-   pevent->cmdline_count++;
+   if (cmdlines[tep->cmdline_count].comm)
+   tep->cmdline_count++;
 
-   qsort(cmdlines, pevent->cmdline_count, sizeof(*cmdlines), cmdline_cmp);
-   pevent->cmdlines = cmdlines;
+   qsort(cmdlines, tep->cmdline_count, sizeof(*cmdlines), cmdline_cmp);
+   tep->cmdlines = cmdlines;
 
return 0;
 }
@@ -408,18 +408,18 @@ static 

[PATCH 0/8] Cleanup traceevent API and make it more consistent

2019-03-26 Thread Tzvetomir Stoyanov
This patch series does a cleanup of traceevent implementation and APIs:
 - All "pevent" function parameters and local variables are renamed to "tep". 
   This makes the implementation consistent with the chosen naming convention, 
   tep (trace event parser), and avoids any confusion with the old "pevent" 
name 
 - Removed deprecated tep filter trivial APIs.
 - Removed call to exit() from one of traceevent APIs, a library function 
should 
   not force the application to exit.
 - Added counter to track parsing failures.

Changes depend on "Few patches, related to libtracevent APIs" patchset and 
should be applied on top of it.

Tzvetomir Stoyanov (8):
  tools/perf,tools/lib/traceevent: Make traceevent APIs more consistent
  tools/lib/traceevent: Add counter to track parsing failures
  tools/lib/traceevent: Remove tep filter trivial APIs
  tools/lib/traceevent: rename input arguments of libtraceevent APIs
from pevent to tep
  tools/perf,tools/lib/traceevent: rename "pevent" member of struct
tep_event to "tep"
  tools/perf,tools/lib/traceevent: rename "pevent" member of struct
tep_event_filter to "tep"
  tools/lib/traceevent: rename input arguments and local variables of
libtraceevent from pevent to tep
  tools/lib/traceevent: remove call to exit() from
tep_filter_add_filter_str()

 tools/lib/traceevent/event-parse-api.c| 181 ++--
 tools/lib/traceevent/event-parse-local.h  |   8 +-
 tools/lib/traceevent/event-parse.c| 817 +-
 tools/lib/traceevent/event-parse.h| 148 ++--
 tools/lib/traceevent/event-plugin.c   |  32 +-
 tools/lib/traceevent/parse-filter.c   | 216 +
 tools/lib/traceevent/plugin_cfg80211.c|   8 +-
 tools/lib/traceevent/plugin_function.c|  14 +-
 tools/lib/traceevent/plugin_hrtimer.c |  12 +-
 tools/lib/traceevent/plugin_jbd2.c|  12 +-
 tools/lib/traceevent/plugin_kmem.c|  32 +-
 tools/lib/traceevent/plugin_kvm.c |  48 +-
 tools/lib/traceevent/plugin_mac80211.c|   8 +-
 tools/lib/traceevent/plugin_sched_switch.c|  18 +-
 tools/lib/traceevent/plugin_scsi.c|   8 +-
 tools/lib/traceevent/plugin_xen.c |   8 +-
 tools/perf/builtin-kmem.c |   2 +-
 tools/perf/util/data-convert-bt.c |   4 +-
 tools/perf/util/python.c  |   2 +-
 .../util/scripting-engines/trace-event-perl.c |   2 +-
 .../scripting-engines/trace-event-python.c|   2 +-
 tools/perf/util/trace-event-parse.c   |   2 +-
 tools/perf/util/trace-event-read.c|   2 +-
 tools/perf/util/trace-event.c |   4 +-
 24 files changed, 719 insertions(+), 871 deletions(-)

-- 
2.20.1



[PATCH 2/8] tools/lib/traceevent: Add counter to track parsing failures

2019-03-26 Thread Tzvetomir Stoyanov
This patch adds a parsing failures counter to struct tep_handle. The counter can
be used to track failures on parsing event format files. It is updated
automatically by tep_parse_event(), when failure is detected. The patch also
adds two new APIs for accessing the counter:
 tep_get_parsing_failures() - returns the current value of the counter.
 tep_clear_parsing_failures() - clears the counter.

Signed-off-by: Tzvetomir Stoyanov 
---
 tools/lib/traceevent/event-parse-api.c   | 27 
 tools/lib/traceevent/event-parse-local.h |  2 ++
 tools/lib/traceevent/event-parse.c   |  8 ++-
 tools/lib/traceevent/event-parse.h   |  2 ++
 4 files changed, 38 insertions(+), 1 deletion(-)

diff --git a/tools/lib/traceevent/event-parse-api.c 
b/tools/lib/traceevent/event-parse-api.c
index 46670bb87051..1fe284b1fac8 100644
--- a/tools/lib/traceevent/event-parse-api.c
+++ b/tools/lib/traceevent/event-parse-api.c
@@ -371,3 +371,30 @@ void tep_set_test_filters(struct tep_handle *tep, int 
test_filters)
if (tep)
tep->test_filters = test_filters;
 }
+
+/**
+ * tep_get_parsing_failures - get the count of parsing failures
+ * @tep: a handle to a tep_handle
+ *
+ * Every time when tep_parse_event() fails to parse an event format file,
+ * a parsing failure is registered. This returns the parsing failures count.
+ */
+int tep_get_parsing_failures(struct tep_handle *tep)
+{
+   if (tep)
+   return tep->parsing_failures;
+   return 0;
+}
+
+/**
+ * tep_clear_parsing_failures - clear parsing failures count
+ * @tep: a handle to a tep_handle
+ *
+ * Every time when tep_parse_event() fails to parse an event format file,
+ * a parsing failure is registered. This clears the parsing failures counter.
+ */
+void tep_clear_parsing_failures(struct tep_handle *tep)
+{
+   if (tep)
+   tep->parsing_failures = 0;
+}
diff --git a/tools/lib/traceevent/event-parse-local.h 
b/tools/lib/traceevent/event-parse-local.h
index 35833ee32d6c..c5c8eb4c4ab7 100644
--- a/tools/lib/traceevent/event-parse-local.h
+++ b/tools/lib/traceevent/event-parse-local.h
@@ -83,6 +83,8 @@ struct tep_handle {
struct event_handler *handlers;
struct tep_function_handler *func_handlers;
 
+   int parsing_failures;
+
/* cache */
struct tep_event *last_event;
 
diff --git a/tools/lib/traceevent/event-parse.c 
b/tools/lib/traceevent/event-parse.c
index f309b6d7e08a..4144c4e20e4e 100644
--- a/tools/lib/traceevent/event-parse.c
+++ b/tools/lib/traceevent/event-parse.c
@@ -6230,7 +6230,13 @@ enum tep_errno tep_parse_event(struct tep_handle 
*pevent, const char *buf,
   unsigned long size, const char *sys)
 {
struct tep_event *event = NULL;
-   return __parse_event(pevent, , buf, size, sys);
+   enum tep_errno ret;
+
+   ret = __parse_event(pevent, , buf, size, sys);
+   if (ret != TEP_ERRNO__SUCCESS)
+   pevent->parsing_failures++;
+
+   return ret;
 }
 
 int get_field_val(struct trace_seq *s, struct tep_format_field *field,
diff --git a/tools/lib/traceevent/event-parse.h 
b/tools/lib/traceevent/event-parse.h
index f3b822736d87..dcf0385684b8 100644
--- a/tools/lib/traceevent/event-parse.h
+++ b/tools/lib/traceevent/event-parse.h
@@ -464,6 +464,8 @@ enum tep_errno tep_parse_format(struct tep_handle *pevent,
struct tep_event **eventp,
const char *buf,
unsigned long size, const char *sys);
+int tep_get_parsing_failures(struct tep_handle *tep);
+void tep_clear_parsing_failures(struct tep_handle *tep);
 
 void *tep_get_field_raw(struct trace_seq *s, struct tep_event *event,
const char *name, struct tep_record *record,
-- 
2.20.1



[tip:core/urgent] proc/kcore: Remove unused kclist_add_remap()

2019-03-26 Thread tip-bot for Bhupesh Sharma
Commit-ID:  db779ef67ffeadbb44e9e818eb64dbe528e2f48f
Gitweb: https://git.kernel.org/tip/db779ef67ffeadbb44e9e818eb64dbe528e2f48f
Author: Bhupesh Sharma 
AuthorDate: Tue, 26 Mar 2019 12:20:28 +0530
Committer:  Borislav Petkov 
CommitDate: Tue, 26 Mar 2019 16:36:03 +0100

proc/kcore: Remove unused kclist_add_remap()

Commit

  bf904d2762ee ("x86/pti/64: Remove the SYSCALL64 entry trampoline")

removed the sole usage of kclist_add_remap() but it did not remove the
left-over definition from the include file.

Fix the same.

Signed-off-by: Bhupesh Sharma 
Signed-off-by: Borislav Petkov 
Cc: Adrian Hunter 
Cc: Andrew Morton 
Cc: Dave Anderson 
Cc: Dave Young 
Cc: "David S. Miller" 
Cc: Ingo Molnar 
Cc: James Morse 
Cc: Kairui Song 
Cc: ke...@lists.infradead.org
Cc: linux-arm-ker...@lists.infradead.org
Cc: linuxppc-...@lists.ozlabs.org
Cc: Michael Ellerman 
Cc: Omar Sandoval 
Cc: "Peter Zijlstra (Intel)" 
Cc: Rahul Lakkireddy 
Cc: Thomas Gleixner 
Cc: x86-ml 
Link: 
https://lkml.kernel.org/r/1553583028-17804-1-git-send-email-bhsha...@redhat.com
---
 include/linux/kcore.h | 11 ---
 1 file changed, 11 deletions(-)

diff --git a/include/linux/kcore.h b/include/linux/kcore.h
index 8c3f8c14eeaa..94b561df3877 100644
--- a/include/linux/kcore.h
+++ b/include/linux/kcore.h
@@ -38,22 +38,11 @@ struct vmcoredd_node {
 
 #ifdef CONFIG_PROC_KCORE
 void __init kclist_add(struct kcore_list *, void *, size_t, int type);
-static inline
-void kclist_add_remap(struct kcore_list *m, void *addr, void *vaddr, size_t sz)
-{
-   m->vaddr = (unsigned long)vaddr;
-   kclist_add(m, addr, sz, KCORE_REMAP);
-}
 #else
 static inline
 void kclist_add(struct kcore_list *new, void *addr, size_t size, int type)
 {
 }
-
-static inline
-void kclist_add_remap(struct kcore_list *m, void *addr, void *vaddr, size_t sz)
-{
-}
 #endif
 
 #endif /* _LINUX_KCORE_H */


Re: [RFC v2 1/2] vfio/pci: export common symbols in vfio-pci

2019-03-26 Thread Alex Williamson
On Tue, 26 Mar 2019 12:37:37 +
"Liu, Yi L"  wrote:

> > From: Alex Williamson [mailto:alex.william...@redhat.com]
> > Sent: Tuesday, March 26, 2019 2:17 AM
> > To: Liu, Yi L 
> > Subject: Re: [RFC v2 1/2] vfio/pci: export common symbols in vfio-pci
> > 
> > On Sat, 23 Mar 2019 11:06:44 +
> > "Liu, Yi L"  wrote:  
> > > Hi Alex,  
> 
> [...]
> 
> > >
> > > I tried to get a common file which includes the definitions of the module
> > > options and the common interfaces and get it linked separately with each
> > > module. It works well when linked separately by config the
> > > CONFIG_VFIO_PCI=m and CONFIG_VFIO_PCI_MDEV=m in kernel
> > > configuration file. CONFIG_VFIO_PCI_MDEV is a new Kconfig macro
> > > for the mdev wrapped version driver. However, if building the vfio-pci
> > > and the mdev wrapped version into kernel image (config the
> > > CONFIG_VFIO_PCI=y and CONFIG_VFIO_PCI_MDEV=y), then the symbols
> > > defined in the common file will be shared thus doesn't allow dissimilar
> > > user settings.
> > >
> > > Per my understanding, I think we expect to allow simultaneous usage of
> > > the two drivers. So I think the way above doesn't meet our expectation.  
> > 
> > I agree.  They should be related in implementation only, from a user
> > perspective they should be entirely separate.
> >   
> > > I considered a possible proposal as below. May listen to your opinion
> > > on it before heading to cook. Also, better idea is welcomed. :-)
> > >
> > > - get a common file includes interfaces which are common and have
> > >   input parameters to differentiate the calling from vfio-pci and the
> > >   wrapped version. e.g. vfio_pci_rw(). may call it as vfio_pci_common.c.
> > >
> > > - get another common file includes the definitions of the module options,
> > >   and the functions which referred the options. Define all of them as 
> > > static.
> > >   may call it as common.c
> > >
> > > - get vfio_pci.c which includes the module_init/exit interfaces and driver
> > >   registration operations of vfio-pci.ko. This file should include the 
> > > common.c
> > >   above to have same module options with the mdev wrapped version.
> > >
> > > - get vfio_pci_mdev.c which includes the module_init/exit interfaces and
> > >   driver registration operations of vfio-pci-mdev.ko. It should also 
> > > include
> > >   the common.c above to have same module options with vfio-pci.ko.
> > >
> > > - Makefile:
> > > vfio-pci-y := vfio_pci.o vfio_pci_common.o vfio_pci_intrs.o 
> > > vfio_pci_rdwr.o  
> > vfio_pci_config.o  
> > > vfio-pci-$(CONFIG_VFIO_PCI_IGD) += vfio_pci_igd.o
> > > vfio-pci-$(CONFIG_VFIO_PCI_NVLINK2) += vfio_pci_nvlink2.o
> > >
> > > vfio-pci-mdev-y := vfio_pci_mdev.o vfio_pci_common.o vfio_pci_intrs.o  
> > vfio_pci_rdwr.o vfio_pci_config.o  
> > > vfio-pci-mdev-$(CONFIG_VFIO_PCI_IGD) += vfio_pci_igd.o
> > > vfio-pci-mdev-$(CONFIG_VFIO_PCI_NVLINK2) += vfio_pci_nvlink2.o
> > >
> > > obj-$(CONFIG_VFIO_PCI) += vfio-pci.o
> > > obj-$(CONFIG_VFIO_PCI_MDEV) += vfio-pci-mdev.o  
> > 
> > Each module needs its own module_init/exit and will register its own
> > struct pci_driver, which gives us separate control of the probe and  
> 
> Agreed.
> 
> > remove callbacks.  I think we want the drivers to have the same module
> > parameters initially, but we don't necessarily want to require it for
> > any future options, so we can duplicate the parameter declarations.
> > Then to support the shared code, I think we can easily push nointxmask,
> > disable_vga, and disable_idle_d3 into bools on the struct
> > vfio_pci_device, which would be allocated and set by each module's
> > probe function before calling the shared probe function.  
> 
> sounds good to me. 
> 
> > vfio_fill_ids() could take a pointer to the array to keep them separate
> > between modules.   
> 
> Agreed.
> 
> > I think that just leaves the config permission bits,
> > vfio_pci_{un}init_perm_bits(). Could we use a simple atomic reference
> > counter on those to potentially share them so they get initialized by
> > the first caller and freed by the last user, at least in the case of
> > both drivers being compiled statically into the kernel?  Thanks,  
> 
> Sure, I can add it. The two modules will still share the cap_perms and
> ecap_perms config bits when built statically in kernel. However, I think
> such share is reasonable. I'll check if any other similar bits in other files.
> 
> > Alex  
> 
> Thanks for the suggestions, Alex. Let me prepare another RFC.

Thanks Yi, I appreciate your work on this.  Also, I wonder if we might
want to reconsider placing this driver in samples, the Makefile might
be a little bit ugly with paths back to drivers/vfio/pci, but I don't
think we run into the same barriers as you did with previous
approaches.  Placing it in samples would at least alleviate any
confusion that this isn't a vfio-pci replacement, but more of an mdev
wrapper proof of concept.  Thanks,

Alex


Re: [PATCH 27/27] kexec: Allow kexec_file() with appropriate IMA policy when locked down

2019-03-26 Thread Mimi Zohar
On Mon, 2019-03-25 at 15:09 -0700, Matthew Garrett wrote:
> Systems in lockdown mode should block the kexec of untrusted kernels.
> For x86 and ARM we can ensure that a kernel is trustworthy by validating
> a PE signature, but this isn't possible on other architectures. On those
> platforms we can use IMA digital signatures instead. Add a function to
> determine whether IMA has or will verify signatures for a given event type,
> and if so permit kexec_file() even if the kernel is otherwise locked down.
> This is restricted to cases where CONFIG_INTEGRITY_TRUSTED_KEYRING is set
> in order to prevent an attacker from loading additional keys at runtime.
> 
> Signed-off-by: Matthew Garrett 
> Cc: Mimi Zohar 
> Cc: Dmitry Kasatkin 
> Cc: linux-integr...@vger.kernel.org

Acked-by: Mimi Zohar 

> ---
>  include/linux/ima.h |  9 ++
>  kernel/kexec_file.c |  7 +++-
>  security/integrity/ima/ima.h|  2 ++
>  security/integrity/ima/ima_main.c   |  2 +-
>  security/integrity/ima/ima_policy.c | 50 +
>  5 files changed, 68 insertions(+), 2 deletions(-)
> 
> diff --git a/include/linux/ima.h b/include/linux/ima.h
> index b5e16b8c50b7..05921227d700 100644
> --- a/include/linux/ima.h
> +++ b/include/linux/ima.h
> @@ -127,4 +127,13 @@ static inline int ima_inode_removexattr(struct dentry 
> *dentry,
>   return 0;
>  }
>  #endif /* CONFIG_IMA_APPRAISE */
> +
> +#if defined(CONFIG_IMA_APPRAISE) && defined(CONFIG_INTEGRITY_TRUSTED_KEYRING)
> +extern bool ima_appraise_signature(enum kernel_read_file_id func);
> +#else
> +static inline bool ima_appraise_kexec_signature(enum kernel_read_file_id 
> func)
> +{
> + return false;
> +}
> +#endif /* CONFIG_IMA_APPRAISE && CONFIG_INTEGRITY_TRUSTED_KEYRING */
>  #endif /* _LINUX_IMA_H */
> diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
> index 0cfe4f6f7f85..8ffa4b75c620 100644
> --- a/kernel/kexec_file.c
> +++ b/kernel/kexec_file.c
> @@ -240,7 +240,12 @@ kimage_file_prepare_segments(struct kimage *image, int 
> kernel_fd, int initrd_fd,
>  
>   ret = 0;
>  
> - if (kernel_is_locked_down(reason)) {
> + /* If IMA is guaranteed to appraise a signature on the kexec
> +  * image, permit it even if the kernel is otherwise locked
> +  * down.
> +  */
> + if (!ima_appraise_signature(READING_KEXEC_IMAGE) &&
> + kernel_is_locked_down(reason)) {
>   ret = -EPERM;
>   goto out;
>   }
> diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h
> index cc12f3449a72..fe03cc6f1ca4 100644
> --- a/security/integrity/ima/ima.h
> +++ b/security/integrity/ima/ima.h
> @@ -115,6 +115,8 @@ struct ima_kexec_hdr {
>   u64 count;
>  };
>  
> +extern const int read_idmap[];
> +
>  #ifdef CONFIG_HAVE_IMA_KEXEC
>  void ima_load_kexec_buffer(void);
>  #else
> diff --git a/security/integrity/ima/ima_main.c 
> b/security/integrity/ima/ima_main.c
> index 4ffac4f5c647..106f06dee9d1 100644
> --- a/security/integrity/ima/ima_main.c
> +++ b/security/integrity/ima/ima_main.c
> @@ -442,7 +442,7 @@ int ima_read_file(struct file *file, enum 
> kernel_read_file_id read_id)
>   return 0;
>  }
>  
> -static const int read_idmap[READING_MAX_ID] = {
> +const int read_idmap[READING_MAX_ID] = {
>   [READING_FIRMWARE] = FIRMWARE_CHECK,
>   [READING_FIRMWARE_PREALLOC_BUFFER] = FIRMWARE_CHECK,
>   [READING_MODULE] = MODULE_CHECK,
> diff --git a/security/integrity/ima/ima_policy.c 
> b/security/integrity/ima/ima_policy.c
> index 122797023bdb..f8f1cdb74a4f 100644
> --- a/security/integrity/ima/ima_policy.c
> +++ b/security/integrity/ima/ima_policy.c
> @@ -1341,3 +1341,53 @@ int ima_policy_show(struct seq_file *m, void *v)
>   return 0;
>  }
>  #endif   /* CONFIG_IMA_READ_POLICY */
> +
> +#if defined(CONFIG_IMA_APPRAISE) && defined(CONFIG_INTEGRITY_TRUSTED_KEYRING)
> +/*
> + * ima_appraise_signature: whether IMA will appraise a given function using
> + * an IMA digital signature. This is restricted to cases where the kernel
> + * has a set of built-in trusted keys in order to avoid an attacker simply
> + * loading additional keys.
> + */
> +bool ima_appraise_signature(enum kernel_read_file_id id)
> +{
> + struct ima_rule_entry *entry;
> + bool found = false;
> + enum ima_hooks func;
> +
> + if (id >= READING_MAX_ID)
> + return false;
> +
> + func = read_idmap[id] ?: FILE_CHECK;
> +
> + rcu_read_lock();
> + list_for_each_entry_rcu(entry, ima_rules, list) {
> + if (entry->action != APPRAISE)
> + continue;
> +
> + /*
> +  * A generic entry will match, but otherwise require that it
> +  * match the func we're looking for
> +  */
> + if (entry->func && entry->func != func)
> + continue;
> +
> + /*
> + 

Re: [PATCH -next] x86/apic: Reduce print level of CPU limit announcement

2019-03-26 Thread Leon Romanovsky
On Tue, Mar 26, 2019 at 04:12:27PM +0100, Rafael J. Wysocki wrote:
> On Tue, Mar 26, 2019 at 3:41 PM Leon Romanovsky  wrote:
> >
> > On Tue, Mar 26, 2019 at 01:29:54PM +0100, Rafael J. Wysocki wrote:
> > > On Tue, Mar 26, 2019 at 1:02 PM Leon Romanovsky  wrote:
> > > >
> > > > From: Leon Romanovsky 
> > > >
> > > > A kernel booted with fewer possible CPUs (possible_cpus kernel boot
> > > > option) than available CPUs will print messages like this:
> > > >
> > > > [1.131039] APIC: NR_CPUS/possible_cpus limit of 8 reached. 
> > > > Processor 55/0x1f ignored.
> > > > [1.132228] ACPI: Unable to map lapic to logical cpu number
> > > >
> > > > Those warnings are printed for every not-enabled CPU and on the systems
> > > > with large number of such CPUs, we see a lot of those prints for default
> > > > print level.
> > > >
> > > > Simple conversion of those prints to be in debug level removes them
> > > > while leaving the option to debug system.
> > >
> > > But generally dynamic debug must be enabled in order for pr_debug()
> > > prints to be visible which is kind of cumbersome to do via the command
> > > line.
> >
> > It is doable and documented pretty well, which is uncommon :)
> > https://www.kernel.org/doc/html/latest/admin-guide/dynamic-debug-howto.html#debug-messages-during-boot-process
>
> I know.
>
> That's what I mean by "kind of cumbersome", because you need to know
> which debug messages to enable upfront.
>
> > >
> > > > Signed-off-by: Leon Romanovsky 
> > > > ---
> > > >  arch/x86/kernel/acpi/boot.c | 2 +-
> > > >  arch/x86/kernel/apic/apic.c | 6 +++---
> > > >  2 files changed, 4 insertions(+), 4 deletions(-)
> > > >
> > > > diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
> > > > index 8dcbf6890714..3ef8ab89c02d 100644
> > > > --- a/arch/x86/kernel/acpi/boot.c
> > > > +++ b/arch/x86/kernel/acpi/boot.c
> > > > @@ -770,7 +770,7 @@ int acpi_map_cpu(acpi_handle handle, phys_cpuid_t 
> > > > physid, u32 acpi_id,
> > > >
> > > > cpu = acpi_register_lapic(physid, acpi_id, ACPI_MADT_ENABLED);
> > > > if (cpu < 0) {
> > > > -   pr_info(PREFIX "Unable to map lapic to logical cpu 
> > > > number\n");
> > > > +   pr_debug(PREFIX "Unable to map lapic to logical cpu 
> > > > number\n");
> > >
> > > And this one is printed sometimes when something really goes wrong
> > > which may be really hard to debug otherwise, so there is value in the
> > > info level here.
> > >
> > > Would it be possible to avoid printing it just in some cases?
> >
> > This can do the trick:
> >
> > diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
> > index 3ef8ab89c02d..00212b3991e0 100644
> > --- a/arch/x86/kernel/acpi/boot.c
> > +++ b/arch/x86/kernel/acpi/boot.c
> > @@ -770,7 +770,10 @@ int acpi_map_cpu(acpi_handle handle, phys_cpuid_t 
> > physid, u32 acpi_id,
> >
> > cpu = acpi_register_lapic(physid, acpi_id, ACPI_MADT_ENABLED);
> > if (cpu < 0) {
> > -   pr_debug(PREFIX "Unable to map lapic to logical cpu 
> > number\n");
> > +   if (cpu == -ENOENT)
> > +   pr_debug(PREFIX "Unable to map lapic to logical cpu 
> > number\n");
>
> I don't think it is necessary to print this in the -ENOENT case, as
> there is a message for that case that will be printed anyway.

Agree, how do you want me to progress? Should I resend patch?

Thanks


signature.asc
Description: PGP signature


RE: [PATCH 8/8] vfio/mdev: Improve the create/remove sequence

2019-03-26 Thread Parav Pandit


> -Original Message-
> From: Kirti Wankhede 
> Sent: Tuesday, March 26, 2019 2:06 AM
> To: Parav Pandit ; k...@vger.kernel.org; linux-
> ker...@vger.kernel.org; alex.william...@redhat.com
> Cc: Neo Jia 
> Subject: Re: [PATCH 8/8] vfio/mdev: Improve the create/remove sequence
> 
> 
> 
> On 3/23/2019 4:50 AM, Parav Pandit wrote:
> > There are five problems with current code structure.
> > 1. mdev device is placed on the mdev bus before it is created in the
> > vendor driver. Once a device is placed on the mdev bus without
> > creating its supporting underlying vendor device, an open() can get
> > triggered by userspace on partially initialized device.
> > Below ladder diagram highlight it.
> >
> >   cpu-0   cpu-1
> >   -   -
> >create_store()
> >  mdev_create_device()
> >device_register()
> >   ...
> >  vfio_mdev_probe()
> >  ...creates char device
> > vfio_mdev_open()
> >   parent->ops->open(mdev)
> > vfio_ap_mdev_open()
> >   matrix_mdev = NULL
> > [...]
> > parent->ops->create()
> >   vfio_ap_mdev_create()
> > mdev_set_drvdata(mdev, matrix_mdev);
> > /* Valid pointer set above */
> >
> 
> VFIO interface uses sysfs path of device or PCI device's BDF where it checks
> sysfs file for that device exist.
> In case of VFIO mdev device, above situation will never happen as open will
> only get called if sysfs entry for that device exist.
> 
> If you don't use VFIO interface then this situation can arise. In that case
> probe() can be used for very basic initialization then create actual char
> device from create().
> 
I explained to you that create() cannot do the heavy lifting work of creating
netdev and rdma dev because at that stage the driver doesn't know whether it's
getting used for VM or host.
create() needs to create the device that probe() can work on in stable manner.

> 
> > 2. Current creation sequence is,
> >parent->ops_create()
> >groups_register()
> >
> > Remove sequence is,
> >parent->ops->remove()
> >groups_unregister()
> > However, remove sequence should be exact mirror of creation sequence.
> > Once this is achieved, all users of the mdev will be terminated first
> > before removing underlying vendor device.
> > (Follow standard linux driver model).
> > At that point vendor's remove() ops shouldn't failed because device is
> > taken off the bus that should terminate the users.
> >
> 
> If VMM or user space application is using mdev device,
> parent->ops->remove() can return failure. In that case sysfs files
> shouldn't be removed. Hence above sequence is followed for remove.
> 
> Standard linux driver model doesn't allow remove() to fail, but in the mdev
> framework, interface is defined to handle such error case.
> 
But the sequence is incorrect for wider use case.
> 
> > 3. Additionally any new mdev driver that wants to work on mdev device
> > during probe() routine registered using mdev_register_driver() needs
> > to get stable mdev structure.
> >
> 
> Things that you are trying to handle with mdev structure from probe(),
> couldn't that be moved to create()?
> 
No, as explained before and above.
That approach just doesn't look right.
 
> 
> > 4. In following sequence, child devices created while removing mdev
> > parent device can be left out, or it may lead to race of removing half
> > initialized child mdev devices.
> >
> > issue-1:
> > 
> >cpu-0 cpu-1
> >- -
> >   mdev_unregister_device()
> >  device_for_each_child()
> > mdev_device_remove_cb()
> > mdev_device_remove()
> > create_store()
> >   mdev_device_create()   [...]
> >device_register()
> >   parent_remove_sysfs_files()
> >   /* BUG: device added by cpu-0
> >* whose parent is getting removed.
> >*/
> >
> > issue-2:
> > 
> >cpu-0 cpu-1
> >- -
> > create_store()
> >   mdev_device_create()   [...]
> >device_register()
> >
> >[...]  mdev_unregister_device()
> >  device_for_each_child()
> > mdev_device_remove_cb()
> > mdev_device_remove()
> >
> >mdev_create_sysfs_files()
> >/* BUG: create is adding
> > * sysfs files for a 

Re: [PATCH v2 3/3] rcu: validate arguments for rcu tracepoints

2019-03-26 Thread Steven Rostedt
On Tue, 26 Mar 2019 08:18:15 -0700
"Paul E. McKenney"  wrote:

> On Tue, Mar 26, 2019 at 08:13:11PM +0800, Yafang Shao wrote:
> > When CONFIG_RCU_TRACE is not set, all these tracepoints are defined as
> > do-nothing macro.
> > We'd better make those inline functions that take proper arguments.
> > 
> > As RCU_TRACE() is defined as do-nothing macro as well when
> > CONFIG_RCU_TRACE is not set, so we can clean it up.  
> 
> How about this for the commit log?
> 
>   Unless the CONFIG_RCU_TRACE kconfig option is set, almost all
>   of RCU's tracepoints are defined as empty macros.  It would
>   be better if these tracepoints could instead be empty inline
>   functions with proper arguments and type checking.  It would
>   also be good to get rid of the RCU_TRACE() macro, which
>   compiles its argument in CONFIG_RCU_TRACE=y kernels and
>   omits them otherwise.
> 
>   This commit therefore creates a TRACE_EVENT_RCU macro that
>   is defined as TRACE_EVENT in CONFIG_RCU_TRACE=y kernels and
>   as the new TRACE_EVENT_NOP otherwise, which allows the
>   empty macros and the RCU_TRACE() macro to be eliminated.
> 
> With that:
> 
> Reviewed-by: Paul E. McKenney 

Yafang,

If you are OK with the above changes, I'll take this patch with the
updated change log.

-- Steve


Re: [PATCH 8/8] vfio/mdev: Improve the create/remove sequence

2019-03-26 Thread Alex Williamson
On Tue, 26 Mar 2019 12:36:22 +0530
Kirti Wankhede  wrote:

> On 3/23/2019 4:50 AM, Parav Pandit wrote:
> > There are five problems with current code structure.
> > 1. mdev device is placed on the mdev bus before it is created in the
> > vendor driver. Once a device is placed on the mdev bus without creating
> > its supporting underlying vendor device, an open() can get triggered by
> > userspace on partially initialized device.
> > The ladder diagram below highlights it.
> > 
> >   cpu-0   cpu-1
> >   -   -
> >create_store()
> >  mdev_create_device()
> >device_register()
> >   ...
> >  vfio_mdev_probe()
> >  ...creates char device
> > vfio_mdev_open()
> >   parent->ops->open(mdev)
> > vfio_ap_mdev_open()
> >   matrix_mdev = NULL
> > [...]
> > parent->ops->create()
> >   vfio_ap_mdev_create()
> > mdev_set_drvdata(mdev, matrix_mdev);
> > /* Valid pointer set above */
> >   
> 
> VFIO interface uses sysfs path of device or PCI device's BDF where it
> checks sysfs file for that device exist.
> In case of VFIO mdev device, above situation will never happen as open
> will only get called if sysfs entry for that device exist.
> 
> If you don't use VFIO interface then this situation can arise. In that
> case probe() can be used for very basic initialization then create
> actual char device from create().
> 
> 
> > 2. Current creation sequence is,
> >parent->ops_create()
> >groups_register()
> > 
> > Remove sequence is,
> >parent->ops->remove()
> >groups_unregister()
> > However, remove sequence should be exact mirror of creation sequence.
> > Once this is achieved, all users of the mdev will be terminated first
> > before removing underlying vendor device.
> > (Follow standard linux driver model).
> > At that point vendor's remove() ops shouldn't fail because the device is
> > taken off the bus, which should terminate the users.
> >   
> 
> If VMM or user space application is using mdev device,
> parent->ops->remove() can return failure. In that case sysfs files
> shouldn't be removed. Hence above sequence is followed for remove.
> 
> Standard linux driver model doesn't allow remove() to fail, but in the
> case of the mdev framework, the interface is defined to handle such an error case.
> 
> 
> > 3. Additionally any new mdev driver that wants to work on mdev device
> > during probe() routine registered using mdev_register_driver() needs to
> > get stable mdev structure.
> >   
> 
> Things that you are trying to handle with mdev structure from probe(),
> couldn't that be moved to create()?
> 
> 
> > 4. In following sequence, child devices created while removing mdev parent
> > device can be left out, or it may lead to race of removing half
> > initialized child mdev devices.
> > 
> > issue-1:
> > 
> >cpu-0 cpu-1
> >- -
> >   mdev_unregister_device()
> >  device_for_each_child()
> > mdev_device_remove_cb()
> > mdev_device_remove()
> > create_store()
> >   mdev_device_create()   [...]
> >device_register()
> >   parent_remove_sysfs_files()
> >   /* BUG: device added by cpu-0
> >* whose parent is getting removed.
> >*/
> > 
> > issue-2:
> > 
> >cpu-0 cpu-1
> >- -
> > create_store()
> >   mdev_device_create()   [...]
> >device_register()
> > 
> >[...]  mdev_unregister_device()
> >  device_for_each_child()
> > mdev_device_remove_cb()
> > mdev_device_remove()
> > 
> >mdev_create_sysfs_files()
> >/* BUG: create is adding
> > * sysfs files for a device
> > * which is undergoing removal.
> > */
> >  parent_remove_sysfs_files()
> > 
> > 5. Below crash is observed when user initiated remove is in progress
> > and mdev_unregister_driver() completes parent unregistration.
> > 
> >cpu-0 cpu-1
> >- -
> > remove_store()
> >mdev_device_remove()
> >active = false;
> >   mdev_unregister_device()
> > remove type
> >[...]
> >mdev_remove_ops() crashes.
> > 

Re: [PATCH 2/3] genirq/timings: Add array suffix computation code

2019-03-26 Thread Daniel Lezcano


Hi Thomas,

thanks for reviewing this patch.

[ ... ]

>> +
>> +/*
>> + * Exponential moving average computation
>> + */
>> +static int irq_timings_ema_new(s64 value, s64 ema_old)
> 
> There is a mixed bag of s64/u64 all over this code. Please stay
> consistent. We had enough sign confusion bugs in the past.

Right.

I have a question, ema_old and value will be always u64 type and the
function irq_timings_ema_new() will return an u64 ...

>   value = (value - ema_old) * EMA_ALPHA_VAL;
>   return ema_old + value >> EMA_ALPHA_SHIFT;

... how can I deal with the operations above when value < ema_old ?

Shall I use an intermediate s64 ?

eg:

s64 aux = (value - ema_old) * EMA_ALPHA_VAL;
return ema_old + aux >> EMA_ALPHA_SHIFT;
?

[ ... ]

 > Other than that this looks good to me. Nice work!

Thanks, I appreciate.

-- 
  Linaro.org │ Open source software for ARM SoCs

Follow Linaro:   Facebook |
 Twitter |
 Blog



[PATCH v4 1/2] staging: mt7621-mmc: Remove obsolete Kconfig flags

2019-03-26 Thread George Hilliard
These values are not referred to anywhere else in the kernel. Card
detect is controlled by the device tree property "mediatek,cd-poll",
and there is no driver support for eMMC whatsoever.

Signed-off-by: George Hilliard 
---
v2: Rewrite of v1
v3: [Not present]
v4: Resubmit of v2

 drivers/staging/mt7621-mmc/Kconfig | 8 
 1 file changed, 8 deletions(-)

diff --git a/drivers/staging/mt7621-mmc/Kconfig 
b/drivers/staging/mt7621-mmc/Kconfig
index 1eb79cd6e22f..01f231dd8511 100644
--- a/drivers/staging/mt7621-mmc/Kconfig
+++ b/drivers/staging/mt7621-mmc/Kconfig
@@ -6,11 +6,3 @@ config MTK_AEE_KDUMP
bool "MTK AEE KDUMP"
depends on MTK_MMC
 
-config MTK_MMC_CD_POLL
-   bool "Card Detect with Polling"
-   depends on MTK_MMC
-
-config MTK_MMC_EMMC_8BIT
-   bool "eMMC 8-bit support"
-   depends on MTK_MMC && RALINK_MT7628
-
-- 
2.21.0



Re: [PATCH 8/8] vfio/mdev: Improve the create/remove sequence

2019-03-26 Thread Alex Williamson
On Tue, 26 Mar 2019 05:53:22 +
Parav Pandit  wrote:

> > -Original Message-
> > From: linux-kernel-ow...@vger.kernel.org  > ow...@vger.kernel.org> On Behalf Of Parav Pandit  
> > Sent: Monday, March 25, 2019 10:19 PM
> > To: Alex Williamson 
> > Cc: k...@vger.kernel.org; linux-kernel@vger.kernel.org;
> > kwankh...@nvidia.com
> > Subject: RE: [PATCH 8/8] vfio/mdev: Improve the create/remove sequence
> > 
> > 
> >   
> > > -Original Message-
> > > From: Alex Williamson 
> > > Sent: Monday, March 25, 2019 9:17 PM
> > > To: Parav Pandit 
> > > Cc: k...@vger.kernel.org; linux-kernel@vger.kernel.org;
> > > kwankh...@nvidia.com
> > > Subject: Re: [PATCH 8/8] vfio/mdev: Improve the create/remove sequence
> > >
> > > On Tue, 26 Mar 2019 01:43:44 +
> > > Parav Pandit  wrote:
> > >  
> > > > > -Original Message-
> > > > > From: Alex Williamson   
> 
> > > > > I mean the callback iterator on the parent remove can do a WARN_ON
> > > > > if this returns an error while the device remove path can silently
> > > > > return -EBUSY, the common function doesn't need to decide whether
> > > > > the parent ops remove function deserves a dev_err.
> > > > >  
> > > > Ok. I understood.
> > > > But device remove returning silent -EBUSY looks an error that should
> > > > get logged in, because this is something not expected. Its probably
> > > > late for sysfs layer to report an error by that time it
> > > > prints device name, because put_device() is done. So if remove()
> > > > returns an error, I think its legitimate failure to do WARN_ON or  
> > dev_err().  
> > >
> > > Calling put_device() if the parent remove op fails looks like a bug
> > > introduced by this series, the current code allows that failure
> > > leaving the device in a coherent state and returning errno to the sysfs  
> > store function.  
> > >  
> > Why should it fail?
> > We are taking off the device bus first as describe in commit log.
> > This ensures that everything is closed before calling the remove().
> > We cannot avoid put_device() and put_parent, it all buggy path...  
> 
> I audited remove() callbacks of kvmgt.c, vfio_ccw_ops.c,
> vfio_ap_ops.c, mbochs.c, mdpy.c, mtty.c, who makes the remove
> possible once the device release is executed. This should complete
> once the device is taken off the bus. This was not the case before
> this sequence where remove() is done while device is open...hence the
> check was needed in past. dev_err() is to help catch any errors/bugs
> in this area.
> 
> I doubt we need to retry remove() like vfio_del_group_dev(), in
> mdev_core if release() is not yet complete.

I'm ok with this, I've always thought the 'force' semantics and
allowing remove to fail were not terribly inline with other drivers,
even if ultimately I wish drivers could nak a remove request to avoid
the ugliness of blocking.  But ultimately you'll need to come to an
agreement with Kirti, the drivers we have in-tree are not the complete
set of mdev drivers, but it also doesn't necessarily make sense to cater
to the lone out-of-tree driver either.  Thanks,

Alex


[PATCH v4 2/2] staging: mt7621-mmc: Initialize completions a single time during probe

2019-03-26 Thread George Hilliard
The module was initializing completions whenever it was going to wait on
them, and not when the completion was allocated.  This is incorrect
according to the completion docs:

Calling init_completion() on the same completion object twice is
most likely a bug [...]

Re-initialization is also unnecessary because the module never uses
complete_all().  Fix this by only ever initializing the completion a
single time, and log if the completions are not consumed as intended
(this is not a fatal problem, but should not go unnoticed).

Signed-off-by: George Hilliard 
---
v2: rewrite of v1
v3: Remove BUG_ON() calls
v4: Indent style fixup

 drivers/staging/mt7621-mmc/sd.c | 18 ++
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/drivers/staging/mt7621-mmc/sd.c b/drivers/staging/mt7621-mmc/sd.c
index e346167754bd..ed63bd3ba6cc 100644
--- a/drivers/staging/mt7621-mmc/sd.c
+++ b/drivers/staging/mt7621-mmc/sd.c
@@ -466,7 +466,11 @@ static unsigned int msdc_command_start(struct msdc_host   
*host,
host->cmd = cmd;
host->cmd_rsp = resp;
 
-   init_completion(>cmd_done);
+   // The completion should have been consumed by the previous command
+   // response handler, because the mmc requests should be serialized
+   if(completion_done(>cmd_done))
+   dev_err(mmc_dev(host->mmc),
+   "previous command was not handled\n");
 
sdr_set_bits(host->base + MSDC_INTEN, wints);
sdc_send_cmd(rawcmd, cmd->arg);
@@ -488,7 +492,6 @@ static unsigned int msdc_command_resp(struct msdc_host   
*host,
MSDC_INT_ACMD19_DONE;
 
BUG_ON(in_interrupt());
-   //init_completion(>cmd_done);
//sdr_set_bits(host->base + MSDC_INTEN, wints);
 
spin_unlock(>lock);
@@ -670,7 +673,13 @@ static int msdc_do_request(struct mmc_host *mmc, struct 
mmc_request *mrq)
//msdc_clr_fifo(host);  /* no need */
 
msdc_dma_on();  /* enable DMA mode first!! */
-   init_completion(>xfer_done);
+
+   // The completion should have been consumed by the previous
+   // xfer response handler, because the mmc requests should be
+   // serialized
+   if(completion_done(>cmd_done))
+   dev_err(mmc_dev(host->mmc),
+   "previous transfer was not handled\n");
 
/* start the command first*/
if (msdc_command_start(host, cmd, CMD_TIMEOUT) != 0)
@@ -696,7 +705,6 @@ static int msdc_do_request(struct mmc_host *mmc, struct 
mmc_request *mrq)
/* for read, the data coming too fast, then CRC error
 *  start DMA no business with CRC.
 */
-   //init_completion(>xfer_done);
msdc_dma_start(host);
 
spin_unlock(>lock);
@@ -1687,6 +1695,8 @@ static int msdc_drv_probe(struct platform_device *pdev)
}
msdc_init_gpd_bd(host, >dma);
 
+   init_completion(>cmd_done);
+   init_completion(>xfer_done);
INIT_DELAYED_WORK(>card_delaywork, msdc_tasklet_card);
spin_lock_init(>lock);
msdc_init_hw(host);
-- 
2.21.0



Re: [PATCH 5/5] lib/vsprintf: Add %pfw conversion specifier for printing fwnode names

2019-03-26 Thread Petr Mladek
On Tue 2019-03-26 15:55:57, Andy Shevchenko wrote:
> On Tue, Mar 26, 2019 at 03:39:47PM +0200, Sakari Ailus wrote:
> > On Tue, Mar 26, 2019 at 03:13:53PM +0200, Andy Shevchenko wrote:
> > > On Sun, Mar 24, 2019 at 08:17:46PM +0200, Sakari Ailus wrote:
> 
> > > The patch series by Petr I mentioned takes care about OF case. But it 
> > > doesn't
> > > have covered yours by obvious reasons.
> > 
> > Do you happen to have a pointer to it?
> 
> Petr, can you share what is the state of affairs with that series?

I might send a new version this week but I do not promise it.
It is regularly pushed aside by more urgent work.

But we could work on both changes independently. Both can be
updated easily depending on which one gets accepted earlier.

Best Regards,
Petr


[PATCH v4 0/2] staging: mt7621-mmc: correctness fixes

2019-03-26 Thread George Hilliard
Coding style fixup and rebase of v3, and resubmit of the Kconfig patch
that got dropped from v2.  No other changes.

Thanks for your continued attention and reviews!

George




Re:Re: [PATCH v2] mtd: spi-nor: Return error when nor->addr_width does not match the device size

2019-03-26 Thread Liu Xiang



Hi, Vignesh






At 2019-03-19 13:22:15, "Vignesh Raghavendra"  wrote:
>Hi,
>
>On 13/03/19 7:15 PM, Liu Xiang wrote:
>> In some is25lp256, the DWORD1 of JEDEC Basic Flash Parameter Header
>> is 0xfff920e5. So the DWORD1[18:17] Address Bytes bits are 0b00,
>> means that 3-Byte only addressing. But the device size is larger
>> than 16MB, nor->addr_width must be 4 to access the whole address.
>> An error should be returned when nor->addr_width does not match
>> the device size in spi_nor_parse_bfpt(). Then it can go back to
>> use spi_nor_ids[] for setting the right addr_width.
>> 
>> Suggested-by: Boris Brezillon 
>> Signed-off-by: Liu Xiang 
>> ---
>>  drivers/mtd/spi-nor/spi-nor.c | 8 
>>  1 file changed, 8 insertions(+)
>> 
>> diff --git a/drivers/mtd/spi-nor/spi-nor.c b/drivers/mtd/spi-nor/spi-nor.c
>> index 6e13bbd..63933c7 100644
>> --- a/drivers/mtd/spi-nor/spi-nor.c
>> +++ b/drivers/mtd/spi-nor/spi-nor.c
>> @@ -2811,6 +2811,14 @@ static int spi_nor_parse_bfpt(struct spi_nor *nor,
>>  }
>>  params->size >>= 3; /* Convert to bytes. */
>>  
>> +/*
>> + * If the device exceeds 16MiB, addr_width must be 4.
>> + * addr_width == 3 means the Address Bytes we are
>> + * reading from BFPT is wrong.
>> + */
>
>JESD216 standard does not mandate flash devices >16MiB to always support
>4 byte addressing opcode. So, its okay for flash vendor to support
>>16MiB flash with 3 byte addressing and Bank/extended address register.
>
>> +if (params->size > 0x100 && nor->addr_width == 3)
>> +return -EINVAL;
>> +
>
>Assuming only DWORD1[18:17] bits are wrong, then returning from here
>would mean we miss parsing Sector Erase settings, Quad Enable
>Requirements etc from BFPT which is kind of bad.
>I suggest to move the fix to[1], addr_width indicated in flash_info
>struct of the device can take precedence over SFDP.
>
>[1]https://elixir.bootlin.com/linux/latest/source/drivers/mtd/spi-nor/spi-nor.c#L4106

Boris has added a fixup function, do you think this is better:

static int
is25lp256_post_bfpt_fixups(struct spi_nor *nor,
const struct sfdp_parameter_header *bfpt_header,
const struct sfdp_bfpt *bfpt,
struct spi_nor_flash_parameter *params)
{
/*
 * IS25LP256 supports 4B opcodes.
 * Unfortunately, some devices get BFPT_DWORD1_ADDRESS_BYTES_3_ONLY  
 * from BFPT table for address width. We should fix it.
 */
if (bfpt.dwords[BFPT_DWORD(1)] & BFPT_DWORD1_ADDRESS_BYTES_MASK == 
BFPT_DWORD1_ADDRESS_BYTES_3_ONLY)
nor->addr_width = 4;

return 0;
}

/* Fixup table for IS25LP256: installs is25lp256_post_bfpt_fixups as the post_bfpt hook. */
static struct spi_nor_fixups is25lp256_fixups = {
.post_bfpt = is25lp256_post_bfpt_fixups,
};


>
>
>>  /* Fast Read settings. */
>>  for (i = 0; i < ARRAY_SIZE(sfdp_bfpt_reads); i++) {
>>  const struct sfdp_bfpt_read *rd = _bfpt_reads[i];
>> 
>
>-- 
>Regards
>Vignesh


[PATCH v7 4/4] perf/smmuv3: Enable HiSilicon Erratum 162001800 quirk

2019-03-26 Thread Shameer Kolothum
HiSilicon erratum 162001800 describes the limitation of
SMMUv3 PMCG implementation on HiSilicon Hip08 platforms.

On these platforms, the PMCG event counter registers
(SMMU_PMCG_EVCNTRn) are read only and as a result it
is not possible to set the initial counter period value
on event monitor start.

To work around this, the current value of the counter
is read and used for delta calculations. OEM information
from ACPI header is used to identify the affected hardware
platforms.

Signed-off-by: Shameer Kolothum 
Reviewed-by: Hanjun Guo 
Reviewed-by: Robin Murphy 
---
 drivers/acpi/arm64/iort.c | 16 ++-
 drivers/perf/arm_smmuv3_pmu.c | 48 ---
 include/linux/acpi_iort.h |  1 +
 3 files changed, 57 insertions(+), 8 deletions(-)

diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c
index e2c9b26..4dc68de 100644
--- a/drivers/acpi/arm64/iort.c
+++ b/drivers/acpi/arm64/iort.c
@@ -1366,9 +1366,23 @@ static void __init 
arm_smmu_v3_pmcg_init_resources(struct resource *res,
   ACPI_EDGE_SENSITIVE, [2]);
 }
 
+static struct acpi_platform_list pmcg_plat_info[] __initdata = {
+   /* HiSilicon Hip08 Platform */
+   {"HISI  ", "HIP08   ", 0, ACPI_SIG_IORT, greater_than_or_equal, 0,
+IORT_SMMU_V3_PMCG_HISI_HIP08},
+   { }
+};
+
 static int __init arm_smmu_v3_pmcg_add_platdata(struct platform_device *pdev)
 {
-   u32 model = IORT_SMMU_V3_PMCG_GENERIC;
+   u32 model;
+   int idx;
+
+   idx = acpi_match_platform_list(pmcg_plat_info);
+   if (idx >= 0)
+   model = pmcg_plat_info[idx].data;
+   else
+   model = IORT_SMMU_V3_PMCG_GENERIC;
 
return platform_device_add_data(pdev, , sizeof(model));
 }
diff --git a/drivers/perf/arm_smmuv3_pmu.c b/drivers/perf/arm_smmuv3_pmu.c
index 7803e9e..6b3c0ed 100644
--- a/drivers/perf/arm_smmuv3_pmu.c
+++ b/drivers/perf/arm_smmuv3_pmu.c
@@ -35,6 +35,7 @@
  */
 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -93,6 +94,8 @@
 
 #define SMMU_PMCG_PA_SHIFT  12
 
+#define SMMU_PMCG_EVCNTR_RDONLY BIT(0)
+
 static int cpuhp_state_num;
 
 struct smmu_pmu {
@@ -108,6 +111,7 @@ struct smmu_pmu {
void __iomem *reg_base;
void __iomem *reloc_base;
u64 counter_mask;
+   u32 options;
bool global_filter;
u32 global_filter_span;
u32 global_filter_sid;
@@ -222,15 +226,27 @@ static void smmu_pmu_set_period(struct smmu_pmu *smmu_pmu,
u32 idx = hwc->idx;
u64 new;
 
-   /*
-* We limit the max period to half the max counter value of the counter
-* size, so that even in the case of extreme interrupt latency the
-* counter will (hopefully) not wrap past its initial value.
-*/
-   new = smmu_pmu->counter_mask >> 1;
+   if (smmu_pmu->options & SMMU_PMCG_EVCNTR_RDONLY) {
+   /*
+* On platforms that require this quirk, if the counter starts
+* at < half_counter value and wraps, the current logic of
+* handling the overflow may not work. It is expected that,
+* those platforms will have full 64 counter bits implemented
+* so that such a possibility is remote(eg: HiSilicon HIP08).
+*/
+   new = smmu_pmu_counter_get_value(smmu_pmu, idx);
+   } else {
+   /*
+* We limit the max period to half the max counter value
+* of the counter size, so that even in the case of extreme
+* interrupt latency the counter will (hopefully) not wrap
+* past its initial value.
+*/
+   new = smmu_pmu->counter_mask >> 1;
+   smmu_pmu_counter_set_value(smmu_pmu, idx, new);
+   }
 
local64_set(>prev_count, new);
-   smmu_pmu_counter_set_value(smmu_pmu, idx, new);
 }
 
 static void smmu_pmu_set_event_filter(struct perf_event *event,
@@ -669,6 +685,22 @@ static void smmu_pmu_reset(struct smmu_pmu *smmu_pmu)
   smmu_pmu->reloc_base + SMMU_PMCG_OVSCLR0);
 }
 
+static void smmu_pmu_get_acpi_options(struct smmu_pmu *smmu_pmu)
+{
+   u32 model;
+
+   model = *(u32 *)dev_get_platdata(smmu_pmu->dev);
+
+   switch (model) {
+   case IORT_SMMU_V3_PMCG_HISI_HIP08:
+   /* HiSilicon Erratum 162001800 */
+   smmu_pmu->options |= SMMU_PMCG_EVCNTR_RDONLY;
+   break;
+   }
+
+   dev_notice(smmu_pmu->dev, "option mask 0x%x\n", smmu_pmu->options);
+}
+
 static int smmu_pmu_probe(struct platform_device *pdev)
 {
struct smmu_pmu *smmu_pmu;
@@ -748,6 +780,8 @@ static int smmu_pmu_probe(struct platform_device *pdev)
return -EINVAL;
}
 
+   smmu_pmu_get_acpi_options(smmu_pmu);
+
/* Pick one CPU to be the preferred one to use */
smmu_pmu->on_cpu = raw_smp_processor_id();
 

Re: [PATCH 5.0 00/52] 5.0.5-stable review

2019-03-26 Thread Jon Hunter


On 26/03/2019 06:29, Greg Kroah-Hartman wrote:
> This is the start of the stable review cycle for the 5.0.5 release.
> There are 52 patches in this series, all will be posted as a response
> to this one.  If anyone has any issues with these being applied, please
> let me know.
> 
> Responses should be made by Thu Mar 28 04:26:38 UTC 2019.
> Anything received after that time might be too late.
> 
> The whole patch series can be found in one patch at:
>   
> https://www.kernel.org/pub/linux/kernel/v5.x/stable-review/patch-5.0.5-rc1.gz
> or in the git tree and branch at:
>   
> git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git 
> linux-5.0.y
> and the diffstat can be found below.
> 
> thanks,
> 
> greg k-h

All tests are passing for Tegra ...

Test results for stable-v5.0:
11 builds:  11 pass, 0 fail
22 boots:   22 pass, 0 fail
28 tests:   28 pass, 0 fail

Linux version:  5.0.5-rc1-gebcb1bb
Boards tested:  tegra124-jetson-tk1, tegra186-p2771-0000,
tegra194-p2972-0000, tegra20-ventana,
tegra210-p2371-2180, tegra30-cardhu-a04

Cheers
Jon

-- 
nvpublic


[PATCH v7 3/4] perf/smmuv3: Add MSI irq support

2019-03-26 Thread Shameer Kolothum
This adds support for MSI-based counter overflow interrupt.

Signed-off-by: Shameer Kolothum 
Reviewed-by: Robin Murphy 
---
 drivers/perf/arm_smmuv3_pmu.c | 58 +++
 1 file changed, 58 insertions(+)

diff --git a/drivers/perf/arm_smmuv3_pmu.c b/drivers/perf/arm_smmuv3_pmu.c
index c0924e5..7803e9e 100644
--- a/drivers/perf/arm_smmuv3_pmu.c
+++ b/drivers/perf/arm_smmuv3_pmu.c
@@ -67,6 +67,7 @@
 #define SMMU_PMCG_OVSSET0   0xCC0
 #define SMMU_PMCG_CFGR  0xE00
 #define SMMU_PMCG_CFGR_SID_FILTER_TYPE  BIT(23)
+#define SMMU_PMCG_CFGR_MSI  BIT(21)
 #define SMMU_PMCG_CFGR_RELOC_CTRS   BIT(20)
 #define SMMU_PMCG_CFGR_SIZE GENMASK(13, 8)
 #define SMMU_PMCG_CFGR_NCTR GENMASK(5, 0)
@@ -77,6 +78,12 @@
 #define SMMU_PMCG_IRQ_CTRL  0xE50
 #define SMMU_PMCG_IRQ_CTRL_IRQENBIT(0)
 #define SMMU_PMCG_IRQ_CFG0  0xE58
+#define SMMU_PMCG_IRQ_CFG1  0xE60
+#define SMMU_PMCG_IRQ_CFG2  0xE64
+
+/* MSI config fields */
+#define MSI_CFG0_ADDR_MASK  GENMASK_ULL(51, 2)
+#define MSI_CFG2_MEMATTR_DEVICE_nGnRE   0x1
 
 #define SMMU_PMCG_DEFAULT_FILTER_SPAN   1
 #define SMMU_PMCG_DEFAULT_FILTER_SIDGENMASK(31, 0)
@@ -584,11 +591,62 @@ static irqreturn_t smmu_pmu_handle_irq(int irq_num, void 
*data)
return IRQ_HANDLED;
 }
 
+static void smmu_pmu_free_msis(void *data)
+{
+   struct device *dev = data;
+
+   platform_msi_domain_free_irqs(dev);
+}
+
+static void smmu_pmu_write_msi_msg(struct msi_desc *desc, struct msi_msg *msg)
+{
+   phys_addr_t doorbell;
+   struct device *dev = msi_desc_to_dev(desc);
+   struct smmu_pmu *pmu = dev_get_drvdata(dev);
+
+   doorbell = (((u64)msg->address_hi) << 32) | msg->address_lo;
+   doorbell &= MSI_CFG0_ADDR_MASK;
+
+   writeq_relaxed(doorbell, pmu->reg_base + SMMU_PMCG_IRQ_CFG0);
+   writel_relaxed(msg->data, pmu->reg_base + SMMU_PMCG_IRQ_CFG1);
+   writel_relaxed(MSI_CFG2_MEMATTR_DEVICE_nGnRE,
+  pmu->reg_base + SMMU_PMCG_IRQ_CFG2);
+}
+
+static void smmu_pmu_setup_msi(struct smmu_pmu *pmu)
+{
+   struct msi_desc *desc;
+   struct device *dev = pmu->dev;
+   int ret;
+
+   /* Clear MSI address reg */
+   writeq_relaxed(0, pmu->reg_base + SMMU_PMCG_IRQ_CFG0);
+
+   /* MSI supported or not */
+   if (!(readl(pmu->reg_base + SMMU_PMCG_CFGR) & SMMU_PMCG_CFGR_MSI))
+   return;
+
+   ret = platform_msi_domain_alloc_irqs(dev, 1, smmu_pmu_write_msi_msg);
+   if (ret) {
+   dev_warn(dev, "failed to allocate MSIs\n");
+   return;
+   }
+
+   desc = first_msi_entry(dev);
+   if (desc)
+   pmu->irq = desc->irq;
+
+   /* Add callback to free MSIs on teardown */
+   devm_add_action(dev, smmu_pmu_free_msis, dev);
+}
+
 static int smmu_pmu_setup_irq(struct smmu_pmu *pmu)
 {
unsigned long flags = IRQF_NOBALANCING | IRQF_SHARED | IRQF_NO_THREAD;
int irq, ret = -ENXIO;
 
+   smmu_pmu_setup_msi(pmu);
+
irq = pmu->irq;
if (irq)
ret = devm_request_irq(pmu->dev, irq, smmu_pmu_handle_irq,
-- 
2.7.4




[PATCH v7 2/4] perf/smmuv3: Add arm64 smmuv3 pmu driver

2019-03-26 Thread Shameer Kolothum
From: Neil Leeder 

Adds a new driver to support the SMMUv3 PMU and add it into the
perf events framework.

Each SMMU node may have multiple PMUs associated with it, each of
which may support different events.

SMMUv3 PMCG devices are named as smmuv3_pmcg_<phys_addr_page> where
<phys_addr_page> is the physical page address of the SMMU PMCG
wrapped to 4K boundary. For example, the PMCG at 0xff88840000 is
named smmuv3_pmcg_ff88840

Filtering by stream id is done by specifying filtering parameters
with the event. options are:
   filter_enable- 0 = no filtering, 1 = filtering enabled
   filter_span  - 0 = exact match, 1 = pattern match
   filter_stream_id - pattern to filter against

Example: perf stat -e smmuv3_pmcg_ff88840/transaction,filter_enable=1,
   filter_span=1,filter_stream_id=0x42/ -a netperf

Applies filter pattern 0x42 to transaction events, which means events
matching stream ids 0x42 & 0x43 are counted as only upper StreamID
bits are required to match the given filter. Further filtering
information is available in the SMMU documentation.

SMMU events are not attributable to a CPU, so task mode and sampling
are not supported.

Signed-off-by: Neil Leeder 
Signed-off-by: Shameer Kolothum 
Reviewed-by: Robin Murphy 
---
 drivers/perf/Kconfig  |   9 +
 drivers/perf/Makefile |   1 +
 drivers/perf/arm_smmuv3_pmu.c | 776 ++
 3 files changed, 786 insertions(+)
 create mode 100644 drivers/perf/arm_smmuv3_pmu.c

diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig
index af9bc17..6a472fc 100644
--- a/drivers/perf/Kconfig
+++ b/drivers/perf/Kconfig
@@ -52,6 +52,15 @@ config ARM_PMU_ACPI
depends on ARM_PMU && ACPI
def_bool y
 
+config ARM_SMMU_V3_PMU
+bool "ARM SMMUv3 Performance Monitors Extension"
+depends on ARM64 && ACPI && ARM_SMMU_V3
+  help
+  Provides support for the SMMU version 3 performance monitor unit 
(PMU)
+  on ARM-based systems.
+  Adds the SMMU PMU into the perf events subsystem for
+  monitoring SMMU performance events.
+
 config ARM_DSU_PMU
tristate "ARM DynamIQ Shared Unit (DSU) PMU"
depends on ARM64
diff --git a/drivers/perf/Makefile b/drivers/perf/Makefile
index 909f27f..3048994 100644
--- a/drivers/perf/Makefile
+++ b/drivers/perf/Makefile
@@ -4,6 +4,7 @@ obj-$(CONFIG_ARM_CCN) += arm-ccn.o
 obj-$(CONFIG_ARM_DSU_PMU) += arm_dsu_pmu.o
 obj-$(CONFIG_ARM_PMU) += arm_pmu.o arm_pmu_platform.o
 obj-$(CONFIG_ARM_PMU_ACPI) += arm_pmu_acpi.o
+obj-$(CONFIG_ARM_SMMU_V3_PMU) += arm_smmuv3_pmu.o
 obj-$(CONFIG_HISI_PMU) += hisilicon/
 obj-$(CONFIG_QCOM_L2_PMU)  += qcom_l2_pmu.o
 obj-$(CONFIG_QCOM_L3_PMU) += qcom_l3_pmu.o
diff --git a/drivers/perf/arm_smmuv3_pmu.c b/drivers/perf/arm_smmuv3_pmu.c
new file mode 100644
index 000..c0924e5
--- /dev/null
+++ b/drivers/perf/arm_smmuv3_pmu.c
@@ -0,0 +1,776 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * This driver adds support for perf events to use the Performance
+ * Monitor Counter Groups (PMCG) associated with an SMMUv3 node
+ * to monitor that node.
+ *
+ * SMMUv3 PMCG devices are named as smmuv3_pmcg_<phys_addr_page> where
+ * <phys_addr_page> is the physical page address of the SMMU PMCG wrapped
+ * to 4K boundary. For example, the PMCG at 0xff88840000 is named
+ * smmuv3_pmcg_ff88840
+ *
+ * Filtering by stream id is done by specifying filtering parameters
+ * with the event. options are:
+ *   filter_enable- 0 = no filtering, 1 = filtering enabled
+ *   filter_span  - 0 = exact match, 1 = pattern match
+ *   filter_stream_id - pattern to filter against
+ *
+ * To match a partial StreamID where the X most-significant bits must match
+ * but the Y least-significant bits might differ, STREAMID is programmed
+ * with a value that contains:
+ *  STREAMID[Y - 1] == 0.
+ *  STREAMID[Y - 2:0] == 1 (where Y > 1).
+ * The remainder of implemented bits of STREAMID (X bits, from bit Y upwards)
+ * contain a value to match from the corresponding bits of event StreamID.
+ *
+ * Example: perf stat -e smmuv3_pmcg_ff88840/transaction,filter_enable=1,
+ *filter_span=1,filter_stream_id=0x42/ -a netperf
+ * Applies filter pattern 0x42 to transaction events, which means events
+ * matching stream ids 0x42 and 0x43 are counted. Further filtering
+ * information is available in the SMMU documentation.
+ *
+ * SMMU events are not attributable to a CPU, so task mode and sampling
+ * are not supported.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define SMMU_PMCG_EVCNTR0   0x0
+#define SMMU_PMCG_EVCNTR(n, stride) (SMMU_PMCG_EVCNTR0 + (n) * (stride))
+#define SMMU_PMCG_EVTYPER0  0x400
+#define SMMU_PMCG_EVTYPER(n)(SMMU_PMCG_EVTYPER0 + (n) * 4)
+#define SMMU_PMCG_SID_SPAN_SHIFT29
+#define SMMU_PMCG_SMR0  0xA00

[PATCH v7 1/4] ACPI/IORT: Add support for PMCG

2019-03-26 Thread Shameer Kolothum
From: Neil Leeder 

Add support for the SMMU Performance Monitor Counter Group
information from ACPI. This is in preparation for its use
in the SMMUv3 PMU driver.

Signed-off-by: Neil Leeder 
Signed-off-by: Hanjun Guo 
Signed-off-by: Shameer Kolothum 
Reviewed-by: Robin Murphy 
Acked-by: Lorenzo Pieralisi 
---
 drivers/acpi/arm64/iort.c | 117 --
 include/linux/acpi_iort.h |   7 +++
 2 files changed, 100 insertions(+), 24 deletions(-)

diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c
index e48894e..e2c9b26 100644
--- a/drivers/acpi/arm64/iort.c
+++ b/drivers/acpi/arm64/iort.c
@@ -356,7 +356,8 @@ static struct acpi_iort_node *iort_node_get_id(struct 
acpi_iort_node *node,
if (map->flags & ACPI_IORT_ID_SINGLE_MAPPING) {
if (node->type == ACPI_IORT_NODE_NAMED_COMPONENT ||
node->type == ACPI_IORT_NODE_PCI_ROOT_COMPLEX ||
-   node->type == ACPI_IORT_NODE_SMMU_V3) {
+   node->type == ACPI_IORT_NODE_SMMU_V3 ||
+   node->type == ACPI_IORT_NODE_PMCG) {
*id_out = map->output_base;
return parent;
}
@@ -394,6 +395,8 @@ static int iort_get_id_mapping_index(struct acpi_iort_node 
*node)
}
 
return smmu->id_mapping_index;
+   case ACPI_IORT_NODE_PMCG:
+   return 0;
default:
return -EINVAL;
}
@@ -1218,14 +1221,23 @@ static void __init arm_smmu_v3_init_resources(struct 
resource *res,
}
 }
 
-static bool __init arm_smmu_v3_is_coherent(struct acpi_iort_node *node)
+static void __init arm_smmu_v3_dma_configure(struct device *dev,
+struct acpi_iort_node *node)
 {
struct acpi_iort_smmu_v3 *smmu;
+   enum dev_dma_attr attr;
 
/* Retrieve SMMUv3 specific data */
smmu = (struct acpi_iort_smmu_v3 *)node->node_data;
 
-   return smmu->flags & ACPI_IORT_SMMU_V3_COHACC_OVERRIDE;
+   attr = (smmu->flags & ACPI_IORT_SMMU_V3_COHACC_OVERRIDE) ?
+   DEV_DMA_COHERENT : DEV_DMA_NON_COHERENT;
+
+   /* We expect the dma masks to be equivalent for all SMMUv3 set-ups */
+   dev->dma_mask = >coherent_dma_mask;
+
+   /* Configure DMA for the page table walker */
+   acpi_dma_configure(dev, attr);
 }
 
 #if defined(CONFIG_ACPI_NUMA)
@@ -1301,30 +1313,82 @@ static void __init arm_smmu_init_resources(struct 
resource *res,
}
 }
 
-static bool __init arm_smmu_is_coherent(struct acpi_iort_node *node)
+static void __init arm_smmu_dma_configure(struct device *dev,
+ struct acpi_iort_node *node)
 {
struct acpi_iort_smmu *smmu;
+   enum dev_dma_attr attr;
 
/* Retrieve SMMU specific data */
smmu = (struct acpi_iort_smmu *)node->node_data;
 
-   return smmu->flags & ACPI_IORT_SMMU_COHERENT_WALK;
+   attr = (smmu->flags & ACPI_IORT_SMMU_COHERENT_WALK) ?
+   DEV_DMA_COHERENT : DEV_DMA_NON_COHERENT;
+
+   /* We expect the dma masks to be equivalent for SMMU set-ups */
+   dev->dma_mask = &dev->coherent_dma_mask;
+
+   /* Configure DMA for the page table walker */
+   acpi_dma_configure(dev, attr);
+}
+
+static int __init arm_smmu_v3_pmcg_count_resources(struct acpi_iort_node *node)
+{
+   struct acpi_iort_pmcg *pmcg;
+
+   /* Retrieve PMCG specific data */
+   pmcg = (struct acpi_iort_pmcg *)node->node_data;
+
+   /*
+* There are always 2 memory resources.
+* If the overflow_gsiv is present then add that for a total of 3.
+*/
+   return pmcg->overflow_gsiv ? 3 : 2;
+}
+
+static void __init arm_smmu_v3_pmcg_init_resources(struct resource *res,
+  struct acpi_iort_node *node)
+{
+   struct acpi_iort_pmcg *pmcg;
+
+   /* Retrieve PMCG specific data */
+   pmcg = (struct acpi_iort_pmcg *)node->node_data;
+
+   res[0].start = pmcg->page0_base_address;
+   res[0].end = pmcg->page0_base_address + SZ_4K - 1;
+   res[0].flags = IORESOURCE_MEM;
+   res[1].start = pmcg->page1_base_address;
+   res[1].end = pmcg->page1_base_address + SZ_4K - 1;
+   res[1].flags = IORESOURCE_MEM;
+
+   if (pmcg->overflow_gsiv)
+   acpi_iort_register_irq(pmcg->overflow_gsiv, "overflow",
+  ACPI_EDGE_SENSITIVE, &res[2]);
+}
+
+static int __init arm_smmu_v3_pmcg_add_platdata(struct platform_device *pdev)
+{
+   u32 model = IORT_SMMU_V3_PMCG_GENERIC;
+
+   return platform_device_add_data(pdev, &model, sizeof(model));
 }
 
 struct iort_dev_config {
const char *name;
int (*dev_init)(struct acpi_iort_node *node);
-   bool (*dev_is_coherent)(struct acpi_iort_node *node);
+   void (*dev_dma_configure)(struct device *dev,
+ struct 

Re: [PATCH 4.19 00/45] 4.19.32-stable review

2019-03-26 Thread Jon Hunter


On 26/03/2019 06:29, Greg Kroah-Hartman wrote:
> This is the start of the stable review cycle for the 4.19.32 release.
> There are 45 patches in this series, all will be posted as a response
> to this one.  If anyone has any issues with these being applied, please
> let me know.
> 
> Responses should be made by Thu Mar 28 04:26:41 UTC 2019.
> Anything received after that time might be too late.
> 
> The whole patch series can be found in one patch at:
>   
> https://www.kernel.org/pub/linux/kernel/v4.x/stable-review/patch-4.19.32-rc1.gz
> or in the git tree and branch at:
>   
> git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git 
> linux-4.19.y
> and the diffstat can be found below.
> 
> thanks,
> 
> greg k-h

All tests are passing for Tegra ...

Test results for stable-v4.19:
11 builds:  11 pass, 0 fail
22 boots:   22 pass, 0 fail
28 tests:   28 pass, 0 fail

Linux version:  4.19.32-rc1-g91ed293
Boards tested:  tegra124-jetson-tk1, tegra186-p2771-0000,
tegra194-p2972-0000, tegra20-ventana,
tegra210-p2371-2180, tegra30-cardhu-a04

Cheers
Jon

-- 
nvpublic


[PATCH v7 0/4] arm64 SMMUv3 PMU driver with IORT support

2019-03-26 Thread Shameer Kolothum
This adds a driver for the SMMUv3 PMU into the perf framework.
It includes an IORT update to support PM Counter Groups.

This is based on the initial work done by Neil Leeder[1]

SMMUv3 PMCG devices are named as smmuv3_pmcg_<phys_addr_page>
where <phys_addr_page> is the physical page address of the SMMU PMCG.
For example, the PMCG at 0xff88840000 is named smmuv3_pmcg_ff88840

Usage example:
For common arch supported events:
perf stat -e smmuv3_pmcg_ff88840/transaction,filter_enable=1,
 filter_span=1,filter_stream_id=0x42/ -a netperf

For IMP DEF events:
perf stat -e smmuv3_pmcg_ff88840/event=id/ -a netperf

This is sanity tested on a HiSilicon platform that requires
a quirk to run  it properly. As per HiSilicon erratum  #162001800,
PMCG event counter registers (SMMU_PMCG_EVCNTRn) on HiSilicon Hip08
platforms are read only and this prevents the software from setting
the initial period on event start. Unfortunately we were a bit late
in the cycle to detect this issue and now require software workaround
for this. Patch #4 is added to this series to provide a workaround
for this issue.

Further testing on supported platforms is very much welcome.

v6 --> v7
-Addressed comments from Robin and Lorenzo.
-Added R-by from Robin/Hanjun and A-by from Lorenzo.

v5 ---> v6
-Addressed comments from Robin and Andrew.
-Changed the way global filter settings are applied as a probable
 fix to the v5 bug where in-use settings get overwritten.
-Use of PMCG model number to identify the platform.
-Added R-by from Robin to patches #1 and #3.

v4 ---> v5
-IORT code is modified to pass the option/quirk flags to the driver
 through platform_data (patch #4), based on Robin's comments.
-Removed COMPILE_TEST (patch #2).

v3 --> v4

-Addressed comments from Jean and Robin.
-Merged dma config callbacks as per Lorenzo's comments(patch #1).
-Added handling of Global(Counter0) filter settings mode(patch #2).
-Added patch #4 to address HiSilicon erratum  #162001800
-
v2 --> v3

-Addressed comments from Robin.
-Removed iort helper function to retrieve the PMCG reference smmu.
-PMCG devices are now named using the base address

v1 --> v2

- Addressed comments from Robin.
- Added a helper to retrieve the associated smmu dev and named PMUs
  to make the association visible to user.
- Added MSI support  for overflow irq

[1]https://www.spinics.net/lists/arm-kernel/msg598591.html


Neil Leeder (2):
  ACPI/IORT: Add support for PMCG
  perf/smmuv3: Add arm64 smmuv3 pmu driver

Shameer Kolothum (2):
  perf/smmuv3: Add MSI irq support
  perf/smmuv3: Enable HiSilicon Erratum 162001800 quirk

 drivers/acpi/arm64/iort.c | 131 +--
 drivers/perf/Kconfig  |   9 +
 drivers/perf/Makefile |   1 +
 drivers/perf/arm_smmuv3_pmu.c | 868 ++
 include/linux/acpi_iort.h |   8 +
 5 files changed, 993 insertions(+), 24 deletions(-)
 create mode 100644 drivers/perf/arm_smmuv3_pmu.c

-- 
2.7.4




Re: [PATCH 4.14 00/41] 4.14.109-stable review

2019-03-26 Thread Jon Hunter


On 26/03/2019 06:29, Greg Kroah-Hartman wrote:
> This is the start of the stable review cycle for the 4.14.109 release.
> There are 41 patches in this series, all will be posted as a response
> to this one.  If anyone has any issues with these being applied, please
> let me know.
> 
> Responses should be made by Thu Mar 28 04:26:32 UTC 2019.
> Anything received after that time might be too late.
> 
> The whole patch series can be found in one patch at:
>   
> https://www.kernel.org/pub/linux/kernel/v4.x/stable-review/patch-4.14.109-rc1.gz
> or in the git tree and branch at:
>   
> git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git 
> linux-4.14.y
> and the diffstat can be found below.
> 
> thanks,
> 
> greg k-h

All tests are passing for Tegra ...

Test results for stable-v4.14:
8 builds:   8 pass, 0 fail
16 boots:   16 pass, 0 fail
20 tests:   20 pass, 0 fail

Linux version:  4.14.109-rc1-g4bb6d9c
Boards tested:  tegra124-jetson-tk1, tegra20-ventana,
tegra210-p2371-2180, tegra30-cardhu-a04

Cheers
Jon

-- 
nvpublic


Re: [PATCH 4.9 00/30] 4.9.166-stable review

2019-03-26 Thread Jon Hunter


On 26/03/2019 06:29, Greg Kroah-Hartman wrote:
> This is the start of the stable review cycle for the 4.9.166 release.
> There are 30 patches in this series, all will be posted as a response
> to this one.  If anyone has any issues with these being applied, please
> let me know.
> 
> Responses should be made by Thu Mar 28 04:25:51 UTC 2019.
> Anything received after that time might be too late.
> 
> The whole patch series can be found in one patch at:
>   
> https://www.kernel.org/pub/linux/kernel/v4.x/stable-review/patch-4.9.166-rc1.gz
> or in the git tree and branch at:
>   
> git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git 
> linux-4.9.y
> and the diffstat can be found below.
> 
> thanks,
> 
> greg k-h

All tests are passing for Tegra ...

Test results for stable-v4.9:
8 builds:   8 pass, 0 fail
16 boots:   16 pass, 0 fail
20 tests:   20 pass, 0 fail

Linux version:  4.9.166-rc1-gb7e63ff
Boards tested:  tegra124-jetson-tk1, tegra20-ventana,
tegra210-p2371-2180, tegra30-cardhu-a04

Cheers
Jon

-- 
nvpublic


Re: [PATCH v2 3/3] rcu: validate arguments for rcu tracepoints

2019-03-26 Thread Paul E. McKenney
On Tue, Mar 26, 2019 at 08:13:11PM +0800, Yafang Shao wrote:
> When CONFIG_RCU_TRACE is not set, all these tracepoints are defined as
> do-nothing macro.
> We'd better make those inline functions that take proper arguments.
> 
> As RCU_TRACE() is defined as do-nothing macro as well when
> CONFIG_RCU_TRACE is not set, so we can clean it up.

How about this for the commit log?

Unless the CONFIG_RCU_TRACE kconfig option is set, almost all
of RCU's tracepoints are defined as empty macros.  It would
be better if these tracepoints could instead be empty inline
functions with proper arguments and type checking.  It would
also be good to get rid of the RCU_TRACE() macro, which
compiles its argument in CONFIG_RCU_TRACE=y kernels and
omits them otherwise.

This commit therefore creates a TRACE_EVENT_RCU macro that
is defined as TRACE_EVENT in CONFIG_RCU_TRACE=y kernels and
as the new TRACE_EVENT_NOP otherwise, which allows the
empty macros and the RCU_TRACE() macro to be eliminated.

With that:

Reviewed-by: Paul E. McKenney 

> Signed-off-by: Yafang Shao 
> ---
>  include/trace/events/rcu.h | 81 
> ++
>  kernel/rcu/rcu.h   |  9 ++
>  kernel/rcu/tree.c  |  8 ++---
>  3 files changed, 31 insertions(+), 67 deletions(-)
> 
> diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h
> index f0c4d10..e3f357b 100644
> --- a/include/trace/events/rcu.h
> +++ b/include/trace/events/rcu.h
> @@ -7,6 +7,12 @@
>  
>  #include 
>  
> +#ifdef CONFIG_RCU_TRACE
> +#define TRACE_EVENT_RCU TRACE_EVENT
> +#else
> +#define TRACE_EVENT_RCU TRACE_EVENT_NOP
> +#endif
> +
>  /*
>   * Tracepoint for start/end markers used for utilization calculations.
>   * By convention, the string is of the following forms:
> @@ -35,8 +41,6 @@
>   TP_printk("%s", __entry->s)
>  );
>  
> -#ifdef CONFIG_RCU_TRACE
> -
>  #if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU)
>  
>  /*
> @@ -62,7 +66,7 @@
>   *   "end": End a grace period.
>   *   "cpuend": CPU first notices a grace-period end.
>   */
> -TRACE_EVENT(rcu_grace_period,
> +TRACE_EVENT_RCU(rcu_grace_period,
>  
>   TP_PROTO(const char *rcuname, unsigned long gp_seq, const char 
> *gpevent),
>  
> @@ -101,7 +105,7 @@
>   * "Cleanup": Clean up rcu_node structure after previous GP.
>   * "CleanupMore": Clean up, and another GP is needed.
>   */
> -TRACE_EVENT(rcu_future_grace_period,
> +TRACE_EVENT_RCU(rcu_future_grace_period,
>  
>   TP_PROTO(const char *rcuname, unsigned long gp_seq,
>unsigned long gp_seq_req, u8 level, int grplo, int grphi,
> @@ -141,7 +145,7 @@
>   * rcu_node structure, and the mask of CPUs that will be waited for.
>   * All but the type of RCU are extracted from the rcu_node structure.
>   */
> -TRACE_EVENT(rcu_grace_period_init,
> +TRACE_EVENT_RCU(rcu_grace_period_init,
>  
>   TP_PROTO(const char *rcuname, unsigned long gp_seq, u8 level,
>int grplo, int grphi, unsigned long qsmask),
> @@ -186,7 +190,7 @@
>   *   "endwake": Woke piggybackers up.
>   *   "done": Someone else did the expedited grace period for us.
>   */
> -TRACE_EVENT(rcu_exp_grace_period,
> +TRACE_EVENT_RCU(rcu_exp_grace_period,
>  
>   TP_PROTO(const char *rcuname, unsigned long gpseq, const char *gpevent),
>  
> @@ -218,7 +222,7 @@
>   *   "nxtlvl": Advance to next level of rcu_node funnel
>   *   "wait": Wait for someone else to do expedited GP
>   */
> -TRACE_EVENT(rcu_exp_funnel_lock,
> +TRACE_EVENT_RCU(rcu_exp_funnel_lock,
>  
>   TP_PROTO(const char *rcuname, u8 level, int grplo, int grphi,
>const char *gpevent),
> @@ -269,7 +273,7 @@
>   *   "WaitQueue": Enqueue partially done, timed wait for it to complete.
>   *   "WokeQueue": Partial enqueue now complete.
>   */
> -TRACE_EVENT(rcu_nocb_wake,
> +TRACE_EVENT_RCU(rcu_nocb_wake,
>  
>   TP_PROTO(const char *rcuname, int cpu, const char *reason),
>  
> @@ -297,7 +301,7 @@
>   * include SRCU), the grace-period number that the task is blocking
>   * (the current or the next), and the task's PID.
>   */
> -TRACE_EVENT(rcu_preempt_task,
> +TRACE_EVENT_RCU(rcu_preempt_task,
>  
>   TP_PROTO(const char *rcuname, int pid, unsigned long gp_seq),
>  
> @@ -324,7 +328,7 @@
>   * read-side critical section exiting that critical section.  Track the
>   * type of RCU (which one day might include SRCU) and the task's PID.
>   */
> -TRACE_EVENT(rcu_unlock_preempted_task,
> +TRACE_EVENT_RCU(rcu_unlock_preempted_task,
>  
>   TP_PROTO(const char *rcuname, unsigned long gp_seq, int pid),
>  
> @@ -353,7 +357,7 @@
>   * whether there are any blocked tasks blocking the current grace period.
>   * All but the type of RCU are extracted from the rcu_node structure.
>   */
> -TRACE_EVENT(rcu_quiescent_state_report,
> +TRACE_EVENT_RCU(rcu_quiescent_state_report,
>  
>   TP_PROTO(const char *rcuname, unsigned long gp_seq,
> 

Re: [PATCH -tip v3 04/10] x86/kprobes: Prohibit probing on IRQ handlers directly

2019-03-26 Thread Andrea Righi
On Tue, Mar 26, 2019 at 11:50:52PM +0900, Masami Hiramatsu wrote:
> On Mon, 25 Mar 2019 17:23:34 -0400
> Steven Rostedt  wrote:
> 
> > On Wed, 13 Feb 2019 01:12:44 +0900
> > Masami Hiramatsu  wrote:
> > 
> > > Prohibit probing on IRQ handlers in irqentry_text because
> > > if it interrupts user mode, at that point we haven't changed
> > > to kernel space yet and which eventually leads a double fault.
> > > E.g.
> > > 
> > >  # echo p apic_timer_interrupt > kprobe_events
> > 
> > Hmm, this breaks one of my tests (which I probe on do_IRQ).
> 
> OK, it seems this patch is a bit redundant, because
> I found that these interrupt handler issue has been fixed
> by Andrea's commit before merge this patch.
> 
> commit a50480cb6d61d5c5fc13308479407b628b6bc1c5
> Author: Andrea Righi 
> Date:   Thu Dec 6 10:56:48 2018 +0100
> 
> kprobes/x86: Blacklist non-attachable interrupt functions
> 
> These interrupt functions are already non-attachable by kprobes.
> Blacklist them explicitly so that they can show up in
> /sys/kernel/debug/kprobes/blacklist and tools like BCC can use this
> additional information.
> 
> This description is a bit odd (maybe his patch is after mine?) I think
> while updating this series, the patches were merged out of order.
> Anyway, with above patch, the core problematic probe points are blacklisted.

This is the previous thread when I posted my patch (not sure if it helps
to figure out what happened - maybe it was just an out of order merge
issue, like you said):

https://lkml.org/lkml/2018/12/6/212

> 
> > 
> > It's been working for years.
> > 
> > 
> > >  # echo 1 > events/kprobes/enable
> > >  PANIC: double fault, error_code: 0x0
> > >  CPU: 1 PID: 814 Comm: less Not tainted 4.20.0-rc3+ #30
> > >  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996)
> > >  RIP: 0010:error_entry+0x12/0xf0
> > >  [snip]
> > >  Call Trace:
> > >   
> > >   ? native_iret+0x7/0x7
> > >   ? async_page_fault+0x8/0x30
> > >   ? trace_hardirqs_on_thunk+0x1c/0x1c
> > >   ? error_entry+0x7c/0xf0
> > >   ? async_page_fault+0x8/0x30
> > >   ? native_iret+0x7/0x7
> > >   ? int3+0xa/0x20
> > >   ? trace_hardirqs_on_thunk+0x1c/0x1c
> > >   ? error_entry+0x7c/0xf0
> > >   ? int3+0xa/0x20
> > >   ? apic_timer_interrupt+0x1/0x20
> > >   
> > >  Kernel panic - not syncing: Machine halted.
> > >  Kernel Offset: disabled
> > 
> > I'm not able to reproduce this (by removing this commit). 
> 
> I ensured that if I revert both of this patch and Andrea's patch,
> I can reproduce this with probing on apic_timer_interrupt().
> 
> > I'm thinking something else may have changed, as I've been tracing
> > interrupt entries for years, and interrupting userspace while doing
> > this.
> > 
> > I've even added probes where ftrace isn't (where it uses an int3) and
> > still haven't hit a problem.
> > 
> > I think this patch is swatting a symptom of a bug and not addressing
> > the bug itself. Can you send me the config that triggers this?
> 
> Yes, it seems you're right. Andrea's commit specifically fixed the
> issue and mine is redundant. (I'm not sure why do_IRQ is in 
> __irqentry_text...)

Not sure if there are specific reasons for that, but do_IRQ is part of
__irqentry_text because it's explicitly marked with __irq_entry.

> 
> So, Ingo, please revert this, since this bug already has been fixed by
> commit a50480cb6d61 ("kprobes: x86_64: blacklist non-attachable interrupt
> functions")
> 
> BTW, for further error investigation, I attached my kconfig which is
> usually I'm testing (some options can be changed) on Qemu.
> I'm using my mini-container shellscript ( https://github.com/mhiramat/mincs 
> ) which supports qemu-container.
> 
> 
> Thank you,
> 
> -- 
> Masami Hiramatsu 

Thanks,
-Andrea


Re: [PATCH 09/10] ALSA: pcm: Add snd_pcm_ops for snd_pcm_link()

2019-03-26 Thread Timo Wischer

On 3/26/19 15:23, Takashi Iwai wrote:

On Tue, 26 Mar 2019 12:25:37 +0100,
Timo Wischer wrote:

On 3/26/19 09:35, Takashi Iwai wrote:

 On Tue, 26 Mar 2019 08:49:33 +0100,
  wrote:
 
 From: Timo Wischer 
 
 snd_pcm_link() can be called by the user as long as the device is not

 yet started. Therefore currently a driver which wants to iterate over
 the linked substreams has to do this at the start trigger. But the 
start
 trigger should not block for a long time. Therefore there is no 
callback
 which can be used to iterate over the linked substreams without 
delaying
 the start trigger.
 This patch introduces a new callback function which will be called 
after
 the linked substream list was updated by snd_pcm_link(). This callback
 function is allowed to block for a longer time without interfering the
 synchronized start up of linked substreams.
 
 Signed-off-by: Timo Wischer 
 
 Well, the idea appears interesting, but I'm afraid that the

 implementation is still racy.  The place you're calling the new
 callback isn't protected, hence the stream can be triggered while
 calling it.  That is, even during operating your loopback link_changed
 callback, another thread is able to start the stream.
 
Hi Takashi,


As far as I got you mean the following scenario:

   * snd_pcm_link() is called for a HW sound card
   + loopback_snd_timer_link_changed()

The start may happen at this point.


In this case the last link status will be used and aloop will print a 
warning "Another sound timer was requested but at least one device is 
already running...".


Without this patch set a similar issue already exists. When calling 
snd_pcm_start() before snd_pcm_link() was done the additional device 
linked by the snd_pcm_link() will not be started.
Therefore the application has already to take care about the order of 
the calls.





   + loopback_snd_timer_open()
   + spin_lock_irqsave(&dpcm->cable->lock, flags);
   * snd_pcm_start() called for aloop sound card
   + loopback_trigger()
   + spin_lock(&cable->lock) -> has to wait till loopback_snd_timer_open()
 calls spin_unlock_irqrestore()

So far snd_pcm_start() has to wait for loopback_snd_timer_open().

   * loopback_snd_timer_open() will continue with
   + dpcm->cable->snd_timer.instance = NULL;
   + spin_unlock_irqrestore()
   * loopback_trigger() can enter the lock
   + loopback_snd_timer_start() will fail with -EINVAL due to
 (loopback_trigger == NULL)

At least this will not result in memory corruption due to race or any other
weird behavior.

I don't expect the memory corruption, but my point is that dealing
with linked streams is still tricky.  It was considered for the
lightweight coupled start/stop operation, and something intensively
depending on the linked status was out of the original design...


But my expectation is that snd_pcm_link(hw, aloop) or snd_pcm_link(aloop, hw)
is only called by the application calling snd_pcm_start(aloop)
because the same aloop device cannot be opened by multiple applications at the
same time.

Do you see a use case where one application would call snd_pcm_start() in
parallel with snd_pcm_link() (somehow configuring the device)?

It's not about the actual application usages but rather against the
malicious attacks.  Especially aloop is a virtual device that is
available all over the place, it may be deployed / attacked easily.
The attack we are identifying here can only be done by the application 
opening the aloop device.
An application allowed to open the aloop device is anyway able to 
manipulate the audio streaming.
Or do you see an attack which would influence any other device/stream 
not opened by this application?



Maybe we should add an additional synchronization mechanism in pcm_native.c
to avoid call of snd_pcm_link() in parallel with snd_pcm_start().

If it really matters...  Honestly speaking, I'm not fully convinced
whether we want to deal with this using the PCM link mechanism.

What's the motivation for using the linked streams in the first place?
That's one of the biggest missing piece in the whole picture.
In general when the user uses snd_pcm_link() it expects that the linked 
devices are somehow synchronized.
Any applications already using snd_pcm_link() do not need to be adapted 
to use the new feature of aloop (for example JACK or ALSA multi plugin)


But when linking a HW sound card and aloop without this patch set, both 
devices will be started in sync but
the snd_pcm_period_elapsed() calls of the different devices will drift. 
To avoid this the aloop plugin can automatically use the right timer.
If this feature is not implemented the user has to use snd_pcm_link() to 
trigger snd_pcm_start() in sync but also has to configure the aloop 
plugin to use the right sound timer.
May be the linked cards can change 

[PATCH 09/10] PCI: tegra: Add Tegra194 PCIe support

2019-03-26 Thread Vidya Sagar
Add support for Synopsys DesignWare core IP based PCIe host controller
present in Tegra194 SoC.

Signed-off-by: Vidya Sagar 
---
 drivers/pci/controller/dwc/Kconfig |   10 +
 drivers/pci/controller/dwc/Makefile|1 +
 drivers/pci/controller/dwc/pcie-tegra194.c | 1862 
 3 files changed, 1873 insertions(+)
 create mode 100644 drivers/pci/controller/dwc/pcie-tegra194.c

diff --git a/drivers/pci/controller/dwc/Kconfig 
b/drivers/pci/controller/dwc/Kconfig
index 6ea74b1c0d94..d80f2d77892a 100644
--- a/drivers/pci/controller/dwc/Kconfig
+++ b/drivers/pci/controller/dwc/Kconfig
@@ -213,4 +213,14 @@ config PCIE_UNIPHIER
  Say Y here if you want PCIe controller support on UniPhier SoCs.
  This driver supports LD20 and PXs3 SoCs.
 
+config PCIE_TEGRA194
+   bool "NVIDIA Tegra (T194) PCIe controller"
+   depends on TEGRA_BPMP && (ARCH_TEGRA || COMPILE_TEST)
+   depends on PCI_MSI_IRQ_DOMAIN
+   select PCIE_DW_HOST
+   select PHY_TEGRA194_PCIE_P2U
+   help
+ Say Y here if you want support for DesignWare core based PCIe host
+ controller found in NVIDIA Tegra T194 SoC.
+
 endmenu
diff --git a/drivers/pci/controller/dwc/Makefile 
b/drivers/pci/controller/dwc/Makefile
index b5f3b83cc2b3..4362f0ea89ac 100644
--- a/drivers/pci/controller/dwc/Makefile
+++ b/drivers/pci/controller/dwc/Makefile
@@ -16,6 +16,7 @@ obj-$(CONFIG_PCIE_KIRIN) += pcie-kirin.o
 obj-$(CONFIG_PCIE_HISI_STB) += pcie-histb.o
 obj-$(CONFIG_PCI_MESON) += pci-meson.o
 obj-$(CONFIG_PCIE_UNIPHIER) += pcie-uniphier.o
+obj-$(CONFIG_PCIE_TEGRA194) += pcie-tegra194.o
 
 # The following drivers are for devices that use the generic ACPI
 # pci_root.c driver but don't support standard ECAM config access.
diff --git a/drivers/pci/controller/dwc/pcie-tegra194.c 
b/drivers/pci/controller/dwc/pcie-tegra194.c
new file mode 100644
index ..7f6be38c8456
--- /dev/null
+++ b/drivers/pci/controller/dwc/pcie-tegra194.c
@@ -0,0 +1,1862 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * PCIe host controller driver for Tegra T194 SoC
+ *
+ * Copyright (C) 2018 NVIDIA Corporation.
+ *
+ * Author: Vidya Sagar 
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include "pcie-designware.h"
+#include 
+#include 
+#include "../../pcie/portdrv.h"
+
+#define dw_pcie_to_tegra_pcie(x) container_of(x, struct tegra_pcie_dw, pci)
+
+#define CTRL_5 5
+
+#define APPL_PINMUX0x0
+#define APPL_PINMUX_PEX_RSTBIT(0)
+#define APPL_PINMUX_CLKREQ_OVERRIDE_EN BIT(2)
+#define APPL_PINMUX_CLKREQ_OVERRIDEBIT(3)
+#define APPL_PINMUX_CLK_OUTPUT_IN_OVERRIDE_EN  BIT(4)
+#define APPL_PINMUX_CLK_OUTPUT_IN_OVERRIDE BIT(5)
+#define APPL_PINMUX_CLKREQ_OUT_OVRD_EN BIT(9)
+#define APPL_PINMUX_CLKREQ_OUT_OVRDBIT(10)
+
+#define APPL_CTRL  0x4
+#define APPL_CTRL_SYS_PRE_DET_STATEBIT(6)
+#define APPL_CTRL_LTSSM_EN BIT(7)
+#define APPL_CTRL_HW_HOT_RST_ENBIT(20)
+#define APPL_CTRL_HW_HOT_RST_MODE_MASK GENMASK(1, 0)
+#define APPL_CTRL_HW_HOT_RST_MODE_SHIFT22
+#define APPL_CTRL_HW_HOT_RST_MODE_IMDT_RST 0x1
+
+#define APPL_INTR_EN_L0_0  0x8
+#define APPL_INTR_EN_L0_0_LINK_STATE_INT_ENBIT(0)
+#define APPL_INTR_EN_L0_0_MSI_RCV_INT_EN   BIT(4)
+#define APPL_INTR_EN_L0_0_INT_INT_EN   BIT(8)
+#define APPL_INTR_EN_L0_0_CDM_REG_CHK_INT_EN   BIT(19)
+#define APPL_INTR_EN_L0_0_SYS_INTR_EN  BIT(30)
+#define APPL_INTR_EN_L0_0_SYS_MSI_INTR_EN  BIT(31)
+
+#define APPL_INTR_STATUS_L00xC
+#define APPL_INTR_STATUS_L0_LINK_STATE_INT BIT(0)
+#define APPL_INTR_STATUS_L0_INT_INTBIT(8)
+#define APPL_INTR_STATUS_L0_CDM_REG_CHK_INTBIT(18)
+
+#define APPL_INTR_EN_L1_0_00x1C
+#define APPL_INTR_EN_L1_0_0_LINK_REQ_RST_NOT_INT_ENBIT(1)
+
+#define APPL_INTR_STATUS_L1_0_00x20
+#define APPL_INTR_STATUS_L1_0_0_LINK_REQ_RST_NOT_CHGED BIT(1)
+
+#define APPL_INTR_STATUS_L1_1  0x2C
+#define APPL_INTR_STATUS_L1_2  0x30
+#define APPL_INTR_STATUS_L1_3  0x34
+#define APPL_INTR_STATUS_L1_6  0x3C
+#define APPL_INTR_STATUS_L1_7  0x40
+
+#define APPL_INTR_EN_L1_8_00x44
+#define APPL_INTR_EN_L1_8_BW_MGT_INT_ENBIT(2)
+#define APPL_INTR_EN_L1_8_AUTO_BW_INT_EN   BIT(3)
+#define APPL_INTR_EN_L1_8_INTX_EN  BIT(11)
+#define APPL_INTR_EN_L1_8_AER_INT_EN   BIT(15)
+
+#define APPL_INTR_STATUS_L1_8_00x4C
+#define 

[PATCH 10/10] arm64: Add Tegra194 PCIe driver to defconfig

2019-03-26 Thread Vidya Sagar
Add PCIe host controller driver for DesignWare core based
PCIe controller IP present in Tegra194.

Signed-off-by: Vidya Sagar 
---
 arch/arm64/configs/defconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig
index 2d9c39033c1a..2ddea5c4e87d 100644
--- a/arch/arm64/configs/defconfig
+++ b/arch/arm64/configs/defconfig
@@ -87,6 +87,7 @@ CONFIG_PCIE_QCOM=y
 CONFIG_PCIE_ARMADA_8K=y
 CONFIG_PCIE_KIRIN=y
 CONFIG_PCIE_HISI_STB=y
+CONFIG_PCIE_TEGRA194=y
 CONFIG_ARM64_VA_BITS_48=y
 CONFIG_SCHED_MC=y
 CONFIG_NUMA=y
-- 
2.7.4



[PATCH 07/10] arm64: tegra: Enable PCIe slots in P2972-0000 board

2019-03-26 Thread Vidya Sagar
Enable PCIe controller nodes to enable respective PCIe slots on
P2972-0000 board. Following is the ownership of slots by different
PCIe controllers.
Controller-0 : M.2 Key-M slot
Controller-1 : On-board Marvell eSATA controller
Controller-3 : M.2 Key-E slot

Signed-off-by: Vidya Sagar 
---
 arch/arm64/boot/dts/nvidia/tegra194-p2888.dtsi |  2 +-
 arch/arm64/boot/dts/nvidia/tegra194-p2972-.dts | 50 ++
 2 files changed, 51 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/boot/dts/nvidia/tegra194-p2888.dtsi 
b/arch/arm64/boot/dts/nvidia/tegra194-p2888.dtsi
index 246c1ebbd055..13263529125b 100644
--- a/arch/arm64/boot/dts/nvidia/tegra194-p2888.dtsi
+++ b/arch/arm64/boot/dts/nvidia/tegra194-p2888.dtsi
@@ -191,7 +191,7 @@
regulator-boot-on;
};
 
-   sd3 {
+   vdd_1v8ao: sd3 {
regulator-name = "VDD_1V8AO";
regulator-min-microvolt = 
<1800000>;
regulator-max-microvolt = 
<1800000>;
diff --git a/arch/arm64/boot/dts/nvidia/tegra194-p2972-.dts 
b/arch/arm64/boot/dts/nvidia/tegra194-p2972-.dts
index b62e96945846..732756feb698 100644
--- a/arch/arm64/boot/dts/nvidia/tegra194-p2972-.dts
+++ b/arch/arm64/boot/dts/nvidia/tegra194-p2972-.dts
@@ -169,4 +169,54 @@
};
};
};
+
+   pcie@14180000 {
+   status = "okay";
+
+   vddio-pex-ctl-supply = <&vdd_1v8ao>;
+
+   phys = <&p2u_2>,
+  <&p2u_3>,
+  <&p2u_4>,
+  <&p2u_5>;
+   phy-names = "pcie-p2u-0", "pcie-p2u-1", "pcie-p2u-2",
+   "pcie-p2u-3";
+   };
+
+   pcie@14100000 {
+   status = "okay";
+
+   vddio-pex-ctl-supply = <&vdd_1v8ao>;
+
+   phys = <&p2u_0>;
+   phy-names = "pcie-p2u-0";
+   };
+
+   pcie@14140000 {
+   status = "okay";
+
+   vddio-pex-ctl-supply = <&vdd_1v8ao>;
+
+   phys = <&p2u_7>;
+   phy-names = "pcie-p2u-0";
+   };
+
+   pcie@141a0000 {
+   status = "disabled";
+
+   vddio-pex-ctl-supply = <&vdd_1v8ao>;
+
+   phys = <&p2u_12>,
+  <&p2u_13>,
+  <&p2u_14>,
+  <&p2u_15>,
+  <&p2u_16>,
+  <&p2u_17>,
+  <&p2u_18>,
+  <&p2u_19>;
+
+   phy-names = "pcie-p2u-0", "pcie-p2u-1", "pcie-p2u-2",
+   "pcie-p2u-3", "pcie-p2u-4", "pcie-p2u-5",
+   "pcie-p2u-6", "pcie-p2u-7";
+   };
 };
-- 
2.7.4



[PATCH 08/10] phy: tegra: Add PCIe PIPE2UPHY support

2019-03-26 Thread Vidya Sagar
Synopsys DesignWare core based PCIe controllers in Tegra 194 SoC interface
with Universal PHY (UPHY) module through a PIPE2UPHY (P2U) module.
For each PCIe lane of a controller, there is a P2U unit instantiated at
hardware level. This driver provides support for the programming required
for each P2U that is going to be used for a PCIe controller.

Signed-off-by: Vidya Sagar 
---
 drivers/phy/tegra/Kconfig |   7 ++
 drivers/phy/tegra/Makefile|   1 +
 drivers/phy/tegra/pcie-p2u-tegra194.c | 138 ++
 3 files changed, 146 insertions(+)
 create mode 100644 drivers/phy/tegra/pcie-p2u-tegra194.c

diff --git a/drivers/phy/tegra/Kconfig b/drivers/phy/tegra/Kconfig
index a3b1de953fb7..1460c060fa70 100644
--- a/drivers/phy/tegra/Kconfig
+++ b/drivers/phy/tegra/Kconfig
@@ -6,3 +6,10 @@ config PHY_TEGRA_XUSB
 
  To compile this driver as a module, choose M here: the module will
  be called phy-tegra-xusb.
+
+config PHY_TEGRA194_PCIE_P2U
+tristate "NVIDIA Tegra P2U PHY Driver"
+depends on ARCH_TEGRA
+select GENERIC_PHY
+help
+  Enable this to support the P2U (PIPE to UPHY) that is part of Tegra 
19x SOCs.
diff --git a/drivers/phy/tegra/Makefile b/drivers/phy/tegra/Makefile
index 898589238fd9..f85b2c86643d 100644
--- a/drivers/phy/tegra/Makefile
+++ b/drivers/phy/tegra/Makefile
@@ -4,3 +4,4 @@ phy-tegra-xusb-y += xusb.o
 phy-tegra-xusb-$(CONFIG_ARCH_TEGRA_124_SOC) += xusb-tegra124.o
 phy-tegra-xusb-$(CONFIG_ARCH_TEGRA_132_SOC) += xusb-tegra124.o
 phy-tegra-xusb-$(CONFIG_ARCH_TEGRA_210_SOC) += xusb-tegra210.o
+obj-$(CONFIG_PHY_TEGRA194_PCIE_P2U) += pcie-p2u-tegra194.o
diff --git a/drivers/phy/tegra/pcie-p2u-tegra194.c 
b/drivers/phy/tegra/pcie-p2u-tegra194.c
new file mode 100644
index ..bb2412ec4765
--- /dev/null
+++ b/drivers/phy/tegra/pcie-p2u-tegra194.c
@@ -0,0 +1,138 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * P2U (PIPE to UPHY) driver for Tegra T194 SoC
+ *
+ * Copyright (C) 2018 NVIDIA Corporation.
+ *
+ * Author: Vidya Sagar 
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define P2U_PERIODIC_EQ_CTRL_GEN3  0xc0
+#define P2U_PERIODIC_EQ_CTRL_GEN3_PERIODIC_EQ_EN   BIT(0)
+#define P2U_PERIODIC_EQ_CTRL_GEN3_INIT_PRESET_EQ_TRAIN_EN  BIT(1)
+#define P2U_PERIODIC_EQ_CTRL_GEN4  0xc4
+#define P2U_PERIODIC_EQ_CTRL_GEN4_INIT_PRESET_EQ_TRAIN_EN  BIT(1)
+
+#define P2U_RX_DEBOUNCE_TIME   0xa4
+#define P2U_RX_DEBOUNCE_TIME_DEBOUNCE_TIMER_MASK   0x
+#define P2U_RX_DEBOUNCE_TIME_DEBOUNCE_TIMER_VAL160
+
+struct tegra_p2u {
+   void __iomem*base;
+};
+
+static int tegra_p2u_power_off(struct phy *x)
+{
+   return 0;
+}
+
+static int tegra_p2u_power_on(struct phy *x)
+{
+   u32 val;
+   struct tegra_p2u *phy = phy_get_drvdata(x);
+
+   val = readl(phy->base + P2U_PERIODIC_EQ_CTRL_GEN3);
+   val &= ~P2U_PERIODIC_EQ_CTRL_GEN3_PERIODIC_EQ_EN;
+   val |= P2U_PERIODIC_EQ_CTRL_GEN3_INIT_PRESET_EQ_TRAIN_EN;
+   writel(val, phy->base + P2U_PERIODIC_EQ_CTRL_GEN3);
+
+   val = readl(phy->base + P2U_PERIODIC_EQ_CTRL_GEN4);
+   val |= P2U_PERIODIC_EQ_CTRL_GEN4_INIT_PRESET_EQ_TRAIN_EN;
+   writel(val, phy->base + P2U_PERIODIC_EQ_CTRL_GEN4);
+
+   val = readl(phy->base + P2U_RX_DEBOUNCE_TIME);
+   val &= ~P2U_RX_DEBOUNCE_TIME_DEBOUNCE_TIMER_MASK;
+   val |= P2U_RX_DEBOUNCE_TIME_DEBOUNCE_TIMER_VAL;
+   writel(val, phy->base + P2U_RX_DEBOUNCE_TIME);
+
+   return 0;
+}
+
+static int tegra_p2u_init(struct phy *x)
+{
+   return 0;
+}
+
+static int tegra_p2u_exit(struct phy *x)
+{
+   return 0;
+}
+
+static const struct phy_ops ops = {
+   .init   = tegra_p2u_init,
+   .exit   = tegra_p2u_exit,
+   .power_on   = tegra_p2u_power_on,
+   .power_off  = tegra_p2u_power_off,
+   .owner  = THIS_MODULE,
+};
+
+static int tegra_p2u_probe(struct platform_device *pdev)
+{
+   struct tegra_p2u *phy;
+   struct phy *generic_phy;
+   struct phy_provider *phy_provider;
+   struct device *dev = >dev;
+   struct resource *res;
+
+   phy = devm_kzalloc(dev, sizeof(*phy), GFP_KERNEL);
+   if (!phy)
+   return -ENOMEM;
+
+   res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "base");
+   phy->base = devm_ioremap_resource(dev, res);
+   if (IS_ERR(phy->base))
+   return PTR_ERR(phy->base);
+
+   platform_set_drvdata(pdev, phy);
+
+   generic_phy = devm_phy_create(dev, NULL, );
+   if (IS_ERR(generic_phy))
+   return PTR_ERR(generic_phy);
+
+   phy_set_drvdata(generic_phy, phy);
+
+   phy_provider = devm_of_phy_provider_register(dev, of_phy_simple_xlate);
+   if (IS_ERR(phy_provider))
+   return PTR_ERR(phy_provider);
+
+   return 0;
+}
+
+static int 

[PATCH 05/10] dt-bindings: PCI: tegra: Add device tree support for T194

2019-03-26 Thread Vidya Sagar
Add support for Tegra194 PCIe controllers. These controllers are based
on Synopsys DesignWare core IP.

Signed-off-by: Vidya Sagar 
---
 .../bindings/pci/nvidia,tegra194-pcie.txt  | 209 +
 .../devicetree/bindings/phy/phy-tegra194-p2u.txt   |  34 
 2 files changed, 243 insertions(+)
 create mode 100644 
Documentation/devicetree/bindings/pci/nvidia,tegra194-pcie.txt
 create mode 100644 Documentation/devicetree/bindings/phy/phy-tegra194-p2u.txt

diff --git a/Documentation/devicetree/bindings/pci/nvidia,tegra194-pcie.txt 
b/Documentation/devicetree/bindings/pci/nvidia,tegra194-pcie.txt
new file mode 100644
index ..31527283a0cd
--- /dev/null
+++ b/Documentation/devicetree/bindings/pci/nvidia,tegra194-pcie.txt
@@ -0,0 +1,209 @@
+NVIDIA Tegra PCIe controller (Synopsys DesignWare Core based)
+
+This PCIe host controller is based on the Synopsys DesignWare PCIe IP
+and thus inherits all the common properties defined in designware-pcie.txt.
+
+Required properties:
+- compatible: For Tegra19x, must contain "nvidia,tegra194-pcie".
+- device_type: Must be "pci"
+- reg: A list of physical base address and length for each set of controller
+  registers. Must contain an entry for each entry in the reg-names property.
+- reg-names: Must include the following entries:
+  "appl": Controller's application logic registers
+  "window1": This is the aperture of controller available under 4GB boundary
+ (i.e. within 32-bit space). This aperture is typically used for
+ accessing config space of root port itself and also the connected
+ endpoints (by appropriately programming internal Address
+ Translation Unit's (iATU) out bound region) and also to map
+ prefetchable/non-prefetchable BARs.
+  "config": As per the definition in designware-pcie.txt
+  "atu_dma": iATU and DMA register. This is where the iATU (internal Address
+ Translation Unit) registers of the PCIe core are made available
+ for SW access.
+  "dbi": The aperture where root port's own configuration registers are
+ available
+  "window2": This is the larger (compared to window1) aperture available above
+ 4GB boundary (i.e. in 64-bit space). This is typically used for
+ mapping prefetchable/non-prefetchable BARs of endpoints
+- interrupts: A list of interrupt outputs of the controller. Must contain an
+  entry for each entry in the interrupt-names property.
+- interrupt-names: Must include the following entries:
+  "intr": The Tegra interrupt that is asserted for controller interrupts
+  "msi": The Tegra interrupt that is asserted when an MSI is received
+- bus-range: Range of bus numbers associated with this controller
+- #address-cells: Address representation for root ports (must be 3)
+  - cell 0 specifies the bus and device numbers of the root port:
+[23:16]: bus number
+[15:11]: device number
+  - cell 1 denotes the upper 32 address bits and should be 0
+  - cell 2 contains the lower 32 address bits and is used to translate to the
+CPU address space
+- #size-cells: Size representation for root ports (must be 2)
+- ranges: Describes the translation of addresses for root ports and standard
+  PCI regions. The entries must be 7 cells each, where the first three cells
+  correspond to the address as described for the #address-cells property
+  above, the fourth and fifth cells are for the physical CPU address to
+  translate to and the sixth and seventh cells are as described for the
+  #size-cells property above.
+  - Entries setup the mapping for the standard I/O, memory and
+prefetchable PCI regions. The first cell determines the type of region
+that is setup:
+- 0x8100: I/O memory region
+- 0x8200: non-prefetchable memory region
+- 0xc200: prefetchable memory region
+  Please refer to the standard PCI bus binding document for a more detailed
+  explanation.
+- #interrupt-cells: Size representation for interrupts (must be 1)
+- interrupt-map-mask and interrupt-map: Standard PCI IRQ mapping properties
+  Please refer to the standard PCI bus binding document for a more detailed
+  explanation.
+- clocks: Must contain an entry for each entry in clock-names.
+  See ../clocks/clock-bindings.txt for details.
+- clock-names: Must include the following entries:
+  - core_clk
+- resets: Must contain an entry for each entry in reset-names.
+  See ../reset/reset.txt for details.
+- reset-names: Must include the following entries:
+  - core_apb_rst
+  - core_rst
+- phys: Must contain a phandle to P2U PHY for each entry in phy-names.
+- phy-names: Must include an entry for each active lane.
+  "pcie-p2u-N": where N ranges from 0 to one less than the total number of 
lanes
+- Controller dependent register offsets
+  - nvidia,event-cntr-ctrl: EVENT_COUNTER_CONTROL reg offset
+  0x168 - FPGA
+  0x1a8 - C1, C2 and C3
+  0x1c4 - C4
+  0x1d8 - C0 and C5
+  - 

[PATCH 06/10] arm64: tegra: Add P2U and PCIe controller nodes to Tegra194 DT

2019-03-26 Thread Vidya Sagar
Add P2U (PIPE to UPHY) and PCIe controller nodes to device tree.
The Tegra194 SoC contains six PCIe controllers and twenty P2U instances
grouped into two different PHY bricks namely High-Speed IO (HSIO-12 P2Us)
and NVIDIA High Speed (NVHS-8 P2Us) respectively.

Signed-off-by: Vidya Sagar 
---
 arch/arm64/boot/dts/nvidia/tegra194.dtsi | 473 +++
 1 file changed, 473 insertions(+)

diff --git a/arch/arm64/boot/dts/nvidia/tegra194.dtsi 
b/arch/arm64/boot/dts/nvidia/tegra194.dtsi
index c77ca211fa8f..266a3058fa66 100644
--- a/arch/arm64/boot/dts/nvidia/tegra194.dtsi
+++ b/arch/arm64/boot/dts/nvidia/tegra194.dtsi
@@ -1054,4 +1054,477 @@
(GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_LEVEL_LOW)>;
interrupt-parent = <>;
};
+
+   hsio-p2u {
+   compatible = "simple-bus";
+   #address-cells = <2>;
+   #size-cells = <2>;
+   ranges;
+   p2u_0: p2u@03e1 {
+   compatible = "nvidia,tegra194-phy-p2u";
+   reg = <0x0 0x03e1 0x0 0x0001>;
+   reg-names = "base";
+
+   #phy-cells = <0>;
+   };
+   p2u_1: p2u@03e2 {
+   compatible = "nvidia,tegra194-phy-p2u";
+   reg = <0x0 0x03e2 0x0 0x0001>;
+   reg-names = "base";
+
+   #phy-cells = <0>;
+   };
+   p2u_2: p2u@03e3 {
+   compatible = "nvidia,tegra194-phy-p2u";
+   reg = <0x0 0x03e3 0x0 0x0001>;
+   reg-names = "base";
+
+   #phy-cells = <0>;
+   };
+   p2u_3: p2u@03e4 {
+   compatible = "nvidia,tegra194-phy-p2u";
+   reg = <0x0 0x03e4 0x0 0x0001>;
+   reg-names = "base";
+
+   #phy-cells = <0>;
+   };
+   p2u_4: p2u@03e5 {
+   compatible = "nvidia,tegra194-phy-p2u";
+   reg = <0x0 0x03e5 0x0 0x0001>;
+   reg-names = "base";
+
+   #phy-cells = <0>;
+   };
+   p2u_5: p2u@03e6 {
+   compatible = "nvidia,tegra194-phy-p2u";
+   reg = <0x0 0x03e6 0x0 0x0001>;
+   reg-names = "base";
+
+   #phy-cells = <0>;
+   };
+   p2u_6: p2u@03e7 {
+   compatible = "nvidia,tegra194-phy-p2u";
+   reg = <0x0 0x03e7 0x0 0x0001>;
+   reg-names = "base";
+
+   #phy-cells = <0>;
+   };
+   p2u_7: p2u@03e8 {
+   compatible = "nvidia,tegra194-phy-p2u";
+   reg = <0x0 0x03e8 0x0 0x0001>;
+   reg-names = "base";
+
+   #phy-cells = <0>;
+   };
+   p2u_8: p2u@03e9 {
+   compatible = "nvidia,tegra194-phy-p2u";
+   reg = <0x0 0x03e9 0x0 0x0001>;
+   reg-names = "base";
+
+   #phy-cells = <0>;
+   };
+   p2u_9: p2u@03ea {
+   compatible = "nvidia,tegra194-phy-p2u";
+   reg = <0x0 0x03ea 0x0 0x0001>;
+   reg-names = "base";
+
+   #phy-cells = <0>;
+   };
+   p2u_10: p2u@03f3 {
+   compatible = "nvidia,tegra194-phy-p2u";
+   reg = <0x0 0x03f3 0x0 0x0001>;
+   reg-names = "base";
+
+   #phy-cells = <0>;
+   };
+   p2u_11: p2u@03f4 {
+   compatible = "nvidia,tegra194-phy-p2u";
+   reg = <0x0 0x03f4 0x0 0x0001>;
+   reg-names = "base";
+
+   #phy-cells = <0>;
+   };
+   };
+
+   nvhs-p2u {
+   compatible = "simple-bus";
+   #address-cells = <2>;
+   #size-cells = <2>;
+   ranges;
+   p2u_12: p2u@03eb {
+   compatible = "nvidia,tegra194-phy-p2u";
+   reg = <0x0 0x03eb 0x0 0x0001>;
+   reg-names = "base";
+
+   #phy-cells = <0>;
+   };
+   p2u_13: p2u@03ec {
+   compatible = "nvidia,tegra194-phy-p2u";
+   reg = <0x0 0x03ec 0x0 0x0001>;
+   reg-names = "base";
+
+   #phy-cells = <0>;
+   };
+   p2u_14: p2u@03ed {
+  

[PATCH 04/10] PCI: Add #defines for PCIe spec r4.0 features

2019-03-26 Thread Vidya Sagar
Add #defines for the Data Link Feature and Physical Layer 16.0 GT/s
features.

Signed-off-by: Vidya Sagar 
---
 include/uapi/linux/pci_regs.h | 22 +-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
index 5c98133f2c94..3e01b55d548d 100644
--- a/include/uapi/linux/pci_regs.h
+++ b/include/uapi/linux/pci_regs.h
@@ -705,7 +705,9 @@
 #define PCI_EXT_CAP_ID_DPC 0x1D/* Downstream Port Containment */
 #define PCI_EXT_CAP_ID_L1SS0x1E/* L1 PM Substates */
 #define PCI_EXT_CAP_ID_PTM 0x1F/* Precision Time Measurement */
-#define PCI_EXT_CAP_ID_MAX PCI_EXT_CAP_ID_PTM
+#define PCI_EXT_CAP_ID_DLF 0x25/* Data Link Feature */
+#define PCI_EXT_CAP_ID_PL  0x26/* Physical Layer 16.0 GT/s */
+#define PCI_EXT_CAP_ID_MAX PCI_EXT_CAP_ID_PL
 
 #define PCI_EXT_CAP_DSN_SIZEOF 12
 #define PCI_EXT_CAP_MCAST_ENDPOINT_SIZEOF 40
@@ -1045,4 +1047,22 @@
 #define  PCI_L1SS_CTL1_LTR_L12_TH_SCALE0xe000  /* 
LTR_L1.2_THRESHOLD_Scale */
 #define PCI_L1SS_CTL2  0x0c/* Control 2 Register */
 
+/* Data Link Feature */
+#define PCI_DLF_CAP0x04/* Capabilities Register */
+#define  PCI_DLF_LOCAL_DLF_SUP_MASK0x007f  /* Local Data Link Feature 
Supported */
+#define  PCI_DLF_EXCHANGE_ENABLE   0x8000  /* Data Link Feature 
Exchange Enable */
+#define PCI_DLF_STS0x08/* Status Register */
+#define  PCI_DLF_REMOTE_DLF_SUP_MASK   0x007f  /* Remote Data Link Feature 
Supported */
+#define  PCI_DLF_REMOTE_DLF_SUP_VALID  0x8000  /* Remote Data Link Feature 
Support Valid */
+
+/* Physical Layer 16.0 GT/s */
+#define PCI_PL_16GT_CAP0x04/* Capabilities Register */
+#define PCI_PL_16GT_CTRL   0x08/* Control Register */
+#define PCI_PL_16GT_STS0x0c/* Status Register */
+#define PCI_PL_16GT_LDPM_STS   0x10/* Local Data Parity Mismatch Status 
Register */
+#define PCI_PL_16GT_FRDPM_STS  0x14/* First Retimer Data Parity Mismatch 
Status Register */
+#define PCI_PL_16GT_SRDPM_STS  0x18/* Second Retimer Data Parity Mismatch 
Status Register */
+#define PCI_PL_16GT_RSVD   0x1C/* Reserved */
+#define PCI_PL_16GT_LE_CTRL0x20/* Lane Equalization Control Register */
+
 #endif /* LINUX_PCI_REGS_H */
-- 
2.7.4



[PATCH 03/10] PCI: dwc: Move config space capability search API

2019-03-26 Thread Vidya Sagar
move PCIe config space capability search API to common designware file
as this can be used by both host and ep mode codes.
It also adds extended capability search APIs.

Signed-off-by: Vidya Sagar 
---
 drivers/pci/controller/dwc/pcie-designware-ep.c | 37 +
 drivers/pci/controller/dwc/pcie-designware.c| 73 +
 drivers/pci/controller/dwc/pcie-designware.h|  3 +
 3 files changed, 78 insertions(+), 35 deletions(-)

diff --git a/drivers/pci/controller/dwc/pcie-designware-ep.c 
b/drivers/pci/controller/dwc/pcie-designware-ep.c
index 24f5a775ad34..b9d9c9a4ba6d 100644
--- a/drivers/pci/controller/dwc/pcie-designware-ep.c
+++ b/drivers/pci/controller/dwc/pcie-designware-ep.c
@@ -40,39 +40,6 @@ void dw_pcie_ep_reset_bar(struct dw_pcie *pci, enum 
pci_barno bar)
__dw_pcie_ep_reset_bar(pci, bar, 0);
 }
 
-static u8 __dw_pcie_ep_find_next_cap(struct dw_pcie *pci, u8 cap_ptr,
- u8 cap)
-{
-   u8 cap_id, next_cap_ptr;
-   u16 reg;
-
-   reg = dw_pcie_readw_dbi(pci, cap_ptr);
-   next_cap_ptr = (reg & 0xff00) >> 8;
-   cap_id = (reg & 0x00ff);
-
-   if (!next_cap_ptr || cap_id > PCI_CAP_ID_MAX)
-   return 0;
-
-   if (cap_id == cap)
-   return cap_ptr;
-
-   return __dw_pcie_ep_find_next_cap(pci, next_cap_ptr, cap);
-}
-
-static u8 dw_pcie_ep_find_capability(struct dw_pcie *pci, u8 cap)
-{
-   u8 next_cap_ptr;
-   u16 reg;
-
-   reg = dw_pcie_readw_dbi(pci, PCI_CAPABILITY_LIST);
-   next_cap_ptr = (reg & 0x00ff);
-
-   if (!next_cap_ptr)
-   return 0;
-
-   return __dw_pcie_ep_find_next_cap(pci, next_cap_ptr, cap);
-}
-
 static int dw_pcie_ep_write_header(struct pci_epc *epc, u8 func_no,
   struct pci_epf_header *hdr)
 {
@@ -591,9 +558,9 @@ int dw_pcie_ep_init(struct dw_pcie_ep *ep)
dev_err(dev, "Failed to reserve memory for MSI/MSI-X\n");
return -ENOMEM;
}
-   ep->msi_cap = dw_pcie_ep_find_capability(pci, PCI_CAP_ID_MSI);
+   ep->msi_cap = dw_pcie_find_capability(pci, PCI_CAP_ID_MSI);
 
-   ep->msix_cap = dw_pcie_ep_find_capability(pci, PCI_CAP_ID_MSIX);
+   ep->msix_cap = dw_pcie_find_capability(pci, PCI_CAP_ID_MSIX);
 
dw_pcie_setup(pci);
 
diff --git a/drivers/pci/controller/dwc/pcie-designware.c 
b/drivers/pci/controller/dwc/pcie-designware.c
index 31f6331ca46f..164a63b7688a 100644
--- a/drivers/pci/controller/dwc/pcie-designware.c
+++ b/drivers/pci/controller/dwc/pcie-designware.c
@@ -20,6 +20,79 @@
 #define PCIE_PHY_DEBUG_R1_LINK_UP  (0x1 << 4)
 #define PCIE_PHY_DEBUG_R1_LINK_IN_TRAINING (0x1 << 29)
 
+static u8 __dw_pcie_find_next_cap(struct dw_pcie *pci, u8 cap_ptr,
+ u8 cap)
+{
+   u8 cap_id, next_cap_ptr;
+   u16 reg;
+
+   reg = dw_pcie_readw_dbi(pci, cap_ptr);
+   next_cap_ptr = (reg & 0xff00) >> 8;
+   cap_id = (reg & 0x00ff);
+
+   if (!next_cap_ptr || cap_id > PCI_CAP_ID_MAX)
+   return 0;
+
+   if (cap_id == cap)
+   return cap_ptr;
+
+   return __dw_pcie_find_next_cap(pci, next_cap_ptr, cap);
+}
+
+u8 dw_pcie_find_capability(struct dw_pcie *pci, u8 cap)
+{
+   u8 next_cap_ptr;
+   u16 reg;
+
+   reg = dw_pcie_readw_dbi(pci, PCI_CAPABILITY_LIST);
+   next_cap_ptr = (reg & 0x00ff);
+
+   if (!next_cap_ptr)
+   return 0;
+
+   return __dw_pcie_find_next_cap(pci, next_cap_ptr, cap);
+}
+
+static int dw_pcie_find_next_ext_capability(struct dw_pcie *pci, int start,
+   int cap)
+{
+   u32 header;
+   int ttl;
+   int pos = PCI_CFG_SPACE_SIZE;
+
+   /* minimum 8 bytes per capability */
+   ttl = (PCI_CFG_SPACE_EXP_SIZE - PCI_CFG_SPACE_SIZE) / 8;
+
+   if (start)
+   pos = start;
+
+   header = dw_pcie_readl_dbi(pci, pos);
+   /*
+* If we have no capabilities, this is indicated by cap ID,
+* cap version and next pointer all being 0.
+*/
+   if (header == 0)
+   return 0;
+
+   while (ttl-- > 0) {
+   if (PCI_EXT_CAP_ID(header) == cap && pos != start)
+   return pos;
+
+   pos = PCI_EXT_CAP_NEXT(header);
+   if (pos < PCI_CFG_SPACE_SIZE)
+   break;
+
+   header = dw_pcie_readl_dbi(pci, pos);
+   }
+
+   return 0;
+}
+
+int dw_pcie_find_ext_capability(struct dw_pcie *pci, int cap)
+{
+   return dw_pcie_find_next_ext_capability(pci, 0, cap);
+}
+
 int dw_pcie_read(void __iomem *addr, int size, u32 *val)
 {
if (!IS_ALIGNED((uintptr_t)addr, size)) {
diff --git a/drivers/pci/controller/dwc/pcie-designware.h 
b/drivers/pci/controller/dwc/pcie-designware.h
index 70007276bc93..47996f433a57 100644
--- a/drivers/pci/controller/dwc/pcie-designware.h
+++ b/drivers/pci/controller/dwc/pcie-designware.h

[PATCH 01/10] PCI: save pci_bus pointer in pcie_port structure

2019-03-26 Thread Vidya Sagar
save pci_bus pointer created by PCIe sub-system's
pci_scan_root_bus_bridge() to be used by host controller drivers for post
processing. Tegra host controller driver needs it for the following
reasons
- to derive pci_host_bridge structure from pci_bus which is used to
configure iATU's outbound regions for different windows accesses
- to traverse and configure downstream hierarchy. One such case is,
configuring all immediate downstream devices to D0 state before transiting
link to L2 state. Saving pci_bus pointer seems the best method compared to
deriving it by other means.

Signed-off-by: Vidya Sagar 
---
 drivers/pci/controller/dwc/pcie-designware-host.c | 1 +
 drivers/pci/controller/dwc/pcie-designware.h  | 1 +
 2 files changed, 2 insertions(+)

diff --git a/drivers/pci/controller/dwc/pcie-designware-host.c 
b/drivers/pci/controller/dwc/pcie-designware-host.c
index 25087d3c9a82..15add3cf3945 100644
--- a/drivers/pci/controller/dwc/pcie-designware-host.c
+++ b/drivers/pci/controller/dwc/pcie-designware-host.c
@@ -494,6 +494,7 @@ int dw_pcie_host_init(struct pcie_port *pp)
goto error;
 
bus = bridge->bus;
+   pp->bus = bus;
 
if (pp->ops->scan_bus)
pp->ops->scan_bus(pp);
diff --git a/drivers/pci/controller/dwc/pcie-designware.h 
b/drivers/pci/controller/dwc/pcie-designware.h
index 377f4c0b52da..70007276bc93 100644
--- a/drivers/pci/controller/dwc/pcie-designware.h
+++ b/drivers/pci/controller/dwc/pcie-designware.h
@@ -175,6 +175,7 @@ struct pcie_port {
struct resource *busn;
int irq;
const struct dw_pcie_host_ops *ops;
+   struct pci_bus  *bus;
int msi_irq;
struct irq_domain   *irq_domain;
struct irq_domain   *msi_domain;
-- 
2.7.4



[PATCH 02/10] PCI: perform dbi regs write lock towards the end

2019-03-26 Thread Vidya Sagar
Remove multiple write enable and disable sequences of dbi registers as
Tegra194 implements writes to BAR-0 register (offset: 0x10) controlled by
DBI write-lock enable bit thereby not allowing any further writes to BAR-0
register in config space to take place. Hence disabling write permission
only towards the end.

Signed-off-by: Vidya Sagar 
---
 drivers/pci/controller/dwc/pcie-designware-host.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/pci/controller/dwc/pcie-designware-host.c 
b/drivers/pci/controller/dwc/pcie-designware-host.c
index 15add3cf3945..e17213f2217e 100644
--- a/drivers/pci/controller/dwc/pcie-designware-host.c
+++ b/drivers/pci/controller/dwc/pcie-designware-host.c
@@ -670,7 +670,6 @@ void dw_pcie_setup_rc(struct pcie_port *pp)
val &= 0x00ff;
val |= 0x0100;
dw_pcie_writel_dbi(pci, PCI_INTERRUPT_LINE, val);
-   dw_pcie_dbi_ro_wr_dis(pci);
 
/* Setup bus numbers */
val = dw_pcie_readl_dbi(pci, PCI_PRIMARY_BUS);
@@ -710,8 +709,6 @@ void dw_pcie_setup_rc(struct pcie_port *pp)
 
dw_pcie_wr_own_conf(pp, PCI_BASE_ADDRESS_0, 4, 0);
 
-   /* Enable write permission for the DBI read-only register */
-   dw_pcie_dbi_ro_wr_en(pci);
/* Program correct class for RC */
dw_pcie_wr_own_conf(pp, PCI_CLASS_DEVICE, 2, PCI_CLASS_BRIDGE_PCI);
/* Better disable write permission right after the update */
-- 
2.7.4



[PATCH 00/10] Add Tegra194 PCIe support

2019-03-26 Thread Vidya Sagar
Tegra194 has six PCIe controllers based on Synopsys DesignWare core.
There are two Universal PHY (UPHY) blocks with each supporting 12(HSIO:
High Speed IO) and 8(NVHS: NVIDIA High Speed) lanes respectively.
Controllers:0~4 use UPHY lanes from HSIO brick whereas Controller:5 uses
UPHY lanes from NVHS brick. Lane mapping in HSIO UPHY brick to each PCIe
controller (0~4) is controlled in XBAR module by BPMP-FW. Since PCIe
core has PIPE interface, a glue module called PIPE-to-UPHY (P2U) is used
to connect each UPHY lane (applicable to both HSIO and NVHS UPHY bricks)
to PCIe controller
This patch series
- Adds support for P2U PHY driver
- Adds support for PCIe host controller
- Adds device tree nodes for each PCIe controller
- Enables nodes applicable to p2972- platform
- Adds helper APIs in Designware core driver to get capability regs offset
- Adds defines for new feature registers of PCIe spec revision 4
- Makes changes in DesignWare core driver to get Tegra194 PCIe working

Testing done on P2972- platform
- Able to get PCIe link up with on-board Marvell eSATA controller
- Able to get PCIe link up with NVMe cards connected to M.2 Key-M slot
- Able to do data transfers with both SATA drives and NVMe cards

Note
- Enabling x8 slot on P2972- platform requires pinmux driver for Tegra194.
  It is being worked on currently and hence Controller:5 (i.e. x8 slot) is
  disabled in this patch series. A future patch series would enable this.
Vidya Sagar (10):
  PCI: save pci_bus pointer in pcie_port structure
  PCI: perform dbi regs write lock towards the end
  PCI: dwc: Move config space capability search API
  PCI: Add #defines for PCIe spec r4.0 features
  dt-bindings: PCI: tegra: Add device tree support for T194
  arm64: tegra: Add P2U and PCIe controller nodes to Tegra194 DT
  arm64: tegra: Enable PCIe slots in P2972- board
  phy: tegra: Add PCIe PIPE2UPHY support
  PCI: tegra: Add Tegra194 PCIe support
  arm64: Add Tegra194 PCIe driver to defconfig

 .../bindings/pci/nvidia,tegra194-pcie.txt  |  209 +++
 .../devicetree/bindings/phy/phy-tegra194-p2u.txt   |   34 +
 arch/arm64/boot/dts/nvidia/tegra194-p2888.dtsi |2 +-
 arch/arm64/boot/dts/nvidia/tegra194-p2972-.dts |   50 +
 arch/arm64/boot/dts/nvidia/tegra194.dtsi   |  473 +
 arch/arm64/configs/defconfig   |1 +
 drivers/pci/controller/dwc/Kconfig |   10 +
 drivers/pci/controller/dwc/Makefile|1 +
 drivers/pci/controller/dwc/pcie-designware-ep.c|   37 +-
 drivers/pci/controller/dwc/pcie-designware-host.c  |4 +-
 drivers/pci/controller/dwc/pcie-designware.c   |   73 +
 drivers/pci/controller/dwc/pcie-designware.h   |4 +
 drivers/pci/controller/dwc/pcie-tegra194.c | 1862 
 drivers/phy/tegra/Kconfig  |7 +
 drivers/phy/tegra/Makefile |1 +
 drivers/phy/tegra/pcie-p2u-tegra194.c  |  138 ++
 include/uapi/linux/pci_regs.h  |   22 +-
 17 files changed, 2888 insertions(+), 40 deletions(-)
 create mode 100644 
Documentation/devicetree/bindings/pci/nvidia,tegra194-pcie.txt
 create mode 100644 Documentation/devicetree/bindings/phy/phy-tegra194-p2u.txt
 create mode 100644 drivers/pci/controller/dwc/pcie-tegra194.c
 create mode 100644 drivers/phy/tegra/pcie-p2u-tegra194.c

-- 
2.7.4



Re: [PATCH 5/5] lib/vsprintf: Add %pfw conversion specifier for printing fwnode names

2019-03-26 Thread Petr Mladek
On Fri 2019-03-22 17:29:30, Sakari Ailus wrote:
> Add support for %pfw conversion specifier (with "f" and "P" modifiers) to
> support printing full path of the node, including its name ("f") and only
> the node's name ("P") in the printk family of functions. The two flags
> have equivalent functionality to existing %pOF with the same two modifiers
> ("f" and "P") on OF based systems. The ability to do the same on ACPI
> based systems is added by this patch.
> 
> On ACPI based systems the resulting strings look like
> 
>   \_SB.PCI0.CIO2.port@1.endpoint@0
> 
> where the nodes are separated by a dot (".") and the first three are
> ACPI device nodes and the latter two ACPI data nodes.
> 
> Depends-on: ("vsprintf: Remove support for %pF and %pf in favour of %pS and 
> %ps")

Reusing obsolete modifiers is dangerous for many reasons:

   + people might miss the change of the meaning
   + backporting mistakes
   + 3rd party modules

It might be acceptable if the long-term gain is bigger
than the short-term difficulties. But it would be better
to do it in a safe way when possible.

Fortunately, we could keep the backward compatibility
for "%pf" and handle only "%pfw*" with the fwnode api.

Best Regards,
Petr


Re: [PATCH v2] x86/syscalls: Mark expected switch fall-throughs

2019-03-26 Thread Oleg Nesterov
On 03/23, Thomas Gleixner wrote:
>
> On Thu, 28 Feb 2019, Gustavo A. R. Silva wrote:
> 
> >  arch/x86/include/asm/syscall.h | 28 
> >  1 file changed, 28 insertions(+)
> 
> Second thoughts. So this adds 28 /* fall through */ comments. Now I
> appreciate the effort, but can we pretty please look at the code in
> question and figure out whether the implementation makes sense in the first
> place before adding falltrough comments blindly?
> 
> The whole exercise can be simplified. Untested patch below.
> 
> Looking at that stuff makes me wonder about two things:
> 
>  1) The third argument of get/set(), i.e. the argument offset, is 0 on all
> call sites. Do we need it at all?

Probably "maxargs" can be removed too, Steven sent the patches a long time ago, see
https://lore.kernel.org/lkml/20161107212634.529267...@goodmis.org/

>  2) syscall_set_arguments() has been introduced in 2008 and we still have
> no caller. Instead of polishing it, can it be removed completely or are
> there plans to actually use it?

I think it can die.

> 
> Thanks,
> 
>   tglx
> 
> 8<
> 
> arch/x86/include/asm/syscall.h |  174 
> +++--
>  1 file changed, 64 insertions(+), 110 deletions(-)
> 
> --- a/arch/x86/include/asm/syscall.h
> +++ b/arch/x86/include/asm/syscall.h
> @@ -114,126 +114,80 @@ static inline int syscall_get_arch(void)
>  
>  #else /* CONFIG_X86_64 */
>  
> +static inline unsigned long syscall_get_argreg(struct pt_regs *regs,
> +unsigned int idx)
> +{
> + switch (idx) {
> + case  0: return regs->di;
> + case  1: return regs->si;
> + case  2: return regs->dx;
> + case  3: return regs->r10;
> + case  4: return regs->r8;
> + case  5: return regs->r9;
> +#ifdef CONFIG_IA32_EMULATION
> + case  6: return regs->bx;
> + case  7: return regs->cx;
> + case  8: return regs->dx;
> + case  9: return regs->si;
> + case 10: return regs->di;
> + case 11: return regs->bp;
> +#endif
> + }
> + return 0;
> +}
> +
>  static inline void syscall_get_arguments(struct task_struct *task,
>struct pt_regs *regs,
> -  unsigned int i, unsigned int n,
> +  unsigned int idx, unsigned int cnt,
>unsigned long *args)
>  {
> -# ifdef CONFIG_IA32_EMULATION
> - if (task->thread_info.status & TS_COMPAT)
> - switch (i) {
> - case 0:
> - if (!n--) break;
> - *args++ = regs->bx;
> - case 1:
> - if (!n--) break;
> - *args++ = regs->cx;
> - case 2:
> - if (!n--) break;
> - *args++ = regs->dx;
> - case 3:
> - if (!n--) break;
> - *args++ = regs->si;
> - case 4:
> - if (!n--) break;
> - *args++ = regs->di;
> - case 5:
> - if (!n--) break;
> - *args++ = regs->bp;
> - case 6:
> - if (!n--) break;
> - default:
> - BUG();
> - break;
> - }
> - else
> -# endif
> - switch (i) {
> - case 0:
> - if (!n--) break;
> - *args++ = regs->di;
> - case 1:
> - if (!n--) break;
> - *args++ = regs->si;
> - case 2:
> - if (!n--) break;
> - *args++ = regs->dx;
> - case 3:
> - if (!n--) break;
> - *args++ = regs->r10;
> - case 4:
> - if (!n--) break;
> - *args++ = regs->r8;
> - case 5:
> - if (!n--) break;
> - *args++ = regs->r9;
> - case 6:
> - if (!n--) break;
> - default:
> - BUG();
> - break;
> - }
> + if (WARN_ON((idx + cnt) > 6))
> + return;
> +
> + if (IS_ENABLED(CONFIG_IA32_EMULATION) &&
> + task->thread_info.status & TS_COMPAT)
> + idx += 6;
> +
> + for (; cnt > 0; cnt--)
> + *args++ = syscall_get_argreg(regs, idx++);
> +}
> +
> +static inline void syscall_set_argreg(struct pt_regs *regs,
> +   unsigned int idx,
> +   unsigned long val)
> +{
> + switch (idx) {
> + case  0: regs->di  = val; break;
> + case  1: regs->si  = val; break;
> + case  2: regs->dx  = val; break;
> + case  3: regs->r10 = val; break;
> + case  4: regs->r8  = val; break;
> + case  

Re: [PATCH v2 2/2] tty/serial: atmel: RS485 HD w/DMA: enable RX after TX is stopped

2019-03-26 Thread Richard Genoud
Le 19/03/2019 à 14:20, Razvan Stefanescu a écrit :
> In half-duplex operation, RX should be started after TX completes.
> 
> If DMA is used, there is a case when the DMA transfer completes but the
> TX FIFO is not emptied, so the RX cannot be restarted just yet.
> 
> Use a boolean variable to store this state and rearm TX interrupt mask
> to be signaled again that the transfer finished. In interrupt transmit
> handler this variable is used to start RX. A warning message is generated
> if RX is activated before TX fifo is cleared.
> 
> Fixes: b389f173aaa1 ("tty/serial: atmel: RS485 half duplex w/DMA: enable
> RX after TX is done")
> Signed-off-by: Razvan Stefanescu 
Acked-by: Richard Genoud 

NB: backport on kernel older than 4.20 will fail because of the iso7816
variables fidi_min/fidi_max.
> ---
> Changelog:
> v2:
>   - start RX and display warning in case of error
>   - add fix info
> 
>  drivers/tty/serial/atmel_serial.c | 25 ++---
>  1 file changed, 22 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/tty/serial/atmel_serial.c 
> b/drivers/tty/serial/atmel_serial.c
> index b4b89a16a41b..5b2f859c327c 100644
> --- a/drivers/tty/serial/atmel_serial.c
> +++ b/drivers/tty/serial/atmel_serial.c
> @@ -166,6 +166,8 @@ struct atmel_uart_port {
>   unsigned intpending_status;
>   spinlock_t  lock_suspended;
>  
> + boolhd_start_rx;/* can start RX during 
> half-duplex operation */
> +
>   /* ISO7816 */
>   unsigned intfidi_min;
>   unsigned intfidi_max;
> @@ -933,8 +935,13 @@ static void atmel_complete_tx_dma(void *arg)
>   if (!uart_circ_empty(xmit))
>   atmel_tasklet_schedule(atmel_port, _port->tasklet_tx);
>   else if (atmel_uart_is_half_duplex(port)) {
> - /* DMA done, stop TX, start RX for RS485 */
> - atmel_start_rx(port);
> + /*
> +  * DMA done, re-enable TXEMPTY and signal that we can stop
> +  * TX and start RX for RS485
> +  */
> + atmel_port->hd_start_rx = true;
> + atmel_uart_writel(port, ATMEL_US_IER,
> +   atmel_port->tx_done_mask);
>   }
>  
>   spin_unlock_irqrestore(>lock, flags);
> @@ -1378,9 +1385,20 @@ atmel_handle_transmit(struct uart_port *port, unsigned 
> int pending)
>   struct atmel_uart_port *atmel_port = to_atmel_uart_port(port);
>  
>   if (pending & atmel_port->tx_done_mask) {
> - /* Either PDC or interrupt transmission */
>   atmel_uart_writel(port, ATMEL_US_IDR,
> atmel_port->tx_done_mask);
> +
> + /* Start RX if flag was set and FIFO is empty */
> + if (atmel_port->hd_start_rx) {
> + if (!(atmel_uart_readl(port, ATMEL_US_CSR)
> + & ATMEL_US_TXEMPTY))
> + dev_warn(port->dev, "Should start RX, but TX 
> fifo is not empty\n");
> +
> + atmel_port->hd_start_rx = false;
> + atmel_start_rx(port);
> + return;
> + }
> +
>   atmel_tasklet_schedule(atmel_port, _port->tasklet_tx);
>   }
>  }
> 



Re: [PATCH ghak109 V1] audit: link integrity evm_write_xattrs record to syscall event

2019-03-26 Thread Mimi Zohar
On Wed, 2019-03-20 at 20:50 -0400, Richard Guy Briggs wrote:
> On 2019-03-20 19:48, Paul Moore wrote:
> > On Sat, Mar 16, 2019 at 8:10 AM Richard Guy Briggs  wrote:
> > > In commit fa516b66a1bf ("EVM: Allow runtime modification of the set of
> > > verified xattrs"), the call to audit_log_start() is missing a context to
> > > link it to an audit event. Since this event is in user context, add
> > > the process' syscall context to the record.
> > >
> > > In addition, the orphaned keyword "locked" appears in the record.
> > > Normalize this by changing it to "xattr=(locked)".
> > >
> > > Please see the github issue
> > > https://github.com/linux-audit/audit-kernel/issues/109
> > >
> > > Signed-off-by: Richard Guy Briggs 
> > > ---
> > >  security/integrity/evm/evm_secfs.c | 5 +++--
> > >  1 file changed, 3 insertions(+), 2 deletions(-)
> > >
> > > diff --git a/security/integrity/evm/evm_secfs.c 
> > > b/security/integrity/evm/evm_secfs.c
> > > index 015aea8fdf1e..4171d174e9da 100644
> > > --- a/security/integrity/evm/evm_secfs.c
> > > +++ b/security/integrity/evm/evm_secfs.c
> > > @@ -192,7 +192,8 @@ static ssize_t evm_write_xattrs(struct file *file, 
> > > const char __user *buf,
> > > if (count > XATTR_NAME_MAX)
> > > return -E2BIG;
> > >
> > > -   ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_INTEGRITY_EVM_XATTR);
> > > +   ab = audit_log_start(audit_context(), GFP_KERNEL,
> > > +AUDIT_INTEGRITY_EVM_XATTR);
> > 
> > This part is fine.
> > 
> > > if (!ab)
> > > return -ENOMEM;
> > >
> > > @@ -222,7 +223,7 @@ static ssize_t evm_write_xattrs(struct file *file, 
> > > const char __user *buf,
> > > inode_lock(inode);
> > > > err = simple_setattr(evm_xattrs, &newattrs);
> > > inode_unlock(inode);
> > > -   audit_log_format(ab, "locked");
> > > +   audit_log_format(ab, "xattr=(locked)");
> > 
> > Two things come to mind:
> > 
> > * While we can clearly trust the string above, should we be logging
> > the xattr field value as an untrusted string so it is consistent with
> > how we record other xattr names?
> 
> That would be a question for Steve.
> 
> > * I'm not sure you can ever have parens in a xattr (I would hope not),
> > but if we are going to use the xattr field, perhaps we should simply
> > stick with the name as provided (".") so we don't ever run afoul of
> > xattr names?  I'm curious to hear what the IMA/EVM folks think of
> > this.
> 
> The legal xattr names start with XATTR_SECURITY_PREFIX which is
> "security." so there is no danger of collision with legal names, but I
> suppose someone could try to use "(locked)" as a name which would look
> identical but fail with a different res= number.  I think I prefer your
> idea of printing the given value verbatim.

I really don't have a preference - "locked", "(locked)", "." or "(.)".
 Any of them is fine.

Thanks!

Mimi



Re: [RFC PATCH 1/2] dt-bindings: spi: Add device tree binding documentation for Zynq QSPI controller

2019-03-26 Thread Rob Herring
On Thu, 28 Feb 2019 12:31:53 +0530, Naga Sureshkumar Relli wrote:
> This patch adds the dts binding document for Zynq SOC QSPI
> controller.
> 
> Signed-off-by: Naga Sureshkumar Relli 
> ---
>  .../devicetree/bindings/spi/spi-zynq-qspi.txt  | 25 
> ++
>  1 file changed, 25 insertions(+)
>  create mode 100644 Documentation/devicetree/bindings/spi/spi-zynq-qspi.txt
> 

Reviewed-by: Rob Herring 


Re: [PATCH -next] x86/apic: Reduce print level of CPU limit announcement

2019-03-26 Thread Rafael J. Wysocki
On Tue, Mar 26, 2019 at 3:41 PM Leon Romanovsky  wrote:
>
> On Tue, Mar 26, 2019 at 01:29:54PM +0100, Rafael J. Wysocki wrote:
> > On Tue, Mar 26, 2019 at 1:02 PM Leon Romanovsky  wrote:
> > >
> > > From: Leon Romanovsky 
> > >
> > > A kernel booted with fewer possible CPUs (possible_cpus kernel boot
> > > option) than available CPUs will print messages like this:
> > >
> > > [1.131039] APIC: NR_CPUS/possible_cpus limit of 8 reached. Processor 
> > > 55/0x1f ignored.
> > > [1.132228] ACPI: Unable to map lapic to logical cpu number
> > >
> > > Those warnings are printed for every not-enabled CPU and on the systems
> > > with large number of such CPUs, we see a lot of those prints for default
> > > print level.
> > >
> > > Simple conversion of those prints to be in debug level removes them
> > > while leaving the option to debug system.
> >
> > But generally dynamic debug must be enabled in order for pr_debug()
> > prints to be visible which is kind of cumbersome to do via the command
> > line.
>
> It is doable and documented pretty well, which is uncommon :)
> https://www.kernel.org/doc/html/latest/admin-guide/dynamic-debug-howto.html#debug-messages-during-boot-process

I know.

That's what I mean by "kind of cumbersome", because you need to know
which debug messages to enable upfront.

> >
> > > Signed-off-by: Leon Romanovsky 
> > > ---
> > >  arch/x86/kernel/acpi/boot.c | 2 +-
> > >  arch/x86/kernel/apic/apic.c | 6 +++---
> > >  2 files changed, 4 insertions(+), 4 deletions(-)
> > >
> > > diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
> > > index 8dcbf6890714..3ef8ab89c02d 100644
> > > --- a/arch/x86/kernel/acpi/boot.c
> > > +++ b/arch/x86/kernel/acpi/boot.c
> > > @@ -770,7 +770,7 @@ int acpi_map_cpu(acpi_handle handle, phys_cpuid_t 
> > > physid, u32 acpi_id,
> > >
> > > cpu = acpi_register_lapic(physid, acpi_id, ACPI_MADT_ENABLED);
> > > if (cpu < 0) {
> > > -   pr_info(PREFIX "Unable to map lapic to logical cpu 
> > > number\n");
> > > +   pr_debug(PREFIX "Unable to map lapic to logical cpu 
> > > number\n");
> >
> > And this one is printed sometimes when something really goes wrong
> > which may be really hard to debug otherwise, so there is value in the
> > info level here.
> >
> > Would it be possible to avoid printing it just in some cases?
>
> This can do the trick:
>
> diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
> index 3ef8ab89c02d..00212b3991e0 100644
> --- a/arch/x86/kernel/acpi/boot.c
> +++ b/arch/x86/kernel/acpi/boot.c
> @@ -770,7 +770,10 @@ int acpi_map_cpu(acpi_handle handle, phys_cpuid_t 
> physid, u32 acpi_id,
>
> cpu = acpi_register_lapic(physid, acpi_id, ACPI_MADT_ENABLED);
> if (cpu < 0) {
> -   pr_debug(PREFIX "Unable to map lapic to logical cpu 
> number\n");
> +   if (cpu == -ENOENT)
> +   pr_debug(PREFIX "Unable to map lapic to logical cpu 
> number\n");

I don't think it is necessary to print this in the -ENOENT case, as
there is a message for that case that will be printed anyway.

> +   else
> +   pr_info(PREFIX "Unable to map lapic to logical cpu 
> number\n");
> return cpu;
> }
>


Re: tools bugs: make clean deletes files in the git tree

2019-03-26 Thread Joe Lawrence

On 3/26/19 4:45 AM, Adrian Hunter wrote:

Hi

Doing:

make -C tools clean

Results in:

git diff --stat
 tools/pci/pcitest.sh  |  72 

 tools/testing/selftests/livepatch/test-callbacks.sh   | 587 

 tools/testing/selftests/livepatch/test-livepatch.sh   | 168 

 tools/testing/selftests/livepatch/test-shadow-vars.sh |  60 
-
 4 files changed, 887 deletions(-)

i.e. 'make clean' seems to be deleting files that are in the git tree.

Regards
Adrian



Hi Adrian -- thanks for the report.  I will fixup the livepatch 
selftests Makefile and post a patch shortly.


-- Joe


Re: [PATCH][next] RDMA/nes: remove redundant check on udata

2019-03-26 Thread Jason Gunthorpe
On Sat, Mar 02, 2019 at 11:06:36PM +, Colin King wrote:
> From: Colin Ian King 
> 
> The non-null check on udata is redundant as this check was performed
> just a few statements earlier and the check is always true as udata
> must be non-null at this point. Remove the redundant check on udata
> and the redundant else part that can never be executed.
> 
> Detected by CoverityScan, CID#1477317 ("Logically dead code")
> 
> Signed-off-by: Colin Ian King 
> ---
>  drivers/infiniband/hw/nes/nes_verbs.c | 73 +--
>  1 file changed, 34 insertions(+), 39 deletions(-)

Applied to for-next

Thanks,
Jason


Re: [PATCH v2 1/2] tty/serial: atmel: Add is_half_duplex helper

2019-03-26 Thread Richard Genoud
Le 19/03/2019 à 14:20, Razvan Stefanescu a écrit :
> Use a helper function to check that a port needs to use half duplex
> communication, replacing several occurrences of multi-line bit checking.
> 
> Fixes: b389f173aaa1 ("tty/serial: atmel: RS485 half duplex w/DMA: enable
> RX after TX is done")
> Signed-off-by: Razvan Stefanescu 
Acked-by: Richard Genoud 

NB: backport on kernel older than 4.20 will fail because of the
SER_ISO7816_ENABLED flag.
> ---
> Changelog:
> v2: 
>   - remove extra check
>   - add fix info
> 
>  drivers/tty/serial/atmel_serial.c | 24 
>  1 file changed, 12 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/tty/serial/atmel_serial.c 
> b/drivers/tty/serial/atmel_serial.c
> index 05147fe24343..b4b89a16a41b 100644
> --- a/drivers/tty/serial/atmel_serial.c
> +++ b/drivers/tty/serial/atmel_serial.c
> @@ -231,6 +231,13 @@ static inline void atmel_uart_write_char(struct 
> uart_port *port, u8 value)
>   __raw_writeb(value, port->membase + ATMEL_US_THR);
>  }
>  
> +static inline int atmel_uart_is_half_duplex(struct uart_port *port)
> +{
> + return ((port->rs485.flags & SER_RS485_ENABLED) &&
> + !(port->rs485.flags & SER_RS485_RX_DURING_TX)) ||
> + (port->iso7816.flags & SER_ISO7816_ENABLED);
> +}
> +
>  #ifdef CONFIG_SERIAL_ATMEL_PDC
>  static bool atmel_use_pdc_rx(struct uart_port *port)
>  {
> @@ -608,10 +615,9 @@ static void atmel_stop_tx(struct uart_port *port)
>   /* Disable interrupts */
>   atmel_uart_writel(port, ATMEL_US_IDR, atmel_port->tx_done_mask);
>  
> - if (((port->rs485.flags & SER_RS485_ENABLED) &&
> -  !(port->rs485.flags & SER_RS485_RX_DURING_TX)) ||
> - port->iso7816.flags & SER_ISO7816_ENABLED)
> + if (atmel_uart_is_half_duplex(port))
>   atmel_start_rx(port);
> +
>  }
>  
>  /*
> @@ -628,9 +634,7 @@ static void atmel_start_tx(struct uart_port *port)
>   return;
>  
>   if (atmel_use_pdc_tx(port) || atmel_use_dma_tx(port))
> - if (((port->rs485.flags & SER_RS485_ENABLED) &&
> -  !(port->rs485.flags & SER_RS485_RX_DURING_TX)) ||
> - port->iso7816.flags & SER_ISO7816_ENABLED)
> + if (atmel_uart_is_half_duplex(port))
>   atmel_stop_rx(port);
>  
>   if (atmel_use_pdc_tx(port))
> @@ -928,9 +932,7 @@ static void atmel_complete_tx_dma(void *arg)
>*/
>   if (!uart_circ_empty(xmit))
>   atmel_tasklet_schedule(atmel_port, &atmel_port->tasklet_tx);
> - else if (((port->rs485.flags & SER_RS485_ENABLED) &&
> -   !(port->rs485.flags & SER_RS485_RX_DURING_TX)) ||
> -  port->iso7816.flags & SER_ISO7816_ENABLED) {
> + else if (atmel_uart_is_half_duplex(port)) {
>   /* DMA done, stop TX, start RX for RS485 */
>   atmel_start_rx(port);
>   }
> @@ -1508,9 +1510,7 @@ static void atmel_tx_pdc(struct uart_port *port)
>   atmel_uart_writel(port, ATMEL_US_IER,
> atmel_port->tx_done_mask);
>   } else {
> - if (((port->rs485.flags & SER_RS485_ENABLED) &&
> -  !(port->rs485.flags & SER_RS485_RX_DURING_TX)) ||
> - port->iso7816.flags & SER_ISO7816_ENABLED) {
> + if (atmel_uart_is_half_duplex(port)) {
>   /* DMA done, stop TX, start RX for RS485 */
>   atmel_start_rx(port);
>   }
> 



[PATCH v2] mfd: Add support for Merrifield Basin Cove PMIC

2019-03-26 Thread Andy Shevchenko
Add an mfd driver for Intel Merrifield Basin Cove PMIC.

Signed-off-by: Andy Shevchenko 
---
- corrected name of Power Source detection driver
 drivers/mfd/Kconfig  |  11 ++
 drivers/mfd/Makefile |   1 +
 drivers/mfd/intel_soc_pmic_mrfld.c   | 157 +++
 include/linux/mfd/intel_soc_pmic_mrfld.h |  81 
 4 files changed, 250 insertions(+)
 create mode 100644 drivers/mfd/intel_soc_pmic_mrfld.c
 create mode 100644 include/linux/mfd/intel_soc_pmic_mrfld.h

diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index 0ce2d8dfc5f1..2adf9d393029 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -572,6 +572,17 @@ config INTEL_SOC_PMIC_CHTDC_TI
  Select this option for supporting Dollar Cove (TI version) PMIC
  device that is found on some Intel Cherry Trail systems.
 
+config INTEL_SOC_PMIC_MRFLD
+   tristate "Support for Intel Merrifield Basin Cove PMIC"
+   depends on GPIOLIB
+   depends on ACPI
+   depends on INTEL_SCU_IPC
+   select MFD_CORE
+   select REGMAP_IRQ
+   help
+ Select this option for supporting Basin Cove PMIC device
+ that is found on Intel Merrifield systems.
+
 config MFD_INTEL_LPSS
tristate
select COMMON_CLK
diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile
index b4569ed7f3f3..1b746bd01ac5 100644
--- a/drivers/mfd/Makefile
+++ b/drivers/mfd/Makefile
@@ -234,6 +234,7 @@ obj-$(CONFIG_INTEL_SOC_PMIC)+= intel-soc-pmic.o
 obj-$(CONFIG_INTEL_SOC_PMIC_BXTWC) += intel_soc_pmic_bxtwc.o
 obj-$(CONFIG_INTEL_SOC_PMIC_CHTWC) += intel_soc_pmic_chtwc.o
 obj-$(CONFIG_INTEL_SOC_PMIC_CHTDC_TI)  += intel_soc_pmic_chtdc_ti.o
+obj-$(CONFIG_INTEL_SOC_PMIC_MRFLD) += intel_soc_pmic_mrfld.o
 obj-$(CONFIG_MFD_MT6397)   += mt6397-core.o
 
 obj-$(CONFIG_MFD_ALTERA_A10SR) += altera-a10sr.o
diff --git a/drivers/mfd/intel_soc_pmic_mrfld.c 
b/drivers/mfd/intel_soc_pmic_mrfld.c
new file mode 100644
index ..1f21a51f6e26
--- /dev/null
+++ b/drivers/mfd/intel_soc_pmic_mrfld.c
@@ -0,0 +1,157 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Device access for Basin Cove PMIC
+ *
+ * Copyright (c) 2018, Intel Corporation.
+ * Author: Andy Shevchenko 
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+/*
+ * Level 2 IRQs
+ *
+ * Firmware on the systems with Basin Cove PMIC services Level 1 IRQs
+ * without an assistance. Thus, each of the Level 1 IRQ is represented
+ * as a separate RTE in IOAPIC.
+ */
+static struct resource irq_level2_resources[] = {
+   DEFINE_RES_IRQ(0), /* power button */
+   DEFINE_RES_IRQ(0), /* TMU */
+   DEFINE_RES_IRQ(0), /* thermal */
+   DEFINE_RES_IRQ(0), /* BCU */
+   DEFINE_RES_IRQ(0), /* ADC */
+   DEFINE_RES_IRQ(0), /* charger */
+   DEFINE_RES_IRQ(0), /* GPIO */
+};
+
+static const struct mfd_cell bcove_dev[] = {
+   {
+   .name = "mrfld_bcove_pwrbtn",
+   .num_resources = 1,
+   .resources = &irq_level2_resources[0],
+   }, {
+   .name = "mrfld_bcove_tmu",
+   .num_resources = 1,
+   .resources = &irq_level2_resources[1],
+   }, {
+   .name = "mrfld_bcove_thermal",
+   .num_resources = 1,
+   .resources = &irq_level2_resources[2],
+   }, {
+   .name = "mrfld_bcove_bcu",
+   .num_resources = 1,
+   .resources = &irq_level2_resources[3],
+   }, {
+   .name = "mrfld_bcove_adc",
+   .num_resources = 1,
+   .resources = &irq_level2_resources[4],
+   }, {
+   .name = "mrfld_bcove_charger",
+   .num_resources = 1,
+   .resources = &irq_level2_resources[5],
+   }, {
+   .name = "mrfld_bcove_pwrsrc",
+   .num_resources = 1,
+   .resources = &irq_level2_resources[5],
+   }, {
+   .name = "mrfld_bcove_gpio",
+   .num_resources = 1,
+   .resources = &irq_level2_resources[6],
+   },
+   {   .name = "mrfld_bcove_region", },
+};
+
+static int regmap_ipc_byte_reg_read(void *context, unsigned int reg,
+   unsigned int *val)
+{
+   u8 ipc_out;
+   int ret;
+
+   ret = intel_scu_ipc_ioread8(reg, &ipc_out);
+   if (ret)
+   return ret;
+
+   *val = ipc_out;
+   return 0;
+}
+
+static int regmap_ipc_byte_reg_write(void *context, unsigned int reg,
+unsigned int val)
+{
+   u8 ipc_in = val;
+   int ret;
+
+   ret = intel_scu_ipc_iowrite8(reg, ipc_in);
+   if (ret)
+   return ret;
+
+   return 0;
+}
+
+static const struct regmap_config bcove_regmap_config = {
+   .reg_bits = 16,
+   .val_bits = 8,
+   .max_register = 0xff,
+   .reg_write = regmap_ipc_byte_reg_write,
+   .reg_read = 

Re: [PATCH] KVM: x86: nVMX: allow RSM to restore VMXE CR4 flag

2019-03-26 Thread Liran Alon



> On 26 Mar 2019, at 15:48, Vitaly Kuznetsov  wrote:
> 
> Liran Alon  writes:
> 
>>> On 26 Mar 2019, at 15:07, Vitaly Kuznetsov  wrote:
>>> - Instead of putting the temporary HF_SMM_MASK drop to
>>> rsm_enter_protected_mode() (as was suggested by Liran), move it to
>>> emulator_set_cr() modifying its interface. emulate.c seems to be
>>> vcpu-specifics-free at this moment, we may want to keep it this way.
>>> - It seems that Hyper-V+UEFI on KVM is still broken, I'm observing sporadic
>>> hangs even with this patch. These hangs, however, seem to be unrelated to
>>> rsm.
>> 
>> Feel free to share details on these hangs ;)
>> 
> 
> You've asked for it)
> 
> The immediate issue I'm observing is some sort of a lockup which is easy
> to trigger with e.g. "-usb -device usb-tablet" on Qemu command line; it
> seems we get too many interrupts and combined with preemption timer for
> L2 we're not making any progress:
> 
> kvm_userspace_exit:   reason KVM_EXIT_IOAPIC_EOI (26)
> kvm_set_irq:  gsi 18 level 1 source 0
> kvm_msi_set_irq:  dst 0 vec 177 (Fixed|physical|level)
> kvm_apic_accept_irq:  apicid 0 vec 177 (Fixed|edge)
> kvm_fpu:  load
> kvm_entry:vcpu 0
> kvm_exit: reason VMRESUME rip 0xf8848115 info 0 0
> kvm_entry:vcpu 0
> kvm_exit: reason PREEMPTION_TIMER rip 0xf800f4448e01 info 0 0
> kvm_nested_vmexit:rip f800f4448e01 reason PREEMPTION_TIMER info1 0 
> info2 0 int_info 0 int_info_err 0
> kvm_nested_vmexit_inject: reason EXTERNAL_INTERRUPT info1 0 info2 0 int_info 
> 80b1 int_info_err 0
> kvm_entry:vcpu 0
> kvm_exit: reason APIC_ACCESS rip 0xf881fe11 info 10b0 0
> kvm_apic: apic_write APIC_EOI = 0x0
> kvm_eoi:  apicid 0 vector 177
> kvm_fpu:  unload
> kvm_userspace_exit:   reason KVM_EXIT_IOAPIC_EOI (26)
> ...
> (and the pattern repeats)
> 
> Maybe it is a usb-only/Qemu-only problem, maybe not.
> 
> -- 
> Vitaly

The trace of kvm_apic_accept_irq should indicate that __apic_accept_irq() was 
called to inject an interrupt to L1 guest.
(I know that now we are running in L1 because next exit is a VMRESUME).

However, it is surprising to see that on next entry to guest, no interrupt was 
injected by vmx_inject_irq().
It may be because L1 guest is currently running with interrupt disabled and 
therefore only an IRQ-window was requested.
(Too bad we don’t have a trace for this…)

Next, we got an exit from L1 guest on VMRESUME. As part of its handling, 
active VMCS was changed from vmcs01 to vmcs02.
I believe the immediate exit later on preemption-timer was because the 
immediate-exit-request mechanism was invoked
which is now implemented by setting a VMX preemption-timer with value of 0 
(Thanks to Sean).
(See vmx_vcpu_run() -> vmx_update_hv_timer() -> vmx_arm_hv_timer(vmx, 0)).
(Note that the pending interrupt was evaluated because of a recent patch of 
mine to nested_vmx_enter_non_root_mode()
to request KVM_REQ_EVENT when vmcs01 have requested an IRQ-window)

Therefore when entering L2, you immediately get an exit on PREEMPTION_TIMER 
which will cause eventually L0 to call
vmx_check_nested_events() which notices now the pending interrupt that should 
have been injected before to L1
and now exit from L2 to L1 on EXTERNAL_INTERRUPT on vector 0xb1.

Then L1 handles the interrupt by performing an EOI to LAPIC which propagates an 
EOI to IOAPIC which immediately re-inject
the interrupt (after clearing the remote_irr) as the irq-line is still set. 
i.e. QEMU’s ioapic_eoi_broadcast() calls ioapic_service() immediately after it 
clears remote-irr for this pin.

Also note that in trace we see only a single kvm_set_irq to level 1 but we 
don’t see immediately another kvm_set_irq to level 0.
This should indicate that in QEMU’s IOAPIC redirection-table, this pin is 
configured as level-triggered interrupt.
However, the trace of kvm_apic_accept_irq indicates that this interrupt is 
raised as an edge-triggered interrupt.

To sum up:
1) I would create a patch to add a trace to vcpu_enter_guest() when calling 
enable_smi_window() / enable_nmi_window() / enable_irq_window().
2) It is worth investigating why MSI trigger-mode is edge-triggered instead of 
level-triggered.
3) If this is indeed a level-triggered interrupt, it is worth investigating how 
the interrupt source behaves. i.e. What causes this device to lower the irq-line?
(As we don’t see any I/O Port or MMIO access by L1 guest interrupt-handler 
before performing the EOI)
4) Does this issue reproduce also when running with kernel-irqchip? (Instead of 
split-irqchip)

-Liran






New feature/ABI review process [was Re: [RESEND PATCH v6 04/12] x86/fsgsbase/64:..]

2019-03-26 Thread Thomas Gleixner
Andi,

On Mon, 25 Mar 2019, Andi Kleen wrote:

> >So on user space to kernel space transitions swapping in kernel GS should
> >simply do:
> >  userGS = RDGSBASE()
> >  WRGSBASE(kernelGS)
> 
> This would also need to find kernelGS first, by doing RDPID and then
> reading it  from memory in the right index
> (which might be a full cache miss if you're unlucky)

I'm well aware of that.

> SWAPGS will be a lot faster, especially in these rare worst cases
> because it has all its state inside the CPU.

The well known 'will/might/could/should' word weaseling is not solving
anything.

If you want to advocate the more complex design of mixed SWAPGS/FSGSBASE
then provide numbers and not hand-waving. Numbers of real-world workloads,
not numbers of artificial test cases which exercise the rare worst case.

Yes, it's extra work and it's well spent. If the numbers are not
significantly different then the simpler and consistent design is a clear
win.

According to the changelog on which I reacted you seem to have investigated
that already. Let me cite it again:

  > Accessing user GSBASE needs a couple of SWAPGS operations. It is
  > avoidable if the user GSBASE is saved at kernel entry, being updated as
  > changes, and restored back at kernel exit. However, it seems to spend
  > more cycles for savings and restorations. Little or no benefit was
  > measured from experiments.

So little or no benefit was measured. I don't see how that maps to your
'SWAPGS will be a lot faster' claim. One of those claims is obviously
wrong.

Aside from that, this needs more than numbers:

  1) Proper documentation how the mixed bag is managed.

  2) Extensive comments explaining the subtle inner workings and caveats.

  3) Proper changelogs.

You have a track record of not caring much about either of these, but I
very much care for good reasons. I've been bitten by glued on and half
baked patches from Intel in the past 10 years so many times, that I'm
simply refusing to take anything which is not properly structured and
documented.

Especially not when it is touching sensitive areas like this and also has
an impact on the user space ABI.

> BTW you managed to only review after Chang went on a long vacation.

I'm terribly sorry about that. I'll try to adjust my own schedules and
workflow to Intel employees vacation plans in the future.

> 
> I don't understand why it takes that long to review these changes
> It's one of the largest performance improvements for the context
> switch and the NMI in many years plus gives a new free register
> to user space, but it only makes progress at a glacial pace.
> The original patches for this were posted in 2016.
> 

Care to look at the real history of this:

  11/2015: First patch-set posted by you, which was rejected on technical 
grounds

So this so important feature was in limbo for 20 months until Luto picked it
up again. That's surely the fault of the x86 maintainers, right?

  07/2017: Discussion about ABI considerations initiated by Andy Lutomirski.

And it takes another 8 months until patches come around:

  03/19/2018: V1 from Chang. Reviewed within days

2 month gap caused by Intel:

  05/31/2018: V2 Request from Andy to split the set

  06/04/2018: Base-V1 The first chunk of changes.

  06/06/2018: Base-V2 Slight modifications

  06/07/2018: Base-V3 Slight modifications. Review on 08/18

  06/20/2018: Base-V4 Review on 06/22

  06/27/2018: Base-V5

2 month gap caused by busy maintainers. You know what they were busy with
at that time, right? Chasing subtle bugs in the so perfect L1TF patches
which were thrown over the fence by you and dealing with the Intel induced
speculation crap to have a consistent and maintainable mitigation including
proper documentation.

  08/23/2018: Base-V5 Resend. Review on 9/14

  09/18/2018: Base-V6. Merged 10/08

  10/23/2018: Full-V3. Review immediate

  10/24/2018: Regression detected caused by Base-V6

The so perfect base patch set caused a regression and it takes more than a
month to fix it properly:

  10/30/2018: Fix-V1. Broken
  10/31/2018: Fix-V2. Broken
  11/01/2018: Fix-V3. Broken
  11/14/2018: Fix-V4. Broken
  11/15/2018: Fix-V5. Broken
  11/26/2018: Fix-V6. Finally

2 months to address the Full-V3 feedback:

  01/16/2019: Full-V4. Change request

  02/01/2019: Full-V5. Review immediate

  02/13/2019: Full-V6.

1 month gap caused by busy maintainers. Ash on my main...

  03/15/2019: Full-V6 resend

So just to put this straight:

 Out of 40 months since the first post in 11/2015:

   20 months nothing happened from Intel side
8 months consumed to produce the next set
1 month  to fix a regression
2 months consumed to react on review feedback
  --
   31 months

 versus:

   2 months maintainers dealing with Intel crap
   1 month  maintainers being busy

 The rest is the usual review/re-post ping pong delay which sums up, but
 from the larger gaps more than 75% are Intel induced and 7% 

Re: [PATCH] phy: qcom: qmp: Add SDM845 PCIe QMP PHY support

2019-03-26 Thread Rob Herring
On Mon, Feb 25, 2019 at 10:59:19PM -0800, Bjorn Andersson wrote:
> qcom_qmp_phy_init() is extended to support the additional register
> writes needed in PCS MISC and the appropriate sequences and resources
> are defined for SDM845.
> 
> Signed-off-by: Bjorn Andersson 
> ---
>  .../devicetree/bindings/phy/qcom-qmp-phy.txt  |   7 +

Please split bindings.

>  drivers/phy/qualcomm/phy-qcom-qmp.c   | 160 ++
>  drivers/phy/qualcomm/phy-qcom-qmp.h   |  12 ++
>  3 files changed, 179 insertions(+)
> 
> diff --git a/Documentation/devicetree/bindings/phy/qcom-qmp-phy.txt 
> b/Documentation/devicetree/bindings/phy/qcom-qmp-phy.txt
> index 5d181fc3cc18..dd2725a9d3f7 100644
> --- a/Documentation/devicetree/bindings/phy/qcom-qmp-phy.txt
> +++ b/Documentation/devicetree/bindings/phy/qcom-qmp-phy.txt
> @@ -11,6 +11,7 @@ Required properties:
>  "qcom,msm8996-qmp-usb3-phy" for 14nm USB3 phy on msm8996,
>  "qcom,msm8998-qmp-usb3-phy" for USB3 QMP V3 phy on msm8998,
>  "qcom,msm8998-qmp-ufs-phy" for UFS QMP phy on msm8998,
> +"qcom,sdm845-qmp-pcie-phy" for PCIe phy on sdm845,
>  "qcom,sdm845-qmp-usb3-phy" for USB3 QMP V3 phy on sdm845,
>  "qcom,sdm845-qmp-usb3-uni-phy" for USB3 QMP V3 UNI phy on sdm845,
>  "qcom,sdm845-qmp-ufs-phy" for UFS QMP phy on sdm845.
> @@ -48,6 +49,10 @@ Required properties:
>   "aux", "cfg_ahb", "ref".
>   For "qcom,msm8998-qmp-ufs-phy" must contain:
>   "ref", "ref_aux".
> + For "qcom,sdm845-qmp-usb3-phy" must contain:
> + "aux", "cfg_ahb", "ref", "refgen".
> + For "qcom,sdm845-qmp-usb3-phy" must contain:
> + "aux", "cfg_ahb", "ref", "com_aux".

Copy-n-paste error?

>   For "qcom,sdm845-qmp-usb3-phy" must contain:
>   "aux", "cfg_ahb", "ref", "com_aux".
>   For "qcom,sdm845-qmp-usb3-uni-phy" must contain:
> @@ -70,6 +75,8 @@ Required properties:
>   For "qcom,msm8998-qmp-usb3-phy" must contain
>   "phy", "common".
>   For "qcom,msm8998-qmp-ufs-phy": no resets are listed.
> + For "qcom,sdm845-qmp-pcie-phy" must contain:
> + "phy".
>   For "qcom,sdm845-qmp-usb3-phy" must contain:
>   "phy", "common".
>   For "qcom,sdm845-qmp-usb3-uni-phy" must contain:


<    1   2   3   4   5   6   7   8   9   10   >