On 3/22/2021 2:06 AM, Like Xu wrote:
If the kernel is compiled with the CONFIG_LOCKDEP option, the conditional
might_sleep_if() deep in kmem_cache_alloc() will generate the following
trace, and potentially cause a deadlock when another LBR event is added:

[  243.115549] BUG: sleeping function called from invalid context at include/linux/sched/mm.h:196
[  243.117576] in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 839, name: perf
[  243.119326] INFO: lockdep is turned off.
[  243.120249] irq event stamp: 0
[  243.120967] hardirqs last  enabled at (0): [<0000000000000000>] 0x0
[  243.122415] hardirqs last disabled at (0): [<ffffffff810d9bf5>] copy_process+0xa45/0x1dc0
[  243.124302] softirqs last  enabled at (0): [<ffffffff810d9bf5>] copy_process+0xa45/0x1dc0
[  243.126255] softirqs last disabled at (0): [<0000000000000000>] 0x0
[  243.128119] CPU: 0 PID: 839 Comm: perf Not tainted 5.11.0-rc4-guest+ #8
[  243.129654] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 0.0.0 02/06/2015
[  243.131520] Call Trace:
[  243.132112]  dump_stack+0x8d/0xb5
[  243.132896]  ___might_sleep.cold.106+0xb3/0xc3
[  243.133984]  slab_pre_alloc_hook.constprop.85+0x96/0xd0
[  243.135208]  ? intel_pmu_lbr_add+0x152/0x170
[  243.136207]  kmem_cache_alloc+0x36/0x250
[  243.137126]  intel_pmu_lbr_add+0x152/0x170
[  243.138088]  x86_pmu_add+0x83/0xd0
[  243.138889]  ? lock_acquire+0x158/0x350
[  243.139791]  ? lock_acquire+0x158/0x350
[  243.140694]  ? lock_acquire+0x158/0x350
[  243.141625]  ? lock_acquired+0x1e3/0x360
[  243.142544]  ? lock_release+0x1bf/0x340
[  243.143726]  ? trace_hardirqs_on+0x1a/0xd0
[  243.144823]  ? lock_acquired+0x1e3/0x360
[  243.145742]  ? lock_release+0x1bf/0x340
[  243.147107]  ? __slab_free+0x49/0x540
[  243.147966]  ? trace_hardirqs_on+0x1a/0xd0
[  243.148924]  event_sched_in.isra.129+0xf8/0x2a0
[  243.149989]  merge_sched_in+0x261/0x3e0
[  243.150889]  ? trace_hardirqs_on+0x1a/0xd0
[  243.151869]  visit_groups_merge.constprop.135+0x130/0x4a0
[  243.153122]  ? sched_clock_cpu+0xc/0xb0
[  243.154023]  ctx_sched_in+0x101/0x210
[  243.154884]  ctx_resched+0x6f/0xc0
[  243.155686]  perf_event_exec+0x21e/0x2e0
[  243.156641]  begin_new_exec+0x5e5/0xbd0
[  243.157540]  load_elf_binary+0x6af/0x1770
[  243.158478]  ? __kernel_read+0x19d/0x2b0
[  243.159977]  ? lock_acquire+0x158/0x350
[  243.160876]  ? __kernel_read+0x19d/0x2b0
[  243.161796]  bprm_execve+0x3c8/0x840
[  243.162638]  do_execveat_common.isra.38+0x1a5/0x1c0
[  243.163776]  __x64_sys_execve+0x32/0x40
[  243.164676]  do_syscall_64+0x33/0x40
[  243.165514]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
[  243.166746] RIP: 0033:0x7f6180a26feb
[  243.167590] Code: Unable to access opcode bytes at RIP 0x7f6180a26fc1.
[  243.169097] RSP: 002b:00007ffc6558ce18 EFLAGS: 00000202 ORIG_RAX: 000000000000003b
[  243.170844] RAX: ffffffffffffffda RBX: 00007ffc65592d30 RCX: 00007f6180a26feb
[  243.172514] RDX: 000055657f408dc0 RSI: 00007ffc65592410 RDI: 00007ffc65592d30
[  243.174162] RBP: 00007ffc6558ce80 R08: 00007ffc6558cde0 R09: 0000000000000000
[  243.176042] R10: 0000000000000008 R11: 0000000000000202 R12: 00007ffc65592410
[  243.177696] R13: 000055657f408dc0 R14: 0000000000000001 R15: 00007ffc65592410
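
The splat comes from reaching kmem_cache_alloc(GFP_KERNEL) on the
pmu->add() path, which runs with IRQs disabled. A minimal sketch of
the offending pattern, using hypothetical names (example_cache,
example_pmu_add) rather than the actual perf code:

	#include <linux/slab.h>

	static struct kmem_cache *example_cache;	/* hypothetical */

	/* hypothetical stand-in for a pmu->add() callback */
	static void example_pmu_add(void)
	{
		/*
		 * IRQs are already disabled here, but GFP_KERNEL may
		 * sleep, so the might_sleep_if() check inside the slab
		 * allocator fires when atomic-sleep debugging is on.
		 */
		void *buf = kmem_cache_alloc(example_cache, GFP_KERNEL);

		if (buf)
			kmem_cache_free(example_cache, buf);
	}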

One solution is to use GFP_ATOMIC, but it would make the code less
reliable under memory pressure. Let's move the memory allocation out of
the sleeping region and put it into x86_reserve_hardware().
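
This is the usual preallocation pattern: do the sleepable allocation
once per possible CPU from process context, so the atomic pmu->add()
path only consumes memory that already exists. A rough sketch of the
pattern with hypothetical names (example_state, example_reserve):

	#include <linux/cpumask.h>
	#include <linux/percpu.h>
	#include <linux/slab.h>

	struct example_state {
		void *buf;
	};

	static DEFINE_PER_CPU(struct example_state, example_state);
	static struct kmem_cache *example_cache;	/* hypothetical */

	/* process context: sleeping in the allocator is fine here */
	static void example_reserve(void)
	{
		int cpu;

		for_each_possible_cpu(cpu) {
			struct example_state *st =
				per_cpu_ptr(&example_state, cpu);

			/* allocate once; atomic paths later use st->buf */
			if (!st->buf)
				st->buf = kmem_cache_alloc(example_cache,
							   GFP_KERNEL);
		}
	}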

The disadvantage of this fix is that the cpuc->lbr_xsave memory
will be allocated for each CPU, like the legacy ds_buffer.

Fixes: c085fb8774 ("perf/x86/intel/lbr: Support XSAVES for arch LBR read")
Suggested-by: Kan Liang <kan.li...@linux.intel.com>
Signed-off-by: Like Xu <like...@linux.intel.com>

I observed the same issue when running an LBR test on an ADL machine. This patch fixes the issue.

Tested-by: Kan Liang <kan.li...@linux.intel.com>

Thanks,
Kan
---
  arch/x86/events/core.c       |  8 +++++---
  arch/x86/events/intel/bts.c  |  2 +-
  arch/x86/events/intel/lbr.c  | 22 ++++++++++++++++------
  arch/x86/events/perf_event.h |  8 +++++++-
  4 files changed, 29 insertions(+), 11 deletions(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 18df17129695..a4ce669cc78d 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -373,7 +373,7 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
        return x86_pmu_extra_regs(val, event);
  }
-int x86_reserve_hardware(void)
+int x86_reserve_hardware(struct perf_event *event)
  {
        int err = 0;
@@ -382,8 +382,10 @@ int x86_reserve_hardware(void)
                if (atomic_read(&pmc_refcount) == 0) {
                        if (!reserve_pmc_hardware())
                                err = -EBUSY;
-                       else
+                       else {
                                reserve_ds_buffers();
+                               reserve_lbr_buffers(event);
+                       }
                }
                if (!err)
                        atomic_inc(&pmc_refcount);
@@ -634,7 +636,7 @@ static int __x86_pmu_event_init(struct perf_event *event)
        if (!x86_pmu_initialized())
                return -ENODEV;

-       err = x86_reserve_hardware();
+       err = x86_reserve_hardware(event);
        if (err)
                return err;
diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c
index 731dd8d0dbb1..057bb2f761a9 100644
--- a/arch/x86/events/intel/bts.c
+++ b/arch/x86/events/intel/bts.c
@@ -564,7 +564,7 @@ static int bts_event_init(struct perf_event *event)
        if (x86_add_exclusive(x86_lbr_exclusive_bts))
                return -EBUSY;

-       ret = x86_reserve_hardware();
+       ret = x86_reserve_hardware(event);
        if (ret) {
                x86_del_exclusive(x86_lbr_exclusive_bts);
                return ret;
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index 355ea70f1879..237876733e12 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -658,7 +658,6 @@ static inline bool branch_user_callstack(unsigned br_sel)
void intel_pmu_lbr_add(struct perf_event *event)
  {
-       struct kmem_cache *kmem_cache = event->pmu->task_ctx_cache;
        struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);

        if (!x86_pmu.lbr_nr)
@@ -696,11 +695,6 @@ void intel_pmu_lbr_add(struct perf_event *event)
        perf_sched_cb_inc(event->ctx->pmu);
        if (!cpuc->lbr_users++ && !event->total_time_running)
                intel_pmu_lbr_reset();
-
-       if (static_cpu_has(X86_FEATURE_ARCH_LBR) &&
-           kmem_cache && !cpuc->lbr_xsave &&
-           (cpuc->lbr_users != cpuc->lbr_pebs_users))
-               cpuc->lbr_xsave = kmem_cache_alloc(kmem_cache, GFP_KERNEL);
  }
void release_lbr_buffers(void)
@@ -721,6 +715,22 @@ void release_lbr_buffers(void)
        }
  }
+void reserve_lbr_buffers(struct perf_event *event)
+{
+       struct kmem_cache *kmem_cache = x86_get_pmu()->task_ctx_cache;
+       struct cpu_hw_events *cpuc;
+       int cpu;
+
+       if (!static_cpu_has(X86_FEATURE_ARCH_LBR))
+               return;
+
+       for_each_possible_cpu(cpu) {
+               cpuc = per_cpu_ptr(&cpu_hw_events, cpu);
+               if (kmem_cache && !cpuc->lbr_xsave && !event->attr.precise_ip)
+                       cpuc->lbr_xsave = kmem_cache_alloc(kmem_cache, GFP_KERNEL);
+       }
+}
+
  void intel_pmu_lbr_del(struct perf_event *event)
  {
        struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 53b2b5fc23bc..2fe77d3a98d6 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -968,7 +968,7 @@ int x86_add_exclusive(unsigned int what);
 
 void x86_del_exclusive(unsigned int what);
 
-int x86_reserve_hardware(void);
+int x86_reserve_hardware(struct perf_event *event);
 
 void x86_release_hardware(void);
 
@@ -1135,6 +1135,8 @@ void reserve_ds_buffers(void);
 
 void release_lbr_buffers(void);
 
+void reserve_lbr_buffers(struct perf_event *event);
+
  extern struct event_constraint bts_constraint;
  extern struct event_constraint vlbr_constraint;
@@ -1282,6 +1284,10 @@ static inline void release_lbr_buffers(void)
  {
  }
+static inline void reserve_lbr_buffers(struct perf_event *event)
+{
+}
+
  static inline int intel_pmu_init(void)
  {
        return 0;
