When samples are generated, there is no way via the perf_event ABI to fetch per-thread data. This data is very useful in tracing scenarios that involve correlation IDs, such as OpenTelemetry. It is also useful for tracking per-thread performance details directly within a cooperating user process.
The newly established OpenTelemetry profiling group requires a way to get tracing correlations on both Linux and Windows. On Windows this correlation is on a per-thread basis directly via ETW. On Linux we need a fast mechanism to store these details and TLS seems like the best option; see the links below for more details. Add a new sample type (PERF_SAMPLE_TLS_USER) that fetches TLS data up to X bytes per-sample. Use the existing PERF_SAMPLE_STACK_USER ABI for outputting data to consumers. Store the data size requested by the user in the previously reserved u16 (__reserved_2) within perf_event_attr. Add tls_addr and tls_user_size to perf_sample_data and calculate them during sample preparation. This allows the output side to know whether truncation is going to occur and avoids having to re-fetch the TLS value from the user process a second time. Add CONFIG_HAVE_PERF_USER_TLS_DUMP so that architectures can specify if they have a TLS-specific register (or other logic) that can be used for dumping. This does not yet enable any architecture to do a TLS dump; it simply makes it possible by allowing an arch-defined method named arch_perf_user_tls_pointer(). Add a perf_tls struct that arch_perf_user_tls_pointer() utilizes to set the TLS details of the address and size (for 32-bit on 64-bit compat cases). Link: https://opentelemetry.io/blog/2024/profiling/ Link: https://www.elastic.co/blog/continuous-profiling-distributed-tracing-correlation Signed-off-by: Beau Belgrave <be...@linux.microsoft.com> --- arch/Kconfig | 7 +++ include/linux/perf_event.h | 7 +++ include/uapi/linux/perf_event.h | 5 +- kernel/events/core.c | 105 +++++++++++++++++++++++++++++++- kernel/events/internal.h | 16 +++++ 5 files changed, 137 insertions(+), 3 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index 9f066785bb71..6afaf5f46e2f 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -430,6 +430,13 @@ config HAVE_PERF_USER_STACK_DUMP access to the user stack pointer which is not unified across architectures. 
+config HAVE_PERF_USER_TLS_DUMP + bool + help + Support user tls dumps for perf event samples. This needs + access to the user tls pointer which is not unified across + architectures. + config HAVE_ARCH_JUMP_LABEL bool diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index d2a15c0c6f8a..7fac81929eed 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1202,8 +1202,15 @@ struct perf_sample_data { u64 data_page_size; u64 code_page_size; u64 aux_size; + u64 tls_addr; + u64 tls_user_size; } ____cacheline_aligned; +struct perf_tls { + unsigned long base; /* Base address for TLS */ + unsigned long size; /* Size of base address */ +}; + /* default value for data source */ #define PERF_MEM_NA (PERF_MEM_S(OP, NA) |\ PERF_MEM_S(LVL, NA) |\ diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 3a64499b0f5d..b62669cfe581 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -162,8 +162,9 @@ enum perf_event_sample_format { PERF_SAMPLE_DATA_PAGE_SIZE = 1U << 22, PERF_SAMPLE_CODE_PAGE_SIZE = 1U << 23, PERF_SAMPLE_WEIGHT_STRUCT = 1U << 24, + PERF_SAMPLE_TLS_USER = 1U << 25, - PERF_SAMPLE_MAX = 1U << 25, /* non-ABI */ + PERF_SAMPLE_MAX = 1U << 26, /* non-ABI */ }; #define PERF_SAMPLE_WEIGHT_TYPE (PERF_SAMPLE_WEIGHT | PERF_SAMPLE_WEIGHT_STRUCT) @@ -509,7 +510,7 @@ struct perf_event_attr { */ __u32 aux_watermark; __u16 sample_max_stack; - __u16 __reserved_2; + __u16 sample_tls_user; /* Size of TLS data to dump on samples */ __u32 aux_sample_size; __u32 __reserved_3; diff --git a/kernel/events/core.c b/kernel/events/core.c index 07de5cc2aa25..f848bf4be9bd 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6926,6 +6926,45 @@ static u64 perf_ustack_task_size(struct pt_regs *regs) return TASK_SIZE - addr; } +/* + * Get remaining task size from user tls pointer. + * + * Outputs the address to use for the dump to avoid doing + * this twice (prepare and output). 
+ */ +static u64 +perf_utls_task_size(struct pt_regs *regs, u64 dump_size, u64 *tls_addr) +{ + struct perf_tls tls; + unsigned long addr; + + *tls_addr = 0; + + /* No regs, no tls pointer, no dump. */ + if (!regs) + return 0; + + perf_user_tls_pointer(&tls); + + if (WARN_ONCE(tls.size > sizeof(addr), "perf: Bad TLS size.\n")) + return 0; + + addr = 0; + arch_perf_out_copy_user(&addr, (void *)tls.base, tls.size); + + if (addr < dump_size) + return 0; + + addr -= dump_size; + + if (!addr || addr >= TASK_SIZE) + return 0; + + *tls_addr = addr; + + return TASK_SIZE - addr; +} + static u16 perf_sample_dump_size(u16 dump_size, u16 header_size, u64 task_size) { @@ -6997,6 +7036,43 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size, } } +static void +perf_output_sample_utls(struct perf_output_handle *handle, u64 addr, + u64 dump_size, struct pt_regs *regs) +{ + /* Case of a kernel thread, nothing to dump */ + if (!regs) { + u64 size = 0; + perf_output_put(handle, size); + } else { + unsigned int rem; + u64 dyn_size; + + /* + * We dump: + * static size + * - the size requested by user or the best one we can fit + * in to the sample max size + * data + * - user tls dump data + * dynamic size + * - the actual dumped size + */ + + /* Static size. */ + perf_output_put(handle, dump_size); + + /* Data. */ + rem = __output_copy_user(handle, (void *)addr, dump_size); + dyn_size = dump_size - rem; + + perf_output_skip(handle, rem); + + /* Dynamic size. 
*/ + perf_output_put(handle, dyn_size); + } +} + static unsigned long perf_prepare_sample_aux(struct perf_event *event, struct perf_sample_data *data, size_t size) @@ -7474,6 +7550,13 @@ void perf_output_sample(struct perf_output_handle *handle, if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE) perf_output_put(handle, data->code_page_size); + if (sample_type & PERF_SAMPLE_TLS_USER) { + perf_output_sample_utls(handle, + data->tls_addr, + data->tls_user_size, + data->regs_user.regs); + } + if (sample_type & PERF_SAMPLE_AUX) { perf_output_put(handle, data->aux_size); @@ -7759,6 +7842,19 @@ void perf_prepare_sample(struct perf_sample_data *data, data->sample_flags |= PERF_SAMPLE_STACK_USER; } + if (filtered_sample_type & PERF_SAMPLE_TLS_USER) { + u16 tls_size = event->attr.sample_tls_user; + u64 task_size = perf_utls_task_size(data->regs_user.regs, + tls_size, + &data->tls_addr); + + tls_size = perf_prepare_dump_data(data, event, regs, + tls_size, task_size); + + data->tls_user_size = tls_size; + data->sample_flags |= PERF_SAMPLE_TLS_USER; + } + if (filtered_sample_type & PERF_SAMPLE_WEIGHT_TYPE) { data->weight.full = 0; data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; @@ -12159,7 +12255,7 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, attr->size = size; - if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3) + if (attr->__reserved_1 || attr->__reserved_3) return -EINVAL; if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) @@ -12225,6 +12321,13 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, return -EINVAL; } + if (attr->sample_type & PERF_SAMPLE_TLS_USER) { + if (!arch_perf_have_user_tls_dump()) + return -ENOSYS; + else if (!IS_ALIGNED(attr->sample_tls_user, sizeof(u64))) + return -EINVAL; + } + if (!attr->sample_max_stack) attr->sample_max_stack = sysctl_perf_event_max_stack; diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 5150d5f84c03..b42747b1eb04 100644 --- a/kernel/events/internal.h +++ 
b/kernel/events/internal.h @@ -243,4 +243,20 @@ static inline bool arch_perf_have_user_stack_dump(void) #define perf_user_stack_pointer(regs) 0 #endif /* CONFIG_HAVE_PERF_USER_STACK_DUMP */ +#ifdef CONFIG_HAVE_PERF_USER_TLS_DUMP +static inline bool arch_perf_have_user_tls_dump(void) +{ + return true; +} + +#define perf_user_tls_pointer(tls) arch_perf_user_tls_pointer(tls) +#else +static inline bool arch_perf_have_user_tls_dump(void) +{ + return false; +} + +#define perf_user_tls_pointer(tls) memset(tls, 0, sizeof(*tls)) +#endif /* CONFIG_HAVE_PERF_USER_TLS_DUMP */ + #endif /* _KERNEL_EVENTS_INTERNAL_H */ -- 2.34.1