Long ring buffer data records (type_len == 0) currently store the aligned in-buffer size, rather than the requested payload size, in their array[0] length field. As a result, ring_buffer_event_length() reports padded sizes for them. Small TRACE_PRINT / TRACE_RAW_DATA records fare worse: when they use the short type_len encoding, only a multiple of RB_ALIGNMENT is recorded, so their true payload length is lost entirely.
Teach long data events to keep the true payload size in array[0], and let the ring buffer derive the aligned in-buffer size separately when it needs to walk or discard records. Then add a long-reserve helper and use it for TRACE_PRINT and TRACE_RAW_DATA so their zero-length-array tails always preserve the real payload size. The temporary filtered-event buffer keeps the same long-record payload length semantics, and a QEMU runtime reproducer for trace_marker_raw now reports the expected byte counts again. Link: https://bugzilla.kernel.org/show_bug.cgi?id=210173 Signed-off-by: Cao Ruichuang <[email protected]> --- include/linux/ring_buffer.h | 2 ++ kernel/trace/ring_buffer.c | 56 ++++++++++++++++++++++++++----------- kernel/trace/trace.c | 8 +++--- kernel/trace/trace.h | 15 ++++++++++ kernel/trace/trace_printk.c | 8 +++--- 5 files changed, 65 insertions(+), 24 deletions(-) diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index d862fa610..a4e46cb53 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -137,6 +137,8 @@ void ring_buffer_change_overwrite(struct trace_buffer *buffer, int val); struct ring_buffer_event *ring_buffer_lock_reserve(struct trace_buffer *buffer, unsigned long length); +struct ring_buffer_event *ring_buffer_lock_reserve_long(struct trace_buffer *buffer, + unsigned long length); int ring_buffer_unlock_commit(struct trace_buffer *buffer); int ring_buffer_write(struct trace_buffer *buffer, unsigned long length, void *data); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 170170bd8..c9ade62df 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -206,10 +206,14 @@ rb_event_data_length(struct ring_buffer_event *event) unsigned length; if (event->type_len) - length = event->type_len * RB_ALIGNMENT; - else - length = event->array[0]; - return length + RB_EVNT_HDR_SIZE; + return event->type_len * RB_ALIGNMENT + RB_EVNT_HDR_SIZE; + + /* + * Long records store 
the true payload size in array[0], but still + * consume an aligned amount of space in the buffer. + */ + length = event->array[0] + RB_EVNT_HDR_SIZE + sizeof(event->array[0]); + return ALIGN(length, RB_ARCH_ALIGNMENT); } /* @@ -276,12 +280,13 @@ unsigned ring_buffer_event_length(struct ring_buffer_event *event) if (extended_time(event)) event = skip_time_extend(event); + if (!event->type_len) + return event->array[0]; + length = rb_event_length(event); if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX) return length; length -= RB_EVNT_HDR_SIZE; - if (length > RB_MAX_SMALL_DATA + sizeof(event->array[0])) - length -= sizeof(event->array[0]); return length; } EXPORT_SYMBOL_GPL(ring_buffer_event_length); @@ -463,9 +468,11 @@ struct rb_event_info { u64 delta; u64 before; u64 after; + unsigned long data_length; unsigned long length; struct buffer_page *tail_page; int add_timestamp; + bool force_long; }; /* @@ -3796,14 +3803,15 @@ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, event->time_delta = delta; length -= RB_EVNT_HDR_SIZE; - if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) { + if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT || + info->force_long) { event->type_len = 0; - event->array[0] = length; + event->array[0] = info->data_length; } else event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); } -static unsigned rb_calculate_event_length(unsigned length) +static unsigned int rb_calculate_event_length(unsigned int length, bool force_long) { struct ring_buffer_event event; /* Used only for sizeof array */ @@ -3811,7 +3819,7 @@ static unsigned rb_calculate_event_length(unsigned length) if (!length) length++; - if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) + if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT || force_long) length += sizeof(event.array[0]); length += RB_EVNT_HDR_SIZE; @@ -4605,7 +4613,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, static __always_inline struct ring_buffer_event * 
rb_reserve_next_event(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer, - unsigned long length) + unsigned long length, bool force_long) { struct ring_buffer_event *event; struct rb_event_info info; @@ -4641,7 +4649,9 @@ rb_reserve_next_event(struct trace_buffer *buffer, } #endif - info.length = rb_calculate_event_length(length); + info.length = rb_calculate_event_length(length, force_long); + info.data_length = length ? : 1; + info.force_long = force_long; if (ring_buffer_time_stamp_abs(cpu_buffer->buffer)) { add_ts_default = RB_ADD_STAMP_ABSOLUTE; @@ -4698,8 +4708,9 @@ rb_reserve_next_event(struct trace_buffer *buffer, * Must be paired with ring_buffer_unlock_commit, unless NULL is returned. * If NULL is returned, then nothing has been allocated or locked. */ -struct ring_buffer_event * -ring_buffer_lock_reserve(struct trace_buffer *buffer, unsigned long length) +static struct ring_buffer_event * +__ring_buffer_lock_reserve(struct trace_buffer *buffer, unsigned long length, + bool force_long) { struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event; @@ -4727,7 +4738,7 @@ ring_buffer_lock_reserve(struct trace_buffer *buffer, unsigned long length) if (unlikely(trace_recursive_lock(cpu_buffer))) goto out; - event = rb_reserve_next_event(buffer, cpu_buffer, length); + event = rb_reserve_next_event(buffer, cpu_buffer, length, force_long); if (!event) goto out_unlock; @@ -4739,8 +4750,21 @@ ring_buffer_lock_reserve(struct trace_buffer *buffer, unsigned long length) preempt_enable_notrace(); return NULL; } + +struct ring_buffer_event * +ring_buffer_lock_reserve(struct trace_buffer *buffer, unsigned long length) +{ + return __ring_buffer_lock_reserve(buffer, length, false); +} EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve); +struct ring_buffer_event * +ring_buffer_lock_reserve_long(struct trace_buffer *buffer, unsigned long length) +{ + return __ring_buffer_lock_reserve(buffer, length, true); +} 
+EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve_long); + /* * Decrement the entries to the page that an event is on. * The event does not even need to exist, only the pointer @@ -4874,7 +4898,7 @@ int ring_buffer_write(struct trace_buffer *buffer, if (unlikely(trace_recursive_lock(cpu_buffer))) return -EBUSY; - event = rb_reserve_next_event(buffer, cpu_buffer, length); + event = rb_reserve_next_event(buffer, cpu_buffer, length, false); if (!event) goto out_unlock; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a626211ce..ffc1b1e9c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6503,8 +6503,8 @@ static ssize_t write_marker_to_buffer(struct trace_array *tr, const char *buf, size = cnt + meta_size; buffer = tr->array_buffer.buffer; - event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, - tracing_gen_ctx()); + event = __trace_buffer_lock_reserve_long(buffer, TRACE_PRINT, size, + tracing_gen_ctx()); if (unlikely(!event)) { /* * If the size was greater than what was allowed, then @@ -6917,8 +6917,8 @@ static ssize_t write_raw_marker_to_buffer(struct trace_array *tr, if (size > ring_buffer_max_event_size(buffer)) return -EINVAL; - event = __trace_buffer_lock_reserve(buffer, TRACE_RAW_DATA, size, - tracing_gen_ctx()); + event = __trace_buffer_lock_reserve_long(buffer, TRACE_RAW_DATA, size, + tracing_gen_ctx()); if (!event) /* Ring buffer disabled, return as if not open for write */ return -EBADF; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index b8f380458..da55717c9 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1613,6 +1613,21 @@ __trace_buffer_lock_reserve(struct trace_buffer *buffer, return event; } +static __always_inline struct ring_buffer_event * +__trace_buffer_lock_reserve_long(struct trace_buffer *buffer, + int type, + unsigned long len, + unsigned int trace_ctx) +{ + struct ring_buffer_event *event; + + event = ring_buffer_lock_reserve_long(buffer, len); + if (event != NULL) + 
trace_event_setup(event, type, trace_ctx); + + return event; +} + static __always_inline void __buffer_unlock_commit(struct trace_buffer *buffer, struct ring_buffer_event *event) { diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index 9f67ce42e..1441b2bd4 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -444,8 +444,8 @@ int __trace_array_puts(struct trace_array *tr, unsigned long ip, trace_ctx = tracing_gen_ctx(); buffer = tr->array_buffer.buffer; guard(ring_buffer_nest)(buffer); - event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc, - trace_ctx); + event = __trace_buffer_lock_reserve_long(buffer, TRACE_PRINT, alloc, + trace_ctx); if (!event) return 0; @@ -725,8 +725,8 @@ int __trace_array_vprintk(struct trace_buffer *buffer, size = sizeof(*entry) + len + 1; scoped_guard(ring_buffer_nest, buffer) { - event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, - trace_ctx); + event = __trace_buffer_lock_reserve_long(buffer, TRACE_PRINT, size, + trace_ctx); if (!event) goto out; entry = ring_buffer_event_data(event); -- 2.39.5 (Apple Git-154)
