AMD General [Tao] The buffer has three different names in this patch: trace, trace_arr, trace_arry, can we simplify it?
> -----Original Message----- > From: Chai, Thomas <[email protected]> > Sent: Monday, May 18, 2026 3:22 PM > To: [email protected] > Cc: Chai, Thomas <[email protected]>; Zhang, Hawking > <[email protected]>; Zhou1, Tao <[email protected]>; Yang, > Stanley <[email protected]>; Chai, Thomas <[email protected]> > Subject: [PATCH 6/7] drm/amd/ras: copy ras log data instead of referencing > pointers > > When generating ras cper file, the original data nodes in the ras log ring > buffer > may be deleted, leading to invalid pointer access. Copy the data from the ras > log ring instead of directly referencing the pointers to avoid this issue. > > Signed-off-by: YiPeng Chai <[email protected]> > --- > .../drm/amd/ras/ras_mgr/amdgpu_virt_ras_cmd.c | 12 +++--- > drivers/gpu/drm/amd/ras/rascore/ras_cmd.c | 42 +++++++++++++------ > drivers/gpu/drm/amd/ras/rascore/ras_cper.c | 20 ++++----- > drivers/gpu/drm/amd/ras/rascore/ras_cper.h | 2 +- > .../gpu/drm/amd/ras/rascore/ras_log_ring.c | 23 +++++----- > .../gpu/drm/amd/ras/rascore/ras_log_ring.h | 2 +- > 6 files changed, 58 insertions(+), 43 deletions(-) > > diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_virt_ras_cmd.c > b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_virt_ras_cmd.c > index b8e9442b2ca5..537f709d8570 100644 > --- a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_virt_ras_cmd.c > +++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_virt_ras_cmd.c > @@ -219,7 +219,7 @@ static bool > amdgpu_virt_ras_check_batch_cached(struct ras_cmd_batch_trace_record } > > static int amdgpu_virt_ras_get_batch_records(struct ras_core_context > *ras_core, uint64_t batch_id, > - struct ras_log_info **trace_arr, uint32_t arr_num, > + struct ras_log_info *trace_arr, uint32_t arr_num, > struct ras_cmd_batch_trace_record_rsp *rsp_cache) { > struct ras_cmd_batch_trace_record_req req = { @@ -255,7 +255,8 > @@ static int amdgpu_virt_ras_get_batch_records(struct ras_core_context > *ras_core, > } > > for (i = 0; i < batch->trace_num && i < arr_num; i++) > - trace_arr[i] = &rsp->records[batch->offset + i]; > + memcpy(&trace_arr[i], > + &rsp->records[batch->offset + i], sizeof(*trace_arr)); > > return i; > } > @@ -272,7 +273,8 @@ static int amdgpu_virt_ras_get_cper_records(struct > ras_core_context *ras_core, > (struct ras_cmd_cper_record_rsp *)cmd->output_buff_raw; > struct ras_log_batch_overview *overview = &virt_ras- > >batch_mgr.batch_overview; > struct ras_cmd_batch_trace_record_rsp *rsp_cache = &virt_ras- > >batch_mgr.batch_trace; > - struct ras_log_info **trace; > + struct ras_log_info *trace; > + uint32_t trace_count = MAX_RECORD_PER_BATCH; > uint32_t offset = 0, real_data_len = 0; > uint64_t batch_id; > uint8_t *out_buf; > @@ -289,7 +291,7 @@ static int amdgpu_virt_ras_get_cper_records(struct > ras_core_context *ras_core, > req->cper_num > RAS_CMD_MAX_CPER_FETCH_NUM) > return RAS_CMD__ERROR_INVALID_INPUT_DATA; > > - trace = kcalloc(MAX_RECORD_PER_BATCH, sizeof(*trace), > GFP_KERNEL); > + trace = kcalloc(trace_count, sizeof(*trace), GFP_KERNEL); > if (!trace) > return RAS_CMD__ERROR_GENERIC; > > @@ -306,7 +308,7 @@ static int amdgpu_virt_ras_get_cper_records(struct > ras_core_context *ras_core, > if (batch_id >= overview->last_batch_id) > break; > count = amdgpu_virt_ras_get_batch_records(ras_core, > batch_id, > - trace, > MAX_RECORD_PER_BATCH, > + trace, trace_count, > rsp_cache); > if (count > 0) { > ret = ras_cper_generate_cper(ras_core, trace, count, > diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_cmd.c > b/drivers/gpu/drm/amd/ras/rascore/ras_cmd.c > index 5b7a36596b02..088b9b153f7f 100644 > --- a/drivers/gpu/drm/amd/ras/rascore/ras_cmd.c > +++ b/drivers/gpu/drm/amd/ras/rascore/ras_cmd.c > @@ -202,11 +202,12 @@ static int ras_cmd_get_cper_records(struct > ras_core_context *ras_core, > (struct ras_cmd_cper_record_req *)cmd- > >input_buff_raw; > struct ras_cmd_cper_record_rsp *rsp = > (struct ras_cmd_cper_record_rsp *)cmd- > >output_buff_raw; > - struct ras_log_info *trace[MAX_RECORD_PER_BATCH] = {0}; > + struct ras_log_info *trace = NULL; > + uint32_t trace_count = MAX_RECORD_PER_BATCH; > struct ras_log_batch_overview overview; > uint32_t offset = 0, real_data_len = 0; > uint64_t batch_id; > - uint8_t *buffer; > + uint8_t *buffer = NULL; > int ret = 0, i, count; > > if ((cmd->input_size != sizeof(struct ras_cmd_cper_record_req)) || > @@ -224,6 +225,12 @@ static int ras_cmd_get_cper_records(struct > ras_core_context *ras_core, > if (!buffer) > return RAS_CMD__ERROR_GENERIC; > > + trace = kcalloc(trace_count, sizeof(*trace), GFP_KERNEL); > + if (!trace) { > + ret = RAS_CMD__ERROR_GENERIC; > + goto out; > + } > + > ras_log_ring_get_batch_overview(ras_core, &overview); > for (i = 0; i < req->cper_num; i++) { > batch_id = req->cper_start_id + i; > @@ -231,7 +238,7 @@ static int ras_cmd_get_cper_records(struct > ras_core_context *ras_core, > break; > > count = ras_log_ring_get_batch_records(ras_core, batch_id, > trace, > - ARRAY_SIZE(trace)); > + trace_count); > if (count > 0) { > ret = ras_cper_generate_cper(ras_core, trace, count, > &buffer[offset], req->buf_size - offset, > &real_data_len); @@ -244,8 +251,8 @@ static int > ras_cmd_get_cper_records(struct ras_core_context *ras_core, > > if ((ret && (ret != -ENOMEM)) || > copy_to_user(u64_to_user_ptr(req->buf_ptr), buffer, offset)) > { > - kfree(buffer); > - return RAS_CMD__ERROR_GENERIC; > + ret = RAS_CMD__ERROR_GENERIC; > + goto out; > } > > rsp->real_data_size = offset; > @@ -254,10 +261,12 @@ static int ras_cmd_get_cper_records(struct > ras_core_context *ras_core, > rsp->version = 0; > > cmd->output_size = sizeof(struct ras_cmd_cper_record_rsp); > + ret = RAS_CMD__SUCCESS; > > +out: > + kfree(trace); > kfree(buffer); > - > - return RAS_CMD__SUCCESS; > + return ret; > } > > static int ras_cmd_get_batch_trace_snapshot(struct ras_core_context > *ras_core, @@ -291,7 +300,8 @@ static int > ras_cmd_get_batch_trace_records(struct ras_core_context *ras_core, > struct ras_cmd_batch_trace_record_rsp *output_data = > (struct ras_cmd_batch_trace_record_rsp *)cmd- > >output_buff_raw; > struct ras_log_batch_overview overview; > - struct ras_log_info *trace_arry[MAX_RECORD_PER_BATCH] = {0}; > + struct ras_log_info *trace_arry = NULL; > + uint32_t trace_count = MAX_RECORD_PER_BATCH; > struct ras_log_info *record; > int i, j, count = 0, offset = 0; > uint64_t id; > @@ -309,6 +319,10 @@ static int ras_cmd_get_batch_trace_records(struct > ras_core_context *ras_core, > (input_data->start_batch_id >= overview.last_batch_id)) > return RAS_CMD__ERROR_INVALID_INPUT_SIZE; > > + trace_arry = kcalloc(trace_count, sizeof(*trace_arry), GFP_KERNEL); > + if (!trace_arry) > + return RAS_CMD__ERROR_GENERIC; > + > for (i = 0; i < input_data->batch_num; i++) { > id = input_data->start_batch_id + i; > if (id >= overview.last_batch_id) { > @@ -317,17 +331,17 @@ static int ras_cmd_get_batch_trace_records(struct > ras_core_context *ras_core, > } > > count = ras_log_ring_get_batch_records(ras_core, > - id, trace_arry, > ARRAY_SIZE(trace_arry)); > + id, trace_arry, trace_count); > if (count > 0) { > if ((offset + count) > RAS_CMD_MAX_TRACE_NUM) > break; > for (j = 0; j < count; j++) { > record = &output_data->records[offset + j]; > - record->seqno = trace_arry[j]->seqno; > - record->timestamp = trace_arry[j]- > >timestamp; > - record->event = trace_arry[j]->event; > + record->seqno = trace_arry[j].seqno; > + record->timestamp = trace_arry[j].timestamp; > + record->event = trace_arry[j].event; > memcpy(&record->aca_reg, > - &trace_arry[j]->aca_reg, > sizeof(trace_arry[j]->aca_reg)); > + &trace_arry[j].aca_reg, > sizeof(trace_arry[j].aca_reg)); > } > } else { > count = 0; > @@ -346,6 +360,8 @@ static int ras_cmd_get_batch_trace_records(struct > ras_core_context *ras_core, > > cmd->output_size = sizeof(struct ras_cmd_batch_trace_record_rsp); > > + kfree(trace_arry); > + > return RAS_CMD__SUCCESS; > } > > diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_cper.c > b/drivers/gpu/drm/amd/ras/rascore/ras_cper.c > index 0fc7522b7ab6..6e93a13bbc4c 100644 > --- a/drivers/gpu/drm/amd/ras/rascore/ras_cper.c > +++ b/drivers/gpu/drm/amd/ras/rascore/ras_cper.c > @@ -175,14 +175,14 @@ static int fill_section_runtime(struct > ras_core_context *ras_core, } > > static int cper_generate_runtime_record(struct ras_core_context *ras_core, > - struct cper_section_hdr *hdr, struct ras_log_info **trace_arr, uint32_t > arr_num, > + struct cper_section_hdr *hdr, struct ras_log_info *trace_arr, uint32_t > +arr_num, > enum ras_cper_severity sev) > { > struct cper_section_descriptor *descriptor; > struct cper_section_runtime *runtime; > int i; > > - fill_section_hdr(ras_core, hdr, RAS_CPER_TYPE_RUNTIME, sev, > trace_arr[0]); > + fill_section_hdr(ras_core, hdr, RAS_CPER_TYPE_RUNTIME, sev, > +&trace_arr[0]); > hdr->record_length = RAS_HDR_LEN + ((RAS_SEC_DESC_LEN + > RAS_NONSTD_SEC_LEN) * arr_num); > hdr->sec_cnt = arr_num; > for (i = 0; i < arr_num; i++) { > @@ -194,21 +194,21 @@ static int cper_generate_runtime_record(struct > ras_core_context *ras_core, > fill_section_descriptor(ras_core, descriptor, sev, RUNTIME, > RAS_NONSTD_SEC_OFFSET(hdr->sec_cnt, i), > sizeof(struct cper_section_runtime)); > - fill_section_runtime(ras_core, runtime, trace_arr[i], sev); > + fill_section_runtime(ras_core, runtime, &trace_arr[i], sev); > } > > return 0; > } > > static int cper_generate_fatal_record(struct ras_core_context *ras_core, > - uint8_t *buffer, struct ras_log_info **trace_arr, uint32_t arr_num) > + uint8_t *buffer, struct ras_log_info *trace_arr, uint32_t arr_num) > { > struct ras_cper_fatal_record record = {0}; > int i = 0; > > for (i = 0; i < arr_num; i++) { > fill_section_hdr(ras_core, &record.hdr, > RAS_CPER_TYPE_FATAL, > - RAS_CPER_SEV_FATAL_UE, trace_arr[i]); > + RAS_CPER_SEV_FATAL_UE, &trace_arr[i]); > record.hdr.record_length = RAS_HDR_LEN + > RAS_SEC_DESC_LEN + RAS_FATAL_SEC_LEN; > record.hdr.sec_cnt = 1; > > @@ -216,7 +216,7 @@ static int cper_generate_fatal_record(struct > ras_core_context *ras_core, > CRASHDUMP, offsetof(struct > ras_cper_fatal_record, fatal), > sizeof(struct cper_section_fatal)); > > - fill_section_fatal(ras_core, &record.fatal, trace_arr[i]); > + fill_section_fatal(ras_core, &record.fatal, &trace_arr[i]); > > memcpy(buffer + (i * record.hdr.record_length), > &record, record.hdr.record_length); @@ - > 271,7 +271,7 @@ static enum ras_cper_type > cper_ras_log_event_to_cper_type(enum ras_log_event eve } > > int ras_cper_generate_cper(struct ras_core_context *ras_core, > - struct ras_log_info **trace_list, uint32_t count, > + struct ras_log_info *trace_list, uint32_t count, > uint8_t *buf, uint32_t buf_len, uint32_t *real_data_len) { > uint8_t *buffer = buf; > @@ -281,14 +281,14 @@ int ras_cper_generate_cper(struct ras_core_context > *ras_core, > > /* All the batch traces share the same event */ > record_size = cper_get_record_size( > - cper_ras_log_event_to_cper_type(trace_list[0]- > >event), count); > + > cper_ras_log_event_to_cper_type(trace_list[0].event), count); > > if ((record_size + saved_size) > buf_size) > return -ENOMEM; > > hdr = (struct cper_section_hdr *)(buffer + saved_size); > > - switch (trace_list[0]->event) { > + switch (trace_list[0].event) { > case RAS_LOG_EVENT_RMA: > cper_generate_runtime_record(ras_core, hdr, trace_list, > count, RAS_CPER_SEV_RMA); > break; > @@ -304,7 +304,7 @@ int ras_cper_generate_cper(struct ras_core_context > *ras_core, > cper_generate_fatal_record(ras_core, buffer + saved_size, > trace_list, count); > break; > default: > - RAS_DEV_WARN(ras_core->dev, "Unprocessed trace > event: %d\n", trace_list[0]->event); > + RAS_DEV_WARN(ras_core->dev, "Unprocessed trace > event: %d\n", > +trace_list[0].event); > break; > } > > diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_cper.h > b/drivers/gpu/drm/amd/ras/rascore/ras_cper.h > index 076c1883c1ce..e4e3615ecc2e 100644 > --- a/drivers/gpu/drm/amd/ras/rascore/ras_cper.h > +++ b/drivers/gpu/drm/amd/ras/rascore/ras_cper.h > @@ -299,6 +299,6 @@ struct ras_cper_fatal_record { struct > ras_core_context; struct ras_log_info; int ras_cper_generate_cper(struct > ras_core_context *ras_core, > - struct ras_log_info **trace_list, uint32_t count, > + struct ras_log_info *trace_list, uint32_t count, > uint8_t *buf, uint32_t buf_len, uint32_t *real_data_len); > #endif diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_log_ring.c > b/drivers/gpu/drm/amd/ras/rascore/ras_log_ring.c > index 0a838fdcb2f6..c2fca1a1e780 100644 > --- a/drivers/gpu/drm/amd/ras/rascore/ras_log_ring.c > +++ b/drivers/gpu/drm/amd/ras/rascore/ras_log_ring.c > @@ -265,8 +265,8 @@ void ras_log_ring_add_log_event(struct > ras_core_context *ras_core, > ras_log_ring_add_data(ras_core, log, batch_tag); } > > -static struct ras_log_info *ras_log_ring_lookup_data(struct ras_core_context > *ras_core, > - uint64_t idx) > +static int ras_log_ring_lookup_data(struct ras_core_context *ras_core, > + uint64_t idx, struct ras_log_info *log) > { > struct ras_log_ring *log_ring = &ras_core->ras_log_ring; > unsigned long flags = 0; > @@ -274,30 +274,27 @@ static struct ras_log_info > *ras_log_ring_lookup_data(struct ras_core_context *ra > > spin_lock_irqsave(&log_ring->spin_lock, flags); > data = radix_tree_lookup(&log_ring->ras_log_root, idx); > + if (data) > + memcpy(log, data, sizeof(*log)); > spin_unlock_irqrestore(&log_ring->spin_lock, flags); > > - return (struct ras_log_info *)data; > + return data ? 0 : -ENODATA; > } > > int ras_log_ring_get_batch_records(struct ras_core_context *ras_core, > uint64_t batch_id, > - struct ras_log_info **log_arr, uint32_t arr_num) > + struct ras_log_info *log_arr, uint32_t arr_num) > { > struct ras_log_ring *log_ring = &ras_core->ras_log_ring; > uint32_t i, idx, count = 0; > - void *data; > > - if ((batch_id >= log_ring->mono_upward_batch_id) || > + if (!log_arr || !arr_num || (batch_id >= > +log_ring->mono_upward_batch_id) || > (batch_id < log_ring->last_del_batch_id)) > return -EINVAL; > > - for (i = 0; i < MAX_RECORD_PER_BATCH; i++) { > + for (i = 0; i < MAX_RECORD_PER_BATCH && i < arr_num; i++) { > idx = BATCH_IDX_TO_TREE_IDX(batch_id, i); > - data = ras_log_ring_lookup_data(ras_core, idx); > - if (data) { > - log_arr[count++] = data; > - if (count >= arr_num) > - break; > - } > + if (!ras_log_ring_lookup_data(ras_core, idx, &log_arr[count])) > + count++; > } > > return count; > diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_log_ring.h > b/drivers/gpu/drm/amd/ras/rascore/ras_log_ring.h > index 0ff6cc35678d..cb66beaa9f43 100644 > --- a/drivers/gpu/drm/amd/ras/rascore/ras_log_ring.h > +++ b/drivers/gpu/drm/amd/ras/rascore/ras_log_ring.h > @@ -86,7 +86,7 @@ void ras_log_ring_add_log_event(struct > ras_core_context *ras_core, > enum ras_log_event event, void *data, struct > ras_log_batch_tag *tag); > > int ras_log_ring_get_batch_records(struct ras_core_context *ras_core, > uint64_t batch_idx, > - struct ras_log_info **log_arr, uint32_t arr_num); > + struct ras_log_info *log_arr, uint32_t arr_num); > > int ras_log_ring_get_batch_overview(struct ras_core_context *ras_core, > struct ras_log_batch_overview *overview); > -- > 2.43.0
