RE: [PATCH] drm/amdgpu: Move calculation of xcp per memory node
[AMD Official Use Only - General] Reviewed-by: Hawking Zhang Regards, Hawking -Original Message- From: Lazar, Lijo Sent: Wednesday, June 14, 2023 12:28 To: amd-gfx@lists.freedesktop.org Cc: Zhang, Hawking ; Deucher, Alexander ; Yang, Philip ; Kamal, Asad ; Ma, Le Subject: [PATCH] drm/amdgpu: Move calculation of xcp per memory node Its value is required for finding the memory id of xcp. Fixes: 2130f4ca70b7f ("drm/amdgpu: Add xcp manager num_xcp_per_mem_partition") Signed-off-by: Lijo Lazar --- drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c index d733fa6e7477..9687df9841ee 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c @@ -132,6 +132,9 @@ int amdgpu_xcp_init(struct amdgpu_xcp_mgr *xcp_mgr, int num_xcps, int mode) for (i = 0; i < MAX_XCP; ++i) xcp_mgr->xcp[i].valid = false; + /* This is needed for figuring out memory id of xcp */ + xcp_mgr->num_xcp_per_mem_partition = num_xcps / +xcp_mgr->adev->gmc.num_mem_partitions; + for (i = 0; i < num_xcps; ++i) { for (j = AMDGPU_XCP_GFXHUB; j < AMDGPU_XCP_MAX_BLOCKS; ++j) { ret = xcp_mgr->funcs->get_ip_details(xcp_mgr, i, j, @@ -157,7 +160,6 @@ int amdgpu_xcp_init(struct amdgpu_xcp_mgr *xcp_mgr, int num_xcps, int mode) xcp_mgr->num_xcps = num_xcps; amdgpu_xcp_update_partition_sched_list(adev); - xcp_mgr->num_xcp_per_mem_partition = num_xcps / xcp_mgr->adev->gmc.num_mem_partitions; return 0; } -- 2.25.1
[PATCH] drm/amdgpu: Move calculation of xcp per memory node
Its value is required for finding the memory id of xcp. Fixes: 2130f4ca70b7f ("drm/amdgpu: Add xcp manager num_xcp_per_mem_partition") Signed-off-by: Lijo Lazar --- drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c index d733fa6e7477..9687df9841ee 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c @@ -132,6 +132,9 @@ int amdgpu_xcp_init(struct amdgpu_xcp_mgr *xcp_mgr, int num_xcps, int mode) for (i = 0; i < MAX_XCP; ++i) xcp_mgr->xcp[i].valid = false; + /* This is needed for figuring out memory id of xcp */ + xcp_mgr->num_xcp_per_mem_partition = num_xcps / xcp_mgr->adev->gmc.num_mem_partitions; + for (i = 0; i < num_xcps; ++i) { for (j = AMDGPU_XCP_GFXHUB; j < AMDGPU_XCP_MAX_BLOCKS; ++j) { ret = xcp_mgr->funcs->get_ip_details(xcp_mgr, i, j, @@ -157,7 +160,6 @@ int amdgpu_xcp_init(struct amdgpu_xcp_mgr *xcp_mgr, int num_xcps, int mode) xcp_mgr->num_xcps = num_xcps; amdgpu_xcp_update_partition_sched_list(adev); - xcp_mgr->num_xcp_per_mem_partition = num_xcps / xcp_mgr->adev->gmc.num_mem_partitions; return 0; } -- 2.25.1
RE: [PATCH 3/3] drm/amdgpu: Remove unused NBIO interface
[AMD Official Use Only - General] Series is Reviewed-by: Le Ma > -Original Message- > From: Lazar, Lijo > Sent: Tuesday, June 13, 2023 6:54 PM > To: amd-gfx@lists.freedesktop.org > Cc: Zhang, Hawking ; Deucher, Alexander > ; Kamal, Asad ; Ma, > Le > Subject: [PATCH 3/3] drm/amdgpu: Remove unused NBIO interface > > Set compute partition mode interface in NBIO is no longer used. Remove the > only implementation from NBIO v7.9 > > Signed-off-by: Lijo Lazar > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h | 2 -- > drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c | 14 -- > 2 files changed, 16 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h > index 095aecfb201e..8ab8ae01f87c 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h > @@ -99,8 +99,6 @@ struct amdgpu_nbio_funcs { > int (*get_compute_partition_mode)(struct amdgpu_device *adev); > u32 (*get_memory_partition_mode)(struct amdgpu_device *adev, >u32 *supp_modes); > - void (*set_compute_partition_mode)(struct amdgpu_device *adev, > -enum amdgpu_gfx_partition mode); > }; > > struct amdgpu_nbio { > diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c > b/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c > index b033935d6749..cd1a02d30420 100644 > --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c > +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c > @@ -393,19 +393,6 @@ static int > nbio_v7_9_get_compute_partition_mode(struct amdgpu_device *adev) > return px; > } > > -static void nbio_v7_9_set_compute_partition_mode(struct amdgpu_device > *adev, > - enum amdgpu_gfx_partition mode) > -{ > - u32 tmp; > - > - /* SPX=0, DPX=1, TPX=2, QPX=3, CPX=4 */ > - tmp = RREG32_SOC15(NBIO, 0, > regBIF_BX_PF0_PARTITION_COMPUTE_STATUS); > - tmp = REG_SET_FIELD(tmp, > BIF_BX_PF0_PARTITION_COMPUTE_STATUS, > - PARTITION_MODE, mode); > - > - WREG32_SOC15(NBIO, 0, > regBIF_BX_PF0_PARTITION_COMPUTE_STATUS, tmp); > -} > - > static u32 
nbio_v7_9_get_memory_partition_mode(struct amdgpu_device > *adev, > u32 *supp_modes) > { > @@ -461,7 +448,6 @@ const struct amdgpu_nbio_funcs nbio_v7_9_funcs = { > .ih_control = nbio_v7_9_ih_control, > .remap_hdp_registers = nbio_v7_9_remap_hdp_registers, > .get_compute_partition_mode = > nbio_v7_9_get_compute_partition_mode, > - .set_compute_partition_mode = > nbio_v7_9_set_compute_partition_mode, > .get_memory_partition_mode = > nbio_v7_9_get_memory_partition_mode, > .init_registers = nbio_v7_9_init_registers, }; > -- > 2.25.1
[PATCH] drm/amd/display: Convert to kdoc formats in dc/core/dc.c
Fixes the following gcc with W=1: drivers/gpu/drm/amd/amdgpu/../display/dc/core/dc.c:3483: warning: Cannot understand * *** drivers/gpu/drm/amd/amdgpu/../display/dc/core/dc.c:4204: warning: Cannot understand * *** Cc: Rodrigo Siqueira Cc: Aurabindo Pillai Signed-off-by: Srinivasan Shanmugam --- drivers/gpu/drm/amd/display/dc/core/dc.c | 34 +++- 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc.c b/drivers/gpu/drm/amd/display/dc/core/dc.c index 5d3d61faeb28..e6bd20dbfc0a 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc.c @@ -3480,23 +3480,21 @@ static void build_dmub_update_dirty_rect( /** - * - * build_dmub_cmd_list: Build an array of DMCUB commands to be sent to DMCUB + * build_dmub_cmd_list() - Build an array of DMCUB commands to be sent to DMCUB * - * @param [in]: dc: Current DC state - * @param [in]: srf_updates: Array of surface updates - * @param [in]: surface_count: Number of surfaces that have an updated - * @param [in]: stream: Correponding stream to be updated in the current flip - * @param [in]: context: New DC state to be programmed + * @dc: Current DC state + * @srf_updates: Array of surface updates + * @surface_count: Number of surfaces that have an updated + * @stream: Corresponding stream to be updated in the current flip + * @context: New DC state to be programmed * - * @param [out]: dc_dmub_cmd: Array of DMCUB commands to be sent to DMCUB - * @param [out]: dmub_cmd_count: Count indicating the number of DMCUB commands in dc_dmub_cmd array + * @dc_dmub_cmd: Array of DMCUB commands to be sent to DMCUB + * @dmub_cmd_count: Count indicating the number of DMCUB commands in dc_dmub_cmd array * * This function builds an array of DMCUB commands to be sent to DMCUB. This function is required * to build an array of commands and have them sent while the OTG lock is acquired. 
* - * @return: void - * + * Return: void */ static void build_dmub_cmd_list(struct dc *dc, struct dc_surface_update *srf_updates, @@ -4201,20 +4199,18 @@ static bool commit_minimal_transition_state(struct dc *dc, } /** - * *** - * update_seamless_boot_flags: Helper function for updating seamless boot flags + * update_seamless_boot_flags() - Helper function for updating seamless boot flags * - * @param [in]: dc: Current DC state - * @param [in]: context: New DC state to be programmed - * @param [in]: surface_count: Number of surfaces that have an updated - * @param [in]: stream: Correponding stream to be updated in the current flip + * @dc: Current DC state + * @context: New DC state to be programmed + * @surface_count: Number of surfaces that have an updated + * @stream: Corresponding stream to be updated in the current flip * * Updating seamless boot flags do not need to be part of the commit sequence. This * helper function will update the seamless boot flags on each flip (if required) * outside of the HW commit sequence (fast or slow). * - * @return: void - * *** + * Return: void */ static void update_seamless_boot_flags(struct dc *dc, struct dc_state *context, -- 2.25.1
Re: [PATCH v5] drm/dp_mst: Clear MSG_RDY flag before sending new message
Alright, managed to figure out my MST woes! Just tested with nouveau and I see no regressions :) Reviewed-by: Lyude Paul On Fri, 2023-06-09 at 18:49 +0800, Wayne Lin wrote: > [Why] > The sequence for collecting down_reply from source perspective should > be: > > Request_n->repeat (get partial reply of Request_n->clear message ready > flag to ack DPRX that the message is received) till all partial > replies for Request_n are received->new Request_n+1. > > Now there is chance that drm_dp_mst_hpd_irq() will fire new down > request in the tx queue when the down reply is incomplete. Source is > restricted to generate interleaved message transactions so we should > avoid it. > > Also, while assembling partial reply packets, reading out DPCD DOWN_REP > Sideband MSG buffer + clearing DOWN_REP_MSG_RDY flag should be > wrapped up as a complete operation for reading out a reply packet. > Kicking off a new request before clearing DOWN_REP_MSG_RDY flag might > be risky. e.g. If the reply of the new request has overwritten the > DPRX DOWN_REP Sideband MSG buffer before source writing one to clear > DOWN_REP_MSG_RDY flag, source then unintentionally flushes the reply > for the new request. Should handle the up request in the same way. > > [How] > Separate drm_dp_mst_hpd_irq() into 2 steps. After acking the MST IRQ > event, driver calls drm_dp_mst_hpd_irq_send_new_request() and might > trigger drm_dp_mst_kick_tx() only when there is no ongoing message > transaction. > > Changes since v1: > * Reworked on review comments received > -> Adjust the fix to let driver explicitly kick off new down request > when mst irq event is handled and acked > -> Adjust the commit message > > Changes since v2: > * Adjust the commit message > * Adjust the naming of the divided 2 functions and add a new input > parameter "ack". > * Adjust code flow as per review comments. 
> > Changes since v3: > * Update the function description of drm_dp_mst_hpd_irq_handle_event > > Changes since v4: > * Change ack of drm_dp_mst_hpd_irq_handle_event() to be an array align > the size of esi[] > > Signed-off-by: Wayne Lin > Cc: sta...@vger.kernel.org > --- > .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 32 +-- > drivers/gpu/drm/display/drm_dp_mst_topology.c | 54 --- > drivers/gpu/drm/i915/display/intel_dp.c | 7 +-- > drivers/gpu/drm/nouveau/dispnv50/disp.c | 12 +++-- > include/drm/display/drm_dp_mst_helper.h | 7 ++- > 5 files changed, 81 insertions(+), 31 deletions(-) > > diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c > b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c > index d5cec03eaa8d..ec629b4037e4 100644 > --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c > +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c > @@ -3263,6 +3263,7 @@ static void dm_handle_mst_sideband_msg(struct > amdgpu_dm_connector *aconnector) > > while (dret == dpcd_bytes_to_read && > process_count < max_process_count) { > + u8 ack[DP_PSR_ERROR_STATUS - DP_SINK_COUNT_ESI] = {}; > u8 retry; > dret = 0; > > @@ -3271,28 +3272,29 @@ static void dm_handle_mst_sideband_msg(struct > amdgpu_dm_connector *aconnector) > DRM_DEBUG_DRIVER("ESI %02x %02x %02x\n", esi[0], esi[1], > esi[2]); > /* handle HPD short pulse irq */ > if (aconnector->mst_mgr.mst_state) > - drm_dp_mst_hpd_irq( > - >mst_mgr, > - esi, > - _irq_handled); > + drm_dp_mst_hpd_irq_handle_event(>mst_mgr, > + esi, > + ack, > + _irq_handled); > > if (new_irq_handled) { > /* ACK at DPCD to notify down stream */ > - const int ack_dpcd_bytes_to_write = > - dpcd_bytes_to_read - 1; > - > for (retry = 0; retry < 3; retry++) { > - u8 wret; > - > - wret = drm_dp_dpcd_write( > - >dm_dp_aux.aux, > - dpcd_addr + 1, > - [1], > - ack_dpcd_bytes_to_write); > - if (wret == ack_dpcd_bytes_to_write) > + ssize_t wret; > + > + wret = > drm_dp_dpcd_writeb(>dm_dp_aux.aux, > + dpcd_addr + 1, > + ack[1]); > + if (wret 
== 1) > break; > } > > + if (retry == 3) { > +
[PATCH v2] drm/amd/pm: Align eccinfo table structure with smu v13_0_0 interface
Update eccinfo table structure according to smu v13_0_0 interface. v2: Calculate array size instead of using macro definition. Signed-off-by: Candice Li Reviewed-by: Lijo Lazar --- drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c index 413e592f0ed611..cbf0b2d738c1a6 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c @@ -46,7 +46,6 @@ #include "asic_reg/mp/mp_13_0_0_sh_mask.h" #include "smu_cmn.h" #include "amdgpu_ras.h" -#include "umc_v8_10.h" /* * DO NOT use these for err/warn/info/debug messages. @@ -2580,7 +2579,7 @@ static ssize_t smu_v13_0_0_get_ecc_info(struct smu_context *smu, ecc_table = (EccInfoTable_t *)smu_table->ecc_table; - for (i = 0; i < UMC_V8_10_TOTAL_CHANNEL_NUM(adev); i++) { + for (i = 0; i < ARRAY_SIZE(ecc_table->EccInfo); i++) { ecc_info_per_channel = &(eccinfo->ecc[i]); ecc_info_per_channel->ce_count_lo_chip = ecc_table->EccInfo[i].ce_count_lo_chip; -- 2.25.1
[PATCH 2/2] drm/amdgpu: Add channel_dis_num to ras init flags
Add disabled channel number to ras init flags. Signed-off-by: Candice Li Reviewed-by: Hawking Zhang --- drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 1 + drivers/gpu/drm/amd/amdgpu/ta_ras_if.h | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c index dd865beb39a8c4..6070c91f0b8293 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c @@ -1657,6 +1657,7 @@ int psp_ras_initialize(struct psp_context *psp) ras_cmd->ras_in_message.init_flags.dgpu_mode = 1; ras_cmd->ras_in_message.init_flags.xcc_mask = adev->gfx.xcc_mask; + ras_cmd->ras_in_message.init_flags.channel_dis_num = hweight32(adev->gmc.m_half_use) * 2; ret = psp_ta_load(psp, >ras_context.context); diff --git a/drivers/gpu/drm/amd/amdgpu/ta_ras_if.h b/drivers/gpu/drm/amd/amdgpu/ta_ras_if.h index be2984ac00a56d..879bb7af297c7b 100644 --- a/drivers/gpu/drm/amd/amdgpu/ta_ras_if.h +++ b/drivers/gpu/drm/amd/amdgpu/ta_ras_if.h @@ -130,6 +130,7 @@ struct ta_ras_init_flags { uint8_t poison_mode_en; uint8_t dgpu_mode; uint16_t xcc_mask; + uint8_t channel_dis_num; }; struct ta_ras_output_flags { -- 2.25.1
[PATCH 1/2] drm/amdgpu: Update total channel number for umc v8_10
Update total channel number for umc v8_10. Signed-off-by: Candice Li Reviewed-by: Hawking Zhang --- drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h | 2 ++ drivers/gpu/drm/amd/amdgpu/umc_v8_10.h| 3 ++- 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c index 859882109f55d6..16cf7b199457e3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c @@ -1515,6 +1515,7 @@ static int amdgpu_discovery_get_mall_info(struct amdgpu_device *adev) mall_size += mall_size_per_umc; } adev->gmc.mall_size = mall_size; + adev->gmc.m_half_use = half_use; break; default: dev_err(adev->dev, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h index 6794edd1d2d2ae..56d73fade56850 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h @@ -301,6 +301,8 @@ struct amdgpu_gmc { /* MALL size */ u64 mall_size; + uint32_t m_half_use; + /* number of UMC instances */ int num_umc; /* mode2 save restore */ diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v8_10.h b/drivers/gpu/drm/amd/amdgpu/umc_v8_10.h index c6dfd433fec7bc..dc12e0af5451e9 100644 --- a/drivers/gpu/drm/amd/amdgpu/umc_v8_10.h +++ b/drivers/gpu/drm/amd/amdgpu/umc_v8_10.h @@ -33,7 +33,8 @@ /* Total channel instances for all available umc nodes */ #define UMC_V8_10_TOTAL_CHANNEL_NUM(adev) \ - (UMC_V8_10_CHANNEL_INSTANCE_NUM * UMC_V8_10_UMC_INSTANCE_NUM * (adev)->gmc.num_umc) + (UMC_V8_10_CHANNEL_INSTANCE_NUM * UMC_V8_10_UMC_INSTANCE_NUM * \ + (adev)->gmc.num_umc - hweight32((adev)->gmc.m_half_use) * 2) /* UMC regiser per channel offset */ #define UMC_V8_10_PER_CHANNEL_OFFSET 0x400 -- 2.25.1
Re: [PATCH] drm/amdkfd: decrement queue count on mes queue destroy
On 2023-06-13 17:48, Jonathan Kim wrote: Queue count should decrement on queue destruction regardless of HWS support type. Signed-off-by: Jonathan Kim Reviewed-by: Felix Kuehling --- drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index 8a39a9e0ed5a..f515cb8f30ca 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -2089,8 +2089,8 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm, list_del(>list); qpd->queue_count--; if (q->properties.is_active) { + decrement_queue_count(dqm, qpd, q); if (!dqm->dev->kfd->shared_resources.enable_mes) { - decrement_queue_count(dqm, qpd, q); retval = execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, USE_DEFAULT_GRACE_PERIOD);
[PATCH] drm/amdkfd: decrement queue count on mes queue destroy
Queue count should decrement on queue destruction regardless of HWS support type. Signed-off-by: Jonathan Kim --- drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index 8a39a9e0ed5a..f515cb8f30ca 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -2089,8 +2089,8 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm, list_del(>list); qpd->queue_count--; if (q->properties.is_active) { + decrement_queue_count(dqm, qpd, q); if (!dqm->dev->kfd->shared_resources.enable_mes) { - decrement_queue_count(dqm, qpd, q); retval = execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, USE_DEFAULT_GRACE_PERIOD); -- 2.25.1
[linux-next:master] BUILD REGRESSION 1f6ce8392d6ff486af5ca96df9ded5882c4b6977
tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git master branch HEAD: 1f6ce8392d6ff486af5ca96df9ded5882c4b6977 Add linux-next specific files for 20230613 Error/Warning reports: https://lore.kernel.org/oe-kbuild-all/202306082341.uqtcm8po-...@intel.com https://lore.kernel.org/oe-kbuild-all/20230613.hher4zoo-...@intel.com https://lore.kernel.org/oe-kbuild-all/202306132155.bfzc9arf-...@intel.com https://lore.kernel.org/oe-kbuild-all/202306132237.z4lje8bp-...@intel.com https://lore.kernel.org/oe-kbuild-all/202306140347.s9njs3al-...@intel.com Error/Warning: (recently discovered and may have been fixed) arch/microblaze/include/asm/page.h:34: warning: "ARCH_DMA_MINALIGN" redefined arch/parisc/kernel/pdt.c:65:6: warning: no previous prototype for 'arch_report_meminfo' [-Wmissing-prototypes] csky-linux-ld: drivers/net/ethernet/sfc/ef100_netdev.c:114: undefined reference to `efx_tc_netevent_event' drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c:76: warning: This comment starts with '/**', but isn't a kernel-doc comment. 
Refer Documentation/doc-guide/kernel-doc.rst drivers/gpu/drm/i915/display/intel_display_power.h:256:70: error: declaration of 'struct seq_file' will not be visible outside of this function [-Werror,-Wvisibility] drivers/leds/leds-cht-wcove.c:144:21: warning: no previous prototype for 'cht_wc_leds_brightness_get' [-Wmissing-prototypes] include/asm-generic/bitops/instrumented-non-atomic.h:141: undefined reference to `uv_info' lib/kunit/executor_test.c:138:4: warning: cast from 'void (*)(const void *)' to 'kunit_action_t *' (aka 'void (*)(void *)') converts to incompatible function type [-Wcast-function-type-strict] lib/kunit/test.c:775:38: warning: cast from 'void (*)(const void *)' to 'kunit_action_t *' (aka 'void (*)(void *)') converts to incompatible function type [-Wcast-function-type-strict] Unverified Error/Warning (likely false positive, please contact us if interested): arch/arm64/kvm/mmu.c:147:3-9: preceding lock on line 140 drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.c:98 mlx5_devcom_register_device() error: uninitialized symbol 'tmp_dev'. drivers/usb/cdns3/cdns3-starfive.c:23: warning: expecting prototype for cdns3(). Prototype was for USB_STRAP_HOST() instead fs/btrfs/volumes.c:6404 btrfs_map_block() error: we previously assumed 'mirror_num_ret' could be null (see line 6242) fs/smb/client/cifsfs.c:982 cifs_smb3_do_mount() warn: possible memory leak of 'cifs_sb' fs/smb/client/cifssmb.c:4089 CIFSFindFirst() warn: missing error code? 'rc' fs/smb/client/cifssmb.c:4216 CIFSFindNext() warn: missing error code? 'rc' fs/smb/client/connect.c:2775 cifs_match_super() error: 'tlink' dereferencing possible ERR_PTR() fs/smb/client/connect.c:2974 generic_ip_connect() error: we previously assumed 'socket' could be null (see line 2962) lib/kunit/test.c:336 __kunit_abort() warn: ignoring unreachable code. 
Error/Warning ids grouped by kconfigs: gcc_recent_errors |-- alpha-allyesconfig | `-- drivers-gpu-drm-amd-amdgpu-amdgpu_ras_eeprom.c:warning:This-comment-starts-with-but-isn-t-a-kernel-doc-comment.-Refer-Documentation-doc-guide-kernel-doc.rst |-- arc-allyesconfig | `-- drivers-gpu-drm-amd-amdgpu-amdgpu_ras_eeprom.c:warning:This-comment-starts-with-but-isn-t-a-kernel-doc-comment.-Refer-Documentation-doc-guide-kernel-doc.rst |-- arm-allmodconfig | `-- drivers-gpu-drm-amd-amdgpu-amdgpu_ras_eeprom.c:warning:This-comment-starts-with-but-isn-t-a-kernel-doc-comment.-Refer-Documentation-doc-guide-kernel-doc.rst |-- arm-allyesconfig | `-- drivers-gpu-drm-amd-amdgpu-amdgpu_ras_eeprom.c:warning:This-comment-starts-with-but-isn-t-a-kernel-doc-comment.-Refer-Documentation-doc-guide-kernel-doc.rst |-- arm64-allyesconfig | `-- drivers-gpu-drm-amd-amdgpu-amdgpu_ras_eeprom.c:warning:This-comment-starts-with-but-isn-t-a-kernel-doc-comment.-Refer-Documentation-doc-guide-kernel-doc.rst |-- arm64-randconfig-c033-20230611 | `-- arch-arm64-kvm-mmu.c:preceding-lock-on-line |-- csky-randconfig-c044-20230612 | |-- csky-linux-ld:drivers-net-ethernet-sfc-ef100_netdev.c:undefined-reference-to-efx_tc_netevent_event | `-- drivers-gpu-drm-amd-amdgpu-amdgpu_ras_eeprom.c:warning:This-comment-starts-with-but-isn-t-a-kernel-doc-comment.-Refer-Documentation-doc-guide-kernel-doc.rst |-- i386-allyesconfig | |-- drivers-gpu-drm-amd-amdgpu-amdgpu_ras_eeprom.c:warning:This-comment-starts-with-but-isn-t-a-kernel-doc-comment.-Refer-Documentation-doc-guide-kernel-doc.rst | `-- drivers-leds-leds-cht-wcove.c:warning:no-previous-prototype-for-cht_wc_leds_brightness_get |-- i386-randconfig-m021-20230612 | |-- fs-smb-client-cifsfs.c-cifs_smb3_do_mount()-warn:possible-memory-leak-of-cifs_sb | |-- fs-smb-client-cifssmb.c-CIFSFindFirst()-warn:missing-error-code-rc | |-- fs-smb-client-cifssmb.c-CIFSFindNext()-warn:missing-error-code-rc | |-- 
fs-smb-client-connect.c-cifs_match_super()-error:tlink-dereferencing-possible-ERR_PTR() | `-- fs-
Re: [RFC PATCH v2 00/18] Add DRM CRTC 3D LUT interface
Hello I'm completing the support for 3D LUT on R-Car DU peripheral and I have used this series as a base. I'm wondering, since quite some time has passed without any update if this series is still a thing and it makes any sense for me to try to bring it forward. I'm asking as I've noticed: "[PATCH 00/36] drm/amd/display: add AMD driver-specific properties for color mgmt" which seems to supersede this proposal with driver-specific properties. I asked Melissa privately but I wasn't able to get an hold of her, so if anyone has any clue feel free to reply :) Thanks j On Mon, Jan 09, 2023 at 01:38:28PM -0100, Melissa Wen wrote: > Hi, > > After collecting comments in different places, here is a second version > of the work on adding DRM CRTC 3D LUT support to the current DRM color > mgmt interface. In comparison to previous proposals [1][2][3], here we > add 3D LUT before gamma 1D LUT, but also a shaper 1D LUT before 3D LUT, > that means the following DRM CRTC color correction pipeline: > > Blend -> Degamma 1D LUT -> CTM -> Shaper 1D LUT -> 3D LUT -> Gamma 1D LUT > > and we also add a DRM CRTC LUT3D_MODE property, based on Alex Hung > proposal for pre-blending 3D LUT [4] (Thanks!), instead of just a > LUT3D_SIZE, that allows userspace to use different supported settings of > 3D LUT, fitting VA-API and new color API better. In this sense, I > adjusted the pre-blending proposal for post-blending usage. > > Patches 1-6 targets the addition of shaper LUT and 3D LUT properties to > the current DRM CRTC color mgmt pipeline. Patch 6 can be considered an > extra/optional patch to define a default value for LUT3D_MODE, inspired > by what we do for the plane blend mode property (pre-multiplied). > > Patches 7-18 targets AMD display code to enable shaper and 3D LUT usage > on DCN 301 (our HW case). 
Patches 7-9 performs code cleanups on current > AMD DM colors code, patch 10 updates AMD stream in case of user 3D LUT > changes, patch 11/12 rework AMD MPC 3D LUT resource handling by context > for DCN 301 (easily extendible to other DCN families). Finally, from > 13-18, we wire up SHAPER LUT, LUT3D and LUT3D MODE to AMD display > driver, exposing modes supported by HW and programming user shaper and > 3D LUT accordingly. > > Our target userspace is Gamescope/SteamOS. > > Basic IGT tests were based on [5][6] and are available here (in-progress): > https://gitlab.freedesktop.org/mwen/igt-gpu-tools/-/commits/crtc-lut3d-api > > [1] > https://lore.kernel.org/all/20201221015730.28333-1-laurent.pinchart+rene...@ideasonboard.com/ > [2] > https://github.com/vsyrjala/linux/commit/4d28e8ddf2a076f30f9e5bdc17cbb4656fe23e69 > [3] https://lore.kernel.org/amd-gfx/20220619223104.667413-1-m...@igalia.com/ > [4] > https://lore.kernel.org/dri-devel/20221004211451.1475215-1-alex.h...@amd.com/ > [5] https://patchwork.freedesktop.org/series/90165/ > [6] https://patchwork.freedesktop.org/series/109402/ > [VA_API] > http://intel.github.io/libva/structVAProcFilterParameterBuffer3DLUT.html > [KMS_pipe_API] https://gitlab.freedesktop.org/pq/color-and-hdr/-/issues/11 > > Let me know your thoughts. 
> > Thanks, > > Melissa > > Alex Hung (2): > drm: Add 3D LUT mode and its attributes > drm/amd/display: Define 3D LUT struct for HDR planes > > Melissa Wen (16): > drm/drm_color_mgmt: add shaper LUT to color mgmt properties > drm/drm_color_mgmt: add 3D LUT props to DRM color mgmt > drm/drm_color_mgmt: add function to create 3D LUT modes supported > drm/drm_color_mgmt: add function to attach 3D LUT props > drm/drm_color_mgmt: set first lut3d mode as default > drm/amd/display: remove unused regamma condition > drm/amd/display: add comments to describe DM crtc color mgmt behavior > drm/amd/display: encapsulate atomic regamma operation > drm/amd/display: update lut3d and shaper lut to stream > drm/amd/display: handle MPC 3D LUT resources for a given context > drm/amd/display: acquire/release 3D LUT resources for ctx on DCN301 > drm/amd/display: expand array of supported 3D LUT modes > drm/amd/display: enable 3D-LUT DRM properties if supported > drm/amd/display: add user 3D LUT support to the amdgpu_dm color > pipeline > drm/amd/display: decouple steps to reuse in shaper LUT support > drm/amd/display: add user shaper LUT support to amdgpu_dm color > pipeline > > .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 6 + > .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h | 3 + > .../amd/display/amdgpu_dm/amdgpu_dm_color.c | 370 -- > .../amd/display/amdgpu_dm/amdgpu_dm_crtc.c| 2 + > drivers/gpu/drm/amd/display/dc/core/dc.c | 49 ++- > drivers/gpu/drm/amd/display/dc/dc.h | 8 + > .../amd/display/dc/dcn301/dcn301_resource.c | 47 ++- > .../amd/display/modules/color/color_gamma.h | 43 ++ > drivers/gpu/drm/drm_atomic_state_helper.c | 7 + > drivers/gpu/drm/drm_atomic_uapi.c | 24 ++ > drivers/gpu/drm/drm_color_mgmt.c | 127 ++ >
[PATCH 2/2] drm/amdgpu/pm: make mclk consistent for smu 13.0.7
Use current uclk to be consistent with other dGPUs. Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c index cda4e818aab7..8eb8c30e6c69 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c @@ -949,7 +949,7 @@ static int smu_v13_0_7_read_sensor(struct smu_context *smu, break; case AMDGPU_PP_SENSOR_GFX_MCLK: ret = smu_v13_0_7_get_smu_metrics_data(smu, - METRICS_AVERAGE_UCLK, + METRICS_CURR_UCLK, (uint32_t *)data); *(uint32_t *)data *= 100; *size = 4; -- 2.40.1
[PATCH 1/2] drm/amdgpu/pm: make gfxclock consistent for sienna cichlid
Use average gfxclock for consistency with other dGPUs. Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c | 8 ++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c index f7ed3e655e39..1b7d93709a35 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c @@ -1927,12 +1927,16 @@ static int sienna_cichlid_read_sensor(struct smu_context *smu, *size = 4; break; case AMDGPU_PP_SENSOR_GFX_MCLK: - ret = sienna_cichlid_get_current_clk_freq_by_table(smu, SMU_UCLK, (uint32_t *)data); + ret = sienna_cichlid_get_smu_metrics_data(smu, + METRICS_CURR_UCLK, + (uint32_t *)data); *(uint32_t *)data *= 100; *size = 4; break; case AMDGPU_PP_SENSOR_GFX_SCLK: - ret = sienna_cichlid_get_current_clk_freq_by_table(smu, SMU_GFXCLK, (uint32_t *)data); + ret = sienna_cichlid_get_smu_metrics_data(smu, + METRICS_AVERAGE_GFXCLK, + (uint32_t *)data); *(uint32_t *)data *= 100; *size = 4; break; -- 2.40.1
Re: [PATCH 10/66] drm/amd/display: Do not set drr on pipe commit
On 6/12/23 20:14, Pillai, Aurabindo wrote: > > I want to double check if we're identifying the correct monitor for applying > the workaround. Could you please try the attached patch and let me know the > panel id ? amdgpu: ### Not applying any edid quirk for panel 4c2d71ac I'm attaching the EDID. BTW, I'm using the monitor firmware version 1011.0, which AFAICT is the latest. -- Earthling Michel Dänzer| https://redhat.com Libre software enthusiast | Mesa and Xwayland developer edid Description: Binary data
[PATCHv3] drm/amdgpu: Update invalid PTE flag setting
Update the invalid PTE flag setting with TF enabled. This is to ensure, in addition to transitioning the retry fault to a no-retry fault, it also causes the wavefront to enter the trap handler. With the current setting, the fault only transitions to a no-retry fault. Additionally, have 2 sets of invalid PTE settings, one for TF enabled, the other for TF disabled. The setting with TF disabled, doesn't work with TF enabled. Signed-off-by: Mukul Joshi --- v1->v2: - Update handling according to Christian's feedback. v2->v3: - Remove ASIC specific callback (Felix). drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c| 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h| 6 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 21 + 3 files changed, 28 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 1cb14ea18cd9..ff9db7e5c086 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -2583,7 +2583,7 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid, /* Intentionally setting invalid PTE flag * combination to force a no-retry-fault */ - flags = AMDGPU_PTE_SNOOPED | AMDGPU_PTE_PRT; + flags = AMDGPU_VM_NORETRY_FLAGS; value = 0; } else if (amdgpu_vm_fault_stop == AMDGPU_VM_FAULT_STOP_NEVER) { /* Redirect the access to the dummy page */ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h index 9c85d494f2a2..b81fcb962d8f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h @@ -84,7 +84,13 @@ struct amdgpu_mem_stats; /* PDE Block Fragment Size for VEGA10 */ #define AMDGPU_PDE_BFS(a) ((uint64_t)a << 59) +/* Flag combination to set no-retry with TF disabled */ +#define AMDGPU_VM_NORETRY_FLAGS(AMDGPU_PTE_EXECUTABLE | AMDGPU_PDE_PTE | \ + AMDGPU_PTE_TF) +/* Flag combination to set no-retry with TF enabled */ +#define AMDGPU_VM_NORETRY_FLAGS_TF (AMDGPU_PTE_VALID | AMDGPU_PTE_SYSTEM | \ + 
AMDGPU_PTE_PRT) /* For GFX9 */ #define AMDGPU_PTE_MTYPE_VG10(a) ((uint64_t)(a) << 57) #define AMDGPU_PTE_MTYPE_VG10_MASK AMDGPU_PTE_MTYPE_VG10(3ULL) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c index dea1a64be44d..45b26cad59cd 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c @@ -778,6 +778,24 @@ int amdgpu_vm_pde_update(struct amdgpu_vm_update_params *params, 1, 0, flags); } +/** + * amdgpu_vm_pte_update_noretry_flags - Update PTE no-retry flags + * + * @adev - amdgpu_device pointer + * @flags: pointer to PTE flags + * + * Update PTE no-retry flags when TF is enabled. + */ +static void amdgpu_vm_pte_update_noretry_flags(struct amdgpu_device *adev, + uint64_t *flags) +{ + /* Update no retry flags when TF is enabled */ + if ((*flags & AMDGPU_VM_NORETRY_FLAGS) == AMDGPU_VM_NORETRY_FLAGS) { + *flags &= ~AMDGPU_VM_NORETRY_FLAGS; + *flags |= AMDGPU_VM_NORETRY_FLAGS_TF; + } +} + /* * amdgpu_vm_pte_update_flags - figure out flags for PTE updates * @@ -804,6 +822,9 @@ static void amdgpu_vm_pte_update_flags(struct amdgpu_vm_update_params *params, flags |= AMDGPU_PTE_EXECUTABLE; } + if (adev->gmc.translate_further && level == AMDGPU_VM_PTB) + amdgpu_vm_pte_update_noretry_flags(adev, ); + /* APUs mapping system memory may need different MTYPEs on different * NUMA nodes. Only do this for contiguous ranges that can be assumed * to be on the same NUMA node. -- 2.35.1
[PATCH 2/3] drm/amdgpu: cache gpuvm fault information for gmc7+
Cache the current fault info in the vm struct. This can be queried by userspace later to help debug UMDs. Cc: samuel.pitoi...@gmail.com Acked-by: Guchun Chen Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 3 +++ drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c | 3 +++ drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c | 3 +++ drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c | 3 +++ drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 11 +++ 5 files changed, 19 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c index b2e42f1b0f12..ccb69f5b06fe 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c @@ -155,6 +155,9 @@ static int gmc_v10_0_process_interrupt(struct amdgpu_device *adev, status = RREG32(hub->vm_l2_pro_fault_status); WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1); + + amdgpu_vm_update_fault_cache(adev, entry->pasid, addr, status, +entry->vmid_src ? AMDGPU_MMHUB0(0) : AMDGPU_GFXHUB(0)); } if (!printk_ratelimit()) diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c index c571f0d95994..ae35dc6ba502 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c @@ -115,6 +115,9 @@ static int gmc_v11_0_process_interrupt(struct amdgpu_device *adev, status = RREG32(hub->vm_l2_pro_fault_status); WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1); + + amdgpu_vm_update_fault_cache(adev, entry->pasid, addr, status, +entry->vmid_src ? 
AMDGPU_MMHUB0(0) : AMDGPU_GFXHUB(0)); } if (printk_ratelimit()) { diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c index acd2b407860f..d51cad788769 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c @@ -1273,6 +1273,9 @@ static int gmc_v7_0_process_interrupt(struct amdgpu_device *adev, if (!addr && !status) return 0; + amdgpu_vm_update_fault_cache(adev, entry->pasid, +((u64)addr) << AMDGPU_GPU_PAGE_SHIFT, status, AMDGPU_GFXHUB(0)); + if (amdgpu_vm_fault_stop == AMDGPU_VM_FAULT_STOP_FIRST) gmc_v7_0_set_fault_enable_default(adev, false); diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c index 85dead2a5702..8ce7455ff3f6 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c @@ -1448,6 +1448,9 @@ static int gmc_v8_0_process_interrupt(struct amdgpu_device *adev, if (!addr && !status) return 0; + amdgpu_vm_update_fault_cache(adev, entry->pasid, +((u64)addr) << AMDGPU_GPU_PAGE_SHIFT, status, AMDGPU_GFXHUB(0)); + if (amdgpu_vm_fault_stop == AMDGPU_VM_FAULT_STOP_FIRST) gmc_v8_0_set_fault_enable_default(adev, false); diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index 3ed286b72cae..fbd65872050a 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -555,6 +555,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev, struct amdgpu_vmhub *hub; const char *mmhub_cid; const char *hub_name; + unsigned int vmhub; u64 addr; uint32_t cam_index = 0; int ret, xcc_id = 0; @@ -567,10 +568,10 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev, if (entry->client_id == SOC15_IH_CLIENTID_VMC) { hub_name = "mmhub0"; - hub = >vmhub[AMDGPU_MMHUB0(node_id / 4)]; + vmhub = AMDGPU_MMHUB0(node_id / 4); } else if (entry->client_id == SOC15_IH_CLIENTID_VMC1) { hub_name = "mmhub1"; - hub = >vmhub[AMDGPU_MMHUB1(0)]; + 
vmhub = AMDGPU_MMHUB1(0); } else { hub_name = "gfxhub0"; if (adev->gfx.funcs->ih_node_to_logical_xcc) { @@ -579,8 +580,9 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev, if (xcc_id < 0) xcc_id = 0; } - hub = >vmhub[xcc_id]; + vmhub = xcc_id; } + hub = >vmhub[vmhub]; if (retry_fault) { if (adev->irq.retry_cam_enabled) { @@ -626,7 +628,6 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev, if (!printk_ratelimit()) return 0; - memset(_info, 0, sizeof(struct amdgpu_task_info)); amdgpu_vm_get_task_info(adev, entry->pasid, _info); @@ -663,6 +664,8 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
[PATCH 3/3] drm/amdgpu: add new INFO ioctl query for the last GPU page fault
Add an interface to query the last GPU page fault for the process. Useful for debugging context lost errors. v2: split vmhub representation between kernel and userspace Mesa MR: https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23238 libdrm MR: https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23238 Cc: samuel.pitoi...@gmail.com Acked-by: Guchun Chen Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 3 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 16 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 16 +++- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 13 ++--- include/uapi/drm/amdgpu_drm.h | 16 5 files changed, 59 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 999d008b6b48..6b053bab799c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -113,9 +113,10 @@ *gl1c_cache_size, gl2c_cache_size, mall_size, enabled_rb_pipes_mask_hi * 3.53.0 - Support for GFX11 CP GFX shadowing * 3.54.0 - Add AMDGPU_CTX_QUERY2_FLAGS_RESET_IN_PROGRESS support + * - 3.55.0 - Add AMDGPU_INFO_GPUVM_FAULT query */ #define KMS_DRIVER_MAJOR 3 -#define KMS_DRIVER_MINOR 54 +#define KMS_DRIVER_MINOR 55 #define KMS_DRIVER_PATCHLEVEL 0 unsigned int amdgpu_vram_limit = UINT_MAX; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c index e3531aa3c8bd..2289f8eb3d1d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c @@ -1163,6 +1163,22 @@ int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) return copy_to_user(out, max_ibs, min((size_t)size, sizeof(max_ibs))) ?
-EFAULT : 0; } + case AMDGPU_INFO_GPUVM_FAULT: { + struct amdgpu_fpriv *fpriv = filp->driver_priv; + struct amdgpu_vm *vm = >vm; + struct drm_amdgpu_info_gpuvm_fault gpuvm_fault; + + if (!vm) + return -EINVAL; + + memset(_fault, 0, sizeof(gpuvm_fault)); + gpuvm_fault.addr = vm->fault_info.addr; + gpuvm_fault.status = vm->fault_info.status; + gpuvm_fault.vmhub = vm->fault_info.vmhub; + + return copy_to_user(out, _fault, + min((size_t)size, sizeof(gpuvm_fault))) ? -EFAULT : 0; + } default: DRM_DEBUG_KMS("Invalid request %d\n", info->query); return -EINVAL; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 5423f66a9ed8..2c1106855492 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -2666,7 +2666,21 @@ void amdgpu_vm_update_fault_cache(struct amdgpu_device *adev, if (vm) { vm->fault_info.addr = addr; vm->fault_info.status = status; - vm->fault_info.vmhub = vmhub; + if (AMDGPU_IS_GFXHUB(vmhub)) { + vm->fault_info.vmhub = AMDGPU_VMHUB_TYPE_GFX; + vm->fault_info.vmhub |= + (vmhub - AMDGPU_GFXHUB_START) << AMDGPU_VMHUB_IDX_SHIFT; + } else if (AMDGPU_IS_MMHUB0(vmhub)) { + vm->fault_info.vmhub = AMDGPU_VMHUB_TYPE_MM0; + vm->fault_info.vmhub |= + (vmhub - AMDGPU_MMHUB0_START) << AMDGPU_VMHUB_IDX_SHIFT; + } else if (AMDGPU_IS_MMHUB1(vmhub)) { + vm->fault_info.vmhub = AMDGPU_VMHUB_TYPE_MM1; + vm->fault_info.vmhub |= + (vmhub - AMDGPU_MMHUB1_START) << AMDGPU_VMHUB_IDX_SHIFT; + } else { + WARN_ONCE(1, "Invalid vmhub %u\n", vmhub); + } } xa_unlock_irqrestore(>vm_manager.pasids, flags); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h index fb66a413110c..1a34fea9acb9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h @@ -116,9 +116,16 @@ struct amdgpu_mem_stats; * layout: max 8 GFXHUB + 4 MMHUB0 + 1 MMHUB1 */ #define AMDGPU_MAX_VMHUBS 13 -#define AMDGPU_GFXHUB(x) (x) -#define AMDGPU_MMHUB0(x) 
(8 + x) -#define AMDGPU_MMHUB1(x) (8 + 4 + x) +#define AMDGPU_GFXHUB_START0 +#define AMDGPU_MMHUB0_START8 +#define AMDGPU_MMHUB1_START12 +#define AMDGPU_GFXHUB(x) (AMDGPU_GFXHUB_START + (x)) +#define AMDGPU_MMHUB0(x) (AMDGPU_MMHUB0_START + (x)) +#define
[PATCH 1/3] drm/amdgpu: add cached GPU fault structure to vm struct
When we get a GPU page fault, cache the fault for later analysis. Cc: samuel.pitoi...@gmail.com Acked-by: Guchun Chen Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 31 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 18 +++ 2 files changed, 49 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index dc80c9c8fd14..5423f66a9ed8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -2640,3 +2640,34 @@ void amdgpu_debugfs_vm_bo_info(struct amdgpu_vm *vm, struct seq_file *m) total_done_objs); } #endif + +/** + * amdgpu_vm_update_fault_cache - update cached fault info. + * @adev: amdgpu device pointer + * @pasid: PASID of the VM + * @addr: Address of the fault + * @status: GPUVM fault status register + * @vmhub: which vmhub got the fault + * + * Cache the fault info for later use by userspace in debugging. + */ +void amdgpu_vm_update_fault_cache(struct amdgpu_device *adev, + unsigned int pasid, + uint64_t addr, + uint32_t status, + unsigned int vmhub) +{ + struct amdgpu_vm *vm; + unsigned long flags; + + xa_lock_irqsave(>vm_manager.pasids, flags); + + vm = xa_load(>vm_manager.pasids, pasid); + if (vm) { + vm->fault_info.addr = addr; + vm->fault_info.status = status; + vm->fault_info.vmhub = vmhub; + } + xa_unlock_irqrestore(>vm_manager.pasids, flags); +} + diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h index 14f9a2bf3acb..fb66a413110c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h @@ -244,6 +244,15 @@ struct amdgpu_vm_update_funcs { struct dma_fence **fence); }; +struct amdgpu_vm_fault_info { + /* fault address */ + uint64_taddr; + /* fault status register */ + uint32_tstatus; + /* which vmhub? gfxhub, mmhub, etc. 
*/ + unsigned intvmhub; +}; + struct amdgpu_vm { /* tree of virtual addresses mapped */ struct rb_root_cached va; @@ -332,6 +341,9 @@ struct amdgpu_vm { /* Memory partition number, -1 means any partition */ int8_t mem_id; + + /* cached fault info */ + struct amdgpu_vm_fault_info fault_info; }; struct amdgpu_vm_manager { @@ -540,4 +552,10 @@ static inline void amdgpu_vm_eviction_unlock(struct amdgpu_vm *vm) mutex_unlock(>eviction_lock); } +void amdgpu_vm_update_fault_cache(struct amdgpu_device *adev, + unsigned int pasid, + uint64_t addr, + uint32_t status, + unsigned int vmhub); + #endif -- 2.40.1
[PATCH v2 0/3] Add GPU page fault query interface
This patch set adds support for an application to query GPU page faults. It's useful for debugging and there are vulkan extensions that could make use of this. Preliminary user space code which uses this can be found here: https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23238 https://gitlab.freedesktop.org/mesa/drm/-/merge_requests/298 Note that I made a small change to the vmhub definition to decouple it from how the kernel tracks vmhubs so that we have a consistent user view even if we decide to add more vmhubs like we recently did for gfx 9.4.3. I've also pushed the changes to: https://gitlab.freedesktop.org/agd5f/linux/-/commits/gpu_fault_info_ioctl Open question, currently we just expose the raw GPU fault status register value for each GPU so UMDs need GPU specific knowledge to decode it, although it's largely the same across generations. One option would be to translate to a generic GPU independent fault status. Opinions? v2: - Fix spelling typos noted by Guchun Alex Deucher (3): drm/amdgpu: add cached GPU fault structure to vm struct drm/amdgpu: cache gpuvm fault information for gmc7+ drm/amdgpu: add new INFO ioctl query for the last GPU page fault drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 3 +- drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 16 + drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 45 + drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 31 +++-- drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 3 ++ drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c | 3 ++ drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c | 3 ++ drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c | 3 ++ drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 11 +++--- include/uapi/drm/amdgpu_drm.h | 16 + 10 files changed, 126 insertions(+), 8 deletions(-) -- 2.40.1
Re: [PATCH 3/9] drm/ttm: use per BO cleanup workers
On Tue, Jun 13, 2023 at 3:59 PM Christian König wrote: > > Am 13.06.23 um 15:05 schrieb Karol Herbst: > > On Mon, Dec 5, 2022 at 2:40 PM Christian König > > wrote: > >> Am 29.11.22 um 22:14 schrieb Felix Kuehling: > >>> On 2022-11-25 05:21, Christian König wrote: > Instead of a single worker going over the list of delete BOs in regular > intervals use a per BO worker which blocks for the resv object and > locking of the BO. > > This not only simplifies the handling massively, but also results in > much better response time when cleaning up buffers. > > Signed-off-by: Christian König > >>> Just thinking out loud: If I understand it correctly, this can cause a > >>> lot of sleeping worker threads when > >>> AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE is used and many BOs are freed > >>> at the same time. This happens e.g. when a KFD process terminates or > >>> crashes. I guess with a concurrency-managed workqueue this isn't going > >>> to be excessive. And since it's on a per device workqueue, it doesn't > >>> stall work items on the system work queue or from other devices. > >> Yes, exactly that. The last parameter to alloc_workqueue() limits how > >> many work items can be sleeping. > >> > >>> I'm trying to understand why you set WQ_MEM_RECLAIM. This work queue > >>> is not about freeing ttm_resources but about freeing the BOs. But it > >>> affects freeing of ghost_objs that are holding the ttm_resources being > >>> freed. > >> Well if the BO is idle, but not immediately lockable we delegate freeing > >> the backing pages in the TT object to those workers as well. It might > >> even be a good idea to use a separate wq for this case. > >> > >>> If those assumptions all make sense, patches 1-3 are > >>> > >>> Reviewed-by: Felix Kuehling > >> Thanks, > >> Christian. > >> > > This patch causes a heap use-after-free when using nouveau with the > > potential of trashing filesystems, is there a way to revert it until > > we figure out a proper solution to the problem? 
> > Uff I don't think so, we have quite some work based on top of this. But > let me double check. > yeah.. I already talked with Dave about fixing this issue as Dave has more knowledge on this part of the driver (I hope), so we might have a fix soonish, but the concerning part is, that it's already out to users, so might be better to be able to revert it if the fix takes a while to emerge. > On the other hand have you tried running this with KASAN to catch use > after free errors? yes: https://gitlab.freedesktop.org/drm/nouveau/-/issues/213#note_1942777 > > Since we now block for work to finish and not check every few > milliseconds to garbage collect memory will now be reclaimed much faster > after freeing it. yeah, that kinda makes sense. This entire issue feels like a race happening as I need to run the OpenGL CTS in parallel with 8+ threads to trigger it reliably. > > Regards, > Christian. > > > > > Bug: https://gitlab.freedesktop.org/drm/nouveau/-/issues/213 > > > > example trace on affected systems: > > > > [ 4102.946946] general protection fault, probably for non-canonical > > address 0x5f775ce3bd949b45: [#3] PREEMPT SMP NOPTI > > [ 4102.957794] CPU: 12 PID: 89561 Comm: glcts Tainted: G D > > 6.3.5-200.fc38.x86_64 #1 > > [ 4102.966556] Hardware name: ASUS System Product Name/PRIME B660-PLUS > > D4, BIOS 0418 10/13/2021 > > [ 4102.974972] RIP: 0010:__kmem_cache_alloc_node+0x1ba/0x320 > > [ 4102.980362] Code: 2b 14 25 28 00 00 00 0f 85 74 01 00 00 48 83 c4 > > 18 5b 5d 41 5c 41 5d 41 5e 41 5f c3 cc cc cc cc 41 8b 47 28 4d 8b 07 > > 48 01 f8 <48> 8b 18 48 89 c1 49 33 9f b8 00 00 00 48 0f c9 48 31 cb 41 > > f6 c0 > > [ 4102.999073] RSP: 0018:9764e0057b40 EFLAGS: 00010202 > > [ 4103.004291] RAX: 5f775ce3bd949b45 RBX: 0dc0 RCX: > > 0046 > > [ 4103.011408] RDX: 0002cf87600c RSI: 0dc0 RDI: > > 5f775ce3bd949b15 > > [ 4103.018528] RBP: 0dc0 R08: 000390c0 R09: > > 30302d6d > > [ 4103.025649] R10: 756c7473 R11: 20090298 R12: > > > > [ 4103.032767] R13: R14: 0046 R15: > 
> 8bda80042600 > > [ 4103.039887] FS: 7f386a85ef00() GS:8be1df70() > > knlGS: > > [ 4103.047958] CS: 0010 DS: ES: CR0: 80050033 > > [ 4103.053692] CR2: 0493b868 CR3: 00014c3ba000 CR4: > > 00f50ee0 > > [ 4103.060812] PKRU: 5554 > > [ 4103.063520] Call Trace: > > [ 4103.065970] > > [ 4103.068071] ? die_addr+0x36/0x90 > > [ 4103.071384] ? exc_general_protection+0x1be/0x420 > > [ 4103.076081] ? asm_exc_general_protection+0x26/0x30 > > [ 4103.080952] ? __kmem_cache_alloc_node+0x1ba/0x320 > > [ 4103.085734] ? ext4_htree_store_dirent+0x42/0x180 > > [ 4103.090431] ? ext4_htree_store_dirent+0x42/0x180 > > [ 4103.095132] __kmalloc+0x4d/0x150 > > [ 4103.098444] ext4_htree_store_dirent+0x42/0x180 > > [
Re: [PATCH] drm/amdgpu/sdma4: set align mask to 255
Reviewed-by: Christian König Am 13.06.23 um 03:14 schrieb Liu, Aaron: [AMD Official Use Only - General] Reviewed-by: Aaron Liu -Original Message- From: amd-gfx On Behalf Of Alex Deucher Sent: Tuesday, June 13, 2023 5:48 AM To: Deucher, Alexander Cc: amd-gfx@lists.freedesktop.org Subject: Re: [PATCH] drm/amdgpu/sdma4: set align mask to 255 ping? On Wed, Jun 7, 2023 at 12:31 PM Alex Deucher wrote: The wptr needs to be incremented at at least 64 dword intervals, use 256 to align with windows. This should fix potential hangs with unaligned updates. Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c | 4 ++-- drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c index 1f83eebfc8a7..cd37f45e01a1 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c @@ -2312,7 +2312,7 @@ const struct amd_ip_funcs sdma_v4_0_ip_funcs = { static const struct amdgpu_ring_funcs sdma_v4_0_ring_funcs = { .type = AMDGPU_RING_TYPE_SDMA, - .align_mask = 0xf, + .align_mask = 0xff, .nop = SDMA_PKT_NOP_HEADER_OP(SDMA_OP_NOP), .support_64bit_ptrs = true, .secure_submission_supported = true, @@ -2344,7 +2344,7 @@ static const struct amdgpu_ring_funcs sdma_v4_0_ring_funcs = { static const struct amdgpu_ring_funcs sdma_v4_0_page_ring_funcs = { .type = AMDGPU_RING_TYPE_SDMA, - .align_mask = 0xf, + .align_mask = 0xff, .nop = SDMA_PKT_NOP_HEADER_OP(SDMA_OP_NOP), .support_64bit_ptrs = true, .secure_submission_supported = true, diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c index 8eebf9c2bbcd..05bb0691ee0e 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c @@ -1823,7 +1823,7 @@ const struct amd_ip_funcs sdma_v4_4_2_ip_funcs = { static const struct amdgpu_ring_funcs sdma_v4_4_2_ring_funcs = { .type = 
AMDGPU_RING_TYPE_SDMA, - .align_mask = 0xf, + .align_mask = 0xff, .nop = SDMA_PKT_NOP_HEADER_OP(SDMA_OP_NOP), .support_64bit_ptrs = true, .get_rptr = sdma_v4_4_2_ring_get_rptr, @@ -1854,7 +1854,7 @@ static const struct amdgpu_ring_funcs sdma_v4_4_2_ring_funcs = { static const struct amdgpu_ring_funcs sdma_v4_4_2_page_ring_funcs = { .type = AMDGPU_RING_TYPE_SDMA, - .align_mask = 0xf, + .align_mask = 0xff, .nop = SDMA_PKT_NOP_HEADER_OP(SDMA_OP_NOP), .support_64bit_ptrs = true, .get_rptr = sdma_v4_4_2_ring_get_rptr, -- 2.40.1
Re: [PATCH 3/9] drm/ttm: use per BO cleanup workers
Am 13.06.23 um 15:05 schrieb Karol Herbst: On Mon, Dec 5, 2022 at 2:40 PM Christian König wrote: Am 29.11.22 um 22:14 schrieb Felix Kuehling: On 2022-11-25 05:21, Christian König wrote: Instead of a single worker going over the list of delete BOs in regular intervals use a per BO worker which blocks for the resv object and locking of the BO. This not only simplifies the handling massively, but also results in much better response time when cleaning up buffers. Signed-off-by: Christian König Just thinking out loud: If I understand it correctly, this can cause a lot of sleeping worker threads when AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE is used and many BOs are freed at the same time. This happens e.g. when a KFD process terminates or crashes. I guess with a concurrency-managed workqueue this isn't going to be excessive. And since it's on a per device workqueue, it doesn't stall work items on the system work queue or from other devices. Yes, exactly that. The last parameter to alloc_workqueue() limits how many work items can be sleeping. I'm trying to understand why you set WQ_MEM_RECLAIM. This work queue is not about freeing ttm_resources but about freeing the BOs. But it affects freeing of ghost_objs that are holding the ttm_resources being freed. Well if the BO is idle, but not immediately lockable we delegate freeing the backing pages in the TT object to those workers as well. It might even be a good idea to use a separate wq for this case. If those assumptions all make sense, patches 1-3 are Reviewed-by: Felix Kuehling Thanks, Christian. This patch causes a heap use-after-free when using nouveau with the potential of trashing filesystems, is there a way to revert it until we figure out a proper solution to the problem? Uff I don't think so, we have quite some work based on top of this. But let me double check. On the other hand have you tried running this with KASAN to catch use after free errors? 
Since we now block for work to finish and not check every few milliseconds to garbage collect memory will now be reclaimed much faster after freeing it. Regards, Christian. Bug: https://gitlab.freedesktop.org/drm/nouveau/-/issues/213 example trace on affected systems: [ 4102.946946] general protection fault, probably for non-canonical address 0x5f775ce3bd949b45: [#3] PREEMPT SMP NOPTI [ 4102.957794] CPU: 12 PID: 89561 Comm: glcts Tainted: G D 6.3.5-200.fc38.x86_64 #1 [ 4102.966556] Hardware name: ASUS System Product Name/PRIME B660-PLUS D4, BIOS 0418 10/13/2021 [ 4102.974972] RIP: 0010:__kmem_cache_alloc_node+0x1ba/0x320 [ 4102.980362] Code: 2b 14 25 28 00 00 00 0f 85 74 01 00 00 48 83 c4 18 5b 5d 41 5c 41 5d 41 5e 41 5f c3 cc cc cc cc 41 8b 47 28 4d 8b 07 48 01 f8 <48> 8b 18 48 89 c1 49 33 9f b8 00 00 00 48 0f c9 48 31 cb 41 f6 c0 [ 4102.999073] RSP: 0018:9764e0057b40 EFLAGS: 00010202 [ 4103.004291] RAX: 5f775ce3bd949b45 RBX: 0dc0 RCX: 0046 [ 4103.011408] RDX: 0002cf87600c RSI: 0dc0 RDI: 5f775ce3bd949b15 [ 4103.018528] RBP: 0dc0 R08: 000390c0 R09: 30302d6d [ 4103.025649] R10: 756c7473 R11: 20090298 R12: [ 4103.032767] R13: R14: 0046 R15: 8bda80042600 [ 4103.039887] FS: 7f386a85ef00() GS:8be1df70() knlGS: [ 4103.047958] CS: 0010 DS: ES: CR0: 80050033 [ 4103.053692] CR2: 0493b868 CR3: 00014c3ba000 CR4: 00f50ee0 [ 4103.060812] PKRU: 5554 [ 4103.063520] Call Trace: [ 4103.065970] [ 4103.068071] ? die_addr+0x36/0x90 [ 4103.071384] ? exc_general_protection+0x1be/0x420 [ 4103.076081] ? asm_exc_general_protection+0x26/0x30 [ 4103.080952] ? __kmem_cache_alloc_node+0x1ba/0x320 [ 4103.085734] ? ext4_htree_store_dirent+0x42/0x180 [ 4103.090431] ? 
ext4_htree_store_dirent+0x42/0x180 [ 4103.095132] __kmalloc+0x4d/0x150 [ 4103.098444] ext4_htree_store_dirent+0x42/0x180 [ 4103.102970] htree_dirblock_to_tree+0x1ed/0x370 [ 4103.107494] ext4_htree_fill_tree+0x109/0x3d0 [ 4103.111846] ext4_readdir+0x6d4/0xa80 [ 4103.115505] iterate_dir+0x178/0x1c0 [ 4103.119076] __x64_sys_getdents64+0x88/0x130 [ 4103.123341] ? __pfx_filldir64+0x10/0x10 [ 4103.127260] do_syscall_64+0x5d/0x90 [ 4103.130835] ? handle_mm_fault+0x11e/0x310 [ 4103.134927] ? do_user_addr_fault+0x1e0/0x720 [ 4103.139278] ? exc_page_fault+0x7c/0x180 [ 4103.143195] entry_SYSCALL_64_after_hwframe+0x72/0xdc [ 4103.148240] RIP: 0033:0x7f386a418047 [ 4103.151828] Code: 24 fb ff 4c 89 e0 5b 41 5c 5d c3 0f 1f 84 00 00 00 00 00 f3 0f 1e fa b8 ff ff ff 7f 48 39 c2 48 0f 47 d0 b8 d9 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 01 c3 48 8b 15 91 cd 0f 00 f7 d8 64 89 02 48 [ 4103.170543] RSP: 002b:7ffd4793ff38 EFLAGS: 0293 ORIG_RAX: 00d9 [ 4103.178095] RAX: ffda RBX: 04933830 RCX: 7f386a418047 [ 4103.185214] RDX: 8000 RSI: 04933860 RDI:
Re: [PATCH 3/3] drm/amdgpu: Remove unused NBIO interface
[Public] Sorry, replied to the wrong rev of the patch. my AB applies to v2 as well. Alex From: amd-gfx on behalf of Deucher, Alexander Sent: Tuesday, June 13, 2023 9:25 AM To: Lazar, Lijo ; amd-gfx@lists.freedesktop.org Cc: Ma, Le ; Kamal, Asad ; Zhang, Hawking Subject: Re: [PATCH 3/3] drm/amdgpu: Remove unused NBIO interface [Public] [Public] Series is: Acked-by: Alex Deucher From: Lazar, Lijo Sent: Tuesday, June 13, 2023 6:53 AM To: amd-gfx@lists.freedesktop.org Cc: Zhang, Hawking ; Deucher, Alexander ; Kamal, Asad ; Ma, Le Subject: [PATCH 3/3] drm/amdgpu: Remove unused NBIO interface Set compute partition mode interface in NBIO is no longer used. Remove the only implementation from NBIO v7.9 Signed-off-by: Lijo Lazar --- drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h | 2 -- drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c | 14 -- 2 files changed, 16 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h index 095aecfb201e..8ab8ae01f87c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h @@ -99,8 +99,6 @@ struct amdgpu_nbio_funcs { int (*get_compute_partition_mode)(struct amdgpu_device *adev); u32 (*get_memory_partition_mode)(struct amdgpu_device *adev, u32 *supp_modes); - void (*set_compute_partition_mode)(struct amdgpu_device *adev, - enum amdgpu_gfx_partition mode); }; struct amdgpu_nbio { diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c index b033935d6749..cd1a02d30420 100644 --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c @@ -393,19 +393,6 @@ static int nbio_v7_9_get_compute_partition_mode(struct amdgpu_device *adev) return px; } -static void nbio_v7_9_set_compute_partition_mode(struct amdgpu_device *adev, - enum amdgpu_gfx_partition mode) -{ - u32 tmp; - - /* SPX=0, DPX=1, TPX=2, QPX=3, CPX=4 */ - tmp = RREG32_SOC15(NBIO, 0, regBIF_BX_PF0_PARTITION_COMPUTE_STATUS); - tmp = 
REG_SET_FIELD(tmp, BIF_BX_PF0_PARTITION_COMPUTE_STATUS, - PARTITION_MODE, mode); - - WREG32_SOC15(NBIO, 0, regBIF_BX_PF0_PARTITION_COMPUTE_STATUS, tmp); -} - static u32 nbio_v7_9_get_memory_partition_mode(struct amdgpu_device *adev, u32 *supp_modes) { @@ -461,7 +448,6 @@ const struct amdgpu_nbio_funcs nbio_v7_9_funcs = { .ih_control = nbio_v7_9_ih_control, .remap_hdp_registers = nbio_v7_9_remap_hdp_registers, .get_compute_partition_mode = nbio_v7_9_get_compute_partition_mode, - .set_compute_partition_mode = nbio_v7_9_set_compute_partition_mode, .get_memory_partition_mode = nbio_v7_9_get_memory_partition_mode, .init_registers = nbio_v7_9_init_registers, }; -- 2.25.1
Re: [PATCH 3/3] drm/amdgpu: Remove unused NBIO interface
[Public] Series is: Acked-by: Alex Deucher From: Lazar, Lijo Sent: Tuesday, June 13, 2023 6:53 AM To: amd-gfx@lists.freedesktop.org Cc: Zhang, Hawking ; Deucher, Alexander ; Kamal, Asad ; Ma, Le Subject: [PATCH 3/3] drm/amdgpu: Remove unused NBIO interface Set compute partition mode interface in NBIO is no longer used. Remove the only implementation from NBIO v7.9 Signed-off-by: Lijo Lazar --- drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h | 2 -- drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c | 14 -- 2 files changed, 16 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h index 095aecfb201e..8ab8ae01f87c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h @@ -99,8 +99,6 @@ struct amdgpu_nbio_funcs { int (*get_compute_partition_mode)(struct amdgpu_device *adev); u32 (*get_memory_partition_mode)(struct amdgpu_device *adev, u32 *supp_modes); - void (*set_compute_partition_mode)(struct amdgpu_device *adev, - enum amdgpu_gfx_partition mode); }; struct amdgpu_nbio { diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c index b033935d6749..cd1a02d30420 100644 --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c @@ -393,19 +393,6 @@ static int nbio_v7_9_get_compute_partition_mode(struct amdgpu_device *adev) return px; } -static void nbio_v7_9_set_compute_partition_mode(struct amdgpu_device *adev, - enum amdgpu_gfx_partition mode) -{ - u32 tmp; - - /* SPX=0, DPX=1, TPX=2, QPX=3, CPX=4 */ - tmp = RREG32_SOC15(NBIO, 0, regBIF_BX_PF0_PARTITION_COMPUTE_STATUS); - tmp = REG_SET_FIELD(tmp, BIF_BX_PF0_PARTITION_COMPUTE_STATUS, - PARTITION_MODE, mode); - - WREG32_SOC15(NBIO, 0, regBIF_BX_PF0_PARTITION_COMPUTE_STATUS, tmp); -} - static u32 nbio_v7_9_get_memory_partition_mode(struct amdgpu_device *adev, u32 *supp_modes) { @@ -461,7 +448,6 @@ const struct amdgpu_nbio_funcs nbio_v7_9_funcs = { .ih_control = 
nbio_v7_9_ih_control, .remap_hdp_registers = nbio_v7_9_remap_hdp_registers, .get_compute_partition_mode = nbio_v7_9_get_compute_partition_mode, - .set_compute_partition_mode = nbio_v7_9_set_compute_partition_mode, .get_memory_partition_mode = nbio_v7_9_get_memory_partition_mode, .init_registers = nbio_v7_9_init_registers, }; -- 2.25.1
Re: [PATCH] drm/amdgpu: update external rev_id for gc_11_0_1 and gc_11_0_4
[Public] Acked-by: Alex Deucher From: amd-gfx on behalf of Aaron Liu Sent: Monday, June 12, 2023 11:09 PM To: amd-gfx@lists.freedesktop.org Cc: Deucher, Alexander ; Zhang, Yifan ; Liu, Aaron Subject: [PATCH] drm/amdgpu: update external rev_id for gc_11_0_1 and gc_11_0_4 For gc_11_0_1, the external rev_id of A0/A1 series is 0x1, the external rev_id of A2 is 0x10. Signed-off-by: Aaron Liu --- drivers/gpu/drm/amd/amdgpu/soc21.c | 7 +-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/soc21.c b/drivers/gpu/drm/amd/amdgpu/soc21.c index e5e5d68a4d70..caaf9da4c1c0 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc21.c +++ b/drivers/gpu/drm/amd/amdgpu/soc21.c @@ -665,7 +665,10 @@ static int soc21_common_early_init(void *handle) AMD_PG_SUPPORT_VCN | AMD_PG_SUPPORT_VCN_DPG | AMD_PG_SUPPORT_JPEG; - adev->external_rev_id = adev->rev_id + 0x1; + if (adev->rev_id < 0xA) + adev->external_rev_id = 0x1; + else + adev->external_rev_id = 0x10; break; case IP_VERSION(11, 0, 3): adev->cg_flags = AMD_CG_SUPPORT_VCN_MGCG | @@ -705,7 +708,7 @@ static int soc21_common_early_init(void *handle) AMD_PG_SUPPORT_VCN_DPG | AMD_PG_SUPPORT_GFX_PG | AMD_PG_SUPPORT_JPEG; - adev->external_rev_id = adev->rev_id + 0x80; + adev->external_rev_id = 0x80; break; default: -- 2.39.0
Re: [PATCH] drm/amdkfd: Remove DUMMY_VRAM_SIZE
[Public] Acked-by: Alex Deucher From: amd-gfx on behalf of Mukul Joshi Sent: Monday, June 12, 2023 7:06 PM To: amd-gfx@lists.freedesktop.org Cc: Joshi, Mukul ; Kuehling, Felix Subject: [PATCH] drm/amdkfd: Remove DUMMY_VRAM_SIZE Remove DUMMY_VRAM_SIZE as it is not needed and can result in reporting incorrect memory size. Signed-off-by: Mukul Joshi --- drivers/gpu/drm/amd/amdkfd/kfd_crat.c | 5 - 1 file changed, 5 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c index 3dcd8f8bc98e..49f40d9f16e8 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c @@ -30,9 +30,6 @@ #include "amdgpu.h" #include "amdgpu_amdkfd.h" -/* Fixme: Fake 32GB for 1PNPS1 mode bringup */ -#define DUMMY_VRAM_SIZE 31138512896 - /* GPU Processor ID base for dGPUs for which VCRAT needs to be created. * GPU processor ID are expressed with Bit[31]=1. * The base is set to 0x8000_ + 0x1000 to avoid collision with GPU IDs @@ -1056,8 +1053,6 @@ static int kfd_parse_subtype_mem(struct crat_subtype_memory *mem, props->heap_type = heap_type; props->flags = flags; - if (size_in_bytes == 0) - size_in_bytes = DUMMY_VRAM_SIZE; /* Fixme: TBD */ props->size_in_bytes = size_in_bytes; props->width = width; -- 2.35.1
Re: [PATCH 3/9] drm/ttm: use per BO cleanup workers
On Mon, Dec 5, 2022 at 2:40 PM Christian König wrote: > > Am 29.11.22 um 22:14 schrieb Felix Kuehling: > > On 2022-11-25 05:21, Christian König wrote: > >> Instead of a single worker going over the list of delete BOs in regular > >> intervals use a per BO worker which blocks for the resv object and > >> locking of the BO. > >> > >> This not only simplifies the handling massively, but also results in > >> much better response time when cleaning up buffers. > >> > >> Signed-off-by: Christian König > > > > Just thinking out loud: If I understand it correctly, this can cause a > > lot of sleeping worker threads when > > AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE is used and many BOs are freed > > at the same time. This happens e.g. when a KFD process terminates or > > crashes. I guess with a concurrency-managed workqueue this isn't going > > to be excessive. And since it's on a per device workqueue, it doesn't > > stall work items on the system work queue or from other devices. > > Yes, exactly that. The last parameter to alloc_workqueue() limits how > many work items can be sleeping. > > > I'm trying to understand why you set WQ_MEM_RECLAIM. This work queue > > is not about freeing ttm_resources but about freeing the BOs. But it > > affects freeing of ghost_objs that are holding the ttm_resources being > > freed. > > Well if the BO is idle, but not immediately lockable we delegate freeing > the backing pages in the TT object to those workers as well. It might > even be a good idea to use a separate wq for this case. > > > > > If those assumptions all make sense, patches 1-3 are > > > > Reviewed-by: Felix Kuehling > > Thanks, > Christian. > This patch causes a heap use-after-free when using nouveau with the potential of trashing filesystems, is there a way to revert it until we figure out a proper solution to the problem? 
Bug: https://gitlab.freedesktop.org/drm/nouveau/-/issues/213 example trace on affected systems: [ 4102.946946] general protection fault, probably for non-canonical address 0x5f775ce3bd949b45: [#3] PREEMPT SMP NOPTI [ 4102.957794] CPU: 12 PID: 89561 Comm: glcts Tainted: G D 6.3.5-200.fc38.x86_64 #1 [ 4102.966556] Hardware name: ASUS System Product Name/PRIME B660-PLUS D4, BIOS 0418 10/13/2021 [ 4102.974972] RIP: 0010:__kmem_cache_alloc_node+0x1ba/0x320 [ 4102.980362] Code: 2b 14 25 28 00 00 00 0f 85 74 01 00 00 48 83 c4 18 5b 5d 41 5c 41 5d 41 5e 41 5f c3 cc cc cc cc 41 8b 47 28 4d 8b 07 48 01 f8 <48> 8b 18 48 89 c1 49 33 9f b8 00 00 00 48 0f c9 48 31 cb 41 f6 c0 [ 4102.999073] RSP: 0018:9764e0057b40 EFLAGS: 00010202 [ 4103.004291] RAX: 5f775ce3bd949b45 RBX: 0dc0 RCX: 0046 [ 4103.011408] RDX: 0002cf87600c RSI: 0dc0 RDI: 5f775ce3bd949b15 [ 4103.018528] RBP: 0dc0 R08: 000390c0 R09: 30302d6d [ 4103.025649] R10: 756c7473 R11: 20090298 R12: [ 4103.032767] R13: R14: 0046 R15: 8bda80042600 [ 4103.039887] FS: 7f386a85ef00() GS:8be1df70() knlGS: [ 4103.047958] CS: 0010 DS: ES: CR0: 80050033 [ 4103.053692] CR2: 0493b868 CR3: 00014c3ba000 CR4: 00f50ee0 [ 4103.060812] PKRU: 5554 [ 4103.063520] Call Trace: [ 4103.065970] [ 4103.068071] ? die_addr+0x36/0x90 [ 4103.071384] ? exc_general_protection+0x1be/0x420 [ 4103.076081] ? asm_exc_general_protection+0x26/0x30 [ 4103.080952] ? __kmem_cache_alloc_node+0x1ba/0x320 [ 4103.085734] ? ext4_htree_store_dirent+0x42/0x180 [ 4103.090431] ? ext4_htree_store_dirent+0x42/0x180 [ 4103.095132] __kmalloc+0x4d/0x150 [ 4103.098444] ext4_htree_store_dirent+0x42/0x180 [ 4103.102970] htree_dirblock_to_tree+0x1ed/0x370 [ 4103.107494] ext4_htree_fill_tree+0x109/0x3d0 [ 4103.111846] ext4_readdir+0x6d4/0xa80 [ 4103.115505] iterate_dir+0x178/0x1c0 [ 4103.119076] __x64_sys_getdents64+0x88/0x130 [ 4103.123341] ? __pfx_filldir64+0x10/0x10 [ 4103.127260] do_syscall_64+0x5d/0x90 [ 4103.130835] ? handle_mm_fault+0x11e/0x310 [ 4103.134927] ? 
do_user_addr_fault+0x1e0/0x720 [ 4103.139278] ? exc_page_fault+0x7c/0x180 [ 4103.143195] entry_SYSCALL_64_after_hwframe+0x72/0xdc [ 4103.148240] RIP: 0033:0x7f386a418047 [ 4103.151828] Code: 24 fb ff 4c 89 e0 5b 41 5c 5d c3 0f 1f 84 00 00 00 00 00 f3 0f 1e fa b8 ff ff ff 7f 48 39 c2 48 0f 47 d0 b8 d9 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 01 c3 48 8b 15 91 cd 0f 00 f7 d8 64 89 02 48 [ 4103.170543] RSP: 002b:7ffd4793ff38 EFLAGS: 0293 ORIG_RAX: 00d9 [ 4103.178095] RAX: ffda RBX: 04933830 RCX: 7f386a418047 [ 4103.185214] RDX: 8000 RSI: 04933860 RDI: 0006 [ 4103.192335] RBP: 7ffd4793ff70 R08: R09: 0001 [ 4103.199454] R10: 0004 R11: 0293 R12: 04933834 [ 4103.206573] R13: 04933860 R14: ff60 R15: [ 4103.213695] [ 4103.215883]
RE: [PATCH v2 2/3] drm/amdgpu: Use PSP FW API for partition switch
[AMD Official Use Only - General] Series is Reviewed-by: Hawking Zhang Regards Hawking -Original Message- From: amd-gfx On Behalf Of Lijo Lazar Sent: Tuesday, June 13, 2023 19:03 To: amd-gfx@lists.freedesktop.org Cc: Deucher, Alexander ; Ma, Le ; Kamal, Asad ; Zhang, Hawking Subject: [PATCH v2 2/3] drm/amdgpu: Use PSP FW API for partition switch Use PSP firmware interface for switching compute partitions. Signed-off-by: Lijo Lazar --- v2: Changed the return value to int .../drm/amd/amdgpu/aqua_vanjaram_reg_init.c| 3 --- drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c| 18 ++ 2 files changed, 6 insertions(+), 15 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram_reg_init.c b/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram_reg_init.c index a595bb958215..16471b81a1f5 100644 --- a/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram_reg_init.c +++ b/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram_reg_init.c @@ -518,9 +518,6 @@ static int aqua_vanjaram_switch_partition_mode(struct amdgpu_xcp_mgr *xcp_mgr, adev->gfx.funcs->switch_partition_mode(xcp_mgr->adev, num_xcc_per_xcp); - if (adev->nbio.funcs->set_compute_partition_mode) - adev->nbio.funcs->set_compute_partition_mode(adev, mode); - /* Init info about new xcps */ *num_xcps = num_xcc / num_xcc_per_xcp; amdgpu_xcp_init(xcp_mgr, *num_xcps, mode); diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c index f5b8d3f388ff..c1ee54d4c3d3 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c @@ -623,22 +623,16 @@ static void gfx_v9_4_3_select_me_pipe_q(struct amdgpu_device *adev, static int gfx_v9_4_3_switch_compute_partition(struct amdgpu_device *adev, int num_xccs_per_xcp) { - int i, num_xcc; - u32 tmp = 0; - - num_xcc = NUM_XCC(adev->gfx.xcc_mask); + int ret; - for (i = 0; i < num_xcc; i++) { - tmp = REG_SET_FIELD(tmp, CP_HYP_XCP_CTL, NUM_XCC_IN_XCP, - num_xccs_per_xcp); - tmp = REG_SET_FIELD(tmp, CP_HYP_XCP_CTL, VIRTUAL_XCC_ID, - i % num_xccs_per_xcp); - 
WREG32_SOC15(GC, GET_INST(GC, i), regCP_HYP_XCP_CTL, tmp); - } + ret = psp_spatial_partition(>psp, NUM_XCC(adev->gfx.xcc_mask) / + num_xccs_per_xcp); + if (ret) + return ret; adev->gfx.num_xcc_per_xcp = num_xccs_per_xcp; - return 0; + return ret; } static int gfx_v9_4_3_ih_to_xcc_inst(struct amdgpu_device *adev, int ih_node) -- 2.25.1
RE: [PATCH] drm/amdgpu: Release SDMAv4.4.2 ecc irq properly
[AMD Official Use Only - General] Reviewed-by: Hawking Zhang Regards, Hawking -Original Message- From: Lazar, Lijo Sent: Tuesday, June 13, 2023 18:46 To: amd-gfx@lists.freedesktop.org Cc: Zhang, Hawking ; Deucher, Alexander ; Kamal, Asad ; Ma, Le Subject: [PATCH] drm/amdgpu: Release SDMAv4.4.2 ecc irq properly Release ECC irq only if irq is enabled - only when RAS feature is enabled ECC irq gets enabled. Signed-off-by: Lijo Lazar --- drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 17 +++-- 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c index 8eebf9c2bbcd..77ebf27981e4 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c @@ -1434,9 +1434,11 @@ static int sdma_v4_4_2_hw_fini(void *handle) return 0; inst_mask = GENMASK(adev->sdma.num_instances - 1, 0); - for (i = 0; i < adev->sdma.num_instances; i++) { - amdgpu_irq_put(adev, >sdma.ecc_irq, - AMDGPU_SDMA_IRQ_INSTANCE0 + i); + if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__SDMA)) { + for (i = 0; i < adev->sdma.num_instances; i++) { + amdgpu_irq_put(adev, >sdma.ecc_irq, + AMDGPU_SDMA_IRQ_INSTANCE0 + i); + } } sdma_v4_4_2_inst_ctx_switch_enable(adev, false, inst_mask); @@ -2073,9 +2075,12 @@ static int sdma_v4_4_2_xcp_suspend(void *handle, uint32_t inst_mask) uint32_t tmp_mask = inst_mask; int i; - for_each_inst(i, tmp_mask) { - amdgpu_irq_put(adev, >sdma.ecc_irq, - AMDGPU_SDMA_IRQ_INSTANCE0 + i); + if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__SDMA)) { + for_each_inst(i, tmp_mask) + { + amdgpu_irq_put(adev, >sdma.ecc_irq, + AMDGPU_SDMA_IRQ_INSTANCE0 + i); + } } sdma_v4_4_2_inst_ctx_switch_enable(adev, false, inst_mask); -- 2.25.1
[PATCH v2 2/3] drm/amdgpu: Use PSP FW API for partition switch
Use PSP firmware interface for switching compute partitions. Signed-off-by: Lijo Lazar --- v2: Changed the return value to int .../drm/amd/amdgpu/aqua_vanjaram_reg_init.c| 3 --- drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c| 18 ++ 2 files changed, 6 insertions(+), 15 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram_reg_init.c b/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram_reg_init.c index a595bb958215..16471b81a1f5 100644 --- a/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram_reg_init.c +++ b/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram_reg_init.c @@ -518,9 +518,6 @@ static int aqua_vanjaram_switch_partition_mode(struct amdgpu_xcp_mgr *xcp_mgr, adev->gfx.funcs->switch_partition_mode(xcp_mgr->adev, num_xcc_per_xcp); - if (adev->nbio.funcs->set_compute_partition_mode) - adev->nbio.funcs->set_compute_partition_mode(adev, mode); - /* Init info about new xcps */ *num_xcps = num_xcc / num_xcc_per_xcp; amdgpu_xcp_init(xcp_mgr, *num_xcps, mode); diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c index f5b8d3f388ff..c1ee54d4c3d3 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c @@ -623,22 +623,16 @@ static void gfx_v9_4_3_select_me_pipe_q(struct amdgpu_device *adev, static int gfx_v9_4_3_switch_compute_partition(struct amdgpu_device *adev, int num_xccs_per_xcp) { - int i, num_xcc; - u32 tmp = 0; - - num_xcc = NUM_XCC(adev->gfx.xcc_mask); + int ret; - for (i = 0; i < num_xcc; i++) { - tmp = REG_SET_FIELD(tmp, CP_HYP_XCP_CTL, NUM_XCC_IN_XCP, - num_xccs_per_xcp); - tmp = REG_SET_FIELD(tmp, CP_HYP_XCP_CTL, VIRTUAL_XCC_ID, - i % num_xccs_per_xcp); - WREG32_SOC15(GC, GET_INST(GC, i), regCP_HYP_XCP_CTL, tmp); - } + ret = psp_spatial_partition(>psp, NUM_XCC(adev->gfx.xcc_mask) / + num_xccs_per_xcp); + if (ret) + return ret; adev->gfx.num_xcc_per_xcp = num_xccs_per_xcp; - return 0; + return ret; } static int gfx_v9_4_3_ih_to_xcc_inst(struct amdgpu_device *adev, int ih_node) -- 2.25.1
[PATCH v2 3/3] drm/amdgpu: Remove unused NBIO interface
Set compute partition mode interface in NBIO is no longer used. Remove the only implementation from NBIO v7.9 Signed-off-by: Lijo Lazar --- drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h | 2 -- drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c | 14 -- 2 files changed, 16 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h index 095aecfb201e..8ab8ae01f87c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h @@ -99,8 +99,6 @@ struct amdgpu_nbio_funcs { int (*get_compute_partition_mode)(struct amdgpu_device *adev); u32 (*get_memory_partition_mode)(struct amdgpu_device *adev, u32 *supp_modes); - void (*set_compute_partition_mode)(struct amdgpu_device *adev, - enum amdgpu_gfx_partition mode); }; struct amdgpu_nbio { diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c index b033935d6749..cd1a02d30420 100644 --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c @@ -393,19 +393,6 @@ static int nbio_v7_9_get_compute_partition_mode(struct amdgpu_device *adev) return px; } -static void nbio_v7_9_set_compute_partition_mode(struct amdgpu_device *adev, - enum amdgpu_gfx_partition mode) -{ - u32 tmp; - - /* SPX=0, DPX=1, TPX=2, QPX=3, CPX=4 */ - tmp = RREG32_SOC15(NBIO, 0, regBIF_BX_PF0_PARTITION_COMPUTE_STATUS); - tmp = REG_SET_FIELD(tmp, BIF_BX_PF0_PARTITION_COMPUTE_STATUS, - PARTITION_MODE, mode); - - WREG32_SOC15(NBIO, 0, regBIF_BX_PF0_PARTITION_COMPUTE_STATUS, tmp); -} - static u32 nbio_v7_9_get_memory_partition_mode(struct amdgpu_device *adev, u32 *supp_modes) { @@ -461,7 +448,6 @@ const struct amdgpu_nbio_funcs nbio_v7_9_funcs = { .ih_control = nbio_v7_9_ih_control, .remap_hdp_registers = nbio_v7_9_remap_hdp_registers, .get_compute_partition_mode = nbio_v7_9_get_compute_partition_mode, - .set_compute_partition_mode = nbio_v7_9_set_compute_partition_mode, .get_memory_partition_mode = 
nbio_v7_9_get_memory_partition_mode, .init_registers = nbio_v7_9_init_registers, }; -- 2.25.1
[PATCH v2 1/3] drm/amdgpu: Change nbio v7.9 xcp status definition
PARTITION_MODE field in PARTITION_COMPUTE_STATUS register is defined as below by firmware. SPX = 0, DPX = 1, TPX = 2, QPX = 3, CPX = 4 Change driver definition accordingly. Signed-off-by: Lijo Lazar --- drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c | 8 +++- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c index d19325476752..b033935d6749 100644 --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c @@ -390,7 +390,7 @@ static int nbio_v7_9_get_compute_partition_mode(struct amdgpu_device *adev) px = REG_GET_FIELD(tmp, BIF_BX_PF0_PARTITION_COMPUTE_STATUS, PARTITION_MODE); - return ffs(px); + return px; } static void nbio_v7_9_set_compute_partition_mode(struct amdgpu_device *adev, @@ -398,12 +398,10 @@ static void nbio_v7_9_set_compute_partition_mode(struct amdgpu_device *adev, { u32 tmp; - /* Each bit represents DPX,TPX,QPX,CPX mode. No bit set means default -* SPX mode. -*/ + /* SPX=0, DPX=1, TPX=2, QPX=3, CPX=4 */ tmp = RREG32_SOC15(NBIO, 0, regBIF_BX_PF0_PARTITION_COMPUTE_STATUS); tmp = REG_SET_FIELD(tmp, BIF_BX_PF0_PARTITION_COMPUTE_STATUS, - PARTITION_MODE, mode ? BIT(mode - 1) : mode); + PARTITION_MODE, mode); WREG32_SOC15(NBIO, 0, regBIF_BX_PF0_PARTITION_COMPUTE_STATUS, tmp); } -- 2.25.1
[PATCH 2/3] drm/amdgpu: Use PSP FW API for partition switch
Use PSP firmware interface for switching compute partitions. Signed-off-by: Lijo Lazar --- .../drm/amd/amdgpu/aqua_vanjaram_reg_init.c| 3 --- drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c| 18 ++ 2 files changed, 6 insertions(+), 15 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram_reg_init.c b/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram_reg_init.c index a595bb958215..16471b81a1f5 100644 --- a/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram_reg_init.c +++ b/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram_reg_init.c @@ -518,9 +518,6 @@ static int aqua_vanjaram_switch_partition_mode(struct amdgpu_xcp_mgr *xcp_mgr, adev->gfx.funcs->switch_partition_mode(xcp_mgr->adev, num_xcc_per_xcp); - if (adev->nbio.funcs->set_compute_partition_mode) - adev->nbio.funcs->set_compute_partition_mode(adev, mode); - /* Init info about new xcps */ *num_xcps = num_xcc / num_xcc_per_xcp; amdgpu_xcp_init(xcp_mgr, *num_xcps, mode); diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c index f5b8d3f388ff..9e3e4fcf344d 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c @@ -623,22 +623,16 @@ static void gfx_v9_4_3_select_me_pipe_q(struct amdgpu_device *adev, static int gfx_v9_4_3_switch_compute_partition(struct amdgpu_device *adev, int num_xccs_per_xcp) { - int i, num_xcc; - u32 tmp = 0; - - num_xcc = NUM_XCC(adev->gfx.xcc_mask); + u32 ret; - for (i = 0; i < num_xcc; i++) { - tmp = REG_SET_FIELD(tmp, CP_HYP_XCP_CTL, NUM_XCC_IN_XCP, - num_xccs_per_xcp); - tmp = REG_SET_FIELD(tmp, CP_HYP_XCP_CTL, VIRTUAL_XCC_ID, - i % num_xccs_per_xcp); - WREG32_SOC15(GC, GET_INST(GC, i), regCP_HYP_XCP_CTL, tmp); - } + ret = psp_spatial_partition(>psp, NUM_XCC(adev->gfx.xcc_mask) / + num_xccs_per_xcp); + if (ret) + return ret; adev->gfx.num_xcc_per_xcp = num_xccs_per_xcp; - return 0; + return ret; } static int gfx_v9_4_3_ih_to_xcc_inst(struct amdgpu_device *adev, int ih_node) -- 2.25.1
[PATCH 3/3] drm/amdgpu: Remove unused NBIO interface
Set compute partition mode interface in NBIO is no longer used. Remove the only implementation from NBIO v7.9 Signed-off-by: Lijo Lazar --- drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h | 2 -- drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c | 14 -- 2 files changed, 16 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h index 095aecfb201e..8ab8ae01f87c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h @@ -99,8 +99,6 @@ struct amdgpu_nbio_funcs { int (*get_compute_partition_mode)(struct amdgpu_device *adev); u32 (*get_memory_partition_mode)(struct amdgpu_device *adev, u32 *supp_modes); - void (*set_compute_partition_mode)(struct amdgpu_device *adev, - enum amdgpu_gfx_partition mode); }; struct amdgpu_nbio { diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c index b033935d6749..cd1a02d30420 100644 --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c @@ -393,19 +393,6 @@ static int nbio_v7_9_get_compute_partition_mode(struct amdgpu_device *adev) return px; } -static void nbio_v7_9_set_compute_partition_mode(struct amdgpu_device *adev, - enum amdgpu_gfx_partition mode) -{ - u32 tmp; - - /* SPX=0, DPX=1, TPX=2, QPX=3, CPX=4 */ - tmp = RREG32_SOC15(NBIO, 0, regBIF_BX_PF0_PARTITION_COMPUTE_STATUS); - tmp = REG_SET_FIELD(tmp, BIF_BX_PF0_PARTITION_COMPUTE_STATUS, - PARTITION_MODE, mode); - - WREG32_SOC15(NBIO, 0, regBIF_BX_PF0_PARTITION_COMPUTE_STATUS, tmp); -} - static u32 nbio_v7_9_get_memory_partition_mode(struct amdgpu_device *adev, u32 *supp_modes) { @@ -461,7 +448,6 @@ const struct amdgpu_nbio_funcs nbio_v7_9_funcs = { .ih_control = nbio_v7_9_ih_control, .remap_hdp_registers = nbio_v7_9_remap_hdp_registers, .get_compute_partition_mode = nbio_v7_9_get_compute_partition_mode, - .set_compute_partition_mode = nbio_v7_9_set_compute_partition_mode, .get_memory_partition_mode = 
nbio_v7_9_get_memory_partition_mode, .init_registers = nbio_v7_9_init_registers, }; -- 2.25.1
[PATCH 1/3] drm/amdgpu: Change nbio v7.9 xcp status definition
PARTITION_MODE field in PARTITION_COMPUTE_STATUS register is defined as below by firmware. SPX = 0, DPX = 1, TPX = 2, QPX = 3, CPX = 4 Change driver definition accordingly. Signed-off-by: Lijo Lazar --- drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c | 8 +++- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c index d19325476752..b033935d6749 100644 --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c @@ -390,7 +390,7 @@ static int nbio_v7_9_get_compute_partition_mode(struct amdgpu_device *adev) px = REG_GET_FIELD(tmp, BIF_BX_PF0_PARTITION_COMPUTE_STATUS, PARTITION_MODE); - return ffs(px); + return px; } static void nbio_v7_9_set_compute_partition_mode(struct amdgpu_device *adev, @@ -398,12 +398,10 @@ static void nbio_v7_9_set_compute_partition_mode(struct amdgpu_device *adev, { u32 tmp; - /* Each bit represents DPX,TPX,QPX,CPX mode. No bit set means default -* SPX mode. -*/ + /* SPX=0, DPX=1, TPX=2, QPX=3, CPX=4 */ tmp = RREG32_SOC15(NBIO, 0, regBIF_BX_PF0_PARTITION_COMPUTE_STATUS); tmp = REG_SET_FIELD(tmp, BIF_BX_PF0_PARTITION_COMPUTE_STATUS, - PARTITION_MODE, mode ? BIT(mode - 1) : mode); + PARTITION_MODE, mode); WREG32_SOC15(NBIO, 0, regBIF_BX_PF0_PARTITION_COMPUTE_STATUS, tmp); } -- 2.25.1
[PATCH] drm/amdgpu: Release SDMAv4.4.2 ecc irq properly
Release ECC irq only if irq is enabled - only when RAS feature is enabled ECC irq gets enabled. Signed-off-by: Lijo Lazar --- drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 17 +++-- 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c index 8eebf9c2bbcd..77ebf27981e4 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c @@ -1434,9 +1434,11 @@ static int sdma_v4_4_2_hw_fini(void *handle) return 0; inst_mask = GENMASK(adev->sdma.num_instances - 1, 0); - for (i = 0; i < adev->sdma.num_instances; i++) { - amdgpu_irq_put(adev, >sdma.ecc_irq, - AMDGPU_SDMA_IRQ_INSTANCE0 + i); + if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__SDMA)) { + for (i = 0; i < adev->sdma.num_instances; i++) { + amdgpu_irq_put(adev, >sdma.ecc_irq, + AMDGPU_SDMA_IRQ_INSTANCE0 + i); + } } sdma_v4_4_2_inst_ctx_switch_enable(adev, false, inst_mask); @@ -2073,9 +2075,12 @@ static int sdma_v4_4_2_xcp_suspend(void *handle, uint32_t inst_mask) uint32_t tmp_mask = inst_mask; int i; - for_each_inst(i, tmp_mask) { - amdgpu_irq_put(adev, >sdma.ecc_irq, - AMDGPU_SDMA_IRQ_INSTANCE0 + i); + if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__SDMA)) { + for_each_inst(i, tmp_mask) + { + amdgpu_irq_put(adev, >sdma.ecc_irq, + AMDGPU_SDMA_IRQ_INSTANCE0 + i); + } } sdma_v4_4_2_inst_ctx_switch_enable(adev, false, inst_mask); -- 2.25.1
RE: [PATCH v5] drm/dp_mst: Clear MSG_RDY flag before sending new message
[AMD Official Use Only - General] Noted. Thanks, Lyude! Regards, Wayne Lin > -Original Message- > From: Lyude Paul > Sent: Tuesday, June 13, 2023 6:34 AM > To: Lin, Wayne ; dri-de...@lists.freedesktop.org; > amd-gfx@lists.freedesktop.org > Cc: ville.syrj...@linux.intel.com; jani.nik...@intel.com; imre.d...@intel.com; > Wentland, Harry ; Zuo, Jerry > ; sta...@vger.kernel.org > Subject: Re: [PATCH v5] drm/dp_mst: Clear MSG_RDY flag before sending new > message > > FWIW: Should have a response to this very soon, figured out the cause of my > MST issues so I should be able to test this very soon > > On Fri, 2023-06-09 at 18:49 +0800, Wayne Lin wrote: > > [Why] > > The sequence for collecting down_reply from source perspective should > > be: > > > > Request_n->repeat (get partial reply of Request_n->clear message ready > > flag to ack DPRX that the message is received) till all partial > > replies for Request_n are received->new Request_n+1. > > > > Now there is chance that drm_dp_mst_hpd_irq() will fire new down > > request in the tx queue when the down reply is incomplete. Source is > > restricted to generate interleaved message transactions so we should > > avoid it. > > > > Also, while assembling partial reply packets, reading out DPCD > > DOWN_REP Sideband MSG buffer + clearing DOWN_REP_MSG_RDY flag > should > > be wrapped up as a complete operation for reading out a reply packet. > > Kicking off a new request before clearing DOWN_REP_MSG_RDY flag might > > be risky. e.g. If the reply of the new request has overwritten the > > DPRX DOWN_REP Sideband MSG buffer before source writing one to clear > > DOWN_REP_MSG_RDY flag, source then unintentionally flushes the reply > > for the new request. Should handle the up request in the same way. > > > > [How] > > Separate drm_dp_mst_hpd_irq() into 2 steps. 
After acking the MST IRQ > > event, driver calls drm_dp_mst_hpd_irq_send_new_request() and might > > trigger drm_dp_mst_kick_tx() only when there is no on going message > > transaction. > > > > Changes since v1: > > * Reworked on review comments received > > -> Adjust the fix to let driver explicitly kick off new down request > > when mst irq event is handled and acked > > -> Adjust the commit message > > > > Changes since v2: > > * Adjust the commit message > > * Adjust the naming of the divided 2 functions and add a new input > > parameter "ack". > > * Adjust code flow as per review comments. > > > > Changes since v3: > > * Update the function description of drm_dp_mst_hpd_irq_handle_event > > > > Changes since v4: > > * Change ack of drm_dp_mst_hpd_irq_handle_event() to be an array align > > the size of esi[] > > > > Signed-off-by: Wayne Lin > > Cc: sta...@vger.kernel.org > > --- > > .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 32 +-- > > drivers/gpu/drm/display/drm_dp_mst_topology.c | 54 > --- > > drivers/gpu/drm/i915/display/intel_dp.c | 7 +-- > > drivers/gpu/drm/nouveau/dispnv50/disp.c | 12 +++-- > > include/drm/display/drm_dp_mst_helper.h | 7 ++- > > 5 files changed, 81 insertions(+), 31 deletions(-) > > > > diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c > > b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c > > index d5cec03eaa8d..ec629b4037e4 100644 > > --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c > > +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c > > @@ -3263,6 +3263,7 @@ static void dm_handle_mst_sideband_msg(struct > > amdgpu_dm_connector *aconnector) > > > > while (dret == dpcd_bytes_to_read && > > process_count < max_process_count) { > > + u8 ack[DP_PSR_ERROR_STATUS - DP_SINK_COUNT_ESI] = {}; > > u8 retry; > > dret = 0; > > > > @@ -3271,28 +3272,29 @@ static void > dm_handle_mst_sideband_msg(struct amdgpu_dm_connector *aconnector) > > DRM_DEBUG_DRIVER("ESI %02x %02x %02x\n", esi[0], > esi[1], esi[2]); > > /* 
handle HPD short pulse irq */ > > if (aconnector->mst_mgr.mst_state) > > - drm_dp_mst_hpd_irq( > > - >mst_mgr, > > - esi, > > - _irq_handled); > > + drm_dp_mst_hpd_irq_handle_event( > >mst_mgr, > > + esi, > > + ack, > > + _irq_handled); > > > > if (new_irq_handled) { > > /* ACK at DPCD to notify down stream */ > > - const int ack_dpcd_bytes_to_write = > > - dpcd_bytes_to_read - 1; > > - > > for (retry = 0; retry < 3; retry++) { > > - u8 wret; > > - > > - wret = drm_dp_dpcd_write( > > - >dm_dp_aux.aux, > > - dpcd_addr + 1, > > -
RE: [PATCH] drm/amdgpu: add wait_for helper for spirom update
[AMD Official Use Only - General] Reviewed-by: Hawking Zhang Regards, Hawking -Original Message- From: amd-gfx On Behalf Of Gao, Likun Sent: Tuesday, June 13, 2023 15:37 To: amd-gfx list Subject: [PATCH] drm/amdgpu: add wait_for helper for spirom update [AMD Official Use Only - General] [AMD Official Use Only - General] From: Likun Gao Sent: Tuesday, June 13, 2023 3:29 PM To: brahma_sw_dev Cc: Zhang, Hawking ; Gao, Likun Subject: [PATCH] drm/amdgpu: add wait_for helper for spirom update From: Likun Gao Spirom update typically requires extremely long duration for command execution, and special helper function to wait for it's completion. Signed-off-by: Likun Gao --- drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 20 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h | 2 ++ drivers/gpu/drm/amd/amdgpu/psp_v13_0.c | 9 + drivers/gpu/drm/amd/amdgpu/psp_v13_0.h | 2 ++ 4 files changed, 29 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c index a39d4ddf7743..fa06da014473 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c @@ -560,6 +560,26 @@ int psp_wait_for(struct psp_context *psp, uint32_t reg_index, return -ETIME; } +int psp_wait_for_spirom_update(struct psp_context *psp, uint32_t reg_index, + uint32_t reg_val, uint32_t mask, uint32_t msec_timeout) { + uint32_t val; + int i; + struct amdgpu_device *adev = psp->adev; + + if (psp->adev->no_hw_access) + return 0; + + for (i = 0; i < msec_timeout; i++) { + val = RREG32(reg_index); + if ((val & mask) == reg_val) + return 0; + msleep(1); + } + + return -ETIME; +} + static const char *psp_gfx_cmd_name(enum psp_gfx_cmd_id cmd_id) { switch (cmd_id) { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h index cf4f60c66122..ec3f3fe5efff 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h @@ -455,6 +455,8 @@ extern const struct 
amdgpu_ip_block_version psp_v13_0_4_ip_block; extern int psp_wait_for(struct psp_context *psp, uint32_t reg_index, uint32_t field_val, uint32_t mask, bool check_changed); +extern int psp_wait_for_spirom_update(struct psp_context *psp, uint32_t reg_index, + uint32_t field_val, uint32_t mask, uint32_t msec_timeout); int psp_gpu_reset(struct amdgpu_device *adev); int psp_update_vcn_sram(struct amdgpu_device *adev, int inst_idx, diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c index caee76ab7110..67e216373585 100644 --- a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c +++ b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c @@ -624,10 +624,11 @@ static int psp_v13_0_exec_spi_cmd(struct psp_context *psp, int cmd) WREG32_SOC15(MP0, 0, regMP0_SMN_C2PMSG_73, 1); if (cmd == C2PMSG_CMD_SPI_UPDATE_FLASH_IMAGE) - return 0; - - ret = psp_wait_for(psp, SOC15_REG_OFFSET(MP0, 0, regMP0_SMN_C2PMSG_115), - MBOX_READY_FLAG, MBOX_READY_MASK, false); + ret = psp_wait_for_spirom_update(psp, SOC15_REG_OFFSET(MP0, 0, regMP0_SMN_C2PMSG_115), +MBOX_READY_FLAG, MBOX_READY_MASK, PSP_SPIROM_UPDATE_TIMEOUT); + else + ret = psp_wait_for(psp, SOC15_REG_OFFSET(MP0, 0, regMP0_SMN_C2PMSG_115), + MBOX_READY_FLAG, MBOX_READY_MASK, false); if (ret) { dev_err(adev->dev, "SPI cmd %x timed out, ret = %d", cmd, ret); return ret; diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.h b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.h index b2414a729ca1..9eae5e23b2e5 100644 --- a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.h +++ b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.h @@ -25,6 +25,8 @@ #include "amdgpu_psp.h" +#define PSP_SPIROM_UPDATE_TIMEOUT 6 /* 60s */ + void psp_v13_0_set_psp_funcs(struct psp_context *psp); #endif -- 2.34.1
[PATCH] drm/amdgpu: add wait_for helper for spirom update
[AMD Official Use Only - General] From: Likun Gao Sent: Tuesday, June 13, 2023 3:29 PM To: brahma_sw_dev Cc: Zhang, Hawking ; Gao, Likun Subject: [PATCH] drm/amdgpu: add wait_for helper for spirom update From: Likun Gao Spirom update typically requires extremely long duration for command execution, and special helper function to wait for it's completion. Signed-off-by: Likun Gao --- drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 20 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h | 2 ++ drivers/gpu/drm/amd/amdgpu/psp_v13_0.c | 9 + drivers/gpu/drm/amd/amdgpu/psp_v13_0.h | 2 ++ 4 files changed, 29 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c index a39d4ddf7743..fa06da014473 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c @@ -560,6 +560,26 @@ int psp_wait_for(struct psp_context *psp, uint32_t reg_index, return -ETIME; } +int psp_wait_for_spirom_update(struct psp_context *psp, uint32_t reg_index, + uint32_t reg_val, uint32_t mask, uint32_t msec_timeout) { + uint32_t val; + int i; + struct amdgpu_device *adev = psp->adev; + + if (psp->adev->no_hw_access) + return 0; + + for (i = 0; i < msec_timeout; i++) { + val = RREG32(reg_index); + if ((val & mask) == reg_val) + return 0; + msleep(1); + } + + return -ETIME; +} + static const char *psp_gfx_cmd_name(enum psp_gfx_cmd_id cmd_id) { switch (cmd_id) { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h index cf4f60c66122..ec3f3fe5efff 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h @@ -455,6 +455,8 @@ extern const struct amdgpu_ip_block_version psp_v13_0_4_ip_block; extern int psp_wait_for(struct psp_context *psp, uint32_t reg_index, uint32_t field_val, uint32_t mask, bool check_changed); +extern int psp_wait_for_spirom_update(struct psp_context *psp, uint32_t reg_index, + uint32_t field_val, uint32_t mask, uint32_t 
msec_timeout); int psp_gpu_reset(struct amdgpu_device *adev); int psp_update_vcn_sram(struct amdgpu_device *adev, int inst_idx, diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c index caee76ab7110..67e216373585 100644 --- a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c +++ b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c @@ -624,10 +624,11 @@ static int psp_v13_0_exec_spi_cmd(struct psp_context *psp, int cmd) WREG32_SOC15(MP0, 0, regMP0_SMN_C2PMSG_73, 1); if (cmd == C2PMSG_CMD_SPI_UPDATE_FLASH_IMAGE) - return 0; - - ret = psp_wait_for(psp, SOC15_REG_OFFSET(MP0, 0, regMP0_SMN_C2PMSG_115), - MBOX_READY_FLAG, MBOX_READY_MASK, false); + ret = psp_wait_for_spirom_update(psp, SOC15_REG_OFFSET(MP0, 0, regMP0_SMN_C2PMSG_115), +MBOX_READY_FLAG, MBOX_READY_MASK, PSP_SPIROM_UPDATE_TIMEOUT); + else + ret = psp_wait_for(psp, SOC15_REG_OFFSET(MP0, 0, regMP0_SMN_C2PMSG_115), + MBOX_READY_FLAG, MBOX_READY_MASK, false); if (ret) { dev_err(adev->dev, "SPI cmd %x timed out, ret = %d", cmd, ret); return ret; diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.h b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.h index b2414a729ca1..9eae5e23b2e5 100644 --- a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.h +++ b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.h @@ -25,6 +25,8 @@ #include "amdgpu_psp.h" +#define PSP_SPIROM_UPDATE_TIMEOUT 60000 /* 60s */ + void psp_v13_0_set_psp_funcs(struct psp_context *psp); #endif -- 2.34.1
RE: [PATCH Review 1/2] drm/amdgpu: Optimze checking ras supported
[AMD Official Use Only - General] > -Original Message- > From: Zhou1, Tao > Sent: Tuesday, June 13, 2023 3:08 PM > To: Yang, Stanley ; amd-gfx@lists.freedesktop.org; > Zhang, Hawking > Cc: Yang, Stanley > Subject: RE: [PATCH Review 1/2] drm/amdgpu: Optimze checking ras > supported > > [AMD Official Use Only - General] > > [Tao] typo in title: Optimze -> Optimize [Stanley]: Thanks Tao, will update before submitting. Regards, Stanley > > > -Original Message- > > From: Stanley.Yang > > Sent: Tuesday, June 13, 2023 11:53 AM > > To: amd-gfx@lists.freedesktop.org; Zhang, Hawking > > ; Zhou1, Tao > > Cc: Yang, Stanley > > Subject: [PATCH Review 1/2] drm/amdgpu: Optimze checking ras supported > > > > Using "is_app_apu" to identify device in the native APU mode or carveout > mode. > > > > Signed-off-by: Stanley.Yang > > --- > > drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 2 +- > > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 8 +++--- > > drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 34 ++- > -- > > 3 files changed, 23 insertions(+), 21 deletions(-) > > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c > > index 78bacea951a9..352e958b190a 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c > > @@ -1653,7 +1653,7 @@ int psp_ras_initialize(struct psp_context *psp) > > > > if (amdgpu_ras_is_poison_mode_supported(adev)) > > ras_cmd->ras_in_message.init_flags.poison_mode_en = 1; > > - if (!adev->gmc.xgmi.connected_to_cpu) > > + if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) > > ras_cmd->ras_in_message.init_flags.dgpu_mode = 1; > > ras_cmd->ras_in_message.init_flags.xcc_mask = > > adev->gfx.xcc_mask; > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > > index 7a0924469e4f..56bb0db207b9 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > > @@ -1689,8 +1689,7 @@ 
static void > > amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager * > > } > > } > > > > - if (!adev->gmc.xgmi.connected_to_cpu) > > - amdgpu_umc_poison_handler(adev, false); > > + amdgpu_umc_poison_handler(adev, false); > > > > if (block_obj->hw_ops && block_obj->hw_ops- > > >handle_poison_consumption) > > poison_stat = block_obj->hw_ops- > > >handle_poison_consumption(adev); > > @@ -2458,11 +2457,10 @@ static void > amdgpu_ras_check_supported(struct > > amdgpu_device *adev) { > > adev->ras_hw_enabled = adev->ras_enabled = 0; > > > > - if (!adev->is_atom_fw || > > - !amdgpu_ras_asic_supported(adev)) > > + if (!amdgpu_ras_asic_supported(adev)) > > return; > > > > - if (!adev->gmc.xgmi.connected_to_cpu) { > > + if (!adev->gmc.xgmi.connected_to_cpu && !adev- > > [Tao] the tab should be replaced with space. > > > >gmc.is_app_apu) { > > if (amdgpu_atomfirmware_mem_ecc_supported(adev)) { > > dev_info(adev->dev, "MEM ECC is active.\n"); > > adev->ras_hw_enabled |= (1 << > > AMDGPU_RAS_BLOCK__UMC | diff --git > > a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c > > index 1edf8e6aeb16..db0d94ca4ffc 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c > > @@ -169,27 +169,31 @@ int amdgpu_umc_poison_handler(struct > > amdgpu_device *adev, bool reset) { > > int ret = AMDGPU_RAS_SUCCESS; > > > > - if (!amdgpu_sriov_vf(adev)) { > > - if (!adev->gmc.xgmi.connected_to_cpu) { > > - struct ras_err_data err_data = {0, 0, 0, NULL}; > > - struct ras_common_if head = { > > - .block = AMDGPU_RAS_BLOCK__UMC, > > - }; > > - struct ras_manager *obj = amdgpu_ras_find_obj(adev, > > ); > > - > > - ret = amdgpu_umc_do_page_retirement(adev, > > _data, NULL, reset); > > - > > - if (ret == AMDGPU_RAS_SUCCESS && obj) { > > - obj->err_data.ue_count += err_data.ue_count; > > - obj->err_data.ce_count += err_data.ce_count; > > - } > > - } else if (reset) { > > + if 
(adev->gmc.xgmi.connected_to_cpu || > > + adev->gmc.is_app_apu) { > > + if (reset) { > > /* MCA poison handler is only responsible for GPU > > reset, > >* let MCA notifier do page retirement. > >*/ > > kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); > > amdgpu_ras_reset_gpu(adev); > > } > > + return ret; > > + } > > + > > +
RE: [PATCH Review 2/2] drm/amdgpu: Add checking mc_vram_size
[AMD Official Use Only - General] With my concerns fixed, the series is: Reviewed-by: Tao Zhou > -Original Message- > From: Stanley.Yang > Sent: Tuesday, June 13, 2023 11:53 AM > To: amd-gfx@lists.freedesktop.org; Zhang, Hawking ; > Zhou1, Tao > Cc: Yang, Stanley > Subject: [PATCH Review 2/2] drm/amdgpu: Add checking mc_vram_size > > Do not compare injection address with mc_vram_size if mc_vram_size is zero. > > Signed-off-by: Stanley.Yang > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 3 ++- > 1 file changed, 2 insertions(+), 1 deletion(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > index 56bb0db207b9..3c041efcf0c4 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > @@ -494,7 +494,8 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file > *f, > ret = amdgpu_ras_feature_enable(adev, , 1); > break; > case 2: > - if ((data.inject.address >= adev->gmc.mc_vram_size) || > + if ((data.inject.address >= adev->gmc.mc_vram_size && > + adev->gmc.mc_vram_size) || > (data.inject.address >= RAS_UMC_INJECT_ADDR_LIMIT)) { > dev_warn(adev->dev, "RAS WARN: input address " > "0x%llx is invalid.", > -- > 2.17.1
RE: [PATCH Review 1/2] drm/amdgpu: Optimze checking ras supported
[AMD Official Use Only - General] [Tao] typo in title: Optimze -> Optimize > -Original Message- > From: Stanley.Yang > Sent: Tuesday, June 13, 2023 11:53 AM > To: amd-gfx@lists.freedesktop.org; Zhang, Hawking ; > Zhou1, Tao > Cc: Yang, Stanley > Subject: [PATCH Review 1/2] drm/amdgpu: Optimze checking ras supported > > Using "is_app_apu" to identify device in the native APU mode or carveout mode. > > Signed-off-by: Stanley.Yang > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 2 +- > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 8 +++--- > drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 34 ++--- > 3 files changed, 23 insertions(+), 21 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c > index 78bacea951a9..352e958b190a 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c > @@ -1653,7 +1653,7 @@ int psp_ras_initialize(struct psp_context *psp) > > if (amdgpu_ras_is_poison_mode_supported(adev)) > ras_cmd->ras_in_message.init_flags.poison_mode_en = 1; > - if (!adev->gmc.xgmi.connected_to_cpu) > + if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) > ras_cmd->ras_in_message.init_flags.dgpu_mode = 1; > ras_cmd->ras_in_message.init_flags.xcc_mask = > adev->gfx.xcc_mask; > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > index 7a0924469e4f..56bb0db207b9 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > @@ -1689,8 +1689,7 @@ static void > amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager * > } > } > > - if (!adev->gmc.xgmi.connected_to_cpu) > - amdgpu_umc_poison_handler(adev, false); > + amdgpu_umc_poison_handler(adev, false); > > if (block_obj->hw_ops && block_obj->hw_ops- > >handle_poison_consumption) > poison_stat = block_obj->hw_ops- > >handle_poison_consumption(adev); > @@ -2458,11 +2457,10 @@ static void 
amdgpu_ras_check_supported(struct > amdgpu_device *adev) { > adev->ras_hw_enabled = adev->ras_enabled = 0; > > - if (!adev->is_atom_fw || > - !amdgpu_ras_asic_supported(adev)) > + if (!amdgpu_ras_asic_supported(adev)) > return; > > - if (!adev->gmc.xgmi.connected_to_cpu) { > + if (!adev->gmc.xgmi.connected_to_cpu && !adev- [Tao] the tab should be replaced with space. > >gmc.is_app_apu) { > if (amdgpu_atomfirmware_mem_ecc_supported(adev)) { > dev_info(adev->dev, "MEM ECC is active.\n"); > adev->ras_hw_enabled |= (1 << > AMDGPU_RAS_BLOCK__UMC | diff --git > a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c > index 1edf8e6aeb16..db0d94ca4ffc 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c > @@ -169,27 +169,31 @@ int amdgpu_umc_poison_handler(struct > amdgpu_device *adev, bool reset) { > int ret = AMDGPU_RAS_SUCCESS; > > - if (!amdgpu_sriov_vf(adev)) { > - if (!adev->gmc.xgmi.connected_to_cpu) { > - struct ras_err_data err_data = {0, 0, 0, NULL}; > - struct ras_common_if head = { > - .block = AMDGPU_RAS_BLOCK__UMC, > - }; > - struct ras_manager *obj = amdgpu_ras_find_obj(adev, > ); > - > - ret = amdgpu_umc_do_page_retirement(adev, > _data, NULL, reset); > - > - if (ret == AMDGPU_RAS_SUCCESS && obj) { > - obj->err_data.ue_count += err_data.ue_count; > - obj->err_data.ce_count += err_data.ce_count; > - } > - } else if (reset) { > + if (adev->gmc.xgmi.connected_to_cpu || > + adev->gmc.is_app_apu) { > + if (reset) { > /* MCA poison handler is only responsible for GPU reset, >* let MCA notifier do page retirement. 
>*/ > kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); > amdgpu_ras_reset_gpu(adev); > } > + return ret; > + } > + > + if (!amdgpu_sriov_vf(adev)) { > + struct ras_err_data err_data = {0, 0, 0, NULL}; > + struct ras_common_if head = { > + .block = AMDGPU_RAS_BLOCK__UMC, > + }; > + struct ras_manager *obj = amdgpu_ras_find_obj(adev, ); > + > + ret = amdgpu_umc_do_page_retirement(adev, _data, NULL, > reset); > + > + if (ret == AMDGPU_RAS_SUCCESS && obj) { > + obj->err_data.ue_count += err_data.ue_count; > + obj->err_data.ce_count += err_data.ce_count; > + } > }