RE: [PATCH] drm/amdgpu: Move calculation of xcp per memory node

2023-06-13 Thread Zhang, Hawking
[AMD Official Use Only - General]

Reviewed-by: Hawking Zhang 

Regards,
Hawking
-Original Message-
From: Lazar, Lijo 
Sent: Wednesday, June 14, 2023 12:28
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Deucher, Alexander 
; Yang, Philip ; Kamal, Asad 
; Ma, Le 
Subject: [PATCH] drm/amdgpu: Move calculation of xcp per memory node

Its value is required for finding the memory id of xcp.

Fixes: 2130f4ca70b7f ("drm/amdgpu: Add xcp manager num_xcp_per_mem_partition")

Signed-off-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c
index d733fa6e7477..9687df9841ee 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c
@@ -132,6 +132,9 @@ int amdgpu_xcp_init(struct amdgpu_xcp_mgr *xcp_mgr, int 
num_xcps, int mode)
for (i = 0; i < MAX_XCP; ++i)
xcp_mgr->xcp[i].valid = false;

+   /* This is needed for figuring out memory id of xcp */
+   xcp_mgr->num_xcp_per_mem_partition = num_xcps /
+xcp_mgr->adev->gmc.num_mem_partitions;
+
for (i = 0; i < num_xcps; ++i) {
for (j = AMDGPU_XCP_GFXHUB; j < AMDGPU_XCP_MAX_BLOCKS; ++j) {
ret = xcp_mgr->funcs->get_ip_details(xcp_mgr, i, j, @@ 
-157,7 +160,6 @@ int amdgpu_xcp_init(struct amdgpu_xcp_mgr *xcp_mgr, int 
num_xcps, int mode)
xcp_mgr->num_xcps = num_xcps;
amdgpu_xcp_update_partition_sched_list(adev);

-   xcp_mgr->num_xcp_per_mem_partition = num_xcps / 
xcp_mgr->adev->gmc.num_mem_partitions;
return 0;
 }

--
2.25.1



[PATCH] drm/amdgpu: Move calculation of xcp per memory node

2023-06-13 Thread Lijo Lazar
Its value is required for finding the memory id of xcp.

Fixes: 2130f4ca70b7f ("drm/amdgpu: Add xcp manager num_xcp_per_mem_partition")

Signed-off-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c
index d733fa6e7477..9687df9841ee 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c
@@ -132,6 +132,9 @@ int amdgpu_xcp_init(struct amdgpu_xcp_mgr *xcp_mgr, int 
num_xcps, int mode)
for (i = 0; i < MAX_XCP; ++i)
xcp_mgr->xcp[i].valid = false;
 
+   /* This is needed for figuring out memory id of xcp */
+   xcp_mgr->num_xcp_per_mem_partition = num_xcps / 
xcp_mgr->adev->gmc.num_mem_partitions;
+
for (i = 0; i < num_xcps; ++i) {
for (j = AMDGPU_XCP_GFXHUB; j < AMDGPU_XCP_MAX_BLOCKS; ++j) {
ret = xcp_mgr->funcs->get_ip_details(xcp_mgr, i, j,
@@ -157,7 +160,6 @@ int amdgpu_xcp_init(struct amdgpu_xcp_mgr *xcp_mgr, int 
num_xcps, int mode)
xcp_mgr->num_xcps = num_xcps;
amdgpu_xcp_update_partition_sched_list(adev);
 
-   xcp_mgr->num_xcp_per_mem_partition = num_xcps / 
xcp_mgr->adev->gmc.num_mem_partitions;
return 0;
 }
 
-- 
2.25.1



RE: [PATCH 3/3] drm/amdgpu: Remove unused NBIO interface

2023-06-13 Thread Ma, Le
[AMD Official Use Only - General]

Series is Reviewed-by: Le Ma 

> -Original Message-
> From: Lazar, Lijo 
> Sent: Tuesday, June 13, 2023 6:54 PM
> To: amd-gfx@lists.freedesktop.org
> Cc: Zhang, Hawking ; Deucher, Alexander
> ; Kamal, Asad ; Ma,
> Le 
> Subject: [PATCH 3/3] drm/amdgpu: Remove unused NBIO interface
>
> Set compute partition mode interface in NBIO is no longer used. Remove the
> only implementation from NBIO v7.9
>
> Signed-off-by: Lijo Lazar 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h |  2 --
>  drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c   | 14 --
>  2 files changed, 16 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
> index 095aecfb201e..8ab8ae01f87c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
> @@ -99,8 +99,6 @@ struct amdgpu_nbio_funcs {
>   int (*get_compute_partition_mode)(struct amdgpu_device *adev);
>   u32 (*get_memory_partition_mode)(struct amdgpu_device *adev,
>u32 *supp_modes);
> - void (*set_compute_partition_mode)(struct amdgpu_device *adev,
> -enum amdgpu_gfx_partition mode);
>  };
>
>  struct amdgpu_nbio {
> diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c
> b/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c
> index b033935d6749..cd1a02d30420 100644
> --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c
> +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c
> @@ -393,19 +393,6 @@ static int
> nbio_v7_9_get_compute_partition_mode(struct amdgpu_device *adev)
>   return px;
>  }
>
> -static void nbio_v7_9_set_compute_partition_mode(struct amdgpu_device
> *adev,
> - enum amdgpu_gfx_partition mode)
> -{
> - u32 tmp;
> -
> - /* SPX=0, DPX=1, TPX=2, QPX=3, CPX=4 */
> - tmp = RREG32_SOC15(NBIO, 0,
> regBIF_BX_PF0_PARTITION_COMPUTE_STATUS);
> - tmp = REG_SET_FIELD(tmp,
> BIF_BX_PF0_PARTITION_COMPUTE_STATUS,
> - PARTITION_MODE, mode);
> -
> - WREG32_SOC15(NBIO, 0,
> regBIF_BX_PF0_PARTITION_COMPUTE_STATUS, tmp);
> -}
> -
>  static u32 nbio_v7_9_get_memory_partition_mode(struct amdgpu_device
> *adev,
>  u32 *supp_modes)
>  {
> @@ -461,7 +448,6 @@ const struct amdgpu_nbio_funcs nbio_v7_9_funcs = {
>   .ih_control = nbio_v7_9_ih_control,
>   .remap_hdp_registers = nbio_v7_9_remap_hdp_registers,
>   .get_compute_partition_mode =
> nbio_v7_9_get_compute_partition_mode,
> - .set_compute_partition_mode =
> nbio_v7_9_set_compute_partition_mode,
>   .get_memory_partition_mode =
> nbio_v7_9_get_memory_partition_mode,
>   .init_registers = nbio_v7_9_init_registers,  };
> --
> 2.25.1



[PATCH] drm/amd/display: Convert to kdoc formats in dc/core/dc.c

2023-06-13 Thread Srinivasan Shanmugam
Fixes the following gcc warnings with W=1:

drivers/gpu/drm/amd/amdgpu/../display/dc/core/dc.c:3483: warning: Cannot 
understand  * 
***
drivers/gpu/drm/amd/amdgpu/../display/dc/core/dc.c:4204: warning: Cannot 
understand  * 
***

Cc: Rodrigo Siqueira 
Cc: Aurabindo Pillai 
Signed-off-by: Srinivasan Shanmugam 
---
 drivers/gpu/drm/amd/display/dc/core/dc.c | 34 +++-
 1 file changed, 15 insertions(+), 19 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/dc/core/dc.c 
b/drivers/gpu/drm/amd/display/dc/core/dc.c
index 5d3d61faeb28..e6bd20dbfc0a 100644
--- a/drivers/gpu/drm/amd/display/dc/core/dc.c
+++ b/drivers/gpu/drm/amd/display/dc/core/dc.c
@@ -3480,23 +3480,21 @@ static void build_dmub_update_dirty_rect(
 
 
 /**
- * 

- * build_dmub_cmd_list: Build an array of DMCUB commands to be sent to DMCUB
+ * build_dmub_cmd_list() - Build an array of DMCUB commands to be sent to DMCUB
  *
- * @param [in]: dc: Current DC state
- * @param [in]: srf_updates: Array of surface updates
- * @param [in]: surface_count: Number of surfaces that have an updated
- * @param [in]: stream: Correponding stream to be updated in the current flip
- * @param [in]: context: New DC state to be programmed
+ * @dc: Current DC state
+ * @srf_updates: Array of surface updates
+ * @surface_count: Number of surfaces that have an updated
+ * @stream: Corresponding stream to be updated in the current flip
+ * @context: New DC state to be programmed
  *
- * @param [out]: dc_dmub_cmd: Array of DMCUB commands to be sent to DMCUB
- * @param [out]: dmub_cmd_count: Count indicating the number of DMCUB commands 
in dc_dmub_cmd array
+ * @dc_dmub_cmd: Array of DMCUB commands to be sent to DMCUB
+ * @dmub_cmd_count: Count indicating the number of DMCUB commands in 
dc_dmub_cmd array
  *
  * This function builds an array of DMCUB commands to be sent to DMCUB. This 
function is required
  * to build an array of commands and have them sent while the OTG lock is 
acquired.
  *
- * @return: void
- * 

+ * Return: void
  */
 static void build_dmub_cmd_list(struct dc *dc,
struct dc_surface_update *srf_updates,
@@ -4201,20 +4199,18 @@ static bool commit_minimal_transition_state(struct dc 
*dc,
 }
 
 /**
- * 
***
- * update_seamless_boot_flags: Helper function for updating seamless boot flags
+ * update_seamless_boot_flags() - Helper function for updating seamless boot 
flags
  *
- * @param [in]: dc: Current DC state
- * @param [in]: context: New DC state to be programmed
- * @param [in]: surface_count: Number of surfaces that have an updated
- * @param [in]: stream: Correponding stream to be updated in the current flip
+ * @dc: Current DC state
+ * @context: New DC state to be programmed
+ * @surface_count: Number of surfaces that have an updated
+ * @stream: Corresponding stream to be updated in the current flip
  *
  * Updating seamless boot flags do not need to be part of the commit sequence. 
This
  * helper function will update the seamless boot flags on each flip (if 
required)
  * outside of the HW commit sequence (fast or slow).
  *
- * @return: void
- * 
***
+ * Return: void
  */
 static void update_seamless_boot_flags(struct dc *dc,
struct dc_state *context,
-- 
2.25.1



Re: [PATCH v5] drm/dp_mst: Clear MSG_RDY flag before sending new message

2023-06-13 Thread Lyude Paul
Alright, managed to figure out my MST woes! Just tested with nouveau and I see
no regressions :)

Reviewed-by: Lyude Paul 


On Fri, 2023-06-09 at 18:49 +0800, Wayne Lin wrote:
> [Why]
> The sequence for collecting down_reply from source perspective should
> be:
> 
> Request_n->repeat (get partial reply of Request_n->clear message ready
> flag to ack DPRX that the message is received) till all partial
> replies for Request_n are received->new Request_n+1.
> 
> Now there is chance that drm_dp_mst_hpd_irq() will fire new down
> request in the tx queue when the down reply is incomplete. Source is
> restricted to generate interleaved message transactions so we should
> avoid it.
> 
> Also, while assembling partial reply packets, reading out DPCD DOWN_REP
> Sideband MSG buffer + clearing DOWN_REP_MSG_RDY flag should be
> wrapped up as a complete operation for reading out a reply packet.
> Kicking off a new request before clearing DOWN_REP_MSG_RDY flag might
> be risky. e.g. If the reply of the new request has overwritten the
> DPRX DOWN_REP Sideband MSG buffer before source writing one to clear
> DOWN_REP_MSG_RDY flag, source then unintentionally flushes the reply
> for the new request. Should handle the up request in the same way.
> 
> [How]
> Separate drm_dp_mst_hpd_irq() into 2 steps. After acking the MST IRQ
> event, driver calls drm_dp_mst_hpd_irq_send_new_request() and might
> trigger drm_dp_mst_kick_tx() only when there is no on going message
> transaction.
> 
> Changes since v1:
> * Reworked on review comments received
> -> Adjust the fix to let driver explicitly kick off new down request
> when mst irq event is handled and acked
> -> Adjust the commit message
> 
> Changes since v2:
> * Adjust the commit message
> * Adjust the naming of the divided 2 functions and add a new input
>   parameter "ack".
> * Adjust code flow as per review comments.
> 
> Changes since v3:
> * Update the function description of drm_dp_mst_hpd_irq_handle_event
> 
> Changes since v4:
> * Change ack of drm_dp_mst_hpd_irq_handle_event() to be an array align
>   the size of esi[]
> 
> Signed-off-by: Wayne Lin 
> Cc: sta...@vger.kernel.org
> ---
>  .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 32 +--
>  drivers/gpu/drm/display/drm_dp_mst_topology.c | 54 ---
>  drivers/gpu/drm/i915/display/intel_dp.c   |  7 +--
>  drivers/gpu/drm/nouveau/dispnv50/disp.c   | 12 +++--
>  include/drm/display/drm_dp_mst_helper.h   |  7 ++-
>  5 files changed, 81 insertions(+), 31 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c 
> b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> index d5cec03eaa8d..ec629b4037e4 100644
> --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> @@ -3263,6 +3263,7 @@ static void dm_handle_mst_sideband_msg(struct 
> amdgpu_dm_connector *aconnector)
>  
>   while (dret == dpcd_bytes_to_read &&
>   process_count < max_process_count) {
> + u8 ack[DP_PSR_ERROR_STATUS - DP_SINK_COUNT_ESI] = {};
>   u8 retry;
>   dret = 0;
>  
> @@ -3271,28 +3272,29 @@ static void dm_handle_mst_sideband_msg(struct 
> amdgpu_dm_connector *aconnector)
>   DRM_DEBUG_DRIVER("ESI %02x %02x %02x\n", esi[0], esi[1], 
> esi[2]);
>   /* handle HPD short pulse irq */
>   if (aconnector->mst_mgr.mst_state)
> - drm_dp_mst_hpd_irq(
> - >mst_mgr,
> - esi,
> - _irq_handled);
> + drm_dp_mst_hpd_irq_handle_event(>mst_mgr,
> + esi,
> + ack,
> + _irq_handled);
>  
>   if (new_irq_handled) {
>   /* ACK at DPCD to notify down stream */
> - const int ack_dpcd_bytes_to_write =
> - dpcd_bytes_to_read - 1;
> -
>   for (retry = 0; retry < 3; retry++) {
> - u8 wret;
> -
> - wret = drm_dp_dpcd_write(
> - >dm_dp_aux.aux,
> - dpcd_addr + 1,
> - [1],
> - ack_dpcd_bytes_to_write);
> - if (wret == ack_dpcd_bytes_to_write)
> + ssize_t wret;
> +
> + wret = 
> drm_dp_dpcd_writeb(>dm_dp_aux.aux,
> +   dpcd_addr + 1,
> +   ack[1]);
> + if (wret == 1)
>   break;
>   }
>  
> + if (retry == 3) {
> + 

[PATCH v2] drm/amd/pm: Align eccinfo table structure with smu v13_0_0 interface

2023-06-13 Thread Candice Li
Update eccinfo table structure according to smu v13_0_0 interface.

v2: Calculate array size instead of using macro definition.

Signed-off-by: Candice Li 
Reviewed-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
index 413e592f0ed611..cbf0b2d738c1a6 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
@@ -46,7 +46,6 @@
 #include "asic_reg/mp/mp_13_0_0_sh_mask.h"
 #include "smu_cmn.h"
 #include "amdgpu_ras.h"
-#include "umc_v8_10.h"
 
 /*
  * DO NOT use these for err/warn/info/debug messages.
@@ -2580,7 +2579,7 @@ static ssize_t smu_v13_0_0_get_ecc_info(struct 
smu_context *smu,
 
ecc_table = (EccInfoTable_t *)smu_table->ecc_table;
 
-   for (i = 0; i < UMC_V8_10_TOTAL_CHANNEL_NUM(adev); i++) {
+   for (i = 0; i < ARRAY_SIZE(ecc_table->EccInfo); i++) {
ecc_info_per_channel = &(eccinfo->ecc[i]);
ecc_info_per_channel->ce_count_lo_chip =
ecc_table->EccInfo[i].ce_count_lo_chip;
-- 
2.25.1



[PATCH 2/2] drm/amdgpu: Add channel_dis_num to ras init flags

2023-06-13 Thread Candice Li
Add disabled channel number to ras init flags.

Signed-off-by: Candice Li 
Reviewed-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 1 +
 drivers/gpu/drm/amd/amdgpu/ta_ras_if.h  | 1 +
 2 files changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index dd865beb39a8c4..6070c91f0b8293 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -1657,6 +1657,7 @@ int psp_ras_initialize(struct psp_context *psp)
ras_cmd->ras_in_message.init_flags.dgpu_mode = 1;
ras_cmd->ras_in_message.init_flags.xcc_mask =
adev->gfx.xcc_mask;
+   ras_cmd->ras_in_message.init_flags.channel_dis_num = 
hweight32(adev->gmc.m_half_use) * 2;
 
ret = psp_ta_load(psp, >ras_context.context);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/ta_ras_if.h 
b/drivers/gpu/drm/amd/amdgpu/ta_ras_if.h
index be2984ac00a56d..879bb7af297c7b 100644
--- a/drivers/gpu/drm/amd/amdgpu/ta_ras_if.h
+++ b/drivers/gpu/drm/amd/amdgpu/ta_ras_if.h
@@ -130,6 +130,7 @@ struct ta_ras_init_flags {
uint8_t poison_mode_en;
uint8_t dgpu_mode;
uint16_t xcc_mask;
+   uint8_t channel_dis_num;
 };
 
 struct ta_ras_output_flags {
-- 
2.25.1



[PATCH 1/2] drm/amdgpu: Update total channel number for umc v8_10

2023-06-13 Thread Candice Li
Update total channel number for umc v8_10.

Signed-off-by: Candice Li 
Reviewed-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c | 1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h   | 2 ++
 drivers/gpu/drm/amd/amdgpu/umc_v8_10.h| 3 ++-
 3 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
index 859882109f55d6..16cf7b199457e3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
@@ -1515,6 +1515,7 @@ static int amdgpu_discovery_get_mall_info(struct 
amdgpu_device *adev)
mall_size += mall_size_per_umc;
}
adev->gmc.mall_size = mall_size;
+   adev->gmc.m_half_use = half_use;
break;
default:
dev_err(adev->dev,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
index 6794edd1d2d2ae..56d73fade56850 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
@@ -301,6 +301,8 @@ struct amdgpu_gmc {
 
/* MALL size */
u64 mall_size;
+   uint32_t m_half_use;
+
/* number of UMC instances */
int num_umc;
/* mode2 save restore */
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v8_10.h 
b/drivers/gpu/drm/amd/amdgpu/umc_v8_10.h
index c6dfd433fec7bc..dc12e0af5451e9 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v8_10.h
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v8_10.h
@@ -33,7 +33,8 @@
 
 /* Total channel instances for all available umc nodes */
 #define UMC_V8_10_TOTAL_CHANNEL_NUM(adev) \
-   (UMC_V8_10_CHANNEL_INSTANCE_NUM * UMC_V8_10_UMC_INSTANCE_NUM * 
(adev)->gmc.num_umc)
+   (UMC_V8_10_CHANNEL_INSTANCE_NUM * UMC_V8_10_UMC_INSTANCE_NUM * \
+   (adev)->gmc.num_umc - hweight32((adev)->gmc.m_half_use) * 2)
 
 /* UMC regiser per channel offset */
 #define UMC_V8_10_PER_CHANNEL_OFFSET   0x400
-- 
2.25.1



Re: [PATCH] drm/amdkfd: decrement queue count on mes queue destroy

2023-06-13 Thread Felix Kuehling

On 2023-06-13 17:48, Jonathan Kim wrote:

Queue count should decrement on queue destruction regardless of HWS
support type.

Signed-off-by: Jonathan Kim 


Reviewed-by: Felix Kuehling 



---
  drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 8a39a9e0ed5a..f515cb8f30ca 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -2089,8 +2089,8 @@ static int destroy_queue_cpsch(struct 
device_queue_manager *dqm,
list_del(>list);
qpd->queue_count--;
if (q->properties.is_active) {
+   decrement_queue_count(dqm, qpd, q);
if (!dqm->dev->kfd->shared_resources.enable_mes) {
-   decrement_queue_count(dqm, qpd, q);
retval = execute_queues_cpsch(dqm,
  
KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0,
  USE_DEFAULT_GRACE_PERIOD);


[PATCH] drm/amdkfd: decrement queue count on mes queue destroy

2023-06-13 Thread Jonathan Kim
Queue count should decrement on queue destruction regardless of HWS
support type.

Signed-off-by: Jonathan Kim 
---
 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 8a39a9e0ed5a..f515cb8f30ca 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -2089,8 +2089,8 @@ static int destroy_queue_cpsch(struct 
device_queue_manager *dqm,
list_del(>list);
qpd->queue_count--;
if (q->properties.is_active) {
+   decrement_queue_count(dqm, qpd, q);
if (!dqm->dev->kfd->shared_resources.enable_mes) {
-   decrement_queue_count(dqm, qpd, q);
retval = execute_queues_cpsch(dqm,
  
KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0,
  USE_DEFAULT_GRACE_PERIOD);
-- 
2.25.1



[linux-next:master] BUILD REGRESSION 1f6ce8392d6ff486af5ca96df9ded5882c4b6977

2023-06-13 Thread kernel test robot
tree/branch: 
https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git master
branch HEAD: 1f6ce8392d6ff486af5ca96df9ded5882c4b6977  Add linux-next specific 
files for 20230613

Error/Warning reports:

https://lore.kernel.org/oe-kbuild-all/202306082341.uqtcm8po-...@intel.com
https://lore.kernel.org/oe-kbuild-all/20230613.hher4zoo-...@intel.com
https://lore.kernel.org/oe-kbuild-all/202306132155.bfzc9arf-...@intel.com
https://lore.kernel.org/oe-kbuild-all/202306132237.z4lje8bp-...@intel.com
https://lore.kernel.org/oe-kbuild-all/202306140347.s9njs3al-...@intel.com

Error/Warning: (recently discovered and may have been fixed)

arch/microblaze/include/asm/page.h:34: warning: "ARCH_DMA_MINALIGN" redefined
arch/parisc/kernel/pdt.c:65:6: warning: no previous prototype for 
'arch_report_meminfo' [-Wmissing-prototypes]
csky-linux-ld: drivers/net/ethernet/sfc/ef100_netdev.c:114: undefined reference 
to `efx_tc_netevent_event'
drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c:76: warning: This comment starts 
with '/**', but isn't a kernel-doc comment. Refer 
Documentation/doc-guide/kernel-doc.rst
drivers/gpu/drm/i915/display/intel_display_power.h:256:70: error: declaration 
of 'struct seq_file' will not be visible outside of this function 
[-Werror,-Wvisibility]
drivers/leds/leds-cht-wcove.c:144:21: warning: no previous prototype for 
'cht_wc_leds_brightness_get' [-Wmissing-prototypes]
include/asm-generic/bitops/instrumented-non-atomic.h:141: undefined reference 
to `uv_info'
lib/kunit/executor_test.c:138:4: warning: cast from 'void (*)(const void *)' to 
'kunit_action_t *' (aka 'void (*)(void *)') converts to incompatible function 
type [-Wcast-function-type-strict]
lib/kunit/test.c:775:38: warning: cast from 'void (*)(const void *)' to 
'kunit_action_t *' (aka 'void (*)(void *)') converts to incompatible function 
type [-Wcast-function-type-strict]

Unverified Error/Warning (likely false positive, please contact us if 
interested):

arch/arm64/kvm/mmu.c:147:3-9: preceding lock on line 140
drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.c:98 
mlx5_devcom_register_device() error: uninitialized symbol 'tmp_dev'.
drivers/usb/cdns3/cdns3-starfive.c:23: warning: expecting prototype for 
cdns3(). Prototype was for USB_STRAP_HOST() instead
fs/btrfs/volumes.c:6404 btrfs_map_block() error: we previously assumed 
'mirror_num_ret' could be null (see line 6242)
fs/smb/client/cifsfs.c:982 cifs_smb3_do_mount() warn: possible memory leak of 
'cifs_sb'
fs/smb/client/cifssmb.c:4089 CIFSFindFirst() warn: missing error code? 'rc'
fs/smb/client/cifssmb.c:4216 CIFSFindNext() warn: missing error code? 'rc'
fs/smb/client/connect.c:2775 cifs_match_super() error: 'tlink' dereferencing 
possible ERR_PTR()
fs/smb/client/connect.c:2974 generic_ip_connect() error: we previously assumed 
'socket' could be null (see line 2962)
lib/kunit/test.c:336 __kunit_abort() warn: ignoring unreachable code.

Error/Warning ids grouped by kconfigs:

gcc_recent_errors
|-- alpha-allyesconfig
|   `-- 
drivers-gpu-drm-amd-amdgpu-amdgpu_ras_eeprom.c:warning:This-comment-starts-with-but-isn-t-a-kernel-doc-comment.-Refer-Documentation-doc-guide-kernel-doc.rst
|-- arc-allyesconfig
|   `-- 
drivers-gpu-drm-amd-amdgpu-amdgpu_ras_eeprom.c:warning:This-comment-starts-with-but-isn-t-a-kernel-doc-comment.-Refer-Documentation-doc-guide-kernel-doc.rst
|-- arm-allmodconfig
|   `-- 
drivers-gpu-drm-amd-amdgpu-amdgpu_ras_eeprom.c:warning:This-comment-starts-with-but-isn-t-a-kernel-doc-comment.-Refer-Documentation-doc-guide-kernel-doc.rst
|-- arm-allyesconfig
|   `-- 
drivers-gpu-drm-amd-amdgpu-amdgpu_ras_eeprom.c:warning:This-comment-starts-with-but-isn-t-a-kernel-doc-comment.-Refer-Documentation-doc-guide-kernel-doc.rst
|-- arm64-allyesconfig
|   `-- 
drivers-gpu-drm-amd-amdgpu-amdgpu_ras_eeprom.c:warning:This-comment-starts-with-but-isn-t-a-kernel-doc-comment.-Refer-Documentation-doc-guide-kernel-doc.rst
|-- arm64-randconfig-c033-20230611
|   `-- arch-arm64-kvm-mmu.c:preceding-lock-on-line
|-- csky-randconfig-c044-20230612
|   |-- 
csky-linux-ld:drivers-net-ethernet-sfc-ef100_netdev.c:undefined-reference-to-efx_tc_netevent_event
|   `-- 
drivers-gpu-drm-amd-amdgpu-amdgpu_ras_eeprom.c:warning:This-comment-starts-with-but-isn-t-a-kernel-doc-comment.-Refer-Documentation-doc-guide-kernel-doc.rst
|-- i386-allyesconfig
|   |-- 
drivers-gpu-drm-amd-amdgpu-amdgpu_ras_eeprom.c:warning:This-comment-starts-with-but-isn-t-a-kernel-doc-comment.-Refer-Documentation-doc-guide-kernel-doc.rst
|   `-- 
drivers-leds-leds-cht-wcove.c:warning:no-previous-prototype-for-cht_wc_leds_brightness_get
|-- i386-randconfig-m021-20230612
|   |-- 
fs-smb-client-cifsfs.c-cifs_smb3_do_mount()-warn:possible-memory-leak-of-cifs_sb
|   |-- fs-smb-client-cifssmb.c-CIFSFindFirst()-warn:missing-error-code-rc
|   |-- fs-smb-client-cifssmb.c-CIFSFindNext()-warn:missing-error-code-rc
|   |-- 
fs-smb-client-connect.c-cifs_match_super()-error:tlink-dereferencing-possible-ERR_PTR()
|   `-- 
fs-

Re: [RFC PATCH v2 00/18] Add DRM CRTC 3D LUT interface

2023-06-13 Thread Jacopo Mondi
Hello

   I'm completing the support for 3D LUT on R-Car DU peripheral
and I have used this series as a base.

I'm wondering, since quite some time has passed without any update if
this series is still a thing and it makes any sense for me to try to
bring it forward.

I'm asking as I've noticed:
"[PATCH 00/36] drm/amd/display: add AMD driver-specific properties for color 
mgmt"

which seems to supersede this proposal with driver-specific
properties.

I asked Melissa privately but I wasn't able to get an hold of her, so
if anyone has any clue feel free to reply :)

Thanks
  j

On Mon, Jan 09, 2023 at 01:38:28PM -0100, Melissa Wen wrote:
> Hi,
>
> After collecting comments in different places, here is a second version
> of the work on adding DRM CRTC 3D LUT support to the current DRM color
> mgmt interface. In comparison to previous proposals [1][2][3], here we
> add 3D LUT before gamma 1D LUT, but also a shaper 1D LUT before 3D LUT,
> that means the following DRM CRTC color correction pipeline:
>
> Blend -> Degamma 1D LUT -> CTM -> Shaper 1D LUT -> 3D LUT -> Gamma 1D LUT
>
> and we also add a DRM CRTC LUT3D_MODE property, based on Alex Hung
> proposal for pre-blending 3D LUT [4] (Thanks!), instead of just a
> LUT3D_SIZE, that allows userspace to use different supported settings of
> 3D LUT, fitting VA-API and new color API better. In this sense, I
> adjusted the pre-blending proposal for post-blending usage.
>
> Patches 1-6 targets the addition of shaper LUT and 3D LUT properties to
> the current DRM CRTC color mgmt pipeline. Patch 6 can be considered an
> extra/optional patch to define a default value for LUT3D_MODE, inspired
> by what we do for the plane blend mode property (pre-multiplied).
>
> Patches 7-18 targets AMD display code to enable shaper and 3D LUT usage
> on DCN 301 (our HW case). Patches 7-9 performs code cleanups on current
> AMD DM colors code, patch 10 updates AMD stream in case of user 3D LUT
> changes, patch 11/12 rework AMD MPC 3D LUT resource handling by context
> for DCN 301 (easily extendible to other DCN families). Finally, from
> 13-18, we wire up SHAPER LUT, LUT3D and LUT3D MODE to AMD display
> driver, exposing modes supported by HW and programming user shaper and
> 3D LUT accordingly.
>
> Our target userspace is Gamescope/SteamOS.
>
> Basic IGT tests were based on [5][6] and are available here (in-progress):
> https://gitlab.freedesktop.org/mwen/igt-gpu-tools/-/commits/crtc-lut3d-api
>
> [1] 
> https://lore.kernel.org/all/20201221015730.28333-1-laurent.pinchart+rene...@ideasonboard.com/
> [2] 
> https://github.com/vsyrjala/linux/commit/4d28e8ddf2a076f30f9e5bdc17cbb4656fe23e69
> [3] https://lore.kernel.org/amd-gfx/20220619223104.667413-1-m...@igalia.com/
> [4] 
> https://lore.kernel.org/dri-devel/20221004211451.1475215-1-alex.h...@amd.com/
> [5] https://patchwork.freedesktop.org/series/90165/
> [6] https://patchwork.freedesktop.org/series/109402/
> [VA_API] 
> http://intel.github.io/libva/structVAProcFilterParameterBuffer3DLUT.html
> [KMS_pipe_API] https://gitlab.freedesktop.org/pq/color-and-hdr/-/issues/11
>
> Let me know your thoughts.
>
> Thanks,
>
> Melissa
>
> Alex Hung (2):
>   drm: Add 3D LUT mode and its attributes
>   drm/amd/display: Define 3D LUT struct for HDR planes
>
> Melissa Wen (16):
>   drm/drm_color_mgmt: add shaper LUT to color mgmt properties
>   drm/drm_color_mgmt: add 3D LUT props to DRM color mgmt
>   drm/drm_color_mgmt: add function to create 3D LUT modes supported
>   drm/drm_color_mgmt: add function to attach 3D LUT props
>   drm/drm_color_mgmt: set first lut3d mode as default
>   drm/amd/display: remove unused regamma condition
>   drm/amd/display: add comments to describe DM crtc color mgmt behavior
>   drm/amd/display: encapsulate atomic regamma operation
>   drm/amd/display: update lut3d and shaper lut to stream
>   drm/amd/display: handle MPC 3D LUT resources for a given context
>   drm/amd/display: acquire/release 3D LUT resources for ctx on DCN301
>   drm/amd/display: expand array of supported 3D LUT modes
>   drm/amd/display: enable 3D-LUT DRM properties if supported
>   drm/amd/display: add user 3D LUT support to the amdgpu_dm color
> pipeline
>   drm/amd/display: decouple steps to reuse in shaper LUT support
>   drm/amd/display: add user shaper LUT support to amdgpu_dm color
> pipeline
>
>  .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c |   6 +
>  .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h |   3 +
>  .../amd/display/amdgpu_dm/amdgpu_dm_color.c   | 370 --
>  .../amd/display/amdgpu_dm/amdgpu_dm_crtc.c|   2 +
>  drivers/gpu/drm/amd/display/dc/core/dc.c  |  49 ++-
>  drivers/gpu/drm/amd/display/dc/dc.h   |   8 +
>  .../amd/display/dc/dcn301/dcn301_resource.c   |  47 ++-
>  .../amd/display/modules/color/color_gamma.h   |  43 ++
>  drivers/gpu/drm/drm_atomic_state_helper.c |   7 +
>  drivers/gpu/drm/drm_atomic_uapi.c |  24 ++
>  drivers/gpu/drm/drm_color_mgmt.c  | 127 ++
>  

[PATCH 2/2] drm/amdgpu/pm: make mclk consistent for smu 13.0.7

2023-06-13 Thread Alex Deucher
Use current uclk to be consistent with other dGPUs.

Signed-off-by: Alex Deucher 
---
 drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c
index cda4e818aab7..8eb8c30e6c69 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c
@@ -949,7 +949,7 @@ static int smu_v13_0_7_read_sensor(struct smu_context *smu,
break;
case AMDGPU_PP_SENSOR_GFX_MCLK:
ret = smu_v13_0_7_get_smu_metrics_data(smu,
-  METRICS_AVERAGE_UCLK,
+  METRICS_CURR_UCLK,
   (uint32_t *)data);
*(uint32_t *)data *= 100;
*size = 4;
-- 
2.40.1



[PATCH 1/2] drm/amdgpu/pm: make gfxclock consistent for sienna cichlid

2023-06-13 Thread Alex Deucher
Use average gfxclock for consistency with other dGPUs.

Signed-off-by: Alex Deucher 
---
 drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c
index f7ed3e655e39..1b7d93709a35 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c
@@ -1927,12 +1927,16 @@ static int sienna_cichlid_read_sensor(struct 
smu_context *smu,
*size = 4;
break;
case AMDGPU_PP_SENSOR_GFX_MCLK:
-   ret = sienna_cichlid_get_current_clk_freq_by_table(smu, 
SMU_UCLK, (uint32_t *)data);
+   ret = sienna_cichlid_get_smu_metrics_data(smu,
+ METRICS_CURR_UCLK,
+ (uint32_t *)data);
*(uint32_t *)data *= 100;
*size = 4;
break;
case AMDGPU_PP_SENSOR_GFX_SCLK:
-   ret = sienna_cichlid_get_current_clk_freq_by_table(smu, 
SMU_GFXCLK, (uint32_t *)data);
+   ret = sienna_cichlid_get_smu_metrics_data(smu,
+ 
METRICS_AVERAGE_GFXCLK,
+ (uint32_t *)data);
*(uint32_t *)data *= 100;
*size = 4;
break;
-- 
2.40.1



Re: [PATCH 10/66] drm/amd/display: Do not set drr on pipe commit

2023-06-13 Thread Michel Dänzer
On 6/12/23 20:14, Pillai, Aurabindo wrote:
> 
> I want to double check if we're identifying the correct monitor for applying 
> the workaround. Could you please try the attached patch and let me know the 
> panel id ?

 amdgpu: ### Not applying any edid quirk for panel 4c2d71ac

I'm attaching the EDID.

BTW, I'm using the monitor firmware version 1011.0, which AFAICT is the latest.


-- 
Earthling Michel Dänzer|  https://redhat.com
Libre software enthusiast  | Mesa and Xwayland developer



edid
Description: Binary data


[PATCHv3] drm/amdgpu: Update invalid PTE flag setting

2023-06-13 Thread Mukul Joshi
Update the invalid PTE flag setting with TF enabled.
This is to ensure, in addition to transitioning the
retry fault to a no-retry fault, it also causes the
wavefront to enter the trap handler. With the current
setting, the fault only transitions to a no-retry fault.
Additionally, have 2 sets of invalid PTE settings, one for
TF enabled, the other for TF disabled. The setting with
TF disabled, doesn't work with TF enabled.

Signed-off-by: Mukul Joshi 
---
v1->v2:
- Update handling according to Christian's feedback.

v2->v3:
- Remove ASIC specific callback (Felix).

 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c|  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h|  6 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 21 +
 3 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 1cb14ea18cd9..ff9db7e5c086 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -2583,7 +2583,7 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, 
u32 pasid,
/* Intentionally setting invalid PTE flag
 * combination to force a no-retry-fault
 */
-   flags = AMDGPU_PTE_SNOOPED | AMDGPU_PTE_PRT;
+   flags = AMDGPU_VM_NORETRY_FLAGS;
value = 0;
} else if (amdgpu_vm_fault_stop == AMDGPU_VM_FAULT_STOP_NEVER) {
/* Redirect the access to the dummy page */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index 9c85d494f2a2..b81fcb962d8f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -84,7 +84,13 @@ struct amdgpu_mem_stats;
 /* PDE Block Fragment Size for VEGA10 */
 #define AMDGPU_PDE_BFS(a)  ((uint64_t)a << 59)
 
+/* Flag combination to set no-retry with TF disabled */
+#define AMDGPU_VM_NORETRY_FLAGS(AMDGPU_PTE_EXECUTABLE | AMDGPU_PDE_PTE 
| \
+   AMDGPU_PTE_TF)
 
+/* Flag combination to set no-retry with TF enabled */
+#define AMDGPU_VM_NORETRY_FLAGS_TF (AMDGPU_PTE_VALID | AMDGPU_PTE_SYSTEM | \
+  AMDGPU_PTE_PRT)
 /* For GFX9 */
 #define AMDGPU_PTE_MTYPE_VG10(a)   ((uint64_t)(a) << 57)
 #define AMDGPU_PTE_MTYPE_VG10_MASK AMDGPU_PTE_MTYPE_VG10(3ULL)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
index dea1a64be44d..45b26cad59cd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
@@ -778,6 +778,24 @@ int amdgpu_vm_pde_update(struct amdgpu_vm_update_params 
*params,
1, 0, flags);
 }
 
+/**
+ * amdgpu_vm_pte_update_noretry_flags - Update PTE no-retry flags
+ *
+ * @adev: amdgpu_device pointer
+ * @flags: pointer to PTE flags
+ *
+ * Update PTE no-retry flags when TF is enabled.
+ */
+static void amdgpu_vm_pte_update_noretry_flags(struct amdgpu_device *adev,
+   uint64_t *flags)
+{
+   /* Update no retry flags when TF is enabled */
+   if ((*flags & AMDGPU_VM_NORETRY_FLAGS) == AMDGPU_VM_NORETRY_FLAGS) {
+   *flags &= ~AMDGPU_VM_NORETRY_FLAGS;
+   *flags |= AMDGPU_VM_NORETRY_FLAGS_TF;
+   }
+}
+
 /*
  * amdgpu_vm_pte_update_flags - figure out flags for PTE updates
  *
@@ -804,6 +822,9 @@ static void amdgpu_vm_pte_update_flags(struct 
amdgpu_vm_update_params *params,
flags |= AMDGPU_PTE_EXECUTABLE;
}
 
+   if (adev->gmc.translate_further && level == AMDGPU_VM_PTB)
+   amdgpu_vm_pte_update_noretry_flags(adev, );
+
/* APUs mapping system memory may need different MTYPEs on different
 * NUMA nodes. Only do this for contiguous ranges that can be assumed
 * to be on the same NUMA node.
-- 
2.35.1



[PATCH 2/3] drm/amdgpu: cache gpuvm fault information for gmc7+

2023-06-13 Thread Alex Deucher
Cache the current fault info in the vm struct.  This can be queried
by userspace later to help debug UMDs.

Cc: samuel.pitoi...@gmail.com
Acked-by: Guchun Chen 
Signed-off-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c |  3 +++
 drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c |  3 +++
 drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c  |  3 +++
 drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c  |  3 +++
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 11 +++
 5 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index b2e42f1b0f12..ccb69f5b06fe 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -155,6 +155,9 @@ static int gmc_v10_0_process_interrupt(struct amdgpu_device 
*adev,
 
status = RREG32(hub->vm_l2_pro_fault_status);
WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1);
+
+   amdgpu_vm_update_fault_cache(adev, entry->pasid, addr, status,
+entry->vmid_src ? AMDGPU_MMHUB0(0) 
: AMDGPU_GFXHUB(0));
}
 
if (!printk_ratelimit())
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
index c571f0d95994..ae35dc6ba502 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
@@ -115,6 +115,9 @@ static int gmc_v11_0_process_interrupt(struct amdgpu_device 
*adev,
 
status = RREG32(hub->vm_l2_pro_fault_status);
WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1);
+
+   amdgpu_vm_update_fault_cache(adev, entry->pasid, addr, status,
+entry->vmid_src ? AMDGPU_MMHUB0(0) 
: AMDGPU_GFXHUB(0));
}
 
if (printk_ratelimit()) {
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
index acd2b407860f..d51cad788769 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
@@ -1273,6 +1273,9 @@ static int gmc_v7_0_process_interrupt(struct 
amdgpu_device *adev,
if (!addr && !status)
return 0;
 
+   amdgpu_vm_update_fault_cache(adev, entry->pasid,
+((u64)addr) << AMDGPU_GPU_PAGE_SHIFT, 
status, AMDGPU_GFXHUB(0));
+
if (amdgpu_vm_fault_stop == AMDGPU_VM_FAULT_STOP_FIRST)
gmc_v7_0_set_fault_enable_default(adev, false);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
index 85dead2a5702..8ce7455ff3f6 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
@@ -1448,6 +1448,9 @@ static int gmc_v8_0_process_interrupt(struct 
amdgpu_device *adev,
if (!addr && !status)
return 0;
 
+   amdgpu_vm_update_fault_cache(adev, entry->pasid,
+((u64)addr) << AMDGPU_GPU_PAGE_SHIFT, 
status, AMDGPU_GFXHUB(0));
+
if (amdgpu_vm_fault_stop == AMDGPU_VM_FAULT_STOP_FIRST)
gmc_v8_0_set_fault_enable_default(adev, false);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 3ed286b72cae..fbd65872050a 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -555,6 +555,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device 
*adev,
struct amdgpu_vmhub *hub;
const char *mmhub_cid;
const char *hub_name;
+   unsigned int vmhub;
u64 addr;
uint32_t cam_index = 0;
int ret, xcc_id = 0;
@@ -567,10 +568,10 @@ static int gmc_v9_0_process_interrupt(struct 
amdgpu_device *adev,
 
if (entry->client_id == SOC15_IH_CLIENTID_VMC) {
hub_name = "mmhub0";
-   hub = >vmhub[AMDGPU_MMHUB0(node_id / 4)];
+   vmhub = AMDGPU_MMHUB0(node_id / 4);
} else if (entry->client_id == SOC15_IH_CLIENTID_VMC1) {
hub_name = "mmhub1";
-   hub = >vmhub[AMDGPU_MMHUB1(0)];
+   vmhub = AMDGPU_MMHUB1(0);
} else {
hub_name = "gfxhub0";
if (adev->gfx.funcs->ih_node_to_logical_xcc) {
@@ -579,8 +580,9 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device 
*adev,
if (xcc_id < 0)
xcc_id = 0;
}
-   hub = >vmhub[xcc_id];
+   vmhub = xcc_id;
}
+   hub = >vmhub[vmhub];
 
if (retry_fault) {
if (adev->irq.retry_cam_enabled) {
@@ -626,7 +628,6 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device 
*adev,
if (!printk_ratelimit())
return 0;
 
-
memset(_info, 0, sizeof(struct amdgpu_task_info));
amdgpu_vm_get_task_info(adev, entry->pasid, _info);
 
@@ -663,6 +664,8 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device 
*adev,
   

[PATCH 3/3] drm/amdgpu: add new INFO ioctl query for the last GPU page fault

2023-06-13 Thread Alex Deucher
Add a interface to query the last GPU page fault for the process.
Useful for debugging context lost errors.

v2: split vmhub representation between kernel and userspace

Mesa MR: https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23238
libdrm MR: https://gitlab.freedesktop.org/mesa/drm/-/merge_requests/298

Cc: samuel.pitoi...@gmail.com
Acked-by: Guchun Chen 
Signed-off-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c |  3 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 16 
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c  | 16 +++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h  | 13 ++---
 include/uapi/drm/amdgpu_drm.h   | 16 
 5 files changed, 59 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 999d008b6b48..6b053bab799c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -113,9 +113,10 @@
  *gl1c_cache_size, gl2c_cache_size, mall_size, 
enabled_rb_pipes_mask_hi
  *   3.53.0 - Support for GFX11 CP GFX shadowing
  *   3.54.0 - Add AMDGPU_CTX_QUERY2_FLAGS_RESET_IN_PROGRESS support
+ * - 3.55.0 - Add AMDGPU_INFO_GPUVM_FAULT query
  */
 #define KMS_DRIVER_MAJOR   3
-#define KMS_DRIVER_MINOR   54
+#define KMS_DRIVER_MINOR   55
 #define KMS_DRIVER_PATCHLEVEL  0
 
 unsigned int amdgpu_vram_limit = UINT_MAX;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index e3531aa3c8bd..2289f8eb3d1d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -1163,6 +1163,22 @@ int amdgpu_info_ioctl(struct drm_device *dev, void 
*data, struct drm_file *filp)
return copy_to_user(out, max_ibs,
min((size_t)size, sizeof(max_ibs))) ? 
-EFAULT : 0;
}
+   case AMDGPU_INFO_GPUVM_FAULT: {
+   struct amdgpu_fpriv *fpriv = filp->driver_priv;
+   struct amdgpu_vm *vm = >vm;
+   struct drm_amdgpu_info_gpuvm_fault gpuvm_fault;
+
+   if (!vm)
+   return -EINVAL;
+
+   memset(_fault, 0, sizeof(gpuvm_fault));
+   gpuvm_fault.addr = vm->fault_info.addr;
+   gpuvm_fault.status = vm->fault_info.status;
+   gpuvm_fault.vmhub = vm->fault_info.vmhub;
+
+   return copy_to_user(out, _fault,
+   min((size_t)size, sizeof(gpuvm_fault))) ? 
-EFAULT : 0;
+   }
default:
DRM_DEBUG_KMS("Invalid request %d\n", info->query);
return -EINVAL;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 5423f66a9ed8..2c1106855492 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -2666,7 +2666,21 @@ void amdgpu_vm_update_fault_cache(struct amdgpu_device 
*adev,
if (vm) {
vm->fault_info.addr = addr;
vm->fault_info.status = status;
-   vm->fault_info.vmhub = vmhub;
+   if (AMDGPU_IS_GFXHUB(vmhub)) {
+   vm->fault_info.vmhub = AMDGPU_VMHUB_TYPE_GFX;
+   vm->fault_info.vmhub |=
+   (vmhub - AMDGPU_GFXHUB_START) << 
AMDGPU_VMHUB_IDX_SHIFT;
+   } else if (AMDGPU_IS_MMHUB0(vmhub)) {
+   vm->fault_info.vmhub = AMDGPU_VMHUB_TYPE_MM0;
+   vm->fault_info.vmhub |=
+   (vmhub - AMDGPU_MMHUB0_START) << 
AMDGPU_VMHUB_IDX_SHIFT;
+   } else if (AMDGPU_IS_MMHUB1(vmhub)) {
+   vm->fault_info.vmhub = AMDGPU_VMHUB_TYPE_MM1;
+   vm->fault_info.vmhub |=
+   (vmhub - AMDGPU_MMHUB1_START) << 
AMDGPU_VMHUB_IDX_SHIFT;
+   } else {
+   WARN_ONCE(1, "Invalid vmhub %u\n", vmhub);
+   }
}
xa_unlock_irqrestore(>vm_manager.pasids, flags);
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index fb66a413110c..1a34fea9acb9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -116,9 +116,16 @@ struct amdgpu_mem_stats;
  * layout: max 8 GFXHUB + 4 MMHUB0 + 1 MMHUB1
  */
 #define AMDGPU_MAX_VMHUBS  13
-#define AMDGPU_GFXHUB(x)   (x)
-#define AMDGPU_MMHUB0(x)   (8 + x)
-#define AMDGPU_MMHUB1(x)   (8 + 4 + x)
+#define AMDGPU_GFXHUB_START0
+#define AMDGPU_MMHUB0_START8
+#define AMDGPU_MMHUB1_START12
+#define AMDGPU_GFXHUB(x)   (AMDGPU_GFXHUB_START + (x))
+#define AMDGPU_MMHUB0(x)   (AMDGPU_MMHUB0_START + (x))
+#define 

[PATCH 1/3] drm/amdgpu: add cached GPU fault structure to vm struct

2023-06-13 Thread Alex Deucher
When we get a GPU page fault, cache the fault for later
analysis.

Cc: samuel.pitoi...@gmail.com
Acked-by: Guchun Chen 
Signed-off-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 31 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 18 +++
 2 files changed, 49 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index dc80c9c8fd14..5423f66a9ed8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -2640,3 +2640,34 @@ void amdgpu_debugfs_vm_bo_info(struct amdgpu_vm *vm, 
struct seq_file *m)
   total_done_objs);
 }
 #endif
+
+/**
+ * amdgpu_vm_update_fault_cache - update cached fault info.
+ * @adev: amdgpu device pointer
+ * @pasid: PASID of the VM
+ * @addr: Address of the fault
+ * @status: GPUVM fault status register
+ * @vmhub: which vmhub got the fault
+ *
+ * Cache the fault info for later use by userspace in debugging.
+ */
+void amdgpu_vm_update_fault_cache(struct amdgpu_device *adev,
+ unsigned int pasid,
+ uint64_t addr,
+ uint32_t status,
+ unsigned int vmhub)
+{
+   struct amdgpu_vm *vm;
+   unsigned long flags;
+
+   xa_lock_irqsave(>vm_manager.pasids, flags);
+
+   vm = xa_load(>vm_manager.pasids, pasid);
+   if (vm) {
+   vm->fault_info.addr = addr;
+   vm->fault_info.status = status;
+   vm->fault_info.vmhub = vmhub;
+   }
+   xa_unlock_irqrestore(>vm_manager.pasids, flags);
+}
+
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index 14f9a2bf3acb..fb66a413110c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -244,6 +244,15 @@ struct amdgpu_vm_update_funcs {
  struct dma_fence **fence);
 };
 
+struct amdgpu_vm_fault_info {
+   /* fault address */
+   uint64_taddr;
+   /* fault status register */
+   uint32_tstatus;
+   /* which vmhub? gfxhub, mmhub, etc. */
+   unsigned intvmhub;
+};
+
 struct amdgpu_vm {
/* tree of virtual addresses mapped */
struct rb_root_cached   va;
@@ -332,6 +341,9 @@ struct amdgpu_vm {
 
/* Memory partition number, -1 means any partition */
int8_t  mem_id;
+
+   /* cached fault info */
+   struct amdgpu_vm_fault_info fault_info;
 };
 
 struct amdgpu_vm_manager {
@@ -540,4 +552,10 @@ static inline void amdgpu_vm_eviction_unlock(struct 
amdgpu_vm *vm)
mutex_unlock(>eviction_lock);
 }
 
+void amdgpu_vm_update_fault_cache(struct amdgpu_device *adev,
+ unsigned int pasid,
+ uint64_t addr,
+ uint32_t status,
+ unsigned int vmhub);
+
 #endif
-- 
2.40.1



[PATCH v2 0/3] Add GPU page fault query interface

2023-06-13 Thread Alex Deucher
This patch set adds support for an application to query GPU
page faults.  It's useful for debugging and there are
vulkan extensions that could make use of this.  Preliminary
user space code which uses this can be found here:
https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23238
https://gitlab.freedesktop.org/mesa/drm/-/merge_requests/298

Note, that I made a small change to the vmhub definition to
decouple it from how the kernel tracks vmhubs so that we have
a consistent user view even if we decide to add more vmhubs
like we recently did for gfx 9.4.3.

I've also pushed the changes to:
https://gitlab.freedesktop.org/agd5f/linux/-/commits/gpu_fault_info_ioctl

Open question, currently we just expose the raw GPU fault status
register value for each GPU so UMDs need GPU specific knowledge to decode
it, although it's largely the same across generations.  One option would be to
translate to a generic GPU independent fault status.  Opinions?

v2:
- Fix spelling typos noted by Guchun

Alex Deucher (3):
  drm/amdgpu: add cached GPU fault structure to vm struct
  drm/amdgpu: cache gpuvm fault information for gmc7+
  drm/amdgpu: add new INFO ioctl query for the last GPU page fault

 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c |  3 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 16 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c  | 45 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h  | 31 +++--
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c  |  3 ++
 drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c  |  3 ++
 drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c   |  3 ++
 drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c   |  3 ++
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   | 11 +++---
 include/uapi/drm/amdgpu_drm.h   | 16 +
 10 files changed, 126 insertions(+), 8 deletions(-)

-- 
2.40.1



Re: [PATCH 3/9] drm/ttm: use per BO cleanup workers

2023-06-13 Thread Karol Herbst
On Tue, Jun 13, 2023 at 3:59 PM Christian König
 wrote:
>
> Am 13.06.23 um 15:05 schrieb Karol Herbst:
> > On Mon, Dec 5, 2022 at 2:40 PM Christian König  
> > wrote:
> >> Am 29.11.22 um 22:14 schrieb Felix Kuehling:
> >>> On 2022-11-25 05:21, Christian König wrote:
>  Instead of a single worker going over the list of delete BOs in regular
>  intervals use a per BO worker which blocks for the resv object and
>  locking of the BO.
> 
>  This not only simplifies the handling massively, but also results in
>  much better response time when cleaning up buffers.
> 
>  Signed-off-by: Christian König 
> >>> Just thinking out loud: If I understand it correctly, this can cause a
> >>> lot of sleeping worker threads when
> >>> AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE is used and many BOs are freed
> >>> at the same time. This happens e.g. when a KFD process terminates or
> >>> crashes. I guess with a concurrency-managed workqueue this isn't going
> >>> to be excessive. And since it's on a per device workqueue, it doesn't
> >>> stall work items on the system work queue or from other devices.
> >> Yes, exactly that. The last parameter to alloc_workqueue() limits how
> >> many work items can be sleeping.
> >>
> >>> I'm trying to understand why you set WQ_MEM_RECLAIM. This work queue
> >>> is not about freeing ttm_resources but about freeing the BOs. But it
> >>> affects freeing of ghost_objs that are holding the ttm_resources being
> >>> freed.
> >> Well if the BO is idle, but not immediately lockable we delegate freeing
> >> the backing pages in the TT object to those workers as well. It might
> >> even be a good idea to use a separate wq for this case.
> >>
> >>> If those assumptions all make sense, patches 1-3 are
> >>>
> >>> Reviewed-by: Felix Kuehling 
> >> Thanks,
> >> Christian.
> >>
> > This patch causes a heap use-after-free when using nouveau with the
> > potential of trashing filesystems, is there a way to revert it until
> > we figure out a proper solution to the problem?
>
> Uff I don't think so, we have quite some work based on top of this. But
> let me double check.
>

yeah.. I already talked with Dave about fixing this issue as Dave has
more knowledge on this part of the driver (I hope), so we might have a
fix soonish, but the concerning part is, that it's already out to
users, so might be better to be able to revert it if the fix takes a
while to emerge.

> On the other hand have you tried running this with KASAN to catch use
> after free errors?

yes: https://gitlab.freedesktop.org/drm/nouveau/-/issues/213#note_1942777

>
> Since we now block for work to finish and not check every few
> milliseconds to garbage collect memory will now be reclaimed much faster
> after freeing it.

yeah, that kinda makes sense. This entire issue feels like a race
happening as I need to run the OpenGL CTS in parallel with 8+ threads
to trigger it reliably.

>
> Regards,
> Christian.
>
> >
> > Bug: https://gitlab.freedesktop.org/drm/nouveau/-/issues/213
> >
> > example trace on affected systems:
> >
> > [ 4102.946946] general protection fault, probably for non-canonical
> > address 0x5f775ce3bd949b45:  [#3] PREEMPT SMP NOPTI
> > [ 4102.957794] CPU: 12 PID: 89561 Comm: glcts Tainted: G  D
> >  6.3.5-200.fc38.x86_64 #1
> > [ 4102.966556] Hardware name: ASUS System Product Name/PRIME B660-PLUS
> > D4, BIOS 0418 10/13/2021
> > [ 4102.974972] RIP: 0010:__kmem_cache_alloc_node+0x1ba/0x320
> > [ 4102.980362] Code: 2b 14 25 28 00 00 00 0f 85 74 01 00 00 48 83 c4
> > 18 5b 5d 41 5c 41 5d 41 5e 41 5f c3 cc cc cc cc 41 8b 47 28 4d 8b 07
> > 48 01 f8 <48> 8b 18 48 89 c1 49 33 9f b8 00 00 00 48 0f c9 48 31 cb 41
> > f6 c0
> > [ 4102.999073] RSP: 0018:9764e0057b40 EFLAGS: 00010202
> > [ 4103.004291] RAX: 5f775ce3bd949b45 RBX: 0dc0 RCX: 
> > 0046
> > [ 4103.011408] RDX: 0002cf87600c RSI: 0dc0 RDI: 
> > 5f775ce3bd949b15
> > [ 4103.018528] RBP: 0dc0 R08: 000390c0 R09: 
> > 30302d6d
> > [ 4103.025649] R10: 756c7473 R11: 20090298 R12: 
> > 
> > [ 4103.032767] R13:  R14: 0046 R15: 
> > 8bda80042600
> > [ 4103.039887] FS:  7f386a85ef00() GS:8be1df70()
> > knlGS:
> > [ 4103.047958] CS:  0010 DS:  ES:  CR0: 80050033
> > [ 4103.053692] CR2: 0493b868 CR3: 00014c3ba000 CR4: 
> > 00f50ee0
> > [ 4103.060812] PKRU: 5554
> > [ 4103.063520] Call Trace:
> > [ 4103.065970]  
> > [ 4103.068071]  ? die_addr+0x36/0x90
> > [ 4103.071384]  ? exc_general_protection+0x1be/0x420
> > [ 4103.076081]  ? asm_exc_general_protection+0x26/0x30
> > [ 4103.080952]  ? __kmem_cache_alloc_node+0x1ba/0x320
> > [ 4103.085734]  ? ext4_htree_store_dirent+0x42/0x180
> > [ 4103.090431]  ? ext4_htree_store_dirent+0x42/0x180
> > [ 4103.095132]  __kmalloc+0x4d/0x150
> > [ 4103.098444]  ext4_htree_store_dirent+0x42/0x180
> > [ 

Re: [PATCH] drm/amdgpu/sdma4: set align mask to 255

2023-06-13 Thread Christian König

Reviewed-by: Christian König 

Am 13.06.23 um 03:14 schrieb Liu, Aaron:

[AMD Official Use Only - General]

Reviewed-by: Aaron Liu 


-Original Message-
From: amd-gfx  On Behalf Of Alex
Deucher
Sent: Tuesday, June 13, 2023 5:48 AM
To: Deucher, Alexander 
Cc: amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH] drm/amdgpu/sdma4: set align mask to 255

ping?

On Wed, Jun 7, 2023 at 12:31 PM Alex Deucher 
wrote:

The wptr needs to be incremented at at least 64 dword intervals, use
256 to align with windows.  This should fix potential hangs with
unaligned updates.

Signed-off-by: Alex Deucher 
---
  drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c   | 4 ++--
  drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 4 ++--
  2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
index 1f83eebfc8a7..cd37f45e01a1 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
@@ -2312,7 +2312,7 @@ const struct amd_ip_funcs sdma_v4_0_ip_funcs = {

  static const struct amdgpu_ring_funcs sdma_v4_0_ring_funcs = {
 .type = AMDGPU_RING_TYPE_SDMA,
-   .align_mask = 0xf,
+   .align_mask = 0xff,
 .nop = SDMA_PKT_NOP_HEADER_OP(SDMA_OP_NOP),
 .support_64bit_ptrs = true,
 .secure_submission_supported = true, @@ -2344,7 +2344,7 @@
static const struct amdgpu_ring_funcs sdma_v4_0_ring_funcs = {

  static const struct amdgpu_ring_funcs sdma_v4_0_page_ring_funcs = {
 .type = AMDGPU_RING_TYPE_SDMA,
-   .align_mask = 0xf,
+   .align_mask = 0xff,
 .nop = SDMA_PKT_NOP_HEADER_OP(SDMA_OP_NOP),
 .support_64bit_ptrs = true,
 .secure_submission_supported = true, diff --git
a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
index 8eebf9c2bbcd..05bb0691ee0e 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
@@ -1823,7 +1823,7 @@ const struct amd_ip_funcs sdma_v4_4_2_ip_funcs =
{

  static const struct amdgpu_ring_funcs sdma_v4_4_2_ring_funcs = {
 .type = AMDGPU_RING_TYPE_SDMA,
-   .align_mask = 0xf,
+   .align_mask = 0xff,
 .nop = SDMA_PKT_NOP_HEADER_OP(SDMA_OP_NOP),
 .support_64bit_ptrs = true,
 .get_rptr = sdma_v4_4_2_ring_get_rptr, @@ -1854,7 +1854,7 @@
static const struct amdgpu_ring_funcs sdma_v4_4_2_ring_funcs = {

  static const struct amdgpu_ring_funcs sdma_v4_4_2_page_ring_funcs = {
 .type = AMDGPU_RING_TYPE_SDMA,
-   .align_mask = 0xf,
+   .align_mask = 0xff,
 .nop = SDMA_PKT_NOP_HEADER_OP(SDMA_OP_NOP),
 .support_64bit_ptrs = true,
 .get_rptr = sdma_v4_4_2_ring_get_rptr,
--
2.40.1





Re: [PATCH 3/9] drm/ttm: use per BO cleanup workers

2023-06-13 Thread Christian König

Am 13.06.23 um 15:05 schrieb Karol Herbst:

On Mon, Dec 5, 2022 at 2:40 PM Christian König  wrote:

Am 29.11.22 um 22:14 schrieb Felix Kuehling:

On 2022-11-25 05:21, Christian König wrote:

Instead of a single worker going over the list of delete BOs in regular
intervals use a per BO worker which blocks for the resv object and
locking of the BO.

This not only simplifies the handling massively, but also results in
much better response time when cleaning up buffers.

Signed-off-by: Christian König 

Just thinking out loud: If I understand it correctly, this can cause a
lot of sleeping worker threads when
AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE is used and many BOs are freed
at the same time. This happens e.g. when a KFD process terminates or
crashes. I guess with a concurrency-managed workqueue this isn't going
to be excessive. And since it's on a per device workqueue, it doesn't
stall work items on the system work queue or from other devices.

Yes, exactly that. The last parameter to alloc_workqueue() limits how
many work items can be sleeping.


I'm trying to understand why you set WQ_MEM_RECLAIM. This work queue
is not about freeing ttm_resources but about freeing the BOs. But it
affects freeing of ghost_objs that are holding the ttm_resources being
freed.

Well if the BO is idle, but not immediately lockable we delegate freeing
the backing pages in the TT object to those workers as well. It might
even be a good idea to use a separate wq for this case.


If those assumptions all make sense, patches 1-3 are

Reviewed-by: Felix Kuehling 

Thanks,
Christian.


This patch causes a heap use-after-free when using nouveau with the
potential of trashing filesystems, is there a way to revert it until
we figure out a proper solution to the problem?


Uff I don't think so, we have quite some work based on top of this. But 
let me double check.


On the other hand have you tried running this with KASAN to catch use 
after free errors?


Since we now block for work to finish and not check every few 
milliseconds to garbage collect memory will now be reclaimed much faster 
after freeing it.


Regards,
Christian.



Bug: https://gitlab.freedesktop.org/drm/nouveau/-/issues/213

example trace on affected systems:

[ 4102.946946] general protection fault, probably for non-canonical
address 0x5f775ce3bd949b45:  [#3] PREEMPT SMP NOPTI
[ 4102.957794] CPU: 12 PID: 89561 Comm: glcts Tainted: G  D
 6.3.5-200.fc38.x86_64 #1
[ 4102.966556] Hardware name: ASUS System Product Name/PRIME B660-PLUS
D4, BIOS 0418 10/13/2021
[ 4102.974972] RIP: 0010:__kmem_cache_alloc_node+0x1ba/0x320
[ 4102.980362] Code: 2b 14 25 28 00 00 00 0f 85 74 01 00 00 48 83 c4
18 5b 5d 41 5c 41 5d 41 5e 41 5f c3 cc cc cc cc 41 8b 47 28 4d 8b 07
48 01 f8 <48> 8b 18 48 89 c1 49 33 9f b8 00 00 00 48 0f c9 48 31 cb 41
f6 c0
[ 4102.999073] RSP: 0018:9764e0057b40 EFLAGS: 00010202
[ 4103.004291] RAX: 5f775ce3bd949b45 RBX: 0dc0 RCX: 0046
[ 4103.011408] RDX: 0002cf87600c RSI: 0dc0 RDI: 5f775ce3bd949b15
[ 4103.018528] RBP: 0dc0 R08: 000390c0 R09: 30302d6d
[ 4103.025649] R10: 756c7473 R11: 20090298 R12: 
[ 4103.032767] R13:  R14: 0046 R15: 8bda80042600
[ 4103.039887] FS:  7f386a85ef00() GS:8be1df70()
knlGS:
[ 4103.047958] CS:  0010 DS:  ES:  CR0: 80050033
[ 4103.053692] CR2: 0493b868 CR3: 00014c3ba000 CR4: 00f50ee0
[ 4103.060812] PKRU: 5554
[ 4103.063520] Call Trace:
[ 4103.065970]  
[ 4103.068071]  ? die_addr+0x36/0x90
[ 4103.071384]  ? exc_general_protection+0x1be/0x420
[ 4103.076081]  ? asm_exc_general_protection+0x26/0x30
[ 4103.080952]  ? __kmem_cache_alloc_node+0x1ba/0x320
[ 4103.085734]  ? ext4_htree_store_dirent+0x42/0x180
[ 4103.090431]  ? ext4_htree_store_dirent+0x42/0x180
[ 4103.095132]  __kmalloc+0x4d/0x150
[ 4103.098444]  ext4_htree_store_dirent+0x42/0x180
[ 4103.102970]  htree_dirblock_to_tree+0x1ed/0x370
[ 4103.107494]  ext4_htree_fill_tree+0x109/0x3d0
[ 4103.111846]  ext4_readdir+0x6d4/0xa80
[ 4103.115505]  iterate_dir+0x178/0x1c0
[ 4103.119076]  __x64_sys_getdents64+0x88/0x130
[ 4103.123341]  ? __pfx_filldir64+0x10/0x10
[ 4103.127260]  do_syscall_64+0x5d/0x90
[ 4103.130835]  ? handle_mm_fault+0x11e/0x310
[ 4103.134927]  ? do_user_addr_fault+0x1e0/0x720
[ 4103.139278]  ? exc_page_fault+0x7c/0x180
[ 4103.143195]  entry_SYSCALL_64_after_hwframe+0x72/0xdc
[ 4103.148240] RIP: 0033:0x7f386a418047
[ 4103.151828] Code: 24 fb ff 4c 89 e0 5b 41 5c 5d c3 0f 1f 84 00 00
00 00 00 f3 0f 1e fa b8 ff ff ff 7f 48 39 c2 48 0f 47 d0 b8 d9 00 00
00 0f 05 <48> 3d 00 f0 ff ff 77 01 c3 48 8b 15 91 cd 0f 00 f7 d8 64 89
02 48
[ 4103.170543] RSP: 002b:7ffd4793ff38 EFLAGS: 0293 ORIG_RAX:
00d9
[ 4103.178095] RAX: ffda RBX: 04933830 RCX: 7f386a418047
[ 4103.185214] RDX: 8000 RSI: 04933860 RDI: 

Re: [PATCH 3/3] drm/amdgpu: Remove unused NBIO interface

2023-06-13 Thread Deucher, Alexander
[Public]

Sorry, replied to the wrong rev of the patch.  my AB applies to v2 as well.

Alex

From: amd-gfx  on behalf of Deucher, 
Alexander 
Sent: Tuesday, June 13, 2023 9:25 AM
To: Lazar, Lijo ; amd-gfx@lists.freedesktop.org 

Cc: Ma, Le ; Kamal, Asad ; Zhang, Hawking 

Subject: Re: [PATCH 3/3] drm/amdgpu: Remove unused NBIO interface


[Public]


[Public]

Series is:
Acked-by: Alex Deucher 

From: Lazar, Lijo 
Sent: Tuesday, June 13, 2023 6:53 AM
To: amd-gfx@lists.freedesktop.org 
Cc: Zhang, Hawking ; Deucher, Alexander 
; Kamal, Asad ; Ma, Le 

Subject: [PATCH 3/3] drm/amdgpu: Remove unused NBIO interface

Set compute partition mode interface in NBIO is no longer used. Remove
the only implementation from NBIO v7.9

Signed-off-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h |  2 --
 drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c   | 14 --
 2 files changed, 16 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
index 095aecfb201e..8ab8ae01f87c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
@@ -99,8 +99,6 @@ struct amdgpu_nbio_funcs {
 int (*get_compute_partition_mode)(struct amdgpu_device *adev);
 u32 (*get_memory_partition_mode)(struct amdgpu_device *adev,
  u32 *supp_modes);
-   void (*set_compute_partition_mode)(struct amdgpu_device *adev,
-  enum amdgpu_gfx_partition mode);
 };

 struct amdgpu_nbio {
diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c 
b/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c
index b033935d6749..cd1a02d30420 100644
--- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c
+++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c
@@ -393,19 +393,6 @@ static int nbio_v7_9_get_compute_partition_mode(struct 
amdgpu_device *adev)
 return px;
 }

-static void nbio_v7_9_set_compute_partition_mode(struct amdgpu_device *adev,
-   enum amdgpu_gfx_partition mode)
-{
-   u32 tmp;
-
-   /* SPX=0, DPX=1, TPX=2, QPX=3, CPX=4 */
-   tmp = RREG32_SOC15(NBIO, 0, regBIF_BX_PF0_PARTITION_COMPUTE_STATUS);
-   tmp = REG_SET_FIELD(tmp, BIF_BX_PF0_PARTITION_COMPUTE_STATUS,
-   PARTITION_MODE, mode);
-
-   WREG32_SOC15(NBIO, 0, regBIF_BX_PF0_PARTITION_COMPUTE_STATUS, tmp);
-}
-
 static u32 nbio_v7_9_get_memory_partition_mode(struct amdgpu_device *adev,
u32 *supp_modes)
 {
@@ -461,7 +448,6 @@ const struct amdgpu_nbio_funcs nbio_v7_9_funcs = {
 .ih_control = nbio_v7_9_ih_control,
 .remap_hdp_registers = nbio_v7_9_remap_hdp_registers,
 .get_compute_partition_mode = nbio_v7_9_get_compute_partition_mode,
-   .set_compute_partition_mode = nbio_v7_9_set_compute_partition_mode,
 .get_memory_partition_mode = nbio_v7_9_get_memory_partition_mode,
 .init_registers = nbio_v7_9_init_registers,
 };
--
2.25.1



Re: [PATCH 3/3] drm/amdgpu: Remove unused NBIO interface

2023-06-13 Thread Deucher, Alexander
[Public]

Series is:
Acked-by: Alex Deucher 

From: Lazar, Lijo 
Sent: Tuesday, June 13, 2023 6:53 AM
To: amd-gfx@lists.freedesktop.org 
Cc: Zhang, Hawking ; Deucher, Alexander 
; Kamal, Asad ; Ma, Le 

Subject: [PATCH 3/3] drm/amdgpu: Remove unused NBIO interface

Set compute partition mode interface in NBIO is no longer used. Remove
the only implementation from NBIO v7.9

Signed-off-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h |  2 --
 drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c   | 14 --
 2 files changed, 16 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
index 095aecfb201e..8ab8ae01f87c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
@@ -99,8 +99,6 @@ struct amdgpu_nbio_funcs {
 int (*get_compute_partition_mode)(struct amdgpu_device *adev);
 u32 (*get_memory_partition_mode)(struct amdgpu_device *adev,
  u32 *supp_modes);
-   void (*set_compute_partition_mode)(struct amdgpu_device *adev,
-  enum amdgpu_gfx_partition mode);
 };

 struct amdgpu_nbio {
diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c 
b/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c
index b033935d6749..cd1a02d30420 100644
--- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c
+++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c
@@ -393,19 +393,6 @@ static int nbio_v7_9_get_compute_partition_mode(struct 
amdgpu_device *adev)
 return px;
 }

-static void nbio_v7_9_set_compute_partition_mode(struct amdgpu_device *adev,
-   enum amdgpu_gfx_partition mode)
-{
-   u32 tmp;
-
-   /* SPX=0, DPX=1, TPX=2, QPX=3, CPX=4 */
-   tmp = RREG32_SOC15(NBIO, 0, regBIF_BX_PF0_PARTITION_COMPUTE_STATUS);
-   tmp = REG_SET_FIELD(tmp, BIF_BX_PF0_PARTITION_COMPUTE_STATUS,
-   PARTITION_MODE, mode);
-
-   WREG32_SOC15(NBIO, 0, regBIF_BX_PF0_PARTITION_COMPUTE_STATUS, tmp);
-}
-
 static u32 nbio_v7_9_get_memory_partition_mode(struct amdgpu_device *adev,
u32 *supp_modes)
 {
@@ -461,7 +448,6 @@ const struct amdgpu_nbio_funcs nbio_v7_9_funcs = {
 .ih_control = nbio_v7_9_ih_control,
 .remap_hdp_registers = nbio_v7_9_remap_hdp_registers,
 .get_compute_partition_mode = nbio_v7_9_get_compute_partition_mode,
-   .set_compute_partition_mode = nbio_v7_9_set_compute_partition_mode,
 .get_memory_partition_mode = nbio_v7_9_get_memory_partition_mode,
 .init_registers = nbio_v7_9_init_registers,
 };
--
2.25.1



Re: [PATCH] drm/amdgpu: update external rev_id for gc_11_0_1 and gc_11_0_4

2023-06-13 Thread Deucher, Alexander
[Public]

Acked-by: Alex Deucher 

From: amd-gfx  on behalf of Aaron Liu 

Sent: Monday, June 12, 2023 11:09 PM
To: amd-gfx@lists.freedesktop.org 
Cc: Deucher, Alexander ; Zhang, Yifan 
; Liu, Aaron 
Subject: [PATCH] drm/amdgpu: update external rev_id for gc_11_0_1 and gc_11_0_4

For gc_11_0_1, the external rev_id of A0/A1 series is 0x1,
the external rev_id of A2 is 0x10.

Signed-off-by: Aaron Liu 
---
 drivers/gpu/drm/amd/amdgpu/soc21.c | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/soc21.c 
b/drivers/gpu/drm/amd/amdgpu/soc21.c
index e5e5d68a4d70..caaf9da4c1c0 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc21.c
+++ b/drivers/gpu/drm/amd/amdgpu/soc21.c
@@ -665,7 +665,10 @@ static int soc21_common_early_init(void *handle)
 AMD_PG_SUPPORT_VCN |
 AMD_PG_SUPPORT_VCN_DPG |
 AMD_PG_SUPPORT_JPEG;
-   adev->external_rev_id = adev->rev_id + 0x1;
+   if (adev->rev_id < 0xA)
+   adev->external_rev_id = 0x1;
+   else
+   adev->external_rev_id = 0x10;
 break;
 case IP_VERSION(11, 0, 3):
 adev->cg_flags = AMD_CG_SUPPORT_VCN_MGCG |
@@ -705,7 +708,7 @@ static int soc21_common_early_init(void *handle)
 AMD_PG_SUPPORT_VCN_DPG |
 AMD_PG_SUPPORT_GFX_PG |
 AMD_PG_SUPPORT_JPEG;
-   adev->external_rev_id = adev->rev_id + 0x80;
+   adev->external_rev_id = 0x80;
 break;

 default:
--
2.39.0



Re: [PATCH] drm/amdkfd: Remove DUMMY_VRAM_SIZE

2023-06-13 Thread Deucher, Alexander
[Public]

Acked-by: Alex Deucher 

From: amd-gfx  on behalf of Mukul Joshi 

Sent: Monday, June 12, 2023 7:06 PM
To: amd-gfx@lists.freedesktop.org 
Cc: Joshi, Mukul ; Kuehling, Felix 
Subject: [PATCH] drm/amdkfd: Remove DUMMY_VRAM_SIZE

Remove DUMMY_VRAM_SIZE as it is not needed and can result
in reporting incorrect memory size.

Signed-off-by: Mukul Joshi 
---
 drivers/gpu/drm/amd/amdkfd/kfd_crat.c | 5 -
 1 file changed, 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
index 3dcd8f8bc98e..49f40d9f16e8 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
@@ -30,9 +30,6 @@
 #include "amdgpu.h"
 #include "amdgpu_amdkfd.h"

-/* Fixme: Fake 32GB for 1PNPS1 mode bringup */
-#define DUMMY_VRAM_SIZE 31138512896
-
 /* GPU Processor ID base for dGPUs for which VCRAT needs to be created.
  * GPU processor ID are expressed with Bit[31]=1.
  * The base is set to 0x8000_ + 0x1000 to avoid collision with GPU IDs
@@ -1056,8 +1053,6 @@ static int kfd_parse_subtype_mem(struct 
crat_subtype_memory *mem,

 props->heap_type = heap_type;
 props->flags = flags;
-   if (size_in_bytes == 0)
-   size_in_bytes = DUMMY_VRAM_SIZE; /* Fixme: TBD 
*/
 props->size_in_bytes = size_in_bytes;
 props->width = width;

--
2.35.1



Re: [PATCH 3/9] drm/ttm: use per BO cleanup workers

2023-06-13 Thread Karol Herbst
On Mon, Dec 5, 2022 at 2:40 PM Christian König  wrote:
>
> Am 29.11.22 um 22:14 schrieb Felix Kuehling:
> > On 2022-11-25 05:21, Christian König wrote:
> >> Instead of a single worker going over the list of delete BOs in regular
> >> intervals use a per BO worker which blocks for the resv object and
> >> locking of the BO.
> >>
> >> This not only simplifies the handling massively, but also results in
> >> much better response time when cleaning up buffers.
> >>
> >> Signed-off-by: Christian König 
> >
> > Just thinking out loud: If I understand it correctly, this can cause a
> > lot of sleeping worker threads when
> > AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE is used and many BOs are freed
> > at the same time. This happens e.g. when a KFD process terminates or
> > crashes. I guess with a concurrency-managed workqueue this isn't going
> > to be excessive. And since it's on a per device workqueue, it doesn't
> > stall work items on the system work queue or from other devices.
>
> Yes, exactly that. The last parameter to alloc_workqueue() limits how
> many work items can be sleeping.
>
> > I'm trying to understand why you set WQ_MEM_RECLAIM. This work queue
> > is not about freeing ttm_resources but about freeing the BOs. But it
> > affects freeing of ghost_objs that are holding the ttm_resources being
> > freed.
>
> Well if the BO is idle, but not immediately lockable we delegate freeing
> the backing pages in the TT object to those workers as well. It might
> even be a good idea to use a separate wq for this case.
>
> >
> > If those assumptions all make sense, patches 1-3 are
> >
> > Reviewed-by: Felix Kuehling 
>
> Thanks,
> Christian.
>

This patch causes a heap use-after-free when using nouveau with the
potential of trashing filesystems, is there a way to revert it until
we figure out a proper solution to the problem?

Bug: https://gitlab.freedesktop.org/drm/nouveau/-/issues/213

example trace on affected systems:

[ 4102.946946] general protection fault, probably for non-canonical
address 0x5f775ce3bd949b45:  [#3] PREEMPT SMP NOPTI
[ 4102.957794] CPU: 12 PID: 89561 Comm: glcts Tainted: G  D
6.3.5-200.fc38.x86_64 #1
[ 4102.966556] Hardware name: ASUS System Product Name/PRIME B660-PLUS
D4, BIOS 0418 10/13/2021
[ 4102.974972] RIP: 0010:__kmem_cache_alloc_node+0x1ba/0x320
[ 4102.980362] Code: 2b 14 25 28 00 00 00 0f 85 74 01 00 00 48 83 c4
18 5b 5d 41 5c 41 5d 41 5e 41 5f c3 cc cc cc cc 41 8b 47 28 4d 8b 07
48 01 f8 <48> 8b 18 48 89 c1 49 33 9f b8 00 00 00 48 0f c9 48 31 cb 41
f6 c0
[ 4102.999073] RSP: 0018:9764e0057b40 EFLAGS: 00010202
[ 4103.004291] RAX: 5f775ce3bd949b45 RBX: 0dc0 RCX: 0046
[ 4103.011408] RDX: 0002cf87600c RSI: 0dc0 RDI: 5f775ce3bd949b15
[ 4103.018528] RBP: 0dc0 R08: 000390c0 R09: 30302d6d
[ 4103.025649] R10: 756c7473 R11: 20090298 R12: 
[ 4103.032767] R13:  R14: 0046 R15: 8bda80042600
[ 4103.039887] FS:  7f386a85ef00() GS:8be1df70()
knlGS:
[ 4103.047958] CS:  0010 DS:  ES:  CR0: 80050033
[ 4103.053692] CR2: 0493b868 CR3: 00014c3ba000 CR4: 00f50ee0
[ 4103.060812] PKRU: 5554
[ 4103.063520] Call Trace:
[ 4103.065970]  
[ 4103.068071]  ? die_addr+0x36/0x90
[ 4103.071384]  ? exc_general_protection+0x1be/0x420
[ 4103.076081]  ? asm_exc_general_protection+0x26/0x30
[ 4103.080952]  ? __kmem_cache_alloc_node+0x1ba/0x320
[ 4103.085734]  ? ext4_htree_store_dirent+0x42/0x180
[ 4103.090431]  ? ext4_htree_store_dirent+0x42/0x180
[ 4103.095132]  __kmalloc+0x4d/0x150
[ 4103.098444]  ext4_htree_store_dirent+0x42/0x180
[ 4103.102970]  htree_dirblock_to_tree+0x1ed/0x370
[ 4103.107494]  ext4_htree_fill_tree+0x109/0x3d0
[ 4103.111846]  ext4_readdir+0x6d4/0xa80
[ 4103.115505]  iterate_dir+0x178/0x1c0
[ 4103.119076]  __x64_sys_getdents64+0x88/0x130
[ 4103.123341]  ? __pfx_filldir64+0x10/0x10
[ 4103.127260]  do_syscall_64+0x5d/0x90
[ 4103.130835]  ? handle_mm_fault+0x11e/0x310
[ 4103.134927]  ? do_user_addr_fault+0x1e0/0x720
[ 4103.139278]  ? exc_page_fault+0x7c/0x180
[ 4103.143195]  entry_SYSCALL_64_after_hwframe+0x72/0xdc
[ 4103.148240] RIP: 0033:0x7f386a418047
[ 4103.151828] Code: 24 fb ff 4c 89 e0 5b 41 5c 5d c3 0f 1f 84 00 00
00 00 00 f3 0f 1e fa b8 ff ff ff 7f 48 39 c2 48 0f 47 d0 b8 d9 00 00
00 0f 05 <48> 3d 00 f0 ff ff 77 01 c3 48 8b 15 91 cd 0f 00 f7 d8 64 89
02 48
[ 4103.170543] RSP: 002b:7ffd4793ff38 EFLAGS: 0293 ORIG_RAX:
00d9
[ 4103.178095] RAX: ffda RBX: 04933830 RCX: 7f386a418047
[ 4103.185214] RDX: 8000 RSI: 04933860 RDI: 0006
[ 4103.192335] RBP: 7ffd4793ff70 R08:  R09: 0001
[ 4103.199454] R10: 0004 R11: 0293 R12: 04933834
[ 4103.206573] R13: 04933860 R14: ff60 R15: 
[ 4103.213695]  
[ 4103.215883] 

RE: [PATCH v2 2/3] drm/amdgpu: Use PSP FW API for partition switch

2023-06-13 Thread Zhang, Hawking
[AMD Official Use Only - General]

Series is

Reviewed-by: Hawking Zhang 

Regards
Hawking
-Original Message-
From: amd-gfx  On Behalf Of Lijo Lazar
Sent: Tuesday, June 13, 2023 19:03
To: amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander ; Ma, Le ; 
Kamal, Asad ; Zhang, Hawking 
Subject: [PATCH v2 2/3] drm/amdgpu: Use PSP FW API for partition switch

Use PSP firmware interface for switching compute partitions.

Signed-off-by: Lijo Lazar 
---
v2:
Changed the return value to int

 .../drm/amd/amdgpu/aqua_vanjaram_reg_init.c|  3 ---
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c| 18 ++
 2 files changed, 6 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram_reg_init.c 
b/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram_reg_init.c
index a595bb958215..16471b81a1f5 100644
--- a/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram_reg_init.c
+++ b/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram_reg_init.c
@@ -518,9 +518,6 @@ static int aqua_vanjaram_switch_partition_mode(struct 
amdgpu_xcp_mgr *xcp_mgr,
adev->gfx.funcs->switch_partition_mode(xcp_mgr->adev,
   num_xcc_per_xcp);

-   if (adev->nbio.funcs->set_compute_partition_mode)
-   adev->nbio.funcs->set_compute_partition_mode(adev, mode);
-
/* Init info about new xcps */
*num_xcps = num_xcc / num_xcc_per_xcp;
amdgpu_xcp_init(xcp_mgr, *num_xcps, mode); diff --git 
a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
index f5b8d3f388ff..c1ee54d4c3d3 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
@@ -623,22 +623,16 @@ static void gfx_v9_4_3_select_me_pipe_q(struct 
amdgpu_device *adev,  static int gfx_v9_4_3_switch_compute_partition(struct 
amdgpu_device *adev,
int num_xccs_per_xcp)
 {
-   int i, num_xcc;
-   u32 tmp = 0;
-
-   num_xcc = NUM_XCC(adev->gfx.xcc_mask);
+   int ret;

-   for (i = 0; i < num_xcc; i++) {
-   tmp = REG_SET_FIELD(tmp, CP_HYP_XCP_CTL, NUM_XCC_IN_XCP,
-   num_xccs_per_xcp);
-   tmp = REG_SET_FIELD(tmp, CP_HYP_XCP_CTL, VIRTUAL_XCC_ID,
-   i % num_xccs_per_xcp);
-   WREG32_SOC15(GC, GET_INST(GC, i), regCP_HYP_XCP_CTL, tmp);
-   }
+   ret = psp_spatial_partition(&adev->psp, NUM_XCC(adev->gfx.xcc_mask) /
+   num_xccs_per_xcp);
+   if (ret)
+   return ret;

adev->gfx.num_xcc_per_xcp = num_xccs_per_xcp;

-   return 0;
+   return ret;
 }

 static int gfx_v9_4_3_ih_to_xcc_inst(struct amdgpu_device *adev, int ih_node)
--
2.25.1



RE: [PATCH] drm/amdgpu: Release SDMAv4.4.2 ecc irq properly

2023-06-13 Thread Zhang, Hawking
[AMD Official Use Only - General]

Reviewed-by: Hawking Zhang 

Regards,
Hawking
-Original Message-
From: Lazar, Lijo 
Sent: Tuesday, June 13, 2023 18:46
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Deucher, Alexander 
; Kamal, Asad ; Ma, Le 

Subject: [PATCH] drm/amdgpu: Release SDMAv4.4.2 ecc irq properly

Release ECC irq only if irq is enabled - only when RAS feature is enabled ECC 
irq gets enabled.

Signed-off-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 17 +++--
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
index 8eebf9c2bbcd..77ebf27981e4 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
@@ -1434,9 +1434,11 @@ static int sdma_v4_4_2_hw_fini(void *handle)
return 0;

inst_mask = GENMASK(adev->sdma.num_instances - 1, 0);
-   for (i = 0; i < adev->sdma.num_instances; i++) {
-   amdgpu_irq_put(adev, &adev->sdma.ecc_irq,
-  AMDGPU_SDMA_IRQ_INSTANCE0 + i);
+   if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__SDMA)) {
+   for (i = 0; i < adev->sdma.num_instances; i++) {
+   amdgpu_irq_put(adev, &adev->sdma.ecc_irq,
+  AMDGPU_SDMA_IRQ_INSTANCE0 + i);
+   }
}

sdma_v4_4_2_inst_ctx_switch_enable(adev, false, inst_mask); @@ -2073,9 
+2075,12 @@ static int sdma_v4_4_2_xcp_suspend(void *handle, uint32_t inst_mask)
uint32_t tmp_mask = inst_mask;
int i;

-   for_each_inst(i, tmp_mask) {
-   amdgpu_irq_put(adev, &adev->sdma.ecc_irq,
-  AMDGPU_SDMA_IRQ_INSTANCE0 + i);
+   if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__SDMA)) {
+   for_each_inst(i, tmp_mask)
+   {
+   amdgpu_irq_put(adev, &adev->sdma.ecc_irq,
+  AMDGPU_SDMA_IRQ_INSTANCE0 + i);
+   }
}

sdma_v4_4_2_inst_ctx_switch_enable(adev, false, inst_mask);
--
2.25.1



[PATCH v2 2/3] drm/amdgpu: Use PSP FW API for partition switch

2023-06-13 Thread Lijo Lazar
Use PSP firmware interface for switching compute partitions.

Signed-off-by: Lijo Lazar 
---
v2:
Changed the return value to int

 .../drm/amd/amdgpu/aqua_vanjaram_reg_init.c|  3 ---
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c| 18 ++
 2 files changed, 6 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram_reg_init.c 
b/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram_reg_init.c
index a595bb958215..16471b81a1f5 100644
--- a/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram_reg_init.c
+++ b/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram_reg_init.c
@@ -518,9 +518,6 @@ static int aqua_vanjaram_switch_partition_mode(struct 
amdgpu_xcp_mgr *xcp_mgr,
adev->gfx.funcs->switch_partition_mode(xcp_mgr->adev,
   num_xcc_per_xcp);
 
-   if (adev->nbio.funcs->set_compute_partition_mode)
-   adev->nbio.funcs->set_compute_partition_mode(adev, mode);
-
/* Init info about new xcps */
*num_xcps = num_xcc / num_xcc_per_xcp;
amdgpu_xcp_init(xcp_mgr, *num_xcps, mode);
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
index f5b8d3f388ff..c1ee54d4c3d3 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
@@ -623,22 +623,16 @@ static void gfx_v9_4_3_select_me_pipe_q(struct 
amdgpu_device *adev,
 static int gfx_v9_4_3_switch_compute_partition(struct amdgpu_device *adev,
int num_xccs_per_xcp)
 {
-   int i, num_xcc;
-   u32 tmp = 0;
-
-   num_xcc = NUM_XCC(adev->gfx.xcc_mask);
+   int ret;
 
-   for (i = 0; i < num_xcc; i++) {
-   tmp = REG_SET_FIELD(tmp, CP_HYP_XCP_CTL, NUM_XCC_IN_XCP,
-   num_xccs_per_xcp);
-   tmp = REG_SET_FIELD(tmp, CP_HYP_XCP_CTL, VIRTUAL_XCC_ID,
-   i % num_xccs_per_xcp);
-   WREG32_SOC15(GC, GET_INST(GC, i), regCP_HYP_XCP_CTL, tmp);
-   }
+   ret = psp_spatial_partition(&adev->psp, NUM_XCC(adev->gfx.xcc_mask) /
+   num_xccs_per_xcp);
+   if (ret)
+   return ret;
 
adev->gfx.num_xcc_per_xcp = num_xccs_per_xcp;
 
-   return 0;
+   return ret;
 }
 
 static int gfx_v9_4_3_ih_to_xcc_inst(struct amdgpu_device *adev, int ih_node)
-- 
2.25.1



[PATCH v2 3/3] drm/amdgpu: Remove unused NBIO interface

2023-06-13 Thread Lijo Lazar
Set compute partition mode interface in NBIO is no longer used. Remove
the only implementation from NBIO v7.9

Signed-off-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h |  2 --
 drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c   | 14 --
 2 files changed, 16 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
index 095aecfb201e..8ab8ae01f87c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
@@ -99,8 +99,6 @@ struct amdgpu_nbio_funcs {
int (*get_compute_partition_mode)(struct amdgpu_device *adev);
u32 (*get_memory_partition_mode)(struct amdgpu_device *adev,
 u32 *supp_modes);
-   void (*set_compute_partition_mode)(struct amdgpu_device *adev,
-  enum amdgpu_gfx_partition mode);
 };
 
 struct amdgpu_nbio {
diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c 
b/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c
index b033935d6749..cd1a02d30420 100644
--- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c
+++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c
@@ -393,19 +393,6 @@ static int nbio_v7_9_get_compute_partition_mode(struct 
amdgpu_device *adev)
return px;
 }
 
-static void nbio_v7_9_set_compute_partition_mode(struct amdgpu_device *adev,
-   enum amdgpu_gfx_partition mode)
-{
-   u32 tmp;
-
-   /* SPX=0, DPX=1, TPX=2, QPX=3, CPX=4 */
-   tmp = RREG32_SOC15(NBIO, 0, regBIF_BX_PF0_PARTITION_COMPUTE_STATUS);
-   tmp = REG_SET_FIELD(tmp, BIF_BX_PF0_PARTITION_COMPUTE_STATUS,
-   PARTITION_MODE, mode);
-
-   WREG32_SOC15(NBIO, 0, regBIF_BX_PF0_PARTITION_COMPUTE_STATUS, tmp);
-}
-
 static u32 nbio_v7_9_get_memory_partition_mode(struct amdgpu_device *adev,
   u32 *supp_modes)
 {
@@ -461,7 +448,6 @@ const struct amdgpu_nbio_funcs nbio_v7_9_funcs = {
.ih_control = nbio_v7_9_ih_control,
.remap_hdp_registers = nbio_v7_9_remap_hdp_registers,
.get_compute_partition_mode = nbio_v7_9_get_compute_partition_mode,
-   .set_compute_partition_mode = nbio_v7_9_set_compute_partition_mode,
.get_memory_partition_mode = nbio_v7_9_get_memory_partition_mode,
.init_registers = nbio_v7_9_init_registers,
 };
-- 
2.25.1



[PATCH v2 1/3] drm/amdgpu: Change nbio v7.9 xcp status definition

2023-06-13 Thread Lijo Lazar
PARTITION_MODE field in PARTITION_COMPUTE_STATUS register is defined as
below by firmware.

SPX = 0, DPX = 1, TPX = 2, QPX = 3, CPX = 4

Change driver definition accordingly.

Signed-off-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c | 8 +++-
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c 
b/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c
index d19325476752..b033935d6749 100644
--- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c
+++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c
@@ -390,7 +390,7 @@ static int nbio_v7_9_get_compute_partition_mode(struct 
amdgpu_device *adev)
px = REG_GET_FIELD(tmp, BIF_BX_PF0_PARTITION_COMPUTE_STATUS,
   PARTITION_MODE);
 
-   return ffs(px);
+   return px;
 }
 
 static void nbio_v7_9_set_compute_partition_mode(struct amdgpu_device *adev,
@@ -398,12 +398,10 @@ static void nbio_v7_9_set_compute_partition_mode(struct 
amdgpu_device *adev,
 {
u32 tmp;
 
-   /* Each bit represents DPX,TPX,QPX,CPX mode. No bit set means default
-* SPX mode.
-*/
+   /* SPX=0, DPX=1, TPX=2, QPX=3, CPX=4 */
tmp = RREG32_SOC15(NBIO, 0, regBIF_BX_PF0_PARTITION_COMPUTE_STATUS);
tmp = REG_SET_FIELD(tmp, BIF_BX_PF0_PARTITION_COMPUTE_STATUS,
-   PARTITION_MODE, mode ? BIT(mode - 1) : mode);
+   PARTITION_MODE, mode);
 
WREG32_SOC15(NBIO, 0, regBIF_BX_PF0_PARTITION_COMPUTE_STATUS, tmp);
 }
-- 
2.25.1



[PATCH 2/3] drm/amdgpu: Use PSP FW API for partition switch

2023-06-13 Thread Lijo Lazar
Use PSP firmware interface for switching compute partitions.

Signed-off-by: Lijo Lazar 
---
 .../drm/amd/amdgpu/aqua_vanjaram_reg_init.c|  3 ---
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c| 18 ++
 2 files changed, 6 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram_reg_init.c 
b/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram_reg_init.c
index a595bb958215..16471b81a1f5 100644
--- a/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram_reg_init.c
+++ b/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram_reg_init.c
@@ -518,9 +518,6 @@ static int aqua_vanjaram_switch_partition_mode(struct 
amdgpu_xcp_mgr *xcp_mgr,
adev->gfx.funcs->switch_partition_mode(xcp_mgr->adev,
   num_xcc_per_xcp);
 
-   if (adev->nbio.funcs->set_compute_partition_mode)
-   adev->nbio.funcs->set_compute_partition_mode(adev, mode);
-
/* Init info about new xcps */
*num_xcps = num_xcc / num_xcc_per_xcp;
amdgpu_xcp_init(xcp_mgr, *num_xcps, mode);
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
index f5b8d3f388ff..9e3e4fcf344d 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
@@ -623,22 +623,16 @@ static void gfx_v9_4_3_select_me_pipe_q(struct 
amdgpu_device *adev,
 static int gfx_v9_4_3_switch_compute_partition(struct amdgpu_device *adev,
int num_xccs_per_xcp)
 {
-   int i, num_xcc;
-   u32 tmp = 0;
-
-   num_xcc = NUM_XCC(adev->gfx.xcc_mask);
+   u32 ret;
 
-   for (i = 0; i < num_xcc; i++) {
-   tmp = REG_SET_FIELD(tmp, CP_HYP_XCP_CTL, NUM_XCC_IN_XCP,
-   num_xccs_per_xcp);
-   tmp = REG_SET_FIELD(tmp, CP_HYP_XCP_CTL, VIRTUAL_XCC_ID,
-   i % num_xccs_per_xcp);
-   WREG32_SOC15(GC, GET_INST(GC, i), regCP_HYP_XCP_CTL, tmp);
-   }
+   ret = psp_spatial_partition(&adev->psp, NUM_XCC(adev->gfx.xcc_mask) /
+   num_xccs_per_xcp);
+   if (ret)
+   return ret;
 
adev->gfx.num_xcc_per_xcp = num_xccs_per_xcp;
 
-   return 0;
+   return ret;
 }
 
 static int gfx_v9_4_3_ih_to_xcc_inst(struct amdgpu_device *adev, int ih_node)
-- 
2.25.1



[PATCH 3/3] drm/amdgpu: Remove unused NBIO interface

2023-06-13 Thread Lijo Lazar
Set compute partition mode interface in NBIO is no longer used. Remove
the only implementation from NBIO v7.9

Signed-off-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h |  2 --
 drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c   | 14 --
 2 files changed, 16 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
index 095aecfb201e..8ab8ae01f87c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
@@ -99,8 +99,6 @@ struct amdgpu_nbio_funcs {
int (*get_compute_partition_mode)(struct amdgpu_device *adev);
u32 (*get_memory_partition_mode)(struct amdgpu_device *adev,
 u32 *supp_modes);
-   void (*set_compute_partition_mode)(struct amdgpu_device *adev,
-  enum amdgpu_gfx_partition mode);
 };
 
 struct amdgpu_nbio {
diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c 
b/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c
index b033935d6749..cd1a02d30420 100644
--- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c
+++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c
@@ -393,19 +393,6 @@ static int nbio_v7_9_get_compute_partition_mode(struct 
amdgpu_device *adev)
return px;
 }
 
-static void nbio_v7_9_set_compute_partition_mode(struct amdgpu_device *adev,
-   enum amdgpu_gfx_partition mode)
-{
-   u32 tmp;
-
-   /* SPX=0, DPX=1, TPX=2, QPX=3, CPX=4 */
-   tmp = RREG32_SOC15(NBIO, 0, regBIF_BX_PF0_PARTITION_COMPUTE_STATUS);
-   tmp = REG_SET_FIELD(tmp, BIF_BX_PF0_PARTITION_COMPUTE_STATUS,
-   PARTITION_MODE, mode);
-
-   WREG32_SOC15(NBIO, 0, regBIF_BX_PF0_PARTITION_COMPUTE_STATUS, tmp);
-}
-
 static u32 nbio_v7_9_get_memory_partition_mode(struct amdgpu_device *adev,
   u32 *supp_modes)
 {
@@ -461,7 +448,6 @@ const struct amdgpu_nbio_funcs nbio_v7_9_funcs = {
.ih_control = nbio_v7_9_ih_control,
.remap_hdp_registers = nbio_v7_9_remap_hdp_registers,
.get_compute_partition_mode = nbio_v7_9_get_compute_partition_mode,
-   .set_compute_partition_mode = nbio_v7_9_set_compute_partition_mode,
.get_memory_partition_mode = nbio_v7_9_get_memory_partition_mode,
.init_registers = nbio_v7_9_init_registers,
 };
-- 
2.25.1



[PATCH 1/3] drm/amdgpu: Change nbio v7.9 xcp status definition

2023-06-13 Thread Lijo Lazar
PARTITION_MODE field in PARTITION_COMPUTE_STATUS register is defined as
below by firmware.

SPX = 0, DPX = 1, TPX = 2, QPX = 3, CPX = 4

Change driver definition accordingly.

Signed-off-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c | 8 +++-
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c 
b/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c
index d19325476752..b033935d6749 100644
--- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c
+++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c
@@ -390,7 +390,7 @@ static int nbio_v7_9_get_compute_partition_mode(struct 
amdgpu_device *adev)
px = REG_GET_FIELD(tmp, BIF_BX_PF0_PARTITION_COMPUTE_STATUS,
   PARTITION_MODE);
 
-   return ffs(px);
+   return px;
 }
 
 static void nbio_v7_9_set_compute_partition_mode(struct amdgpu_device *adev,
@@ -398,12 +398,10 @@ static void nbio_v7_9_set_compute_partition_mode(struct 
amdgpu_device *adev,
 {
u32 tmp;
 
-   /* Each bit represents DPX,TPX,QPX,CPX mode. No bit set means default
-* SPX mode.
-*/
+   /* SPX=0, DPX=1, TPX=2, QPX=3, CPX=4 */
tmp = RREG32_SOC15(NBIO, 0, regBIF_BX_PF0_PARTITION_COMPUTE_STATUS);
tmp = REG_SET_FIELD(tmp, BIF_BX_PF0_PARTITION_COMPUTE_STATUS,
-   PARTITION_MODE, mode ? BIT(mode - 1) : mode);
+   PARTITION_MODE, mode);
 
WREG32_SOC15(NBIO, 0, regBIF_BX_PF0_PARTITION_COMPUTE_STATUS, tmp);
 }
-- 
2.25.1



[PATCH] drm/amdgpu: Release SDMAv4.4.2 ecc irq properly

2023-06-13 Thread Lijo Lazar
Release ECC irq only if irq is enabled - only when RAS feature is enabled
ECC irq gets enabled.

Signed-off-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 17 +++--
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
index 8eebf9c2bbcd..77ebf27981e4 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
@@ -1434,9 +1434,11 @@ static int sdma_v4_4_2_hw_fini(void *handle)
return 0;
 
inst_mask = GENMASK(adev->sdma.num_instances - 1, 0);
-   for (i = 0; i < adev->sdma.num_instances; i++) {
-   amdgpu_irq_put(adev, &adev->sdma.ecc_irq,
-  AMDGPU_SDMA_IRQ_INSTANCE0 + i);
+   if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__SDMA)) {
+   for (i = 0; i < adev->sdma.num_instances; i++) {
+   amdgpu_irq_put(adev, &adev->sdma.ecc_irq,
+  AMDGPU_SDMA_IRQ_INSTANCE0 + i);
+   }
}
 
sdma_v4_4_2_inst_ctx_switch_enable(adev, false, inst_mask);
@@ -2073,9 +2075,12 @@ static int sdma_v4_4_2_xcp_suspend(void *handle, 
uint32_t inst_mask)
uint32_t tmp_mask = inst_mask;
int i;
 
-   for_each_inst(i, tmp_mask) {
-   amdgpu_irq_put(adev, &adev->sdma.ecc_irq,
-  AMDGPU_SDMA_IRQ_INSTANCE0 + i);
+   if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__SDMA)) {
+   for_each_inst(i, tmp_mask)
+   {
+   amdgpu_irq_put(adev, &adev->sdma.ecc_irq,
+  AMDGPU_SDMA_IRQ_INSTANCE0 + i);
+   }
}
 
sdma_v4_4_2_inst_ctx_switch_enable(adev, false, inst_mask);
-- 
2.25.1



RE: [PATCH v5] drm/dp_mst: Clear MSG_RDY flag before sending new message

2023-06-13 Thread Lin, Wayne
[AMD Official Use Only - General]

Noted. Thanks, Lyude!

Regards,
Wayne Lin

> -Original Message-
> From: Lyude Paul 
> Sent: Tuesday, June 13, 2023 6:34 AM
> To: Lin, Wayne ; dri-de...@lists.freedesktop.org;
> amd-gfx@lists.freedesktop.org
> Cc: ville.syrj...@linux.intel.com; jani.nik...@intel.com; imre.d...@intel.com;
> Wentland, Harry ; Zuo, Jerry
> ; sta...@vger.kernel.org
> Subject: Re: [PATCH v5] drm/dp_mst: Clear MSG_RDY flag before sending new
> message
>
> FWIW: Should have a response to this very soon, figured out the cause of my
> MST issues so I should be able to test this very soon
>
> On Fri, 2023-06-09 at 18:49 +0800, Wayne Lin wrote:
> > [Why]
> > The sequence for collecting down_reply from source perspective should
> > be:
> >
> > Request_n->repeat (get partial reply of Request_n->clear message ready
> > flag to ack DPRX that the message is received) till all partial
> > replies for Request_n are received->new Request_n+1.
> >
> > Now there is chance that drm_dp_mst_hpd_irq() will fire new down
> > request in the tx queue when the down reply is incomplete. Source is
> > restricted to generate interveleaved message transactions so we should
> > avoid it.
> >
> > Also, while assembling partial reply packets, reading out DPCD
> > DOWN_REP Sideband MSG buffer + clearing DOWN_REP_MSG_RDY flag
> should
> > be wrapped up as a complete operation for reading out a reply packet.
> > Kicking off a new request before clearing DOWN_REP_MSG_RDY flag might
> > be risky. e.g. If the reply of the new request has overwritten the
> > DPRX DOWN_REP Sideband MSG buffer before source writing one to clear
> > DOWN_REP_MSG_RDY flag, source then unintentionally flushes the reply
> > for the new request. Should handle the up request in the same way.
> >
> > [How]
> > Separete drm_dp_mst_hpd_irq() into 2 steps. After acking the MST IRQ
> > event, driver calls drm_dp_mst_hpd_irq_send_new_request() and might
> > trigger drm_dp_mst_kick_tx() only when there is no on going message
> > transaction.
> >
> > Changes since v1:
> > * Reworked on review comments received
> > -> Adjust the fix to let driver explicitly kick off new down request
> > when mst irq event is handled and acked
> > -> Adjust the commit message
> >
> > Changes since v2:
> > * Adjust the commit message
> > * Adjust the naming of the divided 2 functions and add a new input
> >   parameter "ack".
> > * Adjust code flow as per review comments.
> >
> > Changes since v3:
> > * Update the function description of drm_dp_mst_hpd_irq_handle_event
> >
> > Changes since v4:
> > * Change ack of drm_dp_mst_hpd_irq_handle_event() to be an array align
> >   the size of esi[]
> >
> > Signed-off-by: Wayne Lin 
> > Cc: sta...@vger.kernel.org
> > ---
> >  .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 32 +--
> > drivers/gpu/drm/display/drm_dp_mst_topology.c | 54
> ---
> >  drivers/gpu/drm/i915/display/intel_dp.c   |  7 +--
> >  drivers/gpu/drm/nouveau/dispnv50/disp.c   | 12 +++--
> >  include/drm/display/drm_dp_mst_helper.h   |  7 ++-
> >  5 files changed, 81 insertions(+), 31 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> > b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> > index d5cec03eaa8d..ec629b4037e4 100644
> > --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> > +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> > @@ -3263,6 +3263,7 @@ static void dm_handle_mst_sideband_msg(struct
> > amdgpu_dm_connector *aconnector)
> >
> > while (dret == dpcd_bytes_to_read &&
> > process_count < max_process_count) {
> > +   u8 ack[DP_PSR_ERROR_STATUS - DP_SINK_COUNT_ESI] = {};
> > u8 retry;
> > dret = 0;
> >
> > @@ -3271,28 +3272,29 @@ static void
> dm_handle_mst_sideband_msg(struct amdgpu_dm_connector *aconnector)
> > DRM_DEBUG_DRIVER("ESI %02x %02x %02x\n", esi[0],
> esi[1], esi[2]);
> > /* handle HPD short pulse irq */
> > if (aconnector->mst_mgr.mst_state)
> > -   drm_dp_mst_hpd_irq(
> > -   >mst_mgr,
> > -   esi,
> > -   _irq_handled);
> > +   drm_dp_mst_hpd_irq_handle_event(
> >mst_mgr,
> > +   esi,
> > +   ack,
> > +   _irq_handled);
> >
> > if (new_irq_handled) {
> > /* ACK at DPCD to notify down stream */
> > -   const int ack_dpcd_bytes_to_write =
> > -   dpcd_bytes_to_read - 1;
> > -
> > for (retry = 0; retry < 3; retry++) {
> > -   u8 wret;
> > -
> > -   wret = drm_dp_dpcd_write(
> > -   >dm_dp_aux.aux,
> > -   dpcd_addr + 1,
> > -

RE: [PATCH] drm/amdgpu: add wait_for helper for spirom update

2023-06-13 Thread Zhang, Hawking
[AMD Official Use Only - General]

Reviewed-by: Hawking Zhang 

Regards,
Hawking
-Original Message-
From: amd-gfx  On Behalf Of Gao, Likun
Sent: Tuesday, June 13, 2023 15:37
To: amd-gfx list 
Subject: [PATCH] drm/amdgpu: add wait_for helper for spirom update

[AMD Official Use Only - General]

[AMD Official Use Only - General]

From: Likun Gao 
Sent: Tuesday, June 13, 2023 3:29 PM
To: brahma_sw_dev 
Cc: Zhang, Hawking ; Gao, Likun 
Subject: [PATCH] drm/amdgpu: add wait_for helper for spirom update

From: Likun Gao 

Spirom update typically requires extremely long duration for command execution, 
and special helper function to wait for it's completion.

Signed-off-by: Likun Gao 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 20   
drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h |  2 ++  
drivers/gpu/drm/amd/amdgpu/psp_v13_0.c  |  9 +  
drivers/gpu/drm/amd/amdgpu/psp_v13_0.h  |  2 ++
 4 files changed, 29 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index a39d4ddf7743..fa06da014473 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -560,6 +560,26 @@ int psp_wait_for(struct psp_context *psp, uint32_t 
reg_index,
return -ETIME;
 }

+int psp_wait_for_spirom_update(struct psp_context *psp, uint32_t reg_index,
+  uint32_t reg_val, uint32_t mask, uint32_t 
msec_timeout) {
+   uint32_t val;
+   int i;
+   struct amdgpu_device *adev = psp->adev;
+
+   if (psp->adev->no_hw_access)
+   return 0;
+
+   for (i = 0; i < msec_timeout; i++) {
+   val = RREG32(reg_index);
+   if ((val & mask) == reg_val)
+   return 0;
+   msleep(1);
+   }
+
+   return -ETIME;
+}
+
 static const char *psp_gfx_cmd_name(enum psp_gfx_cmd_id cmd_id)  {
switch (cmd_id) {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
index cf4f60c66122..ec3f3fe5efff 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
@@ -455,6 +455,8 @@ extern const struct amdgpu_ip_block_version 
psp_v13_0_4_ip_block;

 extern int psp_wait_for(struct psp_context *psp, uint32_t reg_index,
uint32_t field_val, uint32_t mask, bool check_changed);
+extern int psp_wait_for_spirom_update(struct psp_context *psp, uint32_t 
reg_index,
+   uint32_t field_val, uint32_t mask, uint32_t 
msec_timeout);

 int psp_gpu_reset(struct amdgpu_device *adev);  int psp_update_vcn_sram(struct 
amdgpu_device *adev, int inst_idx, diff --git 
a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c 
b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
index caee76ab7110..67e216373585 100644
--- a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
@@ -624,10 +624,11 @@ static int psp_v13_0_exec_spi_cmd(struct psp_context 
*psp, int cmd)
WREG32_SOC15(MP0, 0, regMP0_SMN_C2PMSG_73, 1);

if (cmd == C2PMSG_CMD_SPI_UPDATE_FLASH_IMAGE)
-   return 0;
-
-   ret = psp_wait_for(psp, SOC15_REG_OFFSET(MP0, 0, regMP0_SMN_C2PMSG_115),
-   MBOX_READY_FLAG, MBOX_READY_MASK, false);
+   ret = psp_wait_for_spirom_update(psp, SOC15_REG_OFFSET(MP0, 0, 
regMP0_SMN_C2PMSG_115),
+MBOX_READY_FLAG, 
MBOX_READY_MASK, PSP_SPIROM_UPDATE_TIMEOUT);
+   else
+   ret = psp_wait_for(psp, SOC15_REG_OFFSET(MP0, 0, 
regMP0_SMN_C2PMSG_115),
+  MBOX_READY_FLAG, MBOX_READY_MASK, false);
if (ret) {
dev_err(adev->dev, "SPI cmd %x timed out, ret = %d", cmd, ret);
return ret;
diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.h 
b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.h
index b2414a729ca1..9eae5e23b2e5 100644
--- a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.h
+++ b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.h
@@ -25,6 +25,8 @@

 #include "amdgpu_psp.h"

+#define PSP_SPIROM_UPDATE_TIMEOUT   60000   /* 60s */
+
 void psp_v13_0_set_psp_funcs(struct psp_context *psp);

 #endif
--
2.34.1



[PATCH] drm/amdgpu: add wait_for helper for spirom update

2023-06-13 Thread Gao, Likun
[AMD Official Use Only - General]

From: Likun Gao 
Sent: Tuesday, June 13, 2023 3:29 PM
To: brahma_sw_dev 
Cc: Zhang, Hawking ; Gao, Likun 
Subject: [PATCH] drm/amdgpu: add wait_for helper for spirom update

From: Likun Gao 

Spirom update typically requires an extremely long duration for command execution, 
and a special helper function to wait for its completion.

Signed-off-by: Likun Gao 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 20   
drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h |  2 ++  
drivers/gpu/drm/amd/amdgpu/psp_v13_0.c  |  9 +  
drivers/gpu/drm/amd/amdgpu/psp_v13_0.h  |  2 ++
 4 files changed, 29 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index a39d4ddf7743..fa06da014473 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -560,6 +560,26 @@ int psp_wait_for(struct psp_context *psp, uint32_t 
reg_index,
return -ETIME;
 }

+int psp_wait_for_spirom_update(struct psp_context *psp, uint32_t reg_index,
+  uint32_t reg_val, uint32_t mask, uint32_t 
msec_timeout) {
+   uint32_t val;
+   int i;
+   struct amdgpu_device *adev = psp->adev;
+
+   if (psp->adev->no_hw_access)
+   return 0;
+
+   for (i = 0; i < msec_timeout; i++) {
+   val = RREG32(reg_index);
+   if ((val & mask) == reg_val)
+   return 0;
+   msleep(1);
+   }
+
+   return -ETIME;
+}
+
 static const char *psp_gfx_cmd_name(enum psp_gfx_cmd_id cmd_id)  {
switch (cmd_id) {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
index cf4f60c66122..ec3f3fe5efff 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
@@ -455,6 +455,8 @@ extern const struct amdgpu_ip_block_version 
psp_v13_0_4_ip_block;

 extern int psp_wait_for(struct psp_context *psp, uint32_t reg_index,
uint32_t field_val, uint32_t mask, bool check_changed);
+extern int psp_wait_for_spirom_update(struct psp_context *psp, uint32_t 
reg_index,
+   uint32_t field_val, uint32_t mask, uint32_t 
msec_timeout);

 int psp_gpu_reset(struct amdgpu_device *adev);  int psp_update_vcn_sram(struct 
amdgpu_device *adev, int inst_idx, diff --git 
a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c 
b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
index caee76ab7110..67e216373585 100644
--- a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
@@ -624,10 +624,11 @@ static int psp_v13_0_exec_spi_cmd(struct psp_context 
*psp, int cmd)
WREG32_SOC15(MP0, 0, regMP0_SMN_C2PMSG_73, 1);

if (cmd == C2PMSG_CMD_SPI_UPDATE_FLASH_IMAGE)
-   return 0;
-
-   ret = psp_wait_for(psp, SOC15_REG_OFFSET(MP0, 0, regMP0_SMN_C2PMSG_115),
-   MBOX_READY_FLAG, MBOX_READY_MASK, false);
+   ret = psp_wait_for_spirom_update(psp, SOC15_REG_OFFSET(MP0, 0, 
regMP0_SMN_C2PMSG_115),
+MBOX_READY_FLAG, 
MBOX_READY_MASK, PSP_SPIROM_UPDATE_TIMEOUT);
+   else
+   ret = psp_wait_for(psp, SOC15_REG_OFFSET(MP0, 0, 
regMP0_SMN_C2PMSG_115),
+  MBOX_READY_FLAG, MBOX_READY_MASK, false);
if (ret) {
dev_err(adev->dev, "SPI cmd %x timed out, ret = %d", cmd, ret);
return ret;
diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.h 
b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.h
index b2414a729ca1..9eae5e23b2e5 100644
--- a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.h
+++ b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.h
@@ -25,6 +25,8 @@

 #include "amdgpu_psp.h"

+#define PSP_SPIROM_UPDATE_TIMEOUT   60000   /* 60s */
+
 void psp_v13_0_set_psp_funcs(struct psp_context *psp);

 #endif
--
2.34.1



RE: [PATCH Review 1/2] drm/amdgpu: Optimze checking ras supported

2023-06-13 Thread Yang, Stanley
[AMD Official Use Only - General]

> -Original Message-
> From: Zhou1, Tao 
> Sent: Tuesday, June 13, 2023 3:08 PM
> To: Yang, Stanley ; amd-gfx@lists.freedesktop.org;
> Zhang, Hawking 
> Cc: Yang, Stanley 
> Subject: RE: [PATCH Review 1/2] drm/amdgpu: Optimze checking ras
> supported
>
> [AMD Official Use Only - General]
>
> [Tao] typo in title: Optimze -> Optimize

[Stanley]: Thanks Tao, will update before submitting.

Regards,
Stanley
>
> > -Original Message-
> > From: Stanley.Yang 
> > Sent: Tuesday, June 13, 2023 11:53 AM
> > To: amd-gfx@lists.freedesktop.org; Zhang, Hawking
> > ; Zhou1, Tao 
> > Cc: Yang, Stanley 
> > Subject: [PATCH Review 1/2] drm/amdgpu: Optimze checking ras supported
> >
> > Using "is_app_apu" to identify device in the native APU mode or carveout
> mode.
> >
> > Signed-off-by: Stanley.Yang 
> > ---
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c |  2 +-
> > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c |  8 +++---
> > drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 34 ++-
> --
> >  3 files changed, 23 insertions(+), 21 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> > index 78bacea951a9..352e958b190a 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> > @@ -1653,7 +1653,7 @@ int psp_ras_initialize(struct psp_context *psp)
> >
> >   if (amdgpu_ras_is_poison_mode_supported(adev))
> >   ras_cmd->ras_in_message.init_flags.poison_mode_en = 1;
> > - if (!adev->gmc.xgmi.connected_to_cpu)
> > + if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu)
> >   ras_cmd->ras_in_message.init_flags.dgpu_mode = 1;
> >   ras_cmd->ras_in_message.init_flags.xcc_mask =
> >   adev->gfx.xcc_mask;
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> > index 7a0924469e4f..56bb0db207b9 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> > @@ -1689,8 +1689,7 @@ static void
> > amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
> >   }
> >   }
> >
> > - if (!adev->gmc.xgmi.connected_to_cpu)
> > - amdgpu_umc_poison_handler(adev, false);
> > + amdgpu_umc_poison_handler(adev, false);
> >
> >   if (block_obj->hw_ops && block_obj->hw_ops-
> > >handle_poison_consumption)
> >   poison_stat = block_obj->hw_ops-
> > >handle_poison_consumption(adev);
> > @@ -2458,11 +2457,10 @@ static void
> amdgpu_ras_check_supported(struct
> > amdgpu_device *adev)  {
> >   adev->ras_hw_enabled = adev->ras_enabled = 0;
> >
> > - if (!adev->is_atom_fw ||
> > - !amdgpu_ras_asic_supported(adev))
> > + if (!amdgpu_ras_asic_supported(adev))
> >   return;
> >
> > - if (!adev->gmc.xgmi.connected_to_cpu) {
> > + if (!adev->gmc.xgmi.connected_to_cpu && !adev-
>
> [Tao] the tab should be replaced with space.
>
> > >gmc.is_app_apu) {
> >   if (amdgpu_atomfirmware_mem_ecc_supported(adev)) {
> >   dev_info(adev->dev, "MEM ECC is active.\n");
> >   adev->ras_hw_enabled |= (1 <<
> > AMDGPU_RAS_BLOCK__UMC | diff --git
> > a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> > index 1edf8e6aeb16..db0d94ca4ffc 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> > @@ -169,27 +169,31 @@ int amdgpu_umc_poison_handler(struct
> > amdgpu_device *adev, bool reset)  {
> >   int ret = AMDGPU_RAS_SUCCESS;
> >
> > - if (!amdgpu_sriov_vf(adev)) {
> > - if (!adev->gmc.xgmi.connected_to_cpu) {
> > - struct ras_err_data err_data = {0, 0, 0, NULL};
> > - struct ras_common_if head = {
> > - .block = AMDGPU_RAS_BLOCK__UMC,
> > - };
> > - struct ras_manager *obj = amdgpu_ras_find_obj(adev,
> > );
> > -
> > - ret = amdgpu_umc_do_page_retirement(adev,
> > _data, NULL, reset);
> > -
> > - if (ret == AMDGPU_RAS_SUCCESS && obj) {
> > - obj->err_data.ue_count += err_data.ue_count;
> > - obj->err_data.ce_count += err_data.ce_count;
> > - }
> > - } else if (reset) {
> > + if (adev->gmc.xgmi.connected_to_cpu ||
> > + adev->gmc.is_app_apu) {
> > + if (reset) {
> >   /* MCA poison handler is only responsible for GPU 
> > reset,
> >* let MCA notifier do page retirement.
> >*/
> >   kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
> >   amdgpu_ras_reset_gpu(adev);
> >   }
> > + return ret;
> > + }
> > +
> > +

RE: [PATCH Review 2/2] drm/amdgpu: Add checking mc_vram_size

2023-06-13 Thread Zhou1, Tao
[AMD Official Use Only - General]

With my concerns fixed, the series is:

Reviewed-by: Tao Zhou 

> -Original Message-
> From: Stanley.Yang 
> Sent: Tuesday, June 13, 2023 11:53 AM
> To: amd-gfx@lists.freedesktop.org; Zhang, Hawking ;
> Zhou1, Tao 
> Cc: Yang, Stanley 
> Subject: [PATCH Review 2/2] drm/amdgpu: Add checking mc_vram_size
>
> Do not compare injection address with mc_vram_size if mc_vram_size is zero.
>
> Signed-off-by: Stanley.Yang 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 56bb0db207b9..3c041efcf0c4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -494,7 +494,8 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file
> *f,
>   ret = amdgpu_ras_feature_enable(adev, , 1);
>   break;
>   case 2:
> - if ((data.inject.address >= adev->gmc.mc_vram_size) ||
> + if ((data.inject.address >= adev->gmc.mc_vram_size &&
> + adev->gmc.mc_vram_size) ||
>   (data.inject.address >= RAS_UMC_INJECT_ADDR_LIMIT)) {
>   dev_warn(adev->dev, "RAS WARN: input address "
>   "0x%llx is invalid.",
> --
> 2.17.1



RE: [PATCH Review 1/2] drm/amdgpu: Optimze checking ras supported

2023-06-13 Thread Zhou1, Tao
[AMD Official Use Only - General]

[Tao] typo in title: Optimze -> Optimize

> -Original Message-
> From: Stanley.Yang 
> Sent: Tuesday, June 13, 2023 11:53 AM
> To: amd-gfx@lists.freedesktop.org; Zhang, Hawking ;
> Zhou1, Tao 
> Cc: Yang, Stanley 
> Subject: [PATCH Review 1/2] drm/amdgpu: Optimze checking ras supported
>
> Using "is_app_apu" to identify device in the native APU mode or carveout mode.
>
> Signed-off-by: Stanley.Yang 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c |  2 +-
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c |  8 +++---
> drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 34 ++---
>  3 files changed, 23 insertions(+), 21 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> index 78bacea951a9..352e958b190a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> @@ -1653,7 +1653,7 @@ int psp_ras_initialize(struct psp_context *psp)
>
>   if (amdgpu_ras_is_poison_mode_supported(adev))
>   ras_cmd->ras_in_message.init_flags.poison_mode_en = 1;
> - if (!adev->gmc.xgmi.connected_to_cpu)
> + if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu)
>   ras_cmd->ras_in_message.init_flags.dgpu_mode = 1;
>   ras_cmd->ras_in_message.init_flags.xcc_mask =
>   adev->gfx.xcc_mask;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 7a0924469e4f..56bb0db207b9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -1689,8 +1689,7 @@ static void
> amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
>   }
>   }
>
> - if (!adev->gmc.xgmi.connected_to_cpu)
> - amdgpu_umc_poison_handler(adev, false);
> + amdgpu_umc_poison_handler(adev, false);
>
>   if (block_obj->hw_ops && block_obj->hw_ops-
> >handle_poison_consumption)
>   poison_stat = block_obj->hw_ops-
> >handle_poison_consumption(adev);
> @@ -2458,11 +2457,10 @@ static void amdgpu_ras_check_supported(struct
> amdgpu_device *adev)  {
>   adev->ras_hw_enabled = adev->ras_enabled = 0;
>
> - if (!adev->is_atom_fw ||
> - !amdgpu_ras_asic_supported(adev))
> + if (!amdgpu_ras_asic_supported(adev))
>   return;
>
> - if (!adev->gmc.xgmi.connected_to_cpu) {
> + if (!adev->gmc.xgmi.connected_to_cpu && !adev-

[Tao] the tab should be replaced with space.

> >gmc.is_app_apu) {
>   if (amdgpu_atomfirmware_mem_ecc_supported(adev)) {
>   dev_info(adev->dev, "MEM ECC is active.\n");
>   adev->ras_hw_enabled |= (1 <<
> AMDGPU_RAS_BLOCK__UMC | diff --git
> a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> index 1edf8e6aeb16..db0d94ca4ffc 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> @@ -169,27 +169,31 @@ int amdgpu_umc_poison_handler(struct
> amdgpu_device *adev, bool reset)  {
>   int ret = AMDGPU_RAS_SUCCESS;
>
> - if (!amdgpu_sriov_vf(adev)) {
> - if (!adev->gmc.xgmi.connected_to_cpu) {
> - struct ras_err_data err_data = {0, 0, 0, NULL};
> - struct ras_common_if head = {
> - .block = AMDGPU_RAS_BLOCK__UMC,
> - };
> - struct ras_manager *obj = amdgpu_ras_find_obj(adev,
> );
> -
> - ret = amdgpu_umc_do_page_retirement(adev,
> _data, NULL, reset);
> -
> - if (ret == AMDGPU_RAS_SUCCESS && obj) {
> - obj->err_data.ue_count += err_data.ue_count;
> - obj->err_data.ce_count += err_data.ce_count;
> - }
> - } else if (reset) {
> + if (adev->gmc.xgmi.connected_to_cpu ||
> + adev->gmc.is_app_apu) {
> + if (reset) {
>   /* MCA poison handler is only responsible for GPU reset,
>* let MCA notifier do page retirement.
>*/
>   kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
>   amdgpu_ras_reset_gpu(adev);
>   }
> + return ret;
> + }
> +
> + if (!amdgpu_sriov_vf(adev)) {
> + struct ras_err_data err_data = {0, 0, 0, NULL};
> + struct ras_common_if head = {
> + .block = AMDGPU_RAS_BLOCK__UMC,
> + };
> + struct ras_manager *obj = amdgpu_ras_find_obj(adev, );
> +
> + ret = amdgpu_umc_do_page_retirement(adev, _data, NULL,
> reset);
> +
> + if (ret == AMDGPU_RAS_SUCCESS && obj) {
> + obj->err_data.ue_count += err_data.ue_count;
> + obj->err_data.ce_count += err_data.ce_count;
> + }
>   }