date:20240605

[PATCH v3 0/2] Add support for 'power saving policy' property

2024-06-05 Thread Mario Limonciello

During the Display Next hackfest 2024 one of the topics discussed
was the need for compositor to be able to relay intention to drivers
that color fidelity is preferred over power savings.

To accomplish this a new optional DRM property is being introduced called
"power saving policy".  This property is a bit mask that can be configured
with requests of "Require color accuracy" or "Require low latency"
that can be configured by the compositor.

When a driver advertises support for this property and the compositor
sets it to "Require color accuracy" then the driver will disable any power
saving features that can compromise color fidelity.

In practice the main feature this currently applies to is the
"Adaptive Backlight Modulation" feature within AMD DCN on eDP panels.

When the compositor has marked the property  "Require color accuracy" then
this feature will be disabled and any userspace that tries to turn it on
will get an -EBUSY return code.

Compositors can also request that low latency is critical which in
practice should cause PSR and PSR2 to be disabled.

When the compositor has restored the value back to no requirements then
the previous value that would have been programmed will be restored.

v2->v3:
 * Updates from Leo's comments (see individual patches)

The matching changes for the igt are here:
https://lore.kernel.org/dri-devel/2024050849.33343-1-mario.limoncie...@amd.com/

Mario Limonciello (2):
  drm: Introduce 'power saving policy' drm property
  drm/amd: Add power_saving_policy drm property to eDP connectors

 drivers/gpu/drm/amd/amdgpu/amdgpu_display.c   |  4 ++
 .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 50 +--
 .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h |  2 +
 drivers/gpu/drm/drm_connector.c   | 46 +
 include/drm/drm_connector.h   |  2 +
 include/drm/drm_mode_config.h |  5 ++
 include/uapi/drm/drm_mode.h   |  7 +++
 7 files changed, 111 insertions(+), 5 deletions(-)

-- 
2.43.0

[PATCH v3 2/2] drm/amd: Add power_saving_policy drm property to eDP connectors

2024-06-05 Thread Mario Limonciello

When the `power_saving_policy` property is set to bit mask
"Require color accuracy" ABM should be disabled immediately and
any requests by sysfs to update will return an -EBUSY error.

When the `power_saving_policy` property is set to bit mask
"Require low latency" PSR should be disabled.

When the property is restored to an empty bit mask the previous
value of ABM and pSR will be restored.

Signed-off-by: Mario Limonciello 
---
v2->v3:
 * Use `disallow_edp_enter_psr` instead
 * Drop case in dm_update_crtc_state()
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_display.c   |  4 ++
 .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 50 +--
 .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h |  2 +
 3 files changed, 51 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c
index 3ecc7ef95172..cfb5220cf182 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c
@@ -1350,6 +1350,10 @@ int amdgpu_display_modeset_create_props(struct 
amdgpu_device *adev)
 "dither",
 amdgpu_dither_enum_list, sz);
 
+   if (adev->dc_enabled)
+   drm_mode_create_power_saving_policy_property(adev_to_drm(adev),
+
DRM_MODE_POWER_SAVING_POLICY_ALL);
+
return 0;
 }
 
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c 
b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index f1d67c6f4b98..5fd7210b2479 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -6436,6 +6436,13 @@ int amdgpu_dm_connector_atomic_set_property(struct 
drm_connector *connector,
} else if (property == adev->mode_info.underscan_property) {
dm_new_state->underscan_enable = val;
ret = 0;
+   } else if (property == dev->mode_config.power_saving_policy) {
+   dm_new_state->abm_forbidden = val & 
DRM_MODE_REQUIRE_COLOR_ACCURACY;
+   dm_new_state->abm_level = (dm_new_state->abm_forbidden || 
!amdgpu_dm_abm_level) ?
+   ABM_LEVEL_IMMEDIATE_DISABLE :
+   amdgpu_dm_abm_level;
+   dm_new_state->psr_forbidden = val & 
DRM_MODE_REQUIRE_LOW_LATENCY;
+   ret = 0;
}
 
return ret;
@@ -6478,6 +6485,13 @@ int amdgpu_dm_connector_atomic_get_property(struct 
drm_connector *connector,
} else if (property == adev->mode_info.underscan_property) {
*val = dm_state->underscan_enable;
ret = 0;
+   } else if (property == dev->mode_config.power_saving_policy) {
+   *val = 0;
+   if (dm_state->psr_forbidden)
+   *val |= DRM_MODE_REQUIRE_LOW_LATENCY;
+   if (dm_state->abm_forbidden)
+   *val |= DRM_MODE_REQUIRE_COLOR_ACCURACY;
+   ret = 0;
}
 
return ret;
@@ -6504,9 +6518,12 @@ static ssize_t panel_power_savings_show(struct device 
*device,
u8 val;
 
drm_modeset_lock(>mode_config.connection_mutex, NULL);
-   val = to_dm_connector_state(connector->state)->abm_level ==
-   ABM_LEVEL_IMMEDIATE_DISABLE ? 0 :
-   to_dm_connector_state(connector->state)->abm_level;
+   if (to_dm_connector_state(connector->state)->abm_forbidden)
+   val = 0;
+   else
+   val = to_dm_connector_state(connector->state)->abm_level ==
+   ABM_LEVEL_IMMEDIATE_DISABLE ? 0 :
+   to_dm_connector_state(connector->state)->abm_level;
drm_modeset_unlock(>mode_config.connection_mutex);
 
return sysfs_emit(buf, "%u\n", val);
@@ -6530,10 +6547,16 @@ static ssize_t panel_power_savings_store(struct device 
*device,
return -EINVAL;
 
drm_modeset_lock(>mode_config.connection_mutex, NULL);
-   to_dm_connector_state(connector->state)->abm_level = val ?:
-   ABM_LEVEL_IMMEDIATE_DISABLE;
+   if (to_dm_connector_state(connector->state)->abm_forbidden)
+   ret = -EBUSY;
+   else
+   to_dm_connector_state(connector->state)->abm_level = val ?:
+   ABM_LEVEL_IMMEDIATE_DISABLE;
drm_modeset_unlock(>mode_config.connection_mutex);
 
+   if (ret)
+   return ret;
+
drm_kms_helper_hotplug_event(dev);
 
return count;
@@ -7704,6 +7727,13 @@ void amdgpu_dm_connector_init_helper(struct 
amdgpu_display_manager *dm,
aconnector->base.state->max_bpc = 16;
aconnector->base.state->max_requested_bpc = 
aconnector->base.state->max_bpc;
 
+   if (connector_type == DRM_MODE_CONNECTOR_eDP &&
+   (dc_is_dmcu_initialized(adev->dm.dc) ||
+adev->dm.dc->ctx->dmub_srv)) {

[PATCH v3 1/2] drm: Introduce 'power saving policy' drm property

2024-06-05 Thread Mario Limonciello

The `power saving policy` DRM property is an optional property that
can be added to a connector by a driver.

This property is for compositors to indicate intent of policy of
whether a driver can use power saving features that may compromise
the experience intended by the compositor.

Acked-by: Leo Li 
Signed-off-by: Mario Limonciello 
---
v2->v3:
* Add tag
---
 drivers/gpu/drm/drm_connector.c | 46 +
 include/drm/drm_connector.h |  2 ++
 include/drm/drm_mode_config.h   |  5 
 include/uapi/drm/drm_mode.h |  7 +
 4 files changed, 60 insertions(+)

diff --git a/drivers/gpu/drm/drm_connector.c b/drivers/gpu/drm/drm_connector.c
index 4d2df7f64dc5..088a5874c7a4 100644
--- a/drivers/gpu/drm/drm_connector.c
+++ b/drivers/gpu/drm/drm_connector.c
@@ -961,6 +961,11 @@ static const struct drm_prop_enum_list 
drm_scaling_mode_enum_list[] = {
{ DRM_MODE_SCALE_ASPECT, "Full aspect" },
 };
 
+static const struct drm_prop_enum_list drm_power_saving_policy_enum_list[] = {
+   { __builtin_ffs(DRM_MODE_REQUIRE_COLOR_ACCURACY) - 1, "Require color 
accuracy" },
+   { __builtin_ffs(DRM_MODE_REQUIRE_LOW_LATENCY) - 1, "Require low 
latency" },
+};
+
 static const struct drm_prop_enum_list drm_aspect_ratio_enum_list[] = {
{ DRM_MODE_PICTURE_ASPECT_NONE, "Automatic" },
{ DRM_MODE_PICTURE_ASPECT_4_3, "4:3" },
@@ -1499,6 +1504,16 @@ static const u32 dp_colorspaces =
  *
  * Drivers can set up these properties by calling
  * drm_mode_create_tv_margin_properties().
+ * power saving policy:
+ * This property is used to set the power saving policy for the connector.
+ * This property is populated with a bitmask of optional requirements set
+ * by the drm master for the drm driver to respect:
+ * - "Require color accuracy": Disable power saving features that will
+ *   affect color fidelity.
+ *   For example: Hardware assisted backlight modulation.
+ * - "Require low latency": Disable power saving features that will
+ *   affect latency.
+ *   For example: Panel self refresh (PSR)
  */
 
 int drm_connector_create_standard_properties(struct drm_device *dev)
@@ -1963,6 +1978,37 @@ int drm_mode_create_scaling_mode_property(struct 
drm_device *dev)
 }
 EXPORT_SYMBOL(drm_mode_create_scaling_mode_property);
 
+/**
+ * drm_mode_create_power_saving_policy_property - create power saving policy 
property
+ * @dev: DRM device
+ * @supported_policies: bitmask of supported power saving policies
+ *
+ * Called by a driver the first time it's needed, must be attached to desired
+ * connectors.
+ *
+ * Returns: %0
+ */
+int drm_mode_create_power_saving_policy_property(struct drm_device *dev,
+uint64_t supported_policies)
+{
+   struct drm_property *power_saving;
+
+   if (dev->mode_config.power_saving_policy)
+   return 0;
+   WARN_ON((supported_policies & DRM_MODE_POWER_SAVING_POLICY_ALL) == 0);
+
+   power_saving =
+   drm_property_create_bitmask(dev, 0, "power saving policy",
+   drm_power_saving_policy_enum_list,
+   
ARRAY_SIZE(drm_power_saving_policy_enum_list),
+   supported_policies);
+
+   dev->mode_config.power_saving_policy = power_saving;
+
+   return 0;
+}
+EXPORT_SYMBOL(drm_mode_create_power_saving_policy_property);
+
 /**
  * DOC: Variable refresh properties
  *
diff --git a/include/drm/drm_connector.h b/include/drm/drm_connector.h
index fe88d7fc6b8f..b0ec2ec48de7 100644
--- a/include/drm/drm_connector.h
+++ b/include/drm/drm_connector.h
@@ -2025,6 +2025,8 @@ int drm_mode_create_dp_colorspace_property(struct 
drm_connector *connector,
   u32 supported_colorspaces);
 int drm_mode_create_content_type_property(struct drm_device *dev);
 int drm_mode_create_suggested_offset_properties(struct drm_device *dev);
+int drm_mode_create_power_saving_policy_property(struct drm_device *dev,
+uint64_t supported_policies);
 
 int drm_connector_set_path_property(struct drm_connector *connector,
const char *path);
diff --git a/include/drm/drm_mode_config.h b/include/drm/drm_mode_config.h
index 8de3c9a5f61b..eefcbf6c5377 100644
--- a/include/drm/drm_mode_config.h
+++ b/include/drm/drm_mode_config.h
@@ -969,6 +969,11 @@ struct drm_mode_config {
 */
struct drm_atomic_state *suspend_state;
 
+   /**
+* @power_saving_policy: bitmask for power saving policy requests.
+*/
+   struct drm_property *power_saving_policy;
+
const struct drm_mode_config_helper_funcs *helper_private;
 };
 
diff --git a/include/uapi/drm/drm_mode.h b/include/uapi/drm/drm_mode.h
index 1ca5c7e418fd..558a306a23a7 100644
--- a/include/uapi/drm/drm_mode.h
+++ b/include/uapi/drm/drm_mode.h
@@

[PATCH] drm/amdkfd: Extend gfx12 trap handler fix to gfx10/11

2024-06-05 Thread Jay Cornwall

In commit 6d1878882d2d
("drm/amdkfd: gfx12 context save/restore trap handler fixes") the
following fix was introduced but incorrectly restricted to gfx12.
The same issue and a corresponding fix apply to gfx10 and gfx11.

Do not overwrite TRAPSTS.{SAVECTX,HOST_TRAP} when restoring this
register. Both of these fields can assert while the wavefront is
running the trap handler.

Signed-off-by: Jay Cornwall 
Cc: Lancelot Six 
---
 .../gpu/drm/amd/amdkfd/cwsr_trap_handler.h| 16 +---
 .../amd/amdkfd/cwsr_trap_handler_gfx10.asm| 38 ++-
 2 files changed, 38 insertions(+), 16 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h 
b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
index 665122d1bbbd..02f7ba8c93cd 100644
--- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
+++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
@@ -1136,7 +1136,7 @@ static const uint32_t cwsr_trap_nv1x_hex[] = {
0x705d, 0x807c817c,
0x8070ff70, 0x0080,
0xbf0a7b7c, 0xbf85fff8,
-   0xbf82013d, 0xbef4037e,
+   0xbf82013f, 0xbef4037e,
0x8775ff7f, 0x,
0x8875ff75, 0x0004,
0xbef60380, 0xbef703ff,
@@ -1275,7 +1275,8 @@ static const uint32_t cwsr_trap_nv1x_hex[] = {
0x80788478, 0xbf8c,
0xb9eef815, 0xbefc036f,
0xbefe0370, 0xbeff0371,
-   0xb9f9f816, 0xb9fbf803,
+   0xb9f9f816, 0xb9fb4803,
+   0x907b8b7b, 0xb9fba2c3,
0xb9f3f801, 0xb96e3a05,
0x806e816e, 0xbf0d9972,
0xbf850002, 0x8f6e896e,
@@ -2544,7 +2545,7 @@ static const uint32_t cwsr_trap_gfx10_hex[] = {
0xe0704000, 0x705d,
0x807c817c, 0x8070ff70,
0x0080, 0xbf0a7b7c,
-   0xbf85fff8, 0xbf820134,
+   0xbf85fff8, 0xbf820136,
0xbef4037e, 0x8775ff7f,
0x, 0x8875ff75,
0x0004, 0xbef60380,
@@ -2683,7 +2684,8 @@ static const uint32_t cwsr_trap_gfx10_hex[] = {
0xf000, 0x80788478,
0xbf8c, 0xb9eef815,
0xbefc036f, 0xbefe0370,
-   0xbeff0371, 0xb9fbf803,
+   0xbeff0371, 0xb9fb4803,
+   0x907b8b7b, 0xb9fba2c3,
0xb9f3f801, 0xb96e3a05,
0x806e816e, 0xbf0d9972,
0xbf850002, 0x8f6e896e,
@@ -2981,7 +2983,7 @@ static const uint32_t cwsr_trap_gfx11_hex[] = {
0x701d, 0x807d817d,
0x8070ff70, 0x0080,
0xbf0a7b7d, 0xbfa2fff8,
-   0xbfa0013f, 0xbef4007e,
+   0xbfa00143, 0xbef4007e,
0x8b75ff7f, 0x,
0x8c75ff75, 0x0004,
0xbef60080, 0xbef700ff,
@@ -3123,7 +3125,9 @@ static const uint32_t cwsr_trap_gfx11_hex[] = {
0x80788478, 0xbf89,
0xb96ef815, 0xbefd006f,
0xbefe0070, 0xbeff0071,
-   0xb97bf803, 0xb973f801,
+   0xb97b4803, 0x857b8b7b,
+   0xb97b22c3, 0x857b867b,
+   0xb97b7443, 0xb973f801,
0xb8ee3b05, 0x806e816e,
0xbf0d9972, 0xbfa20002,
0x846e896e, 0xbfa1,
diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm 
b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm
index ac3702b8e3c4..44772eec9ef4 100644
--- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm
+++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm
@@ -119,9 +119,12 @@ var SQ_WAVE_TRAPSTS_ADDR_WATCH_SHIFT   = 7
 var SQ_WAVE_TRAPSTS_MEM_VIOL_MASK  = 0x100
 var SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT = 8
 var SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK  = 0x800
+var SQ_WAVE_TRAPSTS_ILLEGAL_INST_SHIFT = 11
 var SQ_WAVE_TRAPSTS_EXCP_HI_MASK   = 0x7000
 #if ASIC_FAMILY >= CHIP_PLUM_BONITO
+var SQ_WAVE_TRAPSTS_HOST_TRAP_SHIFT= 16
 var SQ_WAVE_TRAPSTS_WAVE_START_MASK= 0x2
+var SQ_WAVE_TRAPSTS_WAVE_START_SHIFT   = 17
 var SQ_WAVE_TRAPSTS_WAVE_END_MASK  = 0x4
 var SQ_WAVE_TRAPSTS_TRAP_AFTER_INST_MASK   = 0x10
 #endif
@@ -137,14 +140,23 @@ var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK = 0x003F8000
 
 var SQ_WAVE_MODE_DEBUG_EN_MASK = 0x800
 
+var S_TRAPSTS_RESTORE_PART_1_SIZE  = SQ_WAVE_TRAPSTS_SAVECTX_SHIFT
+var S_TRAPSTS_RESTORE_PART_2_SHIFT = 
SQ_WAVE_TRAPSTS_ILLEGAL_INST_SHIFT
+
 #if ASIC_FAMILY < CHIP_PLUM_BONITO
 var S_TRAPSTS_NON_MASKABLE_EXCP_MASK   = 
SQ_WAVE_TRAPSTS_MEM_VIOL_MASK|SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK
+var S_TRAPSTS_RESTORE_PART_2_SIZE  = 32 - 
S_TRAPSTS_RESTORE_PART_2_SHIFT
+var S_TRAPSTS_RESTORE_PART_3_SHIFT = 0
+var S_TRAPSTS_RESTORE_PART_3_SIZE  = 0
 #else
 var S_TRAPSTS_NON_MASKABLE_EXCP_MASK   = SQ_WAVE_TRAPSTS_MEM_VIOL_MASK 
|\
  
SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK |\
  
SQ_WAVE_TRAPSTS_WAVE_START_MASK   |\
  SQ_WAVE_TRAPSTS_WAVE_END_MASK 
|\

[PATCH] drm/amdgpu: Indicate CU havest info to CP

2024-06-05 Thread Harish Kasiviswanathan

To achieve full occupancy CP hardware needs to know if CUs in SE are
symmetrically or asymmetrically harvested

Signed-off-by: Harish Kasiviswanathan 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 14 --
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
index aecc2bcea145..651995292aaa 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
@@ -4203,9 +4203,10 @@ static u32 gfx_v9_4_3_get_cu_active_bitmap(struct 
amdgpu_device *adev, int xcc_i
 static int gfx_v9_4_3_get_cu_info(struct amdgpu_device *adev,
 struct amdgpu_cu_info *cu_info)
 {
-   int i, j, k, counter, xcc_id, active_cu_number = 0;
-   u32 mask, bitmap, ao_bitmap, ao_cu_mask = 0;
+   int i, j, k, prev_counter, counter, xcc_id, active_cu_number = 0;
+   u32 mask, bitmap, ao_bitmap, ao_cu_mask = 0, tmp;
unsigned disable_masks[4 * 4];
+   bool is_symmetric_cus = true;
 
if (!adev || !cu_info)
return -EINVAL;
@@ -4250,6 +4251,15 @@ static int gfx_v9_4_3_get_cu_info(struct amdgpu_device 
*adev,
ao_cu_mask |= (ao_bitmap << (i * 16 + j 
* 8));
cu_info->ao_cu_bitmap[i][j] = ao_bitmap;
}
+   if (i && is_symmetric_cus && prev_counter != counter)
+   is_symmetric_cus = false;
+   prev_counter = counter;
+   }
+   if (is_symmetric_cus) {
+   tmp = RREG32_SOC15(GC, GET_INST(GC, xcc_id), 
regCP_CPC_DEBUG);
+   tmp = REG_SET_FIELD(tmp, CP_CPC_DEBUG, 
CPC_HARVESTING_RELAUNCH_DISABLE, 1);
+   tmp = REG_SET_FIELD(tmp, CP_CPC_DEBUG, 
CPC_HARVESTING_DISPATCH_DISABLE, 1);
+   WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_CPC_DEBUG, 
tmp);
}
gfx_v9_4_3_xcc_select_se_sh(adev, 0x, 0x, 
0x,
xcc_id);
-- 
2.34.1

RE: [PATCH V2 2/2] drm/amd/display/dcn401: use pre-allocated temp structure for bounding box

2024-06-05 Thread Zhang, George

[AMD Official Use Only - AMD Internal Distribution Only]

Tested-by: George Zhang 

Thanks,
George

-Original Message-
From: Deucher, Alexander 
Sent: Tuesday, June 4, 2024 4:26 PM
To: amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander ; Mahfooz, Hamza 
; Zhang, George ; Arnd Bergmann 
; Wentland, Harry ; Li, Sun peng (Leo) 
; Siqueira, Rodrigo 
Subject: [PATCH V2 2/2] drm/amd/display/dcn401: use pre-allocated temp 
structure for bounding box

This mirrors what the driver does for older DCN generations.

Should fix:
[   26.924055] BUG: sleeping function called from invalid context at 
include/linux/sched/mm.h:306
[   26.924060] in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 1022, 
name: modprobe
[   26.924063] preempt_count: 2, expected: 0
[   26.924064] RCU nest depth: 0, expected: 0
[   26.924066] Preemption disabled at:
[   26.924067] [] dc_fpu_begin+0x30/0xd0 [amdgpu]
[   26.924322] CPU: 9 PID: 1022 Comm: modprobe Not tainted 6.8.0+ #20
[   26.924325] Hardware name: System manufacturer System Product Name/CROSSHAIR 
VI HERO, BIOS 6302 10/23/2018
[   26.924326] Call Trace:
[   26.924327]  
[   26.924329]  dump_stack_lvl+0x37/0x50
[   26.924333]  ? dc_fpu_begin+0x30/0xd0 [amdgpu]
[   26.924589]  dump_stack+0x10/0x20
[   26.924592]  __might_resched+0x16a/0x1c0
[   26.924596]  __might_sleep+0x42/0x70
[   26.924598]  __kmalloc_node_track_caller+0x2ad/0x4b0
[   26.924601]  ? dm_helpers_allocate_gpu_mem+0x12/0x20 [amdgpu]
[   26.924855]  ? dcn401_update_bw_bounding_box+0x2a/0xf0 [amdgpu]
[   26.925122]  kmemdup+0x20/0x50
[   26.925124]  ? kernel_fpu_begin_mask+0x6b/0xe0
[   26.925127]  ? kmemdup+0x20/0x50
[   26.925129]  dcn401_update_bw_bounding_box+0x2a/0xf0 [amdgpu]
[   26.925393]  dc_create+0x311/0x670 [amdgpu]
[   26.925649]  amdgpu_dm_init+0x2aa/0x1fa0 [amdgpu]
[   26.925903]  ? irq_work_queue+0x38/0x50
[   26.925907]  ? vprintk_emit+0x1e7/0x270
[   26.925910]  ? dev_printk_emit+0x83/0xb0
[   26.925914]  ? amdgpu_device_rreg+0x17/0x20 [amdgpu]
[   26.926133]  dm_hw_init+0x14/0x30 [amdgpu]

v2: drop extra memcpy

Fixes: 669d6b078ed8 ("drm/amd/display: avoid large on-stack structures")
Suggested-by: Hamza Mahfooz 
Signed-off-by: Alex Deucher 
Cc: George Zhang 
Cc: Arnd Bergmann 
Cc: harry.wentl...@amd.com
Cc: sunpeng...@amd.com
Cc: rodrigo.sique...@amd.com
---
 drivers/gpu/drm/amd/display/dc/core/dc_state.c  | 13 +
 .../display/dc/resource/dcn401/dcn401_resource.c|  8 ++--
 2 files changed, 7 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_state.c 
b/drivers/gpu/drm/amd/display/dc/core/dc_state.c
index 8ea9391c60b7..06b22897137e 100644
--- a/drivers/gpu/drm/amd/display/dc/core/dc_state.c
+++ b/drivers/gpu/drm/amd/display/dc/core/dc_state.c
@@ -192,15 +192,14 @@ static void init_state(struct dc *dc, struct dc_state 
*state)
 /* Public dc_state functions */
 struct dc_state *dc_state_create(struct dc *dc, struct dc_state_create_params 
*params)  {
+   struct dc_state *state;
 #ifdef CONFIG_DRM_AMD_DC_FP
-   struct dml2_configuration_options *dml2_opt;
+   struct dml2_configuration_options *dml2_opt = >dml2_tmp;

-   dml2_opt = kmemdup(>dml2_options, sizeof(*dml2_opt), GFP_KERNEL);
-   if (!dml2_opt)
-   return NULL;
+   memcpy(dml2_opt, >dml2_options, sizeof(dc->dml2_options));
 #endif
-   struct dc_state *state = kvzalloc(sizeof(struct dc_state),
-   GFP_KERNEL);
+
+   state = kvzalloc(sizeof(struct dc_state), GFP_KERNEL);

if (!state)
return NULL;
@@ -217,8 +216,6 @@ struct dc_state *dc_state_create(struct dc *dc, struct 
dc_state_create_params *p
dml2_opt->use_clock_dc_limits = true;
dml2_create(dc, dml2_opt, >bw_ctx.dml2_dc_power_source);
}
-
-   kfree(dml2_opt);
 #endif

kref_init(>refcount);
diff --git a/drivers/gpu/drm/amd/display/dc/resource/dcn401/dcn401_resource.c 
b/drivers/gpu/drm/amd/display/dc/resource/dcn401/dcn401_resource.c
index 8dfb0a3d21cb..029ad7bd7b5b 100644
--- a/drivers/gpu/drm/amd/display/dc/resource/dcn401/dcn401_resource.c
+++ b/drivers/gpu/drm/amd/display/dc/resource/dcn401/dcn401_resource.c
@@ -1581,11 +1581,9 @@ static struct dc_cap_funcs cap_funcs = {

 static void dcn401_update_bw_bounding_box(struct dc *dc, struct clk_bw_params 
*bw_params)  {
-   struct dml2_configuration_options *dml2_opt;
+   struct dml2_configuration_options *dml2_opt = >dml2_tmp;

-   dml2_opt = kmemdup(>dml2_options, sizeof(*dml2_opt), GFP_KERNEL);
-   if (!dml2_opt)
-   return;
+   memcpy(dml2_opt, >dml2_options, sizeof(dc->dml2_options));

DC_FP_START();

@@ -1600,8 +1598,6 @@ static void dcn401_update_bw_bounding_box(struct dc *dc, 
struct clk_bw_params *b
dml2_reinit(dc, dml2_opt, 
>current_state->bw_ctx.dml2_dc_power_source);

DC_FP_END();
-
-   kfree(dml2_opt);
 }

 enum dc_status

Re: [PATCH V2 2/2] drm/amd/display/dcn401: use pre-allocated temp structure for bounding box

2024-06-05 Thread Arnd Bergmann

On Tue, Jun 4, 2024, at 22:26, Alex Deucher wrote:
> Fixes: 669d6b078ed8 ("drm/amd/display: avoid large on-stack structures")
> Suggested-by: Hamza Mahfooz 
> Signed-off-by: Alex Deucher 
> Cc: George Zhang 
> Cc: Arnd Bergmann 

Acked-by: Arnd Bergmann

Re: [PATCH v2] drm/client: Detect when ACPI lid is closed during initialization

2024-06-05 Thread Chris Bainbridge

On Tue, Jun 04, 2024 at 10:02:29AM +0800, kernel test robot wrote:
> Hi Mario,
> 
> kernel test robot noticed the following build errors:
> 
> [auto build test ERROR on drm-misc/drm-misc-next]
> [also build test ERROR on drm/drm-next drm-exynos/exynos-drm-next 
> drm-intel/for-linux-next drm-intel/for-linux-next-fixes drm-tip/drm-tip 
> linus/master v6.10-rc2 next-20240603]
> [If your patch is applied to the wrong git tree, kindly drop us a note.
> And when submitting patch, we suggest to use '--base' as documented in
> https://git-scm.com/docs/git-format-patch#_base_tree_information]
> 
> url:
> https://github.com/intel-lab-lkp/linux/commits/Mario-Limonciello/drm-client-Detect-when-ACPI-lid-is-closed-during-initialization/20240529-050440
> base:   git://anongit.freedesktop.org/drm/drm-misc drm-misc-next
> patch link:
> https://lore.kernel.org/r/20240528210319.1242-1-mario.limonciello%40amd.com
> patch subject: [PATCH v2] drm/client: Detect when ACPI lid is closed during 
> initialization
> config: i386-randconfig-053-20240604 
> (https://download.01.org/0day-ci/archive/20240604/202406040928.eu1griwv-...@intel.com/config)
> compiler: gcc-9 (Ubuntu 9.5.0-4ubuntu2) 9.5.0
> reproduce (this is a W=1 build): 
> (https://download.01.org/0day-ci/archive/20240604/202406040928.eu1griwv-...@intel.com/reproduce)
> 
> If you fix the issue in a separate patch/commit (i.e. not just a new version 
> of
> the same patch/commit), kindly add following tags
> | Reported-by: kernel test robot 
> | Closes: 
> https://lore.kernel.org/oe-kbuild-all/202406040928.eu1griwv-...@intel.com/
> 
> All errors (new ones prefixed by >>):
> 
>ld: drivers/gpu/drm/drm_client_modeset.o: in function 
> `drm_client_match_edp_lid':
> >> drivers/gpu/drm/drm_client_modeset.c:281:(.text+0x221b): undefined 
> >> reference to `acpi_lid_open'
> 
> 
> vim +281 drivers/gpu/drm/drm_client_modeset.c
> 
>260
>261static void drm_client_match_edp_lid(struct drm_device *dev,
>262 struct drm_connector 
> **connectors,
>263 unsigned int 
> connector_count,
>264 bool *enabled)
>265{
>266int i;
>267
>268for (i = 0; i < connector_count; i++) {
>269struct drm_connector *connector = connectors[i];
>270
>271switch (connector->connector_type) {
>272case DRM_MODE_CONNECTOR_LVDS:
>273case DRM_MODE_CONNECTOR_eDP:
>274if (!enabled[i])
>275continue;
>276break;
>277default:
>278continue;
>279}
>280
>  > 281if (!acpi_lid_open()) {
>282drm_dbg_kms(dev, "[CONNECTOR:%d:%s] lid 
> is closed, disabling\n",
>283connector->base.id, 
> connector->name);
>284enabled[i] = false;
>285}
>286}
>287}
>288
> 
> -- 
> 0-DAY CI Kernel Test Service
> https://github.com/intel/lkp-tests/wiki

The failed config has CONFIG_ACPI_BUTTON=m. The build failure can be
fixed with:

diff --git a/drivers/gpu/drm/drm_client_modeset.c 
b/drivers/gpu/drm/drm_client_modeset.c
index b76438c31761..0271e66f44f8 100644
--- a/drivers/gpu/drm/drm_client_modeset.c
+++ b/drivers/gpu/drm/drm_client_modeset.c
@@ -271,11 +271,13 @@ static void drm_client_match_edp_lid(struct drm_device 
*dev,
if (connector->connector_type != DRM_MODE_CONNECTOR_eDP || 
!enabled[i])
continue;

+#if defined(CONFIG_ACPI_BUTTON)
if (!acpi_lid_open()) {
drm_dbg_kms(dev, "[CONNECTOR:%d:%s] lid is closed, 
disabling\n",
connector->base.id, connector->name);
enabled[i] = false;
}
+#endif
}
 }

[linux-next:master] BUILD REGRESSION 234cb065ad82915ff8d06ce01e01c3e640b674d2

2024-06-05 Thread kernel test robot

tree/branch: 
https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git master
branch HEAD: 234cb065ad82915ff8d06ce01e01c3e640b674d2  Add linux-next specific 
files for 20240605

Error/Warning reports:

https://lore.kernel.org/oe-kbuild-all/202406051521.mroqvr5l-...@intel.com
https://lore.kernel.org/oe-kbuild-all/202406051524.a12jqlqx-...@intel.com
https://lore.kernel.org/oe-kbuild-all/202406051711.ds1sqz9n-...@intel.com
https://lore.kernel.org/oe-kbuild-all/202406051855.9viyxbtb-...@intel.com

Error/Warning: (recently discovered and may have been fixed)

include/linux/container_of.h:20:54: error: 'struct ftrace_ops' has no member 
named 'list'
include/linux/list.h:769:26: error: 'struct ftrace_ops' has no member named 
'list'
include/linux/stddef.h:16:33: error: 'struct ftrace_ops' has no member named 
'list'
kernel/trace/fgraph.c:883:43: error: 'struct ftrace_ops' has no member named 
'subop_list'
kernel/trace/fgraph.c:934:15: error: implicit declaration of function 
'ftrace_startup_subops'; did you mean 'ftrace_startup'? 
[-Werror=implicit-function-declaration]
kernel/trace/fgraph.c:973:9: error: implicit declaration of function 
'ftrace_shutdown_subops'; did you mean 'ftrace_shutdown'? 
[-Werror=implicit-function-declaration]

Error/Warning ids grouped by kconfigs:

gcc_recent_errors
|-- arc-randconfig-r061-20240605
|   |-- 
drivers-gpu-drm-arm-display-komeda-komeda_dev.c:error:implicit-declaration-of-function-seq_puts
|   |-- 
drivers-gpu-drm-arm-display-komeda-komeda_dev.c:error:invalid-use-of-undefined-type-struct-seq_file
|   `-- 
drivers-gpu-drm-arm-display-komeda-komeda_dev.c:error:type-defaults-to-int-in-declaration-of-DEFINE_SHOW_ATTRIBUTE
|-- csky-randconfig-r053-20240605
|   |-- 
include-linux-container_of.h:error:struct-ftrace_ops-has-no-member-named-list
|   |-- include-linux-list.h:error:struct-ftrace_ops-has-no-member-named-list
|   |-- include-linux-stddef.h:error:struct-ftrace_ops-has-no-member-named-list
|   |-- 
kernel-trace-fgraph.c:error:implicit-declaration-of-function-ftrace_shutdown_subops
|   |-- 
kernel-trace-fgraph.c:error:implicit-declaration-of-function-ftrace_startup_subops
|   `-- 
kernel-trace-fgraph.c:error:struct-ftrace_ops-has-no-member-named-subop_list
|-- i386-randconfig-061-20240605
|   `-- drivers-hwmon-cros_ec_hwmon.c:sparse:sparse:cast-to-restricted-__le16
|-- i386-randconfig-063-20240605
|   |-- 
kernel-trace-ftrace.c:sparse:sparse:incorrect-type-in-argument-(different-address-spaces)-expected-struct-ftrace_hash-B-got-struct-ftrace_hash-noderef-__rcu-filter_hash
|   |-- 
kernel-trace-ftrace.c:sparse:sparse:incorrect-type-in-argument-(different-address-spaces)-expected-struct-ftrace_hash-B-got-struct-ftrace_hash-noderef-__rcu-notrace_hash
|   |-- 
kernel-trace-ftrace.c:sparse:sparse:incorrect-type-in-argument-(different-address-spaces)-expected-struct-ftrace_hash-new_hash-got-struct-ftrace_hash-noderef-__rcu-filter_hash
|   |-- 
kernel-trace-ftrace.c:sparse:sparse:incorrect-type-in-argument-(different-address-spaces)-expected-struct-ftrace_hash-new_hash1-got-struct-ftrace_hash-noderef-__rcu-filter_hash
|   |-- 
kernel-trace-ftrace.c:sparse:sparse:incorrect-type-in-argument-(different-address-spaces)-expected-struct-ftrace_hash-new_hash2-got-struct-ftrace_hash-noderef-__rcu-filter_hash
|   |-- 
kernel-trace-ftrace.c:sparse:sparse:incorrect-type-in-argument-(different-address-spaces)-expected-struct-ftrace_hash-new_hash2-got-struct-ftrace_hash-noderef-__rcu-notrace_hash
|   |-- 
kernel-trace-ftrace.c:sparse:sparse:incorrect-type-in-argument-(different-address-spaces)-expected-struct-ftrace_hash-orig_hash-got-struct-ftrace_hash-noderef-__rcu
|   |-- 
kernel-trace-ftrace.c:sparse:sparse:incorrect-type-in-argument-(different-address-spaces)-expected-struct-ftrace_hash-src-got-struct-ftrace_hash-noderef-__rcu-filter_hash
|   |-- 
kernel-trace-ftrace.c:sparse:sparse:incorrect-type-in-argument-(different-address-spaces)-expected-struct-ftrace_hash-src-got-struct-ftrace_hash-noderef-__rcu-notrace_hash
|   |-- 
kernel-trace-ftrace.c:sparse:sparse:incorrect-type-in-assignment-(different-address-spaces)-expected-struct-ftrace_hash-noderef-__rcu-filter_hash-got-struct-ftrace_hash
|   |-- 
kernel-trace-ftrace.c:sparse:sparse:incorrect-type-in-assignment-(different-address-spaces)-expected-struct-ftrace_hash-noderef-__rcu-filter_hash-got-struct-ftrace_hash-assigned-filter_hash
|   |-- 
kernel-trace-ftrace.c:sparse:sparse:incorrect-type-in-assignment-(different-address-spaces)-expected-struct-ftrace_hash-noderef-__rcu-filter_hash-got-struct-ftrace_hash-save_filter_hash
|   |-- 
kernel-trace-ftrace.c:sparse:sparse:incorrect-type-in-assignment-(different-address-spaces)-expected-struct-ftrace_hash-noderef-__rcu-notrace_hash-got-struct-ftrace_hash
|   |-- 
kernel-trace-ftrace.c:sparse:sparse:incorrect-type-in-assignment-(different-address-spaces)-expected-struct-ftrace_hash-noderef-__rcu-notrace_hash-got-struct-ftrace_hash-assigned-notrace_hash
|   |-- 
kernel-trace

Re: [PATCH V2 2/2] drm/amd/display/dcn401: use pre-allocated temp structure for bounding box

2024-06-05 Thread Harry Wentland




On 2024-06-04 16:26, Alex Deucher wrote:
> This mirrors what the driver does for older DCN generations.
> 
> Should fix:
> [   26.924055] BUG: sleeping function called from invalid context at 
> include/linux/sched/mm.h:306
> [   26.924060] in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 1022, 
> name: modprobe
> [   26.924063] preempt_count: 2, expected: 0
> [   26.924064] RCU nest depth: 0, expected: 0
> [   26.924066] Preemption disabled at:
> [   26.924067] [] dc_fpu_begin+0x30/0xd0 [amdgpu]
> [   26.924322] CPU: 9 PID: 1022 Comm: modprobe Not tainted 6.8.0+ #20
> [   26.924325] Hardware name: System manufacturer System Product 
> Name/CROSSHAIR VI HERO, BIOS 6302 10/23/2018
> [   26.924326] Call Trace:
> [   26.924327]  
> [   26.924329]  dump_stack_lvl+0x37/0x50
> [   26.924333]  ? dc_fpu_begin+0x30/0xd0 [amdgpu]
> [   26.924589]  dump_stack+0x10/0x20
> [   26.924592]  __might_resched+0x16a/0x1c0
> [   26.924596]  __might_sleep+0x42/0x70
> [   26.924598]  __kmalloc_node_track_caller+0x2ad/0x4b0
> [   26.924601]  ? dm_helpers_allocate_gpu_mem+0x12/0x20 [amdgpu]
> [   26.924855]  ? dcn401_update_bw_bounding_box+0x2a/0xf0 [amdgpu]
> [   26.925122]  kmemdup+0x20/0x50
> [   26.925124]  ? kernel_fpu_begin_mask+0x6b/0xe0
> [   26.925127]  ? kmemdup+0x20/0x50
> [   26.925129]  dcn401_update_bw_bounding_box+0x2a/0xf0 [amdgpu]
> [   26.925393]  dc_create+0x311/0x670 [amdgpu]
> [   26.925649]  amdgpu_dm_init+0x2aa/0x1fa0 [amdgpu]
> [   26.925903]  ? irq_work_queue+0x38/0x50
> [   26.925907]  ? vprintk_emit+0x1e7/0x270
> [   26.925910]  ? dev_printk_emit+0x83/0xb0
> [   26.925914]  ? amdgpu_device_rreg+0x17/0x20 [amdgpu]
> [   26.926133]  dm_hw_init+0x14/0x30 [amdgpu]
> 
> v2: drop extra memcpy
> 
> Fixes: 669d6b078ed8 ("drm/amd/display: avoid large on-stack structures")
> Suggested-by: Hamza Mahfooz 
> Signed-off-by: Alex Deucher 
> Cc: George Zhang 
> Cc: Arnd Bergmann 
> Cc: harry.wentl...@amd.com
> Cc: sunpeng...@amd.com
> Cc: rodrigo.sique...@amd.com

Series is
Reviewed-by: Harry Wentland 

Harry

> ---
>  drivers/gpu/drm/amd/display/dc/core/dc_state.c  | 13 +
>  .../display/dc/resource/dcn401/dcn401_resource.c|  8 ++--
>  2 files changed, 7 insertions(+), 14 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_state.c 
> b/drivers/gpu/drm/amd/display/dc/core/dc_state.c
> index 8ea9391c60b7..06b22897137e 100644
> --- a/drivers/gpu/drm/amd/display/dc/core/dc_state.c
> +++ b/drivers/gpu/drm/amd/display/dc/core/dc_state.c
> @@ -192,15 +192,14 @@ static void init_state(struct dc *dc, struct dc_state 
> *state)
>  /* Public dc_state functions */
>  struct dc_state *dc_state_create(struct dc *dc, struct 
> dc_state_create_params *params)
>  {
> + struct dc_state *state;
>  #ifdef CONFIG_DRM_AMD_DC_FP
> - struct dml2_configuration_options *dml2_opt;
> + struct dml2_configuration_options *dml2_opt = >dml2_tmp;
>  
> - dml2_opt = kmemdup(>dml2_options, sizeof(*dml2_opt), GFP_KERNEL);
> - if (!dml2_opt)
> - return NULL;
> + memcpy(dml2_opt, >dml2_options, sizeof(dc->dml2_options));
>  #endif
> - struct dc_state *state = kvzalloc(sizeof(struct dc_state),
> - GFP_KERNEL);
> +
> + state = kvzalloc(sizeof(struct dc_state), GFP_KERNEL);
>  
>   if (!state)
>   return NULL;
> @@ -217,8 +216,6 @@ struct dc_state *dc_state_create(struct dc *dc, struct 
> dc_state_create_params *p
>   dml2_opt->use_clock_dc_limits = true;
>   dml2_create(dc, dml2_opt, >bw_ctx.dml2_dc_power_source);
>   }
> -
> - kfree(dml2_opt);
>  #endif
>  
>   kref_init(>refcount);
> diff --git a/drivers/gpu/drm/amd/display/dc/resource/dcn401/dcn401_resource.c 
> b/drivers/gpu/drm/amd/display/dc/resource/dcn401/dcn401_resource.c
> index 8dfb0a3d21cb..029ad7bd7b5b 100644
> --- a/drivers/gpu/drm/amd/display/dc/resource/dcn401/dcn401_resource.c
> +++ b/drivers/gpu/drm/amd/display/dc/resource/dcn401/dcn401_resource.c
> @@ -1581,11 +1581,9 @@ static struct dc_cap_funcs cap_funcs = {
>  
>  static void dcn401_update_bw_bounding_box(struct dc *dc, struct 
> clk_bw_params *bw_params)
>  {
> - struct dml2_configuration_options *dml2_opt;
> + struct dml2_configuration_options *dml2_opt = >dml2_tmp;
>  
> - dml2_opt = kmemdup(>dml2_options, sizeof(*dml2_opt), GFP_KERNEL);
> - if (!dml2_opt)
> - return;
> + memcpy(dml2_opt, >dml2_options, sizeof(dc->dml2_options));
>  
>   DC_FP_START();
>  
> @@ -1600,8 +1598,6 @@ static void dcn401_update_bw_bounding_box(struct dc 
> *dc, struct clk_bw_params *b
>   dml2_reinit(dc, dml2_opt, 
> >current_state->bw_ctx.dml2_dc_power_source);
>  
>   DC_FP_END();
> -
> - kfree(dml2_opt);
>  }
>  
>  enum dc_status dcn401_patch_unknown_plane_state(struct dc_plane_state 
> *plane_state)

Re: [PATCH] drm/amd/display: use pre-allocated temp structure for bounding box

2024-06-05 Thread Alex Deucher

On Wed, Jun 5, 2024 at 12:07 PM Harry Wentland  wrote:
>
>
>
> On 2024-06-04 11:50, Alex Deucher wrote:
> > This mirrors what the driver does for older DCN generations.
> >
> > Should fix:
> >
> > BUG: sleeping function called from invalid context at 
> > include/linux/sched/mm.h:306
> > in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 449, name: 
> > kworker/u64:8
> > preempt_count: 2, expected: 0
> > RCU nest depth: 0, expected: 0
> > Preemption disabled at:
> > c0ce1580>] dc_fpu_begin+0x30/0xd0 [amdgpu]
> > CPU: 5 PID: 449 Comm: kworker/u64:8 Tainted: GW  6.8.0+ #35
> > Hardware name: System manufacturer System Product Name/ROG STRIX X570-E 
> > GAMING WIFI II, BIOS 4204 02/24/2022
> > Workqueue: events_unbound async_run_entry_fn
> >
> > Fixes: 88c61827cedc ("drm/amd/display: dynamically allocate 
> > dml2_configuration_options structures")
> > Suggested-by: Hamza Mahfooz 
> > Signed-off-by: Alex Deucher 
> > Cc: George Zhang 
> > Cc: Arnd Bergmann 
> > Cc: harry.wentl...@amd.com
> > Cc: sunpeng...@amd.com
> > Cc: rodrigo.sique...@amd.com
>
> Reviewed-by: Harry Wentland 
>

Please review v2 of the patches.  There was an extra memcpy in the originals.

Alex

> Harry
>
> > ---
> >  drivers/gpu/drm/amd/display/dc/dc.h   | 1 +
> >  .../drm/amd/display/dc/resource/dcn32/dcn32_resource.c| 8 +++-
> >  .../drm/amd/display/dc/resource/dcn321/dcn321_resource.c  | 8 +++-
> >  3 files changed, 7 insertions(+), 10 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/display/dc/dc.h 
> > b/drivers/gpu/drm/amd/display/dc/dc.h
> > index d0ed01ac460d..d843933ad731 100644
> > --- a/drivers/gpu/drm/amd/display/dc/dc.h
> > +++ b/drivers/gpu/drm/amd/display/dc/dc.h
> > @@ -1444,6 +1444,7 @@ struct dc {
> >   } scratch;
> >
> >   struct dml2_configuration_options dml2_options;
> > + struct dml2_configuration_options dml2_tmp;
> >   enum dc_acpi_cm_power_state power_state;
> >
> >  };
> > diff --git a/drivers/gpu/drm/amd/display/dc/resource/dcn32/dcn32_resource.c 
> > b/drivers/gpu/drm/amd/display/dc/resource/dcn32/dcn32_resource.c
> > index 0f11d7c8791c..7a8aa9396dea 100644
> > --- a/drivers/gpu/drm/amd/display/dc/resource/dcn32/dcn32_resource.c
> > +++ b/drivers/gpu/drm/amd/display/dc/resource/dcn32/dcn32_resource.c
> > @@ -2007,11 +2007,9 @@ void dcn32_calculate_wm_and_dlg(struct dc *dc, 
> > struct dc_state *context,
> >
> >  static void dcn32_update_bw_bounding_box(struct dc *dc, struct 
> > clk_bw_params *bw_params)
> >  {
> > - struct dml2_configuration_options *dml2_opt;
> > + struct dml2_configuration_options *dml2_opt = >dml2_tmp;
> >
> > - dml2_opt = kmemdup(>dml2_options, sizeof(dc->dml2_options), 
> > GFP_KERNEL);
> > - if (!dml2_opt)
> > - return;
> > + memcpy(dml2_opt, >dml2_options, sizeof(dc->dml2_options));
> >
> >   DC_FP_START();
> >
> > @@ -2027,7 +2025,7 @@ static void dcn32_update_bw_bounding_box(struct dc 
> > *dc, struct clk_bw_params *bw
> >
> >   DC_FP_END();
> >
> > - kfree(dml2_opt);
> > + memcpy(>dml2_options, dml2_opt, sizeof(dc->dml2_options));
> >  }
> >
> >  static struct resource_funcs dcn32_res_pool_funcs = {
> > diff --git 
> > a/drivers/gpu/drm/amd/display/dc/resource/dcn321/dcn321_resource.c 
> > b/drivers/gpu/drm/amd/display/dc/resource/dcn321/dcn321_resource.c
> > index 07ca6f58447d..ef30e8632607 100644
> > --- a/drivers/gpu/drm/amd/display/dc/resource/dcn321/dcn321_resource.c
> > +++ b/drivers/gpu/drm/amd/display/dc/resource/dcn321/dcn321_resource.c
> > @@ -1581,11 +1581,9 @@ static struct dc_cap_funcs cap_funcs = {
> >
> >  static void dcn321_update_bw_bounding_box(struct dc *dc, struct 
> > clk_bw_params *bw_params)
> >  {
> > - struct dml2_configuration_options *dml2_opt;
> > + struct dml2_configuration_options *dml2_opt = >dml2_tmp;
> >
> > - dml2_opt = kmemdup(>dml2_options, sizeof(dc->dml2_options), 
> > GFP_KERNEL);
> > - if (!dml2_opt)
> > - return;
> > + memcpy(dml2_opt, >dml2_options, sizeof(dc->dml2_options));
> >
> >   DC_FP_START();
> >
> > @@ -1601,7 +1599,7 @@ static void dcn321_update_bw_bounding_box(struct dc 
> > *dc, struct clk_bw_params *b
> >
> >   DC_FP_END();
> >
> > - kfree(dml2_opt);
> > + memcpy(>dml2_options, dml2_opt, sizeof(dc->dml2_options));
> >  }
> >
> >  static struct resource_funcs dcn321_res_pool_funcs = {
>

Re: [PATCH] drm/amd/display: Add NULL check for 'afb' before dereferencing in amdgpu_dm_plane_handle_cursor_update

2024-06-05 Thread Harry Wentland




On 2024-06-05 11:46, Srinivasan Shanmugam wrote:
> This commit adds a null check for the 'afb' variable in the
> amdgpu_dm_plane_handle_cursor_update function. Previously, 'afb' was
> assumed to be null, but was used later in the code without a null check.
> This could potentially lead to a null pointer dereference.
> 
> Fixes the below:
> drivers/gpu/drm/amd/amdgpu/../display/amdgpu_dm/amdgpu_dm_plane.c:1298 
> amdgpu_dm_plane_handle_cursor_update() error: we previously assumed 'afb' 
> could be null (see line 1252)
> 
> Cc: Tom Chung 
> Cc: Rodrigo Siqueira 
> Cc: Roman Li 
> Cc: Hersen Wu 
> Cc: Alex Hung 
> Cc: Aurabindo Pillai 
> Cc: Harry Wentland 
> Signed-off-by: Srinivasan Shanmugam 

Reviewed-by: Harry Wentland 

Harry

> ---
>  .../drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c  | 16 
>  1 file changed, 12 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c 
> b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c
> index a64f20fcddaa..b339642b86c0 100644
> --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c
> +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c
> @@ -1246,14 +1246,22 @@ void amdgpu_dm_plane_handle_cursor_update(struct 
> drm_plane *plane,
>  {
>   struct amdgpu_device *adev = drm_to_adev(plane->dev);
>   struct amdgpu_framebuffer *afb = 
> to_amdgpu_framebuffer(plane->state->fb);
> - struct drm_crtc *crtc = afb ? plane->state->crtc : 
> old_plane_state->crtc;
> - struct dm_crtc_state *crtc_state = crtc ? to_dm_crtc_state(crtc->state) 
> : NULL;
> - struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
> - uint64_t address = afb ? afb->address : 0;
> + struct drm_crtc *crtc;
> + struct dm_crtc_state *crtc_state;
> + struct amdgpu_crtc *amdgpu_crtc;
> + u64 address;
>   struct dc_cursor_position position = {0};
>   struct dc_cursor_attributes attributes;
>   int ret;
>  
> + if (!afb)
> + return;
> +
> + crtc = plane->state->crtc ? plane->state->crtc : old_plane_state->crtc;
> + crtc_state = crtc ? to_dm_crtc_state(crtc->state) : NULL;
> + amdgpu_crtc = to_amdgpu_crtc(crtc);
> + address = afb->address;
> +
>   if (!plane->state->fb && !old_plane_state->fb)
>   return;
>

[PATCH] drm/amd/display: Add NULL check for 'afb' before dereferencing in amdgpu_dm_plane_handle_cursor_update

2024-06-05 Thread Srinivasan Shanmugam

This commit adds a null check for the 'afb' variable in the
amdgpu_dm_plane_handle_cursor_update function. Previously, 'afb' was
assumed to be null, but was used later in the code without a null check.
This could potentially lead to a null pointer dereference.

Fixes the below:
drivers/gpu/drm/amd/amdgpu/../display/amdgpu_dm/amdgpu_dm_plane.c:1298 
amdgpu_dm_plane_handle_cursor_update() error: we previously assumed 'afb' could 
be null (see line 1252)

Cc: Tom Chung 
Cc: Rodrigo Siqueira 
Cc: Roman Li 
Cc: Hersen Wu 
Cc: Alex Hung 
Cc: Aurabindo Pillai 
Cc: Harry Wentland 
Signed-off-by: Srinivasan Shanmugam 
---
 .../drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c  | 16 
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c 
b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c
index a64f20fcddaa..b339642b86c0 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c
@@ -1246,14 +1246,22 @@ void amdgpu_dm_plane_handle_cursor_update(struct 
drm_plane *plane,
 {
struct amdgpu_device *adev = drm_to_adev(plane->dev);
struct amdgpu_framebuffer *afb = 
to_amdgpu_framebuffer(plane->state->fb);
-   struct drm_crtc *crtc = afb ? plane->state->crtc : 
old_plane_state->crtc;
-   struct dm_crtc_state *crtc_state = crtc ? to_dm_crtc_state(crtc->state) 
: NULL;
-   struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
-   uint64_t address = afb ? afb->address : 0;
+   struct drm_crtc *crtc;
+   struct dm_crtc_state *crtc_state;
+   struct amdgpu_crtc *amdgpu_crtc;
+   u64 address;
struct dc_cursor_position position = {0};
struct dc_cursor_attributes attributes;
int ret;
 
+   if (!afb)
+   return;
+
+   crtc = plane->state->crtc ? plane->state->crtc : old_plane_state->crtc;
+   crtc_state = crtc ? to_dm_crtc_state(crtc->state) : NULL;
+   amdgpu_crtc = to_amdgpu_crtc(crtc);
+   address = afb->address;
+
if (!plane->state->fb && !old_plane_state->fb)
return;
 
-- 
2.34.1

Re: [PATCH] drm/amd/display: use pre-allocated temp structure for bounding box

2024-06-05 Thread Harry Wentland




On 2024-06-04 11:50, Alex Deucher wrote:
> This mirrors what the driver does for older DCN generations.
> 
> Should fix:
> 
> BUG: sleeping function called from invalid context at 
> include/linux/sched/mm.h:306
> in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 449, name: 
> kworker/u64:8
> preempt_count: 2, expected: 0
> RCU nest depth: 0, expected: 0
> Preemption disabled at:
> c0ce1580>] dc_fpu_begin+0x30/0xd0 [amdgpu]
> CPU: 5 PID: 449 Comm: kworker/u64:8 Tainted: GW  6.8.0+ #35
> Hardware name: System manufacturer System Product Name/ROG STRIX X570-E 
> GAMING WIFI II, BIOS 4204 02/24/2022
> Workqueue: events_unbound async_run_entry_fn
> 
> Fixes: 88c61827cedc ("drm/amd/display: dynamically allocate 
> dml2_configuration_options structures")
> Suggested-by: Hamza Mahfooz 
> Signed-off-by: Alex Deucher 
> Cc: George Zhang 
> Cc: Arnd Bergmann 
> Cc: harry.wentl...@amd.com
> Cc: sunpeng...@amd.com
> Cc: rodrigo.sique...@amd.com

Reviewed-by: Harry Wentland 

Harry

> ---
>  drivers/gpu/drm/amd/display/dc/dc.h   | 1 +
>  .../drm/amd/display/dc/resource/dcn32/dcn32_resource.c| 8 +++-
>  .../drm/amd/display/dc/resource/dcn321/dcn321_resource.c  | 8 +++-
>  3 files changed, 7 insertions(+), 10 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/display/dc/dc.h 
> b/drivers/gpu/drm/amd/display/dc/dc.h
> index d0ed01ac460d..d843933ad731 100644
> --- a/drivers/gpu/drm/amd/display/dc/dc.h
> +++ b/drivers/gpu/drm/amd/display/dc/dc.h
> @@ -1444,6 +1444,7 @@ struct dc {
>   } scratch;
>  
>   struct dml2_configuration_options dml2_options;
> + struct dml2_configuration_options dml2_tmp;
>   enum dc_acpi_cm_power_state power_state;
>  
>  };
> diff --git a/drivers/gpu/drm/amd/display/dc/resource/dcn32/dcn32_resource.c 
> b/drivers/gpu/drm/amd/display/dc/resource/dcn32/dcn32_resource.c
> index 0f11d7c8791c..7a8aa9396dea 100644
> --- a/drivers/gpu/drm/amd/display/dc/resource/dcn32/dcn32_resource.c
> +++ b/drivers/gpu/drm/amd/display/dc/resource/dcn32/dcn32_resource.c
> @@ -2007,11 +2007,9 @@ void dcn32_calculate_wm_and_dlg(struct dc *dc, struct 
> dc_state *context,
>  
>  static void dcn32_update_bw_bounding_box(struct dc *dc, struct clk_bw_params 
> *bw_params)
>  {
> - struct dml2_configuration_options *dml2_opt;
> + struct dml2_configuration_options *dml2_opt = >dml2_tmp;
>  
> - dml2_opt = kmemdup(>dml2_options, sizeof(dc->dml2_options), 
> GFP_KERNEL);
> - if (!dml2_opt)
> - return;
> + memcpy(dml2_opt, >dml2_options, sizeof(dc->dml2_options));
>  
>   DC_FP_START();
>  
> @@ -2027,7 +2025,7 @@ static void dcn32_update_bw_bounding_box(struct dc *dc, 
> struct clk_bw_params *bw
>  
>   DC_FP_END();
>  
> - kfree(dml2_opt);
> + memcpy(>dml2_options, dml2_opt, sizeof(dc->dml2_options));
>  }
>  
>  static struct resource_funcs dcn32_res_pool_funcs = {
> diff --git a/drivers/gpu/drm/amd/display/dc/resource/dcn321/dcn321_resource.c 
> b/drivers/gpu/drm/amd/display/dc/resource/dcn321/dcn321_resource.c
> index 07ca6f58447d..ef30e8632607 100644
> --- a/drivers/gpu/drm/amd/display/dc/resource/dcn321/dcn321_resource.c
> +++ b/drivers/gpu/drm/amd/display/dc/resource/dcn321/dcn321_resource.c
> @@ -1581,11 +1581,9 @@ static struct dc_cap_funcs cap_funcs = {
>  
>  static void dcn321_update_bw_bounding_box(struct dc *dc, struct 
> clk_bw_params *bw_params)
>  {
> - struct dml2_configuration_options *dml2_opt;
> + struct dml2_configuration_options *dml2_opt = >dml2_tmp;
>  
> - dml2_opt = kmemdup(>dml2_options, sizeof(dc->dml2_options), 
> GFP_KERNEL);
> - if (!dml2_opt)
> - return;
> + memcpy(dml2_opt, >dml2_options, sizeof(dc->dml2_options));
>  
>   DC_FP_START();
>  
> @@ -1601,7 +1599,7 @@ static void dcn321_update_bw_bounding_box(struct dc 
> *dc, struct clk_bw_params *b
>  
>   DC_FP_END();
>  
> - kfree(dml2_opt);
> + memcpy(>dml2_options, dml2_opt, sizeof(dc->dml2_options));
>  }
>  
>  static struct resource_funcs dcn321_res_pool_funcs = {

Re: [PATCH] drm/amd/display: Add null check for 'afb' in amdgpu_dm_update_cursor

2024-06-05 Thread Harry Wentland




On 2024-06-05 10:53, Srinivasan Shanmugam wrote:
> This commit adds a null check for the 'afb' variable in the
> amdgpu_dm_update_cursor function. Previously, 'afb' was assumed to be
> null at line 8388, but was used later in the code without a null check.
> This could potentially lead to a null pointer dereference.
> 
> Fixes the below:
> drivers/gpu/drm/amd/amdgpu/../display/amdgpu_dm/amdgpu_dm.c:8433 
> amdgpu_dm_update_cursor()
>   error: we previously assumed 'afb' could be null (see line 8388)
> 
> drivers/gpu/drm/amd/amdgpu/../display/amdgpu_dm/amdgpu_dm.c
> 8379 static void amdgpu_dm_update_cursor(struct drm_plane *plane,
> 8380 struct drm_plane_state 
> *old_plane_state,
> 8381 struct dc_stream_update *update)
> 8382 {
> 8383 struct amdgpu_device *adev = drm_to_adev(plane->dev);
> 8384 struct amdgpu_framebuffer *afb = 
> to_amdgpu_framebuffer(plane->state->fb);
> 8385 struct drm_crtc *crtc = afb ? plane->state->crtc : 
> old_plane_state->crtc;
>  ^
> 
> 8386 struct dm_crtc_state *crtc_state = crtc ? 
> to_dm_crtc_state(crtc->state) : NULL;
> 8387 struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
> 8388 uint64_t address = afb ? afb->address : 0;
> ^ Checks for NULL
> 
> 8389 struct dc_cursor_position position = {0};
> 8390 struct dc_cursor_attributes attributes;
> 8391 int ret;
> 8392
> 8393 if (!plane->state->fb && !old_plane_state->fb)
> 8394 return;
> 8395
> 8396 drm_dbg_atomic(plane->dev, "crtc_id=%d with size %d to %d\n",
> 8397amdgpu_crtc->crtc_id, plane->state->crtc_w,
> 8398plane->state->crtc_h);
> 8399
> 8400 ret = amdgpu_dm_plane_get_cursor_position(plane, crtc, 
> );
> 8401 if (ret)
> 8402 return;
> 8403
> 8404 if (!position.enable) {
> 8405 /* turn off cursor */
> 8406 if (crtc_state && crtc_state->stream) {
> 8407 
> dc_stream_set_cursor_position(crtc_state->stream,
> 8408   );
> 8409 update->cursor_position = 
> _state->stream->cursor_position;
> 8410 }
> 8411 return;
> 8412 }
> 8413
> 8414 amdgpu_crtc->cursor_width = plane->state->crtc_w;
> 8415 amdgpu_crtc->cursor_height = plane->state->crtc_h;
> 8416
> 8417 memset(, 0, sizeof(attributes));
> 8418 attributes.address.high_part = upper_32_bits(address);
> 8419 attributes.address.low_part  = lower_32_bits(address);
> 8420 attributes.width = plane->state->crtc_w;
> 8421 attributes.height= plane->state->crtc_h;
> 8422 attributes.color_format  = 
> CURSOR_MODE_COLOR_PRE_MULTIPLIED_ALPHA;
> 8423 attributes.rotation_angle= 0;
> 8424 attributes.attribute_flags.value = 0;
> 8425
> 8426 /* Enable cursor degamma ROM on DCN3+ for implicit sRGB 
> degamma in DRM
> 8427  * legacy gamma setup.
> 8428  */
> 8429 if (crtc_state->cm_is_degamma_srgb &&
> 8430 adev->dm.dc->caps.color.dpp.gamma_corr)
> 8431 
> attributes.attribute_flags.bits.ENABLE_CURSOR_DEGAMMA = 1;
> 8432
> --> 8433 attributes.pitch = afb->base.pitches[0] / 
> afb->base.format->cpp[0];
> ^  ^
> Unchecked dereferences
> 
> 8434
> 8435 if (crtc_state->stream) {
> 8436 if 
> (!dc_stream_set_cursor_attributes(crtc_state->stream,
> 8437  ))
> 8438 DRM_ERROR("DC failed to set cursor 
> attributes\n");
> 8439
> 8440 update->cursor_attributes = 
> _state->stream->cursor_attributes;
> 8441
> 8442 if 
> (!dc_stream_set_cursor_position(crtc_state->stream,
> 8443))
> 8444 DRM_ERROR("DC failed to set cursor 
> position\n");
> 8445
> 8446 update->cursor_position = 
> _state->stream->cursor_position;
> 8447 }
> 8448 }
> 
> Fixes: 66eba12a5482 ("drm/amd/display: Do cursor programming with rest of 
> pipe")
> Reported-by: Dan Carpenter 
> Cc: Tom Chung 
> Cc: Rodrigo Siqueira 
> Cc: Roman Li 
> Cc: Hersen Wu 
> Cc: Alex Hung 
> Cc: Aurabindo Pillai 
> Cc: Harry Wentland 
> Signed-off-by: Srinivasan Shanmugam 

This code comes from

[PATCH] drm/amd/display: Add null check for 'afb' in amdgpu_dm_update_cursor

2024-06-05 Thread Srinivasan Shanmugam

This commit adds a null check for the 'afb' variable in the
amdgpu_dm_update_cursor function. Previously, 'afb' was assumed to be
null at line 8388, but was used later in the code without a null check.
This could potentially lead to a null pointer dereference.

Fixes the below:
drivers/gpu/drm/amd/amdgpu/../display/amdgpu_dm/amdgpu_dm.c:8433 
amdgpu_dm_update_cursor()
error: we previously assumed 'afb' could be null (see line 8388)

drivers/gpu/drm/amd/amdgpu/../display/amdgpu_dm/amdgpu_dm.c
8379 static void amdgpu_dm_update_cursor(struct drm_plane *plane,
8380 struct drm_plane_state 
*old_plane_state,
8381 struct dc_stream_update *update)
8382 {
8383 struct amdgpu_device *adev = drm_to_adev(plane->dev);
8384 struct amdgpu_framebuffer *afb = 
to_amdgpu_framebuffer(plane->state->fb);
8385 struct drm_crtc *crtc = afb ? plane->state->crtc : 
old_plane_state->crtc;
 ^

8386 struct dm_crtc_state *crtc_state = crtc ? 
to_dm_crtc_state(crtc->state) : NULL;
8387 struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
8388 uint64_t address = afb ? afb->address : 0;
^ Checks for NULL

8389 struct dc_cursor_position position = {0};
8390 struct dc_cursor_attributes attributes;
8391 int ret;
8392
8393 if (!plane->state->fb && !old_plane_state->fb)
8394 return;
8395
8396 drm_dbg_atomic(plane->dev, "crtc_id=%d with size %d to %d\n",
8397amdgpu_crtc->crtc_id, plane->state->crtc_w,
8398plane->state->crtc_h);
8399
8400 ret = amdgpu_dm_plane_get_cursor_position(plane, crtc, 
);
8401 if (ret)
8402 return;
8403
8404 if (!position.enable) {
8405 /* turn off cursor */
8406 if (crtc_state && crtc_state->stream) {
8407 
dc_stream_set_cursor_position(crtc_state->stream,
8408   );
8409 update->cursor_position = 
_state->stream->cursor_position;
8410 }
8411 return;
8412 }
8413
8414 amdgpu_crtc->cursor_width = plane->state->crtc_w;
8415 amdgpu_crtc->cursor_height = plane->state->crtc_h;
8416
8417 memset(, 0, sizeof(attributes));
8418 attributes.address.high_part = upper_32_bits(address);
8419 attributes.address.low_part  = lower_32_bits(address);
8420 attributes.width = plane->state->crtc_w;
8421 attributes.height= plane->state->crtc_h;
8422 attributes.color_format  = 
CURSOR_MODE_COLOR_PRE_MULTIPLIED_ALPHA;
8423 attributes.rotation_angle= 0;
8424 attributes.attribute_flags.value = 0;
8425
8426 /* Enable cursor degamma ROM on DCN3+ for implicit sRGB 
degamma in DRM
8427  * legacy gamma setup.
8428  */
8429 if (crtc_state->cm_is_degamma_srgb &&
8430 adev->dm.dc->caps.color.dpp.gamma_corr)
8431 attributes.attribute_flags.bits.ENABLE_CURSOR_DEGAMMA 
= 1;
8432
--> 8433 attributes.pitch = afb->base.pitches[0] / 
afb->base.format->cpp[0];
^  ^
Unchecked dereferences

8434
8435 if (crtc_state->stream) {
8436 if 
(!dc_stream_set_cursor_attributes(crtc_state->stream,
8437  ))
8438 DRM_ERROR("DC failed to set cursor 
attributes\n");
8439
8440 update->cursor_attributes = 
_state->stream->cursor_attributes;
8441
8442 if (!dc_stream_set_cursor_position(crtc_state->stream,
8443))
8444 DRM_ERROR("DC failed to set cursor 
position\n");
8445
8446 update->cursor_position = 
_state->stream->cursor_position;
8447 }
8448 }

Fixes: 66eba12a5482 ("drm/amd/display: Do cursor programming with rest of pipe")
Reported-by: Dan Carpenter 
Cc: Tom Chung 
Cc: Rodrigo Siqueira 
Cc: Roman Li 
Cc: Hersen Wu 
Cc: Alex Hung 
Cc: Aurabindo Pillai 
Cc: Harry Wentland 
Signed-off-by: Srinivasan Shanmugam 
---
 .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c| 16 
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c 
b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index 6196de6cebbf..6d468badb669 100644
---

Re: [PATCH] drm/amd/display: Simplify if conditions

2024-06-05 Thread Alex Deucher

Applied.  Thanks!

Alex

On Tue, Jun 4, 2024 at 9:07 AM Thorsten Blum  wrote:
>
> The if conditions !A || A && B can be simplified to !A || B.
>
> Fixes the following Coccinelle/coccicheck warnings reported by
> excluded_middle.cocci:
>
> WARNING !A || A && B is equivalent to !A || B
> WARNING !A || A && B is equivalent to !A || B
> WARNING !A || A && B is equivalent to !A || B
>
> Compile-tested only.
>
> Signed-off-by: Thorsten Blum 
> ---
>  drivers/gpu/drm/amd/display/dc/dml2/dml2_dc_resource_mgmt.c | 6 +++---
>  .../gpu/drm/amd/display/dc/dml2/dml2_translation_helper.c   | 2 +-
>  2 files changed, 4 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/display/dc/dml2/dml2_dc_resource_mgmt.c 
> b/drivers/gpu/drm/amd/display/dc/dml2/dml2_dc_resource_mgmt.c
> index ad2a6b4769fe..940081df6dc0 100644
> --- a/drivers/gpu/drm/amd/display/dc/dml2/dml2_dc_resource_mgmt.c
> +++ b/drivers/gpu/drm/amd/display/dc/dml2/dml2_dc_resource_mgmt.c
> @@ -68,7 +68,7 @@ static bool get_plane_id(struct dml2_context *dml2, const 
> struct dc_state *state
> if (state->streams[i]->stream_id == stream_id) {
> for (j = 0; j < state->stream_status[i].plane_count; 
> j++) {
> if (state->stream_status[i].plane_states[j] 
> == plane &&
> -   (!is_plane_duplicate || 
> (is_plane_duplicate && (j == plane_index {
> +   (!is_plane_duplicate || (j == 
> plane_index))) {
> *plane_id = (i << 16) | j;
> return true;
> }
> @@ -707,8 +707,8 @@ static void free_unused_pipes_for_plane(struct 
> dml2_context *ctx, struct dc_stat
> for (i = 0; i < ctx->config.dcn_pipe_count; i++) {
> if (state->res_ctx.pipe_ctx[i].plane_state == plane &&
> state->res_ctx.pipe_ctx[i].stream->stream_id == 
> stream_id &&
> -   (!is_plane_duplicate || (is_plane_duplicate &&
> -   
> ctx->v20.scratch.dml_to_dc_pipe_mapping.dml_pipe_idx_to_plane_index[state->res_ctx.pipe_ctx[i].pipe_idx]
>  == plane_index)) &&
> +   (!is_plane_duplicate ||
> +   
> ctx->v20.scratch.dml_to_dc_pipe_mapping.dml_pipe_idx_to_plane_index[state->res_ctx.pipe_ctx[i].pipe_idx]
>  == plane_index) &&
> !is_pipe_used(pool, 
> state->res_ctx.pipe_ctx[i].pipe_idx)) {
> free_pipe(>res_ctx.pipe_ctx[i]);
> }
> diff --git a/drivers/gpu/drm/amd/display/dc/dml2/dml2_translation_helper.c 
> b/drivers/gpu/drm/amd/display/dc/dml2/dml2_translation_helper.c
> index a41812598ce8..b2bbf7988f92 100644
> --- a/drivers/gpu/drm/amd/display/dc/dml2/dml2_translation_helper.c
> +++ b/drivers/gpu/drm/amd/display/dc/dml2/dml2_translation_helper.c
> @@ -979,7 +979,7 @@ static bool get_plane_id(struct dml2_context *dml2, const 
> struct dc_state *conte
> if (context->streams[i]->stream_id == stream_id) {
> for (j = 0; j < 
> context->stream_status[i].plane_count; j++) {
> if (context->stream_status[i].plane_states[j] 
> == plane &&
> -   (!is_plane_duplicate || 
> (is_plane_duplicate && (j == plane_index {
> +   (!is_plane_duplicate || (j == 
> plane_index))) {
> *plane_id = (i << 16) | j;
> return true;
> }
> --
> 2.39.2
>

Re: [PATCH] drm/amdgpu: revert "take runtime pm reference when we attach a buffer"

2024-06-05 Thread Alex Deucher

On Wed, Jun 5, 2024 at 8:32 AM Christian König
 wrote:
>
> This reverts commit b8c415e3bf989be1b749409951debe6b36f5c78c and
> commit 425285d39afddaf4a9dab36045b816af0cc3e400.
>
> Taking a runtime pm reference for DMA-buf is actually completely
> unnecessary.
>
> When the buffer is in GTT it is still accessible even when the GPU
> is powered down and when it is in VRAM the buffer gets migrated to
> GTT before powering down.
>

Won't that kind of defeat the purpose of P2P DMA?  I guess it's a
trade off between performance and power savings.

> The only use case which would make it mandatory to keep the runtime
> pm reference would be if we pin the buffer into VRAM, and that's not
> something we currently do.

We'll need to bring this back if we ever support that?  I think we'll
want that for P2P DMA with RDMA NICs that don't support ODP.  That's
one of the big blockers for a lot of ROCm customers to migrate to the
in box drivers.

Alex

>
> Signed-off-by: Christian König 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c | 33 -
>  drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c   |  2 --
>  drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h   | 15 --
>  3 files changed, 50 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
> index 0b3b10d21952..ab848047204c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
> @@ -41,8 +41,6 @@
>  #include 
>  #include 
>  #include 
> -#include 
> -#include "amdgpu_trace.h"
>
>  /**
>   * amdgpu_dma_buf_attach - _buf_ops.attach implementation
> @@ -63,37 +61,7 @@ static int amdgpu_dma_buf_attach(struct dma_buf *dmabuf,
> if (pci_p2pdma_distance(adev->pdev, attach->dev, false) < 0)
> attach->peer2peer = false;
>
> -   r = pm_runtime_get_sync(adev_to_drm(adev)->dev);
> -   trace_amdgpu_runpm_reference_dumps(1, __func__);
> -   if (r < 0)
> -   goto out;
> -
> return 0;
> -
> -out:
> -   pm_runtime_put_autosuspend(adev_to_drm(adev)->dev);
> -   trace_amdgpu_runpm_reference_dumps(0, __func__);
> -   return r;
> -}
> -
> -/**
> - * amdgpu_dma_buf_detach - _buf_ops.detach implementation
> - *
> - * @dmabuf: DMA-buf where we remove the attachment from
> - * @attach: the attachment to remove
> - *
> - * Called when an attachment is removed from the DMA-buf.
> - */
> -static void amdgpu_dma_buf_detach(struct dma_buf *dmabuf,
> - struct dma_buf_attachment *attach)
> -{
> -   struct drm_gem_object *obj = dmabuf->priv;
> -   struct amdgpu_bo *bo = gem_to_amdgpu_bo(obj);
> -   struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
> -
> -   pm_runtime_mark_last_busy(adev_to_drm(adev)->dev);
> -   pm_runtime_put_autosuspend(adev_to_drm(adev)->dev);
> -   trace_amdgpu_runpm_reference_dumps(0, __func__);
>  }
>
>  /**
> @@ -266,7 +234,6 @@ static int amdgpu_dma_buf_begin_cpu_access(struct dma_buf 
> *dma_buf,
>
>  const struct dma_buf_ops amdgpu_dmabuf_ops = {
> .attach = amdgpu_dma_buf_attach,
> -   .detach = amdgpu_dma_buf_detach,
> .pin = amdgpu_dma_buf_pin,
> .unpin = amdgpu_dma_buf_unpin,
> .map_dma_buf = amdgpu_dma_buf_map,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> index 10832b470448..bc3ac73b6b8d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> @@ -181,7 +181,6 @@ int amdgpu_fence_emit(struct amdgpu_ring *ring, struct 
> dma_fence **f, struct amd
> amdgpu_ring_emit_fence(ring, ring->fence_drv.gpu_addr,
>seq, flags | AMDGPU_FENCE_FLAG_INT);
> pm_runtime_get_noresume(adev_to_drm(adev)->dev);
> -   trace_amdgpu_runpm_reference_dumps(1, __func__);
> ptr = >fence_drv.fences[seq & ring->fence_drv.num_fences_mask];
> if (unlikely(rcu_dereference_protected(*ptr, 1))) {
> struct dma_fence *old;
> @@ -309,7 +308,6 @@ bool amdgpu_fence_process(struct amdgpu_ring *ring)
> dma_fence_put(fence);
> pm_runtime_mark_last_busy(adev_to_drm(adev)->dev);
> pm_runtime_put_autosuspend(adev_to_drm(adev)->dev);
> -   trace_amdgpu_runpm_reference_dumps(0, __func__);
> } while (last_seq != seq);
>
> return true;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
> index f539b1d00234..2fd1bfb35916 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
> @@ -554,21 +554,6 @@ TRACE_EVENT(amdgpu_reset_reg_dumps,
>   __entry->value)
>  );
>
> -TRACE_EVENT(amdgpu_runpm_reference_dumps,
> -   TP_PROTO(uint32_t index, const char *func),
> -   TP_ARGS(index, func),
> -

Re: [PATCH 2/3] drm/xe: drop redundant W=1 warnings from Makefile

2024-06-05 Thread Jani Nikula

On Thu, 23 May 2024, Jani Nikula  wrote:
> Since commit a61ddb4393ad ("drm: enable (most) W=1 warnings by default
> across the subsystem"), most of the extra warnings in the driver
> Makefile are redundant. Remove them.
>
> Note that -Wmissing-declarations and -Wmissing-prototypes are always
> enabled by default in scripts/Makefile.extrawarn.
>
> Signed-off-by: Jani Nikula 

Pushed this patch to drm-xe-next with Lucas' irc ack.

BR,
Jani.


> ---
>  drivers/gpu/drm/xe/Makefile | 25 +
>  1 file changed, 1 insertion(+), 24 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile
> index c9f067b8f54d..f4366cb958be 100644
> --- a/drivers/gpu/drm/xe/Makefile
> +++ b/drivers/gpu/drm/xe/Makefile
> @@ -3,31 +3,8 @@
>  # Makefile for the drm device driver.  This driver provides support for the
>  # Direct Rendering Infrastructure (DRI) in XFree86 4.1.0 and higher.
>  
> -# Unconditionally enable W=1 warnings locally
> -# --- begin copy-paste W=1 warnings from scripts/Makefile.extrawarn
> -subdir-ccflags-y += -Wextra -Wunused -Wno-unused-parameter
> -subdir-ccflags-y += -Wmissing-declarations
> -subdir-ccflags-y += $(call cc-option, -Wrestrict)
> -subdir-ccflags-y += -Wmissing-format-attribute
> -subdir-ccflags-y += -Wmissing-prototypes
> -subdir-ccflags-y += -Wold-style-definition
> -subdir-ccflags-y += -Wmissing-include-dirs
> -subdir-ccflags-y += $(call cc-option, -Wunused-but-set-variable)
> -subdir-ccflags-y += $(call cc-option, -Wunused-const-variable)
> -subdir-ccflags-y += $(call cc-option, -Wpacked-not-aligned)
> -subdir-ccflags-y += $(call cc-option, -Wformat-overflow)
> +# Enable W=1 warnings not enabled in drm subsystem Makefile
>  subdir-ccflags-y += $(call cc-option, -Wformat-truncation)
> -subdir-ccflags-y += $(call cc-option, -Wstringop-truncation)
> -# The following turn off the warnings enabled by -Wextra
> -ifeq ($(findstring 2, $(KBUILD_EXTRA_WARN)),)
> -subdir-ccflags-y += -Wno-missing-field-initializers
> -subdir-ccflags-y += -Wno-type-limits
> -subdir-ccflags-y += -Wno-shift-negative-value
> -endif
> -ifeq ($(findstring 3, $(KBUILD_EXTRA_WARN)),)
> -subdir-ccflags-y += -Wno-sign-compare
> -endif
> -# --- end copy-paste
>  
>  # Enable -Werror in CI and development
>  subdir-ccflags-$(CONFIG_DRM_XE_WERROR) += -Werror

-- 
Jani Nikula, Intel

Re: [PATCH] drm/amd/display: Increase frame-larger-than warning limit

2024-06-05 Thread Palmer Dabbelt


On Mon, 03 Jun 2024 15:29:48 PDT (-0700), nat...@kernel.org wrote:

Hi Palmer,

On Thu, May 30, 2024 at 07:57:42AM -0700, Palmer Dabbelt wrote:

From: Palmer Dabbelt 

I get a handful of build errors along the lines of


linux/drivers/gpu/drm/amd/amdgpu/../display/dc/dml/dcn32/display_mode_vba_32.c:58:13:
 error: stack frame size (2352) exceeds limit (2048) in 
'DISPCLKDPPCLKDCFCLKDeepSleepPrefetchParametersWatermarksAndPerformanceCalculation'
 [-Werror,-Wframe-larger-than]
static void 
DISPCLKDPPCLKDCFCLKDeepSleepPrefetchParametersWatermarksAndPerformanceCalculation(
^

linux/drivers/gpu/drm/amd/amdgpu/../display/dc/dml/dcn32/display_mode_vba_32.c:1724:6:
 error: stack frame size (2096) exceeds limit (2048) in 
'dml32_ModeSupportAndSystemConfigurationFull' [-Werror,-Wframe-larger-than]
void dml32_ModeSupportAndSystemConfigurationFull(struct display_mode_lib 
*mode_lib)
 ^


Judging from the message, this is clang/LLVM? What version?


Yes, LLVM.  Looks like I'm on 16.0.6.  Probably time for an update, so 
I'll give it a shot.



I assume
this showed up in 6.10-rc1 because of commit 77acc6b55ae4 ("riscv: add
support for kernel-mode FPU"), which allows this driver to be built for
RISC-V.


Seems reasonable.  This didn't show up until post-merge, not 100% sure 
why.  I didn't really dig any farther.



Is this allmodconfig or some other configuration?


IIRC both "allmodconfig" and "allyesconfig" show it, but I don't have a 
build tree sitting around to be 100% sure.



as of 6.10-rc1.

Signed-off-by: Palmer Dabbelt 
---
 drivers/gpu/drm/amd/display/dc/dml/Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/dc/dml/Makefile 
b/drivers/gpu/drm/amd/display/dc/dml/Makefile
index c4a5efd2dda5..b2bd72e63734 100644
--- a/drivers/gpu/drm/amd/display/dc/dml/Makefile
+++ b/drivers/gpu/drm/amd/display/dc/dml/Makefile
@@ -62,9 +62,9 @@ endif

 ifneq ($(CONFIG_FRAME_WARN),0)
 ifeq ($(filter y,$(CONFIG_KASAN)$(CONFIG_KCSAN)),y)
-frame_warn_flag := -Wframe-larger-than=3072
+frame_warn_flag := -Wframe-larger-than=4096
 else
-frame_warn_flag := -Wframe-larger-than=2048
+frame_warn_flag := -Wframe-larger-than=3072
 endif
 endif

--
2.45.1

Re: [PATCH] drm/amd/display: use GFP_ATOMIC for bounding box

2024-06-05 Thread Arnd Bergmann

On Tue, Jun 4, 2024, at 16:22, Christian König wrote:
> Am 04.06.24 um 15:50 schrieb Alex Deucher:
>> This can be called in atomic context.  Should fix:
>>
>> BUG: sleeping function called from invalid context at 
>> include/linux/sched/mm.h:306
>> in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 449, name: 
>> kworker/u64:8
>> preempt_count: 2, expected: 0
>> RCU nest depth: 0, expected: 0
>> Preemption disabled at:
>> c0ce1580>] dc_fpu_begin+0x30/0xd0 [amdgpu]
>> CPU: 5 PID: 449 Comm: kworker/u64:8 Tainted: GW  6.8.0+ #35
>> Hardware name: System manufacturer System Product Name/ROG STRIX X570-E 
>> GAMING WIFI II, BIOS 4204 02/24/2022
>> Workqueue: events_unbound async_run_entry_fn
>
> That most likely only papers over the real problem and is not a valid fix.
>
> The question is why is that an atomic context?  If the function is used 
> under a spinlock then this might indeed be the right fix.
>
> If it's because of floating point operation then that here won't work 
> either.

It looks like there is only one caller, and this turns on
floating point instructions just for the call:

if (dc->res_pool->funcs->update_bw_bounding_box) {
DC_FP_START();
dc->res_pool->funcs->update_bw_bounding_box(dc, 
dc->clk_mgr->bw_params);
DC_FP_END();
}

but then they are enabled again inside of the function.

If we can drop the outer DC_FP_START(), that means
the GFP_KERNEL allocation works. On the other hand if
we actually have to enabled it before calling into
the function (e.g. because there is an architecture that
has incompatible function calling conventions when FP
is enabled), the inner one is redundant, but we can
potentially move the kmemdup() into the caller and
pass the copy by refernence.

  Arnd

Re: [PATCH 1/2][RFC] amdgpu: fix a race in kfd_mem_export_dmabuf()

2024-06-05 Thread Al Viro

On Tue, Jun 04, 2024 at 02:08:30PM -0400, Felix Kuehling wrote:
> > +int drm_gem_prime_handle_to_fd(struct drm_device *dev,
> > +  struct drm_file *file_priv, uint32_t handle,
> > +  uint32_t flags,
> > +  int *prime_fd)
> > +{
> > +   struct dma_buf *dmabuf;
> > +   int fd = get_unused_fd_flags(flags);
> > +
> > +   if (fd < 0)
> > +   return fd;
> > +
> > +   dmabuf = drm_gem_prime_handle_to_dmabuf(dev, file_priv, handle, flags);
> > +   if (IS_ERR(dmabuf))
> > +   return PTR_ERR(dmabuf);

That ought to be
if (IS_ERR(dmabuf)) {
put_unused_fd(fd);
return PTR_ERR(dmabuf);
}
or we'll leak an allocated descriptor.  Will fix and repost...

[PATCH v2 1/2][RFC] amdgpu: fix a race in kfd_mem_export_dmabuf()

2024-06-05 Thread Al Viro

[now without a descriptor leak; it really needs testing, though]

Using drm_gem_prime_handle_to_fd() to set dmabuf up and insert it into
descriptor table, only to have it looked up by file descriptor and
remove it from descriptor table is not just too convoluted - it's
racy; another thread might have modified the descriptor table while
we'd been going through that song and dance.

It's not hard to fix - turn drm_gem_prime_handle_to_fd()
into a wrapper for a new helper that would simply return the
dmabuf, without messing with descriptor table.

Then kfd_mem_export_dmabuf() would simply use that new helper
and leave the descriptor table alone.

Signed-off-by: Al Viro 
---
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 8975cf41a91a..793780bb819c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -25,7 +25,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 
 #include 
@@ -812,18 +811,13 @@ static int kfd_mem_export_dmabuf(struct kgd_mem *mem)
if (!mem->dmabuf) {
struct amdgpu_device *bo_adev;
struct dma_buf *dmabuf;
-   int r, fd;
 
bo_adev = amdgpu_ttm_adev(mem->bo->tbo.bdev);
-   r = drm_gem_prime_handle_to_fd(_adev->ddev, 
bo_adev->kfd.client.file,
+   dmabuf = drm_gem_prime_handle_to_dmabuf(_adev->ddev, 
bo_adev->kfd.client.file,
   mem->gem_handle,
mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE ?
-  DRM_RDWR : 0, );
-   if (r)
-   return r;
-   dmabuf = dma_buf_get(fd);
-   close_fd(fd);
-   if (WARN_ON_ONCE(IS_ERR(dmabuf)))
+  DRM_RDWR : 0);
+   if (IS_ERR(dmabuf))
return PTR_ERR(dmabuf);
mem->dmabuf = dmabuf;
}
diff --git a/drivers/gpu/drm/drm_prime.c b/drivers/gpu/drm/drm_prime.c
index 03bd3c7bd0dc..467c7a278ad3 100644
--- a/drivers/gpu/drm/drm_prime.c
+++ b/drivers/gpu/drm/drm_prime.c
@@ -409,23 +409,9 @@ static struct dma_buf *export_and_register_object(struct 
drm_device *dev,
return dmabuf;
 }
 
-/**
- * drm_gem_prime_handle_to_fd - PRIME export function for GEM drivers
- * @dev: dev to export the buffer from
- * @file_priv: drm file-private structure
- * @handle: buffer handle to export
- * @flags: flags like DRM_CLOEXEC
- * @prime_fd: pointer to storage for the fd id of the create dma-buf
- *
- * This is the PRIME export function which must be used mandatorily by GEM
- * drivers to ensure correct lifetime management of the underlying GEM object.
- * The actual exporting from GEM object to a dma-buf is done through the
- * _gem_object_funcs.export callback.
- */
-int drm_gem_prime_handle_to_fd(struct drm_device *dev,
+struct dma_buf *drm_gem_prime_handle_to_dmabuf(struct drm_device *dev,
   struct drm_file *file_priv, uint32_t handle,
-  uint32_t flags,
-  int *prime_fd)
+  uint32_t flags)
 {
struct drm_gem_object *obj;
int ret = 0;
@@ -434,14 +420,14 @@ int drm_gem_prime_handle_to_fd(struct drm_device *dev,
mutex_lock(_priv->prime.lock);
obj = drm_gem_object_lookup(file_priv, handle);
if (!obj)  {
-   ret = -ENOENT;
+   dmabuf = ERR_PTR(-ENOENT);
goto out_unlock;
}
 
dmabuf = drm_prime_lookup_buf_by_handle(_priv->prime, handle);
if (dmabuf) {
get_dma_buf(dmabuf);
-   goto out_have_handle;
+   goto out;
}
 
mutex_lock(>object_name_lock);
@@ -463,7 +449,6 @@ int drm_gem_prime_handle_to_fd(struct drm_device *dev,
/* normally the created dma-buf takes ownership of the ref,
 * but if that fails then drop the ref
 */
-   ret = PTR_ERR(dmabuf);
mutex_unlock(>object_name_lock);
goto out;
}
@@ -478,34 +463,51 @@ int drm_gem_prime_handle_to_fd(struct drm_device *dev,
ret = drm_prime_add_buf_handle(_priv->prime,
   dmabuf, handle);
mutex_unlock(>object_name_lock);
-   if (ret)
-   goto fail_put_dmabuf;
-
-out_have_handle:
-   ret = dma_buf_fd(dmabuf, flags);
-   /*
-* We must _not_ remove the buffer from the handle cache since the newly
-* created dma buf is already linked in the global obj->dma_buf pointer,
-* and that is invariant as long as a userspace gem handle exists.
-* Closing the handle will clean out the cache anyway, so we don't leak.
-*/
-   if (ret < 0) {
-   goto

Re: [PATCH 2/2][RFC] amdkfd CRIU fixes

2024-06-05 Thread Al Viro

On Tue, Jun 04, 2024 at 02:16:00PM -0400, Felix Kuehling wrote:
> 
> On 2024-06-03 22:14, Al Viro wrote:
> > Instead of trying to use close_fd() on failure exits, just have
> > criu_get_prime_handle() store the file reference without inserting
> > it into descriptor table.
> > 
> > Then, once the callers are past the last failure exit, they can go
> > and either insert all those file references into the corresponding
> > slots of descriptor table, or drop all those file references and
> > free the unused descriptors.
> > 
> > Signed-off-by: Al Viro 
> 
> Thank you for the patches and the explanation. One minor nit-pick inline.
> With that fixed, this patch is
> 
> Reviewed-by: Felix Kuehling 
> 
> I can apply this patch to amd-staging-drm-next, if you want. See one comment
> inline ...

Fine by me; I think this stuff would be better off in the relevant trees -
it's not as if we realistically could unexport close_fd() this cycle anyway,
more's the pity...  So nothing I've got in my queue has that as a prereq and
it would definitely have better odds of getting tested in your tree.

Re: [PATCH 3/3] drm/amdgpu: drop redundant W=1 warnings from Makefile

2024-06-05 Thread Jani Nikula

On Thu, 23 May 2024, Jani Nikula  wrote:
> Since commit a61ddb4393ad ("drm: enable (most) W=1 warnings by default
> across the subsystem"), most of the extra warnings in the driver
> Makefile are redundant. Remove them.
>
> Note that -Wmissing-declarations and -Wmissing-prototypes are always
> enabled by default in scripts/Makefile.extrawarn.
>
> Signed-off-by: Jani Nikula 

Alex, this one's for you to do whatever you want. ;)

BR,
Jani.


> ---
>  drivers/gpu/drm/amd/amdgpu/Makefile | 18 +-
>  1 file changed, 1 insertion(+), 17 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile 
> b/drivers/gpu/drm/amd/amdgpu/Makefile
> index 1f6b56ec99f6..9508d0b5708e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/Makefile
> +++ b/drivers/gpu/drm/amd/amdgpu/Makefile
> @@ -39,23 +39,7 @@ ccflags-y := -I$(FULL_AMD_PATH)/include/asic_reg \
>   -I$(FULL_AMD_DISPLAY_PATH)/amdgpu_dm \
>   -I$(FULL_AMD_PATH)/amdkfd
>  
> -subdir-ccflags-y := -Wextra
> -subdir-ccflags-y += -Wunused
> -subdir-ccflags-y += -Wmissing-prototypes
> -subdir-ccflags-y += -Wmissing-declarations
> -subdir-ccflags-y += -Wmissing-include-dirs
> -subdir-ccflags-y += -Wold-style-definition
> -subdir-ccflags-y += -Wmissing-format-attribute
> -# Need this to avoid recursive variable evaluation issues
> -cond-flags := $(call cc-option, -Wunused-but-set-variable) \
> - $(call cc-option, -Wunused-const-variable) \
> - $(call cc-option, -Wstringop-truncation) \
> - $(call cc-option, -Wpacked-not-aligned)
> -subdir-ccflags-y += $(cond-flags)
> -subdir-ccflags-y += -Wno-unused-parameter
> -subdir-ccflags-y += -Wno-type-limits
> -subdir-ccflags-y += -Wno-sign-compare
> -subdir-ccflags-y += -Wno-missing-field-initializers
> +# Locally disable W=1 warnings enabled in drm subsystem Makefile
>  subdir-ccflags-y += -Wno-override-init
>  subdir-ccflags-$(CONFIG_DRM_AMDGPU_WERROR) += -Werror

-- 
Jani Nikula, Intel

Re: [PATCH 1/2] drm/amd/display: use pre-allocated temp structure for bounding box

2024-06-05 Thread Arnd Bergmann

On Tue, Jun 4, 2024, at 20:06, Alex Deucher wrote:
> This mirrors what the driver does for older DCN generations.
>
> Should fix:
>
> BUG: sleeping function called from invalid context at 
> include/linux/sched/mm.h:306
> in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 449, name: 
> kworker/u64:8
> preempt_count: 2, expected: 0
> RCU nest depth: 0, expected: 0
> Preemption disabled at:
> c0ce1580>] dc_fpu_begin+0x30/0xd0 [amdgpu]
> CPU: 5 PID: 449 Comm: kworker/u64:8 Tainted: GW  6.8.0+ 
> #35
> Hardware name: System manufacturer System Product Name/ROG STRIX X570-E 
> GAMING WIFI II, BIOS 4204 02/24/2022
> Workqueue: events_unbound async_run_entry_fn
>
> Fixes: 88c61827cedc ("drm/amd/display: dynamically allocate 
> dml2_configuration_options structures")
> Tested-by: George Zhang 
> Suggested-by: Hamza Mahfooz 
> Signed-off-by: Alex Deucher 
> Cc: George Zhang 
> Cc: Arnd Bergmann 
> Cc: harry.wentl...@amd.com
> Cc: sunpeng...@amd.com
> Cc: rodrigo.sique...@amd.com

That looks nicer than all the other suggestions, thanks!

Acked-by: Arnd Bergmann 

One part sticks out though:

> @@ -2027,7 +2025,7 @@ static void dcn32_update_bw_bounding_box(struct 
> dc *dc, struct clk_bw_params *bw
> 
>   DC_FP_END();
> 
> - kfree(dml2_opt);
> + memcpy(>dml2_options, dml2_opt, sizeof(dc->dml2_options));
>  }

The driver did not copy the data back before, so this is a
change in behavior. Is that intentional or a mistake?

If the intention is to have the data copied back into
dc->dml2_options in the end, wouldn't it be easier to
just pass a pointer as in the old version before commit
e779f4587f61 ("drm/amd/display: Add handling for DC power
mode")?

 Arnd

Re: [PATCH] kernel/resource: optimize find_next_iomem_res

2024-06-05 Thread Greg Kroah-Hartman

On Thu, May 30, 2024 at 10:36:57PM -0700, Chia-I Wu wrote:
> We can skip children resources when the parent resource does not cover
> the range.
> 
> This should help vmf_insert_* users on x86, such as several DRM drivers.
> On my AMD Ryzen 5 7520C, when streaming data from cpu memory into amdgpu
> bo, the throughput goes from 5.1GB/s to 6.6GB/s.  perf report says
> 
>   34.69%--__do_fault
>   34.60%--amdgpu_gem_fault
>   34.00%--ttm_bo_vm_fault_reserved
>   32.95%--vmf_insert_pfn_prot
>   25.89%--track_pfn_insert
>   24.35%--lookup_memtype
>   21.77%--pat_pagerange_is_ram
>   20.80%--walk_system_ram_range
>   17.42%--find_next_iomem_res
> 
> before this change, and
> 
>   26.67%--__do_fault
>   26.57%--amdgpu_gem_fault
>   25.83%--ttm_bo_vm_fault_reserved
>   24.40%--vmf_insert_pfn_prot
>   14.30%--track_pfn_insert
>   12.20%--lookup_memtype
>   9.34%--pat_pagerange_is_ram
>   8.22%--walk_system_ram_range
>   5.09%--find_next_iomem_res
> 
> after.

That's great, but why is walk_system_ram_range() being called so often?

Shouldn't that be a "set up the device" only type of thing?  Why hammer
on "lookup_memtype" when you know the memtype, you just did the same
thing for the previous frame.

This feels like it could be optimized to just "don't call these things"
which would make it go faster, right?

What am I missing here, why does this always have to be calculated all
the time?  Resource mapping changes are rare, if at all, over the
lifetime of a system uptime.  Constantly calculating something that
never changes feels odd to me.

thanks,

greg k-h

Re: [PATCH 1/3] drm/i915: drop redundant W=1 warnings from Makefile

2024-06-05 Thread Jani Nikula

On Thu, 23 May 2024, Jani Nikula  wrote:
> Since commit a61ddb4393ad ("drm: enable (most) W=1 warnings by default
> across the subsystem"), most of the extra warnings in the driver
> Makefile are redundant. Remove them.
>
> Note that -Wmissing-declarations and -Wmissing-prototypes are always
> enabled by default in scripts/Makefile.extrawarn.
>
> Signed-off-by: Jani Nikula 

Pushed this patch to drm-intel-next with Lucas' irc ack.

BR,
Jani.

> ---
>  drivers/gpu/drm/i915/Makefile | 25 +
>  1 file changed, 1 insertion(+), 24 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
> index 7cad944b825c..a70d95a8fd7a 100644
> --- a/drivers/gpu/drm/i915/Makefile
> +++ b/drivers/gpu/drm/i915/Makefile
> @@ -3,31 +3,8 @@
>  # Makefile for the drm device driver.  This driver provides support for the
>  # Direct Rendering Infrastructure (DRI) in XFree86 4.1.0 and higher.
>  
> -# Unconditionally enable W=1 warnings locally
> -# --- begin copy-paste W=1 warnings from scripts/Makefile.extrawarn
> -subdir-ccflags-y += -Wextra -Wunused -Wno-unused-parameter
> -subdir-ccflags-y += -Wmissing-declarations
> -subdir-ccflags-y += $(call cc-option, -Wrestrict)
> -subdir-ccflags-y += -Wmissing-format-attribute
> -subdir-ccflags-y += -Wmissing-prototypes
> -subdir-ccflags-y += -Wold-style-definition
> -subdir-ccflags-y += -Wmissing-include-dirs
> -subdir-ccflags-y += $(call cc-option, -Wunused-but-set-variable)
> -subdir-ccflags-y += $(call cc-option, -Wunused-const-variable)
> -subdir-ccflags-y += $(call cc-option, -Wpacked-not-aligned)
> -subdir-ccflags-y += $(call cc-option, -Wformat-overflow)
> +# Enable W=1 warnings not enabled in drm subsystem Makefile
>  subdir-ccflags-y += $(call cc-option, -Wformat-truncation)
> -subdir-ccflags-y += $(call cc-option, -Wstringop-truncation)
> -# The following turn off the warnings enabled by -Wextra
> -ifeq ($(findstring 2, $(KBUILD_EXTRA_WARN)),)
> -subdir-ccflags-y += -Wno-missing-field-initializers
> -subdir-ccflags-y += -Wno-type-limits
> -subdir-ccflags-y += -Wno-shift-negative-value
> -endif
> -ifeq ($(findstring 3, $(KBUILD_EXTRA_WARN)),)
> -subdir-ccflags-y += -Wno-sign-compare
> -endif
> -# --- end copy-paste
>  
>  # Enable -Werror in CI and development
>  subdir-ccflags-$(CONFIG_DRM_I915_WERROR) += -Werror

-- 
Jani Nikula, Intel

Re: [PATCH] drm/amd/display: use pre-allocated temp structure for bounding box

2024-06-05 Thread Alex Deucher

I sent a separate patch for DCN 401.

On Wed, Jun 5, 2024 at 8:37 AM Pillai, Aurabindo
 wrote:
>
> [AMD Official Use Only - AMD Internal Distribution Only]
>
>
> Hi Alex,
>
> I'll a hunk for fixing DCN401 as well to this and resend it later today.
>
>
> --
>
> Regards,
> Jay
> 
> From: amd-gfx  on behalf of Zhang, 
> George 
> Sent: Tuesday, June 4, 2024 12:49 PM
> To: Deucher, Alexander ; 
> amd-gfx@lists.freedesktop.org 
> Cc: Mahfooz, Hamza ; Arnd Bergmann ; 
> Wentland, Harry ; Li, Sun peng (Leo) 
> ; Siqueira, Rodrigo 
> Subject: RE: [PATCH] drm/amd/display: use pre-allocated temp structure for 
> bounding box
>
> [AMD Official Use Only - AMD Internal Distribution Only]
>
> [AMD Official Use Only - AMD Internal Distribution Only]
>
> Tested-by: George Zhang 
>
> Thanks,
> George
> -Original Message-
> From: Deucher, Alexander 
> Sent: Tuesday, June 4, 2024 11:50 AM
> To: amd-gfx@lists.freedesktop.org
> Cc: Deucher, Alexander ; Mahfooz, Hamza 
> ; Zhang, George ; Arnd Bergmann 
> ; Wentland, Harry ; Li, Sun peng (Leo) 
> ; Siqueira, Rodrigo 
> Subject: [PATCH] drm/amd/display: use pre-allocated temp structure for 
> bounding box
>
> This mirrors what the driver does for older DCN generations.
>
> Should fix:
>
> BUG: sleeping function called from invalid context at 
> include/linux/sched/mm.h:306
> in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 449, name: 
> kworker/u64:8
> preempt_count: 2, expected: 0
> RCU nest depth: 0, expected: 0
> Preemption disabled at:
> c0ce1580>] dc_fpu_begin+0x30/0xd0 [amdgpu]
> CPU: 5 PID: 449 Comm: kworker/u64:8 Tainted: GW  6.8.0+ #35
> Hardware name: System manufacturer System Product Name/ROG STRIX X570-E 
> GAMING WIFI II, BIOS 4204 02/24/2022
> Workqueue: events_unbound async_run_entry_fn
>
> Fixes: 88c61827cedc ("drm/amd/display: dynamically allocate 
> dml2_configuration_options structures")
> Suggested-by: Hamza Mahfooz 
> Signed-off-by: Alex Deucher 
> Cc: George Zhang 
> Cc: Arnd Bergmann 
> Cc: harry.wentl...@amd.com
> Cc: sunpeng...@amd.com
> Cc: rodrigo.sique...@amd.com
> ---
>  drivers/gpu/drm/amd/display/dc/dc.h   | 1 +
>  .../drm/amd/display/dc/resource/dcn32/dcn32_resource.c| 8 +++-
>  .../drm/amd/display/dc/resource/dcn321/dcn321_resource.c  | 8 +++-
>  3 files changed, 7 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/display/dc/dc.h 
> b/drivers/gpu/drm/amd/display/dc/dc.h
> index d0ed01ac460d..d843933ad731 100644
> --- a/drivers/gpu/drm/amd/display/dc/dc.h
> +++ b/drivers/gpu/drm/amd/display/dc/dc.h
> @@ -1444,6 +1444,7 @@ struct dc {
> } scratch;
>
> struct dml2_configuration_options dml2_options;
> +   struct dml2_configuration_options dml2_tmp;
> enum dc_acpi_cm_power_state power_state;
>
>  };
> diff --git a/drivers/gpu/drm/amd/display/dc/resource/dcn32/dcn32_resource.c 
> b/drivers/gpu/drm/amd/display/dc/resource/dcn32/dcn32_resource.c
> index 0f11d7c8791c..7a8aa9396dea 100644
> --- a/drivers/gpu/drm/amd/display/dc/resource/dcn32/dcn32_resource.c
> +++ b/drivers/gpu/drm/amd/display/dc/resource/dcn32/dcn32_resource.c
> @@ -2007,11 +2007,9 @@ void dcn32_calculate_wm_and_dlg(struct dc *dc, struct 
> dc_state *context,
>
>  static void dcn32_update_bw_bounding_box(struct dc *dc, struct clk_bw_params 
> *bw_params)
>  {
> -   struct dml2_configuration_options *dml2_opt;
> +   struct dml2_configuration_options *dml2_opt = >dml2_tmp;
>
> -   dml2_opt = kmemdup(>dml2_options, sizeof(dc->dml2_options), 
> GFP_KERNEL);
> -   if (!dml2_opt)
> -   return;
> +   memcpy(dml2_opt, >dml2_options, sizeof(dc->dml2_options));
>
> DC_FP_START();
>
> @@ -2027,7 +2025,7 @@ static void dcn32_update_bw_bounding_box(struct dc *dc, 
> struct clk_bw_params *bw
>
> DC_FP_END();
>
> -   kfree(dml2_opt);
> +   memcpy(>dml2_options, dml2_opt, sizeof(dc->dml2_options));
>  }
>
>  static struct resource_funcs dcn32_res_pool_funcs = {
> diff --git a/drivers/gpu/drm/amd/display/dc/resource/dcn321/dcn321_resource.c 
> b/drivers/gpu/drm/amd/display/dc/resource/dcn321/dcn321_resource.c
> index 07ca6f58447d..ef30e8632607 100644
> --- a/drivers/gpu/drm/amd/display/dc/resource/dcn321/dcn321_resource.c
> +++ b/drivers/gpu/drm/amd/display/dc/resource/dcn321/dcn321_resource.c
> @@ -1581,11 +1581,9 @@ static struct dc_cap_funcs cap_funcs = {
>
>  static void dcn321_update_bw_bounding_box(struct dc *dc, struct 
> clk_bw_params *bw_params)
>  {
> -   struct dml2_configuration_options *dml2_opt;
> +   struct dml2_configuration_options *dml2_opt = >dml2_tmp;
>
> -   dml2_opt = kmemdup(>dml2_options, sizeof(dc->dml2_options), 
> GFP_KERNEL);
> -   if (!dml2_opt)
> -   return;
> +   memcpy(dml2_opt, >dml2_options, sizeof(dc->dml2_options));
>
> DC_FP_START();
>
> @@ -1601,7 +1599,7 @@ static void dcn321_update_bw_bounding_box(struct dc 
> *dc, struct

RE: [PATCH 00/18] Enhance amdgpu_firmware_request() to improve function flexibility

2024-06-05 Thread Wang, Yang(Kevin)

[AMD Official Use Only - AMD Internal Distribution Only]

Hi Chris,

I have checked my inbox and did not see your email.
Could you please resend it?

Best Regards,
Kevin

-Original Message-
From: Koenig, Christian 
Sent: Wednesday, June 5, 2024 7:21 PM
To: Wang, Yang(Kevin) ; amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Deucher, Alexander 

Subject: Re: [PATCH 00/18] Enhance amdgpu_firmware_request() to improve 
function flexibility

You haven't addressed any of my comments on patch #1.

Regards,
Christian.

Am 05.06.24 um 11:33 schrieb Wang, Yang(Kevin):
> [AMD Official Use Only - AMD Internal Distribution Only]
>
> Ping...
>
> Best Regards,
> Kevin
>
> -Original Message-
> From: amd-gfx  On Behalf Of Yang Wang
> Sent: Monday, June 3, 2024 9:42 AM
> To: amd-gfx@lists.freedesktop.org
> Cc: Zhang, Hawking ; Deucher, Alexander 
> 
> Subject: [PATCH 00/18] Enhance amdgpu_firmware_request() to improve function 
> flexibility
>
> Adding variable parameter support to function amdgpu_firmware_request() to 
> improve function flexibility.
>
> Yang Wang (18):
>drm/amdgpu: enhance amdgpu_ucode_request() function flexibility
>drm/amdgpu: refine gpu_info firmware loading
>drm/amdgpu: refine isp firmware loading
>drm/amdgpu: refine mes firmware loading
>drm/amdgpu: refine psp firmware loading
>drm/amdgpu: refine sdma firmware loading
>drm/amdgpu: refine imu firmware loading
>drm/amdgpu: refine pmfw/smu firmware loading
>drm/amdgpu: refine gmc firmware loading
>drm/amdgpu: refine vcn firmware loading
>drm/amdgpu: refine vpe firmware loading
>drm/amdgpu: refine gfx6 firmware loading
>drm/amdgpu: refine gfx7 firmware loading
>drm/amdgpu: refine gfx8 firmware loading
>drm/amdgpu: refine gfx9 firmware loading
>drm/amdgpu: refine gfx10 firmware loading
>drm/amdgpu: refine gfx11 firmware loading
>drm/amdgpu: refine gfx12 firmware loading
>
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c|  9 ++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_isp.c   |  4 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c   |  6 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c   | 26 ++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c  |  8 +--
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c | 30 +---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.h |  3 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c   | 14 ++--
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c   |  6 +-
>   drivers/gpu/drm/amd/amdgpu/cik_sdma.c | 11 +--
>   drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c| 25 ---
>   drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c| 26 ---
>   drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c| 22 +++---
>   drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c | 19 +++--
>   drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c | 27 
>   drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 69 +--
>   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 45 ++--
>   drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c   | 11 ++-
>   drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c | 14 ++--
>   drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c |  7 +-
>   drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c |  6 +-
>   drivers/gpu/drm/amd/amdgpu/imu_v11_0.c| 10 ++-
>   drivers/gpu/drm/amd/amdgpu/imu_v12_0.c| 10 ++-
>   drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c| 11 +--
>   drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c| 11 +--
>   drivers/gpu/drm/amd/pm/legacy-dpm/si_dpm.c|  8 +--
>   .../gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c|  6 +-
>   .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c|  6 +-
>   .../drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c  |  6 +-
>   .../gpu/drm/amd/pm/swsmu/smu14/smu_v14_0.c|  6 +-
>   30 files changed, 205 insertions(+), 257 deletions(-)
>
> --
> 2.34.1
>

[PATCH] drm/amdgpu: revert "take runtime pm reference when we attach a buffer"

2024-06-05 Thread Christian König

This reverts commit b8c415e3bf989be1b749409951debe6b36f5c78c and
commit 425285d39afddaf4a9dab36045b816af0cc3e400.

Taking a runtime pm reference for DMA-buf is actually completely
unnecessary.

When the buffer is in GTT it is still accessible even when the GPU
is powered down and when it is in VRAM the buffer gets migrated to
GTT before powering down.

The only use case which would make it mandatory to keep the runtime
pm reference would be if we pin the buffer into VRAM, and that's not
something we currently do.

Signed-off-by: Christian König 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c | 33 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c   |  2 --
 drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h   | 15 --
 3 files changed, 50 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
index 0b3b10d21952..ab848047204c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
@@ -41,8 +41,6 @@
 #include 
 #include 
 #include 
-#include 
-#include "amdgpu_trace.h"
 
 /**
  * amdgpu_dma_buf_attach - _buf_ops.attach implementation
@@ -63,37 +61,7 @@ static int amdgpu_dma_buf_attach(struct dma_buf *dmabuf,
if (pci_p2pdma_distance(adev->pdev, attach->dev, false) < 0)
attach->peer2peer = false;
 
-   r = pm_runtime_get_sync(adev_to_drm(adev)->dev);
-   trace_amdgpu_runpm_reference_dumps(1, __func__);
-   if (r < 0)
-   goto out;
-
return 0;
-
-out:
-   pm_runtime_put_autosuspend(adev_to_drm(adev)->dev);
-   trace_amdgpu_runpm_reference_dumps(0, __func__);
-   return r;
-}
-
-/**
- * amdgpu_dma_buf_detach - _buf_ops.detach implementation
- *
- * @dmabuf: DMA-buf where we remove the attachment from
- * @attach: the attachment to remove
- *
- * Called when an attachment is removed from the DMA-buf.
- */
-static void amdgpu_dma_buf_detach(struct dma_buf *dmabuf,
- struct dma_buf_attachment *attach)
-{
-   struct drm_gem_object *obj = dmabuf->priv;
-   struct amdgpu_bo *bo = gem_to_amdgpu_bo(obj);
-   struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
-
-   pm_runtime_mark_last_busy(adev_to_drm(adev)->dev);
-   pm_runtime_put_autosuspend(adev_to_drm(adev)->dev);
-   trace_amdgpu_runpm_reference_dumps(0, __func__);
 }
 
 /**
@@ -266,7 +234,6 @@ static int amdgpu_dma_buf_begin_cpu_access(struct dma_buf 
*dma_buf,
 
 const struct dma_buf_ops amdgpu_dmabuf_ops = {
.attach = amdgpu_dma_buf_attach,
-   .detach = amdgpu_dma_buf_detach,
.pin = amdgpu_dma_buf_pin,
.unpin = amdgpu_dma_buf_unpin,
.map_dma_buf = amdgpu_dma_buf_map,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index 10832b470448..bc3ac73b6b8d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -181,7 +181,6 @@ int amdgpu_fence_emit(struct amdgpu_ring *ring, struct 
dma_fence **f, struct amd
amdgpu_ring_emit_fence(ring, ring->fence_drv.gpu_addr,
   seq, flags | AMDGPU_FENCE_FLAG_INT);
pm_runtime_get_noresume(adev_to_drm(adev)->dev);
-   trace_amdgpu_runpm_reference_dumps(1, __func__);
ptr = >fence_drv.fences[seq & ring->fence_drv.num_fences_mask];
if (unlikely(rcu_dereference_protected(*ptr, 1))) {
struct dma_fence *old;
@@ -309,7 +308,6 @@ bool amdgpu_fence_process(struct amdgpu_ring *ring)
dma_fence_put(fence);
pm_runtime_mark_last_busy(adev_to_drm(adev)->dev);
pm_runtime_put_autosuspend(adev_to_drm(adev)->dev);
-   trace_amdgpu_runpm_reference_dumps(0, __func__);
} while (last_seq != seq);
 
return true;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
index f539b1d00234..2fd1bfb35916 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
@@ -554,21 +554,6 @@ TRACE_EVENT(amdgpu_reset_reg_dumps,
  __entry->value)
 );
 
-TRACE_EVENT(amdgpu_runpm_reference_dumps,
-   TP_PROTO(uint32_t index, const char *func),
-   TP_ARGS(index, func),
-   TP_STRUCT__entry(
-__field(uint32_t, index)
-__string(func, func)
-),
-   TP_fast_assign(
-  __entry->index = index;
-  __assign_str(func, func);
-  ),
-   TP_printk("amdgpu runpm reference dump 0x%x: 0x%s\n",
- __entry->index,
- __get_str(func))
-);
 #undef AMDGPU_JOB_GET_TIMELINE_NAME
 #endif
 
-- 
2.34.1

RE: [PATCH v2 03/10] drm/amdgpu: abort fence poll if reset is started

2024-06-05 Thread Liu, Shaoyun

[AMD Official Use Only - AMD Internal Distribution Only]

Hi, Christian
If you just want to know the  status of MES , then this approach is ok . My 
original thinking is  the driver might also need to know the status of the  
functionality it requires . ex . after call remove_queue , whether the CP has  
actually unmapped it from the HQD ,  but after think it again , you are right , 
if driver want to know the  CP status , it can  use the  QUERY_STATUS from mes 
misc _op  interface.
Please keep one thing in mind , currently  MES don't generate the  interrupt 
after it successfully execute the  command , it just update the specified  
fence value on the specified address , so driver side need to poll it to check 
whether the  command  finished successfully or not .  We can  discuss this when 
 you come up a new design .

Regards
Shaoyun.liu


-Original Message-
From: Koenig, Christian 
Sent: Tuesday, June 4, 2024 4:07 AM
To: Liu, Shaoyun ; Christian König 
; Li, Yunxiang (Teddy) ; 
amd-gfx@lists.freedesktop.org; Deucher, Alexander ; 
Xiao, Hua 
Subject: Re: [PATCH v2 03/10] drm/amdgpu: abort fence poll if reset is started

Hi Shaoyun,

see inline.

Am 03.06.24 um 20:28 schrieb Liu, Shaoyun:
> [AMD Official Use Only - AMD Internal Distribution Only]
>
> Thanks Christian for the detail explanation.
> I checked your patch , you try to use query_scheduler_status package  to 
> check the command completion . It  may not work as expected since this API 
> query the status is for MES itself , so mes can update the fence address with 
>  the expected seq value, but the  command  itself (ex .remove_queue for mes 
> and  then  mes send the  unmap_queue to kiq internally)  still fails.

And that is exactly the desired behavior.

See the fence value is for ring buffer processing and getting feedback in the 
case of a reset for example if the MES has processed the commands.

If that processing is successfully or not *must* be completely irrelevant for 
writing the fence value.

> For mes , driver always poll for the command completion

No, exactly that's what we want to avoid as much as possible.

Polling means that we throw away tons of CPU cycles and especially on fault 
handling and TLB flushing that is an absolutely in-acceptable performance loss.

>   ,  do you think it's an acceptable solution that MES set a specific failure 
> value(ex , -1)  in the fence address to indicate the failure of the  
> operation ?  But that should be similar to let driver poll the completion 
> till timeout .  MES internally also need to wait for a timeout on some  
> command that it sent  to CP (ex.  2 seconds for unmap_queue command).

No, what we should really do is to separate the fence and the result values. 
And then give an input dependency on each operation.

> I'm actually a little bit confused here , has driver use the lock to ensure 
> there is only one submission into MES at any time ?

No, exactly that's what we try to avoid. Othertwise we don't need a ring buffer 
in the first place.

>   MES can also trigger the interrupt on the failure if driver side require us 
> to do so , the  payload will have the seq number to indicate which submission 
> cause the failure , that might requires more code change from   driver side 
> .Please let me what's preferred from driver side.

I can come up with a more detailed explanation of the driver requirements when 
I'm back from sick leave.

Regards,
Christian.

>
> Regards
> Shaoyun.liu
>
> -Original Message-
> From: Koenig, Christian 
> Sent: Monday, June 3, 2024 6:59 AM
> To: Liu, Shaoyun ; Christian König
> ; Li, Yunxiang (Teddy)
> ; amd-gfx@lists.freedesktop.org; Deucher,
> Alexander ; Xiao, Hua 
> Subject: Re: [PATCH v2 03/10] drm/amdgpu: abort fence poll if reset is
> started
>
> Hi Shaoyun,
>
> yes my thinking goes into the same direction. The basic problem here is that 
> we are trying to stuff two different information into the same variable.
>
> The first information is if the commands haven been read by the MES from the 
> ring buffer. This information is necessary for the normal ring buffer and 
> reset handling, e.g. prevents ring buffer overflow, ordering of command, 
> lockups during reset etc...
>
> The second information is if a certain operation was successfully or not. For 
> example this is necessary to get signaled back if y queue map/unmap operation 
> has been successfully or if the CP not responding or any other error has 
> happened etc...
>
> Another issue is that while it is in general a good idea to have the firmware 
> work in a way where errors are reported instead of completely stopping all 
> processing, here we run into trouble because the driver usually assumes that 
> work can be scheduled on the ring buffer and a subsequent work is processed 
> only when everything previously submitted has completed successfully.
>
> So as initial fix for the issue we see I've send Alex a patch on Friday to 
> partially revert his change to use an

Re: [PATCH] Revert "drm/amd/display: avoid large on-stack structures"

2024-06-05 Thread Pillai, Aurabindo

[AMD Official Use Only - AMD Internal Distribution Only]

Thanks for the heads up!

--

Regards,
Jay

From: Mahfooz, Hamza 
Sent: Tuesday, June 4, 2024 1:50 PM
To: Pillai, Aurabindo ; amd-gfx@lists.freedesktop.org 

Cc: a...@arndb.de ; Deucher, Alexander 
; Wentland, Harry ; 
Siqueira, Rodrigo ; Zuo, Jerry 
Subject: Re: [PATCH] Revert "drm/amd/display: avoid large on-stack structures"

On 6/4/24 13:45, Aurabindo Pillai wrote:
> This reverts commit 44069f0f9b1fe577c5d4f05fa9eb02db8c618adc since
> the code path is called from FPU context, and triggers error like:
>
> [   26.924055] BUG: sleeping function called from invalid context at 
> include/linux/sched/mm.h:306
> [   26.924060] in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 1022, 
> name: modprobe
> [   26.924063] preempt_count: 2, expected: 0
> [   26.924064] RCU nest depth: 0, expected: 0
> [   26.924066] Preemption disabled at:
> [   26.924067] [] dc_fpu_begin+0x30/0xd0 [amdgpu]
> [   26.924322] CPU: 9 PID: 1022 Comm: modprobe Not tainted 6.8.0+ #20
> [   26.924325] Hardware name: System manufacturer System Product 
> Name/CROSSHAIR VI HERO, BIOS 6302 10/23/2018
> [   26.924326] Call Trace:
> [   26.924327]  
> [   26.924329]  dump_stack_lvl+0x37/0x50
> [   26.924333]  ? dc_fpu_begin+0x30/0xd0 [amdgpu]
> [   26.924589]  dump_stack+0x10/0x20
> [   26.924592]  __might_resched+0x16a/0x1c0
> [   26.924596]  __might_sleep+0x42/0x70
> [   26.924598]  __kmalloc_node_track_caller+0x2ad/0x4b0
> [   26.924601]  ? dm_helpers_allocate_gpu_mem+0x12/0x20 [amdgpu]
> [   26.924855]  ? dcn401_update_bw_bounding_box+0x2a/0xf0 [amdgpu]
> [   26.925122]  kmemdup+0x20/0x50
> [   26.925124]  ? kernel_fpu_begin_mask+0x6b/0xe0
> [   26.925127]  ? kmemdup+0x20/0x50
> [   26.925129]  dcn401_update_bw_bounding_box+0x2a/0xf0 [amdgpu]
> [   26.925393]  dc_create+0x311/0x670 [amdgpu]
> [   26.925649]  amdgpu_dm_init+0x2aa/0x1fa0 [amdgpu]
> [   26.925903]  ? irq_work_queue+0x38/0x50
> [   26.925907]  ? vprintk_emit+0x1e7/0x270
> [   26.925910]  ? dev_printk_emit+0x83/0xb0
> [   26.925914]  ? amdgpu_device_rreg+0x17/0x20 [amdgpu]
> [   26.926133]  dm_hw_init+0x14/0x30 [amdgpu]
> ---
>   drivers/gpu/drm/amd/display/dc/core/dc_state.c   | 16 +---
>   .../display/dc/resource/dcn401/dcn401_resource.c | 16 +---
>   2 files changed, 10 insertions(+), 22 deletions(-)

You probably want something like
https://patchwork.freedesktop.org/patch/597044/ instead.

>
> diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_state.c 
> b/drivers/gpu/drm/amd/display/dc/core/dc_state.c
> index 8ea9391c60b7..70928223b642 100644
> --- a/drivers/gpu/drm/amd/display/dc/core/dc_state.c
> +++ b/drivers/gpu/drm/amd/display/dc/core/dc_state.c
> @@ -193,11 +193,7 @@ static void init_state(struct dc *dc, struct dc_state 
> *state)
>   struct dc_state *dc_state_create(struct dc *dc, struct 
> dc_state_create_params *params)
>   {
>   #ifdef CONFIG_DRM_AMD_DC_FP
> - struct dml2_configuration_options *dml2_opt;
> -
> - dml2_opt = kmemdup(>dml2_options, sizeof(*dml2_opt), GFP_KERNEL);
> - if (!dml2_opt)
> - return NULL;
> + struct dml2_configuration_options dml2_opt = dc->dml2_options;
>   #endif
>struct dc_state *state = kvzalloc(sizeof(struct dc_state),
>GFP_KERNEL);
> @@ -211,14 +207,12 @@ struct dc_state *dc_state_create(struct dc *dc, struct 
> dc_state_create_params *p
>
>   #ifdef CONFIG_DRM_AMD_DC_FP
>if (dc->debug.using_dml2) {
> - dml2_opt->use_clock_dc_limits = false;
> - dml2_create(dc, dml2_opt, >bw_ctx.dml2);
> + dml2_opt.use_clock_dc_limits = false;
> + dml2_create(dc, _opt, >bw_ctx.dml2);
>
> - dml2_opt->use_clock_dc_limits = true;
> - dml2_create(dc, dml2_opt, >bw_ctx.dml2_dc_power_source);
> + dml2_opt.use_clock_dc_limits = true;
> + dml2_create(dc, _opt, >bw_ctx.dml2_dc_power_source);
>}
> -
> - kfree(dml2_opt);
>   #endif
>
>kref_init(>refcount);
> diff --git a/drivers/gpu/drm/amd/display/dc/resource/dcn401/dcn401_resource.c 
> b/drivers/gpu/drm/amd/display/dc/resource/dcn401/dcn401_resource.c
> index 8dfb0a3d21cb..247bac177d1b 100644
> --- a/drivers/gpu/drm/amd/display/dc/resource/dcn401/dcn401_resource.c
> +++ b/drivers/gpu/drm/amd/display/dc/resource/dcn401/dcn401_resource.c
> @@ -1581,27 +1581,21 @@ static struct dc_cap_funcs cap_funcs = {
>
>   static void dcn401_update_bw_bounding_box(struct dc *dc, struct 
> clk_bw_params *bw_params)
>   {
> - struct dml2_configuration_options *dml2_opt;
> -
> - dml2_opt = kmemdup(>dml2_options, sizeof(*dml2_opt), GFP_KERNEL);
> - if (!dml2_opt)
> - return;
> + struct dml2_configuration_options dml2_opt = dc->dml2_options;
>
>DC_FP_START();
>
>dcn401_update_bw_bounding_box_fpu(dc, bw_params);
>
> - dml2_opt->use_clock_dc_limits = false;
> +

Re: [PATCH] drm/amd/display: use pre-allocated temp structure for bounding box

2024-06-05 Thread Pillai, Aurabindo

[AMD Official Use Only - AMD Internal Distribution Only]

Hi Alex,

I'll a hunk for fixing DCN401 as well to this and resend it later today.


--

Regards,
Jay

From: amd-gfx  on behalf of Zhang, 
George 
Sent: Tuesday, June 4, 2024 12:49 PM
To: Deucher, Alexander ; 
amd-gfx@lists.freedesktop.org 
Cc: Mahfooz, Hamza ; Arnd Bergmann ; 
Wentland, Harry ; Li, Sun peng (Leo) 
; Siqueira, Rodrigo 
Subject: RE: [PATCH] drm/amd/display: use pre-allocated temp structure for 
bounding box

[AMD Official Use Only - AMD Internal Distribution Only]

[AMD Official Use Only - AMD Internal Distribution Only]

Tested-by: George Zhang 

Thanks,
George
-Original Message-
From: Deucher, Alexander 
Sent: Tuesday, June 4, 2024 11:50 AM
To: amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander ; Mahfooz, Hamza 
; Zhang, George ; Arnd Bergmann 
; Wentland, Harry ; Li, Sun peng (Leo) 
; Siqueira, Rodrigo 
Subject: [PATCH] drm/amd/display: use pre-allocated temp structure for bounding 
box

This mirrors what the driver does for older DCN generations.

Should fix:

BUG: sleeping function called from invalid context at 
include/linux/sched/mm.h:306
in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 449, name: kworker/u64:8
preempt_count: 2, expected: 0
RCU nest depth: 0, expected: 0
Preemption disabled at:
c0ce1580>] dc_fpu_begin+0x30/0xd0 [amdgpu]
CPU: 5 PID: 449 Comm: kworker/u64:8 Tainted: GW  6.8.0+ #35
Hardware name: System manufacturer System Product Name/ROG STRIX X570-E GAMING 
WIFI II, BIOS 4204 02/24/2022
Workqueue: events_unbound async_run_entry_fn

Fixes: 88c61827cedc ("drm/amd/display: dynamically allocate 
dml2_configuration_options structures")
Suggested-by: Hamza Mahfooz 
Signed-off-by: Alex Deucher 
Cc: George Zhang 
Cc: Arnd Bergmann 
Cc: harry.wentl...@amd.com
Cc: sunpeng...@amd.com
Cc: rodrigo.sique...@amd.com
---
 drivers/gpu/drm/amd/display/dc/dc.h   | 1 +
 .../drm/amd/display/dc/resource/dcn32/dcn32_resource.c| 8 +++-
 .../drm/amd/display/dc/resource/dcn321/dcn321_resource.c  | 8 +++-
 3 files changed, 7 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/dc/dc.h 
b/drivers/gpu/drm/amd/display/dc/dc.h
index d0ed01ac460d..d843933ad731 100644
--- a/drivers/gpu/drm/amd/display/dc/dc.h
+++ b/drivers/gpu/drm/amd/display/dc/dc.h
@@ -1444,6 +1444,7 @@ struct dc {
} scratch;

struct dml2_configuration_options dml2_options;
+   struct dml2_configuration_options dml2_tmp;
enum dc_acpi_cm_power_state power_state;

 };
diff --git a/drivers/gpu/drm/amd/display/dc/resource/dcn32/dcn32_resource.c 
b/drivers/gpu/drm/amd/display/dc/resource/dcn32/dcn32_resource.c
index 0f11d7c8791c..7a8aa9396dea 100644
--- a/drivers/gpu/drm/amd/display/dc/resource/dcn32/dcn32_resource.c
+++ b/drivers/gpu/drm/amd/display/dc/resource/dcn32/dcn32_resource.c
@@ -2007,11 +2007,9 @@ void dcn32_calculate_wm_and_dlg(struct dc *dc, struct 
dc_state *context,

 static void dcn32_update_bw_bounding_box(struct dc *dc, struct clk_bw_params 
*bw_params)
 {
-   struct dml2_configuration_options *dml2_opt;
+   struct dml2_configuration_options *dml2_opt = >dml2_tmp;

-   dml2_opt = kmemdup(>dml2_options, sizeof(dc->dml2_options), 
GFP_KERNEL);
-   if (!dml2_opt)
-   return;
+   memcpy(dml2_opt, >dml2_options, sizeof(dc->dml2_options));

DC_FP_START();

@@ -2027,7 +2025,7 @@ static void dcn32_update_bw_bounding_box(struct dc *dc, 
struct clk_bw_params *bw

DC_FP_END();

-   kfree(dml2_opt);
+   memcpy(>dml2_options, dml2_opt, sizeof(dc->dml2_options));
 }

 static struct resource_funcs dcn32_res_pool_funcs = {
diff --git a/drivers/gpu/drm/amd/display/dc/resource/dcn321/dcn321_resource.c 
b/drivers/gpu/drm/amd/display/dc/resource/dcn321/dcn321_resource.c
index 07ca6f58447d..ef30e8632607 100644
--- a/drivers/gpu/drm/amd/display/dc/resource/dcn321/dcn321_resource.c
+++ b/drivers/gpu/drm/amd/display/dc/resource/dcn321/dcn321_resource.c
@@ -1581,11 +1581,9 @@ static struct dc_cap_funcs cap_funcs = {

 static void dcn321_update_bw_bounding_box(struct dc *dc, struct clk_bw_params 
*bw_params)
 {
-   struct dml2_configuration_options *dml2_opt;
+   struct dml2_configuration_options *dml2_opt = >dml2_tmp;

-   dml2_opt = kmemdup(>dml2_options, sizeof(dc->dml2_options), 
GFP_KERNEL);
-   if (!dml2_opt)
-   return;
+   memcpy(dml2_opt, >dml2_options, sizeof(dc->dml2_options));

DC_FP_START();

@@ -1601,7 +1599,7 @@ static void dcn321_update_bw_bounding_box(struct dc *dc, 
struct clk_bw_params *b

DC_FP_END();

-   kfree(dml2_opt);
+   memcpy(>dml2_options, dml2_opt, sizeof(dc->dml2_options));
 }

 static struct resource_funcs dcn321_res_pool_funcs = {
--
2.45.1

[PATCH AUTOSEL 6.1 13/14] drm/amdgpu: fix dereference null return value for the function amdgpu_vm_pt_parent

2024-06-05 Thread Sasha Levin

From: Jesse Zhang 

[ Upstream commit a0cf36546cc24ae1c95d72253c7795d4d2fc77aa ]

The pointer parent may be NULLed by the function amdgpu_vm_pt_parent.
To make the code more robust, check the pointer parent.

Signed-off-by: Jesse Zhang 
Suggested-by: Christian König 
Reviewed-by: Christian König 
Signed-off-by: Alex Deucher 
Signed-off-by: Sasha Levin 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
index 69b3829bbe53f..370d02bdde862 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
@@ -754,11 +754,15 @@ int amdgpu_vm_pde_update(struct amdgpu_vm_update_params 
*params,
 struct amdgpu_vm_bo_base *entry)
 {
struct amdgpu_vm_bo_base *parent = amdgpu_vm_pt_parent(entry);
-   struct amdgpu_bo *bo = parent->bo, *pbo;
+   struct amdgpu_bo *bo, *pbo;
struct amdgpu_vm *vm = params->vm;
uint64_t pde, pt, flags;
unsigned int level;
 
+   if (WARN_ON(!parent))
+   return -EINVAL;
+
+   bo = parent->bo;
for (level = 0, pbo = bo->parent; pbo; ++level)
pbo = pbo->parent;
 
-- 
2.43.0

[PATCH AUTOSEL 6.6 17/18] Revert "drm/amdkfd: fix gfx_target_version for certain 11.0.3 devices"

2024-06-05 Thread Sasha Levin

From: Alex Deucher 

[ Upstream commit dd2b75fd9a79bf418e088656822af06fc253dbe3 ]

This reverts commit 28ebbb4981cb1fad12e0b1227dbecc88810b1ee8.

Revert this commit as apparently the LLVM code to take advantage of
this never landed.

Reviewed-by: Feifei Xu 
Signed-off-by: Alex Deucher 
Cc: Feifei Xu 
Signed-off-by: Sasha Levin 
---
 drivers/gpu/drm/amd/amdkfd/kfd_device.c | 11 ++-
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 913c70a0ef44f..0c94bdfadaabf 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -402,15 +402,8 @@ struct kfd_dev *kgd2kfd_probe(struct amdgpu_device *adev, 
bool vf)
f2g = _v11_kfd2kgd;
break;
case IP_VERSION(11, 0, 3):
-   if ((adev->pdev->device == 0x7460 &&
-adev->pdev->revision == 0x00) ||
-   (adev->pdev->device == 0x7461 &&
-adev->pdev->revision == 0x00))
-   /* Note: Compiler version is 11.0.5 while HW 
version is 11.0.3 */
-   gfx_target_version = 110005;
-   else
-   /* Note: Compiler version is 11.0.1 while HW 
version is 11.0.3 */
-   gfx_target_version = 110001;
+   /* Note: Compiler version is 11.0.1 while HW version is 
11.0.3 */
+   gfx_target_version = 110001;
f2g = _v11_kfd2kgd;
break;
default:
-- 
2.43.0

[PATCH AUTOSEL 6.6 16/18] drm/amdgpu: fix dereference null return value for the function amdgpu_vm_pt_parent

2024-06-05 Thread Sasha Levin

From: Jesse Zhang 

[ Upstream commit a0cf36546cc24ae1c95d72253c7795d4d2fc77aa ]

The pointer parent may be NULLed by the function amdgpu_vm_pt_parent.
To make the code more robust, check the pointer parent.

Signed-off-by: Jesse Zhang 
Suggested-by: Christian König 
Reviewed-by: Christian König 
Signed-off-by: Alex Deucher 
Signed-off-by: Sasha Levin 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
index 0d51222f6f8eb..026a3db947298 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
@@ -766,11 +766,15 @@ int amdgpu_vm_pde_update(struct amdgpu_vm_update_params 
*params,
 struct amdgpu_vm_bo_base *entry)
 {
struct amdgpu_vm_bo_base *parent = amdgpu_vm_pt_parent(entry);
-   struct amdgpu_bo *bo = parent->bo, *pbo;
+   struct amdgpu_bo *bo, *pbo;
struct amdgpu_vm *vm = params->vm;
uint64_t pde, pt, flags;
unsigned int level;
 
+   if (WARN_ON(!parent))
+   return -EINVAL;
+
+   bo = parent->bo;
for (level = 0, pbo = bo->parent; pbo; ++level)
pbo = pbo->parent;
 
-- 
2.43.0

[PATCH AUTOSEL 6.6 15/18] drm/amdgpu: silence UBSAN warning

2024-06-05 Thread Sasha Levin

From: Alex Deucher 

[ Upstream commit 05d9e24ddb15160164ba6e917a88c00907dc2434 ]

Convert a variable sized array from [1] to [].

Reviewed-by: Christian König 
Signed-off-by: Alex Deucher 
Signed-off-by: Sasha Levin 
---
 drivers/gpu/drm/amd/include/atomfirmware.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/include/atomfirmware.h 
b/drivers/gpu/drm/amd/include/atomfirmware.h
index fa7d6ced786f1..06be085515200 100644
--- a/drivers/gpu/drm/amd/include/atomfirmware.h
+++ b/drivers/gpu/drm/amd/include/atomfirmware.h
@@ -3508,7 +3508,7 @@ struct atom_gpio_voltage_object_v4
uint8_t  phase_delay_us;  // phase delay in unit of 
micro second
uint8_t  reserved;   
uint32_t gpio_mask_val; // GPIO Mask value
-   struct atom_voltage_gpio_map_lut voltage_gpio_lut[1];
+   struct atom_voltage_gpio_map_lut voltage_gpio_lut[] 
__counted_by(gpio_entry_num);
 };
 
 struct  atom_svid2_voltage_object_v4
-- 
2.43.0

Re: 6.10/bisected/regression - commits bc87d666c05 and 6d4279cb99ac cause appearing green flashing bar on top of screen on Radeon 6900XT and 120Hz

2024-06-05 Thread Mikhail Gavrilov

On Sun, May 26, 2024 at 7:06 PM Mikhail Gavrilov
 wrote:
>
> Hi,
> Day before yesterday I replaced 7900XTX to 6900XT for got clear in
> which kernel first time appeared warning message "DMA-API: amdgpu
> :0f:00.0: cacheline tracking EEXIST, overlapping mappings aren't
> supported".
> The kernel 6.3 and older won't boot on a computer with Radeon 7900XTX.
> When I booted the system with 6900XT I saw a green flashing bar on top
> of the screen when I typed commands in the gnome terminal which was
> maximized on full screen.
> Demonstration: https://youtu.be/tTvwQ_5pRkk
> For reproduction you need Radeon 6900XT GPU connected to 120Hz OLED TV by 
> HDMI.
>
> I bisected the issue and the first commit which I found was 6d4279cb99ac.
> commit 6d4279cb99ac4f51d10409501d29969f687ac8dc (HEAD)
> Author: Rodrigo Siqueira 
> Date:   Tue Mar 26 10:42:05 2024 -0600
>
> drm/amd/display: Drop legacy code
>
> This commit removes code that are not used by display anymore.
>
> Acked-by: Hamza Mahfooz 
> Signed-off-by: Rodrigo Siqueira 
> Signed-off-by: Alex Deucher 
>
>  drivers/gpu/drm/amd/display/dc/inc/hw/stream_encoder.h |  4 
>  drivers/gpu/drm/amd/display/dc/inc/resource.h  |  7 ---
>  drivers/gpu/drm/amd/display/dc/optc/dcn20/dcn20_optc.c | 10 
> --
>  drivers/gpu/drm/amd/display/dc/resource/dcn21/dcn21_resource.c | 33
> +
>  4 files changed, 1 insertion(+), 53 deletions(-)
>
> Every time after bisecting I usually make sure that I found the right
> commit and build the kernel with revert of the bad commit.
> But this time I again observed an issue after running a kernel builded
> without commit 6d4279cb99ac.
> And I decided to find a second bad commit.
> The second bad commit has been bc87d666c05.
> commit bc87d666c05a13e6d4ae1ddce41fc43d2567b9a2 (HEAD)
> Author: Rodrigo Siqueira 
> Date:   Tue Mar 26 11:55:19 2024 -0600
>
> drm/amd/display: Add fallback configuration for set DRR in DCN10
>
> Set OTG/OPTC parameters to 0 if something goes wrong on DCN10.
>
> Acked-by: Hamza Mahfooz 
> Signed-off-by: Rodrigo Siqueira 
> Signed-off-by: Alex Deucher 
>
>  drivers/gpu/drm/amd/display/dc/optc/dcn10/dcn10_optc.c | 15 ---
>  1 file changed, 12 insertions(+), 3 deletions(-)
>
> After reverting both these commits on top of 54f71b0369c9 the issue is gone.
>
> I also attach the build config.
>
> My hardware specs: https://linux-hardware.org/?probe=f25a873c5e
>
> Rodrigo or anyone else from the AMD team can you look please.
>

Did anyone watch?

-- 
Best Regards,
Mike Gavrilov.

[PATCH AUTOSEL 6.8 17/18] Revert "drm/amdkfd: fix gfx_target_version for certain 11.0.3 devices"

2024-06-05 Thread Sasha Levin

From: Alex Deucher 

[ Upstream commit dd2b75fd9a79bf418e088656822af06fc253dbe3 ]

This reverts commit 28ebbb4981cb1fad12e0b1227dbecc88810b1ee8.

Revert this commit as apparently the LLVM code to take advantage of
this never landed.

Reviewed-by: Feifei Xu 
Signed-off-by: Alex Deucher 
Cc: Feifei Xu 
Signed-off-by: Sasha Levin 
---
 drivers/gpu/drm/amd/amdkfd/kfd_device.c | 11 ++-
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index fcf6558d019e5..7d39982bf74e2 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -405,15 +405,8 @@ struct kfd_dev *kgd2kfd_probe(struct amdgpu_device *adev, 
bool vf)
f2g = _v11_kfd2kgd;
break;
case IP_VERSION(11, 0, 3):
-   if ((adev->pdev->device == 0x7460 &&
-adev->pdev->revision == 0x00) ||
-   (adev->pdev->device == 0x7461 &&
-adev->pdev->revision == 0x00))
-   /* Note: Compiler version is 11.0.5 while HW 
version is 11.0.3 */
-   gfx_target_version = 110005;
-   else
-   /* Note: Compiler version is 11.0.1 while HW 
version is 11.0.3 */
-   gfx_target_version = 110001;
+   /* Note: Compiler version is 11.0.1 while HW version is 
11.0.3 */
+   gfx_target_version = 110001;
f2g = _v11_kfd2kgd;
break;
case IP_VERSION(11, 5, 0):
-- 
2.43.0

[PATCH AUTOSEL 6.8 16/18] drm/amdgpu: fix dereference null return value for the function amdgpu_vm_pt_parent

2024-06-05 Thread Sasha Levin

From: Jesse Zhang 

[ Upstream commit a0cf36546cc24ae1c95d72253c7795d4d2fc77aa ]

The pointer parent may be NULLed by the function amdgpu_vm_pt_parent.
To make the code more robust, check the pointer parent.

Signed-off-by: Jesse Zhang 
Suggested-by: Christian König 
Reviewed-by: Christian König 
Signed-off-by: Alex Deucher 
Signed-off-by: Sasha Levin 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
index a160265ddc07c..099e375a39ec6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
@@ -766,11 +766,15 @@ int amdgpu_vm_pde_update(struct amdgpu_vm_update_params 
*params,
 struct amdgpu_vm_bo_base *entry)
 {
struct amdgpu_vm_bo_base *parent = amdgpu_vm_pt_parent(entry);
-   struct amdgpu_bo *bo = parent->bo, *pbo;
+   struct amdgpu_bo *bo, *pbo;
struct amdgpu_vm *vm = params->vm;
uint64_t pde, pt, flags;
unsigned int level;
 
+   if (WARN_ON(!parent))
+   return -EINVAL;
+
+   bo = parent->bo;
for (level = 0, pbo = bo->parent; pbo; ++level)
pbo = pbo->parent;
 
-- 
2.43.0

[PATCH AUTOSEL 6.8 15/18] drm/amdgpu: silence UBSAN warning

2024-06-05 Thread Sasha Levin

From: Alex Deucher 

[ Upstream commit 05d9e24ddb15160164ba6e917a88c00907dc2434 ]

Convert a variable sized array from [1] to [].

Reviewed-by: Christian König 
Signed-off-by: Alex Deucher 
Signed-off-by: Sasha Levin 
---
 drivers/gpu/drm/amd/include/atomfirmware.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/include/atomfirmware.h 
b/drivers/gpu/drm/amd/include/atomfirmware.h
index fa7d6ced786f1..06be085515200 100644
--- a/drivers/gpu/drm/amd/include/atomfirmware.h
+++ b/drivers/gpu/drm/amd/include/atomfirmware.h
@@ -3508,7 +3508,7 @@ struct atom_gpio_voltage_object_v4
uint8_t  phase_delay_us;  // phase delay in unit of 
micro second
uint8_t  reserved;   
uint32_t gpio_mask_val; // GPIO Mask value
-   struct atom_voltage_gpio_map_lut voltage_gpio_lut[1];
+   struct atom_voltage_gpio_map_lut voltage_gpio_lut[] 
__counted_by(gpio_entry_num);
 };
 
 struct  atom_svid2_voltage_object_v4
-- 
2.43.0

[PATCH AUTOSEL 6.9 19/23] drm/amdgpu: silence UBSAN warning

2024-06-05 Thread Sasha Levin

From: Alex Deucher 

[ Upstream commit 05d9e24ddb15160164ba6e917a88c00907dc2434 ]

Convert a variable sized array from [1] to [].

Reviewed-by: Christian König 
Signed-off-by: Alex Deucher 
Signed-off-by: Sasha Levin 
---
 drivers/gpu/drm/amd/include/atomfirmware.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/include/atomfirmware.h 
b/drivers/gpu/drm/amd/include/atomfirmware.h
index af3eebb4c9bcb..f732182218330 100644
--- a/drivers/gpu/drm/amd/include/atomfirmware.h
+++ b/drivers/gpu/drm/amd/include/atomfirmware.h
@@ -3540,7 +3540,7 @@ struct atom_gpio_voltage_object_v4
uint8_t  phase_delay_us;  // phase delay in unit of 
micro second
uint8_t  reserved;   
uint32_t gpio_mask_val; // GPIO Mask value
-   struct atom_voltage_gpio_map_lut voltage_gpio_lut[1];
+   struct atom_voltage_gpio_map_lut voltage_gpio_lut[] 
__counted_by(gpio_entry_num);
 };
 
 struct  atom_svid2_voltage_object_v4
-- 
2.43.0

[PATCH AUTOSEL 6.9 21/23] Revert "drm/amdkfd: fix gfx_target_version for certain 11.0.3 devices"

2024-06-05 Thread Sasha Levin

From: Alex Deucher 

[ Upstream commit dd2b75fd9a79bf418e088656822af06fc253dbe3 ]

This reverts commit 28ebbb4981cb1fad12e0b1227dbecc88810b1ee8.

Revert this commit as apparently the LLVM code to take advantage of
this never landed.

Reviewed-by: Feifei Xu 
Signed-off-by: Alex Deucher 
Cc: Feifei Xu 
Signed-off-by: Sasha Levin 
---
 drivers/gpu/drm/amd/amdkfd/kfd_device.c | 11 ++-
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 719d6d365e150..ff01610fbce3b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -408,15 +408,8 @@ struct kfd_dev *kgd2kfd_probe(struct amdgpu_device *adev, 
bool vf)
f2g = _v11_kfd2kgd;
break;
case IP_VERSION(11, 0, 3):
-   if ((adev->pdev->device == 0x7460 &&
-adev->pdev->revision == 0x00) ||
-   (adev->pdev->device == 0x7461 &&
-adev->pdev->revision == 0x00))
-   /* Note: Compiler version is 11.0.5 while HW 
version is 11.0.3 */
-   gfx_target_version = 110005;
-   else
-   /* Note: Compiler version is 11.0.1 while HW 
version is 11.0.3 */
-   gfx_target_version = 110001;
+   /* Note: Compiler version is 11.0.1 while HW version is 
11.0.3 */
+   gfx_target_version = 110001;
f2g = _v11_kfd2kgd;
break;
case IP_VERSION(11, 5, 0):
-- 
2.43.0

[PATCH AUTOSEL 6.9 20/23] drm/amdgpu: fix dereference null return value for the function amdgpu_vm_pt_parent

2024-06-05 Thread Sasha Levin

From: Jesse Zhang 

[ Upstream commit a0cf36546cc24ae1c95d72253c7795d4d2fc77aa ]

The pointer parent may be NULLed by the function amdgpu_vm_pt_parent.
To make the code more robust, check the pointer parent.

Signed-off-by: Jesse Zhang 
Suggested-by: Christian König 
Reviewed-by: Christian König 
Signed-off-by: Alex Deucher 
Signed-off-by: Sasha Levin 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
index 124389a6bf481..512b42225003f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
@@ -682,11 +682,15 @@ int amdgpu_vm_pde_update(struct amdgpu_vm_update_params 
*params,
 struct amdgpu_vm_bo_base *entry)
 {
struct amdgpu_vm_bo_base *parent = amdgpu_vm_pt_parent(entry);
-   struct amdgpu_bo *bo = parent->bo, *pbo;
+   struct amdgpu_bo *bo, *pbo;
struct amdgpu_vm *vm = params->vm;
uint64_t pde, pt, flags;
unsigned int level;
 
+   if (WARN_ON(!parent))
+   return -EINVAL;
+
+   bo = parent->bo;
for (level = 0, pbo = bo->parent; pbo; ++level)
pbo = pbo->parent;
 
-- 
2.43.0

[PATCH AUTOSEL 6.9 03/23] drm/amdgpu: correct hbm field in boot status

2024-06-05 Thread Sasha Levin

From: Hawking Zhang 

[ Upstream commit ec58991054e899c9d86f7e3c8a96cb602d4b5938 ]

hbm filed takes bit 13 and bit 14 in boot status.

Signed-off-by: Hawking Zhang 
Reviewed-by: Tao Zhou 
Signed-off-by: Alex Deucher 
Signed-off-by: Sasha Levin 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index e0f8ce9d84406..db9cb2b4e9823 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -43,7 +43,7 @@ struct amdgpu_iv_entry;
 #define AMDGPU_RAS_GPU_ERR_HBM_BIST_TEST(x)AMDGPU_GET_REG_FIELD(x, 
7, 7)
 #define AMDGPU_RAS_GPU_ERR_SOCKET_ID(x)
AMDGPU_GET_REG_FIELD(x, 10, 8)
 #define AMDGPU_RAS_GPU_ERR_AID_ID(x)   AMDGPU_GET_REG_FIELD(x, 
12, 11)
-#define AMDGPU_RAS_GPU_ERR_HBM_ID(x)   AMDGPU_GET_REG_FIELD(x, 
13, 13)
+#define AMDGPU_RAS_GPU_ERR_HBM_ID(x)   AMDGPU_GET_REG_FIELD(x, 
14, 13)
 #define AMDGPU_RAS_GPU_ERR_BOOT_STATUS(x)  AMDGPU_GET_REG_FIELD(x, 
31, 31)
 
 #define AMDGPU_RAS_BOOT_STATUS_POLLING_LIMIT   1000
-- 
2.43.0

[PATCH AUTOSEL 6.8 4/6] drm/amdkfd: Let VRAM allocations go to GTT domain on small APUs

2024-06-05 Thread Sasha Levin

From: Lang Yu 

[ Upstream commit eb853413d02c8d9b27942429b261a9eef228f005 ]

Small APUs(i.e., consumer, embedded products) usually have a small
carveout device memory which can't satisfy most compute workloads
memory allocation requirements.

We can't even run a Basic MNIST Example with a default 512MB carveout.
https://github.com/pytorch/examples/tree/main/mnist. Error Log:

"torch.cuda.OutOfMemoryError: HIP out of memory. Tried to allocate
84.00 MiB. GPU 0 has a total capacity of 512.00 MiB of which 0 bytes
is free. Of the allocated memory 103.83 MiB is allocated by PyTorch,
and 22.17 MiB is reserved by PyTorch but unallocated"

Though we can change BIOS settings to enlarge carveout size,
which is inflexible and may bring complaint. On the other hand,
the memory resource can't be effectively used between host and device.

The solution is MI300A approach, i.e., let VRAM allocations go to GTT.
Then device and host can flexibly and effectively share memory resource.

v2: Report local_mem_size_private as 0. (Felix)

Signed-off-by: Lang Yu 
Reviewed-by: Felix Kuehling 
Signed-off-by: Alex Deucher 
Signed-off-by: Sasha Levin 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c|  5 +
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 20 ++-
 drivers/gpu/drm/amd/amdkfd/kfd_migrate.c  |  2 +-
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c  |  6 --
 drivers/gpu/drm/amd/amdkfd/kfd_svm.h  |  3 ++-
 5 files changed, 23 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 131983ed43465..7bd14a293ee52 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -455,6 +455,9 @@ void amdgpu_amdkfd_get_local_mem_info(struct amdgpu_device 
*adev,
else
mem_info->local_mem_size_private =
KFD_XCP_MEMORY_SIZE(adev, xcp->id);
+   } else if (adev->flags & AMD_IS_APU) {
+   mem_info->local_mem_size_public = (ttm_tt_pages_limit() << 
PAGE_SHIFT);
+   mem_info->local_mem_size_private = 0;
} else {
mem_info->local_mem_size_public = adev->gmc.visible_vram_size;
mem_info->local_mem_size_private = adev->gmc.real_vram_size -
@@ -803,6 +806,8 @@ u64 amdgpu_amdkfd_xcp_memory_size(struct amdgpu_device 
*adev, int xcp_id)
}
do_div(tmp, adev->xcp_mgr->num_xcp_per_mem_partition);
return ALIGN_DOWN(tmp, PAGE_SIZE);
+   } else if (adev->flags & AMD_IS_APU) {
+   return (ttm_tt_pages_limit() << PAGE_SHIFT);
} else {
return adev->gmc.real_vram_size;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 36796fbc00315..fd0aef17ab36c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -196,7 +196,7 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device 
*adev,
return -EINVAL;
 
vram_size = KFD_XCP_MEMORY_SIZE(adev, xcp_id);
-   if (adev->gmc.is_app_apu) {
+   if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) {
system_mem_needed = size;
ttm_mem_needed = size;
}
@@ -232,7 +232,8 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device 
*adev,
  "adev reference can't be null when vram is used");
if (adev && xcp_id >= 0) {
adev->kfd.vram_used[xcp_id] += vram_needed;
-   adev->kfd.vram_used_aligned[xcp_id] += adev->gmc.is_app_apu ?
+   adev->kfd.vram_used_aligned[xcp_id] +=
+   (adev->gmc.is_app_apu || adev->flags & 
AMD_IS_APU) ?
vram_needed :
ALIGN(vram_needed, VRAM_AVAILABLITY_ALIGN);
}
@@ -260,7 +261,7 @@ void amdgpu_amdkfd_unreserve_mem_limit(struct amdgpu_device 
*adev,
 
if (adev) {
adev->kfd.vram_used[xcp_id] -= size;
-   if (adev->gmc.is_app_apu) {
+   if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) {
adev->kfd.vram_used_aligned[xcp_id] -= size;
kfd_mem_limit.system_mem_used -= size;
kfd_mem_limit.ttm_mem_used -= size;
@@ -887,7 +888,7 @@ static int kfd_mem_attach(struct amdgpu_device *adev, 
struct kgd_mem *mem,
 * if peer device has large BAR. In contrast, access over xGMI is
 * allowed for both small and large BAR configurations of peer device
 */
-   if ((adev != bo_adev && !adev->gmc.is_app_apu) &&
+   if ((adev != bo_adev && !(adev->gmc.is_app_apu || adev->flags & 
AMD_IS_APU)) &&

[PATCH AUTOSEL 6.8 3/6] drm/amdkfd: handle duplicate BOs in reserve_bo_and_cond_vms

2024-06-05 Thread Sasha Levin

From: Lang Yu 

[ Upstream commit 2a705f3e49d20b59cd9e5cc3061b2d92ebe1e5f0 ]

Observed on gfx8 ASIC where KFD_IOC_ALLOC_MEM_FLAGS_AQL_QUEUE_MEM is used.
Two attachments use the same VM, root PD would be locked twice.

[   57.910418] Call Trace:
[   57.793726]  ? reserve_bo_and_cond_vms+0x111/0x1c0 [amdgpu]
[   57.793820]  amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu+0x6c/0x1c0 [amdgpu]
[   57.793923]  ? idr_get_next_ul+0xbe/0x100
[   57.793933]  kfd_process_device_free_bos+0x7e/0xf0 [amdgpu]
[   57.794041]  kfd_process_wq_release+0x2ae/0x3c0 [amdgpu]
[   57.794141]  ? process_scheduled_works+0x29c/0x580
[   57.794147]  process_scheduled_works+0x303/0x580
[   57.794157]  ? __pfx_worker_thread+0x10/0x10
[   57.794160]  worker_thread+0x1a2/0x370
[   57.794165]  ? __pfx_worker_thread+0x10/0x10
[   57.794167]  kthread+0x11b/0x150
[   57.794172]  ? __pfx_kthread+0x10/0x10
[   57.794177]  ret_from_fork+0x3d/0x60
[   57.794181]  ? __pfx_kthread+0x10/0x10
[   57.794184]  ret_from_fork_asm+0x1b/0x30

Signed-off-by: Lang Yu 
Reviewed-by: Felix Kuehling 
Signed-off-by: Alex Deucher 
Signed-off-by: Sasha Levin 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index daa66eb4f722b..36796fbc00315 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -1186,7 +1186,8 @@ static int reserve_bo_and_cond_vms(struct kgd_mem *mem,
int ret;
 
ctx->sync = >sync;
-   drm_exec_init(>exec, DRM_EXEC_INTERRUPTIBLE_WAIT, 0);
+   drm_exec_init(>exec, DRM_EXEC_INTERRUPTIBLE_WAIT |
+ DRM_EXEC_IGNORE_DUPLICATES, 0);
drm_exec_until_all_locked(>exec) {
ctx->n_vms = 0;
list_for_each_entry(entry, >attachments, list) {
-- 
2.43.0

[PATCH AUTOSEL 6.9 4/6] drm/amdkfd: Let VRAM allocations go to GTT domain on small APUs

2024-06-05 Thread Sasha Levin

From: Lang Yu 

[ Upstream commit eb853413d02c8d9b27942429b261a9eef228f005 ]

Small APUs(i.e., consumer, embedded products) usually have a small
carveout device memory which can't satisfy most compute workloads
memory allocation requirements.

We can't even run a Basic MNIST Example with a default 512MB carveout.
https://github.com/pytorch/examples/tree/main/mnist. Error Log:

"torch.cuda.OutOfMemoryError: HIP out of memory. Tried to allocate
84.00 MiB. GPU 0 has a total capacity of 512.00 MiB of which 0 bytes
is free. Of the allocated memory 103.83 MiB is allocated by PyTorch,
and 22.17 MiB is reserved by PyTorch but unallocated"

Though we can change BIOS settings to enlarge carveout size,
which is inflexible and may bring complaint. On the other hand,
the memory resource can't be effectively used between host and device.

The solution is MI300A approach, i.e., let VRAM allocations go to GTT.
Then device and host can flexibly and effectively share memory resource.

v2: Report local_mem_size_private as 0. (Felix)

Signed-off-by: Lang Yu 
Reviewed-by: Felix Kuehling 
Signed-off-by: Alex Deucher 
Signed-off-by: Sasha Levin 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c|  5 +
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 20 ++-
 drivers/gpu/drm/amd/amdkfd/kfd_migrate.c  |  2 +-
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c  |  6 --
 drivers/gpu/drm/amd/amdkfd/kfd_svm.h  |  3 ++-
 5 files changed, 23 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 35dd6effa9a34..7291c3fd8cf70 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -455,6 +455,9 @@ void amdgpu_amdkfd_get_local_mem_info(struct amdgpu_device 
*adev,
else
mem_info->local_mem_size_private =
KFD_XCP_MEMORY_SIZE(adev, xcp->id);
+   } else if (adev->flags & AMD_IS_APU) {
+   mem_info->local_mem_size_public = (ttm_tt_pages_limit() << 
PAGE_SHIFT);
+   mem_info->local_mem_size_private = 0;
} else {
mem_info->local_mem_size_public = adev->gmc.visible_vram_size;
mem_info->local_mem_size_private = adev->gmc.real_vram_size -
@@ -809,6 +812,8 @@ u64 amdgpu_amdkfd_xcp_memory_size(struct amdgpu_device 
*adev, int xcp_id)
}
do_div(tmp, adev->xcp_mgr->num_xcp_per_mem_partition);
return ALIGN_DOWN(tmp, PAGE_SIZE);
+   } else if (adev->flags & AMD_IS_APU) {
+   return (ttm_tt_pages_limit() << PAGE_SHIFT);
} else {
return adev->gmc.real_vram_size;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 0535b07987d9d..8975cf41a91ac 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -196,7 +196,7 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device 
*adev,
return -EINVAL;
 
vram_size = KFD_XCP_MEMORY_SIZE(adev, xcp_id);
-   if (adev->gmc.is_app_apu) {
+   if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) {
system_mem_needed = size;
ttm_mem_needed = size;
}
@@ -232,7 +232,8 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device 
*adev,
  "adev reference can't be null when vram is used");
if (adev && xcp_id >= 0) {
adev->kfd.vram_used[xcp_id] += vram_needed;
-   adev->kfd.vram_used_aligned[xcp_id] += adev->gmc.is_app_apu ?
+   adev->kfd.vram_used_aligned[xcp_id] +=
+   (adev->gmc.is_app_apu || adev->flags & 
AMD_IS_APU) ?
vram_needed :
ALIGN(vram_needed, VRAM_AVAILABLITY_ALIGN);
}
@@ -260,7 +261,7 @@ void amdgpu_amdkfd_unreserve_mem_limit(struct amdgpu_device 
*adev,
 
if (adev) {
adev->kfd.vram_used[xcp_id] -= size;
-   if (adev->gmc.is_app_apu) {
+   if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) {
adev->kfd.vram_used_aligned[xcp_id] -= size;
kfd_mem_limit.system_mem_used -= size;
kfd_mem_limit.ttm_mem_used -= size;
@@ -889,7 +890,7 @@ static int kfd_mem_attach(struct amdgpu_device *adev, 
struct kgd_mem *mem,
 * if peer device has large BAR. In contrast, access over xGMI is
 * allowed for both small and large BAR configurations of peer device
 */
-   if ((adev != bo_adev && !adev->gmc.is_app_apu) &&
+   if ((adev != bo_adev && !(adev->gmc.is_app_apu || adev->flags & 
AMD_IS_APU)) &&

[PATCH AUTOSEL 6.9 3/6] drm/amdkfd: handle duplicate BOs in reserve_bo_and_cond_vms

2024-06-05 Thread Sasha Levin

From: Lang Yu 

[ Upstream commit 2a705f3e49d20b59cd9e5cc3061b2d92ebe1e5f0 ]

Observed on gfx8 ASIC where KFD_IOC_ALLOC_MEM_FLAGS_AQL_QUEUE_MEM is used.
Two attachments use the same VM, root PD would be locked twice.

[   57.910418] Call Trace:
[   57.793726]  ? reserve_bo_and_cond_vms+0x111/0x1c0 [amdgpu]
[   57.793820]  amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu+0x6c/0x1c0 [amdgpu]
[   57.793923]  ? idr_get_next_ul+0xbe/0x100
[   57.793933]  kfd_process_device_free_bos+0x7e/0xf0 [amdgpu]
[   57.794041]  kfd_process_wq_release+0x2ae/0x3c0 [amdgpu]
[   57.794141]  ? process_scheduled_works+0x29c/0x580
[   57.794147]  process_scheduled_works+0x303/0x580
[   57.794157]  ? __pfx_worker_thread+0x10/0x10
[   57.794160]  worker_thread+0x1a2/0x370
[   57.794165]  ? __pfx_worker_thread+0x10/0x10
[   57.794167]  kthread+0x11b/0x150
[   57.794172]  ? __pfx_kthread+0x10/0x10
[   57.794177]  ret_from_fork+0x3d/0x60
[   57.794181]  ? __pfx_kthread+0x10/0x10
[   57.794184]  ret_from_fork_asm+0x1b/0x30

Signed-off-by: Lang Yu 
Reviewed-by: Felix Kuehling 
Signed-off-by: Alex Deucher 
Signed-off-by: Sasha Levin 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index e4d4e55c08ad5..0535b07987d9d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -1188,7 +1188,8 @@ static int reserve_bo_and_cond_vms(struct kgd_mem *mem,
int ret;
 
ctx->sync = >sync;
-   drm_exec_init(>exec, DRM_EXEC_INTERRUPTIBLE_WAIT, 0);
+   drm_exec_init(>exec, DRM_EXEC_INTERRUPTIBLE_WAIT |
+ DRM_EXEC_IGNORE_DUPLICATES, 0);
drm_exec_until_all_locked(>exec) {
ctx->n_vms = 0;
list_for_each_entry(entry, >attachments, list) {
-- 
2.43.0

Re: [PATCH 00/18] Enhance amdgpu_firmware_request() to improve function flexibility

2024-06-05 Thread Christian König


You haven't addressed any of my comments on patch #1.

Regards,
Christian.

Am 05.06.24 um 11:33 schrieb Wang, Yang(Kevin):

[AMD Official Use Only - AMD Internal Distribution Only]

Ping...

Best Regards,
Kevin

-Original Message-
From: amd-gfx  On Behalf Of Yang Wang
Sent: Monday, June 3, 2024 9:42 AM
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Deucher, Alexander 

Subject: [PATCH 00/18] Enhance amdgpu_firmware_request() to improve function 
flexibility

Adding variable parameter support to function amdgpu_firmware_request() to 
improve function flexibility.

Yang Wang (18):
   drm/amdgpu: enhance amdgpu_ucode_request() function flexibility
   drm/amdgpu: refine gpu_info firmware loading
   drm/amdgpu: refine isp firmware loading
   drm/amdgpu: refine mes firmware loading
   drm/amdgpu: refine psp firmware loading
   drm/amdgpu: refine sdma firmware loading
   drm/amdgpu: refine imu firmware loading
   drm/amdgpu: refine pmfw/smu firmware loading
   drm/amdgpu: refine gmc firmware loading
   drm/amdgpu: refine vcn firmware loading
   drm/amdgpu: refine vpe firmware loading
   drm/amdgpu: refine gfx6 firmware loading
   drm/amdgpu: refine gfx7 firmware loading
   drm/amdgpu: refine gfx8 firmware loading
   drm/amdgpu: refine gfx9 firmware loading
   drm/amdgpu: refine gfx10 firmware loading
   drm/amdgpu: refine gfx11 firmware loading
   drm/amdgpu: refine gfx12 firmware loading

  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c|  9 ++-
  drivers/gpu/drm/amd/amdgpu/amdgpu_isp.c   |  4 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c   |  6 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c   | 26 ++-
  drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c  |  8 +--
  drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c | 30 +---
  drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.h |  3 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c   | 14 ++--
  drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c   |  6 +-
  drivers/gpu/drm/amd/amdgpu/cik_sdma.c | 11 +--
  drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c| 25 ---
  drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c| 26 ---
  drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c| 22 +++---
  drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c | 19 +++--
  drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c | 27 
  drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 69 +--
  drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 45 ++--
  drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c   | 11 ++-
  drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c | 14 ++--
  drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c |  7 +-
  drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c |  6 +-
  drivers/gpu/drm/amd/amdgpu/imu_v11_0.c| 10 ++-
  drivers/gpu/drm/amd/amdgpu/imu_v12_0.c| 10 ++-
  drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c| 11 +--
  drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c| 11 +--
  drivers/gpu/drm/amd/pm/legacy-dpm/si_dpm.c|  8 +--
  .../gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c|  6 +-
  .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c|  6 +-
  .../drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c  |  6 +-
  .../gpu/drm/amd/pm/swsmu/smu14/smu_v14_0.c|  6 +-
  30 files changed, 205 insertions(+), 257 deletions(-)

--
2.34.1

Page fault storms and IH ring overflow

2024-06-05 Thread Christian König


Hi guys,

just FYI: Alex published yesterday a bunch of new firmware files: 
https://gitlab.freedesktop.org/drm/firmware/-/commits/amd-staging


One major issue which should be fixed by those is that page faults can 
no longer overflow the IH ring buffer on APUs and older dGPUs.


Newer dGPU with IH 6.x  (Navi 3x) should redirect them to the secondary 
IH ring where they can overflow as much as they want and don't cause any 
more trouble.


Kudos to Sunil for looking into all of this and pushing our firmware 
team to implement the necessary fixes.


Regards,
Christian.

Re: [RFC PATCH v4 00/42] Color Pipeline API w/ VKMS

2024-06-05 Thread Melissa Wen

On 02/26, Harry Wentland wrote:
> This is an RFC set for a color pipeline API, along with a sample
> implementation in VKMS. All the key API bits are here. VKMS now
> supports two named transfer function colorops and two matrix
> colorops. We have IGT tests that check all four of these colorops
> with a pixel-by-pixel comparison that checks that these colorops
> do what we expect them to do with a +/- 1 8 bpc code point margin.
> 
> The big new change with v4 is the addition of an amdgpu color
> pipeline, for all AMD GPUs with DCN 3 and newer. Amdgpu now support
> the following:
> 
> 1. 1D Curve EOTF
> 2. 3x4 CTM
> 3. Multiplier
> 4. 1D Curve Inverse EOTF
> 5. 1D LUT
> 6. 1D Curve EOTF
> 7. 1D LUT
> 
> The supported curves for the 1D Curve type are:
> - sRGB EOTF and its inverse
> - PQ EOTF, scaled to [0.0, 125.0] and its inverse
> - BT.2020/BT.709 OETF and its inverse

So, as we talked in the 2024 Linux Display Next Hackfest, I hacked
`drm_info` to show the KMS color pipeline properties. You can find the
experimental-and-ugly code here:
- https://gitlab.freedesktop.org/mwen/drm_info/-/merge_requests/1
It depends on updating libdrm [1] and you will only see something if you
use a custom kernel with this series applied.

After checking the output, I missed a kind of Default or "Identity"
curve for the `CURVE_1D_TYPE` enum. I understand that if the color
operation is set bypass, we can ignore all property values, but I didn't
find a similar approach on plane properties, so it looks weird to me:

└───"CURVE_1D_TYPE" (atomic): enum {sRGB Inverse EOTF, BT.2020 OETF, PQ 125 
Inverse EOTF} = invalid (0)

Another thing that caught my attention was the size property for 1D
Curve Custom LUT, that I expected a similar setup to CRTC 1D LUTs:

└───"GAMMA_LUT_SIZE" (immutable): range [0, UINT32_MAX] = 4096

But I got:

├───"SIZE" (atomic): range [4096, 4096] = 4096

Any thoughts?

Anyway, see below an example of `drm_info` output on AMD DCN301 for a
given Overlay/Primary plane without userspace usage of the properties:

│   └───"COLOR_PIPELINE" (atomic): enum {Bypass, Color Pipeline 265} = 
Bypass
│   ├───Bypass
│   └───Color Pipeline 265
│   ├───Color Operation 265
│   │   ├───"TYPE" (immutable): enum {1D Curve, 1D Curve Custom 
LUT, 3x4 Matrix, Multiplier} = 1D Curve
│   │   ├───"BYPASS" (atomic): range [0, 1] = 1
│   │   └───"CURVE_1D_TYPE" (atomic): enum {sRGB EOTF, BT.2020 
Inverse OETF, PQ 125 EOTF} = sRGB EOTF
│   ├───Color Operation 270
│   │   ├───"TYPE" (immutable): enum {1D Curve, 1D Curve Custom 
LUT, 3x4 Matrix, Multiplier} = 3x4 Matrix
│   │   ├───"BYPASS" (atomic): range [0, 1] = 1
│   │   └───"DATA" (atomic): blob = 0
│   ├───Color Operation 275
│   │   ├───"TYPE" (immutable): enum {1D Curve, 1D Curve Custom 
LUT, 3x4 Matrix, Multiplier} = Multiplier
│   │   ├───"BYPASS" (atomic): range [0, 1] = 1
│   │   └───"MULTIPLIER" (atomic): range [0, UINT64_MAX] = 0
│   ├───Color Operation 280
│   │   ├───"TYPE" (immutable): enum {1D Curve, 1D Curve Custom 
LUT, 3x4 Matrix, Multiplier} = 1D Curve
│   │   ├───"BYPASS" (atomic): range [0, 1] = 1
│   │   └───"CURVE_1D_TYPE" (atomic): enum {sRGB Inverse EOTF, 
BT.2020 OETF, PQ 125 Inverse EOTF} = invalid (0)
│   ├───Color Operation 285
│   │   ├───"TYPE" (immutable): enum {1D Curve, 1D Curve Custom 
LUT, 3x4 Matrix, Multiplier} = 1D Curve Custom LUT
│   │   ├───"BYPASS" (atomic): range [0, 1] = 1
│   │   ├───"SIZE" (atomic): range [4096, 4096] = 4096
│   │   └───"DATA" (atomic): blob = 0
│   ├───Color Operation 291
│   │   ├───"TYPE" (immutable): enum {1D Curve, 1D Curve Custom 
LUT, 3x4 Matrix, Multiplier} = 1D Curve
│   │   ├───"BYPASS" (atomic): range [0, 1] = 1
│   │   └───"CURVE_1D_TYPE" (atomic): enum {sRGB EOTF, BT.2020 
Inverse OETF, PQ 125 EOTF} = sRGB EOTF
│   └───Color Operation 296
│   ├───"TYPE" (immutable): enum {1D Curve, 1D Curve Custom 
LUT, 3x4 Matrix, Multiplier} = 1D Curve Custom LUT
│   ├───"BYPASS" (atomic): range [0, 1] = 1
│   ├───"SIZE" (atomic): range [4096, 4096] = 4096
│   └───"DATA" (atomic): blob = 0

It's a WIP and the output still needs to be reviewed. So feel free to
point out any mistake.

BR,

Melissa

> 
> Note that the 1st and 5th colorops take the EOTF or Inverse
> OETF while the 3rd colorop takes the Inverse EOTF or OETF.
> 
> We are working on two more ops for amdgpu, the HDR multiplier
> and the 3DLUT, which will give us this:
> 
> 1. 1D Curve EOTF
> 2. 3x4 CTM
> 3. HDR Multiplier
> 4. 1D Curve Inverse EOTF

RE: [PATCH] drm/amdgpu: move some aca/mca init functions into ras_init() stage

2024-06-05 Thread Zhou1, Tao

[AMD Official Use Only - AMD Internal Distribution Only]

Reviewed-by: Tao Zhou 

> -Original Message-
> From: Wang, Yang(Kevin) 
> Sent: Wednesday, June 5, 2024 5:32 PM
> To: amd-gfx@lists.freedesktop.org
> Cc: Zhang, Hawking ; Zhou1, Tao
> 
> Subject: [PATCH] drm/amdgpu: move some aca/mca init functions into ras_init()
> stage
>
> adjust the function position to better match aca/mca fini code in ras_fini().
>
> Signed-off-by: Yang Wang 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 28 ++---
>  1 file changed, 16 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 8dbfdb767f94..3258feb753ca 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -3428,6 +3428,13 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
>   goto release_con;
>   }
>
> + if (amdgpu_aca_is_enabled(adev))
> + r = amdgpu_aca_init(adev);
> + else
> + r = amdgpu_mca_init(adev);
> + if (r)
> + goto release_con;
> +
>   dev_info(adev->dev, "RAS INFO: ras initialized successfully, "
>"hardware ability[%x] ras_mask[%x]\n",
>adev->ras_hw_enabled, adev->ras_enabled); @@ -3636,25
> +3643,22 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev)
>
>   amdgpu_ras_event_mgr_init(adev);
>
> - if (amdgpu_aca_is_enabled(adev)) {
> - if (!amdgpu_in_reset(adev)) {
> - r = amdgpu_aca_init(adev);
> + if (amdgpu_in_reset(adev)) {
> + if (!amdgpu_aca_is_enabled(adev)) {
> + r = amdgpu_mca_reset(adev);
>   if (r)
>   return r;
>   }
> + }
>
> - if (!amdgpu_sriov_vf(adev))
> - amdgpu_ras_set_aca_debug_mode(adev, false);
> - } else {
> - if (amdgpu_in_reset(adev))
> - r = amdgpu_mca_reset(adev);
> + if (!amdgpu_sriov_vf(adev)) {
> + if (amdgpu_aca_is_enabled(adev))
> + r = amdgpu_ras_set_aca_debug_mode(adev, false);
>   else
> - r = amdgpu_mca_init(adev);
> + r = amdgpu_ras_set_mca_debug_mode(adev, false);
> +
>   if (r)
>   return r;
> -
> - if (!amdgpu_sriov_vf(adev))
> - amdgpu_ras_set_mca_debug_mode(adev, false);
>   }
>
>   /* Guest side doesn't need init ras feature */
> --
> 2.34.1

RE: [PATCH 00/18] Enhance amdgpu_firmware_request() to improve function flexibility

2024-06-05 Thread Wang, Yang(Kevin)

[AMD Official Use Only - AMD Internal Distribution Only]

Ping...

Best Regards,
Kevin

-Original Message-
From: amd-gfx  On Behalf Of Yang Wang
Sent: Monday, June 3, 2024 9:42 AM
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Deucher, Alexander 

Subject: [PATCH 00/18] Enhance amdgpu_firmware_request() to improve function 
flexibility

Adding variable parameter support to function amdgpu_firmware_request() to 
improve function flexibility.

Yang Wang (18):
  drm/amdgpu: enhance amdgpu_ucode_request() function flexibility
  drm/amdgpu: refine gpu_info firmware loading
  drm/amdgpu: refine isp firmware loading
  drm/amdgpu: refine mes firmware loading
  drm/amdgpu: refine psp firmware loading
  drm/amdgpu: refine sdma firmware loading
  drm/amdgpu: refine imu firmware loading
  drm/amdgpu: refine pmfw/smu firmware loading
  drm/amdgpu: refine gmc firmware loading
  drm/amdgpu: refine vcn firmware loading
  drm/amdgpu: refine vpe firmware loading
  drm/amdgpu: refine gfx6 firmware loading
  drm/amdgpu: refine gfx7 firmware loading
  drm/amdgpu: refine gfx8 firmware loading
  drm/amdgpu: refine gfx9 firmware loading
  drm/amdgpu: refine gfx10 firmware loading
  drm/amdgpu: refine gfx11 firmware loading
  drm/amdgpu: refine gfx12 firmware loading

 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c|  9 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_isp.c   |  4 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c   |  6 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c   | 26 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c  |  8 +--
 drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c | 30 +---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.h |  3 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c   | 14 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c   |  6 +-
 drivers/gpu/drm/amd/amdgpu/cik_sdma.c | 11 +--
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c| 25 ---
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c| 26 ---
 drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c| 22 +++---
 drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c | 19 +++--
 drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c | 27 
 drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 69 +--
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 45 ++--
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c   | 11 ++-
 drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c | 14 ++--
 drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c |  7 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c |  6 +-
 drivers/gpu/drm/amd/amdgpu/imu_v11_0.c| 10 ++-
 drivers/gpu/drm/amd/amdgpu/imu_v12_0.c| 10 ++-
 drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c| 11 +--
 drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c| 11 +--
 drivers/gpu/drm/amd/pm/legacy-dpm/si_dpm.c|  8 +--
 .../gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c|  6 +-
 .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c|  6 +-
 .../drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c  |  6 +-
 .../gpu/drm/amd/pm/swsmu/smu14/smu_v14_0.c|  6 +-
 30 files changed, 205 insertions(+), 257 deletions(-)

--
2.34.1

[PATCH] drm/amdgpu: move some aca/mca init functions into ras_init() stage

2024-06-05 Thread Yang Wang

adjust the function position to better match aca/mca fini code in ras_fini().

Signed-off-by: Yang Wang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 28 ++---
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 8dbfdb767f94..3258feb753ca 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -3428,6 +3428,13 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
goto release_con;
}
 
+   if (amdgpu_aca_is_enabled(adev))
+   r = amdgpu_aca_init(adev);
+   else
+   r = amdgpu_mca_init(adev);
+   if (r)
+   goto release_con;
+
dev_info(adev->dev, "RAS INFO: ras initialized successfully, "
 "hardware ability[%x] ras_mask[%x]\n",
 adev->ras_hw_enabled, adev->ras_enabled);
@@ -3636,25 +3643,22 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev)
 
amdgpu_ras_event_mgr_init(adev);
 
-   if (amdgpu_aca_is_enabled(adev)) {
-   if (!amdgpu_in_reset(adev)) {
-   r = amdgpu_aca_init(adev);
+   if (amdgpu_in_reset(adev)) {
+   if (!amdgpu_aca_is_enabled(adev)) {
+   r = amdgpu_mca_reset(adev);
if (r)
return r;
}
+   }
 
-   if (!amdgpu_sriov_vf(adev))
-   amdgpu_ras_set_aca_debug_mode(adev, false);
-   } else {
-   if (amdgpu_in_reset(adev))
-   r = amdgpu_mca_reset(adev);
+   if (!amdgpu_sriov_vf(adev)) {
+   if (amdgpu_aca_is_enabled(adev))
+   r = amdgpu_ras_set_aca_debug_mode(adev, false);
else
-   r = amdgpu_mca_init(adev);
+   r = amdgpu_ras_set_mca_debug_mode(adev, false);
+
if (r)
return r;
-
-   if (!amdgpu_sriov_vf(adev))
-   amdgpu_ras_set_mca_debug_mode(adev, false);
}
 
/* Guest side doesn't need init ras feature */
-- 
2.34.1

Re: [PATCH 1/2][RFC] amdgpu: fix a race in kfd_mem_export_dmabuf()

2024-06-05 Thread Christian König


Am 04.06.24 um 20:08 schrieb Felix Kuehling:


On 2024-06-03 22:13, Al Viro wrote:

Using drm_gem_prime_handle_to_fd() to set dmabuf up and insert it into
descriptor table, only to have it looked up by file descriptor and
remove it from descriptor table is not just too convoluted - it's
racy; another thread might have modified the descriptor table while
we'd been going through that song and dance.

It's not hard to fix - turn drm_gem_prime_handle_to_fd()
into a wrapper for a new helper that would simply return the
dmabuf, without messing with descriptor table.

Then kfd_mem_export_dmabuf() would simply use that new helper
and leave the descriptor table alone.

Signed-off-by: Al Viro 


This patch looks good to me on the amdgpu side. For the DRM side I'm 
adding dri-devel.


Yeah that patch should probably be split up and the DRM changes 
discussed separately.


On the other hand skimming over it it seems reasonable to me.

Felix are you going to look into this or should I take a look and try to 
push it through drm-misc-next?


Thanks,
Christian.



Acked-by: Felix Kuehling 



---
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c

index 8975cf41a91a..793780bb819c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -25,7 +25,6 @@
  #include 
  #include 
  #include 
-#include 
  #include 
    #include 
@@ -812,18 +811,13 @@ static int kfd_mem_export_dmabuf(struct kgd_mem 
*mem)

  if (!mem->dmabuf) {
  struct amdgpu_device *bo_adev;
  struct dma_buf *dmabuf;
-    int r, fd;
    bo_adev = amdgpu_ttm_adev(mem->bo->tbo.bdev);
-    r = drm_gem_prime_handle_to_fd(_adev->ddev, 
bo_adev->kfd.client.file,
+    dmabuf = drm_gem_prime_handle_to_dmabuf(_adev->ddev, 
bo_adev->kfd.client.file,

 mem->gem_handle,
  mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE ?
-   DRM_RDWR : 0, );
-    if (r)
-    return r;
-    dmabuf = dma_buf_get(fd);
-    close_fd(fd);
-    if (WARN_ON_ONCE(IS_ERR(dmabuf)))
+   DRM_RDWR : 0);
+    if (IS_ERR(dmabuf))
  return PTR_ERR(dmabuf);
  mem->dmabuf = dmabuf;
  }
diff --git a/drivers/gpu/drm/drm_prime.c b/drivers/gpu/drm/drm_prime.c
index 03bd3c7bd0dc..622c51d3fe18 100644
--- a/drivers/gpu/drm/drm_prime.c
+++ b/drivers/gpu/drm/drm_prime.c
@@ -409,23 +409,9 @@ static struct dma_buf 
*export_and_register_object(struct drm_device *dev,

  return dmabuf;
  }
  -/**
- * drm_gem_prime_handle_to_fd - PRIME export function for GEM drivers
- * @dev: dev to export the buffer from
- * @file_priv: drm file-private structure
- * @handle: buffer handle to export
- * @flags: flags like DRM_CLOEXEC
- * @prime_fd: pointer to storage for the fd id of the create dma-buf
- *
- * This is the PRIME export function which must be used mandatorily 
by GEM
- * drivers to ensure correct lifetime management of the underlying 
GEM object.
- * The actual exporting from GEM object to a dma-buf is done through 
the

- * _gem_object_funcs.export callback.
- */
-int drm_gem_prime_handle_to_fd(struct drm_device *dev,
+struct dma_buf *drm_gem_prime_handle_to_dmabuf(struct drm_device *dev,
 struct drm_file *file_priv, uint32_t handle,
-   uint32_t flags,
-   int *prime_fd)
+   uint32_t flags)
  {
  struct drm_gem_object *obj;
  int ret = 0;
@@ -434,14 +420,14 @@ int drm_gem_prime_handle_to_fd(struct 
drm_device *dev,

  mutex_lock(_priv->prime.lock);
  obj = drm_gem_object_lookup(file_priv, handle);
  if (!obj)  {
-    ret = -ENOENT;
+    dmabuf = ERR_PTR(-ENOENT);
  goto out_unlock;
  }
    dmabuf = drm_prime_lookup_buf_by_handle(_priv->prime, 
handle);

  if (dmabuf) {
  get_dma_buf(dmabuf);
-    goto out_have_handle;
+    goto out;
  }
    mutex_lock(>object_name_lock);
@@ -463,7 +449,6 @@ int drm_gem_prime_handle_to_fd(struct drm_device 
*dev,

  /* normally the created dma-buf takes ownership of the ref,
   * but if that fails then drop the ref
   */
-    ret = PTR_ERR(dmabuf);
  mutex_unlock(>object_name_lock);
  goto out;
  }
@@ -478,34 +463,49 @@ int drm_gem_prime_handle_to_fd(struct 
drm_device *dev,

  ret = drm_prime_add_buf_handle(_priv->prime,
 dmabuf, handle);
  mutex_unlock(>object_name_lock);
-    if (ret)
-    goto fail_put_dmabuf;
-
-out_have_handle:
-    ret = dma_buf_fd(dmabuf, flags);
-    /*
- * We must _not_ remove the buffer from the handle cache since 
the newly
- * created dma buf is already linked in the global obj->dma_buf 
pointer,

- * and that is invariant as long as a userspace gem handle exists.
- * Closing the handle will clean out the cache

Re: [PATCH] drm/amdgpu: add reset source in various cases

2024-06-05 Thread Christian König


Am 04.06.24 um 17:58 schrieb Eric Huang:

To fullfill the reset event description.

Suggested-by: Lijo Lazar 
Signed-off-by: Eric Huang 


Reviewed-by: Christian König 


---
  drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 1 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c   | 1 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c   | 1 +
  3 files changed, 3 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index 10832b470448..dff7033f2026 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -980,6 +980,7 @@ static void amdgpu_debugfs_reset_work(struct work_struct 
*work)
  
  	reset_context.method = AMD_RESET_METHOD_NONE;

reset_context.reset_req_dev = adev;
+   reset_context.src = AMDGPU_RESET_SRC_USER;
set_bit(AMDGPU_NEED_FULL_RESET, _context.flags);
  
  	amdgpu_device_gpu_recover(adev, NULL, _context);

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index e4742b65032d..cf0c4470ab9c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -77,6 +77,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct 
drm_sched_job *s_job)
  
  		reset_context.method = AMD_RESET_METHOD_NONE;

reset_context.reset_req_dev = adev;
+   reset_context.src = AMDGPU_RESET_SRC_JOB;
clear_bit(AMDGPU_NEED_FULL_RESET, _context.flags);
  
  		r = amdgpu_device_gpu_recover(ring->adev, job, _context);

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 8dbfdb767f94..33f592ec8bde 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2487,6 +2487,7 @@ static void amdgpu_ras_do_recovery(struct work_struct 
*work)
  
  		reset_context.method = AMD_RESET_METHOD_NONE;

reset_context.reset_req_dev = adev;
+   reset_context.src = AMDGPU_RESET_SRC_RAS;
  
  		/* Perform full reset in fatal error mode */

if (!amdgpu_ras_is_poison_mode_supported(ras->adev))

Re: [PATCH] drm/amd/display: use pre-allocated temp structure for bounding box

2024-06-05 Thread Christian König


Am 04.06.24 um 17:50 schrieb Alex Deucher:

This mirrors what the driver does for older DCN generations.

Should fix:

BUG: sleeping function called from invalid context at 
include/linux/sched/mm.h:306
in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 449, name: kworker/u64:8
preempt_count: 2, expected: 0
RCU nest depth: 0, expected: 0
Preemption disabled at:
c0ce1580>] dc_fpu_begin+0x30/0xd0 [amdgpu]
CPU: 5 PID: 449 Comm: kworker/u64:8 Tainted: GW  6.8.0+ #35
Hardware name: System manufacturer System Product Name/ROG STRIX X570-E GAMING 
WIFI II, BIOS 4204 02/24/2022
Workqueue: events_unbound async_run_entry_fn

Fixes: 88c61827cedc ("drm/amd/display: dynamically allocate 
dml2_configuration_options structures")
Suggested-by: Hamza Mahfooz 
Signed-off-by: Alex Deucher 
Cc: George Zhang 
Cc: Arnd Bergmann 
Cc: harry.wentl...@amd.com
Cc: sunpeng...@amd.com
Cc: rodrigo.sique...@amd.com


Acked-by: Christian König 


---
  drivers/gpu/drm/amd/display/dc/dc.h   | 1 +
  .../drm/amd/display/dc/resource/dcn32/dcn32_resource.c| 8 +++-
  .../drm/amd/display/dc/resource/dcn321/dcn321_resource.c  | 8 +++-
  3 files changed, 7 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/dc/dc.h 
b/drivers/gpu/drm/amd/display/dc/dc.h
index d0ed01ac460d..d843933ad731 100644
--- a/drivers/gpu/drm/amd/display/dc/dc.h
+++ b/drivers/gpu/drm/amd/display/dc/dc.h
@@ -1444,6 +1444,7 @@ struct dc {
} scratch;
  
  	struct dml2_configuration_options dml2_options;

+   struct dml2_configuration_options dml2_tmp;
enum dc_acpi_cm_power_state power_state;
  
  };

diff --git a/drivers/gpu/drm/amd/display/dc/resource/dcn32/dcn32_resource.c 
b/drivers/gpu/drm/amd/display/dc/resource/dcn32/dcn32_resource.c
index 0f11d7c8791c..7a8aa9396dea 100644
--- a/drivers/gpu/drm/amd/display/dc/resource/dcn32/dcn32_resource.c
+++ b/drivers/gpu/drm/amd/display/dc/resource/dcn32/dcn32_resource.c
@@ -2007,11 +2007,9 @@ void dcn32_calculate_wm_and_dlg(struct dc *dc, struct 
dc_state *context,
  
  static void dcn32_update_bw_bounding_box(struct dc *dc, struct clk_bw_params *bw_params)

  {
-   struct dml2_configuration_options *dml2_opt;
+   struct dml2_configuration_options *dml2_opt = >dml2_tmp;
  
-	dml2_opt = kmemdup(>dml2_options, sizeof(dc->dml2_options), GFP_KERNEL);

-   if (!dml2_opt)
-   return;
+   memcpy(dml2_opt, >dml2_options, sizeof(dc->dml2_options));
  
  	DC_FP_START();
  
@@ -2027,7 +2025,7 @@ static void dcn32_update_bw_bounding_box(struct dc *dc, struct clk_bw_params *bw
  
  	DC_FP_END();
  
-	kfree(dml2_opt);

+   memcpy(>dml2_options, dml2_opt, sizeof(dc->dml2_options));
  }
  
  static struct resource_funcs dcn32_res_pool_funcs = {

diff --git a/drivers/gpu/drm/amd/display/dc/resource/dcn321/dcn321_resource.c 
b/drivers/gpu/drm/amd/display/dc/resource/dcn321/dcn321_resource.c
index 07ca6f58447d..ef30e8632607 100644
--- a/drivers/gpu/drm/amd/display/dc/resource/dcn321/dcn321_resource.c
+++ b/drivers/gpu/drm/amd/display/dc/resource/dcn321/dcn321_resource.c
@@ -1581,11 +1581,9 @@ static struct dc_cap_funcs cap_funcs = {
  
  static void dcn321_update_bw_bounding_box(struct dc *dc, struct clk_bw_params *bw_params)

  {
-   struct dml2_configuration_options *dml2_opt;
+   struct dml2_configuration_options *dml2_opt = >dml2_tmp;
  
-	dml2_opt = kmemdup(>dml2_options, sizeof(dc->dml2_options), GFP_KERNEL);

-   if (!dml2_opt)
-   return;
+   memcpy(dml2_opt, >dml2_options, sizeof(dc->dml2_options));
  
  	DC_FP_START();
  
@@ -1601,7 +1599,7 @@ static void dcn321_update_bw_bounding_box(struct dc *dc, struct clk_bw_params *b
  
  	DC_FP_END();
  
-	kfree(dml2_opt);

+   memcpy(>dml2_options, dml2_opt, sizeof(dc->dml2_options));
  }
  
  static struct resource_funcs dcn321_res_pool_funcs = {

Re: [PATCH] drm/amd/display: use GFP_ATOMIC for bounding box

2024-06-05 Thread Christian König


Am 04.06.24 um 16:57 schrieb Arnd Bergmann:

On Tue, Jun 4, 2024, at 16:22, Christian König wrote:

Am 04.06.24 um 15:50 schrieb Alex Deucher:

This can be called in atomic context.  Should fix:

BUG: sleeping function called from invalid context at 
include/linux/sched/mm.h:306
in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 449, name: kworker/u64:8
preempt_count: 2, expected: 0
RCU nest depth: 0, expected: 0
Preemption disabled at:
c0ce1580>] dc_fpu_begin+0x30/0xd0 [amdgpu]
CPU: 5 PID: 449 Comm: kworker/u64:8 Tainted: GW  6.8.0+ #35
Hardware name: System manufacturer System Product Name/ROG STRIX X570-E GAMING 
WIFI II, BIOS 4204 02/24/2022
Workqueue: events_unbound async_run_entry_fn

That most likely only papers over the real problem and is not a valid fix.

The question is why is that an atomic context?  If the function is used
under a spinlock then this might indeed be the right fix.

If it's because of floating point operation then that here won't work
either.

It looks like there is only one caller, and this turns on
floating point instructions just for the call:

 if (dc->res_pool->funcs->update_bw_bounding_box) {
 DC_FP_START();
 dc->res_pool->funcs->update_bw_bounding_box(dc, 
dc->clk_mgr->bw_params);
 DC_FP_END();
 }

but then they are enabled again inside of the function.

If we can drop the outer DC_FP_START(), that means
the GFP_KERNEL allocation works. On the other hand if
we actually have to enabled it before calling into
the function (e.g. because there is an architecture that
has incompatible function calling conventions when FP
is enabled), the inner one is redundant, but we can
potentially move the kmemdup() into the caller and
pass the copy by refernence.


Yeah exactly that's the case.

The DC_FP_START() and DC_FP_END() calls need to be outside of the 
function  because the compiler has no idea that it can't move any 
flouting point instructions outside of the critical section between the 
two functions.


So yes the calls to DC_FP_START() and DC_FP_END() from within floating 
point enabled code should be forbidden somehow.


And when that is done the caller should allocate any parameters needed 
and pass them by reference to avoid the GFP_ATOMIC.


Regards,
Christian.



   Arnd

RE: [PATCH 1/12 V2] drm/amd/pm: remove dead code in si_convert_power_level_to_smc

2024-06-05 Thread Huang, Tim

[Public]

This patch is,

Reviewed-by: Tim Huang 



> -Original Message-
> From: Jesse Zhang 
> Sent: Wednesday, June 5, 2024 4:34 PM
> To: amd-gfx@lists.freedesktop.org
> Cc: Deucher, Alexander ; Koenig, Christian
> ; Huang, Tim ; Zhang,
> Jesse(Jie) ; Zhang, Jesse(Jie)
> 
> Subject: [PATCH 1/12 V2] drm/amd/pm: remove dead code in
> si_convert_power_level_to_smc
>
> Since gmc_pg is false, setting mcFlags with SISLANDS_SMC_MC_PG_EN
> cannot be reach.
>
> Signed-off-by: Jesse Zhang 
> Suggested-by: Tim Huang 
> ---
>  drivers/gpu/drm/amd/pm/legacy-dpm/si_dpm.c | 4 
>  1 file changed, 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/pm/legacy-dpm/si_dpm.c
> b/drivers/gpu/drm/amd/pm/legacy-dpm/si_dpm.c
> index 68ac01a8bc3a..f324a8ef8032 100644
> --- a/drivers/gpu/drm/amd/pm/legacy-dpm/si_dpm.c
> +++ b/drivers/gpu/drm/amd/pm/legacy-dpm/si_dpm.c
> @@ -5467,7 +5467,6 @@ static int si_convert_power_level_to_smc(struct
> amdgpu_device *adev,
>   int ret;
>   bool dll_state_on;
>   u16 std_vddc;
> - bool gmc_pg = false;
>
>   if (eg_pi->pcie_performance_request &&
>   (si_pi->force_pcie_gen != SI_PCIE_GEN_INVALID)) @@ -5487,9
> +5486,6 @@ static int si_convert_power_level_to_smc(struct amdgpu_device
> *adev,
>   (RREG32(DPG_PIPE_STUTTER_CONTROL) & STUTTER_ENABLE) &&
>   (adev->pm.dpm.new_active_crtc_count <= 2)) {
>   level->mcFlags |= SISLANDS_SMC_MC_STUTTER_EN;
> -
> - if (gmc_pg)
> - level->mcFlags |= SISLANDS_SMC_MC_PG_EN;
>   }
>
>   if (adev->gmc.vram_type == AMDGPU_VRAM_TYPE_GDDR5) {
> --
> 2.25.1

RE: [PATCH 4/12 V2] drm/amdgpu: remove dead code in atom_get_src_int

2024-06-05 Thread Huang, Tim

[Public]

This patch is,

Reviewed-by: Tim Huang 



> -Original Message-
> From: Jesse Zhang 
> Sent: Wednesday, June 5, 2024 4:34 PM
> To: amd-gfx@lists.freedesktop.org
> Cc: Deucher, Alexander ; Koenig, Christian
> ; Huang, Tim ; Zhang,
> Jesse(Jie) ; Zhang, Jesse(Jie)
> 
> Subject: [PATCH 4/12 V2] drm/amdgpu: remove dead code in
> atom_get_src_int
>
> Since the range of align is 0~7, the expression is: align = (attr >> 3) & 7.
> In the case of ATOM_ARG_IMM, the code cannot reach the default case.
> So there is no need for "break".
>
> Signed-off-by: Jesse Zhang 
> Suggested-by: Tim Huang 
> ---
>  drivers/gpu/drm/amd/amdgpu/atom.c | 8 
>  1 file changed, 4 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/atom.c
> b/drivers/gpu/drm/amd/amdgpu/atom.c
> index d552e013354c..09715b506468 100644
> --- a/drivers/gpu/drm/amd/amdgpu/atom.c
> +++ b/drivers/gpu/drm/amd/amdgpu/atom.c
> @@ -301,7 +301,7 @@ static uint32_t atom_get_src_int(atom_exec_context
> *ctx, uint8_t attr,
>   (*ptr) += 4;
>   if (print)
>   DEBUG("IMM 0x%08X\n", val);
> - return val;
> + break;
>   case ATOM_SRC_WORD0:
>   case ATOM_SRC_WORD8:
>   case ATOM_SRC_WORD16:
> @@ -309,7 +309,7 @@ static uint32_t atom_get_src_int(atom_exec_context
> *ctx, uint8_t attr,
>   (*ptr) += 2;
>   if (print)
>   DEBUG("IMM 0x%04X\n", val);
> - return val;
> + break;
>   case ATOM_SRC_BYTE0:
>   case ATOM_SRC_BYTE8:
>   case ATOM_SRC_BYTE16:
> @@ -318,9 +318,9 @@ static uint32_t atom_get_src_int(atom_exec_context
> *ctx, uint8_t attr,
>   (*ptr)++;
>   if (print)
>   DEBUG("IMM 0x%02X\n", val);
> - return val;
> + break;
>   }
> - break;
> + return val;
>   case ATOM_ARG_PLL:
>   idx = U8(*ptr);
>   (*ptr)++;
> --
> 2.25.1

[PATCH 4/12 V2] drm/amdgpu: remove dead code in atom_get_src_int

2024-06-05 Thread Jesse Zhang

Since the range of align is 0~7, the expression is: align = (attr >> 3) & 7.
In the case of ATOM_ARG_IMM, the code cannot reach the default case.
So there is no need for "break".

Signed-off-by: Jesse Zhang 
Suggested-by: Tim Huang 
---
 drivers/gpu/drm/amd/amdgpu/atom.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/atom.c 
b/drivers/gpu/drm/amd/amdgpu/atom.c
index d552e013354c..09715b506468 100644
--- a/drivers/gpu/drm/amd/amdgpu/atom.c
+++ b/drivers/gpu/drm/amd/amdgpu/atom.c
@@ -301,7 +301,7 @@ static uint32_t atom_get_src_int(atom_exec_context *ctx, 
uint8_t attr,
(*ptr) += 4;
if (print)
DEBUG("IMM 0x%08X\n", val);
-   return val;
+   break;
case ATOM_SRC_WORD0:
case ATOM_SRC_WORD8:
case ATOM_SRC_WORD16:
@@ -309,7 +309,7 @@ static uint32_t atom_get_src_int(atom_exec_context *ctx, 
uint8_t attr,
(*ptr) += 2;
if (print)
DEBUG("IMM 0x%04X\n", val);
-   return val;
+   break;
case ATOM_SRC_BYTE0:
case ATOM_SRC_BYTE8:
case ATOM_SRC_BYTE16:
@@ -318,9 +318,9 @@ static uint32_t atom_get_src_int(atom_exec_context *ctx, 
uint8_t attr,
(*ptr)++;
if (print)
DEBUG("IMM 0x%02X\n", val);
-   return val;
+   break;
}
-   break;
+   return val;
case ATOM_ARG_PLL:
idx = U8(*ptr);
(*ptr)++;
-- 
2.25.1

[PATCH 1/12 V2] drm/amd/pm: remove dead code in si_convert_power_level_to_smc

2024-06-05 Thread Jesse Zhang

Since gmc_pg is false, setting mcFlags with SISLANDS_SMC_MC_PG_EN  cannot be 
reach.

Signed-off-by: Jesse Zhang 
Suggested-by: Tim Huang 
---
 drivers/gpu/drm/amd/pm/legacy-dpm/si_dpm.c | 4 
 1 file changed, 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/legacy-dpm/si_dpm.c 
b/drivers/gpu/drm/amd/pm/legacy-dpm/si_dpm.c
index 68ac01a8bc3a..f324a8ef8032 100644
--- a/drivers/gpu/drm/amd/pm/legacy-dpm/si_dpm.c
+++ b/drivers/gpu/drm/amd/pm/legacy-dpm/si_dpm.c
@@ -5467,7 +5467,6 @@ static int si_convert_power_level_to_smc(struct 
amdgpu_device *adev,
int ret;
bool dll_state_on;
u16 std_vddc;
-   bool gmc_pg = false;
 
if (eg_pi->pcie_performance_request &&
(si_pi->force_pcie_gen != SI_PCIE_GEN_INVALID))
@@ -5487,9 +5486,6 @@ static int si_convert_power_level_to_smc(struct 
amdgpu_device *adev,
(RREG32(DPG_PIPE_STUTTER_CONTROL) & STUTTER_ENABLE) &&
(adev->pm.dpm.new_active_crtc_count <= 2)) {
level->mcFlags |= SISLANDS_SMC_MC_STUTTER_EN;
-
-   if (gmc_pg)
-   level->mcFlags |= SISLANDS_SMC_MC_PG_EN;
}
 
if (adev->gmc.vram_type == AMDGPU_VRAM_TYPE_GDDR5) {
-- 
2.25.1

Re: [PATCH] drm/amdgpu: Fix the BO release clear memory warning

2024-06-05 Thread Paneer Selvam, Arunpravin


Hi Christian,

On 5/7/2024 8:21 PM, Christian König wrote:

Am 06.05.24 um 15:48 schrieb Arunpravin Paneer Selvam:

This happens when the amdgpu_bo_release_notify running
before amdgpu_ttm_set_buffer_funcs_status set the buffer
funcs to enabled.

check the buffer funcs enablement before calling the fill
buffer memory.

Log snip:
[    6.036477] [drm:amdgpu_fill_buffer [amdgpu]] *ERROR* Trying to 
clear memory with ring turned off.

[    6.036667] [ cut here ]
[    6.036668] WARNING: CPU: 3 PID: 370 at 
drivers/gpu/drm/amd/amdgpu/amdgpu_object.c:1355 
amdgpu_bo_release_notify+0x201/0x220 [amdgpu]
[    6.036767] Modules linked in: hid_generic amdgpu(+) amdxcp 
drm_exec gpu_sched drm_buddy i2c_algo_bit usbhid drm_suballoc_helper 
drm_display_helper hid sd_mod cec rc_core drm_ttm_helper ahci ttm 
nvme libahci drm_kms_helper nvme_core r8169 xhci_pci libata t10_pi 
xhci_hcd realtek crc32_pclmul crc64_rocksoft mdio_devres crc64 drm 
crc32c_intel scsi_mod usbcore thunderbolt crc_t10dif i2c_piix4 libphy 
crct10dif_generic crct10dif_pclmul crct10dif_common scsi_common 
usb_common video wmi gpio_amdpt gpio_generic button
[    6.036793] CPU: 3 PID: 370 Comm: (udev-worker) Not tainted 
6.8.7-dirty #1
[    6.036795] Hardware name: ASRock X670E Taichi/X670E Taichi, BIOS 
2.10 03/26/2024

[    6.036796] RIP: 0010:amdgpu_bo_release_notify+0x201/0x220 [amdgpu]
[    6.036891] Code: 0b e9 af fe ff ff 48 ba ff ff ff ff ff ff ff 7f 
31 f6 4c 89 e7 e8 7f 2f 7a d8 eb 98 e8 18 28 7a d8 eb b2 0f 0b e9 58 
fe ff ff <0f> 0b eb a7 be 03 00 00 00 e8 e1 89 4e d8 eb 9b e8 aa 4d 
ad d8 66

[    6.036892] RSP: 0018:bbe140d1f638 EFLAGS: 00010282
[    6.036894] RAX: ffea RBX: 90cba9e4e858 RCX: 
90dabde38c28
[    6.036895] RDX:  RSI: dfff RDI: 
0001
[    6.036896] RBP: 90cba980ef40 R08:  R09: 
bbe140d1f3c0
[    6.036896] R10: bbe140d1f3b8 R11: 0003 R12: 
90cba9e4e800
[    6.036897] R13: 90cba9e4e958 R14: 90cba980ef40 R15: 
0258
[    6.036898] FS:  7f2bd1679d00() GS:90da7e2c() 
knlGS:

[    6.036899] CS:  0010 DS:  ES:  CR0: 80050033
[    6.036900] CR2: 55a9b0f7299d CR3: 00011bb6e000 CR4: 
00750ef0

[    6.036901] PKRU: 5554
[    6.036901] Call Trace:
[    6.036903]  
[    6.036904]  ? amdgpu_bo_release_notify+0x201/0x220 [amdgpu]
[    6.036998]  ? __warn+0x81/0x130
[    6.037002]  ? amdgpu_bo_release_notify+0x201/0x220 [amdgpu]
[    6.037095]  ? report_bug+0x171/0x1a0
[    6.037099]  ? handle_bug+0x3c/0x80
[    6.037101]  ? exc_invalid_op+0x17/0x70
[    6.037103]  ? asm_exc_invalid_op+0x1a/0x20
[    6.037107]  ? amdgpu_bo_release_notify+0x201/0x220 [amdgpu]
[    6.037199]  ? amdgpu_bo_release_notify+0x14a/0x220 [amdgpu]
[    6.037292]  ttm_bo_release+0xff/0x2e0 [ttm]
[    6.037297]  ? srso_alias_return_thunk+0x5/0xfbef5
[    6.037299]  ? srso_alias_return_thunk+0x5/0xfbef5
[    6.037301]  ? ttm_resource_move_to_lru_tail+0x140/0x1e0 [ttm]
[    6.037306]  amdgpu_bo_free_kernel+0xcb/0x120 [amdgpu]
[    6.037399]  dm_helpers_free_gpu_mem+0x41/0x80 [amdgpu]
[    6.037544]  dcn315_clk_mgr_construct+0x198/0x7e0 [amdgpu]
[    6.037692]  dc_clk_mgr_create+0x16e/0x5f0 [amdgpu]
[    6.037826]  dc_create+0x28a/0x650 [amdgpu]
[    6.037958]  amdgpu_dm_init.isra.0+0x2d5/0x1ec0 [amdgpu]
[    6.038085]  ? srso_alias_return_thunk+0x5/0xfbef5
[    6.038087]  ? prb_read_valid+0x1b/0x30
[    6.038089]  ? srso_alias_return_thunk+0x5/0xfbef5
[    6.038090]  ? console_unlock+0x78/0x120
[    6.038092]  ? srso_alias_return_thunk+0x5/0xfbef5
[    6.038094]  ? vprintk_emit+0x175/0x2c0
[    6.038095]  ? srso_alias_return_thunk+0x5/0xfbef5
[    6.038097]  ? srso_alias_return_thunk+0x5/0xfbef5
[    6.038098]  ? dev_printk_emit+0xa5/0xd0
[    6.038104]  dm_hw_init+0x12/0x30 [amdgpu]
[    6.038209]  amdgpu_device_init+0x1e50/0x2500 [amdgpu]
[    6.038308]  ? srso_alias_return_thunk+0x5/0xfbef5
[    6.038310]  ? srso_alias_return_thunk+0x5/0xfbef5
[    6.038313]  amdgpu_driver_load_kms+0x19/0x190 [amdgpu]
[    6.038409]  amdgpu_pci_probe+0x18b/0x510 [amdgpu]
[    6.038505]  local_pci_probe+0x42/0xa0
[    6.038508]  pci_device_probe+0xc7/0x240
[    6.038510]  really_probe+0x19b/0x3e0
[    6.038513]  ? __pfx___driver_attach+0x10/0x10
[    6.038514]  __driver_probe_device+0x78/0x160
[    6.038516]  driver_probe_device+0x1f/0x90
[    6.038517]  __driver_attach+0xd2/0x1c0
[    6.038519]  bus_for_each_dev+0x85/0xd0
[    6.038521]  bus_add_driver+0x116/0x220
[    6.038523]  driver_register+0x59/0x100
[    6.038525]  ? __pfx_amdgpu_init+0x10/0x10 [amdgpu]
[    6.038618]  do_one_initcall+0x58/0x320
[    6.038621]  do_init_module+0x60/0x230
[    6.038624]  init_module_from_file+0x89/0xe0
[    6.038628]  idempotent_init_module+0x120/0x2b0
[    6.038630]  __x64_sys_finit_module+0x5e/0xb0
[    6.038632]  do_syscall_64+0x84/0x1a0
[    6.038634]  ?

RE: [PATCH 5/5] drm/amdgpu: add ras fatal flag to distingush fatal error reset

2024-06-05 Thread Zhang, Hawking

[AMD Official Use Only - AMD Internal Distribution Only]

Series is

Reviewed-by: Hawking Zhang 

Regards,
Hawking
-Original Message-
From: amd-gfx  On Behalf Of Tao Zhou
Sent: Friday, May 31, 2024 18:49
To: amd-gfx@lists.freedesktop.org
Cc: Zhou1, Tao 
Subject: [PATCH 5/5] drm/amdgpu: add ras fatal flag to distingush fatal error 
reset

Check it in mode1 reset.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c   | 32 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h   |  2 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h  |  1 +
 .../drm/amd/pm/swsmu/smu13/aldebaran_ppt.c|  2 +-
 .../drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c  |  2 +-  
.../drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c  |  2 +-
 6 files changed, 37 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 2071e30d7e56..97b770ba6424 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2451,6 +2451,26 @@ bool amdgpu_ras_in_recovery(struct amdgpu_device *adev)
return false;
 }

+bool amdgpu_ras_in_fatal(struct amdgpu_device *adev) {
+   struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
+   struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+   int hive_ras_fatal = 0;
+
+   if (!amdgpu_ras_in_recovery(adev))
+   return false;
+
+   if (hive) {
+   hive_ras_fatal = atomic_read(>ras_fatal);
+   amdgpu_put_xgmi_hive(hive);
+   }
+
+   if (ras && (atomic_read(>in_fatal) || hive_ras_fatal))
+   return true;
+
+   return false;
+}
+
 static void amdgpu_ras_do_recovery(struct work_struct *work)  {
struct amdgpu_ras *ras =
@@ -2462,6 +2482,8 @@ static void amdgpu_ras_do_recovery(struct work_struct 
*work)

if (hive) {
atomic_set(>ras_recovery, 1);
+   if (atomic_read(>in_fatal))
+   atomic_set(>ras_fatal, 1);

/* If any device which is part of the hive received RAS fatal
 * error interrupt, set fatal error status on all. This @@ 
-2526,8 +2548,10 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
amdgpu_device_gpu_recover(ras->adev, NULL, _context);
}
atomic_set(>in_recovery, 0);
+   atomic_set(>in_fatal, 0);
if (hive) {
atomic_set(>ras_recovery, 0);
+   atomic_set(>ras_fatal, 0);
amdgpu_put_xgmi_hive(hive);
}
 }
@@ -2982,6 +3006,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
mutex_init(>recovery_lock);
INIT_WORK(>recovery_work, amdgpu_ras_do_recovery);
atomic_set(>in_recovery, 0);
+   atomic_set(>in_fatal, 0);
con->eeprom_control.bad_channel_bitmap = 0;

max_eeprom_records_count = 
amdgpu_ras_eeprom_max_record_count(>eeprom_control);
@@ -4006,8 +4031,13 @@ int amdgpu_ras_reset_gpu(struct amdgpu_device *adev, 
bool fatal)
ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE1_RESET;
}

-   if (atomic_cmpxchg(>in_recovery, 0, 1) == 0)
+   if (atomic_cmpxchg(>in_recovery, 0, 1) == 0) {
+   if (fatal)
+   atomic_set(>in_fatal, 1);
+
amdgpu_reset_domain_schedule(ras->adev->reset_domain, 
>recovery_work);
+   }
+
return 0;
 }

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index ed5793458a70..444a7fb7fbe3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -489,6 +489,7 @@ struct amdgpu_ras {
/* gpu recovery */
struct work_struct recovery_work;
atomic_t in_recovery;
+   atomic_t in_fatal;
struct amdgpu_device *adev;
/* error handler data */
struct ras_err_handler_data *eh_data;
@@ -953,6 +954,7 @@ int amdgpu_ras_put_poison_req(struct amdgpu_device *adev,
pasid_notify pasid_fn, void *data, uint32_t reset);

 bool amdgpu_ras_in_recovery(struct amdgpu_device *adev);
+bool amdgpu_ras_in_fatal(struct amdgpu_device *adev);

 __printf(3, 4)
 void amdgpu_ras_event_log_print(struct amdgpu_device *adev, u64 event_id, diff 
--git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
index a3bfc16de6d4..a6d6272a4ec6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
@@ -44,6 +44,7 @@ struct amdgpu_hive_info {

struct amdgpu_reset_domain *reset_domain;
atomic_t ras_recovery;
+   atomic_t ras_fatal;
struct ras_event_manager event_mgr;
 };

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
index 04533f99f1e3..a850e7b29d9d 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
+++

Re: [PATCH v4 6/9] drm/amdgpu: call flush_gpu_tlb directly in gfxhub enable

2024-06-05 Thread Christian König


Am 05.06.24 um 03:33 schrieb Yunxiang Li:

Here since we are in reset and takes the reset_domain write side lock
already. We can't use the flush tlb helper which tries to take the read
side.

Signed-off-by: Yunxiang Li 


Please add some code comments with a TODO that this needs more 
investigation.


With that done the patch is Reviewed-by: Christian König 



Regards,
Christian.


---
  drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 4 +---
  drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c  | 2 +-
  drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c  | 2 +-
  3 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
index 603c0738fd03..660599823050 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -620,10 +620,8 @@ void amdgpu_gmc_flush_gpu_tlb(struct amdgpu_device *adev, 
uint32_t vmid,
int r;
  
  	if (!hub->sdma_invalidation_workaround || vmid ||

-   !adev->mman.buffer_funcs_enabled ||
-   !adev->ib_pool_ready || amdgpu_in_reset(adev) ||
+   !adev->mman.buffer_funcs_enabled || !adev->ib_pool_ready ||
!ring->sched.ready) {
-
/*
 * A GPU reset should flush all TLBs anyway, so no need to do
 * this while one is ongoing.
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
index aba0a51be960..93b62107f7a4 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
@@ -4401,7 +4401,7 @@ static int gfx_v11_0_gfxhub_enable(struct amdgpu_device 
*adev)
false : true;
  
  	adev->gfxhub.funcs->set_fault_enable_default(adev, value);

-   amdgpu_gmc_flush_gpu_tlb(adev, 0, AMDGPU_GFXHUB(0), 0);
+   adev->gmc.gmc_funcs->flush_gpu_tlb(adev, 0, AMDGPU_GFXHUB(0), 0);
  
  	return 0;

  }
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
index 1ef9de41d193..b7ea46ed0c72 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
@@ -3213,7 +3213,7 @@ static int gfx_v12_0_gfxhub_enable(struct amdgpu_device 
*adev)
false : true;
  
  	adev->gfxhub.funcs->set_fault_enable_default(adev, value);

-   amdgpu_gmc_flush_gpu_tlb(adev, 0, AMDGPU_GFXHUB(0), 0);
+   adev->gmc.gmc_funcs->flush_gpu_tlb(adev, 0, AMDGPU_GFXHUB(0), 0);
  
  	return 0;

  }

RE: [PATCH 05/12] drm/amd/pm: remove dead code in navi10_emit_clk_levels and navi10_print_clk_levels

2024-06-05 Thread Huang, Tim

[Public]

This patch is,

Reviewed-by: Tim Huang 



> -Original Message-
> From: Jesse Zhang 
> Sent: Monday, June 3, 2024 4:48 PM
> To: amd-gfx@lists.freedesktop.org
> Cc: Deucher, Alexander ; Koenig, Christian
> ; Kuehling, Felix ;
> Huang, Tim ; Zhang, Jesse(Jie)
> ; Zhang, Jesse(Jie) 
> Subject: [PATCH 05/12] drm/amd/pm: remove dead code in
> navi10_emit_clk_levels and navi10_print_clk_levels
>
> Since the range of the varibable i is 0 - 3.
> So execution cannot reach this statement: default.
>
> Signed-off-by: Jesse Zhang 
> ---
>  drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c | 4 
>  1 file changed, 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c
> b/drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c
> index cf556f1b5ed1..076620fa3ef5 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c
> @@ -1389,8 +1389,6 @@ static int navi10_emit_clk_levels(struct
> smu_context *smu,
>   case 2:
>   curve_settings = _table->GfxclkFreq3;
>   break;
> - default:
> - break;
>   }
>   *offset += sysfs_emit_at(buf, *offset,
> "%d: %uMHz %umV\n",
> i, curve_settings[0],
> @@ -1594,8 +1592,6 @@ static int navi10_print_clk_levels(struct
> smu_context *smu,
>   case 2:
>   curve_settings = _table->GfxclkFreq3;
>   break;
> - default:
> - break;
>   }
>   size += sysfs_emit_at(buf, size,
> "%d: %uMHz %umV\n",
> i, curve_settings[0],
> --
> 2.25.1

RE: [PATCH 2/5] drm/amdgpu: trigger mode1 reset for RAS RMA status

2024-06-05 Thread Zhang, Hawking

[AMD Official Use Only - AMD Internal Distribution Only]

Reviewed-by: Hawking Zhang 

Regards,
Hawking
-Original Message-
From: amd-gfx  On Behalf Of Tao Zhou
Sent: Friday, May 31, 2024 18:49
To: amd-gfx@lists.freedesktop.org
Cc: Zhou1, Tao 
Subject: [PATCH 2/5] drm/amdgpu: trigger mode1 reset for RAS RMA status

Check RMA status in bad page retirement flow.

v2: fix coding bugs in v1.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c  | 28 +++-  
drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c  |  8 +++  
drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c |  4 +++-
 3 files changed, 29 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 616dc2387f34..10cbcc0d1a1a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2049,8 +2049,9 @@ static void 
amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
struct amdgpu_device *adev = obj->adev;
struct amdgpu_ras_block_object *block_obj =
amdgpu_ras_get_ras_block(adev, obj->head.block, 0);
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

-   if (!block_obj)
+   if (!block_obj || !con)
return;

/* both query_poison_status and handle_poison_consumption are optional, 
@@ -2073,14 +2074,17 @@ static void 
amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
if (block_obj->hw_ops && block_obj->hw_ops->handle_poison_consumption)
poison_stat = 
block_obj->hw_ops->handle_poison_consumption(adev);

-   /* gpu reset is fallback for failed and default cases */
-   if (poison_stat) {
+   /* gpu reset is fallback for failed and default cases.
+* For RMA case, amdgpu_umc_poison_handler will handle gpu reset.
+*/
+   if (poison_stat && !con->is_rma) {
dev_info(adev->dev, "GPU reset for %s RAS poison consumption is 
issued!\n",
block_obj->ras_comm.name);
amdgpu_ras_reset_gpu(adev);
-   } else {
-   amdgpu_gfx_poison_consumption_handler(adev, entry);
}
+
+   if (!poison_stat)
+   amdgpu_gfx_poison_consumption_handler(adev, entry);
 }

 static void amdgpu_ras_interrupt_poison_creation_handler(struct ras_manager 
*obj, @@ -2801,6 +2805,7 @@ static void amdgpu_ras_do_page_retirement(struct 
work_struct *work)
  page_retirement_dwork.work);
struct amdgpu_device *adev = con->adev;
struct ras_err_data err_data;
+   unsigned long err_cnt;

if (amdgpu_in_reset(adev) || atomic_read(>in_recovery))
return;
@@ -2808,9 +2813,13 @@ static void amdgpu_ras_do_page_retirement(struct 
work_struct *work)
amdgpu_ras_error_data_init(_data);

amdgpu_umc_handle_bad_pages(adev, _data);
+   err_cnt = err_data.err_addr_cnt;

amdgpu_ras_error_data_fini(_data);

+   if (err_cnt && con->is_rma)
+   amdgpu_ras_reset_gpu(adev);
+
mutex_lock(>umc_ecc_log.lock);
if (radix_tree_tagged(>umc_ecc_log.de_page_tree,
UMC_ECC_NEW_DETECTED_TAG))
@@ -2867,7 +2876,8 @@ static int amdgpu_ras_poison_consumption_handler(struct 
amdgpu_device *adev,
if (poison_msg->pasid_fn)
poison_msg->pasid_fn(adev, pasid, poison_msg->data);

-   if (reset) {
+   /* for RMA, amdgpu_ras_poison_creation_handler will trigger gpu reset */
+   if (reset && !con->is_rma) {
flush_delayed_work(>page_retirement_dwork);

con->gpu_reset_flags |= reset;
@@ -3983,6 +3993,12 @@ int amdgpu_ras_reset_gpu(struct amdgpu_device *adev)  {
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

+   /* mode1 is the only selection for RMA status */
+   if (ras->is_rma) {
+   ras->gpu_reset_flags = 0;
+   ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE1_RESET;
+   }
+
if (atomic_cmpxchg(>in_recovery, 0, 1) == 0)
amdgpu_reset_domain_schedule(ras->adev->reset_domain, 
>recovery_work);
return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index 1dbe69eabb9a..4a72ff8d8d80 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -195,7 +195,8 @@ static int amdgpu_umc_do_page_retirement(struct 
amdgpu_device *adev,
kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
amdgpu_umc_handle_bad_pages(adev, ras_error_status);

-   if (err_data->ue_count && reset) {
+   if ((err_data->ue_count || err_data->de_count) &&
+   (reset || (con && con->is_rma))) {
con->gpu_reset_flags |= reset;
amdgpu_ras_reset_gpu(adev);
}
@@ -211,6 +212,7 @@ int

RE: [PATCH 04/12] drm/amdgpu: remove dead code in atom_get_src_int

2024-06-05 Thread Huang, Tim

[AMD Official Use Only - AMD Internal Distribution Only]

Hi Jesse,

> -Original Message-
> From: Jesse Zhang 
> Sent: Monday, June 3, 2024 4:47 PM
> To: amd-gfx@lists.freedesktop.org
> Cc: Deucher, Alexander ; Koenig, Christian
> ; Kuehling, Felix ;
> Huang, Tim ; Zhang, Jesse(Jie)
> ; Zhang, Jesse(Jie) 
> Subject: [PATCH 04/12] drm/amdgpu: remove dead code in atom_get_src_int
>
> Since the range of align is 0~7, the expression is: align = (attr >> 3) & 7.
> In the case of ATOM_ARG_IMM, the code cannot reach the default case.
> So there is no need for "break".
>
> Signed-off-by: Jesse Zhang 
> ---
>  drivers/gpu/drm/amd/amdgpu/atom.c | 1 -
>  1 file changed, 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/atom.c
> b/drivers/gpu/drm/amd/amdgpu/atom.c
> index d552e013354c..c660e4a663ef 100644
> --- a/drivers/gpu/drm/amd/amdgpu/atom.c
> +++ b/drivers/gpu/drm/amd/amdgpu/atom.c
> @@ -320,7 +320,6 @@ static uint32_t atom_get_src_int(atom_exec_context
> *ctx, uint8_t attr,
>   DEBUG("IMM 0x%02X\n", val);
>   return val;
>   }
> - break;

This should have new statement may fall through warning if remove the break 
here?

Tim Huang

>   case ATOM_ARG_PLL:
>   idx = U8(*ptr);
>   (*ptr)++;
> --
> 2.25.1

RE: [PATCH 01/12] drm/amd/pm: remove dead code in si_convert_power_level_to_smc

2024-06-05 Thread Huang, Tim

[Public]

Hi Jesse,

> -Original Message-
> From: Jesse Zhang 
> Sent: Monday, June 3, 2024 4:46 PM
> To: amd-gfx@lists.freedesktop.org
> Cc: Deucher, Alexander ; Koenig, Christian
> ; Kuehling, Felix ;
> Huang, Tim ; Zhang, Jesse(Jie)
> ; Zhang, Jesse(Jie) 
> Subject: [PATCH 01/12] drm/amd/pm: remove dead code in
> si_convert_power_level_to_smc
>
> Since gmc_pg is false, setting mcFlags with SISLANDS_SMC_MC_PG_EN
> cannot be reach.
>
> Signed-off-by: Jesse Zhang 
> ---
>  drivers/gpu/drm/amd/pm/legacy-dpm/si_dpm.c | 3 ---
>  1 file changed, 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/pm/legacy-dpm/si_dpm.c
> b/drivers/gpu/drm/amd/pm/legacy-dpm/si_dpm.c
> index 68ac01a8bc3a..a18f75a6d480 100644
> --- a/drivers/gpu/drm/amd/pm/legacy-dpm/si_dpm.c
> +++ b/drivers/gpu/drm/amd/pm/legacy-dpm/si_dpm.c
> @@ -5487,9 +5487,6 @@ static int si_convert_power_level_to_smc(struct
> amdgpu_device *adev,
>   (RREG32(DPG_PIPE_STUTTER_CONTROL) & STUTTER_ENABLE) &&
>   (adev->pm.dpm.new_active_crtc_count <= 2)) {
>   level->mcFlags |= SISLANDS_SMC_MC_STUTTER_EN;
> -
> - if (gmc_pg)
> - level->mcFlags |= SISLANDS_SMC_MC_PG_EN;

If remove this, the pmc_pg should never be used, maybe remove the definition of 
"bool gmc_pg = false" as well?

Tim Huang

>   }
>
>   if (adev->gmc.vram_type == AMDGPU_VRAM_TYPE_GDDR5) {
> --
> 2.25.1

RE: [PATCH 3/5] drm/amdgpu: create amdgpu_ras_in_recovery to simplify code

2024-06-05 Thread Zhang, Hawking

[AMD Official Use Only - AMD Internal Distribution Only]

Reviewed-by: Hawking Zhang 

Regards,
Hawking
-Original Message-
From: amd-gfx  On Behalf Of Tao Zhou
Sent: Friday, May 31, 2024 18:49
To: amd-gfx@lists.freedesktop.org
Cc: Zhou1, Tao 
Subject: [PATCH 3/5] drm/amdgpu: create amdgpu_ras_in_recovery to simplify code

Reduce redundant code and user doesn't need to pay attention to RAS details.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c| 13 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c   | 14 ++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c   | 31 ---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h   |  2 ++
 .../drm/amd/pm/swsmu/smu13/aldebaran_ppt.c|  5 ++-
 .../drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c  |  3 +-  
.../drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c  | 12 +--
 7 files changed, 29 insertions(+), 51 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 2ba8c4d5dc76..1811c7ba9bdc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -6325,20 +6325,11 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev 
*pdev)
struct amdgpu_reset_context reset_context;
u32 memsize;
struct list_head device_list;
-   struct amdgpu_hive_info *hive;
-   int hive_ras_recovery = 0;
-   struct amdgpu_ras *ras;

/* PCI error slot reset should be skipped During RAS recovery */
-   hive = amdgpu_get_xgmi_hive(adev);
-   if (hive) {
-   hive_ras_recovery = atomic_read(>ras_recovery);
-   amdgpu_put_xgmi_hive(hive);
-   }
-   ras = amdgpu_ras_get_context(adev);
if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
-amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) &&
-   ras && (atomic_read(>in_recovery) || hive_ras_recovery))
+   amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) &&
+   amdgpu_ras_in_recovery(adev))
return PCI_ERS_RESULT_RECOVERED;

DRM_INFO("PCI error: slot reset callback!!\n"); diff --git 
a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index 68505eaa92f9..fb5fc1fe6ad0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -505,9 +505,6 @@ int amdgpu_gfx_disable_kcq(struct amdgpu_device *adev, int 
xcc_id)  {
struct amdgpu_kiq *kiq = >gfx.kiq[xcc_id];
struct amdgpu_ring *kiq_ring = >ring;
-   struct amdgpu_hive_info *hive;
-   struct amdgpu_ras *ras;
-   int hive_ras_recovery = 0;
int i, r = 0;
int j;

@@ -532,16 +529,9 @@ int amdgpu_gfx_disable_kcq(struct amdgpu_device *adev, int 
xcc_id)
 * This is workaround: only skip kiq_ring test
 * during ras recovery in suspend stage for gfx9.4.3
 */
-   hive = amdgpu_get_xgmi_hive(adev);
-   if (hive) {
-   hive_ras_recovery = atomic_read(>ras_recovery);
-   amdgpu_put_xgmi_hive(hive);
-   }
-
-   ras = amdgpu_ras_get_context(adev);
if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
-amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) &&
-   ras && (atomic_read(>in_recovery) || hive_ras_recovery)) {
+   amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) &&
+   amdgpu_ras_in_recovery(adev)) {
spin_unlock(>ring_lock);
return 0;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 10cbcc0d1a1a..ff2d34dc9718 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1409,11 +1409,8 @@ int amdgpu_ras_reset_error_count(struct amdgpu_device 
*adev,
enum amdgpu_ras_block block)
 {
struct amdgpu_ras_block_object *block_obj = 
amdgpu_ras_get_ras_block(adev, block, 0);
-   struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
const struct aca_smu_funcs *smu_funcs = adev->aca.smu_funcs;
-   struct amdgpu_hive_info *hive;
-   int hive_ras_recovery = 0;

if (!block_obj || !block_obj->hw_ops) {
dev_dbg_once(adev->dev, "%s doesn't config RAS function\n", @@ 
-1425,15 +1422,8 @@ int amdgpu_ras_reset_error_count(struct amdgpu_device *adev,
!amdgpu_ras_get_aca_debug_mode(adev))
return -EOPNOTSUPP;

-   hive = amdgpu_get_xgmi_hive(adev);
-   if (hive) {
-   hive_ras_recovery = atomic_read(>ras_recovery);
-   amdgpu_put_xgmi_hive(hive);
-   }
-
/* skip ras error reset in gpu reset */
-   if ((amdgpu_in_reset(adev) || atomic_read(>in_recovery) ||
-   hive_ras_recovery) &&
+

RE: [PATCH 4/5] drma/amdgpu: set fatal flag for RAS recovery

2024-06-05 Thread Zhang, Hawking

[AMD Official Use Only - AMD Internal Distribution Only]

Please correct the commit subject before pushing the change

drma->drm

Regards,
Hawking
-Original Message-
From: amd-gfx  On Behalf Of Tao Zhou
Sent: Friday, May 31, 2024 18:49
To: amd-gfx@lists.freedesktop.org
Cc: Zhou1, Tao 
Subject: [PATCH 4/5] drma/amdgpu: set fatal flag for RAS recovery

PMFW needs the flag to know the reason of mode1.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c  |  2 +-  
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c  | 10 +-  
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h  |  2 +-  
drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c |  2 +-  
drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c  |  6 +++---  
drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c |  2 +-
 drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c   |  2 +-
 7 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index fb5fc1fe6ad0..f55bff59052f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -940,7 +940,7 @@ int amdgpu_gfx_process_ras_data_cb(struct amdgpu_device 
*adev,
if (adev->gfx.ras && adev->gfx.ras->ras_block.hw_ops &&
adev->gfx.ras->ras_block.hw_ops->query_ras_error_count)

adev->gfx.ras->ras_block.hw_ops->query_ras_error_count(adev, err_data);
-   amdgpu_ras_reset_gpu(adev);
+   amdgpu_ras_reset_gpu(adev, true);
}
return AMDGPU_RAS_SUCCESS;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index ff2d34dc9718..2071e30d7e56 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2070,7 +2070,7 @@ static void 
amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
if (poison_stat && !con->is_rma) {
dev_info(adev->dev, "GPU reset for %s RAS poison consumption is 
issued!\n",
block_obj->ras_comm.name);
-   amdgpu_ras_reset_gpu(adev);
+   amdgpu_ras_reset_gpu(adev, false);
}

if (!poison_stat)
@@ -2825,7 +2825,7 @@ static void amdgpu_ras_do_page_retirement(struct 
work_struct *work)
amdgpu_ras_error_data_fini(_data);

if (err_cnt && con->is_rma)
-   amdgpu_ras_reset_gpu(adev);
+   amdgpu_ras_reset_gpu(adev, false);

mutex_lock(>umc_ecc_log.lock);
if (radix_tree_tagged(>umc_ecc_log.de_page_tree,
@@ -2888,7 +2888,7 @@ static int amdgpu_ras_poison_consumption_handler(struct 
amdgpu_device *adev,
flush_delayed_work(>page_retirement_dwork);

con->gpu_reset_flags |= reset;
-   amdgpu_ras_reset_gpu(adev);
+   amdgpu_ras_reset_gpu(adev, false);
}

return 0;
@@ -3815,7 +3815,7 @@ void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)

amdgpu_ras_set_fed(adev, true);
ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE1_RESET;
-   amdgpu_ras_reset_gpu(adev);
+   amdgpu_ras_reset_gpu(adev, true);
}
 }

@@ -3996,7 +3996,7 @@ int amdgpu_ras_is_supported(struct amdgpu_device *adev,
return ret;
 }

-int amdgpu_ras_reset_gpu(struct amdgpu_device *adev)
+int amdgpu_ras_reset_gpu(struct amdgpu_device *adev, bool fatal)
 {
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 37e1c93c243d..ed5793458a70 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -878,7 +878,7 @@ bool amdgpu_ras_is_poison_mode_supported(struct 
amdgpu_device *adev);

 int amdgpu_ras_is_supported(struct amdgpu_device *adev, unsigned int block);

-int amdgpu_ras_reset_gpu(struct amdgpu_device *adev);
+int amdgpu_ras_reset_gpu(struct amdgpu_device *adev, bool fatal);

 struct amdgpu_ras* amdgpu_ras_get_context(struct amdgpu_device *adev);

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
index 151f83ea803b..f976b6deb42d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
@@ -129,7 +129,7 @@ int amdgpu_sdma_process_ras_data_cb(struct amdgpu_device 
*adev,
if (amdgpu_sriov_vf(adev))
return AMDGPU_RAS_SUCCESS;

-   amdgpu_ras_reset_gpu(adev);
+   amdgpu_ras_reset_gpu(adev, true);

return AMDGPU_RAS_SUCCESS;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index 4a72ff8d8d80..2596a1c2a64e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -198,7 +198,7 @@ static int amdgpu_umc_do_page_retirement(struct 
amdgpu_device *adev,
if ((err_data->ue_count || err_data->de_count) &&

RE: [PATCH 1/5] drm/amdgpu: add RAS is_rma flag

2024-06-05 Thread Zhou1, Tao

[AMD Official Use Only - AMD Internal Distribution Only]

Ping for the series...

> -Original Message-
> From: Zhou1, Tao 
> Sent: Friday, May 31, 2024 6:49 PM
> To: amd-gfx@lists.freedesktop.org
> Cc: Zhou1, Tao 
> Subject: [PATCH 1/5] drm/amdgpu: add RAS is_rma flag
>
> Set the flag to true if bad page number reaches threshold.
>
> Signed-off-by: Tao Zhou 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c|  7 +++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h|  1 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 10 ++
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h |  3 +--
>  4 files changed, 11 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 2dc47475b8e9..616dc2387f34 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -2940,7 +2940,6 @@ int amdgpu_ras_recovery_init(struct amdgpu_device
> *adev)
>   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
>   struct ras_err_handler_data **data;
>   u32  max_eeprom_records_count = 0;
> - bool exc_err_limit = false;
>   int ret;
>
>   if (!con || amdgpu_sriov_vf(adev))
> @@ -2977,12 +2976,12 @@ int amdgpu_ras_recovery_init(struct
> amdgpu_device *adev)
>*/
>   if (adev->gmc.xgmi.pending_reset)
>   return 0;
> - ret = amdgpu_ras_eeprom_init(>eeprom_control, _err_limit);
> + ret = amdgpu_ras_eeprom_init(>eeprom_control);
>   /*
>* This calling fails when exc_err_limit is true or
>* ret != 0.
>*/
> - if (exc_err_limit || ret)
> + if (con->is_rma || ret)
>   goto free;
>
>   if (con->eeprom_control.ras_num_recs) { @@ -3033,7 +3032,7 @@ int
> amdgpu_ras_recovery_init(struct amdgpu_device *adev)
>* Except error threshold exceeding case, other failure cases in this
>* function would not fail amdgpu driver init.
>*/
> - if (!exc_err_limit)
> + if (!con->is_rma)
>   ret = 0;
>   else
>   ret = -EINVAL;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index d06c01b978cd..437c58c85639 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -521,6 +521,7 @@ struct amdgpu_ras {
>   bool update_channel_flag;
>   /* Record status of smu mca debug mode */
>   bool is_aca_debug_mode;
> + bool is_rma;
>
>   /* Record special requirements of gpu reset caller */
>   uint32_t  gpu_reset_flags;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> index 9b789dcc2bd1..eae0a555df3c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> @@ -750,6 +750,9 @@ amdgpu_ras_eeprom_update_header(struct
> amdgpu_ras_eeprom_control *control)
>   control->tbl_rai.health_percent = 0;
>   }
>
> + if (amdgpu_bad_page_threshold != -1)
> + ras->is_rma = true;
> +
>   /* ignore the -ENOTSUPP return value */
>   amdgpu_dpm_send_rma_reason(adev);
>   }
> @@ -1321,8 +1324,7 @@ static int __read_table_ras_info(struct
> amdgpu_ras_eeprom_control *control)
>   return res == RAS_TABLE_V2_1_INFO_SIZE ? 0 : res;  }
>
> -int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
> -bool *exceed_err_limit)
> +int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control)
>  {
>   struct amdgpu_device *adev = to_amdgpu_device(control);
>   unsigned char buf[RAS_TABLE_HEADER_SIZE] = { 0 }; @@ -1330,7
> +1332,7 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control
> *control,
>   struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
>   int res;
>
> - *exceed_err_limit = false;
> + ras->is_rma = false;
>
>   if (!__is_ras_eeprom_supported(adev))
>   return 0;
> @@ -1422,7 +1424,7 @@ int amdgpu_ras_eeprom_init(struct
> amdgpu_ras_eeprom_control *control,
>   dev_warn(adev->dev, "GPU will be initialized
> due to bad_page_threshold = -1.");
>   res = 0;
>   } else {
> - *exceed_err_limit = true;
> + ras->is_rma = true;
>   dev_err(adev->dev,
>   "RAS records:%d exceed threshold:%d, "
>   "GPU will not be initialized. Replace 
> this
> GPU or increase the threshold", diff --git
> a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> index 6dfd667f3013..b9ebda577797 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> +++

74 matches

Mail list logo