[PATCH] drm/amdgpu: move xgmi init/fini to xgmi_add/remove_device call

2020-01-28 Thread Frank . Min
From: Hawking Zhang 

For sriov, psp ip block has to be initialized before
ih block for the dynamic register programming interface
that needed for vf ih ring buffer. On the other hand,
current psp ip block hw_init function will initialize
xgmi session which actually depends on interrupt to
return session context. This results in an empty xgmi ta
session id and later failures on all the xgmi ta cmd
invoked from vf. xgmi ta session initialization has to
be done after ih ip block hw_init call.

to unify xgmi session init/fini for both bare-metal and
sriov virtualization use scenarios, move xgmi ta init
to xgmi_add_device call, and accordingly terminate xgmi
ta session in xgmi_remove_device call.

The existing suspend/resume sequence will not be changed.

Change-Id: I40776af28e45d2d5ed9f87b28983069c746f2f2e
Signed-off-by: Hawking Zhang 
Reviewed-by: Frank Min 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c  | 18 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h  |  2 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 11 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h |  2 +-
 4 files changed, 15 insertions(+), 18 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 3a1570dafe34..939a114605c0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -558,7 +558,7 @@ int psp_xgmi_invoke(struct psp_context *psp, uint32_t 
ta_cmd_id)
return psp_ta_invoke(psp, ta_cmd_id, psp->xgmi_context.session_id);
 }
 
-static int psp_xgmi_terminate(struct psp_context *psp)
+int psp_xgmi_terminate(struct psp_context *psp)
 {
int ret;
 
@@ -579,7 +579,7 @@ static int psp_xgmi_terminate(struct psp_context *psp)
return 0;
 }
 
-static int psp_xgmi_initialize(struct psp_context *psp)
+int psp_xgmi_initialize(struct psp_context *psp)
 {
struct ta_xgmi_shared_memory *xgmi_cmd;
int ret;
@@ -1420,16 +1420,6 @@ static int psp_load_fw(struct amdgpu_device *adev)
return ret;
}
 
-   if (adev->gmc.xgmi.num_physical_nodes > 1) {
-   ret = psp_xgmi_initialize(psp);
-   /* Warning the XGMI seesion initialize failure
-* Instead of stop driver initialization
-*/
-   if (ret)
-   dev_err(psp->adev->dev,
-   "XGMI: Failed to initialize XGMI session\n");
-   }
-
if (psp->adev->psp.ta_fw) {
ret = psp_ras_initialize(psp);
if (ret)
@@ -1494,10 +1484,6 @@ static int psp_hw_fini(void *handle)
void *tmr_buf;
void **pptr;
 
-   if (adev->gmc.xgmi.num_physical_nodes > 1 &&
-   psp->xgmi_context.initialized == 1)
-psp_xgmi_terminate(psp);
-
if (psp->adev->psp.ta_fw) {
psp_ras_terminate(psp);
psp_dtm_terminate(psp);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
index 611021514c52..c77e1abb538a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
@@ -362,6 +362,8 @@ int psp_gpu_reset(struct amdgpu_device *adev);
 int psp_update_vcn_sram(struct amdgpu_device *adev, int inst_idx,
uint64_t cmd_gpu_addr, int cmd_size);
 
+int psp_xgmi_initialize(struct psp_context *psp);
+int psp_xgmi_terminate(struct psp_context *psp);
 int psp_xgmi_invoke(struct psp_context *psp, uint32_t ta_cmd_id);
 
 int psp_ras_invoke(struct psp_context *psp, uint32_t ta_cmd_id);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index a97af422575a..78989e9560d1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -365,6 +365,13 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
return 0;
 
if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
+   ret = psp_xgmi_initialize(&adev->psp);
+   if (ret) {
+   dev_err(adev->dev,
+   "XGMI: Failed to initialize xgmi session\n");
+   return ret;
+   }
+
ret = psp_xgmi_get_hive_id(&adev->psp, &adev->gmc.xgmi.hive_id);
if (ret) {
dev_err(adev->dev,
@@ -451,7 +458,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
return ret;
 }
 
-void amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
+int amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
 {
struct amdgpu_hive_info *hive;
 
@@ -471,6 +478,8 @@ void amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
amdgpu_xgmi_sysfs_rem_dev_info(adev, hive);
mutex_unlock(&hive->hive_lock);
}
+
+   return psp_xgmi_terminate(&adev->psp);
 }
 
 int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev)
diff --git 

[PATCH] drm/amdgpu: move xgmi init/fini to xgmi_add/remove_device call

2020-01-28 Thread Frank . Min
From: Hawking Zhang 

For sriov, psp ip block has to be initialized before
ih block for the dynamic register programming interface
that needed for vf ih ring buffer. On the other hand,
current psp ip block hw_init function will initialize
xgmi session which actually depends on interrupt to
return session context. This results in an empty xgmi ta
session id and later failures on all the xgmi ta cmd
invoked from vf. xgmi ta session initialization has to
be done after ih ip block hw_init call.

to unify xgmi session init/fini for both bare-metal and
sriov virtualization use scenarios, move xgmi ta init
to xgmi_add_device call, and accordingly terminate xgmi
ta session in xgmi_remove_device call.

The existing suspend/resume sequence will not be changed.

Change-Id: I40776af28e45d2d5ed9f87b28983069c746f2f2e
Signed-off-by: Hawking Zhang 
Reviewed-by: Frank Min 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c  | 18 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h  |  2 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 11 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h |  2 +-
 4 files changed, 15 insertions(+), 18 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 3a1570dafe34..939a114605c0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -558,7 +558,7 @@ int psp_xgmi_invoke(struct psp_context *psp, uint32_t 
ta_cmd_id)
return psp_ta_invoke(psp, ta_cmd_id, psp->xgmi_context.session_id);
 }
 
-static int psp_xgmi_terminate(struct psp_context *psp)
+int psp_xgmi_terminate(struct psp_context *psp)
 {
int ret;
 
@@ -579,7 +579,7 @@ static int psp_xgmi_terminate(struct psp_context *psp)
return 0;
 }
 
-static int psp_xgmi_initialize(struct psp_context *psp)
+int psp_xgmi_initialize(struct psp_context *psp)
 {
struct ta_xgmi_shared_memory *xgmi_cmd;
int ret;
@@ -1420,16 +1420,6 @@ static int psp_load_fw(struct amdgpu_device *adev)
return ret;
}
 
-   if (adev->gmc.xgmi.num_physical_nodes > 1) {
-   ret = psp_xgmi_initialize(psp);
-   /* Warning the XGMI seesion initialize failure
-* Instead of stop driver initialization
-*/
-   if (ret)
-   dev_err(psp->adev->dev,
-   "XGMI: Failed to initialize XGMI session\n");
-   }
-
if (psp->adev->psp.ta_fw) {
ret = psp_ras_initialize(psp);
if (ret)
@@ -1494,10 +1484,6 @@ static int psp_hw_fini(void *handle)
void *tmr_buf;
void **pptr;
 
-   if (adev->gmc.xgmi.num_physical_nodes > 1 &&
-   psp->xgmi_context.initialized == 1)
-psp_xgmi_terminate(psp);
-
if (psp->adev->psp.ta_fw) {
psp_ras_terminate(psp);
psp_dtm_terminate(psp);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
index 611021514c52..c77e1abb538a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
@@ -362,6 +362,8 @@ int psp_gpu_reset(struct amdgpu_device *adev);
 int psp_update_vcn_sram(struct amdgpu_device *adev, int inst_idx,
uint64_t cmd_gpu_addr, int cmd_size);
 
+int psp_xgmi_initialize(struct psp_context *psp);
+int psp_xgmi_terminate(struct psp_context *psp);
 int psp_xgmi_invoke(struct psp_context *psp, uint32_t ta_cmd_id);
 
 int psp_ras_invoke(struct psp_context *psp, uint32_t ta_cmd_id);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index a97af422575a..78989e9560d1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -365,6 +365,13 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
return 0;
 
if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
+   ret = psp_xgmi_initialize(&adev->psp);
+   if (ret) {
+   dev_err(adev->dev,
+   "XGMI: Failed to initialize xgmi session\n");
+   return ret;
+   }
+
ret = psp_xgmi_get_hive_id(&adev->psp, &adev->gmc.xgmi.hive_id);
if (ret) {
dev_err(adev->dev,
@@ -451,7 +458,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
return ret;
 }
 
-void amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
+int amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
 {
struct amdgpu_hive_info *hive;
 
@@ -471,6 +478,8 @@ void amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
amdgpu_xgmi_sysfs_rem_dev_info(adev, hive);
mutex_unlock(&hive->hive_lock);
}
+
+   return psp_xgmi_terminate(&adev->psp);
 }
 
 int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev)
diff --git 

Re: [Patch v1 4/5] drm/amdkfd: show warning when kfd is locked

2020-01-28 Thread Felix Kuehling

On 2020-01-27 20:29, Rajneesh Bhardwaj wrote:

During system suspend the kfd driver acquires a lock that prohibits
further kfd actions unless the gpu is resumed. This adds some info which
can be useful while debugging.

Signed-off-by: Rajneesh Bhardwaj 
---
  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 2 ++
  1 file changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 1aebda4bbbe7..081cc5f40d18 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -127,6 +127,8 @@ static int kfd_open(struct inode *inode, struct file *filep)
return PTR_ERR(process);
  
  	if (kfd_is_locked()) {

+   dev_warn(kfd_device, "kfd is locked!\n"
+   "process %d unreferenced", process->pasid);


If this is for debugging, make it dev_dbg. Printing warnings like this 
usually confuses people reporting completely unrelated problems that 
aren't even kernel problems at all. "Look, I found a warning in the 
kernel log. It must be a kernel problem."


Regards,
  Felix



kfd_unref_process(process);
return -EAGAIN;
}

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH] drm/amdgpu/smu_v11_0: Correct behavior of restoring default tables

2020-01-28 Thread Matt Coffin
Previously, the sysfs functionality for restoring the default powerplay
table was sourcing its information from the currently-staged powerplay
table.

This patch adds a step to cache the first overdrive table that we see on
boot, so that it can be used later to "restore" the powerplay table

Signed-off-by: Matt Coffin 
---
 .../gpu/drm/amd/powerplay/inc/amdgpu_smu.h|  1 +
 drivers/gpu/drm/amd/powerplay/navi10_ppt.c|  9 +++---
 drivers/gpu/drm/amd/powerplay/smu_v11_0.c |  6 
 drivers/gpu/drm/amd/powerplay/vega20_ppt.c| 28 ++-
 4 files changed, 19 insertions(+), 25 deletions(-)

diff --git a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h 
b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h
index b0591a8dda41..1e33c3e9b98d 100644
--- a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h
+++ b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h
@@ -273,6 +273,7 @@ struct smu_table_context
uint8_t thermal_controller_type;
 
void*overdrive_table;
+   void*boot_overdrive_table;
 };
 
 struct smu_dpm_context {
diff --git a/drivers/gpu/drm/amd/powerplay/navi10_ppt.c 
b/drivers/gpu/drm/amd/powerplay/navi10_ppt.c
index f60762f9b143..26cfccc57331 100644
--- a/drivers/gpu/drm/amd/powerplay/navi10_ppt.c
+++ b/drivers/gpu/drm/amd/powerplay/navi10_ppt.c
@@ -2064,12 +2064,11 @@ static int navi10_od_edit_dpm_table(struct smu_context 
*smu, enum PP_OD_DPM_TABL
od_table->UclkFmax = input[1];
break;
case PP_OD_RESTORE_DEFAULT_TABLE:
-   ret = smu_update_table(smu, SMU_TABLE_OVERDRIVE, 0, 
table_context->overdrive_table, false);
-   if (ret) {
-   pr_err("Failed to export over drive table!\n");
-   return ret;
+   if (!(table_context->overdrive_table && 
table_context->boot_overdrive_table)) {
+   pr_err("Overdrive table was not initialized!\n");
+   return -EINVAL;
}
-
+   memcpy(table_context->overdrive_table, 
table_context->boot_overdrive_table, sizeof(OverDriveTable_t));
break;
case PP_OD_COMMIT_DPM_TABLE:
navi10_dump_od_table(od_table);
diff --git a/drivers/gpu/drm/amd/powerplay/smu_v11_0.c 
b/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
index 02f8c9cb89d9..0dc49479a7eb 100644
--- a/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
+++ b/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
@@ -1882,6 +1882,12 @@ int smu_v11_0_set_default_od_settings(struct smu_context 
*smu, bool initialize,
pr_err("Failed to export overdrive table!\n");
return ret;
}
+   if (!table_context->boot_overdrive_table) {
+   table_context->boot_overdrive_table = 
kmemdup(table_context->overdrive_table, overdrive_table_size, GFP_KERNEL);
+   if (!table_context->boot_overdrive_table) {
+   return -ENOMEM;
+   }
+   }
}
ret = smu_update_table(smu, SMU_TABLE_OVERDRIVE, 0, 
table_context->overdrive_table, true);
if (ret) {
diff --git a/drivers/gpu/drm/amd/powerplay/vega20_ppt.c 
b/drivers/gpu/drm/amd/powerplay/vega20_ppt.c
index 38febd5ca4da..4ad8d6c14ee5 100644
--- a/drivers/gpu/drm/amd/powerplay/vega20_ppt.c
+++ b/drivers/gpu/drm/amd/powerplay/vega20_ppt.c
@@ -1706,22 +1706,11 @@ static int vega20_set_default_od_settings(struct 
smu_context *smu,
struct smu_table_context *table_context = &smu->smu_table;
int ret;
 
-   if (initialize) {
-   if (table_context->overdrive_table)
-   return -EINVAL;
-
-   table_context->overdrive_table = 
kzalloc(sizeof(OverDriveTable_t), GFP_KERNEL);
-
-   if (!table_context->overdrive_table)
-   return -ENOMEM;
-
-   ret = smu_update_table(smu, SMU_TABLE_OVERDRIVE, 0,
-  table_context->overdrive_table, false);
-   if (ret) {
-   pr_err("Failed to export over drive table!\n");
-   return ret;
-   }
+   ret = smu_v11_0_set_default_od_settings(smu, initialize, 
sizeof(OverDriveTable_t));
+   if (ret)
+   return ret;
 
+   if (initialize) {
ret = vega20_set_default_od8_setttings(smu);
if (ret)
return ret;
@@ -2778,12 +2767,11 @@ static int vega20_odn_edit_dpm_table(struct smu_context 
*smu,
break;
 
case PP_OD_RESTORE_DEFAULT_TABLE:
-   ret = smu_update_table(smu, SMU_TABLE_OVERDRIVE, 0, 
table_context->overdrive_table, false);
-   if (ret) {
-   pr_err("Failed to export over drive table!\n");
-   return ret;
+   if 

Re: [PATCH] drm/amdgpu/smu_v11_0: Correct behavior of restoring default tables

2020-01-28 Thread Alex Deucher
On Tue, Jan 28, 2020 at 3:10 PM Matt Coffin  wrote:
>
> Previously, the syfs functionality for restoring the default powerplay
> table was sourcing it's information from the currently-staged powerplay
> table.
>
> This patch adds a step to cache the first overdrive table that we see on
> boot, so that it can be used later to "restore" the powerplay table

Missing your signed-off by.  With that fixed, patch is:
Reviewed-by: Alex Deucher 

> ---
>  .../gpu/drm/amd/powerplay/inc/amdgpu_smu.h|  1 +
>  drivers/gpu/drm/amd/powerplay/navi10_ppt.c|  9 +++---
>  drivers/gpu/drm/amd/powerplay/smu_v11_0.c |  6 
>  drivers/gpu/drm/amd/powerplay/vega20_ppt.c| 28 ++-
>  4 files changed, 19 insertions(+), 25 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h 
> b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h
> index b0591a8dda41..1e33c3e9b98d 100644
> --- a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h
> +++ b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h
> @@ -273,6 +273,7 @@ struct smu_table_context
> uint8_t thermal_controller_type;
>
> void*overdrive_table;
> +   void*boot_overdrive_table;
>  };
>
>  struct smu_dpm_context {
> diff --git a/drivers/gpu/drm/amd/powerplay/navi10_ppt.c 
> b/drivers/gpu/drm/amd/powerplay/navi10_ppt.c
> index f60762f9b143..26cfccc57331 100644
> --- a/drivers/gpu/drm/amd/powerplay/navi10_ppt.c
> +++ b/drivers/gpu/drm/amd/powerplay/navi10_ppt.c
> @@ -2064,12 +2064,11 @@ static int navi10_od_edit_dpm_table(struct 
> smu_context *smu, enum PP_OD_DPM_TABL
> od_table->UclkFmax = input[1];
> break;
> case PP_OD_RESTORE_DEFAULT_TABLE:
> -   ret = smu_update_table(smu, SMU_TABLE_OVERDRIVE, 0, 
> table_context->overdrive_table, false);
> -   if (ret) {
> -   pr_err("Failed to export over drive table!\n");
> -   return ret;
> +   if (!(table_context->overdrive_table && 
> table_context->boot_overdrive_table)) {
> +   pr_err("Overdrive table was not initialized!\n");
> +   return -EINVAL;
> }
> -
> +   memcpy(table_context->overdrive_table, 
> table_context->boot_overdrive_table, sizeof(OverDriveTable_t));
> break;
> case PP_OD_COMMIT_DPM_TABLE:
> navi10_dump_od_table(od_table);
> diff --git a/drivers/gpu/drm/amd/powerplay/smu_v11_0.c 
> b/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
> index 02f8c9cb89d9..0dc49479a7eb 100644
> --- a/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
> +++ b/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
> @@ -1882,6 +1882,12 @@ int smu_v11_0_set_default_od_settings(struct 
> smu_context *smu, bool initialize,
> pr_err("Failed to export overdrive table!\n");
> return ret;
> }
> +   if (!table_context->boot_overdrive_table) {
> +   table_context->boot_overdrive_table = 
> kmemdup(table_context->overdrive_table, overdrive_table_size, GFP_KERNEL);
> +   if (!table_context->boot_overdrive_table) {
> +   return -ENOMEM;
> +   }
> +   }
> }
> ret = smu_update_table(smu, SMU_TABLE_OVERDRIVE, 0, 
> table_context->overdrive_table, true);
> if (ret) {
> diff --git a/drivers/gpu/drm/amd/powerplay/vega20_ppt.c 
> b/drivers/gpu/drm/amd/powerplay/vega20_ppt.c
> index 38febd5ca4da..4ad8d6c14ee5 100644
> --- a/drivers/gpu/drm/amd/powerplay/vega20_ppt.c
> +++ b/drivers/gpu/drm/amd/powerplay/vega20_ppt.c
> @@ -1706,22 +1706,11 @@ static int vega20_set_default_od_settings(struct 
> smu_context *smu,
> struct smu_table_context *table_context = &smu->smu_table;
> int ret;
>
> -   if (initialize) {
> -   if (table_context->overdrive_table)
> -   return -EINVAL;
> -
> -   table_context->overdrive_table = 
> kzalloc(sizeof(OverDriveTable_t), GFP_KERNEL);
> -
> -   if (!table_context->overdrive_table)
> -   return -ENOMEM;
> -
> -   ret = smu_update_table(smu, SMU_TABLE_OVERDRIVE, 0,
> -  table_context->overdrive_table, false);
> -   if (ret) {
> -   pr_err("Failed to export over drive table!\n");
> -   return ret;
> -   }
> +   ret = smu_v11_0_set_default_od_settings(smu, initialize, 
> sizeof(OverDriveTable_t));
> +   if (ret)
> +   return ret;
>
> +   if (initialize) {
> ret = vega20_set_default_od8_setttings(smu);
> if (ret)
> return ret;
> @@ -2778,12 +2767,11 @@ static int vega20_odn_edit_dpm_table(struct 
> smu_context *smu,
> 

Re: [PATCH] drm/amd/display: Move drm_dp_mst_atomic_check() to the front of dc_validate_global_state()

2020-01-28 Thread Mikita Lipski

Reviewed-by: Mikita Lipski 

Thanks!
Mikita

On 1/28/20 4:44 PM, Zhan Liu wrote:

[Why]
Need to do atomic check first, then validate global state.
If not, when connecting both MST and HDMI displays and
set a bad mode via xrandr, system will hang.

[How]
Move drm_dp_mst_atomic_check() to the front of
dc_validate_global_state().

Signed-off-by: Zhan Liu 
---
  drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 14 ++
  1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c 
b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index eed3ed7180fd..805d8d84ebb8 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -8256,6 +8256,16 @@ static int amdgpu_dm_atomic_check(struct drm_device *dev,
goto fail;
  #endif
  
+		/*

+* Perform validation of MST topology in the state:
+* We need to perform MST atomic check before calling
+* dc_validate_global_state(), or there is a chance
+* to get stuck in an infinite loop and hang eventually.
+*/
+   ret = drm_dp_mst_atomic_check(state);
+   if (ret)
+   goto fail;
+
if (dc_validate_global_state(dc, dm_state->context, false) != 
DC_OK) {
ret = -EINVAL;
goto fail;
@@ -8284,10 +8294,6 @@ static int amdgpu_dm_atomic_check(struct drm_device *dev,
dc_retain_state(old_dm_state->context);
}
}
-   /* Perform validation of MST topology in the state*/
-   ret = drm_dp_mst_atomic_check(state);
-   if (ret)
-   goto fail;
  
  	/* Store the overall update type for use later in atomic check. */

for_each_new_crtc_in_state (state, crtc, new_crtc_state, i) {



--
Thanks,
Mikita Lipski
Software Engineer 2, AMD
mikita.lip...@amd.com
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH] drm/amd/display: Move drm_dp_mst_atomic_check() to the front of dc_validate_global_state()

2020-01-28 Thread Zhan Liu
[Why]
Need to do atomic check first, then validate global state.
If not, when connecting both MST and HDMI displays and
set a bad mode via xrandr, system will hang.

[How]
Move drm_dp_mst_atomic_check() to the front of
dc_validate_global_state().

Signed-off-by: Zhan Liu 
---
 drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 14 ++
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c 
b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index eed3ed7180fd..805d8d84ebb8 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -8256,6 +8256,16 @@ static int amdgpu_dm_atomic_check(struct drm_device *dev,
goto fail;
 #endif
 
+   /*
+* Perform validation of MST topology in the state:
+* We need to perform MST atomic check before calling
+* dc_validate_global_state(), or there is a chance
+* to get stuck in an infinite loop and hang eventually.
+*/
+   ret = drm_dp_mst_atomic_check(state);
+   if (ret)
+   goto fail;
+
if (dc_validate_global_state(dc, dm_state->context, false) != 
DC_OK) {
ret = -EINVAL;
goto fail;
@@ -8284,10 +8294,6 @@ static int amdgpu_dm_atomic_check(struct drm_device *dev,
dc_retain_state(old_dm_state->context);
}
}
-   /* Perform validation of MST topology in the state*/
-   ret = drm_dp_mst_atomic_check(state);
-   if (ret)
-   goto fail;
 
/* Store the overall update type for use later in atomic check. */
for_each_new_crtc_in_state (state, crtc, new_crtc_state, i) {
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [PATCH] drm/amd/display: fix spelling mistake link_integiry_check -> link_integrity_check

2020-01-28 Thread Alex Deucher
On Tue, Jan 28, 2020 at 6:28 AM Colin King  wrote:
>
> From: Colin Ian King 
>
> There is a spelling mistake on the struct field name link_integiry_check,
> fix this by renaming it.
>
> Signed-off-by: Colin Ian King 

Applied.  Thanks!

Alex

> ---
>  drivers/gpu/drm/amd/display/modules/hdcp/hdcp.h   | 2 +-
>  .../gpu/drm/amd/display/modules/hdcp/hdcp1_execution.c| 8 
>  .../gpu/drm/amd/display/modules/hdcp/hdcp1_transition.c   | 4 ++--
>  3 files changed, 7 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/display/modules/hdcp/hdcp.h 
> b/drivers/gpu/drm/amd/display/modules/hdcp/hdcp.h
> index f98d3d9ecb6d..af78e4f1be68 100644
> --- a/drivers/gpu/drm/amd/display/modules/hdcp/hdcp.h
> +++ b/drivers/gpu/drm/amd/display/modules/hdcp/hdcp.h
> @@ -63,7 +63,7 @@ struct mod_hdcp_transition_input_hdcp1 {
> uint8_t hdcp_capable_dp;
> uint8_t binfo_read_dp;
> uint8_t r0p_available_dp;
> -   uint8_t link_integiry_check;
> +   uint8_t link_integrity_check;
> uint8_t reauth_request_check;
> uint8_t stream_encryption_dp;
>  };
> diff --git a/drivers/gpu/drm/amd/display/modules/hdcp/hdcp1_execution.c 
> b/drivers/gpu/drm/amd/display/modules/hdcp/hdcp1_execution.c
> index 04845e43df15..37670db64855 100644
> --- a/drivers/gpu/drm/amd/display/modules/hdcp/hdcp1_execution.c
> +++ b/drivers/gpu/drm/amd/display/modules/hdcp/hdcp1_execution.c
> @@ -283,8 +283,8 @@ static enum mod_hdcp_status wait_for_ready(struct 
> mod_hdcp *hdcp,
> hdcp, "bstatus_read"))
> goto out;
> if (!mod_hdcp_execute_and_set(check_link_integrity_dp,
> -   >link_integiry_check, ,
> -   hdcp, "link_integiry_check"))
> +   >link_integrity_check, ,
> +   hdcp, "link_integrity_check"))
> goto out;
> if 
> (!mod_hdcp_execute_and_set(check_no_reauthentication_request_dp,
> >reauth_request_check, ,
> @@ -431,8 +431,8 @@ static enum mod_hdcp_status authenticated_dp(struct 
> mod_hdcp *hdcp,
> hdcp, "bstatus_read"))
> goto out;
> if (!mod_hdcp_execute_and_set(check_link_integrity_dp,
> -   >link_integiry_check, ,
> -   hdcp, "link_integiry_check"))
> +   >link_integrity_check, ,
> +   hdcp, "link_integrity_check"))
> goto out;
> if (!mod_hdcp_execute_and_set(check_no_reauthentication_request_dp,
> >reauth_request_check, ,
> diff --git a/drivers/gpu/drm/amd/display/modules/hdcp/hdcp1_transition.c 
> b/drivers/gpu/drm/amd/display/modules/hdcp/hdcp1_transition.c
> index 21ebc62bb9d9..76edcbe51f71 100644
> --- a/drivers/gpu/drm/amd/display/modules/hdcp/hdcp1_transition.c
> +++ b/drivers/gpu/drm/amd/display/modules/hdcp/hdcp1_transition.c
> @@ -241,7 +241,7 @@ enum mod_hdcp_status mod_hdcp_hdcp1_dp_transition(struct 
> mod_hdcp *hdcp,
> }
> break;
> case D1_A4_AUTHENTICATED:
> -   if (input->link_integiry_check != PASS ||
> +   if (input->link_integrity_check != PASS ||
> input->reauth_request_check != PASS) {
> /* 1A-07: restart hdcp on a link integrity failure */
> fail_and_restart_in_ms(0, , output);
> @@ -249,7 +249,7 @@ enum mod_hdcp_status mod_hdcp_hdcp1_dp_transition(struct 
> mod_hdcp *hdcp,
> }
> break;
> case D1_A6_WAIT_FOR_READY:
> -   if (input->link_integiry_check == FAIL ||
> +   if (input->link_integrity_check == FAIL ||
> input->reauth_request_check == FAIL) {
> fail_and_restart_in_ms(0, , output);
> break;
> --
> 2.24.0
>
> ___
> dri-devel mailing list
> dri-de...@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/dri-devel
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [PATCH] drm/amdgpu/navi10: add mclk to navi10_get_clock_by_type_with_latency

2020-01-28 Thread Alex Deucher
On Tue, Jan 28, 2020 at 1:50 PM Matt Coffin  wrote:
>
> Let me know if I'm not doing this correctly since I'm still new.
>
>
>
> Patch is
>
> Reviewed-by: Matt Coffin 

Thanks!

Alex

>
> On 1/25/20 11:33 AM, Alex Deucher wrote:
> > Doesn't seem to be used, but add it just in case.
> >
> > Signed-off-by: Alex Deucher 
> > ---
> >  drivers/gpu/drm/amd/powerplay/navi10_ppt.c | 2 ++
> >  1 file changed, 2 insertions(+)
> >
> > diff --git a/drivers/gpu/drm/amd/powerplay/navi10_ppt.c 
> > b/drivers/gpu/drm/amd/powerplay/navi10_ppt.c
> > index 015f5ea38901..f60762f9b143 100644
> > --- a/drivers/gpu/drm/amd/powerplay/navi10_ppt.c
> > +++ b/drivers/gpu/drm/amd/powerplay/navi10_ppt.c
> > @@ -1008,6 +1008,8 @@ static int 
> > navi10_get_clock_by_type_with_latency(struct smu_context *smu,
> >   case SMU_GFXCLK:
> >   case SMU_DCEFCLK:
> >   case SMU_SOCCLK:
> > + case SMU_MCLK:
> > + case SMU_UCLK:
> >   ret = smu_get_dpm_level_count(smu, clk_type, _count);
> >   if (ret)
> >   return ret;
> >
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [Patch v1 3/5] drm/amdkfd: Introduce debugfs option to disable baco

2020-01-28 Thread Alex Deucher
On Mon, Jan 27, 2020 at 8:30 PM Rajneesh Bhardwaj
 wrote:
>
> When BACO is enabled by default, sometimes it can cause additional
> trouble to debug KFD issues. This debugfs override allows to temporarily
> disable BACO for debug purpose without having to reboot the machine.
>
> However, in some cases one suspend-resume cycle might be needed if
> the device is already runtime suspended.
>
> e.g
>
> sudo rtcwake -m < mem or freeze > -s 15
>
> or
>
> by triggering autosuspend event from user space, by doing something
> like:
>
> echo 6000 > /sys/bus/pci/devices/\:03\:00.0/power/autosuspend_delay_ms
>
> Usage:
>
> echo 0 > /sys/kernel/debug/kfd/enable_baco and run
> cat /sys/kernel/debug/kfd/baco_status to verify whether BACO is
> enabled or disabled by kfd driver.
>
> It should be noted that while enabling baco again via kfd override, we
> must do the following steps:
>
> 1. echo 0 > /sys/kernel/debug/kfd/enable_baco
> 2. sudo rtcwake -m < mem > -s 15
>
> In this case, we need GPU to be fully reset which is done by BIOS. This
> is not possible in case of S2idle i.e. freeze so we must use mem for
> sleep.
>

I think we can drop this patch in favor of just using the standard
runtime pm control.  E.g.,
/sys/class/drm/card0/device/power/control

Also, the underlying mechanism may not always be BACO.  E.g., hybrid
laptops use BOCO (d3cold).  So it would be better to make the naming
more generic (e.g., runpm_enable, runpm_status).  This is also kfd
specific.  If we actually do want something like this, I think it
should be at the device level in amdgpu.

Alex

> Signed-off-by: Rajneesh Bhardwaj 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h |  1 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  3 +++
>  drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c   | 27 ++
>  3 files changed, 31 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> index 47b0f2957d1f..0fa898d30207 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> @@ -34,6 +34,7 @@
>  #include "amdgpu_vm.h"
>
>  extern uint64_t amdgpu_amdkfd_total_mem_size;
> +extern bool kfd_enable_baco;
>
>  struct amdgpu_device;
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 2c64d2a83d61..d9e5eac182d3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -3259,6 +3259,9 @@ int amdgpu_device_suspend(struct drm_device *dev, bool 
> fbcon)
> return -ENODEV;
> }
>
> +   if (!kfd_enable_baco)
> +   return -EBUSY;
> +
> adev = dev->dev_private;
>
> if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c
> index 511712c2e382..c6f6ff2ff603 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c
> @@ -26,6 +26,7 @@
>  #include "kfd_priv.h"
>
>  static struct dentry *debugfs_root;
> +bool kfd_enable_baco = true;
>
>  static int kfd_debugfs_open(struct inode *inode, struct file *file)
>  {
> @@ -83,6 +84,28 @@ static const struct file_operations 
> kfd_debugfs_hang_hws_fops = {
> .release = single_release,
>  };
>
> +static int baco_sts_set(void *data, u64 val)
> +{
> +   if (!val)
> +   kfd_enable_baco = false;
> +   else
> +   kfd_enable_baco = true;
> +
> +   return 0;
> +}
> +DEFINE_SIMPLE_ATTRIBUTE(fops_baco_enable, NULL, baco_sts_set, "%lld\n");
> +
> +static int baco_sts_show(struct seq_file *s, void *unused)
> +{
> +   if (kfd_enable_baco)
> +   seq_puts(s, "Baco is Enabled\n");
> +   else
> +   seq_puts(s, "Baco is Disabled\n");
> +
> +   return 0;
> +}
> +DEFINE_SHOW_ATTRIBUTE(baco_sts);
> +
>  void kfd_debugfs_init(void)
>  {
> debugfs_root = debugfs_create_dir("kfd", NULL);
> @@ -95,6 +118,10 @@ void kfd_debugfs_init(void)
> kfd_debugfs_rls_by_device, _debugfs_fops);
> debugfs_create_file("hang_hws", S_IFREG | 0200, debugfs_root,
> NULL, _debugfs_hang_hws_fops);
> +   debugfs_create_file("enable_baco", 0644, debugfs_root, NULL,
> +   _baco_enable);
> +   debugfs_create_file("baco_status", 0444, debugfs_root, NULL,
> +   _sts_fops);
>  }
>
>  void kfd_debugfs_fini(void)
> --
> 2.17.1
>
> ___
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [Patch v1 2/5] drm/amdgpu: Fix missing error check in suspend

2020-01-28 Thread Alex Deucher
On Mon, Jan 27, 2020 at 8:30 PM Rajneesh Bhardwaj
 wrote:
>
> amdgpu_device_suspend might return an error code since it can be called
> from both runtime and system suspend flows. Add the missing return code
> in case of a failure.
>
> Signed-off-by: Rajneesh Bhardwaj 

Reviewed-by: Alex Deucher 

> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 3 +++
>  1 file changed, 3 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index 137e76f0e3db..71466df6dff6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -1236,6 +1236,9 @@ static int amdgpu_pmops_runtime_suspend(struct device 
> *dev)
> drm_kms_helper_poll_disable(drm_dev);
>
> ret = amdgpu_device_suspend(drm_dev, false);
> +   if (ret)
> +   return ret;
> +
> if (amdgpu_device_supports_boco(drm_dev)) {
> /* Only need to handle PCI state in the driver for ATPX
>  * PCI core handles it for _PR3.
> --
> 2.17.1
>
> ___
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [Patch v1 1/5] drm/amdgpu: always enable runtime power management

2020-01-28 Thread Alex Deucher
On Mon, Jan 27, 2020 at 8:30 PM Rajneesh Bhardwaj
 wrote:
>
> This allows runtime power management to kick in on amdgpu driver when
> the underlying hardware supports either BOCO or BACO. This can still be
> avoided if boot arg amdgpu.runpm = 0 is supplied.
>
> BOCO: Bus Off, Chip Off
> BACO: Bus Alive, Chip Off
>
> Signed-off-by: Rajneesh Bhardwaj 

This patch should be the last one in the series, otherwise we'll
enable runpm on BACO capable devices before the KFD code is in place.
Also, it's only supported on VI and newer asics, so we should use this
patch instead:
https://patchwork.freedesktop.org/patch/335402/

Alex

> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 9 -
>  1 file changed, 4 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> index 3a0ea9096498..7958d508486e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> @@ -169,11 +169,10 @@ int amdgpu_driver_load_kms(struct drm_device *dev, 
> unsigned long flags)
> goto out;
> }
>
> -   if (amdgpu_device_supports_boco(dev) &&
> -   (amdgpu_runtime_pm != 0)) /* enable runpm by default */
> -   adev->runpm = true;
> -   else if (amdgpu_device_supports_baco(dev) &&
> -(amdgpu_runtime_pm > 0)) /* enable runpm if runpm=1 */
> +   /* always enable runtime power management except when amdgpu.runpm=0 
> */
> +   if ((amdgpu_device_supports_boco(dev) ||
> +   amdgpu_device_supports_baco(dev))
> +   && (amdgpu_runtime_pm != 0))
> adev->runpm = true;
>
> /* Call ACPI methods: require modeset init
> --
> 2.17.1
>
> ___
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [Patch v1 0/5] Enable BACO with KFD

2020-01-28 Thread Alex Deucher
On Tue, Jan 28, 2020 at 12:39 PM Mike Lothian  wrote:
>
> Is this designed to work with PRIME laptops too?
>

Yes.  The title should really be runtime pm rather than BACO
specifically.  The underlying power savings mechanism (BOCO, BACO,
etc.) varies from platform to platform.

Alex

> On Tue, 28 Jan 2020 at 01:29, Rajneesh Bhardwaj
>  wrote:
> >
> > This series aims to enable BACO by default on supported AMD platforms
> > and ensures that the AMD Kernel Fusion Driver can co-exist with this
> > feature when the GPU devices are runtime suspended and firmware pushes
> > the envelope to save more power with BACO entry sequence. Current
> > approach makes sure that if KFD is using GPU services for compute, it
> > won't let AMDGPU driver suspend and if the AMDGPU driver is already
> > runtime suspended with GPUs in deep power saving mode with BACO, the KFD
> > driver wakes up the AMDGPU and then starts the compute workload
> > execution.
> >
> > This series has been tested with a single GPU (fiji), Dual GPUs (fiji
> > and fiji/tonga) and seems to work fine. I have not seen but expect some
> > corner cases where for some reason a KFD client binds a process but is
> > unable to complete the task within 60 seconds. Ideally we should put the
> > runtime_put and auto_suspend code in some function which is logical
> > opposite of bind_process_to_device but during my experiments, it ended
> > up with random gpu hang, machine reboot etc so any comments for
> > improvement are welcome.
> >
> > Todo:
> >
> > rebase on top of https://patchwork.freedesktop.org/patch/338037/
> >
> > Rajneesh Bhardwaj (5):
> >   drm/amdgpu: always enable runtime power management
> >   drm/amdgpu: Fix missing error check in suspend
> >   drm/amdkfd: Introduce debugfs option to disable baco
> >   drm/amdkfd: show warning when kfd is locked
> >   drm/amdkfd: refactor runtime pm for baco
> >
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 12 -
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h |  9 ---
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  7 +++--
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c|  3 +++
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c|  9 +++
> >  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c   |  2 ++
> >  drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c   | 27 +++
> >  drivers/gpu/drm/amd/amdkfd/kfd_device.c| 31 +-
> >  drivers/gpu/drm/amd/amdkfd/kfd_iommu.c |  5 +++-
> >  drivers/gpu/drm/amd/amdkfd/kfd_process.c   | 19 ++---
> >  10 files changed, 91 insertions(+), 33 deletions(-)
> >
> > --
> > 2.17.1
> >
> > ___
> > amd-gfx mailing list
> > amd-gfx@lists.freedesktop.org
> > https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> ___
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH] drm/amdgpu/smu_v11_0: Correct behavior of restoring default tables

2020-01-28 Thread Matt Coffin
Previously, the sysfs functionality for restoring the default powerplay
table was sourcing its information from the currently-staged powerplay
table.

This patch adds a step to cache the first overdrive table that we see on
boot, so that it can be used later to "restore" the powerplay table
---
 .../gpu/drm/amd/powerplay/inc/amdgpu_smu.h|  1 +
 drivers/gpu/drm/amd/powerplay/navi10_ppt.c|  9 +++---
 drivers/gpu/drm/amd/powerplay/smu_v11_0.c |  6 
 drivers/gpu/drm/amd/powerplay/vega20_ppt.c| 28 ++-
 4 files changed, 19 insertions(+), 25 deletions(-)

diff --git a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h 
b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h
index b0591a8dda41..1e33c3e9b98d 100644
--- a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h
+++ b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h
@@ -273,6 +273,7 @@ struct smu_table_context
uint8_t thermal_controller_type;
 
void*overdrive_table;
+   void*boot_overdrive_table;
 };
 
 struct smu_dpm_context {
diff --git a/drivers/gpu/drm/amd/powerplay/navi10_ppt.c 
b/drivers/gpu/drm/amd/powerplay/navi10_ppt.c
index f60762f9b143..26cfccc57331 100644
--- a/drivers/gpu/drm/amd/powerplay/navi10_ppt.c
+++ b/drivers/gpu/drm/amd/powerplay/navi10_ppt.c
@@ -2064,12 +2064,11 @@ static int navi10_od_edit_dpm_table(struct smu_context 
*smu, enum PP_OD_DPM_TABL
od_table->UclkFmax = input[1];
break;
case PP_OD_RESTORE_DEFAULT_TABLE:
-   ret = smu_update_table(smu, SMU_TABLE_OVERDRIVE, 0, 
table_context->overdrive_table, false);
-   if (ret) {
-   pr_err("Failed to export over drive table!\n");
-   return ret;
+   if (!(table_context->overdrive_table && 
table_context->boot_overdrive_table)) {
+   pr_err("Overdrive table was not initialized!\n");
+   return -EINVAL;
}
-
+   memcpy(table_context->overdrive_table, 
table_context->boot_overdrive_table, sizeof(OverDriveTable_t));
break;
case PP_OD_COMMIT_DPM_TABLE:
navi10_dump_od_table(od_table);
diff --git a/drivers/gpu/drm/amd/powerplay/smu_v11_0.c 
b/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
index 02f8c9cb89d9..0dc49479a7eb 100644
--- a/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
+++ b/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
@@ -1882,6 +1882,12 @@ int smu_v11_0_set_default_od_settings(struct smu_context 
*smu, bool initialize,
pr_err("Failed to export overdrive table!\n");
return ret;
}
+   if (!table_context->boot_overdrive_table) {
+   table_context->boot_overdrive_table = 
kmemdup(table_context->overdrive_table, overdrive_table_size, GFP_KERNEL);
+   if (!table_context->boot_overdrive_table) {
+   return -ENOMEM;
+   }
+   }
}
ret = smu_update_table(smu, SMU_TABLE_OVERDRIVE, 0, 
table_context->overdrive_table, true);
if (ret) {
diff --git a/drivers/gpu/drm/amd/powerplay/vega20_ppt.c 
b/drivers/gpu/drm/amd/powerplay/vega20_ppt.c
index 38febd5ca4da..4ad8d6c14ee5 100644
--- a/drivers/gpu/drm/amd/powerplay/vega20_ppt.c
+++ b/drivers/gpu/drm/amd/powerplay/vega20_ppt.c
@@ -1706,22 +1706,11 @@ static int vega20_set_default_od_settings(struct 
smu_context *smu,
struct smu_table_context *table_context = >smu_table;
int ret;
 
-   if (initialize) {
-   if (table_context->overdrive_table)
-   return -EINVAL;
-
-   table_context->overdrive_table = 
kzalloc(sizeof(OverDriveTable_t), GFP_KERNEL);
-
-   if (!table_context->overdrive_table)
-   return -ENOMEM;
-
-   ret = smu_update_table(smu, SMU_TABLE_OVERDRIVE, 0,
-  table_context->overdrive_table, false);
-   if (ret) {
-   pr_err("Failed to export over drive table!\n");
-   return ret;
-   }
+   ret = smu_v11_0_set_default_od_settings(smu, initialize, 
sizeof(OverDriveTable_t));
+   if (ret)
+   return ret;
 
+   if (initialize) {
ret = vega20_set_default_od8_setttings(smu);
if (ret)
return ret;
@@ -2778,12 +2767,11 @@ static int vega20_odn_edit_dpm_table(struct smu_context 
*smu,
break;
 
case PP_OD_RESTORE_DEFAULT_TABLE:
-   ret = smu_update_table(smu, SMU_TABLE_OVERDRIVE, 0, 
table_context->overdrive_table, false);
-   if (ret) {
-   pr_err("Failed to export over drive table!\n");
-   return ret;
+   if (!(table_context->overdrive_table && 

RE: [Patch v1 5/5] drm/amdkfd: refactor runtime pm for baco

2020-01-28 Thread Zeng, Oak
[AMD Official Use Only - Internal Distribution Only]



Regards,
Oak

-Original Message-
From: amd-gfx  On Behalf Of Rajneesh 
Bhardwaj
Sent: Monday, January 27, 2020 8:29 PM
To: amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander ; Kuehling, Felix 
; Bhardwaj, Rajneesh 
Subject: [Patch v1 5/5] drm/amdkfd: refactor runtime pm for baco

So far the kfd driver implemented same routines for runtime and system wide 
suspend and resume (s2idle or mem). During system wide suspend the kfd acquires 
an atomic lock that prevents any more user processes from creating queues and 
interact with kfd driver and amd gpu. This mechanism created problem when 
amdgpu device is runtime suspended with BACO enabled. Any application that 
relies on kfd driver fails to load because the driver reports a locked kfd 
device since gpu is runtime suspended.

However, in an ideal case, when gpu is runtime  suspended the kfd driver should 
be able to:

 - auto resume amdgpu driver whenever a client requests compute service
 - prevent runtime suspend for amdgpu  while kfd is in use

This change refactors the amdgpu and amdkfd drivers to support BACO and runtime 
runtime power management.
[Oak] two runtime above

Signed-off-by: Rajneesh Bhardwaj 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 12 -  
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h |  8 +++---  
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  4 +--
 drivers/gpu/drm/amd/amdkfd/kfd_device.c| 31 +-
 drivers/gpu/drm/amd/amdkfd/kfd_iommu.c |  5 +++-
 drivers/gpu/drm/amd/amdkfd/kfd_process.c   | 19 ++---
 6 files changed, 51 insertions(+), 28 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 8609287620ea..314c4a2a0354 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -178,18 +178,18 @@ void amdgpu_amdkfd_interrupt(struct amdgpu_device *adev,
kgd2kfd_interrupt(adev->kfd.dev, ih_ring_entry);  }
 
-void amdgpu_amdkfd_suspend(struct amdgpu_device *adev)
+void amdgpu_amdkfd_suspend(struct amdgpu_device *adev, bool run_pm)
[Oak] then name run_pm is a little bit confusing. Maybe system_wide_pm or 
none_runtime_pm?
 {
if (adev->kfd.dev)
-   kgd2kfd_suspend(adev->kfd.dev);
+   kgd2kfd_suspend(adev->kfd.dev, run_pm);
 }
 
-int amdgpu_amdkfd_resume(struct amdgpu_device *adev)
+int amdgpu_amdkfd_resume(struct amdgpu_device *adev, bool run_pm)
 {
int r = 0;
 
if (adev->kfd.dev)
-   r = kgd2kfd_resume(adev->kfd.dev);
+   r = kgd2kfd_resume(adev->kfd.dev, run_pm);
 
return r;
 }
@@ -713,11 +713,11 @@ void kgd2kfd_exit(void)  {  }
 
-void kgd2kfd_suspend(struct kfd_dev *kfd)
+void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm)
 {
 }
 
-int kgd2kfd_resume(struct kfd_dev *kfd)
+int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm)
 {
return 0;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 0fa898d30207..3dce4a51e522 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -123,8 +123,8 @@ struct amdkfd_process_info {  int amdgpu_amdkfd_init(void); 
 void amdgpu_amdkfd_fini(void);
 
-void amdgpu_amdkfd_suspend(struct amdgpu_device *adev); -int 
amdgpu_amdkfd_resume(struct amdgpu_device *adev);
+void amdgpu_amdkfd_suspend(struct amdgpu_device *adev, bool run_pm); 
+int amdgpu_amdkfd_resume(struct amdgpu_device *adev, bool run_pm);
 void amdgpu_amdkfd_interrupt(struct amdgpu_device *adev,
const void *ih_ring_entry);
 void amdgpu_amdkfd_device_probe(struct amdgpu_device *adev); @@ -250,8 +250,8 
@@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
 struct drm_device *ddev,
 const struct kgd2kfd_shared_resources *gpu_resources); 
 void kgd2kfd_device_exit(struct kfd_dev *kfd); -void kgd2kfd_suspend(struct 
kfd_dev *kfd); -int kgd2kfd_resume(struct kfd_dev *kfd);
+void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm); int 
+kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm);
 int kgd2kfd_pre_reset(struct kfd_dev *kfd);  int kgd2kfd_post_reset(struct 
kfd_dev *kfd);  void kgd2kfd_interrupt(struct kfd_dev *kfd, const void 
*ih_ring_entry); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index d9e5eac182d3..787d49e9f4de 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3314,7 +3314,7 @@ int amdgpu_device_suspend(struct drm_device *dev, bool 
fbcon)
}
}
 
-   amdgpu_amdkfd_suspend(adev);
+   amdgpu_amdkfd_suspend(adev, fbcon);
 
amdgpu_ras_suspend(adev);
 
@@ -3398,7 +3398,7 @@ int amdgpu_device_resume(struct drm_device *dev, bool 
fbcon)
}
}
}
-   r = 

Re: [PATCH 3/3] drm/amdgpu/navi10: add OD support for restoring default table

2020-01-28 Thread Alex Deucher
On Tue, Jan 28, 2020 at 1:49 PM Matt Coffin  wrote:
>
>
>
> On 1/28/20 10:26 AM, Alex Deucher wrote:
> > On Tue, Jan 28, 2020 at 11:44 AM Matt Coffin  wrote:
>
> > I just copied that vega20 did.  You may be right.  I haven't paged the
> > recent SMU interface stuff into my head in a while.  If so, we should
> > also fix the vega20_ppt.c code.
>
> The vega20_ppt code was correct, until we implemented the ability for
> the user to write to that overdrive table, which will land in 5.5.
>
> Not entirely sure about the canonical way to distribute changes to
> someone else's series, but I can take a crack at fixing this.
>

Thanks.  Just go ahead and send patches.

Alex


> >>
> >> On 1/25/20 11:48 AM, Alex Deucher wrote:
> >>> Was missing before.
> >>>
> >>> Bug: https://gitlab.freedesktop.org/drm/amd/issues/1020
> >>> Signed-off-by: Alex Deucher 
> >>> ---
> >>>  drivers/gpu/drm/amd/powerplay/navi10_ppt.c | 8 
> >>>  1 file changed, 8 insertions(+)
> >>>
> >>> diff --git a/drivers/gpu/drm/amd/powerplay/navi10_ppt.c 
> >>> b/drivers/gpu/drm/amd/powerplay/navi10_ppt.c
> >>> index d2d45181ae23..f60762f9b143 100644
> >>> --- a/drivers/gpu/drm/amd/powerplay/navi10_ppt.c
> >>> +++ b/drivers/gpu/drm/amd/powerplay/navi10_ppt.c
> >>> @@ -2062,6 +2062,14 @@ static int navi10_od_edit_dpm_table(struct 
> >>> smu_context *smu, enum PP_OD_DPM_TABL
> >>>   if (ret)
> >>>   return ret;
> >>>   od_table->UclkFmax = input[1];
> >>> + break;
> >>> + case PP_OD_RESTORE_DEFAULT_TABLE:
> >>> + ret = smu_update_table(smu, SMU_TABLE_OVERDRIVE, 0, 
> >>> table_context->overdrive_table, false);
> >>> + if (ret) {
> >>> + pr_err("Failed to export over drive table!\n");
> >>> + return ret;
> >>> + }
> >>> +
> >>>   break;
> >>>   case PP_OD_COMMIT_DPM_TABLE:
> >>>   navi10_dump_od_table(od_table);
> >>>
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [PATCH] drm/amdgpu: correct vram mgr node size calculation

2020-01-28 Thread Christian König

Am 28.01.20 um 20:55 schrieb Philip Yang:

Use pages_per_node instead of mem->num_pages to alloc vram; this will
increase the chance of getting a vram node when vram is fragmented.


NAK, this is intentional to aid TLB with continuous allocations.

The fallback to using pages_per_node is directly below, so your patch 
should be completely superfluous.


Regards,
Christian.



Signed-off-by: Philip Yang 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 11 +++
  1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
index 82a3299e53c0..67a454f4c37a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -369,7 +369,7 @@ static int amdgpu_vram_mgr_new(struct ttm_mem_type_manager 
*man,
  
  	spin_lock(>lock);

for (i = 0; pages_left >= pages_per_node; ++i) {
-   unsigned long pages = rounddown_pow_of_two(pages_left);
+   unsigned long pages = rounddown_pow_of_two(pages_per_node);
  
  		r = drm_mm_insert_node_in_range(mm, [i], pages,

pages_per_node, 0,
@@ -383,15 +383,11 @@ static int amdgpu_vram_mgr_new(struct 
ttm_mem_type_manager *man,
pages_left -= pages;
}
  
-	for (; pages_left; ++i) {

-   unsigned long pages = min(pages_left, pages_per_node);
+   if (pages_left) {
uint32_t alignment = mem->page_alignment;
  
-		if (pages == pages_per_node)

-   alignment = pages_per_node;
-
r = drm_mm_insert_node_in_range(mm, [i],
-   pages, alignment, 0,
+   pages_left, alignment, 0,
place->fpfn, lpfn,
mode);
if (unlikely(r))
@@ -399,7 +395,6 @@ static int amdgpu_vram_mgr_new(struct ttm_mem_type_manager 
*man,
  
  		vis_usage += amdgpu_vram_mgr_vis_size(adev, [i]);

amdgpu_vram_mgr_virt_start(mem, [i]);
-   pages_left -= pages;
}
spin_unlock(>lock);
  


___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH] drm/amdgpu: correct vram mgr node size calculation

2020-01-28 Thread Philip Yang
Use pages_per_node instead of mem->num_pages to alloc vram; this will
increase the chance of getting a vram node when vram is fragmented.

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 11 +++
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
index 82a3299e53c0..67a454f4c37a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -369,7 +369,7 @@ static int amdgpu_vram_mgr_new(struct ttm_mem_type_manager 
*man,
 
spin_lock(>lock);
for (i = 0; pages_left >= pages_per_node; ++i) {
-   unsigned long pages = rounddown_pow_of_two(pages_left);
+   unsigned long pages = rounddown_pow_of_two(pages_per_node);
 
r = drm_mm_insert_node_in_range(mm, [i], pages,
pages_per_node, 0,
@@ -383,15 +383,11 @@ static int amdgpu_vram_mgr_new(struct 
ttm_mem_type_manager *man,
pages_left -= pages;
}
 
-   for (; pages_left; ++i) {
-   unsigned long pages = min(pages_left, pages_per_node);
+   if (pages_left) {
uint32_t alignment = mem->page_alignment;
 
-   if (pages == pages_per_node)
-   alignment = pages_per_node;
-
r = drm_mm_insert_node_in_range(mm, [i],
-   pages, alignment, 0,
+   pages_left, alignment, 0,
place->fpfn, lpfn,
mode);
if (unlikely(r))
@@ -399,7 +395,6 @@ static int amdgpu_vram_mgr_new(struct ttm_mem_type_manager 
*man,
 
vis_usage += amdgpu_vram_mgr_vis_size(adev, [i]);
amdgpu_vram_mgr_virt_start(mem, [i]);
-   pages_left -= pages;
}
spin_unlock(>lock);
 
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 1/2] drm/amdgpu/display: handle multiple numbers of fclks in dcn_calcs.c

2020-01-28 Thread Alex Deucher
We might get different numbers of clocks from powerplay depending
on what the OEM has populated.

Bug: https://gitlab.freedesktop.org/drm/amd/issues/963
Signed-off-by: Alex Deucher 
---
 .../gpu/drm/amd/display/dc/calcs/dcn_calcs.c  | 31 ---
 1 file changed, 20 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/dc/calcs/dcn_calcs.c 
b/drivers/gpu/drm/amd/display/dc/calcs/dcn_calcs.c
index a27d84ca15a5..8ad32a11d363 100644
--- a/drivers/gpu/drm/amd/display/dc/calcs/dcn_calcs.c
+++ b/drivers/gpu/drm/amd/display/dc/calcs/dcn_calcs.c
@@ -1446,17 +1446,26 @@ void dcn_bw_update_from_pplib(struct dc *dc)
res = verify_clock_values();
 
if (res) {
-   ASSERT(fclks.num_levels >= 3);
-   dc->dcn_soc->fabric_and_dram_bandwidth_vmin0p65 = 32 * 
(fclks.data[0].clocks_in_khz / 1000.0) / 1000.0;
-   dc->dcn_soc->fabric_and_dram_bandwidth_vmid0p72 = 
dc->dcn_soc->number_of_channels *
-   (fclks.data[fclks.num_levels - 
(fclks.num_levels > 2 ? 3 : 2)].clocks_in_khz / 1000.0)
-   * ddr4_dram_factor_single_Channel / 1000.0;
-   dc->dcn_soc->fabric_and_dram_bandwidth_vnom0p8 = 
dc->dcn_soc->number_of_channels *
-   (fclks.data[fclks.num_levels - 2].clocks_in_khz 
/ 1000.0)
-   * ddr4_dram_factor_single_Channel / 1000.0;
-   dc->dcn_soc->fabric_and_dram_bandwidth_vmax0p9 = 
dc->dcn_soc->number_of_channels *
-   (fclks.data[fclks.num_levels - 1].clocks_in_khz 
/ 1000.0)
-   * ddr4_dram_factor_single_Channel / 1000.0;
+   unsigned vmin0p65_idx = 0;
+   unsigned vmid0p72_idx = fclks.num_levels -
+   (fclks.num_levels > 2 ? 3 : (fclks.num_levels > 1 ? 2 : 
1));
+   unsigned vnom0p8_idx = fclks.num_levels - (fclks.num_levels > 1 
? 2 : 1);
+   unsigned vmax0p9_idx = fclks.num_levels - 1;
+
+   dc->dcn_soc->fabric_and_dram_bandwidth_vmin0p65 =
+   32 * (fclks.data[vmin0p65_idx].clocks_in_khz / 1000.0) 
/ 1000.0;
+   dc->dcn_soc->fabric_and_dram_bandwidth_vmid0p72 =
+   dc->dcn_soc->number_of_channels *
+   (fclks.data[vmid0p72_idx].clocks_in_khz / 1000.0)
+   * ddr4_dram_factor_single_Channel / 1000.0;
+   dc->dcn_soc->fabric_and_dram_bandwidth_vnom0p8 =
+   dc->dcn_soc->number_of_channels *
+   (fclks.data[vnom0p8_idx].clocks_in_khz / 1000.0)
+   * ddr4_dram_factor_single_Channel / 1000.0;
+   dc->dcn_soc->fabric_and_dram_bandwidth_vmax0p9 =
+   dc->dcn_soc->number_of_channels *
+   (fclks.data[vmax0p9_idx].clocks_in_khz / 1000.0)
+   * ddr4_dram_factor_single_Channel / 1000.0;
} else
BREAK_TO_DEBUGGER();
 
-- 
2.24.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 2/2] drm/amdgpu/smu10: fix smu10_get_clock_by_type_with_latency

2020-01-28 Thread Alex Deucher
Only send non-0 clocks to DC for validation.  This mirrors
what the windows driver does.

Bug: https://gitlab.freedesktop.org/drm/amd/issues/963
Signed-off-by: Alex Deucher 
---
 drivers/gpu/drm/amd/powerplay/hwmgr/smu10_hwmgr.c | 15 +--
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/powerplay/hwmgr/smu10_hwmgr.c 
b/drivers/gpu/drm/amd/powerplay/hwmgr/smu10_hwmgr.c
index 4e8ab139bb3b..273126cfc37d 100644
--- a/drivers/gpu/drm/amd/powerplay/hwmgr/smu10_hwmgr.c
+++ b/drivers/gpu/drm/amd/powerplay/hwmgr/smu10_hwmgr.c
@@ -1026,12 +1026,15 @@ static int smu10_get_clock_by_type_with_latency(struct 
pp_hwmgr *hwmgr,
 
clocks->num_levels = 0;
for (i = 0; i < pclk_vol_table->count; i++) {
-   clocks->data[i].clocks_in_khz = pclk_vol_table->entries[i].clk 
* 10;
-   clocks->data[i].latency_in_us = latency_required ?
-   smu10_get_mem_latency(hwmgr,
-   pclk_vol_table->entries[i].clk) 
:
-   0;
-   clocks->num_levels++;
+   if (pclk_vol_table->entries[i].clk) {
+   clocks->data[clocks->num_levels].clocks_in_khz =
+   pclk_vol_table->entries[i].clk * 10;
+   clocks->data[clocks->num_levels].latency_in_us = 
latency_required ?
+   smu10_get_mem_latency(hwmgr,
+ 
pclk_vol_table->entries[i].clk) :
+   0;
+   clocks->num_levels++;
+   }
}
 
return 0;
-- 
2.24.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [PATCH] drm/amdgpu/navi10: add mclk to navi10_get_clock_by_type_with_latency

2020-01-28 Thread Matt Coffin
Let me know if I'm not doing this correctly since I'm still new.



Patch is

Reviewed-by: Matt Coffin 

On 1/25/20 11:33 AM, Alex Deucher wrote:
> Doesn't seem to be used, but add it just in case.
> 
> Signed-off-by: Alex Deucher 
> ---
>  drivers/gpu/drm/amd/powerplay/navi10_ppt.c | 2 ++
>  1 file changed, 2 insertions(+)
> 
> diff --git a/drivers/gpu/drm/amd/powerplay/navi10_ppt.c 
> b/drivers/gpu/drm/amd/powerplay/navi10_ppt.c
> index 015f5ea38901..f60762f9b143 100644
> --- a/drivers/gpu/drm/amd/powerplay/navi10_ppt.c
> +++ b/drivers/gpu/drm/amd/powerplay/navi10_ppt.c
> @@ -1008,6 +1008,8 @@ static int navi10_get_clock_by_type_with_latency(struct 
> smu_context *smu,
>   case SMU_GFXCLK:
>   case SMU_DCEFCLK:
>   case SMU_SOCCLK:
> + case SMU_MCLK:
> + case SMU_UCLK:
>   ret = smu_get_dpm_level_count(smu, clk_type, _count);
>   if (ret)
>   return ret;
> 
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [PATCH 3/3] drm/amdgpu/navi10: add OD support for restoring default table

2020-01-28 Thread Matt Coffin



On 1/28/20 10:26 AM, Alex Deucher wrote:
> On Tue, Jan 28, 2020 at 11:44 AM Matt Coffin  wrote:

> I just copied that vega20 did.  You may be right.  I haven't paged the
> recent SMU interface stuff into my head in a while.  If so, we should
> also fix the vega20_ppt.c code.

The vega20_ppt code was correct, until we implemented the ability for
the user to write to that overdrive table, which will land in 5.5.

Not entirely sure about the canonical way to distribute changes to
someone else's series, but I can take a crack at fixing this.

>>
>> On 1/25/20 11:48 AM, Alex Deucher wrote:
>>> Was missing before.
>>>
>>> Bug: https://gitlab.freedesktop.org/drm/amd/issues/1020
>>> Signed-off-by: Alex Deucher 
>>> ---
>>>  drivers/gpu/drm/amd/powerplay/navi10_ppt.c | 8 
>>>  1 file changed, 8 insertions(+)
>>>
>>> diff --git a/drivers/gpu/drm/amd/powerplay/navi10_ppt.c 
>>> b/drivers/gpu/drm/amd/powerplay/navi10_ppt.c
>>> index d2d45181ae23..f60762f9b143 100644
>>> --- a/drivers/gpu/drm/amd/powerplay/navi10_ppt.c
>>> +++ b/drivers/gpu/drm/amd/powerplay/navi10_ppt.c
>>> @@ -2062,6 +2062,14 @@ static int navi10_od_edit_dpm_table(struct 
>>> smu_context *smu, enum PP_OD_DPM_TABL
>>>   if (ret)
>>>   return ret;
>>>   od_table->UclkFmax = input[1];
>>> + break;
>>> + case PP_OD_RESTORE_DEFAULT_TABLE:
>>> + ret = smu_update_table(smu, SMU_TABLE_OVERDRIVE, 0, 
>>> table_context->overdrive_table, false);
>>> + if (ret) {
>>> + pr_err("Failed to export over drive table!\n");
>>> + return ret;
>>> + }
>>> +
>>>   break;
>>>   case PP_OD_COMMIT_DPM_TABLE:
>>>   navi10_dump_od_table(od_table);
>>>
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [Patch v1 0/5] Enable BACO with KFD

2020-01-28 Thread Mike Lothian
Is this designed to work with PRIME laptops too?

On Tue, 28 Jan 2020 at 01:29, Rajneesh Bhardwaj
 wrote:
>
> This series aims to enable BACO by default on supported AMD platforms
> and ensures that the AMD Kernel Fusion Driver can co-exist with this
> feature when the GPU devices are runtime suspended and firmware pushes
> the envelope to save more power with BACO entry sequence. Current
> approach makes sure that if KFD is using GPU services for compute, it
> won't let AMDGPU driver suspend and if the AMDGPU driver is already
> runtime suspended with GPUs in deep power saving mode with BACO, the KFD
> driver wakes up the AMDGPU and then starts the compute workload
> execution.
>
> This series has been tested with a single GPU (fiji), Dual GPUs (fiji
> and fiji/tonga) and seems to work fine. I have not seen but expect some
> corner cases where for some reason a KFD client binds a process but is
> unable to complete the task within 60 seconds. Ideally we should put the
> runtime_put and auto_suspend code in some function which is logical
> opposite of bind_process_to_device but during my experiments, it ended
> up with random gpu hang, machine reboot etc so any comments for
> improvement are welcome.
>
> Todo:
>
> rebase on top of https://patchwork.freedesktop.org/patch/338037/
>
> Rajneesh Bhardwaj (5):
>   drm/amdgpu: always enable runtime power management
>   drm/amdgpu: Fix missing error check in suspend
>   drm/amdkfd: Introduce debugfs option to disable baco
>   drm/amdkfd: show warning when kfd is locked
>   drm/amdkfd: refactor runtime pm for baco
>
>  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 12 -
>  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h |  9 ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  7 +++--
>  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c|  3 +++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c|  9 +++
>  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c   |  2 ++
>  drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c   | 27 +++
>  drivers/gpu/drm/amd/amdkfd/kfd_device.c| 31 +-
>  drivers/gpu/drm/amd/amdkfd/kfd_iommu.c |  5 +++-
>  drivers/gpu/drm/amd/amdkfd/kfd_process.c   | 19 ++---
>  10 files changed, 91 insertions(+), 33 deletions(-)
>
> --
> 2.17.1
>
> ___
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [PATCH 3/3] drm/amdgpu/navi10: add OD support for restoring default table

2020-01-28 Thread Alex Deucher
On Tue, Jan 28, 2020 at 11:44 AM Matt Coffin  wrote:
>
> For this, I believe we're updating `table_context->overdrive_table` with
> the values set by the user, wouldn't the intended behavior here be to
> restore the settings that were there on boot?
>

Correct.

>
>
> If so, I think we'd have to cache the overdrive table that was there on
> boot, and use that in the response for `PP_OD_RESTORE_DEFAULT_TABLE`, no?
>
>
>
> I'm doing some testing on this patchset, but on initial lookover that's
> the only thing I saw. I could be mistaken, but I think this just writes
> the overdrive table that we are currently using over again instead of
> writing the default one.

I just copied that vega20 did.  You may be right.  I haven't paged the
recent SMU interface stuff into my head in a while.  If so, we should
also fix the vega20_ppt.c code.

Thanks!

Alex

>
> On 1/25/20 11:48 AM, Alex Deucher wrote:
> > Was missing before.
> >
> > Bug: https://gitlab.freedesktop.org/drm/amd/issues/1020
> > Signed-off-by: Alex Deucher 
> > ---
> >  drivers/gpu/drm/amd/powerplay/navi10_ppt.c | 8 
> >  1 file changed, 8 insertions(+)
> >
> > diff --git a/drivers/gpu/drm/amd/powerplay/navi10_ppt.c 
> > b/drivers/gpu/drm/amd/powerplay/navi10_ppt.c
> > index d2d45181ae23..f60762f9b143 100644
> > --- a/drivers/gpu/drm/amd/powerplay/navi10_ppt.c
> > +++ b/drivers/gpu/drm/amd/powerplay/navi10_ppt.c
> > @@ -2062,6 +2062,14 @@ static int navi10_od_edit_dpm_table(struct 
> > smu_context *smu, enum PP_OD_DPM_TABL
> >   if (ret)
> >   return ret;
> >   od_table->UclkFmax = input[1];
> > + break;
> > + case PP_OD_RESTORE_DEFAULT_TABLE:
> > + ret = smu_update_table(smu, SMU_TABLE_OVERDRIVE, 0, 
> > table_context->overdrive_table, false);
> > + if (ret) {
> > + pr_err("Failed to export over drive table!\n");
> > + return ret;
> > + }
> > +
> >   break;
> >   case PP_OD_COMMIT_DPM_TABLE:
> >   navi10_dump_od_table(od_table);
> >
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [PATCH 3/3] drm/amdgpu/navi10: add OD support for restoring default table

2020-01-28 Thread Matt Coffin
For this, I believe we're updating `table_context->overdrive_table` with
the values set by the user, wouldn't the intended behavior here be to
restore the settings that were there on boot?



If so, I think we'd have to cache the overdrive table that was there on
boot, and use that in the response for `PP_OD_RESTORE_DEFAULT_TABLE`, no?



I'm doing some testing on this patchset, but on initial lookover that's
the only thing I saw. I could be mistaken, but I think this just writes
the overdrive table that we are currently using over again instead of
writing the default one.

On 1/25/20 11:48 AM, Alex Deucher wrote:
> Was missing before.
> 
> Bug: https://gitlab.freedesktop.org/drm/amd/issues/1020
> Signed-off-by: Alex Deucher 
> ---
>  drivers/gpu/drm/amd/powerplay/navi10_ppt.c | 8 
>  1 file changed, 8 insertions(+)
> 
> diff --git a/drivers/gpu/drm/amd/powerplay/navi10_ppt.c 
> b/drivers/gpu/drm/amd/powerplay/navi10_ppt.c
> index d2d45181ae23..f60762f9b143 100644
> --- a/drivers/gpu/drm/amd/powerplay/navi10_ppt.c
> +++ b/drivers/gpu/drm/amd/powerplay/navi10_ppt.c
> @@ -2062,6 +2062,14 @@ static int navi10_od_edit_dpm_table(struct smu_context 
> *smu, enum PP_OD_DPM_TABL
>   if (ret)
>   return ret;
>   od_table->UclkFmax = input[1];
> + break;
> + case PP_OD_RESTORE_DEFAULT_TABLE:
> + ret = smu_update_table(smu, SMU_TABLE_OVERDRIVE, 0, 
> table_context->overdrive_table, false);
> + if (ret) {
> + pr_err("Failed to export over drive table!\n");
> + return ret;
> + }
> +
>   break;
>   case PP_OD_COMMIT_DPM_TABLE:
>   navi10_dump_od_table(od_table);
> 
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [PATCH] drm/radeon: avoid page fault during gpu reset

2020-01-28 Thread Christian König

Am 28.01.20 um 14:15 schrieb Andreas Messer:

On Sat, Jan 25, 2020 at 07:01:36PM +, Koenig, Christian wrote:


Am 25.01.2020 19:47 schrieb Andreas Messer :
When backing up a ring, validate pointer to avoid page fault.
[ cut description / kernel messages ]

NAK, that was suggested multiple times now and is essentially the wrong
approach.

The problem is that the value is invalid because the hardware is not
functional any more. Returning here without backing up the ring just
papers over the real problem.

This is just the first occurance of this and you would need to fix a
couple of hundred register accesses (both inside and outside of the
driver) to make that really work reliable.

Sure, it won't fix the hardware. But since the page fault is the most prominent
part of the kernel log, people will continue suggesting it. With that change,
the kernel messages are full of ring and atom bios timeouts and might make
users more likely to consider a hardware issue in the first place.


That is correct, but the problem is that we currently have 2209 places
where we read a register and usually expect the values to be in a
valid range.


If you really want to avoid all crashes you would need to audit and fix 
all occurrences where for example the register value is used as index in 
an array or similar.


And the radeon code is only the beginning, the whole PCIe subsystem 
would need an audit in a similar way. That is a huge lot of work we are 
not willing to do.



  Anyway:


The only advice I can give you is to replace the hardware. From
experience those symptoms mean that your GPU will die rather soon.

I think my hardware is fine. I have monitored gpu temp and fan pwm now for
a while and found the pwm to be driven at ~60% only although the gpu
already got quite high temperature during gameplay. When forcing the pwm
to ~80% no crash occurs anymore. I suppose it is not the GPU crashing but
instead the VRMs, not getting enough airflow.

I have compared the BIOS fan tables of my card with those of other cards'
BIOSes (downloaded from the web) of the same GPU type and similar design.
Although they differ in cooler construction and fan used, all of them
except one model have exactly the same fan regulation points with PWMHigh
at 80% for 90°C. This single model with different settings has 100% for this
temp and a generally much more sane-looking regulation curve.

I suppose most of the vendors just copied some reference design,
maybe the vendor's windows driver adjust the curve to a better one,
I don't know.

I think I'll add some sysfs attributes or module parameter to adjust
the curve to my needs.


The issue is that this is most likely not a temperature problem at all. 
If you have a temperature problem the ASIC usually just hangs in a 
shader or so, but the BIF is still fully functional (e.g. you can probe 
PCI-IDs etc...).


That looks more like the ESD protection is kicking in for some reason.
In other words, what you have here is a cold/broken solder point on the
SMD components which happens to lose contact because the material
expands when it warms up.


That is a serious hardware fault and a really good indicator that you 
should replace the faulty component ASAP.


Regards,
Christian.




[ Patch cut out ]

cheers,
Andreas



___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH] drm/amd/display: fix spelling mistake link_integiry_check -> link_integrity_check

2020-01-28 Thread Colin King
From: Colin Ian King 

There is a spelling mistake on the struct field name link_integiry_check,
fix this by renaming it.

Signed-off-by: Colin Ian King 
---
 drivers/gpu/drm/amd/display/modules/hdcp/hdcp.h   | 2 +-
 .../gpu/drm/amd/display/modules/hdcp/hdcp1_execution.c| 8 
 .../gpu/drm/amd/display/modules/hdcp/hdcp1_transition.c   | 4 ++--
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/modules/hdcp/hdcp.h 
b/drivers/gpu/drm/amd/display/modules/hdcp/hdcp.h
index f98d3d9ecb6d..af78e4f1be68 100644
--- a/drivers/gpu/drm/amd/display/modules/hdcp/hdcp.h
+++ b/drivers/gpu/drm/amd/display/modules/hdcp/hdcp.h
@@ -63,7 +63,7 @@ struct mod_hdcp_transition_input_hdcp1 {
uint8_t hdcp_capable_dp;
uint8_t binfo_read_dp;
uint8_t r0p_available_dp;
-   uint8_t link_integiry_check;
+   uint8_t link_integrity_check;
uint8_t reauth_request_check;
uint8_t stream_encryption_dp;
 };
diff --git a/drivers/gpu/drm/amd/display/modules/hdcp/hdcp1_execution.c 
b/drivers/gpu/drm/amd/display/modules/hdcp/hdcp1_execution.c
index 04845e43df15..37670db64855 100644
--- a/drivers/gpu/drm/amd/display/modules/hdcp/hdcp1_execution.c
+++ b/drivers/gpu/drm/amd/display/modules/hdcp/hdcp1_execution.c
@@ -283,8 +283,8 @@ static enum mod_hdcp_status wait_for_ready(struct mod_hdcp 
*hdcp,
hdcp, "bstatus_read"))
goto out;
if (!mod_hdcp_execute_and_set(check_link_integrity_dp,
-   >link_integiry_check, ,
-   hdcp, "link_integiry_check"))
+   >link_integrity_check, ,
+   hdcp, "link_integrity_check"))
goto out;
if 
(!mod_hdcp_execute_and_set(check_no_reauthentication_request_dp,
>reauth_request_check, ,
@@ -431,8 +431,8 @@ static enum mod_hdcp_status authenticated_dp(struct 
mod_hdcp *hdcp,
hdcp, "bstatus_read"))
goto out;
if (!mod_hdcp_execute_and_set(check_link_integrity_dp,
-   >link_integiry_check, ,
-   hdcp, "link_integiry_check"))
+   >link_integrity_check, ,
+   hdcp, "link_integrity_check"))
goto out;
if (!mod_hdcp_execute_and_set(check_no_reauthentication_request_dp,
>reauth_request_check, ,
diff --git a/drivers/gpu/drm/amd/display/modules/hdcp/hdcp1_transition.c 
b/drivers/gpu/drm/amd/display/modules/hdcp/hdcp1_transition.c
index 21ebc62bb9d9..76edcbe51f71 100644
--- a/drivers/gpu/drm/amd/display/modules/hdcp/hdcp1_transition.c
+++ b/drivers/gpu/drm/amd/display/modules/hdcp/hdcp1_transition.c
@@ -241,7 +241,7 @@ enum mod_hdcp_status mod_hdcp_hdcp1_dp_transition(struct 
mod_hdcp *hdcp,
}
break;
case D1_A4_AUTHENTICATED:
-   if (input->link_integiry_check != PASS ||
+   if (input->link_integrity_check != PASS ||
input->reauth_request_check != PASS) {
/* 1A-07: restart hdcp on a link integrity failure */
fail_and_restart_in_ms(0, , output);
@@ -249,7 +249,7 @@ enum mod_hdcp_status mod_hdcp_hdcp1_dp_transition(struct 
mod_hdcp *hdcp,
}
break;
case D1_A6_WAIT_FOR_READY:
-   if (input->link_integiry_check == FAIL ||
+   if (input->link_integrity_check == FAIL ||
input->reauth_request_check == FAIL) {
fail_and_restart_in_ms(0, , output);
break;
-- 
2.24.0

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [PATCH] drm/radeon: avoid page fault during gpu reset

2020-01-28 Thread Andreas Messer
On Sat, Jan 25, 2020 at 07:01:36PM +, Koenig, Christian wrote:
> 
> 
> Am 25.01.2020 19:47 schrieb Andreas Messer :
> When backing up a ring, validate pointer to avoid page fault.
> [ cut description / kernel messages ] 
> 
> NAK, that was suggested multiple times now and is essentially the wrong
> approach.
>
> The problem is that the value is invalid because the hardware is not
> functional any more. Returning here without backing up the ring just
> papers over the real problem.
> 
> This is just the first occurance of this and you would need to fix a
> couple of hundred register accesses (both inside and outside of the
> driver) to make that really work reliable.

Sure, it won't fix the hardware. But since the page fault is the most prominent
part of the kernel log, people will continue suggesting it. With that change,
the kernel messages are full of ring and atom bios timeouts and might make
users more likely to consider a hardware issue in the first place. Anyway:

> The only advice I can give you is to replace the hardware. From
> experience those symptoms mean that your GPU will die rather soon.

I think my hardware is fine. I have monitored gpu temp and fan pwm now for
a while and found the pwm to be driven at ~60% only although the gpu
already got quite high temperature during gameplay. When forcing the pwm
to ~80% no crash occurs anymore. I suppose it is not the GPU crashing but
instead the VRMs, not getting enough airflow.

I have compared the BIOS fan tables of my card with those of other cards'
BIOSes (downloaded from the web) of the same GPU type and similar design.
Although they differ in cooler construction and fan used, all of them
except one model have exactly the same fan regulation points with PWMHigh
at 80% for 90°C. This single model with different settings has 100% for this
temp and a generally much more sane-looking regulation curve.

I suppose most of the vendors just copied some reference design,
maybe the vendor's windows driver adjust the curve to a better one,
I don't know.

I think I'll add some sysfs attributes or module parameter to adjust 
the curve to my needs.

> [ Patch cut out ]

cheers,
Andreas




signature.asc
Description: PGP signature
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [PATCH] drm/amdgpu: allocate entities on demand

2020-01-28 Thread Christian König

Patch is Reviewed-by: Christian König 

Regards,
Christian.

Am 28.01.20 um 11:13 schrieb Nirmoy:

Gentle reminder !

On 1/24/20 5:31 PM, Nirmoy Das wrote:

Currently we pre-allocate entities and fences for all the HW IPs on
context creation and some of which are might never be used.

This patch tries to resolve entity/fences wastage by creating entity
only when needed.

v2: allocate memory for entity and fences together

Signed-off-by: Nirmoy Das 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 235 
  drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h |   6 +-
  2 files changed, 124 insertions(+), 117 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c

index 05c2af61e7de..94a6c42f29ea 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
@@ -42,19 +42,12 @@ const unsigned int 
amdgpu_ctx_num_entities[AMDGPU_HW_IP_NUM] = {

  [AMDGPU_HW_IP_VCN_JPEG]    =    1,
  };
  -static int amdgpu_ctx_total_num_entities(void)
-{
-    unsigned i, num_entities = 0;
-
-    for (i = 0; i < AMDGPU_HW_IP_NUM; ++i)
-    num_entities += amdgpu_ctx_num_entities[i];
-
-    return num_entities;
-}
-
  static int amdgpu_ctx_priority_permit(struct drm_file *filp,
    enum drm_sched_priority priority)
  {
+    if (priority < 0 || priority >= DRM_SCHED_PRIORITY_MAX)
+    return -EINVAL;
+
  /* NORMAL and below are accessible by everyone */
  if (priority <= DRM_SCHED_PRIORITY_NORMAL)
  return 0;
@@ -68,64 +61,24 @@ static int amdgpu_ctx_priority_permit(struct 
drm_file *filp,

  return -EACCES;
  }
  -static int amdgpu_ctx_init(struct amdgpu_device *adev,
-   enum drm_sched_priority priority,
-   struct drm_file *filp,
-   struct amdgpu_ctx *ctx)
+static int amdgpu_ctx_init_entity(struct amdgpu_ctx *ctx, const u32 
hw_ip, const u32 ring)

  {
-    unsigned num_entities = amdgpu_ctx_total_num_entities();
-    unsigned i, j;
+    struct amdgpu_device *adev = ctx->adev;
+    struct amdgpu_ctx_entity *entity;
+    struct drm_gpu_scheduler **scheds = NULL, *sched = NULL;
+    unsigned num_scheds = 0;
+    enum drm_sched_priority priority;
  int r;
  -    if (priority < 0 || priority >= DRM_SCHED_PRIORITY_MAX)
-    return -EINVAL;
-
-    r = amdgpu_ctx_priority_permit(filp, priority);
-    if (r)
-    return r;
-
-    memset(ctx, 0, sizeof(*ctx));
-    ctx->adev = adev;
-
-
-    ctx->entities[0] = kcalloc(num_entities,
-   sizeof(struct amdgpu_ctx_entity),
-   GFP_KERNEL);
-    if (!ctx->entities[0])
-    return -ENOMEM;
-
+    entity = kcalloc(1, offsetof(typeof(*entity), 
fences[amdgpu_sched_jobs]),

+ GFP_KERNEL);
+    if (!entity)
+    return  -ENOMEM;
  -    for (i = 0; i < num_entities; ++i) {
-    struct amdgpu_ctx_entity *entity = >entities[0][i];
-
-    entity->sequence = 1;
-    entity->fences = kcalloc(amdgpu_sched_jobs,
- sizeof(struct dma_fence*), GFP_KERNEL);
-    if (!entity->fences) {
-    r = -ENOMEM;
-    goto error_cleanup_memory;
-    }
-    }
-    for (i = 1; i < AMDGPU_HW_IP_NUM; ++i)
-    ctx->entities[i] = ctx->entities[i - 1] +
-    amdgpu_ctx_num_entities[i - 1];
-
-    kref_init(>refcount);
-    spin_lock_init(>ring_lock);
-    mutex_init(>lock);
-
-    ctx->reset_counter = atomic_read(>gpu_reset_counter);
-    ctx->reset_counter_query = ctx->reset_counter;
-    ctx->vram_lost_counter = atomic_read(>vram_lost_counter);
-    ctx->init_priority = priority;
-    ctx->override_priority = DRM_SCHED_PRIORITY_UNSET;
-
-    for (i = 0; i < AMDGPU_HW_IP_NUM; ++i) {
-    struct drm_gpu_scheduler **scheds;
-    struct drm_gpu_scheduler *sched;
-    unsigned num_scheds = 0;
-
-    switch (i) {
+    entity->sequence = 1;
+    priority = (ctx->override_priority == DRM_SCHED_PRIORITY_UNSET) ?
+    ctx->init_priority : ctx->override_priority;
+    switch (hw_ip) {
  case AMDGPU_HW_IP_GFX:
  sched = >gfx.gfx_ring[0].sched;
  scheds = 
@@ -166,63 +119,90 @@ static int amdgpu_ctx_init(struct amdgpu_device 
*adev,

  scheds = adev->jpeg.jpeg_sched;
  num_scheds =  adev->jpeg.num_jpeg_sched;
  break;
-    }
-
-    for (j = 0; j < amdgpu_ctx_num_entities[i]; ++j)
-    r = drm_sched_entity_init(>entities[i][j].entity,
-  priority, scheds,
-  num_scheds, >guilty);
-    if (r)
-    goto error_cleanup_entities;
  }
  +    r = drm_sched_entity_init(>entity, priority, scheds, 
num_scheds,

+  >guilty);
+    if (r)
+    goto error_free_entity;
+
+    ctx->entities[hw_ip][ring] = entity;
  return 0;
  -error_cleanup_entities:
-    for (i = 0; i < num_entities; ++i)
- 

Re: [PATCH] drm/amdgpu: allocate entities on demand

2020-01-28 Thread Nirmoy

Gentle reminder !

On 1/24/20 5:31 PM, Nirmoy Das wrote:

Currently we pre-allocate entities and fences for all the HW IPs on
context creation and some of which are might never be used.

This patch tries to resolve entity/fences wastage by creating entity
only when needed.

v2: allocate memory for entity and fences together

Signed-off-by: Nirmoy Das 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 235 
  drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h |   6 +-
  2 files changed, 124 insertions(+), 117 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
index 05c2af61e7de..94a6c42f29ea 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
@@ -42,19 +42,12 @@ const unsigned int 
amdgpu_ctx_num_entities[AMDGPU_HW_IP_NUM] = {
[AMDGPU_HW_IP_VCN_JPEG] =   1,
  };
  
-static int amdgpu_ctx_total_num_entities(void)

-{
-   unsigned i, num_entities = 0;
-
-   for (i = 0; i < AMDGPU_HW_IP_NUM; ++i)
-   num_entities += amdgpu_ctx_num_entities[i];
-
-   return num_entities;
-}
-
  static int amdgpu_ctx_priority_permit(struct drm_file *filp,
  enum drm_sched_priority priority)
  {
+   if (priority < 0 || priority >= DRM_SCHED_PRIORITY_MAX)
+   return -EINVAL;
+
/* NORMAL and below are accessible by everyone */
if (priority <= DRM_SCHED_PRIORITY_NORMAL)
return 0;
@@ -68,64 +61,24 @@ static int amdgpu_ctx_priority_permit(struct drm_file *filp,
return -EACCES;
  }
  
-static int amdgpu_ctx_init(struct amdgpu_device *adev,

-  enum drm_sched_priority priority,
-  struct drm_file *filp,
-  struct amdgpu_ctx *ctx)
+static int amdgpu_ctx_init_entity(struct amdgpu_ctx *ctx, const u32 hw_ip, 
const u32 ring)
  {
-   unsigned num_entities = amdgpu_ctx_total_num_entities();
-   unsigned i, j;
+   struct amdgpu_device *adev = ctx->adev;
+   struct amdgpu_ctx_entity *entity;
+   struct drm_gpu_scheduler **scheds = NULL, *sched = NULL;
+   unsigned num_scheds = 0;
+   enum drm_sched_priority priority;
int r;
  
-	if (priority < 0 || priority >= DRM_SCHED_PRIORITY_MAX)

-   return -EINVAL;
-
-   r = amdgpu_ctx_priority_permit(filp, priority);
-   if (r)
-   return r;
-
-   memset(ctx, 0, sizeof(*ctx));
-   ctx->adev = adev;
-
-
-   ctx->entities[0] = kcalloc(num_entities,
-  sizeof(struct amdgpu_ctx_entity),
-  GFP_KERNEL);
-   if (!ctx->entities[0])
-   return -ENOMEM;
-
+   entity = kcalloc(1, offsetof(typeof(*entity), 
fences[amdgpu_sched_jobs]),
+GFP_KERNEL);
+   if (!entity)
+   return  -ENOMEM;
  
-	for (i = 0; i < num_entities; ++i) {

-   struct amdgpu_ctx_entity *entity = >entities[0][i];
-
-   entity->sequence = 1;
-   entity->fences = kcalloc(amdgpu_sched_jobs,
-sizeof(struct dma_fence*), GFP_KERNEL);
-   if (!entity->fences) {
-   r = -ENOMEM;
-   goto error_cleanup_memory;
-   }
-   }
-   for (i = 1; i < AMDGPU_HW_IP_NUM; ++i)
-   ctx->entities[i] = ctx->entities[i - 1] +
-   amdgpu_ctx_num_entities[i - 1];
-
-   kref_init(>refcount);
-   spin_lock_init(>ring_lock);
-   mutex_init(>lock);
-
-   ctx->reset_counter = atomic_read(>gpu_reset_counter);
-   ctx->reset_counter_query = ctx->reset_counter;
-   ctx->vram_lost_counter = atomic_read(>vram_lost_counter);
-   ctx->init_priority = priority;
-   ctx->override_priority = DRM_SCHED_PRIORITY_UNSET;
-
-   for (i = 0; i < AMDGPU_HW_IP_NUM; ++i) {
-   struct drm_gpu_scheduler **scheds;
-   struct drm_gpu_scheduler *sched;
-   unsigned num_scheds = 0;
-
-   switch (i) {
+   entity->sequence = 1;
+   priority = (ctx->override_priority == DRM_SCHED_PRIORITY_UNSET) ?
+   ctx->init_priority : ctx->override_priority;
+   switch (hw_ip) {
case AMDGPU_HW_IP_GFX:
sched = >gfx.gfx_ring[0].sched;
scheds = 
@@ -166,63 +119,90 @@ static int amdgpu_ctx_init(struct amdgpu_device *adev,
scheds = adev->jpeg.jpeg_sched;
num_scheds =  adev->jpeg.num_jpeg_sched;
break;
-   }
-
-   for (j = 0; j < amdgpu_ctx_num_entities[i]; ++j)
-   r = drm_sched_entity_init(>entities[i][j].entity,
- priority, scheds,
-