Re: [PATCH 2/2] drm/amdgpu: rename amdgpu_vm_bo_rmv to _del

2022-02-02 Thread Christian König

Am 01.02.22 um 17:27 schrieb Daniel Vetter:

On Tue, Feb 1, 2022 at 4:28 PM Christian König
 wrote:

Some people complained about the name and this matches much
more Linux naming conventions for object functions.

Signed-off-by: Christian König 

"some people" sounds mightily ominous :-)


That's intentional, you were not the only one :)

Cheers,
Christian.



On both patches: Acked-by: Daniel Vetter 

Cheers, Daniel


---
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 4 ++--
  drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c  | 2 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c  | 2 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c  | 4 ++--
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c   | 4 ++--
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h   | 2 +-
  6 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 5df387c4d7fb..5d00a6878ef2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -778,7 +778,7 @@ static int kfd_mem_attach(struct amdgpu_device *adev, 
struct kgd_mem *mem,
 continue;
 if (attachment[i]->bo_va) {
 amdgpu_bo_reserve(bo[i], true);
-   amdgpu_vm_bo_rmv(adev, attachment[i]->bo_va);
+   amdgpu_vm_bo_del(adev, attachment[i]->bo_va);
 amdgpu_bo_unreserve(bo[i]);
 list_del(&attachment[i]->list);
 }
@@ -795,7 +795,7 @@ static void kfd_mem_detach(struct kfd_mem_attachment 
*attachment)

 pr_debug("\t remove VA 0x%llx in entry %p\n",
 attachment->va, attachment);
-   amdgpu_vm_bo_rmv(attachment->adev, attachment->bo_va);
+   amdgpu_vm_bo_del(attachment->adev, attachment->bo_va);
 drm_gem_object_put(&bo->tbo.base);
 list_del(&attachment->list);
 kfree(attachment);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c
index da21e60bb827..c6d4d41c4393 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c
@@ -98,7 +98,7 @@ int amdgpu_map_static_csa(struct amdgpu_device *adev, struct 
amdgpu_vm *vm,

 if (r) {
 DRM_ERROR("failed to do bo_map on static CSA, err=%d\n", r);
-   amdgpu_vm_bo_rmv(adev, *bo_va);
+   amdgpu_vm_bo_del(adev, *bo_va);
 ttm_eu_backoff_reservation(&ticket, &list);
 return r;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
index 7c1f1b8ca77d..b4c0a4e77525 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
@@ -221,7 +221,7 @@ static void amdgpu_gem_object_close(struct drm_gem_object 
*obj,
 if (!bo_va || --bo_va->ref_count)
 goto out_unlock;

-   amdgpu_vm_bo_rmv(adev, bo_va);
+   amdgpu_vm_bo_del(adev, bo_va);
 if (!amdgpu_vm_ready(vm))
 goto out_unlock;

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index ee3c17bd02fc..efd13898c83e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -1194,12 +1194,12 @@ void amdgpu_driver_postclose_kms(struct drm_device *dev,
 if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_VCE) != NULL)
 amdgpu_vce_free_handles(adev, file_priv);

-   amdgpu_vm_bo_rmv(adev, fpriv->prt_va);
+   amdgpu_vm_bo_del(adev, fpriv->prt_va);

 if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) {
 /* TODO: how to handle reserve failure */
 BUG_ON(amdgpu_bo_reserve(adev->virt.csa_obj, true));
-   amdgpu_vm_bo_rmv(adev, fpriv->csa_va);
+   amdgpu_vm_bo_del(adev, fpriv->csa_va);
 fpriv->csa_va = NULL;
 amdgpu_bo_unreserve(adev->virt.csa_obj);
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 7910046fe11a..8174d71764d7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -2653,7 +2653,7 @@ void amdgpu_vm_bo_trace_cs(struct amdgpu_vm *vm, struct 
ww_acquire_ctx *ticket)
  }

  /**
- * amdgpu_vm_bo_rmv - remove a bo to a specific vm
+ * amdgpu_vm_bo_del - remove a bo to a specific vm
   *
   * @adev: amdgpu_device pointer
   * @bo_va: requested bo_va
@@ -2662,7 +2662,7 @@ void amdgpu_vm_bo_trace_cs(struct amdgpu_vm *vm, struct 
ww_acquire_ctx *ticket)
   *
   * Object have to be reserved!
   */
-void amdgpu_vm_bo_rmv(struct amdgpu_device *adev,
+void amdgpu_vm_bo_del(struct amdgpu_device *adev,
   struct amdgpu_bo_va *bo_va)
  {
 struct amdgpu_bo_va_mapping *mapping, *next;

[PATCH] drm/amdgpu: skipping SDMA IP suspend for S0ix.

2022-02-02 Thread Rajib Mahapatra
[Why]
amdgpu error observed if suspend is aborted during S0i3
resume.

[How]
If suspend is aborted for some reason during the S0i3 resume
cycle, amdgpu errors follow in resume.
Skipping the SDMA IP in suspend solves the issue on the RENOIR
(green sardine APU) chip. With this change, the system is
able to resume gracefully even if the suspend is aborted.

Signed-off-by: Rajib Mahapatra 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 7931132ce6e3..f01b1d7f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2927,6 +2927,16 @@ static int amdgpu_device_ip_suspend_phase2(struct 
amdgpu_device *adev)
 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX))
continue;
 
+   /* skip suspend of sdma for S0ix
+* Resume has issues if the suspend is aborted during S0i3 
cycle.
+* Skipping sdma for RN/CZN/BRC chip - green sardine apu.
+*/
+   if (adev->in_s0ix &&
+   (adev->asic_type == CHIP_RENOIR &&
+(adev->pdev->device == 0x15e7 || adev->pdev->device == 
0x1638) &&
+adev->ip_blocks[i].version->type == 
AMD_IP_BLOCK_TYPE_SDMA))
+   continue;
+
/* XXX handle errors */
r = adev->ip_blocks[i].version->funcs->suspend(adev);
/* XXX handle errors */
-- 
2.25.1



RE: [PATCH] drm/amdgpu: skipping SDMA IP suspend for S0ix.

2022-02-02 Thread Limonciello, Mario
[Public]



> -Original Message-
> From: Mahapatra, Rajib 
> Sent: Wednesday, February 2, 2022 03:07
> To: Liang, Prike ; Limonciello, Mario
> ; Deucher, Alexander
> 
> Cc: amd-gfx@lists.freedesktop.org; S, Shirish ;
> Mahapatra, Rajib 
> Subject: [PATCH] drm/amdgpu: skipping SDMA IP suspend for S0ix.
> 
> [Why]
> amdgpu error observed if suspend is aborted during S0i3
> resume.
> 
> [How]
> If suspend is aborted for some reason during S0i3 resume
> cycle, it follows amdgpu errors in resume.
> Skipping SDMA ip in suspend solves the issue on RENOIR
> (green sardine apu) chip. This time, the system is
> able to resume gracefully even the suspend is aborted.
> 
> Signed-off-by: Rajib Mahapatra 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 10 ++
>  1 file changed, 10 insertions(+)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 7931132ce6e3..f01b1d7f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -2927,6 +2927,16 @@ static int amdgpu_device_ip_suspend_phase2(struct
> amdgpu_device *adev)
>adev->ip_blocks[i].version->type ==
> AMD_IP_BLOCK_TYPE_GFX))
>   continue;
> 
> + /* skip suspend of sdma for S0ix
> +  * Resume has issues if the suspend is aborted during S0i3 
> cycle.
> +  * Skipping sdma for RN/CZN/BRC chip - green sardine apu.
> +  */
> + if (adev->in_s0ix &&
> + (adev->asic_type == CHIP_RENOIR &&
> +  (adev->pdev->device == 0x15e7 || adev->pdev->device ==
> 0x1638) &&
> +  adev->ip_blocks[i].version->type ==
> AMD_IP_BLOCK_TYPE_SDMA))
> + continue;
> +
>   /* XXX handle errors */
>   r = adev->ip_blocks[i].version->funcs->suspend(adev);
>   /* XXX handle errors */
> --
> 2.25.1

As this is specific to RN/CZN I think this check is better suited in
drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c perhaps in 
sdma_v4_0_suspend/sdma_v4_0_resume functions.

The aborted suspend case if it's specific to s0ix, can also use adev->in_s0ix as
part of the check.


Re: [PATCH] drm/amdgpu: skipping SDMA IP suspend for S0ix.

2022-02-02 Thread Alex Deucher
On Wed, Feb 2, 2022 at 4:07 AM Rajib Mahapatra  wrote:
>
> [Why]
> amdgpu error observed if suspend is aborted during S0i3
> resume.
>
> [How]
> If suspend is aborted for some reason during S0i3 resume
> cycle, it follows amdgpu errors in resume.
> Skipping SDMA ip in suspend solves the issue on RENOIR
> (green sardine apu) chip. This time, the system is
> able to resume gracefully even the suspend is aborted.
>
> Signed-off-by: Rajib Mahapatra 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 10 ++
>  1 file changed, 10 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 7931132ce6e3..f01b1d7f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -2927,6 +2927,16 @@ static int amdgpu_device_ip_suspend_phase2(struct 
> amdgpu_device *adev)
>  adev->ip_blocks[i].version->type == 
> AMD_IP_BLOCK_TYPE_GFX))
> continue;
>
> +   /* skip suspend of sdma for S0ix
> +* Resume has issues if the suspend is aborted during S0i3 
> cycle.
> +* Skipping sdma for RN/CZN/BRC chip - green sardine apu.
> +*/
> +   if (adev->in_s0ix &&
> +   (adev->asic_type == CHIP_RENOIR &&
> +(adev->pdev->device == 0x15e7 || adev->pdev->device == 
> 0x1638) &&

The check here seems to contradict the comment above.  Is this all
Renoir based APUs or just green sardine?  If it's just green sardine,
you can check the APU flags rather than the PCI ids.  E.g.,
(adev->apu_flags & AMD_APU_IS_GREEN_SARDINE)
Also move this to sdma 4 code as Mario suggested.

Alex

> +adev->ip_blocks[i].version->type == 
> AMD_IP_BLOCK_TYPE_SDMA))
> +   continue;
> +
> /* XXX handle errors */
> r = adev->ip_blocks[i].version->funcs->suspend(adev);
> /* XXX handle errors */
> --
> 2.25.1
>


Re: [PATCH v4 00/10] Add MEMORY_DEVICE_COHERENT for coherent device memory mapping

2022-02-02 Thread Christoph Hellwig
On Thu, Jan 27, 2022 at 02:32:58PM -0800, Andrew Morton wrote:
> On Wed, 26 Jan 2022 21:09:39 -0600 Alex Sierra  wrote:
> 
> > This patch series introduces MEMORY_DEVICE_COHERENT, a type of memory
> > owned by a device that can be mapped into CPU page tables like
> > MEMORY_DEVICE_GENERIC and can also be migrated like
> > MEMORY_DEVICE_PRIVATE.
> 
> Some more reviewer input appears to be desirable here.
> 
> I was going to tentatively add it to -mm and -next, but problems. 
> 5.17-rc1's mm/migrate.c:migrate_vma_check_page() is rather different
> from the tree you patched.  Please redo, refresh and resend?

I really hate adding more types with the weird one off page refcount.
We need to clean that mess up first.


Re: [PATCH] drm/amdgpu: skipping SDMA IP suspend for S0ix.

2022-02-02 Thread Limonciello, Mario

On 2/2/2022 08:16, Alex Deucher wrote:

On Wed, Feb 2, 2022 at 4:07 AM Rajib Mahapatra  wrote:


[Why]
amdgpu error observed if suspend is aborted during S0i3
resume.

[How]
If suspend is aborted for some reason during S0i3 resume
cycle, it follows amdgpu errors in resume.
Skipping SDMA ip in suspend solves the issue on RENOIR
(green sardine apu) chip. This time, the system is
able to resume gracefully even the suspend is aborted.

Signed-off-by: Rajib Mahapatra 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 10 ++
  1 file changed, 10 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 7931132ce6e3..f01b1d7f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2927,6 +2927,16 @@ static int amdgpu_device_ip_suspend_phase2(struct 
amdgpu_device *adev)
  adev->ip_blocks[i].version->type == 
AMD_IP_BLOCK_TYPE_GFX))
 continue;

+   /* skip suspend of sdma for S0ix
+* Resume has issues if the suspend is aborted during S0i3 
cycle.
+* Skipping sdma for RN/CZN/BRC chip - green sardine apu.
+*/
+   if (adev->in_s0ix &&
+   (adev->asic_type == CHIP_RENOIR &&
+(adev->pdev->device == 0x15e7 || adev->pdev->device == 0x1638) 
&&


The check here seems to contradict the comment above.  Is this all
Renoir based APUs or just green sardine?  If it's just green sardine,
you can check the APU flags rather than the PCI ids.  E.g.,
(adev->apu_flags & AMD_APU_IS_GREEN_SARDINE)
Also move this to sdma 4 code as Mario suggested.


Both RN and green sardine share the same flows for SMU, so I would think it 
should just match (adev->in_s0ix && (adev->flags & AMD_IS_APU)) when 
it's moved to skip suspend.




Alex


+adev->ip_blocks[i].version->type == 
AMD_IP_BLOCK_TYPE_SDMA))
+   continue;
+
 /* XXX handle errors */
 r = adev->ip_blocks[i].version->funcs->suspend(adev);
 /* XXX handle errors */
--
2.25.1





Re: [PATCH] drm/amdgpu: Handle the GPU recovery failure in SRIOV environment.

2022-02-02 Thread Andrey Grodzovsky



On 2022-02-01 16:47, Surbhi Kakarya wrote:

This patch handles the GPU recovery failure in sriov environment by
retrying the reset if the first reset fails. To determine the condition of 
retry, a
new function amdgpu_is_retry_sriov_reset() is added which returns true if 
failure is due
to ETIMEDOUT, EINVAL or EBUSY, otherwise return false. MAX_RETRY_LIMIT is used 
to
limit the retry to 2.

It also handles the return status in Post Asic Reset by updating the return code
with asic_reset_res and eventually return the return code in 
amdgpu_job_timedout().

Signed-off-by: Surbhi Kakarya 
Change-Id: Ib2e408819b4780e6963e1dc078c3410cd512e6e8
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 47 --
  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c|  6 ++-
  2 files changed, 49 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 53af2623c58f..f50c18cb38c8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -89,6 +89,7 @@ MODULE_FIRMWARE("amdgpu/vangogh_gpu_info.bin");
  MODULE_FIRMWARE("amdgpu/yellow_carp_gpu_info.bin");
  
  #define AMDGPU_RESUME_MS		2000

+#define MAX_RETRY_LIMIT2
  
  const char *amdgpu_asic_name[] = {

"TAHITI",
@@ -5026,11 +5027,27 @@ static int amdgpu_device_suspend_display_audio(struct 
amdgpu_device *adev)
return 0;
  }
  
+/**

+ * amdgpu_is_retry_sriov_reset - check if we should retry sriov reset
+ *
+ * Check amdgpu_is_retry_sriov_reset and return status to see if we should 
retry reset.
+ */
+static bool amdgpu_is_retry_sriov_reset(int r)
+{
+
+if(r == -EBUSY || r == -ETIMEDOUT || r == -EINVAL)
+return true;
+else
+return false;
+
+}
+
  static void amdgpu_device_recheck_guilty_jobs(
struct amdgpu_device *adev, struct list_head *device_list_handle,
struct amdgpu_reset_context *reset_context)
  {
int i, r = 0;
+   int retry_limit = 0;
  
  	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {

struct amdgpu_ring *ring = adev->rings[i];
@@ -5064,8 +5081,18 @@ static void amdgpu_device_recheck_guilty_jobs(
if (amdgpu_sriov_vf(adev)) {
amdgpu_virt_fini_data_exchange(adev);
r = amdgpu_device_reset_sriov(adev, false);
-   if (r)
+   if (r) {
adev->asic_reset_res = r;
+   if (amdgpu_is_retry_sriov_reset(r)) {
+   adev->asic_reset_res = 0;
+   if (retry_limit < 
MAX_RETRY_LIMIT) {
+   retry_limit++;
+   goto retry;
+   }
+   else
+   DRM_ERROR("GPU reset retry 
is beyond the retry limit\n");
+   }
+   }
} else {
clear_bit(AMDGPU_SKIP_HW_RESET,
  &reset_context->flags);
@@ -5122,6 +5149,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
bool locked = false;
int tmp_vram_lost_counter;
struct amdgpu_reset_context reset_context;
+   int retry_limit = 0;
  
  	memset(&reset_context, 0, sizeof(reset_context));
  
@@ -5299,8 +5327,18 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,

/* Host driver will handle XGMI hive reset for SRIOV */
if (amdgpu_sriov_vf(adev)) {
r = amdgpu_device_reset_sriov(adev, job ? false : true);
-   if (r)
-   adev->asic_reset_res = r;
+if (r) {
+adev->asic_reset_res = r;
+if (amdgpu_is_retry_sriov_reset(r)) {
+   adev->asic_reset_res = 0;
+   if (retry_limit < MAX_RETRY_LIMIT) {
+   retry_limit++;
+   goto retry;
+   }
+   else
+   DRM_ERROR("GPU reset retry is beyond the 
retry limit\n");



Same comment as Christian: please move this retry handling 
inside amdgpu_device_reset_sriov
so as to avoid code duplication here and above. Other than that, it looks good 
to me.


Andrey



+}
+}
} else {
r = amdgpu_do_asic_reset(device_list_handle, &reset_context);
if (r && r == -EAGAIN)
@@ -5341,6 +5379,9 @@ int amdgpu_device_gpu_recover(s

Re: [PATCH v4 00/10] Add MEMORY_DEVICE_COHERENT for coherent device memory mapping

2022-02-02 Thread Jason Gunthorpe
On Wed, Feb 02, 2022 at 03:57:50PM +0100, Christoph Hellwig wrote:
> On Thu, Jan 27, 2022 at 02:32:58PM -0800, Andrew Morton wrote:
> > On Wed, 26 Jan 2022 21:09:39 -0600 Alex Sierra  wrote:
> > 
> > > This patch series introduces MEMORY_DEVICE_COHERENT, a type of memory
> > > owned by a device that can be mapped into CPU page tables like
> > > MEMORY_DEVICE_GENERIC and can also be migrated like
> > > MEMORY_DEVICE_PRIVATE.
> > 
> > Some more reviewer input appears to be desirable here.
> > 
> > I was going to tentatively add it to -mm and -next, but problems. 
> > 5.17-rc1's mm/migrate.c:migrate_vma_check_page() is rather different
> > from the tree you patched.  Please redo, refresh and resend?
> 
> I really hate adding more types with the weird one off page refcount.
> We need to clean that mess up first.

Is there anyone who could give an outline of what is needed to make
fsdax use compound pages/folios for its PMD stuff?

I already suggested removing that as a way forward, and was shot down,
but nobody is standing up to maintain this code and fix it :(

We got devdax and the DRM stuff fixed now, so FSDAX is the next
blocker on this work.

The people who want this to advance have no idea about FSs or what to
do, unfortunately.

Jason


Re: [PATCH] drm/amdgpu: skipping SDMA IP suspend for S0ix.

2022-02-02 Thread Alex Deucher
On Wed, Feb 2, 2022 at 10:29 AM Limonciello, Mario
 wrote:
>
> On 2/2/2022 08:16, Alex Deucher wrote:
> > On Wed, Feb 2, 2022 at 4:07 AM Rajib Mahapatra  
> > wrote:
> >>
> >> [Why]
> >> amdgpu error observed if suspend is aborted during S0i3
> >> resume.
> >>
> >> [How]
> >> If suspend is aborted for some reason during S0i3 resume
> >> cycle, it follows amdgpu errors in resume.
> >> Skipping SDMA ip in suspend solves the issue on RENOIR
> >> (green sardine apu) chip. This time, the system is
> >> able to resume gracefully even the suspend is aborted.
> >>
> >> Signed-off-by: Rajib Mahapatra 
> >> ---
> >>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 10 ++
> >>   1 file changed, 10 insertions(+)
> >>
> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> >> index 7931132ce6e3..f01b1d7f 100644
> >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> >> @@ -2927,6 +2927,16 @@ static int amdgpu_device_ip_suspend_phase2(struct 
> >> amdgpu_device *adev)
> >>   adev->ip_blocks[i].version->type == 
> >> AMD_IP_BLOCK_TYPE_GFX))
> >>  continue;
> >>
> >> +   /* skip suspend of sdma for S0ix
> >> +* Resume has issues if the suspend is aborted during S0i3 
> >> cycle.
> >> +* Skipping sdma for RN/CZN/BRC chip - green sardine apu.
> >> +*/
> >> +   if (adev->in_s0ix &&
> >> +   (adev->asic_type == CHIP_RENOIR &&
> >> +(adev->pdev->device == 0x15e7 || adev->pdev->device 
> >> == 0x1638) &&
> >
> > The check here seems to contradict the comment above.  Is this all
> > Renoir based APUs or just green sardine?  If it's just green sardine,
> > you can check the APU flags rather than the PCI ids.  E.g.,
> > (adev->apu_flags & AMD_APU_IS_GREEN_SARDINE)
> > Also move this to sdma 4 code as Mario suggested.
>
> Both RN and green sardine share the same flows for SMU, I would think it
> should just be match (adev->in_s0xi && (adev->flags & AMD_IS_APU)) when
> it's moved to skip suspend.

The SDMA 4.0 code is shared with Raven1/2 and Picasso as well.  We
should verify that it's required for them as well.

Alex

>
> >
> > Alex
> >
> >> +adev->ip_blocks[i].version->type == 
> >> AMD_IP_BLOCK_TYPE_SDMA))
> >> +   continue;
> >> +
> >>  /* XXX handle errors */
> >>  r = adev->ip_blocks[i].version->funcs->suspend(adev);
> >>  /* XXX handle errors */
> >> --
> >> 2.25.1
> >>
>


Re: [PATCH] drm/amdgpu: Handle the GPU recovery failure in SRIOV environment.

2022-02-02 Thread Felix Kuehling

Am 2022-02-01 um 16:47 schrieb Surbhi Kakarya:

This patch handles the GPU recovery failure in sriov environment by
retrying the reset if the first reset fails. To determine the condition of 
retry, a
new function amdgpu_is_retry_sriov_reset() is added which returns true if 
failure is due
to ETIMEDOUT, EINVAL or EBUSY, otherwise return false. MAX_RETRY_LIMIT is used 
to
limit the retry to 2.

It also handles the return status in Post Asic Reset by updating the return code
with asic_reset_res and eventually return the return code in 
amdgpu_job_timedout().

Signed-off-by: Surbhi Kakarya 
Change-Id: Ib2e408819b4780e6963e1dc078c3410cd512e6e8
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 47 --
  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c|  6 ++-
  2 files changed, 49 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 53af2623c58f..f50c18cb38c8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -89,6 +89,7 @@ MODULE_FIRMWARE("amdgpu/vangogh_gpu_info.bin");
  MODULE_FIRMWARE("amdgpu/yellow_carp_gpu_info.bin");
  
  #define AMDGPU_RESUME_MS		2000

+#define MAX_RETRY_LIMIT2
  
  const char *amdgpu_asic_name[] = {

"TAHITI",
@@ -5026,11 +5027,27 @@ static int amdgpu_device_suspend_display_audio(struct 
amdgpu_device *adev)
return 0;
  }
  
+/**

+ * amdgpu_is_retry_sriov_reset - check if we should retry sriov reset
+ *
+ * Check amdgpu_is_retry_sriov_reset and return status to see if we should 
retry reset.
+ */
+static bool amdgpu_is_retry_sriov_reset(int r)
+{
+
+if(r == -EBUSY || r == -ETIMEDOUT || r == -EINVAL)
+return true;
+else
+return false;
+
+}


The missing space between "if" and "(" should cause a checkpatch coding 
style warning. Please run your patch through checkpatch.pl.


That said, this function could be much simpler, maybe even a macro instead:

#define RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) 
== -EINVAL)


Regards,
  Felix



+
  static void amdgpu_device_recheck_guilty_jobs(
struct amdgpu_device *adev, struct list_head *device_list_handle,
struct amdgpu_reset_context *reset_context)
  {
int i, r = 0;
+   int retry_limit = 0;
  
  	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {

struct amdgpu_ring *ring = adev->rings[i];
@@ -5064,8 +5081,18 @@ static void amdgpu_device_recheck_guilty_jobs(
if (amdgpu_sriov_vf(adev)) {
amdgpu_virt_fini_data_exchange(adev);
r = amdgpu_device_reset_sriov(adev, false);
-   if (r)
+   if (r) {
adev->asic_reset_res = r;
+   if (amdgpu_is_retry_sriov_reset(r)) {
+   adev->asic_reset_res = 0;
+   if (retry_limit < 
MAX_RETRY_LIMIT) {
+   retry_limit++;
+   goto retry;
+   }
+   else
+   DRM_ERROR("GPU reset retry 
is beyond the retry limit\n");
+   }
+   }
} else {
clear_bit(AMDGPU_SKIP_HW_RESET,
  &reset_context->flags);
@@ -5122,6 +5149,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
bool locked = false;
int tmp_vram_lost_counter;
struct amdgpu_reset_context reset_context;
+   int retry_limit = 0;
  
  	memset(&reset_context, 0, sizeof(reset_context));
  
@@ -5299,8 +5327,18 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,

/* Host driver will handle XGMI hive reset for SRIOV */
if (amdgpu_sriov_vf(adev)) {
r = amdgpu_device_reset_sriov(adev, job ? false : true);
-   if (r)
-   adev->asic_reset_res = r;
+if (r) {
+adev->asic_reset_res = r;
+if (amdgpu_is_retry_sriov_reset(r)) {
+   adev->asic_reset_res = 0;
+   if (retry_limit < MAX_RETRY_LIMIT) {
+   retry_limit++;
+   goto retry;
+   }
+   else
+   DRM_ERROR("GPU reset retry is beyond the 
retry limit\n");
+}
+}
} else {
r = amdgpu_do_asic_reset(devic

[RFC v4] drm/amdgpu: Rework reset domain to be refcounted.

2022-02-02 Thread Andrey Grodzovsky
The reset domain now contains the register access semaphore
and so needs to be present as long as each device
in a hive needs it, so it cannot be bound to the XGMI
hive life cycle.
Address this by making the reset domain refcounted and pointed
to by each member of the hive and the hive itself.

v4:
Fix crash on boot with XGMI hive by adding type to reset_domain.
XGMI will only create a new reset_domain if the previous one was of single
device type, meaning it's the first boot. Otherwise it will take a
refcount to the existing reset_domain from the amdgpu device.

Signed-off-by: Andrey Grodzovsky 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h|  6 +--
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 44 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c  | 38 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h  | 18 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   | 29 +++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h   |  2 +-
 drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c  |  4 +-
 drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c  |  4 +-
 drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c  |  4 +-
 9 files changed, 118 insertions(+), 31 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 8e96b9a14452..f2ba460bfd59 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -813,9 +813,7 @@ struct amd_powerplay {
 #define AMDGPU_RESET_MAGIC_NUM 64
 #define AMDGPU_MAX_DF_PERFMONS 4
 
-struct amdgpu_reset_domain {
-   struct workqueue_struct *wq;
-};
+struct amdgpu_reset_domain;
 
 struct amdgpu_device {
struct device   *dev;
@@ -1102,7 +1100,7 @@ struct amdgpu_device {
struct amdgpu_reset_control *reset_cntl;
uint32_t
ip_versions[HW_ID_MAX][HWIP_MAX_INSTANCE];
 
-   struct amdgpu_reset_domain  reset_domain;
+   struct amdgpu_reset_domain  *reset_domain;
 };
 
 static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index fef952ca8db5..cd1b7af69c35 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2313,7 +2313,7 @@ static int amdgpu_device_init_schedulers(struct 
amdgpu_device *adev)
 
r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
   ring->num_hw_submission, 
amdgpu_job_hang_limit,
-  timeout, adev->reset_domain.wq, 
ring->sched_score, ring->name);
+  timeout, adev->reset_domain->wq, 
ring->sched_score, ring->name);
if (r) {
DRM_ERROR("Failed to create scheduler on ring %s.\n",
  ring->name);
@@ -2432,24 +2432,22 @@ static int amdgpu_device_ip_init(struct amdgpu_device 
*adev)
if (r)
goto init_failed;
 
+   /**
+* In case of XGMI grab extra reference for reset domain for this device
+*/
if (adev->gmc.xgmi.num_physical_nodes > 1) {
-   struct amdgpu_hive_info *hive;
-
-   amdgpu_xgmi_add_device(adev);
+   if (amdgpu_xgmi_add_device(adev) == 0) {
+   struct amdgpu_hive_info *hive = 
amdgpu_get_xgmi_hive(adev);
 
-   hive = amdgpu_get_xgmi_hive(adev);
-   if (!hive || !hive->reset_domain.wq) {
-   DRM_ERROR("Failed to obtain reset domain info for XGMI 
hive:%llx", hive->hive_id);
-   r = -EINVAL;
-   goto init_failed;
-   }
+   if (!hive->reset_domain ||
+   
!kref_get_unless_zero(&hive->reset_domain->refcount)) {
+   r = -ENOENT;
+   goto init_failed;
+   }
 
-   adev->reset_domain.wq = hive->reset_domain.wq;
-   } else {
-   adev->reset_domain.wq = 
alloc_ordered_workqueue("amdgpu-reset-dev", 0);
-   if (!adev->reset_domain.wq) {
-   r = -ENOMEM;
-   goto init_failed;
+   /* Drop the early temporary reset domain we created for 
device */
+   kref_put(&adev->reset_domain->refcount, 
amdgpu_reset_destroy_reset_domain);
+   adev->reset_domain = hive->reset_domain;
}
}
 
@@ -3599,6 +3597,15 @@ int amdgpu_device_init(struct amdgpu_device *adev,
return r;
}
 
+   /*
+* Reset domain needs to be present early, before XGMI hive discovered
+* (if any) and intitialized to use reset sem and in_gpu reset flag
+* early on during init.
+*/
+   adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE 
,"amdgpu-reset-dev");
+   if (!adev->reset_do

[bug report] drm/amd/display: refactor destructive verify link cap sequence

2022-02-02 Thread Dan Carpenter
Hello Wenjing Liu,

The patch 1a206273c322: "drm/amd/display: refactor destructive verify
link cap sequence" from Jan 28, 2022, leads to the following Smatch
static checker warning:

drivers/gpu/drm/amd/amdgpu/../display/dc/core/dc_link_dp.c:3248 
dp_verify_link_cap()
error: uninitialized symbol 'status'.

drivers/gpu/drm/amd/amdgpu/../display/dc/core/dc_link_dp.c
3192 static bool dp_verify_link_cap(
3193 struct dc_link *link,
3194 struct dc_link_settings *known_limit_link_setting,
3195 int *fail_count)
3196 {
3197 struct dc_link_settings cur_link_settings = {0};
3198 struct dc_link_settings initial_link_settings = 
*known_limit_link_setting;
3199 bool success = false;
3200 bool skip_video_pattern;
3201 enum clock_source_id dp_cs_id = get_clock_source_id(link);
3202 enum link_training_result status;
3203 union hpd_irq_data irq_data;
3204 struct link_resource link_res;
3205 
3206 memset(&irq_data, 0, sizeof(irq_data));
3207 cur_link_settings = initial_link_settings;
3208 
3209 /* Grant extended timeout request */
3210 if ((link->lttpr_mode == LTTPR_MODE_NON_TRANSPARENT) && 
(link->dpcd_caps.lttpr_caps.max_ext_timeout > 0)) {
3211 uint8_t grant = 
link->dpcd_caps.lttpr_caps.max_ext_timeout & 0x80;
3212 
3213 core_link_write_dpcd(link, 
DP_PHY_REPEATER_EXTENDED_WAIT_TIMEOUT, &grant, sizeof(grant));
3214 }
3215 
3216 do {
3217 if (!get_temp_dp_link_res(link, &link_res, 
&cur_link_settings))
3218 continue;

"status" is not set on this continue path.

3219 
3220 skip_video_pattern = cur_link_settings.link_rate != 
LINK_RATE_LOW;
3221 dp_enable_link_phy(
3222 link,
3223 &link_res,
3224 link->connector_signal,
3225 dp_cs_id,
3226 &cur_link_settings);
3227 
3228 status = dc_link_dp_perform_link_training(
3229 link,
3230 &link_res,
3231 &cur_link_settings,
3232 skip_video_pattern);
3233 
3234 if (status == LINK_TRAINING_SUCCESS) {
3235 success = true;
3236 udelay(1000);
3237 if (read_hpd_rx_irq_data(link, &irq_data) == 
DC_OK &&
3238 
hpd_rx_irq_check_link_loss_status(
3239 link,
3240 &irq_data))
3241 (*fail_count)++;
3242 
3243 } else {
3244 (*fail_count)++;
3245 }
3246 dp_disable_link_phy(link, &link_res, 
link->connector_signal);
3247 } while (!success && decide_fallback_link_setting(link,
--> 3248 initial_link_settings, &cur_link_settings, 
status));

^^
Uninitialized variable

3249 
3250 link->verified_link_cap = success ?
3251 cur_link_settings : fail_safe_link_settings;
3252 return success;
3253 }

regards,
dan carpenter


[PATCH] drm/amd/display: Handle removed connector in early_unregister

2022-02-02 Thread Fangzhi Zuo
From: Wayne Lin 

[Why]
commit "drm/amd/display: turn DPMS off on connector unplug" and
commit "drm/amd/display: Clear dc remote sinks on MST disconnect"
were trying to resolve the resource problem when connectors get
disconnected under MST scenarios. However, these patches don't
really clean up all remote sinks, nor do they turn DPMS off on all affected
streams. Also, these can't handle disconnected connectors reported by CSN.

[How]
- Revise commit "drm/amd/display: turn DPMS off on connector unplug"
a bit to handle the non-MST case only.
- Revert commit "drm/amd/display: Clear dc remote sinks on MST disconnect"
- Revise a bit the logic in above patches and change to turn DPMS
off/clear dc remote sink within amdgpu_dm_mst_connector_early_unregister().
Since drm will call .early_unregister for all disconnected connectors,
we can ensure to also handle disconnected connectors reported by CSN.

Signed-off-by: Wayne Lin 
Signed-off-by: Fangzhi Zuo 
---
 .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c |  7 
 .../display/amdgpu_dm/amdgpu_dm_mst_types.c   | 41 +--
 .../gpu/drm/amd/display/dc/core/dc_stream.c   | 12 ++
 drivers/gpu/drm/amd/display/dc/dc_stream.h|  1 +
 4 files changed, 58 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c 
b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index f5941e59e5ad..529b3ddaa10b 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -3034,6 +3034,7 @@ static void handle_hpd_irq_helper(struct 
amdgpu_dm_connector *aconnector)
struct drm_connector *connector = &aconnector->base;
struct drm_device *dev = connector->dev;
enum dc_connection_type new_connection_type = dc_connection_none;
+   enum dc_connection_type old_connection_type = aconnector->dc_link->type;
struct amdgpu_device *adev = drm_to_adev(dev);
struct dm_connector_state *dm_con_state = 
to_dm_connector_state(connector->state);
struct dm_crtc_state *dm_crtc_state = NULL;
@@ -3074,7 +3075,13 @@ static void handle_hpd_irq_helper(struct 
amdgpu_dm_connector *aconnector)
drm_kms_helper_hotplug_event(dev);
 
} else if (dc_link_detect(aconnector->dc_link, DETECT_REASON_HPD)) {
+   /**
+* MST cases are handled within .early_unregister where we
+* can handle disconnected conectors reported by long HPD
+* and CSN.
+*/
if (new_connection_type == dc_connection_none &&
+   old_connection_type != dc_connection_mst_branch &&
aconnector->dc_link->type == dc_connection_none &&
dm_crtc_state)
dm_set_dpms_off(aconnector->dc_link, dm_crtc_state);
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c 
b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c
index 8e97d21bdf5c..411b55596b00 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c
@@ -139,11 +139,46 @@ amdgpu_dm_mst_connector_late_register(struct 
drm_connector *connector)
 static void
 amdgpu_dm_mst_connector_early_unregister(struct drm_connector *connector)
 {
-   struct amdgpu_dm_connector *amdgpu_dm_connector =
-   to_amdgpu_dm_connector(connector);
-   struct drm_dp_mst_port *port = amdgpu_dm_connector->port;
+   struct amdgpu_dm_connector *aconnector =
+to_amdgpu_dm_connector(connector);
+   struct drm_dp_mst_port *port = aconnector->port;
+   struct dc_stream_update stream_update;
+   struct dc_stream_state *stream_state;
+   struct drm_device *ddev = aconnector->base.dev;
+   struct amdgpu_device *adev = drm_to_adev(ddev);
+   struct dc_link *dc_link = aconnector->dc_link;
+   struct dc_sink *dc_sink = aconnector->dc_sink;
+   bool dpms_off = true;
 
drm_dp_mst_connector_early_unregister(connector, port);
+
+   ASSERT(dc_link);
+
+   if (dc_sink) {
+   mutex_lock(&ddev->mode_config.mutex);
+   mutex_lock(&adev->dm.dc_lock);
+
+   memset(&stream_update, 0, sizeof(stream_update));
+   stream_update.dpms_off = &dpms_off;
+
+   /*set stream dpms_off*/
+   stream_state = dc_stream_get_stream_by_sink(dc_sink);
+   if (stream_state != NULL) {
+   stream_update.stream = stream_state;
+   
dc_commit_updates_for_stream(stream_state->ctx->dc, NULL, 0,
+   
stream_state, &stream_update,
+   
stream_state->ctx->dc->current_sta

Re: [RFC v3 00/12] Define and use reset domain for GPU recovery in amdgpu

2022-02-02 Thread Andrey Grodzovsky
Just another ping, with Shyun's help I was able to do some smoke testing 
on XGMI SRIOV system (booting and triggering hive reset)

and for now looks good.

Andrey

On 2022-01-28 14:36, Andrey Grodzovsky wrote:
Just a gentle ping if people have more comments on this patch set ? 
Especially last 5 patches

as first 7 are exact same as V2 and we already went over them mostly.

Andrey

On 2022-01-25 17:37, Andrey Grodzovsky wrote:
This patchset is based on earlier work by Boris[1] that allowed to 
have an

ordered workqueue at the driver level that will be used by the different
schedulers to queue their timeout work. On top of that I also serialized
any GPU reset we trigger from within amdgpu code to also go through 
the same
ordered wq and in this way simplify somewhat our GPU reset code so we 
don't need
to protect from concurrency by multiple GPU reset triggers such as 
TDR on one

hand and sysfs trigger or RAS trigger on the other hand.

As advised by Christian and Daniel I defined a reset_domain struct 
such that
all the entities that go through reset together will be serialized 
one against

another.

TDR triggered by multiple entities within the same domain due to the 
same reason will not
be triggered as the first such reset will cancel all the pending 
resets. This is
relevant only to TDR timers and not to triggered resets coming from 
RAS or SYSFS,

those will still happen after the in-flight resets finish.

v2:
Add handling on SRIOV configuration, the reset notify coming from host
and driver already trigger a work queue to handle the reset so drop this
intermediate wq and send directly to timeout wq. (Shaoyun)

v3:
Lijo suggested putting 'adev->in_gpu_reset' in amdgpu_reset_domain 
struct.
I followed his advice and also moved adev->reset_sem into the same place. 
This

in turn caused to do some follow-up refactor of the original patches
where I decoupled amdgpu_reset_domain life cycle from XGMI hive 
because hive is destroyed and
reconstructed for the case of reset the devices in the XGMI hive 
during probe for SRIOV See [2]
while we need the reset sem and gpu_reset flag to always be present. 
This was attained
by adding refcount to amdgpu_reset_domain so each device can safely 
point to it as long as

it needs.


[1] 
https://patchwork.kernel.org/project/dri-devel/patch/20210629073510.2764391-3-boris.brezil...@collabora.com/

[2] https://www.spinics.net/lists/amd-gfx/msg58836.html

P.S Going through drm-misc-next and not amd-staging-drm-next as Boris 
work hasn't landed yet there.


P.P.S Patches 8-12 are the refactor on top of the original V2 patchset.

P.P.P.S I wasn't able yet to test the reworked code on XGMI SRIOV 
system because drm-misc-next fails to load there.
Would appreciate it if maybe jingwech can try it on his system like he 
tested V2.


Andrey Grodzovsky (12):
   drm/amdgpu: Introduce reset domain
   drm/amdgpu: Move scheduler init to after XGMI is ready
   drm/amdgpu: Fix crash on modprobe
   drm/amdgpu: Serialize non TDR gpu recovery with TDRs
   drm/amd/virt: For SRIOV send GPU reset directly to TDR queue.
   drm/amdgpu: Drop hive->in_reset
   drm/amdgpu: Drop concurrent GPU reset protection for device
   drm/amdgpu: Rework reset domain to be refcounted.
   drm/amdgpu: Move reset sem into reset_domain
   drm/amdgpu: Move in_gpu_reset into reset_domain
   drm/amdgpu: Rework amdgpu_device_lock_adev
   Revert 'drm/amdgpu: annotate a false positive recursive locking'

  drivers/gpu/drm/amd/amdgpu/amdgpu.h   |  15 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c   |  10 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c    | 275 ++
  drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c |  43 +--
  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c   |   2 +-
  .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c    |  18 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c |  39 +++
  drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h |  12 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h  |   2 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c  |  24 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h  |   3 +-
  drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c    |   6 +-
  drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c |  14 +-
  drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c |  19 +-
  drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c |  19 +-
  drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c |  11 +-
  16 files changed, 313 insertions(+), 199 deletions(-)



[PATCH] drm/amd/display: Use NULL pointer instead of plain integer

2022-02-02 Thread Magali Lemes
Assigning 0L to a pointer variable caused the following warning:

drivers/gpu/drm/amd/amdgpu/../display/dc/dml/dsc/rc_calc_fpu.c:71:40:
warning: Using plain integer as NULL pointer

In order to remove this warning, this commit assigns a NULL pointer to
the pointer variable that caused this issue.

Reported-by: kernel test robot 
Signed-off-by: Magali Lemes 
---
 drivers/gpu/drm/amd/display/dc/dml/dsc/rc_calc_fpu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/display/dc/dml/dsc/rc_calc_fpu.c 
b/drivers/gpu/drm/amd/display/dc/dml/dsc/rc_calc_fpu.c
index ec636d06e18c..ef75eb7d5adc 100644
--- a/drivers/gpu/drm/amd/display/dc/dml/dsc/rc_calc_fpu.c
+++ b/drivers/gpu/drm/amd/display/dc/dml/dsc/rc_calc_fpu.c
@@ -68,7 +68,7 @@ static void get_qp_set(qp_set qps, enum colour_mode cm, enum 
bits_per_comp bpc,
int sel = table_hash(mode, bpc, max_min);
int table_size = 0;
int index;
-   const struct qp_entry *table = 0L;
+   const struct qp_entry *table = NULL;
 
// alias enum
enum { min = DAL_MM_MIN, max = DAL_MM_MAX };
-- 
2.25.1



Re: [PATCH] drm/amd/display: Use NULL pointer instead of plain integer

2022-02-02 Thread Alex Deucher
Applied.  Thanks!

Alex

On Wed, Feb 2, 2022 at 5:20 PM Magali Lemes  wrote:
>
> Assigning 0L to a pointer variable caused the following warning:
>
> drivers/gpu/drm/amd/amdgpu/../display/dc/dml/dsc/rc_calc_fpu.c:71:40:
> warning: Using plain integer as NULL pointer
>
> In order to remove this warning, this commit assigns a NULL pointer to
> the pointer variable that caused this issue.
>
> Reported-by: kernel test robot 
> Signed-off-by: Magali Lemes 
> ---
>  drivers/gpu/drm/amd/display/dc/dml/dsc/rc_calc_fpu.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/display/dc/dml/dsc/rc_calc_fpu.c 
> b/drivers/gpu/drm/amd/display/dc/dml/dsc/rc_calc_fpu.c
> index ec636d06e18c..ef75eb7d5adc 100644
> --- a/drivers/gpu/drm/amd/display/dc/dml/dsc/rc_calc_fpu.c
> +++ b/drivers/gpu/drm/amd/display/dc/dml/dsc/rc_calc_fpu.c
> @@ -68,7 +68,7 @@ static void get_qp_set(qp_set qps, enum colour_mode cm, 
> enum bits_per_comp bpc,
> int sel = table_hash(mode, bpc, max_min);
> int table_size = 0;
> int index;
> -   const struct qp_entry *table = 0L;
> +   const struct qp_entry *table = NULL;
>
> // alias enum
> enum { min = DAL_MM_MIN, max = DAL_MM_MAX };
> --
> 2.25.1
>


[PATCH] drm/amd/pm: add missing prototypes to amdgpu_dpm_internal

2022-02-02 Thread Maíra Canal
Include the header with the prototype to silence the following clang
warnings:

drivers/gpu/drm/amd/amdgpu/../pm/amdgpu_dpm_internal.c:29:6: warning: no
previous prototype for function 'amdgpu_dpm_get_active_displays'
[-Wmissing-prototypes]
void amdgpu_dpm_get_active_displays(struct amdgpu_device *adev)
 ^
drivers/gpu/drm/amd/amdgpu/../pm/amdgpu_dpm_internal.c:29:1: note: declare
'static' if the function is not intended to be used outside of this
translation unit
void amdgpu_dpm_get_active_displays(struct amdgpu_device *adev)
^
static
drivers/gpu/drm/amd/amdgpu/../pm/amdgpu_dpm_internal.c:76:5: warning: no
previous prototype for function 'amdgpu_dpm_get_vrefresh'
[-Wmissing-prototypes]
u32 amdgpu_dpm_get_vrefresh(struct amdgpu_device *adev)
^
drivers/gpu/drm/amd/amdgpu/../pm/amdgpu_dpm_internal.c:76:1: note: declare
'static' if the function is not intended to be used outside of this
translation unit
u32 amdgpu_dpm_get_vrefresh(struct amdgpu_device *adev)
^
static
2 warnings generated.

Besides that, remove the duplicated prototype of the function
amdgpu_dpm_get_vblank_time in order to keep the consistency of the
headers.

fixes: 6ddbd37f ("drm/amd/pm: optimize the amdgpu_pm_compute_clocks()
implementations")

Signed-off-by: Maíra Canal 
---
 drivers/gpu/drm/amd/pm/amdgpu_dpm_internal.c | 1 +
 drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h  | 1 -
 drivers/gpu/drm/amd/pm/legacy-dpm/si_dpm.c   | 1 +
 3 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/pm/amdgpu_dpm_internal.c 
b/drivers/gpu/drm/amd/pm/amdgpu_dpm_internal.c
index ba5f6413412d..42efe838fa85 100644
--- a/drivers/gpu/drm/amd/pm/amdgpu_dpm_internal.c
+++ b/drivers/gpu/drm/amd/pm/amdgpu_dpm_internal.c
@@ -25,6 +25,7 @@
 #include "amdgpu_display.h"
 #include "hwmgr.h"
 #include "amdgpu_smu.h"
+#include "amdgpu_dpm_internal.h"
 
 void amdgpu_dpm_get_active_displays(struct amdgpu_device *adev)
 {
diff --git a/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h 
b/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h
index 5cc05110cdae..09790413cbc4 100644
--- a/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h
+++ b/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h
@@ -343,7 +343,6 @@ struct amdgpu_pm {
struct amdgpu_ctx   *stable_pstate_ctx;
 };
 
-u32 amdgpu_dpm_get_vblank_time(struct amdgpu_device *adev);
 int amdgpu_dpm_read_sensor(struct amdgpu_device *adev, enum amd_pp_sensors 
sensor,
   void *data, uint32_t *size);
 
diff --git a/drivers/gpu/drm/amd/pm/legacy-dpm/si_dpm.c 
b/drivers/gpu/drm/amd/pm/legacy-dpm/si_dpm.c
index 7427c50409d4..caae54487f9c 100644
--- a/drivers/gpu/drm/amd/pm/legacy-dpm/si_dpm.c
+++ b/drivers/gpu/drm/amd/pm/legacy-dpm/si_dpm.c
@@ -28,6 +28,7 @@
 #include "amdgpu_pm.h"
 #include "amdgpu_dpm.h"
 #include "amdgpu_atombios.h"
+#include "amdgpu_dpm_internal.h"
 #include "amd_pcie.h"
 #include "sid.h"
 #include "r600_dpm.h"
-- 
2.34.1



[pull] amdgpu drm-fixes-5.17

2022-02-02 Thread Alex Deucher
Hi Dave, Daniel,

Fixes for 5.17.

The following changes since commit 26291c54e111ff6ba87a164d85d4a4e134b7315c:

  Linux 5.17-rc2 (2022-01-30 15:37:07 +0200)

are available in the Git repository at:

  https://gitlab.freedesktop.org/agd5f/linux.git 
tags/amd-drm-fixes-5.17-2022-02-02

for you to fetch changes up to e8ae38720e1a685fd98cfa5ae118c9d07b45ca79:

  drm/amdgpu: fix logic inversion in check (2022-02-02 18:35:00 -0500)


amd-drm-fixes-5.17-2022-02-02:

amdgpu:
- mGPU fan boost fix for beige goby
- S0ix fixes
- Cyan skillfish hang fix
- DCN fixes for DCN 3.1
- DCN fixes for DCN 3.01
- Apple retina panel fix
- ttm logic inversion fix


Agustin Gutierrez (1):
  drm/amd/display: Update watermark values for DCN301

Aun-Ali Zaidi (1):
  drm/amd/display: Force link_rate as LINK_RATE_RBR2 for 2018 15" Apple 
Retina panels

Christian König (1):
  drm/amdgpu: fix logic inversion in check

Evan Quan (1):
  drm/amd/pm: correct the MGpuFanBoost support for Beige Goby

Lang Yu (1):
  drm/amdgpu: fix a potential GPU hang on cyan skillfish

Mario Limonciello (4):
  drm/amd: Warn users about potential s0ix problems
  drm/amd: add support to check whether the system is set to s3
  drm/amd: Only run s3 or s0ix if system is configured properly
  drm/amd: avoid suspend on dGPUs w/ s2idle support when runtime PM enabled

Paul Hsieh (1):
  drm/amd/display: watermark latencies is not enough on DCN31

Zhan Liu (1):
  drm/amd/display: revert "Reset fifo after enable otg"

 drivers/gpu/drm/amd/amdgpu/amdgpu.h| 10 --
 drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c   | 37 +++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c| 11 +--
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c|  2 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c |  3 ++
 .../drm/amd/display/dc/clk_mgr/dcn301/vg_clk_mgr.c | 16 +-
 .../amd/display/dc/clk_mgr/dcn31/dcn31_clk_mgr.c   | 20 ++--
 drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c   | 20 
 .../amd/display/dc/dce110/dce110_hw_sequencer.c|  5 ---
 .../amd/display/dc/dcn10/dcn10_stream_encoder.c| 15 -
 .../amd/display/dc/dcn10/dcn10_stream_encoder.h|  3 --
 .../amd/display/dc/dcn20/dcn20_stream_encoder.c|  2 --
 .../display/dc/dcn30/dcn30_dio_stream_encoder.c|  2 --
 .../gpu/drm/amd/display/dc/inc/hw/stream_encoder.h |  4 ---
 .../drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c|  6 ++--
 15 files changed, 94 insertions(+), 62 deletions(-)


[PATCH -next] drm/amdkfd: Fix resource_size.cocci warning

2022-02-02 Thread Yang Li
Use resource_size function on resource object instead of explicit
computation.

Eliminate the following coccicheck warning:
./drivers/gpu/drm/amd/amdkfd/kfd_migrate.c:978:11-14: ERROR: Missing
resource_size with res

Reported-by: Abaci Robot 
Signed-off-by: Yang Li 
---
 drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
index 8430f6475723..d4287a39be56 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
@@ -975,7 +975,7 @@ int svm_migrate_init(struct amdgpu_device *adev)
pgmap->type = 0;
if (pgmap->type == MEMORY_DEVICE_PRIVATE)
devm_release_mem_region(adev->dev, res->start,
-   res->end - res->start + 1);
+   resource_size(res));
return PTR_ERR(r);
}
 
-- 
2.20.1.7.g153144c