Re: [RFC v4 08/11] drm/amdgpu: Move reset sem into reset_domain

2022-02-08 Thread Christian König

Am 09.02.22 um 01:23 schrieb Andrey Grodzovsky:

We want a single instance of the reset semaphore across all
reset clients because in the case of XGMI we must stop
cross-device MMIO access, since any device in the hive
could be in reset at that moment.

Signed-off-by: Andrey Grodzovsky 


Reviewed-by: Christian König 


---
  drivers/gpu/drm/amd/amdgpu/amdgpu.h   |  1 -
  drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c   | 10 
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c| 23 +--
  .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c| 18 ---
  drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c |  2 ++
  drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h |  1 +
  drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c|  6 +++--
  drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 14 ++-
  drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c |  4 ++--
  drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c |  4 ++--
  10 files changed, 46 insertions(+), 37 deletions(-)
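
For orientation, a minimal sketch of the layout this patch ends up with -
a simplified amdgpu_reset_domain is assumed (the real struct carries more
state) and the helper names below are illustrative, not the driver's:

#include <linux/kref.h>
#include <linux/rwsem.h>
#include <linux/workqueue.h>

/* One instance shared by every device in the reset domain (the whole
 * hive for XGMI), so the semaphore below is hive-global. */
struct amdgpu_reset_domain {
	struct kref refcount;
	struct workqueue_struct *wq;	/* ordered reset/TDR workqueue */
	struct rw_semaphore sem;	/* readers: MMIO users, writer: reset */
};

/* MMIO paths take the shared lock for reading; failure means some hive
 * member is in reset, so cross-device register access must back off. */
static bool sketch_mmio_begin(struct amdgpu_device *adev)
{
	return down_read_trylock(&adev->reset_domain->sem);
}

/* The reset path takes the same lock for writing, excluding MMIO users
 * on all devices of the hive at once. */
static int sketch_reset_begin(struct amdgpu_device *adev)
{
	return down_write_killable(&adev->reset_domain->sem);
}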

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index cb9764513df8..ddfbcc8fd3d3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1058,7 +1058,6 @@ struct amdgpu_device {
  
  	atomic_t 			in_gpu_reset;

enum pp_mp1_state   mp1_state;
-   struct rw_semaphore reset_sem;
struct amdgpu_doorbell_index doorbell_index;
  
  	struct mutex			notifier_lock;

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 25e2e5bf90eb..c3728061d65a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -37,6 +37,8 @@
  #include "amdgpu_fw_attestation.h"
  #include "amdgpu_umr.h"
  
+#include "amdgpu_reset.h"

+
  #if defined(CONFIG_DEBUG_FS)
  
  /**

@@ -1279,7 +1281,7 @@ static int amdgpu_debugfs_test_ib_show(struct seq_file 
*m, void *unused)
}
  
  	/* Avoid accidently unparking the sched thread during GPU reset */

-   r = down_write_killable(&adev->reset_sem);
+   r = down_write_killable(&adev->reset_domain->sem);
if (r)
return r;
  
@@ -1308,7 +1310,7 @@ static int amdgpu_debugfs_test_ib_show(struct seq_file *m, void *unused)

kthread_unpark(ring->sched.thread);
}
  
-	up_write(&adev->reset_sem);

+   up_write(&adev->reset_domain->sem);
  
  	pm_runtime_mark_last_busy(dev->dev);

pm_runtime_put_autosuspend(dev->dev);
@@ -1517,7 +1519,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
return -ENOMEM;
  
  	/* Avoid accidently unparking the sched thread during GPU reset */

-   r = down_read_killable(&adev->reset_sem);
+   r = down_read_killable(&adev->reset_domain->sem);
if (r)
goto pro_end;
  
@@ -1560,7 +1562,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)

/* restart the scheduler */
kthread_unpark(ring->sched.thread);
  
-	up_read(&adev->reset_sem);

+   up_read(&adev->reset_domain->sem);
  
  	ttm_bo_unlock_delayed_workqueue(&adev->mman.bdev, resched);
  
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

index d61bc0a0457c..dcbb175d336f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -424,10 +424,10 @@ bool amdgpu_device_skip_hw_access(struct amdgpu_device 
*adev)
 * the lock.
 */
if (in_task()) {
-   if (down_read_trylock(&adev->reset_sem))
-   up_read(&adev->reset_sem);
+   if (down_read_trylock(&adev->reset_domain->sem))
+   up_read(&adev->reset_domain->sem);
else
-   lockdep_assert_held(&adev->reset_sem);
+   lockdep_assert_held(&adev->reset_domain->sem);
}
  #endif
return false;
@@ -453,9 +453,9 @@ uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
if ((reg * 4) < adev->rmmio_size) {
if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
amdgpu_sriov_runtime(adev) &&
-   down_read_trylock(&adev->reset_sem)) {
+   down_read_trylock(&adev->reset_domain->sem)) {
ret = amdgpu_kiq_rreg(adev, reg);
-   up_read(&adev->reset_sem);
+   up_read(&adev->reset_domain->sem);
} else {
ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
}
@@ -538,9 +538,9 @@ void amdgpu_device_wreg(struct amdgpu_device *adev,
if ((reg * 4) < adev->rmmio_size) {
if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
amdgpu_sriov_runtime(adev) &&
-   down_read_trylock(&adev->reset_sem)) {
+   down_read_trylock(&adev->reset_domain->sem)) {
amdgpu_kiq_wreg(adev, reg, v);
-   up_read(&adev->reset_sem);
+   up_read(&adev->reset_domain->sem);
} 

Re: [RFC v4 07/11] drm/amdgpu: Rework reset domain to be refcounted.

2022-02-08 Thread Christian König

Am 09.02.22 um 01:23 schrieb Andrey Grodzovsky:

The reset domain now contains the register access semaphore
and so needs to be present for as long as each device
in a hive needs it; it therefore cannot be bound to the
XGMI hive life cycle.
Address this by making the reset domain refcounted and pointed
to by each member of the hive and by the hive itself.

v4:

Fix crash on boot with XGMI hive by adding a type to reset_domain.
XGMI will only create a new reset_domain if the previous one was of
single-device type, meaning it's the first boot. Otherwise it will
take a refcount to the existing reset_domain from the amdgpu device.

Add a wrapper around reset_domain->refcount get/put
and a wrapper around send to reset wq (Lijo)

Signed-off-by: Andrey Grodzovsky 


Acked-by: Christian König 


---
  drivers/gpu/drm/amd/amdgpu/amdgpu.h|  6 +--
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 44 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c  | 40 
  drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h  | 35 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   | 29 +++---
  drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h   |  2 +-
  drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c  |  6 ++-
  drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c  |  6 ++-
  drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c  |  6 ++-
  9 files changed, 140 insertions(+), 34 deletions(-)
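
The refcount get/put wrappers mentioned in the changelog reduce to the usual
kref pattern; a hedged sketch follows (the names match the diff below, the
bodies are illustrative, not the final implementation):

static void amdgpu_reset_destroy_reset_domain(struct kref *ref)
{
	struct amdgpu_reset_domain *domain =
		container_of(ref, struct amdgpu_reset_domain, refcount);

	destroy_workqueue(domain->wq);
	kfree(domain);
}

/* Returns false if the domain is already on its way down. */
static inline bool
amdgpu_reset_get_reset_domain(struct amdgpu_reset_domain *domain)
{
	return kref_get_unless_zero(&domain->refcount);
}

static inline void
amdgpu_reset_put_reset_domain(struct amdgpu_reset_domain *domain)
{
	kref_put(&domain->refcount, amdgpu_reset_destroy_reset_domain);
}

/* And the "send to reset wq" wrapper (Lijo's suggestion): */
static inline bool
amdgpu_reset_domain_schedule(struct amdgpu_reset_domain *domain,
			     struct work_struct *work)
{
	return queue_work(domain->wq, work);
}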

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 540a38fe5cd6..cb9764513df8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -813,9 +813,7 @@ struct amd_powerplay {
  #define AMDGPU_RESET_MAGIC_NUM 64
  #define AMDGPU_MAX_DF_PERFMONS 4
  #define AMDGPU_PRODUCT_NAME_LEN 64
-struct amdgpu_reset_domain {
-   struct workqueue_struct *wq;
-};
+struct amdgpu_reset_domain;
  
  struct amdgpu_device {

struct device   *dev;
@@ -1104,7 +1102,7 @@ struct amdgpu_device {
uint32_t
ip_versions[MAX_HWIP][HWIP_MAX_INSTANCE];
  
  	boolram_is_direct_mapped;

-   struct amdgpu_reset_domain  reset_domain;
+   struct amdgpu_reset_domain  *reset_domain;
  };
  
  static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index e3c0ec684a85..d61bc0a0457c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2316,7 +2316,7 @@ static int amdgpu_device_init_schedulers(struct 
amdgpu_device *adev)
  
  		r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
 				   ring->num_hw_submission, amdgpu_job_hang_limit,
-				   timeout, adev->reset_domain.wq, ring->sched_score, ring->name);
+				   timeout, adev->reset_domain->wq, ring->sched_score, ring->name);
if (r) {
DRM_ERROR("Failed to create scheduler on ring %s.\n",
  ring->name);
@@ -2439,24 +2439,22 @@ static int amdgpu_device_ip_init(struct amdgpu_device 
*adev)
if (r)
goto init_failed;
  
+	/**
+	 * In case of XGMI grab extra reference for reset domain for this device
+	 */
 	if (adev->gmc.xgmi.num_physical_nodes > 1) {
-		struct amdgpu_hive_info *hive;
+		if (amdgpu_xgmi_add_device(adev) == 0) {
+			struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
 
-		amdgpu_xgmi_add_device(adev);
+			if (!hive->reset_domain ||
+			    !amdgpu_reset_get_reset_domain(hive->reset_domain)) {
+				r = -ENOENT;
+				goto init_failed;
+			}
 
-		hive = amdgpu_get_xgmi_hive(adev);
-		if (!hive || !hive->reset_domain.wq) {
-			DRM_ERROR("Failed to obtain reset domain info for XGMI hive:%llx", hive->hive_id);
-			r = -EINVAL;
-			goto init_failed;
-		}
-
-		adev->reset_domain.wq = hive->reset_domain.wq;
-	} else {
-		adev->reset_domain.wq = alloc_ordered_workqueue("amdgpu-reset-dev", 0);
-		if (!adev->reset_domain.wq) {
-			r = -ENOMEM;
-			goto init_failed;
+			/* Drop the early temporary reset domain we created for device */
+			amdgpu_reset_put_reset_domain(adev->reset_domain);
+			adev->reset_domain = hive->reset_domain;
 		}
 	}
  
@@ -3640,6 +3638,15 @@ int amdgpu_device_init(struct amdgpu_device *adev,

return r;
}
  
+	/*
+	 * Reset domain needs to be present early, before XGMI hive discovered
+	 * (if any) and initialized to use reset sem and in_gpu_reset flag
+	 * early on during init.
+	 */

Re: [RFC v4 04/11] drm/amd/virt: For SRIOV send GPU reset directly to TDR queue.

2022-02-08 Thread Christian König




Am 09.02.22 um 01:23 schrieb Andrey Grodzovsky:

No need to trigger another work queue from inside the work queue.

v3:

Problem:
Extra reset caused by host-side FLR notification
following a guest-side triggered reset.
Fix: Prevent queuing flr_work from the mailbox IRQ if the guest
is already executing a reset.

Suggested-by: Liu Shaoyun 
Signed-off-by: Andrey Grodzovsky 


Feel free to add an Acked-by: Christian König, but an rb from somebody
more familiar with the code would be better.


Regards,
Christian.


---
  drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 9 ++---
  drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 9 ++---
  drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 9 ++---
  3 files changed, 18 insertions(+), 9 deletions(-)
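
A hedged sketch of the race being closed, assuming (as the series does) that
guest-triggered recovery already runs on the domain's ordered workqueue:
without the amdgpu_in_reset() check, a host FLR notification arriving while
the guest is mid-reset would queue a second, redundant recovery.

/* Illustrative only - the real handlers are in the diff below. */
static void sketch_handle_flr_notification(struct amdgpu_device *adev)
{
	/* Guest already resetting: its recovery covers the FLR too. */
	if (!amdgpu_sriov_runtime(adev) || amdgpu_in_reset(adev))
		return;

	/* Queue straight onto the ordered reset/TDR workqueue; no
	 * intermediate schedule_work() bounce is needed any more. */
	WARN_ONCE(!queue_work(adev->reset_domain.wq, &adev->virt.flr_work),
		  "Failed to queue work! at %s", __func__);
}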

diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c 
b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
index 56da5ab82987..5869d51d8bee 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
@@ -282,7 +282,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct 
*work)
if (amdgpu_device_should_recover_gpu(adev)
&& (!amdgpu_device_has_job_running(adev) ||
adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT))
-   amdgpu_device_gpu_recover(adev, NULL);
+   amdgpu_device_gpu_recover_imp(adev, NULL);
  }
  
  static int xgpu_ai_set_mailbox_rcv_irq(struct amdgpu_device *adev,

@@ -307,8 +307,11 @@ static int xgpu_ai_mailbox_rcv_irq(struct amdgpu_device 
*adev,
  
  	switch (event) {

case IDH_FLR_NOTIFICATION:
-   if (amdgpu_sriov_runtime(adev))
-   schedule_work(&adev->virt.flr_work);
+   if (amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev))
+   WARN_ONCE(!queue_work(adev->reset_domain.wq,
+ &adev->virt.flr_work),
+ "Failed to queue work! at %s",
+ __func__);
break;
case IDH_QUERY_ALIVE:
xgpu_ai_mailbox_send_ack(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c 
b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
index 477d0dde19c5..5728a6401d73 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
@@ -309,7 +309,7 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct 
*work)
adev->gfx_timeout == MAX_SCHEDULE_TIMEOUT ||
adev->compute_timeout == MAX_SCHEDULE_TIMEOUT ||
adev->video_timeout == MAX_SCHEDULE_TIMEOUT))
-   amdgpu_device_gpu_recover(adev, NULL);
+   amdgpu_device_gpu_recover_imp(adev, NULL);
  }
  
  static int xgpu_nv_set_mailbox_rcv_irq(struct amdgpu_device *adev,

@@ -337,8 +337,11 @@ static int xgpu_nv_mailbox_rcv_irq(struct amdgpu_device 
*adev,
  
  	switch (event) {

case IDH_FLR_NOTIFICATION:
-   if (amdgpu_sriov_runtime(adev))
-   schedule_work(&adev->virt.flr_work);
+   if (amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev))
+   WARN_ONCE(!queue_work(adev->reset_domain.wq,
+ &adev->virt.flr_work),
+ "Failed to queue work! at %s",
+ __func__);
break;
/* READY_TO_ACCESS_GPU is fetched by kernel polling, IRQ can 
ignore
 * it byfar since that polling thread will handle it,
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c 
b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
index aef9d059ae52..02290febfcf4 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
@@ -521,7 +521,7 @@ static void xgpu_vi_mailbox_flr_work(struct work_struct 
*work)
  
  	/* Trigger recovery due to world switch failure */

if (amdgpu_device_should_recover_gpu(adev))
-   amdgpu_device_gpu_recover(adev, NULL);
+   amdgpu_device_gpu_recover_imp(adev, NULL);
  }
  
  static int xgpu_vi_set_mailbox_rcv_irq(struct amdgpu_device *adev,

@@ -550,8 +550,11 @@ static int xgpu_vi_mailbox_rcv_irq(struct amdgpu_device 
*adev,
r = xgpu_vi_mailbox_rcv_msg(adev, IDH_FLR_NOTIFICATION);
  
  		/* only handle FLR_NOTIFY now */

-   if (!r)
-   schedule_work(&adev->virt.flr_work);
+   if (!r && !amdgpu_in_reset(adev))
+   WARN_ONCE(!queue_work(adev->reset_domain.wq,
+ &adev->virt.flr_work),
+ "Failed to queue work! at %s",
+ __func__);
}
  
  	return 0;




Re: [RFC v4 02/11] drm/amdgpu: Move scheduler init to after XGMI is ready

2022-02-08 Thread Christian König

Am 09.02.22 um 01:23 schrieb Andrey Grodzovsky:

Before we initialize the schedulers we must know which reset
domain we are in - for a single device there is a single
domain per device and so a single wq per device. For XGMI
the reset domain spans the entire XGMI hive and so the
reset wq is per hive.

Signed-off-by: Andrey Grodzovsky 


One more comment below; with that fixed, Reviewed-by: Christian König.



---
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 45 ++
  drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c  | 34 ++--
  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h   |  2 +
  3 files changed, 51 insertions(+), 30 deletions(-)
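
The ordering constraint is easiest to see from the drm_sched_init() signature
of this era, which takes the workqueue that runs the scheduler's timeout
work; a hedged sketch (the helper name is illustrative):

static int sketch_init_one_sched(struct amdgpu_device *adev,
				 struct amdgpu_ring *ring, long timeout)
{
	/* adev->reset_domain.wq must already exist - for XGMI, the
	 * hive-wide one - because every scheduler's TDR work runs on
	 * it.  Being an ordered workqueue (max_active == 1), TDRs in
	 * the domain execute strictly one at a time. */
	return drm_sched_init(&ring->sched, &amdgpu_sched_ops,
			      ring->num_hw_submission, amdgpu_job_hang_limit,
			      timeout, adev->reset_domain.wq,
			      ring->sched_score, ring->name);
}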

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 9704b0e1fd82..00123b0013d3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2287,6 +2287,47 @@ static int amdgpu_device_fw_loading(struct amdgpu_device 
*adev)
return r;
  }
  
+static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)

+{
+   long timeout;
+   int r, i;
+
+   for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
+   struct amdgpu_ring *ring = adev->rings[i];
+
+   /* No need to setup the GPU scheduler for rings that don't need 
it */
+   if (!ring || ring->no_scheduler)
+   continue;
+
+   switch (ring->funcs->type) {
+   case AMDGPU_RING_TYPE_GFX:
+   timeout = adev->gfx_timeout;
+   break;
+   case AMDGPU_RING_TYPE_COMPUTE:
+   timeout = adev->compute_timeout;
+   break;
+   case AMDGPU_RING_TYPE_SDMA:
+   timeout = adev->sdma_timeout;
+   break;
+   default:
+   timeout = adev->video_timeout;
+   break;
+   }
+
+   r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
+  ring->num_hw_submission, amdgpu_job_hang_limit,
+  timeout, adev->reset_domain.wq, ring->sched_score, ring->name);
+   if (r) {
+   DRM_ERROR("Failed to create scheduler on ring %s.\n",
+ ring->name);
+   return r;
+   }
+   }
+
+   return 0;
+}
+
+
  /**
   * amdgpu_device_ip_init - run init for hardware IPs
   *
@@ -2419,6 +2460,10 @@ static int amdgpu_device_ip_init(struct amdgpu_device 
*adev)
}
}
  
+	r = amdgpu_device_init_schedulers(adev);

+   if (r)
+   goto init_failed;
+
/* Don't init kfd if whole hive need to be reset during init */
if (!adev->gmc.xgmi.pending_reset)
amdgpu_amdkfd_device_init(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index 45977a72b5dd..fa302540c69a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -457,8 +457,6 @@ int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring,
  atomic_t *sched_score)
  {
struct amdgpu_device *adev = ring->adev;
-   long timeout;
-   int r;
  
  	if (!adev)

return -EINVAL;
@@ -478,36 +476,12 @@ int amdgpu_fence_driver_init_ring(struct amdgpu_ring 
*ring,
	spin_lock_init(&ring->fence_drv.lock);
ring->fence_drv.fences = kcalloc(num_hw_submission * 2, sizeof(void *),
 GFP_KERNEL);
-   if (!ring->fence_drv.fences)
-   return -ENOMEM;
  
-	/* No need to setup the GPU scheduler for rings that don't need it */

-   if (ring->no_scheduler)
-   return 0;
+   ring->num_hw_submission = num_hw_submission;
+   ring->sched_score = sched_score;


Let's move this into the caller and then use ring->num_hw_submission in 
the fence code as well.


The maximum number of jobs on the ring is not really fence specific.

Regards,
Christian.

  
-	switch (ring->funcs->type) {

-   case AMDGPU_RING_TYPE_GFX:
-   timeout = adev->gfx_timeout;
-   break;
-   case AMDGPU_RING_TYPE_COMPUTE:
-   timeout = adev->compute_timeout;
-   break;
-   case AMDGPU_RING_TYPE_SDMA:
-   timeout = adev->sdma_timeout;
-   break;
-   default:
-   timeout = adev->video_timeout;
-   break;
-   }
-
-   r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
-  num_hw_submission, amdgpu_job_hang_limit,
-  timeout, NULL, sched_score, ring->name);
-   if (r) {
-   DRM_ERROR("Failed to create scheduler on ring %s.\n",
- ring->name);
-   return r;
-   }
+   if (!ring->fence_drv.fences)
+   return -ENOMEM;
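
A hedged sketch of the follow-up Christian asks for above - the caller
records the ring capacity, and the fence code sizes its array from the ring
instead of taking num_hw_submission as a parameter (illustrative signature,
not the final patch):

int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring,
				  atomic_t *sched_score)
{
	/* ring->num_hw_submission was filled in by the caller. */
	ring->fence_drv.fences = kcalloc(ring->num_hw_submission * 2,
					 sizeof(void *), GFP_KERNEL);
	if (!ring->fence_drv.fences)
		return -ENOMEM;

	ring->sched_score = sched_score;
	return 0;
}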

Re: [RFC v4] drm/amdgpu: Rework reset domain to be refcounted.

2022-02-08 Thread Christian König




Am 08.02.22 um 17:19 schrieb Andrey Grodzovsky:


On 2022-02-08 06:25, Lazar, Lijo wrote:



On 2/2/2022 10:56 PM, Andrey Grodzovsky wrote:

The reset domain now contains the register access semaphore
and so needs to be present for as long as each device
in a hive needs it; it therefore cannot be bound to the
XGMI hive life cycle.
Address this by making the reset domain refcounted and pointed
to by each member of the hive and by the hive itself.

v4:
Fix crash on boot with XGMI hive by adding a type to reset_domain.
XGMI will only create a new reset_domain if the previous one was of
single-device type, meaning it's the first boot. Otherwise it will
take a refcount to the existing reset_domain from the amdgpu device.

Signed-off-by: Andrey Grodzovsky 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu.h    |  6 +--
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 44 
+-

  drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c  | 38 +++
  drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h  | 18 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   | 29 +++---
  drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h   |  2 +-
  drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c  |  4 +-
  drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c  |  4 +-
  drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c  |  4 +-
  9 files changed, 118 insertions(+), 31 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h

index 8e96b9a14452..f2ba460bfd59 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -813,9 +813,7 @@ struct amd_powerplay {
  #define AMDGPU_RESET_MAGIC_NUM 64
  #define AMDGPU_MAX_DF_PERFMONS 4
  -struct amdgpu_reset_domain {
-    struct workqueue_struct *wq;
-};
+struct amdgpu_reset_domain;
    struct amdgpu_device {
  struct device    *dev;
@@ -1102,7 +1100,7 @@ struct amdgpu_device {
  struct amdgpu_reset_control *reset_cntl;
  uint32_t ip_versions[HW_ID_MAX][HWIP_MAX_INSTANCE];
  -    struct amdgpu_reset_domain    reset_domain;
+    struct amdgpu_reset_domain    *reset_domain;
  };
    static inline struct amdgpu_device *drm_to_adev(struct 
drm_device *ddev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

index fef952ca8db5..cd1b7af69c35 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2313,7 +2313,7 @@ static int 
amdgpu_device_init_schedulers(struct amdgpu_device *adev)

    r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
 ring->num_hw_submission, amdgpu_job_hang_limit,
-   timeout, adev->reset_domain.wq, ring->sched_score, ring->name);
+   timeout, adev->reset_domain->wq, ring->sched_score, ring->name);

  if (r) {
  DRM_ERROR("Failed to create scheduler on ring %s.\n",
    ring->name);
@@ -2432,24 +2432,22 @@ static int amdgpu_device_ip_init(struct 
amdgpu_device *adev)

  if (r)
  goto init_failed;
  +    /**
+ * In case of XGMI grab extra reference for reset domain for this device
+ */
  if (adev->gmc.xgmi.num_physical_nodes > 1) {
-    struct amdgpu_hive_info *hive;
-
-    amdgpu_xgmi_add_device(adev);
+    if (amdgpu_xgmi_add_device(adev) == 0) {
+    struct amdgpu_hive_info *hive = 
amdgpu_get_xgmi_hive(adev);

  -    hive = amdgpu_get_xgmi_hive(adev);
-    if (!hive || !hive->reset_domain.wq) {
-    DRM_ERROR("Failed to obtain reset domain info for XGMI 
hive:%llx", hive->hive_id);

-    r = -EINVAL;
-    goto init_failed;
-    }
+    if (!hive->reset_domain ||
+        !kref_get_unless_zero(&hive->reset_domain->refcount)) {
+    r = -ENOENT;
+    goto init_failed;
+    }
  -    adev->reset_domain.wq = hive->reset_domain.wq;
-    } else {
-    adev->reset_domain.wq = 
alloc_ordered_workqueue("amdgpu-reset-dev", 0);

-    if (!adev->reset_domain.wq) {
-    r = -ENOMEM;
-    goto init_failed;
+    /* Drop the early temporary reset domain we created for device */
+    kref_put(&adev->reset_domain->refcount, amdgpu_reset_destroy_reset_domain);

+    adev->reset_domain = hive->reset_domain;
  }
  }
  @@ -3599,6 +3597,15 @@ int amdgpu_device_init(struct amdgpu_device 
*adev,

  return r;
  }
  +    /*
+ * Reset domain needs to be present early, before XGMI hive discovered
+ * (if any) and initialized to use reset sem and in_gpu_reset flag
+ * early on during init.
+ */
+    adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev");

+    if (!adev->reset_domain)
+    return -ENOMEM;
+
  /* early init functions */
  r = amdgpu_device_ip_early_init(adev);
  if (r)
@@ -3949,6 +3956,9 @@ void amdgpu_device_fini_sw(struct 
amdgpu_device *adev)

  if (adev->mman.discovery_bin)
  

Re: [PATCH 2/2] drm/amdgpu: add reset register trace function on GPU reset

2022-02-08 Thread Christian König

Am 08.02.22 um 16:28 schrieb Alex Deucher:

On Tue, Feb 8, 2022 at 3:17 AM Somalapuram Amaranath
 wrote:

Dump the list of register values to trace event on GPU reset.

Signed-off-by: Somalapuram Amaranath 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 21 -
  drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h  | 19 +++
  2 files changed, 39 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 1e651b959141..057922fb7e37 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4534,6 +4534,23 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device 
*adev,
 return r;
  }

+static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
+{
+   int i;
+   uint32_t reg_value[128];
+
+   for (i = 0; adev->reset_dump_reg_list[i] != 0; i++) {
+   if (adev->asic_type >= CHIP_NAVI10)

This check should be against CHIP_VEGA10.  Also, this only allows for
GC registers.  If we wanted to dump other registers, we'd need a
different macro.  Might be better to just use RREG32 here for
everything and then encode the full offset using
SOC15_REG_ENTRY_OFFSET() or a similar macro.  Also, we need to think
about how to handle gfxoff in this case.  gfxoff needs to be disabled
or we'll hang the chip if we try and read GC or SDMA registers via
MMIO which will adversely affect the hang signature.


Well this should execute right before a GPU reset, so I think it 
shouldn't matter if we hang the chip or not as long as the read comes 
back correctly (I remember a very long UVD debug session because of this).


But in general I agree, we should just use RREG32() here and always 
encode the full register offset.


Regards,
Christian.




Alex


+   reg_value[i] = RREG32_SOC15_IP(GC, 
adev->reset_dump_reg_list[i]);
+   else
+   reg_value[i] = RREG32(adev->reset_dump_reg_list[i]);
+   }
+
+   trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list, reg_value, i);
+
+   return 0;
+}
+
  int amdgpu_do_asic_reset(struct list_head *device_list_handle,
  struct amdgpu_reset_context *reset_context)
  {
@@ -4567,8 +4584,10 @@ int amdgpu_do_asic_reset(struct list_head 
*device_list_handle,
 tmp_adev->gmc.xgmi.pending_reset = false;
 			if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
 r = -EALREADY;
-   } else
+   } else {
+   amdgpu_reset_reg_dumps(tmp_adev);
 r = amdgpu_asic_reset(tmp_adev);
+   }

 if (r) {
 dev_err(tmp_adev->dev, "ASIC reset failed with 
error, %d for drm dev, %s",
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
index d855cb53c7e0..3fe33de3564a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
@@ -537,6 +537,25 @@ TRACE_EVENT(amdgpu_ib_pipe_sync,
   __entry->seqno)
  );

+TRACE_EVENT(amdgpu_reset_reg_dumps,
+   TP_PROTO(long *address, uint32_t *value, int length),
+   TP_ARGS(address, value, length),
+   TP_STRUCT__entry(
+__array(long, address, 128)
+__array(uint32_t, value, 128)
+__field(int, len)
+),
+   TP_fast_assign(
+  memcpy(__entry->address, address, 128);
+  memcpy(__entry->value,  value, 128);
+  __entry->len = length;
+  ),
+   TP_printk("amdgpu register dump offset: %s value: %s ",
+ __print_array(__entry->address, __entry->len, 8),
+ __print_array(__entry->value, __entry->len, 8)
+)
+);
+
  #undef AMDGPU_JOB_GET_TIMELINE_NAME
  #endif

--
2.25.1
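
A hedged sketch of the direction agreed on above - full register offsets in
the list and plain RREG32() for every read - with an explicit bound added,
since reg_value[] only holds 128 entries (list layout assumed; the gfxoff
question is left open):

static int sketch_reset_reg_dumps(struct amdgpu_device *adev)
{
	uint32_t value[128];
	int i;

	/* reset_dump_reg_list is assumed to hold full offsets (e.g.
	 * encoded with a SOC15_REG_ENTRY_OFFSET-style macro) and to be
	 * zero-terminated. */
	for (i = 0; i < ARRAY_SIZE(value) &&
		    adev->reset_dump_reg_list[i] != 0; i++)
		value[i] = RREG32(adev->reset_dump_reg_list[i]);

	trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list, value, i);
	return 0;
}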





Re: [PATCH 7/8] mm: remove the extra ZONE_DEVICE struct page refcount

2022-02-08 Thread Christoph Hellwig
On Tue, Feb 08, 2022 at 07:30:11PM -0800, Dan Williams wrote:
> Interesting. I had expected that, to really fix the refcount problem,
> fs/dax.c would need to start taking real page references as pages
> were added to a mapping, just like page cache.

I think we should do that eventually.  But I think this series that
just attacks the device private type and extends to the device coherent
and p2p enhancements is a good first step to stop the proliferation of
the one-off refcount and to allow dealing with the fsdax pages in another
more focused series.



Re: [PATCH 6/8] mm: don't include <linux/memremap.h> in <linux/mm.h>

2022-02-08 Thread Christoph Hellwig
On Tue, Feb 08, 2022 at 03:53:14PM -0800, Dan Williams wrote:
> Yeah, same as Logan:
> 
> mm/memcontrol.c: In function ‘get_mctgt_type’:
> mm/memcontrol.c:5724:29: error: implicit declaration of function
> ‘is_device_private_page’; did you mean
> ‘is_device_private_entry’? [-Werror=implicit-function-declaration]
>  5724 | if (is_device_private_page(page))
>   | ^~
>   | is_device_private_entry
> 
> ...needs:

Yeah, the buildbot also complained.  I've fixed this up locally now.



Re: [RFC v3 00/12] Define and use reset domain for GPU recovery in amdgpu

2022-02-08 Thread JingWen Chen
Hi Andrey,

I have been testing your patch and it seems fine so far.

Best Regards,

Jingwen Chen

On 2022/2/3 2:57 AM, Andrey Grodzovsky wrote:
> Just another ping, with Shyun's help I was able to do some smoke testing on 
> XGMI SRIOV system (booting and triggering hive reset)
> and for now looks good.
>
> Andrey
>
> On 2022-01-28 14:36, Andrey Grodzovsky wrote:
>> Just a gentle ping if people have more comments on this patch set ? 
>> Especially last 5 patches
>> as first 7 are exact same as V2 and we already went over them mostly.
>>
>> Andrey
>>
>> On 2022-01-25 17:37, Andrey Grodzovsky wrote:
>>> This patchset is based on earlier work by Boris[1] that allowed having an
>>> ordered workqueue at the driver level that will be used by the different
>>> schedulers to queue their timeout work. On top of that I also serialized
>>> any GPU reset we trigger from within amdgpu code to also go through the same
>>> ordered wq and in this way simplify somewhat our GPU reset code so we don't
>>> need to protect from concurrency by multiple GPU reset triggers such as TDR
>>> on one hand and sysfs trigger or RAS trigger on the other hand.
>>>
>>> As advised by Christian and Daniel I defined a reset_domain struct such that
>>> all the entities that go through reset together will be serialized one
>>> against another.
>>>
>>> A TDR triggered by multiple entities within the same domain for the same
>>> reason will not be re-triggered, as the first such reset will cancel all the
>>> pending resets. This is relevant only to TDR timers and not to triggered
>>> resets coming from RAS or SYSFS; those will still happen after the in-flight
>>> reset finishes.
>>>
>>> v2:
>>> Add handling of the SRIOV configuration: the reset notify coming from the
>>> host already triggers a work queue in the driver to handle the reset, so
>>> drop this intermediate wq and send directly to the timeout wq. (Shaoyun)
>>>
>>> v3:
>>> Lijo suggested putting 'adev->in_gpu_reset' in the amdgpu_reset_domain struct.
>>> I followed his advice and also moved adev->reset_sem into the same place. This
>>> in turn required some follow-up refactoring of the original patches, where I
>>> decoupled the amdgpu_reset_domain life cycle from the XGMI hive, because the
>>> hive is destroyed and reconstructed for the case of resetting the devices in
>>> the XGMI hive during probe for SRIOV (see [2]), while we need the reset sem
>>> and gpu_reset flag to always be present. This was attained by adding a
>>> refcount to amdgpu_reset_domain so each device can safely point to it as long
>>> as it needs.
>>>
>>>
>>> [1] 
>>> https://patchwork.kernel.org/project/dri-devel/patch/20210629073510.2764391-3-boris.brezil...@collabora.com/
>>> [2] https://www.spinics.net/lists/amd-gfx/msg58836.html
>>>
>>> P.S Going through drm-misc-next and not amd-staging-drm-next as Boris work 
>>> hasn't landed yet there.
>>>
>>> P.P.S Patches 8-12 are the refactor on top of the original V2 patchset.
>>>
>>> P.P.P.S I wasn't able yet to test the reworked code on XGMI SRIOV system 
>>> because drm-misc-next fails to load there.
>>> Would appriciate if maybe jingwech can try it on his system like he tested 
>>> V2.
>>>
>>> Andrey Grodzovsky (12):
>>>    drm/amdgpu: Introduce reset domain
>>>    drm/amdgpu: Move scheduler init to after XGMI is ready
>>>    drm/amdgpu: Fix crash on modprobe
>>>    drm/amdgpu: Serialize non TDR gpu recovery with TDRs
>>>    drm/amd/virt: For SRIOV send GPU reset directly to TDR queue.
>>>    drm/amdgpu: Drop hive->in_reset
>>>    drm/amdgpu: Drop concurrent GPU reset protection for device
>>>    drm/amdgpu: Rework reset domain to be refcounted.
>>>    drm/amdgpu: Move reset sem into reset_domain
>>>    drm/amdgpu: Move in_gpu_reset into reset_domain
>>>    drm/amdgpu: Rework amdgpu_device_lock_adev
>>>    Revert 'drm/amdgpu: annotate a false positive recursive locking'
>>>
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h   |  15 +-
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c   |  10 +-
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c    | 275 ++
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c |  43 +--
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c   |   2 +-
>>>   .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c    |  18 +-
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c |  39 +++
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h |  12 +
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h  |   2 +
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c  |  24 +-
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h  |   3 +-
>>>   drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c    |   6 +-
>>>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c |  14 +-
>>>   drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c |  19 +-
>>>   drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c |  19 +-
>>>   drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c |  11 +-
>>>   16 files changed, 313 insertions(+), 199 deletions(-)
>>>
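
The serialization model described in the cover letter reduces to one ordered
workqueue per reset domain; a minimal sketch (names here are assumed, not
the driver's):

#include <linux/slab.h>
#include <linux/workqueue.h>

struct sketch_reset_domain {
	struct workqueue_struct *wq;
};

static struct sketch_reset_domain *sketch_domain_create(void)
{
	struct sketch_reset_domain *d = kzalloc(sizeof(*d), GFP_KERNEL);

	if (!d)
		return NULL;
	/* Ordered wq: max_active == 1, so recoveries never overlap. */
	d->wq = alloc_ordered_workqueue("amdgpu-reset-dev", 0);
	if (!d->wq) {
		kfree(d);
		return NULL;
	}
	return d;
}

static bool sketch_trigger_reset(struct sketch_reset_domain *d,
				 struct work_struct *reset_work)
{
	/* queue_work() returning false means this work is already
	 * pending: the queued reset covers the new trigger, which is
	 * the same-reason TDR cancellation behaviour described above. */
	return queue_work(d->wq, reset_work);
}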


[PATCH 11/11] drm/amdgpu: Merge amdgpu_ras_late_init/amdgpu_ras_late_fini to amdgpu_ras_block_late_init/amdgpu_ras_block_late_fini

2022-02-08 Thread yipechai
1. Merge amdgpu_ras_late_init to
   amdgpu_ras_block_late_init.
2. Remove amdgpu_ras_late_init since no ras block
   calls amdgpu_ras_late_init.
3. Merge amdgpu_ras_late_fini to
   amdgpu_ras_block_late_fini.
4. Remove amdgpu_ras_late_fini since no ras block
   calls amdgpu_ras_late_fini.

Signed-off-by: yipechai 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 53 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  8 
 2 files changed, 11 insertions(+), 50 deletions(-)
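
The merge works because ras_comm is embedded in amdgpu_ras_block_object, so
everything that used to travel via ras_ih_if/ras_fs_if can be recovered from
the common struct alone; a hedged sketch of the pattern the diff relies on:

/* container_of() walks from the embedded common member back to the
 * enclosing block object, which already knows its IH callback - the
 * separate parameter structs carried no extra information. */
struct amdgpu_ras_block_object *ras_obj =
	container_of(ras_block, struct amdgpu_ras_block_object, ras_comm);

if (ras_obj->ras_cb)
	r = amdgpu_ras_interrupt_add_handler(adev, ras_block);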

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 9f8f596663ab..1aff88fcea76 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2397,11 +2397,10 @@ bool amdgpu_ras_is_poison_mode_supported(struct 
amdgpu_device *adev)
 }
 
 /* helper function to handle common stuff in ip late init phase */
-int amdgpu_ras_late_init(struct amdgpu_device *adev,
-struct ras_common_if *ras_block,
-struct ras_fs_if *fs_info,
-struct ras_ih_if *ih_info)
+int amdgpu_ras_block_late_init(struct amdgpu_device *adev,
+struct ras_common_if *ras_block)
 {
+   struct amdgpu_ras_block_object *ras_obj;
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
unsigned long ue_count, ce_count;
int r;
@@ -2429,7 +2428,8 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev,
if (adev->in_suspend || amdgpu_in_reset(adev))
return 0;
 
-   if (ih_info->cb) {
+   ras_obj = container_of(ras_block, struct amdgpu_ras_block_object, 
ras_comm);
+   if (ras_obj->ras_cb) {
r = amdgpu_ras_interrupt_add_handler(adev, ras_block);
if (r)
goto interrupt;
@@ -2450,57 +2450,26 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev,
 cleanup:
amdgpu_ras_sysfs_remove(adev, ras_block);
 sysfs:
-   if (ih_info->cb)
+   if (ras_obj->ras_cb)
amdgpu_ras_interrupt_remove_handler(adev, ras_block);
 interrupt:
amdgpu_ras_feature_enable(adev, ras_block, 0);
return r;
 }
 
-int amdgpu_ras_block_late_init(struct amdgpu_device *adev,
-   struct ras_common_if *ras_block)
-{
-   char sysfs_name[32];
-   struct ras_ih_if ih_info;
-   struct ras_fs_if fs_info;
-   struct amdgpu_ras_block_object *obj;
-
-   obj = container_of(ras_block, struct amdgpu_ras_block_object, ras_comm);
-   ih_info.cb = obj->ras_cb;
-   ih_info.head = *ras_block;
-   snprintf(sysfs_name, sizeof(sysfs_name), "%s_err_count", 
ras_block->name);
-   fs_info.sysfs_name = (const char *)sysfs_name;
-   fs_info.head = *ras_block;
-   return amdgpu_ras_late_init(adev, ras_block, &fs_info, &ih_info);
-}
-
 /* helper function to remove ras fs node and interrupt handler */
-void amdgpu_ras_late_fini(struct amdgpu_device *adev,
- struct ras_common_if *ras_block,
- struct ras_ih_if *ih_info)
-{
-   if (!ras_block || !ih_info)
-   return;
-
-   amdgpu_ras_sysfs_remove(adev, ras_block);
-   if (ih_info->cb)
-   amdgpu_ras_interrupt_remove_handler(adev, &ih_info->head);
-}
-
 void amdgpu_ras_block_late_fini(struct amdgpu_device *adev,
  struct ras_common_if *ras_block)
 {
-   struct ras_ih_if ih_info;
-   struct amdgpu_ras_block_object *obj;
-
+   struct amdgpu_ras_block_object *ras_obj;
if (!ras_block)
return;
 
-   obj = container_of(ras_block, struct amdgpu_ras_block_object, ras_comm);
-   ih_info.head = *ras_block;
-   ih_info.cb = obj->ras_cb;
+   amdgpu_ras_sysfs_remove(adev, ras_block);
 
-   amdgpu_ras_late_fini(adev, ras_block, &ih_info);
+   ras_obj = container_of(ras_block, struct amdgpu_ras_block_object, 
ras_comm);
+   if (ras_obj->ras_cb)
+   amdgpu_ras_interrupt_remove_handler(adev, ras_block);
 }
 
 /* do some init work after IP late init as dependence.
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index ae8741ac526f..5de567c6a8f7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -597,18 +597,10 @@ amdgpu_ras_error_to_ta(enum amdgpu_ras_error_type error) {
 int amdgpu_ras_init(struct amdgpu_device *adev);
 int amdgpu_ras_fini(struct amdgpu_device *adev);
 int amdgpu_ras_pre_fini(struct amdgpu_device *adev);
-int amdgpu_ras_late_init(struct amdgpu_device *adev,
-struct ras_common_if *ras_block,
-struct ras_fs_if *fs_info,
-struct ras_ih_if *ih_info);
 
 int amdgpu_ras_block_late_init(struct amdgpu_device *adev,
struct ras_common_if *ras_block);
 
-void amdgpu_ras_late_fini(struct amdgpu_device *adev,
- struct 

[PATCH 08/11] drm/amdgpu: Optimize amdgpu_umc_ras_late_init/amdgpu_umc_ras_fini function code

2022-02-08 Thread yipechai
Optimize amdgpu_umc_ras_late_init/amdgpu_umc_ras_fini function code.

Signed-off-by: yipechai 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 44 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h |  4 +++
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   |  6 
 3 files changed, 16 insertions(+), 38 deletions(-)
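
The shape of this cleanup - repeated for SDMA, MMHUB, NBIO, XGMI and MCA in
the neighbouring patches - is the same everywhere; a hedged sketch of the
life-cycle change:

/* Before: each IP kmalloc'd a ras_common_if at late-init and kfree'd
 * it at fini.  After: ras_if just points at the ras_comm embedded in
 * the statically registered block object, so there is nothing left to
 * allocate or free. */
static void sketch_wire_umc_ras_if(struct amdgpu_device *adev)
{
	adev->umc.ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__UMC;
	adev->umc.ras->ras_block.ras_comm.type =
		AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
	adev->umc.ras_if = &adev->umc.ras->ras_block.ras_comm;
}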

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index ff7805beda38..9f1406e1a48a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -129,7 +129,7 @@ int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
return ret;
 }
 
-static int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
+int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
void *ras_error_status,
struct amdgpu_iv_entry *entry)
 {
@@ -139,36 +139,15 @@ static int amdgpu_umc_process_ras_data_cb(struct 
amdgpu_device *adev,
 int amdgpu_umc_ras_late_init(struct amdgpu_device *adev, void *ras_info)
 {
int r;
-   struct ras_fs_if fs_info = {
-   .sysfs_name = "umc_err_count",
-   };
-   struct ras_ih_if ih_info = {
-   .cb = amdgpu_umc_process_ras_data_cb,
-   };
 
-   if (!adev->umc.ras_if) {
-   adev->umc.ras_if =
-   kmalloc(sizeof(struct ras_common_if), GFP_KERNEL);
-   if (!adev->umc.ras_if)
-   return -ENOMEM;
-   adev->umc.ras_if->block = AMDGPU_RAS_BLOCK__UMC;
-   adev->umc.ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
-   adev->umc.ras_if->sub_block_index = 0;
-   }
-   ih_info.head = fs_info.head = *adev->umc.ras_if;
-
-   r = amdgpu_ras_late_init(adev, adev->umc.ras_if,
-				&fs_info, &ih_info);
+   r = amdgpu_ras_block_late_init(adev, adev->umc.ras_if);
if (r)
-   goto free;
+   return r;
 
if (amdgpu_ras_is_supported(adev, adev->umc.ras_if->block)) {
		r = amdgpu_irq_get(adev, &adev->gmc.ecc_irq, 0);
if (r)
goto late_fini;
-   } else {
-   r = 0;
-   goto free;
}
 
/* ras init of specific umc version */
@@ -179,26 +158,15 @@ int amdgpu_umc_ras_late_init(struct amdgpu_device *adev, 
void *ras_info)
return 0;
 
 late_fini:
-   amdgpu_ras_late_fini(adev, adev->umc.ras_if, &ih_info);
-free:
-   kfree(adev->umc.ras_if);
-   adev->umc.ras_if = NULL;
+   amdgpu_ras_block_late_fini(adev, adev->umc.ras_if);
return r;
 }
 
 void amdgpu_umc_ras_fini(struct amdgpu_device *adev)
 {
if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC) &&
-   adev->umc.ras_if) {
-   struct ras_common_if *ras_if = adev->umc.ras_if;
-   struct ras_ih_if ih_info = {
-   .head = *ras_if,
-   .cb = amdgpu_umc_process_ras_data_cb,
-   };
-
-   amdgpu_ras_late_fini(adev, ras_if, &ih_info);
-   kfree(ras_if);
-   }
+   adev->umc.ras_if)
+   amdgpu_ras_block_late_fini(adev, adev->umc.ras_if);
 }
 
 int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
index 4db0526d0be4..ec15b3640399 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
@@ -85,4 +85,8 @@ void amdgpu_umc_fill_error_record(struct ras_err_data 
*err_data,
uint64_t retired_page,
uint32_t channel_index,
uint32_t umc_inst);
+
+int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
+   void *ras_error_status,
+   struct amdgpu_iv_entry *entry);
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 15958fd45f64..94095b965e2c 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -1234,6 +1234,8 @@ static void gmc_v9_0_set_umc_funcs(struct amdgpu_device 
*adev)
 
strcpy(adev->umc.ras->ras_block.ras_comm.name, "umc");
adev->umc.ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__UMC;
+   adev->umc.ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
+   adev->umc.ras_if = &adev->umc.ras->ras_block.ras_comm;
 
/* If don't define special ras_late_init function, use default 
ras_late_init */
if (!adev->umc.ras->ras_block.ras_late_init)
@@ -1242,6 +1244,10 @@ static void gmc_v9_0_set_umc_funcs(struct amdgpu_device 
*adev)
/* If don't define special ras_fini function, use default 
ras_fini */
if (!adev->umc.ras->ras_block.ras_fini)

[PATCH 10/11] drm/amdgpu: Optimize operating sysfs and interrupt function interface in amdgpu_ras.c

2022-02-08 Thread yipechai
In order to reduce redundant struct conversions, modify the
parameters of the sysfs and interrupt-handler interfaces.

Signed-off-by: yipechai 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 37 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  6 ++--
 2 files changed, 21 insertions(+), 22 deletions(-)
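
With the interfaces keyed on ras_common_if, the sysfs attribute name no
longer has to travel through ras_fs_if either; it can be derived on the spot
from the block name, as the diff below does. A hedged fragment:

/* Builds "<block>_err_count", e.g. "umc_err_count", where it is used. */
snprintf(obj->fs_data.sysfs_name, sizeof(obj->fs_data.sysfs_name),
	 "%s_err_count", head->name);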

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 8a76a4e07659..9f8f596663ab 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1276,18 +1276,17 @@ static int amdgpu_ras_sysfs_remove_feature_node(struct 
amdgpu_device *adev)
 }
 
 int amdgpu_ras_sysfs_create(struct amdgpu_device *adev,
-   struct ras_fs_if *head)
+   struct ras_common_if *head)
 {
-   struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head);
+   struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
 
if (!obj || obj->attr_inuse)
return -EINVAL;
 
get_obj(obj);
 
-   memcpy(obj->fs_data.sysfs_name,
-   head->sysfs_name,
-   sizeof(obj->fs_data.sysfs_name));
+   snprintf(obj->fs_data.sysfs_name, sizeof(obj->fs_data.sysfs_name),
+   "%s_err_count", head->name);
 
obj->sysfs_attr = (struct device_attribute){
.attr = {
@@ -1594,9 +1593,9 @@ int amdgpu_ras_interrupt_dispatch(struct amdgpu_device 
*adev,
 }
 
 int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev,
-   struct ras_ih_if *info)
+   struct ras_common_if *head)
 {
-   struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
+   struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
struct ras_ih_data *data;
 
if (!obj)
@@ -1616,24 +1615,27 @@ int amdgpu_ras_interrupt_remove_handler(struct 
amdgpu_device *adev,
 }
 
 int amdgpu_ras_interrupt_add_handler(struct amdgpu_device *adev,
-   struct ras_ih_if *info)
+   struct ras_common_if *head)
 {
-   struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
+   struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
struct ras_ih_data *data;
+   struct amdgpu_ras_block_object *ras_obj;
 
if (!obj) {
/* in case we registe the IH before enable ras feature */
-   obj = amdgpu_ras_create_obj(adev, &info->head);
+   obj = amdgpu_ras_create_obj(adev, head);
if (!obj)
return -EINVAL;
} else
get_obj(obj);
 
+   ras_obj = container_of(head, struct amdgpu_ras_block_object, ras_comm);
+
	data = &obj->ih_data;
/* add the callback.etc */
*data = (struct ras_ih_data) {
.inuse = 0,
-   .cb = info->cb,
+   .cb = ras_obj->ras_cb,
.element_size = sizeof(struct amdgpu_iv_entry),
.rptr = 0,
.wptr = 0,
@@ -1662,10 +1664,7 @@ static int amdgpu_ras_interrupt_remove_all(struct 
amdgpu_device *adev)
struct ras_manager *obj, *tmp;
 
	list_for_each_entry_safe(obj, tmp, &con->head, node) {
-   struct ras_ih_if info = {
-   .head = obj->head,
-   };
-   amdgpu_ras_interrupt_remove_handler(adev, &info);
+   amdgpu_ras_interrupt_remove_handler(adev, &obj->head);
}
 
return 0;
@@ -2431,12 +2430,12 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev,
return 0;
 
if (ih_info->cb) {
-   r = amdgpu_ras_interrupt_add_handler(adev, ih_info);
+   r = amdgpu_ras_interrupt_add_handler(adev, ras_block);
if (r)
goto interrupt;
}
 
-   r = amdgpu_ras_sysfs_create(adev, fs_info);
+   r = amdgpu_ras_sysfs_create(adev, ras_block);
if (r)
goto sysfs;
 
@@ -2452,7 +2451,7 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev,
amdgpu_ras_sysfs_remove(adev, ras_block);
 sysfs:
if (ih_info->cb)
-   amdgpu_ras_interrupt_remove_handler(adev, ih_info);
+   amdgpu_ras_interrupt_remove_handler(adev, ras_block);
 interrupt:
amdgpu_ras_feature_enable(adev, ras_block, 0);
return r;
@@ -2485,7 +2484,7 @@ void amdgpu_ras_late_fini(struct amdgpu_device *adev,
 
amdgpu_ras_sysfs_remove(adev, ras_block);
if (ih_info->cb)
-   amdgpu_ras_interrupt_remove_handler(adev, ih_info);
+   amdgpu_ras_interrupt_remove_handler(adev, &ih_info->head);
 }
 
 void amdgpu_ras_block_late_fini(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 8b94b556baf6..ae8741ac526f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -619,7 +619,7 @@ int amdgpu_ras_feature_enable_on_boot(struct amdgpu_device 

[PATCH 07/11] drm/amdgpu: Optimize amdgpu_sdma_ras_late_init/amdgpu_sdma_ras_fini function code

2022-02-08 Thread yipechai
Optimize amdgpu_sdma_ras_late_init/amdgpu_sdma_ras_fini function code.

Signed-off-by: yipechai 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c | 46 +++-
 drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c   | 12 ---
 2 files changed, 13 insertions(+), 45 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
index 65debb65a5df..242a7b4dcad9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
@@ -90,28 +90,10 @@ int amdgpu_sdma_ras_late_init(struct amdgpu_device *adev,
  void *ras_ih_info)
 {
int r, i;
-   struct ras_ih_if *ih_info = (struct ras_ih_if *)ras_ih_info;
-   struct ras_fs_if fs_info = {
-   .sysfs_name = "sdma_err_count",
-   };
-
-   if (!ih_info)
-   return -EINVAL;
 
-   if (!adev->sdma.ras_if) {
-   adev->sdma.ras_if = kmalloc(sizeof(struct ras_common_if), 
GFP_KERNEL);
-   if (!adev->sdma.ras_if)
-   return -ENOMEM;
-   adev->sdma.ras_if->block = AMDGPU_RAS_BLOCK__SDMA;
-   adev->sdma.ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
-   adev->sdma.ras_if->sub_block_index = 0;
-   }
-   fs_info.head = ih_info->head = *adev->sdma.ras_if;
-
-   r = amdgpu_ras_late_init(adev, adev->sdma.ras_if,
-				&fs_info, ih_info);
+   r = amdgpu_ras_block_late_init(adev, adev->sdma.ras_if);
if (r)
-   goto free;
+   return r;
 
if (amdgpu_ras_is_supported(adev, adev->sdma.ras_if->block)) {
for (i = 0; i < adev->sdma.num_instances; i++) {
@@ -120,38 +102,20 @@ int amdgpu_sdma_ras_late_init(struct amdgpu_device *adev,
if (r)
goto late_fini;
}
-   } else {
-   r = 0;
-   goto free;
}
 
return 0;
 
 late_fini:
-   amdgpu_ras_late_fini(adev, adev->sdma.ras_if, ih_info);
-free:
-   kfree(adev->sdma.ras_if);
-   adev->sdma.ras_if = NULL;
+   amdgpu_ras_block_late_fini(adev, adev->sdma.ras_if);
return r;
 }
 
 void amdgpu_sdma_ras_fini(struct amdgpu_device *adev)
 {
if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__SDMA) &&
-   adev->sdma.ras_if) {
-   struct ras_common_if *ras_if = adev->sdma.ras_if;
-   struct ras_ih_if ih_info = {
-   .head = *ras_if,
-   /* the cb member will not be used by
-* amdgpu_ras_interrupt_remove_handler, init it only
-* to cheat the check in ras_late_fini
-*/
-   .cb = amdgpu_sdma_process_ras_data_cb,
-   };
-
-   amdgpu_ras_late_fini(adev, ras_if, &ih_info);
-   kfree(ras_if);
-   }
+   adev->sdma.ras_if)
+   amdgpu_ras_block_late_fini(adev, adev->sdma.ras_if);
 }
 
 int amdgpu_sdma_process_ras_data_cb(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
index 8b0a8587dd36..82a31dfa8c21 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
@@ -1885,9 +1885,6 @@ static int sdma_v4_0_process_ras_data_cb(struct 
amdgpu_device *adev,
 static int sdma_v4_0_late_init(void *handle)
 {
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
-   struct ras_ih_if ih_info = {
-   .cb = sdma_v4_0_process_ras_data_cb,
-   };
 
sdma_v4_0_setup_ulv(adev);
 
@@ -1898,7 +1895,7 @@ static int sdma_v4_0_late_init(void *handle)
}
 
if (adev->sdma.ras && adev->sdma.ras->ras_block.ras_late_init)
-   return adev->sdma.ras->ras_block.ras_late_init(adev, &ih_info);
+   return adev->sdma.ras->ras_block.ras_late_init(adev, NULL);
else
return 0;
 }
@@ -2794,6 +2791,7 @@ const struct amdgpu_ras_block_hw_ops sdma_v4_0_ras_hw_ops 
= {
 static struct amdgpu_sdma_ras sdma_v4_0_ras = {
.ras_block = {
.hw_ops = &sdma_v4_0_ras_hw_ops,
+   .ras_cb = sdma_v4_0_process_ras_data_cb,
},
 };
 
@@ -2816,6 +2814,8 @@ static void sdma_v4_0_set_ras_funcs(struct amdgpu_device 
*adev)
 
strcpy(adev->sdma.ras->ras_block.ras_comm.name, "sdma");
adev->sdma.ras->ras_block.ras_comm.block = 
AMDGPU_RAS_BLOCK__SDMA;
+   adev->sdma.ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
+   adev->sdma.ras_if = &adev->sdma.ras->ras_block.ras_comm;
 
/* If don't define special ras_late_init function, use default 
ras_late_init */
if (!adev->sdma.ras->ras_block.ras_late_init)
@@ -2824,6 +2824,10 @@ static void 

[PATCH 09/11] drm/amdgpu: Optimize amdgpu_xgmi_ras_late_init/amdgpu_xgmi_ras_fini function code

2022-02-08 Thread yipechai
Optimize amdgpu_xgmi_ras_late_init/amdgpu_xgmi_ras_fini function code.

Signed-off-by: yipechai 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c  |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 40 +++-
 2 files changed, 6 insertions(+), 35 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
index d426de48d299..956cc994ca7d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -441,6 +441,7 @@ int amdgpu_gmc_ras_early_init(struct amdgpu_device *adev)
if (!adev->gmc.xgmi.connected_to_cpu) {
		adev->gmc.xgmi.ras = &xgmi_ras;
		amdgpu_ras_register_ras_block(adev, &adev->gmc.xgmi.ras->ras_block);
+   adev->gmc.xgmi.ras_if = &adev->gmc.xgmi.ras->ras_block.ras_comm;
}
 
return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 15707af89212..a785b1e088cd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -734,51 +734,20 @@ int amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
 
 static int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev, void 
*ras_info)
 {
-   int r;
-   struct ras_ih_if ih_info = {
-   .cb = NULL,
-   };
-   struct ras_fs_if fs_info = {
-   .sysfs_name = "xgmi_wafl_err_count",
-   };
-
if (!adev->gmc.xgmi.supported ||
adev->gmc.xgmi.num_physical_nodes == 0)
return 0;
 
adev->gmc.xgmi.ras->ras_block.hw_ops->reset_ras_error_count(adev);
 
-   if (!adev->gmc.xgmi.ras_if) {
-   adev->gmc.xgmi.ras_if = kmalloc(sizeof(struct ras_common_if), 
GFP_KERNEL);
-   if (!adev->gmc.xgmi.ras_if)
-   return -ENOMEM;
-   adev->gmc.xgmi.ras_if->block = AMDGPU_RAS_BLOCK__XGMI_WAFL;
-   adev->gmc.xgmi.ras_if->type = 
AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
-   adev->gmc.xgmi.ras_if->sub_block_index = 0;
-   }
-   ih_info.head = fs_info.head = *adev->gmc.xgmi.ras_if;
-   r = amdgpu_ras_late_init(adev, adev->gmc.xgmi.ras_if,
-				&fs_info, &ih_info);
-   if (r || !amdgpu_ras_is_supported(adev, adev->gmc.xgmi.ras_if->block)) {
-   kfree(adev->gmc.xgmi.ras_if);
-   adev->gmc.xgmi.ras_if = NULL;
-   }
-
-   return r;
+   return amdgpu_ras_block_late_init(adev, adev->gmc.xgmi.ras_if);
 }
 
 static void amdgpu_xgmi_ras_fini(struct amdgpu_device *adev)
 {
if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL) &&
-   adev->gmc.xgmi.ras_if) {
-   struct ras_common_if *ras_if = adev->gmc.xgmi.ras_if;
-   struct ras_ih_if ih_info = {
-   .cb = NULL,
-   };
-
-   amdgpu_ras_late_fini(adev, ras_if, &ih_info);
-   kfree(ras_if);
-   }
+   adev->gmc.xgmi.ras_if)
+   amdgpu_ras_block_late_fini(adev, adev->gmc.xgmi.ras_if);
 }
 
 uint64_t amdgpu_xgmi_get_relative_phy_addr(struct amdgpu_device *adev,
@@ -982,8 +951,9 @@ struct amdgpu_ras_block_hw_ops  xgmi_ras_hw_ops = {
 struct amdgpu_xgmi_ras xgmi_ras = {
.ras_block = {
.ras_comm = {
-   .name = "xgmi",
+   .name = "xgmi_wafl",
.block = AMDGPU_RAS_BLOCK__XGMI_WAFL,
+   .type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
},
.hw_ops = &xgmi_ras_hw_ops,
.ras_late_init = amdgpu_xgmi_ras_late_init,
-- 
2.25.1



[PATCH 06/11] drm/amdgpu: Optimize amdgpu_nbio_ras_late_init/amdgpu_nbio_ras_fini function code

2022-02-08 Thread yipechai
Optimize amdgpu_nbio_ras_late_init/amdgpu_nbio_ras_fini function code.

Signed-off-by: yipechai 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c | 40 +++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c  |  1 +
 drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c   |  1 +
 3 files changed, 7 insertions(+), 35 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c
index 6ace2e390e77..89e61fdd3580 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c
@@ -25,26 +25,9 @@
 int amdgpu_nbio_ras_late_init(struct amdgpu_device *adev, void *ras_info)
 {
int r;
-   struct ras_ih_if ih_info = {
-   .cb = NULL,
-   };
-   struct ras_fs_if fs_info = {
-   .sysfs_name = "pcie_bif_err_count",
-   };
-
-   if (!adev->nbio.ras_if) {
-   adev->nbio.ras_if = kmalloc(sizeof(struct ras_common_if), 
GFP_KERNEL);
-   if (!adev->nbio.ras_if)
-   return -ENOMEM;
-   adev->nbio.ras_if->block = AMDGPU_RAS_BLOCK__PCIE_BIF;
-   adev->nbio.ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
-   adev->nbio.ras_if->sub_block_index = 0;
-   }
-   ih_info.head = fs_info.head = *adev->nbio.ras_if;
-   r = amdgpu_ras_late_init(adev, adev->nbio.ras_if,
-				&fs_info, &ih_info);
+   r = amdgpu_ras_block_late_init(adev, adev->nbio.ras_if);
if (r)
-   goto free;
+   return r;
 
if (amdgpu_ras_is_supported(adev, adev->nbio.ras_if->block)) {
		r = amdgpu_irq_get(adev, &adev->nbio.ras_controller_irq, 0);
@@ -53,30 +36,17 @@ int amdgpu_nbio_ras_late_init(struct amdgpu_device *adev, 
void *ras_info)
		r = amdgpu_irq_get(adev, &adev->nbio.ras_err_event_athub_irq, 0);
if (r)
goto late_fini;
-   } else {
-   r = 0;
-   goto free;
}
 
return 0;
 late_fini:
-   amdgpu_ras_late_fini(adev, adev->nbio.ras_if, &ih_info);
-free:
-   kfree(adev->nbio.ras_if);
-   adev->nbio.ras_if = NULL;
+   amdgpu_ras_block_late_fini(adev, adev->nbio.ras_if);
return r;
 }
 
 void amdgpu_nbio_ras_fini(struct amdgpu_device *adev)
 {
if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__PCIE_BIF) &&
-   adev->nbio.ras_if) {
-   struct ras_common_if *ras_if = adev->nbio.ras_if;
-   struct ras_ih_if ih_info = {
-   .cb = NULL,
-   };
-
-   amdgpu_ras_late_fini(adev, ras_if, &ih_info);
-   kfree(ras_if);
-   }
+   adev->nbio.ras_if)
+   amdgpu_ras_block_late_fini(adev, adev->nbio.ras_if);
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index b7aed19db7e9..8a76a4e07659 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2301,6 +2301,7 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
if (!adev->gmc.xgmi.connected_to_cpu) {
		adev->nbio.ras = &nbio_v7_4_ras;
		amdgpu_ras_register_ras_block(adev, &adev->nbio.ras->ras_block);
+   adev->nbio.ras_if = &adev->nbio.ras->ras_block.ras_comm;
}
break;
default:
diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c 
b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
index c7cca87f1647..14768570c298 100644
--- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
+++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
@@ -667,6 +667,7 @@ struct amdgpu_nbio_ras nbio_v7_4_ras = {
.ras_comm = {
.name = "pcie_bif",
.block = AMDGPU_RAS_BLOCK__PCIE_BIF,
+   .type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
},
.hw_ops = &nbio_v7_4_ras_hw_ops,
.ras_late_init = amdgpu_nbio_ras_late_init,
-- 
2.25.1



[PATCH 05/11] drm/amdgpu: Optimize amdgpu_mmhub_ras_late_init/amdgpu_mmhub_ras_fini function code

2022-02-08 Thread yipechai
Optimize amdgpu_mmhub_ras_late_init/amdgpu_mmhub_ras_fini function code.

Signed-off-by: yipechai 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.c | 37 ++-
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c |  2 ++
 2 files changed, 5 insertions(+), 34 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.c
index f9b5472a75d7..2bdb4d8b7955 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.c
@@ -26,43 +26,12 @@
 
 int amdgpu_mmhub_ras_late_init(struct amdgpu_device *adev, void *ras_info)
 {
-   int r;
-   struct ras_ih_if ih_info = {
-   .cb = NULL,
-   };
-   struct ras_fs_if fs_info = {
-   .sysfs_name = "mmhub_err_count",
-   };
-
-   if (!adev->mmhub.ras_if) {
-   adev->mmhub.ras_if = kmalloc(sizeof(struct ras_common_if), 
GFP_KERNEL);
-   if (!adev->mmhub.ras_if)
-   return -ENOMEM;
-   adev->mmhub.ras_if->block = AMDGPU_RAS_BLOCK__MMHUB;
-   adev->mmhub.ras_if->type = 
AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
-   adev->mmhub.ras_if->sub_block_index = 0;
-   }
-   ih_info.head = fs_info.head = *adev->mmhub.ras_if;
-   r = amdgpu_ras_late_init(adev, adev->mmhub.ras_if,
-				&fs_info, &ih_info);
-   if (r || !amdgpu_ras_is_supported(adev, adev->mmhub.ras_if->block)) {
-   kfree(adev->mmhub.ras_if);
-   adev->mmhub.ras_if = NULL;
-   }
-
-   return r;
+   return amdgpu_ras_block_late_init(adev, adev->mmhub.ras_if);
 }
 
 void amdgpu_mmhub_ras_fini(struct amdgpu_device *adev)
 {
if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__MMHUB) &&
-   adev->mmhub.ras_if) {
-   struct ras_common_if *ras_if = adev->mmhub.ras_if;
-   struct ras_ih_if ih_info = {
-   .cb = NULL,
-   };
-
-   amdgpu_ras_late_fini(adev, ras_if, &ih_info);
-   kfree(ras_if);
-   }
+   adev->mmhub.ras_if)
+   amdgpu_ras_block_late_fini(adev, adev->mmhub.ras_if);
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index b12fe6703f02..15958fd45f64 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -1282,6 +1282,8 @@ static void gmc_v9_0_set_mmhub_ras_funcs(struct 
amdgpu_device *adev)
 
strcpy(adev->mmhub.ras->ras_block.ras_comm.name, "mmhub");
adev->mmhub.ras->ras_block.ras_comm.block = 
AMDGPU_RAS_BLOCK__MMHUB;
+   adev->mmhub.ras->ras_block.ras_comm.type = 
AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
+   adev->mmhub.ras_if = >mmhub.ras->ras_block.ras_comm;
 
/* If don't define special ras_late_init function, use default 
ras_late_init */
if (!adev->mmhub.ras->ras_block.ras_late_init)
-- 
2.25.1



[PATCH 04/11] drm/amdgpu: Optimize amdgpu_mca_ras_late_init/amdgpu_mca_ras_fini function code

2022-02-08 Thread yipechai
Optimize amdgpu_mca_ras_late_init/amdgpu_mca_ras_fini function code.

Signed-off-by: yipechai 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c | 41 ++---
 drivers/gpu/drm/amd/amdgpu/mca_v3_0.c   |  6 
 2 files changed, 8 insertions(+), 39 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
index ad057d6b2c77..1c77fe7e9e68 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
@@ -74,48 +74,11 @@ void amdgpu_mca_query_ras_error_count(struct amdgpu_device 
*adev,
 int amdgpu_mca_ras_late_init(struct amdgpu_device *adev,
 struct amdgpu_mca_ras *mca_dev)
 {
-   char sysfs_name[32] = {0};
-   int r;
-   struct ras_ih_if ih_info = {
-   .cb = NULL,
-   };
-   struct ras_fs_if fs_info= {
-   .sysfs_name = sysfs_name,
-   };
-
-   snprintf(sysfs_name, sizeof(sysfs_name), "%s_err_count",
-   mca_dev->ras->ras_block.ras_comm.name);
-
-   if (!mca_dev->ras_if) {
-   mca_dev->ras_if = kmalloc(sizeof(struct ras_common_if), 
GFP_KERNEL);
-   if (!mca_dev->ras_if)
-   return -ENOMEM;
-   mca_dev->ras_if->block = mca_dev->ras->ras_block.ras_comm.block;
-   mca_dev->ras_if->sub_block_index = 
mca_dev->ras->ras_block.ras_comm.sub_block_index;
-   mca_dev->ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
-   }
-   ih_info.head = fs_info.head = *mca_dev->ras_if;
-   r = amdgpu_ras_late_init(adev, mca_dev->ras_if,
-_info, _info);
-   if (r || !amdgpu_ras_is_supported(adev, mca_dev->ras_if->block)) {
-   kfree(mca_dev->ras_if);
-   mca_dev->ras_if = NULL;
-   }
-
-   return r;
+   return amdgpu_ras_block_late_init(adev, mca_dev->ras_if);
 }
 
 void amdgpu_mca_ras_fini(struct amdgpu_device *adev,
 struct amdgpu_mca_ras *mca_dev)
 {
-   struct ras_ih_if ih_info = {
-   .cb = NULL,
-   };
-
-   if (!mca_dev->ras_if)
-   return;
-
-   amdgpu_ras_late_fini(adev, mca_dev->ras_if, _info);
-   kfree(mca_dev->ras_if);
-   mca_dev->ras_if = NULL;
+   amdgpu_ras_block_late_fini(adev, mca_dev->ras_if);
 }
\ No newline at end of file
diff --git a/drivers/gpu/drm/amd/amdgpu/mca_v3_0.c 
b/drivers/gpu/drm/amd/amdgpu/mca_v3_0.c
index 386416378a82..a307f336f7ab 100644
--- a/drivers/gpu/drm/amd/amdgpu/mca_v3_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mca_v3_0.c
@@ -71,6 +71,7 @@ struct amdgpu_mca_ras_block mca_v3_0_mp0_ras = {
.ras_comm = {
.block = AMDGPU_RAS_BLOCK__MCA,
.sub_block_index = AMDGPU_RAS_MCA_BLOCK__MP0,
+   .type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
.name = "mp0",
},
.hw_ops = _v3_0_mp0_hw_ops,
@@ -108,6 +109,7 @@ struct amdgpu_mca_ras_block mca_v3_0_mp1_ras = {
.ras_comm = {
.block = AMDGPU_RAS_BLOCK__MCA,
.sub_block_index = AMDGPU_RAS_MCA_BLOCK__MP1,
+   .type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
.name = "mp1",
},
.hw_ops = _v3_0_mp1_hw_ops,
@@ -145,6 +147,7 @@ struct amdgpu_mca_ras_block mca_v3_0_mpio_ras = {
.ras_comm = {
.block = AMDGPU_RAS_BLOCK__MCA,
.sub_block_index = AMDGPU_RAS_MCA_BLOCK__MPIO,
+   .type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
.name = "mpio",
},
.hw_ops = _v3_0_mpio_hw_ops,
@@ -165,6 +168,9 @@ static void mca_v3_0_init(struct amdgpu_device *adev)
amdgpu_ras_register_ras_block(adev, >mp0.ras->ras_block);
amdgpu_ras_register_ras_block(adev, >mp1.ras->ras_block);
amdgpu_ras_register_ras_block(adev, >mpio.ras->ras_block);
+   mca->mp0.ras_if = _v3_0_mp0_ras.ras_block.ras_comm;
+   mca->mp1.ras_if = _v3_0_mp1_ras.ras_block.ras_comm;
+   mca->mpio.ras_if = _v3_0_mpio_ras.ras_block.ras_comm;
 }
 
 const struct amdgpu_mca_funcs mca_v3_0_funcs = {
-- 
2.25.1



[PATCH 02/11] drm/amdgpu: Optimize amdgpu_gfx_ras_late_init/amdgpu_gfx_ras_fini function code

2022-02-08 Thread yipechai
Optimize amdgpu_gfx_ras_late_init/amdgpu_gfx_ras_fini function code.

Signed-off-by: yipechai 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 42 +++--
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c   |  6 
 2 files changed, 11 insertions(+), 37 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index 43004822ec6f..fe392108b5c2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -625,26 +625,9 @@ int amdgpu_get_gfx_off_status(struct amdgpu_device *adev, 
uint32_t *value)
 int amdgpu_gfx_ras_late_init(struct amdgpu_device *adev, void *ras_info)
 {
int r;
-   struct ras_fs_if fs_info = {
-   .sysfs_name = "gfx_err_count",
-   };
-   struct ras_ih_if ih_info = {
-   .cb = amdgpu_gfx_process_ras_data_cb,
-   };
-
-   if (!adev->gfx.ras_if) {
-   adev->gfx.ras_if = kmalloc(sizeof(struct ras_common_if), 
GFP_KERNEL);
-   if (!adev->gfx.ras_if)
-   return -ENOMEM;
-   adev->gfx.ras_if->block = AMDGPU_RAS_BLOCK__GFX;
-   adev->gfx.ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
-   adev->gfx.ras_if->sub_block_index = 0;
-   }
-   fs_info.head = ih_info.head = *adev->gfx.ras_if;
-   r = amdgpu_ras_late_init(adev, adev->gfx.ras_if,
-_info, _info);
+   r = amdgpu_ras_block_late_init(adev, adev->gfx.ras_if);
if (r)
-   goto free;
+   return r;
 
if (amdgpu_ras_is_supported(adev, adev->gfx.ras_if->block)) {
if (!amdgpu_persistent_edc_harvesting_supported(adev))
@@ -653,34 +636,19 @@ int amdgpu_gfx_ras_late_init(struct amdgpu_device *adev, 
void *ras_info)
r = amdgpu_irq_get(adev, >gfx.cp_ecc_error_irq, 0);
if (r)
goto late_fini;
-   } else {
-   /* free gfx ras_if if ras is not supported */
-   r = 0;
-   goto free;
}
 
return 0;
 late_fini:
-   amdgpu_ras_late_fini(adev, adev->gfx.ras_if, _info);
-free:
-   kfree(adev->gfx.ras_if);
-   adev->gfx.ras_if = NULL;
+   amdgpu_ras_block_late_fini(adev, adev->gfx.ras_if);
return r;
 }
 
 void amdgpu_gfx_ras_fini(struct amdgpu_device *adev)
 {
if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX) &&
-   adev->gfx.ras_if) {
-   struct ras_common_if *ras_if = adev->gfx.ras_if;
-   struct ras_ih_if ih_info = {
-   .head = *ras_if,
-   .cb = amdgpu_gfx_process_ras_data_cb,
-   };
-
-   amdgpu_ras_late_fini(adev, ras_if, _info);
-   kfree(ras_if);
-   }
+   adev->gfx.ras_if)
+   amdgpu_ras_block_late_fini(adev, adev->gfx.ras_if);
 }
 
 int amdgpu_gfx_process_ras_data_cb(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 0a291d2e5f91..0c3b3972c04e 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -2200,6 +2200,8 @@ static int gfx_v9_0_gpu_early_init(struct amdgpu_device 
*adev)
 
strcpy(adev->gfx.ras->ras_block.ras_comm.name, "gfx");
adev->gfx.ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__GFX;
+   adev->gfx.ras->ras_block.ras_comm.type = 
AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
+   adev->gfx.ras_if = >gfx.ras->ras_block.ras_comm;
 
/* If not define special ras_late_init function, use gfx 
default ras_late_init */
if (!adev->gfx.ras->ras_block.ras_late_init)
@@ -2208,6 +2210,10 @@ static int gfx_v9_0_gpu_early_init(struct amdgpu_device 
*adev)
/* If not define special ras_fini function, use gfx default 
ras_fini */
if (!adev->gfx.ras->ras_block.ras_fini)
adev->gfx.ras->ras_block.ras_fini = amdgpu_gfx_ras_fini;
+
+   /* If not defined special ras_cb function, use default ras_cb */
+   if (!adev->gfx.ras->ras_block.ras_cb)
+   adev->gfx.ras->ras_block.ras_cb = 
amdgpu_gfx_process_ras_data_cb;
}
 
adev->gfx.config.gb_addr_config = gb_addr_config;
-- 
2.25.1



[PATCH 03/11] drm/amdgpu: Optimize amdgpu_hdp_ras_late_init/amdgpu_hdp_ras_fini function code

2022-02-08 Thread yipechai
Optimize amdgpu_hdp_ras_late_init/amdgpu_hdp_ras_fini function code.

Signed-off-by: yipechai 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.c | 37 ++---
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   |  1 +
 drivers/gpu/drm/amd/amdgpu/hdp_v4_0.c   |  1 +
 3 files changed, 5 insertions(+), 34 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.c
index 518966a26130..21a5f884dd2a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.c
@@ -26,43 +26,12 @@
 
 int amdgpu_hdp_ras_late_init(struct amdgpu_device *adev, void *ras_info)
 {
-   int r;
-   struct ras_ih_if ih_info = {
-   .cb = NULL,
-   };
-   struct ras_fs_if fs_info = {
-   .sysfs_name = "hdp_err_count",
-   };
-
-   if (!adev->hdp.ras_if) {
-   adev->hdp.ras_if = kmalloc(sizeof(struct ras_common_if), 
GFP_KERNEL);
-   if (!adev->hdp.ras_if)
-   return -ENOMEM;
-   adev->hdp.ras_if->block = AMDGPU_RAS_BLOCK__HDP;
-   adev->hdp.ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
-   adev->hdp.ras_if->sub_block_index = 0;
-   }
-   ih_info.head = fs_info.head = *adev->hdp.ras_if;
-   r = amdgpu_ras_late_init(adev, adev->hdp.ras_if,
-_info, _info);
-   if (r || !amdgpu_ras_is_supported(adev, adev->hdp.ras_if->block)) {
-   kfree(adev->hdp.ras_if);
-   adev->hdp.ras_if = NULL;
-   }
-
-   return r;
+   return amdgpu_ras_block_late_init(adev, adev->hdp.ras_if);
 }
 
 void amdgpu_hdp_ras_fini(struct amdgpu_device *adev)
 {
if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__HDP) &&
-   adev->hdp.ras_if) {
-   struct ras_common_if *ras_if = adev->hdp.ras_if;
-   struct ras_ih_if ih_info = {
-   .cb = NULL,
-   };
-
-   amdgpu_ras_late_fini(adev, ras_if, _info);
-   kfree(ras_if);
-   }
+   adev->hdp.ras_if)
+   amdgpu_ras_block_late_fini(adev, adev->hdp.ras_if);
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index af873c99d5e4..b12fe6703f02 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -1302,6 +1302,7 @@ static void gmc_v9_0_set_hdp_ras_funcs(struct 
amdgpu_device *adev)
 {
adev->hdp.ras = _v4_0_ras;
amdgpu_ras_register_ras_block(adev, >hdp.ras->ras_block);
+   adev->hdp.ras_if = >hdp.ras->ras_block.ras_comm;
 }
 
 static void gmc_v9_0_set_mca_funcs(struct amdgpu_device *adev)
diff --git a/drivers/gpu/drm/amd/amdgpu/hdp_v4_0.c 
b/drivers/gpu/drm/amd/amdgpu/hdp_v4_0.c
index 503c292b321e..a9ed4232cdeb 100644
--- a/drivers/gpu/drm/amd/amdgpu/hdp_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/hdp_v4_0.c
@@ -160,6 +160,7 @@ struct amdgpu_hdp_ras hdp_v4_0_ras = {
.ras_comm = {
.name = "hdp",
.block = AMDGPU_RAS_BLOCK__HDP,
+   .type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
},
.hw_ops = _v4_0_ras_hw_ops,
.ras_late_init = amdgpu_hdp_ras_late_init,
-- 
2.25.1



[PATCH 01/11] drm/amdgpu: Optimize xxx_ras_late_init/xxx_ras_late_fini for each ras block

2022-02-08 Thread yipechai
1. Define amdgpu_ras_block_late_init to create sysfs nodes
   and interrupt handlers.
2. Define amdgpu_ras_block_late_fini to remove sysfs nodes
   and interrupt handlers.
3. Replace ras block variable members in struct
   amdgpu_ras_block_object with struct ras_common_if, which
   makes it easy to associate each ras block instance
   with each ras block functional interface (see the sketch
   after this list).
4. Add .ras_cb to struct amdgpu_ras_block_object.
5. Adapt each ras block to the changes in struct
   amdgpu_ras_block_object.
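
As a minimal, self-contained sketch of point 3 (not the patch itself;
the trimmed-down types are assumptions for illustration), embedding
struct ras_common_if in the block object lets common code recover the
per-block instance and its callback:

#include <stddef.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct ras_common_if {
        int block;
        const char *name;
};

struct ras_block_object_sketch {
        struct ras_common_if ras_comm;  /* embedded common interface */
        void (*ras_cb)(void);           /* per-block callback (point 4) */
};

static void dispatch(struct ras_common_if *ras_if)
{
        /* Recover the enclosing block object from the common member. */
        struct ras_block_object_sketch *obj =
                container_of(ras_if, struct ras_block_object_sketch,
                             ras_comm);

        if (obj->ras_cb)
                obj->ras_cb();
}

The real amdgpu_ras_block_late_init()/late_fini() in the diff below use
exactly this container_of() step to pick up .ras_cb for the interrupt
handler info.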

Signed-off-by: yipechai 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c  |  7 +++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c  | 35 +++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h  | 15 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c |  6 ++--
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c|  4 +--
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c   |  4 +--
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c|  8 +++---
 drivers/gpu/drm/amd/amdgpu/hdp_v4_0.c|  6 ++--
 drivers/gpu/drm/amd/amdgpu/mca_v3_0.c| 28 +++
 drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c   |  6 ++--
 drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c   |  4 +--
 11 files changed, 86 insertions(+), 37 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
index 52a60c2316a2..ad057d6b2c77 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
@@ -83,14 +83,15 @@ int amdgpu_mca_ras_late_init(struct amdgpu_device *adev,
.sysfs_name = sysfs_name,
};
 
-   snprintf(sysfs_name, sizeof(sysfs_name), "%s_err_count", 
mca_dev->ras->ras_block.name);
+   snprintf(sysfs_name, sizeof(sysfs_name), "%s_err_count",
+   mca_dev->ras->ras_block.ras_comm.name);
 
if (!mca_dev->ras_if) {
mca_dev->ras_if = kmalloc(sizeof(struct ras_common_if), 
GFP_KERNEL);
if (!mca_dev->ras_if)
return -ENOMEM;
-   mca_dev->ras_if->block = mca_dev->ras->ras_block.block;
-   mca_dev->ras_if->sub_block_index = 
mca_dev->ras->ras_block.sub_block_index;
+   mca_dev->ras_if->block = mca_dev->ras->ras_block.ras_comm.block;
+   mca_dev->ras_if->sub_block_index = 
mca_dev->ras->ras_block.ras_comm.sub_block_index;
mca_dev->ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
}
ih_info.head = fs_info.head = *mca_dev->ras_if;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 5934326b9db3..b7aed19db7e9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -877,7 +877,7 @@ static int amdgpu_ras_block_match_default(struct 
amdgpu_ras_block_object *block_
if (!block_obj)
return -EINVAL;
 
-   if (block_obj->block == block)
+   if (block_obj->ras_comm.block == block)
return 0;
 
return -EINVAL;
@@ -2457,6 +2457,23 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev,
return r;
 }
 
+int amdgpu_ras_block_late_init(struct amdgpu_device *adev,
+   struct ras_common_if *ras_block)
+{
+   char sysfs_name[32];
+   struct ras_ih_if ih_info;
+   struct ras_fs_if fs_info;
+   struct amdgpu_ras_block_object *obj;
+
+   obj = container_of(ras_block, struct amdgpu_ras_block_object, ras_comm);
+   ih_info.cb = obj->ras_cb;
+   ih_info.head = *ras_block;
+   snprintf(sysfs_name, sizeof(sysfs_name), "%s_err_count", 
ras_block->name);
+   fs_info.sysfs_name = (const char *)sysfs_name;
+   fs_info.head = *ras_block;
+   return amdgpu_ras_late_init(adev, ras_block, _info, _info);
+}
+
 /* helper function to remove ras fs node and interrupt handler */
 void amdgpu_ras_late_fini(struct amdgpu_device *adev,
  struct ras_common_if *ras_block,
@@ -2470,6 +2487,22 @@ void amdgpu_ras_late_fini(struct amdgpu_device *adev,
amdgpu_ras_interrupt_remove_handler(adev, ih_info);
 }
 
+void amdgpu_ras_block_late_fini(struct amdgpu_device *adev,
+ struct ras_common_if *ras_block)
+{
+   struct ras_ih_if ih_info;
+   struct amdgpu_ras_block_object *obj;
+
+   if (!ras_block)
+   return;
+
+   obj = container_of(ras_block, struct amdgpu_ras_block_object, ras_comm);
+   ih_info.head = *ras_block;
+   ih_info.cb = obj->ras_cb;
+
+   amdgpu_ras_late_fini(adev, ras_block, _info);
+}
+
 /* do some init work after IP late init as dependence.
  * and it runs in resume/gpu reset/booting up cases.
  */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index a55743b12d57..8b94b556baf6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -486,17 +486,13 @@ struct ras_debug_if {
 };
 
 struct amdgpu_ras_block_object {
-  

Re: [PATCH 7/8] mm: remove the extra ZONE_DEVICE struct page refcount

2022-02-08 Thread Dan Williams
On Sun, Feb 6, 2022 at 10:33 PM Christoph Hellwig  wrote:
[..]
> @@ -500,28 +482,27 @@ void free_devmap_managed_page(struct page *page)
>  */
> page->mapping = NULL;
> page->pgmap->ops->page_free(page);
> +
> +   /*
> +* Reset the page count to 1 to prepare for handing out the page 
> again.
> +*/
> +   set_page_count(page, 1);

Interesting. I had expected that, to really fix the refcount problem,
fs/dax.c would need to start taking real page references as pages
were added to a mapping, just like the page cache.

This looks OK to me and passes my tests. So, given that I'm still
working my way back to fixing the references properly, I'm fine with
this hack replacing the more broken hack that is there presently.

Reviewed-by: Dan Williams 
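
As a rough standalone model of the lifecycle the quoted hunk implements
(a simplification under assumed semantics, not the kernel code): the
ZONE_DEVICE page is handed out with a count of 1, freed when the count
drops to zero, and the count is then reset to 1 so the page can be
handed out again.

#include <stdio.h>

struct page_model { int refcount; };

static void free_devmap_page(struct page_model *p)
{
        /* Mirrors the hunk: run the driver free path, then reset the
         * count to 1 to prepare for handing out the page again. */
        printf("page freed\n");
        p->refcount = 1;
}

static void put_page_model(struct page_model *p)
{
        if (--p->refcount == 0)
                free_devmap_page(p);
}

int main(void)
{
        struct page_model p = { .refcount = 1 };        /* handed out */

        put_page_model(&p);     /* 1 -> 0: freed, reset to 1 */
        return 0;
}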



RE: [RFC v4 04/11] drm/amd/virt: For SRIOV send GPU reset directly to TDR queue.

2022-02-08 Thread Liu, Shaoyun
[AMD Official Use Only]

This patch is reviewed by Shaoyun.liu 

Since the other patches were suggested by other engineers who may
already have done some review on them, I will leave it to them to
continue reviewing the rest of the patches.

Regards
Shaoyun.liu

-Original Message-
From: Grodzovsky, Andrey  
Sent: Tuesday, February 8, 2022 7:23 PM
To: dri-de...@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
Cc: Koenig, Christian ; dan...@ffwll.ch; Liu, Monk 
; Chen, Horace ; Lazar, Lijo 
; Chen, JingWen ; Grodzovsky, Andrey 
; Liu, Shaoyun 
Subject: [RFC v4 04/11] drm/amd/virt: For SRIOV send GPU reset directly to TDR 
queue.

No need to trigger another work queue inside the work queue.

v3:

Problem:
Extra reset caused by host side FLR notification following guest side
triggered reset.
Fix: Prevent queuing flr_work from the mailbox IRQ if the guest is
already executing a reset.

Suggested-by: Liu Shaoyun 
Signed-off-by: Andrey Grodzovsky 
---
 drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 9 ++---  
drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 9 ++---  
drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 9 ++---
 3 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c 
b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
index 56da5ab82987..5869d51d8bee 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
@@ -282,7 +282,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct 
*work)
if (amdgpu_device_should_recover_gpu(adev)
&& (!amdgpu_device_has_job_running(adev) ||
adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT))
-   amdgpu_device_gpu_recover(adev, NULL);
+   amdgpu_device_gpu_recover_imp(adev, NULL);
 }
 
 static int xgpu_ai_set_mailbox_rcv_irq(struct amdgpu_device *adev, @@ -307,8 
+307,11 @@ static int xgpu_ai_mailbox_rcv_irq(struct amdgpu_device *adev,
 
switch (event) {
case IDH_FLR_NOTIFICATION:
-   if (amdgpu_sriov_runtime(adev))
-   schedule_work(>virt.flr_work);
+   if (amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev))
+   WARN_ONCE(!queue_work(adev->reset_domain.wq,
+ >virt.flr_work),
+ "Failed to queue work! at %s",
+ __func__);
break;
case IDH_QUERY_ALIVE:
xgpu_ai_mailbox_send_ack(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c 
b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
index 477d0dde19c5..5728a6401d73 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
@@ -309,7 +309,7 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct 
*work)
adev->gfx_timeout == MAX_SCHEDULE_TIMEOUT ||
adev->compute_timeout == MAX_SCHEDULE_TIMEOUT ||
adev->video_timeout == MAX_SCHEDULE_TIMEOUT))
-   amdgpu_device_gpu_recover(adev, NULL);
+   amdgpu_device_gpu_recover_imp(adev, NULL);
 }
 
 static int xgpu_nv_set_mailbox_rcv_irq(struct amdgpu_device *adev, @@ -337,8 
+337,11 @@ static int xgpu_nv_mailbox_rcv_irq(struct amdgpu_device *adev,
 
switch (event) {
case IDH_FLR_NOTIFICATION:
-   if (amdgpu_sriov_runtime(adev))
-   schedule_work(>virt.flr_work);
+   if (amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev))
+   WARN_ONCE(!queue_work(adev->reset_domain.wq,
+ >virt.flr_work),
+ "Failed to queue work! at %s",
+ __func__);
break;
/* READY_TO_ACCESS_GPU is fetched by kernel polling, IRQ can 
ignore
 * it byfar since that polling thread will handle it, diff 
--git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c 
b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
index aef9d059ae52..02290febfcf4 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
@@ -521,7 +521,7 @@ static void xgpu_vi_mailbox_flr_work(struct work_struct 
*work)
 
/* Trigger recovery due to world switch failure */
if (amdgpu_device_should_recover_gpu(adev))
-   amdgpu_device_gpu_recover(adev, NULL);
+   amdgpu_device_gpu_recover_imp(adev, NULL);
 }
 
 static int xgpu_vi_set_mailbox_rcv_irq(struct amdgpu_device *adev, @@ -550,8 
+550,11 @@ static int xgpu_vi_mailbox_rcv_irq(struct amdgpu_device *adev,
r = xgpu_vi_mailbox_rcv_msg(adev, IDH_FLR_NOTIFICATION);
 
/* only handle FLR_NOTIFY now */
-   if (!r)
-   schedule_work(>virt.flr_work);
+   if (!r && !amdgpu_in_reset(adev))
+   WARN_ONCE(!queue_work(adev->reset_domain.wq,
+   

[RFC v4 07/11] drm/amdgpu: Rework reset domain to be refcounted.

2022-02-08 Thread Andrey Grodzovsky
The reset domain now contains the register access semaphore
and so needs to be present as long as each device
in a hive needs it; it therefore cannot be bound to the
XGMI hive life cycle.
Address this by making the reset domain refcounted and
pointed to by each member of the hive and by the hive itself.

v4:

Fix crash on boot with an XGMI hive by adding a type to reset_domain.
XGMI will only create a new reset_domain if the previous one was of
single-device type, meaning it is the first boot. Otherwise it will
take a refcount to the existing reset_domain from the amdgpu device.

Add a wrapper around reset_domain->refcount get/put
and a wrapper around sending to the reset wq (Lijo)
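
The refcount wrappers themselves do not appear in the hunks quoted
below, so here is a hedged sketch of the plausible shape of the
get/put and queue helpers (kernel-style; the kref layout and helper
names are assumptions based on the diffstat, not the actual hunks):

#include <linux/kernel.h>
#include <linux/kref.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct reset_domain_sketch {
        struct kref refcount;
        struct workqueue_struct *wq;
};

static struct reset_domain_sketch *
reset_domain_get(struct reset_domain_sketch *domain)
{
        kref_get(&domain->refcount);
        return domain;
}

static void reset_domain_destroy(struct kref *ref)
{
        struct reset_domain_sketch *domain =
                container_of(ref, struct reset_domain_sketch, refcount);

        destroy_workqueue(domain->wq);
        kfree(domain);
}

static void reset_domain_put(struct reset_domain_sketch *domain)
{
        kref_put(&domain->refcount, reset_domain_destroy);
}

/* Wrapper around sending reset work to the domain's ordered wq. */
static bool reset_domain_schedule(struct reset_domain_sketch *domain,
                                  struct work_struct *work)
{
        return queue_work(domain->wq, work);
}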

Signed-off-by: Andrey Grodzovsky 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h|  6 +--
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 44 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c  | 40 
 drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h  | 35 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   | 29 +++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h   |  2 +-
 drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c  |  6 ++-
 drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c  |  6 ++-
 drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c  |  6 ++-
 9 files changed, 140 insertions(+), 34 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 540a38fe5cd6..cb9764513df8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -813,9 +813,7 @@ struct amd_powerplay {
 #define AMDGPU_RESET_MAGIC_NUM 64
 #define AMDGPU_MAX_DF_PERFMONS 4
 #define AMDGPU_PRODUCT_NAME_LEN 64
-struct amdgpu_reset_domain {
-   struct workqueue_struct *wq;
-};
+struct amdgpu_reset_domain;
 
 struct amdgpu_device {
struct device   *dev;
@@ -1104,7 +1102,7 @@ struct amdgpu_device {
uint32_t
ip_versions[MAX_HWIP][HWIP_MAX_INSTANCE];
 
boolram_is_direct_mapped;
-   struct amdgpu_reset_domain  reset_domain;
+   struct amdgpu_reset_domain  *reset_domain;
 };
 
 static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index e3c0ec684a85..d61bc0a0457c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2316,7 +2316,7 @@ static int amdgpu_device_init_schedulers(struct 
amdgpu_device *adev)
 
r = drm_sched_init(>sched, _sched_ops,
   ring->num_hw_submission, 
amdgpu_job_hang_limit,
-  timeout, adev->reset_domain.wq, 
ring->sched_score, ring->name);
+  timeout, adev->reset_domain->wq, 
ring->sched_score, ring->name);
if (r) {
DRM_ERROR("Failed to create scheduler on ring %s.\n",
  ring->name);
@@ -2439,24 +2439,22 @@ static int amdgpu_device_ip_init(struct amdgpu_device 
*adev)
if (r)
goto init_failed;
 
+   /**
+* In case of XGMI grab extra reference for reset domain for this device
+*/
if (adev->gmc.xgmi.num_physical_nodes > 1) {
-   struct amdgpu_hive_info *hive;
+   if (amdgpu_xgmi_add_device(adev) == 0) {
+   struct amdgpu_hive_info *hive = 
amdgpu_get_xgmi_hive(adev);
 
-   amdgpu_xgmi_add_device(adev);
+   if (!hive->reset_domain ||
+   !amdgpu_reset_get_reset_domain(hive->reset_domain)) 
{
+   r = -ENOENT;
+   goto init_failed;
+   }
 
-   hive = amdgpu_get_xgmi_hive(adev);
-   if (!hive || !hive->reset_domain.wq) {
-   DRM_ERROR("Failed to obtain reset domain info for XGMI 
hive:%llx", hive->hive_id);
-   r = -EINVAL;
-   goto init_failed;
-   }
-
-   adev->reset_domain.wq = hive->reset_domain.wq;
-   } else {
-   adev->reset_domain.wq = 
alloc_ordered_workqueue("amdgpu-reset-dev", 0);
-   if (!adev->reset_domain.wq) {
-   r = -ENOMEM;
-   goto init_failed;
+   /* Drop the early temporary reset domain we created for 
device */
+   amdgpu_reset_put_reset_domain(adev->reset_domain);
+   adev->reset_domain = hive->reset_domain;
}
}
 
@@ -3640,6 +3638,15 @@ int amdgpu_device_init(struct amdgpu_device *adev,
return r;
}
 
+   /*
+* Reset domain needs to be present early, before XGMI hive discovered
+* (if any) and initialized to use reset sem and in_gpu_reset flag
+* early on during init.
+*/
+   

[RFC v4 11/11] Revert 'drm/amdgpu: annotate a false positive recursive locking'

2022-02-08 Thread Andrey Grodzovsky
Since we now have a single instance of the reset semaphore, which we
lock only once even for an XGMI hive, we don't need the nested
locking hint anymore.
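
For context, the reverted annotation worked like this:
down_write_nest_lock() tells lockdep that several rwsems of the same
lock class are being taken under one outer lock, so the per-device
acquisitions in a hive were not flagged as recursive locking. A small
illustrative sketch (simplified stand-in types, not driver code):

#include <linux/mutex.h>
#include <linux/rwsem.h>

struct dev_sketch {
        struct rw_semaphore reset_sem;
};

static void lock_hive_sketch(struct mutex *hive_lock,
                             struct dev_sketch *devs, int n)
{
        int i;

        mutex_lock(hive_lock);
        /* N sems of the same class: hint lockdep via the outer lock. */
        for (i = 0; i < n; i++)
                down_write_nest_lock(&devs[i].reset_sem, hive_lock);
}

With a single refcounted semaphore per reset domain the write lock is
taken exactly once, so the plain down_write() in the diff suffices.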

Signed-off-by: Andrey Grodzovsky 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 14 --
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index aaecf0797484..75d0dd289023 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4825,16 +4825,10 @@ int amdgpu_do_asic_reset(struct list_head 
*device_list_handle,
return r;
 }
 
-static void amdgpu_device_lock_reset_domain(struct amdgpu_reset_domain 
*reset_domain,
-   struct amdgpu_hive_info *hive)
+static void amdgpu_device_lock_reset_domain(struct amdgpu_reset_domain 
*reset_domain)
 {
atomic_set(_domain->in_gpu_reset, 1);
-
-   if (hive) {
-   down_write_nest_lock(_domain->sem, >hive_lock);
-   } else {
-   down_write(_domain->sem);
-   }
+   down_write(_domain->sem);
 }
 
 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev)
@@ -5072,7 +5066,7 @@ int amdgpu_device_gpu_recover_imp(struct amdgpu_device 
*adev,
/* We need to lock reset domain only once both for XGMI and single 
device */
tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
reset_list);
-   amdgpu_device_lock_reset_domain(tmp_adev->reset_domain, hive);
+   amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);
 
/* block all schedulers and reset given job's ring */
list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
@@ -5496,7 +5490,7 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev 
*pdev, pci_channel_sta
 * Locking adev->reset_domain->sem will prevent any external 
access
 * to GPU during PCI error recovery
 */
-   amdgpu_device_lock_reset_domain(adev->reset_domain, NULL);
+   amdgpu_device_lock_reset_domain(adev->reset_domain);
amdgpu_device_set_mp1_state(adev);
 
/*
-- 
2.25.1



[RFC v4 06/11] drm/amdgpu: Drop concurrent GPU reset protection for device

2022-02-08 Thread Andrey Grodzovsky
Since all GPU resets are now serialized there is no need for this.

This patch also reverts 'drm/amdgpu: race issue when jobs on 2 ring timeout'

Signed-off-by: Andrey Grodzovsky 
Reviewed-by: Christian König 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 89 ++
 1 file changed, 7 insertions(+), 82 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 7e92f2432087..e3c0ec684a85 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4817,11 +4817,10 @@ int amdgpu_do_asic_reset(struct list_head 
*device_list_handle,
return r;
 }
 
-static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
+static void amdgpu_device_lock_adev(struct amdgpu_device *adev,
struct amdgpu_hive_info *hive)
 {
-   if (atomic_cmpxchg(>in_gpu_reset, 0, 1) != 0)
-   return false;
+   atomic_set(>in_gpu_reset, 1);
 
if (hive) {
down_write_nest_lock(>reset_sem, >hive_lock);
@@ -4840,8 +4839,6 @@ static bool amdgpu_device_lock_adev(struct amdgpu_device 
*adev,
adev->mp1_state = PP_MP1_STATE_NONE;
break;
}
-
-   return true;
 }
 
 static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
@@ -4852,46 +4849,6 @@ static void amdgpu_device_unlock_adev(struct 
amdgpu_device *adev)
up_write(>reset_sem);
 }
 
-/*
- * to lockup a list of amdgpu devices in a hive safely, if not a hive
- * with multiple nodes, it will be similar as amdgpu_device_lock_adev.
- *
- * unlock won't require roll back.
- */
-static int amdgpu_device_lock_hive_adev(struct amdgpu_device *adev, struct 
amdgpu_hive_info *hive)
-{
-   struct amdgpu_device *tmp_adev = NULL;
-
-   if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) {
-   if (!hive) {
-   dev_err(adev->dev, "Hive is NULL while device has 
multiple xgmi nodes");
-   return -ENODEV;
-   }
-   list_for_each_entry(tmp_adev, >device_list, 
gmc.xgmi.head) {
-   if (!amdgpu_device_lock_adev(tmp_adev, hive))
-   goto roll_back;
-   }
-   } else if (!amdgpu_device_lock_adev(adev, hive))
-   return -EAGAIN;
-
-   return 0;
-roll_back:
-   if (!list_is_first(_adev->gmc.xgmi.head, >device_list)) {
-   /*
-* if the lockup iteration break in the middle of a hive,
-* it may means there may has a race issue,
-* or a hive device locked up independently.
-* we may be in trouble and may not, so will try to roll back
-* the lock and give out a warnning.
-*/
-   dev_warn(tmp_adev->dev, "Hive lock iteration broke in the 
middle. Rolling back to unlock");
-   list_for_each_entry_continue_reverse(tmp_adev, 
>device_list, gmc.xgmi.head) {
-   amdgpu_device_unlock_adev(tmp_adev);
-   }
-   }
-   return -EAGAIN;
-}
-
 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
 {
struct pci_dev *p = NULL;
@@ -5078,22 +5035,6 @@ int amdgpu_device_gpu_recover_imp(struct amdgpu_device 
*adev,
reset_context.hive = hive;
clear_bit(AMDGPU_NEED_FULL_RESET, _context.flags);
 
-   /*
-* lock the device before we try to operate the linked list
-* if didn't get the device lock, don't touch the linked list since
-* others may iterating it.
-*/
-   r = amdgpu_device_lock_hive_adev(adev, hive);
-   if (r) {
-   dev_info(adev->dev, "Bailing on TDR for s_job:%llx, as another 
already in progress",
-   job ? job->base.id : -1);
-
-   /* even we skipped this reset, still need to set the job to 
guilty */
-   if (job && job->vm)
-   drm_sched_increase_karma(>base);
-   goto skip_recovery;
-   }
-
/*
 * Build list of devices to reset.
 * In case we are in XGMI hive mode, resort the device list
@@ -5113,6 +5054,9 @@ int amdgpu_device_gpu_recover_imp(struct amdgpu_device 
*adev,
 
/* block all schedulers and reset given job's ring */
list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
+
+   amdgpu_device_lock_adev(tmp_adev, hive);
+
/*
 * Try to put the audio codec into suspend state
 * before gpu reset started.
@@ -5264,13 +5208,12 @@ int amdgpu_device_gpu_recover_imp(struct amdgpu_device 
*adev,
amdgpu_device_unlock_adev(tmp_adev);
}
 
-skip_recovery:
if (hive) {
mutex_unlock(>hive_lock);
amdgpu_put_xgmi_hive(hive);
}
 
-   if (r && r != -EAGAIN)

[RFC v4 10/11] drm/amdgpu: Rework amdgpu_device_lock_adev

2022-02-08 Thread Andrey Grodzovsky
This function needs to be split into two parts: one that is
called only once to lock the single instance of the
reset_domain's sem and reset flag, and another that handles
the MP1 states and should still be called for
each device in the XGMI hive.
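
Schematically, the split the diff below establishes is one domain-wide
lock followed by per-device MP1 handling (lines taken from the hunks):

        /* One lock for the whole reset domain ... */
        amdgpu_device_lock_reset_domain(tmp_adev->reset_domain, hive);

        /* ... but MP1 state transitions stay per device. */
        list_for_each_entry(tmp_adev, device_list_handle, reset_list)
                amdgpu_device_set_mp1_state(tmp_adev);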

Signed-off-by: Andrey Grodzovsky 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 48 --
 1 file changed, 35 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index e05d7cbefd2c..aaecf0797484 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4825,16 +4825,20 @@ int amdgpu_do_asic_reset(struct list_head 
*device_list_handle,
return r;
 }
 
-static void amdgpu_device_lock_adev(struct amdgpu_device *adev,
-   struct amdgpu_hive_info *hive)
+static void amdgpu_device_lock_reset_domain(struct amdgpu_reset_domain 
*reset_domain,
+   struct amdgpu_hive_info *hive)
 {
-   atomic_set(>reset_domain->in_gpu_reset, 1);
+   atomic_set(_domain->in_gpu_reset, 1);
 
if (hive) {
-   down_write_nest_lock(>reset_domain->sem, 
>hive_lock);
+   down_write_nest_lock(_domain->sem, >hive_lock);
} else {
-   down_write(>reset_domain->sem);
+   down_write(_domain->sem);
}
+}
+
+static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev)
+{
 
switch (amdgpu_asic_reset_method(adev)) {
case AMD_RESET_METHOD_MODE1:
@@ -4849,14 +4853,19 @@ static void amdgpu_device_lock_adev(struct 
amdgpu_device *adev,
}
 }
 
-static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
+static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev)
 {
amdgpu_vf_error_trans_all(adev);
adev->mp1_state = PP_MP1_STATE_NONE;
-   atomic_set(>reset_domain->in_gpu_reset, 0);
-   up_write(>reset_domain->sem);
 }
 
+static void amdgpu_device_unlock_reset_domain(struct amdgpu_reset_domain 
*reset_domain)
+{
+   atomic_set(_domain->in_gpu_reset, 0);
+   up_write(_domain->sem);
+}
+
+
 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
 {
struct pci_dev *p = NULL;
@@ -5060,10 +5069,15 @@ int amdgpu_device_gpu_recover_imp(struct amdgpu_device 
*adev,
device_list_handle = _list;
}
 
+   /* We need to lock reset domain only once both for XGMI and single 
device */
+   tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
+   reset_list);
+   amdgpu_device_lock_reset_domain(tmp_adev->reset_domain, hive);
+
/* block all schedulers and reset given job's ring */
list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
 
-   amdgpu_device_lock_adev(tmp_adev, hive);
+   amdgpu_device_set_mp1_state(tmp_adev);
 
/*
 * Try to put the audio codec into suspend state
@@ -5213,9 +5227,14 @@ int amdgpu_device_gpu_recover_imp(struct amdgpu_device 
*adev,
 
if (audio_suspended)
amdgpu_device_resume_display_audio(tmp_adev);
-   amdgpu_device_unlock_adev(tmp_adev);
+
+   amdgpu_device_unset_mp1_state(tmp_adev);
}
 
+   tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
+   reset_list);
+   amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
+
if (hive) {
mutex_unlock(>hive_lock);
amdgpu_put_xgmi_hive(hive);
@@ -5477,7 +5496,8 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev 
*pdev, pci_channel_sta
 * Locking adev->reset_domain->sem will prevent any external 
access
 * to GPU during PCI error recovery
 */
-   amdgpu_device_lock_adev(adev, NULL);
+   amdgpu_device_lock_reset_domain(adev->reset_domain, NULL);
+   amdgpu_device_set_mp1_state(adev);
 
/*
 * Block any work scheduling as we do for regular GPU reset
@@ -5584,7 +5604,8 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev 
*pdev)
DRM_INFO("PCIe error recovery succeeded\n");
} else {
DRM_ERROR("PCIe error recovery failed, err:%d", r);
-   amdgpu_device_unlock_adev(adev);
+   amdgpu_device_unset_mp1_state(adev);
+   amdgpu_device_unlock_reset_domain(adev->reset_domain);
}
 
return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
@@ -5621,7 +5642,8 @@ void amdgpu_pci_resume(struct pci_dev *pdev)
drm_sched_start(>sched, true);
}
 
-   amdgpu_device_unlock_adev(adev);
+   amdgpu_device_unset_mp1_state(adev);
+   amdgpu_device_unlock_reset_domain(adev->reset_domain);
 }
 
 

[RFC v4 08/11] drm/amdgpu: Move reset sem into reset_domain

2022-02-08 Thread Andrey Grodzovsky
We want a single instance of the reset sem across all
reset clients because, in the case of XGMI, we should stop
cross-device MMIO access, since any of the devices could be
in reset at the moment.

Signed-off-by: Andrey Grodzovsky 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h   |  1 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c   | 10 
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c| 23 +--
 .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c| 18 ---
 drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c |  2 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h |  1 +
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c|  6 +++--
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 14 ++-
 drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c |  4 ++--
 drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c |  4 ++--
 10 files changed, 46 insertions(+), 37 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index cb9764513df8..ddfbcc8fd3d3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1058,7 +1058,6 @@ struct amdgpu_device {
 
atomic_tin_gpu_reset;
enum pp_mp1_state   mp1_state;
-   struct rw_semaphore reset_sem;
struct amdgpu_doorbell_index doorbell_index;
 
struct mutexnotifier_lock;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 25e2e5bf90eb..c3728061d65a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -37,6 +37,8 @@
 #include "amdgpu_fw_attestation.h"
 #include "amdgpu_umr.h"
 
+#include "amdgpu_reset.h"
+
 #if defined(CONFIG_DEBUG_FS)
 
 /**
@@ -1279,7 +1281,7 @@ static int amdgpu_debugfs_test_ib_show(struct seq_file 
*m, void *unused)
}
 
/* Avoid accidently unparking the sched thread during GPU reset */
-   r = down_write_killable(>reset_sem);
+   r = down_write_killable(>reset_domain->sem);
if (r)
return r;
 
@@ -1308,7 +1310,7 @@ static int amdgpu_debugfs_test_ib_show(struct seq_file 
*m, void *unused)
kthread_unpark(ring->sched.thread);
}
 
-   up_write(>reset_sem);
+   up_write(>reset_domain->sem);
 
pm_runtime_mark_last_busy(dev->dev);
pm_runtime_put_autosuspend(dev->dev);
@@ -1517,7 +1519,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
return -ENOMEM;
 
/* Avoid accidently unparking the sched thread during GPU reset */
-   r = down_read_killable(>reset_sem);
+   r = down_read_killable(>reset_domain->sem);
if (r)
goto pro_end;
 
@@ -1560,7 +1562,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
/* restart the scheduler */
kthread_unpark(ring->sched.thread);
 
-   up_read(>reset_sem);
+   up_read(>reset_domain->sem);
 
ttm_bo_unlock_delayed_workqueue(>mman.bdev, resched);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index d61bc0a0457c..dcbb175d336f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -424,10 +424,10 @@ bool amdgpu_device_skip_hw_access(struct amdgpu_device 
*adev)
 * the lock.
 */
if (in_task()) {
-   if (down_read_trylock(>reset_sem))
-   up_read(>reset_sem);
+   if (down_read_trylock(>reset_domain->sem))
+   up_read(>reset_domain->sem);
else
-   lockdep_assert_held(>reset_sem);
+   lockdep_assert_held(>reset_domain->sem);
}
 #endif
return false;
@@ -453,9 +453,9 @@ uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
if ((reg * 4) < adev->rmmio_size) {
if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
amdgpu_sriov_runtime(adev) &&
-   down_read_trylock(>reset_sem)) {
+   down_read_trylock(>reset_domain->sem)) {
ret = amdgpu_kiq_rreg(adev, reg);
-   up_read(>reset_sem);
+   up_read(>reset_domain->sem);
} else {
ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
}
@@ -538,9 +538,9 @@ void amdgpu_device_wreg(struct amdgpu_device *adev,
if ((reg * 4) < adev->rmmio_size) {
if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
amdgpu_sriov_runtime(adev) &&
-   down_read_trylock(>reset_sem)) {
+   down_read_trylock(>reset_domain->sem)) {
amdgpu_kiq_wreg(adev, reg, v);
-   up_read(>reset_sem);
+   up_read(>reset_domain->sem);
} else {
writel(v, ((void 

[RFC v4 09/11] drm/amdgpu: Move in_gpu_reset into reset_domain

2022-02-08 Thread Andrey Grodzovsky
We should have a single instance per entire reset domain.

Signed-off-by: Andrey Grodzovsky 
Suggested-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h|  7 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 10 +++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c  |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h  |  1 +
 drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c  |  4 ++--
 drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c  |  4 ++--
 6 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index ddfbcc8fd3d3..b89406b01694 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1056,7 +1056,6 @@ struct amdgpu_device {
boolin_s4;
boolin_s0ix;
 
-   atomic_tin_gpu_reset;
enum pp_mp1_state   mp1_state;
struct amdgpu_doorbell_index doorbell_index;
 
@@ -1463,8 +1462,6 @@ static inline bool amdgpu_is_tmz(struct amdgpu_device 
*adev)
return adev->gmc.tmz_enabled;
 }
 
-static inline int amdgpu_in_reset(struct amdgpu_device *adev)
-{
-   return atomic_read(>in_gpu_reset);
-}
+int amdgpu_in_reset(struct amdgpu_device *adev);
+
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index dcbb175d336f..e05d7cbefd2c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3554,7 +3554,6 @@ int amdgpu_device_init(struct amdgpu_device *adev,
mutex_init(>mn_lock);
mutex_init(>virt.vf_errors.lock);
hash_init(adev->mn_hash);
-   atomic_set(>in_gpu_reset, 0);
mutex_init(>psp.mutex);
mutex_init(>notifier_lock);
 
@@ -4829,7 +4828,7 @@ int amdgpu_do_asic_reset(struct list_head 
*device_list_handle,
 static void amdgpu_device_lock_adev(struct amdgpu_device *adev,
struct amdgpu_hive_info *hive)
 {
-   atomic_set(>in_gpu_reset, 1);
+   atomic_set(>reset_domain->in_gpu_reset, 1);
 
if (hive) {
down_write_nest_lock(>reset_domain->sem, 
>hive_lock);
@@ -4854,7 +4853,7 @@ static void amdgpu_device_unlock_adev(struct 
amdgpu_device *adev)
 {
amdgpu_vf_error_trans_all(adev);
adev->mp1_state = PP_MP1_STATE_NONE;
-   atomic_set(>in_gpu_reset, 0);
+   atomic_set(>reset_domain->in_gpu_reset, 0);
up_write(>reset_domain->sem);
 }
 
@@ -5699,6 +5698,11 @@ void amdgpu_device_invalidate_hdp(struct amdgpu_device 
*adev,
amdgpu_asic_invalidate_hdp(adev, ring);
 }
 
+int amdgpu_in_reset(struct amdgpu_device *adev)
+{
+   return atomic_read(&adev->reset_domain->in_gpu_reset);
+}
+
 /**
  * amdgpu_device_halt() - bring hardware to some kind of halt state
  *
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
index c0988c804459..5ab72c3bfbda 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
@@ -131,6 +131,7 @@ struct amdgpu_reset_domain 
*amdgpu_reset_create_reset_domain(enum amdgpu_reset_d
 
}
 
+   atomic_set(_domain->in_gpu_reset, 0);
init_rwsem(_domain->sem);
 
return reset_domain;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
index 80f918e87d4f..ea6fc98ea927 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
@@ -81,6 +81,7 @@ struct amdgpu_reset_domain {
struct workqueue_struct *wq;
enum amdgpu_reset_domain_type type;
struct rw_semaphore sem;
+   atomic_t in_gpu_reset;
 };
 
 
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c 
b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
index 4e23c29e665c..b81acf59870c 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
@@ -259,7 +259,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct 
*work)
 * otherwise the mailbox msg will be ruined/reseted by
 * the VF FLR.
 */
-   if (atomic_cmpxchg(>in_gpu_reset, 0, 1) != 0)
+   if (atomic_cmpxchg(>reset_domain->in_gpu_reset, 0, 1) != 0)
return;
 
down_write(>reset_domain->sem);
@@ -277,7 +277,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct 
*work)
} while (timeout > 1);
 
 flr_done:
-   atomic_set(>in_gpu_reset, 0);
+   atomic_set(>reset_domain->in_gpu_reset, 0);
up_write(>reset_domain->sem);
 
/* Trigger recovery for world switch failure if no TDR */
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c 
b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
index f715780f7d20..22c10b97ea81 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
@@ -283,7 +283,7 @@ static void xgpu_nv_mailbox_flr_work(struct 

[RFC v4 05/11] drm/amdgpu: Drop hive->in_reset

2022-02-08 Thread Andrey Grodzovsky
Since we serialize all resets, there is no need to protect against
concurrent resets.

Signed-off-by: Andrey Grodzovsky 
Reviewed-by: Christian König 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 19 +--
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   |  1 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h   |  1 -
 3 files changed, 1 insertion(+), 20 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 15e8fde3ac2d..7e92f2432087 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5067,26 +5067,10 @@ int amdgpu_device_gpu_recover_imp(struct amdgpu_device 
*adev,
dev_info(adev->dev, "GPU %s begin!\n",
need_emergency_restart ? "jobs stop":"reset");
 
-   /*
-* Here we trylock to avoid chain of resets executing from
-* either trigger by jobs on different adevs in XGMI hive or jobs on
-* different schedulers for same device while this TO handler is 
running.
-* We always reset all schedulers for device and all devices for XGMI
-* hive so that should take care of them too.
-*/
if (!amdgpu_sriov_vf(adev))
hive = amdgpu_get_xgmi_hive(adev);
-   if (hive) {
-   if (atomic_cmpxchg(>in_reset, 0, 1) != 0) {
-   DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as 
another already in progress",
-   job ? job->base.id : -1, hive->hive_id);
-   amdgpu_put_xgmi_hive(hive);
-   if (job && job->vm)
-   drm_sched_increase_karma(>base);
-   return 0;
-   }
+   if (hive)
mutex_lock(>hive_lock);
-   }
 
reset_context.method = AMD_RESET_METHOD_NONE;
reset_context.reset_req_dev = adev;
@@ -5282,7 +5266,6 @@ int amdgpu_device_gpu_recover_imp(struct amdgpu_device 
*adev,
 
 skip_recovery:
if (hive) {
-   atomic_set(>in_reset, 0);
mutex_unlock(>hive_lock);
amdgpu_put_xgmi_hive(hive);
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index d406897346d6..89b682afe821 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -410,7 +410,6 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct 
amdgpu_device *adev)
INIT_LIST_HEAD(>device_list);
INIT_LIST_HEAD(>node);
mutex_init(>hive_lock);
-   atomic_set(>in_reset, 0);
atomic_set(>number_devices, 0);
task_barrier_init(>tb);
hive->pstate = AMDGPU_XGMI_PSTATE_UNKNOWN;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
index 6121aaa292cb..2f2ce53645a5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
@@ -33,7 +33,6 @@ struct amdgpu_hive_info {
struct list_head node;
atomic_t number_devices;
struct mutex hive_lock;
-   atomic_t in_reset;
int hi_req_count;
struct amdgpu_device *hi_req_gpu;
struct task_barrier tb;
-- 
2.25.1



[RFC v4 04/11] drm/amd/virt: For SRIOV send GPU reset directly to TDR queue.

2022-02-08 Thread Andrey Grodzovsky
No need to trigger another work queue inside the work queue.

v3:

Problem:
Extra reset caused by host side FLR notification
following guest side triggered reset.
Fix: Prevent queuing flr_work from the mailbox IRQ if the guest
is already executing a reset.
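
The WARN_ONCE in the hunks below leans on a property worth spelling
out: queue_work() returns false when the work item is already pending,
so a second FLR notification arriving while one is queued is dropped
rather than double-queued. A minimal sketch of that guard (simplified,
illustrative only):

#include <linux/kernel.h>
#include <linux/workqueue.h>

static void queue_flr_sketch(struct workqueue_struct *wq,
                             struct work_struct *flr_work, bool in_reset)
{
        /* Don't queue at all while a reset is already executing ... */
        if (in_reset)
                return;

        /* ... and warn once if the work was already pending. */
        WARN_ONCE(!queue_work(wq, flr_work),
                  "Failed to queue work! at %s", __func__);
}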

Suggested-by: Liu Shaoyun 
Signed-off-by: Andrey Grodzovsky 
---
 drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 9 ++---
 drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 9 ++---
 drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 9 ++---
 3 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c 
b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
index 56da5ab82987..5869d51d8bee 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
@@ -282,7 +282,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct 
*work)
if (amdgpu_device_should_recover_gpu(adev)
&& (!amdgpu_device_has_job_running(adev) ||
adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT))
-   amdgpu_device_gpu_recover(adev, NULL);
+   amdgpu_device_gpu_recover_imp(adev, NULL);
 }
 
 static int xgpu_ai_set_mailbox_rcv_irq(struct amdgpu_device *adev,
@@ -307,8 +307,11 @@ static int xgpu_ai_mailbox_rcv_irq(struct amdgpu_device 
*adev,
 
switch (event) {
case IDH_FLR_NOTIFICATION:
-   if (amdgpu_sriov_runtime(adev))
-   schedule_work(>virt.flr_work);
+   if (amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev))
+   WARN_ONCE(!queue_work(adev->reset_domain.wq,
+ >virt.flr_work),
+ "Failed to queue work! at %s",
+ __func__);
break;
case IDH_QUERY_ALIVE:
xgpu_ai_mailbox_send_ack(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c 
b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
index 477d0dde19c5..5728a6401d73 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
@@ -309,7 +309,7 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct 
*work)
adev->gfx_timeout == MAX_SCHEDULE_TIMEOUT ||
adev->compute_timeout == MAX_SCHEDULE_TIMEOUT ||
adev->video_timeout == MAX_SCHEDULE_TIMEOUT))
-   amdgpu_device_gpu_recover(adev, NULL);
+   amdgpu_device_gpu_recover_imp(adev, NULL);
 }
 
 static int xgpu_nv_set_mailbox_rcv_irq(struct amdgpu_device *adev,
@@ -337,8 +337,11 @@ static int xgpu_nv_mailbox_rcv_irq(struct amdgpu_device 
*adev,
 
switch (event) {
case IDH_FLR_NOTIFICATION:
-   if (amdgpu_sriov_runtime(adev))
-   schedule_work(>virt.flr_work);
+   if (amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev))
+   WARN_ONCE(!queue_work(adev->reset_domain.wq,
+ >virt.flr_work),
+ "Failed to queue work! at %s",
+ __func__);
break;
/* READY_TO_ACCESS_GPU is fetched by kernel polling, IRQ can 
ignore
 * it byfar since that polling thread will handle it,
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c 
b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
index aef9d059ae52..02290febfcf4 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
@@ -521,7 +521,7 @@ static void xgpu_vi_mailbox_flr_work(struct work_struct 
*work)
 
/* Trigger recovery due to world switch failure */
if (amdgpu_device_should_recover_gpu(adev))
-   amdgpu_device_gpu_recover(adev, NULL);
+   amdgpu_device_gpu_recover_imp(adev, NULL);
 }
 
 static int xgpu_vi_set_mailbox_rcv_irq(struct amdgpu_device *adev,
@@ -550,8 +550,11 @@ static int xgpu_vi_mailbox_rcv_irq(struct amdgpu_device 
*adev,
r = xgpu_vi_mailbox_rcv_msg(adev, IDH_FLR_NOTIFICATION);
 
/* only handle FLR_NOTIFY now */
-   if (!r)
-   schedule_work(>virt.flr_work);
+   if (!r && !amdgpu_in_reset(adev))
+   WARN_ONCE(!queue_work(adev->reset_domain.wq,
+ >virt.flr_work),
+ "Failed to queue work! at %s",
+ __func__);
}
 
return 0;
-- 
2.25.1



[RFC v4 03/11] drm/amdgpu: Serialize non TDR gpu recovery with TDRs

2022-02-08 Thread Andrey Grodzovsky
Use the reset domain wq also for non-TDR GPU recovery triggers
such as sysfs and RAS. We must serialize all possible
GPU recoveries to guarantee no concurrency there.
For TDR, call the original recovery function directly since
it's already executed from within the wq. For others, just
use a wrapper to queue the work and wait for it to finish.

v2: Rename to amdgpu_recover_work_struct
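
The serialization guarantee comes from the reset domain's ordered
workqueue: an ordered wq runs at most one work item at a time, in
queueing order. A self-contained illustration of that property (a
demo module, not part of the patch):

#include <linux/module.h>
#include <linux/workqueue.h>

static struct workqueue_struct *reset_wq;
static struct work_struct reset_a, reset_b;

static void do_reset(struct work_struct *work)
{
        /* On an ordered wq, reset_a fully finishes before reset_b
         * starts, regardless of which CPUs queued them. */
        pr_info("recovery %s\n", work == &reset_a ? "A" : "B");
}

static int __init demo_init(void)
{
        reset_wq = alloc_ordered_workqueue("reset-demo", 0);
        if (!reset_wq)
                return -ENOMEM;

        INIT_WORK(&reset_a, do_reset);
        INIT_WORK(&reset_b, do_reset);
        queue_work(reset_wq, &reset_a);
        queue_work(reset_wq, &reset_b);
        flush_workqueue(reset_wq);
        destroy_workqueue(reset_wq);
        return 0;
}
module_init(demo_init);
MODULE_LICENSE("GPL");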

Signed-off-by: Andrey Grodzovsky 
Reviewed-by: Christian König 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h|  2 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 33 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c|  2 +-
 3 files changed, 35 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index b76c1cfad7f1..540a38fe5cd6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1298,6 +1298,8 @@ bool amdgpu_device_has_job_running(struct amdgpu_device 
*adev);
 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev);
 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
  struct amdgpu_job* job);
+int amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev,
+ struct amdgpu_job *job);
 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev);
 int amdgpu_device_pci_reset(struct amdgpu_device *adev);
 bool amdgpu_device_need_post(struct amdgpu_device *adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 00123b0013d3..15e8fde3ac2d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5033,7 +5033,7 @@ static void amdgpu_device_recheck_guilty_jobs(
  * Returns 0 for success or an error on failure.
  */
 
-int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
+int amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev,
  struct amdgpu_job *job)
 {
struct list_head device_list, *device_list_handle =  NULL;
@@ -5292,6 +5292,37 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
return r;
 }
 
+struct amdgpu_recover_work_struct {
+   struct work_struct base;
+   struct amdgpu_device *adev;
+   struct amdgpu_job *job;
+   int ret;
+};
+
+static void amdgpu_device_queue_gpu_recover_work(struct work_struct *work)
+{
+   struct amdgpu_recover_work_struct *recover_work = container_of(work, 
struct amdgpu_recover_work_struct, base);
+
+   recover_work->ret = amdgpu_device_gpu_recover_imp(recover_work->adev, 
recover_work->job);
+}
+/*
+ * Serialize gpu recover into reset domain single threaded wq
+ */
+int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
+   struct amdgpu_job *job)
+{
+   struct amdgpu_recover_work_struct work = {.adev = adev, .job = job};
+
+   INIT_WORK(, amdgpu_device_queue_gpu_recover_work);
+
+   if (!queue_work(adev->reset_domain.wq, ))
+   return -EAGAIN;
+
+   flush_work();
+
+   return work.ret;
+}
+
 /**
  * amdgpu_device_get_pcie_info - fence pcie info about the PCIE slot
  *
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index bfc47bea23db..38c9fd7b7ad4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -63,7 +63,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct 
drm_sched_job *s_job)
  ti.process_name, ti.tgid, ti.task_name, ti.pid);
 
if (amdgpu_device_should_recover_gpu(ring->adev)) {
-   amdgpu_device_gpu_recover(ring->adev, job);
+   amdgpu_device_gpu_recover_imp(ring->adev, job);
} else {
drm_sched_suspend_timeout(>sched);
if (amdgpu_sriov_vf(adev))
-- 
2.25.1



[RFC v4 02/11] drm/amdgpu: Move scheduler init to after XGMI is ready

2022-02-08 Thread Andrey Grodzovsky
Before we initialize the schedulers we must know which reset
domain we are in: for a single device there is a single
domain per device, and so a single wq per device. For XGMI
the reset domain spans the entire XGMI hive, and so the
reset wq is per hive.
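
In other words, the dependency is: the reset domain (and its wq) must
exist before drm_sched_init() is called, and for XGMI the hive must be
joined first so the scheduler picks up the hive-wide wq. A rough
outline of the resulting init flow (a sketch of this series, not
literal code):

        amdgpu_device_ip_init()
            -> amdgpu_xgmi_add_device()        /* may switch to the hive wq */
            -> amdgpu_device_init_schedulers() /* drm_sched_init(..., wq, ...) */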

Signed-off-by: Andrey Grodzovsky 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 45 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c  | 34 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h   |  2 +
 3 files changed, 51 insertions(+), 30 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 9704b0e1fd82..00123b0013d3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2287,6 +2287,47 @@ static int amdgpu_device_fw_loading(struct amdgpu_device 
*adev)
return r;
 }
 
+static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
+{
+   long timeout;
+   int r, i;
+
+   for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
+   struct amdgpu_ring *ring = adev->rings[i];
+
+   /* No need to setup the GPU scheduler for rings that don't need 
it */
+   if (!ring || ring->no_scheduler)
+   continue;
+
+   switch (ring->funcs->type) {
+   case AMDGPU_RING_TYPE_GFX:
+   timeout = adev->gfx_timeout;
+   break;
+   case AMDGPU_RING_TYPE_COMPUTE:
+   timeout = adev->compute_timeout;
+   break;
+   case AMDGPU_RING_TYPE_SDMA:
+   timeout = adev->sdma_timeout;
+   break;
+   default:
+   timeout = adev->video_timeout;
+   break;
+   }
+
+   r = drm_sched_init(>sched, _sched_ops,
+  ring->num_hw_submission, 
amdgpu_job_hang_limit,
+  timeout, adev->reset_domain.wq, 
ring->sched_score, ring->name);
+   if (r) {
+   DRM_ERROR("Failed to create scheduler on ring %s.\n",
+ ring->name);
+   return r;
+   }
+   }
+
+   return 0;
+}
+
+
 /**
  * amdgpu_device_ip_init - run init for hardware IPs
  *
@@ -2419,6 +2460,10 @@ static int amdgpu_device_ip_init(struct amdgpu_device 
*adev)
}
}
 
+   r = amdgpu_device_init_schedulers(adev);
+   if (r)
+   goto init_failed;
+
/* Don't init kfd if whole hive need to be reset during init */
if (!adev->gmc.xgmi.pending_reset)
amdgpu_amdkfd_device_init(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index 45977a72b5dd..fa302540c69a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -457,8 +457,6 @@ int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring,
  atomic_t *sched_score)
 {
struct amdgpu_device *adev = ring->adev;
-   long timeout;
-   int r;
 
if (!adev)
return -EINVAL;
@@ -478,36 +476,12 @@ int amdgpu_fence_driver_init_ring(struct amdgpu_ring 
*ring,
	spin_lock_init(&ring->fence_drv.lock);
ring->fence_drv.fences = kcalloc(num_hw_submission * 2, sizeof(void *),
 GFP_KERNEL);
-   if (!ring->fence_drv.fences)
-   return -ENOMEM;
 
-   /* No need to setup the GPU scheduler for rings that don't need it */
-   if (ring->no_scheduler)
-   return 0;
+   ring->num_hw_submission = num_hw_submission;
+   ring->sched_score = sched_score;
 
-   switch (ring->funcs->type) {
-   case AMDGPU_RING_TYPE_GFX:
-   timeout = adev->gfx_timeout;
-   break;
-   case AMDGPU_RING_TYPE_COMPUTE:
-   timeout = adev->compute_timeout;
-   break;
-   case AMDGPU_RING_TYPE_SDMA:
-   timeout = adev->sdma_timeout;
-   break;
-   default:
-   timeout = adev->video_timeout;
-   break;
-   }
-
-   r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
-  num_hw_submission, amdgpu_job_hang_limit,
-  timeout, NULL, sched_score, ring->name);
-   if (r) {
-   DRM_ERROR("Failed to create scheduler on ring %s.\n",
- ring->name);
-   return r;
-   }
+   if (!ring->fence_drv.fences)
+   return -ENOMEM;
 
return 0;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index fae7d185ad0d..7f20ce73a243 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -251,6 +251,8 @@ struct 

[RFC v4 01/11] drm/amdgpu: Introduce reset domain

2022-02-08 Thread Andrey Grodzovsky
Define a reset_domain struct such that
all the entities that go through reset
together will be serialized one against
another. Do it for both the single device and
XGMI hive cases.

Signed-off-by: Andrey Grodzovsky 
Suggested-by: Daniel Vetter 
Suggested-by: Christian König 
Reviewed-by: Christian König 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h|  5 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 20 +++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   |  9 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h   |  2 ++
 4 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index d8b854fcbffa..b76c1cfad7f1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -813,6 +813,10 @@ struct amd_powerplay {
 #define AMDGPU_RESET_MAGIC_NUM 64
 #define AMDGPU_MAX_DF_PERFMONS 4
 #define AMDGPU_PRODUCT_NAME_LEN 64
+struct amdgpu_reset_domain {
+   struct workqueue_struct *wq;
+};
+
 struct amdgpu_device {
struct device   *dev;
struct pci_dev  *pdev;
@@ -1100,6 +1104,7 @@ struct amdgpu_device {
uint32_t
ip_versions[MAX_HWIP][HWIP_MAX_INSTANCE];
 
boolram_is_direct_mapped;
+   struct amdgpu_reset_domain  reset_domain;
 };
 
 static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index ed077de426d9..9704b0e1fd82 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2398,9 +2398,27 @@ static int amdgpu_device_ip_init(struct amdgpu_device 
*adev)
if (r)
goto init_failed;
 
-   if (adev->gmc.xgmi.num_physical_nodes > 1)
+   if (adev->gmc.xgmi.num_physical_nodes > 1) {
+   struct amdgpu_hive_info *hive;
+
amdgpu_xgmi_add_device(adev);
 
+   hive = amdgpu_get_xgmi_hive(adev);
+   if (!hive || !hive->reset_domain.wq) {
+   DRM_ERROR("Failed to obtain reset domain info for XGMI hive:%llx", hive ? hive->hive_id : 0);
+   r = -EINVAL;
+   goto init_failed;
+   }
+
+   adev->reset_domain.wq = hive->reset_domain.wq;
+   } else {
+   adev->reset_domain.wq = alloc_ordered_workqueue("amdgpu-reset-dev", 0);
+   if (!adev->reset_domain.wq) {
+   r = -ENOMEM;
+   goto init_failed;
+   }
+   }
+
/* Don't init kfd if whole hive need to be reset during init */
if (!adev->gmc.xgmi.pending_reset)
amdgpu_amdkfd_device_init(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index e8b8f28c2f72..d406897346d6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -398,6 +398,14 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct 
amdgpu_device *adev)
goto pro_end;
}
 
+   hive->reset_domain.wq = alloc_ordered_workqueue("amdgpu-reset-hive", 0);
+   if (!hive->reset_domain.wq) {
+   dev_err(adev->dev, "XGMI: failed allocating wq for reset domain!\n");
+   kfree(hive);
+   hive = NULL;
+   goto pro_end;
+   }
+
hive->hive_id = adev->gmc.xgmi.hive_id;
INIT_LIST_HEAD(>device_list);
INIT_LIST_HEAD(>node);
@@ -407,6 +415,7 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct 
amdgpu_device *adev)
task_barrier_init(>tb);
hive->pstate = AMDGPU_XGMI_PSTATE_UNKNOWN;
hive->hi_req_gpu = NULL;
+
/*
 * hive pstate on boot is high in vega20 so we have to go to low
 * pstate on after boot.
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
index d2189bf7d428..6121aaa292cb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
@@ -42,6 +42,8 @@ struct amdgpu_hive_info {
AMDGPU_XGMI_PSTATE_MAX_VEGA20,
AMDGPU_XGMI_PSTATE_UNKNOWN
} pstate;
+
+   struct amdgpu_reset_domain reset_domain;
 };
 
 struct amdgpu_pcs_ras_field {
-- 
2.25.1



[RFC v4 00/11] Define and use reset domain for GPU recovery in amdgpu

2022-02-08 Thread Andrey Grodzovsky
This patchset is based on earlier work by Boris[1] that allowed having an
ordered workqueue at the driver level that will be used by the different
schedulers to queue their timeout work. On top of that I also serialized
any GPU reset we trigger from within amdgpu code to go through the same
ordered wq and in this way simplify somewhat our GPU reset code so we don't need
to protect from concurrency by multiple GPU reset triggers such as TDR on one
hand and sysfs trigger or RAS trigger on the other hand.

As advised by Christian and Daniel I defined a reset_domain struct such that
all the entities that go through reset together will be serialized one against
another. 

A TDR triggered by multiple entities within the same domain for the same
reason will only run once, as the first such reset will cancel all the pending
resets. This is relevant only to TDR timers and not to triggered resets coming
from RAS or sysfs; those will still happen after the in-flight reset finishes.
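To make the serialization property concrete: alloc_ordered_workqueue() returns
a workqueue that executes at most one item at a time, strictly in queueing
order, so two resets queued concurrently can never run at the same time. An
illustrative sketch (names assumed, not patchset code):

struct work_struct tdr_work, ras_work;	/* both INIT_WORK()ed elsewhere */
struct workqueue_struct *wq;

wq = alloc_ordered_workqueue("reset-domain", 0);
/* Producers may race to queue their resets ... */
queue_work(wq, &tdr_work);	/* TDR from a scheduler     */
queue_work(wq, &ras_work);	/* reset triggered from RAS */
/* ... but the ordered wq runs tdr_work to completion before ras_work. */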

v2:
Add handling for the SRIOV configuration: the reset notification coming from the
host and the driver already trigger a work queue to handle the reset, so drop this
intermediate wq and send the work directly to the timeout wq. (Shaoyun)

v3:
Lijo suggested putting 'adev->in_gpu_reset' in the amdgpu_reset_domain struct.
I followed his advice and also moved adev->reset_sem into the same place. This
in turn required some follow-up refactoring of the original patches, where I
decoupled the amdgpu_reset_domain life cycle from the XGMI hive, because the
hive is destroyed and reconstructed for the case of resetting the devices in
the XGMI hive during probe for SRIOV (see [2]), while we need the reset sem and
gpu_reset flag to always be present. This was attained by adding a refcount to
amdgpu_reset_domain so each device can safely point to it as long as it needs
it.
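The refcounting follows the usual kref pattern; a sketch with illustrative
names (not the exact patch code):

struct amdgpu_reset_domain {
	struct kref refcount;		/* kref_init()ed on creation */
	struct workqueue_struct *wq;
};

static void reset_domain_release(struct kref *ref)
{
	struct amdgpu_reset_domain *domain =
		container_of(ref, struct amdgpu_reset_domain, refcount);

	destroy_workqueue(domain->wq);
	kfree(domain);
}

Each device does kref_get(&domain->refcount) when it starts pointing at the
domain and kref_put(&domain->refcount, reset_domain_release) on teardown, so
the domain survives the hive object being destroyed and recreated.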

v4:
Just bug fixing of the reset_domain refcount on XGMI hive boot and some
cosmetic wrappers for the reset domain refcount.


[1] 
https://patchwork.kernel.org/project/dri-devel/patch/20210629073510.2764391-3-boris.brezil...@collabora.com/
[2] https://www.spinics.net/lists/amd-gfx/msg58836.html

P.S. Going through drm-misc-next and not amd-staging-drm-next as Boris' work
hasn't landed there yet.

P.P.S. Patches 8-12 are the refactor on top of the original V2 patchset.

Andrey Grodzovsky (11):
  drm/amdgpu: Introduce reset domain
  drm/amdgpu: Move scheduler init to after XGMI is ready
  drm/amdgpu: Serialize non TDR gpu recovery with TDRs
  drm/amd/virt: For SRIOV send GPU reset directly to TDR queue.
  drm/amdgpu: Drop hive->in_reset
  drm/amdgpu: Drop concurrent GPU reset protection for device
  drm/amdgpu: Rework reset domain to be refcounted.
  drm/amdgpu: Move reset sem into reset_domain
  drm/amdgpu: Move in_gpu_reset into reset_domain
  drm/amdgpu: Rework amdgpu_device_lock_adev
  Revert 'drm/amdgpu: annotate a false positive recursive locking'

 drivers/gpu/drm/amd/amdgpu/amdgpu.h   |  13 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c   |  10 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c| 275 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c |  34 +--
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c   |   2 +-
 .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c|  18 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c |  43 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h |  37 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h  |   2 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c  |  27 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h  |   3 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c|   6 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c |  14 +-
 drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c |  19 +-
 drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c |  19 +-
 drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c |  11 +-
 16 files changed, 335 insertions(+), 198 deletions(-)

-- 
2.25.1



Re: [PATCH 6/8] mm: don't include <linux/memremap.h> in <linux/mm.h>

2022-02-08 Thread Dan Williams
On Mon, Feb 7, 2022 at 3:49 PM Dan Williams  wrote:
>
> On Sun, Feb 6, 2022 at 10:33 PM Christoph Hellwig  wrote:
> >
> > Move the check for the actual pgmap types that need the free at refcount
> > one behavior into the out of line helper, and thus avoid the need to
> > pull memremap.h into mm.h.
>
> Looks good to me assuming the compile bots agree.
>
> Reviewed-by: Dan Williams 

Yeah, same as Logan:

mm/memcontrol.c: In function ‘get_mctgt_type’:
mm/memcontrol.c:5724:29: error: implicit declaration of function
‘is_device_private_page’; did you mean
‘is_device_private_entry’? [-Werror=implicit-function-declaration]
 5724 | if (is_device_private_page(page))
  | ^~
  | is_device_private_entry

...needs:

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d1e97a54ae53..0ac7515c85f9 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -62,6 +62,7 @@
 #include 
 #include 
 #include 
+#include <linux/memremap.h>
 #include "internal.h"
 #include 
 #include 




Re: [PATCH V3 4/7] drm/amd/pm: correct the usage for 'supported' member of smu_feature structure

2022-02-08 Thread Nathan Chancellor
Hi Evan,

On Fri, Jan 28, 2022 at 03:04:52PM +0800, Evan Quan wrote:
> The supported features should be retrieved just after the EnableAllDpmFeatures
> message completes. And the check (whether some dpm feature is supported) is
> only needed when we decide to enable or disable it.
> 
> Signed-off-by: Evan Quan 
> Change-Id: I07c9a5ac5290cd0d88a40ce1768d393156419b5a
> ---
>  drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 11 +++
>  drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c   |  8 
>  .../gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c   | 10 +-
>  drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c|  3 ---
>  drivers/gpu/drm/amd/pm/swsmu/smu11/vangogh_ppt.c  |  5 +
>  drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c|  3 ---
>  drivers/gpu/drm/amd/pm/swsmu/smu13/yellow_carp_ppt.c  |  3 ---
>  7 files changed, 21 insertions(+), 22 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c 
> b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> index ae48cc5aa567..803068cb5079 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> @@ -1057,8 +1057,10 @@ static int smu_get_thermal_temperature_range(struct 
> smu_context *smu)
>  
>  static int smu_smc_hw_setup(struct smu_context *smu)
>  {
> + struct smu_feature *feature = &smu->smu_feature;
>   struct amdgpu_device *adev = smu->adev;
>   uint32_t pcie_gen = 0, pcie_width = 0;
> + uint64_t features_supported;
>   int ret = 0;
>  
>   if (adev->in_suspend && smu_is_dpm_running(smu)) {
> @@ -1138,6 +1140,15 @@ static int smu_smc_hw_setup(struct smu_context *smu)
>   return ret;
>   }
>  
> + ret = smu_feature_get_enabled_mask(smu, &features_supported);
> + if (ret) {
> + dev_err(adev->dev, "Failed to retrieve supported dpm 
> features!\n");
> + return ret;
> + }
> + bitmap_copy(feature->supported,
> + (unsigned long *)&features_supported,
> + feature->feature_num);
> +
>   if (!smu_is_dpm_running(smu))
>   dev_info(adev->dev, "dpm has been disabled\n");
>  
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c 
> b/drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c
> index 84cbde3f913d..f55ead5f9aba 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c
> @@ -1624,8 +1624,8 @@ static int navi10_display_config_changed(struct 
> smu_context *smu)
>   int ret = 0;
>  
>   if ((smu->watermarks_bitmap & WATERMARKS_EXIST) &&
> - smu_cmn_feature_is_supported(smu, SMU_FEATURE_DPM_DCEFCLK_BIT) &&
> - smu_cmn_feature_is_supported(smu, SMU_FEATURE_DPM_SOCCLK_BIT)) {
> + smu_cmn_feature_is_enabled(smu, SMU_FEATURE_DPM_DCEFCLK_BIT) &&
> + smu_cmn_feature_is_enabled(smu, SMU_FEATURE_DPM_SOCCLK_BIT)) {
>   ret = smu_cmn_send_smc_msg_with_param(smu, 
> SMU_MSG_NumOfDisplays,
> 
> smu->display_config->num_display,
> NULL);
> @@ -1860,13 +1860,13 @@ static int navi10_notify_smc_display_config(struct 
> smu_context *smu)
>   min_clocks.dcef_clock_in_sr = 
> smu->display_config->min_dcef_deep_sleep_set_clk;
>   min_clocks.memory_clock = smu->display_config->min_mem_set_clock;
>  
> - if (smu_cmn_feature_is_supported(smu, SMU_FEATURE_DPM_DCEFCLK_BIT)) {
> + if (smu_cmn_feature_is_enabled(smu, SMU_FEATURE_DPM_DCEFCLK_BIT)) {
>   clock_req.clock_type = amd_pp_dcef_clock;
>   clock_req.clock_freq_in_khz = min_clocks.dcef_clock * 10;
>  
>   ret = smu_v11_0_display_clock_voltage_request(smu, &clock_req);
>   if (!ret) {
> - if (smu_cmn_feature_is_supported(smu, 
> SMU_FEATURE_DS_DCEFCLK_BIT)) {
> + if (smu_cmn_feature_is_enabled(smu, 
> SMU_FEATURE_DS_DCEFCLK_BIT)) {
>   ret = smu_cmn_send_smc_msg_with_param(smu,
> 
> SMU_MSG_SetMinDeepSleepDcefclk,
> 
> min_clocks.dcef_clock_in_sr/100,
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c 
> b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c
> index b6759f8b5167..804e1c98238d 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c
> @@ -1280,8 +1280,8 @@ static int sienna_cichlid_display_config_changed(struct 
> smu_context *smu)
>   int ret = 0;
>  
>   if ((smu->watermarks_bitmap & WATERMARKS_EXIST) &&
> - smu_cmn_feature_is_supported(smu, SMU_FEATURE_DPM_DCEFCLK_BIT) &&
> - smu_cmn_feature_is_supported(smu, SMU_FEATURE_DPM_SOCCLK_BIT)) {
> + smu_cmn_feature_is_enabled(smu, SMU_FEATURE_DPM_DCEFCLK_BIT) &&
> + 

Re: [PATCH v4 2/2] drm/radeon/uvd: Fix forgotten unmap buffer objects

2022-02-08 Thread Alex Deucher
Applied the series.  Thanks!

Alex

On Tue, Feb 8, 2022 at 3:33 AM Christian König  wrote:
>
> I think so, Alex will probably pick that up.
>
> Thanks,
> Christian.
>
> On 08.02.22 at 09:28, zhanglianjie wrote:
> > I am very sorry that I submitted many times due to the character
> > coding problem. Can PATCH V4 be used?
> >
> >> I'm scratching my head what you are doing here?
> >>
> >> That's the fifth time you've sent out the same patch, so something is
> >> going wrong here :)
> >>
> >> Please double check why that lands in your outbox over and over again.
> >>
> >> Regards,
> >> Christian.
> >>
> >> On 08.02.22 at 09:14, zhanglianjie wrote:
> >>> after the buffer object is successfully mapped, call
> >>> radeon_bo_kunmap before the function returns.
> >>>
> >>> Signed-off-by: zhanglianjie 
> >>> Reviewed-by: Christian König 
> >>>
> >>> diff --git a/drivers/gpu/drm/radeon/radeon_uvd.c
> >>> b/drivers/gpu/drm/radeon/radeon_uvd.c
> >>> index 377f9cdb5b53..0558d928d98d 100644
> >>> --- a/drivers/gpu/drm/radeon/radeon_uvd.c
> >>> +++ b/drivers/gpu/drm/radeon/radeon_uvd.c
> >>> @@ -497,6 +497,7 @@ static int radeon_uvd_cs_msg(struct
> >>> radeon_cs_parser *p, struct radeon_bo *bo,
> >>>   handle = msg[2];
> >>>
> >>>   if (handle == 0) {
> >>> +radeon_bo_kunmap(bo);
> >>>   DRM_ERROR("Invalid UVD handle!\n");
> >>>   return -EINVAL;
> >>>   }
> >>> @@ -559,12 +560,10 @@ static int radeon_uvd_cs_msg(struct
> >>> radeon_cs_parser *p, struct radeon_bo *bo,
> >>>   return 0;
> >>>
> >>>   default:
> >>> -
> >>>   DRM_ERROR("Illegal UVD message type (%d)!\n", msg_type);
> >>> -return -EINVAL;
> >>>   }
> >>>
> >>> -BUG();
> >>> +radeon_bo_kunmap(bo);
> >>>   return -EINVAL;
> >>>   }
> >>>
> >>> --
> >>> 2.20.1
> >>>
> >>>
> >>>
> >>
> >>
> >
> >
> >
>


Re: [RFC v4] drm/amdgpu: Rework reset domain to be refcounted.

2022-02-08 Thread Andrey Grodzovsky



On 2022-02-08 06:25, Lazar, Lijo wrote:



On 2/2/2022 10:56 PM, Andrey Grodzovsky wrote:

The reset domain now contains the register access semaphore
and so needs to be present as long as each device
in a hive needs it, and so it cannot be bound to the XGMI
hive life cycle.
Address this by making the reset domain refcounted and pointed
to by each member of the hive and the hive itself.

v4:
Fix crash on boot with XGMI hive by adding a type to reset_domain.
XGMI will only create a new reset_domain if the previous one was of single
device type, meaning it's the first boot. Otherwise it will take a
refcount to the existing reset_domain from the amdgpu device.

Signed-off-by: Andrey Grodzovsky 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu.h    |  6 +--
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 44 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c  | 38 +++
  drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h  | 18 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   | 29 +++---
  drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h   |  2 +-
  drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c  |  4 +-
  drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c  |  4 +-
  drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c  |  4 +-
  9 files changed, 118 insertions(+), 31 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h

index 8e96b9a14452..f2ba460bfd59 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -813,9 +813,7 @@ struct amd_powerplay {
  #define AMDGPU_RESET_MAGIC_NUM 64
  #define AMDGPU_MAX_DF_PERFMONS 4
  -struct amdgpu_reset_domain {
-    struct workqueue_struct *wq;
-};
+struct amdgpu_reset_domain;
    struct amdgpu_device {
  struct device    *dev;
@@ -1102,7 +1100,7 @@ struct amdgpu_device {
  struct amdgpu_reset_control *reset_cntl;
  uint32_t ip_versions[HW_ID_MAX][HWIP_MAX_INSTANCE];
  -    struct amdgpu_reset_domain    reset_domain;
+    struct amdgpu_reset_domain    *reset_domain;
  };
    static inline struct amdgpu_device *drm_to_adev(struct drm_device 
*ddev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

index fef952ca8db5..cd1b7af69c35 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2313,7 +2313,7 @@ static int amdgpu_device_init_schedulers(struct 
amdgpu_device *adev)

    r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
 ring->num_hw_submission, amdgpu_job_hang_limit,
-   timeout, adev->reset_domain.wq, ring->sched_score, ring->name);
+   timeout, adev->reset_domain->wq, ring->sched_score, ring->name);

  if (r) {
  DRM_ERROR("Failed to create scheduler on ring %s.\n",
    ring->name);
@@ -2432,24 +2432,22 @@ static int amdgpu_device_ip_init(struct 
amdgpu_device *adev)

  if (r)
  goto init_failed;
+    /**
+ * In case of XGMI grab extra reference for reset domain for this device
+ */
  if (adev->gmc.xgmi.num_physical_nodes > 1) {
-    struct amdgpu_hive_info *hive;
-
-    amdgpu_xgmi_add_device(adev);
+    if (amdgpu_xgmi_add_device(adev) == 0) {
+    struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
  -    hive = amdgpu_get_xgmi_hive(adev);
-    if (!hive || !hive->reset_domain.wq) {
-    DRM_ERROR("Failed to obtain reset domain info for XGMI 
hive:%llx", hive->hive_id);

-    r = -EINVAL;
-    goto init_failed;
-    }
+    if (!hive->reset_domain ||
+ !kref_get_unless_zero(&hive->reset_domain->refcount)) {
+    r = -ENOENT;
+    goto init_failed;
+    }
  -    adev->reset_domain.wq = hive->reset_domain.wq;
-    } else {
-    adev->reset_domain.wq = 
alloc_ordered_workqueue("amdgpu-reset-dev", 0);

-    if (!adev->reset_domain.wq) {
-    r = -ENOMEM;
-    goto init_failed;
+    /* Drop the early temporary reset domain we created for 
device */
+    kref_put(&adev->reset_domain->refcount, amdgpu_reset_destroy_reset_domain);

+    adev->reset_domain = hive->reset_domain;
  }
  }
  @@ -3599,6 +3597,15 @@ int amdgpu_device_init(struct amdgpu_device 
*adev,

  return r;
  }
  +    /*
+ * Reset domain needs to be present early, before XGMI hive discovered
+ * (if any) and initialized to use reset sem and in_gpu reset flag
+ * early on during init.
+ */
+    adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev");

+    if (!adev->reset_domain)
+    return -ENOMEM;
+
  /* early init functions */
  r = amdgpu_device_ip_early_init(adev);
  if (r)
@@ -3949,6 +3956,9 @@ void amdgpu_device_fini_sw(struct amdgpu_device 
*adev)

  if (adev->mman.discovery_bin)
  amdgpu_discovery_fini(adev);
+    kref_put(&adev->reset_domain->refcount, 

Re: [RFC v3 06/12] drm/amdgpu: Drop hive->in_reset

2022-02-08 Thread Andrey Grodzovsky



On 2022-02-08 01:33, Lazar, Lijo wrote:



On 1/26/2022 4:07 AM, Andrey Grodzovsky wrote:

Since we serialize all resets there is no need to protect from
concurrent resets.

Signed-off-by: Andrey Grodzovsky 
Reviewed-by: Christian König 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 19 +--
  drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   |  1 -
  drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h   |  1 -
  3 files changed, 1 insertion(+), 20 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

index 258ec3c0b2af..107a393ebbfd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5013,25 +5013,9 @@ int amdgpu_device_gpu_recover_imp(struct 
amdgpu_device *adev,

  dev_info(adev->dev, "GPU %s begin!\n",
  need_emergency_restart ? "jobs stop":"reset");
  -    /*
- * Here we trylock to avoid chain of resets executing from
- * either trigger by jobs on different adevs in XGMI hive or 
jobs on
- * different schedulers for same device while this TO handler is 
running.
- * We always reset all schedulers for device and all devices for 
XGMI

- * hive so that should take care of them too.
- */
  hive = amdgpu_get_xgmi_hive(adev);
-    if (hive) {
-    if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) {
-    DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as 
another already in progress",

-    job ? job->base.id : -1, hive->hive_id);
-    amdgpu_put_xgmi_hive(hive);
-    if (job && job->vm)
-    drm_sched_increase_karma(&job->base);
-    return 0;
-    }


This function in general will reset all devices in a hive.

In a situation where GPU0 in hive0 gets to this function first and GPU1
in hive0 also hangs shortly after (before GPU0's recovery process starts
resetting other devices in the hive), we don't want to execute the work queued
as part of GPU1's recovery as well. Both GPU0's and GPU1's recovery processes
will try to reset all the devices in the hive.


In short - if a reset domain is already active, probably we don't need 
to queue another work to the domain since all devices in the domain 
are expected to get reset shortly.


Thanks,
Lijo



No worries for this - check this part in drm_sched_stop:
https://elixir.bootlin.com/linux/latest/source/drivers/gpu/drm/scheduler/sched_main.c#L452
This will be called for each scheduler participating in the reset domain
(including the schedulers of each GPU) and will cancel any such pending resets
that we want to avoid executing.
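For reference, the relevant part of drm_sched_stop() is the cancellation of
the scheduler's pending timeout work; a paraphrased sketch (see the link above
for the exact code):

void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
{
	/* ... park the scheduler thread, handle the bad job ... */

	/*
	 * Stop the timer pending in flight, as it is re-armed in
	 * drm_sched_start(); this is what cancels a not-yet-executed
	 * reset queued by another GPU in the same domain.
	 */
	cancel_delayed_work(&sched->work_tdr);
}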

Andrey





+    if (hive)
  mutex_lock(&hive->hive_lock);
-    }
    reset_context.method = AMD_RESET_METHOD_NONE;
  reset_context.reset_req_dev = adev;
@@ -5227,7 +5211,6 @@ int amdgpu_device_gpu_recover_imp(struct 
amdgpu_device *adev,

    skip_recovery:
  if (hive) {
-    atomic_set(&hive->in_reset, 0);
  mutex_unlock(&hive->hive_lock);
  amdgpu_put_xgmi_hive(hive);
  }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c

index a858e3457c5c..9ad742039ac9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -404,7 +404,6 @@ struct amdgpu_hive_info 
*amdgpu_get_xgmi_hive(struct amdgpu_device *adev)

  INIT_LIST_HEAD(&hive->device_list);
  INIT_LIST_HEAD(&hive->node);
  mutex_init(&hive->hive_lock);
-    atomic_set(&hive->in_reset, 0);
  atomic_set(&hive->number_devices, 0);
  task_barrier_init(&hive->tb);
  hive->pstate = AMDGPU_XGMI_PSTATE_UNKNOWN;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h

index 6121aaa292cb..2f2ce53645a5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
@@ -33,7 +33,6 @@ struct amdgpu_hive_info {
  struct list_head node;
  atomic_t number_devices;
  struct mutex hive_lock;
-    atomic_t in_reset;
  int hi_req_count;
  struct amdgpu_device *hi_req_gpu;
  struct task_barrier tb;



Re: [PATCH 2/2] drm/amdgpu: add reset register trace function on GPU reset

2022-02-08 Thread Alex Deucher
On Tue, Feb 8, 2022 at 3:17 AM Somalapuram Amaranath
 wrote:
>
> Dump the list of register values to trace event on GPU reset.
>
> Signed-off-by: Somalapuram Amaranath 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 21 -
>  drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h  | 19 +++
>  2 files changed, 39 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 1e651b959141..057922fb7e37 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -4534,6 +4534,23 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device 
> *adev,
> return r;
>  }
>
> +static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
> +{
> +   int i;
> +   uint32_t reg_value[128];
> +
> +   for (i = 0; adev->reset_dump_reg_list[i] != 0; i++) {
> +   if (adev->asic_type >= CHIP_NAVI10)

This check should be against CHIP_VEGA10.  Also, this only allows for
GC registers.  If we wanted to dump other registers, we'd need a
different macro.  Might be better to just use RREG32 here for
everything and then encode the full offset using
SOC15_REG_ENTRY_OFFSET() or a similar macro.  Also, we need to think
about how to handle gfxoff in this case.  gfxoff needs to be disabled
or we'll hang the chip if we try and read GC or SDMA registers via
MMIO which will adversely affect the hang signature.
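For the gfxoff point, a sketch of the usual approach (assuming the existing
amdgpu_gfx_off_ctrl() helper; illustrative, not a reviewed fix):

amdgpu_gfx_off_ctrl(adev, false);	/* keep GFX powered during the reads */
for (i = 0; adev->reset_dump_reg_list[i] != 0; i++)
	reg_value[i] = RREG32(adev->reset_dump_reg_list[i]);
amdgpu_gfx_off_ctrl(adev, true);	/* allow gfxoff again */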

Alex

> +   reg_value[i] = RREG32_SOC15_IP(GC, adev->reset_dump_reg_list[i]);
> +   else
> +   reg_value[i] = RREG32(adev->reset_dump_reg_list[i]);
> +   }
> +
> +   trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list, reg_value, i);
> +
> +   return 0;
> +}
> +
>  int amdgpu_do_asic_reset(struct list_head *device_list_handle,
>  struct amdgpu_reset_context *reset_context)
>  {
> @@ -4567,8 +4584,10 @@ int amdgpu_do_asic_reset(struct list_head 
> *device_list_handle,
> tmp_adev->gmc.xgmi.pending_reset = false;
> if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
> r = -EALREADY;
> -   } else
> +   } else {
> +   amdgpu_reset_reg_dumps(tmp_adev);
> r = amdgpu_asic_reset(tmp_adev);
> +   }
>
> if (r) {
> dev_err(tmp_adev->dev, "ASIC reset failed 
> with error, %d for drm dev, %s",
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
> index d855cb53c7e0..3fe33de3564a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
> @@ -537,6 +537,25 @@ TRACE_EVENT(amdgpu_ib_pipe_sync,
>   __entry->seqno)
>  );
>
> +TRACE_EVENT(amdgpu_reset_reg_dumps,
> +   TP_PROTO(long *address, uint32_t *value, int length),
> +   TP_ARGS(address, value, length),
> +   TP_STRUCT__entry(
> +__array(long, address, 128)
> +__array(uint32_t, value, 128)
> +__field(int, len)
> +),
> +   TP_fast_assign(
> +  memcpy(__entry->address, address, 128);
> +  memcpy(__entry->value,  value, 128);
> +  __entry->len = length;
> +  ),
> +   TP_printk("amdgpu register dump offset: %s value: %s ",
> + __print_array(__entry->address, __entry->len, 8),
> + __print_array(__entry->value, __entry->len, 8)
> +)
> +);
> +
>  #undef AMDGPU_JOB_GET_TIMELINE_NAME
>  #endif
>
> --
> 2.25.1
>


Re: [PATCH 1/2] drm/amdgpu: add debugfs for reset registers list

2022-02-08 Thread Sharma, Shashank
Based on confirmation from Christian, it seems my understanding of the 
design was not correct, and user must add a list of registers to dump.


That resolves most of my comments automatically, @Amar, please fix a max 
register condition in the loop, to handle the negative testing case and 
the uint32_t stuff. With those two changes in place, feel free to use 
for this patch:


Reviewed-by: Shashank Sharma 

- Shashank

On 2/8/2022 3:31 PM, Sharma, Shashank wrote:

 >> User only update the list of reg offsets on init, there is no
 >> predefined reg offset from kernel code.

I missed this comment in the last patch, and this makes me a bit 
confused. During the design phase, did we agree to have this whole list 
loaded from user ? which means that if user doesn't define the list at 
init, we will not send the trace_event().


Or was it kernel has a list, and user can modify if he wants to, and we 
will dump the values as per the register list.


@Christian ?

Regards
Shashank
On 2/8/2022 3:18 PM, Sharma, Shashank wrote:



On 2/8/2022 2:39 PM, Somalapuram, Amaranath wrote:



On 2/8/2022 4:43 PM, Sharma, Shashank wrote:

I thought we spoke and agreed about:
- Not doing dynamic memory allocation during a reset call,
as there is a redesign debugfs call will happen during the 
application initialization and not during reset.
- Not doing string operations, but just dumping register values by 
index.
I think you're referring to the second patch, which happens during reset,
and there is no string operation in the second patch.


Pls see my comment in the end.


NACK !

- Shashank

Amar,
Apart from the long comment, there are a few more bugs in the patch, 
which I have mentioned here inline. Please check them out.


- Shashank

On 2/8/2022 9:18 AM, Christian König wrote:

On 08.02.22 at 09:16, Somalapuram Amaranath wrote:
List of register to be populated for dump collection during the 
GPU reset.


Signed-off-by: Somalapuram Amaranath 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu.h |  3 ++
  drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 60 
+

  2 files changed, 63 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h

index b85b67a88a3d..78fa46f959c5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1097,6 +1097,9 @@ struct amdgpu_device {
  struct amdgpu_reset_control *reset_cntl;
  uint32_t ip_versions[HW_ID_MAX][HWIP_MAX_INSTANCE];
+
+    /* reset dump register */
+    long    reset_dump_reg_list[128];


I don't have time for a full review, but using long here certainly 
makes no sense.


long is either 32bit or 64bit depending on the CPU architecture.

Regards,
Christian.


will change uint32_t.

  };
  static inline struct amdgpu_device *drm_to_adev(struct 
drm_device *ddev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c

index 164d6a9e9fbb..dad268e8a81a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -1609,6 +1609,64 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_ib_preempt, 
NULL,

  DEFINE_DEBUGFS_ATTRIBUTE(fops_sclk_set, NULL,
  amdgpu_debugfs_sclk_set, "%llu\n");
+static ssize_t amdgpu_reset_dump_register_list_read(struct file *f,
+    char __user *buf, size_t size, loff_t *pos)
+{
+    struct amdgpu_device *adev = (struct amdgpu_device 
*)file_inode(f)->i_private;

+    char *reg_offset;
+    int i, r, len;
+
+    reg_offset = kmalloc(2048, GFP_KERNEL);


We also want to understand how the value 2048 came into the picture;
probably a macro which calculates the size at preprocessing time will
work better.


#define N_REGS_DUMP_GPU_RESET 10
#define BUFFER_SZ (N_REGS_DUMP_GPU_RESET * sizeof(uint64_t) + 1)

This first macro can be used later for the loop count for registers as 
well.



+    memset(reg_offset,  0, 2048);
+    for (i = 0; adev->reset_dump_reg_list[i] != 0; i++)


This loop termination condition is incorrect, why are we running the 
loop until adev->reset_dump_reg_list[i] != 0 ?


What if I have 10 registers to dump, but my 4th register value is 0 
? It will break the loop at 4 and we will not get all values.



agreed, I try to avoid one more variable in adev


Not by the cost of logic of course :).

Now you can run this loop here.

for (i = 0; i < N_REGS...; i++) {
 register_value_copy_here;
}

+    sprintf(reg_offset + strlen(reg_offset), "0x%lx ", 
adev->reset_dump_reg_list[i]);

+



+    sprintf(reg_offset + strlen(reg_offset), "\n");
+    len = strlen(reg_offset);
+
+    if (*pos >=  len)
+    return 0;
+
+    r = copy_to_user(buf, reg_offset, len);
+    *pos += len - r;
+    kfree(reg_offset);


Also, why are we doing a dynamic memory allocation for reg_offset?
We can simply use adev->reset_dump_reg_list[i], isn't it?


simply
for (i = 0; i < num_regs; i++) {
copy_to_user(buf + i * sizeof(uint64_t), &adev->reg_list[i], sizeof(uint64_t));
}

Or without even a loop, simply:

Re: [PATCH 7/7] drm/amd/pm: fix some OEM SKU specific stability issues

2022-02-08 Thread Deucher, Alexander
[Public]

Series is:
Reviewed-by: Alex Deucher 

From: Quan, Evan 
Sent: Monday, February 7, 2022 10:20 PM
To: amd-gfx@lists.freedesktop.org 
Cc: Deucher, Alexander ; Quan, Evan 

Subject: [PATCH 7/7] drm/amd/pm: fix some OEM SKU specific stability issues

Add a quirk in sienna_cichlid_ppt.c to fix some OEM SKU
specific stability issues.

Signed-off-by: Evan Quan 
Change-Id: I172c6429c54253788dbf28f7acf877375f2bfc5b
---
 .../amd/pm/swsmu/smu11/sienna_cichlid_ppt.c   | 32 ++-
 1 file changed, 31 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c
index a7bb5358d4a4..f964af05f376 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c
@@ -428,6 +428,36 @@ static int sienna_cichlid_store_powerplay_table(struct 
smu_context *smu)
 return 0;
 }

+static int sienna_cichlid_patch_pptable_quirk(struct smu_context *smu)
+{
+   struct amdgpu_device *adev = smu->adev;
+   uint32_t *board_reserved;
+   uint16_t *freq_table_gfx;
+   uint32_t i;
+
+   /* Fix some OEM SKU specific stability issues */
+   GET_PPTABLE_MEMBER(BoardReserved, &board_reserved);
+   if ((adev->pdev->device == 0x73DF) &&
+   (adev->pdev->revision == 0XC3) &&
+   (adev->pdev->subsystem_device == 0x16C2) &&
+   (adev->pdev->subsystem_vendor == 0x1043))
+   board_reserved[0] = 1387;
+
+   GET_PPTABLE_MEMBER(FreqTableGfx, &freq_table_gfx);
+   if ((adev->pdev->device == 0x73DF) &&
+   (adev->pdev->revision == 0XC3) &&
+   ((adev->pdev->subsystem_device == 0x16C2) ||
+   (adev->pdev->subsystem_device == 0x133C)) &&
+   (adev->pdev->subsystem_vendor == 0x1043)) {
+   for (i = 0; i < NUM_GFXCLK_DPM_LEVELS; i++) {
+   if (freq_table_gfx[i] > 2500)
+   freq_table_gfx[i] = 2500;
+   }
+   }
+
+   return 0;
+}
+
 static int sienna_cichlid_setup_pptable(struct smu_context *smu)
 {
 int ret = 0;
@@ -448,7 +478,7 @@ static int sienna_cichlid_setup_pptable(struct smu_context 
*smu)
 if (ret)
 return ret;

-   return ret;
+   return sienna_cichlid_patch_pptable_quirk(smu);
 }

 static int sienna_cichlid_tables_init(struct smu_context *smu)
--
2.29.0



Re: [PATCH v2] drm/amd/pm: fix hwmon node of power1_label create issue

2022-02-08 Thread Alex Deucher
Reviewed-by: Alex Deucher 

On Tue, Feb 8, 2022 at 2:09 AM Yang Wang  wrote:
>
> it will cause the hwmon node of power1_label to not be created.
>
> v2:
> the hwmon node of "power1_label" is always needed for all ASICs.
> and the patch will remove ASIC type check for "power1_label".
>
> Fixes: ae07970a06 ("drm/amd/pm: add support for hwmon control of slow and 
> fast PPT limit on vangogh")
>
> Signed-off-by: Yang Wang 
> ---
>  drivers/gpu/drm/amd/pm/amdgpu_pm.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/pm/amdgpu_pm.c 
> b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
> index d68e7132da2c..d6c01c59f32e 100644
> --- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c
> +++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
> @@ -3288,7 +3288,7 @@ static umode_t hwmon_attributes_visible(struct kobject 
> *kobj,
>  attr == &sensor_dev_attr_power2_cap.dev_attr.attr ||
>  attr == &sensor_dev_attr_power2_cap_default.dev_attr.attr ||
>  attr == &sensor_dev_attr_power2_label.dev_attr.attr ||
> -attr == &sensor_dev_attr_power1_label.dev_attr.attr))
> +attr == &sensor_dev_attr_power2_label.dev_attr.attr))
> return 0;
>
> return effective_mode;
> --
> 2.25.1
>


[PATCH v8 2/3] drm/mediatek: init panel orientation property

2022-02-08 Thread Hsin-Yi Wang
Init the panel orientation property after the connector is initialized. Let the
panel driver decide the orientation value later.

Signed-off-by: Hsin-Yi Wang 
Acked-by: Chun-Kuang Hu 
---
 drivers/gpu/drm/mediatek/mtk_dsi.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/drivers/gpu/drm/mediatek/mtk_dsi.c 
b/drivers/gpu/drm/mediatek/mtk_dsi.c
index 5d90d2eb001935..491bf5b0a2b984 100644
--- a/drivers/gpu/drm/mediatek/mtk_dsi.c
+++ b/drivers/gpu/drm/mediatek/mtk_dsi.c
@@ -965,6 +965,13 @@ static int mtk_dsi_encoder_init(struct drm_device *drm, 
struct mtk_dsi *dsi)
ret = PTR_ERR(dsi->connector);
goto err_cleanup_encoder;
}
+
+   ret = drm_connector_init_panel_orientation_property(dsi->connector);
+   if (ret) {
+   DRM_ERROR("Unable to init panel orientation\n");
+   goto err_cleanup_encoder;
+   }
+
drm_connector_attach_encoder(dsi->connector, >encoder);
 
return 0;
-- 
2.35.0.263.gb82422642f-goog



[PATCH v8 1/3] gpu: drm: separate panel orientation property creating and value setting

2022-02-08 Thread Hsin-Yi Wang
drm_dev_register() sets connector->registration_state to
DRM_CONNECTOR_REGISTERED and dev->registered to true. If
drm_connector_set_panel_orientation() is first called after
drm_dev_register(), it will fail several checks and result in the following
warning.

Add a function to create the panel orientation property and set its default
value to UNKNOWN, so drivers can call this function to init the property
earlier, and let the panel set the real value later.
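With this split, a driver flow looks roughly like the following sketch (names
from this patch; the orientation value is just an example):

/* At connector creation time, before drm_dev_register(): */
ret = drm_connector_init_panel_orientation_property(connector);
if (ret)
	return ret;

/* Later, e.g. from the panel's get_modes() hook, set the real value: */
drm_connector_set_panel_orientation(connector,
				    DRM_MODE_PANEL_ORIENTATION_LEFT_UP);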

[4.480976] [ cut here ]
[4.485603] WARNING: CPU: 5 PID: 369 at drivers/gpu/drm/drm_mode_object.c:45 
__drm_mode_object_add+0xb4/0xbc

[4.609772] Call trace:
[4.612208]  __drm_mode_object_add+0xb4/0xbc
[4.616466]  drm_mode_object_add+0x20/0x2c
[4.620552]  drm_property_create+0xdc/0x174
[4.624723]  drm_property_create_enum+0x34/0x98
[4.629241]  drm_connector_set_panel_orientation+0x64/0xa0
[4.634716]  boe_panel_get_modes+0x88/0xd8
[4.638802]  drm_panel_get_modes+0x2c/0x48
[4.642887]  panel_bridge_get_modes+0x1c/0x28
[4.647233]  drm_bridge_connector_get_modes+0xa0/0xd4
[4.652273]  drm_helper_probe_single_connector_modes+0x218/0x700
[4.658266]  drm_mode_getconnector+0x1b4/0x45c
[4.662699]  drm_ioctl_kernel+0xac/0x128
[4.11]  drm_ioctl+0x268/0x410
[4.670002]  drm_compat_ioctl+0xdc/0xf0
[4.673829]  __arm64_compat_sys_ioctl+0xc8/0x100
[4.678436]  el0_svc_common+0xf4/0x1c0
[4.682174]  do_el0_svc_compat+0x28/0x3c
[4.686088]  el0_svc_compat+0x10/0x1c
[4.689738]  el0_sync_compat_handler+0xa8/0xcc
[4.694171]  el0_sync_compat+0x178/0x180
[4.698082] ---[ end trace b4f2db9d9c88610b ]---
[4.702721] [ cut here ]
[4.707329] WARNING: CPU: 5 PID: 369 at 
drivers/gpu/drm/drm_mode_object.c:243 drm_object_attach_property+0x48/0xb8

[4.833830] Call trace:
[4.836266]  drm_object_attach_property+0x48/0xb8
[4.840958]  drm_connector_set_panel_orientation+0x84/0xa0
[4.846432]  boe_panel_get_modes+0x88/0xd8
[4.850516]  drm_panel_get_modes+0x2c/0x48
[4.854600]  panel_bridge_get_modes+0x1c/0x28
[4.858946]  drm_bridge_connector_get_modes+0xa0/0xd4
[4.863984]  drm_helper_probe_single_connector_modes+0x218/0x700
[4.869978]  drm_mode_getconnector+0x1b4/0x45c
[4.874410]  drm_ioctl_kernel+0xac/0x128
[4.878320]  drm_ioctl+0x268/0x410
[4.881711]  drm_compat_ioctl+0xdc/0xf0
[4.885536]  __arm64_compat_sys_ioctl+0xc8/0x100
[4.890142]  el0_svc_common+0xf4/0x1c0
[4.893879]  do_el0_svc_compat+0x28/0x3c
[4.897791]  el0_svc_compat+0x10/0x1c
[4.901441]  el0_sync_compat_handler+0xa8/0xcc
[4.905873]  el0_sync_compat+0x178/0x180
[4.909783] ---[ end trace b4f2db9d9c88610c ]---

Signed-off-by: Hsin-Yi Wang 
Reviewed-by: Sean Paul 
---
v7->v8:
- check if the prop is already created to avoid a leak when called multiple
  times.
- attempt to create the prop in drm_connector_set_panel_orientation if the prop
  was not created before, so drivers don't need to call
  drm_connector_init_panel_orientation_property if they don't need to
  set the property earlier.
---
 drivers/gpu/drm/drm_connector.c | 62 -
 include/drm/drm_connector.h |  2 ++
 2 files changed, 48 insertions(+), 16 deletions(-)

diff --git a/drivers/gpu/drm/drm_connector.c b/drivers/gpu/drm/drm_connector.c
index a50c82bc2b2fec..572ead7ac10690 100644
--- a/drivers/gpu/drm/drm_connector.c
+++ b/drivers/gpu/drm/drm_connector.c
@@ -1252,7 +1252,7 @@ static const struct drm_prop_enum_list dp_colorspaces[] = 
{
  * INPUT_PROP_DIRECT) will still map 1:1 to the actual LCD panel
  * coordinates, so if userspace rotates the picture to adjust for
  * the orientation it must also apply the same transformation to the
- * touchscreen input coordinates. This property is initialized by calling
+ * touchscreen input coordinates. This property value is set by calling
  * drm_connector_set_panel_orientation() or
  * drm_connector_set_panel_orientation_with_quirk()
  *
@@ -2341,8 +2341,8 @@ EXPORT_SYMBOL(drm_connector_set_vrr_capable_property);
  * @connector: connector for which to set the panel-orientation property.
  * @panel_orientation: drm_panel_orientation value to set
  *
- * This function sets the connector's panel_orientation and attaches
- * a "panel orientation" property to the connector.
+ * This function sets the connector's panel_orientation value. If the property
+ * doesn't exist, it will try to create one.
  *
  * Calling this function on a connector where the panel_orientation has
  * already been set is a no-op (e.g. the orientation has been overridden with
@@ -2373,19 +2373,12 @@ int drm_connector_set_panel_orientation(
info->panel_orientation = panel_orientation;
 
prop = dev->mode_config.panel_orientation_property;
-   if (!prop) {
-   prop = drm_property_create_enum(dev, DRM_MODE_PROP_IMMUTABLE,
-   "panel 

[PATCH v8 3/3] arm64: dts: mt8183: Add panel rotation

2022-02-08 Thread Hsin-Yi Wang
krane, kakadu, and kodama boards have a default panel rotation.

Signed-off-by: Hsin-Yi Wang 
Reviewed-by: Enric Balletbo i Serra 
Tested-by: Enric Balletbo i Serra 
---
 arch/arm64/boot/dts/mediatek/mt8183-kukui.dtsi | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/arm64/boot/dts/mediatek/mt8183-kukui.dtsi 
b/arch/arm64/boot/dts/mediatek/mt8183-kukui.dtsi
index b42d81d26d7211..d29d4378170971 100644
--- a/arch/arm64/boot/dts/mediatek/mt8183-kukui.dtsi
+++ b/arch/arm64/boot/dts/mediatek/mt8183-kukui.dtsi
@@ -276,6 +276,7 @@ panel: panel@0 {
avee-supply = <_lcd>;
pp1800-supply = <_lcd>;
backlight = <_lcd0>;
+   rotation = <270>;
port {
panel_in: endpoint {
remote-endpoint = <_out>;
-- 
2.35.0.263.gb82422642f-goog



Re: [PATCH v4 2/2] drm/radeon/uvd: Fix forgotten unmap buffer objects

2022-02-08 Thread zhanglianjie
I am very sorry that I submitted many times due to the character coding 
problem. Can PATCH V4 be used?



I'm scratching my head what you are doing here?

That's the fifth time you've sent out the same patch, so something is going
wrong here :)


Please double check why that lands in your outbox over and over again.

Regards,
Christian.

On 08.02.22 at 09:14, zhanglianjie wrote:
after the buffer object is successfully mapped, call radeon_bo_kunmap 
before the function returns.


Signed-off-by: zhanglianjie 
Reviewed-by: Christian König 

diff --git a/drivers/gpu/drm/radeon/radeon_uvd.c 
b/drivers/gpu/drm/radeon/radeon_uvd.c

index 377f9cdb5b53..0558d928d98d 100644
--- a/drivers/gpu/drm/radeon/radeon_uvd.c
+++ b/drivers/gpu/drm/radeon/radeon_uvd.c
@@ -497,6 +497,7 @@ static int radeon_uvd_cs_msg(struct 
radeon_cs_parser *p, struct radeon_bo *bo,

  handle = msg[2];

  if (handle == 0) {
+    radeon_bo_kunmap(bo);
  DRM_ERROR("Invalid UVD handle!\n");
  return -EINVAL;
  }
@@ -559,12 +560,10 @@ static int radeon_uvd_cs_msg(struct 
radeon_cs_parser *p, struct radeon_bo *bo,

  return 0;

  default:
-
  DRM_ERROR("Illegal UVD message type (%d)!\n", msg_type);
-    return -EINVAL;
  }

-    BUG();
+    radeon_bo_kunmap(bo);
  return -EINVAL;
  }

--
2.20.1












Re: [PATCH v4 2/2] drm/radeon/uvd: Fix forgotten unmap buffer objects

2022-02-08 Thread zhanglianjie

Thank you very much for your review.


I think so, Alex will probably pick that up.

Thanks,
Christian.

On 08.02.22 at 09:28, zhanglianjie wrote:
I am very sorry that I submitted many times due to the character 
coding problem. Can PATCH V4 be used?



I'm scratching my head what you are doing here?

That's the fifth time you've sent out the same patch, so something is
going wrong here :)


Please double check why that lands in your outbox over and over again.

Regards,
Christian.

On 08.02.22 at 09:14, zhanglianjie wrote:
after the buffer object is successfully mapped, call 
radeon_bo_kunmap before the function returns.


Signed-off-by: zhanglianjie 
Reviewed-by: Christian König 

diff --git a/drivers/gpu/drm/radeon/radeon_uvd.c 
b/drivers/gpu/drm/radeon/radeon_uvd.c

index 377f9cdb5b53..0558d928d98d 100644
--- a/drivers/gpu/drm/radeon/radeon_uvd.c
+++ b/drivers/gpu/drm/radeon/radeon_uvd.c
@@ -497,6 +497,7 @@ static int radeon_uvd_cs_msg(struct 
radeon_cs_parser *p, struct radeon_bo *bo,

  handle = msg[2];

  if (handle == 0) {
+    radeon_bo_kunmap(bo);
  DRM_ERROR("Invalid UVD handle!\n");
  return -EINVAL;
  }
@@ -559,12 +560,10 @@ static int radeon_uvd_cs_msg(struct 
radeon_cs_parser *p, struct radeon_bo *bo,

  return 0;

  default:
-
  DRM_ERROR("Illegal UVD message type (%d)!\n", msg_type);
-    return -EINVAL;
  }

-    BUG();
+    radeon_bo_kunmap(bo);
  return -EINVAL;
  }

--
2.20.1



















Re: [PATCH 1/2] drm/amdgpu: add debugfs for reset registers list

2022-02-08 Thread Sharma, Shashank

>> User only update the list of reg offsets on init, there is no
>> predefined reg offset from kernel code.

I missed this comment in the last patch, and this makes me a bit 
confused. During the design phase, did we agree to have this whole list 
loaded from user ? which means that if user doesn't define the list at 
init, we will not send the trace_event().


Or was it kernel has a list, and user can modify if he wants to, and we 
will dump the values as per the register list.


@Christian ?

Regards
Shashank
On 2/8/2022 3:18 PM, Sharma, Shashank wrote:



On 2/8/2022 2:39 PM, Somalapuram, Amaranath wrote:



On 2/8/2022 4:43 PM, Sharma, Shashank wrote:

I thought we spoke and agreed about:
- Not doing dynamic memory allocation during a reset call,
as there is a redesign debugfs call will happen during the application 
initialization and not during reset.
- Not doing string operations, but just dumping register values by 
index.
I think you're referring to the second patch, which happens during reset,
and there is no string operation in the second patch.


Pls see my comment in the end.


NACK !

- Shashank

Amar,
Apart from the long comment, there are a few more bugs in the patch, 
which I have mentioned here inline. Please check them out.


- Shashank

On 2/8/2022 9:18 AM, Christian König wrote:

On 08.02.22 at 09:16, Somalapuram Amaranath wrote:
List of register to be populated for dump collection during the GPU 
reset.


Signed-off-by: Somalapuram Amaranath 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu.h |  3 ++
  drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 60 
+

  2 files changed, 63 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h

index b85b67a88a3d..78fa46f959c5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1097,6 +1097,9 @@ struct amdgpu_device {
  struct amdgpu_reset_control *reset_cntl;
  uint32_t ip_versions[HW_ID_MAX][HWIP_MAX_INSTANCE];
+
+    /* reset dump register */
+    long    reset_dump_reg_list[128];


I don't have time for a full review, but using long here certainly 
makes no sense.


long is either 32bit or 64bit depending on the CPU architecture.

Regards,
Christian.


will change uint32_t.

  };
  static inline struct amdgpu_device *drm_to_adev(struct drm_device 
*ddev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c

index 164d6a9e9fbb..dad268e8a81a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -1609,6 +1609,64 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_ib_preempt, NULL,
  DEFINE_DEBUGFS_ATTRIBUTE(fops_sclk_set, NULL,
  amdgpu_debugfs_sclk_set, "%llu\n");
+static ssize_t amdgpu_reset_dump_register_list_read(struct file *f,
+    char __user *buf, size_t size, loff_t *pos)
+{
+    struct amdgpu_device *adev = (struct amdgpu_device 
*)file_inode(f)->i_private;

+    char *reg_offset;
+    int i, r, len;
+
+    reg_offset = kmalloc(2048, GFP_KERNEL);


We also want to understand how the value 2048 came into the picture;
probably a macro which calculates the size at preprocessing time will work
better.


#define N_REGS_DUMP_GPU_RESET 10
#define BUFFER_SZ (N_REGS_DUMP_GPU_RESET * sizeof(uint64_t) + 1)

This first macro can be used later for the loop count for registers as 
well.



+    memset(reg_offset,  0, 2048);
+    for (i = 0; adev->reset_dump_reg_list[i] != 0; i++)


This loop termination condition is incorrect, why are we running the 
loop until adev->reset_dump_reg_list[i] != 0 ?


What if I have 10 registers to dump, but my 4th register value is 0 ? 
It will break the loop at 4 and we will not get all values.



agreed, I try to avoid one more variable in adev


Not by the cost of logic of course :).

Now you can run this loop here.

for (i = 0; i < N_REGS...; i++) {
 register_value_copy_here;
}

+    sprintf(reg_offset + strlen(reg_offset), "0x%lx ", 
adev->reset_dump_reg_list[i]);

+



+    sprintf(reg_offset + strlen(reg_offset), "\n");
+    len = strlen(reg_offset);
+
+    if (*pos >=  len)
+    return 0;
+
+    r = copy_to_user(buf, reg_offset, len);
+    *pos += len - r;
+    kfree(reg_offset);


Also, why are we doing a dynamic memory allocation for reg_offset?
We can simply use adev->reset_dump_reg_list[i], isn't it?


simply
for (i = 0; i < num_regs; i++) {
copy_to_user(buf + i * sizeof(uint64_t), &adev->reg_list[i], sizeof(uint64_t));
}

Or without even a loop, simply:
copy_to_user(buf, &adev->reg_list, num_regs * sizeof(uint64_t));

- Shashank


it will not be in a user-readable format for debugfs (if non-readable
is acceptable I will change this)




We are just adding 0x in front of the reg value, so honestly I don't see 
a huge improvement in the user readability, but if you still want to do 
the dynamic allocation of memory, add the register offset or name as 
well, I mean then it should read like:


0x1234 = 0xABCD
0x1238 = 0x
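One way to produce that form in the read handler (sketch only; assumes
reg_value[] holds the values read back, and buf/len are the usual debugfs
bookkeeping):

for (i = 0; i < n_regs; i++)
	len += scnprintf(buf + len, buf_sz - len, "0x%04x = 0x%08x\n",
			 adev->reset_dump_reg_list[i], reg_value[i]);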

- Shashank



Re: [PATCH 1/2] drm/amdgpu: add debugfs for reset registers list

2022-02-08 Thread Sharma, Shashank




On 2/8/2022 2:39 PM, Somalapuram, Amaranath wrote:



On 2/8/2022 4:43 PM, Sharma, Shashank wrote:

I thought we spoke and agreed about:
- Not doing dynamic memory allocation during a reset call,
as there is a redesign debugfs call will happen during the application 
initialization and not during reset.

- Not doing string operations, but just dumping register values by index.
I think you're referring to the second patch, which happens during reset,
and there is no string operation in the second patch.


Pls see my comment in the end.


NACK !

- Shashank

Amar,
Apart from the long comment, there are a few more bugs in the patch, 
which I have mentioned here inline. Please check them out.


- Shashank

On 2/8/2022 9:18 AM, Christian König wrote:

On 08.02.22 at 09:16, Somalapuram Amaranath wrote:
List of register to be populated for dump collection during the GPU 
reset.


Signed-off-by: Somalapuram Amaranath 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu.h |  3 ++
  drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 60 
+

  2 files changed, 63 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h

index b85b67a88a3d..78fa46f959c5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1097,6 +1097,9 @@ struct amdgpu_device {
  struct amdgpu_reset_control *reset_cntl;
  uint32_t ip_versions[HW_ID_MAX][HWIP_MAX_INSTANCE];
+
+    /* reset dump register */
+    long    reset_dump_reg_list[128];


I don't have time for a full review, but using long here certainly 
makes no sense.


long is either 32bit or 64bit depending on the CPU architecture.

Regards,
Christian.


will change uint32_t.

  };
  static inline struct amdgpu_device *drm_to_adev(struct drm_device 
*ddev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c

index 164d6a9e9fbb..dad268e8a81a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -1609,6 +1609,64 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_ib_preempt, NULL,
  DEFINE_DEBUGFS_ATTRIBUTE(fops_sclk_set, NULL,
  amdgpu_debugfs_sclk_set, "%llu\n");
+static ssize_t amdgpu_reset_dump_register_list_read(struct file *f,
+    char __user *buf, size_t size, loff_t *pos)
+{
+    struct amdgpu_device *adev = (struct amdgpu_device 
*)file_inode(f)->i_private;

+    char *reg_offset;
+    int i, r, len;
+
+    reg_offset = kmalloc(2048, GFP_KERNEL);


We also want to understand how the value 2048 came into the picture;
probably a macro which calculates the size at preprocessing time will work
better.


#define N_REGS_DUMP_GPU_RESET 10
#define BUFFER_SZ (N_REGS_DUMP_GPU_RESET * sizeof(uint64_t) + 1)

This first macro can be used later for the loop count for registers as well.


+    memset(reg_offset,  0, 2048);
+    for (i = 0; adev->reset_dump_reg_list[i] != 0; i++)


This loop termination condition is incorrect, why are we running the 
loop until adev->reset_dump_reg_list[i] != 0 ?


What if I have 10 registers to dump, but my 4th register value is 0 ? 
It will break the loop at 4 and we will not get all values.



agreed, I try to avoid one more variable in adev


Not by the cost of logic of course :).

Now you can run this loop here.

for (i = 0; i < N_REGS...; i++) {
register_value_copy_here;
}
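
A minimal sketch of a read callback built that way -- fixed loop count, no 
dynamic allocation -- assuming the count macro above and a uint32_t array 
as agreed later in the thread (all names illustrative):

static ssize_t amdgpu_reset_dump_register_list_read(struct file *f,
		char __user *buf, size_t size, loff_t *pos)
{
	struct amdgpu_device *adev = file_inode(f)->i_private;
	/* Worst case per entry: "0x" + 8 hex digits + ' ' = 11 bytes */
	char tmp[N_REGS_DUMP_GPU_RESET * 12 + 2];
	int i, len = 0;

	for (i = 0; i < N_REGS_DUMP_GPU_RESET; i++)
		len += scnprintf(tmp + len, sizeof(tmp) - len, "0x%08x ",
				 adev->reset_dump_reg_list[i]);
	len += scnprintf(tmp + len, sizeof(tmp) - len, "\n");

	/* Handles the *pos bookkeeping and the copy_to_user() for us */
	return simple_read_from_buffer(buf, size, pos, tmp, len);
}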

+    sprintf(reg_offset + strlen(reg_offset), "0x%lx ", 
adev->reset_dump_reg_list[i]);

+



+    sprintf(reg_offset + strlen(reg_offset), "\n");
+    len = strlen(reg_offset);
+
+    if (*pos >=  len)
+    return 0;
+
+    r = copy_to_user(buf, reg_offset, len);
+    *pos += len - r;
+    kfree(reg_offset);


Also, why are we doing a dynamic memory allocation for reg_offset? We 
can simply use adev->reset_dump_reg_list[i], isn't it?


simply:

for (i = 0; i < num_regs; i++) {
	copy_to_user(buf, &adev->reg_list[i], sizeof(uint64_t));
}

Or without even a loop, simply:
copy_to_user(buf, &adev->reg_list, num_regs * sizeof(uint64_t));

- Shashank


it will not be in a user-readable format for debugfs (if non-readable is 
acceptable, I will change this)




We are just adding 0x in front of the reg value, so honestly I don't see 
a huge improvement in user readability. But if you still want to do 
the dynamic allocation of memory, add the register offset or name as 
well; I mean, then it should read like:


0x1234 = 0xABCD
0x1238 = 0x

- Shashank


+


+    return len - r;
+}
+
+static ssize_t amdgpu_reset_dump_register_list_write(struct file 
*f, const char __user *buf,

+    size_t size, loff_t *pos)
+{
+    struct amdgpu_device *adev = (struct amdgpu_device 
*)file_inode(f)->i_private;

+    char *reg_offset, *reg;
+    int ret, i = 0;
+
+    reg_offset = kmalloc(size, GFP_KERNEL);
+    memset(reg_offset,  0, size);
+    ret = copy_from_user(reg_offset, buf, size);
+


We are not allowing the user to write into the list, so this whole 
function can just be a NOOP.


- Shashank
The user only updates the list of reg offsets on init; there is no 
predefined reg offset in the kernel code.

Re: [PATCH 1/2] drm/amdgpu: add debugfs for reset registers list

2022-02-08 Thread Somalapuram, Amaranath




On 2/8/2022 4:43 PM, Sharma, Shashank wrote:

I thought we spoke and agreed about:
- Not doing dynamic memory allocation during a reset call,
As there is a redesign, the debugfs call will happen during application 
initialization and not during reset.

- Not doing string operations, but just dumping register values by index.
I think you're referring to the second patch, which happens during reset, 
and there is no string operation in the second patch.

NACK !

- Shashank

Amar,
Apart from the long comment, there are a few more bugs in the patch, 
which I have mentioned here inline. Please check them out.


- Shashank

On 2/8/2022 9:18 AM, Christian König wrote:

On 08.02.22 at 09:16, Somalapuram Amaranath wrote:
List of registers to be populated for dump collection during the GPU 
reset.


Signed-off-by: Somalapuram Amaranath 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu.h |  3 ++
  drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 60 
+

  2 files changed, 63 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h

index b85b67a88a3d..78fa46f959c5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1097,6 +1097,9 @@ struct amdgpu_device {
  struct amdgpu_reset_control *reset_cntl;
  uint32_t ip_versions[HW_ID_MAX][HWIP_MAX_INSTANCE];
+
+    /* reset dump register */
+    long    reset_dump_reg_list[128];


I don't have time for a full review, but using long here certainly 
makes no sense.


long is either 32bit or 64bit depending on the CPU architecture.

Regards,
Christian.


will change to uint32_t.

  };
  static inline struct amdgpu_device *drm_to_adev(struct drm_device 
*ddev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c

index 164d6a9e9fbb..dad268e8a81a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -1609,6 +1609,64 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_ib_preempt, NULL,
  DEFINE_DEBUGFS_ATTRIBUTE(fops_sclk_set, NULL,
  amdgpu_debugfs_sclk_set, "%llu\n");
+static ssize_t amdgpu_reset_dump_register_list_read(struct file *f,
+    char __user *buf, size_t size, loff_t *pos)
+{
+    struct amdgpu_device *adev = (struct amdgpu_device 
*)file_inode(f)->i_private;

+    char *reg_offset;
+    int i, r, len;
+
+    reg_offset = kmalloc(2048, GFP_KERNEL);
+    memset(reg_offset,  0, 2048);
+    for (i = 0; adev->reset_dump_reg_list[i] != 0; i++)


This loop termination condition is incorrect, why are we running the 
loop until adev->reset_dump_reg_list[i] != 0 ?


What if I have 10 registers to dump, but my 4th register value is 0 ? 
It will break the loop at 4 and we will not get all values.



agreed, I was trying to avoid one more variable in adev
+    sprintf(reg_offset + strlen(reg_offset), "0x%lx ", 
adev->reset_dump_reg_list[i]);

+
+    sprintf(reg_offset + strlen(reg_offset), "\n");
+    len = strlen(reg_offset);
+
+    if (*pos >=  len)
+    return 0;
+
+    r = copy_to_user(buf, reg_offset, len);
+    *pos += len - r;
+    kfree(reg_offset);


Also, why are we doing a dynamic memory allocation for reg_offset? We 
can simply use adev->reset_dump_reg_list[i], isn't it?


simply:

for (i = 0; i < num_regs; i++) {
	copy_to_user(buf, &adev->reg_list[i], sizeof(uint64_t));
}

Or without even a loop, simply:
copy_to_user(buf, &adev->reg_list, num_regs * sizeof(uint64_t));

- Shashank


it will not be in a user-readable format for debugfs (if non-readable is 
acceptable, I will change this)



+


+    return len - r;
+}
+
+static ssize_t amdgpu_reset_dump_register_list_write(struct file 
*f, const char __user *buf,

+    size_t size, loff_t *pos)
+{
+    struct amdgpu_device *adev = (struct amdgpu_device 
*)file_inode(f)->i_private;

+    char *reg_offset, *reg;
+    int ret, i = 0;
+
+    reg_offset = kmalloc(size, GFP_KERNEL);
+    memset(reg_offset,  0, size);
+    ret = copy_from_user(reg_offset, buf, size);
+


We are not allowing the user to write into the list, so this whole 
function can just be a NOOP.


- Shashank
The user only updates the list of reg offsets on init; there is no 
predefined reg offset in the kernel code.
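
For illustration, a bounded version of the parser could look like the 
sketch below; it only firms up points already raised in the thread 
(NUL-terminating the user buffer, not overflowing the 128-entry array, and 
keeping the original pointer for kfree() since strsep() advances its 
argument), and assumes the array becomes uint32_t as agreed:

static ssize_t amdgpu_reset_dump_register_list_write(struct file *f,
		const char __user *buf, size_t size, loff_t *pos)
{
	struct amdgpu_device *adev = file_inode(f)->i_private;
	char *kbuf, *cur, *reg;
	int ret = 0, i = 0;

	/* Copy from userspace and NUL-terminate in one step */
	kbuf = memdup_user_nul(buf, size);
	if (IS_ERR(kbuf))
		return PTR_ERR(kbuf);

	cur = kbuf;	/* strsep() moves cur; kbuf stays valid for kfree() */
	while ((reg = strsep(&cur, " \n")) != NULL) {
		if (!*reg)
			continue;	/* skip empty tokens */
		if (i >= ARRAY_SIZE(adev->reset_dump_reg_list)) {
			ret = -EINVAL;	/* more tokens than slots */
			break;
		}
		ret = kstrtouint(reg, 16, &adev->reset_dump_reg_list[i++]);
		if (ret)
			break;
	}

	kfree(kbuf);
	return ret ? ret : size;
}

The expected usage is then a one-time write of space-separated hex offsets 
at application init (the offsets below are examples only), e.g.
echo "0xd0c 0xd10" > /sys/kernel/debug/dri/0/amdgpu_reset_dump_register_list
and reading the file back to verify the list.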



+    if (ret)
+    return -EFAULT;
+
+    while ((reg = strsep(&reg_offset, " ")) != NULL) {
+    ret  = kstrtol(reg, 16, &adev->reset_dump_reg_list[i]);
+    if (ret)
+    return -EINVAL;
+    i++;
+    }
+
+    kfree(reg_offset);
+
+    return size;
+}
+
+static const struct file_operations amdgpu_reset_dump_register_list 
= {

+    .owner = THIS_MODULE,
+    .read = amdgpu_reset_dump_register_list_read,
+    .write = amdgpu_reset_dump_register_list_write,
+    .llseek = default_llseek
+};
+
  int amdgpu_debugfs_init(struct amdgpu_device *adev)
  {
  struct dentry *root = adev_to_drm(adev)->primary->debugfs_root;
@@ -1672,6 +1730,8 @@ int amdgpu_debugfs_init(struct amdgpu_device 
*adev)

   &amdgpu_debugfs_test_ib_fops);
  

RE: [PATCH] drm/amd/pm: correct hwmon power lable name

2022-02-08 Thread Chen, Guchun
[Public]

A typo in subject, s/lable/label.

With that addressed, the patch is:
Reviewed-by: Guchun Chen 

Regards,
Guchun

-Original Message-
From: amd-gfx  On Behalf Of Yang Wang
Sent: Tuesday, February 8, 2022 3:44 PM
To: amd-gfx@lists.freedesktop.org
Cc: Hou, Xiaomeng (Matthew) ; Lazar, Lijo 
; Feng, Kenneth ; Wang, Yang(Kevin) 

Subject: [PATCH] drm/amd/pm: correct hwmon power lable name

Only Vangogh has 2 types of hwmon power nodes: "fastPPT" and "slowPPT"; the 
other ASICs only have 1 type of hwmon power node: "PPT".

Signed-off-by: Yang Wang 
---
 drivers/gpu/drm/amd/pm/amdgpu_pm.c | 10 +++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/amdgpu_pm.c 
b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
index 426e00112c91..ad5da252228b 100644
--- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c
+++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
@@ -2842,10 +2842,14 @@ static ssize_t amdgpu_hwmon_show_power_label(struct 
device *dev,
 struct device_attribute *attr,
 char *buf)
 {
-   int limit_type = to_sensor_dev_attr(attr)->index;
+   struct amdgpu_device *adev = dev_get_drvdata(dev);
 
-   return sysfs_emit(buf, "%s\n",
-   limit_type == PP_PWR_TYPE_FAST ? "fastPPT" : "slowPPT");
+   if (adev->asic_type == CHIP_VANGOGH)
+   return sysfs_emit(buf, "%s\n",
+ to_sensor_dev_attr(attr)->index == 
PP_PWR_TYPE_FAST ?
+ "fastPPT" : "slowPPT");
+   else
+   return sysfs_emit(buf, "PPT\n");
 }
 
 static ssize_t amdgpu_hwmon_set_power_cap(struct device *dev,
--
2.25.1


Re: [RFC v4] drm/amdgpu: Rework reset domain to be refcounted.

2022-02-08 Thread Lazar, Lijo




On 2/2/2022 10:56 PM, Andrey Grodzovsky wrote:

The reset domain contains the register access semaphore
now and so needs to be present as long as each device
in a hive needs it, and so it cannot be bound to the XGMI
hive life cycle.
Address this by making the reset domain refcounted and pointed
to by each member of the hive and the hive itself.

v4:
Fix crash on boot with XGMI hive by adding a type to reset_domain.
XGMI will only create a new reset_domain if the previous one was of
single device type, meaning it's the first boot. Otherwise it will take
a refcount to the existing reset_domain from the amdgpu device.

Signed-off-by: Andrey Grodzovsky 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu.h|  6 +--
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 44 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c  | 38 +++
  drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h  | 18 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   | 29 +++---
  drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h   |  2 +-
  drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c  |  4 +-
  drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c  |  4 +-
  drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c  |  4 +-
  9 files changed, 118 insertions(+), 31 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 8e96b9a14452..f2ba460bfd59 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -813,9 +813,7 @@ struct amd_powerplay {
  #define AMDGPU_RESET_MAGIC_NUM 64
  #define AMDGPU_MAX_DF_PERFMONS 4
  
-struct amdgpu_reset_domain {

-   struct workqueue_struct *wq;
-};
+struct amdgpu_reset_domain;
  
  struct amdgpu_device {

struct device   *dev;
@@ -1102,7 +1100,7 @@ struct amdgpu_device {
struct amdgpu_reset_control *reset_cntl;
uint32_t
ip_versions[HW_ID_MAX][HWIP_MAX_INSTANCE];
  
-	struct amdgpu_reset_domain	reset_domain;

+   struct amdgpu_reset_domain  *reset_domain;
  };
  
  static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index fef952ca8db5..cd1b7af69c35 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2313,7 +2313,7 @@ static int amdgpu_device_init_schedulers(struct 
amdgpu_device *adev)
  
  		r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,

   ring->num_hw_submission, 
amdgpu_job_hang_limit,
-  timeout, adev->reset_domain.wq, 
ring->sched_score, ring->name);
+  timeout, adev->reset_domain->wq, 
ring->sched_score, ring->name);
if (r) {
DRM_ERROR("Failed to create scheduler on ring %s.\n",
  ring->name);
@@ -2432,24 +2432,22 @@ static int amdgpu_device_ip_init(struct amdgpu_device 
*adev)
if (r)
goto init_failed;
  
+	/**

+* In case of XGMI grab extra reference for reset domain for this device
+*/
if (adev->gmc.xgmi.num_physical_nodes > 1) {
-   struct amdgpu_hive_info *hive;
-
-   amdgpu_xgmi_add_device(adev);
+   if (amdgpu_xgmi_add_device(adev) == 0) {
+   struct amdgpu_hive_info *hive = 
amdgpu_get_xgmi_hive(adev);
  
-		hive = amdgpu_get_xgmi_hive(adev);

-   if (!hive || !hive->reset_domain.wq) {
-   DRM_ERROR("Failed to obtain reset domain info for XGMI 
hive:%llx", hive->hive_id);
-   r = -EINVAL;
-   goto init_failed;
-   }
+   if (!hive->reset_domain ||
+   
!kref_get_unless_zero(&hive->reset_domain->refcount)) {
+   r = -ENOENT;
+   goto init_failed;
+   }
  
-		adev->reset_domain.wq = hive->reset_domain.wq;

-   } else {
-   adev->reset_domain.wq = 
alloc_ordered_workqueue("amdgpu-reset-dev", 0);
-   if (!adev->reset_domain.wq) {
-   r = -ENOMEM;
-   goto init_failed;
+   /* Drop the early temporary reset domain we created for 
device */
+   kref_put(&adev->reset_domain->refcount, 
amdgpu_reset_destroy_reset_domain);
+   adev->reset_domain = hive->reset_domain;
}
}
  
@@ -3599,6 +3597,15 @@ int amdgpu_device_init(struct amdgpu_device *adev,

return r;
}
  
+	/*

+* Reset domain needs to be present early, before XGMI hive discovered
+* (if any) and initialized to use reset sem and in_gpu_reset flag
+* early on during init.
+*/
+   adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE,
"amdgpu-reset-dev");
+   if 
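
The lifetime handling above is the standard kref idiom; a minimal sketch 
of the pattern, with the release callback name taken from the patch and 
the two helpers purely illustrative:

static void amdgpu_reset_destroy_reset_domain(struct kref *ref)
{
	struct amdgpu_reset_domain *reset_domain =
		container_of(ref, struct amdgpu_reset_domain, refcount);

	/* Last reference dropped: tear down the shared state */
	destroy_workqueue(reset_domain->wq);
	kvfree(reset_domain);
}

/* Illustrative wrappers around the kref calls used in the patch */
static inline bool amdgpu_reset_domain_get(struct amdgpu_reset_domain *d)
{
	/* Fails (returns false) once the last reference is gone */
	return kref_get_unless_zero(&d->refcount);
}

static inline void amdgpu_reset_domain_put(struct amdgpu_reset_domain *d)
{
	/* Release callback runs when the count drops to zero */
	kref_put(&d->refcount, amdgpu_reset_destroy_reset_domain);
}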

Re: [PATCH 1/2] drm/amdgpu: add debugfs for reset registers list

2022-02-08 Thread Sharma, Shashank



Amar,
Apart from the long comment, there are a few more bugs in the patch, 
which I have mentioned here inline. Please check them out.


- Shashank

On 2/8/2022 9:18 AM, Christian König wrote:

On 08.02.22 at 09:16, Somalapuram Amaranath wrote:
List of registers to be populated for dump collection during the GPU 
reset.


Signed-off-by: Somalapuram Amaranath 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu.h |  3 ++
  drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 60 +
  2 files changed, 63 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h

index b85b67a88a3d..78fa46f959c5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1097,6 +1097,9 @@ struct amdgpu_device {
  struct amdgpu_reset_control *reset_cntl;
  uint32_t
ip_versions[HW_ID_MAX][HWIP_MAX_INSTANCE];

+
+    /* reset dump register */
+    long    reset_dump_reg_list[128];


I don't have time for a full review, but using long here certainly makes 
no sense.


long is either 32bit or 64bit depending on the CPU architecture.

Regards,
Christian.


  };
  static inline struct amdgpu_device *drm_to_adev(struct drm_device 
*ddev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c

index 164d6a9e9fbb..dad268e8a81a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -1609,6 +1609,64 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_ib_preempt, NULL,
  DEFINE_DEBUGFS_ATTRIBUTE(fops_sclk_set, NULL,
  amdgpu_debugfs_sclk_set, "%llu\n");
+static ssize_t amdgpu_reset_dump_register_list_read(struct file *f,
+    char __user *buf, size_t size, loff_t *pos)
+{
+    struct amdgpu_device *adev = (struct amdgpu_device 
*)file_inode(f)->i_private;

+    char *reg_offset;
+    int i, r, len;
+
+    reg_offset = kmalloc(2048, GFP_KERNEL);
+    memset(reg_offset,  0, 2048);
+    for (i = 0; adev->reset_dump_reg_list[i] != 0; i++)


This loop termination condition is incorrect, why are we running the 
loop until adev->reset_dump_reg_list[i] != 0 ?


What if I have 10 registers to dump, but my 4th register value is 0 ? It 
will break the loop at 4 and we will not get all values.


+    sprintf(reg_offset + strlen(reg_offset), "0x%lx ", 
adev->reset_dump_reg_list[i]);

+
+    sprintf(reg_offset + strlen(reg_offset), "\n");
+    len = strlen(reg_offset);
+
+    if (*pos >=  len)
+    return 0;
+
+    r = copy_to_user(buf, reg_offset, len);
+    *pos += len - r;
+    kfree(reg_offset);


Also, why are we doing a dynamic memory allocation for reg_offset? We 
can simply use adev->reset_dump_reg_list[i], isn't it?


simply:

for (i = 0; i < num_regs; i++) {
	copy_to_user(buf, &adev->reg_list[i], sizeof(uint64_t));
}

Or without even a loop, simply:
copy_to_user(buf, &adev->reg_list, num_regs * sizeof(uint64_t));

- Shashank


+
+    return len - r;
+}
+
+static ssize_t amdgpu_reset_dump_register_list_write(struct file *f, 
const char __user *buf,

+    size_t size, loff_t *pos)
+{
+    struct amdgpu_device *adev = (struct amdgpu_device 
*)file_inode(f)->i_private;

+    char *reg_offset, *reg;
+    int ret, i = 0;
+
+    reg_offset = kmalloc(size, GFP_KERNEL);
+    memset(reg_offset,  0, size);
+    ret = copy_from_user(reg_offset, buf, size);
+


We are not allowing the user to write into the list, so this whole function 
can just be a NOOP.


- Shashank


+    if (ret)
+    return -EFAULT;
+
+    while ((reg = strsep(&reg_offset, " ")) != NULL) {
+    ret  = kstrtol(reg, 16, &adev->reset_dump_reg_list[i]);
+    if (ret)
+    return -EINVAL;
+    i++;
+    }
+
+    kfree(reg_offset);
+
+    return size;
+}
+
+static const struct file_operations amdgpu_reset_dump_register_list = {
+    .owner = THIS_MODULE,
+    .read = amdgpu_reset_dump_register_list_read,
+    .write = amdgpu_reset_dump_register_list_write,
+    .llseek = default_llseek
+};
+
  int amdgpu_debugfs_init(struct amdgpu_device *adev)
  {
  struct dentry *root = adev_to_drm(adev)->primary->debugfs_root;
@@ -1672,6 +1730,8 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
  &amdgpu_debugfs_test_ib_fops);
  debugfs_create_file("amdgpu_vm_info", 0444, root, adev,
  &amdgpu_debugfs_vm_info_fops);
+    debugfs_create_file("amdgpu_reset_dump_register_list", 0644, 
root, adev,

+    &amdgpu_reset_dump_register_list);
  adev->debugfs_vbios_blob.data = adev->bios;
  adev->debugfs_vbios_blob.size = adev->bios_size;




Re: [PATCH v11 5/5] drm/amdgpu: add drm buddy support to amdgpu

2022-02-08 Thread Arunpravin



On 04/02/22 6:53 pm, Christian König wrote:
> On 04.02.22 at 12:22, Arunpravin wrote:
>> On 28/01/22 7:48 pm, Matthew Auld wrote:
>>> On Thu, 27 Jan 2022 at 14:11, Arunpravin
>>>  wrote:
 - Remove drm_mm references and replace with drm buddy functionalities
 - Add res cursor support for drm buddy

 v2(Matthew Auld):
- replace spinlock with mutex as we call kmem_cache_zalloc
  (..., GFP_KERNEL) in drm_buddy_alloc() function

- lock drm_buddy_block_trim() function as it calls
  mark_free/mark_split are all globally visible

 v3(Matthew Auld):
- remove trim method error handling as we address the failure case
  at drm_buddy_block_trim() function

 v4:
- fix warnings reported by kernel test robot 

 v5:
- fix merge conflict issue

 v6:
- fix warnings reported by kernel test robot 

 Signed-off-by: Arunpravin 
 ---
   drivers/gpu/drm/Kconfig   |   1 +
   .../gpu/drm/amd/amdgpu/amdgpu_res_cursor.h|  97 +--
   drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h   |   7 +-
   drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c  | 259 ++
   4 files changed, 231 insertions(+), 133 deletions(-)
>>> 
>>>
 -/**
 - * amdgpu_vram_mgr_virt_start - update virtual start address
 - *
 - * @mem: ttm_resource to update
 - * @node: just allocated node
 - *
 - * Calculate a virtual BO start address to easily check if everything is 
 CPU
 - * accessible.
 - */
 -static void amdgpu_vram_mgr_virt_start(struct ttm_resource *mem,
 -  struct drm_mm_node *node)
 -{
 -   unsigned long start;
 -
 -   start = node->start + node->size;
 -   if (start > mem->num_pages)
 -   start -= mem->num_pages;
 -   else
 -   start = 0;
 -   mem->start = max(mem->start, start);
 -}
 -
   /**
* amdgpu_vram_mgr_new - allocate new ranges
*
 @@ -366,13 +357,13 @@ static int amdgpu_vram_mgr_new(struct 
 ttm_resource_manager *man,
 const struct ttm_place *place,
 struct ttm_resource **res)
   {
 -   unsigned long lpfn, num_nodes, pages_per_node, pages_left, pages;
 +   unsigned long lpfn, pages_per_node, pages_left, pages, n_pages;
 +   u64 vis_usage = 0, mem_bytes, max_bytes, min_page_size;
  struct amdgpu_vram_mgr *mgr = to_vram_mgr(man);
  struct amdgpu_device *adev = to_amdgpu_device(mgr);
 -   uint64_t vis_usage = 0, mem_bytes, max_bytes;
 -   struct ttm_range_mgr_node *node;
 -   struct drm_mm *mm = &mgr->mm;
 -   enum drm_mm_insert_mode mode;
 +   struct amdgpu_vram_mgr_node *node;
 +   struct drm_buddy *mm = &mgr->mm;
 +   struct drm_buddy_block *block;
  unsigned i;
  int r;

 @@ -391,10 +382,9 @@ static int amdgpu_vram_mgr_new(struct 
 ttm_resource_manager *man,
  goto error_sub;
  }

 -   if (place->flags & TTM_PL_FLAG_CONTIGUOUS) {
 +   if (place->flags & TTM_PL_FLAG_CONTIGUOUS)
  pages_per_node = ~0ul;
 -   num_nodes = 1;
 -   } else {
 +   else {
   #ifdef CONFIG_TRANSPARENT_HUGEPAGE
  pages_per_node = HPAGE_PMD_NR;
   #else
 @@ -403,11 +393,9 @@ static int amdgpu_vram_mgr_new(struct 
 ttm_resource_manager *man,
   #endif
  pages_per_node = max_t(uint32_t, pages_per_node,
 tbo->page_alignment);
 -   num_nodes = DIV_ROUND_UP_ULL(PFN_UP(mem_bytes), 
 pages_per_node);
  }

 -   node = kvmalloc(struct_size(node, mm_nodes, num_nodes),
 -   GFP_KERNEL | __GFP_ZERO);
 +   node = kzalloc(sizeof(*node), GFP_KERNEL);
  if (!node) {
  r = -ENOMEM;
  goto error_sub;
 @@ -415,9 +403,17 @@ static int amdgpu_vram_mgr_new(struct 
 ttm_resource_manager *man,

   ttm_resource_init(tbo, place, &node->base);

 -   mode = DRM_MM_INSERT_BEST;
 +   INIT_LIST_HEAD(&node->blocks);
 +
  if (place->flags & TTM_PL_FLAG_TOPDOWN)
 -   mode = DRM_MM_INSERT_HIGH;
 +   node->flags |= DRM_BUDDY_TOPDOWN_ALLOCATION;
 +
 +   if (place->fpfn || lpfn != man->size)
 +   /* Allocate blocks in desired range */
 +   node->flags |= DRM_BUDDY_RANGE_ALLOCATION;
 +
 +   min_page_size = mgr->default_page_size;
 +   BUG_ON(min_page_size < mm->chunk_size);

  pages_left = node->base.num_pages;

 @@ -425,36 +421,61 @@ 

RE: [PATCH 1/2] drm/amdgpu: add debugfs for reset registers list

2022-02-08 Thread Sharma, Shashank
I thought we spoke and agreed about:
- Not doing dynamic memory allocation during a reset call,
- Not doing string operations, but just dumping register values by index. 

NACK !

- Shashank

-Original Message-
From: Somalapuram, Amaranath  
Sent: Tuesday, February 8, 2022 9:17 AM
To: amd-gfx@lists.freedesktop.org
Cc: Koenig, Christian ; Deucher, Alexander 
; Sharma, Shashank ; 
Somalapuram, Amaranath 
Subject: [PATCH 1/2] drm/amdgpu: add debugfs for reset registers list

List of registers to be populated for dump collection during the GPU reset.

Signed-off-by: Somalapuram Amaranath 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h |  3 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 60 +
 2 files changed, 63 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index b85b67a88a3d..78fa46f959c5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1097,6 +1097,9 @@ struct amdgpu_device {
 
struct amdgpu_reset_control *reset_cntl;
uint32_t
ip_versions[HW_ID_MAX][HWIP_MAX_INSTANCE];
+
+   /* reset dump register */
+   longreset_dump_reg_list[128];
 };
 
 static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev) diff 
--git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 164d6a9e9fbb..dad268e8a81a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -1609,6 +1609,64 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_ib_preempt, NULL,  
DEFINE_DEBUGFS_ATTRIBUTE(fops_sclk_set, NULL,
amdgpu_debugfs_sclk_set, "%llu\n");
 
+static ssize_t amdgpu_reset_dump_register_list_read(struct file *f,
+   char __user *buf, size_t size, loff_t *pos) {
+   struct amdgpu_device *adev = (struct amdgpu_device 
*)file_inode(f)->i_private;
+   char *reg_offset;
+   int i, r, len;
+
+   reg_offset = kmalloc(2048, GFP_KERNEL);
+   memset(reg_offset,  0, 2048);
+   for (i = 0; adev->reset_dump_reg_list[i] != 0; i++)
+   sprintf(reg_offset + strlen(reg_offset), "0x%lx ", 
+adev->reset_dump_reg_list[i]);
+
+   sprintf(reg_offset + strlen(reg_offset), "\n");
+   len = strlen(reg_offset);
+
+   if (*pos >=  len)
+   return 0;
+
+   r = copy_to_user(buf, reg_offset, len);
+   *pos += len - r;
+   kfree(reg_offset);
+
+   return len - r;
+}
+
+static ssize_t amdgpu_reset_dump_register_list_write(struct file *f, const 
char __user *buf,
+   size_t size, loff_t *pos)
+{
+   struct amdgpu_device *adev = (struct amdgpu_device 
*)file_inode(f)->i_private;
+   char *reg_offset, *reg;
+   int ret, i = 0;
+
+   reg_offset = kmalloc(size, GFP_KERNEL);
+   memset(reg_offset,  0, size);
+   ret = copy_from_user(reg_offset, buf, size);
+
+   if (ret)
+   return -EFAULT;
+
+   while ((reg = strsep(&reg_offset, " ")) != NULL) {
+   ret  = kstrtol(reg, 16, &adev->reset_dump_reg_list[i]);
+   if (ret)
+   return -EINVAL;
+   i++;
+   }
+
+   kfree(reg_offset);
+
+   return size;
+}
+
+static const struct file_operations amdgpu_reset_dump_register_list = {
+   .owner = THIS_MODULE,
+   .read = amdgpu_reset_dump_register_list_read,
+   .write = amdgpu_reset_dump_register_list_write,
+   .llseek = default_llseek
+};
+
 int amdgpu_debugfs_init(struct amdgpu_device *adev)  {
struct dentry *root = adev_to_drm(adev)->primary->debugfs_root;
@@ -1672,6 +1730,8 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
&amdgpu_debugfs_test_ib_fops);
debugfs_create_file("amdgpu_vm_info", 0444, root, adev,
&amdgpu_debugfs_vm_info_fops);
+   debugfs_create_file("amdgpu_reset_dump_register_list", 0644, root, adev,
+   &amdgpu_reset_dump_register_list);
 
adev->debugfs_vbios_blob.data = adev->bios;
adev->debugfs_vbios_blob.size = adev->bios_size;
--
2.25.1



Re: [RFC v3 10/12] drm/amdgpu: Move in_gpu_reset into reset_domain

2022-02-08 Thread Lazar, Lijo




On 1/26/2022 4:07 AM, Andrey Grodzovsky wrote:

We should have a single instance per entire reset domain.

Signed-off-by: Andrey Grodzovsky 
Suggested-by: Lijo Lazar 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu.h|  7 ++-
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 10 +++---
  drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c  |  1 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h  |  1 +
  drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c  |  4 ++--
  drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c  |  4 ++--
  6 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index f021cd3c9d34..087796e389ab 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1056,7 +1056,6 @@ struct amdgpu_device {
boolin_s4;
boolin_s0ix;
  
-	atomic_t 			in_gpu_reset;

enum pp_mp1_state   mp1_state;
struct amdgpu_doorbell_index doorbell_index;
  
@@ -1461,8 +1460,6 @@ static inline bool amdgpu_is_tmz(struct amdgpu_device *adev)

 return adev->gmc.tmz_enabled;
  }
  
-static inline int amdgpu_in_reset(struct amdgpu_device *adev)

-{
-   return atomic_read(&adev->in_gpu_reset);
-}
+int amdgpu_in_reset(struct amdgpu_device *adev);
+
  #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 6991ab4a8191..aa43af443ebe 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3511,7 +3511,6 @@ int amdgpu_device_init(struct amdgpu_device *adev,
mutex_init(>mn_lock);
mutex_init(>virt.vf_errors.lock);
hash_init(adev->mn_hash);
-   atomic_set(&adev->in_gpu_reset, 0);
mutex_init(>psp.mutex);
mutex_init(>notifier_lock);
  
@@ -4775,7 +4774,7 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,

  static void amdgpu_device_lock_adev(struct amdgpu_device *adev,
struct amdgpu_hive_info *hive)
  {
-   atomic_set(>in_gpu_reset, 1);
+   atomic_set(&adev->reset_domain->in_gpu_reset, 1);
  
  	if (hive) {

down_write_nest_lock(&adev->reset_domain->sem, 
&hive->hive_lock);
@@ -4800,7 +4799,7 @@ static void amdgpu_device_unlock_adev(struct 
amdgpu_device *adev)
  {
amdgpu_vf_error_trans_all(adev);
adev->mp1_state = PP_MP1_STATE_NONE;
-   atomic_set(&adev->in_gpu_reset, 0);
+   atomic_set(&adev->reset_domain->in_gpu_reset, 0);
up_write(&adev->reset_domain->sem);
  }
  
@@ -5643,3 +5642,8 @@ void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
  
  	amdgpu_asic_invalidate_hdp(adev, ring);

  }
+
+int amdgpu_in_reset(struct amdgpu_device *adev)
+{
+   return atomic_read(&adev->reset_domain->in_gpu_reset);
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
index 011585e330f6..e9b804a89b34 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
@@ -127,6 +127,7 @@ struct amdgpu_reset_domain 
*amdgpu_reset_create_reset_domain(char *wq_name)
  
  	}
  
+	atomic_set(&reset_domain->in_gpu_reset, 0);

init_rwsem(&reset_domain->sem);
  
  	return reset_domain;

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
index 7451089b0c06..413982f4e1ce 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
@@ -74,6 +74,7 @@ struct amdgpu_reset_domain {
struct kref refcount;
struct workqueue_struct *wq;
struct rw_semaphore sem;
+   atomic_t in_gpu_reset;


Maybe 'active' (independent of gpu) just to indicate that a reset is 
ongoing in the domain?


Thanks,
Lijo


  };
  
  
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c

index 5dab06fce26a..6c79746d18db 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
@@ -258,7 +258,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct 
*work)
return;
  
  	amdgpu_virt_fini_data_exchange(adev);

-   atomic_set(&adev->in_gpu_reset, 1);
+   atomic_set(&adev->reset_domain->in_gpu_reset, 1);
  
  	xgpu_ai_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 0, 0, 0);
  
@@ -271,7 +271,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)

} while (timeout > 1);
  
  flr_done:

-   atomic_set(&adev->in_gpu_reset, 0);
+   atomic_set(&adev->reset_domain->in_gpu_reset, 0);
up_write(&adev->reset_domain->sem);
  
  	/* Trigger recovery for world switch failure if no TDR */

diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c 
b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
index 868144fff16a..39f7e1e9ab81 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
@@ -287,7 +287,7 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct 
*work)

Re: [PATCH 1/7] drm/selftests: Move i915 buddy selftests into drm

2022-02-08 Thread Matthew Auld

On 03/02/2022 13:32, Arunpravin wrote:

- move i915 buddy selftests into drm selftests folder
- add Makefile and Kconfig support
- add sanitycheck testcase

Prerequisites
- These series of selftests patches are created on top of
   drm buddy series
- Enable kselftests for DRM as a module in .config

Signed-off-by: Arunpravin 


At some point I guess we also want some IGT that picks this up? Like we 
do in tests/drm_mm.c? That way this can get picked up by CI?


Acked-by: Matthew Auld 


---
  drivers/gpu/drm/Kconfig   |  1 +
  drivers/gpu/drm/selftests/Makefile|  3 +-
  .../gpu/drm/selftests/drm_buddy_selftests.h   |  9 
  drivers/gpu/drm/selftests/test-drm_buddy.c| 49 +++
  4 files changed, 61 insertions(+), 1 deletion(-)
  create mode 100644 drivers/gpu/drm/selftests/drm_buddy_selftests.h
  create mode 100644 drivers/gpu/drm/selftests/test-drm_buddy.c

diff --git a/drivers/gpu/drm/Kconfig b/drivers/gpu/drm/Kconfig
index eb5a57ae3c5c..ff856df3f97f 100644
--- a/drivers/gpu/drm/Kconfig
+++ b/drivers/gpu/drm/Kconfig
@@ -71,6 +71,7 @@ config DRM_DEBUG_SELFTEST
select DRM_DP_HELPER
select DRM_LIB_RANDOM
select DRM_KMS_HELPER
+   select DRM_BUDDY
select DRM_EXPORT_FOR_TESTS if m
default n
help
diff --git a/drivers/gpu/drm/selftests/Makefile 
b/drivers/gpu/drm/selftests/Makefile
index 0856e4b12f70..5ba5f9138c95 100644
--- a/drivers/gpu/drm/selftests/Makefile
+++ b/drivers/gpu/drm/selftests/Makefile
@@ -4,4 +4,5 @@ test-drm_modeset-y := test-drm_modeset_common.o 
test-drm_plane_helper.o \
  test-drm_damage_helper.o test-drm_dp_mst_helper.o \
  test-drm_rect.o
  
-obj-$(CONFIG_DRM_DEBUG_SELFTEST) += test-drm_mm.o test-drm_modeset.o test-drm_cmdline_parser.o

+obj-$(CONFIG_DRM_DEBUG_SELFTEST) += test-drm_mm.o test-drm_modeset.o 
test-drm_cmdline_parser.o \
+   test-drm_buddy.o
diff --git a/drivers/gpu/drm/selftests/drm_buddy_selftests.h 
b/drivers/gpu/drm/selftests/drm_buddy_selftests.h
new file mode 100644
index ..a4bcf3a6dfe3
--- /dev/null
+++ b/drivers/gpu/drm/selftests/drm_buddy_selftests.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* List each unit test as selftest(name, function)
+ *
+ * The name is used as both an enum and expanded as igt__name to create
+ * a module parameter. It must be unique and legal for a C identifier.
+ *
+ * Tests are executed in order by igt/drm_buddy
+ */
+selftest(sanitycheck, igt_sanitycheck) /* keep first (selfcheck for igt) */
diff --git a/drivers/gpu/drm/selftests/test-drm_buddy.c 
b/drivers/gpu/drm/selftests/test-drm_buddy.c
new file mode 100644
index ..51e4d393d22c
--- /dev/null
+++ b/drivers/gpu/drm/selftests/test-drm_buddy.c
@@ -0,0 +1,49 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright © 2019 Intel Corporation
+ */
+
+#define pr_fmt(fmt) "drm_buddy: " fmt
+
+#include 
+
+#include 
+
+#include "../lib/drm_random.h"
+
+#define TESTS "drm_buddy_selftests.h"
+#include "drm_selftest.h"
+
+static unsigned int random_seed;
+
+static int igt_sanitycheck(void *ignored)
+{
+   pr_info("%s - ok!\n", __func__);
+   return 0;
+}
+
+#include "drm_selftest.c"
+
+static int __init test_drm_buddy_init(void)
+{
+   int err;
+
+   while (!random_seed)
+   random_seed = get_random_int();
+
+   pr_info("Testing DRM buddy manager (struct drm_buddy), with 
random_seed=0x%x\n",
+   random_seed);
+   err = run_selftests(selftests, ARRAY_SIZE(selftests), NULL);
+
+   return err > 0 ? 0 : err;
+}
+
+static void __exit test_drm_buddy_exit(void)
+{
+}
+
+module_init(test_drm_buddy_init);
+module_exit(test_drm_buddy_exit);
+
+MODULE_AUTHOR("Intel Corporation");
+MODULE_LICENSE("GPL");


Re: [PATCH 7/7] drm/selftests: add drm buddy pathological testcase

2022-02-08 Thread Matthew Auld

On 03/02/2022 13:32, Arunpravin wrote:

create a pot-sized mm, then allocate one of each possible
order within. This should leave the mm with exactly one
page left. Free the largest block, then whittle down again.
Eventually we will have a fully 50% fragmented mm.

Signed-off-by: Arunpravin 
---
  .../gpu/drm/selftests/drm_buddy_selftests.h   |   1 +
  drivers/gpu/drm/selftests/test-drm_buddy.c| 136 ++
  2 files changed, 137 insertions(+)

diff --git a/drivers/gpu/drm/selftests/drm_buddy_selftests.h 
b/drivers/gpu/drm/selftests/drm_buddy_selftests.h
index 411d072cbfc5..455b756c4ae5 100644
--- a/drivers/gpu/drm/selftests/drm_buddy_selftests.h
+++ b/drivers/gpu/drm/selftests/drm_buddy_selftests.h
@@ -12,3 +12,4 @@ selftest(buddy_alloc_range, igt_buddy_alloc_range)
  selftest(buddy_alloc_optimistic, igt_buddy_alloc_optimistic)
  selftest(buddy_alloc_pessimistic, igt_buddy_alloc_pessimistic)
  selftest(buddy_alloc_smoke, igt_buddy_alloc_smoke)
+selftest(buddy_alloc_pathological, igt_buddy_alloc_pathological)
diff --git a/drivers/gpu/drm/selftests/test-drm_buddy.c 
b/drivers/gpu/drm/selftests/test-drm_buddy.c
index 2074e8c050a4..b2d0313a4bc5 100644
--- a/drivers/gpu/drm/selftests/test-drm_buddy.c
+++ b/drivers/gpu/drm/selftests/test-drm_buddy.c
@@ -338,6 +338,142 @@ static void igt_mm_config(u64 *size, u64 *chunk_size)
*size = (u64)s << 12;
  }
  
+static int igt_buddy_alloc_pathological(void *arg)

+{
+   u64 mm_size, size, min_page_size, start = 0;
+   struct drm_buddy_block *block;
+   const int max_order = 3;
+   unsigned long flags = 0;
+   int order, top, err;
+   struct drm_buddy mm;
+   LIST_HEAD(blocks);
+   LIST_HEAD(holes);
+   LIST_HEAD(tmp);
+
+   /*
+* Create a pot-sized mm, then allocate one of each possible
+* order within. This should leave the mm with exactly one
+* page left. Free the largest block, then whittle down again.
+* Eventually we will have a fully 50% fragmented mm.
+*/
+
+   mm_size = PAGE_SIZE << max_order;
+   err = drm_buddy_init(&mm, mm_size, PAGE_SIZE);
+   if (err) {
+   pr_err("buddy_init failed(%d)\n", err);
+   return err;
+   }
+   BUG_ON(mm.max_order != max_order);
+
+   for (top = max_order; top; top--) {
+   /* Make room by freeing the largest allocated block */
+   block = list_first_entry_or_null(&blocks, typeof(*block), link);
+   if (block) {
+   list_del(&block->link);
+   drm_buddy_free_block(&mm, block);
+   }
+
+   for (order = top; order--; ) {
+   size = min_page_size = get_size(order, PAGE_SIZE);
+   err = drm_buddy_alloc_blocks(&mm, start, mm_size, size,
+min_page_size, &tmp, 
flags);
+   if (err) {
+   pr_info("buddy_alloc hit -ENOMEM with order=%d, 
top=%d\n",
+   order, top);
+   goto err;
+   }
+
+   block = list_first_entry_or_null(&tmp,
+struct drm_buddy_block,
+link);
+   if (!block) {
+   pr_err("alloc_blocks has no blocks\n");
+   err = -EINVAL;
+   goto err;
+   }
+
+   list_del(&block->link);
+   list_add_tail(&block->link, &blocks);
+   }
+
+   /* There should be one final page for this sub-allocation */
+   size = min_page_size = get_size(0, PAGE_SIZE);
+   err = drm_buddy_alloc_blocks(&mm, start, mm_size, size, 
min_page_size, &tmp, flags);
+   if (err) {
+   pr_info("buddy_alloc hit -ENOME for hole\n");


ENOMEM

Reviewed-by: Matthew Auld 


+   goto err;
+   }
+
+   block = list_first_entry_or_null(&tmp,
+struct drm_buddy_block,
+link);
+   if (!block) {
+   pr_err("alloc_blocks has no blocks\n");
+   err = -EINVAL;
+   goto err;
+   }
+
+   list_del(&block->link);
+   list_add_tail(&block->link, &holes);
+
+   size = min_page_size = get_size(top, PAGE_SIZE);
+   err = drm_buddy_alloc_blocks(&mm, start, mm_size, size, 
min_page_size, &tmp, flags);
+   if (!err) {
+   pr_info("buddy_alloc unexpectedly succeeded at top-order 
%d/%d, it should be full!",
+   top, max_order);
+   block = list_first_entry_or_null(&tmp,
+

Re: [PATCH 6/7] drm/selftests: add drm buddy smoke testcase

2022-02-08 Thread Matthew Auld

On 03/02/2022 13:32, Arunpravin wrote:

- add a test to ascertain that the critical functionalities
   of the program are working fine
- add a timeout helper function

Signed-off-by: Arunpravin 
---
  .../gpu/drm/selftests/drm_buddy_selftests.h   |   1 +
  drivers/gpu/drm/selftests/test-drm_buddy.c| 143 ++
  2 files changed, 144 insertions(+)

diff --git a/drivers/gpu/drm/selftests/drm_buddy_selftests.h 
b/drivers/gpu/drm/selftests/drm_buddy_selftests.h
index b14f04a1de19..411d072cbfc5 100644
--- a/drivers/gpu/drm/selftests/drm_buddy_selftests.h
+++ b/drivers/gpu/drm/selftests/drm_buddy_selftests.h
@@ -11,3 +11,4 @@ selftest(buddy_alloc_limit, igt_buddy_alloc_limit)
  selftest(buddy_alloc_range, igt_buddy_alloc_range)
  selftest(buddy_alloc_optimistic, igt_buddy_alloc_optimistic)
  selftest(buddy_alloc_pessimistic, igt_buddy_alloc_pessimistic)
+selftest(buddy_alloc_smoke, igt_buddy_alloc_smoke)
diff --git a/drivers/gpu/drm/selftests/test-drm_buddy.c 
b/drivers/gpu/drm/selftests/test-drm_buddy.c
index e97f583ed0cd..2074e8c050a4 100644
--- a/drivers/gpu/drm/selftests/test-drm_buddy.c
+++ b/drivers/gpu/drm/selftests/test-drm_buddy.c
@@ -7,6 +7,7 @@
  
  #include 

  #include 
+#include 
  
  #include 
  
@@ -15,6 +16,9 @@

  #define TESTS "drm_buddy_selftests.h"
  #include "drm_selftest.h"
  
+#define IGT_TIMEOUT(name__) \

+   unsigned long name__ = jiffies + MAX_SCHEDULE_TIMEOUT
+
  static unsigned int random_seed;
  
  static inline u64 get_size(int order, u64 chunk_size)

@@ -22,6 +26,26 @@ static inline u64 get_size(int order, u64 chunk_size)
return (1 << order) * chunk_size;
  }
  
+__printf(2, 3)

+static bool __igt_timeout(unsigned long timeout, const char *fmt, ...)
+{
+   va_list va;
+
+   if (!signal_pending(current)) {
+   cond_resched();
+   if (time_before(jiffies, timeout))
+   return false;
+   }
+
+   if (fmt) {
+   va_start(va, fmt);
+   vprintk(fmt, va);
+   va_end(va);
+   }
+
+   return true;
+}
+
  static inline const char *yesno(bool v)
  {
return v ? "yes" : "no";
@@ -314,6 +338,125 @@ static void igt_mm_config(u64 *size, u64 *chunk_size)
*size = (u64)s << 12;
  }
  
+static int igt_buddy_alloc_smoke(void *arg)

+{
+   u64 mm_size, min_page_size, chunk_size, start = 0;
+   unsigned long flags = 0;
+   struct drm_buddy mm;
+   int *order;
+   int err, i;
+
+   DRM_RND_STATE(prng, random_seed);
+   IGT_TIMEOUT(end_time);
+
+   igt_mm_config(&mm_size, &chunk_size);
+
+   err = drm_buddy_init(&mm, mm_size, chunk_size);
+   if (err) {
+   pr_err("buddy_init failed(%d)\n", err);
+   return err;
+   }
+
+   order = drm_random_order(mm.max_order + 1, &prng);
+   if (!order)
+   goto out_fini;
+
+   for (i = 0; i <= mm.max_order; ++i) {
+   struct drm_buddy_block *block;
+   int max_order = order[i];
+   bool timeout = false;
+   LIST_HEAD(blocks);
+   u64 total, size;
+   LIST_HEAD(tmp);
+   int order;
+
+   err = igt_check_mm(&mm);
+   if (err) {
+   pr_err("pre-mm check failed, abort\n");
+   break;
+   }
+
+   order = max_order;
+   total = 0;
+
+   do {
+retry:
+   size = min_page_size = get_size(order, chunk_size);
+   err = drm_buddy_alloc_blocks(&mm, start, mm_size, size,
+min_page_size, &tmp, 
flags);
+   if (err) {
+   if (err == -ENOMEM) {
+   pr_info("buddy_alloc hit -ENOMEM with 
order=%d\n",
+   order);
+   } else {
+   if (order--) {
+   err = 0;
+   goto retry;
+   }
+
+   pr_err("buddy_alloc with order=%d 
failed(%d)\n",
+  order, err);
+   }
+
+   break;
+   }
+
+   block = list_first_entry_or_null(&tmp,
+struct drm_buddy_block,
+link);
+   if (!block) {
+   pr_err("alloc_blocks has no blocks\n");
+   err = -EINVAL;
+   break;
+   }
+
+   list_del(&block->link);
+   list_add_tail(&block->link, &blocks);


Could just make this list_move_tail()? 
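
That is, the del/add pair collapses into a single call with the same 
semantics:

	list_move_tail(&block->link, &blocks);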

Re: [PATCH 5/7] drm/selftests: add drm buddy pessimistic testcase

2022-02-08 Thread Matthew Auld

On 03/02/2022 13:32, Arunpravin wrote:

create a pot-sized mm, then allocate one of each possible
order within. This should leave the mm with exactly one
page left.

Signed-off-by: Arunpravin 
---
  .../gpu/drm/selftests/drm_buddy_selftests.h   |   1 +
  drivers/gpu/drm/selftests/test-drm_buddy.c| 153 ++
  2 files changed, 154 insertions(+)

diff --git a/drivers/gpu/drm/selftests/drm_buddy_selftests.h 
b/drivers/gpu/drm/selftests/drm_buddy_selftests.h
index 21a6bd38864f..b14f04a1de19 100644
--- a/drivers/gpu/drm/selftests/drm_buddy_selftests.h
+++ b/drivers/gpu/drm/selftests/drm_buddy_selftests.h
@@ -10,3 +10,4 @@ selftest(sanitycheck, igt_sanitycheck) /* keep first 
(selfcheck for igt) */
  selftest(buddy_alloc_limit, igt_buddy_alloc_limit)
  selftest(buddy_alloc_range, igt_buddy_alloc_range)
  selftest(buddy_alloc_optimistic, igt_buddy_alloc_optimistic)
+selftest(buddy_alloc_pessimistic, igt_buddy_alloc_pessimistic)
diff --git a/drivers/gpu/drm/selftests/test-drm_buddy.c 
b/drivers/gpu/drm/selftests/test-drm_buddy.c
index b193d9556fb4..e97f583ed0cd 100644
--- a/drivers/gpu/drm/selftests/test-drm_buddy.c
+++ b/drivers/gpu/drm/selftests/test-drm_buddy.c
@@ -314,6 +314,159 @@ static void igt_mm_config(u64 *size, u64 *chunk_size)
*size = (u64)s << 12;
  }
  
+static int igt_buddy_alloc_pessimistic(void *arg)

+{
+   u64 mm_size, size, min_page_size, start = 0;
+   struct drm_buddy_block *block, *bn;
+   const unsigned int max_order = 16;
+   unsigned long flags = 0;
+   struct drm_buddy mm;
+   unsigned int order;
+   LIST_HEAD(blocks);
+   LIST_HEAD(tmp);
+   int err;
+
+   /*
+* Create a pot-sized mm, then allocate one of each possible
+* order within. This should leave the mm with exactly one
+* page left.
+*/
+
+   mm_size = PAGE_SIZE << max_order;
+   err = drm_buddy_init(&mm, mm_size, PAGE_SIZE);
+   if (err) {
+   pr_err("buddy_init failed(%d)\n", err);
+   return err;
+   }
+   BUG_ON(mm.max_order != max_order);
+
+   for (order = 0; order < max_order; order++) {
+   size = min_page_size = get_size(order, PAGE_SIZE);
+   err = drm_buddy_alloc_blocks(&mm, start, mm_size, size, 
min_page_size, &tmp, flags);
+   if (err) {
+   pr_info("buddy_alloc hit -ENOMEM with order=%d\n",
+   order);
+   goto err;
+   }
+
+   block = list_first_entry_or_null(&tmp,
+struct drm_buddy_block,
+link);
+   if (!block) {
+   pr_err("alloc_blocks has no blocks\n");
+   err = -EINVAL;
+   goto err;
+   }
+
+   list_del(&block->link);
+   list_add_tail(&block->link, &blocks);
+   }
+
+   /* And now the last remaining block available */
+   size = min_page_size = get_size(0, PAGE_SIZE);
+   err = drm_buddy_alloc_blocks(&mm, start, mm_size, size, min_page_size, 
&tmp, flags);
+   if (err) {
+   pr_info("buddy_alloc hit -ENOMEM on final alloc\n");
+   goto err;
+   }
+
+   block = list_first_entry_or_null(&tmp,
+struct drm_buddy_block,
+link);
+   if (!block) {
+   pr_err("alloc_blocks has no blocks\n");
+   err = -EINVAL;
+   goto err;
+   }
+
+   list_del(&block->link);
+   list_add_tail(&block->link, &blocks);
+
+   /* Should be completely full! */
+   for (order = max_order; order--; ) {
+   size = min_page_size = get_size(order, PAGE_SIZE);
+   err = drm_buddy_alloc_blocks(&mm, start, mm_size, size, 
min_page_size, &tmp, flags);
+   if (!err) {
+   pr_info("buddy_alloc unexpectedly succeeded at order %d, it 
should be full!",
+   order);
+   block = list_first_entry_or_null(&tmp,
+struct drm_buddy_block,
+link);
+   if (!block) {
+   pr_err("alloc_blocks has no blocks\n");
+   err = -EINVAL;
+   goto err;
+   }
+
+   list_del(&block->link);
+   list_add_tail(&block->link, &blocks);
+   err = -EINVAL;
+   goto err;
+   }
+   }
+
+   block = list_last_entry(&blocks, typeof(*block), link);
+   list_del(&block->link);
+   drm_buddy_free_block(&mm, block);
+
+   /* As we free in increasing size, we make available larger blocks */
+   order = 1;
+   list_for_each_entry_safe(block, bn, &blocks, link) {
+   list_del(&block->link);
+   

Re: [PATCH 4/7] drm/selftests: add drm buddy optimistic testcase

2022-02-08 Thread Matthew Auld

On 03/02/2022 13:32, Arunpravin wrote:

create a mm with one block of each order available, and
try to allocate them all.

Signed-off-by: Arunpravin 

Reviewed-by: Matthew Auld 


Re: [PATCH 3/7] drm/selftests: add drm buddy alloc range testcase

2022-02-08 Thread Matthew Auld

On 03/02/2022 13:32, Arunpravin wrote:

- add a test to check the range allocation
- export get_buddy() function in drm_buddy.c
- export drm_prandom_u32_max_state() in lib/drm_random.c
- include helper functions
- include prime number header file

Signed-off-by: Arunpravin 
---
  drivers/gpu/drm/drm_buddy.c   |  20 +-
  drivers/gpu/drm/lib/drm_random.c  |   3 +-
  drivers/gpu/drm/lib/drm_random.h  |   2 +
  .../gpu/drm/selftests/drm_buddy_selftests.h   |   1 +
  drivers/gpu/drm/selftests/test-drm_buddy.c| 390 ++
  include/drm/drm_buddy.h   |   3 +
  6 files changed, 414 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c
index 4845ef784b5e..501229d843c4 100644
--- a/drivers/gpu/drm/drm_buddy.c
+++ b/drivers/gpu/drm/drm_buddy.c
@@ -211,7 +211,7 @@ static int split_block(struct drm_buddy *mm,
  }
  
  static struct drm_buddy_block *

-get_buddy(struct drm_buddy_block *block)
+__get_buddy(struct drm_buddy_block *block)
  {
struct drm_buddy_block *parent;
  
@@ -225,6 +225,18 @@ get_buddy(struct drm_buddy_block *block)

return parent->left;
  }
  
+/**

+ * drm_get_buddy - get buddy address


Maybe add some more info here:

"Return the corresponding buddy block for @block, or NULL if this is a 
root block and can't be merged further. Requires some kind of locking to 
protect against any concurrent allocate and free operations."


?

Anyway,
Reviewed-by: Matthew Auld 



+ *
+ * @block: DRM buddy block
+ */
+struct drm_buddy_block *
+drm_get_buddy(struct drm_buddy_block *block)
+{
+   return __get_buddy(block);
+}
+EXPORT_SYMBOL(drm_get_buddy);
+
  static void __drm_buddy_free(struct drm_buddy *mm,
 struct drm_buddy_block *block)
  {
@@ -233,7 +245,7 @@ static void __drm_buddy_free(struct drm_buddy *mm,
while ((parent = block->parent)) {
struct drm_buddy_block *buddy;
  
-		buddy = get_buddy(block);

+   buddy = __get_buddy(block);
  
  		if (!drm_buddy_block_is_free(buddy))

break;
@@ -361,7 +373,7 @@ alloc_range_bias(struct drm_buddy *mm,
 * bigger is better, so make sure we merge everything back before we
 * free the allocated blocks.
 */
-   buddy = get_buddy(block);
+   buddy = __get_buddy(block);
if (buddy &&
(drm_buddy_block_is_free(block) &&
 drm_buddy_block_is_free(buddy)))
@@ -500,7 +512,7 @@ static int __alloc_range(struct drm_buddy *mm,
 * bigger is better, so make sure we merge everything back before we
 * free the allocated blocks.
 */
-   buddy = get_buddy(block);
+   buddy = __get_buddy(block);
if (buddy &&
(drm_buddy_block_is_free(block) &&
 drm_buddy_block_is_free(buddy)))
diff --git a/drivers/gpu/drm/lib/drm_random.c b/drivers/gpu/drm/lib/drm_random.c
index eeb155826d27..31b5a3e21911 100644
--- a/drivers/gpu/drm/lib/drm_random.c
+++ b/drivers/gpu/drm/lib/drm_random.c
@@ -7,10 +7,11 @@
  
  #include "drm_random.h"
  
-static inline u32 drm_prandom_u32_max_state(u32 ep_ro, struct rnd_state *state)

+u32 drm_prandom_u32_max_state(u32 ep_ro, struct rnd_state *state)
  {
return upper_32_bits((u64)prandom_u32_state(state) * ep_ro);
  }
+EXPORT_SYMBOL(drm_prandom_u32_max_state);
  
  void drm_random_reorder(unsigned int *order, unsigned int count,

struct rnd_state *state)
diff --git a/drivers/gpu/drm/lib/drm_random.h b/drivers/gpu/drm/lib/drm_random.h
index 4a3e94dfa0c0..5543bf0474bc 100644
--- a/drivers/gpu/drm/lib/drm_random.h
+++ b/drivers/gpu/drm/lib/drm_random.h
@@ -22,5 +22,7 @@ unsigned int *drm_random_order(unsigned int count,
  void drm_random_reorder(unsigned int *order,
unsigned int count,
struct rnd_state *state);
+u32 drm_prandom_u32_max_state(u32 ep_ro,
+ struct rnd_state *state);
  
  #endif /* !__DRM_RANDOM_H__ */

diff --git a/drivers/gpu/drm/selftests/drm_buddy_selftests.h 
b/drivers/gpu/drm/selftests/drm_buddy_selftests.h
index ebe16162762f..3230bfd2770b 100644
--- a/drivers/gpu/drm/selftests/drm_buddy_selftests.h
+++ b/drivers/gpu/drm/selftests/drm_buddy_selftests.h
@@ -8,3 +8,4 @@
   */
  selftest(sanitycheck, igt_sanitycheck) /* keep first (selfcheck for igt) */
  selftest(buddy_alloc_limit, igt_buddy_alloc_limit)
+selftest(buddy_alloc_range, igt_buddy_alloc_range)
diff --git a/drivers/gpu/drm/selftests/test-drm_buddy.c 
b/drivers/gpu/drm/selftests/test-drm_buddy.c
index fd7d1a112458..e347060c05a2 100644
--- a/drivers/gpu/drm/selftests/test-drm_buddy.c
+++ b/drivers/gpu/drm/selftests/test-drm_buddy.c
@@ -6,6 +6,7 @@
  #define pr_fmt(fmt) "drm_buddy: " fmt
  
  #include 

+#include 
  
  #include 
  
@@ -16,6 +17,395 @@
  
  static unsigned int random_seed;
  
+static inline const char *yesno(bool v)

+{
+  

Re: [PATCH 2/7] drm/selftests: add drm buddy alloc limit testcase

2022-02-08 Thread Matthew Auld

On 03/02/2022 13:32, Arunpravin wrote:

add a test to check the maximum allocation limit

Signed-off-by: Arunpravin 
---
  .../gpu/drm/selftests/drm_buddy_selftests.h   |  1 +
  drivers/gpu/drm/selftests/test-drm_buddy.c| 60 +++
  2 files changed, 61 insertions(+)

diff --git a/drivers/gpu/drm/selftests/drm_buddy_selftests.h 
b/drivers/gpu/drm/selftests/drm_buddy_selftests.h
index a4bcf3a6dfe3..ebe16162762f 100644
--- a/drivers/gpu/drm/selftests/drm_buddy_selftests.h
+++ b/drivers/gpu/drm/selftests/drm_buddy_selftests.h
@@ -7,3 +7,4 @@
   * Tests are executed in order by igt/drm_buddy
   */
  selftest(sanitycheck, igt_sanitycheck) /* keep first (selfcheck for igt) */
+selftest(buddy_alloc_limit, igt_buddy_alloc_limit)
diff --git a/drivers/gpu/drm/selftests/test-drm_buddy.c 
b/drivers/gpu/drm/selftests/test-drm_buddy.c
index 51e4d393d22c..fd7d1a112458 100644
--- a/drivers/gpu/drm/selftests/test-drm_buddy.c
+++ b/drivers/gpu/drm/selftests/test-drm_buddy.c
@@ -16,6 +16,66 @@
  
  static unsigned int random_seed;
  
+static int igt_buddy_alloc_limit(void *arg)

+{
+   u64 end, size = U64_MAX, start = 0;
+   struct drm_buddy_block *block;
+   unsigned long flags = 0;
+   LIST_HEAD(allocated);
+   struct drm_buddy mm;
+   int err;
+
+   size = end = round_down(size, 4096);
+   err = drm_buddy_init(&mm, size, PAGE_SIZE);
+   if (err)
+   return err;
+
+   if (mm.max_order != DRM_BUDDY_MAX_ORDER) {
+   pr_err("mm.max_order(%d) != %d\n",
+  mm.max_order, DRM_BUDDY_MAX_ORDER);
+   err = -EINVAL;
+   goto out_fini;
+   }
+
+   err = drm_buddy_alloc_blocks(&mm, start, end, size,
+PAGE_SIZE, &allocated, flags);
+
+   if (unlikely(err))
+   goto out_free;
+
+   block = list_first_entry_or_null(&allocated,
+struct drm_buddy_block,
+link);
+
+   if (!block)


err = -EINVAL;


+   goto out_fini;
+
+   if (drm_buddy_block_order(block) != mm.max_order) {
+   pr_err("block order(%d) != %d\n",
+  drm_buddy_block_order(block), mm.max_order);
+   err = -EINVAL;
+   goto out_free;
+   }
+
+   if (drm_buddy_block_size(&mm, block) !=
+   BIT_ULL(mm.max_order) * PAGE_SIZE) {
+   pr_err("block size(%llu) != %llu\n",
+  drm_buddy_block_size(&mm, block),
+  BIT_ULL(mm.max_order) * PAGE_SIZE);
+   err = -EINVAL;
+   goto out_free;
+   }
+
+   if (!err)


Always true AFAICT?


+   pr_info("%s - succeeded\n", __func__);


I guess this could be made part of the run_selftests()? It looks like it 
already prints the current test, perhaps that is already enough?


With the err = -EINVAL change, feel free to add,
Reviewed-by: Matthew Auld 


+
+out_free:
+   drm_buddy_free_list(&mm, &allocated);
+out_fini:
+   drm_buddy_fini(&mm);
+   return err;
+}
+
  static int igt_sanitycheck(void *ignored)
  {
pr_info("%s - ok!\n", __func__);


Re: Minimal GPU setup

2022-02-08 Thread Amol
Thank you Alex.

On 07/02/2022, Deucher, Alexander  wrote:
> [AMD Official Use Only]
>
> Most of the register programming in evergreen_gpu_init is required.  That
> code handles things like harvesting (e.g., disabling bad hardware resources)
> and setting sane asic specific settings in some registers.  If you don't do
> it, work may get scheduled to bad or incorrectly configured hardware blocks
> which will lead to hangs or corrupted results.  You can probably skip some
> of them, but I don't remember what is minimally required off hand.  It's
> generally a good idea to re-initialize those registers anyway in case
> someone has previously messed with them (e.g., manual register munging or
> GPU passed through to a VM etc.).

Understood.

>
> Posting the bios is enough to get you a working memory controller and enough
> asic setup to light up displays (basically what you need for pre-OS
> console).  As Christian mentioned, loading the ucodes will get the
> associated engines working so that you can start feeding commands to the
> GPU, but without proper configuration of the various hardware blocks on the
> GPU, you may not have success in feeding data to the GPU.

Understood. I think I wanted a confirmation that the steps I took so far are not
completely incorrect and may be just enough to see some GPU activity,
before I spend more effort programming other blocks. The feedback and a small
but working test help restore the motivation.

Thanks,
Amol

>
> Alex
>
>
> 
> From: amd-gfx  on behalf of Amol
> 
> Sent: Saturday, February 5, 2022 4:47 AM
> To: amd-gfx@lists.freedesktop.org 
> Subject: Minimal GPU setup
>
> Hello,
>
> I am learning to program Radeon HD 7350 by reading the radeon
> driver source in Linux, and the guides/manuals from AMD.
>
> I understand the general flow of initialization the driver performs. I
> have also been able to understand and re-implement the ATOM
> BIOS virtual machine.
>
> I am trying to program the device up from scratch (i.e. bare-metal).
> Do I need to perform all those steps that the driver does? Reading
> the evergreen_gpu_init function is demotivating; it initializes many
> fields and registers which I suspect may not be required for a minimal
> setup.
>
> Is posting the BIOS and loading the microcode enough to get me started
> with running basic tasks (DMA transfers, simple packet processing, etc.)?
>
> Thanks,
> Amol
>


Re: Minimal GPU setup

2022-02-08 Thread Amol
Thank you Christian.

On 06/02/2022, Christian König  wrote:
> Hi Amol,
>
> On 05.02.22 at 10:47, Amol wrote:

. . .

>> Is posting the BIOS and loading the microcode enough to get me started
>> with running basic tasks (DMA transfers, simple packet processing, etc.)?
>
> Well yes and no. As bare minimum you need the following:
> 1. Firmware loading
> 2. Memory management
> 3. Ring buffer setup
> 4. Hardware initialization
>
> When that is done you can write commands into the ring buffers of the CP
> or SDMA and see if they are executed (see the *_ring_test() functions in
> the driver). SDMA is usually easier to get working.

The DMA-ring-test of making the SDMA write into a WB location in the
system RAM succeeded.

The sequence followed mimics what the Linux driver does for the most part,
until evergreen_gpu_init. That and the portions of power mgmt, interrupt mgmt,
indirect buffer mgmt, the entire _modeset_init were skipped for now.

The WB and the CP, DMA ring buffers are PAGE_SIZE buffers in the system
RAM. The GTT is a 512-entry table in the BAR0 aperture, appropriately filled in 
to map the WB, CP and DMA buffers.
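
For reference, the test itself boils down to roughly the sketch below, 
loosely following evergreen_dma_ring_test() in the radeon driver; the 
packet encoding and the WB offset handling are simplified here and worth 
double-checking against the real function:

	/* Seed the write-back slot, ask the SDMA engine to overwrite it,
	 * then poll from the CPU until the magic value lands.
	 */
	u64 gpu_addr = rdev->wb.gpu_addr + R600_WB_DMA_RING_TEST_OFFSET;
	unsigned int i;

	rdev->wb.wb[R600_WB_DMA_RING_TEST_OFFSET / 4] = cpu_to_le32(0xCAFEDEAD);

	radeon_ring_lock(rdev, ring, 5);
	radeon_ring_write(ring, DMA_PACKET(DMA_PACKET_WRITE, 0, 0, 1));
	radeon_ring_write(ring, lower_32_bits(gpu_addr));
	radeon_ring_write(ring, upper_32_bits(gpu_addr) & 0xff);
	radeon_ring_write(ring, 0xDEADBEEF);
	radeon_ring_unlock_commit(rdev, ring, false);

	/* Busy-wait until the engine lands the write */
	for (i = 0; i < rdev->usec_timeout; i++) {
		if (le32_to_cpu(rdev->wb.wb[R600_WB_DMA_RING_TEST_OFFSET / 4]) ==
		    0xDEADBEEF)
			break;
		udelay(1);
	}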

>
> When you got that working you can worry about IB (indirect buffers)
> which are basically subroutines calls written into the ring buffers.
>
> Most commands (like copy from A to B, fill something, write value X to
> memory or write X into register Y) can be used from the ring buffers
> directly, but IIRC some context switching commands which are part of the
> rendering process require special handling.
>
> But keep in mind that all of this will just be horrible slow because the
> ASIC runs with the bootup clocks which are something like 100Mhz or even
> only 17Mhz on very old models. To change that you need to implement
> power management, interrupt handling etc etc

Understood. Yes, the DPM and the IH portions. I think by programming only
for the hardware I have I can manage to set them up with comparatively less
effort.

Thanks,
Amol

>
> Good luck,
> Christian.
>
>>
>> Thanks,
>> Amol
>
>


Re: [PATCH v4 2/2] drm/radeon/uvd: Fix forgotten unmap buffer objects

2022-02-08 Thread Christian König

I think so, Alex will probably pick that up.

Thanks,
Christian.

Am 08.02.22 um 09:28 schrieb zhanglianjie:
I am very sorry that I submitted it many times due to a character-encoding
problem. Can PATCH v4 be used?



I'm scratching my head over what you are doing here.

That's the fifth time you have sent out the same patch, so something is
going wrong here :)


Please double check why that lands in your outbox over and over again.

Regards,
Christian.

Am 08.02.22 um 09:14 schrieb zhanglianjie:
After the buffer object is successfully mapped, call radeon_bo_kunmap()
before the function returns.


Signed-off-by: zhanglianjie 
Reviewed-by: Christian König 

diff --git a/drivers/gpu/drm/radeon/radeon_uvd.c 
b/drivers/gpu/drm/radeon/radeon_uvd.c

index 377f9cdb5b53..0558d928d98d 100644
--- a/drivers/gpu/drm/radeon/radeon_uvd.c
+++ b/drivers/gpu/drm/radeon/radeon_uvd.c
@@ -497,6 +497,7 @@ static int radeon_uvd_cs_msg(struct 
radeon_cs_parser *p, struct radeon_bo *bo,

  handle = msg[2];

  if (handle == 0) {
+    radeon_bo_kunmap(bo);
  DRM_ERROR("Invalid UVD handle!\n");
  return -EINVAL;
  }
@@ -559,12 +560,10 @@ static int radeon_uvd_cs_msg(struct 
radeon_cs_parser *p, struct radeon_bo *bo,

  return 0;

  default:
-
  DRM_ERROR("Illegal UVD message type (%d)!\n", msg_type);
-    return -EINVAL;
  }

-    BUG();
+    radeon_bo_kunmap(bo);
  return -EINVAL;
  }

--
2.20.1


[PATCH v7 3/3] arm64: dts: mt8183: Add panel rotation

2022-02-08 Thread Hsin-Yi Wang
krane, kakadu, and kodama boards have a default panel rotation.

Signed-off-by: Hsin-Yi Wang 
Reviewed-by: Enric Balletbo i Serra 
Tested-by: Enric Balletbo i Serra 
---
 arch/arm64/boot/dts/mediatek/mt8183-kukui.dtsi | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/arm64/boot/dts/mediatek/mt8183-kukui.dtsi 
b/arch/arm64/boot/dts/mediatek/mt8183-kukui.dtsi
index b42d81d26d7211..d29d4378170971 100644
--- a/arch/arm64/boot/dts/mediatek/mt8183-kukui.dtsi
+++ b/arch/arm64/boot/dts/mediatek/mt8183-kukui.dtsi
@@ -276,6 +276,7 @@ panel: panel@0 {
avee-supply = <&ppvarp_lcd>;
pp1800-supply = <&pp1800_lcd>;
backlight = <&backlight_lcd0>;
+   rotation = <270>;
port {
panel_in: endpoint {
remote-endpoint = <&dsi_out>;
-- 
2.35.0.263.gb82422642f-goog



Re: [Intel-gfx] [PATCH v7 1/3] gpu: drm: separate panel orientation property creating and value setting

2022-02-08 Thread Hsin-Yi Wang
On Tue, Feb 8, 2022 at 3:52 PM Ville Syrjälä
 wrote:
>
> On Tue, Feb 08, 2022 at 03:37:12PM +0800, Hsin-Yi Wang wrote:
> > +int drm_connector_init_panel_orientation_property(
> > + struct drm_connector *connector)
> > +{
> > + struct drm_device *dev = connector->dev;
> > + struct drm_property *prop;
> > +
> > + prop = drm_property_create_enum(dev, DRM_MODE_PROP_IMMUTABLE,
> > + "panel orientation",
> > + drm_panel_orientation_enum_list,
> > + ARRAY_SIZE(drm_panel_orientation_enum_list));
> > + if (!prop)
> > + return -ENOMEM;
> > +
> > + dev->mode_config.panel_orientation_property = prop;
>
> Leak when called multiple times. I guess you could just put
> this into drm_connector_create_standard_properties() instead
> and avoid that issue entirely.
>
I'll add a check for dev->mode_config.panel_orientation_property to
avoid the leak issue if called multiple times.
If we add in drm_connector_create_standard_properties(), we still need
another function to attach the property earlier for bridge/connectors
that require this property, since not all bridge/connectors need this
property.
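
Roughly like this (a sketch of the planned v8 change on top of the hunk
above, not the final patch):

int drm_connector_init_panel_orientation_property(
	struct drm_connector *connector)
{
	struct drm_device *dev = connector->dev;
	struct drm_property *prop;

	/* The property already exists; creating it again would leak the
	 * first instance, so keep it and return early. */
	if (dev->mode_config.panel_orientation_property)
		return 0;

	prop = drm_property_create_enum(dev, DRM_MODE_PROP_IMMUTABLE,
			"panel orientation",
			drm_panel_orientation_enum_list,
			ARRAY_SIZE(drm_panel_orientation_enum_list));
	if (!prop)
		return -ENOMEM;

	dev->mode_config.panel_orientation_property = prop;

	return 0;
}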

> --
> Ville Syrjälä
> Intel


[PATCH v3 2/2] drm/radeon/uvd: Fix forgotten unmap buffer objects

2022-02-08 Thread zhanglianjie
After the buffer object is successfully mapped, call radeon_bo_kunmap()
before the function returns.

Signed-off-by: zhanglianjie 
Reviewed-by: Christian König 

diff --git a/drivers/gpu/drm/radeon/radeon_uvd.c 
b/drivers/gpu/drm/radeon/radeon_uvd.c
index 377f9cdb5b53..0558d928d98d 100644
--- a/drivers/gpu/drm/radeon/radeon_uvd.c
+++ b/drivers/gpu/drm/radeon/radeon_uvd.c
@@ -497,6 +497,7 @@ static int radeon_uvd_cs_msg(struct radeon_cs_parser *p, 
struct radeon_bo *bo,
handle = msg[2];

if (handle == 0) {
+   radeon_bo_kunmap(bo);
DRM_ERROR("Invalid UVD handle!\n");
return -EINVAL;
}
@@ -559,12 +560,10 @@ static int radeon_uvd_cs_msg(struct radeon_cs_parser *p, 
struct radeon_bo *bo,
return 0;

default:
-
DRM_ERROR("Illegal UVD message type (%d)!\n", msg_type);
-   return -EINVAL;
}

-   BUG();
+   radeon_bo_kunmap(bo);
return -EINVAL;
 }

--
2.20.1



[PATCH v7 2/3] drm/mediatek: init panel orientation property

2022-02-08 Thread Hsin-Yi Wang
Init the panel orientation property after the connector is initialized. Let
the panel driver decide the orientation value later.

Signed-off-by: Hsin-Yi Wang 
Acked-by: Chun-Kuang Hu 
---
 drivers/gpu/drm/mediatek/mtk_dsi.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/drivers/gpu/drm/mediatek/mtk_dsi.c 
b/drivers/gpu/drm/mediatek/mtk_dsi.c
index 5d90d2eb001935..491bf5b0a2b984 100644
--- a/drivers/gpu/drm/mediatek/mtk_dsi.c
+++ b/drivers/gpu/drm/mediatek/mtk_dsi.c
@@ -965,6 +965,13 @@ static int mtk_dsi_encoder_init(struct drm_device *drm, 
struct mtk_dsi *dsi)
ret = PTR_ERR(dsi->connector);
goto err_cleanup_encoder;
}
+
+   ret = drm_connector_init_panel_orientation_property(dsi->connector);
+   if (ret) {
+   DRM_ERROR("Unable to init panel orientation\n");
+   goto err_cleanup_encoder;
+   }
+
drm_connector_attach_encoder(dsi->connector, &dsi->encoder);
 
return 0;
-- 
2.35.0.263.gb82422642f-goog



Re: [PATCH 2/2] drm/radeon/uvd: Fix forgotten unmap buffer objects

2022-02-08 Thread zhanglianjie

Hi,
Thanks for your review. I have resubmitted, see 
https://lkml.org/lkml/2022/2/7/2014





Am 29.01.22 um 08:35 schrieb zhanglianjie:
After the buffer object is successfully mapped, call radeon_bo_kunmap()
before the function returns.


Signed-off-by: zhanglianjie 

diff --git a/drivers/gpu/drm/radeon/radeon_uvd.c 
b/drivers/gpu/drm/radeon/radeon_uvd.c

index 377f9cdb5b53..c5482f7793db 100644
--- a/drivers/gpu/drm/radeon/radeon_uvd.c
+++ b/drivers/gpu/drm/radeon/radeon_uvd.c
@@ -497,6 +497,7 @@ static int radeon_uvd_cs_msg(struct 
radeon_cs_parser *p, struct radeon_bo *bo,

  handle = msg[2];

  if (handle == 0) {
+    radeon_bo_kunmap(bo);
  DRM_ERROR("Invalid UVD handle!\n");
  return -EINVAL;
  }
@@ -559,11 +560,10 @@ static int radeon_uvd_cs_msg(struct 
radeon_cs_parser *p, struct radeon_bo *bo,

  return 0;

  default:
-
  DRM_ERROR("Illegal UVD message type (%d)!\n", msg_type);
-    return -EINVAL;
  }

+    radeon_bo_kunmap(bo);
  BUG();


That looks like it will trigger this BUG() now. Please also remove that 
line.


Apart from that the patches look good to me.

Regards,
Christian.


  return -EINVAL;
  }
--
2.20.1


[PATCH v2 2/2] drm/radeon/uvd: Fix forgotten unmap buffer objects

2022-02-08 Thread zhanglianjie
After the buffer object is successfully mapped, call radeon_bo_kunmap()
before the function returns.

Signed-off-by: zhanglianjie 

diff --git a/drivers/gpu/drm/radeon/radeon_uvd.c 
b/drivers/gpu/drm/radeon/radeon_uvd.c
index 377f9cdb5b53..0558d928d98d 100644
--- a/drivers/gpu/drm/radeon/radeon_uvd.c
+++ b/drivers/gpu/drm/radeon/radeon_uvd.c
@@ -497,6 +497,7 @@ static int radeon_uvd_cs_msg(struct radeon_cs_parser *p, 
struct radeon_bo *bo,
handle = msg[2];

if (handle == 0) {
+   radeon_bo_kunmap(bo);
DRM_ERROR("Invalid UVD handle!\n");
return -EINVAL;
}
@@ -559,12 +560,10 @@ static int radeon_uvd_cs_msg(struct radeon_cs_parser *p, 
struct radeon_bo *bo,
return 0;

default:
-
DRM_ERROR("Illegal UVD message type (%d)!\n", msg_type);
-   return -EINVAL;
}

-   BUG();
+   radeon_bo_kunmap(bo);
return -EINVAL;
 }

--
2.20.1


[PATCH v7 1/3] gpu: drm: separate panel orientation property creating and value setting

2022-02-08 Thread Hsin-Yi Wang
drm_dev_register() sets connector->registration_state to
DRM_CONNECTOR_REGISTERED and dev->registered to true. If
drm_connector_set_panel_orientation() is first called after
drm_dev_register(), it will fail several checks and results in following
warning.

Add a function to create the panel orientation property and set the default
value to UNKNOWN, so drivers can call this function to init the property
earlier and let the panel set the real value later.

[4.480976] [ cut here ]
[4.485603] WARNING: CPU: 5 PID: 369 at drivers/gpu/drm/drm_mode_object.c:45 
__drm_mode_object_add+0xb4/0xbc

[4.609772] Call trace:
[4.612208]  __drm_mode_object_add+0xb4/0xbc
[4.616466]  drm_mode_object_add+0x20/0x2c
[4.620552]  drm_property_create+0xdc/0x174
[4.624723]  drm_property_create_enum+0x34/0x98
[4.629241]  drm_connector_set_panel_orientation+0x64/0xa0
[4.634716]  boe_panel_get_modes+0x88/0xd8
[4.638802]  drm_panel_get_modes+0x2c/0x48
[4.642887]  panel_bridge_get_modes+0x1c/0x28
[4.647233]  drm_bridge_connector_get_modes+0xa0/0xd4
[4.652273]  drm_helper_probe_single_connector_modes+0x218/0x700
[4.658266]  drm_mode_getconnector+0x1b4/0x45c
[4.662699]  drm_ioctl_kernel+0xac/0x128
[4.11]  drm_ioctl+0x268/0x410
[4.670002]  drm_compat_ioctl+0xdc/0xf0
[4.673829]  __arm64_compat_sys_ioctl+0xc8/0x100
[4.678436]  el0_svc_common+0xf4/0x1c0
[4.682174]  do_el0_svc_compat+0x28/0x3c
[4.686088]  el0_svc_compat+0x10/0x1c
[4.689738]  el0_sync_compat_handler+0xa8/0xcc
[4.694171]  el0_sync_compat+0x178/0x180
[4.698082] ---[ end trace b4f2db9d9c88610b ]---
[4.702721] [ cut here ]
[4.707329] WARNING: CPU: 5 PID: 369 at 
drivers/gpu/drm/drm_mode_object.c:243 drm_object_attach_property+0x48/0xb8

[4.833830] Call trace:
[4.836266]  drm_object_attach_property+0x48/0xb8
[4.840958]  drm_connector_set_panel_orientation+0x84/0xa0
[4.846432]  boe_panel_get_modes+0x88/0xd8
[4.850516]  drm_panel_get_modes+0x2c/0x48
[4.854600]  panel_bridge_get_modes+0x1c/0x28
[4.858946]  drm_bridge_connector_get_modes+0xa0/0xd4
[4.863984]  drm_helper_probe_single_connector_modes+0x218/0x700
[4.869978]  drm_mode_getconnector+0x1b4/0x45c
[4.874410]  drm_ioctl_kernel+0xac/0x128
[4.878320]  drm_ioctl+0x268/0x410
[4.881711]  drm_compat_ioctl+0xdc/0xf0
[4.885536]  __arm64_compat_sys_ioctl+0xc8/0x100
[4.890142]  el0_svc_common+0xf4/0x1c0
[4.893879]  do_el0_svc_compat+0x28/0x3c
[4.897791]  el0_svc_compat+0x10/0x1c
[4.901441]  el0_sync_compat_handler+0xa8/0xcc
[4.905873]  el0_sync_compat+0x178/0x180
[4.909783] ---[ end trace b4f2db9d9c88610c ]---

Signed-off-by: Hsin-Yi Wang 
Reviewed-by: Sean Paul 
---
v6 -> v7:
- Rebase to latest drm-misc.
- Add function for amdgpu_dm.
---
 .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c |  1 +
 drivers/gpu/drm/drm_connector.c   | 58 ++-
 drivers/gpu/drm/i915/display/icl_dsi.c|  1 +
 drivers/gpu/drm/i915/display/intel_dp.c   |  1 +
 drivers/gpu/drm/i915/display/vlv_dsi.c|  1 +
 include/drm/drm_connector.h   |  2 +
 6 files changed, 48 insertions(+), 16 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c 
b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index e8a994559b6580..3eb0be187292ff 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -8105,6 +8105,7 @@ static void amdgpu_set_panel_orientation(struct 
drm_connector *connector)
if (native_mode->hdisplay == 0 || native_mode->vdisplay == 0)
return;
 
+   drm_connector_init_panel_orientation_property(connector);
	drm_connector_set_panel_orientation_with_quirk(connector,
						       DRM_MODE_PANEL_ORIENTATION_UNKNOWN,
						       native_mode->hdisplay,
						       native_mode->vdisplay);
diff --git a/drivers/gpu/drm/drm_connector.c b/drivers/gpu/drm/drm_connector.c
index a50c82bc2b2fec..041e496c8f15a9 100644
--- a/drivers/gpu/drm/drm_connector.c
+++ b/drivers/gpu/drm/drm_connector.c
@@ -1252,7 +1252,7 @@ static const struct drm_prop_enum_list dp_colorspaces[] = 
{
  * INPUT_PROP_DIRECT) will still map 1:1 to the actual LCD panel
  * coordinates, so if userspace rotates the picture to adjust for
  * the orientation it must also apply the same transformation to the
- * touchscreen input coordinates. This property is initialized by calling
+ * touchscreen input coordinates. This property value is set by calling
  * drm_connector_set_panel_orientation() or
  * drm_connector_set_panel_orientation_with_quirk()
  *
@@ -2341,8 +2341,8 @@ EXPORT_SYMBOL(drm_connector_set_vrr_capable_property);
  * @connector: connector for which to set the panel-orientation property.
  * @panel_orientation: 

[RFC] Upstreaming Linux for Nintendo Wii U

2022-02-08 Thread Ash Logan
Hello,

I'm the lead dev on a downstream kernel with support for the Wii U[1],
Nintendo's previous-gen game console. You might have seen Emmanuel
 submitting some of the more self-contained
drivers recently[2][3]. I've gotten to the point where I'd like to look
at upstreaming the platform. Since we need to refactor all the patches
for upstreaming anyway, I thought it would be good to talk to the
experts ahead of time ;)

Some quick details about the platform:
- Tri-core PowerPC "Espresso" (750CL) @ 1.24GHz
- 2GiB DDR3-1600 (and a little over 32MiB of SRAM)
- "Latte" proprietary SoC with USB, SDIO, SATA, crypto, ARM9
coprocessor, Radeon R7xx GPU
- Curiously, the entire graphics pipeline from the original Wii, usually
powered off

The bulk of the interesting work for Linux is in the SoC, which is
pretty similar to the original Wii's in layout (we expect to share a lot
of drivers), with the addition of some more modern blocks.

The state of the downstream work:
- Basic platform init works; "easy" drivers like SDIO, SATA, accelerated
cryptography, etc. are all here - some are even upstreamed already.
- Bootloader duties are performed by linux-loader[4], a small firmware
for the ARM coprocessor that idles once Linux starts.
- linux-loader handles a dtbImage right now and has a hardcoded memory
area to pass commandline parameters, parsed from a config file. I don't
expect that to be acceptable; eventually I'd like to move it to loading
vmlinuz directly and pulling the dtb off the SD card, similar to the
Raspberry Pi. Alternatively, petitboot, but kexec doesn't seem to work
right now.
- Linux itself runs tolerably (though given the hardware it should be
faster), with framebuffer graphics and basic support for most hardware,
with the notable exceptions of the WiFi card and the GPU.
- No SMP - will cover this later.

That's about the state of things. I'm not sure how much is or isn't
upstreamable, but right now I'm only thinking about getting the basic
platform support up and some core hardware. On that front, there are a
few decisions that need to be made and help that needs to be had, which
is where I hope you all can give some insight:

- USB only works with patches to the USB core[5] that appear to have
failed upstreaming before[6]. I don't really understand these well
enough to say what particular hardware restriction they're working
around. I do know that there's a curious restriction on DMA addressing
where most SoC blocks (including USB) can't see the SRAM at address 0,
but we worked around this using reserved-mem in the devicetree. Almost
all of the peripherals on Wii U are connected over USB, so having a
working stack is pretty important.
- The Radeon, despite being a mostly standard card, has its GPUF0MMReg
area mapped into the SoC's mmio, with no PCI bus in sight. The Linux
drivers (radeon, too old for amdgpu) seem to expect PCI, so some pretty
extensive patching would be needed to get that moving - not to mention
things like the proprietary HDMI encoder, which seems similar to the
PS4's[7]. Downstream, we have an fbdev driver, which I obviously don't
expect to get accepted.
- Both of those issues together mean I'm not convinced an initial port
would have any viable output device. I would like to fix USB, though
barring that we could use a flat framebuffer that linux-loader leaves
enabled.
- Right now I've made a new platform (like ps3) rather than joining the
GameCube and Wii in embedded6xx, since that is marked as BROKEN_ON_SMP.
The Wii U is a 3-core system, though a CPU bug[8] prevents existing
userspaces from working with it. Bit of a "cross that bridge when we get
there" situation, though I'm reluctant to prevent that possibility by
using a BROKEN_ON_SMP platform.
- Like the Wii before it, the Wii U has a small amount of RAM at address
zero, a gap, then a large amount of RAM at a higher address. Instead of
the "map everything and reserve the gap" approach of the Wii, we loop
over each memblock and map only true RAM[9] (see the sketch after this
list). This seems to work, but as
far as I can tell is unique amongst powerpc32 platforms, so it's worth
pointing out. (Note: I've been told this doesn't work anymore after some
KUAP changes[10], so this point might be moot; haven't investigated)
- Due to the aforementioned DMA restrictions and possibly a fatal
bytemasking bug on uncached mappings[11], I have been wondering if it'd
be better to just give up on the SRAM at address 0 altogether and use it
as VRAM or something, loading the kernel at a higher address.
- Like the Wii, the Wii U also takes a bit of a loose approach to cache
coherency, and has several SoC peripherals with big-endian registers,
requiring driver patching. USB already has devicetree quirks, but others
require more drastic measures. I expect we'll take that on a
driver-by-driver basis.
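
The memblock walk mentioned in the memory-map item above boils down to
something like this (a sketch using the generic memblock iterator;
wiiu_mapin_ram_chunk() is a hypothetical stand-in for the platform's actual
BAT/page-table mapping helper):

#include <linux/memblock.h>

static void __init wiiu_mapin_ram(void)
{
	phys_addr_t start, end;
	u64 i;

	/* Map each true RAM block; the gap between the low and high RAM
	 * regions is simply never touched, unlike the Wii's
	 * map-everything-and-reserve approach. */
	for_each_mem_range(i, &start, &end)
		wiiu_mapin_ram_chunk(start, end);
}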

In terms of platform bringup, the key issue is whether to be embedded6xx
or not and what output device to use. Beyond that it's just things like
IRQ controller drivers, should be pretty 


[PATCH v4 2/2] drm/radeon/uvd: Fix forgotten unmap buffer objects

2022-02-08 Thread zhanglianjie
After the buffer object is successfully mapped, call radeon_bo_kunmap()
before the function returns.

Signed-off-by: zhanglianjie 
Reviewed-by: Christian König 

diff --git a/drivers/gpu/drm/radeon/radeon_uvd.c 
b/drivers/gpu/drm/radeon/radeon_uvd.c
index 377f9cdb5b53..0558d928d98d 100644
--- a/drivers/gpu/drm/radeon/radeon_uvd.c
+++ b/drivers/gpu/drm/radeon/radeon_uvd.c
@@ -497,6 +497,7 @@ static int radeon_uvd_cs_msg(struct radeon_cs_parser *p, 
struct radeon_bo *bo,
handle = msg[2];

if (handle == 0) {
+   radeon_bo_kunmap(bo);
DRM_ERROR("Invalid UVD handle!\n");
return -EINVAL;
}
@@ -559,12 +560,10 @@ static int radeon_uvd_cs_msg(struct radeon_cs_parser *p, 
struct radeon_bo *bo,
return 0;

default:
-
DRM_ERROR("Illegal UVD message type (%d)!\n", msg_type);
-   return -EINVAL;
}

-   BUG();
+   radeon_bo_kunmap(bo);
return -EINVAL;
 }

--
2.20.1


Re: [PATCH 1/2] drm/amdgpu: add debugfs for reset registers list

2022-02-08 Thread Christian König

Am 08.02.22 um 09:16 schrieb Somalapuram Amaranath:

List of registers to be populated for dump collection during a GPU reset.

Signed-off-by: Somalapuram Amaranath 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu.h |  3 ++
  drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 60 +
  2 files changed, 63 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index b85b67a88a3d..78fa46f959c5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1097,6 +1097,9 @@ struct amdgpu_device {
  
  	struct amdgpu_reset_control *reset_cntl;

uint32_t ip_versions[HW_ID_MAX][HWIP_MAX_INSTANCE];
+
+   /* reset dump register */
+   long reset_dump_reg_list[128];


I don't have time for a full review, but using long here certainly makes 
no sense.


long is either 32bit or 64bit depending on the CPU architecture.
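
For example (just a sketch): a fixed-width type keeps the structure layout
identical on every architecture, and the parsing side would then use
kstrtou32() instead of kstrtol():

	/* reset dump register */
	uint32_t	reset_dump_reg_list[128];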

Regards,
Christian.


  };
  
  static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 164d6a9e9fbb..dad268e8a81a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -1609,6 +1609,64 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_ib_preempt, NULL,
  DEFINE_DEBUGFS_ATTRIBUTE(fops_sclk_set, NULL,
amdgpu_debugfs_sclk_set, "%llu\n");
  
+static ssize_t amdgpu_reset_dump_register_list_read(struct file *f,

+   char __user *buf, size_t size, loff_t *pos)
+{
+   struct amdgpu_device *adev = (struct amdgpu_device 
*)file_inode(f)->i_private;
+   char *reg_offset;
+   int i, r, len;
+
+   reg_offset = kmalloc(2048, GFP_KERNEL);
+   memset(reg_offset,  0, 2048);
+   for (i = 0; adev->reset_dump_reg_list[i] != 0; i++)
+   sprintf(reg_offset + strlen(reg_offset), "0x%lx ", 
adev->reset_dump_reg_list[i]);
+
+   sprintf(reg_offset + strlen(reg_offset), "\n");
+   len = strlen(reg_offset);
+
+   if (*pos >=  len)
+   return 0;
+
+   r = copy_to_user(buf, reg_offset, len);
+   *pos += len - r;
+   kfree(reg_offset);
+
+   return len - r;
+}
+
+static ssize_t amdgpu_reset_dump_register_list_write(struct file *f, const 
char __user *buf,
+   size_t size, loff_t *pos)
+{
+   struct amdgpu_device *adev = (struct amdgpu_device 
*)file_inode(f)->i_private;
+   char *reg_offset, *reg;
+   int ret, i = 0;
+
+   reg_offset = kmalloc(size, GFP_KERNEL);
+   memset(reg_offset,  0, size);
+   ret = copy_from_user(reg_offset, buf, size);
+
+   if (ret)
+   return -EFAULT;
+
+   while ((reg = strsep(&reg_offset, " ")) != NULL) {
+   ret  = kstrtol(reg, 16, &adev->reset_dump_reg_list[i]);
+   if (ret)
+   return -EINVAL;
+   i++;
+   }
+
+   kfree(reg_offset);
+
+   return size;
+}
+
+static const struct file_operations amdgpu_reset_dump_register_list = {
+   .owner = THIS_MODULE,
+   .read = amdgpu_reset_dump_register_list_read,
+   .write = amdgpu_reset_dump_register_list_write,
+   .llseek = default_llseek
+};
+
  int amdgpu_debugfs_init(struct amdgpu_device *adev)
  {
struct dentry *root = adev_to_drm(adev)->primary->debugfs_root;
@@ -1672,6 +1730,8 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
&amdgpu_debugfs_test_ib_fops);
debugfs_create_file("amdgpu_vm_info", 0444, root, adev,
&amdgpu_debugfs_vm_info_fops);
+   debugfs_create_file("amdgpu_reset_dump_register_list", 0644, root, adev,
+   &amdgpu_reset_dump_register_list);
  
  	adev->debugfs_vbios_blob.data = adev->bios;

adev->debugfs_vbios_blob.size = adev->bios_size;




Re: [PATCH v4 2/2] drm/radeon/uvd: Fix forgotten unmap buffer objects

2022-02-08 Thread Christian König

I'm scratching my head over what you are doing here.

That's the fifth time you have sent out the same patch, so something is
going wrong here :)


Please double check why that lands in your outbox over and over again.

Regards,
Christian.

Am 08.02.22 um 09:14 schrieb zhanglianjie:

After the buffer object is successfully mapped, call radeon_bo_kunmap()
before the function returns.

Signed-off-by: zhanglianjie 
Reviewed-by: Christian König 

diff --git a/drivers/gpu/drm/radeon/radeon_uvd.c 
b/drivers/gpu/drm/radeon/radeon_uvd.c
index 377f9cdb5b53..0558d928d98d 100644
--- a/drivers/gpu/drm/radeon/radeon_uvd.c
+++ b/drivers/gpu/drm/radeon/radeon_uvd.c
@@ -497,6 +497,7 @@ static int radeon_uvd_cs_msg(struct radeon_cs_parser *p, 
struct radeon_bo *bo,
handle = msg[2];

if (handle == 0) {
+   radeon_bo_kunmap(bo);
DRM_ERROR("Invalid UVD handle!\n");
return -EINVAL;
}
@@ -559,12 +560,10 @@ static int radeon_uvd_cs_msg(struct radeon_cs_parser *p, 
struct radeon_bo *bo,
return 0;

default:
-
DRM_ERROR("Illegal UVD message type (%d)!\n", msg_type);
-   return -EINVAL;
}

-   BUG();
+   radeon_bo_kunmap(bo);
return -EINVAL;
  }

--
2.20.1


[PATCH 2/2] drm/amdgpu: add reset register trace function on GPU reset

2022-02-08 Thread Somalapuram Amaranath
Dump the list of register values to a trace event on GPU reset.

Signed-off-by: Somalapuram Amaranath 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 21 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h  | 19 +++
 2 files changed, 39 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 1e651b959141..057922fb7e37 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4534,6 +4534,23 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device 
*adev,
return r;
 }
 
+static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
+{
+   int i;
+   uint32_t reg_value[128];
+
+   for (i = 0; adev->reset_dump_reg_list[i] != 0; i++) {
+   if (adev->asic_type >= CHIP_NAVI10)
+   reg_value[i] = RREG32_SOC15_IP(GC, 
adev->reset_dump_reg_list[i]);
+   else
+   reg_value[i] = RREG32(adev->reset_dump_reg_list[i]);
+   }
+
+   trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list, reg_value, i);
+
+   return 0;
+}
+
 int amdgpu_do_asic_reset(struct list_head *device_list_handle,
 struct amdgpu_reset_context *reset_context)
 {
@@ -4567,8 +4584,10 @@ int amdgpu_do_asic_reset(struct list_head 
*device_list_handle,
tmp_adev->gmc.xgmi.pending_reset = false;
if (!queue_work(system_unbound_wq, 
&tmp_adev->xgmi_reset_work))
r = -EALREADY;
-   } else
+   } else {
+   amdgpu_reset_reg_dumps(tmp_adev);
r = amdgpu_asic_reset(tmp_adev);
+   }
 
if (r) {
dev_err(tmp_adev->dev, "ASIC reset failed with 
error, %d for drm dev, %s",
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
index d855cb53c7e0..3fe33de3564a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
@@ -537,6 +537,25 @@ TRACE_EVENT(amdgpu_ib_pipe_sync,
  __entry->seqno)
 );
 
+TRACE_EVENT(amdgpu_reset_reg_dumps,
+   TP_PROTO(long *address, uint32_t *value, int length),
+   TP_ARGS(address, value, length),
+   TP_STRUCT__entry(
+__array(long, address, 128)
+__array(uint32_t, value, 128)
+__field(int, len)
+),
+   TP_fast_assign(
+  memcpy(__entry->address, address, 128);
+  memcpy(__entry->value,  value, 128);
+  __entry->len = length;
+  ),
+   TP_printk("amdgpu register dump offset: %s value: %s ",
+ __print_array(__entry->address, __entry->len, 8),
+ __print_array(__entry->value, __entry->len, 8)
+)
+);
+
 #undef AMDGPU_JOB_GET_TIMELINE_NAME
 #endif
 
-- 
2.25.1



[PATCH 1/2] drm/amdgpu: add debugfs for reset registers list

2022-02-08 Thread Somalapuram Amaranath
List of registers to be populated for dump collection during a GPU reset.

Signed-off-by: Somalapuram Amaranath 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h |  3 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 60 +
 2 files changed, 63 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index b85b67a88a3d..78fa46f959c5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1097,6 +1097,9 @@ struct amdgpu_device {
 
struct amdgpu_reset_control *reset_cntl;
uint32_t ip_versions[HW_ID_MAX][HWIP_MAX_INSTANCE];
+
+   /* reset dump register */
+   long reset_dump_reg_list[128];
 };
 
 static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 164d6a9e9fbb..dad268e8a81a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -1609,6 +1609,64 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_ib_preempt, NULL,
 DEFINE_DEBUGFS_ATTRIBUTE(fops_sclk_set, NULL,
amdgpu_debugfs_sclk_set, "%llu\n");
 
+static ssize_t amdgpu_reset_dump_register_list_read(struct file *f,
+   char __user *buf, size_t size, loff_t *pos)
+{
+   struct amdgpu_device *adev = (struct amdgpu_device 
*)file_inode(f)->i_private;
+   char *reg_offset;
+   int i, r, len;
+
+   reg_offset = kmalloc(2048, GFP_KERNEL);
+   memset(reg_offset,  0, 2048);
+   for (i = 0; adev->reset_dump_reg_list[i] != 0; i++)
+   sprintf(reg_offset + strlen(reg_offset), "0x%lx ", 
adev->reset_dump_reg_list[i]);
+
+   sprintf(reg_offset + strlen(reg_offset), "\n");
+   len = strlen(reg_offset);
+
+   if (*pos >=  len)
+   return 0;
+
+   r = copy_to_user(buf, reg_offset, len);
+   *pos += len - r;
+   kfree(reg_offset);
+
+   return len - r;
+}
+
+static ssize_t amdgpu_reset_dump_register_list_write(struct file *f, const 
char __user *buf,
+   size_t size, loff_t *pos)
+{
+   struct amdgpu_device *adev = (struct amdgpu_device 
*)file_inode(f)->i_private;
+   char *reg_offset, *reg;
+   int ret, i = 0;
+
+   reg_offset = kmalloc(size, GFP_KERNEL);
+   memset(reg_offset,  0, size);
+   ret = copy_from_user(reg_offset, buf, size);
+
+   if (ret)
+   return -EFAULT;
+
+   while ((reg = strsep(&reg_offset, " ")) != NULL) {
+   ret  = kstrtol(reg, 16, &adev->reset_dump_reg_list[i]);
+   if (ret)
+   return -EINVAL;
+   i++;
+   }
+
+   kfree(reg_offset);
+
+   return size;
+}
+
+static const struct file_operations amdgpu_reset_dump_register_list = {
+   .owner = THIS_MODULE,
+   .read = amdgpu_reset_dump_register_list_read,
+   .write = amdgpu_reset_dump_register_list_write,
+   .llseek = default_llseek
+};
+
 int amdgpu_debugfs_init(struct amdgpu_device *adev)
 {
struct dentry *root = adev_to_drm(adev)->primary->debugfs_root;
@@ -1672,6 +1730,8 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
&amdgpu_debugfs_test_ib_fops);
debugfs_create_file("amdgpu_vm_info", 0444, root, adev,
&amdgpu_debugfs_vm_info_fops);
+   debugfs_create_file("amdgpu_reset_dump_register_list", 0644, root, adev,
+   &amdgpu_reset_dump_register_list);
 
adev->debugfs_vbios_blob.data = adev->bios;
adev->debugfs_vbios_blob.size = adev->bios_size;
-- 
2.25.1



Re: [PATCH 4/8] mm: move free_devmap_managed_page to memremap.c

2022-02-08 Thread Muchun Song
On Mon, Feb 7, 2022 at 2:42 PM Christoph Hellwig  wrote:
>
> free_devmap_managed_page has nothing to do with the code in swap.c,
> move it to live with the rest of the code for devmap handling.
>
> Signed-off-by: Christoph Hellwig 

Reviewed-by: Muchun Song 

Thanks.



Re: [PATCH 2/8] mm: remove the __KERNEL__ guard from <linux/mm.h>

2022-02-08 Thread Muchun Song
On Mon, Feb 7, 2022 at 2:42 PM Christoph Hellwig  wrote:
>
> __KERNEL__ ifdefs don't make sense outside of include/uapi/.
>
> Signed-off-by: Christoph Hellwig 

Reviewed-by: Muchun Song 

Thanks.



Re: [PATCH 1/8] mm: remove a pointless CONFIG_ZONE_DEVICE check in memremap_pages

2022-02-08 Thread Muchun Song
On Mon, Feb 7, 2022 at 2:36 PM Christoph Hellwig  wrote:
>
> memremap.c is only built when CONFIG_ZONE_DEVICE is set, so remove
> the superfluous extra check.
>
> Signed-off-by: Christoph Hellwig 

Reviewed-by: Muchun Song 

Thanks.