Re: [PATCH] drm/amdkfd: fix the hang caused by the write reorder to fence_addr

2024-10-22 Thread Philip Yang

  


On 2024-10-21 04:12, Christian König wrote:
Am 18.10.24 um 23:59 schrieb Philip Yang:
On 2024-10-18 14:28, Felix Kuehling wrote:
On 2024-10-17 04:34, Victor Zhao wrote:
  
make sure KFD_FENCE_INIT is written to fence_addr before pm_send_query_status is called, to avoid a qcm fence timeout caused by incorrect ordering.


Signed-off-by: Victor Zhao 
---
 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 1 +
 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index b2b16a812e73..d9264a353775 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -2254,6 +2254,7 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
 		goto out;
 
 	*dqm->fence_addr = KFD_FENCE_INIT;
+	mb();
 	pm_send_query_status(&dqm->packet_mgr, dqm->fence_gpu_addr,
 				KFD_FENCE_COMPLETED);
 	/* should be timed out */
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
index 09ab36f8e8c6..bddb169bb301 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
@@ -260,7 +260,7 @@ struct device_queue_manager {
 	uint16_t		vmid_pasid[VMID_NUM];
 	uint64_t		pipelines_addr;
 	uint64_t		fence_gpu_addr;
-	uint64_t		*fence_addr;
+	volatile uint64_t	*fence_addr;

  
  
[+Christian]

Is the volatile keyword really needed here? I just saw other patches removing volatile in some places because it's not sufficient, and not needed if you use memory barriers correctly.
  


After reading the kernel memory-barrier documentation and the link below, I think we need both the volatile type and the memory barrier to guarantee the firmware sees the updated fence value. This fixes a CP hang issue on SRIOV.

https://stackoverflow.com/questions/75750110/volatile-vs-memory-barriers#:~:text=volatile%20will%20make%20sure%20that,not%20reorder%20writes%20or%20reads.


  
  
No, that isn't correct. Using volatile is considered harmful and almost never correct, see
https://docs.kernel.org/process/volatile-considered-harmful.html

Placing appropriate memory barriers must be sufficient; otherwise there is a rather bad platform or compiler bug lurking around.
  

Yes, Victor confirmed that the memory barrier alone fixes the issue; he will send a new patch that removes the volatile type.
Regards,
Philip


  
  Regards,
  
  Christian.
  
  
  Regards,


Philip



  
  Regards,
  
    Felix
  
  
  
 	struct kfd_mem_obj	*fence_mem;
 	bool			active_runlist;
 	int			sched_policy;
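
For readers following the volatile-vs-barrier discussion above, here is a minimal sketch of the ordering guarantee in question. The helper name and fence values are hypothetical; the real code is the one-line mb() in unmap_queues_cpsch shown in the patch.

/* Hedged sketch, not the actual driver code. */
#define FENCE_INIT	1ULL	/* hypothetical values */
#define FENCE_COMPLETED	2ULL

static void fence_handshake(uint64_t *fence_addr)
{
	/* CPU seeds the fence slot that the firmware will overwrite. */
	*fence_addr = FENCE_INIT;

	/*
	 * Full memory barrier: the store above must be globally visible
	 * before the packet below is submitted.  volatile only stops the
	 * compiler from caching or eliding the access; it gives no
	 * CPU-level ordering against the later packet write, which is
	 * why mb(), not volatile, is the correct fix.
	 */
	mb();

	/*
	 * submit_query_status() is a stand-in for pm_send_query_status();
	 * it triggers the firmware to write FENCE_COMPLETED to the same
	 * address, which the driver then polls for.
	 */
	submit_query_status(fence_addr, FENCE_COMPLETED);
}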

  

  
  

  



Re: [PATCH] Revert "drm/amdkfd: SMI report dropped event count"

2024-10-21 Thread Philip Yang

  


On 2024-10-21 13:46, Alex Deucher wrote:


This reverts commit a3ab2d45b9887ee609cd3bea39f668236935774c.

The userspace side for this code is not ready yet, so revert for now.

Signed-off-by: Alex Deucher 
Cc: Philip Yang 

Reviewed-by: Philip Yang 

  
---
 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 27 +++--
 include/uapi/linux/kfd_ioctl.h  |  6 -
 2 files changed, 4 insertions(+), 29 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
index c8d67d62ca3f..9b8169761ec5 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
@@ -42,7 +42,6 @@ struct kfd_smi_client {
 	struct rcu_head rcu;
 	pid_t pid;
 	bool suser;
-	u32 drop_count;
 };
 
 #define KFD_MAX_KFIFO_SIZE	8192
@@ -104,28 +103,12 @@ static ssize_t kfd_smi_ev_read(struct file *filep, char __user *user,
 	}
 	to_copy = min(size, to_copy);
 	ret = kfifo_out(&client->fifo, buf, to_copy);
+	spin_unlock(&client->lock);
 	if (ret <= 0) {
-		spin_unlock(&client->lock);
 		ret = -EAGAIN;
 		goto ret_err;
 	}
 
-	if (client->drop_count) {
-		char msg[KFD_SMI_EVENT_MSG_SIZE];
-		int len;
-
-		len = snprintf(msg, sizeof(msg), "%x ", KFD_SMI_EVENT_DROPPED_EVENT);
-		len += snprintf(msg + len, sizeof(msg) - len,
-				KFD_EVENT_FMT_DROPPED_EVENT(ktime_get_boottime_ns(),
-							    client->pid, client->drop_count));
-		if (kfifo_avail(&client->fifo) >= len) {
-			kfifo_in(&client->fifo, msg, len);
-			client->drop_count = 0;
-		}
-	}
-
-	spin_unlock(&client->lock);
-
 	ret = copy_to_user(user, buf, to_copy);
 	if (ret) {
 		ret = -EFAULT;
@@ -199,15 +182,13 @@ static void add_event_to_kfifo(pid_t pid, struct kfd_node *dev,
 	list_for_each_entry_rcu(client, &dev->smi_clients, list) {
 		if (!kfd_smi_ev_enabled(pid, client, smi_event))
 			continue;
-
 		spin_lock(&client->lock);
-		if (!client->drop_count && kfifo_avail(&client->fifo) >= len) {
+		if (kfifo_avail(&client->fifo) >= len) {
 			kfifo_in(&client->fifo, event_msg, len);
 			wake_up_all(&client->wait_queue);
 		} else {
-			client->drop_count++;
-			pr_debug("smi_event(EventID: %u): no space left drop_count %d\n",
-				 smi_event, client->drop_count);
+			pr_debug("smi_event(EventID: %u): no space left\n",
+				 smi_event);
 		}
 		spin_unlock(&client->lock);
 	}
diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index 7afd66d45313..fa9f9846b88e 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -530,7 +530,6 @@ enum kfd_smi_event {
 	KFD_SMI_EVENT_QUEUE_EVICTION = 9,
 	KFD_SMI_EVENT_QUEUE_RESTORE = 10,
 	KFD_SMI_EVENT_UNMAP_FROM_GPU = 11,
-	KFD_SMI_EVENT_DROPPED_EVENT = 12,
 
 	/*
 	 * max event number, as a flag bit to get events from all processes,
@@ -611,7 +610,6 @@ struct kfd_ioctl_smi_events_args {
  *rw: 'W' for write page fault, 'R' for read page fault
  *rescheduled: 'R' if the queue restore failed and rescheduled to try again
  *error_code: migrate failure error code, 0 if no error
- *drop_count: how many events dropped when fifo is full
  */
 #define KFD_EVENT_FMT_UPDATE_GPU_RESET(reset_seq_num, reset_cause)\
 		"%x %s\n", (reset_seq_num), (reset_cause)
@@ -647,10 +645,6 @@ struct kfd_ioctl_smi_events_args {
 		"%lld -%d @%lx(%lx) %x %d\n", (ns), (pid), (addr), (size),\
 		(node), (unmap_trigger)
 
-#define KFD_EVENT_FMT_DROPPED_EVENT(ns, pid, drop_count)\
-		"%lld -%d %d\n", (ns), (pid), (drop_count)
-
-
 /**
  * CRIU IOCTLs (Checkpoint Restore In Userspace)
  *


  



Re: [PATCH] drm/amdkfd: fix the hang caused by the write reorder to fence_addr

2024-10-18 Thread Philip Yang

  


On 2024-10-18 14:28, Felix Kuehling wrote:
On 2024-10-17 04:34, Victor Zhao wrote:
  
make sure KFD_FENCE_INIT is written to fence_addr before pm_send_query_status is called, to avoid a qcm fence timeout caused by incorrect ordering.


Signed-off-by: Victor Zhao 
---
 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 1 +
 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index b2b16a812e73..d9264a353775 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -2254,6 +2254,7 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
 		goto out;
 
 	*dqm->fence_addr = KFD_FENCE_INIT;
+	mb();
 	pm_send_query_status(&dqm->packet_mgr, dqm->fence_gpu_addr,
 				KFD_FENCE_COMPLETED);
 	/* should be timed out */
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
index 09ab36f8e8c6..bddb169bb301 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
@@ -260,7 +260,7 @@ struct device_queue_manager {
 	uint16_t		vmid_pasid[VMID_NUM];
 	uint64_t		pipelines_addr;
 	uint64_t		fence_gpu_addr;
-	uint64_t		*fence_addr;
+	volatile uint64_t	*fence_addr;

  
  
[+Christian]

Is the volatile keyword really needed here? I just saw other patches removing volatile in some places because it's not sufficient, and not needed if you use memory barriers correctly.
  

After reading the kernel memory-barrier documentation and the link below, I think we need both the volatile type and the memory barrier to guarantee the firmware sees the updated fence value. This fixes a CP hang issue on SRIOV.

https://stackoverflow.com/questions/75750110/volatile-vs-memory-barriers#:~:text=volatile%20will%20make%20sure%20that,not%20reorder%20writes%20or%20reads.
Regards,
Philip


  
  Regards,
  
    Felix
  
  
  
 	struct kfd_mem_obj	*fence_mem;
 	bool			active_runlist;
 	int			sched_policy;

  

  



Re: [PATCH] drm/amd/amdkfd: add/remove kfd queues on start/stop KFD scheduling

2024-10-18 Thread Philip Yang

  
It is safe to access the dqm->sched status inside dqm_lock; there is no race with GPU reset.
Reviewed-by: Philip Yang 
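
For context, a simplified sketch of the locking claim above (not the actual driver code; the real logic is in the patch below): both the halt flag update and the per-queue check run under dqm_lock, so queue add/remove cannot race with halt/unhalt.

/* Simplified sketch of the check-under-lock pattern. */
static int halt(struct device_queue_manager *dqm)
{
	dqm_lock(dqm);
	/* ... remove all KFD queues from the MES scheduler ... */
	dqm->sched_halt = true;
	dqm_unlock(dqm);
	return 0;
}

static int add_queue(struct device_queue_manager *dqm, struct queue *q)
{
	/* Caller holds dqm_lock, so this check cannot race with halt(). */
	if (!dqm->sched_running || dqm->sched_halt)
		return 0;
	/* ... program the queue via add_queue_mes() ... */
	return 0;
}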

On 2024-10-18 11:10, Shaoyun Liu wrote:


  From: shaoyunl 

Add back the KFD queues on start scheduling that were originally removed on stop scheduling.

Signed-off-by: Shaoyun Liu 
Reviewed-by: Felix Kuehling 
---
 .../drm/amd/amdkfd/kfd_device_queue_manager.c | 40 +--
 1 file changed, 37 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index b2b16a812e73..edfb9f98555f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -202,6 +202,8 @@ static int add_queue_mes(struct device_queue_manager *dqm, struct queue *q,
 	int r, queue_type;
 	uint64_t wptr_addr_off;
 
+	if (!dqm->sched_running || dqm->sched_halt)
+		return 0;
 	if (!down_read_trylock(&adev->reset_domain->sem))
 		return -EIO;
 
@@ -270,6 +272,8 @@ static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q,
 	int r;
 	struct mes_remove_queue_input queue_input;
 
+	if (!dqm->sched_running || dqm->sched_halt)
+		return 0;
 	if (!down_read_trylock(&adev->reset_domain->sem))
 		return -EIO;
 
@@ -292,7 +296,7 @@ static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q,
 	return r;
 }
 
-static int remove_all_queues_mes(struct device_queue_manager *dqm)
+static int remove_all_kfd_queues_mes(struct device_queue_manager *dqm)
 {
 	struct device_process_node *cur;
 	struct device *dev = dqm->dev->adev->dev;
@@ -319,6 +323,33 @@ static int remove_all_queues_mes(struct device_queue_manager *dqm)
 	return retval;
 }
 
+static int add_all_kfd_queues_mes(struct device_queue_manager *dqm)
+{
+	struct device_process_node *cur;
+	struct device *dev = dqm->dev->adev->dev;
+	struct qcm_process_device *qpd;
+	struct queue *q;
+	int retval = 0;
+
+	list_for_each_entry(cur, &dqm->queues, list) {
+		qpd = cur->qpd;
+		list_for_each_entry(q, &qpd->queues_list, list) {
+			if (!q->properties.is_active)
+				continue;
+			retval = add_queue_mes(dqm, q, qpd);
+			if (retval) {
+				dev_err(dev, "%s: Failed to add queue %d for dev %d",
+					__func__,
+					q->properties.queue_id,
+					dqm->dev->id);
+				return retval;
+			}
+		}
+	}
+
+	return retval;
+}
+
 static int suspend_all_queues_mes(struct device_queue_manager *dqm)
 {
 	struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev;
@@ -1742,7 +1773,7 @@ static int halt_cpsch(struct device_queue_manager *dqm)
 		 KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0,
 USE_DEFAULT_GRACE_PERIOD, false);
 		else
-			ret = remove_all_queues_mes(dqm);
+			ret = remove_all_kfd_queues_mes(dqm);
 	}
 	dqm->sched_halt = true;
 	dqm_unlock(dqm);
@@ -1768,6 +1799,9 @@ static int unhalt_cpsch(struct device_queue_manager *dqm)
 		ret = execute_queues_cpsch(dqm,
 	   KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES,
 			0, USE_DEFAULT_GRACE_PERIOD);
+	else
+		ret = add_all_kfd_queues_mes(dqm);
+
 	dqm_unlock(dqm);
 
 	return ret;
@@ -1867,7 +1901,7 @@ static int stop_cpsch(struct device_queue_manager *dqm)
 	if (!dqm->dev->kfd->shared_resources.enable_mes)
 		unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0, USE_DEFAULT_GRACE_PERIOD, false);
 	else
-		remove_all_queues_mes(dqm);
+		remove_all_kfd_queues_mes(dqm);
 
 	dqm->sched_running = false;
 


  



Re: [PATCH] drm/amdkfd: fix the hang caused by the write reorder to fence_addr

2024-10-18 Thread Philip Yang

  


On 2024-10-18 01:31, Zhao, Victor wrote:


[AMD Official Use Only - AMD Internal Distribution Only]

Ping. Please help review.

Thanks,
Victor

-Original Message-
From: Victor Zhao 
Sent: Thursday, October 17, 2024 4:35 PM
To: amd-gfx@lists.freedesktop.org
Cc: Zhao, Victor 
Subject: [PATCH] drm/amdkfd: fix the hang caused by the write reorder to fence_addr

make sure KFD_FENCE_INIT is written to fence_addr before pm_send_query_status is called, to avoid a qcm fence timeout caused by incorrect ordering.

Signed-off-by: Victor Zhao 

Reviewed-by: Philip Yang 

  
---
 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 1 +
 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index b2b16a812e73..d9264a353775 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -2254,6 +2254,7 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
 		goto out;
 
 	*dqm->fence_addr = KFD_FENCE_INIT;
+	mb();
 	pm_send_query_status(&dqm->packet_mgr, dqm->fence_gpu_addr,
 				KFD_FENCE_COMPLETED);
 	/* should be timed out */
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
index 09ab36f8e8c6..bddb169bb301 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
@@ -260,7 +260,7 @@ struct device_queue_manager {
 	uint16_t		vmid_pasid[VMID_NUM];
 	uint64_t		pipelines_addr;
 	uint64_t		fence_gpu_addr;
-	uint64_t		*fence_addr;
+	volatile uint64_t	*fence_addr;
 	struct kfd_mem_obj	*fence_mem;
 	bool			active_runlist;
 	int			sched_policy;
--
2.34.1



  



Re: [PATCH] drm/amd/amdkfd: add/remove kfd queues on start/stop KFD scheduling

2024-10-17 Thread Philip Yang

  


On 2024-10-17 12:12, Shaoyun Liu wrote:


  From: shaoyunl 

Add back the KFD queues on start scheduling that were originally removed on stop scheduling.

Signed-off-by: Shaoyun Liu 
---
 .../drm/amd/amdkfd/kfd_device_queue_manager.c | 40 +--
 1 file changed, 37 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index b2b16a812e73..542363b4712e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -204,6 +204,8 @@ static int add_queue_mes(struct device_queue_manager *dqm, struct queue *q,
 
 	if (!down_read_trylock(&adev->reset_domain->sem))
 		return -EIO;
+	if (!dqm->sched_running || dqm->sched_halt) {

		up_read(&adev->reset_domain->sem);

+		return 0;

	}

  
 
 	memset(&queue_input, 0x0, sizeof(struct mes_add_queue_input));
 	queue_input.process_id = qpd->pqm->process->pasid;
@@ -272,6 +274,8 @@ static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q,
 
 	if (!down_read_trylock(&adev->reset_domain->sem))
 		return -EIO;
+	if (!dqm->sched_running || dqm->sched_halt) {

		up_read(&adev->reset_domain->sem);

+		return 0;

	}
It is simpler to move the sched_halt/sched_running check outside the reset sem lock, but I am not sure if that is safe.

Regards,
Philip
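
To spell out the two alternatives being weighed here (a simplified sketch of the hunk above, not the final code):

/* Option 1: check after taking the semaphore; the early-return path
 * must then release it, otherwise the read lock leaks. */
if (!down_read_trylock(&adev->reset_domain->sem))
	return -EIO;
if (!dqm->sched_running || dqm->sched_halt) {
	up_read(&adev->reset_domain->sem);
	return 0;
}

/* Option 2 (what the reviewed follow-up patch does): check the flags
 * first, before touching the reset semaphore; no unlock needed. */
if (!dqm->sched_running || dqm->sched_halt)
	return 0;
if (!down_read_trylock(&adev->reset_domain->sem))
	return -EIO;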


  
 
 	memset(&queue_input, 0x0, sizeof(struct mes_remove_queue_input));
 	queue_input.doorbell_offset = q->properties.doorbell_off;
@@ -292,7 +296,7 @@ static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q,
 	return r;
 }
 
-static int remove_all_queues_mes(struct device_queue_manager *dqm)
+static int remove_all_kfd_queues_mes(struct device_queue_manager *dqm)
 {
 	struct device_process_node *cur;
 	struct device *dev = dqm->dev->adev->dev;
@@ -319,6 +323,33 @@ static int remove_all_queues_mes(struct device_queue_manager *dqm)
 	return retval;
 }
 
+static int add_all_kfd_queues_mes(struct device_queue_manager *dqm)
+{
+	struct device_process_node *cur;
+	struct device *dev = dqm->dev->adev->dev;
+	struct qcm_process_device *qpd;
+	struct queue *q;
+	int retval = 0;
+
+	list_for_each_entry(cur, &dqm->queues, list) {
+		qpd = cur->qpd;
+		list_for_each_entry(q, &qpd->queues_list, list) {
+			if (!q->properties.is_active)
+				continue;
+			retval = add_queue_mes(dqm, q, qpd);
+			if (retval) {
+				dev_err(dev, "%s: Failed to add queue %d for dev %d",
+					__func__,
+					q->properties.queue_id,
+					dqm->dev->id);
+				return retval;
+			}
+		}
+	}
+
+	return retval;
+}
+
 static int suspend_all_queues_mes(struct device_queue_manager *dqm)
 {
 	struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev;
@@ -1742,7 +1773,7 @@ static int halt_cpsch(struct device_queue_manager *dqm)
 		 KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0,
 USE_DEFAULT_GRACE_PERIOD, false);
 		else
-			ret = remove_all_queues_mes(dqm);
+			ret = remove_all_kfd_queues_mes(dqm);
 	}
 	dqm->sched_halt = true;
 	dqm_unlock(dqm);
@@ -1768,6 +1799,9 @@ static int unhalt_cpsch(struct device_queue_manager *dqm)
 		ret = execute_queues_cpsch(dqm,
 	   KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES,
 			0, USE_DEFAULT_GRACE_PERIOD);
+	else
+		ret = add_all_kfd_queues_mes(dqm);
+
 	dqm_unlock(dqm);
 
 	return ret;
@@ -1867,7 +1901,7 @@ static int stop_cpsch(struct device_queue_manager *dqm)
 	if (!dqm->dev->kfd->shared_resources.enable_mes)
 		unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0, USE_DEFAULT_GRACE_PERIOD, false);
 	else
-		remove_all_queues_mes(dqm);
+		remove_all_kfd_queues_mes(dqm);
 
 	dqm->sched_running = false;
 


  



[PATCH v3] drm/amdkfd: Accounting pdd vram_usage for svm

2024-10-11 Thread Philip Yang
Process device data pdd->vram_usage is read by rocm-smi via sysfs. It is currently missing the svm_bo usage accounting, so the "rocm-smi --showpids" per-process VRAM usage report is incorrect.

Add pdd->vram_usage accounting on svm_bo allocation and release, and change the type to atomic64_t because it is now updated outside the process mutex.

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |  6 +++---
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h|  2 +-
 drivers/gpu/drm/amd/amdkfd/kfd_process.c |  4 ++--
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 26 
 4 files changed, 32 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index a1f191a5984b..065d87841459 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -1148,7 +1148,7 @@ static int kfd_ioctl_alloc_memory_of_gpu(struct file 
*filep,
 
if (flags & KFD_IOC_ALLOC_MEM_FLAGS_AQL_QUEUE_MEM)
size >>= 1;
-   WRITE_ONCE(pdd->vram_usage, pdd->vram_usage + PAGE_ALIGN(size));
+   atomic64_add(PAGE_ALIGN(size), &pdd->vram_usage);
}
 
mutex_unlock(&p->mutex);
@@ -1219,7 +1219,7 @@ static int kfd_ioctl_free_memory_of_gpu(struct file 
*filep,
kfd_process_device_remove_obj_handle(
pdd, GET_IDR_HANDLE(args->handle));
 
-   WRITE_ONCE(pdd->vram_usage, pdd->vram_usage - size);
+   atomic64_sub(size, &pdd->vram_usage);
 
 err_unlock:
 err_pdd:
@@ -2347,7 +2347,7 @@ static int criu_restore_memory_of_gpu(struct 
kfd_process_device *pdd,
} else if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) {
bo_bucket->restored_offset = offset;
/* Update the VRAM usage count */
-   WRITE_ONCE(pdd->vram_usage, pdd->vram_usage + bo_bucket->size);
+   atomic64_add(bo_bucket->size, &pdd->vram_usage);
}
return 0;
 }
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 6a5bf88cc232..9e5ca0b93b2a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -775,7 +775,7 @@ struct kfd_process_device {
enum kfd_pdd_bound bound;
 
/* VRAM usage */
-   uint64_t vram_usage;
+   atomic64_t vram_usage;
struct attribute attr_vram;
char vram_filename[MAX_SYSFS_FILENAME_LEN];
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 7909dfd158be..4810521736a9 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -332,7 +332,7 @@ static ssize_t kfd_procfs_show(struct kobject *kobj, struct 
attribute *attr,
} else if (strncmp(attr->name, "vram_", 5) == 0) {
struct kfd_process_device *pdd = container_of(attr, struct 
kfd_process_device,
  attr_vram);
-   return snprintf(buffer, PAGE_SIZE, "%llu\n", 
READ_ONCE(pdd->vram_usage));
+   return snprintf(buffer, PAGE_SIZE, "%llu\n", 
atomic64_read(&pdd->vram_usage));
} else if (strncmp(attr->name, "sdma_", 5) == 0) {
struct kfd_process_device *pdd = container_of(attr, struct 
kfd_process_device,
  attr_sdma);
@@ -1625,7 +1625,7 @@ struct kfd_process_device 
*kfd_create_process_device_data(struct kfd_node *dev,
pdd->bound = PDD_UNBOUND;
pdd->already_dequeued = false;
pdd->runtime_inuse = false;
-   pdd->vram_usage = 0;
+   atomic64_set(&pdd->vram_usage, 0);
pdd->sdma_past_activity_counter = 0;
pdd->user_gpu_id = dev->id;
atomic64_set(&pdd->evict_duration_counter, 0);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index 857ec6f23bba..3e2911895c74 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -405,6 +405,27 @@ static void svm_range_bo_release(struct kref *kref)
spin_lock(&svm_bo->list_lock);
}
spin_unlock(&svm_bo->list_lock);
+
+   if (mmget_not_zero(svm_bo->eviction_fence->mm)) {
+   struct kfd_process_device *pdd;
+   struct kfd_process *p;
+   struct mm_struct *mm;
+
+   mm = svm_bo->eviction_fence->mm;
+   /*
+* The forked child process takes svm_bo device pages ref, 
svm_bo could be
+* released after parent process is gone.
+*/
+   p = kfd_lookup_process_by_mm(mm);
+   

Re: [PATCH 1/2] drm/amdkfd: Save pdd to svm_bo to replace node

2024-10-11 Thread Philip Yang

  
Drop this patch series. As Felix pointed out, a forked process takes a reference on the svm_bo device pages, so svm_bo->pdd could refer to a process that no longer exists.

Regards,
Philip
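
For reference, the v3 patch elsewhere in this archive avoids the stale pointer by resolving the process at release time instead of caching a pdd; roughly:

/* Sketch based on the v3 approach: look the process up when the
 * svm_bo is released, instead of trusting a cached svm_bo->pdd. */
if (mmget_not_zero(svm_bo->eviction_fence->mm)) {
	struct mm_struct *mm = svm_bo->eviction_fence->mm;
	struct kfd_process *p = kfd_lookup_process_by_mm(mm);

	if (p) {
		/* ... find the pdd by node and subtract
		 * amdgpu_bo_size(svm_bo->bo) from vram_usage ... */
	}
	mmput(mm);
}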

On 2024-10-11 11:00, Philip Yang wrote:

KFD process device data pdd will be used for VRAM usage accounting; save pdd in svm_bo to avoid searching for the pdd on every accounting update, and get the KFD node from pdd->dev.

svm_bo->pdd will always be valid because KFD process release frees all svm_bo first, then destroys the process pdds.

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 27 +--
 drivers/gpu/drm/amd/amdkfd/kfd_svm.h |  2 +-
 2 files changed, 18 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index 857ec6f23bba..d40f6fb803df 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -180,7 +180,7 @@ svm_range_dma_map_dev(struct amdgpu_device *adev, struct svm_range *prange,
 
 		page = hmm_pfn_to_page(hmm_pfns[i]);
 		if (is_zone_device_page(page)) {
-			struct amdgpu_device *bo_adev = prange->svm_bo->node->adev;
+			struct amdgpu_device *bo_adev = prange->svm_bo->pdd->dev->adev;
 
 			addr[i] = (hmm_pfns[i] << PAGE_SHIFT) +
    bo_adev->vm_manager.vram_base_offset -
@@ -457,11 +457,11 @@ svm_range_validate_svm_bo(struct kfd_node *node, struct svm_range *prange)
 	}
 	if (svm_bo_ref_unless_zero(prange->svm_bo)) {
 		/*
-		 * Migrate from GPU to GPU, remove range from source svm_bo->node
+		 * Migrate from GPU to GPU, remove range from source svm_bo node
 		 * range list, and return false to allocate svm_bo from destination
 		 * node.
 		 */
-		if (prange->svm_bo->node != node) {
+		if (prange->svm_bo->pdd->dev != node) {
 			mutex_unlock(&prange->lock);
 
 			spin_lock(&prange->svm_bo->list_lock);
@@ -532,6 +532,7 @@ int
 svm_range_vram_node_new(struct kfd_node *node, struct svm_range *prange,
 			bool clear)
 {
+	struct kfd_process_device *pdd;
 	struct amdgpu_bo_param bp;
 	struct svm_range_bo *svm_bo;
 	struct amdgpu_bo_user *ubo;
@@ -548,17 +549,22 @@ svm_range_vram_node_new(struct kfd_node *node, struct svm_range *prange,
 		return 0;
 
 	svm_bo = svm_range_bo_new();
-	if (!svm_bo) {
-		pr_debug("failed to alloc svm bo\n");
+	if (!svm_bo)
 		return -ENOMEM;
+
+	pdd = svm_range_get_pdd_by_node(prange, node);
+	if (!pdd) {
+		r = -ESRCH;
+		goto out_free;
 	}
+	svm_bo->pdd = pdd;
+
 	mm = get_task_mm(p->lead_thread);
 	if (!mm) {
 		pr_debug("failed to get mm\n");
-		kfree(svm_bo);
-		return -ESRCH;
+		r = -ESRCH;
+		goto out_free;
 	}
-	svm_bo->node = node;
 	svm_bo->eviction_fence =
 		amdgpu_amdkfd_fence_create(dma_fence_context_alloc(1),
 	   mm,
@@ -629,6 +635,7 @@ svm_range_vram_node_new(struct kfd_node *node, struct svm_range *prange,
 	amdgpu_bo_unref(&bo);
 create_bo_failed:
 	dma_fence_put(&svm_bo->eviction_fence->base);
+out_free:
 	kfree(svm_bo);
 	prange->ttm_res = NULL;
 
@@ -1176,7 +1183,7 @@ svm_range_get_pte_flags(struct kfd_node *node,
 	unsigned int mtype_local;
 
 	if (domain == SVM_RANGE_VRAM_DOMAIN)
-		bo_node = prange->svm_bo->node;
+		bo_node = prange->svm_bo->pdd->dev;
 
 	switch (amdgpu_ip_version(node->adev, GC_HWIP, 0)) {
 	case IP_VERSION(9, 4, 1):
@@ -1440,7 +1447,7 @@ svm_range_map_to_gpus(struct svm_range *prange, unsigned long offset,
 	int r = 0;
 
 	if (prange->svm_bo && prange->ttm_res)
-		bo_adev = prange->svm_bo->node->adev;
+		bo_adev = prange->svm_bo->pdd->dev->adev;
 
 	p = container_of(prange->svms, struct kfd_process, svms);
 	for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
index bddd24f04669..fad2d6d2223a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
@@ -48,7 +48,7 @@ struct svm_range_bo {
 	struct work_struct		eviction_work;
 	uint32_t			evicting;
 	struct work_struct		release_work;
-	struct kfd_node			*node;
+	struct kfd_process_device	*pdd;
 };
 
 enum svm_work_list_ops {


  



[PATCH 2/2] drm/amdkfd: Accounting pdd vram_usage for svm

2024-10-11 Thread Philip Yang
Process device data pdd->vram_usage is read by rocm-smi via sysfs. It is currently missing the svm_bo usage accounting, so the "rocm-smi --showpids" per-process VRAM usage report is incorrect.

Add pdd->vram_usage accounting on svm_bo allocation and free, and change the type to atomic64_t because it is now updated outside the process mutex.

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 6 +++---
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h| 2 +-
 drivers/gpu/drm/amd/amdkfd/kfd_process.c | 4 ++--
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 2 ++
 4 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index a1f191a5984b..065d87841459 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -1148,7 +1148,7 @@ static int kfd_ioctl_alloc_memory_of_gpu(struct file 
*filep,
 
if (flags & KFD_IOC_ALLOC_MEM_FLAGS_AQL_QUEUE_MEM)
size >>= 1;
-   WRITE_ONCE(pdd->vram_usage, pdd->vram_usage + PAGE_ALIGN(size));
+   atomic64_add(PAGE_ALIGN(size), &pdd->vram_usage);
}
 
mutex_unlock(&p->mutex);
@@ -1219,7 +1219,7 @@ static int kfd_ioctl_free_memory_of_gpu(struct file 
*filep,
kfd_process_device_remove_obj_handle(
pdd, GET_IDR_HANDLE(args->handle));
 
-   WRITE_ONCE(pdd->vram_usage, pdd->vram_usage - size);
+   atomic64_sub(size, &pdd->vram_usage);
 
 err_unlock:
 err_pdd:
@@ -2347,7 +2347,7 @@ static int criu_restore_memory_of_gpu(struct 
kfd_process_device *pdd,
} else if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) {
bo_bucket->restored_offset = offset;
/* Update the VRAM usage count */
-   WRITE_ONCE(pdd->vram_usage, pdd->vram_usage + bo_bucket->size);
+   atomic64_add(bo_bucket->size, &pdd->vram_usage);
}
return 0;
 }
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 6a5bf88cc232..9e5ca0b93b2a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -775,7 +775,7 @@ struct kfd_process_device {
enum kfd_pdd_bound bound;
 
/* VRAM usage */
-   uint64_t vram_usage;
+   atomic64_t vram_usage;
struct attribute attr_vram;
char vram_filename[MAX_SYSFS_FILENAME_LEN];
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 7909dfd158be..4810521736a9 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -332,7 +332,7 @@ static ssize_t kfd_procfs_show(struct kobject *kobj, struct 
attribute *attr,
} else if (strncmp(attr->name, "vram_", 5) == 0) {
struct kfd_process_device *pdd = container_of(attr, struct 
kfd_process_device,
  attr_vram);
-   return snprintf(buffer, PAGE_SIZE, "%llu\n", 
READ_ONCE(pdd->vram_usage));
+   return snprintf(buffer, PAGE_SIZE, "%llu\n", 
atomic64_read(&pdd->vram_usage));
} else if (strncmp(attr->name, "sdma_", 5) == 0) {
struct kfd_process_device *pdd = container_of(attr, struct 
kfd_process_device,
  attr_sdma);
@@ -1625,7 +1625,7 @@ struct kfd_process_device 
*kfd_create_process_device_data(struct kfd_node *dev,
pdd->bound = PDD_UNBOUND;
pdd->already_dequeued = false;
pdd->runtime_inuse = false;
-   pdd->vram_usage = 0;
+   atomic64_set(&pdd->vram_usage, 0);
pdd->sdma_past_activity_counter = 0;
pdd->user_gpu_id = dev->id;
atomic64_set(&pdd->evict_duration_counter, 0);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index d40f6fb803df..ba501fffa556 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -409,6 +409,7 @@ static void svm_range_bo_release(struct kref *kref)
/* We're not in the eviction worker. Signal the fence. */
dma_fence_signal(&svm_bo->eviction_fence->base);
dma_fence_put(&svm_bo->eviction_fence->base);
+   atomic64_sub(amdgpu_bo_size(svm_bo->bo), &svm_bo->pdd->vram_usage);
amdgpu_bo_unref(&svm_bo->bo);
kfree(svm_bo);
 }
@@ -628,6 +629,7 @@ svm_range_vram_node_new(struct kfd_node *node, struct 
svm_range *prange,
spin_lock(&svm_bo->list_lock);
list_add(&prange->svm_bo_list, &svm_bo->range_list);
spin_unlock(&svm_bo->list_lock);
+   atomic64_add(amdgpu_bo_size(bo), &svm_bo->pdd->vram_usage);
 
return 0;
 
-- 
2.43.2
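
A brief note on why the type changes: once the svm_bo release path updates the counter outside the process mutex, the old load/modify/WRITE_ONCE sequence is a non-atomic read-modify-write, and concurrent updates can lose increments. A minimal illustration:

/* Racy: both CPUs may read the same old value, losing one update. */
WRITE_ONCE(pdd->vram_usage, pdd->vram_usage + a);	/* CPU 0 */
WRITE_ONCE(pdd->vram_usage, pdd->vram_usage + b);	/* CPU 1 */

/* atomic64_t makes each read-modify-write indivisible. */
atomic64_add(a, &pdd->vram_usage);			/* CPU 0 */
atomic64_add(b, &pdd->vram_usage);			/* CPU 1 */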



[PATCH 1/2] drm/amdkfd: Save pdd to svm_bo to replace node

2024-10-11 Thread Philip Yang
KFD process device data pdd will be used for VRAM usage accounting; save pdd in svm_bo to avoid searching for the pdd on every accounting update, and get the KFD node from pdd->dev.

svm_bo->pdd will always be valid because KFD process release frees all svm_bo first, then destroys the process pdds.

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 27 +--
 drivers/gpu/drm/amd/amdkfd/kfd_svm.h |  2 +-
 2 files changed, 18 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index 857ec6f23bba..d40f6fb803df 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -180,7 +180,7 @@ svm_range_dma_map_dev(struct amdgpu_device *adev, struct 
svm_range *prange,
 
page = hmm_pfn_to_page(hmm_pfns[i]);
if (is_zone_device_page(page)) {
-   struct amdgpu_device *bo_adev = 
prange->svm_bo->node->adev;
+   struct amdgpu_device *bo_adev = 
prange->svm_bo->pdd->dev->adev;
 
addr[i] = (hmm_pfns[i] << PAGE_SHIFT) +
   bo_adev->vm_manager.vram_base_offset -
@@ -457,11 +457,11 @@ svm_range_validate_svm_bo(struct kfd_node *node, struct 
svm_range *prange)
}
if (svm_bo_ref_unless_zero(prange->svm_bo)) {
/*
-* Migrate from GPU to GPU, remove range from source 
svm_bo->node
+* Migrate from GPU to GPU, remove range from source svm_bo node
 * range list, and return false to allocate svm_bo from 
destination
 * node.
 */
-   if (prange->svm_bo->node != node) {
+   if (prange->svm_bo->pdd->dev != node) {
mutex_unlock(&prange->lock);
 
spin_lock(&prange->svm_bo->list_lock);
@@ -532,6 +532,7 @@ int
 svm_range_vram_node_new(struct kfd_node *node, struct svm_range *prange,
bool clear)
 {
+   struct kfd_process_device *pdd;
struct amdgpu_bo_param bp;
struct svm_range_bo *svm_bo;
struct amdgpu_bo_user *ubo;
@@ -548,17 +549,22 @@ svm_range_vram_node_new(struct kfd_node *node, struct 
svm_range *prange,
return 0;
 
svm_bo = svm_range_bo_new();
-   if (!svm_bo) {
-   pr_debug("failed to alloc svm bo\n");
+   if (!svm_bo)
return -ENOMEM;
+
+   pdd = svm_range_get_pdd_by_node(prange, node);
+   if (!pdd) {
+   r = -ESRCH;
+   goto out_free;
}
+   svm_bo->pdd = pdd;
+
mm = get_task_mm(p->lead_thread);
if (!mm) {
pr_debug("failed to get mm\n");
-   kfree(svm_bo);
-   return -ESRCH;
+   r = -ESRCH;
+   goto out_free;
}
-   svm_bo->node = node;
svm_bo->eviction_fence =
amdgpu_amdkfd_fence_create(dma_fence_context_alloc(1),
   mm,
@@ -629,6 +635,7 @@ svm_range_vram_node_new(struct kfd_node *node, struct 
svm_range *prange,
amdgpu_bo_unref(&bo);
 create_bo_failed:
dma_fence_put(&svm_bo->eviction_fence->base);
+out_free:
kfree(svm_bo);
prange->ttm_res = NULL;
 
@@ -1176,7 +1183,7 @@ svm_range_get_pte_flags(struct kfd_node *node,
unsigned int mtype_local;
 
if (domain == SVM_RANGE_VRAM_DOMAIN)
-   bo_node = prange->svm_bo->node;
+   bo_node = prange->svm_bo->pdd->dev;
 
switch (amdgpu_ip_version(node->adev, GC_HWIP, 0)) {
case IP_VERSION(9, 4, 1):
@@ -1440,7 +1447,7 @@ svm_range_map_to_gpus(struct svm_range *prange, unsigned 
long offset,
int r = 0;
 
if (prange->svm_bo && prange->ttm_res)
-   bo_adev = prange->svm_bo->node->adev;
+   bo_adev = prange->svm_bo->pdd->dev->adev;
 
p = container_of(prange->svms, struct kfd_process, svms);
for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
index bddd24f04669..fad2d6d2223a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
@@ -48,7 +48,7 @@ struct svm_range_bo {
struct work_struct  eviction_work;
uint32_tevicting;
struct work_struct  release_work;
-   struct kfd_node *node;
+   struct kfd_process_device   *pdd;
 };
 
 enum svm_work_list_ops {
-- 
2.43.2



Re: [PATCH] drm/amdkfd: Accounting pdd vram_usage for svm

2024-10-11 Thread Philip Yang

  


On 2024-10-09 17:20, Felix Kuehling wrote:


  
On 2024-10-04 16:28, Philip Yang wrote:

Per-process device data pdd->vram_usage is used by rocm-smi to report VRAM usage. This is currently missing the svm_bo usage accounting, so the "rocm-smi --showpids" per-process report is incorrect.

Add pdd->vram_usage accounting for svm_bo and change the type to atomic64_t because it is now updated outside the process mutex.


Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |  6 +++---
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h    |  2 +-
 drivers/gpu/drm/amd/amdkfd/kfd_process.c |  4 ++--
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c     | 22 ++
 4 files changed, 28 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index a1f191a5984b..065d87841459 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -1148,7 +1148,7 @@ static int kfd_ioctl_alloc_memory_of_gpu(struct file *filep,
 
 		if (flags & KFD_IOC_ALLOC_MEM_FLAGS_AQL_QUEUE_MEM)
 			size >>= 1;
-		WRITE_ONCE(pdd->vram_usage, pdd->vram_usage + PAGE_ALIGN(size));
+		atomic64_add(PAGE_ALIGN(size), &pdd->vram_usage);
 	}
 
 	mutex_unlock(&p->mutex);
@@ -1219,7 +1219,7 @@ static int kfd_ioctl_free_memory_of_gpu(struct file *filep,
 	kfd_process_device_remove_obj_handle(
 		pdd, GET_IDR_HANDLE(args->handle));
 
-	WRITE_ONCE(pdd->vram_usage, pdd->vram_usage - size);
+	atomic64_sub(size, &pdd->vram_usage);
 
 err_unlock:
 err_pdd:
@@ -2347,7 +2347,7 @@ static int criu_restore_memory_of_gpu(struct kfd_process_device *pdd,
 	} else if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) {
 		bo_bucket->restored_offset = offset;
 		/* Update the VRAM usage count */
-		WRITE_ONCE(pdd->vram_usage, pdd->vram_usage + bo_bucket->size);
+		atomic64_add(bo_bucket->size, &pdd->vram_usage);
 	}
 	return 0;
 }
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 6a5bf88cc232..9e5ca0b93b2a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -775,7 +775,7 @@ struct kfd_process_device {
 	enum kfd_pdd_bound bound;
 
 	/* VRAM usage */
-	uint64_t vram_usage;
+	atomic64_t vram_usage;
 	struct attribute attr_vram;
 	char vram_filename[MAX_SYSFS_FILENAME_LEN];
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 7909dfd158be..4810521736a9 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -332,7 +332,7 @@ static ssize_t kfd_procfs_show(struct kobject *kobj, struct attribute *attr,
 	} else if (strncmp(attr->name, "vram_", 5) == 0) {
 		struct kfd_process_device *pdd = container_of(attr, struct kfd_process_device,
 							      attr_vram);
-		return snprintf(buffer, PAGE_SIZE, "%llu\n", READ_ONCE(pdd->vram_usage));
+		return snprintf(buffer, PAGE_SIZE, "%llu\n", atomic64_read(&pdd->vram_usage));
 	} else if (strncmp(attr->name, "sdma_", 5) == 0) {
 		struct kfd_process_device *pdd = container_of(attr, struct kfd_process_device,
 							      attr_sdma);
@@ -1625,7 +1625,7 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_node *dev,
 	pdd->bound = PDD_UNBOUND;

[PATCH] drm/amdkfd: Accounting pdd vram_usage for svm

2024-10-04 Thread Philip Yang
Per-process device data pdd->vram_usage is used by rocm-smi to report VRAM usage. This is currently missing the svm_bo usage accounting, so the "rocm-smi --showpids" per-process report is incorrect.

Add pdd->vram_usage accounting for svm_bo and change the type to atomic64_t because it is now updated outside the process mutex.

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |  6 +++---
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h|  2 +-
 drivers/gpu/drm/amd/amdkfd/kfd_process.c |  4 ++--
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 22 ++
 4 files changed, 28 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index a1f191a5984b..065d87841459 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -1148,7 +1148,7 @@ static int kfd_ioctl_alloc_memory_of_gpu(struct file 
*filep,
 
if (flags & KFD_IOC_ALLOC_MEM_FLAGS_AQL_QUEUE_MEM)
size >>= 1;
-   WRITE_ONCE(pdd->vram_usage, pdd->vram_usage + PAGE_ALIGN(size));
+   atomic64_add(PAGE_ALIGN(size), &pdd->vram_usage);
}
 
mutex_unlock(&p->mutex);
@@ -1219,7 +1219,7 @@ static int kfd_ioctl_free_memory_of_gpu(struct file 
*filep,
kfd_process_device_remove_obj_handle(
pdd, GET_IDR_HANDLE(args->handle));
 
-   WRITE_ONCE(pdd->vram_usage, pdd->vram_usage - size);
+   atomic64_sub(size, &pdd->vram_usage);
 
 err_unlock:
 err_pdd:
@@ -2347,7 +2347,7 @@ static int criu_restore_memory_of_gpu(struct 
kfd_process_device *pdd,
} else if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) {
bo_bucket->restored_offset = offset;
/* Update the VRAM usage count */
-   WRITE_ONCE(pdd->vram_usage, pdd->vram_usage + bo_bucket->size);
+   atomic64_add(bo_bucket->size, &pdd->vram_usage);
}
return 0;
 }
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 6a5bf88cc232..9e5ca0b93b2a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -775,7 +775,7 @@ struct kfd_process_device {
enum kfd_pdd_bound bound;
 
/* VRAM usage */
-   uint64_t vram_usage;
+   atomic64_t vram_usage;
struct attribute attr_vram;
char vram_filename[MAX_SYSFS_FILENAME_LEN];
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 7909dfd158be..4810521736a9 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -332,7 +332,7 @@ static ssize_t kfd_procfs_show(struct kobject *kobj, struct 
attribute *attr,
} else if (strncmp(attr->name, "vram_", 5) == 0) {
struct kfd_process_device *pdd = container_of(attr, struct 
kfd_process_device,
  attr_vram);
-   return snprintf(buffer, PAGE_SIZE, "%llu\n", 
READ_ONCE(pdd->vram_usage));
+   return snprintf(buffer, PAGE_SIZE, "%llu\n", 
atomic64_read(&pdd->vram_usage));
} else if (strncmp(attr->name, "sdma_", 5) == 0) {
struct kfd_process_device *pdd = container_of(attr, struct 
kfd_process_device,
  attr_sdma);
@@ -1625,7 +1625,7 @@ struct kfd_process_device 
*kfd_create_process_device_data(struct kfd_node *dev,
pdd->bound = PDD_UNBOUND;
pdd->already_dequeued = false;
pdd->runtime_inuse = false;
-   pdd->vram_usage = 0;
+   atomic64_set(&pdd->vram_usage, 0);
pdd->sdma_past_activity_counter = 0;
pdd->user_gpu_id = dev->id;
atomic64_set(&pdd->evict_duration_counter, 0);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index 857ec6f23bba..61891ea6b1ac 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -379,6 +379,7 @@ static bool svm_bo_ref_unless_zero(struct svm_range_bo 
*svm_bo)
 static void svm_range_bo_release(struct kref *kref)
 {
struct svm_range_bo *svm_bo;
+   struct mm_struct *mm = NULL;
 
svm_bo = container_of(kref, struct svm_range_bo, kref);
pr_debug("svm_bo 0x%p\n", svm_bo);
@@ -405,6 +406,22 @@ static void svm_range_bo_release(struct kref *kref)
spin_lock(&svm_bo->list_lock);
}
spin_unlock(&svm_bo->list_lock);
+
+   if (mmget_not_zero(svm_bo->eviction_fence->mm)) {
+   struct kfd_process_device *pdd;
+   struct kfd_process *p;
+
+   mm 

[PATCH] drm/amdkfd: Copy wave state only for compute queue

2024-10-03 Thread Philip Yang
get_wave_state is not defined for SDMA queues; calling it for an SDMA queue from copy_context_work_handler will crash.

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 648f40091aa3..b2b16a812e73 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -3173,7 +3173,7 @@ struct copy_context_work_handler_workarea {
struct kfd_process *p;
 };
 
-static void copy_context_work_handler (struct work_struct *work)
+static void copy_context_work_handler(struct work_struct *work)
 {
struct copy_context_work_handler_workarea *workarea;
struct mqd_manager *mqd_mgr;
@@ -3200,6 +3200,9 @@ static void copy_context_work_handler (struct work_struct 
*work)
struct qcm_process_device *qpd = &pdd->qpd;
 
list_for_each_entry(q, &qpd->queues_list, list) {
+   if (q->properties.type != KFD_QUEUE_TYPE_COMPUTE)
+   continue;
+
mqd_mgr = dqm->mqd_mgrs[KFD_MQD_TYPE_CP];
 
/* We ignore the return value from get_wave_state
-- 
2.43.2
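
For context: copy_context_work_handler iterates every queue in the process's queue lists, presumably crashing because the SDMA MQD manager does not implement the get_wave_state callback (an assumption; the commit message only states it is undefined for SDMA queues). Skipping every queue whose type is not KFD_QUEUE_TYPE_COMPUTE, as the patch does, sidesteps the call entirely.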



Re: [PATCH] drm/amdkfd: fix vm-pasid lookup for multiple partitions

2024-09-11 Thread Philip Yang

  


On 2024-09-11 02:54, Christian König wrote:


  
  Yeah, I completely agree with Xiaogang.
  
  The PASID is an identifier of an address space. And the idea of
  the KFD was that we can just use the same address space and with
  it the page tables for multiple execution devices, e.g. CPUs, GPUs
  etc...
  
  That idea turned out to be a bad one because it clashes with some
  use cases (e.g. native context virtualization). The better
  approach is to see the CPU and GPU processes as separate things
  which just share the same underlying data.
  
  Opening the KFD node multiple times currently results in the same
  KFD process being used. We should probably consider changing that.

It is one KFD process bound to one application process, with a reference count to support multiple open/close calls from the same process.

The IOMMU most likely uses the Linux process pid, not the KFD process->pasid. The KFD process->pasid is passed to the firmware to map queues and flush TLBs.

The reason to replace vm->pasid with the KFD process->pasid is to find the vm from the fault pasid, and then, for a compute vm, find the KFD process from the pasid.

I can see a bug in amdgpu_vm_handle_fault: only for a compute vm, it force-updates the PTE to no-retry-fault on the incorrect VM for multiple partitions. This patch will fix that bug, but we could have a simpler fix.

Regards,
Philip

 
  Regards,
  Christian.
  
  Am 11.09.24 um 01:59 schrieb Chen,
Xiaogang:
  
  


You want to have a 1:1 mapping between vm and pasid so you can query the vm from the pasid. I think there is a basic existing issue: we cannot have a 1:1 correspondence between vm and pasid.

PASIDs are global address space identifiers that can be shared between the GPU, an IOMMU and the driver. One app should have one pasid, which the IOMMU uses to decide which page table to use when the device accesses a system resource. But one app can open the render/kfd node multiple times, even for one GPU. That said, one app could have multiple GPU vms.

I think we did not hit this issue before because an app usually opens a render node or kfd node only once. With one adev having multiple partitions, there are multiple vms on one adev, so we have this issue. But the root cause is not the multiple partitions, and the solution is not to introduce multiple pasids. I think we should have one pasid per app and use a different way to get the vm from the pasid.


Regards
Xiaogang

On 9/10/2024 3:47 PM, Kim, Jonathan wrote:

[Public]

   [Public]
  
  
  

KMS open still sets the per-PASID VM bindings per adev (socket), so I don't see how the per-partition PASID overwrite issue is primarily a KFD concern.
Are you saying the KFD process device holds a shadow copy of the correct VM during page restore on a fault?
Doesn't it acquire the wrong VM object on process init in the first place?
Even if it were the case that the KFD had a separate VM reference, the underlying IRQ fault handling is still broken.
We probably don't want to bandage over something to fix one symptom.

Jon
   
  

  
From: Yang, Philip 
Sent: Tuesday, September 10, 2024 11:24 AM
To: Koenig, Christian ; Kim, Jonathan ; amd-gfx@lists.freedesktop.org
Cc: Kuehling, Felix ; Deucher, Alexander ; Joshi, Mukul 
Subject: Re: [PATCH] drm/amdkfd: fix
   

Re: [PATCH] drm/amdkfd: fix vm-pasid lookup for multiple partitions

2024-09-10 Thread Philip Yang

  


On 2024-09-09 14:46, Christian König wrote:
Am 09.09.24 um 18:02 schrieb Kim, Jonathan:
  
  [Public]


-----Original Message-----
From: Christian König 
Sent: Thursday, September 5, 2024 10:24 AM
To: Kim, Jonathan ; amd-gfx@lists.freedesktop.org
Cc: Kuehling, Felix ; Deucher, Alexander ; Joshi, Mukul 
Subject: Re: [PATCH] drm/amdkfd: fix vm-pasid lookup for multiple partitions
  
  
  
  
  
  Am 19.08.24 um 19:59 schrieb Jonathan Kim:
  
Currently multiple partitions will incorrectly overwrite the VM lookup table since the table is indexed by PASID and multiple partitions can register different VM objects on the same PASID.

  
That's a rather bad idea. Why do we have the same PASID for different VM objects in the first place?
  

Alex can probably elaborate on the KGD side, but from what I can
see, the KMS driver open call has always assigned a new VM
object per PASID on an open call.

The KFD acquires and replaces the KGD PASID-VMID registration on
its own compute process open/creation call.

If this is the bad_idea you're referring to, then someone else
will have to chime in.  I don't have much history on this
unfortunately.

  
  
  Yeah, Felix and I designed that.
  

The app opens the drm node to create a vm for each partition, with a different vm->pasid for each vm. The issue comes from kfd_ioctl_acquire_vm -> kfd_process_device_init_vm -> amdgpu_amdkfd_gpuvm_set_vm_pasid, which replaces every vm->pasid with the KFD process->pasid obtained from opening the kfd node. This ends up storing only one vm in adev->vm_manager.pasids under the KFD process pasid, so we cannot retrieve the correct vm from adev->vm_manager.pasids on mGPUs or multiple partitions.
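
To make the overwrite concrete, a simplified illustration (the pasid-to-vm mapping really does live in adev->vm_manager.pasids, an xarray; the variable names are hypothetical):

/* Each partition registers its vm under the same KFD process pasid. */
xa_store(&adev->vm_manager.pasids, kfd_pasid, vm_part0, GFP_KERNEL);
xa_store(&adev->vm_manager.pasids, kfd_pasid, vm_part1, GFP_KERNEL);

/* The second store replaced the first, so a fault carrying kfd_pasid
 * always resolves to vm_part1, even when partition 0 faulted. */
vm = xa_load(&adev->vm_manager.pasids, kfd_pasid);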


  
That aside, the current problem is that all KFD device structures are logical partitions and register their PASID-VM binding using this concept of a device.

  
  
  As far as I can see that is the fundamental problem. This needs to
  be fixed instead.
  
  
  On the KGD side however, the registration
table is maintained in the adev struct, which is a physical
socket.

So there's a mismatch in understanding of what a device is
between the KFD & KGD with regard to the look up table that
results in bad bindings.


Adding a per-partition dimension to the existing lookup table
resolves issues where seeing, for example, with memory violation
interception and XNACK i.e bad bindings result in wrong vm
object found to set no-retry flags on memory violations.

  

svm_range_restore_pages retry-fault recovery uses the fault pasid to get the KFD process, and uses the fault node_id to get pdd->vm; maybe you can use the same approach to fix the debugger issue.

Regards,
Philip


  
  Yeah that is pretty much a no-go.
  
  
  The PASID and how it is used is defined by the PCIe
  specifications. If we now start to assign multiple VMs to the same
  PASID then we are violating the PCIe specification.
  
  
  The problems you see are most likely just the tip of the iceberg
  here.
  
  
  Regards,
  
  Christian.
  
  
  

Jon


Regards,
  
  Christian.
  
  
This results in loading the wrong VM object on PASID query.

To correct this, set up the lookup table to be per-partition-per-PASID instead.


Signed-off-by: Jonathan Kim 

---

 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 12 
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    |  4 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c    |  7 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c     | 55 +++--

Re: [PATCH V5] drm/amdgpu: Surface svm_default_granularity, a RW module parameter

2024-09-04 Thread Philip Yang

  


On 2024-09-03 19:24, Ramesh Errabolu wrote:


Enables users to update SVM's default granularity, used in buffer migration and handling of recoverable page faults. The param value is set in terms of log2(numPages(buffer)), e.g. 9 for a 2 MiB buffer.

Signed-off-by: Ramesh Errabolu 

With the 2 nitpicks below fixed, this patch is
Reviewed-by: Philip Yang 

Change the subject to "drm/amdkfd: Add svm_default_granularity module parameter".
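
For reference, the default of 9 works out as 2^9 = 512 pages; with a 4 KiB base page, 512 * 4 KiB = 2 MiB, matching the "2 MiB buffer" example in the commit message.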


  
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 17 +
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h   |  6 ++
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c| 23 +++
 4 files changed, 39 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index e8c284aea1f2..8eb934af02f2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -237,6 +237,7 @@ extern int sched_policy;
 extern bool debug_evictions;
 extern bool no_system_mem_limit;
 extern int halt_if_hws_hang;
+extern uint amdgpu_svm_default_granularity;
 #else
 static const int __maybe_unused sched_policy = KFD_SCHED_POLICY_HWS;
 static const bool __maybe_unused debug_evictions; /* = false */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index b9529948f2b2..442039436cb3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -169,6 +169,16 @@ uint amdgpu_sdma_phase_quantum = 32;
 char *amdgpu_disable_cu;
 char *amdgpu_virtual_display;
 bool enforce_isolation;
+
+/* Specifies the default granularity for SVM, used in buffer
+ * migration and restoration of backing memory when handling
+ * recoverable page faults.
+ *
+ * The value is given as log(numPages(buffer)); for a 2 MiB
+ * buffer it computes to be 9
+ */
+uint amdgpu_svm_default_granularity = 9;
+
 /*
  * OverDrive(bit 14) disabled by default
  * GFX DCS(bit 19) disabled by default
@@ -320,6 +330,13 @@ module_param_named(pcie_gen2, amdgpu_pcie_gen2, int, 0444);
 MODULE_PARM_DESC(msi, "MSI support (1 = enable, 0 = disable, -1 = auto)");
 module_param_named(msi, amdgpu_msi, int, 0444);
 
+/**
+ * DOC: svm_default_granularity (uint)
+ * Used in buffer migration and handling of recoverable page faults
+ */
+MODULE_PARM_DESC(svm_default_granularity, "SVM's default granularity in log(2^Pages), default 9 = 2^9 = 2 MiB");
+module_param_named(svm_default_granularity, amdgpu_svm_default_granularity, uint, 0644);
+
 /**
  * DOC: lockup_timeout (string)
  * Set GPU scheduler timeout value in ms.
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 9ae9abc6eb43..d6530febabad 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -868,6 +868,12 @@ struct svm_range_list {
 	struct task_struct		*faulting_task;
 	/* check point ts decides if page fault recovery need be dropped */
 	uint64_t			checkpoint_ts[MAX_GPU_INSTANCE];
+
+	/* Default granularity to use in buffer migration
+	 * and restoration of backing memory while handling
+	 * recoverable page faults
+	 */
+	uint8_t default_granularity;
 };
 
 /* Process data */
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index b44dec90969f..2bc2389cc7f0 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -309,12 +309,13 @@ static void svm_range_free(struct svm_range *prange, bool do_unmap)
 }
 
 static void
-svm_range_set_default_attributes(int32_t *location, int32_t *prefetch_loc,
- uint8_t *granularity, uint32_t *flags)
+svm_range_set_default_attributes(struct svm_range_list *svms, int32_t *location,
+ int32_t *prefetch_loc, uint8_t *granularity,
+ uint32_t *flags)
 {
 	*location = KFD_IOCTL_SVM_LOCATION_UNDEFINED;
 	*prefetch_loc = KFD_IOCTL_SVM_LOCATION_UNDEFINED;
-	*granularity = 9;
+	*granularity = svms->default_granularity;
 	*flags =
 		KFD_IOCTL_SVM_FLAG_HOST_ACCESS | KFD_IOCTL_SVM_FLAG_COHERENT;
 }
@@ -358,9 +359,10 @@ svm_range *svm_range_new(struct svm_range_list *svms, uint64_t start,
 		bitmap_copy(prange->bitmap_access, svms->bitmap_supported,
 			MAX_GPU_INSTANCE);
 
-	svm_range_set_default_attributes(&prange->preferred_loc,
+	svm_range_set_default_attributes(svms, &prange->preferred_loc,
 	 &prange->prefetch_loc,
-	 &prange->granularity, &prange->flags);
+	 &prange->granularity,
+	 &prange->flags);

unnecessary extra change.

  
 
 	pr_debug("svms 0x%p [0x%llx 0x%llx]\n", svms, start, last);
 
@@ -2694,9 +2696,10 @@ svm_range_get_range_boundaries(struct kfd_process *p, int64_t addr,
 	*is_heap_stack = vma_is_initial_heap(vma) || vma_is_initial_stack(vma);
 
 	start_limi

Re: [PATCH v2] drm/amdgpu: fix a call trace when unload amdgpu driver

2024-09-04 Thread Philip Yang

  


On 2024-09-04 04:04, Asher Song wrote:


In some APUs, the bo type of the GART page table is ttm_bo_type_sg. BOs of that type are released by bo->delayed_delete, which is queued on ttm_device->wq, rather than being released immediately.

To make sure every ttm_resource is released before the ttm_resource_manager is finalized, drain the workqueue in ttm_device.

v2: move drain_workqueue to amdgpu_ttm.c

Fixes: d99fbd9aab62 ("drm/ttm: Always take the bo delayed cleanup path for imported bos")
Suggested-by: Christian König 
Signed-off-by: Asher Song 
    
Acked-by: Philip Yang 

Most likely this will also fix another bug caused by a race condition between GPU mode 1 reset and the delayed bo cleanup worker.

Thank you.
Philip


  
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index 5c938ff0bf48..cbac21df5c47 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -2461,6 +2461,7 @@ void amdgpu_ttm_fini(struct amdgpu_device *adev)
 		drm_dev_exit(idx);
 	}
 
+	drain_workqueue(adev->mman.bdev.wq);
 	amdgpu_direct_gma_fini(adev);
 	amdgpu_vram_mgr_fini(adev);
 	amdgpu_gtt_mgr_fini(adev);


  



Re: [PATCH] drm/amdgpu: fix invalid fence handling in amdgpu_vm_tlb_flush

2024-09-04 Thread Philip Yang

  


On 2024-09-02 05:06, Christian König wrote:

  On 02.09.24 at 05:03, Lang Yu wrote:

    Fixes: 5a1c27951966 ("drm/amdgpu: implement TLB flush fence")

    Signed-off-by: Lang Yu 

  Ah yes, that explains why CPU-based updates don't work reliably
  any more.

My understanding is that amdgpu_vm_cpu_commit increases vm->tlb_seq
if needs_flush, so this patch only fixes the tlb_cb memory leak.
Regards,
Philip

  You need to add some explanation to the commit message, e.g.
  something like "CPU-based updates don't produce a fence."

  With that done, Reviewed-by: Christian König 

---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 1d46a5c81ec4..f93804902fd3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -908,10 +908,12 @@ amdgpu_vm_tlb_flush(struct amdgpu_vm_update_params *params,
 {
 	struct amdgpu_vm *vm = params->vm;
 
-	if (!fence || !*fence)
+	tlb_cb->vm = vm;
+	if (!fence || !*fence) {
+		amdgpu_vm_tlb_seq_cb(NULL, &tlb_cb->cb);
 		return;
+	}
 
-	tlb_cb->vm = vm;
 	if (!dma_fence_add_callback(*fence, &tlb_cb->cb,
 				    amdgpu_vm_tlb_seq_cb)) {
 		dma_fence_put(vm->last_tlb_flush);
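To illustrate Philip's point, a minimal sketch of the flow the fix establishes (simplified; the real callback is amdgpu_vm_tlb_seq_cb in amdgpu_vm.c, and error handling from dma_fence_add_callback is elided here): when an update produces no fence, as in the CPU-based page table update path, the callback is invoked directly so the tlb_cb allocation is always consumed exactly once instead of leaking.

	/* Illustrative sketch only, not the exact driver code. */
	static void tlb_flush_sketch(struct amdgpu_vm_update_params *params,
				     struct amdgpu_vm_tlb_seq_struct *tlb_cb,
				     struct dma_fence **fence)
	{
		tlb_cb->vm = params->vm;
		if (!fence || !*fence) {
			/* No fence (CPU update path): run the callback now,
			 * which bumps tlb_seq and frees tlb_cb.
			 */
			amdgpu_vm_tlb_seq_cb(NULL, &tlb_cb->cb);
			return;
		}
		/* Fence present: tlb_cb is consumed when the fence signals. */
		dma_fence_add_callback(*fence, &tlb_cb->cb, amdgpu_vm_tlb_seq_cb);
	}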

  
  

  



Re: [PATCH V3] drm/amdgpu: Surface svm_default_granularity, a RW module parameter

2024-09-03 Thread Philip Yang

  


On 2024-08-29 18:31, Chen, Xiaogang
  wrote:


  
  
  On 8/29/2024 5:13 PM, Ramesh Errabolu wrote:
  


Enables users to update SVM's default granularity, used in
buffer migration and handling of recoverable page faults.
Param value is set in terms of log(numPages(buffer)),
e.g. 9 for a 2 MiB buffer

  
  
  I forgot to ask whether this parameter is a customer request or is
  used for debug/experiment purposes. If it is the latter, how about
  putting it in debugfs? There are already many driver parameters.
  

debugfs is not always available, depending on kernel configuration,
  and debugfs is meant for debugging purposes, e.g.
  /sys/kernel/debug/kfd/mqds and hqds, not for functional use. One
  comment embedded below.
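For reference, how a granularity value maps to a migration size, as a minimal sketch (4 KiB base pages assumed; illustrative helper, not part of the patch):

	/* granularity g means 2^g pages are migrated as one unit */
	static unsigned long svm_granularity_to_bytes(unsigned int g)
	{
		return PAGE_SIZE << g;	/* g = 9: 512 pages * 4 KiB = 2 MiB */
	}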


  
  Regards
  
  
  Xiaogang
  
  
  Signed-off-by: Ramesh Errabolu


---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h     |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 17 +
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h   |  6 ++
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c    | 25 +++--
 4 files changed, 39 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index e8c284aea1f2..8eb934af02f2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -237,6 +237,7 @@ extern int sched_policy;
 extern bool debug_evictions;
 extern bool no_system_mem_limit;
 extern int halt_if_hws_hang;
+extern uint amdgpu_svm_default_granularity;
 #else
 static const int __maybe_unused sched_policy = KFD_SCHED_POLICY_HWS;
 static const bool __maybe_unused debug_evictions; /* = false */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index b9529948f2b2..442039436cb3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -169,6 +169,16 @@ uint amdgpu_sdma_phase_quantum = 32;
 char *amdgpu_disable_cu;
 char *amdgpu_virtual_display;
 bool enforce_isolation;
+
+/* Specifies the default granularity for SVM, used in buffer
+ * migration and restoration of backing memory when handling
+ * recoverable page faults.
+ *
+ * The value is given as log(numPages(buffer)); for a 2 MiB
+ * buffer it computes to be 9
+ */
+uint amdgpu_svm_default_granularity = 9;
+
 /*
  * OverDrive(bit 14) disabled by default
  * GFX DCS(bit 19) disabled by default
@@ -320,6 +330,13 @@ module_param_named(pcie_gen2, amdgpu_pcie_gen2, int, 0444);
 MODULE_PARM_DESC(msi, "MSI support (1 = enable, 0 = disable, -1 = auto)");
 module_param_named(msi, amdgpu_msi, int, 0444);
 
+/**
+ * DOC: svm_default_granularity (uint)
+ * Used in buffer migration and handling of recoverable page faults
+ */
+MODULE_PARM_DESC(svm_default_granularity, "SVM's default granularity in log(2^Pages), default 9 = 2^9 = 2 MiB");
+module_param_named(svm_default_granularity, amdgpu_svm_default_granularity, uint, 0644);
+
 /**
  * DOC: lockup_timeout (string)
  * Set GPU scheduler timeout value in ms.
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 9ae9abc6eb43..d6530febabad 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -868,6 +868,12 @@ struct svm_range_list {
 	struct task_struct		*faulting_task;
 	/* check point ts decides if page fault recovery need be dropped */
 	uint64_t			checkpoint_ts[MAX_GPU_INSTANCE];

Re: [PATCH] drm/amdkfd: restore_process_worker race with GPU reset

2024-08-29 Thread Philip Yang

  


On 2024-08-29 17:15, Felix Kuehling wrote:

  On 2024-08-23 15:49, Philip Yang wrote:

If GPU reset kicks in while the KFD restore_process_worker is running,
this may cause different issues, for example the rcu stall warning
below, because the restore work may move BOs and evict queues under
VRAM pressure.

Fix this race by taking the adev reset_domain read semaphore to prevent
GPU reset in restore_process_worker; the reset read semaphore can be
taken recursively if the adev has multiple partitions.

Then there is a live-lock issue if the CP hangs while
restore_process_worker runs: GPU reset waits for the semaphore to
start, and restore_process_worker cannot finish to release the
semaphore. We need to signal the eviction fence to resolve the
live-lock if evicting the queues returns -ETIMEDOUT (for the MES path)
or -ETIME (for the HWS path) because the CP hangs,

  amdgpu :af:00.0: amdgpu: GPU reset(21) succeeded!
  rcu: INFO: rcu_sched self-detected stall on CPU

  Workqueue: kfd_restore_wq restore_process_worker [amdgpu]
  Call Trace:
   update_process_times+0x94/0xd0
  RIP: 0010:amdgpu_vm_handle_moved+0x9a/0x210 [amdgpu]
   amdgpu_amdkfd_gpuvm_restore_process_bos+0x3d6/0x7d0 [amdgpu]
   restore_process_helper+0x27/0x80 [amdgpu]

Signed-off-by: Philip Yang 

  See comments inline. I'd also like Christian to take a look at
  this patch since he's the expert on the reset locking stuff.

---
 drivers/gpu/drm/amd/amdkfd/kfd_process.c | 56 +++-
 1 file changed, 55 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index a902950cc060..53a814347522 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -35,6 +35,7 @@
 #include 
 #include "amdgpu_amdkfd.h"
 #include "amdgpu.h"
+#include "amdgpu_reset.h"
 
 struct mm_struct;
 
@@ -1972,8 +1973,14 @@ static void evict_process_worker(struct work_struct *work)
 		kfd_process_restore_queues(p);
 
 		pr_debug("Finished evicting pasid 0x%x\n", p->pasid);
-	} else
+	} else if (ret == -ETIMEDOUT || ret == -ETIME) {
+		/* If CP hangs, signal the eviction fence, then restore_bo_worker
+		 * can finish to up_read GPU reset semaphore to start GPU reset.
+		 */
+		signal_eviction_fence(p);
+	} else {
 		pr_err("Failed to evict queues of pasid 0x%x\n", p->pasid);
+	}
 }
 
 static int restore_process_helper(struct kfd_process *p)
@@ -1997,6 +2004,45 @@ static int restore_process_helper(struct kfd_process *p)
 	return ret;
 }
 
+/*
+ * kfd_hold_devices_reset_semaphore
+ *
+ * return:
+ *   true : hold reset domain semaphore to prevent device reset
+ *   false: one of the device is resetting or already reset
+ *
+ */
+static bool kfd_hold_devices_reset_semaphore(struct kfd_process *p)

  I find the function naming of these functions (hold/unhold) a bit
  weird. I'd suggest
  kfd_process_trylock_reset_sems/kfd_process_unlock_reset_sems.

ok

+{
+	struct amdgpu_device *adev;
+	int i;
+
+	for (i = 0; i < p->n_pdds; i++) {
+		adev = p->pdds[i]->dev->adev;
+		if (!down_read_trylock(&adev->reset_domain->sem))
+			goto out_upread;
+	}
+	return true;
+
+out_upread:
+	while (i--) {
+		adev = p->pdds[i]->dev->adev;

Re: [PATCH] drm/amdkfd: restore_process_worker race with GPU reset

2024-08-29 Thread Philip Yang

  


On 2024-08-28 18:01, Felix Kuehling wrote:

  On 2024-08-23 15:49, Philip Yang wrote:

If GPU reset kicks in while the KFD restore_process_worker is running,
this may cause different issues, for example the rcu stall warning
below, because the restore work may move BOs and evict queues under
VRAM pressure.

Fix this race by taking the adev reset_domain read semaphore to prevent
GPU reset in restore_process_worker; the reset read semaphore can be
taken recursively if the adev has multiple partitions.

  Are you sure that an rw_sem can be read-locked recursively in the
  same thread? I can't find any evidence that this is true.

Yes, down_read_trylock(&adev->reset_domain->sem) still returns 1
(success) when called in a thread that already holds the read lock from
an earlier down_read_trylock(&adev->reset_domain->sem).
This works fine in many paths now; for example, execute_queues_cpsch
read-locks reset_domain->sem, then unmap_queues_cpsch read-locks
reset_domain->sem again.
Regards,
Philip
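A minimal sketch of the behavior relied on here (illustrative only): kernel rwsems do not track reader ownership, so a second down_read_trylock() from the same thread succeeds as long as no writer holds or is waiting on the lock. That is also why every trylock must still be checked — a writer arriving in between makes the inner trylock fail, which is the live-lock scenario the patch has to handle.

	static void nested_read_trylock_sketch(struct rw_semaphore *sem)
	{
		if (!down_read_trylock(sem))		/* outer read lock */
			return;
		if (down_read_trylock(sem)) {		/* same thread: succeeds if no writer */
			/* ... nested critical section ... */
			up_read(sem);			/* balance the inner trylock */
		}
		up_read(sem);				/* balance the outer trylock */
	}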


  
  Regards,
    Felix

Then there is a live-lock issue if the CP hangs while
restore_process_worker runs: GPU reset waits for the semaphore to
start, and restore_process_worker cannot finish to release the
semaphore. We need to signal the eviction fence to resolve the
live-lock if evicting the queues returns -ETIMEDOUT (for the MES path)
or -ETIME (for the HWS path) because the CP hangs,

  amdgpu :af:00.0: amdgpu: GPU reset(21) succeeded!
  rcu: INFO: rcu_sched self-detected stall on CPU

  Workqueue: kfd_restore_wq restore_process_worker [amdgpu]
  Call Trace:
   update_process_times+0x94/0xd0
  RIP: 0010:amdgpu_vm_handle_moved+0x9a/0x210 [amdgpu]
   amdgpu_amdkfd_gpuvm_restore_process_bos+0x3d6/0x7d0 [amdgpu]
   restore_process_helper+0x27/0x80 [amdgpu]

Signed-off-by: Philip Yang 

---
 drivers/gpu/drm/amd/amdkfd/kfd_process.c | 56 +++-
 1 file changed, 55 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index a902950cc060..53a814347522 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -35,6 +35,7 @@
 #include 
 #include "amdgpu_amdkfd.h"
 #include "amdgpu.h"
+#include "amdgpu_reset.h"
 
 struct mm_struct;
 
@@ -1972,8 +1973,14 @@ static void evict_process_worker(struct work_struct *work)
 		kfd_process_restore_queues(p);
 
 		pr_debug("Finished evicting pasid 0x%x\n", p->pasid);
-	} else
+	} else if (ret == -ETIMEDOUT || ret == -ETIME) {
+		/* If CP hangs, signal the eviction fence, then restore_bo_worker
+		 * can finish to up_read GPU reset semaphore to start GPU reset.
+		 */
+		signal_eviction_fence(p);
+	} else {
 		pr_err("Failed to evict queues of pasid 0x%x\n", p->pasid);
+	}
 }
 
 static int restore_process_helper(struct kfd_process *p)
@@ -1997,6 +2004,45 @@ static int restore_process_helper(struct kfd_process *p)
 	return ret;
 }
 
+/*
+ * kfd_hold_devices_reset_semaphore
+ *
+ * return:
+ *   true : hold reset domain semaphore to prevent device reset
+ *   false: one of the device is resetting or already reset
+ *
+ */
+static bool kfd_hold_devices_reset_semaphore(struct kfd_process *p)
+{
+	struct amdgpu_device *adev;
+	int i;
+
+	for (i = 0; i < p->n_pdds; i++) {
+		adev = p->pdds[i]->dev->adev;
+		if (!down_read_trylock(&adev->reset_domain->sem))
+			goto out

Re: [PATCH v2] drm/amdgpu: Surface svm_attr_gobm, a RW module parameter

2024-08-28 Thread Philip Yang

  


On 2024-08-26 15:34, Ramesh Errabolu
  wrote:


  Enables users to update the default size of buffer used
in migration either from Sysmem to VRAM or vice versa.
The param GOBM refers to granularity of buffer migration,
and is specified in terms of log(numPages(buffer)). It
facilitates users of unregistered memory to control GOBM,
albeit at a coarse level.

It is used for both registered and unregistered range cases.

Signed-off-by: Ramesh Errabolu 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h |  4 
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 18 +
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h   | 12 
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c| 26 -
 4 files changed, 51 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index e8c284aea1f2..73dd816b01f2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -237,6 +237,7 @@ extern int sched_policy;
 extern bool debug_evictions;
 extern bool no_system_mem_limit;
 extern int halt_if_hws_hang;
+extern uint amdgpu_svm_attr_gobm;
 #else
 static const int __maybe_unused sched_policy = KFD_SCHED_POLICY_HWS;
 static const bool __maybe_unused debug_evictions; /* = false */
@@ -313,6 +314,9 @@ extern int amdgpu_wbrf;
 /* Extra time delay(in ms) to eliminate the influence of temperature momentary fluctuation */
 #define AMDGPU_SWCTF_EXTRA_DELAY		50
 
+/* Default size of buffer to use in migrating buffer */
+#define AMDGPU_SVM_ATTR_GOBM	9
+
 struct amdgpu_xcp_mgr;
 struct amdgpu_device;
 struct amdgpu_irq_src;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index b9529948f2b2..09c501753a3b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -169,6 +169,17 @@ uint amdgpu_sdma_phase_quantum = 32;
 char *amdgpu_disable_cu;
 char *amdgpu_virtual_display;
 bool enforce_isolation;
+
+/* Specifies the default size of buffer to use in
+ * migrating buffer from Sysmem to VRAM and vice
+ * versa
+ *
+ * GOBM - Granularity of Buffer Migration
+ *
+ * Defined as log2(sizeof(buffer)/PAGE_SIZE)
+ */
+uint amdgpu_svm_attr_gobm = AMDGPU_SVM_ATTR_GOBM;
+
 /*
  * OverDrive(bit 14) disabled by default
  * GFX DCS(bit 19) disabled by default
@@ -320,6 +331,13 @@ module_param_named(pcie_gen2, amdgpu_pcie_gen2, int, 0444);
 MODULE_PARM_DESC(msi, "MSI support (1 = enable, 0 = disable, -1 = auto)");
 module_param_named(msi, amdgpu_msi, int, 0444);
 
+/**
+ * DOC: svm_attr_gobm (uint)
+ * Size of buffer to use in migrating buffer from Sysmem to VRAM and vice versa
+ */
+MODULE_PARM_DESC(svm_attr_gobm, "Defined as log2(sizeof(buffer)/PAGE_SIZE), e.g. 9 for 2 MiB");
+module_param_named(svm_attr_gobm, amdgpu_svm_attr_gobm, uint, 0644);
+
 /**
  * DOC: lockup_timeout (string)
  * Set GPU scheduler timeout value in ms.
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 9ae9abc6eb43..c2e54b18c167 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -868,6 +868,18 @@ struct svm_range_list {
 	struct task_struct		*faulting_task;
 	/* check point ts decides if page fault recovery need be dropped */
 	uint64_t			checkpoint_ts[MAX_GPU_INSTANCE];
+
+	/* Indicates the default size to use in migrating
+	 * buffers of a process from Sysmem to VRAM and vice
+	 * versa. The max legal value cannot be greater than
+	 * 0x3F
+	 *
+	 * @note: A side effect of this symbol being part of
+	 * struct svm_range_list is that it forces all buffers
+	 * of the process of unregistered kind to use the same
+	 * size in buffer migration
+	 */

The comment is good enough; the note may be misleading and is not needed.

  
+	uint8_t attr_gobm;
 };
 
 /* Process data */
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index b44dec90969f..78c78baddb1f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -309,12 +309,11 @@ static void svm_range_free(struct svm_range *prange, bool do_unmap)
 }
 
 static void
-svm_range_set_default_attributes(int32_t *location, int32_t *prefetch_loc,
- uint8_t *granularity, uint32_t *flags)
+svm_range_set_default_attributes(int32_t *location,
+			int32_t *prefetch_loc, uint32_t *flags)
 {
 	*location = KFD_IOCTL_SVM_LOCATION_UNDEFINED;
 	*prefetch_loc = KFD_IOCTL_SVM_LOCATION_UNDEFINED;
-	*granularity = 9;

Since svm_range_set_default_attributes is called in multiple places,
  add a new parameter, struct svm_range_list *svms, to remove the
  duplicated code.

*granularity = svms->attr_gobm;


  
 	*flags =
 		KFD_IOCTL_SVM_FLAG_HOST_ACCESS | KFD_IOCTL_SVM_FLAG_COHERENT;
 }
@@ -358,9 +357,9 @@ svm_range *svm_range_new(struct svm_range_list *svms, uint64_t start,
 		bitmap_copy(prange->bitmap_a

[PATCH v3 0/4] Improve SVM migrate event report

2024-08-27 Thread Philip Yang
1. Document how to use SMI system management interface to receive SVM
events, define string format macro for user mode.
2. Increase the event kfifo size, so there is less chance of dropping events.
3. Output migrate end event with error code if migration failed.
4. Report dropped event count if fifo is full.

v3:
  Simplify event drop count handling (James Zhu) 

Philip Yang (4):
  drm/amdkfd: Document and define SVM events message macro
  drm/amdkfd: Output migrate end event if migrate failed
  drm/amdkfd: Increase SMI event fifo size
  drm/amdkfd: SMI report dropped event count

 drivers/gpu/drm/amd/amdkfd/kfd_migrate.c|  14 ++-
 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c |  79 +--
 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h |   3 +-
 include/uapi/linux/kfd_ioctl.h  | 107 +---
 4 files changed, 150 insertions(+), 53 deletions(-)
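As a consumer-side sketch (hypothetical user-mode snippet; it assumes the KFD_SMI_EVENT_* ids and KFD_EVENT_FMT_* format strings added to kfd_ioctl.h by patch 1, and the dropped-event record from patch 4), each record read from the SMI fd is "<event id in hex> <payload>":

	#include <stdio.h>

	/* Parse one event line; a DROPPED_EVENT payload is formatted as
	 * KFD_EVENT_FMT_DROPPED_EVENT: "%lld -%d %d\n" (timestamp, pid, count).
	 */
	static void parse_smi_line(const char *line)
	{
		unsigned int event;
		long long ns;
		int pid, drop_count, off = 0;

		if (sscanf(line, "%x %n", &event, &off) != 1)
			return;
		if (event == 12 /* KFD_SMI_EVENT_DROPPED_EVENT */ &&
		    sscanf(line + off, "%lld -%d %d", &ns, &pid, &drop_count) == 3)
			printf("pid %d: %d events dropped at %lld ns\n",
			       pid, drop_count, ns);
	}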

-- 
2.43.2



[PATCH v3 4/4] drm/amdkfd: SMI report dropped event count

2024-08-27 Thread Philip Yang
Add new SMI event to report the dropped event count.

When the event kfifo is full, the drop count is already non-zero, or there
is not enough space left to store the event message, increase the drop count.

After reading events out of the kfifo, if any events were dropped (drop_count
is not zero), generate a dropped-event record and reset the drop count to zero.
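The bookkeeping this implements, condensed as a sketch (field names from the diff below): once anything has been dropped, the producer stops queueing normal events so record ordering stays truthful, and the reader emits one synthetic record under the same lock right after draining.

	/* Producer side, under client->lock: */
	if (!client->drop_count && kfifo_avail(&client->fifo) >= len)
		kfifo_in(&client->fifo, event_msg, len);	/* normal path */
	else
		client->drop_count++;				/* count, don't queue */

	/* Reader side, still under client->lock after kfifo_out():
	 * queue one DROPPED_EVENT record carrying drop_count, then zero it.
	 */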

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 25 +
 include/uapi/linux/kfd_ioctl.h  |  6 +
 2 files changed, 27 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
index 9b8169761ec5..db321b8ea127 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
@@ -42,6 +42,7 @@ struct kfd_smi_client {
struct rcu_head rcu;
pid_t pid;
bool suser;
+   u32 drop_count;
 };
 
 #define KFD_MAX_KFIFO_SIZE 8192
@@ -103,12 +104,26 @@ static ssize_t kfd_smi_ev_read(struct file *filep, char 
__user *user,
}
to_copy = min(size, to_copy);
ret = kfifo_out(&client->fifo, buf, to_copy);
-   spin_unlock(&client->lock);
if (ret <= 0) {
+   spin_unlock(&client->lock);
ret = -EAGAIN;
goto ret_err;
}
 
+   if (client->drop_count) {
+   char msg[KFD_SMI_EVENT_MSG_SIZE];
+   int len;
+
+   len = snprintf(msg, sizeof(msg), "%x ", 
KFD_SMI_EVENT_DROPPED_EVENT);
+   len += snprintf(msg + len, sizeof(msg) - len,
+   
KFD_EVENT_FMT_DROPPED_EVENT(ktime_get_boottime_ns(),
+   client->pid, client->drop_count));
+   kfifo_in(&client->fifo, msg, len);
+   client->drop_count = 0;
+   }
+
+   spin_unlock(&client->lock);
+
ret = copy_to_user(user, buf, to_copy);
if (ret) {
ret = -EFAULT;
@@ -182,13 +197,15 @@ static void add_event_to_kfifo(pid_t pid, struct kfd_node 
*dev,
list_for_each_entry_rcu(client, &dev->smi_clients, list) {
if (!kfd_smi_ev_enabled(pid, client, smi_event))
continue;
+
spin_lock(&client->lock);
-   if (kfifo_avail(&client->fifo) >= len) {
+   if (!client->drop_count && kfifo_avail(&client->fifo) >= len) {
kfifo_in(&client->fifo, event_msg, len);
wake_up_all(&client->wait_queue);
} else {
-   pr_debug("smi_event(EventID: %u): no space left\n",
-   smi_event);
+   client->drop_count++;
+   pr_debug("smi_event(EventID: %u): no space left 
drop_count %d\n",
+smi_event, client->drop_count);
}
spin_unlock(&client->lock);
}
diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index fa9f9846b88e..7afd66d45313 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -530,6 +530,7 @@ enum kfd_smi_event {
KFD_SMI_EVENT_QUEUE_EVICTION = 9,
KFD_SMI_EVENT_QUEUE_RESTORE = 10,
KFD_SMI_EVENT_UNMAP_FROM_GPU = 11,
+   KFD_SMI_EVENT_DROPPED_EVENT = 12,
 
/*
 * max event number, as a flag bit to get events from all processes,
@@ -610,6 +611,7 @@ struct kfd_ioctl_smi_events_args {
  *rw: 'W' for write page fault, 'R' for read page fault
  *rescheduled: 'R' if the queue restore failed and rescheduled to try again
  *error_code: migrate failure error code, 0 if no error
+ *drop_count: how many events dropped when fifo is full
  */
 #define KFD_EVENT_FMT_UPDATE_GPU_RESET(reset_seq_num, reset_cause)\
"%x %s\n", (reset_seq_num), (reset_cause)
@@ -645,6 +647,10 @@ struct kfd_ioctl_smi_events_args {
"%lld -%d @%lx(%lx) %x %d\n", (ns), (pid), (addr), (size),\
(node), (unmap_trigger)
 
+#define KFD_EVENT_FMT_DROPPED_EVENT(ns, pid, drop_count)\
+   "%lld -%d %d\n", (ns), (pid), (drop_count)
+
+
 
/**
  * CRIU IOCTLs (Checkpoint Restore In Userspace)
  *
-- 
2.43.2



[PATCH v3 1/4] drm/amdkfd: Document and define SVM events message macro

2024-08-27 Thread Philip Yang
Document how to use SMI system management interface to enable and
receive SVM events. Document SVM event triggers.

Define SVM event message string format macros that user mode can use with
sscanf to parse the events. Add them to the uAPI header file to make it
obvious that changing them in the future changes the uAPI.

No functional changes.
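For context, a sketch of how a user-mode client enables these events (hedged: the ioctl, struct fields, and mask helper are as defined in kfd_ioctl.h; /dev/kfd open and error handling are elided):

	#include <stdint.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/kfd_ioctl.h>

	static int open_smi_events(int kfd_fd, uint32_t gpuid)
	{
		struct kfd_ioctl_smi_events_args args = { .gpuid = gpuid };
		uint64_t mask;

		if (ioctl(kfd_fd, AMDKFD_IOC_SMI_EVENTS, &args) < 0)
			return -1;

		/* enable migrate start/end events by writing the mask to the fd */
		mask = KFD_SMI_EVENT_MASK_FROM_INDEX(KFD_SMI_EVENT_MIGRATE_START) |
		       KFD_SMI_EVENT_MASK_FROM_INDEX(KFD_SMI_EVENT_MIGRATE_END);
		write(args.anon_fd, &mask, sizeof(mask));

		return args.anon_fd;	/* poll()/read() event records from here */
	}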

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c |  45 +
 include/uapi/linux/kfd_ioctl.h  | 100 +---
 2 files changed, 109 insertions(+), 36 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
index ea6a8e43bd5b..de8b9abf7afc 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
@@ -235,17 +235,16 @@ void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, 
bool post_reset,
amdgpu_reset_get_desc(reset_context, reset_cause,
  sizeof(reset_cause));
 
-   kfd_smi_event_add(0, dev, event, "%x %s\n",
- dev->reset_seq_num,
- reset_cause);
+   kfd_smi_event_add(0, dev, event, KFD_EVENT_FMT_UPDATE_GPU_RESET(
+ dev->reset_seq_num, reset_cause));
 }
 
 void kfd_smi_event_update_thermal_throttling(struct kfd_node *dev,
 uint64_t throttle_bitmask)
 {
-   kfd_smi_event_add(0, dev, KFD_SMI_EVENT_THERMAL_THROTTLE, "%llx:%llx\n",
+   kfd_smi_event_add(0, dev, KFD_SMI_EVENT_THERMAL_THROTTLE, 
KFD_EVENT_FMT_THERMAL_THROTTLING(
  throttle_bitmask,
- amdgpu_dpm_get_thermal_throttling_counter(dev->adev));
+ 
amdgpu_dpm_get_thermal_throttling_counter(dev->adev)));
 }
 
 void kfd_smi_event_update_vmfault(struct kfd_node *dev, uint16_t pasid)
@@ -256,8 +255,8 @@ void kfd_smi_event_update_vmfault(struct kfd_node *dev, 
uint16_t pasid)
if (task_info) {
/* Report VM faults from user applications, not retry from 
kernel */
if (task_info->pid)
-   kfd_smi_event_add(0, dev, KFD_SMI_EVENT_VMFAULT, 
"%x:%s\n",
-task_info->pid, task_info->task_name);
+   kfd_smi_event_add(0, dev, KFD_SMI_EVENT_VMFAULT, 
KFD_EVENT_FMT_VMFAULT(
+ task_info->pid, 
task_info->task_name));
amdgpu_vm_put_task_info(task_info);
}
 }
@@ -267,16 +266,16 @@ void kfd_smi_event_page_fault_start(struct kfd_node 
*node, pid_t pid,
ktime_t ts)
 {
kfd_smi_event_add(pid, node, KFD_SMI_EVENT_PAGE_FAULT_START,
- "%lld -%d @%lx(%x) %c\n", ktime_to_ns(ts), pid,
- address, node->id, write_fault ? 'W' : 'R');
+ KFD_EVENT_FMT_PAGEFAULT_START(ktime_to_ns(ts), pid,
+ address, node->id, write_fault ? 'W' : 'R'));
 }
 
 void kfd_smi_event_page_fault_end(struct kfd_node *node, pid_t pid,
  unsigned long address, bool migration)
 {
kfd_smi_event_add(pid, node, KFD_SMI_EVENT_PAGE_FAULT_END,
- "%lld -%d @%lx(%x) %c\n", ktime_get_boottime_ns(),
- pid, address, node->id, migration ? 'M' : 'U');
+ KFD_EVENT_FMT_PAGEFAULT_END(ktime_get_boottime_ns(),
+ pid, address, node->id, migration ? 'M' : 'U'));
 }
 
 void kfd_smi_event_migration_start(struct kfd_node *node, pid_t pid,
@@ -286,9 +285,9 @@ void kfd_smi_event_migration_start(struct kfd_node *node, 
pid_t pid,
   uint32_t trigger)
 {
kfd_smi_event_add(pid, node, KFD_SMI_EVENT_MIGRATE_START,
- "%lld -%d @%lx(%lx) %x->%x %x:%x %d\n",
+ KFD_EVENT_FMT_MIGRATE_START(
  ktime_get_boottime_ns(), pid, start, end - start,
- from, to, prefetch_loc, preferred_loc, trigger);
+ from, to, prefetch_loc, preferred_loc, trigger));
 }
 
 void kfd_smi_event_migration_end(struct kfd_node *node, pid_t pid,
@@ -296,24 +295,24 @@ void kfd_smi_event_migration_end(struct kfd_node *node, 
pid_t pid,
 uint32_t from, uint32_t to, uint32_t trigger)
 {
kfd_smi_event_add(pid, node, KFD_SMI_EVENT_MIGRATE_END,
- "%lld -%d @%lx(%lx) %x->%x %d\n",
+ KFD_EVENT_FMT_MIGRATE_END(
  ktime_get_boottime_ns(), pid, start, end - start,
- from, to, trigger);
+   

[PATCH v3 3/4] drm/amdkfd: Increase SMI event fifo size

2024-08-27 Thread Philip Yang
The 1KB SMI event fifo is enough to report GPU vm fault or reset events, but
it could drop the more frequent SVM migration events. Increase the kfifo size
to 8KB, enough to store about 100 migrate events, so there is less chance of
dropping migrate events when many migrations happen in a short period of
time. Add the KFD prefix to the macro name.
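The rough sizing math behind the 8KB figure, as a sketch (the ~80-byte record size is an assumption based on the formatted migrate message length, not a number from the patch):

	/* KFD_MAX_KFIFO_SIZE = 8192 bytes; one formatted migrate event
	 * record is on the order of 80 bytes, so roughly
	 *   8192 / 80 ~= 100
	 * records fit before the fifo fills, matching the estimate above.
	 */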

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
index 1d94b445a060..9b8169761ec5 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
@@ -44,7 +44,7 @@ struct kfd_smi_client {
bool suser;
 };
 
-#define MAX_KFIFO_SIZE 1024
+#define KFD_MAX_KFIFO_SIZE 8192
 
 static __poll_t kfd_smi_ev_poll(struct file *, struct poll_table_struct *);
 static ssize_t kfd_smi_ev_read(struct file *, char __user *, size_t, loff_t *);
@@ -86,7 +86,7 @@ static ssize_t kfd_smi_ev_read(struct file *filep, char 
__user *user,
struct kfd_smi_client *client = filep->private_data;
unsigned char *buf;
 
-   size = min_t(size_t, size, MAX_KFIFO_SIZE);
+   size = min_t(size_t, size, KFD_MAX_KFIFO_SIZE);
buf = kmalloc(size, GFP_KERNEL);
if (!buf)
return -ENOMEM;
@@ -355,7 +355,7 @@ int kfd_smi_event_open(struct kfd_node *dev, uint32_t *fd)
return -ENOMEM;
INIT_LIST_HEAD(&client->list);
 
-   ret = kfifo_alloc(&client->fifo, MAX_KFIFO_SIZE, GFP_KERNEL);
+   ret = kfifo_alloc(&client->fifo, KFD_MAX_KFIFO_SIZE, GFP_KERNEL);
if (ret) {
kfree(client);
return ret;
-- 
2.43.2



[PATCH v3 2/4] drm/amdkfd: Output migrate end event if migrate failed

2024-08-27 Thread Philip Yang
If page migration failed, also output the migrate end event to match the
migrate start event, with the failure error_code appended to the end of the
migrate message macro. This does not break the uAPI because applications
using the old message macro with sscanf simply drop and ignore the trailing
error_code.
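A sketch of why old consumers keep working (hypothetical user-mode snippet; the format shown is the pre-change MIGRATE_END layout, with the error_code assumed appended as one extra field): sscanf with the old pattern matches the leading fields and never reads the trailing error_code.

	#include <stdio.h>

	static void parse_migrate_end_old(const char *payload)
	{
		long long ns;
		int pid, trigger;
		unsigned long addr, size;
		unsigned int from, to;

		/* old pattern "%lld -%d @%lx(%lx) %x->%x %d" still matches the
		 * new record; the appended error_code is left unconsumed.
		 */
		if (sscanf(payload, "%lld -%d @%lx(%lx) %x->%x %d",
			   &ns, &pid, &addr, &size, &from, &to, &trigger) == 7)
			printf("migration at 0x%lx (0x%lx pages) finished\n",
			       addr, size);
	}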

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdkfd/kfd_migrate.c| 14 ++
 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c |  5 +++--
 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h |  3 ++-
 include/uapi/linux/kfd_ioctl.h  |  7 ---
 4 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
index 8ee3d07ffbdf..eacfeb32f35d 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
@@ -445,14 +445,13 @@ svm_migrate_vma_to_vram(struct kfd_node *node, struct 
svm_range *prange,
pr_debug("successful/cpages/npages 0x%lx/0x%lx/0x%lx\n",
 mpages, cpages, migrate.npages);
 
-   kfd_smi_event_migration_end(node, p->lead_thread->pid,
-   start >> PAGE_SHIFT, end >> PAGE_SHIFT,
-   0, node->id, trigger);
-
svm_range_dma_unmap_dev(adev->dev, scratch, 0, npages);
 
 out_free:
kvfree(buf);
+   kfd_smi_event_migration_end(node, p->lead_thread->pid,
+   start >> PAGE_SHIFT, end >> PAGE_SHIFT,
+   0, node->id, trigger, r);
 out:
if (!r && mpages) {
pdd = svm_range_get_pdd_by_node(prange, node);
@@ -751,14 +750,13 @@ svm_migrate_vma_to_ram(struct kfd_node *node, struct 
svm_range *prange,
svm_migrate_copy_done(adev, mfence);
migrate_vma_finalize(&migrate);
 
-   kfd_smi_event_migration_end(node, p->lead_thread->pid,
-   start >> PAGE_SHIFT, end >> PAGE_SHIFT,
-   node->id, 0, trigger);
-
svm_range_dma_unmap_dev(adev->dev, scratch, 0, npages);
 
 out_free:
kvfree(buf);
+   kfd_smi_event_migration_end(node, p->lead_thread->pid,
+   start >> PAGE_SHIFT, end >> PAGE_SHIFT,
+   node->id, 0, trigger, r);
 out:
if (!r && cpages) {
mpages = cpages - upages;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
index de8b9abf7afc..1d94b445a060 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
@@ -292,12 +292,13 @@ void kfd_smi_event_migration_start(struct kfd_node *node, 
pid_t pid,
 
 void kfd_smi_event_migration_end(struct kfd_node *node, pid_t pid,
 unsigned long start, unsigned long end,
-uint32_t from, uint32_t to, uint32_t trigger)
+uint32_t from, uint32_t to, uint32_t trigger,
+int error_code)
 {
kfd_smi_event_add(pid, node, KFD_SMI_EVENT_MIGRATE_END,
  KFD_EVENT_FMT_MIGRATE_END(
  ktime_get_boottime_ns(), pid, start, end - start,
- from, to, trigger));
+ from, to, trigger, error_code));
 }
 
 void kfd_smi_event_queue_eviction(struct kfd_node *node, pid_t pid,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
index 85010b8307f8..503bff13d815 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
@@ -44,7 +44,8 @@ void kfd_smi_event_migration_start(struct kfd_node *node, 
pid_t pid,
 uint32_t trigger);
 void kfd_smi_event_migration_end(struct kfd_node *node, pid_t pid,
 unsigned long start, unsigned long end,
-uint32_t from, uint32_t to, uint32_t trigger);
+uint32_t from, uint32_t to, uint32_t trigger,
+int error_code);
 void kfd_smi_event_queue_eviction(struct kfd_node *node, pid_t pid,
  uint32_t trigger);
 void kfd_smi_event_queue_restore(struct kfd_node *node, pid_t pid);
diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index 717307d6b5b7..fa9f9846b88e 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -609,6 +609,7 @@ struct kfd_ioctl_smi_events_args {
  *migrate_update: GPU page fault is recovered by 'M' for migrate, 'U' for 
update
  *rw: 'W' for write page fault, 'R' for read page fault
  *rescheduled: 'R' if the queue restore failed and rescheduled to

[PATCH] drm/amdkfd: restore_process_worker race with GPU reset

2024-08-23 Thread Philip Yang
If GPU reset kicks in while the KFD restore_process_worker is running, this
may cause different issues, for example the rcu stall warning below, because
the restore work may move BOs and evict queues under VRAM pressure.

Fix this race by taking the adev reset_domain read semaphore to prevent GPU
reset in restore_process_worker; the reset read semaphore can be taken
recursively if the adev has multiple partitions.

Then there is a live-lock issue if the CP hangs while restore_process_worker
runs: GPU reset waits for the semaphore to start, and restore_process_worker
cannot finish to release the semaphore. We need to signal the eviction fence
to resolve the live-lock if evicting the queues returns -ETIMEDOUT (for the
MES path) or -ETIME (for the HWS path) because the CP hangs,

 amdgpu :af:00.0: amdgpu: GPU reset(21) succeeded!
 rcu: INFO: rcu_sched self-detected stall on CPU

 Workqueue: kfd_restore_wq restore_process_worker [amdgpu]
 Call Trace:
  update_process_times+0x94/0xd0
 RIP: 0010:amdgpu_vm_handle_moved+0x9a/0x210 [amdgpu]
  amdgpu_amdkfd_gpuvm_restore_process_bos+0x3d6/0x7d0 [amdgpu]
  restore_process_helper+0x27/0x80 [amdgpu]

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdkfd/kfd_process.c | 56 +++-
 1 file changed, 55 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index a902950cc060..53a814347522 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -35,6 +35,7 @@
 #include 
 #include "amdgpu_amdkfd.h"
 #include "amdgpu.h"
+#include "amdgpu_reset.h"
 
 struct mm_struct;
 
@@ -1972,8 +1973,14 @@ static void evict_process_worker(struct work_struct 
*work)
kfd_process_restore_queues(p);
 
pr_debug("Finished evicting pasid 0x%x\n", p->pasid);
-   } else
+   } else if (ret == -ETIMEDOUT || ret == -ETIME) {
+   /* If CP hangs, signal the eviction fence, then 
restore_bo_worker
+* can finish to up_read GPU reset semaphore to start GPU reset.
+*/
+   signal_eviction_fence(p);
+   } else {
pr_err("Failed to evict queues of pasid 0x%x\n", p->pasid);
+   }
 }
 
 static int restore_process_helper(struct kfd_process *p)
@@ -1997,6 +2004,45 @@ static int restore_process_helper(struct kfd_process *p)
return ret;
 }
 
+/*
+ * kfd_hold_devices_reset_semaphore
+ *
+ * return:
+ *   true : hold reset domain semaphore to prevent device reset
+ *   false: one of the device is resetting or already reset
+ *
+ */
+static bool kfd_hold_devices_reset_semaphore(struct kfd_process *p)
+{
+   struct amdgpu_device *adev;
+   int i;
+
+   for (i = 0; i < p->n_pdds; i++) {
+   adev = p->pdds[i]->dev->adev;
+   if (!down_read_trylock(&adev->reset_domain->sem))
+   goto out_upread;
+   }
+   return true;
+
+out_upread:
+   while (i--) {
+   adev = p->pdds[i]->dev->adev;
+   up_read(&adev->reset_domain->sem);
+   }
+   return false;
+}
+
+static void kfd_unhold_devices_reset_semaphore(struct kfd_process *p)
+{
+   struct amdgpu_device *adev;
+   int i;
+
+   for (i = 0; i < p->n_pdds; i++) {
+   adev = p->pdds[i]->dev->adev;
+   up_read(&adev->reset_domain->sem);
+   }
+}
+
 static void restore_process_worker(struct work_struct *work)
 {
struct delayed_work *dwork;
@@ -2009,6 +2055,12 @@ static void restore_process_worker(struct work_struct 
*work)
 * lifetime of this thread, kfd_process p will be valid
 */
p = container_of(dwork, struct kfd_process, restore_work);
+
+   if (!kfd_hold_devices_reset_semaphore(p)) {
+   pr_debug("GPU resetting, restore bo and queue skipped\n");
+   return;
+   }
+
pr_debug("Started restoring pasid 0x%x\n", p->pasid);
 
/* Setting last_restore_timestamp before successful restoration.
@@ -2031,6 +2083,8 @@ static void restore_process_worker(struct work_struct 
*work)
 msecs_to_jiffies(PROCESS_RESTORE_TIME_MS)))
kfd_process_restore_queues(p);
}
+
+   kfd_unhold_devices_reset_semaphore(p);
 }
 
 void kfd_suspend_all_processes(void)
-- 
2.43.2



Re: [PATCH v2 3/4] drm/amdkfd: Increase SMI event fifo size

2024-08-22 Thread Philip Yang

  


On 2024-08-22 10:34, James Zhu wrote:


  
  On 2024-07-30 16:15, Philip Yang wrote:
  
  SMI event fifo size 1KB was enough to
report GPU vm fault or reset

  
  [JZ] There is a typo here. It should be NOT enough.

It is not a typo, but it is not clear; will change it to:
"SMI event fifo size 1KB is enough to report GPU vm fault or reset
  events, but could drop the more frequent SVM migration events,"
Regards,
Philip


  event, increase it to 8KB to store about 100 migrate events, less
  chance to drop the migrate events if lots of migration happened in
  the short period of time. Add KFD prefix to the macro name.

  Signed-off-by: Philip Yang 
  ---
   drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 6 +++---
   1 file changed, 3 insertions(+), 3 deletions(-)

  diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
  index 1d94b445a060..9b8169761ec5 100644
  --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
  +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
  @@ -44,7 +44,7 @@ struct kfd_smi_client {
   	bool suser;
   };
  
  -#define MAX_KFIFO_SIZE	1024
  +#define KFD_MAX_KFIFO_SIZE	8192
  
   static __poll_t kfd_smi_ev_poll(struct file *, struct poll_table_struct *);
   static ssize_t kfd_smi_ev_read(struct file *, char __user *, size_t, loff_t *);
  @@ -86,7 +86,7 @@ static ssize_t kfd_smi_ev_read(struct file *filep, char __user *user,
   	struct kfd_smi_client *client = filep->private_data;
   	unsigned char *buf;
  
  -	size = min_t(size_t, size, MAX_KFIFO_SIZE);
  +	size = min_t(size_t, size, KFD_MAX_KFIFO_SIZE);
   	buf = kmalloc(size, GFP_KERNEL);
   	if (!buf)
   		return -ENOMEM;
  @@ -355,7 +355,7 @@ int kfd_smi_event_open(struct kfd_node *dev, uint32_t *fd)
   		return -ENOMEM;
   	INIT_LIST_HEAD(&client->list);
  
  -	ret = kfifo_alloc(&client->fifo, MAX_KFIFO_SIZE, GFP_KERNEL);
  +	ret = kfifo_alloc(&client->fifo, KFD_MAX_KFIFO_SIZE, GFP_KERNEL);
   	if (ret) {
   		kfree(client);
   		return ret;

  

  



Re: [PATCH v2 1/4] drm/amdkfd: Document and define SVM events message macro

2024-08-22 Thread Philip Yang

  


On 2024-08-22 10:32, James Zhu wrote:

  On 2024-07-30 16:15, Philip Yang wrote:

Document how to use SMI system management interface to enable and
receive SVM events. Document SVM event triggers.

Define SVM event message string format macros that user mode can use with
sscanf to parse the events. Add them to the uAPI header file to make it
obvious that changing them in the future changes the uAPI.

No functional changes.

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c |  45 +
 include/uapi/linux/kfd_ioctl.h  | 100 +---
 2 files changed, 109 insertions(+), 36 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
index ea6a8e43bd5b..de8b9abf7afc 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
@@ -235,17 +235,16 @@ void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, bool post_reset,
 		amdgpu_reset_get_desc(reset_context, reset_cause,
   sizeof(reset_cause));
 
-	kfd_smi_event_add(0, dev, event, "%x %s\n",
-			  dev->reset_seq_num,
-			  reset_cause);
+	kfd_smi_event_add(0, dev, event, KFD_EVENT_FMT_UPDATE_GPU_RESET(
+			  dev->reset_seq_num, reset_cause));
 }
 
 void kfd_smi_event_update_thermal_throttling(struct kfd_node *dev,
 	 uint64_t throttle_bitmask)
 {
-	kfd_smi_event_add(0, dev, KFD_SMI_EVENT_THERMAL_THROTTLE, "%llx:%llx\n",
+	kfd_smi_event_add(0, dev, KFD_SMI_EVENT_THERMAL_THROTTLE, KFD_EVENT_FMT_THERMAL_THROTTLING(
 			  throttle_bitmask,
-			  amdgpu_dpm_get_thermal_throttling_counter(dev->adev));
+			  amdgpu_dpm_get_thermal_throttling_counter(dev->adev)));
 }
 
 void kfd_smi_event_update_vmfault(struct kfd_node *dev, uint16_t pasid)
@@ -256,8 +255,8 @@ void kfd_smi_event_update_vmfault(struct kfd_node *dev, uint16_t pasid)
 	if (task_info) {
 		/* Report VM faults from user applications, not retry from kernel */
 		if (task_info->pid)
-			kfd_smi_event_add(0, dev, KFD_SMI_EVENT_VMFAULT, "%x:%s\n",
-	 task_info->pid, task_info->task_name);
+			kfd_smi_event_add(0, dev, KFD_SMI_EVENT_VMFAULT, KFD_EVENT_FMT_VMFAULT(
+	  task_info->pid, task_info->task_name));
 		amdgpu_vm_put_task_info(task_info);
 	}
 }
@@ -267,16 +266,16 @@ void kfd_smi_event_page_fault_start(struct kfd_node *node, pid_t pid,
 ktime_t ts)
 {
 	kfd_smi_event_add(pid, node, KFD_SMI_EVENT_PAGE_FAULT_START,
-			  "%lld -%d @%lx(%x) %c\n", ktime_to_ns(ts), pid,
-			  address, node->id, write_fault ? 'W' : 'R');
+			  KFD_EVENT_FMT_PAGEFAULT_START(ktime_to_ns(ts), pid,
+			  address, node->id, write_fault ? 'W' : 'R'));
 }
 
 void kfd_smi_event_page_fault_end(struct kfd_node *node, pid_t pid,
   unsigned long address, bool migration)
 {
 	kfd_smi_event_add(pid, node, KFD_SMI_EVENT_PAGE_FAULT_END,
-			  "%lld -%d @%lx(%x) %c\n", ktime_get_boottime_ns(),
-			  pid, address, node->id, migration ? 'M' : 'U');
+			  KFD_EVENT_FMT_PAGEFAULT_END(ktime_get_boottime_ns(),
+			  pid, address, node->id, migration ? 'M' : 'U'));
 }
 
 void kfd_smi_event_migration_start(struct kfd_node *node, pid_t pid,
@@ -286,9 +285,9 @@ void kfd_smi_event_migration_start(struct kfd_node *node, pid_t pid,
    uint32_t trigger)
 {
 	kfd_smi_event_add(pid, node, KFD_SMI_EVENT_MIGRATE_START,
-			  "%lld -%d @%lx(%lx) %x->%x %x:%x %d\n",
+			  KFD_EVENT_FMT_MIGRATE_START(
 			  ktime_get_boottime_ns(), pid, start, end - start,
-			  from, to, prefetch_loc, preferred_loc, trigger);
+			  from, to, prefetch_loc, preferred_loc, trigger));
 }
 
 void kfd_smi_event_migration_end(struct kfd_node *node, pid_t pid,
@@ -296,24 +295,24 @@ void kfd_smi_event_migration_end(struct kfd_node *node, pid_t pid,
  uint32_t from, uint32_t to, uint32_t trigger)
 {
 	kfd_smi_event_add(pid, node, KFD_SMI_EVENT_MIGRATE_END,
-			  "%lld -%d @%lx(%lx) %x->%x %d\n",
+			  KFD_EVENT_FMT_MIGRATE_END(
 			  ktime_get_boottime_ns(), pid, start, end - start,
-			  from, to, trigger);
+			  from, to, trigger));
 }
 
 void kfd_smi_event_queue_eviction(struct kfd_node *node, pid_t pid,
   uint32_t trigger)
 {
 	kfd_smi_event_add(pid, node, KFD_SMI_EVENT_QUEUE_EVICTION,
-			  "%lld -%d %x %d\n", ktime_get_boottime_ns(), pid,
-			  node->id, trigger);
+			  KFD_EVENT_FMT_QUEUE_EVICTION(ktime_get_boottime_ns(), pid,
+			  node->id, trigger));
 }
 
 void kfd_smi_event_queue_restore(struct kfd_node *node, pid_t pid)
 {
 	kfd_smi_event_add(pid, node, KFD_SMI_EVENT_QUEUE_RESTORE,
-			  "%lld -%d %x\n", ktime_get_boottime_ns(), pid,
-			  node->id);
+			  KFD_EVENT_FMT_QUEUE_RESTORE(ktime_get_boottime_ns(), pid,
+			  node->id, 0));
 }
 
 void kfd_smi_event_queue_restore_reschedu

Re: [PATCH v6] drm/amdkfd: Change kfd/svm page fault drain handling

2024-08-22 Thread Philip Yang

  


On 2024-08-21 18:01, Xiaogang.Chen
  wrote:


  From: Xiaogang Chen 

When an app unmaps vm ranges (munmap), kfd/svm starts draining pending page
faults and does not handle any incoming page faults of this process until a
deferred work item gets executed by the default system wq. The period of
"not handling page faults" can be long and is unpredictable, which is adverse
to kfd page fault recovery performance.

This patch uses the time stamp of an incoming page fault to decide whether to
drop or recover it. When an app unmaps vm ranges, kfd records each gpu
device's current ih ring time stamp. These time stamps are used in the kfd
page fault recovery routine.

Any page fault that happens on unmapped ranges after the unmap events is an
application bug that accesses a vm range after unmap. It is not the driver's
job to cover that.

By using the page fault time stamp there is no need to drain page faults in
deferred work. So the period during which kfd does not handle page faults is
reduced and can be controlled.
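A sketch of the resulting drop decision (simplified; checkpoint_ts is the field the patch adds to struct svm_range_list, and the real check lives in the page fault recovery path):

	/* A fault whose IH-ring time stamp predates the checkpoint recorded
	 * at munmap time targets an already-unmapped range and can be
	 * dropped instead of draining the whole interrupt ring.
	 */
	static bool svm_fault_is_stale(struct svm_range_list *svms,
				       int gpuidx, uint64_t fault_ts)
	{
		return fault_ts < svms->checkpoint_ts[gpuidx];
	}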

Signed-off-by: Xiaogang.Chen 

Some nitpicks below.
This patch is Reviewed-by: Philip Yang 

  
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c |  4 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h |  2 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c |  3 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  |  4 +-
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h  |  2 +
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c   | 98 +-
 drivers/gpu/drm/amd/amdkfd/kfd_svm.h   |  2 +-
 7 files changed, 75 insertions(+), 40 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index a060c28f0877..d89a4c14bbb8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -2764,7 +2764,7 @@ int amdgpu_vm_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
  * shouldn't be reported any more.
  */
 bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
-			u32 vmid, u32 node_id, uint64_t addr,
+			u32 vmid, u32 node_id, uint64_t addr, uint64_t ts,
 			bool write_fault)
 {
 	bool is_compute_context = false;
@@ -2790,7 +2790,7 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
 	addr /= AMDGPU_GPU_PAGE_SIZE;
 
 	if (is_compute_context && !svm_range_restore_pages(adev, pasid, vmid,
-	node_id, addr, write_fault)) {
+	node_id, addr, ts, write_fault)) {
 		amdgpu_bo_unref(&root);
 		return true;
 	}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index 046949c4b695..d12d66dca8e9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -558,7 +558,7 @@ amdgpu_vm_get_task_info_vm(struct amdgpu_vm *vm);
 void amdgpu_vm_put_task_info(struct amdgpu_task_info *task_info);
 
 bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
-			u32 vmid, u32 node_id, uint64_t addr,
+			u32 vmid, u32 node_id, uint64_t addr, uint64_t ts,
 			bool write_fault);
 
 void amdgpu_vm_set_task_info(struct amdgpu_vm *vm);
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index f0ceab3ce5bf..9784a2892185 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -132,7 +132,8 @@ static int gmc_v10_0_process_interrupt(struct amdgpu_device *adev,
 		/* Try to handle the recoverable page faults by filling page
 		 * tables
 		 */
-		if (amdgpu_vm_handle_fault(adev, entry->pasid, 0, 0, addr, write_fault))
+		if (amdgpu_vm_handle_fault(adev, entry->pasid, 0, 0, addr,
+	   entry->timestamp, write_fault))
 			return 1;
 	}
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index b73136d390cc..c76ac0dfe572 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -595,7 +595,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
 			cam_index = entry->src_data[2] & 0x3ff;
 
 			ret = amdgpu_vm_handle_fault(adev, entry->pasid, entry->vmid, node_id,
-		 addr, write_fault);
+		 addr, entry->timestamp, write_fault);
 			WDOORBELL32(adev->irq.retry_cam_doorbell_index, cam_index);
 			if (ret)
 return 1;
@@ -618,7 +618,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
 			 * tables
 			 */
 			if (amdgpu_vm_handle_fault(adev, entry->pasid, entry->vmid, node_id,
-		   addr, write_fault))
+		   addr, entry->timestamp, write_fault))
 return 1;
 		}
 	}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 4190fa339913..288ebf02fa1c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -863,6 +863,8 @@ struct svm_range_list {
 	struct delayed_work		restore_work;
 	DECLARE_BITMAP(bitmap_supported, MAX_GPU_INSTANCE);
 	struct task_struct		*faul

Re: [PATCH] drm/amdgpu: Surface svm_attr_gobm, a RW module parameter

2024-08-22 Thread Philip Yang

  


On 2024-08-21 19:22, Ramesh Errabolu
  wrote:


  KFD's design of unified memory (UM) does not allow users to
configure the size of buffer used in migrating buffer either
from Sysmem to VRAM or vice versa. 

This is not true; an app can change the range granularity attribute to
  configure the buffer migration size. This module parameter is only
  used to configure the default range granularity.
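To illustrate, a sketch of the existing per-range override Philip refers to (using the SVM attribute already defined in kfd_ioctl.h; the value shown is illustrative):

	/* An app can set the migration granularity per range through the
	 * SVM ioctl attributes; the module parameter only seeds the default
	 * that new ranges start with.
	 */
	struct kfd_ioctl_svm_attribute attr = {
		.type  = KFD_IOCTL_SVM_ATTR_GRANULARITY,
		.value = 10,	/* 2^10 pages = 4 MiB migration unit */
	};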


  This patch remedies this
gap, albeit at a coarse level, for workloads that deal with
unregistered memory

Signed-off-by: Ramesh Errabolu 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h |  4 
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 16 ++
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h   | 12 ++
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c| 29 +
 4 files changed, 52 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index e8c284aea1f2..73dd816b01f2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -237,6 +237,7 @@ extern int sched_policy;
 extern bool debug_evictions;
 extern bool no_system_mem_limit;
 extern int halt_if_hws_hang;
+extern uint amdgpu_svm_attr_gobm;


  
 #else
 static const int __maybe_unused sched_policy = KFD_SCHED_POLICY_HWS;
 static const bool __maybe_unused debug_evictions; /* = false */
@@ -313,6 +314,9 @@ extern int amdgpu_wbrf;
 /* Extra time delay(in ms) to eliminate the influence of temperature momentary fluctuation */
 #define AMDGPU_SWCTF_EXTRA_DELAY		50
 
+/* Default size of buffer to use in migrating buffer */
+#define AMDGPU_SVM_ATTR_GOBM	9
+
 struct amdgpu_xcp_mgr;
 struct amdgpu_device;
 struct amdgpu_irq_src;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index b9529948f2b2..e195e1cf0f28 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -169,6 +169,15 @@ uint amdgpu_sdma_phase_quantum = 32;
 char *amdgpu_disable_cu;
 char *amdgpu_virtual_display;
 bool enforce_isolation;
+
+/* Specifies the default size of buffer to use in
+ * migrating buffer from Sysmem to VRAM and vice
+ * versa
+ *
+ * Defined as log2(sizeof(buffer)/PAGE_SIZE)
+ */
+uint amdgpu_svm_attr_gobm = AMDGPU_SVM_ATTR_GOBM;

Add an explanation in the comment: GOBM = granularity of buffer migration.

Use the u8 type, the same type used for prange->granularity:
u8 amdgpu_svm_default_gobm = AMDGPU_SVM_DEFAULT_GOBM;


  
+
 /*
  * OverDrive(bit 14) disabled by default
  * GFX DCS(bit 19) disabled by default
@@ -320,6 +329,13 @@ module_param_named(pcie_gen2, amdgpu_pcie_gen2, int, 0444);
 MODULE_PARM_DESC(msi, "MSI support (1 = enable, 0 = disable, -1 = auto)");
 module_param_named(msi, amdgpu_msi, int, 0444);
 
+/**
+ * DOC: svm_attr_gobm (uint)
+ * Size of buffer to use in migrating buffer from Sysmem to VRAM and vice versa
+ */
+MODULE_PARM_DESC(svm_attr_gobm, "Defined as log2(sizeof(buffer)/PAGE_SIZE), e.g. 9 for 2 MiB");
+module_param_named(svm_attr_gobm, amdgpu_svm_attr_gobm, uint, 0644);
+
 /**
  * DOC: lockup_timeout (string)
  * Set GPU scheduler timeout value in ms.
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 7bba6bed2f48..07b202ab008a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -866,6 +866,18 @@ struct svm_range_list {
 	struct delayed_work		restore_work;
 	DECLARE_BITMAP(bitmap_supported, MAX_GPU_INSTANCE);
 	struct task_struct		*faulting_task;
+
+	/* Indicates the default size to use in migrating
+	 * buffers of a process from Sysmem to VRAM and vice
+	 * versa. The max legal value cannot be greater than
+	 * 0x3F
+	 *
+	 * @note: A side effect of this symbol being part of
+	 * struct svm_range_list is that it forces all buffers
+	 * of the process of unregistered kind to use the same
+	 * size in buffer migration
+	 */
+	uint8_t attr_gobm;
 };
 
 /* Process data */
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index 10b1a1f64198..fcfe5543a3c0 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -309,12 +309,11 @@ static void svm_range_free(struct svm_range *prange, bool do_unmap)
 }
 
 static void
-svm_range_set_default_attributes(int32_t *location, int32_t *prefetch_loc,
- uint8_t *granularity, uint32_t *flags)
+svm_range_set_default_attributes(int32_t *location,
+			int32_t *prefetch_loc, uint32_t *flags)
 {
 	*location = KFD_IOCTL_SVM_LOCATION_UNDEFINED;
 	*prefetch_loc = KFD_IOCTL_SVM_LOCATION_UNDEFINED;
-	*granularity = 9;
 	*flags =
 		KFD_IOCTL_SVM_FLAG_HOST_ACCESS | KFD_IOCTL_SVM_FLAG_COHERENT;
 }
@@ -358,9 +357,9 @@ svm_range *svm_range_new(struct svm_range_list *svms, uint64_t start,
 		bitmap_copy(prange->bitmap_access, svms->bitmap_supported,
 			MAX_GPU_INSTANCE);
 
+	prange->granula

[PATCH] drm/amdkfd: Handle queue destroy buffer access race

2024-08-02 Thread Philip Yang
Add a helper function kfd_queue_unreference_buffers to decrease the queue
buffer refcount, separating it from releasing the queue buffers.

Because holding dqm_lock while taking the vm lock would be circular locking,
kfd_ioctl_destroy_queue should take the vm lock and unreference the queue
buffers first, but not release them, so the error case where the vm lock
cannot be taken is handled cleanly. Then hold dqm_lock to remove the queue
from the queue list and release the queue buffers.

The restore process worker restores queues while holding dqm_lock, so it
will always find queues with valid queue buffers.
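The ordering the patch establishes, condensed as a sketch (helper names from the diff below):

	/* pqm_destroy_queue(), simplified:
	 *
	 *	reserve vm			// may fail; return cleanly
	 *	kfd_queue_unreference_buffers()	// drop refcounts only
	 *	unreserve vm
	 *	dqm_lock
	 *	destroy queue / remove from list
	 *	kfd_queue_release_buffers()	// now safe to free backing
	 *	dqm_unlock
	 *
	 * The vm lock is never taken under dqm_lock, avoiding the lock
	 * inversion, while restore_process_worker (running under dqm_lock)
	 * only ever sees queues whose buffers are still valid.
	 */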

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c  |  1 +
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h |  5 +-
 .../amd/amdkfd/kfd_process_queue_manager.c|  8 ++-
 drivers/gpu/drm/amd/amdkfd/kfd_queue.c| 62 ---
 4 files changed, 49 insertions(+), 27 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 0622ebd7e8ef..10d6e29b23cb 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -400,6 +400,7 @@ static int kfd_ioctl_create_queue(struct file *filep, 
struct kfd_process *p,
return 0;
 
 err_create_queue:
+   kfd_queue_unreference_buffers(pdd, &q_properties);
kfd_queue_release_buffers(pdd, &q_properties);
 err_acquire_queue_buf:
 err_sdma_engine_id:
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 057d20446c31..e38484b40467 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -1298,9 +1298,12 @@ void print_queue_properties(struct queue_properties *q);
 void print_queue(struct queue *q);
 int kfd_queue_buffer_get(struct amdgpu_vm *vm, void __user *addr, struct 
amdgpu_bo **pbo,
 u64 expected_size);
-void kfd_queue_buffer_put(struct amdgpu_vm *vm, struct amdgpu_bo **bo);
+void kfd_queue_buffer_put(struct amdgpu_bo **bo);
 int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct 
queue_properties *properties);
 int kfd_queue_release_buffers(struct kfd_process_device *pdd, struct 
queue_properties *properties);
+void kfd_queue_unreference_buffer(struct amdgpu_vm *vm, struct amdgpu_bo **bo);
+int kfd_queue_unreference_buffers(struct kfd_process_device *pdd,
+ struct queue_properties *properties);
 void kfd_queue_ctx_save_restore_size(struct kfd_topology_device *dev);
 
 struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
index f732ee35b531..ef76a9cbc7e2 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
@@ -217,6 +217,7 @@ void pqm_uninit(struct process_queue_manager *pqm)
list_for_each_entry_safe(pqn, next, &pqm->queues, process_queue_list) {
if (pqn->q) {
pdd = kfd_get_process_device_data(pqn->q->device, pqm->process);
+   kfd_queue_unreference_buffers(pdd, &pqn->q->properties);
kfd_queue_release_buffers(pdd, &pqn->q->properties);
pqm_clean_queue_resource(pqm, pqn);
}
@@ -512,7 +513,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, 
unsigned int qid)
}
 
if (pqn->q) {
-   retval = kfd_queue_release_buffers(pdd, &pqn->q->properties);
+   retval = kfd_queue_unreference_buffers(pdd, &pqn->q->properties);
if (retval)
goto err_destroy_queue;
 
@@ -526,7 +527,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, 
unsigned int qid)
if (retval != -ETIME)
goto err_destroy_queue;
}
-
+   kfd_queue_release_buffers(pdd, &pqn->q->properties);
pqm_clean_queue_resource(pqm, pqn);
uninit_queue(pqn->q);
}
@@ -579,7 +580,8 @@ int pqm_update_queue_properties(struct 
process_queue_manager *pqm,
return -EFAULT;
}
 
-   kfd_queue_buffer_put(vm, &pqn->q->properties.ring_bo);
+   kfd_queue_unreference_buffer(vm, &pqn->q->properties.ring_bo);
+   kfd_queue_buffer_put(&pqn->q->properties.ring_bo);
amdgpu_bo_unreserve(vm->root.bo);
 
pqn->q->properties.ring_bo = p->ring_bo;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
index e0a073ae4a49..9ac15dff527f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
@@ -224,16 +224,8 @@ int kfd_queue_buffer_get(struct amdgpu_vm *vm, void __us

Re: [PATCH] drm/amdgpu: change kgd2kfd_init_zone sequence during device_init

2024-07-31 Thread Philip Yang

  


On 2024-07-31 04:10, Shikang Fan wrote:


Move kgd2kfd_init_zone_device() after release_full_gpu() as it takes a
long time for asics with huge bar sizes and could potentially hit the
full access timeout for SRIOV during init.

Signed-off-by: Shikang Fan 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 11 ---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 3a43754e7f10..4494fa7ae70f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2930,10 +2930,8 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
 		amdgpu_ttm_set_buffer_funcs_status(adev, true);
 
 	/* Don't init kfd if whole hive need to be reset during init */
-	if (!adev->gmc.xgmi.pending_reset) {
-		kgd2kfd_init_zone_device(adev);
+	if (!adev->gmc.xgmi.pending_reset)
 		amdgpu_amdkfd_device_init(adev);
-	}
 
 	amdgpu_fru_get_product_info(adev);
 
@@ -4362,6 +4360,13 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 		flush_delayed_work(&adev->delayed_init_work);
 	}
 
+	/* On asics with huge bar size, memremap_pages can take a long time
+	 * and potentially lead to a full access timeout for SRIOV. Move
+	 * init_zone_device() after exiting full gpu access
+	 */
+	if (!adev->gmc.xgmi.pending_reset)
+		kgd2kfd_init_zone_device(adev);
+

This change will not work because amdgpu_amdkfd_device_init checks
KFD_IS_SVM_API_SUPPORTED, which would now always return false; as a
result, the SVM API is not enabled for user space.

Maybe you can move the two function calls together here, if there is no
other init dependency issue:

	/* Don't init kfd if whole hive need to be reset during init */
	if (!adev->gmc.xgmi.pending_reset) {
		kgd2kfd_init_zone_device(adev);
		amdgpu_amdkfd_device_init(adev);
	}

Regards,
Philip


  
 	/*
 	 * Place those sysfs registering after `late_init`. As some of those
 	 * operations performed in `late_init` might affect the sysfs


  



[PATCH v2 1/4] drm/amdkfd: Document and define SVM events message macro

2024-07-30 Thread Philip Yang
Document how to use SMI system management interface to enable and
receive SVM events. Document SVM event triggers.

Define SVM event message string format macros that user mode can use
with sscanf to parse the events. Add them to the uAPI header file to
make it obvious that changing them in the future changes the uAPI.

No functional changes.
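
For illustration, a userspace consumer could parse an event line like
this (a sketch, not part of this patch; the literal format mirrors the
kernel's "%x " event id prefix plus the page-fault-start payload
"%lld -%d @%lx(%x) %c"):

	#include <stdio.h>

	/* returns 0 if the line was parsed as a page-fault-start event */
	static int parse_page_fault_start(const char *line)
	{
		unsigned int event, node;
		unsigned long addr;
		long long ns;
		int pid;
		char rw;

		if (sscanf(line, "%x %lld -%d @%lx(%x) %c",
			   &event, &ns, &pid, &addr, &node, &rw) != 6)
			return -1;
		printf("pid %d fault at 0x%lx node %u (%c)\n", pid, addr, node, rw);
		return 0;
	}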

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c |  45 +
 include/uapi/linux/kfd_ioctl.h  | 100 +---
 2 files changed, 109 insertions(+), 36 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
index ea6a8e43bd5b..de8b9abf7afc 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
@@ -235,17 +235,16 @@ void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, 
bool post_reset,
amdgpu_reset_get_desc(reset_context, reset_cause,
  sizeof(reset_cause));
 
-   kfd_smi_event_add(0, dev, event, "%x %s\n",
- dev->reset_seq_num,
- reset_cause);
+   kfd_smi_event_add(0, dev, event, KFD_EVENT_FMT_UPDATE_GPU_RESET(
+ dev->reset_seq_num, reset_cause));
 }
 
 void kfd_smi_event_update_thermal_throttling(struct kfd_node *dev,
 uint64_t throttle_bitmask)
 {
-   kfd_smi_event_add(0, dev, KFD_SMI_EVENT_THERMAL_THROTTLE, "%llx:%llx\n",
+   kfd_smi_event_add(0, dev, KFD_SMI_EVENT_THERMAL_THROTTLE, KFD_EVENT_FMT_THERMAL_THROTTLING(
  throttle_bitmask,
- amdgpu_dpm_get_thermal_throttling_counter(dev->adev));
+ amdgpu_dpm_get_thermal_throttling_counter(dev->adev)));
 }
 
 void kfd_smi_event_update_vmfault(struct kfd_node *dev, uint16_t pasid)
@@ -256,8 +255,8 @@ void kfd_smi_event_update_vmfault(struct kfd_node *dev, 
uint16_t pasid)
if (task_info) {
/* Report VM faults from user applications, not retry from 
kernel */
if (task_info->pid)
-   kfd_smi_event_add(0, dev, KFD_SMI_EVENT_VMFAULT, "%x:%s\n",
-task_info->pid, task_info->task_name);
+   kfd_smi_event_add(0, dev, KFD_SMI_EVENT_VMFAULT, KFD_EVENT_FMT_VMFAULT(
+ task_info->pid, task_info->task_name));
amdgpu_vm_put_task_info(task_info);
}
 }
@@ -267,16 +266,16 @@ void kfd_smi_event_page_fault_start(struct kfd_node 
*node, pid_t pid,
ktime_t ts)
 {
kfd_smi_event_add(pid, node, KFD_SMI_EVENT_PAGE_FAULT_START,
- "%lld -%d @%lx(%x) %c\n", ktime_to_ns(ts), pid,
- address, node->id, write_fault ? 'W' : 'R');
+ KFD_EVENT_FMT_PAGEFAULT_START(ktime_to_ns(ts), pid,
+ address, node->id, write_fault ? 'W' : 'R'));
 }
 
 void kfd_smi_event_page_fault_end(struct kfd_node *node, pid_t pid,
  unsigned long address, bool migration)
 {
kfd_smi_event_add(pid, node, KFD_SMI_EVENT_PAGE_FAULT_END,
- "%lld -%d @%lx(%x) %c\n", ktime_get_boottime_ns(),
- pid, address, node->id, migration ? 'M' : 'U');
+ KFD_EVENT_FMT_PAGEFAULT_END(ktime_get_boottime_ns(),
+ pid, address, node->id, migration ? 'M' : 'U'));
 }
 
 void kfd_smi_event_migration_start(struct kfd_node *node, pid_t pid,
@@ -286,9 +285,9 @@ void kfd_smi_event_migration_start(struct kfd_node *node, 
pid_t pid,
   uint32_t trigger)
 {
kfd_smi_event_add(pid, node, KFD_SMI_EVENT_MIGRATE_START,
- "%lld -%d @%lx(%lx) %x->%x %x:%x %d\n",
+ KFD_EVENT_FMT_MIGRATE_START(
  ktime_get_boottime_ns(), pid, start, end - start,
- from, to, prefetch_loc, preferred_loc, trigger);
+ from, to, prefetch_loc, preferred_loc, trigger));
 }
 
 void kfd_smi_event_migration_end(struct kfd_node *node, pid_t pid,
@@ -296,24 +295,24 @@ void kfd_smi_event_migration_end(struct kfd_node *node, 
pid_t pid,
 uint32_t from, uint32_t to, uint32_t trigger)
 {
kfd_smi_event_add(pid, node, KFD_SMI_EVENT_MIGRATE_END,
- "%lld -%d @%lx(%lx) %x->%x %d\n",
+ KFD_EVENT_FMT_MIGRATE_END(
  ktime_get_boottime_ns(), pid, start, end - start,
- from, to, trigger);
+   

[PATCH v2 4/4] drm/amdkfd: SMI report dropped event count

2024-07-30 Thread Philip Yang
Add new SMI event to report the dropped event count when the event kfifo
is full.

When the kfifo has space for two events, generate a dropped event record
to report how many events were dropped, together with the next event to
add to kfifo.

After reading events out of the kfifo, if events were dropped, generate
a dropped event record and add it to the kfifo.
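
A consumer might react to the new event like this (a sketch; it assumes
the dropped-event payload is "<timestamp_ns> -<pid> <count>", in line
with the other event formats; needs <stdio.h> and linux/kfd_ioctl.h):

	/* illustrative: detect the dropped-event record in a line
	 * read from the SMI event fd */
	static void check_dropped(const char *line)
	{
		unsigned int event;
		long long ns;
		int pid, dropped;

		if (sscanf(line, "%x %lld -%d %d", &event, &ns, &pid, &dropped) == 4 &&
		    event == KFD_SMI_EVENT_DROPPED_EVENT)
			fprintf(stderr, "lost %d SMI events, resync required\n", dropped);
	}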

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 41 ++---
 include/uapi/linux/kfd_ioctl.h  |  6 +++
 2 files changed, 41 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
index 9b8169761ec5..9b47657d5160 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
@@ -42,6 +42,7 @@ struct kfd_smi_client {
struct rcu_head rcu;
pid_t pid;
bool suser;
+   u32 drop_count;
 };
 
 #define KFD_MAX_KFIFO_SIZE 8192
@@ -103,12 +104,26 @@ static ssize_t kfd_smi_ev_read(struct file *filep, char 
__user *user,
}
to_copy = min(size, to_copy);
ret = kfifo_out(&client->fifo, buf, to_copy);
-   spin_unlock(&client->lock);
if (ret <= 0) {
+   spin_unlock(&client->lock);
ret = -EAGAIN;
goto ret_err;
}
 
+   if (client->drop_count) {
+   char msg[KFD_SMI_EVENT_MSG_SIZE];
+   int len;
+
+   len = snprintf(msg, sizeof(msg), "%x ", KFD_SMI_EVENT_DROPPED_EVENT);
+   len += snprintf(msg + len, sizeof(msg) - len,
+   KFD_EVENT_FMT_DROPPED_EVENT(ktime_get_boottime_ns(),
+   client->pid, client->drop_count));
+   kfifo_in(&client->fifo, msg, len);
+   client->drop_count = 0;
+   }
+
+   spin_unlock(&client->lock);
+
ret = copy_to_user(user, buf, to_copy);
if (ret) {
ret = -EFAULT;
@@ -173,22 +188,36 @@ static bool kfd_smi_ev_enabled(pid_t pid, struct 
kfd_smi_client *client,
 }
 
 static void add_event_to_kfifo(pid_t pid, struct kfd_node *dev,
-  unsigned int smi_event, char *event_msg, int len)
+  unsigned int smi_event, char *event_msg, int event_len)
 {
struct kfd_smi_client *client;
+   char msg[KFD_SMI_EVENT_MSG_SIZE];
+   int len = 0;
 
rcu_read_lock();
 
list_for_each_entry_rcu(client, &dev->smi_clients, list) {
if (!kfd_smi_ev_enabled(pid, client, smi_event))
continue;
+
spin_lock(&client->lock);
-   if (kfifo_avail(&client->fifo) >= len) {
-   kfifo_in(&client->fifo, event_msg, len);
+   if (client->drop_count) {
+   len = snprintf(msg, sizeof(msg), "%x ", KFD_SMI_EVENT_DROPPED_EVENT);
+   len += snprintf(msg + len, sizeof(msg) - len,
+   KFD_EVENT_FMT_DROPPED_EVENT(ktime_get_boottime_ns(), pid,
+   client->drop_count));
+   }
+
+   if (kfifo_avail(&client->fifo) >= event_len + len) {
+   if (len)
+   kfifo_in(&client->fifo, msg, len);
+   kfifo_in(&client->fifo, event_msg, event_len);
wake_up_all(&client->wait_queue);
+   client->drop_count = 0;
} else {
-   pr_debug("smi_event(EventID: %u): no space left\n",
-   smi_event);
+   client->drop_count++;
+   pr_debug("smi_event(EventID: %u): no space left 
drop_count %d\n",
+smi_event, client->drop_count);
}
spin_unlock(&client->lock);
}
diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index e4ed8fec3294..915d1e7c67fe 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -530,6 +530,7 @@ enum kfd_smi_event {
KFD_SMI_EVENT_QUEUE_EVICTION = 9,
KFD_SMI_EVENT_QUEUE_RESTORE = 10,
KFD_SMI_EVENT_UNMAP_FROM_GPU = 11,
+   KFD_SMI_EVENT_DROPPED_EVENT = 12,
 
/*
 * max event number, as a flag bit to get events from all processes,
@@ -610,6 +611,7 @@ struct kfd_ioctl_smi_events_args {
  *rw: 'W' for write page fault, 'R' for read page fault
  *rescheduled: 'R' if the queue restore failed and rescheduled to try again
  *error_code: migrate failure error code, 0 if no error
+ *drop_count: how many events dropped when fifo is full
  */

[PATCH v2 3/4] drm/amdkfd: Increase SMI event fifo size

2024-07-30 Thread Philip Yang
The SMI event fifo size of 1KB was enough to report GPU vm fault or
reset events; increase it to 8KB to store about 100 migrate events
(roughly 80 bytes each), so there is less chance of dropping migrate
events when lots of migration happens in a short period of time. Add
the KFD prefix to the macro name.

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
index 1d94b445a060..9b8169761ec5 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
@@ -44,7 +44,7 @@ struct kfd_smi_client {
bool suser;
 };
 
-#define MAX_KFIFO_SIZE 1024
+#define KFD_MAX_KFIFO_SIZE 8192
 
 static __poll_t kfd_smi_ev_poll(struct file *, struct poll_table_struct *);
 static ssize_t kfd_smi_ev_read(struct file *, char __user *, size_t, loff_t *);
@@ -86,7 +86,7 @@ static ssize_t kfd_smi_ev_read(struct file *filep, char 
__user *user,
struct kfd_smi_client *client = filep->private_data;
unsigned char *buf;
 
-   size = min_t(size_t, size, MAX_KFIFO_SIZE);
+   size = min_t(size_t, size, KFD_MAX_KFIFO_SIZE);
buf = kmalloc(size, GFP_KERNEL);
if (!buf)
return -ENOMEM;
@@ -355,7 +355,7 @@ int kfd_smi_event_open(struct kfd_node *dev, uint32_t *fd)
return -ENOMEM;
INIT_LIST_HEAD(&client->list);
 
-   ret = kfifo_alloc(&client->fifo, MAX_KFIFO_SIZE, GFP_KERNEL);
+   ret = kfifo_alloc(&client->fifo, KFD_MAX_KFIFO_SIZE, GFP_KERNEL);
if (ret) {
kfree(client);
return ret;
-- 
2.43.2



[PATCH v2 2/4] drm/amdkfd: Output migrate end event if migrate failed

2024-07-30 Thread Philip Yang
If page migration failed, also output the migrate end event to match the
migrate start event, with the failure error_code appended to the end of
the migrate message macro. This will not break the uAPI because
applications using the old message macro with sscanf simply drop and
ignore the trailing error_code.
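
A minimal illustration of why the trailing field is backward compatible
(userspace sketch; `tail` is assumed to point at the "%x->%x %d ..."
part of the payload):

	unsigned int from, to;
	int trigger, error_code;

	/* old parser: 3 conversions, never consumes the new trailing field */
	sscanf(tail, "%x->%x %d", &from, &to, &trigger);
	/* new parser: also reads the error code */
	sscanf(tail, "%x->%x %d %d", &from, &to, &trigger, &error_code);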

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdkfd/kfd_migrate.c| 14 ++
 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c |  5 +++--
 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h |  3 ++-
 include/uapi/linux/kfd_ioctl.h  |  7 ---
 4 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
index 8ee3d07ffbdf..eacfeb32f35d 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
@@ -445,14 +445,13 @@ svm_migrate_vma_to_vram(struct kfd_node *node, struct 
svm_range *prange,
pr_debug("successful/cpages/npages 0x%lx/0x%lx/0x%lx\n",
 mpages, cpages, migrate.npages);
 
-   kfd_smi_event_migration_end(node, p->lead_thread->pid,
-   start >> PAGE_SHIFT, end >> PAGE_SHIFT,
-   0, node->id, trigger);
-
svm_range_dma_unmap_dev(adev->dev, scratch, 0, npages);
 
 out_free:
kvfree(buf);
+   kfd_smi_event_migration_end(node, p->lead_thread->pid,
+   start >> PAGE_SHIFT, end >> PAGE_SHIFT,
+   0, node->id, trigger, r);
 out:
if (!r && mpages) {
pdd = svm_range_get_pdd_by_node(prange, node);
@@ -751,14 +750,13 @@ svm_migrate_vma_to_ram(struct kfd_node *node, struct 
svm_range *prange,
svm_migrate_copy_done(adev, mfence);
migrate_vma_finalize(&migrate);
 
-   kfd_smi_event_migration_end(node, p->lead_thread->pid,
-   start >> PAGE_SHIFT, end >> PAGE_SHIFT,
-   node->id, 0, trigger);
-
svm_range_dma_unmap_dev(adev->dev, scratch, 0, npages);
 
 out_free:
kvfree(buf);
+   kfd_smi_event_migration_end(node, p->lead_thread->pid,
+   start >> PAGE_SHIFT, end >> PAGE_SHIFT,
+   node->id, 0, trigger, r);
 out:
if (!r && cpages) {
mpages = cpages - upages;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
index de8b9abf7afc..1d94b445a060 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
@@ -292,12 +292,13 @@ void kfd_smi_event_migration_start(struct kfd_node *node, 
pid_t pid,
 
 void kfd_smi_event_migration_end(struct kfd_node *node, pid_t pid,
 unsigned long start, unsigned long end,
-uint32_t from, uint32_t to, uint32_t trigger)
+uint32_t from, uint32_t to, uint32_t trigger,
+int error_code)
 {
kfd_smi_event_add(pid, node, KFD_SMI_EVENT_MIGRATE_END,
  KFD_EVENT_FMT_MIGRATE_END(
  ktime_get_boottime_ns(), pid, start, end - start,
- from, to, trigger));
+ from, to, trigger, error_code));
 }
 
 void kfd_smi_event_queue_eviction(struct kfd_node *node, pid_t pid,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
index 85010b8307f8..503bff13d815 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
@@ -44,7 +44,8 @@ void kfd_smi_event_migration_start(struct kfd_node *node, 
pid_t pid,
 uint32_t trigger);
 void kfd_smi_event_migration_end(struct kfd_node *node, pid_t pid,
 unsigned long start, unsigned long end,
-uint32_t from, uint32_t to, uint32_t trigger);
+uint32_t from, uint32_t to, uint32_t trigger,
+int error_code);
 void kfd_smi_event_queue_eviction(struct kfd_node *node, pid_t pid,
  uint32_t trigger);
 void kfd_smi_event_queue_restore(struct kfd_node *node, pid_t pid);
diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index c94182ad8fb8..e4ed8fec3294 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -609,6 +609,7 @@ struct kfd_ioctl_smi_events_args {
  *migrate_update: GPU page fault is recovered by 'M' for migrate, 'U' for 
update
  *rw: 'W' for write page fault, 'R' for read page fault
  *rescheduled: 'R' if the queue restore failed and rescheduled to

[PATCH v2 0/4] Improve SVM migrate event report

2024-07-30 Thread Philip Yang
1. Document how to use the SMI system management interface to receive
SVM events, and define string format macros for user mode.
2. Increase the event kfifo size, so there is less chance of dropping
events.
3. Output the migrate end event with an error code if migration failed.
4. Report the dropped event count if the fifo is full.

Philip Yang (4):
  drm/amdkfd: Document and define SVM events message macro
  drm/amdkfd: Output migrate end event if migrate failed
  drm/amdkfd: Increase SMI event fifo size
  drm/amdkfd: SMI report dropped event count

 drivers/gpu/drm/amd/amdkfd/kfd_migrate.c|  14 ++-
 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c |  95 +++--
 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h |   3 +-
 include/uapi/linux/kfd_ioctl.h  | 107 +---
 4 files changed, 164 insertions(+), 55 deletions(-)

-- 
2.43.2



Re: [PATCH] drm/amdkfd: Fix missing error code in kfd_queue_acquire_buffers

2024-07-26 Thread Philip Yang

  
The kfdtest user queue validation cases don't cover those error
condition paths, thanks for catching it.
This patch is

Reviewed-by: Philip Yang 

On 2024-07-26 02:47, Srinivasan Shanmugam wrote:


The fix involves setting 'err' to '-EINVAL' before each 'goto out_err_unreserve'.
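
That is, each bare goto on a validation failure gains an explicit error
assignment, e.g. (a sketch based on the eop size check quoted below):

	if (properties->eop_ring_buffer_size != topo_dev->node_props.eop_buffer_size) {
		pr_debug("queue eop bo size 0x%lx not equal to node eop buf size 0x%x\n",
			 properties->eop_buf_bo->tbo.base.size,
			 topo_dev->node_props.eop_buffer_size);
		err = -EINVAL;	/* previously fell through with err == 0 */
		goto out_err_unreserve;
	}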

Fixes the below:
drivers/gpu/drm/amd/amdgpu/../amdkfd/kfd_queue.c:265 kfd_queue_acquire_buffers()
warn: missing error code 'err'

drivers/gpu/drm/amd/amdgpu/../amdkfd/kfd_queue.c
226 int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct queue_properties *properties)
227 {
228 struct kfd_topology_device *topo_dev;
229 struct amdgpu_vm *vm;
230 u32 total_cwsr_size;
231 int err;
232
233 topo_dev = kfd_topology_device_by_id(pdd->dev->id);
234 if (!topo_dev)
235 return -EINVAL;
236
237 vm = drm_priv_to_vm(pdd->drm_priv);
238 err = amdgpu_bo_reserve(vm->root.bo, false);
239 if (err)
240 return err;
241
242 err = kfd_queue_buffer_get(vm, properties->write_ptr, &properties->wptr_bo, PAGE_SIZE);
243 if (err)
244 goto out_err_unreserve;
245
246 err = kfd_queue_buffer_get(vm, properties->read_ptr, &properties->rptr_bo, PAGE_SIZE);
247 if (err)
248 goto out_err_unreserve;
249
250 err = kfd_queue_buffer_get(vm, (void *)properties->queue_address,
251&properties->ring_bo, properties->queue_size);
252 if (err)
253 goto out_err_unreserve;
254
255 /* only compute queue requires EOP buffer and CWSR area */
256 if (properties->type != KFD_QUEUE_TYPE_COMPUTE)
257 goto out_unreserve;

This is clearly a success path.

258
259 /* EOP buffer is not required for all ASICs */
260 if (properties->eop_ring_buffer_address) {
261 if (properties->eop_ring_buffer_size != topo_dev->node_props.eop_buffer_size) {
262 pr_debug("queue eop bo size 0x%lx not equal to node eop buf size 0x%x\n",
263 properties->eop_buf_bo->tbo.base.size,
264 topo_dev->node_props.eop_buffer_size);
--> 265 goto out_err_unreserve;

This has err in the label name.  err = -EINVAL?

266 }
267 err = kfd_queue_buffer_get(vm, (void *)properties->eop_ring_buffer_address,
268&properties->eop_buf_bo,
269properties->eop_ring_buffer_size);
270 if (err)
271 goto out_err_unreserve;
272 }
273
274 if (properties->ctl_stack_size != topo_dev->node_props.ctl_stack_size) {
275 pr_debug("queue ctl stack size 0x%x not equal to node ctl stack size 0x%x\n",
276 properties->ctl_stack_size,
277 topo_dev->node_props.ctl_stack_size);
278 goto out_err_unreserve;

err?

279 }
280
281 if (properties->ctx_save_restore_area_size != topo_dev->node_props.cwsr_size) {
282 pr_debug("queue cwsr size 0x%x not equal to node cwsr size 0x%x\n",
283 properties->ctx_save_restore_area_size,
284 topo_dev->node_props.cwsr_size);
285 goto out_err_unreserve;

err?  Not sure.

286 }
287
288 total_cwsr_size = (topo_dev->node_props.cwsr_size + topo_dev->node_props.debug_memory_size)
289   * NUM_XCC(pdd->dev->xcc_mask);
290 total_cwsr_size = ALIGN(total_cwsr_size, PAGE_SIZE);
291
292 err = kfd_queue_buffer_get(vm, (void *)properties->ctx_save_restore_area_address,
293&properties->cwsr_bo, total_cwsr_size);
294 if (!err)
295 goto out_unreserve;
296
297 amdgpu_bo_unreserve(vm->root.bo);
298
299 err = kfd_queue_buffer_svm_get(pdd, properties->ctx_save_restore_area_address,
300total_cwsr_size);
301 if (err)
302 goto out_err_release;
303
304 return 0;
305
306 out_unreserve:
307 amdgpu_bo_unreserve(vm->root.bo);
308 return 0;
309
310 out_err_unreserve:
311 amdgpu_b

Re: [PATCH v3] drm/amdkfd: Change kfd/svm page fault drain handling

2024-07-25 Thread Philip Yang

  


On 2024-07-25 14:19, Xiaogang.Chen wrote:


  From: Xiaogang Chen 

When an app unmaps vm ranges (munmap), kfd/svm starts draining pending
page faults and does not handle any incoming page faults of this process
until a deferred work item gets executed by the default system wq. The
period of "not handling page faults" can be long and is unpredictable.
That is adverse to kfd page fault recovery performance.

This patch uses the time stamp of the incoming page fault to decide
whether to drop or handle the page fault. When an app unmaps vm ranges,
kfd records each gpu device's current ih ring time stamp. These time
stamps are used in the kfd page fault recovery routine.

Any page fault that happens on unmapped ranges after the unmap event is
an app bug that accesses a vm range after unmap. It is not the driver's
job to cover that.

By using the page fault time stamp we do not need to drain page faults
in deferred work. So the period during which kfd does not handle page
faults is reduced and can be controlled.
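
A sketch of the idea (field and helper names are illustrative, not the
patch's actual identifiers):

	/* on munmap: latch each GPU's current IH ring timestamp */
	svms->drain_ts[gpuidx] = current_ih_ring_ts(pdd->dev->adev);

	/* in svm_range_restore_pages(..., uint64_t ts, ...): */
	if (ts <= svms->drain_ts[gpuidx])
		return 0;	/* stale fault on an unmapped range: drop it */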

Signed-off-by: Xiaogang.Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c |   4 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h |   2 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c |   3 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  |   4 +-
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h  |   5 +-
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c   | 102 -
 drivers/gpu/drm/amd/amdkfd/kfd_svm.h   |   2 +-
 7 files changed, 79 insertions(+), 43 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 3abfa66d72a2..d90b7ea3f020 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -2763,7 +2763,7 @@ int amdgpu_vm_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
  * shouldn't be reported any more.
  */
 bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
-			u32 vmid, u32 node_id, uint64_t addr,
+			u32 vmid, u32 node_id, uint64_t addr, uint64_t ts,
 			bool write_fault)
 {
 	bool is_compute_context = false;
@@ -2789,7 +2789,7 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
 	addr /= AMDGPU_GPU_PAGE_SIZE;
 
 	if (is_compute_context && !svm_range_restore_pages(adev, pasid, vmid,
-	node_id, addr, write_fault)) {
+	node_id, addr, ts, write_fault)) {
 		amdgpu_bo_unref(&root);
 		return true;
 	}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index 312a408b80d3..1d6a1381ede9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -548,7 +548,7 @@ amdgpu_vm_get_task_info_vm(struct amdgpu_vm *vm);
 void amdgpu_vm_put_task_info(struct amdgpu_task_info *task_info);
 
 bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
-			u32 vmid, u32 node_id, uint64_t addr,
+			u32 vmid, u32 node_id, uint64_t addr, uint64_t ts,
 			bool write_fault);
 
 void amdgpu_vm_set_task_info(struct amdgpu_vm *vm);
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index d933e19e0cf5..3596cc2ee7e5 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -132,7 +132,8 @@ static int gmc_v10_0_process_interrupt(struct amdgpu_device *adev,
 		/* Try to handle the recoverable page faults by filling page
 		 * tables
 		 */
-		if (amdgpu_vm_handle_fault(adev, entry->pasid, 0, 0, addr, write_fault))
+		if (amdgpu_vm_handle_fault(adev, entry->pasid, 0, 0, addr,
+	   entry->timestamp, write_fault))
 			return 1;
 	}
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 350f6b6676f1..ac08d9424feb 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -595,7 +595,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
 			cam_index = entry->src_data[2] & 0x3ff;
 
 			ret = amdgpu_vm_handle_fault(adev, entry->pasid, entry->vmid, node_id,
-		 addr, write_fault);
+		 addr, entry->timestamp, write_fault);
 			WDOORBELL32(adev->irq.retry_cam_doorbell_index, cam_index);
 			if (ret)
 return 1;
@@ -618,7 +618,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
 			 * tables
 			 */
 			if (amdgpu_vm_handle_fault(adev, entry->pasid, entry->vmid, node_id,
-		   addr, write_fault))
+		   addr, entry->timestamp, write_fault))
 return 1;
 		}
 	}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index c51e908f6f19..771c98e104ee 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -850,10 +850,13 @@ struct svm_range_list {
 	struct list_headcriu_svm_metadata_list;
 	spinlock_t			deferred_list_lock;
 	atomic_t			evicted_ranges;
-	atomic_t			drain_pagefaults;
+	/* stop page fault recovery for this process */
+	atomic_t			stop_pf_recovery;
 	struct delayed_work		restore_work;
 	DECLAR

[PATCH] drm/amdkfd: Fix compile error if HMM support not enabled

2024-07-25 Thread Philip Yang
Fixes the build errors below if the kernel config does not enable HMM support:

>> drivers/gpu/drm/amd/amdgpu/../amdkfd/kfd_queue.c:107:26: error:
implicit declaration of function 'svm_range_from_addr'

>> drivers/gpu/drm/amd/amdgpu/../amdkfd/kfd_queue.c:107:24: error:
assignment to 'struct svm_range *' from 'int' makes pointer from integer
without a cast [-Wint-conversion]

>> drivers/gpu/drm/amd/amdgpu/../amdkfd/kfd_queue.c:111:28: error:
invalid use of undefined type 'struct svm_range'

Fixes: de165b53c93f ("drm/amdkfd: Validate user queue svm memory residency")
Reported-by: kernel test robot 
Closes: 
https://lore.kernel.org/oe-kbuild-all/202407252127.zvnxakra-...@intel.com/
Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdkfd/kfd_queue.c | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
index 9807e8adf77d..64c292f0ba1b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
@@ -85,6 +85,8 @@ void uninit_queue(struct queue *q)
kfree(q);
 }
 
+#if IS_ENABLED(CONFIG_HSA_AMD_SVM)
+
 static int kfd_queue_buffer_svm_get(struct kfd_process_device *pdd, u64 addr, 
u64 size)
 {
struct kfd_process *p = pdd->process;
@@ -178,6 +180,18 @@ static void kfd_queue_buffer_svm_put(struct 
kfd_process_device *pdd, u64 addr, u
 
mutex_unlock(&p->svms.lock);
 }
+#else
+
+static int kfd_queue_buffer_svm_get(struct kfd_process_device *pdd, u64 addr, 
u64 size)
+{
+   return -EINVAL;
+}
+
+static void kfd_queue_buffer_svm_put(struct kfd_process_device *pdd, u64 addr, 
u64 size)
+{
+}
+
+#endif
 
 int kfd_queue_buffer_get(struct amdgpu_vm *vm, void __user *addr, struct 
amdgpu_bo **pbo,
 u64 expected_size)
-- 
2.43.2



Re: [PATCH v2] drm/amdkfd: Change kfd/svm page fault drain handling

2024-07-24 Thread Philip Yang

  


On 2024-07-24 09:58, Chen, Xiaogang wrote:

On 7/23/2024 4:02 PM, Philip Yang wrote:

On 2024-07-19 18:17, Xiaogang.Chen wrote:


  From: Xiaogang Chen 

When an app unmaps vm ranges (munmap), kfd/svm starts draining pending
page faults and does not handle any incoming page faults of this process
until a deferred work item gets executed by the default system wq. The
period of "not handling page faults" can be long and is unpredictable.
That is adverse to kfd page fault recovery performance.

This patch uses the time stamp of the incoming page fault to decide
whether to drop or handle the page fault. When an app unmaps vm ranges,
kfd records each gpu device's current ih ring time stamp. These time
stamps are used in the kfd page fault recovery routine.

Any page fault that happens on unmapped ranges after the unmap event is
an app bug that accesses a vm range after unmap. It is not the driver's
job to cover that.

By using the page fault time stamp we do not need to drain page faults
in deferred work. So the period during which kfd does not handle page
faults is reduced and can be controlled.

This simplifies the retry fault draining and supports multiple processes
correctly now; some nitpicks below.

  Signed-off-by: Xiaogang.Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c |   4 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h |   2 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c |   3 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  |   4 +-
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h  |   5 +-
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c   | 111 +
 drivers/gpu/drm/amd/amdkfd/kfd_svm.h   |   2 +-
 7 files changed, 88 insertions(+), 43 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 3abfa66d72a2..d90b7ea3f020 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -2763,7 +2763,7 @@ int amdgpu_vm_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
  * shouldn't be reported any more.
  */
 bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
-			u32 vmid, u32 node_id, uint64_t addr,
+			u32 vmid, u32 node_id, uint64_t addr, uint64_t ts,
 			bool write_fault)
 {
 	bool is_compute_context = false;
@@ -2789,7 +2789,7 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
 	addr /= AMDGPU_GPU_PAGE_SIZE;
 
 	if (is_compute_context && !svm_range_restore_pages(adev, pasid, vmid,
-	node_id, addr, write_fault)) {
+	node_id, addr, ts, write_fault)) {
 		amdgpu_bo_unref(&root);
 		return true;
 	}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index 312a408b80d3..1d6a1381ede9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -548,7 +548,7 @@ amdgpu_vm_get_task_info_vm(struct amdgpu_vm *vm);
 void amdgpu_vm_put_task_info(struct amdgpu_task_info *task_info);
 
 bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
-			u32 vmid, u32 node_id, uint64_t addr,
+			u32 vmid, u32 node_id, uint64_t addr, uint64_t ts,
 			bool write_fault);
 
 void amdgpu_vm_set_task_info(struct amdgpu_vm *vm);
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index d933e19e0cf5..5cceaba6e5c4 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -132,7 +132,8 @@ static int gmc_v10_0_process_interrupt(struct amdgpu_device *adev,
 		/* Try to handle the recoverable page faults by filling page
 		 * tables
 		 */
-		if (amdgpu_vm_handle_fault(adev, entry->pasid, 0, 0, addr, write_fault))
+		if (amdgpu_vm_handle_fault(adev, entry->pasid, 0, 0, addr,
+   entry->timestamp, write_fault))

indent should align to the start bracket.
  
  ok.
  

   			return 1;
 	}
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 350f6b6676f1..ac08d9424feb 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -595,7 +595,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
 			cam_index = entry->src_data[2] & 0x3ff;
 
 			ret = amdgpu_vm_handle_fault(adev, entry->pasid, entry->vmid, node_id,
-		 addr, write_fault);
+		 addr, entry->timestamp, write_fault);
 			WDOORBELL32(adev->irq.retry_cam_doorbell_index, cam_index);
 			if (ret)
 return 1;
@@ -618,7 +618,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
 			 * tables
 			 */
 			if (amdgpu_vm_handle_fault(adev, entry->pasid, entry->vmid, node_id,
-		   addr, write_fault))
+		   addr, entry->timestamp, write_fault))
 return 1;
 		}
 	}
diff --git a/drivers/

Re: [PATCH v2] drm/amdkfd: Change kfd/svm page fault drain handling

2024-07-23 Thread Philip Yang

  


On 2024-07-19 18:17, Xiaogang.Chen wrote:


  From: Xiaogang Chen 

When an app unmaps vm ranges (munmap), kfd/svm starts draining pending
page faults and does not handle any incoming page faults of this process
until a deferred work item gets executed by the default system wq. The
period of "not handling page faults" can be long and is unpredictable.
That is adverse to kfd page fault recovery performance.

This patch uses the time stamp of the incoming page fault to decide
whether to drop or handle the page fault. When an app unmaps vm ranges,
kfd records each gpu device's current ih ring time stamp. These time
stamps are used in the kfd page fault recovery routine.

Any page fault that happens on unmapped ranges after the unmap event is
an app bug that accesses a vm range after unmap. It is not the driver's
job to cover that.

By using the page fault time stamp we do not need to drain page faults
in deferred work. So the period during which kfd does not handle page
faults is reduced and can be controlled.

This simplifies the retry fault draining and supports multiple processes
correctly now; some nitpicks below.

  

Signed-off-by: Xiaogang.Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c |   4 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h |   2 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c |   3 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  |   4 +-
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h  |   5 +-
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c   | 111 +
 drivers/gpu/drm/amd/amdkfd/kfd_svm.h   |   2 +-
 7 files changed, 88 insertions(+), 43 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 3abfa66d72a2..d90b7ea3f020 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -2763,7 +2763,7 @@ int amdgpu_vm_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
  * shouldn't be reported any more.
  */
 bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
-			u32 vmid, u32 node_id, uint64_t addr,
+			u32 vmid, u32 node_id, uint64_t addr, uint64_t ts,
 			bool write_fault)
 {
 	bool is_compute_context = false;
@@ -2789,7 +2789,7 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
 	addr /= AMDGPU_GPU_PAGE_SIZE;
 
 	if (is_compute_context && !svm_range_restore_pages(adev, pasid, vmid,
-	node_id, addr, write_fault)) {
+	node_id, addr, ts, write_fault)) {
 		amdgpu_bo_unref(&root);
 		return true;
 	}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index 312a408b80d3..1d6a1381ede9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -548,7 +548,7 @@ amdgpu_vm_get_task_info_vm(struct amdgpu_vm *vm);
 void amdgpu_vm_put_task_info(struct amdgpu_task_info *task_info);
 
 bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
-			u32 vmid, u32 node_id, uint64_t addr,
+			u32 vmid, u32 node_id, uint64_t addr, uint64_t ts,
 			bool write_fault);
 
 void amdgpu_vm_set_task_info(struct amdgpu_vm *vm);
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index d933e19e0cf5..5cceaba6e5c4 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -132,7 +132,8 @@ static int gmc_v10_0_process_interrupt(struct amdgpu_device *adev,
 		/* Try to handle the recoverable page faults by filling page
 		 * tables
 		 */
-		if (amdgpu_vm_handle_fault(adev, entry->pasid, 0, 0, addr, write_fault))
+		if (amdgpu_vm_handle_fault(adev, entry->pasid, 0, 0, addr,
+   entry->timestamp, write_fault))

indent should align to the start bracket.

  
 			return 1;
 	}
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 350f6b6676f1..ac08d9424feb 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -595,7 +595,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
 			cam_index = entry->src_data[2] & 0x3ff;
 
 			ret = amdgpu_vm_handle_fault(adev, entry->pasid, entry->vmid, node_id,
-		 addr, write_fault);
+		 addr, entry->timestamp, write_fault);
 			WDOORBELL32(adev->irq.retry_cam_doorbell_index, cam_index);
 			if (ret)
 return 1;
@@ -618,7 +618,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
 			 * tables
 			 */
 			if (amdgpu_vm_handle_fault(adev, entry->pasid, entry->vmid, node_id,
-		   addr, write_fault))
+		   addr, entry->timestamp, write_fault))
 return 1;
 		}
 	}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index c51e908f6f19..8b8d5ab9da76 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -850,10 +850,13 @@ struct svm_range_list {
 	struct list_headcriu_svm_metadata_list;
 	spinlock_t			deferr

[PATCH v2 3/9] drm/amdkfd: Refactor queue wptr_bo GART mapping

2024-07-18 Thread Philip Yang
Add helper function kfd_queue_acquire_buffers to get the queue wptr_bo
reference from the queue write_ptr if it is mapped to the KFD node with
the expected size.

Add wptr_bo to structure queue_properties because structure queue is
allocated after the queue buffers are validated; then we can remove the
wptr_bo parameter from pqm_create_queue.

Rename structure queue wptr_bo_gart to hold the wptr_bo reference for
GART mapping and unmapping. Move the MES wptr_bo_gart mapping to
init_user_queue, the same location as the queue ctx_bo GART mapping.

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h|  2 +-
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  |  5 +-
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c  | 56 +++---
 .../drm/amd/amdkfd/kfd_device_queue_manager.c |  6 +-
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 13 +++--
 .../amd/amdkfd/kfd_process_queue_manager.c| 45 +++
 drivers/gpu/drm/amd/amdkfd/kfd_queue.c| 57 +++
 7 files changed, 116 insertions(+), 68 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 6e591280774b..4ed49265c764 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -322,7 +322,7 @@ int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct kgd_mem 
*mem,
 void **kptr, uint64_t *size);
 void amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel(struct kgd_mem *mem);
 
-int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_bo *bo);
+int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_bo *bo, struct amdgpu_bo 
**bo_gart);
 
 int amdgpu_amdkfd_gpuvm_restore_process_bos(void *process_info,
struct dma_fence __rcu **ef);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 199e387d35f4..0ab37e7aec26 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -2226,11 +2226,12 @@ int amdgpu_amdkfd_gpuvm_sync_memory(
 /**
  * amdgpu_amdkfd_map_gtt_bo_to_gart - Map BO to GART and increment reference 
count
  * @bo: Buffer object to be mapped
+ * @bo_gart: Return bo reference
  *
  * Before return, bo reference count is incremented. To release the reference 
and unpin/
  * unmap the BO, call amdgpu_amdkfd_free_gtt_mem.
  */
-int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_bo *bo)
+int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_bo *bo, struct amdgpu_bo 
**bo_gart)
 {
int ret;
 
@@ -2257,7 +2258,7 @@ int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_bo *bo)
 
amdgpu_bo_unreserve(bo);
 
-   bo = amdgpu_bo_ref(bo);
+   *bo_gart = amdgpu_bo_ref(bo);
 
return 0;
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 823f245dc7d0..202f24ee4bd7 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -247,8 +247,8 @@ static int set_queue_properties_from_user(struct 
queue_properties *q_properties,
q_properties->priority = args->queue_priority;
q_properties->queue_address = args->ring_base_address;
q_properties->queue_size = args->ring_size;
-   q_properties->read_ptr = (uint32_t *) args->read_pointer_address;
-   q_properties->write_ptr = (uint32_t *) args->write_pointer_address;
+   q_properties->read_ptr = (void __user *)args->read_pointer_address;
+   q_properties->write_ptr = (void __user *)args->write_pointer_address;
q_properties->eop_ring_buffer_address = args->eop_buffer_address;
q_properties->eop_ring_buffer_size = args->eop_buffer_size;
q_properties->ctx_save_restore_area_address =
@@ -306,7 +306,6 @@ static int kfd_ioctl_create_queue(struct file *filep, 
struct kfd_process *p,
struct kfd_process_device *pdd;
struct queue_properties q_properties;
uint32_t doorbell_offset_in_process = 0;
-   struct amdgpu_bo *wptr_bo = NULL;
 
memset(&q_properties, 0, sizeof(struct queue_properties));
 
@@ -342,53 +341,17 @@ static int kfd_ioctl_create_queue(struct file *filep, 
struct kfd_process *p,
}
}
 
-   /* Starting with GFX11, wptr BOs must be mapped to GART for MES to 
determine work
-* on unmapped queues for usermode queue oversubscription (no 
aggregated doorbell)
-*/
-   if (dev->kfd->shared_resources.enable_mes &&
-   ((dev->adev->mes.sched_version & 
AMDGPU_MES_API_VERSION_MASK)
-   >> AMDGPU_MES_API_VERSION_SHIFT) >= 2) {
-   struct amdgpu_bo_va_mapping *wptr_mapping;
-   struct amdgpu_vm *wptr_vm;
-
-   wptr_vm = drm_priv_to_vm(pdd->drm_priv);
-   err = 

[PATCH v2 8/9] drm/amdkfd: Store queue cwsr area size to node properties

2024-07-18 Thread Philip Yang
Use the queue eop buffer size, cwsr area size, and ctl stack size
calculations from Thunk and store the values in the KFD node properties.

These will be used to validate the queue eop buffer size, cwsr area
size, and ctl stack size when creating a KFD user compute queue.

They will also be exposed to user space via sysfs KFD node properties,
to remove the duplicate calculation code from Thunk.
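
A rough worked example of the numbers (illustrative only, assuming a
GFX11-class node, gfx_target_version 110000, with 48 CUs): wave_num =
48 * 32 = 1536; ctl_stack_size = ALIGN(40 + 1536 * 12 + 8, PAGE_SIZE) =
0x5000; debug_memory_size = ALIGN(1536 * 32, 64) = 48 KiB; cwsr_size is
ctl_stack_size plus the per-CU workgroup context data, which dominates
at tens of MiB.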

Signed-off-by: Philip Yang 
Reviewed-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h |  1 +
 drivers/gpu/drm/amd/amdkfd/kfd_queue.c| 75 +++
 drivers/gpu/drm/amd/amdkfd/kfd_topology.c |  2 +
 drivers/gpu/drm/amd/amdkfd/kfd_topology.h |  4 ++
 4 files changed, 82 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index c31589043d5b..b5cae48dff66 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -1295,6 +1295,7 @@ int kfd_queue_buffer_get(struct amdgpu_vm *vm, void 
__user *addr, struct amdgpu_
 void kfd_queue_buffer_put(struct amdgpu_vm *vm, struct amdgpu_bo **bo);
 int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct 
queue_properties *properties);
 int kfd_queue_release_buffers(struct kfd_process_device *pdd, struct 
queue_properties *properties);
+void kfd_queue_ctx_save_restore_size(struct kfd_topology_device *dev);
 
 struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type,
struct kfd_node *dev);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
index 67242ce051b5..adcda9730c9f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
@@ -24,6 +24,7 @@
 
 #include 
 #include "kfd_priv.h"
+#include "kfd_topology.h"
 #include "kfd_svm.h"
 
 void print_queue_properties(struct queue_properties *q)
@@ -305,3 +306,77 @@ int kfd_queue_release_buffers(struct kfd_process_device 
*pdd, struct queue_prope
 properties->ctx_save_restore_area_size);
return 0;
 }
+
+#define SGPR_SIZE_PER_CU	0x4000
+#define LDS_SIZE_PER_CU		0x10000
+#define HWREG_SIZE_PER_CU	0x1000
+#define DEBUGGER_BYTES_ALIGN	64
+#define DEBUGGER_BYTES_PER_WAVE	32
+
+static u32 kfd_get_vgpr_size_per_cu(u32 gfxv)
+{
+   u32 vgpr_size = 0x40000;
+
+   if ((gfxv / 100 * 100) == 90400 ||  /* GFX_VERSION_AQUA_VANJARAM */
+   gfxv == 90010 ||/* GFX_VERSION_ALDEBARAN */
+   gfxv == 90008)  /* GFX_VERSION_ARCTURUS */
+   vgpr_size = 0x80000;
+   else if (gfxv == 110000 ||  /* GFX_VERSION_PLUM_BONITO */
+gfxv == 110001 ||  /* GFX_VERSION_WHEAT_NAS */
+gfxv == 120000 ||  /* GFX_VERSION_GFX1200 */
+gfxv == 120001)/* GFX_VERSION_GFX1201 */
+   vgpr_size = 0x60000;
+
+   return vgpr_size;
+}
+
+#define WG_CONTEXT_DATA_SIZE_PER_CU(gfxv)  \
+   (kfd_get_vgpr_size_per_cu(gfxv) + SGPR_SIZE_PER_CU +\
+LDS_SIZE_PER_CU + HWREG_SIZE_PER_CU)
+
+#define CNTL_STACK_BYTES_PER_WAVE(gfxv)\
+   ((gfxv) >= 100100 ? 12 : 8) /* GFX_VERSION_NAVI10*/
+
+#define SIZEOF_HSA_USER_CONTEXT_SAVE_AREA_HEADER 40
+
+void kfd_queue_ctx_save_restore_size(struct kfd_topology_device *dev)
+{
+   struct kfd_node_properties *props = &dev->node_props;
+   u32 gfxv = props->gfx_target_version;
+   u32 ctl_stack_size;
+   u32 wg_data_size;
+   u32 wave_num;
+   u32 cu_num;
+
+   if (gfxv < 80001)   /* GFX_VERSION_CARRIZO */
+   return;
+
+   cu_num = props->simd_count / props->simd_per_cu / 
NUM_XCC(dev->gpu->xcc_mask);
+   wave_num = (gfxv < 100100) ?/* GFX_VERSION_NAVI10 */
+   min(cu_num * 40, props->array_count / 
props->simd_arrays_per_engine * 512)
+   : cu_num * 32;
+
+   wg_data_size = ALIGN(cu_num * WG_CONTEXT_DATA_SIZE_PER_CU(gfxv), 
PAGE_SIZE);
+   ctl_stack_size = wave_num * CNTL_STACK_BYTES_PER_WAVE(gfxv) + 8;
+   ctl_stack_size = ALIGN(SIZEOF_HSA_USER_CONTEXT_SAVE_AREA_HEADER + 
ctl_stack_size,
+  PAGE_SIZE);
+
+   if ((gfxv / 10000 * 10000) == 100000) {
+   /* HW design limits control stack size to 0x7000.
+* This is insufficient for theoretical PM4 cases
+* but sufficient for AQL, limited by SPI events.
+*/
+   ctl_stack_size = min(ctl_stack_size, 0x7000);
+   }
+
+   props->ctl_stack_size = ctl_stack_size;
+   props->debug_memory_size = ALIGN(wave_num * DEBUGGER_BYTES_PER_WAVE, 
DEBUGGER_BYTES_ALIGN);
+   props->cwsr_size = ctl_stack_size + wg_data_size;
+
+   if (gfxv == 80002)  /* GFX_VERSION_TONGA */
+   props->eop_buffer_size = 0x

[PATCH v2 2/9] drm/amdkfd: amdkfd_free_gtt_mem clear the correct pointer

2024-07-18 Thread Philip Yang
Pass a pointer reference to amdgpu_bo_unref to clear the correct
pointer; otherwise amdgpu_bo_unref clears only the local variable and
the original pointer is not set to NULL, which could cause a
use-after-free bug.

Signed-off-by: Philip Yang 
Reviewed-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 14 +++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h |  2 +-
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c   |  2 +-
 drivers/gpu/drm/amd/amdkfd/kfd_device.c|  4 ++--
 .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c  |  2 +-
 drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c   |  2 +-
 drivers/gpu/drm/amd/amdkfd/kfd_process.c   |  2 +-
 .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c |  4 ++--
 8 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 03205e3c3746..c272461d70a9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -364,15 +364,15 @@ int amdgpu_amdkfd_alloc_gtt_mem(struct amdgpu_device 
*adev, size_t size,
return r;
 }
 
-void amdgpu_amdkfd_free_gtt_mem(struct amdgpu_device *adev, void *mem_obj)
+void amdgpu_amdkfd_free_gtt_mem(struct amdgpu_device *adev, void **mem_obj)
 {
-   struct amdgpu_bo *bo = (struct amdgpu_bo *) mem_obj;
+   struct amdgpu_bo **bo = (struct amdgpu_bo **) mem_obj;
 
-   amdgpu_bo_reserve(bo, true);
-   amdgpu_bo_kunmap(bo);
-   amdgpu_bo_unpin(bo);
-   amdgpu_bo_unreserve(bo);
-   amdgpu_bo_unref(&(bo));
+   amdgpu_bo_reserve(*bo, true);
+   amdgpu_bo_kunmap(*bo);
+   amdgpu_bo_unpin(*bo);
+   amdgpu_bo_unreserve(*bo);
+   amdgpu_bo_unref(bo);
 }
 
 int amdgpu_amdkfd_alloc_gws(struct amdgpu_device *adev, size_t size,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 66b1c72c81e5..6e591280774b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -235,7 +235,7 @@ int amdgpu_amdkfd_bo_validate_and_fence(struct amdgpu_bo 
*bo,
 int amdgpu_amdkfd_alloc_gtt_mem(struct amdgpu_device *adev, size_t size,
void **mem_obj, uint64_t *gpu_addr,
void **cpu_ptr, bool mqd_gfx9);
-void amdgpu_amdkfd_free_gtt_mem(struct amdgpu_device *adev, void *mem_obj);
+void amdgpu_amdkfd_free_gtt_mem(struct amdgpu_device *adev, void **mem_obj);
 int amdgpu_amdkfd_alloc_gws(struct amdgpu_device *adev, size_t size,
void **mem_obj);
 void amdgpu_amdkfd_free_gws(struct amdgpu_device *adev, void *mem_obj);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 1d9b21628be7..823f245dc7d0 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -423,7 +423,7 @@ static int kfd_ioctl_create_queue(struct file *filep, 
struct kfd_process *p,
 
 err_create_queue:
if (wptr_bo)
-   amdgpu_amdkfd_free_gtt_mem(dev->adev, wptr_bo);
+   amdgpu_amdkfd_free_gtt_mem(dev->adev, (void **)&wptr_bo);
 err_wptr_map_gart:
 err_bind_process:
 err_pdd:
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index f4d20adaa068..6619028dd58b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -907,7 +907,7 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
 kfd_doorbell_error:
kfd_gtt_sa_fini(kfd);
 kfd_gtt_sa_init_error:
-   amdgpu_amdkfd_free_gtt_mem(kfd->adev, kfd->gtt_mem);
+   amdgpu_amdkfd_free_gtt_mem(kfd->adev, &kfd->gtt_mem);
 alloc_gtt_mem_failure:
dev_err(kfd_device,
"device %x:%x NOT added due to errors\n",
@@ -925,7 +925,7 @@ void kgd2kfd_device_exit(struct kfd_dev *kfd)
kfd_doorbell_fini(kfd);
ida_destroy(&kfd->doorbell_ida);
kfd_gtt_sa_fini(kfd);
-   amdgpu_amdkfd_free_gtt_mem(kfd->adev, kfd->gtt_mem);
+   amdgpu_amdkfd_free_gtt_mem(kfd->adev, &kfd->gtt_mem);
}
 
kfree(kfd);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 4f48507418d2..420444eb8e98 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -2621,7 +2621,7 @@ static void deallocate_hiq_sdma_mqd(struct kfd_node *dev,
 {
WARN(!mqd, "No hiq sdma mqd trunk to free");
 
-   amdgpu_amdkfd_free_gtt_mem(dev->adev, mqd->gtt_mem);
+   amdgpu_amdkfd_free_gtt_mem(dev->adev, &mqd->gtt_mem);
 }
 
 void device_queue_manager_uninit(struct device_queue_manager *dqm)
diff --git a/drivers/gpu/drm/amd/am

[PATCH v2 4/9] drm/amdkfd: Validate user queue buffers

2024-07-18 Thread Philip Yang
Find the user queue rptr, ring buffer, eop buffer and cwsr area BOs,
check that the BOs are mapped on the GPU with the correct size, and take
the BO references.
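
Note the size-0 semantics in the first hunk below: the CWSR area is
looked up with expected_size 0, which skips the size check and only
requires the start address to match (a condensed view of the check as
modified):

	if (user_addr != mapping->start ||
	    (size != 0 && user_addr + size - 1 != mapping->last))
		goto out_err;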

Signed-off-by: Philip Yang 
Reviewed-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h  |  4 +++
 drivers/gpu/drm/amd/amdkfd/kfd_queue.c | 38 --
 2 files changed, 40 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index aba9bcd91f65..80d8080c5764 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -524,6 +524,10 @@ struct queue_properties {
uint64_t exception_status;
 
struct amdgpu_bo *wptr_bo;
+   struct amdgpu_bo *rptr_bo;
+   struct amdgpu_bo *ring_bo;
+   struct amdgpu_bo *eop_buf_bo;
+   struct amdgpu_bo *cwsr_bo;
 };
 
 #define QUEUE_IS_ACTIVE(q) ((q).queue_size > 0 &&  \
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
index b4529ec298a9..0e661160c295 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
@@ -97,7 +97,8 @@ int kfd_queue_buffer_get(struct amdgpu_vm *vm, void __user 
*addr, struct amdgpu_
if (!mapping)
goto out_err;
 
-   if (user_addr != mapping->start || user_addr + size - 1 != 
mapping->last) {
+   if (user_addr != mapping->start ||
+   (size != 0 && user_addr + size - 1 != mapping->last)) {
pr_debug("expected size 0x%llx not equal to mapping addr 0x%llx 
size 0x%llx\n",
expected_size, mapping->start << AMDGPU_GPU_PAGE_SHIFT,
(mapping->last - mapping->start + 1) << 
AMDGPU_GPU_PAGE_SHIFT);
@@ -124,18 +125,51 @@ int kfd_queue_acquire_buffers(struct kfd_process_device 
*pdd, struct queue_prope
 
err = kfd_queue_buffer_get(vm, properties->write_ptr, 
&properties->wptr_bo, PAGE_SIZE);
if (err)
+   goto out_err_unreserve;
+
+   err = kfd_queue_buffer_get(vm, properties->read_ptr, 
&properties->rptr_bo, PAGE_SIZE);
+   if (err)
+   goto out_err_unreserve;
+
+   err = kfd_queue_buffer_get(vm, (void *)properties->queue_address,
+  &properties->ring_bo, 
properties->queue_size);
+   if (err)
+   goto out_err_unreserve;
+
+   /* only compute queue requires EOP buffer and CWSR area */
+   if (properties->type != KFD_QUEUE_TYPE_COMPUTE)
goto out_unreserve;
 
+   /* EOP buffer is not required for all ASICs */
+   if (properties->eop_ring_buffer_address) {
+   err = kfd_queue_buffer_get(vm, (void 
*)properties->eop_ring_buffer_address,
+  &properties->eop_buf_bo,
+  properties->eop_ring_buffer_size);
+   if (err)
+   goto out_err_unreserve;
+   }
+
+   err = kfd_queue_buffer_get(vm, (void 
*)properties->ctx_save_restore_area_address,
+  &properties->cwsr_bo, 0);
+   if (err)
+   goto out_err_unreserve;
+
+out_unreserve:
amdgpu_bo_unreserve(vm->root.bo);
return 0;
 
-out_unreserve:
+out_err_unreserve:
amdgpu_bo_unreserve(vm->root.bo);
+   kfd_queue_release_buffers(pdd, properties);
return err;
 }
 
 int kfd_queue_release_buffers(struct kfd_process_device *pdd, struct 
queue_properties *properties)
 {
amdgpu_bo_unref(&properties->wptr_bo);
+   amdgpu_bo_unref(&properties->rptr_bo);
+   amdgpu_bo_unref(&properties->ring_bo);
+   amdgpu_bo_unref(&properties->eop_buf_bo);
+   amdgpu_bo_unref(&properties->cwsr_bo);
return 0;
 }
-- 
2.43.2



[PATCH v2 5/9] drm/amdkfd: Ensure user queue buffers residency

2024-07-18 Thread Philip Yang
Add a queue_refcount to struct bo_va and return -EBUSY to fail unmapping
a BO from the GPU if the bo_va queue_refcount is not zero.

Queue creation increases the bo_va queue_refcount and queue destruction
decreases it, to ensure the queue buffers stay mapped on the GPU while
the queue is active.

Signed-off-by: Philip Yang 
---
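
The refcount gating can be illustrated with a minimal standalone sketch
(toy types, not the amdgpu API): unmap is refused with -EBUSY while any
user queue still holds a reference on the VA mapping.

#include <errno.h>
#include <stdio.h>

struct bo_va { unsigned int queue_refcount; };

static int unmap_bo(struct bo_va *v)
{
	if (v->queue_refcount)
		return -EBUSY;	/* a user queue may still access this VA */
	/* the real code would tear down the GPU page table entries here */
	return 0;
}

int main(void)
{
	struct bo_va v = { 0 };

	v.queue_refcount++;	/* queue create */
	printf("unmap while queue active: %d\n", unmap_bo(&v));	/* -EBUSY */
	v.queue_refcount--;	/* queue destroy */
	printf("unmap after queue destroy: %d\n", unmap_bo(&v));	/* 0 */
	return 0;
}
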
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 14 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_object.h|  6 
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c  |  3 +-
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h |  1 +
 drivers/gpu/drm/amd/amdkfd/kfd_queue.c| 34 ---
 5 files changed, 49 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 0ab37e7aec26..6d5fd371d5ce 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -1252,7 +1252,7 @@ static int unreserve_bo_and_vms(struct 
bo_vm_reservation_context *ctx,
return ret;
 }
 
-static void unmap_bo_from_gpuvm(struct kgd_mem *mem,
+static int unmap_bo_from_gpuvm(struct kgd_mem *mem,
struct kfd_mem_attachment *entry,
struct amdgpu_sync *sync)
 {
@@ -1260,11 +1260,18 @@ static void unmap_bo_from_gpuvm(struct kgd_mem *mem,
struct amdgpu_device *adev = entry->adev;
struct amdgpu_vm *vm = bo_va->base.vm;
 
+   if (bo_va->queue_refcount) {
+   pr_debug("bo_va->queue_refcount %d\n", bo_va->queue_refcount);
+   return -EBUSY;
+   }
+
amdgpu_vm_bo_unmap(adev, bo_va, entry->va);
 
amdgpu_vm_clear_freed(adev, vm, &bo_va->last_pt_update);
 
amdgpu_sync_fence(sync, bo_va->last_pt_update);
+
+   return 0;
 }
 
 static int update_gpuvm_pte(struct kgd_mem *mem,
@@ -2191,7 +2198,10 @@ int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(
pr_debug("\t unmap VA 0x%llx - 0x%llx from entry %p\n",
 entry->va, entry->va + bo_size, entry);
 
-   unmap_bo_from_gpuvm(mem, entry, ctx.sync);
+   ret = unmap_bo_from_gpuvm(mem, entry, ctx.sync);
+   if (ret)
+   goto unreserve_out;
+
entry->is_mapped = false;
 
mem->mapped_to_gpu_memory--;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
index bc42ccbde659..d7e27957013f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
@@ -90,6 +90,12 @@ struct amdgpu_bo_va {
boolcleared;
 
boolis_xgmi;
+
+   /*
+* protected by vm reservation lock
+* if non-zero, cannot unmap from GPU because user queues may still 
access it
+*/
+   unsigned intqueue_refcount;
 };
 
 struct amdgpu_bo {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 202f24ee4bd7..65a37ac5a0f0 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -1384,8 +1384,7 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file 
*filep,
err = amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(
peer_pdd->dev->adev, (struct kgd_mem *)mem, 
peer_pdd->drm_priv);
if (err) {
-   pr_err("Failed to unmap from gpu %d/%d\n",
-  i, args->n_devices);
+   pr_debug("Failed to unmap from gpu %d/%d\n", i, 
args->n_devices);
goto unmap_memory_from_gpu_failed;
}
args->n_success = i+1;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 80d8080c5764..c31589043d5b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -1292,6 +1292,7 @@ void print_queue_properties(struct queue_properties *q);
 void print_queue(struct queue *q);
 int kfd_queue_buffer_get(struct amdgpu_vm *vm, void __user *addr, struct 
amdgpu_bo **pbo,
 u64 expected_size);
+void kfd_queue_buffer_put(struct amdgpu_vm *vm, struct amdgpu_bo **bo);
 int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct 
queue_properties *properties);
 int kfd_queue_release_buffers(struct kfd_process_device *pdd, struct 
queue_properties *properties);
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
index 0e661160c295..3fd386dcb011 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
@@ -106,6 +106,7 @@ int kfd_queue_buffer_get(struct amdgpu_vm *vm, void __user 
*addr, struct

[PATCH v2 9/9] drm/amdkfd: Validate queue cwsr area and eop buffer size

2024-07-18 Thread Philip Yang
When creating a KFD user compute queue, check that the queue eop buffer
size, cwsr area size and ctl stack size are equal to the sizes in the KFD
node properties.

Check the entire cwsr area, which may be split into multiple svm ranges
aligned to granularity boundaries.

Signed-off-by: Philip Yang 
Reviewed-by: Felix Kuehling 
---
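
For reference, the total CWSR area size checked below can be reproduced
with a short standalone sketch; the per-XCC sizes and XCC count are
made-up example values, not taken from a real node.

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096
#define ALIGN(x, a) (((x) + (a) - 1) / (a) * (a))

int main(void)
{
	/* Hypothetical node: per-XCC cwsr 0x1545000, debug 0x10000, 4 XCCs. */
	uint32_t cwsr_size = 0x1545000;
	uint32_t debug_memory_size = 0x10000;
	uint32_t num_xcc = 4;
	uint32_t total = ALIGN((cwsr_size + debug_memory_size) * num_xcc,
			       PAGE_SIZE);

	printf("total_cwsr_size = 0x%x\n", total);	/* 0x5554000 */
	return 0;
}
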
 drivers/gpu/drm/amd/amdkfd/kfd_queue.c | 46 +++---
 1 file changed, 42 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
index adcda9730c9f..9807e8adf77d 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
@@ -225,9 +225,15 @@ void kfd_queue_buffer_put(struct amdgpu_vm *vm, struct 
amdgpu_bo **bo)
 
 int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct 
queue_properties *properties)
 {
+   struct kfd_topology_device *topo_dev;
struct amdgpu_vm *vm;
+   u32 total_cwsr_size;
int err;
 
+   topo_dev = kfd_topology_device_by_id(pdd->dev->id);
+   if (!topo_dev)
+   return -EINVAL;
+
vm = drm_priv_to_vm(pdd->drm_priv);
err = amdgpu_bo_reserve(vm->root.bo, false);
if (err)
@@ -252,6 +258,12 @@ int kfd_queue_acquire_buffers(struct kfd_process_device 
*pdd, struct queue_prope
 
/* EOP buffer is not required for all ASICs */
if (properties->eop_ring_buffer_address) {
+   if (properties->eop_ring_buffer_size != 
topo_dev->node_props.eop_buffer_size) {
+   pr_debug("queue eop bo size 0x%lx not equal to node eop 
buf size 0x%x\n",
+   properties->eop_buf_bo->tbo.base.size,
+   topo_dev->node_props.eop_buffer_size);
+   goto out_err_unreserve;
+   }
err = kfd_queue_buffer_get(vm, (void 
*)properties->eop_ring_buffer_address,
   &properties->eop_buf_bo,
   properties->eop_ring_buffer_size);
@@ -259,15 +271,33 @@ int kfd_queue_acquire_buffers(struct kfd_process_device 
*pdd, struct queue_prope
goto out_err_unreserve;
}
 
+   if (properties->ctl_stack_size != topo_dev->node_props.ctl_stack_size) {
+   pr_debug("queue ctl stack size 0x%x not equal to node ctl stack 
size 0x%x\n",
+   properties->ctl_stack_size,
+   topo_dev->node_props.ctl_stack_size);
+   goto out_err_unreserve;
+   }
+
+   if (properties->ctx_save_restore_area_size != 
topo_dev->node_props.cwsr_size) {
+   pr_debug("queue cwsr size 0x%x not equal to node cwsr size 
0x%x\n",
+   properties->ctx_save_restore_area_size,
+   topo_dev->node_props.cwsr_size);
+   goto out_err_unreserve;
+   }
+
+   total_cwsr_size = (topo_dev->node_props.cwsr_size + 
topo_dev->node_props.debug_memory_size)
+ * NUM_XCC(pdd->dev->xcc_mask);
+   total_cwsr_size = ALIGN(total_cwsr_size, PAGE_SIZE);
+
err = kfd_queue_buffer_get(vm, (void 
*)properties->ctx_save_restore_area_address,
-  &properties->cwsr_bo, 0);
+  &properties->cwsr_bo, total_cwsr_size);
if (!err)
goto out_unreserve;
 
amdgpu_bo_unreserve(vm->root.bo);
 
err = kfd_queue_buffer_svm_get(pdd, 
properties->ctx_save_restore_area_address,
-  properties->ctx_save_restore_area_size);
+  total_cwsr_size);
if (err)
goto out_err_release;
 
@@ -286,7 +316,9 @@ int kfd_queue_acquire_buffers(struct kfd_process_device 
*pdd, struct queue_prope
 
 int kfd_queue_release_buffers(struct kfd_process_device *pdd, struct 
queue_properties *properties)
 {
+   struct kfd_topology_device *topo_dev;
struct amdgpu_vm *vm;
+   u32 total_cwsr_size;
int err;
 
vm = drm_priv_to_vm(pdd->drm_priv);
@@ -302,8 +334,14 @@ int kfd_queue_release_buffers(struct kfd_process_device 
*pdd, struct queue_prope
 
amdgpu_bo_unreserve(vm->root.bo);
 
-   kfd_queue_buffer_svm_put(pdd, properties->ctx_save_restore_area_address,
-properties->ctx_save_restore_area_size);
+   topo_dev = kfd_topology_device_by_id(pdd->dev->id);
+   if (!topo_dev)
+   return -EINVAL;
+   total_cwsr_size = (topo_dev->node_props.cwsr_size + 
topo_dev->node_props.debug_memory_size)
+ * NUM_XCC(pdd->dev->xcc_mask);
+   total_cwsr_size = ALIGN(total_cwsr_size, PAGE_SIZE);
+
+   kfd_queue_buffer_svm_put(pdd, 
properties->ctx_save_restore_area_address, total_cwsr_size);
return 0;
 }
 
-- 
2.43.2



[PATCH v2 6/9] drm/amdkfd: Validate user queue svm memory residency

2024-07-18 Thread Philip Yang
The queue CWSR area may be registered to the GPU as svm memory; on queue
creation, ensure the svm range is mapped to the GPU with the
KFD_IOCTL_SVM_FLAG_GPU_ALWAYS_MAPPED flag.

Add queue_refcount to struct svm_range, to track queue CWSR area usage.

Because the unmap mmu notifier callback's return value is ignored, if the
application unmaps the CWSR area while the queue is active, print a
pr_warn message to the dmesg log. To be safe, also evict the user queue.

Signed-off-by: Philip Yang 
Reviewed-by: Felix Kuehling 
---
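
The walk over split ranges can be sketched standalone as follows. The
range struct is a hypothetical stand-in for an svm prange, and the sketch
is simplified: ranges are given as a sorted array and assumed to start
exactly at the walk address, whereas the real code looks each range up in
an interval tree under the svms lock and also checks access bits and
mapping flags.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical prange: [start, last] in pages, inclusive. */
struct range { uint64_t start, last; };

/* Valid only if consecutive ranges cover [addr, addr + size) completely. */
static bool covered(const struct range *r, int n, uint64_t addr, uint64_t size)
{
	for (int i = 0; i < n && size; i++) {
		if (r[i].start != addr)
			return false;	/* gap: area not fully registered */
		uint64_t len = r[i].last - r[i].start + 1;
		if (len >= size)
			return true;
		size -= len;
		addr += len;
	}
	return size == 0;
}

int main(void)
{
	/* CWSR area split across two granularity-aligned ranges. */
	struct range r[] = { { 0x100, 0x1ff }, { 0x200, 0x2ff } };

	printf("%d\n", covered(r, 2, 0x100, 0x200));	/* 1: fully covered */
	printf("%d\n", covered(r, 2, 0x100, 0x300));	/* 0: tail missing */
	return 0;
}
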
 drivers/gpu/drm/amd/amdkfd/kfd_queue.c | 110 -
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c   |  12 +++
 drivers/gpu/drm/amd/amdkfd/kfd_svm.h   |   1 +
 3 files changed, 122 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
index 3fd386dcb011..67242ce051b5 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
@@ -24,6 +24,7 @@
 
 #include 
 #include "kfd_priv.h"
+#include "kfd_svm.h"
 
 void print_queue_properties(struct queue_properties *q)
 {
@@ -83,6 +84,100 @@ void uninit_queue(struct queue *q)
kfree(q);
 }
 
+static int kfd_queue_buffer_svm_get(struct kfd_process_device *pdd, u64 addr, 
u64 size)
+{
+   struct kfd_process *p = pdd->process;
+   struct list_head update_list;
+   struct svm_range *prange;
+   int ret = -EINVAL;
+
+   INIT_LIST_HEAD(&update_list);
+   addr >>= PAGE_SHIFT;
+   size >>= PAGE_SHIFT;
+
+   mutex_lock(&p->svms.lock);
+
+   /*
+* range may be split into multiple svm pranges aligned to granularity boundary.
+*/
+   while (size) {
+   uint32_t gpuid, gpuidx;
+   int r;
+
+   prange = svm_range_from_addr(&p->svms, addr, NULL);
+   if (!prange)
+   break;
+
+   if (!prange->mapped_to_gpu)
+   break;
+
+   r = kfd_process_gpuid_from_node(p, pdd->dev, &gpuid, &gpuidx);
+   if (r < 0)
+   break;
+   if (!test_bit(gpuidx, prange->bitmap_access) &&
+   !test_bit(gpuidx, prange->bitmap_aip))
+   break;
+
+   if (!(prange->flags & KFD_IOCTL_SVM_FLAG_GPU_ALWAYS_MAPPED))
+   break;
+
+   list_add(&prange->update_list, &update_list);
+
+   if (prange->last - prange->start + 1 >= size) {
+   size = 0;
+   break;
+   }
+
+   size -= prange->last - prange->start + 1;
+   addr += prange->last - prange->start + 1;
+   }
+   if (size) {
+   pr_debug("[0x%llx 0x%llx] not registered\n", addr, addr + size 
- 1);
+   goto out_unlock;
+   }
+
+   list_for_each_entry(prange, &update_list, update_list)
+   atomic_inc(&prange->queue_refcount);
+   ret = 0;
+
+out_unlock:
+   mutex_unlock(&p->svms.lock);
+   return ret;
+}
+
+static void kfd_queue_buffer_svm_put(struct kfd_process_device *pdd, u64 addr, 
u64 size)
+{
+   struct kfd_process *p = pdd->process;
+   struct svm_range *prange, *pchild;
+   struct interval_tree_node *node;
+   unsigned long last;
+
+   addr >>= PAGE_SHIFT;
+   last = addr + (size >> PAGE_SHIFT) - 1;
+
+   mutex_lock(&p->svms.lock);
+
+   node = interval_tree_iter_first(&p->svms.objects, addr, last);
+   while (node) {
+   struct interval_tree_node *next_node;
+   unsigned long next_start;
+
+   prange = container_of(node, struct svm_range, it_node);
+   next_node = interval_tree_iter_next(node, addr, last);
+   next_start = min(node->last, last) + 1;
+
+   if (atomic_add_unless(&prange->queue_refcount, -1, 0)) {
+   list_for_each_entry(pchild, &prange->child_list, 
child_list)
+   atomic_add_unless(&pchild->queue_refcount, -1, 
0);
+   }
+
+   node = next_node;
+   addr = next_start;
+   }
+
+   mutex_unlock(&p->svms.lock);
+}
+
 int kfd_queue_buffer_get(struct amdgpu_vm *vm, void __user *addr, struct 
amdgpu_bo **pbo,
 u64 expected_size)
 {
@@ -165,8 +260,17 @@ int kfd_queue_acquire_buffers(struct kfd_process_device 
*pdd, struct queue_prope
 
err = kfd_queue_buffer_get(vm, (void 
*)properties->ctx_save_restore_area_address,
   &properties->cwsr_bo, 0);
+   if (!err)
+   goto out_unreserve;
+
+   amdgpu_bo_unreserve(vm->root.bo);
+
+   err = kfd_queue_buffer_svm_get(pdd, 
properties->ctx_save_restore_area_address,
+

[PATCH v2 7/9] drm/amdkfd: Validate user queue update

2024-07-18 Thread Philip Yang
Ensure the new ring buffer of an updated queue is mapped on the GPU with
the correct size.

Decrease the old ring_bo queue_refcount and increase the new ring_bo
queue_refcount.

Signed-off-by: Philip Yang 
Reviewed-by: Felix Kuehling 
---
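
The reference hand-over on queue update can be illustrated with a
standalone sketch (a toy refcounted object, not the amdgpu API): take the
reference on the new ring buffer first, then drop the old one.

#include <stdio.h>

struct bo { int refcount; };

static struct bo *bo_get(struct bo *b) { b->refcount++; return b; }

static void bo_put(struct bo **b)
{
	(*b)->refcount--;
	*b = NULL;	/* like amdgpu_bo_unref, clears the caller's pointer */
}

int main(void)
{
	struct bo old_ring = { 1 }, new_ring = { 0 };
	struct bo *cur = &old_ring;

	/* Queue update: take the new ring buffer reference first, then drop
	 * the old one (mirrors kfd_queue_buffer_get + kfd_queue_buffer_put). */
	struct bo *next = bo_get(&new_ring);
	bo_put(&cur);
	cur = next;

	printf("old=%d new=%d cur=%d\n",
	       old_ring.refcount, new_ring.refcount, cur->refcount); /* 0 1 1 */
	return 0;
}
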
 .../amd/amdkfd/kfd_process_queue_manager.c| 32 ++-
 1 file changed, 31 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
index 4947f28b3afb..9995dbb43359 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
@@ -549,11 +549,41 @@ int pqm_update_queue_properties(struct 
process_queue_manager *pqm,
struct process_queue_node *pqn;
 
pqn = get_queue_by_qid(pqm, qid);
-   if (!pqn) {
+   if (!pqn || !pqn->q) {
pr_debug("No queue %d exists for update operation\n", qid);
return -EFAULT;
}
 
+   /*
+* Update with NULL ring address is used to disable the queue
+*/
+   if (p->queue_address && p->queue_size) {
+   struct kfd_process_device *pdd;
+   struct amdgpu_vm *vm;
+   struct queue *q = pqn->q;
+   int err;
+
+   pdd = kfd_get_process_device_data(q->device, q->process);
+   if (!pdd)
+   return -ENODEV;
+   vm = drm_priv_to_vm(pdd->drm_priv);
+   err = amdgpu_bo_reserve(vm->root.bo, false);
+   if (err)
+   return err;
+
+   if (kfd_queue_buffer_get(vm, (void *)p->queue_address, 
&p->ring_bo,
+p->queue_size)) {
+   pr_debug("ring buf 0x%llx size 0x%llx not mapped on 
GPU\n",
+p->queue_address, p->queue_size);
+   return -EFAULT;
+   }
+
+   kfd_queue_buffer_put(vm, &pqn->q->properties.ring_bo);
+   amdgpu_bo_unreserve(vm->root.bo);
+
+   pqn->q->properties.ring_bo = p->ring_bo;
+   }
+
pqn->q->properties.queue_address = p->queue_address;
pqn->q->properties.queue_size = p->queue_size;
pqn->q->properties.queue_percent = p->queue_percent;
-- 
2.43.2



[PATCH v2 1/9] drm/amdkfd: kfd_bo_mapped_dev support partition

2024-07-18 Thread Philip Yang
Change amdgpu_amdkfd_bo_mapped_to_dev to take drm_priv as a parameter
instead of adev, to support spatial partitioning. This is currently only
used by CRIU checkpoint/restore. No functional change.

Signed-off-by: Philip Yang 
Reviewed-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h   | 2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 5 +++--
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 2 +-
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index e7bb1ca35801..66b1c72c81e5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -345,7 +345,7 @@ void 
amdgpu_amdkfd_ras_pasid_poison_consumption_handler(struct amdgpu_device *ad
pasid_notify pasid_fn, void *data, uint32_t reset);
 
 bool amdgpu_amdkfd_is_fed(struct amdgpu_device *adev);
-bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device *adev, struct kgd_mem 
*mem);
+bool amdgpu_amdkfd_bo_mapped_to_dev(void *drm_priv, struct kgd_mem *mem);
 void amdgpu_amdkfd_block_mmu_notifications(void *p);
 int amdgpu_amdkfd_criu_resume(void *p);
 bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 11672bfe4fad..199e387d35f4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -3200,12 +3200,13 @@ int amdgpu_amdkfd_get_tile_config(struct amdgpu_device 
*adev,
return 0;
 }
 
-bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device *adev, struct kgd_mem 
*mem)
+bool amdgpu_amdkfd_bo_mapped_to_dev(void *drm_priv, struct kgd_mem *mem)
 {
+   struct amdgpu_vm *vm = drm_priv_to_vm(drm_priv);
struct kfd_mem_attachment *entry;
 
list_for_each_entry(entry, &mem->attachments, list) {
-   if (entry->is_mapped && entry->adev == adev)
+   if (entry->is_mapped && entry->bo_va->base.vm == vm)
return true;
}
return false;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 32e5db509560..1d9b21628be7 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -1963,7 +1963,7 @@ static int criu_checkpoint_bos(struct kfd_process *p,
bo_bucket->offset = 
amdgpu_bo_mmap_offset(dumper_bo);
 
for (i = 0; i < p->n_pdds; i++) {
-   if 
(amdgpu_amdkfd_bo_mapped_to_dev(p->pdds[i]->dev->adev, kgd_mem))
+   if 
(amdgpu_amdkfd_bo_mapped_to_dev(p->pdds[i]->drm_priv, kgd_mem))
bo_priv->mapped_gpuids[dev_idx++] = 
p->pdds[i]->user_gpu_id;
}
 
-- 
2.43.2



[PATCH v2 0/9] KFD user queue validation

2024-07-18 Thread Philip Yang
This patch series does additional queue buffer validation in the queue
creation IOCTLs, failing queue creation if the buffers are not mapped on
the GPU with the expected size.

Ensure queue buffer residency by tracking the GPUVM virtual addresses of
queue buffers, to return an error if the user tries to free and unmap them
while the queue is active, or to evict the queue if svm memory is unmapped
and freed from the CPU.

Patches 1-2 are preparation work and general fixes.

v2:
 - patch 3/9, keep wptr_bo_gart in struct queue

Philip Yang (9):
  drm/amdkfd: kfd_bo_mapped_dev support partition
  drm/amdkfd: amdkfd_free_gtt_mem clear the correct pointer
  drm/amdkfd: Refactor queue wptr_bo GART mapping
  drm/amdkfd: Validate user queue buffers
  drm/amdkfd: Ensure user queue buffers residency
  drm/amdkfd: Validate user queue svm memory residency
  drm/amdkfd: Validate user queue update
  drm/amdkfd: Store queue cwsr area size to node properties
  drm/amdkfd: Validate queue cwsr area and eop buffer size

 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c|  14 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h|   6 +-
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  |  24 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_object.h|   6 +
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c  |  61 +---
 drivers/gpu/drm/amd/amdkfd/kfd_device.c   |   4 +-
 .../drm/amd/amdkfd/kfd_device_queue_manager.c |   8 +-
 drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c  |   2 +-
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h |  19 +-
 drivers/gpu/drm/amd/amdkfd/kfd_process.c  |   2 +-
 .../amd/amdkfd/kfd_process_queue_manager.c|  79 +++-
 drivers/gpu/drm/amd/amdkfd/kfd_queue.c| 336 ++
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c  |  12 +
 drivers/gpu/drm/amd/amdkfd/kfd_svm.h  |   1 +
 drivers/gpu/drm/amd/amdkfd/kfd_topology.c |   2 +
 drivers/gpu/drm/amd/amdkfd/kfd_topology.h |   4 +
 16 files changed, 489 insertions(+), 91 deletions(-)

-- 
2.43.2



Re: [PATCH 3/9] drm/amdkfd: Refactor queue wptr_bo GART mapping

2024-07-18 Thread Philip Yang

  


On 2024-07-17 16:16, Felix Kuehling wrote:

Sorry, I see that this patch still doesn't propagate errors returned from
kfd_queue_release_buffers correctly. And the later patches in the series
don't seem to fix it either. See inline.

The kfd_queue_release_buffers return value is handled in the queue destroy
path, returning -ERESTARTSYS if taking the vm lock to release the buffers
fails because a signal was received. See inline.

  
On 2024-07-15 08:34, Philip Yang wrote:

Add helper function kfd_queue_acquire_buffers to get the queue wptr_bo
reference from the queue write_ptr if it is mapped to the KFD node with
the expected size.

Move wptr_bo from struct queue to struct queue_properties, as the queue is
allocated after the queue buffers are validated; then we can remove the
wptr_bo parameter from pqm_create_queue.

Because amdgpu_bo_unref clears the pointer, queue_properties wptr_bo is
used to acquire and release wptr_bo for validation; add wptr_bo_gart to
queue_properties, to hold the wptr_bo reference for GART mapping and
unmapping.

Move MES wptr_bo GART mapping to init_user_queue, the same location as
the queue ctx_bo GART mapping.

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h    |  2 +-
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  |  5 +-
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      | 56 +++---
 .../drm/amd/amdkfd/kfd_device_queue_manager.c |  6 +-
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h         | 14 +++--
 .../amd/amdkfd/kfd_process_queue_manager.c    | 45 +++
 drivers/gpu/drm/amd/amdkfd/kfd_queue.c        | 57 +++
 7 files changed, 116 insertions(+), 69 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 6e591280774b..4ed49265c764 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -322,7 +322,7 @@ int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct kgd_mem *mem,
 void **kptr, uint64_t *size);
 void amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel(struct kgd_mem *mem);
 
-int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_bo *bo);
+int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_bo *bo, struct amdgpu_bo **bo_gart);
 
 int amdgpu_amdkfd_gpuvm_restore_process_bos(void *process_info,
 struct dma_fence __rcu **ef);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 199e387d35f4..0ab37e7aec26 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -2226,11 +2226,12 @@ int amdgpu_amdkfd_gpuvm_sync_memory(
 /**
  * amdgpu_amdkfd_map_gtt_bo_to_gart - Map BO to GART and increment reference count
  * @bo: Buffer object to be mapped
+ * @bo_gart: Return bo reference
  *
  * Before return, bo reference count is incremented. To release the reference and unpin/
  * unmap the BO, call amdgpu_amdkfd_free_gtt_mem.
  */
-int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_bo *bo)
+int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_bo *bo, struct amdgpu_bo **bo_gart)
 {
 	int ret;
 
@@ -2257,7 +2258,7 @@ int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_bo *bo)
 
 	amdgpu_bo_unreserve(bo);
 
-	bo = amdgpu_bo_ref(bo);
+	*bo_gart = amdgpu_bo_ref(bo);
 
 	return 0;
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 823f245dc7d0..202f24ee4bd7 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -247,8 +247,8 @@ static int set_queue_properties_from_u

Re: [PATCH 3/9] drm/amdkfd: Refactor queue wptr_bo GART mapping

2024-07-18 Thread Philip Yang

  


On 2024-07-17 16:10, Felix Kuehling wrote:

@@ -603,8 +606,6 @@ struct queue {
 	void *gang_ctx_bo;
 	uint64_t gang_ctx_gpu_addr;
 	void *gang_ctx_cpu_ptr;
-
-	struct amdgpu_bo *wptr_bo;

If the wptr_bo_gart is GART-mapped and freed in the same place as the
gang_ctx_bo, then maybe it makes sense to keep the two together in this
structure. It also avoids having two different references to the same BO
in the same queue_properties structure above.

Yes, agree it makes sense to keep it inside struct queue and rename to
wptr_bo_gart for GART mapping and unmapping. Add wptr_bo to struct
queue_properties for queue wptr acquire and release.

Regards,
Philip

Other than that, this patch looks good to me.

Regards,
  Felix

 	};

 	enum KFD_MQD_TYPE {



[PATCH 9/9] drm/amdkfd: Validate queue cwsr area and eop buffer size

2024-07-15 Thread Philip Yang
When creating a KFD user compute queue, check that the queue eop buffer
size, cwsr area size and ctl stack size are equal to the sizes in the KFD
node properties.

Check the entire cwsr area, which may be split into multiple svm ranges
aligned to granularity boundaries.

Signed-off-by: Philip Yang 
Reviewed-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdkfd/kfd_queue.c | 46 +++---
 1 file changed, 42 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
index adcda9730c9f..9807e8adf77d 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
@@ -225,9 +225,15 @@ void kfd_queue_buffer_put(struct amdgpu_vm *vm, struct 
amdgpu_bo **bo)
 
 int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct 
queue_properties *properties)
 {
+   struct kfd_topology_device *topo_dev;
struct amdgpu_vm *vm;
+   u32 total_cwsr_size;
int err;
 
+   topo_dev = kfd_topology_device_by_id(pdd->dev->id);
+   if (!topo_dev)
+   return -EINVAL;
+
vm = drm_priv_to_vm(pdd->drm_priv);
err = amdgpu_bo_reserve(vm->root.bo, false);
if (err)
@@ -252,6 +258,12 @@ int kfd_queue_acquire_buffers(struct kfd_process_device 
*pdd, struct queue_prope
 
/* EOP buffer is not required for all ASICs */
if (properties->eop_ring_buffer_address) {
+   if (properties->eop_ring_buffer_size != 
topo_dev->node_props.eop_buffer_size) {
+   pr_debug("queue eop bo size 0x%lx not equal to node eop 
buf size 0x%x\n",
+   properties->eop_buf_bo->tbo.base.size,
+   topo_dev->node_props.eop_buffer_size);
+   goto out_err_unreserve;
+   }
err = kfd_queue_buffer_get(vm, (void 
*)properties->eop_ring_buffer_address,
   &properties->eop_buf_bo,
   properties->eop_ring_buffer_size);
@@ -259,15 +271,33 @@ int kfd_queue_acquire_buffers(struct kfd_process_device 
*pdd, struct queue_prope
goto out_err_unreserve;
}
 
+   if (properties->ctl_stack_size != topo_dev->node_props.ctl_stack_size) {
+   pr_debug("queue ctl stack size 0x%x not equal to node ctl stack 
size 0x%x\n",
+   properties->ctl_stack_size,
+   topo_dev->node_props.ctl_stack_size);
+   goto out_err_unreserve;
+   }
+
+   if (properties->ctx_save_restore_area_size != 
topo_dev->node_props.cwsr_size) {
+   pr_debug("queue cwsr size 0x%x not equal to node cwsr size 
0x%x\n",
+   properties->ctx_save_restore_area_size,
+   topo_dev->node_props.cwsr_size);
+   goto out_err_unreserve;
+   }
+
+   total_cwsr_size = (topo_dev->node_props.cwsr_size + 
topo_dev->node_props.debug_memory_size)
+ * NUM_XCC(pdd->dev->xcc_mask);
+   total_cwsr_size = ALIGN(total_cwsr_size, PAGE_SIZE);
+
err = kfd_queue_buffer_get(vm, (void 
*)properties->ctx_save_restore_area_address,
-  &properties->cwsr_bo, 0);
+  &properties->cwsr_bo, total_cwsr_size);
if (!err)
goto out_unreserve;
 
amdgpu_bo_unreserve(vm->root.bo);
 
err = kfd_queue_buffer_svm_get(pdd, 
properties->ctx_save_restore_area_address,
-  properties->ctx_save_restore_area_size);
+  total_cwsr_size);
if (err)
goto out_err_release;
 
@@ -286,7 +316,9 @@ int kfd_queue_acquire_buffers(struct kfd_process_device 
*pdd, struct queue_prope
 
 int kfd_queue_release_buffers(struct kfd_process_device *pdd, struct 
queue_properties *properties)
 {
+   struct kfd_topology_device *topo_dev;
struct amdgpu_vm *vm;
+   u32 total_cwsr_size;
int err;
 
vm = drm_priv_to_vm(pdd->drm_priv);
@@ -302,8 +334,14 @@ int kfd_queue_release_buffers(struct kfd_process_device 
*pdd, struct queue_prope
 
amdgpu_bo_unreserve(vm->root.bo);
 
-   kfd_queue_buffer_svm_put(pdd, properties->ctx_save_restore_area_address,
-properties->ctx_save_restore_area_size);
+   topo_dev = kfd_topology_device_by_id(pdd->dev->id);
+   if (!topo_dev)
+   return -EINVAL;
+   total_cwsr_size = (topo_dev->node_props.cwsr_size + 
topo_dev->node_props.debug_memory_size)
+ * NUM_XCC(pdd->dev->xcc_mask);
+   total_cwsr_size = ALIGN(total_cwsr_size, PAGE_SIZE);
+
+   kfd_queue_buffer_svm_put(pdd, 
properties->ctx_save_restore_area_address, total_cwsr_size);
return 0;
 }
 
-- 
2.43.2



[PATCH 4/9] drm/amdkfd: Validate user queue buffers

2024-07-15 Thread Philip Yang
Find the user queue rptr, ring buffer, eop buffer and cwsr area BOs, check
that they are mapped on the GPU with the correct size, and take the BO
references.

Signed-off-by: Philip Yang 
Reviewed-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h  |  4 +++
 drivers/gpu/drm/amd/amdkfd/kfd_queue.c | 38 --
 2 files changed, 40 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index c98ff548313c..d0dca20849d9 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -525,6 +525,10 @@ struct queue_properties {
 
struct amdgpu_bo *wptr_bo_gart;
struct amdgpu_bo *wptr_bo;
+   struct amdgpu_bo *rptr_bo;
+   struct amdgpu_bo *ring_bo;
+   struct amdgpu_bo *eop_buf_bo;
+   struct amdgpu_bo *cwsr_bo;
 };
 
 #define QUEUE_IS_ACTIVE(q) ((q).queue_size > 0 &&  \
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
index b4529ec298a9..0e661160c295 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
@@ -97,7 +97,8 @@ int kfd_queue_buffer_get(struct amdgpu_vm *vm, void __user 
*addr, struct amdgpu_
if (!mapping)
goto out_err;
 
-   if (user_addr != mapping->start || user_addr + size - 1 != 
mapping->last) {
+   if (user_addr != mapping->start ||
+   (size != 0 && user_addr + size - 1 != mapping->last)) {
pr_debug("expected size 0x%llx not equal to mapping addr 0x%llx 
size 0x%llx\n",
expected_size, mapping->start << AMDGPU_GPU_PAGE_SHIFT,
(mapping->last - mapping->start + 1) << 
AMDGPU_GPU_PAGE_SHIFT);
@@ -124,18 +125,51 @@ int kfd_queue_acquire_buffers(struct kfd_process_device 
*pdd, struct queue_prope
 
err = kfd_queue_buffer_get(vm, properties->write_ptr, 
&properties->wptr_bo, PAGE_SIZE);
if (err)
+   goto out_err_unreserve;
+
+   err = kfd_queue_buffer_get(vm, properties->read_ptr, 
&properties->rptr_bo, PAGE_SIZE);
+   if (err)
+   goto out_err_unreserve;
+
+   err = kfd_queue_buffer_get(vm, (void *)properties->queue_address,
+  &properties->ring_bo, 
properties->queue_size);
+   if (err)
+   goto out_err_unreserve;
+
+   /* only compute queue requires EOP buffer and CWSR area */
+   if (properties->type != KFD_QUEUE_TYPE_COMPUTE)
goto out_unreserve;
 
+   /* EOP buffer is not required for all ASICs */
+   if (properties->eop_ring_buffer_address) {
+   err = kfd_queue_buffer_get(vm, (void 
*)properties->eop_ring_buffer_address,
+  &properties->eop_buf_bo,
+  properties->eop_ring_buffer_size);
+   if (err)
+   goto out_err_unreserve;
+   }
+
+   err = kfd_queue_buffer_get(vm, (void 
*)properties->ctx_save_restore_area_address,
+  &properties->cwsr_bo, 0);
+   if (err)
+   goto out_err_unreserve;
+
+out_unreserve:
amdgpu_bo_unreserve(vm->root.bo);
return 0;
 
-out_unreserve:
+out_err_unreserve:
amdgpu_bo_unreserve(vm->root.bo);
+   kfd_queue_release_buffers(pdd, properties);
return err;
 }
 
 int kfd_queue_release_buffers(struct kfd_process_device *pdd, struct 
queue_properties *properties)
 {
amdgpu_bo_unref(&properties->wptr_bo);
+   amdgpu_bo_unref(&properties->rptr_bo);
+   amdgpu_bo_unref(&properties->ring_bo);
+   amdgpu_bo_unref(&properties->eop_buf_bo);
+   amdgpu_bo_unref(&properties->cwsr_bo);
return 0;
 }
-- 
2.43.2



[PATCH 5/9] drm/amdkfd: Ensure user queue buffers residency

2024-07-15 Thread Philip Yang
Add a queue_refcount to struct bo_va, and return -EBUSY to fail unmapping
a BO from the GPU if the bo_va queue_refcount is not zero.

Queue creation increases the bo_va queue_refcount and queue destruction
decreases it, to ensure the queue buffers stay mapped on the GPU while the
queue is active.

Signed-off-by: Philip Yang 
---
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 14 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_object.h|  6 
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c  |  3 +-
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h |  1 +
 drivers/gpu/drm/amd/amdkfd/kfd_queue.c| 34 ---
 5 files changed, 49 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 0ab37e7aec26..6d5fd371d5ce 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -1252,7 +1252,7 @@ static int unreserve_bo_and_vms(struct 
bo_vm_reservation_context *ctx,
return ret;
 }
 
-static void unmap_bo_from_gpuvm(struct kgd_mem *mem,
+static int unmap_bo_from_gpuvm(struct kgd_mem *mem,
struct kfd_mem_attachment *entry,
struct amdgpu_sync *sync)
 {
@@ -1260,11 +1260,18 @@ static void unmap_bo_from_gpuvm(struct kgd_mem *mem,
struct amdgpu_device *adev = entry->adev;
struct amdgpu_vm *vm = bo_va->base.vm;
 
+   if (bo_va->queue_refcount) {
+   pr_debug("bo_va->queue_refcount %d\n", bo_va->queue_refcount);
+   return -EBUSY;
+   }
+
amdgpu_vm_bo_unmap(adev, bo_va, entry->va);
 
amdgpu_vm_clear_freed(adev, vm, &bo_va->last_pt_update);
 
amdgpu_sync_fence(sync, bo_va->last_pt_update);
+
+   return 0;
 }
 
 static int update_gpuvm_pte(struct kgd_mem *mem,
@@ -2191,7 +2198,10 @@ int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(
pr_debug("\t unmap VA 0x%llx - 0x%llx from entry %p\n",
 entry->va, entry->va + bo_size, entry);
 
-   unmap_bo_from_gpuvm(mem, entry, ctx.sync);
+   ret = unmap_bo_from_gpuvm(mem, entry, ctx.sync);
+   if (ret)
+   goto unreserve_out;
+
entry->is_mapped = false;
 
mem->mapped_to_gpu_memory--;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
index bc42ccbde659..d7e27957013f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
@@ -90,6 +90,12 @@ struct amdgpu_bo_va {
boolcleared;
 
boolis_xgmi;
+
+   /*
+* protected by vm reservation lock
+* if non-zero, cannot unmap from GPU because user queues may still 
access it
+*/
+   unsigned intqueue_refcount;
 };
 
 struct amdgpu_bo {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 202f24ee4bd7..65a37ac5a0f0 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -1384,8 +1384,7 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file 
*filep,
err = amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(
peer_pdd->dev->adev, (struct kgd_mem *)mem, 
peer_pdd->drm_priv);
if (err) {
-   pr_err("Failed to unmap from gpu %d/%d\n",
-  i, args->n_devices);
+   pr_debug("Failed to unmap from gpu %d/%d\n", i, 
args->n_devices);
goto unmap_memory_from_gpu_failed;
}
args->n_success = i+1;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index d0dca20849d9..95fbdb12beb1 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -1291,6 +1291,7 @@ void print_queue_properties(struct queue_properties *q);
 void print_queue(struct queue *q);
 int kfd_queue_buffer_get(struct amdgpu_vm *vm, void __user *addr, struct 
amdgpu_bo **pbo,
 u64 expected_size);
+void kfd_queue_buffer_put(struct amdgpu_vm *vm, struct amdgpu_bo **bo);
 int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct 
queue_properties *properties);
 int kfd_queue_release_buffers(struct kfd_process_device *pdd, struct 
queue_properties *properties);
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
index 0e661160c295..3fd386dcb011 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
@@ -106,6 +106,7 @@ int kfd_queue_buffer_get(struct amdgpu_vm *vm, void __user 
*addr, struct

[PATCH 8/9] drm/amdkfd: Store queue cwsr area size to node properties

2024-07-15 Thread Philip Yang
Use the queue eop buffer size, cwsr area size and ctl stack size
calculations from Thunk, and store the values in the KFD node properties.

These will be used to validate the queue eop buffer size, cwsr area size
and ctl stack size when creating a KFD user compute queue.

They will also be exposed to user space via the sysfs KFD node properties,
to remove the duplicated calculation code from Thunk.

Signed-off-by: Philip Yang 
Reviewed-by: Felix Kuehling 
---
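
The formulas below can be exercised standalone. This sketch plugs in
made-up values for a hypothetical 64-CU pre-Navi10 node (8 control stack
bytes per wave, 2048 waves); the numbers are illustrative only, not from
any real ASIC.

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096
#define ALIGN(x, a) (((x) + (a) - 1) / (a) * (a))

/* Per-CU sizes from the patch (GFX9-class default VGPR size). */
#define VGPR_SIZE_PER_CU	0x40000
#define SGPR_SIZE_PER_CU	0x4000
#define LDS_SIZE_PER_CU		0x10000
#define HWREG_SIZE_PER_CU	0x1000
#define CTX_SAVE_AREA_HEADER	40

int main(void)
{
	uint32_t cu_num = 64;
	uint32_t wave_num = 2048;	/* min(cu_num * 40, arrays * 512) */
	uint32_t wg_data_size, ctl_stack_size;

	wg_data_size = ALIGN(cu_num * (VGPR_SIZE_PER_CU + SGPR_SIZE_PER_CU +
				       LDS_SIZE_PER_CU + HWREG_SIZE_PER_CU),
			     PAGE_SIZE);
	ctl_stack_size = wave_num * 8 + 8;	/* 8 bytes/wave before Navi10 */
	ctl_stack_size = ALIGN(CTX_SAVE_AREA_HEADER + ctl_stack_size, PAGE_SIZE);

	printf("ctl_stack_size = 0x%x\n", ctl_stack_size);	/* 0x5000 */
	printf("cwsr_size      = 0x%x\n", ctl_stack_size + wg_data_size);
	return 0;
}
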
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h |  1 +
 drivers/gpu/drm/amd/amdkfd/kfd_queue.c| 75 +++
 drivers/gpu/drm/amd/amdkfd/kfd_topology.c |  2 +
 drivers/gpu/drm/amd/amdkfd/kfd_topology.h |  4 ++
 4 files changed, 82 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 95fbdb12beb1..58f5bc021ea9 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -1294,6 +1294,7 @@ int kfd_queue_buffer_get(struct amdgpu_vm *vm, void 
__user *addr, struct amdgpu_
 void kfd_queue_buffer_put(struct amdgpu_vm *vm, struct amdgpu_bo **bo);
 int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct 
queue_properties *properties);
 int kfd_queue_release_buffers(struct kfd_process_device *pdd, struct 
queue_properties *properties);
+void kfd_queue_ctx_save_restore_size(struct kfd_topology_device *dev);
 
 struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type,
struct kfd_node *dev);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
index 67242ce051b5..adcda9730c9f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
@@ -24,6 +24,7 @@
 
 #include 
 #include "kfd_priv.h"
+#include "kfd_topology.h"
 #include "kfd_svm.h"
 
 void print_queue_properties(struct queue_properties *q)
@@ -305,3 +306,77 @@ int kfd_queue_release_buffers(struct kfd_process_device 
*pdd, struct queue_prope
 properties->ctx_save_restore_area_size);
return 0;
 }
+
+#define SGPR_SIZE_PER_CU   0x4000
+#define LDS_SIZE_PER_CU    0x10000
+#define HWREG_SIZE_PER_CU  0x1000
+#define DEBUGGER_BYTES_ALIGN   64
+#define DEBUGGER_BYTES_PER_WAVE 32
+
+static u32 kfd_get_vgpr_size_per_cu(u32 gfxv)
+{
+   u32 vgpr_size = 0x40000;
+
+   if ((gfxv / 100 * 100) == 90400 ||  /* GFX_VERSION_AQUA_VANJARAM */
+   gfxv == 90010 ||/* GFX_VERSION_ALDEBARAN */
+   gfxv == 90008)  /* GFX_VERSION_ARCTURUS */
+   vgpr_size = 0x80000;
+   else if (gfxv == 110000 ||  /* GFX_VERSION_PLUM_BONITO */
+gfxv == 110001 ||  /* GFX_VERSION_WHEAT_NAS */
+gfxv == 120000 ||  /* GFX_VERSION_GFX1200 */
+gfxv == 120001)/* GFX_VERSION_GFX1201 */
+   vgpr_size = 0x60000;
+
+   return vgpr_size;
+}
+
+#define WG_CONTEXT_DATA_SIZE_PER_CU(gfxv)  \
+   (kfd_get_vgpr_size_per_cu(gfxv) + SGPR_SIZE_PER_CU +\
+LDS_SIZE_PER_CU + HWREG_SIZE_PER_CU)
+
+#define CNTL_STACK_BYTES_PER_WAVE(gfxv)\
+   ((gfxv) >= 100100 ? 12 : 8) /* GFX_VERSION_NAVI10*/
+
+#define SIZEOF_HSA_USER_CONTEXT_SAVE_AREA_HEADER 40
+
+void kfd_queue_ctx_save_restore_size(struct kfd_topology_device *dev)
+{
+   struct kfd_node_properties *props = &dev->node_props;
+   u32 gfxv = props->gfx_target_version;
+   u32 ctl_stack_size;
+   u32 wg_data_size;
+   u32 wave_num;
+   u32 cu_num;
+
+   if (gfxv < 80001)   /* GFX_VERSION_CARRIZO */
+   return;
+
+   cu_num = props->simd_count / props->simd_per_cu / 
NUM_XCC(dev->gpu->xcc_mask);
+   wave_num = (gfxv < 100100) ?/* GFX_VERSION_NAVI10 */
+   min(cu_num * 40, props->array_count / 
props->simd_arrays_per_engine * 512)
+   : cu_num * 32;
+
+   wg_data_size = ALIGN(cu_num * WG_CONTEXT_DATA_SIZE_PER_CU(gfxv), 
PAGE_SIZE);
+   ctl_stack_size = wave_num * CNTL_STACK_BYTES_PER_WAVE(gfxv) + 8;
+   ctl_stack_size = ALIGN(SIZEOF_HSA_USER_CONTEXT_SAVE_AREA_HEADER + 
ctl_stack_size,
+  PAGE_SIZE);
+
+   if ((gfxv / 10000 * 10000) == 100000) {
+   /* HW design limits control stack size to 0x7000.
+* This is insufficient for theoretical PM4 cases
+* but sufficient for AQL, limited by SPI events.
+*/
+   ctl_stack_size = min(ctl_stack_size, 0x7000);
+   }
+
+   props->ctl_stack_size = ctl_stack_size;
+   props->debug_memory_size = ALIGN(wave_num * DEBUGGER_BYTES_PER_WAVE, 
DEBUGGER_BYTES_ALIGN);
+   props->cwsr_size = ctl_stack_size + wg_data_size;
+
+   if (gfxv == 80002)  /* GFX_VERSION_TONGA */
+   props->eop_buffer_size = 0x

[PATCH 7/9] drm/amdkfd: Validate user queue update

2024-07-15 Thread Philip Yang
Ensure the new ring buffer of an updated queue is mapped on the GPU with
the correct size.

Decrease the old ring_bo queue_refcount and increase the new ring_bo
queue_refcount.

Signed-off-by: Philip Yang 
Reviewed-by: Felix Kuehling 
---
 .../amd/amdkfd/kfd_process_queue_manager.c| 32 ++-
 1 file changed, 31 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
index 8552400d6d47..dda26a7e3c37 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
@@ -549,11 +549,41 @@ int pqm_update_queue_properties(struct 
process_queue_manager *pqm,
struct process_queue_node *pqn;
 
pqn = get_queue_by_qid(pqm, qid);
-   if (!pqn) {
+   if (!pqn || !pqn->q) {
pr_debug("No queue %d exists for update operation\n", qid);
return -EFAULT;
}
 
+   /*
+* Update with NULL ring address is used to disable the queue
+*/
+   if (p->queue_address && p->queue_size) {
+   struct kfd_process_device *pdd;
+   struct amdgpu_vm *vm;
+   struct queue *q = pqn->q;
+   int err;
+
+   pdd = kfd_get_process_device_data(q->device, q->process);
+   if (!pdd)
+   return -ENODEV;
+   vm = drm_priv_to_vm(pdd->drm_priv);
+   err = amdgpu_bo_reserve(vm->root.bo, false);
+   if (err)
+   return err;
+
+   if (kfd_queue_buffer_get(vm, (void *)p->queue_address, 
&p->ring_bo,
+p->queue_size)) {
+   pr_debug("ring buf 0x%llx size 0x%llx not mapped on 
GPU\n",
+p->queue_address, p->queue_size);
+   return -EFAULT;
+   }
+
+   kfd_queue_buffer_put(vm, &pqn->q->properties.ring_bo);
+   amdgpu_bo_unreserve(vm->root.bo);
+
+   pqn->q->properties.ring_bo = p->ring_bo;
+   }
+
pqn->q->properties.queue_address = p->queue_address;
pqn->q->properties.queue_size = p->queue_size;
pqn->q->properties.queue_percent = p->queue_percent;
-- 
2.43.2



[PATCH 3/9] drm/amdkfd: Refactor queue wptr_bo GART mapping

2024-07-15 Thread Philip Yang
Add helper function kfd_queue_acquire_buffers to get the queue wptr_bo
reference from the queue write_ptr if it is mapped to the KFD node with
the expected size.

Move wptr_bo from struct queue to struct queue_properties, as the queue is
allocated after the queue buffers are validated; then we can remove the
wptr_bo parameter from pqm_create_queue.

Because amdgpu_bo_unref clears the pointer, queue_properties wptr_bo is
used to acquire and release wptr_bo for validation; add wptr_bo_gart to
queue_properties, to hold the wptr_bo reference for GART mapping and
unmapping.

Move MES wptr_bo GART mapping to init_user_queue, the same location as
the queue ctx_bo GART mapping.

Signed-off-by: Philip Yang 
---
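
The reference return can be illustrated with a standalone sketch (toy
types, not the amdgpu API): assigning the new reference to a by-value
parameter is invisible to the caller, so the reference holder is handed
back through a double pointer instead.

#include <stdio.h>

struct bo { int refcount; };

static struct bo *bo_ref(struct bo *b) { b->refcount++; return b; }

/* Old pattern: the assignment only updates the local parameter, so the
 * caller never learns which pointer now holds the extra reference. */
static void map_old(struct bo *bo) { bo = bo_ref(bo); (void)bo; }

/* New pattern from the patch: hand the reference back explicitly. */
static void map_new(struct bo *bo, struct bo **bo_gart)
{
	*bo_gart = bo_ref(bo);
}

int main(void)
{
	struct bo b = { 1 };
	struct bo *gart = NULL;

	map_old(&b);		/* reference taken, holder pointer lost */
	map_new(&b, &gart);	/* caller tracks the reference in gart */
	printf("refcount=%d gart=%p\n", b.refcount, (void *)gart);
	return 0;
}
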
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h|  2 +-
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  |  5 +-
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c  | 56 +++---
 .../drm/amd/amdkfd/kfd_device_queue_manager.c |  6 +-
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 14 +++--
 .../amd/amdkfd/kfd_process_queue_manager.c| 45 +++
 drivers/gpu/drm/amd/amdkfd/kfd_queue.c| 57 +++
 7 files changed, 116 insertions(+), 69 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 6e591280774b..4ed49265c764 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -322,7 +322,7 @@ int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct kgd_mem 
*mem,
 void **kptr, uint64_t *size);
 void amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel(struct kgd_mem *mem);
 
-int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_bo *bo);
+int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_bo *bo, struct amdgpu_bo 
**bo_gart);
 
 int amdgpu_amdkfd_gpuvm_restore_process_bos(void *process_info,
struct dma_fence __rcu **ef);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 199e387d35f4..0ab37e7aec26 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -2226,11 +2226,12 @@ int amdgpu_amdkfd_gpuvm_sync_memory(
 /**
  * amdgpu_amdkfd_map_gtt_bo_to_gart - Map BO to GART and increment reference 
count
  * @bo: Buffer object to be mapped
+ * @bo_gart: Return bo reference
  *
  * Before return, bo reference count is incremented. To release the reference 
and unpin/
  * unmap the BO, call amdgpu_amdkfd_free_gtt_mem.
  */
-int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_bo *bo)
+int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_bo *bo, struct amdgpu_bo 
**bo_gart)
 {
int ret;
 
@@ -2257,7 +2258,7 @@ int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_bo *bo)
 
amdgpu_bo_unreserve(bo);
 
-   bo = amdgpu_bo_ref(bo);
+   *bo_gart = amdgpu_bo_ref(bo);
 
return 0;
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 823f245dc7d0..202f24ee4bd7 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -247,8 +247,8 @@ static int set_queue_properties_from_user(struct 
queue_properties *q_properties,
q_properties->priority = args->queue_priority;
q_properties->queue_address = args->ring_base_address;
q_properties->queue_size = args->ring_size;
-   q_properties->read_ptr = (uint32_t *) args->read_pointer_address;
-   q_properties->write_ptr = (uint32_t *) args->write_pointer_address;
+   q_properties->read_ptr = (void __user *)args->read_pointer_address;
+   q_properties->write_ptr = (void __user *)args->write_pointer_address;
q_properties->eop_ring_buffer_address = args->eop_buffer_address;
q_properties->eop_ring_buffer_size = args->eop_buffer_size;
q_properties->ctx_save_restore_area_address =
@@ -306,7 +306,6 @@ static int kfd_ioctl_create_queue(struct file *filep, 
struct kfd_process *p,
struct kfd_process_device *pdd;
struct queue_properties q_properties;
uint32_t doorbell_offset_in_process = 0;
-   struct amdgpu_bo *wptr_bo = NULL;
 
memset(&q_properties, 0, sizeof(struct queue_properties));
 
@@ -342,53 +341,17 @@ static int kfd_ioctl_create_queue(struct file *filep, 
struct kfd_process *p,
}
}
 
-   /* Starting with GFX11, wptr BOs must be mapped to GART for MES to 
determine work
-* on unmapped queues for usermode queue oversubscription (no 
aggregated doorbell)
-*/
-   if (dev->kfd->shared_resources.enable_mes &&
-   ((dev->adev->mes.sched_version & 
AMDGPU_MES_API_VERSION_MASK)
-   >> AMDGPU_MES_API_VERSION_SHIFT) >= 2) {
-   struct amdgpu_bo_va_mapping *wptr_mapping;
-

[PATCH 1/9] drm/amdkfd: kfd_bo_mapped_dev support partition

2024-07-15 Thread Philip Yang
Change amdgpu_amdkfd_bo_mapped_to_dev to take drm_priv as a parameter
instead of adev, to support spatial partitioning. This is currently only
used by CRIU checkpoint/restore. No functional change.

Signed-off-by: Philip Yang 
Reviewed-by: Felix Kuehling 
---
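
A standalone sketch of why matching by VM works where matching by adev
does not (toy types only): with spatial partitioning several KFD nodes
share one adev, while each process/partition pair has its own VM, so the
VM derived from drm_priv identifies the mapping unambiguously.

#include <stdbool.h>
#include <stdio.h>

struct vm { int id; };
struct attachment { struct vm *vm; bool is_mapped; };

static bool bo_mapped_to_vm(const struct attachment *a, int n,
			    const struct vm *vm)
{
	for (int i = 0; i < n; i++)
		if (a[i].is_mapped && a[i].vm == vm)
			return true;
	return false;
}

int main(void)
{
	struct vm vm0 = { 0 }, vm1 = { 1 };	/* two partitions, one adev */
	struct attachment att[] = { { &vm0, true }, { &vm1, false } };

	printf("%d %d\n", bo_mapped_to_vm(att, 2, &vm0),
	       bo_mapped_to_vm(att, 2, &vm1));	/* 1 0 */
	return 0;
}
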
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h   | 2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 5 +++--
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 2 +-
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index e7bb1ca35801..66b1c72c81e5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -345,7 +345,7 @@ void 
amdgpu_amdkfd_ras_pasid_poison_consumption_handler(struct amdgpu_device *ad
pasid_notify pasid_fn, void *data, uint32_t reset);
 
 bool amdgpu_amdkfd_is_fed(struct amdgpu_device *adev);
-bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device *adev, struct kgd_mem 
*mem);
+bool amdgpu_amdkfd_bo_mapped_to_dev(void *drm_priv, struct kgd_mem *mem);
 void amdgpu_amdkfd_block_mmu_notifications(void *p);
 int amdgpu_amdkfd_criu_resume(void *p);
 bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 11672bfe4fad..199e387d35f4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -3200,12 +3200,13 @@ int amdgpu_amdkfd_get_tile_config(struct amdgpu_device 
*adev,
return 0;
 }
 
-bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device *adev, struct kgd_mem 
*mem)
+bool amdgpu_amdkfd_bo_mapped_to_dev(void *drm_priv, struct kgd_mem *mem)
 {
+   struct amdgpu_vm *vm = drm_priv_to_vm(drm_priv);
struct kfd_mem_attachment *entry;
 
list_for_each_entry(entry, &mem->attachments, list) {
-   if (entry->is_mapped && entry->adev == adev)
+   if (entry->is_mapped && entry->bo_va->base.vm == vm)
return true;
}
return false;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 32e5db509560..1d9b21628be7 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -1963,7 +1963,7 @@ static int criu_checkpoint_bos(struct kfd_process *p,
bo_bucket->offset = 
amdgpu_bo_mmap_offset(dumper_bo);
 
for (i = 0; i < p->n_pdds; i++) {
-   if 
(amdgpu_amdkfd_bo_mapped_to_dev(p->pdds[i]->dev->adev, kgd_mem))
+   if 
(amdgpu_amdkfd_bo_mapped_to_dev(p->pdds[i]->drm_priv, kgd_mem))
bo_priv->mapped_gpuids[dev_idx++] = 
p->pdds[i]->user_gpu_id;
}
 
-- 
2.43.2



[PATCH 6/9] drm/amdkfd: Validate user queue svm memory residency

2024-07-15 Thread Philip Yang
The queue CWSR area may be registered to the GPU as svm memory; on queue
creation, ensure the svm range is mapped to the GPU with the
KFD_IOCTL_SVM_FLAG_GPU_ALWAYS_MAPPED flag.

Add queue_refcount to struct svm_range, to track queue CWSR area usage.

Because the unmap mmu notifier callback's return value is ignored, if the
application unmaps the CWSR area while the queue is active, print a
pr_warn message to the dmesg log. To be safe, also evict the user queue.

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdkfd/kfd_queue.c | 110 -
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c   |  12 +++
 drivers/gpu/drm/amd/amdkfd/kfd_svm.h   |   1 +
 3 files changed, 122 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
index 3fd386dcb011..67242ce051b5 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
@@ -24,6 +24,7 @@
 
 #include 
 #include "kfd_priv.h"
+#include "kfd_svm.h"
 
 void print_queue_properties(struct queue_properties *q)
 {
@@ -83,6 +84,100 @@ void uninit_queue(struct queue *q)
kfree(q);
 }
 
+static int kfd_queue_buffer_svm_get(struct kfd_process_device *pdd, u64 addr, 
u64 size)
+{
+   struct kfd_process *p = pdd->process;
+   struct list_head update_list;
+   struct svm_range *prange;
+   int ret = -EINVAL;
+
+   INIT_LIST_HEAD(&update_list);
+   addr >>= PAGE_SHIFT;
+   size >>= PAGE_SHIFT;
+
+   mutex_lock(&p->svms.lock);
+
+   /*
+* range may be split into multiple svm pranges aligned to granularity boundary.
+*/
+   while (size) {
+   uint32_t gpuid, gpuidx;
+   int r;
+
+   prange = svm_range_from_addr(&p->svms, addr, NULL);
+   if (!prange)
+   break;
+
+   if (!prange->mapped_to_gpu)
+   break;
+
+   r = kfd_process_gpuid_from_node(p, pdd->dev, &gpuid, &gpuidx);
+   if (r < 0)
+   break;
+   if (!test_bit(gpuidx, prange->bitmap_access) &&
+   !test_bit(gpuidx, prange->bitmap_aip))
+   break;
+
+   if (!(prange->flags & KFD_IOCTL_SVM_FLAG_GPU_ALWAYS_MAPPED))
+   break;
+
+   list_add(&prange->update_list, &update_list);
+
+   if (prange->last - prange->start + 1 >= size) {
+   size = 0;
+   break;
+   }
+
+   size -= prange->last - prange->start + 1;
+   addr += prange->last - prange->start + 1;
+   }
+   if (size) {
+   pr_debug("[0x%llx 0x%llx] not registered\n", addr, addr + size 
- 1);
+   goto out_unlock;
+   }
+
+   list_for_each_entry(prange, &update_list, update_list)
+   atomic_inc(&prange->queue_refcount);
+   ret = 0;
+
+out_unlock:
+   mutex_unlock(&p->svms.lock);
+   return ret;
+}
+
+static void kfd_queue_buffer_svm_put(struct kfd_process_device *pdd, u64 addr, 
u64 size)
+{
+   struct kfd_process *p = pdd->process;
+   struct svm_range *prange, *pchild;
+   struct interval_tree_node *node;
+   unsigned long last;
+
+   addr >>= PAGE_SHIFT;
+   last = addr + (size >> PAGE_SHIFT) - 1;
+
+   mutex_lock(&p->svms.lock);
+
+   node = interval_tree_iter_first(&p->svms.objects, addr, last);
+   while (node) {
+   struct interval_tree_node *next_node;
+   unsigned long next_start;
+
+   prange = container_of(node, struct svm_range, it_node);
+   next_node = interval_tree_iter_next(node, addr, last);
+   next_start = min(node->last, last) + 1;
+
+   if (atomic_add_unless(&prange->queue_refcount, -1, 0)) {
+   list_for_each_entry(pchild, &prange->child_list, 
child_list)
+   atomic_add_unless(&pchild->queue_refcount, -1, 
0);
+   }
+
+   node = next_node;
+   addr = next_start;
+   }
+
+   mutex_unlock(&p->svms.lock);
+}
+
 int kfd_queue_buffer_get(struct amdgpu_vm *vm, void __user *addr, struct 
amdgpu_bo **pbo,
 u64 expected_size)
 {
@@ -165,8 +260,17 @@ int kfd_queue_acquire_buffers(struct kfd_process_device 
*pdd, struct queue_prope
 
err = kfd_queue_buffer_get(vm, (void 
*)properties->ctx_save_restore_area_address,
   &properties->cwsr_bo, 0);
+   if (!err)
+   goto out_unreserve;
+
+   amdgpu_bo_unreserve(vm->root.bo);
+
+   err = kfd_queue_buffer_svm_get(pdd, 
properties->ctx_save_restore_area_address,
+  properties-&

[PATCH 0/9] KFD user queue validation

2024-07-15 Thread Philip Yang
This patch series does additional queue buffer validation in the queue
creation IOCTLs, failing queue creation if the buffers are not mapped on
the GPU with the expected size.

Ensure queue buffer residency by tracking the GPUVM virtual addresses of
queue buffers, to return an error if the user tries to free and unmap them
while the queue is active, or to evict the queue if svm memory is unmapped
and freed from the CPU.

Patches 1-2 are preparation work and general fixes.

Philip Yang (9):
  drm/amdkfd: kfd_bo_mapped_dev support partition
  drm/amdkfd: amdkfd_free_gtt_mem clear the correct pointer
  drm/amdkfd: Refactor queue wptr_bo GART mapping
  drm/amdkfd: Validate user queue buffers
  drm/amdkfd: Ensure user queue buffers residency
  drm/amdkfd: Validate user queue svm memory residency
  drm/amdkfd: Validate user queue update
  drm/amdkfd: Store queue cwsr area size to node properties
  drm/amdkfd: Validate queue cwsr area and eop buffer size

 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c|  14 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h|   6 +-
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  |  24 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_object.h|   6 +
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c  |  61 +---
 drivers/gpu/drm/amd/amdkfd/kfd_device.c   |   4 +-
 .../drm/amd/amdkfd/kfd_device_queue_manager.c |   8 +-
 drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c  |   2 +-
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h |  20 +-
 drivers/gpu/drm/amd/amdkfd/kfd_process.c  |   2 +-
 .../amd/amdkfd/kfd_process_queue_manager.c|  79 +++-
 drivers/gpu/drm/amd/amdkfd/kfd_queue.c| 336 ++
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c  |  12 +
 drivers/gpu/drm/amd/amdkfd/kfd_svm.h  |   1 +
 drivers/gpu/drm/amd/amdkfd/kfd_topology.c |   2 +
 drivers/gpu/drm/amd/amdkfd/kfd_topology.h |   4 +
 16 files changed, 489 insertions(+), 92 deletions(-)

-- 
2.43.2



[PATCH 2/9] drm/amdkfd: amdkfd_free_gtt_mem clear the correct pointer

2024-07-15 Thread Philip Yang
Pass a pointer reference to amdgpu_bo_unref to clear the correct pointer;
otherwise amdgpu_bo_unref clears only the local variable and the original
pointer is not set to NULL, which could cause a use-after-free bug.

Signed-off-by: Philip Yang 
---
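
The fix can be illustrated with a standalone userspace sketch; obj_unref
here is a hypothetical stand-in for amdgpu_bo_unref, which drops the
reference and clears the caller's pointer.

#include <stdio.h>
#include <stdlib.h>

static void obj_unref(int **obj)
{
	free(*obj);
	*obj = NULL;	/* clears whichever pointer the caller passed in */
}

int main(void)
{
	int *mem = malloc(sizeof(*mem));
	int *local = mem;

	/* Buggy pattern: only the local copy is cleared, mem still dangles. */
	obj_unref(&local);
	printf("local=%p mem=%p (mem dangles!)\n", (void *)local, (void *)mem);

	/* Fixed pattern: pass the original pointer's address. */
	mem = malloc(sizeof(*mem));	/* fresh allocation for the demo */
	obj_unref(&mem);
	printf("mem=%p after unref\n", (void *)mem);	/* (nil): safe */
	return 0;
}
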
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 14 +++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h |  2 +-
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c   |  2 +-
 drivers/gpu/drm/amd/amdkfd/kfd_device.c|  4 ++--
 .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c  |  2 +-
 drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c   |  2 +-
 drivers/gpu/drm/amd/amdkfd/kfd_process.c   |  2 +-
 .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c |  4 ++--
 8 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 03205e3c3746..c272461d70a9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -364,15 +364,15 @@ int amdgpu_amdkfd_alloc_gtt_mem(struct amdgpu_device 
*adev, size_t size,
return r;
 }
 
-void amdgpu_amdkfd_free_gtt_mem(struct amdgpu_device *adev, void *mem_obj)
+void amdgpu_amdkfd_free_gtt_mem(struct amdgpu_device *adev, void **mem_obj)
 {
-   struct amdgpu_bo *bo = (struct amdgpu_bo *) mem_obj;
+   struct amdgpu_bo **bo = (struct amdgpu_bo **) mem_obj;
 
-   amdgpu_bo_reserve(bo, true);
-   amdgpu_bo_kunmap(bo);
-   amdgpu_bo_unpin(bo);
-   amdgpu_bo_unreserve(bo);
-   amdgpu_bo_unref(&(bo));
+   amdgpu_bo_reserve(*bo, true);
+   amdgpu_bo_kunmap(*bo);
+   amdgpu_bo_unpin(*bo);
+   amdgpu_bo_unreserve(*bo);
+   amdgpu_bo_unref(bo);
 }
 
 int amdgpu_amdkfd_alloc_gws(struct amdgpu_device *adev, size_t size,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 66b1c72c81e5..6e591280774b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -235,7 +235,7 @@ int amdgpu_amdkfd_bo_validate_and_fence(struct amdgpu_bo 
*bo,
 int amdgpu_amdkfd_alloc_gtt_mem(struct amdgpu_device *adev, size_t size,
void **mem_obj, uint64_t *gpu_addr,
void **cpu_ptr, bool mqd_gfx9);
-void amdgpu_amdkfd_free_gtt_mem(struct amdgpu_device *adev, void *mem_obj);
+void amdgpu_amdkfd_free_gtt_mem(struct amdgpu_device *adev, void **mem_obj);
 int amdgpu_amdkfd_alloc_gws(struct amdgpu_device *adev, size_t size,
void **mem_obj);
 void amdgpu_amdkfd_free_gws(struct amdgpu_device *adev, void *mem_obj);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 1d9b21628be7..823f245dc7d0 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -423,7 +423,7 @@ static int kfd_ioctl_create_queue(struct file *filep, 
struct kfd_process *p,
 
 err_create_queue:
if (wptr_bo)
-   amdgpu_amdkfd_free_gtt_mem(dev->adev, wptr_bo);
+   amdgpu_amdkfd_free_gtt_mem(dev->adev, (void **)&wptr_bo);
 err_wptr_map_gart:
 err_bind_process:
 err_pdd:
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index f4d20adaa068..6619028dd58b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -907,7 +907,7 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
 kfd_doorbell_error:
kfd_gtt_sa_fini(kfd);
 kfd_gtt_sa_init_error:
-   amdgpu_amdkfd_free_gtt_mem(kfd->adev, kfd->gtt_mem);
+   amdgpu_amdkfd_free_gtt_mem(kfd->adev, &kfd->gtt_mem);
 alloc_gtt_mem_failure:
dev_err(kfd_device,
"device %x:%x NOT added due to errors\n",
@@ -925,7 +925,7 @@ void kgd2kfd_device_exit(struct kfd_dev *kfd)
kfd_doorbell_fini(kfd);
ida_destroy(&kfd->doorbell_ida);
kfd_gtt_sa_fini(kfd);
-   amdgpu_amdkfd_free_gtt_mem(kfd->adev, kfd->gtt_mem);
+   amdgpu_amdkfd_free_gtt_mem(kfd->adev, &kfd->gtt_mem);
}
 
kfree(kfd);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 4f48507418d2..420444eb8e98 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -2621,7 +2621,7 @@ static void deallocate_hiq_sdma_mqd(struct kfd_node *dev,
 {
WARN(!mqd, "No hiq sdma mqd trunk to free");
 
-   amdgpu_amdkfd_free_gtt_mem(dev->adev, mqd->gtt_mem);
+   amdgpu_amdkfd_free_gtt_mem(dev->adev, &mqd->gtt_mem);
 }
 
 void device_queue_manager_uninit(struct device_queue_manager *dqm)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c 
b/drivers/g

Re: [PATCH] drm/amdkfd: Correct svm prange overlapping handling at svm_range_set_attr ioctl

2024-06-24 Thread Philip Yang

  


On 2024-06-21 13:28, Xiaogang.Chen wrote:

From: Xiaogang Chen

When a user adds a new vm range that overlaps existing svm pranges, the
current kfd clones a new prange and removes the existing pranges,
including all data associated with them. That is not necessary: we can
handle the overlap directly on the existing pranges, which simplifies
the kfd code. Also, when an existing prange is removed, its locks are
destroyed, which may cause issues if code still uses those locks; and
the locks of a cloned prange do not inherit the context of the locks
that were removed.

This patch does not remove existing pranges or clone new pranges, and
keeps the locks of the pranges alive.
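
A minimal sketch of the in-place overlap handling described above; the
split helpers are illustrative names standing in for svm_range_split:

/* An existing prange [start, last] partially covered by an update
 * [ustart, ulast] is split so only the overlap gets the new attributes. */
if (prange->start < ustart)
	split_head(prange, ustart);	/* [start, ustart-1] keeps old attrs */
if (prange->last > ulast)
	split_tail(prange, ulast);	/* [ulast+1, last] keeps old attrs */
/* the remaining overlap [max(start, ustart), min(last, ulast)] is moved
 * to update_list for validation and GPU mapping update */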

Signed-off-by: Xiaogang Chen
---
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 89 
 1 file changed, 12 insertions(+), 77 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index 407636a68814..a8fcace6f9a2 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -904,23 +904,6 @@ svm_range_copy_array(void *psrc, size_t size, uint64_t num_elements,
 	return (void *)dst;
 }
 
-static int
-svm_range_copy_dma_addrs(struct svm_range *dst, struct svm_range *src)
-{
-	int i;
-
-	for (i = 0; i < MAX_GPU_INSTANCE; i++) {
-		if (!src->dma_addr[i])
-			continue;
-		dst->dma_addr[i] = svm_range_copy_array(src->dma_addr[i],
-	sizeof(*src->dma_addr[i]), src->npages, 0, NULL);
-		if (!dst->dma_addr[i])
-			return -ENOMEM;
-	}
-
-	return 0;
-}
-
 static int
 svm_range_split_array(void *ppnew, void *ppold, size_t size,
 		  uint64_t old_start, uint64_t old_n,
@@ -1967,38 +1950,6 @@ svm_range_evict(struct svm_range *prange, struct mm_struct *mm,
 	return r;
 }
 
-static struct svm_range *svm_range_clone(struct svm_range *old)
-{
-	struct svm_range *new;
-
-	new = svm_range_new(old->svms, old->start, old->last, false);
-	if (!new)
-		return NULL;
-	if (svm_range_copy_dma_addrs(new, old)) {
-		svm_range_free(new, false);
-		return NULL;
-	}
-	if (old->svm_bo) {
-		new->ttm_res = old->ttm_res;
-		new->offset = old->offset;
-		new->svm_bo = svm_range_bo_ref(old->svm_bo);
-		spin_lock(&new->svm_bo->list_lock);
-		list_add(&new->svm_bo_list, &new->svm_bo->range_list);
-		spin_unlock(&new->svm_bo->list_lock);
-	}
-	new->flags = old->flags;
-	new->preferred_loc = old->preferred_loc;
-	new->prefetch_loc = old->prefetch_loc;
-	new->actual_loc = old->actual_loc;
-	new->granularity = old->granularity;
-	new->mapped_to_gpu = old->mapped_to_gpu;
-	new->vram_pages = old->vram_pages;
-	bitmap_copy(new->bitmap_access, old->bitmap_access, MAX_GPU_INSTANCE);
-	bitmap_copy(new->bitmap_aip, old->bitmap_aip, MAX_GPU_INSTANCE);
-
-	return new;
-}
-
 void svm_range_set_max_pages(struct amdgpu_device *adev)
 {
 	uint64_t max_pages;
@@ -2057,7 +2008,6 @@ svm_range_split_new(struct svm_range_list *svms, uint64_t start, uint64_t last,
  * @attrs: array of attributes
  * @update_list: output, the ranges need validate and update GPU mapping
  * @insert_list: output, the ranges need insert to svms
- * @remove_list: output, the ranges are replaced and need remove from svms
  * @remap_list: output, remap unaligned svm ranges
  *
  * Check if the virtual address range has overlap with any existing ranges,
@@ -2082,7 +2032,7 @@ static int
 svm_range_add(struct kfd_process *p, uint64_t start, uint64_t size,
 	  uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs,
 	  struct list_head *update_list, struct list_head *insert_list,
-	  struct list_head *remove_list, struct list_head *remap_list)
+	  struct list_head *remap_list)
 {
 	unsigned long last = start + size - 1UL;
 	struct svm_range_list *svms = &p->svms;
@@ -2096,7 +2046,6 @@ svm_range_add(struct kfd_process *p, uint64_t start, uint64_t size,
 
 	INIT_LIST_HEAD(update_list);
 	INIT_LIST_HEAD(insert_list);
-	INIT_LIST_HEAD(remove_list);
 	INIT_LIST_HEAD(&new_list);
 	INIT_LIST_HEAD(remap_list);
 
@@ -2117,20 +2066,11 @@ svm_range_add(struct kfd_process *p, uint64_t start, uint64_t size,
 			/* nothing to do */
 		} else if (node->start < start || node->last > last) {
 			/* node intersects the update range and its attributes
-			 * will change. Clone and split it, apply updates only
+			 * will change. Split it, apply updates only
 			 * to the overlapping part
 			 */
-			struct svm_range *old = prange;
-
-			prange = svm_range_clone(old);
-			if (!prange) {
-r = -ENOMEM;
-goto out;
-			}
-
-			list_add(&old->update_list, remove_list);
-			list_add(&prange->list, insert_list);
-			list_add(&prange->update_list, update_list);
+			list_move_tail(&prange->list, insert_list);
+			list_move_tail(&prange->update_list, update_list);

The main purpose of cloning the prange is rollback for error handling.
If you remove the original prange from svms and update it on
insert_list, how do you roll back and put the prange back into svms
after a prange splitting error?

We hold svms lock to ac

[PATCH] drm/amdgpu: Show retry fault message if process xnack on

2024-05-07 Thread Philip Yang
If vm_context_cntl sets xnack on, the GPU vm fault has the retry_fault
bit set, but the driver selects the xnack-on or xnack-off path depending
on the per-process xnack setting, which is also used to set the qpd
mem_config xnack on or off if KFD_SUPPORT_XNACK_PER_PROCESS.

If the process is xnack on, the GPU page fault shows the retry page
fault message, otherwise it shows the no-retry page fault message, to
avoid misleading output when debugging application page fault issues.

The process lookup from pasid is done inside the retry fault handler
svm_range_restore_pages; add an xnack_on parameter to pass the process
xnack setting back to amdgpu_vm_handle_fault and then to the gmc
interrupt handler to show the vm fault message.
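
A hedged sketch of the message selection this enables in the gmc
interrupt handler (the format string is illustrative, not the exact
gmc_v9_0 text):

	dev_err(adev->dev,
		"[%s] %sretry page fault (src_id:%u ring:%u) at 0x%llx\n",
		hub_name, xnack_on ? "" : "no-",
		entry->src_id, entry->ring_id, addr);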

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 5 +++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 2 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 2 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 7 ---
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c   | 4 +++-
 drivers/gpu/drm/amd/amdkfd/kfd_svm.h   | 2 +-
 6 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 64ddc87f7fb6..58f7ab193027 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -2757,13 +2757,14 @@ int amdgpu_vm_ioctl(struct drm_device *dev, void *data, 
struct drm_file *filp)
  *   GFX 9.4.3.
  * @addr: Address of the fault
  * @write_fault: true is write fault, false is read fault
+ * @xnack_on: return value, true if the process sets xnack on
  *
  * Try to gracefully handle a VM fault. Return true if the fault was handled 
and
  * shouldn't be reported any more.
  */
 bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
u32 vmid, u32 node_id, uint64_t addr,
-   bool write_fault)
+   bool write_fault, bool *xnack_on)
 {
bool is_compute_context = false;
struct amdgpu_bo *root;
@@ -2788,7 +2789,7 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, 
u32 pasid,
addr /= AMDGPU_GPU_PAGE_SIZE;
 
if (is_compute_context && !svm_range_restore_pages(adev, pasid, vmid,
-   node_id, addr, write_fault)) {
+   node_id, addr, write_fault, xnack_on)) {
amdgpu_bo_unref(&root);
return true;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index bc71b44387b2..7f364f0b9a60 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -549,7 +549,7 @@ void amdgpu_vm_put_task_info(struct amdgpu_task_info 
*task_info);
 
 bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
u32 vmid, u32 node_id, uint64_t addr,
-   bool write_fault);
+   bool write_fault, bool *xnack_on);
 
 void amdgpu_vm_set_task_info(struct amdgpu_vm *vm);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index d933e19e0cf5..2f0752376236 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -132,7 +132,7 @@ static int gmc_v10_0_process_interrupt(struct amdgpu_device 
*adev,
/* Try to handle the recoverable page faults by filling page
 * tables
 */
-   if (amdgpu_vm_handle_fault(adev, entry->pasid, 0, 0, addr, 
write_fault))
+   if (amdgpu_vm_handle_fault(adev, entry->pasid, 0, 0, addr, 
write_fault, NULL))
return 1;
}
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 671a6766df5b..3db0f2304b6a 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -558,6 +558,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device 
*adev,
uint32_t cam_index = 0;
int ret, xcc_id = 0;
uint32_t node_id;
+   bool xnack_on = false;
 
node_id = entry->node_id;
 
@@ -595,7 +596,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device 
*adev,
cam_index = entry->src_data[2] & 0x3ff;
 
ret = amdgpu_vm_handle_fault(adev, entry->pasid, 
entry->vmid, node_id,
-addr, write_fault);
+addr, write_fault, 
&xnack_on);
WDOORBELL32(adev->irq.retry_cam_doorbell_index, 
cam_index);
if (ret)
return 1;
@@ -618,7 +619,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device 
*adev,
 * tables
 */
if (amdgpu_vm_handle_fault(adev, entry->pasid, 
entry->vmid, node_id,
-  

Re: [PATCH] drm/amdkfd: Remove arbitrary timeout for hmm_range_fault

2024-05-02 Thread Philip Yang

  


On 2024-05-02 08:42, James Zhu wrote:

  On 2024-05-01 18:56, Philip Yang wrote:

    On systems with khugepaged enabled and user cases with THP buffers,
    hmm_range_fault may take > 15 seconds to return -EBUSY; the
    arbitrary timeout value is not accurate and causes memory
    allocation failure.

    Remove the arbitrary timeout value and return EAGAIN to the
    application if hmm_range_fault returns EBUSY; userspace libdrm and
    Thunk will then call the ioctl again.

    Change the EAGAIN report to a debug message, as this is not an
    error.

    Signed-off-by: Philip Yang
    ---
     drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c |  5 -
     drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c          | 12 +++-
     drivers/gpu/drm/amd/amdkfd/kfd_svm.c             |  5 +
     3 files changed, 8 insertions(+), 14 deletions(-)

    diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
    index 54198c3928c7..02696c2102f1 100644
    --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
    +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
    @@ -1087,7 +1087,10 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr,
         ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages, &range);
         if (ret) {
    -        pr_err("%s: Failed to get user pages: %d\n", __func__, ret);
    +        if (ret == -EAGAIN)
    +            pr_debug("Failed to get user pages, try again\n");
    +        else
    +            pr_err("%s: Failed to get user pages: %d\n", __func__, ret);
             goto unregister_out;
         }

    diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c
    index 431ec72655ec..e36fede7f74c 100644
    --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c
    +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c
    @@ -202,20 +202,12 @@ int amdgpu_hmm_range_get_pages(struct mmu_interval_notifier *notifier,
         pr_debug("hmm range: start = 0x%lx, end = 0x%lx",
                  hmm_range->start, hmm_range->end);

    -    /* Assuming 64MB takes maximum 1 second to fault page address */
    -    timeout = max((hmm_range->end - hmm_range->start) >> 26, 1UL);
    -    timeout *= HMM_RANGE_DEFAULT_TIMEOUT;
    -    timeout = jiffies + msecs_to_jiffies(timeout);
    +    timeout = jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);

  [JZ] should we reduce MAX_WALK_BYTE to 64M in the meantime?

From the debug log, the range size is not related; a 64MB range may take
the same long time to return EBUSY too.

     retry:
         hmm_range->notifier_seq = mmu_interval_read_begin(notifier);
         r = hmm_range_fault(hmm_range);
         if (unlikely(r)) {
    -        schedule();

  [JZ] the above is for the CPU stall WA, we may still need to keep it.

The 1 second timeout should be long enough for the normal case; if
hmm_range_fault returns EBUSY, we release the mmap_read lock and return
to user space, so we don't need the explicit schedule to fix the CPU
stall warning. Will run overnight KFDTest LargestSysBufferTest on a
larger memory system to confirm there is no CPU stall message.

Regards,
Philip

    -        /*
    -         * FIXME: This timeout should encompass the retry from
    -         * mmu_interval_read_retry() as well.
    -         */
             if (r == -EBUSY && !time_after(jiffies, timeout))
                 goto retry;
             goto out_free_pfns;
    @@ -247,6 +239,8 @@ int amdgpu_hmm_range_get_pages(struct mmu_interval_notifier *notifier,
     out_free_range:
         kfree(hmm_range);

    +    if (r == -EBUSY)
    +        r = -EAGAIN;
         return r;
     }

    diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kf

Re: [PATCH] drm/amdkfd: Remove arbitrary timeout for hmm_range_fault

2024-05-02 Thread Philip Yang

  


On 2024-05-02 00:09, Chen, Xiaogang wrote:

  On 5/1/2024 5:56 PM, Philip Yang wrote:

    On systems with khugepaged enabled and user cases with THP buffers,
    hmm_range_fault may take > 15 seconds to return -EBUSY; the
    arbitrary timeout value is not accurate and causes memory
    allocation failure.

    Remove the arbitrary timeout value and return EAGAIN to the
    application if hmm_range_fault returns EBUSY; userspace libdrm and
    Thunk will then call the ioctl again.

  Wonder why letting user space do the retry is better? It seems this
  issue is caused by hugepage merging, so how can user space avoid it?

The issue is caused by khugepaged + 4 processes + sdma stalls test (to
slow down sdma) + small_BAR + QPX mode; during the overnight test,
hmm_range_fault on a 180MB buffer may take > 15 seconds and return
EBUSY, and then the alloc memory ioctl fails. With EAGAIN returned,
Thunk calls the alloc memory ioctl again, and we don't see the alloc
memory failure.

  And applications may not use Thunk or libdrm; instead, they may use
  the ioctl directly.

If an app calls the ioctl directly, it should do the same thing: call
the ioctl again if errno is EINTR or EAGAIN.

Regards,
Philip

  Regards

  Xiaogang

    Change the EAGAIN report to a debug message, as this is not an
    error.

    Signed-off-by: Philip Yang
    ---
     drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c |  5 -
     drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c          | 12 +++-
     drivers/gpu/drm/amd/amdkfd/kfd_svm.c             |  5 +
     3 files changed, 8 insertions(+), 14 deletions(-)

    diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
    index 54198c3928c7..02696c2102f1 100644
    --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
    +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
    @@ -1087,7 +1087,10 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr,
         ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages, &range);
         if (ret) {
    -        pr_err("%s: Failed to get user pages: %d\n", __func__, ret);
    +        if (ret == -EAGAIN)
    +            pr_debug("Failed to get user pages, try again\n");
    +        else
    +            pr_err("%s: Failed to get user pages: %d\n", __func__, ret);
             goto unregister_out;
         }

    diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c
    index 431ec72655ec..e36fede7f74c 100644
    --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c
    +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c
    @@ -202,20 +202,12 @@ int amdgpu_hmm_range_get_pages(struct mmu_interval_notifier *notifier,
         pr_debug("hmm range: start = 0x%lx, end = 0x%lx",
                  hmm_range->start, hmm_range->end);

    -    /* Assuming 64MB takes maximum 1 second to fault page address */
    -    timeout = max((hmm_range->end - hmm_range->start) >> 26, 1UL);
    -    timeout *= HMM_RANGE_DEFAULT_TIMEOUT;
    -    timeout = jiffies + msecs_to_jiffies(timeout);
    +    timeout = jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);

     retry:
         hmm_range->notifier_seq = mmu_interval_read_begin(notifier);
         r = hmm_range_fault(hmm_range);
         if (unlikely(r)) {
    -        schedule();
    -        /*
    -         * FIXME: This timeout should encompass the retry from
    -         * mmu_interval_read_retry() as well.
    -         */

[PATCH] drm/amdkfd: Remove arbitrary timeout for hmm_range_fault

2024-05-01 Thread Philip Yang
On systems with khugepaged enabled and user cases with THP buffers,
hmm_range_fault may take > 15 seconds to return -EBUSY; the arbitrary
timeout value is not accurate and causes memory allocation failure.

Remove the arbitrary timeout value and return EAGAIN to the application
if hmm_range_fault returns EBUSY; userspace libdrm and Thunk will then
call the ioctl again.

Change the EAGAIN report to a debug message, as this is not an error.
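
A hedged sketch of the userspace retry convention this change relies on
(a hypothetical wrapper, not libdrm or Thunk source):

#include <errno.h>
#include <sys/ioctl.h>

static int kfd_ioctl_retry(int fd, unsigned long request, void *args)
{
	int ret;

	do {
		ret = ioctl(fd, request, args);
	} while (ret == -1 && (errno == EINTR || errno == EAGAIN));

	return ret;
}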

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c |  5 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c  | 12 +++-
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c |  5 +
 3 files changed, 8 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 54198c3928c7..02696c2102f1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -1087,7 +1087,10 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t 
user_addr,
 
ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages, &range);
if (ret) {
-   pr_err("%s: Failed to get user pages: %d\n", __func__, ret);
+   if (ret == -EAGAIN)
+   pr_debug("Failed to get user pages, try again\n");
+   else
+   pr_err("%s: Failed to get user pages: %d\n", __func__, 
ret);
goto unregister_out;
}
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c
index 431ec72655ec..e36fede7f74c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c
@@ -202,20 +202,12 @@ int amdgpu_hmm_range_get_pages(struct 
mmu_interval_notifier *notifier,
pr_debug("hmm range: start = 0x%lx, end = 0x%lx",
hmm_range->start, hmm_range->end);
 
-   /* Assuming 64MB takes maximum 1 second to fault page address */
-   timeout = max((hmm_range->end - hmm_range->start) >> 26, 1UL);
-   timeout *= HMM_RANGE_DEFAULT_TIMEOUT;
-   timeout = jiffies + msecs_to_jiffies(timeout);
+   timeout = jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
 
 retry:
hmm_range->notifier_seq = mmu_interval_read_begin(notifier);
r = hmm_range_fault(hmm_range);
if (unlikely(r)) {
-   schedule();
-   /*
-* FIXME: This timeout should encompass the retry from
-* mmu_interval_read_retry() as well.
-*/
if (r == -EBUSY && !time_after(jiffies, timeout))
goto retry;
goto out_free_pfns;
@@ -247,6 +239,8 @@ int amdgpu_hmm_range_get_pages(struct mmu_interval_notifier 
*notifier,
 out_free_range:
kfree(hmm_range);
 
+   if (r == -EBUSY)
+   r = -EAGAIN;
return r;
 }
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index 94f83be2232d..e7040f809f33 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -1670,11 +1670,8 @@ static int svm_range_validate_and_map(struct mm_struct 
*mm,
   readonly, owner, NULL,
   &hmm_range);
WRITE_ONCE(p->svms.faulting_task, NULL);
-   if (r) {
+   if (r)
pr_debug("failed %d to get svm range pages\n", 
r);
-   if (r == -EBUSY)
-   r = -EAGAIN;
-   }
} else {
r = -EFAULT;
}
-- 
2.43.2



Re: [PATCH] drm/amd/amdkfd: Fix a resource leak in svm_range_validate_and_map()

2024-05-01 Thread Philip Yang

  


On 2024-04-30 19:29, Ramesh Errabolu wrote:

Analysis by Coverity, a static code analyser, has identified a resource
leak of the symbol hmm_range. The leak occurs when one of the steps
prior to its release encounters an error.

Signed-off-by: Ramesh Errabolu 

Reviewed-by: Philip Yang 

  
---
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index 386875e6eb96..dcb1d5d3f860 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -1658,7 +1658,7 @@ static int svm_range_validate_and_map(struct mm_struct *mm,
 	start = map_start << PAGE_SHIFT;
 	end = (map_last + 1) << PAGE_SHIFT;
 	for (addr = start; !r && addr < end; ) {
-		struct hmm_range *hmm_range;
+		struct hmm_range *hmm_range = NULL;
 		unsigned long map_start_vma;
 		unsigned long map_last_vma;
 		struct vm_area_struct *vma;
@@ -1696,7 +1696,9 @@ static int svm_range_validate_and_map(struct mm_struct *mm,
 		}
 
 		svm_range_lock(prange);
-		if (!r && amdgpu_hmm_range_get_pages_done(hmm_range)) {
+
+		// Free backing memory of hmm_range if it was initialized
+		if (hmm_range && amdgpu_hmm_range_get_pages_done(hmm_range)) {
 			pr_debug("hmm update the range, need validate again\n");
 			r = -EAGAIN;
 		}


  



[PATCH v6 1/5] drm/amdgpu: Support contiguous VRAM allocation

2024-04-24 Thread Philip Yang
RDMA devices with limited scatter-gather ability require contiguous VRAM
buffer allocation for RDMA peer-direct support.

Add a new KFD alloc memory flag and store it as the bo alloc flag
AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS. When pinning this bo to export it for
RDMA peer-direct access, this sets the TTM_PL_FLAG_CONTIGUOUS flag and
asks the VRAM buddy allocator for contiguous VRAM.
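
A hedged sketch of userspace opting in to best-effort contiguous VRAM;
the flag, struct, and ioctl names are from kfd_ioctl.h, while the
surrounding setup (va, size, gpu_id, kfd_fd) is abbreviated and
illustrative:

	struct kfd_ioctl_alloc_memory_of_gpu_args args = {0};

	args.va_addr = va;	/* chosen GPUVM address (illustrative) */
	args.size = size;
	args.gpu_id = gpu_id;	/* from the KFD topology (illustrative) */
	args.flags = KFD_IOC_ALLOC_MEM_FLAGS_VRAM |
		     KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE |
		     KFD_IOC_ALLOC_MEM_FLAGS_CONTIGUOUS;

	ret = ioctl(kfd_fd, AMDKFD_IOC_ALLOC_MEMORY_OF_GPU, &args);
	/* on failure, retry while errno is EINTR or EAGAIN */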

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 4 
 include/uapi/linux/kfd_ioctl.h   | 1 +
 2 files changed, 5 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index f672205243e0..02d66faaade5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -1712,6 +1712,10 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
alloc_flags = AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE;
alloc_flags |= (flags & KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC) 
?
AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED : 0;
+
+   /* For contiguous VRAM allocation */
+   if (flags & KFD_IOC_ALLOC_MEM_FLAGS_CONTIGUOUS)
+   alloc_flags |= 
AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
}
xcp_id = fpriv->xcp_id == AMDGPU_XCP_NO_PARTITION ?
0 : fpriv->xcp_id;
diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index 2040a470ddb4..d09c4a18e571 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -407,6 +407,7 @@ struct kfd_ioctl_acquire_vm_args {
 #define KFD_IOC_ALLOC_MEM_FLAGS_COHERENT   (1 << 26)
 #define KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED   (1 << 25)
 #define KFD_IOC_ALLOC_MEM_FLAGS_EXT_COHERENT   (1 << 24)
+#define KFD_IOC_ALLOC_MEM_FLAGS_CONTIGUOUS (1 << 23)
 
 /* Allocate memory for later SVM (shared virtual memory) mapping.
  *
-- 
2.43.2



[PATCH v6 5/5] drm/amdkfd: Bump kfd version for contiguous VRAM allocation

2024-04-24 Thread Philip Yang
Bump the kfd ioctl minor version to declare support for the contiguous
VRAM allocation flag.

Signed-off-by: Philip Yang 
---
 include/uapi/linux/kfd_ioctl.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index d09c4a18e571..f8e9d3c1d117 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -41,9 +41,10 @@
  * - 1.13 - Add debugger API
  * - 1.14 - Update kfd_event_data
  * - 1.15 - Enable managing mappings in compute VMs with GEM_VA ioctl
+ * - 1.16 - Add contiguous VRAM allocation flag
  */
 #define KFD_IOCTL_MAJOR_VERSION 1
-#define KFD_IOCTL_MINOR_VERSION 15
+#define KFD_IOCTL_MINOR_VERSION 16
 
 struct kfd_ioctl_get_version_args {
__u32 major_version;/* from KFD */
-- 
2.43.2



[PATCH v6 4/5] drm/amdkfd: Evict BO itself for contiguous allocation

2024-04-24 Thread Philip Yang
If the BO pages pinned for RDMA are not contiguous in VRAM, evict the BO
to system memory first to free the VRAM space, then allocate contiguous
VRAM space, and then move it from system memory back to VRAM.

v6: user context should use interruptible call (Felix)

Signed-off-by: Philip Yang 
---
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 19 ++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 02d66faaade5..acc825b84113 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -1470,13 +1470,30 @@ static int amdgpu_amdkfd_gpuvm_pin_bo(struct amdgpu_bo 
*bo, u32 domain)
if (unlikely(ret))
return ret;
 
+   if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) {
+   /*
+* If bo is not contiguous on VRAM, move to system memory first 
to ensure
+* we can get contiguous VRAM space after evicting other BOs.
+*/
+   if (!(bo->tbo.resource->placement & TTM_PL_FLAG_CONTIGUOUS)) {
+   struct ttm_operation_ctx ctx = { true, false };
+
+   amdgpu_bo_placement_from_domain(bo, 
AMDGPU_GEM_DOMAIN_GTT);
+   ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
+   if (unlikely(ret)) {
+   pr_debug("validate bo 0x%p to GTT failed %d\n", 
&bo->tbo, ret);
+   goto out;
+   }
+   }
+   }
+
ret = amdgpu_bo_pin_restricted(bo, domain, 0, 0);
if (ret)
pr_err("Error in Pinning BO to domain: %d\n", domain);
 
amdgpu_bo_sync_wait(bo, AMDGPU_FENCE_OWNER_KFD, false);
+out:
amdgpu_bo_unreserve(bo);
-
return ret;
 }
 
-- 
2.43.2



[PATCH v6 0/5] Best effort contiguous VRAM allocation

2024-04-24 Thread Philip Yang
This patch series implements a new KFD memory alloc flag for best-effort
contiguous VRAM allocation, to support peer-direct-access RDMA devices
with limited scatter-gather dma capability.

v2: rebase on patch ("drm/amdgpu: Modify the contiguous flags behaviour")
to avoid adding a new GEM flag

v3: add patch 2 to handle the sg segment size limit (Christian)

v4: remove the buddy block size limit from vram mgr because sg table
creation already removes the limit, and resource uses u64 to handle
block start, size (Christian)

v5: remove patch 7 which is not for upstream, add the AMDGPU prefix to
the macro name

v6: use shorter flag name, use interruptible wait ctx, drop patch 5/6
(Felix)

Philip Yang (5):
  drm/amdgpu: Support contiguous VRAM allocation
  drm/amdgpu: Handle sg size limit for contiguous allocation
  drm/amdgpu: Evict BOs from same process for contiguous allocation
  drm/amdkfd: Evict BO itself for contiguous allocation
  drm/amdkfd: Bump kfd version for contiguous VRAM allocation

 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 23 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c   |  3 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c  | 12 +-
 include/uapi/linux/kfd_ioctl.h|  4 +++-
 4 files changed, 33 insertions(+), 9 deletions(-)

-- 
2.43.2



[PATCH v6 3/5] drm/amdgpu: Evict BOs from same process for contiguous allocation

2024-04-24 Thread Philip Yang
When TTM fails to alloc VRAM, TTM tries to evict BOs from VRAM to system
memory and then retries the allocation, but it skips KFD BOs belonging
to the same process because KFD requires all BOs to be resident for user
queues.

If TTM allocates contiguous VRAM with the TTM_PL_FLAG_CONTIGUOUS flag,
allow TTM to evict KFD BOs from the same process; this evicts the user
queues first, and the queues are restored later, after the contiguous
VRAM allocation.

Signed-off-by: Philip Yang 
Reviewed-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index 64f5001a7dc5..c21ea808f931 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -1403,7 +1403,8 @@ static bool amdgpu_ttm_bo_eviction_valuable(struct 
ttm_buffer_object *bo,
 */
dma_resv_for_each_fence(&resv_cursor, bo->base.resv,
DMA_RESV_USAGE_BOOKKEEP, f) {
-   if (amdkfd_fence_check_mm(f, current->mm))
+   if (amdkfd_fence_check_mm(f, current->mm) &&
+   !(place->flags & TTM_PL_FLAG_CONTIGUOUS))
return false;
}
 
-- 
2.43.2



[PATCH v6 2/5] drm/amdgpu: Handle sg size limit for contiguous allocation

2024-04-24 Thread Philip Yang
Define the macro AMDGPU_MAX_SG_SEGMENT_SIZE as 2GB, because struct
scatterlist length is an unsigned int and some users of it cast to a
signed int, so every segment of an sg table is limited to 2GB maximum.

For contiguous VRAM allocation, don't limit the max buddy block size, in
order to get contiguous VRAM memory. To work around the sg table segment
size limit, allocate multiple segments if the contiguous size is bigger
than AMDGPU_MAX_SG_SEGMENT_SIZE.
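
A worked example of the split under the 2 GiB cap (illustrative): a
5 GiB contiguous block maps to three sg entries of 2 GiB + 2 GiB + 1 GiB:

	u64 remaining = 5ULL << 30;
	unsigned int num_entries = 0;

	while (remaining) {
		u64 seg = min_t(u64, remaining, AMDGPU_MAX_SG_SEGMENT_SIZE);

		num_entries++;		/* ends at 3 */
		remaining -= seg;
	}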

Signed-off-by: Philip Yang 
Reviewed-by: Christian König 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
index 4be8b091099a..ebffb58ea53a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -31,6 +31,8 @@
 #include "amdgpu_atomfirmware.h"
 #include "atom.h"
 
+#define AMDGPU_MAX_SG_SEGMENT_SIZE (2UL << 30)
+
 struct amdgpu_vram_reservation {
u64 start;
u64 size;
@@ -532,9 +534,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager 
*man,
 
BUG_ON(min_block_size < mm->chunk_size);
 
-   /* Limit maximum size to 2GiB due to SG table limitations */
-   size = min(remaining_size, 2ULL << 30);
-
+   size = remaining_size;
if ((size >= (u64)pages_per_block << PAGE_SHIFT) &&
!(size & (((u64)pages_per_block << PAGE_SHIFT) 
- 1)))
min_block_size = (u64)pages_per_block << PAGE_SHIFT;
@@ -675,7 +675,7 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev,
amdgpu_res_first(res, offset, length, &cursor);
while (cursor.remaining) {
num_entries++;
-   amdgpu_res_next(&cursor, cursor.size);
+   amdgpu_res_next(&cursor, min(cursor.size, 
AMDGPU_MAX_SG_SEGMENT_SIZE));
}
 
r = sg_alloc_table(*sgt, num_entries, GFP_KERNEL);
@@ -695,7 +695,7 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev,
amdgpu_res_first(res, offset, length, &cursor);
for_each_sgtable_sg((*sgt), sg, i) {
phys_addr_t phys = cursor.start + adev->gmc.aper_base;
-   size_t size = cursor.size;
+   unsigned long size = min(cursor.size, 
AMDGPU_MAX_SG_SEGMENT_SIZE);
dma_addr_t addr;
 
addr = dma_map_resource(dev, phys, size, dir,
@@ -708,7 +708,7 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev,
sg_dma_address(sg) = addr;
sg_dma_len(sg) = size;
 
-   amdgpu_res_next(&cursor, cursor.size);
+   amdgpu_res_next(&cursor, size);
}
 
return 0;
-- 
2.43.2



Re: [PATCH v5 1/6] drm/amdgpu: Support contiguous VRAM allocation

2024-04-24 Thread Philip Yang

  


On 2024-04-23 18:17, Felix Kuehling wrote:

  On 2024-04-23 11:28, Philip Yang wrote:

    RDMA devices with limited scatter-gather ability require contiguous
    VRAM buffer allocation for RDMA peer-direct support.

    Add a new KFD alloc memory flag and store it as the bo alloc flag
    AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS. When pinning this bo to export
    it for RDMA peer-direct access, this sets the TTM_PL_FLAG_CONTIGUOUS
    flag and asks the VRAM buddy allocator for contiguous VRAM.

    Signed-off-by: Philip Yang
    ---
     drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 4 
     include/uapi/linux/kfd_ioctl.h                   | 1 +
     2 files changed, 5 insertions(+)

    diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
    index 0ae9fd844623..ef9154043757 100644
    --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
    +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
    @@ -1712,6 +1712,10 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
             alloc_flags = AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE;
             alloc_flags |= (flags & KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC) ?
                     AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED : 0;
    +
    +        /* For contiguous VRAM allocation */
    +        if (flags & KFD_IOC_ALLOC_MEM_FLAGS_CONTIGUOUS_BEST_EFFORT)
    +            alloc_flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
         }
         xcp_id = fpriv->xcp_id == AMDGPU_XCP_NO_PARTITION ?
                 0 : fpriv->xcp_id;
    diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
    index 2040a470ddb4..c1394c162d4e 100644
    --- a/include/uapi/linux/kfd_ioctl.h
    +++ b/include/uapi/linux/kfd_ioctl.h
    @@ -407,6 +407,7 @@ struct kfd_ioctl_acquire_vm_args {
     #define KFD_IOC_ALLOC_MEM_FLAGS_COHERENT    (1 << 26)
     #define KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED    (1 << 25)
     #define KFD_IOC_ALLOC_MEM_FLAGS_EXT_COHERENT    (1 << 24)
    +#define KFD_IOC_ALLOC_MEM_FLAGS_CONTIGUOUS_BEST_EFFORT    (1 << 23)

  If I understand it correctly, AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS was
  redefined to mean "best effort". Maybe we can drop the explicit
  "BEST_EFFORT" from this flag as well to keep the name to a reasonable
  length.

Yes, AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS is redefined to implement "best
effort" without adding a new upstream GEM flag, so we may get a
scattered allocation if the contiguous allocation fails. If we drop the
"BEST_EFFORT" from the flag name, this may mislead users.

Regards,
Philip

  Regards,

    Felix

     /* Allocate memory for later SVM (shared virtual memory) mapping.
      *

  

  



Re: [PATCH v5 4/6] drm/amdkfd: Evict BO itself for contiguous allocation

2024-04-24 Thread Philip Yang

  


On 2024-04-23 18:15, Felix Kuehling wrote:

  On 2024-04-23 11:28, Philip Yang wrote:

    If the BO pages pinned for RDMA are not contiguous in VRAM, evict
    the BO to system memory first to free the VRAM space, then allocate
    contiguous VRAM space, and then move it from system memory back to
    VRAM.

    Signed-off-by: Philip Yang
    ---
     drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 16 +++-
     1 file changed, 15 insertions(+), 1 deletion(-)

    diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
    index ef9154043757..5d118e5580ce 100644
    --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
    +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
    @@ -1470,13 +1470,27 @@ static int amdgpu_amdkfd_gpuvm_pin_bo(struct amdgpu_bo *bo, u32 domain)
         if (unlikely(ret))
             return ret;

    +    if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) {
    +        /*
    +         * If bo is not contiguous on VRAM, move to system memory first to ensure
    +         * we can get contiguous VRAM space after evicting other BOs.
    +         */
    +        if (!(bo->tbo.resource->placement & TTM_PL_FLAG_CONTIGUOUS)) {
    +            ret = amdgpu_amdkfd_bo_validate(bo, AMDGPU_GEM_DOMAIN_GTT, false);

  amdgpu_amdkfd_bo_validate is meant for use in kernel threads. It
  always runs uninterruptible. I believe pin_bo runs in the context of
  ioctls from user mode. So it should be interruptible.

Yes, pin_bo is in the context of user mode, from KFD alloc memory or
from the rdma driver get pages, so it should use an interruptible wait.

amdgpu_amdkfd_bo_validate is currently used by kernel threads and by the
ioctl amdgpu_amdkfd_add_gws_to_process (this seems to be a bug). Does it
make sense to add an interruptible parameter, so we can remove a lot of
duplicated amdgpu_bo_placement_from_domain + ttm_bo_validate code, or
should I fix it here and leave the cleanup and bug fix for the future?

Regards,
Philip

  Regards,

    Felix

    +            if (unlikely(ret)) {
    +                pr_debug("validate bo 0x%p to GTT failed %d\n", &bo->tbo, ret);
    +                goto out;
    +            }
    +        }
    +    }
    +
         ret = amdgpu_bo_pin_restricted(bo, domain, 0, 0);
         if (ret)
             pr_err("Error in Pinning BO to domain: %d\n", domain);

         amdgpu_bo_sync_wait(bo, AMDGPU_FENCE_OWNER_KFD, false);
    +out:
         amdgpu_bo_unreserve(bo);
    -
         return ret;
     }
  

  



[PATCH v5 6/6] drm/amdkfd: Bump kfd version for contiguous VRAM allocation

2024-04-23 Thread Philip Yang
Bump the kfd ioctl minor version to declare support for the contiguous
VRAM allocation flag.

Signed-off-by: Philip Yang 
---
 include/uapi/linux/kfd_ioctl.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index c1394c162d4e..a5ebbe98ff7f 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -41,9 +41,10 @@
  * - 1.13 - Add debugger API
  * - 1.14 - Update kfd_event_data
  * - 1.15 - Enable managing mappings in compute VMs with GEM_VA ioctl
+ * - 1.16 - Add contiguous VRAM allocation flag
  */
 #define KFD_IOCTL_MAJOR_VERSION 1
-#define KFD_IOCTL_MINOR_VERSION 15
+#define KFD_IOCTL_MINOR_VERSION 16
 
 struct kfd_ioctl_get_version_args {
__u32 major_version;/* from KFD */
-- 
2.43.2



[PATCH v5 4/6] drm/amdkfd: Evict BO itself for contiguous allocation

2024-04-23 Thread Philip Yang
If the BO pages pinned for RDMA are not contiguous in VRAM, evict the BO
to system memory first to free the VRAM space, then allocate contiguous
VRAM space, and then move it from system memory back to VRAM.

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 16 +++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index ef9154043757..5d118e5580ce 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -1470,13 +1470,27 @@ static int amdgpu_amdkfd_gpuvm_pin_bo(struct amdgpu_bo 
*bo, u32 domain)
if (unlikely(ret))
return ret;
 
+   if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) {
+   /*
+* If bo is not contiguous on VRAM, move to system memory first 
to ensure
+* we can get contiguous VRAM space after evicting other BOs.
+*/
+   if (!(bo->tbo.resource->placement & TTM_PL_FLAG_CONTIGUOUS)) {
+   ret = amdgpu_amdkfd_bo_validate(bo, 
AMDGPU_GEM_DOMAIN_GTT, false);
+   if (unlikely(ret)) {
+   pr_debug("validate bo 0x%p to GTT failed %d\n", 
&bo->tbo, ret);
+   goto out;
+   }
+   }
+   }
+
ret = amdgpu_bo_pin_restricted(bo, domain, 0, 0);
if (ret)
pr_err("Error in Pinning BO to domain: %d\n", domain);
 
amdgpu_bo_sync_wait(bo, AMDGPU_FENCE_OWNER_KFD, false);
+out:
amdgpu_bo_unreserve(bo);
-
return ret;
 }
 
-- 
2.43.2



[PATCH v5 2/6] drm/amdgpu: Handle sg size limit for contiguous allocation

2024-04-23 Thread Philip Yang
Define the macro AMDGPU_MAX_SG_SEGMENT_SIZE as 2GB, because struct
scatterlist length is an unsigned int and some users of it cast to a
signed int, so every segment of an sg table is limited to 2GB maximum.

For contiguous VRAM allocation, don't limit the max buddy block size, in
order to get contiguous VRAM memory. To work around the sg table segment
size limit, allocate multiple segments if the contiguous size is bigger
than AMDGPU_MAX_SG_SEGMENT_SIZE.

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
index 4be8b091099a..ebffb58ea53a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -31,6 +31,8 @@
 #include "amdgpu_atomfirmware.h"
 #include "atom.h"
 
+#define AMDGPU_MAX_SG_SEGMENT_SIZE (2UL << 30)
+
 struct amdgpu_vram_reservation {
u64 start;
u64 size;
@@ -532,9 +534,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager 
*man,
 
BUG_ON(min_block_size < mm->chunk_size);
 
-   /* Limit maximum size to 2GiB due to SG table limitations */
-   size = min(remaining_size, 2ULL << 30);
-
+   size = remaining_size;
if ((size >= (u64)pages_per_block << PAGE_SHIFT) &&
!(size & (((u64)pages_per_block << PAGE_SHIFT) 
- 1)))
min_block_size = (u64)pages_per_block << PAGE_SHIFT;
@@ -675,7 +675,7 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev,
amdgpu_res_first(res, offset, length, &cursor);
while (cursor.remaining) {
num_entries++;
-   amdgpu_res_next(&cursor, cursor.size);
+   amdgpu_res_next(&cursor, min(cursor.size, 
AMDGPU_MAX_SG_SEGMENT_SIZE));
}
 
r = sg_alloc_table(*sgt, num_entries, GFP_KERNEL);
@@ -695,7 +695,7 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev,
amdgpu_res_first(res, offset, length, &cursor);
for_each_sgtable_sg((*sgt), sg, i) {
phys_addr_t phys = cursor.start + adev->gmc.aper_base;
-   size_t size = cursor.size;
+   unsigned long size = min(cursor.size, 
AMDGPU_MAX_SG_SEGMENT_SIZE);
dma_addr_t addr;
 
addr = dma_map_resource(dev, phys, size, dir,
@@ -708,7 +708,7 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev,
sg_dma_address(sg) = addr;
sg_dma_len(sg) = size;
 
-   amdgpu_res_next(&cursor, cursor.size);
+   amdgpu_res_next(&cursor, size);
}
 
return 0;
-- 
2.43.2



[PATCH v5 3/6] drm/amdgpu: Evict BOs from same process for contiguous allocation

2024-04-23 Thread Philip Yang
When TTM fails to alloc VRAM, TTM tries to evict BOs from VRAM to system
memory and then retries the allocation, but it skips KFD BOs belonging
to the same process because KFD requires all BOs to be resident for user
queues.

If TTM allocates contiguous VRAM with the TTM_PL_FLAG_CONTIGUOUS flag,
allow TTM to evict KFD BOs from the same process; this evicts the user
queues first, and the queues are restored later, after the contiguous
VRAM allocation.

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index 851509c6e90e..c907d6005641 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -1398,7 +1398,8 @@ static bool amdgpu_ttm_bo_eviction_valuable(struct 
ttm_buffer_object *bo,
 */
dma_resv_for_each_fence(&resv_cursor, bo->base.resv,
DMA_RESV_USAGE_BOOKKEEP, f) {
-   if (amdkfd_fence_check_mm(f, current->mm))
+   if (amdkfd_fence_check_mm(f, current->mm) &&
+   !(place->flags & TTM_PL_FLAG_CONTIGUOUS))
return false;
}
 
-- 
2.43.2



[PATCH v5 5/6] drm/amdkfd: Increase KFD bo restore wait time

2024-04-23 Thread Philip Yang
TTM allocation of contiguous VRAM may take more than 1 second to evict
BOs for larger RDMA buffers. Because the KFD restore bo worker reserves
all KFD BOs, TTM cannot take the remaining KFD BOs' locks to evict them,
which causes TTM to fail to alloc contiguous VRAM.

Increase the KFD restore BO wait time to 2 seconds, long enough for the
RDMA pin BO to alloc the contiguous VRAM.

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index a81ef232fdef..c205e2d3acf9 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -698,7 +698,7 @@ struct qcm_process_device {
 /* KFD Memory Eviction */
 
 /* Approx. wait time before attempting to restore evicted BOs */
-#define PROCESS_RESTORE_TIME_MS 100
+#define PROCESS_RESTORE_TIME_MS 2000
 /* Approx. back off time if restore fails due to lack of memory */
 #define PROCESS_BACK_OFF_TIME_MS 100
 /* Approx. time before evicting the process again */
-- 
2.43.2



[PATCH v5 1/6] drm/amdgpu: Support contiguous VRAM allocation

2024-04-23 Thread Philip Yang
RDMA devices with limited scatter-gather ability require contiguous VRAM
buffer allocation for RDMA peer-direct support.

Add a new KFD alloc memory flag and store it as the bo alloc flag
AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS. When pinning this bo to export it for
RDMA peer-direct access, this sets the TTM_PL_FLAG_CONTIGUOUS flag and
asks the VRAM buddy allocator for contiguous VRAM.

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 4 
 include/uapi/linux/kfd_ioctl.h   | 1 +
 2 files changed, 5 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 0ae9fd844623..ef9154043757 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -1712,6 +1712,10 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
alloc_flags = AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE;
alloc_flags |= (flags & KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC) 
?
AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED : 0;
+
+   /* For contiguous VRAM allocation */
+   if (flags & 
KFD_IOC_ALLOC_MEM_FLAGS_CONTIGUOUS_BEST_EFFORT)
+   alloc_flags |= 
AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
}
xcp_id = fpriv->xcp_id == AMDGPU_XCP_NO_PARTITION ?
0 : fpriv->xcp_id;
diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index 2040a470ddb4..c1394c162d4e 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -407,6 +407,7 @@ struct kfd_ioctl_acquire_vm_args {
 #define KFD_IOC_ALLOC_MEM_FLAGS_COHERENT   (1 << 26)
 #define KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED   (1 << 25)
 #define KFD_IOC_ALLOC_MEM_FLAGS_EXT_COHERENT   (1 << 24)
+#define KFD_IOC_ALLOC_MEM_FLAGS_CONTIGUOUS_BEST_EFFORT (1 << 23)
 
 /* Allocate memory for later SVM (shared virtual memory) mapping.
  *
-- 
2.43.2



[PATCH v5 0/6] Best effort contiguous VRAM allocation

2024-04-23 Thread Philip Yang
This patch series implements a new KFD memory alloc flag for best-effort
contiguous VRAM allocation, to support peer-direct-access RDMA devices
with limited scatter-gather dma capability.

v2: rebase on patch ("drm/amdgpu: Modify the contiguous flags behaviour")
to avoid adding a new GEM flag

v3: add patch 2 to handle the sg segment size limit (Christian)

v4: remove the buddy block size limit from vram mgr because sg table
creation already removes the limit, and resource uses u64 to handle
block start, size (Christian)

v5: remove patch 7 which is not for upstream, add the AMDGPU prefix to
the macro name

Philip Yang (6):
  drm/amdgpu: Support contiguous VRAM allocation
  drm/amdgpu: Handle sg size limit for contiguous allocation
  drm/amdgpu: Evict BOs from same process for contiguous allocation
  drm/amdkfd: Evict BO itself for contiguous allocation
  drm/amdkfd: Increase KFD bo restore wait time
  drm/amdkfd: Bump kfd version for contiguous VRAM allocation

 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 20 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c   |  3 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c  | 12 +--
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h |  2 +-
 include/uapi/linux/kfd_ioctl.h|  4 +++-
 5 files changed, 31 insertions(+), 10 deletions(-)

-- 
2.43.2



Re: [PATCH v4 6/7] drm/amdgpu: Skip dma map resource for null RDMA device

2024-04-23 Thread Philip Yang

  


On 2024-04-23 09:32, Christian König wrote:

  On 2024-04-23 15:04, Philip Yang wrote:

    To test RDMA using a dummy driver on a system without a NIC/RDMA
    device, the get/put dma pages calls pass in a null device pointer;
    skip the dma map/unmap of the resource and sg table to avoid a null
    pointer access.

  Well, just to make it clear, this patch is really a no-go for
  upstreaming.

  The RDMA code isn't upstream as far as I know, and doing this here is
  really not a good idea even internally.

Right, this change is not needed for and not related to upstream; it is
just to minimize the difference from upstream.

I will not upstream this patch.

Regards,
Philip

  Regards,
  Christian.

    Signed-off-by: Philip Yang
    ---
     drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 33 +++-
     1 file changed, 19 insertions(+), 14 deletions(-)

    diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
    index 6c7133bf51d8..101a85263b53 100644
    --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
    +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
    @@ -698,12 +698,15 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev,
             unsigned long size = min(cursor.size, MAX_SG_SEGMENT_SIZE);
             dma_addr_t addr;

    -        addr = dma_map_resource(dev, phys, size, dir,
    -                                DMA_ATTR_SKIP_CPU_SYNC);
    -        r = dma_mapping_error(dev, addr);
    -        if (r)
    -            goto error_unmap;
    -
    +        if (dev) {
    +            addr = dma_map_resource(dev, phys, size, dir,
    +                                    DMA_ATTR_SKIP_CPU_SYNC);
    +            r = dma_mapping_error(dev, addr);
    +            if (r)
    +                goto error_unmap;
    +        } else {
    +            addr = phys;
    +        }
             sg_set_page(sg, NULL, size, 0);
             sg_dma_address(sg) = addr;
             sg_dma_len(sg) = size;
    @@ -717,10 +720,10 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev,
         for_each_sgtable_sg((*sgt), sg, i) {
             if (!sg->length)
                 continue;
    -
    -        dma_unmap_resource(dev, sg->dma_address,
    -                           sg->length, dir,
    -                           DMA_ATTR_SKIP_CPU_SYNC);
    +        if (dev)
    +            dma_unmap_resource(dev, sg->dma_address,
    +                               sg->length, dir,
    +                               DMA_ATTR_SKIP_CPU_SYNC);
         }
         sg_free_table(*sgt);

    @@ -745,10 +748,12 @@ void amdgpu_vram_mgr_free_sgt(struct device *dev,
         struct scatterlist *sg;
         int i;

    -    for_each_sgtable_sg(sgt, sg, i)
    -        dma_unmap_resource(dev, sg->dma_address,
    -                           sg->length, dir,
    -                           DMA_ATTR_SKIP_CPU_SYNC);
    +    if (dev) {
    +        for_each_sgtable_sg(sgt, sg, i)
    +            dma_unmap_resource(dev, sg->dma_address,
    +                               sg->length, dir,
    +                               DMA_ATTR_SKIP_CPU_SYNC);
    +    }
         sg_free_table(sgt);
         kfree(sgt);
     }

  
  

  



[PATCH v4 1/7] drm/amdgpu: Support contiguous VRAM allocation

2024-04-23 Thread Philip Yang
RDMA devices with limited scatter-gather ability require contiguous VRAM
buffer allocation for RDMA peer-direct support.

Add a new KFD alloc memory flag and store it as the bo alloc flag
AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS. When pinning this bo to export it for
RDMA peer-direct access, this sets the TTM_PL_FLAG_CONTIGUOUS flag and
asks the VRAM buddy allocator for contiguous VRAM.

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 4 
 include/uapi/linux/kfd_ioctl.h   | 1 +
 2 files changed, 5 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 0ae9fd844623..ef9154043757 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -1712,6 +1712,10 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
alloc_flags = AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE;
alloc_flags |= (flags & KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC) 
?
AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED : 0;
+
+   /* For contiguous VRAM allocation */
+   if (flags & 
KFD_IOC_ALLOC_MEM_FLAGS_CONTIGUOUS_BEST_EFFORT)
+   alloc_flags |= 
AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
}
xcp_id = fpriv->xcp_id == AMDGPU_XCP_NO_PARTITION ?
0 : fpriv->xcp_id;
diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index 2040a470ddb4..c1394c162d4e 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -407,6 +407,7 @@ struct kfd_ioctl_acquire_vm_args {
 #define KFD_IOC_ALLOC_MEM_FLAGS_COHERENT   (1 << 26)
 #define KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED   (1 << 25)
 #define KFD_IOC_ALLOC_MEM_FLAGS_EXT_COHERENT   (1 << 24)
+#define KFD_IOC_ALLOC_MEM_FLAGS_CONTIGUOUS_BEST_EFFORT (1 << 23)
 
 /* Allocate memory for later SVM (shared virtual memory) mapping.
  *
-- 
2.43.2



[PATCH v4 3/7] drm/amdgpu: Evict BOs from same process for contiguous allocation

2024-04-23 Thread Philip Yang
When TTM fails to alloc VRAM, TTM tries to evict BOs from VRAM to system
memory and then retries the allocation, but it skips KFD BOs belonging
to the same process because KFD requires all BOs to be resident for user
queues.

If TTM allocates contiguous VRAM with the TTM_PL_FLAG_CONTIGUOUS flag,
allow TTM to evict KFD BOs from the same process; this evicts the user
queues first, and the queues are restored later, after the contiguous
VRAM allocation.

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index 851509c6e90e..c907d6005641 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -1398,7 +1398,8 @@ static bool amdgpu_ttm_bo_eviction_valuable(struct 
ttm_buffer_object *bo,
 */
dma_resv_for_each_fence(&resv_cursor, bo->base.resv,
DMA_RESV_USAGE_BOOKKEEP, f) {
-   if (amdkfd_fence_check_mm(f, current->mm))
+   if (amdkfd_fence_check_mm(f, current->mm) &&
+   !(place->flags & TTM_PL_FLAG_CONTIGUOUS))
return false;
}
 
-- 
2.43.2



[PATCH v4 7/7] drm/amdkfd: Bump kfd version for contiguous VRAM allocation

2024-04-23 Thread Philip Yang
Bump the kfd ioctl minor version to declare support for the contiguous
VRAM allocation flag.

Signed-off-by: Philip Yang 
---
 include/uapi/linux/kfd_ioctl.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index c1394c162d4e..a5ebbe98ff7f 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -41,9 +41,10 @@
  * - 1.13 - Add debugger API
  * - 1.14 - Update kfd_event_data
  * - 1.15 - Enable managing mappings in compute VMs with GEM_VA ioctl
+ * - 1.16 - Add contiguous VRAM allocation flag
  */
 #define KFD_IOCTL_MAJOR_VERSION 1
-#define KFD_IOCTL_MINOR_VERSION 15
+#define KFD_IOCTL_MINOR_VERSION 16
 
 struct kfd_ioctl_get_version_args {
__u32 major_version;/* from KFD */
-- 
2.43.2



[PATCH v4 4/7] drm/amdkfd: Evict BO itself for contiguous allocation

2024-04-23 Thread Philip Yang
If the BO pages pinned for RDMA are not contiguous in VRAM, evict the BO
to system memory first to free the VRAM space, then allocate contiguous
VRAM space, and then move it from system memory back to VRAM.

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 16 +++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index ef9154043757..5d118e5580ce 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -1470,13 +1470,27 @@ static int amdgpu_amdkfd_gpuvm_pin_bo(struct amdgpu_bo 
*bo, u32 domain)
if (unlikely(ret))
return ret;
 
+	if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) {
+		/*
+		 * If bo is not contiguous on VRAM, move to system memory first
+		 * to ensure we can get contiguous VRAM space after evicting
+		 * other BOs.
+		 */
+		if (!(bo->tbo.resource->placement & TTM_PL_FLAG_CONTIGUOUS)) {
+			ret = amdgpu_amdkfd_bo_validate(bo, AMDGPU_GEM_DOMAIN_GTT, false);
+			if (unlikely(ret)) {
+				pr_debug("validate bo 0x%p to GTT failed %d\n", &bo->tbo, ret);
+				goto out;
+			}
+		}
+	}
+
+
ret = amdgpu_bo_pin_restricted(bo, domain, 0, 0);
if (ret)
pr_err("Error in Pinning BO to domain: %d\n", domain);
 
amdgpu_bo_sync_wait(bo, AMDGPU_FENCE_OWNER_KFD, false);
+out:
amdgpu_bo_unreserve(bo);
-
return ret;
 }
 
-- 
2.43.2



[PATCH v4 6/7] drm/amdgpu: Skip dma map resource for null RDMA device

2024-04-23 Thread Philip Yang
To test RDMA using a dummy driver on a system without a NIC/RDMA
device, the get/put dma pages calls pass in a null device pointer. Skip
the dma map/unmap of the resource and sg table in that case to avoid a
null pointer access.

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 33 +++-
 1 file changed, 19 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
index 6c7133bf51d8..101a85263b53 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -698,12 +698,15 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev,
unsigned long size = min(cursor.size, MAX_SG_SEGMENT_SIZE);
dma_addr_t addr;
 
-   addr = dma_map_resource(dev, phys, size, dir,
-   DMA_ATTR_SKIP_CPU_SYNC);
-   r = dma_mapping_error(dev, addr);
-   if (r)
-   goto error_unmap;
-
+   if (dev) {
+   addr = dma_map_resource(dev, phys, size, dir,
+   DMA_ATTR_SKIP_CPU_SYNC);
+   r = dma_mapping_error(dev, addr);
+   if (r)
+   goto error_unmap;
+   } else {
+   addr = phys;
+   }
sg_set_page(sg, NULL, size, 0);
sg_dma_address(sg) = addr;
sg_dma_len(sg) = size;
@@ -717,10 +720,10 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev,
for_each_sgtable_sg((*sgt), sg, i) {
if (!sg->length)
continue;
-
-   dma_unmap_resource(dev, sg->dma_address,
-  sg->length, dir,
-  DMA_ATTR_SKIP_CPU_SYNC);
+   if (dev)
+   dma_unmap_resource(dev, sg->dma_address,
+  sg->length, dir,
+  DMA_ATTR_SKIP_CPU_SYNC);
}
sg_free_table(*sgt);
 
@@ -745,10 +748,12 @@ void amdgpu_vram_mgr_free_sgt(struct device *dev,
struct scatterlist *sg;
int i;
 
-   for_each_sgtable_sg(sgt, sg, i)
-   dma_unmap_resource(dev, sg->dma_address,
-  sg->length, dir,
-  DMA_ATTR_SKIP_CPU_SYNC);
+   if (dev) {
+   for_each_sgtable_sg(sgt, sg, i)
+   dma_unmap_resource(dev, sg->dma_address,
+  sg->length, dir,
+  DMA_ATTR_SKIP_CPU_SYNC);
+   }
sg_free_table(sgt);
kfree(sgt);
 }
-- 
2.43.2



[PATCH v4 5/7] drm/amdkfd: Increase KFD bo restore wait time

2024-04-23 Thread Philip Yang
TTM may take more than 1 second to evict BOs when allocating contiguous
VRAM for a larger RDMA buffer. Because the KFD restore BO worker
reserves all KFD BOs, TTM cannot take the locks of the remaining KFD
BOs to evict them, and the contiguous VRAM allocation fails.

Increase the KFD restore BO wait time to 2 seconds, long enough for the
RDMA pin BO path to allocate the contiguous VRAM.

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index a81ef232fdef..c205e2d3acf9 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -698,7 +698,7 @@ struct qcm_process_device {
 /* KFD Memory Eviction */
 
 /* Approx. wait time before attempting to restore evicted BOs */
-#define PROCESS_RESTORE_TIME_MS 100
+#define PROCESS_RESTORE_TIME_MS 2000
 /* Approx. back off time if restore fails due to lack of memory */
 #define PROCESS_BACK_OFF_TIME_MS 100
 /* Approx. time before evicting the process again */
-- 
2.43.2



[PATCH v4 2/7] drm/amdgpu: Handle sg size limit for contiguous allocation

2024-04-23 Thread Philip Yang
Define the macro MAX_SG_SEGMENT_SIZE as 2GB: struct scatterlist's
length is an unsigned int, and some of its users cast the length to a
signed int, so every sg table segment is limited to 2GB maximum.

For contiguous VRAM allocation, don't limit the maximum buddy block
size, in order to get contiguous VRAM memory. To work around the sg
table segment size limit, allocate multiple segments if the contiguous
size is bigger than MAX_SG_SEGMENT_SIZE.
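
A minimal standalone sketch of the hazard (illustration only, not part
of the patch), assuming the usual Linux implementation-defined
unsigned-to-signed conversion: a segment length of 2GiB or more turns
negative once a caller casts the unsigned int scatterlist length to a
signed int.

#include <stdio.h>

int main(void)
{
	/* struct scatterlist.length is an unsigned int in the kernel. */
	unsigned int sg_len = 2U << 30;	/* a 2 GiB segment length */

	/* A caller that casts the length to a signed int misreads it. */
	printf("unsigned length: %u\n", sg_len);	/* 2147483648 */
	printf("as signed int: %d\n", (int)sg_len);	/* -2147483648 */
	return 0;
}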

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
index 4be8b091099a..6c7133bf51d8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -31,6 +31,8 @@
 #include "amdgpu_atomfirmware.h"
 #include "atom.h"
 
+#define MAX_SG_SEGMENT_SIZE	(2UL << 30)
+
 struct amdgpu_vram_reservation {
u64 start;
u64 size;
@@ -532,9 +534,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
 
BUG_ON(min_block_size < mm->chunk_size);
 
-   /* Limit maximum size to 2GiB due to SG table limitations */
-   size = min(remaining_size, 2ULL << 30);
-
+   size = remaining_size;
		if ((size >= (u64)pages_per_block << PAGE_SHIFT) &&
		    !(size & (((u64)pages_per_block << PAGE_SHIFT) - 1)))
min_block_size = (u64)pages_per_block << PAGE_SHIFT;
@@ -675,7 +675,7 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev,
amdgpu_res_first(res, offset, length, &cursor);
while (cursor.remaining) {
num_entries++;
-   amdgpu_res_next(&cursor, cursor.size);
+   amdgpu_res_next(&cursor, min(cursor.size, MAX_SG_SEGMENT_SIZE));
}
 
r = sg_alloc_table(*sgt, num_entries, GFP_KERNEL);
@@ -695,7 +695,7 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev,
amdgpu_res_first(res, offset, length, &cursor);
for_each_sgtable_sg((*sgt), sg, i) {
phys_addr_t phys = cursor.start + adev->gmc.aper_base;
-   size_t size = cursor.size;
+   unsigned long size = min(cursor.size, MAX_SG_SEGMENT_SIZE);
dma_addr_t addr;
 
addr = dma_map_resource(dev, phys, size, dir,
@@ -708,7 +708,7 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev,
sg_dma_address(sg) = addr;
sg_dma_len(sg) = size;
 
-   amdgpu_res_next(&cursor, cursor.size);
+   amdgpu_res_next(&cursor, size);
}
 
return 0;
-- 
2.43.2



[PATCH v4 0/7] Best effort contiguous VRAM allocation

2024-04-23 Thread Philip Yang
This patch series implements a new KFD memory alloc flag for best
effort contiguous VRAM allocation, to support peer direct access RDMA
devices with limited scatter-gather dma capability.

v2: rebase on patch ("drm/amdgpu: Modify the contiguous flags behaviour")
to avoid adding the new GEM flag

v3: add patch 2 to handle sg segment size limit (Christian)

v4: remove the buddy block size limit from vram mgr because the sg
table creation already removes the limit, and the resource uses u64
to handle block start, size (Christian)
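
As a usage illustration only (not part of this series, and omitting the
surrounding VM acquisition and GPU mapping steps), a userspace client
could request best effort contiguous VRAM through the raw KFD alloc
ioctl roughly as sketched below; a real application would normally go
through the ROCm Thunk (hsaKmtAllocMemory) instead, and the flag
requires a uapi header carrying the definition added by patch 1.

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kfd_ioctl.h>

/* Hypothetical helper: allocate VRAM at a caller-chosen VA, asking the
 * driver to place it contiguously if it can. The flag is best effort,
 * so the allocation can still succeed scattered when eviction does not
 * free enough contiguous space.
 */
static int alloc_contiguous_vram(int kfd_fd, uint32_t gpu_id,
				 uint64_t va, uint64_t size,
				 uint64_t *handle)
{
	struct kfd_ioctl_alloc_memory_of_gpu_args args = {
		.va_addr = va,
		.size = size,
		.gpu_id = gpu_id,
		.flags = KFD_IOC_ALLOC_MEM_FLAGS_VRAM |
			 KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE |
			 KFD_IOC_ALLOC_MEM_FLAGS_CONTIGUOUS_BEST_EFFORT,
	};

	if (ioctl(kfd_fd, AMDKFD_IOC_ALLOC_MEMORY_OF_GPU, &args))
		return -1;

	*handle = args.handle;	/* used later for map/unmap/free ioctls */
	return 0;
}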

Philip Yang (7):
  drm/amdgpu: Support contiguous VRAM allocation
  drm/amdgpu: Handle sg size limit for contiguous allocation
  drm/amdgpu: Evict BOs from same process for contiguous allocation
  drm/amdkfd: Evict BO itself for contiguous allocation
  drm/amdkfd: Increase KFD bo restore wait time
  drm/amdgpu: Skip dma map resource for null RDMA device
  drm/amdkfd: Bump kfd version for contiguous VRAM allocation

 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 20 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c   |  3 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c  | 45 ++-
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h |  2 +-
 include/uapi/linux/kfd_ioctl.h|  4 +-
 5 files changed, 50 insertions(+), 24 deletions(-)

-- 
2.43.2



Re: [PATCH v3 6/7] drm/amdgpu: Skip dma map resource for null RDMA device

2024-04-22 Thread Philip Yang

  


On 2024-04-22 10:56, Christian König wrote:
> Am 22.04.24 um 15:57 schrieb Philip Yang:
>> To test RDMA using a dummy driver on a system without a NIC/RDMA
>> device, the get/put dma pages calls pass in a null device pointer.
>> Skip the dma map/unmap of the resource and sg table in that case to
>> avoid a null pointer access.
>
> Well that is completely illegal and would break IOMMU.
>
> Why does the RDMA driver do that in the first place?

That is the amdp2ptest driver, part of the KFDTest rdma test. The
simple rdma test app and driver are used to test the driver path,
without actually transferring data between machines.

Regards,
Philip

> Regards,
> Christian.
  



Re: [PATCH v3 2/7] drm/amdgpu: Handle sg size limit for contiguous allocation

2024-04-22 Thread Philip Yang

  


On 2024-04-22 10:40, Christian König wrote:
> Am 22.04.24 um 15:57 schrieb Philip Yang:
>> Define macro MAX_SG_SEGMENT_SIZE 2GB, because struct scatterlist
>> length is an unsigned int, and some users of it cast the length to a
>> signed int, so every sg table segment is limited to 2GB maximum.
>>
>> For contiguous VRAM allocation, don't limit the max buddy block size
>> in order to get contiguous VRAM memory. To work around the sg table
>> segment size limit, allocate multiple segments if the contiguous
>> size is bigger than MAX_SG_SEGMENT_SIZE.
>>
>> Signed-off-by: Philip Yang 
>> ---
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 17 -
>>  1 file changed, 12 insertions(+), 5 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
>> index 4be8b091099a..9fe56a21ef88 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
>> @@ -31,6 +31,8 @@
>>   #include "amdgpu_atomfirmware.h"
>>   #include "atom.h"
>>
>> +#define MAX_SG_SEGMENT_SIZE	(2UL << 30)
>> +
>>   struct amdgpu_vram_reservation {
>>   	u64 start;
>>   	u64 size;
>> @@ -532,8 +534,13 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
>>
>>   	BUG_ON(min_block_size < mm->chunk_size);
>>
>> -	/* Limit maximum size to 2GiB due to SG table limitations */
>> -	size = min(remaining_size, 2ULL << 30);
>> +	if (place->flags & TTM_PL_FLAG_CONTIGUOUS)
>> +		size = remaining_size;
>> +	else
>> +		/* Limit maximum size to 2GiB due to SG table
>> +		 * limitations for no contiguous allocation.
>> +		 */
>> +		size = min(remaining_size, MAX_SG_SEGMENT_SIZE);
>
> Well that doesn't make sense, either fix the creation of the sg
> tables or limit the segment size. Not both.

Yes, right. We don't need to limit the segment size for non-contiguous
allocation either, as this is handled by min_block_size. I will send a
v4 patch to fix this. Then we could have another patch to remove the
while loop, size and remaining_size, to simplify the code in future.

Regards,
Philip

>>   	if ((size >= (u64)pages_per_block << PAGE_SHIFT) &&
>>   	    !(size & (((u64)pages_per_block << PAGE_SHIFT) - 1)))
>> @@ -675,7 +682,7 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev,
>>   	amdgpu_res_first(res, offset, length, &cursor);
>>   	while (cursor.remaining) {
>>   		num_entries++;
>> -		amdgpu_res_next(&cursor, cursor.size);
>> +		amdgpu_res_next(&cursor, min(cursor.size, MAX_SG_SEGMENT_SIZE));
>>   	}
>>
>>   	r = sg_alloc_table(*sgt, num_entries, GFP_KERNEL);
>> @@ -695,7 +702,7 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev,
>>   	amdgpu_res_first(res, offset, length, &cursor);
>>   	for_each_sgtable_sg((*sgt), sg, i) {
>>   		phys_addr_t phys = cursor.start + adev->gmc.aper_base;
>> -		size_t size = cursor.size;
>> +		unsigned long size = min(cursor.size, MAX_SG_SEGMENT_SIZE);
>
> Please keep size_t here or use unsigned int, using unsigned long just
> looks like trying to hide the problem.
>
> And I wouldn't use a separate define but rather just INT_MAX instead.
>
> Regards,
> Christian.


[PATCH v3 6/7] drm/amdgpu: Skip dma map resource for null RDMA device

2024-04-22 Thread Philip Yang
To test RDMA using a dummy driver on a system without a NIC/RDMA
device, the get/put dma pages calls pass in a null device pointer. Skip
the dma map/unmap of the resource and sg table in that case to avoid a
null pointer access.

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 33 +++-
 1 file changed, 19 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
index 9fe56a21ef88..0caf2c89ef1d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -705,12 +705,15 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev,
unsigned long size = min(cursor.size, MAX_SG_SEGMENT_SIZE);
dma_addr_t addr;
 
-   addr = dma_map_resource(dev, phys, size, dir,
-   DMA_ATTR_SKIP_CPU_SYNC);
-   r = dma_mapping_error(dev, addr);
-   if (r)
-   goto error_unmap;
-
+   if (dev) {
+   addr = dma_map_resource(dev, phys, size, dir,
+   DMA_ATTR_SKIP_CPU_SYNC);
+   r = dma_mapping_error(dev, addr);
+   if (r)
+   goto error_unmap;
+   } else {
+   addr = phys;
+   }
sg_set_page(sg, NULL, size, 0);
sg_dma_address(sg) = addr;
sg_dma_len(sg) = size;
@@ -724,10 +727,10 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev,
for_each_sgtable_sg((*sgt), sg, i) {
if (!sg->length)
continue;
-
-   dma_unmap_resource(dev, sg->dma_address,
-  sg->length, dir,
-  DMA_ATTR_SKIP_CPU_SYNC);
+   if (dev)
+   dma_unmap_resource(dev, sg->dma_address,
+  sg->length, dir,
+  DMA_ATTR_SKIP_CPU_SYNC);
}
sg_free_table(*sgt);
 
@@ -752,10 +755,12 @@ void amdgpu_vram_mgr_free_sgt(struct device *dev,
struct scatterlist *sg;
int i;
 
-   for_each_sgtable_sg(sgt, sg, i)
-   dma_unmap_resource(dev, sg->dma_address,
-  sg->length, dir,
-  DMA_ATTR_SKIP_CPU_SYNC);
+   if (dev) {
+   for_each_sgtable_sg(sgt, sg, i)
+   dma_unmap_resource(dev, sg->dma_address,
+  sg->length, dir,
+  DMA_ATTR_SKIP_CPU_SYNC);
+   }
sg_free_table(sgt);
kfree(sgt);
 }
-- 
2.43.2


