Re: [PATCH 2/2] drm/amdgpu: improve VM state machine documentation

2018-09-01 Thread Christian König

Am 01.09.2018 um 01:51 schrieb Felix Kuehling:

Thanks for this. A few comments and a question inline.

On 2018-08-31 09:27 AM, Christian König wrote:

Since we have a lot of FAQ on the VM state machine try to improve the
documentation by adding functions for each state move.

Signed-off-by: Christian König 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 107 -
  1 file changed, 79 insertions(+), 28 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index a9275a99d793..40c22635fefd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -204,6 +204,69 @@ static unsigned amdgpu_vm_bo_size(struct amdgpu_device 
*adev, unsigned level)
return AMDGPU_GPU_PAGE_ALIGN(amdgpu_vm_num_entries(adev, level) * 8);
  }
  
+/**

+ * amdgpu_vm_bo_evicted - vm_bo is evicted
+ *
+ * @vm_bo: vm_bo which is evicted
+ *
+ * State for PDs/PTs and per VM BOs which are not at the location they should
+ * be.
+ */
+static void amdgpu_vm_bo_evicted(struct amdgpu_vm_bo_base *vm_bo)
+{
+   struct amdgpu_vm *vm = vm_bo->vm;
+   struct amdgpu_bo *bo = vm_bo->bo;
+
+   vm_bo->moved = true;
+   if (bo->tbo.type == ttm_bo_type_kernel)
+   list_move(&vm_bo->vm_status, &vm->evicted);
+   else
+   list_move_tail(&vm_bo->vm_status, &vm->evicted);
+}
+
+/**
+ * amdgpu_vm_bo_relocated - vm_bo is reloacted
+ *
+ * @vm_bo: vm_bo which is relocated
+ *
+ * State for PDs/PTs which needs to update their parent PD.
+ */
+static void amdgpu_vm_bo_relocated(struct amdgpu_vm_bo_base *vm_bo)
+{
+   list_move(&vm_bo->vm_status, &vm_bo->vm->relocated);
+}
+
+/**
+ * amdgpu_vm_bo_moved - vm_bo is moved
+ *
+ * @vm_bo: vm_bo which is moved
+ *
+ * State for per VM and normal BOs which are moved, but that change is not yet
+ * reflected in the page tables.

I have a question here. Why does amdgpu_cs_vm_handling call
amdgpu_vm_bo_update manually for its BO list entries? Wouldn't it be
enough to just call amdgpu_vm_handle_moved?


No, it is still possible that the mapping of the BO was never updated 
because the VM page tables where evicted.


We still need to make sure that all BOs of the current submission are 
updated correctly.





+ */
+static void amdgpu_vm_bo_moved(struct amdgpu_vm_bo_base *vm_bo)
+{
+   struct amdgpu_vm *vm = vm_bo->vm;
+
+   spin_lock(&vm->moved_lock);
+   list_move(&vm_bo->vm_status, &vm->moved);
+   spin_unlock(&vm->moved_lock);

If vm->moved_lock protects the moved list, do we also need to take it
whenever something is moved from that list?


Correct, yes.


That could potentially be
any list_move operation that uses vm_bo->vm_status. I found one case
below where that may not be handled correctly.


+}
+
+/**
+ * amdgpu_vm_bo_idle - vm_bo is idle
+ *
+ * @vm_bo: vm_bo which is now idle
+ *
+ * State for PDs/PTs and per VM BOs which have gone through the state machine
+ * and are now idle.
+ */
+static void amdgpu_vm_bo_idle(struct amdgpu_vm_bo_base *vm_bo)
+{
+   list_move(&vm_bo->vm_status, &vm_bo->vm->idle);
+   vm_bo->moved = false;
+}
+
  /**
   * amdgpu_vm_bo_base_init - Adds bo to the list of bos associated with the vm
   *
@@ -232,9 +295,9 @@ static void amdgpu_vm_bo_base_init(struct amdgpu_vm_bo_base 
*base,
  
  	vm->bulk_moveable = false;

if (bo->tbo.type == ttm_bo_type_kernel)
-   list_move(&base->vm_status, &vm->relocated);
+   amdgpu_vm_bo_relocated(base);
else
-   list_move(&base->vm_status, &vm->idle);
+   amdgpu_vm_bo_idle(base);
  
  	if (bo->preferred_domains &

amdgpu_mem_type_to_domain(bo->tbo.mem.mem_type))
@@ -245,8 +308,7 @@ static void amdgpu_vm_bo_base_init(struct amdgpu_vm_bo_base 
*base,
 * is currently evicted. add the bo to the evicted list to make sure it
 * is validated on next vm use to avoid fault.
 * */
-   list_move_tail(&base->vm_status, &vm->evicted);
-   base->moved = true;
+   amdgpu_vm_bo_evicted(base);
  }
  
  /**

@@ -342,9 +404,7 @@ int amdgpu_vm_validate_pt_bos(struct amdgpu_device *adev, 
struct amdgpu_vm *vm,
break;
  
  		if (bo->tbo.type != ttm_bo_type_kernel) {

-   spin_lock(&vm->moved_lock);
-   list_move(&bo_base->vm_status, &vm->moved);
-   spin_unlock(&vm->moved_lock);
+   amdgpu_vm_bo_moved(bo_base);
} else {
if (vm->use_cpu_for_update)
r = amdgpu_bo_kmap(bo, NULL);
@@ -352,7 +412,7 @@ int amdgpu_vm_validate_pt_bos(struct amdgpu_device *adev, 
struct amdgpu_vm *vm,
r = amdgpu_ttm_alloc_gart(&bo->tbo);
if (r)
break;
-   list_move(&bo_base->vm_status, &vm->relocated);
+

Re: [PATCH libdrm] amdgpu: When couldn't find bo, need to return error.

2018-09-01 Thread Christian König

Am 01.09.2018 um 06:24 schrieb Emily Deng:

The startx will have segmant fault if return success.

SWDEV-163962

Change-Id: I56b189fa26efdcd1d96e5100af3f3e0b1208b0c3
Signed-off-by: Emily Deng 


Jerry already send a much better patch for this.


---
  amdgpu/amdgpu_bo.c | 4 +++-
  1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/amdgpu/amdgpu_bo.c b/amdgpu/amdgpu_bo.c
index f25cacc..7e297fa 100644
--- a/amdgpu/amdgpu_bo.c
+++ b/amdgpu/amdgpu_bo.c
@@ -760,6 +760,7 @@ int amdgpu_find_bo_by_cpu_mapping(amdgpu_device_handle dev,
  uint64_t *offset_in_bo)
  {
uint32_t i;
+   int r = 0;
struct amdgpu_bo *bo;
  
  	if (cpu == NULL || size == 0)

@@ -787,10 +788,11 @@ int amdgpu_find_bo_by_cpu_mapping(amdgpu_device_handle 
dev,
} else {
*buf_handle = NULL;
*offset_in_bo = 0;
+   r = -errno;


errno doesn't contain any error in this case.


}
pthread_mutex_unlock(&dev->bo_table_mutex);
  
-	return 0;

+   return r;
  }
  
  int amdgpu_create_bo_from_user_mem(amdgpu_device_handle dev,


___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


RE: [PATCH libdrm] amdgpu: When couldn't find bo, need to return error.

2018-09-01 Thread Deng, Emily
Ok, then just ignore this patch. But seems didn't saw the patch on branch 
amd-staging-hybrid-master20180315.

Best wishes
Emily Deng

>-Original Message-
>From: Christian König 
>Sent: Saturday, September 1, 2018 4:17 PM
>To: Deng, Emily ; amd-gfx@lists.freedesktop.org
>Subject: Re: [PATCH libdrm] amdgpu: When couldn't find bo, need to return
>error.
>
>Am 01.09.2018 um 06:24 schrieb Emily Deng:
>> The startx will have segmant fault if return success.
>>
>> SWDEV-163962
>>
>> Change-Id: I56b189fa26efdcd1d96e5100af3f3e0b1208b0c3
>> Signed-off-by: Emily Deng 
>
>Jerry already send a much better patch for this.
>
>> ---
>>   amdgpu/amdgpu_bo.c | 4 +++-
>>   1 file changed, 3 insertions(+), 1 deletion(-)
>>
>> diff --git a/amdgpu/amdgpu_bo.c b/amdgpu/amdgpu_bo.c index
>> f25cacc..7e297fa 100644
>> --- a/amdgpu/amdgpu_bo.c
>> +++ b/amdgpu/amdgpu_bo.c
>> @@ -760,6 +760,7 @@ int
>amdgpu_find_bo_by_cpu_mapping(amdgpu_device_handle dev,
>>uint64_t *offset_in_bo)
>>   {
>>  uint32_t i;
>> +int r = 0;
>>  struct amdgpu_bo *bo;
>>
>>  if (cpu == NULL || size == 0)
>> @@ -787,10 +788,11 @@ int
>amdgpu_find_bo_by_cpu_mapping(amdgpu_device_handle dev,
>>  } else {
>>  *buf_handle = NULL;
>>  *offset_in_bo = 0;
>> +r = -errno;
>
>errno doesn't contain any error in this case.
>
>>  }
>>  pthread_mutex_unlock(&dev->bo_table_mutex);
>>
>> -return 0;
>> +return r;
>>   }
>>
>>   int amdgpu_create_bo_from_user_mem(amdgpu_device_handle dev,

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [PATCH 1/1] drm/amdgpu: Clean up KFD init and fini

2018-09-01 Thread Deucher, Alexander
Acked-by: Alex Deucher 


From: amd-gfx  on behalf of Felix 
Kuehling 
Sent: Friday, August 31, 2018 5:34:42 PM
To: amd-gfx@lists.freedesktop.org
Cc: Kuehling, Felix
Subject: [PATCH 1/1] drm/amdgpu: Clean up KFD init and fini

Only initialize KFD once by moving amdgpu_amdkfd_init from
amdgpu_pci_probe to amdgpu_init. This fixes kernel oopses and hangs
when booting multi-GPU systems.

Also removed some vestiges of KFD being its own module.

Signed-off-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c |  5 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c| 12 
 2 files changed, 5 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 8bee9a0..a79df2a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -28,7 +28,6 @@
 #include 

 const struct kgd2kfd_calls *kgd2kfd;
-bool (*kgd2kfd_init_p)(unsigned int, const struct kgd2kfd_calls**);

 static const unsigned int compute_vmid_bitmap = 0xFF00;

@@ -51,10 +50,8 @@ int amdgpu_amdkfd_init(void)

 void amdgpu_amdkfd_fini(void)
 {
-   if (kgd2kfd) {
+   if (kgd2kfd)
 kgd2kfd->exit();
-   symbol_put(kgd2kfd_init);
-   }
 }

 void amdgpu_amdkfd_device_probe(struct amdgpu_device *adev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index a96ceff..b5c2ccb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -922,14 +922,6 @@ static int amdgpu_pci_probe(struct pci_dev *pdev,
 return -ENODEV;
 }

-   /*
-* Initialize amdkfd before starting radeon. If it was not loaded yet,
-* defer radeon probing
-*/
-   ret = amdgpu_amdkfd_init();
-   if (ret == -EPROBE_DEFER)
-   return ret;
-
 /* Get rid of things like offb */
 ret = amdgpu_kick_out_firmware_fb(pdev);
 if (ret)
@@ -1274,6 +1266,10 @@ static int __init amdgpu_init(void)
 pdriver = &amdgpu_kms_pci_driver;
 driver->num_ioctls = amdgpu_max_kms_ioctl;
 amdgpu_register_atpx_handler();
+
+   /* Ignore KFD init failures. Normal when CONFIG_HSA_AMD is not set. */
+   amdgpu_amdkfd_init();
+
 /* let modprobe override vga console setting */
 return pci_register_driver(pdriver);

--
2.7.4

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [PATCH 2/2] drm/amdgpu: improve VM state machine documentation

2018-09-01 Thread Kuehling, Felix
>>> @@ -1746,9 +1805,9 @@ int amdgpu_vm_bo_update(struct amdgpu_device *adev,
>>>   uint32_t mem_type = bo->tbo.mem.mem_type;
>>>
>>>   if (!(bo->preferred_domains & 
>>> amdgpu_mem_type_to_domain(mem_type)))
>>> -list_add_tail(&bo_va->base.vm_status, &vm->evicted);
>>> +amdgpu_vm_bo_evicted(&bo_va->base);
>>>   else
>>> -list_add(&bo_va->base.vm_status, &vm->idle);
>>> +amdgpu_vm_bo_idle(&bo_va->base);
>> There is a small change in behaviour here for clearing
>> bo_va->base.moved. Not sure if it matters.
>
> Using list_add is minimal more efficient.

What I meant was that the original code would not always set bo_va->base.moved 
= false (it's done a few lines above this under some conditions, but not 
always). The new code will always set bo_va->base.moved = false inside 
amdgpu_vm_bo_idle.

Regards,
  Felix


From: Christian König 
Sent: Saturday, September 1, 2018 4:16:17 AM
To: Kuehling, Felix; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH 2/2] drm/amdgpu: improve VM state machine documentation

Am 01.09.2018 um 01:51 schrieb Felix Kuehling:
> Thanks for this. A few comments and a question inline.
>
> On 2018-08-31 09:27 AM, Christian König wrote:
>> Since we have a lot of FAQ on the VM state machine try to improve the
>> documentation by adding functions for each state move.
>>
>> Signed-off-by: Christian König 
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 107 
>> -
>>   1 file changed, 79 insertions(+), 28 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> index a9275a99d793..40c22635fefd 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> @@ -204,6 +204,69 @@ static unsigned amdgpu_vm_bo_size(struct amdgpu_device 
>> *adev, unsigned level)
>>  return AMDGPU_GPU_PAGE_ALIGN(amdgpu_vm_num_entries(adev, level) * 8);
>>   }
>>
>> +/**
>> + * amdgpu_vm_bo_evicted - vm_bo is evicted
>> + *
>> + * @vm_bo: vm_bo which is evicted
>> + *
>> + * State for PDs/PTs and per VM BOs which are not at the location they 
>> should
>> + * be.
>> + */
>> +static void amdgpu_vm_bo_evicted(struct amdgpu_vm_bo_base *vm_bo)
>> +{
>> +struct amdgpu_vm *vm = vm_bo->vm;
>> +struct amdgpu_bo *bo = vm_bo->bo;
>> +
>> +vm_bo->moved = true;
>> +if (bo->tbo.type == ttm_bo_type_kernel)
>> +list_move(&vm_bo->vm_status, &vm->evicted);
>> +else
>> +list_move_tail(&vm_bo->vm_status, &vm->evicted);
>> +}
>> +
>> +/**
>> + * amdgpu_vm_bo_relocated - vm_bo is reloacted
>> + *
>> + * @vm_bo: vm_bo which is relocated
>> + *
>> + * State for PDs/PTs which needs to update their parent PD.
>> + */
>> +static void amdgpu_vm_bo_relocated(struct amdgpu_vm_bo_base *vm_bo)
>> +{
>> +list_move(&vm_bo->vm_status, &vm_bo->vm->relocated);
>> +}
>> +
>> +/**
>> + * amdgpu_vm_bo_moved - vm_bo is moved
>> + *
>> + * @vm_bo: vm_bo which is moved
>> + *
>> + * State for per VM and normal BOs which are moved, but that change is not 
>> yet
>> + * reflected in the page tables.
> I have a question here. Why does amdgpu_cs_vm_handling call
> amdgpu_vm_bo_update manually for its BO list entries? Wouldn't it be
> enough to just call amdgpu_vm_handle_moved?

No, it is still possible that the mapping of the BO was never updated
because the VM page tables where evicted.

We still need to make sure that all BOs of the current submission are
updated correctly.

>
>> + */
>> +static void amdgpu_vm_bo_moved(struct amdgpu_vm_bo_base *vm_bo)
>> +{
>> +struct amdgpu_vm *vm = vm_bo->vm;
>> +
>> +spin_lock(&vm->moved_lock);
>> +list_move(&vm_bo->vm_status, &vm->moved);
>> +spin_unlock(&vm->moved_lock);
> If vm->moved_lock protects the moved list, do we also need to take it
> whenever something is moved from that list?

Correct, yes.

> That could potentially be
> any list_move operation that uses vm_bo->vm_status. I found one case
> below where that may not be handled correctly.
>
>> +}
>> +
>> +/**
>> + * amdgpu_vm_bo_idle - vm_bo is idle
>> + *
>> + * @vm_bo: vm_bo which is now idle
>> + *
>> + * State for PDs/PTs and per VM BOs which have gone through the state 
>> machine
>> + * and are now idle.
>> + */
>> +static void amdgpu_vm_bo_idle(struct amdgpu_vm_bo_base *vm_bo)
>> +{
>> +list_move(&vm_bo->vm_status, &vm_bo->vm->idle);
>> +vm_bo->moved = false;
>> +}
>> +
>>   /**
>>* amdgpu_vm_bo_base_init - Adds bo to the list of bos associated with the 
>> vm
>>*
>> @@ -232,9 +295,9 @@ static void amdgpu_vm_bo_base_init(struct 
>> amdgpu_vm_bo_base *base,
>>
>>  vm->bulk_moveable = false;
>>  if (bo->tbo.type == ttm_bo_type_kernel)
>> -list_move(&base->vm_status, &vm->relocated);
>> +amdgpu_vm_bo_relocated(base);
>>  else
>> -list_mo

[PATCH 2/3] drm/amdgpu: separate per VM BOs from normal in the moved state

2018-09-01 Thread Christian König
Allows us to avoid taking the spinlock in more places.

Signed-off-by: Christian König 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 67 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h |  7 +++-
 2 files changed, 38 insertions(+), 36 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index a9275a99d793..65977e7c94dc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -342,9 +342,7 @@ int amdgpu_vm_validate_pt_bos(struct amdgpu_device *adev, 
struct amdgpu_vm *vm,
break;
 
if (bo->tbo.type != ttm_bo_type_kernel) {
-   spin_lock(&vm->moved_lock);
list_move(&bo_base->vm_status, &vm->moved);
-   spin_unlock(&vm->moved_lock);
} else {
if (vm->use_cpu_for_update)
r = amdgpu_bo_kmap(bo, NULL);
@@ -1734,10 +1732,6 @@ int amdgpu_vm_bo_update(struct amdgpu_device *adev,
amdgpu_asic_flush_hdp(adev, NULL);
}
 
-   spin_lock(&vm->moved_lock);
-   list_del_init(&bo_va->base.vm_status);
-   spin_unlock(&vm->moved_lock);
-
/* If the BO is not in its preferred location add it back to
 * the evicted list so that it gets validated again on the
 * next command submission.
@@ -1746,9 +1740,13 @@ int amdgpu_vm_bo_update(struct amdgpu_device *adev,
uint32_t mem_type = bo->tbo.mem.mem_type;
 
if (!(bo->preferred_domains & 
amdgpu_mem_type_to_domain(mem_type)))
-   list_add_tail(&bo_va->base.vm_status, &vm->evicted);
+   list_move_tail(&bo_va->base.vm_status, &vm->evicted);
else
-   list_add(&bo_va->base.vm_status, &vm->idle);
+   list_move(&bo_va->base.vm_status, &vm->idle);
+   } else {
+   spin_lock(&vm->invalidated_lock);
+   list_del_init(&bo_va->base.vm_status);
+   spin_unlock(&vm->invalidated_lock);
}
 
list_splice_init(&bo_va->invalids, &bo_va->valids);
@@ -1974,40 +1972,40 @@ int amdgpu_vm_handle_moved(struct amdgpu_device *adev,
   struct amdgpu_vm *vm)
 {
struct amdgpu_bo_va *bo_va, *tmp;
-   struct list_head moved;
+   struct reservation_object *resv;
bool clear;
int r;
 
-   INIT_LIST_HEAD(&moved);
-   spin_lock(&vm->moved_lock);
-   list_splice_init(&vm->moved, &moved);
-   spin_unlock(&vm->moved_lock);
+   list_for_each_entry_safe(bo_va, tmp, &vm->moved, base.vm_status) {
+   /* Per VM BOs never need to bo cleared in the page tables */
+   r = amdgpu_vm_bo_update(adev, bo_va, false);
+   if (r)
+   return r;
+   }
 
-   list_for_each_entry_safe(bo_va, tmp, &moved, base.vm_status) {
-   struct reservation_object *resv = bo_va->base.bo->tbo.resv;
+   spin_lock(&vm->invalidated_lock);
+   while (!list_empty(&vm->invalidated)) {
+   bo_va = list_first_entry(&vm->invalidated, struct amdgpu_bo_va,
+base.vm_status);
+   resv = bo_va->base.bo->tbo.resv;
+   spin_unlock(&vm->invalidated_lock);
 
-   /* Per VM BOs never need to bo cleared in the page tables */
-   if (resv == vm->root.base.bo->tbo.resv)
-   clear = false;
/* Try to reserve the BO to avoid clearing its ptes */
-   else if (!amdgpu_vm_debug && reservation_object_trylock(resv))
+   if (!amdgpu_vm_debug && reservation_object_trylock(resv))
clear = false;
/* Somebody else is using the BO right now */
else
clear = true;
 
r = amdgpu_vm_bo_update(adev, bo_va, clear);
-   if (r) {
-   spin_lock(&vm->moved_lock);
-   list_splice(&moved, &vm->moved);
-   spin_unlock(&vm->moved_lock);
+   if (r)
return r;
-   }
 
-   if (!clear && resv != vm->root.base.bo->tbo.resv)
+   if (!clear)
reservation_object_unlock(resv);
-
+   spin_lock(&vm->invalidated_lock);
}
+   spin_unlock(&vm->invalidated_lock);
 
return 0;
 }
@@ -2072,9 +2070,7 @@ static void amdgpu_vm_bo_insert_map(struct amdgpu_device 
*adev,
 
if (bo && bo->tbo.resv == vm->root.base.bo->tbo.resv &&
!bo_va->base.moved) {
-   spin_lock(&vm->moved_lock);
list_move(&bo_va->base.vm_status, &vm->moved);
-   spin_unlock(&vm->moved_lock);
}
trace_amdgpu_vm_bo_map(bo_va, mapping);
 }
@@ -2430,9 +2426,9 @@

[PATCH 1/3] drm/amdgpu: move size calculations to the front of the file again

2018-09-01 Thread Christian König
amdgpu_vm_bo_* functions should come much later.

Signed-off-by: Christian König 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 90 +-
 1 file changed, 45 insertions(+), 45 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index d59222fb5931..a9275a99d793 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -133,51 +133,6 @@ struct amdgpu_prt_cb {
struct dma_fence_cb cb;
 };
 
-/**
- * amdgpu_vm_bo_base_init - Adds bo to the list of bos associated with the vm
- *
- * @base: base structure for tracking BO usage in a VM
- * @vm: vm to which bo is to be added
- * @bo: amdgpu buffer object
- *
- * Initialize a bo_va_base structure and add it to the appropriate lists
- *
- */
-static void amdgpu_vm_bo_base_init(struct amdgpu_vm_bo_base *base,
-  struct amdgpu_vm *vm,
-  struct amdgpu_bo *bo)
-{
-   base->vm = vm;
-   base->bo = bo;
-   INIT_LIST_HEAD(&base->bo_list);
-   INIT_LIST_HEAD(&base->vm_status);
-
-   if (!bo)
-   return;
-   list_add_tail(&base->bo_list, &bo->va);
-
-   if (bo->tbo.resv != vm->root.base.bo->tbo.resv)
-   return;
-
-   vm->bulk_moveable = false;
-   if (bo->tbo.type == ttm_bo_type_kernel)
-   list_move(&base->vm_status, &vm->relocated);
-   else
-   list_move(&base->vm_status, &vm->idle);
-
-   if (bo->preferred_domains &
-   amdgpu_mem_type_to_domain(bo->tbo.mem.mem_type))
-   return;
-
-   /*
-* we checked all the prerequisites, but it looks like this per vm bo
-* is currently evicted. add the bo to the evicted list to make sure it
-* is validated on next vm use to avoid fault.
-* */
-   list_move_tail(&base->vm_status, &vm->evicted);
-   base->moved = true;
-}
-
 /**
  * amdgpu_vm_level_shift - return the addr shift for each level
  *
@@ -249,6 +204,51 @@ static unsigned amdgpu_vm_bo_size(struct amdgpu_device 
*adev, unsigned level)
return AMDGPU_GPU_PAGE_ALIGN(amdgpu_vm_num_entries(adev, level) * 8);
 }
 
+/**
+ * amdgpu_vm_bo_base_init - Adds bo to the list of bos associated with the vm
+ *
+ * @base: base structure for tracking BO usage in a VM
+ * @vm: vm to which bo is to be added
+ * @bo: amdgpu buffer object
+ *
+ * Initialize a bo_va_base structure and add it to the appropriate lists
+ *
+ */
+static void amdgpu_vm_bo_base_init(struct amdgpu_vm_bo_base *base,
+  struct amdgpu_vm *vm,
+  struct amdgpu_bo *bo)
+{
+   base->vm = vm;
+   base->bo = bo;
+   INIT_LIST_HEAD(&base->bo_list);
+   INIT_LIST_HEAD(&base->vm_status);
+
+   if (!bo)
+   return;
+   list_add_tail(&base->bo_list, &bo->va);
+
+   if (bo->tbo.resv != vm->root.base.bo->tbo.resv)
+   return;
+
+   vm->bulk_moveable = false;
+   if (bo->tbo.type == ttm_bo_type_kernel)
+   list_move(&base->vm_status, &vm->relocated);
+   else
+   list_move(&base->vm_status, &vm->idle);
+
+   if (bo->preferred_domains &
+   amdgpu_mem_type_to_domain(bo->tbo.mem.mem_type))
+   return;
+
+   /*
+* we checked all the prerequisites, but it looks like this per vm bo
+* is currently evicted. add the bo to the evicted list to make sure it
+* is validated on next vm use to avoid fault.
+* */
+   list_move_tail(&base->vm_status, &vm->evicted);
+   base->moved = true;
+}
+
 /**
  * amdgpu_vm_get_pd_bo - add the VM PD to a validation list
  *
-- 
2.14.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 3/3] drm/amdgpu: improve VM state machine documentation

2018-09-01 Thread Christian König
Since we have a lot of FAQ on the VM state machine try to improve the
documentation by adding functions for each state move.

Signed-off-by: Christian König 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 139 +
 1 file changed, 108 insertions(+), 31 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 65977e7c94dc..ce252ead2ee4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -204,6 +204,95 @@ static unsigned amdgpu_vm_bo_size(struct amdgpu_device 
*adev, unsigned level)
return AMDGPU_GPU_PAGE_ALIGN(amdgpu_vm_num_entries(adev, level) * 8);
 }
 
+/**
+ * amdgpu_vm_bo_evicted - vm_bo is evicted
+ *
+ * @vm_bo: vm_bo which is evicted
+ *
+ * State for PDs/PTs and per VM BOs which are not at the location they should
+ * be.
+ */
+static void amdgpu_vm_bo_evicted(struct amdgpu_vm_bo_base *vm_bo)
+{
+   struct amdgpu_vm *vm = vm_bo->vm;
+   struct amdgpu_bo *bo = vm_bo->bo;
+
+   vm_bo->moved = true;
+   if (bo->tbo.type == ttm_bo_type_kernel)
+   list_move(&vm_bo->vm_status, &vm->evicted);
+   else
+   list_move_tail(&vm_bo->vm_status, &vm->evicted);
+}
+
+/**
+ * amdgpu_vm_bo_relocated - vm_bo is reloacted
+ *
+ * @vm_bo: vm_bo which is relocated
+ *
+ * State for PDs/PTs which needs to update their parent PD.
+ */
+static void amdgpu_vm_bo_relocated(struct amdgpu_vm_bo_base *vm_bo)
+{
+   list_move(&vm_bo->vm_status, &vm_bo->vm->relocated);
+}
+
+/**
+ * amdgpu_vm_bo_moved - vm_bo is moved
+ *
+ * @vm_bo: vm_bo which is moved
+ *
+ * State for per VM BOs which are moved, but that change is not yet reflected
+ * in the page tables.
+ */
+static void amdgpu_vm_bo_moved(struct amdgpu_vm_bo_base *vm_bo)
+{
+   list_move(&vm_bo->vm_status, &vm_bo->vm->moved);
+}
+
+/**
+ * amdgpu_vm_bo_idle - vm_bo is idle
+ *
+ * @vm_bo: vm_bo which is now idle
+ *
+ * State for PDs/PTs and per VM BOs which have gone through the state machine
+ * and are now idle.
+ */
+static void amdgpu_vm_bo_idle(struct amdgpu_vm_bo_base *vm_bo)
+{
+   list_move(&vm_bo->vm_status, &vm_bo->vm->idle);
+   vm_bo->moved = false;
+}
+
+/**
+ * amdgpu_vm_bo_invalidated - vm_bo is invalidated
+ *
+ * @vm_bo: vm_bo which is now invalidated
+ *
+ * State for normal BOs which are invalidated and that change not yet reflected
+ * in the PTs.
+ */
+static void amdgpu_vm_bo_invalidated(struct amdgpu_vm_bo_base *vm_bo)
+{
+   spin_lock(&vm_bo->vm->invalidated_lock);
+   list_move(&vm_bo->vm_status, &vm_bo->vm->idle);
+   spin_unlock(&vm_bo->vm->invalidated_lock);
+}
+
+/**
+ * amdgpu_vm_bo_done - vm_bo is done
+ *
+ * @vm_bo: vm_bo which is now done
+ *
+ * State for normal BOs which are invalidated and that change has been updated
+ * in the PTs.
+ */
+static void amdgpu_vm_bo_done(struct amdgpu_vm_bo_base *vm_bo)
+{
+   spin_lock(&vm_bo->vm->invalidated_lock);
+   list_del_init(&vm_bo->vm_status);
+   spin_unlock(&vm_bo->vm->invalidated_lock);
+}
+
 /**
  * amdgpu_vm_bo_base_init - Adds bo to the list of bos associated with the vm
  *
@@ -232,9 +321,9 @@ static void amdgpu_vm_bo_base_init(struct amdgpu_vm_bo_base 
*base,
 
vm->bulk_moveable = false;
if (bo->tbo.type == ttm_bo_type_kernel)
-   list_move(&base->vm_status, &vm->relocated);
+   amdgpu_vm_bo_relocated(base);
else
-   list_move(&base->vm_status, &vm->idle);
+   amdgpu_vm_bo_idle(base);
 
if (bo->preferred_domains &
amdgpu_mem_type_to_domain(bo->tbo.mem.mem_type))
@@ -245,8 +334,7 @@ static void amdgpu_vm_bo_base_init(struct amdgpu_vm_bo_base 
*base,
 * is currently evicted. add the bo to the evicted list to make sure it
 * is validated on next vm use to avoid fault.
 * */
-   list_move_tail(&base->vm_status, &vm->evicted);
-   base->moved = true;
+   amdgpu_vm_bo_evicted(base);
 }
 
 /**
@@ -342,7 +430,7 @@ int amdgpu_vm_validate_pt_bos(struct amdgpu_device *adev, 
struct amdgpu_vm *vm,
break;
 
if (bo->tbo.type != ttm_bo_type_kernel) {
-   list_move(&bo_base->vm_status, &vm->moved);
+   amdgpu_vm_bo_moved(bo_base);
} else {
if (vm->use_cpu_for_update)
r = amdgpu_bo_kmap(bo, NULL);
@@ -350,7 +438,7 @@ int amdgpu_vm_validate_pt_bos(struct amdgpu_device *adev, 
struct amdgpu_vm *vm,
r = amdgpu_ttm_alloc_gart(&bo->tbo);
if (r)
break;
-   list_move(&bo_base->vm_status, &vm->relocated);
+   amdgpu_vm_bo_relocated(bo_base);
}
}
 
@@ -1121,8 +1209,7 @@ int amdgpu_vm_update_directories(struct amdgpu_device 
*adev,
bo_base = list_fir

Re: [PATCH 2/2] drm/amdgpu: improve VM state machine documentation

2018-09-01 Thread Christian König

Am 01.09.2018 um 18:45 schrieb Kuehling, Felix:

@@ -1746,9 +1805,9 @@ int amdgpu_vm_bo_update(struct amdgpu_device *adev,
   uint32_t mem_type = bo->tbo.mem.mem_type;

   if (!(bo->preferred_domains & 
amdgpu_mem_type_to_domain(mem_type)))
-list_add_tail(&bo_va->base.vm_status, &vm->evicted);
+amdgpu_vm_bo_evicted(&bo_va->base);
   else
-list_add(&bo_va->base.vm_status, &vm->idle);
+amdgpu_vm_bo_idle(&bo_va->base);

There is a small change in behaviour here for clearing
bo_va->base.moved. Not sure if it matters.

Using list_add is minimal more efficient.

What I meant was that the original code would not always set bo_va->base.moved = 
false (it's done a few lines above this under some conditions, but not always). The 
new code will always set bo_va->base.moved = false inside amdgpu_vm_bo_idle.


Good point as well, but that is indeed correct.

Regards,
Christian.



Regards,
   Felix


From: Christian König 
Sent: Saturday, September 1, 2018 4:16:17 AM
To: Kuehling, Felix; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH 2/2] drm/amdgpu: improve VM state machine documentation

Am 01.09.2018 um 01:51 schrieb Felix Kuehling:

Thanks for this. A few comments and a question inline.

On 2018-08-31 09:27 AM, Christian König wrote:

Since we have a lot of FAQ on the VM state machine try to improve the
documentation by adding functions for each state move.

Signed-off-by: Christian König 
---
   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 107 
-
   1 file changed, 79 insertions(+), 28 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index a9275a99d793..40c22635fefd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -204,6 +204,69 @@ static unsigned amdgpu_vm_bo_size(struct amdgpu_device 
*adev, unsigned level)
  return AMDGPU_GPU_PAGE_ALIGN(amdgpu_vm_num_entries(adev, level) * 8);
   }

+/**
+ * amdgpu_vm_bo_evicted - vm_bo is evicted
+ *
+ * @vm_bo: vm_bo which is evicted
+ *
+ * State for PDs/PTs and per VM BOs which are not at the location they should
+ * be.
+ */
+static void amdgpu_vm_bo_evicted(struct amdgpu_vm_bo_base *vm_bo)
+{
+struct amdgpu_vm *vm = vm_bo->vm;
+struct amdgpu_bo *bo = vm_bo->bo;
+
+vm_bo->moved = true;
+if (bo->tbo.type == ttm_bo_type_kernel)
+list_move(&vm_bo->vm_status, &vm->evicted);
+else
+list_move_tail(&vm_bo->vm_status, &vm->evicted);
+}
+
+/**
+ * amdgpu_vm_bo_relocated - vm_bo is reloacted
+ *
+ * @vm_bo: vm_bo which is relocated
+ *
+ * State for PDs/PTs which needs to update their parent PD.
+ */
+static void amdgpu_vm_bo_relocated(struct amdgpu_vm_bo_base *vm_bo)
+{
+list_move(&vm_bo->vm_status, &vm_bo->vm->relocated);
+}
+
+/**
+ * amdgpu_vm_bo_moved - vm_bo is moved
+ *
+ * @vm_bo: vm_bo which is moved
+ *
+ * State for per VM and normal BOs which are moved, but that change is not yet
+ * reflected in the page tables.

I have a question here. Why does amdgpu_cs_vm_handling call
amdgpu_vm_bo_update manually for its BO list entries? Wouldn't it be
enough to just call amdgpu_vm_handle_moved?

No, it is still possible that the mapping of the BO was never updated
because the VM page tables where evicted.

We still need to make sure that all BOs of the current submission are
updated correctly.


+ */
+static void amdgpu_vm_bo_moved(struct amdgpu_vm_bo_base *vm_bo)
+{
+struct amdgpu_vm *vm = vm_bo->vm;
+
+spin_lock(&vm->moved_lock);
+list_move(&vm_bo->vm_status, &vm->moved);
+spin_unlock(&vm->moved_lock);

If vm->moved_lock protects the moved list, do we also need to take it
whenever something is moved from that list?

Correct, yes.


That could potentially be
any list_move operation that uses vm_bo->vm_status. I found one case
below where that may not be handled correctly.


+}
+
+/**
+ * amdgpu_vm_bo_idle - vm_bo is idle
+ *
+ * @vm_bo: vm_bo which is now idle
+ *
+ * State for PDs/PTs and per VM BOs which have gone through the state machine
+ * and are now idle.
+ */
+static void amdgpu_vm_bo_idle(struct amdgpu_vm_bo_base *vm_bo)
+{
+list_move(&vm_bo->vm_status, &vm_bo->vm->idle);
+vm_bo->moved = false;
+}
+
   /**
* amdgpu_vm_bo_base_init - Adds bo to the list of bos associated with the vm
*
@@ -232,9 +295,9 @@ static void amdgpu_vm_bo_base_init(struct amdgpu_vm_bo_base 
*base,

  vm->bulk_moveable = false;
  if (bo->tbo.type == ttm_bo_type_kernel)
-list_move(&base->vm_status, &vm->relocated);
+amdgpu_vm_bo_relocated(base);
  else
-list_move(&base->vm_status, &vm->idle);
+amdgpu_vm_bo_idle(base);

  if (bo->preferred_domains &
  amdgpu_mem_type_to_domain(bo->tbo.mem.mem_type))
@@ -245,8 +308,7 @@ static void amdgpu_vm_bo_base_init(st