Re: [PATCH] drm/amdgpu: fix and cleanup amdgpu_gem_object_close v2

2020-03-18 Thread Pan, Xinhui
[AMD Official Use Only - Internal Distribution Only]

Yes, adding the exclusive fence again as a shared one is more reliable.

From: Christian König 
Sent: Wednesday, March 18, 2020 4:03:14 PM
To: Pan, Xinhui 
Cc: amd-gfx@lists.freedesktop.org ; Liu, Monk 

Subject: Re: [PATCH] drm/amdgpu: fix and cleanup amdgpu_gem_object_close v2

The key point is that 10ms should be sufficient for either the move or
the update to finish.
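
For context, a quick sketch of how that 10ms wait maps onto the return convention of dma_resv_wait_timeout_rcu() in this kernel era (which is also why r becomes a long in the patch); illustrative only:

	/* Wait only for the exclusive fence (wait_all = false), don't allow
	 * signals to interrupt (intr = false), give up after 10ms.
	 * Returns < 0 on error, 0 if the wait timed out, or the remaining
	 * timeout in jiffies (> 0) once the fence has signaled. */
	long r = dma_resv_wait_timeout_rcu(bo->tbo.base.resv, false, false,
					   msecs_to_jiffies(10));
	if (r == 0)
		r = -ETIMEDOUT;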

One alternative which came to my mind would be to add the exclusive
fence as shared as well in this case.

This way we won't need to block at all.
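
A rough sketch of what that non-blocking alternative could look like, assuming the dma_resv API of this time frame (the helper below is made up for illustration and is not part of the posted patch):

	/* Re-publish the current exclusive fence as a shared fence so the
	 * "shared signals after exclusive" expectation keeps holding once
	 * the clear fence is added.  Caller holds the reservation lock. */
	static int amdgpu_bo_readd_excl_as_shared(struct amdgpu_bo *bo)
	{
		struct dma_resv *resv = bo->tbo.base.resv;
		struct dma_fence *excl = dma_resv_get_excl(resv);
		int r;

		if (!excl)
			return 0;

		r = dma_resv_reserve_shared(resv, 1);
		if (r)
			return r;	/* e.g. -ENOMEM */

		dma_resv_add_shared_fence(resv, excl);
		return 0;
	}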

Christian.

On 18.03.20 at 09:00, Pan, Xinhui wrote:
> I wonder if it really fixes anything with such a small delay, but it should be no 
> harm anyway.
>
> Reviewed-by: xinhui pan 
>
>> On 2020-03-18 at 15:51, Christian König wrote:
>>
>> Ping? Xinhui, can I get an rb for this?
>>
>> Thanks,
>> Christian.
>>
>> On 16.03.20 at 14:22, Christian König wrote:
>>> The problem is that we can't add the clear fence to the BO
>>> when there is an exclusive fence on it since we can't
>>> guarantee that the clear fence will complete after the
>>> exclusive one.
>>>
>>> To fix this, refactor the function and wait for any potential
>>> exclusive fence with a small timeout before adding the
>>> shared clear fence.
>>>
>>> v2: fix warning
>>>
>>> Signed-off-by: Christian König 
>>> ---
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 43 +++--
>>>   1 file changed, 26 insertions(+), 17 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>>> index 5bec66e6b1f8..49c91dac35a0 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>>> @@ -161,10 +161,11 @@ void amdgpu_gem_object_close(struct drm_gem_object 
>>> *obj,
>>>  struct amdgpu_bo_list_entry vm_pd;
>>>  struct list_head list, duplicates;
>>> +   struct dma_fence *fence = NULL;
>>>  struct ttm_validate_buffer tv;
>>>  struct ww_acquire_ctx ticket;
>>>  struct amdgpu_bo_va *bo_va;
>>> -   int r;
>>> +   long r;
>>>  INIT_LIST_HEAD();
>>>  INIT_LIST_HEAD();
>>> @@ -178,28 +179,36 @@ void amdgpu_gem_object_close(struct drm_gem_object 
>>> *obj,
>>>  r = ttm_eu_reserve_buffers(, , false, );
>>>  if (r) {
>>>  dev_err(adev->dev, "leaking bo va because "
>>> -   "we fail to reserve bo (%d)\n", r);
>>> +   "we fail to reserve bo (%ld)\n", r);
>>>  return;
>>>  }
>>>  bo_va = amdgpu_vm_bo_find(vm, bo);
>>> -   if (bo_va && --bo_va->ref_count == 0) {
>>> -   amdgpu_vm_bo_rmv(adev, bo_va);
>>> +   if (!bo_va || --bo_va->ref_count)
>>> +   goto out_unlock;
>>>   - if (amdgpu_vm_ready(vm)) {
>>> -   struct dma_fence *fence = NULL;
>>> +   amdgpu_vm_bo_rmv(adev, bo_va);
>>> +   if (!amdgpu_vm_ready(vm))
>>> +   goto out_unlock;
>>>   - r = amdgpu_vm_clear_freed(adev, vm, );
>>> -   if (unlikely(r)) {
>>> -   dev_err(adev->dev, "failed to clear page "
>>> -   "tables on GEM object close (%d)\n", r);
>>> -   }
>>>   - if (fence) {
>>> -   amdgpu_bo_fence(bo, fence, true);
>>> -   dma_fence_put(fence);
>>> -   }
>>> -   }
>>> -   }
>>> +   r = amdgpu_vm_clear_freed(adev, vm, );
>>> +   if (r || !fence)
>>> +   goto out_unlock;
>>> +
>>> +   r = dma_resv_wait_timeout_rcu(bo->tbo.base.resv, false, false,
>>> + msecs_to_jiffies(10));
>>> +   if (r == 0)
>>> +   r = -ETIMEDOUT;
>>> +   if (r)
>>> +   goto out_unlock;
>>> +
>>> +   amdgpu_bo_fence(bo, fence, true);
>>> +   dma_fence_put(fence);
>>> +
>>> +out_unlock:
>>> +   if (unlikely(r < 0))
>>> +   dev_err(adev->dev, "failed to clear page "
>>> +   "tables on GEM object close (%ld)\n", r);
>>>  ttm_eu_backoff_reservation(, );
>>>   }
>>>

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [PATCH] drm/amdgpu: fix and cleanup amdgpu_gem_object_close v2

2020-03-18 Thread Christian König
The key point is that 10ms should be sufficient for either the move or
the update to finish.


One alternative which came to my mind would be to add the exclusive 
fence as shared as well in this case.


This way we won't need to block at all.

Christian.

On 18.03.20 at 09:00, Pan, Xinhui wrote:

I wonder if it really fixes anything with such a small delay, but it should be no 
harm anyway.

Reviewed-by: xinhui pan 


On 2020-03-18 at 15:51, Christian König wrote:

Ping? Xinhui, can I get an rb for this?

Thanks,
Christian.

On 16.03.20 at 14:22, Christian König wrote:

The problem is that we can't add the clear fence to the BO
when there is an exclusive fence on it since we can't
guarantee that the clear fence will complete after the
exclusive one.

To fix this, refactor the function and wait for any potential
exclusive fence with a small timeout before adding the
shared clear fence.

v2: fix warning

Signed-off-by: Christian König 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 43 +++--
  1 file changed, 26 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
index 5bec66e6b1f8..49c91dac35a0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
@@ -161,10 +161,11 @@ void amdgpu_gem_object_close(struct drm_gem_object *obj,
struct amdgpu_bo_list_entry vm_pd;
struct list_head list, duplicates;
+   struct dma_fence *fence = NULL;
struct ttm_validate_buffer tv;
struct ww_acquire_ctx ticket;
struct amdgpu_bo_va *bo_va;
-   int r;
+   long r;
INIT_LIST_HEAD();
INIT_LIST_HEAD();
@@ -178,28 +179,36 @@ void amdgpu_gem_object_close(struct drm_gem_object *obj,
r = ttm_eu_reserve_buffers(, , false, );
if (r) {
dev_err(adev->dev, "leaking bo va because "
-   "we fail to reserve bo (%d)\n", r);
+   "we fail to reserve bo (%ld)\n", r);
return;
}
bo_va = amdgpu_vm_bo_find(vm, bo);
-   if (bo_va && --bo_va->ref_count == 0) {
-   amdgpu_vm_bo_rmv(adev, bo_va);
+   if (!bo_va || --bo_va->ref_count)
+   goto out_unlock;
  - if (amdgpu_vm_ready(vm)) {
-   struct dma_fence *fence = NULL;
+   amdgpu_vm_bo_rmv(adev, bo_va);
+   if (!amdgpu_vm_ready(vm))
+   goto out_unlock;
  - r = amdgpu_vm_clear_freed(adev, vm, );
-   if (unlikely(r)) {
-   dev_err(adev->dev, "failed to clear page "
-   "tables on GEM object close (%d)\n", r);
-   }
  - if (fence) {
-   amdgpu_bo_fence(bo, fence, true);
-   dma_fence_put(fence);
-   }
-   }
-   }
+   r = amdgpu_vm_clear_freed(adev, vm, );
+   if (r || !fence)
+   goto out_unlock;
+
+   r = dma_resv_wait_timeout_rcu(bo->tbo.base.resv, false, false,
+ msecs_to_jiffies(10));
+   if (r == 0)
+   r = -ETIMEDOUT;
+   if (r)
+   goto out_unlock;
+
+   amdgpu_bo_fence(bo, fence, true);
+   dma_fence_put(fence);
+
+out_unlock:
+   if (unlikely(r < 0))
+   dev_err(adev->dev, "failed to clear page "
+   "tables on GEM object close (%ld)\n", r);
ttm_eu_backoff_reservation(, );
  }
  


___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [PATCH] drm/amdgpu: fix and cleanup amdgpu_gem_object_close v2

2020-03-18 Thread Pan, Xinhui
I wonder if it really fixes anything with such a small delay, but it should be no 
harm anyway.

Reviewed-by: xinhui pan 

> On 2020-03-18 at 15:51, Christian König wrote:
> 
> Ping? Xinhui, can I get an rb for this?
> 
> Thanks,
> Christian.
> 
> On 16.03.20 at 14:22, Christian König wrote:
>> The problem is that we can't add the clear fence to the BO
>> when there is an exclusive fence on it since we can't
>> guarantee that the clear fence will complete after the
>> exclusive one.
>> 
>> To fix this, refactor the function and wait for any potential
>> exclusive fence with a small timeout before adding the
>> shared clear fence.
>> 
>> v2: fix warning
>> 
>> Signed-off-by: Christian König 
>> ---
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 43 +++--
>>  1 file changed, 26 insertions(+), 17 deletions(-)
>> 
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>> index 5bec66e6b1f8..49c91dac35a0 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>> @@ -161,10 +161,11 @@ void amdgpu_gem_object_close(struct drm_gem_object 
>> *obj,
>>  struct amdgpu_bo_list_entry vm_pd;
>>  struct list_head list, duplicates;
>> +struct dma_fence *fence = NULL;
>>  struct ttm_validate_buffer tv;
>>  struct ww_acquire_ctx ticket;
>>  struct amdgpu_bo_va *bo_va;
>> -int r;
>> +long r;
>>  INIT_LIST_HEAD();
>>  INIT_LIST_HEAD();
>> @@ -178,28 +179,36 @@ void amdgpu_gem_object_close(struct drm_gem_object 
>> *obj,
>>  r = ttm_eu_reserve_buffers(, , false, );
>>  if (r) {
>>  dev_err(adev->dev, "leaking bo va because "
>> -"we fail to reserve bo (%d)\n", r);
>> +"we fail to reserve bo (%ld)\n", r);
>>  return;
>>  }
>>  bo_va = amdgpu_vm_bo_find(vm, bo);
>> -if (bo_va && --bo_va->ref_count == 0) {
>> -amdgpu_vm_bo_rmv(adev, bo_va);
>> +if (!bo_va || --bo_va->ref_count)
>> +goto out_unlock;
>>  -   if (amdgpu_vm_ready(vm)) {
>> -struct dma_fence *fence = NULL;
>> +amdgpu_vm_bo_rmv(adev, bo_va);
>> +if (!amdgpu_vm_ready(vm))
>> +goto out_unlock;
>>  -   r = amdgpu_vm_clear_freed(adev, vm, );
>> -if (unlikely(r)) {
>> -dev_err(adev->dev, "failed to clear page "
>> -"tables on GEM object close (%d)\n", r);
>> -}
>>  -   if (fence) {
>> -amdgpu_bo_fence(bo, fence, true);
>> -dma_fence_put(fence);
>> -}
>> -}
>> -}
>> +r = amdgpu_vm_clear_freed(adev, vm, );
>> +if (r || !fence)
>> +goto out_unlock;
>> +
>> +r = dma_resv_wait_timeout_rcu(bo->tbo.base.resv, false, false,
>> +  msecs_to_jiffies(10));
>> +if (r == 0)
>> +r = -ETIMEDOUT;
>> +if (r)
>> +goto out_unlock;
>> +
>> +amdgpu_bo_fence(bo, fence, true);
>> +dma_fence_put(fence);
>> +
>> +out_unlock:
>> +if (unlikely(r < 0))
>> +dev_err(adev->dev, "failed to clear page "
>> +"tables on GEM object close (%ld)\n", r);
>>  ttm_eu_backoff_reservation(, );
>>  }
>>  
> 

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [PATCH] drm/amdgpu: fix and cleanup amdgpu_gem_object_close v2

2020-03-18 Thread Christian König

Ping? Xinhui, can I get an rb for this?

Thanks,
Christian.

On 16.03.20 at 14:22, Christian König wrote:

The problem is that we can't add the clear fence to the BO
when there is an exclusive fence on it since we can't
guarantee that the clear fence will complete after the
exclusive one.

To fix this, refactor the function and wait for any potential
exclusive fence with a small timeout before adding the
shared clear fence.

v2: fix warning

Signed-off-by: Christian König 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 43 +++--
  1 file changed, 26 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
index 5bec66e6b1f8..49c91dac35a0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
@@ -161,10 +161,11 @@ void amdgpu_gem_object_close(struct drm_gem_object *obj,
  
  	struct amdgpu_bo_list_entry vm_pd;

struct list_head list, duplicates;
+   struct dma_fence *fence = NULL;
struct ttm_validate_buffer tv;
struct ww_acquire_ctx ticket;
struct amdgpu_bo_va *bo_va;
-   int r;
+   long r;
  
  	INIT_LIST_HEAD();

INIT_LIST_HEAD();
@@ -178,28 +179,36 @@ void amdgpu_gem_object_close(struct drm_gem_object *obj,
r = ttm_eu_reserve_buffers(, , false, );
if (r) {
dev_err(adev->dev, "leaking bo va because "
-   "we fail to reserve bo (%d)\n", r);
+   "we fail to reserve bo (%ld)\n", r);
return;
}
bo_va = amdgpu_vm_bo_find(vm, bo);
-   if (bo_va && --bo_va->ref_count == 0) {
-   amdgpu_vm_bo_rmv(adev, bo_va);
+   if (!bo_va || --bo_va->ref_count)
+   goto out_unlock;
  
-		if (amdgpu_vm_ready(vm)) {

-   struct dma_fence *fence = NULL;
+   amdgpu_vm_bo_rmv(adev, bo_va);
+   if (!amdgpu_vm_ready(vm))
+   goto out_unlock;
  
-			r = amdgpu_vm_clear_freed(adev, vm, );

-   if (unlikely(r)) {
-   dev_err(adev->dev, "failed to clear page "
-   "tables on GEM object close (%d)\n", r);
-   }
  
-			if (fence) {

-   amdgpu_bo_fence(bo, fence, true);
-   dma_fence_put(fence);
-   }
-   }
-   }
+   r = amdgpu_vm_clear_freed(adev, vm, );
+   if (r || !fence)
+   goto out_unlock;
+
+   r = dma_resv_wait_timeout_rcu(bo->tbo.base.resv, false, false,
+ msecs_to_jiffies(10));
+   if (r == 0)
+   r = -ETIMEDOUT;
+   if (r)
+   goto out_unlock;
+
+   amdgpu_bo_fence(bo, fence, true);
+   dma_fence_put(fence);
+
+out_unlock:
+   if (unlikely(r < 0))
+   dev_err(adev->dev, "failed to clear page "
+   "tables on GEM object close (%ld)\n", r);
ttm_eu_backoff_reservation(, );
  }
  


___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH] drm/amdgpu: fix and cleanup amdgpu_gem_object_close v2

2020-03-16 Thread Christian König
The problem is that we can't add the clear fence to the BO
when there is an exclusive fence on it since we can't
guarantee that the clear fence will complete after the
exclusive one.

To fix this, refactor the function and wait for any potential
exclusive fence with a small timeout before adding the
shared clear fence.

v2: fix warning

Signed-off-by: Christian König 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 43 +++--
 1 file changed, 26 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
index 5bec66e6b1f8..49c91dac35a0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
@@ -161,10 +161,11 @@ void amdgpu_gem_object_close(struct drm_gem_object *obj,
 
struct amdgpu_bo_list_entry vm_pd;
struct list_head list, duplicates;
+   struct dma_fence *fence = NULL;
struct ttm_validate_buffer tv;
struct ww_acquire_ctx ticket;
struct amdgpu_bo_va *bo_va;
-   int r;
+   long r;
 
INIT_LIST_HEAD(&list);
INIT_LIST_HEAD(&duplicates);
@@ -178,28 +179,36 @@ void amdgpu_gem_object_close(struct drm_gem_object *obj,
r = ttm_eu_reserve_buffers(&ticket, &list, false, &duplicates);
if (r) {
dev_err(adev->dev, "leaking bo va because "
-   "we fail to reserve bo (%d)\n", r);
+   "we fail to reserve bo (%ld)\n", r);
return;
}
bo_va = amdgpu_vm_bo_find(vm, bo);
-   if (bo_va && --bo_va->ref_count == 0) {
-   amdgpu_vm_bo_rmv(adev, bo_va);
+   if (!bo_va || --bo_va->ref_count)
+   goto out_unlock;
 
-   if (amdgpu_vm_ready(vm)) {
-   struct dma_fence *fence = NULL;
+   amdgpu_vm_bo_rmv(adev, bo_va);
+   if (!amdgpu_vm_ready(vm))
+   goto out_unlock;
 
-   r = amdgpu_vm_clear_freed(adev, vm, &fence);
-   if (unlikely(r)) {
-   dev_err(adev->dev, "failed to clear page "
-   "tables on GEM object close (%d)\n", r);
-   }
 
-   if (fence) {
-   amdgpu_bo_fence(bo, fence, true);
-   dma_fence_put(fence);
-   }
-   }
-   }
+   r = amdgpu_vm_clear_freed(adev, vm, &fence);
+   if (r || !fence)
+   goto out_unlock;
+
+   r = dma_resv_wait_timeout_rcu(bo->tbo.base.resv, false, false,
+ msecs_to_jiffies(10));
+   if (r == 0)
+   r = -ETIMEDOUT;
+   if (r)
+   goto out_unlock;
+
+   amdgpu_bo_fence(bo, fence, true);
+   dma_fence_put(fence);
+
+out_unlock:
+   if (unlikely(r < 0))
+   dev_err(adev->dev, "failed to clear page "
+   "tables on GEM object close (%ld)\n", r);
ttm_eu_backoff_reservation(&ticket, &list);
 }
 
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [PATCH] drm/amdgpu: fix and cleanup amdgpu_gem_object_close

2020-03-12 Thread Christian König

On 12.03.20 at 16:03, Liu, Monk wrote:

The problem is that dma_resv_test_signaled_rcu() tests only the shared fence if 
one is present.

Okay, I got the point now, but why can't we modify dma_resv_test_signaled_rcu() 
to make it wait for both the exclusive and the shared lists?


That is exactly what Xinhui and I said as well, and what we both 
proposed in patches, but the Intel guys are against that.


I also already proposed an extension to the dma_resv infrastructure 
where you get a list of "other" fences for stuff like this, but it 
wasn't too well received either, and I can't dedicate more time to this.


Regards,
Christian.




Acked-by: Monk Liu 
_
Monk Liu|GPU Virtualization Team |AMD


-Original Message-
From: Koenig, Christian 
Sent: Thursday, March 12, 2020 9:42 PM
To: Liu, Monk ; Pan, Xinhui ; 
amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH] drm/amdgpu: fix and cleanup amdgpu_gem_object_close

The problem is that dma_resv_test_signaled_rcu() tests only the shared fence if 
one is present.

Now what happened is that the clear fence completed before the exclusive one, 
and that in turn caused TTM to think that the BO is unused and freed it.

Christian.

On 12.03.20 at 14:25, Liu, Monk wrote:

Without your patch, the clear fence is also hooked into the shared list
of the BO's reservation object, no matter whether the exclusive fence of that BO
signaled before the clear fence or not.

Since the clear fence is always kept in the BO's resv object, can you tell me 
what the problem is then? Are we going to lose this clear fence and still use 
it during the VM pt/pd clearing?

thanks
_
Monk Liu|GPU Virtualization Team |AMD


-Original Message-
From: Christian König 
Sent: Thursday, March 12, 2020 8:50 PM
To: Liu, Monk ; Pan, Xinhui ;
amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH] drm/amdgpu: fix and cleanup
amdgpu_gem_object_close

   From the semantic the dma_resv object contains a single exclusive and 
multiple shared fences and it is mandatory that the shared fences complete 
after the exclusive one.

Now what happens is that clearing the VM page tables runs asynchronously to the 
exclusive fence which moves the buffer around.

And because of this Xinhui ran into a rare problem that TTM thought it could 
reuse the memory while in reality it was still in use.

Regards,
Christian.

On 12.03.20 at 12:30, Liu, Monk wrote:

Can you give more details about " we can't guarantee that the clear fence will 
complete after the exclusive one." ?

Thanks

_
Monk Liu|GPU Virtualization Team |AMD


-Original Message-
From: amd-gfx  On Behalf Of
Christian König 
Sent: Thursday, March 12, 2020 7:12 PM
To: Pan, Xinhui ; amd-gfx@lists.freedesktop.org
Subject: [PATCH] drm/amdgpu: fix and cleanup amdgpu_gem_object_close

The problem is that we can't add the clear fence to the BO when there is an 
exclusive fence on it since we can't guarantee that the clear fence will 
complete after the exclusive one.

To fix this, refactor the function and wait for any potential exclusive fence 
with a small timeout before adding the shared clear fence.

Signed-off-by: Christian König 
---
drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 41 +++--
1 file changed, 25 insertions(+), 16 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
index 4277125a79ee..0782162fb5f3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
@@ -161,10 +161,11 @@ void amdgpu_gem_object_close(struct
drm_gem_object *obj,

	struct amdgpu_bo_list_entry vm_pd;

struct list_head list, duplicates;
+   struct dma_fence *fence = NULL;
struct ttm_validate_buffer tv;
struct ww_acquire_ctx ticket;
struct amdgpu_bo_va *bo_va;
-   int r;
+   long r;

	INIT_LIST_HEAD();

INIT_LIST_HEAD();
@@ -182,24 +183,32 @@ void amdgpu_gem_object_close(struct drm_gem_object *obj,
return;
}
bo_va = amdgpu_vm_bo_find(vm, bo);
-   if (bo_va && --bo_va->ref_count == 0) {
-   amdgpu_vm_bo_rmv(adev, bo_va);
+   if (!bo_va || --bo_va->ref_count)
+   goto out_unlock;

-		if (amdgpu_vm_ready(vm)) {

-   struct dma_fence *fence = NULL;
+   amdgpu_vm_bo_rmv(adev, bo_va);
+   if (!amdgpu_vm_ready(vm))
+   goto out_unlock;

-			r = amdgpu_vm_clear_freed(adev, vm, );

-   if (unlikely(r)) {
-   dev_err(adev->dev, "failed to clear page "
-   "tables on GEM object close (%d)\n", r);
-   }

-			if (fence) {

-   amdgpu_bo_fence(bo, fence, t

RE: [PATCH] drm/amdgpu: fix and cleanup amdgpu_gem_object_close

2020-03-12 Thread Liu, Monk
>>> The problem is that dma_resv_test_signaled_rcu() tests only the shared 
>>> fence if one is present.
Okay, I got the point now, but why can't we modify dma_resv_test_signaled_rcu() 
to make it wait for both the exclusive and the shared lists?
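
For reference, a sketch of the kind of check being asked about here, i.e. looking at the exclusive fence in addition to the shared list (hypothetical helper built on the existing API, not a proposed kernel change):

	static bool resv_all_fences_signaled(struct dma_resv *obj)
	{
		/* Takes a reference on the exclusive fence, if any. */
		struct dma_fence *excl = dma_resv_get_excl_rcu(obj);
		/* Checks the shared fences; the exclusive fence is skipped
		 * whenever shared fences are present, which is exactly the
		 * ordering assumption discussed in this thread. */
		bool idle = dma_resv_test_signaled_rcu(obj, true);

		if (excl) {
			idle = idle && dma_fence_is_signaled(excl);
			dma_fence_put(excl);
		}
		return idle;
	}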


Acked-by: Monk Liu 
_
Monk Liu|GPU Virtualization Team |AMD


-Original Message-
From: Koenig, Christian  
Sent: Thursday, March 12, 2020 9:42 PM
To: Liu, Monk ; Pan, Xinhui ; 
amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH] drm/amdgpu: fix and cleanup amdgpu_gem_object_close

The problem is that dma_resv_test_signaled_rcu() tests only the shared fence if 
one is present.

Now what happened is that the clear fence completed before the exclusive one, 
and that in turn caused TTM to think that the BO is unused and freed it.

Christian.

On 12.03.20 at 14:25, Liu, Monk wrote:
> Without your patch, the clear fence is also hooked into the shared list 
> of the BO's reservation object, no matter whether the exclusive fence of that BO 
> signaled before the clear fence or not.
>
> Since the clear fence is always kept in the BO's resv object, can you tell me 
> what the problem is then? Are we going to lose this clear fence and still 
> use it during the VM pt/pd clearing?
>
> thanks
> _
> Monk Liu|GPU Virtualization Team |AMD
>
>
> -Original Message-
> From: Christian König 
> Sent: Thursday, March 12, 2020 8:50 PM
> To: Liu, Monk ; Pan, Xinhui ; 
> amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH] drm/amdgpu: fix and cleanup 
> amdgpu_gem_object_close
>
>   From the semantic the dma_resv object contains a single exclusive and 
> multiple shared fences and it is mandatory that the shared fences complete 
> after the exclusive one.
>
> Now what happens is that clearing the VM page tables runs asynchronously to 
> the exclusive fence which moves the buffer around.
>
> And because of this Xinhui ran into a rare problem that TTM thought it could 
> reuse the memory while in reality it was still in use.
>
> Regards,
> Christian.
>
> On 12.03.20 at 12:30, Liu, Monk wrote:
>> Can you give more details about " we can't guarantee that the clear fence 
>> will complete after the exclusive one." ?
>>
>> Thanks
>>
>> _
>> Monk Liu|GPU Virtualization Team |AMD
>>
>>
>> -----Original Message-
>> From: amd-gfx  On Behalf Of 
>> Christian König
>> Sent: Thursday, March 12, 2020 7:12 PM
>> To: Pan, Xinhui ; amd-gfx@lists.freedesktop.org
>> Subject: [PATCH] drm/amdgpu: fix and cleanup amdgpu_gem_object_close
>>
>> The problem is that we can't add the clear fence to the BO when there is an 
>> exclusive fence on it since we can't guarantee that the clear fence will 
>> complete after the exclusive one.
>>
>> To fix this, refactor the function and wait for any potential exclusive fence 
>> with a small timeout before adding the shared clear fence.
>>
>> Signed-off-by: Christian König 
>> ---
>>drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 41 +++--
>>1 file changed, 25 insertions(+), 16 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>> index 4277125a79ee..0782162fb5f3 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>> @@ -161,10 +161,11 @@ void amdgpu_gem_object_close(struct 
>> drm_gem_object *obj,
>>
>>  struct amdgpu_bo_list_entry vm_pd;
>>  struct list_head list, duplicates;
>> +struct dma_fence *fence = NULL;
>>  struct ttm_validate_buffer tv;
>>  struct ww_acquire_ctx ticket;
>>  struct amdgpu_bo_va *bo_va;
>> -int r;
>> +long r;
>>
>>  INIT_LIST_HEAD();
>>  INIT_LIST_HEAD();
>> @@ -182,24 +183,32 @@ void amdgpu_gem_object_close(struct drm_gem_object 
>> *obj,
>>  return;
>>  }
>>  bo_va = amdgpu_vm_bo_find(vm, bo);
>> -if (bo_va && --bo_va->ref_count == 0) {
>> -amdgpu_vm_bo_rmv(adev, bo_va);
>> +if (!bo_va || --bo_va->ref_count)
>> +goto out_unlock;
>>
>> -if (amdgpu_vm_ready(vm)) {
>> -struct dma_fence *fence = NULL;
>> +amdgpu_vm_bo_rmv(adev, bo_va);
>> +if (!amdgpu_vm_ready(vm))
>> +goto out_unlock;
>>
>> -r = amdgpu_vm_clear_freed(adev, vm, );
>> -if (unlikely(r)) {
>> - 

Re: [PATCH] drm/amdgpu: fix and cleanup amdgpu_gem_object_close

2020-03-12 Thread Christian König
The problem is that dma_resv_test_signaled_rcu() tests only the shared 
fence if one is present.


Now what happened is that the clear fence completed before the exclusive 
one, and that in turn caused TTM to think that the BO is unused and 
freed it.
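
To illustrate that, a simplified paraphrase of what the v5.6-era dma_resv_test_signaled_rcu(obj, true) effectively checks (not the literal kernel code, and it assumes the reservation lock is held instead of the real lockless RCU walk):

	static bool resv_test_signaled_paraphrase(struct dma_resv *obj)
	{
		struct dma_resv_list *shared = dma_resv_get_list(obj);
		struct dma_fence *excl;
		unsigned int i;

		/* With shared fences present only those are checked; the
		 * exclusive fence is assumed to signal before them. */
		if (shared && shared->shared_count) {
			for (i = 0; i < shared->shared_count; ++i) {
				struct dma_fence *f =
					rcu_dereference_protected(shared->shared[i],
								  dma_resv_held(obj));
				if (!dma_fence_is_signaled(f))
					return false;
			}
			return true;
		}

		/* Only without any shared fence is the exclusive one tested. */
		excl = dma_resv_get_excl(obj);
		return !excl || dma_fence_is_signaled(excl);
	}

So once the clear fence (a shared fence) signals before the exclusive move fence, the object looks idle even though the move is still in flight.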


Christian.

On 12.03.20 at 14:25, Liu, Monk wrote:

Without your patch, the clear fence is also hooked into the shared list of the 
BO's reservation object, no matter whether the exclusive fence of that BO signaled 
before the clear fence or not.

Since the clear fence is always kept in the BO's resv object, can you tell me 
what the problem is then? Are we going to lose this clear fence and still use 
it during the VM pt/pd clearing?

thanks
_
Monk Liu|GPU Virtualization Team |AMD


-Original Message-
From: Christian König 
Sent: Thursday, March 12, 2020 8:50 PM
To: Liu, Monk ; Pan, Xinhui ; 
amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH] drm/amdgpu: fix and cleanup amdgpu_gem_object_close

  From the semantic the dma_resv object contains a single exclusive and 
multiple shared fences and it is mandatory that the shared fences complete 
after the exclusive one.

Now what happens is that clearing the VM page tables runs asynchronously to the 
exclusive fence which moves the buffer around.

And because of this Xinhui ran into a rare problem that TTM thought it could 
reuse the memory while in reality it was still in use.

Regards,
Christian.

On 12.03.20 at 12:30, Liu, Monk wrote:

Can you give more details about " we can't guarantee that the clear fence will 
complete after the exclusive one." ?

Thanks

_
Monk Liu|GPU Virtualization Team |AMD


-Original Message-
From: amd-gfx  On Behalf Of
Christian König
Sent: Thursday, March 12, 2020 7:12 PM
To: Pan, Xinhui ; amd-gfx@lists.freedesktop.org
Subject: [PATCH] drm/amdgpu: fix and cleanup amdgpu_gem_object_close

The problem is that we can't add the clear fence to the BO when there is an 
exclusive fence on it since we can't guarantee that the clear fence will 
complete after the exclusive one.

To fix this, refactor the function and wait for any potential exclusive fence 
with a small timeout before adding the shared clear fence.

Signed-off-by: Christian König 
---
   drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 41 +++--
   1 file changed, 25 insertions(+), 16 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
index 4277125a79ee..0782162fb5f3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
@@ -161,10 +161,11 @@ void amdgpu_gem_object_close(struct
drm_gem_object *obj,
   
   	struct amdgpu_bo_list_entry vm_pd;

struct list_head list, duplicates;
+   struct dma_fence *fence = NULL;
struct ttm_validate_buffer tv;
struct ww_acquire_ctx ticket;
struct amdgpu_bo_va *bo_va;
-   int r;
+   long r;
   
   	INIT_LIST_HEAD();

INIT_LIST_HEAD();
@@ -182,24 +183,32 @@ void amdgpu_gem_object_close(struct drm_gem_object *obj,
return;
}
bo_va = amdgpu_vm_bo_find(vm, bo);
-   if (bo_va && --bo_va->ref_count == 0) {
-   amdgpu_vm_bo_rmv(adev, bo_va);
+   if (!bo_va || --bo_va->ref_count)
+   goto out_unlock;
   
-		if (amdgpu_vm_ready(vm)) {

-   struct dma_fence *fence = NULL;
+   amdgpu_vm_bo_rmv(adev, bo_va);
+   if (!amdgpu_vm_ready(vm))
+   goto out_unlock;
   
-			r = amdgpu_vm_clear_freed(adev, vm, );

-   if (unlikely(r)) {
-   dev_err(adev->dev, "failed to clear page "
-   "tables on GEM object close (%d)\n", r);
-   }
   
-			if (fence) {

-   amdgpu_bo_fence(bo, fence, true);
-   dma_fence_put(fence);
-   }
-   }
-   }
+   r = amdgpu_vm_clear_freed(adev, vm, );
+   if (r || !fence)
+   goto out_unlock;
+
+   r = dma_resv_wait_timeout_rcu(bo->tbo.base.resv, false, false,
+ msecs_to_jiffies(10));
+   if (r == 0)
+   r = -ETIMEDOUT;
+   if (r)
+   goto out_unlock;
+
+   amdgpu_bo_fence(bo, fence, true);
+   dma_fence_put(fence);
+
+out_unlock:
+   if (unlikely(r))
+   dev_err(adev->dev, "failed to clear page "
+   "tables on GEM object close (%d)\n", r);
ttm_eu_backoff_reservation(, );  }
   
--

2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

RE: [PATCH] drm/amdgpu: fix and cleanup amdgpu_gem_object_close

2020-03-12 Thread Liu, Monk
>> Now what happens is that clearing the VM page tables runs asynchronously to 
>> the exclusive fence which moves the buffer around.

The amdgpu_vm_clear_freed() is already kicked off before you wait for the 
exclusive fence to signal, so how does introducing a dma_resv_wait_timeout_rcu()
*after* amdgpu_vm_clear_freed() avoid the PT clearing overlapping with the
"move" action?
_
Monk Liu|GPU Virtualization Team |AMD


-Original Message-
From: Christian König  
Sent: Thursday, March 12, 2020 8:50 PM
To: Liu, Monk ; Pan, Xinhui ; 
amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH] drm/amdgpu: fix and cleanup amdgpu_gem_object_close

 From the semantic the dma_resv object contains a single exclusive and multiple 
shared fences and it is mandatory that the shared fences complete after the 
exclusive one.

Now what happens is that clearing the VM page tables runs asynchronously to the 
exclusive fence which moves the buffer around.

And because of this Xinhui ran into a rare problem that TTM thought it could 
reuse the memory while in reality it was still in use.

Regards,
Christian.

On 12.03.20 at 12:30, Liu, Monk wrote:
> Can you give more details about " we can't guarantee that the clear fence will 
> complete after the exclusive one." ?
>
> Thanks
>
> _
> Monk Liu|GPU Virtualization Team |AMD
>
>
> -Original Message-
> From: amd-gfx  On Behalf Of 
> Christian K?nig
> Sent: Thursday, March 12, 2020 7:12 PM
> To: Pan, Xinhui ; amd-gfx@lists.freedesktop.org
> Subject: [PATCH] drm/amdgpu: fix and cleanup amdgpu_gem_object_close
>
> The problem is that we can't add the clear fence to the BO when there is an 
> exclusive fence on it since we can't guarantee that the clear fence will 
> complete after the exclusive one.
>
> To fix this, refactor the function and wait for any potential exclusive fence 
> with a small timeout before adding the shared clear fence.
>
> Signed-off-by: Christian König 
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 41 +++--
>   1 file changed, 25 insertions(+), 16 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> index 4277125a79ee..0782162fb5f3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> @@ -161,10 +161,11 @@ void amdgpu_gem_object_close(struct 
> drm_gem_object *obj,
>   
>   struct amdgpu_bo_list_entry vm_pd;
>   struct list_head list, duplicates;
> + struct dma_fence *fence = NULL;
>   struct ttm_validate_buffer tv;
>   struct ww_acquire_ctx ticket;
>   struct amdgpu_bo_va *bo_va;
> - int r;
> + long r;
>   
>   INIT_LIST_HEAD();
>   INIT_LIST_HEAD();
> @@ -182,24 +183,32 @@ void amdgpu_gem_object_close(struct drm_gem_object *obj,
>   return;
>   }
>   bo_va = amdgpu_vm_bo_find(vm, bo);
> - if (bo_va && --bo_va->ref_count == 0) {
> - amdgpu_vm_bo_rmv(adev, bo_va);
> + if (!bo_va || --bo_va->ref_count)
> + goto out_unlock;
>   
> - if (amdgpu_vm_ready(vm)) {
> - struct dma_fence *fence = NULL;
> + amdgpu_vm_bo_rmv(adev, bo_va);
> + if (!amdgpu_vm_ready(vm))
> + goto out_unlock;
>   
> - r = amdgpu_vm_clear_freed(adev, vm, );
> - if (unlikely(r)) {
> - dev_err(adev->dev, "failed to clear page "
> - "tables on GEM object close (%d)\n", r);
> - }
>   
> - if (fence) {
> - amdgpu_bo_fence(bo, fence, true);
> - dma_fence_put(fence);
> - }
> - }
> - }
> + r = amdgpu_vm_clear_freed(adev, vm, );
> + if (r || !fence)
> + goto out_unlock;
> +
> + r = dma_resv_wait_timeout_rcu(bo->tbo.base.resv, false, false,
> +   msecs_to_jiffies(10));
> + if (r == 0)
> + r = -ETIMEDOUT;
> + if (r)
> + goto out_unlock;
> +
> + amdgpu_bo_fence(bo, fence, true);
> + dma_fence_put(fence);
> +
> +out_unlock:
> + if (unlikely(r))
> + dev_err(adev->dev, "failed to clear page "
> + "tables on GEM object close (%d)\n", r);
>   ttm_eu_backoff_reservation(, );  }
>   
> --
> 2.17.1
>
> ___
> amd-gfx mailing list
> amd-gfx@

RE: [PATCH] drm/amdgpu: fix and cleanup amdgpu_gem_object_close

2020-03-12 Thread Liu, Monk
Without your patch, the clear fence is also hooked into the shared list of the 
BO's reservation object, no matter whether the exclusive fence of that BO signaled 
before the clear fence or not.

Since the clear fence is always kept in the BO's resv object, can you tell me 
what the problem is then? Are we going to lose this clear fence and still use 
it during the VM pt/pd clearing?

thanks 
_
Monk Liu|GPU Virtualization Team |AMD


-Original Message-
From: Christian König  
Sent: Thursday, March 12, 2020 8:50 PM
To: Liu, Monk ; Pan, Xinhui ; 
amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH] drm/amdgpu: fix and cleanup amdgpu_gem_object_close

 From the semantic the dma_resv object contains a single exclusive and multiple 
shared fences and it is mandatory that the shared fences complete after the 
exclusive one.

Now what happens is that clearing the VM page tables runs asynchronously to the 
exclusive fence which moves the buffer around.

And because of this Xinhui ran into a rare problem that TTM thought it could 
reuse the memory while in reality it was still in use.

Regards,
Christian.

On 12.03.20 at 12:30, Liu, Monk wrote:
> Can you give more details about " we can't guarantee that the clear fence will 
> complete after the exclusive one." ?
>
> Thanks
>
> _
> Monk Liu|GPU Virtualization Team |AMD
>
>
> -Original Message-
> From: amd-gfx  On Behalf Of 
> Christian K?nig
> Sent: Thursday, March 12, 2020 7:12 PM
> To: Pan, Xinhui ; amd-gfx@lists.freedesktop.org
> Subject: [PATCH] drm/amdgpu: fix and cleanup amdgpu_gem_object_close
>
> The problem is that we can't add the clear fence to the BO when there is an 
> exclusive fence on it since we can't guarantee that the clear fence will 
> complete after the exclusive one.
>
> To fix this, refactor the function and wait for any potential exclusive fence 
> with a small timeout before adding the shared clear fence.
>
> Signed-off-by: Christian König 
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 41 +++--
>   1 file changed, 25 insertions(+), 16 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> index 4277125a79ee..0782162fb5f3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> @@ -161,10 +161,11 @@ void amdgpu_gem_object_close(struct 
> drm_gem_object *obj,
>   
>   struct amdgpu_bo_list_entry vm_pd;
>   struct list_head list, duplicates;
> + struct dma_fence *fence = NULL;
>   struct ttm_validate_buffer tv;
>   struct ww_acquire_ctx ticket;
>   struct amdgpu_bo_va *bo_va;
> - int r;
> + long r;
>   
>   INIT_LIST_HEAD();
>   INIT_LIST_HEAD();
> @@ -182,24 +183,32 @@ void amdgpu_gem_object_close(struct drm_gem_object *obj,
>   return;
>   }
>   bo_va = amdgpu_vm_bo_find(vm, bo);
> - if (bo_va && --bo_va->ref_count == 0) {
> - amdgpu_vm_bo_rmv(adev, bo_va);
> + if (!bo_va || --bo_va->ref_count)
> + goto out_unlock;
>   
> - if (amdgpu_vm_ready(vm)) {
> - struct dma_fence *fence = NULL;
> + amdgpu_vm_bo_rmv(adev, bo_va);
> + if (!amdgpu_vm_ready(vm))
> + goto out_unlock;
>   
> - r = amdgpu_vm_clear_freed(adev, vm, );
> - if (unlikely(r)) {
> - dev_err(adev->dev, "failed to clear page "
> - "tables on GEM object close (%d)\n", r);
> - }
>   
> - if (fence) {
> - amdgpu_bo_fence(bo, fence, true);
> - dma_fence_put(fence);
> - }
> - }
> - }
> + r = amdgpu_vm_clear_freed(adev, vm, );
> + if (r || !fence)
> + goto out_unlock;
> +
> + r = dma_resv_wait_timeout_rcu(bo->tbo.base.resv, false, false,
> +   msecs_to_jiffies(10));
> + if (r == 0)
> + r = -ETIMEDOUT;
> + if (r)
> + goto out_unlock;
> +
> + amdgpu_bo_fence(bo, fence, true);
> + dma_fence_put(fence);
> +
> +out_unlock:
> + if (unlikely(r))
> + dev_err(adev->dev, "failed to clear page "
> + "tables on GEM object close (%d)\n", r);
>   ttm_eu_backoff_reservation(, );  }
>   
> --
> 2.17.1
>
> ___
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Re: [PATCH] drm/amdgpu: fix and cleanup amdgpu_gem_object_close

2020-03-12 Thread Christian König
From the semantics, the dma_resv object contains a single exclusive and 
multiple shared fences and it is mandatory that the shared fences 
complete after the exclusive one.


Now what happens is that clearing the VM page tables runs asynchronously 
to the exclusive fence which moves the buffer around.


And because of this Xinhui ran into a rare problem that TTM thought it 
could reuse the memory while in reality it was still in use.
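
A sketch of the invariant being described, written as a hypothetical debug check (not existing kernel code; assumes the reservation lock is held):

	/* While the exclusive fence is still pending, no shared fence should
	 * have signaled yet; the clear fence broke exactly this rule. */
	static bool resv_ordering_holds(struct dma_resv *resv)
	{
		struct dma_fence *excl = dma_resv_get_excl(resv);
		struct dma_resv_list *list = dma_resv_get_list(resv);
		unsigned int i;

		if (!excl || dma_fence_is_signaled(excl))
			return true;

		for (i = 0; list && i < list->shared_count; ++i) {
			struct dma_fence *f =
				rcu_dereference_protected(list->shared[i],
							  dma_resv_held(resv));
			if (dma_fence_is_signaled(f))
				return false;	/* shared signaled first */
		}
		return true;
	}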


Regards,
Christian.

On 12.03.20 at 12:30, Liu, Monk wrote:

Can you give more details about " we can't guarantee that the clear fence will 
complete after the exclusive one." ?

Thanks

_
Monk Liu|GPU Virtualization Team |AMD


-Original Message-
From: amd-gfx  On Behalf Of Christian 
König
Sent: Thursday, March 12, 2020 7:12 PM
To: Pan, Xinhui ; amd-gfx@lists.freedesktop.org
Subject: [PATCH] drm/amdgpu: fix and cleanup amdgpu_gem_object_close

The problem is that we can't add the clear fence to the BO when there is an 
exclusive fence on it since we can't guarantee that the clear fence will 
complete after the exclusive one.

To fix this, refactor the function and wait for any potential exclusive fence 
with a small timeout before adding the shared clear fence.

Signed-off-by: Christian König 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 41 +++--
  1 file changed, 25 insertions(+), 16 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
index 4277125a79ee..0782162fb5f3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
@@ -161,10 +161,11 @@ void amdgpu_gem_object_close(struct drm_gem_object *obj,
  
  	struct amdgpu_bo_list_entry vm_pd;

struct list_head list, duplicates;
+   struct dma_fence *fence = NULL;
struct ttm_validate_buffer tv;
struct ww_acquire_ctx ticket;
struct amdgpu_bo_va *bo_va;
-   int r;
+   long r;
  
  	INIT_LIST_HEAD();

INIT_LIST_HEAD();
@@ -182,24 +183,32 @@ void amdgpu_gem_object_close(struct drm_gem_object *obj,
return;
}
bo_va = amdgpu_vm_bo_find(vm, bo);
-   if (bo_va && --bo_va->ref_count == 0) {
-   amdgpu_vm_bo_rmv(adev, bo_va);
+   if (!bo_va || --bo_va->ref_count)
+   goto out_unlock;
  
-		if (amdgpu_vm_ready(vm)) {

-   struct dma_fence *fence = NULL;
+   amdgpu_vm_bo_rmv(adev, bo_va);
+   if (!amdgpu_vm_ready(vm))
+   goto out_unlock;
  
-			r = amdgpu_vm_clear_freed(adev, vm, );

-   if (unlikely(r)) {
-   dev_err(adev->dev, "failed to clear page "
-   "tables on GEM object close (%d)\n", r);
-   }
  
-			if (fence) {

-   amdgpu_bo_fence(bo, fence, true);
-   dma_fence_put(fence);
-   }
-   }
-   }
+   r = amdgpu_vm_clear_freed(adev, vm, );
+   if (r || !fence)
+   goto out_unlock;
+
+   r = dma_resv_wait_timeout_rcu(bo->tbo.base.resv, false, false,
+ msecs_to_jiffies(10));
+   if (r == 0)
+   r = -ETIMEDOUT;
+   if (r)
+   goto out_unlock;
+
+   amdgpu_bo_fence(bo, fence, true);
+   dma_fence_put(fence);
+
+out_unlock:
+   if (unlikely(r))
+   dev_err(adev->dev, "failed to clear page "
+   "tables on GEM object close (%d)\n", r);
ttm_eu_backoff_reservation(, );  }
  
--

2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


RE: [PATCH] drm/amdgpu: fix and cleanup amdgpu_gem_object_close

2020-03-12 Thread Liu, Monk
Can you give more details about " we can't guarantee that the clear fence will 
complete after the exclusive one." ?

Thanks 

_
Monk Liu|GPU Virtualization Team |AMD


-Original Message-
From: amd-gfx  On Behalf Of Christian 
König
Sent: Thursday, March 12, 2020 7:12 PM
To: Pan, Xinhui ; amd-gfx@lists.freedesktop.org
Subject: [PATCH] drm/amdgpu: fix and cleanup amdgpu_gem_object_close

The problem is that we can't add the clear fence to the BO when there is an 
exclusive fence on it since we can't guarantee that the clear fence will 
complete after the exclusive one.

To fix this, refactor the function and wait for any potential exclusive fence 
with a small timeout before adding the shared clear fence.

Signed-off-by: Christian König 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 41 +++--
 1 file changed, 25 insertions(+), 16 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
index 4277125a79ee..0782162fb5f3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
@@ -161,10 +161,11 @@ void amdgpu_gem_object_close(struct drm_gem_object *obj,
 
struct amdgpu_bo_list_entry vm_pd;
struct list_head list, duplicates;
+   struct dma_fence *fence = NULL;
struct ttm_validate_buffer tv;
struct ww_acquire_ctx ticket;
struct amdgpu_bo_va *bo_va;
-   int r;
+   long r;
 
INIT_LIST_HEAD();
INIT_LIST_HEAD();
@@ -182,24 +183,32 @@ void amdgpu_gem_object_close(struct drm_gem_object *obj,
return;
}
bo_va = amdgpu_vm_bo_find(vm, bo);
-   if (bo_va && --bo_va->ref_count == 0) {
-   amdgpu_vm_bo_rmv(adev, bo_va);
+   if (!bo_va || --bo_va->ref_count)
+   goto out_unlock;
 
-   if (amdgpu_vm_ready(vm)) {
-   struct dma_fence *fence = NULL;
+   amdgpu_vm_bo_rmv(adev, bo_va);
+   if (!amdgpu_vm_ready(vm))
+   goto out_unlock;
 
-   r = amdgpu_vm_clear_freed(adev, vm, );
-   if (unlikely(r)) {
-   dev_err(adev->dev, "failed to clear page "
-   "tables on GEM object close (%d)\n", r);
-   }
 
-   if (fence) {
-   amdgpu_bo_fence(bo, fence, true);
-   dma_fence_put(fence);
-   }
-   }
-   }
+   r = amdgpu_vm_clear_freed(adev, vm, );
+   if (r || !fence)
+   goto out_unlock;
+
+   r = dma_resv_wait_timeout_rcu(bo->tbo.base.resv, false, false,
+ msecs_to_jiffies(10));
+   if (r == 0)
+   r = -ETIMEDOUT;
+   if (r)
+   goto out_unlock;
+
+   amdgpu_bo_fence(bo, fence, true);
+   dma_fence_put(fence);
+
+out_unlock:
+   if (unlikely(r))
+   dev_err(adev->dev, "failed to clear page "
+   "tables on GEM object close (%d)\n", r);
ttm_eu_backoff_reservation(, );  }
 
--
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH] drm/amdgpu: fix and cleanup amdgpu_gem_object_close

2020-03-12 Thread Christian König
The problem is that we can't add the clear fence to the BO
when there is an exclusive fence on it since we can't
guarantee that the clear fence will complete after the
exclusive one.

To fix this, refactor the function and wait for any potential
exclusive fence with a small timeout before adding the
shared clear fence.

Signed-off-by: Christian König 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 41 +++--
 1 file changed, 25 insertions(+), 16 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
index 4277125a79ee..0782162fb5f3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
@@ -161,10 +161,11 @@ void amdgpu_gem_object_close(struct drm_gem_object *obj,
 
struct amdgpu_bo_list_entry vm_pd;
struct list_head list, duplicates;
+   struct dma_fence *fence = NULL;
struct ttm_validate_buffer tv;
struct ww_acquire_ctx ticket;
struct amdgpu_bo_va *bo_va;
-   int r;
+   long r;
 
INIT_LIST_HEAD(&list);
INIT_LIST_HEAD(&duplicates);
@@ -182,24 +183,32 @@ void amdgpu_gem_object_close(struct drm_gem_object *obj,
return;
}
bo_va = amdgpu_vm_bo_find(vm, bo);
-   if (bo_va && --bo_va->ref_count == 0) {
-   amdgpu_vm_bo_rmv(adev, bo_va);
+   if (!bo_va || --bo_va->ref_count)
+   goto out_unlock;
 
-   if (amdgpu_vm_ready(vm)) {
-   struct dma_fence *fence = NULL;
+   amdgpu_vm_bo_rmv(adev, bo_va);
+   if (!amdgpu_vm_ready(vm))
+   goto out_unlock;
 
-   r = amdgpu_vm_clear_freed(adev, vm, &fence);
-   if (unlikely(r)) {
-   dev_err(adev->dev, "failed to clear page "
-   "tables on GEM object close (%d)\n", r);
-   }
 
-   if (fence) {
-   amdgpu_bo_fence(bo, fence, true);
-   dma_fence_put(fence);
-   }
-   }
-   }
+   r = amdgpu_vm_clear_freed(adev, vm, &fence);
+   if (r || !fence)
+   goto out_unlock;
+
+   r = dma_resv_wait_timeout_rcu(bo->tbo.base.resv, false, false,
+ msecs_to_jiffies(10));
+   if (r == 0)
+   r = -ETIMEDOUT;
+   if (r)
+   goto out_unlock;
+
+   amdgpu_bo_fence(bo, fence, true);
+   dma_fence_put(fence);
+
+out_unlock:
+   if (unlikely(r))
+   dev_err(adev->dev, "failed to clear page "
+   "tables on GEM object close (%d)\n", r);
ttm_eu_backoff_reservation(&ticket, &list);
 }
 
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx