[Intel-gfx] [PATCH 2/2] intel: Use I915_EXEC_NO_RELOC when available

2015-01-16 Thread Kristian Høgsberg
The I915_EXEC_NO_RELOC flag lets us tell the kernel that the offset we
provide in the validate list entry is what we've used in all relocations
to the bo in question.  If the bo hasn't moved, the kernel can skip
relocations completely.

Signed-off-by: Kristian Høgsberg 
---
 intel/intel_bufmgr_gem.c | 17 -
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/intel/intel_bufmgr_gem.c b/intel/intel_bufmgr_gem.c
index 8a51cea..a657a4d 100644
--- a/intel/intel_bufmgr_gem.c
+++ b/intel/intel_bufmgr_gem.c
@@ -131,6 +131,7 @@ typedef struct _drm_intel_bufmgr_gem {
unsigned int no_exec : 1;
unsigned int has_vebox : 1;
unsigned int has_handle_lut : 1;
+   unsigned int has_no_reloc : 1;
bool fenced_relocs;
 
char *aub_filename;
@@ -504,7 +505,15 @@ drm_intel_add_validate_buffer2(drm_intel_bo *bo, int 
need_fence)
bufmgr_gem->exec2_objects[index].relocation_count = bo_gem->reloc_count;
bufmgr_gem->exec2_objects[index].relocs_ptr = (uintptr_t)bo_gem->relocs;
bufmgr_gem->exec2_objects[index].alignment = 0;
-   bufmgr_gem->exec2_objects[index].offset = 0;
+
+   /* If the kernel supports I915_EXEC_NO_RELOC, it will compare
+* offset in struct drm_i915_gem_exec_object2 against the bos
+* current offset and if all bos haven't moved it will skip
+* relocation processing alltogether.  If I915_EXEC_NO_RELOC
+* is not supported, the kernel ignores the incoming value of
+* offset so we can set it either way.
+*/
+   bufmgr_gem->exec2_objects[index].offset = bo->offset64;
bufmgr_gem->exec_bos[index] = bo;
bufmgr_gem->exec2_objects[index].flags = 0;
bufmgr_gem->exec2_objects[index].rsvd1 = 0;
@@ -2471,6 +2480,8 @@ do_exec2(drm_intel_bo *bo, int used, drm_intel_context 
*ctx,
 
if (bufmgr_gem->has_handle_lut)
execbuf.flags |= I915_EXEC_HANDLE_LUT;
+   if (bufmgr_gem->has_no_reloc)
+   execbuf.flags |= I915_EXEC_NO_RELOC;
 
ret = drmIoctl(bufmgr_gem->fd,
   DRM_IOCTL_I915_GEM_EXECBUFFER2,
@@ -3598,6 +3609,10 @@ drm_intel_bufmgr_gem_init(int fd, int batch_size)
ret = drmIoctl(bufmgr_gem->fd, DRM_IOCTL_I915_GETPARAM, &gp);
bufmgr_gem->has_handle_lut = ret == 0;
 
+   gp.param = I915_PARAM_HAS_EXEC_NO_RELOC;
+   ret = drmIoctl(bufmgr_gem->fd, DRM_IOCTL_I915_GETPARAM, &gp);
+   bufmgr_gem->has_no_reloc = ret == 0;
+
/* Let's go with one relocation per every 2 dwords (but round down a bit
 * since a power of two will mean an extra page allocation for the reloc
 * buffer).
-- 
2.2.1

___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH 2/2] intel: Use I915_EXEC_NO_RELOC when available

2015-01-16 Thread Daniel Vetter
On Fri, Jan 16, 2015 at 05:46:00PM -0800, Kristian Høgsberg wrote:
> The I915_EXEC_NO_RELOC flag lets us tell the kernel that the offset we
> provide in the validate list entry is what we've used in all relocations
> to the bo in question.  If the bo hasn't moved, the kernel can skip
> relocations completely.
>
> Signed-off-by: Kristian Høgsberg 
> ---
>  intel/intel_bufmgr_gem.c | 17 -
>  1 file changed, 16 insertions(+), 1 deletion(-)
>
> diff --git a/intel/intel_bufmgr_gem.c b/intel/intel_bufmgr_gem.c
> index 8a51cea..a657a4d 100644
> --- a/intel/intel_bufmgr_gem.c
> +++ b/intel/intel_bufmgr_gem.c
> @@ -131,6 +131,7 @@ typedef struct _drm_intel_bufmgr_gem {
>   unsigned int no_exec : 1;
>   unsigned int has_vebox : 1;
>   unsigned int has_handle_lut : 1;
> + unsigned int has_no_reloc : 1;
>   bool fenced_relocs;
>
>   char *aub_filename;
> @@ -504,7 +505,15 @@ drm_intel_add_validate_buffer2(drm_intel_bo *bo, int 
> need_fence)
>   bufmgr_gem->exec2_objects[index].relocation_count = bo_gem->reloc_count;
>   bufmgr_gem->exec2_objects[index].relocs_ptr = (uintptr_t)bo_gem->relocs;
>   bufmgr_gem->exec2_objects[index].alignment = 0;
> - bufmgr_gem->exec2_objects[index].offset = 0;
> +
> + /* If the kernel supports I915_EXEC_NO_RELOC, it will compare
> + * offset in struct drm_i915_gem_exec_object2 against the bos
> + * current offset and if all bos haven't moved it will skip
> + * relocation processing alltogether.  If I915_EXEC_NO_RELOC
> + * is not supported, the kernel ignores the incoming value of
> + * offset so we can set it either way.
> + */
> + bufmgr_gem->exec2_objects[index].offset = bo->offset64;
>   bufmgr_gem->exec_bos[index] = bo;
>   bufmgr_gem->exec2_objects[index].flags = 0;
>   bufmgr_gem->exec2_objects[index].rsvd1 = 0;
> @@ -2471,6 +2480,8 @@ do_exec2(drm_intel_bo *bo, int used, drm_intel_context 
> *ctx,
>
>   if (bufmgr_gem->has_handle_lut)
>   execbuf.flags |= I915_EXEC_HANDLE_LUT;
> + if (bufmgr_gem->has_no_reloc)
> + execbuf.flags |= I915_EXEC_NO_RELOC;

You need some opt-in flag to not break existing userspace: Iirc both UXA
and libva retain reloc trees partially, which means that we might have
different presumed offsets for the same bo in different relocs.

This is only safe when you throw away and rebuild the reloc tree with all
buffers completely each time around (like current mesa does afaik).

-Daniel

>
>   ret = drmIoctl(bufmgr_gem->fd,
> DRM_IOCTL_I915_GEM_EXECBUFFER2,
> @@ -3598,6 +3609,10 @@ drm_intel_bufmgr_gem_init(int fd, int batch_size)
>   ret = drmIoctl(bufmgr_gem->fd, DRM_IOCTL_I915_GETPARAM, &gp);
>   bufmgr_gem->has_handle_lut = ret == 0;
>
> + gp.param = I915_PARAM_HAS_EXEC_NO_RELOC;
> + ret = drmIoctl(bufmgr_gem->fd, DRM_IOCTL_I915_GETPARAM, &gp);
> + bufmgr_gem->has_no_reloc = ret == 0;
> +
>   /* Let's go with one relocation per every 2 dwords (but round down a bit
>   * since a power of two will mean an extra page allocation for the reloc
>   * buffer).
> --
> 2.2.1
>
> ___
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx

-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH 2/2] intel: Use I915_EXEC_NO_RELOC when available

2015-01-17 Thread Chris Wilson
On Sat, Jan 17, 2015 at 05:23:59AM +0100, Daniel Vetter wrote:
> On Fri, Jan 16, 2015 at 05:46:00PM -0800, Kristian Høgsberg wrote:
> > The I915_EXEC_NO_RELOC flag lets us tell the kernel that the offset we
> > provide in the validate list entry is what we've used in all relocations
> > to the bo in question.  If the bo hasn't moved, the kernel can skip
> > relocations completely.
> >
> > Signed-off-by: Kristian Høgsberg 
> > ---
> >  intel/intel_bufmgr_gem.c | 17 -
> >  1 file changed, 16 insertions(+), 1 deletion(-)
> >
> > diff --git a/intel/intel_bufmgr_gem.c b/intel/intel_bufmgr_gem.c
> > index 8a51cea..a657a4d 100644
> > --- a/intel/intel_bufmgr_gem.c
> > +++ b/intel/intel_bufmgr_gem.c
> > @@ -131,6 +131,7 @@ typedef struct _drm_intel_bufmgr_gem {
> >   unsigned int no_exec : 1;
> >   unsigned int has_vebox : 1;
> >   unsigned int has_handle_lut : 1;
> > + unsigned int has_no_reloc : 1;
> >   bool fenced_relocs;
> >
> >   char *aub_filename;
> > @@ -504,7 +505,15 @@ drm_intel_add_validate_buffer2(drm_intel_bo *bo, int 
> > need_fence)
> >   bufmgr_gem->exec2_objects[index].relocation_count = bo_gem->reloc_count;
> >   bufmgr_gem->exec2_objects[index].relocs_ptr = (uintptr_t)bo_gem->relocs;
> >   bufmgr_gem->exec2_objects[index].alignment = 0;
> > - bufmgr_gem->exec2_objects[index].offset = 0;
> > +
> > + /* If the kernel supports I915_EXEC_NO_RELOC, it will compare
> > + * offset in struct drm_i915_gem_exec_object2 against the bos
> > + * current offset and if all bos haven't moved it will skip
> > + * relocation processing alltogether.  If I915_EXEC_NO_RELOC
> > + * is not supported, the kernel ignores the incoming value of
> > + * offset so we can set it either way.
> > + */
> > + bufmgr_gem->exec2_objects[index].offset = bo->offset64;
> >   bufmgr_gem->exec_bos[index] = bo;
> >   bufmgr_gem->exec2_objects[index].flags = 0;
> >   bufmgr_gem->exec2_objects[index].rsvd1 = 0;
> > @@ -2471,6 +2480,8 @@ do_exec2(drm_intel_bo *bo, int used, 
> > drm_intel_context *ctx,
> >
> >   if (bufmgr_gem->has_handle_lut)
> >   execbuf.flags |= I915_EXEC_HANDLE_LUT;
> > + if (bufmgr_gem->has_no_reloc)
> > + execbuf.flags |= I915_EXEC_NO_RELOC;
> 
> You need some opt-in flag to not break existing userspace: Iirc both UXA
> and libva retain reloc trees partially, which means that we might have
> different presumed offsets for the same bo in different relocs.

A bigger challenge is that you have to use execlist flags to indicate
read/write domains (actually just read or write!), and a special flag for
the SNB pipecontrol w/a. (This is because the kernel no longer even
scans the relocation trees if the buffers haven't moved and so we don't
have the chance to construct the read/write domains from the relocs.)
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH 2/2] intel: Use I915_EXEC_NO_RELOC when available

2015-01-19 Thread Kristian Høgsberg
On Fri, Jan 16, 2015 at 8:23 PM, Daniel Vetter  wrote:
> On Fri, Jan 16, 2015 at 05:46:00PM -0800, Kristian Høgsberg wrote:
>> The I915_EXEC_NO_RELOC flag lets us tell the kernel that the offset we
>> provide in the validate list entry is what we've used in all relocations
>> to the bo in question.  If the bo hasn't moved, the kernel can skip
>> relocations completely.
>>
>> Signed-off-by: Kristian Høgsberg 
>> ---
>>  intel/intel_bufmgr_gem.c | 17 -
>>  1 file changed, 16 insertions(+), 1 deletion(-)
>>
>> diff --git a/intel/intel_bufmgr_gem.c b/intel/intel_bufmgr_gem.c
>> index 8a51cea..a657a4d 100644
>> --- a/intel/intel_bufmgr_gem.c
>> +++ b/intel/intel_bufmgr_gem.c
>> @@ -131,6 +131,7 @@ typedef struct _drm_intel_bufmgr_gem {
>>   unsigned int no_exec : 1;
>>   unsigned int has_vebox : 1;
>>   unsigned int has_handle_lut : 1;
>> + unsigned int has_no_reloc : 1;
>>   bool fenced_relocs;
>>
>>   char *aub_filename;
>> @@ -504,7 +505,15 @@ drm_intel_add_validate_buffer2(drm_intel_bo *bo, int 
>> need_fence)
>>   bufmgr_gem->exec2_objects[index].relocation_count = bo_gem->reloc_count;
>>   bufmgr_gem->exec2_objects[index].relocs_ptr = (uintptr_t)bo_gem->relocs;
>>   bufmgr_gem->exec2_objects[index].alignment = 0;
>> - bufmgr_gem->exec2_objects[index].offset = 0;
>> +
>> + /* If the kernel supports I915_EXEC_NO_RELOC, it will compare
>> + * offset in struct drm_i915_gem_exec_object2 against the bos
>> + * current offset and if all bos haven't moved it will skip
>> + * relocation processing alltogether.  If I915_EXEC_NO_RELOC
>> + * is not supported, the kernel ignores the incoming value of
>> + * offset so we can set it either way.
>> + */
>> + bufmgr_gem->exec2_objects[index].offset = bo->offset64;
>>   bufmgr_gem->exec_bos[index] = bo;
>>   bufmgr_gem->exec2_objects[index].flags = 0;
>>   bufmgr_gem->exec2_objects[index].rsvd1 = 0;
>> @@ -2471,6 +2480,8 @@ do_exec2(drm_intel_bo *bo, int used, drm_intel_context 
>> *ctx,
>>
>>   if (bufmgr_gem->has_handle_lut)
>>   execbuf.flags |= I915_EXEC_HANDLE_LUT;
>> + if (bufmgr_gem->has_no_reloc)
>> + execbuf.flags |= I915_EXEC_NO_RELOC;
>
> You need some opt-in flag to not break existing userspace: Iirc both UXA
> and libva retain reloc trees partially, which means that we might have
> different presumed offsets for the same bo in different relocs.
>
> This is only safe when you throw away and rebuild the reloc tree with all
> buffers completely each time around (like current mesa does afaik).

And it turns out that even inside mesa we cannot guarantee this.  In
case of multiple threads sharing objects, thread A could be halfway in
building up its batchbuffer when thread B does a execbuf ioctls and
causes some objects to be moved.  Thread A will then finish building
its reloc list with different offsets for the objects that were moved
and if we pass NO_RELOC in that case, nothing will fix up the wrong
presumed_offsets for the first half.

We can just check that all presumed_offset of all relocs match
bo->offset64, and pass NO_RELOC if they do for all bos. This will also
work of UXA and libva and not require any opt-in.

Kristian

> -Daniel
>
>>
>>   ret = drmIoctl(bufmgr_gem->fd,
>> DRM_IOCTL_I915_GEM_EXECBUFFER2,
>> @@ -3598,6 +3609,10 @@ drm_intel_bufmgr_gem_init(int fd, int batch_size)
>>   ret = drmIoctl(bufmgr_gem->fd, DRM_IOCTL_I915_GETPARAM, &gp);
>>   bufmgr_gem->has_handle_lut = ret == 0;
>>
>> + gp.param = I915_PARAM_HAS_EXEC_NO_RELOC;
>> + ret = drmIoctl(bufmgr_gem->fd, DRM_IOCTL_I915_GETPARAM, &gp);
>> + bufmgr_gem->has_no_reloc = ret == 0;
>> +
>>   /* Let's go with one relocation per every 2 dwords (but round down a bit
>>   * since a power of two will mean an extra page allocation for the reloc
>>   * buffer).
>> --
>> 2.2.1
>>
>> ___
>> Intel-gfx mailing list
>> Intel-gfx@lists.freedesktop.org
>> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
>
> --
> Daniel Vetter
> Software Engineer, Intel Corporation
> +41 (0) 79 365 57 48 - http://blog.ffwll.ch
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH 2/2] intel: Use I915_EXEC_NO_RELOC when available

2015-01-19 Thread Kristian Høgsberg
On Sat, Jan 17, 2015 at 1:49 AM, Chris Wilson  wrote:
> On Sat, Jan 17, 2015 at 05:23:59AM +0100, Daniel Vetter wrote:
>> On Fri, Jan 16, 2015 at 05:46:00PM -0800, Kristian Høgsberg wrote:
>> > The I915_EXEC_NO_RELOC flag lets us tell the kernel that the offset we
>> > provide in the validate list entry is what we've used in all relocations
>> > to the bo in question.  If the bo hasn't moved, the kernel can skip
>> > relocations completely.
>> >
>> > Signed-off-by: Kristian Høgsberg 
>> > ---
>> >  intel/intel_bufmgr_gem.c | 17 -
>> >  1 file changed, 16 insertions(+), 1 deletion(-)
>> >
>> > diff --git a/intel/intel_bufmgr_gem.c b/intel/intel_bufmgr_gem.c
>> > index 8a51cea..a657a4d 100644
>> > --- a/intel/intel_bufmgr_gem.c
>> > +++ b/intel/intel_bufmgr_gem.c
>> > @@ -131,6 +131,7 @@ typedef struct _drm_intel_bufmgr_gem {
>> >   unsigned int no_exec : 1;
>> >   unsigned int has_vebox : 1;
>> >   unsigned int has_handle_lut : 1;
>> > + unsigned int has_no_reloc : 1;
>> >   bool fenced_relocs;
>> >
>> >   char *aub_filename;
>> > @@ -504,7 +505,15 @@ drm_intel_add_validate_buffer2(drm_intel_bo *bo, int 
>> > need_fence)
>> >   bufmgr_gem->exec2_objects[index].relocation_count = bo_gem->reloc_count;
>> >   bufmgr_gem->exec2_objects[index].relocs_ptr = (uintptr_t)bo_gem->relocs;
>> >   bufmgr_gem->exec2_objects[index].alignment = 0;
>> > - bufmgr_gem->exec2_objects[index].offset = 0;
>> > +
>> > + /* If the kernel supports I915_EXEC_NO_RELOC, it will compare
>> > + * offset in struct drm_i915_gem_exec_object2 against the bos
>> > + * current offset and if all bos haven't moved it will skip
>> > + * relocation processing alltogether.  If I915_EXEC_NO_RELOC
>> > + * is not supported, the kernel ignores the incoming value of
>> > + * offset so we can set it either way.
>> > + */
>> > + bufmgr_gem->exec2_objects[index].offset = bo->offset64;
>> >   bufmgr_gem->exec_bos[index] = bo;
>> >   bufmgr_gem->exec2_objects[index].flags = 0;
>> >   bufmgr_gem->exec2_objects[index].rsvd1 = 0;
>> > @@ -2471,6 +2480,8 @@ do_exec2(drm_intel_bo *bo, int used, 
>> > drm_intel_context *ctx,
>> >
>> >   if (bufmgr_gem->has_handle_lut)
>> >   execbuf.flags |= I915_EXEC_HANDLE_LUT;
>> > + if (bufmgr_gem->has_no_reloc)
>> > + execbuf.flags |= I915_EXEC_NO_RELOC;
>>
>> You need some opt-in flag to not break existing userspace: Iirc both UXA
>> and libva retain reloc trees partially, which means that we might have
>> different presumed offsets for the same bo in different relocs.
>
> A bigger challenge is that you have to use execlist flags to indicate
> read/write domains (actually just read or write!), and a special flag for
> the SNB pipecontrol w/a. (This is because the kernel no longer even
> scans the relocation trees if the buffers haven't moved and so we don't
> have the chance to construct the read/write domains from the relocs.)

You're referring to having to set the EXEC_OBJECT_WRITE flag for
buffers that are in any write domain?  That seems doable - we're
already scanning all relocs before submitting.  I didn't find anything
in i915_drm.h about the SNB pipecontrol... can you elaborate?

thanks,
Kristian

> -Chris
>
> --
> Chris Wilson, Intel Open Source Technology Centre
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH 2/2] intel: Use I915_EXEC_NO_RELOC when available

2015-01-20 Thread Daniel Vetter
On Mon, Jan 19, 2015 at 09:58:55PM -0800, Kristian Høgsberg wrote:
> On Sat, Jan 17, 2015 at 1:49 AM, Chris Wilson  
> wrote:
> > On Sat, Jan 17, 2015 at 05:23:59AM +0100, Daniel Vetter wrote:
> >> On Fri, Jan 16, 2015 at 05:46:00PM -0800, Kristian Høgsberg wrote:
> >> > The I915_EXEC_NO_RELOC flag lets us tell the kernel that the offset we
> >> > provide in the validate list entry is what we've used in all relocations
> >> > to the bo in question.  If the bo hasn't moved, the kernel can skip
> >> > relocations completely.
> >> >
> >> > Signed-off-by: Kristian Høgsberg 
> >> > ---
> >> >  intel/intel_bufmgr_gem.c | 17 -
> >> >  1 file changed, 16 insertions(+), 1 deletion(-)
> >> >
> >> > diff --git a/intel/intel_bufmgr_gem.c b/intel/intel_bufmgr_gem.c
> >> > index 8a51cea..a657a4d 100644
> >> > --- a/intel/intel_bufmgr_gem.c
> >> > +++ b/intel/intel_bufmgr_gem.c
> >> > @@ -131,6 +131,7 @@ typedef struct _drm_intel_bufmgr_gem {
> >> >   unsigned int no_exec : 1;
> >> >   unsigned int has_vebox : 1;
> >> >   unsigned int has_handle_lut : 1;
> >> > + unsigned int has_no_reloc : 1;
> >> >   bool fenced_relocs;
> >> >
> >> >   char *aub_filename;
> >> > @@ -504,7 +505,15 @@ drm_intel_add_validate_buffer2(drm_intel_bo *bo, 
> >> > int need_fence)
> >> >   bufmgr_gem->exec2_objects[index].relocation_count = 
> >> > bo_gem->reloc_count;
> >> >   bufmgr_gem->exec2_objects[index].relocs_ptr = 
> >> > (uintptr_t)bo_gem->relocs;
> >> >   bufmgr_gem->exec2_objects[index].alignment = 0;
> >> > - bufmgr_gem->exec2_objects[index].offset = 0;
> >> > +
> >> > + /* If the kernel supports I915_EXEC_NO_RELOC, it will compare
> >> > + * offset in struct drm_i915_gem_exec_object2 against the bos
> >> > + * current offset and if all bos haven't moved it will skip
> >> > + * relocation processing alltogether.  If I915_EXEC_NO_RELOC
> >> > + * is not supported, the kernel ignores the incoming value of
> >> > + * offset so we can set it either way.
> >> > + */
> >> > + bufmgr_gem->exec2_objects[index].offset = bo->offset64;
> >> >   bufmgr_gem->exec_bos[index] = bo;
> >> >   bufmgr_gem->exec2_objects[index].flags = 0;
> >> >   bufmgr_gem->exec2_objects[index].rsvd1 = 0;
> >> > @@ -2471,6 +2480,8 @@ do_exec2(drm_intel_bo *bo, int used, 
> >> > drm_intel_context *ctx,
> >> >
> >> >   if (bufmgr_gem->has_handle_lut)
> >> >   execbuf.flags |= I915_EXEC_HANDLE_LUT;
> >> > + if (bufmgr_gem->has_no_reloc)
> >> > + execbuf.flags |= I915_EXEC_NO_RELOC;
> >>
> >> You need some opt-in flag to not break existing userspace: Iirc both UXA
> >> and libva retain reloc trees partially, which means that we might have
> >> different presumed offsets for the same bo in different relocs.
> >
> > A bigger challenge is that you have to use execlist flags to indicate
> > read/write domains (actually just read or write!), and a special flag for
> > the SNB pipecontrol w/a. (This is because the kernel no longer even
> > scans the relocation trees if the buffers haven't moved and so we don't
> > have the chance to construct the read/write domains from the relocs.)
> 
> You're referring to having to set the EXEC_OBJECT_WRITE flag for
> buffers that are in any write domain?  That seems doable - we're
> already scanning all relocs before submitting.  I didn't find anything
> in i915_drm.h about the SNB pipecontrol... can you elaborate?

On gen6 if you do a pipe_control write (as e.g. used for query objects or
the nonzero flush wa) the kernel needs to apply a wa. For old w/a it does
this by matching on relocs with write_domain =
I915_GEM_DOMAIN_INSTRUCTION. With the no_reloc trick instead you need to
set the per-obj EXEC_OBJECT_NEEDS_GTT flag. Note that this is only
required on gen6, if you do it on gen7+ and have full ppgtt enabled the
kernel will yell at you.

If you go with recovering the no_reloc semantics in libdrm then you could
also recover this one (on top of checking presumed_offsets for all
relocs).
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH 2/2] intel: Use I915_EXEC_NO_RELOC when available

2015-01-20 Thread Daniel Vetter
On Mon, Jan 19, 2015 at 09:45:35PM -0800, Kristian Høgsberg wrote:
> On Fri, Jan 16, 2015 at 8:23 PM, Daniel Vetter  wrote:
> > On Fri, Jan 16, 2015 at 05:46:00PM -0800, Kristian Høgsberg wrote:
> >> The I915_EXEC_NO_RELOC flag lets us tell the kernel that the offset we
> >> provide in the validate list entry is what we've used in all relocations
> >> to the bo in question.  If the bo hasn't moved, the kernel can skip
> >> relocations completely.
> >>
> >> Signed-off-by: Kristian Høgsberg 
> >> ---
> >>  intel/intel_bufmgr_gem.c | 17 -
> >>  1 file changed, 16 insertions(+), 1 deletion(-)
> >>
> >> diff --git a/intel/intel_bufmgr_gem.c b/intel/intel_bufmgr_gem.c
> >> index 8a51cea..a657a4d 100644
> >> --- a/intel/intel_bufmgr_gem.c
> >> +++ b/intel/intel_bufmgr_gem.c
> >> @@ -131,6 +131,7 @@ typedef struct _drm_intel_bufmgr_gem {
> >>   unsigned int no_exec : 1;
> >>   unsigned int has_vebox : 1;
> >>   unsigned int has_handle_lut : 1;
> >> + unsigned int has_no_reloc : 1;
> >>   bool fenced_relocs;
> >>
> >>   char *aub_filename;
> >> @@ -504,7 +505,15 @@ drm_intel_add_validate_buffer2(drm_intel_bo *bo, int 
> >> need_fence)
> >>   bufmgr_gem->exec2_objects[index].relocation_count = bo_gem->reloc_count;
> >>   bufmgr_gem->exec2_objects[index].relocs_ptr = (uintptr_t)bo_gem->relocs;
> >>   bufmgr_gem->exec2_objects[index].alignment = 0;
> >> - bufmgr_gem->exec2_objects[index].offset = 0;
> >> +
> >> + /* If the kernel supports I915_EXEC_NO_RELOC, it will compare
> >> + * offset in struct drm_i915_gem_exec_object2 against the bos
> >> + * current offset and if all bos haven't moved it will skip
> >> + * relocation processing alltogether.  If I915_EXEC_NO_RELOC
> >> + * is not supported, the kernel ignores the incoming value of
> >> + * offset so we can set it either way.
> >> + */
> >> + bufmgr_gem->exec2_objects[index].offset = bo->offset64;
> >>   bufmgr_gem->exec_bos[index] = bo;
> >>   bufmgr_gem->exec2_objects[index].flags = 0;
> >>   bufmgr_gem->exec2_objects[index].rsvd1 = 0;
> >> @@ -2471,6 +2480,8 @@ do_exec2(drm_intel_bo *bo, int used, 
> >> drm_intel_context *ctx,
> >>
> >>   if (bufmgr_gem->has_handle_lut)
> >>   execbuf.flags |= I915_EXEC_HANDLE_LUT;
> >> + if (bufmgr_gem->has_no_reloc)
> >> + execbuf.flags |= I915_EXEC_NO_RELOC;
> >
> > You need some opt-in flag to not break existing userspace: Iirc both UXA
> > and libva retain reloc trees partially, which means that we might have
> > different presumed offsets for the same bo in different relocs.
> >
> > This is only safe when you throw away and rebuild the reloc tree with all
> > buffers completely each time around (like current mesa does afaik).
> 
> And it turns out that even inside mesa we cannot guarantee this.  In
> case of multiple threads sharing objects, thread A could be halfway in
> building up its batchbuffer when thread B does a execbuf ioctls and
> causes some objects to be moved.  Thread A will then finish building
> its reloc list with different offsets for the objects that were moved
> and if we pass NO_RELOC in that case, nothing will fix up the wrong
> presumed_offsets for the first half.
> 
> We can just check that all presumed_offset of all relocs match
> bo->offset64, and pass NO_RELOC if they do for all bos. This will also
> work of UXA and libva and not require any opt-in.

My idea for all this would have been to create a per-thread execbuf
relocation context with a hashtab to map buffer pointers to execbuf index
and a bunch of arrays to prepare the reloc entry tables. If you do it
correctly all the per-reloc work should be a O(1) streaming writes to a
few arrays plus the hashtab lookup. With no code run at execbuf time
(except the ioctl ofc). Even the libdrm_bo->presumed_offset update after
execbuf could be done lockless (as long as readers are careful to never
reload it by using something similar to the kernel's READ_ONCE macro).

But that means a completely new reloc api, so a lot more work. Also I
think it only makes sense do that for drivers that really care about the
last bit of performance, and then do it within the driver so that there's
no constraints about abi.
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH 2/2] intel: Use I915_EXEC_NO_RELOC when available

2015-01-20 Thread Kristian Høgsberg
On Tue, Jan 20, 2015 at 12:42 AM, Daniel Vetter  wrote:
> On Mon, Jan 19, 2015 at 09:45:35PM -0800, Kristian Høgsberg wrote:
>> On Fri, Jan 16, 2015 at 8:23 PM, Daniel Vetter  wrote:
>> > On Fri, Jan 16, 2015 at 05:46:00PM -0800, Kristian Høgsberg wrote:
>> >> The I915_EXEC_NO_RELOC flag lets us tell the kernel that the offset we
>> >> provide in the validate list entry is what we've used in all relocations
>> >> to the bo in question.  If the bo hasn't moved, the kernel can skip
>> >> relocations completely.
>> >>
>> >> Signed-off-by: Kristian Høgsberg 
>> >> ---
>> >>  intel/intel_bufmgr_gem.c | 17 -
>> >>  1 file changed, 16 insertions(+), 1 deletion(-)
>> >>
>> >> diff --git a/intel/intel_bufmgr_gem.c b/intel/intel_bufmgr_gem.c
>> >> index 8a51cea..a657a4d 100644
>> >> --- a/intel/intel_bufmgr_gem.c
>> >> +++ b/intel/intel_bufmgr_gem.c
>> >> @@ -131,6 +131,7 @@ typedef struct _drm_intel_bufmgr_gem {
>> >>   unsigned int no_exec : 1;
>> >>   unsigned int has_vebox : 1;
>> >>   unsigned int has_handle_lut : 1;
>> >> + unsigned int has_no_reloc : 1;
>> >>   bool fenced_relocs;
>> >>
>> >>   char *aub_filename;
>> >> @@ -504,7 +505,15 @@ drm_intel_add_validate_buffer2(drm_intel_bo *bo, int 
>> >> need_fence)
>> >>   bufmgr_gem->exec2_objects[index].relocation_count = bo_gem->reloc_count;
>> >>   bufmgr_gem->exec2_objects[index].relocs_ptr = (uintptr_t)bo_gem->relocs;
>> >>   bufmgr_gem->exec2_objects[index].alignment = 0;
>> >> - bufmgr_gem->exec2_objects[index].offset = 0;
>> >> +
>> >> + /* If the kernel supports I915_EXEC_NO_RELOC, it will compare
>> >> + * offset in struct drm_i915_gem_exec_object2 against the bos
>> >> + * current offset and if all bos haven't moved it will skip
>> >> + * relocation processing alltogether.  If I915_EXEC_NO_RELOC
>> >> + * is not supported, the kernel ignores the incoming value of
>> >> + * offset so we can set it either way.
>> >> + */
>> >> + bufmgr_gem->exec2_objects[index].offset = bo->offset64;
>> >>   bufmgr_gem->exec_bos[index] = bo;
>> >>   bufmgr_gem->exec2_objects[index].flags = 0;
>> >>   bufmgr_gem->exec2_objects[index].rsvd1 = 0;
>> >> @@ -2471,6 +2480,8 @@ do_exec2(drm_intel_bo *bo, int used, 
>> >> drm_intel_context *ctx,
>> >>
>> >>   if (bufmgr_gem->has_handle_lut)
>> >>   execbuf.flags |= I915_EXEC_HANDLE_LUT;
>> >> + if (bufmgr_gem->has_no_reloc)
>> >> + execbuf.flags |= I915_EXEC_NO_RELOC;
>> >
>> > You need some opt-in flag to not break existing userspace: Iirc both UXA
>> > and libva retain reloc trees partially, which means that we might have
>> > different presumed offsets for the same bo in different relocs.
>> >
>> > This is only safe when you throw away and rebuild the reloc tree with all
>> > buffers completely each time around (like current mesa does afaik).
>>
>> And it turns out that even inside mesa we cannot guarantee this.  In
>> case of multiple threads sharing objects, thread A could be halfway in
>> building up its batchbuffer when thread B does a execbuf ioctls and
>> causes some objects to be moved.  Thread A will then finish building
>> its reloc list with different offsets for the objects that were moved
>> and if we pass NO_RELOC in that case, nothing will fix up the wrong
>> presumed_offsets for the first half.
>>
>> We can just check that all presumed_offset of all relocs match
>> bo->offset64, and pass NO_RELOC if they do for all bos. This will also
>> work of UXA and libva and not require any opt-in.
>
> My idea for all this would have been to create a per-thread execbuf
> relocation context with a hashtab to map buffer pointers to execbuf index
> and a bunch of arrays to prepare the reloc entry tables. If you do it
> correctly all the per-reloc work should be a O(1) streaming writes to a
> few arrays plus the hashtab lookup. With no code run at execbuf time
> (except the ioctl ofc). Even the libdrm_bo->presumed_offset update after
> execbuf could be done lockless (as long as readers are careful to never
> reload it by using something similar to the kernel's READ_ONCE macro).
>
> But that means a completely new reloc api, so a lot more work. Also I
> think it only makes sense do that for drivers that really care about the
> last bit of performance, and then do it within the driver so that there's
> no constraints about abi.

Indeed, I moved it into mesa so I could rework that. bo_emit_reloc()
is showing up in profiles. The patch below along with NO_RELOC and
HANDLE_LUT flags gives me 5-10% improement on CPU bound benchmarks, so
it's certainly worth it.  I'm skeptical that a hashtable lookup per
reloc emit is going to perform better than just fixing up the relocs
at execbuf2 time though.  It would be nice to not do any work at ioctl
time, but for that you need a very fast way to map from bo to
per-thread bo state as you go. Maybe a per-thread array mapping from
gem handle to exec_object could work...

WIP Patch is here:

http://cgit.freedesktop.org/~krh/mesa/commit/?h=b0e4ce7bbce2a79ad37d6de460af88b9581ea1d7

Re: [Intel-gfx] [PATCH 2/2] intel: Use I915_EXEC_NO_RELOC when available

2015-01-20 Thread Chris Wilson
On Tue, Jan 20, 2015 at 12:53:35PM -0800, Kristian Høgsberg wrote:
> On Tue, Jan 20, 2015 at 12:42 AM, Daniel Vetter  wrote:
> > My idea for all this would have been to create a per-thread execbuf
> > relocation context with a hashtab to map buffer pointers to execbuf index
> > and a bunch of arrays to prepare the reloc entry tables. If you do it
> > correctly all the per-reloc work should be a O(1) streaming writes to a
> > few arrays plus the hashtab lookup. With no code run at execbuf time
> > (except the ioctl ofc). Even the libdrm_bo->presumed_offset update after
> > execbuf could be done lockless (as long as readers are careful to never
> > reload it by using something similar to the kernel's READ_ONCE macro).
> >
> > But that means a completely new reloc api, so a lot more work. Also I
> > think it only makes sense do that for drivers that really care about the
> > last bit of performance, and then do it within the driver so that there's
> > no constraints about abi.
> 
> Indeed, I moved it into mesa so I could rework that. bo_emit_reloc()
> is showing up in profiles. The patch below along with NO_RELOC and
> HANDLE_LUT flags gives me 5-10% improement on CPU bound benchmarks, so
> it's certainly worth it.  I'm skeptical that a hashtable lookup per
> reloc emit is going to perform better than just fixing up the relocs
> at execbuf2 time though.  It would be nice to not do any work at ioctl
> time, but for that you need a very fast way to map from bo to
> per-thread bo state as you go. Maybe a per-thread array mapping from
> gem handle to exec_object could work...
> 
> WIP Patch is here:
> 
> http://cgit.freedesktop.org/~krh/mesa/commit/?h=b0e4ce7bbce2a79ad37d6de460af88b9581ea1d7

Hmm, that is actually pretty neat. My idle thought was to create
per-context batchmgr with its own view of the bo (to counter the
multithreaded free-for-all). In your patch, you neatly demonstrate that
you don't need per-context view of the bo, only of the relocations. And
it will make drm_intel_bo_emit_reloc() fixed cost, which should produce
most of your CPU overhead saving.

However, I think if you do take it a step further with a batchmgr_bo,
you can make the drm_intel_bo_references() very cheap as well. 

Looks good.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH 2/2] intel: Use I915_EXEC_NO_RELOC when available

2015-01-20 Thread Kristian Høgsberg
On Tue, Jan 20, 2015 at 1:46 PM, Chris Wilson  wrote:
> On Tue, Jan 20, 2015 at 12:53:35PM -0800, Kristian Høgsberg wrote:
>> On Tue, Jan 20, 2015 at 12:42 AM, Daniel Vetter  wrote:
>> > My idea for all this would have been to create a per-thread execbuf
>> > relocation context with a hashtab to map buffer pointers to execbuf index
>> > and a bunch of arrays to prepare the reloc entry tables. If you do it
>> > correctly all the per-reloc work should be a O(1) streaming writes to a
>> > few arrays plus the hashtab lookup. With no code run at execbuf time
>> > (except the ioctl ofc). Even the libdrm_bo->presumed_offset update after
>> > execbuf could be done lockless (as long as readers are careful to never
>> > reload it by using something similar to the kernel's READ_ONCE macro).
>> >
>> > But that means a completely new reloc api, so a lot more work. Also I
>> > think it only makes sense do that for drivers that really care about the
>> > last bit of performance, and then do it within the driver so that there's
>> > no constraints about abi.
>>
>> Indeed, I moved it into mesa so I could rework that. bo_emit_reloc()
>> is showing up in profiles. The patch below along with NO_RELOC and
>> HANDLE_LUT flags gives me 5-10% improement on CPU bound benchmarks, so
>> it's certainly worth it.  I'm skeptical that a hashtable lookup per
>> reloc emit is going to perform better than just fixing up the relocs
>> at execbuf2 time though.  It would be nice to not do any work at ioctl
>> time, but for that you need a very fast way to map from bo to
>> per-thread bo state as you go. Maybe a per-thread array mapping from
>> gem handle to exec_object could work...
>>
>> WIP Patch is here:
>>
>> http://cgit.freedesktop.org/~krh/mesa/commit/?h=b0e4ce7bbce2a79ad37d6de460af88b9581ea1d7
>
> Hmm, that is actually pretty neat. My idle thought was to create
> per-context batchmgr with its own view of the bo (to counter the
> multithreaded free-for-all). In your patch, you neatly demonstrate that
> you don't need per-context view of the bo, only of the relocations. And
> it will make drm_intel_bo_emit_reloc() fixed cost, which should produce
> most of your CPU overhead saving.
>
> However, I think if you do take it a step further with a batchmgr_bo,
> you can make the drm_intel_bo_references() very cheap as well.

I did think about a per-thread/context bo wrapper that could track
validation list index and presumed_offset, but the problem is that
there's only one intel_mipmap_tree/intel_buffer_object struct to store
it in.  We'd have to a per-thread wrapper for those too, and I'm not
sure that's feasible.

However, drm_intel_bo_reference/unreference() is showing up on the
profiles now that relocations are cheaper, but I think the better way
to make those cheaper is to move the ref count to the public struct
and make the ref/unref inline functions (calling out to a destructor
for the refcount==0 case, of course).  On that note, do you know why
drm_intel_gem_bo_unreference() is so convoluted? What's the deal with
the atomic_add_unless?

> Looks good.
> -Chris
>
> --
> Chris Wilson, Intel Open Source Technology Centre
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH 2/2] intel: Use I915_EXEC_NO_RELOC when available

2015-01-21 Thread Daniel Vetter
On Tue, Jan 20, 2015 at 10:10:57PM -0800, Kristian Høgsberg wrote:
> On Tue, Jan 20, 2015 at 1:46 PM, Chris Wilson  
> wrote:
> > On Tue, Jan 20, 2015 at 12:53:35PM -0800, Kristian Høgsberg wrote:
> >> On Tue, Jan 20, 2015 at 12:42 AM, Daniel Vetter  wrote:
> >> > My idea for all this would have been to create a per-thread execbuf
> >> > relocation context with a hashtab to map buffer pointers to execbuf index
> >> > and a bunch of arrays to prepare the reloc entry tables. If you do it
> >> > correctly all the per-reloc work should be a O(1) streaming writes to a
> >> > few arrays plus the hashtab lookup. With no code run at execbuf time
> >> > (except the ioctl ofc). Even the libdrm_bo->presumed_offset update after
> >> > execbuf could be done lockless (as long as readers are careful to never
> >> > reload it by using something similar to the kernel's READ_ONCE macro).
> >> >
> >> > But that means a completely new reloc api, so a lot more work. Also I
> >> > think it only makes sense do that for drivers that really care about the
> >> > last bit of performance, and then do it within the driver so that there's
> >> > no constraints about abi.
> >>
> >> Indeed, I moved it into mesa so I could rework that. bo_emit_reloc()
> >> is showing up in profiles. The patch below along with NO_RELOC and
> >> HANDLE_LUT flags gives me 5-10% improement on CPU bound benchmarks, so
> >> it's certainly worth it.  I'm skeptical that a hashtable lookup per
> >> reloc emit is going to perform better than just fixing up the relocs
> >> at execbuf2 time though.  It would be nice to not do any work at ioctl
> >> time, but for that you need a very fast way to map from bo to
> >> per-thread bo state as you go. Maybe a per-thread array mapping from
> >> gem handle to exec_object could work...
> >>
> >> WIP Patch is here:
> >>
> >> http://cgit.freedesktop.org/~krh/mesa/commit/?h=b0e4ce7bbce2a79ad37d6de460af88b9581ea1d7
> >
> > Hmm, that is actually pretty neat. My idle thought was to create
> > per-context batchmgr with its own view of the bo (to counter the
> > multithreaded free-for-all). In your patch, you neatly demonstrate that
> > you don't need per-context view of the bo, only of the relocations. And
> > it will make drm_intel_bo_emit_reloc() fixed cost, which should produce
> > most of your CPU overhead saving.
> >
> > However, I think if you do take it a step further with a batchmgr_bo,
> > you can make the drm_intel_bo_references() very cheap as well.
> 
> I did think about a per-thread/context bo wrapper that could track
> validation list index and presumed_offset, but the problem is that
> there's only one intel_mipmap_tree/intel_buffer_object struct to store
> it in.  We'd have to a per-thread wrapper for those too, and I'm not
> sure that's feasible.

Another thing to consider is that with full ppgtt reloc offset will be per
context. So as soon as you have multiple contexts with some shared bo you
most likely will always fallback to need_reloc=1 with your code. Not sure
whether we care.

And I guess your obj->flags = 0 is just because it's WIP? You'd need
something like

if (relocs[i].write_domain)
obj->flasg |= EXEC_OBJECT_WRITE;
if (IS_GEN6 && relocs[i].write_domain == INSTRUCTION)
obj->flasg |= EXEC_OBJECT_NEEDS_GTT;

before the continue.
-Daniel

> However, drm_intel_bo_reference/unreference() is showing up on the
> profiles now that relocations are cheaper, but I think the better way
> to make those cheaper is to move the ref count to the public struct
> and make the ref/unref inline functions (calling out to a destructor
> for the refcount==0 case, of course).  On that note, do you know why
> drm_intel_gem_bo_unreference() is so convoluted? What's the deal with
> the atomic_add_unless?
> 
> > Looks good.
> > -Chris
> >
> > --
> > Chris Wilson, Intel Open Source Technology Centre

-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH 2/2] intel: Use I915_EXEC_NO_RELOC when available

2015-01-21 Thread Daniel Vetter
On Tue, Jan 20, 2015 at 10:10:57PM -0800, Kristian Høgsberg wrote:
> However, drm_intel_bo_reference/unreference() is showing up on the
> profiles now that relocations are cheaper, but I think the better way
> to make those cheaper is to move the ref count to the public struct
> and make the ref/unref inline functions (calling out to a destructor
> for the refcount==0 case, of course).  On that note, do you know why
> drm_intel_gem_bo_unreference() is so convoluted? What's the deal with
> the atomic_add_unless?

Smells like the last reference can only be dropped with the mutex held,
probably because someone is assuming that buffers don't go poof while
holding the mutex. You should be able to drop this after some refcounting
audit to make sure no one looks at a bo after the unref is done.
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH 2/2] intel: Use I915_EXEC_NO_RELOC when available

2015-01-21 Thread Daniel Vetter
On Wed, Jan 21, 2015 at 10:17:01AM +0100, Daniel Vetter wrote:
> On Tue, Jan 20, 2015 at 10:10:57PM -0800, Kristian Høgsberg wrote:
> > However, drm_intel_bo_reference/unreference() is showing up on the
> > profiles now that relocations are cheaper, but I think the better way
> > to make those cheaper is to move the ref count to the public struct
> > and make the ref/unref inline functions (calling out to a destructor
> > for the refcount==0 case, of course).  On that note, do you know why
> > drm_intel_gem_bo_unreference() is so convoluted? What's the deal with
> > the atomic_add_unless?
> 
> Smells like the last reference can only be dropped with the mutex held,
> probably because someone is assuming that buffers don't go poof while
> holding the mutex. You should be able to drop this after some refcounting
> audit to make sure no one looks at a bo after the unref is done.

To elaborate: I think the reason here is that there's a bunch of weak
references (i.e. pointers that don't actually hold a reference) in the
various caches in libdrm (both bo cache and import cache). Those are only
protected by the mutex, which means if someone could drop the refcount to
0 without holding the mutex they'd try to increase the refcount of a bo
which is in the process of final destruction. Not great.

The approach the kernel uses is to have plain unlocked atomic_inc/dec for
all cases where there's a strong references. For the weak ones you use
atomic_in_unless_zero while holding a lock that the destructor also always
needs. That gives you enough synchronization to reliable detect when your
weak reference points at a zombie (in which case you just fail the
lookup). That gives you a nice fastpath and shifts all the cost to the
weak refcount lookup and final destruction (where you must grab the mutex
and use more expensive atomic ops).

In drm there's a bunch of examples using kref_get_unless_zero as reading
material.

Cheers, Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH 2/2] intel: Use I915_EXEC_NO_RELOC when available

2015-01-21 Thread Chris Wilson
On Tue, Jan 20, 2015 at 10:10:57PM -0800, Kristian Høgsberg wrote:
> However, drm_intel_bo_reference/unreference() is showing up on the
> profiles now that relocations are cheaper, but I think the better way
> to make those cheaper is to move the ref count to the public struct
> and make the ref/unref inline functions (calling out to a destructor
> for the refcount==0 case, of course).  On that note, do you know why
> drm_intel_gem_bo_unreference() is so convoluted? What's the deal with
> the atomic_add_unless?

Yeah, it's the optimised way to do an unref that then requires taking a
mutex. Releasing the last reference to the object is only threadsafe
under the mutex (otherwise another thread may preempt us, acquire the
ref and release it all before the first thread processes the free and
decouple it from the lists), but we want to avoid taking the mutex until
necessary.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx