Re: [Mesa-dev] [PATCH] winsys/radeon: consolidate hash table lookup

2014-04-12 Thread Christian König

Am 11.04.2014 19:03, schrieb Marek Olšák:

From: Marek Olšák marek.ol...@amd.com


Reviewed-by: Christian König christian.koe...@amd.com

BTW:

I've always wondered if the custom hash table is the best approach here. 
Having a BO active in more than one command submission context at the 
same time sounds rather unlikely to me.


Something like storing a pointer to the last used CS and it's index in 
the BO and only when a BO is really used in more than one CS context at 
the same time fall back to the hashtable and lockup the index from there 
sounds like less overhead.




I should have done this long ago.
---
  src/gallium/winsys/radeon/drm/radeon_drm_cs.c | 110 +++---
  src/gallium/winsys/radeon/drm/radeon_drm_cs.h |   7 +-
  2 files changed, 49 insertions(+), 68 deletions(-)

diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c 
b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
index db9fbfa..b55eb80 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
@@ -188,41 +188,40 @@ static INLINE void update_reloc(struct 
drm_radeon_cs_reloc *reloc,
  reloc-flags = MAX2(reloc-flags, priority);
  }
  
-int radeon_get_reloc(struct radeon_cs_context *csc, struct radeon_bo *bo)

+int radeon_get_reloc(struct radeon_cs_context *csc, struct radeon_bo *bo,
+ struct drm_radeon_cs_reloc **out_reloc)
  {
-struct drm_radeon_cs_reloc *reloc;
-unsigned i;
+struct drm_radeon_cs_reloc *reloc = NULL;
  unsigned hash = bo-handle  (sizeof(csc-is_handle_added)-1);
+int i = -1;
  
  if (csc-is_handle_added[hash]) {

  i = csc-reloc_indices_hashlist[hash];
  reloc = csc-relocs[i];
-if (reloc-handle == bo-handle) {
-return i;
-}
  
-/* Hash collision, look for the BO in the list of relocs linearly. */

-for (i = csc-crelocs; i != 0;) {
---i;
-reloc = csc-relocs[i];
-if (reloc-handle == bo-handle) {
-/* Put this reloc in the hash list.
- * This will prevent additional hash collisions if there are
- * several consecutive get_reloc calls for the same buffer.
- *
- * Example: Assuming buffers A,B,C collide in the hash list,
- * the following sequence of relocs:
- * AAABB
- * will collide here: ^ and here:   ^,
- * meaning that we should get very few collisions in the end. 
*/
-csc-reloc_indices_hashlist[hash] = i;
-/*printf(write_reloc collision, hash: %i, handle: %i\n, hash, 
bo-handle);*/
-return i;
+if (reloc-handle != bo-handle) {
+/* Hash collision, look for the BO in the list of relocs linearly. 
*/
+for (i = csc-crelocs - 1; i = 0; i--) {
+reloc = csc-relocs[i];
+if (reloc-handle == bo-handle) {
+/* Put this reloc in the hash list.
+ * This will prevent additional hash collisions if there 
are
+ * several consecutive get_reloc calls for the same buffer.
+ *
+ * Example: Assuming buffers A,B,C collide in the hash 
list,
+ * the following sequence of relocs:
+ * AAABB
+ * will collide here: ^ and here:   ^,
+ * meaning that we should get very few collisions in the 
end. */
+csc-reloc_indices_hashlist[hash] = i;
+break;
+}
  }
  }
  }
-
-return -1;
+if (out_reloc)
+*out_reloc = reloc;
+return i;
  }
  
  static unsigned radeon_add_reloc(struct radeon_drm_cs *cs,

@@ -237,45 +236,28 @@ static unsigned radeon_add_reloc(struct radeon_drm_cs *cs,
  unsigned hash = bo-handle  (sizeof(csc-is_handle_added)-1);
  enum radeon_bo_domain rd = usage  RADEON_USAGE_READ ? domains : 0;
  enum radeon_bo_domain wd = usage  RADEON_USAGE_WRITE ? domains : 0;
-bool update_hash = TRUE;
-int i;
+int i = -1;
  
  priority = MIN2(priority, 15);

  *added_domains = 0;
  
-if (csc-is_handle_added[hash]) {

-i = csc-reloc_indices_hashlist[hash];
-reloc = csc-relocs[i];
-
-if (reloc-handle != bo-handle) {
-/* Hash collision, look for the BO in the list of relocs linearly. 
*/
-for (i = csc-crelocs - 1; i = 0; i--) {
-reloc = csc-relocs[i];
-if (reloc-handle == bo-handle) {
-/*printf(write_reloc collision, hash: %i, handle: %i\n, 
hash, bo-handle);*/
-break;
-}
-}
-}
-
-if (i = 0) {
-update_reloc(reloc, rd, wd, priority, 

Re: [Mesa-dev] [PATCH] winsys/radeon: consolidate hash table lookup

2014-04-12 Thread Marek Olšák
Do you mean having bo-cs and bo-index? That would work, but there
would have to be mutex_lock and mutex_unlock when accessing the
variables. I'm not sure if locking a mutex is more expensive than the
hash table lookup. OpenGL with GLX allows sharing buffers and textures
between contexts, so we must play safe.

Note that if all handles are less than 512, the lookup is always in
O(1), so it's very cheap. We can increase the number to 1024 or 2048
if we find out that apps use too many buffers.

Marek



On Sat, Apr 12, 2014 at 10:35 AM, Christian König
deathsim...@vodafone.de wrote:
 Am 11.04.2014 19:03, schrieb Marek Olšák:

 From: Marek Olšák marek.ol...@amd.com


 Reviewed-by: Christian König christian.koe...@amd.com

 BTW:

 I've always wondered if the custom hash table is the best approach here.
 Having a BO active in more than one command submission context at the same
 time sounds rather unlikely to me.

 Something like storing a pointer to the last used CS and it's index in the
 BO and only when a BO is really used in more than one CS context at the same
 time fall back to the hashtable and lockup the index from there sounds like
 less overhead.



 I should have done this long ago.
 ---
   src/gallium/winsys/radeon/drm/radeon_drm_cs.c | 110
 +++---
   src/gallium/winsys/radeon/drm/radeon_drm_cs.h |   7 +-
   2 files changed, 49 insertions(+), 68 deletions(-)

 diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
 b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
 index db9fbfa..b55eb80 100644
 --- a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
 +++ b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
 @@ -188,41 +188,40 @@ static INLINE void update_reloc(struct
 drm_radeon_cs_reloc *reloc,
   reloc-flags = MAX2(reloc-flags, priority);
   }
   -int radeon_get_reloc(struct radeon_cs_context *csc, struct radeon_bo
 *bo)
 +int radeon_get_reloc(struct radeon_cs_context *csc, struct radeon_bo *bo,
 + struct drm_radeon_cs_reloc **out_reloc)
   {
 -struct drm_radeon_cs_reloc *reloc;
 -unsigned i;
 +struct drm_radeon_cs_reloc *reloc = NULL;
   unsigned hash = bo-handle  (sizeof(csc-is_handle_added)-1);
 +int i = -1;
 if (csc-is_handle_added[hash]) {
   i = csc-reloc_indices_hashlist[hash];
   reloc = csc-relocs[i];
 -if (reloc-handle == bo-handle) {
 -return i;
 -}
   -/* Hash collision, look for the BO in the list of relocs
 linearly. */
 -for (i = csc-crelocs; i != 0;) {
 ---i;
 -reloc = csc-relocs[i];
 -if (reloc-handle == bo-handle) {
 -/* Put this reloc in the hash list.
 - * This will prevent additional hash collisions if there
 are
 - * several consecutive get_reloc calls for the same
 buffer.
 - *
 - * Example: Assuming buffers A,B,C collide in the hash
 list,
 - * the following sequence of relocs:
 - * AAABB
 - * will collide here: ^ and here:   ^,
 - * meaning that we should get very few collisions in the
 end. */
 -csc-reloc_indices_hashlist[hash] = i;
 -/*printf(write_reloc collision, hash: %i, handle: %i\n,
 hash, bo-handle);*/
 -return i;
 +if (reloc-handle != bo-handle) {
 +/* Hash collision, look for the BO in the list of relocs
 linearly. */
 +for (i = csc-crelocs - 1; i = 0; i--) {
 +reloc = csc-relocs[i];
 +if (reloc-handle == bo-handle) {
 +/* Put this reloc in the hash list.
 + * This will prevent additional hash collisions if
 there are
 + * several consecutive get_reloc calls for the same
 buffer.
 + *
 + * Example: Assuming buffers A,B,C collide in the
 hash list,
 + * the following sequence of relocs:
 + * AAABB
 + * will collide here: ^ and here:   ^,
 + * meaning that we should get very few collisions in
 the end. */
 +csc-reloc_indices_hashlist[hash] = i;
 +break;
 +}
   }
   }
   }
 -
 -return -1;
 +if (out_reloc)
 +*out_reloc = reloc;
 +return i;
   }
 static unsigned radeon_add_reloc(struct radeon_drm_cs *cs,
 @@ -237,45 +236,28 @@ static unsigned radeon_add_reloc(struct
 radeon_drm_cs *cs,
   unsigned hash = bo-handle  (sizeof(csc-is_handle_added)-1);
   enum radeon_bo_domain rd = usage  RADEON_USAGE_READ ? domains : 0;
   enum radeon_bo_domain wd = usage  RADEON_USAGE_WRITE ? domains : 0;
 -bool update_hash = TRUE;
 -int i;
 +int i = -1;
 priority = MIN2(priority, 15);
   

Re: [Mesa-dev] [PATCH] winsys/radeon: consolidate hash table lookup

2014-04-12 Thread Christian König

Am 12.04.2014 12:52, schrieb Marek Olšák:

Do you mean having bo-cs and bo-index? That would work, but there
would have to be mutex_lock and mutex_unlock when accessing the
variables. I'm not sure if locking a mutex is more expensive than the
hash table lookup. OpenGL with GLX allows sharing buffers and textures
between contexts, so we must play safe.


Obviously thread safety must be taken care of. Maybe we could squash CS 
identifier and index into a 64bit atomic? If we need to lock a mutex for 
every access it would indeed take longer than the hashtable handling.



Note that if all handles are less than 512, the lookup is always in
O(1), so it's very cheap. We can increase the number to 1024 or 2048
if we find out that apps use too many buffers.


Didn't knew that there are so many entries in the hashtable, thought 
that we rather get a collision every 32 handles or something like this. 
If we have O(1) anyway most of the time than changing it would probably 
not make much sense.


Christian.



Marek



On Sat, Apr 12, 2014 at 10:35 AM, Christian König
deathsim...@vodafone.de wrote:

Am 11.04.2014 19:03, schrieb Marek Olšák:

From: Marek Olšák marek.ol...@amd.com


Reviewed-by: Christian König christian.koe...@amd.com

BTW:

I've always wondered if the custom hash table is the best approach here.
Having a BO active in more than one command submission context at the same
time sounds rather unlikely to me.

Something like storing a pointer to the last used CS and it's index in the
BO and only when a BO is really used in more than one CS context at the same
time fall back to the hashtable and lockup the index from there sounds like
less overhead.



I should have done this long ago.
---
   src/gallium/winsys/radeon/drm/radeon_drm_cs.c | 110
+++---
   src/gallium/winsys/radeon/drm/radeon_drm_cs.h |   7 +-
   2 files changed, 49 insertions(+), 68 deletions(-)

diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
index db9fbfa..b55eb80 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
@@ -188,41 +188,40 @@ static INLINE void update_reloc(struct
drm_radeon_cs_reloc *reloc,
   reloc-flags = MAX2(reloc-flags, priority);
   }
   -int radeon_get_reloc(struct radeon_cs_context *csc, struct radeon_bo
*bo)
+int radeon_get_reloc(struct radeon_cs_context *csc, struct radeon_bo *bo,
+ struct drm_radeon_cs_reloc **out_reloc)
   {
-struct drm_radeon_cs_reloc *reloc;
-unsigned i;
+struct drm_radeon_cs_reloc *reloc = NULL;
   unsigned hash = bo-handle  (sizeof(csc-is_handle_added)-1);
+int i = -1;
 if (csc-is_handle_added[hash]) {
   i = csc-reloc_indices_hashlist[hash];
   reloc = csc-relocs[i];
-if (reloc-handle == bo-handle) {
-return i;
-}
   -/* Hash collision, look for the BO in the list of relocs
linearly. */
-for (i = csc-crelocs; i != 0;) {
---i;
-reloc = csc-relocs[i];
-if (reloc-handle == bo-handle) {
-/* Put this reloc in the hash list.
- * This will prevent additional hash collisions if there
are
- * several consecutive get_reloc calls for the same
buffer.
- *
- * Example: Assuming buffers A,B,C collide in the hash
list,
- * the following sequence of relocs:
- * AAABB
- * will collide here: ^ and here:   ^,
- * meaning that we should get very few collisions in the
end. */
-csc-reloc_indices_hashlist[hash] = i;
-/*printf(write_reloc collision, hash: %i, handle: %i\n,
hash, bo-handle);*/
-return i;
+if (reloc-handle != bo-handle) {
+/* Hash collision, look for the BO in the list of relocs
linearly. */
+for (i = csc-crelocs - 1; i = 0; i--) {
+reloc = csc-relocs[i];
+if (reloc-handle == bo-handle) {
+/* Put this reloc in the hash list.
+ * This will prevent additional hash collisions if
there are
+ * several consecutive get_reloc calls for the same
buffer.
+ *
+ * Example: Assuming buffers A,B,C collide in the
hash list,
+ * the following sequence of relocs:
+ * AAABB
+ * will collide here: ^ and here:   ^,
+ * meaning that we should get very few collisions in
the end. */
+csc-reloc_indices_hashlist[hash] = i;
+break;
+}
   }
   }
   }
-
-return -1;
+if (out_reloc)
+*out_reloc = reloc;
+return i;
   }
 static unsigned 

[Mesa-dev] [PATCH] winsys/radeon: consolidate hash table lookup

2014-04-11 Thread Marek Olšák
From: Marek Olšák marek.ol...@amd.com

I should have done this long ago.
---
 src/gallium/winsys/radeon/drm/radeon_drm_cs.c | 110 +++---
 src/gallium/winsys/radeon/drm/radeon_drm_cs.h |   7 +-
 2 files changed, 49 insertions(+), 68 deletions(-)

diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c 
b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
index db9fbfa..b55eb80 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
@@ -188,41 +188,40 @@ static INLINE void update_reloc(struct 
drm_radeon_cs_reloc *reloc,
 reloc-flags = MAX2(reloc-flags, priority);
 }
 
-int radeon_get_reloc(struct radeon_cs_context *csc, struct radeon_bo *bo)
+int radeon_get_reloc(struct radeon_cs_context *csc, struct radeon_bo *bo,
+ struct drm_radeon_cs_reloc **out_reloc)
 {
-struct drm_radeon_cs_reloc *reloc;
-unsigned i;
+struct drm_radeon_cs_reloc *reloc = NULL;
 unsigned hash = bo-handle  (sizeof(csc-is_handle_added)-1);
+int i = -1;
 
 if (csc-is_handle_added[hash]) {
 i = csc-reloc_indices_hashlist[hash];
 reloc = csc-relocs[i];
-if (reloc-handle == bo-handle) {
-return i;
-}
 
-/* Hash collision, look for the BO in the list of relocs linearly. */
-for (i = csc-crelocs; i != 0;) {
---i;
-reloc = csc-relocs[i];
-if (reloc-handle == bo-handle) {
-/* Put this reloc in the hash list.
- * This will prevent additional hash collisions if there are
- * several consecutive get_reloc calls for the same buffer.
- *
- * Example: Assuming buffers A,B,C collide in the hash list,
- * the following sequence of relocs:
- * AAABB
- * will collide here: ^ and here:   ^,
- * meaning that we should get very few collisions in the end. 
*/
-csc-reloc_indices_hashlist[hash] = i;
-/*printf(write_reloc collision, hash: %i, handle: %i\n, 
hash, bo-handle);*/
-return i;
+if (reloc-handle != bo-handle) {
+/* Hash collision, look for the BO in the list of relocs linearly. 
*/
+for (i = csc-crelocs - 1; i = 0; i--) {
+reloc = csc-relocs[i];
+if (reloc-handle == bo-handle) {
+/* Put this reloc in the hash list.
+ * This will prevent additional hash collisions if there 
are
+ * several consecutive get_reloc calls for the same buffer.
+ *
+ * Example: Assuming buffers A,B,C collide in the hash 
list,
+ * the following sequence of relocs:
+ * AAABB
+ * will collide here: ^ and here:   ^,
+ * meaning that we should get very few collisions in the 
end. */
+csc-reloc_indices_hashlist[hash] = i;
+break;
+}
 }
 }
 }
-
-return -1;
+if (out_reloc)
+*out_reloc = reloc;
+return i;
 }
 
 static unsigned radeon_add_reloc(struct radeon_drm_cs *cs,
@@ -237,45 +236,28 @@ static unsigned radeon_add_reloc(struct radeon_drm_cs *cs,
 unsigned hash = bo-handle  (sizeof(csc-is_handle_added)-1);
 enum radeon_bo_domain rd = usage  RADEON_USAGE_READ ? domains : 0;
 enum radeon_bo_domain wd = usage  RADEON_USAGE_WRITE ? domains : 0;
-bool update_hash = TRUE;
-int i;
+int i = -1;
 
 priority = MIN2(priority, 15);
 *added_domains = 0;
 
-if (csc-is_handle_added[hash]) {
-i = csc-reloc_indices_hashlist[hash];
-reloc = csc-relocs[i];
-
-if (reloc-handle != bo-handle) {
-/* Hash collision, look for the BO in the list of relocs linearly. 
*/
-for (i = csc-crelocs - 1; i = 0; i--) {
-reloc = csc-relocs[i];
-if (reloc-handle == bo-handle) {
-/*printf(write_reloc collision, hash: %i, handle: %i\n, 
hash, bo-handle);*/
-break;
-}
-}
-}
-
-if (i = 0) {
-update_reloc(reloc, rd, wd, priority, added_domains);
-
-/* For async DMA, every add_reloc call must add a buffer to the 
list
- * no matter how many duplicates there are. This is due to the fact
- * the DMA CS checker doesn't use NOP packets for offset patching,
- * but always uses the i-th buffer from the list to patch the i-th
- * offset. If there are N offsets in a DMA CS, there must also be N
- * buffers in the relocation list.
- *
- * This doesn't have to be done if virtual memory is enabled,
- * because