Re: [Mesa-dev] [PATCH] winsys/radeon: consolidate hash table lookup
Am 11.04.2014 19:03, schrieb Marek Olšák: From: Marek Olšák marek.ol...@amd.com Reviewed-by: Christian König christian.koe...@amd.com BTW: I've always wondered if the custom hash table is the best approach here. Having a BO active in more than one command submission context at the same time sounds rather unlikely to me. Something like storing a pointer to the last used CS and it's index in the BO and only when a BO is really used in more than one CS context at the same time fall back to the hashtable and lockup the index from there sounds like less overhead. I should have done this long ago. --- src/gallium/winsys/radeon/drm/radeon_drm_cs.c | 110 +++--- src/gallium/winsys/radeon/drm/radeon_drm_cs.h | 7 +- 2 files changed, 49 insertions(+), 68 deletions(-) diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c index db9fbfa..b55eb80 100644 --- a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c +++ b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c @@ -188,41 +188,40 @@ static INLINE void update_reloc(struct drm_radeon_cs_reloc *reloc, reloc-flags = MAX2(reloc-flags, priority); } -int radeon_get_reloc(struct radeon_cs_context *csc, struct radeon_bo *bo) +int radeon_get_reloc(struct radeon_cs_context *csc, struct radeon_bo *bo, + struct drm_radeon_cs_reloc **out_reloc) { -struct drm_radeon_cs_reloc *reloc; -unsigned i; +struct drm_radeon_cs_reloc *reloc = NULL; unsigned hash = bo-handle (sizeof(csc-is_handle_added)-1); +int i = -1; if (csc-is_handle_added[hash]) { i = csc-reloc_indices_hashlist[hash]; reloc = csc-relocs[i]; -if (reloc-handle == bo-handle) { -return i; -} -/* Hash collision, look for the BO in the list of relocs linearly. */ -for (i = csc-crelocs; i != 0;) { ---i; -reloc = csc-relocs[i]; -if (reloc-handle == bo-handle) { -/* Put this reloc in the hash list. - * This will prevent additional hash collisions if there are - * several consecutive get_reloc calls for the same buffer. - * - * Example: Assuming buffers A,B,C collide in the hash list, - * the following sequence of relocs: - * AAABB - * will collide here: ^ and here: ^, - * meaning that we should get very few collisions in the end. */ -csc-reloc_indices_hashlist[hash] = i; -/*printf(write_reloc collision, hash: %i, handle: %i\n, hash, bo-handle);*/ -return i; +if (reloc-handle != bo-handle) { +/* Hash collision, look for the BO in the list of relocs linearly. */ +for (i = csc-crelocs - 1; i = 0; i--) { +reloc = csc-relocs[i]; +if (reloc-handle == bo-handle) { +/* Put this reloc in the hash list. + * This will prevent additional hash collisions if there are + * several consecutive get_reloc calls for the same buffer. + * + * Example: Assuming buffers A,B,C collide in the hash list, + * the following sequence of relocs: + * AAABB + * will collide here: ^ and here: ^, + * meaning that we should get very few collisions in the end. */ +csc-reloc_indices_hashlist[hash] = i; +break; +} } } } - -return -1; +if (out_reloc) +*out_reloc = reloc; +return i; } static unsigned radeon_add_reloc(struct radeon_drm_cs *cs, @@ -237,45 +236,28 @@ static unsigned radeon_add_reloc(struct radeon_drm_cs *cs, unsigned hash = bo-handle (sizeof(csc-is_handle_added)-1); enum radeon_bo_domain rd = usage RADEON_USAGE_READ ? domains : 0; enum radeon_bo_domain wd = usage RADEON_USAGE_WRITE ? domains : 0; -bool update_hash = TRUE; -int i; +int i = -1; priority = MIN2(priority, 15); *added_domains = 0; -if (csc-is_handle_added[hash]) { -i = csc-reloc_indices_hashlist[hash]; -reloc = csc-relocs[i]; - -if (reloc-handle != bo-handle) { -/* Hash collision, look for the BO in the list of relocs linearly. */ -for (i = csc-crelocs - 1; i = 0; i--) { -reloc = csc-relocs[i]; -if (reloc-handle == bo-handle) { -/*printf(write_reloc collision, hash: %i, handle: %i\n, hash, bo-handle);*/ -break; -} -} -} - -if (i = 0) { -update_reloc(reloc, rd, wd, priority,
Re: [Mesa-dev] [PATCH] winsys/radeon: consolidate hash table lookup
Do you mean having bo-cs and bo-index? That would work, but there would have to be mutex_lock and mutex_unlock when accessing the variables. I'm not sure if locking a mutex is more expensive than the hash table lookup. OpenGL with GLX allows sharing buffers and textures between contexts, so we must play safe. Note that if all handles are less than 512, the lookup is always in O(1), so it's very cheap. We can increase the number to 1024 or 2048 if we find out that apps use too many buffers. Marek On Sat, Apr 12, 2014 at 10:35 AM, Christian König deathsim...@vodafone.de wrote: Am 11.04.2014 19:03, schrieb Marek Olšák: From: Marek Olšák marek.ol...@amd.com Reviewed-by: Christian König christian.koe...@amd.com BTW: I've always wondered if the custom hash table is the best approach here. Having a BO active in more than one command submission context at the same time sounds rather unlikely to me. Something like storing a pointer to the last used CS and it's index in the BO and only when a BO is really used in more than one CS context at the same time fall back to the hashtable and lockup the index from there sounds like less overhead. I should have done this long ago. --- src/gallium/winsys/radeon/drm/radeon_drm_cs.c | 110 +++--- src/gallium/winsys/radeon/drm/radeon_drm_cs.h | 7 +- 2 files changed, 49 insertions(+), 68 deletions(-) diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c index db9fbfa..b55eb80 100644 --- a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c +++ b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c @@ -188,41 +188,40 @@ static INLINE void update_reloc(struct drm_radeon_cs_reloc *reloc, reloc-flags = MAX2(reloc-flags, priority); } -int radeon_get_reloc(struct radeon_cs_context *csc, struct radeon_bo *bo) +int radeon_get_reloc(struct radeon_cs_context *csc, struct radeon_bo *bo, + struct drm_radeon_cs_reloc **out_reloc) { -struct drm_radeon_cs_reloc *reloc; -unsigned i; +struct drm_radeon_cs_reloc *reloc = NULL; unsigned hash = bo-handle (sizeof(csc-is_handle_added)-1); +int i = -1; if (csc-is_handle_added[hash]) { i = csc-reloc_indices_hashlist[hash]; reloc = csc-relocs[i]; -if (reloc-handle == bo-handle) { -return i; -} -/* Hash collision, look for the BO in the list of relocs linearly. */ -for (i = csc-crelocs; i != 0;) { ---i; -reloc = csc-relocs[i]; -if (reloc-handle == bo-handle) { -/* Put this reloc in the hash list. - * This will prevent additional hash collisions if there are - * several consecutive get_reloc calls for the same buffer. - * - * Example: Assuming buffers A,B,C collide in the hash list, - * the following sequence of relocs: - * AAABB - * will collide here: ^ and here: ^, - * meaning that we should get very few collisions in the end. */ -csc-reloc_indices_hashlist[hash] = i; -/*printf(write_reloc collision, hash: %i, handle: %i\n, hash, bo-handle);*/ -return i; +if (reloc-handle != bo-handle) { +/* Hash collision, look for the BO in the list of relocs linearly. */ +for (i = csc-crelocs - 1; i = 0; i--) { +reloc = csc-relocs[i]; +if (reloc-handle == bo-handle) { +/* Put this reloc in the hash list. + * This will prevent additional hash collisions if there are + * several consecutive get_reloc calls for the same buffer. + * + * Example: Assuming buffers A,B,C collide in the hash list, + * the following sequence of relocs: + * AAABB + * will collide here: ^ and here: ^, + * meaning that we should get very few collisions in the end. */ +csc-reloc_indices_hashlist[hash] = i; +break; +} } } } - -return -1; +if (out_reloc) +*out_reloc = reloc; +return i; } static unsigned radeon_add_reloc(struct radeon_drm_cs *cs, @@ -237,45 +236,28 @@ static unsigned radeon_add_reloc(struct radeon_drm_cs *cs, unsigned hash = bo-handle (sizeof(csc-is_handle_added)-1); enum radeon_bo_domain rd = usage RADEON_USAGE_READ ? domains : 0; enum radeon_bo_domain wd = usage RADEON_USAGE_WRITE ? domains : 0; -bool update_hash = TRUE; -int i; +int i = -1; priority = MIN2(priority, 15);
Re: [Mesa-dev] [PATCH] winsys/radeon: consolidate hash table lookup
Am 12.04.2014 12:52, schrieb Marek Olšák: Do you mean having bo-cs and bo-index? That would work, but there would have to be mutex_lock and mutex_unlock when accessing the variables. I'm not sure if locking a mutex is more expensive than the hash table lookup. OpenGL with GLX allows sharing buffers and textures between contexts, so we must play safe. Obviously thread safety must be taken care of. Maybe we could squash CS identifier and index into a 64bit atomic? If we need to lock a mutex for every access it would indeed take longer than the hashtable handling. Note that if all handles are less than 512, the lookup is always in O(1), so it's very cheap. We can increase the number to 1024 or 2048 if we find out that apps use too many buffers. Didn't knew that there are so many entries in the hashtable, thought that we rather get a collision every 32 handles or something like this. If we have O(1) anyway most of the time than changing it would probably not make much sense. Christian. Marek On Sat, Apr 12, 2014 at 10:35 AM, Christian König deathsim...@vodafone.de wrote: Am 11.04.2014 19:03, schrieb Marek Olšák: From: Marek Olšák marek.ol...@amd.com Reviewed-by: Christian König christian.koe...@amd.com BTW: I've always wondered if the custom hash table is the best approach here. Having a BO active in more than one command submission context at the same time sounds rather unlikely to me. Something like storing a pointer to the last used CS and it's index in the BO and only when a BO is really used in more than one CS context at the same time fall back to the hashtable and lockup the index from there sounds like less overhead. I should have done this long ago. --- src/gallium/winsys/radeon/drm/radeon_drm_cs.c | 110 +++--- src/gallium/winsys/radeon/drm/radeon_drm_cs.h | 7 +- 2 files changed, 49 insertions(+), 68 deletions(-) diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c index db9fbfa..b55eb80 100644 --- a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c +++ b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c @@ -188,41 +188,40 @@ static INLINE void update_reloc(struct drm_radeon_cs_reloc *reloc, reloc-flags = MAX2(reloc-flags, priority); } -int radeon_get_reloc(struct radeon_cs_context *csc, struct radeon_bo *bo) +int radeon_get_reloc(struct radeon_cs_context *csc, struct radeon_bo *bo, + struct drm_radeon_cs_reloc **out_reloc) { -struct drm_radeon_cs_reloc *reloc; -unsigned i; +struct drm_radeon_cs_reloc *reloc = NULL; unsigned hash = bo-handle (sizeof(csc-is_handle_added)-1); +int i = -1; if (csc-is_handle_added[hash]) { i = csc-reloc_indices_hashlist[hash]; reloc = csc-relocs[i]; -if (reloc-handle == bo-handle) { -return i; -} -/* Hash collision, look for the BO in the list of relocs linearly. */ -for (i = csc-crelocs; i != 0;) { ---i; -reloc = csc-relocs[i]; -if (reloc-handle == bo-handle) { -/* Put this reloc in the hash list. - * This will prevent additional hash collisions if there are - * several consecutive get_reloc calls for the same buffer. - * - * Example: Assuming buffers A,B,C collide in the hash list, - * the following sequence of relocs: - * AAABB - * will collide here: ^ and here: ^, - * meaning that we should get very few collisions in the end. */ -csc-reloc_indices_hashlist[hash] = i; -/*printf(write_reloc collision, hash: %i, handle: %i\n, hash, bo-handle);*/ -return i; +if (reloc-handle != bo-handle) { +/* Hash collision, look for the BO in the list of relocs linearly. */ +for (i = csc-crelocs - 1; i = 0; i--) { +reloc = csc-relocs[i]; +if (reloc-handle == bo-handle) { +/* Put this reloc in the hash list. + * This will prevent additional hash collisions if there are + * several consecutive get_reloc calls for the same buffer. + * + * Example: Assuming buffers A,B,C collide in the hash list, + * the following sequence of relocs: + * AAABB + * will collide here: ^ and here: ^, + * meaning that we should get very few collisions in the end. */ +csc-reloc_indices_hashlist[hash] = i; +break; +} } } } - -return -1; +if (out_reloc) +*out_reloc = reloc; +return i; } static unsigned
[Mesa-dev] [PATCH] winsys/radeon: consolidate hash table lookup
From: Marek Olšák marek.ol...@amd.com I should have done this long ago. --- src/gallium/winsys/radeon/drm/radeon_drm_cs.c | 110 +++--- src/gallium/winsys/radeon/drm/radeon_drm_cs.h | 7 +- 2 files changed, 49 insertions(+), 68 deletions(-) diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c index db9fbfa..b55eb80 100644 --- a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c +++ b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c @@ -188,41 +188,40 @@ static INLINE void update_reloc(struct drm_radeon_cs_reloc *reloc, reloc-flags = MAX2(reloc-flags, priority); } -int radeon_get_reloc(struct radeon_cs_context *csc, struct radeon_bo *bo) +int radeon_get_reloc(struct radeon_cs_context *csc, struct radeon_bo *bo, + struct drm_radeon_cs_reloc **out_reloc) { -struct drm_radeon_cs_reloc *reloc; -unsigned i; +struct drm_radeon_cs_reloc *reloc = NULL; unsigned hash = bo-handle (sizeof(csc-is_handle_added)-1); +int i = -1; if (csc-is_handle_added[hash]) { i = csc-reloc_indices_hashlist[hash]; reloc = csc-relocs[i]; -if (reloc-handle == bo-handle) { -return i; -} -/* Hash collision, look for the BO in the list of relocs linearly. */ -for (i = csc-crelocs; i != 0;) { ---i; -reloc = csc-relocs[i]; -if (reloc-handle == bo-handle) { -/* Put this reloc in the hash list. - * This will prevent additional hash collisions if there are - * several consecutive get_reloc calls for the same buffer. - * - * Example: Assuming buffers A,B,C collide in the hash list, - * the following sequence of relocs: - * AAABB - * will collide here: ^ and here: ^, - * meaning that we should get very few collisions in the end. */ -csc-reloc_indices_hashlist[hash] = i; -/*printf(write_reloc collision, hash: %i, handle: %i\n, hash, bo-handle);*/ -return i; +if (reloc-handle != bo-handle) { +/* Hash collision, look for the BO in the list of relocs linearly. */ +for (i = csc-crelocs - 1; i = 0; i--) { +reloc = csc-relocs[i]; +if (reloc-handle == bo-handle) { +/* Put this reloc in the hash list. + * This will prevent additional hash collisions if there are + * several consecutive get_reloc calls for the same buffer. + * + * Example: Assuming buffers A,B,C collide in the hash list, + * the following sequence of relocs: + * AAABB + * will collide here: ^ and here: ^, + * meaning that we should get very few collisions in the end. */ +csc-reloc_indices_hashlist[hash] = i; +break; +} } } } - -return -1; +if (out_reloc) +*out_reloc = reloc; +return i; } static unsigned radeon_add_reloc(struct radeon_drm_cs *cs, @@ -237,45 +236,28 @@ static unsigned radeon_add_reloc(struct radeon_drm_cs *cs, unsigned hash = bo-handle (sizeof(csc-is_handle_added)-1); enum radeon_bo_domain rd = usage RADEON_USAGE_READ ? domains : 0; enum radeon_bo_domain wd = usage RADEON_USAGE_WRITE ? domains : 0; -bool update_hash = TRUE; -int i; +int i = -1; priority = MIN2(priority, 15); *added_domains = 0; -if (csc-is_handle_added[hash]) { -i = csc-reloc_indices_hashlist[hash]; -reloc = csc-relocs[i]; - -if (reloc-handle != bo-handle) { -/* Hash collision, look for the BO in the list of relocs linearly. */ -for (i = csc-crelocs - 1; i = 0; i--) { -reloc = csc-relocs[i]; -if (reloc-handle == bo-handle) { -/*printf(write_reloc collision, hash: %i, handle: %i\n, hash, bo-handle);*/ -break; -} -} -} - -if (i = 0) { -update_reloc(reloc, rd, wd, priority, added_domains); - -/* For async DMA, every add_reloc call must add a buffer to the list - * no matter how many duplicates there are. This is due to the fact - * the DMA CS checker doesn't use NOP packets for offset patching, - * but always uses the i-th buffer from the list to patch the i-th - * offset. If there are N offsets in a DMA CS, there must also be N - * buffers in the relocation list. - * - * This doesn't have to be done if virtual memory is enabled, - * because