[PATCH 2/2] libdrm_radeon: Optimize reloc writing to do less looping.

2010-03-10 Thread Pauli Nieminen
A bit hash table in the BO is checked first, so we can guarantee that this
BO is not already in this CS.

To guarantee that no other CS has the same id, the number of CSes that can
have an id is limited to 32. Adding and removing a reference in a BO is done
with atomic operations to allow parallel access to a BO from multiple
contexts.

This optimization decreases cs_write_reloc share of torcs profiling from 4.3%
to 2.6%.
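
Conceptually the fast path becomes a single bit test. A minimal sketch of the
scheme (not the patch code itself; field and helper names follow the diff
below, where each cs->id is a one-hot bit):

    if (!(bo->referenced_in_cs & cs->id)) {
        /* Bit clear: this BO cannot be in this CS yet, so skip the
         * linear scan over existing relocs and append directly.
         * Adding the one-hot id sets the bit; done atomically because
         * other contexts may touch the same BO concurrently. */
        __sync_add_and_fetch(&bo->referenced_in_cs, cs->id);
    } else {
        /* Bit set: the BO may already be referenced in this CS, so
         * fall back to scanning the existing relocs as before. */
    }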

Signed-off-by: Pauli Nieminen <suok...@gmail.com>
---
 radeon/radeon_bo_gem.c |1 +
 radeon/radeon_cs.c |6 ++
 radeon/radeon_cs.h |2 +-
 radeon/radeon_cs_gem.c |  133 ---
 radeon/radeon_cs_int.h |1 +
 5 files changed, 111 insertions(+), 32 deletions(-)

diff --git a/radeon/radeon_bo_gem.c b/radeon/radeon_bo_gem.c
index bc8058d..1b33bdb 100644
--- a/radeon/radeon_bo_gem.c
+++ b/radeon/radeon_bo_gem.c
@@ -80,6 +80,7 @@ static struct radeon_bo *bo_open(struct radeon_bo_manager *bom,
     bo->base.domains = domains;
     bo->base.flags = flags;
     bo->base.ptr = NULL;
+    bo->base.referenced_in_cs = 0;
     bo->map_count = 0;
     if (handle) {
         struct drm_gem_open open_arg;
diff --git a/radeon/radeon_cs.c b/radeon/radeon_cs.c
index cc9be39..d0e922b 100644
--- a/radeon/radeon_cs.c
+++ b/radeon/radeon_cs.c
@@ -88,3 +88,9 @@ void radeon_cs_space_set_flush(struct radeon_cs *cs, void (*fn)(void *), void *d
     csi->space_flush_fn = fn;
     csi->space_flush_data = data;
 }
+
+uint32_t radeon_cs_get_id(struct radeon_cs *cs)
+{
+    struct radeon_cs_int *csi = (struct radeon_cs_int *)cs;
+    return csi->id;
+}
diff --git a/radeon/radeon_cs.h b/radeon/radeon_cs.h
index 49d5d9a..7f6ee68 100644
--- a/radeon/radeon_cs.h
+++ b/radeon/radeon_cs.h
@@ -85,7 +85,7 @@ extern int radeon_cs_write_reloc(struct radeon_cs *cs,
                                  uint32_t read_domain,
                                  uint32_t write_domain,
                                  uint32_t flags);
-
+extern uint32_t radeon_cs_get_id(struct radeon_cs *cs);
 /*
  * add a persistent BO to the list
  * a persistent BO is one that will be referenced across flushes,
diff --git a/radeon/radeon_cs_gem.c b/radeon/radeon_cs_gem.c
index 45a219c..83aabea 100644
--- a/radeon/radeon_cs_gem.c
+++ b/radeon/radeon_cs_gem.c
@@ -32,6 +32,7 @@
 #include <assert.h>
 #include <errno.h>
 #include <stdlib.h>
+#include <pthread.h>
 #include <sys/mman.h>
 #include <sys/ioctl.h>
 #include "radeon_cs.h"
@@ -68,6 +69,66 @@ struct cs_gem {
     struct radeon_bo_int        **relocs_bo;
 };
 
+
+#if !defined(__GNUC__) || __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 2)
+/* no built in sync support in compiler define place holders */
+uint32_t __sync_add_and_fetch(uint32_t *a, uint32_t val)
+{
+   *a += val;
+   return val;
+}
+
+uint32_t __sync_add_and_fetch(uint32_t *a, uint32_t val)
+{
+   *a -= val;
+   return val;
+}
+#endif
+
+static pthread_mutex_t id_mutex = PTHREAD_MUTEX_INITIALIZER;
+static uint32_t cs_id_source = 0;
+
+/**
+ * result is undefined if called with ~0
+ */
+static uint32_t get_first_zero(const uint32_t n)
+{
+    /* __builtin_ctz returns number of trailing zeros. */
+    return 1 << __builtin_ctz(~n);
+}
+
+/**
+ * Returns a free id for cs.
+ * If there is no free id, we return zero.
+ **/
+static uint32_t generate_id(void)
+{
+    uint32_t r = 0;
+    pthread_mutex_lock( &id_mutex );
+    /* check for free ids */
+    if (cs_id_source != ~r) {
+        /* find first zero bit */
+        r = get_first_zero(cs_id_source);
+
+        /* set id as reserved */
+        cs_id_source |= r;
+    }
+    pthread_mutex_unlock( &id_mutex );
+    return r;
+}
+
+/**
+ * Free the id for later reuse
+ **/
+static void free_id(uint32_t id)
+{
+    pthread_mutex_lock( &id_mutex );
+
+    cs_id_source &= ~id;
+
+    pthread_mutex_unlock( &id_mutex );
+}
+
 static struct radeon_cs_int *cs_gem_create(struct radeon_cs_manager *csm,
                                            uint32_t ndw)
 {
@@ -90,6 +151,7 @@ static struct radeon_cs_int *cs_gem_create(struct 
radeon_cs_manager *csm,
 }
 csg-base.relocs_total_size = 0;
 csg-base.crelocs = 0;
+csg-base.id = generate_id();
 csg-nrelocs = 4096 / (4 * 4) ;
 csg-relocs_bo = (struct radeon_bo_int**)calloc(1,
 csg-nrelocs*sizeof(void*));
@@ -141,38 +203,43 @@ static int cs_gem_write_reloc(struct radeon_cs_int *cs,
     if (write_domain == RADEON_GEM_DOMAIN_CPU) {
         return -EINVAL;
     }
-    /* check if bo is already referenced */
-    for(i = 0; i < cs->crelocs; i++) {
-        idx = i * RELOC_SIZE;
-        reloc = (struct cs_reloc_gem*)&csg->relocs[idx];
-        if (reloc->handle == bo->handle) {
-            /* Check domains must be in read or write. As we check already
-             * checked that in argument one of the read or write domain was
-             * set we only need to check that if previous reloc as the read
-             * domain set then the read_domain should also
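
Worth noting: generate_id() hands out a one-hot bit mask rather than a small
integer, since get_first_zero(n) returns the lowest zero bit of n as a mask.
That is what makes the BO-side check a single AND. A small illustration of the
allocator (a sketch exercising the helpers above, not part of the patch):

    uint32_t a = generate_id();   /* cs_id_source was 0x0 -> a == 0x1 */
    uint32_t b = generate_id();   /* cs_id_source was 0x1 -> b == 0x2 */
    uint32_t c = generate_id();   /* cs_id_source was 0x3 -> c == 0x4 */
    free_id(b);                   /* bit 1 becomes available again */
    uint32_t d = generate_id();   /* lowest free bit again -> d == 0x2 */
    /* Once 32 ids are live, every bit of cs_id_source is set and
     * generate_id() returns 0, hence the limit of 32 in the commit
     * message. */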

Re: [PATCH 2/2] libdrm_radeon: Optimize reloc writing to do less looping.

2010-03-10 Thread Michel Dänzer
On Wed, 2010-03-10 at 18:20 +0200, Pauli Nieminen wrote: 
> A bit hash table in the BO is checked first, so we can guarantee that this
> BO is not already in this CS.
>
> To guarantee that no other CS has the same id, the number of CSes that can
> have an id is limited to 32. Adding and removing a reference in a BO is done
> with atomic operations to allow parallel access to a BO from multiple
> contexts.
>
> This optimization decreases cs_write_reloc share of torcs profiling from 4.3%
> to 2.6%.
>
> Signed-off-by: Pauli Nieminen <suok...@gmail.com>

[...]

> diff --git a/radeon/radeon_cs_gem.c b/radeon/radeon_cs_gem.c
> index 45a219c..83aabea 100644
> --- a/radeon/radeon_cs_gem.c
> +++ b/radeon/radeon_cs_gem.c
> @@ -68,6 +69,66 @@ struct cs_gem {
>      struct radeon_bo_int        **relocs_bo;
>  };
>
> +
> +#if !defined(__GNUC__) || __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 2)
> +/* no built in sync support in compiler define place holders */
> +uint32_t __sync_add_and_fetch(uint32_t *a, uint32_t val)
> +{
> +    *a += val;
> +    return val;
> +}
> +
> +uint32_t __sync_add_and_fetch(uint32_t *a, uint32_t val)
> +{
> +    *a -= val;
> +    return val;
> +}
> +#endif

This doesn't look like it could build... presumably the latter should be
called __sync_sub_and_fetch()?

Do these stand any chance of working properly in circumstances where
atomicity is actually important though?
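
(For illustration of the concern: keeping the update atomic without the
builtins would mean serializing it, e.g. behind a mutex. A minimal sketch
with hypothetical names mirroring the builtins, not proposed patch code:)

    #include <pthread.h>
    #include <stdint.h>

    /* Hypothetical mutex-guarded stand-in for __sync_add_and_fetch():
     * correct in the presence of concurrent callers, but serialized. */
    static pthread_mutex_t fallback_mutex = PTHREAD_MUTEX_INITIALIZER;

    static uint32_t fallback_add_and_fetch(uint32_t *a, uint32_t val)
    {
        uint32_t r;
        pthread_mutex_lock(&fallback_mutex);
        r = (*a += val);   /* update and read back under the lock */
        pthread_mutex_unlock(&fallback_mutex);
        return r;
    }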


-- 
Earthling Michel Dänzer   |http://www.vmware.com
Libre software enthusiast |  Debian, X and DRI developer



Re: [PATCH 2/2] libdrm_radeon: Optimize reloc writing to do less looping.

2010-03-10 Thread Pauli Nieminen
2010/3/10 Michel Dänzer <mic...@daenzer.net>:
> On Wed, 2010-03-10 at 18:20 +0200, Pauli Nieminen wrote:
>> A bit hash table in the BO is checked first, so we can guarantee that this
>> BO is not already in this CS.
>>
>> To guarantee that no other CS has the same id, the number of CSes that can
>> have an id is limited to 32. Adding and removing a reference in a BO is done
>> with atomic operations to allow parallel access to a BO from multiple
>> contexts.
>>
>> This optimization decreases cs_write_reloc share of torcs profiling from 4.3%
>> to 2.6%.
>>
>> Signed-off-by: Pauli Nieminen <suok...@gmail.com>
>
> [...]

>> diff --git a/radeon/radeon_cs_gem.c b/radeon/radeon_cs_gem.c
>> index 45a219c..83aabea 100644
>> --- a/radeon/radeon_cs_gem.c
>> +++ b/radeon/radeon_cs_gem.c
>> @@ -68,6 +69,66 @@ struct cs_gem {
>>      struct radeon_bo_int        **relocs_bo;
>>  };
>>
>> +
>> +#if !defined(__GNUC__) || __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 2)
>> +/* no built in sync support in compiler define place holders */
>> +uint32_t __sync_add_and_fetch(uint32_t *a, uint32_t val)
>> +{
>> +    *a += val;
>> +    return val;
>> +}
>> +
>> +uint32_t __sync_add_and_fetch(uint32_t *a, uint32_t val)
>> +{
>> +    *a -= val;
>> +    return val;
>> +}
>> +#endif

> This doesn't look like it could build... presumably the latter should be
> called __sync_sub_and_fetch()?


Sorry, wrong patch coming from somewhere :/

> Do these stand any chance of working properly in circumstances where
> atomicity is actually important though?


> --
> Earthling Michel Dänzer           |                http://www.vmware.com
> Libre software enthusiast         |          Debian, X and DRI developer

