Re: [Mesa-dev] [PATCH 1/2] winsys/amdgpu: adjust IB size based on buffer wait time

2016-04-20 Thread Marek Olšák
On Wed, Apr 20, 2016 at 8:18 PM, Grigori Goronzy  wrote:
> On 2016-04-20 02:20, Nicolai Hähnle wrote:
>>
>> This is just a slight massaging of the patch you sent previously. What
>> happened to the discussion we had about how to do this properly?
>>
>
> This already provides good value as-is and it is (IMHO) reasonably clean, so
> why not include it for the time being? Marek seemed to like the general
> concept as well. I agree that basing IB size on GPU idleness is a great idea
> and I'll look into that, either as an alternative or as an addition to this.
>
> Just a random question: we can count on up to date gfx fence sequence
> numbers being available without any calls into the kernel, right? The winsys
> code makes that conditional and calls into the kernel when no fence pointer
> is available.

Yes, there is a fence buffer you can read to get the latest sequence
numbers for different rings. That buffer might not be updated when IBs
of other process finish though.

Only UVD and VCE don't support user fences.

Marek
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 1/2] winsys/amdgpu: adjust IB size based on buffer wait time

2016-04-20 Thread Grigori Goronzy

On 2016-04-20 02:20, Nicolai Hähnle wrote:

This is just a slight massaging of the patch you sent previously. What
happened to the discussion we had about how to do this properly?



This already provides good value as-is and it is (IMHO) reasonably 
clean, so why not include it for the time being? Marek seemed to like 
the general concept as well. I agree that basing IB size on GPU idleness 
is a great idea and I'll look into that, either as an alternative or as 
an addition to this.


Just a random question: we can count on up to date gfx fence sequence 
numbers being available without any calls into the kernel, right? The 
winsys code makes that conditional and calls into the kernel when no 
fence pointer is available.


Grigori



On 19.04.2016 18:13, Grigori Goronzy wrote:

Small IBs help to reduce stalls for workloads that require a lot of
synchronization. On the other hand, if there is no notable
synchronization, we can use a large IB size to slightly improve
performance in some cases.

This introduces tuning of the IB size based on feedback on the average
buffer wait time. The average wait time is tracked with exponential
smoothing.
---
  src/gallium/winsys/amdgpu/drm/amdgpu_bo.c |  6 +-
  src/gallium/winsys/amdgpu/drm/amdgpu_cs.c | 17 -
  src/gallium/winsys/amdgpu/drm/amdgpu_cs.h |  2 ++
  src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h |  9 +
  4 files changed, 28 insertions(+), 6 deletions(-)

diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c 
b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c

index 036301e..4b8554d 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
@@ -195,8 +195,10 @@ static void *amdgpu_bo_map(struct pb_buffer *buf,
 return NULL;
  }
   }
+ amdgpu_winsys_update_buffer_wait_avg(bo->ws, 0);
} else {
   uint64_t time = os_time_get_nano();
+ uint64_t duration;

   if (!(usage & PIPE_TRANSFER_WRITE)) {
  /* Mapping for read.
@@ -221,7 +223,9 @@ static void *amdgpu_bo_map(struct pb_buffer *buf,
 RADEON_USAGE_READWRITE);
   }

- bo->ws->buffer_wait_time += os_time_get_nano() - time;
+ duration = os_time_get_nano() - time;
+ bo->ws->buffer_wait_time += duration;
+ amdgpu_winsys_update_buffer_wait_avg(bo->ws, duration);
}
 }

diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c 
b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c

index 69902c4..b9a7c5b 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
@@ -201,10 +201,7 @@ amdgpu_ctx_query_reset_status(struct 
radeon_winsys_ctx *rwctx)
  static bool amdgpu_get_new_ib(struct radeon_winsys *ws, struct 
amdgpu_ib *ib,
struct amdgpu_cs_ib_info *info, 
unsigned ib_type)

  {
-   /* Small IBs are better than big IBs, because the GPU goes idle 
quicker

-* and there is less waiting for buffers and fences. Proof:
-*   
http://www.phoronix.com/scan.php?page=article=mesa-111-si=1

-*/
+   struct amdgpu_winsys *aws = (struct amdgpu_winsys *)ws;
 unsigned buffer_size, ib_size;

 switch (ib_type) {
@@ -216,8 +213,18 @@ static bool amdgpu_get_new_ib(struct 
radeon_winsys *ws, struct amdgpu_ib *ib,

ib_size = 128 * 1024 * 4;
break;
 case IB_MAIN:
+  /* Small IBs are often better than big IBs, because the GPU 
goes idle
+   * quicker and there is less waiting for buffers and fences. 
Proof:
+   *   
http://www.phoronix.com/scan.php?page=article=mesa-111-si=1
+   * Tune IB size depending on average buffer waiting time, which 
is an

+   * indicator for the amount of synchronization going on. Some
+   * applications don't cause notable synchronization, so we can 
use

+   * large IB size for slightly improved throughput.
+   */
buffer_size = 128 * 1024 * 4;
-  ib_size = 20 * 1024 * 4;
+  ib_size = 32 * 1024 * 4;
+  if (aws->buffer_wait_time_avg > IB_SIZE_WAIT_THRESHOLD_NS)
+ ib_size = 10 * 1024 * 4;
 }

 ib->base.cdw = 0;
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h 
b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h

index 4ed830b..98e58a2 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
@@ -35,6 +35,8 @@
  #include "amdgpu_bo.h"
  #include "util/u_memory.h"

+#define IB_SIZE_WAIT_THRESHOLD_NS   1
+
  struct amdgpu_ctx {
 struct amdgpu_winsys *ws;
 amdgpu_context_handle ctx;
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h 
b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h

index 91b9be4..3bd63b6 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h
@@ -54,6 +54,7 @@ struct amdgpu_winsys {
 uint64_t allocated_vram;
 uint64_t allocated_gtt;
 uint64_t buffer_wait_time; /* 

Re: [Mesa-dev] [PATCH 1/2] winsys/amdgpu: adjust IB size based on buffer wait time

2016-04-19 Thread Nicolai Hähnle
This is just a slight massaging of the patch you sent previously. What 
happened to the discussion we had about how to do this properly?


Nicolai

On 19.04.2016 18:13, Grigori Goronzy wrote:

Small IBs help to reduce stalls for workloads that require a lot of
synchronization. On the other hand, if there is no notable
synchronization, we can use a large IB size to slightly improve
performance in some cases.

This introduces tuning of the IB size based on feedback on the average
buffer wait time. The average wait time is tracked with exponential
smoothing.
---
  src/gallium/winsys/amdgpu/drm/amdgpu_bo.c |  6 +-
  src/gallium/winsys/amdgpu/drm/amdgpu_cs.c | 17 -
  src/gallium/winsys/amdgpu/drm/amdgpu_cs.h |  2 ++
  src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h |  9 +
  4 files changed, 28 insertions(+), 6 deletions(-)

diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c 
b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
index 036301e..4b8554d 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
@@ -195,8 +195,10 @@ static void *amdgpu_bo_map(struct pb_buffer *buf,
 return NULL;
  }
   }
+ amdgpu_winsys_update_buffer_wait_avg(bo->ws, 0);
} else {
   uint64_t time = os_time_get_nano();
+ uint64_t duration;

   if (!(usage & PIPE_TRANSFER_WRITE)) {
  /* Mapping for read.
@@ -221,7 +223,9 @@ static void *amdgpu_bo_map(struct pb_buffer *buf,
 RADEON_USAGE_READWRITE);
   }

- bo->ws->buffer_wait_time += os_time_get_nano() - time;
+ duration = os_time_get_nano() - time;
+ bo->ws->buffer_wait_time += duration;
+ amdgpu_winsys_update_buffer_wait_avg(bo->ws, duration);
}
 }

diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c 
b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
index 69902c4..b9a7c5b 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
@@ -201,10 +201,7 @@ amdgpu_ctx_query_reset_status(struct radeon_winsys_ctx 
*rwctx)
  static bool amdgpu_get_new_ib(struct radeon_winsys *ws, struct amdgpu_ib *ib,
struct amdgpu_cs_ib_info *info, unsigned 
ib_type)
  {
-   /* Small IBs are better than big IBs, because the GPU goes idle quicker
-* and there is less waiting for buffers and fences. Proof:
-*   http://www.phoronix.com/scan.php?page=article=mesa-111-si=1
-*/
+   struct amdgpu_winsys *aws = (struct amdgpu_winsys *)ws;
 unsigned buffer_size, ib_size;

 switch (ib_type) {
@@ -216,8 +213,18 @@ static bool amdgpu_get_new_ib(struct radeon_winsys *ws, 
struct amdgpu_ib *ib,
ib_size = 128 * 1024 * 4;
break;
 case IB_MAIN:
+  /* Small IBs are often better than big IBs, because the GPU goes idle
+   * quicker and there is less waiting for buffers and fences. Proof:
+   *   http://www.phoronix.com/scan.php?page=article=mesa-111-si=1
+   * Tune IB size depending on average buffer waiting time, which is an
+   * indicator for the amount of synchronization going on. Some
+   * applications don't cause notable synchronization, so we can use
+   * large IB size for slightly improved throughput.
+   */
buffer_size = 128 * 1024 * 4;
-  ib_size = 20 * 1024 * 4;
+  ib_size = 32 * 1024 * 4;
+  if (aws->buffer_wait_time_avg > IB_SIZE_WAIT_THRESHOLD_NS)
+ ib_size = 10 * 1024 * 4;
 }

 ib->base.cdw = 0;
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h 
b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
index 4ed830b..98e58a2 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
@@ -35,6 +35,8 @@
  #include "amdgpu_bo.h"
  #include "util/u_memory.h"

+#define IB_SIZE_WAIT_THRESHOLD_NS   1
+
  struct amdgpu_ctx {
 struct amdgpu_winsys *ws;
 amdgpu_context_handle ctx;
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h 
b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h
index 91b9be4..3bd63b6 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h
@@ -54,6 +54,7 @@ struct amdgpu_winsys {
 uint64_t allocated_vram;
 uint64_t allocated_gtt;
 uint64_t buffer_wait_time; /* time spent in buffer_wait in ns */
+   uint64_t buffer_wait_time_avg;
 uint64_t num_cs_flushes;
 unsigned gart_page_size;

@@ -76,6 +77,14 @@ amdgpu_winsys(struct radeon_winsys *base)
 return (struct amdgpu_winsys*)base;
  }

+static inline
+void amdgpu_winsys_update_buffer_wait_avg(struct amdgpu_winsys *ws,
+  uint64_t wait)
+{
+   /* Exponential smoothing with alpha = 0.25 */
+   ws->buffer_wait_time_avg = (3 * ws->buffer_wait_time_avg + wait) / 4;
+}
+
  void amdgpu_surface_init_functions(struct amdgpu_winsys *ws);
  ADDR_HANDLE amdgpu_addr_create(struct 

[Mesa-dev] [PATCH 1/2] winsys/amdgpu: adjust IB size based on buffer wait time

2016-04-19 Thread Grigori Goronzy
Small IBs help to reduce stalls for workloads that require a lot of
synchronization. On the other hand, if there is no notable
synchronization, we can use a large IB size to slightly improve
performance in some cases.

This introduces tuning of the IB size based on feedback on the average
buffer wait time. The average wait time is tracked with exponential
smoothing.
---
 src/gallium/winsys/amdgpu/drm/amdgpu_bo.c |  6 +-
 src/gallium/winsys/amdgpu/drm/amdgpu_cs.c | 17 -
 src/gallium/winsys/amdgpu/drm/amdgpu_cs.h |  2 ++
 src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h |  9 +
 4 files changed, 28 insertions(+), 6 deletions(-)

diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c 
b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
index 036301e..4b8554d 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
@@ -195,8 +195,10 @@ static void *amdgpu_bo_map(struct pb_buffer *buf,
return NULL;
 }
  }
+ amdgpu_winsys_update_buffer_wait_avg(bo->ws, 0);
   } else {
  uint64_t time = os_time_get_nano();
+ uint64_t duration;
 
  if (!(usage & PIPE_TRANSFER_WRITE)) {
 /* Mapping for read.
@@ -221,7 +223,9 @@ static void *amdgpu_bo_map(struct pb_buffer *buf,
RADEON_USAGE_READWRITE);
  }
 
- bo->ws->buffer_wait_time += os_time_get_nano() - time;
+ duration = os_time_get_nano() - time;
+ bo->ws->buffer_wait_time += duration;
+ amdgpu_winsys_update_buffer_wait_avg(bo->ws, duration);
   }
}
 
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c 
b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
index 69902c4..b9a7c5b 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
@@ -201,10 +201,7 @@ amdgpu_ctx_query_reset_status(struct radeon_winsys_ctx 
*rwctx)
 static bool amdgpu_get_new_ib(struct radeon_winsys *ws, struct amdgpu_ib *ib,
   struct amdgpu_cs_ib_info *info, unsigned ib_type)
 {
-   /* Small IBs are better than big IBs, because the GPU goes idle quicker
-* and there is less waiting for buffers and fences. Proof:
-*   http://www.phoronix.com/scan.php?page=article=mesa-111-si=1
-*/
+   struct amdgpu_winsys *aws = (struct amdgpu_winsys *)ws;
unsigned buffer_size, ib_size;
 
switch (ib_type) {
@@ -216,8 +213,18 @@ static bool amdgpu_get_new_ib(struct radeon_winsys *ws, 
struct amdgpu_ib *ib,
   ib_size = 128 * 1024 * 4;
   break;
case IB_MAIN:
+  /* Small IBs are often better than big IBs, because the GPU goes idle
+   * quicker and there is less waiting for buffers and fences. Proof:
+   *   http://www.phoronix.com/scan.php?page=article=mesa-111-si=1
+   * Tune IB size depending on average buffer waiting time, which is an
+   * indicator for the amount of synchronization going on. Some
+   * applications don't cause notable synchronization, so we can use
+   * large IB size for slightly improved throughput.
+   */
   buffer_size = 128 * 1024 * 4;
-  ib_size = 20 * 1024 * 4;
+  ib_size = 32 * 1024 * 4;
+  if (aws->buffer_wait_time_avg > IB_SIZE_WAIT_THRESHOLD_NS)
+ ib_size = 10 * 1024 * 4;
}
 
ib->base.cdw = 0;
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h 
b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
index 4ed830b..98e58a2 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
@@ -35,6 +35,8 @@
 #include "amdgpu_bo.h"
 #include "util/u_memory.h"
 
+#define IB_SIZE_WAIT_THRESHOLD_NS   1
+
 struct amdgpu_ctx {
struct amdgpu_winsys *ws;
amdgpu_context_handle ctx;
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h 
b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h
index 91b9be4..3bd63b6 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h
@@ -54,6 +54,7 @@ struct amdgpu_winsys {
uint64_t allocated_vram;
uint64_t allocated_gtt;
uint64_t buffer_wait_time; /* time spent in buffer_wait in ns */
+   uint64_t buffer_wait_time_avg;
uint64_t num_cs_flushes;
unsigned gart_page_size;
 
@@ -76,6 +77,14 @@ amdgpu_winsys(struct radeon_winsys *base)
return (struct amdgpu_winsys*)base;
 }
 
+static inline
+void amdgpu_winsys_update_buffer_wait_avg(struct amdgpu_winsys *ws,
+  uint64_t wait)
+{
+   /* Exponential smoothing with alpha = 0.25 */
+   ws->buffer_wait_time_avg = (3 * ws->buffer_wait_time_avg + wait) / 4;
+}
+
 void amdgpu_surface_init_functions(struct amdgpu_winsys *ws);
 ADDR_HANDLE amdgpu_addr_create(struct amdgpu_winsys *ws);
 
-- 
1.9.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev