If an app keeps building and resetting the command buffers, most
command buffers will have 1 CS bo and 1 upload bo. This merges them
by allocating uploads from the end of the CS bo.

On dota2, this goes from ~700 to ~400 BOs after the merging in
create_bo_list. Sadly, there is no discernible performance difference
though. The create_bo_list CPU usage went down by about half, but it
looks like this thread is not a bottleneck.

We still need to keep the old path for SI, where we don't write the
CS to a BO immediately.
---
 src/amd/vulkan/radv_cmd_buffer.c              | 11 +++++
 src/amd/vulkan/radv_cs.h                      |  2 +-
 src/amd/vulkan/radv_radeon_winsys.h           | 15 ++++--
 src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c | 67 ++++++++++++++-------------
 4 files changed, 58 insertions(+), 37 deletions(-)

diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index 6dfae4d5e3..7df7928180 100644
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -367,6 +367,17 @@ radv_cmd_buffer_upload_alloc(struct radv_cmd_buffer 
*cmd_buffer,
                             unsigned *out_offset,
                             void **ptr)
 {
+       if (cmd_buffer->cs->bo) {
+               radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, size 
+ alignment);
+               cmd_buffer->cs->max_dw -= size;
+               cmd_buffer->cs->max_dw &= ~(MAX2(alignment >> 2, 1u) - 1u);
+
+               *out_offset = cmd_buffer->cs->max_dw * 4;
+               *ptr = cmd_buffer->cs->buf + cmd_buffer->cs->max_dw;
+               *bo = cmd_buffer->cs->bo;
+               return true;
+       }
+
        uint64_t offset = align(cmd_buffer->upload.offset, alignment);
        if (offset + size > cmd_buffer->upload.size) {
                if (!radv_cmd_buffer_resize_upload_buf(cmd_buffer, size))
diff --git a/src/amd/vulkan/radv_cs.h b/src/amd/vulkan/radv_cs.h
index 840597686a..9ca5d41b1d 100644
--- a/src/amd/vulkan/radv_cs.h
+++ b/src/amd/vulkan/radv_cs.h
@@ -34,7 +34,7 @@ static inline unsigned radeon_check_space(struct 
radeon_winsys *ws,
                                       struct radeon_winsys_cs *cs,
                                       unsigned needed)
 {
-        if (cs->max_dw - cs->cdw < needed)
+        if (cs->cdw + cs->reserved_dw + needed >= cs->max_dw)
                 ws->cs_grow(cs, needed);
         return cs->cdw + needed;
 }
diff --git a/src/amd/vulkan/radv_radeon_winsys.h 
b/src/amd/vulkan/radv_radeon_winsys.h
index 341e40505c..a1b2b011f0 100644
--- a/src/amd/vulkan/radv_radeon_winsys.h
+++ b/src/amd/vulkan/radv_radeon_winsys.h
@@ -94,10 +94,19 @@ enum radeon_value_id {
        RADEON_CURRENT_MCLK,
 };
 
+struct radeon_winsys_bo {
+       uint64_t va;
+       bool is_local;
+};
+
 struct radeon_winsys_cs {
        unsigned cdw;  /* Number of used dwords. */
-       unsigned max_dw; /* Maximum number of dwords. */
+       unsigned max_dw; /* Maximum number of dwords. With a combined upload 
buf, this is the start of the upload buf. */
        uint32_t *buf; /* The base pointer of the chunk. */
+
+       /* For having a combined CS / upload buffer */
+       struct radeon_winsys_bo *bo; /* the bo backing the CS if there is one */
+       uint32_t reserved_dw; /* Required buffer space between the end of the 
CS and the start of the upload data. */
 };
 
 #define RADEON_SURF_TYPE_MASK                   0xFF
@@ -159,10 +168,6 @@ struct radeon_bo_metadata {
 uint32_t syncobj_handle;
 struct radeon_winsys_fence;
 
-struct radeon_winsys_bo {
-       uint64_t va;
-       bool is_local;
-};
 struct radv_winsys_sem_counts {
        uint32_t syncobj_count;
        uint32_t sem_count;
diff --git a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c 
b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
index 0ee56f9144..2d103e2b09 100644
--- a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
+++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
@@ -44,7 +44,8 @@ struct radv_amdgpu_cs {
 
        struct amdgpu_cs_ib_info    ib;
 
-       struct radeon_winsys_bo     *ib_buffer;
+       uint32_t                    ib_size;
+
        uint8_t                 *ib_mapped;
        unsigned                    max_num_buffers;
        unsigned                    num_buffers;
@@ -158,8 +159,8 @@ static void radv_amdgpu_cs_destroy(struct radeon_winsys_cs 
*rcs)
 {
        struct radv_amdgpu_cs *cs = radv_amdgpu_cs(rcs);
 
-       if (cs->ib_buffer)
-               cs->ws->base.buffer_destroy(cs->ib_buffer);
+       if (cs->base.bo)
+               cs->ws->base.buffer_destroy(cs->base.bo);
        else
                free(cs->base.buf);
 
@@ -199,32 +200,34 @@ radv_amdgpu_cs_create(struct radeon_winsys *ws,
        radv_amdgpu_init_cs(cs, ring_type);
 
        if (cs->ws->use_ib_bos) {
-               cs->ib_buffer = ws->buffer_create(ws, ib_size, 0,
+               cs->base.bo = ws->buffer_create(ws, ib_size, 0,
                                                  RADEON_DOMAIN_GTT,
                                                  RADEON_FLAG_CPU_ACCESS |
-                                                 
RADEON_FLAG_NO_INTERPROCESS_SHARING |
-                                                 RADEON_FLAG_READ_ONLY);
-               if (!cs->ib_buffer) {
+                                                 
RADEON_FLAG_NO_INTERPROCESS_SHARING);
+               if (!cs->base.bo) {
                        free(cs);
                        return NULL;
                }
 
-               cs->ib_mapped = ws->buffer_map(cs->ib_buffer);
+               cs->ib_mapped = ws->buffer_map(cs->base.bo);
                if (!cs->ib_mapped) {
-                       ws->buffer_destroy(cs->ib_buffer);
+                       ws->buffer_destroy(cs->base.bo);
                        free(cs);
                        return NULL;
                }
 
-               cs->ib.ib_mc_address = 
radv_amdgpu_winsys_bo(cs->ib_buffer)->base.va;
+               cs->ib.ib_mc_address = 
radv_amdgpu_winsys_bo(cs->base.bo)->base.va;
+               cs->ib_size = ib_size;
                cs->base.buf = (uint32_t *)cs->ib_mapped;
-               cs->base.max_dw = ib_size / 4 - 4;
+               cs->base.max_dw = ib_size / 4;
+               cs->base.reserved_dw = 13; /* 4 for chaining + up to 7 for 
alignment */
                cs->ib_size_ptr = &cs->ib.size;
                cs->ib.size = 0;
 
-               ws->cs_add_buffer(&cs->base, cs->ib_buffer, 8);
+               ws->cs_add_buffer(&cs->base, cs->base.bo, 8);
        } else {
                cs->base.buf = malloc(16384);
+               cs->ib_size = 16384;
                cs->base.max_dw = 4096;
                if (!cs->base.buf) {
                        free(cs);
@@ -268,7 +271,7 @@ static void radv_amdgpu_cs_grow(struct radeon_winsys_cs 
*_cs, size_t min_size)
                return;
        }
 
-       uint64_t ib_size = MAX2(min_size * 4 + 16, cs->base.max_dw * 4 * 2);
+       uint64_t ib_size = MAX2((min_size + cs->base.reserved_dw) * 4, 
cs->ib_size * 2);
 
        /* max that fits in the chain size field. */
        ib_size = MIN2(ib_size, 0xfffff);
@@ -284,39 +287,39 @@ static void radv_amdgpu_cs_grow(struct radeon_winsys_cs 
*_cs, size_t min_size)
                                             cs->max_num_old_ib_buffers * 
sizeof(void*));
        }
 
-       cs->old_ib_buffers[cs->num_old_ib_buffers++] = cs->ib_buffer;
+       cs->old_ib_buffers[cs->num_old_ib_buffers++] = cs->base.bo;
 
-       cs->ib_buffer = cs->ws->base.buffer_create(&cs->ws->base, ib_size, 0,
+       cs->base.bo = cs->ws->base.buffer_create(&cs->ws->base, ib_size, 0,
                                                   RADEON_DOMAIN_GTT,
                                                   RADEON_FLAG_CPU_ACCESS |
-                                                  
RADEON_FLAG_NO_INTERPROCESS_SHARING |
-                                                  RADEON_FLAG_READ_ONLY);
+                                                  
RADEON_FLAG_NO_INTERPROCESS_SHARING);
 
-       if (!cs->ib_buffer) {
+       if (!cs->base.bo) {
                cs->base.cdw = 0;
                cs->failed = true;
-               cs->ib_buffer = cs->old_ib_buffers[--cs->num_old_ib_buffers];
+               cs->base.bo = cs->old_ib_buffers[--cs->num_old_ib_buffers];
        }
 
-       cs->ib_mapped = cs->ws->base.buffer_map(cs->ib_buffer);
+       cs->ib_mapped = cs->ws->base.buffer_map(cs->base.bo);
        if (!cs->ib_mapped) {
-               cs->ws->base.buffer_destroy(cs->ib_buffer);
+               cs->ws->base.buffer_destroy(cs->base.bo);
                cs->base.cdw = 0;
                cs->failed = true;
-               cs->ib_buffer = cs->old_ib_buffers[--cs->num_old_ib_buffers];
+               cs->base.bo = cs->old_ib_buffers[--cs->num_old_ib_buffers];
        }
 
-       cs->ws->base.cs_add_buffer(&cs->base, cs->ib_buffer, 8);
+       cs->ws->base.cs_add_buffer(&cs->base, cs->base.bo, 8);
 
        cs->base.buf[cs->base.cdw++] = PKT3(PKT3_INDIRECT_BUFFER_CIK, 2, 0);
-       cs->base.buf[cs->base.cdw++] = 
radv_amdgpu_winsys_bo(cs->ib_buffer)->base.va;
-       cs->base.buf[cs->base.cdw++] = 
radv_amdgpu_winsys_bo(cs->ib_buffer)->base.va >> 32;
+       cs->base.buf[cs->base.cdw++] = 
radv_amdgpu_winsys_bo(cs->base.bo)->base.va;
+       cs->base.buf[cs->base.cdw++] = 
radv_amdgpu_winsys_bo(cs->base.bo)->base.va >> 32;
        cs->ib_size_ptr = cs->base.buf + cs->base.cdw;
        cs->base.buf[cs->base.cdw++] = S_3F2_CHAIN(1) | S_3F2_VALID(1);
 
        cs->base.buf = (uint32_t *)cs->ib_mapped;
        cs->base.cdw = 0;
-       cs->base.max_dw = ib_size / 4 - 4;
+       cs->base.max_dw = ib_size / 4;
+       cs->ib_size = ib_size;
 
 }
 
@@ -340,6 +343,7 @@ static void radv_amdgpu_cs_reset(struct radeon_winsys_cs 
*_cs)
 {
        struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs);
        cs->base.cdw = 0;
+       cs->base.max_dw = cs->ib_size / 4;
        cs->failed = false;
 
        for (unsigned i = 0; i < cs->num_buffers; ++i) {
@@ -357,13 +361,13 @@ static void radv_amdgpu_cs_reset(struct radeon_winsys_cs 
*_cs)
        cs->num_virtual_buffers = 0;
 
        if (cs->ws->use_ib_bos) {
-               cs->ws->base.cs_add_buffer(&cs->base, cs->ib_buffer, 8);
+               cs->ws->base.cs_add_buffer(&cs->base, cs->base.bo, 8);
 
                for (unsigned i = 0; i < cs->num_old_ib_buffers; ++i)
                        cs->ws->base.buffer_destroy(cs->old_ib_buffers[i]);
 
                cs->num_old_ib_buffers = 0;
-               cs->ib.ib_mc_address = 
radv_amdgpu_winsys_bo(cs->ib_buffer)->base.va;
+               cs->ib.ib_mc_address = 
radv_amdgpu_winsys_bo(cs->base.bo)->base.va;
                cs->ib_size_ptr = &cs->ib.size;
                cs->ib.size = 0;
        }
@@ -497,7 +501,7 @@ static void radv_amdgpu_cs_execute_secondary(struct 
radeon_winsys_cs *_parent,
        }
 
        if (parent->ws->use_ib_bos) {
-               if (parent->base.cdw + 4 > parent->base.max_dw)
+               if (parent->base.cdw + 4 + parent->base.reserved_dw > 
parent->base.max_dw)
                        radv_amdgpu_cs_grow(&parent->base, 4);
 
                parent->base.buf[parent->base.cdw++] = 
PKT3(PKT3_INDIRECT_BUFFER_CIK, 2, 0);
@@ -647,6 +651,7 @@ static int radv_amdgpu_create_bo_list(struct 
radv_amdgpu_winsys *ws,
                        *bo_list = 0;
                }
 
+
                free(handles);
                free(priorities);
        }
@@ -984,13 +989,13 @@ static void *radv_amdgpu_winsys_get_cpu_addr(void *_cs, 
uint64_t addr)
        struct radv_amdgpu_cs *cs = (struct radv_amdgpu_cs *)_cs;
        void *ret = NULL;
 
-       if (!cs->ib_buffer)
+       if (!cs->base.bo)
                return NULL;
        for (unsigned i = 0; i <= cs->num_old_ib_buffers; ++i) {
                struct radv_amdgpu_winsys_bo *bo;
 
                bo = (struct radv_amdgpu_winsys_bo*)
-                      (i == cs->num_old_ib_buffers ? cs->ib_buffer : 
cs->old_ib_buffers[i]);
+                      (i == cs->num_old_ib_buffers ? cs->base.bo : 
cs->old_ib_buffers[i]);
                if (addr >= bo->base.va && addr - bo->base.va < bo->size) {
                        if (amdgpu_bo_cpu_map(bo->bo, &ret) == 0)
                                return (char *)ret + (addr - bo->base.va);
-- 
2.15.1

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Reply via email to