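Avoid constant register reloads while emitting IBs by using a local write
pointer and only updating the IB length (ib->length_dw) once, at the end of
the helper.

While at it, replace the open-coded dword loop which copies the compute
shader into the IB with a memcpy().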

Signed-off-by: Tvrtko Ursulin <[email protected]>
---
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c | 51 +++++++++++++------------
 1 file changed, 26 insertions(+), 25 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
index 8058ea91ecaf..884c091f40ce 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
@@ -354,6 +354,7 @@ static int gfx_v9_4_2_run_shader(struct amdgpu_device *adev,
        int r, i;
        uint32_t total_size, shader_offset;
        u64 gpu_addr;
+       u32 *ptr;
 
        total_size = (regs_size * 3 + 4 + 5 + 5) * 4;
        total_size = ALIGN(total_size, 256);
@@ -370,43 +371,43 @@ static int gfx_v9_4_2_run_shader(struct amdgpu_device *adev,
        }
 
        /* load the compute shaders */
-       for (i = 0; i < shader_size/sizeof(u32); i++)
-               ib->ptr[i + (shader_offset / 4)] = shader_ptr[i];
+       memcpy(&ib->ptr[shader_offset / sizeof(u32)], shader_ptr, shader_size);
 
-       /* init the ib length to 0 */
-       ib->length_dw = 0;
+       ptr = ib->ptr;
 
        /* write the register state for the compute dispatch */
        for (i = 0; i < regs_size; i++) {
-               ib->ptr[ib->length_dw++] = PACKET3(PACKET3_SET_SH_REG, 1);
-               ib->ptr[ib->length_dw++] = SOC15_REG_ENTRY_OFFSET(init_regs[i])
-                                                               - PACKET3_SET_SH_REG_START;
-               ib->ptr[ib->length_dw++] = init_regs[i].reg_value;
+               *ptr++ = PACKET3(PACKET3_SET_SH_REG, 1);
+               *ptr++ = SOC15_REG_ENTRY_OFFSET(init_regs[i]) -
+                        PACKET3_SET_SH_REG_START;
+               *ptr++ = init_regs[i].reg_value;
        }
 
        /* write the shader start address: mmCOMPUTE_PGM_LO, mmCOMPUTE_PGM_HI */
        gpu_addr = (ib->gpu_addr + (u64)shader_offset) >> 8;
-       ib->ptr[ib->length_dw++] = PACKET3(PACKET3_SET_SH_REG, 2);
-       ib->ptr[ib->length_dw++] = SOC15_REG_OFFSET(GC, 0, regCOMPUTE_PGM_LO)
-                                                       - PACKET3_SET_SH_REG_START;
-       ib->ptr[ib->length_dw++] = lower_32_bits(gpu_addr);
-       ib->ptr[ib->length_dw++] = upper_32_bits(gpu_addr);
+       *ptr++ = PACKET3(PACKET3_SET_SH_REG, 2);
+       *ptr++ = SOC15_REG_OFFSET(GC, 0, regCOMPUTE_PGM_LO)
+                - PACKET3_SET_SH_REG_START;
+       *ptr++ = lower_32_bits(gpu_addr);
+       *ptr++ = upper_32_bits(gpu_addr);
 
        /* write the wb buffer address */
-       ib->ptr[ib->length_dw++] = PACKET3(PACKET3_SET_SH_REG, 3);
-       ib->ptr[ib->length_dw++] = SOC15_REG_OFFSET(GC, 0, regCOMPUTE_USER_DATA_0)
-                                                       - PACKET3_SET_SH_REG_START;
-       ib->ptr[ib->length_dw++] = lower_32_bits(wb_gpu_addr);
-       ib->ptr[ib->length_dw++] = upper_32_bits(wb_gpu_addr);
-       ib->ptr[ib->length_dw++] = pattern;
+       *ptr++ = PACKET3(PACKET3_SET_SH_REG, 3);
+       *ptr++ = SOC15_REG_OFFSET(GC, 0, regCOMPUTE_USER_DATA_0) -
+                PACKET3_SET_SH_REG_START;
+       *ptr++ = lower_32_bits(wb_gpu_addr);
+       *ptr++ = upper_32_bits(wb_gpu_addr);
+       *ptr++ = pattern;
 
        /* write dispatch packet */
-       ib->ptr[ib->length_dw++] = PACKET3(PACKET3_DISPATCH_DIRECT, 3);
-       ib->ptr[ib->length_dw++] = compute_dim_x; /* x */
-       ib->ptr[ib->length_dw++] = 1; /* y */
-       ib->ptr[ib->length_dw++] = 1; /* z */
-       ib->ptr[ib->length_dw++] =
-               REG_SET_FIELD(0, COMPUTE_DISPATCH_INITIATOR, COMPUTE_SHADER_EN, 1);
+       *ptr++ = PACKET3(PACKET3_DISPATCH_DIRECT, 3);
+       *ptr++ = compute_dim_x; /* x */
+       *ptr++ = 1; /* y */
+       *ptr++ = 1; /* z */
+       *ptr++ = REG_SET_FIELD(0, COMPUTE_DISPATCH_INITIATOR, COMPUTE_SHADER_EN,
+                              1);
+
+       ib->length_dw = ptr - ib->ptr;
 
        /* shedule the ib on the ring */
        r = amdgpu_ib_schedule(ring, 1, ib, NULL, fence_ptr);
-- 
2.48.0

Reply via email to