Avoid constantly reloading registers while emitting IBs: write through a
local pointer and update ib->length_dw only once, at the end of each helper.

Signed-off-by: Tvrtko Ursulin <[email protected]>
---
 drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c | 119 +++++++++++++++----------
 1 file changed, 70 insertions(+), 49 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c
index 326ecc8d37d2..f8e1e5a7835f 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c
@@ -1077,19 +1077,22 @@ static void sdma_v7_0_vm_copy_pte(struct amdgpu_ib *ib,
                                  uint64_t pe, uint64_t src,
                                  unsigned count)
 {
+       u32 *ptr = &ib->ptr[ib->length_dw];
        unsigned bytes = count * 8;
 
-       ib->ptr[ib->length_dw++] = SDMA_PKT_COPY_LINEAR_HEADER_OP(SDMA_OP_COPY) |
-               SDMA_PKT_COPY_LINEAR_HEADER_SUB_OP(SDMA_SUBOP_COPY_LINEAR) |
-               SDMA_PKT_COPY_LINEAR_HEADER_CPV(1);
+       *ptr++ = SDMA_PKT_COPY_LINEAR_HEADER_OP(SDMA_OP_COPY) |
+                SDMA_PKT_COPY_LINEAR_HEADER_SUB_OP(SDMA_SUBOP_COPY_LINEAR) |
+                SDMA_PKT_COPY_LINEAR_HEADER_CPV(1);
 
-       ib->ptr[ib->length_dw++] = bytes - 1;
-       ib->ptr[ib->length_dw++] = 0; /* src/dst endian swap */
-       ib->ptr[ib->length_dw++] = lower_32_bits(src);
-       ib->ptr[ib->length_dw++] = upper_32_bits(src);
-       ib->ptr[ib->length_dw++] = lower_32_bits(pe);
-       ib->ptr[ib->length_dw++] = upper_32_bits(pe);
-       ib->ptr[ib->length_dw++] = 0;
+       *ptr++ = bytes - 1;
+       *ptr++ = 0; /* src/dst endian swap */
+       *ptr++ = lower_32_bits(src);
+       *ptr++ = upper_32_bits(src);
+       *ptr++ = lower_32_bits(pe);
+       *ptr++ = upper_32_bits(pe);
+       *ptr++ = 0;
+
+       ib->length_dw = ptr - ib->ptr;
 
 }
 
@@ -1108,18 +1111,21 @@ static void sdma_v7_0_vm_write_pte(struct amdgpu_ib *ib, uint64_t pe,
                                   uint64_t value, unsigned count,
                                   uint32_t incr)
 {
+       u32 *ptr = &ib->ptr[ib->length_dw];
        unsigned ndw = count * 2;
 
-       ib->ptr[ib->length_dw++] = SDMA_PKT_COPY_LINEAR_HEADER_OP(SDMA_OP_WRITE) |
-               SDMA_PKT_COPY_LINEAR_HEADER_SUB_OP(SDMA_SUBOP_WRITE_LINEAR);
-       ib->ptr[ib->length_dw++] = lower_32_bits(pe);
-       ib->ptr[ib->length_dw++] = upper_32_bits(pe);
-       ib->ptr[ib->length_dw++] = ndw - 1;
+       *ptr++ = SDMA_PKT_COPY_LINEAR_HEADER_OP(SDMA_OP_WRITE) |
+                SDMA_PKT_COPY_LINEAR_HEADER_SUB_OP(SDMA_SUBOP_WRITE_LINEAR);
+       *ptr++ = lower_32_bits(pe);
+       *ptr++ = upper_32_bits(pe);
+       *ptr++ = ndw - 1;
        for (; ndw > 0; ndw -= 2) {
-               ib->ptr[ib->length_dw++] = lower_32_bits(value);
-               ib->ptr[ib->length_dw++] = upper_32_bits(value);
+               *ptr++ = lower_32_bits(value);
+               *ptr++ = upper_32_bits(value);
                value += incr;
        }
+
+       ib->length_dw = ptr - ib->ptr;
 }
 
 /**
@@ -1139,17 +1145,21 @@ static void sdma_v7_0_vm_set_pte_pde(struct amdgpu_ib *ib,
                                     uint64_t addr, unsigned count,
                                     uint32_t incr, uint64_t flags)
 {
+       u32 *ptr = &ib->ptr[ib->length_dw];
+
        /* for physically contiguous pages (vram) */
-       ib->ptr[ib->length_dw++] = SDMA_PKT_COPY_LINEAR_HEADER_OP(SDMA_OP_PTEPDE);
-       ib->ptr[ib->length_dw++] = lower_32_bits(pe); /* dst addr */
-       ib->ptr[ib->length_dw++] = upper_32_bits(pe);
-       ib->ptr[ib->length_dw++] = lower_32_bits(flags); /* mask */
-       ib->ptr[ib->length_dw++] = upper_32_bits(flags);
-       ib->ptr[ib->length_dw++] = lower_32_bits(addr); /* value */
-       ib->ptr[ib->length_dw++] = upper_32_bits(addr);
-       ib->ptr[ib->length_dw++] = incr; /* increment size */
-       ib->ptr[ib->length_dw++] = 0;
-       ib->ptr[ib->length_dw++] = count - 1; /* number of entries */
+       *ptr++ = SDMA_PKT_COPY_LINEAR_HEADER_OP(SDMA_OP_PTEPDE);
+       *ptr++ = lower_32_bits(pe); /* dst addr */
+       *ptr++ = upper_32_bits(pe);
+       *ptr++ = lower_32_bits(flags); /* mask */
+       *ptr++ = upper_32_bits(flags);
+       *ptr++ = lower_32_bits(addr); /* value */
+       *ptr++ = upper_32_bits(addr);
+       *ptr++ = incr; /* increment size */
+       *ptr++ = 0;
+       *ptr++ = count - 1; /* number of entries */
+
+       ib->length_dw = ptr - ib->ptr;
 }
 
 /**
@@ -1163,18 +1173,22 @@ static void sdma_v7_0_vm_set_pte_pde(struct amdgpu_ib *ib,
 static void sdma_v7_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib)
 {
        struct amdgpu_sdma_instance *sdma = amdgpu_sdma_get_instance_from_ring(ring);
+       u32 *ptr = &ib->ptr[ib->length_dw];
        u32 pad_count;
        int i;
 
        pad_count = (-ib->length_dw) & 0x7;
+       if (!pad_count)
+               return;
+
        for (i = 0; i < pad_count; i++)
                if (sdma && sdma->burst_nop && (i == 0))
-                       ib->ptr[ib->length_dw++] =
-                               SDMA_PKT_COPY_LINEAR_HEADER_OP(SDMA_OP_NOP) |
+                       *ptr++= SDMA_PKT_COPY_LINEAR_HEADER_OP(SDMA_OP_NOP) |
                                SDMA_PKT_NOP_HEADER_COUNT(pad_count - 1);
                else
-                       ib->ptr[ib->length_dw++] =
-                               SDMA_PKT_COPY_LINEAR_HEADER_OP(SDMA_OP_NOP);
+                       *ptr++ = SDMA_PKT_COPY_LINEAR_HEADER_OP(SDMA_OP_NOP);
+
+       ib->length_dw += pad_count;
 }
 
 /**
@@ -1765,31 +1779,34 @@ static void sdma_v7_0_emit_copy_buffer(struct amdgpu_ib *ib,
                                       uint32_t copy_flags)
 {
        uint32_t num_type, data_format, max_com, write_cm;
+       u32 *ptr = &ib->ptr[ib->length_dw];
 
        max_com = AMDGPU_COPY_FLAGS_GET(copy_flags, MAX_COMPRESSED);
        data_format = AMDGPU_COPY_FLAGS_GET(copy_flags, DATA_FORMAT);
        num_type = AMDGPU_COPY_FLAGS_GET(copy_flags, NUMBER_TYPE);
        write_cm = AMDGPU_COPY_FLAGS_GET(copy_flags, WRITE_COMPRESS_DISABLE) ? 2 : 1;
 
-       ib->ptr[ib->length_dw++] = SDMA_PKT_COPY_LINEAR_HEADER_OP(SDMA_OP_COPY) |
-               SDMA_PKT_COPY_LINEAR_HEADER_SUB_OP(SDMA_SUBOP_COPY_LINEAR) |
-               SDMA_PKT_COPY_LINEAR_HEADER_TMZ((copy_flags & AMDGPU_COPY_FLAGS_TMZ) ? 1 : 0) |
-               SDMA_PKT_COPY_LINEAR_HEADER_CPV(1);
+       *ptr++ = SDMA_PKT_COPY_LINEAR_HEADER_OP(SDMA_OP_COPY) |
+                SDMA_PKT_COPY_LINEAR_HEADER_SUB_OP(SDMA_SUBOP_COPY_LINEAR) |
+                SDMA_PKT_COPY_LINEAR_HEADER_TMZ((copy_flags & AMDGPU_COPY_FLAGS_TMZ) ? 1 : 0) |
+                SDMA_PKT_COPY_LINEAR_HEADER_CPV(1);
 
-       ib->ptr[ib->length_dw++] = byte_count - 1;
-       ib->ptr[ib->length_dw++] = 0; /* src/dst endian swap */
-       ib->ptr[ib->length_dw++] = lower_32_bits(src_offset);
-       ib->ptr[ib->length_dw++] = upper_32_bits(src_offset);
-       ib->ptr[ib->length_dw++] = lower_32_bits(dst_offset);
-       ib->ptr[ib->length_dw++] = upper_32_bits(dst_offset);
+       *ptr++ = byte_count - 1;
+       *ptr++ = 0; /* src/dst endian swap */
+       *ptr++ = lower_32_bits(src_offset);
+       *ptr++ = upper_32_bits(src_offset);
+       *ptr++ = lower_32_bits(dst_offset);
+       *ptr++ = upper_32_bits(dst_offset);
 
        if ((copy_flags & (AMDGPU_COPY_FLAGS_READ_DECOMPRESSED | AMDGPU_COPY_FLAGS_WRITE_COMPRESSED)))
-               ib->ptr[ib->length_dw++] = SDMA_DCC_DATA_FORMAT(data_format) | SDMA_DCC_NUM_TYPE(num_type) |
+               *ptr++ = SDMA_DCC_DATA_FORMAT(data_format) | SDMA_DCC_NUM_TYPE(num_type) |
                        ((copy_flags & AMDGPU_COPY_FLAGS_READ_DECOMPRESSED) ? SDMA_DCC_READ_CM(2) : 0) |
                        ((copy_flags & AMDGPU_COPY_FLAGS_WRITE_COMPRESSED) ? SDMA_DCC_WRITE_CM(write_cm) : 0) |
                        SDMA_DCC_MAX_COM(max_com) | SDMA_DCC_MAX_UCOM(1);
        else
-               ib->ptr[ib->length_dw++] = 0;
+               *ptr++ = 0;
+
+       ib->length_dw = ptr - ib->ptr;
 }
 
 /**
@@ -1807,12 +1824,16 @@ static void sdma_v7_0_emit_fill_buffer(struct amdgpu_ib *ib,
                                       uint64_t dst_offset,
                                       uint32_t byte_count)
 {
-       ib->ptr[ib->length_dw++] = SDMA_PKT_CONSTANT_FILL_HEADER_OP(SDMA_OP_CONST_FILL) |
-               SDMA_PKT_CONSTANT_FILL_HEADER_COMPRESS(1);
-       ib->ptr[ib->length_dw++] = lower_32_bits(dst_offset);
-       ib->ptr[ib->length_dw++] = upper_32_bits(dst_offset);
-       ib->ptr[ib->length_dw++] = src_data;
-       ib->ptr[ib->length_dw++] = byte_count - 1;
+       u32 *ptr = &ib->ptr[ib->length_dw];
+
+       *ptr++ = SDMA_PKT_CONSTANT_FILL_HEADER_OP(SDMA_OP_CONST_FILL) |
+                SDMA_PKT_CONSTANT_FILL_HEADER_COMPRESS(1);
+       *ptr++ = lower_32_bits(dst_offset);
+       *ptr++ = upper_32_bits(dst_offset);
+       *ptr++ = src_data;
+       *ptr++ = byte_count - 1;
+
+       ib->length_dw = ptr - ib->ptr;
 }
 
 static const struct amdgpu_buffer_funcs sdma_v7_0_buffer_funcs = {
-- 
2.48.0

Reply via email to