Avoid constant register reloads while emitting IBs by using a local write pointer and only updating the size at the end of each helper.
Signed-off-by: Tvrtko Ursulin <tvrtko.ursu...@igalia.com>
---
 drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c | 108 +++++++++++++++----------
 1 file changed, 65 insertions(+), 43 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
index 1c076bd1cf73..01b95d1c7419 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
@@ -927,16 +927,19 @@ static void sdma_v3_0_vm_copy_pte(struct amdgpu_ib *ib,
 				  uint64_t pe, uint64_t src,
 				  unsigned count)
 {
+	u32 *ptr = &ib->ptr[ib->length_dw];
 	unsigned bytes = count * 8;
 
-	ib->ptr[ib->length_dw++] = SDMA_PKT_HEADER_OP(SDMA_OP_COPY) |
-		SDMA_PKT_HEADER_SUB_OP(SDMA_SUBOP_COPY_LINEAR);
-	ib->ptr[ib->length_dw++] = bytes;
-	ib->ptr[ib->length_dw++] = 0; /* src/dst endian swap */
-	ib->ptr[ib->length_dw++] = lower_32_bits(src);
-	ib->ptr[ib->length_dw++] = upper_32_bits(src);
-	ib->ptr[ib->length_dw++] = lower_32_bits(pe);
-	ib->ptr[ib->length_dw++] = upper_32_bits(pe);
+	*ptr++ = SDMA_PKT_HEADER_OP(SDMA_OP_COPY) |
+		SDMA_PKT_HEADER_SUB_OP(SDMA_SUBOP_COPY_LINEAR);
+	*ptr++ = bytes;
+	*ptr++ = 0; /* src/dst endian swap */
+	*ptr++ = lower_32_bits(src);
+	*ptr++ = upper_32_bits(src);
+	*ptr++ = lower_32_bits(pe);
+	*ptr++ = upper_32_bits(pe);
+
+	ib->length_dw = ptr - ib->ptr;
 }
 
 /**
@@ -954,18 +957,21 @@ static void sdma_v3_0_vm_write_pte(struct amdgpu_ib *ib, uint64_t pe,
 				   uint64_t value, unsigned count,
 				   uint32_t incr)
 {
+	u32 *ptr = &ib->ptr[ib->length_dw];
 	unsigned ndw = count * 2;
 
-	ib->ptr[ib->length_dw++] = SDMA_PKT_HEADER_OP(SDMA_OP_WRITE) |
-		SDMA_PKT_HEADER_SUB_OP(SDMA_SUBOP_WRITE_LINEAR);
-	ib->ptr[ib->length_dw++] = lower_32_bits(pe);
-	ib->ptr[ib->length_dw++] = upper_32_bits(pe);
-	ib->ptr[ib->length_dw++] = ndw;
+	*ptr++ = SDMA_PKT_HEADER_OP(SDMA_OP_WRITE) |
+		SDMA_PKT_HEADER_SUB_OP(SDMA_SUBOP_WRITE_LINEAR);
+	*ptr++ = lower_32_bits(pe);
+	*ptr++ = upper_32_bits(pe);
+	*ptr++ = ndw;
 	for (; ndw > 0; ndw -= 2) {
-		ib->ptr[ib->length_dw++] = lower_32_bits(value);
-		ib->ptr[ib->length_dw++] = upper_32_bits(value);
+		*ptr++ = lower_32_bits(value);
+		*ptr++ = upper_32_bits(value);
 		value += incr;
 	}
+
+	ib->length_dw = ptr - ib->ptr;
 }
 
 /**
@@ -984,17 +990,21 @@ static void sdma_v3_0_vm_set_pte_pde(struct amdgpu_ib *ib, uint64_t pe,
 				     uint64_t addr, unsigned count,
 				     uint32_t incr, uint64_t flags)
 {
+	u32 *ptr = &ib->ptr[ib->length_dw];
+
 	/* for physically contiguous pages (vram) */
-	ib->ptr[ib->length_dw++] = SDMA_PKT_HEADER_OP(SDMA_OP_GEN_PTEPDE);
-	ib->ptr[ib->length_dw++] = lower_32_bits(pe); /* dst addr */
-	ib->ptr[ib->length_dw++] = upper_32_bits(pe);
-	ib->ptr[ib->length_dw++] = lower_32_bits(flags); /* mask */
-	ib->ptr[ib->length_dw++] = upper_32_bits(flags);
-	ib->ptr[ib->length_dw++] = lower_32_bits(addr); /* value */
-	ib->ptr[ib->length_dw++] = upper_32_bits(addr);
-	ib->ptr[ib->length_dw++] = incr; /* increment size */
-	ib->ptr[ib->length_dw++] = 0;
-	ib->ptr[ib->length_dw++] = count; /* number of entries */
+	*ptr++ = SDMA_PKT_HEADER_OP(SDMA_OP_GEN_PTEPDE);
+	*ptr++ = lower_32_bits(pe); /* dst addr */
+	*ptr++ = upper_32_bits(pe);
+	*ptr++ = lower_32_bits(flags); /* mask */
+	*ptr++ = upper_32_bits(flags);
+	*ptr++ = lower_32_bits(addr); /* value */
+	*ptr++ = upper_32_bits(addr);
+	*ptr++ = incr; /* increment size */
+	*ptr++ = 0;
+	*ptr++ = count; /* number of entries */
+
+	ib->length_dw = ptr - ib->ptr;
 }
 
 /**
@@ -1007,18 +1017,22 @@ static void sdma_v3_0_vm_set_pte_pde(struct amdgpu_ib *ib, uint64_t pe,
 static void sdma_v3_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib)
 {
 	struct amdgpu_sdma_instance *sdma = amdgpu_sdma_get_instance_from_ring(ring);
+	u32 *ptr = &ib->ptr[ib->length_dw];
 	u32 pad_count;
 	int i;
 
 	pad_count = (-ib->length_dw) & 7;
+	if (!pad_count)
+		return;
+
 	for (i = 0; i < pad_count; i++)
 		if (sdma && sdma->burst_nop && (i == 0))
-			ib->ptr[ib->length_dw++] =
-				SDMA_PKT_HEADER_OP(SDMA_OP_NOP) |
-				SDMA_PKT_NOP_HEADER_COUNT(pad_count - 1);
+			*ptr++ = SDMA_PKT_HEADER_OP(SDMA_OP_NOP) |
+				SDMA_PKT_NOP_HEADER_COUNT(pad_count - 1);
 		else
-			ib->ptr[ib->length_dw++] =
-				SDMA_PKT_HEADER_OP(SDMA_OP_NOP);
+			*ptr++ = SDMA_PKT_HEADER_OP(SDMA_OP_NOP);
+
+	ib->length_dw = ptr - ib->ptr;
 }
 
 /**
@@ -1626,14 +1640,18 @@ static void sdma_v3_0_emit_copy_buffer(struct amdgpu_ib *ib,
 				       uint32_t byte_count,
 				       uint32_t copy_flags)
 {
-	ib->ptr[ib->length_dw++] = SDMA_PKT_HEADER_OP(SDMA_OP_COPY) |
-		SDMA_PKT_HEADER_SUB_OP(SDMA_SUBOP_COPY_LINEAR);
-	ib->ptr[ib->length_dw++] = byte_count;
-	ib->ptr[ib->length_dw++] = 0; /* src/dst endian swap */
-	ib->ptr[ib->length_dw++] = lower_32_bits(src_offset);
-	ib->ptr[ib->length_dw++] = upper_32_bits(src_offset);
-	ib->ptr[ib->length_dw++] = lower_32_bits(dst_offset);
-	ib->ptr[ib->length_dw++] = upper_32_bits(dst_offset);
+	u32 *ptr = &ib->ptr[ib->length_dw];
+
+	*ptr++ = SDMA_PKT_HEADER_OP(SDMA_OP_COPY) |
+		SDMA_PKT_HEADER_SUB_OP(SDMA_SUBOP_COPY_LINEAR);
+	*ptr++ = byte_count;
+	*ptr++ = 0; /* src/dst endian swap */
+	*ptr++ = lower_32_bits(src_offset);
+	*ptr++ = upper_32_bits(src_offset);
+	*ptr++ = lower_32_bits(dst_offset);
+	*ptr++ = upper_32_bits(dst_offset);
+
+	ib->length_dw = ptr - ib->ptr;
 }
 
 /**
@@ -1651,11 +1669,15 @@ static void sdma_v3_0_emit_fill_buffer(struct amdgpu_ib *ib,
 				       uint64_t dst_offset,
 				       uint32_t byte_count)
 {
-	ib->ptr[ib->length_dw++] = SDMA_PKT_HEADER_OP(SDMA_OP_CONST_FILL);
-	ib->ptr[ib->length_dw++] = lower_32_bits(dst_offset);
-	ib->ptr[ib->length_dw++] = upper_32_bits(dst_offset);
-	ib->ptr[ib->length_dw++] = src_data;
-	ib->ptr[ib->length_dw++] = byte_count;
+	u32 *ptr = &ib->ptr[ib->length_dw];
+
+	*ptr++ = SDMA_PKT_HEADER_OP(SDMA_OP_CONST_FILL);
+	*ptr++ = lower_32_bits(dst_offset);
+	*ptr++ = upper_32_bits(dst_offset);
+	*ptr++ = src_data;
+	*ptr++ = byte_count;
+
+	ib->length_dw = ptr - ib->ptr;
 }
 
 static const struct amdgpu_buffer_funcs sdma_v3_0_buffer_funcs = {
-- 
2.48.0