Avoid constantly reloading ib->length_dw while emitting IBs by writing through a local pointer and updating the stored size only once, at the end of each helper.
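
For illustration, the helpers move from indexed stores, where the compiler generally cannot prove the command buffer does not alias the length field and so reloads and writes back ib->length_dw around every store, to writes through a cached local pointer with a single size update at the end. A minimal standalone sketch of the idiom, with made-up names (fake_ib, emit_old, emit_new), not the driver code:

	#include <stdint.h>

	struct fake_ib {
		uint32_t *ptr;		/* command buffer */
		uint32_t length_dw;	/* dwords written so far */
	};

	/* Before: each store reloads and writes back length_dw. */
	static void emit_old(struct fake_ib *ib, uint32_t reg, uint32_t val)
	{
		ib->ptr[ib->length_dw++] = reg;
		ib->ptr[ib->length_dw++] = val;
	}

	/* After: write via a local pointer, publish the size once. */
	static void emit_new(struct fake_ib *ib, uint32_t reg, uint32_t val)
	{
		uint32_t *ptr = &ib->ptr[ib->length_dw];

		*ptr++ = reg;
		*ptr++ = val;

		ib->length_dw = ptr - ib->ptr;
	}
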
Signed-off-by: Tvrtko Ursulin <[email protected]>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 134 +++++++++++++-----------
 1 file changed, 75 insertions(+), 59 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
index 5e0786ea911b..b9958778cd83 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
@@ -619,9 +619,12 @@ static int amdgpu_vcn_dec_send_msg(struct amdgpu_ring *ring,
 {
 	u64 addr = AMDGPU_GPU_PAGE_ALIGN(ib_msg->gpu_addr);
 	struct amdgpu_device *adev = ring->adev;
+	struct amdgpu_vcn_reg *internal = &adev->vcn.inst[ring->me].internal;
+	uint32_t nop = PACKET0(internal->nop, 0);
 	struct dma_fence *f = NULL;
 	struct amdgpu_job *job;
 	struct amdgpu_ib *ib;
+	uint32_t *ptr;
 	int i, r;
 
 	r = amdgpu_job_alloc_with_ib(ring->adev, NULL, NULL,
@@ -631,15 +634,16 @@ static int amdgpu_vcn_dec_send_msg(struct amdgpu_ring *ring,
 		goto err;
 
 	ib = &job->ibs[0];
-	ib->ptr[0] = PACKET0(adev->vcn.inst[ring->me].internal.data0, 0);
-	ib->ptr[1] = addr;
-	ib->ptr[2] = PACKET0(adev->vcn.inst[ring->me].internal.data1, 0);
-	ib->ptr[3] = addr >> 32;
-	ib->ptr[4] = PACKET0(adev->vcn.inst[ring->me].internal.cmd, 0);
-	ib->ptr[5] = 0;
+	ptr = ib->ptr;
+	*ptr++ = PACKET0(internal->data0, 0);
+	*ptr++ = addr;
+	*ptr++ = PACKET0(internal->data1, 0);
+	*ptr++ = addr >> 32;
+	*ptr++ = PACKET0(internal->cmd, 0);
+	*ptr++ = 0;
 	for (i = 6; i < 16; i += 2) {
-		ib->ptr[i] = PACKET0(adev->vcn.inst[ring->me].internal.nop, 0);
-		ib->ptr[i+1] = 0;
+		*ptr++ = nop;
+		*ptr++ = 0;
 	}
 	ib->length_dw = 16;
 
@@ -759,17 +763,20 @@ int amdgpu_vcn_dec_ring_test_ib(struct amdgpu_ring *ring, long timeout)
 static uint32_t *amdgpu_vcn_unified_ring_ib_header(struct amdgpu_ib *ib,
 						   uint32_t ib_pack_in_dw, bool enc)
 {
+	u32 *ptr = &ib->ptr[ib->length_dw];
 	uint32_t *ib_checksum;
 
-	ib->ptr[ib->length_dw++] = 0x00000010; /* single queue checksum */
-	ib->ptr[ib->length_dw++] = 0x30000002;
-	ib_checksum = &ib->ptr[ib->length_dw++];
-	ib->ptr[ib->length_dw++] = ib_pack_in_dw;
+	*ptr++ = 0x00000010; /* single queue checksum */
+	*ptr++ = 0x30000002;
+	ib_checksum = ptr++;
+	*ptr++ = ib_pack_in_dw;
 
-	ib->ptr[ib->length_dw++] = 0x00000010; /* engine info */
-	ib->ptr[ib->length_dw++] = 0x30000001;
-	ib->ptr[ib->length_dw++] = enc ? 0x2 : 0x3;
-	ib->ptr[ib->length_dw++] = ib_pack_in_dw * sizeof(uint32_t);
+	*ptr++ = 0x00000010; /* engine info */
+	*ptr++ = 0x30000001;
+	*ptr++ = enc ? 0x2 : 0x3;
+	*ptr++ = ib_pack_in_dw * sizeof(uint32_t);
+
+	ib->length_dw = ptr - ib->ptr;
 
 	return ib_checksum;
 }
@@ -799,7 +806,8 @@ static int amdgpu_vcn_dec_sw_send_msg(struct amdgpu_ring *ring,
 	uint64_t addr = AMDGPU_GPU_PAGE_ALIGN(ib_msg->gpu_addr);
 	uint32_t *ib_checksum;
 	uint32_t ib_pack_in_dw;
-	int i, r;
+	u32 *ptr;
+	int r;
 
 	if (adev->vcn.inst[ring->me].using_unified_queue)
 		ib_size_dw += 8;
@@ -820,18 +828,20 @@ static int amdgpu_vcn_dec_sw_send_msg(struct amdgpu_ring *ring,
 		ib_checksum = amdgpu_vcn_unified_ring_ib_header(ib, ib_pack_in_dw, false);
 	}
 
-	ib->ptr[ib->length_dw++] = sizeof(struct amdgpu_vcn_decode_buffer) + 8;
-	ib->ptr[ib->length_dw++] = cpu_to_le32(AMDGPU_VCN_IB_FLAG_DECODE_BUFFER);
-	decode_buffer = (struct amdgpu_vcn_decode_buffer *)&(ib->ptr[ib->length_dw]);
-	ib->length_dw += sizeof(struct amdgpu_vcn_decode_buffer) / 4;
+	ptr = &ib->ptr[ib->length_dw];
+
+	*ptr++ = sizeof(struct amdgpu_vcn_decode_buffer) + 8;
+	*ptr++ = cpu_to_le32(AMDGPU_VCN_IB_FLAG_DECODE_BUFFER);
+	decode_buffer = (struct amdgpu_vcn_decode_buffer *)ptr;
+	ptr += sizeof(struct amdgpu_vcn_decode_buffer) / 4;
+	ib->length_dw = ptr - ib->ptr;
 	memset(decode_buffer, 0, sizeof(struct amdgpu_vcn_decode_buffer));
 
 	decode_buffer->valid_buf_flag |= cpu_to_le32(AMDGPU_VCN_CMD_FLAG_MSG_BUFFER);
 	decode_buffer->msg_buffer_address_hi = cpu_to_le32(addr >> 32);
 	decode_buffer->msg_buffer_address_lo = cpu_to_le32(addr);
 
-	for (i = ib->length_dw; i < ib_size_dw; ++i)
-		ib->ptr[i] = 0x0;
+	memset32(ptr, 0, ib_size_dw - ib->length_dw);
 
 	if (adev->vcn.inst[ring->me].using_unified_queue)
 		amdgpu_vcn_unified_ring_ib_checksum(&ib_checksum, ib_pack_in_dw);
@@ -929,7 +939,8 @@ static int amdgpu_vcn_enc_get_create_msg(struct amdgpu_ring *ring, uint32_t hand
 	struct dma_fence *f = NULL;
 	uint32_t *ib_checksum = NULL;
 	uint64_t addr;
-	int i, r;
+	u32 *ptr;
+	int r;
 
 	if (adev->vcn.inst[ring->me].using_unified_queue)
 		ib_size_dw += 8;
@@ -941,31 +952,33 @@ static int amdgpu_vcn_enc_get_create_msg(struct amdgpu_ring *ring, uint32_t hand
 		return r;
 
 	ib = &job->ibs[0];
-	addr = AMDGPU_GPU_PAGE_ALIGN(ib_msg->gpu_addr);
 
-	ib->length_dw = 0;
+	addr = AMDGPU_GPU_PAGE_ALIGN(ib_msg->gpu_addr);
 
 	if (adev->vcn.inst[ring->me].using_unified_queue)
 		ib_checksum = amdgpu_vcn_unified_ring_ib_header(ib, 0x11, true);
 
-	ib->ptr[ib->length_dw++] = 0x00000018;
-	ib->ptr[ib->length_dw++] = 0x00000001; /* session info */
-	ib->ptr[ib->length_dw++] = handle;
-	ib->ptr[ib->length_dw++] = upper_32_bits(addr);
-	ib->ptr[ib->length_dw++] = addr;
-	ib->ptr[ib->length_dw++] = 0x00000000;
+	ptr = &ib->ptr[ib->length_dw];
 
-	ib->ptr[ib->length_dw++] = 0x00000014;
-	ib->ptr[ib->length_dw++] = 0x00000002; /* task info */
-	ib->ptr[ib->length_dw++] = 0x0000001c;
-	ib->ptr[ib->length_dw++] = 0x00000000;
-	ib->ptr[ib->length_dw++] = 0x00000000;
+	*ptr++ = 0x00000018;
+	*ptr++ = 0x00000001; /* session info */
+	*ptr++ = handle;
+	*ptr++ = upper_32_bits(addr);
+	*ptr++ = addr;
+	*ptr++ = 0x00000000;
 
-	ib->ptr[ib->length_dw++] = 0x00000008;
-	ib->ptr[ib->length_dw++] = 0x08000001; /* op initialize */
+	*ptr++ = 0x00000014;
+	*ptr++ = 0x00000002; /* task info */
+	*ptr++ = 0x0000001c;
+	*ptr++ = 0x00000000;
+	*ptr++ = 0x00000000;
 
-	for (i = ib->length_dw; i < ib_size_dw; ++i)
-		ib->ptr[i] = 0x0;
+	*ptr++ = 0x00000008;
+	*ptr++ = 0x08000001; /* op initialize */
+
+	ib->length_dw = ptr - ib->ptr;
+
+	memset32(ptr, 0, ib_size_dw - ib->length_dw);
 
 	if (adev->vcn.inst[ring->me].using_unified_queue)
 		amdgpu_vcn_unified_ring_ib_checksum(&ib_checksum, 0x11);
@@ -996,7 +1009,8 @@ static int amdgpu_vcn_enc_get_destroy_msg(struct amdgpu_ring *ring, uint32_t han
 	struct dma_fence *f = NULL;
 	uint32_t *ib_checksum = NULL;
 	uint64_t addr;
-	int i, r;
+	u32 *ptr;
+	int r;
 
 	if (adev->vcn.inst[ring->me].using_unified_queue)
 		ib_size_dw += 8;
@@ -1008,31 +1022,33 @@ static int amdgpu_vcn_enc_get_destroy_msg(struct amdgpu_ring *ring, uint32_t han
 		return r;
 
 	ib = &job->ibs[0];
-	addr = AMDGPU_GPU_PAGE_ALIGN(ib_msg->gpu_addr);
 
-	ib->length_dw = 0;
+	addr = AMDGPU_GPU_PAGE_ALIGN(ib_msg->gpu_addr);
 
 	if (adev->vcn.inst[ring->me].using_unified_queue)
 		ib_checksum = amdgpu_vcn_unified_ring_ib_header(ib, 0x11, true);
 
-	ib->ptr[ib->length_dw++] = 0x00000018;
-	ib->ptr[ib->length_dw++] = 0x00000001;
-	ib->ptr[ib->length_dw++] = handle;
-	ib->ptr[ib->length_dw++] = upper_32_bits(addr);
-	ib->ptr[ib->length_dw++] = addr;
-	ib->ptr[ib->length_dw++] = 0x00000000;
+	ptr = &ib->ptr[ib->length_dw];
 
-	ib->ptr[ib->length_dw++] = 0x00000014;
-	ib->ptr[ib->length_dw++] = 0x00000002;
-	ib->ptr[ib->length_dw++] = 0x0000001c;
-	ib->ptr[ib->length_dw++] = 0x00000000;
-	ib->ptr[ib->length_dw++] = 0x00000000;
+	*ptr++ = 0x00000018;
+	*ptr++ = 0x00000001;
+	*ptr++ = handle;
+	*ptr++ = upper_32_bits(addr);
+	*ptr++ = addr;
+	*ptr++ = 0x00000000;
 
-	ib->ptr[ib->length_dw++] = 0x00000008;
-	ib->ptr[ib->length_dw++] = 0x08000002; /* op close session */
+	*ptr++ = 0x00000014;
+	*ptr++ = 0x00000002;
+	*ptr++ = 0x0000001c;
+	*ptr++ = 0x00000000;
+	*ptr++ = 0x00000000;
 
-	for (i = ib->length_dw; i < ib_size_dw; ++i)
-		ib->ptr[i] = 0x0;
+	*ptr++ = 0x00000008;
+	*ptr++ = 0x08000002; /* op close session */
+
+	ib->length_dw = ptr - ib->ptr;
+
+	memset32(ptr, 0, ib_size_dw - ib->length_dw);
 
 	if (adev->vcn.inst[ring->me].using_unified_queue)
 		amdgpu_vcn_unified_ring_ib_checksum(&ib_checksum, 0x11);
-- 
2.48.0
