From: Marek Olšák <marek.ol...@amd.com>

---
 src/amd/common/sid.h                     |  4 ++++
 src/gallium/drivers/radeonsi/si_dma_cs.c | 29 ++++++++++++++++++++++++
 src/gallium/drivers/radeonsi/si_pipe.h   |  2 ++
 src/gallium/drivers/radeonsi/si_query.c  | 21 +++++++++++++++--
 src/gallium/drivers/radeonsi/si_query.h  |  1 +
 5 files changed, 55 insertions(+), 2 deletions(-)

diff --git a/src/amd/common/sid.h b/src/amd/common/sid.h index d9c4a1a7414..d696c01d4dd 100644 --- a/src/amd/common/sid.h +++ b/src/amd/common/sid.h @@ -9133,20 +9133,24 @@ #define CIK_SDMA_COPY_SUB_OPCODE_TILED_SUB_WINDOW 0x5 #define CIK_SDMA_COPY_SUB_OPCODE_T2T_SUB_WINDOW 0x6 #define CIK_SDMA_OPCODE_WRITE 0x2 #define SDMA_WRITE_SUB_OPCODE_LINEAR 0x0 #define SDMA_WRTIE_SUB_OPCODE_TILED 0x1 #define CIK_SDMA_OPCODE_INDIRECT_BUFFER 0x4 #define CIK_SDMA_PACKET_FENCE 0x5 #define CIK_SDMA_PACKET_TRAP 0x6 #define CIK_SDMA_PACKET_SEMAPHORE 0x7 #define CIK_SDMA_PACKET_CONSTANT_FILL 0xb +#define CIK_SDMA_OPCODE_TIMESTAMP 0xd +#define SDMA_TS_SUB_OPCODE_SET_LOCAL_TIMESTAMP 0x0 +#define SDMA_TS_SUB_OPCODE_GET_LOCAL_TIMESTAMP 0x1 +#define SDMA_TS_SUB_OPCODE_GET_GLOBAL_TIMESTAMP 0x2 #define CIK_SDMA_PACKET_SRBM_WRITE 0xe #define CIK_SDMA_COPY_MAX_SIZE 0x3fffe0 enum amd_cmp_class_flags { S_NAN = 1 << 0, // Signaling NaN Q_NAN = 1 << 1, // Quiet NaN N_INFINITY = 1 << 2, // Negative infinity N_NORMAL = 1 << 3, // Negative normal N_SUBNORMAL = 1 << 4, // Negative subnormal N_ZERO = 1 << 5, // Negative zero diff --git a/src/gallium/drivers/radeonsi/si_dma_cs.c b/src/gallium/drivers/radeonsi/si_dma_cs.c index 3bb769309e3..7db9570af3c 100644 --- a/src/gallium/drivers/radeonsi/si_dma_cs.c +++ b/src/gallium/drivers/radeonsi/si_dma_cs.c @@ -16,32 +16,61 @@ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE * USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ #include "si_pipe.h" +#include "sid.h" static void si_dma_emit_wait_idle(struct si_context *sctx) { struct radeon_cmdbuf *cs = sctx->dma_cs; /* NOP waits for idle. */ if (sctx->chip_class >= CIK) radeon_emit(cs, 0x00000000); /* NOP */ else radeon_emit(cs, 0xf0000000); /* NOP */ } +void si_dma_emit_timestamp(struct si_context *sctx, struct r600_resource *dst, + uint64_t offset) +{ + struct radeon_cmdbuf *cs = sctx->dma_cs; + uint64_t va = dst->gpu_address + offset; + + if (sctx->chip_class == SI) { + unreachable("SI DMA doesn't support the timestamp packet."); + return; + } + + /* Mark the buffer range of destination as valid (initialized), + * so that transfer_map knows it should wait for the GPU when mapping + * that range. */ + util_range_add(&dst->valid_buffer_range, offset, offset + 8); + + assert(va % 8 == 0); + + si_need_dma_space(sctx, 4, dst, NULL); + si_dma_emit_wait_idle(sctx); + + radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_TIMESTAMP, + SDMA_TS_SUB_OPCODE_GET_GLOBAL_TIMESTAMP, + 0)); + radeon_emit(cs, va); + radeon_emit(cs, va >> 32); +} + void si_need_dma_space(struct si_context *ctx, unsigned num_dw, struct r600_resource *dst, struct r600_resource *src) { uint64_t vram = ctx->dma_cs->used_vram; uint64_t gtt = ctx->dma_cs->used_gart; if (dst) { vram += dst->vram_usage; gtt += dst->gart_usage; } diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 95489f09612..4c3f13b84e2 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -1148,20 +1148,22 @@ void si_log_draw_state(struct si_context *sctx, struct u_log_context *log); void si_log_compute_state(struct si_context *sctx, struct u_log_context *log); void si_init_debug_functions(struct si_context *sctx); void si_check_vm_faults(struct si_context *sctx, struct radeon_saved_cs *saved, enum ring_type ring); bool si_replace_shader(unsigned num, struct ac_shader_binary *binary); /* si_dma.c */ void 
si_init_dma_functions(struct si_context *sctx); /* si_dma_cs.c */ +void si_dma_emit_timestamp(struct si_context *sctx, struct r600_resource *dst, + uint64_t offset); void si_need_dma_space(struct si_context *ctx, unsigned num_dw, struct r600_resource *dst, struct r600_resource *src); void si_flush_dma_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_handle **fence); void si_screen_clear_buffer(struct si_screen *sscreen, struct pipe_resource *dst, uint64_t offset, uint64_t size, unsigned value); /* si_fence.c */ void si_gfx_write_event_eop(struct si_context *ctx, unsigned event, unsigned event_flags, diff --git a/src/gallium/drivers/radeonsi/si_query.c b/src/gallium/drivers/radeonsi/si_query.c index f768b531139..93efbd4ef4a 100644 --- a/src/gallium/drivers/radeonsi/si_query.c +++ b/src/gallium/drivers/radeonsi/si_query.c @@ -641,20 +641,25 @@ static struct pipe_query *si_query_hw_create(struct si_screen *sscreen, query->ops = &query_hw_default_hw_ops; switch (query_type) { case PIPE_QUERY_OCCLUSION_COUNTER: case PIPE_QUERY_OCCLUSION_PREDICATE: case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: query->result_size = 16 * sscreen->info.num_render_backends; query->result_size += 16; /* for the fence + alignment */ query->num_cs_dw_end = 6 + si_gfx_write_fence_dwords(sscreen); break; + case SI_QUERY_TIME_ELAPSED_SDMA: + /* GET_GLOBAL_TIMESTAMP only works if the offset is a multiple of 32. 
*/ + query->result_size = 64; + query->num_cs_dw_end = 0; + break; case PIPE_QUERY_TIME_ELAPSED: query->result_size = 24; query->num_cs_dw_end = 8 + si_gfx_write_fence_dwords(sscreen); break; case PIPE_QUERY_TIMESTAMP: query->result_size = 16; query->num_cs_dw_end = 8 + si_gfx_write_fence_dwords(sscreen); query->flags = SI_QUERY_HW_FLAG_NO_START; break; case PIPE_QUERY_PRIMITIVES_EMITTED: @@ -740,20 +745,23 @@ static void emit_sample_streamout(struct radeon_cmdbuf *cs, uint64_t va, } static void si_query_hw_do_emit_start(struct si_context *sctx, struct si_query_hw *query, struct r600_resource *buffer, uint64_t va) { struct radeon_cmdbuf *cs = sctx->gfx_cs; switch (query->b.type) { + case SI_QUERY_TIME_ELAPSED_SDMA: + si_dma_emit_timestamp(sctx, buffer, va - buffer->gpu_address); + return; case PIPE_QUERY_OCCLUSION_COUNTER: case PIPE_QUERY_OCCLUSION_PREDICATE: case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1)); radeon_emit(cs, va); radeon_emit(cs, va >> 32); break; case PIPE_QUERY_PRIMITIVES_EMITTED: case PIPE_QUERY_PRIMITIVES_GENERATED: @@ -795,21 +803,22 @@ static void si_query_hw_emit_start(struct si_context *sctx, struct si_query_hw *query) { uint64_t va; if (!query->buffer.buf) return; // previous buffer allocation failure si_update_occlusion_query_state(sctx, query->b.type, 1); si_update_prims_generated_query_state(sctx, query->b.type, 1); - si_need_gfx_cs_space(sctx); + if (query->b.type != SI_QUERY_TIME_ELAPSED_SDMA) + si_need_gfx_cs_space(sctx); /* Get a new query buffer if needed. 
*/ if (query->buffer.results_end + query->result_size > query->buffer.buf->b.b.width0) { struct si_query_buffer *qbuf = MALLOC_STRUCT(si_query_buffer); *qbuf = query->buffer; query->buffer.results_end = 0; query->buffer.previous = qbuf; query->buffer.buf = si_new_query_buffer(sctx->screen, query); if (!query->buffer.buf) return; @@ -825,20 +834,23 @@ static void si_query_hw_emit_start(struct si_context *sctx, static void si_query_hw_do_emit_stop(struct si_context *sctx, struct si_query_hw *query, struct r600_resource *buffer, uint64_t va) { struct radeon_cmdbuf *cs = sctx->gfx_cs; uint64_t fence_va = 0; switch (query->b.type) { + case SI_QUERY_TIME_ELAPSED_SDMA: + si_dma_emit_timestamp(sctx, buffer, va + 32 - buffer->gpu_address); + return; case PIPE_QUERY_OCCLUSION_COUNTER: case PIPE_QUERY_OCCLUSION_PREDICATE: case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: va += 8; radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1)); radeon_emit(cs, va); radeon_emit(cs, va >> 32); fence_va = va + sctx->screen->info.num_render_backends * 16 - 8; @@ -1015,21 +1027,22 @@ static void si_emit_query_predication(struct si_context *ctx) } } static struct pipe_query *si_create_query(struct pipe_context *ctx, unsigned query_type, unsigned index) { struct si_screen *sscreen = (struct si_screen *)ctx->screen; if (query_type == PIPE_QUERY_TIMESTAMP_DISJOINT || query_type == PIPE_QUERY_GPU_FINISHED || - query_type >= PIPE_QUERY_DRIVER_SPECIFIC) + (query_type >= PIPE_QUERY_DRIVER_SPECIFIC && + query_type != SI_QUERY_TIME_ELAPSED_SDMA)) return si_query_sw_create(query_type); return si_query_hw_create(sscreen, query_type, index); } static void si_destroy_query(struct pipe_context *ctx, struct pipe_query *query) { struct si_context *sctx = (struct si_context *)ctx; struct si_query *rquery = (struct si_query *)query; @@ -1231,20 +1244,23 @@ static void si_query_hw_add_result(struct si_screen *sscreen, for (unsigned i = 0; i < max_rbs; 
++i) { unsigned results_base = i * 16; result->b = result->b || si_query_read_result(buffer + results_base, 0, 2, true) != 0; } break; } case PIPE_QUERY_TIME_ELAPSED: result->u64 += si_query_read_result(buffer, 0, 2, false); break; + case SI_QUERY_TIME_ELAPSED_SDMA: + result->u64 += si_query_read_result(buffer, 0, 32/4, false); + break; case PIPE_QUERY_TIMESTAMP: result->u64 = *(uint64_t*)buffer; break; case PIPE_QUERY_PRIMITIVES_EMITTED: /* SAMPLE_STREAMOUTSTATS stores this structure: * { * u64 NumPrimitivesWritten; * u64 PrimitiveStorageNeeded; * } * We only need NumPrimitivesWritten here. */ @@ -1375,20 +1391,21 @@ bool si_query_hw_get_result(struct si_context *sctx, while (results_base != qbuf->results_end) { query->ops->add_result(sscreen, query, map + results_base, result); results_base += query->result_size; } } /* Convert the time to expected units. */ if (rquery->type == PIPE_QUERY_TIME_ELAPSED || + rquery->type == SI_QUERY_TIME_ELAPSED_SDMA || rquery->type == PIPE_QUERY_TIMESTAMP) { result->u64 = (1000000 * result->u64) / sscreen->info.clock_crystal_freq; } return true; } static void si_restore_qbo_state(struct si_context *sctx, struct si_qbo_state *st) { sctx->b.bind_compute_state(&sctx->b, st->saved_compute); diff --git a/src/gallium/drivers/radeonsi/si_query.h b/src/gallium/drivers/radeonsi/si_query.h index 3f60208e2f8..bc3eb397bc5 100644 --- a/src/gallium/drivers/radeonsi/si_query.h +++ b/src/gallium/drivers/radeonsi/si_query.h @@ -102,20 +102,21 @@ enum { SI_QUERY_GPU_SCRATCH_RAM_BUSY, SI_QUERY_NUM_COMPILATIONS, SI_QUERY_NUM_SHADERS_CREATED, SI_QUERY_BACK_BUFFER_PS_DRAW_RATIO, SI_QUERY_NUM_SHADER_CACHE_HITS, SI_QUERY_GPIN_ASIC_ID, SI_QUERY_GPIN_NUM_SIMD, SI_QUERY_GPIN_NUM_RB, SI_QUERY_GPIN_NUM_SPI, SI_QUERY_GPIN_NUM_SE, + SI_QUERY_TIME_ELAPSED_SDMA, SI_QUERY_FIRST_PERFCOUNTER = PIPE_QUERY_DRIVER_SPECIFIC + 100, }; enum { SI_QUERY_GROUP_GPIN = 0, SI_NUM_SW_QUERY_GROUPS }; struct si_query_ops { -- 2.17.1 _______________________________________________ 
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev