From: Marek Olšák <marek.ol...@amd.com> --- src/gallium/drivers/radeonsi/Makefile.sources | 2 +- src/gallium/drivers/radeonsi/meson.build | 2 +- src/gallium/drivers/radeonsi/si_blit.c | 2 +- src/gallium/drivers/radeonsi/si_cp_dma.c | 8 +- src/gallium/drivers/radeonsi/si_pipe.c | 8 +- src/gallium/drivers/radeonsi/si_pipe.h | 9 +- .../drivers/radeonsi/si_shaderlib_tgsi.c | 102 ++++ .../drivers/radeonsi/si_test_clearbuffer.c | 139 ------ .../drivers/radeonsi/si_test_dma_perf.c | 470 ++++++++++++++++++ 9 files changed, 590 insertions(+), 152 deletions(-) delete mode 100644 src/gallium/drivers/radeonsi/si_test_clearbuffer.c create mode 100644 src/gallium/drivers/radeonsi/si_test_dma_perf.c
diff --git a/src/gallium/drivers/radeonsi/Makefile.sources b/src/gallium/drivers/radeonsi/Makefile.sources index b52db3a0598..abdc4e07f1e 100644 --- a/src/gallium/drivers/radeonsi/Makefile.sources +++ b/src/gallium/drivers/radeonsi/Makefile.sources @@ -37,22 +37,22 @@ C_SOURCES := \ si_shader_tgsi_setup.c \ si_shaderlib_tgsi.c \ si_state.c \ si_state_binning.c \ si_state_draw.c \ si_state_msaa.c \ si_state_shaders.c \ si_state_streamout.c \ si_state_viewport.c \ si_state.h \ - si_test_clearbuffer.c \ si_test_dma.c \ + si_test_dma_perf.c \ si_texture.c \ si_uvd.c \ ../radeon/r600_perfcounter.c \ ../radeon/radeon_uvd.c \ ../radeon/radeon_uvd.h \ ../radeon/radeon_vcn_dec.c \ ../radeon/radeon_vcn_dec.h \ ../radeon/radeon_vcn_enc_1_2.c \ ../radeon/radeon_vcn_enc.c \ ../radeon/radeon_vcn_enc.h \ diff --git a/src/gallium/drivers/radeonsi/meson.build b/src/gallium/drivers/radeonsi/meson.build index 57229046de1..4d6044f724b 100644 --- a/src/gallium/drivers/radeonsi/meson.build +++ b/src/gallium/drivers/radeonsi/meson.build @@ -53,22 +53,22 @@ files_libradeonsi = files( 'si_shader_tgsi_setup.c', 'si_shaderlib_tgsi.c', 'si_state.c', 'si_state.h', 'si_state_binning.c', 'si_state_draw.c', 'si_state_msaa.c', 'si_state_shaders.c', 'si_state_streamout.c', 'si_state_viewport.c', - 'si_test_clearbuffer.c', 'si_test_dma.c', + 'si_test_dma_perf.c', 'si_texture.c', 'si_uvd.c', '../radeon/r600_perfcounter.c', '../radeon/radeon_uvd.c', '../radeon/radeon_uvd.h', '../radeon/radeon_vcn_enc_1_2.c', '../radeon/radeon_vcn_enc.c', '../radeon/radeon_vcn_enc.h', '../radeon/radeon_vcn_dec.c', '../radeon/radeon_vcn_dec.h', diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c index cf6495291bd..fcaff80125c 100644 --- a/src/gallium/drivers/radeonsi/si_blit.c +++ b/src/gallium/drivers/radeonsi/si_blit.c @@ -903,21 +903,21 @@ void si_resource_copy_region(struct pipe_context *ctx, struct si_context *sctx = (struct si_context *)ctx; struct si_texture *ssrc = 
(struct si_texture*)src; struct pipe_surface *dst_view, dst_templ; struct pipe_sampler_view src_templ, *src_view; unsigned dst_width, dst_height, src_width0, src_height0; unsigned dst_width0, dst_height0, src_force_level = 0; struct pipe_box sbox, dstbox; /* Handle buffers first. */ if (dst->target == PIPE_BUFFER && src->target == PIPE_BUFFER) { - si_copy_buffer(sctx, dst, src, dstx, src_box->x, src_box->width, 0); + si_copy_buffer(sctx, dst, src, dstx, src_box->x, src_box->width, 0, -1); return; } assert(u_max_sample(dst) == u_max_sample(src)); /* The driver doesn't decompress resources automatically while * u_blitter is rendering. */ si_decompress_subresource(ctx, src, PIPE_MASK_RGBAZS, src_level, src_box->z, src_box->z + src_box->depth - 1); diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c index 61be22f28b5..486ae75c77f 100644 --- a/src/gallium/drivers/radeonsi/si_cp_dma.c +++ b/src/gallium/drivers/radeonsi/si_cp_dma.c @@ -429,32 +429,34 @@ static void si_cp_dma_realign_engine(struct si_context *sctx, unsigned size, } /** * Do memcpy between buffers using CP DMA. * * \param user_flags bitmask of SI_CPDMA_* */ void si_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, struct pipe_resource *src, uint64_t dst_offset, uint64_t src_offset, unsigned size, - unsigned user_flags) + unsigned user_flags, enum si_cache_policy cache_policy) { uint64_t main_dst_offset, main_src_offset; unsigned skipped_size = 0; unsigned realign_size = 0; enum si_coherency coher = SI_COHERENCY_SHADER; - enum si_cache_policy cache_policy = get_cache_policy(sctx, coher); bool is_first = true; if (!size) return; + if (cache_policy == -1) + cache_policy = get_cache_policy(sctx, coher); + if (dst != src || dst_offset != src_offset) { /* Mark the buffer range of destination as valid (initialized), * so that transfer_map knows it should wait for the GPU when mapping * that range. 
*/ util_range_add(&r600_resource(dst)->valid_buffer_range, dst_offset, dst_offset + size); } dst_offset += r600_resource(dst)->gpu_address; src_offset += r600_resource(src)->gpu_address; @@ -532,21 +534,21 @@ void si_copy_buffer(struct si_context *sctx, /* If it's not a prefetch... */ if (dst_offset != src_offset) sctx->num_cp_dma_calls++; } void cik_prefetch_TC_L2_async(struct si_context *sctx, struct pipe_resource *buf, uint64_t offset, unsigned size) { assert(sctx->chip_class >= CIK); - si_copy_buffer(sctx, buf, buf, offset, offset, size, SI_CPDMA_SKIP_ALL); + si_copy_buffer(sctx, buf, buf, offset, offset, size, SI_CPDMA_SKIP_ALL, L2_LRU); } static void cik_prefetch_shader_async(struct si_context *sctx, struct si_pm4_state *state) { struct pipe_resource *bo = &state->bo[0]->b.b; assert(state->nbo == 1); cik_prefetch_TC_L2_async(sctx, bo, 0, bo->width0); } diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index 13fcf1f3aea..c259c260550 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -94,21 +94,21 @@ static const struct debug_named_value debug_options[] = { { "nodccclear", DBG(NO_DCC_CLEAR), "Disable DCC fast clear." }, { "nodccfb", DBG(NO_DCC_FB), "Disable separate DCC on the main framebuffer" }, { "nodccmsaa", DBG(NO_DCC_MSAA), "Disable DCC for MSAA" }, { "nofmask", DBG(NO_FMASK), "Disable MSAA compression" }, /* Tests: */ { "testdma", DBG(TEST_DMA), "Invoke SDMA tests and exit." }, { "testvmfaultcp", DBG(TEST_VMFAULT_CP), "Invoke a CP VM fault test and exit." }, { "testvmfaultsdma", DBG(TEST_VMFAULT_SDMA), "Invoke a SDMA VM fault test and exit." }, { "testvmfaultshader", DBG(TEST_VMFAULT_SHADER), "Invoke a shader VM fault test and exit." 
}, - { "testclearbufperf", DBG(TEST_CLEARBUF_PERF), "Test Clearbuffer Performance" }, + { "testdmaperf", DBG(TEST_DMA_PERF), "Test DMA performance" }, DEBUG_NAMED_VALUE_END /* must be last */ }; static void si_init_compiler(struct si_screen *sscreen, struct ac_llvm_compiler *compiler) { /* Only create the less-optimizing version of the compiler on APUs * predating Ryzen (Raven). */ bool create_low_opt_compiler = !sscreen->info.has_dedicated_vram && @@ -723,21 +723,21 @@ static void si_test_vmfault(struct si_screen *sscreen) pipe_buffer_create_const0(&sscreen->b, 0, PIPE_USAGE_DEFAULT, 64); if (!buf) { puts("Buffer allocation failed."); exit(1); } r600_resource(buf)->gpu_address = 0; /* cause a VM fault */ if (sscreen->debug_flags & DBG(TEST_VMFAULT_CP)) { - si_copy_buffer(sctx, buf, buf, 0, 4, 4, 0); + si_copy_buffer(sctx, buf, buf, 0, 4, 4, 0, -1); ctx->flush(ctx, NULL, 0); puts("VM fault test: CP - done."); } if (sscreen->debug_flags & DBG(TEST_VMFAULT_SDMA)) { sctx->dma_clear_buffer(sctx, buf, 0, 4, 0); ctx->flush(ctx, NULL, 0); puts("VM fault test: SDMA - done."); } if (sscreen->debug_flags & DBG(TEST_VMFAULT_SHADER)) { util_test_constant_buffer(ctx, buf); @@ -1063,21 +1063,21 @@ struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws, si_init_compiler(sscreen, &sscreen->compiler[i]); for (i = 0; i < num_comp_lo_threads; i++) si_init_compiler(sscreen, &sscreen->compiler_lowp[i]); /* Create the auxiliary context. This must be done last. 
*/ sscreen->aux_context = si_create_context(&sscreen->b, 0); if (sscreen->debug_flags & DBG(TEST_DMA)) si_test_dma(sscreen); - if (sscreen->debug_flags & DBG(TEST_CLEARBUF_PERF)) { - si_test_clearbuffer_perf(sscreen); + if (sscreen->debug_flags & DBG(TEST_DMA_PERF)) { + si_test_dma_perf(sscreen); } if (sscreen->debug_flags & (DBG(TEST_VMFAULT_CP) | DBG(TEST_VMFAULT_SDMA) | DBG(TEST_VMFAULT_SHADER))) si_test_vmfault(sscreen); return &sscreen->b; } diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index fe06064b388..cfd7622c7a3 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -158,21 +158,21 @@ enum { DBG_NO_DCC_CLEAR, DBG_NO_DCC_FB, DBG_NO_DCC_MSAA, DBG_NO_FMASK, /* Tests: */ DBG_TEST_DMA, DBG_TEST_VMFAULT_CP, DBG_TEST_VMFAULT_SDMA, DBG_TEST_VMFAULT_SHADER, - DBG_TEST_CLEARBUF_PERF, + DBG_TEST_DMA_PERF, }; #define DBG_ALL_SHADERS (((1 << (DBG_CS + 1)) - 1)) #define DBG(name) (1ull << DBG_##name) struct si_compute; struct hash_table; struct u_suballocator; /* Only 32-bit buffer allocations are supported, gallium doesn't support more @@ -1126,21 +1126,21 @@ void si_cp_dma_wait_for_idle(struct si_context *sctx); void si_cp_dma_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, uint64_t offset, uint64_t size, unsigned value, enum si_coherency coher, enum si_cache_policy cache_policy); void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, uint64_t offset, uint64_t size, unsigned value, enum si_coherency coher); void si_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, struct pipe_resource *src, uint64_t dst_offset, uint64_t src_offset, unsigned size, - unsigned user_flags); + unsigned user_flags, enum si_cache_policy cache_policy); void cik_prefetch_TC_L2_async(struct si_context *sctx, struct pipe_resource *buf, uint64_t offset, unsigned size); void cik_emit_prefetch_L2(struct si_context *sctx, bool vertex_stage_only); void 
si_init_cp_dma_functions(struct si_context *sctx); /* si_debug.c */ void si_save_cs(struct radeon_winsys *ws, struct radeon_cmdbuf *cs, struct radeon_saved_cs *saved, bool get_buffer_list); void si_clear_saved_cs(struct radeon_saved_cs *saved); void si_destroy_saved_cs(struct si_saved_cs *scs); @@ -1210,27 +1210,30 @@ bool si_check_device_reset(struct si_context *sctx); /* si_query.c */ void si_init_screen_query_functions(struct si_screen *sscreen); void si_init_query_functions(struct si_context *sctx); void si_suspend_queries(struct si_context *sctx); void si_resume_queries(struct si_context *sctx); /* si_shaderlib_tgsi.c */ void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type, unsigned num_layers); void *si_create_fixed_func_tcs(struct si_context *sctx); +void *si_create_dma_compute_shader(struct pipe_context *ctx, + unsigned num_dwords_per_thread, + bool stream_cache_policy, bool is_copy); void *si_create_query_result_cs(struct si_context *sctx); /* si_test_dma.c */ void si_test_dma(struct si_screen *sscreen); /* si_test_clearbuffer.c */ -void si_test_clearbuffer_perf(struct si_screen *sscreen); +void si_test_dma_perf(struct si_screen *sscreen); /* si_uvd.c */ struct pipe_video_codec *si_uvd_create_decoder(struct pipe_context *context, const struct pipe_video_codec *templ); struct pipe_video_buffer *si_video_buffer_create(struct pipe_context *pipe, const struct pipe_video_buffer *tmpl); /* si_viewport.c */ void si_update_vs_viewport_state(struct si_context *ctx); diff --git a/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c b/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c index 45bc93ed782..911b710abe6 100644 --- a/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c +++ b/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c @@ -112,20 +112,122 @@ void *si_create_fixed_func_tcs(struct si_context *sctx) tessouter = ureg_DECL_output(ureg, TGSI_SEMANTIC_TESSOUTER, 0); tessinner = ureg_DECL_output(ureg, TGSI_SEMANTIC_TESSINNER, 0); ureg_MOV(ureg, 
tessouter, outer); ureg_MOV(ureg, tessinner, inner); ureg_END(ureg); return ureg_create_shader_and_destroy(ureg, &sctx->b); } +/* Create a compute shader implementing clear_buffer or copy_buffer. */ +void *si_create_dma_compute_shader(struct pipe_context *ctx, + unsigned num_dwords_per_thread, + bool stream_cache_policy, bool is_copy) +{ + assert(util_is_power_of_two_nonzero(num_dwords_per_thread)); + + unsigned qualifier = TGSI_MEMORY_COHERENT | TGSI_MEMORY_RESTRICT; + if (stream_cache_policy) + qualifier |= TGSI_MEMORY_STREAM_CACHE_POLICY; + + unsigned num_mem_ops = MAX2(1, num_dwords_per_thread / 4); + unsigned *inst_dwords = alloca(num_mem_ops * sizeof(unsigned)); + + for (unsigned i = 0; i < num_mem_ops; i++) { + if (i*4 < num_dwords_per_thread) + inst_dwords[i] = MIN2(4, num_dwords_per_thread - i*4); + } + + struct ureg_program *ureg = ureg_create(PIPE_SHADER_COMPUTE); + if (!ureg) + return NULL; + + ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH, 64); + ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT, 1); + ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH, 1); + + struct ureg_src value; + if (!is_copy) { + ureg_property(ureg, TGSI_PROPERTY_CS_USER_DATA_DWORDS, inst_dwords[0]); + value = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_CS_USER_DATA, 0); + } + + struct ureg_src tid = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_THREAD_ID, 0); + struct ureg_src blk = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_BLOCK_ID, 0); + struct ureg_dst store_addr = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_X); + struct ureg_dst load_addr = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_X); + struct ureg_dst dstbuf = ureg_dst(ureg_DECL_buffer(ureg, 0, false)); + struct ureg_src srcbuf; + struct ureg_src *values = NULL; + + if (is_copy) { + srcbuf = ureg_DECL_buffer(ureg, 1, false); + values = malloc(num_mem_ops * sizeof(struct ureg_src)); + } + + /* If there are multiple stores, the first store writes into 0+tid, + * the 2nd store 
writes into 64+tid, the 3rd store writes into 128+tid, etc. + */ + ureg_UMAD(ureg, store_addr, blk, ureg_imm1u(ureg, 64 * num_mem_ops), tid); + /* Convert from a "store size unit" into bytes. */ + ureg_UMUL(ureg, store_addr, ureg_src(store_addr), + ureg_imm1u(ureg, 4 * inst_dwords[0])); + ureg_MOV(ureg, load_addr, ureg_src(store_addr)); + + /* Distance between a load and a store for latency hiding. */ + unsigned load_store_distance = is_copy ? 8 : 0; + + for (unsigned i = 0; i < num_mem_ops + load_store_distance; i++) { + int d = i - load_store_distance; + + if (is_copy && i < num_mem_ops) { + if (i) { + ureg_UADD(ureg, load_addr, ureg_src(load_addr), + ureg_imm1u(ureg, 4 * inst_dwords[i] * 64)); + } + + values[i] = ureg_src(ureg_DECL_temporary(ureg)); + struct ureg_dst dst = + ureg_writemask(ureg_dst(values[i]), + u_bit_consecutive(0, inst_dwords[i])); + struct ureg_src srcs[] = {srcbuf, ureg_src(load_addr)}; + ureg_memory_insn(ureg, TGSI_OPCODE_LOAD, &dst, 1, srcs, 2, + qualifier, TGSI_TEXTURE_BUFFER, 0); + } + + if (d >= 0) { + if (d) { + ureg_UADD(ureg, store_addr, ureg_src(store_addr), + ureg_imm1u(ureg, 4 * inst_dwords[d] * 64)); + } + + struct ureg_dst dst = + ureg_writemask(dstbuf, u_bit_consecutive(0, inst_dwords[d])); + struct ureg_src srcs[] = + {ureg_src(store_addr), is_copy ? values[d] : value}; + ureg_memory_insn(ureg, TGSI_OPCODE_STORE, &dst, 1, srcs, 2, + qualifier, TGSI_TEXTURE_BUFFER, 0); + } + } + ureg_END(ureg); + + struct pipe_compute_state state = {}; + state.ir_type = PIPE_SHADER_IR_TGSI; + state.prog = ureg_get_tokens(ureg, NULL); + + void *cs = ctx->create_compute_state(ctx, &state); + ureg_destroy(ureg); + free(values); + return cs; +} + /* Create the compute shader that is used to collect the results. * * One compute grid with a single thread is launched for every query result * buffer. 
The thread (optionally) reads a previous summary buffer, then * accumulates data from the query result buffer, and writes the result either * to a summary buffer to be consumed by the next grid invocation or to the * user-supplied buffer. * * Data layout: * diff --git a/src/gallium/drivers/radeonsi/si_test_clearbuffer.c b/src/gallium/drivers/radeonsi/si_test_clearbuffer.c deleted file mode 100644 index e863381fd15..00000000000 --- a/src/gallium/drivers/radeonsi/si_test_clearbuffer.c +++ /dev/null @@ -1,139 +0,0 @@ -/* - * Copyright 2018 Advanced Micro Devices, Inc. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - */ - -/* This file implements tests on the si_clearbuffer function. 
*/ - -#include "si_pipe.h" - -#define CLEARBUF_MIN 32 -#define CLEARBUF_COUNT 16 -#define CLEARBUF_MEMSZ 1024 - -static uint64_t -measure_clearbuf_time(struct pipe_context *ctx, - uint64_t memory_size) -{ - struct pipe_query *query_te; - union pipe_query_result qresult; - struct pipe_resource *buf; - - struct si_context *sctx = (struct si_context*)ctx; - struct pipe_screen *screen = ctx->screen; - - buf = pipe_buffer_create(screen, 0, PIPE_USAGE_DEFAULT, memory_size); - - query_te = ctx->create_query(ctx, PIPE_QUERY_TIME_ELAPSED, 0); - - ctx->begin_query(ctx, query_te); - /* operation */ - si_cp_dma_clear_buffer(sctx, buf, 0, memory_size, 0x00, - SI_COHERENCY_SHADER, L2_LRU); - ctx->end_query(ctx, query_te); - ctx->get_query_result(ctx, query_te, true, &qresult); - - /* Cleanup. */ - ctx->destroy_query(ctx, query_te); - pipe_resource_reference(&buf, NULL); - - /* Report Results */ - return qresult.u64; -} - -/** - * @brief Analyze rate of clearing a 1K Buffer averaged over 16 iterations - * @param ctx Context of pipe to perform analysis on - */ -static void -analyze_clearbuf_perf_avg(struct pipe_context *ctx) -{ - uint index = 0; - uint64_t result[CLEARBUF_COUNT]; - uint64_t sum = 0; - long long int rate_kBps; - - /* Run Tests. */ - for (index = 0 ; index < CLEARBUF_COUNT ; index++) { - result[index] = measure_clearbuf_time(ctx, CLEARBUF_MEMSZ); - sum += result[index]; - } - - /* Calculate Results. */ - /* kBps = (size(bytes))/(1000) / (time(ns)/(1000*1000*1000)) */ - rate_kBps = CLEARBUF_COUNT*CLEARBUF_MEMSZ; - rate_kBps *= 1000UL*1000UL; - rate_kBps /= sum; - - /* Display Results. 
*/ - printf("CP DMA clear_buffer performance (buffer %lu ,repeat %u ):", - (uint64_t)CLEARBUF_MEMSZ, - CLEARBUF_COUNT ); - printf(" %llu kB/s\n", rate_kBps ); -} - -/** - * @brief Analyze rate of clearing a range of Buffer sizes - * @param ctx Context of pipe to perform analysis on - */ -static void -analyze_clearbuf_perf_rng(struct pipe_context *ctx) -{ - uint index = 0; - uint64_t result[CLEARBUF_COUNT]; - uint64_t mem_size; - long long int rate_kBps; - - /* Run Tests. */ - mem_size = CLEARBUF_MIN; - for (index = 0 ; index < CLEARBUF_COUNT ; index++ ) { - result[index] = measure_clearbuf_time(ctx, mem_size); - mem_size <<= 1; - } - - /* Calculate & Display Results. */ - /* kBps = (size(bytes))/(1000) / (time(ns)/(1000*1000*1000)) */ - mem_size = CLEARBUF_MIN; - for (index = 0 ; index < CLEARBUF_COUNT ; index++ ) { - rate_kBps = mem_size; - rate_kBps *= 1000UL*1000UL; - rate_kBps /= result[index]; - - printf("CP DMA clear_buffer performance (buffer %lu):", - mem_size ); - printf(" %llu kB/s\n", rate_kBps ); - - mem_size <<= 1; - } -} - -void si_test_clearbuffer_perf(struct si_screen *sscreen) -{ - struct pipe_screen *screen = &sscreen->b; - struct pipe_context *ctx = screen->context_create(screen, NULL, 0); - - analyze_clearbuf_perf_avg(ctx); - analyze_clearbuf_perf_rng(ctx); - - exit(0); -} diff --git a/src/gallium/drivers/radeonsi/si_test_dma_perf.c b/src/gallium/drivers/radeonsi/si_test_dma_perf.c new file mode 100644 index 00000000000..46d31a2e16e --- /dev/null +++ b/src/gallium/drivers/radeonsi/si_test_dma_perf.c @@ -0,0 +1,470 @@ +/* + * Copyright 2018 Advanced Micro Devices, Inc. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +/* This file benchmarks buffer clear and copy performance using CP DMA, SDMA, and compute shaders.
*/ + +#include "si_pipe.h" +#include "si_query.h" + +#define MIN_SIZE 256 +#define MAX_SIZE (128 * 1024 * 1024) +#define SIZE_SHIFT 1 +#define NUM_RUNS 128 + +static double get_MBps_rate(unsigned num_bytes, unsigned ns) +{ + return (num_bytes / (1024.0 * 1024.0)) / (ns / 1000000000.0); +} + +void si_test_dma_perf(struct si_screen *sscreen) +{ + struct pipe_screen *screen = &sscreen->b; + struct pipe_context *ctx = screen->context_create(screen, NULL, 0); + struct si_context *sctx = (struct si_context*)ctx; + const uint32_t clear_value = 0x12345678; + static const unsigned cs_dwords_per_thread_list[] = {1, 4, 16, 64}; + static const unsigned cs_waves_per_sh_list[] = {1, 2, 4, 8, 16, 0}; + +#define NUM_SHADERS (ARRAY_SIZE(cs_dwords_per_thread_list) * 2) +#define NUM_METHODS (4 + NUM_SHADERS * ARRAY_SIZE(cs_waves_per_sh_list)) + + void *clear_cs[NUM_SHADERS], *copy_cs[NUM_SHADERS]; + + for (unsigned i = 0; i < NUM_SHADERS; i++) { + clear_cs[i] = si_create_dma_compute_shader(ctx, cs_dwords_per_thread_list[i / 2], + i % 2, false); + copy_cs[i] = si_create_dma_compute_shader(ctx, cs_dwords_per_thread_list[i / 2], + i % 2, true); + } + + + static const char *method_str[] = { + "CP MC ", + "CP L2 ", + "CP L2 ", + "SDMA ", + }; + static const char *placement_str[] = { + /* Clear */ + "fill->VRAM", + "fill->GTT ", + /* Copy */ + "VRAM->VRAM", + "VRAM->GTT ", + "GTT ->VRAM", + }; + + printf("DMA rate is in MB/s for each size. 
Slow cases are skipped and print 0.\n"); + printf("Heap ,Method ,L2p,Wa,"); + for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) { + if (size >= 1024) + printf("%6uKB,", size / 1024); + else + printf(" %6uB,", size); + } + printf("\n"); + + /* results[log2(size)][placement][method][] */ + struct si_result { + bool is_valid; + bool is_cp; + bool is_sdma; + bool is_cs; + unsigned cache_policy; + unsigned dwords_per_thread; + unsigned waves_per_sh; + unsigned score; + } results[32][ARRAY_SIZE(placement_str)][NUM_METHODS] = {}; + + /* Run benchmarks. */ + for (unsigned placement = 0; placement < ARRAY_SIZE(placement_str); placement++) { + bool is_copy = placement >= 2; + + printf("-----------,--------,---,--,"); + for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) + printf("--------,"); + printf("\n"); + + for (unsigned method = 0; method < NUM_METHODS; method++) { + bool test_cp = method <= 2; + bool test_sdma = method == 3; + bool test_cs = method >= 4; + unsigned cs_method = method - 4; + unsigned cache_policy = test_cp ? method % 3 : + test_cs ? L2_LRU + cs_method % 2 : 0; + unsigned cs_shader = cs_method % NUM_SHADERS; + unsigned cs_dwords_per_thread = + test_cs ? cs_dwords_per_thread_list[cs_shader / 2] : 0; + unsigned cs_waves_per_sh = + test_cs ? cs_waves_per_sh_list[cs_method / NUM_SHADERS] : 0; + + if (sctx->chip_class == SI) { + /* SI doesn't support CP DMA operations through L2. */ + if (test_cp && cache_policy != L2_BYPASS) + continue; + /* WAVES_PER_SH is in multiples of 16 on SI. */ + if (test_cs && cs_waves_per_sh % 16 != 0) + continue; + } + + printf("%s ,", placement_str[placement]); + if (test_cs) { + printf("CS x%-4u,%3s,", cs_dwords_per_thread, + cache_policy == L2_LRU ? "LRU" : + cache_policy == L2_STREAM ? "Str" : ""); + } else { + printf("%s,%3s,", method_str[method], + method == L2_LRU ? "LRU" : + method == L2_STREAM ? 
"Str" : ""); + } + if (test_cs && cs_waves_per_sh) + printf("%2u,", cs_waves_per_sh); + else + printf(" ,"); + + double score = 0; + for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) { + /* Don't test bigger sizes if it's too slow. Print 0. */ + if (size >= 512*1024 && + score < 400 * (size / (4*1024*1024))) { + printf("%7.0f ,", 0.0); + continue; + } + + enum pipe_resource_usage dst_usage, src_usage; + struct pipe_resource *dst, *src; + struct pipe_query *q[NUM_RUNS]; + unsigned query_type = PIPE_QUERY_TIME_ELAPSED; + + if (test_sdma) { + if (sctx->chip_class == SI) + query_type = SI_QUERY_TIME_ELAPSED_SDMA_SI; + else + query_type = SI_QUERY_TIME_ELAPSED_SDMA; + } + + if (placement == 0 || placement == 2 || placement == 4) + dst_usage = PIPE_USAGE_DEFAULT; + else + dst_usage = PIPE_USAGE_STREAM; + + if (placement == 2 || placement == 3) + src_usage = PIPE_USAGE_DEFAULT; + else + src_usage = PIPE_USAGE_STREAM; + + dst = pipe_buffer_create(screen, 0, dst_usage, size); + src = is_copy ? pipe_buffer_create(screen, 0, src_usage, size) : NULL; + + /* Run tests. */ + for (unsigned iter = 0; iter < NUM_RUNS; iter++) { + q[iter] = ctx->create_query(ctx, query_type, 0); + ctx->begin_query(ctx, q[iter]); + + if (test_cp) { + /* CP DMA */ + if (is_copy) { + si_copy_buffer(sctx, dst, src, 0, 0, size, 0, + cache_policy); + } else { + si_cp_dma_clear_buffer(sctx, dst, 0, size, clear_value, + SI_COHERENCY_NONE, cache_policy); + } + } else if (test_sdma) { + /* SDMA */ + if (is_copy) { + struct pipe_box box; + u_box_1d(0, size, &box); + sctx->dma_copy(ctx, dst, 0, 0, 0, 0, src, 0, &box); + } else { + sctx->dma_clear_buffer(sctx, dst, 0, size, clear_value); + } + } else { + /* Compute */ + /* The memory accesses are coalesced, meaning that the 1st instruction writes + * the 1st contiguous block of data for the whole wave, the 2nd instruction + * writes the 2nd contiguous block of data, etc. 
+ */ + unsigned instructions_per_thread = MAX2(1, cs_dwords_per_thread / 4); + unsigned dwords_per_instruction = cs_dwords_per_thread / instructions_per_thread; + unsigned dwords_per_wave = cs_dwords_per_thread * 64; + + unsigned num_dwords = size / 4; + unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction); + + struct pipe_grid_info info = {}; + info.block[0] = MIN2(64, num_instructions); + info.block[1] = 1; + info.block[2] = 1; + info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_wave); + info.grid[1] = 1; + info.grid[2] = 1; + + struct pipe_shader_buffer sb[2] = {}; + sb[0].buffer = dst; + sb[0].buffer_size = size; + + if (is_copy) { + sctx->flags |= SI_CONTEXT_INV_VMEM_L1 | + SI_CONTEXT_INV_SMEM_L1; + + sb[1].buffer = src; + sb[1].buffer_size = size; + + ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 2, sb); + ctx->bind_compute_state(ctx, copy_cs[cs_shader]); + } else { + for (unsigned i = 0; i < 4; i++) + sctx->cs_user_data[i] = clear_value; + + ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, sb); + ctx->bind_compute_state(ctx, clear_cs[cs_shader]); + } + + sctx->cs_max_waves_per_sh = cs_waves_per_sh; + ctx->launch_grid(ctx, &info); + + /* Wait and flush L2. */ + sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH; + si_emit_cache_flush(sctx); + sctx->cs_max_waves_per_sh = 0; /* disable the limit */ + } + ctx->end_query(ctx, q[iter]); + ctx->flush(ctx, NULL, PIPE_FLUSH_ASYNC); + } + pipe_resource_reference(&dst, NULL); + pipe_resource_reference(&src, NULL); + + /* Get results. 
*/ + uint64_t min = ~0ull, max = 0, total = 0; + + for (unsigned iter = 0; iter < NUM_RUNS; iter++) { + union pipe_query_result result; + + ctx->get_query_result(ctx, q[iter], true, &result); + ctx->destroy_query(ctx, q[iter]); + + min = MIN2(min, result.u64); + max = MAX2(max, result.u64); + total += result.u64; + } + + score = get_MBps_rate(size, total / (double)NUM_RUNS); + printf("%7.0f ,", score); + fflush(stdout); + + struct si_result *r = &results[util_logbase2(size)][placement][method]; + r->is_valid = true; + r->is_cp = test_cp; + r->is_sdma = test_sdma; + r->is_cs = test_cs; + r->cache_policy = cache_policy; + r->dwords_per_thread = cs_dwords_per_thread; + r->waves_per_sh = cs_waves_per_sh; + r->score = score; + } + puts(""); + } + } + + puts(""); + puts("static struct si_method"); + printf("get_best_clear_for_%s(enum radeon_bo_domain dst, uint64_t size64, bool async, bool cached)\n", + sctx->screen->info.name); + puts("{"); + puts(" unsigned size = MIN2(size64, UINT_MAX);\n"); + + /* Analyze results and find the best methods. 
*/ + for (unsigned placement = 0; placement < ARRAY_SIZE(placement_str); placement++) { + if (placement == 0) + puts(" if (dst == RADEON_DOMAIN_VRAM) {"); + else if (placement == 1) + puts(" } else { /* GTT */"); + else if (placement == 2) { + puts("}"); + puts(""); + puts("static struct si_method"); + printf("get_best_copy_for_%s(enum radeon_bo_domain dst, enum radeon_bo_domain src,\n", + sctx->screen->info.name); + printf(" uint64_t size64, bool async, bool cached)\n"); + puts("{"); + puts(" unsigned size = MIN2(size64, UINT_MAX);\n"); + puts(" if (src == RADEON_DOMAIN_VRAM && dst == RADEON_DOMAIN_VRAM) {"); + } else if (placement == 3) + puts(" } else if (src == RADEON_DOMAIN_VRAM && dst == RADEON_DOMAIN_GTT) {"); + else + puts(" } else { /* GTT -> VRAM */"); + + for (unsigned mode = 0; mode < 3; mode++) { + bool async = mode == 0; + bool cached = mode == 1; + + if (async) + puts(" if (async) { /* SDMA or async compute */"); + else if (cached) + puts(" if (cached) { /* gfx ring */"); + else + puts(" } else { /* gfx ring - uncached */"); + + /* The list of best chosen methods. */ + struct si_result *methods[32]; + unsigned method_max_size[32]; + unsigned num_methods = 0; + + for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) { + /* Find the best method. */ + struct si_result *best = NULL; + + for (unsigned i = 0; i < NUM_METHODS; i++) { + struct si_result *r = &results[util_logbase2(size)][placement][i]; + + if (!r->is_valid) + continue; + + /* Ban CP DMA clears via MC on <= VI. They are super slow + * on GTT, which we can get due to BO evictions. + */ + if (sctx->chip_class <= VI && placement == 1 && + r->is_cp && r->cache_policy == L2_BYPASS) + continue; + + if (async) { + /* The following constraints for compute IBs try to limit + * resource usage so as not to decrease the performance + * of gfx IBs too much. + */ + + /* Don't use CP DMA on asynchronous rings, because + * the engine is shared with gfx IBs. 
+ */ + if (r->is_cp) + continue; + + /* Don't use L2 caching on asynchronous rings to minimize + * L2 usage. + */ + if (r->cache_policy == L2_LRU) + continue; + + /* Asynchronous compute recommends waves_per_sh != 0 + * to limit CU usage. */ + if (r->is_cs && r->waves_per_sh == 0) + continue; + } else { + /* SDMA is always asynchronous */ + if (r->is_sdma) + continue; + + if (cached && r->cache_policy == L2_BYPASS) + continue; + if (!cached && r->cache_policy == L2_LRU) + continue; + } + + if (!best) { + best = r; + continue; + } + + /* Assume some measurement error. Earlier methods occupy fewer + * resources, so the next method is always more greedy, and we + * don't want to select it due to a measurement error. + */ + double min_improvement = 1.03; + + if (best->score * min_improvement < r->score) + best = r; + } + + if (num_methods > 0) { + unsigned i = num_methods - 1; + + /* If the best one is also the best for the previous size, + * just bump the size for the previous one. + * + * If there is no best, it means all methods were too slow + * for this size and were not tested. Use the best one for + * the previous size. + */ + if (!best || + (methods[i]->is_cp == best->is_cp && + methods[i]->is_sdma == best->is_sdma && + methods[i]->is_cs == best->is_cs && + methods[i]->cache_policy == best->cache_policy && + methods[i]->dwords_per_thread == best->dwords_per_thread && + methods[i]->waves_per_sh == best->waves_per_sh)) { + method_max_size[i] = size; + continue; + } + } + + /* Add it to the list. */ + assert(num_methods < ARRAY_SIZE(methods)); + methods[num_methods] = best; + method_max_size[num_methods] = size; + num_methods++; + } + + for (unsigned i = 0; i < num_methods; i++) { + struct si_result *best = methods[i]; + unsigned size = method_max_size[i]; + + /* The size threshold is between the current benchmarked + * size and the next benchmarked size. 
*/ + if (i < num_methods - 1) + printf(" if (size <= %u) ", (size + (size << SIZE_SHIFT)) / 2); + else + printf(" "); + printf("return get("); + + assert(best); + if (best->is_cp) { + printf("CP_DMA, %s, 0, 0);\n", + best->cache_policy == L2_BYPASS ? "L2_BYPASS" : + best->cache_policy == L2_LRU ? "L2_LRU" : "L2_STREAM"); + } + if (best->is_sdma) + printf("SDMA, 0, 0, 0);\n"); + if (best->is_cs) { + printf("COMPUTE, %s, %u, %u);\n", + best->cache_policy == L2_LRU ? "L2_LRU" : "L2_STREAM", + best->dwords_per_thread, + best->waves_per_sh); + } + } + } + puts(" }"); + } + puts(" }"); + puts("}"); + + /* Cleanup. */ + for (unsigned i = 0; i < NUM_SHADERS; i++) + ctx->delete_compute_state(ctx, clear_cs[i]); + for (unsigned i = 0; i < NUM_SHADERS; i++) + ctx->delete_compute_state(ctx, copy_cs[i]); + ctx->destroy(ctx); + exit(0); +} -- 2.17.1 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev