Module: Mesa Branch: main Commit: f119f34742c41bff7d77ab69cc171a3ac68e2f97 URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=f119f34742c41bff7d77ab69cc171a3ac68e2f97
Author: Ganesh Belgur Ramachandra <[email protected]> Date: Tue Nov 21 02:55:54 2023 -0600 radeonsi: convert "gfx11_create_sh_query_result_cs" shader to nir Acked-by: Pierre-Eric Pelloux-Prayer <[email protected]> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25972> --- src/gallium/drivers/radeonsi/meson.build | 1 - src/gallium/drivers/radeonsi/si_pipe.h | 2 - src/gallium/drivers/radeonsi/si_shaderlib_nir.c | 252 +++++++++++++++++++++++ src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c | 228 -------------------- 4 files changed, 252 insertions(+), 231 deletions(-) diff --git a/src/gallium/drivers/radeonsi/meson.build b/src/gallium/drivers/radeonsi/meson.build index 29afa87c0b6..3c3903044f4 100644 --- a/src/gallium/drivers/radeonsi/meson.build +++ b/src/gallium/drivers/radeonsi/meson.build @@ -74,7 +74,6 @@ files_libradeonsi = files( 'si_shader_internal.h', 'si_shader_nir.c', 'si_shaderlib_nir.c', - 'si_shaderlib_tgsi.c', 'si_sqtt.c', 'si_state.c', 'si_state.h', diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index eabfe3a1883..af3fd4151d9 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -1674,8 +1674,6 @@ void *si_clear_render_target_shader(struct si_context *sctx, enum pipe_texture_t void *si_clear_12bytes_buffer_shader(struct si_context *sctx); void *si_create_fmask_expand_cs(struct si_context *sctx, unsigned num_samples, bool is_array); void *si_create_query_result_cs(struct si_context *sctx); - -/* si_shaderlib_tgsi.c */ void *gfx11_create_sh_query_result_cs(struct si_context *sctx); /* gfx11_query.c */ diff --git a/src/gallium/drivers/radeonsi/si_shaderlib_nir.c b/src/gallium/drivers/radeonsi/si_shaderlib_nir.c index 5d65e9caa39..1ab8a0a5a65 100644 --- a/src/gallium/drivers/radeonsi/si_shaderlib_nir.c +++ b/src/gallium/drivers/radeonsi/si_shaderlib_nir.c @@ -8,6 +8,7 @@ #define AC_SURFACE_INCLUDE_NIR #include "ac_surface.h" #include "si_pipe.h" +#include "si_query.h" #include "nir_format_convert.h" @@ -1253,3 +1254,254 @@ void *si_create_query_result_cs(struct si_context *sctx) return create_shader_state(sctx, b.shader); } + +/* Create the compute shader that is used to collect the results of gfx10+ + * shader queries. + * + * One compute grid with a single thread is launched for every query result + * buffer. The thread (optionally) reads a previous summary buffer, then + * accumulates data from the query result buffer, and writes the result either + * to a summary buffer to be consumed by the next grid invocation or to the + * user-supplied buffer. + * + * Data layout: + * + * CONST + * 0.x = config; + * [0:2] the low 3 bits indicate the mode: + * 0: sum up counts + * 1: determine result availability and write it as a boolean + * 2: SO_OVERFLOW + * 3: SO_ANY_OVERFLOW + * the remaining bits form a bitfield: + * 8: write result as a 64-bit value + * 0.y = offset in bytes to counts or stream for SO_OVERFLOW mode + * 0.z = chain bit field: + * 1: have previous summary buffer + * 2: write next summary buffer + * 0.w = result_count + */ +void *gfx11_create_sh_query_result_cs(struct si_context *sctx) +{ + const nir_shader_compiler_options *options = + sctx->b.screen->get_compiler_options(sctx->b.screen, PIPE_SHADER_IR_NIR, PIPE_SHADER_COMPUTE); + + nir_builder b = + nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options, "gfx11_create_sh_query_result_cs"); + b.shader->info.workgroup_size[0] = 1; + b.shader->info.workgroup_size[1] = 1; + b.shader->info.workgroup_size[2] = 1; + b.shader->info.num_ubos = 1; + b.shader->info.num_ssbos = 3; + b.shader->num_uniforms = 1; + + nir_def *zero = nir_imm_int(&b, 0); + nir_def *one = nir_imm_int(&b, 1); + nir_def *two = nir_imm_int(&b, 2); + nir_def *four = nir_imm_int(&b, 4); + nir_def *minus_one = nir_imm_int(&b, 0xffffffff); + + /* uint32_t acc_result = 0, acc_missing = 0; */ + nir_function_impl *e = nir_shader_get_entrypoint(b.shader); + nir_variable *acc_result = nir_local_variable_create(e, glsl_uint_type(), "acc_result"); + nir_store_var(&b, acc_result, zero, 0x1); + nir_variable *acc_missing = nir_local_variable_create(e, glsl_uint_type(), "acc_missing"); + nir_store_var(&b, acc_missing, zero, 0x1); + + /* uint32_t buff_0[4] = load_ubo(0, 0); */ + nir_def *buff_0 = nir_load_ubo(&b, 4, 32, zero, zero, .range_base = 0, .range = 16); + + /* if((chain & 1) { + * uint32_t result[2] = load_ssbo(1, 0); + * acc_result = result[0]; + * acc_missing = result[1]; + * } + */ + nir_def *is_prev_summary_buffer = nir_i2b(&b, nir_iand(&b, nir_channel(&b, buff_0, 2), one)); + nir_if *if_prev_summary_buffer = nir_push_if(&b, is_prev_summary_buffer); { + nir_def *result = nir_load_ssbo(&b, 2, 32, one, zero); + nir_store_var(&b, acc_result, nir_channel(&b, result, 0), 0x1); + nir_store_var(&b, acc_missing, nir_channel(&b, result, 1), 0x1); + } + nir_pop_if(&b, if_prev_summary_buffer); + + /* uint32_t mode = config & 0b111; + * bool is_overflow = mode >= 2; + */ + nir_def *mode = nir_iand_imm(&b, nir_channel(&b, buff_0, 0), 0b111); + nir_def *is_overflow = nir_uge(&b, mode, two); + + /* uint32_t result_remaining = (is_overflow && acc_result) ? 0 : result_count; */ + nir_variable *result_remaining = nir_local_variable_create(e, glsl_uint_type(), "result_remaining"); + nir_variable *base_offset = nir_local_variable_create(e, glsl_uint_type(), "base_offset"); + nir_def *state = nir_iand(&b, + nir_isub(&b, zero, nir_b2i32(&b, is_overflow)), + nir_load_var(&b, acc_result)); + nir_def *value = nir_bcsel(&b, nir_i2b(&b, state), zero, nir_channel(&b, buff_0, 3)); + nir_store_var(&b, result_remaining, value, 0x1); + + /* uint32_t base_offset = 0; */ + nir_store_var(&b, base_offset, zero, 0x1); + + /* Outer loop begin. + * while (!result_remaining) { + * ... + */ + nir_loop *loop_outer = nir_push_loop(&b); { + nir_def *condition = nir_load_var(&b, result_remaining); + nir_if *if_not_condition = nir_push_if(&b, nir_ieq(&b, condition, zero)); { + nir_jump(&b, nir_jump_break); + } + nir_pop_if(&b, if_not_condition); + + /* result_remaining--; */ + condition = nir_iadd(&b, condition, minus_one); + nir_store_var(&b, result_remaining, condition, 0x1); + + /* uint32_t fence = load_ssbo(0, base_offset + sizeof(gfx11_sh_query_buffer_mem.stream)); */ + nir_def *b_offset = nir_load_var(&b, base_offset); + uint64_t buffer_mem_stream_size = sizeof(((struct gfx11_sh_query_buffer_mem*)0)->stream); + nir_def *fence = nir_load_ssbo(&b, 1, 32, zero, + nir_iadd_imm(&b, b_offset, buffer_mem_stream_size)); + + /* if (!fence) { + * acc_missing = ~0u; + * break; + * } + */ + nir_def *is_zero = nir_ieq(&b, fence, zero); + nir_def *y_value = nir_isub(&b, zero, nir_b2i32(&b, is_zero)); + nir_store_var(&b, acc_missing, y_value, 0x1); + nir_if *if_ssbo_zero = nir_push_if(&b, is_zero); { + nir_jump(&b, nir_jump_break); + } + nir_pop_if(&b, if_ssbo_zero); + + /* stream_offset = base_offset + offset; */ + nir_def *s_offset = nir_iadd(&b, b_offset, nir_channel(&b, buff_0, 1)); + + /* if (!(config & 7)) { + * acc_result += buffer[0]@stream_offset; + * } + */ + nir_if *if_sum_up_counts = nir_push_if(&b, nir_ieq(&b, mode, zero)); { + nir_def *x_value = nir_load_ssbo(&b, 1, 32, zero, s_offset); + x_value = nir_iadd(&b, nir_load_var(&b, acc_result), x_value); + nir_store_var(&b, acc_result, x_value, 0x1); + } + nir_pop_if(&b, if_sum_up_counts); + + /* if (is_overflow) { + * uint32_t count = (config & 1) ? 4 : 1; + * ... + */ + nir_if *if_overflow = nir_push_if(&b, is_overflow); { + nir_def *is_result_available = nir_i2b(&b, nir_iand(&b, mode, one)); + nir_def *initial_count = nir_bcsel(&b, is_result_available, four, one); + + nir_variable *count = + nir_local_variable_create(e, glsl_uint_type(), "count"); + nir_store_var(&b, count, initial_count, 0x1); + + nir_variable *stream_offset = + nir_local_variable_create(e, glsl_uint_type(), "stream_offset"); + nir_store_var(&b, stream_offset, s_offset, 0x1); + + /* Inner loop begin. + * do { + * ... + */ + nir_loop *loop_inner = nir_push_loop(&b); { + /* uint32_t buffer[4] = load_ssbo(0, stream_offset + 2 * sizeof(uint64_t)); */ + nir_def *stream_offset_value = nir_load_var(&b, stream_offset); + nir_def *buffer = + nir_load_ssbo(&b, 4, 32, zero, + nir_iadd_imm(&b, stream_offset_value, 2 * sizeof(uint64_t))); + + /* if (generated != emitted) { + * acc_result = 1; + * base_offset = 0; + * break; + * } + */ + nir_def *generated = nir_channel(&b, buffer, 0); + nir_def *emitted = nir_channel(&b, buffer, 2); + nir_if *if_not_equal = nir_push_if(&b, nir_ine(&b, generated, emitted)); { + nir_store_var(&b, acc_result, one, 0x1); + nir_store_var(&b, base_offset, zero, 0x1); + nir_jump(&b, nir_jump_break); + } + nir_pop_if(&b, if_not_equal); + + /* stream_offset += sizeof(gfx11_sh_query_buffer_mem.stream[0]); */ + uint64_t buffer_mem_stream0_size = + sizeof(((struct gfx11_sh_query_buffer_mem*)0)->stream[0]); + stream_offset_value = nir_iadd_imm(&b, stream_offset_value, buffer_mem_stream0_size); + nir_store_var(&b, stream_offset, stream_offset_value, 0x1); + + /* } while(count--); */ + nir_def *loop_count = nir_load_var(&b, count); + loop_count = nir_iadd(&b, loop_count, minus_one); + nir_store_var(&b, count, loop_count, 0x1); + + nir_if *if_zero = nir_push_if(&b, nir_ieq(&b, loop_count, zero)); { + nir_jump(&b, nir_jump_break); + } + nir_pop_if(&b, if_zero); + } + nir_pop_loop(&b, loop_inner); /* Inner loop end */ + } + nir_pop_if(&b, if_overflow); + + /* base_offset += sizeof(gfx11_sh_query_buffer_mem); */ + nir_def *buffer_mem_size = nir_imm_int(&b, sizeof(struct gfx11_sh_query_buffer_mem)); + nir_store_var(&b, base_offset, nir_iadd(&b, nir_load_var(&b, base_offset), buffer_mem_size), 0x1); + } + nir_pop_loop(&b, loop_outer); /* Outer loop end */ + + nir_def *acc_result_value = nir_load_var(&b, acc_result); + nir_def *y_value = nir_load_var(&b, acc_missing); + + /* if ((chain & 2)) { + * store_ssbo(<acc_result, acc_missing>, 2, 0); + * ... + */ + nir_def *is_write_summary_buffer = nir_i2b(&b, nir_iand(&b, nir_channel(&b, buff_0, 2), two)); + nir_if *if_write_summary_buffer = nir_push_if(&b, is_write_summary_buffer); { + nir_store_ssbo(&b, nir_vec2(&b, acc_result_value, y_value), two, zero); + } nir_push_else(&b, if_write_summary_buffer); { + + /* } else { + * if ((config & 7) == 1) { + * acc_result = acc_missing ? 0 : 1; + * acc_missing = 0; + * } + * ... + */ + nir_def *is_result_available = nir_ieq(&b, mode, one); + nir_def *is_zero = nir_ieq(&b, y_value, zero); + acc_result_value = nir_bcsel(&b, is_result_available, nir_b2i32(&b, is_zero), acc_result_value); + nir_def *ny = nir_bcsel(&b, is_result_available, zero, y_value); + + /* if (!acc_missing) { + * store_ssbo(acc_result, 2, 0); + * if (config & 8)) { + * store_ssbo(0, 2, 4) + * } + * } + */ + nir_if *if_zero = nir_push_if(&b, nir_ieq(&b, ny, zero)); { + nir_store_ssbo(&b, acc_result_value, two, zero); + + nir_def *is_so_any_overflow = nir_i2b(&b, nir_iand_imm(&b, nir_channel(&b, buff_0, 0), 8)); + nir_if *if_so_any_overflow = nir_push_if(&b, is_so_any_overflow); { + nir_store_ssbo(&b, zero, two, four); + } + nir_pop_if(&b, if_so_any_overflow); + } + nir_pop_if(&b, if_zero); + } + nir_pop_if(&b, if_write_summary_buffer); + + return create_shader_state(sctx, b.shader); +} diff --git a/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c b/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c deleted file mode 100644 index 2eab564ca26..00000000000 --- a/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c +++ /dev/null @@ -1,228 +0,0 @@ -/* - * Copyright 2018 Advanced Micro Devices, Inc. - * - * SPDX-License-Identifier: MIT - */ - -#include "si_pipe.h" -#include "tgsi/tgsi_text.h" -#include "tgsi/tgsi_ureg.h" - -/* Create the compute shader that is used to collect the results of gfx10+ - * shader queries. - * - * One compute grid with a single thread is launched for every query result - * buffer. The thread (optionally) reads a previous summary buffer, then - * accumulates data from the query result buffer, and writes the result either - * to a summary buffer to be consumed by the next grid invocation or to the - * user-supplied buffer. - * - * Data layout: - * - * BUFFER[0] = query result buffer (layout is defined by gfx10_sh_query_buffer_mem) - * BUFFER[1] = previous summary buffer - * BUFFER[2] = next summary buffer or user-supplied buffer - * - * CONST - * 0.x = config; the low 3 bits indicate the mode: - * 0: sum up counts - * 1: determine result availability and write it as a boolean - * 2: SO_OVERFLOW - * 3: SO_ANY_OVERFLOW - * the remaining bits form a bitfield: - * 8: write result as a 64-bit value - * 0.y = offset in bytes to counts or stream for SO_OVERFLOW mode - * 0.z = chain bit field: - * 1: have previous summary buffer - * 2: write next summary buffer - * 0.w = result_count - */ -void *gfx11_create_sh_query_result_cs(struct si_context *sctx) -{ - /* TEMP[0].x = accumulated result so far - * TEMP[0].y = result missing - * TEMP[0].z = whether we're in overflow mode - */ - static const char text_tmpl[] = - "COMP\n" - "PROPERTY CS_FIXED_BLOCK_WIDTH 1\n" - "PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n" - "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n" - "DCL BUFFER[0]\n" - "DCL BUFFER[1]\n" - "DCL BUFFER[2]\n" - "DCL CONST[0][0..0]\n" - "DCL TEMP[0..5]\n" - "IMM[0] UINT32 {0, 7, 256, 4294967295}\n" - "IMM[1] UINT32 {1, 2, 4, 8}\n" - "IMM[2] UINT32 {16, 32, 64, 128}\n" - - /* acc_result = 0; - * acc_missing = 0; - */ - "MOV TEMP[0].xy, IMM[0].xxxx\n" - - /* if (chain & 1) { - * acc_result = buffer[1][0]; - * acc_missing = buffer[1][1]; - * } - */ - "AND TEMP[5], CONST[0][0].zzzz, IMM[1].xxxx\n" - "UIF TEMP[5]\n" - "LOAD TEMP[0].xy, BUFFER[1], IMM[0].xxxx\n" - "ENDIF\n" - - /* is_overflow (TEMP[0].z) = (config & 7) >= 2; */ - "AND TEMP[5].x, CONST[0][0].xxxx, IMM[0].yyyy\n" - "USGE TEMP[0].z, TEMP[5].xxxx, IMM[1].yyyy\n" - - /* result_remaining (TEMP[1].x) = (is_overflow && acc_result) ? 0 : result_count; */ - "AND TEMP[5].x, TEMP[0].zzzz, TEMP[0].xxxx\n" - "UCMP TEMP[1].x, TEMP[5].xxxx, IMM[0].xxxx, CONST[0][0].wwww\n" - - /* base_offset (TEMP[1].y) = 0; */ - "MOV TEMP[1].y, IMM[0].xxxx\n" - - /* for (;;) { - * if (!result_remaining) { - * break; - * } - * result_remaining--; - */ - "BGNLOOP\n" - " USEQ TEMP[5], TEMP[1].xxxx, IMM[0].xxxx\n" - " UIF TEMP[5]\n" - " BRK\n" - " ENDIF\n" - " UADD TEMP[1].x, TEMP[1].xxxx, IMM[0].wwww\n" - - /* fence = buffer[0]@(base_offset + sizeof(gfx10_sh_query_buffer_mem.stream)); */ - " UADD TEMP[5].x, TEMP[1].yyyy, IMM[2].wwww\n" - " LOAD TEMP[5].x, BUFFER[0], TEMP[5].xxxx\n" - - /* if (!fence) { - * acc_missing = ~0u; - * break; - * } - */ - " USEQ TEMP[5], TEMP[5].xxxx, IMM[0].xxxx\n" - " UIF TEMP[5]\n" - " MOV TEMP[0].y, TEMP[5].xxxx\n" - " BRK\n" - " ENDIF\n" - - /* stream_offset (TEMP[2].x) = base_offset + offset; */ - " UADD TEMP[2].x, TEMP[1].yyyy, CONST[0][0].yyyy\n" - - /* if (!(config & 7)) { - * acc_result += buffer[0]@stream_offset; - * } - */ - " AND TEMP[5].x, CONST[0][0].xxxx, IMM[0].yyyy\n" - " USEQ TEMP[5], TEMP[5].xxxx, IMM[0].xxxx\n" - " UIF TEMP[5]\n" - " LOAD TEMP[5].x, BUFFER[0], TEMP[2].xxxx\n" - " UADD TEMP[0].x, TEMP[0].xxxx, TEMP[5].xxxx\n" - " ENDIF\n" - - /* if ((config & 7) >= 2) { - * count (TEMP[2].y) = (config & 1) ? 4 : 1; - */ - " AND TEMP[5].x, CONST[0][0].xxxx, IMM[0].yyyy\n" - " USGE TEMP[5], TEMP[5].xxxx, IMM[1].yyyy\n" - " UIF TEMP[5]\n" - " AND TEMP[5].x, CONST[0][0].xxxx, IMM[1].xxxx\n" - " UCMP TEMP[2].y, TEMP[5].xxxx, IMM[1].zzzz, IMM[1].xxxx\n" - - /* do { - * generated = buffer[0]@(stream_offset + 2 * sizeof(uint64_t)); - * emitted = buffer[0]@(stream_offset + 3 * sizeof(uint64_t)); - * if (generated != emitted) { - * acc_result = 1; - * result_remaining = 0; - * break; - * } - * - * stream_offset += sizeof(gfx10_sh_query_buffer_mem.stream[0]); - * } while (--count); - * } - */ - " BGNLOOP\n" - " UADD TEMP[5].x, TEMP[2].xxxx, IMM[2].xxxx\n" - " LOAD TEMP[4].xyzw, BUFFER[0], TEMP[5].xxxx\n" - " USNE TEMP[5], TEMP[4].xyxy, TEMP[4].zwzw\n" - " UIF TEMP[5]\n" - " MOV TEMP[0].x, IMM[1].xxxx\n" - " MOV TEMP[1].y, IMM[0].xxxx\n" - " BRK\n" - " ENDIF\n" - - " UADD TEMP[2].y, TEMP[2].yyyy, IMM[0].wwww\n" - " USEQ TEMP[5], TEMP[2].yyyy, IMM[0].xxxx\n" - " UIF TEMP[5]\n" - " BRK\n" - " ENDIF\n" - " UADD TEMP[2].x, TEMP[2].xxxx, IMM[2].yyyy\n" - " ENDLOOP\n" - " ENDIF\n" - - /* base_offset += sizeof(gfx10_sh_query_buffer_mem); - * } // end outer loop - */ - " UADD TEMP[1].y, TEMP[1].yyyy, IMM[0].zzzz\n" - "ENDLOOP\n" - - /* if (chain & 2) { - * buffer[2][0] = acc_result; - * buffer[2][1] = acc_missing; - * } else { - */ - "AND TEMP[5], CONST[0][0].zzzz, IMM[1].yyyy\n" - "UIF TEMP[5]\n" - " STORE BUFFER[2].xy, IMM[0].xxxx, TEMP[0]\n" - "ELSE\n" - - /* if ((config & 7) == 1) { - * acc_result = acc_missing ? 0 : 1; - * acc_missing = 0; - * } - */ - " AND TEMP[5], CONST[0][0].xxxx, IMM[0].yyyy\n" - " USEQ TEMP[5], TEMP[5].xxxx, IMM[1].xxxx\n" - " UIF TEMP[5]\n" - " UCMP TEMP[0].x, TEMP[0].yyyy, IMM[0].xxxx, IMM[1].xxxx\n" - " MOV TEMP[0].y, IMM[0].xxxx\n" - " ENDIF\n" - - /* if (!acc_missing) { - * buffer[2][0] = acc_result; - * if (config & 8) { - * buffer[2][1] = 0; - * } - * } - * } - */ - " USEQ TEMP[5], TEMP[0].yyyy, IMM[0].xxxx\n" - " UIF TEMP[5]\n" - " STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].xxxx\n" - " AND TEMP[5], CONST[0][0].xxxx, IMM[1].wwww\n" - " UIF TEMP[5]\n" - " STORE BUFFER[2].x, IMM[1].zzzz, TEMP[0].yyyy\n" - " ENDIF\n" - " ENDIF\n" - "ENDIF\n" - "END\n"; - - struct tgsi_token tokens[1024]; - struct pipe_compute_state state = {}; - - if (!tgsi_text_translate(text_tmpl, tokens, ARRAY_SIZE(tokens))) { - assert(false); - return NULL; - } - - state.ir_type = PIPE_SHADER_IR_TGSI; - state.prog = tokens; - - return sctx->b.create_compute_state(&sctx->b, &state); -}
