Module: Mesa Branch: main Commit: c109c3f95c225fd01589644b807ad321c6a05b69 URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=c109c3f95c225fd01589644b807ad321c6a05b69
Author: Ganesh Belgur Ramachandra <[email protected]> Date: Fri Sep 29 13:00:49 2023 -0500 radeonsi: convert "create_query_result_cs" shader to nir Acked-by: Pierre-Eric Pelloux-Prayer <[email protected]> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25972> --- src/gallium/drivers/radeonsi/si_pipe.h | 2 +- src/gallium/drivers/radeonsi/si_shaderlib_nir.c | 380 +++++++++++++++++++++++ src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c | 215 ------------- 3 files changed, 381 insertions(+), 216 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 237b234978a..eabfe3a1883 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -1673,9 +1673,9 @@ void *si_create_clear_buffer_rmw_cs(struct si_context *sctx); void *si_clear_render_target_shader(struct si_context *sctx, enum pipe_texture_target type); void *si_clear_12bytes_buffer_shader(struct si_context *sctx); void *si_create_fmask_expand_cs(struct si_context *sctx, unsigned num_samples, bool is_array); +void *si_create_query_result_cs(struct si_context *sctx); /* si_shaderlib_tgsi.c */ -void *si_create_query_result_cs(struct si_context *sctx); void *gfx11_create_sh_query_result_cs(struct si_context *sctx); /* gfx11_query.c */ diff --git a/src/gallium/drivers/radeonsi/si_shaderlib_nir.c b/src/gallium/drivers/radeonsi/si_shaderlib_nir.c index 85e08c61bd2..5d65e9caa39 100644 --- a/src/gallium/drivers/radeonsi/si_shaderlib_nir.c +++ b/src/gallium/drivers/radeonsi/si_shaderlib_nir.c @@ -873,3 +873,383 @@ void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type, *vs = create_shader_state(sctx, b.shader); return *vs; } + +/* Create the compute shader that is used to collect the results. + * + * One compute grid with a single thread is launched for every query result + * buffer. The thread (optionally) reads a previous summary buffer, then + * accumulates data from the query result buffer, and writes the result either + * to a summary buffer to be consumed by the next grid invocation or to the + * user-supplied buffer. + * + * Data layout: + * + * CONST + * 0.x = end_offset + * 0.y = result_stride + * 0.z = result_count + * 0.w = bit field: + * 1: read previously accumulated values + * 2: write accumulated values for chaining + * 4: write result available + * 8: convert result to boolean (0/1) + * 16: only read one dword and use that as result + * 32: apply timestamp conversion + * 64: store full 64 bits result + * 128: store signed 32 bits result + * 256: SO_OVERFLOW mode: take the difference of two successive half-pairs + * 1.x = fence_offset + * 1.y = pair_stride + * 1.z = pair_count + * + */ +void *si_create_query_result_cs(struct si_context *sctx) +{ + const nir_shader_compiler_options *options = + sctx->b.screen->get_compiler_options(sctx->b.screen, PIPE_SHADER_IR_NIR, PIPE_SHADER_COMPUTE); + + nir_builder b = + nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options, "create_query_result_cs"); + b.shader->info.workgroup_size[0] = 1; + b.shader->info.workgroup_size[1] = 1; + b.shader->info.workgroup_size[2] = 1; + b.shader->info.num_ubos = 1; + b.shader->info.num_ssbos = 3; + b.shader->num_uniforms = 2; + + nir_def *var_undef = nir_undef(&b, 1, 32); + nir_def *zero = nir_imm_int(&b, 0); + nir_def *one = nir_imm_int(&b, 1); + nir_def *two = nir_imm_int(&b, 2); + nir_def *four = nir_imm_int(&b, 4); + nir_def *eight = nir_imm_int(&b, 8); + nir_def *sixteen = nir_imm_int(&b, 16); + nir_def *thirty_one = nir_imm_int(&b, 31); + nir_def *sixty_four = nir_imm_int(&b, 64); + + /* uint32_t x, y, z = 0; */ + nir_function_impl *e = nir_shader_get_entrypoint(b.shader); + nir_variable *x = nir_local_variable_create(e, glsl_uint_type(), "x"); + nir_store_var(&b, x, var_undef, 0x1); + nir_variable *y = nir_local_variable_create(e, glsl_uint_type(), "y"); + nir_store_var(&b, y, var_undef, 0x1); + nir_variable *z = nir_local_variable_create(e, glsl_uint_type(), "z"); + nir_store_var(&b, z, zero, 0x1); + + /* uint32_t buff_0[4] = load_ubo(0, 0); */ + nir_def *buff_0 = nir_load_ubo(&b, 4, 32, zero, zero, .range_base = 0, .range = 16); + /* uint32_t buff_1[4] = load_ubo(1, 16); */ + nir_def *buff_1 = nir_load_ubo(&b, 4, 32, zero, sixteen, .range_base = 16, .range = 16); + + /* uint32_t b0_bitfield = buff_0.w; */ + nir_def *b0_bitfield = nir_channel(&b, buff_0, 3); + + /* Check result availability. + * if (b0_bitfield & (1u << 4)) { + * ... + */ + nir_def *is_one_dword_result = nir_i2b(&b, nir_iand(&b, b0_bitfield, sixteen)); + nir_if *if_one_dword_result = nir_push_if(&b, is_one_dword_result); { + + /* int32_t value = load_ssbo(0, fence_offset); + * z = ~(value >> 31); + */ + nir_def *value = nir_load_ssbo(&b, 1, 32, zero, nir_channel(&b, buff_1, 0)); + nir_def *bitmask = nir_inot(&b, nir_ishr(&b, value, thirty_one)); + nir_store_var(&b, z, bitmask, 0x1); + + /* Load result if available. + * if (value < 0) { + * uint32_t result[2] = load_ssbo(0, 0); + * x = result[0]; + * y = result[1]; + * } + */ + nir_if *if_negative = nir_push_if(&b, nir_ilt(&b, value, zero)); { + nir_def *result = nir_load_ssbo(&b, 2, 32, zero, zero); + nir_store_var(&b, x, nir_channel(&b, result, 0), 0x1); + nir_store_var(&b, y, nir_channel(&b, result, 1), 0x1); + } + nir_pop_if(&b, if_negative); + } nir_push_else(&b, if_one_dword_result); { + + /* } else { + * x = 0; y = 0; + */ + nir_store_var(&b, x, zero, 0x1); + nir_store_var(&b, y, zero, 0x1); + + /* Load previously accumulated result if requested. + * if (b0_bitfield & (1u << 0)) { + * uint32_t result[3] = load_ssbo(1, 0); + * x = result[0]; + * y = result[1]; + * z = result[2]; + * } + */ + nir_def *is_prev_acc_result = nir_i2b(&b, nir_iand(&b, b0_bitfield, one)); + nir_if *if_prev_acc_result = nir_push_if(&b, is_prev_acc_result); { + nir_def *result = nir_load_ssbo(&b, 3, 32, one, zero); + nir_store_var(&b, x, nir_channel(&b, result, 0), 0x1); + nir_store_var(&b, y, nir_channel(&b, result, 1), 0x1); + nir_store_var(&b, z, nir_channel(&b, result, 2), 0x1); + } + nir_pop_if(&b, if_prev_acc_result); + + /* if (!z) { + * uint32_t result_index = 0; + * uint32_t pitch = 0; + * ... + */ + nir_def *z_value = nir_load_var(&b, z); + nir_if *if_not_z = nir_push_if(&b, nir_ieq(&b, z_value, zero)); { + nir_variable *outer_loop_iter = + nir_local_variable_create(e, glsl_uint_type(), "outer_loop_iter"); + nir_store_var(&b, outer_loop_iter, zero, 0x1); + nir_variable *pitch = nir_local_variable_create(e, glsl_uint_type(), "pitch"); + nir_store_var(&b, pitch, zero, 0x1); + + /* Outer loop. + * while (result_index <= result_count) { + * ... + */ + nir_loop *loop_outer = nir_push_loop(&b); { + nir_def *result_index = nir_load_var(&b, outer_loop_iter); + nir_def *is_result_index_out_of_bound = + nir_uge(&b, result_index, nir_channel(&b, buff_0, 2)); + nir_if *if_out_of_bound = nir_push_if(&b, is_result_index_out_of_bound); { + nir_jump(&b, nir_jump_break); + } + nir_pop_if(&b, if_out_of_bound); + + /* Load fence and check result availability. + * pitch = i * result_stride; + * uint32_t address = fence_offset + pitch; + * int32_t value = load_ssbo(0, address); + * z = ~(value >> 31); + */ + nir_def *pitch_outer_loop = nir_imul(&b, result_index, nir_channel(&b, buff_0, 1)); + nir_store_var(&b, pitch, pitch_outer_loop, 0x1); + nir_def *address = nir_iadd(&b, pitch_outer_loop, nir_channel(&b, buff_1, 0)); + nir_def *value = nir_load_ssbo(&b, 1, 32, zero, address); + nir_def *bitmask = nir_inot(&b, nir_ishr(&b, value, thirty_one)); + nir_store_var(&b, z, bitmask, 0x1); + + /* if (z) { + * break; + * } + */ + nir_if *if_result_available = nir_push_if(&b, nir_i2b(&b, bitmask)); { + nir_jump(&b, nir_jump_break); + } + nir_pop_if(&b, if_result_available); + + /* Inner loop iterator. + * uint32_t i = 0; + */ + nir_variable *inner_loop_iter = + nir_local_variable_create(e, glsl_uint_type(), "inner_loop_iter"); + nir_store_var(&b, inner_loop_iter, zero, 0x1); + + /* Inner loop. + * do { + * ... + */ + nir_loop *loop_inner = nir_push_loop(&b); { + nir_def *pitch_inner_loop = nir_load_var(&b, pitch); + nir_def *i = nir_load_var(&b, inner_loop_iter); + + /* Load start and end. + * uint64_t first = load_ssbo(0, pitch); + * uint64_t second = load_ssbo(0, pitch + end_offset); + * uint64_t start_half_pair = second - first; + */ + nir_def *first = nir_load_ssbo(&b, 1, 64, zero, pitch_inner_loop); + nir_def *new_pitch = nir_iadd(&b, pitch_inner_loop, nir_channel(&b, buff_0, 0)); + nir_def *second = nir_load_ssbo(&b, 1, 64, zero, new_pitch); + nir_def *start_half_pair = nir_isub(&b, second, first); + + /* Load second start/end half-pair and take the difference. + * if (b0_bitfield & (1u << 8)) { + * uint64_t first = load_ssbo(0, pitch + 8); + * uint64_t second = load_ssbo(0, pitch + end_offset + 8); + * uint64_t end_half_pair = second - first; + * uint64_t difference = start_half_pair - end_half_pair; + * } + */ + nir_def *difference; + nir_def *is_so_overflow_mode = nir_i2b(&b, nir_iand_imm(&b, b0_bitfield, 256)); + nir_if *if_so_overflow_mode = nir_push_if(&b, is_so_overflow_mode); { + first = nir_load_ssbo(&b, 1, 64, zero, nir_iadd(&b, pitch_inner_loop, eight)); + second = nir_load_ssbo(&b, 1, 64, zero, nir_iadd(&b, new_pitch, eight)); + nir_def *end_half_pair = nir_isub(&b, second, first); + difference = nir_isub(&b, start_half_pair, end_half_pair); + } + nir_pop_if(&b, if_so_overflow_mode); + + /* uint64_t sum = (x | (uint64_t) y << 32) + difference; */ + nir_def *sum = nir_iadd(&b, + nir_pack_64_2x32_split(&b, + nir_load_var(&b, x), + nir_load_var(&b, y)), + nir_if_phi(&b, difference, start_half_pair)); + sum = nir_unpack_64_2x32(&b, sum); + + /* Increment inner loop iterator. + * i++; + */ + i = nir_iadd(&b, i, one); + nir_store_var(&b, inner_loop_iter, i, 0x1); + + /* Update pitch value. + * pitch = i * pair_stride + pitch; + */ + nir_def *incremented_pitch = nir_iadd(&b, + nir_imul(&b, i, nir_channel(&b, buff_1, 1)), + pitch_outer_loop); + nir_store_var(&b, pitch, incremented_pitch, 0x1); + + /* Update x and y. + * x = sum.x; + * y = sum.x >> 32; + */ + nir_store_var(&b, x, nir_channel(&b, sum, 0), 0x1); + nir_store_var(&b, y, nir_channel(&b, sum, 1), 0x1); + + /* } while (i < pair_count); + */ + nir_def *is_pair_count_exceeded = nir_uge(&b, i, nir_channel(&b, buff_1, 2)); + nir_if *if_pair_count_exceeded = nir_push_if(&b, is_pair_count_exceeded); { + nir_jump(&b, nir_jump_break); + } + nir_pop_if(&b, if_pair_count_exceeded); + } + nir_pop_loop(&b, loop_inner); + + /* Increment pair iterator. + * result_index++; + */ + nir_store_var(&b, outer_loop_iter, nir_iadd(&b, result_index, one), 0x1); + } + nir_pop_loop(&b, loop_outer); + } + nir_pop_if(&b, if_not_z); + } + nir_pop_if(&b, if_one_dword_result); + + nir_def *x_value = nir_load_var(&b, x); + nir_def *y_value = nir_load_var(&b, y); + nir_def *z_value = nir_load_var(&b, z); + + /* Store accumulated data for chaining. + * if (b0_bitfield & (1u << 1)) { + * store_ssbo(<x, y, z>, 2, 0); + */ + nir_def *is_acc_chaining = nir_i2b(&b, nir_iand(&b, b0_bitfield, two)); + nir_if *if_acc_chaining = nir_push_if(&b, is_acc_chaining); { + nir_store_ssbo(&b, nir_vec3(&b, x_value, y_value, z_value), two, zero); + } nir_push_else(&b, if_acc_chaining); { + + /* Store result availability. + * } else { + * if (b0_bitfield & (1u << 2)) { + * store_ssbo((~z & 1), 2, 0); + * ... + */ + nir_def *is_result_available = nir_i2b(&b, nir_iand(&b, b0_bitfield, four)); + nir_if *if_result_available = nir_push_if(&b, is_result_available); { + nir_store_ssbo(&b, nir_iand(&b, nir_inot(&b, z_value), one), two, zero); + + /* Store full 64 bits result. + * if (b0_bitfield & (1u << 6)) { + * store_ssbo(<0, 0>, 2, 0); + * } + */ + nir_def *is_result_64_bits = nir_i2b(&b, nir_iand(&b, b0_bitfield, sixty_four)); + nir_if *if_result_64_bits = nir_push_if(&b, is_result_64_bits); { + nir_store_ssbo(&b, nir_imm_ivec2(&b, 0, 0), two, zero, + .write_mask = (1u << 1)); + } + nir_pop_if(&b, if_result_64_bits); + } nir_push_else(&b, if_result_available); { + + /* } else { + * if (~z) { + * ... + */ + nir_def *is_bitwise_not_z = nir_i2b(&b, nir_inot(&b, z_value)); + nir_if *if_bitwise_not_z = nir_push_if(&b, is_bitwise_not_z); { + nir_def *ts_x, *ts_y; + + /* Apply timestamp conversion. + * if (b0_bitfield & (1u << 5)) { + * uint64_t xy_million = (x | (uint64_t) y << 32) * (uint64_t) 1000000; + * uint64_t ts_converted = xy_million / (uint64_t) clock_crystal_frequency; + * x = ts_converted.x; + * y = ts_converted.x >> 32; + * } + */ + nir_def *is_apply_timestamp = nir_i2b(&b, nir_iand_imm(&b, b0_bitfield, 32)); + nir_if *if_apply_timestamp = nir_push_if(&b, is_apply_timestamp); { + /* Add the frequency into the shader for timestamp conversion + * so that the backend can use the full range of optimizations + * for divide-by-constant. + */ + nir_def *clock_crystal_frequency = + nir_imm_int64(&b, sctx->screen->info.clock_crystal_freq); + + nir_def *xy_million = nir_imul(&b, + nir_pack_64_2x32_split(&b, x_value, y_value), + nir_imm_int64(&b, 1000000)); + nir_def *ts_converted = nir_udiv(&b, xy_million, clock_crystal_frequency); + ts_converted = nir_unpack_64_2x32(&b, ts_converted); + ts_x = nir_channel(&b, ts_converted, 0); + ts_y = nir_channel(&b, ts_converted, 1); + } + nir_pop_if(&b, if_apply_timestamp); + + nir_def *nx = nir_if_phi(&b, ts_x, x_value); + nir_def *ny = nir_if_phi(&b, ts_y, y_value); + + /* x = b0_bitfield & (1u << 3) ? ((x | (uint64_t) y << 32) != 0) : x; + * y = b0_bitfield & (1u << 3) ? 0 : y; + */ + nir_def *is_convert_to_bool = nir_i2b(&b, nir_iand(&b, b0_bitfield, eight)); + nir_def *xy = nir_pack_64_2x32_split(&b, nx, ny); + nir_def *is_xy = nir_b2i32(&b, nir_ine(&b, xy, nir_imm_int64(&b, 0))); + nx = nir_bcsel(&b, is_convert_to_bool, is_xy, nx); + ny = nir_bcsel(&b, is_convert_to_bool, zero, ny); + + /* if (b0_bitfield & (1u << 6)) { + * store_ssbo(<x, y>, 2, 0); + * } + */ + nir_def *is_result_64_bits = nir_i2b(&b, nir_iand(&b, b0_bitfield, sixty_four)); + nir_if *if_result_64_bits = nir_push_if(&b, is_result_64_bits); { + nir_store_ssbo(&b, nir_vec2(&b, nx, ny), two, zero); + } nir_push_else(&b, if_result_64_bits); { + + /* Clamping. + * } else { + * x = y ? UINT32_MAX : x; + * x = b0_bitfield & (1u << 7) ? min(x, INT_MAX) : x; + * store_ssbo(x, 2, 0); + * } + */ + nir_def *is_y = nir_ine(&b, ny, zero); + nx = nir_bcsel(&b, is_y, nir_imm_int(&b, UINT32_MAX), nx); + nir_def *is_signed_32bit_result = nir_i2b(&b, nir_iand_imm(&b, b0_bitfield, 128)); + nir_def *min = nir_umin(&b, nx, nir_imm_int(&b, INT_MAX)); + nx = nir_bcsel(&b, is_signed_32bit_result, min, nx); + nir_store_ssbo(&b, nx, two, zero); + } + nir_pop_if(&b, if_result_64_bits); + } + nir_pop_if(&b, if_bitwise_not_z); + } + nir_pop_if(&b, if_result_available); + } + nir_pop_if(&b, if_acc_chaining); + + return create_shader_state(sctx, b.shader); +} diff --git a/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c b/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c index 13acc4eefc7..2eab564ca26 100644 --- a/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c +++ b/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c @@ -8,221 +8,6 @@ #include "tgsi/tgsi_text.h" #include "tgsi/tgsi_ureg.h" -/* Create the compute shader that is used to collect the results. - * - * One compute grid with a single thread is launched for every query result - * buffer. The thread (optionally) reads a previous summary buffer, then - * accumulates data from the query result buffer, and writes the result either - * to a summary buffer to be consumed by the next grid invocation or to the - * user-supplied buffer. - * - * Data layout: - * - * CONST - * 0.x = end_offset - * 0.y = result_stride - * 0.z = result_count - * 0.w = bit field: - * 1: read previously accumulated values - * 2: write accumulated values for chaining - * 4: write result available - * 8: convert result to boolean (0/1) - * 16: only read one dword and use that as result - * 32: apply timestamp conversion - * 64: store full 64 bits result - * 128: store signed 32 bits result - * 256: SO_OVERFLOW mode: take the difference of two successive half-pairs - * 1.x = fence_offset - * 1.y = pair_stride - * 1.z = pair_count - * - * BUFFER[0] = query result buffer - * BUFFER[1] = previous summary buffer - * BUFFER[2] = next summary buffer or user-supplied buffer - */ -void *si_create_query_result_cs(struct si_context *sctx) -{ - /* TEMP[0].xy = accumulated result so far - * TEMP[0].z = result not available - * - * TEMP[1].x = current result index - * TEMP[1].y = current pair index - */ - static const char text_tmpl[] = - "COMP\n" - "PROPERTY CS_FIXED_BLOCK_WIDTH 1\n" - "PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n" - "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n" - "DCL BUFFER[0]\n" - "DCL BUFFER[1]\n" - "DCL BUFFER[2]\n" - "DCL CONST[0][0..1]\n" - "DCL TEMP[0..5]\n" - "IMM[0] UINT32 {0, 31, 2147483647, 4294967295}\n" - "IMM[1] UINT32 {1, 2, 4, 8}\n" - "IMM[2] UINT32 {16, 32, 64, 128}\n" - "IMM[3] UINT32 {1000000, 0, %u, 0}\n" /* for timestamp conversion */ - "IMM[4] UINT32 {256, 0, 0, 0}\n" - - "AND TEMP[5], CONST[0][0].wwww, IMM[2].xxxx\n" - "UIF TEMP[5]\n" - /* Check result availability. */ - "LOAD TEMP[1].x, BUFFER[0], CONST[0][1].xxxx\n" - "ISHR TEMP[0].z, TEMP[1].xxxx, IMM[0].yyyy\n" - "MOV TEMP[1], TEMP[0].zzzz\n" - "NOT TEMP[0].z, TEMP[0].zzzz\n" - - /* Load result if available. */ - "UIF TEMP[1]\n" - "LOAD TEMP[0].xy, BUFFER[0], IMM[0].xxxx\n" - "ENDIF\n" - "ELSE\n" - /* Load previously accumulated result if requested. */ - "MOV TEMP[0], IMM[0].xxxx\n" - "AND TEMP[4], CONST[0][0].wwww, IMM[1].xxxx\n" - "UIF TEMP[4]\n" - "LOAD TEMP[0].xyz, BUFFER[1], IMM[0].xxxx\n" - "ENDIF\n" - - "MOV TEMP[1].x, IMM[0].xxxx\n" - "BGNLOOP\n" - /* Break if accumulated result so far is not available. */ - "UIF TEMP[0].zzzz\n" - "BRK\n" - "ENDIF\n" - - /* Break if result_index >= result_count. */ - "USGE TEMP[5], TEMP[1].xxxx, CONST[0][0].zzzz\n" - "UIF TEMP[5]\n" - "BRK\n" - "ENDIF\n" - - /* Load fence and check result availability */ - "UMAD TEMP[5].x, TEMP[1].xxxx, CONST[0][0].yyyy, CONST[0][1].xxxx\n" - "LOAD TEMP[5].x, BUFFER[0], TEMP[5].xxxx\n" - "ISHR TEMP[0].z, TEMP[5].xxxx, IMM[0].yyyy\n" - "NOT TEMP[0].z, TEMP[0].zzzz\n" - "UIF TEMP[0].zzzz\n" - "BRK\n" - "ENDIF\n" - - "MOV TEMP[1].y, IMM[0].xxxx\n" - "BGNLOOP\n" - /* Load start and end. */ - "UMUL TEMP[5].x, TEMP[1].xxxx, CONST[0][0].yyyy\n" - "UMAD TEMP[5].x, TEMP[1].yyyy, CONST[0][1].yyyy, TEMP[5].xxxx\n" - "LOAD TEMP[2].xy, BUFFER[0], TEMP[5].xxxx\n" - - "UADD TEMP[5].y, TEMP[5].xxxx, CONST[0][0].xxxx\n" - "LOAD TEMP[3].xy, BUFFER[0], TEMP[5].yyyy\n" - - "U64ADD TEMP[4].xy, TEMP[3], -TEMP[2]\n" - - "AND TEMP[5].z, CONST[0][0].wwww, IMM[4].xxxx\n" - "UIF TEMP[5].zzzz\n" - /* Load second start/end half-pair and - * take the difference - */ - "UADD TEMP[5].xy, TEMP[5], IMM[1].wwww\n" - "LOAD TEMP[2].xy, BUFFER[0], TEMP[5].xxxx\n" - "LOAD TEMP[3].xy, BUFFER[0], TEMP[5].yyyy\n" - - "U64ADD TEMP[3].xy, TEMP[3], -TEMP[2]\n" - "U64ADD TEMP[4].xy, TEMP[4], -TEMP[3]\n" - "ENDIF\n" - - "U64ADD TEMP[0].xy, TEMP[0], TEMP[4]\n" - - /* Increment pair index */ - "UADD TEMP[1].y, TEMP[1].yyyy, IMM[1].xxxx\n" - "USGE TEMP[5], TEMP[1].yyyy, CONST[0][1].zzzz\n" - "UIF TEMP[5]\n" - "BRK\n" - "ENDIF\n" - "ENDLOOP\n" - - /* Increment result index */ - "UADD TEMP[1].x, TEMP[1].xxxx, IMM[1].xxxx\n" - "ENDLOOP\n" - "ENDIF\n" - - "AND TEMP[4], CONST[0][0].wwww, IMM[1].yyyy\n" - "UIF TEMP[4]\n" - /* Store accumulated data for chaining. */ - "STORE BUFFER[2].xyz, IMM[0].xxxx, TEMP[0]\n" - "ELSE\n" - "AND TEMP[4], CONST[0][0].wwww, IMM[1].zzzz\n" - "UIF TEMP[4]\n" - /* Store result availability. */ - "NOT TEMP[0].z, TEMP[0]\n" - "AND TEMP[0].z, TEMP[0].zzzz, IMM[1].xxxx\n" - "STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].zzzz\n" - - "AND TEMP[4], CONST[0][0].wwww, IMM[2].zzzz\n" - "UIF TEMP[4]\n" - "STORE BUFFER[2].y, IMM[0].xxxx, IMM[0].xxxx\n" - "ENDIF\n" - "ELSE\n" - /* Store result if it is available. */ - "NOT TEMP[4], TEMP[0].zzzz\n" - "UIF TEMP[4]\n" - /* Apply timestamp conversion */ - "AND TEMP[4], CONST[0][0].wwww, IMM[2].yyyy\n" - "UIF TEMP[4]\n" - "U64MUL TEMP[0].xy, TEMP[0], IMM[3].xyxy\n" - "U64DIV TEMP[0].xy, TEMP[0], IMM[3].zwzw\n" - "ENDIF\n" - - /* Convert to boolean */ - "AND TEMP[4], CONST[0][0].wwww, IMM[1].wwww\n" - "UIF TEMP[4]\n" - "U64SNE TEMP[0].x, TEMP[0].xyxy, IMM[4].zwzw\n" - "AND TEMP[0].x, TEMP[0].xxxx, IMM[1].xxxx\n" - "MOV TEMP[0].y, IMM[0].xxxx\n" - "ENDIF\n" - - "AND TEMP[4], CONST[0][0].wwww, IMM[2].zzzz\n" - "UIF TEMP[4]\n" - "STORE BUFFER[2].xy, IMM[0].xxxx, TEMP[0].xyxy\n" - "ELSE\n" - /* Clamping */ - "UIF TEMP[0].yyyy\n" - "MOV TEMP[0].x, IMM[0].wwww\n" - "ENDIF\n" - - "AND TEMP[4], CONST[0][0].wwww, IMM[2].wwww\n" - "UIF TEMP[4]\n" - "UMIN TEMP[0].x, TEMP[0].xxxx, IMM[0].zzzz\n" - "ENDIF\n" - - "STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].xxxx\n" - "ENDIF\n" - "ENDIF\n" - "ENDIF\n" - "ENDIF\n" - - "END\n"; - - char text[sizeof(text_tmpl) + 32]; - struct tgsi_token tokens[1024]; - struct pipe_compute_state state = {}; - - /* Hard code the frequency into the shader so that the backend can - * use the full range of optimizations for divide-by-constant. - */ - snprintf(text, sizeof(text), text_tmpl, sctx->screen->info.clock_crystal_freq); - - if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) { - assert(false); - return NULL; - } - - state.ir_type = PIPE_SHADER_IR_TGSI; - state.prog = tokens; - - return sctx->b.create_compute_state(&sctx->b, &state); -} - /* Create the compute shader that is used to collect the results of gfx10+ * shader queries. *
