From: Ian Romanick <ian.d.roman...@intel.com> Add a way to stop nir_opt_peephole_select from removing flow control around indirect loads. That flow control may be trying to avoid invalid loads. On at least some platforms, those loads can also be expensive.
No shader-db changes on any Intel platform (even with the later patch "intel/compiler: More peephole select"). NOTE: I've tried to CC everyone whose driver is affected by this change. v2: Add a 'indirect_load_ok' flag to nir_opt_peephole_select. Suggested by Rob. See also the big comment in src/intel/compiler/brw_nir.c. v3: Use nir_deref_instr_has_indirect instead of deref_has_indirect (from nir_lower_io_arrays_to_elements.c). Signed-off-by: Ian Romanick <ian.d.roman...@intel.com> Cc: Eric Anholt <e...@anholt.net> Cc: Rob Clark <robdcl...@gmail.com> Cc: Marek Olšák <marek.ol...@amd.com> --- src/amd/vulkan/radv_shader.c | 2 +- src/broadcom/compiler/nir_to_vir.c | 2 +- src/compiler/nir/nir.h | 3 ++- src/compiler/nir/nir_opt_peephole_select.c | 37 ++++++++++++++++++++-------- src/gallium/drivers/freedreno/ir3/ir3_nir.c | 2 +- src/gallium/drivers/radeonsi/si_shader_nir.c | 2 +- src/gallium/drivers/vc4/vc4_program.c | 2 +- src/intel/compiler/brw_nir.c | 13 +++++++++- src/mesa/state_tracker/st_glsl_to_nir.cpp | 2 +- 9 files changed, 47 insertions(+), 18 deletions(-) diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c index 207e5b050eb..632512db09b 100644 --- a/src/amd/vulkan/radv_shader.c +++ b/src/amd/vulkan/radv_shader.c @@ -143,7 +143,7 @@ radv_optimize_nir(struct nir_shader *shader, bool optimize_conservatively) NIR_PASS(progress, shader, nir_opt_if); NIR_PASS(progress, shader, nir_opt_dead_cf); NIR_PASS(progress, shader, nir_opt_cse); - NIR_PASS(progress, shader, nir_opt_peephole_select, 8); + NIR_PASS(progress, shader, nir_opt_peephole_select, 8, true); NIR_PASS(progress, shader, nir_opt_algebraic); NIR_PASS(progress, shader, nir_opt_constant_folding); NIR_PASS(progress, shader, nir_opt_undef); diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c index 158c1c3e9f3..0d23cea4d5b 100644 --- a/src/broadcom/compiler/nir_to_vir.c +++ b/src/broadcom/compiler/nir_to_vir.c @@ -1210,7 +1210,7 @@ v3d_optimize_nir(struct 
nir_shader *s) NIR_PASS(progress, s, nir_opt_dce); NIR_PASS(progress, s, nir_opt_dead_cf); NIR_PASS(progress, s, nir_opt_cse); - NIR_PASS(progress, s, nir_opt_peephole_select, 8); + NIR_PASS(progress, s, nir_opt_peephole_select, 8, true); NIR_PASS(progress, s, nir_opt_algebraic); NIR_PASS(progress, s, nir_opt_constant_folding); NIR_PASS(progress, s, nir_opt_undef); diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h index 12cad6029cd..67fa46d5557 100644 --- a/src/compiler/nir/nir.h +++ b/src/compiler/nir/nir.h @@ -3002,7 +3002,8 @@ bool nir_opt_move_comparisons(nir_shader *shader); bool nir_opt_move_load_ubo(nir_shader *shader); -bool nir_opt_peephole_select(nir_shader *shader, unsigned limit); +bool nir_opt_peephole_select(nir_shader *shader, unsigned limit, + bool indirect_load_ok); bool nir_opt_remove_phis_impl(nir_function_impl *impl); bool nir_opt_remove_phis(nir_shader *shader); diff --git a/src/compiler/nir/nir_opt_peephole_select.c b/src/compiler/nir/nir_opt_peephole_select.c index ad9d0abec03..6808d3eda6c 100644 --- a/src/compiler/nir/nir_opt_peephole_select.c +++ b/src/compiler/nir/nir_opt_peephole_select.c @@ -58,7 +58,8 @@ */ static bool -block_check_for_allowed_instrs(nir_block *block, unsigned *count, bool alu_ok) +block_check_for_allowed_instrs(nir_block *block, unsigned *count, + bool alu_ok, bool indirect_load_ok) { nir_foreach_instr(instr, block) { switch (instr->type) { @@ -66,16 +67,26 @@ block_check_for_allowed_instrs(nir_block *block, unsigned *count, bool alu_ok) nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); switch (intrin->intrinsic) { - case nir_intrinsic_load_deref: - switch (nir_src_as_deref(intrin->src[0])->mode) { + case nir_intrinsic_load_deref: { + nir_deref_instr *const deref = nir_src_as_deref(intrin->src[0]); + + switch (deref->mode) { case nir_var_shader_in: case nir_var_uniform: + /* Don't try to remove flow control around an indirect load + * because that flow control may be trying to avoid invalid + * 
loads. + */ + if (!indirect_load_ok && nir_deref_instr_has_indirect(deref)) + return false; + break; default: return false; } break; + } case nir_intrinsic_load_uniform: if (!alu_ok) @@ -149,7 +160,7 @@ block_check_for_allowed_instrs(nir_block *block, unsigned *count, bool alu_ok) static bool nir_opt_peephole_select_block(nir_block *block, nir_shader *shader, - unsigned limit) + unsigned limit, bool indirect_load_ok) { if (nir_cf_node_is_first(&block->cf_node)) return false; @@ -169,8 +180,10 @@ nir_opt_peephole_select_block(nir_block *block, nir_shader *shader, /* ... and those blocks must only contain "allowed" instructions. */ unsigned count = 0; - if (!block_check_for_allowed_instrs(then_block, &count, limit != 0) || - !block_check_for_allowed_instrs(else_block, &count, limit != 0)) + if (!block_check_for_allowed_instrs(then_block, &count, limit != 0, + indirect_load_ok) || + !block_check_for_allowed_instrs(else_block, &count, limit != 0, + indirect_load_ok)) return false; if (count > limit) @@ -236,13 +249,15 @@ nir_opt_peephole_select_block(nir_block *block, nir_shader *shader, } static bool -nir_opt_peephole_select_impl(nir_function_impl *impl, unsigned limit) +nir_opt_peephole_select_impl(nir_function_impl *impl, unsigned limit, + bool indirect_load_ok) { nir_shader *shader = impl->function->shader; bool progress = false; nir_foreach_block_safe(block, impl) { - progress |= nir_opt_peephole_select_block(block, shader, limit); + progress |= nir_opt_peephole_select_block(block, shader, limit, + indirect_load_ok); } if (progress) @@ -252,13 +267,15 @@ nir_opt_peephole_select_impl(nir_function_impl *impl, unsigned limit) } bool -nir_opt_peephole_select(nir_shader *shader, unsigned limit) +nir_opt_peephole_select(nir_shader *shader, unsigned limit, + bool indirect_load_ok) { bool progress = false; nir_foreach_function(function, shader) { if (function->impl) - progress |= nir_opt_peephole_select_impl(function->impl, limit); + progress |= 
nir_opt_peephole_select_impl(function->impl, limit, + indirect_load_ok); } return progress; diff --git a/src/gallium/drivers/freedreno/ir3/ir3_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_nir.c index db1d74fdee7..5f66ef5d170 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_nir.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_nir.c @@ -107,7 +107,7 @@ ir3_optimize_loop(nir_shader *s) progress |= OPT(s, nir_opt_gcm, true); else if (gcm == 2) progress |= OPT(s, nir_opt_gcm, false); - progress |= OPT(s, nir_opt_peephole_select, 16); + progress |= OPT(s, nir_opt_peephole_select, 16, true); progress |= OPT(s, nir_opt_intrinsics); progress |= OPT(s, nir_opt_algebraic); progress |= OPT(s, nir_opt_constant_folding); diff --git a/src/gallium/drivers/radeonsi/si_shader_nir.c b/src/gallium/drivers/radeonsi/si_shader_nir.c index 0aefca22385..9a7a8264283 100644 --- a/src/gallium/drivers/radeonsi/si_shader_nir.c +++ b/src/gallium/drivers/radeonsi/si_shader_nir.c @@ -813,7 +813,7 @@ si_lower_nir(struct si_shader_selector* sel) NIR_PASS(progress, sel->nir, nir_opt_if); NIR_PASS(progress, sel->nir, nir_opt_dead_cf); NIR_PASS(progress, sel->nir, nir_opt_cse); - NIR_PASS(progress, sel->nir, nir_opt_peephole_select, 8); + NIR_PASS(progress, sel->nir, nir_opt_peephole_select, 8, true); /* Needed for algebraic lowering */ NIR_PASS(progress, sel->nir, nir_opt_algebraic); diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c index 1d767af1bdb..39f7db9148c 100644 --- a/src/gallium/drivers/vc4/vc4_program.c +++ b/src/gallium/drivers/vc4/vc4_program.c @@ -1580,7 +1580,7 @@ vc4_optimize_nir(struct nir_shader *s) NIR_PASS(progress, s, nir_opt_dce); NIR_PASS(progress, s, nir_opt_dead_cf); NIR_PASS(progress, s, nir_opt_cse); - NIR_PASS(progress, s, nir_opt_peephole_select, 8); + NIR_PASS(progress, s, nir_opt_peephole_select, 8, true); NIR_PASS(progress, s, nir_opt_algebraic); NIR_PASS(progress, s, nir_opt_constant_folding); NIR_PASS(progress, s, 
nir_opt_undef); diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c index ef5034d1e1e..6ce8325a4dd 100644 --- a/src/intel/compiler/brw_nir.c +++ b/src/intel/compiler/brw_nir.c @@ -566,7 +566,18 @@ brw_nir_optimize(nir_shader *nir, const struct brw_compiler *compiler, OPT(nir_copy_prop); OPT(nir_opt_dce); OPT(nir_opt_cse); - OPT(nir_opt_peephole_select, 0); + + /* For indirect loads of uniforms (push constants), we assume that array + * indices will nearly always be in bounds and the cost of the load is + * low. Therefore there shouldn't be a performance benefit to avoid it. + * However, in vec4 tessellation shaders, these loads operate by + * actually pulling from memory. + */ + const bool is_vec4_tessellation = !is_scalar && + (nir->info.stage == MESA_SHADER_TESS_CTRL || + nir->info.stage == MESA_SHADER_TESS_EVAL); + OPT(nir_opt_peephole_select, 0, is_vec4_tessellation); + OPT(nir_opt_intrinsics); OPT(nir_opt_algebraic); OPT(nir_opt_constant_folding); diff --git a/src/mesa/state_tracker/st_glsl_to_nir.cpp b/src/mesa/state_tracker/st_glsl_to_nir.cpp index ae2c49960c9..bfcef3a293f 100644 --- a/src/mesa/state_tracker/st_glsl_to_nir.cpp +++ b/src/mesa/state_tracker/st_glsl_to_nir.cpp @@ -344,7 +344,7 @@ st_nir_opts(nir_shader *nir, bool scalar) NIR_PASS(progress, nir, nir_opt_if); NIR_PASS(progress, nir, nir_opt_dead_cf); NIR_PASS(progress, nir, nir_opt_cse); - NIR_PASS(progress, nir, nir_opt_peephole_select, 8); + NIR_PASS(progress, nir, nir_opt_peephole_select, 8, true); NIR_PASS(progress, nir, nir_opt_algebraic); NIR_PASS(progress, nir, nir_opt_constant_folding); -- 2.14.4 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev