[Mesa-dev] [PATCH v4] anv: fix alphaToCoverage when there is no color attachment
From: Samuel Iglesias Gonsálvez There are tests in CTS for alpha to coverage without a color attachment that are failing. This happens because we remove the shader color outputs when we don't have a valid color attachment for them, but when alpha to coverage is enabled we still want to preserve the the output at location 0 since we need the alpha component. In that case we will also need to create a null render target for RT 0. v2: - We already create a null rt when we don't have any, so reuse that for this case (Jason) - Simplify the code a bit (Iago) v3: - Take alpha to coverage from the key and don't tie this to depth-only rendering only, we want the same behavior if we have multiple render targets but the one at location 0 is not used. (Jason). - Rewrite commit message (Iago) v4: - Make sure we take into account the array length of the shader outputs, which we were no handling correctly either and make sure we also create null render targets for any invalid array entries too. (Jason) Fixes the following CTS tests: dEQP-VK.pipeline.multisample.alpha_to_coverage_no_color_attachment.* Signed-off-by: Samuel Iglesias Gonsálvez Signed-off-by: Iago Toral Quiroga --- src/intel/vulkan/anv_pipeline.c | 56 - 1 file changed, 42 insertions(+), 14 deletions(-) diff --git a/src/intel/vulkan/anv_pipeline.c b/src/intel/vulkan/anv_pipeline.c index 20eab548fb2..f15f0896266 100644 --- a/src/intel/vulkan/anv_pipeline.c +++ b/src/intel/vulkan/anv_pipeline.c @@ -823,14 +823,24 @@ anv_pipeline_link_fs(const struct brw_compiler *compiler, continue; const unsigned rt = var->data.location - FRAG_RESULT_DATA0; - /* Unused or out-of-bounds */ - if (rt >= MAX_RTS || !(stage->key.wm.color_outputs_valid & (1 << rt))) + /* Out-of-bounds */ + if (rt >= MAX_RTS) continue; const unsigned array_len = glsl_type_is_array(var->type) ? 
glsl_get_length(var->type) : 1; assert(rt + array_len <= max_rt); + /* Unused */ + if (!(stage->key.wm.color_outputs_valid & BITFIELD_RANGE(rt, array_len))) { + /* If this is the RT at location 0 and we have alpha to coverage + * enabled we will have to create a null RT for it, so mark it as + * used. + */ + if (rt > 0 || !stage->key.wm.alpha_to_coverage) +continue; + } + for (unsigned i = 0; i < array_len; i++) rt_used[rt + i] = true; } @@ -841,11 +851,22 @@ anv_pipeline_link_fs(const struct brw_compiler *compiler, continue; rt_to_bindings[i] = num_rts; - rt_bindings[rt_to_bindings[i]] = (struct anv_pipeline_binding) { - .set = ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS, - .binding = 0, - .index = i, - }; + + if (stage->key.wm.color_outputs_valid & (1 << i)) { + rt_bindings[rt_to_bindings[i]] = (struct anv_pipeline_binding) { +.set = ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS, +.binding = 0, +.index = i, + }; + } else { + /* Setup a null render target */ + rt_bindings[rt_to_bindings[i]] = (struct anv_pipeline_binding) { +.set = ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS, +.binding = 0, +.index = UINT32_MAX, + }; + } + num_rts++; } @@ -855,14 +876,21 @@ anv_pipeline_link_fs(const struct brw_compiler *compiler, continue; const unsigned rt = var->data.location - FRAG_RESULT_DATA0; + const unsigned array_len = + glsl_type_is_array(var->type) ? glsl_get_length(var->type) : 1; + if (rt >= MAX_RTS || - !(stage->key.wm.color_outputs_valid & (1 << rt))) { - /* Unused or out-of-bounds, throw it away */ - deleted_output = true; - var->data.mode = nir_var_function_temp; - exec_node_remove(&var->node); - exec_list_push_tail(&impl->locals, &var->node); - continue; + !(stage->key.wm.color_outputs_valid & BITFIELD_RANGE(rt, array_len))) { + /* Unused or out-of-bounds, throw it away, unless it is the first + * RT and we have alpha to coverage enabled. 
+ */ + if (rt != 0 || !stage->key.wm.alpha_to_coverage) { +deleted_output = true; +var->data.mode = nir_var_function_temp; +exec_node_remove(&var->node); +exec_list_push_tail(&impl->locals, &var->node); +continue; + } } /* Give it the new location */ -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v3] anv: fix alphaToCoverage when there is no color attachment
From: Samuel Iglesias Gonsálvez There are tests in CTS for alpha to coverage without a color attachment that are failing. This happens because when we remove the shader color outputs when we don't have a valid color attachment for them, but when alpha to coverage is enabled we still want to preserve the the output at location 0 since we need its alpha component for alpha to coverage. In that case we will also need to create a null render target for RT 0. v2: - We already create a null rt when we don't have any, so reuse that for this case (Jason) - Simplify the code a bit (Iago) v3: - Take alpha to coverage from the key and don't tie this to depth-only rendering only, we want the same behavior if we have multiple render targets but the one at location 0 is not used. (Jason). - Rewrite commit message (Iago) Fixes the following CTS tests: dEQP-VK.pipeline.multisample.alpha_to_coverage_no_color_attachment.* Signed-off-by: Samuel Iglesias Gonsálvez Signed-off-by: Iago Toral Quiroga --- src/intel/vulkan/anv_pipeline.c | 48 + 1 file changed, 37 insertions(+), 11 deletions(-) diff --git a/src/intel/vulkan/anv_pipeline.c b/src/intel/vulkan/anv_pipeline.c index 20eab548fb2..f379dd2752e 100644 --- a/src/intel/vulkan/anv_pipeline.c +++ b/src/intel/vulkan/anv_pipeline.c @@ -818,15 +818,28 @@ anv_pipeline_link_fs(const struct brw_compiler *compiler, memset(rt_used, 0, sizeof(rt_used)); /* Flag used render targets */ + bool needs_null_rt_for_alpha_to_coverage = false; nir_foreach_variable_safe(var, &stage->nir->outputs) { if (var->data.location < FRAG_RESULT_DATA0) continue; const unsigned rt = var->data.location - FRAG_RESULT_DATA0; - /* Unused or out-of-bounds */ - if (rt >= MAX_RTS || !(stage->key.wm.color_outputs_valid & (1 << rt))) + /* Out-of-bounds */ + if (rt >= MAX_RTS) continue; + /* Unused */ + if (!(stage->key.wm.color_outputs_valid & (1 << rt))) { + /* If this is the RT at location 0 and we have alpha to coverage + * enabled, we'll have to create a null render 
target and it must + * be at index 0. + */ + if (rt == 0 && stage->key.wm.alpha_to_coverage) +needs_null_rt_for_alpha_to_coverage = true; + + continue; + } + const unsigned array_len = glsl_type_is_array(var->type) ? glsl_get_length(var->type) : 1; assert(rt + array_len <= max_rt); @@ -835,7 +848,12 @@ anv_pipeline_link_fs(const struct brw_compiler *compiler, rt_used[rt + i] = true; } - /* Set new, compacted, location */ + /* Make sure we leave the first RT slot available for alpha to coverage +* if we don't have a valid RT 0. +*/ + if (needs_null_rt_for_alpha_to_coverage) + num_rts = 1; + for (unsigned i = 0; i < max_rt; i++) { if (!rt_used[i]) continue; @@ -857,11 +875,15 @@ anv_pipeline_link_fs(const struct brw_compiler *compiler, const unsigned rt = var->data.location - FRAG_RESULT_DATA0; if (rt >= MAX_RTS || !(stage->key.wm.color_outputs_valid & (1 << rt))) { - /* Unused or out-of-bounds, throw it away */ - deleted_output = true; - var->data.mode = nir_var_function_temp; - exec_node_remove(&var->node); - exec_list_push_tail(&impl->locals, &var->node); + /* Unused or out-of-bounds, throw it away, unless it is the first + * RT and we have alpha to coverage. + */ + if (rt != 0 || !stage->key.wm.alpha_to_coverage) { +deleted_output = true; +var->data.mode = nir_var_function_temp; +exec_node_remove(&var->node); +exec_list_push_tail(&impl->locals, &var->node); + } continue; } @@ -873,14 +895,18 @@ anv_pipeline_link_fs(const struct brw_compiler *compiler, if (deleted_output) nir_fixup_deref_modes(stage->nir); - if (num_rts == 0) { - /* If we have no render targets, we need a null render target */ + /* If we have no render targets or we need to create one for alpha to +* coverage, we need a null render target. 
+*/ + if (num_rts == 0 || needs_null_rt_for_alpha_to_coverage) { rt_bindings[0] = (struct anv_pipeline_binding) { .set = ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS, .binding = 0, .index = UINT32_MAX, }; - num_rts = 1; + + if (num_rts == 0) + num_rts = 1; } /* Now that we've determined the actual number of render targets, adjust -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2] anv: fix alphaToCoverage when there is no color attachment
From: Samuel Iglesias Gonsálvez There are tests in CTS for alpha to coverage without a color attachment. First the test draws a primitive with alpha 0 and a subpass with only a depth buffer. No writes to a depth buffer are expected. Then a second draw with a color buffer and the same depth buffer is done to verify the depth buffer still has the original clear values. This behavior is not explicitly forbidden by the Vulkan spec, so it seems it is allowed. When there is no color attachment for a given output, we discard it so at the end we have an FS assembly like: Native code for unnamed fragment shader (null) SIMD16 shader: 1 instructions. 0 loops. 4 cycles. 0:0 spills:fills. Promoted 0 constants. Compacted 16 to 16 bytes (0%) START B0 (4 cycles) sendc(16) null<1>UW g120<0,1,0>F0x90031000 render MsgDesc: RT write SIMD16 LastRT Surface = 0 mlen 8 rlen 0 { align1 1H EOT }; As g120 is not initialized, we see random writes to the depth buffer due to the alphaToCoverage enablement. This patch fixes that by keeping the output in that case. 
v2: - No need to create a null render target, the driver is already doing that (Jason) - Simplified code a bit (Iago) Fixes the following CTS tests: dEQP-VK.pipeline.multisample.alpha_to_coverage_no_color_attachment.* Signed-off-by: Samuel Iglesias Gonsálvez Signed-off-by: Iago Toral Quiroga --- src/intel/vulkan/anv_pipeline.c | 25 ++--- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/src/intel/vulkan/anv_pipeline.c b/src/intel/vulkan/anv_pipeline.c index b9c9bfd7598..07f1a939e43 100644 --- a/src/intel/vulkan/anv_pipeline.c +++ b/src/intel/vulkan/anv_pipeline.c @@ -808,7 +808,9 @@ anv_pipeline_compile_gs(const struct brw_compiler *compiler, static void anv_pipeline_link_fs(const struct brw_compiler *compiler, - struct anv_pipeline_stage *stage) + struct anv_pipeline_stage *stage, + bool has_depth_stencil_att, + bool has_alpha_to_coverage) { unsigned num_rts = 0; const int max_rt = FRAG_RESULT_DATA7 - FRAG_RESULT_DATA0 + 1; @@ -859,11 +861,17 @@ anv_pipeline_link_fs(const struct brw_compiler *compiler, const unsigned rt = var->data.location - FRAG_RESULT_DATA0; if (rt >= MAX_RTS || !(stage->key.wm.color_outputs_valid & (1 << rt))) { - /* Unused or out-of-bounds, throw it away */ - deleted_output = true; - var->data.mode = nir_var_function_temp; - exec_node_remove(&var->node); - exec_list_push_tail(&impl->locals, &var->node); + /* Unused or out-of-bounds, throw it away. The exception is depth-only + * rendering with alphaToCoverage, as in this case we need to keep the + * fragment output in location 0, which we will bind later to a null + * render target. 
+ */ + if (rt != 0 || !has_alpha_to_coverage || !has_depth_stencil_att) { +deleted_output = true; +var->data.mode = nir_var_function_temp; +exec_node_remove(&var->node); +exec_list_push_tail(&impl->locals, &var->node); + } continue; } @@ -1120,7 +1128,10 @@ anv_pipeline_compile_graphics(struct anv_pipeline *pipeline, anv_pipeline_link_gs(compiler, &stages[s], next_stage); break; case MESA_SHADER_FRAGMENT: - anv_pipeline_link_fs(compiler, &stages[s]); + anv_pipeline_link_fs(compiler, &stages[s], + pipeline->subpass->depth_stencil_attachment, + info->pMultisampleState && + info->pMultisampleState->alphaToCoverageEnable); break; default: unreachable("Invalid graphics shader stage"); -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] anv/query: ensure visibility of reset before copying query results
Specifically, vkCmdCopyQueryPoolResults is required to see the effect of a previous vkCmdResetQueryPool. This may not work currently when query execution is still on going, as some of the queries may become available asynchronously after the reset. Fixes new CTS tests: dEQP-VK.query_pool.statistics_query.reset_before_copy.* --- Jason, do you have any better ideas? src/intel/vulkan/genX_query.c | 13 + 1 file changed, 13 insertions(+) diff --git a/src/intel/vulkan/genX_query.c b/src/intel/vulkan/genX_query.c index 146435c3f8f..08b013f6351 100644 --- a/src/intel/vulkan/genX_query.c +++ b/src/intel/vulkan/genX_query.c @@ -383,6 +383,19 @@ void genX(CmdResetQueryPool)( ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); ANV_FROM_HANDLE(anv_query_pool, pool, queryPool); + /* From the Vulkan spec: +* +*"vkCmdCopyQueryPoolResults is guaranteed to see the effect of +* previous uses of vkCmdResetQueryPool in the same queue, without +* any additional synchronization. Thus, the results will always +* reflect the most recent use of the query." +* +* So we need to make sure that any on-going queries are finished by +* the time we emit the reset. +*/ + cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_CS_STALL_BIT; + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + for (uint32_t i = 0; i < queryCount; i++) { anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdm) { sdm.Address = anv_query_address(pool, firstQuery + i); -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 3/3] intel/compiler: implement more algebraic optimizations
Now that we propagate constants to the first source of 2src instructions we see more opportunities of constant folding in the backend. v2: - The hardware only uses 5 bits (or 6 bits for Q/UQ) from the shift count parameter in SHL/SHR instructions, so do the same in constant propagation (Ian) --- src/intel/compiler/brw_fs.cpp | 203 -- 1 file changed, 195 insertions(+), 8 deletions(-) diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index bd588b55bde..f09320b4127 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -2583,9 +2583,55 @@ fs_visitor::opt_algebraic() break; case BRW_OPCODE_MUL: - if (inst->src[1].file != IMM) + if (inst->src[0].file != IMM && inst->src[1].file != IMM) continue; + /* Constant folding */ + if (inst->src[0].file == IMM && inst->src[1].file == IMM) { +assert(inst->src[0].type == inst->src[1].type); +bool local_progress = true; +switch (inst->src[0].type) { +case BRW_REGISTER_TYPE_HF: { + float v1 = _mesa_half_to_float(inst->src[0].ud & 0xu); + float v2 = _mesa_half_to_float(inst->src[1].ud & 0xu); + inst->src[0] = brw_imm_w(_mesa_float_to_half(v1 * v2)); + break; +} +case BRW_REGISTER_TYPE_W: { + int16_t v1 = inst->src[0].ud & 0xu; + int16_t v2 = inst->src[1].ud & 0xu; + inst->src[0] = brw_imm_w(v1 * v2); + break; +} +case BRW_REGISTER_TYPE_UW: { + uint16_t v1 = inst->src[0].ud & 0xu; + uint16_t v2 = inst->src[1].ud & 0xu; + inst->src[0] = brw_imm_uw(v1 * v2); + break; +} +case BRW_REGISTER_TYPE_F: + inst->src[0].f *= inst->src[1].f; + break; +case BRW_REGISTER_TYPE_D: + inst->src[0].d *= inst->src[1].d; + break; +case BRW_REGISTER_TYPE_UD: + inst->src[0].ud *= inst->src[1].ud; + break; +default: + local_progress = false; + break; +}; + +if (local_progress) { + inst->opcode = BRW_OPCODE_MOV; + inst->src[1] = reg_undef; + progress = true; + break; +} + } + + /* a * 1.0 = a */ if (inst->src[1].is_one()) { inst->opcode = BRW_OPCODE_MOV; @@ -2594,6 +2640,14 @@ fs_visitor::opt_algebraic() 
break; } + if (inst->src[0].is_one()) { +inst->opcode = BRW_OPCODE_MOV; +inst->src[0] = inst->src[1]; +inst->src[1] = reg_undef; +progress = true; +break; + } + /* a * -1.0 = -a */ if (inst->src[1].is_negative_one()) { inst->opcode = BRW_OPCODE_MOV; @@ -2603,27 +2657,160 @@ fs_visitor::opt_algebraic() break; } - if (inst->src[0].file == IMM && - inst->src[0].type == BRW_REGISTER_TYPE_F) { + if (inst->src[0].is_negative_one()) { +inst->opcode = BRW_OPCODE_MOV; +inst->src[0] = inst->src[1]; +inst->src[0].negate = !inst->src[1].negate; +inst->src[1] = reg_undef; +progress = true; +break; + } + + /* a * 0 = 0 (this is not exact for floating point) */ + if (inst->src[1].is_zero() && + brw_reg_type_is_integer(inst->src[1].type)) { +inst->opcode = BRW_OPCODE_MOV; +inst->src[0] = inst->src[1]; +inst->src[1] = reg_undef; +progress = true; +break; + } + + if (inst->src[0].is_zero() && + brw_reg_type_is_integer(inst->src[0].type)) { inst->opcode = BRW_OPCODE_MOV; -inst->src[0].f *= inst->src[1].f; inst->src[1] = reg_undef; progress = true; break; } break; case BRW_OPCODE_ADD: - if (inst->src[1].file != IMM) + if (inst->src[0].file != IMM && inst->src[1].file != IMM) continue; - if (inst->src[0].file == IMM && - inst->src[0].type == BRW_REGISTER_TYPE_F) { + /* Constant folding */ + if (inst->src[0].file == IMM && inst->src[1].file == IMM) { +assert(inst->src[0].type == inst->src[1].type); +bool local_progress = true; +switch (inst->src[0].type) { +case BRW_REGISTER_TYPE_HF: { + float v1 = _mesa_half_to_float(inst->src[0].ud & 0xu); + float v2 = _mesa_half_to_float(inst->src[1].ud & 0xu); + inst->src[0] = brw_imm_w(_mesa_float_to_half(v1 + v2)); +
[Mesa-dev] [PATCH v2 1/3] intel/compiler: allow constant propagation for int quotient and remainder
And let combine constants promote the constants if needed. --- src/intel/compiler/brw_fs_combine_constants.cpp | 2 ++ src/intel/compiler/brw_fs_copy_propagation.cpp | 4 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/intel/compiler/brw_fs_combine_constants.cpp b/src/intel/compiler/brw_fs_combine_constants.cpp index db7b14a8312..a26eb67a00a 100644 --- a/src/intel/compiler/brw_fs_combine_constants.cpp +++ b/src/intel/compiler/brw_fs_combine_constants.cpp @@ -69,6 +69,8 @@ static bool must_promote_imm(const struct gen_device_info *devinfo, const fs_inst *inst) { switch (inst->opcode) { + case SHADER_OPCODE_INT_QUOTIENT: + case SHADER_OPCODE_INT_REMAINDER: case SHADER_OPCODE_POW: return devinfo->gen < 8; case BRW_OPCODE_MAD: diff --git a/src/intel/compiler/brw_fs_copy_propagation.cpp b/src/intel/compiler/brw_fs_copy_propagation.cpp index 8b904ab356b..c11b05b128a 100644 --- a/src/intel/compiler/brw_fs_copy_propagation.cpp +++ b/src/intel/compiler/brw_fs_copy_propagation.cpp @@ -611,10 +611,6 @@ fs_visitor::try_constant_propagate(fs_inst *inst, acp_entry *entry) case SHADER_OPCODE_INT_QUOTIENT: case SHADER_OPCODE_INT_REMAINDER: - /* FINISHME: Promote non-float constants and remove this. */ - if (devinfo->gen < 8) -break; - /* fallthrough */ case SHADER_OPCODE_POW: /* Allow constant propagation into src1 (except on Gen 6 which * doesn't support scalar source math), and let constant combining -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 2/3] intel/compiler: allow constant propagation to first source of 2src instructions
Even if it is not supported by the hardware, we will fix it up in the combine constants pass. v2: - This will enable new constant folding opportunities in the algebraic pass for MUL or ADD with types other than F, so do not assert on that type. For now we just skip anything that is not float and a later patch will expand the algebraic pass to support more constant folding scenarios. --- src/intel/compiler/brw_fs.cpp | 8 +-- .../compiler/brw_fs_combine_constants.cpp | 37 ++--- .../compiler/brw_fs_copy_propagation.cpp | 55 +-- 3 files changed, 60 insertions(+), 40 deletions(-) diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index 2358acbeb59..bd588b55bde 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -2603,8 +2603,8 @@ fs_visitor::opt_algebraic() break; } - if (inst->src[0].file == IMM) { -assert(inst->src[0].type == BRW_REGISTER_TYPE_F); + if (inst->src[0].file == IMM && + inst->src[0].type == BRW_REGISTER_TYPE_F) { inst->opcode = BRW_OPCODE_MOV; inst->src[0].f *= inst->src[1].f; inst->src[1] = reg_undef; @@ -2616,8 +2616,8 @@ fs_visitor::opt_algebraic() if (inst->src[1].file != IMM) continue; - if (inst->src[0].file == IMM) { -assert(inst->src[0].type == BRW_REGISTER_TYPE_F); + if (inst->src[0].file == IMM && + inst->src[0].type == BRW_REGISTER_TYPE_F) { inst->opcode = BRW_OPCODE_MOV; inst->src[0].f += inst->src[1].f; inst->src[1] = reg_undef; diff --git a/src/intel/compiler/brw_fs_combine_constants.cpp b/src/intel/compiler/brw_fs_combine_constants.cpp index a26eb67a00a..bb155caeccf 100644 --- a/src/intel/compiler/brw_fs_combine_constants.cpp +++ b/src/intel/compiler/brw_fs_combine_constants.cpp @@ -66,13 +66,31 @@ could_coissue(const struct gen_device_info *devinfo, const fs_inst *inst) * Returns true for instructions that don't support immediate sources. 
*/ static bool -must_promote_imm(const struct gen_device_info *devinfo, const fs_inst *inst) +must_promote_imm(const struct gen_device_info *devinfo, + const fs_inst *inst, const int src_idx) { switch (inst->opcode) { case SHADER_OPCODE_INT_QUOTIENT: case SHADER_OPCODE_INT_REMAINDER: case SHADER_OPCODE_POW: - return devinfo->gen < 8; + return src_idx != 1 || devinfo->gen < 8; + case BRW_OPCODE_BFI1: + case BRW_OPCODE_ASR: + case BRW_OPCODE_SHL: + case BRW_OPCODE_SHR: + case BRW_OPCODE_SUBB: + case BRW_OPCODE_MACH: + case BRW_OPCODE_MUL: + case SHADER_OPCODE_MULH: + case BRW_OPCODE_ADD: + case BRW_OPCODE_OR: + case BRW_OPCODE_AND: + case BRW_OPCODE_XOR: + case BRW_OPCODE_ADDC: + case BRW_OPCODE_CMP: + case BRW_OPCODE_IF: + case BRW_OPCODE_SEL: + return src_idx != 1; case BRW_OPCODE_MAD: case BRW_OPCODE_LRP: return true; @@ -335,13 +353,18 @@ fs_visitor::opt_combine_constants() foreach_block_and_inst(block, fs_inst, inst, cfg) { ip++; - if (!could_coissue(devinfo, inst) && !must_promote_imm(devinfo, inst)) + const bool is_coissue_candidate = could_coissue(devinfo, inst); + if (!is_coissue_candidate && !must_promote_imm(devinfo, inst, -1)) continue; for (int i = 0; i < inst->sources; i++) { if (inst->src[i].file != IMM) continue; + const bool must_promote = must_promote_imm(devinfo, inst, i); + if (!is_coissue_candidate && !must_promote) +continue; + char data[8]; brw_reg_type type; if (!get_constant_value(devinfo, inst, i, data, &type)) @@ -357,8 +380,8 @@ fs_visitor::opt_combine_constants() imm->inst = NULL; imm->block = intersection; imm->uses->push_tail(link(const_ctx, &inst->src[i])); -imm->uses_by_coissue += could_coissue(devinfo, inst); -imm->must_promote = imm->must_promote || must_promote_imm(devinfo, inst); +imm->uses_by_coissue += is_coissue_candidate; +imm->must_promote = imm->must_promote || must_promote; imm->last_use_ip = ip; if (type == BRW_REGISTER_TYPE_HF) imm->is_half_float = true; @@ -371,8 +394,8 @@ fs_visitor::opt_combine_constants() 
memcpy(imm->bytes, data, size); imm->size = size; imm->is_half_float = type == BRW_REGISTER_TYPE_HF; -imm->uses_by_coissue = could_coissue(devinfo, inst); -imm->must_promote = must_promote_imm(devinfo, inst); +imm->uses_by_coissue = is_coissue_candidate; +imm->must_promote = must_promote; imm->first_use_ip = ip; imm->last_use_ip = ip; } diff --git a/src/intel/compiler/brw_fs_copy_propagation.cpp b
[Mesa-dev] [PATCH v2 0/3] intel: propagate constants to first source of 2-src instructions
This v2 addresses comments to v1; specifically, constant folding of SHR/SHL instructions now only takes up to 5 bits from the shift count parameter, just like the hardware does. Also, patch 2 has been fixed to avoid hitting assertions that would then be fixed by patch 3. I sent the series up to patch 2 to Jenkins and verified that it came back green. Iago Toral Quiroga (3): intel/compiler: allow constant propagation for int quotient and remainder intel/compiler: allow constant propagation to first source of 2src instructions intel/compiler: implement more algebraic optimizations src/intel/compiler/brw_fs.cpp | 203 +- .../compiler/brw_fs_combine_constants.cpp | 39 +++- .../compiler/brw_fs_copy_propagation.cpp | 59 +++-- 3 files changed, 253 insertions(+), 48 deletions(-) -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 2/3] intel/compiler: allow constant propagation to first source of 2-src instructions
Even if it is not supported by the hardware, we will fix it up in the combine constants pass. --- .../compiler/brw_fs_combine_constants.cpp | 37 ++--- .../compiler/brw_fs_copy_propagation.cpp | 55 +-- 2 files changed, 56 insertions(+), 36 deletions(-) diff --git a/src/intel/compiler/brw_fs_combine_constants.cpp b/src/intel/compiler/brw_fs_combine_constants.cpp index a26eb67a00a..bb155caeccf 100644 --- a/src/intel/compiler/brw_fs_combine_constants.cpp +++ b/src/intel/compiler/brw_fs_combine_constants.cpp @@ -66,13 +66,31 @@ could_coissue(const struct gen_device_info *devinfo, const fs_inst *inst) * Returns true for instructions that don't support immediate sources. */ static bool -must_promote_imm(const struct gen_device_info *devinfo, const fs_inst *inst) +must_promote_imm(const struct gen_device_info *devinfo, + const fs_inst *inst, const int src_idx) { switch (inst->opcode) { case SHADER_OPCODE_INT_QUOTIENT: case SHADER_OPCODE_INT_REMAINDER: case SHADER_OPCODE_POW: - return devinfo->gen < 8; + return src_idx != 1 || devinfo->gen < 8; + case BRW_OPCODE_BFI1: + case BRW_OPCODE_ASR: + case BRW_OPCODE_SHL: + case BRW_OPCODE_SHR: + case BRW_OPCODE_SUBB: + case BRW_OPCODE_MACH: + case BRW_OPCODE_MUL: + case SHADER_OPCODE_MULH: + case BRW_OPCODE_ADD: + case BRW_OPCODE_OR: + case BRW_OPCODE_AND: + case BRW_OPCODE_XOR: + case BRW_OPCODE_ADDC: + case BRW_OPCODE_CMP: + case BRW_OPCODE_IF: + case BRW_OPCODE_SEL: + return src_idx != 1; case BRW_OPCODE_MAD: case BRW_OPCODE_LRP: return true; @@ -335,13 +353,18 @@ fs_visitor::opt_combine_constants() foreach_block_and_inst(block, fs_inst, inst, cfg) { ip++; - if (!could_coissue(devinfo, inst) && !must_promote_imm(devinfo, inst)) + const bool is_coissue_candidate = could_coissue(devinfo, inst); + if (!is_coissue_candidate && !must_promote_imm(devinfo, inst, -1)) continue; for (int i = 0; i < inst->sources; i++) { if (inst->src[i].file != IMM) continue; + const bool must_promote = must_promote_imm(devinfo, inst, i); + if 
(!is_coissue_candidate && !must_promote) +continue; + char data[8]; brw_reg_type type; if (!get_constant_value(devinfo, inst, i, data, &type)) @@ -357,8 +380,8 @@ fs_visitor::opt_combine_constants() imm->inst = NULL; imm->block = intersection; imm->uses->push_tail(link(const_ctx, &inst->src[i])); -imm->uses_by_coissue += could_coissue(devinfo, inst); -imm->must_promote = imm->must_promote || must_promote_imm(devinfo, inst); +imm->uses_by_coissue += is_coissue_candidate; +imm->must_promote = imm->must_promote || must_promote; imm->last_use_ip = ip; if (type == BRW_REGISTER_TYPE_HF) imm->is_half_float = true; @@ -371,8 +394,8 @@ fs_visitor::opt_combine_constants() memcpy(imm->bytes, data, size); imm->size = size; imm->is_half_float = type == BRW_REGISTER_TYPE_HF; -imm->uses_by_coissue = could_coissue(devinfo, inst); -imm->must_promote = must_promote_imm(devinfo, inst); +imm->uses_by_coissue = is_coissue_candidate; +imm->must_promote = must_promote; imm->first_use_ip = ip; imm->last_use_ip = ip; } diff --git a/src/intel/compiler/brw_fs_copy_propagation.cpp b/src/intel/compiler/brw_fs_copy_propagation.cpp index c11b05b128a..33454e50861 100644 --- a/src/intel/compiler/brw_fs_copy_propagation.cpp +++ b/src/intel/compiler/brw_fs_copy_propagation.cpp @@ -624,10 +624,8 @@ fs_visitor::try_constant_propagate(fs_inst *inst, acp_entry *entry) case BRW_OPCODE_SHL: case BRW_OPCODE_SHR: case BRW_OPCODE_SUBB: - if (i == 1) { -inst->src[i] = val; -progress = true; - } + inst->src[i] = val; + progress = true; break; case BRW_OPCODE_MACH: @@ -638,10 +636,7 @@ fs_visitor::try_constant_propagate(fs_inst *inst, acp_entry *entry) case BRW_OPCODE_AND: case BRW_OPCODE_XOR: case BRW_OPCODE_ADDC: - if (i == 1) { -inst->src[i] = val; -progress = true; - } else if (i == 0 && inst->src[1].file != IMM) { + if (i == 0 && inst->src[1].file != IMM) { /* Fit this constant in by commuting the operands. * Exception: we can't do this for 32-bit integer MUL/MACH * because it's asymmetric. 
@@ -653,24 +648,25 @@ fs_visitor::try_constant_propagate(fs_inst *inst, acp_entry *entry) * Integer MUL with a non-accumulator destination will be lowered * by lower_integer_multiplication(), so don't restrict it. */ -if (((inst->opcode
[Mesa-dev] [PATCH 3/3] intel/compiler: implement more algebraic optimizations
Now that we propagate constants to the first source of 2src instructions we see more opportunities of constant folding in the backend. Shader-db results on KBL: total instructions in shared programs: 14965607 -> 14855983 (-0.73%) instructions in affected programs: 3988102 -> 3878478 (-2.75%) helped: 14292 HURT: 59 total cycles in shared programs: 344324295 -> 340656008 (-1.07%) cycles in affected programs: 247527740 -> 243859453 (-1.48%) helped: 14056 HURT: 3314 total loops in shared programs: 4283 -> 4283 (0.00%) loops in affected programs: 0 -> 0 helped: 0 HURT: 0 total spills in shared programs: 27812 -> 24350 (-12.45%) spills in affected programs: 24921 -> 21459 (-13.89%) helped: 345 HURT: 19 total fills in shared programs: 24173 -> 22032 (-8.86%) fills in affected programs: 21124 -> 18983 (-10.14%) helped: 355 HURT: 25 LOST: 0 GAINED: 5 --- src/intel/compiler/brw_fs.cpp | 203 -- 1 file changed, 195 insertions(+), 8 deletions(-) diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index 2358acbeb59..b2b60237c82 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -2583,9 +2583,55 @@ fs_visitor::opt_algebraic() break; case BRW_OPCODE_MUL: - if (inst->src[1].file != IMM) + if (inst->src[0].file != IMM && inst->src[1].file != IMM) continue; + /* Constant folding */ + if (inst->src[0].file == IMM && inst->src[1].file == IMM) { +assert(inst->src[0].type == inst->src[1].type); +bool local_progress = true; +switch (inst->src[0].type) { +case BRW_REGISTER_TYPE_HF: { + float v1 = _mesa_half_to_float(inst->src[0].ud & 0xffffu); + float v2 = _mesa_half_to_float(inst->src[1].ud & 0xffffu); + inst->src[0] = brw_imm_w(_mesa_float_to_half(v1 * v2)); + break; +} +case BRW_REGISTER_TYPE_W: { + int16_t v1 = inst->src[0].ud & 0xffffu; + int16_t v2 = inst->src[1].ud & 0xffffu; + inst->src[0] = brw_imm_w(v1 * v2); + break; +} +case BRW_REGISTER_TYPE_UW: { + uint16_t v1 = inst->src[0].ud & 0xffffu; + uint16_t v2 = inst->src[1].ud & 0xffffu; + inst->src[0] 
= brw_imm_uw(v1 * v2); + break; +} +case BRW_REGISTER_TYPE_F: + inst->src[0].f *= inst->src[1].f; + break; +case BRW_REGISTER_TYPE_D: + inst->src[0].d *= inst->src[1].d; + break; +case BRW_REGISTER_TYPE_UD: + inst->src[0].ud *= inst->src[1].ud; + break; +default: + local_progress = false; + break; +}; + +if (local_progress) { + inst->opcode = BRW_OPCODE_MOV; + inst->src[1] = reg_undef; + progress = true; + break; +} + } + + /* a * 1.0 = a */ if (inst->src[1].is_one()) { inst->opcode = BRW_OPCODE_MOV; @@ -2594,6 +2640,14 @@ fs_visitor::opt_algebraic() break; } + if (inst->src[0].is_one()) { +inst->opcode = BRW_OPCODE_MOV; +inst->src[0] = inst->src[1]; +inst->src[1] = reg_undef; +progress = true; +break; + } + /* a * -1.0 = -a */ if (inst->src[1].is_negative_one()) { inst->opcode = BRW_OPCODE_MOV; @@ -2603,27 +2657,160 @@ fs_visitor::opt_algebraic() break; } - if (inst->src[0].file == IMM) { -assert(inst->src[0].type == BRW_REGISTER_TYPE_F); + if (inst->src[0].is_negative_one()) { +inst->opcode = BRW_OPCODE_MOV; +inst->src[0] = inst->src[1]; +inst->src[0].negate = !inst->src[1].negate; +inst->src[1] = reg_undef; +progress = true; +break; + } + + /* a * 0 = 0 (this is not exact for floating point) */ + if (inst->src[1].is_zero() && + brw_reg_type_is_integer(inst->src[1].type)) { +inst->opcode = BRW_OPCODE_MOV; +inst->src[0] = inst->src[1]; +inst->src[1] = reg_undef; +progress = true; +break; + } + + if (inst->src[0].is_zero() && + brw_reg_type_is_integer(inst->src[0].type)) { inst->opcode = BRW_OPCODE_MOV; -inst->src[0].f *= inst->src[1].f; inst->src[1] = reg_undef; progress = true; break; } break; case BRW_OPCODE_ADD: - if (inst->src[1].file != IMM) + if (inst->src[0].file != IMM && inst->src[1].file != IMM) continue; - if (inst->src[0].file == IMM) { -
[Mesa-dev] [PATCH 1/3] intel/compiler: allow constant propagation for int quotient and remainder
And let combine constants promote the constants if needed. --- src/intel/compiler/brw_fs_combine_constants.cpp | 2 ++ src/intel/compiler/brw_fs_copy_propagation.cpp | 4 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/intel/compiler/brw_fs_combine_constants.cpp b/src/intel/compiler/brw_fs_combine_constants.cpp index db7b14a8312..a26eb67a00a 100644 --- a/src/intel/compiler/brw_fs_combine_constants.cpp +++ b/src/intel/compiler/brw_fs_combine_constants.cpp @@ -69,6 +69,8 @@ static bool must_promote_imm(const struct gen_device_info *devinfo, const fs_inst *inst) { switch (inst->opcode) { + case SHADER_OPCODE_INT_QUOTIENT: + case SHADER_OPCODE_INT_REMAINDER: case SHADER_OPCODE_POW: return devinfo->gen < 8; case BRW_OPCODE_MAD: diff --git a/src/intel/compiler/brw_fs_copy_propagation.cpp b/src/intel/compiler/brw_fs_copy_propagation.cpp index 8b904ab356b..c11b05b128a 100644 --- a/src/intel/compiler/brw_fs_copy_propagation.cpp +++ b/src/intel/compiler/brw_fs_copy_propagation.cpp @@ -611,10 +611,6 @@ fs_visitor::try_constant_propagate(fs_inst *inst, acp_entry *entry) case SHADER_OPCODE_INT_QUOTIENT: case SHADER_OPCODE_INT_REMAINDER: - /* FINISHME: Promote non-float constants and remove this. */ - if (devinfo->gen < 8) -break; - /* fallthrough */ case SHADER_OPCODE_POW: /* Allow constant propagation into src1 (except on Gen 6 which * doesn't support scalar source math), and let constant combining -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 0/3] intel: propagate constants to first source of 2-src instructions
This little series lives on top of my VK_KHR_shader_float16_int8 branch, since it depends on having a more flexible combine constants pass, which is included with that work. A branch with that on this series is available here: https://github.com/Igalia/mesa/tree/itoral/VK_KHR_shader_float16_int8_combine_constants Shader-db results on KBL: total instructions in shared programs: 14965607 -> 14855983 (-0.73%) instructions in affected programs: 3988102 -> 3878478 (-2.75%) helped: 14292 HURT: 59 total cycles in shared programs: 344324295 -> 340656008 (-1.07%) cycles in affected programs: 247527740 -> 243859453 (-1.48%) helped: 14056 HURT: 3314 total loops in shared programs: 4283 -> 4283 (0.00%) loops in affected programs: 0 -> 0 helped: 0 HURT: 0 total spills in shared programs: 27812 -> 24350 (-12.45%) spills in affected programs: 24921 -> 21459 (-13.89%) helped: 345 HURT: 19 total fills in shared programs: 24173 -> 22032 (-8.86%) fills in affected programs: 21124 -> 18983 (-10.14%) helped: 355 HURT: 25 LOST: 0 GAINED: 5 Initially the series included propagation to BFE, BFI2 and BFREV, but that actually led to significantly worse shader-db results, so that part has been removed. Iago Iago Toral Quiroga (3): intel/compiler: allow constant propagation for int quotient and reminder intel/compiler: allow constant propagation to first source of 2-src instructions intel/compiler: implement more algebraic optimizations src/intel/compiler/brw_fs.cpp | 203 +- .../compiler/brw_fs_combine_constants.cpp | 39 +++- .../compiler/brw_fs_copy_propagation.cpp | 59 +++-- 3 files changed, 253 insertions(+), 48 deletions(-) -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v5 33/40] intel/compiler: also set F execution type for mixed float mode in BDW
The section 'Execution Data Types' of 3D Media GPGPU volume, which describes execution types, is exactly the same in BDW and SKL+. Also, this section states that there is a single execution type, so it makes sense that this is the wider of the two floating point types involved in mixed float mode, which is what we do for SKL+ and CHV. v2: - Make sure we also account for the destination type in mixed mode (Curro). --- src/intel/compiler/brw_eu_validate.c | 39 +--- 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/src/intel/compiler/brw_eu_validate.c b/src/intel/compiler/brw_eu_validate.c index 358a0347a93..e0010f0fb07 100644 --- a/src/intel/compiler/brw_eu_validate.c +++ b/src/intel/compiler/brw_eu_validate.c @@ -348,6 +348,17 @@ is_unsupported_inst(const struct gen_device_info *devinfo, return brw_opcode_desc(devinfo, brw_inst_opcode(devinfo, inst)) == NULL; } +/** + * Returns whether a combination of two types would qualify as mixed float + * operation mode + */ +static inline bool +types_are_mixed_float(enum brw_reg_type t0, enum brw_reg_type t1) +{ + return (t0 == BRW_REGISTER_TYPE_F && t1 == BRW_REGISTER_TYPE_HF) || + (t1 == BRW_REGISTER_TYPE_F && t0 == BRW_REGISTER_TYPE_HF); +} + static enum brw_reg_type execution_type_for_type(enum brw_reg_type type) { @@ -390,20 +401,24 @@ execution_type(const struct gen_device_info *devinfo, const brw_inst *inst) enum brw_reg_type src0_exec_type, src1_exec_type; /* Execution data type is independent of destination data type, except in -* mixed F/HF instructions on CHV and SKL+. +* mixed F/HF instructions. 
*/ enum brw_reg_type dst_exec_type = brw_inst_dst_type(devinfo, inst); src0_exec_type = execution_type_for_type(brw_inst_src0_type(devinfo, inst)); if (num_sources == 1) { - if ((devinfo->gen >= 9 || devinfo->is_cherryview) && - src0_exec_type == BRW_REGISTER_TYPE_HF) { + if (src0_exec_type == BRW_REGISTER_TYPE_HF) return dst_exec_type; - } return src0_exec_type; } src1_exec_type = execution_type_for_type(brw_inst_src1_type(devinfo, inst)); + if (types_are_mixed_float(src0_exec_type, src1_exec_type) || + types_are_mixed_float(src0_exec_type, dst_exec_type) || + types_are_mixed_float(src1_exec_type, dst_exec_type)) { + return BRW_REGISTER_TYPE_F; + } + if (src0_exec_type == src1_exec_type) return src0_exec_type; @@ -431,18 +446,12 @@ execution_type(const struct gen_device_info *devinfo, const brw_inst *inst) src1_exec_type == BRW_REGISTER_TYPE_DF) return BRW_REGISTER_TYPE_DF; - if (devinfo->gen >= 9 || devinfo->is_cherryview) { - if (dst_exec_type == BRW_REGISTER_TYPE_F || - src0_exec_type == BRW_REGISTER_TYPE_F || - src1_exec_type == BRW_REGISTER_TYPE_F) { - return BRW_REGISTER_TYPE_F; - } else { - return BRW_REGISTER_TYPE_HF; - } - } + if (src0_exec_type == BRW_REGISTER_TYPE_F || + src1_exec_type == BRW_REGISTER_TYPE_F) + return BRW_REGISTER_TYPE_F; - assert(src0_exec_type == BRW_REGISTER_TYPE_F); - return BRW_REGISTER_TYPE_F; + assert(src0_exec_type == BRW_REGISTER_TYPE_HF); + return BRW_REGISTER_TYPE_HF; } /** -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v5 02/40] intel/compiler: add a NIR pass to lower conversions
Some conversions are not directly supported in hardware and need to be split in two conversion instructions going through an intermediary type. Doing this at the NIR level simplifies a bit the complexity in the backend. v2: - Consider fp16 rounding conversion opcodes - Properly handle swizzles on conversion sources. v3 - Run the pass earlier, right after nir_opt_algebraic_late (Jason) - NIR alu output types already have the bit-size (Jason) - Use 'is_conversion' to identify conversion operations (Jason) v4: - Be careful about the intermediate types we use so we don't lose range and avoid incorrect rounding semantics (Jason) Reviewed-by: Topi Pohjolainen (v1) --- src/intel/Makefile.sources| 1 + src/intel/compiler/brw_nir.c | 2 + src/intel/compiler/brw_nir.h | 2 + .../compiler/brw_nir_lower_conversions.c | 169 ++ src/intel/compiler/meson.build| 1 + 5 files changed, 175 insertions(+) create mode 100644 src/intel/compiler/brw_nir_lower_conversions.c diff --git a/src/intel/Makefile.sources b/src/intel/Makefile.sources index 94a28d370e8..9975daa3ad1 100644 --- a/src/intel/Makefile.sources +++ b/src/intel/Makefile.sources @@ -83,6 +83,7 @@ COMPILER_FILES = \ compiler/brw_nir_analyze_boolean_resolves.c \ compiler/brw_nir_analyze_ubo_ranges.c \ compiler/brw_nir_attribute_workarounds.c \ + compiler/brw_nir_lower_conversions.c \ compiler/brw_nir_lower_cs_intrinsics.c \ compiler/brw_nir_lower_image_load_store.c \ compiler/brw_nir_lower_mem_access_bit_sizes.c \ diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c index 9dbf06004a4..7e3dbc9e447 100644 --- a/src/intel/compiler/brw_nir.c +++ b/src/intel/compiler/brw_nir.c @@ -876,6 +876,8 @@ brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler, OPT(nir_opt_algebraic_late); + OPT(brw_nir_lower_conversions); + OPT(nir_lower_to_source_mods, nir_lower_all_source_mods); OPT(nir_copy_prop); OPT(nir_opt_dce); diff --git a/src/intel/compiler/brw_nir.h b/src/intel/compiler/brw_nir.h index 
bc81950d47e..662b2627e95 100644 --- a/src/intel/compiler/brw_nir.h +++ b/src/intel/compiler/brw_nir.h @@ -114,6 +114,8 @@ void brw_nir_lower_tcs_outputs(nir_shader *nir, const struct brw_vue_map *vue, GLenum tes_primitive_mode); void brw_nir_lower_fs_outputs(nir_shader *nir); +bool brw_nir_lower_conversions(nir_shader *nir); + bool brw_nir_lower_image_load_store(nir_shader *nir, const struct gen_device_info *devinfo); void brw_nir_rewrite_image_intrinsic(nir_intrinsic_instr *intrin, diff --git a/src/intel/compiler/brw_nir_lower_conversions.c b/src/intel/compiler/brw_nir_lower_conversions.c new file mode 100644 index 000..9aff30b568b --- /dev/null +++ b/src/intel/compiler/brw_nir_lower_conversions.c @@ -0,0 +1,169 @@ +/* + * Copyright © 2018 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include "brw_nir.h" +#include "compiler/nir/nir_builder.h" + +static nir_op +get_conversion_op(nir_alu_type src_type, + unsigned src_bit_size, + nir_alu_type dst_type, + unsigned dst_bit_size, + nir_rounding_mode rounding_mode) +{ + nir_alu_type src_full_type = (nir_alu_type) (src_type | src_bit_size); + nir_alu_type dst_full_type = (nir_alu_type) (dst_type | dst_bit_size); + + return nir_type_conversion_op(src_full_type, dst_full_type, rounding_mode); +} + +static nir_rounding_mode +get_opcode_rounding_mode(nir_op op) +{ + switch (op) { + case nir_op_f2f16_rtz: + return nir_rounding_mode_rtz; + case nir_op_f2f16_rtne: + return nir_rounding_mode_rtne; + default: + return nir_rounding_mode_undef; + } +} + +static v
[Mesa-dev] [PATCH v4 34/40] intel/compiler: validate region restrictions for half-float conversions
--- src/intel/compiler/brw_eu_validate.c| 64 - src/intel/compiler/test_eu_validate.cpp | 122 2 files changed, 185 insertions(+), 1 deletion(-) diff --git a/src/intel/compiler/brw_eu_validate.c b/src/intel/compiler/brw_eu_validate.c index 000a05cb6ac..203641fecb9 100644 --- a/src/intel/compiler/brw_eu_validate.c +++ b/src/intel/compiler/brw_eu_validate.c @@ -531,7 +531,69 @@ general_restrictions_based_on_operand_types(const struct gen_device_info *devinf exec_type_size == 8 && dst_type_size == 4) dst_type_size = 8; - if (exec_type_size > dst_type_size) { + /* From the BDW+ PRM: +* +*"There is no direct conversion from HF to DF or DF to HF. +* There is no direct conversion from HF to Q/UQ or Q/UQ to HF." +*/ + enum brw_reg_type src0_type = brw_inst_src0_type(devinfo, inst); + ERROR_IF(brw_inst_opcode(devinfo, inst) == BRW_OPCODE_MOV && +((dst_type == BRW_REGISTER_TYPE_HF && type_sz(src0_type) == 8) || + (dst_type_size == 8 && src0_type == BRW_REGISTER_TYPE_HF)), +"There are no direct conversion between 64-bit types and HF"); + + /* From the BDW+ PRM: +* +* "Conversion between Integer and HF (Half Float) must be +*DWord-aligned and strided by a DWord on the destination." +* +* But this seems to be expanded on CHV and SKL+ by: +* +* "There is a relaxed alignment rule for word destinations. When +*the destination type is word (UW, W, HF), destination data types +*can be aligned to either the lowest word or the second lowest +*word of the execution channel. This means the destination data +*words can be either all in the even word locations or all in the +*odd word locations." +* +* We do not implement the second rule as is though, since empirical testing +* shows inconsistencies: +* - It suggests that packed 16-bit is not allowed, which is not true. +* - It suggests that conversions from Q/DF to W (which need to be 64-bit +* aligned on the destination) are not possible, which is not true. 
+* - It suggests that conversions from 16-bit executions types to W need +* to be 32-bit aligned, which doesn't seem to be necessary. +* +* So from this rule we only validate the implication that conversion from +* F to HF needs to be DWord aligned too (in BDW this is limited to +* conversions from integer types). +*/ + bool is_half_float_conversion = + brw_inst_opcode(devinfo, inst) == BRW_OPCODE_MOV && + dst_type != src0_type && + (dst_type == BRW_REGISTER_TYPE_HF || src0_type == BRW_REGISTER_TYPE_HF); + + if (is_half_float_conversion) { + assert(devinfo->gen >= 8); + + if ((dst_type == BRW_REGISTER_TYPE_HF && brw_reg_type_is_integer(src0_type)) || + (brw_reg_type_is_integer(dst_type) && src0_type == BRW_REGISTER_TYPE_HF)) { + ERROR_IF(dst_stride * dst_type_size != 4, + "Conversions between integer and half-float must be strided " + "by a DWord on the destination"); + + unsigned subreg = brw_inst_dst_da1_subreg_nr(devinfo, inst); + ERROR_IF(subreg % 4 != 0, + "Conversions between integer and half-float must be aligned " + "to a DWord on the destination"); + } else if ((devinfo->is_cherryview || devinfo->gen >= 9) && + dst_type == BRW_REGISTER_TYPE_HF) { + ERROR_IF(dst_stride != 2, + "Conversions to HF must have either all words in even word " + "locations or all words in odd word locations"); + } + + } else if (exec_type_size > dst_type_size) { if (!(dst_type_is_byte && inst_is_raw_move(devinfo, inst))) { ERROR_IF(dst_stride * dst_type_size != exec_type_size, "Destination stride must be equal to the ratio of the sizes " diff --git a/src/intel/compiler/test_eu_validate.cpp b/src/intel/compiler/test_eu_validate.cpp index 73300b23122..1557b6d2452 100644 --- a/src/intel/compiler/test_eu_validate.cpp +++ b/src/intel/compiler/test_eu_validate.cpp @@ -848,6 +848,128 @@ TEST_P(validation_test, byte_destination_relaxed_alignment) } } +TEST_P(validation_test, half_float_conversion) +{ + static const struct { + enum brw_reg_type dst_type; + enum brw_reg_type src_type; + 
unsigned dst_stride; + unsigned dst_subnr; + bool expected_result; + } inst[] = { +#define INST(dst_type, src_type, dst_stride, dst_subnr, expected_result) \ + { \ + BRW_REGISTER_TYPE_##dst_type,\ + BRW_REGISTER_TYPE_##src_type,\ + BRW_HORIZONTAL_STRIDE_##dst_stride, \ + dst_subnr,
[Mesa-dev] [PATCH v4 03/40] intel/compiler: split float to 64-bit opcodes from int to 64-bit
Going forward having these split is a bit more convenient since these two groups have different restrictions. v2: - Rebased on top of new regioning lowering pass. Reviewed-by: Topi Pohjolainen (v1) Reviewed-by: Jason Ekstrand --- src/intel/compiler/brw_fs_nir.cpp | 7 +++ 1 file changed, 7 insertions(+) diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index 1041296b903..bb7591422d4 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -801,10 +801,17 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr) case nir_op_f2f64: case nir_op_f2i64: case nir_op_f2u64: + assert(type_sz(op[0].type) > 2); /* brw_nir_lower_conversions */ + inst = bld.MOV(result, op[0]); + inst->saturate = instr->dest.saturate; + break; + case nir_op_i2f64: case nir_op_i2i64: case nir_op_u2f64: case nir_op_u2u64: + assert(type_sz(op[0].type) > 1); /* brw_nir_lower_conversions */ + /* fallthrough */ case nir_op_f2f32: case nir_op_f2i32: case nir_op_f2u32: -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v4 24/40] intel/compiler: implement isign for int8
Reviewed-by: Jason Ekstrand --- src/intel/compiler/brw_fs_nir.cpp | 25 + 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index 3a6e4a2eb60..40c0481ac53 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -906,11 +906,28 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr) * Predicated OR sets 1 if val is positive. */ uint32_t bit_size = nir_dest_bit_size(instr->dest.dest); - assert(bit_size == 32 || bit_size == 16); - fs_reg zero = bit_size == 32 ? brw_imm_d(0) : brw_imm_w(0); - fs_reg one = bit_size == 32 ? brw_imm_d(1) : brw_imm_w(1); - fs_reg shift = bit_size == 32 ? brw_imm_d(31) : brw_imm_w(15); + fs_reg zero, one, shift; + switch (bit_size) { + case 32: + zero = brw_imm_d(0); + one = brw_imm_d(1); + shift = brw_imm_d(31); + break; + case 16: + zero = brw_imm_w(0); + one = brw_imm_w(1); + shift = brw_imm_w(15); + break; + case 8: { + zero = setup_imm_b(bld, 0); + one = setup_imm_b(bld, 1); + shift = setup_imm_b(bld, 7); + break; + } + default: + unreachable("unsupported bit-size"); + }; bld.CMP(bld.null_reg_d(), op[0], zero, BRW_CONDITIONAL_G); bld.ASR(result, op[0], shift); -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v4 15/40] intel/compiler: don't compact 3-src instructions with Src1Type or Src2Type bits
We are now using these bits, so don't assert that they are not set. In gen8, if these bits are set compaction is not possible. On gen9 and CHV platforms set_3src_control_index() checks these bits (and others) against a table to validate if the particular bit combination is eligible for compaction or not. v2 - Add more detail in the commit message explaining the situation for SKL+ and CHV (Jason) Reviewed-by: Topi Pohjolainen Reviewed-by: Jason Ekstrand Reviewed-by: Matt Turner --- src/intel/compiler/brw_eu_compact.c | 5 - 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/intel/compiler/brw_eu_compact.c b/src/intel/compiler/brw_eu_compact.c index ae14ef10ec0..20fed254331 100644 --- a/src/intel/compiler/brw_eu_compact.c +++ b/src/intel/compiler/brw_eu_compact.c @@ -928,8 +928,11 @@ has_3src_unmapped_bits(const struct gen_device_info *devinfo, assert(!brw_inst_bits(src, 127, 126) && !brw_inst_bits(src, 105, 105) && !brw_inst_bits(src, 84, 84) && - !brw_inst_bits(src, 36, 35) && !brw_inst_bits(src, 7, 7)); + + /* Src1Type and Src2Type, used for mixed-precision floating point */ + if (brw_inst_bits(src, 36, 35)) + return true; } return false; -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v4 17/40] intel/compiler: set correct precision fields for 3-source float instructions
Source0 and Destination extract the floating-point precision automatically from the SrcType and DstType instruction fields respectively when they are set to types :F or :HF. For Source1 and Source2 operands, we use the new 1-bit fields Src1Type and Src2Type, where 0 means normal precision and 1 means half-precision. Since we always use the type of the destination for all operands when we emit 3-source instructions, we only need set Src1Type and Src2Type to 1 when we are emitting a half-precision instruction. v2: - Set the bit separately for each source based on its type so we can do mixed floating-point mode in the future (Topi). v3: - Use regular citation style for the comment referencing the PRM (Matt). - Decided not to add asserts in the emission code to check that only mixed HF/F types are used since such checks would break negative tests for brw_eu_validate.c (Matt) Reviewed-by: Topi Pohjolainen Reviewed-by: Jason Ekstrand Reviewed-by: Matt Turner --- src/intel/compiler/brw_eu_emit.c | 16 1 file changed, 16 insertions(+) diff --git a/src/intel/compiler/brw_eu_emit.c b/src/intel/compiler/brw_eu_emit.c index 30037e71b00..195c26ab760 100644 --- a/src/intel/compiler/brw_eu_emit.c +++ b/src/intel/compiler/brw_eu_emit.c @@ -843,6 +843,22 @@ brw_alu3(struct brw_codegen *p, unsigned opcode, struct brw_reg dest, */ brw_inst_set_3src_a16_src_type(devinfo, inst, dest.type); brw_inst_set_3src_a16_dst_type(devinfo, inst, dest.type); + + /* From the Bspec, 3D Media GPGPU, Instruction fields, srcType: + * + *"Three source instructions can use operands with mixed-mode + * precision. When SrcType field is set to :f or :hf it defines + * precision for source 0 only, and fields Src1Type and Src2Type + * define precision for other source operands: + * + * 0b = :f. Single precision Float (32-bit). + * 1b = :hf. Half precision Float (16-bit)." 
+ */ + if (src1.type == BRW_REGISTER_TYPE_HF) +brw_inst_set_3src_a16_src1_type(devinfo, inst, 1); + + if (src2.type == BRW_REGISTER_TYPE_HF) +brw_inst_set_3src_a16_src2_type(devinfo, inst, 1); } } -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v4 08/40] intel/compiler: implement 16-bit fsign
v2: - make 16-bit be its own separate case (Jason) v3: - Drop the result_int temporary (Jason) Reviewed-by: Topi Pohjolainen (v1) Reviewed-by: Jason Ekstrand --- src/intel/compiler/brw_fs_nir.cpp | 17 - 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index 4c7a839390c..64e24f86b5a 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -844,7 +844,21 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr) : bld.MOV(result, brw_imm_f(1.0f)); set_predicate(BRW_PREDICATE_NORMAL, inst); - } else if (type_sz(op[0].type) < 8) { + } else if (type_sz(op[0].type) == 2) { + /* AND(val, 0x8000) gives the sign bit. + * + * Predicated OR ORs 1.0 (0x3c00) with the sign bit if val is not zero. + */ + fs_reg zero = retype(brw_imm_uw(0), BRW_REGISTER_TYPE_HF); + bld.CMP(bld.null_reg_f(), op[0], zero, BRW_CONDITIONAL_NZ); + + op[0].type = BRW_REGISTER_TYPE_UW; + result.type = BRW_REGISTER_TYPE_UW; + bld.AND(result, op[0], brw_imm_uw(0x8000u)); + + inst = bld.OR(result, result, brw_imm_uw(0x3c00u)); + inst->predicate = BRW_PREDICATE_NORMAL; + } else if (type_sz(op[0].type) == 4) { /* AND(val, 0x8000) gives the sign bit. * * Predicated OR ORs 1.0 (0x3f80) with the sign bit if val is not @@ -866,6 +880,7 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr) * - The sign is encoded in the high 32-bit of each DF * - We need to produce a DF result. */ + assert(type_sz(op[0].type) == 8); fs_reg zero = vgrf(glsl_type::double_type); bld.MOV(zero, setup_imm_df(bld, 0.0)); -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v4 13/40] intel/compiler: add instruction setters for Src1Type and Src2Type.
The original SrcType is a 3-bit field that takes a subset of the types supported for the hardware for 3-source instructions. Since gen8, when the half-float type was added, 3-source floating point operations can use use mixed precision mode, where not all the operands have the same floating-point precision. While the precision for the first operand is taken from the type in SrcType, the bits in Src1Type (bit 36) and Src2Type (bit 35) define the precision for the other operands (0: normal precision, 1: half precision). Reviewed-by: Topi Pohjolainen Reviewed-by: Matt Turner --- src/intel/compiler/brw_inst.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/intel/compiler/brw_inst.h b/src/intel/compiler/brw_inst.h index 71316f12215..1f55d45125d 100644 --- a/src/intel/compiler/brw_inst.h +++ b/src/intel/compiler/brw_inst.h @@ -222,6 +222,8 @@ F8(3src_src1_negate,39, 39, 40, 40) F8(3src_src1_abs, 38, 38, 39, 39) F8(3src_src0_negate,37, 37, 38, 38) F8(3src_src0_abs, 36, 36, 37, 37) +F8(3src_a16_src1_type, -1, -1, 36, 36) +F8(3src_a16_src2_type, -1, -1, 35, 35) F8(3src_a16_flag_reg_nr,34, 34, 33, 33) F8(3src_a16_flag_subreg_nr, 33, 33, 32, 32) FF(3src_a16_dst_reg_file, -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v4 01/40] compiler/nir: add an is_conversion field to nir_op_info
This is set to True only for numeric conversion opcodes. --- src/compiler/nir/nir.h| 3 ++ src/compiler/nir/nir_opcodes.py | 73 +-- src/compiler/nir/nir_opcodes_c.py | 1 + 3 files changed, 44 insertions(+), 33 deletions(-) diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h index ff2c41faf27..2793662b1d9 100644 --- a/src/compiler/nir/nir.h +++ b/src/compiler/nir/nir.h @@ -926,6 +926,9 @@ typedef struct { nir_alu_type input_types[NIR_MAX_VEC_COMPONENTS]; nir_op_algebraic_property algebraic_properties; + + /* Whether this represents a numeric conversion opcode */ + bool is_conversion; } nir_op_info; extern const nir_op_info nir_op_infos[nir_num_opcodes]; diff --git a/src/compiler/nir/nir_opcodes.py b/src/compiler/nir/nir_opcodes.py index d32005846a6..dc4cd9ac63d 100644 --- a/src/compiler/nir/nir_opcodes.py +++ b/src/compiler/nir/nir_opcodes.py @@ -33,12 +33,13 @@ class Opcode(object): NOTE: this must be kept in sync with nir_op_info """ def __init__(self, name, output_size, output_type, input_sizes, -input_types, algebraic_properties, const_expr): +input_types, is_conversion, algebraic_properties, const_expr): """Parameters: - name is the name of the opcode (prepend nir_op_ for the enum name) - all types are strings that get nir_type_ prepended to them - input_types is a list of types + - is_conversion is true if this opcode represents a type conversion - algebraic_properties is a space-seperated string, where nir_op_is_ is prepended before each entry - const_expr is an expression or series of statements that computes the @@ -70,6 +71,7 @@ class Opcode(object): assert isinstance(input_sizes[0], int) assert isinstance(input_types, list) assert isinstance(input_types[0], str) + assert isinstance(is_conversion, bool) assert isinstance(algebraic_properties, str) assert isinstance(const_expr, str) assert len(input_sizes) == len(input_types) @@ -84,6 +86,7 @@ class Opcode(object): self.output_type = output_type self.input_sizes = input_sizes self.input_types = 
input_types + self.is_conversion = is_conversion self.algebraic_properties = algebraic_properties self.const_expr = const_expr @@ -138,21 +141,22 @@ associative = "associative " opcodes = {} def opcode(name, output_size, output_type, input_sizes, input_types, - algebraic_properties, const_expr): + is_conversion, algebraic_properties, const_expr): assert name not in opcodes opcodes[name] = Opcode(name, output_size, output_type, input_sizes, - input_types, algebraic_properties, const_expr) + input_types, is_conversion, algebraic_properties, + const_expr) def unop_convert(name, out_type, in_type, const_expr): - opcode(name, 0, out_type, [0], [in_type], "", const_expr) + opcode(name, 0, out_type, [0], [in_type], False, "", const_expr) def unop(name, ty, const_expr): - opcode(name, 0, ty, [0], [ty], "", const_expr) + opcode(name, 0, ty, [0], [ty], False, "", const_expr) def unop_horiz(name, output_size, output_type, input_size, input_type, const_expr): - opcode(name, output_size, output_type, [input_size], [input_type], "", - const_expr) + opcode(name, output_size, output_type, [input_size], [input_type], + False, "", const_expr) def unop_reduce(name, output_size, output_type, input_type, prereduce_expr, reduce_expr, final_expr): @@ -173,6 +177,8 @@ def unop_reduce(name, output_size, output_type, input_type, prereduce_expr, unop_horiz(name + "4", output_size, output_type, 4, input_type, final(reduce_(reduce_(src0, src1), reduce_(src2, src3 +def unop_numeric_convert(name, out_type, in_type, const_expr): + opcode(name, 0, out_type, [0], [in_type], True, "", const_expr) # These two move instructions differ in what modifiers they support and what # the negate modifier means. Otherwise, they are identical. 
@@ -215,13 +221,13 @@ for src_t in [tint, tuint, tfloat, tbool]: if bit_size == 16 and dst_t == tfloat and src_t == tfloat: rnd_modes = ['_rtne', '_rtz', ''] for rnd_mode in rnd_modes: - unop_convert("{0}2{1}{2}{3}".format(src_t[0], dst_t[0], - bit_size, rnd_mode), - dst_t + str(bit_size), src_t, "src0") + unop_numeric_convert("{0}2{1}{2}{3}".format(src_t[0], dst_t[0], + bit_size, rnd_mode), + dst_t + str(bit_size), src_t, "src0") else: conv_expr = "src0 != 0" if dst_t == tbool else "src0" - unop_conve
[Mesa-dev] [PATCH v4 37/40] intel/compiler: validate region restrictions for mixed float mode
--- src/intel/compiler/brw_eu_validate.c| 256 ++ src/intel/compiler/test_eu_validate.cpp | 618 2 files changed, 874 insertions(+) diff --git a/src/intel/compiler/brw_eu_validate.c b/src/intel/compiler/brw_eu_validate.c index ed9c8fe59dd..a61d4c46e81 100644 --- a/src/intel/compiler/brw_eu_validate.c +++ b/src/intel/compiler/brw_eu_validate.c @@ -170,6 +170,13 @@ src1_is_null(const struct gen_device_info *devinfo, const brw_inst *inst) brw_inst_src1_da_reg_nr(devinfo, inst) == BRW_ARF_NULL; } +static bool +src0_is_acc(const struct gen_device_info *devinfo, const brw_inst *inst) +{ + return brw_inst_src0_reg_file(devinfo, inst) == BRW_ARCHITECTURE_REGISTER_FILE && + brw_inst_src0_da_reg_nr(devinfo, inst) == BRW_ARF_ACCUMULATOR; +} + static bool src0_is_grf(const struct gen_device_info *devinfo, const brw_inst *inst) { @@ -847,6 +854,254 @@ general_restrictions_on_region_parameters(const struct gen_device_info *devinfo, return error_msg; } +static struct string +special_restrictions_for_mixed_float_mode(const struct gen_device_info *devinfo, + const brw_inst *inst) +{ + struct string error_msg = { .str = NULL, .len = 0 }; + + unsigned opcode = brw_inst_opcode(devinfo, inst); + unsigned num_sources = num_sources_from_inst(devinfo, inst); + if (num_sources >= 3) + return error_msg; + + if (!is_mixed_float(devinfo, inst)) + return error_msg; + + unsigned exec_size = 1 << brw_inst_exec_size(devinfo, inst); + bool is_align16 = brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_16; + + enum brw_reg_type src0_type = brw_inst_src0_type(devinfo, inst); + enum brw_reg_type src1_type = brw_inst_src1_type(devinfo, inst); + enum brw_reg_type dst_type = brw_inst_dst_type(devinfo, inst); + + unsigned dst_stride = STRIDE(brw_inst_dst_hstride(devinfo, inst)); + bool dst_is_packed = is_packed(exec_size * dst_stride, exec_size, dst_stride); + + /* From the SKL PRM, Special Restrictions for Handling Mixed Mode +* Float Operations: +* +*"Indirect addressing on source is not supported when 
source and +* destination data types are mixed float." +* +* Indirect addressing is only supported on the first source, so we only +* check that. +*/ + ERROR_IF(types_are_mixed_float(dst_type, src0_type) && +brw_inst_src0_address_mode(devinfo, inst) != BRW_ADDRESS_DIRECT, +"Indirect addressing on source is not supported when source and " +"destination data types are mixed float"); + + /* From the SKL PRM, Special Restrictions for Handling Mixed Mode +* Float Operations: +* +*"No SIMD16 in mixed mode when destination is f32. Instruction +* execution size must be no more than 8." +*/ + ERROR_IF(exec_size > 8 && dst_type == BRW_REGISTER_TYPE_F, +"Mixed float mode with 32-bit float destination is limited " +"to SIMD8"); + + if (is_align16) { + /* From the SKL PRM, Special Restrictions for Handling Mixed Mode + * Float Operations: + * + * "In Align16 mode, when half float and float data types are mixed + *between source operands OR between source and destination operands, + *the register content are assumed to be packed." + * + * Since Align16 doesn't have a concept of horizontal stride (or width), + * it means that vertical stride must always be 4, since 0 and 2 would + * lead to replicated data, and any other value is disallowed in Align16. + * However, the PRM also says: + * + * "In Align16, vertical stride can never be zero for f16" + * + * Which is oddly redundant and specific considering the more general + * assumption that all operands are assumed to be packed, so we + * understand that this might be hinting that there may be an exception + * for f32 operands with a vstride of 0, so we don't validate this for + * them while we don't have empirical evidence that it is forbidden. 
+ */ + ERROR_IF(brw_inst_src0_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_4 && + (src0_type != BRW_REGISTER_TYPE_F || +brw_inst_src0_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_0), + "Align16 mixed float mode assumes packed data (vstride must " + "be 4 -or 0 for f32 operands-)"); + + ERROR_IF(num_sources >= 2 && + brw_inst_src1_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_4 && + (src1_type != BRW_REGISTER_TYPE_F || +brw_inst_src1_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_0), + "Align16 mixed float mode assumes packed data (vstride must " + "be 4 -or 0 for f32 operands-)"); + + /* From the SKL PRM, Special Restrictions for Handling Mixed Mode + * Float Operations: + * + * "For Align16
[Mesa-dev] [PATCH v4 21/40] intel/compiler: split is_partial_write() into two variants
This function is used in two different scenarios that for 32-bit instructions are the same, but for 16-bit instructions are not. One scenario is that in which we are working at a SIMD8 register level and we need to know if a register is fully defined or written. This is useful, for example, in the context of liveness analysis or register allocation, where we work with units of registers. The other scenario is that in which we want to know if an instruction is writing a full scalar component or just some subset of it. This is useful, for example, in the context of some optimization passes like copy propagation. For 32-bit instructions (or larger), a SIMD8 dispatch will always write at least a full SIMD8 register (32B) if the write is not partial. The function is_partial_write() checks this to determine if we have a partial write. However, when we deal with 16-bit instructions, that logic disables some optimizations that should be safe. For example, a SIMD8 16-bit MOV will only update half of a SIMD register, but it is still a complete write of the variable for a SIMD8 dispatch, so we should not prevent copy propagation in this scenario because we don't write all 32 bytes in the SIMD register or because the write starts at offset 16B (where we pack components Y or W of 16-bit vectors). This is a problem for SIMD8 executions (VS, TCS, TES, GS) of 16-bit instructions, which lose a number of optimizations because of this, most important of which is copy-propagation. This patch splits is_partial_write() into is_partial_reg_write(), which represents the current is_partial_write(), useful for things like liveness analysis, and is_partial_var_write(), which considers the dispatch size to check if we are writing a full variable (rather than a full register) to decide if the write is partial or not, which is what we really want in many optimization passes. Then the patch goes on and rewrites all uses of is_partial_write() to use one or the other version. 
Specifically, we use is_partial_var_write() in the following places: copy propagation, cmod propagation, common subexpression elimination, saturate propagation and sel peephole. Notice that the semantics of is_partial_var_write() exactly match the current implementation of is_partial_write() for anything that is 32-bit or larger, so no changes are expected for 32-bit instructions. Tested against ~5000 tests involving 16-bit instructions in CTS produced the following changes in instruction counts: Patched | Master|%| SIMD8 |621,900 |706,721| -12.00% | SIMD16 | 93,252 | 93,252| 0.00% | As expected, the change only affects SIMD8 dispatches. Reviewed-by: Topi Pohjolainen --- src/intel/compiler/brw_fs.cpp | 31 +++ .../compiler/brw_fs_cmod_propagation.cpp | 20 ++-- .../compiler/brw_fs_copy_propagation.cpp | 8 ++--- src/intel/compiler/brw_fs_cse.cpp | 3 +- .../compiler/brw_fs_dead_code_eliminate.cpp | 2 +- src/intel/compiler/brw_fs_live_variables.cpp | 2 +- src/intel/compiler/brw_fs_reg_allocate.cpp| 2 +- .../compiler/brw_fs_register_coalesce.cpp | 2 +- .../compiler/brw_fs_saturate_propagation.cpp | 7 +++-- src/intel/compiler/brw_fs_sel_peephole.cpp| 4 +-- src/intel/compiler/brw_ir_fs.h| 3 +- 11 files changed, 54 insertions(+), 30 deletions(-) diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index d172c2de4d7..0f04e577de3 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -696,14 +696,33 @@ fs_visitor::limit_dispatch_width(unsigned n, const char *msg) * it. */ bool -fs_inst::is_partial_write() const +fs_inst::is_partial_reg_write() const { return ((this->predicate && this->opcode != BRW_OPCODE_SEL) || - (this->exec_size * type_sz(this->dst.type)) < 32 || !this->dst.is_contiguous() || + (this->exec_size * type_sz(this->dst.type)) < REG_SIZE || this->dst.offset % REG_SIZE != 0); } +/** + * Returns true if the instruction has a flag that means it won't + * update an entire variable for the given dispatch width. 
+ * + * This is only different from is_partial_reg_write() for SIMD8 + * dispatches of 16-bit (or smaller) instructions. + */ +bool +fs_inst::is_partial_var_write(uint32_t dispatch_width) const +{ + const uint32_t type_size = type_sz(this->dst.type); + uint32_t var_size = MIN2(REG_SIZE, dispatch_width * type_size); + + return ((this->predicate && this->opcode != BRW_OPCODE_SEL) || + !this->dst.is_contiguous() || + (this->exec_size * type_sz(this->dst.type)) < var_size || + this->dst.offset % var_size != 0); +} + unsigned fs_inst::components_read(unsigned i) const { @@ -2923,7 +2942,7 @@ fs_v
[Mesa-dev] [PATCH v4 04/40] intel/compiler: handle b2i/b2f with other integer conversion opcodes
Since we handle booleans as integers this makes more sense. v2: - rebased to incorporate new boolean conversion opcodes v3: - rebased on top regioning lowering pass Reviewed-by: Jason Ekstrand (v1) Reviewed-by: Topi Pohjolainen (v2) --- src/intel/compiler/brw_fs_nir.cpp | 16 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index bb7591422d4..b705b3f5bc2 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -788,6 +788,14 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr) inst->saturate = instr->dest.saturate; break; + case nir_op_f2f64: + case nir_op_f2i64: + case nir_op_f2u64: + assert(type_sz(op[0].type) > 2); /* brw_nir_lower_conversions */ + inst = bld.MOV(result, op[0]); + inst->saturate = instr->dest.saturate; + break; + case nir_op_b2i8: case nir_op_b2i16: case nir_op_b2i32: @@ -798,14 +806,6 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr) op[0].type = BRW_REGISTER_TYPE_D; op[0].negate = !op[0].negate; /* fallthrough */ - case nir_op_f2f64: - case nir_op_f2i64: - case nir_op_f2u64: - assert(type_sz(op[0].type) > 2); /* brw_nir_lower_conversions */ - inst = bld.MOV(result, op[0]); - inst->saturate = instr->dest.saturate; - break; - case nir_op_i2f64: case nir_op_i2i64: case nir_op_u2f64: -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v4 33/40] intel/compiler: also set F execution type for mixed float mode in BDW
The section 'Execution Data Types' of 3D Media GPGPU volume, which describes execution types, is exactly the same in BDW and SKL+. Also, this section states that there is a single execution type, so it makes sense that this is the wider of the two floating point types involved in mixed float mode, which is what we do for SKL+ and CHV. --- src/intel/compiler/brw_eu_validate.c | 18 +++--- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/src/intel/compiler/brw_eu_validate.c b/src/intel/compiler/brw_eu_validate.c index 358a0347a93..000a05cb6ac 100644 --- a/src/intel/compiler/brw_eu_validate.c +++ b/src/intel/compiler/brw_eu_validate.c @@ -431,18 +431,14 @@ execution_type(const struct gen_device_info *devinfo, const brw_inst *inst) src1_exec_type == BRW_REGISTER_TYPE_DF) return BRW_REGISTER_TYPE_DF; - if (devinfo->gen >= 9 || devinfo->is_cherryview) { - if (dst_exec_type == BRW_REGISTER_TYPE_F || - src0_exec_type == BRW_REGISTER_TYPE_F || - src1_exec_type == BRW_REGISTER_TYPE_F) { - return BRW_REGISTER_TYPE_F; - } else { - return BRW_REGISTER_TYPE_HF; - } + if (dst_exec_type == BRW_REGISTER_TYPE_F || + src0_exec_type == BRW_REGISTER_TYPE_F || + src1_exec_type == BRW_REGISTER_TYPE_F) { + return BRW_REGISTER_TYPE_F; + } else { + assert(devinfo->gen >= 8 && src0_exec_type == BRW_REGISTER_TYPE_HF); + return BRW_REGISTER_TYPE_HF; } - - assert(src0_exec_type == BRW_REGISTER_TYPE_F); - return BRW_REGISTER_TYPE_F; } /** -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v4 36/40] intel/compiler: skip validating restrictions on operand types for mixed float
Mixed float instructions are those that use both F and HF operands as their sources or destination, except for regular conversions. There are specific rules for mixed float operation mode with its own set of restrictions, which involve rules that are incompatible with general restrictions. For example: "In Align1, destination stride can be smaller than execution type" Instead, we will implement validation for mixed float mode instructions separately in a follow-up patch. --- src/intel/compiler/brw_eu_validate.c | 50 1 file changed, 50 insertions(+) diff --git a/src/intel/compiler/brw_eu_validate.c b/src/intel/compiler/brw_eu_validate.c index b1fdd1ce941..ed9c8fe59dd 100644 --- a/src/intel/compiler/brw_eu_validate.c +++ b/src/intel/compiler/brw_eu_validate.c @@ -461,6 +461,53 @@ is_packed(unsigned vstride, unsigned width, unsigned hstride) return false; } +/** + * Returns whether a combination of two types would qualify as mixed float + * operation mode + */ +static inline bool +types_are_mixed_float(enum brw_reg_type t0, enum brw_reg_type t1) +{ + return (t0 == BRW_REGISTER_TYPE_F && t1 == BRW_REGISTER_TYPE_HF) || + (t1 == BRW_REGISTER_TYPE_F && t0 == BRW_REGISTER_TYPE_HF); +} + +/** + * Returns whether an instruction is using mixed float operation mode + */ +static bool +is_mixed_float(const struct gen_device_info *devinfo, const brw_inst *inst) +{ + if (devinfo->gen < 8) + return false; + + if (inst_is_send(devinfo, inst)) + return false; + + unsigned opcode = brw_inst_opcode(devinfo, inst); + const struct opcode_desc *desc = brw_opcode_desc(devinfo, opcode); + if (desc->ndst == 0) + return false; + + /* FIXME: support 3-src instructions */ + unsigned num_sources = num_sources_from_inst(devinfo, inst); + assert(num_sources < 3); + + enum brw_reg_type dst_type = brw_inst_dst_type(devinfo, inst); + enum brw_reg_type src0_type = brw_inst_src0_type(devinfo, inst); + + if (num_sources == 1) { + return opcode == BRW_OPCODE_MATH && + types_are_mixed_float(src0_type, 
dst_type); + } + + enum brw_reg_type src1_type = brw_inst_src1_type(devinfo, inst); + + return types_are_mixed_float(src0_type, src1_type) || + types_are_mixed_float(src0_type, dst_type) || + types_are_mixed_float(src1_type, dst_type); +} + /** * Checks restrictions listed in "General Restrictions Based on Operand Types" * in the "Register Region Restrictions" section. @@ -487,6 +534,9 @@ general_restrictions_based_on_operand_types(const struct gen_device_info *devinf if (desc->ndst == 0) return (struct string){}; + if (is_mixed_float(devinfo, inst)) + return (struct string){}; + /* The PRMs say: * *Where n is the largest element size in bytes for any source or -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v4 12/40] compiler/nir: add lowering for 16-bit ldexp
v2 (Topi): - Make bit-size handling order be 16-bit, 32-bit, 64-bit - Clamp lower exponent range at -28 instead of -30. Reviewed-by: Topi Pohjolainen Reviewed-by: Jason Ekstrand --- src/compiler/nir/nir_opt_algebraic.py | 9 +++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py index 40eb3de02c3..71c626e1b3f 100644 --- a/src/compiler/nir/nir_opt_algebraic.py +++ b/src/compiler/nir/nir_opt_algebraic.py @@ -790,7 +790,9 @@ for x, y in itertools.product(['f', 'u', 'i'], ['f', 'u', 'i']): def fexp2i(exp, bits): # We assume that exp is already in the right range. - if bits == 32: + if bits == 16: + return ('i2i16', ('ishl', ('iadd', exp, 15), 10)) + elif bits == 32: return ('ishl', ('iadd', exp, 127), 23) elif bits == 64: return ('pack_64_2x32_split', 0, ('ishl', ('iadd', exp, 1023), 20)) @@ -808,7 +810,9 @@ def ldexp(f, exp, bits): # handles a range on exp of [-252, 254] which allows you to create any # value (including denorms if the hardware supports it) and to adjust the # exponent of any normal value to anything you want. - if bits == 32: + if bits == 16: + exp = ('imin', ('imax', exp, -28), 30) + elif bits == 32: exp = ('imin', ('imax', exp, -252), 254) elif bits == 64: exp = ('imin', ('imax', exp, -2044), 2046) @@ -828,6 +832,7 @@ def ldexp(f, exp, bits): return ('fmul', ('fmul', f, pow2_1), pow2_2) optimizations += [ + (('ldexp@16', 'x', 'exp'), ldexp('x', 'exp', 16), 'options->lower_ldexp'), (('ldexp@32', 'x', 'exp'), ldexp('x', 'exp', 32), 'options->lower_ldexp'), (('ldexp@64', 'x', 'exp'), ldexp('x', 'exp', 64), 'options->lower_ldexp'), ] -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v4 00/40] intel: VK_KHR_shader_float16_int8 implementation
The changes in this version address review feedback to v3. The most significant changes include: 1. A more generic constant combining pass that can handle more constant types (not just F and HF) requested by Jason. 2. The addition of assembly validation for half-float restrictions, and also for mixed float mode, requested by Curro. It should be noted that this implementation of VK_KHR_shader_float16_int8 does not emit any mixed mode float instructions at this moment so I have not empirically validated the restrictions implemented here. As always, a branch with these patches is available for testing in the itoral/VK_KHR_shader_float16_int8 branch of the Igalia Mesa repository at https://github.com/Igalia/mesa. Iago Toral Quiroga (40): compiler/nir: add an is_conversion field to nir_op_info intel/compiler: add a NIR pass to lower conversions intel/compiler: split float to 64-bit opcodes from int to 64-bit intel/compiler: handle b2i/b2f with other integer conversion opcodes intel/compiler: assert restrictions on conversions to half-float intel/compiler: lower some 16-bit float operations to 32-bit intel/compiler: handle extended math restrictions for half-float intel/compiler: implement 16-bit fsign intel/compiler: drop unnecessary temporary from 32-bit fsign implementation compiler/nir: add lowering option for 16-bit fmod compiler/nir: add lowering for 16-bit flrp compiler/nir: add lowering for 16-bit ldexp intel/compiler: add instruction setters for Src1Type and Src2Type. 
intel/compiler: add new half-float register type for 3-src instructions intel/compiler: don't compact 3-src instructions with Src1Type or Src2Type bits intel/compiler: allow half-float on 3-source instructions since gen8 intel/compiler: set correct precision fields for 3-source float instructions intel/compiler: fix ddx and ddy for 16-bit float intel/compiler: fix ddy for half-float in Broadwell intel/compiler: workaround for SIMD8 half-float MAD in gen8 intel/compiler: split is_partial_write() into two variants intel/compiler: activate 16-bit bit-size lowerings also for 8-bit intel/compiler: rework conversion opcodes intel/compiler: implement isign for int8 intel/compiler: ask for an integer type if requesting an 8-bit type intel/eu: force stride of 2 on NULL register for Byte instructions intel/compiler: generalize the combine constants pass intel/compiler: implement is_zero, is_one, is_negative_one for 8-bit/16-bit intel/compiler: add a brw_reg_type_is_integer helper intel/compiler: fix cmod propagation for non 32-bit types intel/compiler: remove inexact algebraic optimizations from the backend intel/compiler: skip MAD algebraic optimization for half-float or mixed mode intel/compiler: also set F execution type for mixed float mode in BDW intel/compiler: validate region restrictions for half-float conversions intel/compiler: validate conversions between 64-bit and 8-bit types intel/compiler: skip validating restrictions on operand types for mixed float intel/compiler: validate region restrictions for mixed float mode compiler/spirv: move the check for Int8 capability anv/pipeline: support Float16 and Int8 SPIR-V capabilities in gen8+ anv/device: expose VK_KHR_shader_float16_int8 in gen8+ src/compiler/nir/nir.h| 5 + src/compiler/nir/nir_opcodes.py | 73 +- src/compiler/nir/nir_opcodes_c.py | 1 + src/compiler/nir/nir_opt_algebraic.py | 11 +- src/compiler/shader_info.h| 1 + src/compiler/spirv/spirv_to_nir.c | 11 +- src/intel/Makefile.sources| 1 + 
src/intel/compiler/brw_compiler.c | 2 + src/intel/compiler/brw_eu_compact.c | 5 +- src/intel/compiler/brw_eu_emit.c | 36 +- src/intel/compiler/brw_eu_validate.c | 396 - src/intel/compiler/brw_fs.cpp | 101 ++- .../compiler/brw_fs_cmod_propagation.cpp | 34 +- .../compiler/brw_fs_combine_constants.cpp | 202 - .../compiler/brw_fs_copy_propagation.cpp | 8 +- src/intel/compiler/brw_fs_cse.cpp | 3 +- .../compiler/brw_fs_dead_code_eliminate.cpp | 2 +- src/intel/compiler/brw_fs_generator.cpp | 54 +- src/intel/compiler/brw_fs_live_variables.cpp | 2 +- src/intel/compiler/brw_fs_lower_regioning.cpp | 39 +- src/intel/compiler/brw_fs_nir.cpp | 87 +- src/intel/compiler/brw_fs_reg_allocate.cpp| 2 +- .../compiler/brw_fs_register_coalesce.cpp | 2 +- .../compiler/brw_fs_saturate_propagation.cpp | 7 +- src/intel/compiler/brw_fs_sel_peephole.cpp| 4 +- src/intel/compiler/brw_inst.h | 2 + src/intel/compiler/brw_ir_fs.h| 3 +- src/intel/compiler/brw_nir.c | 22 +- src/intel/compiler/brw_nir.h | 2 + .../com
[Mesa-dev] [PATCH v4 14/40] intel/compiler: add new half-float register type for 3-src instructions
This is available since gen8. v2: restore previously existing assertion. v3: don't use separate tables for gen7 and gen8, just assert that we don't use half-float before gen8 (Matt) Reviewed-by: Topi Pohjolainen (v1) --- src/intel/compiler/brw_reg_type.c | 4 1 file changed, 4 insertions(+) diff --git a/src/intel/compiler/brw_reg_type.c b/src/intel/compiler/brw_reg_type.c index 60240ba1513..feabee2f53b 100644 --- a/src/intel/compiler/brw_reg_type.c +++ b/src/intel/compiler/brw_reg_type.c @@ -138,6 +138,7 @@ enum hw_3src_reg_type { GEN7_3SRC_TYPE_D = 1, GEN7_3SRC_TYPE_UD = 2, GEN7_3SRC_TYPE_DF = 3, + GEN8_3SRC_TYPE_HF = 4, /** When ExecutionDatatype is 1: @{ */ GEN10_ALIGN1_3SRC_REG_TYPE_HF = 0b000, @@ -166,6 +167,7 @@ static const struct hw_3src_type { [BRW_REGISTER_TYPE_D] = { GEN7_3SRC_TYPE_D }, [BRW_REGISTER_TYPE_UD] = { GEN7_3SRC_TYPE_UD }, [BRW_REGISTER_TYPE_DF] = { GEN7_3SRC_TYPE_DF }, + [BRW_REGISTER_TYPE_HF] = { GEN8_3SRC_TYPE_HF }, }, gen10_hw_3src_align1_type[] = { #define E(x) BRW_ALIGN1_3SRC_EXEC_TYPE_##x [0 ... BRW_REGISTER_TYPE_LAST] = { INVALID }, @@ -258,6 +260,7 @@ brw_reg_type_to_a16_hw_3src_type(const struct gen_device_info *devinfo, enum brw_reg_type type) { assert(type < ARRAY_SIZE(gen7_hw_3src_type)); + assert(devinfo->gen >= 8 || type != BRW_REGISTER_TYPE_HF); assert(gen7_hw_3src_type[type].reg_type != (enum hw_3src_reg_type)INVALID); return gen7_hw_3src_type[type].reg_type; } @@ -283,6 +286,7 @@ enum brw_reg_type brw_a16_hw_3src_type_to_reg_type(const struct gen_device_info *devinfo, unsigned hw_type) { + assert(devinfo->gen >= 8 || hw_type != GEN8_3SRC_TYPE_HF); for (enum brw_reg_type i = 0; i <= BRW_REGISTER_TYPE_LAST; i++) { if (gen7_hw_3src_type[i].reg_type == hw_type) { return i; -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v4 10/40] compiler/nir: add lowering option for 16-bit fmod
And enable it on Intel. v2: - Squash the change to enable this lowering on Intel (Jason) Reviewed-by: Jason Ekstrand --- src/compiler/nir/nir.h| 1 + src/compiler/nir/nir_opt_algebraic.py | 1 + src/intel/compiler/brw_compiler.c | 1 + 3 files changed, 3 insertions(+) diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h index 2793662b1d9..6547a468648 100644 --- a/src/compiler/nir/nir.h +++ b/src/compiler/nir/nir.h @@ -2119,6 +2119,7 @@ typedef struct nir_shader_compiler_options { bool lower_fpow; bool lower_fsat; bool lower_fsqrt; + bool lower_fmod16; bool lower_fmod32; bool lower_fmod64; /** Lowers ibitfield_extract/ubitfield_extract to ibfe/ubfe. */ diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py index 75a3d2ad238..cd969de1f88 100644 --- a/src/compiler/nir/nir_opt_algebraic.py +++ b/src/compiler/nir/nir_opt_algebraic.py @@ -636,6 +636,7 @@ optimizations = [ (('bcsel', ('ine', a, -1), ('ifind_msb', a), -1), ('ifind_msb', a)), # Misc. lowering + (('fmod@16', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b, 'options->lower_fmod16'), (('fmod@32', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b, 'options->lower_fmod32'), (('fmod@64', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b, 'options->lower_fmod64'), (('frem', a, b), ('fsub', a, ('fmul', b, ('ftrunc', ('fdiv', a, b, 'options->lower_fmod32'), diff --git a/src/intel/compiler/brw_compiler.c b/src/intel/compiler/brw_compiler.c index fe632c5badc..f885e79c3e6 100644 --- a/src/intel/compiler/brw_compiler.c +++ b/src/intel/compiler/brw_compiler.c @@ -33,6 +33,7 @@ .lower_sub = true, \ .lower_fdiv = true,\ .lower_scmp = true,\ + .lower_fmod16 = true, \ .lower_fmod32 = true, \ .lower_fmod64 = false, \ .lower_bitfield_extract = true,\ -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v4 23/40] intel/compiler: rework conversion opcodes
Now that we have the regioning lowering pass we can just put all of these opcodes together in a single block and we can just assert on the few cases of conversion instructions that are not supported in hardware and that should be lowered in brw_nir_lower_conversions. The only cases what we still handle separately are the conversions from float to half-float since the rounding variants would need to fallthrough and we are already doing this for boolean opcodes (since they need to negate), plus there is also a large comment about these opcodes that we probably want to keep so it is just easier to keep these separate. Suggested-by: Jason Ekstrand --- src/intel/compiler/brw_fs_nir.cpp | 41 +-- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index f59e9ad4e2b..3a6e4a2eb60 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -772,7 +772,7 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr) bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(), brw_imm_d(brw_rnd_mode_from_nir_op(instr->op))); /* fallthrough */ - + case nir_op_f2f16: /* In theory, it would be better to use BRW_OPCODE_F32TO16. Depending * on the HW gen, it is a special hw opcode or just a MOV, and * brw_F32TO16 (at brw_eu_emit) would do the work to chose. @@ -782,23 +782,11 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr) * only for gen8+, it will be better to use directly the MOV, and use * BRW_OPCODE_F32TO16 when/if we work for HF support on gen7. 
*/ - - case nir_op_f2f16: - case nir_op_i2f16: - case nir_op_u2f16: assert(type_sz(op[0].type) < 8); /* brw_nir_lower_conversions */ inst = bld.MOV(result, op[0]); inst->saturate = instr->dest.saturate; break; - case nir_op_f2f64: - case nir_op_f2i64: - case nir_op_f2u64: - assert(type_sz(op[0].type) > 2); /* brw_nir_lower_conversions */ - inst = bld.MOV(result, op[0]); - inst->saturate = instr->dest.saturate; - break; - case nir_op_b2i8: case nir_op_b2i16: case nir_op_b2i32: @@ -813,19 +801,34 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr) case nir_op_i2i64: case nir_op_u2f64: case nir_op_u2u64: - assert(type_sz(op[0].type) > 1); /* brw_nir_lower_conversions */ - /* fallthrough */ + case nir_op_f2f64: + case nir_op_f2i64: + case nir_op_f2u64: + case nir_op_i2i32: + case nir_op_u2u32: case nir_op_f2f32: case nir_op_f2i32: case nir_op_f2u32: - case nir_op_f2i16: - case nir_op_f2u16: - case nir_op_i2i32: - case nir_op_u2u32: + case nir_op_i2f16: case nir_op_i2i16: + case nir_op_u2f16: case nir_op_u2u16: + case nir_op_f2i16: + case nir_op_f2u16: case nir_op_i2i8: case nir_op_u2u8: + case nir_op_f2i8: + case nir_op_f2u8: + if (result.type == BRW_REGISTER_TYPE_B || + result.type == BRW_REGISTER_TYPE_UB || + result.type == BRW_REGISTER_TYPE_HF) + assert(type_sz(op[0].type) < 8); /* brw_nir_lower_conversions */ + + if (op[0].type == BRW_REGISTER_TYPE_B || + op[0].type == BRW_REGISTER_TYPE_UB || + op[0].type == BRW_REGISTER_TYPE_HF) + assert(type_sz(result.type) < 8); /* brw_nir_lower_conversions */ + inst = bld.MOV(result, op[0]); inst->saturate = instr->dest.saturate; break; -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v4 39/40] anv/pipeline: support Float16 and Int8 SPIR-V capabilities in gen8+
v2: - Merge Float16 and Int8 capabilities into a single patch (Jason) - Merged patch that enabled SPIR-V front-end checks for these caps (except for Int8, which was already merged) Reviewed-by: Jason Ekstrand (v1) --- src/compiler/shader_info.h| 1 + src/compiler/spirv/spirv_to_nir.c | 4 +++- src/intel/vulkan/anv_pipeline.c | 2 ++ 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/compiler/shader_info.h b/src/compiler/shader_info.h index 3d871938751..4726c185243 100644 --- a/src/compiler/shader_info.h +++ b/src/compiler/shader_info.h @@ -38,6 +38,7 @@ struct spirv_supported_capabilities { bool descriptor_array_dynamic_indexing; bool device_group; bool draw_parameters; + bool float16; bool float64; bool geometry_streams; bool gcn_shader; diff --git a/src/compiler/spirv/spirv_to_nir.c b/src/compiler/spirv/spirv_to_nir.c index 7e07de2bfc0..309ed6c59b0 100644 --- a/src/compiler/spirv/spirv_to_nir.c +++ b/src/compiler/spirv/spirv_to_nir.c @@ -3556,7 +3556,6 @@ vtn_handle_preamble_instruction(struct vtn_builder *b, SpvOp opcode, case SpvCapabilityLinkage: case SpvCapabilityVector16: case SpvCapabilityFloat16Buffer: - case SpvCapabilityFloat16: case SpvCapabilitySparseResidency: vtn_warn("Unsupported SPIR-V capability: %s", spirv_capability_to_string(cap)); @@ -3573,6 +3572,9 @@ vtn_handle_preamble_instruction(struct vtn_builder *b, SpvOp opcode, case SpvCapabilityFloat64: spv_check_supported(float64, cap); break; + case SpvCapabilityFloat16: + spv_check_supported(float16, cap); + break; case SpvCapabilityInt64: spv_check_supported(int64, cap); break; diff --git a/src/intel/vulkan/anv_pipeline.c b/src/intel/vulkan/anv_pipeline.c index e2024212bd9..0e8c4245df6 100644 --- a/src/intel/vulkan/anv_pipeline.c +++ b/src/intel/vulkan/anv_pipeline.c @@ -139,8 +139,10 @@ anv_shader_compile_to_nir(struct anv_device *device, .device_group = true, .draw_parameters = true, .float64 = pdevice->info.gen >= 8, + .float16 = pdevice->info.gen >= 8, .geometry_streams = true, 
.image_write_without_format = true, + .int8 = pdevice->info.gen >= 8, .int16 = pdevice->info.gen >= 8, .int64 = pdevice->info.gen >= 8, .min_lod = true, -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v4 28/40] intel/compiler: implement is_zero, is_one, is_negative_one for 8-bit/16-bit
There are no 8-bit immediates, so assert in that case. 16-bit immediates are replicated in each word of a 32-bit immediate, so we only need to check the lower 16-bits. v2: - Fix is_zero with half-float to consider -0 as well (Jason). - Fix is_negative_one for word type. Reviewed-by: Jason Ekstrand --- src/intel/compiler/brw_shader.cpp | 26 ++ 1 file changed, 26 insertions(+) diff --git a/src/intel/compiler/brw_shader.cpp b/src/intel/compiler/brw_shader.cpp index 569e68e02af..bf09c416e9e 100644 --- a/src/intel/compiler/brw_shader.cpp +++ b/src/intel/compiler/brw_shader.cpp @@ -719,11 +719,20 @@ backend_reg::is_zero() const if (file != IMM) return false; + assert(type_sz(type) > 1); + switch (type) { + case BRW_REGISTER_TYPE_HF: + assert((d & 0xffff) == ((d >> 16) & 0xffff)); + return (d & 0xffff) == 0 || (d & 0xffff) == 0x8000; case BRW_REGISTER_TYPE_F: return f == 0; case BRW_REGISTER_TYPE_DF: return df == 0; + case BRW_REGISTER_TYPE_W: + case BRW_REGISTER_TYPE_UW: + assert((d & 0xffff) == ((d >> 16) & 0xffff)); + return (d & 0xffff) == 0; case BRW_REGISTER_TYPE_D: case BRW_REGISTER_TYPE_UD: return d == 0; @@ -741,11 +750,20 @@ backend_reg::is_one() const if (file != IMM) return false; + assert(type_sz(type) > 1); + switch (type) { + case BRW_REGISTER_TYPE_HF: + assert((d & 0xffff) == ((d >> 16) & 0xffff)); + return (d & 0xffff) == 0x3c00; case BRW_REGISTER_TYPE_F: return f == 1.0f; case BRW_REGISTER_TYPE_DF: return df == 1.0; + case BRW_REGISTER_TYPE_W: + case BRW_REGISTER_TYPE_UW: + assert((d & 0xffff) == ((d >> 16) & 0xffff)); + return (d & 0xffff) == 1; case BRW_REGISTER_TYPE_D: case BRW_REGISTER_TYPE_UD: return d == 1; @@ -763,11 +781,19 @@ backend_reg::is_negative_one() const if (file != IMM) return false; + assert(type_sz(type) > 1); + switch (type) { + case BRW_REGISTER_TYPE_HF: + assert((d & 0xffff) == ((d >> 16) & 0xffff)); + return (d & 0xffff) == 0xbc00; case BRW_REGISTER_TYPE_F: return f == -1.0; case BRW_REGISTER_TYPE_DF: return df == -1.0; + case BRW_REGISTER_TYPE_W: + assert((d & 0xffff) == ((d >> 16) & 0xffff)); + 
return (d & 0xffff) == 0xffff; case BRW_REGISTER_TYPE_D: return d == -1; case BRW_REGISTER_TYPE_Q: -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v4 29/40] intel/compiler: add a brw_reg_type_is_integer helper
v2: - Fixed typo: meant BRW_REGISTER_TYPE_UB instead BRW_REGISTER_TYPE_UV Reviewed-by: Jason Ekstrand (v1) --- src/intel/compiler/brw_reg_type.h | 18 ++ 1 file changed, 18 insertions(+) diff --git a/src/intel/compiler/brw_reg_type.h b/src/intel/compiler/brw_reg_type.h index ffbec90d3fe..086770d2e03 100644 --- a/src/intel/compiler/brw_reg_type.h +++ b/src/intel/compiler/brw_reg_type.h @@ -82,6 +82,24 @@ brw_reg_type_is_floating_point(enum brw_reg_type type) } } +static inline bool +brw_reg_type_is_integer(enum brw_reg_type type) +{ + switch (type) { + case BRW_REGISTER_TYPE_Q: + case BRW_REGISTER_TYPE_UQ: + case BRW_REGISTER_TYPE_D: + case BRW_REGISTER_TYPE_UD: + case BRW_REGISTER_TYPE_W: + case BRW_REGISTER_TYPE_UW: + case BRW_REGISTER_TYPE_B: + case BRW_REGISTER_TYPE_UB: + return true; + default: + return false; + } +} + unsigned brw_reg_type_to_hw_type(const struct gen_device_info *devinfo, enum brw_reg_file file, enum brw_reg_type type); -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v4 19/40] intel/compiler: fix ddy for half-float in Broadwell
Broadwell has restrictions that apply to Align16 half-float that make the Align16 implementation of this invalid for this platform. Use the gen11 path for this instead, which uses Align1 mode. The restriction is not present in cherryview, gen9 or gen10, where the Align16 implementation seems to work just fine. v2: - Rework the comment in the code, move the PRM citation from the commit message to the comment in the code (Matt) - Cherryview isn't affected, only Broadwell (Matt) Reviewed-by: Jason Ekstrand (v1) Reviewed-by: Matt Turner --- src/intel/compiler/brw_fs_generator.cpp | 17 +++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/src/intel/compiler/brw_fs_generator.cpp b/src/intel/compiler/brw_fs_generator.cpp index 996eafd4af1..37129ce4c67 100644 --- a/src/intel/compiler/brw_fs_generator.cpp +++ b/src/intel/compiler/brw_fs_generator.cpp @@ -1272,8 +1272,21 @@ fs_generator::generate_ddy(const fs_inst *inst, const uint32_t type_size = type_sz(src.type); if (inst->opcode == FS_OPCODE_DDY_FINE) { - /* produce accurate derivatives */ - if (devinfo->gen >= 11) { + /* produce accurate derivatives. + * + * From the Broadwell PRM, Volume 7 (3D-Media-GPGPU) + * "Register Region Restrictions", Section "1. Special Restrictions": + * + *"In Align16 mode, the channel selects and channel enables apply to + * a pair of half-floats, because these parameters are defined for + * DWord elements ONLY. This is applicable when both source and + * destination are half-floats." + * + * So for half-float operations we use the Gen11+ Align1 path. CHV + * inherits its FP16 hardware from SKL, so it is not affected. + */ + if (devinfo->gen >= 11 || + (devinfo->is_broadwell && src.type == BRW_REGISTER_TYPE_HF)) { src = stride(src, 0, 2, 1); struct brw_reg src_0 = byte_offset(src, 0 * type_size); struct brw_reg src_2 = byte_offset(src, 2 * type_size); -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v4 27/40] intel/compiler: generalize the combine constants pass
At the very least we need it to handle HF too, since we are doing constant propagation for MAD and LRP, which relies on this pass to promote the immediates to GRF in the end, but ideally we want it to support even more types so we can take advantage of it to improve register pressure in some scenarios. --- .../compiler/brw_fs_combine_constants.cpp | 202 -- 1 file changed, 180 insertions(+), 22 deletions(-) diff --git a/src/intel/compiler/brw_fs_combine_constants.cpp b/src/intel/compiler/brw_fs_combine_constants.cpp index 7343f77bb45..5d79f1a0826 100644 --- a/src/intel/compiler/brw_fs_combine_constants.cpp +++ b/src/intel/compiler/brw_fs_combine_constants.cpp @@ -36,6 +36,7 @@ #include "brw_fs.h" #include "brw_cfg.h" +#include "util/half_float.h" using namespace brw; @@ -114,8 +115,17 @@ struct imm { */ exec_list *uses; - /** The immediate value. We currently only handle floats. */ - float val; + /** The immediate value */ + union { + char bytes[8]; + float f; + int32_t d; + int16_t w; + }; + uint8_t size; + + /** When promoting half-float we need to account for certain restrictions */ + bool is_half_float; /** * The GRF register and subregister number where we've decided to store the @@ -145,10 +155,11 @@ struct table { }; static struct imm * -find_imm(struct table *table, float val) +find_imm(struct table *table, void *data, uint8_t size) { for (int i = 0; i < table->len; i++) { - if (table->imm[i].val == val) { + if (table->imm[i].size == size && + !memcmp(table->imm[i].bytes, data, size)) { return &table->imm[i]; } } @@ -190,6 +201,96 @@ compare(const void *_a, const void *_b) return a->first_use_ip - b->first_use_ip; } +static bool +get_constant_value(const struct gen_device_info *devinfo, + const fs_inst *inst, uint32_t src_idx, + void *out, brw_reg_type *out_type) +{ + const bool can_do_source_mods = inst->can_do_source_mods(devinfo); + const fs_reg *src = &inst->src[src_idx]; + + *out_type = src->type; + + switch (*out_type) { + case BRW_REGISTER_TYPE_F: { + 
float val = !can_do_source_mods ? src->f : fabsf(src->f); + memcpy(out, &val, 4); + break; + } + case BRW_REGISTER_TYPE_HF: { + uint16_t val = src->d & 0xffffu; + if (can_do_source_mods) + val = _mesa_float_to_half(fabsf(_mesa_half_to_float(val))); + memcpy(out, &val, 2); + break; + } + case BRW_REGISTER_TYPE_D: { + int32_t val = !can_do_source_mods ? src->d : abs(src->d); + memcpy(out, &val, 4); + break; + } + case BRW_REGISTER_TYPE_UD: + memcpy(out, &src->ud, 4); + break; + case BRW_REGISTER_TYPE_W: { + int16_t val = src->d & 0xffffu; + if (can_do_source_mods) + val = abs(val); + memcpy(out, &val, 2); + break; + } + case BRW_REGISTER_TYPE_UW: + memcpy(out, &src->ud, 2); + break; + default: + return false; + }; + + return true; +} + +static struct brw_reg +build_imm_reg_for_copy(struct imm *imm) +{ + switch (imm->size) { + case 4: + return brw_imm_d(imm->d); + case 2: + return brw_imm_w(imm->w); + default: + unreachable("not implemented"); + } +} + +static inline uint32_t +get_alignment_for_imm(const struct imm *imm) +{ + if (imm->is_half_float) + return 4; /* At least MAD seems to require this */ + else + return imm->size; +} + +static bool +needs_negate(const struct fs_reg *reg, const struct imm *imm) +{ + switch (reg->type) { + case BRW_REGISTER_TYPE_F: + return signbit(reg->f) != signbit(imm->f); + case BRW_REGISTER_TYPE_D: + return (reg->d < 0) != (imm->d < 0); + case BRW_REGISTER_TYPE_HF: + return (reg->d & 0x8000u) != (imm->w & 0x8000u); + case BRW_REGISTER_TYPE_W: + return ((reg->d & 0xffffu) < 0) != (imm->w < 0); + case BRW_REGISTER_TYPE_UD: + case BRW_REGISTER_TYPE_UW: + return false; + default: + unreachable("not implemented"); + }; +} + bool fs_visitor::opt_combine_constants() { @@ -214,13 +315,17 @@ fs_visitor::opt_combine_constants() continue; for (int i = 0; i < inst->sources; i++) { - if (inst->src[i].file != IMM || - inst->src[i].type != BRW_REGISTER_TYPE_F) + if (inst->src[i].file != IMM) continue; - float val = !inst->can_do_source_mods(devinfo) ? 
inst->src[i].f : - fabs(inst->src[i].f); - struct imm *imm = find_imm(&table, val); + char data[8]; + brw_reg_type type; + if (!get_constant_value(devinfo, inst, i, data, &type)) +continue; + + uint8_t size = type_sz(type); + + struct imm *imm = find_imm(&table, data, size); if (imm) { bblock_t *intersection = cfg_t::intersect(block, imm->block); @@ -237,7 +342,9 @@ fs_visitor::opt_combine_constants(
[Mesa-dev] [PATCH v4 26/40] intel/eu: force stride of 2 on NULL register for Byte instructions
The hardware only allows a stride of 1 on a Byte destination for raw byte MOV instructions. This is required even when the destination is the NULL register. Rather than making sure that we emit a proper NULL:B destination every time we need one, just fix it at emission time. Reviewed-by: Jason Ekstrand --- src/intel/compiler/brw_eu_emit.c | 11 +++ 1 file changed, 11 insertions(+) diff --git a/src/intel/compiler/brw_eu_emit.c b/src/intel/compiler/brw_eu_emit.c index 195c26ab760..0c79a77c9c7 100644 --- a/src/intel/compiler/brw_eu_emit.c +++ b/src/intel/compiler/brw_eu_emit.c @@ -94,6 +94,17 @@ brw_set_dest(struct brw_codegen *p, brw_inst *inst, struct brw_reg dest) else if (dest.file == BRW_GENERAL_REGISTER_FILE) assert(dest.nr < 128); + /* The hardware has a restriction where if the destination is Byte, +* the instruction needs to have a stride of 2 (except for packed byte +* MOV). This seems to be required even if the destination is the NULL +* register. +*/ + if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE && + dest.nr == BRW_ARF_NULL && + type_sz(dest.type) == 1) { + dest.hstride = BRW_HORIZONTAL_STRIDE_2; + } + gen7_convert_mrf_to_grf(p, &dest); if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS || -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v4 25/40] intel/compiler: ask for an integer type if requesting an 8-bit type
v2: - Assign BRW_REGISTER_TYPE_B directly for 8-bit (Jason) Reviewed-by: Jason Ekstrand --- src/intel/compiler/brw_fs_nir.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index 40c0481ac53..761d4d3c4ed 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -345,7 +345,7 @@ fs_visitor::nir_emit_impl(nir_function_impl *impl) unsigned array_elems = reg->num_array_elems == 0 ? 1 : reg->num_array_elems; unsigned size = array_elems * reg->num_components; - const brw_reg_type reg_type = + const brw_reg_type reg_type = reg->bit_size == 8 ? BRW_REGISTER_TYPE_B : brw_reg_type_from_bit_size(reg->bit_size, BRW_REGISTER_TYPE_F); nir_locals[reg->index] = bld.vgrf(reg_type, size); } @@ -4376,7 +4376,8 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr fs_reg value = get_nir_src(instr->src[0]); if (instr->intrinsic == nir_intrinsic_vote_feq) { const unsigned bit_size = nir_src_bit_size(instr->src[0]); - value.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_F); + value.type = bit_size == 8 ? BRW_REGISTER_TYPE_B : +brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_F); } fs_reg uniformized = bld.emit_uniformize(value); -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v4 30/40] intel/compiler: fix cmod propagation for non 32-bit types
v2: - Do not propagate if the bit-size changes Reviewed-by: Jason Ekstrand --- src/intel/compiler/brw_fs_cmod_propagation.cpp | 14 +- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/intel/compiler/brw_fs_cmod_propagation.cpp b/src/intel/compiler/brw_fs_cmod_propagation.cpp index 7bb5c9afbc9..57d4e645c05 100644 --- a/src/intel/compiler/brw_fs_cmod_propagation.cpp +++ b/src/intel/compiler/brw_fs_cmod_propagation.cpp @@ -244,8 +244,7 @@ opt_cmod_propagation_local(const gen_device_info *devinfo, /* CMP's result is the same regardless of dest type. */ if (inst->conditional_mod == BRW_CONDITIONAL_NZ && scan_inst->opcode == BRW_OPCODE_CMP && -(inst->dst.type == BRW_REGISTER_TYPE_D || - inst->dst.type == BRW_REGISTER_TYPE_UD)) { +brw_reg_type_is_integer(inst->dst.type)) { inst->remove(block); progress = true; break; @@ -258,9 +257,14 @@ opt_cmod_propagation_local(const gen_device_info *devinfo, break; /* Comparisons operate differently for ints and floats */ -if (scan_inst->dst.type != inst->dst.type && -(scan_inst->dst.type == BRW_REGISTER_TYPE_F || - inst->dst.type == BRW_REGISTER_TYPE_F)) +if (brw_reg_type_is_floating_point(scan_inst->dst.type) != +brw_reg_type_is_floating_point(inst->dst.type)) + break; + +/* Comparison result may be altered if the bit-size changes + * since that affects range, denorms, etc + */ +if (type_sz(scan_inst->dst.type) != type_sz(inst->dst.type)) break; /* If the instruction generating inst's source also wrote the -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v4 35/40] intel/compiler: validate conversions between 64-bit and 8-bit types
--- src/intel/compiler/brw_eu_validate.c| 10 +- src/intel/compiler/test_eu_validate.cpp | 46 + 2 files changed, 55 insertions(+), 1 deletion(-) diff --git a/src/intel/compiler/brw_eu_validate.c b/src/intel/compiler/brw_eu_validate.c index 203641fecb9..b1fdd1ce941 100644 --- a/src/intel/compiler/brw_eu_validate.c +++ b/src/intel/compiler/brw_eu_validate.c @@ -533,10 +533,18 @@ general_restrictions_based_on_operand_types(const struct gen_device_info *devinf /* From the BDW+ PRM: * -*"There is no direct conversion from HF to DF or DF to HF. +*"There is no direct conversion from B/UB to DF or DF to B/UB. +* There is no direct conversion from B/UB to Q/UQ or Q/UQ to B/UB. +* There is no direct conversion from HF to DF or DF to HF. * There is no direct conversion from HF to Q/UQ or Q/UQ to HF." */ enum brw_reg_type src0_type = brw_inst_src0_type(devinfo, inst); + + ERROR_IF(brw_inst_opcode(devinfo, inst) == BRW_OPCODE_MOV && +((dst_type_size == 1 && type_sz(src0_type) == 8) || + (dst_type_size == 8 && type_sz(src0_type) == 1)), +"There are no direct conversion between 64-bit types and B/UB"); + ERROR_IF(brw_inst_opcode(devinfo, inst) == BRW_OPCODE_MOV && ((dst_type == BRW_REGISTER_TYPE_HF && type_sz(src0_type) == 8) || (dst_type_size == 8 && src0_type == BRW_REGISTER_TYPE_HF)), diff --git a/src/intel/compiler/test_eu_validate.cpp b/src/intel/compiler/test_eu_validate.cpp index 1557b6d2452..06beb53eb5d 100644 --- a/src/intel/compiler/test_eu_validate.cpp +++ b/src/intel/compiler/test_eu_validate.cpp @@ -848,6 +848,52 @@ TEST_P(validation_test, byte_destination_relaxed_alignment) } } +TEST_P(validation_test, byte_64bit_conversion) +{ + static const struct { + enum brw_reg_type dst_type; + enum brw_reg_type src_type; + unsigned dst_stride; + bool expected_result; + } inst[] = { +#define INST(dst_type, src_type, dst_stride, expected_result) \ + { \ + BRW_REGISTER_TYPE_##dst_type,\ + BRW_REGISTER_TYPE_##src_type,\ + BRW_HORIZONTAL_STRIDE_##dst_stride, \ + expected_result, \ + 
} + + INST(B, Q, 1, false), + INST(B, UQ, 1, false), + INST(B, DF, 1, false), + + INST(B, Q, 2, false), + INST(B, UQ, 2, false), + INST(B, DF, 2, false), + + INST(B, Q, 4, false), + INST(B, UQ, 4, false), + INST(B, DF, 4, false), + +#undef INST + }; + + if (devinfo.gen < 8) + return; + + for (unsigned i = 0; i < sizeof(inst) / sizeof(inst[0]); i++) { + if (!devinfo.has_64bit_types && type_sz(inst[i].src_type) == 8) + continue; + + brw_MOV(p, retype(g0, inst[i].dst_type), retype(g0, inst[i].src_type)); + brw_inst_set_dst_hstride(&devinfo, last_inst, inst[i].dst_stride); + EXPECT_EQ(inst[i].expected_result, validate(p)); + + clear_instructions(p); + } +} + TEST_P(validation_test, half_float_conversion) { static const struct { -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v4 18/40] intel/compiler: fix ddx and ddy for 16-bit float
We were assuming 32-bit elements. Also, In SIMD8 we pack 2 vector components in a single SIMD register, so for example, component Y of a 16-bit vec2 starts is at byte offset 16B. This means that when we compute the offset of the elements to be differentiated we should not stomp whatever base offset we have, but instead add to it. v2 - Use byte_offset() helper (Jason) - Merge the fix for SIMD8: using byte_offset() fixes that too. Reviewed-by: Jason Ekstrand (v1) Reviewed-by: Matt Turner --- src/intel/compiler/brw_fs_generator.cpp | 37 - 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/src/intel/compiler/brw_fs_generator.cpp b/src/intel/compiler/brw_fs_generator.cpp index e3b68fa3165..996eafd4af1 100644 --- a/src/intel/compiler/brw_fs_generator.cpp +++ b/src/intel/compiler/brw_fs_generator.cpp @@ -1248,10 +1248,9 @@ fs_generator::generate_ddx(const fs_inst *inst, width = BRW_WIDTH_4; } - struct brw_reg src0 = src; + struct brw_reg src0 = byte_offset(src, type_sz(src.type));; struct brw_reg src1 = src; - src0.subnr = sizeof(float); src0.vstride = vstride; src0.width = width; src0.hstride = BRW_HORIZONTAL_STRIDE_0; @@ -1270,23 +1269,25 @@ void fs_generator::generate_ddy(const fs_inst *inst, struct brw_reg dst, struct brw_reg src) { + const uint32_t type_size = type_sz(src.type); + if (inst->opcode == FS_OPCODE_DDY_FINE) { /* produce accurate derivatives */ if (devinfo->gen >= 11) { src = stride(src, 0, 2, 1); - struct brw_reg src_0 = byte_offset(src, 0 * sizeof(float)); - struct brw_reg src_2 = byte_offset(src, 2 * sizeof(float)); - struct brw_reg src_4 = byte_offset(src, 4 * sizeof(float)); - struct brw_reg src_6 = byte_offset(src, 6 * sizeof(float)); - struct brw_reg src_8 = byte_offset(src, 8 * sizeof(float)); - struct brw_reg src_10 = byte_offset(src, 10 * sizeof(float)); - struct brw_reg src_12 = byte_offset(src, 12 * sizeof(float)); - struct brw_reg src_14 = byte_offset(src, 14 * sizeof(float)); - - struct brw_reg dst_0 = byte_offset(dst, 0 * 
sizeof(float)); - struct brw_reg dst_4 = byte_offset(dst, 4 * sizeof(float)); - struct brw_reg dst_8 = byte_offset(dst, 8 * sizeof(float)); - struct brw_reg dst_12 = byte_offset(dst, 12 * sizeof(float)); + struct brw_reg src_0 = byte_offset(src, 0 * type_size); + struct brw_reg src_2 = byte_offset(src, 2 * type_size); + struct brw_reg src_4 = byte_offset(src, 4 * type_size); + struct brw_reg src_6 = byte_offset(src, 6 * type_size); + struct brw_reg src_8 = byte_offset(src, 8 * type_size); + struct brw_reg src_10 = byte_offset(src, 10 * type_size); + struct brw_reg src_12 = byte_offset(src, 12 * type_size); + struct brw_reg src_14 = byte_offset(src, 14 * type_size); + + struct brw_reg dst_0 = byte_offset(dst, 0 * type_size); + struct brw_reg dst_4 = byte_offset(dst, 4 * type_size); + struct brw_reg dst_8 = byte_offset(dst, 8 * type_size); + struct brw_reg dst_12 = byte_offset(dst, 12 * type_size); brw_push_insn_state(p); brw_set_default_exec_size(p, BRW_EXECUTE_4); @@ -1313,10 +1314,8 @@ fs_generator::generate_ddy(const fs_inst *inst, } } else { /* replicate the derivative at the top-left pixel to other pixels */ - struct brw_reg src0 = stride(src, 4, 4, 0); - struct brw_reg src1 = stride(src, 4, 4, 0); - src0.subnr = 0 * sizeof(float); - src1.subnr = 2 * sizeof(float); + struct brw_reg src0 = byte_offset(stride(src, 4, 4, 0), 0 * type_size); + struct brw_reg src1 = byte_offset(stride(src, 4, 4, 0), 2 * type_size); brw_ADD(p, dst, negate(src0), src1); } -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v4 38/40] compiler/spirv: move the check for Int8 capability
So it is right after the checks for the other various Int* capabilities. --- src/compiler/spirv/spirv_to_nir.c | 7 +++ 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/compiler/spirv/spirv_to_nir.c b/src/compiler/spirv/spirv_to_nir.c index 1cbc926c818..7e07de2bfc0 100644 --- a/src/compiler/spirv/spirv_to_nir.c +++ b/src/compiler/spirv/spirv_to_nir.c @@ -3579,6 +3579,9 @@ vtn_handle_preamble_instruction(struct vtn_builder *b, SpvOp opcode, case SpvCapabilityInt16: spv_check_supported(int16, cap); break; + case SpvCapabilityInt8: + spv_check_supported(int8, cap); + break; case SpvCapabilityTransformFeedback: spv_check_supported(transform_feedback, cap); @@ -3591,10 +3594,6 @@ vtn_handle_preamble_instruction(struct vtn_builder *b, SpvOp opcode, case SpvCapabilityInt64Atomics: spv_check_supported(int64_atomics, cap); - case SpvCapabilityInt8: - spv_check_supported(int8, cap); - break; - case SpvCapabilityStorageImageMultisample: spv_check_supported(storage_image_ms, cap); break; -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v4 11/40] compiler/nir: add lowering for 16-bit flrp
And enable it on Intel. v2: - Squash the change to enable it on Intel (Jason) Reviewed-by: Jason Ekstrand --- src/compiler/nir/nir.h| 1 + src/compiler/nir/nir_opt_algebraic.py | 1 + src/intel/compiler/brw_compiler.c | 1 + 3 files changed, 3 insertions(+) diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h index 6547a468648..740c64d2a94 100644 --- a/src/compiler/nir/nir.h +++ b/src/compiler/nir/nir.h @@ -2113,6 +2113,7 @@ typedef struct nir_shader_compiler_options { bool lower_fdiv; bool lower_ffma; bool fuse_ffma; + bool lower_flrp16; bool lower_flrp32; /** Lowers flrp when it does not support doubles */ bool lower_flrp64; diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py index cd969de1f88..40eb3de02c3 100644 --- a/src/compiler/nir/nir_opt_algebraic.py +++ b/src/compiler/nir/nir_opt_algebraic.py @@ -124,6 +124,7 @@ optimizations = [ (('~flrp', 0.0, a, b), ('fmul', a, b)), (('~flrp', a, b, ('b2f', 'c@1')), ('bcsel', c, b, a), 'options->lower_flrp32'), (('~flrp', a, 0.0, c), ('fadd', ('fmul', ('fneg', a), c), a)), + (('flrp@16', a, b, c), ('fadd', ('fmul', c, ('fsub', b, a)), a), 'options->lower_flrp16'), (('flrp@32', a, b, c), ('fadd', ('fmul', c, ('fsub', b, a)), a), 'options->lower_flrp32'), (('flrp@64', a, b, c), ('fadd', ('fmul', c, ('fsub', b, a)), a), 'options->lower_flrp64'), (('ffloor', a), ('fsub', a, ('ffract', a)), 'options->lower_ffloor'), diff --git a/src/intel/compiler/brw_compiler.c b/src/intel/compiler/brw_compiler.c index f885e79c3e6..04a1a7cac4e 100644 --- a/src/intel/compiler/brw_compiler.c +++ b/src/intel/compiler/brw_compiler.c @@ -33,6 +33,7 @@ .lower_sub = true, \ .lower_fdiv = true,\ .lower_scmp = true,\ + .lower_flrp16 = true, \ .lower_fmod16 = true, \ .lower_fmod32 = true, \ .lower_fmod64 = false, \ -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v4 02/40] intel/compiler: add a NIR pass to lower conversions
Some conversions are not directly supported in hardware and need to be split in two conversion instructions going through an intermediary type. Doing this at the NIR level simplifies a bit the complexity in the backend. v2: - Consider fp16 rounding conversion opcodes - Properly handle swizzles on conversion sources. v3 - Run the pass earlier, right after nir_opt_algebraic_late (Jason) - NIR alu output types already have the bit-size (Jason) - Use the new is_conversion field in nir_op_info to select conversion instructions (Jason) - Use 'is_conversion' to identify conversion operations (Jason) Reviewed-by: Topi Pohjolainen (v1) --- src/intel/Makefile.sources| 1 + src/intel/compiler/brw_nir.c | 2 + src/intel/compiler/brw_nir.h | 2 + .../compiler/brw_nir_lower_conversions.c | 158 ++ src/intel/compiler/meson.build| 1 + 5 files changed, 164 insertions(+) create mode 100644 src/intel/compiler/brw_nir_lower_conversions.c diff --git a/src/intel/Makefile.sources b/src/intel/Makefile.sources index 94a28d370e8..9975daa3ad1 100644 --- a/src/intel/Makefile.sources +++ b/src/intel/Makefile.sources @@ -83,6 +83,7 @@ COMPILER_FILES = \ compiler/brw_nir_analyze_boolean_resolves.c \ compiler/brw_nir_analyze_ubo_ranges.c \ compiler/brw_nir_attribute_workarounds.c \ + compiler/brw_nir_lower_conversions.c \ compiler/brw_nir_lower_cs_intrinsics.c \ compiler/brw_nir_lower_image_load_store.c \ compiler/brw_nir_lower_mem_access_bit_sizes.c \ diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c index 9dbf06004a4..7e3dbc9e447 100644 --- a/src/intel/compiler/brw_nir.c +++ b/src/intel/compiler/brw_nir.c @@ -876,6 +876,8 @@ brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler, OPT(nir_opt_algebraic_late); + OPT(brw_nir_lower_conversions); + OPT(nir_lower_to_source_mods, nir_lower_all_source_mods); OPT(nir_copy_prop); OPT(nir_opt_dce); diff --git a/src/intel/compiler/brw_nir.h b/src/intel/compiler/brw_nir.h index bc81950d47e..662b2627e95 100644 --- 
a/src/intel/compiler/brw_nir.h +++ b/src/intel/compiler/brw_nir.h @@ -114,6 +114,8 @@ void brw_nir_lower_tcs_outputs(nir_shader *nir, const struct brw_vue_map *vue, GLenum tes_primitive_mode); void brw_nir_lower_fs_outputs(nir_shader *nir); +bool brw_nir_lower_conversions(nir_shader *nir); + bool brw_nir_lower_image_load_store(nir_shader *nir, const struct gen_device_info *devinfo); void brw_nir_rewrite_image_intrinsic(nir_intrinsic_instr *intrin, diff --git a/src/intel/compiler/brw_nir_lower_conversions.c b/src/intel/compiler/brw_nir_lower_conversions.c new file mode 100644 index 000..bb1312ad428 --- /dev/null +++ b/src/intel/compiler/brw_nir_lower_conversions.c @@ -0,0 +1,158 @@ +/* + * Copyright © 2018 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include "brw_nir.h" +#include "compiler/nir/nir_builder.h" + +static nir_op +get_conversion_op(nir_alu_type src_type, + unsigned src_bit_size, + nir_alu_type dst_type, + unsigned dst_bit_size, + nir_rounding_mode rounding_mode) +{ + nir_alu_type src_full_type = (nir_alu_type) (src_type | src_bit_size); + nir_alu_type dst_full_type = (nir_alu_type) (dst_type | dst_bit_size); + + return nir_type_conversion_op(src_full_type, dst_full_type, rounding_mode); +} + +static nir_rounding_mode +get_opcode_rounding_mode(nir_op op) +{ + switch (op) { + case nir_op_f2f16_rtz: + return nir_rounding_mode_rtz; + case nir_op_f2f16_rtne: + return nir_rounding_mode_rtne; + default: + return nir_rounding_mode_undef; + } +} + +static void +split_conversion(nir_builder
[Mesa-dev] [PATCH v4 40/40] anv/device: expose VK_KHR_shader_float16_int8 in gen8+
v2 (Jason): - Merge shaderFloat16 and shaderInt8 enablement into a single patch. - Merge extension enable. Reviewed-by: Jason Ekstrand (v1) --- src/intel/vulkan/anv_device.c | 9 + src/intel/vulkan/anv_extensions.py | 1 + 2 files changed, 10 insertions(+) diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c index 77120937c51..0fc6d59162d 100644 --- a/src/intel/vulkan/anv_device.c +++ b/src/intel/vulkan/anv_device.c @@ -1001,6 +1001,15 @@ void anv_GetPhysicalDeviceFeatures2( break; } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT16_INT8_FEATURES_KHR: { + VkPhysicalDeviceFloat16Int8FeaturesKHR *features = (void *)ext; + ANV_FROM_HANDLE(anv_physical_device, pdevice, physicalDevice); + + features->shaderFloat16 = pdevice->info.gen >= 8; + features->shaderInt8 = pdevice->info.gen >= 8; + break; + } + default: anv_debug_ignored_stype(ext->sType); break; diff --git a/src/intel/vulkan/anv_extensions.py b/src/intel/vulkan/anv_extensions.py index e502b5d5685..db28898d663 100644 --- a/src/intel/vulkan/anv_extensions.py +++ b/src/intel/vulkan/anv_extensions.py @@ -109,6 +109,7 @@ EXTENSIONS = [ Extension('VK_KHR_sampler_mirror_clamp_to_edge', 1, True), Extension('VK_KHR_sampler_ycbcr_conversion', 1, True), Extension('VK_KHR_shader_draw_parameters',1, True), +Extension('VK_KHR_shader_float16_int8', 1, 'device->info.gen >= 8'), Extension('VK_KHR_storage_buffer_storage_class', 1, True), Extension('VK_KHR_surface', 25, 'ANV_HAS_SURFACE'), Extension('VK_KHR_swapchain',68, 'ANV_HAS_SURFACE'), -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v4 09/40] intel/compiler: drop unnecessary temporary from 32-bit fsign implementation
Reviewed-by: Jason Ekstrand --- src/intel/compiler/brw_fs_nir.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index 64e24f86b5a..f59e9ad4e2b 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -866,12 +866,11 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr) */ bld.CMP(bld.null_reg_f(), op[0], brw_imm_f(0.0f), BRW_CONDITIONAL_NZ); - fs_reg result_int = retype(result, BRW_REGISTER_TYPE_UD); op[0].type = BRW_REGISTER_TYPE_UD; result.type = BRW_REGISTER_TYPE_UD; - bld.AND(result_int, op[0], brw_imm_ud(0x80000000u)); + bld.AND(result, op[0], brw_imm_ud(0x80000000u)); - inst = bld.OR(result_int, result_int, brw_imm_ud(0x3f800000u)); + inst = bld.OR(result, result, brw_imm_ud(0x3f800000u)); inst->predicate = BRW_PREDICATE_NORMAL; } else { /* For doubles we do the same but we need to consider: -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v4 07/40] intel/compiler: handle extended math restrictions for half-float
Extended math with half-float operands is only supported since gen9, but it is limited to SIMD8. In gen8 we lower it to 32-bit. v2: squashed together the following patches (Jason): - intel/compiler: allow extended math functions with HF operands - intel/compiler: lower 16-bit extended math to 32-bit prior to gen9 - intel/compiler: extended Math is limited to SIMD8 on half-float Reviewed-by: Jason Ekstrand Reviewed-by: Topi Pohjolainen (allow extended math functions with HF operands, extended Math is limited to SIMD8 on half-float) --- src/intel/compiler/brw_eu_emit.c | 6 -- src/intel/compiler/brw_fs.cpp| 27 ++- src/intel/compiler/brw_nir.c | 13 - 3 files changed, 34 insertions(+), 12 deletions(-) diff --git a/src/intel/compiler/brw_eu_emit.c b/src/intel/compiler/brw_eu_emit.c index 9be82d1b87c..2f31d9591fc 100644 --- a/src/intel/compiler/brw_eu_emit.c +++ b/src/intel/compiler/brw_eu_emit.c @@ -1916,8 +1916,10 @@ void gen6_math(struct brw_codegen *p, assert(src1.file == BRW_GENERAL_REGISTER_FILE || (devinfo->gen >= 8 && src1.file == BRW_IMMEDIATE_VALUE)); } else { - assert(src0.type == BRW_REGISTER_TYPE_F); - assert(src1.type == BRW_REGISTER_TYPE_F); + assert(src0.type == BRW_REGISTER_TYPE_F || + (src0.type == BRW_REGISTER_TYPE_HF && devinfo->gen >= 9)); + assert(src1.type == BRW_REGISTER_TYPE_F || + (src1.type == BRW_REGISTER_TYPE_HF && devinfo->gen >= 9)); } /* Source modifiers are ignored for extended math instructions on Gen6. */ diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index 5a18ba86a96..d172c2de4d7 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -5898,18 +5898,27 @@ get_lowered_simd_width(const struct gen_device_info *devinfo, case SHADER_OPCODE_EXP2: case SHADER_OPCODE_LOG2: case SHADER_OPCODE_SIN: - case SHADER_OPCODE_COS: + case SHADER_OPCODE_COS: { /* Unary extended math instructions are limited to SIMD8 on Gen4 and - * Gen6. 
Extended Math Function is limited to SIMD8 with half-float. */ - return (devinfo->gen >= 7 ? MIN2(16, inst->exec_size) : - devinfo->gen == 5 || devinfo->is_g4x ? MIN2(16, inst->exec_size) : - MIN2(8, inst->exec_size)); + if (devinfo->gen == 6 || (devinfo->gen == 4 && !devinfo->is_g4x)) + return MIN2(8, inst->exec_size); + if (inst->dst.type == BRW_REGISTER_TYPE_HF) + return MIN2(8, inst->exec_size); + return MIN2(16, inst->exec_size); + } - case SHADER_OPCODE_POW: - /* SIMD16 is only allowed on Gen7+. */ - return (devinfo->gen >= 7 ? MIN2(16, inst->exec_size) : - MIN2(8, inst->exec_size)); + case SHADER_OPCODE_POW: { + /* SIMD16 is only allowed on Gen7+. Extended Math Function is limited + * to SIMD8 with half-float + */ + if (devinfo->gen < 7) + return MIN2(8, inst->exec_size); + if (inst->dst.type == BRW_REGISTER_TYPE_HF) + return MIN2(8, inst->exec_size); + return MIN2(16, inst->exec_size); + } case SHADER_OPCODE_INT_QUOTIENT: case SHADER_OPCODE_INT_REMAINDER: diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c index 75513e5113c..9b26a6c3d6f 100644 --- a/src/intel/compiler/brw_nir.c +++ b/src/intel/compiler/brw_nir.c @@ -631,6 +631,8 @@ lower_bit_size_callback(const nir_alu_instr *alu, UNUSED void *data) if (alu->dest.dest.ssa.bit_size != 16) return 0; + const struct brw_compiler *compiler = (const struct brw_compiler *) data; + switch (alu->op) { case nir_op_idiv: case nir_op_imod: @@ -643,6 +645,15 @@ lower_bit_size_callback(const nir_alu_instr *alu, UNUSED void *data) case nir_op_fround_even: case nir_op_ftrunc: return 32; + case nir_op_frcp: + case nir_op_frsq: + case nir_op_fsqrt: + case nir_op_fpow: + case nir_op_fexp2: + case nir_op_flog2: + case nir_op_fsin: + case nir_op_fcos: + return compiler->devinfo->gen < 9 ? 
32 : 0; default: return 0; } @@ -770,7 +781,7 @@ brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir) OPT(nir_opt_large_constants, NULL, 32); } - OPT(nir_lower_bit_size, lower_bit_size_callback, NULL); + OPT(nir_lower_bit_size, lower_bit_size_callback, (void *)compiler); if (is_scalar) { OPT(nir_lower_load_const_to_scalar); -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v4 22/40] intel/compiler: activate 16-bit bit-size lowerings also for 8-bit
Particularly, we need the same lowerings we use for 16-bit integers. Reviewed-by: Jason Ekstrand --- src/intel/compiler/brw_nir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c index 9b26a6c3d6f..1d62f2adde8 100644 --- a/src/intel/compiler/brw_nir.c +++ b/src/intel/compiler/brw_nir.c @@ -628,7 +628,7 @@ static unsigned lower_bit_size_callback(const nir_alu_instr *alu, UNUSED void *data) { assert(alu->dest.dest.is_ssa); - if (alu->dest.dest.ssa.bit_size != 16) + if (alu->dest.dest.ssa.bit_size >= 32) return 0; const struct brw_compiler *compiler = (const struct brw_compiler *) data; -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v4 31/40] intel/compiler: remove inexact algebraic optimizations from the backend
NIR already has these and correctly considers exact/inexact qualification, whereas the backend doesn't and can apply the optimizations where it shouldn't. This happened to be the case in a handful of Tomb Raider shaders, where NIR would skip the optimizations because of a precise qualification but the backend would then (incorrectly) apply them anyway. Besides this, considering that we are not emitting much math in the backend these days it is unlikely that these optimizations are useful in general. A shader-db run confirms that MAD and LRP optimizations, for example, were only being triggered in cases where NIR would skip them due to precise requirements, so in the near future we might want to remove more of these, but for now we just remove the ones that are not completely correct. Suggested-by: Jason Ekstrand --- src/intel/compiler/brw_fs.cpp | 39 +-- 1 file changed, 1 insertion(+), 38 deletions(-) diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index 0f04e577de3..873a1dd8196 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -2547,15 +2547,6 @@ fs_visitor::opt_algebraic() break; } - /* a * 0.0 = 0.0 */ - if (inst->src[1].is_zero()) { -inst->opcode = BRW_OPCODE_MOV; -inst->src[0] = inst->src[1]; -inst->src[1] = reg_undef; -progress = true; -break; - } - if (inst->src[0].file == IMM) { assert(inst->src[0].type == BRW_REGISTER_TYPE_F); inst->opcode = BRW_OPCODE_MOV; @@ -2569,14 +2560,6 @@ fs_visitor::opt_algebraic() if (inst->src[1].file != IMM) continue; - /* a + 0.0 = a */ - if (inst->src[1].is_zero()) { -inst->opcode = BRW_OPCODE_MOV; -inst->src[1] = reg_undef; -progress = true; -break; - } - if (inst->src[0].file == IMM) { assert(inst->src[0].type == BRW_REGISTER_TYPE_F); inst->opcode = BRW_OPCODE_MOV; @@ -2595,16 +2578,6 @@ fs_visitor::opt_algebraic() break; } break; - case BRW_OPCODE_LRP: - if (inst->src[1].equals(inst->src[2])) { -inst->opcode = BRW_OPCODE_MOV; -inst->src[0] = inst->src[1]; 
-inst->src[1] = reg_undef; -inst->src[2] = reg_undef; -progress = true; -break; - } - break; case BRW_OPCODE_CMP: if ((inst->conditional_mod == BRW_CONDITIONAL_Z || inst->conditional_mod == BRW_CONDITIONAL_NZ) && @@ -2682,17 +2655,7 @@ fs_visitor::opt_algebraic() } break; case BRW_OPCODE_MAD: - if (inst->src[1].is_zero() || inst->src[2].is_zero()) { -inst->opcode = BRW_OPCODE_MOV; -inst->src[1] = reg_undef; -inst->src[2] = reg_undef; -progress = true; - } else if (inst->src[0].is_zero()) { -inst->opcode = BRW_OPCODE_MUL; -inst->src[0] = inst->src[2]; -inst->src[2] = reg_undef; -progress = true; - } else if (inst->src[1].is_one()) { + if (inst->src[1].is_one()) { inst->opcode = BRW_OPCODE_ADD; inst->src[1] = inst->src[2]; inst->src[2] = reg_undef; -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v4 16/40] intel/compiler: allow half-float on 3-source instructions since gen8
Reviewed-by: Topi Pohjolainen Reviewed-by: Jason Ekstrand Reviewed-by: Matt Turner --- src/intel/compiler/brw_eu_emit.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/intel/compiler/brw_eu_emit.c b/src/intel/compiler/brw_eu_emit.c index 2f31d9591fc..30037e71b00 100644 --- a/src/intel/compiler/brw_eu_emit.c +++ b/src/intel/compiler/brw_eu_emit.c @@ -797,7 +797,8 @@ brw_alu3(struct brw_codegen *p, unsigned opcode, struct brw_reg dest, assert(dest.type == BRW_REGISTER_TYPE_F || dest.type == BRW_REGISTER_TYPE_DF || dest.type == BRW_REGISTER_TYPE_D || - dest.type == BRW_REGISTER_TYPE_UD); + dest.type == BRW_REGISTER_TYPE_UD || + (dest.type == BRW_REGISTER_TYPE_HF && devinfo->gen >= 8)); if (devinfo->gen == 6) { brw_inst_set_3src_a16_dst_reg_file(devinfo, inst, dest.file == BRW_MESSAGE_REGISTER_FILE); -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v4 32/40] intel/compiler: skip MAD algebraic optimization for half-float or mixed mode
It is very likely that this optimization is never useful and we'll probably just end up removing it, so let's not bother adding more cases to it for now. --- src/intel/compiler/brw_fs.cpp | 4 1 file changed, 4 insertions(+) diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index 873a1dd8196..aeabaefd6df 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -2655,6 +2655,10 @@ fs_visitor::opt_algebraic() } break; case BRW_OPCODE_MAD: + if (inst->src[0].type != BRW_REGISTER_TYPE_F || + inst->src[1].type != BRW_REGISTER_TYPE_F || + inst->src[2].type != BRW_REGISTER_TYPE_F) +break; if (inst->src[1].is_one()) { inst->opcode = BRW_OPCODE_ADD; inst->src[1] = inst->src[2]; inst->src[2] = reg_undef; -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v4 20/40] intel/compiler: workaround for SIMD8 half-float MAD in gen8
Empirical testing shows that gen8 has a bug where MAD instructions with a half-float source starting at a non-zero offset fail to execute properly. This scenario usually happened in SIMD8 executions, where we used to pack vector components Y and W in the second half of SIMD registers (therefore, with a 16B offset). It looks like we are not currently doing this any more but this would handle the situation properly if we ever happen to produce code like this again. v2 (Jason): - Move this workaround to the lower_regioning pass as an additional case to has_invalid_src_region() - Do not apply the workaround if the stride of the source operand is 0, testing suggests the problem doesn't exist in that case. Reviewed-by: Topi Pohjolainen (v1) --- src/intel/compiler/brw_fs_lower_regioning.cpp | 39 +-- 1 file changed, 28 insertions(+), 11 deletions(-) diff --git a/src/intel/compiler/brw_fs_lower_regioning.cpp b/src/intel/compiler/brw_fs_lower_regioning.cpp index df50993dee6..7c70cfab535 100644 --- a/src/intel/compiler/brw_fs_lower_regioning.cpp +++ b/src/intel/compiler/brw_fs_lower_regioning.cpp @@ -109,20 +109,37 @@ namespace { has_invalid_src_region(const gen_device_info *devinfo, const fs_inst *inst, unsigned i) { - if (is_unordered(inst)) { + if (is_unordered(inst)) return false; - } else { - const unsigned dst_byte_stride = inst->dst.stride * type_sz(inst->dst.type); - const unsigned src_byte_stride = inst->src[i].stride * -type_sz(inst->src[i].type); - const unsigned dst_byte_offset = reg_offset(inst->dst) % REG_SIZE; - const unsigned src_byte_offset = reg_offset(inst->src[i]) % REG_SIZE; - return has_dst_aligned_region_restriction(devinfo, inst) && -!is_uniform(inst->src[i]) && -(src_byte_stride != dst_byte_stride || - src_byte_offset != dst_byte_offset); + /* Empirical testing shows that Broadwell has a bug affecting half-float + * MAD instructions when any of its sources has a non-zero offset, such + * as: + * + * mad(8) g18<1>HF -g17<4,4,1>HF g14.8<4,4,1>HF 
g11<4,4,1>HF { align16 1Q }; + * + * We used to generate code like this for SIMD8 executions where we + * used to pack components Y and W of a vector at offset 16B of a SIMD + * register. The problem doesn't occur if the stride of the source is 0. + */ + if (devinfo->gen == 8 && + inst->opcode == BRW_OPCODE_MAD && + inst->src[i].type == BRW_REGISTER_TYPE_HF && + inst->src[i].offset > 0 && + inst->src[i].stride != 0) { + return true; } + + const unsigned dst_byte_stride = inst->dst.stride * type_sz(inst->dst.type); + const unsigned src_byte_stride = inst->src[i].stride * + type_sz(inst->src[i].type); + const unsigned dst_byte_offset = reg_offset(inst->dst) % REG_SIZE; + const unsigned src_byte_offset = reg_offset(inst->src[i]) % REG_SIZE; + + return has_dst_aligned_region_restriction(devinfo, inst) && + !is_uniform(inst->src[i]) && + (src_byte_stride != dst_byte_stride || + src_byte_offset != dst_byte_offset); } /* -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v4 06/40] intel/compiler: lower some 16-bit float operations to 32-bit
The hardware doesn't support half-float for these. Reviewed-by: Topi Pohjolainen Reviewed-by: Jason Ekstrand --- src/intel/compiler/brw_nir.c | 5 + 1 file changed, 5 insertions(+) diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c index 7e3dbc9e447..75513e5113c 100644 --- a/src/intel/compiler/brw_nir.c +++ b/src/intel/compiler/brw_nir.c @@ -637,6 +637,11 @@ lower_bit_size_callback(const nir_alu_instr *alu, UNUSED void *data) case nir_op_irem: case nir_op_udiv: case nir_op_umod: + case nir_op_fceil: + case nir_op_ffloor: + case nir_op_ffract: + case nir_op_fround_even: + case nir_op_ftrunc: return 32; default: return 0; -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v4 05/40] intel/compiler: assert restrictions on conversions to half-float
There are some hardware restrictions that brw_nir_lower_conversions should have taken care of before we get here. v2: - rebased on top of regioning lowering pass Reviewed-by: Topi Pohjolainen (v1) Reviewed-by: Jason Ekstrand --- src/intel/compiler/brw_fs_nir.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index b705b3f5bc2..4c7a839390c 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -784,6 +784,9 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr) */ case nir_op_f2f16: + case nir_op_i2f16: + case nir_op_u2f16: + assert(type_sz(op[0].type) < 8); /* brw_nir_lower_conversions */ inst = bld.MOV(result, op[0]); inst->saturate = instr->dest.saturate; break; @@ -821,8 +824,6 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr) case nir_op_u2u32: case nir_op_i2i16: case nir_op_u2u16: - case nir_op_i2f16: - case nir_op_u2f16: case nir_op_i2i8: case nir_op_u2u8: inst = bld.MOV(result, op[0]); -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] intel/compiler: update validator to account for half-float exec type promotion
Commit c84ec70b3a72 implemented execution type promotion to 32-bit for conversions involving half-float registers, which empirical testing suggested was required, but it did not incorporate this change into the assembly validator logic. This commit adds that, preventing validation errors like this: mov(16) g9<4>B g3<16,8,2>HF { align1 1H }; ERROR: Destination stride must be equal to the ratio of the sizes of the execution data type to the destination type Fixes: c84ec70b3a72 "intel/fs: Promote execution type to 32-bit when any half-float conversion is needed." --- src/intel/compiler/brw_eu_validate.c | 27 ++- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/src/intel/compiler/brw_eu_validate.c b/src/intel/compiler/brw_eu_validate.c index a25010b225c..3bb37677672 100644 --- a/src/intel/compiler/brw_eu_validate.c +++ b/src/intel/compiler/brw_eu_validate.c @@ -325,17 +325,20 @@ execution_type(const struct gen_device_info *devinfo, const brw_inst *inst) unsigned num_sources = num_sources_from_inst(devinfo, inst); enum brw_reg_type src0_exec_type, src1_exec_type; - /* Execution data type is independent of destination data type, except in -* mixed F/HF instructions on CHV and SKL+. + /* Empirical testing suggests that type conversions involving half-float +* promote execution type to 32-bit. See get_exec_type() in brw_ir_fs.h. 
*/ enum brw_reg_type dst_exec_type = brw_inst_dst_type(devinfo, inst); src0_exec_type = execution_type_for_type(brw_inst_src0_type(devinfo, inst)); if (num_sources == 1) { - if ((devinfo->gen >= 9 || devinfo->is_cherryview) && - src0_exec_type == BRW_REGISTER_TYPE_HF) { - return dst_exec_type; + if (type_sz(src0_exec_type) == 2 && dst_exec_type != src0_exec_type) { + if (src0_exec_type == BRW_REGISTER_TYPE_HF) +return BRW_REGISTER_TYPE_F; + else if (dst_exec_type == BRW_REGISTER_TYPE_HF) +return BRW_REGISTER_TYPE_D; } + return src0_exec_type; } @@ -367,14 +370,12 @@ execution_type(const struct gen_device_info *devinfo, const brw_inst *inst) src1_exec_type == BRW_REGISTER_TYPE_DF) return BRW_REGISTER_TYPE_DF; - if (devinfo->gen >= 9 || devinfo->is_cherryview) { - if (dst_exec_type == BRW_REGISTER_TYPE_F || - src0_exec_type == BRW_REGISTER_TYPE_F || - src1_exec_type == BRW_REGISTER_TYPE_F) { - return BRW_REGISTER_TYPE_F; - } else { - return BRW_REGISTER_TYPE_HF; - } + if (dst_exec_type == BRW_REGISTER_TYPE_F || + src0_exec_type == BRW_REGISTER_TYPE_F || + src1_exec_type == BRW_REGISTER_TYPE_F) { + return BRW_REGISTER_TYPE_F; + } else { + return BRW_REGISTER_TYPE_HF; } assert(src0_exec_type == BRW_REGISTER_TYPE_F); -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v3 03/42] intel/compiler: split float to 64-bit opcodes from int to 64-bit
Going forward having these split is a bit more convenient since these two groups have different restrictions. v2: - Rebased on top of new regioning lowering pass. Reviewed-by: Topi Pohjolainen (v1) --- src/intel/compiler/brw_fs_nir.cpp | 7 +++ 1 file changed, 7 insertions(+) diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index bdc883e5364..a59debf2b78 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -801,10 +801,17 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr) case nir_op_f2f64: case nir_op_f2i64: case nir_op_f2u64: + assert(type_sz(op[0].type) > 2); /* brw_nir_lower_conversions */ + inst = bld.MOV(result, op[0]); + inst->saturate = instr->dest.saturate; + break; + case nir_op_i2f64: case nir_op_i2i64: case nir_op_u2f64: case nir_op_u2u64: + assert(type_sz(op[0].type) > 1); /* brw_nir_lower_conversions */ + /* fallthrough */ case nir_op_f2f32: case nir_op_f2i32: case nir_op_f2u32: -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v3 27/42] intel/compiler: activate 16-bit bit-size lowerings also for 8-bit
Particularly, we need the same lowerings we use for 16-bit integers. Reviewed-by: Jason Ekstrand --- src/intel/compiler/brw_nir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c index 3b2909da33e..2dfbf8824dc 100644 --- a/src/intel/compiler/brw_nir.c +++ b/src/intel/compiler/brw_nir.c @@ -628,7 +628,7 @@ static unsigned lower_bit_size_callback(const nir_alu_instr *alu, UNUSED void *data) { assert(alu->dest.dest.is_ssa); - if (alu->dest.dest.ssa.bit_size != 16) + if (alu->dest.dest.ssa.bit_size >= 32) return 0; const struct brw_compiler *compiler = (const struct brw_compiler *) data; -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v3 32/42] intel/eu: force stride of 2 on NULL register for Byte instructions
The hardware only allows a stride of 1 on a Byte destination for raw byte MOV instructions. This is required even when the destination is the NULL register. Rather than making sure that we emit a proper NULL:B destination every time we need one, just fix it at emission time. Reviewed-by: Jason Ekstrand --- src/intel/compiler/brw_eu_emit.c | 11 +++ 1 file changed, 11 insertions(+) diff --git a/src/intel/compiler/brw_eu_emit.c b/src/intel/compiler/brw_eu_emit.c index 2fa89f8a2a3..4e1672408ea 100644 --- a/src/intel/compiler/brw_eu_emit.c +++ b/src/intel/compiler/brw_eu_emit.c @@ -94,6 +94,17 @@ brw_set_dest(struct brw_codegen *p, brw_inst *inst, struct brw_reg dest) else if (dest.file == BRW_GENERAL_REGISTER_FILE) assert(dest.nr < 128); + /* The hardware has a restriction where if the destination is Byte, +* the instruction needs to have a stride of 2 (except for packed byte +* MOV). This seems to be required even if the destination is the NULL +* register. +*/ + if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE && + dest.nr == BRW_ARF_NULL && + type_sz(dest.type) == 1) { + dest.hstride = BRW_HORIZONTAL_STRIDE_2; + } + gen7_convert_mrf_to_grf(p, &dest); brw_inst_set_dst_file_type(devinfo, inst, dest.file, dest.type); -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v3 39/42] intel/compiler: remove MAD/LRP algebraic optimizations from the backend
NIR already has these so they are redundant. A run of shader-db confirms that the only cases where these backend optimizations are activated are some Tomb Raider shaders where the affected variables are qualified as "precise", which is why NIR won't apply them and why the backend shouldn't either (so it is actually a bug). Suggested-by: Jason Ekstrand --- src/intel/compiler/brw_fs.cpp | 37 --- 1 file changed, 37 deletions(-) diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index 77c955ac435..e7f5a8822a3 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -2568,16 +2568,6 @@ fs_visitor::opt_algebraic() break; } break; - case BRW_OPCODE_LRP: - if (inst->src[1].equals(inst->src[2])) { -inst->opcode = BRW_OPCODE_MOV; -inst->src[0] = inst->src[1]; -inst->src[1] = reg_undef; -inst->src[2] = reg_undef; -progress = true; -break; - } - break; case BRW_OPCODE_CMP: if ((inst->conditional_mod == BRW_CONDITIONAL_Z || inst->conditional_mod == BRW_CONDITIONAL_NZ) && @@ -2654,33 +2644,6 @@ fs_visitor::opt_algebraic() } } break; - case BRW_OPCODE_MAD: - if (inst->src[1].is_zero() || inst->src[2].is_zero()) { -inst->opcode = BRW_OPCODE_MOV; -inst->src[1] = reg_undef; -inst->src[2] = reg_undef; -progress = true; - } else if (inst->src[0].is_zero()) { -inst->opcode = BRW_OPCODE_MUL; -inst->src[0] = inst->src[2]; -inst->src[2] = reg_undef; -progress = true; - } else if (inst->src[1].is_one()) { -inst->opcode = BRW_OPCODE_ADD; -inst->src[1] = inst->src[2]; -inst->src[2] = reg_undef; -progress = true; - } else if (inst->src[2].is_one()) { -inst->opcode = BRW_OPCODE_ADD; -inst->src[2] = reg_undef; -progress = true; - } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) { -inst->opcode = BRW_OPCODE_ADD; -inst->src[1].f *= inst->src[2].f; -inst->src[2] = reg_undef; -progress = true; - } - break; case SHADER_OPCODE_BROADCAST: if (is_uniform(inst->src[0])) { inst->opcode = BRW_OPCODE_MOV; -- 2.17.1 ___ mesa-dev mailing list 
mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v3 17/42] intel/compiler: add new half-float register type for 3-src instructions
This is available since gen8. v2: restore previously existing assertion. Reviewed-by: Topi Pohjolainen (v1) --- src/intel/compiler/brw_reg_type.c | 36 +++ 1 file changed, 32 insertions(+), 4 deletions(-) diff --git a/src/intel/compiler/brw_reg_type.c b/src/intel/compiler/brw_reg_type.c index 60240ba1513..09b3ea61d4c 100644 --- a/src/intel/compiler/brw_reg_type.c +++ b/src/intel/compiler/brw_reg_type.c @@ -138,6 +138,7 @@ enum hw_3src_reg_type { GEN7_3SRC_TYPE_D = 1, GEN7_3SRC_TYPE_UD = 2, GEN7_3SRC_TYPE_DF = 3, + GEN8_3SRC_TYPE_HF = 4, /** When ExecutionDatatype is 1: @{ */ GEN10_ALIGN1_3SRC_REG_TYPE_HF = 0b000, @@ -166,6 +167,14 @@ static const struct hw_3src_type { [BRW_REGISTER_TYPE_D] = { GEN7_3SRC_TYPE_D }, [BRW_REGISTER_TYPE_UD] = { GEN7_3SRC_TYPE_UD }, [BRW_REGISTER_TYPE_DF] = { GEN7_3SRC_TYPE_DF }, +}, gen8_hw_3src_type[] = { + [0 ... BRW_REGISTER_TYPE_LAST] = { INVALID }, + + [BRW_REGISTER_TYPE_F] = { GEN7_3SRC_TYPE_F }, + [BRW_REGISTER_TYPE_D] = { GEN7_3SRC_TYPE_D }, + [BRW_REGISTER_TYPE_UD] = { GEN7_3SRC_TYPE_UD }, + [BRW_REGISTER_TYPE_DF] = { GEN7_3SRC_TYPE_DF }, + [BRW_REGISTER_TYPE_HF] = { GEN8_3SRC_TYPE_HF }, }, gen10_hw_3src_align1_type[] = { #define E(x) BRW_ALIGN1_3SRC_EXEC_TYPE_##x [0 ... 
BRW_REGISTER_TYPE_LAST] = { INVALID }, @@ -249,6 +258,20 @@ brw_hw_type_to_reg_type(const struct gen_device_info *devinfo, unreachable("not reached"); } +static inline const struct hw_3src_type * +get_hw_3src_type_map(const struct gen_device_info *devinfo, uint32_t *size) +{ + if (devinfo->gen < 8) { + if (size) + *size = ARRAY_SIZE(gen7_hw_3src_type); + return gen7_hw_3src_type; + } else { + if (size) + *size = ARRAY_SIZE(gen8_hw_3src_type); + return gen8_hw_3src_type; + } +} + /** * Convert a brw_reg_type enumeration value into the hardware representation * for a 3-src align16 instruction @@ -257,9 +280,12 @@ unsigned brw_reg_type_to_a16_hw_3src_type(const struct gen_device_info *devinfo, enum brw_reg_type type) { - assert(type < ARRAY_SIZE(gen7_hw_3src_type)); - assert(gen7_hw_3src_type[type].reg_type != (enum hw_3src_reg_type)INVALID); - return gen7_hw_3src_type[type].reg_type; + uint32_t map_size; + const struct hw_3src_type *hw_3src_type_map = + get_hw_3src_type_map(devinfo, &map_size); + assert(type < map_size); + assert(hw_3src_type_map[type].reg_type != (enum hw_3src_reg_type)INVALID); + return hw_3src_type_map[type].reg_type; } /** @@ -283,8 +309,10 @@ enum brw_reg_type brw_a16_hw_3src_type_to_reg_type(const struct gen_device_info *devinfo, unsigned hw_type) { + const struct hw_3src_type *hw_3src_type_map = + get_hw_3src_type_map(devinfo, NULL); for (enum brw_reg_type i = 0; i <= BRW_REGISTER_TYPE_LAST; i++) { - if (gen7_hw_3src_type[i].reg_type == hw_type) { + if (hw_3src_type_map[i].reg_type == hw_type) { return i; } } -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v3 28/42] intel/compiler: handle 64-bit float to 8-bit integer conversions
These are not directly supported in hardware and brw_nir_lower_conversions should have taken care of that before we get here. Also, while we are at it, make sure 64-bit integer to 8-bit are also properly split by the same lowering pass, since they have the same hardware restrictions. --- src/intel/compiler/brw_fs_nir.cpp | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index cf546b8ff09..e454578d99b 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -786,6 +786,10 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr) case nir_op_f2f16: case nir_op_i2f16: case nir_op_u2f16: + case nir_op_i2i8: + case nir_op_u2u8: + case nir_op_f2i8: + case nir_op_f2u8: assert(type_sz(op[0].type) < 8); /* brw_nir_lower_conversions */ inst = bld.MOV(result, op[0]); inst->saturate = instr->dest.saturate; @@ -824,8 +828,6 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr) case nir_op_u2u32: case nir_op_i2i16: case nir_op_u2u16: - case nir_op_i2i8: - case nir_op_u2u8: inst = bld.MOV(result, op[0]); inst->saturate = instr->dest.saturate; break; -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v3 16/42] intel/compiler: add instruction setters for Src1Type and Src2Type.
The original SrcType is a 3-bit field that takes a subset of the types supported for the hardware for 3-source instructions. Since gen8, when the half-float type was added, 3-source floating point operations can use mixed precision mode, where not all the operands have the same floating-point precision. While the precision for the first operand is taken from the type in SrcType, the bits in Src1Type (bit 36) and Src2Type (bit 35) define the precision for the other operands (0: normal precision, 1: half precision). Reviewed-by: Topi Pohjolainen --- src/intel/compiler/brw_inst.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/intel/compiler/brw_inst.h b/src/intel/compiler/brw_inst.h index ce89bbba72f..c45697eaa3a 100644 --- a/src/intel/compiler/brw_inst.h +++ b/src/intel/compiler/brw_inst.h @@ -222,6 +222,8 @@ F8(3src_src1_negate,39, 39, 40, 40) F8(3src_src1_abs, 38, 38, 39, 39) F8(3src_src0_negate,37, 37, 38, 38) F8(3src_src0_abs, 36, 36, 37, 37) +F8(3src_a16_src1_type, -1, -1, 36, 36) +F8(3src_a16_src2_type, -1, -1, 35, 35) F8(3src_a16_flag_reg_nr,34, 34, 33, 33) F8(3src_a16_flag_subreg_nr, 33, 33, 32, 32) FF(3src_a16_dst_reg_file, -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v3 35/42] anv/device: expose shaderFloat16 and shaderInt8 in gen8+
v2 (Jason): - Merge Float16 and Int8 into a single patch. - Merge extension enable. Reviewed-by: Jason Ekstrand (v1) --- src/intel/vulkan/anv_device.c | 9 + src/intel/vulkan/anv_extensions.py | 1 + 2 files changed, 10 insertions(+) diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c index 523f1483e29..d9931d339e5 100644 --- a/src/intel/vulkan/anv_device.c +++ b/src/intel/vulkan/anv_device.c @@ -966,6 +966,15 @@ void anv_GetPhysicalDeviceFeatures2( break; } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT16_INT8_FEATURES_KHR: { + VkPhysicalDeviceFloat16Int8FeaturesKHR *features = (void *)ext; + ANV_FROM_HANDLE(anv_physical_device, pdevice, physicalDevice); + + features->shaderFloat16 = pdevice->info.gen >= 8; + features->shaderInt8 = pdevice->info.gen >= 8; + break; + } + default: anv_debug_ignored_stype(ext->sType); break; diff --git a/src/intel/vulkan/anv_extensions.py b/src/intel/vulkan/anv_extensions.py index 388845003aa..0f579ced692 100644 --- a/src/intel/vulkan/anv_extensions.py +++ b/src/intel/vulkan/anv_extensions.py @@ -105,6 +105,7 @@ EXTENSIONS = [ Extension('VK_KHR_sampler_mirror_clamp_to_edge', 1, True), Extension('VK_KHR_sampler_ycbcr_conversion', 1, True), Extension('VK_KHR_shader_draw_parameters',1, True), +Extension('VK_KHR_shader_float16_int8', 1, 'device->info.gen >= 8'), Extension('VK_KHR_storage_buffer_storage_class', 1, True), Extension('VK_KHR_surface', 25, 'ANV_HAS_SURFACE'), Extension('VK_KHR_swapchain',68, 'ANV_HAS_SURFACE'), -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v3 21/42] intel/compiler: set correct precision fields for 3-source float instructions
Source0 and Destination extract the floating-point precision automatically from the SrcType and DstType instruction fields respectively when they are set to types :F or :HF. For Source1 and Source2 operands, we use the new 1-bit fields Src1Type and Src2Type, where 0 means normal precision and 1 means half-precision. Since we always use the type of the destination for all operands when we emit 3-source instructions, we only need set Src1Type and Src2Type to 1 when we are emitting a half-precision instruction. v2: - Set the bit separately for each source based on its type so we can do mixed floating-point mode in the future (Topi). Reviewed-by: Topi Pohjolainen --- src/intel/compiler/brw_eu_emit.c | 16 1 file changed, 16 insertions(+) diff --git a/src/intel/compiler/brw_eu_emit.c b/src/intel/compiler/brw_eu_emit.c index a785f96b650..2fa89f8a2a3 100644 --- a/src/intel/compiler/brw_eu_emit.c +++ b/src/intel/compiler/brw_eu_emit.c @@ -801,6 +801,22 @@ brw_alu3(struct brw_codegen *p, unsigned opcode, struct brw_reg dest, */ brw_inst_set_3src_a16_src_type(devinfo, inst, dest.type); brw_inst_set_3src_a16_dst_type(devinfo, inst, dest.type); + + /* From the Bspec: Instruction types + * + * Three source instructions can use operands with mixed-mode + * precision. When SrcType field is set to :f or :hf it defines + * precision for source 0 only, and fields Src1Type and Src2Type + * define precision for other source operands: + * + * 0b = :f. Single precision Float (32-bit). + * 1b = :hf. Half precision Float (16-bit). + */ + if (src1.type == BRW_REGISTER_TYPE_HF) +brw_inst_set_3src_a16_src1_type(devinfo, inst, 1); + + if (src2.type == BRW_REGISTER_TYPE_HF) +brw_inst_set_3src_a16_src2_type(devinfo, inst, 1); } } -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v3 30/42] intel/compiler: implement isign for int8
Reviewed-by: Jason Ekstrand --- src/intel/compiler/brw_fs_nir.cpp | 25 + 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index a739562c3ab..a3d193b8a44 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -912,11 +912,28 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr) * Predicated OR sets 1 if val is positive. */ uint32_t bit_size = nir_dest_bit_size(instr->dest.dest); - assert(bit_size == 32 || bit_size == 16); - fs_reg zero = bit_size == 32 ? brw_imm_d(0) : brw_imm_w(0); - fs_reg one = bit_size == 32 ? brw_imm_d(1) : brw_imm_w(1); - fs_reg shift = bit_size == 32 ? brw_imm_d(31) : brw_imm_w(15); + fs_reg zero, one, shift; + switch (bit_size) { + case 32: + zero = brw_imm_d(0); + one = brw_imm_d(1); + shift = brw_imm_d(31); + break; + case 16: + zero = brw_imm_w(0); + one = brw_imm_w(1); + shift = brw_imm_w(15); + break; + case 8: { + zero = setup_imm_b(bld, 0); + one = setup_imm_b(bld, 1); + shift = setup_imm_b(bld, 7); + break; + } + default: + unreachable("unsupported bit-size"); + }; bld.CMP(bld.null_reg_d(), op[0], zero, BRW_CONDITIONAL_G); bld.ASR(result, op[0], shift); -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v3 36/42] intel/compiler: implement is_zero, is_one, is_negative_one for 8-bit/16-bit
There are no 8-bit immediates, so assert in that case. 16-bit immediates are replicated in each word of a 32-bit immediate, so we only need to check the lower 16-bits. v2: - Fix is_zero with half-float to consider -0 as well (Jason). - Fix is_negative_one for word type. --- src/intel/compiler/brw_shader.cpp | 26 ++ 1 file changed, 26 insertions(+) diff --git a/src/intel/compiler/brw_shader.cpp b/src/intel/compiler/brw_shader.cpp index 97966c951a1..3c636c9d3a4 100644 --- a/src/intel/compiler/brw_shader.cpp +++ b/src/intel/compiler/brw_shader.cpp @@ -704,11 +704,20 @@ backend_reg::is_zero() const if (file != IMM) return false; + assert(type_sz(type) > 1); + switch (type) { + case BRW_REGISTER_TYPE_HF: + assert((d & 0xffff) == ((d >> 16) & 0xffff)); + return (d & 0xffff) == 0 || (d & 0xffff) == 0x8000; case BRW_REGISTER_TYPE_F: return f == 0; case BRW_REGISTER_TYPE_DF: return df == 0; + case BRW_REGISTER_TYPE_W: + case BRW_REGISTER_TYPE_UW: + assert((d & 0xffff) == ((d >> 16) & 0xffff)); + return (d & 0xffff) == 0; case BRW_REGISTER_TYPE_D: case BRW_REGISTER_TYPE_UD: return d == 0; @@ -726,11 +735,20 @@ backend_reg::is_one() const if (file != IMM) return false; + assert(type_sz(type) > 1); + switch (type) { + case BRW_REGISTER_TYPE_HF: + assert((d & 0xffff) == ((d >> 16) & 0xffff)); + return (d & 0xffff) == 0x3c00; case BRW_REGISTER_TYPE_F: return f == 1.0f; case BRW_REGISTER_TYPE_DF: return df == 1.0; + case BRW_REGISTER_TYPE_W: + case BRW_REGISTER_TYPE_UW: + assert((d & 0xffff) == ((d >> 16) & 0xffff)); + return (d & 0xffff) == 1; case BRW_REGISTER_TYPE_D: case BRW_REGISTER_TYPE_UD: return d == 1; @@ -748,11 +766,19 @@ backend_reg::is_negative_one() const if (file != IMM) return false; + assert(type_sz(type) > 1); + switch (type) { + case BRW_REGISTER_TYPE_HF: + assert((d & 0xffff) == ((d >> 16) & 0xffff)); + return (d & 0xffff) == 0xbc00; case BRW_REGISTER_TYPE_F: return f == -1.0; case BRW_REGISTER_TYPE_DF: return df == -1.0; + case BRW_REGISTER_TYPE_W: + assert((d & 0xffff) == ((d >> 16) & 0xffff)); + return (d & 0xffff) == 0xffff; case 
BRW_REGISTER_TYPE_D: return d == -1; case BRW_REGISTER_TYPE_Q: -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v3 19/42] intel/compiler: don't compact 3-src instructions with Src1Type or Src2Type bits
We are now using these bits, so don't assert that they are not set, just avoid compaction in that case. Reviewed-by: Topi Pohjolainen --- src/intel/compiler/brw_eu_compact.c | 5 - 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/intel/compiler/brw_eu_compact.c b/src/intel/compiler/brw_eu_compact.c index ae14ef10ec0..20fed254331 100644 --- a/src/intel/compiler/brw_eu_compact.c +++ b/src/intel/compiler/brw_eu_compact.c @@ -928,8 +928,11 @@ has_3src_unmapped_bits(const struct gen_device_info *devinfo, assert(!brw_inst_bits(src, 127, 126) && !brw_inst_bits(src, 105, 105) && !brw_inst_bits(src, 84, 84) && - !brw_inst_bits(src, 36, 35) && !brw_inst_bits(src, 7, 7)); + + /* Src1Type and Src2Type, used for mixed-precision floating point */ + if (brw_inst_bits(src, 36, 35)) + return true; } return false; -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v3 41/42] intel/compiler: fix combine constants for Align16 with half-float prior to gen9
There is a hardware restriction where <0,1,0>:HF in Align16 doesn't replicate a single 16-bit channel, but instead it replicates a full 32-bit channel. --- .../compiler/brw_fs_combine_constants.cpp | 24 +-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/src/intel/compiler/brw_fs_combine_constants.cpp b/src/intel/compiler/brw_fs_combine_constants.cpp index 54017e5668b..56e414d3f4e 100644 --- a/src/intel/compiler/brw_fs_combine_constants.cpp +++ b/src/intel/compiler/brw_fs_combine_constants.cpp @@ -301,7 +301,26 @@ fs_visitor::opt_combine_constants() */ exec_node *n = (imm->inst ? imm->inst : imm->block->last_non_control_flow_inst()->next); - const fs_builder ibld = bld.at(imm->block, n).exec_all().group(1, 0); + + /* Prior to gen9 we also have to deal with this restriction: + * + * "In Align16 mode, the channel selects and channel enables apply to a + * pair of half-floats, because these parameters are defined for DWord + * elements ONLY. This is applicable when both source and destination + * are half-floats." + * + * This means that when we emit a 3-src instruction such as MAD or LRP, + * for which we use Align16, if we need to promote an HF constant to a + * register we need to be aware that the <0,1,0>:HF region would still + * read 2 HF slots and not replicate the single one like we want. + * We fix this by populating both HF slots with the constant we need to + * read. + */ + const uint32_t width = + devinfo->gen < 9 && + imm->type == BRW_REGISTER_TYPE_HF && + (!imm->inst || imm->inst->is_3src(devinfo)) ? 
2 : 1; + const fs_builder ibld = bld.at(imm->block, n).exec_all().group(width, 0); reg = retype(reg, imm->type); if (imm->type == BRW_REGISTER_TYPE_F) { @@ -314,7 +333,8 @@ fs_visitor::opt_combine_constants() imm->subreg_offset = reg.offset; /* Keep offsets 32-bit aligned since we are mixing 32-bit and 16-bit - * constants into the same register + * constants into the same register (and we are writing 32-bit slots + * prior to gen9 for HF constants anyway). * * TODO: try to pack pairs of HF constants into each 32-bit slot */ -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v3 38/42] intel/compiler: fix cmod propagation for non 32-bit types
v2: - Do not propagate if the bit-size changes --- src/intel/compiler/brw_fs_cmod_propagation.cpp | 14 +- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/intel/compiler/brw_fs_cmod_propagation.cpp b/src/intel/compiler/brw_fs_cmod_propagation.cpp index 7bb5c9afbc9..57d4e645c05 100644 --- a/src/intel/compiler/brw_fs_cmod_propagation.cpp +++ b/src/intel/compiler/brw_fs_cmod_propagation.cpp @@ -244,8 +244,7 @@ opt_cmod_propagation_local(const gen_device_info *devinfo, /* CMP's result is the same regardless of dest type. */ if (inst->conditional_mod == BRW_CONDITIONAL_NZ && scan_inst->opcode == BRW_OPCODE_CMP && -(inst->dst.type == BRW_REGISTER_TYPE_D || - inst->dst.type == BRW_REGISTER_TYPE_UD)) { +brw_reg_type_is_integer(inst->dst.type)) { inst->remove(block); progress = true; break; @@ -258,9 +257,14 @@ opt_cmod_propagation_local(const gen_device_info *devinfo, break; /* Comparisons operate differently for ints and floats */ -if (scan_inst->dst.type != inst->dst.type && -(scan_inst->dst.type == BRW_REGISTER_TYPE_F || - inst->dst.type == BRW_REGISTER_TYPE_F)) +if (brw_reg_type_is_floating_point(scan_inst->dst.type) != +brw_reg_type_is_floating_point(inst->dst.type)) + break; + +/* Comparison result may be altered if the bit-size changes + * since that affects range, denorms, etc + */ +if (type_sz(scan_inst->dst.type) != type_sz(inst->dst.type)) break; /* If the instruction generating inst's source also wrote the -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v3 20/42] intel/compiler: allow half-float on 3-source instructions since gen8
Reviewed-by: Topi Pohjolainen --- src/intel/compiler/brw_eu_emit.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/intel/compiler/brw_eu_emit.c b/src/intel/compiler/brw_eu_emit.c index e21df4624b3..a785f96b650 100644 --- a/src/intel/compiler/brw_eu_emit.c +++ b/src/intel/compiler/brw_eu_emit.c @@ -755,7 +755,8 @@ brw_alu3(struct brw_codegen *p, unsigned opcode, struct brw_reg dest, assert(dest.type == BRW_REGISTER_TYPE_F || dest.type == BRW_REGISTER_TYPE_DF || dest.type == BRW_REGISTER_TYPE_D || - dest.type == BRW_REGISTER_TYPE_UD); + dest.type == BRW_REGISTER_TYPE_UD || + (dest.type == BRW_REGISTER_TYPE_HF && devinfo->gen >= 8)); if (devinfo->gen == 6) { brw_inst_set_3src_a16_dst_reg_file(devinfo, inst, dest.file == BRW_MESSAGE_REGISTER_FILE); -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v3 26/42] intel/compiler: split is_partial_write() into two variants
This function is used in two different scenarios that for 32-bit instructions are the same, but for 16-bit instructions are not. One scenario is that in which we are working at a SIMD8 register level and we need to know if a register is fully defined or written. This is useful, for example, in the context of liveness analysis or register allocation, where we work with units of registers. The other scenario is that in which we want to know if an instruction is writing a full scalar component or just some subset of it. This is useful, for example, in the context of some optimization passes like copy propagation. For 32-bit instructions (or larger), a SIMD8 dispatch will always write at least a full SIMD8 register (32B) if the write is not partial. The function is_partial_write() checks this to determine if we have a partial write. However, when we deal with 16-bit instructions, that logic disables some optimizations that should be safe. For example, a SIMD8 16-bit MOV will only update half of a SIMD register, but it is still a complete write of the variable for a SIMD8 dispatch, so we should not prevent copy propagation in this scenario because we don't write all 32 bytes in the SIMD register or because the write starts at offset 16B (where we pack components Y or W of 16-bit vectors). This is a problem for SIMD8 executions (VS, TCS, TES, GS) of 16-bit instructions, which lose a number of optimizations because of this, most important of which is copy-propagation. This patch splits is_partial_write() into is_partial_reg_write(), which represents the current is_partial_write(), useful for things like liveness analysis, and is_partial_var_write(), which considers the dispatch size to check if we are writing a full variable (rather than a full register) to decide if the write is partial or not, which is what we really want in many optimization passes. Then the patch goes on and rewrites all uses of is_partial_write() to use one or the other version. 
Specifically, we use is_partial_var_write() in the following places: copy propagation, cmod propagation, common subexpression elimination, saturate propagation and sel peephole. Notice that the semantics of is_partial_var_write() exactly match the current implementation of is_partial_write() for anything that is 32-bit or larger, so no changes are expected for 32-bit instructions. Tested against ~5000 tests involving 16-bit instructions in CTS produced the following changes in instruction counts: Patched | Master|%| SIMD8 |621,900 |706,721| -12.00% | SIMD16 | 93,252 | 93,252| 0.00% | As expected, the change only affects SIMD8 dispatches. Reviewed-by: Topi Pohjolainen --- src/intel/compiler/brw_fs.cpp | 31 +++ .../compiler/brw_fs_cmod_propagation.cpp | 20 ++-- .../compiler/brw_fs_copy_propagation.cpp | 8 ++--- src/intel/compiler/brw_fs_cse.cpp | 3 +- .../compiler/brw_fs_dead_code_eliminate.cpp | 2 +- src/intel/compiler/brw_fs_live_variables.cpp | 2 +- src/intel/compiler/brw_fs_reg_allocate.cpp| 2 +- .../compiler/brw_fs_register_coalesce.cpp | 2 +- .../compiler/brw_fs_saturate_propagation.cpp | 7 +++-- src/intel/compiler/brw_fs_sel_peephole.cpp| 4 +-- src/intel/compiler/brw_ir_fs.h| 3 +- 11 files changed, 54 insertions(+), 30 deletions(-) diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index d6096cd667d..77c955ac435 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -716,14 +716,33 @@ fs_visitor::limit_dispatch_width(unsigned n, const char *msg) * it. */ bool -fs_inst::is_partial_write() const +fs_inst::is_partial_reg_write() const { return ((this->predicate && this->opcode != BRW_OPCODE_SEL) || - (this->exec_size * type_sz(this->dst.type)) < 32 || !this->dst.is_contiguous() || + (this->exec_size * type_sz(this->dst.type)) < REG_SIZE || this->dst.offset % REG_SIZE != 0); } +/** + * Returns true if the instruction has a flag that means it won't + * update an entire variable for the given dispatch width. 
+ * + * This is only different from is_partial_reg_write() for SIMD8 + * dispatches of 16-bit (or smaller) instructions. + */ +bool +fs_inst::is_partial_var_write(uint32_t dispatch_width) const +{ + const uint32_t type_size = type_sz(this->dst.type); + uint32_t var_size = MIN2(REG_SIZE, dispatch_width * type_size); + + return ((this->predicate && this->opcode != BRW_OPCODE_SEL) || + !this->dst.is_contiguous() || + (this->exec_size * type_sz(this->dst.type)) < var_size || + this->dst.offset % var_size != 0); +} + unsigned fs_inst::components_read(unsigned i) const { @@ -2896,7 +2915,7 @@ fs_v
[Mesa-dev] [PATCH v3 12/42] compiler/nir: add lowering for 16-bit flrp
Reviewed-by: Jason Ekstrand --- src/compiler/nir/nir.h| 1 + src/compiler/nir/nir_opt_algebraic.py | 1 + 2 files changed, 2 insertions(+) diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h index 19056e79206..adcc8e36cc9 100644 --- a/src/compiler/nir/nir.h +++ b/src/compiler/nir/nir.h @@ -2106,6 +2106,7 @@ typedef struct nir_shader_compiler_options { bool lower_fdiv; bool lower_ffma; bool fuse_ffma; + bool lower_flrp16; bool lower_flrp32; /** Lowers flrp when it does not support doubles */ bool lower_flrp64; diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py index cd969de1f88..40eb3de02c3 100644 --- a/src/compiler/nir/nir_opt_algebraic.py +++ b/src/compiler/nir/nir_opt_algebraic.py @@ -124,6 +124,7 @@ optimizations = [ (('~flrp', 0.0, a, b), ('fmul', a, b)), (('~flrp', a, b, ('b2f', 'c@1')), ('bcsel', c, b, a), 'options->lower_flrp32'), (('~flrp', a, 0.0, c), ('fadd', ('fmul', ('fneg', a), c), a)), + (('flrp@16', a, b, c), ('fadd', ('fmul', c, ('fsub', b, a)), a), 'options->lower_flrp16'), (('flrp@32', a, b, c), ('fadd', ('fmul', c, ('fsub', b, a)), a), 'options->lower_flrp32'), (('flrp@64', a, b, c), ('fadd', ('fmul', c, ('fsub', b, a)), a), 'options->lower_flrp64'), (('ffloor', a), ('fsub', a, ('ffract', a)), 'options->lower_ffloor'), -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v3 07/42] intel/compiler: lower 16-bit extended math to 32-bit prior to gen9
Extended math doesn't support half-float on these generations. Reviewed-by: Jason Ekstrand --- src/intel/compiler/brw_nir.c | 13 - 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c index f0fe7f870c2..3b2909da33e 100644 --- a/src/intel/compiler/brw_nir.c +++ b/src/intel/compiler/brw_nir.c @@ -631,6 +631,8 @@ lower_bit_size_callback(const nir_alu_instr *alu, UNUSED void *data) if (alu->dest.dest.ssa.bit_size != 16) return 0; + const struct brw_compiler *compiler = (const struct brw_compiler *) data; + switch (alu->op) { case nir_op_idiv: case nir_op_imod: @@ -643,6 +645,15 @@ lower_bit_size_callback(const nir_alu_instr *alu, UNUSED void *data) case nir_op_fround_even: case nir_op_ftrunc: return 32; + case nir_op_frcp: + case nir_op_frsq: + case nir_op_fsqrt: + case nir_op_fpow: + case nir_op_fexp2: + case nir_op_flog2: + case nir_op_fsin: + case nir_op_fcos: + return compiler->devinfo->gen < 9 ? 32 : 0; default: return 0; } @@ -770,7 +781,7 @@ brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir) OPT(nir_opt_large_constants, NULL, 32); } - OPT(nir_lower_bit_size, lower_bit_size_callback, NULL); + OPT(nir_lower_bit_size, lower_bit_size_callback, (void *)compiler); if (is_scalar) { OPT(nir_lower_load_const_to_scalar); -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v3 10/42] compiler/nir: add lowering option for 16-bit fmod
Reviewed-by: Jason Ekstrand --- src/compiler/nir/nir.h| 1 + src/compiler/nir/nir_opt_algebraic.py | 1 + 2 files changed, 2 insertions(+) diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h index 3cb2d166cb3..19056e79206 100644 --- a/src/compiler/nir/nir.h +++ b/src/compiler/nir/nir.h @@ -2112,6 +2112,7 @@ typedef struct nir_shader_compiler_options { bool lower_fpow; bool lower_fsat; bool lower_fsqrt; + bool lower_fmod16; bool lower_fmod32; bool lower_fmod64; /** Lowers ibitfield_extract/ubitfield_extract to ibfe/ubfe. */ diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py index 75a3d2ad238..cd969de1f88 100644 --- a/src/compiler/nir/nir_opt_algebraic.py +++ b/src/compiler/nir/nir_opt_algebraic.py @@ -636,6 +636,7 @@ optimizations = [ (('bcsel', ('ine', a, -1), ('ifind_msb', a), -1), ('ifind_msb', a)), # Misc. lowering + (('fmod@16', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b)))), 'options->lower_fmod16'), (('fmod@32', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b)))), 'options->lower_fmod32'), (('fmod@64', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b)))), 'options->lower_fmod64'), (('frem', a, b), ('fsub', a, ('fmul', b, ('ftrunc', ('fdiv', a, b)))), 'options->lower_fmod32'), -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v3 34/42] anv/pipeline: support Float16 and Int8 capabilities in gen8+
v2: - Merge Float16 and Int8 in a single patch (Jason) Reviewed-by: Jason Ekstrand (v1) --- src/intel/vulkan/anv_pipeline.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/intel/vulkan/anv_pipeline.c b/src/intel/vulkan/anv_pipeline.c index 899160746d4..663d1c77fa5 100644 --- a/src/intel/vulkan/anv_pipeline.c +++ b/src/intel/vulkan/anv_pipeline.c @@ -136,8 +136,10 @@ anv_shader_compile_to_nir(struct anv_device *device, .caps = { .device_group = true, .draw_parameters = true, + .float16 = device->instance->physicalDevice.info.gen >= 8, .float64 = device->instance->physicalDevice.info.gen >= 8, .image_write_without_format = true, + .int8 = device->instance->physicalDevice.info.gen >= 8, .int16 = device->instance->physicalDevice.info.gen >= 8, .int64 = device->instance->physicalDevice.info.gen >= 8, .min_lod = true, -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v3 40/42] intel/compiler: support half-float in the combine constants pass
Reviewed-by: Topi Pohjolainen --- .../compiler/brw_fs_combine_constants.cpp | 60 +++ 1 file changed, 49 insertions(+), 11 deletions(-) diff --git a/src/intel/compiler/brw_fs_combine_constants.cpp b/src/intel/compiler/brw_fs_combine_constants.cpp index 7343f77bb45..54017e5668b 100644 --- a/src/intel/compiler/brw_fs_combine_constants.cpp +++ b/src/intel/compiler/brw_fs_combine_constants.cpp @@ -36,6 +36,7 @@ #include "brw_fs.h" #include "brw_cfg.h" +#include "util/half_float.h" using namespace brw; @@ -114,8 +115,9 @@ struct imm { */ exec_list *uses; - /** The immediate value. We currently only handle floats. */ + /** The immediate value. We currently only handle float and half-float. */ float val; + brw_reg_type type; /** * The GRF register and subregister number where we've decided to store the @@ -145,10 +147,10 @@ struct table { }; static struct imm * -find_imm(struct table *table, float val) +find_imm(struct table *table, float val, brw_reg_type type) { for (int i = 0; i < table->len; i++) { - if (table->imm[i].val == val) { + if (table->imm[i].val == val && table->imm[i].type == type) { return &table->imm[i]; } } @@ -190,6 +192,20 @@ compare(const void *_a, const void *_b) return a->first_use_ip - b->first_use_ip; } +static bool +needs_negate(float reg_val, float imm_val, brw_reg_type type) +{ + /* reg_val represents the immediate value in the register in its original +* bit-size, while imm_val is always a valid 32-bit float value. 
+*/ + if (type == BRW_REGISTER_TYPE_HF) { + uint32_t reg_val_ud = *((uint32_t *) &reg_val); + reg_val = _mesa_half_to_float(reg_val_ud & 0xffff); + } + + return signbit(imm_val) != signbit(reg_val); +} + bool fs_visitor::opt_combine_constants() { @@ -215,12 +231,20 @@ fs_visitor::opt_combine_constants() for (int i = 0; i < inst->sources; i++) { if (inst->src[i].file != IMM || - inst->src[i].type != BRW_REGISTER_TYPE_F) + (inst->src[i].type != BRW_REGISTER_TYPE_F && + inst->src[i].type != BRW_REGISTER_TYPE_HF)) continue; - float val = !inst->can_do_source_mods(devinfo) ? inst->src[i].f : - fabs(inst->src[i].f); - struct imm *imm = find_imm(&table, val); + float val; + if (inst->src[i].type == BRW_REGISTER_TYPE_F) { +val = !inst->can_do_source_mods(devinfo) ? inst->src[i].f : +fabs(inst->src[i].f); + } else { +val = !inst->can_do_source_mods(devinfo) ? + _mesa_half_to_float(inst->src[i].d & 0xffff) : + fabs(_mesa_half_to_float(inst->src[i].d & 0xffff)); + } + struct imm *imm = find_imm(&table, val, inst->src[i].type); if (imm) { bblock_t *intersection = cfg_t::intersect(block, imm->block); @@ -238,6 +262,7 @@ fs_visitor::opt_combine_constants() imm->uses = new(const_ctx) exec_list(); imm->uses->push_tail(link(const_ctx, &inst->src[i])); imm->val = val; +imm->type = inst->src[i].type; imm->uses_by_coissue = could_coissue(devinfo, inst); imm->must_promote = must_promote_imm(devinfo, inst); imm->first_use_ip = ip; @@ -278,12 +303,23 @@ fs_visitor::opt_combine_constants() imm->block->last_non_control_flow_inst()->next); const fs_builder ibld = bld.at(imm->block, n).exec_all().group(1, 0); - ibld.MOV(reg, brw_imm_f(imm->val)); + reg = retype(reg, imm->type); + if (imm->type == BRW_REGISTER_TYPE_F) { + ibld.MOV(reg, brw_imm_f(imm->val)); + } else { + const uint16_t val_hf = _mesa_float_to_half(imm->val); + ibld.MOV(reg, retype(brw_imm_uw(val_hf), BRW_REGISTER_TYPE_HF)); + } imm->nr = reg.nr; imm->subreg_offset = reg.offset; + /* Keep offsets 32-bit aligned since we are mixing 32-bit and 
16-bit + * constants into the same register + * + * TODO: try to pack pairs of HF constants into each 32-bit slot + */ reg.offset += sizeof(float); - if (reg.offset == 8 * sizeof(float)) { + if (reg.offset == REG_SIZE) { reg.nr = alloc.allocate(1); reg.offset = 0; } @@ -295,12 +331,14 @@ fs_visitor::opt_combine_constants() foreach_list_typed(reg_link, link, link, table.imm[i].uses) { fs_reg *reg = link->reg; assert((isnan(reg->f) && isnan(table.imm[i].val)) || -fabsf(reg->f) == fabs(table.imm[i].val)); +fabsf(reg->f) == fabs(table.imm[i].val) || +table.imm[i].type == BRW_REGISTER_TYPE_HF); reg->file = VGRF; + reg->type = table.imm[i].type; reg->offset = table.imm[i].subreg_offset; reg->stride = 0; - reg->negate = signbit(reg->f) != signbit(tab
[Mesa-dev] [PATCH v3 13/42] intel/compiler: lower 16-bit flrp
Reviewed-by: Jason Ekstrand --- src/intel/compiler/brw_compiler.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/intel/compiler/brw_compiler.c b/src/intel/compiler/brw_compiler.c index f885e79c3e6..04a1a7cac4e 100644 --- a/src/intel/compiler/brw_compiler.c +++ b/src/intel/compiler/brw_compiler.c @@ -33,6 +33,7 @@ .lower_sub = true, \ .lower_fdiv = true,\ .lower_scmp = true,\ + .lower_flrp16 = true, \ .lower_fmod16 = true, \ .lower_fmod32 = true, \ .lower_fmod64 = false, \ -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v3 29/42] intel/compiler: handle conversions between int and half-float on atom
Reviewed-by: Topi Pohjolainen --- src/intel/compiler/brw_fs_nir.cpp | 13 + 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index e454578d99b..a739562c3ab 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -784,13 +784,20 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr) */ case nir_op_f2f16: - case nir_op_i2f16: - case nir_op_u2f16: case nir_op_i2i8: case nir_op_u2u8: + assert(type_sz(op[0].type) < 8); /* brw_nir_lower_conversions */ + inst = bld.MOV(result, op[0]); + inst->saturate = instr->dest.saturate; + break; + + case nir_op_i2f16: + case nir_op_u2f16: case nir_op_f2i8: case nir_op_f2u8: assert(type_sz(op[0].type) < 8); /* brw_nir_lower_conversions */ + case nir_op_f2i16: + case nir_op_f2u16: inst = bld.MOV(result, op[0]); inst->saturate = instr->dest.saturate; break; @@ -822,8 +829,6 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr) case nir_op_f2f32: case nir_op_f2i32: case nir_op_f2u32: - case nir_op_f2i16: - case nir_op_f2u16: case nir_op_i2i32: case nir_op_u2u32: case nir_op_i2i16: -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v3 31/42] intel/compiler: ask for an integer type if requesting an 8-bit type
--- src/intel/compiler/brw_fs_nir.cpp | 9 +++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index a3d193b8a44..ccf1891b925 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -346,7 +346,9 @@ fs_visitor::nir_emit_impl(nir_function_impl *impl) reg->num_array_elems == 0 ? 1 : reg->num_array_elems; unsigned size = array_elems * reg->num_components; const brw_reg_type reg_type = - brw_reg_type_from_bit_size(reg->bit_size, BRW_REGISTER_TYPE_F); + brw_reg_type_from_bit_size(reg->bit_size, +reg->bit_size == 8 ? BRW_REGISTER_TYPE_D : + BRW_REGISTER_TYPE_F); nir_locals[reg->index] = bld.vgrf(reg_type, size); } @@ -4281,7 +4283,10 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr fs_reg value = get_nir_src(instr->src[0]); if (instr->intrinsic == nir_intrinsic_vote_feq) { const unsigned bit_size = nir_src_bit_size(instr->src[0]); - value.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_F); + value.type = +brw_reg_type_from_bit_size(bit_size, + bit_size == 8 ? BRW_REGISTER_TYPE_D : + BRW_REGISTER_TYPE_F); } fs_reg uniformized = bld.emit_uniformize(value); -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v3 06/42] intel/compiler: lower some 16-bit float operations to 32-bit
The hardware doesn't support half-float for these. Reviewed-by: Topi Pohjolainen Reviewed-by: Jason Ekstrand --- src/intel/compiler/brw_nir.c | 5 + 1 file changed, 5 insertions(+) diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c index 572ab824a94..f0fe7f870c2 100644 --- a/src/intel/compiler/brw_nir.c +++ b/src/intel/compiler/brw_nir.c @@ -637,6 +637,11 @@ lower_bit_size_callback(const nir_alu_instr *alu, UNUSED void *data) case nir_op_irem: case nir_op_udiv: case nir_op_umod: + case nir_op_fceil: + case nir_op_ffloor: + case nir_op_ffract: + case nir_op_fround_even: + case nir_op_ftrunc: return 32; default: return 0; -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v3 23/42] intel/compiler: fix ddx and ddy for 16-bit float
We were assuming 32-bit elements. Also, In SIMD8 we pack 2 vector components in a single SIMD register, so for example, component Y of a 16-bit vec2 starts is at byte offset 16B. This means that when we compute the offset of the elements to be differentiated we should not stomp whatever base offset we have, but instead add to it. v2 - Use byte_offset() helper (Jason) - Merge the fix for SIMD8: using byte_offset() fixes that too. Reviewed-by: Jason Ekstrand (v1) --- src/intel/compiler/brw_fs_generator.cpp | 37 - 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/src/intel/compiler/brw_fs_generator.cpp b/src/intel/compiler/brw_fs_generator.cpp index 5fc6cf5f8cc..d0cc4a6d231 100644 --- a/src/intel/compiler/brw_fs_generator.cpp +++ b/src/intel/compiler/brw_fs_generator.cpp @@ -1315,10 +1315,9 @@ fs_generator::generate_ddx(const fs_inst *inst, width = BRW_WIDTH_4; } - struct brw_reg src0 = src; + struct brw_reg src0 = byte_offset(src, type_sz(src.type));; struct brw_reg src1 = src; - src0.subnr = sizeof(float); src0.vstride = vstride; src0.width = width; src0.hstride = BRW_HORIZONTAL_STRIDE_0; @@ -1337,23 +1336,25 @@ void fs_generator::generate_ddy(const fs_inst *inst, struct brw_reg dst, struct brw_reg src) { + const uint32_t type_size = type_sz(src.type); + if (inst->opcode == FS_OPCODE_DDY_FINE) { /* produce accurate derivatives */ if (devinfo->gen >= 11) { src = stride(src, 0, 2, 1); - struct brw_reg src_0 = byte_offset(src, 0 * sizeof(float)); - struct brw_reg src_2 = byte_offset(src, 2 * sizeof(float)); - struct brw_reg src_4 = byte_offset(src, 4 * sizeof(float)); - struct brw_reg src_6 = byte_offset(src, 6 * sizeof(float)); - struct brw_reg src_8 = byte_offset(src, 8 * sizeof(float)); - struct brw_reg src_10 = byte_offset(src, 10 * sizeof(float)); - struct brw_reg src_12 = byte_offset(src, 12 * sizeof(float)); - struct brw_reg src_14 = byte_offset(src, 14 * sizeof(float)); - - struct brw_reg dst_0 = byte_offset(dst, 0 * sizeof(float)); - struct 
brw_reg dst_4 = byte_offset(dst, 4 * sizeof(float)); - struct brw_reg dst_8 = byte_offset(dst, 8 * sizeof(float)); - struct brw_reg dst_12 = byte_offset(dst, 12 * sizeof(float)); + struct brw_reg src_0 = byte_offset(src, 0 * type_size); + struct brw_reg src_2 = byte_offset(src, 2 * type_size); + struct brw_reg src_4 = byte_offset(src, 4 * type_size); + struct brw_reg src_6 = byte_offset(src, 6 * type_size); + struct brw_reg src_8 = byte_offset(src, 8 * type_size); + struct brw_reg src_10 = byte_offset(src, 10 * type_size); + struct brw_reg src_12 = byte_offset(src, 12 * type_size); + struct brw_reg src_14 = byte_offset(src, 14 * type_size); + + struct brw_reg dst_0 = byte_offset(dst, 0 * type_size); + struct brw_reg dst_4 = byte_offset(dst, 4 * type_size); + struct brw_reg dst_8 = byte_offset(dst, 8 * type_size); + struct brw_reg dst_12 = byte_offset(dst, 12 * type_size); brw_push_insn_state(p); brw_set_default_exec_size(p, BRW_EXECUTE_4); @@ -1380,10 +1381,8 @@ fs_generator::generate_ddy(const fs_inst *inst, } } else { /* replicate the derivative at the top-left pixel to other pixels */ - struct brw_reg src0 = stride(src, 4, 4, 0); - struct brw_reg src1 = stride(src, 4, 4, 0); - src0.subnr = 0 * sizeof(float); - src1.subnr = 2 * sizeof(float); + struct brw_reg src0 = byte_offset(stride(src, 4, 4, 0), 0 * type_size); + struct brw_reg src1 = byte_offset(stride(src, 4, 4, 0), 2 * type_size); brw_ADD(p, dst, negate(src0), src1); } -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v3 37/42] intel/compiler: add a brw_reg_type_is_integer helper
Reviewed-by: Jason Ekstrand --- src/intel/compiler/brw_reg_type.h | 18 ++ 1 file changed, 18 insertions(+) diff --git a/src/intel/compiler/brw_reg_type.h b/src/intel/compiler/brw_reg_type.h index ffbec90d3fe..a3365b7e34c 100644 --- a/src/intel/compiler/brw_reg_type.h +++ b/src/intel/compiler/brw_reg_type.h @@ -82,6 +82,24 @@ brw_reg_type_is_floating_point(enum brw_reg_type type) } } +static inline bool +brw_reg_type_is_integer(enum brw_reg_type type) +{ + switch (type) { + case BRW_REGISTER_TYPE_Q: + case BRW_REGISTER_TYPE_UQ: + case BRW_REGISTER_TYPE_D: + case BRW_REGISTER_TYPE_UD: + case BRW_REGISTER_TYPE_W: + case BRW_REGISTER_TYPE_UW: + case BRW_REGISTER_TYPE_B: + case BRW_REGISTER_TYPE_UV: + return true; + default: + return false; + } +} + unsigned brw_reg_type_to_hw_type(const struct gen_device_info *devinfo, enum brw_reg_file file, enum brw_reg_type type); -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v3 25/42] intel/compiler: workaround for SIMD8 half-float MAD in gen8
Broadwell hardware has a bug that manifests in SIMD8 executions of 16-bit MAD instructions when any of the sources is a Y or W component. We pack these components in the same SIMD register as components X and Z respectively, but starting at offset 16B (so they live in the second half of the register). The problem does not exist in SKL or later. We work around this issue by moving any such sources to a temporary starting at offset 0B. We want to do this after the main optimization loop to prevent copy-propagation and friends to undo the fix. Reviewed-by: Topi Pohjolainen --- src/intel/compiler/brw_fs.cpp | 48 +++ src/intel/compiler/brw_fs.h | 1 + 2 files changed, 49 insertions(+) diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index 0b3ec94e2d2..d6096cd667d 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -6540,6 +6540,48 @@ fs_visitor::optimize() validate(); } +/** + * Broadwell hardware has a bug that manifests in SIMD8 executions of 16-bit + * MAD instructions when any of the sources is a Y or W component. We pack + * these components in the same SIMD register as components X and Z + * respectively, but starting at offset 16B (so they live in the second half + * of the register). + * + * We work around this issue by moving any such sources to a temporary + * starting at offset 0B. We want to do this after the main optimization loop + * to prevent copy-propagation and friends to undo the fix. 
+ */ +void +fs_visitor::fixup_hf_mad() +{ + if (devinfo->gen != 8) + return; + + bool progress = false; + + foreach_block_and_inst_safe (block, fs_inst, inst, cfg) { + if (inst->opcode != BRW_OPCODE_MAD || + inst->dst.type != BRW_REGISTER_TYPE_HF || + inst->exec_size > 8) + continue; + + for (int i = 0; i < 3; i++) { + if (inst->src[i].offset > 0) { +assert(inst->src[i].type == BRW_REGISTER_TYPE_HF); +const fs_builder ibld = + bld.at(block, inst).exec_all().group(inst->exec_size, 0); +fs_reg tmp = ibld.vgrf(inst->src[i].type); +ibld.MOV(tmp, inst->src[i]); +inst->src[i] = tmp; +progress = true; + } + } + } + + if (progress) + invalidate_live_intervals(); +} + /** * Three source instruction must have a GRF/MRF destination register. * ARF NULL is not allowed. Fix that up by allocating a temporary GRF. @@ -6698,6 +6740,7 @@ fs_visitor::run_vs() assign_curb_setup(); assign_vs_urb_setup(); + fixup_hf_mad(); fixup_3src_null_dest(); allocate_registers(8, true); @@ -6782,6 +6825,7 @@ fs_visitor::run_tcs_single_patch() assign_curb_setup(); assign_tcs_single_patch_urb_setup(); + fixup_hf_mad(); fixup_3src_null_dest(); allocate_registers(8, true); @@ -6816,6 +6860,7 @@ fs_visitor::run_tes() assign_curb_setup(); assign_tes_urb_setup(); + fixup_hf_mad(); fixup_3src_null_dest(); allocate_registers(8, true); @@ -6865,6 +6910,7 @@ fs_visitor::run_gs() assign_curb_setup(); assign_gs_urb_setup(); + fixup_hf_mad(); fixup_3src_null_dest(); allocate_registers(8, true); @@ -6965,6 +7011,7 @@ fs_visitor::run_fs(bool allow_spilling, bool do_rep_send) assign_urb_setup(); + fixup_hf_mad(); fixup_3src_null_dest(); allocate_registers(8, allow_spilling); @@ -7009,6 +7056,7 @@ fs_visitor::run_cs(unsigned min_dispatch_width) assign_curb_setup(); + fixup_hf_mad(); fixup_3src_null_dest(); allocate_registers(min_dispatch_width, true); diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h index 68287bcdcea..1879d4bc7f7 100644 --- a/src/intel/compiler/brw_fs.h +++ 
b/src/intel/compiler/brw_fs.h @@ -103,6 +103,7 @@ public: void setup_vs_payload(); void setup_gs_payload(); void setup_cs_payload(); + void fixup_hf_mad(); void fixup_3src_null_dest(); void assign_curb_setup(); void calculate_urb_setup(); -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v3 33/42] compiler/spirv: add support for Float16 and Int8 capabilities
v2: - Merge Float16 and Int8 capabilities into a single patch (Jason) Reviewed-by: Jason Ekstrand (v1) --- src/compiler/shader_info.h| 2 ++ src/compiler/spirv/spirv_to_nir.c | 8 ++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/compiler/shader_info.h b/src/compiler/shader_info.h index 87a2c805d37..1d45433312a 100644 --- a/src/compiler/shader_info.h +++ b/src/compiler/shader_info.h @@ -37,12 +37,14 @@ struct spirv_supported_capabilities { bool descriptor_array_dynamic_indexing; bool device_group; bool draw_parameters; + bool float16; bool float64; bool geometry_streams; bool gcn_shader; bool image_ms_array; bool image_read_without_format; bool image_write_without_format; + bool int8; bool int16; bool int64; bool int64_atomics; diff --git a/src/compiler/spirv/spirv_to_nir.c b/src/compiler/spirv/spirv_to_nir.c index 76a997ee341..731b1cbea5b 100644 --- a/src/compiler/spirv/spirv_to_nir.c +++ b/src/compiler/spirv/spirv_to_nir.c @@ -3518,8 +3518,6 @@ vtn_handle_preamble_instruction(struct vtn_builder *b, SpvOp opcode, case SpvCapabilityLinkage: case SpvCapabilityVector16: case SpvCapabilityFloat16Buffer: - case SpvCapabilityFloat16: - case SpvCapabilityInt8: case SpvCapabilitySparseResidency: vtn_warn("Unsupported SPIR-V capability: %s", spirv_capability_to_string(cap)); @@ -3536,12 +3534,18 @@ vtn_handle_preamble_instruction(struct vtn_builder *b, SpvOp opcode, case SpvCapabilityFloat64: spv_check_supported(float64, cap); break; + case SpvCapabilityFloat16: + spv_check_supported(float16, cap); + break; case SpvCapabilityInt64: spv_check_supported(int64, cap); break; case SpvCapabilityInt16: spv_check_supported(int16, cap); break; + case SpvCapabilityInt8: + spv_check_supported(int8, cap); + break; case SpvCapabilityTransformFeedback: spv_check_supported(transform_feedback, cap); -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v3 14/42] compiler/nir: add lowering for 16-bit ldexp
v2 (Topi): - Make bit-size handling order be 16-bit, 32-bit, 64-bit - Clamp lower exponent range at -28 instead of -30. Reviewed-by: Topi Pohjolainen Reviewed-by: Jason Ekstrand --- src/compiler/nir/nir_opt_algebraic.py | 9 +++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py index 40eb3de02c3..71c626e1b3f 100644 --- a/src/compiler/nir/nir_opt_algebraic.py +++ b/src/compiler/nir/nir_opt_algebraic.py @@ -790,7 +790,9 @@ for x, y in itertools.product(['f', 'u', 'i'], ['f', 'u', 'i']): def fexp2i(exp, bits): # We assume that exp is already in the right range. - if bits == 32: + if bits == 16: + return ('i2i16', ('ishl', ('iadd', exp, 15), 10)) + elif bits == 32: return ('ishl', ('iadd', exp, 127), 23) elif bits == 64: return ('pack_64_2x32_split', 0, ('ishl', ('iadd', exp, 1023), 20)) @@ -808,7 +810,9 @@ def ldexp(f, exp, bits): # handles a range on exp of [-252, 254] which allows you to create any # value (including denorms if the hardware supports it) and to adjust the # exponent of any normal value to anything you want. - if bits == 32: + if bits == 16: + exp = ('imin', ('imax', exp, -28), 30) + elif bits == 32: exp = ('imin', ('imax', exp, -252), 254) elif bits == 64: exp = ('imin', ('imax', exp, -2044), 2046) @@ -828,6 +832,7 @@ def ldexp(f, exp, bits): return ('fmul', ('fmul', f, pow2_1), pow2_2) optimizations += [ + (('ldexp@16', 'x', 'exp'), ldexp('x', 'exp', 16), 'options->lower_ldexp'), (('ldexp@32', 'x', 'exp'), ldexp('x', 'exp', 32), 'options->lower_ldexp'), (('ldexp@64', 'x', 'exp'), ldexp('x', 'exp', 64), 'options->lower_ldexp'), ] -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v3 42/42] intel/compiler: allow propagating HF immediates to MAD/LRP
Even if we don't do 3-src algebraic optimizations for MAD and LRP in the backend any more, the combine constants pass can still do a fine job grouping these constants into single registers for better register pressure. v2: - updated comment to reference register pressure benefits rather than algebraic optimizations. --- src/intel/compiler/brw_fs_copy_propagation.cpp | 16 +++- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/src/intel/compiler/brw_fs_copy_propagation.cpp b/src/intel/compiler/brw_fs_copy_propagation.cpp index 4e20ddb683a..5695678b766 100644 --- a/src/intel/compiler/brw_fs_copy_propagation.cpp +++ b/src/intel/compiler/brw_fs_copy_propagation.cpp @@ -772,16 +772,14 @@ fs_visitor::try_constant_propagate(fs_inst *inst, acp_entry *entry) case BRW_OPCODE_MAD: case BRW_OPCODE_LRP: - /* 3-src instructions can't take IMM registers, however, for 32-bit - * floating instructions we rely on the combine constants pass to fix - * it up. For anything else, we shouldn't be promoting immediates - * until we can make the pass capable of combining constants of - * different sizes. + /* 3-src instructions can't take IMM registers, but we allow this + * here anyway and rely on the combine constants pass to fix it up + * later, hopefully leading to better register pressure. */ - if (val.type == BRW_REGISTER_TYPE_F) { -inst->src[i] = val; -progress = true; - } + assert(val.type == BRW_REGISTER_TYPE_F || +val.type == BRW_REGISTER_TYPE_HF); + inst->src[i] = val; + progress = true; break; default: -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v3 18/42] intel/compiler: add a helper function to query hardware type table
We open coded this in a couple of places, so a helper function is probably sensible. Plus it makes it more consistent with the 3src hardware type case. Suggested-by: Topi Pohjolainen --- src/intel/compiler/brw_reg_type.c | 34 --- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/src/intel/compiler/brw_reg_type.c b/src/intel/compiler/brw_reg_type.c index 09b3ea61d4c..0c9f522eca0 100644 --- a/src/intel/compiler/brw_reg_type.c +++ b/src/intel/compiler/brw_reg_type.c @@ -193,6 +193,20 @@ static const struct hw_3src_type { #undef E }; +static inline const struct hw_type * +get_hw_type_map(const struct gen_device_info *devinfo, uint32_t *size) +{ + if (devinfo->gen >= 11) { + if (size) + *size = ARRAY_SIZE(gen11_hw_type); + return gen11_hw_type; + } else { + if (size) + *size = ARRAY_SIZE(gen4_hw_type); + return gen4_hw_type; + } +} + /** * Convert a brw_reg_type enumeration value into the hardware representation. * @@ -203,16 +217,10 @@ brw_reg_type_to_hw_type(const struct gen_device_info *devinfo, enum brw_reg_file file, enum brw_reg_type type) { - const struct hw_type *table; - - if (devinfo->gen >= 11) { - assert(type < ARRAY_SIZE(gen11_hw_type)); - table = gen11_hw_type; - } else { - assert(type < ARRAY_SIZE(gen4_hw_type)); - table = gen4_hw_type; - } + uint32_t table_size; + const struct hw_type *table = get_hw_type_map(devinfo, &table_size); + assert(type < table_size); assert(devinfo->has_64bit_types || brw_reg_type_to_size(type) < 8 || type == BRW_REGISTER_TYPE_NF); @@ -234,13 +242,7 @@ enum brw_reg_type brw_hw_type_to_reg_type(const struct gen_device_info *devinfo, enum brw_reg_file file, unsigned hw_type) { - const struct hw_type *table; - - if (devinfo->gen >= 11) { - table = gen11_hw_type; - } else { - table = gen4_hw_type; - } + const struct hw_type *table = get_hw_type_map(devinfo, NULL); if (file == BRW_IMMEDIATE_VALUE) { for (enum brw_reg_type i = 0; i <= BRW_REGISTER_TYPE_LAST; i++) { -- 2.17.1 ___ mesa-dev mailing list 
mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v3 05/42] intel/compiler: assert restrictions on conversions to half-float
There are some hardware restrictions that brw_nir_lower_conversions should have taken care of before we get here. v2: - rebased on top of regioning lowering pass Reviewed-by: Topi Pohjolainen (v1) --- src/intel/compiler/brw_fs_nir.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index e1d0e318b35..d742f55a957 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -784,6 +784,9 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr) */ case nir_op_f2f16: + case nir_op_i2f16: + case nir_op_u2f16: + assert(type_sz(op[0].type) < 8); /* brw_nir_lower_conversions */ inst = bld.MOV(result, op[0]); inst->saturate = instr->dest.saturate; break; @@ -821,8 +824,6 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr) case nir_op_u2u32: case nir_op_i2i16: case nir_op_u2u16: - case nir_op_i2f16: - case nir_op_u2f16: case nir_op_i2i8: case nir_op_u2u8: inst = bld.MOV(result, op[0]); -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v3 11/42] intel/compiler: lower 16-bit fmod
Reviewed-by: Jason Ekstrand --- src/intel/compiler/brw_compiler.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/intel/compiler/brw_compiler.c b/src/intel/compiler/brw_compiler.c index fe632c5badc..f885e79c3e6 100644 --- a/src/intel/compiler/brw_compiler.c +++ b/src/intel/compiler/brw_compiler.c @@ -33,6 +33,7 @@ .lower_sub = true, \ .lower_fdiv = true,\ .lower_scmp = true,\ + .lower_fmod16 = true, \ .lower_fmod32 = true, \ .lower_fmod64 = false, \ .lower_bitfield_extract = true,\ -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v3 04/42] intel/compiler: handle b2i/b2f with other integer conversion opcodes
Since we handle booleans as integers this makes more sense. v2: - rebased to incorporate new boolean conversion opcodes v3: - rebased on top regioning lowering pass Reviewed-by: Jason Ekstrand (v1) Reviewed-by: Topi Pohjolainen (v2) --- src/intel/compiler/brw_fs_nir.cpp | 16 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index a59debf2b78..e1d0e318b35 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -788,6 +788,14 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr) inst->saturate = instr->dest.saturate; break; + case nir_op_f2f64: + case nir_op_f2i64: + case nir_op_f2u64: + assert(type_sz(op[0].type) > 2); /* brw_nir_lower_conversions */ + inst = bld.MOV(result, op[0]); + inst->saturate = instr->dest.saturate; + break; + case nir_op_b2i8: case nir_op_b2i16: case nir_op_b2i32: @@ -798,14 +806,6 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr) op[0].type = BRW_REGISTER_TYPE_D; op[0].negate = !op[0].negate; /* fallthrough */ - case nir_op_f2f64: - case nir_op_f2i64: - case nir_op_f2u64: - assert(type_sz(op[0].type) > 2); /* brw_nir_lower_conversions */ - inst = bld.MOV(result, op[0]); - inst->saturate = instr->dest.saturate; - break; - case nir_op_i2f64: case nir_op_i2i64: case nir_op_u2f64: -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v3 15/42] intel/compiler: Extended Math is limited to SIMD8 on half-float
From the Skylake PRM, Extended Math Function: "The execution size must be no more than 8 when half-floats are used in source or destination operand." Earlier generations do not support Extended Math with half-float. v2: - Rewrite the code to make it more readable (Jason). v3: - Use if-ladders or just if+return exclusively (Topi). Reviewed-by: Topi Pohjolainen (v1) --- src/intel/compiler/brw_fs.cpp | 27 ++- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index 0359eb079f7..0b3ec94e2d2 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -5493,18 +5493,27 @@ get_lowered_simd_width(const struct gen_device_info *devinfo, case SHADER_OPCODE_EXP2: case SHADER_OPCODE_LOG2: case SHADER_OPCODE_SIN: - case SHADER_OPCODE_COS: + case SHADER_OPCODE_COS: { /* Unary extended math instructions are limited to SIMD8 on Gen4 and - * Gen6. + * Gen6. Extended Math Function is limited to SIMD8 with half-float. */ - return (devinfo->gen >= 7 ? MIN2(16, inst->exec_size) : - devinfo->gen == 5 || devinfo->is_g4x ? MIN2(16, inst->exec_size) : - MIN2(8, inst->exec_size)); + if (devinfo->gen == 6 || (devinfo->gen == 4 && !devinfo->is_g4x)) + return MIN2(8, inst->exec_size); + if (inst->dst.type == BRW_REGISTER_TYPE_HF) + return MIN2(8, inst->exec_size); + return MIN2(16, inst->exec_size); + } - case SHADER_OPCODE_POW: - /* SIMD16 is only allowed on Gen7+. */ - return (devinfo->gen >= 7 ? MIN2(16, inst->exec_size) : - MIN2(8, inst->exec_size)); + case SHADER_OPCODE_POW: { + /* SIMD16 is only allowed on Gen7+. 
Extended Math Function is limited + * to SIMD8 with half-float + */ + if (devinfo->gen < 7) + return MIN2(8, inst->exec_size); + if (inst->dst.type == BRW_REGISTER_TYPE_HF) + return MIN2(8, inst->exec_size); + return MIN2(16, inst->exec_size); + } case SHADER_OPCODE_INT_QUOTIENT: case SHADER_OPCODE_INT_REMAINDER: -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v3 08/42] intel/compiler: implement 16-bit fsign
v2: - make 16-bit be its own separate case (Jason) Reviewed-by: Topi Pohjolainen --- src/intel/compiler/brw_fs_nir.cpp | 18 +- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index d742f55a957..cf546b8ff09 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -844,7 +844,22 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr) : bld.MOV(result, brw_imm_f(1.0f)); set_predicate(BRW_PREDICATE_NORMAL, inst); - } else if (type_sz(op[0].type) < 8) { + } else if (type_sz(op[0].type) == 2) { + /* AND(val, 0x8000) gives the sign bit. + * + * Predicated OR ORs 1.0 (0x3c00) with the sign bit if val is not zero. + */ + fs_reg zero = retype(brw_imm_uw(0), BRW_REGISTER_TYPE_HF); + bld.CMP(bld.null_reg_f(), op[0], zero, BRW_CONDITIONAL_NZ); + + fs_reg result_int = retype(result, BRW_REGISTER_TYPE_UW); + op[0].type = BRW_REGISTER_TYPE_UW; + result.type = BRW_REGISTER_TYPE_UW; + bld.AND(result_int, op[0], brw_imm_uw(0x8000u)); + + inst = bld.OR(result_int, result_int, brw_imm_uw(0x3c00u)); + inst->predicate = BRW_PREDICATE_NORMAL; + } else if (type_sz(op[0].type) == 4) { /* AND(val, 0x8000) gives the sign bit. * * Predicated OR ORs 1.0 (0x3f80) with the sign bit if val is not @@ -866,6 +881,7 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr) * - The sign is encoded in the high 32-bit of each DF * - We need to produce a DF result. */ + assert(type_sz(op[0].type) == 8); fs_reg zero = vgrf(glsl_type::double_type); bld.MOV(zero, setup_imm_df(bld, 0.0)); -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v3 22/42] intel/compiler: don't propagate HF immediates to 3-src instructions
3-src instructions don't support immediates, but since 36bc5f06dd22, we allow them on MAD and LRP relying on the combine constants pass to fix it up later. However, that pass is specialized for 32-bit float immediates and can't handle HF constants at present, so this patch ensures that copy-propagation only does this for 32-bit constants. Reviewed-by: Topi Pohjolainen --- src/intel/compiler/brw_fs_copy_propagation.cpp | 12 ++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/intel/compiler/brw_fs_copy_propagation.cpp b/src/intel/compiler/brw_fs_copy_propagation.cpp index c23ce1ef426..77f2749ba04 100644 --- a/src/intel/compiler/brw_fs_copy_propagation.cpp +++ b/src/intel/compiler/brw_fs_copy_propagation.cpp @@ -772,8 +772,16 @@ fs_visitor::try_constant_propagate(fs_inst *inst, acp_entry *entry) case BRW_OPCODE_MAD: case BRW_OPCODE_LRP: - inst->src[i] = val; - progress = true; + /* 3-src instructions can't take IMM registers, however, for 32-bit + * floating instructions we rely on the combine constants pass to fix + * it up. For anything else, we shouldn't be promoting immediates + * until we can make the pass capable of combining constants of + * different sizes. + */ + if (val.type == BRW_REGISTER_TYPE_F) { +inst->src[i] = val; +progress = true; + } break; default: -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v3 24/42] intel/compiler: fix ddy for half-float in gen8
We use ALign16 mode for this, since it is more convenient, but the PRM for Broadwell states in Volume 3D Media GPGPU, Chapter 'Register region restrictions', Section '1. Special Restrictions': "In Align16 mode, the channel selects and channel enables apply to a pair of half-floats, because these parameters are defined for DWord elements ONLY. This is applicable when both source and destination are half-floats." This means that we cannot select individual HF elements using swizzles like we do with 32-bit floats so we can't implement the required regioning for this. Use the gen11 path for this instead, which uses Align1 mode. The restriction is not present in gen9 or gen10, where the Align16 implementation seems to work just fine. Reviewed-by: Jason Ekstrand --- src/intel/compiler/brw_fs_generator.cpp | 10 -- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/intel/compiler/brw_fs_generator.cpp b/src/intel/compiler/brw_fs_generator.cpp index d0cc4a6d231..4310f0b7fdc 100644 --- a/src/intel/compiler/brw_fs_generator.cpp +++ b/src/intel/compiler/brw_fs_generator.cpp @@ -1339,8 +1339,14 @@ fs_generator::generate_ddy(const fs_inst *inst, const uint32_t type_size = type_sz(src.type); if (inst->opcode == FS_OPCODE_DDY_FINE) { - /* produce accurate derivatives */ - if (devinfo->gen >= 11) { + /* produce accurate derivatives. We can do this easily in Align16 + * but this is not supported in gen11+ and gen8 Align16 swizzles + * for Half-Float operands work in units of 32-bit and always + * select pairs of consecutive half-float elements, so we can't use + * use it for this. + */ + if (devinfo->gen >= 11 || + (devinfo->gen == 8 && src.type == BRW_REGISTER_TYPE_HF)) { src = stride(src, 0, 2, 1); struct brw_reg src_0 = byte_offset(src, 0 * type_size); struct brw_reg src_2 = byte_offset(src, 2 * type_size); -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v3 00/42] intel: VK_KHR_shader_float16_int8 implementation
The changes in this version address review feedback to v2 and, most importantly, rebase on top of relevant changes in master, specifically Curro's regioning lowering pass. This new regioning pass simplifies some of the NIR translation code (specifically the code for translating regioning restrictions on conversions for atom platforms) making some of the previous work in this series unnecessary. The regioning restrictions for conversions between integer and half-float added with this series are now implemented as part of this framework instead of doing it at NIR translation time. This version of the series also dropped the SPIR-V compiler patches that have already been merged. As always, a branch with these patches is available for testing in the itoral/VK_KHR_shader_float16_int8 branch of the Igalia Mesa repository at https://github.com/Igalia/mesa. Iago Toral Quiroga (42): intel/compiler: handle conversions between int and half-float on atom intel/compiler: add a NIR pass to lower conversions intel/compiler: split float to 64-bit opcodes from int to 64-bit intel/compiler: handle b2i/b2f with other integer conversion opcodes intel/compiler: assert restrictions on conversions to half-float intel/compiler: lower some 16-bit float operations to 32-bit intel/compiler: lower 16-bit extended math to 32-bit prior to gen9 intel/compiler: implement 16-bit fsign intel/compiler: allow extended math functions with HF operands compiler/nir: add lowering option for 16-bit fmod intel/compiler: lower 16-bit fmod compiler/nir: add lowering for 16-bit flrp intel/compiler: lower 16-bit flrp compiler/nir: add lowering for 16-bit ldexp intel/compiler: Extended Math is limited to SIMD8 on half-float intel/compiler: add instruction setters for Src1Type and Src2Type. 
intel/compiler: add new half-float register type for 3-src instructions intel/compiler: add a helper function to query hardware type table intel/compiler: don't compact 3-src instructions with Src1Type or Src2Type bits intel/compiler: allow half-float on 3-source instructions since gen8 intel/compiler: set correct precision fields for 3-source float instructions intel/compiler: don't propagate HF immediates to 3-src instructions intel/compiler: fix ddx and ddy for 16-bit float intel/compiler: fix ddy for half-float in gen8 intel/compiler: workaround for SIMD8 half-float MAD in gen8 intel/compiler: split is_partial_write() into two variants intel/compiler: activate 16-bit bit-size lowerings also for 8-bit intel/compiler: handle 64-bit float to 8-bit integer conversions intel/compiler: handle conversions between int and half-float on atom intel/compiler: implement isign for int8 intel/compiler: ask for an integer type if requesting an 8-bit type intel/eu: force stride of 2 on NULL register for Byte instructions compiler/spirv: add support for Float16 and Int8 capabilities anv/pipeline: support Float16 and Int8 capabilities in gen8+ anv/device: expose shaderFloat16 and shaderInt8 in gen8+ intel/compiler: implement is_zero, is_one, is_negative_one for 8-bit/16-bit intel/compiler: add a brw_reg_type_is_integer helper intel/compiler: fix cmod propagation for non 32-bit types intel/compiler: remove MAD/LRP algebraic optimizations from the backend intel/compiler: support half-float in the combine constants pass intel/compiler: fix combine constants for Align16 with half-float prior to gen9 intel/compiler: allow propagating HF immediates to MAD/LRP src/compiler/nir/nir.h| 2 + src/compiler/nir/nir_opt_algebraic.py | 11 +- src/compiler/shader_info.h| 2 + src/compiler/spirv/spirv_to_nir.c | 8 +- src/intel/Makefile.sources| 1 + src/intel/compiler/brw_compiler.c | 2 + src/intel/compiler/brw_eu_compact.c | 5 +- src/intel/compiler/brw_eu_emit.c | 36 +++- 
src/intel/compiler/brw_fs.cpp | 143 ++-- src/intel/compiler/brw_fs.h | 1 + .../compiler/brw_fs_cmod_propagation.cpp | 34 ++-- .../compiler/brw_fs_combine_constants.cpp | 82 +++-- .../compiler/brw_fs_copy_propagation.cpp | 14 +- src/intel/compiler/brw_fs_cse.cpp | 3 +- .../compiler/brw_fs_dead_code_eliminate.cpp | 2 +- src/intel/compiler/brw_fs_generator.cpp | 47 +++--- src/intel/compiler/brw_fs_live_variables.cpp | 2 +- src/intel/compiler/brw_fs_nir.cpp | 85 -- src/intel/compiler/brw_fs_reg_allocate.cpp| 2 +- .../compiler/brw_fs_register_coalesce.cpp | 2 +- .../compiler/brw_fs_saturate_propagation.cpp | 7 +- src/intel/compiler/brw_fs_sel_peephole.cpp| 4 +- src/intel/compiler/brw_inst.h | 2 + src/intel/compiler/brw_ir_fs.h| 36 +++- src/intel/compiler/brw_nir.c
[Mesa-dev] [PATCH v3 01/42] intel/compiler: handle conversions between int and half-float on atom
v2: adapted to work with the new regioning lowering pass Reviewed-by: Topi Pohjolainen (v1) --- src/intel/compiler/brw_ir_fs.h | 33 ++--- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/src/intel/compiler/brw_ir_fs.h b/src/intel/compiler/brw_ir_fs.h index 3c23fb375e4..ba4d6a95720 100644 --- a/src/intel/compiler/brw_ir_fs.h +++ b/src/intel/compiler/brw_ir_fs.h @@ -497,9 +497,10 @@ is_unordered(const fs_inst *inst) } /** - * Return whether the following regioning restriction applies to the specified - * instruction. From the Cherryview PRM Vol 7. "Register Region - * Restrictions": + * Return whether one of the the following regioning restrictions apply to the + * specified instruction. + * + * From the Cherryview PRM Vol 7. "Register Region Restrictions": * * "When source or destination datatype is 64b or operation is integer DWord * multiply, regioning in Align1 must follow these rules: @@ -508,6 +509,14 @@ is_unordered(const fs_inst *inst) * 2. Regioning must ensure Src.Vstride = Src.Width * Src.Hstride. * 3. Source and Destination offset must be the same, except the case of * scalar source." + * + * From the Cherryview PRM Vol 7. "Register Region Restrictions": + * + *"Conversion between Integer and HF (Half Float) must be DWord + * aligned and strided by a DWord on the destination." + * + *The same restriction is listed for other hardware platforms, however, + *empirical testing suggests that only atom platforms are affected. 
*/ static inline bool has_dst_aligned_region_restriction(const gen_device_info *devinfo, @@ -518,10 +527,20 @@ has_dst_aligned_region_restriction(const gen_device_info *devinfo, (inst->opcode == BRW_OPCODE_MUL || inst->opcode == BRW_OPCODE_MAD); if (type_sz(inst->dst.type) > 4 || type_sz(exec_type) > 4 || - (type_sz(exec_type) == 4 && is_int_multiply)) - return devinfo->is_cherryview || gen_device_info_is_9lp(devinfo); - else - return false; + (type_sz(exec_type) == 4 && is_int_multiply)) { + if (devinfo->is_cherryview || gen_device_info_is_9lp(devinfo)) + return true; + } + + const bool dst_type_is_hf = inst->dst.type == BRW_REGISTER_TYPE_HF; + const bool exec_type_is_hf = exec_type == BRW_REGISTER_TYPE_HF; + if ((dst_type_is_hf && !brw_reg_type_is_floating_point(exec_type)) || + (exec_type_is_hf && !brw_reg_type_is_floating_point(inst->dst.type))) { + if (devinfo->is_cherryview || gen_device_info_is_9lp(devinfo)) + return true; + } + + return false; } #endif -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v3 09/42] intel/compiler: allow extended math functions with HF operands
The PRM states that half-float operands are supported since gen9. Reviewed-by: Topi Pohjolainen --- src/intel/compiler/brw_eu_emit.c | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/intel/compiler/brw_eu_emit.c b/src/intel/compiler/brw_eu_emit.c index 45e2552783b..e21df4624b3 100644 --- a/src/intel/compiler/brw_eu_emit.c +++ b/src/intel/compiler/brw_eu_emit.c @@ -1874,8 +1874,10 @@ void gen6_math(struct brw_codegen *p, assert(src1.file == BRW_GENERAL_REGISTER_FILE || (devinfo->gen >= 8 && src1.file == BRW_IMMEDIATE_VALUE)); } else { - assert(src0.type == BRW_REGISTER_TYPE_F); - assert(src1.type == BRW_REGISTER_TYPE_F); + assert(src0.type == BRW_REGISTER_TYPE_F || + (src0.type == BRW_REGISTER_TYPE_HF && devinfo->gen >= 9)); + assert(src1.type == BRW_REGISTER_TYPE_F || + (src1.type == BRW_REGISTER_TYPE_HF && devinfo->gen >= 9)); } /* Source modifiers are ignored for extended math instructions on Gen6. */ -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v3 02/42] intel/compiler: add a NIR pass to lower conversions
Some conversions are not directly supported in hardware and need to be split in two conversion instructions going through an intermediary type. Doing this at the NIR level simplifies a bit the complexity in the backend. v2: - Consider fp16 rounding conversion opcodes - Properly handle swizzles on conversion sources. Reviewed-by: Topi Pohjolainen (v1) --- src/intel/Makefile.sources| 1 + src/intel/compiler/brw_nir.c | 1 + src/intel/compiler/brw_nir.h | 2 + .../compiler/brw_nir_lower_conversions.c | 158 ++ src/intel/compiler/meson.build| 1 + 5 files changed, 163 insertions(+) create mode 100644 src/intel/compiler/brw_nir_lower_conversions.c diff --git a/src/intel/Makefile.sources b/src/intel/Makefile.sources index 94a28d370e8..9975daa3ad1 100644 --- a/src/intel/Makefile.sources +++ b/src/intel/Makefile.sources @@ -83,6 +83,7 @@ COMPILER_FILES = \ compiler/brw_nir_analyze_boolean_resolves.c \ compiler/brw_nir_analyze_ubo_ranges.c \ compiler/brw_nir_attribute_workarounds.c \ + compiler/brw_nir_lower_conversions.c \ compiler/brw_nir_lower_cs_intrinsics.c \ compiler/brw_nir_lower_image_load_store.c \ compiler/brw_nir_lower_mem_access_bit_sizes.c \ diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c index 92d7fe4bede..572ab824a94 100644 --- a/src/intel/compiler/brw_nir.c +++ b/src/intel/compiler/brw_nir.c @@ -882,6 +882,7 @@ brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler, OPT(nir_opt_move_comparisons); OPT(nir_lower_bool_to_int32); + OPT(brw_nir_lower_conversions); OPT(nir_lower_locals_to_regs); diff --git a/src/intel/compiler/brw_nir.h b/src/intel/compiler/brw_nir.h index bc81950d47e..662b2627e95 100644 --- a/src/intel/compiler/brw_nir.h +++ b/src/intel/compiler/brw_nir.h @@ -114,6 +114,8 @@ void brw_nir_lower_tcs_outputs(nir_shader *nir, const struct brw_vue_map *vue, GLenum tes_primitive_mode); void brw_nir_lower_fs_outputs(nir_shader *nir); +bool brw_nir_lower_conversions(nir_shader *nir); + bool 
brw_nir_lower_image_load_store(nir_shader *nir, const struct gen_device_info *devinfo); void brw_nir_rewrite_image_intrinsic(nir_intrinsic_instr *intrin, diff --git a/src/intel/compiler/brw_nir_lower_conversions.c b/src/intel/compiler/brw_nir_lower_conversions.c new file mode 100644 index 000..583167c7753 --- /dev/null +++ b/src/intel/compiler/brw_nir_lower_conversions.c @@ -0,0 +1,158 @@ +/* + * Copyright © 2018 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include "brw_nir.h" +#include "compiler/nir/nir_builder.h" + +static nir_op +get_conversion_op(nir_alu_type src_type, + unsigned src_bit_size, + nir_alu_type dst_type, + unsigned dst_bit_size, + nir_rounding_mode rounding_mode) +{ + nir_alu_type src_full_type = (nir_alu_type) (src_type | src_bit_size); + nir_alu_type dst_full_type = (nir_alu_type) (dst_type | dst_bit_size); + + return nir_type_conversion_op(src_full_type, dst_full_type, rounding_mode); +} + +static nir_rounding_mode +get_opcode_rounding_mode(nir_op op) +{ + switch (op) { + case nir_op_f2f16_rtz: + return nir_rounding_mode_rtz; + case nir_op_f2f16_rtne: + return nir_rounding_mode_rtne; + default: + return nir_rounding_mode_undef; + } +} + +static void +split_conversion(nir_builder *b, nir_alu_instr *alu, nir_op op1, nir_op op2) +{ + b->cursor = nir_before_instr(&alu->instr); + assert(alu->dest.write_mask == 1); + nir_ssa_def *src = nir_ssa_for_alu_src(b, alu, 0); + nir_ssa_def *tmp = nir_build_alu(b, op1, src, NULL, NULL, NULL); + nir_ssa_def *res = nir_build_alu(b, op2, tmp, NULL, NULL, NUL
[Mesa-dev] [PATCH v4] anv/device: fix maximum number of images supported
We had defined MAX_IMAGES as 8, which we used to size the array for image push constant data. The comment there stated that this was for gen8, but anv_nir_apply_pipeline_layout runs for all gens and writes that array, asserting that we don't exceed that number of images, which imposes a limit of MAX_IMAGES on all gens. Furthermore, despite this, we are exposing up to 64 images per shader stage on all gens, gen8 included. This patch lowers the number of images we expose in gen8 to 8 and keeps 64 images for gen9+ while making sure that only pre-SKL gens use push constant space to handle images. v2: - <= instead of < in the assert (Eric, Lionel) - Change the way the assertion is written (Eric) v3: - Revert the way the assertion is written to the form it had in v1, the version in v2 was not equivalent and was incorrect. (Lionel) v4: - gen9+ doesn't need push constants for images at all (Jason) --- src/intel/vulkan/anv_device.c | 7 -- .../vulkan/anv_nir_apply_pipeline_layout.c| 4 +-- src/intel/vulkan/anv_private.h| 5 ++-- src/intel/vulkan/genX_cmd_buffer.c| 25 +-- 4 files changed, 28 insertions(+), 13 deletions(-) diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c index 523f1483e29..f85458b672e 100644 --- a/src/intel/vulkan/anv_device.c +++ b/src/intel/vulkan/anv_device.c @@ -987,9 +987,12 @@ void anv_GetPhysicalDeviceProperties( const uint32_t max_samplers = (devinfo->gen >= 8 || devinfo->is_haswell) ? 128 : 16; + const uint32_t max_images = devinfo->gen < 9 ? 
MAX_GEN8_IMAGES : MAX_IMAGES; + VkSampleCountFlags sample_counts = isl_device_get_sample_counts(&pdevice->isl_dev); + VkPhysicalDeviceLimits limits = { .maxImageDimension1D = (1 << 14), .maxImageDimension2D = (1 << 14), @@ -1009,7 +1012,7 @@ void anv_GetPhysicalDeviceProperties( .maxPerStageDescriptorUniformBuffers = 64, .maxPerStageDescriptorStorageBuffers = 64, .maxPerStageDescriptorSampledImages = max_samplers, - .maxPerStageDescriptorStorageImages = 64, + .maxPerStageDescriptorStorageImages = max_images, .maxPerStageDescriptorInputAttachments= 64, .maxPerStageResources = 250, .maxDescriptorSetSamplers = 6 * max_samplers, /* number of stages * maxPerStageDescriptorSamplers */ @@ -1018,7 +1021,7 @@ void anv_GetPhysicalDeviceProperties( .maxDescriptorSetStorageBuffers = 6 * 64, /* number of stages * maxPerStageDescriptorStorageBuffers */ .maxDescriptorSetStorageBuffersDynamic= MAX_DYNAMIC_BUFFERS / 2, .maxDescriptorSetSampledImages= 6 * max_samplers, /* number of stages * maxPerStageDescriptorSampledImages */ - .maxDescriptorSetStorageImages= 6 * 64, /* number of stages * maxPerStageDescriptorStorageImages */ + .maxDescriptorSetStorageImages= 6 * max_images, /* number of stages * maxPerStageDescriptorStorageImages */ .maxDescriptorSetInputAttachments = 256, .maxVertexInputAttributes = MAX_VBS, .maxVertexInputBindings = MAX_VBS, diff --git a/src/intel/vulkan/anv_nir_apply_pipeline_layout.c b/src/intel/vulkan/anv_nir_apply_pipeline_layout.c index b3daf702bc0..623984b0f8c 100644 --- a/src/intel/vulkan/anv_nir_apply_pipeline_layout.c +++ b/src/intel/vulkan/anv_nir_apply_pipeline_layout.c @@ -528,8 +528,8 @@ anv_nir_apply_pipeline_layout(const struct anv_physical_device *pdevice, } } - if (map->image_count > 0) { - assert(map->image_count <= MAX_IMAGES); + if (map->image_count > 0 && pdevice->compiler->devinfo->gen < 9) { + assert(map->image_count <= MAX_GEN8_IMAGES); assert(shader->num_uniforms == prog_data->nr_params * 4); state.first_image_uniform = 
shader->num_uniforms; uint32_t *param = brw_stage_prog_data_add_params(prog_data, diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h index 770254e93ea..47878adb066 100644 --- a/src/intel/vulkan/anv_private.h +++ b/src/intel/vulkan/anv_private.h @@ -157,7 +157,8 @@ struct gen_l3_config; #define MAX_SCISSORS16 #define MAX_PUSH_CONSTANTS_SIZE 128 #define MAX_DYNAMIC_BUFFERS 16 -#define MAX_IMAGES 8 +#define MAX_IMAGES 64 +#define MAX_GEN8_IMAGES 8 #define MAX_PUSH_DESCRIPTORS 32 /* Minimum requirement */ /* The kernel relocation API has a limitation of a 32-bit delta value @@ -1883,7 +1884,7 @@ struct anv_push_constants { uint32_t base_work_group_id[3]; /* Image data for image_load_store on pre-SKL */ - struct brw_image_param images[MAX_IMAGES]; + struct brw_image_param images[MAX_GEN8_IMAGES]; }; struct anv_dynamic_state { diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_b