Module: Mesa Branch: master Commit: 98a52fecdaaac073943fb0f1322a29d01bfeb9c7 URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=98a52fecdaaac073943fb0f1322a29d01bfeb9c7
Author: Marek Olšák <[email protected]> Date: Mon May 11 02:42:18 2020 -0400 radeonsi: implement 16-bit FS color outputs This removes type conversions from 16 bits to 32 bits in the main function and then back to 16 bits in the epilog. Acked-by: Pierre-Eric Pelloux-Prayer <[email protected]> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6622> --- src/amd/llvm/ac_llvm_build.c | 22 +++++ src/amd/llvm/ac_llvm_build.h | 2 + src/amd/llvm/ac_nir_to_llvm.c | 1 + src/gallium/drivers/radeonsi/si_shader.c | 1 + src/gallium/drivers/radeonsi/si_shader.h | 10 ++ src/gallium/drivers/radeonsi/si_shader_llvm.c | 7 +- src/gallium/drivers/radeonsi/si_shader_llvm_ps.c | 115 ++++++++++++++++++----- src/gallium/drivers/radeonsi/si_shader_nir.c | 18 ++++ 8 files changed, 152 insertions(+), 24 deletions(-) diff --git a/src/amd/llvm/ac_llvm_build.c b/src/amd/llvm/ac_llvm_build.c index 9166ba8721c..506ea58ec97 100644 --- a/src/amd/llvm/ac_llvm_build.c +++ b/src/amd/llvm/ac_llvm_build.c @@ -2285,6 +2285,28 @@ LLVMValueRef ac_build_cvt_pknorm_u16(struct ac_llvm_context *ctx, LLVMValueRef a return LLVMBuildBitCast(ctx->builder, res, ctx->i32, ""); } +LLVMValueRef ac_build_cvt_pknorm_i16_f16(struct ac_llvm_context *ctx, + LLVMValueRef args[2]) +{ + LLVMTypeRef param_types[] = {ctx->f16, ctx->f16}; + LLVMTypeRef calltype = LLVMFunctionType(ctx->i32, param_types, 2, false); + LLVMValueRef code = LLVMConstInlineAsm(calltype, + "v_cvt_pknorm_i16_f16 $0, $1, $2", "=v,v,v", + false, false); + return LLVMBuildCall(ctx->builder, code, args, 2, ""); +} + +LLVMValueRef ac_build_cvt_pknorm_u16_f16(struct ac_llvm_context *ctx, + LLVMValueRef args[2]) +{ + LLVMTypeRef param_types[] = {ctx->f16, ctx->f16}; + LLVMTypeRef calltype = LLVMFunctionType(ctx->i32, param_types, 2, false); + LLVMValueRef code = LLVMConstInlineAsm(calltype, + "v_cvt_pknorm_u16_f16 $0, $1, $2", "=v,v,v", + false, false); + return LLVMBuildCall(ctx->builder, code, args, 2, ""); +} + /* The 8-bit and 10-bit clamping is for HW workarounds. */ LLVMValueRef ac_build_cvt_pk_i16(struct ac_llvm_context *ctx, LLVMValueRef args[2], unsigned bits, bool hi) diff --git a/src/amd/llvm/ac_llvm_build.h b/src/amd/llvm/ac_llvm_build.h index 756bbebd8f5..2e08a990b2d 100644 --- a/src/amd/llvm/ac_llvm_build.h +++ b/src/amd/llvm/ac_llvm_build.h @@ -432,6 +432,8 @@ LLVMValueRef ac_build_image_get_sample_count(struct ac_llvm_context *ctx, LLVMVa LLVMValueRef ac_build_cvt_pkrtz_f16(struct ac_llvm_context *ctx, LLVMValueRef args[2]); LLVMValueRef ac_build_cvt_pknorm_i16(struct ac_llvm_context *ctx, LLVMValueRef args[2]); LLVMValueRef ac_build_cvt_pknorm_u16(struct ac_llvm_context *ctx, LLVMValueRef args[2]); +LLVMValueRef ac_build_cvt_pknorm_i16_f16(struct ac_llvm_context *ctx, LLVMValueRef args[2]); +LLVMValueRef ac_build_cvt_pknorm_u16_f16(struct ac_llvm_context *ctx, LLVMValueRef args[2]); LLVMValueRef ac_build_cvt_pk_i16(struct ac_llvm_context *ctx, LLVMValueRef args[2], unsigned bits, bool hi); LLVMValueRef ac_build_cvt_pk_u16(struct ac_llvm_context *ctx, LLVMValueRef args[2], unsigned bits, diff --git a/src/amd/llvm/ac_nir_to_llvm.c b/src/amd/llvm/ac_nir_to_llvm.c index c86339205ee..c1411366871 100644 --- a/src/amd/llvm/ac_nir_to_llvm.c +++ b/src/amd/llvm/ac_nir_to_llvm.c @@ -2450,6 +2450,7 @@ static void visit_store_output(struct ac_nir_context *ctx, nir_intrinsic_instr * indir_index = get_src(ctx, offset); switch (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src))) { + case 16: case 32: break; case 64: diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 8a26ac0d06d..a83289ff1dd 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -2260,6 +2260,7 @@ void si_get_ps_epilog_key(struct si_shader *shader, union si_shader_part_key *ke struct si_shader_info *info = &shader->selector->info; memset(key, 0, sizeof(*key)); key->ps_epilog.colors_written = info->colors_written; + key->ps_epilog.color_types = info->output_color_types; key->ps_epilog.writes_z = info->writes_z; key->ps_epilog.writes_stencil = info->writes_stencil; key->ps_epilog.writes_samplemask = info->writes_samplemask; diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index d26f36a4388..da74404008e 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -315,6 +315,13 @@ struct si_compiler_ctx_state { bool is_debug_context; }; +enum si_color_output_type { + SI_TYPE_ANY32, + SI_TYPE_FLOAT16, + SI_TYPE_INT16, + SI_TYPE_UINT16, +}; + struct si_shader_info { shader_info base; @@ -330,6 +337,7 @@ struct si_shader_info { ubyte output_usagemask[PIPE_MAX_SHADER_OUTPUTS]; ubyte output_readmask[PIPE_MAX_SHADER_OUTPUTS]; ubyte output_streams[PIPE_MAX_SHADER_OUTPUTS]; + ubyte output_type[PIPE_MAX_SHADER_OUTPUTS]; /* enum nir_alu_type */ ubyte color_interpolate[2]; ubyte color_interpolate_loc[2]; @@ -341,6 +349,7 @@ struct si_shader_info { ubyte colors_read; /**< which color components are read by the FS */ ubyte colors_written; + uint16_t output_color_types; /**< Each bit pair is enum si_color_output_type */ bool color0_writes_all_cbufs; /**< gl_FragColor */ bool reads_samplemask; /**< does fragment shader read sample mask? */ bool reads_tess_factors; /**< If TES reads TESSINNER or TESSOUTER */ @@ -577,6 +586,7 @@ union si_shader_part_key { struct { struct si_ps_epilog_bits states; unsigned colors_written : 8; + unsigned color_types : 16; unsigned writes_z : 1; unsigned writes_stencil : 1; unsigned writes_samplemask : 1; diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm.c b/src/gallium/drivers/radeonsi/si_shader_llvm.c index ab3aed107e3..e5f14cc0c9c 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm.c @@ -450,8 +450,13 @@ bool si_nir_build_llvm(struct si_shader_context *ctx, struct nir_shader *nir) const struct si_shader_info *info = &ctx->shader->selector->info; for (unsigned i = 0; i < info->num_outputs; i++) { + LLVMTypeRef type = ctx->ac.f32; + + if (nir_alu_type_get_type_size(ctx->shader->selector->info.output_type[i]) == 16) + type = ctx->ac.f16; + for (unsigned j = 0; j < 4; j++) - ctx->abi.outputs[i * 4 + j] = ac_build_alloca_undef(&ctx->ac, ctx->ac.f32, ""); + ctx->abi.outputs[i * 4 + j] = ac_build_alloca_undef(&ctx->ac, type, ""); } ac_nir_translate(&ctx->ac, &ctx->abi, &ctx->args, nir); diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_ps.c b/src/gallium/drivers/radeonsi/si_shader_llvm_ps.c index 37711eefa04..4527a9c4a88 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm_ps.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm_ps.c @@ -209,6 +209,9 @@ static void si_alpha_test(struct si_shader_context *ctx, LLVMValueRef alpha) assert(cond); LLVMValueRef alpha_ref = LLVMGetParam(ctx->main_fn, SI_PARAM_ALPHA_REF); + if (LLVMTypeOf(alpha) == ctx->ac.f16) + alpha_ref = LLVMBuildFPTrunc(ctx->ac.builder, alpha_ref, ctx->ac.f16, ""); + LLVMValueRef alpha_pass = LLVMBuildFCmp(ctx->ac.builder, cond, alpha, alpha_ref, ""); ac_build_kill_if_false(&ctx->ac, alpha_pass); } else { @@ -233,6 +236,9 @@ static LLVMValueRef si_scale_alpha_by_sample_mask(struct si_shader_context *ctx, coverage = LLVMBuildFMul(ctx->ac.builder, coverage, LLVMConstReal(ctx->ac.f32, 1.0 / SI_NUM_SMOOTH_AA_SAMPLES), ""); + if (LLVMTypeOf(alpha) == ctx->ac.f16) + coverage = LLVMBuildFPTrunc(ctx->ac.builder, coverage, ctx->ac.f16, ""); + return LLVMBuildFMul(ctx->ac.builder, alpha, coverage, ""); } @@ -241,10 +247,36 @@ struct si_ps_exports { struct ac_export_args args[10]; }; +static LLVMValueRef pack_two_16bit(struct ac_llvm_context *ctx, LLVMValueRef args[2]) +{ + LLVMValueRef tmp = ac_build_gather_values(ctx, args, 2); + return LLVMBuildBitCast(ctx->builder, tmp, ctx->v2f16, ""); +} + +static LLVMValueRef get_color_32bit(struct si_shader_context *ctx, unsigned color_type, + LLVMValueRef value) +{ + switch (color_type) { + case SI_TYPE_FLOAT16: + return LLVMBuildFPExt(ctx->ac.builder, value, ctx->ac.f32, ""); + case SI_TYPE_INT16: + value = ac_to_integer(&ctx->ac, value); + value = LLVMBuildSExt(ctx->ac.builder, value, ctx->ac.i32, ""); + return ac_to_float(&ctx->ac, value); + case SI_TYPE_UINT16: + value = ac_to_integer(&ctx->ac, value); + value = LLVMBuildZExt(ctx->ac.builder, value, ctx->ac.i32, ""); + return ac_to_float(&ctx->ac, value); + case SI_TYPE_ANY32: + return value; + } + return NULL; +} + /* Initialize arguments for the shader export intrinsic */ static void si_llvm_init_ps_export_args(struct si_shader_context *ctx, LLVMValueRef *values, unsigned cbuf, unsigned compacted_mrt_index, - struct ac_export_args *args) + unsigned color_type, struct ac_export_args *args) { const struct si_shader_key *key = &ctx->shader->key; unsigned col_formats = key->part.ps.epilog.spi_shader_col_format; @@ -289,49 +321,65 @@ static void si_llvm_init_ps_export_args(struct si_shader_context *ctx, LLVMValue case V_028714_SPI_SHADER_32_R: args->enabled_channels = 1; /* writemask */ - args->out[0] = values[0]; + args->out[0] = get_color_32bit(ctx, color_type, values[0]); break; case V_028714_SPI_SHADER_32_GR: args->enabled_channels = 0x3; /* writemask */ - args->out[0] = values[0]; - args->out[1] = values[1]; + args->out[0] = get_color_32bit(ctx, color_type, values[0]); + args->out[1] = get_color_32bit(ctx, color_type, values[1]); break; case V_028714_SPI_SHADER_32_AR: if (ctx->screen->info.chip_class >= GFX10) { args->enabled_channels = 0x3; /* writemask */ - args->out[0] = values[0]; - args->out[1] = values[3]; + args->out[0] = get_color_32bit(ctx, color_type, values[0]); + args->out[1] = get_color_32bit(ctx, color_type, values[3]); } else { args->enabled_channels = 0x9; /* writemask */ - args->out[0] = values[0]; - args->out[3] = values[3]; + args->out[0] = get_color_32bit(ctx, color_type, values[0]); + args->out[3] = get_color_32bit(ctx, color_type, values[3]); } break; case V_028714_SPI_SHADER_FP16_ABGR: - packf = ac_build_cvt_pkrtz_f16; + if (color_type != SI_TYPE_ANY32) + packf = pack_two_16bit; + else + packf = ac_build_cvt_pkrtz_f16; break; case V_028714_SPI_SHADER_UNORM16_ABGR: - packf = ac_build_cvt_pknorm_u16; + if (color_type != SI_TYPE_ANY32) + packf = ac_build_cvt_pknorm_u16_f16; + else + packf = ac_build_cvt_pknorm_u16; break; case V_028714_SPI_SHADER_SNORM16_ABGR: - packf = ac_build_cvt_pknorm_i16; + if (color_type != SI_TYPE_ANY32) + packf = ac_build_cvt_pknorm_i16_f16; + else + packf = ac_build_cvt_pknorm_i16; break; case V_028714_SPI_SHADER_UINT16_ABGR: - packi = ac_build_cvt_pk_u16; + if (color_type != SI_TYPE_ANY32) + packf = pack_two_16bit; + else + packi = ac_build_cvt_pk_u16; break; case V_028714_SPI_SHADER_SINT16_ABGR: - packi = ac_build_cvt_pk_i16; + if (color_type != SI_TYPE_ANY32) + packf = pack_two_16bit; + else + packi = ac_build_cvt_pk_i16; break; case V_028714_SPI_SHADER_32_ABGR: - memcpy(&args->out[0], values, sizeof(values[0]) * 4); + for (unsigned i = 0; i < 4; i++) + args->out[i] = get_color_32bit(ctx, color_type, values[i]); break; } @@ -362,7 +410,7 @@ static void si_llvm_init_ps_export_args(struct si_shader_context *ctx, LLVMValue static bool si_export_mrt_color(struct si_shader_context *ctx, LLVMValueRef *color, unsigned index, unsigned compacted_mrt_index, unsigned samplemask_param, - bool is_last, struct si_ps_exports *exp) + bool is_last, unsigned color_type, struct si_ps_exports *exp) { int i; @@ -373,7 +421,7 @@ static bool si_export_mrt_color(struct si_shader_context *ctx, LLVMValueRef *col /* Alpha to one */ if (ctx->shader->key.part.ps.epilog.alpha_to_one) - color[3] = ctx->ac.f32_1; + color[3] = LLVMConstReal(LLVMTypeOf(color[0]), 1); /* Alpha test */ if (index == 0 && ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS) @@ -392,7 +440,8 @@ static bool si_export_mrt_color(struct si_shader_context *ctx, LLVMValueRef *col /* Get the export arguments, also find out what the last one is. */ for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) { - si_llvm_init_ps_export_args(ctx, color, c, compacted_mrt_index, &args[c]); + si_llvm_init_ps_export_args(ctx, color, c, compacted_mrt_index, + color_type, &args[c]); if (args[c].enabled_channels) { compacted_mrt_index++; last = c; @@ -415,7 +464,8 @@ static bool si_export_mrt_color(struct si_shader_context *ctx, LLVMValueRef *col struct ac_export_args args; /* Export */ - si_llvm_init_ps_export_args(ctx, color, index, compacted_mrt_index, &args); + si_llvm_init_ps_export_args(ctx, color, index, compacted_mrt_index, + color_type, &args); if (is_last) { args.valid_mask = 1; /* whether the EXEC mask is valid */ args.done = 1; /* DONE bit */ @@ -500,8 +550,17 @@ static void si_llvm_return_fs_outputs(struct ac_shader_abi *abi, unsigned max_ou if (!color[i][0]) continue; - for (j = 0; j < 4; j++) - ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, ""); + if (LLVMTypeOf(color[i][0]) == ctx->ac.f16) { + for (j = 0; j < 2; j++) { + LLVMValueRef tmp = ac_build_gather_values(&ctx->ac, &color[i][j * 2], 2); + tmp = LLVMBuildBitCast(builder, tmp, ctx->ac.f32, ""); + ret = LLVMBuildInsertValue(builder, ret, tmp, vgpr++, ""); + } + vgpr += 2; + } else { + for (j = 0; j < 4; j++) + ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, ""); + } } if (depth) ret = LLVMBuildInsertValue(builder, ret, depth, vgpr++, ""); @@ -868,13 +927,23 @@ void si_llvm_build_ps_epilog(struct si_shader_context *ctx, union si_shader_part while (colors_written) { LLVMValueRef color[4]; int output_index = u_bit_scan(&colors_written); + unsigned color_type = (key->ps_epilog.color_types >> (output_index * 2)) & 0x3; - for (i = 0; i < 4; i++) - color[i] = LLVMGetParam(ctx->main_fn, vgpr++); + if (color_type != SI_TYPE_ANY32) { + for (i = 0; i < 4; i++) { + color[i] = LLVMGetParam(ctx->main_fn, vgpr + i / 2); + color[i] = LLVMBuildBitCast(ctx->ac.builder, color[i], ctx->ac.v2f16, ""); + color[i] = ac_llvm_extract_elem(&ctx->ac, color[i], i % 2); + } + vgpr += 4; + } else { + for (i = 0; i < 4; i++) + color[i] = LLVMGetParam(ctx->main_fn, vgpr++); + } if (si_export_mrt_color(ctx, color, output_index, num_compacted_mrts, ctx->args.arg_count - 1, - output_index == last_color_export, &exp)) + output_index == last_color_export, color_type, &exp)) num_compacted_mrts++; } diff --git a/src/gallium/drivers/radeonsi/si_shader_nir.c b/src/gallium/drivers/radeonsi/si_shader_nir.c index 7b39c6511eb..8ed40441976 100644 --- a/src/gallium/drivers/radeonsi/si_shader_nir.c +++ b/src/gallium/drivers/radeonsi/si_shader_nir.c @@ -163,6 +163,11 @@ static void scan_io_usage(struct si_shader_info *info, nir_intrinsic_instr *intr } } + if (nir_intrinsic_has_type(intr)) + info->output_type[loc] = nir_intrinsic_type(intr); + else + info->output_type[loc] = nir_type_float32; + info->output_usagemask[loc] |= mask; info->num_outputs = MAX2(info->num_outputs, loc + 1); @@ -181,6 +186,13 @@ static void scan_io_usage(struct si_shader_info *info, nir_intrinsic_instr *intr if (semantic >= FRAG_RESULT_DATA0 && semantic <= FRAG_RESULT_DATA7) { unsigned index = semantic - FRAG_RESULT_DATA0; info->colors_written |= 1 << (index + i); + + if (nir_intrinsic_type(intr) == nir_type_float16) + info->output_color_types |= SI_TYPE_FLOAT16 << (index * 2); + else if (nir_intrinsic_type(intr) == nir_type_int16) + info->output_color_types |= SI_TYPE_INT16 << (index * 2); + else if (nir_intrinsic_type(intr) == nir_type_uint16) + info->output_color_types |= SI_TYPE_UINT16 << (index * 2); } break; } @@ -678,6 +690,12 @@ static void si_lower_nir(struct si_screen *sscreen, struct nir_shader *nir) NIR_PASS_V(nir, nir_lower_load_const_to_scalar); NIR_PASS_V(nir, nir_lower_var_copies); NIR_PASS_V(nir, nir_opt_access); + + if (nir->info.stage == MESA_SHADER_FRAGMENT && + sscreen->info.has_packed_math_16bit && + sscreen->b.get_shader_param(&sscreen->b, PIPE_SHADER_FRAGMENT, PIPE_SHADER_CAP_FP16)) + NIR_PASS_V(nir, nir_lower_mediump_outputs); + si_nir_opts(nir, true); /* Lower large variables that are always constant with load_constant _______________________________________________ mesa-commit mailing list [email protected] https://lists.freedesktop.org/mailman/listinfo/mesa-commit
