Module: Mesa Branch: main Commit: 177b54ebe916ef97e3f47cc04c7e211cc4ba7d69 URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=177b54ebe916ef97e3f47cc04c7e211cc4ba7d69
Author: Rhys Perry <[email protected]> Date: Thu Jan 27 14:19:21 2022 +0000 aco/tests: add v_fma_mix tests Signed-off-by: Rhys Perry <[email protected]> Reviewed-by: Daniel Schürmann <[email protected]> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14769> --- src/amd/compiler/tests/helpers.cpp | 8 +- src/amd/compiler/tests/test_optimizer.cpp | 485 ++++++++++++++++++++++++++++++ 2 files changed, 489 insertions(+), 4 deletions(-) diff --git a/src/amd/compiler/tests/helpers.cpp b/src/amd/compiler/tests/helpers.cpp index fd18ee2cd21..fc1ca054348 100644 --- a/src/amd/compiler/tests/helpers.cpp +++ b/src/amd/compiler/tests/helpers.cpp @@ -344,14 +344,14 @@ Temp fsat(Temp src, Builder b) Temp ext_ushort(Temp src, unsigned idx, Builder b) { - return b.pseudo(aco_opcode::p_extract, b.def(v1), src, Operand::c32(idx), Operand::c32(16u), - Operand::c32(false)); + return b.pseudo(aco_opcode::p_extract, b.def(src.regClass()), src, Operand::c32(idx), + Operand::c32(16u), Operand::c32(false)); } Temp ext_ubyte(Temp src, unsigned idx, Builder b) { - return b.pseudo(aco_opcode::p_extract, b.def(v1), src, Operand::c32(idx), Operand::c32(8u), - Operand::c32(false)); + return b.pseudo(aco_opcode::p_extract, b.def(src.regClass()), src, Operand::c32(idx), + Operand::c32(8u), Operand::c32(false)); } VkDevice get_vk_device(enum chip_class chip_class) diff --git a/src/amd/compiler/tests/test_optimizer.cpp b/src/amd/compiler/tests/test_optimizer.cpp index 668d1ff50f0..3f1da2dbd9a 100644 --- a/src/amd/compiler/tests/test_optimizer.cpp +++ b/src/amd/compiler/tests/test_optimizer.cpp @@ -1158,3 +1158,488 @@ BEGIN_TEST(optimize.casts) finish_opt_test(); END_TEST +BEGIN_TEST(optimize.mad_mix.input_conv.basic) + for (unsigned i = GFX9; i <= GFX10; i++) { + //>> v1: %a, v2b: %a16 = p_startpgm + if (!setup_cs("v1 v2b", (chip_class)i)) + continue; + + Temp a = inputs[0]; + Temp a16 = inputs[1]; + + //! v1: %res0 = v_fma_mix_f32 %a, lo(%a16), -0 + //! p_unit_test 0, %res0 + writeout(0, fmul(a, f2f32(a16))); + + //! v1: %res1 = v_fma_mix_f32 1.0, %a, lo(%a16) + //! p_unit_test 1, %res1 + writeout(1, fadd(a, f2f32(a16))); + + //! v1: %res2 = v_fma_mix_f32 1.0, lo(%a16), %a + //! p_unit_test 2, %res2 + writeout(2, fadd(f2f32(a16), a)); + + //! v1: %res3 = v_fma_mix_f32 %a, %a, lo(%a16) + //! p_unit_test 3, %res3 + writeout(3, fma(a, a, f2f32(a16))); + + //! v1: %res4 = v_fma_mix_f32 %a, %a, lo(%a16) + //! p_unit_test 4, %res4 + writeout(4, fma(a, a, f2f32(a16))); + + finish_opt_test(); + } +END_TEST + +BEGIN_TEST(optimize.mad_mix.input_conv.precision) + for (unsigned i = GFX9; i <= GFX10; i++) { + //>> v1: %a, v2b: %a16 = p_startpgm + if (!setup_cs("v1 v2b", (chip_class)i)) + continue; + + Temp a = inputs[0]; + Temp a16 = inputs[1]; + + /* precise arithmetic */ + //~gfx9! v1: %res0_cvt = v_cvt_f32_f16 %a16 + //~gfx9! v1: (precise)%res0 = v_fma_f32 %a, %a, %res0_cvt + //~gfx10! v1: (precise)%res0 = v_fma_mix_f32 %a, %a, lo(%a16) + //! p_unit_test 0, %res0 + writeout(0, fma(a, a, f2f32(a16), bld.precise())); + + //! v2b: %res1_cvt = v_cvt_f16_f32 %a + //! v2b: (precise)%res1 = v_mul_f16 %a16, %res1_cvt + //! p_unit_test 1, %res1 + writeout(1, fmul(a16, f2f16(a), bld.precise())); + + //! v2b: %res2_cvt = v_cvt_f16_f32 %a + //! v2b: (precise)%res2 = v_add_f16 %a16, %res2_cvt + //! p_unit_test 2, %res2 + writeout(2, fadd(a16, f2f16(a), bld.precise())); + + //! v2b: %res3_cvt = v_cvt_f16_f32 %a + //! v2b: (precise)%res3 = v_fma_f16 %a16, %a16, %res3_cvt + //! p_unit_test 3, %res3 + writeout(3, fma(a16, a16, f2f16(a), bld.precise())); + + /* precise conversions */ + //! v2b: (precise)%res4_cvt = v_cvt_f16_f32 %a + //! v2b: %res4 = v_mul_f16 %a16, %res4_cvt + //! p_unit_test 4, %res4 + writeout(4, fmul(a16, f2f16(a, bld.precise()))); + + //! v2b: (precise)%res5_cvt = v_cvt_f16_f32 %a + //! v2b: %res5 = v_add_f16 %a16, %res5_cvt + //! p_unit_test 5, %res5 + writeout(5, fadd(a16, f2f16(a, bld.precise()))); + + //! v2b: (precise)%res6_cvt = v_cvt_f16_f32 %a + //! v2b: %res6 = v_fma_f16 %a16, %a16, %res6_cvt + //! p_unit_test 6, %res6 + writeout(6, fma(a16, a16, f2f16(a, bld.precise()))); + + finish_opt_test(); + } +END_TEST + +BEGIN_TEST(optimize.mad_mix.input_conv.modifiers) + for (unsigned i = GFX9; i <= GFX10; i++) { + //>> v1: %a, v2b: %a16 = p_startpgm + if (!setup_cs("v1 v2b", (chip_class)i)) + continue; + + Temp a = inputs[0]; + Temp a16 = inputs[1]; + + /* check whether modifiers are preserved when converting to VOP3P */ + //! v1: %res0 = v_fma_mix_f32 -%a, lo(%a16), -0 + //! p_unit_test 0, %res0 + writeout(0, fmul(fneg(a), f2f32(a16))); + + //! v1: %res1 = v_fma_mix_f32 |%a|, lo(%a16), -0 + //! p_unit_test 1, %res1 + writeout(1, fmul(fabs(a), f2f32(a16))); + + /* fneg modifiers */ + //! v1: %res2 = v_fma_mix_f32 %a, -lo(%a16), -0 + //! p_unit_test 2, %res2 + writeout(2, fmul(a, fneg(f2f32(a16)))); + + //! v1: %res3 = v_fma_mix_f32 %a, -lo(%a16), -0 + //! p_unit_test 3, %res3 + writeout(3, fmul(a, f2f32(fneg(a16)))); + + /* fabs modifiers */ + //! v1: %res4 = v_fma_mix_f32 %a, |lo(%a16)|, -0 + //! p_unit_test 4, %res4 + writeout(4, fmul(a, fabs(f2f32(a16)))); + + //! v1: %res5 = v_fma_mix_f32 %a, |lo(%a16)|, -0 + //! p_unit_test 5, %res5 + writeout(5, fmul(a, f2f32(fabs(a16)))); + + /* both fabs and fneg modifiers */ + //! v1: %res6 = v_fma_mix_f32 %a, -|lo(%a16)|, -0 + //! p_unit_test 6, %res6 + writeout(6, fmul(a, fneg(f2f32(fabs(a16))))); + + //! v1: %res7 = v_fma_mix_f32 %a, |lo(%a16)|, -0 + //! p_unit_test 7, %res7 + writeout(7, fmul(a, fabs(f2f32(fabs(a16))))); + + //! v1: %res8 = v_fma_mix_f32 %a, -|lo(%a16)|, -0 + //! p_unit_test 8, %res8 + writeout(8, fmul(a, fneg(fabs(f2f32(fabs(a16)))))); + + //! v1: %res9 = v_fma_mix_f32 %a, -|lo(%a16)|, -0 + //! p_unit_test 9, %res9 + writeout(9, fmul(a, f2f32(fneg(fabs(a16))))); + + //! v1: %res10 = v_fma_mix_f32 %a, |lo(%a16)|, -0 + //! p_unit_test 10, %res10 + writeout(10, fmul(a, fneg(f2f32(fneg(fabs(a16)))))); + + //! v1: %res11 = v_fma_mix_f32 %a, |lo(%a16)|, -0 + //! p_unit_test 11, %res11 + writeout(11, fmul(a, fabs(f2f32(fneg(fabs(a16)))))); + + //! v1: %res12 = v_fma_mix_f32 %a, -|lo(%a16)|, -0 + //! p_unit_test 12, %res12 + writeout(12, fmul(a, fneg(fabs(f2f32(fneg(fabs(a16))))))); + + /* sdwa */ + //! v1: %res13 = v_fma_mix_f32 lo(%a), %a, -0 + //! p_unit_test 13, %res13 + writeout(13, fmul(f2f32(ext_ushort(a, 0)), a)); + + //! v1: %res14 = v_fma_mix_f32 hi(%a), %a, -0 + //! p_unit_test 14, %res14 + writeout(14, fmul(f2f32(ext_ushort(a, 1)), a)); + + //! v1: %res15_cvt = v_cvt_f32_f16 %a dst_sel:uword0 src0_sel:dword + //! v1: %res15 = v_mul_f32 %res15_cvt, %a + //! p_unit_test 15, %res15 + writeout(15, fmul(ext_ushort(f2f32(a), 0), a)); + + //! v1: %res16_cvt = v_cvt_f32_f16 %a + //! v1: %res16 = v_mul_f32 %res16_cvt, %a dst_sel:dword src0_sel:uword1 src1_sel:dword + //! p_unit_test 16, %res16 + writeout(16, fmul(ext_ushort(f2f32(a), 1), a)); + + //! v1: %res17_cvt = v_cvt_f32_f16 %a dst_sel:dword src0_sel:ubyte2 + //! v1: %res17 = v_mul_f32 %res17_cvt, %a + //! p_unit_test 17, %res17 + writeout(17, fmul(f2f32(ext_ubyte(a, 2)), a)); + + finish_opt_test(); + } +END_TEST + +BEGIN_TEST(optimize.mad_mix.output_conv.basic) + for (unsigned i = GFX9; i <= GFX10; i++) { + //>> v1: %a, v1: %b, v1: %c, v2b: %a16, v2b: %b16 = p_startpgm + if (!setup_cs("v1 v1 v1 v2b v2b", (chip_class)i)) + continue; + + Temp a = inputs[0]; + Temp b = inputs[1]; + Temp c = inputs[2]; + Temp a16 = inputs[3]; + Temp b16 = inputs[4]; + + //! v2b: %res0 = v_fma_mixlo_f16 %a, %b, -0 + //! p_unit_test 0, %res0 + writeout(0, f2f16(fmul(a, b))); + + //! v2b: %res1 = v_fma_mixlo_f16 1.0, %a, %b + //! p_unit_test 1, %res1 + writeout(1, f2f16(fadd(a, b))); + + //! v2b: %res2 = v_fma_mixlo_f16 %a, %b, %c + //! p_unit_test 2, %res2 + writeout(2, f2f16(fma(a, b, c))); + + //! v2b: %res3 = v_fma_mixlo_f16 lo(%a16), %b, -0 + //! p_unit_test 3, %res3 + writeout(3, f2f16(fmul(f2f32(a16), b))); + + //! v2b: %res4 = v_fma_mixlo_f16 1.0, %a, lo(%b16) + //! p_unit_test 4, %res4 + writeout(4, f2f16(fadd(a, f2f32(b16)))); + + //! v2b: %res5 = v_fma_mixlo_f16 %a, lo(%b16), %c + //! p_unit_test 5, %res5 + writeout(5, f2f16(fma(a, f2f32(b16), c))); + + finish_opt_test(); + } +END_TEST + +BEGIN_TEST(optimize.mad_mix.output_conv.precision) + for (unsigned i = GFX9; i <= GFX10; i++) { + //>> v2b: %a16 = p_startpgm + if (!setup_cs("v2b", (chip_class)i)) + continue; + + Temp a16 = inputs[0]; + + //! v2b: %res0_tmp = v_mul_f16 %a16, %a16 + //! v1: (precise)%res0 = v_cvt_f32_f16 %res0_tmp + //! p_unit_test 0, %res0 + writeout(0, f2f32(fmul(a16, a16), bld.precise())); + + //! v2b: (precise)%res1_tmp = v_mul_f16 %a16, %a16 + //! v1: %res1 = v_cvt_f32_f16 %res1_tmp + //! p_unit_test 1, %res1 + writeout(1, f2f32(fmul(a16, a16, bld.precise()))); + + finish_opt_test(); + } +END_TEST + +BEGIN_TEST(optimize.mad_mix.output_conv.modifiers) + for (unsigned i = GFX9; i <= GFX10; i++) { + //>> v1: %a, v1: %b, v2b: %a16, v2b: %b16 = p_startpgm + if (!setup_cs("v1 v1 v2b v2b", (chip_class)i)) + continue; + + Temp a = inputs[0]; + Temp b = inputs[1]; + Temp a16 = inputs[2]; + Temp b16 = inputs[3]; + + /* fneg/fabs */ + //! v1: %res0_add = v_add_f32 %1, %2 + //! v2b: %res0 = v_cvt_f16_f32 |%res0_add| + //! p_unit_test 0, %res0 + writeout(0, f2f16(fabs(fadd(a, b)))); + + //! v1: %res1_add = v_add_f32 %1, %2 + //! v2b: %res1 = v_cvt_f16_f32 -%res1_add + //! p_unit_test 1, %res1 + writeout(1, f2f16(fneg(fadd(a, b)))); + + //! v2b: %res2_add = v_add_f16 %3, %4 + //! v1: %res2 = v_cvt_f32_f16 |%res2_add| + //! p_unit_test 2, %res2 + writeout(2, f2f32(fabs(fadd(a16, b16)))); + + //! v2b: %res3_add = v_add_f16 %3, %4 + //! v1: %res3 = v_cvt_f32_f16 -%res3_add + //! p_unit_test 3, %res3 + writeout(3, f2f32(fneg(fadd(a16, b16)))); + + /* sdwa */ + //! v2b: %res4_add = v_fma_mixlo_f16 1.0, %a, %b + //! v2b: %res4 = p_extract %res4_add, 0, 8, 0 + //! p_unit_test 4, %res4 + writeout(4, ext_ubyte(f2f16(fadd(a, b)), 0)); + + //! v1: %res5_mul = v_add_f32 %a, %b dst_sel:uword0 src0_sel:dword src1_sel:dword + //! v2b: %res5 = v_cvt_f16_f32 %res5_mul + //! p_unit_test 5, %res5 + writeout(5, f2f16(ext_ushort(fadd(a, b), 0))); + + finish_opt_test(); + } +END_TEST + +BEGIN_TEST(optimize.mad_mix.fma.basic) + for (unsigned i = GFX9; i <= GFX10; i++) { + //>> v1: %a, v1: %b, v1: %c, v2b: %a16, v2b: %c16 = p_startpgm + if (!setup_cs("v1 v1 v1 v2b v2b", (chip_class)i)) + continue; + + Temp a = inputs[0]; + Temp b = inputs[1]; + Temp c = inputs[2]; + Temp a16 = inputs[3]; + Temp c16 = inputs[4]; + + //! v1: %res0 = v_fma_mix_f32 lo(%a16), %b, %c + //! p_unit_test 0, %res0 + writeout(0, fadd(fmul(f2f32(a16), b), c)); + + //! v1: %res1 = v_fma_mix_f32 %a, %b, lo(%c16) + //! p_unit_test 1, %res1 + writeout(1, fadd(fmul(a, b), f2f32(c16))); + + /* omod/clamp check */ + //! v1: %res2_mul = v_fma_mix_f32 lo(%a16), %b, -0 + //! v1: %res2 = v_add_f32 %res2_mul, %c *2 + //! p_unit_test 2, %res2 + writeout(2, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000), fadd(fmul(f2f32(a16), b), c))); + + /* neg/abs modifiers */ + //! v1: %res3 = v_fma_mix_f32 -lo(%a16), %b, |lo(%c16)| + //! p_unit_test 3, %res3 + writeout(3, fadd(fmul(fneg(f2f32(a16)), b), fabs(f2f32(c16)))); + + //! v1: %res4 = v_fma_mix_f32 |%a|, |%b|, lo(%c16) + //! p_unit_test 4, %res4 + writeout(4, fadd(fabs(fmul(fneg(a), fneg(b))), f2f32(c16))); + + //! v1: %res5 = v_fma_mix_f32 %a, -%b, lo(%c16) + //! p_unit_test 5, %res5 + writeout(5, fadd(fneg(fmul(a, b)), f2f32(c16))); + + //! v1: %res6 = v_fma_mix_f32 |%a|, -|%b|, lo(%c16) + //! p_unit_test 6, %res6 + writeout(6, fadd(fneg(fabs(fmul(fneg(a), fneg(b)))), f2f32(c16))); + + /* output conversions */ + //! v2b: %res7 = v_fma_mixlo_f16 %a, %b, %c + //! p_unit_test 7, %res7 + writeout(7, f2f16(fadd(fmul(a, b), c))); + + finish_opt_test(); + } +END_TEST + +BEGIN_TEST(optimize.mad_mix.fma.precision) + for (unsigned i = GFX9; i <= GFX10; i++) { + //>> v1: %a, v1: %b, v1: %c, v2b: %a16, v2b: %b16 = p_startpgm + if (!setup_cs("v1 v1 v1 v2b v2b", (chip_class)i)) + continue; + + Temp a = inputs[0]; + Temp b = inputs[1]; + Temp c = inputs[2]; + Temp a16 = inputs[3]; + Temp b16 = inputs[4]; + + /* the optimization is precise for 32-bit on GFX9 */ + //~gfx9! v1: %res0 = v_fma_mix_f32 lo(%a16), %b, %c + //~gfx10! v1: (precise)%res0_tmp = v_fma_mix_f32 lo(%a16), %b, -0 + //~gfx10! v1: %res0 = v_add_f32 %res0_tmp, %c + //! p_unit_test 0, %res0 + writeout(0, fadd(fmul(f2f32(a16), b, bld.precise()), c)); + + //~gfx9! v1: (precise)%res1 = v_fma_mix_f32 lo(%a16), %b, %c + //~gfx10! v1: %res1_tmp = v_fma_mix_f32 lo(%a16), %b, -0 + //~gfx10! v1: (precise)%res1 = v_add_f32 %res1_tmp, %c + //! p_unit_test 1, %res1 + writeout(1, fadd(fmul(f2f32(a16), b), c, bld.precise())); + + /* never promote 16-bit arithmetic to 32-bit */ + //! v2b: %res2_tmp = v_cvt_f16_f32 %a + //! v2b: %res2 = v_add_f16 %res2_tmp, %b16 + //! p_unit_test 2, %res2 + writeout(2, fadd(f2f16(a), b16)); + + //! v2b: %res3_tmp = v_cvt_f16_f32 %a + //! v2b: %res3 = v_mul_f16 %res3_tmp, %b16 + //! p_unit_test 3, %res3 + writeout(3, fmul(f2f16(a), b16)); + + //! v2b: %res4_tmp = v_mul_f16 %a16, %b16 + //! v1: %res4 = v_cvt_f32_f16 %res4_tmp + //! p_unit_test 4, %res4 + writeout(4, f2f32(fmul(a16, b16))); + + //! v2b: %res5_tmp = v_add_f16 %a16, %b16 + //! v1: %res5 = v_cvt_f32_f16 %res5_tmp + //! p_unit_test 5, %res5 + writeout(5, f2f32(fadd(a16, b16))); + + //! v2b: %res6_tmp = v_fma_mixlo_f16 %a, %b, -0 + //! v2b: %res6 = v_add_f16 %res6_tmp, %a16 + //! p_unit_test 6, %res6 + writeout(6, fadd(f2f16(fmul(a, b)), a16)); + + //! v2b: %res7_tmp = v_mul_f16 %a16, %b16 + //! v1: %res7 = v_fma_mix_f32 1.0, lo(%res7_tmp), %c + //! p_unit_test 7, %res7 + writeout(7, fadd(f2f32(fmul(a16, b16)), c)); + + finish_opt_test(); + } +END_TEST + +BEGIN_TEST(optimize.mad_mix.clamp) + for (unsigned i = GFX9; i <= GFX10; i++) { + //>> v1: %a, v2b: %a16 = p_startpgm + if (!setup_cs("v1 v2b", (chip_class)i)) + continue; + + Temp a = inputs[0]; + Temp a16 = inputs[1]; + + //! v1: %res0 = v_fma_mix_f32 lo(%a16), %a, -0 clamp + //! p_unit_test 0, %res0 + writeout(0, fsat(fmul(f2f32(a16), a))); + + //! v2b: %res1 = v_fma_mixlo_f16 %a, %a, -0 clamp + //! p_unit_test 1, %res1 + writeout(1, f2f16(fsat(fmul(a, a)))); + + //! v2b: %res2 = v_fma_mixlo_f16 %a, %a, -0 clamp + //! p_unit_test 2, %res2 + writeout(2, fsat(f2f16(fmul(a, a)))); + + finish_opt_test(); + } +END_TEST + +BEGIN_TEST(optimize.mad_mix.cast) + for (unsigned i = GFX9; i <= GFX10; i++) { + //>> v1: %a, v2b: %a16 = p_startpgm + if (!setup_cs("v1 v2b", (chip_class)i)) + continue; + + Temp a = inputs[0]; + Temp a16 = inputs[1]; + + /* The optimizer copy-propagates v2b=p_extract_vector(v1, 0) and p_as_uniform, so the + * optimizer has to check compatibility. + */ + + //! v1: %res0_cvt = v_cvt_f32_f16 %a16 + //! v2b: %res0 = v_mul_f16 %res0_cvt, %a16 + //! p_unit_test 0, %res0 + writeout(0, fmul(u2u16(f2f32(a16)), a16)); + + //! v2b: %res1_cvt = v_cvt_f16_f32 %a + //! v1: %res1 = v_mul_f32 %res1_cvt, %a + //! p_unit_test 1, %res1 + writeout(1, fmul(bld.as_uniform(f2f16(a)), a)); + + //! v2b: %res2_mul = v_mul_f16 %a16, %a16 + //! v2b: %res2 = v_cvt_f16_f32 %res2_mul + //! p_unit_test 2, %res2 + writeout(2, f2f16(bld.as_uniform(fmul(a16, a16)))); + + //! v1: %res3_mul = v_mul_f32 %a, %a + //! v1: %res3 = v_cvt_f32_f16 %res3_mul + //! p_unit_test 3, %res3 + writeout(3, f2f32(u2u16(fmul(a, a)))); + + //! v1: %res4_mul = v_fma_mix_f32 lo(%a16), %a, -0 + //! v2b: %res4 = v_med3_f16 0, 1.0, %res4_mul + //! p_unit_test 4, %res4 + writeout(4, fsat(u2u16(fmul(f2f32(a16), a)))); + + //! v2b: %res5_mul = v_fma_mixlo_f16 %a, %a, -0 + //! v1: %res5 = v_med3_f32 0, 1.0, %res5_mul + //! p_unit_test 5, %res5 + writeout(5, fsat(bld.as_uniform(f2f16(fmul(a, a))))); + + //! v1: %res6_mul = v_mul_f32 %a, %a + //! v1: %res6 = v_fma_mix_f32 1.0, lo(%res6_mul), %a + //! p_unit_test 6, %res6 + writeout(6, fadd(f2f32(u2u16(fmul(a, a))), a)); + + //! v2b: %res7_mul = v_mul_f16 %a16, %a16 + //! v1: %res7 = v_fma_mix_f32 1.0, %res7_mul, lo(%a16) + //! p_unit_test 7, %res7 + writeout(7, fadd(bld.as_uniform(fmul(a16, a16)), f2f32(a16))); + + /* opsel_hi should be obtained from the original opcode, not the operand regclass */ + //! v1: %res8 = v_fma_mix_f32 lo(%a16), %a16, -0 + //! p_unit_test 8, %res8 + writeout(8, fmul(f2f32(a16), a16)); + + finish_opt_test(); + } +END_TEST +
