Or rather, have an additional tuning/selection flag that a core may choose to make -mrecip the default with -ffast-math.
Cheers, -- Evandro Menezes Austin, TX > -----Original Message----- > From: Andrew Pinski [mailto:pins...@gmail.com] > Sent: Saturday, July 18, 2015 3:26 > To: Benedikt Huber > Cc: GCC Patches; Philipp Tomsich; Kumar, Venkataramanan; Evandro Menezes > Subject: Re: [PATCH] 2015-07-14 Benedikt Huber <benedikt.huber@theobroma- > systems.com> Philipp Tomsich <philipp.toms...@theobroma-systems.com> > > On Sat, Jul 18, 2015 at 1:25 AM, Andrew Pinski <pins...@gmail.com> wrote: > > On Fri, Jul 17, 2015 at 8:43 AM, Benedikt Huber > > <benedikt.hu...@theobroma-systems.com> wrote: > >> * config/aarch64/aarch64-builtins.c: Builtins > >> for rsqrt and rsqrtf. > >> * config/aarch64/aarch64-protos.h: Declare. > >> * config/aarch64/aarch64-simd.md: Matching expressions > >> for frsqrte and frsqrts. > >> * config/aarch64/aarch64.c: New functions. Emit rsqrt > >> estimation code in fast math mode. > >> * config/aarch64/aarch64.md: Added enum entries. > >> * config/aarch64/aarch64.opt: Added options -mrecip and > >> -mlow-precision-recip-sqrt. > >> * testsuite/gcc.target/aarch64/rsqrt-asm-check.c: Assembly scans > >> for frsqrte and frsqrts > >> * testsuite/gcc.target/aarch64/rsqrt.c: Functional tests > >> for rsqrt. > > > > As I mentioned before, Can we have -mrecip defaulted to on since it > > regresses ThunderX performance by at least 10%. > > As I mentioned before, Can we have -mrecip not defaulted to on since it > regresses ThunderX performance by at least 10%? > > > > > > Thanks, > > Andrew > > > >> > >> Signed-off-by: Philipp Tomsich > >> <philipp.toms...@theobroma-systems.com> > >> --- > >> gcc/ChangeLog | 17 ++++ > >> gcc/config/aarch64/aarch64-builtins.c | 103 > ++++++++++++++++++++ > >> gcc/config/aarch64/aarch64-protos.h | 2 + > >> gcc/config/aarch64/aarch64-simd.md | 27 ++++++ > >> gcc/config/aarch64/aarch64.c | 58 +++++++++++ > >> gcc/config/aarch64/aarch64.md | 3 + > >> gcc/config/aarch64/aarch64.opt | 8 ++ > >> gcc/doc/invoke.texi | 20 ++++ > >> gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check.c | 63 ++++++++++++ > >> gcc/testsuite/gcc.target/aarch64/rsqrt.c | 107 > +++++++++++++++++++++ > >> 10 files changed, 408 insertions(+) > >> create mode 100644 > >> gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check.c > >> create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt.c > >> > >> diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 3432adb..f4b7407 > >> 100644 > >> --- a/gcc/ChangeLog > >> +++ b/gcc/ChangeLog > >> @@ -1,3 +1,20 @@ > >> +2015-07-14 Benedikt Huber <benedikt.hu...@theobroma-systems.com> > >> + Philipp Tomsich <philipp.toms...@theobroma-systems.com> > >> + > >> + * config/aarch64/aarch64-builtins.c: Builtins for rsqrt and > >> + rsqrtf. > >> + * config/aarch64/aarch64-protos.h: Declare. > >> + * config/aarch64/aarch64-simd.md: Matching expressions for > >> + frsqrte and frsqrts. > >> + * config/aarch64/aarch64.c: New functions. Emit rsqrt > >> + estimation code in fast math mode. > >> + * config/aarch64/aarch64.md: Added enum entries. > >> + * config/aarch64/aarch64.opt: Added options -mrecip and > >> + -mlow-precision-recip-sqrt. > >> + * testsuite/gcc.target/aarch64/rsqrt-asm-check.c: Assembly scans > >> + for frsqrte and frsqrts > >> + * testsuite/gcc.target/aarch64/rsqrt.c: Functional tests for > rsqrt. > >> + > >> 2015-07-08 Jiong Wang <jiong.w...@arm.com> > >> > >> * config/aarch64/aarch64.c (aarch64_unspec_may_trap_p): New > function. > >> diff --git a/gcc/config/aarch64/aarch64-builtins.c > >> b/gcc/config/aarch64/aarch64-builtins.c > >> index b6c89b9..adcea07 100644 > >> --- a/gcc/config/aarch64/aarch64-builtins.c > >> +++ b/gcc/config/aarch64/aarch64-builtins.c > >> @@ -335,6 +335,11 @@ enum aarch64_builtins > >> AARCH64_BUILTIN_GET_FPSR, > >> AARCH64_BUILTIN_SET_FPSR, > >> > >> + AARCH64_BUILTIN_RSQRT_DF, > >> + AARCH64_BUILTIN_RSQRT_SF, > >> + AARCH64_BUILTIN_RSQRT_V2DF, > >> + AARCH64_BUILTIN_RSQRT_V2SF, > >> + AARCH64_BUILTIN_RSQRT_V4SF, > >> AARCH64_SIMD_BUILTIN_BASE, > >> AARCH64_SIMD_BUILTIN_LANE_CHECK, > >> #include "aarch64-simd-builtins.def" > >> @@ -824,6 +829,42 @@ aarch64_init_crc32_builtins () } > >> > >> void > >> +aarch64_add_builtin_rsqrt (void) > >> +{ > >> + tree fndecl = NULL; > >> + tree ftype = NULL; > >> + > >> + tree V2SF_type_node = build_vector_type (float_type_node, 2); > >> + tree V2DF_type_node = build_vector_type (double_type_node, 2); > >> + tree V4SF_type_node = build_vector_type (float_type_node, 4); > >> + > >> + ftype = build_function_type_list (double_type_node, > >> + double_type_node, NULL_TREE); fndecl = add_builtin_function > ("__builtin_aarch64_rsqrt_df", > >> + ftype, AARCH64_BUILTIN_RSQRT_DF, BUILT_IN_MD, NULL, NULL_TREE); > >> + aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_DF] = fndecl; > >> + > >> + ftype = build_function_type_list (float_type_node, > >> + float_type_node, NULL_TREE); fndecl = add_builtin_function > ("__builtin_aarch64_rsqrt_sf", > >> + ftype, AARCH64_BUILTIN_RSQRT_SF, BUILT_IN_MD, NULL, NULL_TREE); > >> + aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_SF] = fndecl; > >> + > >> + ftype = build_function_type_list (V2DF_type_node, V2DF_type_node, > >> + NULL_TREE); fndecl = add_builtin_function > ("__builtin_aarch64_rsqrt_v2df", > >> + ftype, AARCH64_BUILTIN_RSQRT_V2DF, BUILT_IN_MD, NULL, > >> + NULL_TREE); aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V2DF] = > >> + fndecl; > >> + > >> + ftype = build_function_type_list (V2SF_type_node, V2SF_type_node, > >> + NULL_TREE); fndecl = add_builtin_function > ("__builtin_aarch64_rsqrt_v2sf", > >> + ftype, AARCH64_BUILTIN_RSQRT_V2SF, BUILT_IN_MD, NULL, > >> + NULL_TREE); aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V2SF] = > >> + fndecl; > >> + > >> + ftype = build_function_type_list (V4SF_type_node, V4SF_type_node, > >> +NULL_TREE); > >> + fndecl = add_builtin_function ("__builtin_aarch64_rsqrt_v4sf", > >> + ftype, AARCH64_BUILTIN_RSQRT_V4SF, BUILT_IN_MD, NULL, > >> +NULL_TREE); > >> + aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V4SF] = fndecl; } > >> + > >> +void > >> aarch64_init_builtins (void) > >> { > >> tree ftype_set_fpr > >> @@ -848,6 +889,7 @@ aarch64_init_builtins (void) > >> aarch64_init_simd_builtins (); > >> if (TARGET_CRC32) > >> aarch64_init_crc32_builtins (); > >> + aarch64_add_builtin_rsqrt (); > >> } > >> > >> tree > >> @@ -1092,6 +1134,38 @@ aarch64_crc32_expand_builtin (int fcode, tree exp, > rtx target) > >> return target; > >> } > >> > >> +static rtx > >> +aarch64_expand_builtin_rsqrt (int fcode, tree exp, rtx target) { > >> + rtx pat; > >> + tree arg0 = CALL_EXPR_ARG (exp, 0); > >> + rtx op0 = expand_normal (arg0); > >> + > >> + enum insn_code c = CODE_FOR_rsqrt_df; > >> + > >> + switch (fcode) > >> + { > >> + case AARCH64_BUILTIN_RSQRT_DF: > >> + c = CODE_FOR_rsqrt_df; break; > >> + case AARCH64_BUILTIN_RSQRT_SF: > >> + c = CODE_FOR_rsqrt_sf; break; > >> + case AARCH64_BUILTIN_RSQRT_V2DF: > >> + c = CODE_FOR_rsqrt_v2df; break; > >> + case AARCH64_BUILTIN_RSQRT_V2SF: > >> + c = CODE_FOR_rsqrt_v2sf; break; > >> + case AARCH64_BUILTIN_RSQRT_V4SF: > >> + c = CODE_FOR_rsqrt_v4sf; break; > >> + } > >> + > >> + if (!target) > >> + target = gen_reg_rtx (GET_MODE (op0)); > >> + > >> + pat = GEN_FCN (c) (target, op0); > >> + emit_insn (pat); > >> + > >> + return target; > >> +} > >> + > >> /* Expand an expression EXP that calls a built-in function, > >> with result going to TARGET if that's convenient. */ rtx @@ > >> -1139,6 +1213,13 @@ aarch64_expand_builtin (tree exp, > >> else if (fcode >= AARCH64_CRC32_BUILTIN_BASE && fcode <= > AARCH64_CRC32_BUILTIN_MAX) > >> return aarch64_crc32_expand_builtin (fcode, exp, target); > >> > >> + if (fcode == AARCH64_BUILTIN_RSQRT_DF || > >> + fcode == AARCH64_BUILTIN_RSQRT_SF || > >> + fcode == AARCH64_BUILTIN_RSQRT_V2DF || > >> + fcode == AARCH64_BUILTIN_RSQRT_V2SF || > >> + fcode == AARCH64_BUILTIN_RSQRT_V4SF) > >> + return aarch64_expand_builtin_rsqrt (fcode, exp, target); > >> + > >> gcc_unreachable (); > >> } > >> > >> @@ -1296,6 +1377,28 @@ aarch64_builtin_vectorized_function (tree fndecl, > tree type_out, tree type_in) > >> return NULL_TREE; > >> } > >> > >> +tree > >> +aarch64_builtin_rsqrt (unsigned int fn, bool md_fn) { > >> + if (md_fn) > >> + { > >> + if (fn == AARCH64_SIMD_BUILTIN_UNOP_sqrtv2df) > >> + return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V2DF]; > >> + if (fn == AARCH64_SIMD_BUILTIN_UNOP_sqrtv2sf) > >> + return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V2SF]; > >> + if (fn == AARCH64_SIMD_BUILTIN_UNOP_sqrtv4sf) > >> + return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V4SF]; > >> + } > >> + else > >> + { > >> + if (fn == BUILT_IN_SQRT) > >> + return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_DF]; > >> + if (fn == BUILT_IN_SQRTF) > >> + return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_SF]; > >> + } > >> + return NULL_TREE; > >> +} > >> + > >> #undef VAR1 > >> #define VAR1(T, N, MAP, A) \ > >> case AARCH64_SIMD_BUILTIN_##T##_##N##A: > >> diff --git a/gcc/config/aarch64/aarch64-protos.h > >> b/gcc/config/aarch64/aarch64-protos.h > >> index 4062c27..8b8a389 100644 > >> --- a/gcc/config/aarch64/aarch64-protos.h > >> +++ b/gcc/config/aarch64/aarch64-protos.h > >> @@ -321,6 +321,8 @@ void aarch64_print_operand (FILE *, rtx, char); > >> void aarch64_print_operand_address (FILE *, rtx); void > >> aarch64_emit_call_insn (rtx); > >> > >> +void aarch64_emit_swrsqrt (rtx, rtx); > >> + > >> /* Initialize builtins for SIMD intrinsics. */ void > >> init_aarch64_simd_builtins (void); > >> > >> diff --git a/gcc/config/aarch64/aarch64-simd.md > >> b/gcc/config/aarch64/aarch64-simd.md > >> index b90f938..ac8e12d 100644 > >> --- a/gcc/config/aarch64/aarch64-simd.md > >> +++ b/gcc/config/aarch64/aarch64-simd.md > >> @@ -353,6 +353,33 @@ > >> [(set_attr "type" "neon_fp_mul_d_scalar_q")] > >> ) > >> > >> +(define_insn "*rsqrte_<mode>" > >> + [(set (match_operand:VALLF 0 "register_operand" "=w") > >> + (unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")] > >> + UNSPEC_RSQRTE))] > >> + "TARGET_SIMD" > >> + "frsqrte\\t%<v>0<Vmtype>, %<v>1<Vmtype>" > >> + [(set_attr "type" "neon_fp_rsqrte_<Vetype><q>")]) > >> + > >> +(define_insn "*rsqrts_<mode>" > >> + [(set (match_operand:VALLF 0 "register_operand" "=w") > >> + (unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w") > >> + (match_operand:VALLF 2 "register_operand" "w")] > >> + UNSPEC_RSQRTS))] > >> + "TARGET_SIMD" > >> + "frsqrts\\t%<v>0<Vmtype>, %<v>1<Vmtype>, %<v>2<Vmtype>" > >> + [(set_attr "type" "neon_fp_rsqrts_<Vetype><q>")]) > >> + > >> +(define_expand "rsqrt_<mode>" > >> + [(set (match_operand:VALLF 0 "register_operand" "=w") > >> + (unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")] > >> + UNSPEC_RSQRT))] > >> + "TARGET_SIMD" > >> +{ > >> + aarch64_emit_swrsqrt (operands[0], operands[1]); > >> + DONE; > >> +}) > >> + > >> (define_insn "*aarch64_mul3_elt_to_64v2df" > >> [(set (match_operand:DF 0 "register_operand" "=w") > >> (mult:DF > >> diff --git a/gcc/config/aarch64/aarch64.c > >> b/gcc/config/aarch64/aarch64.c index 6c13a078..2e53103 100644 > >> --- a/gcc/config/aarch64/aarch64.c > >> +++ b/gcc/config/aarch64/aarch64.c > >> @@ -6961,6 +6961,61 @@ aarch64_memory_move_cost (machine_mode mode > ATTRIBUTE_UNUSED, > >> return aarch64_tune_params.memmov_cost; } > >> > >> +extern tree aarch64_builtin_rsqrt (unsigned int fn, bool md_fn); > >> + > >> +static tree > >> +aarch64_builtin_reciprocal (unsigned int fn, > >> + bool md_fn, > >> + bool) > >> +{ > >> + if (!flag_mrecip || !flag_finite_math_only || > >> + flag_trapping_math || !flag_unsafe_math_optimizations) > >> + { > >> + return NULL_TREE; > >> + } > >> + > >> + return aarch64_builtin_rsqrt (fn, md_fn); } > >> + > >> +void > >> +aarch64_emit_swrsqrt (rtx dst, rtx src) { > >> + enum machine_mode mode = GET_MODE (src); > >> + gcc_assert ( > >> + mode == SFmode || mode == V2SFmode || mode == V4SFmode || > >> + mode == DFmode || mode == V2DFmode); > >> + > >> + rtx xsrc = gen_reg_rtx (mode); > >> + emit_move_insn (xsrc, src); > >> + rtx x0 = gen_reg_rtx (mode); > >> + emit_set_insn (x0, > >> + gen_rtx_UNSPEC (mode, gen_rtvec (1, xsrc), UNSPEC_RSQRTE)); > >> + > >> + bool double_mode = (mode == DFmode || mode == V2DFmode); > >> + > >> + int iterations = 2; > >> + if (double_mode) > >> + iterations = 3; > >> + > >> + if (flag_mrecip_low_precision_sqrt) > >> + iterations--; > >> + > >> + for (int i = 0; i < iterations; ++i) > >> + { > >> + rtx x1 = gen_reg_rtx (mode); > >> + rtx x2 = gen_reg_rtx (mode); > >> + rtx x3 = gen_reg_rtx (mode); > >> + emit_set_insn (x2, gen_rtx_MULT (mode, x0, x0)); > >> + emit_set_insn (x3, gen_rtx_UNSPEC ( > >> + mode, gen_rtvec (2, xsrc, x2), UNSPEC_RSQRTS)); > >> + emit_set_insn (x1, gen_rtx_MULT (mode, x0, x3)); > >> + x0 = x1; > >> + } > >> + > >> + emit_move_insn (dst, x0); > >> + return; > >> +} > >> + > >> /* Return the number of instructions that can be issued per cycle. > >> */ static int aarch64_sched_issue_rate (void) @@ -12099,6 +12154,9 > >> @@ aarch64_unspec_may_trap_p (const_rtx x, unsigned flags) #undef > >> TARGET_USE_BLOCKS_FOR_CONSTANT_P #define > >> TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p > >> > >> +#undef TARGET_BUILTIN_RECIPROCAL > >> +#define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal > >> + > >> #undef TARGET_VECTOR_MODE_SUPPORTED_P #define > >> TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p > >> > >> diff --git a/gcc/config/aarch64/aarch64.md > >> b/gcc/config/aarch64/aarch64.md index 1e343fa..d7944b2 100644 > >> --- a/gcc/config/aarch64/aarch64.md > >> +++ b/gcc/config/aarch64/aarch64.md > >> @@ -122,6 +122,9 @@ > >> UNSPEC_VSTRUCTDUMMY > >> UNSPEC_SP_SET > >> UNSPEC_SP_TEST > >> + UNSPEC_RSQRT > >> + UNSPEC_RSQRTE > >> + UNSPEC_RSQRTS > >> ]) > >> > >> (define_c_enum "unspecv" [ > >> diff --git a/gcc/config/aarch64/aarch64.opt > >> b/gcc/config/aarch64/aarch64.opt index 98ef9f6..303efa4 100644 > >> --- a/gcc/config/aarch64/aarch64.opt > >> +++ b/gcc/config/aarch64/aarch64.opt > >> @@ -124,3 +124,11 @@ Enum(aarch64_abi) String(ilp32) > >> Value(AARCH64_ABI_ILP32) > >> > >> EnumValue > >> Enum(aarch64_abi) String(lp64) Value(AARCH64_ABI_LP64) > >> + > >> +mrecip > >> +Common Report Var(flag_mrecip) Optimization Init(1) Generate > >> +software reciprocal square root for better throughput. Default with fast- > math. > >> + > >> +mlow-precision-recip-sqrt > >> +Common Var(flag_mrecip_low_precision_sqrt) Optimization Run fewer > >> +approximation steps to reduce latency and precision. > >> diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index > >> b28e5d6..a0ab23d 100644 > >> --- a/gcc/doc/invoke.texi > >> +++ b/gcc/doc/invoke.texi > >> @@ -515,6 +515,8 @@ Objective-C and Objective-C++ Dialects}. > >> -mtls-dialect=desc -mtls-dialect=traditional @gol > >> -mfix-cortex-a53-835769 -mno-fix-cortex-a53-835769 @gol > >> -mfix-cortex-a53-843419 -mno-fix-cortex-a53-843419 @gol > >> +-mrecip -no-mrecip @gol > >> +-mlow-precision-recip-sqrt -mno-low-precision-recip-sqrt@gol > >> -march=@var{name} -mcpu=@var{name} -mtune=@var{name}} > >> > >> @emph{Adapteva Epiphany Options} > >> @@ -12426,6 +12428,24 @@ Enable or disable the workaround for the ARM > Cortex-A53 erratum number 843419. > >> This erratum workaround is made at link time and this will only pass > >> the corresponding flag to the linker. > >> > >> +@item -mrecip > >> +@item -mno-recip > >> +@opindex mrecip > >> +@opindex mno-recip > >> +This option enables use of the > >> +reciprocal square root estimate instructions with additional > >> +Newton-Raphson steps to increase precision instead of doing a square > >> +root and divide for floating-point arguments. > >> +This is the default setting with @option{-ffast-math}. > >> +It can only be used together with @option{-ffast-math}. > >> + > >> +@item -mlow-precision-recip-sqrt > >> +@item -mno-low-precision-recip-sqrt > >> +@opindex -mlow-precision-recip-sqrt > >> +@opindex -mno-low-precision-recip-sqrt The square root estimate uses > >> +two steps instead of three for double-precision, and one step > >> +instead of two for single-precision. Thus reducing latency and precision. > >> + > >> @item -march=@var{name} > >> @opindex march > >> Specify the name of the target architecture, optionally suffixed by > >> one or diff --git > >> a/gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check.c > >> b/gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check.c > >> new file mode 100644 > >> index 0000000..cd81d9e > >> --- /dev/null > >> +++ b/gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check.c > >> @@ -0,0 +1,63 @@ > >> +/* { dg-do compile } */ > >> +/* { dg-options "-O3 --save-temps -fverbose-asm -ffast-math" } */ > >> + > >> +#include <math.h> > >> + > >> +#define sqrt_float __builtin_sqrtf > >> +#define sqrt_double __builtin_sqrt > >> + > >> +#define TESTTYPE(TYPE) \ > >> +typedef struct { \ > >> + TYPE a; \ > >> + TYPE b; \ > >> + TYPE c; \ > >> + TYPE d; \ > >> +} s4_##TYPE; \ > >> + \ > >> +typedef struct { \ > >> + TYPE a; \ > >> + TYPE b; \ > >> +} s2_##TYPE; \ > >> + \ > >> +s4_##TYPE rsqrtv4_##TYPE (s4_##TYPE i) \ > >> +{ \ > >> + s4_##TYPE o; \ > >> + o.a = 1.0 / sqrt_##TYPE (i.a); \ > >> + o.b = 1.0 / sqrt_##TYPE (i.b); \ > >> + o.c = 1.0 / sqrt_##TYPE (i.c); \ > >> + o.d = 1.0 / sqrt_##TYPE (i.d); \ > >> + return o; \ > >> +} \ > >> + \ > >> +s2_##TYPE rsqrtv2_##TYPE (s2_##TYPE i) \ > >> +{ \ > >> + s2_##TYPE o; \ > >> + o.a = 1.0 / sqrt_##TYPE (i.a); \ > >> + o.b = 1.0 / sqrt_##TYPE (i.b); \ > >> + return o; \ > >> +} \ > >> + \ > >> +TYPE rsqrt_##TYPE (TYPE i) \ > >> +{ \ > >> + return 1.0 / sqrt_##TYPE (i); \ > >> +} \ > >> + \ > >> + > >> +TESTTYPE(double) > >> +TESTTYPE(float) > >> + > >> +/* { dg-final { scan-assembler-times "frsqrte\\td\[0-9\]+, > >> +d\[0-9\]+" 1 } } */ > >> +/* { dg-final { scan-assembler-times "frsqrts\\td\[0-9\]+, > >> +d\[0-9\]+, d\[0-9\]+" 3 } } */ > >> + > >> +/* { dg-final { scan-assembler-times "frsqrte\\tv\[0-9\]+.2d, > >> +v\[0-9\]+.2d" 3 } } */ > >> +/* { dg-final { scan-assembler-times "frsqrts\\tv\[0-9\]+.2d, > >> +v\[0-9\]+.2d, v\[0-9\]+.2d" 9 } } */ > >> + > >> + > >> +/* { dg-final { scan-assembler-times "frsqrte\\ts\[0-9\]+, > >> +s\[0-9\]+" 1 } } */ > >> +/* { dg-final { scan-assembler-times "frsqrts\\ts\[0-9\]+, > >> +s\[0-9\]+, s\[0-9\]+" 2 } } */ > >> + > >> +/* { dg-final { scan-assembler-times "frsqrte\\tv\[0-9\]+.4s, > >> +v\[0-9\]+.4s" 1 } } */ > >> +/* { dg-final { scan-assembler-times "frsqrts\\tv\[0-9\]+.4s, > >> +v\[0-9\]+.4s, v\[0-9\]+.4s" 2 } } */ > >> + > >> +/* { dg-final { scan-assembler-times "frsqrte\\tv\[0-9\]+.2s, > >> +v\[0-9\]+.2s" 1 } } */ > >> +/* { dg-final { scan-assembler-times "frsqrts\\tv\[0-9\]+.2s, > >> +v\[0-9\]+.2s, v\[0-9\]+.2s" 2 } } */ > >> diff --git a/gcc/testsuite/gcc.target/aarch64/rsqrt.c > >> b/gcc/testsuite/gcc.target/aarch64/rsqrt.c > >> new file mode 100644 > >> index 0000000..8e2ae10 > >> --- /dev/null > >> +++ b/gcc/testsuite/gcc.target/aarch64/rsqrt.c > >> @@ -0,0 +1,107 @@ > >> +/* { dg-do run } */ > >> +/* { dg-options "-O3 --save-temps -fverbose-asm -ffast-math" } */ > >> + > >> +#include <math.h> > >> +#include <stdio.h> > >> + > >> +#include <float.h> > >> + > >> +#define PI 3.141592653589793 > >> +#define SQRT2 1.4142135623730951 > >> + > >> +#define PI_4 0.7853981633974483 > >> +#define SQRT1_2 0.7071067811865475 > >> + > >> +/* 2^25+1, float has 24 significand bits > >> + * according to Single-precision floating-point format. */ > >> +#define TESTA8_FLT 33554433 > >> +/* 2^54+1, double has 53 significand bits > >> + * according to Double-precision floating-point format. */ > >> +#define TESTA8_DBL 18014398509481985 > >> + > >> +#define SD(a, b) t_double ((#a), (a), (b)); #define SF(a, b) t_float > >> +((#a), (a), (b)); > >> + > >> +#define EPSILON_double __DBL_EPSILON__ #define EPSILON_float > >> +__FLT_EPSILON__ #define ABS_double __builtin_fabs #define ABS_float > >> +__builtin_fabsf #define SQRT_double __builtin_sqrt #define > >> +SQRT_float __builtin_sqrtf > >> + > >> +extern void abort (void); > >> + > >> +#define TESTTYPE(TYPE) > \ > >> +TYPE rsqrt_##TYPE (TYPE a) > \ > >> +{ > \ > >> + return 1.0/SQRT_##TYPE(a); > \ > >> +} > \ > >> + > \ > >> +int equals_##TYPE (TYPE a, TYPE b) > \ > >> +{ > \ > >> + return (a == b || > \ > >> + (isnan (a) && isnan (b)) || > \ > >> + (ABS_##TYPE (a - b) < EPSILON_##TYPE)); > \ > >> +} > \ > >> + > \ > >> +void t_##TYPE (const char *s, TYPE a, TYPE result) > \ > >> +{ > \ > >> + TYPE r = rsqrt_##TYPE (a); > \ > >> + if (!equals_##TYPE (r, result)) > \ > >> + { > \ > >> + abort (); > \ > >> + } > \ > >> +} > \ > >> + > >> +// printf ("Problem in %20s: %30.18A should be %30.18A\n", s, r, > >> +result); \ > >> + > >> +TESTTYPE(double) > >> +TESTTYPE(float) > >> + > >> +int main () > >> +{ > >> + SD( 1.0/256, 0X1.00000000000000P+4 ); > >> + SD( 1.0, 0X1.00000000000000P+0 ); > >> + SD( -1.0, NAN); > >> + SD( 11.0, 0X1.34BF63D1568260P-2 ); > >> + SD( 0.0, INFINITY); > >> + SD( INFINITY, 0X0.00000000000000P+0 ); > >> + SD( NAN, NAN); > >> + SD( -NAN, -NAN); > >> + SD( DBL_MAX, 0X1.00000000000010P-512); > >> + SD( DBL_MIN, 0X1.00000000000000P+511); > >> + SD( PI, 0X1.20DD750429B6D0P-1 ); > >> + SD( PI_4, 0X1.20DD750429B6D0P+0 ); > >> + SD( SQRT2, 0X1.AE89F995AD3AE0P-1 ); > >> + SD( SQRT1_2, 0X1.306FE0A31B7150P+0 ); > >> + SD( -PI, NAN); > >> + SD( -SQRT2, NAN); > >> + SD( TESTA8_DBL, 0X1.00000000000000P-27 ); > >> + > >> + SF( 1.0/256, 0X1.00000000000000P+4 ); > >> + SF( 1.0, 0X1.00000000000000P+0 ); > >> + SF( -1.0, NAN); > >> + SF( 11.0, 0X1.34BF6400000000P-2 ); > >> + SF( 0.0, INFINITY); > >> + SF( INFINITY, 0X0.00000000000000P+0 ); > >> + SF( NAN, NAN); > >> + SF( -NAN, -NAN); > >> + SF( FLT_MAX, 0X1.00000200000000P-64 ); > >> + SF( FLT_MIN, 0X1.00000000000000P+63 ); > >> + SF( PI, 0X1.20DD7400000000P-1 ); > >> + SF( PI_4, 0X1.20DD7400000000P+0 ); > >> + SF( SQRT2, 0X1.AE89FA00000000P-1 ); > >> + SF( SQRT1_2, 0X1.306FE000000000P+0 ); > >> + SF( -PI, NAN); > >> + SF( -SQRT2, NAN); > >> + SF( TESTA8_FLT, 0X1.6A09E600000000P-13 ); > >> + > >> +// With -ffast-math these return positive INF. > >> +// SD( -0.0, -INFINITY); > >> +// SF( -0.0, -INFINITY); > >> +// The reason here is that -ffast-math flushes to zero. > >> +// SD(DBL_MIN/256, 0X1.00000000000000P+515); > >> +// SF(FLT_MIN/256, 0X1.00000000000000P+67 ); > >> + > >> + return 0; > >> +} > >> -- > >> 1.9.1 > >>