On Sat, Jan 03, 1970 at 03:27:04AM +0000, Benedikt Huber wrote: > * config/aarch64/aarch64-builtins.c: Builtins for rsqrt and > rsqrtf. > * config/aarch64/aarch64-protos.h: Declare. > * config/aarch64/aarch64-simd.md: Matching expressions for > frsqrte and frsqrts. > * config/aarch64/aarch64-tuning-flags.def: Added > MRECIP_DEFAULT_ENABLED. > * config/aarch64/aarch64.c: New functions. Emit rsqrt > estimation code in fast math mode. > * config/aarch64/aarch64.md: Added enum entries. > * config/aarch64/aarch64.opt: Added options -mrecip and > -mlow-precision-recip-sqrt. > * testsuite/gcc.target/aarch64/rsqrt-asm-check.c: Assembly scans > for frsqrte and frsqrts > * testsuite/gcc.target/aarch64/rsqrt.c: Functional tests for rsqrt.
Hi, Thanks for this latest revision, I have some structural/refactoring comments, and I think I've spotted a bug. Otherwise this is getting close to ready. Some more comments in line. (As an aside, I find this style of patch submission to be very difficult to follow, as it misses my mail filters and does not keep the in-reply-to header correctly across patch revisions). > 2015-10-01 Lynn Boger <labo...@linux.vnet.ibm.com> > > PR target/66870 > diff --git a/gcc/config/aarch64/aarch64-builtins.c > b/gcc/config/aarch64/aarch64-builtins.c > index 80916a9..29cfbf5 100644 > --- a/gcc/config/aarch64/aarch64-builtins.c > +++ b/gcc/config/aarch64/aarch64-builtins.c > @@ -344,6 +344,11 @@ enum aarch64_builtins > AARCH64_BUILTIN_GET_FPSR, > AARCH64_BUILTIN_SET_FPSR, > > + AARCH64_BUILTIN_RSQRT_DF, > + AARCH64_BUILTIN_RSQRT_SF, > + AARCH64_BUILTIN_RSQRT_V2DF, > + AARCH64_BUILTIN_RSQRT_V2SF, > + AARCH64_BUILTIN_RSQRT_V4SF, > AARCH64_SIMD_BUILTIN_BASE, > AARCH64_SIMD_BUILTIN_LANE_CHECK, > #include "aarch64-simd-builtins.def" > @@ -842,6 +847,46 @@ aarch64_init_crc32_builtins () > } > } > > +/* Add builtins for reciprocal square root. 
*/ > + > +void > +aarch64_add_builtin_rsqrt (void) > +{ > + tree fndecl = NULL; > + tree ftype = NULL; > + > + tree V2SF_type_node = build_vector_type (float_type_node, 2); > + tree V2DF_type_node = build_vector_type (double_type_node, 2); > + tree V4SF_type_node = build_vector_type (float_type_node, 4); > + > + ftype = build_function_type_list (double_type_node, double_type_node, > + NULL_TREE); > + fndecl = add_builtin_function ("__builtin_aarch64_rsqrt_df", > + ftype, AARCH64_BUILTIN_RSQRT_DF, BUILT_IN_MD, NULL, NULL_TREE); > + aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_DF] = fndecl; > + > + ftype = build_function_type_list (float_type_node, float_type_node, > + NULL_TREE); > + fndecl = add_builtin_function ("__builtin_aarch64_rsqrt_sf", > + ftype, AARCH64_BUILTIN_RSQRT_SF, BUILT_IN_MD, NULL, NULL_TREE); > + aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_SF] = fndecl; > + > + ftype = build_function_type_list (V2DF_type_node, V2DF_type_node, > NULL_TREE); > + fndecl = add_builtin_function ("__builtin_aarch64_rsqrt_v2df", > + ftype, AARCH64_BUILTIN_RSQRT_V2DF, BUILT_IN_MD, NULL, NULL_TREE); > + aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V2DF] = fndecl; > + > + ftype = build_function_type_list (V2SF_type_node, V2SF_type_node, > NULL_TREE); > + fndecl = add_builtin_function ("__builtin_aarch64_rsqrt_v2sf", > + ftype, AARCH64_BUILTIN_RSQRT_V2SF, BUILT_IN_MD, NULL, NULL_TREE); > + aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V2SF] = fndecl; > + > + ftype = build_function_type_list (V4SF_type_node, V4SF_type_node, > NULL_TREE); > + fndecl = add_builtin_function ("__builtin_aarch64_rsqrt_v4sf", > + ftype, AARCH64_BUILTIN_RSQRT_V4SF, BUILT_IN_MD, NULL, NULL_TREE); > + aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V4SF] = fndecl; Given that this is all so mechanical, I'd have a preference towards refactoring this to loop over some structured data. 
Something like: {AARCH64_BUILTIN_RSQRT_SF, float_type_node, "__builtin_aarch64_rsqrt_sf"}, {AARCH64_BUILTIN_RSQRT_DF, double_type_node, "__builtin_aarch64_rsqrt_df"}, etc. > void > aarch64_init_builtins (void) > { > @@ -873,6 +918,7 @@ aarch64_init_builtins (void) > aarch64_init_simd_builtins (); > > aarch64_init_crc32_builtins (); > + aarch64_add_builtin_rsqrt (); Very minor nit, other functions use "init", you use "add". > } > > tree > @@ -1136,6 +1182,41 @@ aarch64_crc32_expand_builtin (int fcode, tree exp, rtx > target) > return target; > } > > +/* Function to expand reciprocal square root builtins. */ > + > +static rtx > +aarch64_expand_builtin_rsqrt (int fcode, tree exp, rtx target) > +{ > + rtx pat; > + tree arg0 = CALL_EXPR_ARG (exp, 0); > + rtx op0 = expand_normal (arg0); > + > + enum insn_code c; > + > + switch (fcode) > + { > + case AARCH64_BUILTIN_RSQRT_DF: > + c = CODE_FOR_rsqrt_df2; break; > + case AARCH64_BUILTIN_RSQRT_SF: > + c = CODE_FOR_rsqrt_sf2; break; > + case AARCH64_BUILTIN_RSQRT_V2DF: > + c = CODE_FOR_rsqrt_v2df2; break; > + case AARCH64_BUILTIN_RSQRT_V2SF: > + c = CODE_FOR_rsqrt_v2sf2; break; > + case AARCH64_BUILTIN_RSQRT_V4SF: > + c = CODE_FOR_rsqrt_v4sf2; break; > + default: gcc_unreachable (); > + } Formatting looks off for the "default" case. > + > + if (!target) > + target = gen_reg_rtx (GET_MODE (op0)); > + > + pat = GEN_FCN (c) (target, op0); > + emit_insn (pat); > + > + return target; Could we rewrite the above using function pointers and gen functions as you do elsewhere in the patch: rtx (*gen) (rtx, rtx); switch (fcode) { case AARCH64_BUILTIN_RSQRT_DF: gen = gen_rsqrt_df2; break; case AARCH64_BUILTIN_RSQRT_SF: gen = gen_rsqrt_sf2; break; <...> } emit_insn (gen (target, op0)); > /* Expand an expression EXP that calls a built-in function, > with result going to TARGET if that's convenient. 
*/ > rtx > @@ -1183,6 +1264,13 @@ aarch64_expand_builtin (tree exp, > else if (fcode >= AARCH64_CRC32_BUILTIN_BASE && fcode <= > AARCH64_CRC32_BUILTIN_MAX) > return aarch64_crc32_expand_builtin (fcode, exp, target); > > + if (fcode == AARCH64_BUILTIN_RSQRT_DF > + || fcode == AARCH64_BUILTIN_RSQRT_SF > + || fcode == AARCH64_BUILTIN_RSQRT_V2DF > + || fcode == AARCH64_BUILTIN_RSQRT_V2SF > + || fcode == AARCH64_BUILTIN_RSQRT_V4SF) > + return aarch64_expand_builtin_rsqrt (fcode, exp, target); > + > gcc_unreachable (); > } > > @@ -1340,6 +1428,30 @@ aarch64_builtin_vectorized_function (tree fndecl, tree > type_out, tree type_in) > return NULL_TREE; > } > > +/* Return builtin for reciprocal square root. */ > + > +tree > +aarch64_builtin_rsqrt (unsigned int fn, bool md_fn) > +{ > + if (md_fn) > + { > + if (fn == AARCH64_SIMD_BUILTIN_UNOP_sqrtv2df) > + return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V2DF]; > + if (fn == AARCH64_SIMD_BUILTIN_UNOP_sqrtv2sf) > + return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V2SF]; > + if (fn == AARCH64_SIMD_BUILTIN_UNOP_sqrtv4sf) > + return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V4SF]; > + } > + else > + { > + if (fn == BUILT_IN_SQRT) > + return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_DF]; > + if (fn == BUILT_IN_SQRTF) > + return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_SF]; > + } > + return NULL_TREE; > +} > + > #undef VAR1 > #define VAR1(T, N, MAP, A) \ > case AARCH64_SIMD_BUILTIN_##T##_##N##A: > diff --git a/gcc/config/aarch64/aarch64-protos.h > b/gcc/config/aarch64/aarch64-protos.h > index baaf1bd..455b1da 100644 > --- a/gcc/config/aarch64/aarch64-protos.h > +++ b/gcc/config/aarch64/aarch64-protos.h > @@ -332,6 +332,8 @@ void aarch64_register_pragmas (void); > void aarch64_relayout_simd_types (void); > void aarch64_reset_previous_fndecl (void); > > +void aarch64_emit_swrsqrt (rtx, rtx); > + > /* Initialize builtins for SIMD intrinsics. 
*/ > void init_aarch64_simd_builtins (void); > > @@ -400,4 +402,5 @@ int aarch64_ccmp_mode_to_code (enum machine_mode mode); > bool extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset); > bool aarch64_operands_ok_for_ldpstp (rtx *, bool, enum machine_mode); > bool aarch64_operands_adjust_ok_for_ldpstp (rtx *, bool, enum machine_mode); > +tree aarch64_builtin_rsqrt (unsigned int fn, bool md_fn); It is a losing battle, but at some point this file was in alphabetical order, first by type then by name. If we could keep to that, that would be good. > #endif /* GCC_AARCH64_PROTOS_H */ > diff --git a/gcc/config/aarch64/aarch64-simd.md > b/gcc/config/aarch64/aarch64-simd.md > index 541faf9..d48ad3b 100644 > --- a/gcc/config/aarch64/aarch64-simd.md > +++ b/gcc/config/aarch64/aarch64-simd.md > @@ -354,6 +354,33 @@ > [(set_attr "type" "neon_fp_mul_d_scalar_q")] > ) > > +(define_insn "rsqrte_<mode>2" As this is not a standard pattern name, keep it namespaced as aarch64_rsqrte<mode>2. > + [(set (match_operand:VALLF 0 "register_operand" "=w") > + (unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")] > + UNSPEC_RSQRTE))] > + "TARGET_SIMD" > + "frsqrte\\t%<v>0<Vmtype>, %<v>1<Vmtype>" > + [(set_attr "type" "neon_fp_rsqrte_<Vetype><q>")]) > + > +(define_insn "rsqrts_<mode>3" Likewise. > + [(set (match_operand:VALLF 0 "register_operand" "=w") > + (unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w") > + (match_operand:VALLF 2 "register_operand" "w")] > + UNSPEC_RSQRTS))] > + "TARGET_SIMD" > + "frsqrts\\t%<v>0<Vmtype>, %<v>1<Vmtype>, %<v>2<Vmtype>" > + [(set_attr "type" "neon_fp_rsqrts_<Vetype><q>")]) > + > +(define_expand "rsqrt_<mode>2" Likewise. 
> + [(set (match_operand:VALLF 0 "register_operand" "=w") > + (unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")] > + UNSPEC_RSQRT))] > + "TARGET_SIMD" > +{ > + aarch64_emit_swrsqrt (operands[0], operands[1]); > + DONE; > +}) > + > (define_insn "*aarch64_mul3_elt_to_64v2df" > [(set (match_operand:DF 0 "register_operand" "=w") > (mult:DF > diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def > b/gcc/config/aarch64/aarch64-tuning-flags.def > index 628386b..6f7dbce 100644 > --- a/gcc/config/aarch64/aarch64-tuning-flags.def > +++ b/gcc/config/aarch64/aarch64-tuning-flags.def > @@ -29,4 +29,5 @@ > AARCH64_TUNE_ to give an enum name. */ > > AARCH64_EXTRA_TUNING_OPTION ("rename_fma_regs", RENAME_FMA_REGS) > +AARCH64_EXTRA_TUNING_OPTION ("recip_sqrt", RECIP_SQRT) > > diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c > index 034da7c..5ddfa5d 100644 > --- a/gcc/config/aarch64/aarch64.c > +++ b/gcc/config/aarch64/aarch64.c > @@ -408,7 +408,8 @@ static const struct tune_params cortexa57_tunings = > 1, /* vec_reassoc_width. */ > 2, /* min_div_recip_mul_sf. */ > 2, /* min_div_recip_mul_df. */ > - (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS) /* tune_flags. */ > + (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS > + | AARCH64_EXTRA_TUNE_RECIP_SQRT) /* tune_flags. */ > }; > > static const struct tune_params cortexa72_tunings = > @@ -472,7 +473,7 @@ static const struct tune_params xgene1_tunings = > 1, /* vec_reassoc_width. */ > 2, /* min_div_recip_mul_sf. */ > 2, /* min_div_recip_mul_df. */ > - (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */ > + (AARCH64_EXTRA_TUNE_RECIP_SQRT) /* tune_flags. */ > }; > > /* Support for fine-grained override of the tuning structures. */ > @@ -7009,6 +7010,107 @@ aarch64_memory_move_cost (machine_mode mode > ATTRIBUTE_UNUSED, > return aarch64_tune_params.memmov_cost; > } > > +/* Function to decide when to use > + reciprocal square root builtins. 
*/ > + > +static tree > +aarch64_builtin_reciprocal (unsigned int fn, > + bool md_fn, > + bool) > +{ > + if (flag_trapping_math > + || !flag_unsafe_math_optimizations > + || optimize_size > + || (aarch64_tune_params.extra_tuning_flags > + & AARCH64_EXTRA_TUNE_RECIP_SQRT)) I've checked a number of times, but this condition still looks backwards to me. As far as I can see, this says not to do the transform if (aarch64_tune_params.extra_tuning_flags & AARCH64_EXTRA_TUNE_RECIP_SQRT)) But it is Friday, so forgive me if I'm wrong. > + { > + return NULL_TREE; > + } > + > + return aarch64_builtin_rsqrt (fn, md_fn); > +} > + > +typedef rtx (*rsqrte_type) (rtx, rtx); > + > +/* Select reciprocal square root initial estimate > + insn depending on machine mode. */ > + > +rsqrte_type > +get_rsqrte_type (enum machine_mode mode) > +{ > + switch (mode) > + { > + case DFmode: return gen_rsqrte_df2; > + case SFmode: return gen_rsqrte_sf2; > + case V2DFmode: return gen_rsqrte_v2df2; > + case V2SFmode: return gen_rsqrte_v2sf2; > + case V4SFmode: return gen_rsqrte_v4sf2; > + default: gcc_unreachable (); > + } > +} > + > +typedef rtx (*rsqrts_type) (rtx, rtx, rtx); > + > +/* Select reciprocal square root Newton-Raphson step > + insn depending on machine mode. */ > + > +rsqrts_type > +get_rsqrts_type (enum machine_mode mode) > +{ > + switch (mode) > + { > + case DFmode: return gen_rsqrts_df3; > + case SFmode: return gen_rsqrts_sf3; > + case V2DFmode: return gen_rsqrts_v2df3; > + case V2SFmode: return gen_rsqrts_v2sf3; > + case V4SFmode: return gen_rsqrts_v4sf3; > + default: gcc_unreachable (); > + } > +} > + > +/* Emit instruction sequence to compute > + reciprocal square root. Use two Newton-Raphson steps > + for single precision and three for double precision. 
*/ > + > +void > +aarch64_emit_swrsqrt (rtx dst, rtx src) > +{ > + enum machine_mode mode = GET_MODE (src); > + gcc_assert ( > + mode == SFmode || mode == V2SFmode || mode == V4SFmode || > + mode == DFmode || mode == V2DFmode); Split before the operator: mode == SFmode || mode == V2SFmode || mode == V4SFmode || mode == DFmode || mode == V2DFmode); > + > + rtx xsrc = gen_reg_rtx (mode); > + emit_move_insn (xsrc, src); > + rtx x0 = gen_reg_rtx (mode); > + > + emit_insn ((*get_rsqrte_type (mode)) (x0, xsrc)); > + > + bool double_mode = (mode == DFmode || mode == V2DFmode); > + > + int iterations = 2; > + if (double_mode) > + iterations = 3; Personal preference: int iterations = double_mode ? 3 : 2; > + > + if (flag_mrecip_low_precision_sqrt) > + iterations--; > + > + for (int i = 0; i < iterations; ++i) > + { > + rtx x1 = gen_reg_rtx (mode); > + rtx x2 = gen_reg_rtx (mode); > + rtx x3 = gen_reg_rtx (mode); > + emit_set_insn (x2, gen_rtx_MULT (mode, x0, x0)); > + > + emit_insn ((*get_rsqrts_type (mode)) (x3, xsrc, x2)); > + > + emit_set_insn (x1, gen_rtx_MULT (mode, x0, x3)); > + x0 = x1; > + } > + > + emit_move_insn (dst, x0); > +} > + > /* Return the number of instructions that can be issued per cycle. */ > static int > aarch64_sched_issue_rate (void) > @@ -13387,6 +13489,9 @@ aarch64_promoted_type (const_tree t) > #undef TARGET_BUILD_BUILTIN_VA_LIST > #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list > > +#undef TARGET_BUILTIN_DECL > +#define TARGET_BUILTIN_DECL aarch64_builtin_decl > + Unrelated change? 
> #undef TARGET_CALLEE_COPIES > #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false > > @@ -13418,9 +13523,6 @@ aarch64_promoted_type (const_tree t) > #undef TARGET_CLASS_MAX_NREGS > #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs > > -#undef TARGET_BUILTIN_DECL > -#define TARGET_BUILTIN_DECL aarch64_builtin_decl > - > #undef TARGET_EXPAND_BUILTIN > #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin > > @@ -13561,6 +13663,9 @@ aarch64_promoted_type (const_tree t) > #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P > #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p > > +#undef TARGET_BUILTIN_RECIPROCAL > +#define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal > + I think Marcus asked for this to be in alphabetical order in the v5 review. > #undef TARGET_VECTOR_MODE_SUPPORTED_P > #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p > > diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md > index c3cd58d..51c2b87 100644 > --- a/gcc/config/aarch64/aarch64.md > +++ b/gcc/config/aarch64/aarch64.md > @@ -126,6 +126,9 @@ > UNSPEC_VSTRUCTDUMMY > UNSPEC_SP_SET > UNSPEC_SP_TEST > + UNSPEC_RSQRT > + UNSPEC_RSQRTE > + UNSPEC_RSQRTS > ]) > > (define_c_enum "unspecv" [ > diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt > index a1ce58d..00084ea 100644 > --- a/gcc/config/aarch64/aarch64.opt > +++ b/gcc/config/aarch64/aarch64.opt > @@ -148,3 +148,7 @@ Enum(aarch64_abi) String(lp64) Value(AARCH64_ABI_LP64) > mpc-relative-literal-loads > Target Report Save Var(nopcrelative_literal_loads) Init(2) Save > PC relative literal loads. > + > +mlow-precision-recip-sqrt > +Common Var(flag_mrecip_low_precision_sqrt) Optimization > +Run fewer approximation steps to reduce latency and precision. Don't make a definite claim about latency here. When calculating a sqrt approximation, run fewer steps. This reduces precision, but can result in faster computation. 
> diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi > index c19be78..8b45837 100644 > --- a/gcc/doc/invoke.texi > +++ b/gcc/doc/invoke.texi > @@ -519,6 +519,7 @@ Objective-C and Objective-C++ Dialects}. > -mtls-size=@var{size} @gol > -mfix-cortex-a53-835769 -mno-fix-cortex-a53-835769 @gol > -mfix-cortex-a53-843419 -mno-fix-cortex-a53-843419 @gol > +-mlow-precision-recip-sqrt -mno-low-precision-recip-sqrt@gol > -march=@var{name} -mcpu=@var{name} -mtune=@var{name}} > > @emph{Adapteva Epiphany Options} > @@ -12445,6 +12446,17 @@ Enable or disable the workaround for the ARM > Cortex-A53 erratum number 843419. > This erratum workaround is made at link time and this will only pass the > corresponding flag to the linker. > > +@item -mlow-precision-recip-sqrt > +@item -mno-low-precision-recip-sqrt > +@opindex -mlow-precision-recip-sqrt > +@opindex -mno-low-precision-recip-sqrt > +The square root estimate uses two steps instead of three for > double-precision, > +and one step instead of two for single-precision. > +Thus reducing latency and precision. > +This is only relevant if @option{-ffast-math} activates > +reciprocal square root estimate instructions. > +Which in turn depends on the CPU core. As above. To be consistent with the other documentation, s/CPU core/target processor/ > + > @item -march=@var{name} > @opindex march > Specify the name of the target architecture, optionally suffixed by one or > diff --git a/gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check_1.c > b/gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check_1.c > new file mode 100644 > index 0000000..9f17990 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check_1.c > @@ -0,0 +1,65 @@ > +/* Test for the recip_sqrt tuning > + ensuring the correct instructions are generated. 
*/ > +/* { dg-do compile } */ > +/* { dg-options "-O3 --std=c99 --save-temps -fverbose-asm > -funsafe-math-optimizations -fno-math-errno" } */ Presumably this testcase needs guarding or otherwise tweaked to make sure it only runs for targets which want to use the estimate expansion? Additionally, a test showing the opposite - that a target which does not want the expansion doesn't get it - would be useful. > + > +#define sqrt_float __builtin_sqrtf > +#define sqrt_double __builtin_sqrt > + > +#define TESTTYPE(TYPE) \ > +typedef struct { \ > + TYPE a; \ > + TYPE b; \ > + TYPE c; \ > + TYPE d; \ > +} s4_##TYPE; \ > +\ > +typedef struct { \ > + TYPE a; \ > + TYPE b; \ > +} s2_##TYPE; \ > +\ > +s4_##TYPE \ > +rsqrtv4_##TYPE (s4_##TYPE i) \ > +{ \ > + s4_##TYPE o; \ > + o.a = 1.0 / sqrt_##TYPE (i.a); \ > + o.b = 1.0 / sqrt_##TYPE (i.b); \ > + o.c = 1.0 / sqrt_##TYPE (i.c); \ > + o.d = 1.0 / sqrt_##TYPE (i.d); \ > + return o; \ > +} \ > +\ > +s2_##TYPE \ > +rsqrtv2_##TYPE (s2_##TYPE i) \ > +{ \ > + s2_##TYPE o; \ > + o.a = 1.0 / sqrt_##TYPE (i.a); \ > + o.b = 1.0 / sqrt_##TYPE (i.b); \ > + return o; \ > +} \ > +\ > +TYPE \ > +rsqrt_##TYPE (TYPE i) \ > +{ \ > + return 1.0 / sqrt_##TYPE (i); \ > +} \ > + > +TESTTYPE (double) > +TESTTYPE (float) > + > +/* { dg-final { scan-assembler-times "frsqrte\\td\[0-9\]+, d\[0-9\]+" 1 } } > */ > +/* { dg-final { scan-assembler-times "frsqrts\\td\[0-9\]+, d\[0-9\]+, > d\[0-9\]+" 3 } } */ > + > +/* { dg-final { scan-assembler-times "frsqrte\\tv\[0-9\]+.2d, v\[0-9\]+.2d" > 3 } } */ > +/* { dg-final { scan-assembler-times "frsqrts\\tv\[0-9\]+.2d, v\[0-9\]+.2d, > v\[0-9\]+.2d" 9 } } */ > + > + > +/* { dg-final { scan-assembler-times "frsqrte\\ts\[0-9\]+, s\[0-9\]+" 1 } } > */ > +/* { dg-final { scan-assembler-times "frsqrts\\ts\[0-9\]+, s\[0-9\]+, > s\[0-9\]+" 2 } } */ > + > +/* { dg-final { scan-assembler-times "frsqrte\\tv\[0-9\]+.4s, v\[0-9\]+.4s" > 1 } } */ > +/* { dg-final { scan-assembler-times "frsqrts\\tv\[0-9\]+.4s, v\[0-9\]+.4s, > 
v\[0-9\]+.4s" 2 } } */ > + > +/* { dg-final { scan-assembler-times "frsqrte\\tv\[0-9\]+.2s, v\[0-9\]+.2s" > 1 } } */ > +/* { dg-final { scan-assembler-times "frsqrts\\tv\[0-9\]+.2s, v\[0-9\]+.2s, > v\[0-9\]+.2s" 2 } } */ > diff --git a/gcc/testsuite/gcc.target/aarch64/rsqrt_1.c > b/gcc/testsuite/gcc.target/aarch64/rsqrt_1.c > new file mode 100644 > index 0000000..624f9b2 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/rsqrt_1.c > @@ -0,0 +1,111 @@ > +/* Test for the recip_sqrt tuning > + ensuring functionality and sufficient accuracy. */ > +/* { dg-do run } */ > +/* { dg-options "-O3 --std=c99 --save-temps -fverbose-asm > -funsafe-math-optimizations -fno-math-errno" } */ Likewise. > + > +#define PI 3.141592653589793 > +#define SQRT2 1.4142135623730951 > + > +#define PI_4 0.7853981633974483 > +#define SQRT1_2 0.7071067811865475 > + > +/* 2^25+1, float has 24 significand bits > + * according to Single-precision floating-point format. */ > +#define TESTA8_FLT 33554433 > +/* 2^54+1, double has 53 significand bits > + * according to Double-precision floating-point format. 
*/ > +#define TESTA8_DBL 18014398509481985 > + > +#define EPSILON_double __DBL_EPSILON__ > +#define EPSILON_float __FLT_EPSILON__ > +#define ABS_double __builtin_fabs > +#define ABS_float __builtin_fabsf > +#define SQRT_double __builtin_sqrt > +#define SQRT_float __builtin_sqrtf > +#define ISNAN_double __builtin_isnan > +#define ISNAN_float __builtin_isnanf > + > +extern void abort (void); > + > +#define TESTTYPE(TYPE) \ > +TYPE \ > +rsqrt_##TYPE (TYPE a) \ > +{ \ > + return 1.0/SQRT_##TYPE (a); \ > +} \ > +\ > +int \ > +equals_##TYPE (TYPE a, TYPE b) \ > +{ \ > + return (a == b || \ > + (ISNAN_##TYPE (a) && ISNAN_##TYPE (b)) || \ > + (ABS_##TYPE (a - b) < EPSILON_##TYPE)); \ > +} \ > +\ > +void \ > +t_##TYPE (TYPE a, TYPE result) \ > +{ \ > + TYPE r = rsqrt_##TYPE (a); \ > + if (!equals_##TYPE (r, result)) \ > + { \ > + abort (); \ > + } \ > +} \ > + > +TESTTYPE (double) > +TESTTYPE (float) > + > +int > +main () > +{ > + double nan = __builtin_nan (""); > + double inf = __builtin_inf (); > + float nanf = __builtin_nanf (""); > + float inff = __builtin_inff (); > + > + t_double (1.0/256, 0X1.00000000000000P+4); > + t_double (1.0, 0X1.00000000000000P+0); > + t_double (-1.0, nan); > + t_double (11.0, 0X1.34BF63D1568260P-2); > + t_double (0.0, inf); > + t_double (inf, 0X0.00000000000000P+0); > + t_double (nan, nan); > + t_double (-nan, -nan); > + t_double (__DBL_MAX__, 0X1.00000000000010P-512); > + t_double (__DBL_MIN__, 0X1.00000000000000P+511); > + t_double (PI, 0X1.20DD750429B6D0P-1); > + t_double (PI_4, 0X1.20DD750429B6D0P+0); > + t_double (SQRT2, 0X1.AE89F995AD3AE0P-1); > + t_double (SQRT1_2, 0X1.306FE0A31B7150P+0); > + t_double (-PI, nan); > + t_double (-SQRT2, nan); > + t_double (TESTA8_DBL, 0X1.00000000000000P-27); > + > + t_float (1.0/256, 0X1.00000000000000P+4); > + t_float (1.0, 0X1.00000000000000P+0); > + t_float (-1.0, nanf); > + t_float (11.0, 0X1.34BF6400000000P-2); > + t_float (0.0, inff); > + t_float (inff, 0X0.00000000000000P+0); > + t_float (nanf, 
nanf); > + t_float (-nanf, -nanf); > + t_float (__FLT_MAX__, 0X1.00000200000000P-64); > + t_float (__FLT_MIN__, 0X1.00000000000000P+63); > + t_float (PI, 0X1.20DD7400000000P-1); > + t_float (PI_4, 0X1.20DD7400000000P+0); > + t_float (SQRT2, 0X1.AE89FA00000000P-1); > + t_float (SQRT1_2, 0X1.306FE000000000P+0); > + t_float (-PI, nanf); > + t_float (-SQRT2, nanf); > + t_float (TESTA8_FLT, 0X1.6A09E600000000P-13); > + > +// With -ffast-math these return positive INF. > +// t_double (-0.0, -inf); > +// t_float (-0.0, -inff); > + > +// The reason here is that -ffast-math flushes to zero. > +// t_double (__DBL_MIN__/256, 0X1.00000000000000P+515); > +// t_float (__FLT_MIN__/256, 0X1.00000000000000P+67); > + > + return 0; > +} Thanks, James