The PowerPC port provides reciprocal sqrt but doesn't implement the extra incantation to utilize it for sqrtf.
The current implementation re-associates terms in the N-R iteration to utilize one constant instead of two, but does not provide a pre-computed estimate multiplied by the source, so an extra multiply is required at the end. The cost of the extra load of an FP constant through the LSU and a register to hold it seems to balance against the cost of the extra multiply in the VSU, so it's not clear that re-arranging the computation is beneficial. Thanks, David PR target/68609 * config/rs6000/rs6000-protos.h (rs6000_emit_swsqrt): Rename and add bool argument. * config/rs6000/rs6000.c (rs6000_emit_swsqrt): Rename. Add non-reciprocal path. * config/rs6000/rs6000.md (rsqrt<mode>2): Call new function name. (sqrt<mode>2): Replace define_insn with define_expand that can call rs6000_emit_swsqrt. Index: rs6000-protos.h =================================================================== --- rs6000-protos.h (revision 231169) +++ rs6000-protos.h (working copy) @@ -137,7 +137,7 @@ extern void rs6000_expand_atomic_exchange (rtx op[]); extern void rs6000_expand_atomic_op (enum rtx_code, rtx, rtx, rtx, rtx, rtx); extern void rs6000_emit_swdiv (rtx, rtx, rtx, bool); -extern void rs6000_emit_swrsqrt (rtx, rtx); +extern void rs6000_emit_swsqrt (rtx, rtx, bool); extern void output_toc (FILE *, rtx, int, machine_mode); extern rtx rs6000_longcall_ref (rtx); extern void rs6000_fatal_bad_address (rtx); Index: rs6000.c =================================================================== --- rs6000.c (revision 231169) +++ rs6000.c (working copy) @@ -32889,7 +32889,7 @@ rs6000_emit_swdiv (rtx dst, rtx n, rtx d, bool not rsqrt. Assumes no trapping math and finite arguments. 
*/ void -rs6000_emit_swrsqrt (rtx dst, rtx src) +rs6000_emit_swsqrt (rtx dst, rtx src, bool recip) { machine_mode mode = GET_MODE (src); rtx x0 = gen_reg_rtx (mode); @@ -32922,6 +32922,16 @@ void emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, src), UNSPEC_RSQRT))); + /* If (src == 0.0) filter infinity to prevent NaN for sqrt(0.0). */ + if (!recip) + { + rtx zero = force_reg (mode, CONST0_RTX (mode)); + rtx target = emit_conditional_move (x0, GT, src, zero, mode, + x0, zero, mode, 0); + if (target != x0) + emit_move_insn (x0, target); + } + /* y = 0.5 * src = 1.5 * src - src -> fewer constants */ rs6000_emit_msub (y, src, halfthree, src); @@ -32938,7 +32948,11 @@ void x0 = x1; } - emit_move_insn (dst, x0); + if (!recip) + emit_insn (gen_mul (dst, src, x0)); + else + emit_move_insn (dst, x0); + return; } Index: rs6000.md =================================================================== --- rs6000.md (revision 231169) +++ rs6000.md (working copy) @@ -4301,7 +4301,7 @@ (match_operand:RECIPF 1 "gpc_reg_operand" "")] "RS6000_RECIP_HAVE_RSQRTE_P (<MODE>mode)" { - rs6000_emit_swrsqrt (operands[0], operands[1]); + rs6000_emit_swsqrt (operands[0], operands[1], 1); DONE; }) ^L @@ -4426,7 +4426,7 @@ [(set_attr "type" "<Fs>div") (set_attr "fp_type" "fp_div_<Fs>")]) -(define_insn "sqrt<mode>2" +(define_insn "*sqrt<mode>2_internal" [(set (match_operand:SFDF 0 "gpc_reg_operand" "=<Ff>,<Fv2>") (sqrt:SFDF (match_operand:SFDF 1 "gpc_reg_operand" "<Ff>,<Fv2>")))] "TARGET_<MODE>_FPR && !TARGET_SIMPLE_FPU @@ -4437,6 +4437,23 @@ [(set_attr "type" "<Fs>sqrt") (set_attr "fp_type" "fp_sqrt_<Fs>")]) +(define_expand "sqrt<mode>2" + [(set (match_operand:SFDF 0 "gpc_reg_operand" "") + (sqrt:SFDF (match_operand:SFDF 1 "gpc_reg_operand" "")))] + "TARGET_<MODE>_FPR && !TARGET_SIMPLE_FPU + && (TARGET_PPC_GPOPT || (<MODE>mode == SFmode && TARGET_XILINX_FPU))" +{ + if (<MODE>mode == SFmode + && RS6000_RECIP_HAVE_RSQRTE_P (<MODE>mode) + && !optimize_function_for_size_p (cfun) + && 
flag_finite_math_only && !flag_trapping_math + && flag_unsafe_math_optimizations) + { + rs6000_emit_swsqrt (operands[0], operands[1], 0); + DONE; + } +}) + ;; Floating point reciprocal approximation (define_insn "fre<Fs>" [(set (match_operand:SFDF 0 "gpc_reg_operand" "=<Ff>,<Fv2>")