Emit square root using the Newton series
2015-12-03 Evandro Menezes <e.mene...@samsung.com>
gcc/
* config/aarch64/aarch64-protos.h (aarch64_emit_swsqrt):
Declare new
function.
* config/aarch64/aarch64-simd.md (sqrt<mode>2): New
expansion and
insn definitions.
* config/aarch64/aarch64-tuning-flags.def
(AARCH64_EXTRA_TUNE_FAST_SQRT): New tuning macro.
* config/aarch64/aarch64.c (aarch64_emit_swsqrt): Define
new function.
* config/aarch64/aarch64.md (sqrt<mode>2): New expansion
and insn
definitions.
* config/aarch64/aarch64.opt (mlow-precision-recip-sqrt):
Expand option
description.
* doc/invoke.texi (mlow-precision-recip-sqrt): Likewise.
This patch extends the patch that added support for implementing x^-1/2
using the Newton series by adding support for x^1/2 as well.
Is it OK at this point in stage 3?
Thank you,
--
Evandro Menezes
>From f173dace7b4137f8868a1a6ef9cdbbeefa92ffde Mon Sep 17 00:00:00 2001
From: Evandro Menezes <e.mene...@samsung.com>
Date: Thu, 3 Dec 2015 15:25:07 -0600
Subject: [PATCH] Emit square root using the Newton series
2015-12-03 Evandro Menezes <e.mene...@samsung.com>
gcc/
* config/aarch64/aarch64-protos.h (aarch64_emit_swsqrt): Declare new
function.
* config/aarch64/aarch64-simd.md (sqrt<mode>2): New expansion and
insn definitions.
* config/aarch64/aarch64-tuning-flags.def
(AARCH64_EXTRA_TUNE_FAST_SQRT): New tuning macro.
* config/aarch64/aarch64.c (aarch64_emit_swsqrt): Define new function.
* config/aarch64/aarch64.md (sqrt<mode>2): New expansion and insn
definitions.
* config/aarch64/aarch64.opt (mlow-precision-recip-sqrt): Expand option
description.
* doc/invoke.texi (mlow-precision-recip-sqrt): Likewise.
---
gcc/config/aarch64/aarch64-protos.h | 1 +
gcc/config/aarch64/aarch64-simd.md | 18 +++++++++++++++++-
gcc/config/aarch64/aarch64-tuning-flags.def | 2 +-
gcc/config/aarch64/aarch64.c | 25 +++++++++++++++++++++++--
gcc/config/aarch64/aarch64.md | 18 +++++++++++++++++-
gcc/config/aarch64/aarch64.opt | 2 +-
gcc/doc/invoke.texi | 13 ++++++-------
7 files changed, 66 insertions(+), 13 deletions(-)
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 1e0fb4e..7fe6074 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -356,6 +356,7 @@ void aarch64_relayout_simd_types (void);
void aarch64_reset_previous_fndecl (void);
void aarch64_emit_swrsqrt (rtx, rtx);
+void aarch64_emit_swsqrt (rtx, rtx);
/* Initialize builtins for SIMD intrinsics. */
void init_aarch64_simd_builtins (void);
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 030a101..f6d2da4 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -4280,7 +4280,23 @@
;; sqrt
-(define_insn "sqrt<mode>2"
+(define_expand "sqrt<mode>2"
+ [(set (match_operand:VDQF 0 "register_operand")
+ (sqrt:VDQF (match_operand:VDQF 1 "register_operand")))]
+ "TARGET_SIMD"
+{
+ if ((AARCH64_EXTRA_TUNE_FAST_SQRT & aarch64_tune_params.extra_tuning_flags)
+ && !optimize_function_for_size_p (cfun)
+ && flag_finite_math_only
+ && !flag_trapping_math
+ && flag_unsafe_math_optimizations)
+ {
+ aarch64_emit_swsqrt (operands[0], operands[1]);
+ DONE;
+ }
+})
+
+(define_insn "*sqrt<mode>2"
[(set (match_operand:VDQF 0 "register_operand" "=w")
(sqrt:VDQF (match_operand:VDQF 1 "register_operand" "w")))]
"TARGET_SIMD"
diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def
index 6f7dbce..11c6c9a 100644
--- a/gcc/config/aarch64/aarch64-tuning-flags.def
+++ b/gcc/config/aarch64/aarch64-tuning-flags.def
@@ -30,4 +30,4 @@
AARCH64_EXTRA_TUNING_OPTION ("rename_fma_regs", RENAME_FMA_REGS)
AARCH64_EXTRA_TUNING_OPTION ("recip_sqrt", RECIP_SQRT)
-
+AARCH64_EXTRA_TUNING_OPTION ("fast_sqrt", FAST_SQRT)
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index ae4cfb3..3b58c35 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -533,8 +533,9 @@ static const struct tune_params exynosm1_tunings =
2, /* min_div_recip_mul_df. */
48, /* max_case_values. */
64, /* cache_line_size. */
- tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
- (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
+ tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
+ (AARCH64_EXTRA_TUNE_RECIP_SQRT
+ | AARCH64_EXTRA_TUNE_FAST_SQRT) /* tune_flags. */
};
static const struct tune_params thunderx_tunings =
@@ -7515,6 +7516,26 @@ aarch64_emit_swrsqrt (rtx dst, rtx src)
emit_move_insn (dst, x0);
}
+/* Emit instruction sequence to compute the approximate square root. */
+
+void
+aarch64_emit_swsqrt (rtx dst, rtx src)
+{
+ machine_mode mode = GET_MODE (src);
+ gcc_assert (mode == SFmode || mode == V2SFmode || mode == V4SFmode
+ || mode == DFmode || mode == V2DFmode);
+
+ rtx xsrc = gen_reg_rtx (mode);
+ emit_move_insn (xsrc, src);
+
+ rtx xdst = gen_reg_rtx (mode);
+
+ /* Calculate the approximate square root by multiplying the original operand
+ by its approximate reciprocal square root. */
+ aarch64_emit_swrsqrt (xdst, xsrc);
+ emit_set_insn (dst, gen_rtx_MULT (mode, xdst, src));
+}
+
/* Return the number of instructions that can be issued per cycle. */
static int
aarch64_sched_issue_rate (void)
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index d9fe1ae..d5930b9 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -4534,7 +4534,23 @@
[(set_attr "type" "ffarith<s>")]
)
-(define_insn "sqrt<mode>2"
+(define_expand "sqrt<mode>2"
+ [(set (match_operand:GPF 0 "register_operand")
+ (sqrt:GPF (match_operand:GPF 1 "register_operand")))]
+ "TARGET_SIMD"
+{
+ if ((AARCH64_EXTRA_TUNE_FAST_SQRT & aarch64_tune_params.extra_tuning_flags)
+ && !optimize_function_for_size_p (cfun)
+ && flag_finite_math_only
+ && !flag_trapping_math
+ && flag_unsafe_math_optimizations)
+ {
+ aarch64_emit_swsqrt (operands[0], operands[1]);
+ DONE;
+ }
+})
+
+(define_insn "*sqrt<mode>2"
[(set (match_operand:GPF 0 "register_operand" "=w")
(sqrt:GPF (match_operand:GPF 1 "register_operand" "w")))]
"TARGET_FLOAT"
diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
index a0fbfd42..d02c5e8 100644
--- a/gcc/config/aarch64/aarch64.opt
+++ b/gcc/config/aarch64/aarch64.opt
@@ -151,5 +151,5 @@ PC relative literal loads.
mlow-precision-recip-sqrt
Common Var(flag_mrecip_low_precision_sqrt) Optimization
-When calculating a sqrt approximation, run fewer steps.
+Calculate the square root or its reciprocal approximation in fewer steps.
This reduces precision, but can result in faster computation.
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 5ab565c..f4a47a6 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -6141,7 +6141,7 @@ is usable even in freestanding environments.
@opindex fsanitize-coverage=trace-pc
Enable coverage-guided fuzzing code instrumentation.
Inserts call to __sanitizer_cov_trace_pc into every basic block.
 
@item -fcheck-pointer-bounds
@opindex fcheck-pointer-bounds
@opindex fno-check-pointer-bounds
@@ -12561,12 +12561,11 @@ corresponding flag to the linker.
@item -mno-low-precision-recip-sqrt
@opindex -mlow-precision-recip-sqrt
@opindex -mno-low-precision-recip-sqrt
-The square root estimate uses two steps instead of three for double-precision,
-and one step instead of two for single-precision.
-Thus reducing latency and precision.
-This is only relevant if @option{-ffast-math} activates
-reciprocal square root estimate instructions.
-Which in turn depends on the target processor.
+The square root and its reciprocal approximation use one fewer step than
+otherwise, reducing both latency and precision.
+This is only relevant if @option{-ffast-math} enables
+the square root or its reciprocal approximation,
+which in turn depends on the target processor.
@item -march=@var{name}
@opindex march
--
1.9.1