Re: [AArch64] Emit square root using the Newton series

Evandro Menezes Tue, 08 Mar 2016 14:09:10 -0800

On 02/16/16 14:56, Evandro Menezes wrote:

On 12/08/15 15:35, Evandro Menezes wrote:
Emit square root using the Newton series
   2015-12-03  Evandro Menezes  <e.mene...@samsung.com>

   gcc/
            * config/aarch64/aarch64-protos.h (aarch64_emit_swsqrt):
   Declare new
            function.
            * config/aarch64/aarch64-simd.md (sqrt<mode>2): New
   expansion and
            insn definitions.
            * config/aarch64/aarch64-tuning-flags.def
            (AARCH64_EXTRA_TUNE_FAST_SQRT): New tuning macro.
            * config/aarch64/aarch64.c (aarch64_emit_swsqrt): Define
   new function.
            * config/aarch64/aarch64.md (sqrt<mode>2): New expansion
   and insn
            definitions.
            * config/aarch64/aarch64.opt (mlow-precision-recip-sqrt):
   Expand option
            description.
            * doc/invoke.texi (mlow-precision-recip-sqrt): Likewise.
This patch extends the patch that added support for implementingx^-1/2 using the Newton series by adding support for x^1/2 as well.
Is it OK at this point of stage 3?

Thank you,
James,
As I was saying, this patch results in some validation errors inCPU2000 benchmarks using DF. Although proving the algorithm to bepretty solid with a vast set of random values, I'm confused why somebenchmarks fail to validate with this implementation of the Newtonseries for square root too, when they pass with the Newton series forreciprocal square root.
Since I had no problems with the same algorithm on x86-64, I wonder ifthe initial estimate on AArch64, which offers just 8 bits, whereasx86-64 offers 11 bits, has to do with it. Then again, the algorithmiterated 1 less time on x86-64 than on AArch64.
Since it seems that the initial estimate is sufficient for CPU2000 tovalidate when using SF, I'm leaning towards restricting the Newtonseries for square root only for SF.
Your thoughts on the matter are appreciated,


        Add choices for the reciprocal square root approximation

        Allow a target to prefer such operation depending on the FP
   precision.

        gcc/
            * config/aarch64/aarch64-protos.h
            (AARCH64_EXTRA_TUNE_APPROX_RSQRT): New macro.
            * config/aarch64/aarch64-tuning-flags.def
            (AARCH64_EXTRA_TUNE_APPROX_RSQRT_DF): New mask.
            (AARCH64_EXTRA_TUNE_APPROX_RSQRT_SF): Likewise.
            * config/aarch64/aarch64.c
            (use_rsqrt_p): New argument for the mode.
            (aarch64_builtin_reciprocal): Devise mode from builtin.
            (aarch64_optab_supported_p): New argument for the mode.


Now that the patch is attached, feedback is appreciated.

Thank you,


--
Evandro Menezes

>From 0bb413550e854c81cc5ab180a3afdd43cd4faf0b Mon Sep 17 00:00:00 2001
From: Evandro Menezes <e.mene...@samsung.com>
Date: Thu, 3 Mar 2016 18:13:46 -0600
Subject: [PATCH] Add choices for the reciprocal square root approximation

Allow a target to prefer such operation depending on the FP precision.

gcc/
	* config/aarch64/aarch64-protos.h
	(AARCH64_EXTRA_TUNE_APPROX_RSQRT): New macro.
	* config/aarch64/aarch64-tuning-flags.def
	(AARCH64_EXTRA_TUNE_APPROX_RSQRT_DF): New mask.
	(AARCH64_EXTRA_TUNE_APPROX_RSQRT_SF): Likewise.
	* config/aarch64/aarch64.c
	(use_rsqrt_p): New argument for the mode.
	(aarch64_builtin_reciprocal): Devise mode from builtin.
	(aarch64_optab_supported_p): New argument for the mode.
---
 gcc/config/aarch64/aarch64-protos.h         |  3 +++
 gcc/config/aarch64/aarch64-tuning-flags.def |  3 ++-
 gcc/config/aarch64/aarch64.c                | 23 +++++++++++++++--------
 3 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index acf2062..ee3505c 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -263,6 +263,9 @@ enum aarch64_extra_tuning_flags
 };
 #undef AARCH64_EXTRA_TUNING_OPTION
 
+#define AARCH64_EXTRA_TUNE_APPROX_RSQRT \
+  (AARCH64_EXTRA_TUNE_APPROX_RSQRT_DF | AARCH64_EXTRA_TUNE_APPROX_RSQRT_SF)
+
 extern struct tune_params aarch64_tune_params;
 
 HOST_WIDE_INT aarch64_initial_elimination_offset (unsigned, unsigned);
diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def
index 7e45a0c..57d9588 100644
--- a/gcc/config/aarch64/aarch64-tuning-flags.def
+++ b/gcc/config/aarch64/aarch64-tuning-flags.def
@@ -29,5 +29,6 @@
      AARCH64_TUNE_ to give an enum name. */
 
 AARCH64_EXTRA_TUNING_OPTION ("rename_fma_regs", RENAME_FMA_REGS)
-AARCH64_EXTRA_TUNING_OPTION ("approx_rsqrt", APPROX_RSQRT)
+AARCH64_EXTRA_TUNING_OPTION ("approx_rsqrt", APPROX_RSQRT_DF)
+AARCH64_EXTRA_TUNING_OPTION ("approx_rsqrtf", APPROX_RSQRT_SF)
 
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 801f95a..39a1a47 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -7464,12 +7464,16 @@ aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
    to optimize 1.0/sqrt.  */
 
 static bool
-use_rsqrt_p (void)
+use_rsqrt_p (machine_mode mode)
 {
   return (!flag_trapping_math
 	  && flag_unsafe_math_optimizations
-	  && ((aarch64_tune_params.extra_tuning_flags
-	       & AARCH64_EXTRA_TUNE_APPROX_RSQRT)
+	  && ((GET_MODE_INNER (mode) == SFmode
+	       && (aarch64_tune_params.extra_tuning_flags
+		   & AARCH64_EXTRA_TUNE_APPROX_RSQRT_SF))
+	      || (GET_MODE_INNER (mode) == DFmode
+		  && (aarch64_tune_params.extra_tuning_flags
+		      & AARCH64_EXTRA_TUNE_APPROX_RSQRT_DF))
 	      || flag_mrecip_low_precision_sqrt));
 }
 
@@ -7479,9 +7483,12 @@ use_rsqrt_p (void)
 static tree
 aarch64_builtin_reciprocal (tree fndecl)
 {
-  if (!use_rsqrt_p ())
-    return NULL_TREE;
-  return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
+  machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
+
+  if (use_rsqrt_p (mode))
+    return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
+
+  return NULL_TREE;
 }
 
 typedef rtx (*rsqrte_type) (rtx, rtx);
@@ -13960,13 +13967,13 @@ aarch64_promoted_type (const_tree t)
 /* Implement the TARGET_OPTAB_SUPPORTED_P hook.  */
 
 static bool
-aarch64_optab_supported_p (int op, machine_mode, machine_mode,
+aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
 			   optimization_type opt_type)
 {
   switch (op)
     {
     case rsqrt_optab:
-      return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p ();
+      return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
 
     default:
       return true;
-- 
2.6.3

Re: [AArch64] Emit square root using the Newton series

Reply via email to