Hi All,

This patch extends our SIMD immediate generation cases to support generating
integer immediates through a floating-point operation, in the case where the
integer bit pattern maps to an exact FP value.
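The transformation relies only on reinterpreting bits. As a quick standalone
sanity check of the mapping used throughout this mail (illustration only, not
part of the patch), 0x3f800000 is exactly the IEEE-754 single-precision
encoding of 1.0f:

#include <stdio.h>
#include <string.h>

int main (void)
{
  unsigned int bits = 0x3f800000;
  float f;
  /* Type-pun via memcpy to avoid strict-aliasing problems.  */
  memcpy (&f, &bits, sizeof f);
  printf ("%f\n", f);	/* Prints 1.000000.  */
  return 0;
}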
As an example:

uint32x4_t f1() {
  return vdupq_n_u32(0x3f800000);
}

currently generates:

f1:
	adrp	x0, .LC0
	ldr	q0, [x0, #:lo12:.LC0]
	ret

i.e. a load from the constant pool, but with this change:

f1:
	fmov	v0.4s, 1.0e+0
	ret

Such immediates are common in e.g. our math routines in glibc because they
are created to extract or mark part of an FP value as a mask.
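For illustration, the kind of idiom that produces them looks roughly like the
sketch below (my own simplified example, not code taken from glibc). The
second constant is the bit pattern of 1.0f and can now be materialized with
an fmov:

#include <arm_neon.h>

/* Force the exponent field of every lane to that of 1.0f, i.e. scale a
   normal positive value into [1.0, 2.0).  Both constants are integer masks
   over FP bits.  */
float32x4_t
set_exponent_to_one (float32x4_t x)
{
  uint32x4_t bits = vreinterpretq_u32_f32 (x);
  bits = vandq_u32 (bits, vdupq_n_u32 (0x007fffff));	/* Keep the mantissa.  */
  bits = vorrq_u32 (bits, vdupq_n_u32 (0x3f800000));	/* OR in exponent of 1.0f.  */
  return vreinterpretq_f32_u32 (bits);
}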
Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* config/aarch64/aarch64-protos.h (aarch64_float_const_representable_p):
	Add overload.
	* config/aarch64/aarch64.cc (aarch64_float_const_zero_rtx_p): Reject
	integer modes.
	(aarch64_simd_valid_immediate, aarch64_float_const_representable_p):
	Check if integer value maps to an exact FP constant.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/const_create_using_fmov.c: New test.

---
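A note for reviewers on which integer constants end up qualifying: to my
understanding of the Arm ARM, the 8-bit FMOV immediate encodes exactly the
values +/- n/16 * 2^r with 16 <= n <= 31 and -3 <= r <= 4 (HFmode additionally
requiring the +fp16 instructions).  So 1.0, 1.25, 1.5 and -1.0 from the new
test all fit, while 0x4f800000 (2^32 as a float) does not, which is why f6
below still loads from the constant pool.  A rough standalone model of that
rule, for intuition only (this is not the check the patch implements, which
works on the REAL_VALUE_TYPE representation):

#include <math.h>
#include <stdbool.h>

static bool
fmov_imm_representable_p (double v)
{
  v = fabs (v);
  /* Try every encodable (n, r) pair; all such values are exact in double,
     so the == comparison is safe here.  */
  for (int r = -3; r <= 4; r++)
    for (int n = 16; n <= 31; n++)
      if (v == ldexp ((double) n / 16.0, r))
	return true;
  return false;
}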
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 7a84acc59569da0b50af2300615db561a5de460a..6c683ea2d93e1b733cfe49fac38381ea6451fd55 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -974,6 +974,7 @@ void aarch64_split_simd_move (rtx, rtx);
 
 /* Check for a legitimate floating point constant for FMOV.  */
 bool aarch64_float_const_representable_p (rtx, machine_mode);
+bool aarch64_float_const_representable_p (rtx *, rtx, machine_mode);
 
 extern int aarch64_epilogue_uses (int);
 
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 1842f6ecf6330f11a64545d0903240c89b104ffc..2d44608d93b8e7542ea8d5eb4c3f99c9f88e70ed 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -10991,7 +10991,8 @@ aarch64_float_const_zero_rtx_p (rtx x)
   /* 0.0 in Decimal Floating Point cannot be represented by #0 or
      zr as our callers expect, so no need to check the actual
      value if X is of Decimal Floating Point type.  */
-  if (GET_MODE_CLASS (GET_MODE (x)) == MODE_DECIMAL_FLOAT)
+  if (GET_MODE_CLASS (GET_MODE (x)) == MODE_DECIMAL_FLOAT
+      || !CONST_DOUBLE_P (x))
     return false;
 
   if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
@@ -23026,17 +23027,30 @@ aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
   else
     return false;
 
-  scalar_float_mode elt_float_mode;
-  if (n_elts == 1
-      && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
+  if (n_elts == 1)
     {
       rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
+      rtx new_elt = NULL_RTX;
       if (aarch64_float_const_zero_rtx_p (elt)
-	  || aarch64_float_const_representable_p (elt, elt_mode))
-	{
-	  if (info)
-	    *info = simd_immediate_info (elt_float_mode, elt);
-	  return true;
+	  || aarch64_float_const_representable_p (&new_elt, elt, elt_mode))
+	{
+	  scalar_float_mode elt_float_mode;
+	  auto bitsize = GET_MODE_UNIT_BITSIZE (elt_mode);
+	  if (is_a <scalar_float_mode> (elt_mode))
+	    elt_float_mode = as_a <scalar_float_mode> (elt_mode);
+	  else if (which == AARCH64_CHECK_MOV
+		   && new_elt
+		   && float_mode_for_size (bitsize).exists (&elt_float_mode))
+	    elt = new_elt;
+	  else
+	    elt = NULL_RTX;
+
+	  if (elt != NULL_RTX)
+	    {
+	      if (info)
+		*info = simd_immediate_info (elt_float_mode, elt);
+	      return true;
+	    }
 	}
     }
 
@@ -25121,8 +25135,22 @@ aarch64_c_mode_for_suffix (char suffix)
 
 /* Return true iff X with mode MODE can be represented by a quarter-precision
    floating point immediate operand X.  Note, we cannot represent 0.0.  */
+
 bool
 aarch64_float_const_representable_p (rtx x, machine_mode mode)
+{
+  return aarch64_float_const_representable_p (NULL, x, mode);
+}
+
+
+/* Return true iff X with mode MODE can be represented by a quarter-precision
+   floating point immediate operand X.  Note, we cannot represent 0.0.
+   If the value is a CONST_INT that can be represented as an exact floating
+   point then OUT will contain the new floating point value to emit to generate
+   the integer constant.  */
+
+bool
+aarch64_float_const_representable_p (rtx *out, rtx x, machine_mode mode)
 {
   /* This represents our current view of how many bits make up the
      mantissa.  */
@@ -25134,14 +25162,45 @@ aarch64_float_const_representable_p (rtx x, machine_mode mode)
   x = unwrap_const_vec_duplicate (x);
   mode = GET_MODE_INNER (mode);
 
-  if (!CONST_DOUBLE_P (x))
+  if (!CONST_DOUBLE_P (x)
+      && !CONST_INT_P (x))
     return false;
 
   if (mode == VOIDmode
-      || (mode == HFmode && !TARGET_FP_F16INST))
+      || ((mode == HFmode || mode == HImode) && !TARGET_FP_F16INST))
     return false;
 
-  r = *CONST_DOUBLE_REAL_VALUE (x);
+  /* If we have an integer bit pattern, decode it back into a real.
+     real_from_target requires the representation to be split into
+     32-bit values and then put into two host wide ints.  */
+  if (CONST_INT_P (x))
+    {
+      HOST_WIDE_INT buf = INTVAL (x);
+      long int as_long_ints[2];
+      as_long_ints[0] = buf & 0xFFFFFFFF;
+      as_long_ints[1] = (buf >> 32) & 0xFFFFFFFF;
+      machine_mode fmode;
+      switch (mode)
+	{
+	case HImode:
+	  fmode = HFmode;
+	  break;
+	case SImode:
+	  fmode = SFmode;
+	  break;
+	case DImode:
+	  fmode = DFmode;
+	  break;
+	default:
+	  return false;
+	}
+
+      real_from_target (&r, as_long_ints, fmode);
+      if (out)
+	*out = const_double_from_real_value (r, fmode);
+    }
+  else
+    r = *CONST_DOUBLE_REAL_VALUE (x);
 
   /* We cannot represent infinities, NaNs or +/-zero.  We won't
      know if we have +zero until we analyse the mantissa, but we
@@ -25170,6 +25229,7 @@ aarch64_float_const_representable_p (rtx x, machine_mode mode)
      the value.  */
   if (w.ulow () != 0)
     return false;
+
   /* We have rejected the lower HOST_WIDE_INT, so update our
      understanding of how many bits lie in the mantissa and
      look only at the high HOST_WIDE_INT.  */
@@ -25205,9 +25265,9 @@ aarch64_float_const_representable_p (rtx x, machine_mode mode)
   return (exponent >= 0 && exponent <= 7);
 }
 
-/* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
-   immediate with a CONST_VECTOR of MODE and WIDTH.  WHICH selects whether to
-   output MOVI/MVNI, ORR or BIC immediate. */
+/* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR, BIC or
+   FMOV immediate with a CONST_VECTOR of MODE and WIDTH.  WHICH selects whether
+   to output MOVI/MVNI, ORR or BIC immediate. */
 char*
 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
				   enum simd_immediate_check which)
diff --git a/gcc/testsuite/gcc.target/aarch64/const_create_using_fmov.c b/gcc/testsuite/gcc.target/aarch64/const_create_using_fmov.c
new file mode 100644
index 0000000000000000000000000000000000000000..e080afed8aa3578660027979335bfc859ca6bc91
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/const_create_using_fmov.c
@@ -0,0 +1,87 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=armv9-a -Ofast" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include <arm_neon.h>
+
+/*
+** g:
+**	fmov	v0\.4s, 1\.0e\+0
+**	ret
+*/
+float32x4_t g(){
+  return vdupq_n_f32(1);
+}
+
+/*
+** h:
+**	fmov	v0\.4s, 1\.0e\+0
+**	ret
+*/
+uint32x4_t h() {
+  return vreinterpretq_u32_f32(g());
+}
+
+/*
+** f1:
+**	fmov	v0\.4s, 1\.0e\+0
+**	ret
+*/
+uint32x4_t f1() {
+  return vdupq_n_u32(0x3f800000);
+}
+
+/*
+** f2:
+**	fmov	v0\.4s, 1\.5e\+0
+**	ret
+*/
+uint32x4_t f2() {
+  return vdupq_n_u32(0x3FC00000);
+}
+
+/*
+** f3:
+**	fmov	v0\.4s, 1\.25e\+0
+**	ret
+*/
+uint32x4_t f3() {
+  return vdupq_n_u32(0x3FA00000);
+}
+
+/*
+** f4:
+**	fmov	v0\.2d, 1\.0e\+0
+**	ret
+*/
+uint64x2_t f4() {
+  return vdupq_n_u64(0x3FF0000000000000);
+}
+
+/*
+** fn4:
+**	fmov	v0\.2d, -1\.0e\+0
+**	ret
+*/
+uint64x2_t fn4() {
+  return vdupq_n_u64(0xBFF0000000000000);
+}
+
+/*
+** f5:
+**	fmov	v0\.8h, 1\.5e\+0
+**	ret
+*/
+uint16x8_t f5() {
+  return vdupq_n_u16(0x3E00);
+}
+
+/*
+** f6:
+**	adrp	x0, \.LC0
+**	ldr	q0, \[x0, #:lo12:\.LC0\]
+**	ret
+*/
+uint32x4_t f6() {
+  return vdupq_n_u32(0x4f800000);
+}

--