Hi All,

This patch extends our SIMD immediate generation cases to support generating
integer immediates through a floating-point operation, in the case where the
integer bit pattern maps to an exact FP value.
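The transformation relies only on reinterpreting bits. As a quick standalone
sanity check of the mapping used throughout this mail (illustration only, not
part of the patch), 0x3f800000 is exactly the IEEE-754 single-precision
encoding of 1.0f:

#include <stdio.h>
#include <string.h>

int main (void)
{
  unsigned int bits = 0x3f800000;
  float f;
  /* Type-pun via memcpy to avoid strict-aliasing problems.  */
  memcpy (&f, &bits, sizeof f);
  printf ("%f\n", f);	/* Prints 1.000000.  */
  return 0;
}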
As an example:

uint32x4_t f1() {
  return vdupq_n_u32(0x3f800000);
}

currently generates:

f1:
	adrp	x0, .LC0
	ldr	q0, [x0, #:lo12:.LC0]
	ret

i.e. a load from the constant pool, but with this change:

f1:
	fmov	v0.4s, 1.0e+0
	ret

Such immediates are common in e.g. our math routines in glibc because they
are created to extract or mark part of an FP value as a mask.
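For illustration, the kind of idiom that produces them looks roughly like the
sketch below (my own simplified example, not code taken from glibc). The
second constant is the bit pattern of 1.0f and can now be materialized with
an fmov:

#include <arm_neon.h>

/* Force the exponent field of every lane to that of 1.0f, i.e. scale a
   normal positive value into [1.0, 2.0).  Both constants are integer masks
   over FP bits.  */
float32x4_t
set_exponent_to_one (float32x4_t x)
{
  uint32x4_t bits = vreinterpretq_u32_f32 (x);
  bits = vandq_u32 (bits, vdupq_n_u32 (0x007fffff));	/* Keep the mantissa.  */
  bits = vorrq_u32 (bits, vdupq_n_u32 (0x3f800000));	/* OR in exponent of 1.0f.  */
  return vreinterpretq_f32_u32 (bits);
}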
Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* config/aarch64/aarch64-protos.h (aarch64_float_const_representable_p):
	Add overload.
	* config/aarch64/aarch64.cc (aarch64_float_const_zero_rtx_p): Reject
	integer modes.
	(aarch64_simd_valid_immediate, aarch64_float_const_representable_p):
	Check if integer value maps to an exact FP constant.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/const_create_using_fmov.c: New test.

---
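A note for reviewers on which integer constants end up qualifying: to my
understanding of the Arm ARM, the 8-bit FMOV immediate encodes exactly the
values +/- n/16 * 2^r with 16 <= n <= 31 and -3 <= r <= 4 (HFmode additionally
requiring the +fp16 instructions).  So 1.0, 1.25, 1.5 and -1.0 from the new
test all fit, while 0x4f800000 (2^32 as a float) does not, which is why f6
below still loads from the constant pool.  A rough standalone model of that
rule, for intuition only (this is not the check the patch implements, which
works on the REAL_VALUE_TYPE representation):

#include <math.h>
#include <stdbool.h>

static bool
fmov_imm_representable_p (double v)
{
  v = fabs (v);
  /* Try every encodable (n, r) pair; all such values are exact in double,
     so the == comparison is safe here.  */
  for (int r = -3; r <= 4; r++)
    for (int n = 16; n <= 31; n++)
      if (v == ldexp ((double) n / 16.0, r))
	return true;
  return false;
}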
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 7a84acc59569da0b50af2300615db561a5de460a..6c683ea2d93e1b733cfe49fac38381ea6451fd55 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -974,6 +974,7 @@ void aarch64_split_simd_move (rtx, rtx);
 
 /* Check for a legitimate floating point constant for FMOV.  */
 bool aarch64_float_const_representable_p (rtx, machine_mode);
+bool aarch64_float_const_representable_p (rtx *, rtx, machine_mode);
 
 extern int aarch64_epilogue_uses (int);
 
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 1842f6ecf6330f11a64545d0903240c89b104ffc..2d44608d93b8e7542ea8d5eb4c3f99c9f88e70ed 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -10991,7 +10991,8 @@ aarch64_float_const_zero_rtx_p (rtx x)
   /* 0.0 in Decimal Floating Point cannot be represented by #0 or
      zr as our callers expect, so no need to check the actual
      value if X is of Decimal Floating Point type.  */
-  if (GET_MODE_CLASS (GET_MODE (x)) == MODE_DECIMAL_FLOAT)
+  if (GET_MODE_CLASS (GET_MODE (x)) == MODE_DECIMAL_FLOAT
+      || !CONST_DOUBLE_P (x))
     return false;
 
   if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
@@ -23026,17 +23027,30 @@ aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
   else
     return false;
 
-  scalar_float_mode elt_float_mode;
-  if (n_elts == 1
-      && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
+  if (n_elts == 1)
     {
       rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
+      rtx new_elt = NULL_RTX;
       if (aarch64_float_const_zero_rtx_p (elt)
-	  || aarch64_float_const_representable_p (elt, elt_mode))
-	{
-	  if (info)
-	    *info = simd_immediate_info (elt_float_mode, elt);
-	  return true;
+	  || aarch64_float_const_representable_p (&new_elt, elt, elt_mode))
+	{
+	  scalar_float_mode elt_float_mode;
+	  auto bitsize = GET_MODE_UNIT_BITSIZE (elt_mode);
+	  if (is_a <scalar_float_mode> (elt_mode))
+	    elt_float_mode = as_a <scalar_float_mode> (elt_mode);
+	  else if (which == AARCH64_CHECK_MOV
+		   && new_elt
+		   && float_mode_for_size (bitsize).exists (&elt_float_mode))
+	    elt = new_elt;
+	  else
+	    elt = NULL_RTX;
+
+	  if (elt != NULL_RTX)
+	    {
+	      if (info)
+		*info = simd_immediate_info (elt_float_mode, elt);
+	      return true;
+	    }
 	}
     }
 
@@ -25121,8 +25135,22 @@ aarch64_c_mode_for_suffix (char suffix)
 
 /* Return true iff X with mode MODE can be represented by a quarter-precision
    floating point immediate operand X.  Note, we cannot represent 0.0.  */
+
 bool
 aarch64_float_const_representable_p (rtx x, machine_mode mode)
+{
+  return aarch64_float_const_representable_p (NULL, x, mode);
+}
+
+
+/* Return true iff X with mode MODE can be represented by a quarter-precision
+   floating point immediate operand X.  Note, we cannot represent 0.0.
+   If the value is a CONST_INT that can be represented as an exact floating
+   point then OUT will contain the new floating point value to emit to generate
+   the integer constant.  */
+
+bool
+aarch64_float_const_representable_p (rtx *out, rtx x, machine_mode mode)
 {
   /* This represents our current view of how many bits make up the
      mantissa.  */
@@ -25134,14 +25162,45 @@ aarch64_float_const_representable_p (rtx x, machine_mode mode)
   x = unwrap_const_vec_duplicate (x);
   mode = GET_MODE_INNER (mode);
 
-  if (!CONST_DOUBLE_P (x))
+  if (!CONST_DOUBLE_P (x)
+      && !CONST_INT_P (x))
     return false;
 
   if (mode == VOIDmode
-      || (mode == HFmode && !TARGET_FP_F16INST))
+      || ((mode == HFmode || mode == HImode) && !TARGET_FP_F16INST))
     return false;
 
-  r = *CONST_DOUBLE_REAL_VALUE (x);
+  /* If we have an integer bit pattern, decode it back into a real.
+     real_from_target requires the representation to be split into
+     32-bit values and then put into two host wide ints.  */
+  if (CONST_INT_P (x))
+    {
+      HOST_WIDE_INT buf = INTVAL (x);
+      long int as_long_ints[2];
+      as_long_ints[0] = buf & 0xFFFFFFFF;
+      as_long_ints[1] = (buf >> 32) & 0xFFFFFFFF;
+      machine_mode fmode;
+      switch (mode)
+	{
+	case HImode:
+	  fmode = HFmode;
+	  break;
+	case SImode:
+	  fmode = SFmode;
+	  break;
+	case DImode:
+	  fmode = DFmode;
+	  break;
+	default:
+	  return false;
+	}
+
+      real_from_target (&r, as_long_ints, fmode);
+      if (out)
+	*out = const_double_from_real_value (r, fmode);
+    }
+  else
+    r = *CONST_DOUBLE_REAL_VALUE (x);
 
   /* We cannot represent infinities, NaNs or +/-zero.  We won't
      know if we have +zero until we analyse the mantissa, but we
@@ -25170,6 +25229,7 @@ aarch64_float_const_representable_p (rtx x, machine_mode mode)
      the value.  */
   if (w.ulow () != 0)
     return false;
+
   /* We have rejected the lower HOST_WIDE_INT, so update our
      understanding of how many bits lie in the mantissa and
      look only at the high HOST_WIDE_INT.  */
@@ -25205,9 +25265,9 @@ aarch64_float_const_representable_p (rtx x, machine_mode mode)
   return (exponent >= 0 && exponent <= 7);
 }
 
-/* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
-   immediate with a CONST_VECTOR of MODE and WIDTH.  WHICH selects whether to
-   output MOVI/MVNI, ORR or BIC immediate. */
+/* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR, BIC or
+   FMOV immediate with a CONST_VECTOR of MODE and WIDTH.  WHICH selects whether
+   to output MOVI/MVNI, ORR or BIC immediate. */
 char*
 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
				   enum simd_immediate_check which)
diff --git a/gcc/testsuite/gcc.target/aarch64/const_create_using_fmov.c b/gcc/testsuite/gcc.target/aarch64/const_create_using_fmov.c
new file mode 100644
index 0000000000000000000000000000000000000000..e080afed8aa3578660027979335bfc859ca6bc91
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/const_create_using_fmov.c
@@ -0,0 +1,87 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=armv9-a -Ofast" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include <arm_neon.h>
+
+/*
+** g:
+**	fmov	v0\.4s, 1\.0e\+0
+**	ret
+*/
+float32x4_t g(){
+  return vdupq_n_f32(1);
+}
+
+/*
+** h:
+**	fmov	v0\.4s, 1\.0e\+0
+**	ret
+*/
+uint32x4_t h() {
+  return vreinterpretq_u32_f32(g());
+}
+
+/*
+** f1:
+**	fmov	v0\.4s, 1\.0e\+0
+**	ret
+*/
+uint32x4_t f1() {
+  return vdupq_n_u32(0x3f800000);
+}
+
+/*
+** f2:
+**	fmov	v0\.4s, 1\.5e\+0
+**	ret
+*/
+uint32x4_t f2() {
+  return vdupq_n_u32(0x3FC00000);
+}
+
+/*
+** f3:
+**	fmov	v0\.4s, 1\.25e\+0
+**	ret
+*/
+uint32x4_t f3() {
+  return vdupq_n_u32(0x3FA00000);
+}
+
+/*
+** f4:
+**	fmov	v0\.2d, 1\.0e\+0
+**	ret
+*/
+uint64x2_t f4() {
+  return vdupq_n_u64(0x3FF0000000000000);
+}
+
+/*
+** fn4:
+**	fmov	v0\.2d, -1\.0e\+0
+**	ret
+*/
+uint64x2_t fn4() {
+  return vdupq_n_u64(0xBFF0000000000000);
+}
+
+/*
+** f5:
+**	fmov	v0\.8h, 1\.5e\+0
+**	ret
+*/
+uint16x8_t f5() {
+  return vdupq_n_u16(0x3E00);
+}
+
+/*
+** f6:
+**	adrp	x0, \.LC0
+**	ldr	q0, \[x0, #:lo12:\.LC0\]
+**	ret
+*/
+uint32x4_t f6() {
+  return vdupq_n_u32(0x4f800000);
+}

--