[PATCH v2, rs6000] Add multiply-add expand pattern [PR103109]
Hi, This patch adds an expand and several insns for multiply-add with three 64bit operands. Compared with last version, the main changes are: 1 The "maddld" pattern is reused for the low-part generation. 2 A runnable testcase replaces the original compiling case. 3 Fixes indention problems. Bootstrapped and tested on powerpc64-linux BE and LE with no regressions. Is this okay for trunk? Any recommendations? Thanks a lot. ChangeLog 2022-08-08 Haochen Gui gcc/ PR target/103109 * config/rs6000/rs6000.md (maddditi4): New pattern for multiply-add. (madddi4_highpart): New. (madddi4_highpart_le): New. gcc/testsuite/ PR target/103109 * gcc.target/powerpc/pr103109.c: New. patch.diff diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md index c55ee7e171a..4c58023490a 100644 --- a/gcc/config/rs6000/rs6000.md +++ b/gcc/config/rs6000/rs6000.md @@ -3217,7 +3217,7 @@ (define_expand "mul3" DONE; }) -(define_insn "*maddld4" +(define_insn "maddld4" [(set (match_operand:GPR 0 "gpc_reg_operand" "=r") (plus:GPR (mult:GPR (match_operand:GPR 1 "gpc_reg_operand" "r") (match_operand:GPR 2 "gpc_reg_operand" "r")) @@ -3226,6 +3226,52 @@ (define_insn "*maddld4" "maddld %0,%1,%2,%3" [(set_attr "type" "mul")]) +(define_expand "maddditi4" + [(set (match_operand:TI 0 "gpc_reg_operand") + (plus:TI + (mult:TI (any_extend:TI (match_operand:DI 1 "gpc_reg_operand")) + (any_extend:TI (match_operand:DI 2 "gpc_reg_operand"))) + (any_extend:TI (match_operand:DI 3 "gpc_reg_operand"] + "TARGET_MADDLD && TARGET_POWERPC64" +{ + rtx op0_lo = gen_rtx_SUBREG (DImode, operands[0], BYTES_BIG_ENDIAN ? 8 : 0); + rtx op0_hi = gen_rtx_SUBREG (DImode, operands[0], BYTES_BIG_ENDIAN ? 0 : 8); + + emit_insn (gen_maddlddi4 (op0_lo, operands[1], operands[2], operands[3])); + + if (BYTES_BIG_ENDIAN) +emit_insn (gen_madddi4_highpart (op0_hi, operands[1], operands[2], + operands[3])); + else +emit_insn (gen_madddi4_highpart_le (op0_hi, operands[1], operands[2], + operands[3])); + DONE; +}) + +(define_insn "madddi4_highpart" + [(set (match_operand:DI 0 "gpc_reg_operand" "=r") + (subreg:DI + (plus:TI + (mult:TI (any_extend:TI (match_operand:DI 1 "gpc_reg_operand" "r")) +(any_extend:TI (match_operand:DI 2 "gpc_reg_operand" "r"))) + (any_extend:TI (match_operand:DI 3 "gpc_reg_operand" "r"))) +0))] + "TARGET_MADDLD && BYTES_BIG_ENDIAN && TARGET_POWERPC64" + "maddhd %0,%1,%2,%3" + [(set_attr "type" "mul")]) + +(define_insn "madddi4_highpart_le" + [(set (match_operand:DI 0 "gpc_reg_operand" "=r") + (subreg:DI + (plus:TI + (mult:TI (any_extend:TI (match_operand:DI 1 "gpc_reg_operand" "r")) +(any_extend:TI (match_operand:DI 2 "gpc_reg_operand" "r"))) + (any_extend:TI (match_operand:DI 3 "gpc_reg_operand" "r"))) +8))] + "TARGET_MADDLD && !BYTES_BIG_ENDIAN && TARGET_POWERPC64" + "maddhd %0,%1,%2,%3" + [(set_attr "type" "mul")]) + (define_insn "udiv3" [(set (match_operand:GPR 0 "gpc_reg_operand" "=r") (udiv:GPR (match_operand:GPR 1 "gpc_reg_operand" "r") diff --git a/gcc/testsuite/gcc.target/powerpc/pr103109.c b/gcc/testsuite/gcc.target/powerpc/pr103109.c new file mode 100644 index 000..969b9751b21 --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/pr103109.c @@ -0,0 +1,110 @@ +/* { dg-do run { target { has_arch_ppc64 } } } */ +/* { dg-options "-O2 -mdejagnu-cpu=power9 -save-temps" } */ +/* { dg-require-effective-target int128 } */ +/* { dg-require-effective-target p9modulo_hw } */ +/* { dg-final { scan-assembler-times {\mmaddld\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mmaddhd\M} 1 } } */ +/* { dg-final { scan-assembler-times {\mmaddhdu\M} 1 } } */ + +union U { + __int128 i128; + struct { +long l1; +long l2; + } s; +}; + +__int128 +create_i128 (long most_sig, long least_sig) +{ + union U u; + +#if __LITTLE_ENDIAN__ + u.s.l1 = least_sig; + u.s.l2 = most_sig; +#else + u.s.l1 = most_sig; + u.s.l2 = least_sig; +#endif + return u.i128; +} + + +#define DEBUG 0 + +#if DEBUG +#include +#include + +void print_i128(__int128 val, int unsignedp) +{ + if (unsignedp) +printf(" %llu ", (unsigned long long)(val >> 64)); + else +printf(" %lld ", (signed long long)(val >> 64)); + + printf("%llu (0x%llx %llx)", + (unsigned long long)(val & 0x), + (unsigned long long)(val >> 64), + (unsigned long long)(val & 0x)); +} +#endif + +void abort (void); + +__attribute__((noinline)) +__int128 multiply_add (long a, long b, long c) +{ + return (__int128) a * b + c; +} + +__attribute__((noinline)) +unsigned __int128 multiply_addu (unsigned long a, unsigned long b, +
Re: [PATCH][_GLIBCXX_DEBUG] Refine singular iterator state
Another version of this patch with just a new test case showing what wrong code was unnoticed previously by the _GLIBCXX_DEBUG mode. On 04/08/22 22:56, François Dumont wrote: This an old patch I had prepared a long time ago, I don't think I ever submitted it. libstdc++: [_GLIBCXX_DEBUG] Do not consider detached iterators as value-initialized An attach iterator has its _M_version set to something != 0. This value shall be preserved when detaching it so that the iterator does not look like a value initialized one. libstdc++-v3/ChangeLog: * include/debug/formatter.h (__singular_value_init): New _Iterator_state enum entry. (_Parameter<>(const _Safe_iterator<>&, const char*, _Is_iterator)): Check if iterator parameter is value-initialized. (_Parameter<>(const _Safe_local_iterator<>&, const char*, _Is_iterator)): Likewise. * include/debug/safe_iterator.h (_Safe_iterator<>::_M_value_initialized()): New. Adapt checks. * include/debug/safe_local_iterator.h (_Safe_local_iterator<>::_M_value_initialized()): New. Adapt checks. * src/c++11/debug.cc (_Safe_iterator_base::_M_reset): Do not reset _M_version. (print_field(PrintContext&, const _Parameter&, const char*)): Adapt state_names. * testsuite/23_containers/deque/debug/iterator1_neg.cc: New test. * testsuite/23_containers/deque/debug/iterator2_neg.cc: New test. * testsuite/23_containers/forward_list/debug/iterator1_neg.cc: New test. * testsuite/23_containers/forward_list/debug/iterator2_neg.cc: New test. Tested under Linux x86_64 _GLIBCXX_DEBUG mode. Ok to commit ? François diff --git a/libstdc++-v3/include/debug/formatter.h b/libstdc++-v3/include/debug/formatter.h index 80e8ba46d1e..748d4fbfea4 100644 --- a/libstdc++-v3/include/debug/formatter.h +++ b/libstdc++-v3/include/debug/formatter.h @@ -185,6 +185,7 @@ namespace __gnu_debug __rbegin, // dereferenceable, and at the reverse-beginning __rmiddle, // reverse-dereferenceable, not at the reverse-beginning __rend, // reverse-past-the-end + __singular_value_init, // singular, value initialized __last_state }; @@ -280,7 +281,12 @@ namespace __gnu_debug _M_variant._M_iterator._M_seq_type = _GLIBCXX_TYPEID(_Sequence); if (__it._M_singular()) - _M_variant._M_iterator._M_state = __singular; + { + if (__it._M_value_initialized()) + _M_variant._M_iterator._M_state = __singular_value_init; + else + _M_variant._M_iterator._M_state = __singular; + } else { if (__it._M_is_before_begin()) @@ -308,7 +314,12 @@ namespace __gnu_debug _M_variant._M_iterator._M_seq_type = _GLIBCXX_TYPEID(_Sequence); if (__it._M_singular()) - _M_variant._M_iterator._M_state = __singular; + { + if (__it._M_value_initialized()) + _M_variant._M_iterator._M_state = __singular_value_init; + else + _M_variant._M_iterator._M_state = __singular; + } else { if (__it._M_is_end()) diff --git a/libstdc++-v3/include/debug/safe_iterator.h b/libstdc++-v3/include/debug/safe_iterator.h index d613933e236..33f7a86478a 100644 --- a/libstdc++-v3/include/debug/safe_iterator.h +++ b/libstdc++-v3/include/debug/safe_iterator.h @@ -41,8 +41,8 @@ #define _GLIBCXX_DEBUG_VERIFY_OPERANDS(_Lhs, _Rhs, _BadMsgId, _DiffMsgId) \ _GLIBCXX_DEBUG_VERIFY(!_Lhs._M_singular() && !_Rhs._M_singular() \ - || (_Lhs.base() == _Iterator() \ - && _Rhs.base() == _Iterator()), \ + || (_Lhs._M_value_initialized() \ + && _Rhs._M_value_initialized()), \ _M_message(_BadMsgId)\ ._M_iterator(_Lhs, #_Lhs) \ ._M_iterator(_Rhs, #_Rhs)); \ @@ -177,7 +177,7 @@ namespace __gnu_debug // _GLIBCXX_RESOLVE_LIB_DEFECTS // DR 408. Is vector > forbidden? _GLIBCXX_DEBUG_VERIFY(!__x._M_singular() - || __x.base() == _Iterator(), + || __x._M_value_initialized(), _M_message(__msg_init_copy_singular) ._M_iterator(*this, "this") ._M_iterator(__x, "other")); @@ -193,7 +193,7 @@ namespace __gnu_debug : _Iter_base() { _GLIBCXX_DEBUG_VERIFY(!__x._M_singular() - || __x.base() == _Iterator(), + || __x._M_value_initialized(), _M_message(__msg_init_copy_singular) ._M_iterator(*this, "this") ._M_iterator(__x, "other")); @@ -220,7 +220,7 @@ namespace __gnu_debug // _GLIBCXX_RESOLVE_LIB_DEFECTS // DR 408. Is vector > forbidden? _GLIBCXX_DEBUG_VERIFY(!__x._M_singular() -|| __x.base() == _MutableIterator(), +|| __x._M_value_initialized(), _M_message(__msg_init_const_singular) ._M_iterator(*this, "this") ._M_iterator(__x, "other")); @@ -236,7 +236,7 @@ namespace __gnu_debug // _GLIBCXX_RESOLVE_LIB_DEFECTS // DR 408. Is vector > forbidden? _GLIBCXX_DEBUG_VERIFY(!__x._M_singular()
Re: 回复:[PATCH v5] LoongArch: add movable attribute
在 2022/8/5 下午5:53, Xi Ruoyao 写道: On Fri, 2022-08-05 at 15:58 +0800, Lulu Cheng wrote: I think the model of precpu is not very easy to describe. model(got)?model(global)? I also want to use attribute model and -mcmodel together, but this is just an initial idea, what do you think? It seems I had some misunderstanding about IA-64 model attribute. IA-64 actually does not have -mcmodel= options. And a code model only specifies where "the GOT and the local symbols" are, but our new attribute should apply for both local symbols and global symbols. So I don't think we should strongly bind the new attribute and -mcmodel. Maybe, __attribute__((addressing_model(got/pcrel32/pcrel64/abs32/abs64)) ? I think they are explicit enough (we can implement got and pc32 first, and adding the others when we implement other code models). I still think it makes a little bit more sense to put attribute(model) and -mcmodel together. -mcmodel sets the access range of all symbols in a single file, and attribute (model) sets the accsess range of a single symbol in a file. For example __attribute__((model(normal/large/extreme))).
[PATCH] rs6000: Fix incorrect RTL for Power LE when removing the UNSPECS [PR106069]
The native RTL expression for vec_mrghw should be same for BE and LE as they are register and endian-independent. So both BE and LE need generate exactly same RTL with index [0 4 1 5] when expanding vec_mrghw with vec_select and vec_concat. (set (reg:V4SI 141) (vec_select:V4SI (vec_concat:V8SI (subreg:V4SI (reg:V16QI 139) 0) (subreg:V4SI (reg:V16QI 140) 0)) [const_int 0 4 1 5])) Then combine pass could do the nested vec_select optimization in simplify-rtx.c:simplify_binary_operation_1 also on both BE and LE: 21: r150:V4SI=vec_select(vec_concat(r141:V4SI,r146:V4SI),parallel [0 4 1 5]) 24: {r151:SI=vec_select(r150:V4SI,parallel [const_int 3]);} => 21: r150:V4SI=vec_select(vec_concat(r141:V4SI,r146:V4SI),parallel) 24: {r151:SI=vec_select(r146:V4SI,parallel [const_int 1]);} The endianness check need only once at ASM generation finally. ASM would be better due to nested vec_select simplified to simple scalar load. Regression tested pass for Power8{LE,BE}{32,64} and Power{9,10}LE{32,64} Linux(Thanks to Kewen), OK for master? Or should we revert r12-4496 to restore to the UNSPEC implementation? gcc/ChangeLog: PR target/106069 * config/rs6000/altivec.md (altivec_vmrghb): Emit same native RTL for BE and LE. (altivec_vmrghh): Likewise. (altivec_vmrghw): Likewise. (*altivec_vmrghsf): Adjust. (altivec_vmrglb): Likewise. (altivec_vmrglh): Likewise. (altivec_vmrglw): Likewise. (*altivec_vmrglsf): Adjust. (altivec_vmrghb_direct): Emit different ASM for BE and LE. (altivec_vmrghh_direct): Likewise. (altivec_vmrghw_direct_): Likewise. (altivec_vmrglb_direct): Likewise. (altivec_vmrglh_direct): Likewise. (altivec_vmrglw_direct_): Likewise. (vec_widen_smult_hi_v16qi): Adjust. (vec_widen_smult_lo_v16qi): Adjust. (vec_widen_umult_hi_v16qi): Adjust. (vec_widen_umult_lo_v16qi): Adjust. (vec_widen_smult_hi_v8hi): Adjust. (vec_widen_smult_lo_v8hi): Adjust. (vec_widen_umult_hi_v8hi): Adjust. (vec_widen_umult_lo_v8hi): Adjust. * config/rs6000/rs6000.cc (altivec_expand_vec_perm_const): Emit same native RTL for BE and LE. * config/rs6000/vsx.md (vsx_xxmrghw_): Likewise. (vsx_xxmrglw_): Likewise. gcc/testsuite/ChangeLog: PR target/106069 * gcc.target/powerpc/pr106069.C: New test. Signed-off-by: Xionghu Luo --- gcc/config/rs6000/altivec.md| 122 gcc/config/rs6000/rs6000.cc | 36 +++--- gcc/config/rs6000/vsx.md| 16 +-- gcc/testsuite/gcc.target/powerpc/pr106069.C | 118 +++ 4 files changed, 209 insertions(+), 83 deletions(-) create mode 100644 gcc/testsuite/gcc.target/powerpc/pr106069.C diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md index 2c4940f2e21..8d9c0109559 100644 --- a/gcc/config/rs6000/altivec.md +++ b/gcc/config/rs6000/altivec.md @@ -1144,11 +1144,7 @@ (define_expand "altivec_vmrghb" (use (match_operand:V16QI 2 "register_operand"))] "TARGET_ALTIVEC" { - rtx (*fun) (rtx, rtx, rtx) = BYTES_BIG_ENDIAN ? gen_altivec_vmrghb_direct - : gen_altivec_vmrglb_direct; - if (!BYTES_BIG_ENDIAN) -std::swap (operands[1], operands[2]); - emit_insn (fun (operands[0], operands[1], operands[2])); + emit_insn (gen_altivec_vmrghb_direct (operands[0], operands[1], operands[2])); DONE; }) @@ -1167,7 +1163,12 @@ (define_insn "altivec_vmrghb_direct" (const_int 6) (const_int 22) (const_int 7) (const_int 23)])))] "TARGET_ALTIVEC" - "vmrghb %0,%1,%2" + { + if (BYTES_BIG_ENDIAN) + return "vmrghb %0,%1,%2"; +else + return "vmrglb %0,%2,%1"; + } [(set_attr "type" "vecperm")]) (define_expand "altivec_vmrghh" @@ -1176,11 +1177,7 @@ (define_expand "altivec_vmrghh" (use (match_operand:V8HI 2 "register_operand"))] "TARGET_ALTIVEC" { - rtx (*fun) (rtx, rtx, rtx) = BYTES_BIG_ENDIAN ? gen_altivec_vmrghh_direct - : gen_altivec_vmrglh_direct; - if (!BYTES_BIG_ENDIAN) -std::swap (operands[1], operands[2]); - emit_insn (fun (operands[0], operands[1], operands[2])); + emit_insn (gen_altivec_vmrghh_direct (operands[0], operands[1], operands[2])); DONE; }) @@ -1195,7 +1192,12 @@ (define_insn "altivec_vmrghh_direct" (const_int 2) (const_int 10) (const_int 3) (const_int 11)])))] "TARGET_ALTIVEC" - "vmrghh %0,%1,%2" + { + if (BYTES_BIG_ENDIAN) + return "vmrghh %0,%1,%2"; +else + return "vmrglh %0,%2,%1"; + } [(set_attr "type" "vecperm")]) (define_expand "altivec_vmrghw" @@ -1204,12 +1206,8 @@ (define_expand "altivec_vmrghw" (use (match_operand:V4SI 2 "register_operand"))] "V
[PATCH] Fix middle-end/103645: empty struct store not removed when using compound literal
From: Andrew Pinski For compound literals empty struct stores are not removed as they go down a different path of the gimplifier; trying to optimize the init constructor. This fixes the problem by not adding the gimple assignment at the end of gimplify_init_constructor if it was an empty type. Note this updates gcc.dg/pr87052.c where we had: const char d[0] = { }; And was expecting a store to d but after this, there is no store as the decl's type is zero in size. OK? Bootstrapped and tested on x86_64-linux-gnu with no regressions. gcc/ChangeLog: PR middle-end/103645 * gimplify.c (gimplify_init_constructor): Don't build/add gimple assignment of an empty type. testsuite/ChangeLog: * gcc.dg/pr87052.c: Update d var to expect nothing. --- gcc/gimplify.cc| 7 +-- gcc/testsuite/gcc.dg/pr87052.c | 6 +++--- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/gcc/gimplify.cc b/gcc/gimplify.cc index 2ac7ca0855e..f0fbdb48012 100644 --- a/gcc/gimplify.cc +++ b/gcc/gimplify.cc @@ -5488,8 +5488,11 @@ gimplify_init_constructor (tree *expr_p, gimple_seq *pre_p, gimple_seq *post_p, if (ret == GS_ERROR) return GS_ERROR; /* If we have gimplified both sides of the initializer but have - not emitted an assignment, do so now. */ - if (*expr_p) + not emitted an assignment, do so now. */ + if (*expr_p + /* If the type is an empty type, we don't need to emit the +assignment. */ + && !is_empty_type (TREE_TYPE (TREE_OPERAND (*expr_p, 0 { tree lhs = TREE_OPERAND (*expr_p, 0); tree rhs = TREE_OPERAND (*expr_p, 1); diff --git a/gcc/testsuite/gcc.dg/pr87052.c b/gcc/testsuite/gcc.dg/pr87052.c index 18e092c4674..796fe6440c1 100644 --- a/gcc/testsuite/gcc.dg/pr87052.c +++ b/gcc/testsuite/gcc.dg/pr87052.c @@ -23,8 +23,7 @@ void test (void) const char d[0] = { }; - /* Expect the following: - d = ""; */ + /* Expect nothing. */ const char e[0] = ""; @@ -36,6 +35,7 @@ void test (void) /* { dg-final { scan-tree-dump-times "a = \"x00ab\";" 1 "gimple" } } { dg-final { scan-tree-dump-times "b = \"ax00bc\";" 1 "gimple" } } { dg-final { scan-tree-dump-times "c = \"\";" 1 "gimple" } } - { dg-final { scan-tree-dump-times "d = { *};" 1 "gimple" } } + { dg-final { scan-tree-dump-times "d = " 1 "gimple" } } + { dg-final { scan-tree-dump-times "d = {CLOBBER\\(eol\\)}" 1 "gimple" } } { dg-final { scan-tree-dump-times "e = " 1 "gimple" } } { dg-final { scan-tree-dump-times "e = {CLOBBER\\(eol\\)}" 1 "gimple" } } */ -- 2.27.0
[COMMITTED] Move testcase gcc.dg/tree-ssa/pr93776.c to gcc.c-torture/compile/pr93776.c
From: Andrew Pinski Since this testcase is not exactly SSA specific and it would be a good idea to compile this at more than just at -O1, moving it to gcc.c-torture/compile would do that. Committed as obvious after a test on x86_64-linux-gnu. gcc/testsuite/ChangeLog: * gcc.dg/tree-ssa/pr93776.c: Moved to... * gcc.c-torture/compile/pr93776.c: ...here. --- .../{gcc.dg/tree-ssa => gcc.c-torture/compile}/pr93776.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) rename gcc/testsuite/{gcc.dg/tree-ssa => gcc.c-torture/compile}/pr93776.c (76%) diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr93776.c b/gcc/testsuite/gcc.c-torture/compile/pr93776.c similarity index 76% rename from gcc/testsuite/gcc.dg/tree-ssa/pr93776.c rename to gcc/testsuite/gcc.c-torture/compile/pr93776.c index c407a627718..3852736c040 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/pr93776.c +++ b/gcc/testsuite/gcc.c-torture/compile/pr93776.c @@ -1,5 +1,5 @@ -/* { dg-do compile } */ -/* { dg-options "-O1" } */ +/* This used to ICE in SRA as SRA got + confused by the zero signed assigment. */ struct empty {}; struct s { int i; }; -- 2.27.0
[Committed] Add -mno-stv to new gcc.target/i386/cmpti2.c test case.
Adding -march=cascadelake to the command line options of the new cmpti2.c testcase triggers TImode STV and produces vector code that doesn't match the scalar implementation that this test was intended to check. Adding -mno-stv to the options fixes this. Committed as obvious. 2022-08-07 Roger Sayle gcc/testsuite/ChangeLog * gcc.target/i386/cmpti2.c: Add -mno-stv to dg-options. Roger -- diff --git a/gcc/testsuite/gcc.target/i386/cmpti2.c b/gcc/testsuite/gcc.target/i386/cmpti2.c index ad9572901ce..ba7dd7292a0 100644 --- a/gcc/testsuite/gcc.target/i386/cmpti2.c +++ b/gcc/testsuite/gcc.target/i386/cmpti2.c @@ -1,5 +1,5 @@ /* { dg-do compile { target int128 } } */ -/* { dg-options "-O2" } */ +/* { dg-options "-O2 -mno-stv" } */ __int128 x; __int128 y;
[r13-1982 Regression] FAIL: gcc.target/i386/cmpti2.c scan-assembler-times xorq 4 on Linux/x86_64
On Linux/x86_64, a46bca36b7b3a8a7e15b04225fb2b4f9b1bed62c is the first bad commit commit a46bca36b7b3a8a7e15b04225fb2b4f9b1bed62c Author: Roger Sayle Date: Sun Aug 7 08:49:48 2022 +0100 Allow any immediate constant in *cmp_doubleword splitter on x86_64. caused FAIL: gcc.target/i386/cmpti2.c scan-assembler-times xorq 4 with GCC configured with ../../gcc/configure --prefix=/export/users/haochenj/src/gcc-bisect/master/master/r13-1982/usr --enable-clocale=gnu --with-system-zlib --with-demangler-in-ld --with-fpmath=sse --enable-languages=c,c++,fortran --enable-cet --without-isl --enable-libmpx x86_64-linux --disable-bootstrap To reproduce: $ cd {build_dir}/gcc && make check RUNTESTFLAGS="i386.exp=gcc.target/i386/cmpti2.c --target_board='unix{-m64\ -march=cascadelake}'" (Please do not reply to this email, for question about this report, contact me at haochen dot jiang at intel.com)
[PATCH] middle-end: Optimize ((X >> C1) & C2) != C3 for more cases.
Following my middle-end patch for PR tree-optimization/94026, I'd promised Jeff Law that I'd clean up the dead-code in fold-const.cc now that these optimizations are handled in match.pd. Alas, I discovered things aren't quite that simple, as the transformations I'd added avoided cases where C2 overlapped with the new bits introduced by the shift, but the original code handled any value of C2 provided that it had a single-bit set (under the condition that C3 was always zero). This patch upgrades the transformations supported by match.pd to cover any values of C2 and C3, provided that C1 is a valid bit shift constant, for all three shift types (logical right, arithmetic right and left). This then makes the code in fold-const.cc fully redundant, and adds support for some new (corner) cases not previously handled. If the constant C1 is valid for the type's precision, the shift is now always eliminated (with C2 and C3 possibly updated to test the sign bit). Interestingly, the fold-const.cc code that I'm now deleting was originally added by me back in 2006 to resolve PR middle-end/21137. I've confirmed that those testcase(s) remain resolved with this patch (and I'll close 21137 in Bugzilla). This patch also implements most (but not all) of the examples mentioned in PR tree-optimization/98954, for which I have some follow-up patches. This patch has been tested on x86_64-pc-linux-gnu with make bootstrap and make -k check, both with and without --target_board=unix{-m32}, with no new failures. Ok for mainline? 2022-08-07 Roger Sayle gcc/ChangeLog PR middle-end/21137 PR tree-optimization/98954 * fold-const.cc (fold_binary_loc): Remove optimizations to optimize ((X >> C1) & C2) ==/!= 0. * match.pd (cmp (bit_and (lshift @0 @1) @2) @3): Remove wi::ctz check, and handle all values of INTEGER_CSTs @2 and @3. (cmp (bit_and (rshift @0 @1) @2) @3): Likewise, remove wi::clz checks, and handle all values of INTEGER_CSTs @2 and @3. gcc/testsuite/ChangeLog PR middle-end/21137 PR tree-optimization/98954 * gcc.dg/fold-eqandshift-4.c: New test case. Thanks in advance, Roger -- diff --git a/gcc/fold-const.cc b/gcc/fold-const.cc index 99021a8..4f4ec81 100644 --- a/gcc/fold-const.cc +++ b/gcc/fold-const.cc @@ -12204,60 +12204,6 @@ fold_binary_loc (location_t loc, enum tree_code code, tree type, } } - /* Fold ((X >> C1) & C2) == 0 and ((X >> C1) & C2) != 0 where -C1 is a valid shift constant, and C2 is a power of two, i.e. -a single bit. */ - if (TREE_CODE (arg0) == BIT_AND_EXPR - && integer_pow2p (TREE_OPERAND (arg0, 1)) - && integer_zerop (arg1)) - { - tree arg00 = TREE_OPERAND (arg0, 0); - STRIP_NOPS (arg00); - if (TREE_CODE (arg00) == RSHIFT_EXPR - && TREE_CODE (TREE_OPERAND (arg00, 1)) == INTEGER_CST) - { - tree itype = TREE_TYPE (arg00); - tree arg001 = TREE_OPERAND (arg00, 1); - prec = TYPE_PRECISION (itype); - - /* Check for a valid shift count. */ - if (wi::ltu_p (wi::to_wide (arg001), prec)) - { - tree arg01 = TREE_OPERAND (arg0, 1); - tree arg000 = TREE_OPERAND (arg00, 0); - unsigned HOST_WIDE_INT log2 = tree_log2 (arg01); - /* If (C2 << C1) doesn't overflow, then -((X >> C1) & C2) != 0 can be rewritten as -(X & (C2 << C1)) != 0. */ - if ((log2 + TREE_INT_CST_LOW (arg001)) < prec) - { - tem = fold_build2_loc (loc, LSHIFT_EXPR, itype, -arg01, arg001); - tem = fold_build2_loc (loc, BIT_AND_EXPR, itype, -arg000, tem); - return fold_build2_loc (loc, code, type, tem, - fold_convert_loc (loc, itype, arg1)); - } - /* Otherwise, for signed (arithmetic) shifts, -((X >> C1) & C2) != 0 is rewritten as X < 0, and -((X >> C1) & C2) == 0 is rewritten as X >= 0. */ - else if (!TYPE_UNSIGNED (itype)) - return fold_build2_loc (loc, code == EQ_EXPR ? GE_EXPR -: LT_EXPR, - type, arg000, - build_int_cst (itype, 0)); - /* Otherwise, of unsigned (logical) shifts, -((X >> C1) & C2) != 0 is rewritten as (X,false), and -((X >> C1) & C2) == 0 is rewritten as (X,true). */ - else - return omit_one_operand_loc (loc, type, -code == EQ_EXPR ? integer_one_node -
[x86 PATCH take #2] Add peephole2 to reduce double word register shuffling
This is a resubmission of my patch from June to fix some forms of inefficient register allocation using an additional peephole2 in i386.md. https://gcc.gnu.org/pipermail/gcc-patches/2022-June/596064.html Since the original, a number of supporting patches/improvements have been reviewed and approved, making this peephole even more effective. Hence for the simple function: __int128 foo(__int128 x, __int128 y) { return x+y; } mainline GCC on x86_64 with -O2 currently generates: movq%rsi, %rax movq%rdi, %r8 movq%rax, %rdi movq%rdx, %rax movq%rcx, %rdx addq%r8, %rax adcq%rdi, %rdx ret with this patch we now generate (a three insn improvement): movq%rdx, %rax movq%rcx, %rdx addq%rdi, %rax adcq%rsi, %rdx ret Back in June the review of the original patch stalled, as peephole2 isn't the ideal place to fix this (with which I fully agree), and this patch is really just a workaround for a deeper deficiency in reload/lra. To address this I've now filed a new enhancement PR in Bugzilla, PR rtl-optimization/106518, that describes that underlying issue, which might make an interesting (GSoC) project for anyone brave (fool hardy) enough to tweak GCC's register allocation. By comparison, this single peephole can't adversely affect other targets, and should the happy day come that it's no longer required, at worst would just become a harmless legacy transform that no longer triggers. I'm also investigating Uros' suggestion that it may be possible for RTL expansion to do a better job expanding the function prologue, but ultimately the hard register placement constraints are fixed by the target ABI, and poor allocation/assignment of hard registers is the responsibility/fault of the register allocation passes. But it may still be possible to reduce register pressure, but avoiding the use of SUBREGs (which keep the source and destination double words live during shuffling) along the lines of Richard's CONCAT suggestion. This patch has been retested again mainline using make bootstrap and make -k check, both with and without --target_board=unix{-m32}, with no new failures. Ok mainline? 2022-08-07 Roger Sayle gcc/ChangeLog PR target/43644 PR rtl-optimization/97756 PR rtl-optimization/98438 * config/i386/i386.md (define_peephole2): Recognize double word swap sequences, and replace them with more efficient idioms, including using xchg when optimizing for size. gcc/testsuite/ChangeLog PR target/43644 * gcc.target/i386/pr43644.c: New test case. Thanks in advance, Roger -- diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 298e4b3..a11fd5b 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -3039,6 +3039,36 @@ [(parallel [(set (match_dup 0) (match_dup 1)) (set (match_dup 1) (match_dup 0))])]) +;; Replace a double word swap that requires 4 mov insns with a +;; 3 mov insn implementation (or an xchg when optimizing for size). +(define_peephole2 + [(set (match_operand:DWIH 0 "general_reg_operand") + (match_operand:DWIH 1 "general_reg_operand")) + (set (match_operand:DWIH 2 "general_reg_operand") + (match_operand:DWIH 3 "general_reg_operand")) + (clobber (match_operand: 4 "general_reg_operand")) + (set (match_dup 3) (match_dup 0)) + (set (match_dup 1) (match_dup 2))] + "REGNO (operands[0]) != REGNO (operands[3]) + && REGNO (operands[1]) != REGNO (operands[2]) + && REGNO (operands[1]) != REGNO (operands[3]) + && REGNO (operands[3]) == REGNO (operands[4]) + && peep2_reg_dead_p (4, operands[0]) + && peep2_reg_dead_p (5, operands[2])" + [(parallel [(set (match_dup 1) (match_dup 3)) + (set (match_dup 3) (match_dup 1))])] +{ + if (!optimize_insn_for_size_p ()) +{ + rtx tmp = REGNO (operands[0]) > REGNO (operands[2]) ? operands[0] + : operands[2]; + emit_move_insn (tmp, operands[1]); + emit_move_insn (operands[1], operands[3]); + emit_move_insn (operands[3], tmp); + DONE; +} +}) + (define_expand "movstrict" [(set (strict_low_part (match_operand:SWI12 0 "register_operand")) (match_operand:SWI12 1 "general_operand"))] diff --git a/gcc/testsuite/gcc.target/i386/pr43644.c b/gcc/testsuite/gcc.target/i386/pr43644.c new file mode 100644 index 000..ffdf31c --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr43644.c @@ -0,0 +1,11 @@ +/* { dg-do compile { target int128 } } */ +/* { dg-options "-O2" } */ + +__int128 foo(__int128 x, __int128 y) +{ + return x+y; +} + +/* { dg-final { scan-assembler-times "movq" 2 } } */ +/* { dg-final { scan-assembler-not "push" } } */ +/* { dg-final { scan-assembler-not "pop" } } */