[PATCH v2, rs6000] Add multiply-add expand pattern [PR103109]

2022-08-07 Thread HAO CHEN GUI via Gcc-patches
Hi,
  This patch adds an expand pattern and several insns for multiply-add with
three 64-bit operands.
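
  For reference, here is the kind of source the new expand pattern targets
(it is essentially the multiply_add function from the new test case below):
a 64x64->128-bit multiply plus a 64-bit addend, which Power9 implements with
maddld for the low doubleword and maddhd/maddhdu for the high doubleword.

__int128
multiply_add (long a, long b, long c)
{
  return (__int128) a * b + c;
}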

  Compared with the last version, the main changes are:
1. The "maddld" pattern is reused for the low-part generation.
2. A runnable test case replaces the original compile-only test.
3. Indentation problems are fixed.

  Bootstrapped and tested on powerpc64-linux BE and LE with no regressions.
Is this okay for trunk? Any recommendations? Thanks a lot.

ChangeLog
2022-08-08  Haochen Gui  

gcc/
PR target/103109
* config/rs6000/rs6000.md (maddditi4): New pattern for multiply-add.
(madddi4_highpart): New.
(madddi4_highpart_le): New.

gcc/testsuite/
PR target/103109
* gcc.target/powerpc/pr103109.c: New.



patch.diff
diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index c55ee7e171a..4c58023490a 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -3217,7 +3217,7 @@ (define_expand "mul<mode>3"
   DONE;
 })

-(define_insn "*maddld<mode>4"
+(define_insn "maddld<mode>4"
   [(set (match_operand:GPR 0 "gpc_reg_operand" "=r")
         (plus:GPR (mult:GPR (match_operand:GPR 1 "gpc_reg_operand" "r")
                             (match_operand:GPR 2 "gpc_reg_operand" "r"))
@@ -3226,6 +3226,52 @@ (define_insn "*maddld<mode>4"
   "maddld %0,%1,%2,%3"
   [(set_attr "type" "mul")])

+(define_expand "maddditi4"
+  [(set (match_operand:TI 0 "gpc_reg_operand")
+        (plus:TI
+          (mult:TI (any_extend:TI (match_operand:DI 1 "gpc_reg_operand"))
+                   (any_extend:TI (match_operand:DI 2 "gpc_reg_operand")))
+          (any_extend:TI (match_operand:DI 3 "gpc_reg_operand"))))]
+  "TARGET_MADDLD && TARGET_POWERPC64"
+{
+  rtx op0_lo = gen_rtx_SUBREG (DImode, operands[0], BYTES_BIG_ENDIAN ? 8 : 0);
+  rtx op0_hi = gen_rtx_SUBREG (DImode, operands[0], BYTES_BIG_ENDIAN ? 0 : 8);
+
+  emit_insn (gen_maddlddi4 (op0_lo, operands[1], operands[2], operands[3]));
+
+  if (BYTES_BIG_ENDIAN)
+    emit_insn (gen_madddi4_highpart (op0_hi, operands[1], operands[2],
+                                     operands[3]));
+  else
+    emit_insn (gen_madddi4_highpart_le (op0_hi, operands[1], operands[2],
+                                        operands[3]));
+  DONE;
+})
+
+(define_insn "madddi4_highpart"
+  [(set (match_operand:DI 0 "gpc_reg_operand" "=r")
+        (subreg:DI
+          (plus:TI
+            (mult:TI (any_extend:TI (match_operand:DI 1 "gpc_reg_operand" "r"))
+                     (any_extend:TI (match_operand:DI 2 "gpc_reg_operand" "r")))
+            (any_extend:TI (match_operand:DI 3 "gpc_reg_operand" "r")))
+          0))]
+  "TARGET_MADDLD && BYTES_BIG_ENDIAN && TARGET_POWERPC64"
+  "maddhd %0,%1,%2,%3"
+  [(set_attr "type" "mul")])
+
+(define_insn "madddi4_highpart_le"
+  [(set (match_operand:DI 0 "gpc_reg_operand" "=r")
+        (subreg:DI
+          (plus:TI
+            (mult:TI (any_extend:TI (match_operand:DI 1 "gpc_reg_operand" "r"))
+                     (any_extend:TI (match_operand:DI 2 "gpc_reg_operand" "r")))
+            (any_extend:TI (match_operand:DI 3 "gpc_reg_operand" "r")))
+          8))]
+  "TARGET_MADDLD && !BYTES_BIG_ENDIAN && TARGET_POWERPC64"
+  "maddhd %0,%1,%2,%3"
+  [(set_attr "type" "mul")])
+
 (define_insn "udiv<mode>3"
   [(set (match_operand:GPR 0 "gpc_reg_operand" "=r")
 (udiv:GPR (match_operand:GPR 1 "gpc_reg_operand" "r")
diff --git a/gcc/testsuite/gcc.target/powerpc/pr103109.c 
b/gcc/testsuite/gcc.target/powerpc/pr103109.c
new file mode 100644
index 000..969b9751b21
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr103109.c
@@ -0,0 +1,110 @@
+/* { dg-do run { target { has_arch_ppc64 } } } */
+/* { dg-options "-O2 -mdejagnu-cpu=power9 -save-temps" } */
+/* { dg-require-effective-target int128 } */
+/* { dg-require-effective-target p9modulo_hw } */
+/* { dg-final { scan-assembler-times {\mmaddld\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mmaddhd\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mmaddhdu\M} 1 } } */
+
+union U {
+  __int128 i128;
+  struct {
+long l1;
+long l2;
+  } s;
+};
+
+__int128
+create_i128 (long most_sig, long least_sig)
+{
+  union U u;
+
+#if __LITTLE_ENDIAN__
+  u.s.l1 = least_sig;
+  u.s.l2 = most_sig;
+#else
+  u.s.l1 = most_sig;
+  u.s.l2 = least_sig;
+#endif
+  return u.i128;
+}
+
+
+#define DEBUG 0
+
+#if DEBUG
+#include 
+#include 
+
+void print_i128(__int128 val, int unsignedp)
+{
+  if (unsignedp)
+printf(" %llu ", (unsigned long long)(val >> 64));
+  else
+printf(" %lld ", (signed long long)(val >> 64));
+
+  printf("%llu (0x%llx %llx)",
+ (unsigned long long)(val & 0x),
+ (unsigned long long)(val >> 64),
+ (unsigned long long)(val & 0x));
+}
+#endif
+
+void abort (void);
+
+__attribute__((noinline))
+__int128 multiply_add (long a, long b, long c)
+{
+  return (__int128) a * b + c;
+}
+
+__attribute__((noinline))
+unsigned __int128 multiply_addu (unsigned long a, unsigned long b,
+  

Re: [PATCH][_GLIBCXX_DEBUG] Refine singular iterator state

2022-08-07 Thread François Dumont via Gcc-patches
Here is another version of this patch, with just a new test case added
showing what wrong code previously went unnoticed by the _GLIBCXX_DEBUG mode.


On 04/08/22 22:56, François Dumont wrote:
This is an old patch I had prepared a long time ago; I don't think I ever
submitted it.


    libstdc++: [_GLIBCXX_DEBUG] Do not consider detached iterators as 
value-initialized


    An attached iterator has its _M_version set to something != 0.  This
    value shall be preserved when detaching it so that the iterator does
    not look like a value-initialized one.


    libstdc++-v3/ChangeLog:

    * include/debug/formatter.h (__singular_value_init): New 
_Iterator_state enum entry.
    (_Parameter<>(const _Safe_iterator<>&, const char*, 
_Is_iterator)): Check if iterator

    parameter is value-initialized.
    (_Parameter<>(const _Safe_local_iterator<>&, const char*, 
_Is_iterator)): Likewise.
    * include/debug/safe_iterator.h 
(_Safe_iterator<>::_M_value_initialized()): New. Adapt

    checks.
    * include/debug/safe_local_iterator.h 
(_Safe_local_iterator<>::_M_value_initialized()): New.

    Adapt checks.
    * src/c++11/debug.cc (_Safe_iterator_base::_M_reset): Do 
not reset _M_version.
    (print_field(PrintContext&, const _Parameter&, const 
char*)): Adapt state_names.
    * testsuite/23_containers/deque/debug/iterator1_neg.cc: 
New test.
    * testsuite/23_containers/deque/debug/iterator2_neg.cc: 
New test.
    * 
testsuite/23_containers/forward_list/debug/iterator1_neg.cc: New test.
    * 
testsuite/23_containers/forward_list/debug/iterator2_neg.cc: New test.


Tested under Linux x86_64 _GLIBCXX_DEBUG mode.

Ok to commit?

François


diff --git a/libstdc++-v3/include/debug/formatter.h b/libstdc++-v3/include/debug/formatter.h
index 80e8ba46d1e..748d4fbfea4 100644
--- a/libstdc++-v3/include/debug/formatter.h
+++ b/libstdc++-v3/include/debug/formatter.h
@@ -185,6 +185,7 @@ namespace __gnu_debug
   __rbegin,		// dereferenceable, and at the reverse-beginning
   __rmiddle,	// reverse-dereferenceable, not at the reverse-beginning
   __rend,		// reverse-past-the-end
+  __singular_value_init,	// singular, value initialized
   __last_state
 };
 
@@ -280,7 +281,12 @@ namespace __gnu_debug
 	  _M_variant._M_iterator._M_seq_type = _GLIBCXX_TYPEID(_Sequence);
 
 	  if (__it._M_singular())
-	_M_variant._M_iterator._M_state = __singular;
+	{
+	  if (__it._M_value_initialized())
+		_M_variant._M_iterator._M_state = __singular_value_init;
+	  else
+		_M_variant._M_iterator._M_state = __singular;
+	}
 	  else
 	{
 	  if (__it._M_is_before_begin())
@@ -308,7 +314,12 @@ namespace __gnu_debug
 	  _M_variant._M_iterator._M_seq_type = _GLIBCXX_TYPEID(_Sequence);
 
 	  if (__it._M_singular())
-	_M_variant._M_iterator._M_state = __singular;
+	{
+	  if (__it._M_value_initialized())
+		_M_variant._M_iterator._M_state = __singular_value_init;
+	  else
+		_M_variant._M_iterator._M_state = __singular;
+	}
 	  else
 	{
 	  if (__it._M_is_end())
diff --git a/libstdc++-v3/include/debug/safe_iterator.h b/libstdc++-v3/include/debug/safe_iterator.h
index d613933e236..33f7a86478a 100644
--- a/libstdc++-v3/include/debug/safe_iterator.h
+++ b/libstdc++-v3/include/debug/safe_iterator.h
@@ -41,8 +41,8 @@
 
 #define _GLIBCXX_DEBUG_VERIFY_OPERANDS(_Lhs, _Rhs, _BadMsgId, _DiffMsgId) \
   _GLIBCXX_DEBUG_VERIFY(!_Lhs._M_singular() && !_Rhs._M_singular()	\
-			|| (_Lhs.base() == _Iterator()			\
-			&& _Rhs.base() == _Iterator()),		\
+			|| (_Lhs._M_value_initialized()			\
+			&& _Rhs._M_value_initialized()),		\
 			_M_message(_BadMsgId)\
 			._M_iterator(_Lhs, #_Lhs)			\
 			._M_iterator(_Rhs, #_Rhs));			\
@@ -177,7 +177,7 @@ namespace __gnu_debug
 	// _GLIBCXX_RESOLVE_LIB_DEFECTS
 	// DR 408. Is vector > forbidden?
 	_GLIBCXX_DEBUG_VERIFY(!__x._M_singular()
-			  || __x.base() == _Iterator(),
+			  || __x._M_value_initialized(),
 			  _M_message(__msg_init_copy_singular)
 			  ._M_iterator(*this, "this")
 			  ._M_iterator(__x, "other"));
@@ -193,7 +193,7 @@ namespace __gnu_debug
   : _Iter_base()
   {
 	_GLIBCXX_DEBUG_VERIFY(!__x._M_singular()
-			  || __x.base() == _Iterator(),
+			  || __x._M_value_initialized(),
 			  _M_message(__msg_init_copy_singular)
 			  ._M_iterator(*this, "this")
 			  ._M_iterator(__x, "other"));
@@ -220,7 +220,7 @@ namespace __gnu_debug
 	  // _GLIBCXX_RESOLVE_LIB_DEFECTS
 	  // DR 408. Is vector > forbidden?
 	  _GLIBCXX_DEBUG_VERIFY(!__x._M_singular()
-|| __x.base() == _MutableIterator(),
+|| __x._M_value_initialized(),
 _M_message(__msg_init_const_singular)
 ._M_iterator(*this, "this")
 ._M_iterator(__x, "other"));
@@ -236,7 +236,7 @@ namespace __gnu_debug
 	// _GLIBCXX_RESOLVE_LIB_DEFECTS
 	// DR 408. Is vector > forbidden?
 	_GLIBCXX_DEBUG_VERIFY(!__x._M_singular()

Re: 回复:[PATCH v5] LoongArch: add movable attribute

2022-08-07 Thread Lulu Cheng



On 2022/8/5 at 5:53 PM, Xi Ruoyao wrote:

On Fri, 2022-08-05 at 15:58 +0800, Lulu Cheng wrote:

I think the model for percpu variables is not very easy to describe.
model(got)? model(global)?
I also want to use the attribute model and -mcmodel together, but this is
just an initial idea; what do you think?

It seems I had some misunderstanding about the IA-64 model attribute.  IA-64
actually does not have -mcmodel= options.  And a code model only
specifies where "the GOT and the local symbols" are, but our new
attribute should apply to both local symbols and global symbols.  So I
don't think we should strongly bind the new attribute and -mcmodel.

Maybe __attribute__((addressing_model(got/pcrel32/pcrel64/abs32/abs64)))?
I think they are explicit enough (we can implement got and pcrel32
first, and add the others when we implement other code models).


I still think it makes a little bit more sense to put attribute(model)
and -mcmodel together.

-mcmodel sets the access range of all symbols in a single file, while
attribute(model) sets the access range of a single symbol in a file.
For example __attribute__((model(normal/large/extreme))).
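
A hypothetical sketch of that idea (the attribute is only being proposed in
this thread, so the name, spelling and accepted arguments are not final):
the translation unit is built with -mcmodel=normal, while a single symbol is
marked for the extreme model.

/* Hypothetical syntax following the discussion above; not an implemented
   GCC attribute at the time of this thread.  */
extern int big_table[] __attribute__ ((model (extreme)));

int
get_entry (int i)
{
  /* Accesses to big_table alone would use the extreme addressing model,
     while everything else follows -mcmodel=normal.  */
  return big_table[i];
}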




[PATCH] rs6000: Fix incorrect RTL for Power LE when removing the UNSPECS [PR106069]

2022-08-07 Thread Xionghu Luo via Gcc-patches
The native RTL expression for vec_mrghw should be the same for BE and LE,
as the merge operations are register- and endian-independent.  So both
BE and LE need to generate exactly the same RTL with index [0 4 1 5] when
expanding vec_mrghw with vec_select and vec_concat:

(set (reg:V4SI 141) (vec_select:V4SI (vec_concat:V8SI
   (subreg:V4SI (reg:V16QI 139) 0)
   (subreg:V4SI (reg:V16QI 140) 0))
   [const_int 0 4 1 5]))

Then the combine pass can also do the nested vec_select optimization
in simplify-rtx.c:simplify_binary_operation_1 on both BE and LE:

21: r150:V4SI=vec_select(vec_concat(r141:V4SI,r146:V4SI),parallel [0 4 1 5])
24: {r151:SI=vec_select(r150:V4SI,parallel [const_int 3]);}

=>

21: r150:V4SI=vec_select(vec_concat(r141:V4SI,r146:V4SI),parallel)
24: {r151:SI=vec_select(r146:V4SI,parallel [const_int 1]);}

The endianness check is then needed only once, when the final ASM is
generated.  The resulting ASM is also better, since the nested vec_select is
simplified to a simple scalar load.
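
As a sketch (not the test case added by this patch), the shape of code that
benefits is a merge followed by extracting a single lane; with the
endian-independent RTL above, combine can fold the outer vec_select through
the vec_concat so the full merge result never needs to be materialized:

#include <altivec.h>

int
extract_lane (vector int a, vector int b)
{
  vector int m = vec_mergeh (a, b);   /* expands through altivec_vmrghw */
  return m[3];                        /* nested vec_select folded by combine */
}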

Regression tested with no failures on Power8{LE,BE}{32,64} and
Power{9,10}LE{32,64} Linux (thanks to Kewen).  OK for master?  Or should we
revert r12-4496 to restore the UNSPEC implementation?

gcc/ChangeLog:
PR target/106069
* config/rs6000/altivec.md (altivec_vmrghb): Emit same native
RTL for BE and LE.
(altivec_vmrghh): Likewise.
(altivec_vmrghw): Likewise.
(*altivec_vmrghsf): Adjust.
(altivec_vmrglb): Likewise.
(altivec_vmrglh): Likewise.
(altivec_vmrglw): Likewise.
(*altivec_vmrglsf): Adjust.
(altivec_vmrghb_direct): Emit different ASM for BE and LE.
(altivec_vmrghh_direct): Likewise.
(altivec_vmrghw_direct_<mode>): Likewise.
(altivec_vmrglb_direct): Likewise.
(altivec_vmrglh_direct): Likewise.
(altivec_vmrglw_direct_<mode>): Likewise.
(vec_widen_smult_hi_v16qi): Adjust.
(vec_widen_smult_lo_v16qi): Adjust.
(vec_widen_umult_hi_v16qi): Adjust.
(vec_widen_umult_lo_v16qi): Adjust.
(vec_widen_smult_hi_v8hi): Adjust.
(vec_widen_smult_lo_v8hi): Adjust.
(vec_widen_umult_hi_v8hi): Adjust.
(vec_widen_umult_lo_v8hi): Adjust.
* config/rs6000/rs6000.cc (altivec_expand_vec_perm_const): Emit same
native RTL for BE and LE.
* config/rs6000/vsx.md (vsx_xxmrghw_<mode>): Likewise.
(vsx_xxmrglw_<mode>): Likewise.

gcc/testsuite/ChangeLog:
PR target/106069
* gcc.target/powerpc/pr106069.C: New test.

Signed-off-by: Xionghu Luo 
---
 gcc/config/rs6000/altivec.md| 122 
 gcc/config/rs6000/rs6000.cc |  36 +++---
 gcc/config/rs6000/vsx.md|  16 +--
 gcc/testsuite/gcc.target/powerpc/pr106069.C | 118 +++
 4 files changed, 209 insertions(+), 83 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr106069.C

diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md
index 2c4940f2e21..8d9c0109559 100644
--- a/gcc/config/rs6000/altivec.md
+++ b/gcc/config/rs6000/altivec.md
@@ -1144,11 +1144,7 @@ (define_expand "altivec_vmrghb"
(use (match_operand:V16QI 2 "register_operand"))]
   "TARGET_ALTIVEC"
 {
-  rtx (*fun) (rtx, rtx, rtx) = BYTES_BIG_ENDIAN ? gen_altivec_vmrghb_direct
-                                                : gen_altivec_vmrglb_direct;
-  if (!BYTES_BIG_ENDIAN)
-    std::swap (operands[1], operands[2]);
-  emit_insn (fun (operands[0], operands[1], operands[2]));
+  emit_insn (gen_altivec_vmrghb_direct (operands[0], operands[1],
+                                        operands[2]));
   DONE;
 })
 
@@ -1167,7 +1163,12 @@ (define_insn "altivec_vmrghb_direct"
 (const_int 6) (const_int 22)
 (const_int 7) (const_int 23)])))]
   "TARGET_ALTIVEC"
-  "vmrghb %0,%1,%2"
+  {
+ if (BYTES_BIG_ENDIAN)
+  return "vmrghb %0,%1,%2";
+else
+  return "vmrglb %0,%2,%1";
+ }
   [(set_attr "type" "vecperm")])
 
 (define_expand "altivec_vmrghh"
@@ -1176,11 +1177,7 @@ (define_expand "altivec_vmrghh"
(use (match_operand:V8HI 2 "register_operand"))]
   "TARGET_ALTIVEC"
 {
-  rtx (*fun) (rtx, rtx, rtx) = BYTES_BIG_ENDIAN ? gen_altivec_vmrghh_direct
-                                                : gen_altivec_vmrglh_direct;
-  if (!BYTES_BIG_ENDIAN)
-    std::swap (operands[1], operands[2]);
-  emit_insn (fun (operands[0], operands[1], operands[2]));
+  emit_insn (gen_altivec_vmrghh_direct (operands[0], operands[1],
+                                        operands[2]));
   DONE;
 })
 
@@ -1195,7 +1192,12 @@ (define_insn "altivec_vmrghh_direct"
 (const_int 2) (const_int 10)
 (const_int 3) (const_int 11)])))]
   "TARGET_ALTIVEC"
-  "vmrghh %0,%1,%2"
+  {
+ if (BYTES_BIG_ENDIAN)
+  return "vmrghh %0,%1,%2";
+else
+  return "vmrglh %0,%2,%1";
+ }
   [(set_attr "type" "vecperm")])
 
 (define_expand "altivec_vmrghw"
@@ -1204,12 +1206,8 @@ (define_expand "altivec_vmrghw"
(use (match_operand:V4SI 2 "register_operand"))]
   "V

[PATCH] Fix middle-end/103645: empty struct store not removed when using compound literal

2022-08-07 Thread apinski--- via Gcc-patches
From: Andrew Pinski 

For compound literals, empty struct stores are not removed because they go
down a different path of the gimplifier that tries to optimize the init
constructor.  This patch fixes the problem by not adding the gimple
assignment at the end of gimplify_init_constructor if the type is an empty
type.

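A minimal sketch of the pattern in question (not the exact test case from
the PR): an empty-struct store written through a compound literal.

struct empty {};

void
store_empty (struct empty *p)
{
  /* The compound literal is gimplified via gimplify_init_constructor;
     with this patch no gimple assignment is emitted for it, since the
     type is an empty type.  */
  *p = (struct empty) {};
}
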
Note this updates gcc.dg/pr87052.c where we had:
const char d[0] = { };
and were expecting a store to d; after this change there is no store,
as the decl's type has zero size.

OK? Bootstrapped and tested on x86_64-linux-gnu with no regressions.

gcc/ChangeLog:

PR middle-end/103645
* gimplify.cc (gimplify_init_constructor): Don't build/add
gimple assignment of an empty type.

testsuite/ChangeLog:
* gcc.dg/pr87052.c: Update d var to expect nothing.
---
 gcc/gimplify.cc| 7 +--
 gcc/testsuite/gcc.dg/pr87052.c | 6 +++---
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/gcc/gimplify.cc b/gcc/gimplify.cc
index 2ac7ca0855e..f0fbdb48012 100644
--- a/gcc/gimplify.cc
+++ b/gcc/gimplify.cc
@@ -5488,8 +5488,11 @@ gimplify_init_constructor (tree *expr_p, gimple_seq 
*pre_p, gimple_seq *post_p,
   if (ret == GS_ERROR)
 return GS_ERROR;
   /* If we have gimplified both sides of the initializer but have
- not emitted an assignment, do so now.  */
-  if (*expr_p)
+ not emitted an assignment, do so now.   */
+  if (*expr_p
+      /* If the type is an empty type, we don't need to emit the
+         assignment.  */
+      && !is_empty_type (TREE_TYPE (TREE_OPERAND (*expr_p, 0))))
 {
   tree lhs = TREE_OPERAND (*expr_p, 0);
   tree rhs = TREE_OPERAND (*expr_p, 1);
diff --git a/gcc/testsuite/gcc.dg/pr87052.c b/gcc/testsuite/gcc.dg/pr87052.c
index 18e092c4674..796fe6440c1 100644
--- a/gcc/testsuite/gcc.dg/pr87052.c
+++ b/gcc/testsuite/gcc.dg/pr87052.c
@@ -23,8 +23,7 @@ void test (void)
 
   const char d[0] = { };
 
-  /* Expect the following:
- d = ""; */
+  /* Expect nothing.  */
 
   const char e[0] = "";
 
@@ -36,6 +35,7 @@ void test (void)
 /* { dg-final { scan-tree-dump-times "a = \"x00ab\";" 1 "gimple" } }
{ dg-final { scan-tree-dump-times "b = \"ax00bc\";"  1 "gimple" } }
{ dg-final { scan-tree-dump-times "c = \"\";"  1 "gimple" } }
-   { dg-final { scan-tree-dump-times "d = { *};"  1 "gimple" } }
+   { dg-final { scan-tree-dump-times "d = "  1 "gimple" } }
+   { dg-final { scan-tree-dump-times "d = {CLOBBER\\(eol\\)}"  1 "gimple" } }
{ dg-final { scan-tree-dump-times "e = "  1 "gimple" } }
{ dg-final { scan-tree-dump-times "e = {CLOBBER\\(eol\\)}"  1 "gimple" } }  
*/
-- 
2.27.0



[COMMITTED] Move testcase gcc.dg/tree-ssa/pr93776.c to gcc.c-torture/compile/pr93776.c

2022-08-07 Thread apinski--- via Gcc-patches
From: Andrew Pinski 

Since this testcase is not exactly SSA-specific, and it would be a good idea
to compile it at more than just -O1, moving it to gcc.c-torture/compile
accomplishes that.

Committed as obvious after a test on x86_64-linux-gnu.

gcc/testsuite/ChangeLog:

* gcc.dg/tree-ssa/pr93776.c: Moved to...
* gcc.c-torture/compile/pr93776.c: ...here.
---
 .../{gcc.dg/tree-ssa => gcc.c-torture/compile}/pr93776.c  | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
 rename gcc/testsuite/{gcc.dg/tree-ssa => gcc.c-torture/compile}/pr93776.c (76%)

diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr93776.c 
b/gcc/testsuite/gcc.c-torture/compile/pr93776.c
similarity index 76%
rename from gcc/testsuite/gcc.dg/tree-ssa/pr93776.c
rename to gcc/testsuite/gcc.c-torture/compile/pr93776.c
index c407a627718..3852736c040 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/pr93776.c
+++ b/gcc/testsuite/gcc.c-torture/compile/pr93776.c
@@ -1,5 +1,5 @@
-/* { dg-do compile } */
-/* { dg-options "-O1" } */
+/* This used to ICE in SRA as SRA got
+   confused by the zero sized assignment.  */
 
 struct empty {};
 struct s { int i; };
-- 
2.27.0



[Committed] Add -mno-stv to new gcc.target/i386/cmpti2.c test case.

2022-08-07 Thread Roger Sayle

Adding -march=cascadelake to the command line options of the new cmpti2.c
testcase triggers TImode STV and produces vector code that doesn't match
the scalar implementation that this test was intended to check.  Adding
-mno-stv to the options fixes this.  Committed as obvious.


2022-08-07  Roger Sayle  

gcc/testsuite/ChangeLog
* gcc.target/i386/cmpti2.c: Add -mno-stv to dg-options.

Roger
--

diff --git a/gcc/testsuite/gcc.target/i386/cmpti2.c 
b/gcc/testsuite/gcc.target/i386/cmpti2.c
index ad9572901ce..ba7dd7292a0 100644
--- a/gcc/testsuite/gcc.target/i386/cmpti2.c
+++ b/gcc/testsuite/gcc.target/i386/cmpti2.c
@@ -1,5 +1,5 @@
 /* { dg-do compile { target int128 } } */
-/* { dg-options "-O2" } */
+/* { dg-options "-O2 -mno-stv" } */
 
 __int128 x;
 __int128 y;


[r13-1982 Regression] FAIL: gcc.target/i386/cmpti2.c scan-assembler-times xorq 4 on Linux/x86_64

2022-08-07 Thread haochen.jiang via Gcc-patches
On Linux/x86_64,

a46bca36b7b3a8a7e15b04225fb2b4f9b1bed62c is the first bad commit
commit a46bca36b7b3a8a7e15b04225fb2b4f9b1bed62c
Author: Roger Sayle 
Date:   Sun Aug 7 08:49:48 2022 +0100

Allow any immediate constant in *cmp_doubleword splitter on x86_64.

caused

FAIL: gcc.target/i386/cmpti2.c scan-assembler-times xorq 4

with GCC configured with

../../gcc/configure 
--prefix=/export/users/haochenj/src/gcc-bisect/master/master/r13-1982/usr 
--enable-clocale=gnu --with-system-zlib --with-demangler-in-ld 
--with-fpmath=sse --enable-languages=c,c++,fortran --enable-cet --without-isl 
--enable-libmpx x86_64-linux --disable-bootstrap

To reproduce:

$ cd {build_dir}/gcc && make check 
RUNTESTFLAGS="i386.exp=gcc.target/i386/cmpti2.c --target_board='unix{-m64\ 
-march=cascadelake}'"

(Please do not reply to this email, for question about this report, contact me 
at haochen dot jiang at intel.com)


[PATCH] middle-end: Optimize ((X >> C1) & C2) != C3 for more cases.

2022-08-07 Thread Roger Sayle

Following my middle-end patch for PR tree-optimization/94026, I'd promised
Jeff Law that I'd clean up the dead code in fold-const.cc now that these
optimizations are handled in match.pd.  Alas, I discovered things aren't
quite that simple, as the transformations I'd added avoided cases where
C2 overlapped with the new bits introduced by the shift, but the original
code handled any value of C2 provided that it had a single bit set (under
the condition that C3 was always zero).

This patch upgrades the transformations supported by match.pd to cover
any values of C2 and C3, provided that C1 is a valid bit shift constant,
for all three shift types (logical right, arithmetic right and left).
This then makes the code in fold-const.cc fully redundant, and adds
support for some new (corner) cases not previously handled.  If the
constant C1 is valid for the type's precision, the shift is now always
eliminated (with C2 and C3 possibly updated to test the sign bit).

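For illustration (hedged examples of the intended folds, not taken from the
new test case), two of the shapes now handled entirely in match.pd:

int
test_bit2 (unsigned int x)
{
  return ((x >> 2) & 1) != 0;     /* folds to (x & 4) != 0 */
}

int
test_sign (int x)
{
  return ((x >> 31) & 1) != 0;    /* folds to x < 0, testing the sign bit */
}
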
Interestingly, the fold-const.cc code that I'm now deleting was originally
added by me back in 2006 to resolve PR middle-end/21137.  I've confirmed
that those testcase(s) remain resolved with this patch (and I'll close
21137 in Bugzilla).  This patch also implements most (but not all) of the
examples mentioned in PR tree-optimization/98954, for which I have some
follow-up patches.

This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
and make -k check, both with and without --target_board=unix{-m32},
with no new failures. Ok for mainline?


2022-08-07  Roger Sayle  

gcc/ChangeLog
PR middle-end/21137
PR tree-optimization/98954
* fold-const.cc (fold_binary_loc): Remove optimizations to
optimize ((X >> C1) & C2) ==/!= 0.
* match.pd (cmp (bit_and (lshift @0 @1) @2) @3): Remove wi::ctz
check, and handle all values of INTEGER_CSTs @2 and @3.
(cmp (bit_and (rshift @0 @1) @2) @3): Likewise, remove wi::clz
checks, and handle all values of INTEGER_CSTs @2 and @3.

gcc/testsuite/ChangeLog
PR middle-end/21137
PR tree-optimization/98954
* gcc.dg/fold-eqandshift-4.c: New test case.


Thanks in advance,
Roger
--

diff --git a/gcc/fold-const.cc b/gcc/fold-const.cc
index 99021a8..4f4ec81 100644
--- a/gcc/fold-const.cc
+++ b/gcc/fold-const.cc
@@ -12204,60 +12204,6 @@ fold_binary_loc (location_t loc, enum tree_code code, 
tree type,
}
}
 
-  /* Fold ((X >> C1) & C2) == 0 and ((X >> C1) & C2) != 0 where
-C1 is a valid shift constant, and C2 is a power of two, i.e.
-a single bit.  */
-  if (TREE_CODE (arg0) == BIT_AND_EXPR
- && integer_pow2p (TREE_OPERAND (arg0, 1))
- && integer_zerop (arg1))
-   {
- tree arg00 = TREE_OPERAND (arg0, 0);
- STRIP_NOPS (arg00);
- if (TREE_CODE (arg00) == RSHIFT_EXPR
- && TREE_CODE (TREE_OPERAND (arg00, 1)) == INTEGER_CST)
-   {
- tree itype = TREE_TYPE (arg00);
- tree arg001 = TREE_OPERAND (arg00, 1);
- prec = TYPE_PRECISION (itype);
-
- /* Check for a valid shift count.  */
- if (wi::ltu_p (wi::to_wide (arg001), prec))
-   {
- tree arg01 = TREE_OPERAND (arg0, 1);
- tree arg000 = TREE_OPERAND (arg00, 0);
- unsigned HOST_WIDE_INT log2 = tree_log2 (arg01);
- /* If (C2 << C1) doesn't overflow, then
-((X >> C1) & C2) != 0 can be rewritten as
-(X & (C2 << C1)) != 0.  */
- if ((log2 + TREE_INT_CST_LOW (arg001)) < prec)
-   {
- tem = fold_build2_loc (loc, LSHIFT_EXPR, itype,
-arg01, arg001);
- tem = fold_build2_loc (loc, BIT_AND_EXPR, itype,
-arg000, tem);
- return fold_build2_loc (loc, code, type, tem,
-   fold_convert_loc (loc, itype, arg1));
-   }
- /* Otherwise, for signed (arithmetic) shifts,
-((X >> C1) & C2) != 0 is rewritten as X < 0, and
-((X >> C1) & C2) == 0 is rewritten as X >= 0.  */
- else if (!TYPE_UNSIGNED (itype))
-   return fold_build2_loc (loc, code == EQ_EXPR ? GE_EXPR
-: LT_EXPR,
-   type, arg000,
-   build_int_cst (itype, 0));
- /* Otherwise, of unsigned (logical) shifts,
-((X >> C1) & C2) != 0 is rewritten as (X,false), and
-((X >> C1) & C2) == 0 is rewritten as (X,true).  */
- else
-   return omit_one_operand_loc (loc, type,
-code == EQ_EXPR ? integer_one_node
-  

[x86 PATCH take #2] Add peephole2 to reduce double word register shuffling

2022-08-07 Thread Roger Sayle

This is a resubmission of my patch from June to fix some forms of
inefficient
register allocation using an additional peephole2 in i386.md.
https://gcc.gnu.org/pipermail/gcc-patches/2022-June/596064.html

Since the original, a number of supporting patches/improvements have
been reviewed and approved, making this peephole even more effective.
Hence for the simple function:
__int128 foo(__int128 x, __int128 y) { return x+y; }

mainline GCC on x86_64 with -O2 currently generates:
movq%rsi, %rax
movq%rdi, %r8
movq%rax, %rdi
movq%rdx, %rax
movq%rcx, %rdx
addq%r8, %rax
adcq%rdi, %rdx
ret

with this patch we now generate (a three insn improvement):
movq%rdx, %rax
movq%rcx, %rdx
addq%rdi, %rax
adcq%rsi, %rdx
ret

Back in June the review of the original patch stalled, as peephole2
isn't the ideal place to fix this (with which I fully agree), and this patch
is really just a workaround for a deeper deficiency in reload/lra.
To address this I've now filed a new enhancement PR in Bugzilla,
PR rtl-optimization/106518, that describes the underlying issue,
which might make an interesting (GSoC) project for anyone brave
(foolhardy) enough to tweak GCC's register allocation.

By comparison, this single peephole can't adversely affect other targets,
and should the happy day come that it's no longer required, at worst
would just become a harmless legacy transform that no longer triggers.

I'm also investigating Uros' suggestion that it may be possible for RTL
expansion to do a better job expanding the function prologue, but
ultimately the hard register placement constraints are fixed by the
target ABI, and poor allocation/assignment of hard registers is the
responsibility/fault of the register allocation passes.  It may still be
possible to reduce register pressure by avoiding the use of SUBREGs
(which keep the source and destination double words live during
shuffling), along the lines of Richard's CONCAT suggestion.

This patch has been retested against mainline using make bootstrap and
make -k check, both with and without --target_board=unix{-m32},
with no new failures.  OK for mainline?


2022-08-07  Roger Sayle  

gcc/ChangeLog
PR target/43644
PR rtl-optimization/97756
PR rtl-optimization/98438
* config/i386/i386.md (define_peephole2): Recognize double word
swap sequences, and replace them with more efficient idioms,
including using xchg when optimizing for size.

gcc/testsuite/ChangeLog
PR target/43644
* gcc.target/i386/pr43644.c: New test case.


Thanks in advance,
Roger
--

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 298e4b3..a11fd5b 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -3039,6 +3039,36 @@
   [(parallel [(set (match_dup 0) (match_dup 1))
  (set (match_dup 1) (match_dup 0))])])
 
+;; Replace a double word swap that requires 4 mov insns with a
+;; 3 mov insn implementation (or an xchg when optimizing for size).
+(define_peephole2
+  [(set (match_operand:DWIH 0 "general_reg_operand")
+   (match_operand:DWIH 1 "general_reg_operand"))
+   (set (match_operand:DWIH 2 "general_reg_operand")
+   (match_operand:DWIH 3 "general_reg_operand"))
+   (clobber (match_operand: 4 "general_reg_operand"))
+   (set (match_dup 3) (match_dup 0))
+   (set (match_dup 1) (match_dup 2))]
+  "REGNO (operands[0]) != REGNO (operands[3])
+   && REGNO (operands[1]) != REGNO (operands[2])
+   && REGNO (operands[1]) != REGNO (operands[3])
+   && REGNO (operands[3]) == REGNO (operands[4])
+   && peep2_reg_dead_p (4, operands[0])
+   && peep2_reg_dead_p (5, operands[2])"
+  [(parallel [(set (match_dup 1) (match_dup 3))
+              (set (match_dup 3) (match_dup 1))])]
+{
+  if (!optimize_insn_for_size_p ())
+    {
+      rtx tmp = REGNO (operands[0]) > REGNO (operands[2]) ? operands[0]
+                                                          : operands[2];
+      emit_move_insn (tmp, operands[1]);
+      emit_move_insn (operands[1], operands[3]);
+      emit_move_insn (operands[3], tmp);
+      DONE;
+    }
+})
+
 (define_expand "movstrict<mode>"
   [(set (strict_low_part (match_operand:SWI12 0 "register_operand"))
(match_operand:SWI12 1 "general_operand"))]
diff --git a/gcc/testsuite/gcc.target/i386/pr43644.c 
b/gcc/testsuite/gcc.target/i386/pr43644.c
new file mode 100644
index 000..ffdf31c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr43644.c
@@ -0,0 +1,11 @@
+/* { dg-do compile { target int128 } } */
+/* { dg-options "-O2" } */
+
+__int128 foo(__int128 x, __int128 y)
+{
+  return x+y;
+}
+
+/* { dg-final { scan-assembler-times "movq" 2 } } */
+/* { dg-final { scan-assembler-not "push" } } */
+/* { dg-final { scan-assembler-not "pop" } } */