PR111754

2023-11-27 Thread juzhe.zh...@rivai.ai
Hi, there is a regression in RISC-V caused by this patch:

FAIL: gcc.dg/vect/pr111754.c -flto -ffat-lto-objects  scan-tree-dump optimized 
"return { 0.0, 9.0e\\+0, 0.0, 0.0 }"
FAIL: gcc.dg/vect/pr111754.c scan-tree-dump optimized "return { 0.0, 9.0e\\+0, 
0.0, 0.0 }"

I have checked the dump is :
F foo (F a, F b)
{
  <bb 2> [local count: 1073741824]:
  <retval> = { 0.0, 9.0e+0, 0.0, 0.0 };
  return <retval>;

}

The dump IR seems reasonable to me.
I wonder whether we should work around this in the RISC-V backend to generate the same 
IR as ARM SVE ?
Or should we adjust the test ?

Thanks.


juzhe.zh...@rivai.ai


[PATCH v1] LoongArch: Remove duplicate definition of CLZ_DEFINED_VALUE_AT_ZERO.

2023-11-27 Thread Li Wei
Commit r14-5547 defined both C[LT]Z_DEFINED_VALUE_AT_ZERO macros at
the same time, but CLZ_DEFINED_VALUE_AT_ZERO had already been defined
earlier in this file, so remove the duplicate definition.
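
For context, a minimal sketch (mine, not part of the patch) of what these
macros buy, assuming the usual GCC semantics where a result of 2 means the
value at zero is visible at both the RTL and tree level:

/* With CLZ_DEFINED_VALUE_AT_ZERO evaluating to 2 and VALUE set to
   GET_MODE_UNIT_BITSIZE, clz is fully defined at zero, so the guard
   below can be folded away and the function can compile down to a
   single clz.w.  */
int
clz_or_32 (unsigned int x)
{
  return x ? __builtin_clz (x) : 32;
}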

gcc/ChangeLog:

* config/loongarch/loongarch.h (CTZ_DEFINED_VALUE_AT_ZERO): Add
  description.
(CLZ_DEFINED_VALUE_AT_ZERO): Remove duplicate definition.
---
 gcc/config/loongarch/loongarch.h | 9 +++--
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/gcc/config/loongarch/loongarch.h b/gcc/config/loongarch/loongarch.h
index 115222e70fd..fa8a3f5582f 100644
--- a/gcc/config/loongarch/loongarch.h
+++ b/gcc/config/loongarch/loongarch.h
@@ -288,10 +288,12 @@ along with GCC; see the file COPYING3.  If not see
 /* Define if loading short immediate values into registers sign extends.  */
 #define SHORT_IMMEDIATES_SIGN_EXTEND 1
 
-/* The clz.{w/d} instructions have the natural values at 0.  */
+/* The clz.{w/d}, ctz.{w/d} instructions have the natural values at 0.  */
 
 #define CLZ_DEFINED_VALUE_AT_ZERO(MODE, VALUE) \
   ((VALUE) = GET_MODE_UNIT_BITSIZE (MODE), 2)
+#define CTZ_DEFINED_VALUE_AT_ZERO(MODE, VALUE) \
+  ((VALUE) = GET_MODE_UNIT_BITSIZE (MODE), 2)
 
 /* Standard register usage.  */
 
@@ -1239,8 +1241,3 @@ struct GTY (()) machine_function
 
 #define TARGET_EXPLICIT_RELOCS \
   (la_opt_explicit_relocs == EXPLICIT_RELOCS_ALWAYS)
-
-#define CLZ_DEFINED_VALUE_AT_ZERO(MODE, VALUE) \
-  ((VALUE) = GET_MODE_UNIT_BITSIZE (MODE), 2)
-#define CTZ_DEFINED_VALUE_AT_ZERO(MODE, VALUE) \
-  ((VALUE) = GET_MODE_UNIT_BITSIZE (MODE), 2)
-- 
2.31.1



[PATCH] Take register pressure into account for vec_construct when the components are not loaded from memory.

2023-11-27 Thread liuhongt
For vec_construct, the components must be live at the same time if
they're not loaded from memory; when the number of those components
exceeds the available registers, spills happen. Try to account for that
with a rough estimation.
??? Ideally, we should have an overall estimation of register pressure
if we knew the live ranges of all variables.

The patch avoids regressions due to, e.g., vec_construct with 32 chars.
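
As a hypothetical illustration (not the actual regressing testcase), code of
roughly this shape exposes the pressure problem, since every constructor
element is a computed value rather than a plain load:

typedef char v32qi __attribute__ ((vector_size (32)));

/* All 32 elements are XOR results, so all of them may be live at once
   when the constructor is expanded, well beyond the available GPRs.  */
v32qi
build (char *a, char *b)
{
  return (v32qi) {
    a[0] ^ b[0],   a[1] ^ b[1],   a[2] ^ b[2],   a[3] ^ b[3],
    a[4] ^ b[4],   a[5] ^ b[5],   a[6] ^ b[6],   a[7] ^ b[7],
    a[8] ^ b[8],   a[9] ^ b[9],   a[10] ^ b[10], a[11] ^ b[11],
    a[12] ^ b[12], a[13] ^ b[13], a[14] ^ b[14], a[15] ^ b[15],
    a[16] ^ b[16], a[17] ^ b[17], a[18] ^ b[18], a[19] ^ b[19],
    a[20] ^ b[20], a[21] ^ b[21], a[22] ^ b[22], a[23] ^ b[23],
    a[24] ^ b[24], a[25] ^ b[25], a[26] ^ b[26], a[27] ^ b[27],
    a[28] ^ b[28], a[29] ^ b[29], a[30] ^ b[30], a[31] ^ b[31]
  };
}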
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.

Ok for trunk?

gcc/ChangeLog:

* config/i386/i386.cc (ix86_vector_costs::add_stmt_cost): Take
register pressure into account for vec_construct when the
components are not loaded from memory.
---
 gcc/config/i386/i386.cc | 22 +-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 683ac643bc8..f8417555930 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -24706,6 +24706,7 @@ ix86_vector_costs::add_stmt_cost (int count, 
vect_cost_for_stmt kind,
   stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
   unsigned i;
   tree op;
+  unsigned reg_needed = 0;
   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
if (TREE_CODE (op) == SSA_NAME)
  TREE_VISITED (op) = 0;
@@ -24737,11 +24738,30 @@ ix86_vector_costs::add_stmt_cost (int count, 
vect_cost_for_stmt kind,
  && (gimple_assign_rhs_code (def) != BIT_FIELD_REF
  || !VECTOR_TYPE_P (TREE_TYPE
(TREE_OPERAND (gimple_assign_rhs1 (def), 0))
-   stmt_cost += ix86_cost->sse_to_integer;
+   {
+ stmt_cost += ix86_cost->sse_to_integer;
+ reg_needed++;
+   }
}
   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
if (TREE_CODE (op) == SSA_NAME)
  TREE_VISITED (op) = 0;
+
+  /* For vec_construct, the components must be live at the same time if
+they're not loaded from memory, when the number of those components
+exceeds available registers, spill happens. Try to account that with a
+rough estimation. Currently only handle integral modes since scalar fp
+shares sse_regs with vectors.
+??? Ideally, we should have an overall estimation of register pressure
+if we know the live range of all variables.  */
+  if (!fp && kind == vec_construct
+ && reg_needed > target_avail_regs)
+   {
+ unsigned spill_cost = ix86_builtin_vectorization_cost (scalar_store,
+vectype,
+misalign);
+ stmt_cost += spill_cost * (reg_needed - target_avail_regs);
+   }
 }
   if (stmt_cost == -1)
 stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
-- 
2.31.1



Re: [PATCH][RFC] middle-end/110237 - wrong MEM_ATTRs for partial loads/stores

2023-11-27 Thread Richard Biener
On Mon, 27 Nov 2023, Jeff Law wrote:

> 
> 
> On 11/27/23 05:39, Robin Dapp wrote:
> >> The easiest way to avoid running into the alias analysis problem is
> >> to scrap the MEM_EXPR when we expand the internal functions for
> >> partial loads/stores.  That avoids the disambiguation we run into
> >> which is realizing that we store to an object of less size as
> >> the size of the mode we appear to store.
> >>
> >> After the patch we see just
> >>
> >>[1  S64 A32]
> >>
> >> so we preserve the alias set, the alignment and the size (the size
> >> is redundant if the MEM isn't BLKmode).  That's still not good
> >> in case the RTL alias oracle would implement the same
> >> disambiguation but it fends off the gimple one.
> >>
> >> This fixes gcc.dg/torture/pr58955-2.c when built with AVX512
> >> and --param=vect-partial-vector-usage=1.
> > 
> > On riscv we're seeing a similar problem across the testsuite
> > and several execution failures as a result.  In the case I
> > looked at we move a scalar load upwards over a partial store
> > that aliases the load.
> > 
> > I independently arrived at the spot mentioned in
> > https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110237#c4
> > before knowing about the PR.
> > 
> > I can confirm that your RFC patch fixes at least two of the
> > failures,  I haven't checked the others but very likely
> > they are similar.
> FWIW, it should always be safe to ignore the memory attributes.   So if
> there's a reasonable condition here, then we can use it and just ignore the
> attribute.
> 
> Does the attribute on a partial load/store indicate the potential load/store
> size or does it indicate the actual known load/store size. If the former, then
> we probably need to treat it as a may-read/may-write kind of reference.

There's no way to distinguish a partial vs. non-partial MEM on RTL and
while without the bogus MEM_ATTR the alias oracle pieces that
miscompiled the original case are fended off we still see the load/store
as full given they have a mode with a size - that for example means
that DSE can elide a previous store to a masked part.  Eventually
that's fended off by using an UNSPEC, but whether the RTL IL has
the correct semantics is questionable.

That said, I did propose scrapping the MEM_EXPR which I think is
the correct thing to do unless we want to put a CALL_EXPR into it
(nothing would use that at the moment) or re-do MEM_EXPR and instead
have an ao_ref (or sth slightly more complete) instead of the current
MEM_ATTRs - but that would be a lot of work.

This leaves the question wrt. semantics of for example x86 mask_store:

(insn 23 22 24 5 (set (mem:V4DF (plus:DI (reg/v/f:DI 106 [ x ])
(reg:DI 101 [ ivtmp.15 ])) [2 MEM <vector(4) double> 
[(double *)x_11(D) + ivtmp.15_33 * 1]+0 S32 A64])
(unspec:V4DF [
(reg:V4DI 104 [ mask__16.8 ])
(reg:V4DF 105 [ vect_cst__42 ])
(mem:V4DF (plus:DI (reg/v/f:DI 106 [ x ])
(reg:DI 101 [ ivtmp.15 ])) [2 MEM <vector(4) double> [(double *)x_11(D) + ivtmp.15_33 * 1]+0 S32 A64])
] UNSPEC_MASKMOV)) "t.c":5:12 8523 {avx_maskstorepd256}
 (nil))

it uses a read-modify-write which makes it safe for DSE.  mask_load
looks like

(insn 28 27 29 6 (set (reg:V4DF 115 [ vect__7.11 ])
(unspec:V4DF [
(reg:V4DI 114 [ mask__8.8 ])
(mem:V4DF (plus:DI (reg/v/f:DI 118 [ val ])
(reg:DI 103 [ ivtmp.29 ])) [2 MEM <vector(4) double> [(double *)val_13(D) + ivtmp.29_22 * 1]+0 S32 A64])
] UNSPEC_MASKMOV)) "t.c":5:17 8515 {avx_maskloadpd256}
 (nil))

both have (as operand of the UNSPEC) a MEM with V4DFmode (and a
similarly bogus MEM_EXPR) indicating the loads
are _not_ partial.  That means the disambiguation against a store
to an object that's smaller than V4DF is still possible.
Setting MEM_SIZE to UNKNOWN doesn't help - that just asks to look
at the mode.  As discussed using a BLKmode MEM _might_ be a way
out but I didn't try what will happen then (patterns would need to
be adjusted I guess).

That said, I'm happy to commit the partial fix, scrapping the
bogus MEM_EXPRs.

OK for that?

Thanks,
Richard.


[PATCH] Expand: Pass down equality only flag to cmpmem expand

2023-11-27 Thread HAO CHEN GUI
Hi,
  This patch passes down the equality-only flag from
emit_block_cmp_hints to the cmpmem optab so that the target-specific
expander can generate optimized insns for equality-only compares.
Targets (e.g. rs6000) can generate a more efficient insn sequence if the
block compare is equality-only.
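
As a hedged sketch (the helper names here are invented for illustration,
not from the patch), a target's cmpmem expander could consume the new
sixth operand like this:

/* operands[5] is the new equality-only flag; everything else follows
   the existing cmpmemsi operand layout.  */
bool
expand_my_cmpmem (rtx *operands)
{
  bool equality_only = operands[5] != const0_rtx;
  if (equality_only)
    /* Equal/not-equal suffices: XOR corresponding chunks and OR the
       differences together; no need to locate the first differing
       byte to produce an ordered result.  */
    return expand_equality_block_compare (operands);
  return expand_ordered_block_compare (operands);
}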

  Bootstrapped and tested on x86 and powerpc64-linux BE and LE with
no regressions. Is this OK for trunk?

Thanks
Gui Haochen

ChangeLog
Expand: Pass down equality only flag to cmpmem expand

Targets (e.g. rs6000) can generate more efficient insn sequence if the
block compare is equality only.  This patch passes down the equality
only flag from emit_block_cmp_hints to cmpmem optab so that the target
specific expand can generate optimized insns for equality only compare.

gcc/
* expr.cc (expand_cmpstrn_or_cmpmem): Rename to...
(expand_cmpstrn): ...this.
(expand_cmpmem): New function.  Pass down equality only flag to
cmpmem expand.
(emit_block_cmp_via_cmpmem): Add an argument for equality only
flag and call expand_cmpmem instead of expand_cmpstrn_or_cmpmem.
(emit_block_cmp_hints): Call emit_block_cmp_via_cmpmem with
equality only flag.
* expr.h (expand_cmpstrn, expand_cmpmem): Declare.
* builtins.cc (expand_builtin_strcmp, expand_builtin_strncmp):
Call expand_cmpstrn instead of expand_cmpstrn_or_cmpmem.
* config/i386/i386.md (cmpmemsi): Add the sixth operand for
equality only flag.
* config/rs6000/rs6000.md (cmpmemsi): Likewise.
* config/s390/s390.md (cmpmemsi): Likewise.
* doc/md.texi (cmpmem): Modify the document and add an operand
for equality only flag.

patch.diff
diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index 5ece0d23eb9..c2dbc25433d 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -4819,7 +4819,7 @@ expand_builtin_strcmp (tree exp, ATTRIBUTE_UNUSED rtx 
target)
   if (len && !TREE_SIDE_EFFECTS (len))
{
  arg3_rtx = expand_normal (len);
- result = expand_cmpstrn_or_cmpmem
+ result = expand_cmpstrn
(cmpstrn_icode, target, arg1_rtx, arg2_rtx, TREE_TYPE (len),
 arg3_rtx, MIN (arg1_align, arg2_align));
}
@@ -4929,9 +4929,9 @@ expand_builtin_strncmp (tree exp, ATTRIBUTE_UNUSED rtx 
target,
   rtx arg1_rtx = get_memory_rtx (arg1, len);
   rtx arg2_rtx = get_memory_rtx (arg2, len);
   rtx arg3_rtx = expand_normal (len);
-  result = expand_cmpstrn_or_cmpmem (cmpstrn_icode, target, arg1_rtx,
-arg2_rtx, TREE_TYPE (len), arg3_rtx,
-MIN (arg1_align, arg2_align));
+  result = expand_cmpstrn (cmpstrn_icode, target, arg1_rtx, arg2_rtx,
+  TREE_TYPE (len), arg3_rtx,
+  MIN (arg1_align, arg2_align));

   tree fndecl = get_callee_fndecl (exp);
   if (result)
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 1b5a794b9e5..775cba5d93d 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -23195,7 +23195,8 @@ (define_expand "cmpmemsi"
 (compare:SI (match_operand:BLK 1 "memory_operand" "")
 (match_operand:BLK 2 "memory_operand" "") ) )
(use (match_operand 3 "general_operand"))
-   (use (match_operand 4 "immediate_operand"))]
+   (use (match_operand 4 "immediate_operand"))
+   (use (match_operand 5 ""))]
   ""
 {
   if (ix86_expand_cmpstrn_or_cmpmem (operands[0], operands[1],
diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index 2a1b5ecfaee..e66330f876e 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -10097,7 +10097,8 @@ (define_expand "cmpmemsi"
(compare:SI (match_operand:BLK 1)
(match_operand:BLK 2)))
  (use (match_operand:SI 3))
- (use (match_operand:SI 4))])]
+ (use (match_operand:SI 4))
+ (use (match_operand:SI 5))])]
   "TARGET_POPCNTD"
 {
   if (expand_block_compare (operands))
diff --git a/gcc/config/s390/s390.md b/gcc/config/s390/s390.md
index 4bdb679daf2..506e79fb035 100644
--- a/gcc/config/s390/s390.md
+++ b/gcc/config/s390/s390.md
@@ -3790,7 +3790,8 @@ (define_expand "cmpmemsi"
 (compare:SI (match_operand:BLK 1 "memory_operand" "")
 (match_operand:BLK 2 "memory_operand" "") ) )
(use (match_operand:SI 3 "general_operand" ""))
-   (use (match_operand:SI 4 "" ""))]
+   (use (match_operand:SI 4 "" ""))
+   (use (match_operand:SI 5 "" ""))]
   ""
 {
   if (s390_expand_cmpmem (operands[0], operands[1],
diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index e01cdcbe22c..06955cd7e78 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -6992,14 +6992,19 @@ result of the comparison.

 @cindex @code{cmpmem@var{m}} instruction pattern
 @item @samp{cmpmem@var{m}}
-Block compare instruction, with five operands like the operands
-of @samp{cmpstr@var{m}}.  The two memory blocks

[PATCH v1 2/2] LoongArch: Optimize vector constant extract-{even/odd} permutation.

2023-11-27 Thread Li Wei
For vector constant extract-{even/odd} permutations, replace the default
[x]vshuf instruction combination with [x]vilv{l/h} instructions, which
reduces the instruction count and improves performance.
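
For reference, an extract-even permutation can be written in plain GNU C
like this (an illustrative example, not from the patch's testsuite); the
constant selector picks the even elements of the concatenation of the two
inputs:

typedef long long v4di __attribute__ ((vector_size (32)));

/* With a = {a0,a1,a2,a3} and b = {b0,b1,b2,b3}, selector {0,2,4,6}
   yields {a0,a2,b0,b2}: an extract-even permutation.  */
v4di
extract_even (v4di a, v4di b)
{
  return __builtin_shuffle (a, b, (v4di) { 0, 2, 4, 6 });
}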

gcc/ChangeLog:

* config/loongarch/loongarch.cc (loongarch_is_odd_extraction):
  Add forward declaration.
(loongarch_is_even_extraction): Adjust.
(loongarch_try_expand_lsx_vshuf_const): Adjust.
(loongarch_is_extraction_permutation): Adjust.
(loongarch_expand_vec_perm_const_2): Adjust.

gcc/testsuite/ChangeLog:

* gcc.target/loongarch/lasx-extract-even_odd-opt.c: New test.
---
 gcc/config/loongarch/loongarch.cc | 33 +++-
 .../loongarch/lasx-extract-even_odd-opt.c | 54 +++
 2 files changed, 85 insertions(+), 2 deletions(-)
 create mode 100644 
gcc/testsuite/gcc.target/loongarch/lasx-extract-even_odd-opt.c

diff --git a/gcc/config/loongarch/loongarch.cc 
b/gcc/config/loongarch/loongarch.cc
index d3896d72bc2..f89c346815d 100644
--- a/gcc/config/loongarch/loongarch.cc
+++ b/gcc/config/loongarch/loongarch.cc
@@ -8672,6 +8672,12 @@ loongarch_expand_vec_perm (rtx target, rtx op0, rtx op1, 
rtx sel)
 }
 }
 
+static bool
+loongarch_is_odd_extraction (struct expand_vec_perm_d *);
+
+static bool
+loongarch_is_even_extraction (struct expand_vec_perm_d *);
+
 static bool
 loongarch_try_expand_lsx_vshuf_const (struct expand_vec_perm_d *d)
 {
@@ -8694,6 +8700,24 @@ loongarch_try_expand_lsx_vshuf_const (struct 
expand_vec_perm_d *d)
   if (d->testing_p)
return true;
 
+  /* If the permutation matches the extract-even or extract-odd
+ pattern, vselect is much better than vshuf.  */
+  if (loongarch_is_odd_extraction (d)
+ || loongarch_is_even_extraction (d))
+   {
+ if (loongarch_expand_vselect_vconcat (d->target, d->op0, d->op1,
+   d->perm, d->nelt))
+   return true;
+
+ unsigned char perm2[MAX_VECT_LEN];
+ for (i = 0; i < d->nelt; ++i)
+   perm2[i] = (d->perm[i] + d->nelt) & (2 * d->nelt - 1);
+
+ if (loongarch_expand_vselect_vconcat (d->target, d->op1, d->op0,
+   perm2, d->nelt))
+   return true;
+   }
+
   for (i = 0; i < d->nelt; i += 1)
{
  rperm[i] = GEN_INT (d->perm[i]);
@@ -8878,7 +8902,7 @@ loongarch_is_even_extraction (struct expand_vec_perm_d *d)
  result = false;
  break;
}
-  buf += 1;
+  buf += 2;
 }
 
   return result;
@@ -8900,7 +8924,7 @@ loongarch_is_extraction_permutation (struct 
expand_vec_perm_d *d)
  result = false;
  break;
}
-  buf += 2;
+  buf += 1;
 }
 
   return result;
@@ -9377,6 +9401,11 @@ loongarch_expand_vec_perm_const_2 (struct 
expand_vec_perm_d *d)
 Selector after: { 1, 3, 1, 3 }.
 Even extraction selector sample: E_V4DImode, { 0, 2, 4, 6 }
 Selector after: { 0, 2, 0, 2 }.  */
+
+  /* Better implementation of extract-even and extract-odd permutations.  */
+  if (loongarch_expand_vec_perm_even_odd (d))
+   return true;
+
   for (i = 0; i < d->nelt / 2; i += 1)
{
  idx = d->perm[i];
diff --git a/gcc/testsuite/gcc.target/loongarch/lasx-extract-even_odd-opt.c 
b/gcc/testsuite/gcc.target/loongarch/lasx-extract-even_odd-opt.c
new file mode 100644
index 000..515f0c8621a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/lasx-extract-even_odd-opt.c
@@ -0,0 +1,54 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mlasx" } */
+/* { dg-final { scan-assembler "xvilvl.d" } } */
+/* { dg-final { scan-assembler "xvilvh.d" } } */
+
+#define CMUL(a, b, c) \
+  {   \
+(c).ai = (a).ai * (b).ai - (a).bi * (b).bi;   \
+(c).bi = (a).ai * (b).bi + (a).bi * (b).ai;   \
+(c).ci = (a).ci * (b).ci - (a).di * (b).di;   \
+(c).di = (a).ci * (b).di + (a).di * (b).ci;   \
+  }
+#define CSUM(a, b)\
+  {   \
+(a).ai += (b).ai; \
+(a).bi += (b).bi; \
+(a).ci += (b).ci; \
+(a).di += (b).di; \
+  }
+
+typedef struct
+{
+  double ai;
+  double bi;
+  double ci;
+  double di;
+} complex;
+
+typedef struct
+{
+  complex e[6][6];
+} matrix;
+
+typedef struct
+{
+  complex c[6];
+} vector;
+
+void
+mult_adj_mat_vec (matrix *a, vector *b, vector *c)
+{
+  register int i, j;
+  register 

[PATCH v1 1/2] LoongArch: Accelerate optimization of scalar signed/unsigned popcount.

2023-11-27 Thread Li Wei
In LoongArch, vector popcount has corresponding instructions, while
scalar popcount does not. Currently, scalar popcount is calculated
through a loop, and a value that is not a power of two needs several
iterations, so the vector popcount instruction is considered for
the optimization.
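
In C terms, the expansion strategy is roughly the following (a hedged
sketch using the LSX intrinsic names from lsxintrin.h; the real work
happens at RTL expansion time, not via intrinsics):

#include <lsxintrin.h>

/* Insert the scalar into a vector register, count bits with vpcnt.d,
   and extract the low element back into a GPR.  */
long
popcount_via_lsx (long x)
{
  __m128i v = __lsx_vinsgr2vr_d (__lsx_vldi (0), x, 0);
  v = __lsx_vpcnt_d (v);
  return __lsx_vpickve2gr_d (v, 0);
}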

gcc/ChangeLog:

* config/loongarch/loongarch.md (cntmap): New mode attribute, used to
  simplify the following templates.
(popcount<mode>2): New.

gcc/testsuite/ChangeLog:

* gcc.target/loongarch/popcnt.c: New test.
* gcc.target/loongarch/popcount.c: New test.
---
 gcc/config/loongarch/loongarch.md | 27 +++-
 gcc/testsuite/gcc.target/loongarch/popcnt.c   | 41 +++
 gcc/testsuite/gcc.target/loongarch/popcount.c | 17 
 3 files changed, 83 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/loongarch/popcnt.c
 create mode 100644 gcc/testsuite/gcc.target/loongarch/popcount.c

diff --git a/gcc/config/loongarch/loongarch.md 
b/gcc/config/loongarch/loongarch.md
index cd4ed495697..c440d9c348f 100644
--- a/gcc/config/loongarch/loongarch.md
+++ b/gcc/config/loongarch/loongarch.md
@@ -1515,7 +1515,30 @@ (define_insn "truncdfsf2"
(set_attr "cnv_mode""D2S")
(set_attr "mode" "SF")])
 
-
+;; In vector registers, popcount can be implemented directly through
+;; the vector instruction [X]VPCNT.  For GP registers, we can implement
+;; it through the following method.  Compared with loop implementation
+;; of popcount, the following method has better performance.
+
+;; This attribute is used to get the connection between a scalar mode
+;; and the corresponding vector mode.
+(define_mode_attr cntmap [(SI "v4si") (DI "v2di")])
+
+(define_expand "popcount<mode>2"
+  [(set (match_operand:GPR 0 "register_operand")
+   (popcount:GPR (match_operand:GPR 1 "register_operand")))]
+  "ISA_HAS_LSX"
+{
+  rtx in = operands[1];
+  rtx out = operands[0];
+  rtx vreg = <MODE>mode == SImode ? gen_reg_rtx (V4SImode) :
+   gen_reg_rtx (V2DImode);
+  emit_insn (gen_lsx_vinsgr2vr_<size> (vreg, in, vreg, GEN_INT (1)));
+  emit_insn (gen_popcount<cntmap>2 (vreg, vreg));
+  emit_insn (gen_lsx_vpickve2gr_<size> (out, vreg, GEN_INT (0)));
+  DONE;
+})
+
 ;;
 ;;  
 ;;
@@ -3882,7 +3905,7 @@ (define_peephole2
   (any_extend:SI (match_dup 3)))])]
   "")
 
-
+
 
 (define_mode_iterator QHSD [QI HI SI DI])
 
diff --git a/gcc/testsuite/gcc.target/loongarch/popcnt.c 
b/gcc/testsuite/gcc.target/loongarch/popcnt.c
new file mode 100644
index 000..a10fca42092
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/popcnt.c
@@ -0,0 +1,41 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mlsx" } */
+/* { dg-final { scan-assembler-not {popcount} } } */
+/* { dg-final { scan-assembler-times "vpcnt.d" 2 { target { loongarch64*-*-* } 
} } } */
+/* { dg-final { scan-assembler-times "vpcnt.w" 4 { target { loongarch64*-*-* } 
} } } */
+
+int
+foo (int x)
+{
+  return __builtin_popcount (x);
+}
+
+long
+foo1 (long x)
+{
+  return __builtin_popcountl (x);
+}
+
+long long
+foo2 (long long x)
+{
+  return __builtin_popcountll (x);
+}
+
+int
+foo3 (int *p)
+{
+  return __builtin_popcount (*p);
+}
+
+unsigned
+foo4 (int x)
+{
+  return __builtin_popcount (x);
+}
+
+unsigned long
+foo5 (int x)
+{
+  return __builtin_popcount (x);
+}
diff --git a/gcc/testsuite/gcc.target/loongarch/popcount.c 
b/gcc/testsuite/gcc.target/loongarch/popcount.c
new file mode 100644
index 000..390ff067617
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/popcount.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mlsx -fdump-tree-optimized" } */
+/* { dg-final { scan-tree-dump-times "__builtin_popcount|\\.POPCOUNT" 1 "optimized" } } */
+
+int
+PopCount (long b)
+{
+  int c = 0;
+
+  while (b)
+{
+  b &= b - 1;
+  c++;
+}
+
+  return c;
+}
-- 
2.31.1



Re: [PATCH v2] rs6000: Add new pass for replacement of contiguous addresses vector load lxv with lxvp

2023-11-27 Thread Michael Meissner
I tried using this patch to compare with the vector size attribute patch I
posted.  I could not build it as a cross compiler on my x86_64 machine because
the assembler gives the following error for std_stacktrace-elf.o:

Error: operand out of domain (11 is not a multiple of 2)

If you look at the assembler output, the pass has combined an lxv of register
11 and an lxv of register 12 into:

lxvp 11,0(9)

The PowerPC architecture requires that registers loaded with load vector pair
and stored with store vector pair instructions be even/odd register pairs, and
not odd/even pairs.  Unfortunately, this means that the optimization will
match less often.

-- 
Michael Meissner, IBM
PO Box 98, Ayer, Massachusetts, USA, 01432
email: meiss...@linux.ibm.com


Re: [PATCH 3/4] c23: aliasing of compatible tagged types

2023-11-27 Thread Martin Uecker
On Tuesday, 2023-11-28 at 01:00 +0000, Joseph Myers wrote:
> On Sun, 26 Nov 2023, Martin Uecker wrote:
> 
> > My understand is that it is used for aliasing analysis and also
> > checking of conversions.  TYPE_CANONICAL must be consistent with
> > the idea the middle-end has about type conversions.  But as long
> > as we do not give the same TYPE_CANONICAL to types the middle-end
> > thinks must be incompatible using its own type checking machinery,
> > it should be safe even for types the C standard thinks must be
> > incompatible for some reason.
> 
> And presumably also for types that definitely can't be assigned because 
> they have incompatible layout through the use of different array sizes - 
> since the front end won't generate such assignments, it would never matter 
> whether the middle end considers them valid without conversion or not?

Yes, for checking assignments we use the stricter language
semantics, so we should never generate assignments for
structs with different field offsets or sizes.  (I will
check this again.)

> > > I also think more rationale is needed for ignoring sizes like this.  Is 
> > > it 
> > > intended for e.g. making structs with flexible array members 
> > > alias-compatible with similar structs with a fixed-size array?
> > 
> > The main reason are pointers to arrays:
> > 
> > struct foo { int (*x)[]; }
> > struct foo { int (*x)[2]; };
> > struct foo { int (*x)[1]; };
> 
> Thanks for the explanation.
> 
> I guess the cases involving flexible array members actually show up a bug 
> in the standard rather than any kind of issue with this patch - the 
> standard allows one structure ending with a flexible array member, and 
> another ending with a fixed-size array, to be compatible (and in different 
> translation units allowed that even before C23), but there is also clear 
> text in the standard showing it's not intended to require the layout to be 
> consistent (the fixed-size and flexible arrays might have different 
> offsets), and what you'd actually get with an assignment or conditional 
> expression mixing such structures certainly isn't clearly specified.  

I agree; unfortunately it seems not to be well specified, and we somehow 
missed that this has now become more important.

> Maybe the right resolution for that issue with the standard would be to 
> make that particular case incompatible, but it would need to be raised as 
> an issue after C23 is out.

I think FAMs may need further consideration from the standard's
point of view for other reasons as well.

Another issue I have now run into is structs of variable size,
which I think do not work properly with LTO already today:

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112716



Martin

> 






Re: Re: [PATCH v2] gimple-match.pd Add more optimization for gimple_cond

2023-11-27 Thread Andrew Pinski
On Mon, Nov 27, 2023 at 10:04 PM Feng Wang  wrote:
>
> On 2023-11-28 11:06  Andrew Pinski  wrote:
> >On Mon, Nov 27, 2023 at 6:56 PM Feng Wang  
> >wrote:
> >>
> >> The link of PATCH v1: 
> >> https://www.mail-archive.com/gcc-patches@gcc.gnu.org/msg326661.html
> >> This patch add another condition for gimple-cond optimization. Refer to
> >> the following test case.
> >> int foo1 (int data, int res)
> >> {
> >>   res = data & 0xf;
> >>   res |= res << 4;
> >>   if (res < 0x22)
> >> return 0x22;
> >>   return res;
> >> }
> >> with the compilation flag "-O2",
> >> before this patch the log info of phiopt2 pass is
> >>    <bb 2> [local count: 1073741824]:
> >>   res_5 = data_1(D) & 15;
> >>   _6 = (unsigned int) res_5;
> >>   _7 = _6 * 17;
> >>   res_8 = (int) _7;
> >>   if (_7 <= 33)
> >> goto <bb 3>; [21.72%]
> >>   else
> >> goto <bb 4>; [78.28%]
> >>
> >>    <bb 3> [local count: 233216728]:
> >>
> >>    <bb 4> [local count: 1073741824]:
> >>   # _9 = PHI <res_8(2), 34(3)>
> >>   return _9;
> >> after this patch the log info of phiopt2 pass is
> >>    <bb 2> [local count: 1073741824]:
> >>   res_5 = data_1(D) & 15;
> >>   _6 = (unsigned int) res_5;
> >>   _7 = _6 * 17;
> >>   res_8 = (int) _7;
> >>   _10 = MAX_EXPR <_7, 34>;
> >>   _3 = (int) _10;
> >>   return _3;
> >> This patch optimizes the phi node to generate "MAX_EXPR".
> >> The root cause of minmax replacement failure is the type of "_7"
> >> is unsigned, but the type of const_int "34" is signed. It makes
> >> types_match (c2_type, from_type) return false. So I add another
> >> condition to process this scenario.
> >>
> >> gcc/ChangeLog:
> >>
> >> * match.pd: Add another condition to process type mismatch.
> >
> >This should most likely be:
> > ((cond (cmp (convert1? x) c1) (convert2? x) c2) pattern): Also allow
> >conversions that only change the sign.
> >
> >>
> >> gcc/testsuite/ChangeLog:
> >>
> >> * gcc.dg/tree-ssa/phi-opt-41.c: New test.
> >> ---
> >>  gcc/match.pd   |  5 -
> >>  gcc/testsuite/gcc.dg/tree-ssa/phi-opt-41.c | 24 ++
> >>  2 files changed, 28 insertions(+), 1 deletion(-)
> >>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/phi-opt-41.c
> >>
> >> diff --git a/gcc/match.pd b/gcc/match.pd
> >> index 95225e4ca5f..e864845bfa9 100644
> >> --- a/gcc/match.pd
> >> +++ b/gcc/match.pd
> >> @@ -5419,7 +5419,10 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
> >>  && (types_match (c2_type, from_type)
> >>  || (TYPE_PRECISION (c2_type) > TYPE_PRECISION (from_type)
> >>  && (TYPE_UNSIGNED (from_type)
> >> -|| TYPE_SIGN (c2_type) == TYPE_SIGN (from_type)
> >> +|| TYPE_SIGN (c2_type) == TYPE_SIGN (from_type)))
> >> + || (TYPE_UNSIGNED (from_type) != TYPE_UNSIGNED (c2_type)
> >> + && TYPE_PRECISION (c2_type) == TYPE_PRECISION (from_type)
> >> + && !TYPE_OVERFLOW_WRAPS (c2_type
> >
> >What is the need for TYPE_OVERFLOW_WRAPS here? Also I think you just
> >need the check for TYPE_PRECISION instead of the rest.
> >Maybe instead of types_match here, tree_nop_conversion_p could be used
> >instead. I am not 100% sure though.
> >
> >I also suspect you should add a few other testcases that don't depend
> >on VRP changing things. Maybe a runtime test too.
> >
> >Thanks,
> >Andrew
> >
>
>
> I want to make sure the const_int "c2" won't overflow, so I use 
> TYPE_OVERFLOW_WRAPS.
That is exactly what the `int_fits_type_p (@2, from_type)` check is
there for in the first place.

Thanks,
Andrew

> I checked the code: tree_nop_conversion_p checks TYPE_PRECISION or
> TYPE_MODE and doesn't care about the unsigned flag, so it should be fine
> for this scenario. I'm not sure if there's a problem with this
> modification; I will run the regression tests to check whether it causes
> other issues.


> Thanks,
> Feng Wang
>
>
> >> {
> >>  if (cmp != EQ_EXPR)
> >>code = minmax_from_comparison (cmp, @1, @3, @1, @2);
> >> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/phi-opt-41.c 
> >> b/gcc/testsuite/gcc.dg/tree-ssa/phi-opt-41.c
> >> new file mode 100644
> >> index 000..d1101c2f9f7
> >> --- /dev/null
> >> +++ b/gcc/testsuite/gcc.dg/tree-ssa/phi-opt-41.c
> >> @@ -0,0 +1,24 @@
> >> +/* { dg-do compile } */
> >> +/* { dg-options "-O2 -fdump-tree-phiopt2" } */
> >> +
> >> +int foo1 (int data, int res)
> >> +{
> >> +  res = data & 0xf;
> >> +  res |= res << 4;
> >> +  if (res < 0x22)
> >> +return 0x22;
> >> +  return res;
> >> +}
> >> +
> >> +int foo2 (int data, int res)
> >> +{
> >> +  res = data & 0xf;
> >> +  unsigned int r = res;
> >> +  r*=17;
> >> +  res = r;
> >> +  if (r < 0x22)
> >> +return 0x22;
> >> +  return res;
> >> +}
> >> +
> >> +/* { dg-final { scan-tree-dump-times "MAX_EXPR" 2 "phiopt2" } } */
> >> \ No newline at end of file
> >> --
> >> 2.17.1
> >>


[V2] New pass for sign/zero extension elimination -- not ready for "final" review

2023-11-27 Thread Jeff Law


I've still got some comments from Richard S to work through, but some 
folks are trying to play with this and thus I want to get the fixes to 
date in their hands.


Changes since V1:

- Fix handling of CALL_INSN_FUNCTION_USAGE so we don't apply PATTERN to 
an EXPR_LIST.


- Various comments and comment fixes based on feedback from Richard S.

- Remove saturating ops from safe_for_live_propagation

- Adjust checks for too large of modes.  Still not completely fixed.

- Fix computation of size from mode of SUBREG when handling sets.

- Use subreg_lsb rather than an inline variant.

- Remove a redundant CONSTANT_P check.

- Move calculation of inverted_rev_post_order_compute out of loop

- Verify we have a SUBREG before looking at SUBREG_BYTE.


Given I've still got some feedback from Richard that needs work, 
there'll definitely be a V3.  So I wouldn't lose any sleep if this 
didn't get a deep dive from a review standpoint.



--

This is work originally started by Joern @ Embecosm.

There's been a long standing sense that we're generating too many 
sign/zero extensions on the RISC-V port.  REE is useful, but it's really 
focused on a relatively narrow part of the extension problem.


What Joern's patch does is introduce a new pass which tracks liveness of 
chunks of pseudo regs.  Specifically it tracks bits 0..7, 8..15, 16..31 
and 32..63.


If it encounters a sign/zero extend that sets bits that are never read, 
then it replaces the sign/zero extension with a narrowing subreg.  The 
narrowing subreg usually gets eliminated by subsequent passes (it's just 
a copy after all).
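
To make that concrete, here is a small illustrative example (mine, not
from the patch's testsuite) of the kind of useless extension involved:

/* Illustrative only: the extension of t to 64 bits is useless because
   the caller reads just bits 0..15 of the return value.  */
unsigned short
xor16 (unsigned short a, unsigned short b)
{
  unsigned long t = a ^ b;   /* extended to DImode here */
  return t;                  /* but only bits 0..15 are ever live */
}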


Jivan has done some analysis and found that it eliminates roughly 1% of 
the dynamic instruction stream for x264 as well as some redundant 
extensions in the coremark benchmark (both on rv64).  In my own testing 
as I worked through issues on other architectures I clearly saw it 
helping in various places within GCC itself or in the testsuite.


The basic structure is to first do a fairly standard liveness analysis 
on the chunks, seeding original state with the liveness data from DF. 
Once that's stable, we do a final pass to identify the useless 
extensions and transform them into narrowing subregs.


A few key points to remember.

For destination processing it is always safe to ignore a destination. 
Ignoring a destination merely means that whatever was live after the 
given insn will continue to be live before the insn.  What is not safe 
is to clear a bit in the LIVENOW bitmap for a destination chunk that is 
not set.  This comes into play with things like STRICT_LOW_PART.


For source processing the safe thing to do is to set all the chunks in a 
register as live.  It is never safe to fail to process a source operand.


When a destination object is not fully live, we try to transfer that 
limited liveness to the source operands.  So for example if bits 16..63 
are dead in a destination of a PLUS, we need not mark bits 16..63 as 
live for the source operands.  We have to be careful -- consider a shift 
count on a target without SHIFT_COUNT_TRUNCATED set.  So we have both a 
list of RTL codes where we can transfer liveness and a few codes where 
one of the operands may need to be fully live (ex, a shift count) while 
the other input may not need to be fully live (value left shifted).
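
A sketch of that asymmetry (again purely illustrative):

/* Only bits 0..15 of the shifted value are read, so the pass need not
   mark bits 16..63 of a as live; the count c, however, must be
   treated as fully live on targets without SHIFT_COUNT_TRUNCATED.  */
unsigned short
shift16 (unsigned long a, unsigned long c)   /* assumes c < 64 */
{
  return (unsigned short) (a << c);
}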


Locally we have had this enabled at -O1 and above to encourage testing, 
but I'm thinking that for the trunk enabling at -O2 and above is the 
right thing to do.


This has (of course) been tested on rv64.  It's also been bootstrapped 
and regression tested on x86.  Bootstrap and regression tested (C only) 
for m68k, sh4, sh4eb, alpha.  Earlier versions were also bootstrapped 
and regression tested on ppc, hppa and s390x (C only for those as well). 
 It's also been tested on the various crosses in my tester.  So we've 
got reasonable coverage of 16, 32 and 64 bit targets, big and little 
endian, with and without SHIFT_COUNT_TRUNCATED and all kinds of other 
oddities.


The included tests are for RISC-V only because not all targets are going 
to have extraneous extensions.   There's tests from coremark, x264 and 
GCC's bz database.  It probably wouldn't be hard to add aarch64 
testscases.  The BZs listed are improved by this patch for aarch64.


Given the amount of work Jivan and I have done, I'm not comfortable 
self-approving at this time.  I'd much rather have another set of eyes 
on the code.  Hopefully the code is documented well enough for that to 
be useful exercise.


So, no need to work from Pago Pago for this patch.  I may make another 
attempt at the eswin conditional move work while working virtually in 
Pago Pago though.


Thoughts, comments, recommendations?

PR target/95650
PR rtl-optimization/96031
PR rtl-optimization/104387
PR rtl-optimization/111384

gcc/
* Makefile.in (OBJS): Add ext-dce.o.
* common.opt (ext-dce): Add new option.
* df-scan.cc (df_get_exit_block_use_set): No l

Re: Re: [PATCH v2] gimple-match.pd Add more optimization for gimple_cond

2023-11-27 Thread Feng Wang
On 2023-11-28 11:06  Andrew Pinski  wrote:
>On Mon, Nov 27, 2023 at 6:56 PM Feng Wang  wrote:
>>
>> The link of PATCH v1: 
>> https://www.mail-archive.com/gcc-patches@gcc.gnu.org/msg326661.html
>> This patch add another condition for gimple-cond optimization. Refer to
>> the following test case.
>> int foo1 (int data, int res)
>> {
>>   res = data & 0xf;
>>   res |= res << 4;
>>   if (res < 0x22)
>> return 0x22;
>>   return res;
>> }
>> with the compilation flag "-O2",
>> before this patch the log info of phiopt2 pass is
>>    <bb 2> [local count: 1073741824]:
>>   res_5 = data_1(D) & 15;
>>   _6 = (unsigned int) res_5;
>>   _7 = _6 * 17;
>>   res_8 = (int) _7;
>>   if (_7 <= 33)
>> goto <bb 3>; [21.72%]
>>   else
>> goto <bb 4>; [78.28%]
>>
>>    <bb 3> [local count: 233216728]:
>>
>>    <bb 4> [local count: 1073741824]:
>>   # _9 = PHI <res_8(2), 34(3)>
>>   return _9;
>> after this patch the log info of phiopt2 pass is
>>    <bb 2> [local count: 1073741824]:
>>   res_5 = data_1(D) & 15;
>>   _6 = (unsigned int) res_5;
>>   _7 = _6 * 17;
>>   res_8 = (int) _7;
>>   _10 = MAX_EXPR <_7, 34>;
>>   _3 = (int) _10;
>>   return _3;
>> This patch optimizes the phi node to generate "MAX_EXPR".
>> The root cause of minmax replacement failure is the type of "_7"
>> is unsigned, but the type of const_int "34" is signed. It makes
>> types_match (c2_type, from_type) return false. So I add another
>> condition to process this scenario.
>>
>> gcc/ChangeLog:
>>
>> * match.pd: Add another condition to process type mismatch.
>
>This should most likely be:
> ((cond (cmp (convert1? x) c1) (convert2? x) c2) pattern): Also allow
>conversions that only change the sign.
>
>>
>> gcc/testsuite/ChangeLog:
>>
>> * gcc.dg/tree-ssa/phi-opt-41.c: New test.
>> ---
>>  gcc/match.pd   |  5 -
>>  gcc/testsuite/gcc.dg/tree-ssa/phi-opt-41.c | 24 ++
>>  2 files changed, 28 insertions(+), 1 deletion(-)
>>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/phi-opt-41.c
>>
>> diff --git a/gcc/match.pd b/gcc/match.pd
>> index 95225e4ca5f..e864845bfa9 100644
>> --- a/gcc/match.pd
>> +++ b/gcc/match.pd
>> @@ -5419,7 +5419,10 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>>  && (types_match (c2_type, from_type)
>>  || (TYPE_PRECISION (c2_type) > TYPE_PRECISION (from_type)
>>  && (TYPE_UNSIGNED (from_type)
>> -    || TYPE_SIGN (c2_type) == TYPE_SIGN (from_type)
>> +    || TYPE_SIGN (c2_type) == TYPE_SIGN (from_type)))
>> + || (TYPE_UNSIGNED (from_type) != TYPE_UNSIGNED (c2_type)
>> + && TYPE_PRECISION (c2_type) == TYPE_PRECISION (from_type)
>> + && !TYPE_OVERFLOW_WRAPS (c2_type
>
>What is the need for TYPE_OVERFLOW_WRAPS here? Also I think you just
>need the check for TYPE_PRECISION instead of the rest.
>Maybe instead of types_match here, tree_nop_conversion_p could be used
>instead. I am not 100% sure though.
>
>I also suspect you should add a few other testcases that don't depend
>on VRP changing things. Maybe a runtime test too.
>
>Thanks,
>Andrew
>


I want to make sure the const_int "c2" won't overflow, so I use
TYPE_OVERFLOW_WRAPS.
I checked the code: tree_nop_conversion_p checks TYPE_PRECISION or
TYPE_MODE and doesn't care about the unsigned flag, so it should be fine for
this scenario. I'm not sure if there's a problem with this modification; I
will run the regression tests to check whether it causes other issues.
Thanks,
Feng Wang


>> {
>>  if (cmp != EQ_EXPR)
>>    code = minmax_from_comparison (cmp, @1, @3, @1, @2);
>> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/phi-opt-41.c 
>> b/gcc/testsuite/gcc.dg/tree-ssa/phi-opt-41.c
>> new file mode 100644
>> index 000..d1101c2f9f7
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.dg/tree-ssa/phi-opt-41.c
>> @@ -0,0 +1,24 @@
>> +/* { dg-do compile } */
>> +/* { dg-options "-O2 -fdump-tree-phiopt2" } */
>> +
>> +int foo1 (int data, int res)
>> +{
>> +  res = data & 0xf;
>> +  res |= res << 4;
>> +  if (res < 0x22)
>> +    return 0x22;
>> +  return res;
>> +}
>> +
>> +int foo2 (int data, int res)
>> +{
>> +  res = data & 0xf;
>> +  unsigned int r = res;
>> +  r*=17;
>> +  res = r;
>> +  if (r < 0x22)
>> +    return 0x22;
>> +  return res;
>> +}
>> +
>> +/* { dg-final { scan-tree-dump-times "MAX_EXPR" 2 "phiopt2" } } */
>> \ No newline at end of file
>> --
>> 2.17.1
>>

Re: [RFA] New pass for sign/zero extension elimination

2023-11-27 Thread Jeff Law




On 11/27/23 11:19, Joern Rennecke wrote:

You are applying PATTERN to an INSN_LIST.
I know :-)  That was the late change to clean up some of the horrific 
control flow in the code.


jeff


Re: [RFA] New pass for sign/zero extension elimination

2023-11-27 Thread Jeff Law




On 11/27/23 10:36, Joern Rennecke wrote:

On 11/20/23 11:26, Richard Sandiford wrote:


+
+  mask = GET_MODE_MASK (GET_MODE (SUBREG_REG (x))) << bit;
+  if (!mask)
+ mask = -0x1ULL;


Not sure I follow this.  What does the -0x1ULL constant indicate?
Also, isn't it the mask of the outer register that is shifted, rather
than the mask of the inner mode?  E.g. if we have:

Jeff Law:

Inherited.  I should have marked it like the other one as needing
investigation.  Probably the fastest way is to just rip it out for a
test to see what breaks.


This is for support of types wider than DImode.

You removed support for tracking these values in various places, though.
Because we don't track liveness beyond DImode in this code at all and if 
you don't filter out the larger modes, things will go wonky in a bad 
way.  It may be less of an issue after fixing the big endian correction 
code which was totally broken.


Supporting TI, OI, etc would certainly be possible, but it didn't seem 
worth the effort on current architectures.


Jeff


Re: [PATCH 2/4] [ifcvt] if convert x=c ? y+z : y by RISC-V Zicond like insns

2023-11-27 Thread Jeff Law




On 11/27/23 19:46, Fei Gao wrote:

On 2023-11-20 14:46  Jeff Law  wrote:




On 10/30/23 21:35, Fei Gao wrote:


So just a few notes to further illustrate why I'm currently looking to
take the VRULL+Ventana implementation.  The code above would be much
better handled by just calling noce_emit_cmove.  noce_emit_cmove will go
through the conditional move expander.  So any improvement we make in
the expander "just work" when called from the if-converter.

noce_emit_czero is used here to make sure czero insns are emitted.
noce_emit_cmove includes SFB and T-Head movcc, which will take precedence
over Zicond in RISC-V if enabled. Unfortunately we have products with SFB and 
Zicond both available and have seen this conflict.
That is also the reason for adding the hook TARGET_HAVE_COND_ZERO
in [PATCH 1/4]: to avoid the inefficient code emitted in the SFB-enabled, 
Zicond-disabled case.

I understand what you're trying to do, but I would consider the
TARGET_HAVE_COND_ZERO fundamentally the wrong approach.

Hi Jeff

Thanks for your review. I just post the new series.
https://www.mail-archive.com/gcc-patches@gcc.gnu.org/msg327148.html
https://www.mail-archive.com/gcc-patches@gcc.gnu.org/msg327151.html
https://www.mail-archive.com/gcc-patches@gcc.gnu.org/msg327149.html
https://www.mail-archive.com/gcc-patches@gcc.gnu.org/msg327150.html

TARGET_HAVE_COND_ZERO has been deleted.
Thanks for the V2.  I'll see what I can do with them this week.  The 
series was posted prior to close of stage1, so it can still be 
integrated into gcc-14 if we get it cleaned up.


Jeff


Re: [PATCH 1/4] [RISC-V] prefer Zicond primitive semantics to SFB

2023-11-27 Thread Jeff Law




On 11/27/23 20:09, Kito Cheng wrote:

Personally I don't like to play with the pattern order to tweak the
code gen since it kinda introduces implicit relation/rule here, but I
guess the only way to prevent that is to duplicate the pattern for SFB
again, which is not an ideal solution...

I won't object to this patch, but I don't really like it either.

This patch highlights that the SFB code is not well integrated with the 
rest of the conditional move support.


Jeff


Re: [PATCH 4/4] [V2] [ifcvt] prefer SFB to Zicond for x=c ? (y op CONST) : y.

2023-11-27 Thread Jeff Law




On 11/27/23 19:32, Fei Gao wrote:

In x=c ? (y op CONST) : y cases, Zicond-based czero ifcvt generates
more true dependencies in the code sequence than SFB-based movcc.  So exit
noce_try_cond_zero_arith in such cases so that noce_try_cmove_arith can
generate a better code sequence.

Take the following case for example.

CFLAGS: -mtune=sifive-7-series -march=rv64gc_zbb_zicond -mabi=lp64d -O2

unsigned int
test_RotateR_eqz_imm_int (unsigned int x, unsigned int y, unsigned int c)
{
   if (c)
 x = (y >> 11) | (y << (32 - 11));
   else
 x = y;
   return x;
}

before patch:
li  a5,11
czero.eqz   a2,a5,a2
rorwa0,a1,a2
ret

after patch:
roriw   a0,a1,11
bne a2,zero,1f  # movcc
mv  a0,a1
1:
ret

Co-authored-by: Xiao Zeng

gcc/ChangeLog:

 * config/riscv/riscv.cc (riscv_have_sfb): New hook implementation.
 (TARGET_HAVE_SFB): Define the hook for RISC-V.
 * doc/tm.texi: Add TARGET_HAVE_SFB.
 * doc/tm.texi.in: Add TARGET_HAVE_SFB.
 * ifcvt.cc (noce_try_cond_zero_arith): Prefer SFB for x=c ? (y op 
CONST) : y.
 * target.def: Add TARGET_HAVE_SFB.
This is not OK.  It's basically a target #if.  It's not significantly 
different than the original kit which had a hook to prefer czero over 
other forms.



Selection between the two forms should be driven by the target cost 
modeling and expansion code.  Using a hook like this isn't acceptable.



Jeff


[PATCH] MATCH: Fix invalid signed boolean type usage

2023-11-27 Thread Andrew Pinski
This fixes the incorrect assumption made in r14-3721-ge6bcf839894783
that doing the negation after the conversion would be valid;
it is not valid for boolean types.

OK? Bootstrapped and tested on x86_64-linux-gnu.

gcc/ChangeLog:

PR tree-optimization/112738
* match.pd (`(nop_convert)-(convert)a`): Reject
when the outer type is boolean.

Signed-off-by: Andrew Pinski 
---
 gcc/match.pd | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/gcc/match.pd b/gcc/match.pd
index 95225e4ca5f..294e58ebf44 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -1033,12 +1033,16 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
 /* (nop_outer_cast)-(inner_cast)var -> -(outer_cast)(var)
if var is smaller in precision.
This is always safe for both doing the negative in signed or unsigned
-   as the value for undefined will not show up.  */
+   as the value for undefined will not show up.
+   Note the outer cast cannot be a boolean type as the only valid values
+   are 0,-1/1 (depending on the signedness of the boolean) and the negation
+   is there to get the correct value.  */
 (simplify
  (convert (negate:s@1 (convert:s @0)))
  (if (INTEGRAL_TYPE_P (type)
   && tree_nop_conversion_p (type, TREE_TYPE (@1))
-  && TYPE_PRECISION (type) > TYPE_PRECISION (TREE_TYPE (@0)))
+  && TYPE_PRECISION (type) > TYPE_PRECISION (TREE_TYPE (@0))
+  && TREE_CODE (type) != BOOLEAN_TYPE)
 (negate (convert @0
 
 (for op (negate abs)
-- 
2.39.3



Re: [PATCH v2] rs6000: Add new pass for replacement of contiguous addresses vector load lxv with lxvp

2023-11-27 Thread Michael Meissner
On Fri, Nov 24, 2023 at 05:31:20PM +0800, Kewen.Lin wrote:
> Hi Ajit,
> 
> Don't forget to CC David (CC-ed) :), some comments are inlined below.
> 
> on 2023/10/8 03:04, Ajit Agarwal wrote:
> > Hello All:
> > 
> > This patch add new pass to replace contiguous addresses vector load lxv 
> > with mma instruction
> > lxvp.
> 
> IMHO the current binding of lxvp (and lxvpx, stxvp{x,}) to MMA looks wrong;
> they only require Power10 and VSX, and these instructions should perform
> well without MMA support.
> So one patch to separate their support from MMA seems to go first.

I tend to agree with you, but I recall the decision being made because at the
time, vector pairs and vector quads were only used with MMA.  We now have
various attempts to improve things for using vector pairs in non-MMA code.  In
my patches, I kept the MMA requirement, but if we decide to make it ISA 3.1
only, it is fairly straightforward to look at all of the TARGET_MMA tests.

Back in the GCC 13 days, it was useful that -mmma controlled vector pairs.
There was an issue where enabling memcpy to use store vector pair would lead
to a slowdown.  When I was doing the tests, it was easy to use -mno-mma, and
it would stop memcpy from using load/store vector pair, since GCC doesn't
generate code to use MMA without using the built-ins.


Re: [PATCH 0/4] Add vector pair support to PowerPC attribute((vector_size(32)))

2023-11-27 Thread Michael Meissner
On Fri, Nov 24, 2023 at 05:41:02PM +0800, Kewen.Lin wrote:
> on 2023/11/20 16:56, Michael Meissner wrote:
> > On Mon, Nov 20, 2023 at 08:24:35AM +0100, Richard Biener wrote:
> >> I wouldn't expose the "fake" larger modes to the vectorizer but rather
> >> adjust m_suggested_unroll_factor (which you already do to some extent).
> > 
> > Thanks.  I figure I first need to fix the shuffle bytes issue and get a
> > clean test run (with the flag enabled by default), before delving into the
> > vectorization issues.
> > 
> > But testing has shown that, at least in the loop I was looking at, using
> > vector pair instructions (either through the built-ins I had previously
> > posted or with these patches), even with unrolling turned off completely
> > for the vector pair case, is still faster than unrolling the loop 4 times
> > using vector types (or auto vectorization).  Note, of course, the margin
> > is much smaller in this case.
> > 
> > vector double:   (a * b) + c, unroll 4 loop time: 0.55483
> > vector double:   (a * b) + c, unroll default   loop time: 0.55638
> > vector double:   (a * b) + c, unroll 0 loop time: 0.55686
> > vector double:   (a * b) + c, unroll 2 loop time: 0.55772
> > 
> > vector32, w/vector pair: (a * b) + c, unroll 4 loop time: 0.48257
> > vector32, w/vector pair: (a * b) + c, unroll 2 loop time: 0.50782
> > vector32, w/vector pair: (a * b) + c, unroll default   loop time: 0.50864
> > vector32, w/vector pair: (a * b) + c, unroll 0 loop time: 0.52224
> > 
> > Of course being micro-benchmarks, it doesn't mean that this translates to 
> > the
> > behavior on actual code.
> > 
> > 
> 
> I noticed that Ajit posted a patch for adding one new pass to replace 
> contiguous
> addresses vector load lxv with lxvp:
> 
> https://inbox.sourceware.org/gcc-patches/ef0c54a5-c35c-3519-f062-9ac78ee66...@linux.ibm.com/
> 
> How about making this kind of rs6000-specific pass pair both vector loads
> and stores?  Users can do more unrolling with parameters, and those memory
> accesses from unrolling should be neat; I'd expect the pass to easily detect
> and pair the candidates.

Yes, I tend to think a combination of things will be needed.  In my tests with
a saxpy-type loop, I could not get the current built-ins for load/store vector
pairs to be fast enough.  Peter's posted code helped, but ultimately it
was still slower than adding vector_size(32).  I will try out the patch and
compare it to my patches.
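
For concreteness, the timed loops were of roughly this shape (a hedged
reconstruction of the (a * b) + c micro-benchmark kernel discussed above;
the actual harness is not shown in the thread):

typedef double v32 __attribute__ ((vector_size (32)));

/* One 32-byte "vector pair"-sized multiply-add per iteration.  */
void
fma_kernel (v32 *restrict out, const v32 *restrict a,
            const v32 *restrict b, const v32 *restrict c, long n)
{
  for (long i = 0; i < n; i++)
    out[i] = (a[i] * b[i]) + c[i];
}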

-- 
Michael Meissner, IBM
PO Box 98, Ayer, Massachusetts, USA, 01432
email: meiss...@linux.ibm.com


[PATCH 0/5] LoongArch: Add -mrecip option support

2023-11-27 Thread Jiahao Xu
LoongArch V1.1 adds support for approximate instructions, which are used along
with additional Newton-Raphson steps to implement single-precision 
floating-point division, square root and reciprocal square root operations for
better throughput.  The generation of approximate instructions is controlled
by the new '-mrecip' and '-mrecip=' options.
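
The underlying numerics, as a minimal sketch (mine, not the patch's code):
one Newton-Raphson step refines a hardware reciprocal estimate x0 of 1/a via
x1 = x0 * (2 - a * x0), roughly doubling the number of correct bits per step:

/* Refine a frecipe-style estimate x0 of 1/a by one Newton-Raphson
   step; with a sufficiently accurate hardware estimate, one step is
   enough for the loosened float precision that -ffast-math permits.  */
static float
recip_nr_step (float a, float x0)
{
  return x0 * (2.0f - a * x0);
}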

Jiahao Xu (5):
  LoongArch: Add support for approximate instructions.
  LoongArch: Use standard pattern name for xvfrsqrt/vfrsqrt
instructions.
  LoongArch: Redefine pattern for xvfrecip/vfrecip instructions.
  LoongArch: New options -mrecip and -mrecip= with ffast-math.
  LoongArch: Vectorized loop unrolling is not performed on
divf/sqrtf/rsqrtf when turning on -mrecip.

 gcc/config/loongarch/genopts/loongarch.opt.in |  11 +
 gcc/config/loongarch/lasx.md  |  89 ++-
 gcc/config/loongarch/lasxintrin.h |  32 +++
 gcc/config/loongarch/loongarch-builtins.cc|  32 +++
 gcc/config/loongarch/loongarch-protos.h   |   2 +
 gcc/config/loongarch/loongarch.cc | 252 +-
 gcc/config/loongarch/loongarch.h  |  18 ++
 gcc/config/loongarch/loongarch.md | 104 ++--
 gcc/config/loongarch/loongarch.opt|  11 +
 gcc/config/loongarch/lsx.md   |  89 ++-
 gcc/config/loongarch/lsxintrin.h  |  32 +++
 gcc/config/loongarch/predicates.md|   8 +
 gcc/doc/invoke.texi   |  52 
 .../gcc.target/loongarch/recip-divf.c |   9 +
 .../gcc.target/loongarch/recip-sqrtf.c|  23 ++
 .../loongarch/vector/lasx/lasx-recip-divf.c   |  12 +
 .../loongarch/vector/lasx/lasx-recip-sqrtf.c  |  28 ++
 .../loongarch/vector/lsx/lsx-recip-divf.c |  12 +
 .../loongarch/vector/lsx/lsx-recip-sqrtf.c|  28 ++
 19 files changed, 803 insertions(+), 41 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/loongarch/recip-divf.c
 create mode 100644 gcc/testsuite/gcc.target/loongarch/recip-sqrtf.c
 create mode 100644 
gcc/testsuite/gcc.target/loongarch/vector/lasx/lasx-recip-divf.c
 create mode 100644 
gcc/testsuite/gcc.target/loongarch/vector/lasx/lasx-recip-sqrtf.c
 create mode 100644 
gcc/testsuite/gcc.target/loongarch/vector/lsx/lsx-recip-divf.c
 create mode 100644 
gcc/testsuite/gcc.target/loongarch/vector/lsx/lsx-recip-sqrtf.c

-- 
2.20.1



Re: [PATCH v6 1/1] c++: Initial support for P0847R7 (Deducing This) [PR102609]

2023-11-27 Thread waffl3x
On Sunday, November 26th, 2023 at 7:40 PM, Jason Merrill  
wrote:


> 
> 
> On 11/26/23 20:44, waffl3x wrote:
> 
> > > > > > The other problem I'm having is
> > > > > > 
> > > > > > auto f0 = [n = 5, &m](this auto const&){ n = 10; };
> > > > > > This errors just fine, the lambda is unconditionally const so
> > > > > > LAMBDA_EXPR_MUTABLE_P flag is set for the closure.
> > > > > > 
> > > > > > This on the other hand does not. The constness of the captures 
> > > > > > depends
> > > > > > on (I assume) LAMBDA_EXPR_MUTABLE_P so all the captures are 
> > > > > > non-const
> > > > > > here.
> > > > > > auto f1 = [n = 5](this auto&& self){ n = 10; };
> > > > > > as_const(f1)();
> > > > > 
> > > > > That sounds correct, why would this be an error?
> > > > > 
> > > > > The constness of the captures doesn't depend on LAMBDA_EXPR_MUTABLE_P,
> > > > > it depends on the type of the object parameter, which in this case is
> > > > > non-const, so so are the captures.
> > > > 
> > > > Oops, I should have just made a less terse example, you missed the
> > > > "as_const" in the call to the lambda. The object parameter should be
> > > > deduced as const here. I definitely should have made that example
> > > > better, my bad.
> > > 
> > > Ah, yes, I see it now.
> > 
> > I don't remember if I relayed my planned fix for this to you. My
> > current idea is to modify the tree during instantiation of the lambda's
> > body somewhere near tsubst and apply const to all it's members. This is
> > unfortunately the best idea I have so far and it feels like an awful
> > hack. I am open to better ideas, but I don't think we can do anything
> > until the template is instantiated so I think it has to be there.
> 
> 
> I think the answer should be in lambda_proxy_type. The case where we
> build a DECLTYPE_TYPE may need to be expanded to cover this situation.
> 
> > Should I wait until I fix the issue in tsubst_lambda_expr before
> > submitting the patch? I'm fine to do it either way, just whatever you
> > prefer. If I finish cleaning up these tests before I hear back I'll go
> > ahead and submit it and then start looking at different solutions in
> > there.
> 
> 
> Go ahead and submit.
> 
> Jason

I'm going to need to sit down and set up a proper e-mail application
once I'm done with all this; I missed your reply because it went off to
another thread. Luckily, I decided to send the patch anyway, and when I
noticed that my patch was not under the same thread I came looking for
it. Ah well, what a pain; I guess getting used to these mailing list
things is just going to take time.

> > It doesn't. The issue is messing with the type of (potentially) a lot
> > of different functions. Even if it doesn't actually break anything, it
> > seems like the kind of hidden mutation that you were objecting to.
> 
> 
> Oh... yeah..., I see the issue now. I still don't think the solution
> used for static lambdas will work, or be painless anyhow, but if I
> can't find something better I will try to use that one.
> 
> > > Well, even so, I can just clear it after it gets used as we just need
> > > it to pass the closure type down. Perhaps I should have led with this,
> > > but as it stands the version that uses TYPE_METHOD_BASETYPE bootstraps
> > > with no regressions. I'll still look deeper but I'm pretty confident in
> > > my decision here, I really don't want to try to unravel what
> > > build_memfn_type does, I would rather find a whole different way of
> > > passing that information down.
> > 
> > But the existing code already works fine, it's just a question of
> > changing the conditions so we handle xob lambdas the same way as static.
> 
> 
> I'm still concerned it wont cooperate with xobj parameters of unrelated
> type, but like I said, you've indicated my solution is definitely wrong
> so I'll look at fixing it.

I spent some time looking at it, and I've decided you're probably right
that handling this the same way as the static lambda case is the best
in the short term. I still don't like it, but I've gone ahead and made
that change, and it seems to work just fine. I still find it icky, but
once I realized we do in fact need lambda_fntype since it might have
been substituted into in tsubst_lambda_expr, I don't see any better way
of doing this at the moment.

Since the added parameter just gets popped off by static_fn_type, and
tsubst_lambda_expr doesn't touch the xobj parameter, I'm pretty sure it
should behave properly. So no problems I guess, moving on to the
captures bug.

Alex



[PATCH 3/5] LoongArch: Redefine pattern for xvfrecip/vfrecip instructions.

2023-11-27 Thread Jiahao Xu
Redefine the pattern for the [x]vfrecip instructions to use the div rtx
code instead of an unspec, and enable the [x]vfrecip instructions to be
generated during auto-vectorization.

gcc/ChangeLog:

* config/loongarch/lasx.md (lasx_xvfrecip_<flasxfmt>): Renamed to ..
(recip<mode>3): .. this.
* config/loongarch/loongarch-builtins.cc (CODE_FOR_lsx_vfrecip_d):
Redefine to new pattern name.
(CODE_FOR_lsx_vfrecip_s): Ditto.
(CODE_FOR_lasx_xvfrecip_d): Ditto.
(CODE_FOR_lasx_xvfrecip_s): Ditto.
(loongarch_expand_builtin_direct): For the vector recip instructions,
construct a temporary parameter const1_vector.
* config/loongarch/lsx.md (lsx_vfrecip_<flsxfmt>): Renamed to ..
(recip<mode>3): .. this.
* config/loongarch/predicates.md (const_vector_1_operand): New
predicate.
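
For illustration, the kind of source that should now pick up [x]vfrecip
during auto-vectorization (a hypothetical example; the -mrecip gating is
added later in this series, so the exact option set is an assumption):

/* e.g. -O2 -mlasx -mrecip -ffast-math (illustrative options).  */
void
recip_loop (float *restrict dst, float *restrict src, int n)
{
  for (int i = 0; i < n; i++)
    dst[i] = 1.0f / src[i];  /* becomes a const-1 / x vector division,
                                matching the new recip<mode>3 pattern.  */
}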

diff --git a/gcc/config/loongarch/lasx.md b/gcc/config/loongarch/lasx.md
index 5f78cc45ccd..2d7b8f02b4b 100644
--- a/gcc/config/loongarch/lasx.md
+++ b/gcc/config/loongarch/lasx.md
@@ -1681,12 +1681,12 @@ (define_insn "lasx_xvfmina_"
   [(set_attr "type" "simd_fminmax")
(set_attr "mode" "")])
 
-(define_insn "lasx_xvfrecip_"
+(define_insn "recip3"
   [(set (match_operand:FLASX 0 "register_operand" "=f")
-   (unspec:FLASX [(match_operand:FLASX 1 "register_operand" "f")]
- UNSPEC_LASX_XVFRECIP))]
+   (div:FLASX (match_operand:FLASX 1 "const_vector_1_operand" "")
+ (match_operand:FLASX 2 "register_operand" "f")))]
   "ISA_HAS_LASX"
-  "xvfrecip.\t%u0,%u1"
+  "xvfrecip.\t%u0,%u2"
   [(set_attr "type" "simd_fdiv")
(set_attr "mode" "")])
 
diff --git a/gcc/config/loongarch/loongarch-builtins.cc 
b/gcc/config/loongarch/loongarch-builtins.cc
index 43d853bc961..9d3644dbb9b 100644
--- a/gcc/config/loongarch/loongarch-builtins.cc
+++ b/gcc/config/loongarch/loongarch-builtins.cc
@@ -475,6 +475,8 @@ AVAIL_ALL (lasx, ISA_HAS_LASX)
 #define CODE_FOR_lsx_vssrlrn_wu_d CODE_FOR_lsx_vssrlrn_u_wu_d
 #define CODE_FOR_lsx_vfrsqrt_d CODE_FOR_rsqrtv2df2
 #define CODE_FOR_lsx_vfrsqrt_s CODE_FOR_rsqrtv4sf2
+#define CODE_FOR_lsx_vfrecip_d CODE_FOR_recipv2df3
+#define CODE_FOR_lsx_vfrecip_s CODE_FOR_recipv4sf3
 
 /* LoongArch ASX define CODE_FOR_lasx_mxxx */
 #define CODE_FOR_lasx_xvsadd_b CODE_FOR_ssaddv32qi3
@@ -747,6 +749,8 @@ AVAIL_ALL (lasx, ISA_HAS_LASX)
 #define CODE_FOR_lasx_xvsat_du CODE_FOR_lasx_xvsat_u_du
 #define CODE_FOR_lasx_xvfrsqrt_d CODE_FOR_rsqrtv4df2
 #define CODE_FOR_lasx_xvfrsqrt_s CODE_FOR_rsqrtv8sf2
+#define CODE_FOR_lasx_xvfrecip_d CODE_FOR_recipv4df3
+#define CODE_FOR_lasx_xvfrecip_s CODE_FOR_recipv8sf3
 
 static const struct loongarch_builtin_description loongarch_builtins[] = {
 #define LARCH_MOVFCSR2GR 0
@@ -2978,6 +2982,22 @@ loongarch_expand_builtin_direct (enum insn_code icode, 
rtx target, tree exp,
   if (has_target_p)
 create_output_operand (&ops[opno++], target, TYPE_MODE (TREE_TYPE (exp)));
 
+  /* For the vector reciprocal instructions, we need to construct a temporary
+ parameter const1_vector.  */
+  switch (icode)
+{
+case CODE_FOR_recipv8sf3:
+case CODE_FOR_recipv4df3:
+case CODE_FOR_recipv4sf3:
+case CODE_FOR_recipv2df3:
+  loongarch_prepare_builtin_arg (&ops[2], exp, 0);
+  create_input_operand (&ops[1], CONST1_RTX (ops[0].mode), ops[0].mode);
+  return loongarch_expand_builtin_insn (icode, 3, ops, has_target_p);
+
+default:
+  break;
+}
+
   /* Map the arguments to the other operands.  */
   gcc_assert (opno + call_expr_nargs (exp)
  == insn_data[icode].n_generator_args);
diff --git a/gcc/config/loongarch/lsx.md b/gcc/config/loongarch/lsx.md
index 130d77e164b..20946326e37 100644
--- a/gcc/config/loongarch/lsx.md
+++ b/gcc/config/loongarch/lsx.md
@@ -1609,12 +1609,12 @@ (define_insn "lsx_vfmina_"
   [(set_attr "type" "simd_fminmax")
(set_attr "mode" "")])
 
-(define_insn "lsx_vfrecip_"
+(define_insn "recip3"
   [(set (match_operand:FLSX 0 "register_operand" "=f")
-   (unspec:FLSX [(match_operand:FLSX 1 "register_operand" "f")]
-UNSPEC_LSX_VFRECIP))]
+   (div:FLSX (match_operand:FLSX 1 "const_vector_1_operand" "")
+(match_operand:FLSX 2 "register_operand" "f")))]
   "ISA_HAS_LSX"
-  "vfrecip.\t%w0,%w1"
+  "vfrecip.\t%w0,%w2"
   [(set_attr "type" "simd_fdiv")
(set_attr "mode" "")])
 
diff --git a/gcc/config/loongarch/predicates.md 
b/gcc/config/loongarch/predicates.md
index d02e846cb12..f7796da10b2 100644
--- a/gcc/config/loongarch/predicates.md
+++ b/gcc/config/loongarch/predicates.md
@@ -227,6 +227,10 @@ (define_predicate "const_1_operand"
   (and (match_code "const_int,const_wide_int,const_double,const_vector")
(match_test "op == CONST1_RTX (GET_MODE (op))")))
 
+(define_predicate "const_vector_1_operand"
+  (and (match_code "const_vector")
+   (match_test "op == CONST1_RTX (GET_MODE (op))")))
+
 (define_predicate "reg_or_1_operand"
   (ior (match_operand 0 "const_1_operand")
(match_operand 0 "register_op

[PATCH 5/5] LoongArch: Vectorized loop unrolling is not performed on divf/sqrtf/rsqrtf when -mrecip is turned on.

2023-11-27 Thread Jiahao Xu
Using -mrecip generates a sequence of instructions to replace divf,
sqrtf and rsqrtf. The number of generated instructions is close to or
exceeds the maximum issue width of the LoongArch, so vectorized loop
unrolling is not performed on them.

gcc/ChangeLog:

* config/loongarch/loongarch.cc
(loongarch_vector_costs::determine_suggested_unroll_factor): Return an
unroll factor of 1 if m_has_recip is set.
(loongarch_vector_costs::add_stmt_cost): Detect the use of approximate
instruction sequences.
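
As a concrete illustration (a hypothetical example, not from the
testsuite), consider:

void
sqrt_loop (float *restrict dst, float *restrict src, int n)
{
  /* Under -mrecip -ffast-math this expands to an approximate-estimate
     plus Newton-Raphson sequence; that sequence already fills the
     issue slots, so the suggested unroll factor is kept at 1.  */
  for (int i = 0; i < n; i++)
    dst[i] = __builtin_sqrtf (src[i]);
}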

diff --git a/gcc/config/loongarch/loongarch.cc 
b/gcc/config/loongarch/loongarch.cc
index afee09c3b61..894ce0e1630 100644
--- a/gcc/config/loongarch/loongarch.cc
+++ b/gcc/config/loongarch/loongarch.cc
@@ -3974,7 +3974,9 @@ protected:
   /* Reduction factor for suggesting unroll factor.  */
   unsigned m_reduc_factor = 0;
   /* True if the loop contains an average operation. */
-  bool m_has_avg =false;
+  bool m_has_avg = false;
+  /* True if the loop uses approximation instruction sequence.  */
+  bool m_has_recip = false;
 };
 
 /* Implement TARGET_VECTORIZE_CREATE_COSTS.  */
@@ -4021,7 +4023,7 @@ loongarch_vector_costs::determine_suggested_unroll_factor 
(loop_vec_info loop_vi
 {
   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 
-  if (m_has_avg)
+  if (m_has_avg || m_has_recip)
 return 1;
 
   /* Don't unroll if it's specified explicitly not to be unrolled.  */
@@ -4081,6 +4083,36 @@ loongarch_vector_costs::add_stmt_cost (int count, 
vect_cost_for_stmt kind,
}
 }
 
+  combined_fn cfn;
+  if (kind == vector_stmt
+  && stmt_info
+  && stmt_info->stmt)
+{
+  /* Detect the use of approximate instruction sequence.  */
+  if ((TARGET_RECIP_VEC_SQRT || TARGET_RECIP_VEC_RSQRT)
+ && (cfn = gimple_call_combined_fn (stmt_info->stmt)) != CFN_LAST)
+   switch (cfn)
+ {
+ case CFN_BUILT_IN_SQRTF:
+   m_has_recip = true;
+ default:
+   break;
+ }
+  else if (TARGET_RECIP_VEC_DIV
+  && gimple_code (stmt_info->stmt) == GIMPLE_ASSIGN)
+   {
+ machine_mode mode = TYPE_MODE (vectype);
+ switch (gimple_assign_rhs_code (stmt_info->stmt))
+   {
+   case RDIV_EXPR:
+ if (GET_MODE_INNER (mode) == SFmode)
+   m_has_recip = true;
+   default:
+ break;
+   }
+   }
+}
+
   return retval;
 }
 
-- 
2.20.1



[PATCH 4/5] LoongArch: New options -mrecip and -mrecip= with ffast-math.

2023-11-27 Thread Jiahao Xu
When the -mrecip option is turned on, use approximate reciprocal
instructions and approximate reciprocal square root instructions with
additional Newton-Raphson steps to implement single-precision
floating-point division, square root and reciprocal square root
operations, for better throughput.

gcc/ChangeLog:

* config/loongarch/genopts/loongarch.opt.in (recip_mask): New variable.
(-mrecip, -mrecip=): New options.
* config/loongarch/lasx.md (div<mode>3): New expander.
(*div<mode>3): Rename.
(sqrt<mode>2): New expander.
(*sqrt<mode>2): Rename.
(rsqrt<mode>2): New expander.
* config/loongarch/loongarch-protos.h (loongarch_emit_swrsqrtsf): New
prototype.
(loongarch_emit_swdivsf): Ditto.
* config/loongarch/loongarch.cc (loongarch_option_override_internal):
Set recip_mask for -mrecip and -mrecip= options.
(loongarch_emit_swrsqrtsf): New function.
(loongarch_emit_swdivsf): Ditto.
(use_rsqrt_p): Ditto.
(loongarch_optab_supported_p): Ditto.
(TARGET_OPTAB_SUPPORTED_P): New hook.
* config/loongarch/loongarch.h (RECIP_MASK_NONE): New bitmasks.
(RECIP_MASK_DIV): Ditto.
(RECIP_MASK_SQRT): Ditto.
(RECIP_MASK_RSQRT): Ditto.
(RECIP_MASK_VEC_DIV): Ditto.
(RECIP_MASK_VEC_SQRT): Ditto.
(RECIP_MASK_VEC_RSQRT): Ditto.
(RECIP_MASK_ALL): Ditto.
(TARGET_RECIP_DIV): New tests.
(TARGET_RECIP_SQRT): Ditto.
(TARGET_RECIP_RSQRT): Ditto.
(TARGET_RECIP_VEC_DIV): Ditto.
(TARGET_RECIP_VEC_SQRT): Ditto.
(TARGET_RECIP_VEC_RSQRT): Ditto.
* config/loongarch/loongarch.md (sqrt<mode>2): New expander.
(*sqrt<mode>2): Rename.
(rsqrt<mode>2): New expander.
* config/loongarch/loongarch.opt (recip_mask): New variable.
(-mrecip, -mrecip=): New options.
* config/loongarch/lsx.md (div<mode>3): New expander.
(*div<mode>3): Rename.
(sqrt<mode>2): New expander.
(*sqrt<mode>2): Rename.
(rsqrt<mode>2): New expander.
* config/loongarch/predicates.md (reg_or_vecotr_1_operand): New
predicate.
* doc/invoke.texi (LoongArch Options): Document new options.

gcc/testsuite/ChangeLog:

* gcc.target/loongarch/recip-divf.c: New test.
* gcc.target/loongarch/recip-sqrtf.c: New test.
* gcc.target/loongarch/vector/lasx/lasx-recip-divf.c: New test.
* gcc.target/loongarch/vector/lasx/lasx-recip-sqrtf.c: New test.
* gcc.target/loongarch/vector/lsx/lsx-recip-divf.c: New test.
* gcc.target/loongarch/vector/lsx/lsx-recip-sqrtf.c: New test.
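
For reference, loongarch_emit_swdivsf implements the classic one-step
Newton-Raphson refinement of the hardware estimate. In scalar C terms
the expansion is roughly the following sketch (approx_recip stands in
for the frecipe estimate and is not a real builtin name):

float
swdiv_sketch (float a, float b)
{
  float e = approx_recip (b);  /* hardware reciprocal estimate      */
  e = e * (2.0f - b * e);      /* one Newton-Raphson step           */
  return a * e;                /* a / b is approximated as a * (1/b) */
}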

diff --git a/gcc/config/loongarch/genopts/loongarch.opt.in 
b/gcc/config/loongarch/genopts/loongarch.opt.in
index 8af6cc6f532..cc1a9daf7cf 100644
--- a/gcc/config/loongarch/genopts/loongarch.opt.in
+++ b/gcc/config/loongarch/genopts/loongarch.opt.in
@@ -23,6 +23,9 @@ config/loongarch/loongarch-opts.h
 HeaderInclude
 config/loongarch/loongarch-str.h
 
+TargetVariable
+unsigned int recip_mask = 0
+
 ; ISA related options
 ;; Base ISA
 Enum
@@ -197,6 +200,14 @@ mexplicit-relocs
 Target Var(la_opt_explicit_relocs_backward) Init(M_OPT_UNSET)
 Use %reloc() assembly operators (for backward compatibility).
 
+mrecip
+Target RejectNegative Var(loongarch_recip)
+Generate approximate reciprocal divide and square root for better throughput.
+
+mrecip=
+Target RejectNegative Joined Var(loongarch_recip_name)
+Control generation of reciprocal estimates.
+
 ; The code model option names for -mcmodel.
 Enum
 Name(cmodel) Type(int)
diff --git a/gcc/config/loongarch/lasx.md b/gcc/config/loongarch/lasx.md
index 2d7b8f02b4b..08c81ef53e4 100644
--- a/gcc/config/loongarch/lasx.md
+++ b/gcc/config/loongarch/lasx.md
@@ -1249,7 +1249,25 @@ (define_insn "mul3"
   [(set_attr "type" "simd_fmul")
(set_attr "mode" "")])
 
-(define_insn "div3"
+(define_expand "div3"
+  [(set (match_operand:FLASX 0 "register_operand")
+(div:FLASX (match_operand:FLASX 1 "reg_or_vecotr_1_operand")
+  (match_operand:FLASX 2 "register_operand")))]
+  "ISA_HAS_LASX"
+{
+  if (mode == V8SFmode
+&& TARGET_RECIP_VEC_DIV
+&& optimize_insn_for_speed_p ()
+&& flag_finite_math_only && !flag_trapping_math
+&& flag_unsafe_math_optimizations)
+  {
+loongarch_emit_swdivsf (operands[0], operands[1],
+   operands[2], V8SFmode);
+DONE;
+  }
+})
+
+(define_insn "*div3"
   [(set (match_operand:FLASX 0 "register_operand" "=f")
(div:FLASX (match_operand:FLASX 1 "register_operand" "f")
   (match_operand:FLASX 2 "register_operand" "f")))]
@@ -1278,7 +1296,23 @@ (define_insn "fnma4"
   [(set_attr "type" "simd_fmadd")
(set_attr "mode" "")])
 
-(define_insn "sqrt2"
+(define_expand "sqrt2"
+  [(set (match_operand:FLASX 0 "register_operand")
+(sqrt:FLASX (match_operand:FLASX 1 "register_operand")))]
+  "ISA_HAS_LASX"
+{
+  if (mode == V8SFmode
+  && TARGET_RECIP_VEC_SQRT
+  && flag_unsafe_math_optimizations
+  && optimize_insn_for

[PATCH 2/5] LoongArch: Use standard pattern name for xvfrsqrt/vfrsqrt instructions.

2023-11-27 Thread Jiahao Xu
Rename lasx_xvfrsqrt*/lsx_vfrsqrt* to rsqrt<mode>2 to align with the
standard pattern name.

gcc/ChangeLog:

* config/loongarch/lasx.md (lasx_xvfrsqrt_<flasxfmt>): Renamed to ..
(*rsqrt<mode>2): .. this.
* config/loongarch/loongarch-builtins.cc
(CODE_FOR_lsx_vfrsqrt_d): Redefine to standard pattern name.
(CODE_FOR_lsx_vfrsqrt_s): Ditto.
(CODE_FOR_lasx_xvfrsqrt_d): Ditto.
(CODE_FOR_lasx_xvfrsqrt_s): Ditto.
* config/loongarch/loongarch.md (*rsqrt<mode>a): Remove.
(*rsqrt<mode>2): New insn pattern.
(*rsqrt<mode>b): Remove.
* config/loongarch/lsx.md (lsx_vfrsqrt_<flsxfmt>): Renamed to ..
(*rsqrt<mode>2): .. this.
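
With the standard pattern name in place, a loop such as the following
(hypothetical example) can eventually be matched to [x]vfrsqrt through
rsqrt<mode>2 under unsafe-math optimizations (together with the -mrecip
support later in this series; the exact gating is an assumption),
instead of being reachable only via the LSX/LASX builtins:

void
rsqrt_loop (float *restrict dst, float *restrict src, int n)
{
  for (int i = 0; i < n; i++)
    dst[i] = 1.0f / __builtin_sqrtf (src[i]);
}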

diff --git a/gcc/config/loongarch/lasx.md b/gcc/config/loongarch/lasx.md
index dd60d2bfed3..5f78cc45ccd 100644
--- a/gcc/config/loongarch/lasx.md
+++ b/gcc/config/loongarch/lasx.md
@@ -1710,10 +1710,10 @@ (define_insn "lasx_xvfrint_"
   [(set_attr "type" "simd_fcvt")
(set_attr "mode" "")])
 
-(define_insn "lasx_xvfrsqrt_"
+(define_insn "rsqrt2"
   [(set (match_operand:FLASX 0 "register_operand" "=f")
-   (unspec:FLASX [(match_operand:FLASX 1 "register_operand" "f")]
- UNSPEC_LASX_XVFRSQRT))]
+(unspec:FLASX [(match_operand:FLASX 1 "register_operand" "f")]
+ UNSPEC_LASX_XVFRSQRT))]
   "ISA_HAS_LASX"
   "xvfrsqrt.\t%u0,%u1"
   [(set_attr "type" "simd_fdiv")
diff --git a/gcc/config/loongarch/loongarch-builtins.cc 
b/gcc/config/loongarch/loongarch-builtins.cc
index 47f658d6ab5..43d853bc961 100644
--- a/gcc/config/loongarch/loongarch-builtins.cc
+++ b/gcc/config/loongarch/loongarch-builtins.cc
@@ -473,6 +473,8 @@ AVAIL_ALL (lasx, ISA_HAS_LASX)
 #define CODE_FOR_lsx_vssrlrn_bu_h CODE_FOR_lsx_vssrlrn_u_bu_h
 #define CODE_FOR_lsx_vssrlrn_hu_w CODE_FOR_lsx_vssrlrn_u_hu_w
 #define CODE_FOR_lsx_vssrlrn_wu_d CODE_FOR_lsx_vssrlrn_u_wu_d
+#define CODE_FOR_lsx_vfrsqrt_d CODE_FOR_rsqrtv2df2
+#define CODE_FOR_lsx_vfrsqrt_s CODE_FOR_rsqrtv4sf2
 
 /* LoongArch ASX define CODE_FOR_lasx_mxxx */
 #define CODE_FOR_lasx_xvsadd_b CODE_FOR_ssaddv32qi3
@@ -743,6 +745,8 @@ AVAIL_ALL (lasx, ISA_HAS_LASX)
 #define CODE_FOR_lasx_xvsat_hu CODE_FOR_lasx_xvsat_u_hu
 #define CODE_FOR_lasx_xvsat_wu CODE_FOR_lasx_xvsat_u_wu
 #define CODE_FOR_lasx_xvsat_du CODE_FOR_lasx_xvsat_u_du
+#define CODE_FOR_lasx_xvfrsqrt_d CODE_FOR_rsqrtv4df2
+#define CODE_FOR_lasx_xvfrsqrt_s CODE_FOR_rsqrtv8sf2
 
 static const struct loongarch_builtin_description loongarch_builtins[] = {
 #define LARCH_MOVFCSR2GR 0
diff --git a/gcc/config/loongarch/loongarch.md 
b/gcc/config/loongarch/loongarch.md
index 7b09926d1a7..0b6910d84ab 100644
--- a/gcc/config/loongarch/loongarch.md
+++ b/gcc/config/loongarch/loongarch.md
@@ -60,6 +60,7 @@ (define_c_enum "unspec" [
   UNSPEC_TIE
 
   ;; RSQRT
+  UNSPEC_RSQRT
   UNSPEC_RSQRTE
 
   ;; RECIP
@@ -1137,25 +1138,14 @@ (define_insn "sqrt2"
(set_attr "mode" "")
(set_attr "insn_count" "1")])
 
-(define_insn "*rsqrta"
+(define_insn "*rsqrt2"
   [(set (match_operand:ANYF 0 "register_operand" "=f")
-   (div:ANYF (match_operand:ANYF 1 "const_1_operand" "")
- (sqrt:ANYF (match_operand:ANYF 2 "register_operand" "f"]
-  "flag_unsafe_math_optimizations"
-  "frsqrt.\t%0,%2"
-  [(set_attr "type" "frsqrt")
-   (set_attr "mode" "")
-   (set_attr "insn_count" "1")])
-
-(define_insn "*rsqrtb"
-  [(set (match_operand:ANYF 0 "register_operand" "=f")
-   (sqrt:ANYF (div:ANYF (match_operand:ANYF 1 "const_1_operand" "")
-(match_operand:ANYF 2 "register_operand" "f"]
-  "flag_unsafe_math_optimizations"
-  "frsqrt.\t%0,%2"
+(unspec:ANYF [(match_operand:ANYF 1 "register_operand" "f")]
+UNSPEC_RSQRT))]
+  "TARGET_HARD_FLOAT"
+  "frsqrt.\t%0,%1"
   [(set_attr "type" "frsqrt")
-   (set_attr "mode" "")
-   (set_attr "insn_count" "1")])
+   (set_attr "mode" "")])
 
 ;; Approximate Reciprocal Square Root Instructions.
 
diff --git a/gcc/config/loongarch/lsx.md b/gcc/config/loongarch/lsx.md
index 391e84f8d1d..130d77e164b 100644
--- a/gcc/config/loongarch/lsx.md
+++ b/gcc/config/loongarch/lsx.md
@@ -1638,10 +1638,10 @@ (define_insn "lsx_vfrint_"
   [(set_attr "type" "simd_fcvt")
(set_attr "mode" "")])
 
-(define_insn "lsx_vfrsqrt_"
+(define_insn "rsqrt2"
   [(set (match_operand:FLSX 0 "register_operand" "=f")
-   (unspec:FLSX [(match_operand:FLSX 1 "register_operand" "f")]
-UNSPEC_LSX_VFRSQRT))]
+(unspec:FLSX [(match_operand:FLSX 1 "register_operand" "f")]
+UNSPEC_LSX_VFRSQRT))]
   "ISA_HAS_LSX"
   "vfrsqrt.\t%w0,%w1"
   [(set_attr "type" "simd_fdiv")
-- 
2.20.1



[PATCH 1/5] LoongArch: Add support for approximate instructions.

2023-11-27 Thread Jiahao Xu
LA664 introduces new instructions for reciprocal approximation and
reciprocal square root approximation. It includes the scalar
instructions frecipe and frsqrte, as well as their corresponding vector
instructions [x]vfrecipe and [x]vfrsqrte. This patch adds
define_insn/builtins/intrinsics for these instructions.

gcc/ChangeLog:

* config/loongarch/lasx.md (lasx_xvfrecipe_<flasxfmt>): New insn
pattern.
(lasx_xvfrsqrte_<flasxfmt>): Ditto.
* config/loongarch/lasxintrin.h (__lasx_xvfrecipe_s): New intrinsic.
(__lasx_xvfrecipe_d): Ditto.
(__lasx_xvfrsqrte_s): Ditto.
(__lasx_xvfrsqrte_d): Ditto.
* config/loongarch/loongarch-builtins.cc: Add new builtin functions.
* config/loongarch/loongarch.md (recipe2): New insn pattern.
(rsqrte): Ditto.
* config/loongarch/lsx.md (lsx_vfrecipe_<flsxfmt>): Ditto.
(lsx_vfrsqrte_<flsxfmt>): Ditto.
* config/loongarch/lsxintrin.h (__lsx_vfrecipe_s): New intrinsic.
(__lsx_vfrecipe_d): Ditto.
(__lsx_vfrsqrte_s): Ditto.
(__lsx_vfrsqrte_d): Ditto.
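
A minimal usage example for the new intrinsics (illustrative; it
assumes a toolchain/target providing the LA664 estimate instructions,
so the option spelling below is an assumption):

#include <lasxintrin.h>

/* e.g. -O2 -mlasx plus an -march that includes the frecipe feature.  */
__m256
recip_estimate (__m256 x)
{
  return __lasx_xvfrecipe_s (x);  /* xvfrecipe.s: reciprocal estimate */
}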

diff --git a/gcc/config/loongarch/lasx.md b/gcc/config/loongarch/lasx.md
index 2e11f061202..dd60d2bfed3 100644
--- a/gcc/config/loongarch/lasx.md
+++ b/gcc/config/loongarch/lasx.md
@@ -40,8 +40,10 @@ (define_c_enum "unspec" [
   UNSPEC_LASX_XVFCVTL
   UNSPEC_LASX_XVFLOGB
   UNSPEC_LASX_XVFRECIP
+  UNSPEC_LASX_XVFRECIPE
   UNSPEC_LASX_XVFRINT
   UNSPEC_LASX_XVFRSQRT
+  UNSPEC_LASX_XVFRSQRTE
   UNSPEC_LASX_XVFCMP_SAF
   UNSPEC_LASX_XVFCMP_SEQ
   UNSPEC_LASX_XVFCMP_SLE
@@ -1688,6 +1690,17 @@ (define_insn "lasx_xvfrecip_"
   [(set_attr "type" "simd_fdiv")
(set_attr "mode" "")])
 
+;; Approximate Reciprocal Instructions.
+
+(define_insn "lasx_xvfrecipe_"
+  [(set (match_operand:FLASX 0 "register_operand" "=f")
+(unspec:FLASX [(match_operand:FLASX 1 "register_operand" "f")]
+ UNSPEC_LASX_XVFRECIPE))]
+  "ISA_HAS_LASX"
+  "xvfrecipe.\t%u0,%u1"
+  [(set_attr "type" "simd_fdiv")
+   (set_attr "mode" "")])
+
 (define_insn "lasx_xvfrint_"
   [(set (match_operand:FLASX 0 "register_operand" "=f")
(unspec:FLASX [(match_operand:FLASX 1 "register_operand" "f")]
@@ -1706,6 +1719,17 @@ (define_insn "lasx_xvfrsqrt_"
   [(set_attr "type" "simd_fdiv")
(set_attr "mode" "")])
 
+;; Approximate Reciprocal Square Root Instructions.
+
+(define_insn "lasx_xvfrsqrte_"
+  [(set (match_operand:FLASX 0 "register_operand" "=f")
+(unspec:FLASX [(match_operand:FLASX 1 "register_operand" "f")]
+ UNSPEC_LASX_XVFRSQRTE))]
+  "ISA_HAS_LASX"
+  "xvfrsqrte.\t%u0,%u1"
+  [(set_attr "type" "simd_fdiv")
+   (set_attr "mode" "")])
+
 (define_insn "lasx_xvftint_s__"
   [(set (match_operand: 0 "register_operand" "=f")
(unspec: [(match_operand:FLASX 1 "register_operand" "f")]
diff --git a/gcc/config/loongarch/lasxintrin.h 
b/gcc/config/loongarch/lasxintrin.h
index 7bce2c757f1..3017361a924 100644
--- a/gcc/config/loongarch/lasxintrin.h
+++ b/gcc/config/loongarch/lasxintrin.h
@@ -2399,6 +2399,22 @@ __m256d __lasx_xvfrecip_d (__m256d _1)
   return (__m256d)__builtin_lasx_xvfrecip_d ((v4f64)_1);
 }
 
+/* Assembly instruction format: xd, xj.  */
+/* Data types in instruction templates:  V8SF, V8SF.  */
+extern __inline __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
+__m256 __lasx_xvfrecipe_s (__m256 _1)
+{
+  return (__m256)__builtin_lasx_xvfrecipe_s ((v8f32)_1);
+}
+
+/* Assembly instruction format: xd, xj.  */
+/* Data types in instruction templates:  V4DF, V4DF.  */
+extern __inline __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
+__m256d __lasx_xvfrecipe_d (__m256d _1)
+{
+  return (__m256d)__builtin_lasx_xvfrecipe_d ((v4f64)_1);
+}
+
 /* Assembly instruction format:xd, xj.  */
 /* Data types in instruction templates:  V8SF, V8SF.  */
 extern __inline __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
@@ -2431,6 +2447,22 @@ __m256d __lasx_xvfrsqrt_d (__m256d _1)
   return (__m256d)__builtin_lasx_xvfrsqrt_d ((v4f64)_1);
 }
 
+/* Assembly instruction format: xd, xj.  */
+/* Data types in instruction templates:  V8SF, V8SF.  */
+extern __inline __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
+__m256 __lasx_xvfrsqrte_s (__m256 _1)
+{
+  return (__m256)__builtin_lasx_xvfrsqrte_s ((v8f32)_1);
+}
+
+/* Assembly instruction format: xd, xj.  */
+/* Data types in instruction templates:  V4DF, V4DF.  */
+extern __inline __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
+__m256d __lasx_xvfrsqrte_d (__m256d _1)
+{
+  return (__m256d)__builtin_lasx_xvfrsqrte_d ((v4f64)_1);
+}
+
 /* Assembly instruction format:xd, xj.  */
 /* Data types in instruction templates:  V8SF, V8SF.  */
 extern __inline __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
diff --git a/gcc/config/loongarch/loongarch-builtins.cc 
b/gcc/config/loongarch/loongarch-builtins.cc
index db02aacdc3f..47f658d6ab5 100644
--- a/gcc/config/loongarch/loongarch-builtins.cc
+

Re: [PATCH 1/4] [RISC-V] prefer Zicond primitive semantics to SFB

2023-11-27 Thread Kito Cheng
Personally I don't like playing with the pattern order to tweak the
code gen, since it kind of introduces an implicit relation/rule here,
but I guess the only way to prevent that is to duplicate the pattern
for SFB again, which is not an ideal solution...

Anyway, it's obviously better code gen, so LGTM :)

On Tue, Nov 28, 2023 at 10:33 AM Fei Gao  wrote:
>
> Move Zicond md files ahead of SFB to recognize Zicond first.
>
> Take the following case for example.
>
> CFLAGS: -mtune=sifive-7-series -march=rv64gc_zicond -mabi=lp64d
>
> long primitiveSemantics_00(long a, long b) { return a == 0 ? 0 : b; }
>
> before patch:
> primitiveSemantics_00:
> bne a0,zero,1f  # movcc
> mv  a1,zero
> 1:
> mv  a0,a1
> ret
>
> after patch:
> primitiveSemantics_00:
> czero.eqz   a0,a1,a0
> ret
>
> Co-authored-by: Xiao Zeng
>
> gcc/ChangeLog:
>
> * config/riscv/riscv.md (*mov<GPR:mode><X:mode>cc): Move to sfb.md.
> * config/riscv/sfb.md: New file.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/riscv/zicond-sfb-primitiveSemantics.c: New test.
> ---
>  gcc/config/riscv/riscv.md | 19 +--
>  gcc/config/riscv/sfb.md   | 37 ++
>  .../riscv/zicond-sfb-primitiveSemantics.c | 50 +++
>  3 files changed, 88 insertions(+), 18 deletions(-)
>  create mode 100644 gcc/config/riscv/sfb.md
>  create mode 100644 
> gcc/testsuite/gcc.target/riscv/zicond-sfb-primitiveSemantics.c
>
> diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
> index 935eeb7fd8e..d020988446f 100644
> --- a/gcc/config/riscv/riscv.md
> +++ b/gcc/config/riscv/riscv.md
> @@ -2711,24 +2711,6 @@
>DONE;
>  })
>
> -;; Patterns for implementations that optimize short forward branches.
> -
> -(define_insn "*movcc"
> -  [(set (match_operand:GPR 0 "register_operand" "=r,r")
> -   (if_then_else:GPR
> -(match_operator 5 "ordered_comparison_operator"
> -   [(match_operand:X 1 "register_operand" "r,r")
> -(match_operand:X 2 "reg_or_0_operand" "rJ,rJ")])
> -(match_operand:GPR 3 "register_operand" "0,0")
> -(match_operand:GPR 4 "sfb_alu_operand" "rJ,IL")))]
> -  "TARGET_SFB_ALU"
> -  "@
> -   b%C5\t%1,%z2,1f\t# movcc\;mv\t%0,%z4\n1:
> -   b%C5\t%1,%z2,1f\t# movcc\;li\t%0,%4\n1:"
> -  [(set_attr "length" "8")
> -   (set_attr "type" "sfb_alu")
> -   (set_attr "mode" "")])
> -
>  ;; Used to implement built-in functions.
>  (define_expand "condjump"
>[(set (pc)
> @@ -3748,5 +3730,6 @@
>  (include "generic-ooo.md")
>  (include "vector.md")
>  (include "zicond.md")
> +(include "sfb.md")
>  (include "zc.md")
>  (include "corev.md")
> diff --git a/gcc/config/riscv/sfb.md b/gcc/config/riscv/sfb.md
> new file mode 100644
> index 000..52af4b17d46
> --- /dev/null
> +++ b/gcc/config/riscv/sfb.md
> @@ -0,0 +1,37 @@
> +;; Machine description for short forward branches(SFB).
> +;; Copyright (C) 2023 Free Software Foundation, Inc.
> +
> +;; This file is part of GCC.
> +
> +;; GCC is free software; you can redistribute it and/or modify
> +;; it under the terms of the GNU General Public License as published by
> +;; the Free Software Foundation; either version 3, or (at your option)
> +;; any later version.
> +
> +;; GCC is distributed in the hope that it will be useful,
> +;; but WITHOUT ANY WARRANTY; without even the implied warranty of
> +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> +;; GNU General Public License for more details.
> +
> +;; You should have received a copy of the GNU General Public License
> +;; along with GCC; see the file COPYING3.  If not see
> +;; <http://www.gnu.org/licenses/>.
> +
> +
> +;; Patterns for implementations that optimize short forward branches.
> +
> +(define_insn "*movcc"
> +  [(set (match_operand:GPR 0 "register_operand" "=r,r")
> +   (if_then_else:GPR
> +(match_operator 5 "ordered_comparison_operator"
> +   [(match_operand:X 1 "register_operand" "r,r")
> +(match_operand:X 2 "reg_or_0_operand" "rJ,rJ")])
> +(match_operand:GPR 3 "register_operand" "0,0")
> +(match_operand:GPR 4 "sfb_alu_operand" "rJ,IL")))]
> +  "TARGET_SFB_ALU"
> +  "@
> +   b%C5\t%1,%z2,1f\t# movcc\;mv\t%0,%z4\n1:
> +   b%C5\t%1,%z2,1f\t# movcc\;li\t%0,%4\n1:"
> +  [(set_attr "length" "8")
> +   (set_attr "type" "sfb_alu")
> +   (set_attr "mode" "")])
> diff --git a/gcc/testsuite/gcc.target/riscv/zicond-sfb-primitiveSemantics.c 
> b/gcc/testsuite/gcc.target/riscv/zicond-sfb-primitiveSemantics.c
> new file mode 100644
> index 000..2c60656d5eb
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/zicond-sfb-primitiveSemantics.c
> @@ -0,0 +1,50 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mtune=sifive-7-series -march=rv64gc_zicond -mabi=lp64d" { 
> target { rv64 } } } */
> +/* { dg-options "-mtune=sifive-7-series -march=rv32gc_zicond -mabi=ilp32f" { 
> target { rv32 } } } */
> +/* { dg-skip-

Re: [PATCH v2] gimple-match.pd Add more optimization for gimple_cond

2023-11-27 Thread Andrew Pinski
On Mon, Nov 27, 2023 at 6:56 PM Feng Wang  wrote:
>
> The link of PATCH v1: 
> https://www.mail-archive.com/gcc-patches@gcc.gnu.org/msg326661.html
> This patch add another condition for gimple-cond optimization. Refer to
> the following test case.
> int foo1 (int data, int res)
> {
>   res = data & 0xf;
>   res |= res << 4;
>   if (res < 0x22)
> return 0x22;
>   return res;
> }
> with the compilation flag "-O2",
> before this patch the log info of phiopt2 pass is
>[local count: 1073741824]:
>   res_5 = data_1(D) & 15;
>   _6 = (unsigned int) res_5;
>   _7 = _6 * 17;
>   res_8 = (int) _7;
>   if (_7 <= 33)
> goto ; [21.72%]
>   else
> goto ; [78.28%]
>
>[local count: 233216728]:
>
>[local count: 1073741824]:
>   # _9 = PHI 
>   return _9;
> after this patch the the log info of phiopt2 pass is
>[local count: 1073741824]:
>   res_5 = data_1(D) & 15;
>   _6 = (unsigned int) res_5;
>   _7 = _6 * 17;
>   res_8 = (int) _7;
>   _10 = MAX_EXPR <_7, 34>;
>   _3 = (int) _10;
>   return _3;
> This patch optimizes the phi node to generate "MAX_EXPR".
> The root cause of minmax replacement failure is the type of "_7"
> is unsigned, but the type of const_int "34" is signed. It makes
> types_match (c2_type, from_type) return false. So I add another
> condition to process this scenario.
>
> gcc/ChangeLog:
>
> * match.pd: Add another condition to process type mismatch.

This should most likely be:
 ((cond (cmp (convert1? x) c1) (convert2? x) c2) pattern): Also allow
conversions that only change the sign.

>
> gcc/testsuite/ChangeLog:
>
> * gcc.dg/tree-ssa/phi-opt-41.c: New test.
> ---
>  gcc/match.pd   |  5 -
>  gcc/testsuite/gcc.dg/tree-ssa/phi-opt-41.c | 24 ++
>  2 files changed, 28 insertions(+), 1 deletion(-)
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/phi-opt-41.c
>
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 95225e4ca5f..e864845bfa9 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -5419,7 +5419,10 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>  && (types_match (c2_type, from_type)
>  || (TYPE_PRECISION (c2_type) > TYPE_PRECISION (from_type)
>  && (TYPE_UNSIGNED (from_type)
> -|| TYPE_SIGN (c2_type) == TYPE_SIGN (from_type)
> +|| TYPE_SIGN (c2_type) == TYPE_SIGN (from_type)))
> + || (TYPE_UNSIGNED (from_type) != TYPE_UNSIGNED (c2_type)
> + && TYPE_PRECISION (c2_type) == TYPE_PRECISION (from_type)
> + && !TYPE_OVERFLOW_WRAPS (c2_type

What is the need for TYPE_OVERFLOW_WRAPS here? Also I think you just
need the check for TYPE_PRECISION instead of the rest.
Maybe instead of types_match here, tree_nop_conversion_p could be used
instead. I am not 100% sure though.

I also suspect you should add a few other testcases that don't depend
on VRP changing things. Maybe a runtime test too.
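Something like the following, perhaps (untested sketch):

/* { dg-do run } */
/* { dg-options "-O2" } */
__attribute__ ((noipa)) int
foo1 (int data, int res)
{
  res = data & 0xf;
  res |= res << 4;
  if (res < 0x22)
    return 0x22;
  return res;
}

int
main (void)
{
  if (foo1 (0, 0) != 0x22 || foo1 (1, 0) != 0x22
      || foo1 (2, 0) != 0x22 || foo1 (0xf, 0) != 0xff)
    __builtin_abort ();
  return 0;
}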

Thanks,
Andrew

> {
>  if (cmp != EQ_EXPR)
>code = minmax_from_comparison (cmp, @1, @3, @1, @2);
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/phi-opt-41.c 
> b/gcc/testsuite/gcc.dg/tree-ssa/phi-opt-41.c
> new file mode 100644
> index 000..d1101c2f9f7
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/phi-opt-41.c
> @@ -0,0 +1,24 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-tree-phiopt2" } */
> +
> +int foo1 (int data, int res)
> +{
> +  res = data & 0xf;
> +  res |= res << 4;
> +  if (res < 0x22)
> +return 0x22;
> +  return res;
> +}
> +
> +int foo2 (int data, int res)
> +{
> +  res = data & 0xf;
> +  unsigned int r = res;
> +  r*=17;
> +  res = r;
> +  if (r < 0x22)
> +return 0x22;
> +  return res;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "MAX_EXPR" 2 "phiopt2" } } */
> \ No newline at end of file
> --
> 2.17.1
>


Re: Re: [PATCH 4/4] [ifcvt] if convert x=c ? y&z : y by RISC-V Zicond like insns

2023-11-27 Thread Fei Gao
On 2023-11-20 15:10  Jeff Law  wrote:
>
>
>
>On 10/30/23 01:25, Fei Gao wrote:
>
>> diff --git a/gcc/ifcvt.cc b/gcc/ifcvt.cc
>> index 6e341fc4d4b..cfa9bc4b850 100644
>> --- a/gcc/ifcvt.cc
>> +++ b/gcc/ifcvt.cc
>> @@ -2911,7 +2911,7 @@ noce_try_sign_mask (struct noce_if_info *if_info)
>>   static bool
>>   noce_cond_zero_binary_op_supported (enum rtx_code op)
>>   {
>> -  if (op == PLUS || op == MINUS || op == IOR || op == XOR)
>> +  if (op == PLUS || op == MINUS || op == IOR || op == XOR || op == AND)
>>   return true;
>Include ASHIFT, LSHIFTRT, ASHIFTRT, ROTATE, ROTATERT.  That should pick
>up that critical conditional-shift-by-6 in leela. 

Done.

>
>
>
>
>> +  if (opcode == AND)
>> +    {
>> +  tmp
>> += expand_simple_binop (mode, AND, common, z, NULL_RTX, 0, OPTAB_DIRECT);
>OPTAB_WIDEN here I think. 
I restructured the code to have a simpler implementation. But AND is
different from the other operations in czero-based ifcvt, so I kept the
ugly code locally. I will refine it and post it if the current code is
accepted.

>
>
>> +  if (!tmp)
>> +{
>> +  end_sequence ();
>> +  return FALSE;
>> +}
>>  
>> -  /* If we have x = c ? x + z : x, use a new reg to avoid modifying x  */
>> -  if (common && rtx_equal_p (common, if_info->x))
>> -    target = gen_reg_rtx (mode);
>> -  else
>> -    target = if_info->x;
>> +  target = noce_emit_czero (if_info, czero_code, common, if_info->x);
>> +  if (!target)
>> +{
>> +  end_sequence ();
>> +  return FALSE;
>Please try to be consistent with upper/lower case.  In your prior
>patches you used lower case for true/false.  In this patch you're using
>upper case.  Lower case seems to be the standard in that file, so use
>lower case. 
>
>> +}
>>  
>> -  target = noce_emit_czero (if_info, czero_code, z, target);
>> -  if (!target)
>> -    {
>> -  end_sequence ();
>> -  return false;
>> +  target = expand_simple_binop (mode, IOR, tmp, target, if_info->x, 0,
>> +    OPTAB_DIRECT);
>>   }
>> +  else
>> +    {
>> +  /* If we have x = c ? x + z : x, use a new reg to avoid modifying x  
>> */
>> +  if (common && rtx_equal_p (common, if_info->x))
>> +target = gen_reg_rtx (mode);
>> +  else
>> +target = if_info->x;
>As noted before you may not be able to generate a new register when
>ifcvt is run after register allocation.  Your code needs to handle that
>correctly.
>
>
>> +
>> +  target = noce_emit_czero (if_info, czero_code, z, target);
>> +  if (!target)
>> +{
>> +  end_sequence ();
>> +  return false;
>> +}
>>  
>> -  target = expand_simple_binop (mode, opcode, common, target, if_info->x, 0,
>> -OPTAB_DIRECT);
>> +  target = expand_simple_binop (mode, opcode, common, target, 
>> if_info->x, 0,
>> +    OPTAB_DIRECT);
>OPTAB_WIDEN.
>
>And the usual comments about avoiding explicit registers in the tests.
>
>
>I would suggest you try to handle this case as well, I don't think it's
>handled by your current code:
>
>long
>eq2 (long a, long b)
>{
>   if (a == 0)
> return b;
>
>   return 0;
>} 
I tried both in the old and new series; the Zicond insns are generated.
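
For reference, the lowering one would expect for the eq2 case above is
something like:

eq2:
	czero.nez	a0,a1,a0
	ret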

BR, 
Fei
>
>
>There's probably also a negated version of that to be handled as well.
>
>
>Overall I think we can go forward with your patches after things are
>fixed.  I'm inclined to wait until after Maciej has integrated his
>changes before actually committing them.  While I don't expect problems,
>I wouldn't want Maciej to have to respin a 40+ patch series.
>
>Note that while we transition to stage3 development today, your patch
>was posted while we were in stage1, so you've met the deadline.  We just
>need to get the updates done relatively soon rather than having it drag
>late into stage3.
>
>Jeff

Re: [PATCH V2] introduce light expander sra

2023-11-27 Thread Jiufu Guo


Hi,

Thanks so much for your helpful review!
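
For context, here is the shape of code this patch aims to improve (an
illustrative sketch in the spirit of PR65421, not the exact testcase):

typedef struct { double x[4]; } A;  /* passed and returned in FP regs
                                       on e.g. ppc64le ELFv2.  */
A
add1 (A arg)
{
  A ret;
  for (int i = 0; i < 4; i++)
    ret.x[i] = arg.x[i] + 1.0;
  return ret;  /* With the patch, arg/ret are accessed directly through
                  the incoming/outgoing registers instead of through a
                  stack slot.  */
}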

Richard Biener  writes:

> On Fri, Oct 27, 2023 at 3:51 AM Jiufu Guo  wrote:
>>
>> Hi,
>>
>> Compare with previous version:
>> https://gcc.gnu.org/pipermail/gcc-patches/2023-October/632399.html
>> This verion supports TI/VEC mode of the access.
>>
>> There are a few PRs (meta-bug PR101926) on various targets.
>> The root causes of them are similar: the aggregate params/returns
>> are passed in multiple registers, but they are first stored from the
>> registers to the stack, and the parameter is then accessed through
>> the stack slot.
>>
>> A general idea to enhance this: accessing the aggregate
>> parameters/returns directly through incoming/outgoing
>> scalar registers.  This idea would be a kind of SRA.
>>
>> This experimental patch for light-expander-sra contains
>> below parts:
>>
>> a. Check if the parameters/returns are ok/profitable to
>>scalarize, and set the incoming/outgoing registers(
>>pseudos) for the parameter/return.
>>   - This is done in "expand_function_start", after the
>> incoming/outgoing hard registers are determined for the
>> parameter/return.
>> The scalarized registers are recorded in DECL_RTL for
>> the parameter/return in parallel form.
>>   - At the time when setting DECL_RTL, "scalarizable_aggregate"
>> is called to check the accesses are ok/profitable to
>> scalarize.
>> We can continue to enhance this function, to support
>> more cases.  For example:
>> - 'reverse storage order'.
>> - 'writing to parameter'/'overlap accesses'.
>>
>> b. When expanding the accesses of the parameters/returns,
>>according to the info of the access(e.g. bitpos,bitsize,
>>mode), the scalar(pseudos) can be figured out to expand
>>the access.  This may happen when expand below accesses:
>>   - The component access of a parameter: "_1 = arg.f1".
>> Or whole parameter access: rhs of "_2 = arg"
>>   - The assignment to a return val:
>> "D.xx = yy; or D.xx.f = zz" where D.xx occurs on return
>> stmt.
>>   - This is mainly done in expr.cc(expand_expr_real_1, and
>> expand_assignment).  Function "extract_sub_member" is
>> used to figure out the scalar rtxs(pseudos).
>>
>> Besides the above two parts, some work are done in the GIMPLE
>> tree:  collect sra candidates for parameters/returns, and
>> collect the SRA access info.
>> This is mainly done at the beginning of the expander pass.
>> Below are two major items of this part.
>>  - Collect light-expand-sra candidates.
>>   Each parameter is checked if it has the proper aggregate
>>   type.  Collect return val (VAR_P) on each return stmts if
>>   the function is returning via registers.
>>   This is implemented in expand_sra::collect_sra_candidates.
>>
>>  - Build/collect/manage all the access on the candidates.
>>   The function "scan_function" is used to do this work, it
>>   goes through all basicblocks, and all interesting stmts (
>>   phi, return, assign, call, asm) are checked.
>>   If there is an interesting expression (e.g. COMPONENT_REF
>>   or PARM_DECL), then record the required info for the access
>>   (e.g. pos, size, type, base).
>>   And if it is risky to do SRA, the candidates may be removed.
>>   e.g. address-taken and accessed via memory.
>>   "foo(struct S arg) {bar (&arg);}"
>>
>> This patch is tested on ppc64{,le} and x86_64.
>> Is this ok for trunk?
>>
>> BR,
>> Jeff (Jiufu Guo)
>>
>> PR target/65421
>>
>> gcc/ChangeLog:
>>
>> * cfgexpand.cc (struct access): New class.
>> (struct expand_sra): New class.
>> (expand_sra::collect_sra_candidates): New member function.
>> (expand_sra::add_sra_candidate): Likewise.
>> (expand_sra::build_access): Likewise.
>> (expand_sra::analyze_phi): Likewise.
>> (expand_sra::analyze_assign): Likewise.
>> (expand_sra::visit_base): Likewise.
>> (expand_sra::protect_mem_access_in_stmt): Likewise.
>> (expand_sra::expand_sra):  Class constructor.
>> (expand_sra::~expand_sra): Class destructor.
>> (expand_sra::scalarizable_access):  New member function.
>> (expand_sra::scalarizable_accesses):  Likewise.
>> (scalarizable_aggregate):  New function.
>> (set_scalar_rtx_for_returns):  New function.
>> (expand_value_return): Updated.
>> (expand_debug_expr): Updated.
>> (pass_expand::execute): Updated to use expand_sra.
>> * cfgexpand.h (scalarizable_aggregate): New declare.
>> (set_scalar_rtx_for_returns): New declare.
>> * expr.cc (expand_assignment): Updated.
>> (expand_constructor): Updated.
>> (query_position_in_parallel): New function.
>> (extract_sub_member): New function.
>> (expand_expr_real_1): Updated.
>> * expr.h (query_position_in_parallel): New declare.
>> * function.cc (assign_parm_setup_block): Updated.
>> (assign_parms): Updated.
>> (expand_function_start): 

Re: Re: [PATCH 2/4] [ifcvt] if convert x=c ? y+z : y by RISC-V Zicond like insns

2023-11-27 Thread Fei Gao
On 2023-11-20 14:59  Jeff Law  wrote:
>
>
>
>On 10/30/23 01:25, Fei Gao wrote:
>> Conditional add, if zero
>> rd = (rc == 0) ? (rs1 + rs2) : rs1
>> -->
>> czero.nez rd, rs2, rc
>> add rd, rs1, rd
>>
>> Conditional add, if non-zero
>> rd = (rc != 0) ? (rs1 + rs2) : rs1
>> -->
>> czero.eqz rd, rs2, rc
>> add rd, rs1, rd
>>
>> Co-authored-by: Xiao Zeng
>>
>> gcc/ChangeLog:
>>
>>  * ifcvt.cc (noce_emit_czero): helper for noce_try_cond_zero_arith
>>  (noce_try_cond_zero_arith): handler for condtional zero op
>>  (noce_process_if_block): add noce_try_cond_zero_arith with hook 
>>control
>>
>> gcc/testsuite/ChangeLog:
>>
>>  * gcc.target/riscv/zicond_ifcvt_opt.c: New test.
>> ---
>>   gcc/ifcvt.cc  | 112 +++
>>   .../gcc.target/riscv/zicond_ifcvt_opt.c   | 130 ++
>>   2 files changed, 242 insertions(+)
>>   create mode 100644 gcc/testsuite/gcc.target/riscv/zicond_ifcvt_opt.c
>>
>> diff --git a/gcc/ifcvt.cc b/gcc/ifcvt.cc
>> index a0af553b9ff..4f98c1c7bf9 100644
>> --- a/gcc/ifcvt.cc
>> +++ b/gcc/ifcvt.cc
>> @@ -781,12 +781,14 @@ static bool noce_try_store_flag_constants (struct 
>> noce_if_info *);
>>   static bool noce_try_store_flag_mask (struct noce_if_info *);
>>   static rtx noce_emit_cmove (struct noce_if_info *, rtx, enum rtx_code, rtx,
>>       rtx, rtx, rtx, rtx = NULL, rtx = NULL);
>> +static rtx noce_emit_czero (struct noce_if_info *, enum rtx_code, rtx, rtx);
>>   static bool noce_try_cmove (struct noce_if_info *);
>>   static bool noce_try_cmove_arith (struct noce_if_info *);
>>   static rtx noce_get_alt_condition (struct noce_if_info *, rtx, rtx_insn 
>>**);
>>   static bool noce_try_minmax (struct noce_if_info *);
>>   static bool noce_try_abs (struct noce_if_info *);
>>   static bool noce_try_sign_mask (struct noce_if_info *);
>> +static bool noce_try_cond_zero_arith (struct noce_if_info *);
>>  
>>   /* Return the comparison code for reversed condition for IF_INFO,
>>  or UNKNOWN if reversing the condition is not possible.  */
>> @@ -1831,6 +1833,32 @@ noce_emit_cmove (struct noce_if_info *if_info, rtx x, 
>> enum rtx_code code,
>>   return NULL_RTX;
>>   }
>>  
>> +static rtx
>> +noce_emit_czero (struct noce_if_info *if_info, enum rtx_code czero_code, 
>> rtx z, rtx target)
>Every function needs a comment describing what the function does, it's
>return value(s) and its arguments.  There are many examples in ifcvt.cc
>you can use to guide you.  I might start with something like this:
>
>/* Emit a conditional zero, returning the location of the result
>    or NULL_RTX upon failure.
>
>    IF_INFO describes the if-conversion scenario under consideration.
>    CZERO_CODE selects the condition (EQ/NE).
>    Z is the nonzero operand of the conditional move
>    TARGET is the desired output register.  */
>
>Or something like that.  I would suggest renaming "Z" to something more
>meaningful. 
Hi Jeff

Thanks for your patience. All comments regarding coding style have been
addressed in the new patches.

>
>
>
>>  
>> +/* Convert x = c ? y + z : y or x = c ? y : y + z. */
>> +
>> +static bool
>> +noce_try_cond_zero_arith (struct noce_if_info *if_info)
>The function comment really should be improved.  For example it doesn't
>indicate what the return value is.
>
>> +
>> +  /* cond must be EQ or NEQ comparision of a reg and 0.  */
>In general when you refer to a variable in a comment, do so in upper
>case.  Use NE rather than NEQ as the former is how most code refers to a
>not-equal rtx code.
>
>
>> +  if (GET_CODE (cond) != NE && GET_CODE (cond) != EQ)
>> +    return false;
>> +  if (!REG_P (XEXP (cond, 0)) || !rtx_equal_p (XEXP (cond, 1), const0_rtx))
>> +    return false;
>> +
>> +  /* check y + z:y*/
>> +  if (GET_CODE (a) == PLUS && REG_P (XEXP (a, 0)) && REG_P (XEXP (a, 1))
>> +  && REG_P (b) && rtx_equal_p (XEXP (a, 0), b))
>Write comments as complete sentences.
>
>> +    {
>> +  common = b;
>> +  z = XEXP (a, 1);
>Rather than "z" use a more descriptive variable name.
>
>
>> +
>> +  /* If we have x = c ? x + z : x, use a new reg to avoid modifying x  */
>> +  if (common && rtx_equal_p (common, if_info->x))
>> +    target = gen_reg_rtx (mode);
>> +  else
>> +    target = if_info->x;
>if-conversion runs both before and after register allocation.  So you
>have to handle the case where you can not generate new registers.  Use
>can_create_pseudo_p () to test for that.  You may need to fail if you
>can't generate a new register. 
1. In the find_if_header function, I found the following piece of code:
if (!reload_completed && noce_find_if_block(...)), and find_if_header
must be called before noce_try_cond_zero_arith().

2. In noce_try_store_flag_constants, new registers are also generated
without a can_create_pseudo_p() check.

So I guess there is no need to add can_create_pseudo_p() here.

>
>> +
>> +  target = noce_emit_czero (if_info, czero_code, z, target);
>> +  if (!target)
>> +    {
>> +  end_seque

[PATCH v2] gimple-match.pd Add more optimization for gimple_cond

2023-11-27 Thread Feng Wang
The link of PATCH v1: 
https://www.mail-archive.com/gcc-patches@gcc.gnu.org/msg326661.html
This patch add another condition for gimple-cond optimization. Refer to
the following test case.
int foo1 (int data, int res)
{
  res = data & 0xf;
  res |= res << 4;
  if (res < 0x22)
return 0x22;
  return res;
}
with the compilation flag "-O2",
before this patch the log info of phiopt2 pass is
   [local count: 1073741824]:
  res_5 = data_1(D) & 15;
  _6 = (unsigned int) res_5;
  _7 = _6 * 17;
  res_8 = (int) _7;
  if (_7 <= 33)
goto ; [21.72%]
  else
goto ; [78.28%]

   [local count: 233216728]:

   [local count: 1073741824]:
  # _9 = PHI 
  return _9;
after this patch the the log info of phiopt2 pass is
   [local count: 1073741824]:
  res_5 = data_1(D) & 15;
  _6 = (unsigned int) res_5;
  _7 = _6 * 17;
  res_8 = (int) _7;
  _10 = MAX_EXPR <_7, 34>;
  _3 = (int) _10;
  return _3;
This patch optimizes the phi node to generate "MAX_EXPR".
The root cause of minmax replacement failure is the type of "_7"
is unsigned, but the type of const_int "34" is signed. It makes
types_match (c2_type, from_type) return false. So I add another
condition to process this scenario.

gcc/ChangeLog:

* match.pd: Add another condition to process type mismatch.

gcc/testsuite/ChangeLog:

* gcc.dg/tree-ssa/phi-opt-41.c: New test.
---
 gcc/match.pd   |  5 -
 gcc/testsuite/gcc.dg/tree-ssa/phi-opt-41.c | 24 ++
 2 files changed, 28 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/phi-opt-41.c

diff --git a/gcc/match.pd b/gcc/match.pd
index 95225e4ca5f..e864845bfa9 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -5419,7 +5419,10 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
 && (types_match (c2_type, from_type)
 || (TYPE_PRECISION (c2_type) > TYPE_PRECISION (from_type)
 && (TYPE_UNSIGNED (from_type)
-|| TYPE_SIGN (c2_type) == TYPE_SIGN (from_type)
+|| TYPE_SIGN (c2_type) == TYPE_SIGN (from_type)))
+ || (TYPE_UNSIGNED (from_type) != TYPE_UNSIGNED (c2_type)
+ && TYPE_PRECISION (c2_type) == TYPE_PRECISION (from_type)
+ && !TYPE_OVERFLOW_WRAPS (c2_type
{
 if (cmp != EQ_EXPR)
   code = minmax_from_comparison (cmp, @1, @3, @1, @2);
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/phi-opt-41.c 
b/gcc/testsuite/gcc.dg/tree-ssa/phi-opt-41.c
new file mode 100644
index 000..d1101c2f9f7
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/phi-opt-41.c
@@ -0,0 +1,24 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-phiopt2" } */
+
+int foo1 (int data, int res)
+{
+  res = data & 0xf;
+  res |= res << 4;
+  if (res < 0x22)
+return 0x22;
+  return res;
+}
+
+int foo2 (int data, int res)
+{
+  res = data & 0xf;
+  unsigned int r = res;
+  r*=17;
+  res = r;
+  if (r < 0x22)
+return 0x22;
+  return res;
+}
+
+/* { dg-final { scan-tree-dump-times "MAX_EXPR" 2 "phiopt2" } } */
\ No newline at end of file
-- 
2.17.1



Re: Re: [PATCH 2/4] [ifcvt] if convert x=c ? y+z : y by RISC-V Zicond like insns

2023-11-27 Thread Fei Gao
On 2023-11-20 14:46  Jeff Law  wrote:
>
>
>
>On 10/30/23 21:35, Fei Gao wrote:
>
>>> So just a few notes to further illustrate why I'm currently looking to
>>> take the VRULL+Ventana implementation.  The code above would be much
>>> better handled by just calling noce_emit_cmove.  noce_emit_cmove will go
>>> through the conditional move expander.  So any improvement we make in
>>> the expander "just work" when called from the if-converter.
>> noce_emit_czero is used here to make sure czero insns are emitted.
>> noce_emit_cmove includes SFB and Thead movcc, which will take precedence
>> over zicond in RISCV if enabled. Unfortunately we have products with SFB and 
>> Zicond
>> both available and saw such conflict.
>> And that is also the reason to add hook TARGET_HAVE_COND_ZERO
>> in [PATCH 1/4] to disallow inefficient code emitted in the
>> SFB-enabled, Zicond-disabled case.
>I understand what you're trying to do, but I would consider the
>TARGET_HAVE_COND_ZERO fundamentally the wrong approach. 
Hi Jeff

Thanks for your review. I just post the new series.
https://www.mail-archive.com/gcc-patches@gcc.gnu.org/msg327148.html
https://www.mail-archive.com/gcc-patches@gcc.gnu.org/msg327151.html
https://www.mail-archive.com/gcc-patches@gcc.gnu.org/msg327149.html
https://www.mail-archive.com/gcc-patches@gcc.gnu.org/msg327150.html

TARGET_HAVE_COND_ZERO has been deleted. 

>
>I'm willing to defer routing everything through noce_emit_cmove for now,
>but that's really where this code needs to be going.  If that's causing
>a conflict for a particular implementation with both SFB and Zicond,
>then we'll have to look at the details and adjust things in the target
>files. 
Agreed. We can try noce_emit_cmove later, with the TCs integrated recently.
Also I tried to solve the conflict found in my TCs in [PATCH 1/4] and [PATCH 
4/4].

>
>
>> Cool and waiting for your submit. Shifts/rotates can be added in 
>> noce_try_cond_zero_arith.
>Fully agreed.  Those are easy. 
Shifts/rotates have been added. 

BR, 
Fei
>
>> I tried to keep noce_try_cond_zero_arith simple without introducing SCC and 
>> other stuff
>> as addtional insns will be generated for greater than like comparision
>> but may not be generated for branch-insn based SFB.
>And I think the result is we're going to fail to implement many
>profitable if-conversions.
>
>
>Jeff

[PATCH 2/4] [ifcvt] optimize x=c ? (y op z) : y by RISC-V Zicond like insns

2023-11-27 Thread Fei Gao
op=[PLUS, MINUS, IOR, XOR, ASHIFT, ASHIFTRT, LSHIFTRT, ROTATE, ROTATERT]

SIGN_EXTEND, ZERO_EXTEND and SUBREG have been considered
to support SImode on 64-bit machines.

Conditional op, if zero
rd = (rc == 0) ? (rs1 op rs2) : rs1
-->
czero.nez rd, rs2, rc
op rd, rs1, rd

Conditional op, if non-zero
rd = (rc != 0) ? (rs1 op rs2) : rs1
-->
czero.eqz rd, rs2, rc
op rd, rs1, rd
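
In C terms, the shape being converted is (illustrative):

long
cond_add (long rc, long rs1, long rs2)
{
  return rc ? (rs1 + rs2) : rs1;  /* czero.eqz rd,rs2,rc ; add rd,rs1,rd */
}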

Co-authored-by: Xiao Zeng

gcc/ChangeLog:

* ifcvt.cc (noce_try_cond_zero_arith): Handler for conditional zero
based ifcvt.
(noce_emit_czero): Helper for noce_try_cond_zero_arith.
(noce_cond_zero_binary_op_supported): Check supported OPs for
conditional zero based ifcvt.
(get_base_reg): Get base reg of a subreg or the reg itself.
(noce_bbs_ok_for_cond_zero_arith): Check if BBs are OK for conditional
zero based ifcvt.
(noce_process_if_block): Add noce_try_cond_zero_arith.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/zicond_ifcvt_opt.c: New test.
---
 gcc/ifcvt.cc  | 210 ++
 .../gcc.target/riscv/zicond_ifcvt_opt.c   | 682 ++
 2 files changed, 892 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/riscv/zicond_ifcvt_opt.c

diff --git a/gcc/ifcvt.cc b/gcc/ifcvt.cc
index a0af553b9ff..8f6a0e7f5fe 100644
--- a/gcc/ifcvt.cc
+++ b/gcc/ifcvt.cc
@@ -787,6 +787,7 @@ static rtx noce_get_alt_condition (struct noce_if_info *, 
rtx, rtx_insn **);
 static bool noce_try_minmax (struct noce_if_info *);
 static bool noce_try_abs (struct noce_if_info *);
 static bool noce_try_sign_mask (struct noce_if_info *);
+static int noce_try_cond_zero_arith (struct noce_if_info *);
 
 /* Return the comparison code for reversed condition for IF_INFO,
or UNKNOWN if reversing the condition is not possible.  */
@@ -1831,6 +1832,40 @@ noce_emit_cmove (struct noce_if_info *if_info, rtx x, 
enum rtx_code code,
 return NULL_RTX;
 }
 
+/*  Emit a conditional zero, returning TARGET or NULL_RTX upon failure.
+IF_INFO describes the if-conversion scenario under consideration.
+CZERO_CODE selects the condition (EQ/NE).
+NON_ZERO_OP is the nonzero operand of the conditional move
+TARGET is the desired output register.  */
+
+static rtx
+noce_emit_czero (struct noce_if_info *if_info, enum rtx_code czero_code,
+rtx non_zero_op, rtx target)
+{
+  machine_mode mode = GET_MODE (target);
+  rtx cond_op0 = XEXP (if_info->cond, 0);
+  rtx czero_cond
+= gen_rtx_fmt_ee (czero_code, GET_MODE (cond_op0), cond_op0, const0_rtx);
+  rtx if_then_else
+= gen_rtx_IF_THEN_ELSE (mode, czero_cond, const0_rtx, non_zero_op);
+  rtx set = gen_rtx_SET (target, if_then_else);
+
+  start_sequence ();
+  rtx_insn *insn = emit_insn (set);
+
+  if (recog_memoized (insn) >= 0)
+{
+  rtx_insn *seq = get_insns ();
+  end_sequence ();
+  emit_insn (seq);
+
+  return target;
+}
+
+  end_sequence ();
+  return NULL_RTX;
+}
+
 /* Try only simple constants and registers here.  More complex cases
are handled in noce_try_cmove_arith after noce_try_store_flag_arith
has had a go at it.  */
@@ -2880,6 +2915,178 @@ noce_try_sign_mask (struct noce_if_info *if_info)
   return true;
 }
 
+/*  Check if OP is supported by conditional zero based if conversion,
+returning TRUE if satisfied otherwise FALSE.
+
+OP is the operation to check.  */
+
+static bool
+noce_cond_zero_binary_op_supported (rtx op)
+{
+  enum rtx_code opcode = GET_CODE (op);
+
+  /* Strip SIGN_EXTEND or ZERO_EXTEND if any.  */
+  if (opcode == SIGN_EXTEND || opcode == ZERO_EXTEND)
+opcode = GET_CODE (XEXP (op, 0));
+
+  if (opcode == PLUS || opcode == MINUS || opcode == IOR || opcode == XOR
+  || opcode == ASHIFT || opcode == ASHIFTRT || opcode == LSHIFTRT
+  || opcode == ROTATE || opcode == ROTATERT)
+return true;
+
+  return false;
+}
+
+/*  Helper function to return REG itself or inner expression of a SUBREG,
+otherwise NULL_RTX for other RTX_CODE.  */
+
+static rtx
+get_base_reg (rtx exp)
+{
+  if (REG_P (exp))
+return exp;
+  else if (SUBREG_P (exp))
+return SUBREG_REG (exp);
+
+  return NULL_RTX;
+}
+
+/*  Check if IF-BB and THEN-BB satisfy the condition for conditional zero
+based if conversion, returning TRUE if satisfied otherwise FALSE.
+
+IF_INFO describes the if-conversion scenario under consideration.
+COMMON_PTR points to the common REG of canonicalized IF_INFO->A and
+IF_INFO->B.
+CZERO_CODE_PTR points to the comparison code to use in czero RTX.
+A_PTR points to the A expression of canonicalized IF_INFO->A.
+TO_REPLACE points to the RTX to be replaced by czero RTX destnation.  */
+
+static bool
+noce_bbs_ok_for_cond_zero_arith (struct noce_if_info *if_info, rtx *common_ptr,
+enum rtx_code *czero_code_ptr, rtx *a_ptr,
+rtx **to_replace)
+{
+  rtx common = NULL_RTX;
+  rtx cond = if_info->cond;
+  rtx a = copy_rtx (if_info->a);
+  rtx 

[PATCH 4/4] [V2] [ifcvt] prefer SFB to Zicond for x=c ? (y op CONST) : y.

2023-11-27 Thread Fei Gao
In x=c ? (y op CONST) : y cases, Zicond-based czero ifcvt generates
more true dependencies in the code sequence than SFB-based movcc. So
exit noce_try_cond_zero_arith in such cases to get a better code
sequence generated by noce_try_cmove_arith.

Take the following case for example.

CFLAGS: -mtune=sifive-7-series -march=rv64gc_zbb_zicond -mabi=lp64d -O2

unsigned int
test_RotateR_eqz_imm_int (unsigned int x, unsigned int y, unsigned int c)
{
  if (c)
x = (y >> 11) | (y << (32 - 11));
  else
x = y;
  return x;
}

before patch:
li  a5,11
czero.eqz   a2,a5,a2
rorwa0,a1,a2
ret

after patch:
roriw   a0,a1,11
bne a2,zero,1f  # movcc
mv  a0,a1
1:
ret

Co-authored-by: Xiao Zeng

gcc/ChangeLog:

* config/riscv/riscv.cc (riscv_have_sfb): Hook implementation.
(TARGET_HAVE_SFB): Define hook in riscv.
* doc/tm.texi: Add TARGET_HAVE_SFB.
* doc/tm.texi.in: Add TARGET_HAVE_SFB.
* ifcvt.cc (noce_try_cond_zero_arith): Prefer SFB for
x=c ? (y op CONST) : y.
* target.def: Add TARGET_HAVE_SFB.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/zicond_sfb_ifcvt_opt.c: New test.
---
 gcc/config/riscv/riscv.cc |   12 +
 gcc/doc/tm.texi   |4 +
 gcc/doc/tm.texi.in|2 +
 gcc/ifcvt.cc  |4 +-
 gcc/target.def|7 +
 .../gcc.target/riscv/zicond_sfb_ifcvt_opt.c   | 1354 +
 6 files changed, 1382 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/riscv/zicond_sfb_ifcvt_opt.c

diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index d0efb939bf2..91fb4ebd653 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -10191,6 +10191,14 @@ riscv_vectorize_related_mode (machine_mode 
vector_mode, scalar_mode element_mode
   return default_vectorize_related_mode (vector_mode, element_mode, nunits);
 }
 
+/* Implement TARGET_HAVE_SFB.  */
+
+bool
+riscv_have_sfb (void)
+{
+  return TARGET_SFB_ALU;
+}
+
 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST.  */
 
 static bool
@@ -10536,6 +10544,10 @@ extract_base_offset_in_addr (rtx mem, rtx *base, rtx 
*offset)
 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_ENDIAN)
 #endif
 
+#undef TARGET_HAVE_SFB
+#define TARGET_HAVE_SFB \
+riscv_have_sfb
+
 #undef TARGET_VECTOR_MODE_SUPPORTED_P
 #define TARGET_VECTOR_MODE_SUPPORTED_P riscv_vector_mode_supported_p
 
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index 645559ea084..9b4e3f71569 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -12149,6 +12149,10 @@ This target hook is required only when the target has 
several different
 modes and they have different conditional execution capability, such as ARM.
 @end deftypefn
 
+@deftypefn {Target Hook} bool TARGET_HAVE_SFB (void)
+This target hook returns true if the target supports Short Forward Branch.
+@end deftypefn
+
 @deftypefn {Target Hook} rtx TARGET_GEN_CCMP_FIRST (rtx_insn **@var{prep_seq}, 
rtx_insn **@var{gen_seq}, rtx_code @var{code}, tree @var{op0}, tree @var{op1})
 This function prepares to emit a comparison insn for the first compare in a
  sequence of conditional comparisions.  It returns an appropriate comparison
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index 4ddc8507ed9..6dac432605f 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -7843,6 +7843,8 @@ lists.
 
 @hook TARGET_HAVE_CONDITIONAL_EXECUTION
 
+@hook TARGET_HAVE_SFB
+
 @hook TARGET_GEN_CCMP_FIRST
 
 @hook TARGET_GEN_CCMP_NEXT
diff --git a/gcc/ifcvt.cc b/gcc/ifcvt.cc
index 4cc6a125ff0..c0f42a7ab1f 100644
--- a/gcc/ifcvt.cc
+++ b/gcc/ifcvt.cc
@@ -3068,10 +3068,12 @@ noce_try_cond_zero_arith (struct noce_if_info *if_info)
&a, &to_replace))
 return false;
 
-  start_sequence ();
+  if (targetm.have_sfb () && CONST_INT_P (*to_replace))
+return false;
 
   bin_code = GET_CODE (bin_exp);
   bin_op0 = XEXP (bin_exp, 0);
+  start_sequence ();
 
   if (CONST_INT_P (*to_replace))
 {
diff --git a/gcc/target.def b/gcc/target.def
index 475c55c22c1..6d9b71e165b 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -2726,6 +2726,13 @@ modes and they have different conditional execution 
capability, such as ARM.",
  bool, (void),
  default_have_conditional_execution)
 
+/* Return true if the target supports SFB.  */
+DEFHOOK
+(have_sfb,
+ "This target hook returns true if the target supports Short Forward Branch.",
+ bool, (void),
+ hook_bool_void_false)
+
 DEFHOOK
 (gen_ccmp_first,
  "This function prepares to emit a comparison insn for the first compare in 
a\n\
diff --git a/gcc/testsuite/gcc.target/riscv/zicond_sfb_ifcvt_opt.c 
b/gcc/testsuite/gcc.target/riscv/zicond_sfb_ifcvt_opt.c
new file mode 100644
index 000..a9cad788956
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/zicond_sfb_ifcvt_opt.c
@@ -0,0 +1,1

[PATCH 3/4] [ifcvt] optimize x=c ? (y op const_int) : y by RISC-V Zicond like insns

2023-11-27 Thread Fei Gao
op=[PLUS, MINUS, IOR, XOR, ASHIFT, ASHIFTRT, LSHIFTRT, ROTATE, ROTATERT]
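
For a concrete picture of the transform, here is the first new test from the
testsuite diff below, together with the rough shape of the expected codegen
(the assembly in the comment is approximate, not taken from the patch):

/* The constant is materialized into a register, zeroed with czero.eqz
   when c is zero, and the binary op is then applied unconditionally.  */
long
test_ADD_ceqz_imm (long x, long y, long c)
{
  if (c)
    x = y + 11;
  else
    x = y;
  return x;
}

/* Expected shape, roughly:
     li         a5,11
     czero.eqz  a5,a5,a2   # a5 = (c == 0) ? 0 : 11
     add        a0,a1,a5  */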

Co-authored-by: Xiao Zeng

gcc/ChangeLog:

* ifcvt.cc (noce_cond_zero_shift_op_supported): Check if OP is a
shift-like operation.
(noce_cond_zero_binary_op_supported): Restructure and call
noce_cond_zero_shift_op_supported.
(noce_bbs_ok_for_cond_zero_arith): Add bin_exp_ptr interface.
(noce_try_cond_zero_arith): Add support for x = c ? (y op const_int) : y.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/zicond_ifcvt_opt.c: Add test cases for
x = c ? (y op const_int) : y.
---
 gcc/ifcvt.cc  |  53 +-
 .../gcc.target/riscv/zicond_ifcvt_opt.c   | 675 +-
 2 files changed, 716 insertions(+), 12 deletions(-)

diff --git a/gcc/ifcvt.cc b/gcc/ifcvt.cc
index 8f6a0e7f5fe..4cc6a125ff0 100644
--- a/gcc/ifcvt.cc
+++ b/gcc/ifcvt.cc
@@ -2920,6 +2920,16 @@ noce_try_sign_mask (struct noce_if_info *if_info)
 
 OP is the operation to check.  */
 
+static bool
+noce_cond_zero_shift_op_supported (enum rtx_code op)
+{
+  if (op == ASHIFT || op == ASHIFTRT || op == LSHIFTRT || op == ROTATE
+  || op == ROTATERT)
+return true;
+
+  return false;
+}
+
 static bool
 noce_cond_zero_binary_op_supported (rtx op)
 {
@@ -2930,8 +2940,7 @@ noce_cond_zero_binary_op_supported (rtx op)
 opcode = GET_CODE (XEXP (op, 0));
 
   if (opcode == PLUS || opcode == MINUS || opcode == IOR || opcode == XOR
-  || opcode == ASHIFT || opcode == ASHIFTRT || opcode == LSHIFTRT
-  || opcode == ROTATE || opcode == ROTATERT)
+  || noce_cond_zero_shift_op_supported (opcode))
 return true;
 
   return false;
@@ -2963,6 +2972,7 @@ get_base_reg (rtx exp)
 
 static bool
 noce_bbs_ok_for_cond_zero_arith (struct noce_if_info *if_info, rtx *common_ptr,
+rtx *bin_exp_ptr,
 enum rtx_code *czero_code_ptr, rtx *a_ptr,
 rtx **to_replace)
 {
@@ -3029,6 +3039,7 @@ noce_bbs_ok_for_cond_zero_arith (struct noce_if_info 
*if_info, rtx *common_ptr,
 return false;
 
   *common_ptr = common;
+  *bin_exp_ptr = bin_exp;
   *czero_code_ptr = czero_code;
   *a_ptr = a;
 
@@ -3047,20 +3058,28 @@ noce_try_cond_zero_arith (struct noce_if_info *if_info)
   machine_mode mode = GET_MODE (if_info->x);
   rtx common = NULL_RTX;
   enum rtx_code czero_code = UNKNOWN;
+  rtx bin_exp = NULL_RTX;
+  enum rtx_code bin_code = UNKNOWN;
+  rtx bin_op0 = NULL_RTX;
   rtx non_zero_op = NULL_RTX;
   rtx *to_replace = NULL;
 
-  if (!noce_bbs_ok_for_cond_zero_arith (if_info, &common, &czero_code, &a,
-   &to_replace))
+  if (!noce_bbs_ok_for_cond_zero_arith (if_info, &common, &bin_exp, 
&czero_code,
+   &a, &to_replace))
 return false;
 
-  /* Disallow CONST_INT currently for simplicity*/
-  if (to_replace == NULL || !REG_P (*to_replace))
-return false;
+  start_sequence ();
 
-  non_zero_op = *to_replace;
+  bin_code = GET_CODE (bin_exp);
+  bin_op0 = XEXP (bin_exp, 0);
 
-  start_sequence ();
+  if (CONST_INT_P (*to_replace))
+{
+  non_zero_op = gen_reg_rtx (mode);
+  noce_emit_move_insn (non_zero_op, *to_replace);
+}
+  else
+non_zero_op = *to_replace;
 
   /* If x is used in both input and out like x = c ? x + z : x,
  use a new reg to avoid modifying x  */
@@ -3076,7 +3095,21 @@ noce_try_cond_zero_arith (struct noce_if_info *if_info)
   return false;
 }
 
-  *to_replace = target;
+  if (CONST_INT_P (*to_replace))
+{
+  if (noce_cond_zero_shift_op_supported (bin_code))
+   {
+ *to_replace = gen_rtx_SUBREG (E_QImode, target, 0);
+ if (GET_CODE (a) == ZERO_EXTEND && bin_code == LSHIFTRT)
+   PUT_CODE (a, SIGN_EXTEND);
+   }
+  else if (SUBREG_P (bin_op0))
+   *to_replace = gen_rtx_SUBREG (GET_MODE (bin_op0), target, 0);
+  else
+   *to_replace = target;
+}
+  else
+*to_replace = target;
   emit_insn (gen_rtx_SET (if_info->x, a));
 
   seq = end_ifcvt_sequence (if_info);
diff --git a/gcc/testsuite/gcc.target/riscv/zicond_ifcvt_opt.c 
b/gcc/testsuite/gcc.target/riscv/zicond_ifcvt_opt.c
index 9357f26d978..c6b0518968b 100644
--- a/gcc/testsuite/gcc.target/riscv/zicond_ifcvt_opt.c
+++ b/gcc/testsuite/gcc.target/riscv/zicond_ifcvt_opt.c
@@ -678,5 +678,676 @@ test_RotateR_eqz_int (unsigned int x, unsigned int y, 
unsigned int z,
   return x;
 }
 
-/* { dg-final { scan-assembler-times {czero\.eqz} 39 } } */
-/* { dg-final { scan-assembler-times {czero\.nez} 28 } } */
+long
+test_ADD_ceqz_imm (long x, long y, long c)
+{
+  if (c)
+x = y + 11;
+  else
+x = y;
+  return x;
+}
+
+long
+test_ADD_ceqz_x_imm (long x, long c)
+{
+  if (c)
+x = x + 11;
+
+  return x;
+}
+
+long
+test_ADD_nez_imm (long x, long y, long c)
+{
+  if (c)
+x = y;
+  else
+x = y + 11;
+  return x;
+}
+
+long
+test_ADD_nez_x_imm (long x, long c)
+{
+  if (c)
+{
+}
+  else
+ 

[PATCH 1/4] [RISC-V] prefer Zicond primitive semantics to SFB

2023-11-27 Thread Fei Gao
Move Zicond md files ahead of SFB to recognize Zicond first.

Take the following case for example.

CFLAGS: -mtune=sifive-7-series -march=rv64gc_zicond -mabi=lp64d

long primitiveSemantics_00(long a, long b) { return a == 0 ? 0 : b; }

before patch:
primitiveSemantics_00:
bne a0,zero,1f  # movcc
mv  a1,zero
1:
mv  a0,a1
ret

after patch:
primitiveSemantics_00:
czero.eqz   a0,a1,a0
ret

Co-authored-by: Xiao Zeng

gcc/ChangeLog:

* config/riscv/riscv.md (*mov<GPR:mode><X:mode>cc): Move to sfb.md.
* config/riscv/sfb.md: New file.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/zicond-sfb-primitiveSemantics.c: New test.
---
 gcc/config/riscv/riscv.md | 19 +--
 gcc/config/riscv/sfb.md   | 37 ++
 .../riscv/zicond-sfb-primitiveSemantics.c | 50 +++
 3 files changed, 88 insertions(+), 18 deletions(-)
 create mode 100644 gcc/config/riscv/sfb.md
 create mode 100644 
gcc/testsuite/gcc.target/riscv/zicond-sfb-primitiveSemantics.c

diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
index 935eeb7fd8e..d020988446f 100644
--- a/gcc/config/riscv/riscv.md
+++ b/gcc/config/riscv/riscv.md
@@ -2711,24 +2711,6 @@
   DONE;
 })
 
-;; Patterns for implementations that optimize short forward branches.
-
-(define_insn "*mov<GPR:mode><X:mode>cc"
-  [(set (match_operand:GPR 0 "register_operand" "=r,r")
-   (if_then_else:GPR
-(match_operator 5 "ordered_comparison_operator"
-   [(match_operand:X 1 "register_operand" "r,r")
-(match_operand:X 2 "reg_or_0_operand" "rJ,rJ")])
-(match_operand:GPR 3 "register_operand" "0,0")
-(match_operand:GPR 4 "sfb_alu_operand" "rJ,IL")))]
-  "TARGET_SFB_ALU"
-  "@
-   b%C5\t%1,%z2,1f\t# movcc\;mv\t%0,%z4\n1:
-   b%C5\t%1,%z2,1f\t# movcc\;li\t%0,%4\n1:"
-  [(set_attr "length" "8")
-   (set_attr "type" "sfb_alu")
-   (set_attr "mode" "")])
-
 ;; Used to implement built-in functions.
 (define_expand "condjump"
   [(set (pc)
@@ -3748,5 +3730,6 @@
 (include "generic-ooo.md")
 (include "vector.md")
 (include "zicond.md")
+(include "sfb.md")
 (include "zc.md")
 (include "corev.md")
diff --git a/gcc/config/riscv/sfb.md b/gcc/config/riscv/sfb.md
new file mode 100644
index 000..52af4b17d46
--- /dev/null
+++ b/gcc/config/riscv/sfb.md
@@ -0,0 +1,37 @@
+;; Machine description for short forward branches(SFB).
+;; Copyright (C) 2023 Free Software Foundation, Inc.
+
+;; This file is part of GCC.
+
+;; GCC is free software; you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 3, or (at your option)
+;; any later version.
+
+;; GCC is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;; GNU General Public License for more details.
+
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; .
+
+
+;; Patterns for implementations that optimize short forward branches.
+
+(define_insn "*mov<GPR:mode><X:mode>cc"
+  [(set (match_operand:GPR 0 "register_operand" "=r,r")
+   (if_then_else:GPR
+(match_operator 5 "ordered_comparison_operator"
+   [(match_operand:X 1 "register_operand" "r,r")
+(match_operand:X 2 "reg_or_0_operand" "rJ,rJ")])
+(match_operand:GPR 3 "register_operand" "0,0")
+(match_operand:GPR 4 "sfb_alu_operand" "rJ,IL")))]
+  "TARGET_SFB_ALU"
+  "@
+   b%C5\t%1,%z2,1f\t# movcc\;mv\t%0,%z4\n1:
+   b%C5\t%1,%z2,1f\t# movcc\;li\t%0,%4\n1:"
+  [(set_attr "length" "8")
+   (set_attr "type" "sfb_alu")
+   (set_attr "mode" "")])
diff --git a/gcc/testsuite/gcc.target/riscv/zicond-sfb-primitiveSemantics.c 
b/gcc/testsuite/gcc.target/riscv/zicond-sfb-primitiveSemantics.c
new file mode 100644
index 000..2c60656d5eb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/zicond-sfb-primitiveSemantics.c
@@ -0,0 +1,50 @@
+/* { dg-do compile } */
+/* { dg-options "-mtune=sifive-7-series -march=rv64gc_zicond -mabi=lp64d" { 
target { rv64 } } } */
+/* { dg-options "-mtune=sifive-7-series -march=rv32gc_zicond -mabi=ilp32f" { 
target { rv32 } } } */
+/* { dg-skip-if "" { *-*-* } {"-O0" "-Og"} } */
+
+long primitiveSemantics_00(long a, long b) { return a == 0 ? 0 : b; }
+
+long primitiveSemantics_01(long a, long b) { return a != 0 ? 0 : b; }
+
+long primitiveSemantics_02(long a, long b) { return a == 0 ? b : 0; }
+
+long primitiveSemantics_03(long a, long b) { return a != 0 ? b : 0; }
+
+long primitiveSemantics_04(long a, long b) {
+  if (a)
+b = 0;
+  return b;
+}
+
+long primitiveSemantics_05(long a, long b) {
+  if (!a)
+b = 0;
+  return b;
+}
+
+int primitiveSemantics_06(int a, int b) { return a == 0 ? 0 : b; }
+
+int primitiveSemantics_07(int a, i

Re: [PATCH] RISC-V: Fix VSETVL PASS regression

2023-11-27 Thread juzhe.zhong
Committed as it passed zvl128/256/512/1024 with no regression.

 Replied Message 
From: Juzhe-Zhong
Date: 11/27/2023 21:24
To: gcc-patches@gcc.gnu.org
Cc: kito.ch...@gmail.com, kito.ch...@sifive.com, jeffreya...@gmail.com, rdapp@gmail.com, Juzhe-Zhong
Subject: [PATCH] RISC-V: Fix VSETVL PASS regression


Re: [PATCH 5/5] diagnostics: don't print annotation lines when there's no column info

2023-11-27 Thread David Malcolm
On Tue, 2023-11-21 at 17:20 -0500, David Malcolm wrote:
> gcc/ChangeLog:
> 	* diagnostic-show-locus.cc (layout::maybe_add_location_range): Don't
> 	print annotation lines for ranges when there's no column info.
> (selftest::test_one_liner_no_column): New.
> (selftest::test_diagnostic_show_locus_one_liner): Call it.

Successfully bootstrapped & regrtested on x86_64-pc-linux-gnu.

Not a bugfix, but low-risk and useful for libdiagnostics, so I've taken
the liberty of pushing this to gcc trunk as r14-5896-g5099525bff4f7c.

Dave



Re: [PATCH 4/5] diagnostics: add diagnostic_context::get_location_text

2023-11-27 Thread David Malcolm
On Tue, 2023-11-21 at 17:20 -0500, David Malcolm wrote:
> No functional change intended.
> 
> gcc/ChangeLog:
> * diagnostic.cc (diagnostic_get_location_text): Convert to...
> (diagnostic_context::get_location_text): ...this, and convert
> return type from char * to label_text.
> (diagnostic_build_prefix): Update for above change.
> (default_diagnostic_start_span_fn): Likewise.
> (selftest::assert_location_text): Likewise.
> * diagnostic.h (diagnostic_context::get_location_text): New
> decl.

Successfully bootstrapped & regrtested on x86_64-pc-linux-gnu.

Not a bugfix, but low-risk and useful for libdiagnostics, so I've taken
the liberty of pushing this to gcc trunk as r14-5895-g93096d3ce14a89.

Dave



Re: [PATCH] libcpp: Fix unsigned promotion for unevaluated divide by zero [PR112701]

2023-11-27 Thread Joseph Myers
On Mon, 27 Nov 2023, Lewis Hyatt wrote:

> Hello-
> 
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112701
> 
> Here is a one-line fix to an edge case in libcpp's expression evaluator
> noted in the PR. Bootstrap + regtest all languages on x86-64 Linux. Is it OK
> please? Thanks!

OK.

-- 
Joseph S. Myers
jos...@codesourcery.com


Re: [PATCH 3/4] c23: aliasing of compatible tagged types

2023-11-27 Thread Joseph Myers
On Sun, 26 Nov 2023, Martin Uecker wrote:

> My understanding is that it is used for aliasing analysis and also
> checking of conversions.  TYPE_CANONICAL must be consistent with
> the idea the middle-end has about type conversions.  But as long
> as we do not give the same TYPE_CANONICAL to types the middle-end
> thinks must be incompatible using its own type checking machinery,
> it should be safe even for types the C standard thinks must be
> incompatible for some reason.

And presumably also for types that definitely can't be assigned because 
they have incompatible layout through the use of different array sizes - 
since the front end won't generate such assignments, it would never matter 
whether the middle end considers them valid without conversion or not?

> > I also think more rationale is needed for ignoring sizes like this.  Is it 
> > intended for e.g. making structs with flexible array members 
> > alias-compatible with similar structs with a fixed-size array?
> 
> The main reason are pointers to arrays:
> 
> struct foo { int (*x)[]; }
> struct foo { int (*x)[2]; };
> struct foo { int (*x)[1]; };

Thanks for the explanation.

I guess the cases involving flexible array members actually show up a bug 
in the standard rather than any kind of issue with this patch - the 
standard allows one structure ending with a flexible array member, and 
another ending with a fixed-size array, to be compatible (and in different 
translation units allowed that even before C23), but there is also clear 
text in the standard showing it's not intended to require the layout to be 
consistent (the fixed-size and flexible arrays might have different 
offsets), and what you'd actually get with an assignment or conditional 
expression mixing such structures certainly isn't clearly specified.  
Maybe the right resolution for that issue with the standard would be to 
make that particular case incompatible, but it would need to be raised as 
an issue after C23 is out.
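
A minimal sketch of that case (hypothetical declarations, one per
translation unit):

/* Translation unit 1: trailing flexible array member.  */
struct s { int n; double a[]; };

/* Translation unit 2: trailing fixed-size array.  Compatible with the
   above under the rules discussed, yet the standard does not require
   the offset of 'a' (or the overall layout) to match in both cases.  */
struct s { int n; double a[4]; };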

-- 
Joseph S. Myers
jos...@codesourcery.com

Re: c: tree: target: C2x (...) function prototypes and va_start relaxation

2023-11-27 Thread Joseph Myers
On Sat, 25 Nov 2023, Gerald Pfeifer wrote:

> On Fri, 21 Oct 2022, Joseph Myers wrote:
> > C2x allows function prototypes to be given as (...), a prototype
> > meaning a variable-argument function with no named arguments.
> 
> I noticed this did not make it into gcc-13/changes.html ? Was that 
> intentional?

It's there (N2975, Relax requirements for variadic parameter lists).

-- 
Joseph S. Myers
jos...@codesourcery.com


[PATCH] libcpp: Fix unsigned promotion for unevaluated divide by zero [PR112701]

2023-11-27 Thread Lewis Hyatt
Hello-

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112701

Here is a one-line fix to an edge case in libcpp's expression evaluator
noted in the PR. Bootstrap + regtest all languages on x86-64 Linux. Is it OK
please? Thanks!

-Lewis

-- >8 --

When libcpp encounters a divide by zero while processing a constant
expression "x/y", it returns "x" as a fallback. The value of the fallback is
not normally important, since an error will be generated anyway, but if the
expression appears in an unevaluated context, such as "0 ? 0/0u : -1", then
there will be no error, and the fallback value will be meaningful to the
extent that it may cause promotion from signed to unsigned of an operand
encountered later. As the PR notes, libcpp does not do the unsigned
promotion correctly in this case; fix it by making the fallback return value
unsigned as necessary.
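
Concretely, for a directive like the following (one of the new tests
below), the fallback for the unevaluated 0/0u must keep its unsigned
type so that -1 is converted to unsigned and the comparison is false:

#if (0 ? 0/0u : -1) < 0
#error /* not reached once the fallback keeps unsignedp */
#endif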

libcpp/ChangeLog:

PR preprocessor/112701
* expr.cc (num_div_op): Set unsignedp appropriately when returning a
stub value for divide by 0.

gcc/testsuite/ChangeLog:

PR preprocessor/112701
* gcc.dg/cpp/expr.c: Add additional tests to cover divide by 0 in an
unevaluated context, where the unsignedness still matters.
---
 libcpp/expr.cc  |  1 +
 gcc/testsuite/gcc.dg/cpp/expr.c | 22 --
 2 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/libcpp/expr.cc b/libcpp/expr.cc
index 825d2c2369d..4f4a9722ac7 100644
--- a/libcpp/expr.cc
+++ b/libcpp/expr.cc
@@ -2216,6 +2216,7 @@ num_div_op (cpp_reader *pfile, cpp_num lhs, cpp_num rhs, 
enum cpp_ttype op,
   if (!pfile->state.skip_eval)
cpp_error_with_line (pfile, CPP_DL_ERROR, location, 0,
 "division by zero in #if");
+  lhs.unsignedp = unsignedp;
   return lhs;
 }
 
diff --git a/gcc/testsuite/gcc.dg/cpp/expr.c b/gcc/testsuite/gcc.dg/cpp/expr.c
index 532bd681237..055e17ae753 100644
--- a/gcc/testsuite/gcc.dg/cpp/expr.c
+++ b/gcc/testsuite/gcc.dg/cpp/expr.c
@@ -1,6 +1,7 @@
 /* Copyright (C) 2000, 2001 Free Software Foundation, Inc.  */
 
 /* { dg-do preprocess } */
+/* { dg-additional-options "-Wall" } */
 
 /* Test we get signedness of ?: operator correct.  We would skip
evaluation of one argument, and might therefore not transfer its
@@ -8,10 +9,27 @@
 
 /* Neil Booth, 19 Jul 2002.  */
 
-#if (1 ? -2: 0 + 1U) < 0
+#if (1 ? -2: 0 + 1U) < 0 /* { dg-warning {the left operand of ":" changes 
sign} } */
 #error /* { dg-bogus "error" } */
 #endif
 
-#if (0 ? 0 + 1U: -2) < 0
+#if (0 ? 0 + 1U: -2) < 0 /* { dg-warning {the right operand of ":" changes 
sign} } */
 #error /* { dg-bogus "error" } */
 #endif
+
+/* PR preprocessor/112701 */
+#if (0 ? 0/0u : -1) < 0 /* { dg-warning {the right operand of ":" changes 
sign} } */
+#error /* { dg-bogus "error" } */
+#endif
+
+#if (0 ? 0u/0 : -1) < 0 /* { dg-warning {the right operand of ":" changes 
sign} } */
+#error /* { dg-bogus "error" } */
+#endif
+
+#if (1 ? -1 : 0/0u) < 0 /* { dg-warning {the left operand of ":" changes sign} 
} */
+#error /* { dg-bogus "error" } */
+#endif
+
+#if (1 ? -1 : 0u/0) < 0 /* { dg-warning {the left operand of ":" changes sign} 
} */
+#error /* { dg-bogus "error" } */
+#endif


Re: [committed v2] libstdc++: Define std::ranges::to for C++23 (P1206R7) [PR111055]

2023-11-27 Thread Hans-Peter Nilsson
> From: Jonathan Wakely 
> Date: Thu, 23 Nov 2023 17:51:38 +

> libstdc++-v3/ChangeLog:
> 
>   PR libstdc++/111055
>   * include/bits/ranges_base.h (from_range_t): Define new tag
>   type.
>   (from_range): Define new tag object.
>   * include/bits/version.def (ranges_to_container): Define.
>   * include/bits/version.h: Regenerate.
>   * include/std/ranges (ranges::to): Define.
>   * testsuite/std/ranges/conv/1.cc: New test.
>   * testsuite/std/ranges/conv/2_neg.cc: New test.
>   * testsuite/std/ranges/conv/version.cc: New test.

JFTR, for the list: this (r14-5794-g7a6a29c455e775) caused
another one of those wonderful "xtreme test" regressions.

Logged as PR112737: "[14 Regression]
g++.dg/modules/xtreme-header-2_b.C -std=c++2b (test for
excess errors)", and pinskia quickly linked it to the
meta-bug for modules issues, PR103524 (thanks!)

IIRC, sometimes those tests show bugs elsewhere, suggesting
lack of coverage in other tests (and not just streaming
aspects), but this one actually mentions modules in key
error messages.

brgds, H-P


Re: [PATCH] fold-mem-offsets: Fix powerpc64le-linux profiledbootstrap [PR111601]

2023-11-27 Thread Andrew Pinski
On Mon, Nov 27, 2023 at 3:51 PM Jakub Jelinek  wrote:
>
> On Mon, Nov 27, 2023 at 09:52:14PM +0100, Jakub Jelinek wrote:
> > On Mon, Oct 16, 2023 at 01:11:01PM -0600, Jeff Law wrote:
> > > > gcc/ChangeLog:
> > > >
> > > >   * Makefile.in: Add fold-mem-offsets.o.
> > > >   * passes.def: Schedule a new pass.
> > > >   * tree-pass.h (make_pass_fold_mem_offsets): Declare.
> > > >   * common.opt: New options.
> > > >   * doc/invoke.texi: Document new option.
> > > >   * fold-mem-offsets.cc: New file.
> > > >
> > > > gcc/testsuite/ChangeLog:
> > > >
> > > >   * gcc.target/riscv/fold-mem-offsets-1.c: New test.
> > > >   * gcc.target/riscv/fold-mem-offsets-2.c: New test.
> > > >   * gcc.target/riscv/fold-mem-offsets-3.c: New test.
> > > Thanks, I've pushed this to the trunk.
> >
> > This breaks profiledbootstrap on powerpc64le-linux.
> > >From what I can see, the pass works one basic block at a time and
> > will punt on any non-DEBUG_INSN uses outside of the current block
> > (I believe because of the
> >   /* This use affects instructions outside of CAN_FOLD_INSNS.  */
> >   if (!bitmap_bit_p (&can_fold_insns, INSN_UID (use)))
> > return 0;
> > test and can_fold_insns only set in do_analysis (when processing insns in
> > current bb, cleared at the end) or results of get_single_def_in_bb
> > (which are checked to be in the same bb).
> > But, while get_single_def_in_bb checks for
> >   if (DF_INSN_LUID (def) > DF_INSN_LUID (insn))
> > return NULL;
> > (OT, why not DF_INSN_INFO_LUID (DF_REF_INSN_INFO (ref_chain->ref))
> > instead of DF_INSN_LUID (def), then it doesn't need to look up
> > DF_INSN_INFO_GET (def)?), nothing when walking all uses of def does such
> > luid check.
> > The basic block in the PR in question has:
> > ...
> > (insn 212 210 215 25 (set (mem/f:DI (reg/v/f:DI 10 10 [orig:152 last_viable 
> > ] [152]) [2 *last_viable_336+0 S8 A64])
> > (reg/f:DI 9 9 [orig:155 _342 ] [155])) "pr111601.ii":50:17 683 
> > {*movdi_internal64}
> >  (expr_list:REG_DEAD (reg/v/f:DI 10 10 [orig:152 last_viable ] [152])
> > (nil)))
> > (insn 215 212 484 25 (set (reg:DI 5 5 [226])
> > (const_int 0 [0])) "pr111601.ii":52:12 683 {*movdi_internal64}
> >  (expr_list:REG_EQUIV (const_int 0 [0])
> > (nil)))
> > (insn 484 215 218 25 (set (reg/v/f:DI 10 10 [orig:152 last_viable ] [152])
> > (reg/f:DI 9 9 [orig:155 _342 ] [155])) "pr111601.ii":52:12 683 
> > {*movdi_internal64}
> >  (nil))
> > ...
> > (insn 564 214 216 25 (set (reg/v/f:DI 10 10 [orig:152 last_viable ] [152])
> > (plus:DI (reg/v/f:DI 10 10 [orig:152 last_viable ] [152])
> > (const_int 96 [0x60]))) "pr111601.ii":52:12 66 {*adddi3}
> >  (nil))
> > (insn 216 564 219 25 (set (mem/f:DI (reg/v/f:DI 10 10 [orig:152 last_viable 
> > ] [152]) [2 _343->next+0 S8 A64])
> > (reg:DI 5 5 [226])) "pr111601.ii":52:12 683 {*movdi_internal64}
> >  (expr_list:REG_DEAD (reg:DI 5 5 [226])
> > (nil)))
> > ...
> > and when asking for all uses of %r10 from def 564, it will see uses
> > in 216 and 212; the former is after the += 96 addition and gets changed
> > to load from %r10+96 with the addition being dropped, but there is
> > the other store which is a use across the backedge and when reached
> > from other edges certainly doesn't have the + 96 addition anywhere,
> > so the pass doesn't actually change that location.
> >
> > Haven't bootstrapped/regtested this yet, will start momentarily,
> > posting here just in case I'm missing something important.
>
> That version failed bootstrap because DF_INSN_LUID is signed int, not
> unsigned.
>
> Here is what so far passed on powerpc64le-linux
> ../configure --enable-languages=c,c++ --enable-checking=yes,rtl,extra 
> --disable-libsanitizer --with-long-double-128
> make -j160 profiledbootstrap
> which has been failing for more than a month now.
>
> I've noticed a couple of small formatting issues and fixed them too,
> doing normal {powerpc64le,x86_64,i686}-linux bootstraps/regtests:
>
> 2023-11-27  Jakub Jelinek  
>
> PR bootstrap/111601
> * fold-mem-offsets.cc (fold_offsets): Punt if use appears before
> def in the basic block.
> (get_single_def_in_bb, get_uses): Formatting fixes.
> (fold_offsets_1, pass_fold_mem_offsets::execute): Comment formatting
> fixes.
>
> * g++.dg/opt/pr111601.C: New test.
>
> --- gcc/fold-mem-offsets.cc.jj  2023-11-02 07:49:17.060865772 +0100
> +++ gcc/fold-mem-offsets.cc 2023-11-27 22:47:21.128591332 +0100
> @@ -154,7 +154,7 @@ static int stats_fold_count;
> The definition is desired for REG used in INSN.
> Return the definition insn or NULL if there's no definition with
> the desired criteria.  */
> -static rtx_insn*
> +static rtx_insn *
>  get_single_def_in_bb (rtx_insn *insn, rtx reg)
>  {
>df_ref use;
> @@ -205,7 +205,7 @@ get_single_def_in_bb (rtx_insn *insn, rt
>  /* Get all uses of REG which is set in INSN.  R

[PATCH] fold-mem-offsets: Fix powerpc64le-linux profiledbootstrap [PR111601]

2023-11-27 Thread Jakub Jelinek
On Mon, Nov 27, 2023 at 09:52:14PM +0100, Jakub Jelinek wrote:
> On Mon, Oct 16, 2023 at 01:11:01PM -0600, Jeff Law wrote:
> > > gcc/ChangeLog:
> > > 
> > >   * Makefile.in: Add fold-mem-offsets.o.
> > >   * passes.def: Schedule a new pass.
> > >   * tree-pass.h (make_pass_fold_mem_offsets): Declare.
> > >   * common.opt: New options.
> > >   * doc/invoke.texi: Document new option.
> > >   * fold-mem-offsets.cc: New file.
> > > 
> > > gcc/testsuite/ChangeLog:
> > > 
> > >   * gcc.target/riscv/fold-mem-offsets-1.c: New test.
> > >   * gcc.target/riscv/fold-mem-offsets-2.c: New test.
> > >   * gcc.target/riscv/fold-mem-offsets-3.c: New test.
> > Thanks, I've pushed this to the trunk.
> 
> This breaks profiledbootstrap on powerpc64le-linux.
> >From what I can see, the pass works one basic block at a time and
> will punt on any non-DEBUG_INSN uses outside of the current block
> (I believe because of the
>   /* This use affects instructions outside of CAN_FOLD_INSNS.  */
>   if (!bitmap_bit_p (&can_fold_insns, INSN_UID (use)))
> return 0;
> test and can_fold_insns only set in do_analysis (when processing insns in
> current bb, cleared at the end) or results of get_single_def_in_bb
> (which are checked to be in the same bb).
> But, while get_single_def_in_bb checks for
>   if (DF_INSN_LUID (def) > DF_INSN_LUID (insn))
> return NULL;
> (OT, why not DF_INSN_INFO_LUID (DF_REF_INSN_INFO (ref_chain->ref))
> instead of DF_INSN_LUID (def), then it doesn't need to look up
> DF_INSN_INFO_GET (def)?), nothing when walking all uses of def does such
> luid check.
> The basic block in the PR in question has:
> ...
> (insn 212 210 215 25 (set (mem/f:DI (reg/v/f:DI 10 10 [orig:152 last_viable ] 
> [152]) [2 *last_viable_336+0 S8 A64])
> (reg/f:DI 9 9 [orig:155 _342 ] [155])) "pr111601.ii":50:17 683 
> {*movdi_internal64}
>  (expr_list:REG_DEAD (reg/v/f:DI 10 10 [orig:152 last_viable ] [152])
> (nil)))
> (insn 215 212 484 25 (set (reg:DI 5 5 [226])
> (const_int 0 [0])) "pr111601.ii":52:12 683 {*movdi_internal64}
>  (expr_list:REG_EQUIV (const_int 0 [0])
> (nil)))
> (insn 484 215 218 25 (set (reg/v/f:DI 10 10 [orig:152 last_viable ] [152])
> (reg/f:DI 9 9 [orig:155 _342 ] [155])) "pr111601.ii":52:12 683 
> {*movdi_internal64}
>  (nil))
> ...
> (insn 564 214 216 25 (set (reg/v/f:DI 10 10 [orig:152 last_viable ] [152])
> (plus:DI (reg/v/f:DI 10 10 [orig:152 last_viable ] [152])
> (const_int 96 [0x60]))) "pr111601.ii":52:12 66 {*adddi3}
>  (nil))
> (insn 216 564 219 25 (set (mem/f:DI (reg/v/f:DI 10 10 [orig:152 last_viable ] 
> [152]) [2 _343->next+0 S8 A64])
> (reg:DI 5 5 [226])) "pr111601.ii":52:12 683 {*movdi_internal64}
>  (expr_list:REG_DEAD (reg:DI 5 5 [226])
> (nil)))
> ...
> and when asking for all uses of %r10 from def 564, it will see uses
> in 216 and 212; the former is after the += 96 addition and gets changed
> to load from %r10+96 with the addition being dropped, but there is
> the other store which is a use across the backedge and when reached
> from other edges certainly doesn't have the + 96 addition anywhere,
> so the pass doesn't actually change that location.
> 
> Haven't bootstrapped/regtested this yet, will start momentarily,
> posting here just in case I'm missing something important.

That version failed bootstrap because DF_INSN_LUID is signed int, not
unsigned.

Here is what so far passed on powerpc64le-linux
../configure --enable-languages=c,c++ --enable-checking=yes,rtl,extra 
--disable-libsanitizer --with-long-double-128
make -j160 profiledbootstrap
which has been failing for more than a month now.

I've noticed a couple of small formatting issues and fixed them too,
doing normal {powerpc64le,x86_64,i686}-linux bootstraps/regtests:

2023-11-27  Jakub Jelinek  

PR bootstrap/111601
* fold-mem-offsets.cc (fold_offsets): Punt if use appears before
def in the basic block.
(get_single_def_in_bb, get_uses): Formatting fixes.
(fold_offsets_1, pass_fold_mem_offsets::execute): Comment formatting
fixes.

* g++.dg/opt/pr111601.C: New test.
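
The C-level shape of the problem is roughly this (a hand-reduced,
hypothetical sketch with made-up names; the real reproducer is the new
g++.dg/opt/pr111601.C test):

/* last_viable is carried across the loop back edge, so the same base
   register feeds both the plain store (insn 212 above) and the
   offsetted store (insn 216); folding the += 96 into only the latter
   use is unsound.  */
struct viable { char pad[96]; struct viable *next; };

void
chain (struct viable **last_viable, struct viable *(*next_one) (void))
{
  for (struct viable *v = next_one (); v; v = next_one ())
    {
      *last_viable = v;         /* store through the old pointer */
      v->next = 0;              /* store at offset 96 off the same base */
      last_viable = &v->next;   /* the offsetted pointer reaches the next
                                   iteration's plain store */
    }
}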

--- gcc/fold-mem-offsets.cc.jj  2023-11-02 07:49:17.060865772 +0100
+++ gcc/fold-mem-offsets.cc 2023-11-27 22:47:21.128591332 +0100
@@ -154,7 +154,7 @@ static int stats_fold_count;
The definition is desired for REG used in INSN.
Return the definition insn or NULL if there's no definition with
the desired criteria.  */
-static rtx_insn*
+static rtx_insn *
 get_single_def_in_bb (rtx_insn *insn, rtx reg)
 {
   df_ref use;
@@ -205,7 +205,7 @@ get_single_def_in_bb (rtx_insn *insn, rt
 /* Get all uses of REG which is set in INSN.  Return the use list or NULL if a
use is missing / irregular.  If SUCCESS is not NULL then set it to false if
there are missing / irregular uses and true otherwise.  */
-static struct df_link*
+static struct df_link *
 get_uses (rtx_insn *insn, rtx reg, 

Re: [PATCH] rs6000: Disable PCREL for unsupported targets [PR111045]

2023-11-27 Thread Michael Meissner
On Fri, Nov 10, 2023 at 06:03:40PM -0600, Peter Bergner wrote:
> On 8/25/23 6:20 AM, Kewen.Lin wrote:
> > btw, I was also expecting that we don't implicitly set
> > OPTION_MASK_PCREL any more for Power10, that is to remove
> > OPTION_MASK_PCREL from OTHER_POWER10_MASKS.
> 
> So my patch removes the flag from the default power10 flags, like
> you want.  However, it doesn't remove it from OTHER_POWER10_MASKS,
> since that is used to set ISA_3_1_MASKS_SERVER and I didn't want
> to change how rs6000_machine_from_flags() behaves, so instead, I
> just explicitly mask it off when defining the power10 default flags.

Historically, the reason behind the two methods is that they were done by
different people in parallel.  I had done the mask first in my pc-rel
patches, but it took a long time to integrate those into the compiler.
Bill Schmidt did the functions as part of another change (rewriting the
built-ins, maybe).

But prefixed and pc-rel support cannot be added willy-nilly with
-mcpu=power10, because it has to conform with other parts of the system
(assembler, linker, ABIs, etc.).

-- 
Michael Meissner, IBM
PO Box 98, Ayer, Massachusetts, USA, 01432
email: meiss...@linux.ibm.com


[PATCH] aarch64: Improve cost of `a ? {-,}1 : b`

2023-11-27 Thread Andrew Pinski
While looking into PR 112454, I found the cost for
`(if_then_else (cmp) (const_int 1) (reg))` was being recorded as 8
(or `COSTS_N_INSNS (2)`) when it should have been 4 (or `COSTS_N_INSNS (1)`).
This patch improves the costing by not adding the cost of the
`(const_int 1)` operand to the total.

It does not fully fix PR 112454, as that requires other changes to forwprop
the `(const_int 1)` earlier than combine.  Though we do fix the loop case
where the constant is only used once.
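
A sketch of the affected construct (the function name is made up, and the
code in the comment is approximate; the new test below covers the loop
variant):

int
g (int a, int b)
{
  /* Roughly: cmp w0, 0; csinc w0, w1, wzr, ne -- the constant arm is
     folded into the csinc, so it should not be costed separately.  */
  return a == 0 ? 1 : b;
}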

Committed as approved after bootstrapped and tested on aarch64-linux-gnu with 
no regressions.

gcc/ChangeLog:

* config/aarch64/aarch64.cc (aarch64_if_then_else_costs):
Handle csinv/csinc case of 1/-1.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/csinc-3.c: New test.

Signed-off-by: Andrew Pinski 
---
 gcc/config/aarch64/aarch64.cc  | 12 
 gcc/testsuite/gcc.target/aarch64/csinc-3.c | 10 ++
 2 files changed, 22 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/csinc-3.c

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index b2093430937..4fd8c2de43a 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -11607,6 +11607,18 @@ aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, 
int *cost, bool speed)
/* CSINV/NEG with zero extend + const 0 (*csinv3_uxtw_insn3).  */
op1 = XEXP (inner, 0);
}
+  else if (op1 == constm1_rtx || op1 == const1_rtx)
+   {
+ /* Use CSINV or CSINC.  */
+ *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
+ return true;
+   }
+  else if (op2 == constm1_rtx || op2 == const1_rtx)
+   {
+ /* Use CSINV or CSINC.  */
+ *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
+ return true;
+   }
 
   *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
   *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
diff --git a/gcc/testsuite/gcc.target/aarch64/csinc-3.c 
b/gcc/testsuite/gcc.target/aarch64/csinc-3.c
new file mode 100644
index 000..bde131a584e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/csinc-3.c
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fno-tree-vectorize" } */
+
+int f(int *a, int n, int *b, int d)
+{
+  for(int i = 0; i < n; i++)
+b[i] = a[i] == 100 ? 1 : d;
+  /* { dg-final { scan-assembler "csinc\tw\[0-9\].*wzr" } } */
+  return 0;
+}
-- 
2.34.1



[COMMITTED] Fix time-profiler-3.c after r14-5628-g53ba8d669550d3

2023-11-27 Thread Andrew Pinski
This testcase started to fail after r14-5628-g53ba8d669550d3 because
IPA-VRP can now figure out that the functions return a constant value,
leaving nothing for the profiler to instrument.  This disables IPA-VRP
for this testcase so it can be profiled again.

Bootstrapped/tested on x86_64-linux-gnu with no regressions.

gcc/testsuite/ChangeLog:

* gcc.dg/tree-prof/time-profiler-3.c: Add -fno-ipa-vrp.
---
 gcc/testsuite/gcc.dg/tree-prof/time-profiler-3.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.dg/tree-prof/time-profiler-3.c 
b/gcc/testsuite/gcc.dg/tree-prof/time-profiler-3.c
index 69ce0260828..e54a06a7827 100644
--- a/gcc/testsuite/gcc.dg/tree-prof/time-profiler-3.c
+++ b/gcc/testsuite/gcc.dg/tree-prof/time-profiler-3.c
@@ -1,4 +1,4 @@
-/* { dg-options "-O2 -fdump-ipa-profile -fprofile-update=atomic" } */
+/* { dg-options "-O2 -fdump-ipa-profile -fprofile-update=atomic -fno-ipa-vrp" 
} */
 /* { dg-require-effective-target profile_update_atomic } */
 
 __attribute__ ((noinline))
-- 
2.39.3



RE: [PATCH 9/21]middle-end: implement vectorizable_early_exit for codegen of exit code

2023-11-27 Thread Tamar Christina
Ping

> -Original Message-
> From: Tamar Christina 
> Sent: Monday, November 6, 2023 7:40 AM
> To: gcc-patches@gcc.gnu.org
> Cc: nd ; rguent...@suse.de; j...@ventanamicro.com
> Subject: [PATCH 9/21]middle-end: implement vectorizable_early_exit for
> codegen of exit code
> 
> Hi All,
> 
> This implements vectorable_early_exit which is used as the codegen part of
> vectorizing a gcond.
> 
> For the most part it shares the majority of the code with
> vectorizable_comparison with addition that it needs to be able to reduce
> multiple resulting statements into a single one for use in the gcond, and also
> needs to be able to perform masking on the comparisons.
> 
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> 
> Ok for master?
> 
> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
>   * tree-vect-stmts.cc (vectorizable_comparison_1): Support stmts
> without
>   lhs.
>   (vectorizable_early_exit): New.
>   (vect_analyze_stmt, vect_transform_stmt): Use it.
>   (vect_is_simple_use, vect_get_vector_types_for_stmt): Support
> gcond.
> 
> --- inline copy of patch --
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index
> 36aeca60a22cfaea8d3b43348000d75de1d525c7..4809b822632279493a84
> 3d402a833c9267bb315e 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -12475,7 +12475,7 @@ vectorizable_comparison_1 (vec_info *vinfo,
> tree vectype,
>vec<tree> vec_oprnds0 = vNULL;
>vec<tree> vec_oprnds1 = vNULL;
>tree mask_type;
> -  tree mask;
> +  tree mask = NULL_TREE;
> 
>if (!STMT_VINFO_RELEVANT_P (stmt_info) && !bb_vinfo)
>  return false;
> @@ -12615,8 +12615,9 @@ vectorizable_comparison_1 (vec_info *vinfo,
> tree vectype,
>/* Transform.  */
> 
>/* Handle def.  */
> -  lhs = gimple_assign_lhs (STMT_VINFO_STMT (stmt_info));
> -  mask = vect_create_destination_var (lhs, mask_type);
> +  lhs = gimple_get_lhs (STMT_VINFO_STMT (stmt_info));  if (lhs)
> +mask = vect_create_destination_var (lhs, mask_type);
> 
>vect_get_vec_defs (vinfo, stmt_info, slp_node, ncopies,
>rhs1, &vec_oprnds0, vectype,
> @@ -12630,7 +12631,10 @@ vectorizable_comparison_1 (vec_info *vinfo,
> tree vectype,
>gimple *new_stmt;
>vec_rhs2 = vec_oprnds1[i];
> 
> -  new_temp = make_ssa_name (mask);
> +  if (lhs)
> + new_temp = make_ssa_name (mask);
> +  else
> + new_temp = make_temp_ssa_name (mask_type, NULL, "cmp");
>if (bitop1 == NOP_EXPR)
>   {
> new_stmt = gimple_build_assign (new_temp, code, @@ -12709,6
> +12713,196 @@ vectorizable_comparison (vec_info *vinfo,
>return true;
>  }
> 
> +/* Check to see if the current early break given in STMT_INFO is valid for
> +   vectorization.  */
> +
> +static bool
> +vectorizable_early_exit (vec_info *vinfo, stmt_vec_info stmt_info,
> +  gimple_stmt_iterator *gsi, gimple **vec_stmt,
> +  slp_tree slp_node, stmt_vector_for_cost *cost_vec) {
> +  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
> +  if (!loop_vinfo
> +  || !is_a <gcond *> (STMT_VINFO_STMT (stmt_info)))
> +return false;
> +
> +  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_early_exit_def)
> +return false;
> +
> +  if (!STMT_VINFO_RELEVANT_P (stmt_info))
> +return false;
> +
> +  gimple_match_op op;
> +  if (!gimple_extract_op (stmt_info->stmt, &op))
> +gcc_unreachable ();
> +  gcc_assert (op.code.is_tree_code ());  auto code = tree_code
> + (op.code);
> +
> +  tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);  gcc_assert
> + (vectype_out);
> +
> +  tree var_op = op.ops[0];
> +
> +  /* When vectorizing things like pointer comparisons we will assume that
> + the VF of both operands are the same. e.g. a pointer must be compared
> + to a pointer.  We'll leave this up to vectorizable_comparison_1 to
> + check further.  */
> +  tree vectype_op = vectype_out;
> +  if (SSA_VAR_P (var_op))
> +{
> +  stmt_vec_info operand0_info
> + = loop_vinfo->lookup_stmt (SSA_NAME_DEF_STMT (var_op));
> +  if (!operand0_info)
> + return false;
> +
> +  /* If we're in a pattern get the type of the original statement.  */
> +  if (STMT_VINFO_IN_PATTERN_P (operand0_info))
> + operand0_info = STMT_VINFO_RELATED_STMT (operand0_info);
> +  vectype_op = STMT_VINFO_VECTYPE (operand0_info);
> +}
> +
> +  tree truth_type = truth_type_for (vectype_op);  machine_mode mode =
> + TYPE_MODE (truth_type);  int ncopies;
> +
> +  if (slp_node)
> +ncopies = 1;
> +  else
> +ncopies = vect_get_num_copies (loop_vinfo, truth_type);
> +
> +  vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);  bool
> + masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
> +
> +  /* Analyze only.  */
> +  if (!vec_stmt)
> +{
> +  if (direct_optab_handler (cbranch_optab, mode) == CODE_FOR_nothing)
> + {
> +   if (dump_enabled_p ())
> +   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +  

RE: [PATCH 10/21]middle-end: implement relevancy analysis support for control flow

2023-11-27 Thread Tamar Christina
Ping

> -Original Message-
> From: Tamar Christina 
> Sent: Monday, November 6, 2023 7:40 AM
> To: gcc-patches@gcc.gnu.org
> Cc: nd ; rguent...@suse.de; j...@ventanamicro.com
> Subject: [PATCH 10/21]middle-end: implement relevancy analysis support for
> control flow
> 
> Hi All,
> 
> This updates relevancy analysis to support marking gcond's belonging to early
> breaks as relevant for vectorization.
> 
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> 
> Ok for master?
> 
> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
>   * tree-vect-stmts.cc (vect_stmt_relevant_p,
>   vect_mark_stmts_to_be_vectorized, vect_analyze_stmt,
> vect_is_simple_use,
>   vect_get_vector_types_for_stmt): Support early breaks.
> 
> --- inline copy of patch --
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index
> 4809b822632279493a843d402a833c9267bb315e..31474e923cc3feb2604
> ca2882ecfb300cd211679 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -359,9 +359,14 @@ vect_stmt_relevant_p (stmt_vec_info stmt_info,
> loop_vec_info loop_vinfo,
>*live_p = false;
> 
>/* cond stmt other than loop exit cond.  */
> -  if (is_ctrl_stmt (stmt_info->stmt)
> -  && STMT_VINFO_TYPE (stmt_info) != loop_exit_ctrl_vec_info_type)
> -*relevant = vect_used_in_scope;
> +  gimple *stmt = STMT_VINFO_STMT (stmt_info);
> +  if (is_ctrl_stmt (stmt) && is_a <gcond *> (stmt))
> +{
> +  gcond *cond = as_a <gcond *> (stmt);
> +  if (LOOP_VINFO_LOOP_CONDS (loop_vinfo).contains (cond)
> +   && LOOP_VINFO_LOOP_IV_COND (loop_vinfo) != cond)
> + *relevant = vect_used_in_scope;
> +}
> 
>/* changing memory.  */
>if (gimple_code (stmt_info->stmt) != GIMPLE_PHI) @@ -374,6 +379,11 @@
> vect_stmt_relevant_p (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
>   *relevant = vect_used_in_scope;
>}
> 
> +  auto_vec<edge> exits = get_loop_exit_edges (loop);
> +  auto_bitmap exit_bbs;
> +  for (edge exit : exits)
> +    bitmap_set_bit (exit_bbs, exit->dest->index);
> +
>/* uses outside the loop.  */
>FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter,
> SSA_OP_DEF)
>  {
> @@ -392,7 +402,6 @@ vect_stmt_relevant_p (stmt_vec_info stmt_info,
> loop_vec_info loop_vinfo,
> /* We expect all such uses to be in the loop exit phis
>(because of loop closed form)   */
> gcc_assert (gimple_code (USE_STMT (use_p)) == GIMPLE_PHI);
> -   gcc_assert (bb == single_exit (loop)->dest);
> 
>*live_p = true;
>   }
> @@ -793,6 +802,20 @@ vect_mark_stmts_to_be_vectorized (loop_vec_info
> loop_vinfo, bool *fatal)
>   return res;
>   }
>   }
> + }
> +   else if (gcond *cond = dyn_cast <gcond *> (stmt_vinfo->stmt))
> + {
> +   enum tree_code rhs_code = gimple_cond_code (cond);
> +   gcc_assert (TREE_CODE_CLASS (rhs_code) == tcc_comparison);
> +   opt_result res
> + = process_use (stmt_vinfo, gimple_cond_lhs (cond),
> +loop_vinfo, relevant, &worklist, false);
> +   if (!res)
> + return res;
> +   res = process_use (stmt_vinfo, gimple_cond_rhs (cond),
> + loop_vinfo, relevant, &worklist, false);
> +   if (!res)
> + return res;
>  }
> else if (gcall *call = dyn_cast <gcall *> (stmt_vinfo->stmt))
>   {
> @@ -13043,11 +13066,15 @@ vect_analyze_stmt (vec_info *vinfo,
>node_instance, cost_vec);
>if (!res)
>   return res;
> -   }
> +}
> +
> +  if (is_ctrl_stmt (stmt_info->stmt))
> +STMT_VINFO_DEF_TYPE (stmt_info) = vect_early_exit_def;
> 
>switch (STMT_VINFO_DEF_TYPE (stmt_info))
>  {
>case vect_internal_def:
> +  case vect_early_exit_def:
>  break;
> 
>case vect_reduction_def:
> @@ -13080,6 +13107,7 @@ vect_analyze_stmt (vec_info *vinfo,
>  {
>gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
>gcc_assert (STMT_VINFO_VECTYPE (stmt_info)
> +   || gimple_code (stmt_info->stmt) == GIMPLE_COND
> || (call && gimple_call_lhs (call) == NULL_TREE));
>*need_to_vectorize = true;
>  }
> @@ -13835,6 +13863,14 @@ vect_is_simple_use (vec_info *vinfo,
> stmt_vec_info stmt, slp_tree slp_node,
> else
>   *op = gimple_op (ass, operand + 1);
>   }
> +  else if (gcond *cond = dyn_cast <gcond *> (stmt->stmt))
> + {
> +   gimple_match_op m_op;
> +   if (!gimple_extract_op (cond, &m_op))
> + return false;
> +   gcc_assert (m_op.code.is_tree_code ());
> +   *op = m_op.ops[operand];
> + }
>else if (gcall *call = dyn_cast <gcall *> (stmt->stmt))
>   *op = gimple_call_arg (call, operand);
>else
> @@ -14445,6 +14481,8 @@ vect_get_vector_types_for_stmt (vec_info
> *vinfo, stmt_vec_info stmt_info,
>*nunits_vectype_out = NULL_TREE;
> 
>i

RE: [PATCH 12/21]middle-end: Add remaining changes to peeling and vectorizer to support early breaks

2023-11-27 Thread Tamar Christina
Ping

> -Original Message-
> From: Tamar Christina 
> Sent: Monday, November 6, 2023 7:41 AM
> To: gcc-patches@gcc.gnu.org
> Cc: nd ; rguent...@suse.de; j...@ventanamicro.com
> Subject: [PATCH 12/21]middle-end: Add remaining changes to peeling and
> vectorizer to support early breaks
> 
> Hi All,
> 
> This finishes wiring that didn't fit in any of the other patches.
> Essentially just adding related changes so peeling for early break works.
> 
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> 
> Ok for master?
> 
> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
>   * tree-vect-loop-manip.cc (vect_set_loop_condition_normal,
>   vect_do_peeling): Support early breaks.
>   * tree-vect-loop.cc (vect_need_peeling_or_partial_vectors_p):
> Likewise.
>   * tree-vectorizer.cc (pass_vectorize::execute): Check all exits.
> 
> --- inline copy of patch --
> diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc index
> eef2bb50c1505f5cf802d5d80300affc2cbe69f6..9c1405d79fd8fe8689007df
> 3b7605b7a3d3ecdd7 100644
> --- a/gcc/tree-vect-loop-manip.cc
> +++ b/gcc/tree-vect-loop-manip.cc
> @@ -1187,7 +1187,7 @@ vect_set_loop_condition_partial_vectors_avx512
> (class loop *loop,
> loop handles exactly VF scalars per iteration.  */
> 
>  static gcond *
> -vect_set_loop_condition_normal (loop_vec_info /* loop_vinfo */, edge
> exit_edge,
> +vect_set_loop_condition_normal (loop_vec_info loop_vinfo, edge
> +exit_edge,
>   class loop *loop, tree niters, tree step,
>   tree final_iv, bool niters_maybe_zero,
>   gimple_stmt_iterator loop_cond_gsi) @@ -
> 1296,7 +1296,8 @@ vect_set_loop_condition_normal (loop_vec_info /*
> loop_vinfo */, edge exit_edge,
>gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
> 
>/* Record the number of latch iterations.  */
> -  if (limit == niters)
> +  if (limit == niters
> +  || LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
>  /* Case A: the loop iterates NITERS times.  Subtract one to get the
> latch count.  */
>  loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters, @@ -
> 3242,6 +3243,16 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters,
> tree nitersm1,
>  bound_epilog += vf - 1;
>if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
>  bound_epilog += 1;
> +
> +  /* For early breaks the scalar loop needs to execute at most VF times
> + to find the element that caused the break.  */  if
> + (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
> +{
> +  bound_epilog = vf;
> +  /* Force a scalar epilogue as we can't vectorize the index finding.  */
> +  vect_epilogues = false;
> +}
> +
>bool epilog_peeling = maybe_ne (bound_epilog, 0U);
>poly_uint64 bound_scalar = bound_epilog;
> 
> @@ -3376,14 +3387,23 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree
> niters, tree nitersm1,
> bound_prolog + bound_epilog)
> : (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
>|| vect_epilogues));
> +
> +  /* We only support early break vectorization on known bounds at this time.
> + This means that if the vector loop can't be entered then we won't 
> generate
> + it at all.  So for now force skip_vector off because the additional 
> control
> + flow messes with the BB exits and we've already analyzed them.  */
> + skip_vector = skip_vector && !LOOP_VINFO_EARLY_BREAKS (loop_vinfo);
> +
>/* Epilog loop must be executed if the number of iterations for epilog
>   loop is known at compile time, otherwise we need to add a check at
>   the end of vector loop and skip to the end of epilog loop.  */
>bool skip_epilog = (prolog_peeling < 0
> || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
> || !vf.is_constant ());
> -  /* PEELING_FOR_GAPS is special because epilog loop must be executed.  */
> -  if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
> +  /* PEELING_FOR_GAPS and peeling for early breaks are special because
> epilog
> + loop must be executed.  */
> +  if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
> +  || LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
>  skip_epilog = false;
> 
>class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo); diff --git
> a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index
> 55d6aee3d29151e6b528f6fdde15c693e5bdd847..51a054c5b035ac80dfbbf
> 3b5ba2f6da82fda91f6 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -1236,6 +1236,14 @@ vect_need_peeling_or_partial_vectors_p
> (loop_vec_info loop_vinfo)
>  th = LOOP_VINFO_COST_MODEL_THRESHOLD
> (LOOP_VINFO_ORIG_LOOP_INFO
> (loop_vinfo));
> 
> +  /* When we have multiple exits and VF is unknown, we must require partial
> + vectors because the loop bounds is not a minimum but a maximum.  That
> is to
> + say we cannot unpredicate the

RE: [PATCH 13/21]middle-end: Update loop form analysis to support early break

2023-11-27 Thread Tamar Christina
Ping

> -Original Message-
> From: Tamar Christina 
> Sent: Monday, November 6, 2023 7:41 AM
> To: gcc-patches@gcc.gnu.org
> Cc: nd ; rguent...@suse.de; j...@ventanamicro.com
> Subject: [PATCH 13/21]middle-end: Update loop form analysis to support
> early break
> 
> Hi All,
> 
> This sets LOOP_VINFO_EARLY_BREAKS and does some misc changes so the
> other patches are self contained.
> 
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> 
> Ok for master?
> 
> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
>   * tree-vect-loop.cc (vect_analyze_loop_form): Analyse all exits.
>   (vect_create_loop_vinfo): Set LOOP_VINFO_EARLY_BREAKS.
>   (vect_transform_loop): Use it.
> 
> --- inline copy of patch --
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index
> 51a054c5b035ac80dfbbf3b5ba2f6da82fda91f6..f9483eff6e9606e835906fb
> 991f07cd6052491d0 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -1700,12 +1700,12 @@ vect_compute_single_scalar_iteration_cost
> (loop_vec_info loop_vinfo)
>loop_vinfo->scalar_costs->finish_cost (nullptr);  }
> 
> -
>  /* Function vect_analyze_loop_form.
> 
> Verify that certain CFG restrictions hold, including:
> - the loop has a pre-header
> -   - the loop has a single entry and exit
> +   - the loop has a single entry
> +   - nested loops can have only a single exit.
> - the loop exit condition is simple enough
> - the number of iterations can be analyzed, i.e, a countable loop.  The
>   niter could be analyzed under some assumptions.  */ @@ -1841,10
> +1841,14 @@ vect_analyze_loop_form (class loop *loop,
> vect_loop_form_info *info)
>  "not vectorized: latch block not empty.\n");
> 
>/* Make sure the exit is not abnormal.  */
> -  if (exit_e->flags & EDGE_ABNORMAL)
> -return opt_result::failure_at (vect_location,
> -"not vectorized:"
> -" abnormal loop exit edge.\n");
> +  auto_vec<edge> exits = get_loop_exit_edges (loop);
> +  for (edge e : exits)
> +{
> +  if (e->flags & EDGE_ABNORMAL)
> + return opt_result::failure_at (vect_location,
> +"not vectorized:"
> +" abnormal loop exit edge.\n");
> +}
> 
>info->conds
>  = vect_get_loop_niters (loop, exit_e, &info->assumptions, @@ -1920,6
> +1924,10 @@ vect_create_loop_vinfo (class loop *loop, vec_info_shared
> *shared,
> 
>LOOP_VINFO_IV_EXIT (loop_vinfo) = info->loop_exit;
> 
> +  /* Check to see if we're vectorizing multiple exits.  */
> + LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
> += !LOOP_VINFO_LOOP_CONDS (loop_vinfo).is_empty ();
> +
>if (info->inner_loop_cond)
>  {
>stmt_vec_info inner_loop_cond_info @@ -11577,7 +11585,7 @@
> vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
>/* Make sure there exists a single-predecessor exit bb.  Do this before
>   versioning.   */
>edge e = LOOP_VINFO_IV_EXIT (loop_vinfo);
> -  if (! single_pred_p (e->dest))
> +  if (! single_pred_p (e->dest) && !LOOP_VINFO_EARLY_BREAKS
> + (loop_vinfo))
>  {
>split_loop_exit_edge (e, true);
>if (dump_enabled_p ())
> 
> 
> 
> 
> --


RE: [PATCH 8/21]middle-end: update vectorizable_live_reduction with support for multiple exits and different exits

2023-11-27 Thread Tamar Christina
 >
> > This is a respun patch with a fix for VLA.
> >
> > This adds support to vectorizable_live_reduction to handle multiple
> > exits by doing a search for which exit the live value should be 
> > materialized in.
> >
> > Additionally which value in the index we're after depends on whether
> > the exit it's materialized in is an early exit or whether the loop's
> > main exit is different from the loop's natural one (i.e. the one with
> > the same src block as the latch).
> >
> > In those two cases we want the first rather than the last value as
> > we're going to restart the iteration in the scalar loop.  For VLA this
> > means we need to reverse both the mask and vector since there's only a
> > way to get the last active element and not the first.
> >
> > For inductions and multiple exits:
> >   - we test if the target will support vectorizing the induction
> >   - mark all inductions in the loop as relevant
> >   - for codegen of non-live inductions during codegen
> >   - induction during an early exit gets the first element rather than last.
> >
> > For reductions and multiple exits:
> >   - Reductions for early exits reduces the reduction definition statement
> > rather than the reduction step.  This allows us to get the value at the
> > start of the iteration.
> >   - The peeling layout means that we just have to update one block, the
> merge
> > block.  We expect all the reductions to be the same but we leave it up 
> > to
> > the value numbering to clean up any duplicate code as we iterate over 
> > all
> > edges.
> >
> > These two changes fix the reduction codegen given before which has
> > been added to the testsuite for early vect.
> >
> > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> >
> > Ok for master?
> >
> > Thanks,
> > Tamar
> >
> > gcc/ChangeLog:
> >
> > * tree-vect-loop.cc (vectorizable_live_operation): Support early exits.
> > (vect_analyze_loop_operations): Check if target supports vectorizing
> IV.
> > (vect_transform_loop): Call vectorizable_live_operation for non-live
> > inductions or reductions.
> > (find_connected_edge, vectorizable_live_operation_1): New.
> > (vect_create_epilog_for_reduction): Support reductions in early break.
> > * tree-vect-stmts.cc (perm_mask_for_reverse): Expose.
> > (vect_stmt_relevant_p): Mark all inductions when early break as being
> > relevant.
> > * tree-vectorizer.h (perm_mask_for_reverse): Expose.
> > (vect_iv_increment_position): New.
> > * tree-vect-loop-manip.cc (vect_iv_increment_position): Expose.
> >
> > --- inline copy of patch ---
> >
> > diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
> > index
> >
> 476be8a0bb6da2d06c4ca7052cb07bacecca60b1..1a4ba349fb6ae39c79401
> aecd4e7
> > e9e2b8a0 100644
> > --- a/gcc/tree-vect-loop-manip.cc
> > +++ b/gcc/tree-vect-loop-manip.cc
> > @@ -453,7 +453,7 @@ vect_adjust_loop_lens_control (tree iv_type,
> gimple_seq *seq,
> > INSERT_AFTER is set to true if the increment should be inserted after
> > *BSI.  */
> >
> > -static void
> > +void
> >  vect_iv_increment_position (edge loop_exit, gimple_stmt_iterator *bsi,
> > bool *insert_after)
> >  {
> > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index
> >
> 8a50380de49bc12105be47ea1d8ee3cf1f2bdab4..b42318b2999e6a27e698
> 33821907
> > 92602cb25af1 100644
> > --- a/gcc/tree-vect-loop.cc
> > +++ b/gcc/tree-vect-loop.cc
> > @@ -2163,6 +2163,15 @@ vect_analyze_loop_operations (loop_vec_info
> loop_vinfo)
> > ok = vectorizable_live_operation (loop_vinfo, stmt_info, NULL,
> NULL,
> >   -1, false, &cost_vec);
> >
> > + /* Check if we can perform the operation for early break if we force
> > +the live operation.  */
> > + if (ok
> > + && LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
> > + && !STMT_VINFO_LIVE_P (stmt_info)
> > + && STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
> > +   ok = vectorizable_live_operation (loop_vinfo, stmt_info, NULL,
> NULL,
> > + -1, false, &cost_vec);
> 
> can you add && !PURE_SLP_STMT?
> 

I've cleaned up the patch a bit more, so these hunks are now all gone.

> > @@ -6132,23 +6147,30 @@ vect_create_epilog_for_reduction
> (loop_vec_info loop_vinfo,
> >   Store them in NEW_PHIS.  */
> >if (double_reduc)
> >  loop = outer_loop;
> > -  exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
> > +  /* We need to reduce values in all exits.  */
> > +  exit_bb = loop_exit->dest;
> >exit_gsi = gsi_after_labels (exit_bb);
> >reduc_inputs.create (slp_node ? vec_num : ncopies);
> > +  vec<gimple *> vec_stmts;
> > +  if (main_exit_p)
> > +    vec_stmts = STMT_VINFO_VEC_STMTS (rdef_info);
> > +  else
> > +    vec_stmts = STMT_VINFO_VEC_STMTS (STMT_VINFO_REDUC_DEF (rdef_info));
> 
> both would be wrong for SLP, also I think you need to look at
> STMT_VINFO

[PATCH]middle-end: refactor vectorizable_live_operation into helper method for codegen

2023-11-27 Thread Tamar Christina
Hi All,

To make code review of the updates that add multiple-exit support to
vectorizable_live_operation easier, I've extracted the refactoring part
into its own patch.

This patch is a straight extract of the function with no functional changes.

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

* tree-vect-loop.cc (vectorizable_live_operation_1): New.
(vectorizable_live_operation): Extract code to 
vectorizable_live_operation_1.

--- inline copy of patch -- 
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 8a50380de49bc12105be47ea1d8ee3cf1f2bdab4..df5e1d28fac2ce35e71decdec0d8e31fb75557f5 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -10481,6 +10481,95 @@ vectorizable_induction (loop_vec_info loop_vinfo,
   return true;
 }
 
+
+/* Function vectorizable_live_operation_1.
+   helper function for vectorizable_live_operation.  */
+tree
+vectorizable_live_operation_1 (loop_vec_info loop_vinfo,
+  stmt_vec_info stmt_info, edge exit_e,
+  tree vectype, int ncopies, slp_tree slp_node,
+  tree bitsize, tree bitstart, tree vec_lhs,
+  tree lhs_type, gimple_stmt_iterator *exit_gsi)
+{
+  basic_block exit_bb = exit_e->dest;
+  gcc_assert (single_pred_p (exit_bb) || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
+
+  tree vec_lhs_phi = copy_ssa_name (vec_lhs);
+  gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
+  for (unsigned i = 0; i < gimple_phi_num_args (phi); i++)
+SET_PHI_ARG_DEF (phi, i, vec_lhs);
+
+  gimple_seq stmts = NULL;
+  tree new_tree;
+  if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
+{
+  /* Emit:
+     SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
+     where VEC_LHS is the vectorized live-out result and MASK is
+     the loop mask for the final iteration.  */
+  gcc_assert (ncopies == 1 && !slp_node);
+  gimple_seq tem = NULL;
+  gimple_stmt_iterator gsi = gsi_last (tem);
+  tree len = vect_get_loop_len (loop_vinfo, &gsi,
+   &LOOP_VINFO_LENS (loop_vinfo),
+   1, vectype, 0, 0);
+  /* BIAS - 1.  */
+  signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
+  tree bias_minus_one
+   = int_const_binop (MINUS_EXPR,
+  build_int_cst (TREE_TYPE (len), biasval),
+  build_one_cst (TREE_TYPE (len)));
+  /* LAST_INDEX = LEN + (BIAS - 1).  */
+  tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
+len, bias_minus_one);
+  /* This needs to implement extraction of the first index, but not sure
+how the LEN stuff works.  At the moment we shouldn't get here since
+there's no LEN support for early breaks.  But guard this so there's
+no incorrect codegen.  */
+  gcc_assert (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
+
+  /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>.  */
+  tree scalar_res
+   = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
+   vec_lhs_phi, last_index);
+  /* Convert the extracted vector element to the scalar type.  */
+  new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
+}
+  else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+{
+  /* Emit:
+     SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
+     where VEC_LHS is the vectorized live-out result and MASK is
+     the loop mask for the final iteration.  */
+  gcc_assert (!slp_node);
+  tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
+  gimple_seq tem = NULL;
+  gimple_stmt_iterator gsi = gsi_last (tem);
+  tree mask = vect_get_loop_mask (loop_vinfo, &gsi,
+ &LOOP_VINFO_MASKS (loop_vinfo),
+ 1, vectype, 0);
+
+  gimple_seq_add_seq (&stmts, tem);
+   tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
+  mask, vec_lhs_phi);
+  /* Convert the extracted vector element to the scalar type.  */
+  new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
+}
+  else
+{
+  tree bftype = TREE_TYPE (vectype);
+  if (VECTOR_BOOLEAN_TYPE_P (vectype))
+   bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
+  new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs_phi, bitsize, 
bitstart);
+  new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
+  &stmts, true, NULL_TREE);
+}
+  *exit_gsi = gsi_after_labels (exit_bb);
+  if (stmts)
+gsi_insert_seq_before (exit_gsi, stmts, GSI_SAME_STMT);
+  return new_tree;
+}
+
 /* Function vectorizable_live_operation.
 
STMT_INFO computes a value that is used outside the loop.  Check if
@@ -10690,79 +10779,13 @@ vectorizable_live_operation (v

[PATCH]middle-end: prevent LIM from hoisting vector compares from gconds if target does not support it.

2023-11-27 Thread Tamar Christina
Hi All,

LIM notices that in some cases the condition and the results are loop
invariant and tries to move them out of the loop.

While the resulting code is operationally sound, moving the compare out of the
gcond results in generating code that no longer branches, so cbranch is no
longer applicable.  As such I now add code to check during this motion to see
if the target supports flag setting vector comparison as general operation.

I have tried writing a GIMPLE testcase for this but the gimple FE seems to be
having some trouble with the vector types.  It seems to fail parsing.

The early break code testsuite however has a test for this
(vect-early-break_67.c).

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

* tree-ssa-loop-im.cc (determine_max_movement): Import insn-codes.h
and optabs-tree.h and check for vector compare motion out of gcond.

--- inline copy of patch -- 
diff --git a/gcc/tree-ssa-loop-im.cc b/gcc/tree-ssa-loop-im.cc
index 396963b6754c7671e2e5404302a69129918555e2..2ebf6d6548c4858fd5a8b4f9ab6f332f3fe8f6cd 100644
--- a/gcc/tree-ssa-loop-im.cc
+++ b/gcc/tree-ssa-loop-im.cc
@@ -48,6 +48,8 @@ along with GCC; see the file COPYING3.  If not see
 #include "tree-dfa.h"
 #include "tree-ssa.h"
 #include "dbgcnt.h"
+#include "insn-codes.h"
+#include "optabs-tree.h"
 
 /* TODO:  Support for predicated code motion.  I.e.
 
@@ -852,6 +854,17 @@ determine_max_movement (gimple *stmt, bool must_preserve_exec)
  if (!extract_true_false_args_from_phi (dom, phi, NULL, NULL))
return false;
 
+   /* Check if one of the dependent statements is a vector compare whether
+      the target supports it,  otherwise it's invalid to hoist it out of
+      the gcond it belonged to.  */
+   if (VECTOR_TYPE_P (TREE_TYPE (gimple_cond_lhs (cond))))
+ {
+   tree type = TREE_TYPE (gimple_cond_lhs (cond));
+   auto code = gimple_cond_code (cond);
+   if (!target_supports_op_p (type, code, optab_vector))
+ return false;
+ }
+
  /* Fold in dependencies and cost of the condition.  */
  FOR_EACH_SSA_TREE_OPERAND (val, cond, iter, SSA_OP_USE)
{
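
To make the failure mode concrete, a rough pseudo-GIMPLE sketch of the
shape the new check guards against (illustrative only -- the names are
invented, and as noted above a real gimple-FE testcase could not be
produced):

  /* Before LIM: the invariant vector compare sits inside the gcond and
     can expand through the target's cbranch optab.  */
  loop:
    ...
    if (vect_a_1 != vect_b_2)      /* both operands loop invariant */
      goto early_exit;

  /* After hoisting: the compare must produce a vector value outside
     the loop, which needs a general vector-compare pattern the target
     may not provide -- hence the target_supports_op_p check.  */
  _t = vect_a_1 != vect_b_2;       /* hoisted before the loop */
  loop:
    ...
    if (_t != { 0, ... })
      goto early_exit;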









Re: [PATCH v7] Implement new RTL optimizations pass: fold-mem-offsets.

2023-11-27 Thread Jakub Jelinek
On Mon, Oct 16, 2023 at 01:11:01PM -0600, Jeff Law wrote:
> > gcc/ChangeLog:
> > 
> > * Makefile.in: Add fold-mem-offsets.o.
> > * passes.def: Schedule a new pass.
> > * tree-pass.h (make_pass_fold_mem_offsets): Declare.
> > * common.opt: New options.
> > * doc/invoke.texi: Document new option.
> > * fold-mem-offsets.cc: New file.
> > 
> > gcc/testsuite/ChangeLog:
> > 
> > * gcc.target/riscv/fold-mem-offsets-1.c: New test.
> > * gcc.target/riscv/fold-mem-offsets-2.c: New test.
> > * gcc.target/riscv/fold-mem-offsets-3.c: New test.
> Thanks, I've pushed this to the trunk.

This breaks profiledbootstrap on powerpc64le-linux.
From what I can see, the pass works one basic block at a time and
will punt on any non-DEBUG_INSN uses outside of the current block
(I believe because of the
  /* This use affects instructions outside of CAN_FOLD_INSNS.  */
  if (!bitmap_bit_p (&can_fold_insns, INSN_UID (use)))
return 0;
test and can_fold_insns only set in do_analysis (when processing insns in
current bb, cleared at the end) or results of get_single_def_in_bb
(which are checked to be in the same bb).
But, while get_single_def_in_bb checks for
  if (DF_INSN_LUID (def) > DF_INSN_LUID (insn))
return NULL;
(OT, why not DF_INSN_INFO_LUID (DF_REF_INSN_INFO (ref_chain->ref))
instead of DF_INSN_LUID (def), then it doesn't need to look up
DF_INSN_INFO_GET (def)?), nothing when walking all uses of def does such
luid check.
The basic block in the PR in question has:
...
(insn 212 210 215 25 (set (mem/f:DI (reg/v/f:DI 10 10 [orig:152 last_viable ] [152]) [2 *last_viable_336+0 S8 A64])
        (reg/f:DI 9 9 [orig:155 _342 ] [155])) "pr111601.ii":50:17 683 {*movdi_internal64}
     (expr_list:REG_DEAD (reg/v/f:DI 10 10 [orig:152 last_viable ] [152])
        (nil)))
(insn 215 212 484 25 (set (reg:DI 5 5 [226])
        (const_int 0 [0])) "pr111601.ii":52:12 683 {*movdi_internal64}
     (expr_list:REG_EQUIV (const_int 0 [0])
        (nil)))
(insn 484 215 218 25 (set (reg/v/f:DI 10 10 [orig:152 last_viable ] [152])
        (reg/f:DI 9 9 [orig:155 _342 ] [155])) "pr111601.ii":52:12 683 {*movdi_internal64}
     (nil))
...
(insn 564 214 216 25 (set (reg/v/f:DI 10 10 [orig:152 last_viable ] [152])
        (plus:DI (reg/v/f:DI 10 10 [orig:152 last_viable ] [152])
            (const_int 96 [0x60]))) "pr111601.ii":52:12 66 {*adddi3}
     (nil))
(insn 216 564 219 25 (set (mem/f:DI (reg/v/f:DI 10 10 [orig:152 last_viable ] [152]) [2 _343->next+0 S8 A64])
        (reg:DI 5 5 [226])) "pr111601.ii":52:12 683 {*movdi_internal64}
     (expr_list:REG_DEAD (reg:DI 5 5 [226])
        (nil)))
...
and when asking for all uses of %r10 from def 564, it will see uses
in 216 and 212; the former is after the += 96 addition and gets changed
to load from %r10+96 with the addition being dropped, but there is
the other store which is a use across the backedge and when reached
from other edges certainly doesn't have the + 96 addition anywhere,
so the pass doesn't actually change that location.
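
In source terms, the hazard looks roughly like this (a hypothetical
reduction of mine, much simpler than the attached pr111601.C):

  /* 'p' is used both before and after its redefinition inside the loop
     body, so the first store is reached from that definition only via
     the back edge.  Folding the pointer adjustment into the second
     store and deleting the addition would leave the back-edge path
     without the adjustment.  */
  struct node { struct node *next; };

  void
  f (struct node *p, int n)
  {
    for (int i = 0; i < n; i++)
      {
        p->next = 0;   /* use reached across the back edge */
        p = p + 1;     /* definition the pass wants to fold away */
        p->next = 0;   /* use the pass can legitimately rewrite */
      }
  }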

Haven't bootstrapped/regtested this yet, will start momentarily,
posting here just in case I'm missing something important.

2023-11-27  Jakub Jelinek  

PR bootstrap/111601
* fold-mem-offsets.cc (fold_offsets): Punt if use appears before
def in the basic block.

* g++.dg/opt/pr111601.C: New test.

--- gcc/fold-mem-offsets.cc.jj  2023-11-02 07:49:17.060865772 +0100
+++ gcc/fold-mem-offsets.cc 2023-11-27 21:35:35.089007365 +0100
@@ -511,6 +511,7 @@ fold_offsets (rtx_insn *insn, rtx reg, b
   if (!success)
return 0;
 
+  unsigned luid = DF_INSN_LUID (def);
   for (ref_link = uses; ref_link; ref_link = ref_link->next)
{
  rtx_insn *use = DF_REF_INSN (ref_link->ref);
@@ -534,6 +535,11 @@ fold_offsets (rtx_insn *insn, rtx reg, b
  if (use_set && MEM_P (SET_DEST (use_set))
  && reg_mentioned_p (dest, SET_SRC (use_set)))
return 0;
+
+ /* Punt if use appears before def in the basic block.  See
+PR111601.  */
+ if (DF_INSN_INFO_LUID (DF_REF_INSN_INFO (ref_link->ref)) < luid)
+   return 0;
}
 
   bitmap_set_bit (&can_fold_insns, INSN_UID (def));
--- gcc/testsuite/g++.dg/opt/pr111601.C.jj  2023-11-27 21:33:12.605006881 +0100
+++ gcc/testsuite/g++.dg/opt/pr111601.C 2023-11-27 21:34:47.267678510 +0100
@@ -0,0 +1,86 @@
+// PR bootstrap/111601
+// { dg-do run { target c++11 } }
+// { dg-options "-O2 -fno-exceptions -fno-rtti -fprofile-generate" }
+// { dg-require-profiling "-fprofile-generate" }
+// { dg-final { cleanup-coverage-files } }
+
+struct tree_base
+{
+  int code:16;
+};
+struct saved_scope
+{
+  void *pad[14];
+  int x_processing_template_decl;
+};
+struct saved_scope *scope_chain;
+struct z_candidate
+{
+  tree_base *fn;
+  void *pad[11];
+  z_candidate *next;
+  int viable;
+  int flags;
+};
+
+__attribute__((noipa)) struct z_candidate *
+splice_viable (

Re: [PATCH v3 00/11] : More warnings as errors by default

2023-11-27 Thread Sam James


Florian Weimer  writes:

> * Jeff Law:
>
>> On 11/20/23 02:55, Florian Weimer wrote:
>>> This revision addresses Marek's comment about handing
>>> -Wdeclaration-missing-parameter-type properly in conjunction with
>>> -fpermissive.  A new test (permerror-fpermissive-nowarning.c)
>>> demonstrates the expected behavior.  I added a test for -std=gnu89
>>> -fno-permissive, too.
>>> I'm including the precursor cleanup patches in this posting.
>>> Hopefully
>>> this will make the aarch64 tester happy.
>>> Thanks,
>>> Florian
>>> Florian Weimer (11):
>>>aarch64: Avoid -Wincompatible-pointer-types warning in Linux unwinder
>>>aarch64: Call named function in gcc.target/aarch64/aapcs64/ice_1.c
>>>gm2: Add missing declaration of m2pim_M2RTS_Terminate to test
>>>Add tests for validating future C permerrors
>>>c: Turn int-conversion warnings into permerrors
>>>c: Turn -Wimplicit-function-declaration into a permerror
>>>c: Turn -Wimplicit-int into a permerror
>>>c: Do not ignore some forms of -Wimplicit-int in system headers
>>>c: Turn -Wreturn-mismatch into a permerror
>>>c: Turn -Wincompatible-pointer-types into a permerror
>>>c: Add new -Wdeclaration-missing-parameter-type permerror
>
>> The series is fine by me.
>
> Thanks.
>
>> But give Marek additional time to chime in, particularly given the
>> holidays this week in the US.  Say through this time next week?
>
> [...]
>
> I'm also gathering some numbers regarding autoconf impact and potential
> silent miscompilation.

I'd actually forgotten about another element here: FreeBSD 14, which was
just released, now ships with Clang 16, so we seem to be getting some
activity from them, which is a help.

I've resumed our testing for configure diffs and am going to
focus on that for now. It's just laborious because of how many errors
are actually fine.



Re: [RFA] New pass for sign/zero extension elimination

2023-11-27 Thread Jeff Law




On 11/27/23 13:03, Richard Sandiford wrote:

Joern Rennecke  writes:

  On 11/20/23 11:26, Richard Sandiford wrote:

+  /* ?!? What is the point of this adjustment to DST_MASK?  */
+  if (code == PLUS || code == MINUS
+  || code == MULT || code == ASHIFT)
+ dst_mask
+  = dst_mask ? ((2ULL << floor_log2 (dst_mask)) - 1) : 0;


Yeah, sympathise with the ?!? here :)

Jeff Law:

Inherited.  Like the other bit of magic I think I'll do a test with them
pulled out to see if I can make something undesirable trigger.


This represents the carry effect.  Even if the destination only cares about
some high order bits, you have to consider all lower order bits of the inputs.

For ASHIFT, you could refine this in the case of a constant shift count.


Ah, right.  Think it would be worth a comment.
Definitely.  Wouldn't SIGN_EXTEND have a similar problem?  While we 
don't care about all the low bits, we do care about that MSB.





But I wonder whether we should centralise all this code-specific
information into a single place.  I.e. rather than having one switch to
say "PLUS is OK" or "AND is OK", and then having code-specific handling
elsewhere, we could enumerate how to handle a code.
Yea.  That's where I was starting to go with the code which indicates we 
can't necessarily narrow a shift count.  I.e., what are the properties of 
the opcodes and how do they translate into the bits we need cleared from 
LIVENOW (for sets) and the bits we need to make live (for uses).


Jeff


Re: [RFA] New pass for sign/zero extension elimination

2023-11-27 Thread Richard Sandiford
[Sorry for the slow response]

Jeff Law  writes:
> On 11/20/23 11:26, Richard Sandiford wrote:
>> 
>>scalar_int_mode outer_mode;
>>    if (!is_a <scalar_int_mode> (GET_MODE (x), &outer_mode)
>>|| GET_MODE_BITSIZE (outer_mode) > 64)
>>  continue;
> Wouldn't we also want to verify that the size is constant, or is it the 
> case that all the variable cases are vector (and would we want to 
> actually depend on that)?

Yeah, all the variable cases are vectors.  We don't support variable-length
scalars at the moment.  (And I hope that never changes. :))

>>> + /* We will handle the other operand of a binary operator
>>> +at the bottom of the loop by resetting Y.  */
>>> + if (BINARY_P (src))
>>> +   y = XEXP (src, 0);
>> 
>> What about UNARY_P, given that NOT is included in the codes above?
> We'll break that inner for(;;) then iterate into the subobject, marking 
> the relevant bits live.  FWIW, the control flow of this code continues 
> to be my biggest concern from a maintenance standpoint.  Figuring it out 
> was a major pain and I've tried to document what is and what is not 
> safe.  But it's still painful to walk through.
>
> I pondered if note_uses/note_stores would be better, but concluded we'd 
> just end up with a ton of state objects to carry around and reasoning 
> about that would be just as hard.

Feels like it would be good to handle the top-level structure explicitly,
(PARALLELs, SETs, SET_SRCs, etc.), then fall back to iteration at the
point that we can no longer do better then "all registers in this expression
are fully live".

If we do that, rtx_properties might be an alternative to explicit
iteration.  The advantage of that is that it can handle destination
and sources as the top-level expression, and records whether each
register is itself a destination or source.

Thanks,
Richard


Re: [RFA] New pass for sign/zero extension elimination

2023-11-27 Thread Richard Sandiford
Joern Rennecke  writes:
>  On 11/20/23 11:26, Richard Sandiford wrote:
>>> +  /* ?!? What is the point of this adjustment to DST_MASK?  */
>>> +  if (code == PLUS || code == MINUS
>>> +  || code == MULT || code == ASHIFT)
>>> + dst_mask
>>> +  = dst_mask ? ((2ULL << floor_log2 (dst_mask)) - 1) : 0;
>>
>> Yeah, sympathise with the ?!? here :)
> Jeff Law:
>> Inherited.  Like the other bit of magic I think I'll do a test with them
>> pulled out to see if I can make something undesirable trigger.
>
> This represents the carry effect.  Even if the destination only cares about
> some high order bits, you have to consider all lower order bits of the inputs.
>
> For ASHIFT, you could refine this in the case of a constant shift count.

Ah, right.  Think it would be worth a comment.

But I wonder whether we should centralise all this code-specific
information into a single place.  I.e. rather than having one switch to
say "PLUS is OK" or "AND is OK", and then having code-specific handling
elsewhere, we could enumerate how to handle a code.

Thanks,
Richard


[PATCH] Fortran: deferred-length character optional dummy arguments [PR93762,PR100651]

2023-11-27 Thread Harald Anlauf
Dear all,

the attached patch fixes the passing of deferred-length character
to optional dummy arguments: the character length shall be passed
by reference, not by value.
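
In ABI terms the fix amounts to the following (a sketch of the convention
using invented C declarations, not gfortran's actual interface):

  /* Non-deferred character dummy: the hidden length is passed by value,
     so an absent OPTIONAL argument can simply receive a length of 0.  */
  void assert_code_fixed (char *err_msg, long err_msg_len);

  /* Deferred-length dummy (character(:), allocatable): the length is
     passed by reference so the callee can write back the new value;
     an absent OPTIONAL argument must therefore receive a NULL length
     pointer rather than a zero value.  */
  void assert_code_deferred (void *err_msg, long *err_msg_len);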

Original analysis of the issue by Steve in PR93762, independently
done by FX in PR100651.  The patch fixes both PRs.

Regtested on x86_64-pc-linux-gnu.  OK for mainline?

As the fix is local and affects only deferred-length character,
would it be ok to backport to 13-branch?

Thanks,
Harald

From 8ce1c8e7d0390361a1507000b7abbf6509b2fee9 Mon Sep 17 00:00:00 2001
From: Harald Anlauf 
Date: Mon, 27 Nov 2023 20:19:11 +0100
Subject: [PATCH] Fortran: deferred-length character optional dummy arguments
 [PR93762,PR100651]

gcc/fortran/ChangeLog:

	PR fortran/93762
	PR fortran/100651
	* trans-expr.cc (gfc_conv_missing_dummy): The character length for
	deferred-length dummy arguments is passed by reference, so that its
	value can be returned.  Adjust handling for optional dummies.

gcc/testsuite/ChangeLog:

	PR fortran/93762
	PR fortran/100651
	* gfortran.dg/optional_deferred_char_1.f90: New test.
---
 gcc/fortran/trans-expr.cc |  22 +++-
 .../gfortran.dg/optional_deferred_char_1.f90  | 100 ++
 2 files changed, 118 insertions(+), 4 deletions(-)
 create mode 100644 gcc/testsuite/gfortran.dg/optional_deferred_char_1.f90

diff --git a/gcc/fortran/trans-expr.cc b/gcc/fortran/trans-expr.cc
index 50c4604a025..e992f60d8bb 100644
--- a/gcc/fortran/trans-expr.cc
+++ b/gcc/fortran/trans-expr.cc
@@ -2116,10 +2116,24 @@ gfc_conv_missing_dummy (gfc_se * se, gfc_expr * arg, gfc_typespec ts, int kind)

   if (ts.type == BT_CHARACTER)
 {
-  tmp = build_int_cst (gfc_charlen_type_node, 0);
-  tmp = fold_build3_loc (input_location, COND_EXPR, gfc_charlen_type_node,
-			 present, se->string_length, tmp);
-  tmp = gfc_evaluate_now (tmp, &se->pre);
+  /* Handle deferred-length dummies that pass the character length by
+	 reference so that the value can be returned.  */
+  if (ts.deferred && INDIRECT_REF_P (se->string_length))
+	{
+	  tmp = gfc_build_addr_expr (NULL_TREE, se->string_length);
+	  tmp = fold_build3_loc (input_location, COND_EXPR, TREE_TYPE (tmp),
+ present, tmp, null_pointer_node);
+	  tmp = gfc_evaluate_now (tmp, &se->pre);
+	  tmp = build_fold_indirect_ref_loc (input_location, tmp);
+	}
+  else
+	{
+	  tmp = build_int_cst (gfc_charlen_type_node, 0);
+	  tmp = fold_build3_loc (input_location, COND_EXPR,
+ gfc_charlen_type_node,
+ present, se->string_length, tmp);
+	  tmp = gfc_evaluate_now (tmp, &se->pre);
+	}
   se->string_length = tmp;
 }
   return;
diff --git a/gcc/testsuite/gfortran.dg/optional_deferred_char_1.f90 b/gcc/testsuite/gfortran.dg/optional_deferred_char_1.f90
new file mode 100644
index 000..d399dd11ca2
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/optional_deferred_char_1.f90
@@ -0,0 +1,100 @@
+! { dg-do run }
+! PR fortran/93762
+! PR fortran/100651 - deferred-length character as optional dummy argument
+
+program main
+  implicit none
+  character(:), allocatable :: err_msg, msg3(:)
+  character(:), pointer :: err_msg2 => NULL()
+
+  ! Subroutines with optional arguments
+  call to_int ()
+  call to_int_p ()
+  call test_rank1 ()
+  call assert_code ()
+  call assert_p ()
+  call assert_rank1 ()
+
+  ! Test passing of optional arguments
+  call to_int (err_msg)
+  if (.not. allocated (err_msg)) stop 1
+  if (len (err_msg) /= 7)stop 2
+  if (err_msg(1:7) /= "foo bar") stop 3
+
+  call to_int2 (err_msg)
+  if (.not. allocated (err_msg)) stop 4
+  if (len (err_msg) /= 7)stop 5
+  if (err_msg(1:7) /= "foo bar") stop 6
+  deallocate (err_msg)
+
+  call to_int_p (err_msg2)
+  if (.not. associated (err_msg2)) stop 11
+  if (len (err_msg2) /= 8) stop 12
+  if (err_msg2(1:8) /= "poo bla ") stop 13
+  deallocate (err_msg2)
+
+  call to_int2_p (err_msg2)
+  if (.not. associated (err_msg2)) stop 14
+  if (len (err_msg2) /= 8) stop 15
+  if (err_msg2(1:8) /= "poo bla ") stop 16
+  deallocate (err_msg2)
+
+  call test_rank1 (msg3)
+  if (.not. allocated (msg3)) stop 21
+  if (len (msg3) /= 2)stop 22
+  if (size (msg3) /= 42)  stop 23
+  if (any (msg3 /= "ok")) stop 24
+  deallocate (msg3)
+
+contains
+
+  ! Deferred-length character, allocatable:
+  subroutine assert_code (err_msg0)
+character(:), optional, allocatable :: err_msg0
+if (present (err_msg0)) err_msg0 = 'foo bar'
+  end
+  ! Test: optional argument
+  subroutine to_int (err_msg1)
+character(:), optional, allocatable :: err_msg1
+call assert_code (err_msg1)
+  end
+  ! Control: non-optional argument
+  subroutine to_int2 (err_msg2)
+character(:), allocatable :: err_msg2
+call assert_code (err_msg2)
+  end
+
+  ! Rank-1:
+  subroutine assert_rank1 (msg)
+character(:), optional, allocatable, intent(out) :: msg(:)
+if (present (msg)) then
+   allocate (character(2) :: msg(42))
+   msg(:) = "ok"
+end if
+

Re: hurd: Ad default-pie and static-pie support

2023-11-27 Thread Samuel Thibault
Thomas Schwinge, on Mon 27 Nov 2023 15:52:02 +0100, wrote:
> On 2023-10-28T21:20:39+0200, Samuel Thibault  wrote:
> > This fixes the Hurd spec in the default-pie case, and adds static-pie
> > support.
> 
> I understand that your change does work for you as-is, so I've now pushed
> that to master branch in commit c768917402d4cba69a92c737e56e177f5b8ab0df
> "hurd: Ad default-pie and static-pie support", see attached.

Yes, thanks!
Samuel


> From c768917402d4cba69a92c737e56e177f5b8ab0df Mon Sep 17 00:00:00 2001
> From: Samuel Thibault 
> Date: Sat, 6 May 2023 13:55:44 +0200
> Subject: [PATCH] hurd: Ad default-pie and static-pie support
> 
> This fixes the Hurd spec in the default-pie case, and adds static-pie
> support.
> 
> gcc/ChangeLog:
> 
>   * config/i386/gnu.h: Use PIE_SPEC, add static-pie case.
>   * config/i386/gnu64.h: Use PIE_SPEC, add static-pie case.
> ---
>  gcc/config/i386/gnu.h   | 6 +++---
>  gcc/config/i386/gnu64.h | 6 +++---
>  2 files changed, 6 insertions(+), 6 deletions(-)
> 
> diff --git a/gcc/config/i386/gnu.h b/gcc/config/i386/gnu.h
> index 8dc6d9ee4e3..e776144f96c 100644
> --- a/gcc/config/i386/gnu.h
> +++ b/gcc/config/i386/gnu.h
> @@ -27,12 +27,12 @@ along with GCC.  If not, see <http://www.gnu.org/licenses/>.
>  #undef   STARTFILE_SPEC
>  #if defined HAVE_LD_PIE
>  #define STARTFILE_SPEC \
> -  "%{!shared: %{pg|p|profile:%{static:gcrt0.o%s;:gcrt1.o%s};pie:Scrt1.o%s;static:crt0.o%s;:crt1.o%s}} \
> -   crti.o%s %{static:crtbeginT.o%s;shared|pie:crtbeginS.o%s;:crtbegin.o%s}"
> +  "%{!shared: %{pg|p|profile:%{static-pie:grcrt0.o%s;static:gcrt0.o%s;:gcrt1.o%s};static-pie:rcrt0.o%s;static:crt0.o%s;" PIE_SPEC ":Scrt1.o%s;:crt1.o%s}} \
> +   crti.o%s %{static:crtbeginT.o%s;shared|static-pie|" PIE_SPEC ":crtbeginS.o%s;:crtbegin.o%s}"
>  #else
>  #define STARTFILE_SPEC \
>    "%{!shared: %{pg|p|profile:%{static:gcrt0.o%s;:gcrt1.o%s};static:crt0.o%s;:crt1.o%s}} \
> -   crti.o%s %{static:crtbeginT.o%s;shared|pie:crtbeginS.o%s;:crtbegin.o%s}"
> +   crti.o%s %{static:crtbeginT.o%s;shared:crtbeginS.o%s;:crtbegin.o%s}"
>  #endif
>  
>  #ifdef TARGET_LIBC_PROVIDES_SSP
> diff --git a/gcc/config/i386/gnu64.h b/gcc/config/i386/gnu64.h
> index a411f0e802a..332372fa067 100644
> --- a/gcc/config/i386/gnu64.h
> +++ b/gcc/config/i386/gnu64.h
> @@ -31,10 +31,10 @@ along with GCC.  If not, see <http://www.gnu.org/licenses/>.
>  #undef   STARTFILE_SPEC
>  #if defined HAVE_LD_PIE
>  #define STARTFILE_SPEC \
> -  "%{!shared: %{pg|p|profile:%{static:gcrt0.o%s;:gcrt1.o%s};pie:Scrt1.o%s;static:crt0.o%s;:crt1.o%s}} \
> -   crti.o%s %{static:crtbeginT.o%s;shared|pie:crtbeginS.o%s;:crtbegin.o%s}"
> +  "%{!shared: %{pg|p|profile:%{static-pie:grcrt0.o%s;static:gcrt0.o%s;:gcrt1.o%s};static-pie:rcrt0.o%s;static:crt0.o%s;" PIE_SPEC ":Scrt1.o%s;:crt1.o%s}} \
> +   crti.o%s %{static:crtbeginT.o%s;shared|static-pie|" PIE_SPEC ":crtbeginS.o%s;:crtbegin.o%s}"
>  #else
>  #define STARTFILE_SPEC \
>    "%{!shared: %{pg|p|profile:%{static:gcrt0.o%s;:gcrt1.o%s};static:crt0.o%s;:crt1.o%s}} \
> -   crti.o%s %{static:crtbeginT.o%s;shared|pie:crtbeginS.o%s;:crtbegin.o%s}"
> +   crti.o%s %{static:crtbeginT.o%s;shared|static-pie|" PIE_SPEC ":crtbeginS.o%s;:crtbegin.o%s}"
>  #endif
> -- 
> 2.34.1
> 


-- 
Samuel
---
For an independent, transparent and rigorous evaluation!
I support Inria's Evaluation Committee.


Re: hurd: Add multilib paths for gnu-x86_64

2023-11-27 Thread Samuel Thibault
Hello,

Thomas Schwinge, on Mon 27 Nov 2023 15:48:33 +0100, wrote:
> On 2023-10-28T21:19:59+0200, Samuel Thibault  wrote:
> > This is essentially based on t-linux64 version.
> 
> Yes, but isn't the overall setup diverged from GNU/Linux?

Not sure what you mean exactly?
I just meant that the content of t-gnu64 is almost the same as
t-linux64, the only difference being the multiarch path.

> Currently, x86_64 GNU/Hurd first gets 'i386/t-linux64', whose definitons
> are only later:
> 
> > --- a/gcc/config.gcc
> > +++ b/gcc/config.gcc
> > @@ -5828,6 +5828,9 @@ case ${target} in
> >   visium-*-*)
> >   target_cpu_default2="TARGET_CPU_$with_cpu"
> >   ;;
> > + x86_64-*-gnu*)
> > + tmake_file="$tmake_file i386/t-gnu64"
> > + ;;
> >  esac
> 
> ... then here (effectively) overwritten by 'i386/t-gnu64'.

Yes, like it is done for the x86_64-*-freebsd*) case

> Instead, I suppose, we should handle 'i386/t-linux64' and
> 'i386/t-gnu64' alike, and resolve relevant configuration differences.

So essentially move 

tmake_file="${tmake_file} i386/t-linux64"

down from where it is currently, to the 

# Set some miscellaneous flags for particular targets.
target_cpu_default2=
case ${target} in

part? That should be fine for kfreebsd as well.

> As fas a I can tell, 'i386/t-linux64' is also used for multilib-enabled
> ('test x$enable_targets = xall') x86 GNU/Linux, and that's not
> (correspondingly) done for x86 GNU/Hurd?

We don't really plan to support 32/64 multilib in GNU/Hurd.

> However, such things can certainly be resolved incrementally, later on.
> I understand that your change does work for you as-is,

Thanks for your understanding :) that'll help pushing further in Debian.

Samuel


Re: [PATCH v2] Fixed problem with BTF defining smaller enums.

2023-11-27 Thread David Faust
Hi Cupertino,

On 11/27/23 09:21, Cupertino Miranda wrote:
> Hi everyone,
> 
> David: Thanks for the v1 review.
> 
> This version adds the following;
>  - test case,
>  - improves condition logic,
>  - fixes mask typo.
> 
> Looking forward to your review.

v2 LGTM, please apply.
Thanks!

> 
> v1 at: https://gcc.gnu.org/pipermail/gcc-patches/2023-November/636391.html
> 
> Cheers,
> Cupertino
> 
> 
> 0004-Fixed-problem-with-BTF-defining-smaller-enums.patch
> 
> commit 3f89d352a4ee90882089142d743f8a748013b5fe
> Author: Cupertino Miranda 
> Date:   Fri Nov 10 14:02:30 2023 +
> 
> Fixed problem with BTF defining smaller enums.
> 
> This patch fixes BTF generation, which would become invalid when
> having smaller than 4-byte definitions of enums.
> For example, when using the __attribute__((mode(byte))) in the enum
> definition.
> 
> Two problems were identified:
>  - it would incorrectly create an entry for enum64 when the size of the
>    enum was different than 4.
>  - it would allocate less than 4 bytes for the value entry in BTF, in
>    case the type was smaller.
> 
> BTF generated was validated against clang.
> 
> gcc/ChangeLog:
> * btfout.cc (btf_calc_num_vbytes): Fixed logic for enum64.
> (btf_asm_enum_const): Corrected logic for enum64 and smaller
> than 4-byte values.
> 
> gcc/testsuite/ChangeLog:
> * gcc.dg/debug/btf/btf-enum-small.c: New test.
> 
> diff --git a/gcc/btfout.cc b/gcc/btfout.cc
> index e07fed302c24..5f2e99ce4725 100644
> --- a/gcc/btfout.cc
> +++ b/gcc/btfout.cc
> @@ -299,7 +299,7 @@ btf_calc_num_vbytes (ctf_dtdef_ref dtd)
>break;
>  
>  case BTF_KIND_ENUM:
> -  vlen_bytes += (dtd->dtd_data.ctti_size == 0x8)
> +  vlen_bytes += (dtd->dtd_data.ctti_size > 4)
>   ? vlen * sizeof (struct btf_enum64)
>   : vlen * sizeof (struct btf_enum);
>break;
> @@ -914,8 +914,8 @@ btf_asm_enum_const (unsigned int size, ctf_dmdef_t * dmd, unsigned int idx)
>  {
>dw2_asm_output_data (4, dmd->dmd_name_offset, "ENUM_CONST '%s' idx=%u",
>  dmd->dmd_name, idx);
> -  if (size == 4)
> -dw2_asm_output_data (size, dmd->dmd_value, "bte_value");
> +  if (size <= 4)
> +dw2_asm_output_data (size < 4 ? 4 : size, dmd->dmd_value, "bte_value");
>else
>  {
>dw2_asm_output_data (4, dmd->dmd_value & 0x, "bte_value_lo32");
> diff --git a/gcc/testsuite/gcc.dg/debug/btf/btf-enum-small.c b/gcc/testsuite/gcc.dg/debug/btf/btf-enum-small.c
> new file mode 100644
> index ..eb8a1bd2c438
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/debug/btf/btf-enum-small.c
> @@ -0,0 +1,28 @@
> +/* Test BTF generation for small enums.  */
> +
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -gbtf -dA" } */
> +
> +/* { dg-final { scan-assembler-not "bte_value_lo32" } } */
> +/* { dg-final { scan-assembler-not "bte_value_hi32" } } */
> +/* { dg-final { scan-assembler-times "\[\t \]0x602\[\t \]+\[^\n\]*btt_info" 1 } } */
> +/* { dg-final { scan-assembler-times " ENUM_CONST 'eSMALL' idx=0" 1 } } */
> +/* { dg-final { scan-assembler-times " ENUM_CONST 'eSMALLY' idx=1" 1 } } */
> +/* { dg-final { scan-assembler-times "ascii \"eSMALL.0\"\[\t \]+\[^\n\]*btf_string" 1 } } */
> +/* { dg-final { scan-assembler-times "ascii \"eSMALLY.0\"\[\t \]+\[^\n\]*btf_string" 1 } } */
> +/* { dg-final { scan-assembler-times "bte_value" 2 } } */
> +
> +enum smalled_enum
> +{
> +  eSMALL,
> +  eSMALLY,
> +} __attribute__((mode(byte)));
> +
> +struct root_struct {
> +  enum smalled_enum esmall;
> +};
> +
> +enum smalled_enum
> +foo(struct root_struct *root) {
> +  return root->esmall;
> +}
> 
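
For context, a rough sketch of the size bookkeeping the fix relies on
(struct layouts as in the kernel's BTF ABI; the helper is illustrative,
not the patch's code):

  #include <stdint.h>
  #include <stddef.h>

  /* Every enumerator of a BTF_KIND_ENUM carries a 4-byte value, even if
     the C enum itself is smaller; only enums wider than 4 bytes use the
     split lo32/hi32 representation.  */
  struct btf_enum   { uint32_t name_off; int32_t val; };
  struct btf_enum64 { uint32_t name_off; uint32_t val_lo32; uint32_t val_hi32; };

  static size_t
  enum_vlen_bytes (unsigned vlen, unsigned ctti_size)
  {
    /* Mirrors the corrected btf_calc_num_vbytes logic: only sizes
       strictly greater than 4 need btf_enum64 entries.  */
    return (ctti_size > 4) ? vlen * sizeof (struct btf_enum64)
                           : vlen * sizeof (struct btf_enum);
  }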


Re: [r14-5666 Regression] FAIL: gcc.dg/tree-prof/time-profiler-3.c scan-ipa-dump-times profile "Read tp_first_run: 2" 1 on Linux/x86_64

2023-11-27 Thread Andrew Pinski
On Mon, Nov 27, 2023 at 12:00 AM Sebastian Huber wrote:
>
> On 26.11.23 12:18, haochen.jiang wrote:
> > On Linux/x86_64,
> >
> > 41aacdea55c5d795a7aa195357d966645845d00e is the first bad commit
> > commit 41aacdea55c5d795a7aa195357d966645845d00e
> > Author: Sebastian Huber
> > Date:   Mon Nov 20 15:26:38 2023 +0100
> >
> >  gcov: Fix integer types in gen_counter_update()
> >
> > caused
> >
> > FAIL: gcc.dg/tree-prof/time-profiler-3.c scan-ipa-dump-times profile "Read tp_first_run: 0" 1
> > FAIL: gcc.dg/tree-prof/time-profiler-3.c scan-ipa-dump-times profile "Read tp_first_run: 2" 1
>
> Please have a look at:
>
> https://gcc.gnu.org/pipermail/gcc-patches/2023-November/638104.html

Also https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112689 .
Anyways I am testing a patch to fix this one.

Thanks,
Andrew

>
> --
> embedded brains GmbH & Co. KG
> Herr Sebastian HUBER
> Dornierstr. 4
> 82178 Puchheim
> Germany
> email: sebastian.hu...@embedded-brains.de
> phone: +49-89-18 94 741 - 16
> fax:   +49-89-18 94 741 - 08
>
> Registergericht: Amtsgericht München
> Registernummer: HRB 157899
> Vertretungsberechtigte Geschäftsführer: Peter Rasmussen, Thomas Dörfler
> Unsere Datenschutzerklärung finden Sie hier:
> https://embedded-brains.de/datenschutzerklaerung/


Re: [PATCH v6 0/21]middle-end: Support early break/return auto-vectorization

2023-11-27 Thread Richard Sandiford
Catching up on backlog, so this might already be resolved, but:

Richard Biener  writes:
> On Tue, 7 Nov 2023, Tamar Christina wrote:
>
>> > -Original Message-
>> > From: Richard Biener 
>> > Sent: Tuesday, November 7, 2023 9:43 AM
>> > To: Tamar Christina 
>> > Cc: gcc-patches@gcc.gnu.org; nd 
>> > Subject: RE: [PATCH v6 0/21]middle-end: Support early break/return auto-
>> > vectorization
>> > 
>> > On Mon, 6 Nov 2023, Tamar Christina wrote:
>> > 
>> > > > -Original Message-
>> > > > From: Richard Biener 
>> > > > Sent: Monday, November 6, 2023 2:25 PM
>> > > > To: Tamar Christina 
>> > > > Cc: gcc-patches@gcc.gnu.org; nd 
>> > > > Subject: Re: [PATCH v6 0/21]middle-end: Support early break/return
>> > > > auto- vectorization
>> > > >
>> > > > On Mon, 6 Nov 2023, Tamar Christina wrote:
>> > > >
>> > > > > Hi All,
>> > > > >
>> > > > > This patch adds initial support for early break vectorization in GCC.
>> > > > > The support is added for any target that implements a vector
>> > > > > cbranch optab, this includes both fully masked and non-masked 
>> > > > > targets.
>> > > > >
>> > > > > Depending on the operation, the vectorizer may also require
>> > > > > support for boolean mask reductions using Inclusive OR.  This is
>> > > > > however only checked then the comparison would produce multiple
>> > statements.
>> > > > >
>> > > > > Note: I am currently struggling to get patch 7 correct in all
>> > > > > cases and could
>> > > > use
>> > > > >   some feedback there.
>> > > > >
>> > > > > Concretely the kind of loops supported are of the forms:
>> > > > >
>> > > > >  for (int i = 0; i < N; i++)
>> > > > >  {
>> > > > >
>> > > > >    if (<condition>)
>> > > > >      {
>> > > > >        ...
>> > > > >        <action>;
>> > > > >      }
>> > > > >
>> > > > >  }
>> > > > >
>> > > > > where <action> can be:
>> > > > >  - break
>> > > > >  - return
>> > > > >  - goto
>> > > > >
>> > > > > Any number of statements can be used before the <action> occurs.
>> > > > >
>> > > > > Since this is an initial version for GCC 14 it has the following
>> > > > > limitations and
>> > > > > features:
>> > > > >
>> > > > > - Only fixed sized iterations and buffers are supported.  That is to 
>> > > > > say any
>> > > > >   vectors loaded or stored must be to statically allocated arrays 
>> > > > > with
>> > known
>> > > > >   sizes. N must also be known.  This limitation is because our 
>> > > > > primary
>> > target
>> > > > >   for this optimization is SVE.  For VLA SVE we can't easily do 
>> > > > > cross page
>> > > > >   iteration checks. The result is likely to also not be beneficial. 
>> > > > > For that
>> > > > >   reason we punt support for variable buffers till we have 
>> > > > > First-Faulting
>> > > > >   support in GCC.
>> > 
>> > Btw, for this I wonder if you thought about marking memory accesses 
>> > required
>> > for the early break condition as required to be vector-size aligned, thus 
>> > peeling
>> > or versioning them for alignment?  That should ensure they do not fault.
>> > 
>> > OTOH I somehow remember prologue peeling isn't supported for early break
>> > vectorization?  ..
>> > 
>> > > > > - any stores in <action> should not be to the same objects as in
>> > > > >   <condition>.  Loads are fine as long as they don't have the 
>> > > > > possibility to
>> > > > >   alias.  More concretely, we block RAW dependencies when the
>> > > > > intermediate
>> > > > value
>> > > > >   can't be separated from the store, or the store itself can't be 
>> > > > > moved.
>> > > > > - Prologue peeling, alignment peeling and loop versioning are 
>> > > > > supported.
>> > 
>> > .. but here you say it is.  Not sure if peeling for alignment works for 
>> > VLA vectors
>> > though.  Just to say x86 doesn't support first-faulting loads.
>> 
>> For VLA we support it through masking.  i.e. if you need to peel N 
>> iterations, we
>> generate a masked copy of the loop vectorized which masks off the first N 
>> bits.
>> 
>> This is not typically needed, but we do support it.  But the problem with 
>> this
>> scheme and early break is obviously that the peeled loop needs to be 
>> vectorized
>> so you kinda end up with the same issue again.  So Atm it rejects it for VLA.
>
> Hmm, I see.  I thought peeling by masking is an optimization.

Yeah, it's an opt-in optimisation.  No current Arm cores opt in though.

> Anyhow, I think it should still work here - since all accesses are aligned
> and we know that there's at least one original scalar iteration in the
> first masked and the following "unmasked" vector iterations there
> should never be faults for any of the aligned accesses.

Peeling via masking works by using the main loop for the "peeled"
iteration (so it's a bit of a misnomer).  The vector pointers start
out lower than the original scalar pointers, with some leading
inactive elements.
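
A small picture of that (my notation, VF = 4 and one element of
misalignment):

  /* Peeling via masking: iteration 0 of the main vector loop plays the
     role of the scalar prologue by disabling the leading lane(s).

       iteration 0:   mask = { 0, 1, 1, 1 }   -- "peeled" lanes inactive
       iterations 1+: mask = { 1, 1, 1, 1 }

     So if an early break fires in iteration 0, the epilogue has to skip
     the inactive leading lanes when working out where the break hit.  */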

The awkwardness would be in skipping those leading inactive elements
in the epilogue, if an early break occurs in the first vector iteration.
Definitely doable, b

Re: [RFA] New pass for sign/zero extension elimination

2023-11-27 Thread Joern Rennecke
You are applying PATTERN to an INSN_LIST.
diff --git a/gcc/ext-dce.cc b/gcc/ext-dce.cc
index 52032b50951..4523654538c 100644
--- a/gcc/ext-dce.cc
+++ b/gcc/ext-dce.cc
@@ -122,10 +122,9 @@ safe_for_live_propagation (rtx_code code)
optimziation phase during use handling will be.  */
 
 static void
-ext_dce_process_sets (rtx insn, bitmap livenow, bitmap live_tmp)
+ext_dce_process_sets (rtx insn, rtx pat, bitmap livenow, bitmap live_tmp)
 {
   subrtx_iterator::array_type array;
-  rtx pat = PATTERN (insn);
   FOR_EACH_SUBRTX (iter, array, pat, NONCONST)
 {
   const_rtx x = *iter;
@@ -377,7 +376,7 @@ binop_implies_op2_fully_live (rtx_code code)
eliminated in CHANGED_PSEUDOS.  */
 
 static void
-ext_dce_process_uses (rtx insn, bitmap livenow, bitmap live_tmp,
+ext_dce_process_uses (rtx insn, rtx pat, bitmap livenow, bitmap live_tmp,
  bool modify, bitmap changed_pseudos)
 {
   /* A nonlocal goto implicitly uses the frame pointer.  */
@@ -389,7 +388,6 @@ ext_dce_process_uses (rtx insn, bitmap livenow, bitmap live_tmp,
 }
 
   subrtx_var_iterator::array_type array_var;
-  rtx pat = PATTERN (insn);
   FOR_EACH_SUBRTX_VAR (iter, array_var, pat, NONCONST)
 {
   /* An EXPR_LIST (from call fusage) ends in NULL_RTX.  */
@@ -640,15 +638,16 @@ ext_dce_process_bb (basic_block bb, bitmap livenow,
   bitmap live_tmp = BITMAP_ALLOC (NULL);
 
   /* First process any sets/clobbers in INSN.  */
-  ext_dce_process_sets (insn, livenow, live_tmp);
+  ext_dce_process_sets (insn, PATTERN (insn), livenow, live_tmp);
 
   /* CALL_INSNs need processing their fusage data.  */
   if (GET_CODE (insn) == CALL_INSN)
-   ext_dce_process_sets (CALL_INSN_FUNCTION_USAGE (insn),
+   ext_dce_process_sets (insn, CALL_INSN_FUNCTION_USAGE (insn),
  livenow, live_tmp);
 
   /* And now uses, optimizing away SIGN/ZERO extensions as we go.  */
-  ext_dce_process_uses (insn, livenow, live_tmp, modify, changed_pseudos);
+  ext_dce_process_uses (insn, PATTERN (insn), livenow, live_tmp, modify,
+   changed_pseudos);
 
   /* And process fusage data for the use as well.  */
   if (GET_CODE (insn) == CALL_INSN)
@@ -663,7 +662,7 @@ ext_dce_process_bb (basic_block bb, bitmap livenow,
  if (global_regs[i])
bitmap_set_range (livenow, i * 4, 4);
 
- ext_dce_process_uses (CALL_INSN_FUNCTION_USAGE (insn),
+ ext_dce_process_uses (insn, CALL_INSN_FUNCTION_USAGE (insn),
livenow, live_tmp, modify, changed_pseudos);
}
 


Re: [PATCH] tree-sra: Avoid returns of references to SRA candidates

2023-11-27 Thread Andrew Pinski
On Mon, Nov 27, 2023 at 10:16 AM Martin Jambor  wrote:
>
> Hi,
>
> The enhancement to address PR 109849 contained an important thinko,
> in that any reference that is passed to a function and does not
> escape must also not happen to be aliased by the return value of the
> function.  This quickly surfaced as bugs PR 112711 and PR
> 112721.
>
> Just as IPA-modref does a good enough job to allow us to rely on the
> escaped set of variables, it seems to be doing well also on updating
> the EAF_NOT_RETURNED_DIRECTLY call argument flag, which happens to address
> exactly the situation we need to avoid.  Of course, if a call
> statement ignores any returned value, we also do not need to check the
> flag.
>
> Hopefully this does not pessimize things too much; I have verified
> that the PR 109849 testcase remains quick, and so should the
> benchmark it is derived from.
>
> The patch has passed bootstrap and testing on x86_64-linux, OK for
> master?
>
> Thanks,
>
> Martin
>
>
> gcc/ChangeLog:
>
> 2023-11-27  Martin Jambor  
>
> PR tree-optimization/112711
> PR tree-optimization/112721
> * tree-sra.cc (build_access_from_call_arg): New parameter
> CAN_BE_RETURNED, disqualify any candidate passed by reference if it is
> true.  Adjust leading comment.
> (scan_function): Pass appropriate value to CAN_BE_RETURNED of
> build_access_from_call_arg.
>
> gcc/testsuite/ChangeLog:
>
> 2023-11-27  Martin Jambor  
>
> PR tree-optimization/112711
> PR tree-optimization/112721
> * g++.dg/tree-ssa/pr112711.C: New test.
> * gcc.dg/tree-ssa/pr112721.c: Likewise.
> ---
>  gcc/testsuite/g++.dg/tree-ssa/pr112711.C | 31 ++
>  gcc/testsuite/gcc.dg/tree-ssa/pr112721.c | 26 +++
>  gcc/tree-sra.cc  | 40 ++--
>  3 files changed, 88 insertions(+), 9 deletions(-)
>  create mode 100644 gcc/testsuite/g++.dg/tree-ssa/pr112711.C
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr112721.c
>
> diff --git a/gcc/testsuite/g++.dg/tree-ssa/pr112711.C b/gcc/testsuite/g++.dg/tree-ssa/pr112711.C
> new file mode 100644
> index 000..c04524b04a7
> --- /dev/null
> +++ b/gcc/testsuite/g++.dg/tree-ssa/pr112711.C
> @@ -0,0 +1,31 @@
> +/* { dg-do run } */
> +/* { dg-options "-O1" } */
> +
> +typedef  int i32;
> +typedef unsigned int u32;
> +
> +static inline void write_i32(void *memory, i32 value) {
> +  // swap i32 bytes as if it was u32:
> +  u32 u_value = value;
> +  value = __builtin_bswap32(u_value);
> +
> +  // llvm infers '1' alignment from destination type
> +  __builtin_memcpy(__builtin_assume_aligned(memory, 1), &value, sizeof(value));
> +}
> +
> +__attribute__((noipa))
> +static void bug (void) {
> +  #define assert_eq(lhs, rhs) if (lhs != rhs) __builtin_trap()
> +
> +  unsigned char data[5];
> +  write_i32(data, -1362446643);
> +  assert_eq(data[0], 0xAE);
> +  assert_eq(data[1], 0xCA);
> +  write_i32(data + 1, -1362446643);
> +  assert_eq(data[1], 0xAE);
> +}

Only a comment on this testcase, it is only valid for little-endian
and 32bit int targets.
You can modify it to fix it for both though.

Thanks,
Andrew

> +
> +int main() {
> +bug();
> +return 0;
> +}
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr112721.c b/gcc/testsuite/gcc.dg/tree-ssa/pr112721.c
> new file mode 100644
> index 000..adf62613266
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr112721.c
> @@ -0,0 +1,26 @@
> +/* { dg-do run } */
> +/* { dg-options "-O1" } */
> +
> +unsigned * volatile gv;
> +
> +struct a {
> +  int b;
> +};
> +int c, e;
> +long d;
> +unsigned * __attribute__((noinline))
> +f(unsigned *g) {
> +  for (; c;)
> +e = d;
> +  return gv ? gv : g;
> +}
> +int main() {
> +  int *h;
> +  struct a i = {8};
> +  int *j = &i.b;
> +  h = (unsigned *) f(j);
> +  *h = 0;
> +  if (i.b != 0)
> +__builtin_abort ();
> +  return 0;
> +}
> diff --git a/gcc/tree-sra.cc b/gcc/tree-sra.cc
> index 3a0d52675fe..6a759783990 100644
> --- a/gcc/tree-sra.cc
> +++ b/gcc/tree-sra.cc
> @@ -1268,18 +1268,27 @@ abnormal_edge_after_stmt_p (gimple *stmt, enum out_edge_check *oe_check)
>  }
>
>  /* Scan expression EXPR which is an argument of a call and create access
> -   structures for all accesses to candidates for scalarization.  Return true if
> -   any access has been inserted.  STMT must be the statement from which the
> -   expression is taken.  */
> +   structures for all accesses to candidates for scalarization.  Return true
> +   if any access has been inserted.  STMT must be the statement from which the
> +   expression is taken.  CAN_BE_RETURNED must be true if call argument flags
> +   do not rule out that the argument is directly returned.  OE_CHECK is used
> +   to remember result of a test for abnormal outgoing edges after this
> +   statement.  */
>
>  static bool
> -build_access_from_call_arg (tree expr, gimple *stmt,
> +build_access_from_call_arg (tree exp

Re: [PATCH V2 3/3] OpenMP: Use enumerators for names of trait-sets and traits

2023-11-27 Thread Tobias Burnus

On 27.11.23 18:19, Tobias Burnus wrote:

+   { "unified_address",
+ (1 << OMP_TRAIT_SET_IMPLEMENTATION),
+ OMP_TRAIT_PROPERTY_NONE, true,
+ NULL
+   },


I don't understand this code. This looks as if "requires" and "unified_address"
are on the same level, but in my understanding they have to be used as in:

 match(implementation = {requires(unified_address,
atomic_default_mem_order_properties(release))})

while from the syntax, it looks as if this would permit:

 match(implementation = {unified_address,
atomic_default_mem_order_properties(release)})



Sandra pointed me to the spec: OpenMP 5.0 only permits the latter, i.e.
using the clause names of 'requires' directly. Since OpenMP 5.1, this
use is deprecated (removed in TR11/TR12) - in favor of the first syntax,
i.e. using them as argument to 'requires()'.

Thus, the code is fine. And it shows all the joy of needing to read multiple
spec versions at the same time without getting confused.

Tobias



[PATCH] tree-sra: Avoid returns of references to SRA candidates

2023-11-27 Thread Martin Jambor
Hi,

The enhancement to address PR 109849 contained an important thinko,
in that any reference that is passed to a function and does not
escape must also not happen to be aliased by the return value of the
function.  This quickly surfaced as bugs PR 112711 and PR
112721.

Just as IPA-modref does a good enough job to allow us to rely on the
escaped set of variables, it seems to be doing well also on updating
the EAF_NOT_RETURNED_DIRECTLY call argument flag, which happens to address
exactly the situation we need to avoid.  Of course, if a call
statement ignores any returned value, we also do not need to check the
flag.

Hopefully this does not pessimize things too much; I have verified
that the PR 109849 testcase remains quick, and so should the
benchmark it is derived from.

The patch has passed bootstrap and testing on x86_64-linux, OK for
master?

Thanks,

Martin


gcc/ChangeLog:

2023-11-27  Martin Jambor  

PR tree-optimization/112711
PR tree-optimization/112721
* tree-sra.cc (build_access_from_call_arg): New parameter
CAN_BE_RETURNED, disqualify any candidate passed by reference if it is
true.  Adjust leading comment.
(scan_function): Pass appropriate value to CAN_BE_RETURNED of
build_access_from_call_arg.

gcc/testsuite/ChangeLog:

2023-11-27  Martin Jambor  

PR tree-optimization/112711
PR tree-optimization/112721
* g++.dg/tree-ssa/pr112711.C: New test.
* gcc.dg/tree-ssa/pr112721.c: Likewise.
---
 gcc/testsuite/g++.dg/tree-ssa/pr112711.C | 31 ++
 gcc/testsuite/gcc.dg/tree-ssa/pr112721.c | 26 +++
 gcc/tree-sra.cc  | 40 ++--
 3 files changed, 88 insertions(+), 9 deletions(-)
 create mode 100644 gcc/testsuite/g++.dg/tree-ssa/pr112711.C
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr112721.c

diff --git a/gcc/testsuite/g++.dg/tree-ssa/pr112711.C b/gcc/testsuite/g++.dg/tree-ssa/pr112711.C
new file mode 100644
index 000..c04524b04a7
--- /dev/null
+++ b/gcc/testsuite/g++.dg/tree-ssa/pr112711.C
@@ -0,0 +1,31 @@
+/* { dg-do run } */
+/* { dg-options "-O1" } */
+
+typedef  int i32;
+typedef unsigned int u32;
+
+static inline void write_i32(void *memory, i32 value) {
+  // swap i32 bytes as if it was u32:
+  u32 u_value = value;
+  value = __builtin_bswap32(u_value);
+
+  // llvm infers '1' alignment from destination type
+  __builtin_memcpy(__builtin_assume_aligned(memory, 1), &value, sizeof(value));
+}
+
+__attribute__((noipa))
+static void bug (void) {
+  #define assert_eq(lhs, rhs) if (lhs != rhs) __builtin_trap()
+
+  unsigned char data[5];
+  write_i32(data, -1362446643);
+  assert_eq(data[0], 0xAE);
+  assert_eq(data[1], 0xCA);
+  write_i32(data + 1, -1362446643);
+  assert_eq(data[1], 0xAE);
+}
+
+int main() {
+bug();
+return 0;
+}
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr112721.c b/gcc/testsuite/gcc.dg/tree-ssa/pr112721.c
new file mode 100644
index 000..adf62613266
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr112721.c
@@ -0,0 +1,26 @@
+/* { dg-do run } */
+/* { dg-options "-O1" } */
+
+unsigned * volatile gv;
+
+struct a {
+  int b;
+};
+int c, e;
+long d;
+unsigned * __attribute__((noinline))
+f(unsigned *g) {
+  for (; c;)
+e = d;
+  return gv ? gv : g;
+}
+int main() {
+  int *h;
+  struct a i = {8};
+  int *j = &i.b;
+  h = (unsigned *) f(j);
+  *h = 0;
+  if (i.b != 0)
+__builtin_abort ();
+  return 0;
+}
diff --git a/gcc/tree-sra.cc b/gcc/tree-sra.cc
index 3a0d52675fe..6a759783990 100644
--- a/gcc/tree-sra.cc
+++ b/gcc/tree-sra.cc
@@ -1268,18 +1268,27 @@ abnormal_edge_after_stmt_p (gimple *stmt, enum out_edge_check *oe_check)
 }
 
 /* Scan expression EXPR which is an argument of a call and create access
-   structures for all accesses to candidates for scalarization.  Return true if
-   any access has been inserted.  STMT must be the statement from which the
-   expression is taken.  */
+   structures for all accesses to candidates for scalarization.  Return true
+   if any access has been inserted.  STMT must be the statement from which the
+   expression is taken.  CAN_BE_RETURNED must be true if call argument flags
+   do not rule out that the argument is directly returned.  OE_CHECK is used
+   to remember result of a test for abnormal outgoing edges after this
+   statement.  */
 
 static bool
-build_access_from_call_arg (tree expr, gimple *stmt,
+build_access_from_call_arg (tree expr, gimple *stmt, bool can_be_returned,
enum out_edge_check *oe_check)
 {
   if (TREE_CODE (expr) == ADDR_EXPR)
 {
   tree base = get_base_address (TREE_OPERAND (expr, 0));
 
+  if (can_be_returned)
+   {
+ disqualify_base_of_expr (base, "Address possibly returned, "
+  "leading to an alis SRA may not know.");
+ return false;
+   }
   if (abnormal_edge_after_stmt_p (stmt, oe_check))
{
   

[committed] arm: libgcc: tweak warning from __sync_synchronize

2023-11-27 Thread Richard Earnshaw

My previous patch to add an implementation of __sync_synchronize with
a warning trips a testsuite failure in fortran (and possibly other
languages as well) as the framework expects no blank lines in the
output, but this warning was generating one.  So remove the newline
from the end of the message and rely on the one added by the linker
instead.

Since we're there, remove the trailing period from the message as
well, since the convention seems to be not to have one.

libgcc/

* config/arm/lib1funcs.S (__sync_synchronize): Adjust warning message.
---
 libgcc/config/arm/lib1funcs.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libgcc/config/arm/lib1funcs.S b/libgcc/config/arm/lib1funcs.S
index 78887861616..40e9a7a87fb 100644
--- a/libgcc/config/arm/lib1funcs.S
+++ b/libgcc/config/arm/lib1funcs.S
@@ -2214,7 +2214,7 @@ LSYM(Lchange_\register):
 	.ascii "no effect.  Relink with\n"
 	.ascii "  -specs=sync-{none,dmb,cp15dmb}.specs\n"
 	.ascii "to specify exactly which barrier format to use and avoid "
-	.ascii "this warning.\n\0"
+	.ascii "this warning\0"
 #endif
 #endif
 #endif


Re: [RFA] New pass for sign/zero extension elimination

2023-11-27 Thread Joern Rennecke
 On 11/20/23 11:26, Richard Sandiford wrote:
>> +  /* ?!? What is the point of this adjustment to DST_MASK?  */
>> +  if (code == PLUS || code == MINUS
>> +  || code == MULT || code == ASHIFT)
>> + dst_mask
>> +  = dst_mask ? ((2ULL << floor_log2 (dst_mask)) - 1) : 0;
>
> Yeah, sympathise with the ?!? here :)
Jeff Law:
> Inherited.  Like the other bit of magic I think I'll do a test with them
> pulled out to see if I can make something undesirable trigger.

This represents the carry effect.  Even if the destination only cares about
some high order bits, you have to consider all lower order bits of the inputs.

For ASHIFT, you could refine this in the case of a constant shift count.
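
To spell that out (illustrative C, not the pass's code):

  #include <stdint.h>

  /* For carry-propagating operations (PLUS, MINUS, MULT), an output bit
     can be influenced by every input bit at or below it, so the demanded
     input mask must extend down to bit 0.  */
  static uint64_t
  input_mask_for_carry_op (uint64_t dst_mask)
  {
    if (dst_mask == 0)
      return 0;
    unsigned msb = 63 - __builtin_clzll (dst_mask);
    return (msb == 63) ? ~0ULL : (2ULL << msb) - 1;
  }

  /* The refinement suggested above: for ASHIFT by a known constant,
     input bit i feeds exactly output bit i + count, so no downward
     extension is needed -- just shift the demanded bits back.  */
  static uint64_t
  input_mask_for_const_ashift (uint64_t dst_mask, unsigned count)
  {
    return count < 64 ? dst_mask >> count : 0;
  }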


Re: [RFA] New pass for sign/zero extension elimination

2023-11-27 Thread Joern Rennecke
On 11/20/23 11:26, Richard Sandiford wrote:

>> +
>> +  mask = GET_MODE_MASK (GET_MODE (SUBREG_REG (x))) << bit;
>> +  if (!mask)
>> + mask = -0x1ULL;
>
> Not sure I follow this.  What does the -0x1ULL constant indicate?
> Also, isn't it the mask of the outer register that is shifted, rather
> than the mask of the inner mode?  E.g. if we have:
Jeff Law:
> Inherited.  I should have marked it like the other one as needing
> investigation.  Probably the fastest way is to just rip it out for a
> test to see what breaks.

This is for support of types wider than DImode.

You unsupported tracking of these values in various places, though.


[PATCH v2] Fortran: fix reallocation on assignment of polymorphic variables [PR110415]

2023-11-27 Thread Andrew Jenner

This is the second version of the patch - previous discussion at:
https://gcc.gnu.org/pipermail/gcc-patches/2023-November/636671.html

This patch adds the testcase from PR110415 and fixes the bug.

The problem is that in a couple of places in trans_class_assignment in
trans-expr.cc, we need to get the run-time size of the polymorphic
object from the vtbl, but we are currently getting that vtbl from the
lhs of the assignment rather than the rhs. This gives us the old value
of the size but we need to pass the new size to __builtin_malloc and
__builtin_realloc.

I'm fixing this by adding a parameter to trans_class_vptr_len_assignment
to retrieve the tree corresponding to the vptr from the object on the rhs
of the assignment, and then passing this where it is needed. In the case
where trans_class_vptr_len_assignment returns NULL_TREE for the rhs vptr
we use the lhs vptr as before.

To get this to work I also needed to change the implementation of
trans_class_vptr_len_assignment to create a temporary for the assignment
in more circumstances. Currently, the "a = func()" assignment in MAIN__
doesn't hit the "Create a temporary for complication expressions" case
on line 9951 because "DECL_P (rse->expr)" is true - the expression has
already been placed into a temporary. That means we don't hit the "if
(temp_rhs ..." case on line 10038 and go on to get the vptr_expr from
"gfc_lval_expr_from_sym (gfc_find_vtab (&re->ts))" on line 10057 which
is the vtbl of the static type rather than the dynamic one from the rhs.
So with this fix we create an extra temporary, but that should be
optimised away in the middle-end so there should be no run-time effect.

I'm not sure if this is the best way to fix this (the Fortran front-end
is new territory for me) but I've verified that the testcase passes with
this change, fails without it, and that the change does not introduce
any FAILs when running the gfortran testcases on x86_64-pc-linux-gnu.

After the previous submission, Tobias Burnus found a closely related 
problem and contributed testcases and a fix for it, which I have 
incorporated into this version of the patch. The problem in this case is 
with the __builtin_realloc call that is executed if one polymorphic 
variable is replaced by another. The return value of this call was being 
ignored rather than used to replace the pointer being reallocated.
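
To make the two fixes concrete, here is a minimal C-level model of the
intended behaviour (hypothetical types and names, not the front end's
actual ones):

  #include <stdlib.h>

  struct vtab { unsigned long size; };   /* dynamic type's size */
  struct classbox { void *data; const struct vtab *vptr; };

  void
  assign (struct classbox *lhs, const struct classbox *rhs)
  {
    /* Fix 1: take the size from the rhs's dynamic vtable, not the lhs's.  */
    unsigned long size = rhs->vptr->size;
    /* Fix 2: use realloc's return value instead of ignoring it.  */
    void *p = realloc (lhs->data, size);
    if (p)
      lhs->data = p;
    lhs->vptr = rhs->vptr;
  }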


Is this OK for mainline, GCC 13 and OG13?

Thanks,

Andrew

gcc/fortran/
 PR fortran/110415
 * trans-expr.cc (trans_class_vptr_len_assignment): Add
 from_vptrp parameter. Populate it. Don't check for DECL_P
 when deciding whether to create temporary.
 (trans_class_pointer_fcn, gfc_trans_pointer_assignment): Add
 NULL argument to trans_class_vptr_len_assignment calls.
 (trans_class_assignment): Get rhs_vptr from
 trans_class_vptr_len_assignment and use it for determining size
 for allocation/reallocation. Use return value from realloc.

gcc/testsuite/
 PR fortran/110415
 * gfortran.dg/pr110415.f90: New test.
 * gfortran.dg/asan/pr110415-2.f90: New test.
 * gfortran.dg/asan/pr110415-3.f90: New test.diff --git a/gcc/fortran/trans-expr.cc b/gcc/fortran/trans-expr.cc
index 50c4604a025..bfe9996ced6 100644
--- a/gcc/fortran/trans-expr.cc
+++ b/gcc/fortran/trans-expr.cc
@@ -9936,7 +9936,8 @@ trans_get_upoly_len (stmtblock_t *block, gfc_expr *expr)
 static tree
 trans_class_vptr_len_assignment (stmtblock_t *block, gfc_expr * le,
 gfc_expr * re, gfc_se *rse,
-tree * to_lenp, tree * from_lenp)
+tree * to_lenp, tree * from_lenp,
+tree * from_vptrp)
 {
   gfc_se se;
   gfc_expr * vptr_expr;
@@ -9944,10 +9945,11 @@ trans_class_vptr_len_assignment (stmtblock_t *block, 
gfc_expr * le,
   bool set_vptr = false, temp_rhs = false;
   stmtblock_t *pre = block;
   tree class_expr = NULL_TREE;
+  tree from_vptr = NULL_TREE;
 
   /* Create a temporary for complicated expressions.  */
   if (re->expr_type != EXPR_VARIABLE && re->expr_type != EXPR_NULL
-  && rse->expr != NULL_TREE && !DECL_P (rse->expr))
+  && rse->expr != NULL_TREE)
 {
   if (re->ts.type == BT_CLASS && !GFC_CLASS_TYPE_P (TREE_TYPE (rse->expr)))
class_expr = gfc_get_class_from_expr (rse->expr);
@@ -10044,6 +10046,7 @@ trans_class_vptr_len_assignment (stmtblock_t *block, 
gfc_expr * le,
tmp = rse->expr;
 
  se.expr = gfc_class_vptr_get (tmp);
+ from_vptr = se.expr;
  if (UNLIMITED_POLY (re))
from_len = gfc_class_len_get (tmp);
 
@@ -10065,6 +10068,7 @@ trans_class_vptr_len_assignment (stmtblock_t *block, 
gfc_expr * le,
  gfc_free_expr (vptr_expr);
  gfc_add_block_to_block (block, &se.pre);
  gcc_assert (se.post.head == NULL_TREE);
+ from_vptr = se.expr;
}
   gfc_add_modify (pr

[PATCH v2] Fixed problem with BTF defining smaller enums.

2023-11-27 Thread Cupertino Miranda

Hi everyone,

David: Thanks for the v1 review.

This version adds the following:
 - a test case,
 - improved condition logic,
 - a fix for the mask typo.

Looking forward to your review.

v1 at: https://gcc.gnu.org/pipermail/gcc-patches/2023-November/636391.html

Cheers,
Cupertino

commit 3f89d352a4ee90882089142d743f8a748013b5fe
Author: Cupertino Miranda 
Date:   Fri Nov 10 14:02:30 2023 +

Fixed problem with BTF defining smaller enums.

This patch fixes BTF generation, which would become invalid when there
are enum definitions smaller than 4 bytes, for example when using
__attribute__((mode(byte))) in the enum definition.

Two problems were identified:
 - it would incorrectly create an entry for enum64 when the size of the
   enum was different than 4.
 - it would allocate fewer than 4 bytes for the value entry in BTF when
   the type was smaller.

BTF generated was validated against clang.
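
For reference, the two record layouts involved, sketched after the BTF
specification (not copied from the actual headers):

  struct btf_enum      /* used when the enum's size is at most 4 bytes */
  {
    unsigned int name_off;
    int val;                       /* always a 4-byte value slot */
  };

  struct btf_enum64    /* used only when the size exceeds 4 bytes */
  {
    unsigned int name_off;
    unsigned int val_lo32;         /* value split into two 32-bit halves */
    unsigned int val_hi32;
  };

So an enum declared with mode(byte) (size 1) must still be emitted in
the btf_enum form with a full 4-byte value slot, which is what the two
hunks below ensure.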

gcc/ChangeLog:
* btfout.cc (btf_calc_num_vbytes): Fixed logic for enum64.
(btf_asm_enum_const): Corrected logic for enum64 and smaller
than 4 bytes values.

gcc/testsuite/ChangeLog:
* gcc.dg/debug/btf/btf-enum-small.c: New test.

diff --git a/gcc/btfout.cc b/gcc/btfout.cc
index e07fed302c24..5f2e99ce4725 100644
--- a/gcc/btfout.cc
+++ b/gcc/btfout.cc
@@ -299,7 +299,7 @@ btf_calc_num_vbytes (ctf_dtdef_ref dtd)
   break;
 
 case BTF_KIND_ENUM:
-  vlen_bytes += (dtd->dtd_data.ctti_size == 0x8)
+  vlen_bytes += (dtd->dtd_data.ctti_size > 4)
 			? vlen * sizeof (struct btf_enum64)
 			: vlen * sizeof (struct btf_enum);
   break;
@@ -914,8 +914,8 @@ btf_asm_enum_const (unsigned int size, ctf_dmdef_t * dmd, unsigned int idx)
 {
   dw2_asm_output_data (4, dmd->dmd_name_offset, "ENUM_CONST '%s' idx=%u",
 		   dmd->dmd_name, idx);
-  if (size == 4)
-dw2_asm_output_data (size, dmd->dmd_value, "bte_value");
+  if (size <= 4)
+dw2_asm_output_data (size < 4 ? 4 : size, dmd->dmd_value, "bte_value");
   else
 {
   dw2_asm_output_data (4, dmd->dmd_value & 0x, "bte_value_lo32");
diff --git a/gcc/testsuite/gcc.dg/debug/btf/btf-enum-small.c b/gcc/testsuite/gcc.dg/debug/btf/btf-enum-small.c
new file mode 100644
index ..eb8a1bd2c438
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/debug/btf/btf-enum-small.c
@@ -0,0 +1,28 @@
+/* Test BTF generation for small enums.  */
+
+/* { dg-do compile } */
+/* { dg-options "-O2 -gbtf -dA" } */
+
+/* { dg-final { scan-assembler-not "bte_value_lo32" } } */
+/* { dg-final { scan-assembler-not "bte_value_hi32" } } */
+/* { dg-final { scan-assembler-times "\[\t \]0x602\[\t \]+\[^\n\]*btt_info" 1 } } */
+/* { dg-final { scan-assembler-times " ENUM_CONST 'eSMALL' idx=0" 1 } } */
+/* { dg-final { scan-assembler-times " ENUM_CONST 'eSMALLY' idx=1" 1 } } */
+/* { dg-final { scan-assembler-times "ascii \"eSMALL.0\"\[\t \]+\[^\n\]*btf_string" 1 } } */
+/* { dg-final { scan-assembler-times "ascii \"eSMALLY.0\"\[\t \]+\[^\n\]*btf_string" 1 } } */
+/* { dg-final { scan-assembler-times "bte_value" 2 } } */
+
+enum smalled_enum
+{
+  eSMALL,
+  eSMALLY,
+} __attribute__((mode(byte)));
+
+struct root_struct {
+  enum smalled_enum esmall;
+};
+
+enum smalled_enum
+foo(struct root_struct *root) {
+  return root->esmall;
+}


Re: [PATCH V2 3/3] OpenMP: Use enumerators for names of trait-sets and traits

2023-11-27 Thread Tobias Burnus

Hi Sandra,

[BTW: 1/3 needs to be rebased eventually as it no longer applies
cleanly; I have not checked 2/3 or 3/3 yet.]

1/3 and 2/3 look good to me; unless Jakub has some comments, I think
they can go in.

Regarding 3/3, some first comments. I still want to read it a bit more
carefully and play with it.

On 22.11.23 17:22, Sandra Loosemore wrote:

+static const char *const vendor_properties[] =
+  { "amd", "arm", "bsc", "cray", "fujitsu", "gnu", "ibm", "intel",
+"llvm", "nvidia", "pgi", "ti", "unknown", NULL };


Can you add "hpe"? Cf. "OpenMP API 5.2 Supplementary Source Code" at
https://www.openmp.org/specifications/


+static const char *const atomic_default_mem_order_properties[] =
+  { "seq_cst", "relaxed", "acq_rel", NULL };


Can you add "acquire" and "release"? Those have been added in OpenMP 5.1
for 'omp atomic', supported since GCC 12; albeit, for requires, that's
new since 5.2.


+   { "atomic_default_mem_order",
+ (1 << OMP_TRAIT_SET_IMPLEMENTATION),
+ OMP_TRAIT_PROPERTY_ID, true,
+ atomic_default_mem_order_properties,
+   },
+   { "requires",
+ (1 << OMP_TRAIT_SET_IMPLEMENTATION),
+ OMP_TRAIT_PROPERTY_CLAUSE_LIST, true,
+ NULL
+   },
+   { "unified_address",
+ (1 << OMP_TRAIT_SET_IMPLEMENTATION),
+ OMP_TRAIT_PROPERTY_NONE, true,
+ NULL
+   },


I don't understand this code. This looks as if "requires" and "unified_address"
are on the same level but in my understanding they have to be used as in:

 match(implementation = {requires(unified_address,
atomic_default_mem_order(release))})

while from the syntax, it looks as if this would permit:

 match(implementation = {unified_address,
atomic_default_mem_order(release)})

Disclaimer: It might be that the code handles it correctly but I just misread 
it.
Or that I misread the spec.
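
For concreteness, a source-level sketch of the nesting I would expect
(hypothetical function names; trait spellings as I read OpenMP 5.2):

  void foo_ua (void);
  #pragma omp declare variant (foo_ua) \
      match (implementation = {requires(unified_address)})
  void foo (void);

i.e. with unified_address appearing only as a property of the requires
trait, not as a standalone trait of the implementation set.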

 * * *


+   warning_at (loc, 0,
+   "unknown property %qE of %qs selector",


All '0' OpenMP warnings should now use 'OPT_Wopenmp' instead.

 * * *


-   if (selectors[i] == NULL)
+   /* Some trait sets permit extension traits which are supposed
+  to be ignored if the implementation doesn't support them.
+  GCC does not support any extension traits, and if it did, they
+  would have their own identifiers.  */


I am not sure whether I get this correctly. In my understanding

  match(implementation = {extension(ompx_myCompiler_abcd)})

should parse without error, but evaluate as false / not matching. Thus, it
is not really ignored but parsed, while still causing a non-match.

(We can argue whether that should be silently accepted or still show a warning.)


Likewise for:
  match (implementation = { ompx_myCompiler_abcd(1) } )

albeit here a warning could make more sense than for 'extension', especially if 
a
typo fix would be available.

From the comment, it looks as if it is completely ignored, such that there
could still be a match.

Disclaimer: I might have misunderstood the code - or might have missed 
something in the spec.

Tobias



Re: PR111754

2023-11-27 Thread Richard Sandiford
Prathamesh Kulkarni  writes:
> PR111754: Rework encoding of result for VEC_PERM_EXPR with constant input 
> vectors.
>
> gcc/ChangeLog:
>   PR middle-end/111754
>   * fold-const.cc (fold_vec_perm_cst): Set result's encoding to sel's
>   encoding, and set res_nelts_per_pattern to 2 if sel contains stepped
>   sequence but input vectors do not.
>   (test_nunits_min_2): New test Case 8.
>   (test_nunits_min_4): New tests Case 8 and Case 9.
>
> gcc/testsuite/ChangeLog:
>   PR middle-end/111754
>   * gcc.target/aarch64/sve/slp_3.c: Adjust code-gen.
>   * gcc.target/aarch64/sve/slp_4.c: Likewise.
>   * gcc.dg/vect/pr111754.c: New test.

OK, thanks.

Richard

> Co-authored-by: Richard Sandiford 
>
> diff --git a/gcc/fold-const.cc b/gcc/fold-const.cc
> index 332bc8aead2..dff09b81f7b 100644
> --- a/gcc/fold-const.cc
> +++ b/gcc/fold-const.cc
> @@ -10803,27 +10803,38 @@ fold_vec_perm_cst (tree type, tree arg0, tree arg1, 
> const vec_perm_indices &sel,
>unsigned res_npatterns, res_nelts_per_pattern;
>unsigned HOST_WIDE_INT res_nelts;
>  
> -  /* (1) If SEL is a suitable mask as determined by
> - valid_mask_for_fold_vec_perm_cst_p, then:
> - res_npatterns = max of npatterns between ARG0, ARG1, and SEL
> - res_nelts_per_pattern = max of nelts_per_pattern between
> -  ARG0, ARG1 and SEL.
> - (2) If SEL is not a suitable mask, and TYPE is VLS then:
> - res_npatterns = nelts in result vector.
> - res_nelts_per_pattern = 1.
> - This exception is made so that VLS ARG0, ARG1 and SEL work as before.  
> */
> -  if (valid_mask_for_fold_vec_perm_cst_p (arg0, arg1, sel, reason))
> -{
> -  res_npatterns
> - = std::max (VECTOR_CST_NPATTERNS (arg0),
> - std::max (VECTOR_CST_NPATTERNS (arg1),
> -   sel.encoding ().npatterns ()));
> +  /* First try to implement the fold in a VLA-friendly way.
> +
> + (1) If the selector is simply a duplication of N elements, the
> +  result is likewise a duplication of N elements.
> +
> + (2) If the selector is N elements followed by a duplication
> +  of N elements, the result is too.
> +
> + (3) If the selector is N elements followed by an interleaving
> +  of N linear series, the situation is more complex.
> +
> +  valid_mask_for_fold_vec_perm_cst_p detects whether we
> +  can handle this case.  If we can, then each of the N linear
> +  series either (a) selects the same element each time or
> +  (b) selects a linear series from one of the input patterns.
>  
> -  res_nelts_per_pattern
> - = std::max (VECTOR_CST_NELTS_PER_PATTERN (arg0),
> - std::max (VECTOR_CST_NELTS_PER_PATTERN (arg1),
> -   sel.encoding ().nelts_per_pattern ()));
> +  If (b) holds for one of the linear series, the result
> +  will contain a linear series, and so the result will have
> +  the same shape as the selector.  If (a) holds for all of
> +  the linear series, the result will be the same as (2) above.
>  
> +  (b) can only hold if one of the input patterns has a
> +  stepped encoding.  */
> +
> +  if (valid_mask_for_fold_vec_perm_cst_p (arg0, arg1, sel, reason))
> +{
> +  res_npatterns = sel.encoding ().npatterns ();
> +  res_nelts_per_pattern = sel.encoding ().nelts_per_pattern ();
> +  if (res_nelts_per_pattern == 3
> +   && VECTOR_CST_NELTS_PER_PATTERN (arg0) < 3
> +   && VECTOR_CST_NELTS_PER_PATTERN (arg1) < 3)
> + res_nelts_per_pattern = 2;
>res_nelts = res_npatterns * res_nelts_per_pattern;
>  }
>else if (TYPE_VECTOR_SUBPARTS (type).is_constant (&res_nelts))
> @@ -17622,6 +17633,29 @@ test_nunits_min_2 (machine_mode vmode)
>   tree expected_res[] = { ARG0(0), ARG1(0), ARG1(1) };
>   validate_res (1, 3, res, expected_res);
>}
> +
> +  /* Case 8: Same as aarch64/sve/slp_3.c:
> +  arg0, arg1 are dup vectors.
> +  sel = { 0, len, 1, len+1, 2, len+2, ... } // (2, 3)
> +  So res = { arg0[0], arg1[0], ... } // (2, 1)
> +
> +  In this case, since the input vectors are dup, only the first two
> +  elements per pattern in sel are considered significant.  */
> +  {
> + tree arg0 = build_vec_cst_rand (vmode, 1, 1);
> + tree arg1 = build_vec_cst_rand (vmode, 1, 1);
> + poly_uint64 len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
> +
> + vec_perm_builder builder (len, 2, 3);
> + poly_uint64 mask_elems[] = { 0, len, 1, len + 1, 2, len + 2 };
> + builder_push_elems (builder, mask_elems);
> +
> + vec_perm_indices sel (builder, 2, len);
> + tree res = fold_vec_perm_cst (TREE_TYPE (arg0), arg0, arg1, sel);
> +
> + tree expected_res[] = { ARG0(0), ARG1(0) };
> + validate_res (2, 1, res, expected_res);
> +  }
>  }
>  }
>  
> @@ -17790,6 +17824,44 @@ test_nunits_min_4 (machine_mode vmode)
>   ASSERT_TRUE (res == NULL_TREE);
>   ASSERT_TR

Re: [RFC] vect: disable multiple calls of poly simdclones

2023-11-27 Thread Andre Vieira (lists)




On 06/11/2023 07:52, Richard Biener wrote:

On Fri, 3 Nov 2023, Andre Vieira (lists) wrote:


Hi,

The current codegen code to support VFs that are multiples of a simdclone
simdlen relies on BIT_FIELD_REF to create multiple input vectors.  This does not
work for non-constant simdclones, so we should disable using such clones when
the VF is a multiple of the non-constant simdlen until we change the codegen
to support those.

Enabling SVE simdclone support will cause ICEs if the vectorizer decides to
use an SVE simdclone with a VF that is larger than the simdlen. I'll be away
for the next two weeks, so I can't really discuss this further.
I initially tried to solve the problem, but the way
vectorizable_simd_clone_call is structured doesn't make it easy to replace
BIT_FIELD_REF with the poly-suitable solution right now of using
unpack_{hi,lo}.


I think it should be straight-forward to use unpack_{even,odd} (it's
even/odd for VLA, right?  If lo/hi would be possible then doing
BIT_FIELD_REF would be, too?  Also you need to have multiple stages
of unpack/pack when the factor is more than 2).

There's plenty of time even during stage3 to address this.

At least your patch should have come with a testcase (or two).


Yeah I didn't add one as it didn't trigger on AArch64 without my two 
outstanding aarch64 simdclone patches.


Is there a bugreport tracking this issue?  It should affect GCN as well
I guess.


No, since I can't trigger them yet on trunk until the reviews on my 
target-specific patches are done and they are committed.


I don't have a GCN backend lying around but I suspect GCN doesn't use 
poly simdlen simdclones yet either... I haven't checked. The issue 
triggers for aarch64 when trying to generate SVE simdclones for 
functions with mixed types.  I'll give the unpack thing a go locally.


Re: [RFA] New pass for sign/zero extension elimination

2023-11-27 Thread Jeff Law




On 11/27/23 04:30, Andrew Stubbs wrote:
I tried this patch for AMD GCN. We have a similar problem with excess 
extends, but also for vector modes. Each lane has a minimum 32 bits and 
GCC's normal assumption is that vector registers have precisely the 
number of bits they need, so the amdgcn backend patterns have explicit 
sign/zero extends for QImode and HImode for the instructions that might 
need it. It would be cool if this pass could eliminate some of those, 
but at this point I just wanted to check it didn't break anything.


Unfortunately I get a crash building libgcc:
I strongly suspect this is the same thing that was originally reported 
by Xi Ruoyao.  Just getting back on top of things after the holiday. 
I'll get the V2 posted today.


Jeff


Re: [RFA] New pass for sign/zero extension elimination

2023-11-27 Thread Jeff Law




On 11/26/23 09:42, rep.dot@gmail.com wrote:

On 22 November 2023 23:23:41 CET, Jeff Law  wrote:



On 11/20/23 11:56, Dimitar Dimitrov wrote:

On Sun, Nov 19, 2023 at 05:47:56PM -0700, Jeff Law wrote:
...



+  enum rtx_code xcode = GET_CODE (x);
+  if (xcode == SET)
+   {
+ const_rtx dst = SET_DEST (x);
+ rtx src = SET_SRC (x);
+ const_rtx y;
+ unsigned HOST_WIDE_INT bit = 0;
+
+ /* The code of the RHS of a SET.  */
+ enum rtx_code code = GET_CODE (src);
+
+ /* ?!? How much of this should mirror SET handling, potentially
+being shared?   */
+ if (SUBREG_BYTE (dst).is_constant () && SUBREG_P (dst))


Shouldn't SUBREG_P be checked first like:
  if (SUBREG_P (dst) && SUBREG_BYTE (dst).is_constant ())

Yes, absolutely. It'll be fixed in the next update.

This also highlighted that I never added pru-elf to the configurations in my 
tester.  I remember thinking that it needed to be added, but obviously that 
mental TODO got lost.  I've just fixed that.



And please drop the superfluous enum from rtx_code while at it?

Sure.
jeff


RE: [PATCH] aarch64: Improve cost of `a ? {-,}1 : b`

2023-11-27 Thread Andrew Pinski (QUIC)
> -Original Message-
> From: Richard Sandiford 
> Sent: Monday, November 27, 2023 7:35 AM
> To: Andrew Pinski (QUIC) 
> Cc: gcc-patches@gcc.gnu.org
> Subject: Re: [PATCH] aarch64: Improve cost of `a ? {-,}1 : b`
> 
> Andrew Pinski  writes:
> > While looking into PR 112454, I found the cost for `(if_then_else
> > (cmp) (const_int 1) (reg))` was being recorded as 8 (or `COSTS_N_INSNS
> > (2)`) but it should have been 4 (or `COSTS_N_INSNS (1)`).
> > This improves the cost by not adding the cost of `(const_int 1)` to
> > the total cost.
> >
> > It does not fix PR 112454 as that requires other changes to
> > forwprop the `(const_int 1)` earlier than combine.
> >
> > Bootstrapped and tested on aarch64-linux-gnu with no regressions.
> >
> > gcc/ChangeLog:
> >
> > * config/aarch64/aarch64.cc (aarch64_if_then_else_costs):
> > Don't add the cost of `1` or `-1`.
> >
> > Signed-off-by: Andrew Pinski 
> > ---
> >  gcc/config/aarch64/aarch64.cc | 13 ++---
> >  1 file changed, 10 insertions(+), 3 deletions(-)
> >
> > diff --git a/gcc/config/aarch64/aarch64.cc
> > b/gcc/config/aarch64/aarch64.cc index f6f6f94bf43..63241c5aaa5 100644
> > --- a/gcc/config/aarch64/aarch64.cc
> > +++ b/gcc/config/aarch64/aarch64.cc
> > @@ -11642,9 +11642,16 @@ aarch64_if_then_else_costs (rtx op0, rtx op1,
> rtx op2, int *cost, bool speed)
> > /* CSINV/NEG with zero extend + const 0 (*csinv3_uxtw_insn3).  */
> > op1 = XEXP (inner, 0);
> > }
> > -
> > -  *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
> > -  *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
> > +  if (op2 == constm1_rtx || op2 == const1_rtx)
> > +   *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
> > +  else if (op1 == constm1_rtx || op1 == const1_rtx)
> > +   *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
> 
> It looks like this is really an extra option on top of the previous if-else 
> chain,
> since it only applies when OP1 and OP2 are still the operands of the
> if_then_else.  So how about:
> 
>   else if (op1 == constm1_rtx || op1 == const1_rtx)
> {
> /* Use CSINV.  */
> *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
> return true;
> }
>   else if (op2 == constm1_rtx || op2 == const1_rtx)
> {
> /* Use CSINV.  */
> *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
> return true;
> }
> 
> leaving the code to fall through to:
> 
>   *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
>   *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
>   return true;
> 
> as it does currently.  OK in that form if you agree.

Yes, I think this is the correct way of implementing this. Let me test it and
get back to you.
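
For reference, the shape being costed, as a source-level sketch (the
registers in the comment are illustrative; allocation may differ):

  int f (int c, int b)
  {
    /* cmp   w0, 0
       csinc w0, w1, wzr, eq
       The select itself is a single CSINC after the compare, hence
       COSTS_N_INSNS (1) for the if_then_else rather than 2.  */
    return c ? 1 : b;
  }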

Thanks,
Andrew

> 
> Let me know if you don't.  But in that case:
> 
> > +  else
> > +   {
> > + *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
> > + *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 1, speed);
> 
> should be 2, speed
> 
> > +   }
> > +
> 
> Thanks,
> Richard


Re: Ping: [PATCH] Fix PR112419

2023-11-27 Thread Martin Uecker
On Monday, 2023-11-27 at 16:54 +0100, Martin Uecker wrote:
> On Monday, 2023-11-27 at 08:36 -0700, Jeff Law wrote:
> > 
> > On 11/23/23 10:05, Hans-Peter Nilsson wrote:
> > > > From: Hans-Peter Nilsson 
> > > > Date: Thu, 16 Nov 2023 05:24:06 +0100
> > > > 
> > > > > From: Martin Uecker 
> > > > > Date: Tue, 07 Nov 2023 06:56:25 +0100
> > > > 
> > > > > On Monday, 2023-11-06 at 21:01 -0700, Jeff Law wrote:
> > > > > > 
> > > > > > On 11/6/23 20:58, Hans-Peter Nilsson wrote:
> > > > > > > This patch caused a testsuite regression: there's now an
> > > > > > > "excess error" failure for gcc.dg/Wnonnull-4.c for 32-bit
> > > > > > > targets (and 64-bit targets testing with a "-m32" option)
> > > > > > > after your r14-5115-g6e9ee44d96e5.  It's logged as PR112419.
> > > > > > It caused failures for just about every target ;(  Presumably it 
> > > > > > worked
> > > > > > on x86_64...
> > > > > 
> > > > > I do not think this is a true regression,
> > > > > just a problem with the test on 32-bit which somehow surfaced
> > > > > due to the change.
> > > > > 
> > > > > The excess error is:
> > > > > 
> > > > > FAIL: gcc.dg/Wnonnull-4.c (test for excess errors)
> > > > > Excess errors:
> > > > > /home/tcwg-buildslave/workspace/tcwg_gnu_6/abe/snapshots/gcc.git~master/gcc/testsuite/gcc.dg/Wnonnull-4.c:144:3:
> > > > >  warning: 'fda_n_5' specified size 4294967256 exceeds maximum object 
> > > > > size
> > > > > 2147483647 [-Wstringop-overflow=]
> > > > > 
> > > > > I think the warning was suppressed before due to the other (nonnull)
> > > > > warning which I removed in this case.
> > > > > 
> > > > > I think the simple fix might be to turn off -Wstringop-overflow.
> > > > 
> > > > No, that triggers many of the dg-warnings that are tested.
> > > > 
> > > > (I didn't pay attention to the actual warning messages and
> > > > tried to pursue that at first.)
> > > > 
> > > > Maybe it's best to actually expect the warning, like
> > > > so.
> > > > 
> > > > Maintainers of 16-bit targets will have to address their
> > > > concerns separately.  For example, they may choose to not
> > > > run the test at all.
> > > > 
> > > > Ok to commit?
> > > > 
> > > > Subject: [PATCH] gcc.dg/Wnonnull-4.c: Handle new overflow warning for 
> > > > 32-bit targets [PR112419]
> > > > 
> > > > PR testsuite/112419
> > > > * gcc.dg/Wnonnull-4.c (test_fda_n_5): Expect warning for 
> > > > exceeding
> > > > maximum object size for 32-bit targets.
> > > > ---
> > > >   gcc/testsuite/gcc.dg/Wnonnull-4.c | 1 +
> > > >   1 file changed, 1 insertion(+)
> > > > 
> > > > diff --git a/gcc/testsuite/gcc.dg/Wnonnull-4.c 
> > > > b/gcc/testsuite/gcc.dg/Wnonnull-4.c
> > > > index 1f14fbba45df..d63e76da70a2 100644
> > > > --- a/gcc/testsuite/gcc.dg/Wnonnull-4.c
> > > > +++ b/gcc/testsuite/gcc.dg/Wnonnull-4.c
> > > > @@ -142,6 +142,7 @@ void test_fda_n_5 (int r_m1)
> > > > T (  1);  // { dg-bogus "argument 2 of variable length 
> > > > array 'double\\\[n]\\\[5]' is null but the corresponding bound argument 
> > > > 1 value is 1" }
> > > > T (  9);  // { dg-bogus "argument 2 of variable length 
> > > > array 'double\\\[n]\\\[5]' is null but the corresponding bound argument 
> > > > 1 value is 9" }
> > > > T (max);  // { dg-bogus "argument 2 of variable length 
> > > > array 'double\\\[n]\\\[5]' is null but the corresponding bound argument 
> > > > 1 value is \\d+" }
> > > > +// { dg-warning "size 4294967256 exceeds maximum object size" "" { 
> > > > target ilp32 } .-1 }
> > > >   }
> > Unfortunately I think we need to go back to the original issue that 
> > Martin (I think) dismissed.
> > 
> > Specifically, this is a regression.  It's very clear that prior to the 
> > patch in question there was no diagnostic about the size of the 
> > requested memory allocation and after the patch in question we get the 
> > "exceeds maximum object size" diagnostic.
> > 
> > Now one explanation could be that the diagnostic is warranted and it was 
> > a bug that the diagnostic hadn't been emitted prior to Martin's patch. 
> > In this case some kind of dg-blah is warranted, but I don't think anyone 
> > has made this argument.
> > 
> I believe the warning is correct but was suppressed before.
> 
> 
> My plan was to split up the test case in one which is for
> -Wstringop-overflow and one which is for -Wnonnull and then
> one could turn off the -Wstringop-overflow for the tests
> which are actually for -Wnonnull.  But adding the dg-blah
> would certainly be simpler.

Specifically, also with 13.2, if you suppress the warning which
I removed (via -Wno-nonnull), you will get the otherwise hidden
-Wstringop-overflow warning with -m32:

See here: https://godbolt.org/z/ev5GhMonq

The warning also seems correct to me, so I suggest accepting
the proposed patch.

Martin






Re: [PATCH] c++: Implement P2582R1, CTAD from inherited constructors

2023-11-27 Thread Patrick Palka
On Fri, 24 Nov 2023, Patrick Palka wrote:

> On Wed, 22 Nov 2023, Patrick Palka wrote:
> 
> > Bootstrapped and regtested on x86_64-pc-linux-gnu, does this look OK for
> > trunk?
> > 
> > -- >8 --
> > 
> > This patch implements C++23 class template argument deduction from
> > inherited constructors, which is specified in terms of C++20 alias
> > CTAD which we already fully support.  The rule for transforming
> > the return type of an inherited guide is specified in terms of a
> > partially specialized class template, but this patch implements it
> > in a simpler way, performing ahead of time deduction instead of
> > instantiation time deduction.  I wasn't able to find an example for
> > which this implementation strategy makes a difference, but I didn't
> > look very hard.  Support seems good enough to advertise as complete,
> > and there should be no functional change before C++23 mode.
> > 
> > There's a couple of FIXMEs, one in inherited_ctad_tweaks for recognizing
> > more forms of inherited constructors, and one in deduction_guides_for for
> > making the cache aware of base-class dependencies.
> > 
> > There doesn't seem to be a feature-test macro update for this paper.
> > 
> 
> Here's v2 with some minor changes:
> 
>   * set processing_template_decl when rewriting the return type of
> a template guide
>   * rather than adding an out parameter to type_targs_deducible_from,
> just make it return NULL_TREE or the deduced args
>   * add a testcase demonstrating each of the FIXMEs
> 
> -- >8 --
> 
> gcc/cp/ChangeLog:
> 
>   * cp-tree.h (type_targs_deducible_from): Adjust return type.
>   * pt.cc (alias_ctad_tweaks): Handle C++23 inherited CTAD.
>   (inherited_ctad_tweaks): Define.
>   (type_targs_deducible_from): Return the deduced arguments or
>   NULL_TREE instead of a bool.  Handle 'tmpl' being a TREE_LIST
>   representing a synthetic alias template.
>   (ctor_deduction_guides_for): Do inherited_ctad_tweaks for each
>   USING_DECL in C++23 mode.
>   (deduction_guides_for): Add FIXME for stale cache entries in
>   light of inherited CTAD.
> 
> gcc/testsuite/ChangeLog:
> 
>   * g++.dg/cpp1z/class-deduction67.C: Accept in C++23 mode.
>   * g++.dg/cpp23/class-deduction-inherited1.C: New test.
>   * g++.dg/cpp23/class-deduction-inherited2.C: New test.
>   * g++.dg/cpp23/class-deduction-inherited3.C: New test.
>   * g++.dg/cpp23/class-deduction-inherited4.C: New test.
> ---
>  gcc/cp/cp-tree.h  |   2 +-
>  gcc/cp/pt.cc  | 186 +++---
>  .../g++.dg/cpp1z/class-deduction67.C  |   5 +-
>  .../g++.dg/cpp23/class-deduction-inherited1.C |  38 
>  .../g++.dg/cpp23/class-deduction-inherited2.C |  26 +++
>  .../g++.dg/cpp23/class-deduction-inherited3.C |  16 ++
>  .../g++.dg/cpp23/class-deduction-inherited4.C |  32 +++
>  7 files changed, 272 insertions(+), 33 deletions(-)
>  create mode 100644 gcc/testsuite/g++.dg/cpp23/class-deduction-inherited1.C
>  create mode 100644 gcc/testsuite/g++.dg/cpp23/class-deduction-inherited2.C
>  create mode 100644 gcc/testsuite/g++.dg/cpp23/class-deduction-inherited3.C
>  create mode 100644 gcc/testsuite/g++.dg/cpp23/class-deduction-inherited4.C
> 
> diff --git a/gcc/cp/cp-tree.h b/gcc/cp/cp-tree.h
> index 7b0b7c6a17e..abc467fb290 100644
> --- a/gcc/cp/cp-tree.h
> +++ b/gcc/cp/cp-tree.h
> @@ -7457,7 +7457,7 @@ extern tree fn_type_unification (tree, 
> tree, tree,
>bool, bool);
>  extern void mark_decl_instantiated   (tree, int);
>  extern int more_specialized_fn   (tree, tree, int);
> -extern bool type_targs_deducible_from(tree, tree);
> +extern tree type_targs_deducible_from(tree, tree);
>  extern void do_decl_instantiation(tree, tree);
>  extern void do_type_instantiation(tree, tree, tsubst_flags_t);
>  extern bool always_instantiate_p (tree);
> diff --git a/gcc/cp/pt.cc b/gcc/cp/pt.cc
> index 092e6fdfd36..8b7aa96cf01 100644
> --- a/gcc/cp/pt.cc
> +++ b/gcc/cp/pt.cc
> @@ -223,6 +223,9 @@ static void instantiate_body (tree pattern, tree args, 
> tree d, bool nested);
>  static tree maybe_dependent_member_ref (tree, tree, tsubst_flags_t, tree);
>  static void mark_template_arguments_used (tree, tree);
>  static bool uses_outer_template_parms (tree);
> +static tree alias_ctad_tweaks (tree, tree);
> +static tree inherited_ctad_tweaks (tree, tree, tsubst_flags_t);
> +static tree deduction_guides_for (tree, bool&, tsubst_flags_t);
>  
>  /* Make the current scope suitable for access checking when we are
> processing T.  T can be FUNCTION_DECL for instantiated function
> @@ -29736,8 +29739,6 @@ is_spec_or_derived (tree etype, tree tmpl)
>return !err;
>  }
>  
> -static tree alias_ctad_tweaks (tree, tree);
> -
>  /* Return a C++20 aggregate deduction candidate for TYPE initial

Re: [PATCH v2 3/7] aarch64: Add eh_return compile tests

2023-11-27 Thread Szabolcs Nagy
The 11/26/2023 14:37, Richard Sandiford wrote:
> Szabolcs Nagy  writes:
> > +++ b/gcc/testsuite/gcc.target/aarch64/eh_return-3.c
> > @@ -0,0 +1,30 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -mbranch-protection=pac-ret+leaf" } */
> 
> Probably best to add -fno-schedule-insns -fno-schedule-insns2, so that the
> instructions in the check-function-bodies are in a more predictable order.
> 
> > +/* { dg-final { check-function-bodies "**" "" "" } } */
> > +
> > +/*
> > +**foo:
> > +** hint25 // paciasp
> > +** stp x0, x1, .*
> > +** stp x2, x3, .*
> > +** cbz w2, .*
> > +** mov x4, 0
> > +** ldp x2, x3, .*
> > +** ldp x0, x1, .*
> > +** cbz x4, .*
> > +** add sp, sp, x5
> > +** br  x6
> > +** hint29 // autiasp
> > +** ret
> > +** mov x5, x0
> > +** mov x6, x1
> > +** mov x4, 1
> > +** b   .*
> > +*/
> 
> What's the significance of x3 here?  It looks from the function definition
> like it should be undefined.  And what are the stps and ldps doing?
> 
> If those aren't an important part of the test, it might be better
> to stub them out with "...", e.g.:
> 
> /*
> **foo:
> **hint25 // paciasp
> **...
> **cbz w2, .*
> **mov x4, 0
> **...
> **cbz x4, .*
> **add sp, sp, x5
> **br  x6
> **hint29 // autiasp
> **ret
> **mov x5, x0
> **mov x6, x1
> **mov x4, 1
> **b   .*
> */
> 
> LGTM otherwise.

committed as

From cad7e1e3e0dea1922f89290bbbc27b4c44f53bf5 Mon Sep 17 00:00:00 2001
From: Szabolcs Nagy 
Date: Fri, 2 Jun 2023 14:17:02 +0100
Subject: [PATCH] aarch64: Add eh_return compile tests

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/eh_return-2.c: New test.
* gcc.target/aarch64/eh_return-3.c: New test.
---
 .../gcc.target/aarch64/eh_return-2.c  |  9 ++
 .../gcc.target/aarch64/eh_return-3.c  | 28 +++
 2 files changed, 37 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/eh_return-2.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/eh_return-3.c

diff --git a/gcc/testsuite/gcc.target/aarch64/eh_return-2.c 
b/gcc/testsuite/gcc.target/aarch64/eh_return-2.c
new file mode 100644
index 000..4a9d124e891
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/eh_return-2.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-final { scan-assembler "add\tsp, sp, x5" } } */
+/* { dg-final { scan-assembler "br\tx6" } } */
+
+void
+foo (unsigned long off, void *handler)
+{
+  __builtin_eh_return (off, handler);
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/eh_return-3.c 
b/gcc/testsuite/gcc.target/aarch64/eh_return-3.c
new file mode 100644
index 000..a17baa86501
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/eh_return-3.c
@@ -0,0 +1,28 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mbranch-protection=pac-ret+leaf -fno-schedule-insns 
-fno-schedule-insns2" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+/*
+**foo:
+** hint25 // paciasp
+** ...
+** cbz w2, .*
+** mov x4, 0
+** ...
+** cbz x4, .*
+** add sp, sp, x5
+** br  x6
+** hint29 // autiasp
+** ret
+** mov x5, x0
+** mov x4, 1
+** mov x6, x1
+** b   .*
+*/
+void
+foo (unsigned long off, void *handler, int c)
+{
+  if (c)
+return;
+  __builtin_eh_return (off, handler);
+}
-- 
2.25.1



Re: Ping: [PATCH] Fix PR112419

2023-11-27 Thread Martin Uecker
On Monday, 2023-11-27 at 08:36 -0700, Jeff Law wrote:
> 
> On 11/23/23 10:05, Hans-Peter Nilsson wrote:
> > > From: Hans-Peter Nilsson 
> > > Date: Thu, 16 Nov 2023 05:24:06 +0100
> > > 
> > > > From: Martin Uecker 
> > > > Date: Tue, 07 Nov 2023 06:56:25 +0100
> > > 
> > > > On Monday, 2023-11-06 at 21:01 -0700, Jeff Law wrote:
> > > > > 
> > > > > On 11/6/23 20:58, Hans-Peter Nilsson wrote:
> > > > > > This patch caused a testsuite regression: there's now an
> > > > > > "excess error" failure for gcc.dg/Wnonnull-4.c for 32-bit
> > > > > > targets (and 64-bit targets testing with a "-m32" option)
> > > > > > after your r14-5115-g6e9ee44d96e5.  It's logged as PR112419.
> > > > > It caused failures for just about every target ;(  Presumably it 
> > > > > worked
> > > > > on x86_64...
> > > > 
> > > > I do not think this is a true regression,
> > > > just a problem with the test on 32-bit which somehow surfaced
> > > > due to the change.
> > > > 
> > > > The excess error is:
> > > > 
> > > > FAIL: gcc.dg/Wnonnull-4.c (test for excess errors)
> > > > Excess errors:
> > > > /home/tcwg-buildslave/workspace/tcwg_gnu_6/abe/snapshots/gcc.git~master/gcc/testsuite/gcc.dg/Wnonnull-4.c:144:3:
> > > >  warning: 'fda_n_5' specified size 4294967256 exceeds maximum object 
> > > > size
> > > > 2147483647 [-Wstringop-overflow=]
> > > > 
> > > > I think the warning was suppressed before due to the other (nonnull)
> > > > warning which I removed in this case.
> > > > 
> > > > I think the simple fix might be to turn off -Wstringop-overflow.
> > > 
> > > No, that triggers many of the dg-warnings that are tested.
> > > 
> > > (I didn't pay attention to the actual warning messages and
> > > tried to pursue that at first.)
> > > 
> > > Maybe it's best to actually expect the warning, like
> > > so.
> > > 
> > > Maintainers of 16-bit targets will have to address their
> > > concerns separately.  For example, they may choose to not
> > > run the test at all.
> > > 
> > > Ok to commit?
> > > 
> > > Subject: [PATCH] gcc.dg/Wnonnull-4.c: Handle new overflow warning for 
> > > 32-bit targets [PR112419]
> > > 
> > >   PR testsuite/112419
> > >   * gcc.dg/Wnonnull-4.c (test_fda_n_5): Expect warning for exceeding
> > >   maximum object size for 32-bit targets.
> > > ---
> > >   gcc/testsuite/gcc.dg/Wnonnull-4.c | 1 +
> > >   1 file changed, 1 insertion(+)
> > > 
> > > diff --git a/gcc/testsuite/gcc.dg/Wnonnull-4.c 
> > > b/gcc/testsuite/gcc.dg/Wnonnull-4.c
> > > index 1f14fbba45df..d63e76da70a2 100644
> > > --- a/gcc/testsuite/gcc.dg/Wnonnull-4.c
> > > +++ b/gcc/testsuite/gcc.dg/Wnonnull-4.c
> > > @@ -142,6 +142,7 @@ void test_fda_n_5 (int r_m1)
> > > T (  1);  // { dg-bogus "argument 2 of variable length array 
> > > 'double\\\[n]\\\[5]' is null but the corresponding bound argument 1 value 
> > > is 1" }
> > > T (  9);  // { dg-bogus "argument 2 of variable length array 
> > > 'double\\\[n]\\\[5]' is null but the corresponding bound argument 1 value 
> > > is 9" }
> > > T (max);  // { dg-bogus "argument 2 of variable length array 
> > > 'double\\\[n]\\\[5]' is null but the corresponding bound argument 1 value 
> > > is \\d+" }
> > > +// { dg-warning "size 4294967256 exceeds maximum object size" "" { 
> > > target ilp32 } .-1 }
> > >   }
> Unfortunately I think we need to go back to the original issue that 
> Martin (I think) dismissed.
> 
> Specifically, this is a regression.  It's very clear that prior to the 
> patch in question there was no diagnostic about the size of the 
> requested memory allocation and after the patch in question we get the 
> "exceeds maximum object size" diagnostic.
> 
> Now one explanation could be that the diagnostic is warranted and it was 
> a bug that the diagnostic hadn't been emitted prior to Martin's patch. 
> In this case some kind of dg-blah is warranted, but I don't think anyone 
> has made this argument.
> 
I believe the warning is correct but was suppressed before.


My plan was to split up the test case into one which is for
-Wstringop-overflow and one which is for -Wnonnull and then
one could turn off the -Wstringop-overflow for the tests
which are actually for -Wnonnull.  But adding the dg-blah
would certainly be simpler.


Martin





Re: GCC/Rust libgrust-v2/to-submit branch

2023-11-27 Thread Thomas Schwinge
Hi!

On 2023-11-21T16:20:22+0100, Arthur Cohen  wrote:
> A newer version of the library has been force-pushed to the branch
> `libgrust-v2/to-submit`.

> On 11/20/23 15:55, Thomas Schwinge wrote:
>> Arthur and Pierre-Emmanuel have prepared a GCC/Rust libgrust-v2/to-submit
>> branch: .
>> In that one, most of the issues raised have been addressed, and which
>> I've now successfully "tested" in my different GCC configurations,
>> requiring just one additional change (see end of this email).  I'm using
>> "tested" in quotes here, as libgrust currently is still missing its
>> eventual content, and still is without actual users, so we may still be
>> up for surprises later on.  ;-)

>> On 2023-10-27T22:41:52+0200, I wrote:
>>> On 2023-09-27T00:25:16+0200, I wrote:
 don't we also directly need to
 incorporate here a few GCC/Rust master branch follow-on commits, like:

- commit 171ea4e2b3e202067c50f9c206974fbe1da691c0 "fixup: Fix bootstrap 
 build"
- commit 61cbe201029658c32e5c360823b9a1a17d21b03c "fixup: Fix missing 
 build dependency"
>>>
>>> I've not yet run into the need for these two.  Let's please leave these
>>> out of the upstream submission for now, until we understand what exactly
>>> these are necessary for.
>>
>> (Still the same.)
>
> Do you mean that we should remove the content of these commits from the
> submission? If so, I believe it's now done.

That's correct.  My theory is that "fixup: Fix bootstrap build" can be
dropped altogether (that is, reverted on GCC/Rust master branch; I'll
look into that later), and "fixup: Fix missing build dependency" will be
necessary once the GCC/Rust front end links against libgrust (that is,
will then move into that commit).

>>> However:
>>>
- commit 6a8b207b9ef7f9038e0cae7766117428783825d8 "libgrust: Add 
 dependency to libstdc++"
>>>
>>> ... this one definitely is necessary right now; see discussion in
>>> 
>>> "Disable target libgrust if we're not building target libstdc++".
>>
>> This one still isn't in the GCC/Rust libgrust-v2/to-submit branch -- but
>> having now tested that branch, I'm now no longer seeing the respective
>> build failure.  Isn't that change "libgrust: Add dependency to libstdc++"
>> still necessary, conceptually?  (Maybe we're just lucky, currently?)
>> I'll be sure to re-test in my different GCC configurations once libgrust
>> gains actual content and use.  (..., which might then re-expose the
>> original problem?)

So I guess I really was just lucky in my testing, because later I
did run into the need for that commit again, so:

> This commit was integrated into another one:
>
> fb31093105e build: Add libgrust as compilation modules
>
> (on libgrust-v2/to-submit as of 2 minutes ago)

ACK.

> --- a/gcc/rust/config-lang.in
> +++ b/gcc/rust/config-lang.in

> +target_libs="target-libffi target-libbacktrace target-libgrust"

 Please don't add back 'target-libffi' and 'target-libbacktrace' here;
 just 'target-libgrust'.  (As is present in GCC/Rust master branch, and
 per commit 7411eca498beb13729cc2acec77e68250940aa81
 "Rust: Don't depend on unused 'target-libffi', 'target-libbacktrace'".)
>>>
>>> ... that change is necessary, too.
>>
>> That's still unchanged in the GCC/Rust libgrust-v2/to-submit branch;
>> please apply to 'gcc/rust/config-lang.in':
>>
>>  -target_libs="target-libffi target-libbacktrace target-libgrust"
>>  +target_libs=target-libgrust

(That's now been addressed, too.)

>> Then, still should re-order the commits so that (re)generation of
>> auto-generated files comes before use of libgrust (so that later
>> bisection doesn't break), and move the 'contrib/gcc_update' update into
>> the commit that adds the auto-generated files.
>
> Do you mean that the regeneration should happen before the commit adding
> the proc_macro library? Or that when we keep going and adding more
> commits on top of this, we need to make sure the regeneration commit
> happens before any code starts using/depending on libgrust/?

My point is: once the 'gcc/rust/config-lang.in' changes appear (when a
'git bisect' tests commit "build: Add libgrust as compilation modules",
by chance), the GCC build system will then try to build libgrust.  But
given that, at that time in the commit history, the libgrust build system
('libgrust/configure' etc.) is not yet present, the GCC build will fail.

So I suggest:

  - "libgrust: Add entry for maintainers and stub changelog file"
  - "libgrust: Add libproc_macro and build system"
... plus 'autoreconf' in 'libgrust/' folded in.
... plus 'contrib/gcc_update' update moved here.
  - "build: Add libgrust as compilation modules"
... plus "Disable target libgrust if missing libstdc++" folded in.
    ... plus 'autoreconf' and 'autogen' in '/' folded in.
  - "Regenerate build files"

Re: [PATCH][RFC] middle-end/110237 - wrong MEM_ATTRs for partial loads/stores

2023-11-27 Thread Jeff Law




On 11/27/23 05:39, Robin Dapp wrote:

The easiest way to avoid running into the alias analysis problem is
to scrap the MEM_EXPR when we expand the internal functions for
partial loads/stores.  That avoids the disambiguation we run into
which is realizing that we store to an object of smaller size than
the size of the mode we appear to store.

After the patch we see just

   [1  S64 A32]

so we preserve the alias set, the alignment and the size (the size
is redundant if the MEM insn't BLKmode).  That's still not good
in case the RTL alias oracle would implement the same
disambiguation but it fends off the gimple one.

This fixes gcc.dg/torture/pr58955-2.c when built with AVX512
and --param=vect-partial-vector-usage=1.


On riscv we're seeing a similar problem across the testsuite
and several execution failures as a result.  In the case I
looked at we move a scalar load upwards over a partial store
that aliases the load.

I independently arrived at the spot mentioned in
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110237#c4
before knowing about the PR.

I can confirm that your RFC patch fixes at least two of the
failures; I haven't checked the others, but they are very likely
similar.
FWIW, it should always be safe to ignore the memory attributes.   So if 
there's a reasonable condition here, then we can use it and just ignore 
the attribute.


Does the attribute on a partial load/store indicate the potential 
load/store size, or does it indicate the actual known load/store size?
If the former, then we probably need to treat it as a may-read/may-write 
kind of reference.


Jeff


Re: [PATCH] aarch64: Improve cost of `a ? {-,}1 : b`

2023-11-27 Thread Richard Sandiford
Richard Sandiford  writes:
> Andrew Pinski  writes:
>> While looking into PR 112454, I found the cost for
>> `(if_then_else (cmp) (const_int 1) (reg))` was being recorded as 8
>> (or `COSTS_N_INSNS (2)`) but it should have been 4 (or `COSTS_N_INSNS (1)`).
>> This improves the cost by not adding the cost of `(const_int 1)` to
>> the total cost.
>>
>> It does not fix PR 112454 as that requires other changes to forwprop
>> the `(const_int 1)` earlier than combine.
>>
>> Bootstrapped and tested on aarch64-linux-gnu with no regressions.
>>
>> gcc/ChangeLog:
>>
>>  * config/aarch64/aarch64.cc (aarch64_if_then_else_costs):
>>  Don't add the cost of `1` or `-1`.
>>
>> Signed-off-by: Andrew Pinski 
>> ---
>>  gcc/config/aarch64/aarch64.cc | 13 ++---
>>  1 file changed, 10 insertions(+), 3 deletions(-)
>>
>> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
>> index f6f6f94bf43..63241c5aaa5 100644
>> --- a/gcc/config/aarch64/aarch64.cc
>> +++ b/gcc/config/aarch64/aarch64.cc
>> @@ -11642,9 +11642,16 @@ aarch64_if_then_else_costs (rtx op0, rtx op1, rtx 
>> op2, int *cost, bool speed)
>>  /* CSINV/NEG with zero extend + const 0 (*csinv3_uxtw_insn3).  */
>>  op1 = XEXP (inner, 0);
>>  }
>> -
>> -  *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
>> -  *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
>> +  if (op2 == constm1_rtx || op2 == const1_rtx)
>> +*cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
>> +  else if (op1 == constm1_rtx || op1 == const1_rtx)
>> +*cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
>
> It looks like this is really an extra option on top of the previous
> if-else chain, since it only applies when OP1 and OP2 are still the
> operands of the if_then_else.  So how about:
>
>   else if (op1 == constm1_rtx || op1 == const1_rtx)
> {
> /* Use CSINV.  */

eh, of course I meant CSINV or CSINC...

> *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
> return true;
> }
>   else if (op2 == constm1_rtx || op2 == const1_rtx)
> {
> /* Use CSINV.  */
> *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
> return true;
> }
>
> leaving the code to fall through to:
>
>   *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
>   *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
>   return true;
>
> as it does currently.  OK in that form if you agree.
>
> Let me know if you don't.  But in that case:
>
>> +  else
>> +{
>> +  *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
>> +  *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 1, speed);
>
> should be 2, speed
>
>> +}
>> +  
>
> Thanks,
> Richard


Re: Ping: [PATCH] Fix PR112419

2023-11-27 Thread Jeff Law




On 11/23/23 10:05, Hans-Peter Nilsson wrote:

From: Hans-Peter Nilsson 
Date: Thu, 16 Nov 2023 05:24:06 +0100


From: Martin Uecker 
Date: Tue, 07 Nov 2023 06:56:25 +0100



On Monday, 2023-11-06 at 21:01 -0700, Jeff Law wrote:


On 11/6/23 20:58, Hans-Peter Nilsson wrote:

This patch caused a testsuite regression: there's now an
"excess error" failure for gcc.dg/Wnonnull-4.c for 32-bit
targets (and 64-bit targets testing with a "-m32" option)
after your r14-5115-g6e9ee44d96e5.  It's logged as PR112419.

It caused failures for just about every target ;(  Presumably it worked
on x86_64...


I do not think this is a true regression,
just a problem with the test on 32-bit which somehow surfaced
due to the change.

The excess error is:

FAIL: gcc.dg/Wnonnull-4.c (test for excess errors)
Excess errors:
/home/tcwg-buildslave/workspace/tcwg_gnu_6/abe/snapshots/gcc.git~master/gcc/testsuite/gcc.dg/Wnonnull-4.c:144:3:
 warning: 'fda_n_5' specified size 4294967256 exceeds maximum object size
2147483647 [-Wstringop-overflow=]

I think the warning was suppressed before due to the other (nonnull)
warning which I removed in this case.

I think the simple fix might be to turn off -Wstringop-overflow.


No, that triggers many of the dg-warnings that are tested.

(I didn't pay attention to the actual warning messages and
tried to pursue that at first.)

Maybe it's best to actually expect the warning, like
so.

Maintainers of 16-bit targets will have to address their
concerns separately.  For example, they may choose to not
run the test at all.

Ok to commit?

Subject: [PATCH] gcc.dg/Wnonnull-4.c: Handle new overflow warning for 32-bit 
targets [PR112419]

PR testsuite/112419
* gcc.dg/Wnonnull-4.c (test_fda_n_5): Expect warning for exceeding
maximum object size for 32-bit targets.
---
  gcc/testsuite/gcc.dg/Wnonnull-4.c | 1 +
  1 file changed, 1 insertion(+)

diff --git a/gcc/testsuite/gcc.dg/Wnonnull-4.c 
b/gcc/testsuite/gcc.dg/Wnonnull-4.c
index 1f14fbba45df..d63e76da70a2 100644
--- a/gcc/testsuite/gcc.dg/Wnonnull-4.c
+++ b/gcc/testsuite/gcc.dg/Wnonnull-4.c
@@ -142,6 +142,7 @@ void test_fda_n_5 (int r_m1)
T (  1);  // { dg-bogus "argument 2 of variable length array 
'double\\\[n]\\\[5]' is null but the corresponding bound argument 1 value is 1" }
T (  9);  // { dg-bogus "argument 2 of variable length array 
'double\\\[n]\\\[5]' is null but the corresponding bound argument 1 value is 9" }
T (max);  // { dg-bogus "argument 2 of variable length array 
'double\\\[n]\\\[5]' is null but the corresponding bound argument 1 value is \\d+" }
+// { dg-warning "size 4294967256 exceeds maximum object size" "" { target 
ilp32 } .-1 }
  }
Unfortunately I think we need to go back to the original issue that 
Martin (I think) dismissed.


Specifically, this is a regression.  It's very clear that prior to the 
patch in question there was no diagnostic about the size of the 
requested memory allocation and after the patch in question we get the 
"exceeds maximum object size" diagnostic.


Now one explanation could be that the diagnostic is warranted and it was 
a bug that the diagnostic hadn't been emitted prior to Martin's patch. 
In this case some kind of dg-blah is warranted, but I don't think anyone 
has made this argument.



Jeff


Re: [PATCH] aarch64: Improve cost of `a ? {-,}1 : b`

2023-11-27 Thread Richard Sandiford
Andrew Pinski  writes:
> While looking into PR 112454, I found the cost for
> `(if_then_else (cmp) (const_int 1) (reg))` was being recorded as 8
> (or `COSTS_N_INSNS (2)`) but it should have been 4 (or `COSTS_N_INSNS (1)`).
> This improves the cost by not adding the cost of `(const_int 1)` to
> the total cost.
>
> It does not fix PR 112454 as that requires other changes to forwprop
> the `(const_int 1)` earlier than combine.
>
> Bootstrapped and tested on aarch64-linux-gnu with no regressions.
>
> gcc/ChangeLog:
>
>   * config/aarch64/aarch64.cc (aarch64_if_then_else_costs):
>   Don't add the cost of `1` or `-1`.
>
> Signed-off-by: Andrew Pinski 
> ---
>  gcc/config/aarch64/aarch64.cc | 13 ++---
>  1 file changed, 10 insertions(+), 3 deletions(-)
>
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index f6f6f94bf43..63241c5aaa5 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -11642,9 +11642,16 @@ aarch64_if_then_else_costs (rtx op0, rtx op1, rtx 
> op2, int *cost, bool speed)
>   /* CSINV/NEG with zero extend + const 0 (*csinv3_uxtw_insn3).  */
>   op1 = XEXP (inner, 0);
>   }
> -
> -  *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
> -  *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
> +  if (op2 == constm1_rtx || op2 == const1_rtx)
> + *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
> +  else if (op1 == constm1_rtx || op1 == const1_rtx)
> + *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);

It looks like this is really an extra option on top of the previous
if-else chain, since it only applies when OP1 and OP2 are still the
operands of the if_then_else.  So how about:

  else if (op1 == constm1_rtx || op1 == const1_rtx)
{
  /* Use CSINV.  */
  *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
  return true;
}
  else if (op2 == constm1_rtx || op2 == const1_rtx)
{
  /* Use CSINV.  */
  *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
  return true;
}

leaving the code to fall through to:

  *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
  *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
  return true;

as it does currently.  OK in that form if you agree.

Let me know if you don't.  But in that case:

> +  else
> + {
> +   *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
> +   *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 1, speed);

should be 2, speed

> + }
> +  

Thanks,
Richard


Re: PR111754

2023-11-27 Thread Prathamesh Kulkarni
On Fri, 24 Nov 2023 at 03:13, Richard Sandiford
 wrote:
>
> Prathamesh Kulkarni  writes:
> > On Thu, 26 Oct 2023 at 09:43, Prathamesh Kulkarni
> >  wrote:
> >>
> >> On Thu, 26 Oct 2023 at 04:09, Richard Sandiford
> >>  wrote:
> >> >
> >> > Prathamesh Kulkarni  writes:
> >> > > On Wed, 25 Oct 2023 at 02:58, Richard Sandiford
> >> > >  wrote:
> >> > >> So I think the PR could be solved by something like the attached.
> >> > >> Do you agree?  If so, could you base the patch on this instead?
> >> > >>
> >> > >> Only tested against the self-tests.
> >> > >>
> >> > >> Thanks,
> >> > >> Richard
> >> > >>
> >> > >> diff --git a/gcc/fold-const.cc b/gcc/fold-const.cc
> >> > >> index 40767736389..00fce4945a7 100644
> >> > >> --- a/gcc/fold-const.cc
> >> > >> +++ b/gcc/fold-const.cc
> >> > >> @@ -10743,27 +10743,37 @@ fold_vec_perm_cst (tree type, tree arg0, 
> >> > >> tree arg1, const vec_perm_indices &sel,
> >> > >>unsigned res_npatterns, res_nelts_per_pattern;
> >> > >>unsigned HOST_WIDE_INT res_nelts;
> >> > >>
> >> > >> -  /* (1) If SEL is a suitable mask as determined by
> >> > >> - valid_mask_for_fold_vec_perm_cst_p, then:
> >> > >> - res_npatterns = max of npatterns between ARG0, ARG1, and SEL
> >> > >> - res_nelts_per_pattern = max of nelts_per_pattern between
> >> > >> -ARG0, ARG1 and SEL.
> >> > >> - (2) If SEL is not a suitable mask, and TYPE is VLS then:
> >> > >> - res_npatterns = nelts in result vector.
> >> > >> - res_nelts_per_pattern = 1.
> >> > >> - This exception is made so that VLS ARG0, ARG1 and SEL work as 
> >> > >> before.  */
> >> > >> -  if (valid_mask_for_fold_vec_perm_cst_p (arg0, arg1, sel, reason))
> >> > >> -{
> >> > >> -  res_npatterns
> >> > >> -   = std::max (VECTOR_CST_NPATTERNS (arg0),
> >> > >> -   std::max (VECTOR_CST_NPATTERNS (arg1),
> >> > >> - sel.encoding ().npatterns ()));
> >> > >> +  /* First try to implement the fold in a VLA-friendly way.
> >> > >> +
> >> > >> + (1) If the selector is simply a duplication of N elements, the
> >> > >> +result is likewise a duplication of N elements.
> >> > >> +
> >> > >> + (2) If the selector is N elements followed by a duplication
> >> > >> +of N elements, the result is too.
> >> > >>
> >> > >> -  res_nelts_per_pattern
> >> > >> -   = std::max (VECTOR_CST_NELTS_PER_PATTERN (arg0),
> >> > >> -   std::max (VECTOR_CST_NELTS_PER_PATTERN (arg1),
> >> > >> - sel.encoding ().nelts_per_pattern ()));
> >> > >> + (3) If the selector is N elements followed by an interleaving
> >> > >> +of N linear series, the situation is more complex.
> >> > >>
> >> > >> +valid_mask_for_fold_vec_perm_cst_p detects whether we
> >> > >> +can handle this case.  If we can, then each of the N linear
> >> > >> +series either (a) selects the same element each time or
> >> > >> +(b) selects a linear series from one of the input patterns.
> >> > >> +
> >> > >> +If (b) holds for one of the linear series, the result
> >> > >> +will contain a linear series, and so the result will have
> >> > >> +the same shape as the selector.  If (a) holds for all of
> >> > >> +the linear series, the result will be the same as (2) above.
> >> > >> +
> >> > >> +(b) can only hold if one of the input patterns has a
> >> > >> +stepped encoding.  */
> >> > >> +  if (valid_mask_for_fold_vec_perm_cst_p (arg0, arg1, sel, reason))
> >> > >> +{
> >> > >> +  res_npatterns = sel.encoding ().npatterns ();
> >> > >> +  res_nelts_per_pattern = sel.encoding ().nelts_per_pattern ();
> >> > >> +  if (res_nelts_per_pattern == 3
> >> > >> + && VECTOR_CST_NELTS_PER_PATTERN (arg0) < 3
> >> > >> + && VECTOR_CST_NELTS_PER_PATTERN (arg1) < 3)
> >> > >> +   res_nelts_per_pattern = 2;
> >> > > Um, in this case, should we set:
> >> > > res_nelts_per_pattern = max (nelts_per_pattern (arg0), 
> >> > > nelts_per_pattern(arg1))
> >> > > if both have nelts_per_pattern == 1 ?
> >> >
> >> > No, it still needs to be 2 even if arg0 and arg1 are duplicates.
> >> > E.g. consider a selector that picks the first element of arg0
> >> > followed by a duplicate of the first element of arg1.
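
[A concrete sketch of that case, with illustrative values not taken from
the patch:

  arg0 = { a, a, a, ... }      npatterns = 1, nelts_per_pattern = 1
  arg1 = { b, b, b, ... }      npatterns = 1, nelts_per_pattern = 1
  sel  = { 0, len, len, ... }  npatterns = 1, nelts_per_pattern = 2

  res  = { a, b, b, ... }      which needs nelts_per_pattern = 2

Encoding the result with nelts_per_pattern == 1 would drop the leading
element taken from arg0, even though both inputs are duplicates.]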
> >> >
> >> > > Also I suppose this matters only for non-integral element type, since
> >> > > for integral element type,
> >> > > vector_cst_elt will return the correct value even if the element is
> >> > > not explicitly encoded and input vector is dup ?
> >> >
> >> > Yeah, but it might help even for integers.  If we build fewer
> >> > elements explicitly, and so read fewer implicitly-encoded inputs,
> >> > there's less risk of running into:
> >> >
> >> >   if (!can_div_trunc_p (sel[i], len, &q, &r))
> >> > {
> >> >   if (reason)
> >> > *reason = "cannot divide selector element by arg len";
> >> 
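
[Concretely, for a VLA length len = 4 + 4x, a selector element of 5 would
trip this check: 5 / (4 + 4x) truncates to 1 when x = 0 but to 0 for any
x >= 1, so there is no single poly_int quotient to return.  Building fewer
elements explicitly means reading fewer implicitly-extended selector
values that could hit such a non-constant division.]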

Re: [patch] OpenMP: Add -Wopenmp and use it

2023-11-27 Thread Christophe Lyon
On Mon, 27 Nov 2023 at 11:33, Tobias Burnus  wrote:
>
> Hi,
>
> On 27.11.23 11:20, Christophe Lyon wrote:
>
> > I think the lack of final '.' in:
>
> Indeed - but you are lagging a bit behind:
>
> https://gcc.gnu.org/pipermail/gcc-patches/2023-November/638128.html
>
> [committed] c-family/c.opt (-Wopenmp): Add missing tailing '.'
>
> Fri Nov 24 18:56:21 GMT 2023
>
> Committed as r14-5835-g6eb1507107dee3
>

Great, thanks! Sorry for the noise; it's a bit hard and error-prone to
track which regressions have already been fixed and/or are being worked on.
Our bisect started at r14-5830, just a bit too early :-)

Thanks,

Christophe


> Tobias
>
> -
> Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 
> München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas 
> Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht 
> München, HRB 106955


Re: [PATCH 0/3] [GCC] arm: vld1q_types_xN ACLE intrinsics

2023-11-27 Thread Richard Earnshaw




On 06/10/2023 10:49, ezra.sito...@arm.com wrote:

Add xN variants of vld1q_types intrinsic.




These patches are all OK, but please fix commit message formatting in 
line with the comments on the earlier series.


R.


Re: [PATCH 0/3] [GCC] arm: vld1_types_xN ACLE intrinsics

2023-11-27 Thread Richard Earnshaw




On 19/10/2023 14:41, ezra.sito...@arm.com wrote:

Add xN variants of vld1_types intrinsic for AArch32.




These patches are all OK, but please fix the commit message formatting 
as with the earlier series.


R.


Re: [PATCH 3/3] [GCC] arm: vst1q_types_x4 ACLE intrinsics

2023-11-27 Thread Richard Earnshaw




On 10/10/2023 15:04, ezra.sito...@arm.com wrote:

From: Ezra Sitorus 

This patch is part of a series of patches implementing the _xN variants of the 
vst1q intrinsic for AArch32.
This patch adds the _x4 variants of the vst1q intrinsic.
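
[For context, a minimal usage sketch of one of the new _x4 variants; the
function and variable names here are invented for illustration:

  #include <arm_neon.h>

  /* Store four q-registers (16 int32 lanes, 64 bytes) with one call.  */
  void
  store4 (int32_t *out, int32x4x4_t vecs)
  {
    vst1q_s32_x4 (out, vecs);
  }
]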


OK, but see earlier comments about formatting.

R.



ACLE documents are at https://developer.arm.com/documentation/ihi0053/latest/
ISA documents are at https://developer.arm.com/documentation/ddi0487/latest/

gcc/ChangeLog:
 * config/arm/arm_neon.h
 (vst1q_u8_x4, vst1q_u16_x4, vst1q_u32_x4, vst1q_u64_x4): New.
 (vst1q_s8_x4, vst1q_s16_x4, vst1q_s32_x4, vst1q_s64_x4): New.
 (vst1q_f16_x4, vst1q_f32_x4): New.
 (vst1q_p8_x4, vst1q_p16_x4, vst1q_p64_x4): New.
 (vst1q_bf16_x4): New.
 * config/arm/arm_neon_builtins.def (vst1q_x4): New entries.
 * config/arm/neon.md (neon_vst1q_x4): New.

gcc/testsuite/ChangeLog:
 * gcc.target/arm/simd/vst1q_base_xN_1.c: Add new tests.
 * gcc.target/arm/simd/vst1q_bf16_xN_1.c: Add new tests.
 * gcc.target/arm/simd/vst1q_fp16_xN_1.c: Add new tests.
 * gcc.target/arm/simd/vst1q_p64_xN_1.c: Add new tests.
---
  gcc/config/arm/arm_neon.h | 114 ++
  gcc/config/arm/arm_neon_builtins.def  |   1 +
  gcc/config/arm/neon.md|  26 
  .../gcc.target/arm/simd/vst1q_base_xN_1.c |  59 +
  .../gcc.target/arm/simd/vst1q_bf16_xN_1.c |   8 +-
  .../gcc.target/arm/simd/vst1q_fp16_xN_1.c |   6 +
  .../gcc.target/arm/simd/vst1q_p64_xN_1.c  |   6 +
  7 files changed, 219 insertions(+), 1 deletion(-)

diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
index 46ee888410f..df3e23b6e95 100644
--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -11391,6 +11391,38 @@ vst1q_s64_x3 (int64_t * __a, int64x2x3_t __b)
__builtin_neon_vst1q_x3v2di ((__builtin_neon_di *) __a, __bu.__o);
  }
  
+__extension__ extern __inline void
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s8_x4 (int8_t * __a, int8x16x4_t __b)
+{
+  union { int8x16x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+  __builtin_neon_vst1q_x4v16qi ((__builtin_neon_qi *) __a, __bu.__o);
+}
+
+__extension__ extern __inline void
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s16_x4 (int16_t * __a, int16x8x4_t __b)
+{
+  union { int16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+  __builtin_neon_vst1q_x4v8hi ((__builtin_neon_hi *) __a, __bu.__o);
+}
+
+__extension__ extern __inline void
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s32_x4 (int32_t * __a, int32x4x4_t __b)
+{
+  union { int32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+  __builtin_neon_vst1q_x4v4si ((__builtin_neon_si *) __a, __bu.__o);
+}
+
+__extension__ extern __inline void
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s64_x4 (int64_t * __a, int64x2x4_t __b)
+{
+  union { int64x2x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+  __builtin_neon_vst1q_x4v2di ((__builtin_neon_di *) __a, __bu.__o);
+}
+
  __extension__ extern __inline void
  __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
  vst1_s8_x3 (int8_t * __a, int8x8x3_t __b)
@@ -11736,6 +11768,14 @@ vst1q_p64_x3 (poly64_t * __a, poly64x2x3_t __b)
__builtin_neon_vst1q_x3v2di ((__builtin_neon_di *) __a, __bu.__o);
  }
  
+__extension__ extern __inline void
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_p64_x4 (poly64_t * __a, poly64x2x4_t __b)
+{
+  union { poly64x2x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+  __builtin_neon_vst1q_x4v2di ((__builtin_neon_di *) __a, __bu.__o);
+}
+
  #pragma GCC pop_options
  __extension__ extern __inline void
  __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
@@ -11817,6 +11857,24 @@ vst1q_f32_x3 (float32_t * __a, float32x4x3_t __b)
__builtin_neon_vst1q_x3v4sf (__a, __bu.__o);
  }
  
+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+__extension__ extern __inline void
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_f16_x4 (float16_t * __a, float16x8x4_t __b)
+{
+  union { float16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+  __builtin_neon_vst1q_x4v8hf (__a, __bu.__o);
+}
+#endif
+
+__extension__ extern __inline void
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_f32_x4 (float32_t * __a, float32x4x4_t __b)
+{
+  union { float32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+  __builtin_neon_vst1q_x4v4sf (__a, __bu.__o);
+}
+
  __extension__ extern __inline void
  __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
  vst1q_u8 (uint8_t * __a, uint8x16_t __b)
@@ -11909,6 +11967,38 @@ vst1q_u64_x3 (uint64_t * __a, uint64x2x3_t __b)
__builtin_neon_vst1q_x3v2di ((__builtin_neon_di *) __a, __bu.__o);
  }
  
+__extension__ extern __inline void
+

Re: [PATCH 2/3] [GCC] arm: vst1q_types_x3 ACLE intrinsics

2023-11-27 Thread Richard Earnshaw




On 10/10/2023 15:04, ezra.sito...@arm.com wrote:

From: Ezra Sitorus 

This patch is part of a series of patches implementing the _xN variants of the 
vst1q intrinsic for AArch32.
This patch adds the _x3 variants of the vst1q intrinsic.
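
[Semantically, each _x3 store behaves like three consecutive vst1q stores
of the tuple's fields; a hedged reference illustration, not the code the
builtin expands to:

  #include <arm_neon.h>

  /* Reference behaviour of vst1q_f32_x3: val[0..2] are stored back to
     back, four floats apart.  */
  void
  vst1q_f32_x3_ref (float32_t *a, float32x4x3_t b)
  {
    vst1q_f32 (a,     b.val[0]);
    vst1q_f32 (a + 4, b.val[1]);
    vst1q_f32 (a + 8, b.val[2]);
  }
]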


OK, but format lines to <= 70 columns please.

R.


ACLE documents are at https://developer.arm.com/documentation/ihi0053/latest/
ISA documents are at https://developer.arm.com/documentation/ddi0487/latest/

gcc/ChangeLog:
 * config/arm/arm_neon.h
 (vst1q_u8_x3, vst1q_u16_x3, vst1q_u32_x3, vst1q_u64_x3): New.
 (vst1q_s8_x3, vst1q_s16_x3, vst1q_s32_x3, vst1q_s64_x3): New.
 (vst1q_f16_x3, vst1q_f32_x3): New.
 (vst1q_p8_x3, vst1q_p16_x3, vst1q_p64_x3): New.
 (vst1q_bf16_x3): New.
 * config/arm/arm_neon_builtins.def (vst1q_x3): New entries.
 * config/arm/neon.md (neon_vst1q_x3): New.

gcc/testsuite/ChangeLog:
 * gcc.target/arm/simd/vst1q_base_xN_1.c: Add new tests.
 * gcc.target/arm/simd/vst1q_bf16_xN_1.c: Add new tests.
 * gcc.target/arm/simd/vst1q_fp16_xN_1.c: Add new tests.
 * gcc.target/arm/simd/vst1q_p64_xN_1.c: Add new tests.
---
  gcc/config/arm/arm_neon.h | 114 ++
  gcc/config/arm/arm_neon_builtins.def  |   1 +
  gcc/config/arm/neon.md|  24 
  .../gcc.target/arm/simd/vst1q_base_xN_1.c |  60 +
  .../gcc.target/arm/simd/vst1q_bf16_xN_1.c |   6 +
  .../gcc.target/arm/simd/vst1q_fp16_xN_1.c |   6 +
  .../gcc.target/arm/simd/vst1q_p64_xN_1.c  |   6 +
  7 files changed, 217 insertions(+)

diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
index b8f3fca3060..46ee888410f 100644
--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -11359,6 +11359,38 @@ vst1q_s64_x2 (int64_t * __a, int64x2x2_t __b)
__builtin_neon_vst1q_x2v2di ((__builtin_neon_di *) __a, __bu.__o);
  }
  
+__extension__ extern __inline void
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s8_x3 (int8_t * __a, int8x16x3_t __b)
+{
+  union { int8x16x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+  __builtin_neon_vst1q_x3v16qi ((__builtin_neon_qi *) __a, __bu.__o);
+}
+
+__extension__ extern __inline void
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s16_x3 (int16_t * __a, int16x8x3_t __b)
+{
+  union { int16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+  __builtin_neon_vst1q_x3v8hi ((__builtin_neon_hi *) __a, __bu.__o);
+}
+
+__extension__ extern __inline void
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s32_x3 (int32_t * __a, int32x4x3_t __b)
+{
+  union { int32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+  __builtin_neon_vst1q_x3v4si ((__builtin_neon_si *) __a, __bu.__o);
+}
+
+__extension__ extern __inline void
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s64_x3 (int64_t * __a, int64x2x3_t __b)
+{
+  union { int64x2x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+  __builtin_neon_vst1q_x3v2di ((__builtin_neon_di *) __a, __bu.__o);
+}
+
  __extension__ extern __inline void
  __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
  vst1_s8_x3 (int8_t * __a, int8x8x3_t __b)
@@ -11696,6 +11728,14 @@ vst1q_p64_x2 (poly64_t * __a, poly64x2x2_t __b)
__builtin_neon_vst1q_x2v2di ((__builtin_neon_di *) __a, __bu.__o);
  }
  
+__extension__ extern __inline void
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_p64_x3 (poly64_t * __a, poly64x2x3_t __b)
+{
+  union { poly64x2x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+  __builtin_neon_vst1q_x3v2di ((__builtin_neon_di *) __a, __bu.__o);
+}
+
  #pragma GCC pop_options
  __extension__ extern __inline void
  __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
@@ -11759,6 +11799,24 @@ vst1q_f32_x2 (float32_t * __a, float32x4x2_t __b)
__builtin_neon_vst1q_x2v4sf (__a, __bu.__o);
  }
  
+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+__extension__ extern __inline void
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_f16_x3 (float16_t * __a, float16x8x3_t __b)
+{
+  union { float16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+  __builtin_neon_vst1q_x3v8hf (__a, __bu.__o);
+}
+#endif
+
+__extension__ extern __inline void
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_f32_x3 (float32_t * __a, float32x4x3_t __b)
+{
+  union { float32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+  __builtin_neon_vst1q_x3v4sf (__a, __bu.__o);
+}
+
  __extension__ extern __inline void
  __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
  vst1q_u8 (uint8_t * __a, uint8x16_t __b)
@@ -11819,6 +11877,38 @@ vst1q_u64_x2 (uint64_t * __a, uint64x2x2_t __b)
__builtin_neon_vst1q_x2v2di ((__builtin_neon_di *) __a, __bu.__o);
  }
  
+__exten

Re: [PATCH 1/3] [GCC] arm: vst1q_types_x2 ACLE intrinsics

2023-11-27 Thread Richard Earnshaw




On 10/10/2023 15:04, ezra.sito...@arm.com wrote:

From: Ezra Sitorus 

This patch is part of a series of patches implementing the _xN variants of the 
vst1q intrinsic for AArch32.
This patch adds the _x2 variants of the vst1q intrinsic. Tests use the xN
naming so that the later variants (_x3, _x4) can be added.
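
[A short usage sketch; the function and variable names are invented for
illustration:

  #include <arm_neon.h>

  /* Pack two q-registers and store 32 bytes with a single intrinsic.  */
  void
  store_pair (uint8_t *dst, uint8x16_t lo, uint8x16_t hi)
  {
    uint8x16x2_t v = { { lo, hi } };
    vst1q_u8_x2 (dst, v);
  }
]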

ACLE documents are at https://developer.arm.com/documentation/ihi0053/latest/
ISA documents are at https://developer.arm.com/documentation/ddi0487/latest/

gcc/ChangeLog:
 * config/arm/arm_neon.h
 (vst1q_u8_x2, vst1q_u16_x2, vst1q_u32_x2, vst1q_u64_x2): New.


The same issues that I noted on the previous set apply here too.

Otherwise OK.

R.


 (vst1q_s8_x2, vst1q_s16_x2, vst1q_s32_x2, vst1q_s64_x2): New.
 (vst1q_f16_x2, vst1q_f32_x2): New.
 (vst1q_p8_x2, vst1q_p16_x2, vst1q_p64_x2): New.
 (vst1q_bf16_x2): New.
 * config/arm/arm_neon_builtins.def (vst1_x2): New entries.
 * config/arm/neon.md (neon_vst1<VMEMX2_q>_x2): Updated from
 neon_vst1q_x2.
 * config/arm/iterators.md (VMEMX2): New mode iterator.
 (VMEMX2_q): New mode attribute.

gcc/testsuite/ChangeLog:
 * gcc.target/arm/simd/vst1q_base_xN_1.c: Add new tests.
 * gcc.target/arm/simd/vst1q_bf16_xN_1.c: Add new tests.
 * gcc.target/arm/simd/vst1q_fp16_xN_1.c: Add new tests.
 * gcc.target/arm/simd/vst1q_p64_xN_1.c: Add new tests.
---
  gcc/config/arm/arm_neon.h | 114 ++
  gcc/config/arm/arm_neon_builtins.def  |   1 +
  gcc/config/arm/iterators.md   |   6 +
  gcc/config/arm/neon.md|   6 +-
  .../gcc.target/arm/simd/vst1q_base_xN_1.c |  70 +++
  .../gcc.target/arm/simd/vst1q_bf16_xN_1.c |  13 ++
  .../gcc.target/arm/simd/vst1q_fp16_xN_1.c |  13 ++
  .../gcc.target/arm/simd/vst1q_p64_xN_1.c  |  13 ++
  8 files changed, 233 insertions(+), 3 deletions(-)
  create mode 100644 gcc/testsuite/gcc.target/arm/simd/vst1q_base_xN_1.c
  create mode 100644 gcc/testsuite/gcc.target/arm/simd/vst1q_bf16_xN_1.c
  create mode 100644 gcc/testsuite/gcc.target/arm/simd/vst1q_fp16_xN_1.c
  create mode 100644 gcc/testsuite/gcc.target/arm/simd/vst1q_p64_xN_1.c

diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
index 41e645d8352..b8f3fca3060 100644
--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -11327,6 +11327,38 @@ vst1_s64_x2 (int64_t * __a, int64x1x2_t __b)
__builtin_neon_vst1_x2di ((__builtin_neon_di *) __a, __bu.__o);
  }
  
+__extension__ extern __inline void
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s8_x2 (int8_t * __a, int8x16x2_t __b)
+{
+  union { int8x16x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+  __builtin_neon_vst1q_x2v16qi ((__builtin_neon_qi *) __a, __bu.__o);
+}
+
+__extension__ extern __inline void
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s16_x2 (int16_t * __a, int16x8x2_t __b)
+{
+  union { int16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+  __builtin_neon_vst1q_x2v8hi ((__builtin_neon_hi *) __a, __bu.__o);
+}
+
+__extension__ extern __inline void
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s32_x2 (int32_t * __a, int32x4x2_t __b)
+{
+  union { int32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+  __builtin_neon_vst1q_x2v4si ((__builtin_neon_si *) __a, __bu.__o);
+}
+
+__extension__ extern __inline void
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s64_x2 (int64_t * __a, int64x2x2_t __b)
+{
+  union { int64x2x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+  __builtin_neon_vst1q_x2v2di ((__builtin_neon_di *) __a, __bu.__o);
+}
+
  __extension__ extern __inline void
  __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
  vst1_s8_x3 (int8_t * __a, int8x8x3_t __b)
@@ -11656,6 +11688,14 @@ vst1q_p64 (poly64_t * __a, poly64x2_t __b)
__builtin_neon_vst1v2di ((__builtin_neon_di *) __a, (int64x2_t) __b);
  }
  
+__extension__ extern __inline void
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_p64_x2 (poly64_t * __a, poly64x2x2_t __b)
+{
+  union { poly64x2x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+  __builtin_neon_vst1q_x2v2di ((__builtin_neon_di *) __a, __bu.__o);
+}
+
  #pragma GCC pop_options
  __extension__ extern __inline void
  __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
@@ -11701,6 +11741,24 @@ vst1q_f32 (float32_t * __a, float32x4_t __b)
__builtin_neon_vst1v4sf ((__builtin_neon_sf *) __a, __b);
  }
  
+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+__extension__ extern __inline void
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_f16_x2 (float16_t * __a, float16x8x2_t __b)
+{
+  union { float16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+  __builtin_neon_vst1q_x2v8hf (__a, __bu.__o);
+}
+#
