Re: [PATCH] Add VXRM enum

2023-07-13 Thread Robin Dapp via Gcc-patches
> +enum __RISCV_VXRM {
> +  __RISCV_VXRM_RNU = 0,
> +  __RISCV_VXRM_RNE = 1,
> +  __RISCV_VXRM_RDN = 2,
> +  __RISCV_VXRM_ROD = 3,
> +};
> +
>  __extension__ extern __inline unsigned long
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  vread_csr(enum RVV_CSR csr)

We have that already in riscv-protos.h :)
(fixed_point_rounding_mode)

Regards
 Robin



Re: [PATCH] Fix part of PR 110293: `A NEEQ (A NEEQ CST)` part

2023-07-13 Thread Richard Biener via Gcc-patches
On Wed, Jul 12, 2023 at 6:09 PM Andrew Pinski via Gcc-patches
 wrote:
>
> This fixes part of PR 110293, for the outer comparison case
> being `!=` or `==`.  In turn PR 110539 is able to be optimized
> again as the if statement for `(a&1) == ((a & 1) != 0)` gets optimized
> to `false` early enough to allow FRE/DOM to do a CSE for memory store/load.
>
> OK? Bootstrapped and tested on x86_64-linux with no regressions.

OK.

Thanks,
Richard.

> gcc/ChangeLog:
>
> PR tree-optimization/110293
> PR tree-optimization/110539
> * match.pd: Expand the `x != (typeof x)(x == 0)`
> pattern to handle where the inner and outer comparisons
> are either `!=` or `==` and handle other constants
> than 0.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.dg/tree-ssa/pr110293-1.c: New test.
> * gcc.dg/tree-ssa/pr110539-1.c: New test.
> * gcc.dg/tree-ssa/pr110539-2.c: New test.
> * gcc.dg/tree-ssa/pr110539-3.c: New test.
> * gcc.dg/tree-ssa/pr110539-4.c: New test.
> ---
>  gcc/match.pd   | 39 --
>  gcc/testsuite/gcc.dg/tree-ssa/pr110293-1.c | 58 +++
>  gcc/testsuite/gcc.dg/tree-ssa/pr110539-1.c | 12 
>  gcc/testsuite/gcc.dg/tree-ssa/pr110539-2.c | 12 
>  gcc/testsuite/gcc.dg/tree-ssa/pr110539-3.c | 75 
>  gcc/testsuite/gcc.dg/tree-ssa/pr110539-4.c | 82 ++
>  6 files changed, 274 insertions(+), 4 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr110293-1.c
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr110539-1.c
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr110539-2.c
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr110539-3.c
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr110539-4.c
>
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 8543f777a28..351d9285e92 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -6429,10 +6429,41 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>   (if (TYPE_UNSIGNED (TREE_TYPE (@0)))
>{ constant_boolean_node (false, type); }))
>
> -/* x != (typeof x)(x == 0) is always true.  */
> -(simplify
> - (ne:c @0 (convert (eq @0 integer_zerop)))
> - { constant_boolean_node (true, type); })
> +/* x != (typeof x)(x == CST) -> CST == 0 ? 1 : (CST == 1 ? (x!=0&!=1) : x 
> != 0) */
> +/* x != (typeof x)(x != CST) -> CST == 1 ? 1 : (CST == 0 ? (x!=0&!=1) : x 
> != 1) */
> +/* x == (typeof x)(x == CST) -> CST == 0 ? 0 : (CST == 1 ? (x==0||x==1) : x 
> != 0) */
> +/* x == (typeof x)(x != CST) -> CST == 1 ? 0 : (CST == 0 ? (x==0||x==1) : x 
> != 1) */
> +(for outer (ne eq)
> + (for inner (ne eq)
> +  (simplify
> +   (outer:c @0 (convert (inner @0 INTEGER_CST@1)))
> +   (with {
> + bool cst1 = integer_onep (@1);
> + bool cst0 = integer_zerop (@1);
> + bool innereq = inner == EQ_EXPR;
> + bool outereq = outer == EQ_EXPR;
> +}
> +   (switch
> +(if (innereq ? cst0 : cst1)
> + { constant_boolean_node (!outereq, type); })
> +(if (innereq ? cst1 : cst0)
> + (with {
> +   tree utype = unsigned_type_for (TREE_TYPE (@0));
> +   tree ucst1 = build_one_cst (utype);
> +  }
> +  (if (!outereq)
> +   (gt (convert:utype @0) { ucst1; })
> +   (le (convert:utype @0) { ucst1; })
> +  )
> + )
> +)
> +(if (innereq)
> + (ne @0 { build_zero_cst (TREE_TYPE (@0)); }))
> +(ne @0 { build_one_cst (TREE_TYPE (@0)); }))
> +   )
> +  )
> + )
> +)
>
>  (for cmp (unordered ordered unlt unle ungt unge uneq ltgt)
>   /* If the second operand is NaN, the result is constant.  */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr110293-1.c 
> b/gcc/testsuite/gcc.dg/tree-ssa/pr110293-1.c
> new file mode 100644
> index 000..24aea1a2d03
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr110293-1.c
> @@ -0,0 +1,58 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O1 -fdump-tree-optimized-raw" } */
> +
> +_Bool eqeq0(unsigned x)
> +{
> +  return x == (x == 0);
> +}
> +_Bool eqeq1(unsigned x)
> +{
> +  return x == (x == 1);
> +}
> +_Bool eqeq2(unsigned x)
> +{
> +  return x == (x == 2);
> +}
> +
> +_Bool neeq0(unsigned x)
> +{
> +  return x != (x == 0);
> +}
> +_Bool neeq1(unsigned x)
> +{
> +  return x != (x == 1);
> +}
> +_Bool neeq2(unsigned x)
> +{
> +  return x != (x == 2);
> +}
> +
> +_Bool eqne0(unsigned x)
> +{
> +  return x == (x != 0);
> +}
> +_Bool eqne1(unsigned x)
> +{
> +  return x == (x != 1);
> +}
> +_Bool eqne2(unsigned x)
> +{
> +  return x == (x != 2);
> +}
> +
> +_Bool nene0(unsigned x)
> +{
> +  return x != (x != 0);
> +}
> +_Bool nene1(unsigned x)
> +{
> +  return x != (x != 1);
> +}
> +_Bool nene2(unsigned x)
> +{
> +  return x != (x != 2);
> +}
> +
> +/* All of these functions should have removed the inner most comparison which
> +   means all of the conversions from bool to unsigned should have been 
> removed too. */
> +/* { dg-final { scan-tree-dump-not "nop_expr," "optimized"} } */
> diff --git 

[PATCH] m2, build: Use LDFLAGS for mklink

2023-07-13 Thread Rainer Orth
When trying to bootstrap current trunk on macOS 14.0 beta 3 with Xcode
15 beta 4, the build failed running mklink in stage 2:

unset CC ; m2/boot-bin/mklink -s --langc++ --exit --name m2/mc-boot/main.cc 
/vol/gcc/src/hg/master/darwin/gcc/m2/init/mcinit
dyld[55825]: Library not loaded: /vol/gcc/lib/libstdc++.6.dylib

While it's unclear to me why this only happens on macOS 14, the problem
is clear: unlike other C++ executables, mklink isn't linked with
-static-libstdc++ which is passed in from toplevel in LDFLAGS.

This patch fixes that and allows the build to continue.

Bootstrapped on x86_64-apple-darwin23.0.0, i386-pc-solaris2.11, and
sparc-sun-solaris2.11.

Ok for trunk?

Rainer

-- 
-
Rainer Orth, Center for Biotechnology, Bielefeld University


2023-07-11  Rainer Orth  

gcc/m2:
* Make-lang.in (m2/boot-bin/mklink$(exeext)): Add $(LDFLAGS).

# HG changeset patch
# Parent  b4327a00f1cdf8ce96f0483bbac09cc5f6108218
m2: Use LDFLAGS for mklink

diff --git a/gcc/m2/Make-lang.in b/gcc/m2/Make-lang.in
--- a/gcc/m2/Make-lang.in
+++ b/gcc/m2/Make-lang.in
@@ -1653,7 +1653,7 @@ m2/gm2-compiler-boot/gm2.a: m2/boot-bin/
 
 m2/boot-bin/mklink$(exeext): $(srcdir)/m2/tools-src/mklink.c
 	-test -d $(@D) || $(mkinstalldirs) $(@D)
-	$(CXX) $(CFLAGS) -I$(srcdir)/m2 -Im2/gm2-libs-boot -Im2/gm2-compiler-boot -I$(srcdir)/m2/mc-boot-ch $(INCLUDES) $< -o $@
+	$(CXX) $(CFLAGS) $(LDFLAGS) -I$(srcdir)/m2 -Im2/gm2-libs-boot -Im2/gm2-compiler-boot -I$(srcdir)/m2/mc-boot-ch $(INCLUDES) $< -o $@
 
 m2/gm2-compiler-boot/$(SRC_PREFIX)%.h: $(srcdir)/m2/gm2-compiler-boot/%.def $(MCDEPS)
 	-test -d $(@D) || $(mkinstalldirs) $(@D)


Re: [PATCH v2] RISC-V: Refactor riscv mode after for VXRM and FRM

2023-07-13 Thread Kito Cheng via Gcc-patches
Hmmm, anyway, I guess it's not worth spending any more of your time,
LGTM for v3 :)

On Thu, Jul 13, 2023 at 5:10 PM Li, Pan2 via Gcc-patches

 wrote:
>
> It can pass the selftest with the diff below (based on v3), but I got an ICE when building
> newlib.
>
> /home/pli/repos/gcc/222/riscv-gnu-toolchain/newlib/newlib/libc/time/../time/strftime.c:1426:1:
>  internal compiler error: in reg_overlap_mentioned_p, at rtlanal.cc:1928
>  1426 | }
>   | ^
> 0x87241f reg_overlap_mentioned_p(rtx_def const*, rtx_def const*)
> ../.././gcc/gcc/rtlanal.cc:1928
> 0x1005eab set_of_1
> ../.././gcc/gcc/rtlanal.cc:1440
> 0x10015c2 set_of(rtx_def const*, rtx_def const*)
> ../.././gcc/gcc/rtlanal.cc:1452
> 0x10015c2 reg_set_p(rtx_def const*, rtx_def const*)
> ../.././gcc/gcc/rtlanal.cc:1295
> 0x13f66c0 vxrm_unknown_p
> ../.././gcc/gcc/config/riscv/riscv.cc:7720
> 0x13f66c0 riscv_vxrm_mode_after
> ../.././gcc/gcc/config/riscv/riscv.cc:7760
> 0x13f66c0 riscv_mode_after
> ../.././gcc/gcc/config/riscv/riscv.cc:7799
> 0x1defe69 optimize_mode_switching
> ../.././gcc/gcc/mode-switching.cc:632
> 0x1defe69 execute
> ../.././gcc/gcc/mode-switching.cc:909
>
>
> Diff based on PATCH v3.
> diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
> index 6ed735d6983..d66ba0030eb 100644
> --- a/gcc/config/riscv/riscv.cc
> +++ b/gcc/config/riscv/riscv.cc
> @@ -7714,10 +7714,10 @@ asm_insn_p (rtx_insn *insn)
>  /* Return TRUE that an insn is unknown for VXRM.  */
>
>  static bool
> -vxrm_unknown_p (rtx_insn *insn)
> +vxrm_unknown_p (rtx_insn *insn, const_rtx vxrm_reg)
>  {
>/* Return true if there is a definition of VXRM.  */
> -  if (reg_set_p (gen_rtx_REG (SImode, VXRM_REGNUM), insn))
> +  if (reg_set_p (vxrm_reg, insn))
>  return true;
>
>/* A CALL function may contain an instruction that modifies the VXRM,
> @@ -7736,10 +7736,10 @@ vxrm_unknown_p (rtx_insn *insn)
>  /* Return TRUE that an insn is unknown dynamic for FRM.  */
>
>  static bool
> -frm_unknown_dynamic_p (rtx_insn *insn)
> +frm_unknown_dynamic_p (rtx_insn *insn, const_rtx frm_reg)
>  {
>/* Return true if there is a definition of FRM.  */
> -  if (reg_set_p (gen_rtx_REG (SImode, FRM_REGNUM), insn))
> +  if (reg_set_p (frm_reg, insn))
>  return true;
>
>/* A CALL function may contain an instruction that modifies the FRM,
> @@ -7755,13 +7755,15 @@ frm_unknown_dynamic_p (rtx_insn *insn)
>  static int
>  riscv_vxrm_mode_after (rtx_insn *insn, int mode)
>  {
> -  if (vxrm_unknown_p (insn))
> +  static const_rtx vxrm_reg = gen_rtx_REG (SImode, VXRM_REGNUM);
> +
> +  if (vxrm_unknown_p (insn, vxrm_reg))
>  return VXRM_MODE_NONE;
>
>if (recog_memoized (insn) < 0)
>  return mode;
>
> -  if (reg_mentioned_p (gen_rtx_REG (SImode, VXRM_REGNUM), PATTERN (insn)))
> +  if (reg_mentioned_p (vxrm_reg, PATTERN (insn)))
>  return get_attr_vxrm_mode (insn);
>else
>  return mode;
> @@ -7772,13 +7774,15 @@ riscv_vxrm_mode_after (rtx_insn *insn, int mode)
>  static int
>  riscv_frm_mode_after (rtx_insn *insn, int mode)
>  {
> -  if (frm_unknown_dynamic_p (insn))
> +  static const_rtx frm_reg = gen_rtx_REG (SImode, FRM_REGNUM);
> +
> +  if (frm_unknown_dynamic_p (insn, frm_reg))
>  return FRM_MODE_DYN;
>
>if (recog_memoized (insn) < 0)
>  return mode;
>
> -  if (reg_mentioned_p (gen_rtx_REG (SImode, FRM_REGNUM), PATTERN (insn)))
> +  if (reg_mentioned_p (frm_reg, PATTERN (insn)))
>  return get_attr_frm_mode (insn);
>else
>  return mode;
>
> Pan
>
> -Original Message-
> From: Li, Pan2
> Sent: Thursday, July 13, 2023 4:42 PM
> To: Kito Cheng 
> Cc: Jeff Law ; gcc-patches@gcc.gnu.org; 
> juzhe.zh...@rivai.ai; rdapp@gmail.com; Wang, Yanzhang 
> 
> Subject: RE: [PATCH v2] RISC-V: Refactor riscv mode after for VXRM and FRM
>
> Sure thing, I get your point now; I will give it a try and send v4 if everything goes
> well.
>
> Pan
>
> -Original Message-
> From: Kito Cheng 
> Sent: Thursday, July 13, 2023 3:35 PM
> To: Li, Pan2 
> Cc: Jeff Law ; gcc-patches@gcc.gnu.org; 
> juzhe.zh...@rivai.ai; rdapp@gmail.com; Wang, Yanzhang 
> 
> Subject: Re: [PATCH v2] RISC-V: Refactor riscv mode after for VXRM and FRM
>
> Oh, I know why that failed for you: you need to put it within the
> function, not as a global static.  A function-static variable is constructed
> when the function is first invoked rather than at program start.
>
> Could you try to apply my diff in the last mail and try again?
>
> On Thu, Jul 13, 2023 at 3:29 PM Li, Pan2 via Gcc-patches
>  wrote:
> >
> > Thanks Kito for the review. Sorry, I didn't include the code that results in the self test
> > error in PATCH v3, but it can be reproduced with the diff below, based on PATCH
> > v3. Let me know if I didn't get the point of your comments.
> >
> > diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
> > index 6ed735d6983..76689eaf8d5 100644
> > --- a/gcc/config/riscv/riscv.cc
> > +++ b/gcc/config/riscv/riscv.cc
> > @@ 

[PATCH 1/6] arm: [MVE intrinsics] Factorize vcaddq vhcaddq

2023-07-13 Thread Christophe Lyon via Gcc-patches
Factorize vcaddq, vhcaddq so that they use the same parameterized
names.

To be able to use the same patterns, we add a suffix to vcaddq.

Note that vcadd uses UNSPEC_VCADDxx for builtins without predication,
and VCADDQ_ROTxx_M_x (that is, not starting with "UNSPEC_").  The
UNSPEC_* names are also used by neon.md.

2023-07-13  Christophe Lyon  

gcc/
* config/arm/arm_mve_builtins.def (vcaddq_rot90_, vcaddq_rot270_)
(vcaddq_rot90_f, vcaddq_rot270_f): Add "_" or "_f" suffix.
* config/arm/iterators.md (mve_insn): Add vcadd, vhcadd.
(isu): Add UNSPEC_VCADD90, UNSPEC_VCADD270, VCADDQ_ROT270_M_U,
VCADDQ_ROT270_M_S, VCADDQ_ROT90_M_U, VCADDQ_ROT90_M_S,
VHCADDQ_ROT90_M_S, VHCADDQ_ROT270_M_S, VHCADDQ_ROT90_S,
VHCADDQ_ROT270_S.
(rot): Add VCADDQ_ROT90_M_F, VCADDQ_ROT90_M_S, VCADDQ_ROT90_M_U,
VCADDQ_ROT270_M_F, VCADDQ_ROT270_M_S, VCADDQ_ROT270_M_U,
VHCADDQ_ROT90_S, VHCADDQ_ROT270_S, VHCADDQ_ROT90_M_S,
VHCADDQ_ROT270_M_S.
(mve_rot): Add VCADDQ_ROT90_M_F, VCADDQ_ROT90_M_S,
VCADDQ_ROT90_M_U, VCADDQ_ROT270_M_F, VCADDQ_ROT270_M_S,
VCADDQ_ROT270_M_U, VHCADDQ_ROT90_S, VHCADDQ_ROT270_S,
VHCADDQ_ROT90_M_S, VHCADDQ_ROT270_M_S.
(supf): Add VHCADDQ_ROT90_M_S, VHCADDQ_ROT270_M_S,
VHCADDQ_ROT90_S, VHCADDQ_ROT270_S, UNSPEC_VCADD90,
UNSPEC_VCADD270.
(VCADDQ_ROT270_M): Delete.
(VCADDQ_M_F VxCADDQ VxCADDQ_M): New.
(VCADDQ_ROT90_M): Delete.
* config/arm/mve.md (mve_vcaddq)
(mve_vhcaddq_rot270_s, mve_vhcaddq_rot90_s): Merge
into ...
(@mve_q_): ... this.
(mve_vcaddq): Rename into ...
(@mve_q_f): ... this
(mve_vcaddq_rot270_m_)
(mve_vcaddq_rot90_m_, mve_vhcaddq_rot270_m_s)
(mve_vhcaddq_rot90_m_s): Merge into ...
(@mve_q_m_): ... this.
(mve_vcaddq_rot270_m_f, mve_vcaddq_rot90_m_f): Merge
into ...
(@mve_q_m_f): ... this.
---
 gcc/config/arm/arm_mve_builtins.def |   6 +-
 gcc/config/arm/iterators.md |  38 +++-
 gcc/config/arm/mve.md   | 135 +---
 3 files changed, 62 insertions(+), 117 deletions(-)

diff --git a/gcc/config/arm/arm_mve_builtins.def 
b/gcc/config/arm/arm_mve_builtins.def
index 8de765de3b0..63ad1845593 100644
--- a/gcc/config/arm/arm_mve_builtins.def
+++ b/gcc/config/arm/arm_mve_builtins.def
@@ -187,6 +187,10 @@ VAR3 (BINOP_NONE_NONE_NONE, vmaxvq_s, v16qi, v8hi, v4si)
 VAR3 (BINOP_NONE_NONE_NONE, vmaxq_s, v16qi, v8hi, v4si)
 VAR3 (BINOP_NONE_NONE_NONE, vhsubq_s, v16qi, v8hi, v4si)
 VAR3 (BINOP_NONE_NONE_NONE, vhsubq_n_s, v16qi, v8hi, v4si)
+VAR3 (BINOP_NONE_NONE_NONE, vcaddq_rot90_, v16qi, v8hi, v4si)
+VAR3 (BINOP_NONE_NONE_NONE, vcaddq_rot270_, v16qi, v8hi, v4si)
+VAR2 (BINOP_NONE_NONE_NONE, vcaddq_rot90_f, v8hf, v4sf)
+VAR2 (BINOP_NONE_NONE_NONE, vcaddq_rot270_f, v8hf, v4sf)
 VAR3 (BINOP_NONE_NONE_NONE, vhcaddq_rot90_s, v16qi, v8hi, v4si)
 VAR3 (BINOP_NONE_NONE_NONE, vhcaddq_rot270_s, v16qi, v8hi, v4si)
 VAR3 (BINOP_NONE_NONE_NONE, vhaddq_s, v16qi, v8hi, v4si)
@@ -870,8 +874,6 @@ VAR3 (QUADOP_UNONE_UNONE_UNONE_IMM_PRED, vshlcq_m_vec_u, 
v16qi, v8hi, v4si)
 VAR3 (QUADOP_UNONE_UNONE_UNONE_IMM_PRED, vshlcq_m_carry_u, v16qi, v8hi, v4si)
 
 /* optabs without any suffixes.  */
-VAR5 (BINOP_NONE_NONE_NONE, vcaddq_rot90, v16qi, v8hi, v4si, v8hf, v4sf)
-VAR5 (BINOP_NONE_NONE_NONE, vcaddq_rot270, v16qi, v8hi, v4si, v8hf, v4sf)
 VAR2 (BINOP_NONE_NONE_NONE, vcmulq_rot90, v8hf, v4sf)
 VAR2 (BINOP_NONE_NONE_NONE, vcmulq_rot270, v8hf, v4sf)
 VAR2 (BINOP_NONE_NONE_NONE, vcmulq_rot180, v8hf, v4sf)
diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index 9e77af55d60..da1ead34e58 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -902,6 +902,7 @@
 ])
 
 (define_int_attr mve_insn [
+(UNSPEC_VCADD90 "vcadd") (UNSPEC_VCADD270 "vcadd")
 (VABAVQ_P_S "vabav") (VABAVQ_P_U "vabav")
 (VABAVQ_S "vabav") (VABAVQ_U "vabav")
 (VABDQ_M_S "vabd") (VABDQ_M_U "vabd") (VABDQ_M_F "vabd")
@@ -925,6 +926,8 @@
 (VBICQ_N_S "vbic") (VBICQ_N_U "vbic")
 (VBRSRQ_M_N_S "vbrsr") (VBRSRQ_M_N_U "vbrsr") (VBRSRQ_M_N_F 
"vbrsr")
 (VBRSRQ_N_S "vbrsr") (VBRSRQ_N_U "vbrsr") (VBRSRQ_N_F "vbrsr")
+(VCADDQ_ROT270_M_U "vcadd") (VCADDQ_ROT270_M_S "vcadd") 
(VCADDQ_ROT270_M_F "vcadd")
+(VCADDQ_ROT90_M_U "vcadd") (VCADDQ_ROT90_M_S "vcadd") 
(VCADDQ_ROT90_M_F "vcadd")
 (VCLSQ_M_S "vcls")
 (VCLSQ_S "vcls")
 (VCLZQ_M_S "vclz") (VCLZQ_M_U "vclz")
@@ -944,6 +947,8 @@
 (VHADDQ_M_S "vhadd") (VHADDQ_M_U "vhadd")
 (VHADDQ_N_S "vhadd") (VHADDQ_N_U "vhadd")
 (VHADDQ_S "vhadd") (VHADDQ_U "vhadd")
+(VHCADDQ_ROT90_M_S "vhcadd") (VHCADDQ_ROT270_M_S "vhcadd")
+

[PATCH] vect: Handle demoting FLOAT and promoting FIX_TRUNC.

2023-07-13 Thread Robin Dapp via Gcc-patches
Hi,

the recent changes that allowed multi-step conversions for
"non-packing/unpacking", i.e. modifier == NONE targets included
promoting to-float and demoting to-int variants.  This patch
adds demoting to-float and promoting to-int handling.

Bootstrapped and regtested on x86 and aarch64.

A question that seems related: Why do we require !flag_trapping_math
for the "NONE" multistep conversion but not for the "NARROW_DST"
case when both seem to handle float -> int and there are float
values that do not have an int representation?  If a backend
can guarantee that the conversion traps, should it just implement
a multistep conversion in a matching expander?

Regards
 Robin


gcc/ChangeLog:

* tree-vect-stmts.cc (vectorizable_conversion): Handle
more demotion/promotion for modifier == NONE.
---
 gcc/tree-vect-stmts.cc | 40 +---
 1 file changed, 29 insertions(+), 11 deletions(-)

diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 10e71178ce7..78e0510be7e 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -5324,28 +5324,46 @@ vectorizable_conversion (vec_info *vinfo,
break;
   }
 
-  /* For conversions between float and smaller integer types try whether we
-can use intermediate signed integer types to support the
+  /* For conversions between float and larger integer types try whether
+we can use intermediate signed integer types to support the
 conversion.  */
   if ((code == FLOAT_EXPR
-  && GET_MODE_SIZE (lhs_mode) > GET_MODE_SIZE (rhs_mode))
+  && GET_MODE_SIZE (lhs_mode) != GET_MODE_SIZE (rhs_mode))
  || (code == FIX_TRUNC_EXPR
- && GET_MODE_SIZE (rhs_mode) > GET_MODE_SIZE (lhs_mode)
- && !flag_trapping_math))
+ && ((GET_MODE_SIZE (rhs_mode) > GET_MODE_SIZE (lhs_mode)
+ && !flag_trapping_math)
+ || GET_MODE_SIZE (rhs_mode) < GET_MODE_SIZE (lhs_mode))))
{
+ bool demotion = GET_MODE_SIZE (rhs_mode) > GET_MODE_SIZE (lhs_mode);
  bool float_expr_p = code == FLOAT_EXPR;
- scalar_mode imode = float_expr_p ? rhs_mode : lhs_mode;
- fltsz = GET_MODE_SIZE (float_expr_p ? lhs_mode : rhs_mode);
+ unsigned short target_size;
+ scalar_mode intermediate_mode;
+ if (demotion)
+   {
+ intermediate_mode = lhs_mode;
+ target_size = GET_MODE_SIZE (rhs_mode);
+   }
+ else
+   {
+ target_size = GET_MODE_SIZE (lhs_mode);
+ tree itype
+   = build_nonstandard_integer_type (GET_MODE_BITSIZE
+ (rhs_mode), 0);
+ intermediate_mode = SCALAR_TYPE_MODE (itype);
+   }
  code1 = float_expr_p ? code : NOP_EXPR;
  codecvt1 = float_expr_p ? NOP_EXPR : code;
- FOR_EACH_2XWIDER_MODE (rhs_mode_iter, imode)
+ opt_scalar_mode mode_iter;
+ FOR_EACH_2XWIDER_MODE (mode_iter, intermediate_mode)
{
- imode = rhs_mode_iter.require ();
- if (GET_MODE_SIZE (imode) > fltsz)
+ intermediate_mode = mode_iter.require ();
+
+ if (GET_MODE_SIZE (intermediate_mode) > target_size)
break;
 
  cvt_type
-   = build_nonstandard_integer_type (GET_MODE_BITSIZE (imode),
+   = build_nonstandard_integer_type (GET_MODE_BITSIZE
+ (intermediate_mode),
  0);
  cvt_type = get_vectype_for_scalar_type (vinfo, cvt_type,
  slp_node);
-- 
2.41.0



Re: [PATCH] vect: Handle demoting FLOAT and promoting FIX_TRUNC.

2023-07-13 Thread Richard Biener via Gcc-patches
On Thu, Jul 13, 2023 at 12:31 PM Robin Dapp via Gcc-patches
 wrote:
>
> Hi,
>
> the recent changes that allowed multi-step conversions for
> "non-packing/unpacking", i.e. modifier == NONE targets included
> promoting to-float and demoting to-int variants.  This patch
> adds demoting to-float and promoting to-int handling.

Can you add testcases?  Also the current restriction is because
the variants you add are not always correct and I don't see any
checks that the intermediate type doesn't lose significant bits?

Richard.

> Bootstrapped and regtested on x86 and aarch64.
>
> A question that seems related: Why do we require !flag_trapping_math
> for the "NONE" multistep conversion but not for the "NARROW_DST"
> case when both seem to handle float -> int and there are float
> values that do not have an int representation?  If a backend
> can guarantee that the conversion traps, should it just implement
> a multistep conversion in a matching expander?
>
> Regards
>  Robin
>
>
> gcc/ChangeLog:
>
> * tree-vect-stmts.cc (vectorizable_conversion): Handle
> more demotion/promotion for modifier == NONE.
> ---
>  gcc/tree-vect-stmts.cc | 40 +---
>  1 file changed, 29 insertions(+), 11 deletions(-)
>
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index 10e71178ce7..78e0510be7e 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -5324,28 +5324,46 @@ vectorizable_conversion (vec_info *vinfo,
> break;
>}
>
> -  /* For conversions between float and smaller integer types try whether 
> we
> -can use intermediate signed integer types to support the
> +  /* For conversions between float and larger integer types try whether
> +we can use intermediate signed integer types to support the
>  conversion.  */
>if ((code == FLOAT_EXPR
> -  && GET_MODE_SIZE (lhs_mode) > GET_MODE_SIZE (rhs_mode))
> +  && GET_MODE_SIZE (lhs_mode) != GET_MODE_SIZE (rhs_mode))
>   || (code == FIX_TRUNC_EXPR
> - && GET_MODE_SIZE (rhs_mode) > GET_MODE_SIZE (lhs_mode)
> - && !flag_trapping_math))
> + && ((GET_MODE_SIZE (rhs_mode) > GET_MODE_SIZE (lhs_mode)
> + && !flag_trapping_math)
> + || GET_MODE_SIZE (rhs_mode) < GET_MODE_SIZE (lhs_mode))))
> {
> + bool demotion = GET_MODE_SIZE (rhs_mode) > GET_MODE_SIZE (lhs_mode);
>   bool float_expr_p = code == FLOAT_EXPR;
> - scalar_mode imode = float_expr_p ? rhs_mode : lhs_mode;
> - fltsz = GET_MODE_SIZE (float_expr_p ? lhs_mode : rhs_mode);
> + unsigned short target_size;
> + scalar_mode intermediate_mode;
> + if (demotion)
> +   {
> + intermediate_mode = lhs_mode;
> + target_size = GET_MODE_SIZE (rhs_mode);
> +   }
> + else
> +   {
> + target_size = GET_MODE_SIZE (lhs_mode);
> + tree itype
> +   = build_nonstandard_integer_type (GET_MODE_BITSIZE
> + (rhs_mode), 0);
> + intermediate_mode = SCALAR_TYPE_MODE (itype);
> +   }
>   code1 = float_expr_p ? code : NOP_EXPR;
>   codecvt1 = float_expr_p ? NOP_EXPR : code;
> - FOR_EACH_2XWIDER_MODE (rhs_mode_iter, imode)
> + opt_scalar_mode mode_iter;
> + FOR_EACH_2XWIDER_MODE (mode_iter, intermediate_mode)
> {
> - imode = rhs_mode_iter.require ();
> - if (GET_MODE_SIZE (imode) > fltsz)
> + intermediate_mode = mode_iter.require ();
> +
> + if (GET_MODE_SIZE (intermediate_mode) > target_size)
> break;
>
>   cvt_type
> -   = build_nonstandard_integer_type (GET_MODE_BITSIZE (imode),
> +   = build_nonstandard_integer_type (GET_MODE_BITSIZE
> + (intermediate_mode),
>   0);
>   cvt_type = get_vectype_for_scalar_type (vinfo, cvt_type,
>   slp_node);
> --
> 2.41.0
>


RE: [PATCH 7/19]middle-end: Refactor vectorizer loop conditionals and separate out IV to new variables

2023-07-13 Thread Richard Biener via Gcc-patches
On Thu, 13 Jul 2023, Tamar Christina wrote:

> > e7ac2b5f3db55de3dbbab7bd2bfe08388f4ec533..cab82d7960e5be517bba2
> > 621f7f4
> > > 888e7bf3c295 100644
> > > --- a/gcc/cfgloop.h
> > > +++ b/gcc/cfgloop.h
> > > @@ -272,6 +272,14 @@ public:
> > >   the basic-block from being collected but its index can still be
> > >   reused.  */
> > >basic_block former_header;
> > > +
> > > +  /* The controlling loop IV for the current loop when vectorizing.  
> > > This IV
> > > + controls the natural exits of the loop.  */  edge  GTY ((skip
> > > + (""))) vec_loop_iv;
> > > +
> > > +  /* If the loop has multiple exits this structure contains the alternate
> > > + exits of the loop which are relevant for vectorization.  */
> > > + vec GTY ((skip (""))) vec_loop_alt_exits;
> > 
> > That's a quite heavy representation and as you say it's vectorizer 
> > specific.  May
> > I ask you to eliminate at _least_ vec_loop_alt_exits?
> > Are there not all exits in that vector?  Note there's already the list of 
> > exits and if
> > you have the canonical counting IV exit you can match against that to get 
> > all
> > the others?
> > 
> 
> Sure, though that means some filtering whenever one iterates over the alt 
> exits,
> not a problem though.
> 
> > >  /* Given LOOP this function generates a new copy of it and puts it
> > > on E which is either the entry or exit of LOOP.  If SCALAR_LOOP is
> > > @@ -1458,13 +1523,15 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class
> > loop *loop,
> > >edge exit, new_exit;
> > >bool duplicate_outer_loop = false;
> > >
> > > -  exit = single_exit (loop);
> > > +  exit = loop->vec_loop_iv;
> > >at_exit = (e == exit);
> > >if (!at_exit && e != loop_preheader_edge (loop))
> > >  return NULL;
> > >
> > >if (scalar_loop == NULL)
> > >  scalar_loop = loop;
> > > +  else
> > > +vec_init_exit_info (scalar_loop);
> > >
> > >bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
> > >pbbs = bbs + 1;
> > > @@ -1490,13 +1557,17 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class
> > loop *loop,
> > >bbs[0] = preheader;
> > >new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
> > >
> > > -  exit = single_exit (scalar_loop);
> > > +  exit = scalar_loop->vec_loop_iv;
> > >copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs,
> > >   , 1, _exit, NULL,
> > >   at_exit ? loop->latch : e->src, true);
> > > -  exit = single_exit (loop);
> > > +  exit = loop->vec_loop_iv;
> > >basic_block new_preheader = new_bbs[0];
> > >
> > > +  /* Record the new loop exit information.  new_loop doesn't have SCEV
> > data and
> > > + so we must initialize the exit information.  */
> > > +  vec_init_exit_info (new_loop);
> > > +
> > 
> > You have a mapping of old to new BB so you should be able to
> > map old to new exit by mapping e->src/dest and looking up the new edge?
> > 
> > The vec_loop_iv exit is mapped directly (new_exit).
> > 
> > So I don't really understand what's missing there.
> 
> But I don't have the mapping when the loop was versioned, e.g. by ifcvt.  So 
> in the cases
> where scalar_loop != loop in which case I still need them to match up.
> 
> vect_loop_form_info is destroyed after analysis though and is not available 
> during
> peeling. That's why we copy relevant information out in 
> vect_create_loop_vinfo.
> 
> But in general we only have 1 per loop as well, so it would be the same as 
> using loop_vinfo.
> 
> I could move it into loop_vinfo and then require you to pass the edges to the 
> peeling function
> as you mentioned.  This would solve the location we place them in, but still 
> not sure what to do
> about versioned loops.  Would need to get its main edge "somewhere", would 
> another field in
> loop_vinfo be ok?

I suppose since we're having ->scalar_loop adding ->scalar_loop_iv_exit
is straight-forward indeed.  As for matching them up I don't see how
you do that reliably right now?  It might be even that the if-converted
loop has one of the exits removed as unreachable (since we run VN
on its body) ...

What I could see working (but ick) is to extend the contract between
if-conversion and vectorization and for example record corresponding exit 
numbers in exits.  We have conveniently (*cough*) unused edge->aux
for this.  If you assign numbers to all edges of the original
loop the loop copies should inherit those (if I traced things
correctly - duplicate_block copies edge->aux but not bb->aux).

So in the vectorizer you could then match them up.

Richard.


> Cheers,
> Tamar
> 
> > > +  if (!loop->vec_loop_iv)
> > > +return opt_result::failure_at (vect_location,
> > > +"not vectorized:"
> > > +" could not determine main exit from"
> > > +" loop with multiple exits.\n");
> > > +
> > >/* Different restrictions apply when we are considering an inner-most 
> > > loop,
> > >   vs. an outer (nested) loop.
> > >   

Re: [PATCH] vect: Handle demoting FLOAT and promoting FIX_TRUNC.

2023-07-13 Thread Robin Dapp via Gcc-patches
> Can you add testcases?  Also the current restriction is because
> the variants you add are not always correct and I don't see any
> checks that the intermediate type doesn't lose significant bits?

The testcases I wanted to add with a follow-up RISC-V patch but
I can also try an aarch64 one.

So for my understanding, please correct, we have:
  
  promoting int -> float, should always be safe.  We currently
   vectorize this with WIDEN and NONE.

  demoting float -> int, this is safe as long as the float
   value can be represented in the int type, otherwise we must
   trap.
   We currently vectorize this on x86 using NARROW (regardless
   of -ftrapping-math) and using NONE only with -fno-trapping-math.

  demoting int -> float, this is safe as long as the
   intermediate types can hold the initial value?  How is
   this different to demoting e.g. int64_t -> int8_t?
   We currently do not vectorize this with either NARROW or NONE.
   LLVM vectorizes but only with their default(?) -fno-trapping-math.
   Yet I don't see how we could trap here?

  promoting float -> int, this is safe as long as the float
   value can be represented (as above)?  We currently vectorize
   this (regardless of -ftrapping-math) with WIDEN but not NONE.

So apart from unifying the -ftrapping-math behavior I think only
the third variant is somewhat critical?

Regards
 Robin



Re: [IRA] Skip empty register classes in setup_reg_class_relations

2023-07-13 Thread Vladimir Makarov via Gcc-patches



On 7/12/23 07:05, senthilkumar.selva...@microchip.com wrote:

Hi,

   I've been spending some (spare) time trying to get LRA working
   for the avr target.


Thank you for addressing this problem.

The code you are changing is very sensitive and was a source of multiple PRs 
in the past.  But I found the change you propose logical and I think it 
will not create problems.  Still, please be alert and revert the patch if 
people report problems with this change.



  After making a couple of changes to get
   libgcc going, I'm now hitting an assert at
   lra-constraints.cc:4423 for a subarch (avrtiny) that has a
   couple of regclasses with no available registers.

   The assert fires because in_class_p (correctly) returns
   false for get_reg_class (regno) = ALL_REGS, and new_class =
   NO_LD_REGS. For avrtiny, NO_LD_REGS is an empty regset, and
   therefore hard_reg_set_subset_p (NO_LD_REGS, lra_no_alloc_regs)
   is always true, making in_class_p return false.

   in_class_p picks NO_LD_REGS as new_class because common_class =
   ira_reg_class_subset[ALL_REGS][NO_REGS] evaluates as
   NO_LD_REGS. This appears wrong to me - it should be NO_REGS
   instead (lra-constraints.cc:4421 checks for NO_REGS).

   ira.cc:setup_reg_class_relations sets up
   ira_reg_class_subset (among other things), and the problem
   appears to be a missing continue statement if
   reg_class_contents[cl3] (in the innermost loop) is empty.

   In this case, for cl1 = ALL_REGS and cl2 = NO_REGS, cl3 =
   NO_LD_REGS, temp_hard_regset and temp_set2 are both empty, and
   hard_reg_subset_p (, ) is always true, so
   ira_reg_class_subset[ALL_REGS][NO_REGS] ends up being set to
   cl3 = NO_LD_REGS. Adding a continue if hard_reg_set_empty_p 
(temp_hard_regset)
   fixes the problem for me.

   Does the below patch look ok? Bootstrapping and regression
   testing passed on x86_64.

OK.



[PATCH 6/6] arm: [MVE intrinsics] rework vcmlaq

2023-07-13 Thread Christophe Lyon via Gcc-patches
Implement vcmlaq using the new MVE builtins framework.

2023-07-13  Christophe Lyon  

gcc/
* config/arm/arm-mve-builtins-base.cc (vcmlaq, vcmlaq_rot90)
(vcmlaq_rot180, vcmlaq_rot270): New.
* config/arm/arm-mve-builtins-base.def (vcmlaq, vcmlaq_rot90)
(vcmlaq_rot180, vcmlaq_rot270): New.
* config/arm/arm-mve-builtins-base.h (vcmlaq, vcmlaq_rot90)
(vcmlaq_rot180, vcmlaq_rot270): New.
* config/arm/arm-mve-builtins.cc
(function_instance::has_inactive_argument): Handle vcmlaq,
vcmlaq_rot90, vcmlaq_rot180, vcmlaq_rot270.
* config/arm/arm_mve.h (vcmlaq): Delete.
(vcmlaq_rot180): Delete.
(vcmlaq_rot270): Delete.
(vcmlaq_rot90): Delete.
(vcmlaq_m): Delete.
(vcmlaq_rot180_m): Delete.
(vcmlaq_rot270_m): Delete.
(vcmlaq_rot90_m): Delete.
(vcmlaq_f16): Delete.
(vcmlaq_rot180_f16): Delete.
(vcmlaq_rot270_f16): Delete.
(vcmlaq_rot90_f16): Delete.
(vcmlaq_f32): Delete.
(vcmlaq_rot180_f32): Delete.
(vcmlaq_rot270_f32): Delete.
(vcmlaq_rot90_f32): Delete.
(vcmlaq_m_f32): Delete.
(vcmlaq_m_f16): Delete.
(vcmlaq_rot180_m_f32): Delete.
(vcmlaq_rot180_m_f16): Delete.
(vcmlaq_rot270_m_f32): Delete.
(vcmlaq_rot270_m_f16): Delete.
(vcmlaq_rot90_m_f32): Delete.
(vcmlaq_rot90_m_f16): Delete.
(__arm_vcmlaq_f16): Delete.
(__arm_vcmlaq_rot180_f16): Delete.
(__arm_vcmlaq_rot270_f16): Delete.
(__arm_vcmlaq_rot90_f16): Delete.
(__arm_vcmlaq_f32): Delete.
(__arm_vcmlaq_rot180_f32): Delete.
(__arm_vcmlaq_rot270_f32): Delete.
(__arm_vcmlaq_rot90_f32): Delete.
(__arm_vcmlaq_m_f32): Delete.
(__arm_vcmlaq_m_f16): Delete.
(__arm_vcmlaq_rot180_m_f32): Delete.
(__arm_vcmlaq_rot180_m_f16): Delete.
(__arm_vcmlaq_rot270_m_f32): Delete.
(__arm_vcmlaq_rot270_m_f16): Delete.
(__arm_vcmlaq_rot90_m_f32): Delete.
(__arm_vcmlaq_rot90_m_f16): Delete.
(__arm_vcmlaq): Delete.
(__arm_vcmlaq_rot180): Delete.
(__arm_vcmlaq_rot270): Delete.
(__arm_vcmlaq_rot90): Delete.
(__arm_vcmlaq_m): Delete.
(__arm_vcmlaq_rot180_m): Delete.
(__arm_vcmlaq_rot270_m): Delete.
(__arm_vcmlaq_rot90_m): Delete.
---
 gcc/config/arm/arm-mve-builtins-base.cc  |   4 +
 gcc/config/arm/arm-mve-builtins-base.def |   4 +
 gcc/config/arm/arm-mve-builtins-base.h   |  16 +-
 gcc/config/arm/arm-mve-builtins.cc   |   4 +
 gcc/config/arm/arm_mve.h | 304 ---
 5 files changed, 22 insertions(+), 310 deletions(-)

diff --git a/gcc/config/arm/arm-mve-builtins-base.cc 
b/gcc/config/arm/arm-mve-builtins-base.cc
index 3ad8df304e8..e31095ae112 100644
--- a/gcc/config/arm/arm-mve-builtins-base.cc
+++ b/gcc/config/arm/arm-mve-builtins-base.cc
@@ -262,6 +262,10 @@ FUNCTION_WITH_RTX_M (vandq, AND, VANDQ)
 FUNCTION_ONLY_N (vbrsrq, VBRSRQ)
 FUNCTION (vcaddq_rot90, unspec_mve_function_exact_insn_rot, (UNSPEC_VCADD90, 
UNSPEC_VCADD90, UNSPEC_VCADD90, VCADDQ_ROT90_M_S, VCADDQ_ROT90_M_U, 
VCADDQ_ROT90_M_F))
 FUNCTION (vcaddq_rot270, unspec_mve_function_exact_insn_rot, (UNSPEC_VCADD270, 
UNSPEC_VCADD270, UNSPEC_VCADD270, VCADDQ_ROT270_M_S, VCADDQ_ROT270_M_U, 
VCADDQ_ROT270_M_F))
+FUNCTION (vcmlaq, unspec_mve_function_exact_insn_rot, (-1, -1, UNSPEC_VCMLA, 
-1, -1, VCMLAQ_M_F))
+FUNCTION (vcmlaq_rot90, unspec_mve_function_exact_insn_rot, (-1, -1, 
UNSPEC_VCMLA90, -1, -1, VCMLAQ_ROT90_M_F))
+FUNCTION (vcmlaq_rot180, unspec_mve_function_exact_insn_rot, (-1, -1, 
UNSPEC_VCMLA180, -1, -1, VCMLAQ_ROT180_M_F))
+FUNCTION (vcmlaq_rot270, unspec_mve_function_exact_insn_rot, (-1, -1, 
UNSPEC_VCMLA270, -1, -1, VCMLAQ_ROT270_M_F))
 FUNCTION (vcmulq, unspec_mve_function_exact_insn_rot, (-1, -1, UNSPEC_VCMUL, 
-1, -1, VCMULQ_M_F))
 FUNCTION (vcmulq_rot90, unspec_mve_function_exact_insn_rot, (-1, -1, 
UNSPEC_VCMUL90, -1, -1, VCMULQ_ROT90_M_F))
 FUNCTION (vcmulq_rot180, unspec_mve_function_exact_insn_rot, (-1, -1, 
UNSPEC_VCMUL180, -1, -1, VCMULQ_ROT180_M_F))
diff --git a/gcc/config/arm/arm-mve-builtins-base.def 
b/gcc/config/arm/arm-mve-builtins-base.def
index cbcf0d296cd..e7d466f2efd 100644
--- a/gcc/config/arm/arm-mve-builtins-base.def
+++ b/gcc/config/arm/arm-mve-builtins-base.def
@@ -158,6 +158,10 @@ DEF_MVE_FUNCTION (vandq, binary, all_float, mx_or_none)
 DEF_MVE_FUNCTION (vbrsrq, binary_imm32, all_float, mx_or_none)
 DEF_MVE_FUNCTION (vcaddq_rot90, binary, all_float, mx_or_none)
 DEF_MVE_FUNCTION (vcaddq_rot270, binary, all_float, mx_or_none)
+DEF_MVE_FUNCTION (vcmlaq, ternary, all_float, m_or_none)
+DEF_MVE_FUNCTION (vcmlaq_rot90, ternary, all_float, m_or_none)
+DEF_MVE_FUNCTION (vcmlaq_rot180, ternary, all_float, m_or_none)
+DEF_MVE_FUNCTION (vcmlaq_rot270, ternary, all_float, m_or_none)
 DEF_MVE_FUNCTION (vcmulq, binary, 

[PATCH 2/2] [testsuite, arm]: Make mve_fp_fpu[12].c accept single or double precision FPU

2023-07-13 Thread Christophe Lyon via Gcc-patches
These tests currently expect a directive containing .fpu fpv5-sp-d16
and thus may fail if the test is executed for instance with
-march=armv8.1-m.main+mve.fp+fp.dp

This patch accepts either fpv5-sp-d16 or fpv5-d16 to avoid the failure.

2023-06-28  Christophe Lyon  

gcc/testsuite/
* gcc.target/arm/mve/intrinsics/mve_fp_fpu1.c: Fix .fpu
scan-assembler.
* gcc.target/arm/mve/intrinsics/mve_fp_fpu2.c: Likewise.
---
 gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_fp_fpu1.c | 2 +-
 gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_fp_fpu2.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_fp_fpu1.c 
b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_fp_fpu1.c
index e375327fb97..8358a616bb5 100644
--- a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_fp_fpu1.c
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_fp_fpu1.c
@@ -12,4 +12,4 @@ foo1 (int8x16_t value)
   return b;
 }
 
-/* { dg-final { scan-assembler "\.fpu fpv5-sp-d16" }  } */
+/* { dg-final { scan-assembler "\.fpu fpv5(-sp|)-d16" }  } */
diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_fp_fpu2.c 
b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_fp_fpu2.c
index 1fca1100cf0..5dd2feefc35 100644
--- a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_fp_fpu2.c
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_fp_fpu2.c
@@ -12,4 +12,4 @@ foo1 (int8x16_t value)
   return b;
 }
 
-/* { dg-final { scan-assembler "\.fpu fpv5-sp-d16" }  } */
+/* { dg-final { scan-assembler "\.fpu fpv5(-sp|)-d16" }  } */
-- 
2.34.1



[PATCH 1/2] [testsuite,arm]: Make nomve_fp_1.c require arm_fp

2023-07-13 Thread Christophe Lyon via Gcc-patches
If GCC is configured with the default (soft) -mfloat-abi, and we don't
override the target_board test flags appropriately,
gcc.target/arm/mve/general-c/nomve_fp_1.c fails for lack of
-mfloat-abi=softfp or -mfloat-abi=hard, because it doesn't use
dg-add-options arm_v8_1m_mve (on purpose, see comment in the test).

Require and use the options needed for arm_fp to fix this problem.

2023-06-28  Christophe Lyon  

gcc/testsuite/
* gcc.target/arm/mve/general-c/nomve_fp_1.c: Require arm_fp.
---
 gcc/testsuite/gcc.target/arm/mve/general-c/nomve_fp_1.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/gcc/testsuite/gcc.target/arm/mve/general-c/nomve_fp_1.c 
b/gcc/testsuite/gcc.target/arm/mve/general-c/nomve_fp_1.c
index 21c2af16a61..c9d279ead68 100644
--- a/gcc/testsuite/gcc.target/arm/mve/general-c/nomve_fp_1.c
+++ b/gcc/testsuite/gcc.target/arm/mve/general-c/nomve_fp_1.c
@@ -1,9 +1,11 @@
 /* { dg-do compile } */
 /* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-require-effective-target arm_fp_ok } */
 /* Do not use dg-add-options arm_v8_1m_mve, because this might expand to "",
which could imply mve+fp depending on the user settings. We want to make
sure the '+fp' extension is not enabled.  */
 /* { dg-options "-mfpu=auto -march=armv8.1-m.main+mve" } */
+/* { dg-add-options arm_fp } */
 
 #include <arm_mve.h>
 
-- 
2.34.1



[PATCH 2/6] arm: [MVE intrinsics] rework vcaddq vhcaddq

2023-07-13 Thread Christophe Lyon via Gcc-patches
Implement vcaddq, vhcaddq using the new MVE builtins framework.

2023-07-13  Christophe Lyon  

gcc/
* config/arm/arm-mve-builtins-base.cc (vcaddq_rot90)
(vcaddq_rot270, vhcaddq_rot90, vhcaddq_rot270): New.
* config/arm/arm-mve-builtins-base.def (vcaddq_rot90)
(vcaddq_rot270, vhcaddq_rot90, vhcaddq_rot270): New.
* config/arm/arm-mve-builtins-base.h: (vcaddq_rot90)
(vcaddq_rot270, vhcaddq_rot90, vhcaddq_rot270): New.
* config/arm/arm-mve-builtins-functions.h (class
unspec_mve_function_exact_insn_rot): New.
* config/arm/arm_mve.h (vcaddq_rot90): Delete.
(vcaddq_rot270): Delete.
(vhcaddq_rot90): Delete.
(vhcaddq_rot270): Delete.
(vcaddq_rot270_m): Delete.
(vcaddq_rot90_m): Delete.
(vhcaddq_rot270_m): Delete.
(vhcaddq_rot90_m): Delete.
(vcaddq_rot90_x): Delete.
(vcaddq_rot270_x): Delete.
(vhcaddq_rot90_x): Delete.
(vhcaddq_rot270_x): Delete.
(vcaddq_rot90_u8): Delete.
(vcaddq_rot270_u8): Delete.
(vhcaddq_rot90_s8): Delete.
(vhcaddq_rot270_s8): Delete.
(vcaddq_rot90_s8): Delete.
(vcaddq_rot270_s8): Delete.
(vcaddq_rot90_u16): Delete.
(vcaddq_rot270_u16): Delete.
(vhcaddq_rot90_s16): Delete.
(vhcaddq_rot270_s16): Delete.
(vcaddq_rot90_s16): Delete.
(vcaddq_rot270_s16): Delete.
(vcaddq_rot90_u32): Delete.
(vcaddq_rot270_u32): Delete.
(vhcaddq_rot90_s32): Delete.
(vhcaddq_rot270_s32): Delete.
(vcaddq_rot90_s32): Delete.
(vcaddq_rot270_s32): Delete.
(vcaddq_rot90_f16): Delete.
(vcaddq_rot270_f16): Delete.
(vcaddq_rot90_f32): Delete.
(vcaddq_rot270_f32): Delete.
(vcaddq_rot270_m_s8): Delete.
(vcaddq_rot270_m_s32): Delete.
(vcaddq_rot270_m_s16): Delete.
(vcaddq_rot270_m_u8): Delete.
(vcaddq_rot270_m_u32): Delete.
(vcaddq_rot270_m_u16): Delete.
(vcaddq_rot90_m_s8): Delete.
(vcaddq_rot90_m_s32): Delete.
(vcaddq_rot90_m_s16): Delete.
(vcaddq_rot90_m_u8): Delete.
(vcaddq_rot90_m_u32): Delete.
(vcaddq_rot90_m_u16): Delete.
(vhcaddq_rot270_m_s8): Delete.
(vhcaddq_rot270_m_s32): Delete.
(vhcaddq_rot270_m_s16): Delete.
(vhcaddq_rot90_m_s8): Delete.
(vhcaddq_rot90_m_s32): Delete.
(vhcaddq_rot90_m_s16): Delete.
(vcaddq_rot270_m_f32): Delete.
(vcaddq_rot270_m_f16): Delete.
(vcaddq_rot90_m_f32): Delete.
(vcaddq_rot90_m_f16): Delete.
(vcaddq_rot90_x_s8): Delete.
(vcaddq_rot90_x_s16): Delete.
(vcaddq_rot90_x_s32): Delete.
(vcaddq_rot90_x_u8): Delete.
(vcaddq_rot90_x_u16): Delete.
(vcaddq_rot90_x_u32): Delete.
(vcaddq_rot270_x_s8): Delete.
(vcaddq_rot270_x_s16): Delete.
(vcaddq_rot270_x_s32): Delete.
(vcaddq_rot270_x_u8): Delete.
(vcaddq_rot270_x_u16): Delete.
(vcaddq_rot270_x_u32): Delete.
(vhcaddq_rot90_x_s8): Delete.
(vhcaddq_rot90_x_s16): Delete.
(vhcaddq_rot90_x_s32): Delete.
(vhcaddq_rot270_x_s8): Delete.
(vhcaddq_rot270_x_s16): Delete.
(vhcaddq_rot270_x_s32): Delete.
(vcaddq_rot90_x_f16): Delete.
(vcaddq_rot90_x_f32): Delete.
(vcaddq_rot270_x_f16): Delete.
(vcaddq_rot270_x_f32): Delete.
(__arm_vcaddq_rot90_u8): Delete.
(__arm_vcaddq_rot270_u8): Delete.
(__arm_vhcaddq_rot90_s8): Delete.
(__arm_vhcaddq_rot270_s8): Delete.
(__arm_vcaddq_rot90_s8): Delete.
(__arm_vcaddq_rot270_s8): Delete.
(__arm_vcaddq_rot90_u16): Delete.
(__arm_vcaddq_rot270_u16): Delete.
(__arm_vhcaddq_rot90_s16): Delete.
(__arm_vhcaddq_rot270_s16): Delete.
(__arm_vcaddq_rot90_s16): Delete.
(__arm_vcaddq_rot270_s16): Delete.
(__arm_vcaddq_rot90_u32): Delete.
(__arm_vcaddq_rot270_u32): Delete.
(__arm_vhcaddq_rot90_s32): Delete.
(__arm_vhcaddq_rot270_s32): Delete.
(__arm_vcaddq_rot90_s32): Delete.
(__arm_vcaddq_rot270_s32): Delete.
(__arm_vcaddq_rot270_m_s8): Delete.
(__arm_vcaddq_rot270_m_s32): Delete.
(__arm_vcaddq_rot270_m_s16): Delete.
(__arm_vcaddq_rot270_m_u8): Delete.
(__arm_vcaddq_rot270_m_u32): Delete.
(__arm_vcaddq_rot270_m_u16): Delete.
(__arm_vcaddq_rot90_m_s8): Delete.
(__arm_vcaddq_rot90_m_s32): Delete.
(__arm_vcaddq_rot90_m_s16): Delete.
(__arm_vcaddq_rot90_m_u8): Delete.
(__arm_vcaddq_rot90_m_u32): Delete.
(__arm_vcaddq_rot90_m_u16): Delete.
(__arm_vhcaddq_rot270_m_s8): Delete.
(__arm_vhcaddq_rot270_m_s32): Delete.
(__arm_vhcaddq_rot270_m_s16): Delete.
(__arm_vhcaddq_rot90_m_s8): Delete.

[PATCH 4/6] arm: [MVE intrinsics] rework vcmulq

2023-07-13 Thread Christophe Lyon via Gcc-patches
Implement vcmulq using the new MVE builtins framework.

2023-07-13 Christophe Lyon  

gcc/
* config/arm/arm-mve-builtins-base.cc (vcmulq, vcmulq_rot90)
(vcmulq_rot180, vcmulq_rot270): New.
* config/arm/arm-mve-builtins-base.def (vcmulq, vcmulq_rot90)
(vcmulq_rot180, vcmulq_rot270): New.
* config/arm/arm-mve-builtins-base.h: (vcmulq, vcmulq_rot90)
(vcmulq_rot180, vcmulq_rot270): New.
* config/arm/arm_mve.h (vcmulq_rot90): Delete.
(vcmulq_rot270): Delete.
(vcmulq_rot180): Delete.
(vcmulq): Delete.
(vcmulq_m): Delete.
(vcmulq_rot180_m): Delete.
(vcmulq_rot270_m): Delete.
(vcmulq_rot90_m): Delete.
(vcmulq_x): Delete.
(vcmulq_rot90_x): Delete.
(vcmulq_rot180_x): Delete.
(vcmulq_rot270_x): Delete.
(vcmulq_rot90_f16): Delete.
(vcmulq_rot270_f16): Delete.
(vcmulq_rot180_f16): Delete.
(vcmulq_f16): Delete.
(vcmulq_rot90_f32): Delete.
(vcmulq_rot270_f32): Delete.
(vcmulq_rot180_f32): Delete.
(vcmulq_f32): Delete.
(vcmulq_m_f32): Delete.
(vcmulq_m_f16): Delete.
(vcmulq_rot180_m_f32): Delete.
(vcmulq_rot180_m_f16): Delete.
(vcmulq_rot270_m_f32): Delete.
(vcmulq_rot270_m_f16): Delete.
(vcmulq_rot90_m_f32): Delete.
(vcmulq_rot90_m_f16): Delete.
(vcmulq_x_f16): Delete.
(vcmulq_x_f32): Delete.
(vcmulq_rot90_x_f16): Delete.
(vcmulq_rot90_x_f32): Delete.
(vcmulq_rot180_x_f16): Delete.
(vcmulq_rot180_x_f32): Delete.
(vcmulq_rot270_x_f16): Delete.
(vcmulq_rot270_x_f32): Delete.
(__arm_vcmulq_rot90_f16): Delete.
(__arm_vcmulq_rot270_f16): Delete.
(__arm_vcmulq_rot180_f16): Delete.
(__arm_vcmulq_f16): Delete.
(__arm_vcmulq_rot90_f32): Delete.
(__arm_vcmulq_rot270_f32): Delete.
(__arm_vcmulq_rot180_f32): Delete.
(__arm_vcmulq_f32): Delete.
(__arm_vcmulq_m_f32): Delete.
(__arm_vcmulq_m_f16): Delete.
(__arm_vcmulq_rot180_m_f32): Delete.
(__arm_vcmulq_rot180_m_f16): Delete.
(__arm_vcmulq_rot270_m_f32): Delete.
(__arm_vcmulq_rot270_m_f16): Delete.
(__arm_vcmulq_rot90_m_f32): Delete.
(__arm_vcmulq_rot90_m_f16): Delete.
(__arm_vcmulq_x_f16): Delete.
(__arm_vcmulq_x_f32): Delete.
(__arm_vcmulq_rot90_x_f16): Delete.
(__arm_vcmulq_rot90_x_f32): Delete.
(__arm_vcmulq_rot180_x_f16): Delete.
(__arm_vcmulq_rot180_x_f32): Delete.
(__arm_vcmulq_rot270_x_f16): Delete.
(__arm_vcmulq_rot270_x_f32): Delete.
(__arm_vcmulq_rot90): Delete.
(__arm_vcmulq_rot270): Delete.
(__arm_vcmulq_rot180): Delete.
(__arm_vcmulq): Delete.
(__arm_vcmulq_m): Delete.
(__arm_vcmulq_rot180_m): Delete.
(__arm_vcmulq_rot270_m): Delete.
(__arm_vcmulq_rot90_m): Delete.
(__arm_vcmulq_x): Delete.
(__arm_vcmulq_rot90_x): Delete.
(__arm_vcmulq_rot180_x): Delete.
(__arm_vcmulq_rot270_x): Delete.
---
 gcc/config/arm/arm-mve-builtins-base.cc  |   4 +
 gcc/config/arm/arm-mve-builtins-base.def |   4 +
 gcc/config/arm/arm-mve-builtins-base.h   |   4 +
 gcc/config/arm/arm_mve.h | 448 ---
 4 files changed, 12 insertions(+), 448 deletions(-)

diff --git a/gcc/config/arm/arm-mve-builtins-base.cc 
b/gcc/config/arm/arm-mve-builtins-base.cc
index f15bb926147..3ad8df304e8 100644
--- a/gcc/config/arm/arm-mve-builtins-base.cc
+++ b/gcc/config/arm/arm-mve-builtins-base.cc
@@ -262,6 +262,10 @@ FUNCTION_WITH_RTX_M (vandq, AND, VANDQ)
 FUNCTION_ONLY_N (vbrsrq, VBRSRQ)
 FUNCTION (vcaddq_rot90, unspec_mve_function_exact_insn_rot, (UNSPEC_VCADD90, 
UNSPEC_VCADD90, UNSPEC_VCADD90, VCADDQ_ROT90_M_S, VCADDQ_ROT90_M_U, 
VCADDQ_ROT90_M_F))
 FUNCTION (vcaddq_rot270, unspec_mve_function_exact_insn_rot, (UNSPEC_VCADD270, 
UNSPEC_VCADD270, UNSPEC_VCADD270, VCADDQ_ROT270_M_S, VCADDQ_ROT270_M_U, 
VCADDQ_ROT270_M_F))
+FUNCTION (vcmulq, unspec_mve_function_exact_insn_rot, (-1, -1, UNSPEC_VCMUL, 
-1, -1, VCMULQ_M_F))
+FUNCTION (vcmulq_rot90, unspec_mve_function_exact_insn_rot, (-1, -1, 
UNSPEC_VCMUL90, -1, -1, VCMULQ_ROT90_M_F))
+FUNCTION (vcmulq_rot180, unspec_mve_function_exact_insn_rot, (-1, -1, 
UNSPEC_VCMUL180, -1, -1, VCMULQ_ROT180_M_F))
+FUNCTION (vcmulq_rot270, unspec_mve_function_exact_insn_rot, (-1, -1, 
UNSPEC_VCMUL270, -1, -1, VCMULQ_ROT270_M_F))
 FUNCTION (vhcaddq_rot90, unspec_mve_function_exact_insn_rot, (VHCADDQ_ROT90_S, 
-1, -1, VHCADDQ_ROT90_M_S, -1, -1))
 FUNCTION (vhcaddq_rot270, unspec_mve_function_exact_insn_rot, 
(VHCADDQ_ROT270_S, -1, -1, VHCADDQ_ROT270_M_S, -1, -1))
 FUNCTION_WITHOUT_N_NO_U_F (vclsq, VCLSQ)
diff --git a/gcc/config/arm/arm-mve-builtins-base.def 
b/gcc/config/arm/arm-mve-builtins-base.def
index 9a793147960..cbcf0d296cd 

Re: [RFC] GNU Vector Extension -- Packed Boolean Vectors

2023-07-13 Thread Richard Biener via Gcc-patches
On Thu, Jul 13, 2023 at 12:15 PM Tejas Belagod  wrote:
>
> On 7/3/23 1:31 PM, Richard Biener wrote:
> > On Mon, Jul 3, 2023 at 8:50 AM Tejas Belagod  wrote:
> >>
> >> On 6/29/23 6:55 PM, Richard Biener wrote:
> >>> On Wed, Jun 28, 2023 at 1:26 PM Tejas Belagod  
> >>> wrote:
> 
> 
> 
> 
> 
>  From: Richard Biener 
>  Date: Tuesday, June 27, 2023 at 12:58 PM
>  To: Tejas Belagod 
>  Cc: gcc-patches@gcc.gnu.org 
>  Subject: Re: [RFC] GNU Vector Extension -- Packed Boolean Vectors
> 
>  On Tue, Jun 27, 2023 at 8:30 AM Tejas Belagod  
>  wrote:
> >
> >
> >
> >
> >
> > From: Richard Biener 
> > Date: Monday, June 26, 2023 at 2:23 PM
> > To: Tejas Belagod 
> > Cc: gcc-patches@gcc.gnu.org 
> > Subject: Re: [RFC] GNU Vector Extension -- Packed Boolean Vectors
> >
> > On Mon, Jun 26, 2023 at 8:24 AM Tejas Belagod via Gcc-patches
> >  wrote:
> >>
> >> Hi,
> >>
> >> Packed Boolean Vectors
> >> --
> >>
> >> I'd like to propose a feature addition to GNU Vector extensions to add 
> >> packed
> >> boolean vectors (PBV).  This has been discussed in the past here[1] 
> >> and a variant has
> >> been implemented in Clang recently[2].
> >>
> >> With predication features being added to vector architectures (SVE, 
> >> MVE, AVX),
> >> it is a useful feature to have to model predication on targets.  This 
> >> could
> >> find its use in intrinsics or just used as is as a GNU vector 
> >> extension being
> >> mapped to underlying target features.  For example, the packed boolean 
> >> vector
> >> could directly map to a predicate register on SVE.
> >>
> >> Also, this new packed boolean type GNU extension can be used with SVE 
> >> ACLE
> >> intrinsics to replace a fixed-length svbool_t.
> >>
> >> Here are a few options to represent the packed boolean vector type.
> >
> > The GIMPLE frontend uses a new 'vector_mask' attribute:
> >
> > > typedef int v8si __attribute__((vector_size(8*sizeof(int))));
> > typedef v8si v8sib __attribute__((vector_mask));
> >
> > > it gets you a vector type that's the appropriate (dependent on the
> > target) vector
> > mask type for the vector data type (v8si in this case).
> >
> >
> >
> > Thanks Richard.
> >
> > Having had a quick look at the implementation, it does seem to tick the 
> > boxes.
> >
> > I must admit I haven't dug deep, but if the target hook allows the mask 
> > to be
> >
> > defined in way that is target-friendly (and I don't know how much 
> > effort it will
> >
> > be to migrate the attribute to more front-ends), it should do the job 
> > nicely.
> >
> > Let me go back and dig a bit deeper and get back with questions if any.
> 
> 
>  Let me add that the advantage of this is the compiler doesn't need
>  to support weird explicitly laid out packed boolean vectors that do
>  not match what the target supports and the user doesn't need to know
>  what the target supports (and thus have an #ifdef maze around explicitly
>  specified layouts).
> 
>  Sorry for the delayed response – I spent a day experimenting with 
>  vector_mask.
> 
> 
> 
>  Yeah, this is what option 4 in the RFC is trying to achieve – be 
>  portable enough
> 
>  to avoid having to sprinkle the code with ifdefs.
> 
> 
>  It does remove some flexibility though, for example with -mavx512f 
>  -mavx512vl
>  you'll get AVX512 style masks for V4SImode data vectors but of course the
>  target still supports SSE2/AVX2 style masks as well, but those would not 
>  be
>  available as "packed boolean vectors", though they are of course in fact
>  equal to V4SImode data vectors with -1 or 0 values, so in this particular
>  case it might not matter.
> 
>  That said, the vector_mask attribute will get you V4SImode vectors with
>  signed boolean elements of 32 bits for V4SImode data vectors with
>  SSE2/AVX2.
> 
> 
> 
>  This sounds very much like what the scenario would be with NEON vs SVE. 
>  Coming to think
> 
>  of it, vector_mask resembles option 4 in the proposal with ‘n’ implied 
>  by the ‘base’ vector type
> 
>  and a ‘w’ specified for the type.
> 
> 
> 
>  Given its current implementation, if vector_mask is exposed to the CFE, 
>  would there be any
> 
>  major challenges wrt implementation or defining behaviour semantics? I 
>  played around with a
> 
>  few examples from the testsuite and wrote some new ones. I mostly tried 
>  operations that
> 
>  the new type would have to support (unary, binary bitwise, 
>  initializations etc) – with a couple of exceptions
> 
> 

RE: [PATCH v2] RISC-V: Refactor riscv mode after for VXRM and FRM

2023-07-13 Thread Li, Pan2 via Gcc-patches
Sure and committed, thanks Kito.

Pan

-Original Message-
From: Kito Cheng  
Sent: Thursday, July 13, 2023 5:19 PM
To: Li, Pan2 
Cc: Jeff Law ; gcc-patches@gcc.gnu.org; 
juzhe.zh...@rivai.ai; rdapp@gmail.com; Wang, Yanzhang 

Subject: Re: [PATCH v2] RISC-V: Refactor riscv mode after for VXRM and FRM

Hmmm, anyway, I guess it's not worth spending any more of your time,
LGTM for v3 :)

On Thu, Jul 13, 2023 at 5:10 PM Li, Pan2 via Gcc-patches

 wrote:
>
> It can pass the selftest with below diff based on v3, but got ICE when build 
> newlib.
>
> /home/pli/repos/gcc/222/riscv-gnu-toolchain/newlib/newlib/libc/time/../time/strftime.c:1426:1:
>  internal compiler error: in reg_overlap_mentioned_p, at rtlanal.cc:1928
>  1426 | }
>   | ^
> 0x87241f reg_overlap_mentioned_p(rtx_def const*, rtx_def const*)
> ../.././gcc/gcc/rtlanal.cc:1928
> 0x1005eab set_of_1
> ../.././gcc/gcc/rtlanal.cc:1440
> 0x10015c2 set_of(rtx_def const*, rtx_def const*)
> ../.././gcc/gcc/rtlanal.cc:1452
> 0x10015c2 reg_set_p(rtx_def const*, rtx_def const*)
> ../.././gcc/gcc/rtlanal.cc:1295
> 0x13f66c0 vxrm_unknown_p
> ../.././gcc/gcc/config/riscv/riscv.cc:7720
> 0x13f66c0 riscv_vxrm_mode_after
> ../.././gcc/gcc/config/riscv/riscv.cc:7760
> 0x13f66c0 riscv_mode_after
> ../.././gcc/gcc/config/riscv/riscv.cc:7799
> 0x1defe69 optimize_mode_switching
> ../.././gcc/gcc/mode-switching.cc:632
> 0x1defe69 execute
> ../.././gcc/gcc/mode-switching.cc:909
>
>
> Diff based on PATCH v3.
> diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
> index 6ed735d6983..d66ba0030eb 100644
> --- a/gcc/config/riscv/riscv.cc
> +++ b/gcc/config/riscv/riscv.cc
> @@ -7714,10 +7714,10 @@ asm_insn_p (rtx_insn *insn)
>  /* Return TRUE that an insn is unknown for VXRM.  */
>
>  static bool
> -vxrm_unknown_p (rtx_insn *insn)
> +vxrm_unknown_p (rtx_insn *insn, const_rtx vxrm_reg)
>  {
>/* Return true if there is a definition of VXRM.  */
> -  if (reg_set_p (gen_rtx_REG (SImode, VXRM_REGNUM), insn))
> +  if (reg_set_p (vxrm_reg, insn))
>  return true;
>
>/* A CALL function may contain an instruction that modifies the VXRM,
> @@ -7736,10 +7736,10 @@ vxrm_unknown_p (rtx_insn *insn)
>  /* Return TRUE that an insn is unknown dynamic for FRM.  */
>
>  static bool
> -frm_unknown_dynamic_p (rtx_insn *insn)
> +frm_unknown_dynamic_p (rtx_insn *insn, const_rtx frm_reg)
>  {
>/* Return true if there is a definition of FRM.  */
> -  if (reg_set_p (gen_rtx_REG (SImode, FRM_REGNUM), insn))
> +  if (reg_set_p (frm_reg, insn))
>  return true;
>
>/* A CALL function may contain an instruction that modifies the FRM,
> @@ -7755,13 +7755,15 @@ frm_unknown_dynamic_p (rtx_insn *insn)
>  static int
>  riscv_vxrm_mode_after (rtx_insn *insn, int mode)
>  {
> -  if (vxrm_unknown_p (insn))
> +  static const_rtx vxrm_reg = gen_rtx_REG (SImode, VXRM_REGNUM);
> +
> +  if (vxrm_unknown_p (insn, vxrm_reg))
>  return VXRM_MODE_NONE;
>
>if (recog_memoized (insn) < 0)
>  return mode;
>
> -  if (reg_mentioned_p (gen_rtx_REG (SImode, VXRM_REGNUM), PATTERN (insn)))
> +  if (reg_mentioned_p (vxrm_reg, PATTERN (insn)))
>  return get_attr_vxrm_mode (insn);
>else
>  return mode;
> @@ -7772,13 +7774,15 @@ riscv_vxrm_mode_after (rtx_insn *insn, int mode)
>  static int
>  riscv_frm_mode_after (rtx_insn *insn, int mode)
>  {
> -  if (frm_unknown_dynamic_p (insn))
> +  static const_rtx frm_reg = gen_rtx_REG (SImode, FRM_REGNUM);
> +
> +  if (frm_unknown_dynamic_p (insn, frm_reg))
>  return FRM_MODE_DYN;
>
>if (recog_memoized (insn) < 0)
>  return mode;
>
> -  if (reg_mentioned_p (gen_rtx_REG (SImode, FRM_REGNUM), PATTERN (insn)))
> +  if (reg_mentioned_p (frm_reg, PATTERN (insn)))
>  return get_attr_frm_mode (insn);
>else
>  return mode;
>
> Pan
>
> -Original Message-
> From: Li, Pan2
> Sent: Thursday, July 13, 2023 4:42 PM
> To: Kito Cheng 
> Cc: Jeff Law ; gcc-patches@gcc.gnu.org; 
> juzhe.zh...@rivai.ai; rdapp@gmail.com; Wang, Yanzhang 
> 
> Subject: RE: [PATCH v2] RISC-V: Refactor riscv mode after for VXRM and FRM
>
> Sure thing, got your point now, will have a try and send v4 if everything goes 
> well.
>
> Pan
>
> -Original Message-
> From: Kito Cheng 
> Sent: Thursday, July 13, 2023 3:35 PM
> To: Li, Pan2 
> Cc: Jeff Law ; gcc-patches@gcc.gnu.org; 
> juzhe.zh...@rivai.ai; rdapp@gmail.com; Wang, Yanzhang 
> 
> Subject: Re: [PATCH v2] RISC-V: Refactor riscv mode after for VXRM and FRM
>
> oh, I know why you failed on that, you need to put it within the
> function, not global static, function static variable will construct
> when first invoked rather than construct at program start.
>
> Could you try to apply my diff in the last mail and try again?
>
> On Thu, Jul 13, 2023 at 3:29 PM Li, Pan2 via Gcc-patches
>  wrote:
> >
> > Thanks Kito for review. Sorry didn't involve the code result in self test 
> > error in PATCH v3, but it 

[PATCH, OpenACC 2.7, v2] Implement host_data must have use_device clause requirement

2023-07-13 Thread Chung-Lin Tang via Gcc-patches
On 2023/6/16 5:13 PM, Thomas Schwinge wrote:
> OK with one small change, please -- unless there's a reason for doing it
> this way:
> 
>> --- a/gcc/fortran/trans-openmp.cc
>> +++ b/gcc/fortran/trans-openmp.cc
>> @@ -4677,6 +4677,12 @@ gfc_trans_oacc_construct (gfc_code *code)
>>   break;
>>case EXEC_OACC_HOST_DATA:
>>   construct_code = OACC_HOST_DATA;
>> + if (code->ext.omp_clauses->lists[OMP_LIST_USE_DEVICE] == NULL)
>> +   {
>> + error_at (gfc_get_location (&code->loc),
>> +   "%<host_data%> construct requires %<use_device%> 
>> clause");
>> + return NULL_TREE;
>> +   }
>>   break;
>>default:
>>   gcc_unreachable ();
> The OpenMP "must contain at least one [...] clause" checks are done in
> 'gcc/fortran/openmp.cc:resolve_omp_clauses'.  For consistency (or, to let
> 'gcc/fortran/trans-openmp.cc' continue to just deal with "directive
> translation"), do similar for OpenACC 'host_data'?  (..., and we later
> accordingly adjust 'gcc/fortran/openmp.cc:gfc_match_oacc_update', too?)

Hi Thomas,
I've adjusted the Fortran implementation as you described. Yes, I agree this
better fits the current Fortran FE conventions.

I've re-tested the attached v2 patch, will commit later this week if no major
objections.

Thanks,
Chung-Lin

gcc/c/ChangeLog:

* c-parser.cc (c_parser_oacc_host_data): Add checking requiring OpenACC
host_data construct to have an use_device clause.

gcc/cp/ChangeLog:

* parser.cc (cp_parser_oacc_host_data): Add checking requiring OpenACC
host_data construct to have an use_device clause.

gcc/fortran/ChangeLog:

* openmp.cc (resolve_omp_clauses): Add checking requiring
OpenACC host_data construct to have an use_device clause.

gcc/testsuite/ChangeLog:

* c-c++-common/goacc/host_data-2.c: Adjust testcase.
* gfortran.dg/goacc/host_data-error.f90: New testcase.
* gfortran.dg/goacc/pr71704.f90: Adjust testcase.

diff --git a/gcc/c/c-parser.cc b/gcc/c/c-parser.cc
index 24a6eb6e459..80920b31f83 100644
--- a/gcc/c/c-parser.cc
+++ b/gcc/c/c-parser.cc
@@ -18461,8 +18461,13 @@ c_parser_oacc_host_data (location_t loc, c_parser 
*parser, bool *if_p)
   tree stmt, clauses, block;
 
   clauses = c_parser_oacc_all_clauses (parser, OACC_HOST_DATA_CLAUSE_MASK,
-  "#pragma acc host_data");
-
+  "#pragma acc host_data", false);
+  if (!omp_find_clause (clauses, OMP_CLAUSE_USE_DEVICE_PTR))
+{
+  error_at (loc, "%<host_data%> construct requires %<use_device%> clause");
+  return error_mark_node;
+}
+  clauses = c_finish_omp_clauses (clauses, C_ORT_ACC);
   block = c_begin_omp_parallel ();
   add_stmt (c_parser_omp_structured_block (parser, if_p));
   stmt = c_finish_oacc_host_data (loc, clauses, block);
diff --git a/gcc/cp/parser.cc b/gcc/cp/parser.cc
index 5e2b5cba57e..beb5b632e5e 100644
--- a/gcc/cp/parser.cc
+++ b/gcc/cp/parser.cc
@@ -45895,8 +45895,15 @@ cp_parser_oacc_host_data (cp_parser *parser, cp_token 
*pragma_tok, bool *if_p)
   unsigned int save;
 
   clauses = cp_parser_oacc_all_clauses (parser, OACC_HOST_DATA_CLAUSE_MASK,
-   "#pragma acc host_data", pragma_tok);
-
+   "#pragma acc host_data", pragma_tok,
+   false);
+  if (!omp_find_clause (clauses, OMP_CLAUSE_USE_DEVICE_PTR))
+{
+  error_at (pragma_tok->location,
+   "%<host_data%> construct requires %<use_device%> clause");
+  return error_mark_node;
+}
+  clauses = finish_omp_clauses (clauses, C_ORT_ACC);
   block = begin_omp_parallel ();
   save = cp_parser_begin_omp_structured_block (parser);
   cp_parser_statement (parser, NULL_TREE, false, if_p);
diff --git a/gcc/fortran/openmp.cc b/gcc/fortran/openmp.cc
index 8efc4b3ecfa..f7af02845de 100644
--- a/gcc/fortran/openmp.cc
+++ b/gcc/fortran/openmp.cc
@@ -8764,6 +8764,12 @@ resolve_omp_clauses (gfc_code *code, gfc_omp_clauses 
*omp_clauses,
   "% clause", _clauses->detach->where);
 }
 
+  if (openacc
+  && code->op == EXEC_OACC_HOST_DATA
+  && omp_clauses->lists[OMP_LIST_USE_DEVICE] == NULL)
+gfc_error ("%<host_data%> construct at %L requires %<use_device%> clause",
+  &code->loc);
+
   if (omp_clauses->assume)
 gfc_resolve_omp_assumptions (omp_clauses->assume);
 }
diff --git a/gcc/testsuite/c-c++-common/goacc/host_data-2.c 
b/gcc/testsuite/c-c++-common/goacc/host_data-2.c
index b3093e575ff..862a764eb3a 100644
--- a/gcc/testsuite/c-c++-common/goacc/host_data-2.c
+++ b/gcc/testsuite/c-c++-common/goacc/host_data-2.c
@@ -8,7 +8,9 @@ void
 f (void)
 {
   int v2 = 3;
-#pragma acc host_data copy(v2) /* { dg-error ".copy. is not valid for ..pragma 
acc host_data." } */
+#pragma acc host_data copy(v2)
+  /* { dg-error ".copy. is not valid for ..pragma acc host_data." "" { target 
*-*-* } .-1 } */
+  /* { dg-error ".host_data. construct requires .use_device. clause" "" { 
target *-*-* } .-2 } */
   ;
 
 

Re: [PATCH 7/19]middle-end: Refactor vectorizer loop conditionals and separate out IV to new variables

2023-07-13 Thread Richard Biener via Gcc-patches
On Wed, 28 Jun 2023, Tamar Christina wrote:

> Hi All,
> 
> This patch splits off the vectorizer's understanding of the main loop exit off
> from the normal loop infrastructure.
> 
> Essentially we're relaxing the use of single_exit() in the vectorizer as we 
> will
> no longer have a single exit and need a well defined split between the main
> and secondary exits of loops for vectorization.

General comments here, more comments inline below.

> These new values were added to the loop class even though they're only used by
> the vectorizer for a couple of reasons:
>   - We need access to them in places where we have no loop_vinfo.

I've been passing down loop_vinfo to more places when cleaning up stuff
so this shouldn't be a limiting factor.  Passing down the relevant edge
if it's the middle-end that needs access is then the other option.

>   - We only have a single loop_vinfo for each loop under consideration, 
> however
> that same loop can have different copies, e.g. peeled/versioned copies or
> the scalar variant of the loop.  For each of these we still need to be 
> able
> to have a coherent exit definition.

I've noticed this as well dealing with how epilogue vectorization is
bolted on ... I think in an ideal world the main loop vectorization
would create loop_vinfo for each of those loops so it can push info
there.

> For these reasons the placement in the loop class was the only way to keep the
> book keeping together with the loops and avoid possibly expensive lookups.
> 
> For this version of the patch the `main` exit of a loop is defined as the exit
> that is closest to the loop latch. This is stored in vec_loop_iv.  The 
> remaining
> exits which are relevant for the vectorizer are stored inside
> vec_loop_alt_exits.
> 
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> 
> Ok for master?
> 
> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
>   * cfgloop.cc (alloc_loop): Initialize vec_loop_iv.
>   * cfgloop.h (class loop): Add vec_loop_iv and vec_loop_alt_exits.
>   * doc/loop.texi: Document get_edge_condition.
>   * tree-loop-distribution.cc (loop_distribution::distribute_loop):
>   Initialize vec_loop_iv since loop distributions calls loop peeling which
>   only understands vec_loop_iv now.
>   * tree-scalar-evolution.cc (get_edge_condition): New.
>   (get_loop_exit_condition): Refactor into get_edge_condition.
>   * tree-scalar-evolution.h (get_edge_condition): New.
>   * tree-vect-data-refs.cc (vect_enhance_data_refs_alignment): Update use
>   of single_exit.
>   * tree-vect-loop-manip.cc (vect_set_loop_condition_partial_vectors,
>   vect_set_loop_condition_normal, vect_set_loop_condition,
>   slpeel_tree_duplicate_loop_to_edge_cfg, slpeel_can_duplicate_loop_p,
>   find_loop_location, vect_update_ivs_after_vectorizer,
>   vect_gen_vector_loop_niters_mult_vf, find_guard_arg, vect_do_peeling):
>   Replace usages of single_exit.
>   (vec_init_exit_info): New.
>   * tree-vect-loop.cc (vect_analyze_loop_form,
>   vect_create_epilog_for_reduction, vectorizable_live_operation,
>   scale_profile_for_vect_loop, vect_transform_loop): New.
>   * tree-vectorizer.h (LOOP_VINFO_IV_EXIT, LOOP_VINFO_ALT_EXITS,
>   vec_init_exit_info): New.
> 
> --- inline copy of patch -- 
> diff --git a/gcc/cfgloop.h b/gcc/cfgloop.h
> index 
> e7ac2b5f3db55de3dbbab7bd2bfe08388f4ec533..cab82d7960e5be517bba2621f7f4888e7bf3c295
>  100644
> --- a/gcc/cfgloop.h
> +++ b/gcc/cfgloop.h
> @@ -272,6 +272,14 @@ public:
>   the basic-block from being collected but its index can still be
>   reused.  */
>basic_block former_header;
> +
> +  /* The controlling loop IV for the current loop when vectorizing.  This IV
> + controls the natural exits of the loop.  */
> +  edge  GTY ((skip (""))) vec_loop_iv;
> +
> +  /* If the loop has multiple exits this structure contains the alternate
> + exits of the loop which are relevant for vectorization.  */
> +  vec GTY ((skip (""))) vec_loop_alt_exits;

That's a quite heavy representation and as you say it's vectorizer
specific.  May I ask you to eliminate at _least_ vec_loop_alt_exits?
Are there not all exits in that vector?  Note there's already
the list of exits and if you have the canonical counting IV exit
you can match against that to get all the others?

>  };
>  
>  /* Set if the loop is known to be infinite.  */
> diff --git a/gcc/cfgloop.cc b/gcc/cfgloop.cc
> index 
> ccda7415d7037e26048425b5d85f3633a39fd325..98123f7dce98227c8dffe4833e159fbb05596831
>  100644
> --- a/gcc/cfgloop.cc
> +++ b/gcc/cfgloop.cc
> @@ -355,6 +355,7 @@ alloc_loop (void)
>loop->nb_iterations_upper_bound = 0;
>loop->nb_iterations_likely_upper_bound = 0;
>loop->nb_iterations_estimate = 0;
> +  loop->vec_loop_iv = NULL;
>return loop;
>  }
>  
> diff --git a/gcc/doc/loop.texi b/gcc/doc/loop.texi
> index 
> 

Re: [PATCH 8/19]middle-end: updated niters analysis to handle multiple exits.

2023-07-13 Thread Richard Biener via Gcc-patches
On Wed, 28 Jun 2023, Tamar Christina wrote:

> Hi All,
> 
> For early break vectorization we have to update niters analysis to record and
> analyze all exits of the loop, and so all conds.
> 
> The niters of the loop is still determined by the main/natural exit of the 
> loop
> as this is the O(n) bounds.  For now we don't do much with the secondary 
> conds,
> but their assumptions can be used to generate versioning checks later.
> 
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> 
> Ok for master?

I probably confused vec_init_exit_info in the previous patch - that said,
I'm missing a clear function that determines the natural exit of the
original (if-converted) scalar loop.  As vec_init_exit_info seems
to (re-)compute that I'll comment on it here.

+  /* The main IV is to be determined by the block that's the first 
reachable
+ block from the latch.  We cannot rely on the order the loop analysis
+ returns and we don't have any SCEV analysis on the loop.  */
+  auto_vec<edge> workset;
+  workset.safe_push (loop_latch_edge (loop));
+  hash_set<edge> visited;
+
+  while (!workset.is_empty ())
+{
+  edge e = workset.pop ();
+  if (visited.contains (e))
+   continue;
+
+  bool found_p = false;
+  for (edge ex : e->src->succs)
+   {
+ if (exits.contains (ex))
+   {
+ found_p = true;
+ e = ex;
+ break;
+   }
+   }
+
+  if (found_p)
+   {
+ loop->vec_loop_iv = e;
+ for (edge ex : exits)
+   if (e != ex)
+ loop->vec_loop_alt_exits.safe_push (ex);
+ return;
+   }
+  else
+   {
+ for (edge ex : e->src->preds)
+   workset.safe_insert (0, ex);
+   }
+  visited.add (e);
+}

So this greedily follows edges from the latch and takes the first
exit.  Why's that better than simply choosing the first?

I'd have done

 auto_vec exits = get_loop_exit_edges (loop);
 for (e : exits)
   {
 if (vect_get_loop_niters (...))
   {
 if no assumptions use that edge, if assumptions continue
 searching, maybe there's an edge w/o assumptions
   }
   }
 use (first) exit with assumptions

we probably want to know 'may_be_zero' as well and prefer an edge
without that.  So eventually call number_of_iterations_exit_assumptions
directly and look for the best niter_desc and pass that to
vect_get_loop_niters (or re-do the work).

As said for "copying" the exit to the loop copies use the block mapping.


> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
>   * tree-vect-loop.cc (vect_get_loop_niters): Analyze all exits and return
>   all gconds.
>   (vect_analyze_loop_form): Update code checking for conds.
>   (vect_create_loop_vinfo): Handle having multiple conds.
>   (vect_analyze_loop): Release extra loop conds structures.
>   * tree-vectorizer.h (LOOP_VINFO_LOOP_CONDS,
>   LOOP_VINFO_LOOP_IV_COND): New.
>   (struct vect_loop_form_info): Add conds, loop_iv_cond.
> 
> --- inline copy of patch -- 
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index 
> 55e69a7ca0b24e0872477141db6f74dbf90b7981..9065811b3b9c2a550baf44768603172b9e26b94b
>  100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -849,80 +849,106 @@ vect_fixup_scalar_cycles_with_patterns (loop_vec_info 
> loop_vinfo)
> in NUMBER_OF_ITERATIONSM1.  Place the condition under which the
> niter information holds in ASSUMPTIONS.
>  
> -   Return the loop exit condition.  */
> +   Return the loop exit conditions.  */
>  
>  
> -static gcond *
> +static vec
>  vect_get_loop_niters (class loop *loop, tree *assumptions,
> tree *number_of_iterations, tree *number_of_iterationsm1)
>  {
> -  edge exit = single_exit (loop);
> +  auto_vec exits = get_loop_exit_edges (loop);
> +  vec conds;
> +  conds.create (exits.length ());
>class tree_niter_desc niter_desc;
>tree niter_assumptions, niter, may_be_zero;
> -  gcond *cond = get_loop_exit_condition (loop);
>  
>*assumptions = boolean_true_node;
>*number_of_iterationsm1 = chrec_dont_know;
>*number_of_iterations = chrec_dont_know;
> +
>DUMP_VECT_SCOPE ("get_loop_niters");
>  
> -  if (!exit)
> -return cond;
> +  if (exits.is_empty ())
> +return conds;
>  
> -  may_be_zero = NULL_TREE;
> -  if (!number_of_iterations_exit_assumptions (loop, exit, _desc, NULL)
> -  || chrec_contains_undetermined (niter_desc.niter))
> -return cond;
> +  if (dump_enabled_p ())
> +dump_printf_loc (MSG_NOTE, vect_location, "Loop has %d exits.\n",
> +  exits.length ());
>  
> -  niter_assumptions = niter_desc.assumptions;
> -  may_be_zero = niter_desc.may_be_zero;
> -  niter = niter_desc.niter;
> +  edge exit;
> +  unsigned int i;
> +  FOR_EACH_VEC_ELT (exits, i, exit)
> +{
> +  gcond *cond = get_edge_condition (exit);
> +  if (cond)
> + conds.safe_push (cond);
>  
> -  if (may_be_zero && 

Re: [RFC] GNU Vector Extension -- Packed Boolean Vectors

2023-07-13 Thread Tejas Belagod via Gcc-patches

On 7/3/23 1:31 PM, Richard Biener wrote:

On Mon, Jul 3, 2023 at 8:50 AM Tejas Belagod  wrote:


On 6/29/23 6:55 PM, Richard Biener wrote:

On Wed, Jun 28, 2023 at 1:26 PM Tejas Belagod  wrote:






From: Richard Biener 
Date: Tuesday, June 27, 2023 at 12:58 PM
To: Tejas Belagod 
Cc: gcc-patches@gcc.gnu.org 
Subject: Re: [RFC] GNU Vector Extension -- Packed Boolean Vectors

On Tue, Jun 27, 2023 at 8:30 AM Tejas Belagod  wrote:






From: Richard Biener 
Date: Monday, June 26, 2023 at 2:23 PM
To: Tejas Belagod 
Cc: gcc-patches@gcc.gnu.org 
Subject: Re: [RFC] GNU Vector Extension -- Packed Boolean Vectors

On Mon, Jun 26, 2023 at 8:24 AM Tejas Belagod via Gcc-patches
 wrote:


Hi,

Packed Boolean Vectors
--

I'd like to propose a feature addition to GNU Vector extensions to add packed
boolean vectors (PBV).  This has been discussed in the past here[1] and a 
variant has
been implemented in Clang recently[2].

With predication features being added to vector architectures (SVE, MVE, AVX),
it is a useful feature to have to model predication on targets.  This could
find its use in intrinsics or just used as is as a GNU vector extension being
mapped to underlying target features.  For example, the packed boolean vector
could directly map to a predicate register on SVE.

Also, this new packed boolean type GNU extension can be used with SVE ACLE
intrinsics to replace a fixed-length svbool_t.

Here are a few options to represent the packed boolean vector type.


The GIMPLE frontend uses a new 'vector_mask' attribute:

typedef int v8si __attribute__((vector_size(8*sizeof(int))));
typedef v8si v8sib __attribute__((vector_mask));

it gets you a vector type that's the appropriate (dependent on the
target) vector
mask type for the vector data type (v8si in this case).



Thanks Richard.

Having had a quick look at the implementation, it does seem to tick the boxes.

I must admit I haven't dug deep, but if the target hook allows the mask to be

defined in way that is target-friendly (and I don't know how much effort it will

be to migrate the attribute to more front-ends), it should do the job nicely.

Let me go back and dig a bit deeper and get back with questions if any.



Let me add that the advantage of this is the compiler doesn't need
to support weird explicitly laid out packed boolean vectors that do
not match what the target supports and the user doesn't need to know
what the target supports (and thus have an #ifdef maze around explicitly
specified layouts).

Sorry for the delayed response – I spent a day experimenting with vector_mask.



Yeah, this is what option 4 in the RFC is trying to achieve – be portable enough

to avoid having to sprinkle the code with ifdefs.


It does remove some flexibility though, for example with -mavx512f -mavx512vl
you'll get AVX512 style masks for V4SImode data vectors but of course the
target still supports SSE2/AVX2 style masks as well, but those would not be
available as "packed boolean vectors", though they are of course in fact
equal to V4SImode data vectors with -1 or 0 values, so in this particular
case it might not matter.

That said, the vector_mask attribute will get you V4SImode vectors with
signed boolean elements of 32 bits for V4SImode data vectors with
SSE2/AVX2.



This sounds very much like what the scenario would be with NEON vs SVE. Coming 
to think

of it, vector_mask resembles option 4 in the proposal with ‘n’ implied by the 
‘base’ vector type

and a ‘w’ specified for the type.



Given its current implementation, if vector_mask is exposed to the CFE, would 
there be any

major challenges wrt implementation or defining behaviour semantics? I played 
around with a

few examples from the testsuite and wrote some new ones. I mostly tried 
operations that

the new type would have to support (unary, binary bitwise, initializations etc) 
– with a couple of exceptions

most of the ops seem to be supported. I also triggered a couple of ICEs in some 
tests involving

implicit conversions to wider/narrower vector_mask types (will raise reports 
for these). Correct me

if I’m wrong here, but we’d probably have to support a couple of new ops if 
vector_mask is exposed

to the CFE – initialization and subscript operations?


Yes, either that or restrict how the mask vectors can be used, thus
properly diagnose improper
uses.


Indeed.

   A question would be for example how to write common mask test

operations like
if (any (mask)) or if (all (mask)).


I see 2 options here. New builtins could support new types - they'd
provide a target independent way to test any and all conditions. Another
would be to let the target use its intrinsics to do them in the most
efficient way possible (which the builtins would get lowered down to
anyway).


   Likewise writing merge operations

- do those as

   a = a | (mask ? b : 0);

thus use ternary ?: for this?


Yes, like now, the ternary could just translate to

{mask[0] ? b[0] : 0, mask[1] ? b[1] : 0, 

[PATCH 3/6] arm: [MVE intrinsics] factorize vcmulq

2023-07-13 Thread Christophe Lyon via Gcc-patches
Factorize vcmulq builtins so that they use parameterized names.

We can merge them with vcadd.

2023-07-13  Christophe Lyon  

gcc/:
* config/arm/arm_mve_builtins.def (vcmulq_rot90_f)
(vcmulq_rot270_f, vcmulq_rot180_f, vcmulq_f): Add "_f" suffix.
* config/arm/iterators.md (MVE_VCADDQ_VCMULQ)
(MVE_VCADDQ_VCMULQ_M): New.
(mve_insn): Add vcmul.
(rot): Add VCMULQ_M_F, VCMULQ_ROT90_M_F, VCMULQ_ROT180_M_F,
VCMULQ_ROT270_M_F.
(VCMUL): Delete.
(mve_rot): Add VCMULQ_M_F, VCMULQ_ROT90_M_F, VCMULQ_ROT180_M_F,
VCMULQ_ROT270_M_F.
* config/arm/mve.md (mve_vcmulq): Merge into
@mve_q_f.
(mve_vcmulq_m_f, mve_vcmulq_rot180_m_f)
(mve_vcmulq_rot270_m_f, mve_vcmulq_rot90_m_f): Merge
into @mve_q_m_f.
---
 gcc/config/arm/arm_mve_builtins.def |  8 +--
 gcc/config/arm/iterators.md | 27 +++--
 gcc/config/arm/mve.md   | 92 +++--
 3 files changed, 33 insertions(+), 94 deletions(-)

diff --git a/gcc/config/arm/arm_mve_builtins.def 
b/gcc/config/arm/arm_mve_builtins.def
index 63ad1845593..56358c0bd02 100644
--- a/gcc/config/arm/arm_mve_builtins.def
+++ b/gcc/config/arm/arm_mve_builtins.def
@@ -191,6 +191,10 @@ VAR3 (BINOP_NONE_NONE_NONE, vcaddq_rot90_, v16qi, v8hi, 
v4si)
 VAR3 (BINOP_NONE_NONE_NONE, vcaddq_rot270_, v16qi, v8hi, v4si)
 VAR2 (BINOP_NONE_NONE_NONE, vcaddq_rot90_f, v8hf, v4sf)
 VAR2 (BINOP_NONE_NONE_NONE, vcaddq_rot270_f, v8hf, v4sf)
+VAR2 (BINOP_NONE_NONE_NONE, vcmulq_rot90_f, v8hf, v4sf)
+VAR2 (BINOP_NONE_NONE_NONE, vcmulq_rot270_f, v8hf, v4sf)
+VAR2 (BINOP_NONE_NONE_NONE, vcmulq_rot180_f, v8hf, v4sf)
+VAR2 (BINOP_NONE_NONE_NONE, vcmulq_f, v8hf, v4sf)
 VAR3 (BINOP_NONE_NONE_NONE, vhcaddq_rot90_s, v16qi, v8hi, v4si)
 VAR3 (BINOP_NONE_NONE_NONE, vhcaddq_rot270_s, v16qi, v8hi, v4si)
 VAR3 (BINOP_NONE_NONE_NONE, vhaddq_s, v16qi, v8hi, v4si)
@@ -874,10 +878,6 @@ VAR3 (QUADOP_UNONE_UNONE_UNONE_IMM_PRED, vshlcq_m_vec_u, 
v16qi, v8hi, v4si)
 VAR3 (QUADOP_UNONE_UNONE_UNONE_IMM_PRED, vshlcq_m_carry_u, v16qi, v8hi, v4si)
 
 /* optabs without any suffixes.  */
-VAR2 (BINOP_NONE_NONE_NONE, vcmulq_rot90, v8hf, v4sf)
-VAR2 (BINOP_NONE_NONE_NONE, vcmulq_rot270, v8hf, v4sf)
-VAR2 (BINOP_NONE_NONE_NONE, vcmulq_rot180, v8hf, v4sf)
-VAR2 (BINOP_NONE_NONE_NONE, vcmulq, v8hf, v4sf)
 VAR2 (TERNOP_NONE_NONE_NONE_NONE, vcmlaq_rot90, v8hf, v4sf)
 VAR2 (TERNOP_NONE_NONE_NONE_NONE, vcmlaq_rot270, v8hf, v4sf)
 VAR2 (TERNOP_NONE_NONE_NONE_NONE, vcmlaq_rot180, v8hf, v4sf)
diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index da1ead34e58..9f71404e26c 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -901,8 +901,19 @@
 VPSELQ_F
 ])
 
+(define_int_iterator MVE_VCADDQ_VCMULQ [
+UNSPEC_VCADD90 UNSPEC_VCADD270
+UNSPEC_VCMUL UNSPEC_VCMUL90 UNSPEC_VCMUL180 UNSPEC_VCMUL270
+])
+
+(define_int_iterator MVE_VCADDQ_VCMULQ_M [
+VCADDQ_ROT90_M_F VCADDQ_ROT270_M_F
+VCMULQ_M_F VCMULQ_ROT90_M_F VCMULQ_ROT180_M_F 
VCMULQ_ROT270_M_F
+])
+
 (define_int_attr mve_insn [
 (UNSPEC_VCADD90 "vcadd") (UNSPEC_VCADD270 "vcadd")
+(UNSPEC_VCMUL "vcmul") (UNSPEC_VCMUL90 "vcmul") 
(UNSPEC_VCMUL180 "vcmul") (UNSPEC_VCMUL270 "vcmul")
 (VABAVQ_P_S "vabav") (VABAVQ_P_U "vabav")
 (VABAVQ_S "vabav") (VABAVQ_U "vabav")
 (VABDQ_M_S "vabd") (VABDQ_M_U "vabd") (VABDQ_M_F "vabd")
@@ -931,6 +942,7 @@
 (VCLSQ_M_S "vcls")
 (VCLSQ_S "vcls")
 (VCLZQ_M_S "vclz") (VCLZQ_M_U "vclz")
+(VCMULQ_M_F "vcmul") (VCMULQ_ROT90_M_F "vcmul") 
(VCMULQ_ROT180_M_F "vcmul") (VCMULQ_ROT270_M_F "vcmul")
 (VCREATEQ_S "vcreate") (VCREATEQ_U "vcreate") (VCREATEQ_F 
"vcreate")
 (VDUPQ_M_N_S "vdup") (VDUPQ_M_N_U "vdup") (VDUPQ_M_N_F "vdup")
 (VDUPQ_N_S "vdup") (VDUPQ_N_U "vdup") (VDUPQ_N_F "vdup")
@@ -2182,7 +2194,11 @@
  (UNSPEC_VCMLA "0")
  (UNSPEC_VCMLA90 "90")
  (UNSPEC_VCMLA180 "180")
- (UNSPEC_VCMLA270 "270")])
+ (UNSPEC_VCMLA270 "270")
+ (VCMULQ_M_F "0")
+ (VCMULQ_ROT90_M_F "90")
+ (VCMULQ_ROT180_M_F "180")
+ (VCMULQ_ROT270_M_F "270")])
 
 ;; The complex operations when performed on a real complex number require two
 ;; instructions to perform the operation. e.g. complex multiplication requires
@@ -2230,10 +2246,11 @@
  (UNSPEC_VCMUL "")
  (UNSPEC_VCMUL90 "_rot90")
  (UNSPEC_VCMUL180 "_rot180")
- (UNSPEC_VCMUL270 "_rot270")])
-
-(define_int_iterator VCMUL [UNSPEC_VCMUL 

[PATCH 5/6] arm: [MVE intrinsics] factorize vcmlaq

2023-07-13 Thread Christophe Lyon via Gcc-patches
Factorize vcmlaq builtins so that they use parameterized names.

2023-07-13  Christophe Lyon  

gcc/
* config/arm/arm_mve_builtins.def (vcmlaq_rot90_f)
(vcmlaq_rot270_f, vcmlaq_rot180_f, vcmlaq_f): Add "_f" suffix.
* config/arm/iterators.md (MVE_VCMLAQ_M): New.
(mve_insn): Add vcmla.
(rot): Add VCMLAQ_M_F, VCMLAQ_ROT90_M_F, VCMLAQ_ROT180_M_F,
VCMLAQ_ROT270_M_F.
(mve_rot): Add VCMLAQ_M_F, VCMLAQ_ROT90_M_F, VCMLAQ_ROT180_M_F,
VCMLAQ_ROT270_M_F.
* config/arm/mve.md (mve_vcmlaq): Rename into ...
(@mve_q_f): ... this.
(mve_vcmlaq_m_f, mve_vcmlaq_rot180_m_f)
(mve_vcmlaq_rot270_m_f, mve_vcmlaq_rot90_m_f): Merge
into ...
(@mve_q_m_f): ... this.
---
 gcc/config/arm/arm_mve_builtins.def | 10 ++---
 gcc/config/arm/iterators.md | 19 -
 gcc/config/arm/mve.md   | 64 -
 3 files changed, 29 insertions(+), 64 deletions(-)

diff --git a/gcc/config/arm/arm_mve_builtins.def 
b/gcc/config/arm/arm_mve_builtins.def
index 56358c0bd02..43dacc3dda1 100644
--- a/gcc/config/arm/arm_mve_builtins.def
+++ b/gcc/config/arm/arm_mve_builtins.def
@@ -378,6 +378,10 @@ VAR3 (TERNOP_NONE_NONE_NONE_NONE, vmlasq_n_s, v16qi, v8hi, 
v4si)
 VAR3 (TERNOP_NONE_NONE_NONE_NONE, vmlaq_n_s, v16qi, v8hi, v4si)
 VAR3 (TERNOP_NONE_NONE_NONE_NONE, vmladavaxq_s, v16qi, v8hi, v4si)
 VAR3 (TERNOP_NONE_NONE_NONE_NONE, vmladavaq_s, v16qi, v8hi, v4si)
+VAR2 (TERNOP_NONE_NONE_NONE_NONE, vcmlaq_rot90_f, v8hf, v4sf)
+VAR2 (TERNOP_NONE_NONE_NONE_NONE, vcmlaq_rot270_f, v8hf, v4sf)
+VAR2 (TERNOP_NONE_NONE_NONE_NONE, vcmlaq_rot180_f, v8hf, v4sf)
+VAR2 (TERNOP_NONE_NONE_NONE_NONE, vcmlaq_f, v8hf, v4sf)
 VAR3 (TERNOP_NONE_NONE_NONE_IMM, vsriq_n_s, v16qi, v8hi, v4si)
 VAR3 (TERNOP_NONE_NONE_NONE_IMM, vsliq_n_s, v16qi, v8hi, v4si)
 VAR2 (TERNOP_UNONE_UNONE_UNONE_PRED, vrev32q_m_u, v16qi, v8hi)
@@ -876,9 +880,3 @@ VAR3 (QUADOP_NONE_NONE_UNONE_IMM_PRED, vshlcq_m_vec_s, 
v16qi, v8hi, v4si)
 VAR3 (QUADOP_NONE_NONE_UNONE_IMM_PRED, vshlcq_m_carry_s, v16qi, v8hi, v4si)
 VAR3 (QUADOP_UNONE_UNONE_UNONE_IMM_PRED, vshlcq_m_vec_u, v16qi, v8hi, v4si)
 VAR3 (QUADOP_UNONE_UNONE_UNONE_IMM_PRED, vshlcq_m_carry_u, v16qi, v8hi, v4si)
-
-/* optabs without any suffixes.  */
-VAR2 (TERNOP_NONE_NONE_NONE_NONE, vcmlaq_rot90, v8hf, v4sf)
-VAR2 (TERNOP_NONE_NONE_NONE_NONE, vcmlaq_rot270, v8hf, v4sf)
-VAR2 (TERNOP_NONE_NONE_NONE_NONE, vcmlaq_rot180, v8hf, v4sf)
-VAR2 (TERNOP_NONE_NONE_NONE_NONE, vcmlaq, v8hf, v4sf)
diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index 9f71404e26c..b13ff53d36f 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -911,6 +911,10 @@
 VCMULQ_M_F VCMULQ_ROT90_M_F VCMULQ_ROT180_M_F 
VCMULQ_ROT270_M_F
 ])
 
+(define_int_iterator MVE_VCMLAQ_M [
+VCMLAQ_M_F VCMLAQ_ROT90_M_F VCMLAQ_ROT180_M_F 
VCMLAQ_ROT270_M_F
+])
+
 (define_int_attr mve_insn [
 (UNSPEC_VCADD90 "vcadd") (UNSPEC_VCADD270 "vcadd")
 (UNSPEC_VCMUL "vcmul") (UNSPEC_VCMUL90 "vcmul") 
(UNSPEC_VCMUL180 "vcmul") (UNSPEC_VCMUL270 "vcmul")
@@ -942,6 +946,7 @@
 (VCLSQ_M_S "vcls")
 (VCLSQ_S "vcls")
 (VCLZQ_M_S "vclz") (VCLZQ_M_U "vclz")
+(VCMLAQ_M_F "vcmla") (VCMLAQ_ROT90_M_F "vcmla") 
(VCMLAQ_ROT180_M_F "vcmla") (VCMLAQ_ROT270_M_F "vcmla")
 (VCMULQ_M_F "vcmul") (VCMULQ_ROT90_M_F "vcmul") 
(VCMULQ_ROT180_M_F "vcmul") (VCMULQ_ROT270_M_F "vcmul")
 (VCREATEQ_S "vcreate") (VCREATEQ_U "vcreate") (VCREATEQ_F 
"vcreate")
 (VDUPQ_M_N_S "vdup") (VDUPQ_M_N_U "vdup") (VDUPQ_M_N_F "vdup")
@@ -1204,6 +1209,7 @@
 (VSUBQ_M_N_S "vsub") (VSUBQ_M_N_U "vsub") (VSUBQ_M_N_F "vsub")
 (VSUBQ_M_S "vsub") (VSUBQ_M_U "vsub") (VSUBQ_M_F "vsub")
 (VSUBQ_N_S "vsub") (VSUBQ_N_U "vsub") (VSUBQ_N_F "vsub")
+(UNSPEC_VCMLA "vcmla") (UNSPEC_VCMLA90 "vcmla") 
(UNSPEC_VCMLA180 "vcmla") (UNSPEC_VCMLA270 "vcmla")
 ])
 
 (define_int_attr isu[
@@ -2198,7 +2204,12 @@
  (VCMULQ_M_F "0")
  (VCMULQ_ROT90_M_F "90")
  (VCMULQ_ROT180_M_F "180")
- (VCMULQ_ROT270_M_F "270")])
+ (VCMULQ_ROT270_M_F "270")
+ (VCMLAQ_M_F "0")
+ (VCMLAQ_ROT90_M_F "90")
+ (VCMLAQ_ROT180_M_F "180")
+ (VCMLAQ_ROT270_M_F "270")
+ ])
 
 ;; The complex operations when performed on a real complex number require two
 ;; instructions to perform the operation. e.g. complex multiplication requires
@@ -2250,7 +2261,11 @@
  (VCMULQ_M_F "")
  (VCMULQ_ROT90_M_F "_rot90")
  (VCMULQ_ROT180_M_F "_rot180")
- 

Re: [PATCH V2] SSA MATH: Support COND_LEN_FMA for floating-point math optimization

2023-07-13 Thread Richard Biener via Gcc-patches
On Thu, 13 Jul 2023, juzhe.zh...@rivai.ai wrote:

> From: Ju-Zhe Zhong 
> 
> Hi, Richard and Richi.
> 
> In the previous patch we added support for COND_LEN_* binary operations.
> However, we didn't support COND_LEN_* ternary operations.
> 
> Now, this patch supports COND_LEN_* ternary operations. Consider the following case:
> 
> #define TEST_TYPE(TYPE)   
>  \
>   __attribute__ ((noipa)) void ternop_##TYPE (TYPE *__restrict dst,   
>  \
> TYPE *__restrict a,  \
> TYPE *__restrict b,\
> TYPE *__restrict c, int n)   \
>   {   
>  \
> for (int i = 0; i < n; i++)   
>  \
>   dst[i] += a[i] * b[i];  
>\
>   }
> 
> #define TEST_ALL() TEST_TYPE (double)
> 
> TEST_ALL ()
> 
> Before this patch:
> ...
> COND_LEN_MUL
> COND_LEN_ADD
> 
> After this patch:
> ...
> COND_LEN_FMA

OK.

Thanks,
Richard.

> gcc/ChangeLog:
> 
> * genmatch.cc (commutative_op): Add COND_LEN_*
> * internal-fn.cc (first_commutative_argument): Ditto.
> (CASE): Ditto.
> (get_unconditional_internal_fn): Ditto.
> (can_interpret_as_conditional_op_p): Ditto.
> (internal_fn_len_index): Ditto.
> * internal-fn.h (can_interpret_as_conditional_op_p): Ditto.
> * tree-ssa-math-opts.cc (convert_mult_to_fma_1): Ditto.
> (convert_mult_to_fma): Ditto.
> (math_opts_dom_walker::after_dom_children): Ditto.
> 
> ---
>  gcc/genmatch.cc   | 13 ++
>  gcc/internal-fn.cc| 87 ++-
>  gcc/internal-fn.h |  2 +-
>  gcc/tree-ssa-math-opts.cc | 80 +--
>  4 files changed, 159 insertions(+), 23 deletions(-)
> 
> diff --git a/gcc/genmatch.cc b/gcc/genmatch.cc
> index 5fceeec9780..2302f2a7ff0 100644
> --- a/gcc/genmatch.cc
> +++ b/gcc/genmatch.cc
> @@ -559,6 +559,19 @@ commutative_op (id_base *id)
>case CFN_COND_FMS:
>case CFN_COND_FNMA:
>case CFN_COND_FNMS:
> +  case CFN_COND_LEN_ADD:
> +  case CFN_COND_LEN_MUL:
> +  case CFN_COND_LEN_MIN:
> +  case CFN_COND_LEN_MAX:
> +  case CFN_COND_LEN_FMIN:
> +  case CFN_COND_LEN_FMAX:
> +  case CFN_COND_LEN_AND:
> +  case CFN_COND_LEN_IOR:
> +  case CFN_COND_LEN_XOR:
> +  case CFN_COND_LEN_FMA:
> +  case CFN_COND_LEN_FMS:
> +  case CFN_COND_LEN_FNMA:
> +  case CFN_COND_LEN_FNMS:
>   return 1;
>  
>default:
> diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> index c11123a1173..e698f0bffc7 100644
> --- a/gcc/internal-fn.cc
> +++ b/gcc/internal-fn.cc
> @@ -4191,6 +4191,19 @@ first_commutative_argument (internal_fn fn)
>  case IFN_COND_FMS:
>  case IFN_COND_FNMA:
>  case IFN_COND_FNMS:
> +case IFN_COND_LEN_ADD:
> +case IFN_COND_LEN_MUL:
> +case IFN_COND_LEN_MIN:
> +case IFN_COND_LEN_MAX:
> +case IFN_COND_LEN_FMIN:
> +case IFN_COND_LEN_FMAX:
> +case IFN_COND_LEN_AND:
> +case IFN_COND_LEN_IOR:
> +case IFN_COND_LEN_XOR:
> +case IFN_COND_LEN_FMA:
> +case IFN_COND_LEN_FMS:
> +case IFN_COND_LEN_FNMA:
> +case IFN_COND_LEN_FNMS:
>return 1;
>  
>  default:
> @@ -4330,11 +4343,14 @@ conditional_internal_fn_code (internal_fn ifn)
>  {
>switch (ifn)
>  {
> -#define CASE(CODE, IFN) case IFN_COND_##IFN: return CODE;
> -  FOR_EACH_CODE_MAPPING(CASE)
> +#define CASE(CODE, IFN)  
>   \
> +  case IFN_COND_##IFN:   
>   \
> +  case IFN_COND_LEN_##IFN:   
>   \
> +return CODE;
> +  FOR_EACH_CODE_MAPPING (CASE)
>  #undef CASE
> -default:
> -  return ERROR_MARK;
> +  default:
> + return ERROR_MARK;
>  }
>  }
>  
> @@ -4433,6 +4449,18 @@ get_unconditional_internal_fn (internal_fn ifn)
> operating elementwise if the operands are vectors.  This includes
> the case of an all-true COND, so that the operation always happens.
>  
> +   There is an alternative approach to interpret the STMT when the operands
> +   are vectors which is the operation predicated by both conditional mask
> +   and loop control length, the equivalent C code:
> +
> + for (int i = 0; i < NUNITS; i++)
> +  {
> + if (i < LEN + BIAS && COND[i])
> +   LHS[i] = A[i] CODE B[i];
> + else
> +   LHS[i] = ELSE[i];
> +  }
> +
> When returning true, set:
>  
> - *COND_OUT to the condition COND, or to NULL_TREE if the condition
> @@ -4440,13 +4468,18 @@ get_unconditional_internal_fn (internal_fn ifn)
> - *CODE_OUT to the tree code
> - OPS[I] to operand I of *CODE_OUT
> - *ELSE_OUT to the fallback value ELSE, or to 

[PATCH] RISC-V: Enable COND_LEN_FMA auto-vectorization

2023-07-13 Thread Juzhe-Zhong
Enable COND_LEN_FMA auto-vectorization for floating-point FMA when
-ffast-math is **not** enabled.

The middle-end support has been approved, and I will merge it after I
finish bootstrap and regression testing on x86:
https://gcc.gnu.org/pipermail/gcc-patches/2023-July/624395.html

Now, it's time to send this patch.

Consider the following case:

#define TEST_TYPE(TYPE)\
  __attribute__ ((noipa)) void ternop_##TYPE (TYPE *__restrict dst,\
  TYPE *__restrict a,  \
  TYPE *__restrict b, int n)   \
  {\
for (int i = 0; i < n; i++)\
  dst[i] += a[i] * b[i];   \
  }

#define TEST_ALL() TEST_TYPE (double)

TEST_ALL ()

Before this patch:

ternop_double:
ble a3,zero,.L5
mv  a6,a0
.L3:
vsetvli a5,a3,e64,m1,tu,ma
slli    a4,a5,3
vle64.v v1,0(a0)
vle64.v v2,0(a1)
vle64.v v3,0(a2)
sub a3,a3,a5
vfmul.vv    v2,v2,v3
vfadd.vv    v1,v1,v2
vse64.v v1,0(a6)
add a0,a0,a4
add a1,a1,a4
add a2,a2,a4
add a6,a6,a4
bne a3,zero,.L3
.L5:
ret

After this patch:

ternop_double:
ble a3,zero,.L5
mv  a6,a0
.L3:
vsetvli a5,a3,e64,m1,tu,ma
slli    a4,a5,3
vle64.v v1,0(a0)
vle64.v v2,0(a1)
vle64.v v3,0(a2)
sub a3,a3,a5
vfmacc.vv   v1,v3,v2
vse64.v v1,0(a6)
add a0,a0,a4
add a1,a1,a4
add a2,a2,a4
add a6,a6,a4
bne a3,zero,.L3
.L5:
ret

Note: This patch only supports COND_LEN_FMA, **not** COND_LEN_FNMA etc.,
since they are not yet supported in the middle-end.

Will support them in the following patches soon.

gcc/ChangeLog:

* config/riscv/autovec.md (cond_len_fma): New pattern.
* config/riscv/riscv-protos.h (enum insn_type): New enum.
(expand_cond_len_ternop): New function.
* config/riscv/riscv-v.cc (emit_nonvlmax_fp_ternary_tu_insn): Ditto.
(expand_cond_len_ternop): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/gather-scatter/scatter_store_run-7.c: 
Adapt testcase for link fail.
* gcc.target/riscv/rvv/autovec/ternop/ternop_nofm-1.c: New test.
* gcc.target/riscv/rvv/autovec/ternop/ternop_nofm-2.c: New test.
* gcc.target/riscv/rvv/autovec/ternop/ternop_nofm-3.c: New test.
* gcc.target/riscv/rvv/autovec/ternop/ternop_nofm_run-1.c: New test.
* gcc.target/riscv/rvv/autovec/ternop/ternop_nofm_run-2.c: New test.
* gcc.target/riscv/rvv/autovec/ternop/ternop_nofm_run-3.c: New test.

---
 gcc/config/riscv/autovec.md   | 23 +
 gcc/config/riscv/riscv-protos.h   |  2 +
 gcc/config/riscv/riscv-v.cc   | 49 +++
 .../gather-scatter/scatter_store_run-7.c  |  2 +-
 .../riscv/rvv/autovec/ternop/ternop_nofm-1.c  |  7 +++
 .../riscv/rvv/autovec/ternop/ternop_nofm-2.c  | 11 +
 .../riscv/rvv/autovec/ternop/ternop_nofm-3.c  |  9 
 .../rvv/autovec/ternop/ternop_nofm_run-1.c|  4 ++
 .../rvv/autovec/ternop/ternop_nofm_run-2.c|  4 ++
 .../rvv/autovec/ternop/ternop_nofm_run-3.c|  4 ++
 10 files changed, 114 insertions(+), 1 deletion(-)
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/ternop/ternop_nofm-1.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/ternop/ternop_nofm-2.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/ternop/ternop_nofm-3.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/ternop/ternop_nofm_run-1.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/ternop/ternop_nofm_run-2.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/ternop/ternop_nofm_run-3.c

diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md
index 0476b1dea45..64a41bd7101 100644
--- a/gcc/config/riscv/autovec.md
+++ b/gcc/config/riscv/autovec.md
@@ -1531,3 +1531,26 @@
   riscv_vector::expand_cond_len_binop (, operands);
   DONE;
 })
+
+;; -
+;;  [FP] Conditional ternary operations
+;; -
+;; Includes:
+;; - vfmacc/...
+;; -
+
+(define_expand "cond_len_fma"
+  [(match_operand:VF 0 "register_operand")
+   (match_operand: 1 "vector_mask_operand")
+   (match_operand:VF 2 "register_operand")
+   (match_operand:VF 3 "register_operand")
+   (match_operand:VF 4 

Re: Re: [PATCH] RISC-V: Enable COND_LEN_FMA auto-vectorization

2023-07-13 Thread juzhe.zh...@rivai.ai
Could you tell me how to add the comment?
I am not familiar with link/binutils stuff.



juzhe.zh...@rivai.ai
 
From: Robin Dapp
Date: 2023-07-13 19:40
To: Juzhe-Zhong; gcc-patches
CC: rdapp.gcc; kito.cheng; kito.cheng; palmer; palmer; jeffreyalaw
Subject: Re: [PATCH] RISC-V: Enable COND_LEN_FMA auto-vectorization
Hi Juzhe,
 
thanks, no complaints from my side apart from one:
 
> +/* { dg-additional-options "-mcmodel=medany" } */
 
Please add a comment why we need this.
 
Regards
Robin
 


RE: [PATCH 8/19]middle-end: updated niters analysis to handle multiple exits.

2023-07-13 Thread Tamar Christina via Gcc-patches
> -Original Message-
> From: Richard Biener 
> Sent: Thursday, July 13, 2023 12:49 PM
> To: Tamar Christina 
> Cc: gcc-patches@gcc.gnu.org; nd ; j...@ventanamicro.com
> Subject: Re: [PATCH 8/19]middle-end: updated niters analysis to handle
> multiple exits.
> 
> On Wed, 28 Jun 2023, Tamar Christina wrote:
> 
> > Hi All,
> >
> > For early break vectorization we have to update niters analysis to
> > record and analyze all exits of the loop, and so all conds.
> >
> > The niters of the loop is still determined by the main/natural exit of
> > the loop as this is the O(n) bounds.  For now we don't do much with
> > the secondary conds, but their assumptions can be used to generate
> versioning checks later.
> >
> > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> >
> > Ok for master?
> 
> I probably confused vec_init_exit_info in the previous patch - that said, I'm
> missing a clear function that determines the natural exit of the original (if-
> converted) scalar loop.  As vec_init_exit_info seems to (re-)compute that I'll
> comment on it here.

Ah was wondering if you'd seen it 

> 
> +  /* The main IV is to be determined by the block that's the first
> reachable
> + block from the latch.  We cannot rely on the order the loop analysis
> + returns and we don't have any SCEV analysis on the loop.  */
> + auto_vec  workset;  workset.safe_push (loop_latch_edge (loop));
> + hash_set  visited;
> +
> +  while (!workset.is_empty ())
> +{
> +  edge e = workset.pop ();
> +  if (visited.contains (e))
> +   continue;
> +
> +  bool found_p = false;
> +  for (edge ex : e->src->succs)
> +   {
> + if (exits.contains (ex))
> +   {
> + found_p = true;
> + e = ex;
> + break;
> +   }
> +   }
> +
> +  if (found_p)
> +   {
> + loop->vec_loop_iv = e;
> + for (edge ex : exits)
> +   if (e != ex)
> + loop->vec_loop_alt_exits.safe_push (ex);
> + return;
> +   }
> +  else
> +   {
> + for (edge ex : e->src->preds)
> +   workset.safe_insert (0, ex);
> +   }
> +  visited.add (e);
> +}
> 
> So this greedily follows edges from the latch and takes the first exit.  Why's
> that better than simply choosing the first?
> 
> I'd have done
> 
>  auto_vec exits = get_loop_exit_edges (loop);  for (e : exits)
>{
>  if (vect_get_loop_niters (...))
>{
>  if no assumptions use that edge, if assumptions continue
>  searching, maybe ther's an edge w/o assumptions
>}
>}
>  use (first) exit with assumptions
> 
> we probably want to know 'may_be_zero' as well and prefer an edge without
> that.  So eventually call number_of_iterations_exit_assumptions
> directly and look for the best niter_desc and pass that to 
> vect_get_loop_niters
> (or re-do the work).
> 
> As said for "copying" the exit to the loop copies use the block mapping.
> 

The issue is with the scalar loops, where we have no SCEV data and also no
SSA mapping data (from what I can tell, the map was cleared in ifcvt itself).

So for this to work with SCEV, we'd have to start analyzing the loop coming out 
of
LOOP_VINFO_SCALAR_LOOP as well unless I'm missing something?

Regards,
Tamar


Re: [PATCH] RISC-V: Enable COND_LEN_FMA auto-vectorization

2023-07-13 Thread Robin Dapp via Gcc-patches
Hi Juzhe,

thanks, no complaints from my side apart from one:

> +/* { dg-additional-options "-mcmodel=medany" } */

Please add a comment why we need this.

Regards
 Robin


Re: Re: [PATCH] RISC-V: Enable COND_LEN_FMA auto-vectorization

2023-07-13 Thread juzhe.zh...@rivai.ai
I have no idea.

This testcase causes a link failure:
FAIL: gcc.target/riscv/rvv/autovec/gather-scatter/scatter_store_run-7.c (test 
for excess errors)
Excess errors:
scatter_store_run-7.c:(.text.startup+0xc8e): relocation truncated to fit: 
R_RISCV_GPREL_I against `.LANCHOR1'

Only LMUL = M8 causes this issue.



juzhe.zh...@rivai.ai
 
From: Robin Dapp
Date: 2023-07-13 19:40
To: Juzhe-Zhong; gcc-patches
CC: rdapp.gcc; kito.cheng; kito.cheng; palmer; palmer; jeffreyalaw
Subject: Re: [PATCH] RISC-V: Enable COND_LEN_FMA auto-vectorization
Hi Juzhe,
 
thanks, no complaints from my side apart from one:
 
> +/* { dg-additional-options "-mcmodel=medany" } */
 
Please add a comment why we need this.
 
Regards
Robin
 


RE: [PATCH 7/19]middle-end: Refactor vectorizer loop conditionals and separate out IV to new variables

2023-07-13 Thread Tamar Christina via Gcc-patches
> e7ac2b5f3db55de3dbbab7bd2bfe08388f4ec533..cab82d7960e5be517bba2
> 621f7f4
> > 888e7bf3c295 100644
> > --- a/gcc/cfgloop.h
> > +++ b/gcc/cfgloop.h
> > @@ -272,6 +272,14 @@ public:
> >   the basic-block from being collected but its index can still be
> >   reused.  */
> >basic_block former_header;
> > +
> > +  /* The controlling loop IV for the current loop when vectorizing.  This 
> > IV
> > + controls the natural exits of the loop.  */  edge  GTY ((skip
> > + (""))) vec_loop_iv;
> > +
> > +  /* If the loop has multiple exits this structure contains the alternate
> > + exits of the loop which are relevant for vectorization.  */
> > + vec GTY ((skip (""))) vec_loop_alt_exits;
> 
> That's a quite heavy representation and as you say it's vectorizer specific.  
> May
> I ask you to eliminate at _least_ vec_loop_alt_exits?
> Are there not all exits in that vector?  Note there's already the list of 
> exits and if
> you have the canonical counting IV exit you can match against that to get all
> the others?
> 

Sure, though that means some filtering whenever one iterates over the alt exits,
not a problem though.

> >  /* Given LOOP this function generates a new copy of it and puts it
> > on E which is either the entry or exit of LOOP.  If SCALAR_LOOP is
> > @@ -1458,13 +1523,15 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class
> loop *loop,
> >edge exit, new_exit;
> >bool duplicate_outer_loop = false;
> >
> > -  exit = single_exit (loop);
> > +  exit = loop->vec_loop_iv;
> >at_exit = (e == exit);
> >if (!at_exit && e != loop_preheader_edge (loop))
> >  return NULL;
> >
> >if (scalar_loop == NULL)
> >  scalar_loop = loop;
> > +  else
> > +vec_init_exit_info (scalar_loop);
> >
> >bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
> >pbbs = bbs + 1;
> > @@ -1490,13 +1557,17 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class
> loop *loop,
> >bbs[0] = preheader;
> >new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
> >
> > -  exit = single_exit (scalar_loop);
> > +  exit = scalar_loop->vec_loop_iv;
> >copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs,
> > , 1, _exit, NULL,
> > at_exit ? loop->latch : e->src, true);
> > -  exit = single_exit (loop);
> > +  exit = loop->vec_loop_iv;
> >basic_block new_preheader = new_bbs[0];
> >
> > +  /* Record the new loop exit information.  new_loop doesn't have SCEV
> data and
> > + so we must initialize the exit information.  */
> > +  vec_init_exit_info (new_loop);
> > +
> 
> You have a mapping of old to new BB so you should be able to
> map old to new exit by mapping e->src/dest and looking up the new edge?
> 
> The vec_loop_iv exit is mapped directly (new_exit).
> 
> So I don't really understand what's missing there.

But I don't have the mapping when the loop was versioned, e.g. by ifcvt.  So in 
the cases
where scalar_loop != loop I still need them to match up.

vect_loop_form_info is destroyed after analysis though and is not available 
during
peeling. That's why we copy relevant information out in vect_create_loop_vinfo.

But in general we only have 1 per loop as well, so it would be the same as 
using loop_vinfo.

I could move it into loop_vinfo and then require you to pass the edges to the 
peeling function
as you mentioned.  This would solve the location we place them in, but still 
not sure what to do
about versioned loops.  Would need to get its main edge "somewhere", would 
another field in
loop_vinfo be ok?

Cheers,
Tamar

> > +  if (!loop->vec_loop_iv)
> > +return opt_result::failure_at (vect_location,
> > +  "not vectorized:"
> > +  " could not determine main exit from"
> > +  " loop with multiple exits.\n");
> > +
> >/* Different restrictions apply when we are considering an inner-most 
> > loop,
> >   vs. an outer (nested) loop.
> >   (FORNOW. May want to relax some of these restrictions in the future). 
> >  */
> > @@ -3025,9 +3032,8 @@ start_over:
> >if (dump_enabled_p ())
> >  dump_printf_loc (MSG_NOTE, vect_location, "epilog loop 
> > required\n");
> >if (!vect_can_advance_ivs_p (loop_vinfo)
> > - || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
> > -  single_exit (LOOP_VINFO_LOOP
> > -(loop_vinfo
> > + || !slpeel_can_duplicate_loop_p (loop_vinfo,
> > +  LOOP_VINFO_IV_EXIT (loop_vinfo)))
> >  {
> >   ok = opt_result::failure_at (vect_location,
> >"not vectorized: can't create required "
> > @@ -5964,7 +5970,7 @@ vect_create_epilog_for_reduction (loop_vec_info
> loop_vinfo,
> >   Store them in NEW_PHIS.  */
> >if (double_reduc)
> >  loop = outer_loop;
> > -  exit_bb = single_exit (loop)->dest;
> > +  

Re: [PATCH] vect: Handle demoting FLOAT and promoting FIX_TRUNC.

2023-07-13 Thread Richard Biener via Gcc-patches
On Thu, Jul 13, 2023 at 2:19 PM Robin Dapp  wrote:
>
> > Can you add testcases?  Also the current restriction is because
> > the variants you add are not always correct and I don't see any
> > checks that the intermediate type doesn't lose significant bits?
>
> The testcases I wanted to add with a follow-up RISC-V patch but
> I can also try an aarch64 one.
>
> So for my understanding, please correct, we have:
>
>   promoting int -> float, should always be safe.  We currently
>vectorize this with WIDEN and NONE.
>
>   demoting float -> int, this is safe as long as the float
>value can be represented in the int type, otherwise we must
>trap.
>We currently vectorize this on x86 using NARROW (regardless
>of -ftrapping-math) and using NONE only with -fno-trapping-math.
>
>   demoting int -> float, this is safe as long as the
>intermediate types can hold the initial value?  How is
>this different to demoting e.g. int64_t -> int8_t?
>We currently do not vectorize this with either NARROW or NONE.
>LLVM vectorizes but only with their default(?) -fno-trapping-math.
>Yet I don't see how we could trap here?
>
>   promoting float -> int, this is safe as long as the float
>value can be represented (as above)?  We currently vectorize
>this (regardless of -ftrapping-math) with WIDEN but not NONE.
>
> So apart from unifying the -ftrapping-math behavior I think only
> the third variant is somewhat critical?

I think all demoting cases need checks that are not present right now
irrespective of properly trapping.

Richard.

> Regards
>  Robin
>


Re: [PATCH ver4] rs6000, Add return value to __builtin_set_fpscr_rn

2023-07-13 Thread Kewen.Lin via Gcc-patches
Hi Carl,

on 2023/7/12 02:06, Carl Love wrote:
> GCC maintainers:
> 
> Ver 4, Removed extra space in subject line.  Added comment to commit
> log comments about new __SET_FPSCR_RN_RETURNS_FPSCR__ define.  Changed
> Added to Add and Renamed to Rename in ChangeLog.  Updated define_expand
> "rs6000_set_fpscr_rn" per Peter's comments to use new temporary
> register for output value.  Also, comments from Kewen about moving rtx
> tmp_di1 close to use.  Renamed tmp_di2 as orig_df_in_di.  Additionally,
> changed the name of tmp_di3 to tmp_di2 so the numbering is
> sequential.  Moved the new rtx tmp_di2 = gen_reg_rtx (DImode); right
> before its use to be consistent with previous move request.  Fixed tabs
> in comment.  Remove -std=c99 from test_fpscr_rn_builtin_1.c. Cleaned up
> comment and removed abort from test_fpscr_rn_builtin_2.c.  
> 
> Fixed a couple of additional issues with the ChangeLog per feedback
> from git gcc-verify.
> 
> Retested updated patch on Power 8, 9 and 10 to verify changes.
> 
> Ver 3, Renamed the patch per comments on ver 2.  Previous subject line
> was " [PATCH ver 2] rs6000, __builtin_set_fpscr_rn add retrun value".  
> Fixed spelling mistakes and formatting.  Updated define_expand
> "rs6000_set_fpscr_rn to have the rs6000_get_fpscr_fields and
> rs6000_update_fpscr_rn_field define expands inlined.  Optimized the
> code and fixed use of temporary register values. Updated the test file
> dg-do run arguments and dg-options.  Removed the check for
> __SET_FPSCR_RN_RETURNS_FPSCR__. Removed additional references to the
> overloaded built-in with double argument.  Fixed up the documentation
> file.  Updated patch retested on Power 8 BE/LE, Power 9 BE/LE and Power
> 10 LE.
> 
> Ver 2,  Went back thru the requirements and emails.  Not sure where I
> came up with the requirement for an overloaded version with double
> argument.  Removed the overloaded version with the double argument. 
> Added the macro to announce if the __builtin_set_fpscr_rn returns a
> void or a double with the FPSCR bits.  Updated the documentation file. 
> Retested on Power 8 BE/LE, Power 9 BE/LE, Power 10 LE.  Redid the test
> file.  Per request, the original test file functionality was not
> changed.  Just changed the name from test_fpscr_rn_builtin.c to 
> test_fpscr_rn_builtin_1.c.  Put new tests for the return values into a
> new test file, test_fpscr_rn_builtin_2.c.
> 
> The GLibC team requested a builtin to replace the mffscrn and
> mffscrni inline asm instructions in the GLibC code.  Previously there
> was discussion on adding builtins for the mffscrn instructions.
> 
> https://gcc.gnu.org/pipermail/gcc-patches/2023-May/620261.html
> 
> In the end, it was felt that it would be better to extend the existing
> __builtin_set_fpscr_rn builtin to return a double instead of a void
> type.  The desire is that we could have the functionality of the
> mffscrn and mffscrni instructions on older ISAs.  The two instructions
> were initially added in ISA 3.0.  The __builtin_set_fpscr_rn has the
> needed functionality to set the RN field using the mffscrn and mffscrni
> instructions if ISA 3.0 is supported or fall back to using logical
> instructions to mask and set the bits for earlier ISAs.  The
> instructions return the current value of the FPSCR fields DRN, VE, OE,
> UE, ZE, XE, NI, RN bit positions then update the RN bit positions with
> the new RN value provided.
> 
> The current __builtin_set_fpscr_rn builtin has a return type of void. 
> So, changing the return type to double and returning the  FPSCR fields
> DRN, VE, OE, UE, ZE, XE, NI, RN bit positions would then give the
> functional equivalent of the mffscrn and mffscrni instructions.  Any
> current uses of the builtin would just ignore the return value yet any
> new uses could use the return value.  So the requirement is for the
> change to the __builtin_set_fpscr_rn builtin to be backwardly
> compatible and work for all ISAs.
> 
> The following patch changes the return type of the
>  __builtin_set_fpscr_rn builtin from void to double.  The return value
> is the current value of the various FPSCR fields DRN, VE, OE, UE, ZE,
> XE, NI, RN bit positions when the builtin is called.  The builtin then
> updates the RN field with the new value provided as an argument to the
> builtin.  The patch adds new testcases to test_fpscr_rn_builtin.c to
> check that the builtin returns the current value of the FPSCR fields
> and then updates the RN field.
> 
> The GLibC team has reviewed the patch to make sure it met their needs
> as a drop in replacement for the inline asm mffscrn and mffscrni
> statements in the GLibC code.
> 
> The patch has been tested on Power 8 LE/BE, Power 9 LE/BE and Power 10
> LE.
> 
> Please let me know if the patch is acceptable for mainline.  Thanks.
> 
>Carl 
> 
> -
> rs6000, Add return value to __builtin_set_fpscr_rn
> 
> Change the return value from void to double for 

Re: Re: [PATCH] Add VXRM enum

2023-07-13 Thread 陈逸轩
Oh, sorry for that, thank you very much! XD


 -Original Messages-
 From: "Kito Cheng" 
 Sent Time: 2023-07-13 15:24:45 (Thursday)
 To: "Robin Dapp" 
 Cc: chenyix...@iscas.ac.cn, gcc-patches@gcc.gnu.org, and...@sifive.com, 
shiyul...@iscas.ac.cn, oriachi...@gmail.com, shi...@iscas.ac.cn, 
jia...@iscas.ac.cn
 Subject: Re: [PATCH] Add VXRM enum
 
 Those enum values have been defined via `#pragma riscv intrinsic "vector"` 
:)
 
 
https://github.com/gcc-mirror/gcc/commit/01d62e9b6c3e9fd3132f1616843103ccf81778ed
 
 On Thu, Jul 13, 2023 at 2:55 PM Robin Dapp via Gcc-patches
  wrote:
 
   +enum __RISCV_VXRM {
   +  __RISCV_VXRM_RNU = 0,
   +  __RISCV_VXRM_RNE = 1,
   +  __RISCV_VXRM_RDN = 2,
   +  __RISCV_VXRM_ROD = 3,
   +};
   +
__extension__ extern __inline unsigned long
__attribute__ ((__always_inline__, __gnu_inline__, 
__artificial__))
vread_csr(enum RVV_CSR csr)
 
  We have that already in riscv-protos.h :)
  (fixed_point_rounding_mode)
 
  Regards
   Robin
 


Re: [PATCH 4/4] [RISC-V] support cm.mva01s cm.mvsa01 in zcmp

2023-07-13 Thread Kito Cheng via Gcc-patches
LGTM, thanks, just like other zc* patches, I would like to defer this
until the binutils part landed :)

On Wed, Jun 7, 2023 at 1:54 PM Fei Gao  wrote:
>
> From: Die Li 
>
> Signed-off-by: Die Li 
> Co-Authored-By: Fei Gao 
>
> gcc/ChangeLog:
>
> * config/riscv/peephole.md: New pattern.
> * config/riscv/predicates.md (a0a1_reg_operand): New predicate.
> (zcmp_mv_sreg_operand): New predicate.
> * config/riscv/riscv.md: New predicate.
> * config/riscv/zc.md (*mva01s): New pattern.
> (*mvsa01): New pattern.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/riscv/cm_mv_rv32.c: New test.
> ---
>  gcc/config/riscv/peephole.md| 28 +
>  gcc/config/riscv/predicates.md  | 11 
>  gcc/config/riscv/riscv.md   |  1 +
>  gcc/config/riscv/zc.md  | 22 
>  gcc/testsuite/gcc.target/riscv/cm_mv_rv32.c | 21 
>  5 files changed, 83 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/riscv/cm_mv_rv32.c
>
> diff --git a/gcc/config/riscv/peephole.md b/gcc/config/riscv/peephole.md
> index 67e7046d7e6..e8cb1ba4838 100644
> --- a/gcc/config/riscv/peephole.md
> +++ b/gcc/config/riscv/peephole.md
> @@ -94,3 +94,31 @@
>  {
>th_mempair_order_operands (operands, true, SImode);
>  })
> +
> +;; ZCMP
> +(define_peephole2
> +  [(set (match_operand:X 0 "a0a1_reg_operand")
> +(match_operand:X 1 "zcmp_mv_sreg_operand"))
> +   (set (match_operand:X 2 "a0a1_reg_operand")
> +(match_operand:X 3 "zcmp_mv_sreg_operand"))]
> +  "TARGET_ZCMP
> +   && (REGNO (operands[2]) != REGNO (operands[0]))"
> +  [(parallel [(set (match_dup 0)
> +   (match_dup 1))
> +  (set (match_dup 2)
> +   (match_dup 3))])]
> +)
> +
> +(define_peephole2
> +  [(set (match_operand:X 0 "zcmp_mv_sreg_operand")
> +(match_operand:X 1 "a0a1_reg_operand"))
> +   (set (match_operand:X 2 "zcmp_mv_sreg_operand")
> +(match_operand:X 3 "a0a1_reg_operand"))]
> +  "TARGET_ZCMP
> +   && (REGNO (operands[0]) != REGNO (operands[2]))
> +   && (REGNO (operands[1]) != REGNO (operands[3]))"
> +  [(parallel [(set (match_dup 0)
> +   (match_dup 1))
> +  (set (match_dup 2)
> +   (match_dup 3))])]
> +)
> diff --git a/gcc/config/riscv/predicates.md b/gcc/config/riscv/predicates.md
> index a1b9367b997..6d5e8630cb5 100644
> --- a/gcc/config/riscv/predicates.md
> +++ b/gcc/config/riscv/predicates.md
> @@ -207,6 +207,17 @@
>(and (match_code "const_int")
> (match_test "riscv_zcmp_valid_stack_adj_bytes_p (INTVAL (op), 13)")))
>
> +;; ZCMP predicates
> +(define_predicate "a0a1_reg_operand"
> +  (and (match_operand 0 "register_operand")
> +   (match_test "IN_RANGE (REGNO (op), A0_REGNUM, A1_REGNUM)")))
> +
> +(define_predicate "zcmp_mv_sreg_operand"
> +  (and (match_operand 0 "register_operand")
> +   (match_test "TARGET_RVE ? IN_RANGE (REGNO (op), S0_REGNUM, S1_REGNUM)
> +: IN_RANGE (REGNO (op), S0_REGNUM, S1_REGNUM)
> +|| IN_RANGE (REGNO (op), S2_REGNUM, S7_REGNUM)")))
> +
>  ;; Only use branch-on-bit sequences when the mask is not an ANDI immediate.
>  (define_predicate "branch_on_bit_operand"
>(and (match_code "const_int")
> diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
> index 02802d2685d..25bc3e6ab4c 100644
> --- a/gcc/config/riscv/riscv.md
> +++ b/gcc/config/riscv/riscv.md
> @@ -121,6 +121,7 @@
> (S0_REGNUM  8)
> (S1_REGNUM  9)
> (A0_REGNUM  10)
> +   (A1_REGNUM  11)
> (S2_REGNUM  18)
> (S3_REGNUM  19)
> (S4_REGNUM  20)
> diff --git a/gcc/config/riscv/zc.md b/gcc/config/riscv/zc.md
> index 217e115035b..bb4975cd333 100644
> --- a/gcc/config/riscv/zc.md
> +++ b/gcc/config/riscv/zc.md
> @@ -1433,3 +1433,25 @@
>"TARGET_ZCMP"
>"cm.push {ra, s0-s11}, %0"
>  )
> +
> +;; ZCMP mv
> +(define_insn "*mva01s"
> +  [(set (match_operand:X 0 "a0a1_reg_operand" "=r")
> +(match_operand:X 1 "zcmp_mv_sreg_operand" "r"))
> +   (set (match_operand:X 2 "a0a1_reg_operand" "=r")
> +(match_operand:X 3 "zcmp_mv_sreg_operand" "r"))]
> +  "TARGET_ZCMP
> +   && (REGNO (operands[2]) != REGNO (operands[0]))"
> +  { return (REGNO (operands[0]) == 
> A0_REGNUM)?"cm.mva01s\t%1,%3":"cm.mva01s\t%3,%1"; }
> +  [(set_attr "mode" "")])
> +
> +(define_insn "*mvsa01"
> +  [(set (match_operand:X 0 "zcmp_mv_sreg_operand" "=r")
> +(match_operand:X 1 "a0a1_reg_operand" "r"))
> +   (set (match_operand:X 2 "zcmp_mv_sreg_operand" "=r")
> +(match_operand:X 3 "a0a1_reg_operand" "r"))]
> +  "TARGET_ZCMP
> +   && (REGNO (operands[0]) != REGNO (operands[2]))
> +   && (REGNO (operands[1]) != REGNO (operands[3]))"
> +  { return (REGNO (operands[1]) == 
> 

Re: [PATCH] tree-optimization/94864 - vector insert of vector extract simplification

2023-07-13 Thread Hongtao Liu via Gcc-patches
On Thu, Jul 13, 2023 at 2:32 PM Richard Biener  wrote:
>
> On Thu, 13 Jul 2023, Hongtao Liu wrote:
>
> > On Thu, Jul 13, 2023 at 10:47 AM Hongtao Liu  wrote:
> > >
> > > On Wed, Jul 12, 2023 at 9:37 PM Richard Biener via Gcc-patches
> > >  wrote:
> > > >
> > > > The PRs ask for optimizing of
> > > >
> > > >   _1 = BIT_FIELD_REF ;
> > > >   result_4 = BIT_INSERT_EXPR ;
> > > >
> > > > to a vector permutation.  The following implements this as
> > > > match.pd pattern, improving code generation on x86_64.
> > > >
> > > > On the RTL level we face the issue that backend patterns inconsistently
> > > > use vec_merge and vec_select of vec_concat to represent permutes.
> > > >
> > > > I think using a (supported) permute is almost always better
> > > > than an extract plus insert, maybe excluding the case we extract
> > > > element zero and that's aliased to a register that can be used
> > > > directly for insertion (not sure how to query that).
> > > >
> > > > But this regresses for example gcc.target/i386/pr54855-8.c because PRE
> > > > now realizes that
> > > >
> > > >   _1 = BIT_FIELD_REF ;
> > > >   if (_1 > a_4(D))
> > > > goto ; [50.00%]
> > > >   else
> > > > goto ; [50.00%]
> > > >
> > > >[local count: 536870913]:
> > > >
> > > >[local count: 1073741824]:
> > > >   # iftmp.0_2 = PHI <_1(3), a_4(D)(2)>
> > > >   x_5 = BIT_INSERT_EXPR ;
> > > >
> > > > is equal to
> > > >
> > > >[local count: 1073741824]:
> > > >   _1 = BIT_FIELD_REF ;
> > > >   if (_1 > a_4(D))
> > > > goto ; [50.00%]
> > > >   else
> > > > goto ; [50.00%]
> > > >
> > > >[local count: 536870912]:
> > > >   _7 = BIT_INSERT_EXPR ;
> > > >
> > > >[local count: 1073741824]:
> > > >   # prephitmp_8 = PHI 
> > > >
> > > > and that no longer produces the desired maxsd operation at the RTL
> > > The comparison is scalar mode, but operations in then_bb is
> > > vector_mode, if_convert can't eliminate the condition any more(and
> > > won't go into backend ix86_expand_sse_fp_minmax).
> > > I think for ordered comparisons like _1 > a_4, it doesn't match
> > > fmin/fmax, but match SSE MINSS/MAXSS since it always returns the second
> > > operand(not the other operand) when there's NONE.
> > I mean NANs.
>
> Btw, I once tried to recognize MAX here at the GIMPLE level but
> while the x86 (vector) max insns are fine for x > y ? x : y we
> have no tree code or optab for exactly that, we have MAX_EXPR
> which behaves differently for NaN and .FMAX which is exactly IEEE
> which the x86 ISA isn't.
>
> I wonder if we thus should if-convert this on the GIMPLE level
> but to x > y ? x : y, thus a COND_EXPR?
COND_EXPR maps to movcc, for x86 it's expanded by
ix86_expand_fp_movcc which will try fp minmax detect.
It's probably ok.
>
> Richard.
>
> > > > level (we fail to match .FMAX at the GIMPLE level earlier).
> > > >
> > > > Bootstrapped and tested on x86_64-unknown-linux-gnu with regressions:
> > > >
> > > > FAIL: gcc.target/i386/pr54855-13.c scan-assembler-times vmaxsh[ t] 1
> > > > FAIL: gcc.target/i386/pr54855-13.c scan-assembler-not vcomish[ t]
> > > > FAIL: gcc.target/i386/pr54855-8.c scan-assembler-times maxsd 1
> > > > FAIL: gcc.target/i386/pr54855-8.c scan-assembler-not movsd
> > > > FAIL: gcc.target/i386/pr54855-9.c scan-assembler-times minss 1
> > > > FAIL: gcc.target/i386/pr54855-9.c scan-assembler-not movss
> > > >
> > > > I think this is also PR88540 (the lack of min/max detection, not
> > > > sure if the SSE min/max are suitable here)
> > > >
> > > > PR tree-optimization/94864
> > > > PR tree-optimization/94865
> > > > * match.pd (bit_insert @0 (BIT_FIELD_REF @1 ..) ..): New pattern
> > > > for vector insertion from vector extraction.
> > > >
> > > > * gcc.target/i386/pr94864.c: New testcase.
> > > > * gcc.target/i386/pr94865.c: Likewise.
> > > > ---
> > > >  gcc/match.pd| 25 +
> > > >  gcc/testsuite/gcc.target/i386/pr94864.c | 13 +
> > > >  gcc/testsuite/gcc.target/i386/pr94865.c | 13 +
> > > >  3 files changed, 51 insertions(+)
> > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr94864.c
> > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr94865.c
> > > >
> > > > diff --git a/gcc/match.pd b/gcc/match.pd
> > > > index 8543f777a28..8cc106049c4 100644
> > > > --- a/gcc/match.pd
> > > > +++ b/gcc/match.pd
> > > > @@ -7770,6 +7770,31 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
> > > >   wi::to_wide (@ipos) + isize))
> > > >  (BIT_FIELD_REF @0 @rsize @rpos)
> > > >
> > > > +/* Simplify vector inserts of other vector extracts to a permute.  */
> > > > +(simplify
> > > > + (bit_insert @0 (BIT_FIELD_REF@2 @1 @rsize @rpos) @ipos)
> > > > + (if (VECTOR_TYPE_P (type)
> > > > +  && types_match (@0, @1)
> > > > +  && types_match (TREE_TYPE (TREE_TYPE (@0)), TREE_TYPE (@2))
> > > > +  && TYPE_VECTOR_SUBPARTS (type).is_constant ())
> > > > +  (with
> 

[PATCH 08/14] fortran: Push final procedure expr gen close to its one usage.

2023-07-13 Thread Mikael Morin via Gcc-patches
Final procedure pointer expression is generated in gfc_build_final_call
and only used in get_final_proc_ref.  Move the generation there.

gcc/fortran/ChangeLog:

* trans.cc (gfc_add_finalizer_call): Remove local variable
final_expr.  Pass down expr to get_final_proc_ref and move
final procedure expression generation down to its one usage
in get_final_proc_ref.
(get_final_proc_ref): Add argument expr.  Remove argument
final_wrapper.  Recreate final_wrapper from expr.
---
 gcc/fortran/trans.cc | 37 -
 1 file changed, 20 insertions(+), 17 deletions(-)

diff --git a/gcc/fortran/trans.cc b/gcc/fortran/trans.cc
index e5ad67199e7..c6a65c87c5c 100644
--- a/gcc/fortran/trans.cc
+++ b/gcc/fortran/trans.cc
@@ -1085,12 +1085,25 @@ gfc_call_free (tree var)
 }
 
 
-/* Generate the data reference to the finalization procedure pointer passed as
-   argument in FINAL_WRAPPER.  */
+/* Generate the data reference to the finalization procedure pointer associated
+   with the expression passed as argument in EXPR.  */
 
 static void
-get_final_proc_ref (gfc_se *se, gfc_expr *final_wrapper)
+get_final_proc_ref (gfc_se *se, gfc_expr *expr)
 {
+  gfc_expr *final_wrapper = NULL;
+
+  gcc_assert (expr->ts.type == BT_DERIVED || expr->ts.type == BT_CLASS);
+
+  if (expr->ts.type == BT_DERIVED)
+gfc_is_finalizable (expr->ts.u.derived, &final_wrapper);
+  else
+{
+  final_wrapper = gfc_copy_expr (expr);
+  gfc_add_vptr_component (final_wrapper);
+  gfc_add_final_component (final_wrapper);
+}
+
   gcc_assert (final_wrapper->expr_type == EXPR_VARIABLE);
 
   gfc_conv_expr (se, final_wrapper);
@@ -1308,7 +1321,6 @@ gfc_add_finalizer_call (stmtblock_t *block, gfc_expr 
*expr2)
   tree tmp;
   gfc_ref *ref;
   gfc_expr *expr;
-  gfc_expr *final_expr = NULL;
   bool has_finalizer = false;
 
   if (!expr2 || (expr2->ts.type != BT_DERIVED && expr2->ts.type != BT_CLASS))
@@ -1322,12 +1334,9 @@ gfc_add_finalizer_call (stmtblock_t *block, gfc_expr 
*expr2)
   && expr2->ts.u.derived->attr.defined_assign_comp)
 return false;
 
-  if (expr2->ts.type == BT_DERIVED)
-{
-  gfc_is_finalizable (expr2->ts.u.derived, _expr);
-  if (!final_expr)
-return false;
-}
+  if (expr2->ts.type == BT_DERIVED
+  && !gfc_is_finalizable (expr2->ts.u.derived, NULL))
+return false;
 
   /* If we have a class array, we need go back to the class
  container.  */
@@ -1358,20 +1367,14 @@ gfc_add_finalizer_call (stmtblock_t *block, gfc_expr 
*expr2)
 
   if (!expr2->rank && !expr2->ref && CLASS_DATA 
(expr2->symtree->n.sym)->as)
expr->rank = CLASS_DATA (expr2->symtree->n.sym)->as->rank;
-
-  final_expr = gfc_copy_expr (expr);
-  gfc_add_vptr_component (final_expr);
-  gfc_add_final_component (final_expr);
 }
 
-  gcc_assert (final_expr->expr_type == EXPR_VARIABLE);
-
   stmtblock_t tmp_block;
   gfc_start_block (_block);
 
   gfc_se final_se;
   gfc_init_se (_se, NULL);
-  get_final_proc_ref (&final_se, final_expr);
+  get_final_proc_ref (&final_se, expr);
   gfc_add_block_to_block (block, _se.pre);
 
   gfc_se size_se;
-- 
2.40.1



[PATCH 06/14] fortran: Reuse final procedure pointer expression

2023-07-13 Thread Mikael Morin via Gcc-patches
Reuse twice the same final procedure pointer expression instead of
translating it twice.
Final procedure pointer expressions were translated twice, once for the
final procedure call, and once for the check for non-nullness (if
applicable).

gcc/fortran/ChangeLog:

* trans.cc (gfc_add_finalizer_call): Move pre and post code for
the final procedure pointer expression to the outer block.
Reuse the previously evaluated final procedure pointer
expression.
---
 gcc/fortran/trans.cc | 11 +--
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/gcc/fortran/trans.cc b/gcc/fortran/trans.cc
index 5c953a07533..3750d4eca82 100644
--- a/gcc/fortran/trans.cc
+++ b/gcc/fortran/trans.cc
@@ -1375,7 +1375,7 @@ gfc_add_finalizer_call (stmtblock_t *block, gfc_expr 
*expr2)
   gfc_se final_se;
   gfc_init_se (_se, NULL);
   get_final_proc_ref (_se, final_expr);
-  gfc_add_block_to_block (_block, _se.pre);
+  gfc_add_block_to_block (block, _se.pre);
 
   gfc_se size_se;
   gfc_init_se (_se, NULL);
@@ -1395,7 +1395,6 @@ gfc_add_finalizer_call (stmtblock_t *block, gfc_expr 
*expr2)
 
   gfc_add_block_to_block (_block, _se.post);
   gfc_add_block_to_block (_block, _se.post);
-  gfc_add_block_to_block (_block, _se.post);
 
   tmp = gfc_finish_block (_block);
 
@@ -1404,11 +1403,10 @@ gfc_add_finalizer_call (stmtblock_t *block, gfc_expr 
*expr2)
   tree cond;
   gfc_se se;
 
-  gfc_init_se (, NULL);
-  se.want_pointer = 1;
-  gfc_conv_expr (, final_expr);
+  tree ptr = gfc_build_addr_expr (NULL_TREE, final_se.expr);
+
   cond = fold_build2_loc (input_location, NE_EXPR, logical_type_node,
- se.expr, build_int_cst (TREE_TYPE (se.expr), 0));
+ ptr, build_int_cst (TREE_TYPE (ptr), 0));
 
   /* For CLASS(*) not only sym->_vtab->_final can be NULL
 but already sym->_vtab itself.  */
@@ -1437,6 +1435,7 @@ gfc_add_finalizer_call (stmtblock_t *block, gfc_expr 
*expr2)
 }
 
   gfc_add_expr_to_block (block, tmp);
+  gfc_add_block_to_block (block, _se.post);
 
   return true;
 }
-- 
2.40.1



[PATCH 00/14] fortran: Use precalculated class container for deallocation [PR110618]

2023-07-13 Thread Mikael Morin via Gcc-patches
Hello, 

the following patches are about PR110618, a PR similar to PR92178 from which
it is cloned.  Both are about a problem of dependencies between arguments,
when one of them is associated to an allocatable intent(out) dummy, and thus
deallocated in the process of argument association.

PR110618 exposes a case where the data reference finalization code
for one argument references deallocated data from another argument.
The way I propose to fix this is similar to my recent patches for
PR92178 [1,2] (and is dependent on them).  Those patches try to use a data
reference pointer precalculated at the beginning of the process instead of
repeatedly evaluating an expression that becomes invalid at some point
in the generated code.

Unfortunately, the code for finalization is not prepared for this, as it
only manipulates front-end expressions, whereas the precalculated
pointer is available as middle-end's generic tree.

These patches refactor the finalization code to ease the introduction
of the aforementioned pre-calculated class container pointer.  Basically,
four expressions are calculated to build the final procedure call:
the final procedure pointer, the element size, the data reference
(array) descriptor, and (optionally) the virtual table pointer.  Each of
the four is outlined stepwise to its own separate function in the
following patches.  This abstracts away the generation of these
expressions and makes it easier to add one other way to generate them.
This should also make the impact of the changes more
visible, and regressions easier to spot.

The main changes are the two last patches introducing an additional
precalculated pointer argument in relevant functions and using them if
set.  Details are in the specific patches.

Each patch has been bubble-bootstrapped and partially tested
with RUNTESTFLAGS="dg.exp=*final*".
The complete set has been fully tested on x86_64-pc-linux-gnu.
OK for master?

[1] https://gcc.gnu.org/pipermail/fortran/2023-July/059582.html
[2] https://gcc.gnu.org/pipermail/fortran/2023-July/059583.html

Mikael Morin (14):
  fortran: Outline final procedure pointer evaluation
  fortran: Outline element size evaluation
  fortran: Outline data reference descriptor evaluation
  fortran: Inline gfc_build_final_call
  fortran: Add missing cleanup blocks
  fortran: Reuse final procedure pointer expression
  fortran: Push element size expression generation close to its usage
  fortran: Push final procedure expr gen close to its one usage.
  fortran: Inline variable definition
  fortran: Remove redundant argument in get_var_descr
  fortran: Outline virtual table pointer evaluation
  fortran: Factor scalar descriptor generation
  fortran: Use pre-evaluated class container if available [PR110618]
  fortran: Pass pre-calculated class container argument [pr110618]

 gcc/fortran/trans-array.cc  |   2 +-
 gcc/fortran/trans-expr.cc   |   7 +-
 gcc/fortran/trans-stmt.cc   |   3 +-
 gcc/fortran/trans.cc| 314 
 gcc/fortran/trans.h |   9 +-
 gcc/testsuite/gfortran.dg/intent_out_22.f90 |  37 +++
 6 files changed, 237 insertions(+), 135 deletions(-)
 create mode 100644 gcc/testsuite/gfortran.dg/intent_out_22.f90

-- 
2.40.1



[PATCH 12/14] fortran: Factor scalar descriptor generation

2023-07-13 Thread Mikael Morin via Gcc-patches
The same scalar descriptor generation code is present twice, in the
case of derived type entities, and in the case of polymorphic
non-coarray entities.  Factor it in preparation for a future third case
that will also need the same code for scalar descriptor generation.

gcc/fortran/ChangeLog:

* trans.cc (get_var_descr): Factor scalar descriptor generation.
---
 gcc/fortran/trans.cc | 33 +++--
 1 file changed, 15 insertions(+), 18 deletions(-)

diff --git a/gcc/fortran/trans.cc b/gcc/fortran/trans.cc
index 731dfb626ab..69e9329c9cb 100644
--- a/gcc/fortran/trans.cc
+++ b/gcc/fortran/trans.cc
@@ -1146,7 +1146,6 @@ static void
 get_var_descr (gfc_se *se, gfc_expr *var)
 {
   gfc_se tmp_se;
-  symbol_attribute attr;
 
   gcc_assert (var);
 
@@ -1164,13 +1163,6 @@ get_var_descr (gfc_se *se, gfc_expr *var)
{
  gfc_conv_expr (_se, var);
 //   gcc_assert (se.pre.head == NULL_TREE && se.post.head == NULL_TREE);
-
- /* No copy back needed, hence set attr's allocatable/pointer
-to zero.  */
- gfc_clear_attr ();
- tmp_se.expr = gfc_conv_scalar_to_descriptor (_se, tmp_se.expr,
-  attr);
- gcc_assert (tmp_se.post.head == NULL_TREE);
}
 }
   else
@@ -1191,20 +1183,25 @@ get_var_descr (gfc_se *se, gfc_expr *var)
  gfc_add_data_component (array_expr);
  gfc_conv_expr (_se, array_expr);
  gcc_assert (tmp_se.post.head == NULL_TREE);
-
- if (!gfc_is_coarray (array_expr))
-   {
- /* No copy back needed, hence set attr's allocatable/pointer
-to zero.  */
- gfc_clear_attr ();
- tmp_se.expr = gfc_conv_scalar_to_descriptor (_se, tmp_se.expr,
-  attr);
-   }
- gcc_assert (tmp_se.post.head == NULL_TREE);
}
   gfc_free_expr (array_expr);
 }
 
+  if (var->rank == 0)
+{
+  if (var->ts.type == BT_DERIVED
+ || !gfc_is_coarray (var))
+   {
+ /* No copy back needed, hence set attr's allocatable/pointer
+to zero.  */
+ symbol_attribute attr;
+ gfc_clear_attr ();
+ tmp_se.expr = gfc_conv_scalar_to_descriptor (_se, tmp_se.expr,
+  attr);
+   }
+  gcc_assert (tmp_se.post.head == NULL_TREE);
+}
+
   if (!POINTER_TYPE_P (TREE_TYPE (tmp_se.expr)))
 tmp_se.expr = gfc_build_addr_expr (NULL, tmp_se.expr);
 
-- 
2.40.1



[PATCH 05/14] fortran: Add missing cleanup blocks

2023-07-13 Thread Mikael Morin via Gcc-patches
Move cleanup code for the data descriptor after the finalization code
as it makes more sense to have it after.
Other cleanup blocks should be empty (element size and final pointer
are just data references), but add them by the way, just in case.

gcc/fortran/ChangeLog:

* trans.cc (gfc_add_finalizer_call): Add post code for desc_se
after the finalizer call.  Add post code for final_se and
size_se as well.
---
 gcc/fortran/trans.cc | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/gcc/fortran/trans.cc b/gcc/fortran/trans.cc
index f8ca388ab9f..5c953a07533 100644
--- a/gcc/fortran/trans.cc
+++ b/gcc/fortran/trans.cc
@@ -1391,8 +1391,12 @@ gfc_add_finalizer_call (stmtblock_t *block, gfc_expr 
*expr2)
 desc_se.expr, size_se.expr,
 boolean_false_node);
 
-  gfc_add_block_to_block (_block, _se.post);
   gfc_add_expr_to_block (_block, tmp);
+
+  gfc_add_block_to_block (_block, _se.post);
+  gfc_add_block_to_block (_block, _se.post);
+  gfc_add_block_to_block (_block, _se.post);
+
   tmp = gfc_finish_block (_block);
 
   if (expr->ts.type == BT_CLASS && !has_finalizer)
-- 
2.40.1



[PATCH 10/14] fortran: Remove redundant argument in get_var_descr

2023-07-13 Thread Mikael Morin via Gcc-patches
get_var_descr gets passed both expr and expr->ts as arguments.
Remove the type argument which can be retrieved from the other
argument.

gcc/fortran/ChangeLog:

* trans.cc (get_var_descr): Remove argument ts.  Use var->ts
instead.
(gfc_add_finalizer_call): Update caller.
---
 gcc/fortran/trans.cc | 9 -
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/gcc/fortran/trans.cc b/gcc/fortran/trans.cc
index 99677d37da7..bcf3341fd4b 100644
--- a/gcc/fortran/trans.cc
+++ b/gcc/fortran/trans.cc
@@ -1140,11 +1140,10 @@ get_elem_size (gfc_se *se, gfc_expr *expr)
 
 
 /* Generate the data reference (array) descriptor corresponding to the
-   expression passed as argument in VAR.  Use type in TS to pilot code
-   generation.  */
+   expression passed as argument in VAR.  */
 
 static void
-get_var_descr (gfc_se *se, gfc_typespec *ts, gfc_expr *var)
+get_var_descr (gfc_se *se, gfc_expr *var)
 {
   gfc_se tmp_se;
   symbol_attribute attr;
@@ -1153,7 +1152,7 @@ get_var_descr (gfc_se *se, gfc_typespec *ts, gfc_expr 
*var)
 
   gfc_init_se (_se, NULL);
 
-  if (ts->type == BT_DERIVED)
+  if (var->ts.type == BT_DERIVED)
 {
   tmp_se.want_pointer = 1;
   if (var->rank)
@@ -1381,7 +1380,7 @@ gfc_add_finalizer_call (stmtblock_t *block, gfc_expr 
*expr2)
 
   gfc_se desc_se;
   gfc_init_se (_se, NULL);
-  get_var_descr (_se, >ts, expr);
+  get_var_descr (_se, expr);
   gfc_add_block_to_block (_block, _se.pre);
 
   tmp = build_call_expr_loc (input_location, final_se.expr, 3,
-- 
2.40.1



[PATCH 1/4] Support Intel AVX-VNNI-INT16

2023-07-13 Thread Haochen Jiang via Gcc-patches
From: Kong Lingling 

gcc/ChangeLog

* common/config/i386/cpuinfo.h (get_available_features): Detect
avxvnniint16.
* common/config/i386/i386-common.cc
(OPTION_MASK_ISA2_AVXVNNIINT16_SET): New.
(OPTION_MASK_ISA2_AVXVNNIINT16_UNSET): Ditto.
(ix86_handle_option): Handle -mavxvnniint16.
* common/config/i386/i386-cpuinfo.h (enum processor_features):
Add FEATURE_AVXVNNIINT16.
* common/config/i386/i386-isas.h: Add ISA_NAME_TABLE_ENTRY for
avxvnniint16.
* config.gcc: Add avxvnniint16intrin.h.
* config/i386/avxvnniint16intrin.h: New file.
* config/i386/cpuid.h (bit_AVXVNNIINT16): New.
* config/i386/i386-builtin.def: Add new builtins.
* config/i386/i386-c.cc (ix86_target_macros_internal): Define
__AVXVNNIINT16__.
* config/i386/i386-options.cc (isa2_opts): Add -mavxvnniint16.
(ix86_valid_target_attribute_inner_p): Handle avxvnniint16.
* config/i386/i386-isa.def: Add DEF_PTA(AVXVNNIINT16).
* config/i386/i386.opt: Add option -mavxvnniint16.
* config/i386/immintrin.h: Include avxvnniint16intrin.h.
* config/i386/sse.md
(vpdp_): New define_insn.
* doc/extend.texi: Document avxvnniint16.
* doc/invoke.texi: Document -mavxvnniint16.
* doc/sourcebuild.texi: Document target avxvnniint16.

gcc/testsuite/ChangeLog

* g++.dg/other/i386-2.C: Add -mavxvnniint16.
* g++.dg/other/i386-3.C: Ditto.
* gcc.target/i386/avx-check.h: Add avxvnniint16 check.
* gcc.target/i386/sse-12.c: Add -mavxvnniint16.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Ditto.
* gcc.target/i386/sse-23.c: Ditto.
* gcc.target/i386/funcspec-56.inc: Add new target attribute.
* lib/target-supports.exp
(check_effective_target_avxvnniint16): New.
* gcc.target/i386/avxvnniint16-1.c: Ditto.
* gcc.target/i386/avxvnniint16-vpdpwusd-2.c: Ditto.
* gcc.target/i386/avxvnniint16-vpdpwusds-2.c: Ditto.
* gcc.target/i386/avxvnniint16-vpdpwsud-2.c: Ditto.
* gcc.target/i386/avxvnniint16-vpdpwsuds-2.c: Ditto.
* gcc.target/i386/avxvnniint16-vpdpwuud-2.c: Ditto.
* gcc.target/i386/avxvnniint16-vpdpwuuds-2.c: Ditto.

Co-authored-by: Haochen Jiang 
---
 gcc/common/config/i386/cpuinfo.h  |   2 +
 gcc/common/config/i386/i386-common.cc |  22 ++-
 gcc/common/config/i386/i386-cpuinfo.h |   1 +
 gcc/common/config/i386/i386-isas.h|   2 +
 gcc/config.gcc|   2 +-
 gcc/config/i386/avxvnniint16intrin.h  | 138 ++
 gcc/config/i386/cpuid.h   |   1 +
 gcc/config/i386/i386-builtin.def  |  14 ++
 gcc/config/i386/i386-c.cc |   2 +
 gcc/config/i386/i386-isa.def  |   1 +
 gcc/config/i386/i386-options.cc   |   4 +-
 gcc/config/i386/i386.opt  |   5 +
 gcc/config/i386/immintrin.h   |   2 +
 gcc/config/i386/sse.md|  32 
 gcc/doc/extend.texi   |   5 +
 gcc/doc/invoke.texi   |  10 +-
 gcc/doc/sourcebuild.texi  |   3 +
 gcc/testsuite/g++.dg/other/i386-2.C   |   2 +-
 gcc/testsuite/g++.dg/other/i386-3.C   |   2 +-
 gcc/testsuite/gcc.target/i386/avx-check.h |   3 +
 .../gcc.target/i386/avxvnniint16-1.c  |  43 ++
 .../gcc.target/i386/avxvnniint16-vpdpwsud-2.c |  71 +
 .../i386/avxvnniint16-vpdpwsuds-2.c   |  72 +
 .../gcc.target/i386/avxvnniint16-vpdpwusd-2.c |  71 +
 .../i386/avxvnniint16-vpdpwusds-2.c   |  72 +
 .../gcc.target/i386/avxvnniint16-vpdpwuud-2.c |  71 +
 .../i386/avxvnniint16-vpdpwuuds-2.c   |  71 +
 gcc/testsuite/gcc.target/i386/funcspec-56.inc |   2 +
 gcc/testsuite/gcc.target/i386/sse-12.c|   2 +-
 gcc/testsuite/gcc.target/i386/sse-13.c|   2 +-
 gcc/testsuite/gcc.target/i386/sse-14.c|   2 +-
 gcc/testsuite/gcc.target/i386/sse-22.c|   4 +-
 gcc/testsuite/gcc.target/i386/sse-23.c|   2 +-
 gcc/testsuite/lib/target-supports.exp |  12 ++
 34 files changed, 735 insertions(+), 15 deletions(-)
 create mode 100644 gcc/config/i386/avxvnniint16intrin.h
 create mode 100644 gcc/testsuite/gcc.target/i386/avxvnniint16-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avxvnniint16-vpdpwsud-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avxvnniint16-vpdpwsuds-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avxvnniint16-vpdpwusd-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avxvnniint16-vpdpwusds-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avxvnniint16-vpdpwuud-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avxvnniint16-vpdpwuuds-2.c

[PATCH 3/4] Support Intel SHA512

2023-07-13 Thread Haochen Jiang via Gcc-patches
gcc/ChangeLog:

* common/config/i386/cpuinfo.h (get_available_features):
Detect SHA512.
* common/config/i386/i386-common.cc (OPTION_MASK_ISA2_SHA512_SET,
OPTION_MASK_ISA2_SHA512_UNSET): New.
(OPTION_MASK_ISA2_AVX_UNSET): Add SHA512.
(ix86_handle_option): Handle -msha512.
* common/config/i386/i386-cpuinfo.h (enum processor_features):
Add FEATURE_SHA512.
* common/config/i386/i386-isas.h: Add ISA_NAME_TABLE_ENTRY for
sha512.
* config.gcc: Add sha512intrin.h.
* config/i386/cpuid.h (bit_SHA512): New.
* config/i386/i386-builtin-types.def:
Add DEF_FUNCTION_TYPE (V4DI, V4DI, V4DI, V2DI).
* config/i386/i386-builtin.def (BDESC): Add new builtins.
* config/i386/i386-c.cc (ix86_target_macros_internal): Define
__SHA512__.
* config/i386/i386-expand.cc (ix86_expand_args_builtin): Handle
V4DI_FTYPE_V4DI_V4DI_V2DI and V4DI_FTYPE_V4DI_V2DI.
* config/i386/i386-isa.def (SHA512): Add DEF_PTA(SHA512).
* config/i386/i386-options.cc (isa2_opts): Add -msha512.
(ix86_valid_target_attribute_inner_p): Handle sha512.
* config/i386/i386.opt: Add option -msha512.
* config/i386/immintrin.h: Include sha512intrin.h.
* config/i386/sse.md (vsha512msg1): New define insn.
(vsha512msg2): Ditto.
(vsha512rnds2): Ditto.
* doc/extend.texi: Document sha512.
* doc/invoke.texi: Document -msha512.
* doc/sourcebuild.texi: Document target sha512.
* config/i386/sha512intrin.h: New file.

gcc/testsuite/ChangeLog:

* g++.dg/other/i386-2.C: Add -msha512.
* g++.dg/other/i386-3.C: Ditto.
* gcc.target/i386/funcspec-56.inc: Add new target attribute.
* gcc.target/i386/sse-12.c: Add -msha512.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Add sha512.
* gcc.target/i386/sse-23.c: Ditto.
* lib/target-supports.exp (check_effective_target_sha512): New.
* gcc.target/i386/sha512-1.c: New test.
* gcc.target/i386/sha512-check.h: Ditto.
* gcc.target/i386/sha512msg1-2.c: Ditto.
* gcc.target/i386/sha512msg2-2.c: Ditto.
* gcc.target/i386/sha512rnds2-2.c: Ditto.
---
 gcc/common/config/i386/cpuinfo.h  |  2 +
 gcc/common/config/i386/i386-common.cc | 19 -
 gcc/common/config/i386/i386-cpuinfo.h |  1 +
 gcc/common/config/i386/i386-isas.h|  1 +
 gcc/config.gcc|  2 +-
 gcc/config/i386/cpuid.h   |  1 +
 gcc/config/i386/i386-builtin-types.def|  3 +
 gcc/config/i386/i386-builtin.def  |  5 ++
 gcc/config/i386/i386-c.cc |  2 +
 gcc/config/i386/i386-expand.cc|  2 +
 gcc/config/i386/i386-isa.def  |  1 +
 gcc/config/i386/i386-options.cc   |  4 +-
 gcc/config/i386/i386.opt  | 10 +++
 gcc/config/i386/immintrin.h   |  2 +
 gcc/config/i386/sha512intrin.h| 64 ++
 gcc/config/i386/sse.md| 40 +
 gcc/doc/extend.texi   |  5 ++
 gcc/doc/invoke.texi   | 10 ++-
 gcc/doc/sourcebuild.texi  |  3 +
 gcc/testsuite/g++.dg/other/i386-2.C   |  2 +-
 gcc/testsuite/g++.dg/other/i386-3.C   |  2 +-
 gcc/testsuite/gcc.target/i386/funcspec-56.inc |  2 +
 gcc/testsuite/gcc.target/i386/sha512-1.c  | 18 
 gcc/testsuite/gcc.target/i386/sha512-check.h  | 43 ++
 gcc/testsuite/gcc.target/i386/sha512msg1-2.c  | 48 +++
 gcc/testsuite/gcc.target/i386/sha512msg2-2.c  | 47 ++
 gcc/testsuite/gcc.target/i386/sha512rnds2-2.c | 85 +++
 gcc/testsuite/gcc.target/i386/sse-12.c|  2 +-
 gcc/testsuite/gcc.target/i386/sse-13.c|  2 +-
 gcc/testsuite/gcc.target/i386/sse-14.c|  2 +-
 gcc/testsuite/gcc.target/i386/sse-22.c|  4 +-
 gcc/testsuite/gcc.target/i386/sse-23.c|  2 +-
 gcc/testsuite/lib/target-supports.exp | 14 +++
 33 files changed, 436 insertions(+), 14 deletions(-)
 create mode 100644 gcc/config/i386/sha512intrin.h
 create mode 100644 gcc/testsuite/gcc.target/i386/sha512-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/sha512-check.h
 create mode 100644 gcc/testsuite/gcc.target/i386/sha512msg1-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/sha512msg2-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/sha512rnds2-2.c

diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h
index e5cdffe017a..0cfde3ebccd 100644
--- a/gcc/common/config/i386/cpuinfo.h
+++ b/gcc/common/config/i386/cpuinfo.h
@@ -879,6 +879,8 @@ get_available_features (struct __processor_model *cpu_model,
set_feature (FEATURE_AVXVNNIINT16);
  if (eax & bit_SM3)
  

Re: [PATCH] tree-optimization/94864 - vector insert of vector extract simplification

2023-07-13 Thread Richard Biener via Gcc-patches
On Wed, 12 Jul 2023, Richard Sandiford wrote:

> Richard Biener  writes:
> > The PRs ask for optimizing of
> >
> >   _1 = BIT_FIELD_REF ;
> >   result_4 = BIT_INSERT_EXPR ;
> >
> > to a vector permutation.  The following implements this as
> > match.pd pattern, improving code generation on x86_64.
> >
> > On the RTL level we face the issue that backend patterns inconsistently
> > use vec_merge and vec_select of vec_concat to represent permutes.
> 
> Yeah, the current RTL codes probably overlap a bit too much.
> 
> Maybe we should have a rule that a vec_merge with a constant
> third operand should be canonicalised to a vec_select?

But vec_merge always has a constant third operand:

@findex vec_merge
@item (vec_merge:@var{m} @var{vec1} @var{vec2} @var{items})
This describes a merge operation between two vectors.  The result is a 
vector
of mode @var{m}; its elements are selected from either @var{vec1} or
@var{vec2}.  Which elements are selected is described by @var{items}, 
which
is a bit mask represented by a @code{const_int}; a zero bit indicates the
corresponding element in the result vector is taken from @var{vec2} while
a set bit indicates it is taken from @var{vec1}.

the "advantage" of vec_merge over vec_concat + vec_select is
that you don't need the 2x wider vector mode, but that's the
only one.  I guess we could allow a mode-less (VOIDmode) vec_concat as
the first operand of a vec_select since that mode isn't really
used for anything.

That said, we could work around the issue by having combine
also try to match vec_merge when the vec_select + vec_concat
combination is a blend.  But I fear that doesn't resonate well
with Segher.

>  And maybe
> change the first operand of vec_select to be an rtvec, so that
> no separate vec_concat (and thus wider mode) is needed for two-input
> permutes?  Would be a lot of work though...
> 
> > I think using a (supported) permute is almost always better
> > than an extract plus insert, maybe excluding the case we extract
> > element zero and that's aliased to a register that can be used
> > directly for insertion (not sure how to query that).
> 
> Yeah, extraction of the low element (0 for LE, N-1 for BE) is special
> in RTL, in that it is now folded to a subreg.  But IMO it's reasonable
> for even that case to through TARGET_VECTORIZE_VEC_PERM_CONST,
> maybe with a target-independent helper function to match permute
> vectors that are equivalent to extract-and-insert.
> 
> On AArch64, extract-and-insert is a single operation for other
> elements too, e.g.:
> 
>   ins v0.s[2], v1.s[1]
> 
> is a thing.  But if the helper returns the index of the extracted
> elements, targets can decide for themselves whether the index is
> supported or not.
> 
> Agree that this is the right thing for gimple to do FWIW.

I think so as well.  Btw, I think only proper re-association and
merging will handle a full sequence of select, merge and permute
optimally.  In principle we have the bswap pass facility for this.

Richard.

> Thanks,
> Richard
> 
> > But this regresses for example gcc.target/i386/pr54855-8.c because PRE
> > now realizes that
> >
> >   _1 = BIT_FIELD_REF ;
> >   if (_1 > a_4(D))
> > goto ; [50.00%]
> >   else
> > goto ; [50.00%]
> >
> >[local count: 536870913]:
> >
> >[local count: 1073741824]:
> >   # iftmp.0_2 = PHI <_1(3), a_4(D)(2)>
> >   x_5 = BIT_INSERT_EXPR ;
> >
> > is equal to
> >
> >[local count: 1073741824]:
> >   _1 = BIT_FIELD_REF ;
> >   if (_1 > a_4(D))
> > goto ; [50.00%]
> >   else
> > goto ; [50.00%]
> >
> >[local count: 536870912]:
> >   _7 = BIT_INSERT_EXPR ;
> >
> >[local count: 1073741824]:
> >   # prephitmp_8 = PHI 
> >
> > and that no longer produces the desired maxsd operation at the RTL
> > level (we fail to match .FMAX at the GIMPLE level earlier).
> >
> > Bootstrapped and tested on x86_64-unknown-linux-gnu with regressions:
> >
> > FAIL: gcc.target/i386/pr54855-13.c scan-assembler-times vmaxsh[ t] 1
> > FAIL: gcc.target/i386/pr54855-13.c scan-assembler-not vcomish[ t]
> > FAIL: gcc.target/i386/pr54855-8.c scan-assembler-times maxsd 1
> > FAIL: gcc.target/i386/pr54855-8.c scan-assembler-not movsd
> > FAIL: gcc.target/i386/pr54855-9.c scan-assembler-times minss 1
> > FAIL: gcc.target/i386/pr54855-9.c scan-assembler-not movss
> >
> > I think this is also PR88540 (the lack of min/max detection, not
> > sure if the SSE min/max are suitable here)
> >
> > PR tree-optimization/94864
> > PR tree-optimization/94865
> > * match.pd (bit_insert @0 (BIT_FIELD_REF @1 ..) ..): New pattern
> > for vector insertion from vector extraction.
> >
> > * gcc.target/i386/pr94864.c: New testcase.
> > * gcc.target/i386/pr94865.c: Likewise.
> > ---
> >  gcc/match.pd| 25 +
> >  gcc/testsuite/gcc.target/i386/pr94864.c | 13 +
> >  gcc/testsuite/gcc.target/i386/pr94865.c | 13 +
> >  3 files changed, 51 insertions(+)
> >  

Re: [PATCH] Add VXRM enum

2023-07-13 Thread Kito Cheng via Gcc-patches
Those enum values have been defined via `#pragma riscv intrinsic "vector"` :)

https://github.com/gcc-mirror/gcc/commit/01d62e9b6c3e9fd3132f1616843103ccf81778ed

On Thu, Jul 13, 2023 at 2:55 PM Robin Dapp via Gcc-patches
 wrote:
>
> > +enum __RISCV_VXRM {
> > +  __RISCV_VXRM_RNU = 0,
> > +  __RISCV_VXRM_RNE = 1,
> > +  __RISCV_VXRM_RDN = 2,
> > +  __RISCV_VXRM_ROD = 3,
> > +};
> > +
> >  __extension__ extern __inline unsigned long
> >  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> >  vread_csr(enum RVV_CSR csr)
>
> We have that already in riscv-protos.h :)
> (fixed_point_rounding_mode)
>
> Regards
>  Robin
>


Re: [PATCH v2] RISC-V: Refactor riscv mode after for VXRM and FRM

2023-07-13 Thread Kito Cheng via Gcc-patches
Oh, I know why that failed: you need to put it inside the function
rather than making it a global static.  A function-local static variable
is constructed when the function is first invoked, rather than at
program start.

Could you try to apply my diff in the last mail and try again?

On Thu, Jul 13, 2023 at 3:29 PM Li, Pan2 via Gcc-patches
 wrote:
>
> Thanks Kito for review. Sorry didn't involve the code result in self test 
> error in PATCH v3, but it can be reproduced with below diff based on PATCH 
> v3. Let me know if I didn't get the point of your comments.
>
> diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
> index 6ed735d6983..76689eaf8d5 100644
> --- a/gcc/config/riscv/riscv.cc
> +++ b/gcc/config/riscv/riscv.cc
> @@ -233,6 +233,9 @@ static int epilogue_cfa_sp_offset;
>  /* Which tuning parameters to use.  */
>  static const struct riscv_tune_param *tune_param;
>
> +static const_rtx vxrm_rtx = gen_rtx_REG (SImode, VXRM_REGNUM);
> +static const_rtx frm_rtx = gen_rtx_REG (SImode, FRM_REGNUM);
> +
>  /* Which automaton to use for tuning.  */
>  enum riscv_microarchitecture_type riscv_microarchitecture;
>
> @@ -7717,7 +7720,7 @@ static bool
>  vxrm_unknown_p (rtx_insn *insn)
>  {
>/* Return true if there is a definition of VXRM.  */
> -  if (reg_set_p (gen_rtx_REG (SImode, VXRM_REGNUM), insn))
> +  if (reg_set_p (vxrm_rtx, insn))
>  return true;
>
>/* A CALL function may contain an instruction that modifies the VXRM,
> @@ -7739,7 +7742,7 @@ static bool
>  frm_unknown_dynamic_p (rtx_insn *insn)
>  {
>/* Return true if there is a definition of FRM.  */
> -  if (reg_set_p (gen_rtx_REG (SImode, FRM_REGNUM), insn))
> +  if (reg_set_p (frm_rtx, insn))
>  return true;
>
>/* A CALL function may contain an instruction that modifies the FRM,
> @@ -7761,7 +7764,7 @@ riscv_vxrm_mode_after (rtx_insn *insn, int mode)
>if (recog_memoized (insn) < 0)
>  return mode;
>
> -  if (reg_mentioned_p (gen_rtx_REG (SImode, VXRM_REGNUM), PATTERN (insn)))
> +  if (reg_mentioned_p (vxrm_rtx, PATTERN (insn)))
>  return get_attr_vxrm_mode (insn);
>else
>  return mode;
> @@ -7778,7 +7781,7 @@ riscv_frm_mode_after (rtx_insn *insn, int mode)
>if (recog_memoized (insn) < 0)
>  return mode;
>
> -  if (reg_mentioned_p (gen_rtx_REG (SImode, FRM_REGNUM), PATTERN (insn)))
> +  if (reg_mentioned_p (frm_rtx, PATTERN (insn)))
>  return get_attr_frm_mode (insn);
>else
>  return mode;
>
> Pan
>
> -Original Message-
> From: Kito Cheng 
> Sent: Thursday, July 13, 2023 2:19 PM
> To: Li, Pan2 
> Cc: Jeff Law ; gcc-patches@gcc.gnu.org; 
> juzhe.zh...@rivai.ai; rdapp@gmail.com; Wang, Yanzhang 
> 
> Subject: Re: [PATCH v2] RISC-V: Refactor riscv mode after for VXRM and FRM
>
> Hmmm? I didn't get that error on selftest?
>
> my diff with your v2:
>
> $ git diff
> diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
> index 12655f7fdc65..466e1aed91c7 100644
> --- a/gcc/config/riscv/riscv.cc
> +++ b/gcc/config/riscv/riscv.cc
> @@ -8058,8 +8058,9 @@ asm_insn_p (rtx_insn *insn)
> static bool
> vxrm_unknown_p (rtx_insn *insn)
> {
> +  static const_rtx vxrm_reg = gen_rtx_REG (SImode, VXRM_REGNUM);
>   /* Return true if there is a definition of VXRM.  */
> -  if (reg_set_p (gen_rtx_REG (SImode, VXRM_REGNUM), insn))
> +  if (reg_set_p (vxrm_reg, insn))
> return true;
>
>   /* A CALL function may contain an instruction that modifies the VXRM,
> @@ -8080,8 +8081,9 @@ vxrm_unknown_p (rtx_insn *insn)
> static bool
> frm_unknown_dynamic_p (rtx_insn *insn)
> {
> +  static const_rtx frm_reg = gen_rtx_REG (SImode, FRM_REGNUM);
>   /* Return true if there is a definition of FRM.  */
> -  if (reg_set_p (gen_rtx_REG (SImode, FRM_REGNUM), insn))
> +  if (reg_set_p (frm_reg, insn))
> return true;
>
>   /* A CALL function may contain an instruction that modifies the FRM,
>
>
> On Thu, Jul 13, 2023 at 1:07 PM Li, Pan2 via Gcc-patches
>  wrote:
> >
> > Thanks Jeff and Kito for comments, update the V3 version as below.
> >
> > https://gcc.gnu.org/pipermail/gcc-patches/2023-July/624347.html
> >
> > > Extract vxrm reg to a local static variable to prevent construct that 
> > > again and again.
> >
> > The "static const_rtx vxrm_rtx = gen_rtx_REG (SImode, VXRM_REGMU)" results 
> > in some error when selftest like below, thus patch v3 doesn't include this 
> > change.
> >
> > /home/pli/repos/gcc/111/riscv-gnu-toolchain/build-gcc-newlib-stage1/./gcc/xgcc
> >  
> > -B/home/pli/repos/gcc/111/riscv-gnu-toolchain/build-gcc-newlib-stage1/./gcc/
> >   -xc -nostdinc /dev/null -S -o /dev/null 
> > -fself-test=../.././gcc/gcc/testsuite/selftests
> > virtual memory exhausted: Invalid argument
> > make[2]: *** [../.././gcc/gcc/c/Make-lang.in:153: s-selftest-c] Error 1
> >
> > Pan
> >
> > -Original Message-
> > From: Jeff Law 
> > Sent: Wednesday, July 12, 2023 11:31 PM
> > To: Li, Pan2 ; gcc-patches@gcc.gnu.org
> > Cc: juzhe.zh...@rivai.ai; rdapp@gmail.com; Wang, Yanzhang 
> > ; 

[PATCH V2] RISC-V: Throw compilation error for unknown sub-extension or supervisor extension

2023-07-13 Thread Lehua Ding
Hi,

This tiny patch adds a check for extensions starting with 'z' or 's' in the
`-march` option. Currently such an unknown extension is passed to the
assembler, which then reports an error. With this patch, the compiler will
throw a compilation error if an extension starting with 'z' or 's' is not a
standard sub-extension or supervisor extension. It comes with two extra
changes. The first is to reduce repeated errors, which are currently reported
at least twice. The second is to report as many mistakes as possible.

e.g.:

Running `riscv64-unknown-elf-gcc -march=rv64gvcw_zvl128_s123_x123 -mabi=lp64d a.c`
will throw these errors:

riscv64-unknown-elf-gcc: error: '-march=rv64gcv_zvl128_s123': ISA string is not 
in canonical order. 'c'
riscv64-unknown-elf-gcc: error: '-march=rv64gcv_zvl128_s123': extension 'w' is 
unsupported standard single letter extension
riscv64-unknown-elf-gcc: error: '-march=rv64gcv_zvl128_s123': extension 
'zvl128' start with `z` but is unsupported standard extension
riscv64-unknown-elf-gcc: error: '-march=rv64gcv_zvl128_s123': extension 's123' 
start with `s` but is unsupported standard supervisor extension
riscv64-unknown-elf-gcc: error: '-march=rv64gcv_zvl128_s123': extension 'x123' 
start with `x` but is unsupported non-standard extension

Best,
Lehua

gcc/ChangeLog:

* common/config/riscv/riscv-common.cc (riscv_supported_std_ext): Init.
(standard_extensions_p): Add check.
(riscv_subset_list::add): Just return NULL if it failed before.
(riscv_subset_list::parse_std_ext): Continue parsing when an error is found.
(riscv_subset_list::parse): Just return NULL if it failed before.
* config/riscv/riscv-subset.h (class riscv_subset_list): Add field.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/arch-2.c: Update -march.
* gcc.target/riscv/arch-3.c: Ditto.
* gcc.target/riscv/arch-5.c: Ditto.
* gcc.target/riscv/arch-8.c: Ditto.
* gcc.target/riscv/attribute-10.c: Ditto.
* gcc.target/riscv/attribute-18.c: Ditto.
* gcc.target/riscv/attribute-19.c: Ditto.
* gcc.target/riscv/attribute-8.c: Ditto.
* gcc.target/riscv/attribute-9.c: Ditto.
* gcc.target/riscv/pr102957.c: Ditto.
* gcc.target/riscv/arch-22.cc: New test.

---
 gcc/common/config/riscv/riscv-common.cc   | 68 +++
 gcc/config/riscv/riscv-subset.h   |  5 ++
 gcc/testsuite/gcc.target/riscv/arch-2.c   |  2 +-
 gcc/testsuite/gcc.target/riscv/arch-22.cc | 11 +++
 gcc/testsuite/gcc.target/riscv/arch-3.c   |  2 +-
 gcc/testsuite/gcc.target/riscv/arch-5.c   |  2 +-
 gcc/testsuite/gcc.target/riscv/arch-8.c   |  2 +-
 gcc/testsuite/gcc.target/riscv/attribute-10.c |  2 +-
 gcc/testsuite/gcc.target/riscv/attribute-18.c |  4 +-
 gcc/testsuite/gcc.target/riscv/attribute-19.c |  4 +-
 gcc/testsuite/gcc.target/riscv/attribute-8.c  |  4 +-
 gcc/testsuite/gcc.target/riscv/attribute-9.c  |  4 +-
 gcc/testsuite/gcc.target/riscv/pr102957.c |  2 +
 13 files changed, 87 insertions(+), 25 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/riscv/arch-22.cc

diff --git a/gcc/common/config/riscv/riscv-common.cc 
b/gcc/common/config/riscv/riscv-common.cc
index 6091d8f281b..9de7c54269e 100644
--- a/gcc/common/config/riscv/riscv-common.cc
+++ b/gcc/common/config/riscv/riscv-common.cc
@@ -311,6 +311,8 @@ static const char *riscv_tunes[] =
 
 static const char *riscv_supported_std_ext (void);
 
+bool riscv_subset_list::parse_failed = false;
+
 static riscv_subset_list *current_subset_list = NULL;
 
 const riscv_subset_list *riscv_current_subset_list ()
@@ -518,6 +520,18 @@ subset_cmp (const std::string , const std::string )
 }
 }
 
+/* Return true if EXT is a standard extension.  */
+
+static bool
+standard_extensions_p (const char *ext)
+{
+  const riscv_ext_version *ext_ver;
+  for (ext_ver = _ext_version_table[0]; ext_ver->name != NULL; ++ext_ver)
+if (strcmp (ext, ext_ver->name) == 0)
+  return true;
+  return false;
+}
+
 /* Add new subset to list.  */
 
 void
@@ -546,6 +560,38 @@ riscv_subset_list::add (const char *subset, int 
major_version,
 
   return;
 }
+  else if (strlen (subset) == 1 && !standard_extensions_p (subset))
+{
+  error_at (m_loc,
+   "%<-march=%s%>: extension %qs is unsupported standard single "
+   "letter extension",
+   m_arch, subset);
+  return;
+}
+  else if (subset[0] == 'z' && !standard_extensions_p (subset))
+{
+  error_at (m_loc,
+   "%<-march=%s%>: extension %qs starts with `z` but is "
+   "unsupported standard extension",
+   m_arch, subset);
+  return;
+}
+  else if (subset[0] == 's' && !standard_extensions_p (subset))
+{
+  error_at (m_loc,
+   "%<-march=%s%>: extension %qs start with `s` but is "
+   "unsupported standard supervisor extension",
+   m_arch, subset);
+  return;
+}
+  

Re: [PATCH 2/4] [RISC-V] support cm.popretz in zcmp

2023-07-13 Thread Kito Cheng via Gcc-patches
I was wondering whether it is possible to use peephole2 to optimize this
case, but I realized there are several barriers, like stack tie and
note... so it seems hard to just leverage peephole2.

And the patch LGTM, with only a few minor coding format issues, but you
don't need to send a new patch; I can fix those when I push. I would
strongly suggest you set up git-format-patch; /contrib
has a clang-format setting that can relieve you of the boring
coding format issues.

# Copy to /.clang-format, so that clang-format can find it
automatically.
$ cp contrib/clang-format .clang-format


> @@ -5747,6 +5748,80 @@ riscv_adjust_libcall_cfi_epilogue ()
>return dwarf;
>  }
>
> +/* return true if popretz pattern can be matched.
> +   set (reg 10 a0) (const_int 0)
> +   use (reg 10 a0)
> +   NOTE_INSN_EPILOGUE_BEG  */
> +static rtx_insn *
> +riscv_zcmp_can_use_popretz(void)

Need space between function name and (void)

> +{
> +  rtx_insn *insn = NULL, *use = NULL, *clear = NULL;
> +
> +  /* sequence stack for NOTE_INSN_EPILOGUE_BEG*/
> +  struct sequence_stack * outer_seq = get_current_sequence ()->next;
> +  if (!outer_seq)
> +return NULL;
> +  insn = outer_seq->first;
> +  if(!insn || !NOTE_P (insn) || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG)
> +return NULL;
> +
> +  /* sequence stack for the insn before NOTE_INSN_EPILOGUE_BEG*/
> +  outer_seq = outer_seq->next;
> +  if (outer_seq)
> +insn = outer_seq->last;
> +
> +  /* skip notes  */
> +  while (insn && NOTE_P (insn))
> +{
> +  insn = PREV_INSN (insn);
> +}
> +  use = insn;
> +
> +  /* match use (reg 10 a0)  */
> +  if (use == NULL || !INSN_P (use)
> +  || GET_CODE (PATTERN (use)) != USE
> +  || !REG_P(XEXP(PATTERN (use), 0))
> +  || REGNO(XEXP(PATTERN (use), 0)) != A0_REGNUM)
> +return NULL;
> +
> +  /* match set (reg 10 a0) (const_int 0 [0])  */
> +  clear = PREV_INSN (use);
> +  if (clear != NULL && INSN_P (clear)
> +  && GET_CODE (PATTERN (clear)) == SET
> +  && REG_P (SET_DEST (PATTERN (clear)))
> +  && REGNO (SET_DEST (PATTERN (clear))) == A0_REGNUM
> +  && SET_SRC (PATTERN (clear)) == const0_rtx)
> +return clear;
> +
> +  return NULL;
> +}
> +
> +static void
> +riscv_gen_multi_pop_insn(bool use_multi_pop_normal, unsigned mask,
> + unsigned multipop_size)

Same issue here, need space between argument and function name.


Re: Re: [PATCH] SSA MATH: Support COND_LEN_FMA for floating-point math optimization

2023-07-13 Thread juzhe.zh...@rivai.ai
Hi, Richard.

>> either before or after white-space seems broken.
I use clang-format with the format in gcc/contrib/format. 
I manually adjust it, could you take a look to see whether the format issue is 
still there?

I have addressed all your comments in the V2 patch:
https://gcc.gnu.org/pipermail/gcc-patches/2023-July/624395.html 

Does it look more reasonable ?

Thanks.


juzhe.zh...@rivai.ai
 
From: Richard Biener
Date: 2023-07-13 15:53
To: Ju-Zhe Zhong
CC: gcc-patches; richard.sandiford
Subject: Re: [PATCH] SSA MATH: Support COND_LEN_FMA for floating-point math 
optimization
On Thu, 13 Jul 2023, juzhe.zh...@rivai.ai wrote:
 
> From: Ju-Zhe Zhong 
> 
> Hi, Richard and Richi.
> 
> In the previous patch we supported COND_LEN_* binary operations. However, we
> didn't support COND_LEN_* ternary operations.
> 
> Now, this patch supports COND_LEN_* ternary operations. Consider the following case:
> 
> #define TEST_TYPE(TYPE)   
>  \
>   __attribute__ ((noipa)) void ternop_##TYPE (TYPE *__restrict dst,   
>  \
>   TYPE *__restrict a,  \
>   TYPE *__restrict b,\
> TYPE *__restrict c, int n)   \
>   {   
>  \
> for (int i = 0; i < n; i++)   
>  \
>   dst[i] += a[i] * b[i];  
>\
>   }
> 
> #define TEST_ALL() TEST_TYPE (double)
> 
> TEST_ALL ()
> 
> Before this patch:
> ...
> COND_LEN_MUL
> COND_LEN_ADD
> 
> After this patch:
> ...
> COND_LEN_FMA
> 
> gcc/ChangeLog:
> 
> * genmatch.cc (commutative_op): Add COND_LEN_*
> * internal-fn.cc (first_commutative_argument): Ditto.
> (CASE): Ditto.
> (get_unconditional_internal_fn): Ditto.
> (can_interpret_as_conditional_op_p): Ditto.
> (internal_fn_len_index): Ditto.
> * internal-fn.h (can_interpret_as_conditional_op_p): Ditto.
> * tree-ssa-math-opts.cc (convert_mult_to_fma_1): Ditto.
> (convert_mult_to_fma): Ditto.
> (math_opts_dom_walker::after_dom_children): Ditto.
> 
> ---
>  gcc/genmatch.cc   | 13 +++
>  gcc/internal-fn.cc| 82 +++
>  gcc/internal-fn.h |  2 +-
>  gcc/tree-ssa-math-opts.cc | 57 ---
>  4 files changed, 139 insertions(+), 15 deletions(-)
> 
> diff --git a/gcc/genmatch.cc b/gcc/genmatch.cc
> index 5fceeec9780..2302f2a7ff0 100644
> --- a/gcc/genmatch.cc
> +++ b/gcc/genmatch.cc
> @@ -559,6 +559,19 @@ commutative_op (id_base *id)
>case CFN_COND_FMS:
>case CFN_COND_FNMA:
>case CFN_COND_FNMS:
> +  case CFN_COND_LEN_ADD:
> +  case CFN_COND_LEN_MUL:
> +  case CFN_COND_LEN_MIN:
> +  case CFN_COND_LEN_MAX:
> +  case CFN_COND_LEN_FMIN:
> +  case CFN_COND_LEN_FMAX:
> +  case CFN_COND_LEN_AND:
> +  case CFN_COND_LEN_IOR:
> +  case CFN_COND_LEN_XOR:
> +  case CFN_COND_LEN_FMA:
> +  case CFN_COND_LEN_FMS:
> +  case CFN_COND_LEN_FNMA:
> +  case CFN_COND_LEN_FNMS:
>  return 1;
>  
>default:
> diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> index c11123a1173..e47b1377ff8 100644
> --- a/gcc/internal-fn.cc
> +++ b/gcc/internal-fn.cc
> @@ -4191,6 +4191,19 @@ first_commutative_argument (internal_fn fn)
>  case IFN_COND_FMS:
>  case IFN_COND_FNMA:
>  case IFN_COND_FNMS:
> +case IFN_COND_LEN_ADD:
> +case IFN_COND_LEN_MUL:
> +case IFN_COND_LEN_MIN:
> +case IFN_COND_LEN_MAX:
> +case IFN_COND_LEN_FMIN:
> +case IFN_COND_LEN_FMAX:
> +case IFN_COND_LEN_AND:
> +case IFN_COND_LEN_IOR:
> +case IFN_COND_LEN_XOR:
> +case IFN_COND_LEN_FMA:
> +case IFN_COND_LEN_FMS:
> +case IFN_COND_LEN_FNMA:
> +case IFN_COND_LEN_FNMS:
>return 1;
>  
>  default:
> @@ -4330,11 +4343,15 @@ conditional_internal_fn_code (internal_fn ifn)
>  {
>switch (ifn)
>  {
> -#define CASE(CODE, IFN) case IFN_COND_##IFN: return CODE;
> -  FOR_EACH_CODE_MAPPING(CASE)
> +#define CASE(CODE, IFN)  
>   \
> +  case IFN_COND_##IFN:   
>   \
> +return CODE; 
>   \
> +  case IFN_COND_LEN_##IFN:   
>   \
> +return CODE;
> +  FOR_EACH_CODE_MAPPING (CASE)
>  #undef CASE
> -default:
> -  return ERROR_MARK;
> +  default:
> + return ERROR_MARK;
 
either before or after white-space seems broken.
 
>  }
>  }
>  
> @@ -4433,6 +4450,18 @@ get_unconditional_internal_fn (internal_fn ifn)
> operating elementwise if the operands are vectors.  This includes
> the case of an all-true COND, so that the operation always happens.
>  
> +   There is an alternative approach to interpret the STMT when the operands
> +   are vectors 

[PATCH V2] SSA MATH: Support COND_LEN_FMA for floating-point math optimization

2023-07-13 Thread juzhe . zhong
From: Ju-Zhe Zhong 

Hi, Richard and Richi.

In the previous patch we supported COND_LEN_* binary operations. However, we
didn't support COND_LEN_* ternary operations.

Now, this patch supports COND_LEN_* ternary operations. Consider the following case:

#define TEST_TYPE(TYPE)\
  __attribute__ ((noipa)) void ternop_##TYPE (TYPE *__restrict dst,\
  TYPE *__restrict a,  \
  TYPE *__restrict b,\
TYPE *__restrict c, int n)   \
  {\
for (int i = 0; i < n; i++)\
  dst[i] += a[i] * b[i];
 \
  }

#define TEST_ALL() TEST_TYPE (double)

TEST_ALL ()

Before this patch:
...
COND_LEN_MUL
COND_LEN_ADD

After this patch:
...
COND_LEN_FMA

gcc/ChangeLog:

* genmatch.cc (commutative_op): Add COND_LEN_*
* internal-fn.cc (first_commutative_argument): Ditto.
(CASE): Ditto.
(get_unconditional_internal_fn): Ditto.
(can_interpret_as_conditional_op_p): Ditto.
(internal_fn_len_index): Ditto.
* internal-fn.h (can_interpret_as_conditional_op_p): Ditto.
* tree-ssa-math-opts.cc (convert_mult_to_fma_1): Ditto.
(convert_mult_to_fma): Ditto.
(math_opts_dom_walker::after_dom_children): Ditto.

---
 gcc/genmatch.cc   | 13 ++
 gcc/internal-fn.cc| 87 ++-
 gcc/internal-fn.h |  2 +-
 gcc/tree-ssa-math-opts.cc | 80 +--
 4 files changed, 159 insertions(+), 23 deletions(-)

diff --git a/gcc/genmatch.cc b/gcc/genmatch.cc
index 5fceeec9780..2302f2a7ff0 100644
--- a/gcc/genmatch.cc
+++ b/gcc/genmatch.cc
@@ -559,6 +559,19 @@ commutative_op (id_base *id)
   case CFN_COND_FMS:
   case CFN_COND_FNMA:
   case CFN_COND_FNMS:
+  case CFN_COND_LEN_ADD:
+  case CFN_COND_LEN_MUL:
+  case CFN_COND_LEN_MIN:
+  case CFN_COND_LEN_MAX:
+  case CFN_COND_LEN_FMIN:
+  case CFN_COND_LEN_FMAX:
+  case CFN_COND_LEN_AND:
+  case CFN_COND_LEN_IOR:
+  case CFN_COND_LEN_XOR:
+  case CFN_COND_LEN_FMA:
+  case CFN_COND_LEN_FMS:
+  case CFN_COND_LEN_FNMA:
+  case CFN_COND_LEN_FNMS:
return 1;
 
   default:
diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
index c11123a1173..e698f0bffc7 100644
--- a/gcc/internal-fn.cc
+++ b/gcc/internal-fn.cc
@@ -4191,6 +4191,19 @@ first_commutative_argument (internal_fn fn)
 case IFN_COND_FMS:
 case IFN_COND_FNMA:
 case IFN_COND_FNMS:
+case IFN_COND_LEN_ADD:
+case IFN_COND_LEN_MUL:
+case IFN_COND_LEN_MIN:
+case IFN_COND_LEN_MAX:
+case IFN_COND_LEN_FMIN:
+case IFN_COND_LEN_FMAX:
+case IFN_COND_LEN_AND:
+case IFN_COND_LEN_IOR:
+case IFN_COND_LEN_XOR:
+case IFN_COND_LEN_FMA:
+case IFN_COND_LEN_FMS:
+case IFN_COND_LEN_FNMA:
+case IFN_COND_LEN_FNMS:
   return 1;
 
 default:
@@ -4330,11 +4343,14 @@ conditional_internal_fn_code (internal_fn ifn)
 {
   switch (ifn)
 {
-#define CASE(CODE, IFN) case IFN_COND_##IFN: return CODE;
-  FOR_EACH_CODE_MAPPING(CASE)
+#define CASE(CODE, IFN)
\
+  case IFN_COND_##IFN: 
\
+  case IFN_COND_LEN_##IFN: 
\
+return CODE;
+  FOR_EACH_CODE_MAPPING (CASE)
 #undef CASE
-default:
-  return ERROR_MARK;
+  default:
+   return ERROR_MARK;
 }
 }
 
@@ -4433,6 +4449,18 @@ get_unconditional_internal_fn (internal_fn ifn)
operating elementwise if the operands are vectors.  This includes
the case of an all-true COND, so that the operation always happens.
 
+   There is an alternative approach to interpret the STMT when the operands
+   are vectors which is the operation predicated by both conditional mask
+   and loop control length, the equivalent C code:
+
+ for (int i = 0; i < NUNITS; i++)
+  {
+   if (i < LEN + BIAS && COND[i])
+ LHS[i] = A[i] CODE B[i];
+   else
+ LHS[i] = ELSE[i];
+  }
+
When returning true, set:
 
- *COND_OUT to the condition COND, or to NULL_TREE if the condition
@@ -4440,13 +4468,18 @@ get_unconditional_internal_fn (internal_fn ifn)
- *CODE_OUT to the tree code
- OPS[I] to operand I of *CODE_OUT
- *ELSE_OUT to the fallback value ELSE, or to NULL_TREE if the
- condition is known to be all true.  */
+ condition is known to be all true.
+   - *LEN to the len argument if it COND_LEN_* operations or to NULL_TREE.
+   - *BIAS to the bias argument if it COND_LEN_* operations or to NULL_TREE.  
*/
 
 bool
 can_interpret_as_conditional_op_p (gimple *stmt, tree *cond_out,

[PATCH 13/14] fortran: Use pre-evaluated class container if available [PR110618]

2023-07-13 Thread Mikael Morin via Gcc-patches
Add the possibility to provide a pre-evaluated class container argument
to gfc_add_finalizer to avoid repeatedly evaluating data reference
expressions in the generated code.

PR fortran/110618

gcc/fortran/ChangeLog:

* trans.h (gfc_add_finalizer_call): Add class container argument.
* trans.cc (gfc_add_finalizer_call): Ditto.  Pass down new
argument to get_final_proc_ref, get_elem_size, get_var_desc,
and get_vptr.
(get_elem_size): Add class container argument.
Use provided class container if it's available.
(get_var_descr): Same.
(get_vptr): Same.
(get_final_proc_ref): Same.  Add boolean telling the class
container argument is used.  Set it.  Don't try to use
final_wrapper if class container argument was used.
---
 gcc/fortran/trans.cc | 61 +---
 gcc/fortran/trans.h  |  2 +-
 2 files changed, 41 insertions(+), 22 deletions(-)

diff --git a/gcc/fortran/trans.cc b/gcc/fortran/trans.cc
index 69e9329c9cb..18965b9cbd2 100644
--- a/gcc/fortran/trans.cc
+++ b/gcc/fortran/trans.cc
@@ -1089,14 +1089,20 @@ gfc_call_free (tree var)
with the expression passed as argument in EXPR.  */
 
 static void
-get_final_proc_ref (gfc_se *se, gfc_expr *expr)
+get_final_proc_ref (gfc_se *se, gfc_expr *expr, tree class_container)
 {
   gfc_expr *final_wrapper = NULL;
 
   gcc_assert (expr->ts.type == BT_DERIVED || expr->ts.type == BT_CLASS);
 
+  bool using_class_container = false;
   if (expr->ts.type == BT_DERIVED)
 gfc_is_finalizable (expr->ts.u.derived, _wrapper);
+  else if (class_container)
+{
+  using_class_container = true;
+  se->expr = gfc_class_vtab_final_get (class_container);
+}
   else
 {
   final_wrapper = gfc_copy_expr (expr);
@@ -1104,9 +1110,12 @@ get_final_proc_ref (gfc_se *se, gfc_expr *expr)
   gfc_add_final_component (final_wrapper);
 }
 
-  gcc_assert (final_wrapper->expr_type == EXPR_VARIABLE);
+  if (!using_class_container)
+{
+  gcc_assert (final_wrapper->expr_type == EXPR_VARIABLE);
 
-  gfc_conv_expr (se, final_wrapper);
+  gfc_conv_expr (se, final_wrapper);
+}
 
   if (POINTER_TYPE_P (TREE_TYPE (se->expr)))
 se->expr = build_fold_indirect_ref_loc (input_location, se->expr);
@@ -1117,7 +1126,7 @@ get_final_proc_ref (gfc_se *se, gfc_expr *expr)
passed as argument in EXPR.  */
 
 static void
-get_elem_size (gfc_se *se, gfc_expr *expr)
+get_elem_size (gfc_se *se, gfc_expr *expr, tree class_container)
 {
   gcc_assert (expr->ts.type == BT_DERIVED || expr->ts.type == BT_CLASS);
 
@@ -1127,6 +1136,8 @@ get_elem_size (gfc_se *se, gfc_expr *expr)
   se->expr = TYPE_SIZE_UNIT (se->expr);
   se->expr = fold_convert (gfc_array_index_type, se->expr);
 }
+  else if (class_container)
+se->expr = gfc_class_vtab_size_get (class_container);
   else
 {
   gfc_expr *class_size = gfc_copy_expr (expr);
@@ -1143,7 +1154,7 @@ get_elem_size (gfc_se *se, gfc_expr *expr)
expression passed as argument in VAR.  */
 
 static void
-get_var_descr (gfc_se *se, gfc_expr *var)
+get_var_descr (gfc_se *se, gfc_expr *var, tree class_container)
 {
   gfc_se tmp_se;
 
@@ -1165,6 +1176,8 @@ get_var_descr (gfc_se *se, gfc_expr *var)
 //   gcc_assert (se.pre.head == NULL_TREE && se.post.head == NULL_TREE);
}
 }
+  else if (class_container)
+tmp_se.expr = gfc_class_data_get (class_container);
   else
 {
   gfc_expr *array_expr;
@@ -1212,20 +1225,25 @@ get_var_descr (gfc_se *se, gfc_expr *var)
 
 
 static void
-get_vptr (gfc_se *se, gfc_expr *expr)
+get_vptr (gfc_se *se, gfc_expr *expr, tree class_container)
 {
-  gfc_expr *vptr_expr = gfc_copy_expr (expr);
-  gfc_add_vptr_component (vptr_expr);
+  if (class_container)
+se->expr = gfc_class_vptr_get (class_container);
+  else
+{
+  gfc_expr *vptr_expr = gfc_copy_expr (expr);
+  gfc_add_vptr_component (vptr_expr);
 
-  gfc_se tmp_se;
-  gfc_init_se (_se, NULL);
-  tmp_se.want_pointer = 1;
-  gfc_conv_expr (_se, vptr_expr);
-  gfc_free_expr (vptr_expr);
+  gfc_se tmp_se;
+  gfc_init_se (_se, NULL);
+  tmp_se.want_pointer = 1;
+  gfc_conv_expr (_se, vptr_expr);
+  gfc_free_expr (vptr_expr);
 
-  gfc_add_block_to_block (>pre, _se.pre);
-  gfc_add_block_to_block (>post, _se.post);
-  se->expr = tmp_se.expr;
+  gfc_add_block_to_block (>pre, _se.pre);
+  gfc_add_block_to_block (>post, _se.post);
+  se->expr = tmp_se.expr;
+}
 }
 
 
@@ -1329,7 +1347,8 @@ gfc_add_comp_finalizer_call (stmtblock_t *block, tree 
decl, gfc_component *comp,
true when a finalizer call has been inserted.  */
 
 bool
-gfc_add_finalizer_call (stmtblock_t *block, gfc_expr *expr2)
+gfc_add_finalizer_call (stmtblock_t *block, gfc_expr *expr2,
+   tree class_container)
 {
   tree tmp;
   gfc_ref *ref;
@@ -1384,17 +1403,17 @@ gfc_add_finalizer_call (stmtblock_t *block, gfc_expr 
*expr2)
 
   gfc_se final_se;
  

Re: [PATCH ver 3] rs6000, fix vec_replace_unaligned built-in arguments

2023-07-13 Thread Kewen.Lin via Gcc-patches
Hi Carl,

on 2023/7/8 04:18, Carl Love wrote:
> 
> GCC maintainers:
> 
> Version 3, added code to altivec_resolve_overloaded_builtin so the
> correct instruction is selected for the size of the second argument. 
> This restores the instruction counts to the original values where the
> correct instructions were originally being generated.  The naming of

Nice, I have some comments inlined below.

> the overloaded builtin instances and builtin definitions were changed
> to reflect the type of the second argument since the type of the first
> argument is now the same for all overloaded instances.  A new builtin
> test file was added for the case where the first argument is cast to
> the unsigned long long type.  This test requires the -flax-vector-
> conversions gcc command line option.  Since the other tests do not
> require this option, I felt that the new test needed to be in a
> separate file.  Finally some formatting fixes were made in the original
> test file.  Patch has been retested on Power 10 with no regressions.
> 
> Version 2, fixed various typos.  Updated the change log body to say the
> instruction counts were updated.  The instruction counts changed as a
> result of changing the first argument of the vec_replace_unaligned
> builtin call from vector unsigned long long (vull) to vector unsigned
> char (vuc).  When the first argument was vull the builtin call
> generated the vinsd instruction for the two test cases.  The updated
> call with vuc as the first argument generates two vinsw instructions
> instead.  Patch was retested on Power 10 with no regressions.
> 
> The following patch fixes the first argument in the builtin definition
> and the corresponding test cases.  Initially, the builtin specification
> was wrong due to a cut and paste error.  The documentation was fixed in:
> 
>commit ed3fea09b18f67e757b5768b42cb6e816626f1db
>Author: Bill Schmidt 
>Date:   Fri Feb 4 13:07:17 2022 -0600
> 
>rs6000: Correct function prototypes for vec_replace_unaligned
> 
>Due to a pasto error in the documentation, vec_replace_unaligned
> was
>implemented with the same function prototypes as
> vec_replace_elt.  It was
>intended that vec_replace_unaligned always specify output
> vectors as having
>type vector unsigned char, to emphasize that elements are
> potentially
>misaligned by this built-in function.  This patch corrects the
>misimplementation.
> 
> 
> This patch fixes the arguments in the definitions and updates the
> testcases accordingly.  Additionally, a few minor spacing issues are
> fixed.
> 
> The patch has been tested on Power 10 with no regressions.  Please let
> me know if the patch is acceptable for mainline.  Thanks.
> 
>  Carl 
> 
> --
> rs6000, fix vec_replace_unaligned built-in arguments
> 
> The first argument of the vec_replace_unaligned built-in should always be
> unsigned char, as specified in gcc/doc/extend.texi.

Maybe "be with type vector unsigned char"?

> 
> This patch fixes the builtin definitions and updates the test cases to use
> the correct arguments.  The original test file is renamed and a second test
> file is added for a new test case.
> 
> gcc/ChangeLog:
>   * config/rs6000/rs6000-builtins.def: Rename
>   __builtin_altivec_vreplace_un_uv2di as __builtin_altivec_vreplace_un_udi
>   __builtin_altivec_vreplace_un_uv4si as __builtin_altivec_vreplace_un_usi
>   __builtin_altivec_vreplace_un_v2df as __builtin_altivec_vreplace_un_df
>   __builtin_altivec_vreplace_un_v2di as __builtin_altivec_vreplace_un_di
>   __builtin_altivec_vreplace_un_v4sf as __builtin_altivec_vreplace_un_sf
>   __builtin_altivec_vreplace_un_v4si as __builtin_altivec_vreplace_un_si.
>   Rename VREPLACE_UN_UV2DI as VREPLACE_UN_UDI, VREPLACE_UN_UV4SI as
>   VREPLACE_UN_USI, VREPLACE_UN_V2DF as VREPLACE_UN_DF,
>   VREPLACE_UN_V2DI as VREPLACE_UN_DI, VREPLACE_UN_V4SF as
>   VREPLACE_UN_SF, VREPLACE_UN_V4SI as VREPLACE_UN_SI.
>   Rename vreplace_un_v2di as vreplace_un_di, vreplace_un_v4si as
>   vreplace_un_si, vreplace_un_v2df as vreplace_un_df,
>   vreplace_un_v2di as vreplace_un_di, vreplace_un_v4sf as
>   vreplace_un_sf, vreplace_un_v4si as vreplace_un_si.
>   * config/rs6000/rs6000-c.cc (find_instance): Add new argument
>   nargs.  Add nargs check.  Extend function to handle three arguments.
>   (altivec_resolve_overloaded_builtin): Add new argument nargs to
>   function calls.  Add case RS6000_OVLD_VEC_REPLACE_UN.
>   * config/rs6000/rs6000-overload.def (__builtin_vec_replace_un):
>   Fix first argument type.  Rename VREPLACE_UN_UV4SI as
>   VREPLACE_UN_USI, VREPLACE_UN_V4SI as VREPLACE_UN_SI,
>   VREPLACE_UN_UV2DI as VREPLACE_UN_UDI, VREPLACE_UN_V2DI as
>   VREPLACE_UN_DI, VREPLACE_UN_V4SF as VREPLACE_UN_SF,
>   VREPLACE_UN_V2DF as VREPLACE_UN_DF.
>   * config/rs6000/vsx.md (VEC_RU): New mode 

[PATCH 2/4] Support Intel SM3

2023-07-13 Thread Haochen Jiang via Gcc-patches
gcc/ChangeLog:

* common/config/i386/cpuinfo.h (get_available_features):
Detect SM3.
* common/config/i386/i386-common.cc (OPTION_MASK_ISA2_SM3_SET,
OPTION_MASK_ISA2_SM3_UNSET): New.
(OPTION_MASK_ISA2_AVX_UNSET): Add SM3.
(ix86_handle_option): Handle -msm3.
* common/config/i386/i386-cpuinfo.h (enum processor_features):
Add FEATURE_SM3.
* common/config/i386/i386-isas.h: Add ISA_NAME_TABLE_ENTRY for
SM3.
* config.gcc: Add sm3intrin.h
* config/i386/cpuid.h (bit_SM3): New.
* config/i386/i386-builtin-types.def:
Add DEF_FUNCTION_TYPE (V4SI, V4SI, V4SI, V4SI, INT).
* config/i386/i386-builtin.def (BDESC): Add new builtins.
* config/i386/i386-c.cc (ix86_target_macros_internal): Define
__SM3__.
* config/i386/i386-expand.cc (ix86_expand_args_builtin): Handle
V4SI_FTYPE_V4SI_V4SI_V4SI_INT.
* config/i386/i386-isa.def (SM3): Add DEF_PTA(SM3).
* config/i386/i386-options.cc (isa2_opts): Add -msm3.
(ix86_valid_target_attribute_inner_p): Handle sm3.
* config/i386/i386.opt: Add option -msm3.
* config/i386/immintrin.h: Include sm3intrin.h.
* config/i386/sse.md (vsm3msg1): New define insn.
(vsm3msg2): Ditto.
(vsm3rnds2): Ditto.
* doc/extend.texi: Document sm3.
* doc/invoke.texi: Document -msm3.
* doc/sourcebuild.texi: Document target sm3.
* config/i386/sm3intrin.h: New file.

gcc/testsuite/ChangeLog:

* g++.dg/other/i386-2.C: Add -msm3.
* g++.dg/other/i386-3.C: Ditto.
* gcc.target/i386/avx-1.c: Add new define for immediate.
* gcc.target/i386/funcspec-56.inc: Add new target attribute.
* gcc.target/i386/sse-12.c: Add -msm3.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Add sm3.
* gcc.target/i386/sse-23.c: Ditto.
* lib/target-supports.exp (check_effective_target_sm3): New.
* gcc.target/i386/sm3-1.c: New test.
* gcc.target/i386/sm3-check.h: Ditto.
* gcc.target/i386/sm3msg1-2.c: Ditto.
* gcc.target/i386/sm3msg2-2.c: Ditto.
* gcc.target/i386/sm3rnds2-2.c: Ditto.
---
 gcc/common/config/i386/cpuinfo.h  |   2 +
 gcc/common/config/i386/i386-common.cc |  20 +++-
 gcc/common/config/i386/i386-cpuinfo.h |   1 +
 gcc/common/config/i386/i386-isas.h|   1 +
 gcc/config.gcc|   3 +-
 gcc/config/i386/cpuid.h   |   1 +
 gcc/config/i386/i386-builtin-types.def|   3 +
 gcc/config/i386/i386-builtin.def  |   5 +
 gcc/config/i386/i386-c.cc |   2 +
 gcc/config/i386/i386-expand.cc|   1 +
 gcc/config/i386/i386-isa.def  |   1 +
 gcc/config/i386/i386-options.cc   |   2 +
 gcc/config/i386/i386.opt  |   5 +
 gcc/config/i386/immintrin.h   |   2 +
 gcc/config/i386/sm3intrin.h   |  72 
 gcc/config/i386/sse.md|  43 
 gcc/doc/extend.texi   |   5 +
 gcc/doc/invoke.texi   |   7 +-
 gcc/doc/sourcebuild.texi  |   3 +
 gcc/testsuite/g++.dg/other/i386-2.C   |   2 +-
 gcc/testsuite/g++.dg/other/i386-3.C   |   2 +-
 gcc/testsuite/gcc.target/i386/avx-1.c |   3 +
 gcc/testsuite/gcc.target/i386/funcspec-56.inc |   2 +
 gcc/testsuite/gcc.target/i386/sm3-1.c |  17 +++
 gcc/testsuite/gcc.target/i386/sm3-check.h |  37 +++
 gcc/testsuite/gcc.target/i386/sm3msg1-2.c |  54 +
 gcc/testsuite/gcc.target/i386/sm3msg2-2.c |  57 ++
 gcc/testsuite/gcc.target/i386/sm3rnds2-2.c| 104 ++
 gcc/testsuite/gcc.target/i386/sse-12.c|   2 +-
 gcc/testsuite/gcc.target/i386/sse-13.c|   5 +-
 gcc/testsuite/gcc.target/i386/sse-14.c|   5 +-
 gcc/testsuite/gcc.target/i386/sse-22.c|   7 +-
 gcc/testsuite/gcc.target/i386/sse-23.c|   5 +-
 gcc/testsuite/lib/target-supports.exp |  15 +++
 34 files changed, 484 insertions(+), 12 deletions(-)
 create mode 100644 gcc/config/i386/sm3intrin.h
 create mode 100644 gcc/testsuite/gcc.target/i386/sm3-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/sm3-check.h
 create mode 100644 gcc/testsuite/gcc.target/i386/sm3msg1-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/sm3msg2-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/sm3rnds2-2.c

diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h
index 3599f9def2c..e5cdffe017a 100644
--- a/gcc/common/config/i386/cpuinfo.h
+++ b/gcc/common/config/i386/cpuinfo.h
@@ -877,6 +877,8 @@ get_available_features (struct __processor_model *cpu_model,
set_feature (FEATURE_AVXNECONVERT);
  if (edx & 

[PATCH 4/4] Support Intel SM4

2023-07-13 Thread Haochen Jiang via Gcc-patches
gcc/ChangeLog:

* common/config/i386/cpuinfo.h (get_available_features):
Detect SM4.
* common/config/i386/i386-common.cc (OPTION_MASK_ISA2_SM4_SET,
OPTION_MASK_ISA2_SM4_UNSET): New.
(OPTION_MASK_ISA2_AVX_UNSET): Add SM4.
(ix86_handle_option): Handle -msm4.
* common/config/i386/i386-cpuinfo.h (enum processor_features):
Add FEATURE_SM4.
* common/config/i386/i386-isas.h: Add ISA_NAME_TABLE_ENTRY for
sm4.
* config.gcc: Add sm4intrin.h.
* config/i386/cpuid.h (bit_SM4): New.
* config/i386/i386-builtin.def (BDESC): Add new builtins.
* config/i386/i386-c.cc (ix86_target_macros_internal): Define
__SM4__.
* config/i386/i386-isa.def (SM4): Add DEF_PTA(SM4).
* config/i386/i386-options.cc (isa2_opts): Add -msm4.
(ix86_valid_target_attribute_inner_p): Handle sm4.
* config/i386/i386.opt: Add option -msm4.
* config/i386/immintrin.h: Include sm4intrin.h
* config/i386/sse.md (vsm4key4_): New define insn.
(vsm4rnds4_): Ditto.
* doc/extend.texi: Document sm4.
* doc/invoke.texi: Document -msm4.
* doc/sourcebuild.texi: Document target sm4.
* config/i386/sm4intrin.h: New file.

gcc/testsuite/ChangeLog:

* g++.dg/other/i386-2.C: Add -msm4.
* g++.dg/other/i386-3.C: Ditto.
* gcc.target/i386/funcspec-56.inc: Add new target attribute.
* gcc.target/i386/sse-12.c: Add -msm4.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Add sm4.
* gcc.target/i386/sse-23.c: Ditto.
* lib/target-supports.exp (check_effective_target_sm4): New.
* gcc.target/i386/sm4-1.c: New test.
* gcc.target/i386/sm4-check.h: Ditto.
* gcc.target/i386/sm4key4-2.c: Ditto.
* gcc.target/i386/sm4rnds4-2.c: Ditto.
---
 gcc/common/config/i386/cpuinfo.h  |   2 +
 gcc/common/config/i386/i386-common.cc |  20 +-
 gcc/common/config/i386/i386-cpuinfo.h |   1 +
 gcc/common/config/i386/i386-isas.h|   1 +
 gcc/config.gcc|   2 +-
 gcc/config/i386/cpuid.h   |   1 +
 gcc/config/i386/i386-builtin.def  |   6 +
 gcc/config/i386/i386-c.cc |   2 +
 gcc/config/i386/i386-isa.def  |   1 +
 gcc/config/i386/i386-options.cc   |   4 +-
 gcc/config/i386/i386.opt  |   5 +
 gcc/config/i386/immintrin.h   |   2 +
 gcc/config/i386/sm4intrin.h   |  70 +++
 gcc/config/i386/sse.md|  26 +++
 gcc/doc/extend.texi   |   5 +
 gcc/doc/invoke.texi   |   9 +-
 gcc/doc/sourcebuild.texi  |   3 +
 gcc/testsuite/g++.dg/other/i386-2.C   |   2 +-
 gcc/testsuite/g++.dg/other/i386-3.C   |   2 +-
 gcc/testsuite/gcc.target/i386/funcspec-56.inc |   2 +
 gcc/testsuite/gcc.target/i386/sm4-1.c |  20 ++
 gcc/testsuite/gcc.target/i386/sm4-check.h | 183 ++
 gcc/testsuite/gcc.target/i386/sm4key4-2.c |  14 ++
 gcc/testsuite/gcc.target/i386/sm4rnds4-2.c|  14 ++
 gcc/testsuite/gcc.target/i386/sse-12.c|   2 +-
 gcc/testsuite/gcc.target/i386/sse-13.c|   2 +-
 gcc/testsuite/gcc.target/i386/sse-14.c|   2 +-
 gcc/testsuite/gcc.target/i386/sse-22.c|   4 +-
 gcc/testsuite/gcc.target/i386/sse-23.c|   2 +-
 gcc/testsuite/lib/target-supports.exp |  14 ++
 30 files changed, 409 insertions(+), 14 deletions(-)
 create mode 100644 gcc/config/i386/sm4intrin.h
 create mode 100644 gcc/testsuite/gcc.target/i386/sm4-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/sm4-check.h
 create mode 100644 gcc/testsuite/gcc.target/i386/sm4key4-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/sm4rnds4-2.c

diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h
index 0cfde3ebccd..f9434f038ea 100644
--- a/gcc/common/config/i386/cpuinfo.h
+++ b/gcc/common/config/i386/cpuinfo.h
@@ -881,6 +881,8 @@ get_available_features (struct __processor_model *cpu_model,
set_feature (FEATURE_SM3);
  if (eax & bit_SHA512)
set_feature (FEATURE_SHA512);
+ if (eax & bit_SM4)
+   set_feature (FEATURE_SM4);
}
   if (avx512_usable)
{
diff --git a/gcc/common/config/i386/i386-common.cc 
b/gcc/common/config/i386/i386-common.cc
index 97c3cdfe5e1..610cabe52c1 100644
--- a/gcc/common/config/i386/i386-common.cc
+++ b/gcc/common/config/i386/i386-common.cc
@@ -122,6 +122,7 @@ along with GCC; see the file COPYING3.  If not see
 #define OPTION_MASK_ISA2_AVXVNNIINT16_SET OPTION_MASK_ISA2_AVXVNNIINT16
 #define OPTION_MASK_ISA2_SM3_SET OPTION_MASK_ISA2_SM3
 #define OPTION_MASK_ISA2_SHA512_SET OPTION_MASK_ISA2_SHA512
+#define OPTION_MASK_ISA2_SM4_SET 

[PATCH 0/4] Support Intel Arrow Lake/Lunar Lake ISAs

2023-07-13 Thread Haochen Jiang via Gcc-patches
Hi all,

These four patches aim to add Intel Arrow Lake/Lunar Lake
instructions, including AVX-VNNI-INT16, SM3, SHA512 and SM4.

The information is based on newly released
Intel Architecture Instruction Set Extensions and Future Features.

The document is available at:
https://www.intel.com/content/www/us/en/develop/download/intel-architecture-instruction-set-extensions-programming-reference.html

Regtested on x86_64-pc-linux-gnu. Ok for trunk?

BRs,
Haochen




Re: [PATCH v2] RISC-V: Refactor riscv mode after for VXRM and FRM

2023-07-13 Thread Kito Cheng via Gcc-patches
Hmmm? I didn't get that error on selftest?

my diff with your v2:

$ git diff
diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 12655f7fdc65..466e1aed91c7 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -8058,8 +8058,9 @@ asm_insn_p (rtx_insn *insn)
static bool
vxrm_unknown_p (rtx_insn *insn)
{
+  static const_rtx vxrm_reg = gen_rtx_REG (SImode, VXRM_REGNUM);
  /* Return true if there is a definition of VXRM.  */
-  if (reg_set_p (gen_rtx_REG (SImode, VXRM_REGNUM), insn))
+  if (reg_set_p (vxrm_reg, insn))
return true;

  /* A CALL function may contain an instruction that modifies the VXRM,
@@ -8080,8 +8081,9 @@ vxrm_unknown_p (rtx_insn *insn)
static bool
frm_unknown_dynamic_p (rtx_insn *insn)
{
+  static const_rtx frm_reg = gen_rtx_REG (SImode, FRM_REGNUM);
  /* Return true if there is a definition of FRM.  */
-  if (reg_set_p (gen_rtx_REG (SImode, FRM_REGNUM), insn))
+  if (reg_set_p (frm_reg, insn))
return true;

  /* A CALL function may contain an instruction that modifies the FRM,


On Thu, Jul 13, 2023 at 1:07 PM Li, Pan2 via Gcc-patches
 wrote:
>
> Thanks Jeff and Kito for comments, update the V3 version as below.
>
> https://gcc.gnu.org/pipermail/gcc-patches/2023-July/624347.html
>
> > Extract the vxrm reg to a local static variable to avoid constructing it 
> > again and again.
>
> The "static const_rtx vxrm_rtx = gen_rtx_REG (SImode, VXRM_REGNUM)" results in 
> an error when running selftest, as shown below, thus patch v3 doesn't include 
> this change.
>
> /home/pli/repos/gcc/111/riscv-gnu-toolchain/build-gcc-newlib-stage1/./gcc/xgcc
>  -B/home/pli/repos/gcc/111/riscv-gnu-toolchain/build-gcc-newlib-stage1/./gcc/ 
>  -xc -nostdinc /dev/null -S -o /dev/null 
> -fself-test=../.././gcc/gcc/testsuite/selftests
> virtual memory exhausted: Invalid argument
> make[2]: *** [../.././gcc/gcc/c/Make-lang.in:153: s-selftest-c] Error 1
>
> Pan
>
> -Original Message-
> From: Jeff Law 
> Sent: Wednesday, July 12, 2023 11:31 PM
> To: Li, Pan2 ; gcc-patches@gcc.gnu.org
> Cc: juzhe.zh...@rivai.ai; rdapp@gmail.com; Wang, Yanzhang 
> ; kito.ch...@gmail.com
> Subject: Re: [PATCH v2] RISC-V: Refactor riscv mode after for VXRM and FRM
>
>
>
> On 7/11/23 23:50, pan2...@intel.com wrote:
> > From: Pan Li 
> >
> > When investigating the FRM dynamic rounding mode, we found that the global
> > unknown status is quite different between fixed-point and
> > floating-point. Thus, we separate the unknown function and extract
> > some common inner functions.
> >
> > We will also prepare more test cases in another PATCH.
> >
> > Signed-off-by: Pan Li 
> >
> > gcc/ChangeLog:
> >
> >   * config/riscv/riscv.cc (regnum_definition_p): New function.
> >   (insn_asm_p): Ditto.
> >   (riscv_vxrm_mode_after): New function for fixed-point.
> >   (global_vxrm_state_unknown_p): Ditto.
> >   (riscv_frm_mode_after): New function for floating-point.
> >   (global_frm_state_unknown_p): Ditto.
> >   (riscv_mode_after): Leverage new functions.
> >   (riscv_entity_mode_after): Removed.
> > ---
> >   gcc/config/riscv/riscv.cc | 96 +--
> >   1 file changed, 82 insertions(+), 14 deletions(-)
> >
> > diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
> > index 38d8eb2fcf5..553fbb4435a 100644
> > --- a/gcc/config/riscv/riscv.cc
> > +++ b/gcc/config/riscv/riscv.cc
> > @@ -7742,19 +7742,91 @@ global_state_unknown_p (rtx_insn *insn, unsigned 
> > int regno)
> > return false;
> >   }
> >
> > +static bool
> > +regnum_definition_p (rtx_insn *insn, unsigned int regno)
> Needs a function comment.  This is true for each new function added.  In
> this specific case something like this might be appropriate
>
> /* Return TRUE if REGNO is set in INSN, FALSE otherwise.  */
>
> Which begs the question, is there some reason why we're not using the
> existing reg_set_p or simple_regno_set from rtlanal.cc?
>
>
>
> Jeff


[PATCH] RISCV: Add -m(no)-omit-leaf-frame-pointer support.

2023-07-13 Thread yanzhang.wang--- via Gcc-patches
From: Yanzhang Wang 

gcc/ChangeLog:

* config/riscv/riscv.cc (riscv_save_reg_p): Save ra for leaf
  when enabling -mno-omit-leaf-frame-pointer
(riscv_option_override): Override omit-frame-pointer.
(riscv_frame_pointer_required): Save s0 for non-leaf function
(TARGET_FRAME_POINTER_REQUIRED): Override definition.
* config/riscv/riscv.opt: Add option support.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/omit-frame-pointer-1.c: New test.
* gcc.target/riscv/omit-frame-pointer-2.c: New test.
* gcc.target/riscv/omit-frame-pointer-3.c: New test.
* gcc.target/riscv/omit-frame-pointer-4.c: New test.
* gcc.target/riscv/omit-frame-pointer-test.c: New test.

Signed-off-by: Yanzhang Wang 
---
 gcc/config/riscv/riscv.cc | 34 ++-
 gcc/config/riscv/riscv.opt|  4 +++
 .../gcc.target/riscv/omit-frame-pointer-1.c   |  7 
 .../gcc.target/riscv/omit-frame-pointer-2.c   |  7 
 .../gcc.target/riscv/omit-frame-pointer-3.c   |  7 
 .../gcc.target/riscv/omit-frame-pointer-4.c   |  7 
 .../riscv/omit-frame-pointer-test.c   | 13 +++
 7 files changed, 78 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/riscv/omit-frame-pointer-1.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/omit-frame-pointer-2.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/omit-frame-pointer-3.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/omit-frame-pointer-4.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/omit-frame-pointer-test.c

diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 706c18416db..caae6168c29 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -379,6 +379,10 @@ static const struct riscv_tune_info 
riscv_tune_info_table[] = {
 #include "riscv-cores.def"
 };
 
+/* Global variable to distinguish whether we should save and restore s0/fp for
+   function.  */
+static bool riscv_save_frame_pointer;
+
 void riscv_frame_info::reset(void)
 {
   total_size = 0;
@@ -4948,7 +4952,11 @@ riscv_save_reg_p (unsigned int regno)
   if (regno == HARD_FRAME_POINTER_REGNUM && frame_pointer_needed)
 return true;
 
-  if (regno == RETURN_ADDR_REGNUM && crtl->calls_eh_return)
+  /* Need not to use ra for leaf when frame pointer is turned off by option
+ whatever the omit-leaf-frame's value.  */
+  bool keep_leaf_ra = frame_pointer_needed && crtl->is_leaf
+&& !TARGET_OMIT_LEAF_FRAME_POINTER;
+  if (regno == RETURN_ADDR_REGNUM && (crtl->calls_eh_return || keep_leaf_ra))
 return true;
 
   /* If this is an interrupt handler, then must save extra registers.  */
@@ -6577,6 +6585,21 @@ riscv_option_override (void)
   if (flag_pic)
 riscv_cmodel = CM_PIC;
 
+  /* We need to save the fp with ra for non-leaf functions with no fp and ra
+ for leaf functions while no-omit-frame-pointer with
+ omit-leaf-frame-pointer.  The x_flag_omit_frame_pointer has the first
+ priority to determine whether the frame pointer is needed.  If we do not
+ override it, the fp and ra will be stored for leaf functions, which is not
+ our wanted.  */
+  riscv_save_frame_pointer = false;
+  if (TARGET_OMIT_LEAF_FRAME_POINTER_P (global_options.x_target_flags))
+{
+  if (!global_options.x_flag_omit_frame_pointer)
+   riscv_save_frame_pointer = true;
+
+  global_options.x_flag_omit_frame_pointer = 1;
+}
+
   /* We get better code with explicit relocs for CM_MEDLOW, but
  worse code for the others (for now).  Pick the best default.  */
   if ((target_flags_explicit & MASK_EXPLICIT_RELOCS) == 0)
@@ -7857,6 +7880,12 @@ riscv_preferred_else_value (unsigned, tree, unsigned int 
nops, tree *ops)
   return nops == 3 ? ops[2] : ops[0];
 }
 
+static bool
+riscv_frame_pointer_required (void)
+{
+  return riscv_save_frame_pointer && !crtl->is_leaf;
+}
+
 /* Initialize the GCC target structure.  */
 #undef TARGET_ASM_ALIGNED_HI_OP
 #define TARGET_ASM_ALIGNED_HI_OP "\t.half\t"
@@ -8161,6 +8190,9 @@ riscv_preferred_else_value (unsigned, tree, unsigned int 
nops, tree *ops)
 #undef TARGET_PREFERRED_ELSE_VALUE
 #define TARGET_PREFERRED_ELSE_VALUE riscv_preferred_else_value
 
+#undef TARGET_FRAME_POINTER_REQUIRED
+#define TARGET_FRAME_POINTER_REQUIRED riscv_frame_pointer_required
+
 struct gcc_target targetm = TARGET_INITIALIZER;
 
 #include "gt-riscv.h"
diff --git a/gcc/config/riscv/riscv.opt b/gcc/config/riscv/riscv.opt
index dd062f1c8bd..8e6a94fd01a 100644
--- a/gcc/config/riscv/riscv.opt
+++ b/gcc/config/riscv/riscv.opt
@@ -138,6 +138,10 @@ Enable the CSR checking for the ISA-dependent CRS and the 
read-only CSR.
 The ISA-dependent CSR are only valid when the specific ISA is set.  The
 read-only CSR can not be written by the CSR instructions.
 
+momit-leaf-frame-pointer
+Target Mask (OMIT_LEAF_FRAME_POINTER) Save
+Omit the frame pointer in leaf functions.
+
 Mask(64BIT)
 
 Mask(MUL)
diff --git 

Re: [PATCH] tree-optimization/94864 - vector insert of vector extract simplification

2023-07-13 Thread Richard Biener via Gcc-patches
On Thu, 13 Jul 2023, Hongtao Liu wrote:

> On Thu, Jul 13, 2023 at 10:47?AM Hongtao Liu  wrote:
> >
> > On Wed, Jul 12, 2023 at 9:37?PM Richard Biener via Gcc-patches
> >  wrote:
> > >
> > > The PRs ask for optimizing of
> > >
> > >   _1 = BIT_FIELD_REF ;
> > >   result_4 = BIT_INSERT_EXPR ;
> > >
> > > to a vector permutation.  The following implements this as
> > > match.pd pattern, improving code generation on x86_64.
> > >
> > > On the RTL level we face the issue that backend patterns inconsistently
> > > use vec_merge and vec_select of vec_concat to represent permutes.
> > >
> > > I think using a (supported) permute is almost always better
> > > than an extract plus insert, maybe excluding the case we extract
> > > element zero and that's aliased to a register that can be used
> > > directly for insertion (not sure how to query that).
> > >
> > > But this regresses for example gcc.target/i386/pr54855-8.c because PRE
> > > now realizes that
> > >
> > >   _1 = BIT_FIELD_REF ;
> > >   if (_1 > a_4(D))
> > > goto ; [50.00%]
> > >   else
> > > goto ; [50.00%]
> > >
> > >[local count: 536870913]:
> > >
> > >[local count: 1073741824]:
> > >   # iftmp.0_2 = PHI <_1(3), a_4(D)(2)>
> > >   x_5 = BIT_INSERT_EXPR ;
> > >
> > > is equal to
> > >
> > >[local count: 1073741824]:
> > >   _1 = BIT_FIELD_REF ;
> > >   if (_1 > a_4(D))
> > > goto ; [50.00%]
> > >   else
> > > goto ; [50.00%]
> > >
> > >[local count: 536870912]:
> > >   _7 = BIT_INSERT_EXPR ;
> > >
> > >[local count: 1073741824]:
> > >   # prephitmp_8 = PHI 
> > >
> > > and that no longer produces the desired maxsd operation at the RTL
> > The comparison is in scalar mode, but the operations in then_bb are in
> > vector mode, so if_convert can't eliminate the condition any more (and
> > won't go into the backend's ix86_expand_sse_fp_minmax).
> > I think an ordered comparison like _1 > a_4 doesn't match
> > fmin/fmax, but does match SSE MINSS/MAXSS, since those always return the
> > second operand (not the other operand) when there's NONE.
> I mean NANs.

Btw, I once tried to recognize MAX here at the GIMPLE level but
while the x86 (vector) max insns are fine for x > y ? x : y we
have no tree code or optab for exactly that, we have MAX_EXPR
which behaves differently for NaN and .FMAX which is exactly IEEE
which the x86 ISA isn't.

I wonder if we thus should if-convert this on the GIMPLE level
but to x > y ? x : y, thus a COND_EXPR?

Richard.

> > > level (we fail to match .FMAX at the GIMPLE level earlier).
> > >
> > > Bootstrapped and tested on x86_64-unknown-linux-gnu with regressions:
> > >
> > > FAIL: gcc.target/i386/pr54855-13.c scan-assembler-times vmaxsh[ t] 1
> > > FAIL: gcc.target/i386/pr54855-13.c scan-assembler-not vcomish[ t]
> > > FAIL: gcc.target/i386/pr54855-8.c scan-assembler-times maxsd 1
> > > FAIL: gcc.target/i386/pr54855-8.c scan-assembler-not movsd
> > > FAIL: gcc.target/i386/pr54855-9.c scan-assembler-times minss 1
> > > FAIL: gcc.target/i386/pr54855-9.c scan-assembler-not movss
> > >
> > > I think this is also PR88540 (the lack of min/max detection, not
> > > sure if the SSE min/max are suitable here)
> > >
> > > PR tree-optimization/94864
> > > PR tree-optimization/94865
> > > * match.pd (bit_insert @0 (BIT_FIELD_REF @1 ..) ..): New pattern
> > > for vector insertion from vector extraction.
> > >
> > > * gcc.target/i386/pr94864.c: New testcase.
> > > * gcc.target/i386/pr94865.c: Likewise.
> > > ---
> > >  gcc/match.pd| 25 +
> > >  gcc/testsuite/gcc.target/i386/pr94864.c | 13 +
> > >  gcc/testsuite/gcc.target/i386/pr94865.c | 13 +
> > >  3 files changed, 51 insertions(+)
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr94864.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr94865.c
> > >
> > > diff --git a/gcc/match.pd b/gcc/match.pd
> > > index 8543f777a28..8cc106049c4 100644
> > > --- a/gcc/match.pd
> > > +++ b/gcc/match.pd
> > > @@ -7770,6 +7770,31 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
> > >   wi::to_wide (@ipos) + isize))
> > >  (BIT_FIELD_REF @0 @rsize @rpos)
> > >
> > > +/* Simplify vector inserts of other vector extracts to a permute.  */
> > > +(simplify
> > > + (bit_insert @0 (BIT_FIELD_REF@2 @1 @rsize @rpos) @ipos)
> > > + (if (VECTOR_TYPE_P (type)
> > > +  && types_match (@0, @1)
> > > +  && types_match (TREE_TYPE (TREE_TYPE (@0)), TREE_TYPE (@2))
> > > +  && TYPE_VECTOR_SUBPARTS (type).is_constant ())
> > > +  (with
> > > +   {
> > > + unsigned HOST_WIDE_INT elsz
> > > +   = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (@1;
> > > + poly_uint64 relt = exact_div (tree_to_poly_uint64 (@rpos), elsz);
> > > + poly_uint64 ielt = exact_div (tree_to_poly_uint64 (@ipos), elsz);
> > > + unsigned nunits = TYPE_VECTOR_SUBPARTS (type).to_constant ();
> > > + vec_perm_builder builder;
> > > + 

[PATCH] Add VXRM enum

2023-07-13 Thread chenyixuan
From: XYenChi 

Noticed that the rvv-intrinsic-doc updated the __RISCV_VXRM.
gcc/ChangeLog:Add __RISCV_VXRM enum to riscv_vector.h

2023-07-13  XYenChi  

* config/riscv/riscv_vector.h (enum __RISCV_VXRM):Add an enum 
__RISCV_VXRM to help express the rounding modes.


---
 gcc/config/riscv/riscv_vector.h | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/gcc/config/riscv/riscv_vector.h b/gcc/config/riscv/riscv_vector.h
index ff54b6be863..0a90816be1a 100644
--- a/gcc/config/riscv/riscv_vector.h
+++ b/gcc/config/riscv/riscv_vector.h
@@ -42,6 +42,13 @@ enum RVV_CSR {
   RVV_VCSR,
 };
 
+enum __RISCV_VXRM {
+  __RISCV_VXRM_RNU = 0,
+  __RISCV_VXRM_RNE = 1,
+  __RISCV_VXRM_RDN = 2,
+  __RISCV_VXRM_ROD = 3,
+};
+
 __extension__ extern __inline unsigned long
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vread_csr(enum RVV_CSR csr)
-- 
2.41.0



Re: Re: [PATCH V7] RISC-V: RISC-V: Support gather_load/scatter RVV auto-vectorization

2023-07-13 Thread Richard Biener via Gcc-patches
On Thu, Jul 13, 2023 at 1:30 AM 钟居哲  wrote:
>
> I noticed vectorizable_call in the loop vectorizer.
> It vectorizes CALLs to functions such as fmax/fmin.
> From my understanding, we don't have RVV instructions for fmax/fmin?

There's things like .POPCOUNT which we can vectorize, but sure, it
depends on the ISA
if there's anything.

> So for now, I don't need to support builtin call function vectorization for 
> RVV.
> Am I right?
>
> I am wondering whether we do have some kind of builtin function call 
> vectorization by using RVV instructions.
>
>
> Thanks.
>
>
> juzhe.zh...@rivai.ai
>
> From: Jeff Law
> Date: 2023-07-13 06:25
> To: 钟居哲; gcc-patches
> CC: kito.cheng; kito.cheng; rdapp.gcc
> Subject: Re: [PATCH V7] RISC-V: RISC-V: Support gather_load/scatter RVV 
> auto-vectorization
>
>
> On 7/12/23 16:17, 钟居哲 wrote:
> > Thanks Jeff.
> > Will commit after formatting the code.
> >
> > I am going to support COND_FMA and reduction first (which I think
> > is higher priority).
> > Then I will come back to support strided_load/store.
> Sure.  One thing to note with strided loads, they can significantly
> help x264's sad/satd loops.  So hopefully you're testing with those :-)
>
>
>
> jeff
>


[PATCH][RFC] tree-optimization/88540 - FP x > y ? x : y if-conversion without -ffast-math

2023-07-13 Thread Richard Biener via Gcc-patches
The following makes sure that FP x > y ? x : y style max/min operations
are if-converted at the GIMPLE level.  While we can neither match
it to MAX_EXPR nor .FMAX as both have different semantics with IEEE
than the ternary ?: operation we can make sure to maintain this form
as a COND_EXPR so backends have the chance to match this to instructions
their ISA offers.

The patch does this in phiopt where we recognize min/max and instead
of giving up when we have to honor NaNs we alter the generated code
to a COND_EXPR.

This resolves PR88540 and we can then SLP vectorize the min operation
for its testcase.  It also resolves part of the regressions observed
with the change matching bit-inserts of bit-field-refs to vec_perm.

Expansion from a COND_EXPR rather than from compare-and-branch
regresses gcc.target/i386/pr54855-13.c and gcc.target/i386/pr54855-9.c
by producing extra moves while the corresponding min/max operations
are now already synthesized by RTL expansion, register selection
isn't optimal.  This can be also provoked without this change by
altering the operand order in the source.

It regresses gcc.target/i386/pr110170.c where we end up CSEing the
condition which makes RTL expansion no longer produce the min/max
directly and code generation is obfuscated enough to confuse
RTL if-conversion.

It also regresses gcc.target/i386/ssefp-[12].c where oddly one
variant isn't if-converted and ix86_expand_fp_movcc doesn't
match directly (the FP constants get expanded twice).  A fix
could be in emit_conditional_move where both prepare_cmp_insn
and emit_conditional_move_1 force the constants to (different)
registers.

Otherwise bootstrapped and tested on x86_64-unknown-linux-gnu.

PR tree-optimization/88540
* tree-ssa-phiopt.cc (minmax_replacement): Do not give up
with NaNs but handle the simple case by if-converting to a
COND_EXPR.

* gcc.target/i386/pr88540.c: New testcase.
* gcc.target/i386/pr54855-12.c: Adjust.
* gcc.target/i386/pr54855-13.c: Likewise.
---
 gcc/testsuite/gcc.target/i386/pr54855-12.c |  2 +-
 gcc/testsuite/gcc.target/i386/pr54855-13.c |  2 +-
 gcc/testsuite/gcc.target/i386/pr88540.c| 10 ++
 gcc/tree-ssa-phiopt.cc | 21 -
 4 files changed, 28 insertions(+), 7 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88540.c

diff --git a/gcc/testsuite/gcc.target/i386/pr54855-12.c 
b/gcc/testsuite/gcc.target/i386/pr54855-12.c
index 2f8af392c83..09e8ab8ae39 100644
--- a/gcc/testsuite/gcc.target/i386/pr54855-12.c
+++ b/gcc/testsuite/gcc.target/i386/pr54855-12.c
@@ -1,6 +1,6 @@
 /* { dg-do compile } */
 /* { dg-options "-O2 -mavx512fp16" } */
-/* { dg-final { scan-assembler-times "vmaxsh\[ \\t\]" 1 } } */
+/* { dg-final { scan-assembler-times "vm\[ai\]\[nx\]sh\[ \\t\]" 1 } } */
 /* { dg-final { scan-assembler-not "vcomish\[ \\t\]" } } */
 /* { dg-final { scan-assembler-not "vmovsh\[ \\t\]" { target { ! ia32 } } } } 
*/
 
diff --git a/gcc/testsuite/gcc.target/i386/pr54855-13.c 
b/gcc/testsuite/gcc.target/i386/pr54855-13.c
index 87b4f459a5a..a4f25066f81 100644
--- a/gcc/testsuite/gcc.target/i386/pr54855-13.c
+++ b/gcc/testsuite/gcc.target/i386/pr54855-13.c
@@ -1,6 +1,6 @@
 /* { dg-do compile } */
 /* { dg-options "-O2 -mavx512fp16" } */
-/* { dg-final { scan-assembler-times "vmaxsh\[ \\t\]" 1 } } */
+/* { dg-final { scan-assembler-times "vm\[ai\]\[nx\]sh\[ \\t\]" 1 } } */
 /* { dg-final { scan-assembler-not "vcomish\[ \\t\]" } } */
 /* { dg-final { scan-assembler-not "vmovsh\[ \\t\]" { target { ! ia32 } } } } 
*/
 
diff --git a/gcc/testsuite/gcc.target/i386/pr88540.c 
b/gcc/testsuite/gcc.target/i386/pr88540.c
new file mode 100644
index 000..b927d0c57d5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88540.c
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2" } */
+
+void test(double* __restrict d1, double* __restrict d2, double* __restrict d3)
+{
+  for (int n = 0; n < 2; ++n)
+d3[n] = d1[n] < d2[n] ? d1[n] : d2[n];
+}
+
+/* { dg-final { scan-assembler "minpd" } } */
diff --git a/gcc/tree-ssa-phiopt.cc b/gcc/tree-ssa-phiopt.cc
index 467c9fd108a..13ee486831d 100644
--- a/gcc/tree-ssa-phiopt.cc
+++ b/gcc/tree-ssa-phiopt.cc
@@ -1580,10 +1580,6 @@ minmax_replacement (basic_block cond_bb, basic_block 
middle_bb, basic_block alt_
 
   tree type = TREE_TYPE (PHI_RESULT (phi));
 
-  /* The optimization may be unsafe due to NaNs.  */
-  if (HONOR_NANS (type) || HONOR_SIGNED_ZEROS (type))
-return false;
-
   gcond *cond = as_a  (*gsi_last_bb (cond_bb));
   enum tree_code cmp = gimple_cond_code (cond);
   tree rhs = gimple_cond_rhs (cond);
@@ -1770,6 +1766,9 @@ minmax_replacement (basic_block cond_bb, basic_block 
middle_bb, basic_block alt_
   else
return false;
 }
+  else if (HONOR_NANS (type) || HONOR_SIGNED_ZEROS (type))
+/* The optimization may be unsafe due to NaNs.  */
+return false;
   else if (middle_bb != alt_middle_bb && 

RE: [PATCH v2] RISC-V: Refactor riscv mode after for VXRM and FRM

2023-07-13 Thread Li, Pan2 via Gcc-patches
Thanks Kito for the review. Sorry, PATCH v3 did not include the code that 
results in the selftest error, but the error can be reproduced with the diff 
below, applied on top of PATCH v3. Let me know if I didn't get the point of 
your comments.

diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 6ed735d6983..76689eaf8d5 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -233,6 +233,9 @@ static int epilogue_cfa_sp_offset;
 /* Which tuning parameters to use.  */
 static const struct riscv_tune_param *tune_param;

+static const_rtx vxrm_rtx = gen_rtx_REG (SImode, VXRM_REGNUM);
+static const_rtx frm_rtx = gen_rtx_REG (SImode, FRM_REGNUM);
+
 /* Which automaton to use for tuning.  */
 enum riscv_microarchitecture_type riscv_microarchitecture;

@@ -7717,7 +7720,7 @@ static bool
 vxrm_unknown_p (rtx_insn *insn)
 {
   /* Return true if there is a definition of VXRM.  */
-  if (reg_set_p (gen_rtx_REG (SImode, VXRM_REGNUM), insn))
+  if (reg_set_p (vxrm_rtx, insn))
 return true;

   /* A CALL function may contain an instruction that modifies the VXRM,
@@ -7739,7 +7742,7 @@ static bool
 frm_unknown_dynamic_p (rtx_insn *insn)
 {
   /* Return true if there is a definition of FRM.  */
-  if (reg_set_p (gen_rtx_REG (SImode, FRM_REGNUM), insn))
+  if (reg_set_p (frm_rtx, insn))
 return true;

   /* A CALL function may contain an instruction that modifies the FRM,
@@ -7761,7 +7764,7 @@ riscv_vxrm_mode_after (rtx_insn *insn, int mode)
   if (recog_memoized (insn) < 0)
 return mode;

-  if (reg_mentioned_p (gen_rtx_REG (SImode, VXRM_REGNUM), PATTERN (insn)))
+  if (reg_mentioned_p (vxrm_rtx, PATTERN (insn)))
 return get_attr_vxrm_mode (insn);
   else
 return mode;
@@ -7778,7 +7781,7 @@ riscv_frm_mode_after (rtx_insn *insn, int mode)
   if (recog_memoized (insn) < 0)
 return mode;

-  if (reg_mentioned_p (gen_rtx_REG (SImode, FRM_REGNUM), PATTERN (insn)))
+  if (reg_mentioned_p (frm_rtx, PATTERN (insn)))
 return get_attr_frm_mode (insn);
   else
 return mode;

Pan

-Original Message-
From: Kito Cheng  
Sent: Thursday, July 13, 2023 2:19 PM
To: Li, Pan2 
Cc: Jeff Law ; gcc-patches@gcc.gnu.org; 
juzhe.zh...@rivai.ai; rdapp@gmail.com; Wang, Yanzhang 

Subject: Re: [PATCH v2] RISC-V: Refactor riscv mode after for VXRM and FRM

Hmmm? I didn't get that error on selftest?

my diff with your v2:

$ git diff
diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 12655f7fdc65..466e1aed91c7 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -8058,8 +8058,9 @@ asm_insn_p (rtx_insn *insn)
static bool
vxrm_unknown_p (rtx_insn *insn)
{
+  static const_rtx vxrm_reg = gen_rtx_REG (SImode, VXRM_REGNUM);
  /* Return true if there is a definition of VXRM.  */
-  if (reg_set_p (gen_rtx_REG (SImode, VXRM_REGNUM), insn))
+  if (reg_set_p (vxrm_reg, insn))
return true;

  /* A CALL function may contain an instruction that modifies the VXRM,
@@ -8080,8 +8081,9 @@ vxrm_unknown_p (rtx_insn *insn)
static bool
frm_unknown_dynamic_p (rtx_insn *insn)
{
+  static const_rtx frm_reg = gen_rtx_REG (SImode, FRM_REGNUM);
  /* Return true if there is a definition of FRM.  */
-  if (reg_set_p (gen_rtx_REG (SImode, FRM_REGNUM), insn))
+  if (reg_set_p (frm_reg, insn))
return true;

  /* A CALL function may contain an instruction that modifies the FRM,


On Thu, Jul 13, 2023 at 1:07 PM Li, Pan2 via Gcc-patches
 wrote:
>
> Thanks Jeff and Kito for comments, update the V3 version as below.
>
> https://gcc.gnu.org/pipermail/gcc-patches/2023-July/624347.html
>
> > Extract the vxrm reg to a local static variable to avoid constructing it 
> > again and again.
>
> The "static const_rtx vxrm_rtx = gen_rtx_REG (SImode, VXRM_REGNUM)" results in 
> an error when running selftest, as shown below, thus patch v3 doesn't include 
> this change.
>
> /home/pli/repos/gcc/111/riscv-gnu-toolchain/build-gcc-newlib-stage1/./gcc/xgcc
>  -B/home/pli/repos/gcc/111/riscv-gnu-toolchain/build-gcc-newlib-stage1/./gcc/ 
>  -xc -nostdinc /dev/null -S -o /dev/null 
> -fself-test=../.././gcc/gcc/testsuite/selftests
> virtual memory exhausted: Invalid argument
> make[2]: *** [../.././gcc/gcc/c/Make-lang.in:153: s-selftest-c] Error 1
>
> Pan
>
> -Original Message-
> From: Jeff Law 
> Sent: Wednesday, July 12, 2023 11:31 PM
> To: Li, Pan2 ; gcc-patches@gcc.gnu.org
> Cc: juzhe.zh...@rivai.ai; rdapp@gmail.com; Wang, Yanzhang 
> ; kito.ch...@gmail.com
> Subject: Re: [PATCH v2] RISC-V: Refactor riscv mode after for VXRM and FRM
>
>
>
> On 7/11/23 23:50, pan2...@intel.com wrote:
> > From: Pan Li 
> >
> > When investigating the FRM dynamic rounding mode, we found that the global
> > unknown status is quite different between fixed-point and
> > floating-point. Thus, we separate the unknown function and extract
> > some common inner functions.
> >
> > We will also prepare more test cases in another PATCH.
> >
> > Signed-off-by: Pan Li 
> >
> > gcc/ChangeLog:
> >
> >   * config/riscv/riscv.cc 

Re: [PATCH] SSA MATH: Support COND_LEN_FMA for floating-point math optimization

2023-07-13 Thread Richard Biener via Gcc-patches
On Thu, 13 Jul 2023, juzhe.zh...@rivai.ai wrote:

> From: Ju-Zhe Zhong 
> 
> Hi, Richard and Richi.
> 
> The previous patch added support for COND_LEN_* binary operations. However,
> it didn't support COND_LEN_* ternary operations.
> 
> Now, this patch supports COND_LEN_* ternary operations. Consider the
> following case:
> 
> #define TEST_TYPE(TYPE)   
>  \
>   __attribute__ ((noipa)) void ternop_##TYPE (TYPE *__restrict dst,   
>  \
> TYPE *__restrict a,  \
> TYPE *__restrict b,\
> TYPE *__restrict c, int n)   \
>   {   
>  \
> for (int i = 0; i < n; i++)   
>  \
>   dst[i] += a[i] * b[i];  
>\
>   }
> 
> #define TEST_ALL() TEST_TYPE (double)
> 
> TEST_ALL ()
> 
> Before this patch:
> ...
> COND_LEN_MUL
> COND_LEN_ADD
> 
> After this patch:
> ...
> COND_LEN_FMA
> 
> gcc/ChangeLog:
> 
> * genmatch.cc (commutative_op): Add COND_LEN_*
> * internal-fn.cc (first_commutative_argument): Ditto.
> (CASE): Ditto.
> (get_unconditional_internal_fn): Ditto.
> (can_interpret_as_conditional_op_p): Ditto.
> (internal_fn_len_index): Ditto.
> * internal-fn.h (can_interpret_as_conditional_op_p): Ditto.
> * tree-ssa-math-opts.cc (convert_mult_to_fma_1): Ditto.
> (convert_mult_to_fma): Ditto.
> (math_opts_dom_walker::after_dom_children): Ditto.
> 
> ---
>  gcc/genmatch.cc   | 13 +++
>  gcc/internal-fn.cc| 82 +++
>  gcc/internal-fn.h |  2 +-
>  gcc/tree-ssa-math-opts.cc | 57 ---
>  4 files changed, 139 insertions(+), 15 deletions(-)
> 
> diff --git a/gcc/genmatch.cc b/gcc/genmatch.cc
> index 5fceeec9780..2302f2a7ff0 100644
> --- a/gcc/genmatch.cc
> +++ b/gcc/genmatch.cc
> @@ -559,6 +559,19 @@ commutative_op (id_base *id)
>case CFN_COND_FMS:
>case CFN_COND_FNMA:
>case CFN_COND_FNMS:
> +  case CFN_COND_LEN_ADD:
> +  case CFN_COND_LEN_MUL:
> +  case CFN_COND_LEN_MIN:
> +  case CFN_COND_LEN_MAX:
> +  case CFN_COND_LEN_FMIN:
> +  case CFN_COND_LEN_FMAX:
> +  case CFN_COND_LEN_AND:
> +  case CFN_COND_LEN_IOR:
> +  case CFN_COND_LEN_XOR:
> +  case CFN_COND_LEN_FMA:
> +  case CFN_COND_LEN_FMS:
> +  case CFN_COND_LEN_FNMA:
> +  case CFN_COND_LEN_FNMS:
>   return 1;
>  
>default:
> diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> index c11123a1173..e47b1377ff8 100644
> --- a/gcc/internal-fn.cc
> +++ b/gcc/internal-fn.cc
> @@ -4191,6 +4191,19 @@ first_commutative_argument (internal_fn fn)
>  case IFN_COND_FMS:
>  case IFN_COND_FNMA:
>  case IFN_COND_FNMS:
> +case IFN_COND_LEN_ADD:
> +case IFN_COND_LEN_MUL:
> +case IFN_COND_LEN_MIN:
> +case IFN_COND_LEN_MAX:
> +case IFN_COND_LEN_FMIN:
> +case IFN_COND_LEN_FMAX:
> +case IFN_COND_LEN_AND:
> +case IFN_COND_LEN_IOR:
> +case IFN_COND_LEN_XOR:
> +case IFN_COND_LEN_FMA:
> +case IFN_COND_LEN_FMS:
> +case IFN_COND_LEN_FNMA:
> +case IFN_COND_LEN_FNMS:
>return 1;
>  
>  default:
> @@ -4330,11 +4343,15 @@ conditional_internal_fn_code (internal_fn ifn)
>  {
>switch (ifn)
>  {
> -#define CASE(CODE, IFN) case IFN_COND_##IFN: return CODE;
> -  FOR_EACH_CODE_MAPPING(CASE)
> +#define CASE(CODE, IFN)  
>   \
> +  case IFN_COND_##IFN:   
>   \
> +return CODE; 
>   \
> +  case IFN_COND_LEN_##IFN:   
>   \
> +return CODE;
> +  FOR_EACH_CODE_MAPPING (CASE)
>  #undef CASE
> -default:
> -  return ERROR_MARK;
> +  default:
> + return ERROR_MARK;

either before or after white-space seems broken.

>  }
>  }
>  
> @@ -4433,6 +4450,18 @@ get_unconditional_internal_fn (internal_fn ifn)
> operating elementwise if the operands are vectors.  This includes
> the case of an all-true COND, so that the operation always happens.
>  
> +   There is an alternative approach to interpret the STMT when the operands
> +   are vectors which is the operation predicated by both conditional mask
> +   and loop control length, the equivalent C code:
> +
> + for (int i = 0; i < NUNTIS; i++)
> +  {
> + if (i < LEN + BIAS && COND[i])
> +   LHS[i] = A[i] CODE B[i];
> + else
> +   LHS[i] = ELSE[i];
> +  }
> +
> When returning true, set:
>  
> - *COND_OUT to the condition COND, or to NULL_TREE if the condition
> @@ -4440,13 +4469,18 @@ get_unconditional_internal_fn (internal_fn ifn)
> - *CODE_OUT 

Re: [PATCH] RISC-V: Throw compilation error for unknown sub-extension or supervisor extension

2023-07-13 Thread Lehua Ding
Thanks for review. I uploaded version V2, which addresses Kito's comments,
along with two changes. The first is to reduce repeated errors, which are 
currently
reported at least twice. The second is to report as many mistakes as possible.


V2 URL:https://gcc.gnu.org/pipermail/gcc-patches/2023-July/624377.html


Best,
Lehua

--Original--
From: "KitoCheng"https://github.com/riscv-non-isa/riscv-c-api-doc/blob/master/riscv-c-api.md#architecture-extension-test-macro
[2] 
https://github.com/riscv/riscv-isa-manual/blob/main/src/naming.adoc#additional-standard-extension-names

RE: [PATCH v2] RISC-V: Refactor riscv mode after for VXRM and FRM

2023-07-13 Thread Li, Pan2 via Gcc-patches
Sure thing, I get your point now; I will give it a try and send v4 if 
everything goes well.

Pan

-Original Message-
From: Kito Cheng  
Sent: Thursday, July 13, 2023 3:35 PM
To: Li, Pan2 
Cc: Jeff Law ; gcc-patches@gcc.gnu.org; 
juzhe.zh...@rivai.ai; rdapp@gmail.com; Wang, Yanzhang 

Subject: Re: [PATCH v2] RISC-V: Refactor riscv mode after for VXRM and FRM

Oh, I know why that failed for you: you need to put it within the
function, not at global static scope.  A function-local static variable is
constructed when the function is first invoked, rather than at program start.

Could you try to apply my diff in the last mail and try again?

On Thu, Jul 13, 2023 at 3:29 PM Li, Pan2 via Gcc-patches
 wrote:
>
> Thanks Kito for the review. Sorry, PATCH v3 did not include the code that 
> results in the selftest error, but the error can be reproduced with the diff 
> below, applied on top of PATCH v3. Let me know if I didn't get the point of 
> your comments.
>
> diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
> index 6ed735d6983..76689eaf8d5 100644
> --- a/gcc/config/riscv/riscv.cc
> +++ b/gcc/config/riscv/riscv.cc
> @@ -233,6 +233,9 @@ static int epilogue_cfa_sp_offset;
>  /* Which tuning parameters to use.  */
>  static const struct riscv_tune_param *tune_param;
>
> +static const_rtx vxrm_rtx = gen_rtx_REG (SImode, VXRM_REGNUM);
> +static const_rtx frm_rtx = gen_rtx_REG (SImode, FRM_REGNUM);
> +
>  /* Which automaton to use for tuning.  */
>  enum riscv_microarchitecture_type riscv_microarchitecture;
>
> @@ -7717,7 +7720,7 @@ static bool
>  vxrm_unknown_p (rtx_insn *insn)
>  {
>/* Return true if there is a definition of VXRM.  */
> -  if (reg_set_p (gen_rtx_REG (SImode, VXRM_REGNUM), insn))
> +  if (reg_set_p (vxrm_rtx, insn))
>  return true;
>
>/* A CALL function may contain an instruction that modifies the VXRM,
> @@ -7739,7 +7742,7 @@ static bool
>  frm_unknown_dynamic_p (rtx_insn *insn)
>  {
>/* Return true if there is a definition of FRM.  */
> -  if (reg_set_p (gen_rtx_REG (SImode, FRM_REGNUM), insn))
> +  if (reg_set_p (frm_rtx, insn))
>  return true;
>
>/* A CALL function may contain an instruction that modifies the FRM,
> @@ -7761,7 +7764,7 @@ riscv_vxrm_mode_after (rtx_insn *insn, int mode)
>if (recog_memoized (insn) < 0)
>  return mode;
>
> -  if (reg_mentioned_p (gen_rtx_REG (SImode, VXRM_REGNUM), PATTERN (insn)))
> +  if (reg_mentioned_p (vxrm_rtx, PATTERN (insn)))
>  return get_attr_vxrm_mode (insn);
>else
>  return mode;
> @@ -7778,7 +7781,7 @@ riscv_frm_mode_after (rtx_insn *insn, int mode)
>if (recog_memoized (insn) < 0)
>  return mode;
>
> -  if (reg_mentioned_p (gen_rtx_REG (SImode, FRM_REGNUM), PATTERN (insn)))
> +  if (reg_mentioned_p (frm_rtx, PATTERN (insn)))
>  return get_attr_frm_mode (insn);
>else
>  return mode;
>
> Pan
>
> -Original Message-
> From: Kito Cheng 
> Sent: Thursday, July 13, 2023 2:19 PM
> To: Li, Pan2 
> Cc: Jeff Law ; gcc-patches@gcc.gnu.org; 
> juzhe.zh...@rivai.ai; rdapp@gmail.com; Wang, Yanzhang 
> 
> Subject: Re: [PATCH v2] RISC-V: Refactor riscv mode after for VXRM and FRM
>
> Hmmm? I didn't get that error on selftest?
>
> my diff with your v2:
>
> $ git diff
> diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
> index 12655f7fdc65..466e1aed91c7 100644
> --- a/gcc/config/riscv/riscv.cc
> +++ b/gcc/config/riscv/riscv.cc
> @@ -8058,8 +8058,9 @@ asm_insn_p (rtx_insn *insn)
> static bool
> vxrm_unknown_p (rtx_insn *insn)
> {
> +  static const_rtx vxrm_reg = gen_rtx_REG (SImode, VXRM_REGNUM);
>   /* Return true if there is a definition of VXRM.  */
> -  if (reg_set_p (gen_rtx_REG (SImode, VXRM_REGNUM), insn))
> +  if (reg_set_p (vxrm_reg, insn))
> return true;
>
>   /* A CALL function may contain an instruction that modifies the VXRM,
> @@ -8080,8 +8081,9 @@ vxrm_unknown_p (rtx_insn *insn)
> static bool
> frm_unknown_dynamic_p (rtx_insn *insn)
> {
> +  static const_rtx frm_reg = gen_rtx_REG (SImode, FRM_REGNUM);
>   /* Return true if there is a definition of FRM.  */
> -  if (reg_set_p (gen_rtx_REG (SImode, FRM_REGNUM), insn))
> +  if (reg_set_p (frm_reg, insn))
> return true;
>
>   /* A CALL function may contain an instruction that modifies the FRM,
>
>
> On Thu, Jul 13, 2023 at 1:07 PM Li, Pan2 via Gcc-patches
>  wrote:
> >
> > Thanks Jeff and Kito for comments, update the V3 version as below.
> >
> > https://gcc.gnu.org/pipermail/gcc-patches/2023-July/624347.html
> >
> > > Extract the vxrm reg to a local static variable to avoid constructing 
> > > it again and again.
> >
> > The "static const_rtx vxrm_rtx = gen_rtx_REG (SImode, VXRM_REGNUM)" results 
> > in an error when running selftest, as shown below, thus patch v3 doesn't 
> > include this change.
> >
> > /home/pli/repos/gcc/111/riscv-gnu-toolchain/build-gcc-newlib-stage1/./gcc/xgcc
> >  
> > -B/home/pli/repos/gcc/111/riscv-gnu-toolchain/build-gcc-newlib-stage1/./gcc/
> >   -xc -nostdinc /dev/null -S -o /dev/null 
> > -fself-test=../.././gcc/gcc/testsuite/selftests

[PATCH 01/14] fortran: Outline final procedure pointer evaluation

2023-07-13 Thread Mikael Morin via Gcc-patches
gcc/fortran/ChangeLog:

* trans.cc (get_final_proc_ref): New function.
(gfc_build_final_call): Outline the pointer evaluation code
to get_final_proc_ref.
---
 gcc/fortran/trans.cc | 27 +--
 1 file changed, 21 insertions(+), 6 deletions(-)

diff --git a/gcc/fortran/trans.cc b/gcc/fortran/trans.cc
index f1a3aacd850..b5f7b16eda3 100644
--- a/gcc/fortran/trans.cc
+++ b/gcc/fortran/trans.cc
@@ -1085,6 +1085,21 @@ gfc_call_free (tree var)
 }
 
 
+/* Generate the data reference to the finalization procedure pointer passed as
+   argument in FINAL_WRAPPER.  */
+
+static void
+get_final_proc_ref (gfc_se *se, gfc_expr *final_wrapper)
+{
+  gcc_assert (final_wrapper->expr_type == EXPR_VARIABLE);
+
+  gfc_conv_expr (se, final_wrapper);
+
+  if (POINTER_TYPE_P (TREE_TYPE (se->expr)))
+se->expr = build_fold_indirect_ref_loc (input_location, se->expr);
+}
+
+
 /* Build a call to a FINAL procedure, which finalizes "var".  */
 
 static tree
@@ -1092,19 +1107,19 @@ gfc_build_final_call (gfc_typespec ts, gfc_expr 
*final_wrapper, gfc_expr *var,
  bool fini_coarray, gfc_expr *class_size)
 {
   stmtblock_t block;
+  gfc_se final_se;
   gfc_se se;
   tree final_fndecl, array, size, tmp;
   symbol_attribute attr;
 
-  gcc_assert (final_wrapper->expr_type == EXPR_VARIABLE);
   gcc_assert (var);
 
   gfc_start_block ();
-  gfc_init_se (, NULL);
-  gfc_conv_expr (, final_wrapper);
-  final_fndecl = se.expr;
-  if (POINTER_TYPE_P (TREE_TYPE (final_fndecl)))
-final_fndecl = build_fold_indirect_ref_loc (input_location, final_fndecl);
+
+  gfc_init_se (_se, NULL);
+  get_final_proc_ref (_se, final_wrapper);
+  gfc_add_block_to_block (, _se.pre);
+  final_fndecl = final_se.expr;
 
   if (ts.type == BT_DERIVED)
 {
-- 
2.40.1



[PATCH 07/14] fortran: Push element size expression generation close to its usage

2023-07-13 Thread Mikael Morin via Gcc-patches
gfc_add_finalizer_call creates one expression which is only used
by the get_elem_size function.  Move the expression generation
there.

gcc/fortran/ChangeLog:

* trans.cc (gfc_add_finalizer_call): Remove local variable
elem_size.  Pass expression to get_elem_size and move the
element size expression generation close to its usage there.
(get_elem_size): Add argument expr, remove class_size argument
and rebuild it from expr.  Remove ts argument and use the
type of expr instead.
---
 gcc/fortran/trans.cc | 25 +++--
 1 file changed, 11 insertions(+), 14 deletions(-)

diff --git a/gcc/fortran/trans.cc b/gcc/fortran/trans.cc
index 3750d4eca82..e5ad67199e7 100644
--- a/gcc/fortran/trans.cc
+++ b/gcc/fortran/trans.cc
@@ -1100,24 +1100,26 @@ get_final_proc_ref (gfc_se *se, gfc_expr *final_wrapper)
 }
 
 
-/* Generate the code to obtain the value of the element size whose expression
-   is passed as argument in CLASS_SIZE.  */
+/* Generate the code to obtain the value of the element size of the expression
+   passed as argument in EXPR.  */
 
 static void
-get_elem_size (gfc_se *se, gfc_typespec *ts, gfc_expr *class_size)
+get_elem_size (gfc_se *se, gfc_expr *expr)
 {
-  gcc_assert (ts->type == BT_DERIVED || ts->type == BT_CLASS);
+  gcc_assert (expr->ts.type == BT_DERIVED || expr->ts.type == BT_CLASS);
 
-  if (ts->type == BT_DERIVED)
+  if (expr->ts.type == BT_DERIVED)
 {
-  gcc_assert (!class_size);
-  se->expr = gfc_typenode_for_spec (ts);
+  se->expr = gfc_typenode_for_spec (>ts);
   se->expr = TYPE_SIZE_UNIT (se->expr);
   se->expr = fold_convert (gfc_array_index_type, se->expr);
 }
   else
 {
-  gcc_assert (class_size);
+  gfc_expr *class_size = gfc_copy_expr (expr);
+  gfc_add_vptr_component (class_size);
+  gfc_add_size_component (class_size);
+
   gfc_conv_expr (se, class_size);
   gcc_assert (se->post.head == NULL_TREE);
 }
@@ -1307,7 +1309,6 @@ gfc_add_finalizer_call (stmtblock_t *block, gfc_expr 
*expr2)
   gfc_ref *ref;
   gfc_expr *expr;
   gfc_expr *final_expr = NULL;
-  gfc_expr *elem_size = NULL;
   bool has_finalizer = false;
 
   if (!expr2 || (expr2->ts.type != BT_DERIVED && expr2->ts.type != BT_CLASS))
@@ -1361,10 +1362,6 @@ gfc_add_finalizer_call (stmtblock_t *block, gfc_expr 
*expr2)
   final_expr = gfc_copy_expr (expr);
   gfc_add_vptr_component (final_expr);
   gfc_add_final_component (final_expr);
-
-  elem_size = gfc_copy_expr (expr);
-  gfc_add_vptr_component (elem_size);
-  gfc_add_size_component (elem_size);
 }
 
   gcc_assert (final_expr->expr_type == EXPR_VARIABLE);
@@ -1379,7 +1376,7 @@ gfc_add_finalizer_call (stmtblock_t *block, gfc_expr 
*expr2)
 
   gfc_se size_se;
   gfc_init_se (_se, NULL);
-  get_elem_size (_se, >ts, elem_size);
+  get_elem_size (_se, expr);
   gfc_add_block_to_block (_block, _se.pre);
 
   gfc_se desc_se;
-- 
2.40.1



[PATCH 09/14] fortran: Inline variable definition

2023-07-13 Thread Mikael Morin via Gcc-patches
The variable has_finalizer is only used in one place; inline its
definition there.

gcc/fortran/ChangeLog:

* trans.cc (gfc_add_finalizer_call): Inline definition of
variable has_finalizer.  Merge nested conditions.
---
 gcc/fortran/trans.cc | 16 +++-
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/gcc/fortran/trans.cc b/gcc/fortran/trans.cc
index c6a65c87c5c..99677d37da7 100644
--- a/gcc/fortran/trans.cc
+++ b/gcc/fortran/trans.cc
@@ -1321,7 +1321,6 @@ gfc_add_finalizer_call (stmtblock_t *block, gfc_expr 
*expr2)
   tree tmp;
   gfc_ref *ref;
   gfc_expr *expr;
-  bool has_finalizer = false;
 
   if (!expr2 || (expr2->ts.type != BT_DERIVED && expr2->ts.type != BT_CLASS))
 return false;
@@ -1361,13 +1360,11 @@ gfc_add_finalizer_call (stmtblock_t *block, gfc_expr 
*expr2)
  ref->next = NULL;
}
 
-  if (expr->ts.type == BT_CLASS)
-{
-  has_finalizer = gfc_is_finalizable (expr->ts.u.derived, NULL);
-
-  if (!expr2->rank && !expr2->ref && CLASS_DATA 
(expr2->symtree->n.sym)->as)
-   expr->rank = CLASS_DATA (expr2->symtree->n.sym)->as->rank;
-}
+  if (expr->ts.type == BT_CLASS
+  && !expr2->rank
+  && !expr2->ref
+  && CLASS_DATA (expr2->symtree->n.sym)->as)
+expr->rank = CLASS_DATA (expr2->symtree->n.sym)->as->rank;
 
   stmtblock_t tmp_block;
   gfc_start_block (_block);
@@ -1398,7 +1395,8 @@ gfc_add_finalizer_call (stmtblock_t *block, gfc_expr 
*expr2)
 
   tmp = gfc_finish_block (_block);
 
-  if (expr->ts.type == BT_CLASS && !has_finalizer)
+  if (expr->ts.type == BT_CLASS
+  && !gfc_is_finalizable (expr->ts.u.derived, NULL))
 {
   tree cond;
   gfc_se se;
-- 
2.40.1



RE: [PATCH v2] RISC-V: Refactor riscv mode after for VXRM and FRM

2023-07-13 Thread Li, Pan2 via Gcc-patches
It passes the selftest with the diff below (based on v3), but hits an ICE when
building newlib.

/home/pli/repos/gcc/222/riscv-gnu-toolchain/newlib/newlib/libc/time/../time/strftime.c:1426:1:
 internal compiler error: in reg_overlap_mentioned_p, at rtlanal.cc:1928
 1426 | }
  | ^
0x87241f reg_overlap_mentioned_p(rtx_def const*, rtx_def const*)
../.././gcc/gcc/rtlanal.cc:1928
0x1005eab set_of_1
../.././gcc/gcc/rtlanal.cc:1440
0x10015c2 set_of(rtx_def const*, rtx_def const*)
../.././gcc/gcc/rtlanal.cc:1452
0x10015c2 reg_set_p(rtx_def const*, rtx_def const*)
../.././gcc/gcc/rtlanal.cc:1295
0x13f66c0 vxrm_unknown_p
../.././gcc/gcc/config/riscv/riscv.cc:7720
0x13f66c0 riscv_vxrm_mode_after
../.././gcc/gcc/config/riscv/riscv.cc:7760
0x13f66c0 riscv_mode_after
../.././gcc/gcc/config/riscv/riscv.cc:7799
0x1defe69 optimize_mode_switching
../.././gcc/gcc/mode-switching.cc:632
0x1defe69 execute
../.././gcc/gcc/mode-switching.cc:909


Diff based on PATCH v3.
diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 6ed735d6983..d66ba0030eb 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -7714,10 +7714,10 @@ asm_insn_p (rtx_insn *insn)
 /* Return TRUE that an insn is unknown for VXRM.  */
 
 static bool
-vxrm_unknown_p (rtx_insn *insn)
+vxrm_unknown_p (rtx_insn *insn, const_rtx vxrm_reg)
 {
   /* Return true if there is a definition of VXRM.  */
-  if (reg_set_p (gen_rtx_REG (SImode, VXRM_REGNUM), insn))
+  if (reg_set_p (vxrm_reg, insn))
 return true;
 
   /* A CALL function may contain an instruction that modifies the VXRM,
@@ -7736,10 +7736,10 @@ vxrm_unknown_p (rtx_insn *insn)
 /* Return TRUE that an insn is unknown dynamic for FRM.  */
 
 static bool
-frm_unknown_dynamic_p (rtx_insn *insn)
+frm_unknown_dynamic_p (rtx_insn *insn, const_rtx frm_reg)
 {
   /* Return true if there is a definition of FRM.  */
-  if (reg_set_p (gen_rtx_REG (SImode, FRM_REGNUM), insn))
+  if (reg_set_p (frm_reg, insn))
 return true;
 
   /* A CALL function may contain an instruction that modifies the FRM,
@@ -7755,13 +7755,15 @@ frm_unknown_dynamic_p (rtx_insn *insn)
 static int
 riscv_vxrm_mode_after (rtx_insn *insn, int mode)
 {
-  if (vxrm_unknown_p (insn))
+  static const_rtx vxrm_reg = gen_rtx_REG (SImode, VXRM_REGNUM);
+
+  if (vxrm_unknown_p (insn, vxrm_reg))
 return VXRM_MODE_NONE;
 
   if (recog_memoized (insn) < 0)
 return mode;
 
-  if (reg_mentioned_p (gen_rtx_REG (SImode, VXRM_REGNUM), PATTERN (insn)))
+  if (reg_mentioned_p (vxrm_reg, PATTERN (insn)))
 return get_attr_vxrm_mode (insn);
   else
 return mode;
@@ -7772,13 +7774,15 @@ riscv_vxrm_mode_after (rtx_insn *insn, int mode)
 static int
 riscv_frm_mode_after (rtx_insn *insn, int mode)
 {
-  if (frm_unknown_dynamic_p (insn))
+  static const_rtx frm_reg = gen_rtx_REG (SImode, FRM_REGNUM);
+
+  if (frm_unknown_dynamic_p (insn, frm_reg))
 return FRM_MODE_DYN;
 
   if (recog_memoized (insn) < 0)
 return mode;
 
-  if (reg_mentioned_p (gen_rtx_REG (SImode, FRM_REGNUM), PATTERN (insn)))
+  if (reg_mentioned_p (frm_reg, PATTERN (insn)))
 return get_attr_frm_mode (insn);
   else
 return mode;

Pan

-Original Message-
From: Li, Pan2 
Sent: Thursday, July 13, 2023 4:42 PM
To: Kito Cheng 
Cc: Jeff Law ; gcc-patches@gcc.gnu.org; 
juzhe.zh...@rivai.ai; rdapp@gmail.com; Wang, Yanzhang 

Subject: RE: [PATCH v2] RISC-V: Refactor riscv mode after for VXRM and FRM

Sure thing, I get your point now. I will give it a try and send v4 if
everything goes well.

Pan

-Original Message-
From: Kito Cheng  
Sent: Thursday, July 13, 2023 3:35 PM
To: Li, Pan2 
Cc: Jeff Law ; gcc-patches@gcc.gnu.org; 
juzhe.zh...@rivai.ai; rdapp@gmail.com; Wang, Yanzhang 

Subject: Re: [PATCH v2] RISC-V: Refactor riscv mode after for VXRM and FRM

Oh, I know why that failed: you need to put it within the function rather
than making it a global static.  A function-local static variable is
constructed when the function is first invoked rather than at program start.

Could you try to apply my diff in the last mail and try again?

On Thu, Jul 13, 2023 at 3:29 PM Li, Pan2 via Gcc-patches
 wrote:
>
> Thanks Kito for the review. Sorry, PATCH v3 didn't include the code that
> results in the selftest error, but it can be reproduced with the diff below
> (based on PATCH v3). Let me know if I didn't get the point of your comments.
>
> diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
> index 6ed735d6983..76689eaf8d5 100644
> --- a/gcc/config/riscv/riscv.cc
> +++ b/gcc/config/riscv/riscv.cc
> @@ -233,6 +233,9 @@ static int epilogue_cfa_sp_offset;
>  /* Which tuning parameters to use.  */
>  static const struct riscv_tune_param *tune_param;
>
> +static const_rtx vxrm_rtx = gen_rtx_REG (SImode, VXRM_REGNUM);
> +static const_rtx frm_rtx = gen_rtx_REG (SImode, FRM_REGNUM);
> +
>  /* Which automaton to use for tuning.  */
>  enum riscv_microarchitecture_type riscv_microarchitecture;
>
> @@ 

Re: [PATCH v2 1/2] c++, libstdc++: implement __is_pointer built-in trait

2023-07-13 Thread Jonathan Wakely via Gcc-patches
On Wed, 12 Jul 2023 at 21:42, Ken Matsui  wrote:
>
> On Wed, Jul 12, 2023 at 3:01 AM Jonathan Wakely  wrote:
> >
> > On Mon, 10 Jul 2023 at 06:51, Ken Matsui via Libstdc++
> >  wrote:
> > >
> > > Hi,
> > >
> > > Here is the benchmark result for is_pointer:
> > >
> > > https://github.com/ken-matsui/gcc-benches/blob/main/is_pointer.md#sun-jul--9-103948-pm-pdt-2023
> > >
> > > Time: -62.1344%
> > > Peak Memory Usage: -52.4281%
> > > Total Memory Usage: -53.5889%
> >
> > Wow!
> >
> > Although maybe we could have improved our std::is_pointer_v anyway, like so:
> >
> > template 
> >   inline constexpr bool is_pointer_v = false;
> > template 
> >   inline constexpr bool is_pointer_v<_Tp*> = true;
> > template 
> >   inline constexpr bool is_pointer_v<_Tp* const> = true;
> > template 
> >   inline constexpr bool is_pointer_v<_Tp* volatile> = true;
> > template 
> >   inline constexpr bool is_pointer_v<_Tp* const volatile> = true;
> >
> > I'm not sure why I didn't already do that.
> >
> > Could you please benchmark that? And if it is better than the current
> > impl using is_pointer<_Tp>::value then we should do this in the
> > library:
> >
> > #if __has_builtin(__is_pointer)
> > template 
> >   inline constexpr bool is_pointer_v = __is_pointer(_Tp);
> > #else
> > template 
> >   inline constexpr bool is_pointer_v = false;
> > template 
> >   inline constexpr bool is_pointer_v<_Tp*> = true;
> > template 
> >   inline constexpr bool is_pointer_v<_Tp* const> = true;
> > template 
> >   inline constexpr bool is_pointer_v<_Tp* volatile> = true;
> > template 
> >   inline constexpr bool is_pointer_v<_Tp* const volatile> = true;
> > #endif
>
> Hi François and Jonathan,
>
> Thank you for your reviews! I will rename the four underscores to the
> appropriate name and take a benchmark once I get home.
>
> If I apply your change on is_pointer_v, is it better to add the
> `Co-authored-by:` line in the commit?

Yes, that would be the correct thing to do (although in this case the
change is small enough that I don't really care about getting credit
for it :-)



[PATCH 03/14] fortran: Outline data reference descriptor evaluation

2023-07-13 Thread Mikael Morin via Gcc-patches
gcc/fortran/ChangeLog:

* trans.cc (get_var_descr): New function.
(gfc_build_final_call): Outline the data reference descriptor
evaluation code to get_var_descr.
---
 gcc/fortran/trans.cc | 149 ---
 1 file changed, 83 insertions(+), 66 deletions(-)

diff --git a/gcc/fortran/trans.cc b/gcc/fortran/trans.cc
index 1e4779f94af..9807b7eb9d9 100644
--- a/gcc/fortran/trans.cc
+++ b/gcc/fortran/trans.cc
@@ -1124,6 +1124,83 @@ get_elem_size (gfc_se *se, gfc_typespec *ts, gfc_expr 
*class_size)
 }
 
 
+/* Generate the data reference (array) descriptor corresponding to the
+   expression passed as argument in VAR.  Use type in TS to pilot code
+   generation.  */
+
+static void
+get_var_descr (gfc_se *se, gfc_typespec *ts, gfc_expr *var)
+{
+  gfc_se tmp_se;
+  symbol_attribute attr;
+
+  gcc_assert (var);
+
+  gfc_init_se (_se, NULL);
+
+  if (ts->type == BT_DERIVED)
+{
+  tmp_se.want_pointer = 1;
+  if (var->rank)
+   {
+ tmp_se.descriptor_only = 1;
+ gfc_conv_expr_descriptor (_se, var);
+   }
+  else
+   {
+ gfc_conv_expr (_se, var);
+//   gcc_assert (se.pre.head == NULL_TREE && se.post.head == NULL_TREE);
+
+ /* No copy back needed, hence set attr's allocatable/pointer
+to zero.  */
+ gfc_clear_attr ();
+ tmp_se.expr = gfc_conv_scalar_to_descriptor (_se, tmp_se.expr,
+  attr);
+ gcc_assert (tmp_se.post.head == NULL_TREE);
+   }
+}
+  else
+{
+  gfc_expr *array_expr;
+
+  array_expr = gfc_copy_expr (var);
+
+  tmp_se.want_pointer = 1;
+  if (array_expr->rank)
+   {
+ gfc_add_class_array_ref (array_expr);
+ tmp_se.descriptor_only = 1;
+ gfc_conv_expr_descriptor (_se, array_expr);
+   }
+  else
+   {
+ gfc_add_data_component (array_expr);
+ gfc_conv_expr (_se, array_expr);
+ gcc_assert (tmp_se.post.head == NULL_TREE);
+
+ if (!gfc_is_coarray (array_expr))
+   {
+ /* No copy back needed, hence set attr's allocatable/pointer
+to zero.  */
+ gfc_clear_attr ();
+ tmp_se.expr = gfc_conv_scalar_to_descriptor (_se, tmp_se.expr,
+  attr);
+   }
+ gcc_assert (tmp_se.post.head == NULL_TREE);
+   }
+  gfc_free_expr (array_expr);
+}
+
+  if (!POINTER_TYPE_P (TREE_TYPE (tmp_se.expr)))
+tmp_se.expr = gfc_build_addr_expr (NULL, tmp_se.expr);
+
+  gfc_add_block_to_block (>pre, _se.pre);
+  gfc_add_block_to_block (>post, _se.post);
+  se->expr = tmp_se.expr;
+}
+
+
+
 /* Build a call to a FINAL procedure, which finalizes "var".  */
 
 static tree
@@ -1131,10 +1208,8 @@ gfc_build_final_call (gfc_typespec ts, gfc_expr 
*final_wrapper, gfc_expr *var,
  bool fini_coarray, gfc_expr *class_size)
 {
   stmtblock_t block;
-  gfc_se final_se, size_se;
-  gfc_se se;
+  gfc_se final_se, size_se, desc_se;
   tree final_fndecl, array, size, tmp;
-  symbol_attribute attr;
 
   gcc_assert (var);
 
@@ -1150,74 +1225,16 @@ gfc_build_final_call (gfc_typespec ts, gfc_expr 
*final_wrapper, gfc_expr *var,
   gfc_add_block_to_block (, _se.pre);
   size = size_se.expr;
 
-  if (ts.type == BT_DERIVED)
-{
-  gfc_init_se (, NULL);
-  se.want_pointer = 1;
-  if (var->rank)
-   {
- se.descriptor_only = 1;
- gfc_conv_expr_descriptor (, var);
- array = se.expr;
-   }
-  else
-   {
- gfc_conv_expr (, var);
-//   gcc_assert (se.pre.head == NULL_TREE && se.post.head == NULL_TREE);
- array = se.expr;
+  gfc_init_se (_se, NULL);
+  get_var_descr (_se, , var);
+  gfc_add_block_to_block (, _se.pre);
+  array = desc_se.expr;
 
- /* No copy back needed, hence set attr's allocatable/pointer
-to zero.  */
- gfc_clear_attr ();
- gfc_init_se (, NULL);
- array = gfc_conv_scalar_to_descriptor (, array, attr);
- gcc_assert (se.post.head == NULL_TREE);
-   }
-}
-  else
-{
-  gfc_expr *array_expr;
-
-  array_expr = gfc_copy_expr (var);
-  gfc_init_se (, NULL);
-  se.want_pointer = 1;
-  if (array_expr->rank)
-   {
- gfc_add_class_array_ref (array_expr);
- se.descriptor_only = 1;
- gfc_conv_expr_descriptor (, array_expr);
- array = se.expr;
-   }
-  else
-   {
- gfc_add_data_component (array_expr);
- gfc_conv_expr (, array_expr);
- gfc_add_block_to_block (, );
- gcc_assert (se.post.head == NULL_TREE);
- array = se.expr;
-
- if (!gfc_is_coarray (array_expr))
-   {
- /* No copy back needed, hence set attr's allocatable/pointer
-to zero.  */
- gfc_clear_attr ();
- gfc_init_se (, NULL);
-

[PATCH 02/14] fortran: Outline element size evaluation

2023-07-13 Thread Mikael Morin via Gcc-patches
gcc/fortran/ChangeLog:

* trans.cc (get_elem_size): New function.
(gfc_build_final_call): Outline the element size evaluation
to get_elem_size.
---
 gcc/fortran/trans.cc | 44 ++--
 1 file changed, 30 insertions(+), 14 deletions(-)

diff --git a/gcc/fortran/trans.cc b/gcc/fortran/trans.cc
index b5f7b16eda3..1e4779f94af 100644
--- a/gcc/fortran/trans.cc
+++ b/gcc/fortran/trans.cc
@@ -1100,6 +1100,30 @@ get_final_proc_ref (gfc_se *se, gfc_expr *final_wrapper)
 }
 
 
+/* Generate the code to obtain the value of the element size whose expression
+   is passed as argument in CLASS_SIZE.  */
+
+static void
+get_elem_size (gfc_se *se, gfc_typespec *ts, gfc_expr *class_size)
+{
+  gcc_assert (ts->type == BT_DERIVED || ts->type == BT_CLASS);
+
+  if (ts->type == BT_DERIVED)
+{
+  gcc_assert (!class_size);
+  se->expr = gfc_typenode_for_spec (ts);
+  se->expr = TYPE_SIZE_UNIT (se->expr);
+  se->expr = fold_convert (gfc_array_index_type, se->expr);
+}
+  else
+{
+  gcc_assert (class_size);
+  gfc_conv_expr (se, class_size);
+  gcc_assert (se->post.head == NULL_TREE);
+}
+}
+
+
 /* Build a call to a FINAL procedure, which finalizes "var".  */
 
 static tree
@@ -1107,7 +1131,7 @@ gfc_build_final_call (gfc_typespec ts, gfc_expr 
*final_wrapper, gfc_expr *var,
  bool fini_coarray, gfc_expr *class_size)
 {
   stmtblock_t block;
-  gfc_se final_se;
+  gfc_se final_se, size_se;
   gfc_se se;
   tree final_fndecl, array, size, tmp;
   symbol_attribute attr;
@@ -1121,15 +1145,13 @@ gfc_build_final_call (gfc_typespec ts, gfc_expr 
*final_wrapper, gfc_expr *var,
   gfc_add_block_to_block (, _se.pre);
   final_fndecl = final_se.expr;
 
+  gfc_init_se (_se, NULL);
+  get_elem_size (_se, , class_size);
+  gfc_add_block_to_block (, _se.pre);
+  size = size_se.expr;
+
   if (ts.type == BT_DERIVED)
 {
-  tree elem_size;
-
-  gcc_assert (!class_size);
-  elem_size = gfc_typenode_for_spec ();
-  elem_size = TYPE_SIZE_UNIT (elem_size);
-  size = fold_convert (gfc_array_index_type, elem_size);
-
   gfc_init_se (, NULL);
   se.want_pointer = 1;
   if (var->rank)
@@ -1155,12 +1177,6 @@ gfc_build_final_call (gfc_typespec ts, gfc_expr 
*final_wrapper, gfc_expr *var,
   else
 {
   gfc_expr *array_expr;
-  gcc_assert (class_size);
-  gfc_init_se (, NULL);
-  gfc_conv_expr (, class_size);
-  gfc_add_block_to_block (, );
-  gcc_assert (se.post.head == NULL_TREE);
-  size = se.expr;
 
   array_expr = gfc_copy_expr (var);
   gfc_init_se (, NULL);
-- 
2.40.1



[PATCH 04/14] fortran: Inline gfc_build_final_call

2023-07-13 Thread Mikael Morin via Gcc-patches
Function gfc_build_final_call has been simplified, inline it.

gcc/fortran/ChangeLog:

* trans.cc (gfc_build_final_call): Inline...
(gfc_add_finalizer_call): ... to its one caller.
---
 gcc/fortran/trans.cc | 66 +---
 1 file changed, 25 insertions(+), 41 deletions(-)

diff --git a/gcc/fortran/trans.cc b/gcc/fortran/trans.cc
index 9807b7eb9d9..f8ca388ab9f 100644
--- a/gcc/fortran/trans.cc
+++ b/gcc/fortran/trans.cc
@@ -1201,45 +1201,6 @@ get_var_descr (gfc_se *se, gfc_typespec *ts, gfc_expr 
*var)
 
 
 
-/* Build a call to a FINAL procedure, which finalizes "var".  */
-
-static tree
-gfc_build_final_call (gfc_typespec ts, gfc_expr *final_wrapper, gfc_expr *var,
- bool fini_coarray, gfc_expr *class_size)
-{
-  stmtblock_t block;
-  gfc_se final_se, size_se, desc_se;
-  tree final_fndecl, array, size, tmp;
-
-  gcc_assert (var);
-
-  gfc_start_block ();
-
-  gfc_init_se (_se, NULL);
-  get_final_proc_ref (_se, final_wrapper);
-  gfc_add_block_to_block (, _se.pre);
-  final_fndecl = final_se.expr;
-
-  gfc_init_se (_se, NULL);
-  get_elem_size (_se, , class_size);
-  gfc_add_block_to_block (, _se.pre);
-  size = size_se.expr;
-
-  gfc_init_se (_se, NULL);
-  get_var_descr (_se, , var);
-  gfc_add_block_to_block (, _se.pre);
-  array = desc_se.expr;
-
-  tmp = build_call_expr_loc (input_location,
-final_fndecl, 3, array,
-size, fini_coarray ? boolean_true_node
-   : boolean_false_node);
-  gfc_add_block_to_block (, _se.post);
-  gfc_add_expr_to_block (, tmp);
-  return gfc_finish_block ();
-}
-
-
 bool
 gfc_add_comp_finalizer_call (stmtblock_t *block, tree decl, gfc_component 
*comp,
 bool fini_coarray)
@@ -1408,8 +1369,31 @@ gfc_add_finalizer_call (stmtblock_t *block, gfc_expr 
*expr2)
 
   gcc_assert (final_expr->expr_type == EXPR_VARIABLE);
 
-  tmp = gfc_build_final_call (expr->ts, final_expr, expr,
- false, elem_size);
+  stmtblock_t tmp_block;
+  gfc_start_block (_block);
+
+  gfc_se final_se;
+  gfc_init_se (_se, NULL);
+  get_final_proc_ref (_se, final_expr);
+  gfc_add_block_to_block (_block, _se.pre);
+
+  gfc_se size_se;
+  gfc_init_se (_se, NULL);
+  get_elem_size (_se, >ts, elem_size);
+  gfc_add_block_to_block (_block, _se.pre);
+
+  gfc_se desc_se;
+  gfc_init_se (_se, NULL);
+  get_var_descr (_se, >ts, expr);
+  gfc_add_block_to_block (_block, _se.pre);
+
+  tmp = build_call_expr_loc (input_location, final_se.expr, 3,
+desc_se.expr, size_se.expr,
+boolean_false_node);
+
+  gfc_add_block_to_block (_block, _se.post);
+  gfc_add_expr_to_block (_block, tmp);
+  tmp = gfc_finish_block (_block);
 
   if (expr->ts.type == BT_CLASS && !has_finalizer)
 {
-- 
2.40.1



[PATCH 11/14] fortran: Outline virtual table pointer evaluation

2023-07-13 Thread Mikael Morin via Gcc-patches
gcc/fortran/ChangeLog:

* trans.cc (get_vptr): New function.
(gfc_add_finalizer_call): Move virtual table pointer evaluation
to get_vptr.
---
 gcc/fortran/trans.cc | 33 ++---
 1 file changed, 22 insertions(+), 11 deletions(-)

diff --git a/gcc/fortran/trans.cc b/gcc/fortran/trans.cc
index bcf3341fd4b..731dfb626ab 100644
--- a/gcc/fortran/trans.cc
+++ b/gcc/fortran/trans.cc
@@ -1214,6 +1214,23 @@ get_var_descr (gfc_se *se, gfc_expr *var)
 }
 
 
+static void
+get_vptr (gfc_se *se, gfc_expr *expr)
+{
+  gfc_expr *vptr_expr = gfc_copy_expr (expr);
+  gfc_add_vptr_component (vptr_expr);
+
+  gfc_se tmp_se;
+  gfc_init_se (_se, NULL);
+  tmp_se.want_pointer = 1;
+  gfc_conv_expr (_se, vptr_expr);
+  gfc_free_expr (vptr_expr);
+
+  gfc_add_block_to_block (>pre, _se.pre);
+  gfc_add_block_to_block (>post, _se.post);
+  se->expr = tmp_se.expr;
+}
+
 
 bool
 gfc_add_comp_finalizer_call (stmtblock_t *block, tree decl, gfc_component 
*comp,
@@ -1398,7 +1415,6 @@ gfc_add_finalizer_call (stmtblock_t *block, gfc_expr 
*expr2)
   && !gfc_is_finalizable (expr->ts.u.derived, NULL))
 {
   tree cond;
-  gfc_se se;
 
   tree ptr = gfc_build_addr_expr (NULL_TREE, final_se.expr);
 
@@ -1410,19 +1426,14 @@ gfc_add_finalizer_call (stmtblock_t *block, gfc_expr 
*expr2)
   if (UNLIMITED_POLY (expr))
{
  tree cond2;
- gfc_expr *vptr_expr;
+ gfc_se vptr_se;
 
- vptr_expr = gfc_copy_expr (expr);
- gfc_add_vptr_component (vptr_expr);
-
- gfc_init_se (, NULL);
- se.want_pointer = 1;
- gfc_conv_expr (, vptr_expr);
- gfc_free_expr (vptr_expr);
+ gfc_init_se (_se, NULL);
+ get_vptr (_se, expr);
 
  cond2 = fold_build2_loc (input_location, NE_EXPR, logical_type_node,
-  se.expr,
-  build_int_cst (TREE_TYPE (se.expr), 0));
+  vptr_se.expr,
+  build_int_cst (TREE_TYPE (vptr_se.expr), 0));
  cond = fold_build2_loc (input_location, TRUTH_ANDIF_EXPR,
  logical_type_node, cond2, cond);
}
-- 
2.40.1



[PATCH 14/14] fortran: Pass pre-calculated class container argument [pr110618]

2023-07-13 Thread Mikael Morin via Gcc-patches
Pass already evaluated class container argument from
gfc_conv_procedure_call down to gfc_add_finalizer_call through
gfc_deallocate_scalar_with_status and gfc_deallocate_with_status,
to avoid repeatedly evaluating the same data reference expressions
in the generated code.

PR fortran/110618

gcc/fortran/ChangeLog:

* trans.h (gfc_deallocate_with_status): Add class container
argument.
(gfc_deallocate_scalar_with_status): Ditto.
* trans.cc (gfc_deallocate_with_status): Add class container
argument and pass it down to gfc_add_finalizer_call.
(gfc_deallocate_scalar_with_status): Same.
* trans-array.cc (structure_alloc_comps): Update caller.
* trans-stmt.cc (gfc_trans_deallocate): Ditto.
* trans-expr.cc (gfc_conv_procedure_call): Ditto.  Pass
pre-evaluated class container argument if it's available.

gcc/testsuite/ChangeLog:

* gfortran.dg/intent_out_22.f90: New test.
---
 gcc/fortran/trans-array.cc  |  2 +-
 gcc/fortran/trans-expr.cc   |  7 ++--
 gcc/fortran/trans-stmt.cc   |  3 +-
 gcc/fortran/trans.cc| 11 +++---
 gcc/fortran/trans.h |  7 ++--
 gcc/testsuite/gfortran.dg/intent_out_22.f90 | 37 +
 6 files changed, 55 insertions(+), 12 deletions(-)
 create mode 100644 gcc/testsuite/gfortran.dg/intent_out_22.f90

diff --git a/gcc/fortran/trans-array.cc b/gcc/fortran/trans-array.cc
index 1c2af55d436..951cecfa5d5 100644
--- a/gcc/fortran/trans-array.cc
+++ b/gcc/fortran/trans-array.cc
@@ -9472,7 +9472,7 @@ structure_alloc_comps (gfc_symbol * der_type, tree decl, 
tree dest,
 
  tmp = gfc_deallocate_with_status (comp, NULL_TREE, NULL_TREE,
NULL_TREE, NULL_TREE, true,
-   NULL, caf_dereg_mode,
+   NULL, caf_dereg_mode, NULL_TREE,
add_when_allocated, caf_token);
 
  gfc_add_expr_to_block (, tmp);
diff --git a/gcc/fortran/trans-expr.cc b/gcc/fortran/trans-expr.cc
index dbb04f8c434..8258543b456 100644
--- a/gcc/fortran/trans-expr.cc
+++ b/gcc/fortran/trans-expr.cc
@@ -6706,9 +6706,10 @@ gfc_conv_procedure_call (gfc_se * se, gfc_symbol * sym,
  if (e->ts.type == BT_CLASS)
ptr = gfc_class_data_get (ptr);
 
+ tree cls = parmse.class_container;
  tmp = gfc_deallocate_scalar_with_status (ptr, NULL_TREE,
   NULL_TREE, true,
-  e, e->ts);
+  e, e->ts, cls);
  gfc_add_expr_to_block (, tmp);
  tmp = fold_build2_loc (input_location, MODIFY_EXPR,
 void_type_node, ptr,
@@ -6900,10 +6901,12 @@ gfc_conv_procedure_call (gfc_se * se, gfc_symbol * sym,
  ptr = parmse.expr;
  ptr = gfc_class_data_get (ptr);
 
+ tree cls = parmse.class_container;
  tmp = gfc_deallocate_with_status (ptr, NULL_TREE,
NULL_TREE, NULL_TREE,
NULL_TREE, true, e,
-   GFC_CAF_COARRAY_NOCOARRAY);
+   GFC_CAF_COARRAY_NOCOARRAY,
+   cls);
  gfc_add_expr_to_block (, tmp);
  tmp = fold_build2_loc (input_location, MODIFY_EXPR,
 void_type_node, ptr,
diff --git a/gcc/fortran/trans-stmt.cc b/gcc/fortran/trans-stmt.cc
index 7e768343a57..93f36bfb955 100644
--- a/gcc/fortran/trans-stmt.cc
+++ b/gcc/fortran/trans-stmt.cc
@@ -7462,7 +7462,8 @@ gfc_trans_deallocate (gfc_code *code)
{
  tmp = gfc_deallocate_scalar_with_status (se.expr, pstat, label_finish,
   false, al->expr,
-  al->expr->ts, is_coarray);
+  al->expr->ts, NULL_TREE,
+  is_coarray);
  gfc_add_expr_to_block (, tmp);
 
  /* Set to zero after deallocation.  */
diff --git a/gcc/fortran/trans.cc b/gcc/fortran/trans.cc
index 18965b9cbd2..569fad45031 100644
--- a/gcc/fortran/trans.cc
+++ b/gcc/fortran/trans.cc
@@ -1777,8 +1777,8 @@ tree
 gfc_deallocate_with_status (tree pointer, tree status, tree errmsg,
tree errlen, tree label_finish,
bool can_fail, gfc_expr* expr,
-   int 

[PATCH] c++: mangling template-id of unknown template [PR110524]

2023-07-13 Thread Patrick Palka via Gcc-patches
Bootstrapped and regtested on x86_64-pc-linux-gnu, does this look OK
for trunk and perhaps 13?

-- >8 --

This fixes a crash when mangling an ADL-enabled call to a template-id
naming an unknown template (as per P0846R0).

PR c++/110524

gcc/cp/ChangeLog:

* mangle.cc (write_expression): Handle TEMPLATE_ID_EXPR
whose template is already an IDENTIFIER_NODE.

gcc/testsuite/ChangeLog:

* g++.dg/cpp2a/fn-template26.C: New test.
---
 gcc/cp/mangle.cc   |  3 ++-
 gcc/testsuite/g++.dg/cpp2a/fn-template26.C | 16 
 2 files changed, 18 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/g++.dg/cpp2a/fn-template26.C

diff --git a/gcc/cp/mangle.cc b/gcc/cp/mangle.cc
index 7dab4e62bc9..bef0fda6d22 100644
--- a/gcc/cp/mangle.cc
+++ b/gcc/cp/mangle.cc
@@ -3312,7 +3312,8 @@ write_expression (tree expr)
   else if (TREE_CODE (expr) == TEMPLATE_ID_EXPR)
 {
   tree fn = TREE_OPERAND (expr, 0);
-  fn = OVL_NAME (fn);
+  if (!identifier_p (fn))
+   fn = OVL_NAME (fn);
   if (IDENTIFIER_ANY_OP_P (fn))
write_string ("on");
   write_unqualified_id (fn);
diff --git a/gcc/testsuite/g++.dg/cpp2a/fn-template26.C 
b/gcc/testsuite/g++.dg/cpp2a/fn-template26.C
new file mode 100644
index 000..d4a17eb9bd1
--- /dev/null
+++ b/gcc/testsuite/g++.dg/cpp2a/fn-template26.C
@@ -0,0 +1,16 @@
+// PR c++/110524
+// { dg-do compile { target c++20 } }
+
+template<typename T>
+auto f(T t) -> decltype(g<T>(t));
+
+namespace N {
+  struct A { };
+  template<typename T> void g(T);
+};
+
+int main() {
+  f(N::A{});
+}
+
+// { dg-final { scan-assembler "_Z1fIN1N1AEEDTcl1gIT_Efp_EES2_" } }
-- 
2.41.0.327.gaa9166bcc0



Re: [PATCH V7] RISC-V: RISC-V: Support gather_load/scatter RVV auto-vectorization

2023-07-13 Thread Jeff Law via Gcc-patches




On 7/13/23 01:47, Richard Biener wrote:

On Thu, Jul 13, 2023 at 1:30 AM 钟居哲  wrote:


I notice vectorizable_call in Loop Vectorizer.
It's vectorizing CALL function for example like fmax/fmin.
 From my understanding, we dont have RVV instruction for fmax/fmin?


There's things like .POPCOUNT which we can vectorize, but sure, it
depends on the ISA if there's anything.
Right.  And RV has some of these -- vcpop, vfirst...  Supporting them 
obviously isn't a requirement for a vector implementation, but they're 
nice to have :-)


Jeff


RE: [PATCH V2] SSA MATH: Support COND_LEN_FMA for floating-point math optimization

2023-07-13 Thread Li, Pan2 via Gcc-patches
Committed, thanks Richard.

Pan

-Original Message-
From: Gcc-patches  On Behalf 
Of Richard Biener via Gcc-patches
Sent: Thursday, July 13, 2023 6:51 PM
To: Ju-Zhe Zhong 
Cc: gcc-patches@gcc.gnu.org; richard.sandif...@arm.com
Subject: Re: [PATCH V2] SSA MATH: Support COND_LEN_FMA for floating-point math 
optimization

On Thu, 13 Jul 2023, juzhe.zh...@rivai.ai wrote:

> From: Ju-Zhe Zhong 
> 
> Hi, Richard and Richi.
> 
> In the previous patch we supported COND_LEN_* binary operations. However, we
> didn't support COND_LEN_* ternary operations.
> 
> Now, this patch supports COND_LEN_* ternary operations. Consider the following case:
> 
> #define TEST_TYPE(TYPE)   
>  \
>   __attribute__ ((noipa)) void ternop_##TYPE (TYPE *__restrict dst,   
>  \
> TYPE *__restrict a,  \
> TYPE *__restrict b,\
> TYPE *__restrict c, int n)   \
>   {   
>  \
> for (int i = 0; i < n; i++)   
>  \
>   dst[i] += a[i] * b[i];  
>\
>   }
> 
> #define TEST_ALL() TEST_TYPE (double)
> 
> TEST_ALL ()
> 
> Before this patch:
> ...
> COND_LEN_MUL
> COND_LEN_ADD
> 
> After this patch:
> ...
> COND_LEN_FMA

OK.

Thanks,
Richard.

> gcc/ChangeLog:
> 
> * genmatch.cc (commutative_op): Add COND_LEN_*
> * internal-fn.cc (first_commutative_argument): Ditto.
> (CASE): Ditto.
> (get_unconditional_internal_fn): Ditto.
> (can_interpret_as_conditional_op_p): Ditto.
> (internal_fn_len_index): Ditto.
> * internal-fn.h (can_interpret_as_conditional_op_p): Ditto.
> * tree-ssa-math-opts.cc (convert_mult_to_fma_1): Ditto.
> (convert_mult_to_fma): Ditto.
> (math_opts_dom_walker::after_dom_children): Ditto.
> 
> ---
>  gcc/genmatch.cc   | 13 ++
>  gcc/internal-fn.cc| 87 ++-
>  gcc/internal-fn.h |  2 +-
>  gcc/tree-ssa-math-opts.cc | 80 +--
>  4 files changed, 159 insertions(+), 23 deletions(-)
> 
> diff --git a/gcc/genmatch.cc b/gcc/genmatch.cc
> index 5fceeec9780..2302f2a7ff0 100644
> --- a/gcc/genmatch.cc
> +++ b/gcc/genmatch.cc
> @@ -559,6 +559,19 @@ commutative_op (id_base *id)
>case CFN_COND_FMS:
>case CFN_COND_FNMA:
>case CFN_COND_FNMS:
> +  case CFN_COND_LEN_ADD:
> +  case CFN_COND_LEN_MUL:
> +  case CFN_COND_LEN_MIN:
> +  case CFN_COND_LEN_MAX:
> +  case CFN_COND_LEN_FMIN:
> +  case CFN_COND_LEN_FMAX:
> +  case CFN_COND_LEN_AND:
> +  case CFN_COND_LEN_IOR:
> +  case CFN_COND_LEN_XOR:
> +  case CFN_COND_LEN_FMA:
> +  case CFN_COND_LEN_FMS:
> +  case CFN_COND_LEN_FNMA:
> +  case CFN_COND_LEN_FNMS:
>   return 1;
>  
>default:
> diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> index c11123a1173..e698f0bffc7 100644
> --- a/gcc/internal-fn.cc
> +++ b/gcc/internal-fn.cc
> @@ -4191,6 +4191,19 @@ first_commutative_argument (internal_fn fn)
>  case IFN_COND_FMS:
>  case IFN_COND_FNMA:
>  case IFN_COND_FNMS:
> +case IFN_COND_LEN_ADD:
> +case IFN_COND_LEN_MUL:
> +case IFN_COND_LEN_MIN:
> +case IFN_COND_LEN_MAX:
> +case IFN_COND_LEN_FMIN:
> +case IFN_COND_LEN_FMAX:
> +case IFN_COND_LEN_AND:
> +case IFN_COND_LEN_IOR:
> +case IFN_COND_LEN_XOR:
> +case IFN_COND_LEN_FMA:
> +case IFN_COND_LEN_FMS:
> +case IFN_COND_LEN_FNMA:
> +case IFN_COND_LEN_FNMS:
>return 1;
>  
>  default:
> @@ -4330,11 +4343,14 @@ conditional_internal_fn_code (internal_fn ifn)
>  {
>switch (ifn)
>  {
> -#define CASE(CODE, IFN) case IFN_COND_##IFN: return CODE;
> -  FOR_EACH_CODE_MAPPING(CASE)
> +#define CASE(CODE, IFN)  
>   \
> +  case IFN_COND_##IFN:   
>   \
> +  case IFN_COND_LEN_##IFN:   
>   \
> +return CODE;
> +  FOR_EACH_CODE_MAPPING (CASE)
>  #undef CASE
> -default:
> -  return ERROR_MARK;
> +  default:
> + return ERROR_MARK;
>  }
>  }
>  
> @@ -4433,6 +4449,18 @@ get_unconditional_internal_fn (internal_fn ifn)
> operating elementwise if the operands are vectors.  This includes
> the case of an all-true COND, so that the operation always happens.
>  
> +   There is an alternative approach to interpret the STMT when the operands
> +   are vectors which is the operation predicated by both conditional mask
> +   and loop control length, the equivalent C code:
> +
> + for (int i = 0; i < NUNTIS; i++)
> +  {
> + if (i < LEN + BIAS && COND[i])
> +   LHS[i] = A[i] CODE B[i];
> + else
> +   LHS[i] = 

Re: [PATCH V7] RISC-V: RISC-V: Support gather_load/scatter RVV auto-vectorization

2023-07-13 Thread Robin Dapp via Gcc-patches
  From my understanding, we dont have RVV instruction for fmax/fmin?
> 
> Unless I'm misunderstanding, we do.  The ISA manual says
> 
> === Vector Floating-Point MIN/MAX Instructions
> 
> The vector floating-point `vfmin` and `vfmax` instructions have the
> same behavior as the corresponding scalar floating-point instructions
> in version 2.2 of the RISC-V F/D/Q extension: they perform the 
> `minimumNumber`
> or `maximumNumber` operation on active elements.
> 
> 
> # Floating-point minimum
> vfmin.vv vd, vs2, vs1, vm   # Vector-vector
> vfmin.vf vd, vs2, rs1, vm   # vector-scalar
> 
> # Floating-point maximum
> vfmax.vv vd, vs2, vs1, vm   # Vector-vector
> vfmax.vf vd, vs2, rs1, vm   # vector-scalar
> 
> 
> so we should be able to match at least some loops.

We're already emitting those (e.g. for a[i] = a[i] > b[i] ? a[i] : b[i])
but for fmin/fmax they are not wired up yet (as opposed to the scalar variants).
Juzhe are you referring to something else?  I mean it's always a bit tricky
for backends to verify if the fmin/fmax behavior exactly matches the instruction
regards signaling nans, rounding etc but if the scalar variant is fine
I don't see why the vector variant would be worse. 

Regards
 Robin



Re: Re: [PATCH] RISC-V: Enable COND_LEN_FMA auto-vectorization

2023-07-13 Thread Kito Cheng via Gcc-patches
I haven’t tried it locally yet, but it sounds like… the code size might be
larger than in the normal case?

juzhe.zh...@rivai.ai 於 2023年7月13日 週四,19:50寫道:

> Could you tell me how to add the comment?
> I am not familiar with link/binutils stuff.
>
> --
> juzhe.zh...@rivai.ai
>
>
> *From:* Robin Dapp 
> *Date:* 2023-07-13 19:40
> *To:* Juzhe-Zhong ; gcc-patches
> 
> *CC:* rdapp.gcc ; kito.cheng ;
> kito.cheng ; palmer ; palmer
> ; jeffreyalaw 
> *Subject:* Re: [PATCH] RISC-V: Enable COND_LEN_FMA auto-vectorization
> Hi Juzhe,
>
> thanks, no complaints from my side apart from one:
>
> > +/* { dg-additional-options "-mcmodel=medany" } */
>
> Please add a comment why we need this.
>
> Regards
> Robin
>
>
>


Re: [PATCH V7] RISC-V: RISC-V: Support gather_load/scatter RVV auto-vectorization

2023-07-13 Thread Jeff Law via Gcc-patches




On 7/12/23 17:30, 钟居哲 wrote:

I notice vectorizable_call in Loop Vectorizer.
It's vectorizing CALL function for example like fmax/fmin.
 From my understanding, we dont have RVV instruction for fmax/fmin?

So for now, I don't need to support builtin call function vectorization 
for RVV.

Am I right?

Yes, you are correct.



I am wondering whether we do have some kind of builtin function call 
vectorization by using RVV instructions.
It can be advantageous, even if the call doesn't collapse down to a 
single vector instruction.  Consider libmvec which is an API to provide 
things like sin, cos, exp, log, etc in vector form.


Once the library routines are written, those can then be exposed to the 
compiler in turn allowing vectorization of loops with a subset of calls 
such as sin, cos, pow, log, etc.


jeff


[PATCH v2 2/2] ifcvt: Allow more operations in multiple set if conversion

2023-07-13 Thread Manolis Tsamis
Currently the operations allowed for if conversion of a basic block with
multiple sets are few, namely REG, SUBREG and CONST_INT (as controlled by
bb_ok_for_noce_convert_multiple_sets).

This commit allows more operations (arithmetic, compare, etc) to participate
in if conversion. The target's profitability hook and ifcvt's costing are
expected to reject sequences that are unprofitable.

This is especially useful for targets which provide a rich selection of
conditional instructions (like aarch64 which has cinc, csneg, csinv, ccmp, ...)
which are currently not used in basic blocks with more than a single set.

gcc/ChangeLog:

* ifcvt.cc (try_emit_cmove_seq): Modify comments.
(noce_convert_multiple_sets_1): Modify comments.
(bb_ok_for_noce_convert_multiple_sets): Allow more operations.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/ifcvt_multiple_sets_arithm.c: New test.

Signed-off-by: Manolis Tsamis 
---

Changes in v2:
- Change "conditional moves" to "conditional instructions"
in bb_ok_for_noce_convert_multiple_sets's comment.

 gcc/ifcvt.cc  | 60 +++--
 .../aarch64/ifcvt_multiple_sets_arithm.c  | 67 +++
 2 files changed, 108 insertions(+), 19 deletions(-)
 create mode 100644 
gcc/testsuite/gcc.target/aarch64/ifcvt_multiple_sets_arithm.c

diff --git a/gcc/ifcvt.cc b/gcc/ifcvt.cc
index 3273aeca125..be29403a5b5 100644
--- a/gcc/ifcvt.cc
+++ b/gcc/ifcvt.cc
@@ -3215,13 +3215,13 @@ try_emit_cmove_seq (struct noce_if_info *if_info, rtx 
temp,
 /* We have something like:
 
  if (x > y)
-   { i = a; j = b; k = c; }
+   { i = EXPR_A; j = EXPR_B; k = EXPR_C; }
 
Make it:
 
- tmp_i = (x > y) ? a : i;
- tmp_j = (x > y) ? b : j;
- tmp_k = (x > y) ? c : k;
+ tmp_i = (x > y) ? EXPR_A : i;
+ tmp_j = (x > y) ? EXPR_B : j;
+ tmp_k = (x > y) ? EXPR_C : k;
  i = tmp_i;
  j = tmp_j;
  k = tmp_k;
@@ -3637,11 +3637,10 @@ noce_convert_multiple_sets_1 (struct noce_if_info 
*if_info,
 
 
 
-/* Return true iff basic block TEST_BB is comprised of only
-   (SET (REG) (REG)) insns suitable for conversion to a series
-   of conditional moves.  Also check that we have more than one set
-   (other routines can handle a single set better than we would), and
-   fewer than PARAM_MAX_RTL_IF_CONVERSION_INSNS sets.  While going
+/* Return true iff basic block TEST_BB is suitable for conversion to a
+   series of conditional instructions.  Also check that we have more than
+   one set (other routines can handle a single set better than we would),
+   and fewer than PARAM_MAX_RTL_IF_CONVERSION_INSNS sets.  While going
through the insns store the sum of their potential costs in COST.  */
 
 static bool
@@ -3667,20 +3666,43 @@ bb_ok_for_noce_convert_multiple_sets (basic_block 
test_bb, unsigned *cost)
   rtx dest = SET_DEST (set);
   rtx src = SET_SRC (set);
 
-  /* We can possibly relax this, but for now only handle REG to REG
-(including subreg) moves.  This avoids any issues that might come
-from introducing loads/stores that might violate data-race-freedom
-guarantees.  */
-  if (!REG_P (dest))
+  /* Do not handle anything involving memory loads/stores since it might
+violate data-race-freedom guarantees.  */
+  if (!REG_P (dest) || contains_mem_rtx_p (src))
return false;
 
-  if (!((REG_P (src) || CONSTANT_P (src))
-   || (GET_CODE (src) == SUBREG && REG_P (SUBREG_REG (src))
- && subreg_lowpart_p (src
+  /* Allow a wide range of operations and let the costing function decide
+if the conversion is worth it later.  */
+  enum rtx_code code = GET_CODE (src);
+  if (!(CONSTANT_P (src)
+   || code == REG
+   || code == SUBREG
+   || code == ZERO_EXTEND
+   || code == SIGN_EXTEND
+   || code == NOT
+   || code == NEG
+   || code == PLUS
+   || code == MINUS
+   || code == AND
+   || code == IOR
+   || code == MULT
+   || code == ASHIFT
+   || code == ASHIFTRT
+   || code == NE
+   || code == EQ
+   || code == GE
+   || code == GT
+   || code == LE
+   || code == LT
+   || code == GEU
+   || code == GTU
+   || code == LEU
+   || code == LTU
+   || code == COMPARE))
return false;
 
-  /* Destination must be appropriate for a conditional write.  */
-  if (!noce_operand_ok (dest))
+  /* Destination and source must be appropriate.  */
+  if (!noce_operand_ok (dest) || !noce_operand_ok (src))
return false;
 
   /* We must be able to conditionally move in this mode.  */
diff --git a/gcc/testsuite/gcc.target/aarch64/ifcvt_multiple_sets_arithm.c 
b/gcc/testsuite/gcc.target/aarch64/ifcvt_multiple_sets_arithm.c
new file mode 100644
index 

[PATCH v2 1/2] ifcvt: handle sequences that clobber flags in noce_convert_multiple_sets

2023-07-13 Thread Manolis Tsamis
This is an extension of what was done in PR106590.

Currently if a sequence generated in noce_convert_multiple_sets clobbers the
condition rtx (cc_cmp or rev_cc_cmp) then only seq1 is used afterwards
(sequences that emit the comparison itself). Since this applies only from the
next iteration it assumes that the sequences generated (in particular seq2)
don't clobber the condition rtx itself before using it in the if_then_else,
which is only true in specific cases (currently only register/subregister moves
are allowed).

This patch changes this so it also tests if seq2 clobbers cc_cmp/rev_cc_cmp in
the current iteration. This makes it possible to include arithmetic operations
in noce_convert_multiple_sets.

gcc/ChangeLog:

* ifcvt.cc (check_for_cc_cmp_clobbers): Use modified_in_p instead.
(noce_convert_multiple_sets_1): Don't use seq2 if it clobbers cc_cmp.

Signed-off-by: Manolis Tsamis 
---

(no changes since v1)

 gcc/ifcvt.cc | 49 +++--
 1 file changed, 19 insertions(+), 30 deletions(-)

diff --git a/gcc/ifcvt.cc b/gcc/ifcvt.cc
index a0af553b9ff..3273aeca125 100644
--- a/gcc/ifcvt.cc
+++ b/gcc/ifcvt.cc
@@ -3375,20 +3375,6 @@ noce_convert_multiple_sets (struct noce_if_info *if_info)
   return true;
 }
 
-/* Helper function for noce_convert_multiple_sets_1.  If store to
-   DEST can affect P[0] or P[1], clear P[0].  Called via note_stores.  */
-
-static void
-check_for_cc_cmp_clobbers (rtx dest, const_rtx, void *p0)
-{
-  rtx *p = (rtx *) p0;
-  if (p[0] == NULL_RTX)
-return;
-  if (reg_overlap_mentioned_p (dest, p[0])
-  || (p[1] && reg_overlap_mentioned_p (dest, p[1])))
-p[0] = NULL_RTX;
-}
-
 /* This goes through all relevant insns of IF_INFO->then_bb and tries to
create conditional moves.  In case a simple move sufficis the insn
should be listed in NEED_NO_CMOV.  The rewired-src cases should be
@@ -3552,9 +3538,17 @@ noce_convert_multiple_sets_1 (struct noce_if_info 
*if_info,
 creating an additional compare for each.  If successful, costing
 is easier and this sequence is usually preferred.  */
   if (cc_cmp)
-   seq2 = try_emit_cmove_seq (if_info, temp, cond,
-  new_val, old_val, need_cmov,
-  , _dest2, cc_cmp, rev_cc_cmp);
+   {
+ seq2 = try_emit_cmove_seq (if_info, temp, cond,
+new_val, old_val, need_cmov,
+, _dest2, cc_cmp, rev_cc_cmp);
+
+ /* The if_then_else in SEQ2 may be affected when cc_cmp/rev_cc_cmp is
+clobbered.  We can't safely use the sequence in this case.  */
+ if (seq2 && (modified_in_p (cc_cmp, seq2)
+ || (rev_cc_cmp && modified_in_p (rev_cc_cmp, seq2
+   seq2 = NULL;
+   }
 
   /* The backend might have created a sequence that uses the
 condition.  Check this.  */
@@ -3609,21 +3603,16 @@ noce_convert_multiple_sets_1 (struct noce_if_info 
*if_info,
  return false;
}
 
-  if (cc_cmp)
+  if (cc_cmp && seq == seq1)
{
- /* Check if SEQ can clobber registers mentioned in
-cc_cmp and/or rev_cc_cmp.  If yes, we need to use
-only seq1 from that point on.  */
- rtx cc_cmp_pair[2] = { cc_cmp, rev_cc_cmp };
- for (walk = seq; walk; walk = NEXT_INSN (walk))
+ /* Check if SEQ can clobber registers mentioned in cc_cmp/rev_cc_cmp.
+If yes, we need to use only seq1 from that point on.
+Only check when we use seq1 since we have already tested seq2.  */
+ if (modified_in_p (cc_cmp, seq)
+ || (rev_cc_cmp && modified_in_p (rev_cc_cmp, seq)))
{
- note_stores (walk, check_for_cc_cmp_clobbers, cc_cmp_pair);
- if (cc_cmp_pair[0] == NULL_RTX)
-   {
- cc_cmp = NULL_RTX;
- rev_cc_cmp = NULL_RTX;
- break;
-   }
+ cc_cmp = NULL_RTX;
+ rev_cc_cmp = NULL_RTX;
}
}
 
-- 
2.34.1



Re: [PATCH v2] Implement new RTL optimizations pass: fold-mem-offsets.

2023-07-13 Thread Manolis Tsamis
On Wed, Jul 12, 2023 at 5:14 PM Jeff Law  wrote:
>
>
>
> On 7/12/23 03:12, Manolis Tsamis wrote:
> > On Mon, Jul 10, 2023 at 12:58 AM Hans-Peter Nilsson  
> > wrote:
> >>
> >> On Sun, 9 Jul 2023, Hans-Peter Nilsson wrote:
> >>
> >>> On Thu, 15 Jun 2023, Manolis Tsamis wrote:
> >>>
>  This is a new RTL pass that tries to optimize memory offset calculations
>  by moving them from add immediate instructions to the memory 
>  loads/stores.
> >>
> >>> It punts on all "use" insns that are not SET.
> >>> Why not use single_set there too?
> >>
> >> Also, I don't see insn costs considered?
> >> (Also: typo "immidiate".)
> >>
> >
> > The only change that this pass does is to change offsets where
> > possible and then simplify add immediate instructions to register
> > moves.
> > I don't see how this could result in worse performance and by
> > extension I don't see where insn costs could be used.
> > Do you have any thoughts about where to use the costs?
> If the offset crosses an architectural size boundary such that the
> instruction was longer, but still valid, it could affect the cost.
>
Ok, I haven't thought about that. I will try a prototype in case we
want to include it in a next iteration of this.

> That's the most obvious case to me.  There may be others.
>
> Any progress on that m68k issue?  I've also got a report of x264 failing
> to build on riscv64 with the V2 variant, but I haven't distilled that
> down to a testcase yet.
>
I have sent a V3 which contains a number of fixes and improvements:
https://gcc.gnu.org/pipermail/gcc-patches/2023-July/624439.html
I tested the new version rebased on master and the m68k issue did not reproduce.
I don't know what exactly fixed it; do we need to know why or is it
enough that the issue is gone following some general fixes?
It is highly possible that this also fixes the x264 failure. Please
let me know if the issue persists with v3 once you're able to test.

Manolis

> jeff


Re: Re: [PATCH V7] RISC-V: RISC-V: Support gather_load/scatter RVV auto-vectorization

2023-07-13 Thread 钟居哲
No, I just want to know whether we have any CALL vectorization that needs len
or mask predication.

For example, in current GCC vectorization only the FMAX/FMIN/FMA/FNMA/FMS/FNMS
CALL functions need length or mask predication. I don't care about
sin/cos/popcount...etc. For those, just using full-vector autovectorization
is fine; there is no need to add RVV support for them to the middle-end.



juzhe.zh...@rivai.ai
 
From: Robin Dapp
Date: 2023-07-13 22:32
To: Palmer Dabbelt; gcc-patches
CC: rdapp.gcc; richard.guenther; juzhe.zhong; Kito Cheng; kito.cheng
Subject: Re: [PATCH V7] RISC-V: RISC-V: Support gather_load/scatter RVV 
auto-vectorization
  From my understanding, we dont have RVV instruction for fmax/fmin?
> 
> Unless I'm misunderstanding, we do.  The ISA manual says
> 
> === Vector Floating-Point MIN/MAX Instructions
> 
> The vector floating-point `vfmin` and `vfmax` instructions have the
> same behavior as the corresponding scalar floating-point instructions
> in version 2.2 of the RISC-V F/D/Q extension: they perform the 
> `minimumNumber`
> or `maximumNumber` operation on active elements.
> 
> 
> # Floating-point minimum
> vfmin.vv vd, vs2, vs1, vm   # Vector-vector
> vfmin.vf vd, vs2, rs1, vm   # vector-scalar
> 
> # Floating-point maximum
> vfmax.vv vd, vs2, vs1, vm   # Vector-vector
> vfmax.vf vd, vs2, rs1, vm   # vector-scalar
> 
> 
> so we should be able to match at least some loops.
 
We're already emitting those (e.g. for a[i] = a[i] > b[i] ? a[i] : b[i])
but for fmin/fmax they are not wired up yet (as opposed to the scalar variants).
Juzhe are you referring to something else?  I mean it's always a bit tricky
for backends to verify if the fmin/fmax behavior exactly matches the instruction
regards signaling nans, rounding etc but if the scalar variant is fine
I don't see why the vector variant would be worse. 
 
Regards
Robin
 
 


Re: [RFC] light expander sra for parameters and returns

2023-07-13 Thread Jiufu Guo via Gcc-patches


Hi Martin,

Jiufu Guo via Gcc-patches  writes:

> Hi,
>
> Martin Jambor  writes:
>
>> Hi,
>>
>> On Tue, May 30 2023, Richard Biener wrote:
>>> On Mon, 29 May 2023, Jiufu Guo wrote:
>>>
 Hi,
 
 Previously, I was investigating some struct parameters and returns related
 PRs 69143/65421/108073.
 
 Investigating the issues case by case, and drafting patches for each of
 them one by one. This would help us to enhance code incrementally.
 While, this way, patches would interact with each other and implement
 different codes for similar issues (because of the different paths in
 gimple/rtl).  We may have a common fix for those issues.
 
 We know a few other related PRs(such as meta-bug PR101926) exist. For those
 PRs in different targets with different symptoms (and also different root
 cause), I would expect a method could help some of them, but it may
 be hard to handle all of them in one fix.
 
 With investigation and check discussion for the issues, I remember a
 suggestion from Richard: it would be nice to perform some SRA-like analysis
 for the accesses on the structs (parameter/returns).
 https://gcc.gnu.org/pipermail/gcc-patches/2022-November/605117.html
 This may be a 'fairly common method' for those issues. With this idea,
 I drafted a patch as below in this mail.
 
 I also thought about directly using tree-sra.cc, e.g. enhance it and rerun 
 it
 at the end of GIMPLE passes. While since some issues are introduced inside
 the expander, so below patch also co-works with other parts of the 
 expander.
 And since we already have tree-sra in gimple pass, we only need to take 
 more
 care on parameter and return in this patch: other decls could be handled
 well in tree-sra.
 
 The steps of this patch are:
 1. Collect struct type parameters and returns, and then scan the function 
 to
 get the accesses on them. And figure out the accesses which would be 
 profitable
 to be scalarized (using registers of the parameter/return ). Now, reading 
 on
 parameter and writing on returns are checked in the current patch.
 2. When/after the scalar registers are determined/expanded for the return 
 or
 parameters, compute the corresponding scalar register(s) for each accesses 
 of
 the return/parameter, and prepare the scalar RTLs for those accesses.
 3. When using/expanding the accesses expression, leverage the 
 computed/prepared
 scalars directly.
 
 This patch is tested on ppc64 both LE and BE.
 To continue, I would ask for comments and suggestions first. And then I 
 would
 update/enhance accordingly.  Thanks in advance!
>>>
>>> Thanks for working on this - the description above sounds exactly like
>>> what should be done.
>>>
>>> Now - I'd like the code to re-use the access tree data structure from
>>> SRA plus at least the worker creating the accesses from a stmt.
>>

I'm thinking about which part of the code can be re-used from
ipa-sra and tree-sra.
It seems there are some similar concepts between them:
"access with offset/size", "collect and check candidates",
"analyze accesses"...

While because the purposes are different, the logic and behavior
between them (ipa-sra, tree-sra, and expander-sra) are different,
even for similar concepts.

The same behavior and similar concept may be reusable. Below list
may be part of them.
*. allocate and maintain access
   basic access structure: offset, size, reverse
*. type or expr checking
*. disqualify
*. scan and build expr access
*. scan and walk stmts (return/assign/call/asm)
*. collect candidates
*. initialize/deinitialize
*. access dump

There are different behaviors for a similar concept.
For examples:
*. Access has grg/queues in tree-sra, access has nonarg in ipa-sra,
and expander-sra does not check access's child/sibling yet.
*. for same stmt(assign/call), different sra checks different logic.
*. candidates have different checking logic: ipa-sra checks more stuff.

Does this align with your thoughts?  Thanks for your comments!

BR,
Jeff (Jiufu Guo)

> Thanks Martin for your reply and thanks for your time!
>
>> I have had a first look at the patch but still need to look into it more
>> to understand how it uses the information it gathers.
>>
>> My plan is to make the access-tree infrastructure of IPA-SRA more
>> generic and hopefully usable even for this purpose, rather than the one
>> in tree-sra.cc.  But that really builds a tree of accesses, bailing out
>> on any partial overlaps, for example, which may not be the right thing
>> here since I don't see any tree-building here.
>
> Yeap, both in tree-sra and ipa-sra, there are concepts about
> "access" and "scan functions/stmts". In this light-sra, these concepts
> are also used. And you may notice that ipa-sra and tree-sra have more
> logic than the current 'light-expand-sra'.
>
> Currently, the 

[PATCH v2 0/2] ifcvt: Allow if conversion of arithmetic in basic blocks with multiple sets

2023-07-13 Thread Manolis Tsamis


noce_convert_multiple_sets has been introduced and extended over time to handle
if conversion for blocks with multiple sets. Currently this is focused on
register moves and rejects any sort of arithmetic operations.

This series is an extension to allow more sequences to take part in if
conversion. The first patch is a required change to emit correct code and the
second patch whitelists a larger number of operations through
bb_ok_for_noce_convert_multiple_sets.

For targets that have a rich selection of conditional instructions,
like aarch64, I have seen an ~5x increase of profitable if conversions for
multiple set blocks in SPEC benchmarks. Also tested with a wide variety of
benchmarks and I have not seen performance regressions on either x64 / aarch64.

Some samples that previously resulted in a branch but now better use these
instructions can be seen in the provided test case.

Tested on aarch64 and x64; On x64 some tests that use __builtin_rint are
failing with an ICE but I believe that it's not an issue of this change.
force_operand crashes when (and:DF (not:DF (reg:DF 88)) (reg/v:DF 83 [ x ]))
is provided through emit_conditional_move.


Changes in v2:
- Change "conditional moves" to "conditional instructions"
in bb_ok_for_noce_convert_multiple_sets's comment.

Manolis Tsamis (2):
  ifcvt: handle sequences that clobber flags in
noce_convert_multiple_sets
  ifcvt: Allow more operations in multiple set if conversion

 gcc/ifcvt.cc  | 109 ++
 .../aarch64/ifcvt_multiple_sets_arithm.c  |  67 +++
 2 files changed, 127 insertions(+), 49 deletions(-)
 create mode 100644 
gcc/testsuite/gcc.target/aarch64/ifcvt_multiple_sets_arithm.c

-- 
2.34.1



Re: [PATCH 2/2] ifcvt: Allow more operations in multiple set if conversion

2023-07-13 Thread Manolis Tsamis
I resent this with just the change in the comment.
OK to merge?

Manolis

On Tue, Jul 4, 2023 at 5:32 PM Manolis Tsamis  wrote:
>
> On Mon, Jul 3, 2023 at 12:12 PM Robin Dapp  wrote:
> >
> > Hi Manolis,
> >
> > that looks like a nice enhancement of what's already possible.  The concern
> > I had some years back already was that this function would eventually
> > grow and cannibalize on some of what the other functions in ifcvt already
> > do :)  At some point we really should unify but that's not within the
> > scope of this patch.
> >
>
> Hi Robin,
>
> Indeed and it would be nice to extend the multi statement
> implementation to the point that the others are not needed :)
> I have some future plans to analyze cases where the multi-statement
> performs worse and improve on that.
>
> > IMHO we're already pretty far towards general "conditional execution"
> > with conditional increments, selects and so on (and the function is still
> > called "_noce") and historically the cond_exec functions would have
> > taken care of that.  To my knowledge though, none of the major backends
> > implements anything like (cond_exec ...) anymore and relies on bit-twiddling
> > tricks to generate the conditional instructions.
> >
> > Have you checked whether cond_exec and others could be adjusted to
> > handle the conditional instructions you want to see?  They don't perform
> > full cost comparison though but just count.
> >
>
> Thanks for mentioning that, I was not really aware of cond_exec usage.
> As you say, it looks like cond_exec isn't used very much on major backends.
>
> Since noce_convert_multiple_sets_1 is just using the existing ifcvt
> machinery (specifically noce_emit_cmove / try_emit_cmove_seq), is this
> a question of whether we want to replace (if_then_else ...) with
> (cond_exec ...) in general?
> If that is beneficial then I could try to implement a change like
> this, but that should probably be a separate effort from this
> implementation.
>
> > I would expect a bit of discussion around that but from a first look
> > I don't have major concerns.
> >
> > > -/* Return true iff basic block TEST_BB is comprised of only
> > > -   (SET (REG) (REG)) insns suitable for conversion to a series
> > > -   of conditional moves.  Also check that we have more than one set
> > > -   (other routines can handle a single set better than we would), and
> > > -   fewer than PARAM_MAX_RTL_IF_CONVERSION_INSNS sets.  While going
> > > +/* Return true iff basic block TEST_BB is suitable for conversion to a
> > > +   series of conditional moves.  Also check that we have more than one
> >
> > Might want to change the "conditional moves" while you're at it.
> >
>
> Thanks for pointing out this comment, I missed it. I will rewrite the
> relevant parts.
>
> > >
> > > -  if (!((REG_P (src) || CONSTANT_P (src))
> > > - || (GET_CODE (src) == SUBREG && REG_P (SUBREG_REG (src))
> > > -   && subreg_lowpart_p (src
> > > +  /* Allow a wide range of operations and let the costing function 
> > > decide
> > > +  if the conversion is worth it later.  */
> > > +  enum rtx_code code = GET_CODE (src);
> > > +  if (!(CONSTANT_P (src)
> > > + || code == REG
> > > + || code == SUBREG
> > > + || code == ZERO_EXTEND
> > > + || code == SIGN_EXTEND
> > > + || code == NOT
> > > + || code == NEG
> > > + || code == PLUS
> > > + || code == MINUS
> > > + || code == AND
> > > + || code == IOR
> > > + || code == MULT
> > > + || code == ASHIFT
> > > + || code == ASHIFTRT
> > > + || code == NE
> > > + || code == EQ
> > > + || code == GE
> > > + || code == GT
> > > + || code == LE
> > > + || code == LT
> > > + || code == GEU
> > > + || code == GTU
> > > + || code == LEU
> > > + || code == LTU
> > > + || code == COMPARE))
> >
> > We're potentially checking many more patterns than before.  Maybe it
> > would make sense to ask the backend whether it has a pattern for
> > the respective code?
> >
>
> Is it an issue if the backend doesn't have a pattern for a respective code?
>
> My goal here is to not limit if conversion for sequences based on the
> code but rather let ifcvt / the backend decide based on costing.
> That's because from what I've seen, conditional set instructions can
> be beneficial even when the backend doesn't have a specific
> instruction for that code.
>
> Best,
> Manolis
>
> > Regards
> >  Robin
> >


[pushed][RA][PR109520]: Catch error when there are no enough registers for asm insn

2023-07-13 Thread Vladimir Makarov via Gcc-patches

The following patch solves

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109520

The patch was successfully bootstrapped and tested on x86-64, aarch64, 
and ppc64le.


commit b175b4887f928118af997f6d4d75097a64dcec5d
Author: Vladimir N. Makarov 
Date:   Thu Jul 13 10:42:17 2023 -0400

[RA][PR109520]: Catch error when there are no enough registers for asm insn

An asm insn, unlike other insns, can have so many operands that their
constraints cannot be satisfied.  This results in LRA cycling for such a
test case.  The following patch catches this situation and reports the
problem.

PR middle-end/109520

gcc/ChangeLog:

* lra-int.h (lra_insn_recog_data): Add member asm_reloads_num.
(lra_asm_insn_error): New prototype.
* lra.cc: Include rtl_error.h.
(lra_set_insn_recog_data): Initialize asm_reloads_num.
(lra_asm_insn_error): New func whose code is taken from ...
* lra-assigns.cc (lra_split_hard_reg_for): ... here.  Use lra_asm_insn_error.
* lra-constraints.cc (curr_insn_transform): Check reloads number for asm.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr109520.c: New test.

diff --git a/gcc/lra-assigns.cc b/gcc/lra-assigns.cc
index 2f95121df06..3555926af66 100644
--- a/gcc/lra-assigns.cc
+++ b/gcc/lra-assigns.cc
@@ -1851,20 +1851,8 @@ lra_split_hard_reg_for (void)
   insn = lra_insn_recog_data[u]->insn;
   if (asm_noperands (PATTERN (insn)) >= 0)
 	{
-	  lra_asm_error_p = asm_p = true;
-	  error_for_asm (insn,
-			 "% operand has impossible constraints");
-	  /* Avoid further trouble with this insn.  */
-	  if (JUMP_P (insn))
-	{
-	  ira_nullify_asm_goto (insn);
-	  lra_update_insn_regno_info (insn);
-	}
-	  else
-	{
-	  PATTERN (insn) = gen_rtx_USE (VOIDmode, const0_rtx);
-	  lra_set_insn_deleted (insn);
-	}
+	  asm_p = true;
+	  lra_asm_insn_error (insn);
 	}
   else if (!asm_p)
 	{
diff --git a/gcc/lra-constraints.cc b/gcc/lra-constraints.cc
index 9bfc88149ff..0c6912d6e7d 100644
--- a/gcc/lra-constraints.cc
+++ b/gcc/lra-constraints.cc
@@ -4813,6 +4813,10 @@ curr_insn_transform (bool check_only_p)
   lra_update_operator_dups (curr_id);
   /* Something changes -- process the insn.	 */
   lra_update_insn_regno_info (curr_insn);
+  if (asm_noperands (PATTERN (curr_insn)) >= 0
+	  && ++curr_id->asm_reloads_num >= FIRST_PSEUDO_REGISTER)
+	/* Most probably there are no enough registers to satisfy asm insn: */
+	lra_asm_insn_error (curr_insn);
 }
   lra_process_new_insns (curr_insn, before, after, "Inserting insn reload");
   return change_p;
diff --git a/gcc/lra-int.h b/gcc/lra-int.h
index 4dbe6672f3a..a32359e5772 100644
--- a/gcc/lra-int.h
+++ b/gcc/lra-int.h
@@ -209,6 +209,9 @@ public:
  debug insn.  LRA_NON_CLOBBERED_ALT means ignoring any earlier
  clobbers for the insn.  */
   int used_insn_alternative;
+  /* Defined for asm insn and it is how many times we already generated reloads
+ for the asm insn.  */
+  int asm_reloads_num;
   /* SP offset before the insn relative to one at the func start.  */
   poly_int64 sp_offset;
   /* The insn itself.  */
@@ -307,6 +310,7 @@ extern void lra_delete_dead_insn (rtx_insn *);
 extern void lra_emit_add (rtx, rtx, rtx);
 extern void lra_emit_move (rtx, rtx);
 extern void lra_update_dups (lra_insn_recog_data_t, signed char *);
+extern void lra_asm_insn_error (rtx_insn *insn);
 
 extern void lra_process_new_insns (rtx_insn *, rtx_insn *, rtx_insn *,
    const char *);
diff --git a/gcc/lra.cc b/gcc/lra.cc
index c8b3f139acd..563aff10b96 100644
--- a/gcc/lra.cc
+++ b/gcc/lra.cc
@@ -106,6 +106,7 @@ along with GCC; see the file COPYING3.	If not see
 #include "backend.h"
 #include "target.h"
 #include "rtl.h"
+#include "rtl-error.h"
 #include "tree.h"
 #include "predict.h"
 #include "df.h"
@@ -536,6 +537,27 @@ lra_update_dups (lra_insn_recog_data_t id, signed char *nops)
 	*id->dup_loc[i] = *id->operand_loc[nop];
 }
 
+/* Report asm insn error and modify the asm insn.  */
+void
+lra_asm_insn_error (rtx_insn *insn)
+{
+  lra_asm_error_p = true;
+  error_for_asm (insn,
+		 "% operand has impossible constraints"
+		 " or there are not enough registers");
+  /* Avoid further trouble with this insn.  */
+  if (JUMP_P (insn))
+{
+  ira_nullify_asm_goto (insn);
+  lra_update_insn_regno_info (insn);
+}
+  else
+{
+  PATTERN (insn) = gen_rtx_USE (VOIDmode, const0_rtx);
+  lra_set_insn_deleted (insn);
+}
+}
+
 
 
 /* This page contains code dealing with info about registers in the
@@ -973,6 +995,7 @@ lra_set_insn_recog_data (rtx_insn *insn)
   lra_insn_recog_data[uid] = data;
   data->insn = insn;
   data->used_insn_alternative = LRA_UNKNOWN_ALT;
+  data->asm_reloads_num = 0;
   data->icode = icode;
   data->regs = NULL;
   if (DEBUG_INSN_P (insn))
diff --git a/gcc/testsuite/gcc.target/i386/pr109520.c 

Re: [PATCH V7] RISC-V: RISC-V: Support gather_load/scatter RVV auto-vectorization

2023-07-13 Thread Palmer Dabbelt
On Thu, 13 Jul 2023 07:01:26 PDT (-0700), gcc-patches@gcc.gnu.org wrote:
>
>
> On 7/13/23 01:47, Richard Biener wrote:
>> On Thu, Jul 13, 2023 at 1:30 AM 钟居哲  wrote:
>>>
>>> I notice vectorizable_call in Loop Vectorizer.
>>> It's vectorizing CALL function for example like fmax/fmin.
>>>  From my understanding, we don't have an RVV instruction for fmax/fmin?

Unless I'm misunderstanding, we do.  The ISA manual says

=== Vector Floating-Point MIN/MAX Instructions

The vector floating-point `vfmin` and `vfmax` instructions have the
same behavior as the corresponding scalar floating-point instructions
in version 2.2 of the RISC-V F/D/Q extension: they perform the
`minimumNumber` or `maximumNumber` operation on active elements.


# Floating-point minimum
vfmin.vv vd, vs2, vs1, vm   # Vector-vector
vfmin.vf vd, vs2, rs1, vm   # vector-scalar

# Floating-point maximum
vfmax.vv vd, vs2, vs1, vm   # Vector-vector
vfmax.vf vd, vs2, rs1, vm   # vector-scalar


so we should be able to match at least some loops.

>>
>> There's things like .POPCOUNT which we can vectorize, but sure, it
>> depends on the ISA if there's anything.
> Right.  And RV has some of these -- vcpop, vfirst...  Supporting them
> obviously isn't a requirement for a vector implementation, but they're
> nice to have :-)
>
> Jeff


[PATCH v3] Implement new RTL optimizations pass: fold-mem-offsets.

2023-07-13 Thread Manolis Tsamis
This is a new RTL pass that tries to optimize memory offset calculations
by moving them from add immediate instructions to the memory loads/stores.
For example it can transform this:

  addi t4,sp,16
  add  t2,a6,t4
  shl  t3,t2,1
  ld   a2,0(t3)
  addi a2,1
  sd   a2,8(t2)

into the following (one instruction less):

  add  t2,a6,sp
  shl  t3,t2,1
  ld   a2,32(t3)
  addi a2,1
  sd   a2,24(t2)

Although there are places where this is done already, this pass is more
powerful and can handle the more difficult cases that are currently not
optimized. Also, it runs late enough and can optimize away unnecessary
stack pointer calculations.

gcc/ChangeLog:

* Makefile.in: Add fold-mem-offsets.o.
* passes.def: Schedule a new pass.
* tree-pass.h (make_pass_fold_mem_offsets): Declare.
* common.opt: New options.
* doc/invoke.texi: Document new option.
* fold-mem-offsets.cc: New file.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/fold-mem-offsets-1.c: New test.
* gcc.target/riscv/fold-mem-offsets-2.c: New test.
* gcc.target/riscv/fold-mem-offsets-3.c: New test.

Signed-off-by: Manolis Tsamis 
---

Changes in v3:
- Added propagation for more codes:
  sub, neg, mul.
- Added folding / elimination for sub and
  const int moves.
- For the validity check of the generated addresses
  also test memory_address_addr_space_p.
- Replaced GEN_INT with gen_int_mode.
- Replaced some bitmap_head with auto_bitmap.
- Refactor each phase into own function for readability.
- Add dump details.
- Replace rtx iteration with reg_mentioned_p.
- Return early for codes that we can't propagate through.

 gcc/Makefile.in   |   1 +
 gcc/common.opt|   4 +
 gcc/doc/invoke.texi   |   8 +
 gcc/fold-mem-offsets.cc   | 749 ++
 gcc/passes.def|   1 +
 .../gcc.target/riscv/fold-mem-offsets-1.c |  16 +
 .../gcc.target/riscv/fold-mem-offsets-2.c |  24 +
 .../gcc.target/riscv/fold-mem-offsets-3.c |  17 +
 gcc/tree-pass.h   |   1 +
 9 files changed, 821 insertions(+)
 create mode 100644 gcc/fold-mem-offsets.cc
 create mode 100644 gcc/testsuite/gcc.target/riscv/fold-mem-offsets-1.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/fold-mem-offsets-2.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/fold-mem-offsets-3.c

diff --git a/gcc/Makefile.in b/gcc/Makefile.in
index c478ec85201..6a5c2915133 100644
--- a/gcc/Makefile.in
+++ b/gcc/Makefile.in
@@ -1430,6 +1430,7 @@ OBJS = \
fixed-value.o \
fold-const.o \
fold-const-call.o \
+   fold-mem-offsets.o \
function.o \
function-abi.o \
function-tests.o \
diff --git a/gcc/common.opt b/gcc/common.opt
index 25f650e2dae..901947f1db5 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -1248,6 +1248,10 @@ fcprop-registers
 Common Var(flag_cprop_registers) Optimization
 Perform a register copy-propagation optimization pass.
 
+ffold-mem-offsets
+Target Bool Var(flag_fold_mem_offsets) Init(1)
+Fold instructions calculating memory offsets to the memory access instruction 
if possible.
+
 fcrossjumping
 Common Var(flag_crossjumping) Optimization
 Perform cross-jumping optimization.
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index cbc1282c274..dc4e6922bb5 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -539,6 +539,7 @@ Objective-C and Objective-C++ Dialects}.
 -fauto-inc-dec  -fbranch-probabilities
 -fcaller-saves
 -fcombine-stack-adjustments  -fconserve-stack
+-ffold-mem-offsets
 -fcompare-elim  -fcprop-registers  -fcrossjumping
 -fcse-follow-jumps  -fcse-skip-blocks  -fcx-fortran-rules
 -fcx-limited-range
@@ -14293,6 +14294,13 @@ the comparison operation before register allocation is 
complete.
 
 Enabled at levels @option{-O1}, @option{-O2}, @option{-O3}, @option{-Os}.
 
+@opindex ffold-mem-offsets
+@item -ffold-mem-offsets
+@itemx -fno-fold-mem-offsets
+Try to eliminate add instructions by folding them in memory loads/stores.
+
+Enabled at levels @option{-O2}, @option{-O3}.
+
 @opindex fcprop-registers
 @item -fcprop-registers
 After register allocation and post-register allocation instruction splitting,
diff --git a/gcc/fold-mem-offsets.cc b/gcc/fold-mem-offsets.cc
new file mode 100644
index 000..a27c9ab18a4
--- /dev/null
+++ b/gcc/fold-mem-offsets.cc
@@ -0,0 +1,749 @@
+/* Late RTL pass to fold memory offsets.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT 

Re: Re: [PATCH] RISC-V: Enable COND_LEN_FMA auto-vectorization

2023-07-13 Thread 钟居哲
Yes. Not always fail.



juzhe.zh...@rivai.ai
 
From: Kito Cheng
Date: 2023-07-13 22:39
To: juzhe.zh...@rivai.ai
CC: Robin Dapp; gcc-patches; jeffreyalaw; kito.cheng; palmer; palmer
Subject: Re: Re: [PATCH] RISC-V: Enable COND_LEN_FMA auto-vectorization
I didn’t try it locally yet, but it sounds like … the code size might be
larger than in the normal case?

juzhe.zh...@rivai.ai 於 2023年7月13日 週四,19:50寫道:
Could you tell me how to add the comment?
I am not familiar with link/binutils stuff.



juzhe.zh...@rivai.ai
 
From: Robin Dapp
Date: 2023-07-13 19:40
To: Juzhe-Zhong; gcc-patches
CC: rdapp.gcc; kito.cheng; kito.cheng; palmer; palmer; jeffreyalaw
Subject: Re: [PATCH] RISC-V: Enable COND_LEN_FMA auto-vectorization
Hi Juzhe,
 
thanks, no complaints from my side apart from one:
 
> +/* { dg-additional-options "-mcmodel=medany" } */
 
Please add a comment why we need this.
 
Regards
Robin
 


Re: [PATCH v3] Implement new RTL optimizations pass: fold-mem-offsets.

2023-07-13 Thread Manolis Tsamis
In this version I have made f-m-o able to also eliminate constant
moves in addition to the add constant instructions.
This increases the number of simplified/eliminated instructions and is
a good addition for RISC style ISAs where these are more common.

This has led to pr52146.c failing in x86, which I haven't been able to
find a way to fix.
This involves directly writing to a constant address with -mx32

The code
    movl    $-18874240, %eax
    movl    $0, (%eax)

is 'optimized' to
    movl    $0, %eax
    movl    $0, -18874240(%eax)

Which is actually
    movl    $0, -18874240

which is wrong per the ticket.
The fix for the ticket involved changes to legitimate_address_p which
f-m-o does call but it doesn't reject due to the existence of (%eax)
which in turn is actually zero.
I believe this is not strictly an f-m-o issue since the pass calls all
the required functions to test whether the newly synthesized memory
instruction is valid.

Any ideas on how to solve this issue are appreciated.

Manolis

On Thu, Jul 13, 2023 at 5:13 PM Manolis Tsamis  wrote:
>
> This is a new RTL pass that tries to optimize memory offset calculations
> by moving them from add immediate instructions to the memory loads/stores.
> For example it can transform this:
>
>   addi t4,sp,16
>   add  t2,a6,t4
>   shl  t3,t2,1
>   ld   a2,0(t3)
>   addi a2,1
>   sd   a2,8(t2)
>
> into the following (one instruction less):
>
>   add  t2,a6,sp
>   shl  t3,t2,1
>   ld   a2,32(t3)
>   addi a2,1
>   sd   a2,24(t2)
>
> Although there are places where this is done already, this pass is more
> powerful and can handle the more difficult cases that are currently not
> optimized. Also, it runs late enough and can optimize away unnecessary
> stack pointer calculations.
>
> gcc/ChangeLog:
>
> * Makefile.in: Add fold-mem-offsets.o.
> * passes.def: Schedule a new pass.
> * tree-pass.h (make_pass_fold_mem_offsets): Declare.
> * common.opt: New options.
> * doc/invoke.texi: Document new option.
> * fold-mem-offsets.cc: New file.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/riscv/fold-mem-offsets-1.c: New test.
> * gcc.target/riscv/fold-mem-offsets-2.c: New test.
> * gcc.target/riscv/fold-mem-offsets-3.c: New test.
>
> Signed-off-by: Manolis Tsamis 
> ---
>
> Changes in v3:
> - Added propagation for more codes:
>   sub, neg, mul.
> - Added folding / elimination for sub and
>   const int moves.
> - For the validity check of the generated addresses
>   also test memory_address_addr_space_p.
> - Replaced GEN_INT with gen_int_mode.
> - Replaced some bitmap_head with auto_bitmap.
> - Refactor each phase into own function for readability.
> - Add dump details.
> - Replace rtx iteration with reg_mentioned_p.
> - Return early for codes that we can't propagate through.
>
>  gcc/Makefile.in   |   1 +
>  gcc/common.opt|   4 +
>  gcc/doc/invoke.texi   |   8 +
>  gcc/fold-mem-offsets.cc   | 749 ++
>  gcc/passes.def|   1 +
>  .../gcc.target/riscv/fold-mem-offsets-1.c |  16 +
>  .../gcc.target/riscv/fold-mem-offsets-2.c |  24 +
>  .../gcc.target/riscv/fold-mem-offsets-3.c |  17 +
>  gcc/tree-pass.h   |   1 +
>  9 files changed, 821 insertions(+)
>  create mode 100644 gcc/fold-mem-offsets.cc
>  create mode 100644 gcc/testsuite/gcc.target/riscv/fold-mem-offsets-1.c
>  create mode 100644 gcc/testsuite/gcc.target/riscv/fold-mem-offsets-2.c
>  create mode 100644 gcc/testsuite/gcc.target/riscv/fold-mem-offsets-3.c
>
> diff --git a/gcc/Makefile.in b/gcc/Makefile.in
> index c478ec85201..6a5c2915133 100644
> --- a/gcc/Makefile.in
> +++ b/gcc/Makefile.in
> @@ -1430,6 +1430,7 @@ OBJS = \
> fixed-value.o \
> fold-const.o \
> fold-const-call.o \
> +   fold-mem-offsets.o \
> function.o \
> function-abi.o \
> function-tests.o \
> diff --git a/gcc/common.opt b/gcc/common.opt
> index 25f650e2dae..901947f1db5 100644
> --- a/gcc/common.opt
> +++ b/gcc/common.opt
> @@ -1248,6 +1248,10 @@ fcprop-registers
>  Common Var(flag_cprop_registers) Optimization
>  Perform a register copy-propagation optimization pass.
>
> +ffold-mem-offsets
> +Target Bool Var(flag_fold_mem_offsets) Init(1)
> +Fold instructions calculating memory offsets to the memory access 
> instruction if possible.
> +
>  fcrossjumping
>  Common Var(flag_crossjumping) Optimization
>  Perform cross-jumping optimization.
> diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
> index cbc1282c274..dc4e6922bb5 100644
> --- a/gcc/doc/invoke.texi
> +++ b/gcc/doc/invoke.texi
> @@ -539,6 +539,7 @@ Objective-C and Objective-C++ Dialects}.
>  -fauto-inc-dec  -fbranch-probabilities
>  -fcaller-saves
>  

Re: [PATCH] RISC-V: Enable COND_LEN_FMA auto-vectorization

2023-07-13 Thread Robin Dapp via Gcc-patches


> Is COND_LEN_FMA ok for trunk?  I can commit it without changing
> the scatter store testcase fix.
> 
> It makes no sense to block COND_LEN_FMA support. The middle end support
> has already been merged.

Then just add a TODO or so that says e.g. "For some reason we exceed
the default code model's +-2 GiB limits.  We should investigate why and
add a proper description here.  For now just make sure the test case
compiles properly".

Regards
 Robin



[committed] alpha: Fix computation mode in alpha_emit_set_long_const [PR106966]

2023-07-13 Thread Uros Bizjak via Gcc-patches
PR target/106966

gcc/ChangeLog:

* config/alpha/alpha.cc (alpha_emit_set_long_const):
Always use DImode when constructing long const.

gcc/testsuite/ChangeLog:

* gcc.target/alpha/pr106966.c: New test.

Bootstrapped and regression tested by Matthias on alpha-linux-gnu.

Uros.
diff --git a/gcc/config/alpha/alpha.cc b/gcc/config/alpha/alpha.cc
index 360b50e20d4..beeab06a1aa 100644
--- a/gcc/config/alpha/alpha.cc
+++ b/gcc/config/alpha/alpha.cc
@@ -2070,6 +2070,8 @@ static rtx
 alpha_emit_set_long_const (rtx target, HOST_WIDE_INT c1)
 {
   HOST_WIDE_INT d1, d2, d3, d4;
+  machine_mode mode = GET_MODE (target);
+  rtx orig_target = target;
 
   /* Decompose the entire word */
 
@@ -2082,6 +2084,9 @@ alpha_emit_set_long_const (rtx target, HOST_WIDE_INT c1)
   d4 = ((c1 & 0x) ^ 0x8000) - 0x8000;
   gcc_assert (c1 == d4);
 
+  if (mode != DImode)
+target = gen_lowpart (DImode, target);
+
   /* Construct the high word */
   if (d4)
 {
@@ -2101,7 +2106,7 @@ alpha_emit_set_long_const (rtx target, HOST_WIDE_INT c1)
   if (d1)
 emit_move_insn (target, gen_rtx_PLUS (DImode, target, GEN_INT (d1)));
 
-  return target;
+  return orig_target;
 }
 
 /* Given an integral CONST_INT or CONST_VECTOR, return the low 64 bits.  */
diff --git a/gcc/testsuite/gcc.target/alpha/pr106966.c 
b/gcc/testsuite/gcc.target/alpha/pr106966.c
new file mode 100644
index 000..7145c2096c6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/alpha/pr106966.c
@@ -0,0 +1,13 @@
+/* PR target/106906 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mbuild-constants" } */
+
+void
+do_console (unsigned short *vga)
+{
+  vga[0] = 'H';
+  vga[1] = 'e';
+  vga[2] = 'l';
+  vga[3] = 'l';
+  vga[4] = 'o';
+}


Re: [PATCH v2] Implement new RTL optimizations pass: fold-mem-offsets.

2023-07-13 Thread Jeff Law via Gcc-patches




On 7/13/23 08:20, Manolis Tsamis wrote:



I have sent a V3 which contains a number of fixes and improvements:
https://gcc.gnu.org/pipermail/gcc-patches/2023-July/624439.html
I tested the new version rebased on master and the m68k issue did not reproduce.
I don't know what exactly fixed it; do we need to know why or is it
enough that the issue is gone following some general fixes?
It is highly possible that this also fixes the x264 failure. Please
let me know if the issue persists with v3 once you're able to test.

Sounds good.  I'll test both m68k and x264 on rv64 with the latest patch.

jeff


[x86 PATCH] PR target/110588: Add *bt_setncqi_2 to generate btl

2023-07-13 Thread Roger Sayle

This patch resolves PR target/110588 to catch another case in combine
where the i386 backend should be generating a btl instruction.  This adds
another define_insn_and_split to recognize the RTL representation for this
case.

I also noticed that two related define_insn_and_split weren't using the
preferred string style for single statement preparation-statements, so
I've reformatted these to be consistent in style with the new one.

This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
and make -k check, both with and without --target_board=unix{-m32}
with no new failures.  Ok for mainline?


2023-07-13  Roger Sayle  

gcc/ChangeLog
PR target/110588
* config/i386/i386.md (*bt_setcqi): Prefer string form
preparation statement over braces for a single statement.
(*bt_setncqi): Likewise.
(*bt_setncqi_2): New define_insn_and_split.

gcc/testsuite/ChangeLog
PR target/110588
* gcc.target/i386/pr110588.c: New test case.


Thanks again,
Roger
--

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index e47ced1..04eca049 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -16170,9 +16170,7 @@
  (const_int 0)))
(set (match_dup 0)
 (eq:QI (reg:CCC FLAGS_REG) (const_int 0)))]
-{
-  operands[2] = lowpart_subreg (SImode, operands[2], QImode);
-})
+  "operands[2] = lowpart_subreg (SImode, operands[2], QImode);")
 
 ;; Help combine recognize bt followed by setnc
 (define_insn_and_split "*bt_setncqi"
@@ -16193,9 +16191,7 @@
  (const_int 0)))
(set (match_dup 0)
 (ne:QI (reg:CCC FLAGS_REG) (const_int 0)))]
-{
-  operands[2] = lowpart_subreg (SImode, operands[2], QImode);
-})
+  "operands[2] = lowpart_subreg (SImode, operands[2], QImode);")
 
 (define_insn_and_split "*bt_setnc"
   [(set (match_operand:SWI48 0 "register_operand")
@@ -16219,6 +16215,27 @@
   operands[2] = lowpart_subreg (SImode, operands[2], QImode);
   operands[3] = gen_reg_rtx (QImode);
 })
+
+;; Help combine recognize bt followed by setnc (PR target/110588)
+(define_insn_and_split "*bt_setncqi_2"
+  [(set (match_operand:QI 0 "register_operand")
+   (eq:QI
+ (zero_extract:SWI48
+   (match_operand:SWI48 1 "register_operand")
+   (const_int 1)
+   (zero_extend:SI (match_operand:QI 2 "register_operand")))
+ (const_int 0)))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_USE_BT && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (reg:CCC FLAGS_REG)
+(compare:CCC
+ (zero_extract:SWI48 (match_dup 1) (const_int 1) (match_dup 2))
+ (const_int 0)))
+   (set (match_dup 0)
+(ne:QI (reg:CCC FLAGS_REG) (const_int 0)))]
+  "operands[2] = lowpart_subreg (SImode, operands[2], QImode);")
 
 ;; Store-flag instructions.
 
diff --git a/gcc/testsuite/gcc.target/i386/pr110588.c 
b/gcc/testsuite/gcc.target/i386/pr110588.c
new file mode 100644
index 000..4505c87
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr110588.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mtune=core2" } */
+
+unsigned char foo (unsigned char x, int y)
+{
+  int _1 = (int) x;
+  int _2 = _1 >> y;
+  int _3 = _2 & 1;
+  unsigned char _8 = (unsigned char) _3;
+  unsigned char _6 = _8 ^ 1;
+  return _6;
+}
+
+/* { dg-final { scan-assembler "btl" } } */
+/* { dg-final { scan-assembler "setnc" } } */
+/* { dg-final { scan-assembler-not "sarl" } } */
+/* { dg-final { scan-assembler-not "andl" } } */
+/* { dg-final { scan-assembler-not "xorl" } } */


Re: [PATCH] c++: redundant targ coercion for var/alias tmpls

2023-07-13 Thread Patrick Palka via Gcc-patches
On Wed, 28 Jun 2023, Patrick Palka wrote:

> On Wed, Jun 28, 2023 at 11:50 AM Jason Merrill  wrote:
> >
> > On 6/23/23 12:23, Patrick Palka wrote:
> > > On Fri, 23 Jun 2023, Jason Merrill wrote:
> > >
> > >> On 6/21/23 13:19, Patrick Palka wrote:
> > >>> When stepping through the variable/alias template specialization code
> > >>> paths, I noticed we perform template argument coercion twice: first from
> > >>> instantiate_alias_template / finish_template_variable and again from
> > >>> tsubst_decl (during instantiate_template).  It should suffice to perform
> > >>> coercion once.
> > >>>
> > >>> To that end patch elides this second coercion from tsubst_decl when
> > >>> possible.  We can't get rid of it completely because we don't always
> > >>> specialize a variable template from finish_template_variable: we could
> > >>> also be doing so directly from instantiate_template during variable
> > >>> template partial specialization selection, in which case the coercion
> > >>> from tsubst_decl would be the first and only coercion.
> > >>
> > >> Perhaps we should be coercing in lookup_template_variable rather than
> > >> finish_template_variable?
> > >
> > > Ah yes, there's a patch for that at
> > > https://gcc.gnu.org/pipermail/gcc-patches/2023-May/617377.html :)
> >
> > So after that patch, can we get rid of the second coercion completely?
> 
> On second thought it should be possible to get rid of it, if we
> rearrange things to always pass the primary arguments to tsubst_decl,
> and perform partial specialization selection from there instead of
> instantiate_template.  Let me try...

Like so?  Bootstrapped and regtested on x86_64-pc-linux-gnu.

-- >8 --

When stepping through the variable/alias template specialization code
paths, I noticed we perform template argument coercion twice: first from
instantiate_alias_template / finish_template_variable and again from
tsubst_decl (during instantiate_template).  It'd be good to avoid this
redundant coercion.

It turns out that this coercion could be safely elided whenever
specializing a primary variable/alias template, because we can rely on
lookup_template_variable and instantiate_alias_template to already have
coerced the arguments.

The other situation to consider is when fully specializing a partial
variable template specialization (from instantiate_template), in which
case the passed 'args' are the (already coerced) arguments relative to
the partial template and 'argvec', the result of substitution into
DECL_TI_ARGS, are the (uncoerced) arguments relative to the primary
template, so coercion is still necessary.  We can still avoid this
coercion however if we always pass the primary variable template to
tsubst_decl from instantiate_template, and instead perform partial
specialization selection directly from tsubst_decl.  This patch
implements this approach.

gcc/cp/ChangeLog:

* pt.cc (tsubst_decl) : Don't call
coerce_template_parms.  Call most_specialized_partial_spec
when fully specializing a variable template here ...
(instantiate_template): ... instead of here.  Always pass
the primary variable template pattern to tsubst_decl.
---
 gcc/cp/pt.cc | 62 +++-
 1 file changed, 27 insertions(+), 35 deletions(-)

diff --git a/gcc/cp/pt.cc b/gcc/cp/pt.cc
index fa15b75b9c5..53968b823d5 100644
--- a/gcc/cp/pt.cc
+++ b/gcc/cp/pt.cc
@@ -15194,6 +15194,7 @@ tsubst_decl (tree t, tree args, tsubst_flags_t complain)
/* Check to see if we already have the specialization we
   need.  */
tree spec = NULL_TREE;
+   tree partial_ti = NULL_TREE;
bool local_p = false;
tree ctx = DECL_CONTEXT (t);
if (!(VAR_P (t) && DECL_LOCAL_DECL_P (t))
@@ -15230,17 +15231,29 @@ tsubst_decl (tree t, tree args, tsubst_flags_t 
complain)
tmpl = DECL_TI_TEMPLATE (t);
gen_tmpl = most_general_template (tmpl);
argvec = tsubst (DECL_TI_ARGS (t), args, complain, in_decl);
-   if (argvec != error_mark_node
-   && PRIMARY_TEMPLATE_P (gen_tmpl)
-   && TMPL_ARGS_DEPTH (args) >= TMPL_ARGS_DEPTH (argvec))
- /* We're fully specializing a template declaration, so
-we need to coerce the innermost arguments corresponding to
-the template.  */
- argvec = (coerce_template_parms
-   (DECL_TEMPLATE_PARMS (gen_tmpl),
-argvec, tmpl, complain));
if (argvec == error_mark_node)
  RETURN (error_mark_node);
+   if (variable_template_p (gen_tmpl)
+   && TMPL_ARGS_DEPTH (args) >= TMPL_ARGS_DEPTH (argvec))
+ {
+   /* We need to determine if we're using a partial
+  specialization now, because the type of the
+  variable could be different.  */
+   

RE: [PATCH 9/19] middle-end: refactor vectorizable_comparison to make the main body re-usable.

2023-07-13 Thread Richard Biener via Gcc-patches
On Wed, 28 Jun 2023, Tamar Christina wrote:

> Adding proper maintainers.
> 
> > -Original Message-
> > From: Tamar Christina 
> > Sent: Wednesday, June 28, 2023 2:46 PM
> > To: gcc-patches@gcc.gnu.org
> > Cc: nd ; Richard Earnshaw ;
> > Marcus Shawcroft ; Kyrylo Tkachov
> > ; Richard Sandiford
> > 
> > Subject: [PATCH 9/19]AArch64 middle-end: refactor vectorizable_comparison
> > to make the main body re-usable.
> > 
> > Hi All,
> > 
> > > Vectorization of a gcond starts off essentially the same as vectorizing a
> > > comparison with the only difference being how the operands are extracted.
> > > 
> > > This refactors vectorizable_comparison such that we now have a generic
> > function that can be used from vectorizable_early_break.  The refactoring
> > splits the gassign checks and actual validation/codegen off to a helper
> > function.
> > 
> > No change in functionality expected.
> > 
> > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> > 
> > Ok for master?
> > 
> > Thanks,
> > Tamar
> > 
> > gcc/ChangeLog:
> > 
> > * tree-vect-stmts.cc (vectorizable_comparison): Refactor, splitting
> > body
> > to ...
> > (vectorizable_comparison_1): ...This.
> > 
> > --- inline copy of patch --
> > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index
> > ae24f3e66e63d9bd9763284a47fb2c911335c4c1..f3e33cd4ed125b9564ca8
> > 1acd197693fc3457c31 100644
> > --- a/gcc/tree-vect-stmts.cc
> > +++ b/gcc/tree-vect-stmts.cc
> > @@ -11332,21 +11332,22 @@ vectorizable_condition (vec_info *vinfo,
> > 
> >  /* vectorizable_comparison.
> > 
> > -   Check if STMT_INFO is comparison expression that can be vectorized.
> > +/* Helper of vectorizable_comparison.
> > +
> > +   Check if STMT_INFO is comparison expression CODE that can be vectorized.
> > If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
> > comparison, put it in VEC_STMT, and insert it at GSI.
> > 
> > Return true if STMT_INFO is vectorizable in this way.  */
> > 
> >  static bool
> > -vectorizable_comparison (vec_info *vinfo,
> > -stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
> > -gimple **vec_stmt,
> > -slp_tree slp_node, stmt_vector_for_cost *cost_vec)
> > +vectorizable_comparison_1 (vec_info *vinfo, tree vectype,
> > +  stmt_vec_info stmt_info, tree_code code,
> > +  gimple_stmt_iterator *gsi, gimple **vec_stmt,
> > +  slp_tree slp_node, stmt_vector_for_cost *cost_vec)
> >  {
> >tree lhs, rhs1, rhs2;
> >tree vectype1 = NULL_TREE, vectype2 = NULL_TREE;
> > -  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
> >tree vec_rhs1 = NULL_TREE, vec_rhs2 = NULL_TREE;
> >tree new_temp;
> >loop_vec_info loop_vinfo = dyn_cast  (vinfo); @@ -11354,7
> > +11355,7 @@ vectorizable_comparison (vec_info *vinfo,
> >int ndts = 2;
> >poly_uint64 nunits;
> >int ncopies;
> > -  enum tree_code code, bitop1 = NOP_EXPR, bitop2 = NOP_EXPR;
> > +  enum tree_code bitop1 = NOP_EXPR, bitop2 = NOP_EXPR;
> >int i;
> >bb_vec_info bb_vinfo = dyn_cast  (vinfo);
> >vec vec_oprnds0 = vNULL;
> > @@ -11377,14 +11378,6 @@ vectorizable_comparison (vec_info *vinfo,
> >  ncopies = vect_get_num_copies (loop_vinfo, vectype);
> > 
> >gcc_assert (ncopies >= 1);
> > -  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
> > -return false;
> > -
> > -  gassign *stmt = dyn_cast  (stmt_info->stmt);
> > -  if (!stmt)
> > -return false;
> > -
> > -  code = gimple_assign_rhs_code (stmt);
> > 
> >if (TREE_CODE_CLASS (code) != tcc_comparison)
> >  return false;
> > @@ -11499,7 +11492,6 @@ vectorizable_comparison (vec_info *vinfo,
> >   return false;
> > }
> > 
> > -  STMT_VINFO_TYPE (stmt_info) = comparison_vec_info_type;
> >vect_model_simple_cost (vinfo, stmt_info,
> >   ncopies * (1 + (bitop2 != NOP_EXPR)),
> >   dts, ndts, slp_node, cost_vec); @@ -11565,6
> > +11557,44 @@ vectorizable_comparison (vec_info *vinfo,
> >return true;
> >  }
> > 
> > +/* vectorizable_comparison.
> > +
> > +   Check if STMT_INFO is comparison expression that can be vectorized.
> > +   If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
> > +   comparison, put it in VEC_STMT, and insert it at GSI.
> > +
> > +   Return true if STMT_INFO is vectorizable in this way.  */
> > +
> > +static bool
> > +vectorizable_comparison (vec_info *vinfo,
> > +stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
> > +gimple **vec_stmt,
> > +slp_tree slp_node, stmt_vector_for_cost *cost_vec) {

{ to the next line

> > +  bb_vec_info bb_vinfo = dyn_cast  (vinfo);
> > +
> > +  if (!STMT_VINFO_RELEVANT_P (stmt_info) && !bb_vinfo)
> > +return false;
> > +
> > +  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
> > +return false;
> > +
> > +  

[x86_64 PATCH] Improved insv of DImode/DFmode {high, low}parts into TImode.

2023-07-13 Thread Roger Sayle

This is the next piece towards a fix for (the x86_64 ABI issues affecting)
PR 88873.  This patch generalizes the recent tweak to ix86_expand_move
for setting the highpart of a TImode reg from a DImode source using
*insvti_highpart_1, to handle both DImode and DFmode sources, and also
use the recently added *insvti_lowpart_1 for setting the lowpart.

Although this is another intermediate step (not yet a fix), towards
enabling *insvti and *concat* patterns to be candidates for TImode STV
(by using V2DI/V2DF instructions), it already improves things a little.

For the test case from PR 88873

typedef struct { double x, y; } s_t;
typedef double v2df __attribute__ ((vector_size (2 * sizeof(double))));

s_t foo (s_t a, s_t b, s_t c)
{
  return (s_t) { fma(a.x, b.x, c.x), fma (a.y, b.y, c.y) };
}


With -O2 -march=cascadelake, GCC currently generates:

Before (29 instructions):
vmovq   %xmm2, -56(%rsp)
movq    -56(%rsp), %rdx
vmovq   %xmm4, -40(%rsp)
movq    $0, -48(%rsp)
movq    %rdx, -56(%rsp)
movq    -40(%rsp), %rdx
vmovq   %xmm0, -24(%rsp)
movq    %rdx, -40(%rsp)
movq    -24(%rsp), %rsi
movq    -56(%rsp), %rax
movq    $0, -32(%rsp)
vmovq   %xmm3, -48(%rsp)
movq    -48(%rsp), %rcx
vmovq   %xmm5, -32(%rsp)
vmovq   %rax, %xmm6
movq    -40(%rsp), %rax
movq    $0, -16(%rsp)
movq    %rsi, -24(%rsp)
movq    -32(%rsp), %rsi
vpinsrq $1, %rcx, %xmm6, %xmm6
vmovq   %rax, %xmm7
vmovq   %xmm1, -16(%rsp)
vmovapd %xmm6, %xmm3
vpinsrq $1, %rsi, %xmm7, %xmm7
vfmadd132pd -24(%rsp), %xmm7, %xmm3
vmovapd %xmm3, -56(%rsp)
vmovsd  -48(%rsp), %xmm1
vmovsd  -56(%rsp), %xmm0
ret

After (20 instructions):
vmovq   %xmm2, -56(%rsp)
movq    -56(%rsp), %rax
vmovq   %xmm3, -48(%rsp)
vmovq   %xmm4, -40(%rsp)
movq    -48(%rsp), %rcx
vmovq   %xmm5, -32(%rsp)
vmovq   %rax, %xmm6
movq    -40(%rsp), %rax
movq    -32(%rsp), %rsi
vpinsrq $1, %rcx, %xmm6, %xmm6
vmovq   %xmm0, -24(%rsp)
vmovq   %rax, %xmm7
vmovq   %xmm1, -16(%rsp)
vmovapd %xmm6, %xmm2
vpinsrq $1, %rsi, %xmm7, %xmm7
vfmadd132pd -24(%rsp), %xmm7, %xmm2
vmovapd %xmm2, -56(%rsp)
vmovsd  -48(%rsp), %xmm1
vmovsd  -56(%rsp), %xmm0
ret

This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
and make -k check, both with and without --target_board=unix{-m32}
with no new failures.  No testcase yet, as the above code will hopefully
change dramatically with the next pieces.  Ok for mainline?


2023-07-13  Roger Sayle  

gcc/ChangeLog
* config/i386/i386-expand.cc (ix86_expand_move): Generalize special
case inserting of 64-bit values into a TImode register, to handle
both DImode and DFmode using either *insvti_lowpart_1
or *insvti_highpart_1.


Thanks again,
Roger
--

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 92ffa4b..fe87f8e 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -542,22 +542,39 @@ ix86_expand_move (machine_mode mode, rtx operands[])
}
 }
 
-  /* Use *insvti_highpart_1 to set highpart of TImode register.  */
+  /* Special case inserting 64-bit values into a TImode register.  */
   if (TARGET_64BIT
-  && mode == DImode
+  && (mode == DImode || mode == DFmode)
   && SUBREG_P (op0)
-  && SUBREG_BYTE (op0) == 8
   && GET_MODE (SUBREG_REG (op0)) == TImode
   && REG_P (SUBREG_REG (op0))
   && REG_P (op1))
 {
-  wide_int mask = wi::mask (64, false, 128);
-  rtx tmp = immed_wide_int_const (mask, TImode);
-  op0 = SUBREG_REG (op0);
-  tmp = gen_rtx_AND (TImode, copy_rtx (op0), tmp);
-  op1 = gen_rtx_ZERO_EXTEND (TImode, op1);
-  op1 = gen_rtx_ASHIFT (TImode, op1, GEN_INT (64));
-  op1 = gen_rtx_IOR (TImode, tmp, op1);
+  /* Use *insvti_lowpart_1 to set lowpart.  */
+  if (SUBREG_BYTE (op0) == 0)
+   {
+ wide_int mask = wi::mask (64, true, 128);
+ rtx tmp = immed_wide_int_const (mask, TImode);
+ op0 = SUBREG_REG (op0);
+ tmp = gen_rtx_AND (TImode, copy_rtx (op0), tmp);
+ if (mode == DFmode)
+   op1 = force_reg (DImode, gen_lowpart (DImode, op1));
+ op1 = gen_rtx_ZERO_EXTEND (TImode, op1);
+ op1 = gen_rtx_IOR (TImode, tmp, op1);
+   }
+  /* Use *insvti_highpart_1 to set highpart.  */
+  else if (SUBREG_BYTE (op0) == 8)
+   {
+ wide_int mask = wi::mask (64, false, 128);
+ rtx tmp = immed_wide_int_const (mask, TImode);
+ op0 = SUBREG_REG (op0);
+ tmp = gen_rtx_AND (TImode, copy_rtx (op0), tmp);
+ if (mode == DFmode)
+   op1 = force_reg (DImode, gen_lowpart (DImode, op1));
+

  1   2   >