[PATCH v5] libgfortran: Replace mutex with rwlock

2023-05-08 Thread Lipeng Zhu via Gcc-patches
This patch tries to introduce the rwlock and split the read/write to
unit_root tree and unit_cache with rwlock instead of the mutex to
increase CPU efficiency. In the get_gfc_unit function, the percentage
to step into the insert_unit function is around 30%, in most instances,
we can get the unit in the phase of reading the unit_cache or unit_root
tree. So split the read/write phase by rwlock would be an approach to
make it more parallel.

BTW, the IPC metrics can gain around 9x in our test
server with 220 cores. The benchmark we used is
https://github.com/rwesson/NEAT

libgcc/ChangeLog:

* gthr-posix.h (__GTHREAD_RWLOCK_INIT): New macro
(__gthrw): New function
(__gthread_rwlock_rdlock): New function
(__gthread_rwlock_tryrdlock): New function
(__gthread_rwlock_wrlock): New function
(__gthread_rwlock_trywrlock): New function
(__gthread_rwlock_unlock): New function

libgfortran/ChangeLog:

* io/async.c (DEBUG_LINE): New
* io/async.h (RWLOCK_DEBUG_ADD): New macro
(CHECK_RDLOCK): New macro
(CHECK_WRLOCK): New macro
(TAIL_RWLOCK_DEBUG_QUEUE): New macro
(IN_RWLOCK_DEBUG_QUEUE): New macro
(RDLOCK): New macro
(WRLOCK): New macro
(RWUNLOCK): New macro
(RD_TO_WRLOCK): New macro
(INTERN_RDLOCK): New macro
(INTERN_WRLOCK): New macro
(INTERN_RWUNLOCK): New macro
* io/io.h (internal_proto): Define unit_rwlock
* io/transfer.c (st_read_done_worker): Replace unit_lock with unit_rwlock
(st_write_done_worker): Replace unit_lock with unit_rwlock
* io/unit.c (get_gfc_unit): Replace unit_lock with unit_rwlock
(if): Replace unit_lock with unit_rwlock
(close_unit_1): Replace unit_lock with unit_rwlock
(close_units): Replace unit_lock with unit_rwlock
(newunit_alloc): Replace unit_lock with unit_rwlock
* io/unix.c (flush_all_units): Replace unit_lock with unit_rwlock

---
v1 -> v2:
Limit the pthread_rwlock usage in libgcc only when __cplusplus isn't defined.

v2 -> v3:
Rebase the patch with trunk branch.

v3 -> v4:
Update the comments.

v4 -> v5:
Fix typos and code formatter.

Reviewed-by: Hongjiu Lu 
Signed-off-by: Lipeng Zhu 
---
 libgcc/gthr-posix.h   |  60 +++
 libgfortran/io/async.c|   4 +
 libgfortran/io/async.h| 154 +-
 libgfortran/io/io.h   |  15 ++--
 libgfortran/io/transfer.c |   8 +-
 libgfortran/io/unit.c |  71 +++---
 libgfortran/io/unix.c |  16 ++--
 7 files changed, 281 insertions(+), 47 deletions(-)

diff --git a/libgcc/gthr-posix.h b/libgcc/gthr-posix.h
index aebcfdd9f4c..73283082997 100644
--- a/libgcc/gthr-posix.h
+++ b/libgcc/gthr-posix.h
@@ -48,6 +48,9 @@ typedef pthread_t __gthread_t;
 typedef pthread_key_t __gthread_key_t;
 typedef pthread_once_t __gthread_once_t;
 typedef pthread_mutex_t __gthread_mutex_t;
+#ifndef __cplusplus
+typedef pthread_rwlock_t __gthread_rwlock_t;
+#endif
 typedef pthread_mutex_t __gthread_recursive_mutex_t;
 typedef pthread_cond_t __gthread_cond_t;
 typedef struct timespec __gthread_time_t;
@@ -58,6 +61,9 @@ typedef struct timespec __gthread_time_t;
 
 #define __GTHREAD_MUTEX_INIT PTHREAD_MUTEX_INITIALIZER
 #define __GTHREAD_MUTEX_INIT_FUNCTION __gthread_mutex_init_function
+#ifndef __cplusplus
+#define __GTHREAD_RWLOCK_INIT PTHREAD_RWLOCK_INITIALIZER
+#endif
 #define __GTHREAD_ONCE_INIT PTHREAD_ONCE_INIT
 #if defined(PTHREAD_RECURSIVE_MUTEX_INITIALIZER)
 #define __GTHREAD_RECURSIVE_MUTEX_INIT PTHREAD_RECURSIVE_MUTEX_INITIALIZER
@@ -135,6 +141,13 @@ __gthrw(pthread_mutexattr_init)
 __gthrw(pthread_mutexattr_settype)
 __gthrw(pthread_mutexattr_destroy)
 
+#ifndef __cplusplus
+__gthrw(pthread_rwlock_rdlock)
+__gthrw(pthread_rwlock_tryrdlock)
+__gthrw(pthread_rwlock_wrlock)
+__gthrw(pthread_rwlock_trywrlock)
+__gthrw(pthread_rwlock_unlock)
+#endif
 
 #if defined(_LIBOBJC) || defined(_LIBOBJC_WEAK)
 /* Objective-C.  */
@@ -885,6 +898,53 @@ __gthread_cond_destroy (__gthread_cond_t* __cond)
   return __gthrw_(pthread_cond_destroy) (__cond);
 }
 
+#ifndef __cplusplus
+static inline int
+__gthread_rwlock_rdlock (__gthread_rwlock_t *__rwlock)
+{
+  if (__gthread_active_p ())
+return __gthrw_(pthread_rwlock_rdlock) (__rwlock);
+  else
+return 0;
+}
+
+static inline int
+__gthread_rwlock_tryrdlock (__gthread_rwlock_t *__rwlock)
+{
+  if (__gthread_active_p ())
+return __gthrw_(pthread_rwlock_tryrdlock) (__rwlock);
+  else
+return 0;
+}
+
+static inline int
+__gthread_rwlock_wrlock (__gthread_rwlock_t *__rwlock)
+{
+  if (__gthread_active_p ())
+return __gthrw_(pthread_rwlock_wrlock) (__rwlock);
+  else
+return 0;
+}
+
+static inline int
+__gthread_rwlock_trywrlock (__gthread_rwlock_t *__rwlock)
+{
+  if (__gthread_active_p ())
+return __gthrw_(pthread_rwlock_trywrlock) (__rwlock);
+  else
+return 0;
+}
+
+static inline int
+__gthread_rwlock_unlock (__gthread_rwlock_t *__rwlock)
+{
+  if (__gthread_active_p ())
+return __gthrw_(pthread_rwlock_unlock) (__rwlock);
+  else
+ 

Re: [PATCH v4] libgfortran: Replace mutex with rwlock

2023-05-08 Thread Zhu, Lipeng via Gcc-patches




On 1/1/1970 8:00 AM, Bernhard Reutner-Fischer wrote:

On Mon,  8 May 2023 17:44:43 +0800
Lipeng Zhu  wrote:


This patch tries to introduce the rwlock and split the read/write to
unit_root tree and unit_cache with rwlock instead of the mutex to
increase CPU efficiency. In the get_gfc_unit function, the percentage
to step into the insert_unit function is around 30%, in most
instances, we can get the unit in the phase of reading the unit_cache
or unit_root tree. So split the read/write phase by rwlock would be an
approach to make it more parallel.

BTW, the IPC metrics can gain around 9x in our test server with 220
cores. The benchmark we used is https://github.com/rwesson/NEAT


See commentary typos below.
You did not state if you regression tested the patch?
I use valgrind --tool=helgrind or --tool=drd to test 'make 
check-fortran'. Is it necessary to add an additional unit test for this 
patch?



Other than that it LGTM but i cannot approve it.
Thank you for your kind help for this patch, is there anything that I 
can do or can you help to push this patch forward?





diff --git a/libgfortran/io/async.h b/libgfortran/io/async.h index
ad226c8e856..0033cc74252 100644
--- a/libgfortran/io/async.h
+++ b/libgfortran/io/async.h
@@ -210,6 +210,128 @@
  DEBUG_PRINTF ("%s" DEBUG_RED "ACQ:" DEBUG_NORM " %-30s %78p\n", 
aio_prefix, #mutex,

mutex); \

Thanks, corrected in Patch v5.


} while (0)
  
+#ifdef __GTHREAD_RWLOCK_INIT

+#define RWLOCK_DEBUG_ADD(rwlock) do {  \
+aio_rwlock_debug *n;   \
+n = xmalloc (sizeof(aio_rwlock_debug));\


Missing space before the open brace: sizeof (


Thanks, corrected in Patch v5.


diff --git a/libgfortran/io/unit.c b/libgfortran/io/unit.c index
82664dc5f98..62f1db21d34 100644
--- a/libgfortran/io/unit.c
+++ b/libgfortran/io/unit.c
@@ -33,34 +33,36 @@ see the files COPYING3 and COPYING.RUNTIME
respectively.  If not, see
  
  
  /* IO locking rules:

-   UNIT_LOCK is a master lock, protecting UNIT_ROOT tree and UNIT_CACHE.
+   UNIT_RWLOCK is a master lock, protecting UNIT_ROOT tree and UNIT_CACHE.
+   And use the rwlock to spilt read and write phase to UNIT_ROOT tree
+   and UNIT_CACHE to increase CPU efficiency.


s/spilt/split. Maybe:

Using an rwlock improves efficiency by allowing us to separate readers and 
writers of both UNIT_ROOT
and UNIT_CACHE.


Thanks, corrected in Patch v5.


@@ -350,6 +356,17 @@ retry:
if (c == 0)
break;
  }
+  /* We did not find a unit in the cache nor in the unit list, create a new
+(locked) unit and insert into the unit list and cache.
+Manipulating either or both the unit list and the unit cache requires to
+hold a write-lock [for obvious reasons]:
+1. By separating the read/write lock, it will greatly reduce the contention
+   at the read part, while write part is not always necessary or most
+   unlikely once the unit hit in cache.


+By separating the read/write lock, we will greatly reduce the contention
+on the read part, while the write part is unlikely once the unit hits
+the cache.


+2. We try to balance the implementation complexity and the performance
+   gains that fit into current cases we observed by just using a
+   pthread_rwlock. */


Let's drop 2.


Got it, thanks!

thanks,


[PATCH] RISC-V: Fix dead loop for user vsetvli intrinsic avl checking [PR109773]

2023-05-08 Thread juzhe . zhong
From: Juzhe-Zhong 

This patch fixes a dead loop in the vsetvl intrinsic AVL checking.

vsetvli->get_def () has a vsetvli whose get_def () in turn has a vsetvli again.
Then it will keep looping in the vsetvli AVL checking, which is a dead loop.

PR target/109773

gcc/ChangeLog:

* config/riscv/riscv-vsetvl.cc (avl_source_has_vsetvl_p): New function.
(source_equal_p): Fix dead loop in vsetvl avl checking.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/vsetvl/pr109773-1.c: New test.
* gcc.target/riscv/rvv/vsetvl/pr109773-2.c: New test.

---
 gcc/config/riscv/riscv-vsetvl.cc  | 25 ++
 .../gcc.target/riscv/rvv/vsetvl/pr109773-1.c  | 20 ++
 .../gcc.target/riscv/rvv/vsetvl/pr109773-2.c  | 26 +++
 3 files changed, 71 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr109773-1.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr109773-2.c

diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc
index 72aa2bfcf6f..2577b2bd9b7 100644
--- a/gcc/config/riscv/riscv-vsetvl.cc
+++ b/gcc/config/riscv/riscv-vsetvl.cc
@@ -1056,6 +1056,24 @@ change_vsetvl_insn (const insn_info *insn, const 
vector_insn_info )
   change_insn (rinsn, new_pat);
 }
 
+static bool
+avl_source_has_vsetvl_p (set_info *avl_source)
+{
+  if (!avl_source)
+return false;
+  if (!avl_source->insn ())
+return false;
+  if (avl_source->insn ()->is_real ())
+return vsetvl_insn_p (avl_source->insn ()->rtl ());
+  hash_set sets = get_all_sets (avl_source, true, false, true);
+  for (const auto set : sets)
+{
+  if (set->insn ()->is_real () && vsetvl_insn_p (set->insn ()->rtl ()))
+   return true;
+}
+  return false;
+}
+
 static bool
 source_equal_p (insn_info *insn1, insn_info *insn2)
 {
@@ -1098,6 +1116,13 @@ source_equal_p (insn_info *insn1, insn_info *insn2)
   vector_insn_info insn1_info, insn2_info;
   insn1_info.parse_insn (insn1);
   insn2_info.parse_insn (insn2);
+
+  /* To avoid dead loop, we don't optimize a vsetvli def has vsetvli
+instructions which will complicate the situation.  */
+  if (avl_source_has_vsetvl_p (insn1_info.get_avl_source ())
+ || avl_source_has_vsetvl_p (insn2_info.get_avl_source ()))
+   return false;
+
   if (insn1_info.same_vlmax_p (insn2_info)
  && insn1_info.compatible_avl_p (insn2_info))
return true;
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr109773-1.c 
b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr109773-1.c
new file mode 100644
index 000..8656e473117
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr109773-1.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv32gcv -mabi=ilp32 -fno-tree-vectorize 
-fno-schedule-insns -fno-schedule-insns2" } */
+
+#include "riscv_vector.h"
+
+void f (int32_t *a, int32_t *b, int n)
+{
+  if (n <= 0)
+return;
+  int i = n;
+  size_t vl = __riscv_vsetvl_e8mf4 (i);
+  for (; i >= 0; i--)
+{
+  vint32m1_t v = __riscv_vle32_v_i32m1 (a + i, vl);
+  __riscv_vse32_v_i32m1 (b + i, v, vl);
+  vl = __riscv_vsetvl_e8mf4 (vl);
+}
+}
+
+/* { dg-final { scan-assembler-times {vsetvli} 3 { target { no-opts "-O0" 
no-opts "-O1" no-opts "-Os" no-opts "-Oz" no-opts "-g" no-opts "-funroll-loops" 
} } } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr109773-2.c 
b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr109773-2.c
new file mode 100644
index 000..7bfbaaf3713
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr109773-2.c
@@ -0,0 +1,26 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv32gcv -mabi=ilp32 -fno-tree-vectorize 
-fno-schedule-insns -fno-schedule-insns2" } */
+
+#include "riscv_vector.h"
+
+void f (int32_t * a, int32_t * b, int n)
+{
+if (n <= 0)
+  return;
+int i = n;
+size_t vl = __riscv_vsetvl_e8mf4 (i);
+for (; i >= 0; i--)
+  {
+vint32m1_t v = __riscv_vle32_v_i32m1 (a + i, vl);
+   v = __riscv_vle32_v_i32m1_tu (v, a + i + 100, vl);
+__riscv_vse32_v_i32m1 (b + i, v, vl);
+
+if (i >= vl)
+  continue;
+if (i == 0)
+  return;
+vl = __riscv_vsetvl_e8mf4 (vl);
+  }
+}
+
+/* { dg-final { scan-assembler {vsetvli} { target { no-opts "-O0" no-opts 
"-O1" no-opts "-Os" no-opts "-Oz" no-opts "-g" no-opts "-funroll-loops" } } } } 
*/
-- 
2.36.3



[PATCH V3] RISC-V: Optimize vsetvli of LCM INSERTED edge for user vsetvli [PR 109743]

2023-05-08 Thread juzhe . zhong
From: Juzhe-Zhong 

Rebase to trunk and send V3 patch for:
https://gcc.gnu.org/pipermail/gcc-patches/2023-May/617821.html

This patch is fixing: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109743.

This issue happens because we are currently very conservative in the
optimization of user vsetvli.

Consider this following case:

bb 1:
  vsetvli a5,a4... (demand AVL = a4).
bb 2:
  RVV insn use a5 (demand AVL = a5).

LCM will hoist vsetvl of bb 2 into bb 1.
We don't do AVL propagation for this situation since it is complicated: we
would need to analyze the code sequence between the vsetvli in bb 1 and the RVV
insn in bb 2.
They are not necessarily consecutive blocks.

This patch does the optimization after LCM: we check and eliminate the
vsetvli on an LCM-inserted edge if such a vsetvli is redundant. Such an approach
is much simpler and safe.

code:
void
foo2 (int32_t *a, int32_t *b, int n)
{
  if (n <= 0)
  return;
  int i = n;
  size_t vl = __riscv_vsetvl_e32m1 (i);

  for (; i >= 0; i--)
  {
vint32m1_t v = __riscv_vle32_v_i32m1 (a, vl);
__riscv_vse32_v_i32m1 (b, v, vl);

if (i >= vl)
  continue;

if (i == 0)
  return;

vl = __riscv_vsetvl_e32m1 (i);
  }
}

Before this patch:
foo2:
.LFB2:
.cfi_startproc
ble a2,zero,.L1
mv  a4,a2
li  a3,-1
vsetvli a5,a2,e32,m1,ta,mu
vsetvli zero,a5,e32,m1,ta,ma  <- can be eliminated.
.L5:
vle32.v v1,0(a0)
vse32.v v1,0(a1)
bgeua4,a5,.L3
.L10:
beq a2,zero,.L1
vsetvli a5,a4,e32,m1,ta,mu
addia4,a4,-1
vsetvli zero,a5,e32,m1,ta,ma  <- can be eliminated.
vle32.v v1,0(a0)
vse32.v v1,0(a1)
addiw   a2,a2,-1
bltua4,a5,.L10
.L3:
addiw   a2,a2,-1
addia4,a4,-1
bne a2,a3,.L5
.L1:
ret

After this patch:
f:
ble a2,zero,.L1
mv  a4,a2
li  a3,-1
vsetvli a5,a2,e32,m1,ta,ma
.L5:
vle32.v v1,0(a0)
vse32.v v1,0(a1)
bgeua4,a5,.L3
.L10:
beq a2,zero,.L1
vsetvli a5,a4,e32,m1,ta,ma
addia4,a4,-1
vle32.v v1,0(a0)
vse32.v v1,0(a1)
addiw   a2,a2,-1
bltua4,a5,.L10
.L3:
addiw   a2,a2,-1
addia4,a4,-1
bne a2,a3,.L5
.L1:
ret

PR target/109743

gcc/ChangeLog:

* config/riscv/riscv-vsetvl.cc 
(pass_vsetvl::local_eliminate_vsetvl_insn): Enhance local optimizations for LCM.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/vsetvl/pr109743-1.c: New test.
* gcc.target/riscv/rvv/vsetvl/pr109743-2.c: New test.
* gcc.target/riscv/rvv/vsetvl/pr109743-3.c: New test.
* gcc.target/riscv/rvv/vsetvl/pr109743-4.c: New test.

---
 gcc/config/riscv/riscv-vsetvl.cc  | 47 ++-
 .../gcc.target/riscv/rvv/vsetvl/pr109743-1.c  | 26 ++
 .../gcc.target/riscv/rvv/vsetvl/pr109743-2.c  | 27 +++
 .../gcc.target/riscv/rvv/vsetvl/pr109743-3.c  | 28 +++
 .../gcc.target/riscv/rvv/vsetvl/pr109743-4.c  | 28 +++
 5 files changed, 155 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr109743-1.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr109743-2.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr109743-3.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr109743-4.c

diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc
index d4d6f336ef9..72aa2bfcf6f 100644
--- a/gcc/config/riscv/riscv-vsetvl.cc
+++ b/gcc/config/riscv/riscv-vsetvl.cc
@@ -4026,7 +4026,8 @@ pass_vsetvl::local_eliminate_vsetvl_insn (const 
vector_insn_info ) const
{
  if (i->is_call () || i->is_asm ()
  || find_access (i->defs (), VL_REGNUM)
- || find_access (i->defs (), VTYPE_REGNUM))
+ || find_access (i->defs (), VTYPE_REGNUM)
+ || find_access (i->defs (), REGNO (vl)))
return;
 
  if (has_vtype_op (i->rtl ()))
@@ -4051,6 +4052,50 @@ pass_vsetvl::local_eliminate_vsetvl_insn (const 
vector_insn_info ) const
  return;
}
}
+
+  /* Here we optimize the VSETVL is hoisted by LCM:
+
+Before LCM:
+  bb 1:
+vsetvli a5,a2,e32,m1,ta,mu
+  bb 2:
+vsetvli zero,a5,e32,m1,ta,mu
+...
+
+After LCM:
+  bb 1:
+vsetvli a5,a2,e32,m1,ta,mu
+LCM INSERTED: vsetvli zero,a5,e32,m1,ta,mu --> eliminate
+  bb 2:
+...
+  Such instruction can not be accessed in RTL_SSA when we don't re-init
+  the new RTL_SSA framework but it is definetely at the END of the block. 
*/
+  rtx_insn *end_vsetvl = BB_END (bb->cfg_bb ());
+  if (!vsetvl_discard_result_insn_p (end_vsetvl))
+   {
+ if (JUMP_P (end_vsetvl)
+ && 

Ping: [PATCH 0/2] Unify and deduplicate FTM code

2023-05-08 Thread Arsen Arsenović via Gcc-patches
Ping.

This patch rebases cleanly as of right now.  No new changes to review.

Have a lovely day!
-- 
Arsen Arsenović


signature.asc
Description: PGP signature


[PATCH][PUSHED] Fix cfg maintenance after inlining in AutoFDO

2023-05-08 Thread Eugene Rozenfeld via Gcc-patches
Todo from early_inliner needs to be propagated so that
cleanup_tree_cfg () is called if necessary.

This bug was causing an assert in get_loop_body during
ipa-sra in autoprofiledbootstrap build since loops weren't
fixed up and one of the loops had num_nodes set to 0.

Tested on x86_64-pc-linux-gnu.

gcc/ChangeLog:

* auto-profile.cc (auto_profile): Check todo from early_inline
to see if cleanup_tree_cfg needs to be called.
(early_inline): Return todo from early_inliner.
---
 gcc/auto-profile.cc | 21 -
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/gcc/auto-profile.cc b/gcc/auto-profile.cc
index 360c42c4b89..e3af3555e75 100644
--- a/gcc/auto-profile.cc
+++ b/gcc/auto-profile.cc
@@ -1589,13 +1589,14 @@ afdo_annotate_cfg (const stmt_set _stmts)
 
 /* Wrapper function to invoke early inliner.  */
 
-static void
+static unsigned int
 early_inline ()
 {
   compute_fn_summary (cgraph_node::get (current_function_decl), true);
-  unsigned todo = early_inliner (cfun);
+  unsigned int todo = early_inliner (cfun);
   if (todo & TODO_update_ssa_any)
 update_ssa (TODO_update_ssa);
+  return todo;
 }
 
 /* Use AutoFDO profile to annoate the control flow graph.
@@ -1651,20 +1652,22 @@ auto_profile (void)
function before annotation, so the profile inside bar@loc_foo2
will be useful.  */
 autofdo::stmt_set promoted_stmts;
+unsigned int todo = 0;
 for (int i = 0; i < 10; i++)
   {
-if (!flag_value_profile_transformations
-|| !autofdo::afdo_vpt_for_early_inline (_stmts))
-  break;
-early_inline ();
+   if (!flag_value_profile_transformations
+   || !autofdo::afdo_vpt_for_early_inline (_stmts))
+ break;
+   todo |= early_inline ();
   }
 
-early_inline ();
+todo |= early_inline ();
 autofdo::afdo_annotate_cfg (promoted_stmts);
 compute_function_frequency ();
 
 /* Local pure-const may imply need to fixup the cfg.  */
-if (execute_fixup_cfg () & TODO_cleanup_cfg)
+todo |= execute_fixup_cfg ();
+if (todo & TODO_cleanup_cfg)
   cleanup_tree_cfg ();
 
 free_dominance_info (CDI_DOMINATORS);
@@ -1674,7 +1677,7 @@ auto_profile (void)
 pop_cfun ();
   }
 
-  return TODO_rebuild_cgraph_edges;
+  return 0;
 }
 } /* namespace autofdo.  */
 
-- 
2.25.1



Re: [PATCH v2] xtensa: Make full transition to LRA

2023-05-08 Thread Jeff Law via Gcc-patches




On 5/8/23 08:44, Takayuki 'January June' Suwa via Gcc-patches wrote:

On 2023/05/08 22:43, Richard Biener wrote:
[snip]

-mlra


If they were in any released compiler options should be kept
(doing nothing) for backward compatibility.  Use for example

mlra
Target WarnRemoved
Removed in GCC 14.  This switch has no effect.

or

mlra
Target Ignore
Does nothing.  Preserved for backward compatibility.

which doesn't inform the user (I think that's the better choice here).


-Target Mask(LRA)

Thank you for your helpful advice.

=
gcc/ChangeLog:

* config/xtensa/constraints.md (R, T, U):
Change define_constraint to define_memory_constraint.
* config/xtensa/xtensa.cc
(xtensa_lra_p, TARGET_LRA_P): Remove.
(xtensa_emit_move_sequence): Remove "if (reload_in_progress)"
clause as it can no longer be true.
(xtensa_output_integer_literal_parts): Consider 16-bit wide
constants.
(xtensa_legitimate_constant_p): Add short-circuit path for
integer load instructions.
* config/xtensa/xtensa.md (movsf): Use can_create_pseudo_p()
rather reload_in_progress and reload_completed.
* config/xtensa/xtensa.opt (mlra): Change to no effect.
---
  gcc/config/xtensa/constraints.md | 26 --
  gcc/config/xtensa/xtensa.cc  | 26 +-
  gcc/config/xtensa/xtensa.md  |  2 +-
  gcc/config/xtensa/xtensa.opt |  4 ++--
  4 files changed, 16 insertions(+), 42 deletions(-)

diff --git a/gcc/config/xtensa/constraints.md b/gcc/config/xtensa/constraints.md
index 53e4d0d8dd1..9b31e162941 100644
--- a/gcc/config/xtensa/constraints.md
+++ b/gcc/config/xtensa/constraints.md
@@ -123,29 +123,19 @@
(and (match_code "const_int")
   (match_test "! xtensa_split1_finished_p ()"
  
-;; Memory constraints.  Do not use define_memory_constraint here.  Doing so

-;; causes reload to force some constants into the constant pool, but since
-;; the Xtensa constant pool can only be accessed with L32R instructions, it
-;; is always better to just copy a constant into a register.  Instead, use
-;; regular constraints but add a check to allow pseudos during reload.
+;; Memory constraints.
  
-(define_constraint "R"

+(define_memory_constraint "R"
   "Memory that can be accessed with a 4-bit unsigned offset from a register."
- (ior (and (match_code "mem")
-  (match_test "smalloffset_mem_p (op)"))
-  (and (match_code "reg")
-  (match_test "reload_in_progress
-   && REGNO (op) >= FIRST_PSEUDO_REGISTER"
+ (and (match_code "mem")
+  (match_test "smalloffset_mem_p (op)")))
  
-(define_constraint "T"

+(define_memory_constraint "T"
   "Memory in a literal pool (addressable with an L32R instruction)."
   (and (match_code "mem")
(match_test "!TARGET_CONST16 && constantpool_mem_p (op)")))
  
-(define_constraint "U"

+(define_memory_constraint "U"
   "Memory that is not in a literal pool."
- (ior (and (match_code "mem")
-  (match_test "! constantpool_mem_p (op)"))
-  (and (match_code "reg")
-  (match_test "reload_in_progress
-   && REGNO (op) >= FIRST_PSEUDO_REGISTER"
+ (and (match_code "mem")
+  (match_test "! constantpool_mem_p (op)")))
Given the old comment, you might want to use 
define_special_memory_constraint rather than define_memory_constraint.


It might be worth doing some before/after comparisons to see if there's 
any noticeable impact.


jeff


Re: [PATCH] RISC-V: Update RVV integer compare simplification comments

2023-05-08 Thread Jeff Law via Gcc-patches




On 5/8/23 02:54, Pan Li via Gcc-patches wrote:

From: Pan Li 

The VMSET simplification for RVV integer comparison has been merged already.
This patch would like to update the comments for the cases that the
define_split will act on.

Signed-off-by: Pan Li 

gcc/ChangeLog:

* config/riscv/vector.md: Add comments for simplifying to vmset.

OK.  Thanks.
jeff


Re: [PATCH] Don't call emit_clobber in lower-subreg.cc's resolve_simple_move.

2023-05-08 Thread Jeff Law via Gcc-patches




On 5/8/23 00:43, Richard Biener wrote:

On Sat, May 6, 2023 at 8:46 PM Jeff Law via Gcc-patches
 wrote:




On 5/6/23 06:57, Roger Sayle wrote:


Following up on posts/reviews by Segher and Uros, there's some question
over why the middle-end's lower subreg pass emits a clobber (of a
multi-word register) into the instruction stream before emitting the
sequence of moves of the word-sized parts.  This clobber interferes
with (LRA) register allocation, preventing the multi-word pseudo to
remain in the same hard registers.  This patch eliminates this
(presumably superfluous) clobber and thereby improves register allocation.

Those clobbers used to help dataflow analysis know that a multi-word
register was fully assigned by a subsequent sequence.  I suspect they
haven't been terribly useful in quite a while.


Likely - maybe they still make a difference for some targets though.
It might be interesting to see whether combining the clobber with the
first set or making the set a multi-set with a parallel would be any
better?
Wrapping them inside a PARALLEL might be better, but probably isn't 
worth the effort.  I think all this stuff dates back to the era where we 
had flow.c to provide the register lifetimes used by local-alloc.  We 
also had things like REG_NO_CONFLICT to indicate that the sub-object 
assignments didn't conflict.  In all it was rather hackish.


Jeff


Re: [PATCH v2] RISC-V: Fix CTZ unnecessary sign extension [PR #106888]

2023-05-08 Thread Jeff Law via Gcc-patches




On 5/8/23 08:12, Raphael Moreira Zinsly wrote:

Changes since v1:
- Remove subreg from operand 1.

-- >8 --

We were not able to match the CTZ sign extend pattern on RISC-V
because it gets optimized to zero extend and/or to ANDI patterns.
For the ANDI case, combine scrambles the RTL and generates the
extension by using subregs.

gcc/ChangeLog:
PR target/106888
* config/riscv/bitmanip.md
(disi2): Match with any_extend.
(disi2_sext): New pattern to match
with sign extend using an ANDI instruction.

gcc/testsuite/ChangeLog:
PR target/106888
* gcc.target/riscv/pr106888.c: New test.
* gcc.target/riscv/zbbw.c: Check for ANDI.

OK
jeff


Re: [PATCH v2] RISC-V: Add bext pattern for ZBS

2023-05-08 Thread Jeff Law via Gcc-patches




On 5/8/23 08:11, Raphael Moreira Zinsly wrote:

Changes since v1:
 - Removed name clash change.
 - Fix new pattern indentation.

-- >8 --

When (a & (1 << bit_no)) is tested inside an IF we can use a bit extract.

gcc/ChangeLog:

* config/riscv/bitmanip.md
(branch_bext): New split pattern.

gcc/testsuite/ChangeLog:
* gcc.target/riscv/zbs-bext-02.c: New test.

OK
jeff


Re: For GCC, newlib combined tree, newlib build-tree testing, use standard search paths

2023-05-08 Thread Thomas Schwinge
Hi!

Ping: OK to push to newlib main branch the attached
"For GCC, newlib combined tree, newlib build-tree testing, use standard search 
paths"?
Or, has anybody got adverse comments/insight into this?


Grüße
 Thomas


On 2023-04-14T22:03:28+0200, I wrote:
> Hi!
>
> OK to push to newlib main branch the attached
> "For GCC, newlib combined tree, newlib build-tree testing, use standard 
> search paths"
> -- or is something else wrong here, or should this be done differently?
> (I mean, I'm confused why this doesn't just work; I'm certainly not the
> first person to be testing such a setup?)
>
> I'm not doing anything special here: just symlink 'newlib' into the GCC
> source directory, build the combined tree, and then run 'make check', as
> mentioned in the attached Git commit log.
>
>
> Grüße
>  Thomas


-
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 
München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas 
Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht 
München, HRB 106955
>From e56b38625929c3cf62c71d3fbd9264aaeef39d0c Mon Sep 17 00:00:00 2001
From: Thomas Schwinge 
Date: Fri, 14 Apr 2023 21:26:32 +0200
Subject: [PATCH] For GCC, newlib combined tree, newlib build-tree testing, use
 standard search paths

For example, for GCC/GCN target (AMD GPUs), target libraries are built
individually per supported hardware ISA ('-march=[...]').  Testing such a
toolchain via, for example:

$ make RUNTESTFLAGS='--target_board=[...]/-march=gfx90a' check[...]

... does work fine for all 'check-gcc-[...]' as well as GCC-provided target
libraries, 'check-target-[...]'.  Just for 'check-target-newlib', for the
example above, not the '-march=gfx90a' newlib libraries are linked in, but
instead always the default ones, which results in link FAILure.  This is cured
simply by skipping use of 'newlib/testsuite/lib/flags.exp', so that the
standard search paths as determined by GCC, DejaGnu are used for newlib, too.
---
 newlib/testsuite/lib/flags.exp | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/newlib/testsuite/lib/flags.exp b/newlib/testsuite/lib/flags.exp
index e1e9acb18..697291e7a 100644
--- a/newlib/testsuite/lib/flags.exp
+++ b/newlib/testsuite/lib/flags.exp
@@ -4,6 +4,13 @@
 # is freely granted, provided that this notice is preserved.
 #
 
+if [info exists env(XGCC_FLAGS_FOR_TARGET)] {
+verbose "GCC, newlib combined tree, build-tree testing; using standard search paths"
+# ... instead of the search paths built here, based on 'objdir' as set in
+# newlib's 'site.exp', which always points to the default multilib.
+return
+}
+
 # flags.exp: overrides the dejagnu versions of libgloss_link_flags,
 # newlib_link_flags, and newlib_include_flags.
 
-- 
2.25.1



Re: [patch, fortran] PR109662 Namelist input with comma after name accepted

2023-05-08 Thread Harald Anlauf via Gcc-patches

Steve,

On 5/8/23 02:13, Steve Kargl via Gcc-patches wrote:

Harald,
Thanks for keeping us honest.  I didn't check what other
separators might cause a problem.

After 2 decades of working on gfortran, I've come to conclusion
that -std=f2018 should be the default.  When f2023 is ratified,
the default becomes -std=f2023.  All GNU fortran extension
should be behind an option, and we should be aggressive
eliminating extensions.

Yes, this means that 'real*4' and similar would require
a -fallow-nonstandard-declaration option.



please don't let us get off-topic.

The issue behind the PR was F2018: 13.11.3.1, Namelist input,
which has

Input for a namelist input statement consists of
(1) optional blanks and namelist comments,
(2) the character & followed immediately by the namelist-group-name as 
specified in the NAMELIST statement,

(3) one or more blanks,

where "blanks" was to be interpreted.  Separators are discussed
separately.

Jerry has resolved "," and ";".  Good.

There is another weird issue that is visible in the testcase
output in my previous mail for "!".  Reducing that further now
suggests that the EOF condition of the namelist read of the
single line affects the namelist read of the next multi-line read.

So this one is actually a different bug, likely libgfortran's
internal state.




Re: [PATCH 0/3] Trivial cleanups for genmatch

2023-05-08 Thread Richard Biener via Gcc-patches



> Am 08.05.2023 um 20:14 schrieb Alexander Monakov via Gcc-patches 
> :
> 
> I'm trying to study match.pd/genmatch with the eventual goal of
> improving match-and-simplify code generation. Here's some trivial
> cleanups for the recent refactoring in the meantime.
> 
> Alexander Monakov (3):
>  genmatch: clean up emit_func
>  genmatch: clean up showUsage
>  genmatch: fixup get_out_file

Ok for all.

Richard 


> gcc/genmatch.cc | 159 
> 1 file changed, 79 insertions(+), 80 deletions(-)
> 
> -- 
> 2.39.2
> 


[PATCH 1/3] genmatch: clean up emit_func

2023-05-08 Thread Alexander Monakov via Gcc-patches
Eliminate boolean parameters of emit_func. The first ('open') just
prints 'extern' to generated header, which is unnecessary. Introduce a
separate function to use when finishing a declaration in place of the
second ('close').

Rename emit_func to 'fp_decl' (matching 'fprintf' in length) to unbreak
indentation in several places.

Reshuffle emitted line breaks in a few places to make generated
declarations less ugly.

gcc/ChangeLog:

* genmatch.cc (header_file): Make static.
(emit_func): Rename to...
(fp_decl): ... this.  Adjust all uses.
(fp_decl_done): New function.  Use it...
(decision_tree::gen): ... here and...
(write_predicate): ... here.
(main): Adjust.
---
 gcc/genmatch.cc | 97 ++---
 1 file changed, 52 insertions(+), 45 deletions(-)

diff --git a/gcc/genmatch.cc b/gcc/genmatch.cc
index c593814871..d5e56e2d68 100644
--- a/gcc/genmatch.cc
+++ b/gcc/genmatch.cc
@@ -183,31 +183,37 @@ fprintf_indent (FILE *f, unsigned int indent, const char 
*format, ...)
   va_end (ap);
 }
 
-/* Like fprintf, but print to two files, one header one C implementation.  */
-FILE *header_file = NULL;
+/* Secondary stream for fp_decl.  */
+static FILE *header_file;
 
+/* Start or continue emitting a declaration in fprintf-like manner,
+   printing both to F and global header_file, if non-null.  */
 static void
 #if GCC_VERSION >= 4001
-__attribute__((format (printf, 4, 5)))
+__attribute__((format (printf, 2, 3)))
 #endif
-emit_func (FILE *f, bool open, bool close, const char *format, ...)
+fp_decl (FILE *f, const char *format, ...)
 {
-  va_list ap1, ap2;
-  if (header_file != NULL)
-{
-  if (open)
-   fprintf (header_file, "extern ");
-  va_start (ap2, format);
-  vfprintf (header_file, format, ap2);
-  va_end (ap2);
-  if (close)
-   fprintf (header_file, ";\n");
-}
+  va_list ap;
+  va_start (ap, format);
+  vfprintf (f, format, ap);
+  va_end (ap);
 
-  va_start (ap1, format);
-  vfprintf (f, format, ap1);
-  va_end (ap1);
-  fputc ('\n', f);
+  if (!header_file)
+return;
+
+  va_start (ap, format);
+  vfprintf (header_file, format, ap);
+  va_end (ap);
+}
+
+/* Finish a declaration being emitted by fp_decl.  */
+static void
+fp_decl_done (FILE *f, const char *trailer)
+{
+  fprintf (f, "%s\n", trailer);
+  if (header_file)
+fprintf (header_file, "%s;", trailer);
 }
 
 static void
@@ -3924,35 +3930,35 @@ decision_tree::gen (vec  , bool gimple)
   s->fname = xasprintf ("%s_simplify_%u", gimple ? "gimple" : "generic",
fcnt++);
   if (gimple)
-   emit_func (f, true, false, "\nbool\n"
+   fp_decl (f, "\nbool\n"
 "%s (gimple_match_op *res_op, gimple_seq *seq,\n"
 " tree (*valueize)(tree) ATTRIBUTE_UNUSED,\n"
 " const tree ARG_UNUSED (type), tree 
*ARG_UNUSED "
-"(captures)\n",
+"(captures)",
 s->fname);
   else
{
- emit_func (f, true, false, "\ntree\n"
+ fp_decl (f, "\ntree\n"
   "%s (location_t ARG_UNUSED (loc), const tree ARG_UNUSED 
(type),\n",
   (*iter).second->fname);
  for (unsigned i = 0;
   i < as_a (s->s->s->match)->ops.length (); ++i)
-   emit_func (f, false, false, " tree ARG_UNUSED (_p%d),", i);
- emit_func (f, false, false, " tree *captures\n");
+   fp_decl (f, " tree ARG_UNUSED (_p%d),", i);
+ fp_decl (f, " tree *captures");
}
   for (unsigned i = 0; i < s->s->s->for_subst_vec.length (); ++i)
{
  if (! s->s->s->for_subst_vec[i].first->used)
continue;
  if (is_a  (s->s->s->for_subst_vec[i].second))
-   emit_func (f, false, false, ", const enum tree_code ARG_UNUSED 
(%s)",
+   fp_decl (f, ",\n const enum tree_code ARG_UNUSED (%s)",
 s->s->s->for_subst_vec[i].first->id);
  else if (is_a  (s->s->s->for_subst_vec[i].second))
-   emit_func (f, false, false, ", const combined_fn ARG_UNUSED (%s)",
+   fp_decl (f, ",\n const combined_fn ARG_UNUSED (%s)",
 s->s->s->for_subst_vec[i].first->id);
}
 
-  emit_func (f, false, true, ")");
+  fp_decl_done (f, ")");
   fprintf (f, "{\n");
   fprintf_indent (f, 2, "const bool debug_dump = "
"dump_file && (dump_flags & TDF_FOLDING);\n");
@@ -3988,22 +3994,22 @@ decision_tree::gen (vec  , bool gimple)
  FILE *f = get_out_file (files);
 
  if (gimple)
-   emit_func (f, true, false,"\nbool\n"
+   fp_decl (f, "\nbool\n"
 "gimple_simplify_%s (gimple_match_op *res_op,"
 " gimple_seq *seq,\n"
 " tree (*valueize)(tree) "
 "ATTRIBUTE_UNUSED,\n"
 "  

[PATCH 3/3] genmatch: fixup get_out_file

2023-05-08 Thread Alexander Monakov via Gcc-patches
get_out_file did not follow the coding conventions (mixing three-space
and two-space indentation, missing linebreak before function name).

Take that as an excuse to reimplement it in a more terse manner and
rename as 'choose_output', which is hopefully more descriptive.

gcc/ChangeLog:

* genmatch.cc (get_out_file): Make static and rename to ...
(choose_output): ... this. Reimplement. Update all uses ...
(decision_tree::gen): ... here and ...
(main): ... here.
---
 gcc/genmatch.cc | 41 +
 1 file changed, 17 insertions(+), 24 deletions(-)

diff --git a/gcc/genmatch.cc b/gcc/genmatch.cc
index baf93855a6..177c13d87c 100644
--- a/gcc/genmatch.cc
+++ b/gcc/genmatch.cc
@@ -255,28 +255,21 @@ output_line_directive (FILE *f, location_t location,
 
 #define SIZED_BASED_CHUNKS 1
 
-int current_file = 0;
-FILE *get_out_file (vec  )
+static FILE *
+choose_output (const vec )
 {
 #ifdef SIZED_BASED_CHUNKS
-   if (parts.length () == 1)
- return parts[0];
-
-   FILE *f = NULL;
-   long min = 0;
-   /* We've started writing all the files at pos 0, so ftell is equivalent
-  to the size and should be much faster.  */
-   for (unsigned i = 0; i < parts.length (); i++)
- {
-   long res = ftell (parts[i]);
-   if (!f || res < min)
- {
-   min = res;
-   f = parts[i];
- }
- }
-  return f;
+  FILE *shortest = NULL;
+  long min = 0;
+  for (FILE *part : parts)
+{
+  long len = ftell (part);
+  if (!shortest || min > len)
+   shortest = part, min = len;
+}
+  return shortest;
 #else
+  static int current_file;
   return parts[current_file++ % parts.length ()];
 #endif
 }
@@ -3924,7 +3917,7 @@ decision_tree::gen (vec  , bool gimple)
}
 
   /* Cycle the file buffers.  */
-  FILE *f = get_out_file (files);
+  FILE *f = choose_output (files);
 
   /* Generate a split out function with the leaf transform code.  */
   s->fname = xasprintf ("%s_simplify_%u", gimple ? "gimple" : "generic",
@@ -3991,7 +3984,7 @@ decision_tree::gen (vec  , bool gimple)
 
 
  /* Cycle the file buffers.  */
- FILE *f = get_out_file (files);
+ FILE *f = choose_output (files);
 
  if (gimple)
fp_decl (f, "\nbool\n"
@@ -4028,7 +4021,7 @@ decision_tree::gen (vec  , bool gimple)
{
 
  /* Cycle the file buffers.  */
- FILE *f = get_out_file (files);
+ FILE *f = choose_output (files);
 
  if (gimple)
fp_decl (f, "\nbool\n"
@@ -4053,7 +4046,7 @@ decision_tree::gen (vec  , bool gimple)
 
 
   /* Cycle the file buffers.  */
-  FILE *f = get_out_file (files);
+  FILE *f = choose_output (files);
 
   /* Then generate the main entry with the outermost switch and
  tail-calls to the split-out functions.  */
@@ -5461,7 +5454,7 @@ main (int argc, char **argv)
dt.print (stderr);
 
   /* Cycle the file buffers.  */
-  FILE *f = get_out_file (parts);
+  FILE *f = choose_output (parts);
 
   write_predicate (f, pred, dt, gimple);
 }
-- 
2.39.2



[PATCH 2/3] genmatch: clean up showUsage

2023-05-08 Thread Alexander Monakov via Gcc-patches
Display usage more consistently and get rid of camelCase.

gcc/ChangeLog:

* genmatch.cc (showUsage): Reimplement as ...
(usage): ...this.  Adjust all uses.
(main): Print usage when no arguments.  Add missing 'return 1'.
---
 gcc/genmatch.cc | 21 ++---
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/gcc/genmatch.cc b/gcc/genmatch.cc
index d5e56e2d68..baf93855a6 100644
--- a/gcc/genmatch.cc
+++ b/gcc/genmatch.cc
@@ -5301,13 +5301,12 @@ round_alloc_size (size_t s)
 /* Construct and display the help menu.  */
 
 static void
-showUsage ()
+usage ()
 {
-  fprintf (stderr, "Usage: genmatch [--gimple] [--generic] "
-  "[--header=] [--include=] [-v[v]] input "
-  "[...]\n");
-  fprintf (stderr, "\nWhen more then one outputfile is specified --header "
-  "is required.\n");
+  const char *usage = "Usage:\n"
+" %s [--gimple|--generic] [-v[v]] \n"
+" %s [options] [--include=FILE] --header=FILE  ...\n";
+  fprintf (stderr, usage, progname, progname);
 }
 
 /* Write out the correct include to the match-head fle containing the helper
@@ -5332,9 +5331,6 @@ main (int argc, char **argv)
 
   progname = "genmatch";
 
-  if (argc < 2)
-return 1;
-
   bool gimple = true;
   char *s_header_file = NULL;
   char *s_include_file = NULL;
@@ -5359,14 +5355,17 @@ main (int argc, char **argv)
files.safe_push (argv[i]);
   else
{
- showUsage ();
+ usage ();
  return 1;
}
 }
 
   /* Validate if the combinations are valid.  */
   if ((files.length () > 1 && !s_header_file) || files.is_empty ())
-showUsage ();
+{
+  usage ();
+  return 1;
+}
 
   if (!s_include_file)
 s_include_file = s_header_file;
-- 
2.39.2



[PATCH 0/3] Trivial cleanups for genmatch

2023-05-08 Thread Alexander Monakov via Gcc-patches
I'm trying to study match.pd/genmatch with the eventual goal of
improving match-and-simplify code generation. Here's some trivial
cleanups for the recent refactoring in the meantime.

Alexander Monakov (3):
  genmatch: clean up emit_func
  genmatch: clean up showUsage
  genmatch: fixup get_out_file

 gcc/genmatch.cc | 159 
 1 file changed, 79 insertions(+), 80 deletions(-)

-- 
2.39.2



[COMMITTED] Fix pr81192.c for int16 targets

2023-05-08 Thread Andrew Pinski via Gcc-patches
I had missed when converting this
testcase to Gimple that there was a define
for int/unsigned type specifically to get
an INT32 type. This means when using a
literal integer constant you need to use the
`_Literal (type)` to form the types correctly on the
constants.

This fixes the issue and has been both tested on
xstormy16-elf and x86_64-linux-gnu.

Committed as obvious.

gcc/testsuite/ChangeLog:

PR testsuite/109776
* gcc.dg/pr81192.c: Fix integer constants for int16 targets.
---
 gcc/testsuite/gcc.dg/pr81192.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/pr81192.c b/gcc/testsuite/gcc.dg/pr81192.c
index f6d201ee71a..c46ac18fd9a 100644
--- a/gcc/testsuite/gcc.dg/pr81192.c
+++ b/gcc/testsuite/gcc.dg/pr81192.c
@@ -22,21 +22,21 @@ void __GIMPLE(ssa, startwith("pre")) fn2   ()
   goto __BB7;
 
   __BB(3):
-  if (j_6(D) != 2147483647)
+  if (j_6(D) != _Literal (int)2147483647)
 goto __BB4;
   else
 goto __BB5;
 
   __BB(4):
-  iftmp2_8 = j_6(D) + 1;
+  iftmp2_8 = j_6(D) + _Literal (int)1;
   goto __BB5;
 
   __BB(5):
-  b_lsm6_10 = 2147483647;
+  b_lsm6_10 = _Literal (int)2147483647;
   goto __BB6;
 
   __BB(6):
-  if (c0_1 != 0)
+  if (c0_1 != _Literal (int) 0)
 goto __BB3;
   else
 goto __BB8;
-- 
2.39.1



Re: Support parallel testing in libgomp, part I [PR66005]

2023-05-08 Thread Bernhard Reutner-Fischer via Gcc-patches
On Mon, 8 May 2023 12:42:33 +0200
Thomas Schwinge  wrote:

> >> +if [info exists ::env(GCC_RUNTEST_PARALLELIZE_DIR)] {
> >> +load_file ../libgomp-test-support.exp
> >> +} else {
> >> +load_file libgomp-test-support.exp
> >> +}  
> >
> > Do we have to re-read those? Otherwise this would be load_lib:  
> 
> Indeed there is some confusion there; conceptually the content of that
> file has to be expected to vary per libgomp (multilib) build variant.
> On the other hand, given the GCC target libraries testing setup, we're
> running testing always via the default variant's build, so always read
> the default variant's file.  I agree that should get un-confused, however
> it's a pre-existing issue that I suggest we tackle independently of this
> mechanical change.

Sure. One thing at a time.

> > Some cosmetic nits.
> > See Jakubs one_to_.  
> 
> Thanks.  That got installed while I'd already finished my patch.  ;-)
> Note that the new 'one_to_' etc. is just the existing
> 'check_p_numbers' etc. renamed and moved, as it now has a second use.
> I'm happy to incorporate that into my changes -- if we agree that it
> should also go into other GCC target libraries' parallel testing code:
> libphobos, libstdc++-v3.

Yes. Streamlining these can be done in follow-ups if the respective
maintainers agree.

> >> >> It is far from trivial though.
> >> >> The point is that most of the OpenMP tests are parallelized with the
> >> >> default OMP_NUM_THREADS, so running the tests in parallel 
> >> >> oversubscribes the
> >> >> machine a lot, the higher number of hw threads the more.  
[]
> >> > the same time?  For example, a new dg-* directive to run them wrapped
> >> > through »flock [libgomp/testsuite/serial.lock] [a.out]« or some such?  
> >
> > I think we all agree one that, yes.  
> 
> Conceptually, yes.  However, given that my
> "Support parallel testing in libgomp, part II [PR66005]" changes already
> seem to enable a great amount of parallelism, occupying CPUs almost
> fully,  I'm not actually sure now if adding more parallelism via
> tedious-to-maintain new dg-* directives is actually necessary/useful.  As
> long as libgomp testing now no longer is at the long tail end of overall
> testing time, I'd rather keep it simple?

If the testsuite runtime is fine with your part II as is then we
should keep it as simple as possible.

> >> > (Will again have the problem that DejaGnu doesn't provide infrastructure
> >> > to communicate environment variables to boards in remote testing.)  
> >
> > Are you sure? I'm pretty confident that this worked fine at least at
> > one point in the past for certain targets.  
> 
> For example, see the comments/references in recent
> .

oh, i think i had an rsh2 remote that uploaded $program.env and the rsh
itself was something like
 $RSH $rsh_useropts $hostname sh -c 'test -r $program.env && .
 $program.env \\; $program $pargs \\; echo ...

but that was ages ago and was properly implemented in the meantime, one
would hope? Well, apparently not.

PS: Back then I usually needed very different per-program env and not a
single, static env. So for me it made no sense to have a set of default
env and have tests add their own variables on-demand on top of the
default. The situation in gcc might be the exact opposite?

thanks,


Re: [PATCH] tree: Fix up save_expr [PR52339]

2023-05-08 Thread Jakub Jelinek via Gcc-patches
On Mon, May 08, 2023 at 06:23:54AM +, Richard Biener wrote:
> I wonder if we should defer some of the choices to a langhook
> like make the tree_invariant_p_1 a langhook invocation with the
> default to call tree_invariant_p_1.  After lowering we can reset
> the langhook to the default.

We certainly can.  The only outliner would be Ada I think, but we really
need to know what holds and doesn't hold there.

Jakub



[PATCH v2] xtensa: Make full transition to LRA

2023-05-08 Thread Takayuki 'January June' Suwa via Gcc-patches
On 2023/05/08 22:43, Richard Biener wrote:
[snip]
>> -mlra
> 
> If they were in any released compiler options should be kept
> (doing nothing) for backward compatibility.  Use for example
> 
> mlra
> Target WarnRemoved
> Removed in GCC 14.  This switch has no effect.
> 
> or
> 
> mlra
> Target Ignore
> Does nothing.  Preserved for backward compatibility.
> 
> which doesn't inform the user (I think that's the better choice here).
> 
>> -Target Mask(LRA)
Thank you for your helpful advice.

=
gcc/ChangeLog:

* config/xtensa/constraints.md (R, T, U):
Change define_constraint to define_memory_constraint.
* config/xtensa/xtensa.cc
(xtensa_lra_p, TARGET_LRA_P): Remove.
(xtensa_emit_move_sequence): Remove "if (reload_in_progress)"
clause as it can no longer be true.
(xtensa_output_integer_literal_parts): Consider 16-bit wide
constants.
(xtensa_legitimate_constant_p): Add short-circuit path for
integer load instructions.
* config/xtensa/xtensa.md (movsf): Use can_create_pseudo_p()
rather reload_in_progress and reload_completed.
* config/xtensa/xtensa.opt (mlra): Change to no effect.
---
 gcc/config/xtensa/constraints.md | 26 --
 gcc/config/xtensa/xtensa.cc  | 26 +-
 gcc/config/xtensa/xtensa.md  |  2 +-
 gcc/config/xtensa/xtensa.opt |  4 ++--
 4 files changed, 16 insertions(+), 42 deletions(-)

diff --git a/gcc/config/xtensa/constraints.md b/gcc/config/xtensa/constraints.md
index 53e4d0d8dd1..9b31e162941 100644
--- a/gcc/config/xtensa/constraints.md
+++ b/gcc/config/xtensa/constraints.md
@@ -123,29 +123,19 @@
   (and (match_code "const_int")
   (match_test "! xtensa_split1_finished_p ()"
 
-;; Memory constraints.  Do not use define_memory_constraint here.  Doing so
-;; causes reload to force some constants into the constant pool, but since
-;; the Xtensa constant pool can only be accessed with L32R instructions, it
-;; is always better to just copy a constant into a register.  Instead, use
-;; regular constraints but add a check to allow pseudos during reload.
+;; Memory constraints.
 
-(define_constraint "R"
+(define_memory_constraint "R"
  "Memory that can be accessed with a 4-bit unsigned offset from a register."
- (ior (and (match_code "mem")
-  (match_test "smalloffset_mem_p (op)"))
-  (and (match_code "reg")
-  (match_test "reload_in_progress
-   && REGNO (op) >= FIRST_PSEUDO_REGISTER"
+ (and (match_code "mem")
+  (match_test "smalloffset_mem_p (op)")))
 
-(define_constraint "T"
+(define_memory_constraint "T"
  "Memory in a literal pool (addressable with an L32R instruction)."
  (and (match_code "mem")
   (match_test "!TARGET_CONST16 && constantpool_mem_p (op)")))
 
-(define_constraint "U"
+(define_memory_constraint "U"
  "Memory that is not in a literal pool."
- (ior (and (match_code "mem")
-  (match_test "! constantpool_mem_p (op)"))
-  (and (match_code "reg")
-  (match_test "reload_in_progress
-   && REGNO (op) >= FIRST_PSEUDO_REGISTER"
+ (and (match_code "mem")
+  (match_test "! constantpool_mem_p (op)")))
diff --git a/gcc/config/xtensa/xtensa.cc b/gcc/config/xtensa/xtensa.cc
index 9e5d314e143..f4434ec6e2c 100644
--- a/gcc/config/xtensa/xtensa.cc
+++ b/gcc/config/xtensa/xtensa.cc
@@ -190,7 +190,6 @@ static void xtensa_output_mi_thunk (FILE *file, tree thunk 
ATTRIBUTE_UNUSED,
HOST_WIDE_INT delta,
HOST_WIDE_INT vcall_offset,
tree function);
-static bool xtensa_lra_p (void);
 
 static rtx xtensa_delegitimize_address (rtx);
 
@@ -286,9 +285,6 @@ static rtx xtensa_delegitimize_address (rtx);
 #undef TARGET_CANNOT_FORCE_CONST_MEM
 #define TARGET_CANNOT_FORCE_CONST_MEM xtensa_cannot_force_const_mem
 
-#undef TARGET_LRA_P
-#define TARGET_LRA_P xtensa_lra_p
-
 #undef TARGET_LEGITIMATE_ADDRESS_P
 #define TARGET_LEGITIMATE_ADDRESS_Pxtensa_legitimate_address_p
 
@@ -1266,14 +1262,6 @@ xtensa_emit_move_sequence (rtx *operands, machine_mode 
mode)
 
   operands[1] = xtensa_copy_incoming_a7 (operands[1]);
 
-  /* During reload we don't want to emit (subreg:X (mem:Y)) since that
- instruction won't be recognized after reload, so we remove the
- subreg and adjust mem accordingly.  */
-  if (reload_in_progress)
-{
-  operands[0] = fixup_subreg_mem (operands[0]);
-  operands[1] = fixup_subreg_mem (operands[1]);
-}
   return 0;
 }
 
@@ -3196,7 +3184,7 @@ xtensa_output_integer_literal_parts (FILE *file, rtx x, 
int size)
   fputs (", ", file);
   xtensa_output_integer_literal_parts (file, second, size / 2);
 }
-  else if (size == 4)
+  else if (size == 4 || size == 2)
 {
   output_addr_const (file, x);
 }
@@ -4876,6 +4864,10 @@ xtensa_trampoline_init (rtx m_tramp, tree fndecl, rtx 
chain)
 

Re: Re: [PATCH] RISC-V: Optimize vsetvli of LCM INSERTED edge for user vsetvli [PR 109743]

2023-05-08 Thread 钟居哲
Ok. Address comment and V2 patch:
https://gcc.gnu.org/pipermail/gcc-patches/2023-May/617821.html 

Thanks.


juzhe.zh...@rivai.ai
 
From: Kito Cheng
Date: 2023-05-08 17:53
To: juzhe.zh...@rivai.ai
CC: gcc-patches
Subject: Re: [PATCH] RISC-V: Optimize vsetvli of LCM INSERTED edge for user 
vsetvli [PR 109743]
I am wondering if it is possible to do this on
local_eliminate_vsetvl_insn? I feel this is sort of local elimination,
so putting them together would be better than handling that in many
different places.
 
On Mon, May 8, 2023 at 9:35 AM juzhe.zh...@rivai.ai
 wrote:
>
> Gentle ping this patch.
>
> Is this Ok for trunk? Thanks.
>
>
> juzhe.zh...@rivai.ai
>
> From: juzhe.zhong
> Date: 2023-05-06 19:14
> To: gcc-patches
> CC: kito.cheng; Juzhe-Zhong
> Subject: [PATCH] RISC-V: Optimize vsetvli of LCM INSERTED edge for user 
> vsetvli [PR 109743]
> From: Juzhe-Zhong 
>
> This patch is fixing: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109743.
>
> This issue happens is because we are currently very conservative in 
> optimization of user vsetvli.
>
> Consider this following case:
>
> bb 1:
>   vsetvli a5,a4... (demand AVL = a4).
> bb 2:
>   RVV insn use a5 (demand AVL = a5).
>
> LCM will hoist vsetvl of bb 2 into bb 1.
> We don't do AVL propagation for this situation since it's complicated that
> we should analyze the code sequence between vsetvli in bb 1 and RVV insn in 
> bb 2.
> They are not necessary the consecutive blocks.
>
> This patch is doing the optimizations after LCM, we will check and eliminate 
> the vsetvli
> in LCM inserted edge if such vsetvli is redundant. Such approach is much 
> simplier and safe.
>
> code:
> void
> foo2 (int32_t *a, int32_t *b, int n)
> {
>   if (n <= 0)
>   return;
>   int i = n;
>   size_t vl = __riscv_vsetvl_e32m1 (i);
>
>   for (; i >= 0; i--)
>   {
> vint32m1_t v = __riscv_vle32_v_i32m1 (a, vl);
> __riscv_vse32_v_i32m1 (b, v, vl);
>
> if (i >= vl)
>   continue;
>
> if (i == 0)
>   return;
>
> vl = __riscv_vsetvl_e32m1 (i);
>   }
> }
>
> Before this patch:
> foo2:
> .LFB2:
> .cfi_startproc
> ble a2,zero,.L1
> mv  a4,a2
> li  a3,-1
> vsetvli a5,a2,e32,m1,ta,mu
> vsetvli zero,a5,e32,m1,ta,ma  <- can be eliminated.
> .L5:
> vle32.v v1,0(a0)
> vse32.v v1,0(a1)
> bgeua4,a5,.L3
> .L10:
> beq a2,zero,.L1
> vsetvli a5,a4,e32,m1,ta,mu
> addia4,a4,-1
> vsetvli zero,a5,e32,m1,ta,ma  <- can be eliminated.
> vle32.v v1,0(a0)
> vse32.v v1,0(a1)
> addiw   a2,a2,-1
> bltua4,a5,.L10
> .L3:
> addiw   a2,a2,-1
> addia4,a4,-1
> bne a2,a3,.L5
> .L1:
> ret
>
> After this patch:
> f:
> ble a2,zero,.L1
> mv  a4,a2
> li  a3,-1
> vsetvli a5,a2,e32,m1,ta,ma
> .L5:
> vle32.v v1,0(a0)
> vse32.v v1,0(a1)
> bgeua4,a5,.L3
> .L10:
> beq a2,zero,.L1
> vsetvli a5,a4,e32,m1,ta,ma
> addia4,a4,-1
> vle32.v v1,0(a0)
> vse32.v v1,0(a1)
> addiw   a2,a2,-1
> bltua4,a5,.L10
> .L3:
> addiw   a2,a2,-1
> addia4,a4,-1
> bne a2,a3,.L5
> .L1:
> ret
>
> PR target/109743
>
> gcc/ChangeLog:
>
> * config/riscv/riscv-vsetvl.cc (pass_vsetvl::commit_vsetvls): Add 
> optimization for LCM inserted edge.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/riscv/rvv/vsetvl/pr109743-1.c: New test.
> * gcc.target/riscv/rvv/vsetvl/pr109743-2.c: New test.
> * gcc.target/riscv/rvv/vsetvl/pr109743-3.c: New test.
> * gcc.target/riscv/rvv/vsetvl/pr109743-4.c: New test.
>
> ---
> gcc/config/riscv/riscv-vsetvl.cc  | 42 +++
> .../gcc.target/riscv/rvv/vsetvl/pr109743-1.c  | 26 
> .../gcc.target/riscv/rvv/vsetvl/pr109743-2.c  | 27 
> .../gcc.target/riscv/rvv/vsetvl/pr109743-3.c  | 28 +
> .../gcc.target/riscv/rvv/vsetvl/pr109743-4.c  | 28 +
> 5 files changed, 151 insertions(+)
> create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr109743-1.c
> create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr109743-2.c
> create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr109743-3.c
> create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr109743-4.c
>
> diff --git a/gcc/config/riscv/riscv-vsetvl.cc 
> b/gcc/config/riscv/riscv-vsetvl.cc
> index f55907a410e..fcee7fdf323 100644
> --- a/gcc/config/riscv/riscv-vsetvl.cc
> +++ b/gcc/config/riscv/riscv-vsetvl.cc
> @@ -3834,6 +3834,48 @@ pass_vsetvl::commit_vsetvls (void)
>   const vector_insn_info *require
> = m_vector_manager->vector_exprs[i];
>   gcc_assert (require->valid_or_dirty_p ());
> +
> +   /* Here we optimize the VSETVL is hoisted by LCM:
> +
> + Before LCM:
> +bb 1:
> +  vsetvli a5,a2,e32,m1,ta,mu
> +bb 2:
> + 

[PATCH V2] RISC-V: Optimize vsetvli of LCM INSERTED edge for user vsetvli [PR 109743]

2023-05-08 Thread juzhe . zhong
From: Juzhe-Zhong 

This patch is fixing: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109743.

This issue happens because we are currently very conservative in the
optimization of user vsetvli.

Consider this following case:

bb 1:
  vsetvli a5,a4... (demand AVL = a4).
bb 2:
  RVV insn use a5 (demand AVL = a5).

LCM will hoist vsetvl of bb 2 into bb 1.
We don't do AVL propagation for this situation since it is complicated: we
would have to analyze the code sequence between the vsetvli in bb 1 and the
RVV insn in bb 2.
They are not necessarily consecutive blocks.

This patch does the optimization after LCM: we check and eliminate the
vsetvli on an LCM-inserted edge if it is redundant. Such an approach is much
simpler and safe.

code:
void
foo2 (int32_t *a, int32_t *b, int n)
{
  if (n <= 0)
  return;
  int i = n;
  size_t vl = __riscv_vsetvl_e32m1 (i);

  for (; i >= 0; i--)
  {
vint32m1_t v = __riscv_vle32_v_i32m1 (a, vl);
__riscv_vse32_v_i32m1 (b, v, vl);

if (i >= vl)
  continue;

if (i == 0)
  return;

vl = __riscv_vsetvl_e32m1 (i);
  }
}

Before this patch:
foo2:
.LFB2:
.cfi_startproc
ble a2,zero,.L1
mv  a4,a2
li  a3,-1
vsetvli a5,a2,e32,m1,ta,mu
vsetvli zero,a5,e32,m1,ta,ma  <- can be eliminated.
.L5:
vle32.v v1,0(a0)
vse32.v v1,0(a1)
bgeua4,a5,.L3
.L10:
beq a2,zero,.L1
vsetvli a5,a4,e32,m1,ta,mu
addia4,a4,-1
vsetvli zero,a5,e32,m1,ta,ma  <- can be eliminated.
vle32.v v1,0(a0)
vse32.v v1,0(a1)
addiw   a2,a2,-1
bltua4,a5,.L10
.L3:
addiw   a2,a2,-1
addia4,a4,-1
bne a2,a3,.L5
.L1:
ret

After this patch:
f:
ble a2,zero,.L1
mv  a4,a2
li  a3,-1
vsetvli a5,a2,e32,m1,ta,ma
.L5:
vle32.v v1,0(a0)
vse32.v v1,0(a1)
bgeua4,a5,.L3
.L10:
beq a2,zero,.L1
vsetvli a5,a4,e32,m1,ta,ma
addia4,a4,-1
vle32.v v1,0(a0)
vse32.v v1,0(a1)
addiw   a2,a2,-1
bltua4,a5,.L10
.L3:
addiw   a2,a2,-1
addia4,a4,-1
bne a2,a3,.L5
.L1:
ret

PR target/109743

gcc/ChangeLog:

* config/riscv/riscv-vsetvl.cc 
(pass_vsetvl::local_eliminate_vsetvl_insn): Enhance local optimizations for LCM.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/vsetvl/pr109743-1.c: New test.
* gcc.target/riscv/rvv/vsetvl/pr109743-2.c: New test.
* gcc.target/riscv/rvv/vsetvl/pr109743-3.c: New test.
* gcc.target/riscv/rvv/vsetvl/pr109743-4.c: New test.

---
 gcc/config/riscv/riscv-vsetvl.cc  | 49 ++-
 .../gcc.target/riscv/rvv/vsetvl/pr109743-1.c  | 26 ++
 .../gcc.target/riscv/rvv/vsetvl/pr109743-2.c  | 27 ++
 .../gcc.target/riscv/rvv/vsetvl/pr109743-3.c  | 28 +++
 .../gcc.target/riscv/rvv/vsetvl/pr109743-4.c  | 28 +++
 5 files changed, 157 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr109743-1.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr109743-2.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr109743-3.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr109743-4.c

diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc
index f55907a410e..090a4737c17 100644
--- a/gcc/config/riscv/riscv-vsetvl.cc
+++ b/gcc/config/riscv/riscv-vsetvl.cc
@@ -3978,7 +3978,8 @@ pass_vsetvl::local_eliminate_vsetvl_insn (const 
vector_insn_info ) const
{
  if (i->is_call () || i->is_asm ()
  || find_access (i->defs (), VL_REGNUM)
- || find_access (i->defs (), VTYPE_REGNUM))
+ || find_access (i->defs (), VTYPE_REGNUM)
+ || find_access (i->defs (), REGNO (vl)))
return;
 
  if (has_vtype_op (i->rtl ()))
@@ -4004,6 +4005,52 @@ pass_vsetvl::local_eliminate_vsetvl_insn (const 
vector_insn_info ) const
  return;
}
}
+
+  /* Here we optimize the VSETVL is hoisted by LCM:
+
+Before LCM:
+  bb 1:
+vsetvli a5,a2,e32,m1,ta,mu
+  bb 2:
+vsetvli zero,a5,e32,m1,ta,mu
+...
+
+After LCM:
+  bb 1:
+vsetvli a5,a2,e32,m1,ta,mu
+LCM INSERTED: vsetvli zero,a5,e32,m1,ta,mu --> eliminate
+  bb 2:
+...
+  Such instruction can not be accessed in RTL_SSA when we don't re-init
+  the new RTL_SSA framework but it is definetely at the END of the block. 
*/
+  rtx_insn *end_vsetvl = BB_END (bb->cfg_bb ());
+  if (!vsetvl_discard_result_insn_p (end_vsetvl))
+   {
+ if (JUMP_P (end_vsetvl)
+ && vsetvl_discard_result_insn_p (PREV_INSN (end_vsetvl)))
+   end_vsetvl = PREV_INSN (end_vsetvl);
+ 

[committed] RISC-V: Improve portability of testcases

2023-05-08 Thread Kito Cheng via Gcc-patches
stdint.h requires a corresponding multilib to exist, so use stdint-gcc.h
instead; also added a riscv_vector.h wrapper to
gcc.target/riscv/rvv/autovec/.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/partial/single_rgroup-1.h: Change
stdint.h to stdint-gcc.h.
* gcc.target/riscv/rvv/autovec/template-1.h: Ditto.
* gcc.target/riscv/rvv/autovec/riscv_vector.h: New.
---
 .../riscv/rvv/autovec/partial/single_rgroup-1.h   |  2 +-
 .../gcc.target/riscv/rvv/autovec/riscv_vector.h   | 11 +++
 .../gcc.target/riscv/rvv/autovec/template-1.h |  2 +-
 3 files changed, 13 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/riscv_vector.h

diff --git 
a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/single_rgroup-1.h 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/single_rgroup-1.h
index be6b4c641cbb..f64799d4e2a9 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/single_rgroup-1.h
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/single_rgroup-1.h
@@ -1,5 +1,5 @@
 #include 
-#include 
+#include 
 
 #define N 777
 
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/riscv_vector.h 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/riscv_vector.h
new file mode 100644
index ..fbb4858fc867
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/riscv_vector.h
@@ -0,0 +1,11 @@
+/* Wrapper of riscv_vector.h, prevent riscv_vector.h including stdint.h from
+   C library, that might cause problem on testing RV32 related testcase when
+   we disable multilib.  */
+#ifndef _RISCV_VECTOR_WRAP_H
+
+#define _GCC_WRAP_STDINT_H
+#include "stdint-gcc.h"
+#include_next 
+#define _RISCV_VECTOR_WRAP_H
+
+#endif
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/template-1.h 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/template-1.h
index 799e2d7d7542..074952f21d5f 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/template-1.h
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/template-1.h
@@ -1,5 +1,5 @@
 #include 
-#include 
+#include 
 
 void
 foo0 (int8_t *__restrict f, int16_t *__restrict d, int n)
-- 
2.39.2



[committed] RISC-V: Factor out vector manager code in vsetvli insertion pass. [NFC]

2023-05-08 Thread Kito Cheng via Gcc-patches
gcc/ChangeLog:

* config/riscv/riscv-vsetvl.cc (pass_vsetvl::get_vector_info):
New.
(pass_vsetvl::get_block_info): New.
(pass_vsetvl::update_vector_info): New.
(pass_vsetvl::simple_vsetvl): Use get_vector_info.
(pass_vsetvl::compute_local_backward_infos): Ditto.
(pass_vsetvl::transfer_before): Ditto.
(pass_vsetvl::transfer_after): Ditto.
(pass_vsetvl::emit_local_forward_vsetvls): Ditto.
(pass_vsetvl::local_eliminate_vsetvl_insn): Ditto.
(pass_vsetvl::cleanup_insns): Ditto.
(pass_vsetvl::compute_local_backward_infos): Use
update_vector_info.
---
 gcc/config/riscv/riscv-vsetvl.cc | 87 
 1 file changed, 67 insertions(+), 20 deletions(-)

diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc
index f55907a410e6..d4d6f336ef99 100644
--- a/gcc/config/riscv/riscv-vsetvl.cc
+++ b/gcc/config/riscv/riscv-vsetvl.cc
@@ -2596,7 +2596,15 @@ const pass_data pass_data_vsetvl = {
 class pass_vsetvl : public rtl_opt_pass
 {
 private:
-  class vector_infos_manager *m_vector_manager;
+  vector_infos_manager *m_vector_manager;
+
+  const vector_insn_info _vector_info (const rtx_insn *) const;
+  const vector_insn_info _vector_info (const insn_info *) const;
+  const vector_block_info _block_info (const basic_block) const;
+  const vector_block_info _block_info (const bb_info *) const;
+  vector_block_info _block_info (const basic_block);
+  vector_block_info _block_info (const bb_info *);
+  void update_vector_info (const insn_info *, const vector_insn_info &);
 
   void simple_vsetvl (void) const;
   void lazy_vsetvl (void);
@@ -2647,6 +2655,49 @@ public:
   virtual unsigned int execute (function *) final override;
 }; // class pass_vsetvl
 
+const vector_insn_info &
+pass_vsetvl::get_vector_info (const rtx_insn *i) const
+{
+  return m_vector_manager->vector_insn_infos[INSN_UID (i)];
+}
+
+const vector_insn_info &
+pass_vsetvl::get_vector_info (const insn_info *i) const
+{
+  return m_vector_manager->vector_insn_infos[i->uid ()];
+}
+
+const vector_block_info &
+pass_vsetvl::get_block_info (const basic_block bb) const
+{
+  return m_vector_manager->vector_block_infos[bb->index];
+}
+
+const vector_block_info &
+pass_vsetvl::get_block_info (const bb_info *bb) const
+{
+  return m_vector_manager->vector_block_infos[bb->index ()];
+}
+
+vector_block_info &
+pass_vsetvl::get_block_info (const basic_block bb)
+{
+  return m_vector_manager->vector_block_infos[bb->index];
+}
+
+vector_block_info &
+pass_vsetvl::get_block_info (const bb_info *bb)
+{
+  return m_vector_manager->vector_block_infos[bb->index ()];
+}
+
+void
+pass_vsetvl::update_vector_info (const insn_info *i,
+const vector_insn_info _info)
+{
+  m_vector_manager->vector_insn_infos[i->uid ()] = new_info;
+}
+
 /* Simple m_vsetvl_insert vsetvl for optimize == 0.  */
 void
 pass_vsetvl::simple_vsetvl (void) const
@@ -2667,8 +2718,7 @@ pass_vsetvl::simple_vsetvl (void) const
continue;
  if (has_vtype_op (rinsn))
{
- const auto info
-   = m_vector_manager->vector_insn_infos[INSN_UID (rinsn)];
+ const auto info = get_vector_info (rinsn);
  emit_vsetvl_insn (VSETVL_DISCARD_RESULT, EMIT_BEFORE, info,
NULL_RTX, rinsn);
}
@@ -2688,11 +2738,11 @@ pass_vsetvl::compute_local_backward_infos (const 
bb_info *bb)
 
   for (insn_info *insn : bb->reverse_real_nondebug_insns ())
 {
-  auto &info = m_vector_manager->vector_insn_infos[insn->uid ()];
+  auto &info = get_vector_info (insn);
 
   if (info.uninit_p ())
/* If it is uninitialized, propagate it directly.  */
-   info = change;
+   update_vector_info (insn, change);
   else if (info.unknown_p ())
change = info;
   else
@@ -2704,7 +2754,7 @@ pass_vsetvl::compute_local_backward_infos (const bb_info 
*bb)
&& !reg_available_p (insn, change))
  && change.compatible_p (info))
{
- info = change.merge (info, LOCAL_MERGE);
+ update_vector_info (insn, change.merge (info, LOCAL_MERGE));
  /* Fix PR109399, we should update user vsetvl instruction
 if there is a change in demand fusion.  */
  if (vsetvl_insn_p (insn->rtl ()))
@@ -2744,8 +2794,7 @@ pass_vsetvl::transfer_before (vector_insn_info &info, 
insn_info *insn) const
   if (!has_vtype_op (insn->rtl ()))
 return;
 
-  const vector_insn_info require
-= m_vector_manager->vector_insn_infos[insn->uid ()];
+  const vector_insn_info require = get_vector_info (insn);
   if (info.valid_p () && !need_vsetvl (require, info))
 return;
   info = require;
@@ -2759,7 +2808,7 @@ pass_vsetvl::transfer_after (vector_insn_info &info, 
insn_info *insn) const
 {
   if (vector_config_insn_p (insn->rtl ()))
 {
-  

[committed] Fix minor length computation on stormy16

2023-05-08 Thread Jeff Law via Gcc-patches


Today's build of xstormy16-elf failed due to a branch to an out of range 
target.  Manual inspection of the assembly code for the affected 
function (divdi3) showed that the zero-extension patterns were claiming 
a length of 2, but clearly assembled into 4 bytes.


This patch adds an explicit length to the zero extension pattern and 
appears to resolve the issue in my test builds.


Committed to the trunk,

Jeff
commit 148de3aaac6d2b66c635c76d245c7cd1537fa4e0
Author: Jeff Law 
Date:   Mon May 8 08:28:26 2023 -0600

Fix minor length computation on stormy16

Today's build of xstormy16-elf failed due to a branch to an out of range
target.  Manual inspection of the assembly code for the affected function
(divdi3) showed that the zero-extension patterns were claiming a length
of 2, but clearly assembled into 4 bytes.

This patch adds an explicit length to the zero extension pattern and
appears to resolve the issue in my test builds.

gcc/

* config/stormy16/stormy16.md (zero_extendhisi2): Fix length.

diff --git a/gcc/config/stormy16/stormy16.md b/gcc/config/stormy16/stormy16.md
index 91e4bb1cff7..430ec297e5a 100644
--- a/gcc/config/stormy16/stormy16.md
+++ b/gcc/config/stormy16/stormy16.md
@@ -299,7 +299,8 @@ (define_insn "zero_extendhisi2"
(zero_extend:SI (match_operand:HI 1 "register_operand" "0")))]
   ""
   "mov %h0,#0"
-  [(set_attr "psw_operand" "clobber")])
+  [(set_attr "length" "4")
+   (set_attr "psw_operand" "clobber")])
 
 ;; 
 ;; ::


[PATCH v2] RISC-V: Fix CTZ unnecessary sign extension [PR #106888]

2023-05-08 Thread Raphael Moreira Zinsly
Changes since v1:
- Remove subreg from operand 1.

-- >8 --

We were not able to match the CTZ sign extend pattern on RISC-V
because it gets optimized to zero extend and/or to ANDI patterns.
For the ANDI case, combine scrambles the RTL and generates the
extension by using subregs.

gcc/ChangeLog:
PR target/106888
* config/riscv/bitmanip.md
(disi2): Match with any_extend.
(disi2_sext): New pattern to match
with sign extend using an ANDI instruction.

gcc/testsuite/ChangeLog:
PR target/106888
* gcc.target/riscv/pr106888.c: New test.
* gcc.target/riscv/zbbw.c: Check for ANDI.
---
 gcc/config/riscv/bitmanip.md  | 13 -
 gcc/testsuite/gcc.target/riscv/pr106888.c | 12 
 gcc/testsuite/gcc.target/riscv/zbbw.c |  1 +
 3 files changed, 25 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/riscv/pr106888.c

diff --git a/gcc/config/riscv/bitmanip.md b/gcc/config/riscv/bitmanip.md
index a27fc3e34a1..064b8f7df8c 100644
--- a/gcc/config/riscv/bitmanip.md
+++ b/gcc/config/riscv/bitmanip.md
@@ -246,13 +246,24 @@
 
 (define_insn "*<bitmanip_optab>disi2"
   [(set (match_operand:DI 0 "register_operand" "=r")
-(sign_extend:DI
+(any_extend:DI
   (clz_ctz_pcnt:SI (match_operand:SI 1 "register_operand" "r"))))]
   "TARGET_64BIT && TARGET_ZBB"
   "<bitmanip_insn>w\t%0,%1"
   [(set_attr "type" "<bitmanip_insn>")
 (set_attr "mode" "SI")])
 
+;; A SImode clz_ctz_pcnt may be extended to DImode via subreg.
+(define_insn "*<bitmanip_optab>disi2_sext"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+(and:DI (subreg:DI
+  (clz_ctz_pcnt:SI (match_operand:SI 1 "register_operand" "r")) 0)
+  (match_operand:DI 2 "const_int_operand")))]
+  "TARGET_64BIT && TARGET_ZBB && ((INTVAL (operands[2]) & 0x3f) == 0x3f)"
+  "<bitmanip_insn>w\t%0,%1"
+  [(set_attr "type" "bitmanip")
+   (set_attr "mode" "SI")])
+
 (define_insn "*<bitmanip_optab>di2"
   [(set (match_operand:DI 0 "register_operand" "=r")
 (clz_ctz_pcnt:DI (match_operand:DI 1 "register_operand" "r")))]
diff --git a/gcc/testsuite/gcc.target/riscv/pr106888.c 
b/gcc/testsuite/gcc.target/riscv/pr106888.c
new file mode 100644
index 000..77fb8e5b79c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/pr106888.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gc_zbb -mabi=lp64" } */
+
+int
+ctz (int i)
+{
+  int res = __builtin_ctz (i);
+  return res&0x;
+}
+
+/* { dg-final { scan-assembler-times "ctzw" 1 } } */
+/* { dg-final { scan-assembler-not "andi" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/zbbw.c 
b/gcc/testsuite/gcc.target/riscv/zbbw.c
index 709743c3b68..f7b2b63853f 100644
--- a/gcc/testsuite/gcc.target/riscv/zbbw.c
+++ b/gcc/testsuite/gcc.target/riscv/zbbw.c
@@ -23,3 +23,4 @@ popcount (int i)
 /* { dg-final { scan-assembler-times "clzw" 1 } } */
 /* { dg-final { scan-assembler-times "ctzw" 1 } } */
 /* { dg-final { scan-assembler-times "cpopw" 1 } } */
+/* { dg-final { scan-assembler-not "andi\t" } } */
-- 
2.40.0



[PATCH v2] RISC-V: Add bext pattern for ZBS

2023-05-08 Thread Raphael Moreira Zinsly
Changes since v1: 
- Removed name clash change.
- Fix new pattern indentation.

-- >8 --

When (a & (1 << bit_no)) is tested inside an IF we can use a bit extract.

gcc/ChangeLog:

* config/riscv/bitmanip.md
(branch_bext): New split pattern.

gcc/testsuite/ChangeLog:
* gcc.target/riscv/zbs-bext-02.c: New test.
---
 gcc/config/riscv/bitmanip.md | 23 
 gcc/testsuite/gcc.target/riscv/zbs-bext-02.c | 18 +++
 2 files changed, 41 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/riscv/zbs-bext-02.c

diff --git a/gcc/config/riscv/bitmanip.md b/gcc/config/riscv/bitmanip.md
index a27fc3e34a1..df522073344 100644
--- a/gcc/config/riscv/bitmanip.md
+++ b/gcc/config/riscv/bitmanip.md
@@ -720,6 +720,29 @@
operands[9] = GEN_INT (clearbit);
 })
 
+;; IF_THEN_ELSE: test for (a & (1 << BIT_NO))
+(define_insn_and_split "*branch_bext"
+  [(set (pc)
+   (if_then_else
+ (match_operator 1 "equality_operator"
+ [(zero_extract:X (match_operand:X 2 "register_operand" "r")
+  (const_int 1)
+  (zero_extend:X
+(match_operand:QI 3 "register_operand" "r")))
+   (const_int 0)])
+   (label_ref (match_operand 0 "" ""))
+   (pc)))
+  (clobber (match_scratch:X 4 "=&r"))]
+  "TARGET_ZBS"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 4) (zero_extract:X (match_dup 2)
+   (const_int 1)
+   (zero_extend:X (match_dup 3
+   (set (pc) (if_then_else (match_op_dup 1 [(match_dup 4) (const_int 0)])
+  (label_ref (match_dup 0))
+  (pc)))])
+
 ;; ZBKC or ZBC extension
 (define_insn "riscv_clmul_"
   [(set (match_operand:X 0 "register_operand" "=r")
diff --git a/gcc/testsuite/gcc.target/riscv/zbs-bext-02.c 
b/gcc/testsuite/gcc.target/riscv/zbs-bext-02.c
new file mode 100644
index 000..3f3b8404eca
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/zbs-bext-02.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gc_zbs -mabi=lp64" } */
+/* { dg-skip-if "" { *-*-* } { "-O0" "-Og" "-funroll-loops" } } */
+
+int
+foo(const long long B, int a)
+{
+  long long b = 1;
+  for (int sq = 0; sq < 64; sq++)
+if (B & (b << sq)) 
+  a++;
+
+  return a;
+}
+
+/* { dg-final { scan-assembler-times "bext\t" 1 } } */
+/* { dg-final { scan-assembler-not "bset" } } */
+/* { dg-final { scan-assembler-not "and" } } */
-- 
2.40.0



libgm2: Remove 'autogen.sh' (was: libgm2: Adjust 'autogen.sh' to 'ACLOCAL_AMFLAGS', and simplify)

2023-05-08 Thread Thomas Schwinge
Hi!

On 2023-04-14T13:49:20+0100, Gaius Mulley via Gcc-patches 
 wrote:
> Thomas Schwinge  writes:
>> Separately, given that plain 'autoreconf' works, why have 'autogen.sh' at
>> all?
>
> If autoreconf does the same as autogen.sh then yes this can be removed

Pushed to master branch commit bd6dbdb196da5aa5c7354e0fc7b0a146237bcf8a
"libgm2: Remove 'autogen.sh'".


> (or its contents replaced with a call to autoreconf perhaps?),

I didn't see any advantage in that.


Grüße
 Thomas


-
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 
München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas 
Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht 
München, HRB 106955
>From bd6dbdb196da5aa5c7354e0fc7b0a146237bcf8a Mon Sep 17 00:00:00 2001
From: Thomas Schwinge 
Date: Mon, 8 May 2023 15:53:47 +0200
Subject: [PATCH] libgm2: Remove 'autogen.sh'

... given that plain 'autoreconf' achieves the same.

	libgm2/
	* autogen.sh: Remove.
---
 libgm2/autogen.sh | 30 --
 1 file changed, 30 deletions(-)
 delete mode 100755 libgm2/autogen.sh

diff --git a/libgm2/autogen.sh b/libgm2/autogen.sh
deleted file mode 100755
index 1b144970a570..
--- a/libgm2/autogen.sh
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/bin/sh
-
-# autogen.sh regenerate the autoconf files.
-#   Copyright 2013-2022  Free Software Foundation, Inc.
-#
-# This file is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 3 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; see the file COPYING3.  If not see
-# <http://www.gnu.org/licenses/>.
-
-rm -rf autom4te.cache
-
-# libtoolize
-rm -f aclocal.m4
-aclocal -I .. -I ../config
-autoreconf -I .. -I ../config
-automake --include-deps
-
-rm -rf autom4te.cache
-
-exit 0
-- 
2.39.2



Re: [PATCH] xtensa: Make full transition to LRA

2023-05-08 Thread Richard Biener via Gcc-patches
On Mon, May 8, 2023 at 3:39 PM Takayuki 'January June' Suwa via
Gcc-patches  wrote:
>
> gcc/ChangeLog:
>
> * config/xtensa/constraints.md (R, T, U):
> Change define_constraint to define_memory_constraint.
> * config/xtensa/xtensa.cc
> (xtensa_lra_p, TARGET_LRA_P): Remove.
> (xtensa_emit_move_sequence): Remove "if (reload_in_progress)"
> clause as it can no longer be true.
> (xtensa_output_integer_literal_parts): Consider 16-bit wide
> constants.
> (xtensa_legitimate_constant_p): Add short-circuit path for
> integer load instructions.
> * config/xtensa/xtensa.md (movsf): Use can_create_pseudo_p()
> rather than reload_in_progress and reload_completed.
> * config/xtensa/xtensa.opt (mlra): Remove.
> ---
>  gcc/config/xtensa/constraints.md | 26 --
>  gcc/config/xtensa/xtensa.cc  | 26 +-
>  gcc/config/xtensa/xtensa.md  |  2 +-
>  gcc/config/xtensa/xtensa.opt |  4 
>  4 files changed, 14 insertions(+), 44 deletions(-)
>
> diff --git a/gcc/config/xtensa/constraints.md 
> b/gcc/config/xtensa/constraints.md
> index 53e4d0d8dd1..9b31e162941 100644
> --- a/gcc/config/xtensa/constraints.md
> +++ b/gcc/config/xtensa/constraints.md
> @@ -123,29 +123,19 @@
>(and (match_code "const_int")
>(match_test "! xtensa_split1_finished_p ()"
>
> -;; Memory constraints.  Do not use define_memory_constraint here.  Doing so
> -;; causes reload to force some constants into the constant pool, but since
> -;; the Xtensa constant pool can only be accessed with L32R instructions, it
> -;; is always better to just copy a constant into a register.  Instead, use
> -;; regular constraints but add a check to allow pseudos during reload.
> +;; Memory constraints.
>
> -(define_constraint "R"
> +(define_memory_constraint "R"
>   "Memory that can be accessed with a 4-bit unsigned offset from a register."
> - (ior (and (match_code "mem")
> -  (match_test "smalloffset_mem_p (op)"))
> -  (and (match_code "reg")
> -  (match_test "reload_in_progress
> -   && REGNO (op) >= FIRST_PSEUDO_REGISTER"
> + (and (match_code "mem")
> +  (match_test "smalloffset_mem_p (op)")))
>
> -(define_constraint "T"
> +(define_memory_constraint "T"
>   "Memory in a literal pool (addressable with an L32R instruction)."
>   (and (match_code "mem")
>(match_test "!TARGET_CONST16 && constantpool_mem_p (op)")))
>
> -(define_constraint "U"
> +(define_memory_constraint "U"
>   "Memory that is not in a literal pool."
> - (ior (and (match_code "mem")
> -  (match_test "! constantpool_mem_p (op)"))
> -  (and (match_code "reg")
> -  (match_test "reload_in_progress
> -   && REGNO (op) >= FIRST_PSEUDO_REGISTER"
> + (and (match_code "mem")
> +  (match_test "! constantpool_mem_p (op)")))
> diff --git a/gcc/config/xtensa/xtensa.cc b/gcc/config/xtensa/xtensa.cc
> index 9e5d314e143..f4434ec6e2c 100644
> --- a/gcc/config/xtensa/xtensa.cc
> +++ b/gcc/config/xtensa/xtensa.cc
> @@ -190,7 +190,6 @@ static void xtensa_output_mi_thunk (FILE *file, tree 
> thunk ATTRIBUTE_UNUSED,
> HOST_WIDE_INT delta,
> HOST_WIDE_INT vcall_offset,
> tree function);
> -static bool xtensa_lra_p (void);
>
>  static rtx xtensa_delegitimize_address (rtx);
>
> @@ -286,9 +285,6 @@ static rtx xtensa_delegitimize_address (rtx);
>  #undef TARGET_CANNOT_FORCE_CONST_MEM
>  #define TARGET_CANNOT_FORCE_CONST_MEM xtensa_cannot_force_const_mem
>
> -#undef TARGET_LRA_P
> -#define TARGET_LRA_P xtensa_lra_p
> -
>  #undef TARGET_LEGITIMATE_ADDRESS_P
>  #define TARGET_LEGITIMATE_ADDRESS_Pxtensa_legitimate_address_p
>
> @@ -1266,14 +1262,6 @@ xtensa_emit_move_sequence (rtx *operands, machine_mode 
> mode)
>
>operands[1] = xtensa_copy_incoming_a7 (operands[1]);
>
> -  /* During reload we don't want to emit (subreg:X (mem:Y)) since that
> - instruction won't be recognized after reload, so we remove the
> - subreg and adjust mem accordingly.  */
> -  if (reload_in_progress)
> -{
> -  operands[0] = fixup_subreg_mem (operands[0]);
> -  operands[1] = fixup_subreg_mem (operands[1]);
> -}
>return 0;
>  }
>
> @@ -3196,7 +3184,7 @@ xtensa_output_integer_literal_parts (FILE *file, rtx x, 
> int size)
>fputs (", ", file);
>xtensa_output_integer_literal_parts (file, second, size / 2);
>  }
> -  else if (size == 4)
> +  else if (size == 4 || size == 2)
>  {
>output_addr_const (file, x);
>  }
> @@ -4876,6 +4864,10 @@ xtensa_trampoline_init (rtx m_tramp, tree fndecl, rtx 
> chain)
>  static bool
>  xtensa_legitimate_constant_p (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
>  {
> +  if (CONST_INT_P (x))
> +return TARGET_AUTO_LITPOOLS || TARGET_CONST16
> +  || xtensa_simm12b (INTVAL 

[PATCH] xtensa: Make full transition to LRA

2023-05-08 Thread Takayuki 'January June' Suwa via Gcc-patches
gcc/ChangeLog:

* config/xtensa/constraints.md (R, T, U):
Change define_constraint to define_memory_constraint.
* config/xtensa/xtensa.cc
(xtensa_lra_p, TARGET_LRA_P): Remove.
(xtensa_emit_move_sequence): Remove "if (reload_in_progress)"
clause as it can no longer be true.
(xtensa_output_integer_literal_parts): Consider 16-bit wide
constants.
(xtensa_legitimate_constant_p): Add short-circuit path for
integer load instructions.
* config/xtensa/xtensa.md (movsf): Use can_create_pseudo_p()
rather than reload_in_progress and reload_completed.
* config/xtensa/xtensa.opt (mlra): Remove.
---
 gcc/config/xtensa/constraints.md | 26 --
 gcc/config/xtensa/xtensa.cc  | 26 +-
 gcc/config/xtensa/xtensa.md  |  2 +-
 gcc/config/xtensa/xtensa.opt |  4 
 4 files changed, 14 insertions(+), 44 deletions(-)

diff --git a/gcc/config/xtensa/constraints.md b/gcc/config/xtensa/constraints.md
index 53e4d0d8dd1..9b31e162941 100644
--- a/gcc/config/xtensa/constraints.md
+++ b/gcc/config/xtensa/constraints.md
@@ -123,29 +123,19 @@
   (and (match_code "const_int")
   (match_test "! xtensa_split1_finished_p ()"
 
-;; Memory constraints.  Do not use define_memory_constraint here.  Doing so
-;; causes reload to force some constants into the constant pool, but since
-;; the Xtensa constant pool can only be accessed with L32R instructions, it
-;; is always better to just copy a constant into a register.  Instead, use
-;; regular constraints but add a check to allow pseudos during reload.
+;; Memory constraints.
 
-(define_constraint "R"
+(define_memory_constraint "R"
  "Memory that can be accessed with a 4-bit unsigned offset from a register."
- (ior (and (match_code "mem")
-  (match_test "smalloffset_mem_p (op)"))
-  (and (match_code "reg")
-  (match_test "reload_in_progress
-   && REGNO (op) >= FIRST_PSEUDO_REGISTER"
+ (and (match_code "mem")
+  (match_test "smalloffset_mem_p (op)")))
 
-(define_constraint "T"
+(define_memory_constraint "T"
  "Memory in a literal pool (addressable with an L32R instruction)."
  (and (match_code "mem")
   (match_test "!TARGET_CONST16 && constantpool_mem_p (op)")))
 
-(define_constraint "U"
+(define_memory_constraint "U"
  "Memory that is not in a literal pool."
- (ior (and (match_code "mem")
-  (match_test "! constantpool_mem_p (op)"))
-  (and (match_code "reg")
-  (match_test "reload_in_progress
-   && REGNO (op) >= FIRST_PSEUDO_REGISTER"
+ (and (match_code "mem")
+  (match_test "! constantpool_mem_p (op)")))
diff --git a/gcc/config/xtensa/xtensa.cc b/gcc/config/xtensa/xtensa.cc
index 9e5d314e143..f4434ec6e2c 100644
--- a/gcc/config/xtensa/xtensa.cc
+++ b/gcc/config/xtensa/xtensa.cc
@@ -190,7 +190,6 @@ static void xtensa_output_mi_thunk (FILE *file, tree thunk 
ATTRIBUTE_UNUSED,
HOST_WIDE_INT delta,
HOST_WIDE_INT vcall_offset,
tree function);
-static bool xtensa_lra_p (void);
 
 static rtx xtensa_delegitimize_address (rtx);
 
@@ -286,9 +285,6 @@ static rtx xtensa_delegitimize_address (rtx);
 #undef TARGET_CANNOT_FORCE_CONST_MEM
 #define TARGET_CANNOT_FORCE_CONST_MEM xtensa_cannot_force_const_mem
 
-#undef TARGET_LRA_P
-#define TARGET_LRA_P xtensa_lra_p
-
 #undef TARGET_LEGITIMATE_ADDRESS_P
 #define TARGET_LEGITIMATE_ADDRESS_Pxtensa_legitimate_address_p
 
@@ -1266,14 +1262,6 @@ xtensa_emit_move_sequence (rtx *operands, machine_mode 
mode)
 
   operands[1] = xtensa_copy_incoming_a7 (operands[1]);
 
-  /* During reload we don't want to emit (subreg:X (mem:Y)) since that
- instruction won't be recognized after reload, so we remove the
- subreg and adjust mem accordingly.  */
-  if (reload_in_progress)
-{
-  operands[0] = fixup_subreg_mem (operands[0]);
-  operands[1] = fixup_subreg_mem (operands[1]);
-}
   return 0;
 }
 
@@ -3196,7 +3184,7 @@ xtensa_output_integer_literal_parts (FILE *file, rtx x, 
int size)
   fputs (", ", file);
   xtensa_output_integer_literal_parts (file, second, size / 2);
 }
-  else if (size == 4)
+  else if (size == 4 || size == 2)
 {
   output_addr_const (file, x);
 }
@@ -4876,6 +4864,10 @@ xtensa_trampoline_init (rtx m_tramp, tree fndecl, rtx 
chain)
 static bool
 xtensa_legitimate_constant_p (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
 {
+  if (CONST_INT_P (x))
+return TARGET_AUTO_LITPOOLS || TARGET_CONST16
+  || xtensa_simm12b (INTVAL (x));
+
   return !xtensa_tls_referenced_p (x);
 }
 
@@ -5317,12 +5309,4 @@ xtensa_delegitimize_address (rtx op)
   return op;
 }
 
-/* Implement TARGET_LRA_P.  */
-
-static bool
-xtensa_lra_p (void)
-{
-  return TARGET_LRA;
-}
-
 #include "gt-xtensa.h"
diff --git a/gcc/config/xtensa/xtensa.md 

Ping^8: [PATCH] jit: Install jit headers in $(libsubincludedir) [PR 101491]

2023-05-08 Thread Lorenzo Salvadore via Gcc-patches
Hello,

Ping https://gcc.gnu.org/pipermail/gcc-patches/2022-November/606450.html

Thanks,

Lorenzo Salvadore

> From f8e2c2ee89a7d8741bb65163d1f1c20edcd546ac Mon Sep 17 00:00:00 2001
> From: Lorenzo Salvadore develo...@lorenzosalvadore.it
>
> Date: Wed, 16 Nov 2022 11:27:38 +0100
> Subject: [PATCH] jit: Install jit headers in $(libsubincludedir) [PR 101491]
>
> Installing jit/libgccjit.h and jit/libgccjit++.h headers in
> $(includedir) can be a problem for machines where multiple versions of
> GCC are required simultaneously, see for example this bug report on
> FreeBSD:
>
> https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=257060
>
> Hence,
>
> - define $(libsubincludedir) the same way it is defined in libgomp;
> - install jit/libgccjit.h and jit/libgccjit++.h in $(libsubincludedir).
>
> The patch has already been applied successfully in the official FreeBSD
> ports tree for the ports lang/gcc11 and lang/gcc12. Please see the
> following commits:
>
> https://cgit.freebsd.org/ports/commit/?id=0338e04504ee269b7a95e6707f1314bc1c4239fe
> https://cgit.freebsd.org/ports/commit/?id=f1957296ed2dce8a09bb9582e9a5a715bf8b3d4d
>
> gcc/ChangeLog:
>
> 2022-11-16 Lorenzo Salvadore develo...@lorenzosalvadore.it
>
> PR jit/101491
> * Makefile.in: Define and create $(libsubincludedir)
>
> gcc/jit/ChangeLog:
>
> 2022-11-16 Lorenzo Salvadore develo...@lorenzosalvadore.it
>
> PR jit/101491
> * Make-lang.in: Install headers in $(libsubincludedir)
> ---
> gcc/Makefile.in | 3 +++
> gcc/jit/Make-lang.in | 4 ++--
> 2 files changed, 5 insertions(+), 2 deletions(-)
>
> diff --git a/gcc/Makefile.in b/gcc/Makefile.in
> index f672e6ea549..3bcf1c491ab 100644
> --- a/gcc/Makefile.in
> +++ b/gcc/Makefile.in
> @@ -635,6 +635,8 @@ libexecdir = @libexecdir@
>
> # Directory in which the compiler finds libraries etc.
> libsubdir = 
> $(libdir)/gcc/$(real_target_noncanonical)/$(version)$(accel_dir_suffix)
> +# Directory in which the compiler finds headers.
> +libsubincludedir = $(libdir)/gcc/$(target_alias)/$(version)/include
> # Directory in which the compiler finds executables
> libexecsubdir = 
> $(libexecdir)/gcc/$(real_target_noncanonical)/$(version)$(accel_dir_suffix)
> # Directory in which all plugin resources are installed
> @@ -3642,6 +3644,7 @@ install-cpp: installdirs cpp$(exeext)
> # $(libdir)/gcc/include isn't currently searched by cpp.
> installdirs:
> $(mkinstalldirs) $(DESTDIR)$(libsubdir)
> + $(mkinstalldirs) $(DESTDIR)$(libsubincludedir)
> $(mkinstalldirs) $(DESTDIR)$(libexecsubdir)
> $(mkinstalldirs) $(DESTDIR)$(bindir)
> $(mkinstalldirs) $(DESTDIR)$(includedir)
> diff --git a/gcc/jit/Make-lang.in b/gcc/jit/Make-lang.in
> index 248ec45b729..ba1b3e95da5 100644
> --- a/gcc/jit/Make-lang.in
> +++ b/gcc/jit/Make-lang.in
> @@ -360,9 +360,9 @@ selftest-jit:
> # Install hooks:
> jit.install-headers: installdirs
> $(INSTALL_DATA) $(srcdir)/jit/libgccjit.h \
> - $(DESTDIR)$(includedir)/libgccjit.h
> + $(DESTDIR)$(libsubincludedir)/libgccjit.h
> $(INSTALL_DATA) $(srcdir)/jit/libgccjit++.h \
> - $(DESTDIR)$(includedir)/libgccjit++.h
> + $(DESTDIR)$(libsubincludedir)/libgccjit++.h
>
> ifneq (,$(findstring mingw,$(target)))
> jit.install-common: installdirs jit.install-headers
> --
> 2.38.0


Re: [PATCH] ira: Don't create copies for earlyclobbered pairs

2023-05-08 Thread Vladimir Makarov via Gcc-patches



On 5/5/23 12:59, Richard Sandiford wrote:

This patch follows on from g:9f635bd13fe9e85872e441b6f3618947f989909a
("the previous patch").  To start by quoting that:

If an insn requires two operands to be tied, and the input operand dies
in the insn, IRA acts as though there were a copy from the input to the
output with the same execution frequency as the insn.  Allocating the
same register to the input and the output then saves the cost of a move.

If there is no such tie, but an input operand nevertheless dies
in the insn, IRA creates a similar move, but with an eighth of the
frequency.  This helps to ensure that chains of instructions reuse
registers in a natural way, rather than using arbitrarily different
registers for no reason.

This heuristic seems to work well in the vast majority of cases.
However, the problem fixed in the previous patch was that we
could create a copy for an operand pair even if, for all relevant
alternatives, the output and input register classes did not have
any registers in common.  It is then impossible for the output
operand to reuse the dying input register.

This left unfixed a further case where copies don't make sense:
there is no point trying to reuse the dying input register if,
for all relevant alternatives, the output is earlyclobbered and
the input doesn't match the output.  (Matched earlyclobbers are fine.)

Handling that case fixes several existing XFAILs and helps with
a follow-on aarch64 patch.

Tested on aarch64-linux-gnu and x86_64-linux-gnu.  A SPEC2017 run
on aarch64 showed no differences outside the noise.  Also, I tried
compiling gcc.c-torture, gcc.dg, and g++.dg for at least one target
per cpu directory, using the options -Os -fno-schedule-insns{,2}.
The results below summarise the tests that showed a difference in LOC:

Target   Tests   GoodBad   DeltaBest   Worst  Median
==   =   ===   =   =  ==
amdgcn-amdhsa   14  7  7   3 -18  10  -1
arm-linux-gnueabihf 16 15  1 -22  -4   2  -1
csky-elf 6  6  0 -21  -6  -2  -4
hppa64-hp-hpux11.23  5  5  0  -7  -2  -1  -1
ia64-linux-gnu  16 16  0 -70 -15  -1  -3
m32r-elf53  1 52  64  -2   8   1
mcore-elf2  2  0  -8  -6  -2  -6
microblaze-elf 285283  2-909 -68   4  -1
mmix 7  7  0   -2101   -2091  -1  -1
msp430-elf   1  1  0  -4  -4  -4  -4
pru-elf  8  6  2 -12  -6   2  -2
rx-elf  22 18  4 -40  -5   6  -2
sparc-linux-gnu 15 14  1 -40  -8   1  -2
sparc-wrs-vxworks   15 14  1 -40  -8   1  -2
visium-elf   2  1  1   0  -2   2  -2
xstormy16-elf1  1  0  -2  -2  -2  -2

with other targets showing no sensitivity to the patch.  The only
target that seems to be negatively affected is m32r-elf; otherwise
the patch seems like an extremely minor but still clear improvement.

OK to install?


Yes, Richard.

Thank you for measuring the patch effect.  I wish other people would do 
the same for patches affecting generated code performance.




GCC 12.3.1 Status Report (2023-05-08)

2023-05-08 Thread Richard Biener via Gcc-patches
Status
==

The gcc-12 branch is again open for regression and documentation fixes.


Quality Data


Priority  #   Change from last report
---   ---
P10
P2  501   -   1
P3   54   +   5
P4  232
P5   24
---   ---
Total P1-P3 555   +   4
Total   811   +   4


Previous Report
===

https://gcc.gnu.org/pipermail/gcc/2023-May/241235.html


Re: [PATCH] Bump up precision size to 16 bits.

2023-05-08 Thread Richard Biener via Gcc-patches
On Fri, Feb 3, 2023 at 8:34 AM Richard Biener
 wrote:
>
> On Thu, Feb 2, 2023 at 6:39 PM Michael Meissner via Gcc-patches
>  wrote:
> >
> > The new __dmr type that is being added as a possible future PowerPC 
> > instruction
>
> "is being added" means this feature is already in GCC 13?
>
> > set bumps into a structure field size issue.  The size of the __dmr type is 
> > 1024 bits.
> > The precision field in tree_type_common is currently 10 bits, so if you 
> > store
> > 1,024 into field, you get a 0 back.  When you get 0 in the precision field, 
> > the
> > ccp pass passes this 0 to sext_hwi in hwint.h.  That function in turn 
> > generates
> > a shift that is equal to the host wide int bit size, which is undefined as
> > machine dependent for shifting in C/C++.
> >
> >   int shift = HOST_BITS_PER_WIDE_INT - prec;
> >   return ((HOST_WIDE_INT) ((unsigned HOST_WIDE_INT) src << shift)) >> 
> > shift;
> >
> > It turns out the x86_64 where I first did my tests returns the original 
> > input
> > before the two shifts, while the PowerPC always returns 0.  In the ccp 
> > pass, the
> > original input is -1, and so it worked.  When I did the runs on the 
> > PowerPC, the
> > result was 0, which ultimately led to the failure.
> >
> > In addition, once the precision field is larger, it will help PR C/102989 
> > (C2x
> > _BigInt) as well as the implementation of the SET_TYPE_VECTOR_SUBPARTS 
> > macro.
> >
> > I bootstraped various PowerPC compilers (power10 LE, power9 LE, power8 BE)
> > along with an x86_64 build.  There were no regressions.  My proposed patches
> > for the __dmr type now run fine.  Can I install this into the master branch 
> > for
> > GCC 13?
>
> ... because since we're in stage4 this should fix a regression or at least a
> bug that's like ice-on-valid or wrong-code?
>
> Definitely OK for stage1.

I have pushed this now after bootstrapping / testing on
x86_64-unknown-linux-gnu.

Richard.

> Thanks,
> Richard.
>
> > 2023-02-02   Richard Biener  
> >  Michael Meissner  
> >
> > gcc/
> >
> > PR middle-end/108623
> > * hwint.h (sext_hwi): Add assertion against precision 0.
> > * tree-core.h (tree_type_common): Bump up precision field to 16 
> > bits.
> > Align bit fields > 1 bit to at least an 8-bit boundary.
> > ---
> >  gcc/hwint.h |  1 +
> >  gcc/tree-core.h | 24 
> >  2 files changed, 13 insertions(+), 12 deletions(-)
> >
> > diff --git a/gcc/hwint.h b/gcc/hwint.h
> > index e31aa006fa4..ba92efbfc25 100644
> > --- a/gcc/hwint.h
> > +++ b/gcc/hwint.h
> > @@ -277,6 +277,7 @@ ctz_or_zero (unsigned HOST_WIDE_INT x)
> >  static inline HOST_WIDE_INT
> >  sext_hwi (HOST_WIDE_INT src, unsigned int prec)
> >  {
> > +  gcc_checking_assert (prec != 0);
> >if (prec == HOST_BITS_PER_WIDE_INT)
> >  return src;
> >else
> > diff --git a/gcc/tree-core.h b/gcc/tree-core.h
> > index 8124a1328d4..b71748c6c02 100644
> > --- a/gcc/tree-core.h
> > +++ b/gcc/tree-core.h
> > @@ -1686,18 +1686,8 @@ struct GTY(()) tree_type_common {
> >tree attributes;
> >unsigned int uid;
> >
> > -  unsigned int precision : 10;
> > -  unsigned no_force_blk_flag : 1;
> > -  unsigned needs_constructing_flag : 1;
> > -  unsigned transparent_aggr_flag : 1;
> > -  unsigned restrict_flag : 1;
> > -  unsigned contains_placeholder_bits : 2;
> > -
> > +  unsigned int precision : 16;
> >ENUM_BITFIELD(machine_mode) mode : 8;
> > -
> > -  /* TYPE_STRING_FLAG for INTEGER_TYPE and ARRAY_TYPE.
> > - TYPE_CXX_ODR_P for RECORD_TYPE and UNION_TYPE.  */
> > -  unsigned string_flag : 1;
> >unsigned lang_flag_0 : 1;
> >unsigned lang_flag_1 : 1;
> >unsigned lang_flag_2 : 1;
> > @@ -1713,12 +1703,22 @@ struct GTY(()) tree_type_common {
> >   so we need to store the value 32 (not 31, as we need the zero
> >   as well), hence six bits.  */
> >unsigned align : 6;
> > +  /* TYPE_STRING_FLAG for INTEGER_TYPE and ARRAY_TYPE.
> > + TYPE_CXX_ODR_P for RECORD_TYPE and UNION_TYPE.  */
> > +  unsigned string_flag : 1;
> > +  unsigned no_force_blk_flag : 1;
> > +
> >unsigned warn_if_not_align : 6;
> > +  unsigned needs_constructing_flag : 1;
> > +  unsigned transparent_aggr_flag : 1;
> > +
> > +  unsigned contains_placeholder_bits : 2;
> > +  unsigned restrict_flag : 1;
> >unsigned typeless_storage : 1;
> >unsigned empty_flag : 1;
> >unsigned indivisible_p : 1;
> >unsigned no_named_args_stdarg_p : 1;
> > -  unsigned spare : 15;
> > +  unsigned spare : 9;
> >
> >alias_set_type alias_set;
> >tree pointer_to;
> > --
> > 2.39.1
> >
> >
> > --
> > Michael Meissner, IBM
> > PO Box 98, Ayer, Massachusetts, USA, 01432
> > email: meiss...@linux.ibm.com


Re: [PATCH v2 0/3] c++: Track lifetimes in constant evaluation [PR70331, ...]

2023-05-08 Thread Nathaniel Shead via Gcc-patches
Just pinging in case this fix has fallen through the cracks.

https://gcc.gnu.org/pipermail/gcc-patches/2023-March/614811.html

On Wed, Mar 29, 2023 at 1:33 PM Nathaniel Shead
 wrote:
>
> This is an update of the patch series at
> https://gcc.gnu.org/pipermail/gcc-patches/2023-March/614759.html
>
> The main change is modifying the first patch to store the "expired" flag
> in the C++-specific lang_decl_base struct instead of tree_decl_common.
> The second and third patches to improve diagnostic locations are
> otherwise unchanged.
>
> Bootstrapped and regression tested on x86_64 linux.
>
> Nathaniel
>
> ---
>
> Nathaniel Shead (3):
>   c++: Track lifetimes in constant evaluation [PR70331,PR96630,PR98675]
>   c++: Improve constexpr error for dangling local variables
>   c++: Improve location information in constexpr evaluation
>
>  gcc/cp/constexpr.cc   | 152 --
>  gcc/cp/cp-tree.h  |  10 +-
>  gcc/cp/module.cc  |   2 +
>  gcc/cp/semantics.cc   |   5 +-
>  gcc/cp/typeck.cc  |   5 +-
>  gcc/testsuite/g++.dg/cpp0x/constexpr-48089.C  |  10 +-
>  gcc/testsuite/g++.dg/cpp0x/constexpr-diag3.C  |   2 +-
>  gcc/testsuite/g++.dg/cpp0x/constexpr-ice20.C  |   2 +-
>  gcc/testsuite/g++.dg/cpp1y/constexpr-89481.C  |   3 +-
>  .../g++.dg/cpp1y/constexpr-lifetime1.C|  14 ++
>  .../g++.dg/cpp1y/constexpr-lifetime2.C|  20 +++
>  .../g++.dg/cpp1y/constexpr-lifetime3.C|  13 ++
>  .../g++.dg/cpp1y/constexpr-lifetime4.C|  11 ++
>  .../g++.dg/cpp1y/constexpr-lifetime5.C|  11 ++
>  gcc/testsuite/g++.dg/cpp1y/constexpr-union5.C |   4 +-
>  gcc/testsuite/g++.dg/cpp1y/pr68180.C  |   4 +-
>  .../g++.dg/cpp1z/constexpr-lambda6.C  |   4 +-
>  gcc/testsuite/g++.dg/cpp2a/bit-cast11.C   |  10 +-
>  gcc/testsuite/g++.dg/cpp2a/bit-cast12.C   |  10 +-
>  gcc/testsuite/g++.dg/cpp2a/bit-cast14.C   |  14 +-
>  gcc/testsuite/g++.dg/cpp2a/constexpr-98122.C  |   4 +-
>  .../g++.dg/cpp2a/constexpr-dynamic17.C|   5 +-
>  gcc/testsuite/g++.dg/cpp2a/constexpr-init1.C  |   5 +-
>  gcc/testsuite/g++.dg/cpp2a/constexpr-new12.C  |   6 +-
>  gcc/testsuite/g++.dg/cpp2a/constexpr-new3.C   |  10 +-
>  gcc/testsuite/g++.dg/ext/constexpr-vla2.C |   4 +-
>  gcc/testsuite/g++.dg/ext/constexpr-vla3.C |   4 +-
>  gcc/testsuite/g++.dg/ubsan/pr63956.C  |   4 +-
>  .../g++.dg/warn/Wreturn-local-addr-6.C|   3 -
>  .../25_algorithms/equal/constexpr_neg.cc  |   7 +-
>  30 files changed, 246 insertions(+), 112 deletions(-)
>  create mode 100644 gcc/testsuite/g++.dg/cpp1y/constexpr-lifetime1.C
>  create mode 100644 gcc/testsuite/g++.dg/cpp1y/constexpr-lifetime2.C
>  create mode 100644 gcc/testsuite/g++.dg/cpp1y/constexpr-lifetime3.C
>  create mode 100644 gcc/testsuite/g++.dg/cpp1y/constexpr-lifetime4.C
>  create mode 100644 gcc/testsuite/g++.dg/cpp1y/constexpr-lifetime5.C
>
> --
> 2.34.1
>


Re: Support parallel testing in libgomp, part I [PR66005]

2023-05-08 Thread Thomas Schwinge
Hi Bernhard!

Thanks for your comments.


On 2023-05-06T16:15:45+0200, Bernhard Reutner-Fischer via Gcc-patches 
 wrote:
> On Fri, 5 May 2023 10:55:41 +0200
> Thomas Schwinge  wrote:
>
>> >> > with a minimal change
>> >> > to libgomp.exp so the generated libgomp-test-support.exp file is found
>> >> > in both the sequential and parallel cases.  This isn't an issue in
>> >> > libstdc++ since all necessary variables are stored in a single
>> >> > site.exp.
>>
>> ... in 'libgomp/testsuite/lib/libgomp.exp', I've changed:
>>
>> -load_file libgomp-test-support.exp
>> +# Search in both .. and . to support parallel and sequential testing.
>> +load_file -1 ../libgomp-test-support.exp libgomp-test-support.exp
>>
>> ... into the more explicit:
>>
>> -load_file libgomp-test-support.exp
>> +# Search in '..' vs. '.' to support parallel vs. sequential testing.
>> +if [info exists ::env(GCC_RUNTEST_PARALLELIZE_DIR)] {
>> +load_file ../libgomp-test-support.exp
>> +} else {
>> +load_file libgomp-test-support.exp
>> +}
>
> Do we have to re-read those? Otherwise this would be load_lib:

Indeed there is some confusion there; conceptually the content of that
file has to be expected to vary per libgomp (multilib) build variant.
On the other hand, given the GCC target libraries testing setup, we're
running testing always via the default variant's build, so always read
the default variant's file.  I agree that should get un-confused, however
it's a pre-existing issue that I suggest we tackle independently of this
mechanical change.

> We have libdirs in the minimum deja we require.
>
> Speaking of which. IIRC i additionally deleted all load_gcc_lib:
> https://gcc.gnu.org/legacy-ml/fortran/2012-03/msg00094.html
> in lib{atomic,ffi,go,gomp,itm,phobos,stdc++-v3,vtv}

Aha, another pre-existing/independent thing to look into.


>> And, for now, I hard-code the number of parallel slots to one.  This
>> means that libgomp for 'make -j' now does use the parallel testing code
>> paths, but is restricted to just one slot.  That is, no actual change in
>> behavior, other than that 'libgomp.sum' then is filtered through
>> 'contrib/dg-extract-results.sh'.
>>
>> OK to push the attached
>> "Support parallel testing in libgomp, part I [PR66005]"?
>
> Some cosmetic nits.
> See Jakubs one_to_.

Thanks.  That got installed while I'd already finished my patch.  ;-)
Note that the new 'one_to_' etc. is just the existing
'check_p_numbers' etc. renamed and moved, as it now has a second use.
I'm happy to incorporate that into my changes -- if we agree that it
should also go into other GCC target libraries' parallel testing code:
libphobos, libstdc++-v3.

> + @test ! -f $*/site.exp || mv $*/site.exp $*/site.bak
> that's twisted
>
> +   rm -rf libgomp-parallel || true; \
>
> just || :; \
> I count 4 times.
>
> There seems to be a mixture of ${PWD_COMMAND} and am__cd && pwd:
> + @objdir=`${PWD_COMMAND}`/$*; \
> + srcdir=`$(am__cd) $(srcdir) && pwd`; export srcdir; \
>
> + runtest=$(_RUNTEST); \
> + if [ -z "$$runtest" ]; then runtest=runtest; fi; \
> I think I have plain $${RUNTEST-runtest}
> off the default wildcard $(top_srcdir)/../dejagnu/runtest

As I've written: "libstdc++ parallel testing infrastructure [...] is
where these changes have been copied from", and that one had it copied
from GCC compilers parallel testing infrastructure, I presume.  I do
agree that there's value in unifying and, as feasible, simplifying this
code, but again that to me sounds like a pre-existing/separate thing to
do?


>> >> It is far from trivial though.
>> >> The point is that most of the OpenMP tests are parallelized with the
>> >> default OMP_NUM_THREADS, so running the tests in parallel oversubscribes 
>> >> the
>> >> machine a lot, the higher number of hw threads the more.
>> >
>> > Do you agree that we have two classes of test cases in libgomp: 1) test
>> > cases that don't place a considerably higher load on the machine compared
>> > to "normal" (single-threaded) execution tests, because they're just
>> > testing some functionality that is not expected to actively depend
>> > on/interfere with parallelism.  If needed, and/or if not already done,
>> > such test cases can be parameterized (OMP_NUM_THREADS, OpenACC num_gangs,
>> > num_workers, vector_length clauses, and so on) for low parallelism
>> > levels.  And, 2) test cases that place a considerably higher load on the
>> > machine compared to "normal" (single-threaded) execution tests, because
>> > they're testing some functionality that actively depends on/interferes
>> > with some kind of parallelism.  What about marking such tests specially,
>> > such that DejaGnu will only ever schedule one of them for execution at
>> > the same time?  For example, a new dg-* directive to run them wrapped
>> > through »flock [libgomp/testsuite/serial.lock] [a.out]« or some such?
>
> I think we all agree one that, yes.

Conceptually, yes.  However, 

Re: [PATCH v4] libgfortran: Replace mutex with rwlock

2023-05-08 Thread Bernhard Reutner-Fischer via Gcc-patches
On Mon,  8 May 2023 17:44:43 +0800
Lipeng Zhu  wrote:

> This patch try to introduce the rwlock and split the read/write to
> unit_root tree and unit_cache with rwlock instead of the mutex to
> increase CPU efficiency. In the get_gfc_unit function, the percentage
> to step into the insert_unit function is around 30%, in most instances,
> we can get the unit in the phase of reading the unit_cache or unit_root
> tree. So split the read/write phase by rwlock would be an approach to
> make it more parallel.
> 
> BTW, the IPC metrics can gain around 9x in our test
> server with 220 cores. The benchmark we used is
> https://github.com/rwesson/NEAT

See commentary typos below.
You did not state whether you regression-tested the patch.
Other than that it LGTM, but I cannot approve it.

> diff --git a/libgfortran/io/async.h b/libgfortran/io/async.h
> index ad226c8e856..0033cc74252 100644
> --- a/libgfortran/io/async.h
> +++ b/libgfortran/io/async.h
> @@ -210,6 +210,128 @@
>  DEBUG_PRINTF ("%s" DEBUG_RED "ACQ:" DEBUG_NORM " %-30s %78p\n", 
> aio_prefix, #mutex, mutex); \
>} while (0)
>  
> +#ifdef __GTHREAD_RWLOCK_INIT
> +#define RWLOCK_DEBUG_ADD(rwlock) do {\
> +aio_rwlock_debug *n; \
> +n = xmalloc (sizeof(aio_rwlock_debug));  \

Missing space before the open brace: sizeof (

> diff --git a/libgfortran/io/unit.c b/libgfortran/io/unit.c
> index 82664dc5f98..62f1db21d34 100644
> --- a/libgfortran/io/unit.c
> +++ b/libgfortran/io/unit.c
> @@ -33,34 +33,36 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  
> If not, see
>  
>  
>  /* IO locking rules:
> -   UNIT_LOCK is a master lock, protecting UNIT_ROOT tree and UNIT_CACHE.
> +   UNIT_RWLOCK is a master lock, protecting UNIT_ROOT tree and UNIT_CACHE.
> +   And use the rwlock to spilt read and write phase to UNIT_ROOT tree
> +   and UNIT_CACHE to increase CPU efficiency.

s/spilt/split. Maybe:

Using an rwlock improves efficiency by allowing us to separate readers
and writers of both UNIT_ROOT and UNIT_CACHE.

> @@ -350,6 +356,17 @@ retry:
>if (c == 0)
>   break;
>  }
> +  /* We did not find a unit in the cache nor in the unit list, create a new
> +(locked) unit and insert into the unit list and cache.
> +Manipulating either or both the unit list and the unit cache requires to
> +hold a write-lock [for obvious reasons]:
> +1. By separating the read/write lock, it will greatly reduce the 
> contention
> +   at the read part, while write part is not always necessary or most
> +   unlikely once the unit hit in cache.

+By separating the read/write lock, we will greatly reduce the contention
+on the read part, while the write part is unlikely once the unit hits
+the cache.

> +2. We try to balance the implementation complexity and the performance
> +   gains that fit into current cases we observed by just using a
> +   pthread_rwlock. */

Let's drop 2.
thanks,


Re: [PATCH v3] libgfortran: Replace mutex with rwlock

2023-05-08 Thread Bernhard Reutner-Fischer via Gcc-patches
On Mon, 8 May 2023 17:31:27 +0800
"Zhu, Lipeng"  wrote:

> > BTW, did you look at the RELEASE semantics, respectively the note that one 
> > day (and now is that very
> > day), we might improve on the release semantics? Can of course be 
> > incremental AFAIC
> >   
> Not quite understand your question about looking at the RELEASE 
> semantics. Can you be more specific?

libgfortran/io/io.h: could be further optimized by making this be an 
__ATOMIC_RELEASE,

But let's defer that to a follow-up improvement independently of the
rwlock.
thanks,


Re: [PATCH] RISC-V: Optimize vsetvli of LCM INSERTED edge for user vsetvli [PR 109743]

2023-05-08 Thread Kito Cheng via Gcc-patches
I am wondering if it is possible to do this on
local_eliminate_vsetvl_insn? I feel this is sort of local elimination,
so putting them together would be better than handling that in many
different places.

On Mon, May 8, 2023 at 9:35 AM juzhe.zh...@rivai.ai
 wrote:
>
> Gentle ping this patch.
>
> Is this Ok for trunk? Thanks.
>
>
> juzhe.zh...@rivai.ai
>
> From: juzhe.zhong
> Date: 2023-05-06 19:14
> To: gcc-patches
> CC: kito.cheng; Juzhe-Zhong
> Subject: [PATCH] RISC-V: Optimize vsetvli of LCM INSERTED edge for user 
> vsetvli [PR 109743]
> From: Juzhe-Zhong 
>
> This patch is fixing: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109743.
>
> This issue happens because we are currently very conservative in the
> optimization of user vsetvli.
>
> Consider this following case:
>
> bb 1:
>   vsetvli a5,a4... (demand AVL = a4).
> bb 2:
>   RVV insn use a5 (demand AVL = a5).
>
> LCM will hoist vsetvl of bb 2 into bb 1.
> We don't do AVL propagation for this situation since it's complicated:
> we would need to analyze the code sequence between the vsetvli in bb 1 and
> the RVV insn in bb 2.
> They are not necessarily consecutive blocks.
>
> This patch does the optimization after LCM: we check and eliminate the
> vsetvli
> on an LCM-inserted edge if such a vsetvli is redundant. Such an approach is
> much simpler and safe.
>
> code:
> void
> foo2 (int32_t *a, int32_t *b, int n)
> {
>   if (n <= 0)
>   return;
>   int i = n;
>   size_t vl = __riscv_vsetvl_e32m1 (i);
>
>   for (; i >= 0; i--)
>   {
> vint32m1_t v = __riscv_vle32_v_i32m1 (a, vl);
> __riscv_vse32_v_i32m1 (b, v, vl);
>
> if (i >= vl)
>   continue;
>
> if (i == 0)
>   return;
>
> vl = __riscv_vsetvl_e32m1 (i);
>   }
> }
>
> Before this patch:
> foo2:
> .LFB2:
> .cfi_startproc
> ble a2,zero,.L1
> mv  a4,a2
> li  a3,-1
> vsetvli a5,a2,e32,m1,ta,mu
> vsetvli zero,a5,e32,m1,ta,ma  <- can be eliminated.
> .L5:
> vle32.v v1,0(a0)
> vse32.v v1,0(a1)
> bgeua4,a5,.L3
> .L10:
> beq a2,zero,.L1
> vsetvli a5,a4,e32,m1,ta,mu
> addia4,a4,-1
> vsetvli zero,a5,e32,m1,ta,ma  <- can be eliminated.
> vle32.v v1,0(a0)
> vse32.v v1,0(a1)
> addiw   a2,a2,-1
> bltua4,a5,.L10
> .L3:
> addiw   a2,a2,-1
> addia4,a4,-1
> bne a2,a3,.L5
> .L1:
> ret
>
> After this patch:
> f:
> ble a2,zero,.L1
> mv  a4,a2
> li  a3,-1
> vsetvli a5,a2,e32,m1,ta,ma
> .L5:
> vle32.v v1,0(a0)
> vse32.v v1,0(a1)
> bgeua4,a5,.L3
> .L10:
> beq a2,zero,.L1
> vsetvli a5,a4,e32,m1,ta,ma
> addia4,a4,-1
> vle32.v v1,0(a0)
> vse32.v v1,0(a1)
> addiw   a2,a2,-1
> bltua4,a5,.L10
> .L3:
> addiw   a2,a2,-1
> addia4,a4,-1
> bne a2,a3,.L5
> .L1:
> ret
>
> PR target/109743
>
> gcc/ChangeLog:
>
> * config/riscv/riscv-vsetvl.cc (pass_vsetvl::commit_vsetvls): Add 
> optimization for LCM inserted edge.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/riscv/rvv/vsetvl/pr109743-1.c: New test.
> * gcc.target/riscv/rvv/vsetvl/pr109743-2.c: New test.
> * gcc.target/riscv/rvv/vsetvl/pr109743-3.c: New test.
> * gcc.target/riscv/rvv/vsetvl/pr109743-4.c: New test.
>
> ---
> gcc/config/riscv/riscv-vsetvl.cc  | 42 +++
> .../gcc.target/riscv/rvv/vsetvl/pr109743-1.c  | 26 
> .../gcc.target/riscv/rvv/vsetvl/pr109743-2.c  | 27 
> .../gcc.target/riscv/rvv/vsetvl/pr109743-3.c  | 28 +
> .../gcc.target/riscv/rvv/vsetvl/pr109743-4.c  | 28 +
> 5 files changed, 151 insertions(+)
> create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr109743-1.c
> create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr109743-2.c
> create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr109743-3.c
> create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr109743-4.c
>
> diff --git a/gcc/config/riscv/riscv-vsetvl.cc 
> b/gcc/config/riscv/riscv-vsetvl.cc
> index f55907a410e..fcee7fdf323 100644
> --- a/gcc/config/riscv/riscv-vsetvl.cc
> +++ b/gcc/config/riscv/riscv-vsetvl.cc
> @@ -3834,6 +3834,48 @@ pass_vsetvl::commit_vsetvls (void)
>   const vector_insn_info *require
> = m_vector_manager->vector_exprs[i];
>   gcc_assert (require->valid_or_dirty_p ());
> +
> +   /* Here we optimize the VSETVL is hoisted by LCM:
> +
> + Before LCM:
> +bb 1:
> +  vsetvli a5,a2,e32,m1,ta,mu
> +bb 2:
> +  vsetvli zero,a5,e32,m1,ta,mu
> +  ...
> +
> + After LCM:
> +bb 1:
> +  vsetvli a5,a2,e32,m1,ta,mu
> +  LCM INSERTED: vsetvli zero,a5,e32,m1,ta,mu --> eliminate
> +bb 2:
> +  ...
> +*/
> +   const basic_block pred_cfg_bb = eg->src;
> +   const auto block_info
> + = 

[PATCH v4] libgfortran: Replace mutex with rwlock

2023-05-08 Thread Lipeng Zhu via Gcc-patches
This patch tries to introduce an rwlock and split the read/write access to
the unit_root tree and unit_cache, using the rwlock instead of the mutex to
increase CPU efficiency. In the get_gfc_unit function, the percentage of
calls that step into the insert_unit function is around 30%; in most
instances, we can get the unit in the phase of reading the unit_cache or
unit_root tree. So splitting the read/write phases with an rwlock is an
approach to make it more parallel.

BTW, the IPC metrics can gain around 9x in our test
server with 220 cores. The benchmark we used is
https://github.com/rwesson/NEAT

libgcc/ChangeLog:

* gthr-posix.h (__GTHREAD_RWLOCK_INIT): New macro
(__gthrw): New function
(__gthread_rwlock_rdlock): New function
(__gthread_rwlock_tryrdlock): New function
(__gthread_rwlock_wrlock): New function
(__gthread_rwlock_trywrlock): New function
(__gthread_rwlock_unlock): New function

libgfortran/ChangeLog:

* io/async.c (DEBUG_LINE): New
* io/async.h (RWLOCK_DEBUG_ADD): New macro
(CHECK_RDLOCK): New macro
(CHECK_WRLOCK): New macro
(TAIL_RWLOCK_DEBUG_QUEUE): New macro
(IN_RWLOCK_DEBUG_QUEUE): New macro
(RDLOCK): New macro
(WRLOCK): New macro
(RWUNLOCK): New macro
(RD_TO_WRLOCK): New macro
(INTERN_RDLOCK): New macro
(INTERN_WRLOCK): New macro
(INTERN_RWUNLOCK): New macro
* io/io.h (internal_proto): Define unit_rwlock
* io/transfer.c (st_read_done_worker): Replace unit_lock with unit_rwlock
(st_write_done_worker): Replace unit_lock with unit_rwlock
* io/unit.c (get_gfc_unit): Replace unit_lock with unit_rwlock
(if): Replace unit_lock with unit_rwlock
(close_unit_1): Replace unit_lock with unit_rwlock
(close_units): Replace unit_lock with unit_rwlock
(newunit_alloc): Replace unit_lock with unit_rwlock
* io/unix.c (flush_all_units): Replace unit_lock with unit_rwlock

---
v1 -> v2:
Limit the pthread_rwlock usage in libgcc only when __cplusplus isn't defined.

v2 -> v3:
Rebase the patch with trunk branch.

v3 -> v4:
Update the comments.

Reviewed-by: Hongjiu Lu 
Signed-off-by: Lipeng Zhu 
---
 libgcc/gthr-posix.h   |  60 +++
 libgfortran/io/async.c|   4 +
 libgfortran/io/async.h| 151 ++
 libgfortran/io/io.h   |  15 ++--
 libgfortran/io/transfer.c |   8 +-
 libgfortran/io/unit.c |  75 +++
 libgfortran/io/unix.c |  16 ++--
 7 files changed, 283 insertions(+), 46 deletions(-)

diff --git a/libgcc/gthr-posix.h b/libgcc/gthr-posix.h
index aebcfdd9f4c..73283082997 100644
--- a/libgcc/gthr-posix.h
+++ b/libgcc/gthr-posix.h
@@ -48,6 +48,9 @@ typedef pthread_t __gthread_t;
 typedef pthread_key_t __gthread_key_t;
 typedef pthread_once_t __gthread_once_t;
 typedef pthread_mutex_t __gthread_mutex_t;
+#ifndef __cplusplus
+typedef pthread_rwlock_t __gthread_rwlock_t;
+#endif
 typedef pthread_mutex_t __gthread_recursive_mutex_t;
 typedef pthread_cond_t __gthread_cond_t;
 typedef struct timespec __gthread_time_t;
@@ -58,6 +61,9 @@ typedef struct timespec __gthread_time_t;
 
 #define __GTHREAD_MUTEX_INIT PTHREAD_MUTEX_INITIALIZER
 #define __GTHREAD_MUTEX_INIT_FUNCTION __gthread_mutex_init_function
+#ifndef __cplusplus
+#define __GTHREAD_RWLOCK_INIT PTHREAD_RWLOCK_INITIALIZER
+#endif
 #define __GTHREAD_ONCE_INIT PTHREAD_ONCE_INIT
 #if defined(PTHREAD_RECURSIVE_MUTEX_INITIALIZER)
 #define __GTHREAD_RECURSIVE_MUTEX_INIT PTHREAD_RECURSIVE_MUTEX_INITIALIZER
@@ -135,6 +141,13 @@ __gthrw(pthread_mutexattr_init)
 __gthrw(pthread_mutexattr_settype)
 __gthrw(pthread_mutexattr_destroy)
 
+#ifndef __cplusplus
+__gthrw(pthread_rwlock_rdlock)
+__gthrw(pthread_rwlock_tryrdlock)
+__gthrw(pthread_rwlock_wrlock)
+__gthrw(pthread_rwlock_trywrlock)
+__gthrw(pthread_rwlock_unlock)
+#endif
 
 #if defined(_LIBOBJC) || defined(_LIBOBJC_WEAK)
 /* Objective-C.  */
@@ -885,6 +898,53 @@ __gthread_cond_destroy (__gthread_cond_t* __cond)
   return __gthrw_(pthread_cond_destroy) (__cond);
 }
 
+#ifndef __cplusplus
+static inline int
+__gthread_rwlock_rdlock (__gthread_rwlock_t *__rwlock)
+{
+  if (__gthread_active_p ())
+return __gthrw_(pthread_rwlock_rdlock) (__rwlock);
+  else
+return 0;
+}
+
+static inline int
+__gthread_rwlock_tryrdlock (__gthread_rwlock_t *__rwlock)
+{
+  if (__gthread_active_p ())
+return __gthrw_(pthread_rwlock_tryrdlock) (__rwlock);
+  else
+return 0;
+}
+
+static inline int
+__gthread_rwlock_wrlock (__gthread_rwlock_t *__rwlock)
+{
+  if (__gthread_active_p ())
+return __gthrw_(pthread_rwlock_wrlock) (__rwlock);
+  else
+return 0;
+}
+
+static inline int
+__gthread_rwlock_trywrlock (__gthread_rwlock_t *__rwlock)
+{
+  if (__gthread_active_p ())
+return __gthrw_(pthread_rwlock_trywrlock) (__rwlock);
+  else
+return 0;
+}
+
+static inline int
+__gthread_rwlock_unlock (__gthread_rwlock_t *__rwlock)
+{
+  if (__gthread_active_p ())
+return __gthrw_(pthread_rwlock_unlock) (__rwlock);
+  else
+return 0;
+}
+#endif
+
 #endif /* 

Re: [PATCH v3] libgfortran: Replace mutex with rwlock

2023-05-08 Thread Zhu, Lipeng via Gcc-patches




On 1/1/1970 8:00 AM, Bernhard Reutner-Fischer wrote:

Hi!

[please do not top-post]


Sure, thanks for the reminder.


On Thu, 20 Apr 2023 21:13:08 +0800
"Zhu, Lipeng"  wrote:


Hi Bernhard,

Thanks for your questions and suggestions.
The rwlock could allow multiple threads to have concurrent read-only
access to the cache/unit list, only a single writer is allowed.


right.


Write lock will not be acquired until all read lock are released.


So i must have confused rwlock with something else, something that allows self 
to hold a read-lock and
upgrade that to a write-lock, purposely starving all successive incoming 
readers. I.e. just toggle your
RD_TO_WRLOCK impl, here, atomically. This proved to be benefical in some 
situations in the past.
Doesn't seem to work with your rwlock, does it

The pthread API doesn't support upgrading an rdlock to a wrlock directly;
the calling thread may deadlock if, at the time the call is made, it holds
the read-write lock (whether a read or write lock).
https://linux.die.net/man/3/pthread_rwlock_wrlock.
That is why we implement RD_TO_WRLOCK like this: release the read lock
first and then acquire the write lock.



And I didn't change the mutex scope when refactor the code, only make
a more fine-grained distinction for the read/write cache/unit list.


Yes of course, i can see you did that.


I complete the comment according to your template, I will insert the
comment in the source code in next version patch with other refinement
by your suggestions.
"
Now we did not get a unit in cache and unit list, so we need to create
a new unit, and update it to cache and unit list.


s/Now/By now/ or s/Now w/W/ and s/get/find/ "
We did not find a unit in the cache nor in the unit list, create a new
(locked) unit and insert into the unit list and cache.
Manipulating either or both the unit list and the unit cache requires to hold a 
write-lock [for obvious
reasons]"

Superfluous when talking about pthread_rwlock_wrlock since that implies that 
even the process
acquiring the wrlock has to first release it's very own rdlock.


Thanks for the correction, I will update the comments in next v4 patch.


Prior to update the cache and list, we need to release all read locks,
and then immediately to acquire write lock, thus ensure the exclusive
update to the cache and unit list.
Either way, we will manipulate the cache and/or the unit list so we
must take a write lock now.
We don't take the write bit in *addition* to the read lock because:
1. It will needlessly complicate releasing the respective lock;


Under pthread_rwlock_wrlock it will deadlock, so that's wrong?
Drop that point then? If not, what's your reasoning / observation?

Under my lock, you hold the R, additionally take the W and then immediately 
release the R because you
yourself won't read, just write.
But mine's not the pthread_rwlock you talk about, admittedly.

Yes, pthread_rwlock doesn't support upgrading an rdlock to a wrlock
directly, so we can't hold the rdlock while trying to acquire the wrlock.
I will drop the first point and update the comment accordingly.



2. By separate the read/write lock, it will greatly reduce the
contention at the read part, while write part is not always necessary
or most unlikely once the unit hit in cache;


We know that.


3. We try to balance the implementation complexity and the performance
gains that fit into current cases we observed.


.. by just using a pthread_rwlock. And that's the top point iff you keep it at 
that. That's a fair step, sure.
BTW, did you look at the RELEASE semantics, respectively the note that one day 
(and now is that very
day), we might improve on the release semantics? Can of course be incremental 
AFAIC

Not quite understand your question about looking at the RELEASE 
semantics. Can you be more specific?



"


If folks agree on this first step then you have my OK with a catchy malloc and 
the discussion recorded
here on the list. A second step would be RELEASE.
And, depending on the underlying capabilities of available locks, further 
tweaks, obviously.

PS: and, please, don't top-post

thanks,



Best Regards,
Zhu, Lipeng

On 1/1/1970 8:00 AM, Bernhard Reutner-Fischer wrote:

On 19 April 2023 09:06:28 CEST, Lipeng Zhu via Fortran  
wrote:

This patch try to introduce the rwlock and split the read/write to
unit_root tree and unit_cache with rwlock instead of the mutex to
increase CPU efficiency. In the get_gfc_unit function, the
percentage to step into the insert_unit function is around 30%, in
most instances, we can get the unit in the phase of reading the
unit_cache or unit_root tree. So split the read/write phase by
rwlock would be an approach to make it more parallel.

BTW, the IPC metrics can gain around 9x in our test server with 220
cores. The benchmark we used is https://github.com/rwesson/NEAT
  
   

+#define RD_TO_WRLOCK(rwlock) \
+  RWUNLOCK (rwlock);\
+  WRLOCK (rwlock);
+#endif
+


   

diff --git a/libgfortran/io/unit.c b/libgfortran/io/unit.c index

Re: Re: [PATCH 1/2] [RISC-V] disable shrink-wrap-separate if zcmp enabled.

2023-05-08 Thread Kito Cheng via Gcc-patches
-msave-restore is a different story; it's only enabled when the user
requests, but `-march` describes the capability of the target
architecture, not specify the preference of performance or size, which
should be determined by -O1~-O3/-Ofast or -Os/-Oz.

On Mon, May 8, 2023 at 4:54 PM Fei Gao  wrote:
>
> On 2023-05-08 16:05  Kito Cheng  wrote:
> >
> >> > diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
> >> > index 45a63cab9c9..629e5e45cac 100644
> >> > --- a/gcc/config/riscv/riscv.cc
> >> > +++ b/gcc/config/riscv/riscv.cc
> >> > @@ -5729,7 +5729,8 @@ riscv_get_separate_components (void)
> >> >
> >> >if (riscv_use_save_libcall (>machine->frame)
> >> >|| cfun->machine->interrupt_handler_p
> >> > -  || !cfun->machine->frame.gp_sp_offset.is_constant ())
> >> > +  || !cfun->machine->frame.gp_sp_offset.is_constant ()
> >> > +  || TARGET_ZCMP)
> >> >  return components;
> >>
> >> I think this is a bad idea. I have a use case where we use the C
> >> extensions but still compile for -O2 because we want the code to be
> >> fast as possible but still having the savings of the C extensions.
> >
> >Yeah, agree, so I would prefer to drop this from the patch series.
>
> Zcmp is a little different here than C.
> C extension is done fully in AS.  So  we have the code to be
> fast as possible but still having the savings of the C extensions.
>
> Zcmp and shrink-wrap-separate are both done in prologue/epilogue pass
> and you can only have one switch active to direct sregs save and restore.
> In my understanding, zcmp push and pop insns seem to
> be mutually exclusive in functionality to shrink-wrap-separate.
> It's not expected to see zcmp insns at the beginning/end of prologue/epilogue,
> and also repeated store/load sregs in separate blocks.
>
> Same for save and restore, and i guess that's why we have
> riscv_use_save_libcall (>machine->frame) check here.
>
> BR,
> Fei
>
> >
> >> Thanks,
> >> Andrew Pinski


[PATCH] RISC-V: Update RVV integer compare simplification comments

2023-05-08 Thread Pan Li via Gcc-patches
From: Pan Li 

The VMSET simplification for RVV integer comparison has already been
merged. This patch updates the comments for the cases that the
define_split will act on.

Signed-off-by: Pan Li 

gcc/ChangeLog:

* config/riscv/vector.md: Add comments for simplifying to vmset.

Signed-off-by: Pan Li 
---
 gcc/config/riscv/vector.md | 9 -
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md
index 9ccc0d6a513..9904d4ca5e2 100644
--- a/gcc/config/riscv/vector.md
+++ b/gcc/config/riscv/vector.md
@@ -8159,13 +8159,20 @@ (define_insn 
"@pred_indexed_store"
 ;; 
-
 ;;  Integer Compare Instructions Simplification
 ;; 
-
-;; Simplify to VMCLR.m Includes:
+;; Simplify OP(V, V) Instructions to VMCLR.m Includes:
 ;; - 1.  VMSNE
 ;; - 2.  VMSLT
 ;; - 3.  VMSLTU
 ;; - 4.  VMSGT
 ;; - 5.  VMSGTU
 ;; 
-
+;; Simplify OP(V, V) Instructions to VMSET.m Includes:
+;; - 1.  VMSEQ
+;; - 2.  VMSLE
+;; - 3.  VMSLEU
+;; - 4.  VMSGE
+;; - 5.  VMSGEU
+;; 
-
 (define_split
   [(set (match_operand:VB  0 "register_operand")
(if_then_else:VB
-- 
2.34.1



Re: Re: [PATCH 1/2] [RISC-V] disable shrink-wrap-separate if zcmp enabled.

2023-05-08 Thread Fei Gao
On 2023-05-08 16:05  Kito Cheng  wrote:
>
>> > diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
>> > index 45a63cab9c9..629e5e45cac 100644
>> > --- a/gcc/config/riscv/riscv.cc
>> > +++ b/gcc/config/riscv/riscv.cc
>> > @@ -5729,7 +5729,8 @@ riscv_get_separate_components (void)
>> >
>> >    if (riscv_use_save_libcall (>machine->frame)
>> >    || cfun->machine->interrupt_handler_p
>> > -  || !cfun->machine->frame.gp_sp_offset.is_constant ())
>> > +  || !cfun->machine->frame.gp_sp_offset.is_constant ()
>> > +  || TARGET_ZCMP)
>> >  return components;
>>
>> I think this is a bad idea. I have a use case where we use the C
>> extensions but still compile for -O2 because we want the code to be
>> fast as possible but still having the savings of the C extensions.
>
>Yeah, agree, so I would prefer to drop this from the patch series. 

Zcmp is a little different here than C. 
C extension is done fully in AS.  So  we have the code to be
fast as possible but still having the savings of the C extensions.

Zcmp and shrink-wrap-separate are both done in prologue/epilogue pass
and you can only have one switch active to direct sregs save and restore.
In my understanding, zcmp push and pop insns seem to
be mutually exclusive in functionality to shrink-wrap-separate. 
It's not expected to see zcmp insns at the beginning/end of prologue/epilogue, 
and also repeated store/load sregs in separate blocks.  

Same for save and restore, and i guess that's why we have 
riscv_use_save_libcall (>machine->frame) check here.

BR, 
Fei

>
>> Thanks,
>> Andrew Pinski

Re: Re: [PATCH V4] VECT: Add decrement IV iteration loop control by variable amount support

2023-05-08 Thread juzhe.zh...@rivai.ai
Hi, Kewen.

I have tried to implement "decrement IV" feature and incorporate into 
"vect_set_loop_controls_directly".
Since the implementation is quite different from 
vect_set_loop_controls_directly, it will make  vect_set_loop_controls_directly
very complicated sometimes it makes me very hard to debug when I am testing. 

I am not sure but I can try again.

This patch isolated those implementation into a single function 
"vect_set_loop_controls_by_select_vl" which makes Richards
easier to review codes. Well, I think I can try again to incorporate those 
codes into "vect_set_loop_controls_directly" when they 
finish the review process of  "vect_set_loop_controls_by_select_vl".

Thanks.


juzhe.zh...@rivai.ai
 
From: Kewen.Lin
Date: 2023-05-08 15:55
To: juzhe.zh...@rivai.ai
CC: gcc-patches; rguenther; richard.sandiford
Subject: Re: [PATCH V4] VECT: Add decrement IV iteration loop control by 
variable amount support
Hi Juzhe,
 
> Hi, Kewen.
> 
>>> Sorry for chiming in, I had some concern here.
>>> We already have some handlings for partial vector in length in 
>>> vect_set_loop_controls_directly
>>>(actually it deals with both mask and length), the proposed 
>>>vect_set_loop_controls_by_select_vl
>>>for select_vl looks like a variant for partial vector in length (comparing 
>>>to the normal MIN),
>>>and also adopts decrement IV.  IMHO, it seems better to modify/extend the 
>>>current handling in
>>>vect_set_loop_controls_directly for length, or factor out the handlings for 
>>>length there and
>>>extend the factored one.  Otherwise, it means we have two sets of handlings 
>>>for partial vector
>>>in lengths, it looks bad to maintain.  As the previous discussion, adopting 
>>>decrement IV is an
>>>enhancement for loop control, it's good for both cases w/ or w/o select_vl.  
>>>If the above
>>>understanding is correct, the possible steps seem to be:
>>>  - factor out the handling for length (* optional)
>>>  - modify it with decrement IV
>>>  - extend it with select_vl.
>>>In future if some RVV vendor wants to degenerate select_vl to min, it can 
>>>just adopt the same
>>>handlings with min by not defining select_vl optab.
> 
> You mean like this:
> doing this inside vect_set_loop_controls_directly ?
> if (use_while_len_p)
>   return vect_set_loop_controls_by_while_len(...)
 
No, I meant either factoring out those handlings for partial vector in length 
in function
vect_set_loop_controls_directly to one separated function like: 
vect_set_loop_controls_directly_length
and rename the existing one to vect_set_loop_controls_directly_mask, or keep 
the existing
vect_set_loop_controls_directly for both mask and length but modify/extend the 
part for length.
If there is no much code to share between mask and length, the former may be 
better.
 
BR,
Kewen
 


RE: [PATCH] machine_mode type size: Extend enum size from 8-bit to 16-bit

2023-05-08 Thread Li, Pan2 via Gcc-patches
After the bits patch like below.

rtx_def code 16 => 8 bits.
rtx_def mode 8 => 16 bits.
tree_base code unchanged.

The structure layout of both the rtx_def and tree_base will be something 
similar as below. As I understand, the lower 8-bits of tree_base will be 
inspected when 'dv' is a tree for the rtx conversion.

tree_base   rtx_def
code: 16code: 8
side_effects_flag: 1mode: 16
constant_flag: 1
addressable_flag: 1
volatile_flag: 1
readonly_flag: 1
asm_written_flag: 1
nowarning_flag: 1
visited: 1
used_flag: 1
nothrow_flag: 1
static_flag: 1
public_flag: 1
private_flag: 1
protected_flag: 1
deprecated_flag: 1
default_def_flag: 1

I have a try a similar approach (as below) as you mentioned, aka shrink 
tree_code as 1:1 overlap to rtx_code. And completed one memory allocated bytes 
test in another email.

rtx_def code 16 => 12 bits.
rtx_def mode 8 => 12 bits.
tree_base code 16 => 12 bits.

Pan

-Original Message-
From: Richard Biener  
Sent: Monday, May 8, 2023 3:38 PM
To: Li, Pan2 
Cc: Jeff Law ; Kito Cheng ; 
juzhe.zh...@rivai.ai; richard.sandiford ; 
gcc-patches ; palmer ; jakub 

Subject: RE: [PATCH] machine_mode type size: Extend enum size from 8-bit to 
16-bit

On Mon, 8 May 2023, Li, Pan2 wrote:

> return !dv || (int) GET_CODE ((rtx) dv) != (int) VALUE; } is able to 
> fix this ICE after mode bits change.

Can you check which bits this will inspect when 'dv' is a tree after your 
patch?  VALUE is 1 and would map to IDENTIFIER_NODE on the tree side when there 
was a 1:1 overlap.

I think for all cases but struct loc_exp_dep we could find a bit to record 
whether we deal with a VALUE or a decl, but for loc_exp_dep it's going to be 
difficult (unless we start to take bits from pointer representations).

That said, I agree with Jeff that the code is ugly, but a simplistic conversion 
isn't what we want.

An alternative "solution" might be to also shrink tree_code when we shrink 
rtx_code and keep the 1:1 overlap.

Richard.

> I will re-trigger the memory allocate bytes test with below changes 
> for X86.
> 
> rtx_def code 16 => 8 bits.
> rtx_def mode 8 => 16 bits.
> tree_base code unchanged.
> 
> Pan
> 
> -Original Message-
> From: Li, Pan2
> Sent: Monday, May 8, 2023 2:42 PM
> To: Richard Biener ; Jeff Law 
> 
> Cc: Kito Cheng ; juzhe.zh...@rivai.ai; 
> richard.sandiford ; gcc-patches 
> ; palmer ; jakub 
> 
> Subject: RE: [PATCH] machine_mode type size: Extend enum size from 
> 8-bit to 16-bit
> 
> Oops. Actually I am patching a version as you mentioned like storage 
> allocation. Thank you Richard, will try your suggestion and keep you posted.
> 
> Pan
> 
> -Original Message-
> From: Richard Biener 
> Sent: Monday, May 8, 2023 2:30 PM
> To: Jeff Law 
> Cc: Li, Pan2 ; Kito Cheng ; 
> juzhe.zh...@rivai.ai; richard.sandiford ; 
> gcc-patches ; palmer ; 
> jakub 
> Subject: Re: [PATCH] machine_mode type size: Extend enum size from 
> 8-bit to 16-bit
> 
> On Sun, 7 May 2023, Jeff Law wrote:
> 
> > 
> > 
> > On 5/6/23 19:55, Li, Pan2 wrote:
> > > It looks like we cannot simply swap the code and mode in rtx_def, 
> > > the code may have to be the same bits as the tree_code in tree_base.
> > > Or we will meet ICE like below.
> > > 
> > > rtx_def code 16 => 8 bits.
> > > rtx_def mode 8 => 16 bits.
> > > 
> > > static inline decl_or_value
> > > dv_from_value (rtx value)
> > > {
> > >decl_or_value dv;
> > >dv = value;
> > >gcc_checking_assert (dv_is_value_p (dv));  <=  ICE
> > >return dv;
> > Ugh.  We really just need to fix this code.  It assumes particular 
> > structure layouts and that's just wrong/dumb.
> 
> Well, it's a neat trick ... we just need to adjust it to
> 
> static inline bool
> dv_is_decl_p (decl_or_value dv)
> {
>   return !dv || (int) GET_CODE ((rtx) dv) != (int) VALUE; }
> 
> I think (and hope for the 'decl' case the bits inspected are never 'VALUE').  
> Of course the above stinks from a TBAA perspective ...
> 
> Any "real" fix would require allocating storage for a discriminator and thus 
> hurt the resource constrained var-tracking a lot.
> 
> Richard.
> 

--
Richard Biener 
SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg, 
Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald, Boudien Moerman; HRB 
36809 (AG Nuernberg)


Re: [PATCH 1/2] [RISC-V] disable shrink-wrap-separate if zcmp enabled.

2023-05-08 Thread Kito Cheng via Gcc-patches
> > diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
> > index 45a63cab9c9..629e5e45cac 100644
> > --- a/gcc/config/riscv/riscv.cc
> > +++ b/gcc/config/riscv/riscv.cc
> > @@ -5729,7 +5729,8 @@ riscv_get_separate_components (void)
> >
> >if (riscv_use_save_libcall (>machine->frame)
> >|| cfun->machine->interrupt_handler_p
> > -  || !cfun->machine->frame.gp_sp_offset.is_constant ())
> > +  || !cfun->machine->frame.gp_sp_offset.is_constant ()
> > +  || TARGET_ZCMP)
> >  return components;
>
> I think this is a bad idea. I have a use case where we use the C
> extensions but still compile for -O2 because we want the code to be
> fast as possible but still having the savings of the C extensions.

Yeah, agree, so I would prefer to drop this from the patch series.

> Thanks,
> Andrew Pinski


Re: [PATCH V4] VECT: Add decrement IV iteration loop control by variable amount support

2023-05-08 Thread Kewen.Lin via Gcc-patches
Hi Juzhe,

> Hi, Kewen.
> 
>>> Sorry for chiming in, I had some concern here.
>>> We already have some handlings for partial vector in length in 
>>> vect_set_loop_controls_directly
>>>(actually it deals with both mask and length), the proposed 
>>>vect_set_loop_controls_by_select_vl
>>>for select_vl looks like a variant for partial vector in length (comparing 
>>>to the normal MIN),
>>>and also adopts decrement IV.  IMHO, it seems better to modify/extend the 
>>>current handling in
>>>vect_set_loop_controls_directly for length, or factor out the handlings for 
>>>length there and
>>>extend the factored one.  Otherwise, it means we have two sets of handlings 
>>>for partial vector
>>>in lengths, it looks bad to maintain.  As the previous discussion, adopting 
>>>decrement IV is an
>>>enhancement for loop control, it's good for both cases w/ or w/o select_vl.  
>>>If the above
>>>understanding is correct, the possible steps seem to be:
>>>  - factor out the handling for length (* optional)
>>>  - modify it with decrement IV
>>>  - extend it with select_vl.
>>>In future if some RVV vendor wants to degenerate select_vl to min, it can 
>>>just adopt the same
>>>handlings with min by not defining select_vl optab.
> 
> You mean like this:
> doing this inside vect_set_loop_controls_directly ?
>         if (use_while_len_p)
>           return vect_set_loop_controls_by_while_len(...)

No, I meant either factoring out those handlings for partial vector in length 
in function
vect_set_loop_controls_directly to one separated function like: 
vect_set_loop_controls_directly_length
and rename the existing one to vect_set_loop_controls_directly_mask, or keep 
the existing
vect_set_loop_controls_directly for both mask and length but modify/extend the 
part for length.
If there is no much code to share between mask and length, the former may be 
better.

BR,
Kewen


Re: [PATCH 1/2] [RISC-V] disable shrink-wrap-separate if zcmp enabled.

2023-05-08 Thread Andrew Pinski via Gcc-patches
On Sat, May 6, 2023 at 1:41 AM Fei Gao  wrote:
>
> zcmp aims to reduce code size, while shrink-wrap-separate prefers
> speed to code size. So disable shrink-wrap-separate if zcmp
> enabled, just like what save-restore has done.
>
> author: Zhangjin Liao liaozhang...@eswincomputing.com
>
> gcc/ChangeLog:
>
> * config/riscv/riscv.cc (riscv_get_separate_components):
> ---
>  gcc/config/riscv/riscv.cc | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
>
> diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
> index 45a63cab9c9..629e5e45cac 100644
> --- a/gcc/config/riscv/riscv.cc
> +++ b/gcc/config/riscv/riscv.cc
> @@ -5729,7 +5729,8 @@ riscv_get_separate_components (void)
>
>if (riscv_use_save_libcall (>machine->frame)
>|| cfun->machine->interrupt_handler_p
> -  || !cfun->machine->frame.gp_sp_offset.is_constant ())
> +  || !cfun->machine->frame.gp_sp_offset.is_constant ()
> +  || TARGET_ZCMP)
>  return components;

I think this is a bad idea. I have a use case where we use the C
extensions but still compile for -O2 because we want the code to be
fast as possible but still having the savings of the C extensions.

Thanks,
Andrew Pinski

>
>offset = cfun->machine->frame.gp_sp_offset.to_constant ();
> --
> 2.17.1
>


Re: Re: [PATCH 1/2] [RISC-V] disable shrink-wrap-separate if zcmp enabled.

2023-05-08 Thread Fei Gao
On 2023-05-08 10:41  Kito Cheng  wrote:
>
>shrink-wrapping is already gated by Os so I think we don't need to add more
>gate here, unless we are trying to claim force optimize for size if
>zcmp is present.
> 

hi Kito

Zcmp is added here just like save-restore.

Either we add them both, or delete.

BR, 
Fei

>On Sat, May 6, 2023 at 4:41 PM Fei Gao  wrote:
>>
>> zcmp aims to reduce code size, while shrink-wrap-separate prefers
>> speed to code size. So disable shrink-wrap-separate if zcmp
>> enabled, just like what save-restore has done.
>>
>> author: Zhangjin Liao liaozhang...@eswincomputing.com
>>
>> gcc/ChangeLog:
>>
>> * config/riscv/riscv.cc (riscv_get_separate_components):
>> ---
>>  gcc/config/riscv/riscv.cc | 3 ++-
>>  1 file changed, 2 insertions(+), 1 deletion(-)
>>
>> diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
>> index 45a63cab9c9..629e5e45cac 100644
>> --- a/gcc/config/riscv/riscv.cc
>> +++ b/gcc/config/riscv/riscv.cc
>> @@ -5729,7 +5729,8 @@ riscv_get_separate_components (void)
>>
>>    if (riscv_use_save_libcall (>machine->frame)
>>    || cfun->machine->interrupt_handler_p
>> -  || !cfun->machine->frame.gp_sp_offset.is_constant ())
>> +  || !cfun->machine->frame.gp_sp_offset.is_constant ()
>> +  || TARGET_ZCMP)
>>  return components;
>>
>>    offset = cfun->machine->frame.gp_sp_offset.to_constant ();
>> --
>> 2.17.1
>>

RE: [PATCH] machine_mode type size: Extend enum size from 8-bit to 16-bit

2023-05-08 Thread Richard Biener via Gcc-patches
On Mon, 8 May 2023, Li, Pan2 wrote:

> return !dv || (int) GET_CODE ((rtx) dv) != (int) VALUE; } is able to fix 
> this ICE after mode bits change.

Can you check which bits this will inspect when 'dv' is a tree after your
patch?  VALUE is 1 and would map to IDENTIFIER_NODE on the tree side
when there was a 1:1 overlap.

I think for all cases but struct loc_exp_dep we could find a bit to
record whether we deal with a VALUE or a decl, but for loc_exp_dep
it's going to be difficult (unless we start to take bits from
pointer representations).

That said, I agree with Jeff that the code is ugly, but a simplistic
conversion isn't what we want.

An alternative "solution" might be to also shrink tree_code when
we shrink rtx_code and keep the 1:1 overlap.

Richard.

> I will re-trigger the memory allocate 
> bytes test with below changes for X86.
> 
> rtx_def code 16 => 8 bits.
> rtx_def mode 8 => 16 bits.
> tree_base code unchanged.
> 
> Pan
> 
> -Original Message-
> From: Li, Pan2 
> Sent: Monday, May 8, 2023 2:42 PM
> To: Richard Biener ; Jeff Law 
> Cc: Kito Cheng ; juzhe.zh...@rivai.ai; 
> richard.sandiford ; gcc-patches 
> ; palmer ; jakub 
> 
> Subject: RE: [PATCH] machine_mode type size: Extend enum size from 8-bit to 
> 16-bit
> 
> Oops. Actually I am patching a version as you mentioned like storage 
> allocation. Thank you Richard, will try your suggestion and keep you posted.
> 
> Pan
> 
> -Original Message-
> From: Richard Biener 
> Sent: Monday, May 8, 2023 2:30 PM
> To: Jeff Law 
> Cc: Li, Pan2 ; Kito Cheng ; 
> juzhe.zh...@rivai.ai; richard.sandiford ; 
> gcc-patches ; palmer ; jakub 
> 
> Subject: Re: [PATCH] machine_mode type size: Extend enum size from 8-bit to 
> 16-bit
> 
> On Sun, 7 May 2023, Jeff Law wrote:
> 
> > 
> > 
> > On 5/6/23 19:55, Li, Pan2 wrote:
> > > It looks like we cannot simply swap the code and mode in rtx_def, 
> > > the code may have to be the same bits as the tree_code in tree_base.
> > > Or we will meet ICE like below.
> > > 
> > > rtx_def code 16 => 8 bits.
> > > rtx_def mode 8 => 16 bits.
> > > 
> > > static inline decl_or_value
> > > dv_from_value (rtx value)
> > > {
> > >decl_or_value dv;
> > >dv = value;
> > >gcc_checking_assert (dv_is_value_p (dv));  <=  ICE
> > >return dv;
> > Ugh.  We really just need to fix this code.  It assumes particular 
> > structure layouts and that's just wrong/dumb.
> 
> Well, it's a neat trick ... we just need to adjust it to
> 
> static inline bool
> dv_is_decl_p (decl_or_value dv)
> {
>   return !dv || (int) GET_CODE ((rtx) dv) != (int) VALUE; }  
> 
> I think (and hope for the 'decl' case the bits inspected are never 'VALUE').  
> Of course the above stinks from a TBAA perspective ...
> 
> Any "real" fix would require allocating storage for a discriminator and thus 
> hurt the resource constrained var-tracking a lot.
> 
> Richard.
> 

-- 
Richard Biener 
SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg,
Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald, Boudien Moerman;
HRB 36809 (AG Nuernberg)


Re: [PATCH 1/3] PHIOPT: Add diamond bb form to factor_out_conditional_conversion

2023-05-08 Thread Richard Biener via Gcc-patches
On Mon, May 8, 2023 at 9:09 AM Andrew Pinski  wrote:
>
> On Sun, May 7, 2023 at 11:53 PM Richard Biener via Gcc-patches
>  wrote:
> >
> > On Mon, May 8, 2023 at 8:51 AM Richard Biener
> >  wrote:
> > >
> > > On Sun, May 7, 2023 at 7:05 AM Andrew Pinski via Gcc-patches
> > >  wrote:
> > > >
> > > > So the function factor_out_conditional_conversion already supports
> > > > diamond shaped bb forms, just need to be called for such a thing.
> > > >
> > > > harden-cond-comp.c needed to be changed as we would optimize out the
> > > > conversion now and that causes the compare hardening not needing to
> > > > split the block which it was testing. So change it such that there
> > > > would be no chance of optimization.
> > > >
> > > > Also add two testcases that showed the improvement. PR 103771 is
> > > > solved in ifconvert also for the vectorizer but now it is solved
> > > > in a general sense.
> > > >
> > > > OK? Bootstrapped and tested on x86_64-linux-gnu with no regressions.
> > >
> > > So what does it do for the diamond case?  Factor out _common_ operations
> > > only or also promote conditionally executed operations to unconditionally
> > > executed?  Apart from cost considerations (with the looping patch any 
> > > number
> > > might end up promoted) there's correctness issues for negation (signed 
> > > overflow)
> > > and for non-call exceptions there's possible external throws.
> > >
> > > If it only factors out common operations the patch is OK (but the 
> > > testcases
> > > suggest that's not the case).
>
> Right because the testcases are also testing if there was another
> phiopt that happened in the same pass rather than having to wait
> again.
>
> >> Otherwise we should add a limit as to how many
> > > ops we hoist (maybe one for the diamond case?).  Thinking over it the same
> > > issue of course exists for the non-diamond case?
> >
> > Oh, and we hoist the stuff without being sure the final PHI will be 
> > if-converted
> > in the end, correct?  Can we somehow improve on that, aka tentatively
> > convert if-convert the PHI first?
>
> Yes it hoists the stuff without any checks on if the final PHI will
> have happened.
> We have the same issue with hoisting adjacent loads.
> I will think of a way to maybe handle this better and even
> undoing it; I will have to get back to you the details though.
> And yes that was already an existing issue with the non-diamond case
> of this; there was some extra checks added to prevent moving some
> casts from being moved out of the PHI due to extra zero extends with
> respect of __builtin_popcount/clz, etc.

OK.  Would be nice if you can somehow handle this.  Even adding a
static limit to the number of hoisted ops isn't good - multiple phiopt
passes will simply run up to N times such limit :/  Undoing might be
"easiest", best would be separating analysis and transform phase
somehow.

Let's go with the patch as-is in the meantime.

Thanks,
Richard.

> Thanks,
> Andrew
>
>
>
> >
> > Richard.
> >
> > > Richard.
> > >
> > > > PR tree-optimization/49959
> > > > PR tree-optimization/103771
> > > >
> > > > gcc/ChangeLog:
> > > >
> > > > * tree-ssa-phiopt.cc (pass_phiopt::execute): Support
> > > > Diamond shaped bb form for factor_out_conditional_conversion.
> > > >
> > > > gcc/testsuite/ChangeLog:
> > > >
> > > > * c-c++-common/torture/harden-cond-comp.c: Change testcase
> > > > slightly to avoid the new phiopt optimization.
> > > > * gcc.dg/tree-ssa/abs-2.c: New test.
> > > > * gcc.dg/tree-ssa/pr103771.c: New test.
> > > > ---
> > > >  .../c-c++-common/torture/harden-cond-comp.c   |  6 +++---
> > > >  gcc/testsuite/gcc.dg/tree-ssa/abs-2.c | 20 +++
> > > >  gcc/testsuite/gcc.dg/tree-ssa/pr103771.c  | 18 +
> > > >  gcc/tree-ssa-phiopt.cc|  2 +-
> > > >  4 files changed, 42 insertions(+), 4 deletions(-)
> > > >  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/abs-2.c
> > > >  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr103771.c
> > > >
> > > > diff --git a/gcc/testsuite/c-c++-common/torture/harden-cond-comp.c 
> > > > b/gcc/testsuite/c-c++-common/torture/harden-cond-comp.c
> > > > index 5aad890a1d3..dcf364ee993 100644
> > > > --- a/gcc/testsuite/c-c++-common/torture/harden-cond-comp.c
> > > > +++ b/gcc/testsuite/c-c++-common/torture/harden-cond-comp.c
> > > > @@ -1,11 +1,11 @@
> > > >  /* { dg-do compile } */
> > > >  /* { dg-options "-fharden-conditional-branches -fharden-compares 
> > > > -fdump-tree-hardcbr -fdump-tree-hardcmp -ffat-lto-objects" } */
> > > >
> > > > -int f(int i, int j) {
> > > > +int f(int i, int j, int k, int l) {
> > > >if (i == 0)
> > > > -return j != 0;
> > > > +return (j != 0) + l;
> > > >else
> > > > -return i * j != 0;
> > > > +return (i * j != 0) * k;
> > > >  }
> > > >
> > > >  /* { dg-final { scan-tree-dump-times "Splitting edge" 2 "hardcbr" } } 
> > > > */
> > > > 

Re: [PATCH 1/3] PHIOPT: Add diamond bb form to factor_out_conditional_conversion

2023-05-08 Thread Andrew Pinski via Gcc-patches
On Sun, May 7, 2023 at 11:53 PM Richard Biener via Gcc-patches
 wrote:
>
> On Mon, May 8, 2023 at 8:51 AM Richard Biener
>  wrote:
> >
> > On Sun, May 7, 2023 at 7:05 AM Andrew Pinski via Gcc-patches
> >  wrote:
> > >
> > > So the function factor_out_conditional_conversion already supports
> > > diamond shaped bb forms, just need to be called for such a thing.
> > >
> > > harden-cond-comp.c needed to be changed as we would optimize out the
> > > conversion now and that causes the compare hardening not needing to
> > > split the block which it was testing. So change it such that there
> > > would be no chance of optimization.
> > >
> > > Also add two testcases that showed the improvement. PR 103771 is
> > > solved in ifconvert also for the vectorizer but now it is solved
> > > in a general sense.
> > >
> > > OK? Bootstrapped and tested on x86_64-linux-gnu with no regressions.
> >
> > So what does it do for the diamond case?  Factor out _common_ operations
> > only or also promote conditionally executed operations to unconditionally
> > executed?  Apart from cost considerations (with the looping patch any number
> > might end up promoted) there's correctness issues for negation (signed 
> > overflow)
> > and for non-call exceptions there's possible external throws.
> >
> > If it only factors out common operations the patch is OK (but the testcases
> > suggest that's not the case).

Right because the testcases are also testing if there was another
phiopt that happened in the same pass rather than having to wait
again.

>> Otherwise we should add a limit as to how many
> > ops we hoist (maybe one for the diamond case?).  Thinking over it the same
> > issue of course exists for the non-diamond case?
>
> Oh, and we hoist the stuff without being sure the final PHI will be 
> if-converted
> in the end, correct?  Can we somehow improve on that, aka tentatively
> convert if-convert the PHI first?

Yes it hoists the stuff without any checks on if the final PHI will
have happened.
We have the same issue with hoisting adjacent loads.
I will think of a way to maybe handle this better and even
undoing it; I will have to get back to you the details though.
And yes that was already an existing issue with the non-diamond case
of this; there was some extra checks added to prevent moving some
casts from being moved out of the PHI due to extra zero extends with
respect of __builtin_popcount/clz, etc.

Thanks,
Andrew



>
> Richard.
>
> > Richard.
> >
> > > PR tree-optimization/49959
> > > PR tree-optimization/103771
> > >
> > > gcc/ChangeLog:
> > >
> > > * tree-ssa-phiopt.cc (pass_phiopt::execute): Support
> > > Diamond shaped bb form for factor_out_conditional_conversion.
> > >
> > > gcc/testsuite/ChangeLog:
> > >
> > > * c-c++-common/torture/harden-cond-comp.c: Change testcase
> > > slightly to avoid the new phiopt optimization.
> > > * gcc.dg/tree-ssa/abs-2.c: New test.
> > > * gcc.dg/tree-ssa/pr103771.c: New test.
> > > ---
> > >  .../c-c++-common/torture/harden-cond-comp.c   |  6 +++---
> > >  gcc/testsuite/gcc.dg/tree-ssa/abs-2.c | 20 +++
> > >  gcc/testsuite/gcc.dg/tree-ssa/pr103771.c  | 18 +
> > >  gcc/tree-ssa-phiopt.cc|  2 +-
> > >  4 files changed, 42 insertions(+), 4 deletions(-)
> > >  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/abs-2.c
> > >  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr103771.c
> > >
> > > diff --git a/gcc/testsuite/c-c++-common/torture/harden-cond-comp.c 
> > > b/gcc/testsuite/c-c++-common/torture/harden-cond-comp.c
> > > index 5aad890a1d3..dcf364ee993 100644
> > > --- a/gcc/testsuite/c-c++-common/torture/harden-cond-comp.c
> > > +++ b/gcc/testsuite/c-c++-common/torture/harden-cond-comp.c
> > > @@ -1,11 +1,11 @@
> > >  /* { dg-do compile } */
> > >  /* { dg-options "-fharden-conditional-branches -fharden-compares 
> > > -fdump-tree-hardcbr -fdump-tree-hardcmp -ffat-lto-objects" } */
> > >
> > > -int f(int i, int j) {
> > > +int f(int i, int j, int k, int l) {
> > >if (i == 0)
> > > -return j != 0;
> > > +return (j != 0) + l;
> > >else
> > > -return i * j != 0;
> > > +return (i * j != 0) * k;
> > >  }
> > >
> > >  /* { dg-final { scan-tree-dump-times "Splitting edge" 2 "hardcbr" } } */
> > > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/abs-2.c 
> > > b/gcc/testsuite/gcc.dg/tree-ssa/abs-2.c
> > > new file mode 100644
> > > index 000..328b1802541
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.dg/tree-ssa/abs-2.c
> > > @@ -0,0 +1,20 @@
> > > +/* PR tree-optimization/49959 */
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-O -fdump-tree-phiopt1-details" } */
> > > +
> > > +#define ABS(X)(((X)>0)?(X):-(X))
> > > +unsigned long
> > > +test_abs(int *cur)
> > > +{
> > > +  unsigned long sad = 0;
> > > +  if (cur[0] > 0)
> > > +sad = cur[0];
> > > +  else
> > > +sad = -cur[0];
> > > +  return sad;
> 

Re: [PATCH v2] RISC-V: Handle multi-lib path correctly for linux

2023-05-08 Thread Kito Cheng via Gcc-patches
Committed to trunk

On Thu, May 4, 2023 at 4:03 PM Kito Cheng via Gcc-patches
 wrote:
>
> RISC-V Linux encodes the ABI into the path, so in theory, we can only use that
> to select multi-lib paths, and no way to use different multi-lib paths between
> `rv32i/ilp32` and `rv32ima/ilp32`, we'll mapping both to `/lib/ilp32`.
>
> It's hard to do that with GCC's builtin multi-lib selection mechanism; builtin
> mechanism did the option string compare and then enumerate all possible reuse
> rules during the build time. However, it's impossible to RISC-V; we have a 
> huge
> number of combinations of `-march`, so implementing a customized multi-lib
> selection becomes the only solution.
>
> Multi-lib configuration is only used for determines which ISA should be used
> when compiling the corresponding ABI variant after this patch.
>
> During the multi-lib selection stage, only consider -mabi as the only key to
> select the multi-lib path.
>
> gcc/ChangeLog:
>
> * common/config/riscv/riscv-common.cc (riscv_select_multilib_by_abi): 
> New.
> (riscv_select_multilib): New.
> (riscv_compute_multilib): Extract logic to riscv_select_multilib and
> also handle select_by_abi.
> * config/riscv/elf.h (RISCV_USE_CUSTOMISED_MULTI_LIB): Change it
> to select_by_abi_arch_cmodel from 1.
> * config/riscv/linux.h (RISCV_USE_CUSTOMISED_MULTI_LIB): Define.
> * config/riscv/riscv-opts.h (enum riscv_multilib_select_kind): New.
>
> ---
> V2 Changes:
> - Fix some trivial issue cause I forgot to squash patches...
>
> This patch also plan backport to GCC 13 after landing to trunk.
>
> ---
>  gcc/common/config/riscv/riscv-common.cc | 128 
>  gcc/config/riscv/elf.h  |   2 +-
>  gcc/config/riscv/linux.h|   2 +
>  gcc/config/riscv/riscv-opts.h   |   9 ++
>  4 files changed, 100 insertions(+), 41 deletions(-)
>
> diff --git a/gcc/common/config/riscv/riscv-common.cc 
> b/gcc/common/config/riscv/riscv-common.cc
> index 309a52def75f..57a2a279ef53 100644
> --- a/gcc/common/config/riscv/riscv-common.cc
> +++ b/gcc/common/config/riscv/riscv-common.cc
> @@ -1441,9 +1441,6 @@ riscv_multi_lib_check (int argc ATTRIBUTE_UNUSED,
>return "";
>  }
>
> -/* We only override this in bare-metal toolchain.  */
> -#ifdef RISCV_USE_CUSTOMISED_MULTI_LIB
> -
>  /* Find last switch with the prefix, options are take last one in general,
> return NULL if not found, and return the option value if found, it could
> return empty string if the option has no value.  */
> @@ -1597,6 +1594,68 @@ riscv_check_conds (
>return match_score + ok_count * 100;
>  }
>
> +static const char *
> +riscv_select_multilib_by_abi (
> +  const std::string _current_arch_str,
> +  const std::string _current_abi_str,
> +  const riscv_subset_list *subset_list, const struct switchstr *switches,
> +  int n_switches, const std::vector _infos)
> +{
> +  for (size_t i = 0; i < multilib_infos.size (); ++i)
> +if (riscv_current_abi_str == multilib_infos[i].abi_str)
> +  return xstrdup (multilib_infos[i].path.c_str ());
> +
> +  return NULL;
> +}
> +
> +static const char *
> +riscv_select_multilib (
> +  const std::string _current_arch_str,
> +  const std::string _current_abi_str,
> +  const riscv_subset_list *subset_list, const struct switchstr *switches,
> +  int n_switches, const std::vector _infos)
> +{
> +  int match_score = 0;
> +  int max_match_score = 0;
> +  int best_match_multi_lib = -1;
> +  /* Try to decision which set we should used.  */
> +  /* We have 3 level decision tree here, ABI, check input arch/ABI must
> + be superset of multi-lib arch, and other rest option checking.  */
> +  for (size_t i = 0; i < multilib_infos.size (); ++i)
> +{
> +  /* Check ABI is same first.  */
> +  if (riscv_current_abi_str != multilib_infos[i].abi_str)
> +   continue;
> +
> +  /* Found a potential compatible multi-lib setting!
> +Calculate the match score.  */
> +  match_score = subset_list->match_score (multilib_infos[i].subset_list);
> +
> +  /* Checking other cond in the multi-lib setting.  */
> +  match_score = riscv_check_conds (switches, n_switches, match_score,
> +  multilib_infos[i].conds);
> +
> +  /* Record highest match score multi-lib setting.  */
> +  if (match_score > max_match_score)
> +   {
> + best_match_multi_lib = i;
> + max_match_score = match_score;
> +   }
> +}
> +
> +  if (best_match_multi_lib == -1)
> +{
> +  riscv_no_matched_multi_lib = true;
> +  return NULL;
> +}
> +  else
> +return xstrdup (multilib_infos[best_match_multi_lib].path.c_str ());
> +}
> +
> +#ifndef RISCV_USE_CUSTOMISED_MULTI_LIB
> +#define RISCV_USE_CUSTOMISED_MULTI_LIB select_by_builtin
> +#endif
> +
>  /* Implement TARGET_COMPUTE_MULTILIB.  */
>  static const char *
>  riscv_compute_multilib (
> @@ -1609,6 +1668,11 @@ 

Re: [PATCH] RISC-V: Fix ugly && incorrect codes of RVV auto-vectorization

2023-05-08 Thread Kito Cheng via Gcc-patches
Committed to trunk, thanks!

On Mon, May 8, 2023 at 11:42 AM  wrote:
>
> From: Juzhe-Zhong 
>
> 1. Add movmisalign pattern for TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
>targethook, current RISC-V has supported this target hook, we can't make
>it supported without movmisalign pattern.
>
> 2. Remove global extern of get_mask_policy_no_pred && get_tail_policy_no_pred.
>These 2 functions are coming from intrinsic builtin frameworks.
>We are sure we don't need them in auto-vectorization implementation.
>
> 3. Refine mask mode implementation.
>
> 4. We should not have "riscv_vector_" in riscv_vector namspace since it
>makes the codes inconsistent and ugly.
>
>For example:
>Before this patch:
>static opt_machine_mode
>riscv_get_mask_mode (machine_mode mode)
>{
>  machine_mode mask_mode = VOIDmode;
>  if (TARGET_VECTOR && riscv_vector::riscv_vector_get_mask_mode 
> (mode).exists (&mask_mode))
>   return mask_mode;
>..
>
>After this patch:
>riscv_get_mask_mode (machine_mode mode)
>{
>  machine_mode mask_mode = VOIDmode;
>  if (TARGET_VECTOR && riscv_vector::get_mask_mode (mode).exists 
> (&mask_mode))
>   return mask_mode;
>..
>
> 5. Fix fail testcase fixed-vlmax-1.c.
>
> gcc/ChangeLog:
>
> * config/riscv/autovec.md (movmisalign): New pattern.
> * config/riscv/riscv-protos.h (riscv_vector_mask_mode_p): Delete.
> (riscv_vector_get_mask_mode): Ditto.
> (get_mask_policy_no_pred): Ditto.
> (get_tail_policy_no_pred): Ditto.
> (get_mask_mode): New function.
> * config/riscv/riscv-v.cc (get_mask_policy_no_pred): Delete.
> (get_tail_policy_no_pred): Ditto.
> (riscv_vector_mask_mode_p): Ditto.
> (riscv_vector_get_mask_mode): Ditto.
> (get_mask_mode): New function.
> * config/riscv/riscv-vector-builtins.cc (use_real_merge_p): Remove 
> global extern.
> (get_tail_policy_for_pred): Ditto.
> * config/riscv/riscv-vector-builtins.h (get_tail_policy_for_pred): 
> Ditto.
> (get_mask_policy_for_pred): Ditto
> * config/riscv/riscv.cc (riscv_get_mask_mode): Refine codes.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/riscv/rvv/autovec/fixed-vlmax-1.c: Fix typo.
>
> ---
>  gcc/config/riscv/autovec.md   | 11 ++
>  gcc/config/riscv/riscv-protos.h   |  5 +--
>  gcc/config/riscv/riscv-v.cc   | 36 ++-
>  gcc/config/riscv/riscv-vector-builtins.cc |  4 +--
>  gcc/config/riscv/riscv-vector-builtins.h  |  3 --
>  gcc/config/riscv/riscv.cc |  3 +-
>  .../riscv/rvv/autovec/fixed-vlmax-1.c |  4 +--
>  7 files changed, 19 insertions(+), 47 deletions(-)
>
> diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md
> index b5d46ff57ab..f1c5ff5951b 100644
> --- a/gcc/config/riscv/autovec.md
> +++ b/gcc/config/riscv/autovec.md
> @@ -47,3 +47,14 @@
>   operands[1], operands[2], mode);
>DONE;
>  })
> +
> +(define_expand "movmisalign"
> +  [(set (match_operand:V 0 "nonimmediate_operand")
> +   (match_operand:V 1 "general_operand"))]
> +  "TARGET_VECTOR"
> +  {
> +/* Equivalent to a normal move for our purposes.  */
> +emit_move_insn (operands[0], operands[1]);
> +DONE;
> +  }
> +)
> diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
> index d83ea2c77e4..c0293a306f9 100644
> --- a/gcc/config/riscv/riscv-protos.h
> +++ b/gcc/config/riscv/riscv-protos.h
> @@ -218,10 +218,7 @@ bool slide1_sew64_helper (int, machine_mode, 
> machine_mode,
>  rtx gen_avl_for_scalar_move (rtx);
>  void expand_tuple_move (machine_mode, rtx *);
>  machine_mode preferred_simd_mode (scalar_mode);
> -extern bool riscv_vector_mask_mode_p (machine_mode);
> -extern opt_machine_mode riscv_vector_get_mask_mode (machine_mode mode);
> -extern rtx get_mask_policy_no_pred (void);
> -extern rtx get_tail_policy_no_pred (void);
> +opt_machine_mode get_mask_mode (machine_mode);
>  }
>
>  /* We classify builtin types into two classes:
> diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
> index 8c7f3206771..7ca49ca67c1 100644
> --- a/gcc/config/riscv/riscv-v.cc
> +++ b/gcc/config/riscv/riscv-v.cc
> @@ -43,7 +43,6 @@
>  #include "expr.h"
>  #include "optabs.h"
>  #include "tm-constrs.h"
> -#include "riscv-vector-builtins.h"
>  #include "rtx-vector-builder.h"
>  #include "targhooks.h"
>
> @@ -479,43 +478,12 @@ get_avl_type_rtx (enum avl_type type)
>return gen_int_mode (type, Pmode);
>  }
>
> -/* Return the mask policy for no predication.  */
> -rtx
> -get_mask_policy_no_pred (void)
> -{
> -  return get_mask_policy_for_pred (PRED_TYPE_none);
> -}
> -
> -/* Return the tail policy for no predication.  */
> -rtx
> -get_tail_policy_no_pred (void)
> -{
> -  return get_tail_policy_for_pred (PRED_TYPE_none);
> -}
> -
> -/* Return true if it is a RVV mask mode.  */
> -bool
> 

RE: [PATCH] machine_mode type size: Extend enum size from 8-bit to 16-bit

2023-05-08 Thread Li, Pan2 via Gcc-patches
return !dv || (int) GET_CODE ((rtx) dv) != (int) VALUE; } is able to fix this 
ICE after mode bits change. I will re-trigger the memory allocate bytes test 
with below changes for X86.

rtx_def code 16 => 8 bits.
rtx_def mode 8 => 16 bits.
tree_base code unchanged.

Pan

-Original Message-
From: Li, Pan2 
Sent: Monday, May 8, 2023 2:42 PM
To: Richard Biener ; Jeff Law 
Cc: Kito Cheng ; juzhe.zh...@rivai.ai; richard.sandiford 
; gcc-patches ; palmer 
; jakub 
Subject: RE: [PATCH] machine_mode type size: Extend enum size from 8-bit to 
16-bit

Oops. Actually I am patching a version as you mentioned like storage 
allocation. Thank you Richard, will try your suggestion and keep you posted.

Pan

-Original Message-
From: Richard Biener 
Sent: Monday, May 8, 2023 2:30 PM
To: Jeff Law 
Cc: Li, Pan2 ; Kito Cheng ; 
juzhe.zh...@rivai.ai; richard.sandiford ; 
gcc-patches ; palmer ; jakub 

Subject: Re: [PATCH] machine_mode type size: Extend enum size from 8-bit to 
16-bit

On Sun, 7 May 2023, Jeff Law wrote:

> 
> 
> On 5/6/23 19:55, Li, Pan2 wrote:
> > It looks like we cannot simply swap the code and mode in rtx_def, 
> > the code may have to be the same bits as the tree_code in tree_base.
> > Or we will meet ICE like below.
> > 
> > rtx_def code 16 => 8 bits.
> > rtx_def mode 8 => 16 bits.
> > 
> > static inline decl_or_value
> > dv_from_value (rtx value)
> > {
> >decl_or_value dv;
> >dv = value;
> >gcc_checking_assert (dv_is_value_p (dv));  <=  ICE
> >return dv;
> Ugh.  We really just need to fix this code.  It assumes particular 
> structure layouts and that's just wrong/dumb.

Well, it's a neat trick ... we just need to adjust it to

static inline bool
dv_is_decl_p (decl_or_value dv)
{
  return !dv || (int) GET_CODE ((rtx) dv) != (int) VALUE; }  

I think (and hope for the 'decl' case the bits inspected are never 'VALUE').  
Of course the above stinks from a TBAA perspective ...

Any "real" fix would require allocating storage for a discriminator and thus 
hurt the resource constrained var-tracking a lot.

Richard.


Re: [PATCH 1/3] PHIOPT: Add diamond bb form to factor_out_conditional_conversion

2023-05-08 Thread Richard Biener via Gcc-patches
On Mon, May 8, 2023 at 8:51 AM Richard Biener
 wrote:
>
> On Sun, May 7, 2023 at 7:05 AM Andrew Pinski via Gcc-patches
>  wrote:
> >
> > So the function factor_out_conditional_conversion already supports
> > diamond shaped bb forms, just need to be called for such a thing.
> >
> > harden-cond-comp.c needed to be changed as we would optimize out the
> > conversion now and that causes the compare hardening not needing to
> > split the block which it was testing. So change it such that there
> > would be no chance of optimization.
> >
> > Also add two testcases that showed the improvement. PR 103771 is
> > solved in ifconvert also for the vectorizer but now it is solved
> > in a general sense.
> >
> > OK? Bootstrapped and tested on x86_64-linux-gnu with no regressions.
>
> So what does it do for the diamond case?  Factor out _common_ operations
> only or also promote conditionally executed operations to unconditionally
> executed?  Apart from cost considerations (with the looping patch any number
> might end up promoted) there's correctness issues for negation (signed 
> overflow)
> and for non-call exceptions there's possible external throws.
>
> If it only factors out common operations the patch is OK (but the testcases
> suggest that's not the case).  Otherwise we should add a limit as to how many
> ops we hoist (maybe one for the diamond case?).  Thinking over it the same
> issue of course exists for the non-diamond case?

Oh, and we hoist the stuff without being sure the final PHI will be if-converted
in the end, correct?  Can we somehow improve on that, aka tentatively
convert if-convert the PHI first?

Richard.

> Richard.
>
> > PR tree-optimization/49959
> > PR tree-optimization/103771
> >
> > gcc/ChangeLog:
> >
> > * tree-ssa-phiopt.cc (pass_phiopt::execute): Support
> > Diamond shaped bb form for factor_out_conditional_conversion.
> >
> > gcc/testsuite/ChangeLog:
> >
> > * c-c++-common/torture/harden-cond-comp.c: Change testcase
> > slightly to avoid the new phiopt optimization.
> > * gcc.dg/tree-ssa/abs-2.c: New test.
> > * gcc.dg/tree-ssa/pr103771.c: New test.
> > ---
> >  .../c-c++-common/torture/harden-cond-comp.c   |  6 +++---
> >  gcc/testsuite/gcc.dg/tree-ssa/abs-2.c | 20 +++
> >  gcc/testsuite/gcc.dg/tree-ssa/pr103771.c  | 18 +
> >  gcc/tree-ssa-phiopt.cc|  2 +-
> >  4 files changed, 42 insertions(+), 4 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/abs-2.c
> >  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr103771.c
> >
> > diff --git a/gcc/testsuite/c-c++-common/torture/harden-cond-comp.c 
> > b/gcc/testsuite/c-c++-common/torture/harden-cond-comp.c
> > index 5aad890a1d3..dcf364ee993 100644
> > --- a/gcc/testsuite/c-c++-common/torture/harden-cond-comp.c
> > +++ b/gcc/testsuite/c-c++-common/torture/harden-cond-comp.c
> > @@ -1,11 +1,11 @@
> >  /* { dg-do compile } */
> >  /* { dg-options "-fharden-conditional-branches -fharden-compares 
> > -fdump-tree-hardcbr -fdump-tree-hardcmp -ffat-lto-objects" } */
> >
> > -int f(int i, int j) {
> > +int f(int i, int j, int k, int l) {
> >if (i == 0)
> > -return j != 0;
> > +return (j != 0) + l;
> >else
> > -return i * j != 0;
> > +return (i * j != 0) * k;
> >  }
> >
> >  /* { dg-final { scan-tree-dump-times "Splitting edge" 2 "hardcbr" } } */
> > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/abs-2.c 
> > b/gcc/testsuite/gcc.dg/tree-ssa/abs-2.c
> > new file mode 100644
> > index 000..328b1802541
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.dg/tree-ssa/abs-2.c
> > @@ -0,0 +1,20 @@
> > +/* PR tree-optimization/49959 */
> > +/* { dg-do compile } */
> > +/* { dg-options "-O -fdump-tree-phiopt1-details" } */
> > +
> > +#define ABS(X)(((X)>0)?(X):-(X))
> > +unsigned long
> > +test_abs(int *cur)
> > +{
> > +  unsigned long sad = 0;
> > +  if (cur[0] > 0)
> > +sad = cur[0];
> > +  else
> > +sad = -cur[0];
> > +  return sad;
> > +}
> > +
> > +/* We should figure out that test_abs has an ABS_EXPR in it. */
> > +/* { dg-final { scan-tree-dump " = ABS_EXPR" "phiopt1"} } */
> > +/* { dg-final { scan-tree-dump-times "changed to factor conversion out 
> > from" 1 "phiopt1"} } */
> > +
> > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr103771.c 
> > b/gcc/testsuite/gcc.dg/tree-ssa/pr103771.c
> > new file mode 100644
> > index 000..97c9db846cb
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr103771.c
> > @@ -0,0 +1,18 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3 -fdump-tree-phiopt1-details" } */
> > +/* { dg-final { scan-tree-dump-times "changed to factor conversion out 
> > from COND_EXPR." 1 "phiopt1" } } */
> > +
> > +typedef unsigned char uint8_t;
> > +
> > +static uint8_t x264_clip_uint8 (int x)
> > +{
> > +  return x & (~255) ? (-x) >> 31 : x;
> > +}
> > +
> > +void
> > +mc_weight (uint8_t* __restrict dst, uint8_t* __restrict src,
> > +   

Re: [PATCH 1/3] PHIOPT: Add diamond bb form to factor_out_conditional_conversion

2023-05-08 Thread Richard Biener via Gcc-patches
On Sun, May 7, 2023 at 7:05 AM Andrew Pinski via Gcc-patches
 wrote:
>
> So the function factor_out_conditional_conversion already supports
> diamond shaped bb forms, just need to be called for such a thing.
>
> harden-cond-comp.c needed to be changed as we would optimize out the
> conversion now and that causes the compare hardening not needing to
> split the block which it was testing. So change it such that there
> would be no chance of optimization.
>
> Also add two testcases that showed the improvement. PR 103771 is
> solved in ifconvert also for the vectorizer but now it is solved
> in a general sense.
>
> OK? Bootstrapped and tested on x86_64-linux-gnu with no regressions.

So what does it do for the diamond case?  Factor out _common_ operations
only or also promote conditionally executed operations to unconditionally
executed?  Apart from cost considerations (with the looping patch any number
might end up promoted) there's correctness issues for negation (signed overflow)
and for non-call exceptions there's possible external throws.

If it only factors out common operations the patch is OK (but the testcases
suggest that's not the case).  Otherwise we should add a limit as to how many
ops we hoist (maybe one for the diamond case?).  Thinking over it the same
issue of course exists for the non-diamond case?

Richard.

> PR tree-optimization/49959
> PR tree-optimization/103771
>
> gcc/ChangeLog:
>
> * tree-ssa-phiopt.cc (pass_phiopt::execute): Support
> Diamond shaped bb form for factor_out_conditional_conversion.
>
> gcc/testsuite/ChangeLog:
>
> * c-c++-common/torture/harden-cond-comp.c: Change testcase
> slightly to avoid the new phiopt optimization.
> * gcc.dg/tree-ssa/abs-2.c: New test.
> * gcc.dg/tree-ssa/pr103771.c: New test.
> ---
>  .../c-c++-common/torture/harden-cond-comp.c   |  6 +++---
>  gcc/testsuite/gcc.dg/tree-ssa/abs-2.c | 20 +++
>  gcc/testsuite/gcc.dg/tree-ssa/pr103771.c  | 18 +
>  gcc/tree-ssa-phiopt.cc|  2 +-
>  4 files changed, 42 insertions(+), 4 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/abs-2.c
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr103771.c
>
> diff --git a/gcc/testsuite/c-c++-common/torture/harden-cond-comp.c 
> b/gcc/testsuite/c-c++-common/torture/harden-cond-comp.c
> index 5aad890a1d3..dcf364ee993 100644
> --- a/gcc/testsuite/c-c++-common/torture/harden-cond-comp.c
> +++ b/gcc/testsuite/c-c++-common/torture/harden-cond-comp.c
> @@ -1,11 +1,11 @@
>  /* { dg-do compile } */
>  /* { dg-options "-fharden-conditional-branches -fharden-compares 
> -fdump-tree-hardcbr -fdump-tree-hardcmp -ffat-lto-objects" } */
>
> -int f(int i, int j) {
> +int f(int i, int j, int k, int l) {
>if (i == 0)
> -return j != 0;
> +return (j != 0) + l;
>else
> -return i * j != 0;
> +return (i * j != 0) * k;
>  }
>
>  /* { dg-final { scan-tree-dump-times "Splitting edge" 2 "hardcbr" } } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/abs-2.c 
> b/gcc/testsuite/gcc.dg/tree-ssa/abs-2.c
> new file mode 100644
> index 000..328b1802541
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/abs-2.c
> @@ -0,0 +1,20 @@
> +/* PR tree-optimization/49959 */
> +/* { dg-do compile } */
> +/* { dg-options "-O -fdump-tree-phiopt1-details" } */
> +
> +#define ABS(X)(((X)>0)?(X):-(X))
> +unsigned long
> +test_abs(int *cur)
> +{
> +  unsigned long sad = 0;
> +  if (cur[0] > 0)
> +sad = cur[0];
> +  else
> +sad = -cur[0];
> +  return sad;
> +}
> +
> +/* We should figure out that test_abs has an ABS_EXPR in it. */
> +/* { dg-final { scan-tree-dump " = ABS_EXPR" "phiopt1"} } */
> +/* { dg-final { scan-tree-dump-times "changed to factor conversion out from" 
> 1 "phiopt1"} } */
> +
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr103771.c 
> b/gcc/testsuite/gcc.dg/tree-ssa/pr103771.c
> new file mode 100644
> index 000..97c9db846cb
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr103771.c
> @@ -0,0 +1,18 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -fdump-tree-phiopt1-details" } */
> +/* { dg-final { scan-tree-dump-times "changed to factor conversion out from 
> COND_EXPR." 1 "phiopt1" } } */
> +
> +typedef unsigned char uint8_t;
> +
> +static uint8_t x264_clip_uint8 (int x)
> +{
> +  return x & (~255) ? (-x) >> 31 : x;
> +}
> +
> +void
> +mc_weight (uint8_t* __restrict dst, uint8_t* __restrict src,
> +  int i_width,int i_scale)
> +{
> +  for(int x = 0; x < i_width; x++)
> +dst[x] = x264_clip_uint8 (src[x] * i_scale);
> +}
> diff --git a/gcc/tree-ssa-phiopt.cc b/gcc/tree-ssa-phiopt.cc
> index f14b7e8b7e6..41fea78dc8d 100644
> --- a/gcc/tree-ssa-phiopt.cc
> +++ b/gcc/tree-ssa-phiopt.cc
> @@ -4087,7 +4087,7 @@ pass_phiopt::execute (function *)
>
>gphi *newphi;
>if (single_pred_p (bb1)
> - && !diamond_p
> + && EDGE_COUNT (merge->preds) == 2
>   

Re: [PATCH 2/3] PHIOPT: Loop over calling factor_out_conditional_conversion

2023-05-08 Thread Richard Biener via Gcc-patches
On Sun, May 7, 2023 at 6:45 AM Andrew Pinski via Gcc-patches
 wrote:
>
> After adding diamond shaped bb support to factor_out_conditional_conversion,
> we can get a case where we have two conversions that needs factored out
> and then would have another phiopt happen.
> An example is:
> ```
> static inline unsigned long long g(int t)
> {
>   unsigned t1 = t;
>   return t1;
> }
> unsigned long long f(int c, int d, int e)
> {
>   unsigned long long t;
>   if (c > d)
> t = g(c);
>   else
> t = g(d);
>   return t;
> }
> ```
> In this case we should get a MAX_EXPR in phiopt1 with two casts.
> Before this patch, we would just factor out the outer cast and then
> wait till phiopt2 to factor out the inner cast.
>
> OK? Bootstrapped and tested on x86_64-linux-gnu with no regressions.

OK.

Thanks,
Richard.

> gcc/ChangeLog:
>
> * tree-ssa-phiopt.cc (pass_phiopt::execute): Loop
> over factor_out_conditional_conversion.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.dg/tree-ssa/minmax-17.c: New test.
> ---
>  gcc/testsuite/gcc.dg/tree-ssa/minmax-17.c | 21 ++
>  gcc/tree-ssa-phiopt.cc| 27 +--
>  2 files changed, 36 insertions(+), 12 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/minmax-17.c
>
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/minmax-17.c 
> b/gcc/testsuite/gcc.dg/tree-ssa/minmax-17.c
> new file mode 100644
> index 000..bd737e6b4cb
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/minmax-17.c
> @@ -0,0 +1,21 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O -fdump-tree-phiopt1-details" } */
> +
> +static inline unsigned long long g(int t)
> +{
> +  unsigned t1 = t;
> +  return t1;
> +}
> +unsigned long long test_max(int c, int d, int e)
> +{
> +  unsigned long long t;
> +  if (c > d)
> +t = g(c);
> +  else
> +t = g(d);
> +  return t;
> +}
> +
> +/* We should figure out that test_max has an MAX_EXPR in it. */
> +/* { dg-final { scan-tree-dump " = MAX_EXPR" "phiopt1"} } */
> +/* { dg-final { scan-tree-dump-times "changed to factor conversion out from" 
> 2 "phiopt1"} } */
> diff --git a/gcc/tree-ssa-phiopt.cc b/gcc/tree-ssa-phiopt.cc
> index 41fea78dc8d..7fe088b13ff 100644
> --- a/gcc/tree-ssa-phiopt.cc
> +++ b/gcc/tree-ssa-phiopt.cc
> @@ -4085,20 +4085,23 @@ pass_phiopt::execute (function *)
>   node.  */
>gcc_assert (arg0 != NULL_TREE && arg1 != NULL_TREE);
>
> -  gphi *newphi;
>if (single_pred_p (bb1)
> - && EDGE_COUNT (merge->preds) == 2
> - && (newphi = factor_out_conditional_conversion (e1, e2, phi,
> - arg0, arg1,
> - cond_stmt)))
> + && EDGE_COUNT (merge->preds) == 2)
> {
> - phi = newphi;
> - /* factor_out_conditional_conversion may create a new PHI in
> -BB2 and eliminate an existing PHI in BB2.  Recompute values
> -that may be affected by that change.  */
> - arg0 = gimple_phi_arg_def (phi, e1->dest_idx);
> - arg1 = gimple_phi_arg_def (phi, e2->dest_idx);
> - gcc_assert (arg0 != NULL_TREE && arg1 != NULL_TREE);
> + gphi *newphi = phi;
> + while (newphi)
> +   {
> + phi = newphi;
> + /* factor_out_conditional_conversion may create a new PHI in
> +BB2 and eliminate an existing PHI in BB2.  Recompute values
> +that may be affected by that change.  */
> + arg0 = gimple_phi_arg_def (phi, e1->dest_idx);
> + arg1 = gimple_phi_arg_def (phi, e2->dest_idx);
> + gcc_assert (arg0 != NULL_TREE && arg1 != NULL_TREE);
> + newphi = factor_out_conditional_conversion (e1, e2, phi,
> + arg0, arg1,
> + cond_stmt);
> +   }
> }
>
>/* Do the replacement of conditional if it can be done.  */
> --
> 2.31.1
>


Re: [PATCH 3/3] PHIOPT: factor out unary operations instead of just conversions

2023-05-08 Thread Richard Biener via Gcc-patches
On Sun, May 7, 2023 at 6:44 AM Andrew Pinski via Gcc-patches
 wrote:
>
> After using factor_out_conditional_conversion with diamond bb,
> we should be able do use it also for all normal unary gimple and not
> just conversions. This allows to optimize PR 59424 for an example.
> This is also a start to optimize PR 64700 and a few others.
>
> OK? Bootstrapped and tested on x86_64-linux-gnu with no regressions.

OK.

> An example of this is:
> ```
> static inline unsigned long long g(int t)
> {
>   unsigned t1 = t;
>   return t1;
> }
> static int abs1(int a)
> {
>   if (a < 0)
> a = -a;
>   return a;
> }
> unsigned long long f(int c, int d, int e)
> {
>   unsigned long long t;
>   if (d > e)
> t = g(abs1(d));
>   else
> t = g(abs1(e));
>   return t;
> }
> ```
>
> Which should be optimized to:
>   _9 = MAX_EXPR ;
>   _4 = ABS_EXPR <_9>;
>   t_3 = (long long unsigned intD.16) _4;
>
> gcc/ChangeLog:
>
> * tree-ssa-phiopt.cc (factor_out_conditional_conversion): Rename to 
> ...
> (factor_out_conditional_operation): This and add support for all unary
> operations.
> (pass_phiopt::execute): Update call to 
> factor_out_conditional_conversion
> to call factor_out_conditional_operation instead.
>
> PR tree-optimization/109424
> PR tree-optimization/59424
>
> gcc/testsuite/ChangeLog:
>
> * gcc.dg/tree-ssa/abs-2.c: Update tree scan for
> details change in wording.
> * gcc.dg/tree-ssa/minmax-17.c: Likewise.
> * gcc.dg/tree-ssa/pr103771.c: Likewise.
> * gcc.dg/tree-ssa/minmax-18.c: New test.
> * gcc.dg/tree-ssa/minmax-19.c: New test.
> ---
>  gcc/testsuite/gcc.dg/tree-ssa/abs-2.c |  2 +-
>  gcc/testsuite/gcc.dg/tree-ssa/minmax-17.c |  2 +-
>  gcc/testsuite/gcc.dg/tree-ssa/minmax-18.c | 27 +++
>  gcc/testsuite/gcc.dg/tree-ssa/minmax-19.c | 10 
>  gcc/testsuite/gcc.dg/tree-ssa/pr103771.c  |  2 +-
>  gcc/tree-ssa-phiopt.cc| 56 +--
>  6 files changed, 71 insertions(+), 28 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/minmax-18.c
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/minmax-19.c
>
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/abs-2.c 
> b/gcc/testsuite/gcc.dg/tree-ssa/abs-2.c
> index 328b1802541..f8bbeb43237 100644
> --- a/gcc/testsuite/gcc.dg/tree-ssa/abs-2.c
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/abs-2.c
> @@ -16,5 +16,5 @@ test_abs(int *cur)
>
>  /* We should figure out that test_abs has an ABS_EXPR in it. */
>  /* { dg-final { scan-tree-dump " = ABS_EXPR" "phiopt1"} } */
> -/* { dg-final { scan-tree-dump-times "changed to factor conversion out from" 
> 1 "phiopt1"} } */
> +/* { dg-final { scan-tree-dump-times "changed to factor operation out from" 
> 1 "phiopt1"} } */
>
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/minmax-17.c 
> b/gcc/testsuite/gcc.dg/tree-ssa/minmax-17.c
> index bd737e6b4cb..7c76cfc62a9 100644
> --- a/gcc/testsuite/gcc.dg/tree-ssa/minmax-17.c
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/minmax-17.c
> @@ -18,4 +18,4 @@ unsigned long long test_max(int c, int d, int e)
>
>  /* We should figure out that test_max has an MAX_EXPR in it. */
>  /* { dg-final { scan-tree-dump " = MAX_EXPR" "phiopt1"} } */
> -/* { dg-final { scan-tree-dump-times "changed to factor conversion out from" 
> 2 "phiopt1"} } */
> +/* { dg-final { scan-tree-dump-times "changed to factor operation out from" 
> 2 "phiopt1"} } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/minmax-18.c 
> b/gcc/testsuite/gcc.dg/tree-ssa/minmax-18.c
> new file mode 100644
> index 000..c8e1670f64a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/minmax-18.c
> @@ -0,0 +1,27 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O -fdump-tree-phiopt1-details" } */
> +
> +static inline unsigned long long g(int t)
> +{
> +  unsigned t1 = t;
> +  return t1;
> +}
> +static inline int abs1(int a)
> +{
> +  if (a < 0)
> +a = -a;
> +  return a;
> +}
> +unsigned long long f(int c, int d, int e)
> +{
> +  unsigned long long t;
> +  if (d > e)
> +t = g(abs1(d));
> +  else
> +t = g(abs1(e));
> +  return t;
> +}
> +
> +/* { dg-final { scan-tree-dump " = MAX_EXPR" "phiopt1"} } */
> +/* { dg-final { scan-tree-dump-times " = ABS_EXPR" 2 "phiopt1"} } */
> +/* { dg-final { scan-tree-dump-times "changed to factor operation out from" 
> 3 "phiopt1"} } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/minmax-19.c 
> b/gcc/testsuite/gcc.dg/tree-ssa/minmax-19.c
> new file mode 100644
> index 000..5ed55fe2e23
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/minmax-19.c
> @@ -0,0 +1,10 @@
> +/* PR tree-optimization/109424 */
> +/* { dg-do compile } */
> +/* { dg-options "-O -fdump-tree-phiopt1-details" } */
> +
> +int f2(int x, int y)
> +{
> +return (x > y) ? ~x : ~y;
> +}
> +/* { dg-final { scan-tree-dump " = MAX_EXPR" "phiopt1"} } */
> +/* { dg-final { scan-tree-dump-times "changed to factor operation out from" 
> 1 "phiopt1"} } */
> diff --git 

Re: [PATCH] Don't call emit_clobber in lower-subreg.cc's resolve_simple_move.

2023-05-08 Thread Richard Biener via Gcc-patches
On Sat, May 6, 2023 at 8:46 PM Jeff Law via Gcc-patches
 wrote:
>
>
>
> On 5/6/23 06:57, Roger Sayle wrote:
> >
> > Following up on posts/reviews by Segher and Uros, there's some question
> > over why the middle-end's lower subreg pass emits a clobber (of a
> > multi-word register) into the instruction stream before emitting the
> > sequence of moves of the word-sized parts.  This clobber interferes
> > with (LRA) register allocation, preventing the multi-word pseudo to
> > remain in the same hard registers.  This patch eliminates this
> > (presumably superfluous) clobber and thereby improves register allocation.
> Those clobbered used to help dataflow analysis know that a multi word
> register was fully assigned by a subsequent sequence.  I suspect they
> haven't been terribly useful in quite a while.

Likely - maybe they still make a difference for some targets though.
It might be interesting to see whether combining the clobber with the
first set or making the set a multi-set with a parallel would be any
better?

>
>
> >
> > A concrete example of the observed improvement is PR target/43644.
> > For the test case:
> > __int128 foo(__int128 x, __int128 y) { return x+y; }
> >
> > on x86_64-pc-linux-gnu, gcc -O2 currently generates:
> >
> > foo:movq%rsi, %rax
> >  movq%rdi, %r8
> >  movq%rax, %rdi
> >  movq%rdx, %rax
> >  movq%rcx, %rdx
> >  addq%r8, %rax
> >  adcq%rdi, %rdx
> >  ret
> >
> > with this patch, we now generate the much improved:
> >
> > foo:movq%rdx, %rax
> >  movq%rcx, %rdx
> >  addq%rdi, %rax
> >  adcq%rsi, %rdx
> >  ret
> >
> > This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
> > and make -k check, both with and without --target_board=unix{-m32} with
> > no new failures.  OK for mainline?
> >
> >
> > 2023-05-06  Roger Sayle  
> >
> > gcc/ChangeLog
> >  PR target/43644
> >  * lower-subreg.cc (resolve_simple_move): Don't emit a clobber
> >  immediately before moving a multi-word register by parts.
> >
> > gcc/testsuite/ChangeLog
> >  PR target/43644
> >  * gcc.target/i386/pr43644.c: New test case.
> OK for the trunk.  I won't be at all surprised to see fallout in the
> various target tests.  We can fault in fixes as needed.  More
> importantly I think we want as much soak time for this change as we can
> in case there are unexpected consequences.
>
> jeff


RE: [PATCH] machine_mode type size: Extend enum size from 8-bit to 16-bit

2023-05-08 Thread Li, Pan2 via Gcc-patches
Oops. Actually I am patching a version as you mentioned like storage 
allocation. Thank you Richard, will try your suggestion and keep you posted.

Pan

-Original Message-
From: Richard Biener  
Sent: Monday, May 8, 2023 2:30 PM
To: Jeff Law 
Cc: Li, Pan2 ; Kito Cheng ; 
juzhe.zh...@rivai.ai; richard.sandiford ; 
gcc-patches ; palmer ; jakub 

Subject: Re: [PATCH] machine_mode type size: Extend enum size from 8-bit to 
16-bit

On Sun, 7 May 2023, Jeff Law wrote:

> 
> 
> On 5/6/23 19:55, Li, Pan2 wrote:
> > It looks like we cannot simply swap the code and mode in rtx_def, 
> > the code may have to be the same bits as the tree_code in tree_base. 
> > Or we will meet ICE like below.
> > 
> > rtx_def code 16 => 8 bits.
> > rtx_def mode 8 => 16 bits.
> > 
> > static inline decl_or_value
> > dv_from_value (rtx value)
> > {
> >decl_or_value dv;
> >dv = value;
> >gcc_checking_assert (dv_is_value_p (dv));  <=  ICE
> >return dv;
> Ugh.  We really just need to fix this code.  It assumes particular 
> structure layouts and that's just wrong/dumb.

Well, it's a neat trick ... we just need to adjust it to

static inline bool
dv_is_decl_p (decl_or_value dv)
{
  return !dv || (int) GET_CODE ((rtx) dv) != (int) VALUE; }  

I think (and hope for the 'decl' case the bits inspected are never 'VALUE').  
Of course the above stinks from a TBAA perspective ...

Any "real" fix would require allocating storage for a discriminator and thus 
hurt the resource constrained var-tracking a lot.

Richard.


Re: [PATCH-4, rs6000] Change ilp32 target check for some scalar-extract-sig and scalar-insert-exp test cases

2023-05-08 Thread Kewen.Lin via Gcc-patches
Hi Haochen,

on 2023/1/4 14:17, HAO CHEN GUI wrote:
> Hi,
>   "ilp32" is used in these test cases to make sure test cases only run on a
> 32-bit environment. Unfortunately, these cases also run with
> "-m32/-mpowerpc64" which causes unexpected errors. This patch changes the
> target check to skip if "has_arch_ppc64" is set. So the test cases won't run
> when arch_ppc64 has been set.

I think you meant one artificial run with runtestflags like
RUNTESTFLAGS="--target_board=unix/-m32/-mpowerpc64 ...", thanks for catching,
this patch is OK for trunk.

BR,
Kewen

> 
>   Bootstrapped and tested on powerpc64-linux BE and LE with no regressions.
> Is this okay for trunk? Any recommendations? Thanks a lot.
> 
> Gui Haochen
> 
> ChangeLog
> 2023-01-03  Haochen Gui  
> 
> gcc/testsuite/
>   * gcc.target/powerpc/bfp/scalar-extract-sig-2.c: Replace ilp32 check
>   with dg-skip-if has_arch_ppc64.
>   * gcc.target/powerpc/bfp/scalar-insert-exp-2.c: Likewise.
>   * gcc.target/powerpc/bfp/scalar-insert-exp-5.c: Likewise.
> 
> patch.diff
> diff --git a/gcc/testsuite/gcc.target/powerpc/bfp/scalar-extract-sig-2.c 
> b/gcc/testsuite/gcc.target/powerpc/bfp/scalar-extract-sig-2.c
> index 39ee74c94dc..148b5fbd9fa 100644
> --- a/gcc/testsuite/gcc.target/powerpc/bfp/scalar-extract-sig-2.c
> +++ b/gcc/testsuite/gcc.target/powerpc/bfp/scalar-extract-sig-2.c
> @@ -1,5 +1,5 @@
>  /* { dg-do compile { target { powerpc*-*-* } } } */
> -/* { dg-require-effective-target ilp32 } */
> +/* { dg-skip-if "" { has_arch_ppc64 } } */
>  /* { dg-require-effective-target powerpc_p9vector_ok } */
>  /* { dg-options "-mdejagnu-cpu=power9" } */
> 
> diff --git a/gcc/testsuite/gcc.target/powerpc/bfp/scalar-insert-exp-2.c 
> b/gcc/testsuite/gcc.target/powerpc/bfp/scalar-insert-exp-2.c
> index efd69725905..956c1183beb 100644
> --- a/gcc/testsuite/gcc.target/powerpc/bfp/scalar-insert-exp-2.c
> +++ b/gcc/testsuite/gcc.target/powerpc/bfp/scalar-insert-exp-2.c
> @@ -1,5 +1,5 @@
>  /* { dg-do compile { target { powerpc*-*-* } } } */
> -/* { dg-require-effective-target ilp32 } */
> +/* { dg-skip-if "" { has_arch_ppc64 } } */
>  /* { dg-require-effective-target powerpc_p9vector_ok } */
>  /* { dg-options "-mdejagnu-cpu=power9" } */
> 
> diff --git a/gcc/testsuite/gcc.target/powerpc/bfp/scalar-insert-exp-5.c 
> b/gcc/testsuite/gcc.target/powerpc/bfp/scalar-insert-exp-5.c
> index f85966a6fdf..9a7949fb89a 100644
> --- a/gcc/testsuite/gcc.target/powerpc/bfp/scalar-insert-exp-5.c
> +++ b/gcc/testsuite/gcc.target/powerpc/bfp/scalar-insert-exp-5.c
> @@ -1,5 +1,5 @@
>  /* { dg-do compile { target { powerpc*-*-* } } } */
> -/* { dg-require-effective-target ilp32 } */
> +/* { dg-skip-if "" { has_arch_ppc64 } } */
>  /* { dg-require-effective-target powerpc_p9vector_ok } */
>  /* { dg-options "-mdejagnu-cpu=power9" } */
> 



Re: [PATCH-3, rs6000] Change mode and insn condition for scalar insert exp instruction

2023-05-08 Thread Kewen.Lin via Gcc-patches
Hi Haochen,

on 2023/1/4 14:17, HAO CHEN GUI wrote:
> Hi,
>   This patch changes the mode of exponent to GPR in scalar insert exp
> pattern, as the exponent can be put into a 32-bit register. Also the
> condition check is changed from TARGET_64BIT to TARGET_POWERPC64.
> 
>   The test cases are modified according to the changes of expand pattern.
> 
>   Bootstrapped and tested on powerpc64-linux BE and LE with no regressions.
> Is this okay for trunk? Any recommendations? Thanks a lot.
> 

This patch is OK for trunk, thanks!

BR,
Kewen

> Gui Haochen
> 
> ChangeLog
> 2023-01-03  Haochen Gui  
> 
> gcc/
>   * config/rs6000/rs6000-builtins.def
>   (__builtin_vsx_scalar_insert_exp): Replace bif-pattern from xsiexpdp
>   to xsiexpdp_di.
>   (__builtin_vsx_scalar_insert_exp_dp): Replace bif-pattern from
>   xsiexpdpf to xsiexpdpf_di.
>   * config/rs6000/vsx.md (xsiexpdp): Rename to...
>   (xsiexpdp_): ..., set the mode of second operand to GPR and
>   replace TARGET_64BIT with TARGET_POWERPC64.
>   (xsiexpdpf): Rename to...
>   (xsiexpdpf_): ..., set the mode of second operand to GPR and
>   replace TARGET_64BIT with TARGET_POWERPC64.
> 
> gcc/testsuite/
>   * gcc.target/powerpc/bfp/scalar-insert-exp-0.c: Replace lp64 check
>   with has_arch_ppc64.
>   * gcc.target/powerpc/bfp/scalar-insert-exp-1.c: Likewise.
>   * gcc.target/powerpc/bfp/scalar-insert-exp-12.c: Likewise.
>   * gcc.target/powerpc/bfp/scalar-insert-exp-13.c: Likewise.
>   * gcc.target/powerpc/bfp/scalar-insert-exp-3.c: Likewise.
>   * gcc.target/powerpc/bfp/scalar-insert-exp-4.c: Likewise.
> 
> patch.diff
> diff --git a/gcc/config/rs6000/rs6000-builtins.def 
> b/gcc/config/rs6000/rs6000-builtins.def
> index 25647b7bdd2..b1b5002d7d9 100644
> --- a/gcc/config/rs6000/rs6000-builtins.def
> +++ b/gcc/config/rs6000/rs6000-builtins.def
> @@ -2854,10 +2854,10 @@
> 
>const double __builtin_vsx_scalar_insert_exp (unsigned long long, \
>  unsigned long long);
> -VSIEDP xsiexpdp {}
> +VSIEDP xsiexpdp_di {}
> 
>const double __builtin_vsx_scalar_insert_exp_dp (double, unsigned long 
> long);
> -VSIEDPF xsiexpdpf {}
> +VSIEDPF xsiexpdpf_di {}
> 
>pure vsc __builtin_vsx_xl_len_r (void *, signed long);
>  XL_LEN_R xl_len_r {}
> diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md
> index 27e03a4cf6c..3376090cc6f 100644
> --- a/gcc/config/rs6000/vsx.md
> +++ b/gcc/config/rs6000/vsx.md
> @@ -5137,22 +5137,22 @@ (define_insn "xsiexpqp_"
>[(set_attr "type" "vecmove")])
> 
>  ;; VSX Scalar Insert Exponent Double-Precision
> -(define_insn "xsiexpdp"
> +(define_insn "xsiexpdp_"
>[(set (match_operand:DF 0 "vsx_register_operand" "=wa")
>   (unspec:DF [(match_operand:DI 1 "register_operand" "r")
> - (match_operand:DI 2 "register_operand" "r")]
> + (match_operand:GPR 2 "register_operand" "r")]
>UNSPEC_VSX_SIEXPDP))]
> -  "TARGET_P9_VECTOR && TARGET_64BIT"
> +  "TARGET_P9_VECTOR && TARGET_POWERPC64"
>"xsiexpdp %x0,%1,%2"
>[(set_attr "type" "fpsimple")])
> 
>  ;; VSX Scalar Insert Exponent Double-Precision Floating Point Argument
> -(define_insn "xsiexpdpf"
> +(define_insn "xsiexpdpf_"
>[(set (match_operand:DF 0 "vsx_register_operand" "=wa")
>   (unspec:DF [(match_operand:DF 1 "register_operand" "r")
> - (match_operand:DI 2 "register_operand" "r")]
> + (match_operand:GPR 2 "register_operand" "r")]
>UNSPEC_VSX_SIEXPDP))]
> -  "TARGET_P9_VECTOR && TARGET_64BIT"
> +  "TARGET_P9_VECTOR && TARGET_POWERPC64"
>"xsiexpdp %x0,%1,%2"
>[(set_attr "type" "fpsimple")])
> 
> diff --git a/gcc/testsuite/gcc.target/powerpc/bfp/scalar-insert-exp-0.c 
> b/gcc/testsuite/gcc.target/powerpc/bfp/scalar-insert-exp-0.c
> index d8243258a67..88d77564158 100644
> --- a/gcc/testsuite/gcc.target/powerpc/bfp/scalar-insert-exp-0.c
> +++ b/gcc/testsuite/gcc.target/powerpc/bfp/scalar-insert-exp-0.c
> @@ -1,7 +1,7 @@
>  /* { dg-do compile { target { powerpc*-*-* } } } */
> -/* { dg-require-effective-target lp64 } */
>  /* { dg-require-effective-target powerpc_p9vector_ok } */
>  /* { dg-options "-mdejagnu-cpu=power9" } */
> +/* { dg-require-effective-target has_arch_ppc64 } */
> 
>  /* This test should succeed only on 64-bit configurations.  */
>  #include 
> diff --git a/gcc/testsuite/gcc.target/powerpc/bfp/scalar-insert-exp-1.c 
> b/gcc/testsuite/gcc.target/powerpc/bfp/scalar-insert-exp-1.c
> index 8260b107178..2f219ddc83a 100644
> --- a/gcc/testsuite/gcc.target/powerpc/bfp/scalar-insert-exp-1.c
> +++ b/gcc/testsuite/gcc.target/powerpc/bfp/scalar-insert-exp-1.c
> @@ -1,7 +1,7 @@
>  /* { dg-do compile { target { powerpc*-*-* } } } */
> -/* { dg-require-effective-target lp64 } */
>  /* { dg-require-effective-target powerpc_p9vector_ok } */
>  /* { dg-options "-mdejagnu-cpu=power8" } */
> +/* { dg-require-effective-target has_arch_ppc64 

Re: [PATCH-2, rs6000] Change mode and insn condition for scalar extract sig instruction

2023-05-08 Thread Kewen.Lin via Gcc-patches
Hi Haochen,

on 2023/1/4 14:16, HAO CHEN GUI wrote:
> Hi,
>   This patch changes the return type of __builtin_vsx_scalar_extract_sig
> from const signed long to const signed long long, so that it can be called
> with "-m32/-mpowerpc64" option. The bif needs TARGET_POWERPC64 instead of
> TARGET_64BIT. So the condition check in the expander is changed.
> 
>   The test cases are modified according to the changes of expand pattern.
> 
>   Bootstrapped and tested on powerpc64-linux BE and LE with no regressions.
> Is this okay for trunk? Any recommendations? Thanks a lot.

This patch is OK for trunk, thanks!

BR,
Kewen

> 
> Gui Haochen
> 
> ChangeLog
> 2023-01-03  Haochen Gui  
> 
> gcc/
>   * config/rs6000/rs6000-builtins.def
>   (__builtin_vsx_scalar_extract_sig): Set return type to const signed
>   long long.
>   * config/rs6000/vsx.md (xsxsigdp): Replace TARGET_64BIT with
>   TARGET_POWERPC64.
> 
> gcc/testsuite/
>   * gcc.target/powerpc/bfp/scalar-extract-sig-0.c: Replace lp64 check
>   with has_arch_ppc64.
>   * gcc.target/powerpc/bfp/scalar-extract-sig-1.c: Likewise.
>   * gcc.target/powerpc/bfp/scalar-extract-sig-6.c: Likewise.
> 
> patch.diff
> diff --git a/gcc/config/rs6000/rs6000-builtins.def 
> b/gcc/config/rs6000/rs6000-builtins.def
> index a8f1d3f1b3d..25647b7bdd2 100644
> --- a/gcc/config/rs6000/rs6000-builtins.def
> +++ b/gcc/config/rs6000/rs6000-builtins.def
> @@ -2849,7 +2849,7 @@
>pure vsc __builtin_vsx_lxvl (const void *, signed long);
>  LXVL lxvl {}
> 
> -  const signed long __builtin_vsx_scalar_extract_sig (double);
> +  const signed long long __builtin_vsx_scalar_extract_sig (double);
>  VSESDP xsxsigdp {}
> 
>const double __builtin_vsx_scalar_insert_exp (unsigned long long, \
> diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md
> index 229c26c3a61..27e03a4cf6c 100644
> --- a/gcc/config/rs6000/vsx.md
> +++ b/gcc/config/rs6000/vsx.md
> @@ -5111,7 +5111,7 @@ (define_insn "xsxsigdp"
>[(set (match_operand:DI 0 "register_operand" "=r")
>   (unspec:DI [(match_operand:DF 1 "vsx_register_operand" "wa")]
>UNSPEC_VSX_SXSIG))]
> -  "TARGET_P9_VECTOR && TARGET_64BIT"
> +  "TARGET_P9_VECTOR && TARGET_POWERPC64"
>"xsxsigdp %0,%x1"
>[(set_attr "type" "integer")])
> 
> diff --git a/gcc/testsuite/gcc.target/powerpc/bfp/scalar-extract-sig-0.c 
> b/gcc/testsuite/gcc.target/powerpc/bfp/scalar-extract-sig-0.c
> index 637080652b7..d22f7d1b274 100644
> --- a/gcc/testsuite/gcc.target/powerpc/bfp/scalar-extract-sig-0.c
> +++ b/gcc/testsuite/gcc.target/powerpc/bfp/scalar-extract-sig-0.c
> @@ -1,7 +1,7 @@
>  /* { dg-do compile { target { powerpc*-*-* } } } */
> -/* { dg-require-effective-target lp64 } */
>  /* { dg-require-effective-target powerpc_p9vector_ok } */
>  /* { dg-options "-mdejagnu-cpu=power9" } */
> +/* { dg-require-effective-target has_arch_ppc64 } */
> 
>  /* This test should succeed only on 64-bit configurations.  */
>  #include 
> diff --git a/gcc/testsuite/gcc.target/powerpc/bfp/scalar-extract-sig-1.c 
> b/gcc/testsuite/gcc.target/powerpc/bfp/scalar-extract-sig-1.c
> index f12eed3d9d5..64747d73a51 100644
> --- a/gcc/testsuite/gcc.target/powerpc/bfp/scalar-extract-sig-1.c
> +++ b/gcc/testsuite/gcc.target/powerpc/bfp/scalar-extract-sig-1.c
> @@ -1,7 +1,7 @@
>  /* { dg-do compile { target { powerpc*-*-* } } } */
> -/* { dg-require-effective-target lp64 } */
>  /* { dg-require-effective-target powerpc_p9vector_ok } */
>  /* { dg-options "-mdejagnu-cpu=power8" } */
> +/* { dg-require-effective-target has_arch_ppc64 } */
> 
>  /* This test should succeed only on 64-bit configurations.  */
>  #include 
> diff --git a/gcc/testsuite/gcc.target/powerpc/bfp/scalar-extract-sig-6.c 
> b/gcc/testsuite/gcc.target/powerpc/bfp/scalar-extract-sig-6.c
> index c85072da138..561be53fb9b 100644
> --- a/gcc/testsuite/gcc.target/powerpc/bfp/scalar-extract-sig-6.c
> +++ b/gcc/testsuite/gcc.target/powerpc/bfp/scalar-extract-sig-6.c
> @@ -1,7 +1,7 @@
>  /* { dg-do run { target { powerpc*-*-* } } } */
> -/* { dg-require-effective-target lp64 } */
>  /* { dg-require-effective-target p9vector_hw } */
>  /* { dg-options "-mdejagnu-cpu=power9" } */
> +/* { dg-require-effective-target has_arch_ppc64 } */
> 
>  /* This test should succeed only on 64-bit configurations.  */
>  #include 


Re: [PATCH] machine_mode type size: Extend enum size from 8-bit to 16-bit

2023-05-08 Thread Richard Biener via Gcc-patches
On Sun, 7 May 2023, Jeff Law wrote:

> 
> 
> On 5/6/23 19:55, Li, Pan2 wrote:
> > It looks like we cannot simply swap the code and mode in rtx_def, the code
> > may have to be the same bits as the tree_code in tree_base. Or we will meet
> > ICE like below.
> > 
> > rtx_def code 16 => 8 bits.
> > rtx_def mode 8 => 16 bits.
> > 
> > static inline decl_or_value
> > dv_from_value (rtx value)
> > {
> >decl_or_value dv;
> >dv = value;
> >gcc_checking_assert (dv_is_value_p (dv));  <=  ICE
> >return dv;
> Ugh.  We really just need to fix this code.  It assumes particular structure
> layouts and that's just wrong/dumb.

Well, it's a neat trick ... we just need to adjust it to

static inline bool
dv_is_decl_p (decl_or_value dv)
{ 
  return !dv || (int) GET_CODE ((rtx) dv) != (int) VALUE;
}  

I think (and hope for the 'decl' case the bits inspected are never 
'VALUE').  Of course the above stinks from a TBAA perspective ...

Any "real" fix would require allocating storage for a discriminator
and thus hurt the resource constrained var-tracking a lot.

Richard.


Re: Re: [PATCH V4] VECT: Add decrement IV iteration loop control by variable amount support

2023-05-08 Thread juzhe.zh...@rivai.ai
Hi, Kewen.

>> Sorry for chiming in, I had some concern here.
>> We already have some handlings for partial vector in length in 
>> vect_set_loop_controls_directly
>>(actually it deals with both mask and length), the proposed 
>>vect_set_loop_controls_by_select_vl
>>for select_vl looks like a variant for partial vector in length (comparing to 
>>the normal MIN),
>>and also adopts decrement IV.  IMHO, it seems better to modify/extend the 
>>current handling in
>>vect_set_loop_controls_directly for length, or factor out the handlings for 
>>length there and
>>extend the factored one.  Otherwise, it means we have two sets of handlings 
>>for partial vector
>>in lengths, it looks bad to maintain.  As the previous discussion, adopting 
>>decrement IV is an
>>enhancement for loop control, it's good for both cases w/ or w/o select_vl.  
>>If the above
>>understanding is correct, the possible steps seem to be:
>>  - factor out the handling for length (* optional)
>>  - modify it with decrement IV
>>  - extend it with select_vl.
>>In future if some RVV vendor wants to degenerate select_vl to min, it can 
>>just adopt the same
>>handlings with min by not defining select_vl optab.
You mean like this:
doing this inside vect_set_loop_controls_directly ?
if (use_while_len_p)
  return vect_set_loop_controls_by_while_len (...)

Thanks.


juzhe.zh...@rivai.ai
 
From: Kewen.Lin
Date: 2023-05-08 13:35
To: juzhe.zhong; rguenther; richard.sandiford
CC: gcc-patches
Subject: Re: [PATCH V4] VECT: Add decrement IV iteration loop control by 
variable amount support
Hi,
 
on 2023/5/4 21:25, juzhe.zh...@rivai.ai wrote:
> From: Ju-Zhe Zhong 
> 
> This patch is fixing V3 patch:
> https://patchwork.sourceware.org/project/gcc/patch/20230407014741.139387-1-juzhe.zh...@rivai.ai/
> 
> Fix issues according to Richard Sandiford && Richard Biener.
> 
> 1. Rename WHILE_LEN pattern into SELECT_VL according to Richard Sandiford.
> 2. Support multiple-rgroup for non-SLP auto-vectorization.
> 
>For vec_pack_trunc pattern (multi-rgroup of non-SLP), we generate the 
> total length:
> 
>  _36 = MIN_EXPR ;
> 
>  First length (MIN (X, VF/N)):
>loop_len_15 = MIN_EXPR <_36, POLY_INT_CST [2, 2]>;
> 
>  Second length (X - MIN (X, 1 * VF/N)):
>loop_len_16 = _36 - loop_len_15;
> 
>  Third length (X - MIN (X, 2 * VF/N)):
>_38 = MIN_EXPR <_36, POLY_INT_CST [4, 4]>;
>loop_len_17 = _36 - _38;
> 
>  Forth length (X - MIN (X, 3 * VF/N)):
>_39 = MIN_EXPR <_36, POLY_INT_CST [6, 6]>;
>loop_len_18 = _36 - _39;
> 
> The reason that I use MIN_EXPR instead of SELECT_VL to calculate total length 
> since using SELECT_VL
> to adapt induction IV consumes more instructions than just using MIN_EXPR. 
> Also, during testing,
> I found it's hard to adjust length correctly according to SELECT_VL.
> 
> So, this patch we only use SELECT_VL for single-rgroup with single length 
> control.
> 
> 3. Fix document of select_vl for Richard Biener (remove mode N).
> 4. Fix comments of vect_set_loop_controls_by_select_vl according to Richard 
> Biener.
> 5. Keep loop_vinfo as first parameter for "vect_get_loop_len".
> 6. make requirement of get_while_len_data_ref_ptr outside, let it to be gated 
> at the caller site.
> 
> More comments from Richard Biener:
>>> So it's not actually saturating.  The saturating operation is done by 
>>> .WHILE_LEN?
> I define the outcome of SELECT_VL (n, vf)  (WHILE_LEN) = IN_RANGE (0, min (n, 
> vf)) will make 
> the loop control counter never underflow zero.
> 
>>> I see.  I wonder if it makes sense to leave .WHILE_LEN aside for a start,
>>> the above scheme should also work for single rgroups, no?
>>> As said, it _looks_ like you can progress without .WHILE_LEN and using
>>> .WHILE_LEN is a pure optimization?
> Yes, SELECT_VL (WHILE_LEN) is pure optimization for single-rgroup and allow
> target adjust any length = INRANGE (0, min (n, vf)) each iteration.
> 
> Let me known if I missed something for the V3 patch.
> Thanks.
> 
> ---
>  gcc/cfgloopmanip.cc|   2 +-
>  gcc/doc/md.texi|  34 +++
>  gcc/gimple-loop-interchange.cc |   2 +-
>  gcc/internal-fn.def|   1 +
>  gcc/optabs.def |   1 +
>  gcc/tree-ssa-loop-ivcanon.cc   |   2 +-
>  gcc/tree-ssa-loop-ivopts.cc|   2 +-
>  gcc/tree-ssa-loop-manip.cc |  18 +-
>  gcc/tree-ssa-loop-manip.h  |   4 +-
>  gcc/tree-vect-data-refs.cc |   8 +-
>  gcc/tree-vect-loop-manip.cc| 374 -
>  gcc/tree-vect-loop.cc  |  32 ++-
>  gcc/tree-vect-stmts.cc |  89 +++-
>  gcc/tree-vectorizer.h  |   4 +-
>  14 files changed, 535 insertions(+), 38 deletions(-)
> 
> diff --git a/gcc/cfgloopmanip.cc b/gcc/cfgloopmanip.cc
> index 0e3ad8ed742..6e09dcbb0b1 100644
> --- a/gcc/cfgloopmanip.cc
> +++ b/gcc/cfgloopmanip.cc
> @@ -826,7 +826,7 @@ create_empty_loop_on_edge (edge entry_edge,
>  }
>  
>gsi = gsi_last_bb 

Re: [PATCH] tree: Fix up save_expr [PR52339]

2023-05-08 Thread Richard Biener via Gcc-patches
On Fri, 5 May 2023, Jakub Jelinek wrote:

> On Fri, May 05, 2023 at 01:32:02PM -0400, Jason Merrill wrote:
> > > --- gcc/ada/gcc-interface/utils2.cc.jj2023-01-16 23:19:05.539727388 
> > > +0100
> > > +++ gcc/ada/gcc-interface/utils2.cc   2023-05-05 15:37:30.193990948 
> > > +0200
> > > @@ -3332,6 +3332,7 @@ gnat_invariant_expr (tree expr)
> > >   case IMAGPART_EXPR:
> > >   case VIEW_CONVERT_EXPR:
> > >   CASE_CONVERT:
> > > + case SAVE_EXPR:
> > 
> > I guess doing this would allow gnat_invariant_expr to handle
> > DECL_INVARIANT_P that save_expr doesn't know about.  But it seems that it
> > makes the same assumption as tree_invariant_p_1 about the pointed-to object
> > not changing:
> > 
> > > case INDIRECT_REF:
> > >   if ((!invariant_p && !TREE_READONLY (t)) || TREE_SIDE_EFFECTS 
> > > (t))
> > > return NULL_TREE;
> > 
> > I don't know if this assumption is any more valid in Ada than in C/C++.
> 
> I think we really need Eric (as one who e.g. introduced the
> DECL_INVARIANT_P apparently for this kind of stuff) to have a look at that on 
> the
> Ada side.
> 
> The question is if the posted tree.cc (smallest) patch + 3 new testcases
> + the 7 ada testsuite workarounds are ok for trunk if it passes
> bootstrap/regtest, then I'd file a PR about the Ada regression and only once
> it is dealt with would consider backporting, or if we need to wait for Eric
> before making progress.

I wonder if we should defer some of the choices to a langhook
like make the tree_invariant_p_1 a langhook invocation with the
default to call tree_invariant_p_1.  After lowering we can reset
the langhook to the default.

I think we at least need the generic save_expr to be langhook
aware during the time we fold the original non-gimplified
GENERIC IL (but no longer for the generated GENERIC
and to be gimplified IL the middle-end eventually generates
later).

Richard.


RE: [PATCH] Makefile.in: clean up match.pd-related dependencies

2023-05-08 Thread Alexander Monakov via Gcc-patches
On Fri, 5 May 2023, Alexander Monakov wrote:

> > > gimple-head-export.cc does not exist.
> > > 
> > > gimple-match-exports.cc is not a generated file. It's under source 
> > > control and
> > > edited independently from genmatch.cc. It is compiled separately, 
> > > producing
> > > gimple-match-exports.o.
> > > 
> > > gimple-match-head.cc is also not a generated file, also under source 
> > > control.
> > > It is transitively included into gimple-match-N.o files. If it changes, 
> > > they will be
> > > rebuilt. This is not changed by my patch.
> > > 
> > > gimple-match-auto.h is a generated file. It depends on s-gimple-match 
> > > stamp
> > > file, which in turn depends on genmatch and match.pd. If either changes, 
> > > the
> > > rule for the stamp file triggers. gimple-match-N.o files also depend on 
> > > the
> > > stamp file, so they will be rebuilt as well.
> > 
> > s-gimple-match does not depend on gimple-match-head.cc. if it changes the 
> > stamp
> > is not invalidated. 
> 
> Right, this is correct: there's no need to rerun the recipe for the stamp,
> because contents of gimple-match-head.cc do not affect it.
> 
> > This happens to work because gimple-match-N.cc does depend on 
> > gimple-match-head.cc,
> > but if the gimple-match-N.cc already exists then nothing changes.
> 
> No, if gimple-match-N.cc already exist, make notices they are out-of-date via
> 
> $(GIMPLE_MATCH_PD_SEQ_SRC): s-gimple-match gimple-match-head.cc; @true
> 
> and this triggers rebuilding gimple-match-N.o.
> 
> I tested this. After 'touch gimple-match-head.cc' all ten gimple-match-N.o 
> files
> are rebuilt.

My explanation was incomplete here. The gcc/Makefile.in rule quoted above
applies to .cc files and does not trigger rebuilds of .o files on its own.
The reason .o files get rebuilt is implicit dependency tracking: initial
build records header dependencies in gcc/.deps/*.Po files, and incremental
rebuild sees that gimple-match-1.o depends on gimple-match-head.cc.

Alexander


Re: [PATCH-1, rs6000] Change mode and insn condition for scalar extract exp instruction

2023-05-08 Thread Kewen.Lin via Gcc-patches
Hi Haochen,

on 2023/1/4 14:16, HAO CHEN GUI wrote:
> Hi,
>   This patch changes the return type of __builtin_vsx_scalar_extract_exp
> from const signed long to const signed int, as the exponent can be put in
> a signed int. It is also inline with the external interface definition of
> the bif. The mode of exponent operand in "xsxexpdp" is changed to GPR mode
> and TARGET_64BIT check is removed, as the instruction can be executed on
> a 32-bit environment.
> 
>   The test cases are modified according to the changes of expand pattern.
> 
>   Bootstrapped and tested on powerpc64-linux BE and LE with no regressions.
> Is this okay for trunk? Any recommendations? Thanks a lot.
> 
> Gui Haochen
> 
> ChangeLog
> 2022-12-23  Haochen Gui  
> 
> gcc/
>   * config/rs6000/rs6000-builtins.def
>   (__builtin_vsx_scalar_extract_exp): Set return type to const unsigned

typo, s/unsigned/signed/

>   int and set its bif-pattern to xsxexpdp_si, move it from power9-64 to
>   power9 catalog.
>   * config/rs6000/vsx.md (xsxexpdp): Rename to ...
>   (xsxexpdp_): ..., set mode of operand 0 to GPR and remove
>   TARGET_64BIT check.
>   * doc/extend.texi (scalar_extract_exp): Remove 64-bit environment
>   requirement when it has a 64-bit argument.
> 
> gcc/testsuite/
>   * gcc.target/powerpc/bfp/scalar-extract-exp-0.c: Remove lp64 check.
>   * gcc.target/powerpc/bfp/scalar-extract-exp-1.c: Likewise.
>   * gcc.target/powerpc/bfp/scalar-extract-exp-2.c: Deleted as the case is
>   invalid.

s/Deleted/Delete/

OK for trunk with these nits fixed, thanks!

BR,
Kewen

>   * gcc.target/powerpc/bfp/scalar-extract-exp-6.c: Remove lp64 check.
> 
> patch.diff
> diff --git a/gcc/config/rs6000/rs6000-builtins.def 
> b/gcc/config/rs6000/rs6000-builtins.def
> index f76f54793d7..a8f1d3f1b3d 100644
> --- a/gcc/config/rs6000/rs6000-builtins.def
> +++ b/gcc/config/rs6000/rs6000-builtins.def
> @@ -2833,6 +2833,8 @@
>const signed int __builtin_dtstsfi_ov_td (const int<6>, _Decimal128);
>  TSTSFI_OV_TD dfptstsfi_unordered_td {}
> 
> +  const signed int  __builtin_vsx_scalar_extract_exp (double);
> +VSEEDP xsxexpdp_si {}
> 
>  [power9-64]
>void __builtin_altivec_xst_len_r (vsc, void *, long);
> @@ -2847,9 +2849,6 @@
>pure vsc __builtin_vsx_lxvl (const void *, signed long);
>  LXVL lxvl {}
> 
> -  const signed long __builtin_vsx_scalar_extract_exp (double);
> -VSEEDP xsxexpdp {}
> -
>const signed long __builtin_vsx_scalar_extract_sig (double);
>  VSESDP xsxsigdp {}
> 
> diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md
> index 992fbc983be..229c26c3a61 100644
> --- a/gcc/config/rs6000/vsx.md
> +++ b/gcc/config/rs6000/vsx.md
> @@ -5089,11 +5089,11 @@ (define_insn "xsxexpqp_"
>[(set_attr "type" "vecmove")])
> 
>  ;; VSX Scalar Extract Exponent Double-Precision
> -(define_insn "xsxexpdp"
> -  [(set (match_operand:DI 0 "register_operand" "=r")
> - (unspec:DI [(match_operand:DF 1 "vsx_register_operand" "wa")]
> +(define_insn "xsxexpdp_"
> +  [(set (match_operand:GPR 0 "register_operand" "=r")
> + (unspec:GPR [(match_operand:DF 1 "vsx_register_operand" "wa")]
>UNSPEC_VSX_SXEXPDP))]
> -  "TARGET_P9_VECTOR && TARGET_64BIT"
> +  "TARGET_P9_VECTOR"
>"xsxexpdp %0,%x1"
>[(set_attr "type" "integer")])
> 
> diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi
> index d3812fa55b0..7c087967234 100644
> --- a/gcc/doc/extend.texi
> +++ b/gcc/doc/extend.texi
> @@ -19598,7 +19598,10 @@ bool scalar_test_neg (double source);
>  bool scalar_test_neg (__ieee128 source);
>  @end smallexample
> 
> -The @code{scalar_extract_exp} and @code{scalar_extract_sig}
> +The @code{scalar_extract_exp} with a 64-bit source argument
> +function requires an environment supporting ISA 3.0 or later.
> +The @code{scalar_extract_exp} with a 128-bit source argument
> +and @code{scalar_extract_sig}
>  functions require a 64-bit environment supporting ISA 3.0 or later.
>  The @code{scalar_extract_exp} and @code{scalar_extract_sig} built-in
>  functions return the significand and the biased exponent value
> diff --git a/gcc/testsuite/gcc.target/powerpc/bfp/scalar-extract-exp-0.c 
> b/gcc/testsuite/gcc.target/powerpc/bfp/scalar-extract-exp-0.c
> index 35bf1b240f3..d971833748e 100644
> --- a/gcc/testsuite/gcc.target/powerpc/bfp/scalar-extract-exp-0.c
> +++ b/gcc/testsuite/gcc.target/powerpc/bfp/scalar-extract-exp-0.c
> @@ -1,9 +1,7 @@
>  /* { dg-do compile { target { powerpc*-*-* } } } */
> -/* { dg-require-effective-target lp64 } */
>  /* { dg-require-effective-target powerpc_p9vector_ok } */
>  /* { dg-options "-mdejagnu-cpu=power9" } */
> 
> -/* This test should succeed only on 64-bit configurations.  */
>  #include 
> 
>  unsigned int
> diff --git a/gcc/testsuite/gcc.target/powerpc/bfp/scalar-extract-exp-1.c 
> b/gcc/testsuite/gcc.target/powerpc/bfp/scalar-extract-exp-1.c
> index 9737762c1d4..1cb438f9b70 100644
> --- 

Re: [PATCH] Move substitute_and_fold over to use simple_dce_from_worklist

2023-05-08 Thread Richard Biener via Gcc-patches
On Fri, May 5, 2023 at 5:18 PM Andrew Pinski via Gcc-patches
 wrote:
>
> While looking into a different issue, I noticed that it
> would take until the second forwprop pass to do some
> forward proping and it was because the ssa name was
> used more than once but the second statement was
> "dead" and we don't remove that until much later.
>
> So this uses simple_dce_from_worklist instead of manually
> removing of the known unused statements instead.
> Propagate engine does not do a cleanupcfg afterwards either but manually
> cleans up possible EH edges so simple_dce_from_worklist
> needs to communicate that back to the propagate engine.
>
> Some testcases needed to be updated/changed even because of better 
> optimization.
> gcc.dg/pr81192.c even had to be changed to be using the gimple FE so it would
> be less fragile in the future too.
> gcc.dg/tree-ssa/pr98737-1.c was failing because __atomic_fetch_ was being 
> matched
> but in those cases, the result was not being used so both __atomic_fetch_ and
> __atomic_x_and_fetch_ are valid choices and would not make a code generation 
> difference.
> evrp7.c, evrp8.c, vrp35.c, vrp36.c: just needed a slightly change as the 
> removal message
> is different slightly.
> kernels-alias-8.c: ccp1 is able to remove an unused load which causes ealias 
> to have
> one less load to analysis so update the expected scan #.
>
> OK? Bootstrapped and tested on x86_64-linux-gnu with no regressions.

OK.

Thanks,
Richard.

> gcc/ChangeLog:
>
> PR tree-optimization/109691
> * tree-ssa-dce.cc (simple_dce_from_worklist): Add need_eh_cleanup
> argument.
> If the removed statement can throw, have need_eh_cleanup
> include the bb of that statement.
> * tree-ssa-dce.h (simple_dce_from_worklist): Update declaration.
> * tree-ssa-propagate.cc (struct prop_stats_d): Remove
> num_dce.
> (substitute_and_fold_dom_walker::substitute_and_fold_dom_walker):
> Initialize dceworklist instead of stmts_to_remove.
> (substitute_and_fold_dom_walker::~substitute_and_fold_dom_walker):
> Destore dceworklist instead of stmts_to_remove.
> (substitute_and_fold_dom_walker::before_dom_children):
> Set dceworklist instead of adding to stmts_to_remove.
> (substitute_and_fold_engine::substitute_and_fold):
> Call simple_dce_from_worklist instead of poping
> from the list.
> Don't update the stat on removal statements.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.dg/tree-ssa/evrp7.c: Update for output change.
> * gcc.dg/tree-ssa/evrp8.c: Likewise.
> * gcc.dg/tree-ssa/vrp35.c: Likewise.
> * gcc.dg/tree-ssa/vrp36.c: Likewise.
> * gcc.dg/tree-ssa/pr98737-1.c: Update scan-tree-dump-not
> to check for assignment too instead of just a call.
> * c-c++-common/goacc/kernels-alias-8.c: Update test
> for removal of load.
> * gcc.dg/pr81192.c: Rewrite testcase in gimple based test.
> ---
>  .../c-c++-common/goacc/kernels-alias-8.c  |  6 +-
>  gcc/testsuite/gcc.dg/pr81192.c| 64 ---
>  gcc/testsuite/gcc.dg/tree-ssa/evrp7.c |  2 +-
>  gcc/testsuite/gcc.dg/tree-ssa/evrp8.c |  2 +-
>  gcc/testsuite/gcc.dg/tree-ssa/pr98737-1.c |  7 +-
>  gcc/testsuite/gcc.dg/tree-ssa/vrp35.c |  2 +-
>  gcc/testsuite/gcc.dg/tree-ssa/vrp36.c |  2 +-
>  gcc/tree-ssa-dce.cc   |  7 +-
>  gcc/tree-ssa-dce.h|  2 +-
>  gcc/tree-ssa-propagate.cc | 39 ++-
>  10 files changed, 82 insertions(+), 51 deletions(-)
>
> diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-alias-8.c 
> b/gcc/testsuite/c-c++-common/goacc/kernels-alias-8.c
> index 69200ccf192..c3922e33241 100644
> --- a/gcc/testsuite/c-c++-common/goacc/kernels-alias-8.c
> +++ b/gcc/testsuite/c-c++-common/goacc/kernels-alias-8.c
> @@ -16,7 +16,9 @@ foo (int *a, size_t n)
>}
>  }
>
> -/* Only the omp_data_i related loads should be annotated with cliques.  */
> -/* { dg-final { scan-tree-dump-times "clique 1 base 1" 2 "ealias" } } */
> +/* Only the omp_data_i related loads should be annotated with cliques.
> +   Note ccp can remove one of the omp_data_i loads which is why there
> +   is there only one clique base still there.  */
> +/* { dg-final { scan-tree-dump-times "clique 1 base 1" 1 "ealias" } } */
>  /* { dg-final { scan-tree-dump-times "(?n)clique 1 base 0" 2 "ealias" } } */
>
> diff --git a/gcc/testsuite/gcc.dg/pr81192.c b/gcc/testsuite/gcc.dg/pr81192.c
> index 6cab6056558..f6d201ee71a 100644
> --- a/gcc/testsuite/gcc.dg/pr81192.c
> +++ b/gcc/testsuite/gcc.dg/pr81192.c
> @@ -1,5 +1,58 @@
> -/* { dg-options "-Os -fdump-tree-pre-details -fdisable-tree-evrp 
> -fno-tree-dse" } */
> +/* { dg-options "-Os -fgimple -fdump-tree-pre-details -fdisable-tree-evrp 
> -fno-tree-dse" } */
>
> +#if __SIZEOF_INT__ == 2
> +#define unsigned 

Re: [PATCH][stage1] Remove conditionals around free()

2023-05-08 Thread Bernhard Reutner-Fischer via Gcc-patches
On Wed, 1 Mar 2023 16:07:14 -0800
Steve Kargl  wrote:

> On Wed, Mar 01, 2023 at 10:28:56PM +0100, Bernhard Reutner-Fischer via 
> Fortran wrote:
> >  libgfortran/caf/single.c |6 ++
> >  libgfortran/io/async.c   |6 ++
> >  libgfortran/io/format.c  |3 +--
> >  libgfortran/io/transfer.c|6 ++
> >  libgfortran/io/unix.c|3 +--  
> 
> The Fortran ones are OK.
> 

I've pushed the fortran and libgfortran bits as r14-568-gca2f64d5d08c16
thanks,


Re: ping Re: [PATCH 3/3] Fortran: Fix mpz and mpfr memory leaks

2023-05-08 Thread Bernhard Reutner-Fischer via Gcc-patches
On Mon, 17 Apr 2023 15:18:27 -0700
Steve Kargl  wrote:
> On Mon, Apr 17, 2023 at 09:47:50PM +0200, Bernhard Reutner-Fischer via 
> Fortran wrote:
> > On Mon, 03 Apr 2023 23:42:06 +0200
> > Bernhard Reutner-Fischer  wrote:
> >   
> > > On 3 April 2023 21:50:49 CEST, Harald Anlauf  wrote:  
> > > >Hi Bernhard,
> > > >
> > > >there is neither context nor a related PR with a testcase showing
> > > >that this patch fixes issues seen there.
> > > 
> > > Yes, i forgot to mention the PR:
> > > 
> > > PR fortran/68800
> > > 
> > > I did not construct individual test cases but it should be obvious that 
> > > we should not leak these.
> > >   
> > > >
> > > >On 4/2/23 17:05, Bernhard Reutner-Fischer via Gcc-patches wrote:
> > > >> From: Bernhard Reutner-Fischer 
> > > >> 
> > > >> Cc: fort...@gcc.gnu.org
> > > >> 
> > > >> gcc/fortran/ChangeLog:
> > > >> 
> > > >>* array.cc (gfc_ref_dimen_size): Free mpz memory before ICEing.
> > > >>* expr.cc (find_array_section): Fix mpz memory leak.
> > > >>* simplify.cc (gfc_simplify_reshape): Fix mpz memory leaks in
> > > >>error paths.
> > > >>(gfc_simplify_set_exponent): Fix mpfr memory leak.
> > > >> ---
> > > >>   gcc/fortran/array.cc| 3 +++
> > > >>   gcc/fortran/expr.cc | 8 
> > > >>   gcc/fortran/simplify.cc | 7 ++-
> > > >>   3 files changed, 13 insertions(+), 5 deletions(-)
> > > >> 
> > > >> diff --git a/gcc/fortran/array.cc b/gcc/fortran/array.cc
> > > >> index be5eb8b6a0f..8b1e816a859 100644
> > > >> --- a/gcc/fortran/array.cc
> > > >> +++ b/gcc/fortran/array.cc
> > > >> @@ -2541,6 +2541,9 @@ gfc_ref_dimen_size (gfc_array_ref *ar, int 
> > > >> dimen, mpz_t *result, mpz_t *end)
> > > >> return t;
> > > >> 
> > > >>   default:
> > > >> +  mpz_clear (lower);
> > > >> +  mpz_clear (stride);
> > > >> +  mpz_clear (upper);
> > > >> gfc_internal_error ("gfc_ref_dimen_size(): Bad dimen_type");
> > > >>   }
> > > >
> > > >  What is the point of clearing variables before issuing
> > > >  a gfc_internal_error?
> > > 
> > > To make it obvious that we are aware that we allocated these.  
> 
> I must admit I agree with Harald on this portion
> of the diff.  What's the point?  There is a lot more
> allocated than just those 3 mzp_t variables when the
> internal error occurs.  For example, gfortran does not
> walk the namespaces and free those before the ICE.
> I suppose silencing valgrind might be sufficient 
> reason for the clutter.  So, ok.

I've dropped this hunk and pushed the rest as r14-567-g2521390dd2f8e5
Harald fixed the leak of expr in gfc_simplify_set_exponent in the
meantime.
thanks,