Re: [PATCH] RISC-V: Fix RVV binary auto-vectorizaiton test fails

2023-05-12 Thread Kito Cheng via Gcc-patches
We would like users to set that explicitly, so that an implication rule
won't screw anything up or produce an unexpected -mabi; that's kind of the
conclusion of most RISC-V GCC maintainers (Palmer/Jim Wilson/me).

Also, the behavior has been there for years; we don't want to surprise
users with a behavior change.

One more thing: -mcpu will also implicitly set -march if -march is not given,
so if we made -mabi implied by -march...then that means it
could be implied by -mcpu if -march is not given.
That's kind of...weird and confusing for people, I think.

And yes, clang does the ABI implication from -march and -mcpu, and I
got several issues around that within SiFive :P

On Fri, May 12, 2023 at 2:34 PM Robin Dapp via Gcc-patches
 wrote:
>
> > ok, thanks :)
> This has likely been discussed at length before, but why do we need to
> specify the additional -mabi with -march (instead of -march implying
> a matching abi)?


libgomp testsuite: Generalize 'lang_library_path' into a list of 'lang_library_paths' (was: libgomp testsuite: (not) using a specific driver for C++, Fortran?)

2023-05-12 Thread Thomas Schwinge
Hi!

On 2014-11-04T10:31:37-0800, Mike Stump  wrote:
> On Nov 4, 2014, at 4:13 AM, Thomas Schwinge  wrote:
>> On Wed, 15 Oct 2014 17:46:48 +0200, I wrote:
>>> [...]
>>>
>>> Am I on the right track with the following?
>>
>> Nobody commented, which also means nobody disagreed
>
> :-)
>
>> OK to commit all that to trunk?
>
> Ok, thanks.

Rebased, adjusted, retested another 2014 clean-up patch; pushed to
master branch commit b794dc779382bb9e645ccc10b4447d4e411f6000
"libgomp testsuite: Generalize 'lang_library_path' into a list of 
'lang_library_paths'"
("..., and use that for libquadmath, too"), see attached.


Grüße
 Thomas


-
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 
München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas 
Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht 
München, HRB 106955
>From b794dc779382bb9e645ccc10b4447d4e411f6000 Mon Sep 17 00:00:00 2001
From: Thomas Schwinge 
Date: Sun, 2 Nov 2014 17:49:31 +0100
Subject: [PATCH] libgomp testsuite: Generalize 'lang_library_path' into a list
 of 'lang_library_paths'

..., and use that for libquadmath, too.

	libgomp/
	* testsuite/lib/libgomp.exp (libgomp_target_compile): Generalize
	'lang_library_path' into a list of 'lang_library_paths'.
	* testsuite/libgomp.c++/c++.exp: Adjust.
	* testsuite/libgomp.oacc-c++/c++.exp: Likewise.
	* testsuite/libgomp.fortran/fortran.exp: Adjust.  Use that for
	libquadmath, too.
	* testsuite/libgomp.oacc-fortran/fortran.exp: Likewise.
---
 libgomp/testsuite/lib/libgomp.exp | 14 
 libgomp/testsuite/libgomp.c++/c++.exp | 14 
 libgomp/testsuite/libgomp.fortran/fortran.exp | 33 +
 libgomp/testsuite/libgomp.oacc-c++/c++.exp| 14 
 .../libgomp.oacc-fortran/fortran.exp  | 35 +++
 5 files changed, 63 insertions(+), 47 deletions(-)

diff --git a/libgomp/testsuite/lib/libgomp.exp b/libgomp/testsuite/lib/libgomp.exp
index 2295fbbbd417..9fea31d80672 100644
--- a/libgomp/testsuite/lib/libgomp.exp
+++ b/libgomp/testsuite/lib/libgomp.exp
@@ -241,12 +241,14 @@ proc libgomp_target_compile { source dest type options } {
 	lappend options "additional_flags=${lang_include_flags}"
 }
 
-global lang_library_path
-if { [info exists lang_library_path] } {
-	# Some targets use libgfortran.a%s in their specs, so they need
-	# a -B option for uninstalled testing.
-	lappend options "additional_flags=-B${blddir}/${lang_library_path}"
-	lappend options "ldflags=-L${blddir}/${lang_library_path}"
+global lang_library_paths
+if { [info exists lang_library_paths] } {
+	foreach lang_library_path $lang_library_paths {
+	# targets that use lib[...].a%s in their specs need a -B option
+	# for uninstalled testing.
+	lappend options "additional_flags=-B${blddir}/${lang_library_path}"
+	lappend options "ldflags=-L${blddir}/${lang_library_path}"
+	}
 }
 global lang_link_flags
 if { [info exists lang_link_flags] } {
diff --git a/libgomp/testsuite/libgomp.c++/c++.exp b/libgomp/testsuite/libgomp.c++/c++.exp
index 8307baf32fcf..1a1c3ee22252 100644
--- a/libgomp/testsuite/libgomp.c++/c++.exp
+++ b/libgomp/testsuite/libgomp.c++/c++.exp
@@ -2,14 +2,15 @@ load_lib libgomp-dg.exp
 load_gcc_lib gcc-dg.exp
 
 if { $blddir != "" } {
-set lang_library_path "../libstdc++-v3/src/.libs"
+set libstdc++_library_path "../libstdc++-v3/src/.libs"
 set shlib_ext [get_shlib_extension]
-if { ![file exists "${blddir}/${lang_library_path}/libstdc++.a"]
-	 && ![file exists "${blddir}/${lang_library_path}/libstdc++.${shlib_ext}"] } {
+if { ![file exists "${blddir}/${libstdc++_library_path}/libstdc++.a"]
+	 && ![file exists "${blddir}/${libstdc++_library_path}/libstdc++.${shlib_ext}"] } {
 	verbose -log "No libstdc++ library found, will not execute c++ tests"
-	unset lang_library_path
+	unset libstdc++_library_path
 	return
 }
+lappend lang_library_paths ${libstdc++_library_path}
 } elseif { ![info exists GXX_UNDER_TEST] } {
 verbose -log "GXX_UNDER_TEST not defined, will not execute c++ tests"
 return
@@ -39,7 +40,7 @@ set tests [lsort [concat \
 
 set ld_library_path $always_ld_library_path
 if { $blddir != "" } {
-append ld_library_path ":${blddir}/${lang_library_path}"
+append ld_library_path ":${blddir}/${libstdc++_library_path}"
 }
 append ld_library_path [gcc-set-multilib-library-path $GCC_UNDER_TEST]
 set_ld_library_path_env_vars
@@ -62,7 +63,8 @@ if [info exists lang_include_flags] then {
 unset lang_include_flags
 }
 if { $blddir != "" } {
-unset lang_library_path
+unset libstdc++_library_path
+unset lang_library_paths
 }
 unset lang_link_flags
 
diff --git a/libgomp/testsuite/libgomp.fortran/fortran.exp b/libgomp/testsuite/libgomp.fortran/fortran.exp
index d98739c8c99d..9295bbae65dd 100644
--- a/libgomp/testsuite/libgomp.fortran/fortran.exp
+++ b/libgomp/testsuite/libgomp.fortran/fortran.exp
@@ -3

Re: Re: [PATCH 4/5] RISC-V: Add Zcmp extension supports.

2023-05-12 Thread Sinan via Gcc-patches
Hi Fei,
Sorry for the late reply, I've been busy with moving these days :(.
Thanks for working on it. I would prefer removing the extra pass for popretz if 
possible ... I will test your patches ASAP. 
BR,
Sinan
--
Sender:Fei Gao 
Sent At:2023 May 6 (Sat.) 16:53
Recipient:Sinan 
Cc:jiawei ; gcc-patches 
Subject:Re: Re: [PATCH 4/5] RISC-V: Add Zcmp extension supports.
On 2023-05-05 23:57 Sinan  wrote:
>
>> hi Jiawei
>>
>> Please ignore my previous reply. I accidentally sent the email before I 
>> finished it.
>> Sorry for that!
>>
>> I downloaded the series of patches from you and found in some cases
>> it fails to generate zcmp push and pop insns.
>>
>> TC:
>>
>> char my_getchar();
>> int test_s0()
>> {
>>
>> int a = my_getchar();
>> int b = my_getchar();
>> return a+b;
>> }
>>
>> cc1 -fno-shrink-wrap-separate -O2 -march=rv32e_zca_zcmp -mabi=ilp32e 
>> -mcmodel=medlow test.c
>>
>> -fno-shrink-wrap-separate is used here to avoid the impact from 
>> shrink-wrap-separate that is by default
>> enabled in O2.
>>
>> As I'm also interested in Zc*, I made some changes, mainly in the prologue and 
>> epilogue pass, quite similar to
>> what has been done for save and restore, except for the CFI directives, because 
>> zcmp pushes and pops ra and the s regs
>> in the reverse order compared to what save and restore do.
>>
>> I will refine and share the code soon for your review.
>>
>> BR
>> Fei
>Hi Fei,
>In the current implementation, cm.push will not increase the original 
>adjustment size of the stack pointer. As cm.push uses a minimum adjustment 
>size of 16, and in your example, the adjustment size of sp is 12, so cm.push 
>will not be generated.
>you can find the check at riscv_use_push_pop
>> > + */
>> > + if (base_size > frame_size)
>> > + return false;
>> > +
>And if this check is removed, then you can get the output that you expect.
>```
> cm.push {ra,s0},-16
> call my_getchar
> mv s0,a0
> call my_getchar
> add a0,s0,a0
> cm.popret {ra,s0},16
>```
>In many scenarios of rv32e, cm.push cannot be generated as a result. Perhaps 
>we can remove this check? I haven't tested if it is ok to remove this check, 
>and CC jiawei to help test it.
>BR,
>Sinan 
hi Sinan
Thanks for your reply. 
I posted my codes at 
https://www.mail-archive.com/gcc-patches@gcc.gnu.org/msg306921.html
In the cover letter, I did some comparison. 
Could you please review?
Thanks & BR, 
Fei
>--
>Sender:Fei Gao 
>Sent At:2023 Apr. 25 (Tue.) 18:12
>Recipient:jiawei 
>Cc:gcc-patches 
>Subject:[PATCH 4/5] RISC-V: Add Zcmp extension supports.
>hi Jiawei
>Please ignore my previous reply. I accidentally sent the email before I finished 
>it.
>Sorry for that!
>I downloaded the series of patches from you and found in some cases
>it fails to generate zcmp push and pop insns.
>TC:
>char my_getchar();
>int test_s0()
>{
> int a = my_getchar();
> int b = my_getchar();
> return a+b;
>}
>cc1 -fno-shrink-wrap-separate -O2 -march=rv32e_zca_zcmp -mabi=ilp32e 
>-mcmodel=medlow test.c
>-fno-shrink-wrap-separate is used here to avoid the impact from 
>shrink-wrap-separate that is by default
>enabled in O2.
>As I'm also interested in Zc*, I made some changes, mainly in the prologue and 
>epilogue pass, quite similar to
>what has been done for save and restore, except for the CFI directives, because 
>zcmp pushes and pops ra and the s regs
>in the reverse order compared to what save and restore do.
>I will refine and share the code soon for your review.
>BR
>Fei
>On Thu Apr 6 06:21:17 GMT 2023 Jiawei jia...@iscas.ac.cn wrote:
>>
>>Add Zcmp extension instructions support. Generate push/pop
>>with the following steps:
>>
>> 1. preprocessing:
>> 1.1. if there is no push rtx, then just return. e.g.
>> (note 5 1 22 2 [bb 2] NOTE_INSN_BASIC_BLOCK)
>> (insn/f 22 5 23 2 (set (reg/f:SI 2 sp)
>> (plus:SI (reg/f:SI 2 sp)
>> (const_int -32 [0xffe0])))
>> (nil))
>> (note 23 22 2 2 NOTE_INSN_PROLOGUE_END)
>> 1.2. if push rtx exists, then we compute the number of
>> pushed s-registers, n_sreg.
>>
>> push rtx should be found before NOTE_INSN_PROLOGUE_END tag
>>
>> [2 and 3 happen simultaneously]
>>
>> 2. find valid move pattern, mv sN, aN, where N < n_sreg,
>> and aN is not used the move pattern, and sN is not
>> defined before the move pattern (from prologue to the
>> position of move pattern).
>>
>> 3. analysis use and reach of every instruction from prologue
>> to the position of move pattern.
>> if any sN is used, then we mark the corresponding argument list
>> candidate as invalid.
>> e.g.
>> push {ra,s0-s3}, {}, -32
>> sw s0,44(sp) # s0 is used, then argument list is invalid
>> mv a0,a5 # a0 is defined, then argument list is invalid
>> ...
>> mv s0,a0
>> mv s1,a1
>> mv s2,a2
>>
>> 4. if there is a valid argument list, then replace the pop
>> push parallel insn, and delete mv pattern.
>> if not, skip.
>>
>>All "zcmpe" means Zcmp with RVE extension.
>>The push/pop instruction implementation is mostly finished by Si

Re: [PATCH] Machine_Mode: Extend machine_mode from 8 to 16 bits

2023-05-12 Thread Richard Sandiford via Gcc-patches
pan2...@intel.com writes:
> From: Pan Li 
>
> We are running out of the machine_mode(8 bits) in RISC-V backend. Thus
> we would like to extend the machine mode bit size from 8 to 16 bits.
> However, it is sensitive to extend the memory size in common structure
> like tree or rtx. This patch would like to extend the machine mode bits
> to 16 bits by shrinking, like:
>
> * Swap the bit size of code and machine code in rtx_def.
> * Reconcile the machine_mode location and spare in tree.
>
> The memory impact of this patch for correlated structure looks like below:
>
> +---+--+-+--+
> | struct/bytes  | upstream | patched | diff |
> +---+--+-+--+
> | rtx_obj_reference |8 |  12 |   +4 |
> | ext_modified  |2 |   3 |   +1 |
> | ira_allocno   |  192 | 200 |   +8 |
> | qty_table_elem|   40 |  40 |0 |
> | reg_stat_type |   64 |  64 |0 |
> | rtx_def   |   40 |  40 |0 |
> | table_elt |   80 |  80 |0 |
> | tree_decl_common  |  112 | 112 |0 |
> | tree_type_common  |  128 | 128 |0 |
> +---+--+-+--+
>
> The tree and rtx related struct has no memory changes after this patch,
> and the machine_mode changes to 16 bits already.
>
> Signed-off-by: Pan Li 
> Co-authored-by: Ju-Zhe Zhong 
> Co-authored-by: Kito Cheng 
>
> gcc/ChangeLog:
>
>   * combine.cc (struct reg_stat_type): Extended machine mode to 16 bits.
>   * cse.cc (struct qty_table_elem): Ditto.
>   (struct table_elt): Ditto.
>   (struct set): Ditto.
>   * genopinit.cc (main): Reconciled the machine mode limit.
>   * ira-int.h (struct ira_allocno): Extended machine mode to 16 bits.
>   * ree.cc (struct ATTRIBUTE_PACKED): Ditto.
>   * rtl-ssa/accesses.h: Ditto.
>   * rtl.h (RTX_CODE_BITSIZE): New macro.
>   (RTX_MACHINE_MODE_BITSIZE): Ditto.
>   (struct GTY): Swap bit size between code and machine mode.
>   (subreg_shape::unique_id): Reconciled the machine mode limit.
>   * rtlanal.h: Extended machine mode to 16 bits.
>   * tree-core.h (struct tree_type_common): Ditto.
>   (struct tree_decl_common): Reconciled the locate and extended
>   bit size of machine mode.
> ---
>  gcc/combine.cc |  4 ++--
>  gcc/cse.cc |  8 
>  gcc/genopinit.cc   |  3 ++-
>  gcc/ira-int.h  | 12 
>  gcc/ree.cc |  2 +-
>  gcc/rtl-ssa/accesses.h |  6 --
>  gcc/rtl.h  |  9 ++---
>  gcc/rtlanal.h  |  5 +++--
>  gcc/tree-core.h| 11 ---
>  9 files changed, 38 insertions(+), 22 deletions(-)
>
> diff --git a/gcc/combine.cc b/gcc/combine.cc
> index 5aa0ec5c45a..bdf6f635c80 100644
> --- a/gcc/combine.cc
> +++ b/gcc/combine.cc
> @@ -200,7 +200,7 @@ struct reg_stat_type {
>  
>unsigned HOST_WIDE_INT last_set_nonzero_bits;
>char   last_set_sign_bit_copies;
> -  ENUM_BITFIELD(machine_mode)last_set_mode : 8;
> +  ENUM_BITFIELD(machine_mode)last_set_mode : 
> RTX_MACHINE_MODE_BITSIZE;
>  
>/* Set nonzero if references to register n in expressions should not be
>   used.  last_set_invalid is set nonzero when this register is being
> @@ -235,7 +235,7 @@ struct reg_stat_type {
>   truncation if we know that value already contains a truncated
>   value.  */
>  
> -  ENUM_BITFIELD(machine_mode)truncated_to_mode : 8;
> +  ENUM_BITFIELD(machine_mode)truncated_to_mode : 
> RTX_MACHINE_MODE_BITSIZE;
>  };
>  
>  
> diff --git a/gcc/cse.cc b/gcc/cse.cc
> index b10c9b0c94d..fe594c1bc3d 100644
> --- a/gcc/cse.cc
> +++ b/gcc/cse.cc
> @@ -250,8 +250,8 @@ struct qty_table_elem
>unsigned int first_reg, last_reg;
>/* The sizes of these fields should match the sizes of the
>   code and mode fields of struct rtx_def (see rtl.h).  */

The comment can be removed, since you're now adding macros to ensure
this (thanks).  Same for other instances of the comment.

> -  ENUM_BITFIELD(rtx_code) comparison_code : 16;
> -  ENUM_BITFIELD(machine_mode) mode : 8;
> +  ENUM_BITFIELD(rtx_code) comparison_code : RTX_CODE_BITSIZE;
> +  ENUM_BITFIELD(machine_mode) mode : RTX_MACHINE_MODE_BITSIZE;

Please put the mode first, so that the 16-bit value is aligned
to 16 bits.

>  };
>  
>  /* The table of all qtys, indexed by qty number.  */
> @@ -406,7 +406,7 @@ struct table_elt
>int regcost;
>/* The size of this field should match the size
>   of the mode field of struct rtx_def (see rtl.h).  */
> -  ENUM_BITFIELD(machine_mode) mode : 8;
> +  ENUM_BITFIELD(machine_mode) mode : RTX_MACHINE_MODE_BITSIZE;
>char in_memory;
>char is_const;
>char flag;
> @@ -4155,7 +4155,7 @@ struct set
>/* Original machine mode, in case it becomes a CONST_INT.
>   The size of this field should match the size of the mode
>   fiel

libgomp testsuite: Have each '*.exp' file specify the compiler to use [PR91884] (was: libgomp testsuite: (not) using a specific driver for C++, Fortran?)

2023-05-12 Thread Thomas Schwinge
Hi!

The cleanup is done, now turn ourselves to the changes proper re PR91884
"libgomp testsuite: (not) using a specific driver for C++, Fortran":

On 2014-11-04T10:31:37-0800, Mike Stump  wrote:
> On Nov 4, 2014, at 4:13 AM, Thomas Schwinge  wrote:
>> On Wed, 15 Oct 2014 17:46:48 +0200, I wrote:
>>> No matter whether it's C, C++, or Fortran source code, the libgomp
>>> testsuite always uses (for build-tree testing) gcc/xgcc, or (for
>>> installed testing) GCC_UNDER_TEST.  It doesn't make use of
>>> GXX_UNDER_TEST, GFORTRAN_UNDER_TEST.  To support the latter two
>>> languages' needs, some -l[...] flags are then added via lang_link_flags.
>>> For example, for Fortran this is -lgfortran.  This is, however, not what
>>> would happen if using the gfortran driver to build (which is what a user
>>> would be doing -- which we should replicate as much as possible at least
>>> for installed testing): the gfortran driver also adds -lquadmath, if
>>> applicable.
>>>
>>> Now, I wonder why to re-invent all that in the libgomp testsuite, if the
>>> respective driver already has that knowledge, via spec files, for
>>> example?  (Also, the regular GCC compiler tests, gcc/testsuite/, are
>>> doing the right thing.)  Why is libgomp testsuite implemented this way --
>>> just a legacy of the past, or is there a need for that (that I'm not
>>> seeing)?
>>>
>>> [...]
>>>
>>> Am I on the right track with the following?
>>
>> Nobody commented, which also means nobody disagreed
>
> :-)
>
>> OK to commit all that to trunk?
>
> Ok, thanks.

> Watch for any review points from the libgomp people, they might trickle a few 
> in.  I don’t mean to cut short any review points from them.

After Tobias in his 2019 "PR testsuite/91884 Add -lquadmath if available"
again picked up my idea and parts of my 2014 patches (... but in the end
settled for a "simpler" solution to the problem at hand), my original
proposal also got an ACK from Rainer,
.

> Also, please watch for breakage.

Always.  :-)


First, another "no change in behavior" new patch, loosely based
on/extracted out of my earlier work.  I'm posting it separately, but
given that it's in line with my earlier work (just a separate step), I
intend to push it soon, unless there are any objections, of course.
"libgomp testsuite: Have each '*.exp' file specify the compiler to use 
[PR91884]"
("..., which is still 'GCC_UNDER_TEST' for all of them"), see attached.


Grüße
 Thomas


-
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 
München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas 
Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht 
München, HRB 106955
>From 1b2c8132495435dd4455a1c79e5abbd88073c754 Mon Sep 17 00:00:00 2001
From: Thomas Schwinge 
Date: Wed, 10 May 2023 14:43:21 +0200
Subject: [PATCH] libgomp testsuite: Have each '*.exp' file specify the
 compiler to use [PR91884]

..., which is still 'GCC_UNDER_TEST' for all of them; no change in behavior.

	PR testsuite/91884
	libgomp/
	* testsuite/lib/libgomp.exp (libgomp_target_compile): Don't
	specify compiler.
	* testsuite/libgomp.c++/c++.exp (ALWAYS_CFLAGS): Specify compiler.
	* testsuite/libgomp.c/c.exp (ALWAYS_CFLAGS): Likewise.
	* testsuite/libgomp.fortran/fortran.exp (ALWAYS_CFLAGS): Likewise.
	* testsuite/libgomp.graphite/graphite.exp (ALWAYS_CFLAGS):
	Likewise.
	* testsuite/libgomp.oacc-c++/c++.exp (ALWAYS_CFLAGS): Likewise.
	* testsuite/libgomp.oacc-c/c.exp (ALWAYS_CFLAGS): Likewise.
	* testsuite/libgomp.oacc-fortran/fortran.exp (ALWAYS_CFLAGS):
	Likewise.
---
 libgomp/testsuite/lib/libgomp.exp  |  5 -
 libgomp/testsuite/libgomp.c++/c++.exp  | 10 +-
 libgomp/testsuite/libgomp.c/c.exp  |  2 ++
 libgomp/testsuite/libgomp.fortran/fortran.exp  |  1 +
 libgomp/testsuite/libgomp.graphite/graphite.exp|  1 +
 libgomp/testsuite/libgomp.oacc-c++/c++.exp | 10 +-
 libgomp/testsuite/libgomp.oacc-c/c.exp |  2 ++
 libgomp/testsuite/libgomp.oacc-fortran/fortran.exp |  1 +
 8 files changed, 17 insertions(+), 15 deletions(-)

diff --git a/libgomp/testsuite/lib/libgomp.exp b/libgomp/testsuite/lib/libgomp.exp
index 9fea31d8067..48c43e4a136 100644
--- a/libgomp/testsuite/lib/libgomp.exp
+++ b/libgomp/testsuite/lib/libgomp.exp
@@ -46,10 +46,6 @@ load_file libgomp-test-support.exp
 
 set dg-do-what-default run
 
-#
-# GCC_UNDER_TEST is the compiler under test.
-#
-
 set libgomp_compile_options ""
 
 #
@@ -262,7 +258,6 @@ proc libgomp_target_compile { source dest type options } {
 
 lappend options "additional_flags=[libio_include_flags]"
 lappend options "timeout=[timeout_value]"
-lappend options "compiler=$GCC_UNDER_TEST"
 
 set options [concat $libgomp_compile_options $options]
 
diff --git a/libgomp/testsuite/libgomp.c++/c++.exp b/libgomp/testsuite/libgomp.c++/c++.exp
index 1a1c3ee2225..4

libgomp testsuite: As appropriate, use the 'gcc', 'g++', 'gfortran' driver [PR91884] (was: libgomp testsuite: Have each '*.exp' file specify the compiler to use [PR91884] (was: libgomp testsuite: (not

2023-05-12 Thread Thomas Schwinge
Hi!

On 2023-05-12T10:27:29+0200, I wrote:
> The cleanup is done, now turn ourselves to the changes proper re PR91884
> "libgomp testsuite: (not) using a specific driver for C++, Fortran":
>
> On 2014-11-04T10:31:37-0800, Mike Stump  wrote:
>> On Nov 4, 2014, at 4:13 AM, Thomas Schwinge  wrote:
>>> On Wed, 15 Oct 2014 17:46:48 +0200, I wrote:
>>>> No matter whether it's C, C++, or Fortran source code, the libgomp
>>>> testsuite always uses (for build-tree testing) gcc/xgcc, or (for
>>>> installed testing) GCC_UNDER_TEST.  It doesn't make use of
>>>> GXX_UNDER_TEST, GFORTRAN_UNDER_TEST.  To support the latter two
>>>> languages' needs, some -l[...] flags are then added via lang_link_flags.
>>>> For example, for Fortran this is -lgfortran.  This is, however, not what
>>>> would happen if using the gfortran driver to build (which is what a user
>>>> would be doing -- which we should replicate as much as possible at least
>>>> for installed testing): the gfortran driver also adds -lquadmath, if
>>>> applicable.
>>>>
>>>> Now, I wonder why to re-invent all that in the libgomp testsuite, if the
>>>> respective driver already has that knowledge, via spec files, for
>>>> example?  (Also, the regular GCC compiler tests, gcc/testsuite/, are
>>>> doing the right thing.)  Why is libgomp testsuite implemented this way --
>>>> just a legacy of the past, or is there a need for that (that I'm not
>>>> seeing)?
>>>>
>>>> [...]
>>>>
>>>> Am I on the right track with the following?
>>>
>>> Nobody commented, which also means nobody disagreed
>>
>> :-)
>>
>>> OK to commit all that to trunk?
>>
>> Ok, thanks.
>
>> Watch for any review points from the libgomp people, they might trickle a 
>> few in.  I don’t mean to cut short any review points from them.
>
> After Tobias in his 2019 "PR testsuite/91884 Add -lquadmath if available"
> again picked up my idea and parts of my 2014 patches (... but in the end
> settled for a "simpler" solution to the problem at hand), my original
> proposal also got an ACK from Rainer,
> .
>
>> Also, please watch for breakage.
>
> Always.  :-)
>
>
> First, another "no change in behavior" new patch, loosely based
> on/extracted out of my earlier work.  I'm posting it separately, but
> given that it's in line with my earlier work (just a separate step), I
> intend to push it soon, unless there are any objections, of course.
> "libgomp testsuite: Have each '*.exp' file specify the compiler to use 
> [PR91884]"
> ("..., which is still 'GCC_UNDER_TEST' for all of them"), see attached.

..., and then finally
"libgomp testsuite: As appropriate, use the 'gcc', 'g++', 'gfortran' driver 
[PR91884]",
see attached.

..., that is, 'GCC_UNDER_TEST', 'GXX_UNDER_TEST', 'GFORTRAN_UNDER_TEST' 
instead
of 'GCC_UNDER_TEST' for all of them.  No need anymore for 'gcc -lstdc++ -x 
c++'
for C++ code, or 'gcc -lgfortran' plus conditional '-lquadmath' for Fortran
code.  (Getting rid of explicit '-foffload=-lgfortran' is for another day.)


By the way, all changes (individually) tested in a number of different
configurations: '--enable-languages=[...]', native vs. cross, build-tree
vs. installed testing, etc.


Grüße
 Thomas


-
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 
München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas 
Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht 
München, HRB 106955
>From 5324d8fb140c1ab790ea4756983bff203c549ce3 Mon Sep 17 00:00:00 2001
From: Thomas Schwinge 
Date: Wed, 10 May 2023 15:01:55 +0200
Subject: [PATCH] libgomp testsuite: As appropriate, use the 'gcc', 'g++',
 'gfortran' driver [PR91884]

..., that is, 'GCC_UNDER_TEST', 'GXX_UNDER_TEST', 'GFORTRAN_UNDER_TEST' instead
of 'GCC_UNDER_TEST' for all of them.  No need anymore for 'gcc -lstdc++ -x c++'
for C++ code, or 'gcc -lgfortran' plus conditional '-lquadmath' for Fortran
code.  (Getting rid of explicit '-foffload=-lgfortran' is for another day.)

	PR testsuite/91884
	libgomp/
	* configure.ac: 'AC_SUBST(CXX)'.
	* configure: Regenerate.
	* Makefile.in: Likewise.
	* testsuite/Makefile.in: Likewise.
	* testsuite/libgomp-site-extra.exp.in (GXX_UNDER_TEST)
	(GFORTRAN_UNDER_TEST): Set.
	* testsuite/lib/libgomp.exp (libgomp_init): Adjust.
	* testsuite/libgomp.c++/c++.exp: Use 'GXX_UNDER_TEST'.
	* testsuite/libgomp.oacc-c++/c++.exp: Likewise.
	* testsuite/libgomp.fortran/fortran.exp: Use
	'GFORTRAN_UNDER_TEST'.
	* testsuite/libgomp.oacc-fortran/fortran.exp: Likewise.
---
 libgomp/Makefile.in   |  1 +
 libgomp/configure | 18 --
 libgomp/configure.ac  | 13 ++-
 libgomp/testsuite/Makefile.in |  1 +
 libgomp/testsuite/lib/libgomp.exp | 35 ++-
 libgomp/testsuite/libgomp-site-extra.exp.in   |  2 ++
 libgomp/testsuite/libgomp.c++/c++.exp | 22

Re: [PATCH v2] RISC-V: Handle multi-lib path correclty for linux

2023-05-12 Thread Andreas Schwab
WTF?

../../gcc/common/config/riscv/riscv-common.cc: In function 'const char* 
riscv_select_multilib_by_abi(const std::string&, const std::string&, const 
riscv_subset_list*, const switchstr*, int, const 
std::vector&)':
../../gcc/common/config/riscv/riscv-common.cc:1599:22: error: unused parameter 
'riscv_current_arch_str' [-Werror=unused-parameter]
 1599 |   const std::string &riscv_current_arch_str,
  |   ~~~^~
../../gcc/common/config/riscv/riscv-common.cc:1601:28: error: unused parameter 
'subset_list' [-Werror=unused-parameter]
 1601 |   const riscv_subset_list *subset_list, const struct switchstr 
*switches,
  |   ~^~~
../../gcc/common/config/riscv/riscv-common.cc:1601:65: error: unused parameter 
'switches' [-Werror=unused-parameter]
 1601 |   const riscv_subset_list *subset_list, const struct switchstr 
*switches,
  | ^~~~
../../gcc/common/config/riscv/riscv-common.cc:1602:7: error: unused parameter 
'n_switches' [-Werror=unused-parameter]
 1602 |   int n_switches, const std::vector 
&multilib_infos)
  |   ^~
../../gcc/common/config/riscv/riscv-common.cc: In function 'const char* 
riscv_select_multilib(const std::string&, const std::string&, const 
riscv_subset_list*, const switchstr*, int, const 
std::vector&)':
../../gcc/common/config/riscv/riscv-common.cc:1613:22: error: unused parameter 
'riscv_current_arch_str' [-Werror=unused-parameter]
 1613 |   const std::string &riscv_current_arch_str,
  |   ~~~^~
cc1plus: all warnings being treated as errors
make[3]: *** [Makefile:2485: riscv-common.o] Error 1

-- 
Andreas Schwab, sch...@linux-m68k.org
GPG Key fingerprint = 7578 EB47 D4E5 4D69 2510  2552 DF73 E780 A9DA AEC1
"And now for something completely different."


Re: [PATCH] RISC-V: Fix fail of vmv-imm-rv64.c in rv32

2023-05-12 Thread Robin Dapp via Gcc-patches
>> After updating my local codebase to trunk, I realized there is one more failure 
>> in RV32.
>> After this patch, all RVV failures are cleaned up.
>> Thanks.

But only because we build vmv-imm with autovec-preference=scalable.  With 
fixed-vlmax
it still does not work because I messed up the rebase against the series patch.

The following patch fixes it and adds another test similar to the repeating 
series
ones with fixed-vlmax.  Btw why is the vls-vlmax directory under autovec?
It's not really autovectorization (no loops).

 
Subject: [PATCH] [RISC-V] Allow more loading of const vectors.

This patch fixes the recent vmv patch in order to allow loading
of constants via vmv.vi.  It also adds another test analogous
to the series tests.

gcc/ChangeLog:

* config/riscv/riscv.cc (riscv_const_insns): Remove else.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/vls-vlmax/repeat-7.c: New test.
* gcc.target/riscv/rvv/autovec/vls-vlmax/repeat_run-7.c: New test.
---
 gcc/config/riscv/riscv.cc |   3 +-
 .../riscv/rvv/autovec/vls-vlmax/repeat-7.c| 219 ++
 .../rvv/autovec/vls-vlmax/repeat_run-7.c  | 145 
 3 files changed, 366 insertions(+), 1 deletion(-)
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/repeat-7.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/repeat_run-7.c

diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index de578b5b899..5b109766c35 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -1291,11 +1291,12 @@ riscv_const_insns (rtx x)
return 1;
  }
  }
+
/* Constants from -16 to 15 can be loaded with vmv.v.i.
   The Wc0, Wc1 constraints are already covered by the
   vi constraint so we do not need to check them here
   separately.  */
-   else if (TARGET_VECTOR && satisfies_constraint_vi (x))
+   if (satisfies_constraint_vi (x))
  return 1;
 
/* TODO: We may support more const vector in the future.  */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/repeat-7.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/repeat-7.c
new file mode 100644
index 000..bc5580ebd1d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/repeat-7.c
@@ -0,0 +1,219 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=rv32gcv -mabi=ilp32d" } */
+
+#include 
+
+typedef int8_t vnx2qi __attribute__ ((vector_size (2)));
+typedef int8_t vnx4qi __attribute__ ((vector_size (4)));
+typedef int8_t vnx8qi __attribute__ ((vector_size (8)));
+typedef int8_t vnx16qi __attribute__ ((vector_size (16)));
+typedef int8_t vnx32qi __attribute__ ((vector_size (32)));
+typedef int8_t vnx64qi __attribute__ ((vector_size (64)));
+typedef int8_t vnx128qi __attribute__ ((vector_size (128)));
+
+typedef int16_t vnx2hi __attribute__ ((vector_size (4)));
+typedef int16_t vnx4hi __attribute__ ((vector_size (8)));
+typedef int16_t vnx8hi __attribute__ ((vector_size (16)));
+typedef int16_t vnx16hi __attribute__ ((vector_size (32)));
+typedef int16_t vnx32hi __attribute__ ((vector_size (64)));
+typedef int16_t vnx64hi __attribute__ ((vector_size (128)));
+
+typedef int32_t vnx2si __attribute__ ((vector_size (8)));
+typedef int32_t vnx4si __attribute__ ((vector_size (16)));
+typedef int32_t vnx8si __attribute__ ((vector_size (32)));
+typedef int32_t vnx16si __attribute__ ((vector_size (64)));
+typedef int32_t vnx32si __attribute__ ((vector_size (128)));
+
+typedef int64_t vnx2di __attribute__ ((vector_size (16)));
+typedef int64_t vnx4di __attribute__ ((vector_size (32)));
+typedef int64_t vnx8di __attribute__ ((vector_size (64)));
+typedef int64_t vnx16di __attribute__ ((vector_size (128)));
+
+__attribute__ ((noipa)) void
+f_vnx2qi (int8_t *out)
+{
+  vnx2qi v = {-16, -16};
+  *(vnx2qi *) out = v;
+}
+
+__attribute__ ((noipa)) void
+f_vnx4qi (int8_t *out)
+{
+  vnx4qi v = {-15, -15, -15, -15};
+  *(vnx4qi *) out = v;
+}
+
+__attribute__ ((noipa)) void
+f_vnx8qi (int8_t *out)
+{
+  vnx8qi v = {-14, -14, -14, -14, -14, -14, -14, -14};
+  *(vnx8qi *) out = v;
+}
+
+__attribute__ ((noipa)) void
+f_vnx16qi (int8_t *out)
+{
+  vnx16qi v = {-13, -13, -13, -13, -13, -13, -13, -13,
+  -13, -13, -13, -13, -13, -13, -13, -13};
+  *(vnx16qi *) out = v;
+}
+
+__attribute__ ((noipa)) void
+f_vnx32qi (int8_t *out)
+{
+  vnx32qi v = {7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7};
+  *(vnx32qi *) out = v;
+}
+
+__attribute__ ((noipa)) void
+f_vnx64qi (int8_t *out)
+{
+  vnx64qi v = {-7, -7, -7, -7, -7, -7, -7, -7, -7, -7, -7, -7, -7,
+  -7, -7, -7, -7, -7, -7, -7, -7, -7, -7, -7, -7, -7,
+  -7, -7, -7, -7, -7, -7, -7, -7, -7, -7, -7, -7, -7,
+  -7, -7, -7, -7, -7, -7, -7, -7, -7, -7, -7, -7, -7,
+  -7, -7, -7, -7, -7, -7, -

Re: [PATCH 4/5] RISC-V: Add Zcmp extension supports.

2023-05-12 Thread Sinan via Gcc-patches
Hi, Kito and Jiawei
I have noticed that several comments are inaccurate or no longer valid (e.g. they
only apply to zc 0.5), and they need to be updated or improved.
> +
>> +namespace {
>> +
>> +/*
>> + 1. preprocessing:
>> + 1.1. if there is no push rtx, then just return. e.g.
>> + (note 5 1 22 2 [bb 2] NOTE_INSN_BASIC_BLOCK)
>> + (insn/f 22 5 23 2 (set (reg/f:SI 2 sp)
>> + (plus:SI (reg/f:SI 2 sp)
>> + (const_int -32 [0xffe0])))
>> + (nil))
>> + (note 23 22 2 2 NOTE_INSN_PROLOGUE_END)
>> + 1.2. if push rtx exists, then we compute the number of
>> + pushed s-registers, n_sreg.
>> +
>> + push rtx should be find before NOTE_INSN_PROLOGUE_END tag
>> +
>> + [2 and 3 happend simultaneously]
>> + 2. find valid move pattern, mv sN, aN, where N < n_sreg,
>> + and aN is not used the move pattern, and sN is not
>> + defined before the move pattern (from prologue to the
>> + position of move pattern).
>> + 3. analysis use and reach of every instruction from prologue
>> + to the position of move pattern.
>> + if any sN is used, then we mark the corresponding argument list
>> + candidate as invalid.
>> + e.g.
>> + push {ra,s0-s3}, {}, -32
>> + sw s0,44(sp) # s0 is used, then argument list is invalid
>> + mv a0,a5 # a0 is defined, then argument list is invalid
>> + ...
>> + mv s0,a0
>> + mv s1,a1
>> + mv s2,a2
>> +
>> + 4. if there is a valid argument list, then replace the pop
>> + push parallel insn, and delete mv pattern.
>> + if not, skip.
>> +*/
>
>I am not sure I understand this optimization pass correctly,
>could you give more example or indicate which testcase can demonstrate
>this pass?
>
>And I would prefer this pass split from this patch, let it become a separated
>patch including testcase.
This comment is incorrect.
This pass searches for `ret`, `cm.pop` and `mv a0, 0` and tries to combine them
into cm.popretz; you can find the relevant cm.popretz testcases at
https://www.mail-archive.com/gcc-patches@gcc.gnu.org/msg304545.html
> @@ -4777,6 +4881,66 @@ riscv_use_save_libcall (const struct riscv_frame_info 
> *frame)
>> return frame->save_libcall_adjustment != 0;
>> }
>>
>> +/* Determine how many instructions related to push/pop instructions. */
>> +
>> +static unsigned
>> +riscv_save_push_pop_count (unsigned mask)
>> +{
>> + if (!BITSET_P (mask, GP_REG_FIRST + RETURN_ADDR_REGNUM))
>> + return 0;
>> + for (unsigned n = GP_REG_LAST; n > GP_REG_FIRST; n--)
>> + if (BITSET_P (mask, n)
>> + && !call_used_regs [n])
>> + /* add ra saving and sp adjust. */
>> + return CALLEE_SAVED_REG_NUMBER (n) + 1 + 2;
>
>What the magic number of `+ 1 + 2`?
Well, it is really misleading here, and it would be better to make it clearer ...
`riscv_save_push_pop_count` is used to calculate the expected size of the
push/pop parallel pattern (the number of saved/restored registers plus one sp
adjust pattern), so
the number of xregs saved/restored = CALLEE_SAVED_REG_NUMBER (n) + 1, and
the `+ 2` is for the ra and sp adjustment patterns ...
>> +riscv_emit_push_insn (struct riscv_frame_info *frame, HOST_WIDE_INT size)
>> +{
>> + unsigned int veclen = riscv_save_push_pop_count (frame->mask);
>> + unsigned int n_reg = veclen - 1;
>
> Need comment to explain why `- 1` here.
So we use `- 1` (dropping the sp adjust pattern from the count) to calculate how many registers are saved/restored here.
BR,
Sinan
--
Sender:Kito Cheng 
Sent At:2023 May 4 (Thu.) 17:04
Recipient:Jiawei 
Cc:gcc-patches ; kito.cheng ; 
palmer ; christoph.muellner ; 
jeremy.bennett ; mary.bennett 
; nandni.jamnadas ; 
charlie.keaney ; simon.cook 
; tariq.kurd ; 
ibrahim.abu.kharmeh1 ; sinan.lin 
; wuwei2016 ; shihua 
; shiyulong ; chenyixuan 

Subject:Re: [PATCH 4/5] RISC-V: Add Zcmp extension supports.
Could you rebase this patch, we have some changes on
> All "zcmpe" means Zcmp with RVE extension.
Use zcmp_rve instead, zcmpe seems like a new ext. name
> diff --git a/gcc/config/riscv/riscv-zcmp-popret.cc 
> b/gcc/config/riscv/riscv-zcmp-popret.cc
> new file mode 100644
> index 000..d7b40f6a3e2
> --- /dev/null
> +++ b/gcc/config/riscv/riscv-zcmp-popret.cc
> @@ -0,0 +1,260 @@
Need a header here like "^#$% for RISC-V Copyright (C) 2023 Free
Software Foundation, Inc." here
> +#include "config.h"
...
> +#include "cfgrtl.h"
> +
> +#define IN_TARGET_CODE 1
This should appear before anything is included.
> +
> +namespace {
> +
> +/*
> + 1. preprocessing:
> + 1.1. if there is no push rtx, then just return. e.g.
> + (note 5 1 22 2 [bb 2] NOTE_INSN_BASIC_BLOCK)
> + (insn/f 22 5 23 2 (set (reg/f:SI 2 sp)
> + (plus:SI (reg/f:SI 2 sp)
> + (const_int -32 [0xffe0])))
> + (nil))
> + (note 23 22 2 2 NOTE_INSN_PROLOGUE_END)
> + 1.2. if push rtx exists, then we compute the number of
> + pushed s-registers, n_sreg.
> +
> + push rtx should be find before NOTE_INSN_PROLOGUE_END tag
> +
> + [2 and 3 happend simultaneously]
> + 2. find valid move pattern, mv sN, aN, where N < n_sreg,
> + and aN is not used the move pattern, and sN is not
> + defined before the move patte

[committed] RISC-V: Suppress unused parameter warning in riscv-common.cc

2023-05-12 Thread Kito Cheng via Gcc-patches
gcc/ChangeLog:

* common/config/riscv/riscv-common.cc (riscv_select_multilib_by_abi):
Drop unused parameter.
(riscv_select_multilib): Ditto.
(riscv_compute_multilib): Update call site of
riscv_select_multilib and riscv_select_multilib_by_abi.
---
 gcc/common/config/riscv/riscv-common.cc | 12 +++-
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/gcc/common/config/riscv/riscv-common.cc 
b/gcc/common/config/riscv/riscv-common.cc
index 57a2a279ef53..3a285dfbff0e 100644
--- a/gcc/common/config/riscv/riscv-common.cc
+++ b/gcc/common/config/riscv/riscv-common.cc
@@ -1596,10 +1596,8 @@ riscv_check_conds (
 
 static const char *
 riscv_select_multilib_by_abi (
-  const std::string &riscv_current_arch_str,
   const std::string &riscv_current_abi_str,
-  const riscv_subset_list *subset_list, const struct switchstr *switches,
-  int n_switches, const std::vector &multilib_infos)
+  const std::vector &multilib_infos)
 {
   for (size_t i = 0; i < multilib_infos.size (); ++i)
 if (riscv_current_abi_str == multilib_infos[i].abi_str)
@@ -1610,7 +1608,6 @@ riscv_select_multilib_by_abi (
 
 static const char *
 riscv_select_multilib (
-  const std::string &riscv_current_arch_str,
   const std::string &riscv_current_abi_str,
   const riscv_subset_list *subset_list, const struct switchstr *switches,
   int n_switches, const std::vector &multilib_infos)
@@ -1780,13 +1777,10 @@ riscv_compute_multilib (
   switch (select_kind)
 {
 case select_by_abi:
-  return riscv_select_multilib (riscv_current_arch_str,
-   riscv_current_abi_str, subset_list,
+  return riscv_select_multilib (riscv_current_abi_str, subset_list,
switches, n_switches, multilib_infos);
 case select_by_abi_arch_cmodel:
-  return riscv_select_multilib_by_abi (riscv_current_arch_str,
-  riscv_current_abi_str, subset_list,
-  switches, n_switches,
+  return riscv_select_multilib_by_abi (riscv_current_abi_str,
   multilib_infos);
 case select_by_builtin:
   gcc_unreachable ();
-- 
2.39.2



Re: [PATCH v2] RISC-V: Handle multi-lib path correclty for linux

2023-05-12 Thread Kito Cheng via Gcc-patches
Apologies for that...

fixed on top of trunk:
https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=7a7f6b26259d22115ee4813ce130622ad1073d16

On Fri, May 12, 2023 at 4:36 PM Andreas Schwab  wrote:
>
> WTF?
>
> ../../gcc/common/config/riscv/riscv-common.cc: In function 'const char* 
> riscv_select_multilib_by_abi(const std::string&, const std::string&, const 
> riscv_subset_list*, const switchstr*, int, const 
> std::vector&)':
> ../../gcc/common/config/riscv/riscv-common.cc:1599:22: error: unused 
> parameter 'riscv_current_arch_str' [-Werror=unused-parameter]
>  1599 |   const std::string &riscv_current_arch_str,
>   |   ~~~^~
> ../../gcc/common/config/riscv/riscv-common.cc:1601:28: error: unused 
> parameter 'subset_list' [-Werror=unused-parameter]
>  1601 |   const riscv_subset_list *subset_list, const struct switchstr 
> *switches,
>   |   ~^~~
> ../../gcc/common/config/riscv/riscv-common.cc:1601:65: error: unused 
> parameter 'switches' [-Werror=unused-parameter]
>  1601 |   const riscv_subset_list *subset_list, const struct switchstr 
> *switches,
>   | 
> ^~~~
> ../../gcc/common/config/riscv/riscv-common.cc:1602:7: error: unused parameter 
> 'n_switches' [-Werror=unused-parameter]
>  1602 |   int n_switches, const std::vector 
> &multilib_infos)
>   |   ^~
> ../../gcc/common/config/riscv/riscv-common.cc: In function 'const char* 
> riscv_select_multilib(const std::string&, const std::string&, const 
> riscv_subset_list*, const switchstr*, int, const 
> std::vector&)':
> ../../gcc/common/config/riscv/riscv-common.cc:1613:22: error: unused 
> parameter 'riscv_current_arch_str' [-Werror=unused-parameter]
>  1613 |   const std::string &riscv_current_arch_str,
>   |   ~~~^~
> cc1plus: all warnings being treated as errors
> make[3]: *** [Makefile:2485: riscv-common.o] Error 1
>
> --
> Andreas Schwab, sch...@linux-m68k.org
> GPG Key fingerprint = 7578 EB47 D4E5 4D69 2510  2552 DF73 E780 A9DA AEC1
> "And now for something completely different."


RE: [PATCH 1/2] PR gcc/98350:Add a param to control the length of the chain with FMA in reassoc pass

2023-05-12 Thread Cui, Lili via Gcc-patches
> ISTR there were no sufficient comments in the code explaining why
> rewrite_expr_tree_parallel_for_fma is better by design.  In fact ...
> 
> >
> > >
> > > >   if (!reassoc_insert_powi_p
> > > > - && ops.length () > 3
> > > > + && len > 3
> > > > + && (!keep_fma_chain
> > > > + || (keep_fma_chain
> > > > + && len >
> > > > + param_reassoc_max_chain_length_with_fma))
> > >
> > > in the case len < param_reassoc_max_chain_length_with_fma we have
> > > the chain re-sorted but fall through to non-parallel rewrite.  I
> > > wonder if we do not want to instead adjust the reassociation width?
> > > I'd say it depends on the number of mult cases in the chain (sth the re-
> sorting could have computed).
> > > Why do we have two completely independent --params here?  Can you
> > > give an example --param value combination that makes "sense" and
> > > show how it is beneficial?
> >
> > For this small case https://godbolt.org/z/Pxczrre8P a * b + c * d + e
> > * f  + j
> >
> > GCC trunk: ops_num = 4, targetm.sched.reassociation_width is 4 (scalar fp
> cost is 4). Calculated: Width = 2. we can get 2 FMAs.
> > --
> >   _1 = a_6(D) * b_7(D);
> >   _2 = c_8(D) * d_9(D);
> >   _5 = _1 + _2;
> >   _4 = e_10(D) * f_11(D);
> >   _3 = _4 + j_12(D);
> >   _13 = _3 + _5;
> > 
> >   _2 = c_8(D) * d_9(D);
> >   _5 = .FMA (a_6(D), b_7(D), _2);
> >   _3 = .FMA (e_10(D), f_11(D), j_12(D));
> >   _13 = _3 + _5;
> > 
> > New patch: If just rearrange ops and fall through to parallel rewrite to
> break the chain with width = 2.
> >
> > -
> >   _1 = a_6(D) * b_7(D);
> >   _2 = j + _1;  -> put j at the first.
> >   _3 = c_8(D) * d_9(D);
> >   _4 = e_10(D) * f_11(D);
> >   _5 = _3 + _4;   -> break chain with width = 2. we lost a FMA here.
> >   _13 = _2 + _5;
> >
> > ---
> >   _3 = c_8(D) * d_9(D);
> >   _2 = .FMA (a_6(D), b_7(D), j);
> >   _5 = .FMA (e_10(D), f_11(D), _3);
> >   _13 = _2 + _5;
> > 
> > Sometimes break chain will lose FMA( break chain needs put two
> > mult-ops together, which will lose one FMA ), we can only get 2 FMAs
> > here, if we want to get 3 FMAs, we need to keep the chain and not
> > break it. So I added a param to control chain length
> > "param_reassoc_max_chain_length_with_fma = 4" (For the small case in
> > Bugzilla 98350, we need to keep the chain to generate 6 FMAs.)
> > ---
> >   _1 = a_6(D) * b_7(D);
> >   _2 = c_8(D) * d_9(D);
> >   _4 = e_10(D) * f_11(D);
> >   _15 = _4 + j_12(D);
> >   _16 = _15 + _2;
> >   _13 = _16 + _1;
> > ---
> >   _15 = .FMA (e_10(D), f_11(D), j_12(D));
> >   _16 = .FMA (c_8(D), d_9(D), _15);
> >   _13 = .FMA (a_6(D), b_7(D), _16);
> > ---
> > In some case we want to break the chain with width, we can set
> "param_reassoc_max_chain_length_with_fma = 2", it will rearrange ops and
> break the chain with width.
> 
> ... it sounds like the problem could be fully addressed by sorting the chain
> with reassoc-width in mind?
> Wouldn't it be preferable if rewrite_expr_tree_parallel would get a vector of
> mul and a vector of non-mul ops so it can pick from the optimal candidate?
> 
> That said, I think rewrite_expr_tree_parallel_for_fma at least needs more
> comments.
> 
Sorry for not writing the notes clearly enough; I'll add more.
There are two points I need to clarify.

1. In some cases we need to keep the chain to generate more FMAs, because breaking
the chain loses FMAs.
   For example, for g + a * b + c * d + e * f,
   keeping the chain gives 3 FMAs, while breaking the chain gives 2 FMAs. It's hard to say
which one is better, so we provide a param for users to customize.
   
2. When the chain has FMAs and needs to be broken up according to the width,
for example l + a * b + c * d + e * f + g * h + j * k (the non-mul operand is already placed
first):
rewrite_expr_tree_parallel:
when width = 2, it breaks the chain like this. It actually breaks the chain
into 3: it ignores the width and adds all ops two by two, which loses FMAs.

ssa1 = l + a * b;
ssa2 = c * d + e * f;
ssa3 = g * h + j * k;
ssa4 = ssa1 + ssa2;
ssa5 = ssa4 + ssa3;

rewrite_expr_tree_parallel_for_fma
when width = 2, we break the chain into two like this.

ssa1 = l + a * b; 
ssa2 = c * d + e * f;
ssa3 = ssa1 + g * h;
ssa4 = ssa2 + j * k;
ssa5 = ssa3 +ssa4;

I think it's okay to remove or keep rewrite_expr_tree_parallel_for_fma. More 
FMAs are generated only for some special cases.
I'm not sure whether the new method is better than the old one. I created a 
small c

[PATCH 0/1] [V2] RISC-V: support Zcmp extension

2023-05-12 Thread Fei Gao
Before implementing Zcmp, I did some optimizations and restructures to 
save-restore.
https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=a5b2a3bff8152aa34408d8ce40add82f4d22ff87
https://gcc.gnu.org/git/?p=gcc.git;a=commitdiff;h=60524be1e3929d83e15fceac6e2aa053c8a6fb20
https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=a782346757c54a5a3cfb9f416a7ebe3554a617d7

Then Zcmp can share the same logic as save-restore in stack allocation: 
pre-allocation
by cm.push, step 1 and step 2.

Please note that cm.push pushes ra, s0-s11 in the reverse order of what
save-restore does.
So the .cfi directives have been adapted accordingly in my patch. A discussion can be found
here:
https://github.com/riscv/riscv-code-size-reduction/issues/182

Weeks before, Jiawei also posted Zcmp in 
https://gcc.gnu.org/pipermail/gcc-patches/2023-April/615287.html.
[PATCH 0/5] RISC-V: Support ZC* extensions.   Jiawei
[PATCH 1/5] RISC-V: Minimal support for ZC extensions.   Jiawei
[PATCH 2/5] RISC-V: Enable compressible features when use ZC* extensions.   
Jiawei
[PATCH 3/5] RISC-V: Add ZC* test for march args being passed.   Jiawei
[PATCH 4/5] RISC-V: Add Zcmp extension supports.   Jiawei
[PATCH 5/5] RISC-V: Add ZCMP push/pop testcases.   Jiawei

I tested his code and observed some issues in [PATCH 4/5],
see https://www.mail-archive.com/gcc-patches@gcc.gnu.org/msg306921.html for
details.
So I plan to post my code as an alternative to Jiawei's [PATCH 4/5].

My Zcmp switch code is almost the same as Jiawei's,
so I avoid repeating it in my patch series.
Please pick up Jiawei's [PATCH 1/5] before picking up my patch series.

Fei Gao (1):
  [RISC-V] support  cm.push cm.pop cm.popret in zcmp

 gcc/config/riscv/predicates.md|  148 +++
 gcc/config/riscv/riscv-protos.h   |2 +
 gcc/config/riscv/riscv.cc |  477 +++-
 gcc/config/riscv/riscv.h  |   23 +
 gcc/config/riscv/riscv.md |2 +
 gcc/config/riscv/zc.md| 1042 +
 gcc/testsuite/gcc.target/riscv/rv32e_zcmp.c   |  239 
 gcc/testsuite/gcc.target/riscv/rv32i_zcmp.c   |  239 
 .../gcc.target/riscv/zcmp_stack_alignment.c   |   23 +
 9 files changed, 2155 insertions(+), 40 deletions(-)
 create mode 100644 gcc/config/riscv/zc.md
 create mode 100644 gcc/testsuite/gcc.target/riscv/rv32e_zcmp.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rv32i_zcmp.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/zcmp_stack_alignment.c

-- 
2.17.1



[PATCH 1/1] [V2] [RISC-V] support cm.push cm.pop cm.popret in zcmp

2023-05-12 Thread Fei Gao
Zcmp can share the same logic as save-restore in stack allocation: 
pre-allocation
by cm.push, step 1 and step 2.

Please note that cm.push pushes ra, s0-s11 in the reverse order of what
save-restore does.
So the .cfi directives have been adapted accordingly in my patch.

gcc/ChangeLog:

* config/riscv/predicates.md (slot_0_offset_operand): predicates for 
slot 0 offset.
(slot_1_offset_operand): likewise
(slot_2_offset_operand): likewise
(slot_3_offset_operand): likewise
(slot_4_offset_operand): likewise
(slot_5_offset_operand): likewise
(slot_6_offset_operand): likewise
(slot_7_offset_operand): likewise
(slot_8_offset_operand): likewise
(slot_9_offset_operand): likewise
(slot_10_offset_operand): likewise
(slot_11_offset_operand): likewise
(slot_12_offset_operand): likewise
(stack_push_up_to_ra_operand): predicates for stack adjust of pushing ra
(stack_push_up_to_s0_operand): predicates for stack adjust of pushing 
ra, s0
(stack_push_up_to_s1_operand): likewise
(stack_push_up_to_s2_operand): likewise
(stack_push_up_to_s3_operand): likewise
(stack_push_up_to_s4_operand): likewise
(stack_push_up_to_s5_operand): likewise
(stack_push_up_to_s6_operand): likewise
(stack_push_up_to_s7_operand): likewise
(stack_push_up_to_s8_operand): likewise
(stack_push_up_to_s9_operand): likewise
(stack_push_up_to_s11_operand): likewise
(stack_pop_up_to_ra_operand): predicates for stack adjust of popping ra
(stack_pop_up_to_s0_operand): predicates for stack adjust of popping ra, 
s0
(stack_pop_up_to_s1_operand): likewise
(stack_pop_up_to_s2_operand): likewise
(stack_pop_up_to_s3_operand): likewise
(stack_pop_up_to_s4_operand): likewise
(stack_pop_up_to_s5_operand): likewise
(stack_pop_up_to_s6_operand): likewise
(stack_pop_up_to_s7_operand): likewise
(stack_pop_up_to_s8_operand): likewise
(stack_pop_up_to_s9_operand): likewise
(stack_pop_up_to_s11_operand): likewise
* config/riscv/riscv-protos.h (riscv_zcmp_valid_slot_offset_p): 
declaration
(riscv_zcmp_valid_stack_adj_bytes_p): declaration
* config/riscv/riscv.cc (struct riscv_frame_info): comment change
(riscv_avoid_multi_push): helper function of riscv_use_multi_push
(riscv_use_multi_push): true if multi push is used
(riscv_multi_push_sregs_count): num of sregs in multi-push
(riscv_multi_push_regs_count): num of regs in multi-push
(riscv_16bytes_align): align to 16 bytes
(riscv_stack_align): moved to a better place
(riscv_save_libcall_count): no functional change
(riscv_compute_frame_info): add zcmp frame info
(riscv_adjust_multi_push_cfi_prologue): adjust cfi for cm.push
(get_slot_offset_rtx): get the rtx of slot to push or pop
(riscv_gen_multi_push_pop_insn): gen function for multi push and pop
(riscv_expand_prologue): allocate stack by cm.push
(riscv_adjust_multi_pop_cfi_epilogue): adjust cfi for cm.pop[ret]
(riscv_expand_epilogue): allocate stack by cm.pop[ret]
(zcmp_base_adj): calculate stack adjustment base size
(zcmp_additional_adj): calculate stack adjustment additional size
(riscv_zcmp_valid_slot_offset_p): check if offset is valid for a slot
(riscv_zcmp_valid_stack_adj_bytes_p): check if stack adjustment size is 
valid
* config/riscv/riscv.h (RETURN_ADDR_MASK): mask of ra
(S0_MASK): likewise
(S1_MASK): likewise
(S2_MASK): likewise
(S3_MASK): likewise
(S4_MASK): likewise
(S5_MASK): likewise
(S6_MASK): likewise
(S7_MASK): likewise
(S8_MASK): likewise
(S9_MASK): likewise
(S10_MASK): likewise
(S11_MASK): likewise
(MULTI_PUSH_GPR_MASK): GPR_MASK that cm.push can cover at most
(ZCMP_MAX_SPIMM): max spimm value
(ZCMP_SP_INC_STEP): zcmp sp increment step
(ZCMP_INVALID_S0S10_SREGS_COUNTS): num of s0-s10
(ZCMP_S0S11_SREGS_COUNTS): num of s0-s11
(ZCMP_MAX_GRP_SLOTS): max slots of pushing and popping in zcmp
* config/riscv/riscv.md: include zc.md
* config/riscv/zc.md: New file. machine description for zcmp

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rv32e_zcmp.c: New test.
* gcc.target/riscv/rv32i_zcmp.c: New test.
* gcc.target/riscv/zcmp_stack_alignment.c: New test.
---
 gcc/config/riscv/predicates.md|  148 +++
 gcc/config/riscv/riscv-protos.h   |2 +
 gcc/config/riscv/riscv.cc |  477 +++-
 gcc/config/riscv/riscv.h  |   23 +
 gcc/config/riscv/riscv.md |2 +
 gcc/config/riscv/zc.md| 1042 +
 gcc/testsuite/gcc.

Re: Re: [PATCH 4/5] RISC-V: Add Zcmp extension supports.

2023-05-12 Thread Fei Gao
On 2023-05-12 16:12  Sinan  wrote:
>
>Hi Fei,
>Sorry for the late reply, I've been busy with moving these days :(.
>Thanks for working on it. I would prefer removing the extra pass for popretz 
>if possible ... I will test your patches ASAP.
>BR,
>Sinan 

hi Sinan

I posted V2 based on Kito's comment just now.
https://www.mail-archive.com/gcc-patches@gcc.gnu.org/msg307507.html

For popretz, we can discuss further offline if it's convenient to you.

BR, 
Fei
>--
>Sender:Fei Gao 
>Sent At:2023 May 6 (Sat.) 16:53
>Recipient:Sinan 
>Cc:jiawei ; gcc-patches 
>Subject:Re: Re: [PATCH 4/5] RISC-V: Add Zcmp extension supports.
>On 2023-05-05 23:57 Sinan  wrote:
>>
>>> hi Jiawei
>>>
>>> Please ignore my previous reply. I accidently sent the email before I 
>>> finished it.
>>> Sorry for that!
>>>
>>> I downloaded the series of patches from you and found in some cases
>>> it fails to generate zcmp push and pop insns.
>>>
>>> TC:
>>>
>>> char my_getchar();
>>> int test_s0()
>>> {
>>>
>>> int a = my_getchar();
>>> int b = my_getchar();
>>> return a+b;
>>> }
>>>
>>> cc1 -fno-shrink-wrap-separate -O2 -march=rv32e_zca_zcmp -mabi=ilp32e 
>>> -mcmodel=medlow test.c
>>>
>>> -fno-shrink-wrap-separate is used here to avoid the impact from 
>>> shrink-wrap-separate that is by default
>>> enabled in O2.
>>>
>>> As i'm also interested in Zc*, i did some changes mainly in prologue and 
>>> epilogue pass quite simliar to
>>> what has been done for save and restore except the CFI directives due to 
>>> reversed order that zcmp
>>> pushes and pops ra, s regs than what save and restore do.
>>>
>>> I will refine and share the code soon for your review.
>>>
>>> BR
>>> Fei
>>Hi Fei,
>>In the current implementation, cm.push will not increase the original 
>>adjustment size of the stack pointer. As cm.push uses a minimum adjustment 
>>size of 16, and in your example, the adjustment size of sp is 12, so cm.push 
>>will not be generated.
>>you can find the check at riscv_use_push_pop
>>> > + */
>>> > + if (base_size > frame_size)
>>> > + return false;
>>> > +
>>And if this check is removed, then you can get the output that you expect.
>>```
>> cm.push {ra,s0},-16
>> call my_getchar
>> mv s0,a0
>> call my_getchar
>> add a0,s0,a0
>> cm.popret {ra,s0},16
>>```
>>In many scenarios of rv32e, cm.push cannot be generated as a result. Perhaps 
>>we can remove this check? I haven't tested if it is ok to remove this check, 
>>and CC jiawei to help test it.
>>BR,
>>Sinan
>hi Sinan
>Thanks for your reply.
>I posted my codes at 
>https://www.mail-archive.com/gcc-patches@gcc.gnu.org/msg306921.html
>In the cover letter, i did some comparision.
>Could you please review?
>Thanks & BR,
>Fei
>>--
>>Sender:Fei Gao 
>>Sent At:2023 Apr. 25 (Tue.) 18:12
>>Recipient:jiawei 
>>Cc:gcc-patches 
>>Subject:[PATCH 4/5] RISC-V: Add Zcmp extension supports.
>>hi Jiawei
>>Please ignore my previous reply. I accidently sent the email before I 
>>finished it.
>>Sorry for that!
>>I downloaded the series of patches from you and found in some cases
>>it fails to generate zcmp push and pop insns.
>>TC:
>>char my_getchar();
>>int test_s0()
>>{
>> int a = my_getchar();
>> int b = my_getchar();
>> return a+b;
>>}
>>cc1 -fno-shrink-wrap-separate -O2 -march=rv32e_zca_zcmp -mabi=ilp32e 
>>-mcmodel=medlow test.c
>>-fno-shrink-wrap-separate is used here to avoid the impact from 
>>shrink-wrap-separate that is by default
>>enabled in O2.
>>As i'm also interested in Zc*, i did some changes mainly in prologue and 
>>epilogue pass quite simliar to
>>what has been done for save and restore except the CFI directives due to 
>>reversed order that zcmp
>>pushes and pops ra, s regs than what save and restore do.
>>I will refine and share the code soon for your review.
>>BR
>>Fei
>>On Thu Apr 6 06:21:17 GMT 2023 Jiawei jia...@iscas.ac.cn wrote:
>>>
>>>Add Zcmp extension instructions support. Generate push/pop
>>>with follow steps:
>>>
>>> 1. preprocessing:
>>> 1.1. if there is no push rtx, then just return. e.g.
>>> (note 5 1 22 2 [bb 2] NOTE_INSN_BASIC_BLOCK)
>>> (insn/f 22 5 23 2 (set (reg/f:SI 2 sp)
>>> (plus:SI (reg/f:SI 2 sp)
>>> (const_int -32 [0xffe0])))
>>> (nil))
>>> (note 23 22 2 2 NOTE_INSN_PROLOGUE_END)
>>> 1.2. if push rtx exists, then we compute the number of
>>> pushed s-registers, n_sreg.
>>>
>>> push rtx should be find before NOTE_INSN_PROLOGUE_END tag
>>>
>>> [2 and 3 happend simultaneously]
>>>
>>> 2. find valid move pattern, mv sN, aN, where N < n_sreg,
>>> and aN is not used the move pattern, and sN is not
>>> defined before the move pattern (from prologue to the
>>> position of move pattern).
>>>
>>> 3. analysis use and reach of every instruction from prologue
>>> to the position of move pattern.
>>> if any sN is used, then we mark the corresponding argument list
>>> candidate as invalid.
>>> e.g.
>>> push {ra,s0-s3}, {}, -32
>>> sw s0,44(

Re: Re: [PATCH] RISC-V: Fix fail of vmv-imm-rv64.c in rv32

2023-05-12 Thread juzhe.zh...@rivai.ai
vec_init is an auto-vectorization pattern.
The test does not use loops because using vector types makes it easier to test the patterns.



juzhe.zh...@rivai.ai
 
From: Robin Dapp
Date: 2023-05-12 16:53
To: Li, Pan2; Kito Cheng; juzhe.zh...@rivai.ai
CC: gcc-patches@gcc.gnu.org; pal...@dabbelt.com; jeffreya...@gmail.com
Subject: Re: [PATCH] RISC-V: Fix fail of vmv-imm-rv64.c in rv32
>> After update local codebase to the trunk. I realize there is one more fail 
>> in RV32.
>> After this patch, all fails of RVV are cleaned up.
>> Thanks.
 
But only because we build vmv-imm with autovec-preference=scalable.  With 
fixed-vlmax
it still does not work because I messed up the rebase against the series patch.
 
The following patch fixes it and adds another test similar to the repeating 
series
ones with fixed-vlmax.  Btw why is the vls-vlmax directory under autovec?
It's not really autovectorization (no loops).
 
Subject: [PATCH] [RISC-V] Allow more loading of const vectors.
 
This patch fixes the recent vmv patch in order to allow loading
of constants via vmv.v.i.  It also adds another test analogous
to the series tests.
 
gcc/ChangeLog:
 
* config/riscv/riscv.cc (riscv_const_insns): Remove else.
 
gcc/testsuite/ChangeLog:
 
* gcc.target/riscv/rvv/autovec/vls-vlmax/repeat-7.c: New test.
* gcc.target/riscv/rvv/autovec/vls-vlmax/repeat_run-7.c: New test.
---
gcc/config/riscv/riscv.cc |   3 +-
.../riscv/rvv/autovec/vls-vlmax/repeat-7.c| 219 ++
.../rvv/autovec/vls-vlmax/repeat_run-7.c  | 145 
3 files changed, 366 insertions(+), 1 deletion(-)
create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/repeat-7.c
create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/repeat_run-7.c
 
diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index de578b5b899..5b109766c35 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -1291,11 +1291,12 @@ riscv_const_insns (rtx x)
return 1;
  }
  }
+
/* Constants from -16 to 15 can be loaded with vmv.v.i.
   The Wc0, Wc1 constraints are already covered by the
   vi constraint so we do not need to check them here
   separately.  */
- else if (TARGET_VECTOR && satisfies_constraint_vi (x))
+ if (satisfies_constraint_vi (x))
  return 1;
/* TODO: We may support more const vector in the future.  */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/repeat-7.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/repeat-7.c
new file mode 100644
index 000..bc5580ebd1d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/repeat-7.c
@@ -0,0 +1,219 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=rv32gcv -mabi=ilp32d" } */
+
+#include <stdint-gcc.h>
+
+typedef int8_t vnx2qi __attribute__ ((vector_size (2)));
+typedef int8_t vnx4qi __attribute__ ((vector_size (4)));
+typedef int8_t vnx8qi __attribute__ ((vector_size (8)));
+typedef int8_t vnx16qi __attribute__ ((vector_size (16)));
+typedef int8_t vnx32qi __attribute__ ((vector_size (32)));
+typedef int8_t vnx64qi __attribute__ ((vector_size (64)));
+typedef int8_t vnx128qi __attribute__ ((vector_size (128)));
+
+typedef int16_t vnx2hi __attribute__ ((vector_size (4)));
+typedef int16_t vnx4hi __attribute__ ((vector_size (8)));
+typedef int16_t vnx8hi __attribute__ ((vector_size (16)));
+typedef int16_t vnx16hi __attribute__ ((vector_size (32)));
+typedef int16_t vnx32hi __attribute__ ((vector_size (64)));
+typedef int16_t vnx64hi __attribute__ ((vector_size (128)));
+
+typedef int32_t vnx2si __attribute__ ((vector_size (8)));
+typedef int32_t vnx4si __attribute__ ((vector_size (16)));
+typedef int32_t vnx8si __attribute__ ((vector_size (32)));
+typedef int32_t vnx16si __attribute__ ((vector_size (64)));
+typedef int32_t vnx32si __attribute__ ((vector_size (128)));
+
+typedef int64_t vnx2di __attribute__ ((vector_size (16)));
+typedef int64_t vnx4di __attribute__ ((vector_size (32)));
+typedef int64_t vnx8di __attribute__ ((vector_size (64)));
+typedef int64_t vnx16di __attribute__ ((vector_size (128)));
+
+__attribute__ ((noipa)) void
+f_vnx2qi (int8_t *out)
+{
+  vnx2qi v = {-16, -16};
+  *(vnx2qi *) out = v;
+}
+
+__attribute__ ((noipa)) void
+f_vnx4qi (int8_t *out)
+{
+  vnx4qi v = {-15, -15, -15, -15};
+  *(vnx4qi *) out = v;
+}
+
+__attribute__ ((noipa)) void
+f_vnx8qi (int8_t *out)
+{
+  vnx8qi v = {-14, -14, -14, -14, -14, -14, -14, -14};
+  *(vnx8qi *) out = v;
+}
+
+__attribute__ ((noipa)) void
+f_vnx16qi (int8_t *out)
+{
+  vnx16qi v = {-13, -13, -13, -13, -13, -13, -13, -13,
+-13, -13, -13, -13, -13, -13, -13, -13};
+  *(vnx16qi *) out = v;
+}
+
+__attribute__ ((noipa)) void
+f_vnx32qi (int8_t *out)
+{
+  vnx32qi v = {7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7};
+  *(vnx32qi *) out = v;
+}
+
+__attribute__ ((noipa)) void
+f_vnx64qi (int8_t *out)
+{
+  vnx64qi v = {-7, -7, -7, -7, -7, -7, -7, -7, -7, -7, -7, -7, -7,
+

Re: [PATCH 2/2] nvptx: Prevent emitting duplicate declarations for '__nvptx_stacks', '__nvptx_uni'

2023-05-12 Thread Thomas Schwinge
Hi!

On 2022-12-19T21:40:07+0100, Thomas Schwinge  wrote:
> As I have reported to Nvidia in 2022-12-01 'NVIDIA Incident Report (3891704):
> ptxas: Duplicate declaration error: "cannot be resolved by a '.static'"',
> 'ptxas' has an inscrutable error mode for duplicate declarations:
>
> ptxas softstack-decl-1.o, line 11; error   : '.extern' variable 
> '__nvptx_stacks' cannot be resolved by a '.static'
> ptxas fatal   : Ptx assembly aborted due to errors
> nvptx-as: ptxas returned 255 exit status
>
> ptxas uniform-simt-decl-1.o, line 12; error   : '.extern' variable 
> '__nvptx_uni' cannot be resolved by a '.static'
> ptxas fatal   : Ptx assembly aborted due to errors
> nvptx-as: ptxas returned 255 exit status
>
> This is inscrutable, because (a) what is "cannot be resolved by a '.static'"
> supposed to tell me (there is no '.static' in PTX?), and (b) why aren't
> repeated declaration just verified to match the first, but otherwise a no-op
> (like in other programming languages)?

Since my report, this had its 'Status changed [...] to "Closed - Fixed"'
(2023-01-28), with comment:

| [...] fix should be available in a later release.
| The compiler was modified to allow duplicate declaration of extern symbol. 
You will not see an error for this case.
| The documentation is also being changed to reflect this new change.

I've not yet verified the CUDA/'ptxas'-level fix, but I suggest retracting
my GCC-level proposed change:

> --- a/gcc/config/nvptx/nvptx.cc
> +++ b/gcc/config/nvptx/nvptx.cc

> +static bool have_softstack_decl;

> +static bool have_unisimt_decl;

> @@ -2571,6 +2573,13 @@ nvptx_assemble_undefined_decl (FILE *file, const char 
> *name, const_tree decl)
>  TREE_TYPE (decl), size ? tree_to_shwi (size) : 0,
>  DECL_ALIGN (decl), true);
>nvptx_assemble_decl_end ();
> +
> +  static tree softstack_id = get_identifier ("__nvptx_stacks");
> +  static tree unisimt_id = get_identifier ("__nvptx_uni");
> +  if (DECL_NAME (decl) == softstack_id)
> +have_softstack_decl = true;
> +  else if (DECL_NAME (decl) == unisimt_id)
> +have_unisimt_decl = true;
>  }

> @@ -6002,7 +6011,7 @@ nvptx_file_end (void)
>  write_shared_buffer (asm_out_file, gang_private_shared_sym,
>  gang_private_shared_align, gang_private_shared_size);
>
> -  if (need_softstack_decl)
> +  if (need_softstack_decl && !have_softstack_decl)
>  {
>write_var_marker (asm_out_file, false, true, "__nvptx_stacks");
>/* 32 is the maximum number of warps in a block.  Even though it's an
> @@ -6011,7 +6020,8 @@ nvptx_file_end (void)
>fprintf (asm_out_file, ".extern .shared .u%d __nvptx_stacks[32];\n",
>POINTER_SIZE);
>  }
> -  if (need_unisimt_decl)
> +
> +  if (need_unisimt_decl && !have_unisimt_decl)
>  {
>write_var_marker (asm_out_file, false, true, "__nvptx_uni");
>fprintf (asm_out_file, ".extern .shared .u32 __nvptx_uni[32];\n");

..., and suggest that we instead fix up duplicate declarations in the
nvptx-tools 'as', and once GCC depends on a nvptx-tools version with that
addressed, we still change the test cases from "compile" to "assemble" as
proposed:

> --- a/gcc/testsuite/gcc.target/nvptx/softstack-decl-1.c
> +++ b/gcc/testsuite/gcc.target/nvptx/softstack-decl-1.c
> @@ -1,4 +1,4 @@
> -/* { dg-do compile } */
> +/* { dg-do assemble } */
>  /* { dg-options {-save-temps -O0 -msoft-stack} } */
>
>  extern void *__nvptx_stacks[32] __attribute__((shared,nocommon));

> --- a/gcc/testsuite/gcc.target/nvptx/uniform-simt-decl-1.c
> +++ b/gcc/testsuite/gcc.target/nvptx/uniform-simt-decl-1.c
> @@ -1,4 +1,4 @@
> -/* { dg-do compile } */
> +/* { dg-do assemble } */
>  /* { dg-options {-save-temps -O0 -muniform-simt} } */
>
>  extern unsigned __nvptx_uni[32] __attribute__((shared,nocommon));

..., but (obviously) without the following changes:

> --- a/gcc/testsuite/gcc.target/nvptx/softstack-decl-1.c
> +++ b/gcc/testsuite/gcc.target/nvptx/softstack-decl-1.c

> -/* The implicit (via 'need_softstack_decl') and explicit declarations of
> -   '__nvptx_stacks' are both emitted:
> -   { dg-final { scan-assembler-times {(?n)\.extern .* __nvptx_stacks\[32\];} 
> 2 } }
> +/* Of the implicit (via 'need_softstack_decl') and explicit declarations of
> +   '__nvptx_stacks', only one is emitted:
> +   { dg-final { scan-assembler-times {(?n)\.extern .* __nvptx_stacks\[32\];} 
> 1 } }

> --- a/gcc/testsuite/gcc.target/nvptx/uniform-simt-decl-1.c
> +++ b/gcc/testsuite/gcc.target/nvptx/uniform-simt-decl-1.c

> -/* The implicit (via 'need_unisimt_decl') and explicit declarations of
> -   '__nvptx_uni' are both emitted:
> -   { dg-final { scan-assembler-times {(?n)\.extern .* __nvptx_uni\[32\];} 2 
> } }
> +/* Of the implicit (via 'need_unisimt_decl') and explicit declarations of
> +   '__nvptx_uni', only one is emitted:
> +   { dg-final { scan-assembler-times {(?n)\.extern .* __nvptx_uni\[32\

[PATCH 01/26] arm: [MVE intrinsics] add binary_widen_opt_n shape

2023-05-12 Thread Christophe Lyon via Gcc-patches
This patch adds the binary_widen_opt_n shape description.

2022-12-12  Christophe Lyon  

gcc/
* config/arm/arm-mve-builtins-shapes.cc (binary_widen_opt_n): New.
* config/arm/arm-mve-builtins-shapes.h (binary_widen_opt_n): New.
---
 gcc/config/arm/arm-mve-builtins-shapes.cc | 49 +++
 gcc/config/arm/arm-mve-builtins-shapes.h  |  1 +
 2 files changed, 50 insertions(+)

diff --git a/gcc/config/arm/arm-mve-builtins-shapes.cc 
b/gcc/config/arm/arm-mve-builtins-shapes.cc
index 5a299a272f5..ee4bc3f8ea4 100644
--- a/gcc/config/arm/arm-mve-builtins-shapes.cc
+++ b/gcc/config/arm/arm-mve-builtins-shapes.cc
@@ -1098,6 +1098,55 @@ struct binary_widen_n_def : public overloaded_base<0>
 };
 SHAPE (binary_widen_n)
 
+/* <T0:twice>_t vfoo[_t0](<T0>_t, <T0>_t)
+   <T0:twice>_t vfoo[_n_t0](<T0>_t, <S0>_t)
+
+   Example: vqdmulltq.
+   int32x4_t [__arm_]vqdmulltq[_n_s16](int16x8_t a, int16_t b)
+   int32x4_t [__arm_]vqdmulltq_m[_n_s16](int32x4_t inactive, int16x8_t a, 
int16_t b, mve_pred16_t p)
+   int32x4_t [__arm_]vqdmulltq[_s16](int16x8_t a, int16x8_t b)
+   int32x4_t [__arm_]vqdmulltq_m[_s16](int32x4_t inactive, int16x8_t a, 
int16x8_t b, mve_pred16_t p)  */
+struct binary_widen_opt_n_def : public overloaded_base<0>
+{
+  void
+  build (function_builder &b, const function_group_info &group,
+bool preserve_user_namespace) const override
+  {
+b.add_overloaded_functions (group, MODE_none, preserve_user_namespace);
+build_all (b, "vw0,v0,v0", group, MODE_none, preserve_user_namespace);
+build_all (b, "vw0,v0,s0", group, MODE_n, preserve_user_namespace);
+  }
+
+  tree
+  resolve (function_resolver &r) const override
+  {
+unsigned int i, nargs;
+type_suffix_index type;
+if (!r.check_gp_argument (2, i, nargs)
+   || (type = r.infer_vector_type (i - 1)) == NUM_TYPE_SUFFIXES)
+  return error_mark_node;
+
+type_suffix_index wide_suffix
+  = find_type_suffix (type_suffixes[type].tclass,
+ type_suffixes[type].element_bits * 2);
+
+/* Skip last argument, may be scalar, will be checked below by
+   finish_opt_n_resolution.  */
+unsigned int last_arg = i--;
+for (; i > 0; i--)
+  if (!r.require_matching_vector_type (i, type))
+   return error_mark_node;
+
+/* Check the inactive argument has the wide type.  */
+if ((r.pred == PRED_m)
+   && (r.infer_vector_type (0) != wide_suffix))
+return r.report_no_such_form (type);
+
+return r.finish_opt_n_resolution (last_arg, 0, type);
+  }
+};
+SHAPE (binary_widen_opt_n)
+
 /* Shape for comparison operations that operate on
uniform types.
 
diff --git a/gcc/config/arm/arm-mve-builtins-shapes.h 
b/gcc/config/arm/arm-mve-builtins-shapes.h
index a28cd6a1547..07b12b4af68 100644
--- a/gcc/config/arm/arm-mve-builtins-shapes.h
+++ b/gcc/config/arm/arm-mve-builtins-shapes.h
@@ -53,6 +53,7 @@ namespace arm_mve
 extern const function_shape *const binary_rshift_narrow;
 extern const function_shape *const binary_rshift_narrow_unsigned;
 extern const function_shape *const binary_widen_n;
+extern const function_shape *const binary_widen_opt_n;
 extern const function_shape *const cmp;
 extern const function_shape *const create;
 extern const function_shape *const inherent;
-- 
2.34.1



[PATCH 09/26] arm: [MVE intrinsics] add binary_imm32 shape

2023-05-12 Thread Christophe Lyon via Gcc-patches
This patch adds the binary_imm32 shape description.

2022-12-12  Christophe Lyon  

gcc/
* config/arm/arm-mve-builtins-shapes.cc (binary_imm32): New.
* config/arm/arm-mve-builtins-shapes.h (binary_imm32): New.
---
 gcc/config/arm/arm-mve-builtins-shapes.cc | 27 +++
 gcc/config/arm/arm-mve-builtins-shapes.h  |  1 +
 2 files changed, 28 insertions(+)

diff --git a/gcc/config/arm/arm-mve-builtins-shapes.cc 
b/gcc/config/arm/arm-mve-builtins-shapes.cc
index 91540838e03..c2e138c12e1 100644
--- a/gcc/config/arm/arm-mve-builtins-shapes.cc
+++ b/gcc/config/arm/arm-mve-builtins-shapes.cc
@@ -489,6 +489,33 @@ struct binary_acca_int64_def : public overloaded_base<0>
 };
 SHAPE (binary_acca_int64)
 
+/* _t vfoo[_n_t0](_t, int32_t)
+
+   i.e. the shape for binary operations that operate on
+   a vector and an int32_t.
+
+   Example: vbrsrq.
+   int16x8_t [__arm_]vbrsrq[_n_s16](int16x8_t a, int32_t b)
+   int16x8_t [__arm_]vbrsrq_m[_n_s16](int16x8_t inactive, int16x8_t a, int32_t 
b, mve_pred16_t p)
+   int16x8_t [__arm_]vbrsrq_x[_n_s16](int16x8_t a, int32_t b, mve_pred16_t p)  
*/
+struct binary_imm32_def : public overloaded_base<0>
+{
+  void
+  build (function_builder &b, const function_group_info &group,
+bool preserve_user_namespace) const override
+  {
+b.add_overloaded_functions (group, MODE_n, preserve_user_namespace);
+build_all (b, "v0,v0,ss32", group, MODE_n, preserve_user_namespace);
+  }
+
+  tree
+  resolve (function_resolver &r) const override
+  {
+return r.resolve_uniform (1, 1);
+  }
+};
+SHAPE (binary_imm32)
+
 /* _t vfoo[_n_t0](_t, const int)
 
Shape for vector shift right operations that take a vector first
diff --git a/gcc/config/arm/arm-mve-builtins-shapes.h 
b/gcc/config/arm/arm-mve-builtins-shapes.h
index 6ae1443f26b..bba38194ce2 100644
--- a/gcc/config/arm/arm-mve-builtins-shapes.h
+++ b/gcc/config/arm/arm-mve-builtins-shapes.h
@@ -41,6 +41,7 @@ namespace arm_mve
 extern const function_shape *const binary_acc_int64;
 extern const function_shape *const binary_acca_int32;
 extern const function_shape *const binary_acca_int64;
+extern const function_shape *const binary_imm32;
 extern const function_shape *const binary_lshift_unsigned;
 extern const function_shape *const binary_maxamina;
 extern const function_shape *const binary_maxavminav;
-- 
2.34.1



[PATCH 07/26] arm: [MVE intrinsics] factorize vqshluq

2023-05-12 Thread Christophe Lyon via Gcc-patches
Factorize vqshluq builtins so that they use parameterized names.

2022-12-12  Christophe Lyon  

gcc/
* config/arm/iterators.md (mve_insn): Add vqshlu.
(supf): Add VQSHLUQ_M_N_S, VQSHLUQ_N_S.
(VQSHLUQ_M_N, VQSHLUQ_N): New.
* config/arm/mve.md (mve_vqshluq_n_s): Change name into ...
(@mve_q_n_): ... this.
(mve_vqshluq_m_n_s): Change name into ...
(@mve_q_m_n_): ... this.
---
 gcc/config/arm/iterators.md |  6 ++
 gcc/config/arm/mve.md   | 12 ++--
 2 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index 116dd95fd88..d1d14488b56 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -1071,6 +1071,8 @@ (define_int_attr mve_insn [
 (VQSHLQ_N_S "vqshl") (VQSHLQ_N_U "vqshl")
 (VQSHLQ_R_S "vqshl") (VQSHLQ_R_U "vqshl")
 (VQSHLQ_S "vqshl") (VQSHLQ_U "vqshl")
+(VQSHLUQ_M_N_S "vqshlu")
+(VQSHLUQ_N_S "vqshlu")
 (VQSHRNBQ_M_N_S "vqshrnb") (VQSHRNBQ_M_N_U "vqshrnb")
 (VQSHRNBQ_N_S "vqshrnb") (VQSHRNBQ_N_U "vqshrnb")
 (VQSHRNTQ_M_N_S "vqshrnt") (VQSHRNTQ_M_N_U "vqshrnt")
@@ -2490,6 +2492,8 @@ (define_int_attr supf [(VCVTQ_TO_F_S "s") (VCVTQ_TO_F_U 
"u") (VREV16Q_S "s")
   (VRMLSLDAVHAXQ_P_S "s")
   (VRMLSLDAVHAXQ_S "s")
   (VRMLALDAVHAQ_P_S "s") (VRMLALDAVHAQ_P_U "u")
+  (VQSHLUQ_M_N_S "s")
+  (VQSHLUQ_N_S "s")
   ])
 
 ;; Both kinds of return insn.
@@ -2793,6 +2797,8 @@ (define_int_iterator VADCQ_M [VADCQ_M_U VADCQ_M_S])
 (define_int_iterator UQRSHLLQ [UQRSHLL_64 UQRSHLL_48])
 (define_int_iterator SQRSHRLQ [SQRSHRL_64 SQRSHRL_48])
 (define_int_iterator VSHLCQ_M [VSHLCQ_M_S VSHLCQ_M_U])
+(define_int_iterator VQSHLUQ_M_N [VQSHLUQ_M_N_S])
+(define_int_iterator VQSHLUQ_N [VQSHLUQ_N_S])
 
 ;; Define iterators for VCMLA operations
 (define_int_iterator VCMLA_OP [UNSPEC_VCMLA
diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index b4faf7a4b18..7898361b859 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
@@ -1150,15 +1150,15 @@ (define_insn "@mve_q_r_"
 ;;
 ;; [vqshluq_n_s])
 ;;
-(define_insn "mve_vqshluq_n_s"
+(define_insn "@mve_q_n_"
   [
(set (match_operand:MVE_2 0 "s_register_operand" "=w")
(unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "w")
   (match_operand:SI 2 "" "")]
-VQSHLUQ_N_S))
+VQSHLUQ_N))
   ]
   "TARGET_HAVE_MVE"
-  "vqshlu.s%#\t%q0, %q1, %2"
+  ".%#\t%q0, %q1, %2"
   [(set_attr "type" "mve_move")
 ])
 
@@ -2653,17 +2653,17 @@ (define_insn "@mve_q_p_"
 ;;
 ;; [vqshluq_m_n_s])
 ;;
-(define_insn "mve_vqshluq_m_n_s"
+(define_insn "@mve_q_m_n_"
   [
(set (match_operand:MVE_2 0 "s_register_operand" "=w")
(unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0")
   (match_operand:MVE_2 2 "s_register_operand" "w")
   (match_operand:SI 3 "" "")
   (match_operand: 4 "vpr_register_operand" 
"Up")]
-VQSHLUQ_M_N_S))
+VQSHLUQ_M_N))
   ]
   "TARGET_HAVE_MVE"
-  "vpst\n\tvqshlut.s%#\t%q0, %q2, %3"
+  "vpst\n\tt.%#\t%q0, %q2, %3"
   [(set_attr "type" "mve_move")
(set_attr "length" "8")])
 
-- 
2.34.1



[PATCH 06/26] arm: [MVE intrinsics] add binary_lshift_unsigned shape

2023-05-12 Thread Christophe Lyon via Gcc-patches
This patch adds the binary_lshift_unsigned shape description.

2022-12-12  Christophe Lyon  

gcc/
* config/arm/arm-mve-builtins-shapes.cc
(binary_lshift_unsigned): New.
* config/arm/arm-mve-builtins-shapes.h
(binary_lshift_unsigned): New.
---
 gcc/config/arm/arm-mve-builtins-shapes.cc | 58 +++
 gcc/config/arm/arm-mve-builtins-shapes.h  |  1 +
 2 files changed, 59 insertions(+)

diff --git a/gcc/config/arm/arm-mve-builtins-shapes.cc 
b/gcc/config/arm/arm-mve-builtins-shapes.cc
index ee4bc3f8ea4..91540838e03 100644
--- a/gcc/config/arm/arm-mve-builtins-shapes.cc
+++ b/gcc/config/arm/arm-mve-builtins-shapes.cc
@@ -526,6 +526,64 @@ struct binary_rshift_def : public overloaded_base<0>
 SHAPE (binary_rshift)
 
 
+/* _t vfoo[_n_t0](_t, int)
+
+   Shape for vector saturating shift left operations that take a
+   vector of signed elements as first argument and an integer, and
+   produce a vector of unsigned elements.
+
+   Check that 'imm' is in the [0..#bits-1] range.
+
+   Example: vqshluq.
+   uint16x8_t [__arm_]vqshluq[_n_s16](int16x8_t a, const int imm)
+   uint16x8_t [__arm_]vqshluq_m[_n_s16](uint16x8_t inactive, int16x8_t a, 
const int imm, mve_pred16_t p)  */
+struct binary_lshift_unsigned_def : public overloaded_base<0>
+{
+  void
+  build (function_builder &b, const function_group_info &group,
+bool preserve_user_namespace) const override
+  {
+b.add_overloaded_functions (group, MODE_n, preserve_user_namespace);
+build_all (b, "vu0,vs0,ss32", group, MODE_n, preserve_user_namespace);
+  }
+
+  tree
+  resolve (function_resolver &r) const override
+  {
+unsigned int i, nargs;
+type_suffix_index type;
+if (!r.check_gp_argument (2, i, nargs)
+   || (type = r.infer_vector_type (i-1)) == NUM_TYPE_SUFFIXES)
+  return error_mark_node;
+
+if (r.pred == PRED_m)
+  {
+   /* With PRED_m, check that the 'inactive' first argument has
+  the expected unsigned type.  */
+   type_suffix_index return_type
+ = find_type_suffix (TYPE_unsigned, type_suffixes[type].element_bits);
+
+   if (!r.require_matching_vector_type (0, return_type))
+ return error_mark_node;
+  }
+
+for (; i < nargs; ++i)
+  if (!r.require_integer_immediate (i))
+   return error_mark_node;
+
+return r.resolve_to (r.mode_suffix_id, type);
+  }
+
+  bool
+  check (function_checker &c) const override
+  {
+unsigned int bits = c.type_suffix (0).element_bits;
+return c.require_immediate_range (1, 0, bits - 1);
+  }
+
+};
+SHAPE (binary_lshift_unsigned)
+
 /* _t vfoo[_t0](_t, _t)
 
i.e. binary operations that take a vector of unsigned elements as first 
argument and a
diff --git a/gcc/config/arm/arm-mve-builtins-shapes.h 
b/gcc/config/arm/arm-mve-builtins-shapes.h
index 07b12b4af68..6ae1443f26b 100644
--- a/gcc/config/arm/arm-mve-builtins-shapes.h
+++ b/gcc/config/arm/arm-mve-builtins-shapes.h
@@ -41,6 +41,7 @@ namespace arm_mve
 extern const function_shape *const binary_acc_int64;
 extern const function_shape *const binary_acca_int32;
 extern const function_shape *const binary_acca_int64;
+extern const function_shape *const binary_lshift_unsigned;
 extern const function_shape *const binary_maxamina;
 extern const function_shape *const binary_maxavminav;
 extern const function_shape *const binary_maxvminv;
-- 
2.34.1



[PATCH 05/26] arm: [MVE intrinsics] rework vrmlaldavhaq vrmlaldavhaxq vrmlsldavhaq vrmlsldavhaxq

2023-05-12 Thread Christophe Lyon via Gcc-patches
Implement vrmlaldavhaq, vrmlaldavhaxq, vrmlsldavhaq, vrmlsldavhaxq
using the new MVE builtins framework.

2022-12-12  Christophe Lyon  

gcc/
* config/arm/arm-mve-builtins-base.cc (vrmlaldavhaq)
(vrmlaldavhaxq, vrmlsldavhaq, vrmlsldavhaxq): New.
* config/arm/arm-mve-builtins-base.def (vrmlaldavhaq)
(vrmlaldavhaxq, vrmlsldavhaq, vrmlsldavhaxq): New.
* config/arm/arm-mve-builtins-base.h (vrmlaldavhaq)
(vrmlaldavhaxq, vrmlsldavhaq, vrmlsldavhaxq): New.
* config/arm/arm-mve-builtins-functions.h: Handle vrmlaldavhaq,
vrmlaldavhaxq, vrmlsldavhaq, vrmlsldavhaxq.
* config/arm/arm_mve.h (vrmlaldavhaq): Remove.
(vrmlaldavhaxq): Remove.
(vrmlsldavhaq): Remove.
(vrmlsldavhaxq): Remove.
(vrmlaldavhaq_p): Remove.
(vrmlaldavhaxq_p): Remove.
(vrmlsldavhaq_p): Remove.
(vrmlsldavhaxq_p): Remove.
(vrmlaldavhaq_s32): Remove.
(vrmlaldavhaq_u32): Remove.
(vrmlaldavhaxq_s32): Remove.
(vrmlsldavhaq_s32): Remove.
(vrmlsldavhaxq_s32): Remove.
(vrmlaldavhaq_p_s32): Remove.
(vrmlaldavhaq_p_u32): Remove.
(vrmlaldavhaxq_p_s32): Remove.
(vrmlsldavhaq_p_s32): Remove.
(vrmlsldavhaxq_p_s32): Remove.
(__arm_vrmlaldavhaq_s32): Remove.
(__arm_vrmlaldavhaq_u32): Remove.
(__arm_vrmlaldavhaxq_s32): Remove.
(__arm_vrmlsldavhaq_s32): Remove.
(__arm_vrmlsldavhaxq_s32): Remove.
(__arm_vrmlaldavhaq_p_s32): Remove.
(__arm_vrmlaldavhaq_p_u32): Remove.
(__arm_vrmlaldavhaxq_p_s32): Remove.
(__arm_vrmlsldavhaq_p_s32): Remove.
(__arm_vrmlsldavhaxq_p_s32): Remove.
(__arm_vrmlaldavhaq): Remove.
(__arm_vrmlaldavhaxq): Remove.
(__arm_vrmlsldavhaq): Remove.
(__arm_vrmlsldavhaxq): Remove.
(__arm_vrmlaldavhaq_p): Remove.
(__arm_vrmlaldavhaxq_p): Remove.
(__arm_vrmlsldavhaq_p): Remove.
(__arm_vrmlsldavhaxq_p): Remove.
---
 gcc/config/arm/arm-mve-builtins-base.cc |   4 +
 gcc/config/arm/arm-mve-builtins-base.def|   4 +
 gcc/config/arm/arm-mve-builtins-base.h  |   4 +
 gcc/config/arm/arm-mve-builtins-functions.h |   4 +
 gcc/config/arm/arm_mve.h| 184 
 5 files changed, 16 insertions(+), 184 deletions(-)

diff --git a/gcc/config/arm/arm-mve-builtins-base.cc 
b/gcc/config/arm/arm-mve-builtins-base.cc
index 5ecc61ebf03..a2b227bb2aa 100644
--- a/gcc/config/arm/arm-mve-builtins-base.cc
+++ b/gcc/config/arm/arm-mve-builtins-base.cc
@@ -346,8 +346,12 @@ FUNCTION_WITHOUT_N_NO_F (vrev16q, VREV16Q)
 FUNCTION_WITHOUT_N (vrev32q, VREV32Q)
 FUNCTION_WITHOUT_N (vrev64q, VREV64Q)
 FUNCTION_WITHOUT_N_NO_F (vrhaddq, VRHADDQ)
+FUNCTION_PRED_P_S_U (vrmlaldavhaq, VRMLALDAVHAQ)
+FUNCTION_PRED_P_S (vrmlaldavhaxq, VRMLALDAVHAXQ)
 FUNCTION_PRED_P_S_U (vrmlaldavhq, VRMLALDAVHQ)
 FUNCTION_PRED_P_S (vrmlaldavhxq, VRMLALDAVHXQ)
+FUNCTION_PRED_P_S (vrmlsldavhaq, VRMLSLDAVHAQ)
+FUNCTION_PRED_P_S (vrmlsldavhaxq, VRMLSLDAVHAXQ)
 FUNCTION_PRED_P_S (vrmlsldavhq, VRMLSLDAVHQ)
 FUNCTION_PRED_P_S (vrmlsldavhxq, VRMLSLDAVHXQ)
 FUNCTION_WITHOUT_N_NO_F (vrmulhq, VRMULHQ)
diff --git a/gcc/config/arm/arm-mve-builtins-base.def 
b/gcc/config/arm/arm-mve-builtins-base.def
index 19cfd9933c0..c4ef74169dd 100644
--- a/gcc/config/arm/arm-mve-builtins-base.def
+++ b/gcc/config/arm/arm-mve-builtins-base.def
@@ -116,8 +116,12 @@ DEF_MVE_FUNCTION (vrev16q, unary, integer_8, mx_or_none)
 DEF_MVE_FUNCTION (vrev32q, unary, integer_8_16, mx_or_none)
 DEF_MVE_FUNCTION (vrev64q, unary, all_integer, mx_or_none)
 DEF_MVE_FUNCTION (vrhaddq, binary, all_integer, mx_or_none)
+DEF_MVE_FUNCTION (vrmlaldavhaq, binary_acca_int64, integer_32, p_or_none)
+DEF_MVE_FUNCTION (vrmlaldavhaxq, binary_acca_int64, integer_32, p_or_none)
 DEF_MVE_FUNCTION (vrmlaldavhq, binary_acc_int64, integer_32, p_or_none)
 DEF_MVE_FUNCTION (vrmlaldavhxq, binary_acc_int64, signed_32, p_or_none)
+DEF_MVE_FUNCTION (vrmlsldavhaq, binary_acca_int64, integer_32, p_or_none)
+DEF_MVE_FUNCTION (vrmlsldavhaxq, binary_acca_int64, integer_32, p_or_none)
 DEF_MVE_FUNCTION (vrmlsldavhq, binary_acc_int64, signed_32, p_or_none)
 DEF_MVE_FUNCTION (vrmlsldavhxq, binary_acc_int64, signed_32, p_or_none)
 DEF_MVE_FUNCTION (vrmulhq, binary, all_integer, mx_or_none)
diff --git a/gcc/config/arm/arm-mve-builtins-base.h 
b/gcc/config/arm/arm-mve-builtins-base.h
index ca66b3b8caf..41b2e19c2d7 100644
--- a/gcc/config/arm/arm-mve-builtins-base.h
+++ b/gcc/config/arm/arm-mve-builtins-base.h
@@ -128,8 +128,12 @@ extern const function_base *const vrev16q;
 extern const function_base *const vrev32q;
 extern const function_base *const vrev64q;
 extern const function_base *const vrhaddq;
+extern const function_base *const vrmlaldavhaq;
+extern const function_base *const vrmlaldavhaxq;
 extern const function_base *const vrmlaldavhq;
 extern const function_base *const vrmlaldavhxq;
+exter

[PATCH 03/26] arm: [MVE intrinsics] rework vqdmullbq vqdmulltq

2023-05-12 Thread Christophe Lyon via Gcc-patches
Implement vqdmullbq, vqdmulltq using the new MVE builtins framework.

2022-12-12  Christophe Lyon  

gcc/
* config/arm/arm-mve-builtins-base.cc (vqdmullbq, vqdmulltq): New.
* config/arm/arm-mve-builtins-base.def (vqdmullbq, vqdmulltq):
New.
* config/arm/arm-mve-builtins-base.h (vqdmullbq, vqdmulltq): New.
* config/arm/arm_mve.h (vqdmulltq): Remove.
(vqdmullbq): Remove.
(vqdmullbq_m): Remove.
(vqdmulltq_m): Remove.
(vqdmulltq_s16): Remove.
(vqdmulltq_n_s16): Remove.
(vqdmullbq_s16): Remove.
(vqdmullbq_n_s16): Remove.
(vqdmulltq_s32): Remove.
(vqdmulltq_n_s32): Remove.
(vqdmullbq_s32): Remove.
(vqdmullbq_n_s32): Remove.
(vqdmullbq_m_n_s32): Remove.
(vqdmullbq_m_n_s16): Remove.
(vqdmullbq_m_s32): Remove.
(vqdmullbq_m_s16): Remove.
(vqdmulltq_m_n_s32): Remove.
(vqdmulltq_m_n_s16): Remove.
(vqdmulltq_m_s32): Remove.
(vqdmulltq_m_s16): Remove.
(__arm_vqdmulltq_s16): Remove.
(__arm_vqdmulltq_n_s16): Remove.
(__arm_vqdmullbq_s16): Remove.
(__arm_vqdmullbq_n_s16): Remove.
(__arm_vqdmulltq_s32): Remove.
(__arm_vqdmulltq_n_s32): Remove.
(__arm_vqdmullbq_s32): Remove.
(__arm_vqdmullbq_n_s32): Remove.
(__arm_vqdmullbq_m_n_s32): Remove.
(__arm_vqdmullbq_m_n_s16): Remove.
(__arm_vqdmullbq_m_s32): Remove.
(__arm_vqdmullbq_m_s16): Remove.
(__arm_vqdmulltq_m_n_s32): Remove.
(__arm_vqdmulltq_m_n_s16): Remove.
(__arm_vqdmulltq_m_s32): Remove.
(__arm_vqdmulltq_m_s16): Remove.
(__arm_vqdmulltq): Remove.
(__arm_vqdmullbq): Remove.
(__arm_vqdmullbq_m): Remove.
(__arm_vqdmulltq_m): Remove.
---
 gcc/config/arm/arm-mve-builtins-base.cc  |   2 +
 gcc/config/arm/arm-mve-builtins-base.def |   2 +
 gcc/config/arm/arm-mve-builtins-base.h   |   2 +
 gcc/config/arm/arm_mve.h | 294 ---
 4 files changed, 6 insertions(+), 294 deletions(-)

diff --git a/gcc/config/arm/arm-mve-builtins-base.cc 
b/gcc/config/arm/arm-mve-builtins-base.cc
index ca2fb67a07c..5ecc61ebf03 100644
--- a/gcc/config/arm/arm-mve-builtins-base.cc
+++ b/gcc/config/arm/arm-mve-builtins-base.cc
@@ -320,6 +320,8 @@ FUNCTION_ONLY_N_NO_U_F (vqdmlashq, VQDMLASHQ)
 FUNCTION_WITHOUT_N_NO_U_F (vqdmlsdhq, VQDMLSDHQ)
 FUNCTION_WITHOUT_N_NO_U_F (vqdmlsdhxq, VQDMLSDHXQ)
 FUNCTION_WITH_M_N_NO_U_F (vqdmulhq, VQDMULHQ)
+FUNCTION_WITH_M_N_NO_U_F (vqdmullbq, VQDMULLBQ)
+FUNCTION_WITH_M_N_NO_U_F (vqdmulltq, VQDMULLTQ)
 FUNCTION_WITHOUT_N_NO_U_F (vqrdmladhq, VQRDMLADHQ)
 FUNCTION_WITHOUT_N_NO_U_F (vqrdmladhxq, VQRDMLADHXQ)
 FUNCTION_ONLY_N_NO_U_F (vqrdmlahq, VQRDMLAHQ)
diff --git a/gcc/config/arm/arm-mve-builtins-base.def 
b/gcc/config/arm/arm-mve-builtins-base.def
index 601384d5a95..19cfd9933c0 100644
--- a/gcc/config/arm/arm-mve-builtins-base.def
+++ b/gcc/config/arm/arm-mve-builtins-base.def
@@ -85,6 +85,8 @@ DEF_MVE_FUNCTION (vqdmlashq, ternary_n, all_signed, m_or_none)
 DEF_MVE_FUNCTION (vqdmlsdhq, ternary, all_signed, m_or_none)
 DEF_MVE_FUNCTION (vqdmlsdhxq, ternary, all_signed, m_or_none)
 DEF_MVE_FUNCTION (vqdmulhq, binary_opt_n, all_signed, m_or_none)
+DEF_MVE_FUNCTION (vqdmullbq, binary_widen_opt_n, signed_16_32, m_or_none)
+DEF_MVE_FUNCTION (vqdmulltq, binary_widen_opt_n, signed_16_32, m_or_none)
 DEF_MVE_FUNCTION (vqmovnbq, binary_move_narrow, integer_16_32, m_or_none)
 DEF_MVE_FUNCTION (vqmovntq, binary_move_narrow, integer_16_32, m_or_none)
 DEF_MVE_FUNCTION (vqmovunbq, binary_move_narrow_unsigned, signed_16_32, 
m_or_none)
diff --git a/gcc/config/arm/arm-mve-builtins-base.h 
b/gcc/config/arm/arm-mve-builtins-base.h
index 574ed97a4b3..ca66b3b8caf 100644
--- a/gcc/config/arm/arm-mve-builtins-base.h
+++ b/gcc/config/arm/arm-mve-builtins-base.h
@@ -98,6 +98,8 @@ extern const function_base *const vqdmlashq;
 extern const function_base *const vqdmlsdhq;
 extern const function_base *const vqdmlsdhxq;
 extern const function_base *const vqdmulhq;
+extern const function_base *const vqdmullbq;
+extern const function_base *const vqdmulltq;
 extern const function_base *const vqmovnbq;
 extern const function_base *const vqmovntq;
 extern const function_base *const vqmovunbq;
diff --git a/gcc/config/arm/arm_mve.h b/gcc/config/arm/arm_mve.h
index 09b9564ed48..e0025f017ca 100644
--- a/gcc/config/arm/arm_mve.h
+++ b/gcc/config/arm/arm_mve.h
@@ -55,8 +55,6 @@
 #define vhcaddq_rot270(__a, __b) __arm_vhcaddq_rot270(__a, __b)
 #define vmulltq_poly(__a, __b) __arm_vmulltq_poly(__a, __b)
 #define vmullbq_poly(__a, __b) __arm_vmullbq_poly(__a, __b)
-#define vqdmulltq(__a, __b) __arm_vqdmulltq(__a, __b)
-#define vqdmullbq(__a, __b) __arm_vqdmullbq(__a, __b)
 #define vbicq_m_n(__a, __imm, __p) __arm_vbicq_m_n(__a, __imm, __p)
 #define vrmlaldavhaq(__a, __b, __c) __arm_vrmlaldavhaq(__a, __b, __c)
 #define vshlcq(__a,

[PATCH 04/26] arm: [MVE intrinsics] factorize vrmlaldavhaq vrmlaldavhaxq vrmlsldavhaq vrmlsldavhaxq

2023-05-12 Thread Christophe Lyon via Gcc-patches
Factorize vrmlaldavhaq, vrmlaldavhaxq, vrmlsldavhaq, vrmlsldavhaxq
builtins so that they use the same parameterized names.

2022-12-12  Christophe Lyon  

gcc/
* config/arm/iterators.md (MVE_VRMLxLDAVHAxQ)
(MVE_VRMLxLDAVHAxQ_P): New.
(mve_insn): Add vrmlaldavha, vrmlaldavhax, vrmlsldavha,
vrmlsldavhax.
(supf): Add VRMLALDAVHAXQ_P_S, VRMLALDAVHAXQ_S, VRMLSLDAVHAQ_P_S,
VRMLSLDAVHAQ_S, VRMLSLDAVHAXQ_P_S, VRMLSLDAVHAXQ_S,
VRMLALDAVHAQ_P_S.
* config/arm/mve.md (mve_vrmlaldavhaq_v4si)
(mve_vrmlaldavhaxq_sv4si, mve_vrmlsldavhaxq_sv4si)
(mve_vrmlsldavhaq_sv4si): Merge into ...
(@mve_q_v4si): ... this.
(mve_vrmlaldavhaq_p_sv4si, mve_vrmlaldavhaq_p_uv4si)
(mve_vrmlaldavhaxq_p_sv4si, mve_vrmlsldavhaq_p_sv4si)
(mve_vrmlsldavhaxq_p_sv4si): Merge into ...
(@mve_q_p_v4si): ... this.
---
 gcc/config/arm/iterators.md |  29 
 gcc/config/arm/mve.md   | 140 
 2 files changed, 44 insertions(+), 125 deletions(-)

diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index f88da604c19..116dd95fd88 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -809,6 +809,20 @@ (define_int_iterator MVE_VRMLxLDAVHxQ_P [
 VRMLSLDAVHXQ_P_S
 ])
 
+(define_int_iterator MVE_VRMLxLDAVHAxQ [
+VRMLALDAVHAQ_S VRMLALDAVHAQ_U
+VRMLALDAVHAXQ_S
+VRMLSLDAVHAQ_S
+VRMLSLDAVHAXQ_S
+])
+
+(define_int_iterator MVE_VRMLxLDAVHAxQ_P [
+VRMLALDAVHAQ_P_S VRMLALDAVHAQ_P_U
+VRMLALDAVHAXQ_P_S
+VRMLSLDAVHAQ_P_S
+VRMLSLDAVHAXQ_P_S
+])
+
 (define_int_iterator MVE_MOVN [
 VMOVNBQ_S VMOVNBQ_U
 VMOVNTQ_S VMOVNTQ_U
@@ -1077,10 +1091,18 @@ (define_int_attr mve_insn [
 (VREV64Q_S "vrev64") (VREV64Q_U "vrev64") (VREV64Q_F "vrev64")
 (VRHADDQ_M_S "vrhadd") (VRHADDQ_M_U "vrhadd")
 (VRHADDQ_S "vrhadd") (VRHADDQ_U "vrhadd")
+(VRMLALDAVHAQ_P_S "vrmlaldavha") (VRMLALDAVHAQ_P_U 
"vrmlaldavha")
+(VRMLALDAVHAQ_S "vrmlaldavha") (VRMLALDAVHAQ_U "vrmlaldavha")
+(VRMLALDAVHAXQ_P_S "vrmlaldavhax")
+(VRMLALDAVHAXQ_S "vrmlaldavhax")
 (VRMLALDAVHQ_P_S "vrmlaldavh") (VRMLALDAVHQ_P_U "vrmlaldavh")
 (VRMLALDAVHQ_S "vrmlaldavh") (VRMLALDAVHQ_U "vrmlaldavh")
 (VRMLALDAVHXQ_P_S "vrmlaldavhx")
 (VRMLALDAVHXQ_S "vrmlaldavhx")
+(VRMLSLDAVHAQ_P_S "vrmlsldavha")
+(VRMLSLDAVHAQ_S "vrmlsldavha")
+(VRMLSLDAVHAXQ_P_S "vrmlsldavhax")
+(VRMLSLDAVHAXQ_S "vrmlsldavhax")
 (VRMLSLDAVHQ_P_S "vrmlsldavh")
 (VRMLSLDAVHQ_S "vrmlsldavh")
 (VRMLSLDAVHXQ_P_S "vrmlsldavhx")
@@ -2461,6 +2483,13 @@ (define_int_attr supf [(VCVTQ_TO_F_S "s") (VCVTQ_TO_F_U 
"u") (VREV16Q_S "s")
   (VQDMULLTQ_M_S "s")
   (VQDMULLTQ_M_N_S "s")
   (VQDMULLTQ_N_S "s")
+  (VRMLALDAVHAXQ_P_S "s")
+  (VRMLALDAVHAXQ_S "s")
+  (VRMLSLDAVHAQ_P_S "s")
+  (VRMLSLDAVHAQ_S "s")
+  (VRMLSLDAVHAXQ_P_S "s")
+  (VRMLSLDAVHAXQ_S "s")
+  (VRMLALDAVHAQ_P_S "s") (VRMLALDAVHAQ_P_U "u")
   ])
 
 ;; Both kinds of return insn.
diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index e75a30b7ed4..b4faf7a4b18 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
@@ -1677,18 +1677,21 @@ (define_insn "@mve_q_n_"
 ])
 
 ;;
-;; [vrmlaldavhaq_s vrmlaldavhaq_u])
+;; [vrmlaldavhaq_s vrmlaldavhaq_u]
+;; [vrmlaldavhaxq_s]
+;; [vrmlsldavhaq_s]
+;; [vrmlsldavhaxq_s]
 ;;
-(define_insn "mve_vrmlaldavhaq_v4si"
+(define_insn "@mve_q_v4si"
   [
(set (match_operand:DI 0 "s_register_operand" "=r")
(unspec:DI [(match_operand:DI 1 "s_register_operand" "0")
(match_operand:V4SI 2 "s_register_operand" "w")
(match_operand:V4SI 3 "s_register_operand" "w")]
-VRMLALDAVHAQ))
+MVE_VRMLxLDAVHAxQ))
   ]
   "TARGET_HAVE_MVE"
-  "vrmlaldavha.32\t%Q0, %R0, %q2, %q3"
+  ".32\t%Q0, %R0, %q2, %q3"
   [(set_attr "type" "mve_move")
 ])
 
@@ -2514,22 +2517,6 @@ (define_insn "@mve_q_m_f"
   [(set_attr "type" "mve_move")
(set_attr "length""8")])
 
-;;
-;; [vrmlaldavhaxq_s])
-;;
-(define_insn "mve_vrmlaldavhaxq_sv4si"
-  [
-   (set (match_operand:DI 0 "s_register_operand" "=r")
-   (unspec:DI [(match_operand:DI 1 "s_register_operand" "0")
-  (match_operand:V4SI 2 "s_register_operand" "w")
-  (mat

[PATCH 11/26] arm: [MVE intrinsics] rework vbrsrq

2023-05-12 Thread Christophe Lyon via Gcc-patches
Implement vbrsrq using the new MVE builtins framework.

2022-12-12  Christophe Lyon  

gcc/
* config/arm/arm-mve-builtins-base.cc (vbrsrq): New.
* config/arm/arm-mve-builtins-base.def (vbrsrq): New.
* config/arm/arm-mve-builtins-base.h (vbrsrq): New.
* config/arm/arm_mve.h (vbrsrq): Remove.
(vbrsrq_m): Remove.
(vbrsrq_x): Remove.
(vbrsrq_n_f16): Remove.
(vbrsrq_n_f32): Remove.
(vbrsrq_n_u8): Remove.
(vbrsrq_n_s8): Remove.
(vbrsrq_n_u16): Remove.
(vbrsrq_n_s16): Remove.
(vbrsrq_n_u32): Remove.
(vbrsrq_n_s32): Remove.
(vbrsrq_m_n_s8): Remove.
(vbrsrq_m_n_s32): Remove.
(vbrsrq_m_n_s16): Remove.
(vbrsrq_m_n_u8): Remove.
(vbrsrq_m_n_u32): Remove.
(vbrsrq_m_n_u16): Remove.
(vbrsrq_m_n_f32): Remove.
(vbrsrq_m_n_f16): Remove.
(vbrsrq_x_n_s8): Remove.
(vbrsrq_x_n_s16): Remove.
(vbrsrq_x_n_s32): Remove.
(vbrsrq_x_n_u8): Remove.
(vbrsrq_x_n_u16): Remove.
(vbrsrq_x_n_u32): Remove.
(vbrsrq_x_n_f16): Remove.
(vbrsrq_x_n_f32): Remove.
(__arm_vbrsrq_n_u8): Remove.
(__arm_vbrsrq_n_s8): Remove.
(__arm_vbrsrq_n_u16): Remove.
(__arm_vbrsrq_n_s16): Remove.
(__arm_vbrsrq_n_u32): Remove.
(__arm_vbrsrq_n_s32): Remove.
(__arm_vbrsrq_m_n_s8): Remove.
(__arm_vbrsrq_m_n_s32): Remove.
(__arm_vbrsrq_m_n_s16): Remove.
(__arm_vbrsrq_m_n_u8): Remove.
(__arm_vbrsrq_m_n_u32): Remove.
(__arm_vbrsrq_m_n_u16): Remove.
(__arm_vbrsrq_x_n_s8): Remove.
(__arm_vbrsrq_x_n_s16): Remove.
(__arm_vbrsrq_x_n_s32): Remove.
(__arm_vbrsrq_x_n_u8): Remove.
(__arm_vbrsrq_x_n_u16): Remove.
(__arm_vbrsrq_x_n_u32): Remove.
(__arm_vbrsrq_n_f16): Remove.
(__arm_vbrsrq_n_f32): Remove.
(__arm_vbrsrq_m_n_f32): Remove.
(__arm_vbrsrq_m_n_f16): Remove.
(__arm_vbrsrq_x_n_f16): Remove.
(__arm_vbrsrq_x_n_f32): Remove.
(__arm_vbrsrq): Remove.
(__arm_vbrsrq_m): Remove.
(__arm_vbrsrq_x): Remove.
---
 gcc/config/arm/arm-mve-builtins-base.cc  |   1 +
 gcc/config/arm/arm-mve-builtins-base.def |   2 +
 gcc/config/arm/arm-mve-builtins-base.h   |   1 +
 gcc/config/arm/arm_mve.h | 426 ---
 4 files changed, 4 insertions(+), 426 deletions(-)

diff --git a/gcc/config/arm/arm-mve-builtins-base.cc 
b/gcc/config/arm/arm-mve-builtins-base.cc
index 739ab604843..2fb81c197da 100644
--- a/gcc/config/arm/arm-mve-builtins-base.cc
+++ b/gcc/config/arm/arm-mve-builtins-base.cc
@@ -250,6 +250,7 @@ FUNCTION_PRED_P_S_U (vaddlvq, VADDLVQ)
 FUNCTION_PRED_P_S_U (vaddvq, VADDVQ)
 FUNCTION_PRED_P_S_U (vaddvaq, VADDVAQ)
 FUNCTION_WITH_RTX_M (vandq, AND, VANDQ)
+FUNCTION_ONLY_N (vbrsrq, VBRSRQ)
 FUNCTION_WITHOUT_N_NO_U_F (vclsq, VCLSQ)
 FUNCTION (vclzq, unspec_based_mve_function_exact_insn, (CLZ, CLZ, CLZ, -1, -1, 
-1, VCLZQ_M_S, VCLZQ_M_U, -1, -1, -1 ,-1))
 FUNCTION (vcmpeqq, unspec_based_mve_function_exact_insn_vcmp, (EQ, EQ, EQ, 
VCMPEQQ_M_S, VCMPEQQ_M_U, VCMPEQQ_M_F, VCMPEQQ_M_N_S, VCMPEQQ_M_N_U, 
VCMPEQQ_M_N_F))
diff --git a/gcc/config/arm/arm-mve-builtins-base.def 
b/gcc/config/arm/arm-mve-builtins-base.def
index 3f7bb414e40..e53cb2c1992 100644
--- a/gcc/config/arm/arm-mve-builtins-base.def
+++ b/gcc/config/arm/arm-mve-builtins-base.def
@@ -27,6 +27,7 @@ DEF_MVE_FUNCTION (vaddq, binary_opt_n, all_integer, 
mx_or_none)
 DEF_MVE_FUNCTION (vaddvaq, unary_int32_acc, all_integer, p_or_none)
 DEF_MVE_FUNCTION (vaddvq, unary_int32, all_integer, p_or_none)
 DEF_MVE_FUNCTION (vandq, binary, all_integer, mx_or_none)
+DEF_MVE_FUNCTION (vbrsrq, binary_imm32, all_integer, mx_or_none)
 DEF_MVE_FUNCTION (vclsq, unary, all_signed, mx_or_none)
 DEF_MVE_FUNCTION (vclzq, unary, all_integer, mx_or_none)
 DEF_MVE_FUNCTION (vcmpcsq, cmp, all_unsigned, m_or_none)
@@ -146,6 +147,7 @@ DEF_MVE_FUNCTION (vabdq, binary, all_float, mx_or_none)
 DEF_MVE_FUNCTION (vabsq, unary, all_float, mx_or_none)
 DEF_MVE_FUNCTION (vaddq, binary_opt_n, all_float, mx_or_none)
 DEF_MVE_FUNCTION (vandq, binary, all_float, mx_or_none)
+DEF_MVE_FUNCTION (vbrsrq, binary_imm32, all_float, mx_or_none)
 DEF_MVE_FUNCTION (vcmpeqq, cmp, all_float, m_or_none)
 DEF_MVE_FUNCTION (vcmpgeq, cmp, all_float, m_or_none)
 DEF_MVE_FUNCTION (vcmpgtq, cmp, all_float, m_or_none)
diff --git a/gcc/config/arm/arm-mve-builtins-base.h 
b/gcc/config/arm/arm-mve-builtins-base.h
index 797f8ba2f5e..49c60536961 100644
--- a/gcc/config/arm/arm-mve-builtins-base.h
+++ b/gcc/config/arm/arm-mve-builtins-base.h
@@ -32,6 +32,7 @@ extern const function_base *const vaddq;
 extern const function_base *const vaddvaq;
 extern const function_base *const vaddvq;
 extern const function_base *const vandq;
+extern const function_base *const vbrsrq;
 extern const function_base *const vclsq;
 extern const

[PATCH 02/26] arm: [MVE intrinsics] factorize vqdmullbq vqdmulltq

2023-05-12 Thread Christophe Lyon via Gcc-patches
Factorize vqdmullbq, vqdmulltq builtins so that they use the same
parameterized names.

2022-12-12  Christophe Lyon  

gcc/
* config/arm/iterators.md (MVE_VQDMULLxQ, MVE_VQDMULLxQ_M)
(MVE_VQDMULLxQ_M_N, MVE_VQDMULLxQ_N): New.
(mve_insn): Add vqdmullb, vqdmullt.
(supf): Add VQDMULLBQ_S, VQDMULLBQ_M_S, VQDMULLBQ_M_N_S,
VQDMULLBQ_N_S, VQDMULLTQ_S, VQDMULLTQ_M_S, VQDMULLTQ_M_N_S,
VQDMULLTQ_N_S.
* config/arm/mve.md (mve_vqdmullbq_n_s<mode>)
(mve_vqdmulltq_n_s<mode>): Merge into ...
(@mve_<mve_insn>q_n_<supf><mode>): ... this.
(mve_vqdmullbq_s<mode>, mve_vqdmulltq_s<mode>): Merge into ...
(@mve_<mve_insn>q_<supf><mode>): ... this.
(mve_vqdmullbq_m_n_s<mode>, mve_vqdmulltq_m_n_s<mode>): Merge into
...
(@mve_<mve_insn>q_m_n_<supf><mode>): ... this.
(mve_vqdmullbq_m_s<mode>, mve_vqdmulltq_m_s<mode>): Merge into ...
(@mve_<mve_insn>q_m_<supf><mode>): ... this.
---
 gcc/config/arm/iterators.md |  36 +
 gcc/config/arm/mve.md   | 100 
 2 files changed, 56 insertions(+), 80 deletions(-)

diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index abd904da11e..f88da604c19 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -764,6 +764,26 @@ (define_int_iterator MVE_VMLxLDAVAxQ_P [
 VMLSLDAVAXQ_P_S
 ])
 
+(define_int_iterator MVE_VQDMULLxQ [
+VQDMULLBQ_S
+VQDMULLTQ_S
+])
+
+(define_int_iterator MVE_VQDMULLxQ_M [
+VQDMULLBQ_M_S
+VQDMULLTQ_M_S
+])
+
+(define_int_iterator MVE_VQDMULLxQ_M_N [
+VQDMULLBQ_M_N_S
+VQDMULLTQ_M_N_S
+])
+
+(define_int_iterator MVE_VQDMULLxQ_N [
+VQDMULLBQ_N_S
+VQDMULLTQ_N_S
+])
+
 (define_int_iterator MVE_VQxDMLxDHxQ_S [
 VQDMLADHQ_S
 VQDMLADHXQ_S
@@ -985,6 +1005,14 @@ (define_int_attr mve_insn [
 (VQDMULHQ_M_S "vqdmulh")
 (VQDMULHQ_N_S "vqdmulh")
 (VQDMULHQ_S "vqdmulh")
+(VQDMULLBQ_M_N_S "vqdmullb")
+(VQDMULLBQ_M_S "vqdmullb")
+(VQDMULLBQ_N_S "vqdmullb")
+(VQDMULLBQ_S "vqdmullb")
+(VQDMULLTQ_M_N_S "vqdmullt")
+(VQDMULLTQ_M_S "vqdmullt")
+(VQDMULLTQ_N_S "vqdmullt")
+(VQDMULLTQ_S "vqdmullt")
 (VQMOVNBQ_M_S "vqmovnb") (VQMOVNBQ_M_U "vqmovnb")
 (VQMOVNBQ_S "vqmovnb") (VQMOVNBQ_U "vqmovnb")
 (VQMOVNTQ_M_S "vqmovnt") (VQMOVNTQ_M_U "vqmovnt")
@@ -2425,6 +2453,14 @@ (define_int_attr supf [(VCVTQ_TO_F_S "s") (VCVTQ_TO_F_U 
"u") (VREV16Q_S "s")
   (VQDMLASHQ_N_S "s")
   (VQRDMLAHQ_N_S "s")
   (VQRDMLASHQ_N_S "s")
+  (VQDMULLBQ_S "s")
+  (VQDMULLBQ_M_S "s")
+  (VQDMULLBQ_M_N_S "s")
+  (VQDMULLBQ_N_S "s")
+  (VQDMULLTQ_S "s")
+  (VQDMULLTQ_M_S "s")
+  (VQDMULLTQ_M_N_S "s")
+  (VQDMULLTQ_N_S "s")
   ])
 
 ;; Both kinds of return insn.
diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index 14634cbf333..e75a30b7ed4 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
@@ -1503,62 +1503,34 @@ (define_insn "@mve_q_n_"
 ])
 
 ;;
-;; [vqdmullbq_n_s])
+;; [vqdmullbq_n_s]
+;; [vqdmulltq_n_s]
 ;;
-(define_insn "mve_vqdmullbq_n_s"
-  [
-   (set (match_operand: 0 "s_register_operand" 
"")
-   (unspec: [(match_operand:MVE_5 1 "s_register_operand" 
"w")
- (match_operand: 2 
"s_register_operand" "r")]
-VQDMULLBQ_N_S))
-  ]
-  "TARGET_HAVE_MVE"
-  "vqdmullb.s%# %q0, %q1, %2"
-  [(set_attr "type" "mve_move")
-])
-
-;;
-;; [vqdmullbq_s])
-;;
-(define_insn "mve_vqdmullbq_s"
-  [
-   (set (match_operand: 0 "s_register_operand" 
"")
-   (unspec: [(match_operand:MVE_5 1 "s_register_operand" 
"w")
- (match_operand:MVE_5 2 "s_register_operand" 
"w")]
-VQDMULLBQ_S))
-  ]
-  "TARGET_HAVE_MVE"
-  "vqdmullb.s%# %q0, %q1, %q2"
-  [(set_attr "type" "mve_move")
-])
-
-;;
-;; [vqdmulltq_n_s])
-;;
-(define_insn "mve_vqdmulltq_n_s"
+(define_insn "@mve_q_n_"
   [
(set (match_operand: 0 "s_register_operand" 
"")
(unspec: [(match_operand:MVE_5 1 "s_register_operand" 
"w")
  (match_operand: 2 
"s_register_operand" "r")]
-VQDMULLTQ_N_S))
+MVE_VQDMULLxQ_N))
   ]
   "TARGET_HAVE_MVE"
-  "vqdmullt.s%# %q0, %q1, %2"
+  ".s%#\t%q0, %q1, %2"
   [(set_attr "type" "mve_move")
 ])
 
 ;;
-;; [vqdmulltq_s])
+;; [vqdmullbq_s]
+;; [vqdmulltq_s]
 ;;
-(define_insn "mve_vqdmulltq_s"
+(define_insn "@mve_q_"
   [
(set (match

[PATCH 10/26] arm: [MVE intrinsics] factorize vbrsrq

2023-05-12 Thread Christophe Lyon via Gcc-patches
Factorize vbrsrq builtins so that they use parameterized names.

2022-12-12  Christophe Lyon  

gcc/
* config/arm/iterators.md (MVE_VBRSR_M_N_FP, MVE_VBRSR_N_FP): New.
(mve_insn): Add vbrsr.
* config/arm/mve.md (mve_vbrsrq_n_f<mode>): Rename into ...
(@mve_<mve_insn>q_n_f<mode>): ... this.
(mve_vbrsrq_n_<supf><mode>): Rename into ...
(@mve_<mve_insn>q_n_<supf><mode>): ... this.
(mve_vbrsrq_m_n_<supf><mode>): Rename into ...
(@mve_<mve_insn>q_m_n_<supf><mode>): ... this.
(mve_vbrsrq_m_n_f<mode>): Rename into ...
(@mve_<mve_insn>q_m_n_f<mode>): ... this.
---
 gcc/config/arm/iterators.md | 10 ++
 gcc/config/arm/mve.md   | 20 ++--
 2 files changed, 20 insertions(+), 10 deletions(-)

diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index d1d14488b56..dfc8d9cae72 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -610,6 +610,14 @@ (define_int_iterator MVE_FP_CREATE_ONLY [
 VCREATEQ_F
 ])
 
+(define_int_iterator MVE_VBRSR_M_N_FP [
+VBRSRQ_M_N_F
+])
+
+(define_int_iterator MVE_VBRSR_N_FP [
+VBRSRQ_N_F
+])
+
 ;; MVE comparison iterators
 (define_int_iterator MVE_CMP_M [
 VCMPCSQ_M_U
@@ -900,6 +908,8 @@ (define_int_attr mve_insn [
 (VBICQ_M_N_S "vbic") (VBICQ_M_N_U "vbic")
 (VBICQ_M_S "vbic") (VBICQ_M_U "vbic") (VBICQ_M_F "vbic")
 (VBICQ_N_S "vbic") (VBICQ_N_U "vbic")
+(VBRSRQ_M_N_S "vbrsr") (VBRSRQ_M_N_U "vbrsr") (VBRSRQ_M_N_F 
"vbrsr")
+(VBRSRQ_N_S "vbrsr") (VBRSRQ_N_U "vbrsr") (VBRSRQ_N_F "vbrsr")
 (VCLSQ_M_S "vcls")
 (VCLSQ_S "vcls")
 (VCLZQ_M_S "vclz") (VCLZQ_M_U "vclz")
diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index 7898361b859..beca74d4964 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
@@ -529,15 +529,15 @@ (define_insn "mve_vpnotv16bi"
 ;;
 ;; [vbrsrq_n_f])
 ;;
-(define_insn "mve_vbrsrq_n_f"
+(define_insn "@mve_q_n_f"
   [
(set (match_operand:MVE_0 0 "s_register_operand" "=w")
(unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "w")
   (match_operand:SI 2 "s_register_operand" "r")]
-VBRSRQ_N_F))
+MVE_VBRSR_N_FP))
   ]
   "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
-  "vbrsr.  %q0, %q1, %2"
+  ".\t%q0, %q1, %2"
   [(set_attr "type" "mve_move")
 ])
 
@@ -826,7 +826,7 @@ (define_expand "mve_vbicq_s"
 ;;
 ;; [vbrsrq_n_u, vbrsrq_n_s])
 ;;
-(define_insn "mve_vbrsrq_n_"
+(define_insn "@mve_q_n_"
   [
(set (match_operand:MVE_2 0 "s_register_operand" "=w")
(unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "w")
@@ -834,7 +834,7 @@ (define_insn "mve_vbrsrq_n_"
 VBRSRQ_N))
   ]
   "TARGET_HAVE_MVE"
-  "vbrsr.%# %q0, %q1, %2"
+  ".%#\t%q0, %q1, %2"
   [(set_attr "type" "mve_move")
 ])
 
@@ -2802,7 +2802,7 @@ (define_insn "@mve_q_m_"
 ;;
 ;; [vbrsrq_m_n_u, vbrsrq_m_n_s])
 ;;
-(define_insn "mve_vbrsrq_m_n_"
+(define_insn "@mve_q_m_n_"
   [
(set (match_operand:MVE_2 0 "s_register_operand" "=w")
(unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0")
@@ -2812,7 +2812,7 @@ (define_insn "mve_vbrsrq_m_n_"
 VBRSRQ_M_N))
   ]
   "TARGET_HAVE_MVE"
-  "vpst\;vbrsrt.%#  %q0, %q2, %3"
+  "vpst\;t.%#\t%q0, %q2, %3"
   [(set_attr "type" "mve_move")
(set_attr "length""8")])
 
@@ -3257,17 +3257,17 @@ (define_insn "@mve_q_m_f"
 ;;
 ;; [vbrsrq_m_n_f])
 ;;
-(define_insn "mve_vbrsrq_m_n_f"
+(define_insn "@mve_q_m_n_f"
   [
(set (match_operand:MVE_0 0 "s_register_operand" "=w")
(unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0")
   (match_operand:MVE_0 2 "s_register_operand" "w")
   (match_operand:SI 3 "s_register_operand" "r")
   (match_operand: 4 "vpr_register_operand" 
"Up")]
-VBRSRQ_M_N_F))
+MVE_VBRSR_M_N_FP))
   ]
   "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
-  "vpst\;vbrsrt.%#  %q0, %q2, %3"
+  "vpst\;t.%#\t%q0, %q2, %3"
   [(set_attr "type" "mve_move")
(set_attr "length""8")])
 
-- 
2.34.1



[PATCH 19/26] arm: [MVE intrinsics] add vpsel shape

2023-05-12 Thread Christophe Lyon via Gcc-patches
This patch adds the vpsel shape description.

2022-12-12  Christophe Lyon  

gcc/
* config/arm/arm-mve-builtins-shapes.cc (vpsel): New.
* config/arm/arm-mve-builtins-shapes.h (vpsel): New.
---
 gcc/config/arm/arm-mve-builtins-shapes.cc | 39 +++
 gcc/config/arm/arm-mve-builtins-shapes.h  |  1 +
 2 files changed, 40 insertions(+)

diff --git a/gcc/config/arm/arm-mve-builtins-shapes.cc 
b/gcc/config/arm/arm-mve-builtins-shapes.cc
index 43532601fbe..012cf3ef4c0 100644
--- a/gcc/config/arm/arm-mve-builtins-shapes.cc
+++ b/gcc/config/arm/arm-mve-builtins-shapes.cc
@@ -1706,6 +1706,45 @@ struct unary_widen_acc_def : public overloaded_base<0>
 };
 SHAPE (unary_widen_acc)
 
+/* <T0>_t vfoo[_t0](<T0>_t, <T0>_t, mve_pred16_t)
+
+   i.e. a version of the standard ternary shape in which
+   the final argument is always a set of predicates.
+
+   Example: vpselq.
+   int16x8_t [__arm_]vpselq[_s16](int16x8_t a, int16x8_t b, mve_pred16_t p)  */
+struct vpsel_def : public overloaded_base<0>
+{
+  void
+  build (function_builder &b, const function_group_info &group,
+bool preserve_user_namespace) const override
+  {
+b.add_overloaded_functions (group, MODE_none, preserve_user_namespace);
+build_all (b, "v0,v0,v0,p", group, MODE_none, preserve_user_namespace);
+  }
+
+  tree
+  resolve (function_resolver &r) const override
+  {
+unsigned int i, nargs;
+type_suffix_index type;
+if (!r.check_gp_argument (3, i, nargs)
+   || (type = r.infer_vector_type (0)) == NUM_TYPE_SUFFIXES)
+  return error_mark_node;
+
+unsigned int last_arg = i;
+for (i = 0; i < last_arg; i++)
+  if (!r.require_matching_vector_type (i, type))
+   return error_mark_node;
+
+if (!r.require_vector_type (2 , VECTOR_TYPE_mve_pred16_t))
+  return error_mark_node;
+
+return r.resolve_to (r.mode_suffix_id, type);
+  }
+};
+SHAPE (vpsel)
+
 } /* end namespace arm_mve */
 
 #undef SHAPE
diff --git a/gcc/config/arm/arm-mve-builtins-shapes.h 
b/gcc/config/arm/arm-mve-builtins-shapes.h
index f67a484c146..6e818092a87 100644
--- a/gcc/config/arm/arm-mve-builtins-shapes.h
+++ b/gcc/config/arm/arm-mve-builtins-shapes.h
@@ -71,6 +71,7 @@ namespace arm_mve
 extern const function_shape *const unary_n;
 extern const function_shape *const unary_widen;
 extern const function_shape *const unary_widen_acc;
+extern const function_shape *const vpsel;
 
   } /* end namespace arm_mve::shapes */
 } /* end namespace arm_mve */
-- 
2.34.1



[PATCH 22/26] arm: [MVE intrinsics] factorize vsliq

2023-05-12 Thread Christophe Lyon via Gcc-patches
Factorize vsliq builtins so that they use parameterized names.

2022-12-12  Christophe Lyon  

gcc/
* config/arm/iterators.md (mve_insn): Add vsli.
* config/arm/mve.md (mve_vsliq_n_<supf><mode>): Rename into ...
(@mve_<mve_insn>q_n_<supf><mode>): ... this.
(mve_vsliq_m_n_<supf><mode>): Rename into ...
(@mve_<mve_insn>q_m_n_<supf><mode>): ... this.
---
 gcc/config/arm/iterators.md | 2 ++
 gcc/config/arm/mve.md   | 8 
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index 3d4a9cf9cc2..7e7219033cf 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -1181,6 +1181,8 @@ (define_int_attr mve_insn [
 (VSHRNTQ_N_S "vshrnt") (VSHRNTQ_N_U "vshrnt")
 (VSHRQ_M_N_S "vshr") (VSHRQ_M_N_U "vshr")
 (VSHRQ_N_S "vshr") (VSHRQ_N_U "vshr")
+(VSLIQ_M_N_S "vsli") (VSLIQ_M_N_U "vsli")
+(VSLIQ_N_S "vsli") (VSLIQ_N_U "vsli")
 (VSUBQ_M_N_S "vsub") (VSUBQ_M_N_U "vsub") (VSUBQ_M_N_F "vsub")
 (VSUBQ_M_S "vsub") (VSUBQ_M_U "vsub") (VSUBQ_M_F "vsub")
 (VSUBQ_N_S "vsub") (VSUBQ_N_U "vsub") (VSUBQ_N_F "vsub")
diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index c6f9c0b9afb..a1c2cad9d2e 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
@@ -2058,7 +2058,7 @@ (define_insn "@mve_q_m_"
 ;;
 ;; [vsliq_n_u, vsliq_n_s])
 ;;
-(define_insn "mve_vsliq_n_"
+(define_insn "@mve_q_n_"
   [
(set (match_operand:MVE_2 0 "s_register_operand" "=w")
(unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0")
@@ -2067,7 +2067,7 @@ (define_insn "mve_vsliq_n_"
 VSLIQ_N))
   ]
   "TARGET_HAVE_MVE"
-  "vsli.%#\t%q0, %q2, %3"
+  ".%#\t%q0, %q2, %3"
   [(set_attr "type" "mve_move")
 ])
 
@@ -2960,7 +2960,7 @@ (define_insn "@mve_q_m_n_"
 ;;
 ;; [vsliq_m_n_u, vsliq_m_n_s])
 ;;
-(define_insn "mve_vsliq_m_n_"
+(define_insn "@mve_q_m_n_"
[
(set (match_operand:MVE_2 0 "s_register_operand" "=w")
(unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0")
@@ -2970,7 +2970,7 @@ (define_insn "mve_vsliq_m_n_"
 VSLIQ_M_N))
   ]
   "TARGET_HAVE_MVE"
-  "vpst\;vslit.%#\t%q0, %q2, %3"
+  "vpst\;t.%#\t%q0, %q2, %3"
   [(set_attr "type" "mve_move")
(set_attr "length""8")])
 
-- 
2.34.1



[PATCH 23/26] arm: [MVE intrinsics] rework vsliq

2023-05-12 Thread Christophe Lyon via Gcc-patches
Implement vsliq using the new MVE builtins framework.

2022-12-12  Christophe Lyon  

gcc/
* config/arm/arm-mve-builtins-base.cc (vsliq): New.
* config/arm/arm-mve-builtins-base.def (vsliq): New.
* config/arm/arm-mve-builtins-base.h (vsliq): New.
* config/arm/arm-mve-builtins.cc
(function_instance::has_inactive_argument): Handle vsliq.
* config/arm/arm_mve.h (vsliq): Remove.
(vsliq_m): Remove.
(vsliq_n_u8): Remove.
(vsliq_n_s8): Remove.
(vsliq_n_u16): Remove.
(vsliq_n_s16): Remove.
(vsliq_n_u32): Remove.
(vsliq_n_s32): Remove.
(vsliq_m_n_s8): Remove.
(vsliq_m_n_s32): Remove.
(vsliq_m_n_s16): Remove.
(vsliq_m_n_u8): Remove.
(vsliq_m_n_u32): Remove.
(vsliq_m_n_u16): Remove.
(__arm_vsliq_n_u8): Remove.
(__arm_vsliq_n_s8): Remove.
(__arm_vsliq_n_u16): Remove.
(__arm_vsliq_n_s16): Remove.
(__arm_vsliq_n_u32): Remove.
(__arm_vsliq_n_s32): Remove.
(__arm_vsliq_m_n_s8): Remove.
(__arm_vsliq_m_n_s32): Remove.
(__arm_vsliq_m_n_s16): Remove.
(__arm_vsliq_m_n_u8): Remove.
(__arm_vsliq_m_n_u32): Remove.
(__arm_vsliq_m_n_u16): Remove.
(__arm_vsliq): Remove.
(__arm_vsliq_m): Remove.
---
 gcc/config/arm/arm-mve-builtins-base.cc  |   1 +
 gcc/config/arm/arm-mve-builtins-base.def |   1 +
 gcc/config/arm/arm-mve-builtins-base.h   |   1 +
 gcc/config/arm/arm-mve-builtins.cc   |   3 +-
 gcc/config/arm/arm_mve.h | 212 ---
 5 files changed, 5 insertions(+), 213 deletions(-)

diff --git a/gcc/config/arm/arm-mve-builtins-base.cc 
b/gcc/config/arm/arm-mve-builtins-base.cc
index b1440ca489e..873c7d365f3 100644
--- a/gcc/config/arm/arm-mve-builtins-base.cc
+++ b/gcc/config/arm/arm-mve-builtins-base.cc
@@ -387,6 +387,7 @@ FUNCTION_WITH_M_N_R (vshlq, VSHLQ)
 FUNCTION_ONLY_N_NO_F (vshrnbq, VSHRNBQ)
 FUNCTION_ONLY_N_NO_F (vshrntq, VSHRNTQ)
 FUNCTION_ONLY_N_NO_F (vshrq, VSHRQ)
+FUNCTION_ONLY_N_NO_F (vsliq, VSLIQ)
 FUNCTION_WITH_RTX_M_N (vsubq, MINUS, VSUBQ)
 FUNCTION (vuninitializedq, vuninitializedq_impl,)
 
diff --git a/gcc/config/arm/arm-mve-builtins-base.def 
b/gcc/config/arm/arm-mve-builtins-base.def
index de4c473f618..2d1b87b90c3 100644
--- a/gcc/config/arm/arm-mve-builtins-base.def
+++ b/gcc/config/arm/arm-mve-builtins-base.def
@@ -140,6 +140,7 @@ DEF_MVE_FUNCTION (vshlq, binary_lshift_r, all_integer, 
m_or_none) // "_r" forms
 DEF_MVE_FUNCTION (vshrnbq, binary_rshift_narrow, integer_16_32, m_or_none)
 DEF_MVE_FUNCTION (vshrntq, binary_rshift_narrow, integer_16_32, m_or_none)
 DEF_MVE_FUNCTION (vshrq, binary_rshift, all_integer, mx_or_none)
+DEF_MVE_FUNCTION (vsliq, ternary_lshift, all_integer, m_or_none)
 DEF_MVE_FUNCTION (vsubq, binary_opt_n, all_integer, mx_or_none)
 DEF_MVE_FUNCTION (vuninitializedq, inherent, all_integer_with_64, none)
 #undef REQUIRES_FLOAT
diff --git a/gcc/config/arm/arm-mve-builtins-base.h 
b/gcc/config/arm/arm-mve-builtins-base.h
index ec5b4fbffb9..84fff0f6d0e 100644
--- a/gcc/config/arm/arm-mve-builtins-base.h
+++ b/gcc/config/arm/arm-mve-builtins-base.h
@@ -160,6 +160,7 @@ extern const function_base *const vshlq;
 extern const function_base *const vshrnbq;
 extern const function_base *const vshrntq;
 extern const function_base *const vshrq;
+extern const function_base *const vsliq;
 extern const function_base *const vsubq;
 extern const function_base *const vuninitializedq;
 
diff --git a/gcc/config/arm/arm-mve-builtins.cc 
b/gcc/config/arm/arm-mve-builtins.cc
index 87fcbc31f2f..f5056bdd1bb 100644
--- a/gcc/config/arm/arm-mve-builtins.cc
+++ b/gcc/config/arm/arm-mve-builtins.cc
@@ -719,7 +719,8 @@ function_instance::has_inactive_argument () const
   || base == functions::vrshrnbq
   || base == functions::vrshrntq
   || base == functions::vshrnbq
-  || base == functions::vshrntq)
+  || base == functions::vshrntq
+  || base == functions::vsliq)
 return false;
 
   return true;
diff --git a/gcc/config/arm/arm_mve.h b/gcc/config/arm/arm_mve.h
index 72b50764963..72177f9c53e 100644
--- a/gcc/config/arm/arm_mve.h
+++ b/gcc/config/arm/arm_mve.h
@@ -55,7 +55,6 @@
 #define vbicq_m_n(__a, __imm, __p) __arm_vbicq_m_n(__a, __imm, __p)
 #define vshlcq(__a, __b, __imm) __arm_vshlcq(__a, __b, __imm)
 #define vsriq(__a, __b, __imm) __arm_vsriq(__a, __b, __imm)
-#define vsliq(__a, __b, __imm) __arm_vsliq(__a, __b, __imm)
 #define vsriq_m(__a, __b, __imm, __p) __arm_vsriq_m(__a, __b, __imm, __p)
 #define vbicq_m(__inactive, __a, __b, __p) __arm_vbicq_m(__inactive, __a, __b, 
__p)
 #define vcaddq_rot270_m(__inactive, __a, __b, __p) 
__arm_vcaddq_rot270_m(__inactive, __a, __b, __p)
@@ -65,7 +64,6 @@
 #define vmullbq_int_m(__inactive, __a, __b, __p) 
__arm_vmullbq_int_m(__inactive, __a, __b, __p)
 #define vmulltq_int_m(__inactive, __a, __b, __p) 
__arm_vmulltq_int_m(__inactive, __a, __b, __p)

[PATCH 13/26] arm: [MVE intrinsics] factorize vmvnq

2023-05-12 Thread Christophe Lyon via Gcc-patches
Factorize vmvnq builtins so that they use parameterized names.

2022-12-12  Christophe Lyon  

gcc/
* config/arm/iterators.md (mve_insn): Add vmvn.
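* config/arm/mve.md (mve_vmvnq_n_<supf><mode>): Rename into ...
(@mve_<mve_insn>q_n_<supf><mode>): ... this.
(mve_vmvnq_m_<supf><mode>): Rename into ...
(@mve_<mve_insn>q_m_<supf><mode>): ... this.
(mve_vmvnq_m_n_<supf><mode>): Rename into ...
(@mve_<mve_insn>q_m_n_<supf><mode>): ... this.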
---
 gcc/config/arm/iterators.md |  3 +++
 gcc/config/arm/mve.md   | 12 ++--
 2 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index dfc8d9cae72..7fbfea49ff3 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -1002,6 +1002,9 @@ (define_int_attr mve_insn [
 (VMULQ_M_N_S "vmul") (VMULQ_M_N_U "vmul") (VMULQ_M_N_F "vmul")
 (VMULQ_M_S "vmul") (VMULQ_M_U "vmul") (VMULQ_M_F "vmul")
 (VMULQ_N_S "vmul") (VMULQ_N_U "vmul") (VMULQ_N_F "vmul")
+(VMVNQ_M_N_S "vmvn") (VMVNQ_M_N_U "vmvn")
+(VMVNQ_M_S "vmvn") (VMVNQ_M_U "vmvn")
+(VMVNQ_N_S "vmvn") (VMVNQ_N_U "vmvn")
 (VNEGQ_M_F "vneg")
 (VNEGQ_M_S "vneg")
 (VORRQ_M_N_S "vorr") (VORRQ_M_N_U "vorr")
diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index beca74d4964..57ba65d3c76 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
@@ -459,14 +459,14 @@ (define_insn "mve_vcvtaq_"
 ;;
 ;; [vmvnq_n_u, vmvnq_n_s])
 ;;
-(define_insn "mve_vmvnq_n_"
+(define_insn "@mve_q_n_"
   [
(set (match_operand:MVE_5 0 "s_register_operand" "=w")
(unspec:MVE_5 [(match_operand: 1 "immediate_operand" "i")]
 VMVNQ_N))
   ]
   "TARGET_HAVE_MVE"
-  "vmvn.i%#  %q0, %1"
+  ".i%#\t%q0, %1"
   [(set_attr "type" "mve_move")
 ])
 
@@ -1953,7 +1953,7 @@ (define_insn "@mve_q_n_"
 ;;
 ;; [vmvnq_m_s, vmvnq_m_u])
 ;;
-(define_insn "mve_vmvnq_m_"
+(define_insn "@mve_q_m_"
   [
(set (match_operand:MVE_2 0 "s_register_operand" "=w")
(unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0")
@@ -1962,7 +1962,7 @@ (define_insn "mve_vmvnq_m_"
 VMVNQ_M))
   ]
   "TARGET_HAVE_MVE"
-  "vpst\;vmvnt %q0, %q2"
+  "vpst\;t\t%q0, %q2"
   [(set_attr "type" "mve_move")
(set_attr "length""8")])
 
@@ -2423,7 +2423,7 @@ (define_insn "@mve_q_m_"
 ;;
 ;; [vmvnq_m_n_u, vmvnq_m_n_s])
 ;;
-(define_insn "mve_vmvnq_m_n_"
+(define_insn "@mve_q_m_n_"
   [
(set (match_operand:MVE_5 0 "s_register_operand" "=w")
(unspec:MVE_5 [(match_operand:MVE_5 1 "s_register_operand" "0")
@@ -2432,7 +2432,7 @@ (define_insn "mve_vmvnq_m_n_"
 VMVNQ_M_N))
   ]
   "TARGET_HAVE_MVE"
-  "vpst\;vmvnt.i%#  %q0, %2"
+  "vpst\;t.i%#\t%q0, %2"
   [(set_attr "type" "mve_move")
(set_attr "length""8")])
 
-- 
2.34.1



[PATCH 15/26] arm: [MVE intrinsics] add ternary_opt_n shape

2023-05-12 Thread Christophe Lyon via Gcc-patches
This patch adds the ternary_opt_n shape description.

2022-12-12  Christophe Lyon  

gcc/
* config/arm/arm-mve-builtins-shapes.cc (ternary_opt_n): New.
* config/arm/arm-mve-builtins-shapes.h (ternary_opt_n): New.
---
 gcc/config/arm/arm-mve-builtins-shapes.cc | 30 +++
 gcc/config/arm/arm-mve-builtins-shapes.h  |  1 +
 2 files changed, 31 insertions(+)

diff --git a/gcc/config/arm/arm-mve-builtins-shapes.cc 
b/gcc/config/arm/arm-mve-builtins-shapes.cc
index 6401a79c570..43532601fbe 100644
--- a/gcc/config/arm/arm-mve-builtins-shapes.cc
+++ b/gcc/config/arm/arm-mve-builtins-shapes.cc
@@ -1399,6 +1399,36 @@ struct ternary_n_def : public overloaded_base<0>
 };
 SHAPE (ternary_n)
 
+/* <T0>_t vfoo[_t0](<T0>_t, <T0>_t, <T0>_t)
+   <T0>_t vfoo[_n_t0](<T0>_t, <T0>_t, <S0>_t)
+
+   i.e. the standard shape for ternary operations that operate on
+   uniform types.
+
+   Example: vfmaq.
+   float16x8_t [__arm_]vfmaq[_n_f16](float16x8_t add, float16x8_t m1, 
float16_t m2)
+   float16x8_t [__arm_]vfmaq_m[_n_f16](float16x8_t add, float16x8_t m1, 
float16_t m2, mve_pred16_t p)
+   float16x8_t [__arm_]vfmaq[_f16](float16x8_t add, float16x8_t m1, 
float16x8_t m2)
+   float16x8_t [__arm_]vfmaq_m[_f16](float16x8_t add, float16x8_t m1, 
float16x8_t m2, mve_pred16_t p)  */
+struct ternary_opt_n_def : public overloaded_base<0>
+{
+  void
+  build (function_builder &b, const function_group_info &group,
+bool preserve_user_namespace) const override
+  {
+b.add_overloaded_functions (group, MODE_none, preserve_user_namespace);
+build_all (b, "v0,v0,v0,v0", group, MODE_none, preserve_user_namespace);
+build_all (b, "v0,v0,v0,s0", group, MODE_n, preserve_user_namespace);
+  }
+
+  tree
+  resolve (function_resolver &r) const override
+  {
+return r.resolve_uniform_opt_n (3);
+  }
+};
+SHAPE (ternary_opt_n)
+
 /* _t vfoo[_t0](_t)
 
i.e. the standard shape for unary operations that operate on
diff --git a/gcc/config/arm/arm-mve-builtins-shapes.h 
b/gcc/config/arm/arm-mve-builtins-shapes.h
index ba53e8cc52e..f67a484c146 100644
--- a/gcc/config/arm/arm-mve-builtins-shapes.h
+++ b/gcc/config/arm/arm-mve-builtins-shapes.h
@@ -62,6 +62,7 @@ namespace arm_mve
 extern const function_shape *const mvn;
 extern const function_shape *const ternary;
 extern const function_shape *const ternary_n;
+extern const function_shape *const ternary_opt_n;
 extern const function_shape *const unary;
 extern const function_shape *const unary_acc;
 extern const function_shape *const unary_convert;
-- 
2.34.1



[PATCH 08/26] arm: [MVE intrinsics] rework vqshluq

2023-05-12 Thread Christophe Lyon via Gcc-patches
Implement vqshluq using the new MVE builtins framework.

2022-12-12  Christophe Lyon  

gcc/
* config/arm/arm-mve-builtins-base.cc (vqshluq): New.
* config/arm/arm-mve-builtins-base.def (vqshluq): New.
* config/arm/arm-mve-builtins-base.h (vqshluq): New.
* config/arm/arm_mve.h (vqshluq): Remove.
(vqshluq_m): Remove.
(vqshluq_n_s8): Remove.
(vqshluq_n_s16): Remove.
(vqshluq_n_s32): Remove.
(vqshluq_m_n_s8): Remove.
(vqshluq_m_n_s16): Remove.
(vqshluq_m_n_s32): Remove.
(__arm_vqshluq_n_s8): Remove.
(__arm_vqshluq_n_s16): Remove.
(__arm_vqshluq_n_s32): Remove.
(__arm_vqshluq_m_n_s8): Remove.
(__arm_vqshluq_m_n_s16): Remove.
(__arm_vqshluq_m_n_s32): Remove.
(__arm_vqshluq): Remove.
(__arm_vqshluq_m): Remove.
---
 gcc/config/arm/arm-mve-builtins-base.cc  |   1 +
 gcc/config/arm/arm-mve-builtins-base.def |   1 +
 gcc/config/arm/arm-mve-builtins-base.h   |   1 +
 gcc/config/arm/arm_mve.h | 111 ---
 4 files changed, 3 insertions(+), 111 deletions(-)

diff --git a/gcc/config/arm/arm-mve-builtins-base.cc 
b/gcc/config/arm/arm-mve-builtins-base.cc
index a2b227bb2aa..739ab604843 100644
--- a/gcc/config/arm/arm-mve-builtins-base.cc
+++ b/gcc/config/arm/arm-mve-builtins-base.cc
@@ -332,6 +332,7 @@ FUNCTION_WITHOUT_N_NO_U_F (vqnegq, VQNEGQ)
 FUNCTION_WITH_M_N_NO_F (vqrshlq, VQRSHLQ)
 FUNCTION_WITH_M_N_NO_U_F (vqrdmulhq, VQRDMULHQ)
 FUNCTION_WITH_M_N_R (vqshlq, VQSHLQ)
+FUNCTION_ONLY_N_NO_U_F (vqshluq, VQSHLUQ)
 FUNCTION_ONLY_N_NO_F (vqrshrnbq, VQRSHRNBQ)
 FUNCTION_ONLY_N_NO_F (vqrshrntq, VQRSHRNTQ)
 FUNCTION_ONLY_N_NO_U_F (vqrshrunbq, VQRSHRUNBQ)
diff --git a/gcc/config/arm/arm-mve-builtins-base.def 
b/gcc/config/arm/arm-mve-builtins-base.def
index c4ef74169dd..3f7bb414e40 100644
--- a/gcc/config/arm/arm-mve-builtins-base.def
+++ b/gcc/config/arm/arm-mve-builtins-base.def
@@ -106,6 +106,7 @@ DEF_MVE_FUNCTION (vqrshrunbq, 
binary_rshift_narrow_unsigned, signed_16_32, m_or_
 DEF_MVE_FUNCTION (vqrshruntq, binary_rshift_narrow_unsigned, signed_16_32, 
m_or_none)
 DEF_MVE_FUNCTION (vqshlq, binary_lshift, all_integer, m_or_none)
 DEF_MVE_FUNCTION (vqshlq, binary_lshift_r, all_integer, m_or_none)
+DEF_MVE_FUNCTION (vqshluq, binary_lshift_unsigned, all_signed, m_or_none)
 DEF_MVE_FUNCTION (vqshrnbq, binary_rshift_narrow, integer_16_32, m_or_none)
 DEF_MVE_FUNCTION (vqshrntq, binary_rshift_narrow, integer_16_32, m_or_none)
 DEF_MVE_FUNCTION (vqshrunbq, binary_rshift_narrow_unsigned, signed_16_32, 
m_or_none)
diff --git a/gcc/config/arm/arm-mve-builtins-base.h 
b/gcc/config/arm/arm-mve-builtins-base.h
index 41b2e19c2d7..797f8ba2f5e 100644
--- a/gcc/config/arm/arm-mve-builtins-base.h
+++ b/gcc/config/arm/arm-mve-builtins-base.h
@@ -118,6 +118,7 @@ extern const function_base *const vqrshrntq;
 extern const function_base *const vqrshrunbq;
 extern const function_base *const vqrshruntq;
 extern const function_base *const vqshlq;
+extern const function_base *const vqshluq;
 extern const function_base *const vqshrnbq;
 extern const function_base *const vqshrntq;
 extern const function_base *const vqshrunbq;
diff --git a/gcc/config/arm/arm_mve.h b/gcc/config/arm/arm_mve.h
index c995093e12f..673a3df1bfd 100644
--- a/gcc/config/arm/arm_mve.h
+++ b/gcc/config/arm/arm_mve.h
@@ -50,7 +50,6 @@
 #define vcaddq_rot270(__a, __b) __arm_vcaddq_rot270(__a, __b)
 #define vbicq(__a, __b) __arm_vbicq(__a, __b)
 #define vbrsrq(__a, __b) __arm_vbrsrq(__a, __b)
-#define vqshluq(__a, __imm) __arm_vqshluq(__a, __imm)
 #define vhcaddq_rot90(__a, __b) __arm_vhcaddq_rot90(__a, __b)
 #define vhcaddq_rot270(__a, __b) __arm_vhcaddq_rot270(__a, __b)
 #define vmulltq_poly(__a, __b) __arm_vmulltq_poly(__a, __b)
@@ -62,7 +61,6 @@
 #define vsriq(__a, __b, __imm) __arm_vsriq(__a, __b, __imm)
 #define vsliq(__a, __b, __imm) __arm_vsliq(__a, __b, __imm)
 #define vsriq_m(__a, __b, __imm, __p) __arm_vsriq_m(__a, __b, __imm, __p)
-#define vqshluq_m(__inactive, __a, __imm, __p) __arm_vqshluq_m(__inactive, 
__a, __imm, __p)
 #define vbicq_m(__inactive, __a, __b, __p) __arm_vbicq_m(__inactive, __a, __b, 
__p)
 #define vbrsrq_m(__inactive, __a, __b, __p) __arm_vbrsrq_m(__inactive, __a, 
__b, __p)
 #define vcaddq_rot270_m(__inactive, __a, __b, __p) 
__arm_vcaddq_rot270_m(__inactive, __a, __b, __p)
@@ -284,7 +282,6 @@
 #define vcaddq_rot270_u8(__a, __b) __arm_vcaddq_rot270_u8(__a, __b)
 #define vbicq_u8(__a, __b) __arm_vbicq_u8(__a, __b)
 #define vbrsrq_n_u8(__a, __b) __arm_vbrsrq_n_u8(__a, __b)
-#define vqshluq_n_s8(__a,  __imm) __arm_vqshluq_n_s8(__a,  __imm)
 #define vornq_s8(__a, __b) __arm_vornq_s8(__a, __b)
 #define vmulltq_int_s8(__a, __b) __arm_vmulltq_int_s8(__a, __b)
 #define vmullbq_int_s8(__a, __b) __arm_vmullbq_int_s8(__a, __b)
@@ -301,7 +298,6 @@
 #define vcaddq_rot270_u16(__a, __b) __arm_vcaddq_rot270_u16(__a, __b)
 #define vbicq_u16(__a, __b) __arm_vbicq_u16(__a, __b)
 #d

[PATCH 24/26] arm: [MVE intrinsics] add ternary_rshift shape

2023-05-12 Thread Christophe Lyon via Gcc-patches
This patch adds the ternary_rshift shape description.

2022-12-12  Christophe Lyon  

gcc/
* config/arm/arm-mve-builtins-shapes.cc (ternary_rshift): New.
* config/arm/arm-mve-builtins-shapes.h (ternary_rshift): New.
---
 gcc/config/arm/arm-mve-builtins-shapes.cc | 38 +++
 gcc/config/arm/arm-mve-builtins-shapes.h  |  1 +
 2 files changed, 39 insertions(+)

diff --git a/gcc/config/arm/arm-mve-builtins-shapes.cc 
b/gcc/config/arm/arm-mve-builtins-shapes.cc
index a8e94b4f8f8..d4c30ed2e8c 100644
--- a/gcc/config/arm/arm-mve-builtins-shapes.cc
+++ b/gcc/config/arm/arm-mve-builtins-shapes.cc
@@ -1467,6 +1467,44 @@ struct ternary_opt_n_def : public overloaded_base<0>
 };
 SHAPE (ternary_opt_n)
 
+/* _t vfoo[_t0](_t, _t, const int)
+
+   i.e. ternary operations that operate on a pair of vectors of the
+   same type as the destination, and take a third integer argument.
+
+   Check that 'imm' is in the [1..#bits] range.
+
+   Example: vsriq.
+   int8x16_t [__arm_]vsriq[_n_s8](int8x16_t a, int8x16_t b, const int imm)
+   int8x16_t [__arm_]vsriq_m[_n_s8](int8x16_t a, int8x16_t b, const int imm, 
mve_pred16_t p)  */
+struct ternary_rshift_def : public overloaded_base<0>
+{
+  void
+  build (function_builder &b, const function_group_info &group,
+bool preserve_user_namespace) const override
+  {
+b.add_overloaded_functions (group, MODE_n, preserve_user_namespace);
+build_all (b, "v0,v0,v0,ss32", group, MODE_n, preserve_user_namespace);
+  }
+
+  tree
+  resolve (function_resolver &r) const override
+  {
+return r.resolve_uniform (2, 1);
+  }
+
+  bool
+  check (function_checker &c) const override
+  {
+if (c.mode_suffix_id != MODE_n)
+  return true;
+
+unsigned int bits = c.type_suffix (0).element_bits;
+return c.require_immediate_range (2, 1, bits);
+  }
+};
+SHAPE (ternary_rshift)
+
 /* _t vfoo[_t0](_t)
 
i.e. the standard shape for unary operations that operate on
diff --git a/gcc/config/arm/arm-mve-builtins-shapes.h 
b/gcc/config/arm/arm-mve-builtins-shapes.h
index 73375186d82..a1842f5845c 100644
--- a/gcc/config/arm/arm-mve-builtins-shapes.h
+++ b/gcc/config/arm/arm-mve-builtins-shapes.h
@@ -64,6 +64,7 @@ namespace arm_mve
 extern const function_shape *const ternary_lshift;
 extern const function_shape *const ternary_n;
 extern const function_shape *const ternary_opt_n;
+extern const function_shape *const ternary_rshift;
 extern const function_shape *const unary;
 extern const function_shape *const unary_acc;
 extern const function_shape *const unary_convert;
-- 
2.34.1



[PATCH 18/26] arm: [MVE intrinsics] factorize vpselq

2023-05-12 Thread Christophe Lyon via Gcc-patches
Factorize vpselq builtins so that they use parameterized names.

2022-12-12  Christophe Lyon  

gcc/
* config/arm/arm.cc (arm_expand_vcond): Use gen_mve_q instead of
gen_mve_vpselq.
* config/arm/iterators.md (MVE_VPSELQ_F): New.
(mve_insn): Add vpsel.
* config/arm/mve.md (@mve_vpselq_<supf><mode>): Rename into ...
(@mve_<mve_insn>q_<supf><mode>): ... this.
(@mve_vpselq_f<mode>): Rename into ...
(@mve_<mve_insn>q_f<mode>): ... this.
---
 gcc/config/arm/arm.cc   |  8 
 gcc/config/arm/iterators.md |  5 +
 gcc/config/arm/mve.md   | 18 +-
 3 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/gcc/config/arm/arm.cc b/gcc/config/arm/arm.cc
index 06e0756e4e8..da7e9c81465 100644
--- a/gcc/config/arm/arm.cc
+++ b/gcc/config/arm/arm.cc
@@ -31633,13 +31633,13 @@ arm_expand_vcond (rtx *operands, machine_mode 
cmp_result_mode)
   switch (GET_MODE_CLASS (cmp_mode))
{
case MODE_VECTOR_INT:
- emit_insn (gen_mve_vpselq (VPSELQ_S, cmp_mode, operands[0],
-operands[1], operands[2], mask));
+ emit_insn (gen_mve_q (VPSELQ_S, VPSELQ_S, cmp_mode, operands[0],
+   operands[1], operands[2], mask));
  break;
case MODE_VECTOR_FLOAT:
  if (TARGET_HAVE_MVE_FLOAT)
-   emit_insn (gen_mve_vpselq_f (cmp_mode, operands[0],
-operands[1], operands[2], mask));
+   emit_insn (gen_mve_q_f (VPSELQ_F, cmp_mode, operands[0],
+   operands[1], operands[2], mask));
  else
gcc_unreachable ();
  break;
diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index 022744f04d9..3d4a9cf9cc2 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -898,6 +898,10 @@ (define_int_attr mve_cmp_op1 [
 (VCMPNEQ_M_N_F "ne")
 ])
 
+(define_int_iterator MVE_VPSELQ_F [
+VPSELQ_F
+])
+
 (define_int_attr mve_insn [
 (VABAVQ_P_S "vabav") (VABAVQ_P_U "vabav")
 (VABAVQ_S "vabav") (VABAVQ_U "vabav")
@@ -1030,6 +1034,7 @@ (define_int_attr mve_insn [
 (VORRQ_M_N_S "vorr") (VORRQ_M_N_U "vorr")
 (VORRQ_M_S "vorr") (VORRQ_M_U "vorr") (VORRQ_M_F "vorr")
 (VORRQ_N_S "vorr") (VORRQ_N_U "vorr")
+(VPSELQ_S "vpsel") (VPSELQ_U "vpsel") (VPSELQ_F "vpsel")
 (VQABSQ_M_S "vqabs")
 (VQABSQ_S "vqabs")
 (VQADDQ_M_N_S "vqadd") (VQADDQ_M_N_U "vqadd")
diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index b87798730a2..c6f9c0b9afb 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
@@ -1969,7 +1969,7 @@ (define_insn "@mve_q_m_"
 ;;
 ;; [vpselq_u, vpselq_s])
 ;;
-(define_insn "@mve_vpselq_"
+(define_insn "@mve_q_"
   [
(set (match_operand:MVE_1 0 "s_register_operand" "=w")
(unspec:MVE_1 [(match_operand:MVE_1 1 "s_register_operand" "w")
@@ -1978,7 +1978,7 @@ (define_insn "@mve_vpselq_"
 VPSELQ))
   ]
   "TARGET_HAVE_MVE"
-  "vpsel %q0, %q1, %q2"
+  "\t%q0, %q1, %q2"
   [(set_attr "type" "mve_move")
 ])
 
@@ -2427,16 +2427,16 @@ (define_insn "@mve_q_m_n_"
 ;;
 ;; [vpselq_f])
 ;;
-(define_insn "@mve_vpselq_f"
+(define_insn "@mve_q_f"
   [
(set (match_operand:MVE_0 0 "s_register_operand" "=w")
(unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "w")
   (match_operand:MVE_0 2 "s_register_operand" "w")
   (match_operand: 3 "vpr_register_operand" 
"Up")]
-VPSELQ_F))
+MVE_VPSELQ_F))
   ]
   "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
-  "vpsel %q0, %q1, %q2"
+  "\t%q0, %q1, %q2"
   [(set_attr "type" "mve_move")
 ])
 
@@ -6867,12 +6867,12 @@ (define_expand "vcond_mask_"
   switch (GET_MODE_CLASS (mode))
 {
   case MODE_VECTOR_INT:
-   emit_insn (gen_mve_vpselq (VPSELQ_S, mode, operands[0],
-  operands[1], operands[2], operands[3]));
+   emit_insn (gen_mve_q (VPSELQ_S, VPSELQ_S, mode, operands[0],
+ operands[1], operands[2], operands[3]));
break;
   case MODE_VECTOR_FLOAT:
-   emit_insn (gen_mve_vpselq_f (mode, operands[0],
-operands[1], operands[2], operands[3]));
+   emit_insn (gen_mve_q_f (VPSELQ_F, mode, operands[0],
+   operands[1], operands[2], operands[3]));
break;
   default:
gcc_unreachable ();
-- 
2.34.1



[PATCH 12/26] arm: [MVE intrinsics] add mvn shape

2023-05-12 Thread Christophe Lyon via Gcc-patches
This patch adds the mvn shape description.

2022-12-12  Christophe Lyon  

gcc/
* config/arm/arm-mve-builtins-shapes.cc (mvn): New.
* config/arm/arm-mve-builtins-shapes.h (mvn): New.
---
 gcc/config/arm/arm-mve-builtins-shapes.cc | 49 +++
 gcc/config/arm/arm-mve-builtins-shapes.h  |  1 +
 2 files changed, 50 insertions(+)

diff --git a/gcc/config/arm/arm-mve-builtins-shapes.cc 
b/gcc/config/arm/arm-mve-builtins-shapes.cc
index c2e138c12e1..6401a79c570 100644
--- a/gcc/config/arm/arm-mve-builtins-shapes.cc
+++ b/gcc/config/arm/arm-mve-builtins-shapes.cc
@@ -1297,6 +1297,55 @@ struct inherent_def : public nonoverloaded_base
 };
 SHAPE (inherent)
 
+/* _t vfoo[_t0](_t)
+   _t vfoo_n_t0(_t)
+
+   For MODE_n, define only the 16 and 32 bits versions.
+
+   Example: vmvnq.
+   int16x8_t [__arm_]vmvnq[_s16](int16x8_t a)
+   int16x8_t [__arm_]vmvnq_m[_s16](int16x8_t inactive, int16x8_t a, 
mve_pred16_t p)
+   int16x8_t [__arm_]vmvnq_x[_s16](int16x8_t a, mve_pred16_t p)
+   int16x8_t [__arm_]vmvnq_n_s16(const int16_t imm)
+   int16x8_t [__arm_]vmvnq_m[_n_s16](int16x8_t inactive, const int16_t imm, 
mve_pred16_t p)
+   int16x8_t [__arm_]vmvnq_x_n_s16(const int16_t imm, mve_pred16_t p)  */
+struct mvn_def : public overloaded_base<0>
+{
+  void
+  build (function_builder &b, const function_group_info &group,
+bool preserve_user_namespace) const override
+  {
+b.add_overloaded_functions (group, MODE_none, preserve_user_namespace);
+/* Do not build a separate instance for MODE_n, since we want to
+   share vmvnq_m[_n_s16] with vmvnq_m[_s16].  */
+build_all (b, "v0,v0", group, MODE_none, preserve_user_namespace);
+build_16_32 (b, "v0,s0", group, MODE_n, preserve_user_namespace);
+  }
+
+  tree
+  resolve (function_resolver &r) const override
+  {
+unsigned int i, nargs;
+type_suffix_index type;
+if (!r.check_gp_argument (1, i, nargs)
+   /* Same type for arg 0 and 1 if _m, so using 0 is OK */
+   || (type = r.infer_vector_type (0)) == NUM_TYPE_SUFFIXES)
+  return error_mark_node;
+
+  /* Skip last argument, may be scalar.  */
+unsigned int last_arg = i;
+for (i = 0; i < last_arg; i++)
+  if (!r.require_matching_vector_type (i, type))
+   return error_mark_node;
+
+if (last_arg == 0)
+  return r.resolve_to (r.mode_suffix_id, type);
+
+return r.finish_opt_n_resolution (last_arg, 0, type);
+  }
+};
+SHAPE (mvn)
+
 /* _t vfoo[_t0](_t, _t, _t)
 
i.e. the standard shape for ternary operations that operate on
diff --git a/gcc/config/arm/arm-mve-builtins-shapes.h 
b/gcc/config/arm/arm-mve-builtins-shapes.h
index bba38194ce2..ba53e8cc52e 100644
--- a/gcc/config/arm/arm-mve-builtins-shapes.h
+++ b/gcc/config/arm/arm-mve-builtins-shapes.h
@@ -59,6 +59,7 @@ namespace arm_mve
 extern const function_shape *const cmp;
 extern const function_shape *const create;
 extern const function_shape *const inherent;
+extern const function_shape *const mvn;
 extern const function_shape *const ternary;
 extern const function_shape *const ternary_n;
 extern const function_shape *const unary;
-- 
2.34.1



[PATCH 14/26] arm: [MVE intrinsics] rework vmvnq

2023-05-12 Thread Christophe Lyon via Gcc-patches
Implement vmvnq using the new MVE builtins framework.

2022-12-12  Christophe Lyon  

gcc/
* config/arm/arm-mve-builtins-base.cc
(FUNCTION_WITH_RTX_M_N_NO_F): New.
(vmvnq): New.
* config/arm/arm-mve-builtins-base.def (vmvnq): New.
* config/arm/arm-mve-builtins-base.h (vmvnq): New.
* config/arm/arm_mve.h (vmvnq): Remove.
(vmvnq_m): Remove.
(vmvnq_x): Remove.
(vmvnq_s8): Remove.
(vmvnq_s16): Remove.
(vmvnq_s32): Remove.
(vmvnq_n_s16): Remove.
(vmvnq_n_s32): Remove.
(vmvnq_u8): Remove.
(vmvnq_u16): Remove.
(vmvnq_u32): Remove.
(vmvnq_n_u16): Remove.
(vmvnq_n_u32): Remove.
(vmvnq_m_u8): Remove.
(vmvnq_m_s8): Remove.
(vmvnq_m_u16): Remove.
(vmvnq_m_s16): Remove.
(vmvnq_m_u32): Remove.
(vmvnq_m_s32): Remove.
(vmvnq_m_n_s16): Remove.
(vmvnq_m_n_u16): Remove.
(vmvnq_m_n_s32): Remove.
(vmvnq_m_n_u32): Remove.
(vmvnq_x_s8): Remove.
(vmvnq_x_s16): Remove.
(vmvnq_x_s32): Remove.
(vmvnq_x_u8): Remove.
(vmvnq_x_u16): Remove.
(vmvnq_x_u32): Remove.
(vmvnq_x_n_s16): Remove.
(vmvnq_x_n_s32): Remove.
(vmvnq_x_n_u16): Remove.
(vmvnq_x_n_u32): Remove.
(__arm_vmvnq_s8): Remove.
(__arm_vmvnq_s16): Remove.
(__arm_vmvnq_s32): Remove.
(__arm_vmvnq_n_s16): Remove.
(__arm_vmvnq_n_s32): Remove.
(__arm_vmvnq_u8): Remove.
(__arm_vmvnq_u16): Remove.
(__arm_vmvnq_u32): Remove.
(__arm_vmvnq_n_u16): Remove.
(__arm_vmvnq_n_u32): Remove.
(__arm_vmvnq_m_u8): Remove.
(__arm_vmvnq_m_s8): Remove.
(__arm_vmvnq_m_u16): Remove.
(__arm_vmvnq_m_s16): Remove.
(__arm_vmvnq_m_u32): Remove.
(__arm_vmvnq_m_s32): Remove.
(__arm_vmvnq_m_n_s16): Remove.
(__arm_vmvnq_m_n_u16): Remove.
(__arm_vmvnq_m_n_s32): Remove.
(__arm_vmvnq_m_n_u32): Remove.
(__arm_vmvnq_x_s8): Remove.
(__arm_vmvnq_x_s16): Remove.
(__arm_vmvnq_x_s32): Remove.
(__arm_vmvnq_x_u8): Remove.
(__arm_vmvnq_x_u16): Remove.
(__arm_vmvnq_x_u32): Remove.
(__arm_vmvnq_x_n_s16): Remove.
(__arm_vmvnq_x_n_s32): Remove.
(__arm_vmvnq_x_n_u16): Remove.
(__arm_vmvnq_x_n_u32): Remove.
(__arm_vmvnq): Remove.
(__arm_vmvnq_m): Remove.
(__arm_vmvnq_x): Remove.
---
 gcc/config/arm/arm-mve-builtins-base.cc  |  10 +
 gcc/config/arm/arm-mve-builtins-base.def |   1 +
 gcc/config/arm/arm-mve-builtins-base.h   |   1 +
 gcc/config/arm/arm_mve.h | 438 ---
 4 files changed, 12 insertions(+), 438 deletions(-)

diff --git a/gcc/config/arm/arm-mve-builtins-base.cc 
b/gcc/config/arm/arm-mve-builtins-base.cc
index 2fb81c197da..6286d4a147a 100644
--- a/gcc/config/arm/arm-mve-builtins-base.cc
+++ b/gcc/config/arm/arm-mve-builtins-base.cc
@@ -103,6 +103,15 @@ namespace arm_mve {
 UNSPEC##_M_S, UNSPEC##_M_U, UNSPEC##_M_F,  \
 -1, -1, -1))
 
+  /* Helper for builtins with RTX codes, _m predicated and _n
+ overrides, but no floating-point version.  */
+#define FUNCTION_WITH_RTX_M_N_NO_F(NAME, RTX, UNSPEC) FUNCTION \
+  (NAME, unspec_based_mve_function_exact_insn, \
+   (RTX, RTX, UNKNOWN, \
+UNSPEC##_N_S, UNSPEC##_N_U, -1,\
+UNSPEC##_M_S, UNSPEC##_M_U, -1,\
+UNSPEC##_M_N_S, UNSPEC##_M_N_U, -1))
+
   /* Helper for builtins with RTX codes, _m predicated and _n overrides.  */
 #define FUNCTION_WITH_RTX_M_N_NO_N_F(NAME, RTX, UNSPEC) FUNCTION   \
   (NAME, unspec_based_mve_function_exact_insn, \
@@ -306,6 +315,7 @@ FUNCTION_WITHOUT_N_NO_F (vmovnbq, VMOVNBQ)
 FUNCTION_WITHOUT_N_NO_F (vmovntq, VMOVNTQ)
 FUNCTION_WITHOUT_N_NO_F (vmulhq, VMULHQ)
 FUNCTION_WITH_RTX_M_N (vmulq, MULT, VMULQ)
+FUNCTION_WITH_RTX_M_N_NO_F (vmvnq, NOT, VMVNQ)
 FUNCTION (vnegq, unspec_based_mve_function_exact_insn, (NEG, NEG, NEG, -1, -1, 
-1, VNEGQ_M_S, -1, VNEGQ_M_F, -1, -1, -1))
 FUNCTION_WITH_RTX_M_N_NO_N_F (vorrq, IOR, VORRQ)
 FUNCTION_WITHOUT_N_NO_U_F (vqabsq, VQABSQ)
diff --git a/gcc/config/arm/arm-mve-builtins-base.def 
b/gcc/config/arm/arm-mve-builtins-base.def
index e53cb2c1992..141d057924e 100644
--- a/gcc/config/arm/arm-mve-builtins-base.def
+++ b/gcc/config/arm/arm-mve-builtins-base.def
@@ -75,6 +75,7 @@ DEF_MVE_FUNCTION (vmovnbq, binary_move_narrow, integer_16_32, 
m_or_none)
 DEF_MVE_FUNCTION (vmovntq, binary_move_narrow, integer_16_32, m_or_none)
 DEF_MVE_FUNCTION (vmulhq, binary, all_integer, mx_or_none)
 DEF_MVE_FUNCTION (vmulq, binary_opt_n, all_integer, mx_or_none)
+DEF_MVE_FUNCTION (vmvnq, mvn, all_integer, mx_or_none)
 D

[PATCH 21/26] arm: [MVE intrinsics] add ternary_lshift shape

2023-05-12 Thread Christophe Lyon via Gcc-patches
This patch adds the ternary_lshift shape description.

2022-12-12  Christophe Lyon  

gcc/
* config/arm/arm-mve-builtins-shapes.cc (ternary_lshift): New.
* config/arm/arm-mve-builtins-shapes.h (ternary_lshift): New.
---
 gcc/config/arm/arm-mve-builtins-shapes.cc | 38 +++
 gcc/config/arm/arm-mve-builtins-shapes.h  |  1 +
 2 files changed, 39 insertions(+)

diff --git a/gcc/config/arm/arm-mve-builtins-shapes.cc 
b/gcc/config/arm/arm-mve-builtins-shapes.cc
index 012cf3ef4c0..a8e94b4f8f8 100644
--- a/gcc/config/arm/arm-mve-builtins-shapes.cc
+++ b/gcc/config/arm/arm-mve-builtins-shapes.cc
@@ -1372,6 +1372,44 @@ struct ternary_def : public overloaded_base<0>
 };
 SHAPE (ternary)
 
+/* _t vfoo[_t0](_t, _t, const int)
+
+   i.e. ternary operations that operate on a pair of vectors of the
+   same type as the destination, and take a third integer argument.
+
+   Check that 'imm' is in the [0..#bits-1] range.
+
+   Example: vsliq.
+   int16x8_t [__arm_]vsliq[_n_s16](int16x8_t a, int16x8_t b, const int imm)
+   int16x8_t [__arm_]vsliq_m[_n_s16](int16x8_t a, int16x8_t b, const int imm, 
mve_pred16_t p)  */
+struct ternary_lshift_def : public overloaded_base<0>
+{
+  void
+  build (function_builder &b, const function_group_info &group,
+bool preserve_user_namespace) const override
+  {
+b.add_overloaded_functions (group, MODE_n, preserve_user_namespace);
+build_all (b, "v0,v0,v0,ss32", group, MODE_n, preserve_user_namespace);
+  }
+
+  tree
+  resolve (function_resolver &r) const override
+  {
+return r.resolve_uniform (2, 1);
+  }
+
+  bool
+  check (function_checker &c) const override
+  {
+if (c.mode_suffix_id != MODE_n)
+  return true;
+
+unsigned int bits = c.type_suffix (0).element_bits;
+return c.require_immediate_range (2, 0, bits - 1);
+  }
+};
+SHAPE (ternary_lshift)
+
 /* _t vfoo[_n_t0](_t, _t, _t)
 
i.e. the standard shape for ternary operations that operate on a
diff --git a/gcc/config/arm/arm-mve-builtins-shapes.h 
b/gcc/config/arm/arm-mve-builtins-shapes.h
index 6e818092a87..73375186d82 100644
--- a/gcc/config/arm/arm-mve-builtins-shapes.h
+++ b/gcc/config/arm/arm-mve-builtins-shapes.h
@@ -61,6 +61,7 @@ namespace arm_mve
 extern const function_shape *const inherent;
 extern const function_shape *const mvn;
 extern const function_shape *const ternary;
+extern const function_shape *const ternary_lshift;
 extern const function_shape *const ternary_n;
 extern const function_shape *const ternary_opt_n;
 extern const function_shape *const unary;
-- 
2.34.1



[PATCH 17/26] arm: [MVE intrinsics] rework vfmaq vfmasq vfmsq

2023-05-12 Thread Christophe Lyon via Gcc-patches
Implement vfmaq, vfmasq, vfmsq using the new MVE builtins framework.

2022-12-12  Christophe Lyon  

gcc/
* config/arm/arm-mve-builtins-base.cc (vfmaq, vfmasq, vfmsq): New.
* config/arm/arm-mve-builtins-base.def (vfmaq, vfmasq, vfmsq): New.
* config/arm/arm-mve-builtins-base.h (vfmaq, vfmasq, vfmsq): New.
* config/arm/arm-mve-builtins.cc
(function_instance::has_inactive_argument): Handle vfmaq, vfmasq,
vfmsq.
* config/arm/arm_mve.h (vfmaq): Remove.
(vfmasq): Remove.
(vfmsq): Remove.
(vfmaq_m): Remove.
(vfmasq_m): Remove.
(vfmsq_m): Remove.
(vfmaq_f16): Remove.
(vfmaq_n_f16): Remove.
(vfmasq_n_f16): Remove.
(vfmsq_f16): Remove.
(vfmaq_f32): Remove.
(vfmaq_n_f32): Remove.
(vfmasq_n_f32): Remove.
(vfmsq_f32): Remove.
(vfmaq_m_f32): Remove.
(vfmaq_m_f16): Remove.
(vfmaq_m_n_f32): Remove.
(vfmaq_m_n_f16): Remove.
(vfmasq_m_n_f32): Remove.
(vfmasq_m_n_f16): Remove.
(vfmsq_m_f32): Remove.
(vfmsq_m_f16): Remove.
(__arm_vfmaq_f16): Remove.
(__arm_vfmaq_n_f16): Remove.
(__arm_vfmasq_n_f16): Remove.
(__arm_vfmsq_f16): Remove.
(__arm_vfmaq_f32): Remove.
(__arm_vfmaq_n_f32): Remove.
(__arm_vfmasq_n_f32): Remove.
(__arm_vfmsq_f32): Remove.
(__arm_vfmaq_m_f32): Remove.
(__arm_vfmaq_m_f16): Remove.
(__arm_vfmaq_m_n_f32): Remove.
(__arm_vfmaq_m_n_f16): Remove.
(__arm_vfmasq_m_n_f32): Remove.
(__arm_vfmasq_m_n_f16): Remove.
(__arm_vfmsq_m_f32): Remove.
(__arm_vfmsq_m_f16): Remove.
(__arm_vfmaq): Remove.
(__arm_vfmasq): Remove.
(__arm_vfmsq): Remove.
(__arm_vfmaq_m): Remove.
(__arm_vfmasq_m): Remove.
(__arm_vfmsq_m): Remove.
---
 gcc/config/arm/arm-mve-builtins-base.cc  |   3 +
 gcc/config/arm/arm-mve-builtins-base.def |   3 +
 gcc/config/arm/arm-mve-builtins-base.h   |   3 +
 gcc/config/arm/arm-mve-builtins.cc   |   3 +
 gcc/config/arm/arm_mve.h | 292 ---
 5 files changed, 12 insertions(+), 292 deletions(-)

diff --git a/gcc/config/arm/arm-mve-builtins-base.cc 
b/gcc/config/arm/arm-mve-builtins-base.cc
index 6286d4a147a..91d397d6208 100644
--- a/gcc/config/arm/arm-mve-builtins-base.cc
+++ b/gcc/config/arm/arm-mve-builtins-base.cc
@@ -273,6 +273,9 @@ FUNCTION (vcmphiq, 
unspec_based_mve_function_exact_insn_vcmp, (UNKNOWN, GTU, UNK
 FUNCTION_WITHOUT_M_N (vcreateq, VCREATEQ)
 FUNCTION_ONLY_N (vdupq, VDUPQ)
 FUNCTION_WITH_RTX_M (veorq, XOR, VEORQ)
+FUNCTION (vfmaq, unspec_mve_function_exact_insn, (-1, -1, VFMAQ_F, -1, -1, 
VFMAQ_N_F, -1, -1, VFMAQ_M_F, -1, -1, VFMAQ_M_N_F))
+FUNCTION (vfmasq, unspec_mve_function_exact_insn, (-1, -1, -1, -1, -1, 
VFMASQ_N_F, -1, -1, -1, -1, -1, VFMASQ_M_N_F))
+FUNCTION (vfmsq, unspec_mve_function_exact_insn, (-1, -1, VFMSQ_F, -1, -1, -1, 
-1, -1, VFMSQ_M_F, -1, -1, -1))
 FUNCTION_WITH_M_N_NO_F (vhaddq, VHADDQ)
 FUNCTION_WITH_M_N_NO_F (vhsubq, VHSUBQ)
 FUNCTION_PRED_P_S (vmaxavq, VMAXAVQ)
diff --git a/gcc/config/arm/arm-mve-builtins-base.def 
b/gcc/config/arm/arm-mve-builtins-base.def
index 141d057924e..8894f9e5372 100644
--- a/gcc/config/arm/arm-mve-builtins-base.def
+++ b/gcc/config/arm/arm-mve-builtins-base.def
@@ -158,6 +158,9 @@ DEF_MVE_FUNCTION (vcmpneq, cmp, all_float, m_or_none)
 DEF_MVE_FUNCTION (vcreateq, create, all_float, none)
 DEF_MVE_FUNCTION (vdupq, unary_n, all_float, mx_or_none)
 DEF_MVE_FUNCTION (veorq, binary, all_float, mx_or_none)
+DEF_MVE_FUNCTION (vfmaq, ternary_opt_n, all_float, m_or_none)
+DEF_MVE_FUNCTION (vfmasq, ternary_n, all_float, m_or_none)
+DEF_MVE_FUNCTION (vfmsq, ternary, all_float, m_or_none)
 DEF_MVE_FUNCTION (vmaxnmaq, binary, all_float, m_or_none)
 DEF_MVE_FUNCTION (vmaxnmavq, binary_maxvminv, all_float, p_or_none)
 DEF_MVE_FUNCTION (vmaxnmq, binary, all_float, mx_or_none)
diff --git a/gcc/config/arm/arm-mve-builtins-base.h 
b/gcc/config/arm/arm-mve-builtins-base.h
index b1783a2c917..dabb52e800d 100644
--- a/gcc/config/arm/arm-mve-builtins-base.h
+++ b/gcc/config/arm/arm-mve-builtins-base.h
@@ -46,6 +46,9 @@ extern const function_base *const vcmpneq;
 extern const function_base *const vcreateq;
 extern const function_base *const vdupq;
 extern const function_base *const veorq;
+extern const function_base *const vfmaq;
+extern const function_base *const vfmasq;
+extern const function_base *const vfmsq;
 extern const function_base *const vhaddq;
 extern const function_base *const vhsubq;
 extern const function_base *const vmaxaq;
diff --git a/gcc/config/arm/arm-mve-builtins.cc 
b/gcc/config/arm/arm-mve-builtins.cc
index c157a3ec8a3..87fcbc31f2f 100644
--- a/gcc/config/arm/arm-mve-builtins.cc
+++ b/gcc/config/arm/arm-mve-builtins.cc
@@ -678,6 +678,9 @@ function_instance::has_inactive_argument () const
   || base == f

[PATCH 20/26] arm: [MVE intrinsics] rework vpselq

2023-05-12 Thread Christophe Lyon via Gcc-patches
Implement vpselq using the new MVE builtins framework.

2022-12-12  Christophe Lyon  

gcc/
* config/arm/arm-mve-builtins-base.cc (vpselq): New.
* config/arm/arm-mve-builtins-base.def (vpselq): New.
* config/arm/arm-mve-builtins-base.h (vpselq): New.
* config/arm/arm_mve.h (vpselq): Remove.
(vpselq_u8): Remove.
(vpselq_s8): Remove.
(vpselq_u16): Remove.
(vpselq_s16): Remove.
(vpselq_u32): Remove.
(vpselq_s32): Remove.
(vpselq_u64): Remove.
(vpselq_s64): Remove.
(vpselq_f16): Remove.
(vpselq_f32): Remove.
(__arm_vpselq_u8): Remove.
(__arm_vpselq_s8): Remove.
(__arm_vpselq_u16): Remove.
(__arm_vpselq_s16): Remove.
(__arm_vpselq_u32): Remove.
(__arm_vpselq_s32): Remove.
(__arm_vpselq_u64): Remove.
(__arm_vpselq_s64): Remove.
(__arm_vpselq_f16): Remove.
(__arm_vpselq_f32): Remove.
(__arm_vpselq): Remove.
---
 gcc/config/arm/arm-mve-builtins-base.cc  |   1 +
 gcc/config/arm/arm-mve-builtins-base.def |   2 +
 gcc/config/arm/arm-mve-builtins-base.h   |   1 +
 gcc/config/arm/arm_mve.h | 177 ---
 4 files changed, 4 insertions(+), 177 deletions(-)

diff --git a/gcc/config/arm/arm-mve-builtins-base.cc 
b/gcc/config/arm/arm-mve-builtins-base.cc
index 91d397d6208..b1440ca489e 100644
--- a/gcc/config/arm/arm-mve-builtins-base.cc
+++ b/gcc/config/arm/arm-mve-builtins-base.cc
@@ -320,6 +320,7 @@ FUNCTION_WITHOUT_N_NO_F (vmulhq, VMULHQ)
 FUNCTION_WITH_RTX_M_N (vmulq, MULT, VMULQ)
 FUNCTION_WITH_RTX_M_N_NO_F (vmvnq, NOT, VMVNQ)
 FUNCTION (vnegq, unspec_based_mve_function_exact_insn, (NEG, NEG, NEG, -1, -1, 
-1, VNEGQ_M_S, -1, VNEGQ_M_F, -1, -1, -1))
+FUNCTION_WITHOUT_M_N (vpselq, VPSELQ)
 FUNCTION_WITH_RTX_M_N_NO_N_F (vorrq, IOR, VORRQ)
 FUNCTION_WITHOUT_N_NO_U_F (vqabsq, VQABSQ)
 FUNCTION_WITH_M_N_NO_F (vqaddq, VQADDQ)
diff --git a/gcc/config/arm/arm-mve-builtins-base.def 
b/gcc/config/arm/arm-mve-builtins-base.def
index 8894f9e5372..de4c473f618 100644
--- a/gcc/config/arm/arm-mve-builtins-base.def
+++ b/gcc/config/arm/arm-mve-builtins-base.def
@@ -78,6 +78,7 @@ DEF_MVE_FUNCTION (vmulq, binary_opt_n, all_integer, 
mx_or_none)
 DEF_MVE_FUNCTION (vmvnq, mvn, all_integer, mx_or_none)
 DEF_MVE_FUNCTION (vnegq, unary, all_signed, mx_or_none)
 DEF_MVE_FUNCTION (vorrq, binary_orrq, all_integer, mx_or_none)
+DEF_MVE_FUNCTION (vpselq, vpsel, all_integer_with_64, none)
 DEF_MVE_FUNCTION (vqabsq, unary, all_signed, m_or_none)
 DEF_MVE_FUNCTION (vqaddq, binary_opt_n, all_integer, m_or_none)
 DEF_MVE_FUNCTION (vqdmladhq, ternary, all_signed, m_or_none)
@@ -172,6 +173,7 @@ DEF_MVE_FUNCTION (vminnmvq, binary_maxvminv, all_float, 
p_or_none)
 DEF_MVE_FUNCTION (vmulq, binary_opt_n, all_float, mx_or_none)
 DEF_MVE_FUNCTION (vnegq, unary, all_float, mx_or_none)
 DEF_MVE_FUNCTION (vorrq, binary_orrq, all_float, mx_or_none)
+DEF_MVE_FUNCTION (vpselq, vpsel, all_float, none)
 DEF_MVE_FUNCTION (vreinterpretq, unary_convert, reinterpret_float, none)
 DEF_MVE_FUNCTION (vrev32q, unary, float16, mx_or_none)
 DEF_MVE_FUNCTION (vrev64q, unary, all_float, mx_or_none)
diff --git a/gcc/config/arm/arm-mve-builtins-base.h 
b/gcc/config/arm/arm-mve-builtins-base.h
index dabb52e800d..ec5b4fbffb9 100644
--- a/gcc/config/arm/arm-mve-builtins-base.h
+++ b/gcc/config/arm/arm-mve-builtins-base.h
@@ -94,6 +94,7 @@ extern const function_base *const vmulq;
 extern const function_base *const vmvnq;
 extern const function_base *const vnegq;
 extern const function_base *const vorrq;
+extern const function_base *const vpselq;
 extern const function_base *const vqabsq;
 extern const function_base *const vqaddq;
 extern const function_base *const vqdmladhq;
diff --git a/gcc/config/arm/arm_mve.h b/gcc/config/arm/arm_mve.h
index 747cdc3509d..72b50764963 100644
--- a/gcc/config/arm/arm_mve.h
+++ b/gcc/config/arm/arm_mve.h
@@ -54,7 +54,6 @@
 #define vmullbq_poly(__a, __b) __arm_vmullbq_poly(__a, __b)
 #define vbicq_m_n(__a, __imm, __p) __arm_vbicq_m_n(__a, __imm, __p)
 #define vshlcq(__a, __b, __imm) __arm_vshlcq(__a, __b, __imm)
-#define vpselq(__a, __b, __p) __arm_vpselq(__a, __b, __p)
 #define vsriq(__a, __b, __imm) __arm_vsriq(__a, __b, __imm)
 #define vsliq(__a, __b, __imm) __arm_vsliq(__a, __b, __imm)
 #define vsriq_m(__a, __b, __imm, __p) __arm_vsriq_m(__a, __b, __imm, __p)
@@ -341,26 +340,18 @@
 #define vshlcq_u16(__a,  __b,  __imm) __arm_vshlcq_u16(__a,  __b,  __imm)
 #define vshlcq_s32(__a,  __b,  __imm) __arm_vshlcq_s32(__a,  __b,  __imm)
 #define vshlcq_u32(__a,  __b,  __imm) __arm_vshlcq_u32(__a,  __b,  __imm)
-#define vpselq_u8(__a, __b, __p) __arm_vpselq_u8(__a, __b, __p)
-#define vpselq_s8(__a, __b, __p) __arm_vpselq_s8(__a, __b, __p)
 #define vsriq_n_u8(__a, __b,  __imm) __arm_vsriq_n_u8(__a, __b,  __imm)
 #define vsliq_n_u8(__a, __b,  __imm) __arm_vsliq_n_u8(__a, __b,  __imm)
 #define vsriq_n_s8(__a, __b,  __imm) __arm

[PATCH 25/26] arm: [MVE intrinsics] factorize vsriq

2023-05-12 Thread Christophe Lyon via Gcc-patches
Factorize vsriq builtins so that they use parameterized names.

2022-12-12  Christophe Lyon  

gcc/
* config/arm/iterators.md (mve_insn): Add vsri.
* config/arm/mve.md (mve_vsriq_n_<supf><mode>): Rename into ...
(@mve_<mve_insn>q_n_<supf><mode>): ... this.
(mve_vsriq_m_n_<supf><mode>): Rename into ...
(@mve_<mve_insn>q_m_n_<supf><mode>): ... this.
---
 gcc/config/arm/iterators.md | 2 ++
 gcc/config/arm/mve.md   | 8 
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index 7e7219033cf..597c1dae640 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -1183,6 +1183,8 @@ (define_int_attr mve_insn [
 (VSHRQ_N_S "vshr") (VSHRQ_N_U "vshr")
 (VSLIQ_M_N_S "vsli") (VSLIQ_M_N_U "vsli")
 (VSLIQ_N_S "vsli") (VSLIQ_N_U "vsli")
+(VSRIQ_M_N_S "vsri") (VSRIQ_M_N_U "vsri")
+(VSRIQ_N_S "vsri") (VSRIQ_N_U "vsri")
 (VSUBQ_M_N_S "vsub") (VSUBQ_M_N_U "vsub") (VSUBQ_M_N_F "vsub")
 (VSUBQ_M_S "vsub") (VSUBQ_M_U "vsub") (VSUBQ_M_F "vsub")
 (VSUBQ_N_S "vsub") (VSUBQ_N_U "vsub") (VSUBQ_N_F "vsub")
diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index a1c2cad9d2e..85d701a66b3 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
@@ -2074,7 +2074,7 @@ (define_insn "@mve_q_n_"
 ;;
 ;; [vsriq_n_u, vsriq_n_s])
 ;;
-(define_insn "mve_vsriq_n_"
+(define_insn "@mve_q_n_"
   [
(set (match_operand:MVE_2 0 "s_register_operand" "=w")
(unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0")
@@ -2083,7 +2083,7 @@ (define_insn "mve_vsriq_n_"
 VSRIQ_N))
   ]
   "TARGET_HAVE_MVE"
-  "vsri.%#\t%q0, %q2, %3"
+  ".%#\t%q0, %q2, %3"
   [(set_attr "type" "mve_move")
 ])
 
@@ -2641,7 +2641,7 @@ (define_insn "@mve_q_m_n_"
 ;;
 ;; [vsriq_m_n_s, vsriq_m_n_u])
 ;;
-(define_insn "mve_vsriq_m_n_"
+(define_insn "@mve_q_m_n_"
   [
(set (match_operand:MVE_2 0 "s_register_operand" "=w")
(unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0")
@@ -2651,7 +2651,7 @@ (define_insn "mve_vsriq_m_n_"
 VSRIQ_M_N))
   ]
   "TARGET_HAVE_MVE"
-  "vpst\;vsrit.%#\t%q0, %q2, %3"
+  "vpst\;t.%#\t%q0, %q2, %3"
   [(set_attr "type" "mve_move")
(set_attr "length" "8")])
 
-- 
2.34.1



[PATCH 16/26] arm: [MVE intrinsics] factorize vfmaq vfmsq vfmasq

2023-05-12 Thread Christophe Lyon via Gcc-patches
Factorize vfmaq, vfmsq and vfmasq builtins so that they use parameterized names.

2022-12-12  Christophe Lyon  

gcc/
* config/arm/iterators.md (MVE_FP_M_BINARY): Add VFMAQ_M_F,
VFMSQ_M_F.
(MVE_FP_M_N_BINARY): Add VFMAQ_M_N_F, VFMASQ_M_N_F.
(MVE_VFMxQ_F, MVE_VFMAxQ_N_F): New.
(mve_insn): Add vfma, vfmas, vfms.
* config/arm/mve.md (mve_vfmaq_f<mode>, mve_vfmsq_f<mode>): Merge
into ...
(@mve_<mve_insn>q_f<mode>): ... this.
(mve_vfmaq_n_f<mode>, mve_vfmasq_n_f<mode>): Merge into ...
(@mve_<mve_insn>q_n_f<mode>): ... this.
(mve_vfmaq_m_f<mode>, mve_vfmsq_m_f<mode>): Merge into
@mve_<mve_insn>q_m_f<mode>.
(mve_vfmaq_m_n_f<mode>, mve_vfmasq_m_n_f<mode>): Merge into
@mve_<mve_insn>q_m_n_f<mode>.
---
 gcc/config/arm/iterators.md |  20 ++
 gcc/config/arm/mve.md   | 123 +---
 2 files changed, 35 insertions(+), 108 deletions(-)

diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index 7fbfea49ff3..022744f04d9 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -577,6 +577,8 @@ (define_int_iterator MVE_SHRN_M_N [
 (define_int_iterator MVE_FP_M_BINARY   [
 VABDQ_M_F
 VADDQ_M_F
+VFMAQ_M_F
+VFMSQ_M_F
 VMAXNMQ_M_F
 VMINNMQ_M_F
 VMULQ_M_F
@@ -592,6 +594,8 @@ (define_int_iterator MVE_FP_M_BINARY_LOGIC   [
 
 (define_int_iterator MVE_FP_M_N_BINARY [
 VADDQ_M_N_F
+VFMAQ_M_N_F
+VFMASQ_M_N_F
 VMULQ_M_N_F
 VSUBQ_M_N_F
 ])
@@ -659,6 +663,14 @@ (define_int_iterator MVE_CMP_M_N_F [
 VCMPNEQ_M_N_F
 ])
 
+(define_int_iterator MVE_VFMxQ_F [
+VFMAQ_F VFMSQ_F
+])
+
+(define_int_iterator MVE_VFMAxQ_N_F [
+VFMAQ_N_F VFMASQ_N_F
+])
+
 (define_int_iterator MVE_VMAXVQ_VMINVQ [
 VMAXAVQ_S
 VMAXVQ_S VMAXVQ_U
@@ -917,6 +929,14 @@ (define_int_attr mve_insn [
 (VDUPQ_M_N_S "vdup") (VDUPQ_M_N_U "vdup") (VDUPQ_M_N_F "vdup")
 (VDUPQ_N_S "vdup") (VDUPQ_N_U "vdup") (VDUPQ_N_F "vdup")
 (VEORQ_M_S "veor") (VEORQ_M_U "veor") (VEORQ_M_F "veor")
+(VFMAQ_F "vfma")
+(VFMAQ_M_F "vfma")
+(VFMAQ_M_N_F "vfma")
+(VFMAQ_N_F "vfma")
+(VFMASQ_M_N_F "vfmas")
+(VFMASQ_N_F "vfmas")
+(VFMSQ_F "vfms")
+(VFMSQ_M_F "vfms")
 (VHADDQ_M_N_S "vhadd") (VHADDQ_M_N_U "vhadd")
 (VHADDQ_M_S "vhadd") (VHADDQ_M_U "vhadd")
 (VHADDQ_N_S "vhadd") (VHADDQ_N_U "vhadd")
diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index 57ba65d3c76..b87798730a2 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
@@ -2246,65 +2246,36 @@ (define_insn "@mve_q_m_n_f"
(set_attr "length""8")])
 
 ;;
-;; [vfmaq_f])
+;; [vfmaq_f]
+;; [vfmsq_f]
 ;;
-(define_insn "mve_vfmaq_f"
+(define_insn "@mve_q_f"
   [
(set (match_operand:MVE_0 0 "s_register_operand" "=w")
(unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0")
   (match_operand:MVE_0 2 "s_register_operand" "w")
   (match_operand:MVE_0 3 "s_register_operand" "w")]
-VFMAQ_F))
+MVE_VFMxQ_F))
   ]
   "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
-  "vfma.f%# %q0, %q2, %q3"
+  ".f%#\t%q0, %q2, %q3"
   [(set_attr "type" "mve_move")
 ])
 
 ;;
-;; [vfmaq_n_f])
+;; [vfmaq_n_f]
+;; [vfmasq_n_f]
 ;;
-(define_insn "mve_vfmaq_n_f"
+(define_insn "@mve_q_n_f"
   [
(set (match_operand:MVE_0 0 "s_register_operand" "=w")
(unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0")
   (match_operand:MVE_0 2 "s_register_operand" "w")
   (match_operand: 3 "s_register_operand" "r")]
-VFMAQ_N_F))
+MVE_VFMAxQ_N_F))
   ]
   "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
-  "vfma.f%# %q0, %q2, %3"
-  [(set_attr "type" "mve_move")
-])
-
-;;
-;; [vfmasq_n_f])
-;;
-(define_insn "mve_vfmasq_n_f"
-  [
-   (set (match_operand:MVE_0 0 "s_register_operand" "=w")
-   (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0")
-  (match_operand:MVE_0 2 "s_register_operand" "w")
-  (match_operand: 3 "s_register_operand" "r")]
-VFMASQ_N_F))
-  ]
-  "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
-  "vfmas.f%#%q0, %q2, %3"
-  [(set_attr "type" "mve_move")
-])
-;;
-;; [vfmsq_f])
-;;
-(define_insn "mve_vfmsq_f"
-  [
-   (set (match_operand:MVE_0 0 "s_register_operand" "=w")
-   (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0")
-  (match_operand:MVE_0 2 "s_register_operand" "w")
-  (match_operand:MVE_0 3 "s_register

[PATCH 26/26] arm: [MVE intrinsics] rework vsriq

2023-05-12 Thread Christophe Lyon via Gcc-patches
Implement vsriq using the new MVE builtins framework.

2022-12-12  Christophe Lyon  

gcc/
* config/arm/arm-mve-builtins-base.cc (vsriq): New.
* config/arm/arm-mve-builtins-base.def (vsriq): New.
* config/arm/arm-mve-builtins-base.h (vsriq): New.
* config/arm/arm-mve-builtins.cc
(function_instance::has_inactive_argument): Handle vsriq.
* config/arm/arm_mve.h (vsriq): Remove.
(vsriq_m): Remove.
(vsriq_n_u8): Remove.
(vsriq_n_s8): Remove.
(vsriq_n_u16): Remove.
(vsriq_n_s16): Remove.
(vsriq_n_u32): Remove.
(vsriq_n_s32): Remove.
(vsriq_m_n_s8): Remove.
(vsriq_m_n_u8): Remove.
(vsriq_m_n_s16): Remove.
(vsriq_m_n_u16): Remove.
(vsriq_m_n_s32): Remove.
(vsriq_m_n_u32): Remove.
(__arm_vsriq_n_u8): Remove.
(__arm_vsriq_n_s8): Remove.
(__arm_vsriq_n_u16): Remove.
(__arm_vsriq_n_s16): Remove.
(__arm_vsriq_n_u32): Remove.
(__arm_vsriq_n_s32): Remove.
(__arm_vsriq_m_n_s8): Remove.
(__arm_vsriq_m_n_u8): Remove.
(__arm_vsriq_m_n_s16): Remove.
(__arm_vsriq_m_n_u16): Remove.
(__arm_vsriq_m_n_s32): Remove.
(__arm_vsriq_m_n_u32): Remove.
(__arm_vsriq): Remove.
(__arm_vsriq_m): Remove.
---
 gcc/config/arm/arm-mve-builtins-base.cc  |   1 +
 gcc/config/arm/arm-mve-builtins-base.def |   1 +
 gcc/config/arm/arm-mve-builtins-base.h   |   1 +
 gcc/config/arm/arm-mve-builtins.cc   |   3 +-
 gcc/config/arm/arm_mve.h | 212 ---
 5 files changed, 5 insertions(+), 213 deletions(-)

diff --git a/gcc/config/arm/arm-mve-builtins-base.cc 
b/gcc/config/arm/arm-mve-builtins-base.cc
index 873c7d365f3..af02397f1c4 100644
--- a/gcc/config/arm/arm-mve-builtins-base.cc
+++ b/gcc/config/arm/arm-mve-builtins-base.cc
@@ -388,6 +388,7 @@ FUNCTION_ONLY_N_NO_F (vshrnbq, VSHRNBQ)
 FUNCTION_ONLY_N_NO_F (vshrntq, VSHRNTQ)
 FUNCTION_ONLY_N_NO_F (vshrq, VSHRQ)
 FUNCTION_ONLY_N_NO_F (vsliq, VSLIQ)
+FUNCTION_ONLY_N_NO_F (vsriq, VSRIQ)
 FUNCTION_WITH_RTX_M_N (vsubq, MINUS, VSUBQ)
 FUNCTION (vuninitializedq, vuninitializedq_impl,)
 
diff --git a/gcc/config/arm/arm-mve-builtins-base.def 
b/gcc/config/arm/arm-mve-builtins-base.def
index 2d1b87b90c3..ee08d063407 100644
--- a/gcc/config/arm/arm-mve-builtins-base.def
+++ b/gcc/config/arm/arm-mve-builtins-base.def
@@ -141,6 +141,7 @@ DEF_MVE_FUNCTION (vshrnbq, binary_rshift_narrow, 
integer_16_32, m_or_none)
 DEF_MVE_FUNCTION (vshrntq, binary_rshift_narrow, integer_16_32, m_or_none)
 DEF_MVE_FUNCTION (vshrq, binary_rshift, all_integer, mx_or_none)
 DEF_MVE_FUNCTION (vsliq, ternary_lshift, all_integer, m_or_none)
+DEF_MVE_FUNCTION (vsriq, ternary_rshift, all_integer, m_or_none)
 DEF_MVE_FUNCTION (vsubq, binary_opt_n, all_integer, mx_or_none)
 DEF_MVE_FUNCTION (vuninitializedq, inherent, all_integer_with_64, none)
 #undef REQUIRES_FLOAT
diff --git a/gcc/config/arm/arm-mve-builtins-base.h 
b/gcc/config/arm/arm-mve-builtins-base.h
index 84fff0f6d0e..942c8587446 100644
--- a/gcc/config/arm/arm-mve-builtins-base.h
+++ b/gcc/config/arm/arm-mve-builtins-base.h
@@ -161,6 +161,7 @@ extern const function_base *const vshrnbq;
 extern const function_base *const vshrntq;
 extern const function_base *const vshrq;
 extern const function_base *const vsliq;
+extern const function_base *const vsriq;
 extern const function_base *const vsubq;
 extern const function_base *const vuninitializedq;
 
diff --git a/gcc/config/arm/arm-mve-builtins.cc 
b/gcc/config/arm/arm-mve-builtins.cc
index f5056bdd1bb..7033e41a571 100644
--- a/gcc/config/arm/arm-mve-builtins.cc
+++ b/gcc/config/arm/arm-mve-builtins.cc
@@ -720,7 +720,8 @@ function_instance::has_inactive_argument () const
   || base == functions::vrshrntq
   || base == functions::vshrnbq
   || base == functions::vshrntq
-  || base == functions::vsliq)
+  || base == functions::vsliq
+  || base == functions::vsriq)
 return false;
 
   return true;
diff --git a/gcc/config/arm/arm_mve.h b/gcc/config/arm/arm_mve.h
index 72177f9c53e..1774e6eca2b 100644
--- a/gcc/config/arm/arm_mve.h
+++ b/gcc/config/arm/arm_mve.h
@@ -54,8 +54,6 @@
 #define vmullbq_poly(__a, __b) __arm_vmullbq_poly(__a, __b)
 #define vbicq_m_n(__a, __imm, __p) __arm_vbicq_m_n(__a, __imm, __p)
 #define vshlcq(__a, __b, __imm) __arm_vshlcq(__a, __b, __imm)
-#define vsriq(__a, __b, __imm) __arm_vsriq(__a, __b, __imm)
-#define vsriq_m(__a, __b, __imm, __p) __arm_vsriq_m(__a, __b, __imm, __p)
 #define vbicq_m(__inactive, __a, __b, __p) __arm_vbicq_m(__inactive, __a, __b, 
__p)
 #define vcaddq_rot270_m(__inactive, __a, __b, __p) 
__arm_vcaddq_rot270_m(__inactive, __a, __b, __p)
 #define vcaddq_rot90_m(__inactive, __a, __b, __p) 
__arm_vcaddq_rot90_m(__inactive, __a, __b, __p)
@@ -338,12 +336,6 @@
 #define vshlcq_u16(__a,  __b,  __imm) __arm_vshlcq_u16(__a,  __b,  __imm)
 #define vshlcq_s32(__a,  __b,  __im

[PATCH v3] MIPS: add speculation_barrier support

2023-05-12 Thread YunQiang Su
speculation_barrier for MIPS needs sync+jr.hb (r2+),
so we implement __speculation_barrier in libgcc, like arm32 does.

gcc/ChangeLog:
* config/mips/mips-protos.h
(mips_emit_speculation_barrier_function): New prototype.
* config/mips/mips.cc (speculation_barrier_libfunc): New static
variable.
(mips_init_libfuncs): Initialize it.
(mips_emit_speculation_barrier_function): New function.
* config/mips/mips.md (speculation_barrier): Call
mips_emit_speculation_barrier_function.
* configure.ac: error if gas doesn't accept ssnop for mips1.
* configure: regenerated.
* doc/install.texi: documents mips requires binutils 2.21+.

libgcc/ChangeLog:
* config/mips/lib1funcs.S: New file.
define __speculation_barrier and include mips16.S.
* config/mips/t-mips: define LIB1ASMSRC as mips/lib1funcs.S.
define LIB1ASMFUNCS as _speculation_barrier.
set version info for __speculation_barrier.
* config/mips/libgcc-mips.ver: New file.
* config/mips/t-mips16: don't define LIB1ASMSRC as mips16.S is
included in lib1funcs.S now.
---
 gcc/config/mips/mips-protos.h  |  2 +
 gcc/config/mips/mips.cc| 13 ++
 gcc/config/mips/mips.md| 12 ++
 gcc/configure  | 32 +++
 gcc/configure.ac   |  7 
 gcc/doc/install.texi   |  2 +
 libgcc/config/mips/lib1funcs.S | 63 ++
 libgcc/config/mips/libgcc-mips.ver | 21 ++
 libgcc/config/mips/t-mips  |  7 
 libgcc/config/mips/t-mips16|  3 +-
 10 files changed, 160 insertions(+), 2 deletions(-)
 create mode 100644 libgcc/config/mips/lib1funcs.S
 create mode 100644 libgcc/config/mips/libgcc-mips.ver

diff --git a/gcc/config/mips/mips-protos.h b/gcc/config/mips/mips-protos.h
index 20483469105..da7902c235b 100644
--- a/gcc/config/mips/mips-protos.h
+++ b/gcc/config/mips/mips-protos.h
@@ -388,4 +388,6 @@ extern void mips_register_frame_header_opt (void);
 extern void mips_expand_vec_cond_expr (machine_mode, machine_mode, rtx *);
 extern void mips_expand_vec_cmp_expr (rtx *);
 
+extern void mips_emit_speculation_barrier_function (void);
+
 #endif /* ! GCC_MIPS_PROTOS_H */
diff --git a/gcc/config/mips/mips.cc b/gcc/config/mips/mips.cc
index ca822758b41..139707fda34 100644
--- a/gcc/config/mips/mips.cc
+++ b/gcc/config/mips/mips.cc
@@ -13611,6 +13611,9 @@ mips_autovectorize_vector_modes (vector_modes *modes, 
bool)
   return 0;
 }
 
+
+static GTY(()) rtx speculation_barrier_libfunc;
+
 /* Implement TARGET_INIT_LIBFUNCS.  */
 
 static void
@@ -13680,6 +13683,7 @@ mips_init_libfuncs (void)
   synchronize_libfunc = init_one_libfunc ("__sync_synchronize");
   init_sync_libfuncs (UNITS_PER_WORD);
 }
+  speculation_barrier_libfunc = init_one_libfunc ("__speculation_barrier");
 }
 
 /* Build up a multi-insn sequence that loads label TARGET into $AT.  */
@@ -19092,6 +19096,15 @@ mips_avoid_hazard (rtx_insn *after, rtx_insn *insn, 
int *hilo_delay,
   }
 }
 
+/* Emit a speculation barrier.
+   JR.HB is needed, so we need to put
+   speculation_barrier_libfunc in libgcc */
+void
+mips_emit_speculation_barrier_function ()
+{
+  emit_library_call (speculation_barrier_libfunc, LCT_NORMAL, VOIDmode);
+}
+
 /* A SEQUENCE is breakable iff the branch inside it has a compact form
and the target has compact branches.  */
 
diff --git a/gcc/config/mips/mips.md b/gcc/config/mips/mips.md
index ac1d77afc7d..5d04ac566dd 100644
--- a/gcc/config/mips/mips.md
+++ b/gcc/config/mips/mips.md
@@ -160,6 +160,8 @@
   ;; The `.insn' pseudo-op.
   UNSPEC_INSN_PSEUDO
   UNSPEC_JRHB
+
+  VUNSPEC_SPECULATION_BARRIER
 ])
 
 (define_constants
@@ -7455,6 +7457,16 @@
   mips_expand_conditional_move (operands);
   DONE;
 })
+
+(define_expand "speculation_barrier"
+  [(unspec_volatile [(const_int 0)] VUNSPEC_SPECULATION_BARRIER)]
+  ""
+  "
+  mips_emit_speculation_barrier_function ();
+  DONE;
+  "
+)
+
 
 ;;
 ;;  
diff --git a/gcc/configure b/gcc/configure
index 191f68581b3..e55b40d7f4c 100755
--- a/gcc/configure
+++ b/gcc/configure
@@ -28699,6 +28699,38 @@ $as_echo "#define HAVE_LD_PERSONALITY_RELAXATION 1" 
>>confdefs.h
 
 fi
 
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking assembler for mips1 
recognize ssnop" >&5
+$as_echo_n "checking assembler for mips1 recognize ssnop... " >&6; }
+if ${gcc_cv_as_mips1_ssnop+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  gcc_cv_as_mips1_ssnop=no
+  if test x$gcc_cv_as != x; then
+$as_echo 'ssnop' > conftest.s
+if { ac_try='$gcc_cv_as $gcc_cv_as_flags -mabi=32 -mips1 -o conftest.o 
conftest.s >&5'
+  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
+ echo $ac_try
+  (eval $ac_try) 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; }
+then
+   gcc_cv_as_mips1_ssnop=yes
+else
+  

RE: [PATCH 01/26] arm: [MVE intrinsics] add binary_widen_opt_n shape

2023-05-12 Thread Kyrylo Tkachov via Gcc-patches



> -Original Message-
> From: Christophe Lyon 
> Sent: Friday, May 12, 2023 10:39 AM
> To: gcc-patches@gcc.gnu.org; Kyrylo Tkachov ;
> Richard Earnshaw ; Richard Sandiford
> 
> Cc: Christophe Lyon 
> Subject: [PATCH 01/26] arm: [MVE intrinsics] add binary_widen_opt_n shape
> 
> This patch adds the binary_widen_opt_n shape description.

This series is ok with one style nit fixed in this first patch...

> 
> 2022-12-12  Christophe Lyon  
> 
>   gcc/
>   * config/arm/arm-mve-builtins-shapes.cc (binary_widen_opt_n):
> New.
>   * config/arm/arm-mve-builtins-shapes.h (binary_widen_opt_n): New.
> ---
>  gcc/config/arm/arm-mve-builtins-shapes.cc | 49 +++
>  gcc/config/arm/arm-mve-builtins-shapes.h  |  1 +
>  2 files changed, 50 insertions(+)
> 
> diff --git a/gcc/config/arm/arm-mve-builtins-shapes.cc b/gcc/config/arm/arm-
> mve-builtins-shapes.cc
> index 5a299a272f5..ee4bc3f8ea4 100644
> --- a/gcc/config/arm/arm-mve-builtins-shapes.cc
> +++ b/gcc/config/arm/arm-mve-builtins-shapes.cc
> @@ -1098,6 +1098,55 @@ struct binary_widen_n_def : public
> overloaded_base<0>
>  };
>  SHAPE (binary_widen_n)
> 
> +/* _t vfoo[_t0](_t, _t)
> +   _t vfoo[_n_t0](_t, _t)
> +
> +   Example: vqdmullbq.
> +   int32x4_t [__arm_]vqdmulltq[_n_s16](int16x8_t a, int16_t b)
> +   int32x4_t [__arm_]vqdmulltq_m[_n_s16](int32x4_t inactive, int16x8_t a,
> int16_t b, mve_pred16_t p)
> +   int32x4_t [__arm_]vqdmulltq[_s16](int16x8_t a, int16x8_t b)
> +   int32x4_t [__arm_]vqdmulltq_m[_s16](int32x4_t inactive, int16x8_t a,
> int16x8_t b, mve_pred16_t p)  */
> +struct binary_widen_opt_n_def : public overloaded_base<0>
> +{
> +  void
> +  build (function_builder &b, const function_group_info &group,
> +  bool preserve_user_namespace) const override
> +  {
> +b.add_overloaded_functions (group, MODE_none,
> preserve_user_namespace);
> +build_all (b, "vw0,v0,v0", group, MODE_none,
> preserve_user_namespace);
> +build_all (b, "vw0,v0,s0", group, MODE_n, preserve_user_namespace);
> +  }
> +
> +  tree
> +  resolve (function_resolver &r) const override
> +  {
> +unsigned int i, nargs;
> +type_suffix_index type;
> +if (!r.check_gp_argument (2, i, nargs)
> + || (type = r.infer_vector_type (i - 1)) == NUM_TYPE_SUFFIXES)
> +  return error_mark_node;
> +
> +type_suffix_index wide_suffix
> +  = find_type_suffix (type_suffixes[type].tclass,
> +   type_suffixes[type].element_bits * 2);
> +
> +/* Skip last argument, may be scalar, will be checked below by
> +   finish_opt_n_resolution.  */
> +unsigned int last_arg = i--;
> +for (; i > 0; i--)
> +  if (!r.require_matching_vector_type (i, type))
> + return error_mark_node;
> +
> +/* Check the inactive argument has the wide type.  */
> +if ((r.pred == PRED_m)
> + && (r.infer_vector_type (0) != wide_suffix))
> +return r.report_no_such_form (type);

Indentation is off here I think.
Thanks,
Kyrill

> +
> +return r.finish_opt_n_resolution (last_arg, 0, type);
> +  }
> +};
> +SHAPE (binary_widen_opt_n)
> +
>  /* Shape for comparison operations that operate on
> uniform types.
> 
> diff --git a/gcc/config/arm/arm-mve-builtins-shapes.h b/gcc/config/arm/arm-
> mve-builtins-shapes.h
> index a28cd6a1547..07b12b4af68 100644
> --- a/gcc/config/arm/arm-mve-builtins-shapes.h
> +++ b/gcc/config/arm/arm-mve-builtins-shapes.h
> @@ -53,6 +53,7 @@ namespace arm_mve
>  extern const function_shape *const binary_rshift_narrow;
>  extern const function_shape *const binary_rshift_narrow_unsigned;
>  extern const function_shape *const binary_widen_n;
> +extern const function_shape *const binary_widen_opt_n;
>  extern const function_shape *const cmp;
>  extern const function_shape *const create;
>  extern const function_shape *const inherent;
> --
> 2.34.1



Re: [PATCH v2] libstdc++: Do not use pthread_mutex_clocklock with ThreadSanitizer

2023-05-12 Thread Mike Crowe via Gcc-patches
On Thursday 11 May 2023 at 21:52:22 +0100, Jonathan Wakely wrote:
> On Thu, 11 May 2023 at 13:42, Jonathan Wakely  wrote:
> 
> >
> >
> > On Thu, 11 May 2023 at 13:19, Mike Crowe  wrote:
> >
> >> However, ...
> >>
> >> > > diff --git a/libstdc++-v3/acinclude.m4 b/libstdc++-v3/acinclude.m4
> >> > > index 89e7f5f5f45..e2700b05ec3 100644
> >> > > --- a/libstdc++-v3/acinclude.m4
> >> > > +++ b/libstdc++-v3/acinclude.m4
> >> > > @@ -4284,7 +4284,7 @@
> >> AC_DEFUN([GLIBCXX_CHECK_PTHREAD_COND_CLOCKWAIT], [
> >> > >[glibcxx_cv_PTHREAD_COND_CLOCKWAIT=no])
> >> > >])
> >> > >if test $glibcxx_cv_PTHREAD_COND_CLOCKWAIT = yes; then
> >> > > -AC_DEFINE(_GLIBCXX_USE_PTHREAD_COND_CLOCKWAIT, 1, [Define if
> >> > > pthread_cond_clockwait is available in .])
> >> > > +AC_DEFINE(_GLIBCXX_USE_PTHREAD_COND_CLOCKWAIT,
> >> (_GLIBCXX_TSAN==0),
> >> > > [Define if pthread_cond_clockwait is available in .])
> >> > >fi
> >>
> >> TSan does appear to have an interceptor for pthread_cond_clockwait, even
> >> if
> >> it lacks the others. Does this mean that this part is unnecessary?
> >>
> >
> > Ah good point, thanks. I grepped for clocklock but not clockwait.
> >
> 
> In fact it seems like we don't need to change
> _GLIBCXX_USE_PTHREAD_RWLOCK_CLOCKLOCK either, because I don't get any tsan
> warnings for that. It doesn't have interceptors for
> pthread_rwlock_{rd,wr}lock, but it doesn't complain anyway (maybe it's
> simply not instrumenting the rwlock functions at all?!)

It looks like TSan does have interceptors for pthread_rwlock_timedrdlock
etc. I can't explain why this doesn't cause problems when libstdc++ uses
pthread_rwlock_clockrdlock etc.

> So I'm now retesting with this version of the patch, which only touches the
> USE_PTHREAD_MUTEX_CLOCKLOCK macro.
> 
> Please take another look, thanks.

> commit 4fc14825c125eece32980df21d09da35e3d5bac6
> Author: Jonathan Wakely 
> Date:   Tue May 9 09:30:48 2023
> 
> libstdc++: Do not use pthread_mutex_clocklock with ThreadSanitizer
> 
> As noted in https://github.com/llvm/llvm-project/issues/62623 there are
> no tsan interceptors for some of the new POSIX-1:202x APIs added by
> https://austingroupbugs.net/view.php?id=1216 so tsan gives false
> positive warnings for try_lock_for on timed mutexes.
> 
> Disable the uses of the new pthread_mutex_clocklock API when tsan is
> active. This changes the semantics of the try_lock_for functions,
> because it can change which clock is used for the wait. This means those
> functions might be affected by system clock adjustments when tsan is
> used, when they would not be affected otherwise.
> 
> libstdc++-v3/ChangeLog:
> 
> * acinclude.m4 (GLIBCXX_CHECK_PTHREAD_MUTEX_CLOCKLOCK): Define
> _GLIBCXX_USE_PTHREAD_MUTEX_CLOCKLOCK in terms of _GLIBCXX_TSAN.
> * configure: Regenerate.
> 
> diff --git a/libstdc++-v3/acinclude.m4 b/libstdc++-v3/acinclude.m4
> index 89e7f5f5f45..dce3d16aa5c 100644
> --- a/libstdc++-v3/acinclude.m4
> +++ b/libstdc++-v3/acinclude.m4
> @@ -4314,7 +4314,7 @@ AC_DEFUN([GLIBCXX_CHECK_PTHREAD_MUTEX_CLOCKLOCK], [
>[glibcxx_cv_PTHREAD_MUTEX_CLOCKLOCK=no])
>])
>if test $glibcxx_cv_PTHREAD_MUTEX_CLOCKLOCK = yes; then
> -AC_DEFINE(_GLIBCXX_USE_PTHREAD_MUTEX_CLOCKLOCK, 1, [Define if 
> pthread_mutex_clocklock is available in .])
> +AC_DEFINE(_GLIBCXX_USE_PTHREAD_MUTEX_CLOCKLOCK, (_GLIBCXX_TSAN==0), 
> [Define if pthread_mutex_clocklock is available in .])
>fi
>  
>CXXFLAGS="$ac_save_CXXFLAGS"

LGTM.

Mike.


[PATCH v4] MIPS: add speculation_barrier support

2023-05-12 Thread YunQiang Su
speculation_barrier for MIPS needs sync+jr.hb (r2+),
so we implement __speculation_barrier in libgcc, like arm32 does.

gcc/ChangeLog:
* config/mips/mips-protos.h
(mips_emit_speculation_barrier_function): New prototype.
* config/mips/mips.cc (speculation_barrier_libfunc): New static
variable.
(mips_init_libfuncs): Initialize it.
(mips_emit_speculation_barrier_function): New function.
* config/mips/mips.md (speculation_barrier): Call
mips_emit_speculation_barrier_function.
* configure.ac: error if gas doesn't accept ssnop for mips1.
* configure: regenerated.
* doc/install.texi: documents mips requires binutils 2.21+.

libgcc/ChangeLog:
* config/mips/lib1funcs.S: New file.
define __speculation_barrier and include mips16.S.
* config/mips/t-mips: define LIB1ASMSRC as mips/lib1funcs.S.
define LIB1ASMFUNCS as _speculation_barrier.
set version info for __speculation_barrier.
* config/mips/libgcc-mips.ver: New file.
* config/mips/t-mips16: don't define LIB1ASMSRC as mips16.S is
included in lib1funcs.S now.
---
 gcc/config/mips/mips-protos.h  |  2 +
 gcc/config/mips/mips.cc| 13 +++
 gcc/config/mips/mips.md| 12 ++
 gcc/configure  | 32 +++
 gcc/configure.ac   |  7 
 gcc/doc/install.texi   |  2 +
 libgcc/config/mips/lib1funcs.S | 62 ++
 libgcc/config/mips/libgcc-mips.ver | 21 ++
 libgcc/config/mips/t-mips  |  7 
 libgcc/config/mips/t-mips16|  3 +-
 10 files changed, 159 insertions(+), 2 deletions(-)
 create mode 100644 libgcc/config/mips/lib1funcs.S
 create mode 100644 libgcc/config/mips/libgcc-mips.ver

diff --git a/gcc/config/mips/mips-protos.h b/gcc/config/mips/mips-protos.h
index 20483469105..da7902c235b 100644
--- a/gcc/config/mips/mips-protos.h
+++ b/gcc/config/mips/mips-protos.h
@@ -388,4 +388,6 @@ extern void mips_register_frame_header_opt (void);
 extern void mips_expand_vec_cond_expr (machine_mode, machine_mode, rtx *);
 extern void mips_expand_vec_cmp_expr (rtx *);
 
+extern void mips_emit_speculation_barrier_function (void);
+
 #endif /* ! GCC_MIPS_PROTOS_H */
diff --git a/gcc/config/mips/mips.cc b/gcc/config/mips/mips.cc
index ca822758b41..139707fda34 100644
--- a/gcc/config/mips/mips.cc
+++ b/gcc/config/mips/mips.cc
@@ -13611,6 +13611,9 @@ mips_autovectorize_vector_modes (vector_modes *modes, 
bool)
   return 0;
 }
 
+
+static GTY(()) rtx speculation_barrier_libfunc;
+
 /* Implement TARGET_INIT_LIBFUNCS.  */
 
 static void
@@ -13680,6 +13683,7 @@ mips_init_libfuncs (void)
   synchronize_libfunc = init_one_libfunc ("__sync_synchronize");
   init_sync_libfuncs (UNITS_PER_WORD);
 }
+  speculation_barrier_libfunc = init_one_libfunc ("__speculation_barrier");
 }
 
 /* Build up a multi-insn sequence that loads label TARGET into $AT.  */
@@ -19092,6 +19096,15 @@ mips_avoid_hazard (rtx_insn *after, rtx_insn *insn, 
int *hilo_delay,
   }
 }
 
+/* Emit a speculation barrier.
+   JR.HB is needed, so we need to put
+   speculation_barrier_libfunc in libgcc */
+void
+mips_emit_speculation_barrier_function ()
+{
+  emit_library_call (speculation_barrier_libfunc, LCT_NORMAL, VOIDmode);
+}
+
 /* A SEQUENCE is breakable iff the branch inside it has a compact form
and the target has compact branches.  */
 
diff --git a/gcc/config/mips/mips.md b/gcc/config/mips/mips.md
index ac1d77afc7d..5d04ac566dd 100644
--- a/gcc/config/mips/mips.md
+++ b/gcc/config/mips/mips.md
@@ -160,6 +160,8 @@
   ;; The `.insn' pseudo-op.
   UNSPEC_INSN_PSEUDO
   UNSPEC_JRHB
+
+  VUNSPEC_SPECULATION_BARRIER
 ])
 
 (define_constants
@@ -7455,6 +7457,16 @@
   mips_expand_conditional_move (operands);
   DONE;
 })
+
+(define_expand "speculation_barrier"
+  [(unspec_volatile [(const_int 0)] VUNSPEC_SPECULATION_BARRIER)]
+  ""
+  "
+  mips_emit_speculation_barrier_function ();
+  DONE;
+  "
+)
+
 
 ;;
 ;;  
diff --git a/gcc/configure b/gcc/configure
index 191f68581b3..e55b40d7f4c 100755
--- a/gcc/configure
+++ b/gcc/configure
@@ -28699,6 +28699,38 @@ $as_echo "#define HAVE_LD_PERSONALITY_RELAXATION 1" 
>>confdefs.h
 
 fi
 
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking assembler for mips1 
recognize ssnop" >&5
+$as_echo_n "checking assembler for mips1 recognize ssnop... " >&6; }
+if ${gcc_cv_as_mips1_ssnop+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  gcc_cv_as_mips1_ssnop=no
+  if test x$gcc_cv_as != x; then
+$as_echo 'ssnop' > conftest.s
+if { ac_try='$gcc_cv_as $gcc_cv_as_flags -mabi=32 -mips1 -o conftest.o 
conftest.s >&5'
+  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
+ echo $ac_try
+  (eval $ac_try) 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; }
+then
+   gcc_cv_as_mips1_ssnop=yes
+else
+ 

Re: [PATCH v2] libstdc++: Do not use pthread_mutex_clocklock with ThreadSanitizer

2023-05-12 Thread Jonathan Wakely via Gcc-patches
On Fri, 12 May 2023 at 11:30, Mike Crowe  wrote:

> On Thursday 11 May 2023 at 21:52:22 +0100, Jonathan Wakely wrote:
> > On Thu, 11 May 2023 at 13:42, Jonathan Wakely 
> wrote:
> >
> > >
> > >
> > > On Thu, 11 May 2023 at 13:19, Mike Crowe  wrote:
> > >
> > >> However, ...
> > >>
> > >> > > diff --git a/libstdc++-v3/acinclude.m4 b/libstdc++-v3/acinclude.m4
> > >> > > index 89e7f5f5f45..e2700b05ec3 100644
> > >> > > --- a/libstdc++-v3/acinclude.m4
> > >> > > +++ b/libstdc++-v3/acinclude.m4
> > >> > > @@ -4284,7 +4284,7 @@
> > >> AC_DEFUN([GLIBCXX_CHECK_PTHREAD_COND_CLOCKWAIT], [
> > >> > >[glibcxx_cv_PTHREAD_COND_CLOCKWAIT=no])
> > >> > >])
> > >> > >if test $glibcxx_cv_PTHREAD_COND_CLOCKWAIT = yes; then
> > >> > > -AC_DEFINE(_GLIBCXX_USE_PTHREAD_COND_CLOCKWAIT, 1, [Define if
> > >> > > pthread_cond_clockwait is available in .])
> > >> > > +AC_DEFINE(_GLIBCXX_USE_PTHREAD_COND_CLOCKWAIT,
> > >> (_GLIBCXX_TSAN==0),
> > >> > > [Define if pthread_cond_clockwait is available in .])
> > >> > >fi
> > >>
> > >> TSan does appear to have an interceptor for pthread_cond_clockwait,
> even
> > >> if
> > >> it lacks the others. Does this mean that this part is unnecessary?
> > >>
> > >
> > > Ah good point, thanks. I grepped for clocklock but not clockwait.
> > >
> >
> > In fact it seems like we don't need to change
> > _GLIBCXX_USE_PTHREAD_RWLOCK_CLOCKLOCK either, because I don't get any
> tsan
> > warnings for that. It doesn't have interceptors for
> > pthread_rwlock_{rd,wr}lock, but it doesn't complain anyway (maybe it's
> > simply not instrumenting the rwlock functions at all?!)
>
> It looks like TSan does have interceptors for pthread_rwlock_timedrdlock
> etc. I can't explain why this doesn't cause problems when libstdc++ uses
> pthread_rwlock_clockrdlock etc.
>

I think glibc has renamed the rwlock functions, and so the interceptors no
longer work.

# ifdef __USE_XOPEN2K
/* Try to acquire read lock for RWLOCK or return after specfied time.  */
#  ifndef __USE_TIME_BITS64
extern int pthread_rwlock_timedrdlock (pthread_rwlock_t *__restrict
__rwlock,
  const struct timespec *__restrict
  __abstime) __THROWNL __nonnull ((1, 2));
#  else
#   ifdef __REDIRECT_NTHNL
extern int __REDIRECT_NTHNL (pthread_rwlock_timedrdlock,
 (pthread_rwlock_t *__restrict __rwlock,
  const struct timespec *__restrict __abstime),
 __pthread_rwlock_timedrdlock64)
__nonnull ((1, 2));
#   else
#define pthread_rwlock_timedrdlock __pthread_rwlock_timedrdlock64
#   endif
#  endif
# endif

If glibc is really providing a function called
__pthread_rwlock_timedrdlock64 then will tsan be able to intercept that?



> > So I'm now retesting with this version of the patch, which only touches
> the
> > USE_PTHREAD_MUTEX_CLOCKLOCK macro.
> >
> > Please take another look, thanks.
>
> > commit 4fc14825c125eece32980df21d09da35e3d5bac6
> > Author: Jonathan Wakely 
> > Date:   Tue May 9 09:30:48 2023
> >
> > libstdc++: Do not use pthread_mutex_clocklock with ThreadSanitizer
> >
> > As noted in https://github.com/llvm/llvm-project/issues/62623 there
> are
> > no tsan interceptors for some of the new POSIX-1:202x APIs added by
> > https://austingroupbugs.net/view.php?id=1216 so tsan gives false
> > positive warnings for try_lock_for on timed mutexes.
> >
> > Disable the uses of the new pthread_mutex_clocklock API when tsan is
> > active. This changes the semantics of the try_lock_for functions,
> > because it can change which clock is used for the wait. This means
> those
> > functions might be affected by system clock adjustments when tsan is
> > used, when they would not be affected otherwise.
> >
> > libstdc++-v3/ChangeLog:
> >
> > * acinclude.m4 (GLIBCXX_CHECK_PTHREAD_MUTEX_CLOCKLOCK):
> Define
> > _GLIBCXX_USE_PTHREAD_MUTEX_CLOCKLOCK in terms of
> _GLIBCXX_TSAN.
> > * configure: Regenerate.
> >
> > diff --git a/libstdc++-v3/acinclude.m4 b/libstdc++-v3/acinclude.m4
> > index 89e7f5f5f45..dce3d16aa5c 100644
> > --- a/libstdc++-v3/acinclude.m4
> > +++ b/libstdc++-v3/acinclude.m4
> > @@ -4314,7 +4314,7 @@ AC_DEFUN([GLIBCXX_CHECK_PTHREAD_MUTEX_CLOCKLOCK], [
> >[glibcxx_cv_PTHREAD_MUTEX_CLOCKLOCK=no])
> >])
> >if test $glibcxx_cv_PTHREAD_MUTEX_CLOCKLOCK = yes; then
> > -AC_DEFINE(_GLIBCXX_USE_PTHREAD_MUTEX_CLOCKLOCK, 1, [Define if
> pthread_mutex_clocklock is available in .])
> > +AC_DEFINE(_GLIBCXX_USE_PTHREAD_MUTEX_CLOCKLOCK, (_GLIBCXX_TSAN==0),
> [Define if pthread_mutex_clocklock is available in .])
> >fi
> >
> >CXXFLAGS="$ac_save_CXXFLAGS"
>
> LGTM.
>
> Mike.
>
>


Re: [PATCH 01/26] arm: [MVE intrinsics] add binary_widen_opt_n shape

2023-05-12 Thread Christophe Lyon via Gcc-patches




On 5/12/23 12:17, Kyrylo Tkachov wrote:




-Original Message-
From: Christophe Lyon 
Sent: Friday, May 12, 2023 10:39 AM
To: gcc-patches@gcc.gnu.org; Kyrylo Tkachov ;
Richard Earnshaw ; Richard Sandiford

Cc: Christophe Lyon 
Subject: [PATCH 01/26] arm: [MVE intrinsics] add binary_widen_opt_n shape

This patch adds the binary_widen_opt_n shape description.


This series is ok with one style nit fixed in this first patch...



2022-12-12  Christophe Lyon  

gcc/
* config/arm/arm-mve-builtins-shapes.cc (binary_widen_opt_n):
New.
* config/arm/arm-mve-builtins-shapes.h (binary_widen_opt_n): New.
---
  gcc/config/arm/arm-mve-builtins-shapes.cc | 49 +++
  gcc/config/arm/arm-mve-builtins-shapes.h  |  1 +
  2 files changed, 50 insertions(+)

diff --git a/gcc/config/arm/arm-mve-builtins-shapes.cc b/gcc/config/arm/arm-
mve-builtins-shapes.cc
index 5a299a272f5..ee4bc3f8ea4 100644
--- a/gcc/config/arm/arm-mve-builtins-shapes.cc
+++ b/gcc/config/arm/arm-mve-builtins-shapes.cc
@@ -1098,6 +1098,55 @@ struct binary_widen_n_def : public
overloaded_base<0>
  };
  SHAPE (binary_widen_n)

+/* _t vfoo[_t0](_t, _t)
+   _t vfoo[_n_t0](_t, _t)
+
+   Example: vqdmullbq.
+   int32x4_t [__arm_]vqdmulltq[_n_s16](int16x8_t a, int16_t b)
+   int32x4_t [__arm_]vqdmulltq_m[_n_s16](int32x4_t inactive, int16x8_t a,
int16_t b, mve_pred16_t p)
+   int32x4_t [__arm_]vqdmulltq[_s16](int16x8_t a, int16x8_t b)
+   int32x4_t [__arm_]vqdmulltq_m[_s16](int32x4_t inactive, int16x8_t a,
int16x8_t b, mve_pred16_t p)  */
+struct binary_widen_opt_n_def : public overloaded_base<0>
+{
+  void
+  build (function_builder &b, const function_group_info &group,
+bool preserve_user_namespace) const override
+  {
+b.add_overloaded_functions (group, MODE_none,
preserve_user_namespace);
+build_all (b, "vw0,v0,v0", group, MODE_none,
preserve_user_namespace);
+build_all (b, "vw0,v0,s0", group, MODE_n, preserve_user_namespace);
+  }
+
+  tree
+  resolve (function_resolver &r) const override
+  {
+unsigned int i, nargs;
+type_suffix_index type;
+if (!r.check_gp_argument (2, i, nargs)
+   || (type = r.infer_vector_type (i - 1)) == NUM_TYPE_SUFFIXES)
+  return error_mark_node;
+
+type_suffix_index wide_suffix
+  = find_type_suffix (type_suffixes[type].tclass,
+ type_suffixes[type].element_bits * 2);
+
+/* Skip last argument, may be scalar, will be checked below by
+   finish_opt_n_resolution.  */
+unsigned int last_arg = i--;
+for (; i > 0; i--)
+  if (!r.require_matching_vector_type (i, type))
+   return error_mark_node;
+
+/* Check the inactive argument has the wide type.  */
+if ((r.pred == PRED_m)
+   && (r.infer_vector_type (0) != wide_suffix))
+return r.report_no_such_form (type);


Indentation is off here I think.


Indeed!

Thanks,

Christophe


Thanks,
Kyrill


+
+return r.finish_opt_n_resolution (last_arg, 0, type);
+  }
+};
+SHAPE (binary_widen_opt_n)
+
  /* Shape for comparison operations that operate on
 uniform types.

diff --git a/gcc/config/arm/arm-mve-builtins-shapes.h b/gcc/config/arm/arm-
mve-builtins-shapes.h
index a28cd6a1547..07b12b4af68 100644
--- a/gcc/config/arm/arm-mve-builtins-shapes.h
+++ b/gcc/config/arm/arm-mve-builtins-shapes.h
@@ -53,6 +53,7 @@ namespace arm_mve
  extern const function_shape *const binary_rshift_narrow;
  extern const function_shape *const binary_rshift_narrow_unsigned;
  extern const function_shape *const binary_widen_n;
+extern const function_shape *const binary_widen_opt_n;
  extern const function_shape *const cmp;
  extern const function_shape *const create;
  extern const function_shape *const inherent;
--
2.34.1




[PATCH] RISC-V: Using merge approach to optimize repeating sequence in vec_init

2023-05-12 Thread juzhe . zhong
From: Juzhe-Zhong 

Consider this following case:
typedef int64_t vnx32di __attribute__ ((vector_size (256)));


__attribute__ ((noipa)) void
f_vnx32di (int64_t a, int64_t b, int64_t *out)
{
  vnx32di v
= {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, 
a, b, a, b, a, b, a, b};
  *(vnx32di *) out = v;
}

Since we don't have SEW = 128 in vec_duplicate, we can't combine a and b into a
single SEW = 128 element and then broadcast this big element.

This patch optimizes the case above.

-march=rv64gcv_zvl256b --param riscv-autovec-preference=fixed-vlmax

Before this patch:

..
vslide1down.vx (x31 times)
..

After this patch:
li  a5,-1431654400
addi a5,a5,-1365
li  a3,-1431654400
addi a3,a3,-1366
slli a5,a5,32
add a5,a5,a3
vsetvli a4,zero,e64,m8,ta,ma
vmv.v.x v8,a0
vmv.s.x v0,a5
vmerge.vxm  v8,v8,a1,v0
vs8r.v  v8,0(a2)
ret


gcc/ChangeLog:

* config/riscv/riscv-v.cc 
(rvv_builder::repeating_sequence_use_merge_profitable_p): New function.
(rvv_builder::get_merge_mask_bitfield): Ditto.
(emit_scalar_move_op): Ditto.
(emit_merge_op): Ditto.
(expand_vector_init_merge_repeating_sequence): Ditto.
(expand_vec_init): Add merge approach to optimize repeating sequence.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/vls-vlmax/repeat-10.c: New test.
* gcc.target/riscv/rvv/autovec/vls-vlmax/repeat-11.c: New test.
* gcc.target/riscv/rvv/autovec/vls-vlmax/repeat-7.c: New test.
* gcc.target/riscv/rvv/autovec/vls-vlmax/repeat-8.c: New test.
* gcc.target/riscv/rvv/autovec/vls-vlmax/repeat-9.c: New test.
* gcc.target/riscv/rvv/autovec/vls-vlmax/repeat_run-11.c: New test.
* gcc.target/riscv/rvv/autovec/vls-vlmax/repeat_run-7.c: New test.
* gcc.target/riscv/rvv/autovec/vls-vlmax/repeat_run-8.c: New test.

---
 gcc/config/riscv/riscv-v.cc   | 252 --
 .../riscv/rvv/autovec/vls-vlmax/repeat-10.c   |  19 ++
 .../riscv/rvv/autovec/vls-vlmax/repeat-11.c   |  25 ++
 .../riscv/rvv/autovec/vls-vlmax/repeat-7.c|  25 ++
 .../riscv/rvv/autovec/vls-vlmax/repeat-8.c|  15 ++
 .../riscv/rvv/autovec/vls-vlmax/repeat-9.c|  16 ++
 .../rvv/autovec/vls-vlmax/repeat_run-11.c |  45 
 .../rvv/autovec/vls-vlmax/repeat_run-7.c  |  45 
 .../rvv/autovec/vls-vlmax/repeat_run-8.c  |  41 +++
 9 files changed, 459 insertions(+), 24 deletions(-)
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/repeat-10.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/repeat-11.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/repeat-7.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/repeat-8.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/repeat-9.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/repeat_run-11.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/repeat_run-7.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/repeat_run-8.c

diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index b8dc333f54e..e5c0db0eea8 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -68,15 +68,23 @@ public:
   {
 add_input_operand (CONSTM1_RTX (mode), mode);
   }
+  void add_scalar_move_mask_operand (machine_mode mode)
+  {
+rtx scalar_move_mask = gen_scalar_move_mask (mode);
+add_input_operand (scalar_move_mask, mode);
+  }
   void add_vundef_operand (machine_mode mode)
   {
 add_input_operand (RVV_VUNDEF (mode), mode);
   }
-  void add_policy_operand (enum tail_policy vta, enum mask_policy vma)
+  void add_ta_policy_operand (enum tail_policy vta)
   {
 rtx tail_policy_rtx = gen_int_mode (vta, Pmode);
-rtx mask_policy_rtx = gen_int_mode (vma, Pmode);
 add_input_operand (tail_policy_rtx, Pmode);
+  }
+  void add_ma_policy_operand (enum mask_policy vma)
+  {
+rtx mask_policy_rtx = gen_int_mode (vma, Pmode);
 add_input_operand (mask_policy_rtx, Pmode);
   }
   void add_avl_type_operand (avl_type type)
@@ -84,40 +92,52 @@ public:
 add_input_operand (gen_int_mode (type, Pmode), Pmode);
   }
 
-  void set_dest_and_mask (rtx mask, rtx dest, machine_mode mask_mode)
+  void set_dest_and_mask (rtx mask, rtx dest, machine_mode mask_mode,
+ bool scalar_move = false, bool merge_op = false)
   {
 dest_mode = GET_MODE (dest);
 has_dest = true;
 
 add_output_operand (dest, dest_mode);
 
-if (mask)
-  add_input_operand (mask, GET_MODE (mask));
-else
-  add_all_one_mask_operand (mask_mode);
+if (!merge_op)
+  {
+   if (mask)
+ add_input_operand (mask, GET_MODE (mask));
+   else
+ {
+   if (scalar_move)
+   

Re: [PATCH V6] VECT: Add decrement IV support in Loop Vectorizer

2023-05-12 Thread Richard Sandiford via Gcc-patches
juzhe.zh...@rivai.ai writes:
> From: Ju-Zhe Zhong 
>
> 1. Fix document description according Jeff && Richard.
> 2. Add LOOP_VINFO_USING_SELECT_VL_P for single rgroup.
> 3. Add LOOP_VINFO_USING_SLP_ADJUSTED_LEN_P for SLP multiple rgroup.
>
> Fix bugs for V5 after testing:
> https://gcc.gnu.org/pipermail/gcc-patches/2023-May/618209.html
>
> gcc/ChangeLog:
>
> * doc/md.texi: Add select_vl pattern.
> * internal-fn.def (SELECT_VL): New ifn.
> * optabs.def (OPTAB_D): New optab.
> * tree-vect-loop-manip.cc (vect_adjust_loop_lens): New function.
> (vect_set_loop_controls_by_select_vl): Ditto.
> (vect_set_loop_condition_partial_vectors): Add loop control for 
> decrement IV.
> * tree-vect-loop.cc (vect_get_loop_len): Adjust loop len for SLP.
> * tree-vect-stmts.cc (get_select_vl_data_ref_ptr): New function.
> (vectorizable_store): Support data reference IV added by outcome of 
> SELECT_VL.
> (vectorizable_load): Ditto.
> * tree-vectorizer.h (LOOP_VINFO_USING_SELECT_VL_P): New macro.
> (LOOP_VINFO_USING_SLP_ADJUSTED_LEN_P): Ditto.
> (vect_get_loop_len): Adjust loop len for SLP.
>
> ---
>  gcc/doc/md.texi |  36 
>  gcc/internal-fn.def |   1 +
>  gcc/optabs.def  |   1 +
>  gcc/tree-vect-loop-manip.cc | 380 +++-
>  gcc/tree-vect-loop.cc   |  31 ++-
>  gcc/tree-vect-stmts.cc  |  79 +++-
>  gcc/tree-vectorizer.h   |  12 +-
>  7 files changed, 526 insertions(+), 14 deletions(-)
>
> diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
> index 8ebce31ba78..a94ffc4456d 100644
> --- a/gcc/doc/md.texi
> +++ b/gcc/doc/md.texi
> @@ -4974,6 +4974,42 @@ for (i = 1; i < operand3; i++)
>operand0[i] = operand0[i - 1] && (operand1 + i < operand2);
>  @end smallexample
>  
> +@cindex @code{select_vl@var{m}} instruction pattern
> +@item @code{select_vl@var{m}}
> +Set operand 0 to the number of active elements in a vector to be updated 
> +in a loop iteration based on the total number of elements to be updated, 
> +the vectorization factor and vector properties of the target.
> +operand 1 is the total elements in the vector to be updated.
> +operand 2 is the vectorization factor.
> +The value of operand 0 is target dependent and flexible in each iteration.
> +The operation of this pattern can be:
> +
> +@smallexample
> +Case 1:
> +operand0 = MIN (operand1, operand2);
> +operand2 can be const_poly_int or poly_int related to vector mode size.
> +Some target like RISC-V has a standalone instruction to get MIN (n, MODE 
> SIZE) so
> +that we can reduce a use of general purpose register.
> +
> +In this case, only the last iteration of the loop is partial iteration.
> +@end smallexample
> +
> +@smallexample
> +Case 2:
> +if (operand1 <= operand2)
> +  operand0 = operand1;
> +else if (operand1 < 2 * operand2)
> +  operand0 = ceil (operand1 / 2);
> +else
> +  operand0 = operand2;
> +
> +This case will evenly distribute work over the last 2 iterations of a 
> stripmine loop.
> +@end smallexample
> +
> +The output of this pattern is not only used as IV of loop control counter, 
> but also
> +is used as the IV of address calculation with multiply/shift operation. This 
> allows
> +dynamic adjustment of the number of elements processed each loop iteration.
> +

I don't think we need to restrict the definition to the two RVV cases.
How about:

---
Set operand 0 to the number of scalar iterations that should be handled
by one iteration of a vector loop.  Operand 1 is the total number of
scalar iterations that the loop needs to process and operand 2 is a
maximum bound on the result (also known as the maximum ``vectorization
factor'').

The maximum value of operand 0 is given by:
@smallexample
operand0 = MIN (operand1, operand2)
@end smallexample
However, targets might choose a lower value than this, based on
target-specific criteria.  Each iteration of the vector loop might
therefore process a different number of scalar iterations, which in turn
means that induction variables will have a variable step.  Because of
this, it is generally not useful to define this instruction if it will
always calculate the maximum value.

This optab is only useful on targets that implement @samp{len_load_@var{m}}
and/or @samp{len_store_@var{m}}.
---

>  @cindex @code{check_raw_ptrs@var{m}} instruction pattern
>  @item @samp{check_raw_ptrs@var{m}}
>  Check whether, given two pointers @var{a} and @var{b} and a length @var{len},
> diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> index 7fe742c2ae7..6f6fa7d37f9 100644
> --- a/gcc/internal-fn.def
> +++ b/gcc/internal-fn.def
> @@ -153,6 +153,7 @@ DEF_INTERNAL_OPTAB_FN (VEC_SET, 0, vec_set, vec_set)
>  DEF_INTERNAL_OPTAB_FN (LEN_STORE, 0, len_store, len_store)
>  
>  DEF_INTERNAL_OPTAB_FN (WHILE_ULT, ECF_

RE: [PATCH] Machine_Mode: Extend machine_mode from 8 to 16 bits

2023-05-12 Thread Li, Pan2 via Gcc-patches
Thanks Richard for the comments. Previously, I was not sure whether it was
reasonable to have every user consume the same macro in rtl.h (given the
includes you mentioned), so I made a conservative change in PATCH v1.

I will address the comments and align the bit size to one single macro soon.

Pan


-Original Message-
From: Richard Sandiford  
Sent: Friday, May 12, 2023 4:24 PM
To: Li, Pan2 
Cc: gcc-patches@gcc.gnu.org; juzhe.zh...@rivai.ai; kito.ch...@sifive.com; Wang, 
Yanzhang ; jeffreya...@gmail.com; rguent...@suse.de
Subject: Re: [PATCH] Machine_Mode: Extend machine_mode from 8 to 16 bits

pan2...@intel.com writes:
> From: Pan Li 
>
> We are running out of the machine_mode(8 bits) in RISC-V backend. Thus 
> we would like to extend the machine mode bit size from 8 to 16 bits.
> However, it is sensitive to extend the memory size in common structure 
> like tree or rtx. This patch would like to extend the machine mode 
> bits to 16 bits by shrinking, like:
>
> * Swap the bit size of code and machine mode in rtx_def.
> * Reconcile the machine_mode location and spare in tree.
>
> The memory impact of this patch for correlated structure looks like below:
>
> +---+--+-+--+
> | struct/bytes  | upstream | patched | diff |
> +---+--+-+--+
> | rtx_obj_reference |8 |  12 |   +4 |
> | ext_modified  |2 |   3 |   +1 |
> | ira_allocno   |  192 | 200 |   +8 |
> | qty_table_elem|   40 |  40 |0 |
> | reg_stat_type |   64 |  64 |0 |
> | rtx_def   |   40 |  40 |0 |
> | table_elt |   80 |  80 |0 |
> | tree_decl_common  |  112 | 112 |0 |
> | tree_type_common  |  128 | 128 |0 |
> +---+--+-+--+
>
> The tree and rtx related struct has no memory changes after this 
> patch, and the machine_mode changes to 16 bits already.
>
> Signed-off-by: Pan Li 
> Co-authored-by: Ju-Zhe Zhong 
> Co-authored-by: Kito Cheng 
>
> gcc/ChangeLog:
>
>   * combine.cc (struct reg_stat_type): Extended machine mode to 16 bits.
>   * cse.cc (struct qty_table_elem): Ditto.
>   (struct table_elt): Ditto.
>   (struct set): Ditto.
>   * genopinit.cc (main): Reconciled the machine mode limit.
>   * ira-int.h (struct ira_allocno): Extended machine mode to 16 bits.
>   * ree.cc (struct ATTRIBUTE_PACKED): Ditto.
>   * rtl-ssa/accesses.h: Ditto.
>   * rtl.h (RTX_CODE_BITSIZE): New macro.
>   (RTX_MACHINE_MODE_BITSIZE): Ditto.
>   (struct GTY): Swap bit size between code and machine mode.
>   (subreg_shape::unique_id): Reconciled the machine mode limit.
>   * rtlanal.h: Extended machine mode to 16 bits.
>   * tree-core.h (struct tree_type_common): Ditto.
>   (struct tree_decl_common): Reconciled the locate and extended
>   bit size of machine mode.
> ---
>  gcc/combine.cc |  4 ++--
>  gcc/cse.cc |  8 
>  gcc/genopinit.cc   |  3 ++-
>  gcc/ira-int.h  | 12 
>  gcc/ree.cc |  2 +-
>  gcc/rtl-ssa/accesses.h |  6 --
>  gcc/rtl.h  |  9 ++---
>  gcc/rtlanal.h  |  5 +++--
>  gcc/tree-core.h| 11 ---
>  9 files changed, 38 insertions(+), 22 deletions(-)
>
> diff --git a/gcc/combine.cc b/gcc/combine.cc index 
> 5aa0ec5c45a..bdf6f635c80 100644
> --- a/gcc/combine.cc
> +++ b/gcc/combine.cc
> @@ -200,7 +200,7 @@ struct reg_stat_type {
>  
>unsigned HOST_WIDE_INT last_set_nonzero_bits;
>char   last_set_sign_bit_copies;
> -  ENUM_BITFIELD(machine_mode)last_set_mode : 8;
> +  ENUM_BITFIELD(machine_mode)last_set_mode : 
> RTX_MACHINE_MODE_BITSIZE;
>  
>/* Set nonzero if references to register n in expressions should not be
>   used.  last_set_invalid is set nonzero when this register is 
> being @@ -235,7 +235,7 @@ struct reg_stat_type {
>   truncation if we know that value already contains a truncated
>   value.  */
>  
> -  ENUM_BITFIELD(machine_mode)truncated_to_mode : 8;
> +  ENUM_BITFIELD(machine_mode)truncated_to_mode : 
> RTX_MACHINE_MODE_BITSIZE;
>  };
>  
>  
> diff --git a/gcc/cse.cc b/gcc/cse.cc
> index b10c9b0c94d..fe594c1bc3d 100644
> --- a/gcc/cse.cc
> +++ b/gcc/cse.cc
> @@ -250,8 +250,8 @@ struct qty_table_elem
>unsigned int first_reg, last_reg;
>/* The sizes of these fields should match the sizes of the
>   code and mode fields of struct rtx_def (see rtl.h).  */

The comment can be removed, since you're now adding macros to ensure this 
(thanks).  Same for other instances of the comment.

> -  ENUM_BITFIELD(rtx_code) comparison_code : 16;
> -  ENUM_BITFIELD(machine_mode) mode : 8;
> +  ENUM_BITFIELD(rtx_code) comparison_code : RTX_CODE_BITSIZE;
> +  ENUM_BITFIELD(machine_mode) mode : RTX_MACHINE_MODE_BITSIZE;

Please put the mode first, 

Re: [PATCH v3 3/4] ree: Main functionality to Improve ree pass for rs6000 target

2023-05-12 Thread Ajit Agarwal via Gcc-patches
Hello Jeff:


On 29/04/23 3:40 am, Jeff Law wrote:
> 
> 
> On 4/20/23 15:03, Ajit Agarwal wrote:
> 
>>
>> Currently I support AND with const1_rtx. This is equivalent to the zero
>> extension instruction in the Power instruction set. You mention many other
>> constants: could you please specify which other constants need to be
>> supported, and how to determine the input and output modes?
> x AND <constant> will result in a zero-extended representation for a variety
> of constants, not just 1.  For example
> 
> For example x AND 3, x AND 7, x AND 15, etc.
> 
> If (const_int 1) is really that special here, then I've either completely 
> misunderstood the intention of your patch or there's something quite special 
> about the PPC port that I'm not aware of.
>

Here is the patch to address above.

ree: Improve ree pass for rs6000 target

For the rs6000 target we see redundant zero and sign
extensions; this change improves the ree pass to eliminate
such redundant zero and sign extensions. It adds support
for AND used as an extension with constants other
than 1.
2023-05-12  Ajit Kumar Agarwal  

gcc/ChangeLog:

* ree.cc (rtx_is_zext_p): Add AND with varying constants as
extensions.
---
 gcc/ree.cc | 15 ++-
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/gcc/ree.cc b/gcc/ree.cc
index 96fda1ac658..ddda5f194bb 100644
--- a/gcc/ree.cc
+++ b/gcc/ree.cc
@@ -269,8 +269,11 @@ rtx_is_zext_p (rtx insn)
   rtx set = XEXP (insn, 0);
   if (REG_P (set))
{
-  if (XEXP (insn, 1) == const1_rtx)
-return true;
+ rtx src = XEXP (insn, 1);
+
+ if (CONST_INT_P (src)
+ && IN_RANGE (exact_log2 (UINTVAL (src)), 0, 7))
+   return true;
}
   else
return false;
@@ -297,9 +300,11 @@ rtx_is_zext_p (rtx_insn *insn)
 
  if (REG_P (set) && GET_MODE (SET_DEST (body)) == GET_MODE (set))
{
-if (GET_MODE_UNIT_SIZE (GET_MODE (SET_DEST (body)))
->= GET_MODE_UNIT_SIZE (GET_MODE (set)))
-  return true;
+ rtx src = XEXP (SET_SRC (body), 1);
+
+ if (CONST_INT_P (src)
+ && IN_RANGE (exact_log2 (UINTVAL (src)), 0, 7))
+   return true;
}
  else
   return false;
-- 
2.31.1


 
> Jeff


Re: [PATCH] Machine_Mode: Extend machine_mode from 8 to 16 bits

2023-05-12 Thread Richard Sandiford via Gcc-patches
"Li, Pan2 via Gcc-patches"  writes:
> Thanks Richard for comments. In previous, I am not sure it is reasonable to 
> let everywhere consume the same macro in rtl.h (As the includes you 
> mentioned). Thus, make a conservative change in PATCH v1.
>
> I will address the comments and try to align the bit size to the one and the 
> only one macro soon.

Sorry, I should have thought about this earlier, but it would
probably make sense to name the macro MACHINE_MODE_BITSIZE and
define it in machmode.h rather than rtl.h.  (The rtx_code stuff
should stay as-is.)

Thanks,
Richard

>
> Pan
>
>
> -Original Message-
> From: Richard Sandiford  
> Sent: Friday, May 12, 2023 4:24 PM
> To: Li, Pan2 
> Cc: gcc-patches@gcc.gnu.org; juzhe.zh...@rivai.ai; kito.ch...@sifive.com; 
> Wang, Yanzhang ; jeffreya...@gmail.com; 
> rguent...@suse.de
> Subject: Re: [PATCH] Machine_Mode: Extend machine_mode from 8 to 16 bits
>
> pan2...@intel.com writes:
>> From: Pan Li 
>>
>> We are running out of the machine_mode(8 bits) in RISC-V backend. Thus 
>> we would like to extend the machine mode bit size from 8 to 16 bits.
>> However, it is sensitive to extend the memory size in common structure 
>> like tree or rtx. This patch would like to extend the machine mode 
>> bits to 16 bits by shrinking, like:
>>
>> * Swap the bit size of code and machine mode in rtx_def.
>> * Reconcile the machine_mode location and spare in tree.
>>
>> The memory impact of this patch for correlated structure looks like below:
>>
>> +---+--+-+--+
>> | struct/bytes  | upstream | patched | diff |
>> +---+--+-+--+
>> | rtx_obj_reference |8 |  12 |   +4 |
>> | ext_modified  |2 |   3 |   +1 |
>> | ira_allocno   |  192 | 200 |   +8 |
>> | qty_table_elem|   40 |  40 |0 |
>> | reg_stat_type |   64 |  64 |0 |
>> | rtx_def   |   40 |  40 |0 |
>> | table_elt |   80 |  80 |0 |
>> | tree_decl_common  |  112 | 112 |0 |
>> | tree_type_common  |  128 | 128 |0 |
>> +---+--+-+--+
>>
>> The tree and rtx related struct has no memory changes after this 
>> patch, and the machine_mode changes to 16 bits already.
>>
>> Signed-off-by: Pan Li 
>> Co-authored-by: Ju-Zhe Zhong 
>> Co-authored-by: Kito Cheng 
>>
>> gcc/ChangeLog:
>>
>>  * combine.cc (struct reg_stat_type): Extended machine mode to 16 bits.
>>  * cse.cc (struct qty_table_elem): Ditto.
>>  (struct table_elt): Ditto.
>>  (struct set): Ditto.
>>  * genopinit.cc (main): Reconciled the machine mode limit.
>>  * ira-int.h (struct ira_allocno): Extended machine mode to 16 bits.
>>  * ree.cc (struct ATTRIBUTE_PACKED): Ditto.
>>  * rtl-ssa/accesses.h: Ditto.
>>  * rtl.h (RTX_CODE_BITSIZE): New macro.
>>  (RTX_MACHINE_MODE_BITSIZE): Ditto.
>>  (struct GTY): Swap bit size between code and machine mode.
>>  (subreg_shape::unique_id): Reconciled the machine mode limit.
>>  * rtlanal.h: Extended machine mode to 16 bits.
>>  * tree-core.h (struct tree_type_common): Ditto.
>>  (struct tree_decl_common): Reconciled the locate and extended
>>  bit size of machine mode.
>> ---
>>  gcc/combine.cc |  4 ++--
>>  gcc/cse.cc |  8 
>>  gcc/genopinit.cc   |  3 ++-
>>  gcc/ira-int.h  | 12 
>>  gcc/ree.cc |  2 +-
>>  gcc/rtl-ssa/accesses.h |  6 --
>>  gcc/rtl.h  |  9 ++---
>>  gcc/rtlanal.h  |  5 +++--
>>  gcc/tree-core.h| 11 ---
>>  9 files changed, 38 insertions(+), 22 deletions(-)
>>
>> diff --git a/gcc/combine.cc b/gcc/combine.cc index 
>> 5aa0ec5c45a..bdf6f635c80 100644
>> --- a/gcc/combine.cc
>> +++ b/gcc/combine.cc
>> @@ -200,7 +200,7 @@ struct reg_stat_type {
>>  
>>unsigned HOST_WIDE_INTlast_set_nonzero_bits;
>>char  last_set_sign_bit_copies;
>> -  ENUM_BITFIELD(machine_mode)   last_set_mode : 8;
>> +  ENUM_BITFIELD(machine_mode)   last_set_mode : 
>> RTX_MACHINE_MODE_BITSIZE;
>>  
>>/* Set nonzero if references to register n in expressions should not be
>>   used.  last_set_invalid is set nonzero when this register is 
>> being @@ -235,7 +235,7 @@ struct reg_stat_type {
>>   truncation if we know that value already contains a truncated
>>   value.  */
>>  
>> -  ENUM_BITFIELD(machine_mode)   truncated_to_mode : 8;
>> +  ENUM_BITFIELD(machine_mode)   truncated_to_mode : 
>> RTX_MACHINE_MODE_BITSIZE;
>>  };
>>  
>>  
>> diff --git a/gcc/cse.cc b/gcc/cse.cc
>> index b10c9b0c94d..fe594c1bc3d 100644
>> --- a/gcc/cse.cc
>> +++ b/gcc/cse.cc
>> @@ -250,8 +250,8 @@ struct qty_table_elem
>>unsigned int first_reg, last_reg;
>>/* The sizes of these fields should match the sizes of the
>>   code and mode fields of struct rtx_def (see rtl.h).  */
>
> The 

Re: [PATCH V6] VECT: Add decrement IV support in Loop Vectorizer

2023-05-12 Thread Richard Sandiford via Gcc-patches
"juzhe.zhong"  writes:
> Thanks Richard.
>  I will do that as you suggested. I have a question for the first patch. How 
> to
> enable decrement IV? Should I add a target hook or something to let target
> decide whether enable decrement IV?

At the moment, the only other targets that use IFN_LOAD_LEN and
IFN_STORE_LEN are PowerPC and s390.  Both targets default to
--param vect-partial-vector-usage=1 (i.e. use partial vectors
for epilogues only).

So I think the condition should be that the loop:

  (a) uses length "controls"; and
  (b) can iterate more than once

No target checks should be needed.

Thanks,
Richard

>  Replied Message 
>
> From  Richard Sandiford
>
> Date  05/12/2023 19:08
>
> To  juzhe.zh...@rivai.ai
>
> Cc  gcc-patches@gcc.gnu.org,
>   kito.ch...@gmail.com,
>   pal...@dabbelt.com,
>   richard.guent...@gmail.com
>
> Subject   Re: [PATCH V6] VECT: Add decrement IV support in Loop Vectorizer
>
> juzhe.zh...@rivai.ai writes:
>> From: Ju-Zhe Zhong 
>>
>> 1. Fix document description according Jeff && Richard.
>> 2. Add LOOP_VINFO_USING_SELECT_VL_P for single rgroup.
>> 3. Add LOOP_VINFO_USING_SLP_ADJUSTED_LEN_P for SLP multiple rgroup.
>>
>> Fix bugs for V5 after testing:
>> https://gcc.gnu.org/pipermail/gcc-patches/2023-May/618209.html
>>
>> gcc/ChangeLog:
>>
>> * doc/md.texi: Add select_vl pattern.
>> * internal-fn.def (SELECT_VL): New ifn.
>> * optabs.def (OPTAB_D): New optab.
>> * tree-vect-loop-manip.cc (vect_adjust_loop_lens): New function.
>> (vect_set_loop_controls_by_select_vl): Ditto.
>> (vect_set_loop_condition_partial_vectors): Add loop control for
> decrement IV.
>> * tree-vect-loop.cc (vect_get_loop_len): Adjust loop len for SLP.
>> * tree-vect-stmts.cc (get_select_vl_data_ref_ptr): New function.
>> (vectorizable_store): Support data reference IV added by outcome of
> SELECT_VL.
>> (vectorizable_load): Ditto.
>> * tree-vectorizer.h (LOOP_VINFO_USING_SELECT_VL_P): New macro.
>> (LOOP_VINFO_USING_SLP_ADJUSTED_LEN_P): Ditto.
>> (vect_get_loop_len): Adjust loop len for SLP.
>>
>> ---
>>  gcc/doc/md.texi |  36 
>>  gcc/internal-fn.def |   1 +
>>  gcc/optabs.def  |   1 +
>>  gcc/tree-vect-loop-manip.cc | 380 +++-
>>  gcc/tree-vect-loop.cc   |  31 ++-
>>  gcc/tree-vect-stmts.cc  |  79 +++-
>>  gcc/tree-vectorizer.h   |  12 +-
>>  7 files changed, 526 insertions(+), 14 deletions(-)
>>
>> diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
>> index 8ebce31ba78..a94ffc4456d 100644
>> --- a/gcc/doc/md.texi
>> +++ b/gcc/doc/md.texi
>> @@ -4974,6 +4974,42 @@ for (i = 1; i < operand3; i++)
>>operand0[i] = operand0[i - 1] && (operand1 + i < operand2);
>>  @end smallexample
>>  
>> +@cindex @code{select_vl@var{m}} instruction pattern
>> +@item @code{select_vl@var{m}}
>> +Set operand 0 to the number of active elements in a vector to be updated
>> +in a loop iteration based on the total number of elements to be updated,
>> +the vectorization factor and vector properties of the target.
>> +operand 1 is the total elements in the vector to be updated.
>> +operand 2 is the vectorization factor.
>> +The value of operand 0 is target dependent and flexible in each iteration.
>> +The operation of this pattern can be:
>> +
>> +@smallexample
>> +Case 1:
>> +operand0 = MIN (operand1, operand2);
>> +operand2 can be const_poly_int or poly_int related to vector mode size.
>> +Some target like RISC-V has a standalone instruction to get MIN (n, MODE
> SIZE) so
>> +that we can reduce a use of general purpose register.
>> +
>> +In this case, only the last iteration of the loop is partial iteration.
>> +@end smallexample
>> +
>> +@smallexample
>> +Case 2:
>> +if (operand1 <= operand2)
>> +  operand0 = operand1;
>> +else if (operand1 < 2 * operand2)
>> +  operand0 = ceil (operand1 / 2);
>> +else
>> +  operand0 = operand2;
>> +
>> +This case will evenly distribute work over the last 2 iterations of a
> stripmine loop.
>> +@end smallexample
>> +
>> +The output of this pattern is not only used as IV of loop control counter,
> but also
>> +is used as the IV of address calculation with multiply/shift operation. This
> allows
>> +dynamic adjustment of the number of elements processed each loop iteration.
>> +
>
> I don't think we need to restrict the definition to the two RVV cases.
> How about:
>
> ---
> Set operand 0 to the number of scalar iterations that should be handled
> by one iteration of a vector loop.  Operand 1 is the total number of
> scalar iterations that the loop needs to process and operand 2 is a
> maximum bound on the result (also known as the maximum ``vectorization
> factor'').
>
> The maximum value of operand 0 is given by:
> @smallexample
> operand0 = MIN (operand1, operand2)
> @end smallexample
> Ho

[PATCH] tree-optimization/109791 - simplify (unsigned)&foo - (unsigned)(&foo + o)

2023-05-12 Thread Richard Biener via Gcc-patches
The following adds another variant of address difference simplification.
The utility ptr_difference_const only handles constant differences
(we also cannot code generate anything else), so exposing a possible
POINTER_PLUS_EXPR in the match and computing the difference on the
base only makes it possible to handle one case of a variable offset.
This simplifies

(unsigned long) &MEM  [(void *)&str + 2B] - (unsigned long) (&str + 
(_69 + 1))

down to (1 - (unsigned long) _69) during niter analysis, allowing
ranger to eliminate a condition later and avoiding a bogus
-Wstringop-overflow diagnostic for the testcase in the PR.

Bootstrapped and tested on x86_64-unknown-linux-gnu, pushed.

PR tree-optimization/109791
* match.pd (minus (convert ADDR_EXPR@0) (convert (pointer_plus @1 @2))):
New pattern.
(minus (convert (pointer_plus @1 @2)) (convert ADDR_EXPR@0)):
Likewise.
---
 gcc/match.pd | 12 
 1 file changed, 12 insertions(+)

diff --git a/gcc/match.pd b/gcc/match.pd
index b7f28ab074c..2e46e074e93 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -2572,6 +2572,18 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
(bit_and @0 { algn; })))
 
 /* Try folding difference of addresses.  */
+(simplify
+ (minus (convert ADDR_EXPR@0) (convert (pointer_plus @1 @2)))
+ (if (tree_nop_conversion_p (type, TREE_TYPE (@0)))
+  (with { poly_int64 diff; }
+   (if (ptr_difference_const (@0, @1, &diff))
+(minus { build_int_cst_type (type, diff); } (convert @2))
+(simplify
+ (minus (convert (pointer_plus @0 @2)) (convert ADDR_EXPR@1))
+ (if (tree_nop_conversion_p (type, TREE_TYPE (@0)))
+  (with { poly_int64 diff; }
+   (if (ptr_difference_const (@0, @1, &diff))
+(plus (convert @2) { build_int_cst_type (type, diff); })
 (simplify
  (minus (convert ADDR_EXPR@0) (convert @1))
  (if (tree_nop_conversion_p (type, TREE_TYPE (@0)))
-- 
2.35.3


RE: [PATCH] Machine_Mode: Extend machine_mode from 8 to 16 bits

2023-05-12 Thread Li, Pan2 via Gcc-patches
Never mind. When preparing the PR, I kept asking myself whether every place
that depends on the machine mode bit size had been updated. Thus I would like
to align the bit size to one macro, so that developers (perhaps myself in the
future) do not have to worry about this.

I will try to move the machine mode macro to machmode.h.

Pan

-Original Message-
From: Richard Sandiford  
Sent: Friday, May 12, 2023 7:32 PM
To: Li, Pan2 via Gcc-patches 
Cc: Li, Pan2 ; juzhe.zh...@rivai.ai; kito.ch...@sifive.com; 
Wang, Yanzhang ; jeffreya...@gmail.com; 
rguent...@suse.de
Subject: Re: [PATCH] Machine_Mode: Extend machine_mode from 8 to 16 bits

"Li, Pan2 via Gcc-patches"  writes:
> Thanks Richard for comments. In previous, I am not sure it is reasonable to 
> let everywhere consume the same macro in rtl.h (As the includes you 
> mentioned). Thus, make a conservative change in PATCH v1.
>
> I will address the comments and try to align the bit size to the one and the 
> only one macro soon.

Sorry, I should have thought about this earlier, but it would probably make 
sense to name the macro MACHINE_MODE_BITSIZE and define it in machmode.h rather 
than rtl.h.  (The rtx_code stuff should stay as-is.)

Thanks,
Richard

>
> Pan
>
>
> -Original Message-
> From: Richard Sandiford 
> Sent: Friday, May 12, 2023 4:24 PM
> To: Li, Pan2 
> Cc: gcc-patches@gcc.gnu.org; juzhe.zh...@rivai.ai; 
> kito.ch...@sifive.com; Wang, Yanzhang ; 
> jeffreya...@gmail.com; rguent...@suse.de
> Subject: Re: [PATCH] Machine_Mode: Extend machine_mode from 8 to 16 
> bits
>
> pan2...@intel.com writes:
>> From: Pan Li 
>>
>> We are running out of the machine_mode(8 bits) in RISC-V backend. 
>> Thus we would like to extend the machine mode bit size from 8 to 16 bits.
>> However, it is sensitive to extend the memory size in common 
>> structure like tree or rtx. This patch would like to extend the 
>> machine mode bits to 16 bits by shrinking, like:
>>
>> * Swap the bit size of code and machine mode in rtx_def.
>> * Reconcile the machine_mode location and spare in tree.
>>
>> The memory impact of this patch for correlated structure looks like below:
>>
>> +---+--+-+--+
>> | struct/bytes  | upstream | patched | diff |
>> +---+--+-+--+
>> | rtx_obj_reference |8 |  12 |   +4 |
>> | ext_modified  |2 |   3 |   +1 |
>> | ira_allocno   |  192 | 200 |   +8 |
>> | qty_table_elem|   40 |  40 |0 |
>> | reg_stat_type |   64 |  64 |0 |
>> | rtx_def   |   40 |  40 |0 |
>> | table_elt |   80 |  80 |0 |
>> | tree_decl_common  |  112 | 112 |0 |
>> | tree_type_common  |  128 | 128 |0 |
>> +---+--+-+--+
>>
>> The tree and rtx related struct has no memory changes after this 
>> patch, and the machine_mode changes to 16 bits already.
>>
>> Signed-off-by: Pan Li 
>> Co-authored-by: Ju-Zhe Zhong 
>> Co-authored-by: Kito Cheng 
>>
>> gcc/ChangeLog:
>>
>>  * combine.cc (struct reg_stat_type): Extended machine mode to 16 bits.
>>  * cse.cc (struct qty_table_elem): Ditto.
>>  (struct table_elt): Ditto.
>>  (struct set): Ditto.
>>  * genopinit.cc (main): Reconciled the machine mode limit.
>>  * ira-int.h (struct ira_allocno): Extended machine mode to 16 bits.
>>  * ree.cc (struct ATTRIBUTE_PACKED): Ditto.
>>  * rtl-ssa/accesses.h: Ditto.
>>  * rtl.h (RTX_CODE_BITSIZE): New macro.
>>  (RTX_MACHINE_MODE_BITSIZE): Ditto.
>>  (struct GTY): Swap bit size between code and machine mode.
>>  (subreg_shape::unique_id): Reconciled the machine mode limit.
>>  * rtlanal.h: Extended machine mode to 16 bits.
>>  * tree-core.h (struct tree_type_common): Ditto.
>>  (struct tree_decl_common): Reconciled the locate and extended
>>  bit size of machine mode.
>> ---
>>  gcc/combine.cc |  4 ++--
>>  gcc/cse.cc |  8 
>>  gcc/genopinit.cc   |  3 ++-
>>  gcc/ira-int.h  | 12 
>>  gcc/ree.cc |  2 +-
>>  gcc/rtl-ssa/accesses.h |  6 --
>>  gcc/rtl.h  |  9 ++---
>>  gcc/rtlanal.h  |  5 +++--
>>  gcc/tree-core.h| 11 ---
>>  9 files changed, 38 insertions(+), 22 deletions(-)
>>
>> diff --git a/gcc/combine.cc b/gcc/combine.cc index
>> 5aa0ec5c45a..bdf6f635c80 100644
>> --- a/gcc/combine.cc
>> +++ b/gcc/combine.cc
>> @@ -200,7 +200,7 @@ struct reg_stat_type {
>>  
>>unsigned HOST_WIDE_INTlast_set_nonzero_bits;
>>char  last_set_sign_bit_copies;
>> -  ENUM_BITFIELD(machine_mode)   last_set_mode : 8;
>> +  ENUM_BITFIELD(machine_mode)   last_set_mode : 
>> RTX_MACHINE_MODE_BITSIZE;
>>  
>>/* Set nonzero if references to register n in expressions should not be
>>   used.  last_set_invalid is set nonzero when this register is 
>> being @@ -235,7 +235,7 @@ struct reg_

[PATCH] OpenMP: Constructors and destructors for "declare target" static aggregates

2023-05-12 Thread Julian Brown
This patch adds support for running constructors and destructors for
static (file-scope) aggregates for C++ objects which are marked with
"declare target" directives on OpenMP offload targets.

At present, space is allocated on the target for such aggregates, but
nothing ever constructs them properly, so they end up zero-initialised.

The approach taken is to generate a set of constructors to run on the
target: this currently works for AMD GCN, but fails on NVPTX due
to lack of constructor/destructor support there so far on mainline.
(See the new test static-aggr-constructor-destructor-3.C for a reason
why running constructors on the target is preferable to e.g. constructing
on the host and then copying the resulting object to the target.)

This patch was previously posted for the og12 branch here:

  https://gcc.gnu.org/pipermail/gcc-patches/2023-March/614710.html
  https://gcc.gnu.org/pipermail/gcc-patches/2023-April/615013.html
  https://gcc.gnu.org/pipermail/gcc-patches/2023-April/615144.html

though needed a fair amount of rework for mainline due to Nathan's
(earlier!) patch:

  https://gcc.gnu.org/pipermail/gcc-patches/2022-June/596402.html

Tested with offloading to AMD GCN and bootstrapped. OK for mainline?

Thanks,

Julian

2023-05-12  Julian Brown  

gcc/cp/
* decl2.cc (tree-inline.h): Include.
(static_init_fini_fns): Bump to four entries. Update comment.
(start_objects, start_partial_init_fini_fn): Add 'omp_target'
parameter. Support "declare target" decls. Update forward declaration.
(emit_partial_init_fini_fn): Add 'host_fn' parameter. Return tree for
the created function. Support "declare target".
(OMP_SSDF_IDENTIFIER): New macro.
(partition_vars_for_init_fini): Support partitioning "declare target"
variables also.
(generate_ctor_or_dtor_function): Add 'omp_target' parameter. Support
"declare target" decls.
(c_parse_final_cleanups): Support constructors/destructors on OpenMP
offload targets.

gcc/
* omp-builtins.def (BUILT_IN_OMP_IS_INITIAL_DEVICE): New builtin.
* tree.cc (get_file_function_name): Support names for on-target
constructor/destructor functions.

libgomp/
* testsuite/libgomp.c++/static-aggr-constructor-destructor-1.C: New
test.
* testsuite/libgomp.c++/static-aggr-constructor-destructor-2.C: New
test.
* testsuite/libgomp.c++/static-aggr-constructor-destructor-3.C: New
test.
---
 gcc/cp/decl2.cc   | 243 +++---
 gcc/omp-builtins.def  |   2 +
 gcc/tree.cc   |   6 +-
 .../static-aggr-constructor-destructor-1.C|  28 ++
 .../static-aggr-constructor-destructor-2.C|  31 +++
 .../static-aggr-constructor-destructor-3.C|  36 +++
 6 files changed, 305 insertions(+), 41 deletions(-)
 create mode 100644 
libgomp/testsuite/libgomp.c++/static-aggr-constructor-destructor-1.C
 create mode 100644 
libgomp/testsuite/libgomp.c++/static-aggr-constructor-destructor-2.C
 create mode 100644 
libgomp/testsuite/libgomp.c++/static-aggr-constructor-destructor-3.C

diff --git a/gcc/cp/decl2.cc b/gcc/cp/decl2.cc
index b510cdac554..ceec681fbeb 100644
--- a/gcc/cp/decl2.cc
+++ b/gcc/cp/decl2.cc
@@ -50,20 +50,22 @@ along with GCC; see the file COPYING3.  If not see
 #include "asan.h"
 #include "optabs-query.h"
 #include "omp-general.h"
+#include "tree-inline.h"
 
 /* Id for dumping the raw trees.  */
 int raw_dump_id;
  
 extern cpp_reader *parse_in;
 
-static tree start_objects (bool, unsigned, bool);
+static tree start_objects (bool, unsigned, bool, bool);
 static tree finish_objects (bool, unsigned, tree, bool = true);
-static tree start_partial_init_fini_fn (bool, unsigned, unsigned);
+static tree start_partial_init_fini_fn (bool, unsigned, unsigned, bool);
 static void finish_partial_init_fini_fn (tree);
-static void emit_partial_init_fini_fn (bool, unsigned, tree,
-  unsigned, location_t);
+static tree emit_partial_init_fini_fn (bool, unsigned, tree,
+  unsigned, location_t, tree);
 static void one_static_initialization_or_destruction (bool, tree, tree);
-static void generate_ctor_or_dtor_function (bool, unsigned, tree, location_t);
+static void generate_ctor_or_dtor_function (bool, unsigned, tree, location_t,
+   bool);
 static tree prune_vars_needing_no_initialization (tree *);
 static void write_out_vars (tree);
 static void import_export_class (tree);
@@ -165,9 +167,10 @@ struct priority_map_traits
 typedef hash_map priority_map_t;
 
-/* A pair of such hash tables, indexed by initp -- one for fini and
-   one for init.  The fini table is only ever used when !cxa_atexit.  */
-static GTY(()) priority_map_t *static_init_fini_fns[2];
+/* Two pairs of such hash tables, for the host and an OpenMP offload device.
+   Each 

Re: [PATCH 1/3] Refactor to allow internal_fn's

2023-05-12 Thread Andre Vieira (lists) via Gcc-patches

Hi,

I think I tackled all of your comments, let me know if I missed something.


gcc/ChangeLog:

2023-05-12  Andre Vieira  
Joel Hutton  

* tree-vect-patterns.cc (vect_gimple_build): New Function.
(vect_recog_widen_op_pattern): Refactor to use code_helper.
* tree-vect-stmts.cc (vect_gen_widened_results_half): Likewise.
(vect_create_vectorized_demotion_stmts): Likewise.
(vect_create_vectorized_promotion_stmts): Likewise.
(vect_create_half_widening_stmts): Likewise.
(vectorizable_conversion): Likewise.
(vectorizable_call): Likewise.
(supportable_widening_operation): Likewise.
(supportable_narrowing_operation): Likewise.
(simple_integer_narrowing): Likewise.
* tree-vectorizer.h (supportable_widening_operation): Likewise.
(supportable_narrowing_operation): Likewise.
(vect_gimple_build): New function prototype.
* tree.h (code_helper::safe_as_tree_code): New function.
(code_helper::safe_as_fn_code): New function.

diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
index 
33a8b2bb60601dc1a67de62a56bbf3c355e12dbd..1778af0242898e3dc73d94d22a5b8505628a53b5
 100644
--- a/gcc/tree-vect-patterns.cc
+++ b/gcc/tree-vect-patterns.cc
@@ -25,6 +25,8 @@ along with GCC; see the file COPYING3.  If not see
 #include "rtl.h"
 #include "tree.h"
 #include "gimple.h"
+#include "gimple-iterator.h"
+#include "gimple-fold.h"
 #include "ssa.h"
 #include "expmed.h"
 #include "optabs-tree.h"
@@ -1392,7 +1394,7 @@ vect_recog_sad_pattern (vec_info *vinfo,
 static gimple *
 vect_recog_widen_op_pattern (vec_info *vinfo,
 stmt_vec_info last_stmt_info, tree *type_out,
-tree_code orig_code, tree_code wide_code,
+tree_code orig_code, code_helper wide_code,
 bool shift_p, const char *name)
 {
   gimple *last_stmt = last_stmt_info->stmt;
@@ -1435,7 +1437,7 @@ vect_recog_widen_op_pattern (vec_info *vinfo,
   vecctype = get_vectype_for_scalar_type (vinfo, ctype);
 }
 
-  enum tree_code dummy_code;
+  code_helper dummy_code;
   int dummy_int;
   auto_vec dummy_vec;
   if (!vectype
@@ -1456,8 +1458,7 @@ vect_recog_widen_op_pattern (vec_info *vinfo,
   2, oprnd, half_type, unprom, vectype);
 
   tree var = vect_recog_temp_ssa_var (itype, NULL);
-  gimple *pattern_stmt = gimple_build_assign (var, wide_code,
- oprnd[0], oprnd[1]);
+  gimple *pattern_stmt = vect_gimple_build (var, wide_code, oprnd[0], 
oprnd[1]);
 
   if (vecctype != vecitype)
 pattern_stmt = vect_convert_output (vinfo, last_stmt_info, ctype,
@@ -6808,3 +6809,20 @@ vect_pattern_recog (vec_info *vinfo)
   /* After this no more add_stmt calls are allowed.  */
   vinfo->stmt_vec_info_ro = true;
 }
+
+/* Build a GIMPLE_ASSIGN or GIMPLE_CALL with the tree_code,
+   or internal_fn contained in ch, respectively.  */
+gimple *
+vect_gimple_build (tree lhs, code_helper ch, tree op0, tree op1)
+{
+  gcc_assert (op0 != NULL_TREE);
+  if (ch.is_tree_code ())
+return gimple_build_assign (lhs, (tree_code) ch, op0, op1);
+
+  gcc_assert (ch.is_internal_fn ());
+  gimple* stmt = gimple_build_call_internal (as_internal_fn ((combined_fn) ch),
+op1 == NULL_TREE ? 1 : 2,
+op0, op1);
+  gimple_call_set_lhs (stmt, lhs);
+  return stmt;
+}
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 
61a2da4ecee9c449c1469cab3c4cfa1a782471d5..d152ae9ab10b361b88c0f839d6951c43b954750a
 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -3261,13 +3261,13 @@ vectorizable_bswap (vec_info *vinfo,
 
 static bool
 simple_integer_narrowing (tree vectype_out, tree vectype_in,
- tree_code *convert_code)
+ code_helper *convert_code)
 {
   if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype_out))
   || !INTEGRAL_TYPE_P (TREE_TYPE (vectype_in)))
 return false;
 
-  tree_code code;
+  code_helper code;
   int multi_step_cvt = 0;
   auto_vec  interm_types;
   if (!supportable_narrowing_operation (NOP_EXPR, vectype_out, vectype_in,
@@ -3481,7 +3481,7 @@ vectorizable_call (vec_info *vinfo,
   tree callee = gimple_call_fndecl (stmt);
 
   /* First try using an internal function.  */
-  tree_code convert_code = ERROR_MARK;
+  code_helper convert_code = MAX_TREE_CODES;
   if (cfn != CFN_LAST
   && (modifier == NONE
  || (modifier == NARROW
@@ -3667,8 +3667,8 @@ vectorizable_call (vec_info *vinfo,
  continue;
}
  new_temp = make_ssa_name (vec_dest);
- new_stmt = gimple_build_assign (new_temp, convert_code,
- prev_res, half_res);
+ new_stmt = vect_gimple_build (new_temp,

Re: [PATCH 2/3] Refactor widen_plus as internal_fn

2023-05-12 Thread Andre Vieira (lists) via Gcc-patches
I have dealt with, I think..., most of your comments. There's quite a 
few changes, I think it's all a bit simpler now. I made some other 
changes to the costing in tree-inline.cc and gimple-range-op.cc in which 
I try to preserve the same behaviour as we had with the tree codes 
before. Also added some extra checks to tree-cfg.cc that made sense to me.


I am still regression testing the gimple-range-op change, as that was a 
last minute change, but the rest survived a bootstrap and regression 
test on aarch64-unknown-linux-gnu.


cover letter:

This patch replaces the existing tree_code widen_plus and widen_minus
patterns with internal_fn versions.

DEF_INTERNAL_OPTAB_WIDENING_HILO_FN and 
DEF_INTERNAL_OPTAB_NARROWING_HILO_FN are like 
DEF_INTERNAL_SIGNED_OPTAB_FN and DEF_INTERNAL_OPTAB_FN respectively 
except they provide convenience wrappers for defining conversions that 
require a hi/lo split.  Each definition for NAME will require optabs 
for NAME_hi and NAME_lo and each of those will also require a signed and 
unsigned version in the case of widening. The hi/lo pair is necessary 
because the widening and narrowing operations take n narrow elements as 
inputs and return n/2 wide elements as outputs. The 'lo' operation 
operates on the first n/2 elements of input. The 'hi' operation operates 
on the second n/2 elements of input. Defining an internal_fn along with 
hi/lo variations allows a single internal function to be returned from a 
vect_recog function that will later be expanded to hi/lo.



 For example:
 IFN_VEC_WIDEN_PLUS -> IFN_VEC_WIDEN_PLUS_HI, IFN_VEC_WIDEN_PLUS_LO
for aarch64: IFN_VEC_WIDEN_PLUS_HI   -> vec_widen_add_hi_ -> 
(u/s)addl2
   IFN_VEC_WIDEN_PLUS_LO  -> 
vec_widen_add_lo_ -> (u/s)addl


This gives the same functionality as the previous WIDEN_PLUS/WIDEN_MINUS 
tree codes which are expanded into VEC_WIDEN_PLUS_LO, VEC_WIDEN_PLUS_HI.


gcc/ChangeLog:

2023-05-12  Andre Vieira  
Joel Hutton  
Tamar Christina  

* config/aarch64/aarch64-simd.md 
(vec_widen_addl_lo_): Rename this ...
(vec_widen_add_lo_): ... to this.
(vec_widen_addl_hi_): Rename this ...
(vec_widen_add_hi_): ... to this.
(vec_widen_subl_lo_): Rename this ...
(vec_widen_sub_lo_): ... to this.
(vec_widen_subl_hi_): Rename this ...
(vec_widen_sub_hi_): ...to this.
* doc/generic.texi: Document new IFN codes.
* internal-fn.cc (DEF_INTERNAL_OPTAB_WIDENING_HILO_FN): Macro to define 
an
internal_fn that expands into multiple internal_fns for widening.
(DEF_INTERNAL_OPTAB_NARROWING_HILO_FN): Likewise but for narrowing.
(ifn_cmp): Function to compare ifn's for sorting/searching.
(lookup_hilo_internal_fn): Add lookup function.
(commutative_binary_fn_p): Add widen_plus fn's.
(widening_fn_p): New function.
(narrowing_fn_p): New function.
(decomposes_to_hilo_fn_p): New function.
(direct_internal_fn_optab): Change visibility.
* internal-fn.def (DEF_INTERNAL_OPTAB_WIDENING_HILO_FN): Define widening
plus,minus functions.
(VEC_WIDEN_PLUS): Replacement for VEC_WIDEN_PLUS_EXPR tree code.
(VEC_WIDEN_MINUS): Replacement for VEC_WIDEN_MINUS_EXPR tree code.
* internal-fn.h (GCC_INTERNAL_FN_H): Add headers.
(direct_internal_fn_optab): Declare new prototype.
(lookup_hilo_internal_fn): Likewise.
(widening_fn_p): Likewise.
(narrowing_fn_p): Likewise.
(decomposes_to_hilo_fn_p): Likewise.
* optabs.cc (commutative_optab_p): Add widening plus optabs.
* optabs.def (OPTAB_D): Define widen add, sub optabs.
* tree-cfg.cc (verify_gimple_call): Add checks for new widen
add and sub IFNs.
* tree-inline.cc (estimate_num_insns): Return same
cost for widen add and sub IFNs as previous tree_codes.
* tree-vect-patterns.cc (vect_recog_widen_op_pattern): Support
patterns with a hi/lo split.
(vect_recog_sad_pattern): Refactor to use new IFN codes.
(vect_recog_widen_plus_pattern): Likewise.
(vect_recog_widen_minus_pattern): Likewise.
(vect_recog_average_pattern): Likewise.
* tree-vect-stmts.cc (vectorizable_conversion): Add support for
_HILO IFNs.
(supportable_widening_operation): Likewise.
* tree.def (WIDEN_SUM_EXPR): Update example to use new IFNs.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/vect-widen-add.c: Test that new
IFN_VEC_WIDEN_PLUS is being used.
* gcc.target/aarch64/vect-widen-sub.c: Test that new
IFN_VEC_WIDEN_MINUS is being used.

diff --git a/gcc/config/aarch64/aarch64-simd.md 
b/gcc/config/aarch64/aarch64-simd.md
index 
bfc98a8d943467b33390defab9682f44efab5907..ffbbecb9409e1c2835d658c2a8855cd0e955c0f2
 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -4626,7 +4626,7 @@
   [(set_attr "type" "neon

Re: [PATCH 3/3] Remove widen_plus/minus_expr tree codes

2023-05-12 Thread Andre Vieira (lists) via Gcc-patches
Moved the 'changes' from this patch back to the second so it's all just 
about removing code that we no longer use. I don't really know why Joel 
formatted the patches this way, but I thought I'd keep it as is for now.


cover letter:

This patch removes the old widen plus/minus tree codes which have been
replaced by internal functions.

gcc/ChangeLog:

2023-05-12  Andre Vieira  
Joel Hutton  

* cfgexpand.cc (expand_debug_expr): Remove old tree codes.
* doc/generic.texi: Likewise.
* expr.cc (expand_expr_real_2): Likewise.
* gimple-pretty-print.cc (dump_binary_rhs): Likewise.
* gimple-range-op.cc (gimple_range_op_handler::maybe_non_standard):
Likewise.
* optabs-tree.cc (optab_for_tree_code): Likewise.
(supportable_half_widening_operation): Likewise.
* optabs.cc (commutative_optab_p): Likewise.
* optabs.def (OPTAB_D): Likewise.
* tree-cfg.cc (verify_gimple_assign_binary): Likewise.
* tree-inline.cc (estimate_operator_cost): Likewise.
(op_symbol_code): Likewise.
* tree-pretty-print.cc (dump_generic_node): Remove tree code definition.
* tree-vect-data-refs.cc (vect_get_smallest_scalar_type): Likewise.
(vect_analyze_data_ref_accesses): Likewise.
* tree-vect-generic.cc (expand_vector_operations_1): Likewise.
* tree-vect-stmts.cc (vectorizable_conversion): Likewise.
(supportable_widening_operation): Likewise.
* tree.def (WIDEN_PLUS_EXPR, WIDEN_MINUS_EXPR, VEC_WIDEN_PLUS_HI_EXPR,
VEC_WIDEN_PLUS_LO_EXPR, VEC_WIDEN_MINUS_HI_EXPR,
VEC_WIDEN_MINUS_LO_EXPR): Likewise.

diff --git a/gcc/cfgexpand.cc b/gcc/cfgexpand.cc
index 
1a1b26b1c6c23ce273bcd08dc9a973f777174007..25b1558dcb941ea491a19aeeb2cd8f4d2dbdf7c6
 100644
--- a/gcc/cfgexpand.cc
+++ b/gcc/cfgexpand.cc
@@ -5365,10 +5365,6 @@ expand_debug_expr (tree exp)
 case VEC_WIDEN_MULT_ODD_EXPR:
 case VEC_WIDEN_LSHIFT_HI_EXPR:
 case VEC_WIDEN_LSHIFT_LO_EXPR:
-case VEC_WIDEN_PLUS_HI_EXPR:
-case VEC_WIDEN_PLUS_LO_EXPR:
-case VEC_WIDEN_MINUS_HI_EXPR:
-case VEC_WIDEN_MINUS_LO_EXPR:
 case VEC_PERM_EXPR:
 case VEC_DUPLICATE_EXPR:
 case VEC_SERIES_EXPR:
@@ -5405,8 +5401,6 @@ expand_debug_expr (tree exp)
 case WIDEN_MULT_EXPR:
 case WIDEN_MULT_PLUS_EXPR:
 case WIDEN_MULT_MINUS_EXPR:
-case WIDEN_PLUS_EXPR:
-case WIDEN_MINUS_EXPR:
   if (SCALAR_INT_MODE_P (GET_MODE (op0))
  && SCALAR_INT_MODE_P (mode))
{
@@ -5419,10 +5413,6 @@ expand_debug_expr (tree exp)
op1 = simplify_gen_unary (ZERO_EXTEND, mode, op1, inner_mode);
  else
op1 = simplify_gen_unary (SIGN_EXTEND, mode, op1, inner_mode);
- if (TREE_CODE (exp) == WIDEN_PLUS_EXPR)
-   return simplify_gen_binary (PLUS, mode, op0, op1);
- else if (TREE_CODE (exp) == WIDEN_MINUS_EXPR)
-   return simplify_gen_binary (MINUS, mode, op0, op1);
  op0 = simplify_gen_binary (MULT, mode, op0, op1);
  if (TREE_CODE (exp) == WIDEN_MULT_EXPR)
return op0;
diff --git a/gcc/doc/generic.texi b/gcc/doc/generic.texi
index 
0fd7e6cce8bbd4ecb8027b702722adcf6c32eb55..a23d57af20610e0bb4809f06fb0c91253ae56d11
 100644
--- a/gcc/doc/generic.texi
+++ b/gcc/doc/generic.texi
@@ -1815,10 +1815,6 @@ a value from @code{enum annot_expr_kind}, the third is 
an @code{INTEGER_CST}.
 @tindex IFN_VEC_WIDEN_PLUS_LO
 @tindex IFN_VEC_WIDEN_MINUS_HI
 @tindex IFN_VEC_WIDEN_MINUS_LO
-@tindex VEC_WIDEN_PLUS_HI_EXPR
-@tindex VEC_WIDEN_PLUS_LO_EXPR
-@tindex VEC_WIDEN_MINUS_HI_EXPR
-@tindex VEC_WIDEN_MINUS_LO_EXPR
 @tindex VEC_UNPACK_HI_EXPR
 @tindex VEC_UNPACK_LO_EXPR
 @tindex VEC_UNPACK_FLOAT_HI_EXPR
@@ -1892,33 +1888,6 @@ vector of @code{N/2} products.  In the case of
 vector are subtracted from the low @code{N/2} of the first to produce the
 vector of @code{N/2} products.
 
-@item VEC_WIDEN_PLUS_HI_EXPR
-@itemx VEC_WIDEN_PLUS_LO_EXPR
-These nodes represent widening vector addition of the high and low parts of
-the two input vectors, respectively.  Their operands are vectors that contain
-the same number of elements (@code{N}) of the same integral type. The result
-is a vector that contains half as many elements, of an integral type whose size
-is twice as wide.  In the case of @code{VEC_WIDEN_PLUS_HI_EXPR} the high
-@code{N/2} elements of the two vectors are added to produce the vector of
-@code{N/2} products.  In the case of @code{VEC_WIDEN_PLUS_LO_EXPR} the low
-@code{N/2} elements of the two vectors are added to produce the vector of
-@code{N/2} products.
-
-@item VEC_WIDEN_MINUS_HI_EXPR
-@itemx VEC_WIDEN_MINUS_LO_EXPR
-These nodes represent widening vector subtraction of the high and low parts of
-the two input vectors, respectively.  Their operands are vectors that contain
-the same number of elements (@code{N}) of the same integral type. The high/low
-elements of the second vector are subtracted from the high/low elements of

[pushed] c++: remove redundant testcase [PR83258]

2023-05-12 Thread Patrick Palka via Gcc-patches
I noticed only after the fact that the new testcase template/function2.C
(from r14-708-gc3afdb8ba8f183) is just a subset of ext/visibility/anon8.C,
so let's get rid of it.

PR c++/83258

gcc/testsuite/ChangeLog:

* g++.dg/ext/visibility/anon8.C: Mention PR83258.
* g++.dg/template/function2.C: Removed.
---
 gcc/testsuite/g++.dg/ext/visibility/anon8.C | 2 +-
 gcc/testsuite/g++.dg/template/function2.C   | 8 
 2 files changed, 1 insertion(+), 9 deletions(-)
 delete mode 100644 gcc/testsuite/g++.dg/template/function2.C

diff --git a/gcc/testsuite/g++.dg/ext/visibility/anon8.C 
b/gcc/testsuite/g++.dg/ext/visibility/anon8.C
index bfcc2d06df6..2c3a8acac93 100644
--- a/gcc/testsuite/g++.dg/ext/visibility/anon8.C
+++ b/gcc/testsuite/g++.dg/ext/visibility/anon8.C
@@ -26,7 +26,7 @@ int main ()
 static void fn2 () {}
   };
   call<&B1::fn1> ();
-  call<&B2::fn2> ();   // { dg-error "linkage|no matching" "" { target 
c++14_down } }
+  call<&B2::fn2> ();   // { dg-error "linkage|no matching" "PR83258" { target 
c++14_down } }
   call<&fn3> ();
   call<&B1::fn4> ();
   call<&fn5> ();   // { dg-error "linkage|no matching" "" { target { ! 
c++11 } } }
diff --git a/gcc/testsuite/g++.dg/template/function2.C 
b/gcc/testsuite/g++.dg/template/function2.C
deleted file mode 100644
index 54c48e6b36f..000
--- a/gcc/testsuite/g++.dg/template/function2.C
+++ /dev/null
@@ -1,8 +0,0 @@
-// PR c++/83258
-
-template struct A { };
-
-int main() {
-  struct B { static void f() { } };
-  A a; // { dg-error "linkage" "" { target c++14_down } }
-}
-- 
2.40.1.552.g91428f078b



[pushed] c++: robustify testcase [PR109752]

2023-05-12 Thread Patrick Palka via Gcc-patches
This rewrites the testcase for PR109752 to make it simpler and more
robust (i.e. no longer dependent on r13-4035-gc41bbfcaf9d6ef).

PR c++/109752

gcc/testsuite/ChangeLog:

* g++.dg/cpp2a/concepts-pr109752.C: Rename to ...
* g++.dg/cpp2a/concepts-complete4.C: ... this.  Rewrite.
---
 .../g++.dg/cpp2a/concepts-complete4.C | 13 ++
 .../g++.dg/cpp2a/concepts-pr109752.C  | 26 ---
 2 files changed, 13 insertions(+), 26 deletions(-)
 create mode 100644 gcc/testsuite/g++.dg/cpp2a/concepts-complete4.C
 delete mode 100644 gcc/testsuite/g++.dg/cpp2a/concepts-pr109752.C

diff --git a/gcc/testsuite/g++.dg/cpp2a/concepts-complete4.C 
b/gcc/testsuite/g++.dg/cpp2a/concepts-complete4.C
new file mode 100644
index 000..988b0ddcfdd
--- /dev/null
+++ b/gcc/testsuite/g++.dg/cpp2a/concepts-complete4.C
@@ -0,0 +1,13 @@
+// PR c++/109752
+// { dg-do compile { target c++20 } }
+
+template
+concept C = requires { sizeof(T); } && T::value; // { dg-error "changed from" }
+
+struct A;
+
+static_assert(!C);
+
+struct A { static constexpr bool value = false; };
+
+static_assert(C); // { dg-error "assert" }
diff --git a/gcc/testsuite/g++.dg/cpp2a/concepts-pr109752.C 
b/gcc/testsuite/g++.dg/cpp2a/concepts-pr109752.C
deleted file mode 100644
index d54ce295e50..000
--- a/gcc/testsuite/g++.dg/cpp2a/concepts-pr109752.C
+++ /dev/null
@@ -1,26 +0,0 @@
-// PR c++/109752
-// { dg-do compile { target c++20 } }
-
-template 
-  inline constexpr bool is_constructible_v = __is_constructible(_Tp, _Args...);
-template
-  concept __weakly_eq_cmp_with
- = requires(_Tp __t, _Up __u) {{ __u != __t } ; // { dg-error "changed 
from" }
- };
-  template
-concept regular =  is_constructible_v<_Tp>  && __weakly_eq_cmp_with<_Tp, 
_Tp>;
-  template concept incrementable = true
-&& regular<_Iter>
-&& requires(_Iter __i) { { __i++ } ;}
-;
-template
-struct iterator_interface
-{
-  friend constexpr bool operator>=(D lhs, D rhs) requires 
__weakly_eq_cmp_with { return true; }
-};
-template
-struct iterator : iterator_interface>
-{
-bool operator==(iterator) const;
-};
-static_assert(incrementable>); // { dg-error "assert" }
-- 
2.40.1.552.g91428f078b



[PATCH] ipa: Self-DCE of uses of removed call LHSs (PR 108007)

2023-05-12 Thread Martin Jambor
Hi,

PR 108007 is another manifestation where we rely on DCE to clean-up
after IPA-SRA and if the user explicitly switches DCE off, IPA-SRA
can leave behind statements which are fed uninitialized values and
trap, even though their results are themselves never used.

I have already fixed this for unused parameters in callees, this bug
shows that almost the same thing can happen for removed returns, on
the side of callers.  This means that the issue has to be fixed
elsewhere, in call redirection.  This patch adds a function which
recursively looks for uses of operations fed specific SSA names and
removes them all.

That would have been easy if it wasn't for debug statements during
tree-inline (from which call redirection is also invoked).  Debug
statements are decoupled from the rest at this point and iterating
over uses of SSAs does not bring them up.  During tree-inline they are
handled specially at the end, I assume in order to make sure that
relative ordering of UIDs is the same with and without debug info.

This means that during tree-inline we need to make a hash of killed
SSAs, that we already have in copy_body_data, available to the
function making the purging.  So the patch duly does also that, making
the interface slightly ugly.

Bootstrapped and tested on x86_64-linux.  OK for master?  (I am not sure
the problem is grave enough to warrant backporting to release branches
but can do that as well if people think I should.)

Thanks,

Martin


gcc/ChangeLog:

2023-05-11  Martin Jambor  

PR ipa/108007
* cgraph.h (cgraph_edge): Add a parameter to
redirect_call_stmt_to_callee.
* ipa-param-manipulation.h (ipa_param_adjustments): Added a
parameter to modify_call.
* cgraph.cc (cgraph_edge::redirect_call_stmt_to_callee): New
parameter killed_ssas, pass it to padjs->modify_call.
* ipa-param-manipulation.cc (purge_transitive_uses): New function.
(ipa_param_adjustments::modify_call): New parameter killed_ssas.
Instead of substituting uses, invoke purge_transitive_uses.  If
hash of killed SSAs has not been provided, create a temporary one
and release SSAs that have been added to it.
* tree-inline.cc (redirect_all_calls): Create
id->killed_new_ssa_names earlier, pass it to edge redirection,
adjust a comment.
(copy_body): Release SSAs in id->killed_new_ssa_names.

gcc/testsuite/ChangeLog:

2023-05-11  Martin Jambor  

PR ipa/108007
* gcc.dg/ipa/pr108007.c: New test.
---
 gcc/cgraph.cc   | 10 +++-
 gcc/cgraph.h|  9 ++-
 gcc/ipa-param-manipulation.cc   | 85 +
 gcc/ipa-param-manipulation.h|  3 +-
 gcc/testsuite/gcc.dg/ipa/pr108007.c | 32 +++
 gcc/tree-inline.cc  | 28 ++
 6 files changed, 129 insertions(+), 38 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/ipa/pr108007.c

diff --git a/gcc/cgraph.cc b/gcc/cgraph.cc
index e8f9bec8227..5e923bf0557 100644
--- a/gcc/cgraph.cc
+++ b/gcc/cgraph.cc
@@ -1403,11 +1403,17 @@ cgraph_edge::redirect_callee (cgraph_node *n)
speculative indirect call, remove "speculative" of the indirect call and
also redirect stmt to it's final direct target.
 
+   When called from within tree-inline, KILLED_SSAs has to contain the pointer
+   to killed_new_ssa_names within the copy_body_data structure and SSAs
+   discovered to be useless (if LHS is removed) will be added to it, otherwise
+   it needs to be NULL.
+
It is up to caller to iteratively transform each "speculative"
direct call as appropriate.  */
 
 gimple *
-cgraph_edge::redirect_call_stmt_to_callee (cgraph_edge *e)
+cgraph_edge::redirect_call_stmt_to_callee (cgraph_edge *e,
+  hash_set  *killed_ssas)
 {
   tree decl = gimple_call_fndecl (e->call_stmt);
   gcall *new_stmt;
@@ -1527,7 +1533,7 @@ cgraph_edge::redirect_call_stmt_to_callee (cgraph_edge *e)
remove_stmt_from_eh_lp (e->call_stmt);
 
   tree old_fntype = gimple_call_fntype (e->call_stmt);
-  new_stmt = padjs->modify_call (e, false);
+  new_stmt = padjs->modify_call (e, false, killed_ssas);
   cgraph_node *origin = e->callee;
   while (origin->clone_of)
origin = origin->clone_of;
diff --git a/gcc/cgraph.h b/gcc/cgraph.h
index f5f54769eda..c1a3691b6f5 100644
--- a/gcc/cgraph.h
+++ b/gcc/cgraph.h
@@ -1833,9 +1833,16 @@ public:
  speculative indirect call, remove "speculative" of the indirect call and
  also redirect stmt to it's final direct target.
 
+ When called from within tree-inline, KILLED_SSAs has to contain the
+ pointer to killed_new_ssa_names within the copy_body_data structure and
+ SSAs discovered to be useless (if LHS is removed) will be added to it,
+ otherwise it needs to be NULL.
+
  It is up to caller to iteratively transform each "speculative"
  direct call as appropriate.  

[Patch] LTO: Fix writing of toplevel asm with offloading [PR109816]

2023-05-12 Thread Tobias Burnus

Long standing issue but as top-level 'asm' statements were rare, it did not show 
up.
However, the fix for PR108969 in commit r14-321-g9a41d2cdbcd added code

+#elif defined(_GLIBCXX_SYMVER_GNU)
+  __extension__ __asm (".globl _ZSt21ios_base_library_initv");
to
libstdc++-v3/include/std/iostream. This was then duly written by the 
offloading-device
lto1 for digestion by the device-target assembler. While the llvm-mc linker 
used by
GCN did accept .globl, nvptx's ptxas did choke on it.

Additionally, as the assembly was already written for offloading, the output was
lost on the host when using LTO not only for offload but for real (i.e. 
with -flto).

Has someone an idea how to check whether the offloading-code assembler does not
contain the _ZSt21ios_base_library_initv while the host-side (before or after 
LTO)
should contain it, but only with _GLIBCXX_SYMVER_GNU?
Otherwise, the testcase tests only and at least whether it breaks with nvptx
as ptxas does not like the symbol.

* * *

Tested (manually + running the OvO and sollve-testsuite) on x86-64-gnu-linux 
with nvptx
offloading and with "make check -k" on x86-64-gnu-linux, albeit without 
offloading configured.
The installed-build regtesting of "make check-target-libgomp" seems to be 
currently broken
as it does run all checking code (check_effective_target...) but does not seem 
to find
any actual testcase to be run, probably a side effect of the recent testsuite 
changes.

OK for mainline and GCC 13?

Tobias
-
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 
München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas 
Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht 
München, HRB 106955
LTO: Fix writing of toplevel asm with offloading [PR109816]

When offloading was enabled, top-level 'asm' were added to the offloading section,
confusing assemblers which did not support the syntax. Additionally, with offloading
and -flto, the top-level assembler code did not end up in the host files.

As r14-321-g9a41d2cdbcd added top-level 'asm' to some libstdc++ header files, the issue became
more apparent, causing fails with nvptx for C++ testcases.

	PR libstdc++/109816

gcc/ChangeLog:
	* lto-cgraph.cc (output_symtab): Guard lto_output_toplevel_asms by
	'!lto_stream_offload_p'.

libgomp/ChangeLog:

	* testsuite/libgomp.c++/target-map-class-1.C: New test.
	* testsuite/libgomp.c++/target-map-class-2.C: New test.

 gcc/lto-cgraph.cc  |  2 +-
 libgomp/testsuite/libgomp.c++/target-map-class-1.C | 98 ++
 libgomp/testsuite/libgomp.c++/target-map-class-2.C |  6 ++
 3 files changed, 105 insertions(+), 1 deletion(-)

diff --git a/gcc/lto-cgraph.cc b/gcc/lto-cgraph.cc
index 805c785..aed5e9d 100644
--- a/gcc/lto-cgraph.cc
+++ b/gcc/lto-cgraph.cc
@@ -1020,7 +1020,7 @@ output_symtab (void)
  When doing WPA we must output every asm just once.  Since we do not partition asm
  nodes at all, output them to first output.  This is kind of hack, but should work
  well.  */
-  if (!asm_nodes_output)
+  if (!asm_nodes_output && !lto_stream_offload_p)
 {
   asm_nodes_output = true;
   lto_output_toplevel_asms ();
diff --git a/libgomp/testsuite/libgomp.c++/target-map-class-1.C b/libgomp/testsuite/libgomp.c++/target-map-class-1.C
new file mode 100644
index 000..ad4802d
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/target-map-class-1.C
@@ -0,0 +1,98 @@
+/* PR middle-end/109816  */
+
+/* This variant: without -flto, see target-map-class-2.C for -flto. */
+
+/* iostream.h adds 'globl _ZSt21ios_base_library_initv' with _GLIBCXX_SYMVER_GNU,
+   but it shouldn't end up in the offload assembly but only in the host assembly. */
+
+/* Example based on sollve_vv's test_target_data_map_classes.cpp; however,
+   relevant is only the 'include' and not the actual executable code.  */
+
+#include 
+#include 
+
+using namespace std;
+
+#define N 1000
+
+struct A
+{
+  int *h_array;
+  int size, sum;
+
+  A (int *array, const int s) : h_array(array), size(s), sum(0) { }
+  ~A() { h_array = NULL; }
+};
+
+void
+test_map_tofrom_class_heap ()
+{
+  int *array = new int[N];
+  A *obj = new A (array, N);
+
+  #pragma omp target map(from: array[:N]) map(tofrom: obj[:1])
+{
+  int *tmp_h_array = obj->h_array;
+  obj->h_array = array;
+  int tmp = 0;
+  for (int i = 0; i < N; ++i)
+	{
+	  obj->h_array[i] = 4*i;
+	  tmp += 3;
+	}
+  obj->h_array = tmp_h_array;
+  obj->sum = tmp;
+}
+
+  for (int i = 0; i < N; ++i)
+if (obj->h_array[i] != 4*i)
+  __builtin_abort ();
+
+  if (3*N != obj->sum)
+{
+  std::cout << "sum: " << obj->sum << std::endl;
+  __builtin_abort ();
+}
+
+  delete obj;
+  delete[] array;
+}
+
+void
+test_map_tofrom_class_stack ()
+{
+  int array[N];
+  A obj(array, N);
+
+  #pragma omp target map(from: array[:N]) map(tofrom: obj)
+{
+  int *tmp_h_array = obj.h_array;
+ 

[PATCH] tree-optimization/64731 - extend store-from CTOR lowering to TARGET_MEM_REF

2023-05-12 Thread Richard Biener via Gcc-patches
The following also covers TARGET_MEM_REF when decomposing stores from
CTORs to supported elementwise operations.  This avoids spilling
and cleans up after vector lowering which doesn't touch loads or
stores.  It also mimics what we already do for loads.

Bootstrapped and tested on x86_64-unknown-linux-gnu, pushed.

PR tree-optimization/64731
* tree-ssa-forwprop.cc (pass_forwprop::execute): Also
handle TARGET_MEM_REF destinations of stores from vector
CTORs.

* gcc.target/i386/pr64731.c: New testcase.
---
 gcc/testsuite/gcc.target/i386/pr64731.c | 14 +
 gcc/tree-ssa-forwprop.cc| 41 +++--
 2 files changed, 38 insertions(+), 17 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr64731.c

diff --git a/gcc/testsuite/gcc.target/i386/pr64731.c 
b/gcc/testsuite/gcc.target/i386/pr64731.c
new file mode 100644
index 000..dea5141ad24
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr64731.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx" } */
+
+typedef double double4 __attribute__((vector_size(32)));
+
+void fun(double * a, double * b)
+{
+  for (int i = 0; i < 1024; i+=4)
+*(double4*)&a[i] += *(double4 *)&b[i];
+}
+
+/* We don't want to spill but have both loads and stores lowered
+   to supported SSE operations.  */
+/* { dg-final { scan-assembler-not "movap\[sd\].*\[er\]sp" } } */
diff --git a/gcc/tree-ssa-forwprop.cc b/gcc/tree-ssa-forwprop.cc
index 9dc67b5309c..e63d2ab82c9 100644
--- a/gcc/tree-ssa-forwprop.cc
+++ b/gcc/tree-ssa-forwprop.cc
@@ -3236,6 +3236,26 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
   return true;
 }
 
+/* Prepare a TARGET_MEM_REF ref so that it can be subsetted as
+   lvalue.  This splits out an address computation stmt before *GSI
+   and returns a MEM_REF wrapping the address.  */
+
+static tree
+prepare_target_mem_ref_lvalue (tree ref, gimple_stmt_iterator *gsi)
+{
+  if (TREE_CODE (TREE_OPERAND (ref, 0)) == ADDR_EXPR)
+mark_addressable (TREE_OPERAND (TREE_OPERAND (ref, 0), 0));
+  tree ptrtype = build_pointer_type (TREE_TYPE (ref));
+  tree tem = make_ssa_name (ptrtype);
+  gimple *new_stmt
+= gimple_build_assign (tem, build1 (ADDR_EXPR, TREE_TYPE (tem),
+   unshare_expr (ref)));
+  gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
+  ref = build2_loc (EXPR_LOCATION (ref),
+   MEM_REF, TREE_TYPE (ref), tem,
+   build_int_cst (TREE_TYPE (TREE_OPERAND (ref, 1)), 0));
+  return ref;
+}
 
 /* Rewrite the vector load at *GSI to component-wise loads if the load
is only used in BIT_FIELD_REF extractions with eventual intermediate
@@ -3317,20 +3337,7 @@ optimize_vector_load (gimple_stmt_iterator *gsi)
  For TARGET_MEM_REFs we have to separate the LEA from the reference.  */
   tree load_rhs = rhs;
   if (TREE_CODE (load_rhs) == TARGET_MEM_REF)
-{
-  if (TREE_CODE (TREE_OPERAND (load_rhs, 0)) == ADDR_EXPR)
-   mark_addressable (TREE_OPERAND (TREE_OPERAND (load_rhs, 0), 0));
-  tree ptrtype = build_pointer_type (TREE_TYPE (load_rhs));
-  tree tem = make_ssa_name (ptrtype);
-  gimple *new_stmt
-   = gimple_build_assign (tem, build1 (ADDR_EXPR, TREE_TYPE (tem),
-   unshare_expr (load_rhs)));
-  gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
-  load_rhs = build2_loc (EXPR_LOCATION (load_rhs),
-MEM_REF, TREE_TYPE (load_rhs), tem,
-build_int_cst
-  (TREE_TYPE (TREE_OPERAND (load_rhs, 1)), 0));
-}
+load_rhs = prepare_target_mem_ref_lvalue (load_rhs, gsi);
 
   /* Rewrite the BIT_FIELD_REFs to be actual loads, re-emitting them at
  the place of the original load.  */
@@ -3823,9 +3830,7 @@ pass_forwprop::execute (function *fun)
  && gimple_store_p (use_stmt)
  && !gimple_has_volatile_ops (use_stmt)
  && !stmt_can_throw_internal (fun, use_stmt)
- && is_gimple_assign (use_stmt)
- && (TREE_CODE (gimple_assign_lhs (use_stmt))
- != TARGET_MEM_REF))
+ && is_gimple_assign (use_stmt))
{
  tree elt_t = TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value);
  unsigned HOST_WIDE_INT elt_w
@@ -3835,6 +3840,8 @@ pass_forwprop::execute (function *fun)
  tree use_lhs = gimple_assign_lhs (use_stmt);
  if (auto_var_p (use_lhs))
DECL_NOT_GIMPLE_REG_P (use_lhs) = 1;
+ else if (TREE_CODE (use_lhs) == TARGET_MEM_REF)
+   use_lhs = prepare_target_mem_ref_lvalue (use_lhs, &gsi);
  for (unsigned HOST_WIDE_INT bi = 0; bi < n; bi += elt_w)
{
  unsigned HOST_WIDE_INT ci = bi / elt_w;
-- 
2.35.3


Re: [PATCH] RISC-V: Using merge approach to optimize repeating sequence in vec_init

2023-05-12 Thread Robin Dapp via Gcc-patches
Hi,

in general LGTM, just minor nits and comments.

> -  void set_len_and_policy (rtx len, bool force_vlmax = false)
> -{
> -  bool vlmax_p = force_vlmax;
> -  gcc_assert (has_dest);
> +  void set_len_and_policy (rtx len, bool force_vlmax = false, bool ta_p = 
> true,
> +bool ma_p = true)
> +  {
> +bool vlmax_p = force_vlmax;
> +gcc_assert (has_dest);

Indentation?

>  m_inner_mode = GET_MODE_INNER (mode);
> -m_inner_size = GET_MODE_BITSIZE (m_inner_mode).to_constant ();
> +m_inner_size = GET_MODE_BITSIZE (m_inner_mode);
> +m_inner_units = GET_MODE_SIZE (m_inner_mode);

I find it a bit misleading to call this units here.  Granted it's an inner
mode (i.e. referring to "bytes") but in the context of vector modes I'm likely
to think of a vector "unit" or lane.  What about m_inner_size_bytes or
m_inner_size_units?

> +bool
> +rvv_builder::repeating_sequence_use_merge_profitable_p ()
> +{
> +  return repeating_sequence_p (0, full_nelts ().to_constant (), npatterns ())
> +  && inner_units () <= UNITS_PER_WORD
> +  && 3 * npatterns () < full_nelts ().to_constant ();
> +}

Appreciate the explanatory comment and number of instructions is good for
now.  In the future and given the different uarchs we will want a proper
costing comparison.

> +/* Get the mask for merge approach.
> +
> + Consider such following case:
> +   {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b}
> + To merge "a", the mask should be 1010
> + To merge "a", the mask should be 0101
> +*/

Second line should be "b".

> +/* Emit merge instruction.  */
> +
> +static void
> +emit_merge_op (rtx dest, rtx src1, rtx src2, rtx mask)
> +{
> +  insn_expander<8> e;
> +  machine_mode mode = GET_MODE (dest);
> +  e.set_dest_and_mask (NULL_RTX, dest, GET_MODE (mask), true, true);
> +  e.add_input_operand (src1, mode);
> +  if (VECTOR_MODE_P (GET_MODE (src2)))
> +e.add_input_operand (src2, mode);
> +  else
> +e.add_input_operand (src2, GET_MODE_INNER (mode));
> +
> +  e.add_input_operand (mask, GET_MODE (mask));
> +  e.set_len_and_policy (NULL_RTX, true, true, false);
> +  if (VECTOR_MODE_P (GET_MODE (src2)))
> +e.expand (code_for_pred_merge (mode), false);
> +  else
> +e.expand (code_for_pred_merge_scalar (mode), false);
> +}

Looks a lot like binop.  Might need another round of wrappers
soon :)

Regards
 Robin


Re: [Patch] LTO: Fix writing of toplevel asm with offloading [PR109816]

2023-05-12 Thread Richard Biener via Gcc-patches
On Fri, 12 May 2023, Tobias Burnus wrote:

> Long standing issue but as top-level 'asm' statement were rare, it did not
> show up.
> However, the fix for PR108969 in commit r14-321-g9a41d2cdbcd added code
> 
> +#elif defined(_GLIBCXX_SYMVER_GNU)
> +  __extension__ __asm (".globl _ZSt21ios_base_library_initv");
> to
> libstdc++-v3/include/std/iostream. This was then duly written by the
> offloading-device
> lto1 for digestion by the device-target assembler. While the llvm-mc linker
> used by
> GCN did accept .globl, nvptx's ptxas did choke on it.
> 
> Additionally, as the assembly was already written for offloading, the output
> was
> lost on the host when using LTO not only for offload but for real (i.e.
> with -flto).
> 
> Has someone an idea how to check whether the offloading-code assembler does
> not
> contain the _ZSt21ios_base_library_initv while the host-side (before or after
> LTO)
> should contain it, but only with _GLIBCXX_SYMVER_GNU?
> Otherwise, the testcase tests only and at least whether it breaks with nvptx
> as ptxas does not like the symbol.
> 
> * * *
> 
> Tested (manually + running the OvO and sollve-testsuite) on x86-64-gnu-linux
> with nvptx
> offloading and with "make check -k" on x86-64-gnu-linux, albeit without
> offloading configured.
> The installed-build regtesting of "make check-target-libgomp" seems to be
> currently broken
> as it does run all checking code (check_effective_target...) but does not seem
> to find
> any actual testcase to be run, probably a side effect of the recent testsuite
> changes.
> 
> OK for mainline and GCC 13?

Without any idea about the details above the patch looks quite reasonable,
thus OK for trunk.  OK for branch after a while if no issues showed up.

Thanks,
Richard.


Re: [PATCH 1/3] Refactor to allow internal_fn's

2023-05-12 Thread Richard Biener via Gcc-patches
On Fri, 12 May 2023, Andre Vieira (lists) wrote:

> Hi,
> 
> I think I tackled all of your comments, let me know if I missed something.

This first and the last patch look good to me now.  Let me comment on the
second.

Thanks,
Richard.

> 
> gcc/ChangeLog:
> 
> 2023-05-12  Andre Vieira  
> Joel Hutton  
> 
> * tree-vect-patterns.cc (vect_gimple_build): New Function.
> (vect_recog_widen_op_pattern): Refactor to use code_helper.
> * tree-vect-stmts.cc (vect_gen_widened_results_half): Likewise.
> (vect_create_vectorized_demotion_stmts): Likewise.
> (vect_create_vectorized_promotion_stmts): Likewise.
> (vect_create_half_widening_stmts): Likewise.
> (vectorizable_conversion): Likewise.
> (vectorizable_call): Likewise.
> (supportable_widening_operation): Likewise.
> (supportable_narrowing_operation): Likewise.
> (simple_integer_narrowing): Likewise.
> * tree-vectorizer.h (supportable_widening_operation): Likewise.
> (supportable_narrowing_operation): Likewise.
> (vect_gimple_build): New function prototype.
> * tree.h (code_helper::safe_as_tree_code): New function.
> (code_helper::safe_as_fn_code): New function.
> 

-- 
Richard Biener 
SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg,
Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald, Boudien Moerman;
HRB 36809 (AG Nuernberg)


Re: [PATCH V6] VECT: Add decrement IV support in Loop Vectorizer

2023-05-12 Thread Richard Sandiford via Gcc-patches
"juzhe.zhong"  writes:
> Hi, Richard.  For "can iterate more than once", is it correct to use the
> condition "LOOP_LENS ().length > 1"?

No, that says whether any LOAD_LENs or STORE_LENs operate on multiple
vectors, rather than just single vectors.

I meant: whether the vector loop body might be executed more than once
(i.e. whether the branch-back condition can be true).

This is true for a scalar loop that goes from 0 to some unbounded
variable n.  It's false for a scalar loop that goes from 0 to 6,
if the vectors are known to have at least 8 elements.

Thanks,
Richard

>  Replied Message 
>
> From  Richard Sandiford
>
> Date  05/12/2023 19:39
>
> To  juzhe.zhong
>
> Cc  gcc-patches@gcc.gnu.org,
>   kito.ch...@gmail.com,
>   pal...@dabbelt.com,
>   richard.guent...@gmail.com
>
> Subject   Re: [PATCH V6] VECT: Add decrement IV support in Loop Vectorizer
>
> "juzhe.zhong"  writes:
>> Thanks Richard.
>>  I will do that as you suggested. I have a question for the first patch. How
> to
>> enable decrement IV? Should I add a target hook or something to let target
>> decide whether enable decrement IV?
>
> At the moment, the only other targets that use IFN_LOAD_LEN and
> IFN_STORE_LEN are PowerPC and s390.  Both targets default to
> --param vect-partial-vector-usage=1 (i.e. use partial vectors
> for epilogues only).
>
> So I think the condition should be that the loop:
>
>  (a) uses length "controls"; and
>  (b) can iterate more than once
>
> No target checks should be needed.
>
> Thanks,
> Richard
>
>>  Replied Message 
>>
>> From  Richard Sandiford
>>
>> Date  05/12/2023 19:08
>>
>> To  juzhe.zh...@rivai.ai
>>
>> Cc  gcc-patches@gcc.gnu.org,
>>   kito.ch...@gmail.com,
>>   pal...@dabbelt.com,
>>   richard.guent...@gmail.com
>>
>> Subject   Re: [PATCH V6] VECT: Add decrement IV support in Loop Vectorizer
>>
>> juzhe.zh...@rivai.ai writes:
>>> From: Ju-Zhe Zhong 
>>>
>>> 1. Fix document description according Jeff && Richard.
>>> 2. Add LOOP_VINFO_USING_SELECT_VL_P for single rgroup.
>>> 3. Add LOOP_VINFO_USING_SLP_ADJUSTED_LEN_P for SLP multiple rgroup.
>>>
>>> Fix bugs for V5 after testing:
>>> https://gcc.gnu.org/pipermail/gcc-patches/2023-May/618209.html
>>>
>>> gcc/ChangeLog:
>>>
>>> * doc/md.texi: Add select_vl pattern.
>>> * internal-fn.def (SELECT_VL): New ifn.
>>> * optabs.def (OPTAB_D): New optab.
>>> * tree-vect-loop-manip.cc (vect_adjust_loop_lens): New function.
>>> (vect_set_loop_controls_by_select_vl): Ditto.
>>> (vect_set_loop_condition_partial_vectors): Add loop control for
>> decrement IV.
>>> * tree-vect-loop.cc (vect_get_loop_len): Adjust loop len for SLP.
>>> * tree-vect-stmts.cc (get_select_vl_data_ref_ptr): New function.
>>> (vectorizable_store): Support data reference IV added by outcome of
>> SELECT_VL.
>>> (vectorizable_load): Ditto.
>>> * tree-vectorizer.h (LOOP_VINFO_USING_SELECT_VL_P): New macro.
>>> (LOOP_VINFO_USING_SLP_ADJUSTED_LEN_P): Ditto.
>>> (vect_get_loop_len): Adjust loop len for SLP.
>>>
>>> ---
>>>  gcc/doc/md.texi |  36 
>>>  gcc/internal-fn.def |   1 +
>>>  gcc/optabs.def  |   1 +
>>>  gcc/tree-vect-loop-manip.cc | 380 +++-
>>>  gcc/tree-vect-loop.cc   |  31 ++-
>>>  gcc/tree-vect-stmts.cc  |  79 +++-
>>>  gcc/tree-vectorizer.h   |  12 +-
>>>  7 files changed, 526 insertions(+), 14 deletions(-)
>>>
>>> diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
>>> index 8ebce31ba78..a94ffc4456d 100644
>>> --- a/gcc/doc/md.texi
>>> +++ b/gcc/doc/md.texi
>>> @@ -4974,6 +4974,42 @@ for (i = 1; i < operand3; i++)
>>>operand0[i] = operand0[i - 1] && (operand1 + i < operand2);
>>>  @end smallexample
>>>  
>>> +@cindex @code{select_vl@var{m}} instruction pattern
>>> +@item @code{select_vl@var{m}}
>>> +Set operand 0 to the number of active elements in a vector to be updated
>>> +in a loop iteration based on the total number of elements to be updated,
>>> +the vectorization factor and vector properties of the target.
>>> +operand 1 is the total elements in the vector to be updated.
>>> +operand 2 is the vectorization factor.
>>> +The value of operand 0 is target dependent and flexible in each iteration.
>>> +The operation of this pattern can be:
>>> +
>>> +@smallexample
>>> +Case 1:
>>> +operand0 = MIN (operand1, operand2);
>>> +operand2 can be const_poly_int or poly_int related to vector mode size.
>>> +Some target like RISC-V has a standalone instruction to get MIN (n, MODE
>> SIZE) so
>>> +that we can reduce a use of general purpose register.
>>> +
>>> +In this case, only the last iteration of the loop is partial iteration.
>>> +@end smallexample
>>> +
>>> +@smallexample
>>> +Case 2:
>>> +if (operand1 <= operand2)
>>> +  operand0 = operand1;
>>> +else if (operand1 < 2 * operand2)
>>> +  opera

Re: [PATCH 2/3] Refactor widen_plus as internal_fn

2023-05-12 Thread Richard Biener via Gcc-patches
On Fri, 12 May 2023, Andre Vieira (lists) wrote:

> I have dealt with, I think..., most of your comments. There's quite a few
> changes, I think it's all a bit simpler now. I made some other changes to the
> costing in tree-inline.cc and gimple-range-op.cc in which I try to preserve
> the same behaviour as we had with the tree codes before. Also added some extra
> checks to tree-cfg.cc that made sense to me.
> 
> I am still regression testing the gimple-range-op change, as that was a last
> minute change, but the rest survived a bootstrap and regression test on
> aarch64-unknown-linux-gnu.
> 
> cover letter:
> 
> This patch replaces the existing tree_code widen_plus and widen_minus
> patterns with internal_fn versions.
> 
> DEF_INTERNAL_OPTAB_WIDENING_HILO_FN and DEF_INTERNAL_OPTAB_NARROWING_HILO_FN
> are like DEF_INTERNAL_SIGNED_OPTAB_FN and DEF_INTERNAL_OPTAB_FN respectively
> except they provide convenience wrappers for defining conversions that require
> a hi/lo split.  Each definition for  will require optabs for _hi and _lo
> and each of those will also require a signed and unsigned version in the case
> of widening. The hi/lo pair is necessary because the widening and narrowing
> operations take n narrow elements as inputs and return n/2 wide elements as
> outputs. The 'lo' operation operates on the first n/2 elements of input. The
> 'hi' operation operates on the second n/2 elements of input. Defining an
> internal_fn along with hi/lo variations allows a single internal function to
> be returned from a vect_recog function that will later be expanded to hi/lo.
> 
> 
>  For example:
>  IFN_VEC_WIDEN_PLUS -> IFN_VEC_WIDEN_PLUS_HI, IFN_VEC_WIDEN_PLUS_LO
> for aarch64: IFN_VEC_WIDEN_PLUS_HI   -> vec_widen_add_hi_ ->
> (u/s)addl2
>IFN_VEC_WIDEN_PLUS_LO  -> vec_widen_add_lo_
> -> (u/s)addl
> 
> This gives the same functionality as the previous WIDEN_PLUS/WIDEN_MINUS tree
> codes which are expanded into VEC_WIDEN_PLUS_LO, VEC_WIDEN_PLUS_HI.

What I still don't understand is how we are so narrowly focused on
HI/LO?  We need a combined scalar IFN for pattern selection (not
sure why that's now called _HILO, I expected no suffix).  Then there are
three ways the target can implement this:

 1) with a widen_[su]add instruction - I _think_ that's what
RISCV is going to offer since it is a target where vector modes
have "padding" (aka you cannot subreg a V2SI to get V4HI).  Instead
RVV can do a V4HI to V4SI widening and widening add/subtract
using vwadd[u] and vwsub[u] (the HI->SI widening is actually
done with a widening add of zero - eh).
IIRC GCN is the same here.
 2) with a widen_[su]add{_lo,_hi} combo - that's what the tree
codes currently support (exclusively)
 3) similar, but widen_[su]add{_even,_odd}

that said, things like decomposes_to_hilo_fn_p look to paint us into
a 2) corner without good reason.

Richard.

> gcc/ChangeLog:
> 
> 2023-05-12  Andre Vieira  
> Joel Hutton  
> Tamar Christina  
> 
> * config/aarch64/aarch64-simd.md (vec_widen_addl_lo_):
> Rename
> this ...
> (vec_widen_add_lo_): ... to this.
> (vec_widen_addl_hi_): Rename this ...
> (vec_widen_add_hi_): ... to this.
> (vec_widen_subl_lo_): Rename this ...
> (vec_widen_sub_lo_): ... to this.
> (vec_widen_subl_hi_): Rename this ...
> (vec_widen_sub_hi_): ...to this.
> * doc/generic.texi: Document new IFN codes.
>   * internal-fn.cc (DEF_INTERNAL_OPTAB_WIDENING_HILO_FN): Macro to
>   define an
> internal_fn that expands into multiple internal_fns for widening.
> (DEF_INTERNAL_OPTAB_NARROWING_HILO_FN): Likewise but for narrowing.
>   (ifn_cmp): Function to compare ifn's for sorting/searching.
>   (lookup_hilo_internal_fn): Add lookup function.
>   (commutative_binary_fn_p): Add widen_plus fn's.
>   (widening_fn_p): New function.
>   (narrowing_fn_p): New function.
>   (decomposes_to_hilo_fn_p): New function.
>(direct_internal_fn_optab): Change visibility.
>   * internal-fn.def (DEF_INTERNAL_OPTAB_WIDENING_HILO_FN): Define
> widening
> plus,minus functions.
>   (VEC_WIDEN_PLUS): Replacement for VEC_WIDEN_PLUS_EXPR tree code.
>   (VEC_WIDEN_MINUS): Replacement for VEC_WIDEN_MINUS_EXPR tree code.
>   * internal-fn.h (GCC_INTERNAL_FN_H): Add headers.
>(direct_internal_fn_optab): Declare new prototype.
>   (lookup_hilo_internal_fn): Likewise.
>   (widening_fn_p): Likewise.
>   (Narrowing_fn_p): Likewise.
>   (decomposes_to_hilo_fn_p): Likewise.
>   * optabs.cc (commutative_optab_p): Add widening plus optabs.
>   * optabs.def (OPTAB_D): Define widen add, sub optabs.
> * tree-cfg.cc (verify_gimple_call): Add checks for new widen
> add and sub IFNs.
> * tree-inline.cc (estimate_num_insns): Return same
> cost for widen add and sub IFN

[committed v4] RISC-V: Optimize vsetvli of LCM INSERTED edge for user vsetvli [PR 109743]

2023-05-12 Thread Kito Cheng via Gcc-patches
V4 Changes:
- Use a different approach from V3.
- Rewrite local_eliminate_vsetvl_insn to unify the optimization
flow.
- Also improve a few more cases.

Discussed off-list and reviewed by Ju-Zhe.

---

This issue happens because we are currently very conservative in
optimizing user vsetvli.

Consider this following case:

bb 1:
  vsetvli a5,a4... (demand AVL = a4).
bb 2:
  RVV insn use a5 (demand AVL = a5).

LCM will hoist vsetvl of bb 2 into bb 1.
We don't do AVL propagation for this situation since it is complicated: we
would have to analyze the code sequence between the vsetvli in bb 1 and the
RVV insn in bb 2, and they are not necessarily consecutive blocks.

This patch does the optimization after LCM: we check and eliminate the
vsetvli on the LCM-inserted edge if such a vsetvli is redundant.  This
approach is much simpler and safer.

code:
void
foo2 (int32_t *a, int32_t *b, int n)
{
  if (n <= 0)
  return;
  int i = n;
  size_t vl = __riscv_vsetvl_e32m1 (i);

  for (; i >= 0; i--)
  {
vint32m1_t v = __riscv_vle32_v_i32m1 (a, vl);
__riscv_vse32_v_i32m1 (b, v, vl);

if (i >= vl)
  continue;

if (i == 0)
  return;

vl = __riscv_vsetvl_e32m1 (i);
  }
}

Before this patch:
foo2:
.LFB2:
.cfi_startproc
ble a2,zero,.L1
mv  a4,a2
li  a3,-1
vsetvli a5,a2,e32,m1,ta,mu
vsetvli zero,a5,e32,m1,ta,ma  <- can be eliminated.
.L5:
vle32.v v1,0(a0)
vse32.v v1,0(a1)
        bgeu    a4,a5,.L3
.L10:
beq a2,zero,.L1
vsetvli a5,a4,e32,m1,ta,mu
        addi    a4,a4,-1
vsetvli zero,a5,e32,m1,ta,ma  <- can be eliminated.
vle32.v v1,0(a0)
vse32.v v1,0(a1)
addiw   a2,a2,-1
        bltu    a4,a5,.L10
.L3:
        addiw   a2,a2,-1
        addi    a4,a4,-1
bne a2,a3,.L5
.L1:
ret

After this patch:
f:
ble a2,zero,.L1
mv  a4,a2
li  a3,-1
vsetvli a5,a2,e32,m1,ta,ma
.L5:
vle32.v v1,0(a0)
vse32.v v1,0(a1)
        bgeu    a4,a5,.L3
.L10:
beq a2,zero,.L1
vsetvli a5,a4,e32,m1,ta,ma
        addi    a4,a4,-1
vle32.v v1,0(a0)
vse32.v v1,0(a1)
addiw   a2,a2,-1
        bltu    a4,a5,.L10
.L3:
        addiw   a2,a2,-1
        addi    a4,a4,-1
bne a2,a3,.L5
.L1:
ret

PR target/109743

gcc/ChangeLog:

* config/riscv/riscv-vsetvl.cc (pass_vsetvl::get_vsetvl_at_end): New.
(local_avl_compatible_p): New.
(pass_vsetvl::local_eliminate_vsetvl_insn): Enhance local optimizations
for LCM, rewrite as a backward algorithm.
(pass_vsetvl::cleanup_insns): Use new local_eliminate_vsetvl_insn
interface, handle a BB at once.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/vsetvl/pr109743-1.c: New test.
* gcc.target/riscv/rvv/vsetvl/pr109743-2.c: New test.
* gcc.target/riscv/rvv/vsetvl/pr109743-3.c: New test.
* gcc.target/riscv/rvv/vsetvl/pr109743-4.c: New test.

Co-authored-by: Juzhe-Zhong 

---
 gcc/config/riscv/riscv-vsetvl.cc  | 213 ++
 .../gcc.target/riscv/rvv/vsetvl/pr109743-1.c  |  26 +++
 .../gcc.target/riscv/rvv/vsetvl/pr109743-2.c  |  27 +++
 .../gcc.target/riscv/rvv/vsetvl/pr109743-3.c  |  28 +++
 .../gcc.target/riscv/rvv/vsetvl/pr109743-4.c  |  28 +++
 5 files changed, 277 insertions(+), 45 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr109743-1.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr109743-2.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr109743-3.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr109743-4.c

diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc
index 20d43372d332..f1c47e8f9be3 100644
--- a/gcc/config/riscv/riscv-vsetvl.cc
+++ b/gcc/config/riscv/riscv-vsetvl.cc
@@ -2667,7 +2667,8 @@ private:
   void pre_vsetvl (void);
 
   /* Phase 5.  */
-  void local_eliminate_vsetvl_insn (const vector_insn_info &) const;
+  rtx_insn *get_vsetvl_at_end (const bb_info *, vector_insn_info *) const;
+  void local_eliminate_vsetvl_insn (const bb_info *) const;
   void cleanup_insns (void) const;
 
   /* Phase 6.  */
@@ -4029,6 +4030,60 @@ pass_vsetvl::pre_vsetvl (void)
 commit_edge_insertions ();
 }
 
+/* Some instruction can not be accessed in RTL_SSA when we don't re-init
+   the new RTL_SSA framework but it is definetely at the END of the block.
+
+  Here we optimize the VSETVL is hoisted by LCM:
+
+   Before LCM:
+ bb 1:
+   vsetvli a5,a2,e32,m1,ta,mu
+ bb 2:
+   vsetvli zero,a5,e32,m1,ta,mu
+   ...
+
+   After LCM:
+ bb 1:
+   vsetvli a5,a2,e32,m1,ta,mu
+   LCM INSERTED: vsetvli zero,a5,e32,m1,ta,mu --> eliminate
+ bb 2:
+   ...
+   */
+rtx_insn *
+pass_vsetvl::get_vsetvl_at_end (const bb_info *bb, vector_insn_info *dem) const
+{
+  rtx_insn *end_vsetvl

[PATCH] RISC-V: Improve vector_insn_info::dump for LMUL and policy

2023-05-12 Thread Kito Cheng via Gcc-patches
Convert vlmul and policy to human-readable strings; some examples below:

Before:
[VALID,Demand 
field={1(VL),0(DEMAND_NONZERO_AVL),1(SEW),0(DEMAND_GE_SEW),1(LMUL),0(RATIO),0(TAIL_POLICY),0(MASK_POLICY)}
AVL=(reg:DI 0 zero)
SEW=16,VLMUL=3,RATIO=2,TAIL_POLICY=1,MASK_POLICY=1]
 ^ ^ ^

After:
[VALID,Demand 
field={1(VL),0(DEMAND_NONZERO_AVL),1(SEW),0(DEMAND_GE_SEW),1(LMUL),0(RATIO),0(TAIL_POLICY),0(MASK_POLICY)}
AVL=(reg:DI 0 zero)
SEW=16,VLMUL=m8,RATIO=2,TAIL_POLICY=agnostic,MASK_POLICY=agnostic]
 ^^  

gcc/ChangeLog:

* config/riscv/riscv-vsetvl.cc (vlmul_to_str): New.
(policy_to_str): New.
(vector_insn_info::dump): Use vlmul_to_str and policy_to_str.
---
 gcc/config/riscv/riscv-vsetvl.cc | 39 +---
 1 file changed, 36 insertions(+), 3 deletions(-)

diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc
index f1c47e8f9be3..5c4b349dd9db 100644
--- a/gcc/config/riscv/riscv-vsetvl.cc
+++ b/gcc/config/riscv/riscv-vsetvl.cc
@@ -2241,6 +2241,39 @@ vector_insn_info::update_fault_first_load_avl (insn_info 
*insn)
   return false;
 }
 
+static const char *
+vlmul_to_str (vlmul_type vlmul)
+{
+  switch (vlmul)
+{
+case LMUL_1:
+  return "m1";
+case LMUL_2:
+  return "m2";
+case LMUL_4:
+  return "m4";
+case LMUL_8:
+  return "m8";
+case LMUL_RESERVED:
+  return "INVALID LMUL";
+case LMUL_F8:
+  return "mf8";
+case LMUL_F4:
+  return "mf4";
+case LMUL_F2:
+  return "mf2";
+
+default:
+  gcc_unreachable ();
+}
+}
+
+static const char *
+policy_to_str (bool agnostic_p)
+{
+  return agnostic_p ? "agnostic" : "undisturbed";
+}
+
 void
 vector_insn_info::dump (FILE *file) const
 {
@@ -2272,10 +2305,10 @@ vector_insn_info::dump (FILE *file) const
   fprintf (file, "AVL=");
   print_rtl_single (file, get_avl ());
   fprintf (file, "SEW=%d,", get_sew ());
-  fprintf (file, "VLMUL=%d,", get_vlmul ());
+  fprintf (file, "VLMUL=%s,", vlmul_to_str (get_vlmul ()));
   fprintf (file, "RATIO=%d,", get_ratio ());
-  fprintf (file, "TAIL_POLICY=%d,", get_ta ());
-  fprintf (file, "MASK_POLICY=%d", get_ma ());
+  fprintf (file, "TAIL_POLICY=%s,", policy_to_str (get_ta ()));
+  fprintf (file, "MASK_POLICY=%s", policy_to_str (get_ma ()));
   fprintf (file, "]\n");
 
   if (valid_p ())
-- 
2.39.2



Re: [PATCH] RISC-V: Using merge approach to optimize repeating sequence in vec_init

2023-05-12 Thread Robin Dapp via Gcc-patches
> emit_merge_op can not be wrapped into binop since mask position is
> different in pattern.
> 
> I prefer merge op in different wrapper.

Yes, I didn't mean literally the same but that things already
become a bit confusing with all the different variants and bool
arguments or code duplication with slight differences.  Something
like foo (true, false, true) is a bit of a code smell.  Nothing
to address immediately, I just need to keep an eye on it.  Shouldn't
prevent the patch from going in.


Re: [PATCH] OpenMP: Constructors and destructors for "declare target" static aggregates

2023-05-12 Thread Tobias Burnus

Hi Julian, hi all,

(I have not (yet) looked at the patch/testcase in depth - and I prefer
that someone knowledgeable with C++ constructors comments. However:)

Regarding only nvptx - and the testcase on nvptx:

On 12.05.23 14:02, Julian Brown wrote:

The approach taken is to generate a set of constructors to run on the
target: this currently works for AMD GCN, but fails on NVPTX due
to lack of constructor/destructor support there so far on mainline.
(See the new test static-aggr-constructor-destructor-3.C for a reason
why running constructors on the target is preferable to e.g. constructing
on the host and then copying the resulting object to the target.)


(I think also the wording in OpenMP implies that it is constructed
on the device and not copied over from the host, albeit the wording
is not very clear.)

For nvptx constructor/destructor support, see email thread
"nvptx: Support global constructors/destructors via 'collect2' for offloading",
e.g., https://gcc.gnu.org/pipermail/gcc-patches/2023-January/609711.html
* * *

If I run the testcase manually with nvptx offloading and -O0, I get:
  libgomp/testsuite/libgomp.c++/static-aggr-constructor-destructor-3.C:8:3: 
error:
alias definitions not supported in this configuration

and with -O1 and higher:
  libgomp/testsuite/libgomp.c++/static-aggr-constructor-destructor-3.C:36:1:
 sorry, unimplemented: global constructors not supported on this target

As DEFAULT_CFLAGS is -O2 checking for the proper error works, i.e. something
like (untested and ... to be filled in):

  /* { dg-message "sorry, ..." "..." { target { offload_target_nvptx } } } */
  /* { dg-excess-errors "Follow-up errors from mkoffload and lto-wrapper" { target { offload_target_nvptx } } } */


Tobias

-
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 
München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas 
Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht 
München, HRB 106955


[committed] libstdc++: Remove test dependencies on _GLIBCXX_USE_C99_STDINT_TR1

2023-05-12 Thread Jonathan Wakely via Gcc-patches
Tested x86_64-linux. Pushed to trunk.

-- >8 --

These #ifdef checks should have been removed in r9-2029-g612c9c702e2c9e
when the u16string_view and u32string_view aliases were changed to be
defined unconditionally.

libstdc++-v3/ChangeLog:

* testsuite/21_strings/basic_string_view/typedefs.cc: Remove
dependency on _GLIBCXX_USE_C99_STDINT_TR1.
* testsuite/experimental/string_view/typedefs.cc: Likewise.
---
 libstdc++-v3/testsuite/21_strings/basic_string_view/typedefs.cc | 2 --
 libstdc++-v3/testsuite/experimental/string_view/typedefs.cc | 2 --
 2 files changed, 4 deletions(-)

diff --git a/libstdc++-v3/testsuite/21_strings/basic_string_view/typedefs.cc 
b/libstdc++-v3/testsuite/21_strings/basic_string_view/typedefs.cc
index b139e51e107..1beb2e1c54d 100644
--- a/libstdc++-v3/testsuite/21_strings/basic_string_view/typedefs.cc
+++ b/libstdc++-v3/testsuite/21_strings/basic_string_view/typedefs.cc
@@ -28,9 +28,7 @@ using check2_t = std::string_view;
 using check3_t = std::u8string_view;
 #endif
 
-#ifdef _GLIBCXX_USE_C99_STDINT_TR1
 using check4_t = std::u16string_view;
 using check5_t = std::u32string_view;
-#endif
 
 using check6_t = std::wstring_view;
diff --git a/libstdc++-v3/testsuite/experimental/string_view/typedefs.cc 
b/libstdc++-v3/testsuite/experimental/string_view/typedefs.cc
index 60c4e12aa4f..a09ed8f8a22 100644
--- a/libstdc++-v3/testsuite/experimental/string_view/typedefs.cc
+++ b/libstdc++-v3/testsuite/experimental/string_view/typedefs.cc
@@ -28,9 +28,7 @@ using check2_t = 
std::experimental::fundamentals_v1::string_view;
 using check3_t = std::experimental::fundamentals_v1::u8string_view;
 #endif
 
-#ifdef _GLIBCXX_USE_C99_STDINT_TR1
 using check4_t = std::experimental::fundamentals_v1::u16string_view;
 using check5_t = std::experimental::fundamentals_v1::u32string_view;
-#endif
 
 using check6_t = std::experimental::fundamentals_v1::wstring_view;
-- 
2.40.1



[committed] libstdc++: Remove test dependency on _GLIBCXX_USE_C99_STDINT_TR1

2023-05-12 Thread Jonathan Wakely via Gcc-patches
Tested x86_64-linux. Pushed to trunk.

-- >8 --

This should have been done in r9-2028-g8ba7f29e3dd064 when
std::shared_mutex was changed to be defined without depending on
_GLIBCXX_USE_C99_STDINT_TR1.

libstdc++-v3/ChangeLog:

* testsuite/experimental/feat-cxx14.cc: Remove dependency on
_GLIBCXX_USE_C99_STDINT_TR1.
---
 libstdc++-v3/testsuite/experimental/feat-cxx14.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libstdc++-v3/testsuite/experimental/feat-cxx14.cc 
b/libstdc++-v3/testsuite/experimental/feat-cxx14.cc
index 8c0061eddbf..effe382e06e 100644
--- a/libstdc++-v3/testsuite/experimental/feat-cxx14.cc
+++ b/libstdc++-v3/testsuite/experimental/feat-cxx14.cc
@@ -107,7 +107,7 @@
 #  error ""
 #endif
 
-#if defined(_GLIBCXX_HAS_GTHREADS) && defined(_GLIBCXX_USE_C99_STDINT_TR1)
+#if defined(_GLIBCXX_HAS_GTHREADS)
 #  ifndef  __cpp_lib_shared_timed_mutex
 #error "__cpp_lib_shared_timed_mutex"
 #  elif  __cpp_lib_shared_timed_mutex != 201402
-- 
2.40.1



[committed] libstdc++: Remove test dependency on _GLIBCXX_USE_C99_STDINT_TR1

2023-05-12 Thread Jonathan Wakely via Gcc-patches
Tested x86_64-linux. Pushed to trunk.

-- >8 --

This should have been removed in r9-2029-g612c9c702e2c9e when the
char16_t and char32_t specializations of std::codecvt were changed to be
defined unconditionally.

libstdc++-v3/ChangeLog:

* testsuite/22_locale/locale/cons/unicode.cc: Remove dependency
on _GLIBCXX_USE_C99_STDINT_TR1.
---
 libstdc++-v3/testsuite/22_locale/locale/cons/unicode.cc | 4 
 1 file changed, 4 deletions(-)

diff --git a/libstdc++-v3/testsuite/22_locale/locale/cons/unicode.cc 
b/libstdc++-v3/testsuite/22_locale/locale/cons/unicode.cc
index 0520b637be5..7fbbcae2aa3 100644
--- a/libstdc++-v3/testsuite/22_locale/locale/cons/unicode.cc
+++ b/libstdc++-v3/testsuite/22_locale/locale/cons/unicode.cc
@@ -33,14 +33,12 @@ typedef std::codecvt  
c_codecvt;
 typedef std::codecvt  w_codecvt;
 #endif
 
-#ifdef _GLIBCXX_USE_C99_STDINT_TR1
 typedef std::codecvt u16_codecvt;
 typedef std::codecvt u32_codecvt;
 #ifdef _GLIBCXX_USE_CHAR8_T
 typedef std::codecvt  
u16u8_codecvt;
 typedef std::codecvt  
u32u8_codecvt;
 #endif
-#endif
 
 class gnu_facet: public std::locale::facet
 {
@@ -69,13 +67,11 @@ void test01()
 #ifdef _GLIBCXX_USE_WCHAR_T
   VERIFY( has_facet(loc13) );
 #endif
-#ifdef _GLIBCXX_USE_C99_STDINT_TR1
   VERIFY( has_facet(loc13) );
   VERIFY( has_facet(loc13) );
 #ifdef _GLIBCXX_USE_CHAR8_T
   VERIFY( has_facet(loc13) );
   VERIFY( has_facet(loc13) );
-#endif
 #endif
   VERIFY( has_facet(loc13) );
 }
-- 
2.40.1



Re: [committed] Convert xstormy16 to LRA

2023-05-12 Thread Hans-Peter Nilsson via Gcc-patches
> From: Hans-Peter Nilsson 
> Date: Thu, 11 May 2023 17:05:40 +0200

> Next, I'll turn around completely, and try defaulting to
> -fsplit-wide-types-early, which sounds more promising. :)
> I don't like throwing defaults around randomly, but trying
> out a promising idea this way is easy.

Absolutely nothing changed (not counting now running
"subreg2" and generating a dump-file), compared to the
default.  Besides coremark and local micro-benchmarks I
inspected running arith-rand-ll.c with -O2 and briefly
stepped through the passes with gdb: the costs guiding the
splits are fine, properly enabling the splits, but not all
DImode registers are naturally "splittable"; looks like the
ones used in non-decomposable operations remain.  It seems
all splittable opportunities are dealt with by the first
pass ("subreg1").  I guess this pass has the most impact for
targets that have few or no DImode operations at all.

But why is the option called -fsplit-wide-types-early when
what it does is enabling a "subreg2" pass, there being
"subreg1" and "subreg3" enabled with -fsplit-wide-types?  It
should rather be called -fsplit-wide-types-second! :)

Looking at its placement in passes.def makes me wonder what
magic properties targets have that benefit from it.

Anyway, Roger mentioned that the clobbers emitted by the
lower-subreg passes were apparently damaging, so I'll try
this out "for fun", on the assumption that they're actually
unnecessary.  I don't think actually removing them has been
attempted?

The patch below seems to substantially lower register
pressure for arith-rand-ll for CRIS, but I've only inspected
the assembly source (not even compared the result to the
reload version).  Quoting it for reference only, and if it
"works" (passes regtest for cris-elf and x86-64-linux) I
think I'll resubmit as a proper patch:

--- lower-subreg.cc.orig 2023-04-29 02:53:39.0 +0200
+++ lower-subreg.cc 2023-05-12 15:35:25.574668930 +0200
@@ -1086,9 +1086,6 @@ resolve_simple_move (rtx set, rtx_insn *
 {
   unsigned int i;
 
-  if (REG_P (dest) && !HARD_REGISTER_NUM_P (REGNO (dest)))
-   emit_clobber (dest);
-
   for (i = 0; i < words; ++i)
{
  rtx t = simplify_gen_subreg_concatn (word_mode, dest,

brgds, H-P


Re: [PATCH 2/3] Refactor widen_plus as internal_fn

2023-05-12 Thread Andre Vieira (lists) via Gcc-patches




On 12/05/2023 14:28, Richard Biener wrote:

On Fri, 12 May 2023, Andre Vieira (lists) wrote:


I have dealt with, I think..., most of your comments. There's quite a few
changes, I think it's all a bit simpler now. I made some other changes to the
costing in tree-inline.cc and gimple-range-op.cc in which I try to preserve
the same behaviour as we had with the tree codes before. Also added some extra
checks to tree-cfg.cc that made sense to me.

I am still regression testing the gimple-range-op change, as that was a last
minute change, but the rest survived a bootstrap and regression test on
aarch64-unknown-linux-gnu.

cover letter:

This patch replaces the existing tree_code widen_plus and widen_minus
patterns with internal_fn versions.

DEF_INTERNAL_OPTAB_WIDENING_HILO_FN and DEF_INTERNAL_OPTAB_NARROWING_HILO_FN
are like DEF_INTERNAL_SIGNED_OPTAB_FN and DEF_INTERNAL_OPTAB_FN respectively
except they provide convenience wrappers for defining conversions that require
a hi/lo split.  Each definition for <NAME> will require optabs for _hi and _lo
and each of those will also require a signed and unsigned version in the case
of widening. The hi/lo pair is necessary because the widening and narrowing
operations take n narrow elements as inputs and return n/2 wide elements as
outputs. The 'lo' operation operates on the first n/2 elements of input. The
'hi' operation operates on the second n/2 elements of input. Defining an
internal_fn along with hi/lo variations allows a single internal function to
be returned from a vect_recog function that will later be expanded to hi/lo.


  For example:
  IFN_VEC_WIDEN_PLUS -> IFN_VEC_WIDEN_PLUS_HI, IFN_VEC_WIDEN_PLUS_LO
for aarch64: IFN_VEC_WIDEN_PLUS_HI -> vec_widen_<su>add_hi_<mode> -> (u/s)addl2
             IFN_VEC_WIDEN_PLUS_LO -> vec_widen_<su>add_lo_<mode> -> (u/s)addl

This gives the same functionality as the previous WIDEN_PLUS/WIDEN_MINUS tree
codes which are expanded into VEC_WIDEN_PLUS_LO, VEC_WIDEN_PLUS_HI.


What I still don't understand is how we are so narrowly focused on
HI/LO?  We need a combined scalar IFN for pattern selection (not
sure why that's now called _HILO, I expected no suffix).  Then there's
three possibilities the target can implement this:

  1) with a widen_[su]add instruction - I _think_ that's what
 RISCV is going to offer since it is a target where vector modes
 have "padding" (aka you cannot subreg a V2SI to get V4HI).  Instead
 RVV can do a V4HI to V4SI widening and widening add/subtract
 using vwadd[u] and vwsub[u] (the HI->SI widening is actually
 done with a widening add of zero - eh).
 IIRC GCN is the same here.
  2) with a widen_[su]add{_lo,_hi} combo - that's what the tree
 codes currently support (exclusively)
  3) similar, but widen_[su]add{_even,_odd}

that said, things like decomposes_to_hilo_fn_p look to paint us into
a 2) corner without good reason.


I was kind of just keeping the naming, I had forgotten to mention I was 
also going to add _EVENODD but you are right, the pattern selection IFN 
does not need to be restrictive.


And then at supportable_widening_operation we could check what the 
target offers support for (either 1, 2 or 3). We can then actually just 
get rid of decomposes_to_hilo_fn_p and just assume that for all 
narrowing or widening IFN's there are optabs (that may or may not be 
implemented by a target) for all three variants


Having said that, that means we should have an optab to cover 1, which 
should probably just have the original name. Let me write it out...


Say we have an IFN_VEC_WIDEN_PLUS pattern and assume it's signed, 
supportable_widening_operation would then first check if the target 
supported vec_widen_sadd_optab for say V8HI -> V8SI? Risc-V would take 
this path I guess?


If the target doesn't then it could check for support for:
vec_widen_sadd_lo_optab V4HI -> V4SI
vec_widen_sadd_hi_optab V4HI -> V4SI

AArch64 Advanced SIMD would implement this.

If the target still didn't support this it would check for (not sure 
about the modes here):

vec_widen_sadd_even_optab VNx8HI -> VNx4SI
vec_widen_sadd_odd_optab VNx8HI -> VNx4SI

This is one SVE would implement.


So that would mean that I'd probably end up rewriting
#define DEF_INTERNAL_OPTAB_WIDENING_FN (NAME, FLAGS, SELECTOR, SOPTAB, 
UOPTAB, TYPE)

as:
for 1)
  DEF_INTERNAL_SIGNED_OPTAB_FN (NAME, FLAGS, SELECTOR, SOPTAB, UOPTAB, 
TYPE)


for 2)
  DEF_INTERNAL_SIGNED_OPTAB_FN (NAME##_LO, FLAGS, SELECTOR, SOPTAB, 
UOPTAB, TYPE)
  DEF_INTERNAL_SIGNED_OPTAB_FN (NAME##_HI, FLAGS, SELECTOR, SOPTAB, 
UOPTAB, TYPE)


for 3)
  DEF_INTERNAL_SIGNED_OPTAB_FN (NAME##_EVEN, FLAGS, SELECTOR, SOPTAB, 
UOPTAB, TYPE)
  DEF_INTERNAL_SIGNED_OPTAB_FN (NAME##_ODD, FLAGS, SELECTOR, SOPTAB, 
UOPTAB, TYPE)


And the same for narrowing (but with DEF_INTERNAL_OPTAB_FN instead of 
SIGNED_OPTAB).


So each widening and narrowing IFN would have optabs for all its 
variants and each target implements the ones it supports.


I'm happy to do this, but im

Re: [PATCH 2/3] Refactor widen_plus as internal_fn

2023-05-12 Thread Richard Sandiford via Gcc-patches
Richard Biener  writes:
> On Fri, 12 May 2023, Andre Vieira (lists) wrote:
>
>> I have dealt with, I think..., most of your comments. There's quite a few
>> changes, I think it's all a bit simpler now. I made some other changes to the
>> costing in tree-inline.cc and gimple-range-op.cc in which I try to preserve
>> the same behaviour as we had with the tree codes before. Also added some 
>> extra
>> checks to tree-cfg.cc that made sense to me.
>> 
>> I am still regression testing the gimple-range-op change, as that was a last
>> minute change, but the rest survived a bootstrap and regression test on
>> aarch64-unknown-linux-gnu.
>> 
>> cover letter:
>> 
>> This patch replaces the existing tree_code widen_plus and widen_minus
>> patterns with internal_fn versions.
>> 
>> DEF_INTERNAL_OPTAB_WIDENING_HILO_FN and DEF_INTERNAL_OPTAB_NARROWING_HILO_FN
>> are like DEF_INTERNAL_SIGNED_OPTAB_FN and DEF_INTERNAL_OPTAB_FN respectively
>> except they provide convenience wrappers for defining conversions that 
>> require
> >> a hi/lo split.  Each definition for <NAME> will require optabs for _hi and 
>> _lo
>> and each of those will also require a signed and unsigned version in the case
>> of widening. The hi/lo pair is necessary because the widening and narrowing
>> operations take n narrow elements as inputs and return n/2 wide elements as
>> outputs. The 'lo' operation operates on the first n/2 elements of input. The
>> 'hi' operation operates on the second n/2 elements of input. Defining an
>> internal_fn along with hi/lo variations allows a single internal function to
>> be returned from a vect_recog function that will later be expanded to hi/lo.
>> 
>> 
>>  For example:
>>  IFN_VEC_WIDEN_PLUS -> IFN_VEC_WIDEN_PLUS_HI, IFN_VEC_WIDEN_PLUS_LO
>> for aarch64: IFN_VEC_WIDEN_PLUS_HI   -> vec_widen_add_hi_ ->
>> (u/s)addl2
>>IFN_VEC_WIDEN_PLUS_LO  -> vec_widen_add_lo_
>> -> (u/s)addl
>> 
>> This gives the same functionality as the previous WIDEN_PLUS/WIDEN_MINUS tree
>> codes which are expanded into VEC_WIDEN_PLUS_LO, VEC_WIDEN_PLUS_HI.
>
> What I still don't understand is how we are so narrowly focused on
> HI/LO?  We need a combined scalar IFN for pattern selection (not
> sure why that's now called _HILO, I expected no suffix).  Then there's
> three possibilities the target can implement this:
>
>  1) with a widen_[su]add instruction - I _think_ that's what
> RISCV is going to offer since it is a target where vector modes
> have "padding" (aka you cannot subreg a V2SI to get V4HI).  Instead
> RVV can do a V4HI to V4SI widening and widening add/subtract
> using vwadd[u] and vwsub[u] (the HI->SI widening is actually
> done with a widening add of zero - eh).
> IIRC GCN is the same here.

SVE currently does this too, but the addition and widening are
separate operations.  E.g. in principle there's no reason why
you can't sign-extend one operand, zero-extend the other, and
then add the result together.  Or you could extend them from
different sizes (QI and HI).  All of those are supported
(if the costing allows them).

If the target has operations to do combined extending and adding (or
whatever), then at the moment we rely on combine to generate them.

So I think this case is separate from Andre's work.  The addition
itself is just an ordinary addition, and any widening happens by
vectorising a CONVERT/NOP_EXPR.

>  2) with a widen_[su]add{_lo,_hi} combo - that's what the tree
> codes currently support (exclusively)
>  3) similar, but widen_[su]add{_even,_odd}
>
> that said, things like decomposes_to_hilo_fn_p look to paint us into
> a 2) corner without good reason.

I suppose one question is: how much of the patch is really specific
to HI/LO, and how much is just grouping two halves together?  The nice
thing about the internal-fn grouping macros is that, if (3) is
implemented in future, the structure will strongly encourage even/odd
pairs to be supported for all operations that support hi/lo.  That is,
I would expect the grouping macros to be extended to define even/odd
ifns alongside hi/lo ones, rather than adding separate definitions
for even/odd functions.

If so, at least from the internal-fn.* side of things, I think the question
is whether it's OK to stick with hilo names for now, or whether we should
use more forward-looking names.

Thanks,
Richard

>
> Richard.
>
>> gcc/ChangeLog:
>> 
>> 2023-05-12  Andre Vieira  
>> Joel Hutton  
>> Tamar Christina  
>> 
>> * config/aarch64/aarch64-simd.md (vec_widen_addl_lo_):
>> Rename
>> this ...
>> (vec_widen_add_lo_): ... to this.
>> (vec_widen_addl_hi_): Rename this ...
>> (vec_widen_add_hi_): ... to this.
>> (vec_widen_subl_lo_): Rename this ...
>> (vec_widen_sub_lo_): ... to this.
>> (vec_widen_subl_hi_): Rename this ...
>> (vec_widen_sub_hi_): ...to this.
>> * doc/generic.texi: Document new IFN codes.
>>  * internal-fn.c

Re: [committed] Convert xstormy16 to LRA

2023-05-12 Thread Hans-Peter Nilsson via Gcc-patches
> From: Hans-Peter Nilsson 
> Date: Fri, 12 May 2023 15:53:49 +0200

> Anyway, Roger mentioned that the clobbers emitted by the
> lower-subreg passes were apparently damaging, so I'll try
> this out "for fun", on the assumption that they're actually
> unnecessary.  I don't think actually removing them has been
> attempted?

> --- lower-subreg.cc.orig  2023-04-29 02:53:39.0 +0200
> +++ lower-subreg.cc   2023-05-12 15:35:25.574668930 +0200

Bah, then I noticed r14-554-gd8a6945c6ea22e, committed
several days ago.  I should have checked up-to-date sources...
Thanks Roger!

Now off to measure the impact.  Maybe up to par with reload
now? :)

brgds, H-P


RE: [committed] Convert xstormy16 to LRA

2023-05-12 Thread Roger Sayle


Hi H-P,
This patch should now already be on trunk:
https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=d8a6945c6ea22efa4d5e42fe1922d2b27953c8cd
Many thanks to Jeff for the review/approval.
There have been no reported adverse effects so far.
Please let me/us know if this has helped CRIS.

Cheers,
Roger
--

-Original Message-
From: Hans-Peter Nilsson  
Sent: 12 May 2023 14:54
To: Hans-Peter Nilsson 
Cc: ro...@nextmovesoftware.com; jeffreya...@gmail.com;
gcc-patches@gcc.gnu.org; seg...@kernel.crashing.org
Subject: Re: [committed] Convert xstormy16 to LRA

> From: Hans-Peter Nilsson 
> Date: Thu, 11 May 2023 17:05:40 +0200

> Next, I'll turn around completely, and try defaulting to 
> -fsplit-wide-types-early, which sounds more promising. :) I don't like 
> throwing defaults around randomly, but trying out a promising idea 
> this way is easy.

Absolutely nothing changed (not counting now running "subreg2" and
generating a dump-file), compared to the default.  Besides coremark and
local micro-benchmarks I inspected running arith-rand-ll.c with -O2 and
briefly stepped through the passes with gdb: the costs guiding the splits
are fine, properly enabling the splits, but not all DImode registers are
naturally "splittable"; looks like the ones used in non-decomposable
operations remain.  It seems all splittable opportunities are dealt with by
the first pass ("subreg1").  I guess this pass has the most impact for
targets that have few or no DImode operations at all.

But why is the option called -fsplit-wide-types-early when what it does is
enabling a "subreg2" pass, there being "subreg1" and "subreg3" enabled with
-fsplit-wide-types?  It should rather be called -fsplit-wide-types-second!
:)

Looking at its placement in passes.def makes me wonder what magic properties
targets have that benefit from it.

Anyway, Roger mentioned that the clobbers emitted by the lower-subreg passes
were apparently damaging, so I'll try this out "for fun", on the assumption
that they're actually unnecessary.  I don't think actually removing them has
been attempted?

The patch below seems to substantially lower register pressure for
arith-rand-ll for CRIS, but I've only inspected the assembly source (not
even compared the result to the reload version).  Quoting it for reference
only, and if it "works" (passes regtest for cris-elf and x86-64-linux) I
think I'll resubmit as a proper patch:

--- lower-subreg.cc.orig 2023-04-29 02:53:39.0 +0200
+++ lower-subreg.cc 2023-05-12 15:35:25.574668930 +0200
@@ -1086,9 +1086,6 @@ resolve_simple_move (rtx set, rtx_insn *
 {
   unsigned int i;
 
-  if (REG_P (dest) && !HARD_REGISTER_NUM_P (REGNO (dest)))
-   emit_clobber (dest);
-
   for (i = 0; i < words; ++i)
{
  rtx t = simplify_gen_subreg_concatn (word_mode, dest,

brgds, H-P



Re: [PATCH] i386: Honour -mdirect-extern-access when calling __fentry__

2023-05-12 Thread Ard Biesheuvel via Gcc-patches
On Thu, 11 May 2023 at 08:08, Uros Bizjak  wrote:
>
> On Thu, May 11, 2023 at 12:04 AM H.J. Lu  wrote:
> >
> > On Wed, May 10, 2023 at 2:17 AM Uros Bizjak  wrote:
> > >
> > > On Tue, May 9, 2023 at 10:58 AM Ard Biesheuvel  wrote:
> > > >
> > > > The small and medium PIC code models generate profiling calls that
> > > > always load the address of __fentry__() via the GOT, even if
> > > > -mdirect-extern-access is in effect.
> > > >
> > > > This deviates from the behavior with respect to other external
> > > > references, and results in a longer opcode that relies on linker
> > > > relaxation to eliminate the GOT load. In this particular case, the
> > > > transformation replaces an indirect 'CALL *__fentry__@GOTPCREL(%rip)'
> > > > with either 'CALL __fentry__; NOP' or 'NOP; CALL __fentry__', where the
> > > > NOP is a 1 byte NOP that preserves the 6 byte length of the sequence.
> > > >
> > > > This is problematic for the Linux kernel, which generally relies on
> > > > -mdirect-extern-access and hidden visibility to eliminate GOT based
> > > > symbol references in code generated with -fpie/-fpic, without having to
> > > > depend on linker relaxation.
> > > >
> > > > The Linux kernel relies on code patching to replace these opcodes with
> > > > NOPs at runtime, and this is complicated code that we'd prefer not to
> > > > complicate even more by adding support for patching both 5 and 6 byte
> > > > sequences as well as parsing the instruction stream to decide which
> > > > variant of CALL+NOP we are dealing with.
> > > >
> > > > So let's honour -mdirect-extern-access, and only load the address of
> > > > __fentry__ via the GOT if direct references to external symbols are not
> > > > permitted.
> > > >
> > > > Note that the GOT reference in question is in fact a data reference: we
> > > > explicitly load the address of __fentry__ from the GOT, which amounts to
> > > > eager binding, rather than emitting a PLT call that could bind eagerly,
> > > > lazily or directly at link time.
> > > >
> > > > gcc/ChangeLog:
> > > >
> > > > * config/i386/i386.cc (x86_function_profiler): Take
> > > >   ix86_direct_extern_access into account when generating calls
> > > >   to __fentry__()
> > >
> > > HJ, is the patch OK with you?
> >
> > LGTM.
>
> OK then.
>

Thanks all. Is anything expected of me at this point?


Re: [PATCH v5 3/5] p1689r5: initial support

2023-05-12 Thread Ben Boeckel via Gcc-patches
On Tue, Feb 14, 2023 at 16:50:27 -0500, Jason Merrill wrote:
> I notice that the actual flags are all -fdep-*, though some of them are 
> -fdeps-* here, and the internal variables all seem to be fdeps_*.  I 
> lean toward harmonizing on "deps", I think.

Done.

> I don't love the three separate options, but I suppose it's fine.  I'd 
> prefer "target" instead of "output".

Done.

> It should be possible to omit both -file and -target and get reasonable 
> defaults, like the ones for -MD/-MQ in gcc.cc:cpp_unique_options.

`file` can be omitted (the `output_stream` will be used then). I *think*
I see that adding:

%{fdeps_file:-fdeps-file=%{!o:%b.ddi}%{o*:%.ddi%*}}

would at least do for `-fdeps-file` defaults? I don't know if there's a
reasonable default for `-fdeps-target=` though given that this command
line has no information about the object file that will be used (`-o` is
used for preprocessor output since we're leaning on `-E` here).

--Ben


Re: [PATCH v5 1/5] libcpp: reject codepoints above 0x10FFFF

2023-05-12 Thread Ben Boeckel via Gcc-patches
On Mon, Feb 13, 2023 at 10:53:17 -0500, Jason Merrill wrote:
> On 1/25/23 13:06, Ben Boeckel wrote:
> > Unicode does not support such values because they are unrepresentable in
> > UTF-16.
> > 
> > libcpp/
> > 
> > * charset.cc: Reject encodings of codepoints above 0x10FFFF.
> > UTF-16 does not support such codepoints and therefore all
> > Unicode rejects such values.
> 
> It seems that this causes a bunch of testsuite failures from tests that 
> expect this limit to be checked elsewhere with a different diagnostic, 
> so I think the easiest thing is to fold this into _cpp_valid_utf8_str 
> instead, i.e.:

Since then, `cpp_valid_utf8_p` has appeared and takes care of the
over-long encodings. The new patchset just checks for codepoints beyond
0x10FFFF and rejects them in this function (and the test suite matches
`master` results for me then).

--Ben


Re: [PATCH v5 4/5] c++modules: report imported CMI files as dependencies

2023-05-12 Thread Ben Boeckel via Gcc-patches
On Mon, Feb 13, 2023 at 13:33:50 -0500, Jason Merrill wrote:
> Both this and the mapper dependency patch seem to cause most of the 
> modules testcases to crash; please remember to run the regression tests 
> (https://gcc.gnu.org/contribute.html#testing)

Fixed for v6. `cpp_get_deps` can return `NULL` which `deps_add_dep`
assumes to not be true; fixed by checking before calling.

--Ben


[PATCH V2] RISC-V: Using merge approach to optimize repeating sequence in vec_init

2023-05-12 Thread juzhe . zhong
From: Juzhe-Zhong 

Address comment from Robin.

Consider this following case:
typedef int64_t vnx32di __attribute__ ((vector_size (256)));


__attribute__ ((noipa)) void
f_vnx32di (int64_t a, int64_t b, int64_t *out)
{
  vnx32di v
= {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, 
a, b, a, b, a, b, a, b};
  *(vnx32di *) out = v;
}

Since we don't have SEW = 128 in vec_duplicate, we can't combine ab into a
SEW = 128 element and then broadcast this big element.

This patch optimizes the case above.

-march=rv64gcv_zvl256b --param riscv-autovec-preference=fixed-vlmax

Before this patch:

..
vslide1down.vx (x31 times)
..

After this patch:
li      a5,-1431654400
addi    a5,a5,-1365
li      a3,-1431654400
addi    a3,a3,-1366
slli    a5,a5,32
add     a5,a5,a3
vsetvli a4,zero,e64,m8,ta,ma
vmv.v.x v8,a0
vmv.s.x v0,a5
vmerge.vxm  v8,v8,a1,v0
vs8r.v  v8,0(a2)
ret

---
 gcc/config/riscv/riscv-v.cc   | 260 --
 .../riscv/rvv/autovec/vls-vlmax/repeat-10.c   |  19 ++
 .../riscv/rvv/autovec/vls-vlmax/repeat-11.c   |  25 ++
 .../riscv/rvv/autovec/vls-vlmax/repeat-7.c|  25 ++
 .../riscv/rvv/autovec/vls-vlmax/repeat-8.c|  15 +
 .../riscv/rvv/autovec/vls-vlmax/repeat-9.c|  16 ++
 .../rvv/autovec/vls-vlmax/repeat_run-11.c |  45 +++
 .../rvv/autovec/vls-vlmax/repeat_run-7.c  |  45 +++
 .../rvv/autovec/vls-vlmax/repeat_run-8.c  |  41 +++
 9 files changed, 463 insertions(+), 28 deletions(-)
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/repeat-10.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/repeat-11.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/repeat-7.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/repeat-8.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/repeat-9.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/repeat_run-11.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/repeat_run-7.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/repeat_run-8.c

diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index b8dc333f54e..c95a506b9ec 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -68,15 +68,23 @@ public:
   {
 add_input_operand (CONSTM1_RTX (mode), mode);
   }
+  void add_scalar_move_mask_operand (machine_mode mode)
+  {
+rtx scalar_move_mask = gen_scalar_move_mask (mode);
+add_input_operand (scalar_move_mask, mode);
+  }
   void add_vundef_operand (machine_mode mode)
   {
 add_input_operand (RVV_VUNDEF (mode), mode);
   }
-  void add_policy_operand (enum tail_policy vta, enum mask_policy vma)
+  void add_ta_policy_operand (enum tail_policy vta)
   {
 rtx tail_policy_rtx = gen_int_mode (vta, Pmode);
-rtx mask_policy_rtx = gen_int_mode (vma, Pmode);
 add_input_operand (tail_policy_rtx, Pmode);
+  }
+  void add_ma_policy_operand (enum mask_policy vma)
+  {
+rtx mask_policy_rtx = gen_int_mode (vma, Pmode);
 add_input_operand (mask_policy_rtx, Pmode);
   }
   void add_avl_type_operand (avl_type type)
@@ -84,40 +92,52 @@ public:
 add_input_operand (gen_int_mode (type, Pmode), Pmode);
   }
 
-  void set_dest_and_mask (rtx mask, rtx dest, machine_mode mask_mode)
+  void set_dest_and_mask (rtx mask, rtx dest, machine_mode mask_mode,
+ bool scalar_move = false, bool merge_op = false)
   {
 dest_mode = GET_MODE (dest);
 has_dest = true;
 
 add_output_operand (dest, dest_mode);
 
-if (mask)
-  add_input_operand (mask, GET_MODE (mask));
-else
-  add_all_one_mask_operand (mask_mode);
+if (!merge_op)
+  {
+   if (mask)
+ add_input_operand (mask, GET_MODE (mask));
+   else
+ {
+   if (scalar_move)
+ add_scalar_move_mask_operand (mask_mode);
+   else
+ add_all_one_mask_operand (mask_mode);
+ }
+  }
 
 add_vundef_operand (dest_mode);
   }
 
-  void set_len_and_policy (rtx len, bool force_vlmax = false)
-{
-  bool vlmax_p = force_vlmax;
-  gcc_assert (has_dest);
+  void set_len_and_policy (rtx len, bool force_vlmax = false, bool ta_p = true,
+  bool ma_p = true)
+  {
+bool vlmax_p = force_vlmax;
+gcc_assert (has_dest);
 
-  if (!len)
-   {
- vlmax_p = true;
- len = gen_reg_rtx (Pmode);
- emit_vlmax_vsetvl (dest_mode, len);
-   }
+if (!len)
+  {
+   vlmax_p = true;
+   len = gen_reg_rtx (Pmode);
+   emit_vlmax_vsetvl (dest_mode, len);
+  }
 
-  add_input_operand (len, Pmode);
+add_input_operand (len, Pmode);
 
-  if (GET_MODE_CLASS (dest_mode) != MODE_VECTOR_BOOL)
-   add_policy_operand (get_prefe

PING: [PATCH] release the sorted FDE array when deregistering a frame [PR109685]

2023-05-12 Thread Thomas Neumann via Gcc-patches
Summary: The old linear scan logic called free while searching the list 
of frames. The atomic fast path finds the frame quickly, but forgot the 
free call. This patch adds the missing free. Bugzilla #109685.


See:
https://gcc.gnu.org/pipermail/gcc-patches/2023-May/617245.html

Best

Thomas


Re: [PATCH V2] RISC-V: Using merge approach to optimize repeating sequence in vec_init

2023-05-12 Thread Kito Cheng via Gcc-patches
two minor comments:

> +  void add_ta_policy_operand (enum tail_policy vta)
> +  void add_ma_policy_operand (enum mask_policy vma)

You could just name both add_policy_operand, since the argument type is
already sufficient to distinguish them.

> @@ -84,40 +92,52 @@ public:
>  add_input_operand (gen_int_mode (type, Pmode), Pmode);
>}
>
> -  void set_dest_and_mask (rtx mask, rtx dest, machine_mode mask_mode)
> +  void set_dest_and_mask (rtx mask, rtx dest, machine_mode mask_mode,
> + bool scalar_move = false, bool merge_op = false)
>{
>  dest_mode = GET_MODE (dest);
>  has_dest = true;
>
>  add_output_operand (dest, dest_mode);
>
> -if (mask)
> -  add_input_operand (mask, GET_MODE (mask));
> -else
> -  add_all_one_mask_operand (mask_mode);
> +if (!merge_op)
> +  {
> +   if (mask)
> + add_input_operand (mask, GET_MODE (mask));
> +   else
> + {
> +   if (scalar_move)
> + add_scalar_move_mask_operand (mask_mode);
> +   else
> + add_all_one_mask_operand (mask_mode);
> + }
> +  }

I would like to have set_dest_and_mask_merge_op to reduce the
complexity of set_dest_and_mask.
According to the code, what I understand is:
- mask and merge_op are mutually exclusive.
- scalar_move becomes meaningless if merge_op is true.

So I think mixing those together is not a good idea.


[PATCH V3] RISC-V: Using merge approach to optimize repeating sequence in vec_init

2023-05-12 Thread juzhe . zhong
From: Juzhe-Zhong 

Address comments from Kito.

Consider this following case:
typedef int64_t vnx32di __attribute__ ((vector_size (256)));


__attribute__ ((noipa)) void
f_vnx32di (int64_t a, int64_t b, int64_t *out)
{
  vnx32di v
= {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, 
a, b, a, b, a, b, a, b};
  *(vnx32di *) out = v;
}

Since we don't have SEW = 128 in vec_duplicate, we can't combine ab into a
SEW = 128 element and then broadcast this big element.

This patch optimizes the case above.

-march=rv64gcv_zvl256b --param riscv-autovec-preference=fixed-vlmax

Before this patch:

..
vslide1down.vx (x31 times)
..

After this patch:
li      a5,-1431654400
addi    a5,a5,-1365
li      a3,-1431654400
addi    a3,a3,-1366
slli    a5,a5,32
add     a5,a5,a3
vsetvli a4,zero,e64,m8,ta,ma
vmv.v.x v8,a0
vmv.s.x v0,a5
vmerge.vxm  v8,v8,a1,v0
vs8r.v  v8,0(a2)
ret

gcc/ChangeLog:

* config/riscv/riscv-v.cc 
(rvv_builder::can_duplicate_repeating_sequence_p): New function.
(rvv_builder::get_merged_repeating_sequence): Ditto.
(rvv_builder::repeating_sequence_use_merge_profitable_p): Ditto.
(rvv_builder::get_merge_mask_bitfield): Ditto.
(emit_scalar_move_op): Ditto.
(emit_merge_op): Ditto.
(expand_vector_init_merge_repeating_sequence): Ditto.
(expand_vec_init): Add merge approach for repeating sequence.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/vls-vlmax/repeat-10.c: New test.
* gcc.target/riscv/rvv/autovec/vls-vlmax/repeat-11.c: New test.
* gcc.target/riscv/rvv/autovec/vls-vlmax/repeat-7.c: New test.
* gcc.target/riscv/rvv/autovec/vls-vlmax/repeat-8.c: New test.
* gcc.target/riscv/rvv/autovec/vls-vlmax/repeat-9.c: New test.
* gcc.target/riscv/rvv/autovec/vls-vlmax/repeat_run-11.c: New test.
* gcc.target/riscv/rvv/autovec/vls-vlmax/repeat_run-7.c: New test.
* gcc.target/riscv/rvv/autovec/vls-vlmax/repeat_run-8.c: New test.

---
 gcc/config/riscv/riscv-v.cc   | 245 --
 .../riscv/rvv/autovec/vls-vlmax/repeat-10.c   |  19 ++
 .../riscv/rvv/autovec/vls-vlmax/repeat-11.c   |  25 ++
 .../riscv/rvv/autovec/vls-vlmax/repeat-7.c|  25 ++
 .../riscv/rvv/autovec/vls-vlmax/repeat-8.c|  15 ++
 .../riscv/rvv/autovec/vls-vlmax/repeat-9.c|  16 ++
 .../rvv/autovec/vls-vlmax/repeat_run-11.c |  45 
 .../rvv/autovec/vls-vlmax/repeat_run-7.c  |  45 
 .../rvv/autovec/vls-vlmax/repeat_run-8.c  |  41 +++
 9 files changed, 453 insertions(+), 23 deletions(-)
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/repeat-10.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/repeat-11.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/repeat-7.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/repeat-8.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/repeat-9.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/repeat_run-11.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/repeat_run-7.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/repeat_run-8.c

diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index b8dc333f54e..b336b11228b 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -72,11 +72,14 @@ public:
   {
 add_input_operand (RVV_VUNDEF (mode), mode);
   }
-  void add_policy_operand (enum tail_policy vta, enum mask_policy vma)
+  void add_policy_operand (enum tail_policy vta)
   {
 rtx tail_policy_rtx = gen_int_mode (vta, Pmode);
-rtx mask_policy_rtx = gen_int_mode (vma, Pmode);
 add_input_operand (tail_policy_rtx, Pmode);
+  }
+  void add_policy_operand (enum mask_policy vma)
+  {
+rtx mask_policy_rtx = gen_int_mode (vma, Pmode);
 add_input_operand (mask_policy_rtx, Pmode);
   }
   void add_avl_type_operand (avl_type type)
@@ -99,25 +102,36 @@ public:
 add_vundef_operand (dest_mode);
   }
 
-  void set_len_and_policy (rtx len, bool force_vlmax = false)
-{
-  bool vlmax_p = force_vlmax;
-  gcc_assert (has_dest);
+  void set_dest_merge (rtx dest)
+  {
+dest_mode = GET_MODE (dest);
+has_dest = true;
+add_output_operand (dest, dest_mode);
+add_vundef_operand (dest_mode);
+  }
 
-  if (!len)
-   {
- vlmax_p = true;
- len = gen_reg_rtx (Pmode);
- emit_vlmax_vsetvl (dest_mode, len);
-   }
+  void set_len_and_policy (rtx len, bool force_vlmax = false, bool ta_p = true,
+  bool ma_p = true)
+  {
+bool vlmax_p = force_vlmax;
+gcc_assert (has_dest);
 
-  add_input_operand (len, Pmode);
+if (!len)
+  {
+   vlmax_p = true;
+   len = gen_reg

[PATCH v2] Machine_Mode: Extend machine_mode from 8 to 16 bits

2023-05-12 Thread Pan Li via Gcc-patches
From: Pan Li 

We are running out of machine_mode values (8 bits) in the RISC-V backend.
Thus we would like to extend the machine_mode bit size from 8 to 16 bits.
However, growing the memory size of common structures like tree or rtx is
sensitive. This patch extends the machine_mode field to 16 bits by
shrinking other fields, like:

* Swap the bit sizes of code and machine mode in rtx_def.
* Reconcile the machine_mode location and spare in tree.

The memory impact of this patch for correlated structure looks like below:

+-------------------+----------+---------+------+
| struct/bytes      | upstream | patched | diff |
+-------------------+----------+---------+------+
| rtx_obj_reference |        8 |      12 |   +4 |
| ext_modified      |        2 |       4 |   +2 |
| ira_allocno       |      192 |     184 |   -8 |
| qty_table_elem    |       40 |      40 |    0 |
| reg_stat_type     |       64 |      64 |    0 |
| rtx_def           |       40 |      40 |    0 |
| table_elt         |       80 |      80 |    0 |
| tree_decl_common  |      112 |     112 |    0 |
| tree_type_common  |      128 |     128 |    0 |
+-------------------+----------+---------+------+

The tree- and rtx-related structs have no memory size change after this
patch, while machine_mode is now 16 bits wide.

Signed-off-by: Pan Li 
Co-authored-by: Ju-Zhe Zhong 
Co-authored-by: Kito Cheng 
Co-Authored-By: Richard Biener 
Co-Authored-By: Richard Sandiford 

gcc/ChangeLog:

* combine.cc (struct reg_stat_type): Extended machine_mode to 16 bits.
* cse.cc (struct qty_table_elem): Extended machine_mode to 16 bits
and re-ordered the struct fields for alignment.
(struct table_elt): Extended machine_mode to 16 bits.
(struct set): Ditto.
* genopinit.cc (main): Reconciled the machine_mode limit.
* ira-int.h (struct ira_allocno): Extended machine_mode to 16 bits and
re-ordered the struct fields for padding.
* machmode.h (MACHINE_MODE_BITSIZE): New macro.
* ree.cc (struct ext_modified): Extended machine_mode to 16 bits and
removed the ATTRIBUTE_PACKED.
* rtl-ssa/accesses.h: Extended machine_mode to 16 bits.
* rtl.h (RTX_CODE_BITSIZE): New macro.
(struct rtx_def): Swap both the bit size and location between the
rtx_code and the machine_mode.
(subreg_shape::unique_id): Reconciled the machine_mode limit.
* rtlanal.h: Extended machine_mode to 16 bits.
* tree-core.h (struct tree_type_common): Extended machine_mode to 16
bits and re-ordered the struct fields for padding.
(struct tree_decl_common): Extended machine_mode to 16 bits.
---
 gcc/combine.cc |  4 +--
 gcc/cse.cc | 16 
 gcc/genopinit.cc   |  3 ++-
 gcc/ira-int.h  | 56 +-
 gcc/machmode.h |  2 ++
 gcc/ree.cc |  4 +--
 gcc/rtl-ssa/accesses.h |  2 +-
 gcc/rtl.h  | 12 +
 gcc/rtlanal.h  |  2 +-
 gcc/tree-core.h|  9 ---
 10 files changed, 55 insertions(+), 55 deletions(-)

diff --git a/gcc/combine.cc b/gcc/combine.cc
index 5aa0ec5c45a..a23caeed96f 100644
--- a/gcc/combine.cc
+++ b/gcc/combine.cc
@@ -200,7 +200,7 @@ struct reg_stat_type {
 
   unsigned HOST_WIDE_INT   last_set_nonzero_bits;
   char last_set_sign_bit_copies;
-  ENUM_BITFIELD(machine_mode)  last_set_mode : 8;
+  ENUM_BITFIELD(machine_mode)  last_set_mode : MACHINE_MODE_BITSIZE;
 
   /* Set nonzero if references to register n in expressions should not be
  used.  last_set_invalid is set nonzero when this register is being
@@ -235,7 +235,7 @@ struct reg_stat_type {
  truncation if we know that value already contains a truncated
  value.  */
 
-  ENUM_BITFIELD(machine_mode)  truncated_to_mode : 8;
+  ENUM_BITFIELD(machine_mode)  truncated_to_mode : MACHINE_MODE_BITSIZE;
 };
 
 
diff --git a/gcc/cse.cc b/gcc/cse.cc
index b10c9b0c94d..86403b95938 100644
--- a/gcc/cse.cc
+++ b/gcc/cse.cc
@@ -248,10 +248,8 @@ struct qty_table_elem
   rtx comparison_const;
   int comparison_qty;
   unsigned int first_reg, last_reg;
-  /* The sizes of these fields should match the sizes of the
- code and mode fields of struct rtx_def (see rtl.h).  */
-  ENUM_BITFIELD(rtx_code) comparison_code : 16;
-  ENUM_BITFIELD(machine_mode) mode : 8;
+  ENUM_BITFIELD(machine_mode) mode : MACHINE_MODE_BITSIZE;
+  ENUM_BITFIELD(rtx_code) comparison_code : RTX_CODE_BITSIZE;
 };
 
 /* The table of all qtys, indexed by qty number.  */
@@ -404,9 +402,7 @@ struct table_elt
   struct table_elt *related_value;
   int cost;
   int regcost;
-  /* The size of this field should match the size
- of the mode field of struct rtx_def (see rtl.h).  */
-  ENUM_BITFIELD(machine_mode) mode : 8;
+  ENUM_BITFIELD(machine_mode) mode : MACHINE_MODE_BITSIZE;
   char in_memory;
   char is_const;
   char flag;
@@ -4152,10 +4148,8 @@ struct set
   /* Nonzero if the SET_S

[PATCH V4] RISC-V: Using merge approach to optimize repeating sequence in vec_init

2023-05-12 Thread juzhe . zhong
From: Juzhe-Zhong 

Patch V3 failed a testcase.
With V4 all tests pass.
This version also addresses Kito's comments.

Consider this following case:
typedef int64_t vnx32di __attribute__ ((vector_size (256)));


__attribute__ ((noipa)) void
f_vnx32di (int64_t a, int64_t b, int64_t *out)
{
  vnx32di v
= {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, 
a, b, a, b, a, b, a, b};
  *(vnx32di *) out = v;
}

Since we don't have SEW = 128 in vec_duplicate, we can't combine a and b
into a single SEW = 128 element and then broadcast this big element.

This patch optimizes cases like the one above.

-march=rv64gcv_zvl256b --param riscv-autovec-preference=fixed-vlmax

Before this patch:

..
vslide1down.vx (x31 times)
..

After this patch:
li      a5,-1431654400
addi    a5,a5,-1365
li      a3,-1431654400
addi    a3,a3,-1366
slli    a5,a5,32
add     a5,a5,a3
vsetvli a4,zero,e64,m8,ta,ma
vmv.v.x v8,a0
vmv.s.x v0,a5
vmerge.vxm  v8,v8,a1,v0
vs8r.v  v8,0(a2)
ret

gcc/ChangeLog:

* config/riscv/riscv-v.cc 
(rvv_builder::can_duplicate_repeating_sequence_p): New function.
(rvv_builder::get_merged_repeating_sequence): Ditto.
(rvv_builder::repeating_sequence_use_merge_profitable_p): Ditto.
(rvv_builder::get_merge_mask_bitfield): Ditto.
(emit_scalar_move_op): Ditto.
(emit_merge_op): Ditto.
(expand_vector_init_merge_repeating_sequence): Ditto.
(expand_vec_init): Add merge approach for repeating sequence.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/vls-vlmax/repeat-10.c: New test.
* gcc.target/riscv/rvv/autovec/vls-vlmax/repeat-11.c: New test.
* gcc.target/riscv/rvv/autovec/vls-vlmax/repeat-7.c: New test.
* gcc.target/riscv/rvv/autovec/vls-vlmax/repeat-8.c: New test.
* gcc.target/riscv/rvv/autovec/vls-vlmax/repeat-9.c: New test.
* gcc.target/riscv/rvv/autovec/vls-vlmax/repeat_run-11.c: New test.
* gcc.target/riscv/rvv/autovec/vls-vlmax/repeat_run-7.c: New test.
* gcc.target/riscv/rvv/autovec/vls-vlmax/repeat_run-8.c: New test.

---
 gcc/config/riscv/riscv-v.cc   | 245 --
 .../riscv/rvv/autovec/vls-vlmax/repeat-10.c   |  19 ++
 .../riscv/rvv/autovec/vls-vlmax/repeat-11.c   |  25 ++
 .../riscv/rvv/autovec/vls-vlmax/repeat-7.c|  25 ++
 .../riscv/rvv/autovec/vls-vlmax/repeat-8.c|  15 ++
 .../riscv/rvv/autovec/vls-vlmax/repeat-9.c|  16 ++
 .../rvv/autovec/vls-vlmax/repeat_run-11.c |  45 
 .../rvv/autovec/vls-vlmax/repeat_run-7.c  |  45 
 .../rvv/autovec/vls-vlmax/repeat_run-8.c  |  41 +++
 9 files changed, 453 insertions(+), 23 deletions(-)
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/repeat-10.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/repeat-11.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/repeat-7.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/repeat-8.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/repeat-9.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/repeat_run-11.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/repeat_run-7.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/repeat_run-8.c

diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index b8dc333f54e..10cbc1442aa 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -72,11 +72,14 @@ public:
   {
 add_input_operand (RVV_VUNDEF (mode), mode);
   }
-  void add_policy_operand (enum tail_policy vta, enum mask_policy vma)
+  void add_policy_operand (enum tail_policy vta)
   {
 rtx tail_policy_rtx = gen_int_mode (vta, Pmode);
-rtx mask_policy_rtx = gen_int_mode (vma, Pmode);
 add_input_operand (tail_policy_rtx, Pmode);
+  }
+  void add_policy_operand (enum mask_policy vma)
+  {
+rtx mask_policy_rtx = gen_int_mode (vma, Pmode);
 add_input_operand (mask_policy_rtx, Pmode);
   }
   void add_avl_type_operand (avl_type type)
@@ -99,25 +102,36 @@ public:
 add_vundef_operand (dest_mode);
   }
 
-  void set_len_and_policy (rtx len, bool force_vlmax = false)
-{
-  bool vlmax_p = force_vlmax;
-  gcc_assert (has_dest);
+  void set_dest_merge (rtx dest)
+  {
+dest_mode = GET_MODE (dest);
+has_dest = true;
+add_output_operand (dest, dest_mode);
+add_vundef_operand (dest_mode);
+  }
 
-  if (!len)
-   {
- vlmax_p = true;
- len = gen_reg_rtx (Pmode);
- emit_vlmax_vsetvl (dest_mode, len);
-   }
+  void set_len_and_policy (rtx len, bool force_vlmax = false, bool ta_p = true,
+  bool ma_p = true)
+  {
+bool vlmax_p = force_vlmax;
+gcc_assert (has_dest);
 
-  add_input_operand (len, Pmode);
+if (!len)
+  {
+   vlmax_p

Re: Re: [PATCH V2] RISC-V: Using merge approach to optimize repeating sequence in vec_init

2023-05-12 Thread 钟居哲
Address comments.
V4 patch: 
https://gcc.gnu.org/pipermail/gcc-patches/2023-May/618375.html 
Regression PASSED.

Thanks.


juzhe.zh...@rivai.ai
 
From: Kito Cheng
Date: 2023-05-12 23:19
To: juzhe.zhong
CC: gcc-patches; palmer; rdapp.gcc; jeffreyalaw
Subject: Re: [PATCH V2] RISC-V: Using merge approach to optimize repeating 
sequence in vec_init
two minor comments:
 
> +  void add_ta_policy_operand (enum tail_policy vta)
> +  void add_ma_policy_operand (enum mask_policy vma)
 
You could just name both add_policy_operand, since the argument type is
already sufficient to distinguish them.
 
> @@ -84,40 +92,52 @@ public:
>  add_input_operand (gen_int_mode (type, Pmode), Pmode);
>}
>
> -  void set_dest_and_mask (rtx mask, rtx dest, machine_mode mask_mode)
> +  void set_dest_and_mask (rtx mask, rtx dest, machine_mode mask_mode,
> + bool scalar_move = false, bool merge_op = false)
>{
>  dest_mode = GET_MODE (dest);
>  has_dest = true;
>
>  add_output_operand (dest, dest_mode);
>
> -if (mask)
> -  add_input_operand (mask, GET_MODE (mask));
> -else
> -  add_all_one_mask_operand (mask_mode);
> +if (!merge_op)
> +  {
> +   if (mask)
> + add_input_operand (mask, GET_MODE (mask));
> +   else
> + {
> +   if (scalar_move)
> + add_scalar_move_mask_operand (mask_mode);
> +   else
> + add_all_one_mask_operand (mask_mode);
> + }
> +  }
 
I would like to have set_dest_and_mask_merge_op to reduce the
complexity of set_dest_and_mask.
According to the code, what I got is:
- mask and merge_op are mutually exclusive.
- scalar_move becomes meaningless if merge_op is true.
 
So I think mixing those together is not a good idea.
 


Re: [PATCH 1/2] c++: potentiality of templated memfn call [PR109480]

2023-05-12 Thread Martin Jambor
Hello Patrick,

On Wed, May 03 2023, Patrick Palka via Gcc-patches wrote:
>
[...]
>
> Subject: [PATCH] c++: potentiality of templated memfn call [PR109480]
>
> Here we're incorrectly deeming the templated call a.g() inside b's
> initializer as potentially constant, despite g being non-constexpr,
> which leads to us wastefully instantiating the initializer ahead of time,
> which incidentally triggers a bug in access checking deferral (to be
> fixed by the subsequent patch).
>
> This patch fixes this by calling get_fns earlier during CALL_EXPR
> potentiality checking so that we're able to extract a FUNCTION_DECL out
> of a templated member function call (whose overall is typically a
> COMPONENT_REF) and do the usual checking of whether the called function is
> constexpr etc.
>
> In passing, I noticed potential_constant_expression_1's special handling
> of the object argument of a non-static member function call is effectively
> the same as the generic argument handling a few lines later.  So this
> patch just gets rid of this special handling; otherwise we'd have to adapt
> it to handle templated versions of such calls.
>
>   PR c++/109480
>
> gcc/cp/ChangeLog:
>
>   * constexpr.cc (potential_constant_expression_1) <case CALL_EXPR>:
>   Reorganize to call get_fns sooner.  Remove special handling of
>   the object argument of a non-static member function call.  Remove
>   dead store to 'fun'.
>

This patch makes g++ no longer accept the following, complaining that
get_subsys is non-constexpr (with just -std=c++17 -S), which is of
course auto-reduced from a much larger source file from Ceph:

--- 8< ---
struct {
  void get_subsys();
} PriorSet_dpp;
struct PriorSet {
  template <typename> PriorSet();
};
template <typename> PriorSet::PriorSet() {
  [](auto cctX) { cctX.template should_gather<PriorSet_dpp.get_subsys()>; };
}
--- 8< ---

I assume that is intentional and am actually somewhat surprised it was
accepted before, but can you please confirm?

Thanks,

Martin


Re: [PATCH V4] RISC-V: Using merge approach to optimize repeating sequence in vec_init

2023-05-12 Thread Kito Cheng via Gcc-patches
> +/* Get the mask for merge approach.
> +
> + Consider such following case:
> +   {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b}
> + To merge "a", the mask should be 1010
> + To merge "b", the mask should be 0101
> +*/
> +rtx
> +rvv_builder::get_merge_mask_bitfield (unsigned int index) const
> +{
> +  uint64_t base_mask = (1ULL << index);
> +  uint64_t mask = 0;
> +  for (unsigned int i = 0; i < (64 / npatterns ()); i++)

What does the magic 64 mean?
...

> +static void
> +expand_vector_init_merge_repeating_sequence (rtx target,
> +const rvv_builder &builder)
> +{
> +  machine_mode mask_mode;
> +  gcc_assert (get_mask_mode (builder.mode ()).exists (&mask_mode));
> +
> +  machine_mode dup_mode = builder.mode ();
> +  if (known_gt (GET_MODE_SIZE (dup_mode), BYTES_PER_RISCV_VECTOR))
> +{
> +  poly_uint64 nunits
> +   = exact_div (BYTES_PER_RISCV_VECTOR, builder.inner_units ());
> +  gcc_assert (
> +   get_vector_mode (builder.inner_int_mode (), nunits).exists 
> (&dup_mode));

gcc_assert will be removed in release mode, so it's not what you want I guess?

> +}
> +  else
> +{
> +  if (FLOAT_MODE_P (dup_mode))
> +   gcc_assert (get_vector_mode (builder.inner_int_mode (),
> +GET_MODE_NUNITS (dup_mode))
> + .exists (&dup_mode));

Same issue

> +}
> +
> +  machine_mode dup_mask_mode;
> +  gcc_assert (get_mask_mode (dup_mode).exists (&dup_mask_mode));

Same issue


[PATCH] i386: Remove mulv2si emulated sequence for TARGET_SSE2 [PR109797]

2023-05-12 Thread Uros Bizjak via Gcc-patches
Remove mulv2si emulated sequence for TARGET_SSE2 and enable
only native PMULLD instruction for TARGET_SSE4_1.  Ideally, the
vectorization for TARGET_SSE2 should depend on more precise cost
estimation (the PR contains patch for ix86_multiplication_cost),
but even with patched cost function the runtime regression
was not fixed.

PR target/109797

gcc/ChangeLog:

* config/i386/mmx.md (mulv2si3): Remove expander.
(mulv2si3): Rename insn pattern from *mulv2si.

Bootstrapped and regression tested on x86_64-linux-gnu {,-m32}.

Pushed to master.

Uros.
diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index e7ca921dd2b..b2954fff8ae 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -2092,39 +2092,7 @@ (define_insn "*3"
(set_attr "type" "sseadd")
(set_attr "mode" "TI")])
 
-(define_expand "mulv2si3"
-  [(set (match_operand:V2SI 0 "register_operand")
-   (mult:V2SI
- (match_operand:V2SI 1 "register_operand")
- (match_operand:V2SI 2 "register_operand")))]
-  "TARGET_MMX_WITH_SSE"
-{
-  if (!TARGET_SSE4_1)
-{
-  rtx op1 = lowpart_subreg (V4SImode, force_reg (V2SImode, operands[1]),
-   V2SImode);
-  rtx op2 = lowpart_subreg (V4SImode, force_reg (V2SImode, operands[2]),
-   V2SImode);
-
-  rtx tmp1 = gen_reg_rtx (V4SImode);
-  emit_insn (gen_vec_interleave_lowv4si (tmp1, op1, op1));
-  rtx tmp2 = gen_reg_rtx (V4SImode);
-  emit_insn (gen_vec_interleave_lowv4si (tmp2, op2, op2));
-
-  rtx res = gen_reg_rtx (V2DImode);
-  emit_insn (gen_vec_widen_umult_even_v4si (res, tmp1, tmp2));
-
-  rtx op0 = gen_reg_rtx (V4SImode);
-  emit_insn (gen_sse2_pshufd_1 (op0, gen_lowpart (V4SImode, res),
-   const0_rtx, const2_rtx,
-   const0_rtx, const2_rtx));
-
-  emit_move_insn (operands[0], lowpart_subreg (V2SImode, op0, V4SImode));
-  DONE;
-}
-})
-
-(define_insn "*mulv2si3"
+(define_insn "mulv2si3"
   [(set (match_operand:V2SI 0 "register_operand" "=Yr,*x,v")
(mult:V2SI
  (match_operand:V2SI 1 "register_operand" "%0,0,v")


[committed] libstdc++: Remove redundant dependencies on _GLIBCXX_USE_C99_STDINT_TR1

2023-05-12 Thread Jonathan Wakely via Gcc-patches
Tested powerpc64le-linux. Pushed to trunk.

-- >8 --

We never need to use std::make_unsigned in std::char_traits<char16_t>
and std::char_traits<char32_t> because <cstdint> guarantees to provide
the types we need, since r9-2028-g8ba7f29e3dd064.

Similarly, experimental::source_location can just assume uint_least32_t
is defined by <cstdint>.

libstdc++-v3/ChangeLog:

* include/bits/char_traits.h (char_traits<char16_t>): Do not
depend on _GLIBCXX_USE_C99_STDINT_TR1.
(char_traits<char32_t>): Likewise.
* include/experimental/source_location: Likewise.
---
 libstdc++-v3/include/bits/char_traits.h   | 8 ++--
 libstdc++-v3/include/experimental/source_location | 6 --
 2 files changed, 2 insertions(+), 12 deletions(-)

diff --git a/libstdc++-v3/include/bits/char_traits.h 
b/libstdc++-v3/include/bits/char_traits.h
index 68ed827f982..0928137854b 100644
--- a/libstdc++-v3/include/bits/char_traits.h
+++ b/libstdc++-v3/include/bits/char_traits.h
@@ -762,10 +762,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   typedef char16_t  char_type;
 #ifdef __UINT_LEAST16_TYPE__
   typedef __UINT_LEAST16_TYPE__int_type;
-#elif defined _GLIBCXX_USE_C99_STDINT_TR1
-  typedef uint_least16_tint_type;
 #else
-  typedef make_unsigned::type int_type;
+  typedef uint_least16_tint_type;
 #endif
 #if _GLIBCXX_HOSTED
   typedef streamoff off_type;
@@ -891,10 +889,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   typedef char32_t  char_type;
 #ifdef __UINT_LEAST32_TYPE__
   typedef __UINT_LEAST32_TYPE__int_type;
-#elif defined _GLIBCXX_USE_C99_STDINT_TR1
-  typedef uint_least32_tint_type;
 #else
-  typedef make_unsigned::type int_type;
+  typedef uint_least32_tint_type;
 #endif
 #if _GLIBCXX_HOSTED
   typedef streamoff off_type;
diff --git a/libstdc++-v3/include/experimental/source_location 
b/libstdc++-v3/include/experimental/source_location
index 1dfce7343c6..ee94a36cc43 100644
--- a/libstdc++-v3/include/experimental/source_location
+++ b/libstdc++-v3/include/experimental/source_location
@@ -44,12 +44,6 @@ inline namespace fundamentals_v2 {
 
   struct source_location
   {
-#ifndef _GLIBCXX_USE_C99_STDINT_TR1
-  private:
-using uint_least32_t = unsigned;
-  public:
-#endif
-
 // 14.1.2, source_location creation
 static constexpr source_location
 current(const char* __file = __builtin_FILE(),
-- 
2.40.1



[committed] libstdc++: Remove dependency on _GLIBCXX_USE_C99_STDINT_TR1

2023-05-12 Thread Jonathan Wakely via Gcc-patches
Tested powerpc64le-linux. Pushed to trunk.

-- >8 --

Since r9-2028-g8ba7f29e3dd064 we've defined most of <cstdint>
unconditionally, including uint_least32_t. This means that all of
<random> can be defined unconditionally, which means that std::shuffle
and std::ranges::shuffle can be too.

libstdc++-v3/ChangeLog:

* include/bits/algorithmfwd.h (shuffle): Do not depend on
_GLIBCXX_USE_C99_STDINT_TR1.
* include/bits/ranges_algo.h (shuffle): Likewise.
* include/bits/stl_algo.h (shuffle): Likewise.
* include/ext/random: Likewise.
* include/ext/throw_allocator.h (random_condition): Likewise.
* include/std/random: Likewise.
* src/c++11/cow-string-inst.cc: Likewise.
* src/c++11/random.cc: Likewise.
---
 libstdc++-v3/include/bits/algorithmfwd.h   | 2 +-
 libstdc++-v3/include/bits/ranges_algo.h| 2 --
 libstdc++-v3/include/bits/stl_algo.h   | 3 ---
 libstdc++-v3/include/ext/random| 4 ++--
 libstdc++-v3/include/ext/throw_allocator.h | 8 
 libstdc++-v3/include/std/random| 7 +--
 libstdc++-v3/src/c++11/cow-string-inst.cc  | 2 --
 libstdc++-v3/src/c++11/random.cc   | 3 ---
 8 files changed, 4 insertions(+), 27 deletions(-)

diff --git a/libstdc++-v3/include/bits/algorithmfwd.h 
b/libstdc++-v3/include/bits/algorithmfwd.h
index 03e627866ad..0d623901cd2 100644
--- a/libstdc++-v3/include/bits/algorithmfwd.h
+++ b/libstdc++-v3/include/bits/algorithmfwd.h
@@ -622,7 +622,7 @@ _GLIBCXX_END_INLINE_ABI_NAMESPACE(_V2)
   // set_symmetric_difference
   // set_union
 
-#if (__cplusplus >= 201103L) && defined(_GLIBCXX_USE_C99_STDINT_TR1)
+#if __cplusplus >= 201103L
   template
 void
 shuffle(_RAIter, _RAIter, _UGenerator&&);
diff --git a/libstdc++-v3/include/bits/ranges_algo.h 
b/libstdc++-v3/include/bits/ranges_algo.h
index 410d3ae1dd8..da66ff8045d 100644
--- a/libstdc++-v3/include/bits/ranges_algo.h
+++ b/libstdc++-v3/include/bits/ranges_algo.h
@@ -1564,7 +1564,6 @@ namespace ranges
 
   inline constexpr __sample_fn sample{};
 
-#ifdef _GLIBCXX_USE_C99_STDINT_TR1
   struct __shuffle_fn
   {
 template _Sent,
@@ -1591,7 +1590,6 @@ namespace ranges
   };
 
   inline constexpr __shuffle_fn shuffle{};
-#endif
 
   struct __push_heap_fn
   {
diff --git a/libstdc++-v3/include/bits/stl_algo.h 
b/libstdc++-v3/include/bits/stl_algo.h
index 3d37091a9b4..54695490166 100644
--- a/libstdc++-v3/include/bits/stl_algo.h
+++ b/libstdc++-v3/include/bits/stl_algo.h
@@ -3692,7 +3692,6 @@ _GLIBCXX_END_INLINE_ABI_NAMESPACE(_V2)
 #endif // C++17
 #endif // C++14
 
-#ifdef _GLIBCXX_USE_C99_STDINT_TR1
   /**
*  @brief Generate two uniformly distributed integers using a
* single distribution invocation.
@@ -3803,8 +3802,6 @@ _GLIBCXX_END_INLINE_ABI_NAMESPACE(_V2)
   for (_RandomAccessIterator __i = __first + 1; __i != __last; ++__i)
std::iter_swap(__i, __first + __d(__g, __p_type(0, __i - __first)));
 }
-#endif // USE C99_STDINT
-
 #endif // C++11
 
 _GLIBCXX_BEGIN_NAMESPACE_ALGO
diff --git a/libstdc++-v3/include/ext/random b/libstdc++-v3/include/ext/random
index 795c3c5389a..62acb67e05b 100644
--- a/libstdc++-v3/include/ext/random
+++ b/libstdc++-v3/include/ext/random
@@ -45,7 +45,7 @@
 # include 
 #endif
 
-#if defined(_GLIBCXX_USE_C99_STDINT_TR1) && defined(UINT32_C)
+#ifdef UINT32_C
 
 namespace __gnu_cxx _GLIBCXX_VISIBILITY(default)
 {
@@ -3923,7 +3923,7 @@ _GLIBCXX_END_NAMESPACE_VERSION
 #include 
 #include 
 
-#endif // _GLIBCXX_USE_C99_STDINT_TR1 && UINT32_C
+#endif // UINT32_C
 
 #endif // C++11
 
diff --git a/libstdc++-v3/include/ext/throw_allocator.h 
b/libstdc++-v3/include/ext/throw_allocator.h
index 0dbf00176dc..71b7198fa1e 100644
--- a/libstdc++-v3/include/ext/throw_allocator.h
+++ b/libstdc++-v3/include/ext/throw_allocator.h
@@ -495,7 +495,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 }
   };
 
-#ifdef _GLIBCXX_USE_C99_STDINT_TR1
   /**
*  @brief Base class for random probability control and throw.
*/
@@ -613,7 +612,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   return _S_e;
 }
   };
-#endif // _GLIBCXX_USE_C99_STDINT_TR1
 
   /**
*  @brief Class with exception generation control. Intended to be
@@ -769,7 +767,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 #endif
   };
 
-#ifdef _GLIBCXX_USE_C99_STDINT_TR1
   /// Type throwing via random condition.
   struct throw_value_random : public throw_value_base
   {
@@ -800,7 +797,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 operator=(throw_value_random&&) = default;
 #endif
   };
-#endif // _GLIBCXX_USE_C99_STDINT_TR1
 
   /**
*  @brief Allocator class with logging and exception generation control.
@@ -947,7 +943,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 #endif
 };
 
-#ifdef _GLIBCXX_USE_C99_STDINT_TR1
   /// Allocator throwing via random condition.
   template
 struct throw_allocator_random
@@ -973,7 +968,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   operator=(const throw_allocator_random&) = default;
 #endif
 };
-#endif // _GLIBCXX_USE_C9

[committed] libstdc++: Reduce dependency on _GLIBCXX_USE_C99_STDINT_TR1

2023-05-12 Thread Jonathan Wakely via Gcc-patches
Tested powerpc64le-linux. Pushed to trunk.

-- >8 --

Since r9-2028-g8ba7f29e3dd064 we've defined most of <cstdint>
unconditionally, so we can do the same for most of the std::atomic
aliases such as std::atomic_int_least32_t.

The only aliases that need to depend on _GLIBCXX_USE_C99_STDINT_TR1 are
the ones for the integer types that are not guaranteed to be defined,
e.g. std::atomic_int32_t.

libstdc++-v3/ChangeLog:

* include/std/atomic (atomic_int_least8_t, atomic_uint_least8_t)
(atomic_int_least16_t, atomic_uint_least16_t)
(atomic_int_least32_t, atomic_uint_least32_t)
(atomic_int_least64_t, atomic_uint_least64_t)
(atomic_int_fast16_t, atomic_uint_fast16_t)
(atomic_int_fast32_t, atomic_uint_fast32_t)
(atomic_int_fast64_t, atomic_uint_fast64_t)
(atomic_intmax_t, atomic_uintmax_t): Define unconditionally.
* testsuite/29_atomics/headers/stdatomic.h/c_compat.cc: Adjust.
---
 libstdc++-v3/include/std/atomic  | 5 +
 .../testsuite/29_atomics/headers/stdatomic.h/c_compat.cc | 4 +---
 2 files changed, 2 insertions(+), 7 deletions(-)

diff --git a/libstdc++-v3/include/std/atomic b/libstdc++-v3/include/std/atomic
index 96e87ded864..b502027e4a4 100644
--- a/libstdc++-v3/include/std/atomic
+++ b/libstdc++-v3/include/std/atomic
@@ -1130,7 +1130,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 
   /// atomic_uint64_t
   typedef atomic atomic_uint64_t;
-
+#endif
 
   /// atomic_int_least8_t
   typedef atomic atomic_int_least8_t;
@@ -1180,7 +1180,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 
   /// atomic_uint_fast64_t
   typedef atomicatomic_uint_fast64_t;
-#endif
 
 
   /// atomic_intptr_t
@@ -1195,13 +1194,11 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   /// atomic_ptrdiff_t
   typedef atomicatomic_ptrdiff_t;
 
-#ifdef _GLIBCXX_USE_C99_STDINT_TR1
   /// atomic_intmax_t
   typedef atomic atomic_intmax_t;
 
   /// atomic_uintmax_t
   typedef atomicatomic_uintmax_t;
-#endif
 
   // Function definitions, atomic_flag operations.
   inline bool
diff --git a/libstdc++-v3/testsuite/29_atomics/headers/stdatomic.h/c_compat.cc 
b/libstdc++-v3/testsuite/29_atomics/headers/stdatomic.h/c_compat.cc
index edf19960cbb..8dd7054a997 100644
--- a/libstdc++-v3/testsuite/29_atomics/headers/stdatomic.h/c_compat.cc
+++ b/libstdc++-v3/testsuite/29_atomics/headers/stdatomic.h/c_compat.cc
@@ -79,6 +79,7 @@ static_assert(is_same);
 static_assert(is_same);
 static_assert(is_same);
 static_assert(is_same);
+#endif
 static_assert(is_same);
 static_assert(is_same);
 static_assert(is_same);
@@ -95,13 +96,10 @@ static_assert(is_same);
 static_assert(is_same);
 static_assert(is_same);
 static_assert(is_same);
-#endif
 static_assert(is_same);
 static_assert(is_same);
-#ifdef _GLIBCXX_USE_C99_STDINT_TR1
 static_assert(is_same);
 static_assert(is_same);
-#endif
 #include 
 static_assert(is_same);
 static_assert(is_same);
-- 
2.40.1



  1   2   >