Re: [PATCH] arc: Add --with-fpu support for ARCv2 cpus

2021-06-08 Thread Claudiu Zissulescu via Gcc-patches

Thank you for your input.

I have made an update using grep's ERE. Please let me know if it is ok.

//Claudiu
>From 3f598e0fc9bc88c3f40f3e381c2955ab36e77ce0 Mon Sep 17 00:00:00 2001
From: Claudiu Zissulescu 
Date: Wed, 21 Oct 2020 16:11:43 +0300
Subject: [PATCH] arc: Add --with-fpu support for ARCv2 cpus

Support for a compile-time default FPU. The --with-fpu configuration
option is ignored if -mfpu compiler option is specified. The FPU
options are only available for ARCv2 cpus.

gcc/
yyyy-mm-dd  Claudiu Zissulescu  

	* config.gcc (arc): Add support for with_fpu option.
	* config/arc/arc.h (OPTION_DEFAULT_SPECS): Add fpu.

Signed-off-by: Claudiu Zissulescu 
---
 gcc/config.gcc   | 58 +---
 gcc/config/arc/arc.h |  4 +++
 2 files changed, 59 insertions(+), 3 deletions(-)

diff --git a/gcc/config.gcc b/gcc/config.gcc
index 610422fb29ee..d4445e98e0c9 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -4258,18 +4258,70 @@ case "${target}" in
 		;;
 
 	arc*-*-*)
-		supported_defaults="cpu"
+		supported_defaults="cpu fpu"
 
+		new_cpu=hs38_linux
 		if [ x"$with_cpu" = x ] \
-		|| grep "^ARC_CPU ($with_cpu," \
+		|| grep -E "^ARC_CPU \($with_cpu," \
 		   ${srcdir}/config/arc/arc-cpus.def \
 		   > /dev/null; then
 		 # Ok
-		 true
+		 new_cpu=$with_cpu
 		else
 		 echo "Unknown cpu used in --with-cpu=$with_cpu" 1>&2
 		 exit 1
 		fi
+
+		# see if --with-fpu matches any of the supported FPUs
+		case "$with_fpu" in
+		"")
+			# OK
+			;;
+		fpus | fpus_div | fpus_fma | fpus_all)
+			# OK if em or hs
+			if grep -E "^ARC_CPU \($new_cpu,[[:space:]]+[emhs]+," \
+			   ${srcdir}/config/arc/arc-cpus.def \
+			   > /dev/null; then
+			   # OK
+			   true
+			else
+			 echo "Unknown floating point type used in "\
+			 "--with-fpu=$with_fpu for cpu $new_cpu" 1>&2
+			 exit 1
+			fi
+		;;
+		fpuda | fpuda_div | fpuda_fma | fpuda_all)
+			# OK only em
+			if grep -E "^ARC_CPU \($new_cpu,[[:space:]]+em," \
+			   ${srcdir}/config/arc/arc-cpus.def \
+			   > /dev/null; then
+			   # OK
+			   true
+			else
+			 echo "Unknown floating point type used in "\
+			  "--with-fpu=$with_fpu for cpu $new_cpu" 1>&2
+			 exit 1
+			fi
+			;;
+		fpud | fpud_div | fpud_fma | fpud_all)
+			# OK only hs
+			if grep -E "^ARC_CPU \($new_cpu,[[:space:]]+hs," \
+			   ${srcdir}/config/arc/arc-cpus.def \
+			   > /dev/null; then
+			   # OK
+			   true
+			else
+			 echo "Unknown floating point type used in"\
+			  "--with-fpu=$with_fpu for cpu $new_cpu" 1>&2
+			 exit 1
+			fi
+			;;
+		*)
+			echo "Unknown floating point type used in "\
+			 "--with-fpu=$with_fpu" 1>&2
+			exit 1
+			;;
+		esac
 		;;
 
 csky-*-*)
diff --git a/gcc/config/arc/arc.h b/gcc/config/arc/arc.h
index 722bb10b8813..b9c4ba0398e5 100644
--- a/gcc/config/arc/arc.h
+++ b/gcc/config/arc/arc.h
@@ -100,7 +100,11 @@ extern const char *arc_cpu_to_as (int argc, const char **argv);
   "%:cpu_to_as(%{mcpu=*:%*}) %{mspfp*} %{mdpfp*} "  \
   "%{mfpu=fpuda*:-mfpuda} %{mcode-density}"
 
+/* Support for a compile-time default CPU and FPU.  The rules are:
+   --with-cpu is ignored if -mcpu, mARC*, marc*, mA7, mA6 are specified.
+   --with-fpu is ignored if -mfpu is specified.  */
 #define OPTION_DEFAULT_SPECS		\
+  {"fpu", "%{!mfpu=*:-mfpu=%(VALUE)}"},	\
   {"cpu", "%{!mcpu=*:%{!mARC*:%{!marc*:%{!mA7:%{!mA6:-mcpu=%(VALUE)}" }
 
 #ifndef DRIVER_ENDIAN_SELF_SPECS
-- 
2.31.1



Re: [PATCH] rtl: Join the insn and split conditions in define_insn_and_split

2021-06-08 Thread Richard Biener via Gcc-patches
On Tue, Jun 8, 2021 at 12:05 AM Segher Boessenkool
 wrote:
>
> In theory we could have a split condition not inclusive of the insn
> condition in the past.  That never was a good idea, the code does not do
> what a non-suspicious reader would think it does.  But it leads to more
> serious problems together with iterators: if the split condition (as
> written) does not start with "&&", you do not get the insn condition
> included in the split condition, and that holds for the part of the insn
> condition that was generated by the iterator as well!
>
> This patch simply always joins the two conditions (after the iterators
> have done their work) to get the effective split condition.
>
> I tested this on all Linux targets, building the Linux kernel for each,
> and it does not change generated code for any of them, so I think we do
> not have much breakage to fear.  But it is possible for other targets of
> course, and for floating point or vector code, etc.
>
> Is this okay for trunk?

Even if it looks uglier I would prefer to enforce a leading "&& " on the
split condition.  That keeps the semantics of the define_insn_and_split
the same on trunk and branches and thus makes maintaining things easier.
I suppose once branches without such enforcement go out of
maintenance we can mass-strip the "&& "s.

I guess a mass-change to add "&& "s at this point is smaller than
a corresponding change to drop them (IMHO leaving both after this
change would be confusing).

Richard.

>
> Segher
>
>
> 2021-06-07  Segher Boessenkool  
>
> * gensupport.c (process_rtx) [DEFINE_INSN_AND_SPLIT]: Always include
> the insn condition in the split condition.
>
> ---
>  gcc/gensupport.c | 25 -
>  1 file changed, 16 insertions(+), 9 deletions(-)
>
> diff --git a/gcc/gensupport.c b/gcc/gensupport.c
> index 2cb760ffb90f..8a6345d36470 100644
> --- a/gcc/gensupport.c
> +++ b/gcc/gensupport.c
> @@ -590,7 +590,6 @@ process_rtx (rtx desc, file_location loc)
>  case DEFINE_INSN_AND_SPLIT:
>  case DEFINE_INSN_AND_REWRITE:
>{
> -   const char *split_cond;
> rtx split;
> rtvec attr;
> int i;
> @@ -609,17 +608,25 @@ process_rtx (rtx desc, file_location loc)
> remove_constraints (XVECEXP (split, 0, i));
>   }
>
> -   /* If the split condition starts with "&&", append it to the
> -  insn condition to create the new split condition.  */
> -   split_cond = XSTR (desc, 4);
> -   if (split_cond[0] == '&' && split_cond[1] == '&')
> +   const char *insn_cond = XSTR (desc, 2);
> +   const char *split_cond = XSTR (desc, 4);
> +   if (strncmp (split_cond, "&&", 2)
> +   && GET_CODE (desc) == DEFINE_INSN_AND_REWRITE)
> + error_at (loc, "the rewrite condition must start with `&&'");
> +
> +   /* If the split condition starts with "&&", skip that.  */
> +   if (!strncmp (split_cond, "&&", 2))
>   {
> rtx_reader_ptr->copy_md_ptr_loc (split_cond + 2, split_cond);
> -   split_cond = rtx_reader_ptr->join_c_conditions (XSTR (desc, 2),
> -   split_cond + 2);
> +   split_cond += 2;
>   }
> -   else if (GET_CODE (desc) == DEFINE_INSN_AND_REWRITE)
> - error_at (loc, "the rewrite condition must start with `&&'");
> +
> +   /* Always use the conjunction of the given split condition and the
> +  insn condition (which includes stuff from iterators, it is not just
> +  what is given in the pattern in the machine description) as the
> +  split condition to use.  */
> +   split_cond = rtx_reader_ptr->join_c_conditions (insn_cond, 
> split_cond);
> +
> XSTR (split, 1) = split_cond;
> if (GET_CODE (desc) == DEFINE_INSN_AND_REWRITE)
>   XVEC (split, 2) = gen_rewrite_sequence (XVEC (desc, 1));
> --
> 1.8.3.1
>


Re: [RFC/PATCH 00/11] Fix up some unexpected empty split conditions

2021-06-08 Thread Richard Biener via Gcc-patches
On Tue, Jun 8, 2021 at 4:10 AM Kewen.Lin via Gcc-patches
 wrote:
>
> Hi Segher,
>
> on 2021/6/8 上午7:50, Segher Boessenkool wrote:
> > Hi!
> >
> > On Fri, Jun 04, 2021 at 10:57:51AM +0800, Kewen.Lin via Gcc-patches wrote:
> >> To find out those need fixing seems to be the critical part.  It's
> >> not hard to add one explicit "&&" to those that don't have it now, but
> >> even with further bootstrapped and regression tested I'm still not
> >> confident the adjustments are safe enough, since the testing coverage
> >> could be limited.  It may need more efforts to revisit, or/and test
> >> with more coverages, and port maintainers' reviews.
> >
> > https://gcc.gnu.org/pipermail/gcc-patches/2021-June/572120.html
> >
> > This adds an "&&" everywhere (or in fact, it just skips any existing
> > one, it just has the same effect of adding it everywhere).  I tested it
> > with building gcc and Linux for all supported targets (31 of them; I do
> > some with multiple configs, mostly 32-bit and 64-bit).  None had any
> > difference before and after the change.
> >
> > So I am no longer worried that there will be any fallout from doing
> > this.  There are many things that *could* go wrong, but I don't think
> > there will be enough at all to be an impediment to just throwing the
> > switch.
> >
> > If we go this way no target will need any significant fixing, maybe none
> > at all will be needed across all targets.  And no changes will be needed
> > anywhere immediately.  We could make leading "&&" deprecated, and the
> > same for split condition "1" (which was "&& 1").  This is easy to change
> > automatically as well.
> >
> Thanks very much for doing this!
>
> I guess we are not going to backport this?  If we won't, it seems to need
> some way to ensure the implied "&&" will show up explicitly when backporting
> some define_insn_and_split.

For this reason I'd prefer the explicit "&& "; Segher's testing means
mass-changing all define_insn_and_split patterns is reasonable.

Richard.

> BR,
> Kewen


Re: [PATCH] define auto_vec copy ctor and assignment (PR 90904)

2021-06-08 Thread Richard Biener via Gcc-patches
On Tue, Jun 8, 2021 at 5:26 AM Trevor Saunders  wrote:
>
> On Mon, Jun 07, 2021 at 02:34:26PM -0600, Martin Sebor wrote:
> > On 6/7/21 2:51 AM, Richard Biener wrote:
> > > On Thu, Jun 3, 2021 at 10:29 AM Trevor Saunders  
> > > wrote:
> > > >
> > > > On Wed, Jun 02, 2021 at 10:04:03AM -0600, Martin Sebor via Gcc-patches 
> > > > wrote:
> > > > > On 6/2/21 12:55 AM, Richard Biener wrote:
> > > > > > On Tue, Jun 1, 2021 at 9:56 PM Martin Sebor  
> > > > > > wrote:
> > > > > > >
> > > > > > > On 5/27/21 2:53 PM, Jason Merrill wrote:
> > > > > > > > On 4/27/21 11:52 AM, Martin Sebor via Gcc-patches wrote:
> > > > > > > > > On 4/27/21 8:04 AM, Richard Biener wrote:
> > > > > > > > > > On Tue, Apr 27, 2021 at 3:59 PM Martin Sebor 
> > > > > > > > > >  wrote:
> > > > > > > > > > >
> > > > > > > > > > > On 4/27/21 1:58 AM, Richard Biener wrote:
> > > > > > > > > > > > On Tue, Apr 27, 2021 at 2:46 AM Martin Sebor via 
> > > > > > > > > > > > Gcc-patches
> > > > > > > > > > > >  wrote:
> > > > > > > > > > > > >
> > > > > > > > > > > > > PR 90904 notes that auto_vec is unsafe to copy and 
> > > > > > > > > > > > > assign because
> > > > > > > > > > > > > the class manages its own memory but doesn't define 
> > > > > > > > > > > > > (or delete)
> > > > > > > > > > > > > either special function.  Since I first ran into the 
> > > > > > > > > > > > > problem,
> > > > > > > > > > > > > auto_vec has grown a move ctor and move assignment 
> > > > > > > > > > > > > from
> > > > > > > > > > > > > a dynamically-allocated vec but still no copy ctor or 
> > > > > > > > > > > > > copy
> > > > > > > > > > > > > assignment operator.
> > > > > > > > > > > > >
> > > > > > > > > > > > > The attached patch adds the two special functions to 
> > > > > > > > > > > > > auto_vec along
> > > > > > > > > > > > > with a few simple tests.  It makes auto_vec safe to 
> > > > > > > > > > > > > use in containers
> > > > > > > > > > > > > that expect copyable and assignable element types and 
> > > > > > > > > > > > > passes
> > > > > > > > > > > > > bootstrap
> > > > > > > > > > > > > and regression testing on x86_64-linux.
> > > > > > > > > > > >
> > > > > > > > > > > > The question is whether we want such uses to appear 
> > > > > > > > > > > > since those
> > > > > > > > > > > > can be quite inefficient?  Thus the option is to delete 
> > > > > > > > > > > > those
> > > > > > > > > > > > operators?
> > > > > > > > > > >
> > > > > > > > > > > I would strongly prefer the generic vector class to have 
> > > > > > > > > > > the properties
> > > > > > > > > > > expected of any other generic container: copyable and 
> > > > > > > > > > > assignable.  If
> > > > > > > > > > > we also want another vector type with this restriction I 
> > > > > > > > > > > suggest to add
> > > > > > > > > > > another "noncopyable" type and make that property 
> > > > > > > > > > > explicit in its name.
> > > > > > > > > > > I can submit one in a followup patch if you think we need 
> > > > > > > > > > > one.
> > > > > > > > > >
> > > > > > > > > > I'm not sure (and not strictly against the copy and 
> > > > > > > > > > assign).  Looking
> > > > > > > > > > around
> > > > > > > > > > I see that vec<> does not do deep copying.  Making 
> > > > > > > > > > auto_vec<> do it
> > > > > > > > > > might be surprising (I added the move capability to match 
> > > > > > > > > > how vec<>
> > > > > > > > > > is used - as "reference" to a vector)
> > > > > > > > >
> > > > > > > > > The vec base classes are special: they have no ctors at all 
> > > > > > > > > (because
> > > > > > > > > of their use in unions).  That's something we might have to 
> > > > > > > > > live with
> > > > > > > > > but it's not a model to follow in ordinary containers.
> > > > > > > >
> > > > > > > > I don't think we have to live with it anymore, now that we're 
> > > > > > > > writing
> > > > > > > > C++11.
> > > > > > > >
> > > > > > > > > The auto_vec class was introduced to fill the need for a 
> > > > > > > > > conventional
> > > > > > > > > sequence container with a ctor and dtor.  The missing copy 
> > > > > > > > > ctor and
> > > > > > > > > assignment operators were an oversight, not a deliberate 
> > > > > > > > > feature.
> > > > > > > > > This change fixes that oversight.
> > > >
> > > > I've been away a while, but trying to get back into this, sorry.  It was
> > > > definitely an oversight to leave these undefined for the compiler to
> > > > provide a default definition of, but I agree with Richi, the better
> > > > thing to have done, or do now would be to mark them as deleted and make
> > > > > auto_vec move only (with copy() for when you really need a deep copy).
> > > > > > > > >
> > > > > > > > > The revised patch also adds a copy ctor/assignment to the 
> > > > > > > > > auto_vec
> > > > > > > > > primary template (that's also missing it).  In addition, it 
> > > > > > > > > adds
> > > > > > > > > a new class called auto_vec_ncopy that disables copying and
> > > > > > > > > assignment as y

Re: [PATCH] Implement a context aware points-to analyzer for use in evrp.

2021-06-08 Thread Richard Biener via Gcc-patches
On Mon, Jun 7, 2021 at 9:20 PM Andrew MacLeod  wrote:
>
> On 6/7/21 9:30 AM, Richard Biener via Gcc-patches wrote:
> > On Mon, Jun 7, 2021 at 12:10 PM Aldy Hernandez via Gcc-patches
> >  wrote:
> >> The substitute_and_fold_engine which evrp uses is expecting symbolics
> >> from value_of_expr / value_on_edge / etc, which ranger does not provide.
> >> In some cases, these provide important folding cues, as in the case of
> >> aliases for pointers.  For example, legacy evrp may return [&foo, &foo]
> >> for the value of "bar" where bar is on an edge where bar == &foo, or
> >> when bar has been globally set to &foo.  This information is then used
> >> by the subst & fold engine to propagate the known value of bar.
> >>
> >> Currently this is a major source of discrepancies between evrp and
> >> ranger.  Of the 284 cases legacy evrp is getting over ranger, 237 are
> >> for pointer equality as discussed above.
> >>
> >> This patch implements a context aware points-to class which
> >> ranger-evrp can use to query what a pointer is currently pointing to.
> >> With it, we reduce the 284 cases legacy evrp is getting to 47.
> >>
> >> The API for the points-to analyzer is the following:
> >>
> >> class points_to_analyzer
> >> {
> >> public:
> >>points_to_analyzer (gimple_ranger *r);
> >>~points_to_analyzer ();
> >>void enter (basic_block);
> >>void leave (basic_block);
> >>void visit_stmt (gimple *stmt);
> >>tree get_points_to (tree name) const;
> >> ...
> >> };
> >>
> >> The enter(), leave(), and visit_stmt() methods are meant to be called
> >> from a DOM walk.   At any point throughout the walk, one can call
> >> get_points_to() to get whatever an SSA is pointing to.
> >>
> >> If this class is useful to others, we could place it in a more generic
> >> location.
> >>
> >> Tested on x86-64 Linux with a regular bootstrap/tests and by comparing
> >> EVRP folds over ranger before and after this patch.
> > Hmm, but why call it "points-to" - when I look at the implementation
> > it's really about equivalences.  Thus,
> >
> >   if (var1_2 == var2_3)
> >
> > could be handled the same way.  Also "points-to" implies (to me)
> > that &p[1] and &p[2] point to the same object but your points-to
> > is clearly tracking equivalences only.
> >
> > So maybe at least rename it to pointer_equiv_analyzer?  ISTR
> > propagating random (symbolic) equivalences has issues.
>
> Yeah, pointer_equiv is probably more accurate. This is purely for cases
> where we know a pointer points to something that isn't an ssa_name.
> Eventually this is likely to be subsumed into a pointer_range object,
> but unlikely in this release.
>
> I don't think this is actually doing the propagation though... It tracks
> that a_2 currently points to &foo.. and returns that to either
> simplifier or folder thru value_of_expr().  Presumably it is up to them
> to determine whether the tree expression passed back is safe to
> propagate.   Is there any attempt in EVRP to NOT set the range of
> something to [&foo, &foo] under some conditions?   This is what the
> change amounts to.  Ranger would just return a range of [1, +INF], and
> value_of_expr  would therefore return NULL.  This allows value_of to
> return &foo in these conditions.   Aldy, did you see any other checks in
> the vr-values code?
>
> Things like   if (var1_2 == var2_3) deal with just ssa-names and will be
> handled by an ssa_name relation oracle. It just treats equivalencies
> like a slightly special kind of relation. I'm just about to bring that
> forward this week.

Ah, great - I'm looking forward to this.  Currently both DOM and VN
do a very simplistic thing when trying to simplify downstream conditions
based on earlier ones, abusing their known-expressions hash tables
by, for example, registering (a < b) == 1, (a > b) == 0, (a == b) == 0,
(a != b) == 1 for an earlier a < b condition on the true edge.  So I wonder
if this relation code can be somehow used there.  In VN there's the
extra complication that it iterates, but DOM is just a DOM-walk and
the VN code also has a non-iterating mode (but not a DOM walk).

Of course the code is also used to simplify

 if (a > b)
c = a != b;

but the relation oracle should be able to handle that as well I guess.

Richard.

>
> Andrew
>
>


Re: [PATCH][version 3]add -ftrivial-auto-var-init and variable attribute "uninitialized" to gcc

2021-06-08 Thread Richard Biener
On Mon, 7 Jun 2021, Qing Zhao wrote:

> (Kees, can you answer one of Richard’s question below? On the reason to 
> initialize padding of structures)
> 
> Richard,
> 
> 
> On Jun 7, 2021, at 2:48 AM, Richard Biener 
> mailto:rguent...@suse.de>> wrote:
> 
> Meh - can you try using a mailer that does proper quoting?  It's difficult
> to spot your added comments.  Will try anyway (and sorry for the delay)
> 
> Only the email replied to gcc-patch alias had this issue, all the other 
> emails I sent are fine. Not sure why?

All your mails have this problem for me, it makes it quite difficult to
follow the conversation.

> Both clang and my patch add initialization to the above auto variable “line”.
> 
> So, I have the following questions need help:
> 
> 1. Do we need to exclude C++ class with ctor from auto initialization?
> 
> 2. I see Clang use call to internal memset to initialize such class, but for 
> my patch, I only initialize the data fields inside this class.
>Which one is better?
> 
> I can't answer either question, but generally using block-initialization
> (for example via memset, but we'd generally prefer X = {}) is better for
> later optimization.
> 
> Okay. So, is this the same reason as lowering the call to .DEFERRED_INIT 
> through expand_builtin_memset rather than expand_assign?

Yes, more efficient code generated and more efficient code generation.

> seeing this, can you explain why using .DEFERRED_INIT does not
> work for VLAs?
> 
> The major reason for going different routes for VLAs vs. no-VLAs is:
> 
> In the original gimplification phase, VLAs and no-VLAs go different routes.
> I just followed the different routes for them:
> 
> In “gimplify_decl_expr”, VLA goes to “gimplify_vla_decl”, and is expanded to
> call to alloca.  Naturally, I add calls to “memset/memcpy” in 
> “gimplify_vla_decl” to
> Initialize it.
> 
> On the other hand, no-VLAs are handled differently in “gimplify_decl_expr”, so
> I added calls to “.DEFFERED_INIT” to initialize them.
> 
> What’s the major issue if I add calls to “memset/memcpy” in 
> “gimplify_vla_decl” to
> Initialize VLAs?
> 
> Just inconsistency and unexpected different behavior with respect to
> uninitialized warnings?
> 
> Okay.
> Will try to initialize VLA through the call to .DEFFERED_INIT too. And see 
> whether there is any issue with it.

Thanks.

> 
> @@ -5001,6 +5185,17 @@ gimplify_init_constructor (tree *expr_p, gimple_seq
> *pre_p, gimple_seq *post_p,
> /* If a single access to the target must be ensured and all
> elements
>are zero, then it's optimal to clear whatever their number.
> */
> cleared = true;
> +   else if (flag_trivial_auto_var_init > AUTO_INIT_UNINITIALIZED
> +&& !TREE_STATIC (object)
> +&& type_has_padding (type))
> + /* If the user requests to initialize automatic variables with
> +paddings inside the type, we should initialize the paddings
> too.
> +C guarantees that brace-init with fewer initializers than
> members
> +aggregate will initialize the rest of the aggregate as-if it
> were
> +static initialization.  In turn static initialization
> guarantees
> +that pad is initialized to zero bits.
> +So, it's better to clear the whole record under such
> situation.  */
> + cleared = true;
> 
> so here we have padding as well - I think this warrants to be controlled
> by an extra option?  And we can maybe split this out to a separate
> patch? (the whole padding stuff)
> 
> Clang does the padding initialization with this option, shall we be
> consistent with Clang?
> 
> Just for the sake of consistency?  No.  Is there a technical reason
> for this complication?  Say we have
> 
>  struct { short s; int i; } a;
> 
> what's the technical reason to initialize the padding?  I might
> be tempted to use -ftrivial-auto-init but I definitely don't
> want to spend cycles/instructions initializing the padding in the
> above struct.
> 
> Kees, could you please answer this question? What’s the major reason to 
> initialize padding
> of structures from the security point of view?
> 
> 
> At this point I also wonder whether doing the actual initialization
> by block-initializing the current function frame at allocation
> time.
> 
> Which phase is for “allocation time”, please point me to the specific phase 
> and source file.

I actually don't know exactly but it would be the function prologue
stack adjustment (that would also cover spill slots if they are
accumulated), maybe config/i386/i386.c:pro_epilogue_adjust_stack.

Maybe it can be hooked into the -fstack-clash-protection code as well
by changing the probe sequence to a zeroing/patterning sequence.

As said, it was just a thought - zeroing/patterning of auto vars
with not needing to respect object boundaries can be more efficient
for example on x86 it could be just a single rep movq;

Richard.


Re: [PATCH][version 3]add -ftrivial-auto-var-init and variable attribute "uninitialized" to gcc

2021-06-08 Thread Richard Biener
On Mon, 7 Jun 2021, Qing Zhao wrote:

> Hi, 
> 
> > On Jun 7, 2021, at 2:53 AM, Richard Biener  wrote:
> > 
> >> 
> >> To address the above suggestion:
> >> 
> >> My study shows: the call to __builtin_clear_padding is expanded during 
> >> gimplification phase.
> >> And there is no __builtin_clear_padding expanding during rtx expanding 
> >> phase.
> >> However, for -ftrivial-auto-var-init, padding initialization should be 
> >> done both in gimplification phase and rtx expanding phase.
> >> since the __builtin_clear_padding might not be good for rtx expanding, 
> >> reusing __builtin_clear_padding might not work.
> >> 
> >> Let me know if you have any more comments on this.
> > 
> > Yes, I didn't suggest to literally emit calls to __builtin_clear_padding 
> > but instead to leverage the lowering code, more specifically share the
> > code that figures _what_ is to be initialized (where the padding is)
> > and eventually the actual code generation pieces.  That might need some
> > refactoring but the code where padding resides should be present only
> > a single time (since it's quite complex).
> 
> Okay, I see your point here.
> 
> > 
> > Which is also why I suggested to split out the padding initialization
> > bits to a separate patch (and option).
> 
> Personally, I am okay with splitting padding initialization from this current 
> patch,
> Kees, what’s your opinion on this? i.e, the current -ftrivial-auto-var-init 
> will NOT initialize padding, we will add another option to 
> Explicitly initialize padding.

It would also be possible to have -fauto-var-init, -fauto-var-init-padding
and have -ftrivial-auto-var-init for clang compatibility enabling
both.  Or -fauto-var-init={zero,pattern,padding} and allow
-fauto-var-init=pattern,padding to be specified.  Note there's also
padding between auto variables on the stack - that "trailing"
padding isn't initialized either?  (yes, GCC sorts variables to minimize
that padding)  For example for

void foo()
{
  char a[3];
  bar (a);
}

there's 12 bytes padding after 'a', shouldn't we initialize that?  If not,
why's other padding important to be initialized?

Richard.


[PATCH][pushed] Fix "tailing" typo.

2021-06-08 Thread Martin Liška

Pushed as obvious typo fix.

Martin

gcc/fortran/ChangeLog:

* intrinsic.texi: Fix typo.
* trans-expr.c (gfc_trans_pointer_assignment): Likewise.

gcc/ChangeLog:

* genautomata.c (create_automata): Fix typo.

libgfortran/ChangeLog:

* intrinsics/chmod.c (chmod_internal): Fix typo.
* io/transfer.c (read_sf): Likewise.

libquadmath/ChangeLog:

* libquadmath.texi: Fix typo.

gcc/testsuite/ChangeLog:

* gcc.dg/format/strfmon-1.c: Fix typo.
* gfortran.dg/char4-subscript.f90: Likewise.
---
 gcc/fortran/intrinsic.texi| 2 +-
 gcc/fortran/trans-expr.c  | 2 +-
 gcc/genautomata.c | 2 +-
 gcc/testsuite/gcc.dg/format/strfmon-1.c   | 2 +-
 gcc/testsuite/gfortran.dg/char4-subscript.f90 | 2 +-
 libgfortran/intrinsics/chmod.c| 2 +-
 libgfortran/io/transfer.c | 2 +-
 libquadmath/libquadmath.texi  | 2 +-
 8 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/gcc/fortran/intrinsic.texi b/gcc/fortran/intrinsic.texi
index c9049b539d5..260dbaae76b 100644
--- a/gcc/fortran/intrinsic.texi
+++ b/gcc/fortran/intrinsic.texi
@@ -502,7 +502,7 @@ Inquiry function
 @item @emph{Arguments}:
 @multitable @columnfractions .15 .70
 @item @var{NAME} @tab Scalar @code{CHARACTER} of default kind with the
-file name. Tailing blank are ignored unless the character @code{achar(0)}
+file name. Trailing blank are ignored unless the character @code{achar(0)}
 is present, then all characters up to and excluding @code{achar(0)} are
 used as file name.
 @item @var{MODE} @tab Scalar @code{CHARACTER} of default kind with the
diff --git a/gcc/fortran/trans-expr.c b/gcc/fortran/trans-expr.c
index e3bc8863f1b..de406ad2e8f 100644
--- a/gcc/fortran/trans-expr.c
+++ b/gcc/fortran/trans-expr.c
@@ -9513,7 +9513,7 @@ gfc_trans_pointer_assignment (gfc_expr * expr1, gfc_expr 
* expr2)
 {
   gfc_add_data_component (expr2);
   /* The following is required as gfc_add_data_component doesn't
-update ts.type if there is a tailing REF_ARRAY.  */
+update ts.type if there is a trailing REF_ARRAY.  */
   expr2->ts.type = BT_DERIVED;
 }
 
diff --git a/gcc/genautomata.c b/gcc/genautomata.c

index e6295e60757..6bbfc684afa 100644
--- a/gcc/genautomata.c
+++ b/gcc/genautomata.c
@@ -6787,7 +6787,7 @@ create_automata (void)
finish_regexp_representation calls.  */
 
 /* This recursive function forms string representation of regexp

-   (without tailing '\0').  */
+   (without trailing '\0').  */
 static void
 form_regexp (regexp_t regexp)
 {
diff --git a/gcc/testsuite/gcc.dg/format/strfmon-1.c 
b/gcc/testsuite/gcc.dg/format/strfmon-1.c
index 934242a238c..a790db53c9d 100644
--- a/gcc/testsuite/gcc.dg/format/strfmon-1.c
+++ b/gcc/testsuite/gcc.dg/format/strfmon-1.c
@@ -57,7 +57,7 @@ foo (char *s, size_t m, double d, long double ld)
   strfmon (s, m, "%n%n", d); /* { dg-warning "matching" "too few args" } */
   strfmon (s, m, ""); /* { dg-warning "zero-length" "empty" } */
   strfmon (s, m, NULL); /* { dg-warning "null" "null format string" } */
-  strfmon (s, m, "%"); /* { dg-warning "trailing" "tailing %" } */
+  strfmon (s, m, "%"); /* { dg-warning "trailing" "trailing %" } */
   strfmon (s, m, "%n\0", d); /* { dg-warning "embedded" "embedded NUL" } */
   strfmon (s, m, "%^^n", d); /* { dg-warning "repeated" "repeated flag" } */
 }
diff --git a/gcc/testsuite/gfortran.dg/char4-subscript.f90 
b/gcc/testsuite/gfortran.dg/char4-subscript.f90
index fd1cf69754e..b4e2d11d6c5 100644
--- a/gcc/testsuite/gfortran.dg/char4-subscript.f90
+++ b/gcc/testsuite/gfortran.dg/char4-subscript.f90
@@ -22,7 +22,7 @@ if (ichar(var%str2(5:5)) /= int(Z'1F608')) stop 2
 deallocate(var%str2)
 end
 
-! Note: the last '\x00' is regarded as string terminator, hence, the tailing \0 byte is not in the dump

+! Note: the last '\x00' is regarded as string terminator, hence, the trailing 
\0 byte is not in the dump
 
 ! { dg-final { scan-tree-dump {  \(\*var\.str2\)\[1\]{lb: 1 sz: 4} = "(d\\x00\\x00|\\x00\\x00\\x00d)"\[1\]{lb: 1 sz: 4};} "original" } }

 ! { dg-final { scan-tree-dump {  __builtin_memmove \(\(void \*\) &\(\*var.str2\)\[2\]{lb: 1 sz: 4}, 
\(void \*\) &"(e\\x00\\x00\\x00f\\x00\\x00|\\x00\\x00\\x00e\\x00\\x00\\x00f)"\[1\]{lb: 1 sz: 
4}, 8\);} "original" } }
diff --git a/libgfortran/intrinsics/chmod.c b/libgfortran/intrinsics/chmod.c
index d0371ce560f..f4057cb7d06 100644
--- a/libgfortran/intrinsics/chmod.c
+++ b/libgfortran/intrinsics/chmod.c
@@ -271,7 +271,7 @@ chmod_internal (char *file, char *mode, gfc_charlen_type 
mode_len)
  part = 3;
  break;
 
-	/* Tailing blanks are valid in Fortran.  */

+   /* Trailing blanks are valid in Fortran.  */
case ' ':
  for (i++; i < mode_len; i++)
if (mode[i] != ' ')
diff --git a/libgfortran/io/transfer.c b/libgfortran/io/transfer.c
index 36e35b48cd3..e44b2df6

Re: RFC: Sphinx for GCC documentation

2021-06-08 Thread Martin Liška

On 6/7/21 11:26 PM, Bernhard Reutner-Fischer wrote:

On Mon, 7 Jun 2021 15:30:22 +0200
Martin Liška  wrote:


Anyway, this is resolved as I use more appropriate directive:
https://splichal.eu/scripts/sphinx/gfortran/_build/html/intrinsic-procedures/access-checks-file-access-modes.html


ISTM there's a typo s/Tailing/Trailing/ in gcc/fortran/intrinsic.texi


Yes, it is :)



git grep -wi Tailing
seems to highlight a couple more.
Maybe you have time to fix these?

PS: The occurrence in gcc/testsuite/gcc.dg/format/strfmon-1.c sounds
odd.
TIA,



Fixed that with a commit I've just pushed.

Martin



[PATCH] docs: document evrp-sparse-threshold param

2021-06-08 Thread Martin Liška

Pushed as obvious.

Martin

gcc/ChangeLog:

* doc/invoke.texi: Document new param evrp-sparse-threshold.
---
 gcc/doc/invoke.texi | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 04048cd8332..6063e466c13 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -14207,6 +14207,9 @@ we may be able to devirtualize speculatively.
 The maximum number of assertions to add along the default edge of a switch
 statement during VRP.
 
+@item evrp-sparse-threshold

+Maximum number of basic blocks before EVRP uses a sparse cache.
+
 @item evrp-mode
 Specifies the mode Early VRP should operate in.
 
--

2.31.1



[committed] testsuite: Add -Wno-psabi -w to pr100887.c test [PR100943]

2021-06-08 Thread Jakub Jelinek via Gcc-patches
Hi!

On x86 the test is using -mavx512f and so never reports the various
-Wpsabi notes/warnings, but on other targets it can.

Committed to trunk as obvious.

2021-06-08  Jakub Jelinek  

PR target/100887
PR testsuite/100943
* gcc.dg/pr100887.c: Add -Wno-psabi -w to dg-options.

--- gcc/testsuite/gcc.dg/pr100887.c.jj  2021-06-07 09:28:21.990747548 +0200
+++ gcc/testsuite/gcc.dg/pr100887.c 2021-06-07 17:46:14.413145889 +0200
@@ -1,6 +1,6 @@
 /* PR target/100887 */
 /* { dg-do compile } */
-/* { dg-options "" } */
+/* { dg-options "-Wno-psabi -w" } */
 /* { dg-additional-options "-mavx512f" { target { i?86-*-* x86_64-*-* } } } */
 
 typedef unsigned long long __attribute__((__vector_size__ (2 * sizeof (long 
long)))) U;

Jakub



Re: [PATCH][version 3]add -ftrivial-auto-var-init and variable attribute "uninitialized" to gcc

2021-06-08 Thread Richard Biener
On Mon, 7 Jun 2021, Kees Cook wrote:

> On Mon, Jun 07, 2021 at 09:48:41AM +0200, Richard Biener wrote:
> > On Thu, 27 May 2021, Qing Zhao wrote:
> > > @@ -5001,6 +5185,17 @@ gimplify_init_constructor (tree *expr_p, gimple_seq
> > > *pre_p, gimple_seq *post_p,
> > >  /* If a single access to the target must be ensured and all
> > > elements
> > > are zero, then it's optimal to clear whatever their number.
> > > */
> > >  cleared = true;
> > > +   else if (flag_trivial_auto_var_init > AUTO_INIT_UNINITIALIZED
> > > +&& !TREE_STATIC (object)
> > > +&& type_has_padding (type))
> > > + /* If the user requests to initialize automatic variables with
> > > +paddings inside the type, we should initialize the paddings
> > > too.
> > > +C guarantees that brace-init with fewer initializers than
> > > members
> > > +aggregate will initialize the rest of the aggregate as-if it
> > > were
> > > +static initialization.  In turn static initialization
> > > guarantees
> > > +that pad is initialized to zero bits.
> > > +So, it's better to clear the whole record under such
> > > situation.  */
> > > + cleared = true;
> > > 
> > > so here we have padding as well - I think this warrants to be controlled
> > > by an extra option?  And we can maybe split this out to a separate
> > > patch? (the whole padding stuff)
> > > 
> > > Clang does the padding initialization with this option, shall we be 
> > > consistent with Clang?
> > 
> > Just for the sake of consistency?  No.  Is there a technical reason
> > for this complication?  Say we have
> > 
> >   struct { short s; int i; } a;
> > 
> > what's the technical reason to initialize the padding?  I might
> > be tempted to use -ftrivial-auto-init but I definitely don't
> > want to spend cycles/instructions initializing the padding in the
> > above struct.
> 
> Yes, this is very important. This is one of the more common ways memory
> content leaks happen in programs (especially the kernel). e.g.:
> 
> struct example {
>   short s;
>   int i;
> };
> 
> struct example instance = { .i = foo };
> 
> While "s" gets zeroed, the padding may not, and may contain prior memory
> contents. Having this be deterministically zero is important for this
> feature. If the structure gets byte-copied to a buffer (e.g. syscall,
> etc), the padding will go along for the ride.

OK, so IMHO this is really a separate feature then - note that even
allocated memory suffers from this issue if the allocator does not
zero allocated blocks in full.  It then applies to even fully correctly
initialized objects and would eventually be better suited for a
language extension.

That said, the user documentation should elaborate on use cases.
pre-zeroing of fields makes bugs due to uninitialized accesses
more "reliable", zeroing of everything avoids information leaks.
All only for auto-vars (I suppose securing of allocated storage
should then happen in the allocator itself and thus likely a bit
less optimized).

Btw, see my other suggestion about simply making sure to pre-initialize
the whole frame at its allocation point.

Richard.


[PATCH] middle-end/100951 - make sure to generate VECTOR_CST in lowering

2021-06-08 Thread Richard Biener
When vector lowering creates piecewise ops make sure to create
VECTOR_CSTs instead of CONSTRUCTORs when possible.

Bootstrapped and tested on x86_64-unknown-linux-gnu, pushed.

gcc/

2021-06-07  Richard Biener  

PR middle-end/100951
* tree-vect-generic.c (expand_vector_piecewise): Build a
VECTOR_CST if all elements are constant.
(expand_vector_condition): Likewise.
(lower_vec_perm): Likewise.
(expand_vector_conversion): Likewise.

gcc/testsuite/

2021-06-07  H.J. Lu  

PR middle-end/100951
* gcc.target/i386/pr100951.c: New test.
---
 gcc/testsuite/gcc.target/i386/pr100951.c | 15 +++
 gcc/tree-vect-generic.c  | 34 +---
 2 files changed, 45 insertions(+), 4 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr100951.c

diff --git a/gcc/testsuite/gcc.target/i386/pr100951.c 
b/gcc/testsuite/gcc.target/i386/pr100951.c
new file mode 100644
index 0000000..16d8bafa663
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr100951.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-O0 -march=x86-64" } */
+
+typedef short __attribute__((__vector_size__ (8 * sizeof (short)))) V;
+V v, w;
+
+void
+foo (void)
+{
+  w = __builtin_shuffle (v != v, 0 < (V) {}, (V) {192} >> 5);
+}
+
+/* { dg-final { scan-assembler-not "punpcklwd" } } */
+/* { dg-final { scan-assembler-not "pshufd" } } */
+/* { dg-final { scan-assembler-times "pxor\[\\t \]%xmm\[0-9\]+, %xmm\[0-9\]+" 
1 } } */
diff --git a/gcc/tree-vect-generic.c b/gcc/tree-vect-generic.c
index d9c0ac9de7e..5f3f9fa005e 100644
--- a/gcc/tree-vect-generic.c
+++ b/gcc/tree-vect-generic.c
@@ -328,16 +328,22 @@ expand_vector_piecewise (gimple_stmt_iterator *gsi, 
elem_op_func f,
   if (!ret_type)
 ret_type = type;
   vec_alloc (v, (nunits + delta - 1) / delta);
+  bool constant_p = true;
   for (i = 0; i < nunits;
i += delta, index = int_const_binop (PLUS_EXPR, index, part_width))
 {
   tree result = f (gsi, inner_type, a, b, index, part_width, code,
   ret_type);
+  if (!CONSTANT_CLASS_P (result))
+   constant_p = false;
   constructor_elt ce = {NULL_TREE, result};
   v->quick_push (ce);
 }
 
-  return build_constructor (ret_type, v);
+  if (constant_p)
+return build_vector_from_ctor (ret_type, v);
+  else
+return build_constructor (ret_type, v);
 }
 
 /* Expand a vector operation to scalars with the freedom to use
@@ -1105,6 +1111,7 @@ expand_vector_condition (gimple_stmt_iterator *gsi, 
bitmap dce_ssa_names)
 
   int nunits = nunits_for_known_piecewise_op (type);
   vec_alloc (v, nunits);
+  bool constant_p = true;
   for (int i = 0; i < nunits; i++)
 {
   tree aa, result;
@@ -1129,6 +1136,8 @@ expand_vector_condition (gimple_stmt_iterator *gsi, 
bitmap dce_ssa_names)
   else
aa = tree_vec_extract (gsi, cond_type, a, width, index);
   result = gimplify_build3 (gsi, COND_EXPR, inner_type, aa, bb, cc);
+  if (!CONSTANT_CLASS_P (result))
+   constant_p = false;
   constructor_elt ce = {NULL_TREE, result};
   v->quick_push (ce);
   index = int_const_binop (PLUS_EXPR, index, width);
@@ -1138,7 +1147,10 @@ expand_vector_condition (gimple_stmt_iterator *gsi, 
bitmap dce_ssa_names)
comp_index = int_const_binop (PLUS_EXPR, comp_index, comp_width);
 }
 
-  constr = build_constructor (type, v);
+  if (constant_p)
+constr = build_vector_from_ctor (type, v);
+  else
+constr = build_constructor (type, v);
   gimple_assign_set_rhs_from_tree (gsi, constr);
   update_stmt (gsi_stmt (*gsi));
 
@@ -1578,6 +1590,7 @@ lower_vec_perm (gimple_stmt_iterator *gsi)
   "vector shuffling operation will be expanded piecewise");
 
   vec_alloc (v, elements);
+  bool constant_p = true;
   for (i = 0; i < elements; i++)
 {
   si = size_int (i);
@@ -1639,10 +1652,15 @@ lower_vec_perm (gimple_stmt_iterator *gsi)
t = v0_val;
 }
 
+  if (!CONSTANT_CLASS_P (t))
+   constant_p = false;
   CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, t);
 }
 
-  constr = build_constructor (vect_type, v);
+  if (constant_p)
+constr = build_vector_from_ctor (vect_type, v);
+  else
+constr = build_constructor (vect_type, v);
   gimple_assign_set_rhs_from_tree (gsi, constr);
   update_stmt (gsi_stmt (*gsi));
 }
@@ -2014,6 +2032,7 @@ expand_vector_conversion (gimple_stmt_iterator *gsi)
}
 
  vec_alloc (v, (nunits + delta - 1) / delta * 2);
+ bool constant_p = true;
  for (i = 0; i < nunits;
   i += delta, index = int_const_binop (PLUS_EXPR, index,
part_width))
@@ -2024,12 +2043,19 @@ expand_vector_conversion (gimple_stmt_iterator *gsi)
  index);
  tree result = gimplify_build1 (gsi, code1, cretd_type, a);
  constructor_elt ce = {

[PATCH 0/3] Improve and document stdx::simd testsuite

2021-06-08 Thread Matthias Kretz
As discussed a long time ago on IRC, this improves (i.e. decreases by default) 
the verbosity of make check-simd, gives more verbosity options, and finally 
documents how the simd testsuite is used and how it works. In addition, after 
PR98834 was resolved, remove the -fno-tree-vrp workaround.

Tested on x86_64-linux (and more).


Matthias Kretz (3):
  libstdc++: Remove -fno-tree-vrp after PR98834 was resolved
  libstdc++: Improve output verbosity options and default
  libstdc++: Document simd testsuite

 libstdc++-v3/testsuite/Makefile.am|   3 +-
 libstdc++-v3/testsuite/Makefile.in|   3 +-
 .../testsuite/experimental/simd/README.md | 257 ++
 .../testsuite/experimental/simd/driver.sh | 137 +++---
 .../experimental/simd/generate_makefile.sh|  33 ++-
 5 files changed, 380 insertions(+), 53 deletions(-)
 create mode 100644 libstdc++-v3/testsuite/experimental/simd/README.md

-- 
──
 Dr. Matthias Kretz   https://mattkretz.github.io
 GSI Helmholtz Centre for Heavy Ion Research   https://gsi.de
 std::experimental::simd  https://github.com/VcDevel/std-simd
──



[PATCH 1/3] libstdc++: Remove -fno-tree-vrp after PR98834 was resolved

2021-06-08 Thread Matthias Kretz


libstdc++-v3/ChangeLog:

* testsuite/Makefile.am (check-simd): Remove -fno-tree-vrp flag
and associated warning.
* testsuite/Makefile.in: Regenerate.

Signed-off-by: Matthias Kretz 
---
 libstdc++-v3/testsuite/Makefile.am | 3 +--
 libstdc++-v3/testsuite/Makefile.in | 3 +--
 2 files changed, 2 insertions(+), 4 deletions(-)


--
──
 Dr. Matthias Kretz   https://mattkretz.github.io
 GSI Helmholtz Centre for Heavy Ion Research   https://gsi.de
 std::experimental::simd  https://github.com/VcDevel/std-simd
──diff --git a/libstdc++-v3/testsuite/Makefile.am b/libstdc++-v3/testsuite/Makefile.am
index ba5023a8b54..d2011f03c64 100644
--- a/libstdc++-v3/testsuite/Makefile.am
+++ b/libstdc++-v3/testsuite/Makefile.am
@@ -191,10 +191,9 @@ check-simd: $(srcdir)/experimental/simd/generate_makefile.sh \
 	${glibcxx_srcdir}/scripts/check_simd \
 	testsuite_files_simd \
 	${glibcxx_builddir}/scripts/testsuite_flags
-	@echo "WARNING: Adding -fno-tree-vrp to CXXFLAGS to work around PR98834."
 	@rm -f .simd.summary
 	@echo "Generating simd testsuite subdirs and Makefiles ..."
-	@${glibcxx_srcdir}/scripts/check_simd "${glibcxx_srcdir}" "${glibcxx_builddir}" "$(CXXFLAGS) -fno-tree-vrp" | \
+	@${glibcxx_srcdir}/scripts/check_simd "${glibcxx_srcdir}" "${glibcxx_builddir}" "$(CXXFLAGS)" | \
 	  while read subdir; do \
 	$(MAKE) -C "$${subdir}"; \
 	tail -n20 $${subdir}/simd_testsuite.sum | \
diff --git a/libstdc++-v3/testsuite/Makefile.in b/libstdc++-v3/testsuite/Makefile.in
index c9dd7f5da61..c65cdaf2015 100644
--- a/libstdc++-v3/testsuite/Makefile.in
+++ b/libstdc++-v3/testsuite/Makefile.in
@@ -716,10 +716,9 @@ check-simd: $(srcdir)/experimental/simd/generate_makefile.sh \
 	${glibcxx_srcdir}/scripts/check_simd \
 	testsuite_files_simd \
 	${glibcxx_builddir}/scripts/testsuite_flags
-	@echo "WARNING: Adding -fno-tree-vrp to CXXFLAGS to work around PR98834."
 	@rm -f .simd.summary
 	@echo "Generating simd testsuite subdirs and Makefiles ..."
-	@${glibcxx_srcdir}/scripts/check_simd "${glibcxx_srcdir}" "${glibcxx_builddir}" "$(CXXFLAGS) -fno-tree-vrp" | \
+	@${glibcxx_srcdir}/scripts/check_simd "${glibcxx_srcdir}" "${glibcxx_builddir}" "$(CXXFLAGS)" | \
 	  while read subdir; do \
 	$(MAKE) -C "$${subdir}"; \
 	tail -n20 $${subdir}/simd_testsuite.sum | \


[PATCH 2/3] libstdc++: Improve output verbosity options and default

2021-06-08 Thread Matthias Kretz


For most uses --quiet was too quiet while the default was too noisy. Now
the default output, if stdout is a tty, shows the last successful test
on the same line. With --percentage it adds a percentage at the start of
the line. --percentage is not default because it requires more resources
and might not be 100% compatible with all environments.
If stdout is not a tty the default is quiet output like for dejagnu.

Additionally, argument parsing now recognizes contracted short options
which is easier to use with e.g. DRIVEROPTS=-pxk.

libstdc++-v3/ChangeLog:

* testsuite/experimental/simd/driver.sh: Rewrite output
verbosity logic. Add -p/--percentage option. Allow -v/--verbose
to be used twice. Add -x and -o short options. Parse long
options with = instead of separating space generically. Parse
contracted short options. Make unrecognized options an error.
If same-line output is active, trap on EXIT to increment the
progress (only with --percentage), erase the line and print the
current status.
* testsuite/experimental/simd/generate_makefile.sh: Initialize
helper files for progress account keeping. Update help target
for changes to DRIVEROPTS.

Signed-off-by: Matthias Kretz 
---
 .../testsuite/experimental/simd/driver.sh | 137 +-
 .../experimental/simd/generate_makefile.sh|  33 +++--
 2 files changed, 121 insertions(+), 49 deletions(-)


--
──
 Dr. Matthias Kretz   https://mattkretz.github.io
 GSI Helmholtz Centre for Heavy Ion Research   https://gsi.de
 std::experimental::simd  https://github.com/VcDevel/std-simd
──diff --git a/libstdc++-v3/testsuite/experimental/simd/driver.sh b/libstdc++-v3/testsuite/experimental/simd/driver.sh
index f2d31c70bd0..5ae9905e3a3 100755
--- a/libstdc++-v3/testsuite/experimental/simd/driver.sh
+++ b/libstdc++-v3/testsuite/experimental/simd/driver.sh
@@ -5,8 +5,22 @@ abi=0
 name=
 srcdir="$(cd "${0%/*}" && pwd)/tests"
 sim="$GCC_TEST_SIMULATOR"
-quiet=false
-verbose=false
+
+# output_mode values:
+# print only failures with minimal context
+readonly really_quiet=0
+# as above plus same-line output of last successful test
+readonly same_line=1
+# as above plus percentage
+readonly percentage=2
+# print one line per finished test with minimal context on failure
+readonly verbose=3
+# print one line per finished test with full output of the compiler and test
+readonly really_verbose=4
+
+output_mode=$really_quiet
+[ -t 1 ] && output_mode=$same_line
+
 timeout=180
 run_expensive=false
 if [ -n "$GCC_TEST_RUN_EXPENSIVE" ]; then
@@ -21,8 +35,12 @@ Usage: $0 [Options] 
 
 Options:
   -h, --help  Print this message and exit.
-  -q, --quiet Only print failures.
-  -v, --verbose   Print compiler and test output on failure.
+  -q, --quiet Disable same-line progress output (default if stdout is
+  not a tty).
+  -p, --percentageAdd percentage to default same-line progress output.
+  -v, --verbose   Print one line per test and minimal extra information on
+  failure.
+  -vv Print all compiler and test output.
   -t , --type 
   The value_type to test (default: $type).
   -a [0-9], --abi [0-9]
@@ -36,9 +54,10 @@ Options:
   GCC_TEST_SIMULATOR).
   --timeout-factor 
   Multiply the default timeout with x.
-  --run-expensive Compile and run tests marked as expensive (default:
+  -x, --run-expensive Compile and run tests marked as expensive (default:
   true if GCC_TEST_RUN_EXPENSIVE is set, false otherwise).
-  --only Compile and run only tests matching the given pattern.
+  -o , --only 
+  Compile and run only tests matching the given pattern.
 EOF
 }
 
@@ -49,71 +68,74 @@ while [ $# -gt 0 ]; do
 exit
 ;;
   -q|--quiet)
-quiet=true
+output_mode=$really_quiet
+;;
+  -p|--percentage)
+output_mode=$percentage
 ;;
   -v|--verbose)
-verbose=true
+if [ $output_mode -lt $verbose ]; then
+  output_mode=$verbose
+else
+  output_mode=$really_verbose
+fi
 ;;
-  --run-expensive)
+  -x|--run-expensive)
 run_expensive=true
 ;;
   -k|--keep-failed)
 keep_failed=true
 ;;
-  --only)
+  -o|--only)
 only="$2"
 shift
 ;;
-  --only=*)
-only="${1#--only=}"
-;;
   -t|--type)
 type="$2"
 shift
 ;;
-  --type=*)
-type="${1#--type=}"
-;;
   -a|--abi)
 abi="$2"
 shift
 ;;
-  --abi=*)
-abi="${1#--abi=}"
-;;
   -n|--name)
 name="$2"
 shift
 ;;
-  --name=*)
-name="${1#--name=}"
-;;
   --srcdir)
 srcdir="$2"
 shift
 ;;
-  --srcdir=*)
-srcdir="${1#--srcdir=}"
-;;
 

[PATCH 3/3] libstdc++: Document simd testsuite

2021-06-08 Thread Matthias Kretz


libstdc++-v3/ChangeLog:

* testsuite/experimental/simd/README.md: New file.

Signed-off-by: Matthias Kretz 
---
 .../testsuite/experimental/simd/README.md | 257 ++
 1 file changed, 257 insertions(+)
 create mode 100644 libstdc++-v3/testsuite/experimental/simd/README.md


--
──
 Dr. Matthias Kretz   https://mattkretz.github.io
 GSI Helmholtz Centre for Heavy Ion Research   https://gsi.de
 std::experimental::simd  https://github.com/VcDevel/std-simd
──diff --git a/libstdc++-v3/testsuite/experimental/simd/README.md b/libstdc++-v3/testsuite/experimental/simd/README.md
new file mode 100644
index 0000000..db0d71f8d43
--- /dev/null
+++ b/libstdc++-v3/testsuite/experimental/simd/README.md
@@ -0,0 +1,257 @@
+# SIMD Tests
+
+To execute the simd testsuite, call `make check-simd`, typically with `-j N` 
+argument.
+
+For more control over verbosity, compiler flags, and use of a simulator, use 
+the environment variables documented below.
+
+## Environment variables
+
+### `target_list`
+
+Similar to dejagnu target lists: E.g. 
+`target_list="unix{-march=sandybridge,-march=native/-ffast-math,-march=native/-ffinite-math-only}"` 
+would create three subdirs in `testsuite/simd/` to run the complete simd 
+testsuite first with `-march=sandybridge`, then with `-march=native 
+-ffast-math`, and finally with `-march=native -ffinite-math-only`.
+
+
+### `CHECK_SIMD_CONFIG`
+
+This variable can be set to a path to a file which is equivalent to a dejagnu 
+board. The file needs to be a valid `sh` script since it is sourced from the 
+`scripts/check_simd` script. Its purpose is to set the `target_list` variable 
+depending on `$target_triplet` (or whatever else makes sense for you). Example:
+
+```sh
+case "$target_triplet" in
+x86_64-*)
+  target_list="unix{-march=sandybridge,-march=skylake-avx512,-march=native/-ffast-math,-march=athlon64,-march=core2,-march=nehalem,-march=skylake,-march=native/-ffinite-math-only,-march=knl}"
+  ;;
+
+powerpc64le-*)
+  define_target power7 "-mcpu=power7 -static" "$HOME/bin/run_on_gccfarm gcc112"
+  define_target power8 "-mcpu=power8 -static" "$HOME/bin/run_on_gccfarm gcc112"
+  define_target power9 "-mcpu=power9 -static" "$HOME/bin/run_on_gccfarm gcc135"
+  target_list="power7 power8 power9{,-ffast-math}"
+  ;;
+
+powerpc64-*)
+  define_target power7 "-mcpu=power7 -static" "$HOME/bin/run_on_gccfarm gcc110"
+  define_target power8 "-mcpu=power8 -static" "$HOME/bin/run_on_gccfarm gcc110"
+  target_list="power7 power8{,-ffast-math}"
+  ;;
+esac
+```
+
+The `unix` target is pre-defined to have no initial flags and no simulator. Use 
+the `define_target(name, flags, sim)` function to define your own targets for 
+the `target_list` variable. In the example above `define_target power7 
+"-mcpu=power7 -static" "$HOME/bin/run_on_gccfarm gcc112"` defines the target 
+`power7` which always uses the flags `-mcpu=power7` and `-static` when 
+compiling tests and prepends `$HOME/bin/run_on_gccfarm gcc112` to test 
+executables. In `target_list` you can now use the name `power7`. E.g. 
+`target_list="power7 power7/-ffast-math"` or its shorthand 
+`target_list="power7{,-ffast-math}"`.
+
+
+### `DRIVEROPTS`
+
+This variable affects the `Makefile`s generated per target (as defined above). 
+It's a string of flags that are prepended to the `driver.sh` invocation which 
+builds and runs the tests. You `cd` into a simd test subdir and use `make help` 
+to see possible options and a list of all valid targets.
+
+```
+use DRIVEROPTS= to pass the following options:
+-q, --quiet Disable same-line progress output (default if stdout is
+not a tty).
+-p, --percentageAdd percentage to default same-line progress output.
+-v, --verbose   Print one line per test and minimal extra information on
+failure.
+-vv Print all compiler and test output.
+-k, --keep-failed   Keep executables of failed tests.
+--sim   Path to an executable that is prepended to the test
+execution binary (default: the value of
+GCC_TEST_SIMULATOR).
+--timeout-factor 
+Multiply the default timeout with x.
+-x, --run-expensive Compile and run tests marked as expensive (default:
+true if GCC_TEST_RUN_EXPENSIVE is set, false otherwise).
+-o , --only 
+Compile and run only tests matching the given pattern.
+```
+
+
+### `TESTFLAGS`
+
+This variable also affects the `Makefile`s generated per target. It's a list of 
+compiler flags that are appended to `CXXFLAGS`.
+
+
+### `GCC_TEST_SIMULATOR`
+
+If `--sim` is not passed via `DRIVEROPTS`, then this variable is prepended to 
+test invocations. If a simulator was defined via the `CHECK_SIMD_CONFIG` 
+script, then then gener

Re: [PATCH 1/9] [nvptx] Enable large vectors

2021-06-08 Thread Thomas Schwinge
Hi!

On 2019-01-12T23:21:23+0100, Tom de Vries  wrote:
> Allow vector_length clauses to accept values larger than warp size.

>   * testsuite/libgomp.oacc-c-c++-common/parallel-dims.c: Expect vector
>   length 2097152 to be reduced to 1024 instead of 32.

> --- a/libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
> +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
> @@ -350,7 +350,7 @@ int main ()
>  int gangs_min, gangs_max, workers_min, workers_max, vectors_min, 
> vectors_max;
>  gangs_min = workers_min = vectors_min = INT_MAX;
>  gangs_max = workers_max = vectors_max = INT_MIN;
> -#pragma acc parallel copy (vectors_actual) /* { dg-warning "using 
> vector_length \\(32\\), ignoring 2097152" "" { target 
> openacc_nvidia_accel_configured } } */ \
> +#pragma acc parallel copy (vectors_actual) /* { dg-warning "using 
> vector_length \\(1024\\), ignoring 2097152" "" { target 
> openacc_nvidia_accel_configured } } */ \
>vector_length (VECTORS)
>  {
>if (acc_on_device (acc_device_host))
> @@ -361,7 +361,7 @@ int main ()
>else if (acc_on_device (acc_device_nvidia))
>   {
> /* The GCC nvptx back end enforces vector_length (32).  */
> -   vectors_actual = 32;
> +   vectors_actual = 1024;
>   }
>else
>   __builtin_abort ();

As obvious, pushed "[nvptx] Update comment in
'libgomp.oacc-c-c++-common/parallel-dims.c'" to master branch in commit
e64d62c7008e6a4b0227fd25e071db8f0b3f1820, see attached.


Grüße
 Thomas


-
Mentor Graphics (Deutschland) GmbH, Arnulfstrasse 201, 80634 München 
Registergericht München HRB 106955, Geschäftsführer: Thomas Heurung, Frank 
Thürauf
>From e64d62c7008e6a4b0227fd25e071db8f0b3f1820 Mon Sep 17 00:00:00 2001
From: Thomas Schwinge 
Date: Sat, 5 Jun 2021 22:01:48 +0200
Subject: [PATCH] [nvptx] Update comment in
 'libgomp.oacc-c-c++-common/parallel-dims.c'

Small fix-up for r267889 (commit 2b9d9e393766d2fa6e2dd5f361d0db14872cf261)
"[nvptx] Enable large vectors":

> 	* testsuite/libgomp.oacc-c-c++-common/parallel-dims.c: Expect vector
> 	length 2097152 to be reduced to 1024 instead of 32.

	libgomp/
	* testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
	: Update comment.
---
 libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
index ef4917aafff..ef3dfda5fa5 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
@@ -385,7 +385,7 @@ int main ()
 	}
   else if (acc_on_device (acc_device_nvidia))
 	{
-	  /* The GCC nvptx back end enforces vector_length (32).  */
+	  /* The GCC nvptx back end reduces to vector_length (1024).  */
 	  vectors_actual = 1024;
 	}
   else if (acc_on_device (acc_device_radeon))
-- 
2.30.2



Re: [committed][nvptx] Handle assignment to gang-level reduction variable

2021-06-08 Thread Thomas Schwinge
Hi!

On 2019-01-15T11:13:51+0100, Tom de Vries  wrote:
> this fixes an ICE when handling an assignment to a gang-level reduction
> variable.
>
> Committed to trunk.

>   PR target/80547

Pushed "Revert PR80547 workaround in
'libgomp.oacc-c-c++-common/parallel-dims.c'" to master branch in commit
0886426f5f543e813c1a61e18da6616caf377dfc, see attached.


Grüße
 Thomas


-
Mentor Graphics (Deutschland) GmbH, Arnulfstrasse 201, 80634 München 
Registergericht München HRB 106955, Geschäftsführer: Thomas Heurung, Frank 
Thürauf
>From 0886426f5f543e813c1a61e18da6616caf377dfc Mon Sep 17 00:00:00 2001
From: Thomas Schwinge 
Date: Mon, 7 Jun 2021 16:17:06 +0200
Subject: [PATCH] Revert PR80547 workaround in
 'libgomp.oacc-c-c++-common/parallel-dims.c'

This problem has been fixed long ago, in r267934 (commit
d41d952c9bbdffe6fd2badc9c4f2c18d241ce412) "[nvptx] Handle assignment to
gang-level reduction variable".

	libgomp/
	* testsuite/libgomp.oacc-c-c++-common/parallel-dims.c: Revert
	PR80547 workaround.
---
 .../libgomp.oacc-c-c++-common/parallel-dims.c   | 13 -
 1 file changed, 13 deletions(-)

diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
index ef3dfda5fa5..c7412c2ef3a 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
@@ -72,22 +72,9 @@ int main ()
   gangs_actual = 1;
   for (int i = 100 * gangs_actual; i > -100 * gangs_actual; --i)
 	{
-	  /* .  */
-#if 0
 	  gangs_min = gangs_max = acc_gang ();
 	  workers_min = workers_max = acc_worker ();
 	  vectors_min = vectors_max = acc_vector ();
-#else
-	  int gangs = acc_gang ();
-	  gangs_min = (gangs_min < gangs) ? gangs_min : gangs;
-	  gangs_max = (gangs_max > gangs) ? gangs_max : gangs;
-	  int workers = acc_worker ();
-	  workers_min = (workers_min < workers) ? workers_min : workers;
-	  workers_max = (workers_max > workers) ? workers_max : workers;
-	  int vectors = acc_vector ();
-	  vectors_min = (vectors_min < vectors) ? vectors_min : vectors;
-	  vectors_max = (vectors_max > vectors) ? vectors_max : vectors;
-#endif
 	}
 }
 if (gangs_actual != 1)
-- 
2.30.2



Re: [patch, libgomp] Enable OpenACC GCN testing

2021-06-08 Thread Thomas Schwinge
Hi!

On 2019-11-14T16:36:38+0000, Andrew Stubbs  wrote:
> This patch adds some necessary bits to enable OpenACC testing for
> amdgcn offloading.

> --- a/libgomp/testsuite/lib/libgomp.exp
> +++ b/libgomp/testsuite/lib/libgomp.exp

> +# Return 1 if at least one AMD GCN board is present, and the AMD GCN device
> +# type is selected by default.
> +
> +proc check_effective_target_openacc_amdgcn_accel_selected { } {
> +if { ![check_effective_target_openacc_amdgcn_accel_present] } {
> + return 0;
> +}
> +global offload_target
> +if { [string match "amdgcn*" $offload_target] } {
> +return 1;
> +}
> +return 0;
> +}

Pushed "[GCN] Streamline
'libgomp/testsuite/lib/libgomp.exp:check_effective_target_openacc_radeon_accel_selected'"
to master branch in commit f9da798ba6348feaada80de04bc72cdf0c4a1f70, see
attached.


Grüße
 Thomas


-
Mentor Graphics (Deutschland) GmbH, Arnulfstrasse 201, 80634 München 
Registergericht München HRB 106955, Geschäftsführer: Thomas Heurung, Frank 
Thürauf
>From f9da798ba6348feaada80de04bc72cdf0c4a1f70 Mon Sep 17 00:00:00 2001
From: Thomas Schwinge 
Date: Fri, 4 Jun 2021 15:19:35 +0200
Subject: [PATCH] [GCN] Streamline
 'libgomp/testsuite/lib/libgomp.exp:check_effective_target_openacc_radeon_accel_selected'

The GCN support that got added in r278935 (commit
83caa34e2a618842e05f59cbb3e2dda93dc23270) "Enable OpenACC GCN testing" was
forked before my r269107 (commit ee332b4a9a19552d160a23155f59b11692d8f07e)
"[libgomp] Clarify difference between offload target, offload plugin, and
OpenACC device type", and didn't later pick up these changes.

No functional change.

	libgomp/
	* testsuite/lib/libgomp.exp
	(check_effective_target_openacc_radeon_accel_selected):
	Streamline.
---
 libgomp/testsuite/lib/libgomp.exp | 7 ++-
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/libgomp/testsuite/lib/libgomp.exp b/libgomp/testsuite/lib/libgomp.exp
index 0f4eb6fd4ff..45c78d8510e 100644
--- a/libgomp/testsuite/lib/libgomp.exp
+++ b/libgomp/testsuite/lib/libgomp.exp
@@ -472,11 +472,8 @@ proc check_effective_target_openacc_radeon_accel_selected { } {
 if { ![check_effective_target_openacc_radeon_accel_present] } {
 	return 0;
 }
-global offload_target
-if { [string match "amdgcn*" $offload_target] } {
-return 1;
-}
-return 0;
+global openacc_device_type
+return [string match "radeon" $openacc_device_type]
 }
 
 # Return 1 if cuda.h and -lcuda are available.
-- 
2.30.2



[committed] openmp: Fix ICE on depend(source) clause during cdtor cloning [PR100957]

2021-06-08 Thread Jakub Jelinek via Gcc-patches
Hi!

The depend(source) clause has NULL OMP_CLAUSE_DECL, it has just the
depend kind specified and no arguments.  So copy_tree_body_r shouldn't
check TREE_CODE on it without checking it is non-NULL.

Tested on x86_64-linux, committed to trunk.

2021-06-08  Jakub Jelinek  

PR c++/100957
* tree-inline.c (copy_tree_body_r): For OMP_CLAUSE_DEPEND don't
check TREE_CODE if OMP_CLAUSE_DECL is NULL.

* g++.dg/gomp/doacross-2.C: New test.

--- gcc/tree-inline.c.jj2021-06-07 09:25:30.541217037 +0200
+++ gcc/tree-inline.c   2021-06-08 10:40:07.013853127 +0200
@@ -1458,7 +1458,8 @@ copy_tree_body_r (tree *tp, int *walk_su
   || OMP_CLAUSE_CODE (*tp) == OMP_CLAUSE_DEPEND))
{
  tree t = OMP_CLAUSE_DECL (*tp);
- if (TREE_CODE (t) == TREE_LIST
+ if (t
+ && TREE_CODE (t) == TREE_LIST
  && TREE_PURPOSE (t)
  && TREE_CODE (TREE_PURPOSE (t)) == TREE_VEC)
{
--- gcc/testsuite/g++.dg/gomp/doacross-2.C.jj   2021-06-08 10:43:22.469121933 
+0200
+++ gcc/testsuite/g++.dg/gomp/doacross-2.C  2021-06-08 10:42:39.648720282 
+0200
@@ -0,0 +1,16 @@
+// PR c++/100957
+// { dg-do compile }
+
+struct S {
+  S ()
+  {
+  #pragma omp for ordered(2)
+for (int i = 0; i < 32; ++i)
+  for (int j = 0; j < 32; ++j)
+   {
+   #pragma omp ordered depend(source)
+ ;
+   #pragma omp ordered depend(sink: i - 1, j - 1)
+   }
+  }
+};

Jakub



Re: [openacc, testsuite, committed] Enable libgomp.oacc-*/declare-*.{c,f90} for non-nvidia devices

2021-06-08 Thread Thomas Schwinge
Hi!

On 2017-10-16T10:49:45+0200, Tom de Vries  wrote:
> this patch enables some openacc test-cases for non-nvidia devices.

Additionally, pushed "Don't require 'openacc_nvidia_accel_selected' in
additional 'libgomp.oacc-*/declare-*'" to master branch in commit
77f41a5c4e60a88533c90f0948b4dd24c9bb88b2, see attached.


Grüße
 Thomas


-
Mentor Graphics (Deutschland) GmbH, Arnulfstrasse 201, 80634 München 
Registergericht München HRB 106955, Geschäftsführer: Thomas Heurung, Frank 
Thürauf
>From 77f41a5c4e60a88533c90f0948b4dd24c9bb88b2 Mon Sep 17 00:00:00 2001
From: Thomas Schwinge 
Date: Fri, 4 Jun 2021 15:25:32 +0200
Subject: [PATCH] Don't require 'openacc_nvidia_accel_selected' in additional
 'libgomp.oacc-*/declare-*'

Like r253779 (commit 92d5d01ac65e395ceaecc5d930f6017952aa4934)
"Enable libgomp.oacc-*/declare-*.{c,f90} for non-nvidia devices".

	libgomp/
	* testsuite/libgomp.oacc-c++/declare-1.C: Don't require
	'openacc_nvidia_accel_selected'.
	* testsuite/libgomp.oacc-c-c++-common/declare-3.c: Likewise.
---
 libgomp/testsuite/libgomp.oacc-c++/declare-1.C  | 2 --
 libgomp/testsuite/libgomp.oacc-c-c++-common/declare-3.c | 2 --
 2 files changed, 4 deletions(-)

diff --git a/libgomp/testsuite/libgomp.oacc-c++/declare-1.C b/libgomp/testsuite/libgomp.oacc-c++/declare-1.C
index 0286955d0c7..461b77820ef 100644
--- a/libgomp/testsuite/libgomp.oacc-c++/declare-1.C
+++ b/libgomp/testsuite/libgomp.oacc-c++/declare-1.C
@@ -1,5 +1,3 @@
-/* { dg-do run { target openacc_nvidia_accel_selected } } */
-
 #include 
 
 template
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/declare-3.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/declare-3.c
index c3a21876312..dc6c7f35275 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/declare-3.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/declare-3.c
@@ -1,5 +1,3 @@
-/* { dg-do run { target openacc_nvidia_accel_selected } } */
-
 #include 
 #include 
 
-- 
2.30.2



Re: [patch, openacc] Adjust tests for amdgcn offloading

2021-06-08 Thread Thomas Schwinge
Hi!

On 2019-12-13T17:43:57+, Andrew Stubbs  wrote:
> On 19/11/2019 12:21, Andrew Stubbs wrote:
>> This patch adds GCN special casing for most of the OpenACC libgomp tests
>> that require it. It also disables one testcase that explicitly uses CUDA.
>
> The patches aren't all that controversial, should only change the
> results on amdgcn

Almost.  ;-)

> Update OpenACC tests for amdgcn

>   * testsuite/libgomp.oacc-c-c++-common/async_queue-1.c: Disable on GCN.

> --- a/libgomp/testsuite/libgomp.oacc-c-c++-common/async_queue-1.c
> +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/async_queue-1.c
> @@ -1,3 +1,5 @@
> +/* { dg-do run { target openacc_nvidia_accel_selected } } */

Actually that also disables it for 'acc_device_host'.

It's however trivial to make it work for all; pushed "Don't require
'openacc_nvidia_accel_selected' in
'libgomp.oacc-c-c++-common/async_queue-1.c'" to master branch in commit
89c1a427a1cfdb38e4b2354eeb1e28e0042af54c, see attached.


Grüße
 Thomas


-
Mentor Graphics (Deutschland) GmbH, Arnulfstrasse 201, 80634 München 
Registergericht München HRB 106955, Geschäftsführer: Thomas Heurung, Frank 
Thürauf
>From 89c1a427a1cfdb38e4b2354eeb1e28e0042af54c Mon Sep 17 00:00:00 2001
From: Thomas Schwinge 
Date: Fri, 4 Jun 2021 15:27:55 +0200
Subject: [PATCH] Don't require 'openacc_nvidia_accel_selected' in
 'libgomp.oacc-c-c++-common/async_queue-1.c'

That is, re-enable it for host-fallback, and enable it for GCN offloading.

Fix-up for r279378 (commit 26b74ed0223d108d7d7818c3c860f20cfe81a4af)
"Update OpenACC tests for amdgcn".

	libgomp/
	* testsuite/libgomp.oacc-c-c++-common/async_queue-1.c: Don't
	require 'openacc_nvidia_accel_selected'.  Fix up for
	'ACC_DEVICE_TYPE_radeon'.
---
 .../testsuite/libgomp.oacc-c-c++-common/async_queue-1.c| 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/async_queue-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/async_queue-1.c
index 4f9e53da85d..533d498bcf7 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/async_queue-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/async_queue-1.c
@@ -1,5 +1,3 @@
-/* { dg-do run { target openacc_nvidia_accel_selected } } */
-
 /* Test mapping of async values to specific underlying queues.  */
 
 #undef NDEBUG
@@ -29,6 +27,8 @@ int main(void)
   acc_device_t d;
 #if defined ACC_DEVICE_TYPE_nvidia
   d = acc_device_nvidia;
+#elif defined ACC_DEVICE_TYPE_radeon
+  d = acc_device_radeon;
 #elif defined ACC_DEVICE_TYPE_host
   d = acc_device_host;
 #else
@@ -88,6 +88,9 @@ int main(void)
 	assert (queues[i].cuda_stream == NULL);
   else
 	assert (queues[i].cuda_stream != NULL);
+#elif defined ACC_DEVICE_TYPE_radeon
+  /* For "acc_device_radeon" there are no CUDA streams.  */
+  assert (queues[i].cuda_stream == NULL);
 #elif defined ACC_DEVICE_TYPE_host
   /* For "acc_device_host" there are no CUDA streams.  */
   assert (queues[i].cuda_stream == NULL);
-- 
2.30.2



Re: [PATCH] predcom: Adjust some unnecessary update_ssa calls

2021-06-08 Thread Kewen.Lin via Gcc-patches
on 2021/6/7 下午10:46, Richard Biener wrote:
> On Wed, Jun 2, 2021 at 11:29 AM Kewen.Lin  wrote:
>>
>> Hi,
>>
>> As Richi suggested in PR100794, this patch is to remove
>> some unnecessary update_ssa calls with flag
>> TODO_update_ssa_only_virtuals, also do some refactoring.
>>
>> Bootstrapped/regtested on powerpc64le-linux-gnu P9,
>> x86_64-redhat-linux and aarch64-linux-gnu, built well
>> on Power9 ppc64le with --with-build-config=bootstrap-O3,
>> and passed both P8 and P9 SPEC2017 full build with
>> {-O3, -Ofast} + {,-funroll-loops}.
>>
>> Is it ok for trunk?
> 
> LGTM, minor comment on the fancy C++:
> 
> +  auto cleanup = [&]() {
> +release_chains (chains);
> +free_data_refs (datarefs);
> +BITMAP_FREE (looparound_phis);
> +free_affine_expand_cache (&name_expansions);
> +  };
> 
> +  cleanup ();
> +  return 0;
> 
> so that could have been
> 
>   class cleanup {
>  ~cleanup()
> {
>   release_chains (chains);
>   free_data_refs (datarefs);
>   BITMAP_FREE (looparound_phis);
>   free_affine_expand_cache (&name_expansions);
> }
>   } cleanup;
> 
> ?  Or some other means of registering a RAII-style cleanup?
> I mean, we can't wrap it all in
> 
>   try {...}
>   finally {...}
> 
> because C++ doesn't have finally.
> 
> OK with this tiny part of the C++ refactoring delayed, but we can also simply
> discuss best options.  At least for looparound_phis a good cleanup would
> be to pass the bitmap around and use auto_bitmap local to
> tree_predictive_commoning_loop ...
> 

Thanks Richi!  A draft (not ready for review) is attached for further
discussion.  It follows the idea of RAII-style cleanup.  I noticed that
Martin suggested going a step further and making tree_predictive_commoning_loop
and its callees members of one class (thanks, Martin).  Since there are not
many C++-style work functions of this kind, I would like to double-check
which option you prefer.

One point you might have noticed is that making tree_predictive_commoning_loop
and its callees member functions of one class would avoid passing the bitmap
looparound_phis all around, as the draft currently does.  :)

BR,
Kewen

diff --git a/gcc/tree-predcom.c b/gcc/tree-predcom.c
index ac1674d5486..75acc342c5a 100644
--- a/gcc/tree-predcom.c
+++ b/gcc/tree-predcom.c
@@ -375,13 +375,40 @@ struct component
   struct component *next;
 };
 
-/* Bitmap of ssa names defined by looparound phi nodes covered by chains.  */
+typedef hash_map tree_expand_map_t;
+static void release_chains (vec chains);
 
-static bitmap looparound_phis;
+/* Class for predictive commoning data structure for one LOOP.  */
+class loop_pcom_info
+{
+public:
+  loop_pcom_info (loop_p l)
+: loop (l), datarefs (vNULL), dependences (vNULL), chains (vNULL),
+  cache (NULL)
+  {
+dependences.create (10);
+datarefs.create (10);
+  }
 
-/* Cache used by tree_to_aff_combination_expand.  */
+  ~loop_pcom_info ()
+  {
+free_data_refs (datarefs);
+free_dependence_relations (dependences);
+release_chains (chains);
+free_affine_expand_cache (&cache);
+  }
 
-static hash_map *name_expansions;
+  /* The pointer to the given loop.  */
+  loop_p loop;
+  /* All data references.  */
+  vec datarefs;
+  /* All data dependences.  */
+  vec dependences;
+  /* All chains.  */
+  vec chains;
+  /* Cache used by tree_to_aff_combination_expand.  */
+  tree_expand_map_t *cache;
+};
 
 /* Dumps data reference REF to FILE.  */
 
@@ -673,13 +700,13 @@ suitable_reference_p (struct data_reference *a, enum ref_step_type *ref_step)
 /* Stores DR_OFFSET (DR) + DR_INIT (DR) to OFFSET.  */
 
 static void
-aff_combination_dr_offset (struct data_reference *dr, aff_tree *offset)
+aff_combination_dr_offset (struct data_reference *dr, aff_tree *offset,
+  tree_expand_map_t **cache_ptr)
 {
   tree type = TREE_TYPE (DR_OFFSET (dr));
   aff_tree delta;
 
-  tree_to_aff_combination_expand (DR_OFFSET (dr), type, offset,
- &name_expansions);
+  tree_to_aff_combination_expand (DR_OFFSET (dr), type, offset, cache_ptr);
   aff_combination_const (&delta, type, wi::to_poly_widest (DR_INIT (dr)));
   aff_combination_add (offset, &delta);
 }
@@ -692,7 +719,7 @@ aff_combination_dr_offset (struct data_reference *dr, aff_tree *offset)
 
 static bool
 determine_offset (struct data_reference *a, struct data_reference *b,
- poly_widest_int *off)
+ poly_widest_int *off, tree_expand_map_t **cache_ptr)
 {
   aff_tree diff, baseb, step;
   tree typea, typeb;
@@ -720,13 +747,13 @@ determine_offset (struct data_reference *a, struct data_reference *b,
 
   /* Compare the offsets of the addresses, and check whether the difference
  is a multiple of step.  */
-  aff_combination_dr_offset (a, &diff);
-  aff_combination_dr_offset (b, &baseb);
+  aff_combination_dr_offset (a, &diff, cache_ptr);
+  aff_combination_dr_offset (b, &baseb, cache_ptr);
   aff_combination_scale (

Add 'acc_device_radeon' testing to 'libgomp.oacc-*/acc_on_device-*'

2021-06-08 Thread Thomas Schwinge
Hi!

Pushed "Add 'acc_device_radeon' testing to
'libgomp.oacc-*/acc_on_device-*'" to master branch in commit
97a040e987bfdc40d3bf442be74571a6819122cd, see attached.


Grüße
 Thomas


-
Mentor Graphics (Deutschland) GmbH, Arnulfstrasse 201, 80634 München 
Registergericht München HRB 106955, Geschäftsführer: Thomas Heurung, Frank 
Thürauf
>From 97a040e987bfdc40d3bf442be74571a6819122cd Mon Sep 17 00:00:00 2001
From: Thomas Schwinge 
Date: Fri, 4 Jun 2021 15:29:54 +0200
Subject: [PATCH] Add 'acc_device_radeon' testing to
 'libgomp.oacc-*/acc_on_device-*'

	libgomp/
	* testsuite/libgomp.oacc-c-c++-common/acc_on_device-1.c: Add
	'acc_device_radeon' testing.
	* testsuite/libgomp.oacc-fortran/acc_on_device-1-1.f90: Likewise.
	* testsuite/libgomp.oacc-fortran/acc_on_device-1-2.f: Likewise.
	* testsuite/libgomp.oacc-fortran/acc_on_device-1-3.f: Likewise.
---
 .../libgomp.oacc-c-c++-common/acc_on_device-1.c   | 11 +++
 .../libgomp.oacc-fortran/acc_on_device-1-1.f90|  7 +++
 .../libgomp.oacc-fortran/acc_on_device-1-2.f  |  7 +++
 .../libgomp.oacc-fortran/acc_on_device-1-3.f  |  7 +++
 4 files changed, 32 insertions(+)

diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_on_device-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_on_device-1.c
index 8112745bcb8..064c6f5f2d8 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_on_device-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_on_device-1.c
@@ -19,6 +19,8 @@ main (int argc, char *argv[])
   abort ();
 if (acc_on_device (acc_device_nvidia))
   abort ();
+if (acc_on_device (acc_device_radeon))
+  abort ();
   }
 
 
@@ -34,6 +36,8 @@ main (int argc, char *argv[])
   abort ();
 if (acc_on_device (acc_device_nvidia))
   abort ();
+if (acc_on_device (acc_device_radeon))
+  abort ();
   }
 
 
@@ -55,6 +59,13 @@ main (int argc, char *argv[])
 #else
 if (acc_on_device (acc_device_nvidia))
   abort ();
+#endif
+#if ACC_DEVICE_TYPE_radeon
+if (!acc_on_device (acc_device_radeon))
+  abort ();
+#else
+if (acc_on_device (acc_device_radeon))
+  abort ();
 #endif
   }
 
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/acc_on_device-1-1.f90 b/libgomp/testsuite/libgomp.oacc-fortran/acc_on_device-1-1.f90
index ace935817dc..cd599e5d0e3 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/acc_on_device-1-1.f90
+++ b/libgomp/testsuite/libgomp.oacc-fortran/acc_on_device-1-1.f90
@@ -21,6 +21,7 @@ if (.not. acc_on_device (acc_device_none)) STOP 1
 if (.not. acc_on_device (acc_device_host)) STOP 2
 if (acc_on_device (acc_device_not_host)) STOP 3
 if (acc_on_device (acc_device_nvidia)) STOP 4
+if (acc_on_device (acc_device_radeon)) STOP 4
 
 
 ! Host via offloading fallback mode.
@@ -32,6 +33,7 @@ if (.not. acc_on_device (acc_device_none)) STOP 5
 if (.not. acc_on_device (acc_device_host)) STOP 6
 if (acc_on_device (acc_device_not_host)) STOP 7
 if (acc_on_device (acc_device_nvidia)) STOP 8
+if (acc_on_device (acc_device_radeon)) STOP 8
 !$acc end parallel
 
 
@@ -49,6 +51,11 @@ if (.not. acc_on_device (acc_device_nvidia)) STOP 12
 #else
 if (acc_on_device (acc_device_nvidia)) STOP 13
 #endif
+#if ACC_DEVICE_TYPE_radeon
+if (.not. acc_on_device (acc_device_radeon)) STOP 14
+#else
+if (acc_on_device (acc_device_radeon)) STOP 15
+#endif
 !$acc end parallel
 
 #endif
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/acc_on_device-1-2.f b/libgomp/testsuite/libgomp.oacc-fortran/acc_on_device-1-2.f
index 56270b12970..eb3daba0188 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/acc_on_device-1-2.f
+++ b/libgomp/testsuite/libgomp.oacc-fortran/acc_on_device-1-2.f
@@ -21,6 +21,7 @@
   IF (.NOT. ACC_ON_DEVICE (ACC_DEVICE_HOST)) STOP 2
   IF (ACC_ON_DEVICE (ACC_DEVICE_NOT_HOST)) STOP 3
   IF (ACC_ON_DEVICE (ACC_DEVICE_NVIDIA)) STOP 4
+  IF (ACC_ON_DEVICE (ACC_DEVICE_RADEON)) STOP 4
 
 
 !Host via offloading fallback mode.
@@ -32,6 +33,7 @@
   IF (.NOT. ACC_ON_DEVICE (ACC_DEVICE_HOST)) STOP 6
   IF (ACC_ON_DEVICE (ACC_DEVICE_NOT_HOST)) STOP 7
   IF (ACC_ON_DEVICE (ACC_DEVICE_NVIDIA)) STOP 8
+  IF (ACC_ON_DEVICE (ACC_DEVICE_RADEON)) STOP 8
 !$ACC END PARALLEL
 
 
@@ -49,6 +51,11 @@
 #else
   IF (ACC_ON_DEVICE (ACC_DEVICE_NVIDIA)) STOP 13
 #endif
+#if ACC_DEVICE_TYPE_radeon
+  IF (.NOT. ACC_ON_DEVICE (ACC_DEVICE_RADEON)) STOP 14
+#else
+  IF (ACC_ON_DEVICE (ACC_DEVICE_RADEON)) STOP 15
+#endif
 !$ACC END PARALLEL
 
 #endif
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/acc_on_device-1-3.f b/libgomp/testsuite/libgomp.oacc-fortran/acc_on_device-1-3.f
index a8b9cddd1ae..5f500c19481 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/acc_on_device-1-3.f
+++ b/libgomp/testsuite/libgomp.oacc-fortran/acc_on_device-1-3.f
@@ -21,6 +21,7 @@
   IF (.NOT. ACC_ON_DEVICE (ACC_DEVICE_HOST)) STOP 2
   IF (ACC_ON_DEVICE (ACC_DEVICE_NOT_HOST)) STOP 3
   IF (ACC_ON_DEVICE (ACC_DEVICE_NVIDIA)) STOP 4
+

Enhance 'libgomp.oacc-c-c++-common/firstprivate-1.c' for non-'acc_device_nvidia'

2021-06-08 Thread Thomas Schwinge
Hi!

Pushed "Enhance 'libgomp.oacc-c-c++-common/firstprivate-1.c' for
non-'acc_device_nvidia'" to master branch in commit
292fb10bebf3c209f560d1590d2d70bf30b58018, see attached.


Grüße
 Thomas


-
Mentor Graphics (Deutschland) GmbH, Arnulfstrasse 201, 80634 München 
Registergericht München HRB 106955, Geschäftsführer: Thomas Heurung, Frank 
Thürauf
>From 292fb10bebf3c209f560d1590d2d70bf30b58018 Mon Sep 17 00:00:00 2001
From: Thomas Schwinge 
Date: Fri, 4 Jun 2021 15:42:22 +0200
Subject: [PATCH] Enhance 'libgomp.oacc-c-c++-common/firstprivate-1.c' for
 non-'acc_device_nvidia'

	libgomp/
	* testsuite/libgomp.oacc-c-c++-common/firstprivate-1.c: Enhance
	for non-'acc_device_nvidia'.
---
 libgomp/testsuite/libgomp.oacc-c-c++-common/firstprivate-1.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/firstprivate-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/firstprivate-1.c
index fff0c28e8ad..27da7654de9 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/firstprivate-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/firstprivate-1.c
@@ -79,7 +79,7 @@ void t2 ()
 void t3 ()
 {
   int a, b[N], c, d, i;
-  int n = acc_get_device_type () == acc_device_nvidia ? N : 1;
+  int n = acc_get_device_type () != acc_device_host ? N : 1;
 
   a = 5;
   for (i = 0; i < n; i++)
-- 
2.30.2



Re: [patch, openacc] Adjust tests for amdgcn offloading

2021-06-08 Thread Thomas Schwinge
Hi!

On 2019-12-13T17:43:57+, Andrew Stubbs  wrote:
> On 19/11/2019 12:21, Andrew Stubbs wrote:
>> This patch adds GCN special casing for most of the OpenACC libgomp tests
>> that require it.
>
> [...] I've gone ahead and committed the attached.

>   * testsuite/libgomp.oacc-c-c++-common/acc_prof-kernels-1.c: [Handle 
> gcn.]

> --- a/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-kernels-1.c
> +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-kernels-1.c
> @@ -106,6 +106,8 @@ static void cb_enqueue_launch_start (acc_prof_info 
> *prof_info, acc_event_info *e
>  assert (event_info->launch_event.vector_length >= 1);
>else if (acc_device_type == acc_device_nvidia) /* ... is special.  */
>  assert (event_info->launch_event.vector_length == 32);
> +  else if (acc_device_type == acc_device_gcn) /* ...and so is this.  */
> +assert (event_info->launch_event.vector_length == 64);
>else
>  {
>  #ifdef __OPTIMIZE__
> @@ -118,6 +120,8 @@ static void cb_enqueue_launch_start (acc_prof_info 
> *prof_info, acc_event_info *e
>
>if (acc_device_type == acc_device_host)
>  assert (api_info->device_api == acc_device_api_none);
> +  else if (acc_device_type == acc_device_gcn)
> +assert (api_info->device_api == acc_device_api_other);
>else
>  assert (api_info->device_api == acc_device_api_cuda);
>assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);

(Someone please scold me for making 'acc_device_api_cuda' the default
'else' case here, without 'if (acc_device_type == acc_device_nvidia)'...)

To make this testcase work in the current GCN setting, I've pushed "Fix
'libgomp.oacc-c-c++-common/acc_prof-kernels-1.c' for 'acc_device_radeon'"
to master branch in commit 984df1e1630f262d782c00cefad2643b8e8469f8, see
attached.  (... to be adjusted again, later...)


Grüße
 Thomas


-
Mentor Graphics (Deutschland) GmbH, Arnulfstrasse 201, 80634 München 
Registergericht München HRB 106955, Geschäftsführer: Thomas Heurung, Frank 
Thürauf
>From 984df1e1630f262d782c00cefad2643b8e8469f8 Mon Sep 17 00:00:00 2001
From: Thomas Schwinge 
Date: Sun, 6 Jun 2021 10:41:18 +0200
Subject: [PATCH] Fix 'libgomp.oacc-c-c++-common/acc_prof-kernels-1.c' for
 'acc_device_radeon'

... on top of r279378 (commit 26b74ed0223d108d7d7818c3c860f20cfe81a4af)
"Update OpenACC tests for amdgcn".

	libgomp/
	* testsuite/libgomp.oacc-c-c++-common/acc_prof-kernels-1.c: Fix
	for 'acc_device_radeon'.
---
 .../testsuite/libgomp.oacc-c-c++-common/acc_prof-kernels-1.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-kernels-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-kernels-1.c
index ad33f72e2fb..7f74ee922b7 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-kernels-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-kernels-1.c
@@ -93,6 +93,11 @@ static void cb_enqueue_launch_start (acc_prof_info *prof_info, acc_event_info *e
 }
   if (num_workers < 1)
 assert (event_info->launch_event.num_workers >= 1);
+  /* GCN currently enforces 'num_workers (1)'.  */
+  else if (acc_device_type == acc_device_radeon
+	   /*TODO ... just not in the "Parallelism dimensions: variable" case.  */
+	   && /*TODO*/ num_gangs != 22)
+assert (event_info->launch_event.num_workers == 1);
   else
 {
 #ifdef __OPTIMIZE__
-- 
2.30.2



Re: [committed, amdgcn] Update OpenACC testcases for amdgcn

2021-06-08 Thread Thomas Schwinge
Hi!

On 2020-01-20T16:53:49+, Andrew Stubbs  wrote:
> I've committed this testsuite-only patch to fix some test cases that
> need GCN-specific settings in order to pass.

>   * testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
>   (acc_gang): Recognise acc_device_radeon.
>   (acc_worker): Likewise.
>   (acc_vector): Likewise.

> --- a/libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
> +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
> @@ -14,7 +14,8 @@ static unsigned int __attribute__ ((optimize ("O2"))) 
> acc_gang ()
>  {
>if (acc_on_device ((int) acc_device_host))
>  return 0;
> -  else if (acc_on_device ((int) acc_device_nvidia))
> +  else if (acc_on_device ((int) acc_device_nvidia)
> +|| acc_on_device ((int) acc_device_radeon))
>  return __builtin_goacc_parlevel_id (GOMP_DIM_GANG);
>else
>  __builtin_abort ();
> @@ -25,7 +26,8 @@ static unsigned int __attribute__ ((optimize ("O2"))) 
> acc_worker ()
>  {
>if (acc_on_device ((int) acc_device_host))
>  return 0;
> -  else if (acc_on_device ((int) acc_device_nvidia))
> +  else if (acc_on_device ((int) acc_device_nvidia)
> +|| acc_on_device ((int) acc_device_radeon))
>  return __builtin_goacc_parlevel_id (GOMP_DIM_WORKER);
>else
>  __builtin_abort ();
> @@ -36,7 +38,8 @@ static unsigned int __attribute__ ((optimize ("O2"))) 
> acc_vector ()
>  {
>if (acc_on_device ((int) acc_device_host))
>  return 0;
> -  else if (acc_on_device ((int) acc_device_nvidia))
> +  else if (acc_on_device ((int) acc_device_nvidia)
> +|| acc_on_device ((int) acc_device_radeon))
>  return __builtin_goacc_parlevel_id (GOMP_DIM_VECTOR);
>else
>  __builtin_abort ();

Similar changes are necessary for
'libgomp.oacc-fortran/parallel-dims.f90', too -- and actually all that
can be simplified; pushed "Fix 'libgomp.oacc-fortran/parallel-dims.f90'
for 'acc_device_radeon'" to master branch in commit
32099c0d24adb93a031e0301ffd77b065b6f5472, see attached.

>   (main): Set expectations for amdgcn.

> @@ -282,6 +285,12 @@ int main ()
> /* The GCC nvptx back end enforces num_workers (32).  */
> workers_actual = 32;
>   }
> +  else if (acc_on_device (acc_device_radeon))
> + {
> +   /* The GCC GCN back end is limited to num_workers (16).
> +  Temporarily set this to 1 until multiple workers are permitted. */
> +   workers_actual = 1; // 16;
> + }

ACK; working on that.

> @@ -328,6 +337,11 @@ int main ()
> /* We're actually executing with num_workers (32).  */
> /* workers_actual = 32; */
>   }
> +  else if (acc_on_device (acc_device_radeon))
> + {
> +   /* The GCC GCN back end is limited to num_workers (16).  */
> +   workers_actual = 16;
> + }

That's surprising however, that here we're not similarly having to set
'workers_actual = 1' -- find the explanation in a forthcoming email.  ;-)

> [...]


Grüße
 Thomas


-
Mentor Graphics (Deutschland) GmbH, Arnulfstrasse 201, 80634 München 
Registergericht München HRB 106955, Geschäftsführer: Thomas Heurung, Frank 
Thürauf
>From 32099c0d24adb93a031e0301ffd77b065b6f5472 Mon Sep 17 00:00:00 2001
From: Thomas Schwinge 
Date: Fri, 4 Jun 2021 15:31:53 +0200
Subject: [PATCH] Fix 'libgomp.oacc-fortran/parallel-dims.f90' for
 'acc_device_radeon'

..., by simplifying 'libgomp.oacc-c-c++-common/parallel-dims.c', and updating
the former correspondingly.  '__builtin_goacc_parlevel_id' does the right thing
for all 'acc_device_*'.

Follow-up to commit 09e0ad6253f4330977e1b2f116b5e289dc2c2a02 "Update OpenACC
tests for amdgcn".

	libgomp/
	* testsuite/libgomp.oacc-c-c++-common/parallel-dims.c: Simplify.
	* testsuite/libgomp.oacc-fortran/parallel-dims-aux.c: Update.
---
 .../libgomp.oacc-c-c++-common/parallel-dims.c | 32 ---
 .../libgomp.oacc-fortran/parallel-dims-aux.c  | 31 --
 2 files changed, 12 insertions(+), 51 deletions(-)

diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
index c7412c2ef3a..974e1504534 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
@@ -10,42 +10,22 @@
 #include 
 #include 
 
-/* TODO: "(int) acc_device_*" casts because of the C++ acc_on_device wrapper
-   not behaving as expected for -O0.  */
 #pragma acc routine seq
-static unsigned int __attribute__ ((optimize ("O2"))) acc_gang ()
+static int acc_gang ()
 {
-  if (acc_on_device ((int) acc_device_host))
-return 0;
-  else if (acc_on_device ((int) acc_device_nvidia)
-	   || acc_on_device ((int) acc_device_radeon))
-return __builtin_goacc_parlevel_id (GOMP_DIM_GANG);
-  else
-__builtin_abort ();
+  return __builtin_goacc_parlevel_id (GOMP_DIM_GANG);
 }
 
 #pragma acc routine seq
-static unsigned int __attribute__ ((optimize ("O2"))) acc_wo

Enable more 'libgomp.oacc-*/lib-*' testcases for non-'openacc_nvidia_accel_selected'

2021-06-08 Thread Thomas Schwinge
Hi!

An old patch refreshed: pushed "Enable more 'libgomp.oacc-*/lib-*'
testcases for non-'openacc_nvidia_accel_selected'" to master branch in
commit c68ddd5e2a9dd0cfe21c3661404d7d4c323b23cf, see attached.


Grüße
 Thomas


-
Mentor Graphics (Deutschland) GmbH, Arnulfstrasse 201, 80634 München 
Registergericht München HRB 106955, Geschäftsführer: Thomas Heurung, Frank 
Thürauf
>From c68ddd5e2a9dd0cfe21c3661404d7d4c323b23cf Mon Sep 17 00:00:00 2001
From: Thomas Schwinge 
Date: Thu, 31 Oct 2019 17:40:13 +0100
Subject: [PATCH] Enable more 'libgomp.oacc-*/lib-*' testcases for
 non-'openacc_nvidia_accel_selected'

	libgomp/
	* testsuite/libgomp.oacc-c-c++-common/lib-11.c: Enable for all but
	'-DACC_MEM_SHARED=0'.
	* testsuite/libgomp.oacc-c-c++-common/lib-13.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/lib-14.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/lib-15.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/lib-20.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/lib-23.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/lib-24.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/lib-34.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/lib-42.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/lib-44.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/lib-48.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/lib-88.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/lib-89.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/lib-92.c: Likewise.
	* testsuite/libgomp.oacc-fortran/lib-14.f90: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/lib-5.c: Add
	'acc_device_radeon' testing.
	* testsuite/libgomp.oacc-c-c++-common/lib-6.c: Likewise.
	* testsuite/libgomp.oacc-fortran/lib-5.f90: Likewise.
	* testsuite/libgomp.oacc-fortran/lib-7.f90: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/lib-52.c: Enable for all.
	* testsuite/libgomp.oacc-c-c++-common/lib-53.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/lib-54.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/lib-86.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/lib-87.c: Likewise.
	* testsuite/libgomp.oacc-fortran/lib-10.f90: Likewise.
	* testsuite/libgomp.oacc-fortran/lib-8.f90: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/lib-57.c: Improve checking
	for non-'openacc_nvidia_accel_selected'.
	* testsuite/libgomp.oacc-c-c++-common/lib-58.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/lib-62.c: Clarify that "Not
	all implement this checking".
	* testsuite/libgomp.oacc-c-c++-common/lib-63.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/lib-64.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/lib-65.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/lib-67.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/lib-68.c: Likewise.
---
 .../libgomp.oacc-c-c++-common/lib-11.c| 10 ++--
 .../libgomp.oacc-c-c++-common/lib-13.c|  2 +-
 .../libgomp.oacc-c-c++-common/lib-14.c|  2 +-
 .../libgomp.oacc-c-c++-common/lib-15.c|  2 +-
 .../libgomp.oacc-c-c++-common/lib-20.c|  4 +-
 .../libgomp.oacc-c-c++-common/lib-23.c|  4 +-
 .../libgomp.oacc-c-c++-common/lib-24.c|  2 +-
 .../libgomp.oacc-c-c++-common/lib-34.c|  4 +-
 .../libgomp.oacc-c-c++-common/lib-42.c|  4 +-
 .../libgomp.oacc-c-c++-common/lib-44.c|  4 +-
 .../libgomp.oacc-c-c++-common/lib-48.c|  4 +-
 .../libgomp.oacc-c-c++-common/lib-5.c | 20 +++-
 .../libgomp.oacc-c-c++-common/lib-52.c|  6 +--
 .../libgomp.oacc-c-c++-common/lib-53.c|  6 +--
 .../libgomp.oacc-c-c++-common/lib-54.c|  6 +--
 .../libgomp.oacc-c-c++-common/lib-57.c|  2 +-
 .../libgomp.oacc-c-c++-common/lib-58.c|  2 +-
 .../libgomp.oacc-c-c++-common/lib-6.c | 47 ++-
 .../libgomp.oacc-c-c++-common/lib-62.c|  3 +-
 .../libgomp.oacc-c-c++-common/lib-63.c|  3 +-
 .../libgomp.oacc-c-c++-common/lib-64.c|  3 +-
 .../libgomp.oacc-c-c++-common/lib-65.c|  3 +-
 .../libgomp.oacc-c-c++-common/lib-67.c|  3 +-
 .../libgomp.oacc-c-c++-common/lib-68.c|  3 +-
 .../libgomp.oacc-c-c++-common/lib-86.c| 27 +++
 .../libgomp.oacc-c-c++-common/lib-87.c| 27 +++
 .../libgomp.oacc-c-c++-common/lib-88.c|  9 +---
 .../libgomp.oacc-c-c++-common/lib-89.c| 18 +++
 .../libgomp.oacc-c-c++-common/lib-92.c| 18 +++
 .../testsuite/libgomp.oacc-fortran/lib-10.f90 |  6 +--
 .../testsuite/libgomp.oacc-fortran/lib-14.f90 |  3 +-
 .../testsuite/libgomp.oacc-fortran/lib-5.f90  | 46 ++
 .../testsuite/libgomp.oacc-fortran/lib-7.f90  | 46 ++
 .../testsuite/libgomp.oacc-fortran/lib-8.f90  |  6 +--
 34 files changed, 225 insertions(+), 130 deletions(-)

diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-11.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-11.c
index 86cfeb68c5d..1f05161436c 100644
--- a/libgomp/testsuite/libgomp.oacc-c-

Re: [PATCH 6/7 amdgcn] Use a single worker for OpenACC on AMD GCN

2021-06-08 Thread Thomas Schwinge
Hi!

On 2019-11-12T13:29:15+, Andrew Stubbs  wrote:
> This patch prevents the compiler using multiple workers in a gang.

Almost.  The GCN back end fails to enforce this for the case of run-time
variable 'num_workers': that's 'dims[GOMP_DIM_WORKER] == 0', and the
current 'gcc/config/gcn/gcn.c:gcn_goacc_validate_dims' logic doesn't
consider that case:

/* Check the num workers is not too large.  */
if (dims[GOMP_DIM_WORKER] > max_workers)
  {
warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION,
OPT_Wopenacc_dims,
"using num_workers (%d), ignoring %d",
max_workers, dims[GOMP_DIM_WORKER]);
dims[GOMP_DIM_WORKER] = max_workers;

We could fix that either here, or simply in the GCN libgomp plugin.  I've
pushed "[GCN] Fix run-time variable 'num_workers'" to master branch in
commit 30656822b3792712c7a69fe1a0a79739f8f29abc, see attached.  As
detailed there, this actually affects/fixes a small number of testcases.

> This
> should be reverted when worker support is committed.

ACK; working on that.


Grüße
 Thomas


> 2019-11-12  Andrew Stubbs  
>   Julian Brown  
>
>   gcc/
>   * config/gcn/gcn.c (gcn_goacc_validate_dims): Ensure
>   flag_worker_partitioning is not set.
>   (TARGET_GOACC_WORKER_PARTITIONING): Remove target hook definition.
>   * config/gcn/gcn.opt (macc-experimental-workers): Default to off.
> ---
>  gcc/config/gcn/gcn.c   | 4 ++--
>  gcc/config/gcn/gcn.opt | 2 +-
>  2 files changed, 3 insertions(+), 3 deletions(-)
>
> diff --git a/gcc/config/gcn/gcn.c b/gcc/config/gcn/gcn.c
> index cdd24277cf6..1a69737f693 100644
> --- a/gcc/config/gcn/gcn.c
> +++ b/gcc/config/gcn/gcn.c
> @@ -4695,6 +4695,8 @@ gcn_goacc_validate_dims (tree decl, int dims[], int 
> fn_level,
>/* FIXME: remove -facc-experimental-workers when they're ready.  */
>int max_workers = flag_worker_partitioning ? 16 : 1;
>
> +  gcc_assert (!flag_worker_partitioning);
> +
>/* The vector size must appear to be 64, to the user, unless this is a
>   SEQ routine.  The real, internal value is always 1, which means use
>   autovectorization, but the user should not see that.  */
> @@ -6073,8 +6075,6 @@ print_operand (FILE *file, rtx x, int code)
>  #define TARGET_GOACC_REDUCTION gcn_goacc_reduction
>  #undef  TARGET_GOACC_VALIDATE_DIMS
>  #define TARGET_GOACC_VALIDATE_DIMS gcn_goacc_validate_dims
> -#undef  TARGET_GOACC_WORKER_PARTITIONING
> -#define TARGET_GOACC_WORKER_PARTITIONING true
>  #undef  TARGET_HARD_REGNO_MODE_OK
>  #define TARGET_HARD_REGNO_MODE_OK gcn_hard_regno_mode_ok
>  #undef  TARGET_HARD_REGNO_NREGS
> diff --git a/gcc/config/gcn/gcn.opt b/gcc/config/gcn/gcn.opt
> index bdc878f35ad..402deb625bd 100644
> --- a/gcc/config/gcn/gcn.opt
> +++ b/gcc/config/gcn/gcn.opt
> @@ -65,7 +65,7 @@ Target Report RejectNegative Var(flag_bypass_init_error)
>  bool flag_worker_partitioning = false
>
>  macc-experimental-workers
> -Target Report Var(flag_worker_partitioning) Init(1)
> +Target Report Var(flag_worker_partitioning) Init(0)
>
>  int stack_size_opt = -1
>


-
Mentor Graphics (Deutschland) GmbH, Arnulfstrasse 201, 80634 München 
Registergericht München HRB 106955, Geschäftsführer: Thomas Heurung, Frank 
Thürauf
>From 30656822b3792712c7a69fe1a0a79739f8f29abc Mon Sep 17 00:00:00 2001
From: Thomas Schwinge 
Date: Sat, 5 Jun 2021 22:39:21 +0200
Subject: [PATCH] [GCN] Fix run-time variable 'num_workers'

... which currently has *not* been forced to 'num_workers (1)'.

In addition to the testcases modified here, this also fixes:

FAIL: libgomp.oacc-c/../libgomp.oacc-c-c++-common/mode-transitions.c -DACC_DEVICE_TYPE_radeon=1 -DACC_MEM_SHARED=0 -foffload=amdgcn-amdhsa  -O0  execution test
[Etc.]

mode-transitions.exe: [...]/libgomp.oacc-c-c++-common/mode-transitions.c:702: t17: Assertion `arr_b[i] == (i ^ 31) * 8' failed.

	libgomp/
	* plugin/plugin-gcn.c (gcn_exec): Force 'num_workers (1)'
	unconditionally.
	* testsuite/libgomp.oacc-c-c++-common/acc_prof-kernels-1.c:
	Update.
	* testsuite/libgomp.oacc-c-c++-common/parallel-dims.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/routine-wv-2.c: Likewise.
---
 libgomp/plugin/plugin-gcn.c  | 5 ++---
 .../testsuite/libgomp.oacc-c-c++-common/acc_prof-kernels-1.c | 4 +---
 libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c  | 5 +++--
 libgomp/testsuite/libgomp.oacc-c-c++-common/routine-wv-2.c   | 3 ++-
 4 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/libgomp/plugin/plugin-gcn.c b/libgomp/plugin/plugin-gcn.c
index 8aab708b0ef..cfed42a2d4d 100644
--- a/libgomp/plugin/plugin-gcn.c
+++ b/libgomp/plugin/plugin-gcn.c
@@ -3041,10 +3041,9 @@ gcn_exec (struct kernel_info *kernel, size_t mapnum, void **hostaddrs,
  problem size, so let's do a reasonable number of single-worker gangs.
  64 gangs matches a typical Fiji device.  */
 
-  /* NOTE: Until support for middle-end 

Re: [PATCH V3] Split loop for NE condition.

2021-06-08 Thread Richard Biener
On Fri, 4 Jun 2021, Jiufu Guo wrote:

> Update the patch since v2:
> . Check index and bound from gcond before checking if wrap.
> . Update test case, and add an executable case.
> . Refine code comments.
> . Enhance the checking for i++/++i in the loop header.
> . Enhance code to handle equal condition on exit
> 
> Bootstrap and regtest pass on powerpc64le, and also pass regtest
> on bootstrap-O3. Is this ok for trunk?
> 
> BR.
> Jiufu Guo.
> 
> 
> When there is the possibility that wrap may happen on the loop index,
> a few optimizations would not happen. For example code:
> 
> foo (int *a, int *b, unsigned k, unsigned n)
> {
>   while (++k != n)
> a[k] = b[k]  + 1;
> }
> 
> For this code, if "k > n", k would wrap.  If "k < n" at the beginning,
> it could be optimized (e.g. vectorization).
> 
> We can split the loop into two loops:
> 
>   while (++k > n)
> a[k] = b[k]  + 1;
>   while (k++ < n)
> a[k] = b[k]  + 1;
> 
> This patch splits this kind of loop to achieve better performance.
> 
> gcc/ChangeLog:
> 
> 2021-06-04  Jiufu Guo  
> 
>   * tree-ssa-loop-split.c (connect_loop_phis): Add new param.
>   (get_ne_cond_branch): New function.
>   (split_ne_loop): New function.
>   (split_loop_on_ne_cond): New function.
>   (tree_ssa_split_loops): Use split_loop_on_ne_cond.
> 
> gcc/testsuite/ChangeLog:
> 
> 2021-06-04  Jiufu Guo  
> 
>   * gcc.dg/loop-split1.c: New test.
>   * gcc.dg/loop-split2.c: New test.
>   * g++.dg/vect/pr98064.cc: Suppress warning.
> 
> ---
>  gcc/testsuite/g++.dg/vect/pr98064.cc |   4 +-
>  gcc/testsuite/gcc.dg/loop-split1.c   | 101 +++
>  gcc/testsuite/gcc.dg/loop-split2.c   |  54 ++
>  gcc/tree-ssa-loop-split.c| 251 ++-
>  4 files changed, 404 insertions(+), 6 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/loop-split1.c
>  create mode 100644 gcc/testsuite/gcc.dg/loop-split2.c
> 
> diff --git a/gcc/testsuite/g++.dg/vect/pr98064.cc 
> b/gcc/testsuite/g++.dg/vect/pr98064.cc
> index 74043ce7725..dcb2985d05a 100644
> --- a/gcc/testsuite/g++.dg/vect/pr98064.cc
> +++ b/gcc/testsuite/g++.dg/vect/pr98064.cc
> @@ -1,5 +1,7 @@
>  // { dg-do compile }
> -// { dg-additional-options "-O3" }
> +// { dg-additional-options "-O3 -Wno-stringop-overflow" }
> +/* There is warning message when "short g = var_8; g; g++"
> +   is optimized/analyzed as string operation,e.g. memset.  */
>  
>  const long long &min(const long long &__a, long long &__b) {
>if (__b < __a)
> diff --git a/gcc/testsuite/gcc.dg/loop-split1.c 
> b/gcc/testsuite/gcc.dg/loop-split1.c
> new file mode 100644
> index 000..dd2d03a7b96
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/loop-split1.c
> @@ -0,0 +1,101 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fsplit-loops -fdump-tree-lsplit-details" } */
> +
> +void
> +foo (int *a, int *b, unsigned l, unsigned n)
> +{
> +  while (++l != n)
> +a[l] = b[l] + 1;
> +}
> +void
> +foo_1 (int *a, int *b, unsigned n)
> +{
> +  unsigned l = 0;
> +  while (++l != n)
> +a[l] = b[l] + 1;
> +}
> +
> +void
> +foo1 (int *a, int *b, unsigned l, unsigned n)
> +{
> +  while (l++ != n)
> +a[l] = b[l] + 1;
> +}
> +
> +/* No wrap.  */
> +void
> +foo1_1 (int *a, int *b, unsigned n)
> +{
> +  unsigned l = 0;
> +  while (l++ != n)
> +a[l] = b[l] + 1;
> +}
> +
> +unsigned
> +foo2 (char *a, char *b, unsigned l, unsigned n)
> +{
> +  while (++l != n)
> +if (a[l] != b[l])
> +  break;
> +
> +  return l;
> +}
> +
> +unsigned
> +foo2_1 (char *a, char *b, unsigned l, unsigned n)
> +{
> +  l = 0;
> +  while (++l != n)
> +if (a[l] != b[l])
> +  break;
> +
> +  return l;
> +}
> +
> +unsigned
> +foo3 (char *a, char *b, unsigned l, unsigned n)
> +{
> +  while (l++ != n)
> +if (a[l] != b[l])
> +  break;
> +
> +  return l;
> +}
> +
> +/* No wrap.  */
> +unsigned
> +foo3_1 (char *a, char *b, unsigned l, unsigned n)
> +{
> +  l = 0;
> +  while (l++ != n)
> +if (a[l] != b[l])
> +  break;
> +
> +  return l;
> +}
> +
> +void
> +bar ();
> +void
> +foo4 (unsigned n, unsigned i)
> +{
> +  do
> +{
> +  if (i == n)
> + return;
> +  bar ();
> +  ++i;
> +}
> +  while (1);
> +}
> +
> +unsigned
> +find_skip_diff (char *p, char *q, unsigned n, unsigned i)
> +{
> +  while (p[i] == q[i] && ++i != n)
> +p++, q++;
> +
> +  return i;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "Loop split" 8 "lsplit" } } */
> diff --git a/gcc/testsuite/gcc.dg/loop-split2.c 
> b/gcc/testsuite/gcc.dg/loop-split2.c
> new file mode 100644
> index 000..0d3fded3f61
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/loop-split2.c
> @@ -0,0 +1,54 @@
> +/* { dg-do run } */
> +/* { dg-options "-O3" } */
> +
> +extern void abort (void);
> +extern void exit (int);
> +
> +#define NI __attribute__ ((noinline))
> +
> +void NI
> +foo (int *a, int *b, unsigned char l, unsigned char n)
> +{
> +  while (++l != n)
> +a[l] = b[l] + 1;
> +}
> +
> +unsigned NI
> +bar (int *a, int *b, unsigne

Re: [PATCH] arc: Add --with-fpu support for ARCv2 cpus

2021-06-08 Thread Bernhard Reutner-Fischer via Gcc-patches
On Tue, 8 Jun 2021 10:05:28 +0300
Claudiu Zissulescu  wrote:

> Thank you for your input.
> 
> I have made an update using grep's ERE. Please let me know if it is ok.

I would have written [[:space:]]* instead of [[:space:]]+ to handle
potentially missing space, at least after the comma but also before the
comma to avoid surprises for new names in the future.
Furthermore | alone would be [[:blank:]]* but as you prefer.

grep ... > /dev/null would be grep -q, which has been mandated by POSIX since
at least SUSv2, so it has been safe to use for quite some time now.

Instead of the redundant 'true' calls, I'd usually write ':'.
E.g.
if grep -q ... ; then :
else echo "nah"; exit 1
fi

Which could be shortened to
if ! grep -q ...
then
  echo "nah"
  exit 1
fi

to avoid any questions about an empty arm in the first place.

ISTM you only set the expected flags in the switch, so I would have
set only that variable and grepped only once after the switch, for
brevity.

Either way, thanks for not using grep -P :)
thanks,


Re: [PATCH] predcom: Adjust some unnecessary update_ssa calls

2021-06-08 Thread Richard Biener via Gcc-patches
On Tue, Jun 8, 2021 at 11:31 AM Kewen.Lin  wrote:
>
> on 2021/6/7 下午10:46, Richard Biener wrote:
> > On Wed, Jun 2, 2021 at 11:29 AM Kewen.Lin  wrote:
> >>
> >> Hi,
> >>
> >> As Richi suggested in PR100794, this patch is to remove
> >> some unnecessary update_ssa calls with flag
> >> TODO_update_ssa_only_virtuals, also do some refactoring.
> >>
> >> Bootstrapped/regtested on powerpc64le-linux-gnu P9,
> >> x86_64-redhat-linux and aarch64-linux-gnu, built well
> >> on Power9 ppc64le with --with-build-config=bootstrap-O3,
> >> and passed both P8 and P9 SPEC2017 full build with
> >> {-O3, -Ofast} + {,-funroll-loops}.
> >>
> >> Is it ok for trunk?
> >
> > LGTM, minor comment on the fancy C++:
> >
> > +  auto cleanup = [&]() {
> > +release_chains (chains);
> > +free_data_refs (datarefs);
> > +BITMAP_FREE (looparound_phis);
> > +free_affine_expand_cache (&name_expansions);
> > +  };
> >
> > +  cleanup ();
> > +  return 0;
> >
> > so that could have been
> >
> >   class cleanup {
> >  ~cleanup()
> > {
> >   release_chains (chains);
> >   free_data_refs (datarefs);
> >   BITMAP_FREE (looparound_phis);
> >   free_affine_expand_cache (&name_expansions);
> > }
> >   } cleanup;
> >
> > ?  Or some other means of registering a RAII-style cleanup?
> > I mean, we can't wrap it all in
> >
> >   try {...}
> >   finally {...}
> >
> > because C++ doesn't have finally.
> >
> > OK with this tiny part of the C++ refactoring delayed, but we can also 
> > simply
> > discuss best options.  At least for looparound_phis a good cleanup would
> > be to pass the bitmap around and use auto_bitmap local to
> > tree_predictive_commoning_loop ...
> >
>
> Thanks Richi!  One draft (not ready for review) is attached for the further
> discussion.  It follows the idea of RAII-style cleanup.  I noticed that
> Martin suggested going a step further and making tree_predictive_commoning_loop
> and its callees into one class (thanks Martin).  Since there are not many
> C++-style work functions of this kind, I want to double-check which option
> you guys prefer.
>
> One point you might have noticed is that making tree_predictive_commoning_loop
> and its callees member functions of one class avoids passing the bitmap
> looparound_phis all around, as the draft currently does.  :)

Such general cleanup is of course desired - Giuliano started some of it within
GSoC two years ago in the attempt to thread the compilation process.  The
cleanup then helps to get rid of global state which of course interferes here
(and avoids unnecessary use of TLS vars).

So yes, encapsulating global state into a class and making accessors
member functions is something that is desired (but a lot of mechanical
work).

Thanks
Richard.

> BR,
> Kewen
>


Re: [PATCH] predcom: Adjust some unnecessary update_ssa calls

2021-06-08 Thread Richard Biener via Gcc-patches
On Tue, Jun 8, 2021 at 1:02 PM Richard Biener
 wrote:
>
> On Tue, Jun 8, 2021 at 11:31 AM Kewen.Lin  wrote:
> >
> > on 2021/6/7 下午10:46, Richard Biener wrote:
> > > On Wed, Jun 2, 2021 at 11:29 AM Kewen.Lin  wrote:
> > >>
> > >> Hi,
> > >>
> > >> As Richi suggested in PR100794, this patch is to remove
> > >> some unnecessary update_ssa calls with flag
> > >> TODO_update_ssa_only_virtuals, also do some refactoring.
> > >>
> > >> Bootstrapped/regtested on powerpc64le-linux-gnu P9,
> > >> x86_64-redhat-linux and aarch64-linux-gnu, built well
> > >> on Power9 ppc64le with --with-build-config=bootstrap-O3,
> > >> and passed both P8 and P9 SPEC2017 full build with
> > >> {-O3, -Ofast} + {,-funroll-loops}.
> > >>
> > >> Is it ok for trunk?
> > >
> > > LGTM, minor comment on the fancy C++:
> > >
> > > +  auto cleanup = [&]() {
> > > +release_chains (chains);
> > > +free_data_refs (datarefs);
> > > +BITMAP_FREE (looparound_phis);
> > > +free_affine_expand_cache (&name_expansions);
> > > +  };
> > >
> > > +  cleanup ();
> > > +  return 0;
> > >
> > > so that could have been
> > >
> > >   class cleanup {
> > >  ~cleanup()
> > > {
> > >   release_chains (chains);
> > >   free_data_refs (datarefs);
> > >   BITMAP_FREE (looparound_phis);
> > >   free_affine_expand_cache (&name_expansions);
> > > }
> > >   } cleanup;
> > >
> > > ?  Or some other means of registering a RAII-style cleanup?
> > > I mean, we can't wrap it all in
> > >
> > >   try {...}
> > >   finally {...}
> > >
> > > because C++ doesn't have finally.
> > >
> > > OK with this tiny part of the C++ refactoring delayed, but we can also 
> > > simply
> > > discuss best options.  At least for looparound_phis a good cleanup would
> > > be to pass the bitmap around and use auto_bitmap local to
> > > tree_predictive_commoning_loop ...
> > >
> >
> > Thanks Richi!  One draft (not ready for review) is attached for the further
> > discussion.  It follows the idea of RAII-style cleanup.  I noticed that
> > Martin suggested going a step further and making tree_predictive_commoning_loop
> > and its callees into one class (thanks Martin).  Since there are not many
> > C++-style work functions of this kind, I want to double-check which option
> > you guys prefer.
> >
> > One point you might have noticed is that making tree_predictive_commoning_loop
> > and its callees member functions of one class avoids passing the bitmap
> > looparound_phis all around, as the draft currently does.  :)
>
> Such general cleanup is of course desired - Giuliano started some of it within
> GSoC two years ago in the attempt to thread the compilation process.  The
> cleanup then helps to get rid of global state which of course interferes here
> (and avoids unnecessary use of TLS vars).
>
> So yes, encapsulating global state into a class and making accessors
> member functions is something that is desired (but a lot of mechanical
> work).

Btw, the patch you posted is OK with me as well, it achieves the global
state removal, too.

Richard.

> Thanks
> Richard.
>
> > BR,
> > Kewen
> >


Re: [PATCH] arm: Auto-vectorization for MVE and Neon: vhadd/vrhadd

2021-06-08 Thread Richard Sandiford via Gcc-patches
Christophe Lyon  writes:
> On Wed, 2 Jun 2021 at 20:19, Richard Sandiford
>  wrote:
>>
>> Christophe Lyon  writes:
>> > This patch adds support for auto-vectorization of average value
>> > computation using vhadd or vrhadd, for both MVE and Neon.
>> >
>> > The patch adds the needed [u]avg3_[floor|ceil] patterns to
>> > vec-common.md, I'm not sure how to factorize them without introducing
>> > an unspec iterator?
>>
>> Yeah, an int iterator would be one way, but I'm not sure it would
>> make things better given the differences in how Neon and MVE handle
>> their unspecs.
>>
>> > It also adds tests for 'floor' and for 'ceil', each for MVE and Neon.
>> >
>> > Vectorization works with 8-bit and 16-bit input/output vectors, but
>> > not with 32-bit ones because the vectorizer expects wider types
>> > to be available for the intermediate values, but int32_t + int32_t does
>> > not involve wider types in the IR.
>>
>> Right.  Like you say, it's only valid to use V(R)HADD if, in the source
>> code, the addition and shift have a wider precision than the operands.
>> That happens naturally for 8-bit and 16-bit operands, since C arithmetic
>> promotes them to "int" first.  But for 32-bit operands, the C code needs
>> to do the addition and shift in 64 bits.  Doing them in 64 bits should
>> be fine for narrower operands too.
>>
>> So:
>>
>> > diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vhadd-1.c 
>> > b/gcc/testsuite/gcc.target/arm/simd/mve-vhadd-1.c
>> > new file mode 100644
>> > index 000..40489ecc67d
>> > --- /dev/null
>> > +++ b/gcc/testsuite/gcc.target/arm/simd/mve-vhadd-1.c
>> > @@ -0,0 +1,31 @@
>> > +/* { dg-do compile } */
>> > +/* { dg-require-effective-target arm_v8_1m_mve_ok } */
>> > +/* { dg-add-options arm_v8_1m_mve } */
>> > +/* { dg-additional-options "-O3" } */
>> > +
>> > +#include 
>> > +
>> > +#define FUNC(SIGN, TYPE, BITS, OP, NAME) \
>> > +  void test_ ## NAME ##_ ## SIGN ## BITS (TYPE##BITS##_t * __restrict__ 
>> > dest, \
>> > +   TYPE##BITS##_t *a, TYPE##BITS##_t 
>> > *b) { \
>> > +int i;   \
>> > +for (i=0; i < (128 / BITS); i++) {
>> >\
>> > +  dest[i] = (a[i] OP b[i]) >> 1; \
>> > +} 
>> >\
>> > +}
>> > +
>>
>> …it should work if you make this "((int64_t) a[i] OP b[i]) >> 1".
>
> Indeed. However, this may not be obvious for end-users :-(
>
> I've updated my patch as attached: added the (int64_t) cast and
> removed the xfail clauses.
>
> OK for trunk?
>
> Thanks,
>
> Christophe
>
>>
>> > As noted in neon-vhadd-1.c, I couldn't write a test able to use Neon
>> > vectorization with 64-bit vectors: we default to
>> > -mvectorize-with-neon-quad, and attempts to use
>> > -mvectorize-with-neon-double resulted in much worse code, which this
>> > patch does not aim at improving.
>>
>> I guess this is because the MVE_2 mode iterators only include 128-bit types.
>> Leaving Neon double as future work sounds good though.
> Note that I am focusing on MVE enablement at the moment.

Right.  I meant “possible future work by someone somewhere”. :-)

>> And yeah, the code for V(R)HADD-equivalent operations is much worse when
>> V(R)HADD isn't available, since the compiler really does need to double
>> the precision of the operands, do double-precision addition,
>> do double-precision shifts, and then truncate back.  So this looks
>> like the expected behaviour.
>>
>> Thanks,
>> Richard
>
> From 493693b5c2f4e5fee7408062785930f723f2bd85 Mon Sep 17 00:00:00 2001
> From: Christophe Lyon 
> Date: Thu, 27 May 2021 20:11:28 +
> Subject: [PATCH v2] arm: Auto-vectorization for MVE and Neon: vhadd/vrhadd
>
> This patch adds support for auto-vectorization of average value
> computation using vhadd or vrhadd, for both MVE and Neon.
>
> The patch adds the needed [u]avg3_[floor|ceil] patterns to
> vec-common.md, I'm not sure how to factorize them without introducing
> an unspec iterator?
>
> It also adds tests for 'floor' and for 'ceil', each for MVE and Neon.
>
> Vectorization works with 8-bit and 16-bit input/output vectors, but
> not with 32-bit ones because the vectorizer expects wider types
> to be available for the intermediate values, but int32_t + int32_t does
> not involve wider types in the IR.
>
> As noted in neon-vhadd-1.c, I couldn't write a test able to use Neon
> vectorization with 64-bit vectors: we default to
> -mvectorize-with-neon-quad, and attempts to use
> -mvectorize-with-neon-double resulted in much worse code, which this
> patch does not aim at improving.

The above needs updating.

> 2021-05-31  Christophe Lyon  
>
>   gcc/
>   * gcc/config/arm/mve.md (mve_vhaddq_): Prefix with '@'.
>   (@mve_vrhaddq_   * gcc/config/arm/neon.md (neon_vhadd): Likewise.
>   * config/arm/vec-common.md (avg3_flo

Re: [PATCH 1/2] arm: Auto-vectorization for MVE: vclz

2021-06-08 Thread Richard Sandiford via Gcc-patches
Christophe Lyon  writes:
> This patch adds support for auto-vectorization of clz for MVE.
>
> It does so by removing the unspec from mve_vclzq_ and using
> 'clz' instead. It moves the neon_vclz expander from neon.md to
> vec-common.md and renames it to the standard name clz2.
>
> 2021-06-03  Christophe Lyon  
>
>   gcc/
>   * config/arm/iterators.md (): Remove VCLZQ_U, VCLZQ_S.
>   (VCLZQ): Remove.
>   * config/arm/mve.md (mve_vclzq_): Add '@' prefix,
>   remove  iterator.
>   (mve_vclzq_u): New.
>   * config/arm/neon.md (clz2): Rename to neon_vclz.
>   (neon_vclz   * config/arm/unspecs.md (VCLZQ_U, VCLZQ_S): Remove.
>   * config/arm/vec-common.md ... here. Add support for MVE.
>
>   gcc/testsuite/
>   * gcc.target/arm/simd/mve-vclz.c: New test.
> ---
>  gcc/config/arm/iterators.md  |  3 +--
>  gcc/config/arm/mve.md| 12 ++---
>  gcc/config/arm/neon.md   | 11 +---
>  gcc/config/arm/unspecs.md|  2 --
>  gcc/config/arm/vec-common.md | 13 +
>  gcc/testsuite/gcc.target/arm/simd/mve-vclz.c | 28 
>  6 files changed, 52 insertions(+), 17 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-vclz.c
>
> diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
> index 3042bafc6c6..5c4fe895268 100644
> --- a/gcc/config/arm/iterators.md
> +++ b/gcc/config/arm/iterators.md
> @@ -1288,7 +1288,7 @@ (define_int_attr supf [(VCVTQ_TO_F_S "s") (VCVTQ_TO_F_U 
> "u") (VREV16Q_S "s")
>  (VMOVLBQ_U "u") (VCVTQ_FROM_F_S "s") (VCVTQ_FROM_F_U "u")
>  (VCVTPQ_S "s") (VCVTPQ_U "u") (VCVTNQ_S "s")
>  (VCVTNQ_U "u") (VCVTMQ_S "s") (VCVTMQ_U "u")
> -(VCLZQ_U "u") (VCLZQ_S "s") (VREV32Q_U "u")
> +(VREV32Q_U "u")
>  (VREV32Q_S "s") (VADDLVQ_U "u") (VADDLVQ_S "s")
>  (VCVTQ_N_TO_F_S "s") (VCVTQ_N_TO_F_U "u")
>  (VCREATEQ_U "u") (VCREATEQ_S "s") (VSHRQ_N_S "s")
> @@ -1538,7 +1538,6 @@ (define_int_iterator VCVTQ_FROM_F [VCVTQ_FROM_F_S 
> VCVTQ_FROM_F_U])
>  (define_int_iterator VREV16Q [VREV16Q_U VREV16Q_S])
>  (define_int_iterator VCVTAQ [VCVTAQ_U VCVTAQ_S])
>  (define_int_iterator VDUPQ_N [VDUPQ_N_U VDUPQ_N_S])
> -(define_int_iterator VCLZQ [VCLZQ_U VCLZQ_S])
>  (define_int_iterator VADDVQ [VADDVQ_U VADDVQ_S])
>  (define_int_iterator VREV32Q [VREV32Q_U VREV32Q_S])
>  (define_int_iterator VMOVLBQ [VMOVLBQ_S VMOVLBQ_U])
> diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
> index 04aa612331a..99e46d0bc69 100644
> --- a/gcc/config/arm/mve.md
> +++ b/gcc/config/arm/mve.md
> @@ -435,16 +435,22 @@ (define_insn "mve_vdupq_n_"
>  ;;
>  ;; [vclzq_u, vclzq_s])
>  ;;
> -(define_insn "mve_vclzq_"
> +(define_insn "@mve_vclzq_s"
>[
> (set (match_operand:MVE_2 0 "s_register_operand" "=w")
> - (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "w")]
> -  VCLZQ))
> + (clz:MVE_2 (match_operand:MVE_2 1 "s_register_operand" "w")))
>]
>"TARGET_HAVE_MVE"
>"vclz.i%#  %q0, %q1"
>[(set_attr "type" "mve_move")
>  ])
> +(define_expand "mve_vclzq_u"
> +  [
> +   (set (match_operand:MVE_2 0 "s_register_operand")
> + (clz:MVE_2 (match_operand:MVE_2 1 "s_register_operand")))
> +  ]
> +  "TARGET_HAVE_MVE"
> +)
>  
>  ;;
>  ;; [vclsq_s])
> diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
> index 18571d819eb..0fdffaf4ec4 100644
> --- a/gcc/config/arm/neon.md
> +++ b/gcc/config/arm/neon.md
> @@ -3018,7 +3018,7 @@ (define_insn "neon_vcls"
>[(set_attr "type" "neon_cls")]
>  )
>  
> -(define_insn "clz2"
> +(define_insn "neon_vclz"
>[(set (match_operand:VDQIW 0 "s_register_operand" "=w")
>  (clz:VDQIW (match_operand:VDQIW 1 "s_register_operand" "w")))]
>"TARGET_NEON"
> @@ -3026,15 +3026,6 @@ (define_insn "clz2"
>[(set_attr "type" "neon_cnt")]
>  )
>  
> -(define_expand "neon_vclz"
> -  [(match_operand:VDQIW 0 "s_register_operand")
> -   (match_operand:VDQIW 1 "s_register_operand")]
> -  "TARGET_NEON"
> -{
> -  emit_insn (gen_clz2 (operands[0], operands[1]));
> -  DONE;
> -})
> -
>  (define_insn "popcount2"
>[(set (match_operand:VE 0 "s_register_operand" "=w")
>  (popcount:VE (match_operand:VE 1 "s_register_operand" "w")))]
> diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md
> index ed1bc293b78..ad1c6edd005 100644
> --- a/gcc/config/arm/unspecs.md
> +++ b/gcc/config/arm/unspecs.md
> @@ -556,8 +556,6 @@ (define_c_enum "unspec" [
>VQABSQ_S
>VDUPQ_N_U
>VDUPQ_N_S
> -  VCLZQ_U
> -  VCLZQ_S
>VCLSQ_S
>VADDVQ_S
>VADDVQ_U
> diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md
> index 2779c1a8aaa..1ba1e5eb008 100644
> --- a/gcc/config/arm/vec-common.md
> +++ b/gcc/config/arm/vec-common.md
> @@ -625,3 +625,16 @@ (define_expand "uavg3_ceil"
>  

[PATCH 00/11] stdx::simd optimizations, corrections, and cleanups

2021-06-08 Thread Matthias Kretz
The following patches mostly contain code cleanups and minor corrections. The 
major feature in this patchset is the last patch, which should make the use of 
stdx::simd much safer wrt. ODR violations involuntarily introduced by linking 
TUs that were compiled with different -m and floating-point flags.

Matthias Kretz (11):
  libstdc++: Improve copysign codegen
  libstdc++: Remove dead code
  libstdc++: Improve fixed_size codegen
  libstdc++: Make use of __builtin_bit_cast
  libstdc++: Remove incorrect fabs overload
  libstdc++: Minor simd_math cleanups
  libstdc++: Fix condition when AVX512F ldexp implementation is used
  libstdc++: Avoid raising fp exceptions in trunc, floor, and ceil
  libstdc++: Ensure unrolled loops inline the lambda
  libstdc++: Fix internal names: add missing underscores
  libstdc++: Fix ODR issues with different -m flags

 libstdc++-v3/include/experimental/bits/simd.h | 438 --
 .../include/experimental/bits/simd_builtin.h  |  48 +-
 .../experimental/bits/simd_converter.h|   2 +-
 .../include/experimental/bits/simd_detail.h   |  40 ++
 .../experimental/bits/simd_fixed_size.h   |  95 ++--
 .../include/experimental/bits/simd_math.h | 107 ++---
 .../include/experimental/bits/simd_neon.h |   4 +-
 .../include/experimental/bits/simd_ppc.h  |   4 +-
 .../include/experimental/bits/simd_scalar.h   |  71 ++-
 .../include/experimental/bits/simd_x86.h  |  33 +-
 .../simd/tests/bits/test_values.h |   8 +-
 11 files changed, 540 insertions(+), 310 deletions(-)

-- 
──
 Dr. Matthias Kretz   https://mattkretz.github.io
 GSI Helmholtz Centre for Heavy Ion Research   https://gsi.de
 std::experimental::simd  https://github.com/VcDevel/std-simd
──



Re: [PATCH 2/2] arm: Auto-vectorization for MVE: add pack/unpack patterns

2021-06-08 Thread Richard Sandiford via Gcc-patches
Christophe Lyon  writes:
> This patch adds vec_unpack_hi_, vec_unpack_lo_,
> vec_pack_trunc_ patterns for MVE.
>
> It does so by moving the unpack patterns from neon.md to
> vec-common.md, while adding MVE support to them. The pack expander is
> derived from the Neon one (which in turn is renamed into
> neon_quad_vec_pack_trunc_).
>
> The patch introduces mve_vec_pack_trunc_ to avoid the need for a
> zero-initialized temporary, which is needed if the
> vec_pack_trunc_ expander calls @mve_vmovn[bt]q_
> instead.
>
> With this patch, we can now vectorize the 16 and 8-bit versions of
> vclz and vshl, although the generated code could still be improved.
> For test_clz_s16, we now generate
> vldrh.16q3, [r1]
> vmovlb.s16   q2, q3
> vmovlt.s16   q3, q3
> vclz.i32  q2, q2
> vclz.i32  q3, q3
> vmovnb.i32  q1, q2
> vmovnt.i32  q1, q3
> vstrh.16q1, [r0]
> which could be improved to
> vldrh.16q3, [r1]
>   vclz.i16q1, q3
> vstrh.16q1, [r0]
> if we could avoid the need for unpack/pack steps.

Yeah, there was a PR about fixing this for popcount.  I guess the same
approach would apply here too.

> For reference, clang-12 generates:
>   vldrh.s32   q0, [r1]
>   vldrh.s32   q1, [r1, #8]
>   vclz.i32q0, q0
>   vstrh.32q0, [r0]
>   vclz.i32q0, q1
>   vstrh.32q0, [r0, #8]
>
> 2021-06-03  Christophe Lyon  
>
>   gcc/
>   * config/arm/mve.md (mve_vmovltq_): Prefix with '@'.
>   (mve_vmovlbq_): Likewise.
>   (mve_vmovnbq_): Likewise.
>   (mve_vmovntq_): Likewise.
>   (@mve_vec_pack_trunc_): New pattern.
>   * config/arm/neon.md (vec_unpack_hi_): Move to
>   vec-common.md.
>   (vec_unpack_lo_): Likewise.
>   (vec_pack_trunc_): Rename to
>   neon_quad_vec_pack_trunc_.
>   * config/arm/vec-common.md (vec_unpack_hi_): New
>   pattern.
>   (vec_unpack_lo_): New.
>   (vec_pack_trunc_): New.
>
>   gcc/testsuite/
>   * gcc.target/arm/simd/mve-vclz.c: Update expected results.
>   * gcc.target/arm/simd/mve-vshl.c: Likewise.
> ---
>  gcc/config/arm/mve.md| 20 -
>  gcc/config/arm/neon.md   | 39 +
>  gcc/config/arm/vec-common.md | 89 
>  gcc/testsuite/gcc.target/arm/simd/mve-vclz.c |  7 +-
>  gcc/testsuite/gcc.target/arm/simd/mve-vshl.c |  5 +-
>  5 files changed, 114 insertions(+), 46 deletions(-)
>
> diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
> index 99e46d0bc69..b18292c07d3 100644
> --- a/gcc/config/arm/mve.md
> +++ b/gcc/config/arm/mve.md
> @@ -510,7 +510,7 @@ (define_insn "mve_vrev32q_"
>  ;;
>  ;; [vmovltq_u, vmovltq_s])
>  ;;
> -(define_insn "mve_vmovltq_"
> +(define_insn "@mve_vmovltq_"
>[
> (set (match_operand: 0 "s_register_operand" "=w")
>   (unspec: [(match_operand:MVE_3 1 "s_register_operand" 
> "w")]
> @@ -524,7 +524,7 @@ (define_insn "mve_vmovltq_"
>  ;;
>  ;; [vmovlbq_s, vmovlbq_u])
>  ;;
> -(define_insn "mve_vmovlbq_"
> +(define_insn "@mve_vmovlbq_"
>[
> (set (match_operand: 0 "s_register_operand" "=w")
>   (unspec: [(match_operand:MVE_3 1 "s_register_operand" 
> "w")]
> @@ -2187,7 +2187,7 @@ (define_insn "mve_vmlsldavxq_s"
>  ;;
>  ;; [vmovnbq_u, vmovnbq_s])
>  ;;
> -(define_insn "mve_vmovnbq_"
> +(define_insn "@mve_vmovnbq_"
>[
> (set (match_operand: 0 "s_register_operand" "=w")
>   (unspec: [(match_operand: 1 
> "s_register_operand" "0")
> @@ -2202,7 +2202,7 @@ (define_insn "mve_vmovnbq_"
>  ;;
>  ;; [vmovntq_s, vmovntq_u])
>  ;;
> -(define_insn "mve_vmovntq_"
> +(define_insn "@mve_vmovntq_"
>[
> (set (match_operand: 0 "s_register_operand" "=w")
>   (unspec: [(match_operand: 1 
> "s_register_operand" "0")
> @@ -2214,6 +2214,18 @@ (define_insn "mve_vmovntq_"
>[(set_attr "type" "mve_move")
>  ])
>  
> +(define_insn "@mve_vec_pack_trunc_"
> + [(set (match_operand: 0 "register_operand" "=&w")
> +   (vec_concat:
> + (truncate:
> + (match_operand:MVE_5 1 "register_operand" "w"))
> + (truncate:
> + (match_operand:MVE_5 2 "register_operand" "w"]
> + "TARGET_HAVE_MVE"
> + "vmovnb.i%q0, %q1\;vmovnt.i   %q0, %q2"
> +  [(set_attr "type" "mve_move")]
> +)
> +

I realise this is (like you say) based on the neon.md pattern,
but we should use separate vmovnb and vmovnt instructions instead
of putting two instructions into a single pattern.

One specific advantage to using separate patterns is that it would
avoid the imprecision of the earlyclobber: the output only conflicts
with operand 1 and can be tied to operand 2.

>  ;;
>  ;; [vmulq_f])
>  ;;
> diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
> index 0fdffaf4ec4..392d9607919 100644
> --- a/gcc/config/arm/neon.md
> +++ b/gcc/config/arm/neon.md
> @@ -5924,43 +5924,6 @@ (defi

[PATCH 01/11] libstdc++: Improve copysign codegen

2021-06-08 Thread Matthias Kretz


From: Matthias Kretz 

This also resolves a test failure on aarch64 with -ffast-math and
fixed_size with large N.

Signed-off-by: Matthias Kretz 

libstdc++-v3/ChangeLog:

* include/experimental/bits/simd.h: Add missing operator~
overload for simd to __float_bitwise_operators.
* include/experimental/bits/simd_builtin.h
(_SimdImplBuiltin::_S_complement): Bitcast to int (and back) to
implement complement for floating-point vectors.
* include/experimental/bits/simd_fixed_size.h
(_SimdImplFixedSize::_S_copysign): New function, forwarding to
copysign implementation of _SimdTuple members.
* include/experimental/bits/simd_math.h (copysign): Call
_SimdImpl::_S_copysign for fixed_size arguments. Simplify
generic copysign implementation using the new ~ operator.
---
 libstdc++-v3/include/experimental/bits/simd.h| 6 ++
 libstdc++-v3/include/experimental/bits/simd_builtin.h| 7 ++-
 libstdc++-v3/include/experimental/bits/simd_fixed_size.h | 2 +-
 libstdc++-v3/include/experimental/bits/simd_math.h   | 4 +++-
 4 files changed, 16 insertions(+), 3 deletions(-)


--
──
 Dr. Matthias Kretz   https://mattkretz.github.io
 GSI Helmholtz Centre for Heavy Ion Research   https://gsi.de
 std::experimental::simd  https://github.com/VcDevel/std-simd
──diff --git a/libstdc++-v3/include/experimental/bits/simd.h b/libstdc++-v3/include/experimental/bits/simd.h
index 59ddf3cc958..163f1b574e2 100644
--- a/libstdc++-v3/include/experimental/bits/simd.h
+++ b/libstdc++-v3/include/experimental/bits/simd.h
@@ -5189,6 +5189,12 @@ template 
 return {__private_init,
 	_Ap::_SimdImpl::_S_bit_and(__data(__a), __data(__b))};
   }
+
+template 
+  _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_SIMD_CONSTEXPR
+  enable_if_t, simd<_Tp, _Ap>>
+  operator~(const simd<_Tp, _Ap>& __a)
+  { return {__private_init, _Ap::_SimdImpl::_S_complement(__data(__a))}; }
 } // namespace __float_bitwise_operators }}}
 
 _GLIBCXX_SIMD_END_NAMESPACE
diff --git a/libstdc++-v3/include/experimental/bits/simd_builtin.h b/libstdc++-v3/include/experimental/bits/simd_builtin.h
index e986ee91620..8cd338e313f 100644
--- a/libstdc++-v3/include/experimental/bits/simd_builtin.h
+++ b/libstdc++-v3/include/experimental/bits/simd_builtin.h
@@ -1632,7 +1632,12 @@ template 
 template 
   _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
   _S_complement(_SimdWrapper<_Tp, _Np> __x) noexcept
-  { return ~__x._M_data; }
+  {
+	if constexpr (is_floating_point_v<_Tp>)
+	  return __vector_bitcast<_Tp>(~__vector_bitcast<__int_for_sizeof_t<_Tp>>(__x));
+	else
+	  return ~__x._M_data;
+  }
 
 // _S_unary_minus {{{2
 template 
diff --git a/libstdc++-v3/include/experimental/bits/simd_fixed_size.h b/libstdc++-v3/include/experimental/bits/simd_fixed_size.h
index 2722055c899..7c2c1df77c8 100644
--- a/libstdc++-v3/include/experimental/bits/simd_fixed_size.h
+++ b/libstdc++-v3/include/experimental/bits/simd_fixed_size.h
@@ -1663,7 +1663,7 @@ template 
 _GLIBCXX_SIMD_APPLY_ON_TUPLE(_Tp, ldexp)
 _GLIBCXX_SIMD_APPLY_ON_TUPLE(_Tp, fmod)
 _GLIBCXX_SIMD_APPLY_ON_TUPLE(_Tp, remainder)
-// copysign in simd_math.h
+_GLIBCXX_SIMD_APPLY_ON_TUPLE(_Tp, copysign)
 _GLIBCXX_SIMD_APPLY_ON_TUPLE(_Tp, nextafter)
 _GLIBCXX_SIMD_APPLY_ON_TUPLE(_Tp, fdim)
 _GLIBCXX_SIMD_APPLY_ON_TUPLE(_Tp, fmax)
diff --git a/libstdc++-v3/include/experimental/bits/simd_math.h b/libstdc++-v3/include/experimental/bits/simd_math.h
index 4799803a200..d954e761eee 100644
--- a/libstdc++-v3/include/experimental/bits/simd_math.h
+++ b/libstdc++-v3/include/experimental/bits/simd_math.h
@@ -1304,6 +1304,8 @@ template 
   {
 if constexpr (simd_size_v<_Tp, _Abi> == 1)
   return std::copysign(__x[0], __y[0]);
+else if constexpr (__is_fixed_size_abi_v<_Abi>)
+  return {__private_init, _Abi::_SimdImpl::_S_copysign(__data(__x), __data(__y))};
 else if constexpr (is_same_v<_Tp, long double> && sizeof(_Tp) == 12)
   // Remove this case once __bit_cast is implemented via __builtin_bit_cast.
   // It is necessary, because __signmask below cannot be computed at compile
@@ -1315,7 +1317,7 @@ template 
 	using _V = simd<_Tp, _Abi>;
 	using namespace std::experimental::__float_bitwise_operators;
 	_GLIBCXX_SIMD_USE_CONSTEXPR_API auto __signmask = _V(1) ^ _V(-1);
-	return (__x & (__x ^ __signmask)) | (__y & __signmask);
+	return (__x & ~__signmask) | (__y & __signmask);
   }
   }
 


[PATCH 02/11] libstdc++: Remove dead code

2021-06-08 Thread Matthias Kretz


From: Matthias Kretz 

This helper type became unused at some point.

Signed-off-by: Matthias Kretz 

libstdc++-v3/ChangeLog:

* include/experimental/bits/simd_fixed_size.h
(_AbisInSimdTuple): Removed.
---
 .../experimental/bits/simd_fixed_size.h   | 49 ---
 1 file changed, 49 deletions(-)


--
──
 Dr. Matthias Kretz   https://mattkretz.github.io
 GSI Helmholtz Centre for Heavy Ion Research   https://gsi.de
 std::experimental::simd  https://github.com/VcDevel/std-simd
──diff --git a/libstdc++-v3/include/experimental/bits/simd_fixed_size.h b/libstdc++-v3/include/experimental/bits/simd_fixed_size.h
index 7c2c1df77c8..b6fb47cdf39 100644
--- a/libstdc++-v3/include/experimental/bits/simd_fixed_size.h
+++ b/libstdc++-v3/include/experimental/bits/simd_fixed_size.h
@@ -1025,55 +1025,6 @@ template 
   _Tp, _Remain, _SimdTuple<_Tp, _As..., typename _Next::abi_type>>::type;
   };
 
-// }}}
-// _AbisInSimdTuple {{{
-template 
-  struct _SeqOp;
-
-template 
-  struct _SeqOp>
-  {
-using _FirstPlusOne = index_sequence<_I0 + 1, _Is...>;
-using _NotFirstPlusOne = index_sequence<_I0, (_Is + 1)...>;
-template 
-using _Prepend = index_sequence<_First, _I0 + _Add, (_Is + _Add)...>;
-  };
-
-template 
-  struct _AbisInSimdTuple;
-
-template 
-  struct _AbisInSimdTuple<_SimdTuple<_Tp>>
-  {
-using _Counts = index_sequence<0>;
-using _Begins = index_sequence<0>;
-  };
-
-template 
-  struct _AbisInSimdTuple<_SimdTuple<_Tp, _Ap>>
-  {
-using _Counts = index_sequence<1>;
-using _Begins = index_sequence<0>;
-  };
-
-template 
-  struct _AbisInSimdTuple<_SimdTuple<_Tp, _A0, _A0, _As...>>
-  {
-using _Counts = typename _SeqOp>::_Counts>::_FirstPlusOne;
-using _Begins = typename _SeqOp>::_Begins>::_NotFirstPlusOne;
-  };
-
-template 
-  struct _AbisInSimdTuple<_SimdTuple<_Tp, _A0, _A1, _As...>>
-  {
-using _Counts = typename _SeqOp>::_Counts>::template _Prepend<1, 0>;
-using _Begins = typename _SeqOp>::_Begins>::template _Prepend<0, 1>;
-  };
-
 // }}}
 // __autocvt_to_simd {{{
 template >>


[PATCH 03/11] libstdc++: Improve fixed_size codegen

2021-06-08 Thread Matthias Kretz


From: Matthias Kretz 

Sometimes fixed_size objects will get unnecessarily copied on the stack.
The simd implementation should never pass _SimdTuple by value to avoid
requiring the optimizer to see through these copies.

Signed-off-by: Matthias Kretz 

libstdc++-v3/ChangeLog:

* include/experimental/bits/simd_converter.h
(_SimdConverter::operator()): Pass _SimdTuple by const-ref.
* include/experimental/bits/simd_fixed_size.h
(_GLIBCXX_SIMD_FIXED_OP): Pass binary operator _SimdTuple
arguments by const-ref.
(_S_masked_unary): Pass _SimdTuple by const-ref.
---
 libstdc++-v3/include/experimental/bits/simd_converter.h  | 2 +-
 libstdc++-v3/include/experimental/bits/simd_fixed_size.h | 5 ++---
 2 files changed, 3 insertions(+), 4 deletions(-)


--
──
 Dr. Matthias Kretz   https://mattkretz.github.io
 GSI Helmholtz Centre for Heavy Ion Research   https://gsi.de
 std::experimental::simd  https://github.com/VcDevel/std-simd
──diff --git a/libstdc++-v3/include/experimental/bits/simd_converter.h b/libstdc++-v3/include/experimental/bits/simd_converter.h
index 9c8bf382df9..11999df25e4 100644
--- a/libstdc++-v3/include/experimental/bits/simd_converter.h
+++ b/libstdc++-v3/include/experimental/bits/simd_converter.h
@@ -316,7 +316,7 @@ template 
 
 _GLIBCXX_SIMD_INTRINSIC constexpr
   typename _SimdTraits<_To, _Ap>::_SimdMember
-  operator()(_Arg __x) const noexcept
+  operator()(const _Arg& __x) const noexcept
 {
   if constexpr (_Arg::_S_tuple_size == 1)
 	return __vector_convert<__vector_type_t<_To, _Np>>(__x.first);
diff --git a/libstdc++-v3/include/experimental/bits/simd_fixed_size.h b/libstdc++-v3/include/experimental/bits/simd_fixed_size.h
index b6fb47cdf39..dc2fb90b9b2 100644
--- a/libstdc++-v3/include/experimental/bits/simd_fixed_size.h
+++ b/libstdc++-v3/include/experimental/bits/simd_fixed_size.h
@@ -1480,7 +1480,7 @@ template 
 #define _GLIBCXX_SIMD_FIXED_OP(name_, op_) \
 template\
   static inline constexpr _SimdTuple<_Tp, _As...> name_(   \
-	const _SimdTuple<_Tp, _As...> __x, const _SimdTuple<_Tp, _As...> __y)  \
+	const _SimdTuple<_Tp, _As...>& __x, const _SimdTuple<_Tp, _As...>& __y)\
   {\
 	return __x._M_apply_per_chunk( \
 	  [](auto __impl, auto __xx, auto __yy) constexpr {\
@@ -1780,8 +1780,7 @@ template 
 // _S_masked_unary {{{2
 template  class _Op, typename _Tp, typename... _As>
   static inline _SimdTuple<_Tp, _As...>
-  _S_masked_unary(const _MaskMember __bits,
-		  const _SimdTuple<_Tp, _As...> __v) // TODO: const-ref __v?
+  _S_masked_unary(const _MaskMember __bits, const _SimdTuple<_Tp, _As...>& __v)
   {
 	return __v._M_apply_wrapped([&__bits](auto __meta,
 	  auto __native) constexpr {


[PATCH 04/11] libstdc++: Make use of __builtin_bit_cast

2021-06-08 Thread Matthias Kretz


From: Matthias Kretz 

The __bit_cast function was a hack to achieve what __builtin_bit_cast
can do, therefore use __builtin_bit_cast if possible. However,
__builtin_bit_cast cannot be used to cast from/to fixed_size_simd, since
it isn't trivially copyable (in the language sense — in principle it
is). Therefore add __proposed::simd_bit_cast to enable the use case
required in the test framework.

Signed-off-by: Matthias Kretz 

libstdc++-v3/ChangeLog:

* include/experimental/bits/simd.h (__bit_cast): Implement via
__builtin_bit_cast #if available.
(__proposed::simd_bit_cast): Add overloads for simd and
simd_mask, which use __builtin_bit_cast (or __bit_cast #if not
available), which return an object of the requested type with
the same bits as the argument.
* include/experimental/bits/simd_math.h: Use simd_bit_cast
instead of __bit_cast to allow casts to fixed_size_simd.
* testsuite/experimental/simd/tests/bits/test_values.h: Switch
from __bit_cast to __proposed::simd_bit_cast since the former
will not cast fixed_size objects anymore.
---
 libstdc++-v3/include/experimental/bits/simd.h | 40 ++-
 .../include/experimental/bits/simd_math.h |  8 ++--
 .../simd/tests/bits/test_values.h |  8 ++--
 3 files changed, 46 insertions(+), 10 deletions(-)


--
──
 Dr. Matthias Kretz   https://mattkretz.github.io
 GSI Helmholtz Centre for Heavy Ion Research   https://gsi.de
 std::experimental::simd  https://github.com/VcDevel/std-simd
──diff --git a/libstdc++-v3/include/experimental/bits/simd.h b/libstdc++-v3/include/experimental/bits/simd.h
index 163f1b574e2..5d243f22434 100644
--- a/libstdc++-v3/include/experimental/bits/simd.h
+++ b/libstdc++-v3/include/experimental/bits/simd.h
@@ -1598,7 +1598,9 @@ template 
   _GLIBCXX_SIMD_INTRINSIC constexpr _To
   __bit_cast(const _From __x)
   {
-// TODO: implement with / replace by __builtin_bit_cast ASAP
+#if __has_builtin(__builtin_bit_cast)
+return __builtin_bit_cast(_To, __x);
+#else
 static_assert(sizeof(_To) == sizeof(_From));
 constexpr bool __to_is_vectorizable
   = is_arithmetic_v<_To> || is_enum_v<_To>;
@@ -1629,6 +1631,7 @@ template 
 			 reinterpret_cast(&__x), sizeof(_To));
 	return __r;
   }
+#endif
   }
 
 // }}}
@@ -2900,6 +2903,41 @@ template (__x)};
   }
+
+template 
+  _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_SIMD_CONSTEXPR
+  _To
+  simd_bit_cast(const simd<_Up, _Abi>& __x)
+  {
+using _Tp = typename _To::value_type;
+using _ToMember = typename _SimdTraits<_Tp, typename _To::abi_type>::_SimdMember;
+using _From = simd<_Up, _Abi>;
+using _FromMember = typename _SimdTraits<_Up, _Abi>::_SimdMember;
+// with concepts, the following should be constraints
+static_assert(sizeof(_To) == sizeof(_From));
+static_assert(is_trivially_copyable_v<_Tp> && is_trivially_copyable_v<_Up>);
+static_assert(is_trivially_copyable_v<_ToMember> && is_trivially_copyable_v<_FromMember>);
+#if __has_builtin(__builtin_bit_cast)
+return {__private_init, __builtin_bit_cast(_ToMember, __data(__x))};
+#else
+return {__private_init, __bit_cast<_ToMember>(__data(__x))};
+#endif
+  }
+
+template 
+  _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_SIMD_CONSTEXPR
+  _To
+  simd_bit_cast(const simd_mask<_Up, _Abi>& __x)
+  {
+using _From = simd_mask<_Up, _Abi>;
+static_assert(sizeof(_To) == sizeof(_From));
+static_assert(is_trivially_copyable_v<_To> && is_trivially_copyable_v<_From>);
+#if __has_builtin(__builtin_bit_cast)
+return __builtin_bit_cast(_To, __x);
+#else
+return __bit_cast<_To>(__x);
+#endif
+  }
 } // namespace __proposed
 
 // simd_cast {{{2
diff --git a/libstdc++-v3/include/experimental/bits/simd_math.h b/libstdc++-v3/include/experimental/bits/simd_math.h
index d954e761eee..3ade293fcbf 100644
--- a/libstdc++-v3/include/experimental/bits/simd_math.h
+++ b/libstdc++-v3/include/experimental/bits/simd_math.h
@@ -700,11 +700,9 @@ template 
 	// (inf and NaN are excluded by -ffinite-math-only)
 	const auto __iszero_inf_nan = __x == 0;
 #else
-	const auto __as_int
-	  = __bit_cast, _V>>(abs(__x));
-	const auto __inf
-	  = __bit_cast, _V>>(
-	_V(__infinity_v<_Tp>));
+	using _Ip = __int_for_sizeof_t<_Tp>;
+	const auto __as_int = simd_bit_cast>(abs(__x));
+	const auto __inf = simd_bit_cast>(_V(__infinity_v<_Tp>));
 	const auto __iszero_inf_nan = static_simd_cast(
 	  __as_int == 0 || __as_int >= __inf);
 #endif
diff --git a/libstdc++-v3/testsuite/experimental/simd/tests/bits/test_values.h b/libstdc++-v3/testsuite/experimental/simd/tests/bits/test_values.h
index b69bd0b704d..67aa870659b 100644
--- a/libstdc++-v3/testsuite/experimental/simd/tests/bits/test_values.h
+++ b/libstdc++-v3/testsuite/experimental/simd/tests/bits/te

[PATCH 05/11] libstdc++: Remove incorrect fabs overload

2021-06-08 Thread Matthias Kretz


From: Matthias Kretz 

fabs(int) returns double, but this overload did not: it returned
simd<_Tp, _Abi>. This overload is not specified in the Parallelism TS 2.
Also remove the comment about labs and llabs; it doesn't belong here.

Signed-off-by: Matthias Kretz 

libstdc++-v3/ChangeLog:

* include/experimental/bits/simd_math.h (fabs): Remove
fabs(simd) overload.
---
 .../include/experimental/bits/simd_math.h| 16 
 1 file changed, 16 deletions(-)


--
──
 Dr. Matthias Kretz   https://mattkretz.github.io
 GSI Helmholtz Centre for Heavy Ion Research   https://gsi.de
 std::experimental::simd  https://github.com/VcDevel/std-simd
──diff --git a/libstdc++-v3/include/experimental/bits/simd_math.h b/libstdc++-v3/include/experimental/bits/simd_math.h
index 3ade293fcbf..cff4371619d 100644
--- a/libstdc++-v3/include/experimental/bits/simd_math.h
+++ b/libstdc++-v3/include/experimental/bits/simd_math.h
@@ -863,22 +863,6 @@ template 
   abs(const simd<_Tp, _Abi>& __x)
   { return {__private_init, _Abi::_SimdImpl::_S_abs(__data(__x))}; }
 
-template 
-  enable_if_t && is_signed_v<_Tp>, simd<_Tp, _Abi>>
-  fabs(const simd<_Tp, _Abi>& __x)
-  { return {__private_init, _Abi::_SimdImpl::_S_abs(__data(__x))}; }
-
-// the following are overloads for functions in  and not covered by
-// [parallel.simd.math]. I don't see much value in making them work, though
-/*
-template  simd labs(const simd &__x)
-{ return {__private_init, _Abi::_SimdImpl::abs(__data(__x))}; }
-
-template  simd llabs(const simd
-&__x)
-{ return {__private_init, _Abi::_SimdImpl::abs(__data(__x))}; }
-*/
-
 #define _GLIBCXX_SIMD_CVTING2(_NAME)   \
 template  \
   _GLIBCXX_SIMD_INTRINSIC simd<_Tp, _Abi> _NAME(   \


[PATCH 06/11] libstdc++: Minor simd_math cleanups

2021-06-08 Thread Matthias Kretz


From: Matthias Kretz 

Signed-off-by: Matthias Kretz 

libstdc++-v3/ChangeLog:

* include/experimental/bits/simd_math.h: Undefine internal
macros after use.
(frexp): Move #if to a more sensible position and reformat
preceding code.
(logb): Call _SimdImpl::_S_logb for fixed_size instead of
duplicating the code here.
(modf): Simplify condition.
---
 .../include/experimental/bits/simd_math.h | 22 +--
 1 file changed, 6 insertions(+), 16 deletions(-)


--
──
 Dr. Matthias Kretz   https://mattkretz.github.io
 GSI Helmholtz Centre for Heavy Ion Research   https://gsi.de
 std::experimental::simd  https://github.com/VcDevel/std-simd
──diff --git a/libstdc++-v3/include/experimental/bits/simd_math.h b/libstdc++-v3/include/experimental/bits/simd_math.h
index cff4371619d..a5df2039970 100644
--- a/libstdc++-v3/include/experimental/bits/simd_math.h
+++ b/libstdc++-v3/include/experimental/bits/simd_math.h
@@ -645,11 +645,8 @@ template 
 	return __r;
   }
 else if constexpr (__is_fixed_size_abi_v<_Abi>)
-  {
-	return {__private_init,
-		_Abi::_SimdImpl::_S_frexp(__data(__x), __data(*__exp))};
+  return {__private_init, _Abi::_SimdImpl::_S_frexp(__data(__x), __data(*__exp))};
 #if _GLIBCXX_SIMD_X86INTRIN
-  }
 else if constexpr (__have_avx512f)
   {
 	constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
@@ -667,8 +664,8 @@ template 
 		_Abi::_CommonImpl::_S_blend(_SimdWrapper(
 	  __isnonzero),
 	__v, __getmant_avx512(__v))};
-#endif // _GLIBCXX_SIMD_X86INTRIN
   }
+#endif // _GLIBCXX_SIMD_X86INTRIN
 else
   {
 	// fallback implementation
@@ -749,14 +746,7 @@ template 
 if constexpr (_Np == 1)
   return std::logb(__x[0]);
 else if constexpr (__is_fixed_size_abi_v<_Abi>)
-  {
-	return {__private_init,
-		__data(__x)._M_apply_per_chunk([](auto __impl, auto __xx) {
-		  using _V = typename decltype(__impl)::simd_type;
-		  return __data(
-		std::experimental::logb(_V(__private_init, __xx)));
-		})};
-  }
+  return {__private_init, _Abi::_SimdImpl::_S_logb(__data(__x))};
 #if _GLIBCXX_SIMD_X86INTRIN // {{{
 else if constexpr (__have_avx512vl && __is_sse_ps<_Tp, _Np>())
   return {__private_init,
@@ -827,9 +817,7 @@ template 
   enable_if_t, simd<_Tp, _Abi>>
   modf(const simd<_Tp, _Abi>& __x, simd<_Tp, _Abi>* __iptr)
   {
-if constexpr (__is_scalar_abi<_Abi>()
-		  || (__is_fixed_size_abi_v<
-			_Abi> && simd_size_v<_Tp, _Abi> == 1))
+if constexpr (simd_size_v<_Tp, _Abi> == 1)
   {
 	_Tp __tmp;
 	_Tp __r = std::modf(__x[0], &__tmp);
@@ -1472,6 +1460,8 @@ template 
   }
 // }}}
 
+#undef _GLIBCXX_SIMD_CVTING2
+#undef _GLIBCXX_SIMD_CVTING3
 #undef _GLIBCXX_SIMD_MATH_CALL_
 #undef _GLIBCXX_SIMD_MATH_CALL2_
 #undef _GLIBCXX_SIMD_MATH_CALL3_


[PATCH 07/11] libstdc++: Fix condition when AVX512F ldexp implementation is used

2021-06-08 Thread Matthias Kretz


From: Matthias Kretz 

This improves codegen of ldexp if AVX512VL is available.

Signed-off-by: Matthias Kretz 

libstdc++-v3/ChangeLog:

* include/experimental/bits/simd_x86.h (_S_ldexp): The AVX512F
implementation doesn't require a _VecBltnBtmsk ABI tag, it
requires either a 64-Byte input (in which case AVX512F must be
available) or AVX512VL.
---
 libstdc++-v3/include/experimental/bits/simd_x86.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)


--
──
 Dr. Matthias Kretz   https://mattkretz.github.io
 GSI Helmholtz Centre for Heavy Ion Research   https://gsi.de
 std::experimental::simd  https://github.com/VcDevel/std-simd
──diff --git a/libstdc++-v3/include/experimental/bits/simd_x86.h b/libstdc++-v3/include/experimental/bits/simd_x86.h
index 305d7a9fa54..5706bf63845 100644
--- a/libstdc++-v3/include/experimental/bits/simd_x86.h
+++ b/libstdc++-v3/include/experimental/bits/simd_x86.h
@@ -2611,13 +2611,14 @@ template 
   _S_ldexp(_SimdWrapper<_Tp, _Np> __x,
 	   __fixed_size_storage_t __exp)
   {
-	if constexpr (__is_avx512_abi<_Abi>())
+	if constexpr (sizeof(__x) == 64 || __have_avx512vl)
 	  {
 	const auto __xi = __to_intrin(__x);
 	constexpr _SimdConverter, _Tp, _Abi>
 	  __cvt;
 	const auto __expi = __to_intrin(__cvt(__exp));
-	constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
+	using _Up = __bool_storage_member_type_t<_Np>;
+	constexpr _Up __k1 = _Np < sizeof(_Up) * __CHAR_BIT__ ? _Up((1ULL << _Np) - 1) : ~_Up();
 	if constexpr (sizeof(__xi) == 16)
 	  {
 		if constexpr (sizeof(_Tp) == 8)


[PATCH 08/11] libstdc++: Avoid raising fp exceptions in trunc, floor, and ceil

2021-06-08 Thread Matthias Kretz


From: Matthias Kretz 

Signed-off-by: Matthias Kretz 

libstdc++-v3/ChangeLog:
* include/experimental/bits/simd_x86.h (_S_trunc, _S_floor,
_S_ceil): Set bit 8 (_MM_FROUND_NO_EXC) on AVX and SSE4.1
roundp[sd] calls.
---
 .../include/experimental/bits/simd_x86.h  | 24 +--
 1 file changed, 12 insertions(+), 12 deletions(-)


--
──
 Dr. Matthias Kretz   https://mattkretz.github.io
 GSI Helmholtz Centre for Heavy Ion Research   https://gsi.de
 std::experimental::simd  https://github.com/VcDevel/std-simd
──diff --git a/libstdc++-v3/include/experimental/bits/simd_x86.h b/libstdc++-v3/include/experimental/bits/simd_x86.h
index 5706bf63845..34633c096b1 100644
--- a/libstdc++-v3/include/experimental/bits/simd_x86.h
+++ b/libstdc++-v3/include/experimental/bits/simd_x86.h
@@ -2657,13 +2657,13 @@ template 
 	else if constexpr (__is_avx512_pd<_Tp, _Np>())
 	  return _mm512_roundscale_pd(__x, 0x0b);
 	else if constexpr (__is_avx_ps<_Tp, _Np>())
-	  return _mm256_round_ps(__x, 0x3);
+	  return _mm256_round_ps(__x, 0xb);
 	else if constexpr (__is_avx_pd<_Tp, _Np>())
-	  return _mm256_round_pd(__x, 0x3);
+	  return _mm256_round_pd(__x, 0xb);
 	else if constexpr (__have_sse4_1 && __is_sse_ps<_Tp, _Np>())
-	  return __auto_bitcast(_mm_round_ps(__to_intrin(__x), 0x3));
+	  return __auto_bitcast(_mm_round_ps(__to_intrin(__x), 0xb));
 	else if constexpr (__have_sse4_1 && __is_sse_pd<_Tp, _Np>())
-	  return _mm_round_pd(__x, 0x3);
+	  return _mm_round_pd(__x, 0xb);
 	else if constexpr (__is_sse_ps<_Tp, _Np>())
 	  {
 	auto __truncated
@@ -2786,13 +2786,13 @@ template 
 	else if constexpr (__is_avx512_pd<_Tp, _Np>())
 	  return _mm512_roundscale_pd(__x, 0x09);
 	else if constexpr (__is_avx_ps<_Tp, _Np>())
-	  return _mm256_round_ps(__x, 0x1);
+	  return _mm256_round_ps(__x, 0x9);
 	else if constexpr (__is_avx_pd<_Tp, _Np>())
-	  return _mm256_round_pd(__x, 0x1);
+	  return _mm256_round_pd(__x, 0x9);
 	else if constexpr (__have_sse4_1 && __is_sse_ps<_Tp, _Np>())
-	  return __auto_bitcast(_mm_floor_ps(__to_intrin(__x)));
+	  return __auto_bitcast(_mm_round_ps(__to_intrin(__x), 0x9));
 	else if constexpr (__have_sse4_1 && __is_sse_pd<_Tp, _Np>())
-	  return _mm_floor_pd(__x);
+	  return _mm_round_pd(__x, 0x9);
 	else
 	  return _Base::_S_floor(__x);
   }
@@ -2808,13 +2808,13 @@ template 
 	else if constexpr (__is_avx512_pd<_Tp, _Np>())
 	  return _mm512_roundscale_pd(__x, 0x0a);
 	else if constexpr (__is_avx_ps<_Tp, _Np>())
-	  return _mm256_round_ps(__x, 0x2);
+	  return _mm256_round_ps(__x, 0xa);
 	else if constexpr (__is_avx_pd<_Tp, _Np>())
-	  return _mm256_round_pd(__x, 0x2);
+	  return _mm256_round_pd(__x, 0xa);
 	else if constexpr (__have_sse4_1 && __is_sse_ps<_Tp, _Np>())
-	  return __auto_bitcast(_mm_ceil_ps(__to_intrin(__x)));
+	  return __auto_bitcast(_mm_round_ps(__to_intrin(__x), 0xa));
 	else if constexpr (__have_sse4_1 && __is_sse_pd<_Tp, _Np>())
-	  return _mm_ceil_pd(__x);
+	  return _mm_round_pd(__x, 0xa);
 	else
 	  return _Base::_S_ceil(__x);
   }


[PATCH 09/11] libstdc++: Ensure unrolled loops inline the lambda

2021-06-08 Thread Matthias Kretz


From: Matthias Kretz 

Signed-off-by: Matthias Kretz 

libstdc++-v3/ChangeLog:

* include/experimental/bits/simd.h (__execute_on_index_sequence,
__execute_on_index_sequence_with_return,
__call_with_n_evaluations, __call_with_subscripts): Add flatten
attribute.
---
 libstdc++-v3/include/experimental/bits/simd.h | 12 
 1 file changed, 8 insertions(+), 4 deletions(-)


--
──
 Dr. Matthias Kretz   https://mattkretz.github.io
 GSI Helmholtz Centre for Heavy Ion Research   https://gsi.de
 std::experimental::simd  https://github.com/VcDevel/std-simd
──diff --git a/libstdc++-v3/include/experimental/bits/simd.h b/libstdc++-v3/include/experimental/bits/simd.h
index 5d243f22434..21100c1087d 100644
--- a/libstdc++-v3/include/experimental/bits/simd.h
+++ b/libstdc++-v3/include/experimental/bits/simd.h
@@ -234,7 +234,8 @@ namespace __detail
 // unrolled/pack execution helpers
 // __execute_n_times{{{
 template 
-  _GLIBCXX_SIMD_INTRINSIC constexpr void
+  [[__gnu__::__flatten__]] _GLIBCXX_SIMD_INTRINSIC constexpr
+  void
   __execute_on_index_sequence(_Fp&& __f, index_sequence<_I...>)
   { ((void)__f(_SizeConstant<_I>()), ...); }
 
@@ -254,7 +255,8 @@ template 
 // }}}
 // __generate_from_n_evaluations{{{
 template 
-  _GLIBCXX_SIMD_INTRINSIC constexpr _R
+  [[__gnu__::__flatten__]] _GLIBCXX_SIMD_INTRINSIC constexpr
+  _R
   __execute_on_index_sequence_with_return(_Fp&& __f, index_sequence<_I...>)
   { return _R{__f(_SizeConstant<_I>())...}; }
 
@@ -269,7 +271,8 @@ template 
 // }}}
 // __call_with_n_evaluations{{{
 template 
-  _GLIBCXX_SIMD_INTRINSIC constexpr auto
+  [[__gnu__::__flatten__]] _GLIBCXX_SIMD_INTRINSIC constexpr
+  auto
   __call_with_n_evaluations(index_sequence<_I...>, _F0&& __f0, _FArgs&& __fargs)
   { return __f0(__fargs(_SizeConstant<_I>())...); }
 
@@ -285,7 +288,8 @@ template 
 // }}}
 // __call_with_subscripts{{{
 template 
-  _GLIBCXX_SIMD_INTRINSIC constexpr auto
+  [[__gnu__::__flatten__]] _GLIBCXX_SIMD_INTRINSIC constexpr
+  auto
   __call_with_subscripts(_Tp&& __x, index_sequence<_It...>, _Fp&& __fun)
   { return __fun(__x[_First + _It]...); }
 


[PATCH 10/11] libstdc++: Fix internal names: add missing underscores

2021-06-08 Thread Matthias Kretz


From: Matthias Kretz 

Signed-off-by: Matthias Kretz 

libstdc++-v3/ChangeLog:

* include/experimental/bits/simd_math.h
(_GLIBCXX_SIMD_MATH_CALL2_): Rename arg2_ to __arg2.
(_GLIBCXX_SIMD_MATH_CALL3_): Rename arg2_ to __arg2 and arg3_ to
__arg3.
---
 libstdc++-v3/include/experimental/bits/simd_math.h | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)


--
──
 Dr. Matthias Kretz   https://mattkretz.github.io
 GSI Helmholtz Centre for Heavy Ion Research   https://gsi.de
 std::experimental::simd  https://github.com/VcDevel/std-simd
──diff --git a/libstdc++-v3/include/experimental/bits/simd_math.h b/libstdc++-v3/include/experimental/bits/simd_math.h
index a5df2039970..61af9fc67af 100644
--- a/libstdc++-v3/include/experimental/bits/simd_math.h
+++ b/libstdc++-v3/include/experimental/bits/simd_math.h
@@ -119,10 +119,10 @@ template 
 
 //}}}
 // _GLIBCXX_SIMD_MATH_CALL2_ {{{
-#define _GLIBCXX_SIMD_MATH_CALL2_(__name, arg2_)   \
+#define _GLIBCXX_SIMD_MATH_CALL2_(__name, __arg2)  \
 template < \
   typename _Tp, typename _Abi, typename...,\
-  typename _Arg2 = _Extra_argument_type, \
+  typename _Arg2 = _Extra_argument_type<__arg2, _Tp, _Abi>,\
   typename _R = _Math_return_type_t<   \
 decltype(std::__name(declval(), _Arg2::declval())), _Tp, _Abi>>\
   enable_if_t, _R>\
@@ -137,7 +137,7 @@ template\
   declval(),   \
   declval, \
+	  is_same<__arg2, _Tp>,\
 	  negation, simd<_Tp, _Abi>>>,   \
 	  is_convertible<_Up, simd<_Tp, _Abi>>, is_floating_point<_Tp>>,   \
 	double>>())),  \
@@ -147,10 +147,10 @@ template\
 
 // }}}
 // _GLIBCXX_SIMD_MATH_CALL3_ {{{
-#define _GLIBCXX_SIMD_MATH_CALL3_(__name, arg2_, arg3_)\
+#define _GLIBCXX_SIMD_MATH_CALL3_(__name, __arg2, __arg3)  \
 template , \
-	  typename _Arg3 = _Extra_argument_type, \
+	  typename _Arg2 = _Extra_argument_type<__arg2, _Tp, _Abi>,\
+	  typename _Arg3 = _Extra_argument_type<__arg3, _Tp, _Abi>,\
 	  typename _R = _Math_return_type_t<   \
 	decltype(std::__name(declval(), _Arg2::declval(),  \
  _Arg3::declval())),   \


[PATCH 11/11] libstdc++: Fix ODR issues with different -m flags

2021-06-08 Thread Matthias Kretz

From: Matthias Kretz 

Explicitly support use of the stdx::simd implementation in situations
where the user links TUs that were compiled with different -m flags. In
general, this is always a (quasi) ODR violation for inline functions
because at least codegen may differ in important ways. However, the
resulting executable may end up using only one of the definitions (it
is unspecified which one). For simd we want to support users who compile
code multiple times with different -m flags and dispatch at runtime to
the TU matching the target CPU. But if internal functions are not
inlined, this may lead to unexpected performance loss or execution of
illegal instructions. Therefore, inline functions that are not marked as
always_inline must use an additional template parameter somewhere in
their name, to disambiguate between the different -m translations.

Signed-off-by: Matthias Kretz 

libstdc++-v3/ChangeLog:

* include/experimental/bits/simd.h: Move feature detection bools
and add __have_avx512bitalg, __have_avx512vbmi2,
__have_avx512vbmi, __have_avx512ifma, __have_avx512cd,
__have_avx512vnni, __have_avx512vpopcntdq.
(__detail::__machine_flags): New function which returns a unique
uint64 depending on relevant -m and -f flags.
(__detail::__odr_helper): New type alias for either an anonymous
type or a type specialized with the __machine_flags number.
(_SimdIntOperators): Change template parameters from _Impl to
_Tp, _Abi because _Impl now has an __odr_helper parameter which
may be _OdrEnforcer from the anonymous namespace, which makes
for a bad base class.
(many): Either add __odr_helper template parameter or mark as
always_inline.
* include/experimental/bits/simd_detail.h: Add defines for
AVX512BITALG, AVX512VBMI2, AVX512VBMI, AVX512IFMA, AVX512CD,
AVX512VNNI, AVX512VPOPCNTDQ, and AVX512VP2INTERSECT.
* include/experimental/bits/simd_builtin.h: Add __odr_helper
template parameter or mark as always_inline.
* include/experimental/bits/simd_fixed_size.h: Ditto.
* include/experimental/bits/simd_math.h: Ditto.
* include/experimental/bits/simd_scalar.h: Ditto.
* include/experimental/bits/simd_neon.h: Add __odr_helper
template parameter.
* include/experimental/bits/simd_ppc.h: Ditto.
* include/experimental/bits/simd_x86.h: Ditto.
---
 libstdc++-v3/include/experimental/bits/simd.h | 380 --
 .../include/experimental/bits/simd_builtin.h  |  41 +-
 .../include/experimental/bits/simd_detail.h   |  40 ++
 .../experimental/bits/simd_fixed_size.h   |  39 +-
 .../include/experimental/bits/simd_math.h |  45 ++-
 .../include/experimental/bits/simd_neon.h |   4 +-
 .../include/experimental/bits/simd_ppc.h  |   4 +-
 .../include/experimental/bits/simd_scalar.h   |  71 +++-
 .../include/experimental/bits/simd_x86.h  |   4 +-
 9 files changed, 440 insertions(+), 188 deletions(-)


--
──
 Dr. Matthias Kretz   https://mattkretz.github.io
 GSI Helmholtz Centre for Heavy Ion Research   https://gsi.de
 std::experimental::simd  https://github.com/VcDevel/std-simd
──diff --git a/libstdc++-v3/include/experimental/bits/simd.h b/libstdc++-v3/include/experimental/bits/simd.h
index 21100c1087d..43331134301 100644
--- a/libstdc++-v3/include/experimental/bits/simd.h
+++ b/libstdc++-v3/include/experimental/bits/simd.h
@@ -35,6 +35,7 @@
 #include  // for stderr
 #endif
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -203,9 +204,170 @@ template 
 // }}}
 template 
   using _SizeConstant = integral_constant;
+// constexpr feature detection{{{
+constexpr inline bool __have_mmx = _GLIBCXX_SIMD_HAVE_MMX;
+constexpr inline bool __have_sse = _GLIBCXX_SIMD_HAVE_SSE;
+constexpr inline bool __have_sse2 = _GLIBCXX_SIMD_HAVE_SSE2;
+constexpr inline bool __have_sse3 = _GLIBCXX_SIMD_HAVE_SSE3;
+constexpr inline bool __have_ssse3 = _GLIBCXX_SIMD_HAVE_SSSE3;
+constexpr inline bool __have_sse4_1 = _GLIBCXX_SIMD_HAVE_SSE4_1;
+constexpr inline bool __have_sse4_2 = _GLIBCXX_SIMD_HAVE_SSE4_2;
+constexpr inline bool __have_xop = _GLIBCXX_SIMD_HAVE_XOP;
+constexpr inline bool __have_avx = _GLIBCXX_SIMD_HAVE_AVX;
+constexpr inline bool __have_avx2 = _GLIBCXX_SIMD_HAVE_AVX2;
+constexpr inline bool __have_bmi = _GLIBCXX_SIMD_HAVE_BMI1;
+constexpr inline bool __have_bmi2 = _GLIBCXX_SIMD_HAVE_BMI2;
+constexpr inline bool __have_lzcnt = _GLIBCXX_SIMD_HAVE_LZCNT;
+constexpr inline bool __have_sse4a = _GLIBCXX_SIMD_HAVE_SSE4A;
+constexpr inline bool __have_fma = _GLIBCXX_SIMD_HAVE_FMA;
+constexpr inline bool __have_fma4 = _GLIBCXX_SIMD_HAVE_FMA4;
+constexpr inline bool __have_f16c = _GLIBCXX_SIMD_HAVE_F16C;
+constexpr inline bool __have_popcnt = _GLIBCXX_

Re: [PATCH] rtl: Join the insn and split conditions in define_insn_and_split

2021-06-08 Thread Segher Boessenkool
On Tue, Jun 08, 2021 at 09:05:57AM +0200, Richard Biener wrote:
> On Tue, Jun 8, 2021 at 12:05 AM Segher Boessenkool
>  wrote:
> >
> > In theory we could have a split condition not inclusive of the insn
> > condition in the past.  That never was a good idea, the code does not do
> > what a non-suspicious reader would think it does.  But it leads to more
> > serious problems together with iterators: if the split condition (as
> > written) does not start with "&&", you do not get the insn condition
> > included in the split condition, and that holds for the part of the insn
> > condition that was generated by the iterator as well!
> >
> > This patch simply always joins the two conditions (after the iterators
> > have done their work) to get the effective split condition.
> >
> > I tested this on all Linux targets, building the Linux kernel for each,
> > and it does not change generated code for any of them, so I think we do
> > not have much breakage to fear.  But it is possible for other targets of
> > course, and for floating point or vector code, etc.
> >
> > Is this okay for trunk?
> 
> Even if it looks uglier I would prefer to enforce a leading "&& " on the
> split condition.  That keeps the semantics of the define_insn_and_split
> the same on trunk and branches and thus makes maintaining things easier.
> I suppose once branches without such enforcement go out of
> maintenance we can mass-strip the "&& "s.

This still allows a leading &&, but it doesn't enforce it.  Since we
have survived for years and years without enforcing this I don't foresee
any big problems.  There should not be many backports able to trigger
this either.

> I guess a mass-change to add "&& "s at this point is smaller than
> a corresponding change to drop them (IMHO leaving both after this
> change would be confusing).

I also managed to build with nds32 now (it is one of those targets that
likes to ICE with a Linux defconfig), and it *does* show differences.

Looking at the machine description there are many patterns that have
!TARGET_BIG_ENDIAN in the insn condition but not in the split condition.
That is exactly the kind of situation that is almost certainly an error
(or "not by design", or "very bad design", take your pick).  The config
I build is LE so none of these insns match, but apparently the split
condition *does* trigger for some insns matched by *other* patterns.

There also is "sms1", which has insn condition
  "NDS32_EXT_DSP_P ()
   && (!reload_completed
   || !nds32_need_split_sms_p (operands[3], operands[4],
   operands[5], operands[6]))"
but split condition
  "NDS32_EXT_DSP_P ()
   && !reload_completed
   && nds32_need_split_sms_p (operands[3], operands[4],
  operands[5], operands[6])"
Luckily we never trigger that.

So yeah, patch withdrawn.  This on one hand is proof we do want to make
such a change, but on the other hand shows it needs more preparatory
steps.


Segher


Re: [RFC/PATCH 00/11] Fix up some unexpected empty split conditions

2021-06-08 Thread Segher Boessenkool
On Tue, Jun 08, 2021 at 09:08:56AM +0200, Richard Biener wrote:
> On Tue, Jun 8, 2021 at 4:10 AM Kewen.Lin via Gcc-patches
>  wrote:
> > on 2021/6/8 上午7:50, Segher Boessenkool wrote:
> > > On Fri, Jun 04, 2021 at 10:57:51AM +0800, Kewen.Lin via Gcc-patches wrote:
> > >> Finding out which ones need fixing seems to be the critical part.  It's
> > >> not hard to add one explicit "&&" to those that don't have it now, but
> > >> even with further bootstrapping and regression testing I'm still not
> > >> confident the adjustments are safe enough, since the testing coverage
> > >> could be limited.  It may need more effort to revisit, and/or testing
> > >> with more coverage, and port maintainers' reviews.
> > >
> > > https://gcc.gnu.org/pipermail/gcc-patches/2021-June/572120.html
> > >
> > > This adds an "&&" everywhere (or in fact, it just skips any existing
> > > one, it just has the same effect of adding it everywhere).  I tested it
> > > with building gcc and Linux for all supported targets (31 of them; I do
> > > some with multiple configs, mostly 32-bit and 64-bit).  None had any
> > > difference before and after the change.
> > >
> > > So I am no longer worried that there will be any fallout from doing
> > > this.  There are many things that *could* go wrong, but I don't think
> > > there will be enough at all to be an impediment to just throwing the
> > > switch.
> > >
> > > If we go this way no target will need any significant fixing, maybe none
> > > at all will be needed across all targets.  And no changes will be needed
> > > anywhere immediately.  We could make leading "&&" deprecated, and the
> > > same for split condition "1" (which was "&& 1").  This is easy to change
> > > automatically as well.
> > >
> > Thanks very much for doing this!
> >
> > I guess we are not going to backport this?  If we won't, it seems to need
> > some way to ensure the implied "&&" will show up explicitly when backporting
> > some define_insn_and_split.
> 
> For this reason I'd prefer the explicit "&& ", Segher's testing means
> mass-changing all define_insn_and_split is reasonable.

So mass-change all define_insn_and_split where the split condition is
not inclusive of the insn condition (as written, i.e. before the
iterators have added stuff), to be separate define_insn and define_split
patterns?  And *then* add the &&?


Segher


Re: [PATCH] Implement a context aware points-to analyzer for use in evrp.

2021-06-08 Thread Andrew MacLeod via Gcc-patches

On 6/8/21 2:26 AM, Aldy Hernandez wrote:



On 6/7/21 9:20 PM, Andrew MacLeod wrote:

On 6/7/21 9:30 AM, Richard Biener via Gcc-patches wrote:

On Mon, Jun 7, 2021 at 12:10 PM Aldy Hernandez via Gcc-patches
 wrote:

The substitute_and_fold_engine which evrp uses is expecting symbolics
from value_of_expr / value_on_edge / etc, which ranger does not provide.

In some cases, these provide important folding cues, as in the case of
aliases for pointers.  For example, legacy evrp may return [&foo, &foo]
for the value of "bar" where bar is on an edge where bar == &foo, or
when bar has been globally set to &foo.  This information is then used
by the subst & fold engine to propagate the known value of bar.

Currently this is a major source of discrepancies between evrp and
ranger.  Of the 284 cases legacy evrp is getting over ranger, 237 are
for pointer equality as discussed above.

This patch implements a context aware points-to class which
ranger-evrp can use to query what a pointer is currently pointing to.
With it, we reduce the 284 cases legacy evrp is getting to 47.

The API for the points-to analyzer is the following:

class points_to_analyzer
{
public:
   points_to_analyzer (gimple_ranger *r);
   ~points_to_analyzer ();
   void enter (basic_block);
   void leave (basic_block);
   void visit_stmt (gimple *stmt);
   tree get_points_to (tree name) const;
...
};

The enter(), leave(), and visit_stmt() methods are meant to be called
from a DOM walk.   At any point throughout the walk, one can call
get_points_to() to get whatever an SSA is pointing to.

If this class is useful to others, we could place it in a more generic
location.

Tested on x86-64 Linux with a regular bootstrap/tests and by comparing
EVRP folds over ranger before and after this patch.

Hmm, but why call it "points-to" - when I look at the implementation
it's really about equivalences.  Thus,

  if (var1_2 == var2_3)

could be handled the same way.  Also "points-to" implies (to me)
that &p[1] and &p[2] point to the same object but your points-to
is clearly tracking equivalences only.

So maybe at least rename it to pointer_equiv_analyzer?  ISTR
propagating random (symbolic) equivalences has issues.


Yeah, pointer_equiv is probably more accurate. This is purely for 
cases where we know a pointer points to something that isn't an 
ssa_name. Eventually this is likely to be subsumed into a 
pointer_range object, but unlikely in this release.


I don't think this is actually doing the propagation though... It 
tracks that a_2 currently points to &foo.. and returns that to either 
simplifier or folder thru value_of_expr(). Presumably it is up to 
them to determine whether the tree expression passed back is safe to 
propagate.   Is there any attempt in EVRP to NOT set the range of 
something to [&foo, &foo] under some conditions?   This is what the 
change amounts to.  Ranger would just return a range of [1, +INF], 
and value_of_expr  would therefore return NULL.  This allows value_of 
to return &foo in these conditions.   Aldy, did you see any other 
checks in the vr-values code?


The propagation is done in the subst & fold engine when either 
value_of_expr or value_on_edge return a value that can be propagated. 
Propagations are not done blindly, as all uses of the result of 
value_o* are guarded with may_propagate_copy().


The simplifier (vr-values) is not involved, as it uses range_of_expr 
which only returns constant ranges.


Aldy


patch is OK, btw..

Andrew



Re: [PATCH] docs: document evrp-sparse-threshold param

2021-06-08 Thread Andrew MacLeod via Gcc-patches

On 6/8/21 3:46 AM, Martin Liška wrote:

Pushed as obvious.

Martin

gcc/ChangeLog:

* doc/invoke.texi: Document new param evrp-sparse-threshold.
---
gcc/doc/invoke.texi | 3 +++
1 file changed, 3 insertions(+)

diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 04048cd8332..6063e466c13 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -14207,6 +14207,9 @@ we may be able to devirtualize speculatively.
The maximum number of assertions to add along the default edge of a 
switch

statement during VRP.

+@item evrp-sparse-threshold
+Maximum number of basic blocks before EVRP uses a sparse cache.
+
@item evrp-mode
Specifies the mode Early VRP should operate in.


Sorry, I always seem to forget this part :-P  doh.  NEXT time I'll remember.

Thanks

Andrew




Re: [PATCH] rtl: Join the insn and split conditions in define_insn_and_split

2021-06-08 Thread Richard Biener via Gcc-patches
On Tue, Jun 8, 2021 at 2:27 PM Segher Boessenkool
 wrote:
>
> On Tue, Jun 08, 2021 at 09:05:57AM +0200, Richard Biener wrote:
> > On Tue, Jun 8, 2021 at 12:05 AM Segher Boessenkool
> >  wrote:
> > >
> > > In theory we could have a split condition not inclusive of the insn
> > > condition in the past.  That never was a good idea, the code does not do
> > > what a non-suspicious reader would think it does.  But it leads to more
> > > serious problems together with iterators: if the split condition (as
> > > written) does not start with "&&", you do not get the insn condition
> > > included in the split condition, and that holds for the part of the insn
> > > condition that was generated by the iterator as well!
> > >
> > > This patch simply always joins the two conditions (after the iterators
> > > have done their work) to get the effective split condition.
> > >
> > > I tested this on all Linux targets, building the Linux kernel for each,
> > > and it does not change generated code for any of them, so I think we do
> > > not have much breakage to fear.  But it is possible for other targets of
> > > course, and for floating point or vector code, etc.
> > >
> > > Is this okay for trunk?
> >
> > > Even if it looks uglier I would prefer to enforce a leading "&& " on the
> > > split condition.  That keeps the semantics of the define_insn_and_split
> > > the same on trunk and branches and thus makes maintaining things easier.
> > > I suppose once branches without such enforcement go out of
> > > maintenance we can mass-strip the "&& "s.
>
> This still allows a leading &&, but it doesn't enforce it.  Since we
> have survived for years and years without enforcing this I don't foresee
> any big problems.  There should not be many backports able to trigger
> this either.
>
> > I guess a mass-change to add "&& "s at this point is smaller than
> > a corresponding change to drop them (IMHO leaving both after this
> > change would be confusing).
>
> I also managed to build with nds32 now (it is one of those targets that
> likes to ICE with a Linux defconfig), and it *does* show differences.
>
> Looking at the machine description there are many patterns that have
> !TARGET_BIG_ENDIAN in the insn condition but not in the split condition.
> That is exactly the kind of situation that is almost certainly an error
> (or "not by design", or "very bad design", take your pick).  The config
> I build is LE so none of these insns match, but apparently the split
> condition *does* trigger for some insns matched by *other* patterns.
>
> There also is "sms1", which has insn condition
>   "NDS32_EXT_DSP_P ()
>&& (!reload_completed
>|| !nds32_need_split_sms_p (operands[3], operands[4],
>operands[5], operands[6]))"
> but split condition
>   "NDS32_EXT_DSP_P ()
>&& !reload_completed
>&& nds32_need_split_sms_p (operands[3], operands[4],
>   operands[5], operands[6])"
> Luckily we never trigger that.
>
> So yeah, patch withdrawn.  This on one hand is proof we do want to make
> such a change, but on the other hand shows it needs more preparatory
> steps.

I wonder if it makes sense to provide ports a means to opt in to
the strict "&& " requirement so that we can gradually fix them.
That probably requires some t-$target make fragment editing plus
passing an extra arg to gen* based on that.

That way maintainers can gradually fix their ports and make sure
they won't regress again.

Richard.

>
> Segher


Re: [RFC/PATCH 00/11] Fix up some unexpected empty split conditions

2021-06-08 Thread Richard Biener via Gcc-patches
On Tue, Jun 8, 2021 at 2:32 PM Segher Boessenkool
 wrote:
>
> On Tue, Jun 08, 2021 at 09:08:56AM +0200, Richard Biener wrote:
> > On Tue, Jun 8, 2021 at 4:10 AM Kewen.Lin via Gcc-patches
> >  wrote:
> > > on 2021/6/8 上午7:50, Segher Boessenkool wrote:
> > > > On Fri, Jun 04, 2021 at 10:57:51AM +0800, Kewen.Lin via Gcc-patches 
> > > > wrote:
> > > >> To find out those need fixing seems to be the critical part.  It's
> > > >> not hard to add one explicit "&&" to those that don't have it now, but
> > > >> even with further bootstrapped and regression tested I'm still not
> > > >> confident the adjustments are safe enough, since the testing coverage
> > > >> could be limited.  It may need more efforts to revisit, or/and test
> > > >> with more coverages, and port maintainers' reviews.
> > > >
> > > > https://gcc.gnu.org/pipermail/gcc-patches/2021-June/572120.html
> > > >
> > > > This adds an "&&" everywhere (or in fact, it just skips any existing
> > > > one, it just has the same effect of adding it everywhere).  I tested it
> > > > with building gcc and Linux for all supported targets (31 of them; I do
> > > > some with multiple configs, mostly 32-bit and 64-bit).  None had any
> > > > difference before and after the change.
> > > >
> > > > So I am no longer worried that there will be any fallout from doing
> > > > this.  There are many things that *could* go wrong, but I don't think
> > > > there will be enough at all to be an impediment to just throwing the
> > > > switch.
> > > >
> > > > If we go this way no target will need any significant fixing, maybe none
> > > > at all will be needed across all targets.  And no changes will be needed
> > > > anywhere immediately.  We could make leading "&&" deprecated, and the
> > > > same for split condition "1" (which was "&& 1").  This is easy to change
> > > > automatically as well.
> > > >
> > > Thanks very much for doing this!
> > >
> > > I guess we are not going to backport this?  If we won't, it seems to need
> > > some way to ensure the implied "&&" will show up explicitly when 
> > > backporting
> > > some define_insn_and_split.
> >
> > For this reason I'd prefer the explicit "&& "; Segher's testing means
> > mass-changing all define_insn_and_split is reasonable.
>
> So mass-change all define_insn_and_split where the split condition is
> not inclusive of the insn condition (as written, i.e. before the
> iterators have added stuff), to be separate define_insn and define_split
> patterns?  And *then* add the &&?

Possible.  Maybe first enable a warning for split conditions that do not
start with "&& ", to give maintainers a chance to fix them, and only then
do the mass change to separate define_insn and define_split, to avoid
churn in the .md files.  Well maintained ports should be quick to fix
themselves up.

And some cases might even be obvious.

Richard.

>
> Segher


[PATCH] Make SLP root stmt a vector

2021-06-08 Thread Richard Biener
This fixes a TODO noticed when adding vectorization of
BIT_INSERT_EXPRs; the change is now also useful for vectorization of
BB reductions.

Bootstrapped and tested on x86_64-unknown-linux-gnu, pushed.

2021-06-08  Richard Biener  

* tree-vectorizer.h (_slp_instance::root_stmt): Change to...
(_slp_instance::root_stmts): ... a vector.
(SLP_INSTANCE_ROOT_STMT): Rename to ...
(SLP_INSTANCE_ROOT_STMTS): ... this.
(slp_root::root): Change to...
(slp_root::roots): ... a vector.
(slp_root::slp_root): Adjust.
* tree-vect-slp.c (_slp_instance::location): Adjust.
(vect_free_slp_instance): Release the root stmt vector.
(vect_build_slp_instance): Adjust.
(vect_analyze_slp): Likewise.
(_bb_vec_info::~_bb_vec_info): Likewise.
(vect_slp_analyze_operations): Likewise.
(vect_bb_vectorization_profitable_p): Likewise.  Adjust
costs for the root stmt.
(vect_slp_check_for_constructors): Gather all BIT_INSERT_EXPRs
as root stmts.
(vect_slp_analyze_bb_1): Simplify by marking all root stmts
as pure_slp.
(vectorize_slp_instance_root_stmt): Adjust.
(vect_schedule_slp): Likewise.
---
 gcc/tree-vect-slp.c   | 152 +++---
 gcc/tree-vectorizer.h |  10 +--
 2 files changed, 90 insertions(+), 72 deletions(-)

diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
index ca1539e63f2..cc734e065df 100644
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -164,8 +164,8 @@ vect_free_slp_tree (slp_tree node)
 dump_user_location_t
 _slp_instance::location () const
 {
-  if (root_stmt)
-return root_stmt->stmt;
+  if (!root_stmts.is_empty ())
+return root_stmts[0]->stmt;
   else
 return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
 }
@@ -178,6 +178,7 @@ vect_free_slp_instance (slp_instance instance)
 {
   vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
   SLP_INSTANCE_LOADS (instance).release ();
+  SLP_INSTANCE_ROOT_STMTS (instance).release ();
   instance->subgraph_entries.release ();
   instance->cost_vec.release ();
   free (instance);
@@ -2503,7 +2504,7 @@ static bool
 vect_build_slp_instance (vec_info *vinfo,
 slp_instance_kind kind,
 vec &scalar_stmts,
-stmt_vec_info root_stmt_info,
+vec &root_stmt_infos,
 unsigned max_tree_size, unsigned *limit,
 scalar_stmts_to_slp_tree_map_t *bst_map,
 /* ???  We need stmt_info for group splitting.  */
@@ -2564,7 +2565,7 @@ vect_build_slp_instance (vec_info *vinfo,
  SLP_INSTANCE_TREE (new_instance) = node;
  SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
  SLP_INSTANCE_LOADS (new_instance) = vNULL;
- SLP_INSTANCE_ROOT_STMT (new_instance) = root_stmt_info;
+ SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
  SLP_INSTANCE_KIND (new_instance) = kind;
  new_instance->reduc_phis = NULL;
  new_instance->cost_vec = vNULL;
@@ -2836,13 +2837,20 @@ vect_analyze_slp_instance (vec_info *vinfo,
   else
 gcc_unreachable ();
 
+  vec roots = vNULL;
+  if (kind == slp_inst_kind_ctor)
+{
+  roots.create (1);
+  roots.quick_push (stmt_info);
+}
   /* Build the tree for the SLP instance.  */
   bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
- kind == slp_inst_kind_ctor
- ? stmt_info : NULL,
+ roots,
  max_tree_size, limit, bst_map,
  kind == slp_inst_kind_store
  ? stmt_info : NULL);
+  if (!res)
+roots.release ();
 
   /* ???  If this is slp_inst_kind_store and the above succeeded here's
  where we should do store group splitting.  */
@@ -2878,12 +2886,15 @@ vect_analyze_slp (vec_info *vinfo, unsigned 
max_tree_size)
 {
   for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
{
- vect_location = bb_vinfo->roots[i].root->stmt;
+ vect_location = bb_vinfo->roots[i].roots[0]->stmt;
  if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
   bb_vinfo->roots[i].stmts,
-  bb_vinfo->roots[i].root,
+  bb_vinfo->roots[i].roots,
   max_tree_size, &limit, bst_map, NULL))
-   bb_vinfo->roots[i].stmts = vNULL;
+   {
+ bb_vinfo->roots[i].stmts = vNULL;
+ bb_vinfo->roots[i].roots = vNULL;
+   }
}
 }
 
@@ -3741,7 +3752,10 @@ _bb_vec_info::~_bb_vec_info ()
 }
 
   for (unsigned i = 0; i < roots.length (); ++i)
-roots[i].stmts.release ();
+{
+  roots[i].stmts.r

[PATCH] tree-optimization/100923 - fix alias-ref construction wrt availability

2021-06-08 Thread Richard Biener
This PR shows that building an ao_ref from value-numbers is prone to
exposing bogus contextual alias info to the oracle.  The following makes
sure to construct ao_refs only from SSA names available at the program
point.

On the way it modifies the awkward valueize_refs[_1] API.

Bootstrapped and tested on x86_64-unknown-linux-gnu, pushed to trunk.

2021-06-08  Richard Biener  

PR tree-optimization/100923
* tree-ssa-sccvn.c (valueize_refs_1): Take a pointer to
the operand vector to be valueized.
(valueize_refs): Likewise.
(valueize_shared_reference_ops_from_ref): Adjust.
(valueize_shared_reference_ops_from_call): Likewise.
(vn_reference_lookup_3): Likewise.
(vn_reference_lookup_pieces): Likewise.  Re-valueize
with honoring availability when we are about to create
the ao_ref and valueized before.
(vn_reference_lookup): Likewise.
(vn_reference_insert_pieces): Adjust.

* gcc.dg/torture/pr100923.c: New testcase.
---
 gcc/testsuite/gcc.dg/torture/pr100923.c | 25 
 gcc/tree-ssa-sccvn.c| 76 -
 2 files changed, 75 insertions(+), 26 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/torture/pr100923.c

diff --git a/gcc/testsuite/gcc.dg/torture/pr100923.c 
b/gcc/testsuite/gcc.dg/torture/pr100923.c
new file mode 100644
index 000..05a6341fea3
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/pr100923.c
@@ -0,0 +1,25 @@
+/* { dg-do run } */
+
+int a = 1, b, c, *d = &a, *e = &a, f;
+void g(int h) {}
+void k(int *l)
+{
+  int ***j;
+  if (c)
+{
+  *j = &l;
+  ***j;
+}
+  g(*l);
+  *e = f;
+  if (*l)
+{
+  int i = b / a;
+  a = i;
+}
+}
+int main()
+{
+  k(d);
+  return 0;
+}
diff --git a/gcc/tree-ssa-sccvn.c b/gcc/tree-ssa-sccvn.c
index e8761219460..64e3a707f5c 100644
--- a/gcc/tree-ssa-sccvn.c
+++ b/gcc/tree-ssa-sccvn.c
@@ -553,7 +553,7 @@ vuse_ssa_val (tree x)
   return x;
 }
 
-/* Similar to the above but used as callback for walk_non_aliases_vuses
+/* Similar to the above but used as callback for walk_non_aliased_vuses
and thus should stop at unvisited VUSE to not walk across region
boundaries.  */
 
@@ -1579,8 +1579,8 @@ contains_storage_order_barrier_p (vec 
ops)
the vector passed in is returned.  *VALUEIZED_ANYTHING will specify
whether any operands were valueized.  */
 
-static vec 
-valueize_refs_1 (vec orig, bool *valueized_anything,
+static void
+valueize_refs_1 (vec *orig, bool *valueized_anything,
 bool with_avail = false)
 {
   vn_reference_op_t vro;
@@ -1588,7 +1588,7 @@ valueize_refs_1 (vec orig, bool 
*valueized_anything,
 
   *valueized_anything = false;
 
-  FOR_EACH_VEC_ELT (orig, i, vro)
+  FOR_EACH_VEC_ELT (*orig, i, vro)
 {
   if (vro->opcode == SSA_NAME
  || (vro->op0 && TREE_CODE (vro->op0) == SSA_NAME))
@@ -1627,16 +1627,16 @@ valueize_refs_1 (vec orig, bool 
*valueized_anything,
   if (i > 0
  && vro->op0
  && TREE_CODE (vro->op0) == ADDR_EXPR
- && orig[i - 1].opcode == MEM_REF)
+ && (*orig)[i - 1].opcode == MEM_REF)
{
- if (vn_reference_fold_indirect (&orig, &i))
+ if (vn_reference_fold_indirect (orig, &i))
*valueized_anything = true;
}
   else if (i > 0
   && vro->opcode == SSA_NAME
-  && orig[i - 1].opcode == MEM_REF)
+  && (*orig)[i - 1].opcode == MEM_REF)
{
- if (vn_reference_maybe_forwprop_address (&orig, &i))
+ if (vn_reference_maybe_forwprop_address (orig, &i))
*valueized_anything = true;
}
   /* If it transforms a non-constant ARRAY_REF into a constant
@@ -1654,15 +1654,13 @@ valueize_refs_1 (vec orig, bool 
*valueized_anything,
  off.to_shwi (&vro->off);
}
 }
-
-  return orig;
 }
 
-static vec 
-valueize_refs (vec orig)
+static void
+valueize_refs (vec *orig)
 {
   bool tem;
-  return valueize_refs_1 (orig, &tem);
+  valueize_refs_1 (orig, &tem);
 }
 
 static vec shared_lookup_references;
@@ -1679,8 +1677,7 @@ valueize_shared_reference_ops_from_ref (tree ref, bool 
*valueized_anything)
 return vNULL;
   shared_lookup_references.truncate (0);
   copy_reference_ops_from_ref (ref, &shared_lookup_references);
-  shared_lookup_references = valueize_refs_1 (shared_lookup_references,
- valueized_anything);
+  valueize_refs_1 (&shared_lookup_references, valueized_anything);
   return shared_lookup_references;
 }
 
@@ -1695,7 +1692,7 @@ valueize_shared_reference_ops_from_call (gcall *call)
 return vNULL;
   shared_lookup_references.truncate (0);
   copy_reference_ops_from_call (call, &shared_lookup_references);
-  shared_lookup_references = valueize_refs (shared_lookup_references);
+  valueize_refs (&shared_lookup_references);
   return shared_lookup_references;
 }
 
@@ -2546,7 +2543,7 @@ vn_reference_lookup_3 (ao_ref 

Re: [PATCH] For obj-c stage-final re-use the checksum from the previous stage

2021-06-08 Thread Jason Merrill via Gcc-patches
On Fri, May 28, 2021 at 3:48 AM Bernd Edlinger 
wrote:

> Hi Richard,
>
> I've replicated your PR to make the objective-c checksum compare equal
>
> commit fb2647aaf55b453b37badfd40c14c59486a74584
> Author: Richard Biener 
> Date:   Tue May 3 08:14:27 2016 +
>
> Make-lang.in (cc1-checksum.c): For stage-final re-use the checksum
> from the previous stage.
>
> 2016-05-03  Richard Biener  
>
> c/
> * Make-lang.in (cc1-checksum.c): For stage-final re-use
> the checksum from the previous stage.
>
> cp/
> * Make-lang.in (cc1plus-checksum.c): For stage-final re-use
> the checksum from the previous stage.
>
> From-SVN: r235804
>
>
> This silences the stage compare.
>
> Bootstrapped and reg-tested on x86_64-pc-linux-gnu.
> Is it OK for trunk?
>
>
> Thanks
> Bernd.
>
>
> 2021-05-28  Bernd Edlinger  
>
> objc/
> * Make-lang.in (cc1obj-checksum.c): For stage-final re-use
> the checksum from the previous stage.
>
> objcp/
> * Make-lang.in (cc1objplus-checksum.c): For stage-final re-use
> the checksum from the previous stage.
>

This breaks bootstrap2.

Jason


[committed] libstdc++: Finish implementing LWG 3413 for propagate_const

2021-06-08 Thread Jonathan Wakely via Gcc-patches
We already have conditional noexcept so this just constrains the
non-member swap overload.

Signed-off-by: Jonathan Wakely 

libstdc++-v3/ChangeLog:

* include/experimental/propagate_const (swap): Constrain.
* testsuite/experimental/propagate_const/swap/lwg3413.cc: New test.

Tested x86_64-linux. Committed to trunk.

commit d319517e809ee50496db29e552f86a83a14c837c
Author: Jonathan Wakely 
Date:   Tue Jun 8 14:56:57 2021

libstdc++: Finish implementing LWG 3413 for propagate_const

We already have conditional noexcept so this just constrains the
non-member swap overload.

Signed-off-by: Jonathan Wakely 

libstdc++-v3/ChangeLog:

* include/experimental/propagate_const (swap): Constrain.
* testsuite/experimental/propagate_const/swap/lwg3413.cc: New test.

diff --git a/libstdc++-v3/include/experimental/propagate_const 
b/libstdc++-v3/include/experimental/propagate_const
index 0d03c13c5e1..162b4783dd7 100644
--- a/libstdc++-v3/include/experimental/propagate_const
+++ b/libstdc++-v3/include/experimental/propagate_const
@@ -113,6 +113,7 @@ inline namespace fundamentals_v2
   constexpr propagate_const() = default;
   propagate_const(const propagate_const& __p) = delete;
   constexpr propagate_const(propagate_const&& __p) = default;
+
   template ,
 is_convertible<_Up&&, _Tp>>::value, bool
@@ -120,6 +121,7 @@ inline namespace fundamentals_v2
   constexpr propagate_const(propagate_const<_Up>&& __pu)
: _M_t(std::move(get_underlying(__pu)))
   {}
+
   template ,
 __not_>>::value,
@@ -127,6 +129,7 @@ inline namespace fundamentals_v2
   constexpr explicit propagate_const(propagate_const<_Up>&& __pu)
: _M_t(std::move(get_underlying(__pu)))
   {}
+
   template ,
 is_convertible<_Up&&, _Tp>,
@@ -136,6 +139,7 @@ inline namespace fundamentals_v2
   constexpr propagate_const(_Up&& __u)
: _M_t(std::forward<_Up>(__u))
   {}
+
   template ,
 __not_>,
@@ -399,8 +403,10 @@ inline namespace fundamentals_v2
 }
 
   // [propagate_const.algorithms], specialized algorithms
+  // _GLIBCXX_RESOLVE_LIB_DEFECTS
+  // 3413. propagate_const's swap [...] needs to be constrained and use a trait
   template 
-constexpr void
+constexpr enable_if_t<__is_swappable<_Tp>::value, void>
 swap(propagate_const<_Tp>& __pt, propagate_const<_Tp>& __pt2)
   noexcept(__is_nothrow_swappable<_Tp>::value)
 {
diff --git 
a/libstdc++-v3/testsuite/experimental/propagate_const/swap/lwg3413.cc 
b/libstdc++-v3/testsuite/experimental/propagate_const/swap/lwg3413.cc
new file mode 100644
index 000..8dc13cfebdd
--- /dev/null
+++ b/libstdc++-v3/testsuite/experimental/propagate_const/swap/lwg3413.cc
@@ -0,0 +1,41 @@
+// { dg-do compile { target c++14 } }
+
+// LWG 3413
+// propagate_const's swap's noexcept specification needs to be constrained
+// and use a trait
+
+#include 
+
+using std::experimental::propagate_const;
+
+propagate_const i;
+static_assert( noexcept(i.swap(i)), "member swap is noexcept" );
+static_assert( noexcept(swap(i, i)), "non-member swap is noexcept" );
+
+struct P
+{
+  int i = 0;
+  int& operator*() const;
+};
+
+void swap(P&, P&) noexcept(false);
+
+propagate_const p;
+static_assert( ! noexcept(p.swap(p)), "member swap is conditionally noexcept" 
);
+static_assert( ! noexcept(swap(p, p)), "non-member swap is conditionally 
noexcept" );
+
+// std::is_swappable not available for -std=c++14
+#if __cplusplus > 201402L || !defined(__STRICT_ANSI__)
+struct Q
+{
+  int i = 0;
+  int& operator*() const;
+
+  Q& operator=(Q&&) = delete;
+};
+
+static_assert( ! std::is_swappable::value, "" );
+
+static_assert( ! std::is_swappable>::value,
+  "non-member swap is constrained" );
+#endif


Re: Aligning stack offsets for spills

2021-06-08 Thread Michael Matz
Hello,

On Mon, 7 Jun 2021, Jeff Law wrote:

> 
> So, as many of you know I left Red Hat a while ago and joined Tachyum.  We're
> building a new processor and we've come across an issue where I think we need
> upstream discussion.
> 
> I can't divulge many of the details right now, but one of the quirks of our
> architecture is that reg+d addressing modes for our vector loads/stores
> require the displacement to be aligned.  This is an artifact of how these
> instructions are encoded.
> 
> Obviously we can emit a load of the address into a register when the
> displacement isn't aligned.  From a correctness point that works perfectly. 
> Unfortunately, it's a significant performance hit on some standard benchmarks
> (spec) where we have a great number of spills of vector objects into the stack
> at unaligned offsets in the hot parts of the code.
> 
> 
> We've considered 3 possible approaches to solve this problem.
> 
> 1. When the displacement isn't properly aligned, allocate more space in
> assign_stack_local so that we can make the offset aligned.  The downside is
> this potentially burns a lot of stack space, but in practice the cost was
> minimal (16 bytes in a 9k frame)  From a performance standpoint this works
> perfectly.
> 
> 2. Abuse the register elimination code to create a second pointer into the
> stack.  Spills would start as  + offset, then either get eliminated
> to sp+offset' when the offset is aligned or gpr+offset'' when the offset
> wasn't properly aligned. We started a bit down this path, but with #1 working
> so well, we didn't get this approach to proof-of-concept.
> 
> 3. Hack up the post-reload optimizers to fix things up as best as we can. 
> This may still be advantageous, but again with #1 working so well, we didn't
> explore this in any significant way.  We may still look at this at some point
> in other contexts.
> 
> Here's what we're playing with.  Obviously we'd need a target hook to 
> drive this behavior.  I was thinking that we'd pass in any slot offset 
> alignment requirements (from the target hook) to assign_stack_local and 
> that would bubble down to this point in try_fit_stack_local:

Why is the machinery involving STACK_SLOT_ALIGNMENT and 
spill_slot_alignment() (for spilling) or get_stack_local_alignment() (for 
backing stack slots) not working for you?  If everything is set up 
correctly the input alignment to try_fit_stack_local ought to be correct 
already.


Ciao,
Michael.


[committed] Further improve redundant test/compare removal on the H8

2021-06-08 Thread Jeff Law via Gcc-patches
This is another minor patch for elimination of redundant test/compares 
on the H8.  In particular it allows the compiler to use the result of a 
byte sized AND instruction to eliminate a compare/test.  The only 
"trick" here is we have to distinguish between BCLR which clears a bit, 
but does not set condition codes and AND which does set condition codes.


This patch also fixes a minor goof in the length computation of such 
instructions.  The prior patterns had the length computation backwards.  
Byte sized ANDs are always 2 bytes while a BCLR can be from 2 to 8 bytes 
long depending on the target operand.


Finally this patch also merges the H8SX BSET/BNOT for QI and HI mode.

This is about 9 hours into the 22 hour test cycle (my timeouts are 
dramatically longer than the defaults to improve test coverage and 
stability).   I'm not expecting any issues.


Committed to the trunk,
Jeff
commit 941aa24ca9553b422dba6e267448ddd952bc52d1
Author: Jeff Law 
Date:   Tue Jun 8 10:10:23 2021 -0400

Further improve redundant test/compare removal on the H8

gcc/
* config/h8300/logical.md (andqi3_1): Move BCLR case into 
define_insn_and_split.
Create length attribute on define_insn_and_split.  Only split for 
cases which we
know will use AND.
(andqi3_1): Renamed from andqi3_1_clobber_flags.  Only handle 
AND here and
fix length computation.
(bmsx): Combine QImode and HImode H8/SX patterns using 
iterator.

diff --git a/gcc/config/h8300/logical.md b/gcc/config/h8300/logical.md
index 34cf74e24ee..fae3c7cd0c5 100644
--- a/gcc/config/h8300/logical.md
+++ b/gcc/config/h8300/logical.md
@@ -62,22 +62,21 @@
(match_operand:QI 2 "h8300_src_operand" "Y0,rn")))]
   "register_operand (operands[0], QImode)
|| single_zero_operand (operands[2], QImode)"
-  "#"
-  "&& reload_completed"
+  "bclr %W2,%R0"
+  "&& reload_completed && !single_zero_operand (operands[2], QImode)"
   [(parallel [(set (match_dup 0) (and:QI (match_dup 1) (match_dup 2)))
- (clobber (reg:CC CC_REG))])])
+ (clobber (reg:CC CC_REG))])]
+  ""
+  [(set_attr "length" "8,2")])
 
-(define_insn "andqi3_1_clobber_flags"
-  [(set (match_operand:QI 0 "bit_operand" "=U,r")
-   (and:QI (match_operand:QI 1 "bit_operand" "%0,0")
-   (match_operand:QI 2 "h8300_src_operand" "Y0,rn")))
+(define_insn "*andqi3_1"
+  [(set (match_operand:QI 0 "register_operand" "=r")
+   (and:QI (match_operand:QI 1 "register_operand" "%0")
+   (match_operand:QI 2 "h8300_src_operand" "rn")))
(clobber (reg:CC CC_REG))]
-  "register_operand (operands[0], QImode)
-   || single_zero_operand (operands[2], QImode)"
-  "@
-   bclr %W2,%R0
-   and  %X2,%X0"
-  [(set_attr "length" "2,8")])
+  ""
+  "and  %X2,%X0"
+  [(set_attr "length" "2")])
 
 (define_insn_and_split "*andor3"
   [(set (match_operand:QHSI 0 "register_operand" "=r")
@@ -166,22 +165,14 @@
 ;; OR/XOR INSTRUCTIONS
 ;; --
 
-(define_insn "bqi_msx"
-  [(set (match_operand:QI 0 "bit_register_indirect_operand" "=WU")
-   (ors:QI (match_operand:QI 1 "bit_register_indirect_operand" "%0")
-   (match_operand:QI 2 "single_one_operand" "Y2")))]
+(define_insn "b_msx"
+  [(set (match_operand:QHI 0 "bit_register_indirect_operand" "=WU")
+   (ors:QHI (match_operand:QHI 1 "bit_register_indirect_operand" "%0")
+(match_operand:QHI 2 "single_one_operand" "Y2")))]
   "TARGET_H8300SX && rtx_equal_p (operands[0], operands[1])"
   { return  == IOR ? "bset\\t%V2,%0" : "bnot\\t%V2,%0"; }
   [(set_attr "length" "8")])
 
-(define_insn "bhi_msx"
-  [(set (match_operand:HI 0 "bit_register_indirect_operand" "=m")
-   (ors:HI (match_operand:HI 1 "bit_register_indirect_operand" "%0")
-   (match_operand:HI 2 "single_one_operand" "Y2")))]
-  "TARGET_H8300SX"
-  { return  == IOR ? "bset\\t%V2,%0" : "bnot\\t%V2,%0"; }
-  [(set_attr "length" "8")])
-
 (define_insn_and_split "qi3_1"
   [(set (match_operand:QI 0 "bit_operand" "=U,rQ")
(ors:QI (match_operand:QI 1 "bit_operand" "%0,0")


Re: [PATCH] predcom: Adjust some unnecessary update_ssa calls

2021-06-08 Thread Martin Sebor via Gcc-patches

On 6/8/21 3:30 AM, Kewen.Lin wrote:

on 2021/6/7 下午10:46, Richard Biener wrote:

On Wed, Jun 2, 2021 at 11:29 AM Kewen.Lin  wrote:


Hi,

As Richi suggested in PR100794, this patch is to remove
some unnecessary update_ssa calls with flag
TODO_update_ssa_only_virtuals, also do some refactoring.

Bootstrapped/regtested on powerpc64le-linux-gnu P9,
x86_64-redhat-linux and aarch64-linux-gnu, built well
on Power9 ppc64le with --with-build-config=bootstrap-O3,
and passed both P8 and P9 SPEC2017 full build with
{-O3, -Ofast} + {,-funroll-loops}.

Is it ok for trunk?


LGTM, minor comment on the fancy C++:

+  auto cleanup = [&]() {
+release_chains (chains);
+free_data_refs (datarefs);
+BITMAP_FREE (looparound_phis);
+free_affine_expand_cache (&name_expansions);
+  };

+  cleanup ();
+  return 0;

so that could have been

   class cleanup {
  ~cleanup()
 {
   release_chains (chains);
   free_data_refs (datarefs);
   BITMAP_FREE (looparound_phis);
   free_affine_expand_cache (&name_expansions);
 }
   } cleanup;

?  Or some other means of registering a RAII-style cleanup?
I mean, we can't wrap it all in

   try {...}
   finally {...}

because C++ doesn't have finally.

OK with this tiny part of the C++ refactoring delayed, but we can also simply
discuss best options.  At least for looparound_phis a good cleanup would
be to pass the bitmap around and use auto_bitmap local to
tree_predictive_commoning_loop ...



Thanks Richi!  One draft (not ready for review) is attached for the further
discussion.  It follows the idea of RAII-style cleanup.  I noticed that
Martin suggested going a step further and making tree_predictive_commoning_loop
and its callees members of one class (thanks, Martin).  Since there are not many
C++-style work functions of this kind, I want to double-check: which option
do you guys prefer?


I meant that not necessarily as something to include in this patch
but as a suggestion for a future improvement.  If you'd like to
tackle it at any point that would be great of course :)  In any
event, thanks for double-checking!

The attached patch looks good to me as well (more for the sake of
style than anything else: declaring the class copy ctor and copy
assignment = delete would make it clear it's not meant to be
copied, although in this case it's unlikely to make a practical
difference).



One point you might have seen is that making tree_predictive_commoning_loop
and its callees member functions of one class would avoid passing the bitmap
looparound_phis all around, as is done in the draft.  :)


Yes, that would simplify the interfaces of all the functions that
the info members are passed to as arguments.

Martin



BR,
Kewen





Re: [PATCH] Implement a context aware points-to analyzer for use in evrp.

2021-06-08 Thread Andrew MacLeod via Gcc-patches

On 6/8/21 3:26 AM, Richard Biener wrote:

On Mon, Jun 7, 2021 at 9:20 PM Andrew MacLeod  wrote:


I don't think this is actually doing the propagation though... It tracks
that a_2 currently points to &foo.. and returns that to either
simplifier or folder thru value_of_expr().  Presumably it is up to them
to determine whether the tree expression passed back is safe to
propagate.   Is there any attempt in EVRP to NOT set the range of
something to [&foo, &foo] under some conditions?   This is what the
change amounts to.  Ranger would just return a range of [1, +INF], and
value_of_expr  would therefore return NULL.  This allows value_of to
return &foo in these conditions.   Aldy, did you see any other checks in
the vr-values code?

Things like   if (var1_2 == var2_3) deal with just ssa-names and will be
handled by an ssa_name relation oracle. It just treats equivalencies
like a slightly special kind of relation. I'm just about to bring that
forward this week.

Ah, great - I'm looking forward to this.  Currently both DOM and VN


The initial code will be a bit basic, but it can be educated as we go 
along :-)


It's currently tied into ranger just because as ranger processes 
statements it registers any relations it sees.. the oracle organizes 
these and can answer questions on anything it has seen.


It is otherwise independent of ranger. It is dominance based, and there 
is no reason relations can't be queried and registered by any pass doing 
a DOM walk without ranger.  It benefits from ranger in that sometimes 
relations are refined when we know ranges  (ie for unsigned math)


    a_2 = b_4 + 6

if we know the range of b_4 will not cause an overflow, then we could 
set a_2 > b_4.. otherwise we can't..  Wiring it with ranger also removes 
the dependency on a DOM walk as ranger sorts the ordering out as needed.


It is driven by data provided by range-ops and is more of a data 
propagation/lookup mechanism than anything. There are a number of cases 
we don't currently register relations simply because we have not fleshed 
out the various tree code instructions.   We'll get to those eventually. 
I expect a number of the PRs will eventually be fixed primarily by 
adding code to range-ops .


It also only does first order relations so far...  I'll get to 
transitives and other things as well.




do a very simplistic thing when trying to simplify downstream conditions
based on earlier ones, abusing their known-expressions hash tables
by, for example, registering (a < b) == 1, (a > b) == 0, (a == b) == 0,
(a != b) == 1 for an earlier a < b condition on the true edge.  So I wonder
if this relation code can be somehow used there.  In VN there's the
extra complication that it iterates, but DOM is just a DOM-walk and
the VN code also has a non-iterating mode (but not a DOM walk).


I don't think the iteration is an issue,  ranger iterates to some degree 
as well, and some statement are registered multiple times. I believe it 
intersects with any known relation, so if an iteration causes a relation 
to become "better" it should be updated.


The API for registering is pretty straightforward:

  void register_relation (gimple *stmt, relation_kind k, tree op1, tree 
op2);

  void register_relation (edge e, relation_kind k, tree op1, tree op2);

so all you'd have to do when a < b is encountered is to register  (a 
LT_EXPR b) on the true edge, and (a GE_EXPR b) on the false edge.  Then 
any queries downstream should be reflected.





Of course the code is also used to simplify

  if (a > b)
 c = a != b;

but the relation oracle should be able to handle that as well I guess.

yeah, so a GT_EXPR B is registered on the true edge.  Then when 
processing c = a != b,  you can determine that a NE_EXPR b intersected 
with a GT_EXPR b results in  a GT_EXPR b... which folds to a 1.


This is all also available with the range-op API additions such that you 
can simply call :


rangerop->fold_range (stmt(c = a != b), range_of_a, range_of_b, GT_EXPR 
(relation of a to b)  and the range returned will be [1,1].


The actual ranges in this case are irrelevant, but aren't for some other 
kinds of stmts.


Likewise, simply asking ranger for the range of c will return 
[1,1], the relation processing is all integrated behind the scenes in 
ranger..


As we start using the code more, we may find we want/need a few more 
wrappers around some of this so that you can transparently ask what the 
RHS folds to without any ranger present, just with relations.  Those'll 
be fairly trivial to add...


The relation oracle is going to be directly accessible from the 
get_range_query(cfun) range_query class.  I'll do a big writeup when i 
submit it and we should be able to make it usable in any of those places.


Andrew





Re: Aligning stack offsets for spills

2021-06-08 Thread Jeff Law




On 6/8/2021 8:08 AM, Michael Matz wrote:

Hello,

On Mon, 7 Jun 2021, Jeff Law wrote:


So, as many of you know I left Red Hat a while ago and joined Tachyum.  We're
building a new processor and we've come across an issue where I think we need
upstream discussion.

I can't divulge many of the details right now, but one of the quirks of our
architecture is that reg+d addressing modes for our vector loads/stores
require the displacement to be aligned.  This is an artifact of how these
instructions are encoded.

Obviously we can emit a load of the address into a register when the
displacement isn't aligned.  From a correctness point that works perfectly.
Unfortunately, it's a significant performance hit on some standard benchmarks
(spec) where we have a great number of spills of vector objects into the stack
at unaligned offsets in the hot parts of the code.


We've considered 3 possible approaches to solve this problem.

1. When the displacement isn't properly aligned, allocate more space in
assign_stack_local so that we can make the offset aligned.  The downside is
this potentially burns a lot of stack space, but in practice the cost was
minimal (16 bytes in a 9k frame)  From a performance standpoint this works
perfectly.

2. Abuse the register elimination code to create a second pointer into the
stack.  Spills would start as  + offset, then either get eliminated
to sp+offset' when the offset is aligned or gpr+offset'' when the offset
wasn't properly aligned. We started a bit down this path, but with #1 working
so well, we didn't get this approach to proof-of-concept.

3. Hack up the post-reload optimizers to fix things up as best as we can.
This may still be advantageous, but again with #1 working so well, we didn't
explore this in any significant way.  We may still look at this at some point
in other contexts.

Here's what we're playing with.  Obviously we'd need a target hook to
drive this behavior.  I was thinking that we'd pass in any slot offset
alignment requirements (from the target hook) to assign_stack_local and
that would bubble down to this point in try_fit_stack_local:

Why is the machinery involving STACK_SLOT_ALIGNMENT and
spill_slot_alignment() (for spilling) or get_stack_local_alignment() (for
backing stack slots) not working for you?  If everything is setup
correctly the input alignment to try_fit_stack_local ought to be correct
already.
We don't need the MEM as a whole aligned, just the offset in the address 
calculation due to how we encode those instructions.  If I've read that 
code correctly, it would arrange for a dynamic realignment of the stack  
so that it could then align the slot. None of that is necessary for us 
and we'd like to avoid forcing the dynamic stack realignment.  Or did I 
misread the code?


jeff


Re: [PATCH] PR tree-optimization/100781 - Do not calculate new values when evaluating a debug, statement.

2021-06-08 Thread Andrew MacLeod via Gcc-patches

On 6/2/21 3:29 AM, Richard Biener wrote:

On Tue, Jun 1, 2021 at 4:24 PM Andrew MacLeod  wrote:

On 6/1/21 3:34 AM, Richard Biener wrote:

On Tue, Jun 1, 2021 at 3:38 AM Andrew MacLeod via Gcc-patches
 wrote:

An ongoing issue is that the order we evaluate things in can affect
decisions along the way. As ranger isn't a fully iterative pass, we can
sometimes come up with different results if back edges are processed in
different orders.

One of the ways this can happen is when the cache is propagating
on-entry values for an SSA_NAME. It calculates outgoing edge values and
the gori-compute engine can flag ssa-names that were involved in a range
calculation that have not yet been initialized.  When the propagation
for the original name is done, it goes back and examines the "poor
values" and tries to quickly calculate a better range, and if it comes
up with one, immediately tries to go back  and update the location/range
gori_compute flagged.   This produces better ranges earlier.

However, when we do this in different orders, we can get different
results.  We were processing the uses on is_gimple_debug statements just
like normal uses, and this would sometimes cause a difference in how
things were resolved.

This patch adds a flag to enable/disable this attempt to look up new
values, and when range_of_expr is processing the use on a debug
statement, turns it off for the query.  This means the query will never
cause a new lookup, and this should resolve all the -fcompare-debug issues.

Bootstrapped on x86_64-pc-linux-gnu, with no new regressions. Pushed.

Please check if such fixes also apply to the GCC 11 branch.

Richard.



I've checked both testcases against gcc11 release, and neither is an
issue there.  Much of this was triggered by changes to the export list.
That said, is there potential for it to surface? The potential is
probably there.   We'd have to address it differently tho.  For the
gcc11 release, since we always run in hybrid mode it doesn't really
matter if ranger looks up ranges for debug statements... EVRP will still
pick up what we used to get for them.  We could simply disable looking
for contextual ranges for is_gimple_stmt and simply pick up the best
known global/on-entry value available..   I can either provide a patch
for that now, or deal with it if we ever get a PR.  I'm ok either way.

I think it would be good to robustify the code even w/o a PR.


btw, when is the next point release? I added an infrastructure patch to
trunk (https://gcc.gnu.org/pipermail/gcc-patches/2021-May/569884.html)
to enable replacing the on-entry cache to deal with memory consumption
issues like in https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100299 .  I
specifically put it in early before the other changes so that it could
be directly applied to gcc11 as well, but I need to follow up with one
of the replacements I have queued up to look at if we are interested in
fixing this in gcc 11.  I'll bump the priority to try to hit the next
release if that's the case.

The first point release is usually about two months from the initial release
which means in about a month and a half.  It would be nice to fix
those issues and the earlier in the release series the better.

Richard.


Andrew

OK, so this would be the simple way I'd tackle this in gcc11. This 
should be quite safe.  Just treat debug_stmts as if they are not stmts.. 
and make a global query.   EVRP will still provide a contextual range as 
good as it ever did, but it won't trigger ranger lookups on debug uses 
any more.


It bootstraps on x86_64-pc-linux-gnu.  Is there a process other than 
getting the OK to check this into the gcc 11 branch?  Does it go into 
releases/gcc-11 ?


Andrew



>From ff5ab360b21a83ac84b1fff22d091df2c44dafdf Mon Sep 17 00:00:00 2001
From: Andrew MacLeod 
Date: Tue, 8 Jun 2021 09:43:17 -0400
Subject: [PATCH 4/4] Don't process lookups for debug statements in Ranger.

Although PR 100781 is not an issue in GCC11, it's possible that a similar
situation may arise.  The identical fix cannot be easily introduced.
With EVRP always running in hybrid mode, there is no need for ranger to
spawn a lookup for a debug statement in this release.

	* gimple-range.cc (gimple_ranger::range_of_expr): Treat debug statements
	as contextless queries to avoid additional lookups.
---
 gcc/gimple-range.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/gimple-range.cc b/gcc/gimple-range.cc
index 6158a754dd6..fd7fa5e3dbb 100644
--- a/gcc/gimple-range.cc
+++ b/gcc/gimple-range.cc
@@ -945,7 +945,7 @@ gimple_ranger::range_of_expr (irange &r, tree expr, gimple *stmt)
 return get_tree_range (r, expr);
 
   // If there is no statement, just get the global value.
-  if (!stmt)
+  if (!stmt || is_gimple_debug (stmt))
 {
   if (!m_cache.get_global_range (r, expr))
 r = gimple_range_global (expr);
-- 
2.25.4



[PATCH] [GCC 11] tree-optimization/100299 - Cherry picked solution from trunk

2021-06-08 Thread Andrew MacLeod via Gcc-patches
The 2 recent patches, plus the original abstraction patch can be simply 
cherry picked for gcc 11.


I have applied the 3 patches to the current gcc 11 release, and it 
bootstrapped with no regressions on x86_64-pc-linux-gnu.


Andrew



>From 58289b678064c4b4e1efeb806f78c42d86ae31a4 Mon Sep 17 00:00:00 2001
From: Andrew MacLeod 
Date: Fri, 7 May 2021 12:03:01 -0400
Subject: [PATCH 1/4] Clean up and virtualize the on-entry cache interface.

Cleanup/Virtualize the ssa_block_range class, and implement the current
vector approach as a derived class.
Allow memory allocation from the irange allocator obstack for easy freeing.

	* gimple-range-cache.cc (ssa_block_ranges): Virtualize.
	(sbr_vector): Renamed from ssa_block_cache.
	(sbr_vector::sbr_vector): Allocate from obstack and initialize.
	(ssa_block_ranges::~ssa_block_ranges): Remove.
	(sbr_vector::set_bb_range): Use varying and undefined cached values.
	(ssa_block_ranges::set_bb_varying): Remove.
	(sbr_vector::get_bb_range): Adjust assert.
	(sbr_vector::bb_range_p): Adjust assert.
	(~block_range_cache): No freeing loop required.
	(block_range_cache::get_block_ranges): Remove.
	(block_range_cache::set_bb_range): Inline get_block_ranges.
	(block_range_cache::set_bb_varying): Remove.
	* gimple-range-cache.h (set_bb_varying): Remove prototype.
	* value-range.h (irange_allocator::get_memory): New.

(cherry picked from commit 14b0f37a644d7b59e1737fb275ec4fff044972a8)
---
 gcc/gimple-range-cache.cc | 165 --
 gcc/gimple-range-cache.h  |   1 -
 gcc/value-range.h |   9 +++
 3 files changed, 80 insertions(+), 95 deletions(-)

diff --git a/gcc/gimple-range-cache.cc b/gcc/gimple-range-cache.cc
index 38e4fe1c7c0..2be83d698ab 100644
--- a/gcc/gimple-range-cache.cc
+++ b/gcc/gimple-range-cache.cc
@@ -107,29 +107,53 @@ non_null_ref::process_name (tree name)
 
 // -
 
-// This class implements a cache of ranges indexed by basic block.  It
-// represents all that is known about an SSA_NAME on entry to each
-// block.  It caches a range-for-type varying range so it doesn't need
-// to be reformed all the time.  If a range is ever always associated
-// with a type, we can use that instead.  Whenever varying is being
-// set for a block, the cache simply points to this cached one rather
-// than create a new one each time.
+// This class represents the API into a cache of ranges for an SSA_NAME.
+// Routines must be implemented to set, get, and query if a value is set.
 
 class ssa_block_ranges
 {
 public:
-  ssa_block_ranges (tree t, irange_allocator *allocator);
-  ~ssa_block_ranges ();
-
-  void set_bb_range (const basic_block bb, const irange &r);
-  void set_bb_varying (const basic_block bb);
-  bool get_bb_range (irange &r, const basic_block bb);
-  bool bb_range_p (const basic_block bb);
+  virtual void set_bb_range (const basic_block bb, const irange &r) = 0;
+  virtual bool get_bb_range (irange &r, const basic_block bb) = 0;
+  virtual bool bb_range_p (const basic_block bb) = 0;
 
   void dump(FILE *f);
-private:
-  vec m_tab;
-  irange *m_type_range;
+};
+
+// Print the list of known ranges for file F in a nice format.
+
+void
+ssa_block_ranges::dump (FILE *f)
+{
+  basic_block bb;
+  int_range_max r;
+
+  FOR_EACH_BB_FN (bb, cfun)
+if (get_bb_range (r, bb))
+  {
+	fprintf (f, "BB%d  -> ", bb->index);
+	r.dump (f);
+	fprintf (f, "\n");
+  }
+}
+
+// This class implements the range cache as a linear vector, indexed by BB.
+// It caches a varying and undefined range which are used instead of
+// allocating new ones each time.
+
+class sbr_vector : public ssa_block_ranges
+{
+public:
+  sbr_vector (tree t, irange_allocator *allocator);
+
+  virtual void set_bb_range (const basic_block bb, const irange &r) OVERRIDE;
+  virtual bool get_bb_range (irange &r, const basic_block bb) OVERRIDE;
+  virtual bool bb_range_p (const basic_block bb) OVERRIDE;
+protected:
+  irange **m_tab;	// Non growing vector.
+  int m_tab_size;
+  int_range<2> m_varying;
+  int_range<2> m_undefined;
   tree m_type;
   irange_allocator *m_irange_allocator;
 };
@@ -137,55 +161,43 @@ private:
 
 // Initialize a block cache for an ssa_name of type T.
 
-ssa_block_ranges::ssa_block_ranges (tree t, irange_allocator *allocator)
+sbr_vector::sbr_vector (tree t, irange_allocator *allocator)
 {
   gcc_checking_assert (TYPE_P (t));
   m_type = t;
   m_irange_allocator = allocator;
-
-  m_tab.create (0);
-  m_tab.safe_grow_cleared (last_basic_block_for_fn (cfun));
+  m_tab_size = last_basic_block_for_fn (cfun) + 1;
+  m_tab = (irange **)allocator->get_memory (m_tab_size * sizeof (irange *));
+  memset (m_tab, 0, m_tab_size * sizeof (irange *));
 
   // Create the cached type range.
-  m_type_range = m_irange_allocator->allocate (2);
-  m_type_range->set_varying (t);
-
-  m_tab[ENTRY_BLOCK_PTR_FOR_FN (cfun)->index] = m_type_range;
-}
-
-// Destruct block range.
-
-ssa_block_ranges::~ssa

Re: Aligning stack offsets for spills

2021-06-08 Thread Jakub Jelinek via Gcc-patches
On Tue, Jun 08, 2021 at 08:47:26AM -0600, Jeff Law wrote:
> > Why is the machinery involving STACK_SLOT_ALIGNMENT and
> > spill_slot_alignment() (for spilling) or get_stack_local_alignment() (for
> > backing stack slots) not working for you?  If everything is setup
> > correctly the input alignment to try_fit_stack_local ought to be correct
> > already.
> We don't need the MEM as a whole aligned, just the offset in the address
> calculation due to how we encode those instructions.  If I've read that code
> correctly, it would arrange for a dynamic realignment of the stack  so that
> it could then align the slot. None of that is necessary for us and we'd like
> to avoid forcing the dynamic stack realignment.  Or did I misread the code?

I think dynamic stack realignment is done only on x86, no other backend has
that support, on all the other arches larger alignments are done
in expand_stack_vars by effectively performing __builtin_alloca_with_align
for the block containing all such variables.
So I'd think the functions Michael mentioned shouldn't be doing dynamic stack
realignment, though pretending the vars have higher alignment
might get recorded in MEM_ALIGN and perhaps might result in wrong-code if
something tries to e.g. test if the least significant bits of a certain MEM
address are 0.

Jakub



Re: Aligning stack offsets for spills

2021-06-08 Thread Jeff Law




On 6/8/2021 12:56 AM, Richard Biener wrote:

On Mon, Jun 7, 2021 at 9:00 PM Jeff Law  wrote:


So, as many of you know I left Red Hat a while ago and joined Tachyum.
We're building a new processor and we've come across an issue where I
think we need upstream discussion.

I can't divulge many of the details right now, but one of the quirks of
our architecture is that reg+d addressing modes for our vector
loads/stores require the displacement to be aligned.  This is an
artifact of how these instructions are encoded.

Obviously we can emit a load of the address into a register when the
displacement isn't aligned.  From a correctness point that works
perfectly.  Unfortunately, it's a significant performance hit on some
standard benchmarks (spec) where we have a great number of spills of
vector objects into the stack at unaligned offsets in the hot parts of
the code.


We've considered 3 possible approaches to solve this problem.

1. When the displacement isn't properly aligned, allocate more space in
assign_stack_local so that we can make the offset aligned.  The downside
is this potentially burns a lot of stack space, but in practice the cost
was minimal (16 bytes in a 9k frame)  From a performance standpoint this
works perfectly.

2. Abuse the register elimination code to create a second pointer into
the stack.  Spills would start as  + offset, then either get
eliminated to sp+offset' when the offset is aligned or gpr+offset'' when
the offset wasn't properly aligned. We started a bit down this path, but
with #1 working so well, we didn't get this approach to proof-of-concept.

3. Hack up the post-reload optimizers to fix things up as best as we
can.  This may still be advantageous, but again with #1 working so well,
we didn't explore this in any significant way.  We may still look at
this at some point in other contexts.


So just as extra info - you're pre-allocating the frame (including for spills)
and not using push/pop?

Yes, we're an ACCUMULATE_OUTGOING_ARGS target.





Here's what we're playing with.  Obviously we'd need a target hook to
drive this behavior.  I was thinking that we'd pass in any slot offset
alignment requirements (from the target hook) to assign_stack_local and
that would bubble down to this point in try_fit_stack_local:

diff --git a/gcc/function.c b/gcc/function.c
index d616f5f64f4..7f441b87a63 100644
--- a/gcc/function.c
+++ b/gcc/function.c
@@ -307,6 +307,14 @@ try_fit_stack_local (poly_int64 start, poly_int64
length,
 frame_off = targetm.starting_frame_offset () % frame_alignment;
 frame_phase = frame_off ? frame_alignment - frame_off : 0;

+  if (known_eq (size, 64) && alignment < 64)
+alignment = 64;
+

I'm not familiar with the spill slot allocation code in GCC (I assume the above
is part of it) - do we in any way "sort" the spill slots so the extra required
padding is minimal?  Does the above guarantee that in the end the
offset will be aligned?  I assume IRA/LRA can still choose to eliminate
the respective frame pointer to sth else that ends up misaligning the offset
again?  Thus is it a real fix or a heuristic that ends up working most of
the time?
LRA does sort the spill slots, but I haven't looked into its sorting 
algorithm to see if it's anything other than a priority sort.  LRA does 
allow sharing spill slots for non-conflicting pseudos which is what I've 
assumed has kept the extra padding to a minimum.


It's a real fix, not a heuristic.



The actual alignment value should be dependent on the mode and
target preference and thus a target hook I suppose (you mention
this applies to vector loads/stores only).
Absolutely.  What I posted was just the initial proof-of-concept. It 
needs to be a target hook and we need to pass in the data from LRA since 
by the time we get into assign_stack_local, we don't have a useful mode 
-- LRA passes in the size and BLKmode.


I probably trimmed out too many comments in my attempt to avoid 
disclosing anything I shouldn't.  It's worth noting that adjusting 
things at that particular point results in getting the offsets aligned 
without forcing the stack as a whole into a higher alignment or even 
forcing slots to a higher alignment.





Don't you have the very same issue with non-stack accesses?
We do and will continue to handle those by reloading the reg+d address 
when the displacement isn't suitably aligned.  In practice those cases 
aren't common and aren't on critical paths.


Jeff



Re: [RFC] Implementing detection of saturation and rounding arithmetic

2021-06-08 Thread Andre Simoes Dias Vieira via Gcc-patches

Hi Bin,

Thank you for the reply, I have some questions, see below.

On 07/06/2021 12:28, Bin.Cheng wrote:

On Fri, Jun 4, 2021 at 12:35 AM Andre Vieira (lists) via Gcc-patches
 wrote:

Hi Andre,
I didn't look into the details of the IV sharing RFC.  It seems to me
costing outside uses is trying to generate better code for later code
(epilogue loop here).  The only problem is IVOPTs doesn't know that
the outside use is not in the final form - which will be transformed
by IVOPTs again.

I think this example is not good at describing your problem because it
shows exactly that considering outside use results in better code,
compared to the other two approaches.
I don't quite understand what you are saying here :( What do you mean by 
final form? It seems to me that costing uses inside and outside loop the 
same way is wrong because calculating the IV inside the loop has to be 
done every iteration, whereas if you can resolve it to a single update 
(without an IV) then you can sink it outside the loop. This is why I 
think this example shows why we need to cost these uses differently.

2) Is there a cleaner way to generate the optimal 'post-increment' use
for the outside-use variable? I first thought the position in the
candidate might be something I could use or even the var_at_stmt
functionality, but the outside IV has the actual increment of the
variable as its use, rather than the outside uses. This is this RFC's
main weakness I find.

To answer why IVOPTs behaves like this w/o your two patches.  The main
problem is the point IVOPTs rewrites outside use IV - I don't remember
the exact point - but looks like at the end of loop while before
incrementing instruction of main IV.  It's a known issue that outside
use should be costed/re-written on the exit edge along which its value
flows out of loop.  I had a patch a long time ago but discarded it,
because it didn't bring obvious improvement and is complicated in case
of multi-exit edges.
Yeah I haven't looked at multi-exit edges and I understand that 
complicates things. But for now we could disable the special casing of 
outside uses when dealing with multi-exit loops and keep the current 
behavior.


But in general, I am less convinced that any of the two patches is the
right direction solving IV sharing issue between vectorized loop and
epilogue loop.  I would need to read the previous RFC before giving
further comments though.


The previous RFC still has a lot of unanswered questions too, but 
regardless of that, take the following (non-vectorizer) example:


#include 
#include 

void bar (char  * __restrict__ a, char * __restrict__ b, char * 
__restrict__ c, unsigned long long n)

{
    svbool_t all_true = svptrue_b8 ();
  unsigned long long i = 0;
    for (; i < (n & ~(svcntb() - 1)); i += svcntb()) {
  svuint8_t va = svld1 (all_true, (uint8_t*)a);
  svuint8_t vb = svld1 (all_true, (uint8_t*)b);
  svst1 (all_true, (uint8_t *)c, svadd_z (all_true, va,vb));
  a += svcntb();
  b += svcntb();
  c += svcntb();
  }
  svbool_t pred;
  for (; i < (n); i += svcntb()) {
  pred = svwhilelt_b8 (i, n);
  svuint8_t va = svld1 (pred, (uint8_t*)a);
  svuint8_t vb = svld1 (pred, (uint8_t*)b);
  svst1 (pred, (uint8_t *)c, svadd_z (pred, va,vb));
  a += svcntb();
  b += svcntb();
  c += svcntb();
  }


Current IVOPTs will use 4 iterators for the first loop, when it could do 
with just 1. In fact, if you use my patches it will create just a single 
IV and sink the uses and it is then able to merge them with loads & 
stores of the next loop.


I am not saying setting outside costs to 0 is the right thing to do by 
the way. It is absolutely not! It will break cost considerations for 
other cases. Like I said above I've been playing around with using 
'!use->outside' as a multiplier for the cost. Unfortunately it won't 
help with the case above, because this seems to choose 'infinite_cost' 
because the candidate IV has a lower precision than the use IV. I don't 
quite understand yet how candidates are created, but something I'm going 
to try to look at. Just wanted to show this as an example of how IVOPTs 
would not improve code with multiple loops that don't involve the 
vectorizer.


BR,
Andre




Thanks,
bin


Re: Aligning stack offsets for spills

2021-06-08 Thread H.J. Lu via Gcc-patches
On Tue, Jun 8, 2021 at 7:56 AM Jakub Jelinek via Gcc-patches
 wrote:
>
> On Tue, Jun 08, 2021 at 08:47:26AM -0600, Jeff Law wrote:
> > > Why is the machinery involving STACK_SLOT_ALIGNMENT and
> > > spill_slot_alignment() (for spilling) or get_stack_local_alignment() (for
> > > backing stack slots) not working for you?  If everything is setup
> > > correctly the input alignment to try_fit_stack_local ought to be correct
> > > already.
> > We don't need the MEM as a whole aligned, just the offset in the address
> > calculation due to how we encode those instructions.  If I've read that code
> > correctly, it would arrange for a dynamic realignment of the stack  so that
> > it could then align the slot. None of that is necessary for us and we'd like
> > to avoid forcing the dynamic stack realignment.  Or did I misread the code?
>
> I think dynamic stack realignment is done only on x86, no other backend has

I believe that all pieces of infrastructure to realign the stack are
in place.  You
just need to properly align the stack in the backend.

> that support, on all the other arches larger alignments are done
> in expand_stack_vars by effectively performing __builtin_alloca_with_align
> for the block containing all such variables.
> So I'd think the functions Michael mentioned shouldn't be doing dynamic stack
> realignment, though perhaps by pretending the vars have higher alignment
> might be recorded in MEM_ALIGN and perhaps might result in wrong-code if
> something will try to e.g. test if least significant bits of certain MEM
> address are 0.
>
> Jakub
>


-- 
H.J.


Re: [PATCH] rtl: Join the insn and split conditions in define_insn_and_split

2021-06-08 Thread Segher Boessenkool
On Tue, Jun 08, 2021 at 02:48:11PM +0200, Richard Biener wrote:
> > So yeah, patch withdrawn.  This on one hand is proof we do want to make
> > such a change, but on the other hand shows it needs more preparatory
> > steps.
> 
> I wonder if it makes sense to provide ports a means to opt-in into
> the strict "&& " requirement and thus we can gradually fix them.
> Probably requires some t-$target make fragment editing plus
> passing an extra arg to gen* based on that.
> 
> That way maintainers can gradually fix their ports and make sure
> they won't regress again.

Just some target macro might be better / easier?  Just gensupport.c will
need to use it.

Will we still allow empty split conditions?  And automatically make that
do the equivalent of "&& 1"?


Segher


Re: Aligning stack offsets for spills

2021-06-08 Thread Jeff Law




On 6/8/2021 9:06 AM, H.J. Lu wrote:

On Tue, Jun 8, 2021 at 7:56 AM Jakub Jelinek via Gcc-patches
 wrote:

On Tue, Jun 08, 2021 at 08:47:26AM -0600, Jeff Law wrote:

Why is the machinery involving STACK_SLOT_ALIGNMENT and
spill_slot_alignment() (for spilling) or get_stack_local_alignment() (for
backing stack slots) not working for you?  If everything is setup
correctly the input alignment to try_fit_stack_local ought to be correct
already.

We don't need the MEM as a whole aligned, just the offset in the address
calculation due to how we encode those instructions.  If I've read that code
correctly, it would arrange for a dynamic realignment of the stack  so that
it could then align the slot. None of that is necessary for us and we'd like
to avoid forcing the dynamic stack realignment.  Or did I misread the code?

I think dynamic stack realignment is done only on x86, no other backend has

I believe that all pieces of infrastructure to realign the stack are
in place.  You
just need to properly align the stack in the backend.
As I've stated, we don't need the stack aligned to these higher 
boundaries.  Nor do we need the slot as a whole aligned.   That's 
ultimately just wasteful since we don't need them.  We just want to get 
an aligned offset.


Jeff


Re: [PATCH][version 3]add -ftrivial-auto-var-init and variable attribute "uninitialized" to gcc

2021-06-08 Thread Qing Zhao via Gcc-patches


On Jun 8, 2021, at 2:41 AM, Richard Biener 
mailto:rguent...@suse.de>> wrote:



Which is also why I suggested to split out the padding initialization
bits to a separate patch (and option).

Personally, I am okay with splitting padding initialization from this current 
patch,
Kees, what’s your opinion on this? i.e, the current -ftrivial-auto-var-init 
will NOT initialize padding, we will add another option to
Explicitly initialize padding.

It would also be possible to have -fauto-var-init, -fauto-var-init-padding
and have -ftrivial-auto-var-init for clang compatibility enabling
both.

I really like this idea.

Personally, I do think that separating padding initialization from auto-var 
initialization will make the design and implementation more clean.

With an additional -ftrivial-auto-var-init to include both will serve the clang 
compatibility well.

 Or -fauto-var-init={zero,pattern,padding} and allow
-fauto-var-init=pattern,padding to be specified.  Note there's also
padding between auto variables on the stack - that "trailing"
padding isn't initialized either?  (yes, GCC sorts variables to minimize
that padding)  For example for

void foo()
{
 char a[3];
 bar (a);
}

there's 12 bytes padding after 'a', shouldn't we initialize that?

Yes, in the current patch, tail paddings are also initialized.

But “paddings” between auto variables are not initialized. (They do not belong 
to variables).

Qing


 If not,
why's other padding important to be initialized?

Richard.



Re: [PATCH] libstdc++: Fix Wrong param type in :atomic_ref<_Tp*>::wait [PR100889]

2021-06-08 Thread Jonathan Wakely via Gcc-patches
On Tue, 8 Jun 2021 at 01:29, Thomas Rodgers wrote:

> This time without the repeated [PR] in the subject line.
>
> Fixes libstdc++/100889
>

This should be part of the ChangeLog entry instead, preceded by PR so it
updates bugzilla, i.e.



> libstdc++-v3/ChangeLog:
>

PR libstdc++/100889


> * include/bits/atomic_base.h (atomic_ref<_Tp*>::wait):
> Change parameter type from _Tp to _Tp*.
> * testsuite/29_atomics/atomic_ref/wait_notify.cc: Extend
> coverage of types tested.
>


OK for trunk and gcc-11 with that change, thanks.


[RFC PATCH] i386: Do not emit segment overrides for %p and %P [PR100936]

2021-06-08 Thread Uros Bizjak via Gcc-patches
Using %p to move the address of a symbol using LEA:

  asm ("lea %p1, %0" : "=r"(addr) : "m"(var));

emits assembler warning when VAR is declared in a non-generic address space:

  Warning: segment override on `lea' is ineffectual

The problem is with %p operand modifier, which should emit raw symbol name:

  p -- print raw symbol name.

Similar problem exists with %P modifier, trying to CALL or JMP to an
overridden symbol, e.g.:

call %gs:zzz
jmp %gs:zzz

emits assembler warning:

  Warning: skipping prefixes on `call'
  Warning: skipping prefixes on `jmp'

Ensure that %p and %P never emit segment overrides.

2021-06-08  Uroš Bizjak  

gcc/
PR target/100936
* config/i386/i386.c (print_operand_address_as): Rename "no_rip"
argument to "raw".  Do not emit segment overrides when "raw" is true.

gcc/testsuite/

PR target/100936
* gcc.target/i386/pr100936.c: New test.

Bootstrapped and regression tested on x86_64-linux-gnu {,-m32}.

Uros.
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index b0d19a61a76..05b8dc806cd 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -13531,7 +13531,7 @@ ix86_print_operand_punct_valid_p (unsigned char code)
 
 static void
 ix86_print_operand_address_as (FILE *file, rtx addr,
-  addr_space_t as, bool no_rip)
+  addr_space_t as, bool raw)
 {
   struct ix86_address parts;
   rtx base, index, disp;
@@ -13570,7 +13570,7 @@ ix86_print_operand_address_as (FILE *file, rtx addr,
   else
 gcc_assert (ADDR_SPACE_GENERIC_P (parts.seg));
 
-  if (!ADDR_SPACE_GENERIC_P (as))
+  if (!ADDR_SPACE_GENERIC_P (as) && !raw)
 {
   if (ASSEMBLER_DIALECT == ASM_ATT)
putc ('%', file);
@@ -13589,7 +13589,7 @@ ix86_print_operand_address_as (FILE *file, rtx addr,
 }
 
   /* Use one byte shorter RIP relative addressing for 64bit mode.  */
-  if (TARGET_64BIT && !base && !index && !no_rip)
+  if (TARGET_64BIT && !base && !index && !raw)
 {
   rtx symbol = disp;
 
diff --git a/gcc/testsuite/gcc.target/i386/pr100936.c 
b/gcc/testsuite/gcc.target/i386/pr100936.c
new file mode 100644
index 000..c076cbb2405
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr100936.c
@@ -0,0 +1,34 @@
+/* PR target/100936 */
+/* { dg-do assemble } */
+/* { dg-options "-O2" } */
+
+__seg_gs int var;
+
+static int
+*foo (void)
+{
+  int *addr;
+
+  asm ("lea %p1, %0" : "=r"(addr) : "m"(var));
+
+  return addr;
+}
+
+static int
+bar (int *addr)
+{
+  int val;
+
+  asm ("mov %%gs:%1, %0" : "=r"(val) : "m"(*addr));
+
+  return val;
+}
+
+int
+baz (void)
+{
+  int *addr = foo();
+  int val = bar (addr);
+  
+  return val;
+}


Re: [PATCH] rtl: Join the insn and split conditions in define_insn_and_split

2021-06-08 Thread Richard Sandiford via Gcc-patches
Segher Boessenkool  writes:
> On Tue, Jun 08, 2021 at 02:48:11PM +0200, Richard Biener wrote:
>> > So yeah, patch withdrawn.  This on one hand is proof we do want to make
>> > such a change, but on the other hand shows it needs more preparatory
>> > steps.
>> 
>> I wonder if it makes sense to provide ports a means to opt-in into
>> the strict "&& " requirement and thus we can gradually fix them.
>> Probably requires some t-$target make fragment editing plus
>> passing an extra arg to gen* based on that.
>> 
>> That way maintainers can gradually fix their ports and make sure
>> they won't regress again.
>
> Just some target macro might be better / easier?  Just gensupport.c will
> need to use it.
>
> Will we still allow empty split conditions?  And automatically make that
> do the equivalent of "&& 1"?

Wouldn't that run the risk of the partial transition that my suggestion
was rejected for? ;-)

I think an empty define_insn_and_split split condition should continue
to mean the same thing everywhere.  So while we continue to have ports
in which an empty condition means one thing, I don't think we should
also have ports where an empty condition means something else.

Thanks,
Richard


Re: Aligning stack offsets for spills

2021-06-08 Thread Michael Matz
Hello,

On Tue, 8 Jun 2021, Jeff Law wrote:

> On 6/8/2021 9:06 AM, H.J. Lu wrote:
> > On Tue, Jun 8, 2021 at 7:56 AM Jakub Jelinek via Gcc-patches
> >  wrote:
> >> On Tue, Jun 08, 2021 at 08:47:26AM -0600, Jeff Law wrote:
>  Why is the machinery involving STACK_SLOT_ALIGNMENT and
>  spill_slot_alignment() (for spilling) or get_stack_local_alignment() (for
>  backing stack slots) not working for you?  If everything is setup
>  correctly the input alignment to try_fit_stack_local ought to be correct
>  already.
> >>> We don't need the MEM as a whole aligned, just the offset in the address
> >>> calculation due to how we encode those instructions.  If I've read that
> >>> code
> >>> correctly, it would arrange for a dynamic realignment of the stack  so
> >>> that
> >>> it could then align the slot. None of that is necessary for us and we'd
> >>> like
> >>> to avoid forcing the dynamic stack realignment.  Or did I misread the
> >>> code?
> >> I think dynamic stack realignment is done only on x86, no other backend has
> > I believe that all pieces of infrastructure to realign the stack are
> > in place.  You
> > just need to properly align the stack in the backend.
> 
> As I've stated, we don't need the stack aligned to these higher boundaries. 
> Nor do we need the slot as a whole aligned.   That's ultimately just wasteful
> since we don't need them.  We just want to get an aligned offset.

Well, but isn't that creating a difference when there is none?  You need 
an aligned offset; when given an aligned stack pointer that then is 
equivalent to an aligned stack address.  You are saying that you don't 
need the aligned stack pointer, sure, but would it be a problem for you?

Apart from that: dynamic stack realignment can be disabled (or probably 
isn't enabled for your target to start with), then the stack offset 
alignment machinery should still work in isolation.  (Well it might 
generate alignment claims in MEM RTL which then isn't in fact true, 
depends on the architecture if that's a problem for you).

Either way, I think whatever you need should probably be somehow 
integrated with the existing stack slot alignment knobs.  Or rather the 
two orthogonal pieces (stack pointer alignment and stack offset alignment) 
be separated and you then using only the latter.

(Btw: are you also trying to improve non-stack addresses?  Because 
ultimately your constraints aren't about stack at all but about all 
address forms.  In a way this all is more like a job for addressing 
mode selection and massaging, but of course our support for such in GCC is 
limited beyond avoiding invalid modes by reloading into registers, which 
is exactly what you don't want :) )


Ciao,
Michael.


Re: [PATCH] rtl: Join the insn and split conditions in define_insn_and_split

2021-06-08 Thread Segher Boessenkool
On Tue, Jun 08, 2021 at 04:50:56PM +0100, Richard Sandiford wrote:
> Segher Boessenkool  writes:
> > On Tue, Jun 08, 2021 at 02:48:11PM +0200, Richard Biener wrote:
> >> > So yeah, patch withdrawn.  This on one hand is proof we do want to make
> >> > such a change, but on the other hand shows it needs more preparatory
> >> > steps.
> >> 
> >> I wonder if it makes sense to provide ports a means to opt-in into
> >> the strict "&& " requirement and thus we can gradually fix them.
> >> Probably requires some t-$target make fragment editing plus
> >> passing an extra arg to gen* based on that.
> >> 
> >> That way maintainers can gradually fix their ports and make sure
> >> they won't regress again.
> >
> > Just some target macro might be better / easier?  Just gensupport.c will
> > need to use it.
> >
> > Will we still allow empty split conditions?  And automatically make that
> > do the equivalent of "&& 1"?
> 
> Wouldn't that run the risk of the partial transition that my suggestion
> was rejected for? ;-)

First off, I have changed position a few times now on what I think would
be the best way forward here :-)

I assumed we would make the "&&" requirement a requirement for GCC 12
eventually.  But yes that needs to be spelled out!

> I think an empty define_insn_and_split split condition should continue
> to mean the same thing everywhere.  So while we continue to have ports
> in which an empty condition means one thing, I don't think we should
> also have ports where an empty condition means something else.

If we have the "&&" requirement, we either disallow empty split
conditions, or have it be treated as "&& 1".  In either case it will
mean the same thing everywhere.

And in all cases, not just these cases but *all* cases, code that works
on trunk will not necessarily work on backports.  I don't see any
obvious cases where this will be a worse problem with this, do you?


Segher


Re: [PATCH] rtl: Join the insn and split conditions in define_insn_and_split

2021-06-08 Thread Richard Sandiford via Gcc-patches
Segher Boessenkool  writes:
> On Tue, Jun 08, 2021 at 04:50:56PM +0100, Richard Sandiford wrote:
>> Segher Boessenkool  writes:
>> > On Tue, Jun 08, 2021 at 02:48:11PM +0200, Richard Biener wrote:
>> >> > So yeah, patch withdrawn.  This on one hand is proof we do want to make
>> >> > such a change, but on the other hand shows it needs more preparatory
>> >> > steps.
>> >> 
>> >> I wonder if it makes sense to provide ports a means to opt-in into
>> >> the strict "&& " requirement and thus we can gradually fix them.
>> >> Probably requires some t-$target make fragment editing plus
>> >> passing an extra arg to gen* based on that.
>> >> 
>> >> That way maintainers can gradually fix their ports and make sure
>> >> they won't regress again.
>> >
>> > Just some target macro might be better / easier?  Just gensupport.c will
>> > need to use it.
>> >
>> > Will we still allow empty split conditions?  And automatically make that
>> > do the equivalent of "&& 1"?
>> 
>> Wouldn't that run the risk of the partial transition that my suggestion
>> was rejected for? ;-)
>
> First off, I have changed position a few times now on what I think would
> be the best way forward here :-)
>
> I assumed we would make the "&&" requirement a requirement for GCC 12
> eventually.  But yes that needs to be spelled out!
>
>> I think an empty define_insn_and_split split condition should continue
>> to mean the same thing everywhere.  So while we continue to have ports
>> in which an empty condition means one thing, I don't think we should
>> also have ports where an empty condition means something else.
>
> If we have the "&&" requirement, we either disallow empty split
> conditions, or have it be treated as "&& 1".  In either case it will
> mean the same thing everywhere.

Ah, OK.  I meant that we shouldn't change what an empty condition means
until the transition is complete and the target macro has been removed.
If the question was instead whether we should allow an empty condition
once the transition is complete, then I've no opinion either way.
(It doesn't seem like something we need to decide now though.)

Thanks,
Richard


Re: [PATCH][version 3]add -ftrivial-auto-var-init and variable attribute "uninitialized" to gcc

2021-06-08 Thread Kees Cook via Gcc-patches
On Tue, Jun 08, 2021 at 09:37:30AM +0200, Richard Biener wrote:
> On Mon, 7 Jun 2021, Qing Zhao wrote:
> > On Jun 7, 2021, at 2:48 AM, Richard Biener 
> > mailto:rguent...@suse.de>> wrote:
> > 
> > Meh - can you try using a mailer that does proper quoting?  It's difficult
> > to spot your added comments.  Will try anyway (and sorry for the delay)
> > 
> > Only the email replied to gcc-patch alias had this issue, all the other 
> > emails I sent are fine. Not sure why?
> 
> All your mails have this problem for me, it makes it quite difficult to
> follow the conversation.

I think the first step is to make sure the MUA is sending "text only"
emails. Then configure the "quoting style" to do the standard "> "-style.

What email client are you using?

-- 
Kees Cook


Re: [PATCH][version 3]add -ftrivial-auto-var-init and variable attribute "uninitialized" to gcc

2021-06-08 Thread Kees Cook via Gcc-patches
On Tue, Jun 08, 2021 at 09:41:38AM +0200, Richard Biener wrote:
> On Mon, 7 Jun 2021, Qing Zhao wrote:
> 
> > Hi, 
> > 
> > > On Jun 7, 2021, at 2:53 AM, Richard Biener  wrote:
> > > 
> > >> 
> > >> To address the above suggestion:
> > >> 
> > >> My study shows: the call to __builtin_clear_padding is expanded during 
> > >> gimplification phase.
> > >> And there is no __builtin_clear_padding expanding during rtx expanding 
> > >> phase.
> > >> However, for -ftrivial-auto-var-init, padding initialization should be 
> > >> done both in gimplification phase and rtx expanding phase.
> > >> since the __builtin_clear_padding might not be good for rtx expanding, 
> > >> reusing __builtin_clear_padding might not work.
> > >> 
> > >> Let me know if you have any more comments on this.
> > > 
> > > Yes, I didn't suggest to literally emit calls to __builtin_clear_padding 
> > > but instead to leverage the lowering code, more specifically share the
> > > code that figures _what_ is to be initialized (where the padding is)
> > > and eventually the actual code generation pieces.  That might need some
> > > refactoring but the code where padding resides should be present only
> > > a single time (since it's quite complex).
> > 
> > Okay, I see your point here.
> > 
> > > 
> > > Which is also why I suggested to split out the padding initialization
> > > bits to a separate patch (and option).
> > 
> > Personally, I am okay with splitting padding initialization from this 
> > current patch,
> > Kees, what’s your opinion on this? i.e, the current -ftrivial-auto-var-init 
> > will NOT initialize padding, we will add another option to 
> > Explicitly initialize padding.
> 
> It would also be possible to have -fauto-var-init, -fauto-var-init-padding
> and have -ftrivial-auto-var-init for clang compatibility enabling both.

Sounds good to me!

> Or -fauto-var-init={zero,pattern,padding} and allow
> -fauto-var-init=pattern,padding to be specified.  Note there's also
> padding between auto variables on the stack - that "trailing"
> padding isn't initialized either?  (yes, GCC sorts variables to minimize
> that padding)  For example for
> 
> void foo()
> {
>   char a[3];
>   bar (a);
> }
> 
> there's 12 bytes padding after 'a', shouldn't we initialize that?  If not,
> why's other padding important to be initialized?

This isn't a situation that I'm aware of causing real-world problems.
The issues have all come from padding within an addressable object. I
haven't tested Clang's behavior on this (and I have no kernel tests for
this padding), but I do check for trailing padding, like:

struct test_trailing_hole {
char *one;
char *two;
char *three;
char four;
/* "sizeof(unsigned long) - 1" byte padding hole here. */
};

-Kees

-- 
Kees Cook


[pushed] c++: braced-list overload resolution [PR100963]

2021-06-08 Thread Jason Merrill via Gcc-patches
My PR96926 patch made us ignore template candidates when there's a perfect
non-template candidate.  In this case, we were considering B(int) a perfect
match for B({0}), but the brace elision makes it imperfect.

Tested x86_64-pc-linux-gnu, applying to trunk.

PR c++/100963

gcc/cp/ChangeLog:

* call.c (perfect_conversion_p): Check check_narrowing.

gcc/testsuite/ChangeLog:

* g++.dg/cpp0x/initlist124.C: New test.
---
 gcc/cp/call.c|  3 +++
 gcc/testsuite/g++.dg/cpp0x/initlist124.C | 13 +
 2 files changed, 16 insertions(+)
 create mode 100644 gcc/testsuite/g++.dg/cpp0x/initlist124.C

diff --git a/gcc/cp/call.c b/gcc/cp/call.c
index 17fc60cd4af..d2f6ca872fc 100644
--- a/gcc/cp/call.c
+++ b/gcc/cp/call.c
@@ -5880,6 +5880,9 @@ perfect_conversion_p (conversion *conv)
next_conversion (conv)->type))
return false;
 }
+  if (conv->check_narrowing)
+/* Brace elision is imperfect.  */
+return false;
   return true;
 }
 
diff --git a/gcc/testsuite/g++.dg/cpp0x/initlist124.C 
b/gcc/testsuite/g++.dg/cpp0x/initlist124.C
new file mode 100644
index 000..45dcbb303e2
--- /dev/null
+++ b/gcc/testsuite/g++.dg/cpp0x/initlist124.C
@@ -0,0 +1,13 @@
+// PR c++/100963
+// { dg-do compile { target c++11 } }
+
+#include 
+
+struct B {
+  B(int) = delete;
+  template B(std::initializer_list);
+};
+
+int main() {
+  B({0});
+}

base-commit: 69bb37f9e0143fbca3124069c0e9b6937ccf1fc7
-- 
2.27.0



Re: [PATCH, OpenACC 2.5, libgomp] Add *_async versions of runtime library API functions

2021-06-08 Thread Thomas Schwinge
Hi Chung-Lin!

;-) It's been a while:

On 2018-09-10T23:04:18+0800, Chung-Lin Tang  wrote:
>  * testsuite/libgomp.oacc-c-c++-common/lib-94.c: New test.
>  * testsuite/libgomp.oacc-c-c++-common/lib-95.c: New test.
>  * testsuite/libgomp.oacc-fortran/lib-16.f90: New test.

Do you happen to remember why in these testcases you're using the
following pattern:

> --- libgomp/testsuite/libgomp.oacc-c-c++-common/lib-94.c  (nonexistent)
> +++ libgomp/testsuite/libgomp.oacc-c-c++-common/lib-94.c  (working copy)
> @@ -0,0 +1,42 @@
> +/* { dg-do run } */
> +/* { dg-skip-if "" { *-*-* } { "*" } { "-DACC_MEM_SHARED=0" } } */
> +
> +#include 
> +#include 
> +#include 
> +
> +int
> +main (int argc, char **argv)
> +{
> +  const int N = 256;
> +  int i;
> +  int async = 8;
> +  unsigned char *h;
> +
> +  h = (unsigned char *) malloc (N);
> +
> +  for (i = 0; i < N; i++)
> +{
> +  h[i] = i;
> +}
> +
> +  acc_copyin_async (h, N, async);
> +
> +  memset (h, 0, N);
> +
> +  acc_wait (async);

You first issue 'acc_copyin_async', then (while potentially that's still
accessing 'h') already 'memset' 'h' (potentially overwriting data that
'acc_copyin_async' is still working on), and only then 'acc_wait'?

My understanding of OpenACC would swap 'memset' and 'acc_wait', but maybe
you have a specific reason to do it in this way?

In particular, the GCC nvptx offloading implementation "doesn't seem to
care" (as discussed elsewhere; 'OpenACC "ephemeral" asynchronous
host-to-device copies', etc.) -- but I suppose if you meant to test such
implementation traits here, you'd have commented that?

> +
> +  acc_copyout_async (h, N, async + 1);
> +
> +  acc_wait (async + 1);
> +
> +  for (i = 0; i < N; i++)
> +{
> +  if (h[i] != i)
> + abort ();
> +}
> +
> +  free (h);
> +
> +  return 0;
> +}

> --- libgomp/testsuite/libgomp.oacc-c-c++-common/lib-95.c  (nonexistent)
> +++ libgomp/testsuite/libgomp.oacc-c-c++-common/lib-95.c  (working copy)
> @@ -0,0 +1,45 @@
> +/* { dg-do run } */
> +/* { dg-skip-if "" { *-*-* } { "*" } { "-DACC_MEM_SHARED=0" } } */
> +
> +#include 
> +#include 
> +#include 
> +
> +int
> +main (int argc, char **argv)
> +{
> +  const int N = 256;
> +  int i, q = 5;
> +  unsigned char *h, *g;
> +  void *d;
> +
> +  h = (unsigned char *) malloc (N);
> +  g = (unsigned char *) malloc (N);
> +  for (i = 0; i < N; i++)
> +{
> +  g[i] = i;
> +}
> +
> +  acc_create_async (h, N, q);
> +
> +  acc_memcpy_to_device_async (acc_deviceptr (h), g, N, q);
> +  memset (&h[0], 0, N);
> +
> +  acc_wait (q);

Similar here.

> +  acc_update_self_async (h, N, q + 1);
> +  acc_delete_async (h, N, q + 1);
> +
> +  acc_wait (q + 1);
> +
> +  for (i = 0; i < N; i++)
> +{
> +  if (h[i] != i)
> + abort ();
> +}
> +
> +  free (h);
> +  free (g);
> +
> +  return 0;
> +}

> --- libgomp/testsuite/libgomp.oacc-fortran/lib-16.f90 (nonexistent)
> +++ libgomp/testsuite/libgomp.oacc-fortran/lib-16.f90 (working copy)

(Later also similarly copied into 'libgomp.oacc-fortran/lib-16-2.f90'.)

Similar:

> @@ -0,0 +1,57 @@
> +! { dg-do run }
> +! { dg-skip-if "" { *-*-* } { "*" } { "-DACC_MEM_SHARED=0" } }
> +
> +program main
> +  use openacc
> +  implicit none
> +
> +  integer, parameter :: N = 256
> +  integer, allocatable :: h(:)
> +  integer :: i
> +  integer :: async = 5
> +
> +  allocate (h(N))
> +
> +  do i = 1, N
> +h(i) = i
> +  end do
> +
> +  call acc_copyin (h)
> +
> +  do i = 1, N
> +h(i) = i + i
> +  end do
> +
> +  call acc_update_device_async (h, sizeof (h), async)
> +
> +  if (acc_is_present (h) .neqv. .TRUE.) call abort

Don't we need 'acc_wait' here (while 'acc_update_device_async' may still
be reading from 'h'), before overwriting 'h' here:

> +
> +  h(:) = 0
> +
> +  call acc_copyout_async (h, sizeof (h), async)
> +
> +  call acc_wait (async)
> +
> +  do i = 1, N
> +if (h(i) /= i + i) call abort
> +  end do
> +
> +  call acc_copyin (h, sizeof (h))
> +
> +  h(:) = 0
> +
> +  call acc_update_self_async (h, sizeof (h), async)
> +
> +  if (acc_is_present (h) .neqv. .TRUE.) call abort

Don't we need 'acc_wait' here (to make sure we finish device to host copy
of 'h'), before evaluating 'h' here:

> +
> +  do i = 1, N
> +if (h(i) /= i + i) call abort
> +  end do
> +
> +  call acc_delete_async (h, async)
> +
> +  call acc_wait (async)
> +
> +  if (acc_is_present (h) .neqv. .FALSE.) call abort
> +
> +end program

Julian has patches for most of these (as part of other commits).


Grüße
 Thomas
-
Mentor Graphics (Deutschland) GmbH, Arnulfstrasse 201, 80634 München 
Registergericht München HRB 106955, Geschäftsführer: Thomas Heurung, Frank 
Thürauf


Re: [PATCH][version 3]add -ftrivial-auto-var-init and variable attribute "uninitialized" to gcc

2021-06-08 Thread Qing Zhao via Gcc-patches
Thanks a lot.

Kees. 

Do you have the same issue with my emails?

I see this problem with my email mostly to part of the emails that were sent to 
gcc-patches alias. 
Other emails are fine. 

> On Jun 8, 2021, at 11:56 AM, Kees Cook  wrote:
> 
> On Tue, Jun 08, 2021 at 09:37:30AM +0200, Richard Biener wrote:
>> On Mon, 7 Jun 2021, Qing Zhao wrote:
>>> On Jun 7, 2021, at 2:48 AM, Richard Biener 
>>> mailto:rguent...@suse.de>> wrote:
>>> 
>>> Meh - can you try using a mailer that does proper quoting?  It's difficult
>>> to spot your added comments.  Will try anyway (and sorry for the delay)
>>> 
>>> Only the email replied to gcc-patch alias had this issue, all the other 
>>> emails I sent are fine. Not sure why?
>> 
>> All your mails have this problem for me, it makes it quite difficult to
>> follow the conversation.
> 
> I think the first step is to make sure the MUA is sending "text only"
> emails. Then configure the "quoting style" to do the standard "> "-style.
> 
> What email client are you using?

I am using Mac’s Apple Mail client on my computer. 

I have been using this mail client for a long time, but only had such issues 
recently. 

Really not sure what’s going on.

I will try to figure this out.

Qing
> 
> -- 
> Kees Cook



Re: [PATCH][version 3]add -ftrivial-auto-var-init and variable attribute "uninitialized" to gcc

2021-06-08 Thread Kees Cook via Gcc-patches
On Tue, Jun 08, 2021 at 05:32:32PM +, Qing Zhao wrote:
> Thanks a lot.
> 
> Kees. 
> 
> Do you have the same issue with my emails?

Some of them, yes. This one was fine.

> I see this problem with my email mostly to part of the emails that were sent 
> to gcc-patches alias. 
> Other emails are fine. 
> 
> > On Jun 8, 2021, at 11:56 AM, Kees Cook  wrote:
> > 
> > On Tue, Jun 08, 2021 at 09:37:30AM +0200, Richard Biener wrote:
> >> On Mon, 7 Jun 2021, Qing Zhao wrote:
> >>> On Jun 7, 2021, at 2:48 AM, Richard Biener 
> >>> mailto:rguent...@suse.de>> wrote:
> >>> 
> >>> Meh - can you try using a mailer that does proper quoting?  It's difficult
> >>> to spot your added comments.  Will try anyway (and sorry for the delay)
> >>> 
> >>> Only the email replied to gcc-patch alias had this issue, all the other 
> >>> emails I sent are fine. Not sure why?
> >> 
> >> All your mails have this problem for me, it makes it quite difficult to
> >> follow the conversation.
> > 
> > I think the first step is to make sure the MUA is sending "text only"
> > emails. Then configure the "quoting style" to do the standard "> "-style.
> > 
> > What email client are you using?
> 
> I am using Mac’s Apple Mail client on my computer. 
> 
> I have been using this mail client for a long time, but only had such issues 
> recently. 

I share your pain! Gmail frequently likes making tiny breaking changes
too. :)

> Really not sure what’s going on.
> 
> I will try to figure this out.

Thanks!

-- 
Kees Cook


[PATCH v3 2/2] x86: Add vec_duplicate expander

2021-06-08 Thread H.J. Lu via Gcc-patches
1. Update vec_duplicate to allow it to fail so that backend can only allow
broadcasting an integer constant to a vector when broadcast instruction
is available.  This can be used by memset expander to avoid vec_duplicate
when loading from constant pool is more efficient.
2. Add vec_duplicate expander and enable vec_duplicate from a
non-standard SSE constant integer only if vector broadcast is available.

* config/i386/i386-expand.c (ix86_expand_integer_vec_duplicate):
New function.
* config/i386/i386-protos.h (ix86_expand_integer_vec_duplicate):
New prototype.
* config/i386/sse.md (INT_BROADCAST_MODE): New mode iterator.
(vec_duplicate): New expander.
* doc/md.texi: Update vec_duplicate.
---
 gcc/config/i386/i386-expand.c | 21 +
 gcc/config/i386/i386-protos.h |  1 +
 gcc/config/i386/sse.md| 20 
 gcc/doc/md.texi   |  2 --
 4 files changed, 42 insertions(+), 2 deletions(-)

diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
index 29d96805d9d..145e028353c 100644
--- a/gcc/config/i386/i386-expand.c
+++ b/gcc/config/i386/i386-expand.c
@@ -15669,6 +15669,27 @@ ix86_expand_vector_extract (bool mmx_ok, rtx target, 
rtx vec, int elt)
 }
 }
 
+/* Expand integer vec_duplicate.  Return true if successful.  */
+
+bool
+ix86_expand_integer_vec_duplicate (rtx *operands)
+{
+  /* Enable VEC_DUPLICATE from a non-standard SSE constant integer only
+ if vector broadcast is available.  */
+  if (CONST_INT_P (operands[1])
+  && (!TARGET_AVX2
+ || standard_sse_constant_p (operands[1],
+ GET_MODE (operands[0]
+return false;
+
+  if (!ix86_expand_vector_init_duplicate (false,
+ GET_MODE (operands[0]),
+ operands[0], operands[1]))
+gcc_unreachable ();
+
+  return true;
+}
+
 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
The upper bits of DEST are undefined, though they shouldn't cause
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index 578750a2532..dc191dc18ec 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -260,6 +260,7 @@ extern void ix86_expand_mul_widen_hilo (rtx, rtx, rtx, 
bool, bool);
 extern void ix86_expand_sse2_mulv4si3 (rtx, rtx, rtx);
 extern void ix86_expand_sse2_mulvxdi3 (rtx, rtx, rtx);
 extern void ix86_expand_sse2_abs (rtx, rtx);
+extern bool ix86_expand_integer_vec_duplicate (rtx *);
 
 /* In i386-c.c  */
 extern void ix86_target_macros (void);
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 2a34756be2a..a227295cc1d 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -24570,3 +24570,23 @@
   "TARGET_WIDEKL"
   "aes\t{%0}"
   [(set_attr "type" "other")])
+
+;; Modes handled by broadcast patterns.
+(define_mode_iterator INT_BROADCAST_MODE
+  [(V64QI "TARGET_AVX512F") (V32QI "TARGET_AVX") V16QI
+   (V32HI "TARGET_AVX512F") (V16HI "TARGET_AVX") V8HI
+   (V16SI "TARGET_AVX512F") (V8SI "TARGET_AVX") V4SI
+   (V8DI "TARGET_AVX512F") (V4DI "TARGET_64BIT") V2DI])
+
+;; Broadcast from an integer.  NB: Enable broadcast only if we can move
+;; from GPR to SSE register directly.
+(define_expand "vec_duplicate"
+  [(set (match_operand:INT_BROADCAST_MODE 0 "register_operand")
+   (vec_duplicate:INT_BROADCAST_MODE
+ (match_operand: 1 "general_operand")))]
+  "TARGET_SSE2 && TARGET_INTER_UNIT_MOVES_TO_VEC"
+{
+  if (!ix86_expand_integer_vec_duplicate (operands))
+FAIL;
+  DONE;
+})
diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index 00caf3844cc..e66c41c4779 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -5077,8 +5077,6 @@ the mode appropriate for one element of @var{m}.
 This pattern only handles duplicates of non-constant inputs.  Constant
 vectors go through the @code{mov@var{m}} pattern instead.
 
-This pattern is not allowed to @code{FAIL}.
-
 @cindex @code{vec_series@var{m}} instruction pattern
 @item @samp{vec_series@var{m}}
 Initialize vector output operand 0 so that element @var{i} is equal to
-- 
2.31.1



[PATCH v3 0/2] x86: Convert CONST_WIDE_INT/CONST_VECTOR to broadcast

2021-06-08 Thread H.J. Lu via Gcc-patches
1. Update move expanders to convert the CONST_WIDE_INT and CONST_VECTOR
operands to vector broadcast from an integer with AVX2.
2. Add ix86_gen_scratch_sse_rtx to return a scratch SSE register which
won't increase stack alignment requirement and blocks transformation by
the combine pass.
3. Update PR 87767 tests to expect integer broadcast instead of broadcast
from memory.
4. Update avx512f_cond_move.c to expect integer broadcast.
5. Update vec_duplicate to allow it to fail, so that a backend can permit
broadcasting an integer constant to a vector only when a broadcast
instruction is available.  This can be used by the memset expander to avoid
vec_duplicate when loading from the constant pool is more efficient.
6. Add vec_duplicate expander and enable vec_duplicate from a
non-standard SSE constant integer only if vector broadcast is available.

A small benchmark:

https://gitlab.com/x86-benchmarks/microbenchmark/-/tree/memset/broadcast

shows that broadcast is a little bit faster on Intel Core i7-8559U:

$ make
gcc -g -I. -O2   -c -o test.o test.c
gcc -g   -c -o memory.o memory.S
gcc -g   -c -o broadcast.o broadcast.S
gcc -g   -c -o vec_dup_sse2.o vec_dup_sse2.S
gcc -o test test.o memory.o broadcast.o vec_dup_sse2.o
./test
memory  : 147215
broadcast   : 121213
vec_dup_sse2: 171366
$

broadcast is also smaller:

$ size memory.o broadcast.o
   textdata bss dec hex filename
132   0   0 132  84 memory.o
122   0   0 122  7a broadcast.o
$

H.J. Lu (2):
  x86: Convert CONST_WIDE_INT/CONST_VECTOR to broadcast
  x86: Add vec_duplicate expander

 gcc/config/i386/i386-expand.c | 213 +-
 gcc/config/i386/i386-protos.h |   3 +
 gcc/config/i386/i386.c|  31 +++
 gcc/config/i386/sse.md|  20 ++
 gcc/doc/md.texi   |   2 -
 .../i386/avx512f-broadcast-pr87767-1.c|   7 +-
 .../i386/avx512f-broadcast-pr87767-5.c|   5 +-
 .../gcc.target/i386/avx512f_cond_move.c   |   4 +-
 .../i386/avx512vl-broadcast-pr87767-1.c   |  12 +-
 .../i386/avx512vl-broadcast-pr87767-5.c   |   9 +-
 gcc/testsuite/gcc.target/i386/pr100865-1.c|  13 ++
 gcc/testsuite/gcc.target/i386/pr100865-10a.c  |  33 +++
 gcc/testsuite/gcc.target/i386/pr100865-10b.c  |   7 +
 gcc/testsuite/gcc.target/i386/pr100865-2.c|  14 ++
 gcc/testsuite/gcc.target/i386/pr100865-3.c|  15 ++
 gcc/testsuite/gcc.target/i386/pr100865-4a.c   |  16 ++
 gcc/testsuite/gcc.target/i386/pr100865-4b.c   |   9 +
 gcc/testsuite/gcc.target/i386/pr100865-5a.c   |  16 ++
 gcc/testsuite/gcc.target/i386/pr100865-5b.c   |   9 +
 gcc/testsuite/gcc.target/i386/pr100865-6a.c   |  16 ++
 gcc/testsuite/gcc.target/i386/pr100865-6b.c   |   9 +
 gcc/testsuite/gcc.target/i386/pr100865-7a.c   |  17 ++
 gcc/testsuite/gcc.target/i386/pr100865-7b.c   |   9 +
 gcc/testsuite/gcc.target/i386/pr100865-8a.c   |  24 ++
 gcc/testsuite/gcc.target/i386/pr100865-8b.c   |   7 +
 gcc/testsuite/gcc.target/i386/pr100865-9a.c   |  25 ++
 gcc/testsuite/gcc.target/i386/pr100865-9b.c   |   7 +
 27 files changed, 526 insertions(+), 26 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr100865-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr100865-10a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr100865-10b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr100865-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr100865-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr100865-4a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr100865-4b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr100865-5a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr100865-5b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr100865-6a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr100865-6b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr100865-7a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr100865-7b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr100865-8a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr100865-8b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr100865-9a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr100865-9b.c

-- 
2.31.1



[PATCH v3 1/2] x86: Convert CONST_WIDE_INT/CONST_VECTOR to broadcast

2021-06-08 Thread H.J. Lu via Gcc-patches
1. Update move expanders to convert the CONST_WIDE_INT and CONST_VECTOR
operands to vector broadcast from an integer with AVX2.
2. Add ix86_gen_scratch_sse_rtx to return a scratch SSE register which
won't increase stack alignment requirement and blocks transformation by
the combine pass.
3. Update PR 87767 tests to expect integer broadcast instead of broadcast
from memory.
4. Update avx512f_cond_move.c to expect integer broadcast.

A small benchmark:

https://gitlab.com/x86-benchmarks/microbenchmark/-/tree/memset/broadcast

shows that broadcast is a little bit faster on Intel Core i7-8559U:

$ make
gcc -g -I. -O2   -c -o test.o test.c
gcc -g   -c -o memory.o memory.S
gcc -g   -c -o broadcast.o broadcast.S
gcc -g   -c -o vec_dup_sse2.o vec_dup_sse2.S
gcc -o test test.o memory.o broadcast.o vec_dup_sse2.o
./test
memory  : 147215
broadcast   : 121213
vec_dup_sse2: 171366
$

broadcast is also smaller:

$ size memory.o broadcast.o
   textdata bss dec hex filename
132   0   0 132  84 memory.o
122   0   0 122  7a broadcast.o
$

gcc/

PR target/100865
* config/i386/i386-expand.c (ix86_expand_vector_init_duplicate):
New prototype.
(ix86_byte_broadcast): New function.
(ix86_convert_const_wide_int_to_broadcast): Likewise.
(ix86_expand_move): Convert CONST_WIDE_INT to broadcast if mode
size is 16 bytes or bigger.
(ix86_broadcast_from_integer_constant): New function.
(ix86_expand_vector_move): Convert CONST_WIDE_INT and CONST_VECTOR
to broadcast if mode size is 16 bytes or bigger.
* config/i386/i386-protos.h (ix86_gen_scratch_sse_rtx): New
prototype.
* config/i386/i386.c (ix86_gen_scratch_sse_rtx): New function.

gcc/testsuite/

PR target/100865
* gcc.target/i386/avx512f-broadcast-pr87767-1.c: Expect integer
broadcast.
* gcc.target/i386/avx512f-broadcast-pr87767-5.c: Likewise.
* gcc.target/i386/avx512vl-broadcast-pr87767-1.c: Likewise.
* gcc.target/i386/avx512vl-broadcast-pr87767-5.c: Likewise.
* gcc.target/i386/avx512f_cond_move.c: Also pass
-mprefer-vector-width=512 and expect integer broadcast.
* gcc.target/i386/pr100865-1.c: New test.
* gcc.target/i386/pr100865-2.c: Likewise.
* gcc.target/i386/pr100865-3.c: Likewise.
* gcc.target/i386/pr100865-4a.c: Likewise.
* gcc.target/i386/pr100865-4b.c: Likewise.
* gcc.target/i386/pr100865-5a.c: Likewise.
* gcc.target/i386/pr100865-5b.c: Likewise.
* gcc.target/i386/pr100865-6a.c: Likewise.
* gcc.target/i386/pr100865-6b.c: Likewise.
* gcc.target/i386/pr100865-7a.c: Likewise.
* gcc.target/i386/pr100865-7b.c: Likewise.
* gcc.target/i386/pr100865-8a.c: Likewise.
* gcc.target/i386/pr100865-8b.c: Likewise.
* gcc.target/i386/pr100865-9a.c: Likewise.
* gcc.target/i386/pr100865-9b.c: Likewise.
* gcc.target/i386/pr100865-10a.c: Likewise.
* gcc.target/i386/pr100865-10b.c: Likewise.
---
 gcc/config/i386/i386-expand.c | 192 --
 gcc/config/i386/i386-protos.h |   2 +
 gcc/config/i386/i386.c|  31 +++
 .../i386/avx512f-broadcast-pr87767-1.c|   7 +-
 .../i386/avx512f-broadcast-pr87767-5.c|   5 +-
 .../gcc.target/i386/avx512f_cond_move.c   |   4 +-
 .../i386/avx512vl-broadcast-pr87767-1.c   |  12 +-
 .../i386/avx512vl-broadcast-pr87767-5.c   |   9 +-
 gcc/testsuite/gcc.target/i386/pr100865-1.c|  13 ++
 gcc/testsuite/gcc.target/i386/pr100865-10a.c  |  33 +++
 gcc/testsuite/gcc.target/i386/pr100865-10b.c  |   7 +
 gcc/testsuite/gcc.target/i386/pr100865-2.c|  14 ++
 gcc/testsuite/gcc.target/i386/pr100865-3.c|  15 ++
 gcc/testsuite/gcc.target/i386/pr100865-4a.c   |  16 ++
 gcc/testsuite/gcc.target/i386/pr100865-4b.c   |   9 +
 gcc/testsuite/gcc.target/i386/pr100865-5a.c   |  16 ++
 gcc/testsuite/gcc.target/i386/pr100865-5b.c   |   9 +
 gcc/testsuite/gcc.target/i386/pr100865-6a.c   |  16 ++
 gcc/testsuite/gcc.target/i386/pr100865-6b.c   |   9 +
 gcc/testsuite/gcc.target/i386/pr100865-7a.c   |  17 ++
 gcc/testsuite/gcc.target/i386/pr100865-7b.c   |   9 +
 gcc/testsuite/gcc.target/i386/pr100865-8a.c   |  24 +++
 gcc/testsuite/gcc.target/i386/pr100865-8b.c   |   7 +
 gcc/testsuite/gcc.target/i386/pr100865-9a.c   |  25 +++
 gcc/testsuite/gcc.target/i386/pr100865-9b.c   |   7 +
 25 files changed, 484 insertions(+), 24 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr100865-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr100865-10a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr100865-10b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr100865-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr100865-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr100865-4a.c
 create mode 100644 gcc/testsuite

Re: [PATCH][version 3]add -ftrivial-auto-var-init and variable attribute "uninitialized" to gcc

2021-06-08 Thread Qing Zhao via Gcc-patches


> On Jun 8, 2021, at 11:59 AM, Kees Cook  wrote:
> 
> On Tue, Jun 08, 2021 at 09:41:38AM +0200, Richard Biener wrote:
>> On Mon, 7 Jun 2021, Qing Zhao wrote:
>>> 
>>> Personally, I am okay with splitting padding initialization from this 
>>> current patch,
>>> Kees, what’s your opinion on this? i.e, the current -ftrivial-auto-var-init 
>>> will NOT initialize padding, we will add another option to 
>>> Explicitly initialize padding.
>> 
>> It would also be possible to have -fauto-var-init, -fauto-var-init-padding
>> and have -ftrivial-auto-var-init for clang compatibility enabling both.
> 
> Sounds good to me!

Agreed!

Then I will take this approach:

1.  Adding two separate new options:
   -fauto-var-init           initialize auto variables to zero or patterns. For 
variables that have padding, only initialize valid fields, no padding 
initialization;
   -fauto-var-init-padding   initialize padding inside auto variables to 
zeroes. 

2.  Add another new option for Clang compatibility:
   -ftrivial-auto-var-init   will enable -fauto-var-init + 
-fauto-var-init-padding


Thanks.

Qing
> 
>> Or -fauto-var-init={zero,pattern,padding} and allow
>> -fauto-var-init=pattern,padding to be specified.  Note there's also
>> padding between auto variables on the stack - that "trailing"
>> padding isn't initialized either?  (yes, GCC sorts variables to minimize
>> that padding)  For example for
>> 
>> void foo()
>> {
>>  char a[3];
>>  bar (a);
>> }
>> 
>> there's 12 bytes padding after 'a', shouldn't we initialize that?  If not,
>> why's other padding important to be initialized?
> 
> This isn't a situation that I'm aware of causing real-world problems.
> The issues have all come from padding within an addressable object. I
> haven't tested Clang's behavior on this (and I have no kernel tests for
> this padding), but I do check for trailing padding, like:
> 
> struct test_trailing_hole {
>char *one;
>char *two;
>char *three;
>char four;
>/* "sizeof(unsigned long) - 1" byte padding hole here. */
> };
> 
> -Kees
> 
> -- 
> Kees Cook



[PATCH 02/55] rs6000: Initial create of rs6000-gen-builtins.c

2021-06-08 Thread Bill Schmidt via Gcc-patches
2021-04-02  Bill Schmidt  

gcc/
* config/rs6000/rs6000-gen-builtins.c: New.
---
 gcc/config/rs6000/rs6000-gen-builtins.c | 165 
 1 file changed, 165 insertions(+)
 create mode 100644 gcc/config/rs6000/rs6000-gen-builtins.c

diff --git a/gcc/config/rs6000/rs6000-gen-builtins.c 
b/gcc/config/rs6000/rs6000-gen-builtins.c
new file mode 100644
index 000..0afbff8e3ab
--- /dev/null
+++ b/gcc/config/rs6000/rs6000-gen-builtins.c
@@ -0,0 +1,165 @@
+/* Generate built-in function initialization and recognition for Power.
+   Copyright (C) 2020-21 Free Software Foundation, Inc.
+   Contributed by Bill Schmidt, IBM 
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+/* This program generates built-in function initialization and
+   recognition code for Power targets, based on text files that
+   describe the built-in functions and vector overloads:
+
+ rs6000-builtin-new.def  Table of built-in functions
+ rs6000-overload.def     Table of overload functions
+
+   Both files group similar functions together in "stanzas," as
+   described below.
+
+   Each stanza in the built-in function file starts with a line
+   identifying the circumstances in which the group of functions is
+   permitted, with the gating predicate in square brackets.  For
+   example, this could be
+
+ [altivec]
+
+   or it could be
+
+ [power9]
+
+   The bracketed gating predicate is the only information allowed on
+   the stanza header line, other than whitespace.
+
+   Following the stanza header are two lines for each function: the
+   prototype line and the attributes line.  The prototype line has
+   this format, where the square brackets indicate optional
+   information and angle brackets indicate required information:
+
+ [kind] <return-type> <bif-name> (<argument-list>);
+
+   Here [kind] can be one of "const", "pure", or "fpmath";
+   <return-type> is a legal type for a built-in function result;
+   <bif-name> is the name by which the function can be called;
+   and <argument-list> is a comma-separated list of legal types
+   for built-in function arguments.  The argument list may be
+   empty, but the parentheses and semicolon are required.
+
+   The attributes line looks like this:
+
+   <bif-id> <bif-pattern> {<attribute-list>}
+
+   Here <bif-id> is a unique internal identifier for the built-in
+   function that will be used as part of an enumeration of all
+   built-in functions; <bif-pattern> is the define_expand or
+   define_insn that will be invoked when the call is expanded;
+   and <attribute-list> is a comma-separated list of special
+   conditions that apply to the built-in function.  The attribute
+   list may be empty, but the braces are required.
+
+   Attributes are strings, such as these:
+
+ init     Process as a vec_init function
+ set      Process as a vec_set function
+ extract  Process as a vec_extract function
+ nosoft   Not valid with -msoft-float
+ ldvec    Needs special handling for vec_ld semantics
+ stvec    Needs special handling for vec_st semantics
+ reve     Needs special handling for element reversal
+ pred     Needs special handling for comparison predicates
+ htm      Needs special handling for transactional memory
+ htmspr   HTM function using an SPR
+ htmcr    HTM function using a CR
+ mma      Needs special handling for MMA instructions
+ quad     MMA instruction using a register quad as an input operand
+ pair     MMA instruction using a register pair as an input operand
+ no32bit  Not valid for TARGET_32BIT
+ 32bit    Requires different handling for TARGET_32BIT
+ cpu      This is a "cpu_is" or "cpu_supports" builtin
+ ldstmask Altivec mask for load or store
+ lxvrse   Needs special handling for load-rightmost, sign-extended
+ lxvrze   Needs special handling for load-rightmost, zero-extended
+ endian   Needs special handling for endianness
+
+   An example stanza might look like this:
+
+[altivec]
+  const vsc __builtin_altivec_abs_v16qi (vsc);
+ABS_V16QI absv16qi2 {}
+  const vss __builtin_altivec_abs_v8hi (vss);
+ABS_V8HI absv8hi2 {}
+
+   Here "vsc" and "vss" are shorthand for "vector signed char" and
+   "vector signed short" to shorten line lengths and improve readability.
+   Note the use of indentation, which is recommended but not required.
+
+   The overload file has more complex stanza headers.  Here the stanza
+   represents all functions with the same overloaded function name:
+
+ [, , [[, ]] 

[PATCHv2 00/55] Replace the Power target-specific builtin machinery

2021-06-08 Thread Bill Schmidt via Gcc-patches
Original patch series here:
https://gcc.gnu.org/pipermail/gcc-patches/2021-April/568840.html

Segher and others, thanks for reviewing the first chunk of patches
from this series!  Some of the stylistic changes turn out to affect
many of the remaining patches, so in addition to addressing the
review comments, I've proactively fixed those as well.  I've also
rebased the series to more recent trunk and fixed up a couple of
things as a result of community changes in the meanwhile.

As a result of reviews, the original patch 0001 has been dropped,
so the patch numbering is off by one compared with the original
series.  Status of the remaining patches (using new numbering):

0001: Reviewed by Segher and Richi, approved pending understanding of
  possible problems on mingw
0002: Approved
0003: Approved
0004: Approved with small changes that were made in this series
0005: Needs re-review after changes made in this series
0006: Approved with small changes that were made in this series
0007: Approved
0008: Approved with factor request (addressed in this series)
0009: Approved
0010-0055: Not yet reviewed

Thanks again for the ongoing reviews!

Bill

Bill Schmidt (55):
  rs6000: Support scanning of build-time GC roots in gengtype
  rs6000: Initial create of rs6000-gen-builtins.c
  rs6000: Add initial input files
  rs6000: Add file support and functions for diagnostic support
  rs6000: Add helper functions for parsing
  rs6000: Add functions for matching types, part 1 of 3
  rs6000: Add functions for matching types, part 2 of 3
  rs6000: Add functions for matching types, part 3 of 3
  rs6000: Red-black tree implementation for balanced tree search
  rs6000: Main function with stubs for parsing and output
  rs6000: Parsing built-in input file, part 1 of 3
  rs6000: Parsing built-in input file, part 2 of 3
  rs6000: Parsing built-in input file, part 3 of 3
  rs6000: Parsing of overload input file
  rs6000: Build and store function type identifiers
  rs6000: Write output to the builtin definition include file
  rs6000: Write output to the builtins header file
  rs6000: Write output to the builtins init file, part 1 of 3
  rs6000: Write output to the builtins init file, part 2 of 3
  rs6000: Write output to the builtins init file, part 3 of 3
  rs6000: Write static initializations for built-in table
  rs6000: Write static initializations for overload tables
  rs6000: Incorporate new builtins code into the build machinery
  rs6000: Add gengtype handling to the build machinery
  rs6000: Add the rest of the [altivec] stanza to the builtins file
  rs6000: Add VSX builtins
  rs6000: Add available-everywhere and ancient builtins
  rs6000: Add power7 and power7-64 builtins
  rs6000: Add power8-vector builtins
  rs6000: Add Power9 builtins
  rs6000: Add available-everywhere and ancient builtins
  rs6000: Add Power10 builtins
  rs6000: Add MMA builtins
  rs6000: Add miscellaneous builtins
  rs6000: Add Cell builtins
  rs6000: Add remaining overloads
  rs6000: Execute the automatic built-in initialization code
  rs6000: Darwin builtin support
  rs6000: Add sanity to V2DI_type_node definitions
  rs6000: Always initialize vector_pair and vector_quad nodes
  rs6000: Handle overloads during program parsing
  rs6000: Handle gimple folding of target built-ins
  rs6000: Support for vectorizing built-in functions
  rs6000: Builtin expansion, part 1
  rs6000: Builtin expansion, part 2
  rs6000: Builtin expansion, part 3
  rs6000: Builtin expansion, part 4
  rs6000: Builtin expansion, part 5
  rs6000: Builtin expansion, part 6
  rs6000: Update rs6000_builtin_decl
  rs6000: Miscellaneous uses of rs6000_builtin_decls_x
  rs6000: Debug support
  rs6000: Update altivec.h for automated interfaces
  rs6000: Test case adjustments
  rs6000: Enable the new builtin support

 gcc/Makefile.in   |5 +-
 gcc/config.gcc|2 +
 gcc/config/rs6000/altivec.h   |  516 +-
 gcc/config/rs6000/darwin.h|8 +-
 gcc/config/rs6000/rbtree.c|  242 +
 gcc/config/rs6000/rbtree.h|   52 +
 gcc/config/rs6000/rs6000-builtin-new.def  | 3875 +++
 gcc/config/rs6000/rs6000-c.c  | 1083 +++
 gcc/config/rs6000/rs6000-call.c   | 3383 -
 gcc/config/rs6000/rs6000-gen-builtins.c   | 2982 
 gcc/config/rs6000/rs6000-overload.def | 6076 +
 gcc/config/rs6000/rs6000.c|  219 +-
 gcc/config/rs6000/rs6000.h|   82 +
 gcc/config/rs6000/t-rs6000|   45 +-
 gcc/gengtype-state.c  |   29 +-
 gcc/gengtype.c|   19 +-
 gcc/gengtype.h|5 +
 .../powerpc/bfp/scalar-extract-exp-2.c|2 +-
 .../powerpc/bfp/scalar-extract-sig-2.c|2 +-
 .../powerpc/bfp/scalar-insert-exp-2.c |2 +-
 .../powerpc/bfp/scalar-i

[PATCH 04/55] rs6000: Add file support and functions for diagnostic support

2021-06-08 Thread Bill Schmidt via Gcc-patches
2021-06-07  Bill Schmidt  

gcc/
* config/rs6000/rs6000-gen-builtins.c (bif_file): New variable.
(ovld_file): Likewise.
(header_file): Likewise.
(init_file): Likewise.
(defines_file): Likewise.
(pgm_path): Likewise.
(bif_path): Likewise.
(ovld_path): Likewise.
(header_path): Likewise.
(init_path): Likewise.
(defines_path): Likewise.
(LINELEN): New macro.
(linebuf): New variable.
(line): Likewise.
(pos): Likewise.
(diag): Likewise.
(bif_diag): New function.
(ovld_diag): Likewise.
---
 gcc/config/rs6000/rs6000-gen-builtins.c | 47 +
 1 file changed, 47 insertions(+)

diff --git a/gcc/config/rs6000/rs6000-gen-builtins.c 
b/gcc/config/rs6000/rs6000-gen-builtins.c
index 0afbff8e3ab..f4ab0978f2c 100644
--- a/gcc/config/rs6000/rs6000-gen-builtins.c
+++ b/gcc/config/rs6000/rs6000-gen-builtins.c
@@ -163,3 +163,50 @@ along with GCC; see the file COPYING3.  If not see
 #include 
 #include 
 #include 
+
+/* Input and output file descriptors and pathnames.  */
+static FILE *bif_file;
+static FILE *ovld_file;
+static FILE *header_file;
+static FILE *init_file;
+static FILE *defines_file;
+
+static const char *pgm_path;
+static const char *bif_path;
+static const char *ovld_path;
+static const char *header_path;
+static const char *init_path;
+static const char *defines_path;
+
+/* Position information.  Note that "pos" is zero-indexed, but users
+   expect one-indexed column information, so representations of "pos"
+   as columns in diagnostic messages must be adjusted.  */
+#define LINELEN 1024
+static char linebuf[LINELEN];
+static int line;
+static int pos;
+
+/* Pointer to a diagnostic function.  */
+static void (*diag) (const char *, ...)
+  __attribute__ ((format (printf, 1, 2)));
+
+/* Custom diagnostics.  */
+static void __attribute__ ((format (printf, 1, 2)))
+bif_diag (const char * fmt, ...)
+{
+  va_list args;
+  fprintf (stderr, "%s:%d: ", bif_path, line);
+  va_start (args, fmt);
+  vfprintf (stderr, fmt, args);
+  va_end (args);
+}
+
+static void __attribute__ ((format (printf, 1, 2)))
+ovld_diag (const char * fmt, ...)
+{
+  va_list args;
+  fprintf (stderr, "%s:%d: ", ovld_path, line);
+  va_start (args, fmt);
+  vfprintf (stderr, fmt, args);
+  va_end (args);
+}
-- 
2.27.0



[PATCH 05/55] rs6000: Add helper functions for parsing

2021-06-08 Thread Bill Schmidt via Gcc-patches
2021-06-07  Bill Schmidt  

gcc/
* config/rs6000/rs6000-gen-builtins.c (consume_whitespace): New
function.
(advance_line): Likewise.
(safe_inc_pos): Likewise.
(match_identifier): Likewise.
(match_integer): Likewise.
(match_to_right_bracket): Likewise.
---
 gcc/config/rs6000/rs6000-gen-builtins.c | 111 
 1 file changed, 111 insertions(+)

diff --git a/gcc/config/rs6000/rs6000-gen-builtins.c 
b/gcc/config/rs6000/rs6000-gen-builtins.c
index f4ab0978f2c..f35893748d2 100644
--- a/gcc/config/rs6000/rs6000-gen-builtins.c
+++ b/gcc/config/rs6000/rs6000-gen-builtins.c
@@ -210,3 +210,114 @@ ovld_diag (const char * fmt, ...)
   vfprintf (stderr, fmt, args);
   va_end (args);
 }
+
+/* Pass over unprintable characters and whitespace (other than a newline,
+   which terminates the scan).  */
+static void
+consume_whitespace (void)
+{
+  while (pos < LINELEN && isspace(linebuf[pos]) && linebuf[pos] != '\n')
+pos++;
+  return;
+}
+
+/* Get the next nonblank, noncomment line, returning 0 on EOF, 1 otherwise.  */
+static int
+advance_line (FILE *file)
+{
+  while (1)
+{
+  /* Read ahead one line and check for EOF.  */
+  if (!fgets (linebuf, sizeof linebuf, file))
+   return 0;
+  line++;
+  size_t len = strlen (linebuf);
+  if (linebuf[len - 1] != '\n')
+   (*diag) ("line doesn't terminate with newline\n");
+  pos = 0;
+  consume_whitespace ();
+  if (linebuf[pos] != '\n' && linebuf[pos] != ';')
+   return 1;
+}
+}
+
+static inline void
+safe_inc_pos (void)
+{
+  if (pos++ >= LINELEN)
+{
+  (*diag) ("line length overrun.\n");
+  exit (1);
+}
+}
+
+/* Match an identifier, returning NULL on failure, else a pointer to a
+   buffer containing the identifier.  */
+static char *
+match_identifier (void)
+{
+  int lastpos = pos - 1;
+  while (isalnum (linebuf[lastpos + 1]) || linebuf[lastpos + 1] == '_')
+++lastpos;
+
+  if (lastpos < pos)
+return 0;
+
+  char *buf = (char *) malloc (lastpos - pos + 2);
+  memcpy (buf, &linebuf[pos], lastpos - pos + 1);
+  buf[lastpos - pos + 1] = '\0';
+
+  pos = lastpos + 1;
+  return buf;
+}
+
+/* Match an integer and return the string representing its value,
+   or a null string on failure.  */
+static char *
+match_integer (void)
+{
+  int startpos = pos;
+  if (linebuf[pos] == '-')
+safe_inc_pos ();
+
+  int lastpos = pos - 1;
+  while (isdigit (linebuf[lastpos + 1]))
+++lastpos;
+
+  if (lastpos < pos)
+return NULL;
+
+  pos = lastpos + 1;
+  char *buf = (char *) malloc (lastpos - startpos + 2);
+  memcpy (buf, &linebuf[startpos], lastpos - startpos + 1);
+  buf[lastpos - startpos + 1] = '\0';
+  return buf;
+}
+
+/* Match a string up to but not including a ']', and return its value,
+   or zero if there is nothing before the ']'.  Error if we don't find
+   such a character.  */
+static const char *
+match_to_right_bracket (void)
+{
+  int lastpos = pos - 1;
+  while (linebuf[lastpos + 1] != ']')
+{
+  if (linebuf[lastpos + 1] == '\n')
+   {
+ (*diag) ("no ']' found before end of line.\n");
+ exit (1);
+   }
+  ++lastpos;
+}
+
+  if (lastpos < pos)
+return 0;
+
+  char *buf = (char *) malloc (lastpos - pos + 2);
+  memcpy (buf, &linebuf[pos], lastpos - pos + 1);
+  buf[lastpos - pos + 1] = '\0';
+
+  pos = lastpos + 1;
+  return buf;
+}
-- 
2.27.0



[PATCH 06/55] rs6000: Add functions for matching types, part 1 of 3

2021-06-08 Thread Bill Schmidt via Gcc-patches
2021-06-07  Bill Schmidt  

gcc/
* config/rs6000/rs6000-gen-builtins.c (void_status): New enum.
(basetype): Likewise.
(typeinfo): Likewise.
(handle_pointer): New function.
(match_basetype): New stub function.
(match_const_restriction): Likewise.
(match_type): New function.
---
 gcc/config/rs6000/rs6000-gen-builtins.c | 367 
 1 file changed, 367 insertions(+)

diff --git a/gcc/config/rs6000/rs6000-gen-builtins.c 
b/gcc/config/rs6000/rs6000-gen-builtins.c
index f35893748d2..f31554e973b 100644
--- a/gcc/config/rs6000/rs6000-gen-builtins.c
+++ b/gcc/config/rs6000/rs6000-gen-builtins.c
@@ -186,6 +186,52 @@ static char linebuf[LINELEN];
 static int line;
 static int pos;
 
+/* Used to determine whether a type can be void (only return types).  */
+enum void_status
+{
+ VOID_NOTOK,
+ VOID_OK
+};
+
+/* Legal base types for an argument or return type.  */
+enum basetype
+{
+  BT_CHAR,
+  BT_SHORT,
+  BT_INT,
+  BT_LONG,
+  BT_LONGLONG,
+  BT_FLOAT,
+  BT_DOUBLE,
+  BT_LONGDOUBLE,
+  BT_INT128,
+  BT_FLOAT128,
+  BT_BOOL,
+  BT_STRING,
+  BT_DECIMAL32,
+  BT_DECIMAL64,
+  BT_DECIMAL128,
+  BT_IBM128,
+  BT_VPAIR,
+  BT_VQUAD
+};
+
+/* Type modifiers for an argument or return type.  */
+struct typeinfo
+{
+  char isvoid;
+  char isconst;
+  char isvector;
+  char issigned;
+  char isunsigned;
+  char isbool;
+  char ispixel;
+  char ispointer;
+  basetype base;
+  char *val1;
+  char *val2;
+};
+
 /* Pointer to a diagnostic function.  */
 static void (*diag) (const char *, ...)
   __attribute__ ((format (printf, 1, 2)));
@@ -321,3 +367,324 @@ match_to_right_bracket (void)
   pos = lastpos + 1;
   return buf;
 }
+
+static inline void
+handle_pointer (typeinfo *typedata)
+{
+  consume_whitespace ();
+  if (linebuf[pos] == '*')
+{
+  typedata->ispointer = 1;
+  safe_inc_pos ();
+}
+}
+
+/* Match one of the allowable base types.  Consumes one token unless the
+   token is "long", which must be paired with a second "long".  Optionally
+   consumes a following '*' token for pointers.  Return 1 for success,
+   0 for failure.  */
+static int
+match_basetype (typeinfo *typedata)
+{
+  return 1;
+}
+
+/* A const int argument may be restricted to certain values.  This is
+   indicated by one of the following occurring after the "int' token:
+
+ <x>   restricts the constant to x bits, interpreted as unsigned
+ <x,y> restricts the constant to the inclusive range [x,y]
+ [x,y] restricts the constant to the inclusive range [x,y],
+       but only applies if the argument is constant.
+ {x,y} restricts the constant to one of two values, x or y.
+
+   Here x and y are integer tokens.  Note that the "const" token is a
+   lie when the restriction is [x,y], but this simplifies the parsing
+   significantly and is hopefully forgivable.
+
+   Return 1 for success, else 0.  */
+static int
+match_const_restriction (typeinfo *typedata)
+{
+  return 1;
+}
+
+/* Look for a type, which can be terminated by a token that is not part of
+   a type, a comma, or a closing parenthesis.  Place information about the
+   type in TYPEDATA.  Return 1 for success, 0 for failure.  */
+static int
+match_type (typeinfo *typedata, int voidok)
+{
+  /* A legal type is of the form:
+
+   [const] [[signed|unsigned] <basetype> | <vectype>] [*]
+
+ Legal values of <basetype> are (for now):
+
+   char
+   short
+   int
+   long
+   long double
+   long long
+   float
+   double
+   __int128
+   _Float128
+   bool
+   string
+   _Decimal32
+   _Decimal64
+   _Decimal128
+   __ibm128
+
+ Legal values of <vectype> are as follows, and are shorthand for
+ the associated meaning:
+
+   vsc   vector signed char
+   vuc   vector unsigned char
+   vbc   vector bool char
+   vss   vector signed short
+   vus   vector unsigned short
+   vbs   vector bool short
+   vsi   vector signed int
+   vui   vector unsigned int
+   vbi   vector bool int
+   vsll  vector signed long long
+   vull  vector unsigned long long
+   vbll  vector bool long long
+   vsq   vector signed __int128
+   vuq   vector unsigned __int128
+   vbq   vector bool __int128
+   vp    vector pixel
+   vf    vector float
+   vd    vector double
+   v256  __vector_pair
+   v512  __vector_quad
+
+ For simplicity, we don't support "short int" and "long long int".
+ We don't currently support a <basetype> of "_Float16".  "signed"
+ and "unsigned" only apply to integral base types.  The optional *
+ indicates a pointer type.  */
+
+  consume_whitespace ();
+  memset (typedata, 0, sizeof *typedata);
+  int oldpos = pos;
+
+  char *token = match_identifier ();
+  if (!token)
+return 0;
+
+  if (!strcmp (token, "const"))
+{
+  typedata->isconst = 1;
+  consume_whitespace ();
+  oldpos = pos;
+  token = match_identifier ();
+   

[PATCH 01/55] rs6000: Support scanning of build-time GC roots in gengtype

2021-06-08 Thread Bill Schmidt via Gcc-patches
Currently gengtype supports scanning target-specific files for GC roots,
but those files must exist in the source tree.  This patch extends the
support to include header files generated into the build directory.  It
also allows targets to specify build dependencies for s-gtype to ensure
the built headers are up to date prior to running gengtype.

2021-06-08  Bill Schmidt  

gcc/
* Makefile.in (EXTRA_GTYPE_DEPS): New variable.
(s-gtype): Depend on EXTRA_GTYPE_DEPS.
* gengtype-state.c (state_writer::write_state_files_list): Add a
parameter to the fileslist expression for the number of build
headers to scan.
(read_state_files_list): Detect build headers and strip the
initial "./" from their names.
* gengtype.c (build_headers): New global variable.
(num_build_headers): Likewise.
(open_base_files): Emit #include for each build header.
(main): Detect and count build headers.
* gengtype.h (build_headers): New extern variable.
(num_build_headers): Likewise.
---
 gcc/Makefile.in  |  5 +++--
 gcc/gengtype-state.c | 29 +++--
 gcc/gengtype.c   | 19 ---
 gcc/gengtype.h   |  5 +
 4 files changed, 47 insertions(+), 11 deletions(-)

diff --git a/gcc/Makefile.in b/gcc/Makefile.in
index 4cb2966157e..f651b61481b 100644
--- a/gcc/Makefile.in
+++ b/gcc/Makefile.in
@@ -560,6 +560,7 @@ out_file=$(srcdir)/config/@out_file@
 out_object_file=@out_object_file@
 common_out_file=$(srcdir)/common/config/@common_out_file@
 common_out_object_file=@common_out_object_file@
+EXTRA_GTYPE_DEPS=
 md_file=$(srcdir)/common.md $(srcdir)/config/@md_file@
 tm_file_list=@tm_file_list@
 tm_include_list=@tm_include_list@
@@ -2734,8 +2735,8 @@ s-gtyp-input: Makefile
$(SHELL) $(srcdir)/../move-if-change tmp-gi.list gtyp-input.list
$(STAMP) s-gtyp-input
 
-s-gtype: build/gengtype$(build_exeext) $(filter-out [%], $(GTFILES)) \
-gtyp-input.list
+s-gtype: $(EXTRA_GTYPE_DEPS) build/gengtype$(build_exeext) \
+   $(filter-out [%], $(GTFILES)) gtyp-input.list
 # First, parse all files and save a state file.
$(RUN_GEN) build/gengtype$(build_exeext) $(GENGTYPE_FLAGS) \
 -S $(srcdir) -I gtyp-input.list -w tmp-gtype.state
diff --git a/gcc/gengtype-state.c b/gcc/gengtype-state.c
index e9775ed633e..5bb8f57c450 100644
--- a/gcc/gengtype-state.c
+++ b/gcc/gengtype-state.c
@@ -1269,7 +1269,7 @@ state_writer::write_state_files_list (void)
   int i = 0;
   /* Write the list of files with their lang_bitmap.  */
   begin_s_expr ("fileslist");
-  fprintf (state_file, "%d", (int) num_gt_files);
+  fprintf (state_file, "%d %d", (int) num_gt_files, (int) num_build_headers);
   for (i = 0; i < (int) num_gt_files; i++)
 {
   const char *cursrcrelpath = NULL;
@@ -2456,16 +2456,20 @@ read_state_files_list (void)
   struct state_token_st *t0 = peek_state_token (0);
   struct state_token_st *t1 = peek_state_token (1);
   struct state_token_st *t2 = peek_state_token (2);
+  struct state_token_st *t3 = peek_state_token (3);
 
   if (state_token_kind (t0) == STOK_LEFTPAR
   && state_token_is_name (t1, "!fileslist")
-  && state_token_kind (t2) == STOK_INTEGER)
+  && state_token_kind (t2) == STOK_INTEGER
+  && state_token_kind (t3) == STOK_INTEGER)
 {
-  int i = 0;
+  int i = 0, j = 0;
   num_gt_files = t2->stok_un.stok_num;
-  next_state_tokens (3);
-  t0 = t1 = t2 = NULL;
+  num_build_headers = t3->stok_un.stok_num;
+  next_state_tokens (4);
+  t0 = t1 = t2 = t3 = NULL;
   gt_files = XCNEWVEC (const input_file *, num_gt_files);
+  build_headers = XCNEWVEC (const char *, num_build_headers);
   for (i = 0; i < (int) num_gt_files; i++)
{
  bool issrcfile = FALSE;
@@ -2498,7 +2502,20 @@ read_state_files_list (void)
  free (fullpath);
}
  else
-   curgt = input_file_by_name (fnam);
+   {
+ curgt = input_file_by_name (fnam);
+ /* Look for a header file created during the build,
+which looks like "./<filename>.h".  */
+ int len = strlen (fnam);
+ if (len >= 5 && fnam[0] == '.' && fnam[1] == '/'
+ && fnam[len-2] == '.' && fnam[len-1] == 'h')
+   {
+ char *buf = (char *) xmalloc (len - 1);
+ /* Strip the leading "./" from the filename.  */
+ strcpy (buf, &fnam[2]);
+ build_headers[j++] = buf;
+   }
+   }
  set_lang_bitmap (curgt, bmap);
  gt_files[i] = curgt;
  next_state_tokens (2);
diff --git a/gcc/gengtype.c b/gcc/gengtype.c
index b94e2f126ec..c6786a58f9b 100644
--- a/gcc/gengtype.c
+++ b/gcc/geng

[PATCH 07/55] rs6000: Add functions for matching types, part 2 of 3

2021-06-08 Thread Bill Schmidt via Gcc-patches
2021-04-02  Bill Schmidt  

gcc/
* config/rs6000/rs6000-gen-builtins.c (match_basetype): Implement.
---
 gcc/config/rs6000/rs6000-gen-builtins.c | 64 +
 1 file changed, 64 insertions(+)

diff --git a/gcc/config/rs6000/rs6000-gen-builtins.c 
b/gcc/config/rs6000/rs6000-gen-builtins.c
index f31554e973b..95e269e7275 100644
--- a/gcc/config/rs6000/rs6000-gen-builtins.c
+++ b/gcc/config/rs6000/rs6000-gen-builtins.c
@@ -386,6 +386,70 @@ handle_pointer (typeinfo *typedata)
 static int
 match_basetype (typeinfo *typedata)
 {
+  consume_whitespace ();
+  int oldpos = pos;
+  char *token = match_identifier ();
+  if (!token)
+{
+  (*diag) ("missing base type in return type at column %d\n", pos + 1);
+  return 0;
+}
+
+  if (!strcmp (token, "char"))
+typedata->base = BT_CHAR;
+  else if (!strcmp (token, "short"))
+typedata->base = BT_SHORT;
+  else if (!strcmp (token, "int"))
+typedata->base = BT_INT;
+  else if (!strcmp (token, "long"))
+{
+  consume_whitespace ();
+  oldpos = pos;
+  char *mustbelongordbl = match_identifier ();
+  if (!mustbelongordbl)
+   typedata->base = BT_LONG;
+  else if (!strcmp (mustbelongordbl, "long"))
+   typedata->base = BT_LONGLONG;
+  else if (!strcmp (mustbelongordbl, "double"))
+   typedata->base = BT_LONGDOUBLE;
+  else
+   /* Speculatively accept "long" here and push back the token.
+  This occurs when "long" is a return type and the next token
+  is the function name.  */
+   {
+ typedata->base = BT_LONG;
+ pos = oldpos;
+   }
+}
+  else if (!strcmp (token, "float"))
+typedata->base = BT_FLOAT;
+  else if (!strcmp (token, "double"))
+typedata->base = BT_DOUBLE;
+  else if (!strcmp (token, "__int128"))
+typedata->base = BT_INT128;
+  else if (!strcmp (token, "_Float128"))
+typedata->base = BT_FLOAT128;
+  else if (!strcmp (token, "bool"))
+typedata->base = BT_BOOL;
+  /* A "string" is a special "const char *" -- we need it because it
+ cannot match either signed or unsigned char *.  */
+  else if (!strcmp (token, "string"))
+typedata->base = BT_STRING;
+  else if (!strcmp (token, "_Decimal32"))
+typedata->base = BT_DECIMAL32;
+  else if (!strcmp (token, "_Decimal64"))
+typedata->base = BT_DECIMAL64;
+  else if (!strcmp (token, "_Decimal128"))
+typedata->base = BT_DECIMAL128;
+  else if (!strcmp (token, "__ibm128"))
+typedata->base = BT_IBM128;
+  else
+{
+  (*diag) ("unrecognized base type at column %d\n", oldpos + 1);
+  return 0;
+}
+
+  handle_pointer (typedata);
   return 1;
 }
 
-- 
2.27.0



[PATCH 08/55] rs6000: Add functions for matching types, part 3 of 3

2021-06-08 Thread Bill Schmidt via Gcc-patches
2021-06-07  Bill Schmidt  

gcc/
* config/rs6000/rs6000-gen-builtins.c (restriction): New enum.
(typeinfo): Add restr field.
(match_bracketed_pair): New function.
(match_const_restriction): Implement.
---
 gcc/config/rs6000/rs6000-gen-builtins.c | 115 +++-
 1 file changed, 114 insertions(+), 1 deletion(-)

diff --git a/gcc/config/rs6000/rs6000-gen-builtins.c 
b/gcc/config/rs6000/rs6000-gen-builtins.c
index 95e269e7275..4dbb8d28b33 100644
--- a/gcc/config/rs6000/rs6000-gen-builtins.c
+++ b/gcc/config/rs6000/rs6000-gen-builtins.c
@@ -216,6 +216,22 @@ enum basetype
   BT_VQUAD
 };
 
+/* Ways in which a const int value can be restricted.  RES_BITS indicates
+   that the integer is restricted to val1 bits, interpreted as an unsigned
+   number.  RES_RANGE indicates that the integer is restricted to values
+   between val1 and val2, inclusive.  RES_VAR_RANGE is like RES_RANGE, but
+   the argument may be variable, so it can only be checked if it is constant.
+   RES_VALUES indicates that the integer must have one of the values val1
+   or val2.  */
+enum restriction
+{
+  RES_NONE,
+  RES_BITS,
+  RES_RANGE,
+  RES_VAR_RANGE,
+  RES_VALUES
+};
+
 /* Type modifiers for an argument or return type.  */
 struct typeinfo
 {
@@ -228,6 +244,7 @@ struct typeinfo
   char ispixel;
   char ispointer;
   basetype base;
+  restriction restr;
   char *val1;
   char *val2;
 };
@@ -453,6 +470,53 @@ match_basetype (typeinfo *typedata)
   return 1;
 }
 
+/* Helper routine for match_const_restriction.  */
+static int
+match_bracketed_pair (typeinfo *typedata, char open, char close,
+ restriction restr)
+{
+  if (linebuf[pos] == open)
+{
+  safe_inc_pos ();
+  int oldpos = pos;
+  char *x = match_integer ();
+  if (x == NULL)
+   {
+ (*diag) ("malformed integer at column %d.\n", oldpos + 1);
+ return 0;
+   }
+  consume_whitespace ();
+  if (linebuf[pos] != ',')
+   {
+ (*diag) ("missing comma at column %d.\n", pos + 1);
+ return 0;
+   }
+  safe_inc_pos ();
+  consume_whitespace ();
+  oldpos = pos;
+  char *y = match_integer ();
+  if (y == NULL)
+   {
+ (*diag) ("malformed integer at column %d.\n", oldpos + 1);
+ return 0;
+   }
+  typedata->restr = restr;
+  typedata->val1 = x;
+  typedata->val2 = y;
+
+  consume_whitespace ();
+  if (linebuf[pos] != close)
+   {
+ (*diag) ("malformed restriction at column %d.\n", pos + 1);
+ return 0;
+   }
+  safe_inc_pos ();
+  return 1;
+}
+
+  return 0;
+}
+
 /* A const int argument may be restricted to certain values.  This is
indicated by one of the following occurring after the "int' token:
 
@@ -470,7 +534,56 @@ match_basetype (typeinfo *typedata)
 static int
 match_const_restriction (typeinfo *typedata)
 {
-  return 1;
+  int oldpos = pos;
+  if (linebuf[pos] == '<')
+{
+  safe_inc_pos ();
+  oldpos = pos;
+  char *x = match_integer ();
+  if (x == NULL)
+   {
+ (*diag) ("malformed integer at column %d.\n", oldpos + 1);
+ return 0;
+   }
+  consume_whitespace ();
+  if (linebuf[pos] == '>')
+   {
+ typedata->restr = RES_BITS;
+ typedata->val1 = x;
+ safe_inc_pos ();
+ return 1;
+   }
+  else if (linebuf[pos] != ',')
+   {
+ (*diag) ("malformed restriction at column %d.\n", pos + 1);
+ return 0;
+   }
+  safe_inc_pos ();
+  oldpos = pos;
+  char *y = match_integer ();
+  if (y == NULL)
+   {
+ (*diag) ("malformed integer at column %d.\n", oldpos + 1);
+ return 0;
+   }
+  typedata->restr = RES_RANGE;
+  typedata->val1 = x;
+  typedata->val2 = y;
+
+  consume_whitespace ();
+  if (linebuf[pos] != '>')
+   {
+ (*diag) ("malformed restriction at column %d.\n", pos + 1);
+ return 0;
+   }
+  safe_inc_pos ();
+  return 1;
+}
+  else if (match_bracketed_pair (typedata, '{', '}', RES_VALUES)
+  || match_bracketed_pair (typedata, '[', ']', RES_VAR_RANGE))
+return 1;
+
+  return 0;
 }
 
 /* Look for a type, which can be terminated by a token that is not part of
-- 
2.27.0



[PATCH 09/55] rs6000: Red-black tree implementation for balanced tree search

2021-06-08 Thread Bill Schmidt via Gcc-patches
2021-06-08  Bill Schmidt  

gcc/
* config/rs6000/rbtree.c: New file.
* config/rs6000/rbtree.h: New file.
---
 gcc/config/rs6000/rbtree.c | 242 +
 gcc/config/rs6000/rbtree.h |  52 
 2 files changed, 294 insertions(+)
 create mode 100644 gcc/config/rs6000/rbtree.c
 create mode 100644 gcc/config/rs6000/rbtree.h

diff --git a/gcc/config/rs6000/rbtree.c b/gcc/config/rs6000/rbtree.c
new file mode 100644
index 000..37a559c1fbc
--- /dev/null
+++ b/gcc/config/rs6000/rbtree.c
@@ -0,0 +1,242 @@
+/* Partial red-black tree implementation for rs6000-gen-builtins.c.
+   Copyright (C) 2020-21 Free Software Foundation, Inc.
+   Contributed by Bill Schmidt, IBM 
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#include 
+#include 
+#include 
+#include 
+#include "rbtree.h"
+
+/* Initialize a red-black tree.  */
+void
+rbt_new (struct rbt_strings *t)
+{
+  t->rbt_nil = (rbt_string_node *) malloc (sizeof (rbt_string_node));
+  t->rbt_nil->color = RBT_BLACK;
+  t->rbt_root = t->rbt_nil;
+}
+
+/* Create a new node to be inserted into the red-black tree.  An inserted
+   node starts out red.  */
+static struct rbt_string_node *
+rbt_create_node (struct rbt_strings *t, char *str)
+{
+  struct rbt_string_node *nodeptr
+= (struct rbt_string_node *) malloc (sizeof (rbt_string_node));
+  nodeptr->str = str;
+  nodeptr->left = t->rbt_nil;
+  nodeptr->right = t->rbt_nil;
+  nodeptr->par = NULL;
+  nodeptr->color = RBT_RED;
+  return nodeptr;
+}
+
+/* Perform a left-rotate operation on NODE in the red-black tree.  */
+static void
+rbt_left_rotate (struct rbt_strings *t, struct rbt_string_node *node)
+{
+  struct rbt_string_node *right = node->right;
+  assert (right);
+
+  /* Turn RIGHT's left subtree into NODE's right subtree.  */
+  node->right = right->left;
+  if (right->left != t->rbt_nil)
+right->left->par = node;
+
+  /* Link NODE's parent to RIGHT.  */
+  right->par = node->par;
+
+  if (node->par == t->rbt_nil)
+t->rbt_root = right;
+  else if (node == node->par->left)
+node->par->left = right;
+  else
+node->par->right = right;
+
+  /* Put NODE on RIGHT's left.  */
+  right->left = node;
+  node->par = right;
+}
+
+/* Perform a right-rotate operation on NODE in the red-black tree.  */
+static void
+rbt_right_rotate (struct rbt_strings *t, struct rbt_string_node *node)
+{
+  struct rbt_string_node *left = node->left;
+  assert (left);
+
+  /* Turn LEFT's right subtree into NODE's left subtree.  */
+  node->left = left->right;
+  if (left->right != t->rbt_nil)
+left->right->par = node;
+
+  /* Link NODE's parent to LEFT.  */
+  left->par = node->par;
+
+  if (node->par == t->rbt_nil)
+t->rbt_root = left;
+  else if (node == node->par->right)
+node->par->right = left;
+  else
+node->par->left = left;
+
+  /* Put NODE on LEFT's right.  */
+  left->right = node;
+  node->par = left;
+}
+
+/* Insert STR into the tree, returning 1 for success and 0 if STR already
+   appears in the tree.  */
+int
+rbt_insert (struct rbt_strings *t, char *str)
+{
+  struct rbt_string_node *curr = t->rbt_root;
+  struct rbt_string_node *trail = t->rbt_nil;
+
+  while (curr != t->rbt_nil)
+{
+  trail = curr;
+  int cmp = strcmp (str, curr->str);
+  if (cmp < 0)
+   curr = curr->left;
+  else if (cmp > 0)
+   curr = curr->right;
+  else
+   return 0;
+}
+
+  struct rbt_string_node *fresh = rbt_create_node (t, str);
+  fresh->par = trail;
+
+  if (trail == t->rbt_nil)
+t->rbt_root = fresh;
+  else if (strcmp (fresh->str, trail->str) < 0)
+trail->left = fresh;
+  else
+trail->right = fresh;
+
+  fresh->left = t->rbt_nil;
+  fresh->right = t->rbt_nil;
+
+  /* FRESH has now been inserted as a red leaf.  If we have invalidated
+ one of the following preconditions, we must fix things up:
+  (a) If a node is red, both of its children are black.
+  (b) The root must be black.
+ Note that only (a) or (b) applies at any given time during the
+ process.  This algorithm works up the tree from FRESH looking
+ for a red child with a red parent, and cleaning that up.  If the
+ root ends up red, it gets turned black at the end.  */
+  curr = fresh;
+  while (curr->par->color == RBT_RED)
+if (curr->par == curr->par->par->left)
+  {
+   struct rbt_string_node *uncle = curr->par->par->right;
+   

[PATCH 03/55] rs6000: Add initial input files

2021-06-08 Thread Bill Schmidt via Gcc-patches
This patch adds a tiny subset of the built-in and overload descriptions.

2021-04-02  Bill Schmidt  

gcc/
* config/rs6000/rs6000-builtin-new.def: New.
* config/rs6000/rs6000-overload.def: New.
---
 gcc/config/rs6000/rs6000-builtin-new.def | 199 +++
 gcc/config/rs6000/rs6000-overload.def|  82 ++
 2 files changed, 281 insertions(+)
 create mode 100644 gcc/config/rs6000/rs6000-builtin-new.def
 create mode 100644 gcc/config/rs6000/rs6000-overload.def

diff --git a/gcc/config/rs6000/rs6000-builtin-new.def 
b/gcc/config/rs6000/rs6000-builtin-new.def
new file mode 100644
index 000..a84a3def2d5
--- /dev/null
+++ b/gcc/config/rs6000/rs6000-builtin-new.def
@@ -0,0 +1,199 @@
+; Built-in functions for PowerPC.
+; Copyright (C) 2020-21 Free Software Foundation, Inc.
+; Contributed by Bill Schmidt, IBM 
+;
+; This file is part of GCC.
+;
+; GCC is free software; you can redistribute it and/or modify it under
+; the terms of the GNU General Public License as published by the Free
+; Software Foundation; either version 3, or (at your option) any later
+; version.
+;
+; GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+; WARRANTY; without even the implied warranty of MERCHANTABILITY or
+; FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+; for more details.
+;
+; You should have received a copy of the GNU General Public License
+; along with GCC; see the file COPYING3.  If not see
+; <http://www.gnu.org/licenses/>.
+
+
+; Built-in functions in this file are organized into "stanzas", where
+; all built-ins in a given stanza are enabled together.  Each stanza
+; starts with a line identifying the circumstances in which the group of
+; functions is permitted, with the gating predicate in square brackets.
+; For example, this could be
+;
+; [altivec]
+;
+;   or it could be
+;
+; [power9]
+;
+; The bracketed gating predicate is the only information allowed on
+; the stanza header line, other than whitespace.
+;
+; Following the stanza header are two lines for each function: the
+; prototype line and the attributes line.  The prototype line has
+; this format, where the square brackets indicate optional
+; information and angle brackets indicate required information:
+;
+;   [kind] <return-type> <bif-name> (<argument-list>);
+;
+; Here [kind] can be one of "const", "pure", or "fpmath";
+; <return-type> is a legal type for a built-in function result;
+; <bif-name> is the name by which the function can be called;
+; and <argument-list> is a comma-separated list of legal types
+; for built-in function arguments.  The argument list may be
+; empty, but the parentheses and semicolon are required.
+;
+; A legal type is of the form:
+;
+;   [const] [[signed|unsigned] <basetype> | <vectype>] [*]
+;
+; where "const" applies only to a <basetype> of "int".  Legal values
+; of <basetype> are (for now):
+;
+;   char
+;   short
+;   int
+;   long
+;   long double
+;   long long
+;   float
+;   double
+;   __int128
+;   _Float128
+;   bool
+;   string
+;   _Decimal32
+;   _Decimal64
+;   _Decimal128
+;   __ibm128
+;
+; Legal values of <vectype> are as follows, and are shorthand for
+; the associated meaning:
+;
+;   vsc   vector signed char
+;   vuc   vector unsigned char
+;   vbc   vector bool char
+;   vss   vector signed short
+;   vus   vector unsigned short
+;   vbs   vector bool short
+;   vsi   vector signed int
+;   vui   vector unsigned int
+;   vbi   vector bool int
+;   vsll  vector signed long long
+;   vull  vector unsigned long long
+;   vbll  vector bool long long
+;   vsq   vector signed __int128
+;   vuq   vector unsigned __int128
+;   vbq   vector bool __int128
+;   vp    vector pixel
+;   vf    vector float
+;   vd    vector double
+;   v256  __vector_pair
+;   v512  __vector_quad
+;
+; For simplicity, we don't support "short int" and "long long int".
+; We don't currently support a <basetype> of "_Float16".  "signed"
+; and "unsigned" only apply to integral base types.  The optional *
+; indicates a pointer type.
+;
+; The attributes line looks like this:
+;
+;   <bif-id> <bif-pattern> {<attribute-list>}
+;
+; Here <bif-id> is a unique internal identifier for the built-in
+; function that will be used as part of an enumeration of all
+; built-in functions; <bif-pattern> is the define_expand or
+; define_insn that will be invoked when the call is expanded;
+; and <attribute-list> is a comma-separated list of special
+; conditions that apply to the built-in function.  The attribute
+; list may be empty, but the braces are required.
+;
+; Attributes are strings, and the allowed ones are listed below.
+;
+;   init     Process as a vec_init function
+;   set      Process as a vec_set function
+;   extract  Process as a vec_extract function
+;   nosoft   Not valid with -msoft-float
+;   ldvec    Needs special handling for vec_ld semantics
+;   stvec    Needs special handling for vec_st semantics
+;   reve Needs special handling for

[PATCH 10/55] rs6000: Main function with stubs for parsing and output

2021-06-08 Thread Bill Schmidt via Gcc-patches
2021-06-08  Bill Schmidt  

gcc/
* config/rs6000/rs6000-gen-builtins.c (rbtree.h): New #include.
(num_bifs): New variable.
(num_ovld_stanzas): Likewise.
(num_ovlds): Likewise.
(parse_codes): New enum.
(bif_rbt): New variable.
(ovld_rbt): Likewise.
(fntype_rbt): Likewise.
(bifo_rbt): Likewise.
(parse_bif): New stub function.
(create_bif_order): Likewise.
(parse_ovld): Likewise.
(write_header_file): Likewise.
(write_init_file): Likewise.
(write_defines_file): Likewise.
(delete_output_files): New function.
(main): Likewise.
---
 gcc/config/rs6000/rs6000-gen-builtins.c | 211 
 1 file changed, 211 insertions(+)

diff --git a/gcc/config/rs6000/rs6000-gen-builtins.c 
b/gcc/config/rs6000/rs6000-gen-builtins.c
index 4dbb8d28b33..1a9b73c193f 100644
--- a/gcc/config/rs6000/rs6000-gen-builtins.c
+++ b/gcc/config/rs6000/rs6000-gen-builtins.c
@@ -163,6 +163,7 @@ along with GCC; see the file COPYING3.  If not see
 #include 
 #include 
 #include 
+#include "rbtree.h"
 
 /* Input and output file descriptors and pathnames.  */
 static FILE *bif_file;
@@ -249,6 +250,29 @@ struct typeinfo
   char *val2;
 };
 
+static int num_bifs;
+static int num_ovld_stanzas;
+static int num_ovlds;
+
+/* Return codes for parsing routines.  */
+enum parse_codes
+{
+  PC_OK,
+  PC_EOFILE,
+  PC_EOSTANZA,
+  PC_PARSEFAIL
+};
+
+/* The red-black trees for built-in function identifiers, built-in
+   overload identifiers, and function type descriptors.  */
+static rbt_strings bif_rbt;
+static rbt_strings ovld_rbt;
+static rbt_strings fntype_rbt;
+
+/* Another red-black tree containing a mapping from built-in function
+   identifiers to the order in which they were encountered.  */
+static rbt_strings bifo_rbt;
+
 /* Pointer to a diagnostic function.  */
 static void (*diag) (const char *, ...)
   __attribute__ ((format (printf, 1, 2)));
@@ -865,3 +889,190 @@ match_type (typeinfo *typedata, int voidok)
 
   return 1;
 }
+
+/* Parse the built-in file.  */
+static parse_codes
+parse_bif (void)
+{
+  return PC_OK;
+}
+
+/* Create a mapping from function IDs in their final order to the order
+   they appear in the built-in function file.  */
+static void
+create_bif_order (void)
+{
+}
+
+/* Parse the overload file.  */
+static parse_codes
+parse_ovld (void)
+{
+  return PC_OK;
+}
+
+/* Write everything to the header file (rs6000-builtins.h).  */
+static int
+write_header_file (void)
+{
+  return 1;
+}
+
+/* Write everything to the initialization file (rs6000-builtins.c).  */
+static int
+write_init_file (void)
+{
+  return 1;
+}
+
+/* Write everything to the include file (rs6000-vecdefines.h).  */
+static int
+write_defines_file (void)
+{
+  return 1;
+}
+
+/* Close and delete output files after any failure, so that subsequent
+   build dependencies will fail.  */
+static void
+delete_output_files (void)
+{
+  /* Depending on whence we're called, some of these may already be
+ closed.  Don't check for errors.  */
+  fclose (header_file);
+  fclose (init_file);
+  fclose (defines_file);
+
+  unlink (header_path);
+  unlink (init_path);
+  unlink (defines_path);
+}
+
+/* Main program to convert flat files into built-in initialization code.  */
+int
+main (int argc, const char **argv)
+{
+  if (argc != 6)
+{
+  fprintf (stderr,
+  "Five arguments required: two input file and three output "
+  "files.\n");
+  exit (1);
+}
+
+  pgm_path = argv[0];
+  bif_path = argv[1];
+  ovld_path = argv[2];
+  header_path = argv[3];
+  init_path = argv[4];
+  defines_path = argv[5];
+
+  bif_file = fopen (bif_path, "r");
+  if (!bif_file)
+{
+  fprintf (stderr, "Cannot find input built-in file '%s'.\n", bif_path);
+  exit (1);
+}
+  ovld_file = fopen (ovld_path, "r");
+  if (!ovld_file)
+{
+  fprintf (stderr, "Cannot find input overload file '%s'.\n", ovld_path);
+  exit (1);
+}
+  header_file = fopen (header_path, "w");
+  if (!header_file)
+{
+  fprintf (stderr, "Cannot open header file '%s' for output.\n",
+  header_path);
+  exit (1);
+}
+  init_file = fopen (init_path, "w");
+  if (!init_file)
+{
+  fprintf (stderr, "Cannot open init file '%s' for output.\n", init_path);
+  exit (1);
+}
+  defines_file = fopen (defines_path, "w");
+  if (!defines_file)
+{
+  fprintf (stderr, "Cannot open defines file '%s' for output.\n",
+  defines_path);
+  exit (1);
+}
+
+  /* Initialize the balanced trees containing built-in function ids,
+ overload function ids, and function type declaration ids.  */
+  rbt_new (&bif_rbt);
+  rbt_new (&ovld_rbt);
+  rbt_new (&fntype_rbt);
+
+  /* Initialize another balanced tree that contains a map from built-in
+ function ids to the order in which they were encountered.  */
+  rbt_new (&bifo_rbt);
+
+  /* Parse the built-in function f

  1   2   >