[PATCH] xtensa: Remove old broken tweak for leaf function

2023-01-13 Thread Takayuki 'January June' Suwa via Gcc-patches
In the pre-IRA era, ORDER_REGS_FOR_LOCAL_ALLOC was called for each
function on Xtensa, and the register allocation order was reordered for
leaf functions to compensate for the poor performance of local-alloc.

Today the adjustment hook is still called, via its replacement
ADJUST_REG_ALLOC_ORDER, but only once at the start of IRA; at that point
leaf_function_p() erroneously returns true, and no argument count is
available either.

This misleads register allocation into treating every function as a leaf
function with no arguments, which leads to inefficient allocation results.

Fortunately, IRA is smarter than local-alloc and does not need such assistance.

This patch does away with this antiquated tweak by removing the wreckage
that no longer works.

gcc/ChangeLog:

* config/xtensa/xtensa-protos.h (order_regs_for_local_alloc):
  Rename to xtensa_adjust_reg_alloc_order.
* config/xtensa/xtensa.cc (xtensa_adjust_reg_alloc_order):
  Ditto.  And also remove code to reorder register numbers for
  leaf functions, rename the tables, and adjust the allocation
  order for the call0 ABI to use register A0 more.
  (xtensa_leaf_regs): Remove.
* config/xtensa/xtensa.h (REG_ALLOC_ORDER): Cosmetics.
  (order_regs_for_local_alloc): Rename as the above.
  (LEAF_REGISTERS, LEAF_REG_REMAP, leaf_function): Remove.
---
 gcc/config/xtensa/xtensa-protos.h |  2 +-
 gcc/config/xtensa/xtensa.cc   | 77 +++
 gcc/config/xtensa/xtensa.h| 51 ++--
 3 files changed, 31 insertions(+), 99 deletions(-)

diff --git a/gcc/config/xtensa/xtensa-protos.h b/gcc/config/xtensa/xtensa-protos.h
index 91a215e535d..7b5790c5fc4 100644
--- a/gcc/config/xtensa/xtensa-protos.h
+++ b/gcc/config/xtensa/xtensa-protos.h
@@ -78,7 +78,7 @@ extern long compute_frame_size (poly_int64);
 extern bool xtensa_use_return_instruction_p (void);
 extern void xtensa_expand_prologue (void);
 extern void xtensa_expand_epilogue (bool);
-extern void order_regs_for_local_alloc (void);
+extern void xtensa_adjust_reg_alloc_order (void);
 extern enum reg_class xtensa_regno_to_class (int regno);
 extern HOST_WIDE_INT xtensa_initial_elimination_offset (int from, int to);
 extern const char **xtensa_get_config_strings (void);
diff --git a/gcc/config/xtensa/xtensa.cc b/gcc/config/xtensa/xtensa.cc
index 6cf6b35399a..df9b53aeced 100644
--- a/gcc/config/xtensa/xtensa.cc
+++ b/gcc/config/xtensa/xtensa.cc
@@ -107,18 +107,6 @@ struct GTY(()) machine_function
   rtx last_logues_a9_content;
 };
 
-/* Vector, indexed by hard register number, which contains 1 for a
-   register that is allowable in a candidate for leaf function
-   treatment.  */
-
-const char xtensa_leaf_regs[FIRST_PSEUDO_REGISTER] =
-{
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1
-};
-
 static void xtensa_option_override (void);
 static enum internal_test map_test_to_internal_test (enum rtx_code);
 static rtx gen_int_relational (enum rtx_code, rtx, rtx);
@@ -4140,58 +4128,25 @@ xtensa_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
   return NO_REGS;
 }
 
+/* Called once at the start of IRA, by ADJUST_REG_ALLOC_ORDER.  */
 
 void
-order_regs_for_local_alloc (void)
+xtensa_adjust_reg_alloc_order (void)
 {
-  if (!leaf_function_p ())
-{
-  static const int reg_nonleaf_alloc_order[FIRST_PSEUDO_REGISTER] =
-   REG_ALLOC_ORDER;
-  static const int reg_nonleaf_alloc_order_call0[FIRST_PSEUDO_REGISTER] =
-   {
- 11, 10,  9,  8,  7,  6,  5,  4,  3,  2, 12, 13, 14, 15,
- 18,
- 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
- 0,  1, 16, 17,
- 35,
-   };
-
-  memcpy (reg_alloc_order, TARGET_WINDOWED_ABI ?
- reg_nonleaf_alloc_order : reg_nonleaf_alloc_order_call0,
- FIRST_PSEUDO_REGISTER * sizeof (int));
-}
-  else
-{
-  int i, num_arg_regs;
-  int nxt = 0;
-
-  /* Use the AR registers in increasing order (skipping a0 and a1)
-but save the incoming argument registers for a last resort.  */
-  num_arg_regs = crtl->args.info.arg_words;
-  if (num_arg_regs > MAX_ARGS_IN_REGISTERS)
-   num_arg_regs = MAX_ARGS_IN_REGISTERS;
-  for (i = GP_ARG_FIRST; i < 16 - num_arg_regs; i++)
-   reg_alloc_order[nxt++] = i + num_arg_regs;
-  for (i = 0; i < num_arg_regs; i++)
-   reg_alloc_order[nxt++] = GP_ARG_FIRST + i;
-
-  /* List the coprocessor registers in order.  */
-  for (i = 0; i < BR_REG_NUM; i++)
-   reg_alloc_order[nxt++] = BR_REG_FIRST + i;
-
-  /* List the FP registers in order for now.  */
-  for (i = 0; i < 16; i++)
-   reg_alloc_order[nxt++] = FP_REG_FIRST + i;
-
-  /* GCC requires that we list *all* the registers  */
-  reg_alloc_order[nxt++] = 0;  /* a0 = return address */
-  reg_alloc_order[nxt++] = 1;  /* a1

Re: [RFC] Introduce -finline-memset-loops

2023-01-13 Thread Alexandre Oliva via Gcc-patches
Hello, Paul,

On Jan 13, 2023, Paul Koning  wrote:

>> On Jan 13, 2023, at 8:54 PM, Alexandre Oliva via Gcc-patches
>>  wrote:

>> Target-specific code is great for tight optimizations, but the main
>> purpose of this feature is not an optimization.  AFAICT it actually
>> slows things down in general (due to code growth, and to conservative
>> assumptions about alignment), 

> I thought machinery like the memcpy patterns have as one of their
> benefits the ability to find the alignment of their operands and from
> that optimize things.  So I don't understand why you'd say
> "conservative".

Though memcpy implementations normally do that indeed, dynamically
increasing dest alignment has such an impact on code size that *inline*
memcpy doesn't normally do that.  try_store_by_multiple_pieces,
specifically, is potentially branch-heavy to begin with, and bumping
alignment up could double the inline expansion size.  So what it does is
to take the conservative dest alignment estimate from the compiler and
use it.

By adding leading loops to try_store_by_multiple_pieces (as does the
proposed patch, with its option enabled) we may expand an
unknown-length, unknown-alignment memset to something conceptually like
(cims is short for constant-sized inlined memset):

while (len >= 64) { len -= 64; cims(dest, c, 64); dest += 64; }
if (len >= 32) { len -= 32; cims(dest, c, 32); dest += 32; }
if (len >= 16) { len -= 16; cims(dest, c, 16); dest += 16; }
if (len >= 8) { len -= 8; cims(dest, c, 8); dest += 8; }
if (len >= 4) { len -= 4; cims(dest, c, 4); dest += 4; }
if (len >= 2) { len -= 2; cims(dest, c, 2); dest += 2; }
if (len >= 1) { len -= 1; cims(dest, c, 1); dest += 1; }

With dynamic alignment bumps under a trivial extension of the current
logic, it would become (cimsN is short for cims with dest known to be
aligned to an N-byte boundary):

if (len >= 2 && (dest & 1)) { len -= 1; cims(dest, c, 1); dest += 1; }
if (len >= 4 && (dest & 2)) { len -= 2; cims2(dest, c, 2); dest += 2; }
if (len >= 8 && (dest & 4)) { len -= 4; cims4(dest, c, 4); dest += 4; }
if (len >= 16 && (dest & 8)) { len -= 8; cims8(dest, c, 8); dest += 8; }
if (len >= 32 && (dest & 16)) { len -= 16; cims16(dest, c, 16); dest += 16; }
if (len >= 64 && (dest & 32)) { len -= 32; cims32(dest, c, 32); dest += 32; }
while (len >= 64) { len -= 64; cims64(dest, c, 64); dest += 64; }
if (len >= 32) { len -= 32; cims32(dest, c, 32); dest += 32; }
if (len >= 16) { len -= 16; cims16(dest, c, 16); dest += 16; }
if (len >= 8) { len -= 8; cims8(dest, c, 8); dest += 8; }
if (len >= 4) { len -= 4; cims4(dest, c, 4); dest += 4; }
if (len >= 2) { len -= 2; cims2(dest, c, 2); dest += 2; }
if (len >= 1) { len -= 1; cims(dest, c, 1); dest += 1; }


Now, by using more loops instead of going through every power of two, we
could shorten (for -Os) the former to e.g.:

while (len >= 64) { len -= 64; cims(dest, c, 64); dest += 64; }
while (len >= 8) { len -= 8; cims(dest, c, 8); dest += 8; }
while (len >= 1) { len -= 1; cims(dest, c, 1); dest += 1; }

and we could similarly add more compact logic for dynamic alignment:

if (len >= 8) {
  while (dest & 7) { len -= 1; cims(dest, c, 1); dest += 1; }
  if (len >= 64)
while (dest & 56) { len -= 8; cims8(dest, c, 8); dest += 8; }
  while (len >= 64) { len -= 64; cims64(dest, c, 64); dest += 64; }
  while (len >= 8) { len -= 8; cims8(dest, c, 8); dest += 8; }
}
while (len >= 1) { len -= 1; cims(dest, c, 1); dest += 1; }


Now, given that improving performance was never a goal of this change, and
the expansion it optionally offers is desirable even when it slows
things down, just making it a simple loop at the known alignment would
do.  The remainder sort of flowed out of the way
try_store_by_multiple_pieces was structured, and I found it sort of made
sense to start with the largest-reasonable block loop, and then end with
whatever try_store_by_multiple_pieces would have expanded a
known-shorter but variable length memset to.  And this is how I got to
it.  I'm not sure it makes any sense to try to change things further to
satisfy other competing goals such as performance or code size.

-- 
Alexandre Oliva, happy hacker            https://FSFLA.org/blogs/lxo/
   Free Software Activist   GNU Toolchain Engineer
Disinformation flourishes because many people care deeply about injustice
but very few check the facts.  Ask me about 


Re: [RFC] Introduce -finline-memset-loops

2023-01-13 Thread Paul Koning via Gcc-patches



> On Jan 13, 2023, at 8:54 PM, Alexandre Oliva via Gcc-patches 
>  wrote:
> 
> Hello, Richard,
> 
> Thank you for the feedback.
> 
> On Jan 12, 2023, Richard Biener  wrote:
> 
>> On Tue, Dec 27, 2022 at 5:12 AM Alexandre Oliva via Gcc-patches
>>  wrote:
> 
>>> This patch extends the memset expansion to start with a loop, so as to
>>> still take advantage of known alignment even with long lengths, but
>>> without necessarily adding store blocks for every power of two.
> 
>> I wonder if that isn't better handled by targets via the setmem pattern,
> 
> That was indeed where I started, but then I found myself duplicating the
> logic in try_store_by_multiple_pieces on a per-target basis.
> 
> Target-specific code is great for tight optimizations, but the main
> purpose of this feature is not an optimization.  AFAICT it actually
> slows things down in general (due to code growth, and to conservative
> assumptions about alignment), 

I thought machinery like the memcpy patterns have as one of their benefits the 
ability to find the alignment of their operands and from that optimize things.  
So I don't understand why you'd say "conservative".

paul




Re: [RFC] Introduce -finline-memset-loops

2023-01-13 Thread Alexandre Oliva via Gcc-patches
Hello, Richard,

Thank you for the feedback.

On Jan 12, 2023, Richard Biener  wrote:

> On Tue, Dec 27, 2022 at 5:12 AM Alexandre Oliva via Gcc-patches
>  wrote:

>> This patch extends the memset expansion to start with a loop, so as to
>> still take advantage of known alignment even with long lengths, but
>> without necessarily adding store blocks for every power of two.

> I wonder if that isn't better handled by targets via the setmem pattern,

That was indeed where I started, but then I found myself duplicating the
logic in try_store_by_multiple_pieces on a per-target basis.

Target-specific code is great for tight optimizations, but the main
purpose of this feature is not an optimization.  AFAICT it actually
slows things down in general (due to code growth, and to conservative
assumptions about alignment), except perhaps for some microbenchmarks.
It's rather a means to avoid depending on the C runtime, particularly
due to compiler-introduced memset calls.

My initial goal was to be able to show that inline expansion would NOT
bring about performance improvements, but performance was not the
concern that led to the request.

If the approach seems generally acceptable, I may even end up extending
it to other such builtins.  I have a vague recollection that memcmp is
also an issue for us.

> like x86 has the stringop inline strategy.  What is considered acceptable
> in terms of size or performance will vary and I don't think there's much
> room for improvements on this generic code support?

*nod* x86 is quite finely tuned already; I suppose other targets may
have some room for additional tuning, both for performance and for code
size, but we don't have much affordance for avoiding builtin calls to
the C runtime, which is what this is about.

Sometimes disabling loop distribution is enough to accomplish that, but
in some cases GNAT itself resorts to builtin memset calls, in ways that
are not so easy to avoid, and that would ultimately amount to expanding
memset inline, so I figured we might as well offer that as a general
feature, for users to whom this matters.
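
To make the kind of compiler-introduced call concrete, here is a minimal,
hypothetical sketch (not one of the GNAT cases above): under
-ftree-loop-distribute-patterns, GCC may replace the loop below with a call
to memset, which is exactly the libc dependency that -finline-memset-loops
is meant to let users avoid.

/* Hypothetical example: a plain clearing loop.  Loop distribution can
   recognize the store pattern and emit a call to memset, so the object
   code depends on libc even though the source never names memset.  */
void
clear_buffer (unsigned char *buf, unsigned long n)
{
  for (unsigned long i = 0; i < n; i++)
    buf[i] = 0;
}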

Is (optionally) tending to this (uncommon, I suppose) need (or
preference?) not something GCC would like to do?

-- 
Alexandre Oliva, happy hacker            https://FSFLA.org/blogs/lxo/
   Free Software Activist   GNU Toolchain Engineer
Disinformation flourishes because many people care deeply about injustice
but very few check the facts.  Ask me about 


[PATCH v4] c++: Reject UDLs in certain contexts [PR105300]

2023-01-13 Thread Marek Polacek via Gcc-patches
On Sat, Dec 03, 2022 at 02:58:16PM -0500, Jason Merrill wrote:
> On 12/2/22 18:58, Marek Polacek wrote:
> > On Fri, Nov 18, 2022 at 08:39:10PM -0500, Jason Merrill wrote:
> > > On 11/18/22 18:52, Marek Polacek wrote:
> > > > +/* Parse a string literal or user defined string literal.
> > > > +
> > > > +   user-defined-string-literal :
> > > > + string-literal ud-suffix
> > > > +
> > > > +   Parameters as for cp_parser_string_literal.  If LOOKUP_UDLIT, 
> > > > perform
> > > > +   a lookup for a suitable template function.  */
> > > > +
> > > > +static inline cp_expr
> > > > +cp_parser_userdef_string_literal (cp_parser *parser, bool translate,
> > > > + bool wide_ok, bool lookup_udlit = 
> > > > true)
> > > 
> > > I think this function doesn't need the translate and wide_ok parms, they 
> > > can
> > > always be true.
> > 
> > I've dropped the wide_ok one, but not the other, because...
> > > > +{
> > > > +  return cp_parser_string_literal_common (parser, translate, wide_ok,
> > > > + /*udl_ok=*/true, 
> > > > lookup_udlit);
> > > > +}
> > > > +
> > > >/* Look up a literal operator with the name and the exact arguments. 
> > > >  */
> > > >static tree
> > > > @@ -4913,7 +4955,7 @@ cp_parser_userdef_numeric_literal (cp_parser 
> > > > *parser)
> > > >   as arguments.  */
> > > >static tree
> > > > -cp_parser_userdef_string_literal (tree literal)
> > > > +finish_userdef_string_literal (tree literal)
> > > >{
> > > >  tree suffix_id = USERDEF_LITERAL_SUFFIX_ID (literal);
> > > >  tree name = cp_literal_operator_id (IDENTIFIER_POINTER 
> > > > (suffix_id));
> > > > @@ -5652,10 +5694,10 @@ cp_parser_primary_expression (cp_parser *parser,
> > > >case CPP_UTF8STRING_USERDEF:
> > > >  /* ??? Should wide strings be allowed when 
> > > > parser->translate_strings_p
> > > >  is false (i.e. in attributes)?  If not, we can kill the third
> > > > -argument to cp_parser_string_literal.  */
> > > 
> > > I think the answer to this old question is no: if we have an
> > > encoding-prefix, we should be translating.
> > 
> > ...I don't actually know how to resolve this.  wide_ok is always true here.
> > Should that change?  Or rather, should translate be false for CPP_STRING 
> > only?

Sorry it's taken so long to get back to this.
 
> The one current exception to my assertion above is static_assert, for which
> we currently allow encoding-prefixes but don't translate.  I think this is
> wrong, that we should translate the string.  But I'm not confident of that.
> 
> But to your question, yes: when translate is false, I think we also don't
> want to allow UDLs.  So _userdef can always pass true for translate.  And as
> below we should call it only when translate would be true.

Done: _userdef no longer has the translate parameter and it's only called
when parser->translate_strings_p.
 
> Incidentally, it seems that we set translate off for all attributes, even
> ones that would take a normal expression argument where presumably we do
> want translation (and UDLs).  The whole business of different parsing for
> different attributes is a headache.  You don't need to deal with this now.
> 
> > > > -  return (cp_parser_string_literal (parser,
> > > > -   parser->translate_strings_p,
> > > > -   true)
> > > > +argument to cp_parser_{,userdef}string_literal.  */
> > > > +  return (cp_parser_userdef_string_literal (parser,
> > > > +   
> > > > parser->translate_strings_p,
> > > > +   /*wide_ok=*/true)
> > > 
> > > For CPP_*STRING* without _USERDEF, we should still call
> > > cp_parser_string_literal.
> > 
> > It looks like we always have to call cp_parser_userdef_string_literal
> > otherwise this would be rejected:
> > 
> >std::string concat01 = "Hello, " "World!"_www;
> > 
> > Because first we see a CPP_STRING but the subsequent UDL shouldn't
> > be rejected.
> 
> Ah, I didn't notice the function was handling a sequence of string-literals.
> So maybe we want to call _userdef here when translate_strings_p, and not
> when it's false.

Resolved by the change above.  Thanks,

Bootstrapped/regtested on x86_64-pc-linux-gnu, ok for trunk?

-- >8 --
In this PR, we are crashing because we've encountered a UDL where a
string-literal is expected.  This patch makes the parser reject string
and character UDLs in all places where the grammar requires a
string-literal and not a user-defined-string-literal.

I've introduced two new wrappers; the existing cp_parser_string_literal
was renamed to cp_parser_string_literal_common and should not be called
directly.  finish_userdef_string_literal is renamed from
cp_parser_userdef_string_literal.

PR c++/105300

gcc/c-family/ChangeLog:

* c-pragma.cc (handle_pragma_message): Warn for

[committed] analyzer: add heuristics for switch on enum type [PR105273]

2023-01-13 Thread David Malcolm via Gcc-patches
Assume that a switch on an enum value doesn't follow the implicit default
branch (skipping all the cases) when every value of the enum is covered by
a case.

Fixes various false positives from -Wanalyzer-use-of-uninitialized-value
such as this one seen in Doom:

p_maputl.c: In function 'P_BoxOnLineSide':
p_maputl.c:151:8: warning: use of uninitialized value 'p1' [CWE-457] 
[-Wanalyzer-use-of-uninitialized-value]
  151 | if (p1 == p2)
  |^
  'P_BoxOnLineSide': events 1-5
|
|  115 | int p1;
|  | ^~
|  | |
|  | (1) region created on stack here
|  | (2) capacity: 4 bytes
|..
|  118 | switch (ld->slopetype)
|  | ~~
|  | |
|  | (3) following 'default:' branch...
|..
|  151 | if (p1 == p2)
|  |~
|  ||
|  |(4) ...to here
|  |(5) use of uninitialized value 'p1' here
|

where "ld->slopetype" is a "slopetype_t" enum, and for every value of
that enum the switch has a case that initializes "p1".
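
A minimal, hypothetical reduction of that pattern (not the Doom source or
the new testcases) looks like this:

enum slope { HORIZONTAL, VERTICAL, POSITIVE, NEGATIVE };

int
box_side (enum slope s)
{
  int p1;
  switch (s)
    {
    case HORIZONTAL: p1 = 0; break;
    case VERTICAL:   p1 = 1; break;
    case POSITIVE:   p1 = 2; break;
    case NEGATIVE:   p1 = 3; break;
    }
  /* Every enum value is covered above, so the analyzer should no longer
     follow the implicit "default:" edge and warn that p1 may be used
     uninitialized here.  */
  return p1;
}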

Successfully bootstrapped & regrtested on x86_64-pc-linux-gnu.
Pushed to trunk as r13-5159-gccd4df81aa6537.

gcc/analyzer/ChangeLog:
PR analyzer/105273
* region-model.cc (has_nondefault_case_for_value_p): New.
(has_nondefault_cases_for_all_enum_values_p): New.
(region_model::apply_constraints_for_gswitch): Skip
implicitly-created "default" when switching on an enum
and all enum values have non-default cases.
(rejected_default_case::dump_to_pp): New.
* region-model.h (region_model_context::possibly_tainted_p): New
decl.
(class rejected_default_case): New.
* sm-taint.cc (region_model_context::possibly_tainted_p): New.
* supergraph.cc (switch_cfg_superedge::dump_label_to_pp): Dump
when implicitly_created_default_p.
(switch_cfg_superedge::implicitly_created_default_p): New.
* supergraph.h
(switch_cfg_superedge::implicitly_created_default_p): New decl.

gcc/testsuite/ChangeLog:
PR analyzer/105273
* gcc.dg/analyzer/switch-enum-1.c: New test.
* gcc.dg/analyzer/switch-enum-2.c: New test.
* gcc.dg/analyzer/switch-enum-pr105273-git-vreportf-2.c: New test.
* gcc.dg/analyzer/switch-enum-taint-1.c: New test.
* gcc.dg/analyzer/switch-wrong-enum.c: New test.
* gcc.dg/analyzer/torture/switch-enum-pr105273-doom-p_floor.c: New
test.
* gcc.dg/analyzer/torture/switch-enum-pr105273-doom-p_maputl.c:
New test.
* gcc.dg/analyzer/torture/switch-enum-pr105273-git-vreportf-1.c:
New test.

Signed-off-by: David Malcolm 
---
 gcc/analyzer/region-model.cc  | 104 +-
 gcc/analyzer/region-model.h   |  12 ++
 gcc/analyzer/sm-taint.cc  |  25 
 gcc/analyzer/supergraph.cc|  22 +++
 gcc/analyzer/supergraph.h |   2 +
 gcc/testsuite/gcc.dg/analyzer/switch-enum-1.c | 136 ++
 gcc/testsuite/gcc.dg/analyzer/switch-enum-2.c | 132 +
 .../switch-enum-pr105273-git-vreportf-2.c |  40 ++
 .../gcc.dg/analyzer/switch-enum-taint-1.c | 102 +
 .../gcc.dg/analyzer/switch-wrong-enum.c   |  27 
 .../switch-enum-pr105273-doom-p_floor.c   |  89 
 .../switch-enum-pr105273-doom-p_maputl.c  |  86 +++
 .../switch-enum-pr105273-git-vreportf-1.c |  35 +
 13 files changed, 810 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/analyzer/switch-enum-1.c
 create mode 100644 gcc/testsuite/gcc.dg/analyzer/switch-enum-2.c
 create mode 100644 
gcc/testsuite/gcc.dg/analyzer/switch-enum-pr105273-git-vreportf-2.c
 create mode 100644 gcc/testsuite/gcc.dg/analyzer/switch-enum-taint-1.c
 create mode 100644 gcc/testsuite/gcc.dg/analyzer/switch-wrong-enum.c
 create mode 100644 
gcc/testsuite/gcc.dg/analyzer/torture/switch-enum-pr105273-doom-p_floor.c
 create mode 100644 
gcc/testsuite/gcc.dg/analyzer/torture/switch-enum-pr105273-doom-p_maputl.c
 create mode 100644 
gcc/testsuite/gcc.dg/analyzer/torture/switch-enum-pr105273-git-vreportf-1.c

diff --git a/gcc/analyzer/region-model.cc b/gcc/analyzer/region-model.cc
index 2e59fbaadd7..6a3a1b474bf 100644
--- a/gcc/analyzer/region-model.cc
+++ b/gcc/analyzer/region-model.cc
@@ -4341,6 +4341,72 @@ region_model::apply_constraints_for_gcond (const 
cfg_superedge &sedge,
   return add_constraint (lhs, op, rhs, ctxt, out);
 }
 
+/* Return true iff SWITCH_STMT has a non-default label that contains
+   INT_CST.  */
+
+static bool
+has_nondefault_case_for_value_p (const gswitch *switch_stmt, tree int_cst)
+{
+  /* We expect the initial label to be the default; skip it.  */
+  gcc_assert (CASE_LOW (gimple_switch_label (switch_stmt, 0)) == NULL);
+  unsigned min_idx = 1;
+  unsigned max_idx = 

Re: [GCC][PATCH 13/15, v5] arm: Add support for dwarf debug directives and pseudo hard-register for PAC feature.

2023-01-13 Thread Jakub Jelinek via Gcc-patches
On Fri, Jan 13, 2023 at 10:39:59PM +, Richard Earnshaw wrote:
> > > It is.  The new unwinder fortunately doesn't suffer from this (at least I
> > > think it doesn't), but in older gccs the unwinder could be split
> > > across different
> > > objects, having e.g. parts of the unwinder in one shared library and
> > > another
> > > part in another one, each built by different GCC version.
> > > 
> > > Guess targets which weren't supported in GCC 2.x are ok, while
> > > __frame_state_for is in libgcc, nothing calls it, so while such changes
> > > change the ABI, nothing likely cares.
> > > But for older targets it is a problem.
> > > 
> > > And it is hard to catch this in the testsuite, one would either need to
> > > hardcode the count for each target in the test, or test with mixing
> > > GCC 2.x
> > > compiled code with current trunk.
> > > 
> > > Before the introduction of libgcc_eh.a etc., parts of the unwinder
> > > was e.g.
> > > exported from glibc.
> > > See e.g.
> > > https://gcc.gnu.org/legacy-ml/gcc-patches/2001-07/threads.html#00472
> > > 
> > > for some details.
> > 
> > So:
> > 1) GCC-2.* didn't support the EABI, which is all we support these days.
> > 2) the Arm port updated FIRST_PSEUDO_REGISTER in 2019 in r10-4441
> > (16155ccf588a403c033ccd7743329671bcfb27d5) and I didn't see any fallout
> > from that.
> In fact it's been changed in
> 
>  16155ccf588a
>  cf16f980e527
>  0be8bd1a1c89
>  f1adb0a9f4d7
>  9b66ebb1460d
>  5a9335ef017c
> 
> All since 2003 (ie since gcc-3.0 was released).

You're right, t-bpabi uses unwind-arm.c rather than unwind-dw2.c.
Sorry for the false alarm.

Jakub



Re: [GCC][PATCH 13/15, v5] arm: Add support for dwarf debug directives and pseudo hard-register for PAC feature.

2023-01-13 Thread Richard Earnshaw via Gcc-patches

On 13/01/2023 22:25, Richard Earnshaw (lists) via Gcc-patches wrote:

On 13/01/2023 22:12, Jakub Jelinek wrote:

On Fri, Jan 13, 2023 at 09:58:26PM +, Richard Earnshaw (lists) wrote:
> I'm afraid increasing number of DWARF registers is ABI incompatible change.

> E.g. libgcc __frame_state_for function fills in:
> typedef struct frame_state
> {
>    void *cfa;
>    void *eh_ptr;
>    long cfa_offset;
>    long args_size;
>    long reg_or_offset[PRE_GCC3_DWARF_FRAME_REGISTERS+1];
>    unsigned short cfa_reg;
>    unsigned short retaddr_column;
>    char saved[PRE_GCC3_DWARF_FRAME_REGISTERS+1];
> } frame_state;
> > structure, where PRE_GCC3_DWARF_FRAME_REGISTERS defaults to
> __LIBGCC_DWARF_FRAME_REGISTERS__, which is defined to
> DWARF_FRAME_REGISTERS, which defaults to FIRST_PSEUDO_REGISTER.
> So, changing FIRST_PSEUDO_REGISTER is an ABI change unless you arrange for
> PRE_GCC3_DWARF_FRAME_REGISTERS to be defined to the old value.
> >  Jakub
>
So where's the red flag that warns about this?

I also note that Richard Sandiford made a similar type of change for AArch64
in r10-4195 (183bfdafc6f1f98711c5400498a7268cc1441096) and nothing was said
about that at the time.

It seems incredibly fragile to me to have some ABI based off the number of
machine registers.


It is.  The new unwinder fortunately doesn't suffer from this (at least I
think it doesn't), but in older gccs the unwinder could be split across
different objects, having e.g. parts of the unwinder in one shared library
and another part in another one, each built by different GCC version.

Guess targets which weren't supported in GCC 2.x are ok, while
__frame_state_for is in libgcc, nothing calls it, so while such changes
change the ABI, nothing likely cares.
But for older targets it is a problem.

And it is hard to catch this in the testsuite, one would either need to
hardcode the count for each target in the test, or test with mixing GCC 2.x
compiled code with current trunk.

Before the introduction of libgcc_eh.a etc., parts of the unwinder was e.g.
exported from glibc.
See e.g.
https://gcc.gnu.org/legacy-ml/gcc-patches/2001-07/threads.html#00472
for some details.

 Jakub



So:
1) GCC-2.* didn't support the EABI, which is all we support these days.
2) the Arm port updated FIRST_PSEUDO_REGISTER in 2019 in r10-4441 
(16155ccf588a403c033ccd7743329671bcfb27d5) and I didn't see any fallout 
from that.

In fact it's been changed in

 16155ccf588a
 cf16f980e527
 0be8bd1a1c89
 f1adb0a9f4d7
 9b66ebb1460d
 5a9335ef017c

All since 2003 (ie since gcc-3.0 was released).

3) The Arm port uses the unwinding mechanism defined by the ABI, not the 
dwarf2 based tables.


So I'm inclined to think this probably isn't going to be a problem in 
reality.


R.




Re: [GCC][PATCH 13/15, v5] arm: Add support for dwarf debug directives and pseudo hard-register for PAC feature.

2023-01-13 Thread Richard Earnshaw (lists) via Gcc-patches

On 13/01/2023 22:12, Jakub Jelinek wrote:

On Fri, Jan 13, 2023 at 09:58:26PM +, Richard Earnshaw (lists) wrote:

> I'm afraid increasing number of DWARF registers is ABI incompatible change.
> E.g. libgcc __frame_state_for function fills in:
> typedef struct frame_state
> {
>    void *cfa;
>    void *eh_ptr;
>    long cfa_offset;
>    long args_size;
>    long reg_or_offset[PRE_GCC3_DWARF_FRAME_REGISTERS+1];
>    unsigned short cfa_reg;
>    unsigned short retaddr_column;
>    char saved[PRE_GCC3_DWARF_FRAME_REGISTERS+1];
> } frame_state;
> 
> structure, where PRE_GCC3_DWARF_FRAME_REGISTERS defaults to

> __LIBGCC_DWARF_FRAME_REGISTERS__, which is defined to
> DWARF_FRAME_REGISTERS, which defaults to FIRST_PSEUDO_REGISTER.
> So, changing FIRST_PSEUDO_REGISTER is an ABI change unless you arrange for
> PRE_GCC3_DWARF_FRAME_REGISTERS to be defined to the old value.
> 
>  Jakub
> 


So where's the red flag that warns about this?

I also note that Richard Sandiford made a similar type of change for AArch64
in r10-4195 (183bfdafc6f1f98711c5400498a7268cc1441096) and nothing was said
about that at the time.

It seems incredibly fragile to me to have some ABI based off the number of
machine registers.


It is.  The new unwinder fortunately doesn't suffer from this (at least I
think it doesn't), but in older gccs the unwinder could be split across
different objects, having e.g. parts of the unwinder in one shared library
and another part in another one, each built by different GCC version.

Guess targets which weren't supported in GCC 2.x are ok, while
__frame_state_for is in libgcc, nothing calls it, so while such changes
change the ABI, nothing likely cares.
But for older targets it is a problem.

And it is hard to catch this in the testsuite, one would either need to
hardcode the count for each target in the test, or test with mixing GCC 2.x
compiled code with current trunk.

Before the introduction of libgcc_eh.a etc., parts of the unwinder was e.g.
exported from glibc.
See e.g.
https://gcc.gnu.org/legacy-ml/gcc-patches/2001-07/threads.html#00472
for some details.

     Jakub



So:
1) GCC-2.* didn't support the EABI, which is all we support these days.
2) the Arm port updated FIRST_PSEUDO_REGISTER in 2019 in r10-4441 
(16155ccf588a403c033ccd7743329671bcfb27d5) and I didn't see any fallout 
from that.
3) The Arm port uses the unwinding mechanism defined by the ABI, not the 
dwarf2 based tables.


So I'm inclined to think this probably isn't going to be a problem in 
reality.


R.


Re: [GCC][PATCH 13/15, v5] arm: Add support for dwarf debug directives and pseudo hard-register for PAC feature.

2023-01-13 Thread Jakub Jelinek via Gcc-patches
On Fri, Jan 13, 2023 at 09:58:26PM +, Richard Earnshaw (lists) wrote:
> > I'm afraid increasing number of DWARF registers is ABI incompatible change.
> > E.g. libgcc __frame_state_for function fills in:
> > typedef struct frame_state
> > {
> >void *cfa;
> >void *eh_ptr;
> >long cfa_offset;
> >long args_size;
> >long reg_or_offset[PRE_GCC3_DWARF_FRAME_REGISTERS+1];
> >unsigned short cfa_reg;
> >unsigned short retaddr_column;
> >char saved[PRE_GCC3_DWARF_FRAME_REGISTERS+1];
> > } frame_state;
> > 
> > structure, where PRE_GCC3_DWARF_FRAME_REGISTERS defaults to
> > __LIBGCC_DWARF_FRAME_REGISTERS__, which is defined to
> > DWARF_FRAME_REGISTERS, which defaults to FIRST_PSEUDO_REGISTER.
> > So, changing FIRST_PSEUDO_REGISTER is an ABI change unless you arrange for
> > PRE_GCC3_DWARF_FRAME_REGISTERS to be defined to the old value.
> > 
> > Jakub
> > 
> 
> So where's the red flag that warns about this?
> 
> I also note that Richard Sandiford made a similar type of change for AArch64
> in r10-4195 (183bfdafc6f1f98711c5400498a7268cc1441096) and nothing was said
> about that at the time.
> 
> It seems incredibly fragile to me to have some ABI based off the number of
> machine registers.

It is.  The new unwinder fortunately doesn't suffer from this (at least I
think it doesn't), but in older gccs the unwinder could be split across 
different
objects, having e.g. parts of the unwinder in one shared library and another
part in another one, each built by different GCC version.

Guess targets which weren't supported in GCC 2.x are ok, while
__frame_state_for is in libgcc, nothing calls it, so while such changes
change the ABI, nothing likely cares.
But for older targets it is a problem.

And it is hard to catch this in the testsuite, one would either need to
hardcode the count for each target in the test, or test with mixing GCC 2.x
compiled code with current trunk.

Before the introduction of libgcc_eh.a etc., parts of the unwinder was e.g.
exported from glibc.
See e.g. https://gcc.gnu.org/legacy-ml/gcc-patches/2001-07/threads.html#00472
for some details.
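
As a rough illustration of that escape hatch, the arrangement would be a
target-header override along these lines (a hypothetical sketch only; the
value is purely illustrative, not the actual pre-patch register count):

/* Hypothetical snippet for the target's config header: pin the pre-GCC3
   frame_state layout to the old count so that growing
   FIRST_PSEUDO_REGISTER does not resize the reg_or_offset/saved arrays.  */
#define PRE_GCC3_DWARF_FRAME_REGISTERS 104  /* illustrative old value */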

Jakub



Re: [PATCH] PR tree-optimization/108359 - Utilize op1 == op2 when invoking range-ops folding.

2023-01-13 Thread Andrew MacLeod via Gcc-patches


On 1/13/23 16:54, Jakub Jelinek wrote:

On Fri, Jan 13, 2023 at 04:23:20PM -0500, Andrew MacLeod via Gcc-patches wrote:

fold_range() already invokes wi_fold_in_parts to try to get more refined
information. If the subranges are quite small, it will do each individual
calculation and combine the results.

x * y with x = [1,3] and y = [1,3]  is broken down and we calculate each
possibility and we end up with [1,4][6,6][9,9] instead of [1,9]

We limit this as the time is between quadratic to exponential depending on
the number of elements in x and y.

If we also check the relation and determine that x == y, we don't need to
worry about that growth as this process is linear.  The above case will be
broken down to just  1*1, 2*2 and 3*3, resulting in a range of [1,
1][4,4][9,9].

  In the testcase, it happens to be the right_shift operation, but this
solution is generic and applies to all range-op operations. I added a
testcase which checks >>, *, + and %.

I also arbitrarily chose 8 elements as the limit for breaking down
individual operations.  The overall compile time change for this is
negligible.

Although this is a regression fix, it will affect all operations where x ==
y, which is where my initial hesitancy arose.

Regardless, bootstrapped on x86_64-pc-linux-gnu with no regressions.  OK for
trunk?

Will defer to Aldy, just some nits.

Did you mean Richi?



+  // if there are 1 to 8 values in the LH range, split them up.
+  r.set_undefined ();
+  if (lh_range >= 0 && lh_range <= 7)
+{
+  unsigned x;
+  for (x = 0; x <= lh_range; x++)

Nothing uses x after the loop, so why not
   for (unsigned x = 0; x <= lh_range; x++)
instead?


Just old habits.



@@ -234,6 +264,26 @@ range_operator::fold_range (irange &r, tree type,
unsigned num_lh = lh.num_pairs ();
unsigned num_rh = rh.num_pairs ();
  
+  // If op1 and op2 are equivalences, then we don't need a complete cross

+  // product, just pairs of matching elements.
+  if (relation_equiv_p (rel) && (lh == rh))

The ()s around lh == rh look superfluous to me.

Yeah I just found it marginally more readable, but it is superfluous

+{
+  int_range_max tmp;
+  r.set_undefined ();
+  for (unsigned x = 0; x < num_lh; ++x)

fold_range has an upper bound of num_lh * num_rh > 12, shouldn't something
like that be there for this case too?
I mean, every wi_fold_in_parts_equiv can result in 8 subranges,
but num_lh could be up to 255 here, it is true it is linear and union_
should merge excess ones, but still I wonder if some larger num_lh upper
bound like 20 or 32 wouldn't be useful.  Up to you...
fold_range has the num_lh * num_rh limit because it was
quadratic/exponential and grows rapidly.  Since this is linear in the
number of subranges I didn't think it would matter much, but sure, we can
put a similar limit on it; 16 seems reasonable.

+   {
+ wide_int lh_lb = lh.lower_bound (x);
+ wide_int lh_ub = lh.upper_bound (x);
+ wi_fold_in_parts_equiv (tmp, type, lh_lb, lh_ub);
+ r.union_ (tmp);
+ if (r.varying_p ())
+   break;
+   }
+  op1_op2_relation_effect (r, type, lh, rh, rel);
+  update_known_bitmask (r, m_code, lh, rh);
+  return true;
+}
+

Jakub

Updated patch attached...  I'll run it through testing when the current
one is done.



Andrew
From 42010868ff3cdbb5b9ad3484f115b7c23f9e14e7 Mon Sep 17 00:00:00 2001
From: Andrew MacLeod 
Date: Wed, 11 Jan 2023 18:12:51 -0500
Subject: [PATCH 2/3] Utilize op1 == op2 when invoking range-ops folding.

If there exists an equivalence relationship between op1 and op2,
any binary operation can be broken into individual operations and
unioned if there are sufficiently few elements in the set.

	PR tree-optimization/108359
	gcc/
	* range-op.cc (range_operator::wi_fold_in_parts_equiv): New.
	(range_operator::fold_range): If op1 is equivalent to op2 then
	invoke new fold_in_parts_equiv to operate on sub-components.
	* range-op.h (wi_fold_in_parts_equiv): New prototype.

	gcc/testsuite/
	* gcc.dg/pr108359.c: New.
---
 gcc/range-op.cc | 49 
 gcc/range-op.h  |  5 
 gcc/testsuite/gcc.dg/pr108359.c | 50 +
 3 files changed, 104 insertions(+)
 create mode 100644 gcc/testsuite/gcc.dg/pr108359.c

diff --git a/gcc/range-op.cc b/gcc/range-op.cc
index ec75e07bc8a..33bc4dcb4b4 100644
--- a/gcc/range-op.cc
+++ b/gcc/range-op.cc
@@ -160,6 +160,35 @@ range_operator::wi_fold (irange &r, tree type,
   r.set_varying (type);
 }
 
+// Call wi_fold when both op1 and op2 are equivalent. Further split small
+// subranges into constants.  This can provide better precision.
+// For x + y,  when x == y with a range of [0,4] instead of [0, 8] produce
+// [0,0][2, 2][4,4][6, 6][8, 8]
+
+void
+range_operator::wi_fold_in_parts_equiv (irange &r, tree type,
+	const wide_int &lh_lb,
+	const wide_int &lh_ub) const
+{
+  int_range

Re: [GCC][PATCH 13/15, v5] arm: Add support for dwarf debug directives and pseudo hard-register for PAC feature.

2023-01-13 Thread Richard Earnshaw via Gcc-patches

On 13/01/2023 21:58, Richard Earnshaw (lists) via Gcc-patches wrote:

On 13/01/2023 18:02, Jakub Jelinek via Gcc-patches wrote:
On Fri, Jan 13, 2023 at 05:44:15PM +, Srinath Parvathaneni via Gcc-patches wrote:

Hello,

This patch teaches the DWARF support in gcc about RA_AUTH_CODE pseudo
hard-register and also updates the ".save", ".cfi_register", ".cfi_offset",
".cfi_restore" directives accordingly.
This patch also adds support to emit ".pacspval" directive when "pac ip, lr,
sp" instruction is generated in the assembly.

RA_AUTH_CODE register number is 107 and its dwarf register number is 143.


I'm afraid increasing number of DWARF registers is ABI incompatible change.

E.g. libgcc __frame_state_for function fills in:
typedef struct frame_state
{
   void *cfa;
   void *eh_ptr;
   long cfa_offset;
   long args_size;
   long reg_or_offset[PRE_GCC3_DWARF_FRAME_REGISTERS+1];
   unsigned short cfa_reg;
   unsigned short retaddr_column;
   char saved[PRE_GCC3_DWARF_FRAME_REGISTERS+1];
} frame_state;

structure, where PRE_GCC3_DWARF_FRAME_REGISTERS defaults to
__LIBGCC_DWARF_FRAME_REGISTERS__, which is defined to
DWARF_FRAME_REGISTERS, which defaults to FIRST_PSEUDO_REGISTER.
So, changing FIRST_PSEUDO_REGISTER is an ABI change unless you arrange for
PRE_GCC3_DWARF_FRAME_REGISTERS to be defined to the old value.

Jakub



So where's the red flag that warns about this?

I also note that Richard Sandiford made a similar type of change for 
AArch64 in r10-4195 (183bfdafc6f1f98711c5400498a7268cc1441096) and 
nothing was said about that at the time.


It seems incredibly fragile to me to have some ABI based off the number 
of machine registers.


R.


Also, the Arm port does not use dwarf based unwinding, so is this really 
relevant?


R.


Re: [GCC][PATCH 13/15, v5] arm: Add support for dwarf debug directives and pseudo hard-register for PAC feature.

2023-01-13 Thread Richard Earnshaw (lists) via Gcc-patches

On 13/01/2023 18:02, Jakub Jelinek via Gcc-patches wrote:

On Fri, Jan 13, 2023 at 05:44:15PM +, Srinath Parvathaneni via Gcc-patches 
wrote:

Hello,

This patch teaches the DWARF support in gcc about RA_AUTH_CODE pseudo
hard-register and also updates the ".save", ".cfi_register", ".cfi_offset",
".cfi_restore" directives accordingly.
This patch also adds support to emit ".pacspval" directive when "pac ip, lr,
sp" instruction is generated in the assembly.

RA_AUTH_CODE register number is 107 and its dwarf register number is 143.


I'm afraid increasing number of DWARF registers is ABI incompatible change.
E.g. libgcc __frame_state_for function fills in:
typedef struct frame_state
{
   void *cfa;
   void *eh_ptr;
   long cfa_offset;
   long args_size;
   long reg_or_offset[PRE_GCC3_DWARF_FRAME_REGISTERS+1];
   unsigned short cfa_reg;
   unsigned short retaddr_column;
   char saved[PRE_GCC3_DWARF_FRAME_REGISTERS+1];
} frame_state;

structure, where PRE_GCC3_DWARF_FRAME_REGISTERS defaults to
__LIBGCC_DWARF_FRAME_REGISTERS__, which is defined to
DWARF_FRAME_REGISTERS, which defaults to FIRST_PSEUDO_REGISTER.
So, changing FIRST_PSEUDO_REGISTER is an ABI change unless you arrange for
PRE_GCC3_DWARF_FRAME_REGISTERS to be defined to the old value.

Jakub



So where's the red flag that warns about this?

I also note that Richard Sandiford made a similar type of change for 
AArch64 in r10-4195 (183bfdafc6f1f98711c5400498a7268cc1441096) and 
nothing was said about that at the time.


It seems incredibly fragile to me to have some ABI based off the number 
of machine registers.


R.


Re: [PATCH] PR tree-optimization/108359 - Utilize op1 == op2 when invoking range-ops folding.

2023-01-13 Thread Jakub Jelinek via Gcc-patches
On Fri, Jan 13, 2023 at 04:23:20PM -0500, Andrew MacLeod via Gcc-patches wrote:
> fold_range() already invokes wi_fold_in_parts to try to get more refined
> information. If the subranges are quite small, it will do each individual
> calculation and combine the results.
> 
> x * y with x = [1,3] and y = [1,3]  is broken down and we calculate each
> possibility and we end up with [1,4][6,6][9,9] instead of [1,9]
> 
> We limit this as the time is between quadratic to exponential depending on
> the number of elements in x and y.
> 
> If we also check the relation and determine that x == y, we don't need to
> worry about that growth as this process is linear.  The above case will be
> broken down to just  1*1, 2*2 and 3*3, resulting in a range of [1,
> 1][4,4][9,9].
> 
>  In the testcase, it happens to be the right_shift operation, but this
> solution is generic and applies to all range-op operations. I added a
> testcase which checks >>, *, + and %.
> 
> I also arbitrarily chose 8 elements as the limit for breaking down
> individual operations.  The overall compile time change for this is
> negligible.
> 
> Although this is a regression fix, it will affect all operations where x ==
> y, which is where my initial hesitancy arose.
> 
> Regardless, bootstrapped on x86_64-pc-linux-gnu with no regressions.  OK for
> trunk?

Will defer to Aldy, just some nits.

> +  // if there are 1 to 8 values in the LH range, split them up.
> +  r.set_undefined ();
> +  if (lh_range >= 0 && lh_range <= 7)
> +{
> +  unsigned x;
> +  for (x = 0; x <= lh_range; x++)

Nothing uses x after the loop, so why not
  for (unsigned x = 0; x <= lh_range; x++)
instead?

> @@ -234,6 +264,26 @@ range_operator::fold_range (irange &r, tree type,
>unsigned num_lh = lh.num_pairs ();
>unsigned num_rh = rh.num_pairs ();
>  
> +  // If op1 and op2 are equivalences, then we don't need a complete cross
> +  // product, just pairs of matching elements.
> +  if (relation_equiv_p (rel) && (lh == rh))

The ()s around lh == rh look superfluous to me.

> +{
> +  int_range_max tmp;
> +  r.set_undefined ();
> +  for (unsigned x = 0; x < num_lh; ++x)

fold_range has an upper bound of num_lh * num_rh > 12, shouldn't something
like that be there for this case too?
I mean, every wi_fold_in_parts_equiv can result in 8 subranges,
but num_lh could be up to 255 here, it is true it is linear and union_
should merge excess ones, but still I wonder if some larger num_lh upper
bound like 20 or 32 wouldn't be useful.  Up to you...

> + {
> +   wide_int lh_lb = lh.lower_bound (x);
> +   wide_int lh_ub = lh.upper_bound (x);
> +   wi_fold_in_parts_equiv (tmp, type, lh_lb, lh_ub);
> +   r.union_ (tmp);
> +   if (r.varying_p ())
> + break;
> + }
> +  op1_op2_relation_effect (r, type, lh, rh, rel);
> +  update_known_bitmask (r, m_code, lh, rh);
> +  return true;
> +}
> +

Jakub



[PATCH] PR tree-optimization/108359 - Utilize op1 == op2 when invoking range-ops folding.

2023-01-13 Thread Andrew MacLeod via Gcc-patches
fold_range() already invokes wi_fold_in_parts to try to get more refined 
information. If the subranges are quite small, it will do each 
individual calculation and combine the results.


x * y with x = [1,3] and y = [1,3]  is broken down and we calculate each 
possibility and we end up with [1,4][6,6][9,9] instead of [1,9]


We limit this as the time is between quadratic to exponential depending 
on the number of elements in x and y.


If we also check the relation and determine that x == y, we don't need 
to worry about that growth as this process is linear.  The above case 
will be broken down to just  1*1, 2*2 and 3*3, resulting in a range of 
[1, 1][4,4][9,9].


 In the testcase, it happens to be the right_shift operation, but this 
solution is generic and applies to all range-op operations. I added a 
testcase which checks >>, *, + and %.
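
A hypothetical sketch in the spirit of that testcase (not the actual
gcc.dg/pr108359.c) would be:

extern void should_not_be_called (void);

/* With a == b known and a in [1,3], pairwise folding gives a * b the range
   [1,1][4,4][9,9], so the branch below is dead and the call should be
   optimized away.  */
void
foo (int a, int b)
{
  if (a == b && a >= 1 && a <= 3)
    {
      int c = a * b;
      if (c != 1 && c != 4 && c != 9)
        should_not_be_called ();
    }
}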


I also arbitrarily chose 8 elements as the limit for breaking down 
individual operations.  The overall compile time change for this is 
negligible.


Although this is a regression fix, it will affect all operations where x 
== y, which is where my initial hesitancy arose.


Regardless, bootstrapped on x86_64-pc-linux-gnu with no regressions.  OK 
for trunk?


Andrew



From fd50dabc626cea1886ebb517ca24c8b8f199c3aa Mon Sep 17 00:00:00 2001
From: Andrew MacLeod 
Date: Wed, 11 Jan 2023 18:12:51 -0500
Subject: [PATCH 2/3] Utilize op1 == op2 when invoking range-ops folding.

If there exists an equivalence relationship between op1 and op2,
any binary operation can be broken into individual operations and
unioned if there are sufficiently few elements in the set.

	PR tree-optimization/108359
	gcc/
	* range-op.cc (range_operator::wi_fold_in_parts_equiv): New.
	(range_operator::fold_range): If op1 is equivalent to op2 then
	invoke new fold_in_parts_equiv to operate on sub-components.
	* range-op.h (wi_fold_in_parts_equiv): New prototype.

	gcc/testsuite/
	* gcc.dg/pr108359.c: New.
---
 gcc/range-op.cc | 50 +
 gcc/range-op.h  |  5 
 gcc/testsuite/gcc.dg/pr108359.c | 50 +
 3 files changed, 105 insertions(+)
 create mode 100644 gcc/testsuite/gcc.dg/pr108359.c

diff --git a/gcc/range-op.cc b/gcc/range-op.cc
index ec75e07bc8a..2cb2c1344f1 100644
--- a/gcc/range-op.cc
+++ b/gcc/range-op.cc
@@ -160,6 +160,36 @@ range_operator::wi_fold (irange &r, tree type,
   r.set_varying (type);
 }
 
+// Call wi_fold when both op1 and op2 are equivalent. Further split small
+// subranges into constants.  This can provide better precision.
+// For x + y,  when x == y with a range of [0,4] instead of [0, 8] produce
+// [0,0][2, 2][4,4][6, 6][8, 8]
+
+void
+range_operator::wi_fold_in_parts_equiv (irange &r, tree type,
+	const wide_int &lh_lb,
+	const wide_int &lh_ub) const
+{
+  int_range_max tmp;
+  widest_int lh_range = wi::sub (widest_int::from (lh_ub, TYPE_SIGN (type)),
+ widest_int::from (lh_lb, TYPE_SIGN (type)));
+  // if there are 1 to 8 values in the LH range, split them up.
+  r.set_undefined ();
+  if (lh_range >= 0 && lh_range <= 7)
+{
+  unsigned x;
+  for (x = 0; x <= lh_range; x++)
+	{
+	  wide_int val = lh_lb + x;
+	  wi_fold (tmp, type, val, val, val, val);
+	  r.union_ (tmp);
+	}
+}
+  // Otherwise just call wi_fold.
+  else
+wi_fold (r, type, lh_lb, lh_ub, lh_lb, lh_ub);
+}
+
 // Call wi_fold, except further split small subranges into constants.
 // This can provide better precision. For something   8 >> [0,1]
 // Instead of [8, 16], we will produce [8,8][16,16]
@@ -234,6 +264,26 @@ range_operator::fold_range (irange &r, tree type,
   unsigned num_lh = lh.num_pairs ();
   unsigned num_rh = rh.num_pairs ();
 
+  // If op1 and op2 are equivalences, then we don't need a complete cross
+  // product, just pairs of matching elements.
+  if (relation_equiv_p (rel) && (lh == rh))
+{
+  int_range_max tmp;
+  r.set_undefined ();
+  for (unsigned x = 0; x < num_lh; ++x)
+	{
+	  wide_int lh_lb = lh.lower_bound (x);
+	  wide_int lh_ub = lh.upper_bound (x);
+	  wi_fold_in_parts_equiv (tmp, type, lh_lb, lh_ub);
+	  r.union_ (tmp);
+	  if (r.varying_p ())
+	break;
+	}
+  op1_op2_relation_effect (r, type, lh, rh, rel);
+  update_known_bitmask (r, m_code, lh, rh);
+  return true;
+}
+
   // If both ranges are single pairs, fold directly into the result range.
   // If the number of subranges grows too high, produce a summary result as the
   // loop becomes exponential with little benefit.  See PR 103821.
diff --git a/gcc/range-op.h b/gcc/range-op.h
index b7b8a3b9473..998aeedb0d9 100644
--- a/gcc/range-op.h
+++ b/gcc/range-op.h
@@ -109,6 +109,11 @@ protected:
 			 const wide_int &rh_lb,
 			 const wide_int &rh_ub) const;
 
+  // Called by fold range to split small subranges into parts when op1 == op2
+  void wi_fold_in_parts_equiv (irange &r, tree type,
+			   const wide_int &lb,
+			   const wide_int &ub) const;
+
   //

[c-family] Small fix for -fdump-ada-spec

2023-01-13 Thread Eric Botcazou via Gcc-patches
This is needed to support the _Float32 and _Float64 types.

Tested on x86-64/Linux, applied on the mainline.


2023-01-13  Eric Botcazou  

c-family/
* c-ada-spec.cc (is_float32): New function.
(is_float64): Likewise.
(is_float128): Tweak.
(dump_ada_node) <REAL_TYPE>: Call them to recognize more types.

-- 
Eric Botcazou

diff --git a/gcc/c-family/c-ada-spec.cc b/gcc/c-family/c-ada-spec.cc
index faf71742522..1e011d52825 100644
--- a/gcc/c-family/c-ada-spec.cc
+++ b/gcc/c-family/c-ada-spec.cc
@@ -2030,7 +2030,39 @@ dump_ada_enum_type (pretty_printer *buffer, tree node, tree type, int spc)
 }
 }
 
-/* Return true if NODE is the __float128/_Float128 type.  */
+/* Return true if NODE is the _Float32/_Float32x type.  */
+
+static bool
+is_float32 (tree node)
+{
+  if (!TYPE_NAME (node) || TREE_CODE (TYPE_NAME (node)) != TYPE_DECL)
+return false;
+
+  tree name = DECL_NAME (TYPE_NAME (node));
+
+  if (IDENTIFIER_POINTER (name) [0] != '_')
+return false;
+
+  return id_equal (name, "_Float32") || id_equal (name, "_Float32x");
+}
+
+/* Return true if NODE is the _Float64/_Float64x type.  */
+
+static bool
+is_float64 (tree node)
+{
+  if (!TYPE_NAME (node) || TREE_CODE (TYPE_NAME (node)) != TYPE_DECL)
+return false;
+
+  tree name = DECL_NAME (TYPE_NAME (node));
+
+  if (IDENTIFIER_POINTER (name) [0] != '_')
+return false;
+
+  return id_equal (name, "_Float64") || id_equal (name, "_Float64x");
+}
+
+/* Return true if NODE is the __float128/_Float128/_Float128x type.  */
 
 static bool
 is_float128 (tree node)
@@ -2043,7 +2075,9 @@ is_float128 (tree node)
   if (IDENTIFIER_POINTER (name) [0] != '_')
 return false;
 
-  return id_equal (name, "__float128") || id_equal (name, "_Float128");
+  return id_equal (name, "__float128")
+	 || id_equal (name, "_Float128")
+	 || id_equal (name, "_Float128x");
 }
 
 static bool bitfield_used = false;
@@ -2132,7 +2166,17 @@ dump_ada_node (pretty_printer *buffer, tree node, tree type, int spc,
   break;
 
 case REAL_TYPE:
-  if (is_float128 (node))
+  if (is_float32 (node))
+	{
+	  pp_string (buffer, "Float");
+	  break;
+	}
+  else if (is_float64 (node))
+	{
+	  pp_string (buffer, "Long_Float");
+	  break;
+	}
+  else if (is_float128 (node))
 	{
 	  append_withs ("Interfaces.C.Extensions", false);
 	  pp_string (buffer, "Extensions.Float_128");


Re: gcc-13/changes.html: Mention -fstrict-flex-arrays and its impact

2023-01-13 Thread Gerald Pfeifer
On Tue, 20 Dec 2022, Qing Zhao via Gcc-patches wrote:
> +Treating trailing arrays as flexible array members

Please note that ids must not contain white space.

Would you mind following up making this "flexiblearray" or similar?

Thank you,
Gerald


Re: [PATCH v3 2/2] aarch64: Fix bit-field alignment in param passing [PR105549]

2023-01-13 Thread Jakub Jelinek via Gcc-patches
On Fri, Jan 13, 2023 at 08:25:01PM +0100, Jakub Jelinek via Gcc-patches wrote:
> alignment is 256, which is not <= 16 * BITS_PER_UNIT.
> type is pst_uniform4 with user alignment of 32 bytes:
> struct pst_uniform4
> {
>   fixed_int32_t a __attribute__((aligned(SVE_BYTES * 2)));
>   fixed_int32_t b[3] __attribute__((aligned(SVE_BYTES * 2)));
>   fixed_int32_t c __attribute__((aligned(SVE_BYTES * 2)));
> };
> and with -march=armv8.2-a+sve -msve-vector-bits=128
> __ARM_FEATURE_SVE_BITS and therefore SVE_BYTES is 128 and so
> the alignment seems requested.

typedef __SVInt32_t fixed_int32_t __attribute__ ((arm_sve_vector_bits (128)));
struct A { fixed_int32_t a[3] __attribute__((aligned((128 / 8) * 2))); };
void foo (struct A x) {}

-march=armv8.2-a+sve -O -msve-vector-bits=128

reproduces it too.

Jakub



[committed] hppa: Fix support for atomic loads and stores on hppa

2023-01-13 Thread John David Anglin
The following change fixes a number of problems with atomic loads
and stores on hppa.

Tested on hppa-unknown-linux-gnu and hppa64-hp-hpux11.11.

The TARGET_SOFT_FLOAT case is not tested.  It's possible we need
additional atomic load support for this case but I think that's
unnecessary since hppa requires strict alignment.

We could use an array of locks in sync-libfuncs.c to improve
performance but I kept things simple for now.
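
For the record, the "array of locks" idea would look roughly like the
hypothetical sketch below; the names and the hash are made up for
illustration, and the committed sync-libfuncs.c keeps things simpler, as
noted above.

/* Hypothetical sketch of the "array of locks" approach: hash the object
   address into a small table so that atomic operations on unrelated
   objects do not all contend on one global lock.  Each libcall would
   acquire the selected lock around its load or store.  */
#define NLOCKS 64

static volatile unsigned int locks[NLOCKS];

static inline volatile unsigned int *
lock_for_ptr (void *ptr)
{
  return &locks[((unsigned long) ptr >> 4) % NLOCKS];
}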

Committed to trunk.

Dave
---

Fix support for atomic loads and stores on hppa.

This change updates the atomic libcall support to fix the following
issues:

1) An internal compiler error with -fno-sync-libcalls.
2) When sync libcalls are disabled, we don't generate libcalls for
   libatomic.
3) There is no sync libcall support for targets other than linux.
   As a result, non-atomic stores are silently emitted for types
   smaller or equal to the word size.  There are now a few atomic
   libcalls in the libgcc code, so we need sync support on all
   targets.

2023-01-13  John David Anglin  

gcc/ChangeLog:

* config/pa/pa-linux.h (TARGET_SYNC_LIBCALL): Delete define.
* config/pa/pa.cc (pa_init_libfuncs): Use MAX_SYNC_LIBFUNC_SIZE
define.
* config/pa/pa.h (TARGET_SYNC_LIBCALLS): Use flag_sync_libcalls.
(MAX_SYNC_LIBFUNC_SIZE): Define.
(TARGET_CPU_CPP_BUILTINS): Define __SOFTFP__ when soft float is
enabled.
* config/pa/pa.md (atomic_storeqi): Emit __atomic_exchange_1
libcall when sync libcalls are disabled.
(atomic_storehi, atomic_storesi, atomic_storedi): Likewise.
(atomic_loaddi): Emit __atomic_load_8 libcall when sync libcalls
are disabled on 32-bit target.
* config/pa/pa.opt (matomic-libcalls): New option.
* doc/invoke.texi (HPPA Options): Update.

libgcc/ChangeLog:

* config.host (hppa*64*-*-linux*): Adjust tmake_file to use
pa/t-pa64-linux.
(hppa*64*-*-hpux11*): Adjust tmake_file to use pa/t-pa64-hpux
instead of pa/t-hpux and pa/t-pa64.
* config/pa/linux-atomic.c: Define u32 type.
(ATOMIC_LOAD): Define new macro to implement atomic_load_1,
atomic_load_2, atomic_load_4 and atomic_load_8.  Update sync
defines to use atomic_load calls for type.
(SYNC_LOCK_LOAD_2): New macro to implement __sync_lock_load_8.
* config/pa/sync-libfuncs.c: New file.
* config/pa/t-netbsd (LIB2ADD_ST): Define.
* config/pa/t-openbsd (LIB2ADD_ST): Define.
* config/pa/t-pa64-hpux: New file.
* config/pa/t-pa64-linux: New file.  

diff --git a/gcc/config/pa/pa-linux.h b/gcc/config/pa/pa-linux.h
index 5af11a1df80..1073f42bd6b 100644
--- a/gcc/config/pa/pa-linux.h
+++ b/gcc/config/pa/pa-linux.h
@@ -133,9 +133,6 @@ along with GCC; see the file COPYING3.  If not see
 #undef TARGET_GAS
 #define TARGET_GAS 1
 
-#undef TARGET_SYNC_LIBCALL
-#define TARGET_SYNC_LIBCALL 1
-
 /* The SYNC operations are implemented as library functions, not
INSN patterns.  As a result, the HAVE defines for the patterns are
not defined.  We need to define them to generate the corresponding
diff --git a/gcc/config/pa/pa.cc b/gcc/config/pa/pa.cc
index 9f43802075f..b43a91f2edb 100644
--- a/gcc/config/pa/pa.cc
+++ b/gcc/config/pa/pa.cc
@@ -5940,8 +5940,8 @@ pa_init_libfuncs (void)
"_U_Qfcnvxf_udbl_to_quad");
 }
 
-  if (TARGET_SYNC_LIBCALL)
-init_sync_libfuncs (8);
+  if (TARGET_SYNC_LIBCALLS)
+init_sync_libfuncs (MAX_SYNC_LIBFUNC_SIZE);
 }
 
 /* HP's millicode routines mean something special to the assembler.
diff --git a/gcc/config/pa/pa.h b/gcc/config/pa/pa.h
index bafdf602138..93d6f53f97f 100644
--- a/gcc/config/pa/pa.h
+++ b/gcc/config/pa/pa.h
@@ -72,10 +72,12 @@ extern unsigned long total_code_bytes;
 #define HPUX_LONG_DOUBLE_LIBRARY 0
 #endif
 
-/* Linux kernel atomic operation support.  */
-#ifndef TARGET_SYNC_LIBCALL
-#define TARGET_SYNC_LIBCALL 0
-#endif
+/* Sync libcall support.  */
+#define TARGET_SYNC_LIBCALLS (flag_sync_libcalls)
+
+/* The maximum size of the sync library functions supported.  DImode
+   is supported on 32-bit targets using floating point loads and stores.  */
+#define MAX_SYNC_LIBFUNC_SIZE 8
 
 /* The following three defines are potential target switches.  The current
defines are optimal given the current capabilities of GAS and GNU ld.  */
@@ -173,6 +175,8 @@ do {
\
builtin_define("_PA_RISC1_0");  \
  if (HPUX_LONG_DOUBLE_LIBRARY) \
builtin_define("__SIZEOF_FLOAT128__=16");   \
+ if (TARGET_SOFT_FLOAT)\
+   builtin_define("__SOFTFP__");   \
 } while (0)
 
 /* An old set of OS defines for various BSD-like systems.  */
diff --git a/gcc/config/pa/pa.md b/gcc/config/pa/pa.md
index 41382271e54..71f391f2bf7 100644
--- a/gcc/config/pa/pa

Re: [PATCH v3 2/2] aarch64: Fix bit-field alignment in param passing [PR105549]

2023-01-13 Thread Jakub Jelinek via Gcc-patches
On Fri, Jan 13, 2023 at 04:38:00PM +0100, Jakub Jelinek via Gcc-patches wrote:
> I'm seeing
> +FAIL: g++.target/aarch64/bitfield-abi-warning-align16-O2.C 
> scan-assembler-times and\\tw0, w1, 1 10
> +FAIL: g++.target/aarch64/bitfield-abi-warning-align32-O2.C 
> scan-assembler-times and\\tw0, w1, 1 10
> +FAIL: g++.target/aarch64/bitfield-abi-warning-align8-O2.C 
> scan-assembler-times and\\tw0, w0, 1 11
> +FAIL: g++.target/aarch64/bitfield-abi-warning-align8-O2.C 
> scan-assembler-times and\\tw0, w1, 1 18

The above seems only because I'm testing with 

> +FAIL: gcc.target/aarch64/sve/pcs/struct_3_128.c -march=armv8.2-a+sve 
> (internal compiler error: in aarch64_layout_arg, at 
> config/aarch64/aarch64.cc:7696)
> +FAIL: gcc.target/aarch64/sve/pcs/struct_3_128.c -march=armv8.2-a+sve (test 
> for excess errors)
> +FAIL: gcc.target/aarch64/sve/pcs/struct_3_256.c -march=armv8.2-a+sve 
> (internal compiler error: in aarch64_layout_arg, at 
> config/aarch64/aarch64.cc:7696)
> +FAIL: gcc.target/aarch64/sve/pcs/struct_3_256.c -march=armv8.2-a+sve (test 
> for excess errors)
> +FAIL: gcc.target/aarch64/sve/pcs/struct_3_512.c -march=armv8.2-a+sve 
> (internal compiler error: in aarch64_layout_arg, at 
> config/aarch64/aarch64.cc:7696)
> +FAIL: gcc.target/aarch64/sve/pcs/struct_3_512.c -march=armv8.2-a+sve (test 
> for excess errors)
> regressions with this change.

But these I can reproduce using a cross compiler on current trunk:
#0  fancy_abort (file=0x2da73c0 "../../gcc/config/aarch64/aarch64.cc", 
line=7710, function=0x2da8917 "aarch64_layout_arg") at 
../../gcc/diagnostic.cc:2219
#1  0x01a8756b in aarch64_layout_arg (pcum_v=..., arg=...) at 
../../gcc/config/aarch64/aarch64.cc:7710
#2  0x01a88477 in aarch64_function_arg_advance (pcum_v=..., arg=...) at 
../../gcc/config/aarch64/aarch64.cc:8023
#3  0x0107cb17 in gimplify_parameters (cleanup=0x7fffd8c0) at 
../../gcc/function.cc:3929
#4  0x01156366 in gimplify_body (fndecl=, do_parms=true) at ../../gcc/gimplify.cc:17619
#5  0x01156ca0 in gimplify_function_tree (fndecl=) at ../../gcc/gimplify.cc:17822
#6  0x00ea2402 in cgraph_node::analyze (this=) at ../../gcc/cgraphunit.cc:676
#7  0x00ea4489 in analyze_functions (first_time=true) at 
../../gcc/cgraphunit.cc:1240
#8  0x00ea7a06 in symbol_table::finalize_compilation_unit 
(this=0x7fffea38b000) at ../../gcc/cgraphunit.cc:2547
#9  0x01572df1 in compile_file () at ../../gcc/toplev.cc:471
#10 0x01575caf in do_compile (no_backend=false) at 
../../gcc/toplev.cc:2125
#11 0x01576078 in toplev::main (this=0x7fffdc6a, argc=14, 
argv=0x7fffdd98) at ../../gcc/toplev.cc:2277
#12 0x02a81c6a in main (argc=14, argv=0x7fffdd98) at 
../../gcc/main.cc:39

alignment is 256, which is not <= 16 * BITS_PER_UNIT.
type is pst_uniform4 with user alignment of 32 bytes:
struct pst_uniform4
{
  fixed_int32_t a __attribute__((aligned(SVE_BYTES * 2)));
  fixed_int32_t b[3] __attribute__((aligned(SVE_BYTES * 2)));
  fixed_int32_t c __attribute__((aligned(SVE_BYTES * 2)));
};
and with -march=armv8.2-a+sve -msve-vector-bits=128
__ARM_FEATURE_SVE_BITS and therefore SVE_BYTES is 128 and so
the alignment seems requested.

Jakub



Re: [PATCH] c, c++, v3: Avoid incorrect shortening of divisions [PR108365]

2023-01-13 Thread Jason Merrill via Gcc-patches

On 1/13/23 12:45, Jakub Jelinek wrote:

On Fri, Jan 13, 2023 at 11:58:06AM -0500, Jason Merrill wrote:

LGTM, though we might put that condition in c-common somewhere?


So like this then?  Just tested on the new testcases, full bootstrap/regtest
queued?


OK..


2023-01-13  Jakub Jelinek  

PR c++/108365
* c-common.h (may_shorten_divmod): New static inline function.

* c-typeck.cc (build_binary_op): Use may_shorten_divmod for integral
division or modulo.

* typeck.cc (cp_build_binary_op): Use may_shorten_divmod for integral
division or modulo.

* c-c++-common/pr108365.c: New test.
* g++.dg/opt/pr108365.C: New test.
* g++.dg/warn/pr108365.C: New test.

--- gcc/c-family/c-common.h.jj  2022-11-14 13:35:34.195160199 +0100
+++ gcc/c-family/c-common.h 2023-01-13 18:35:08.130362228 +0100
@@ -918,6 +918,30 @@ extern tree convert_init (tree, tree);
  /* Subroutine of build_binary_op, used for certain operations.  */
  extern tree shorten_binary_op (tree result_type, tree op0, tree op1, bool 
bitwise);
  
+/* Return true if division or modulo op0 / op1 or op0 % op1 may be shortened.

+   We can shorten only if we can guarantee that op0 is not signed integral
+   minimum or op1 is not -1, because e.g. (long long) INT_MIN / -1 is
+   well defined INT_MAX + 1LL if long long is wider than int, but INT_MIN / -1
+   is UB.  */
+static inline bool
+may_shorten_divmod (tree op0, tree op1)
+{
+  tree type0 = TREE_TYPE (op0);
+  if (TYPE_UNSIGNED (type0))
+return true;
+  /* A cast from narrower unsigned won't be signed integral minimum,
+ but cast from same or wider precision unsigned could be.  */
+  if (TREE_CODE (op0) == NOP_EXPR
+  && INTEGRAL_TYPE_P (TREE_TYPE (TREE_OPERAND (op0, 0)))
+  && TYPE_UNSIGNED (TREE_TYPE (TREE_OPERAND (op0, 0)))
+  && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (op0, 0)))
+ < TYPE_PRECISION (type0)))
+return true;
+  if (TREE_CODE (op1) == INTEGER_CST && !integer_all_onesp (op1))
+return true;
+  return false;
+}
+
  /* Subroutine of build_binary_op, used for comparison operations.
 See if the operands have both been converted from subword integer types
 and, if so, perhaps change them both back to their original type.  */
--- gcc/c/c-typeck.cc.jj2023-01-13 11:11:45.368016437 +0100
+++ gcc/c/c-typeck.cc   2023-01-13 18:38:25.919538847 +0100
@@ -12431,9 +12431,7 @@ build_binary_op (location_t location, en
   undefined if the quotient can't be represented in the
   computation mode.  We shorten only if unsigned or if
   dividing by something we know != -1.  */
-   shorten = (TYPE_UNSIGNED (TREE_TYPE (orig_op0))
-  || (TREE_CODE (op1) == INTEGER_CST
-  && !integer_all_onesp (op1)));
+   shorten = may_shorten_divmod (op0, op1);
  common = 1;
}
break;
@@ -12467,9 +12465,7 @@ build_binary_op (location_t location, en
 on some targets, since the modulo instruction is undefined if the
 quotient can't be represented in the computation mode.  We shorten
 only if unsigned or if dividing by something we know != -1.  */
- shorten = (TYPE_UNSIGNED (TREE_TYPE (orig_op0))
-|| (TREE_CODE (op1) == INTEGER_CST
-&& !integer_all_onesp (op1)));
+ shorten = may_shorten_divmod (op0, op1);
  common = 1;
}
break;
--- gcc/cp/typeck.cc.jj 2023-01-13 11:11:45.418015716 +0100
+++ gcc/cp/typeck.cc2023-01-13 18:38:40.754327078 +0100
@@ -5455,10 +5455,7 @@ cp_build_binary_op (const op_location_t
 point, so we have to dig out the original type to find out if
 it was unsigned.  */
  tree stripped_op1 = tree_strip_any_location_wrapper (op1);
- shorten = ((TREE_CODE (op0) == NOP_EXPR
- && TYPE_UNSIGNED (TREE_TYPE (TREE_OPERAND (op0, 0
-|| (TREE_CODE (stripped_op1) == INTEGER_CST
-&& ! integer_all_onesp (stripped_op1)));
+ shorten = may_shorten_divmod (op0, stripped_op1);
}
  
  	  common = 1;

@@ -5491,10 +5488,7 @@ cp_build_binary_op (const op_location_t
 quotient can't be represented in the computation mode.  We shorten
 only if unsigned or if dividing by something we know != -1.  */
  tree stripped_op1 = tree_strip_any_location_wrapper (op1);
- shorten = ((TREE_CODE (op0) == NOP_EXPR
- && TYPE_UNSIGNED (TREE_TYPE (TREE_OPERAND (op0, 0
-|| (TREE_CODE (stripped_op1) == INTEGER_CST
-&& ! integer_all_onesp (stripped_op1)));
+ shorten = may_shorten_divmod (op0, stripped_op1);
  common = 1;
}
break;
--- gcc/testsuite/c-c++-common/pr108365.c.jj2023-01-13 18:25:09.16391121

Re: [PATCH] sched-deps: do not schedule pseudos across calls [PR108117]

2023-01-13 Thread Jose E. Marchesi via Gcc-patches


> On Fri, 23 Dec 2022, Jose E. Marchesi wrote:
>
>> > +1 for trying this FWIW.  There's still plenty of time to try an
>> > alternative solution if there are unexpected performance problems.
>> 
>> Let me see if Alexander's patch fixes the issue at hand (it must) and
>> will also do some regression testing.
>
> Hi, I'm not sure at which court the ball is, but in the interest of moving
> things forward, here's the complete patch with the testcase. OK to
> apply?

Thanks for this.
We were actually on it, but of course busy with other stuff :)

>
> ---8<---
>
> From: Alexander Monakov 
> Date: Fri, 13 Jan 2023 21:04:02 +0300
> Subject: [PATCH] sched-deps: do not schedule pseudos across calls [PR108117]
>
> Scheduling across calls in the pre-RA scheduler is problematic: we do
> not take liveness info into account, and are thus prone to extending
> lifetime of a pseudo over the loop, requiring a callee-saved hardreg
> or causing a spill.
>
> If current function called a setjmp, lifting an assignment over a call
> may be incorrect if a longjmp would happen before the assignment.
>
> Thanks to Jose Marchesi for testing on AArch64.
>
> gcc/ChangeLog:
>
>   PR rtl-optimization/108117
>   PR rtl-optimization/108132
>   * sched-deps.cc (deps_analyze_insn): Do not schedule across
>   calls before reload.
>
> gcc/testsuite/ChangeLog:
>
>   PR rtl-optimization/108117
>   PR rtl-optimization/108132
>   * gcc.dg/pr108117.c: New test.
> ---
>  gcc/sched-deps.cc   |  9 -
>  gcc/testsuite/gcc.dg/pr108117.c | 30 ++
>  2 files changed, 38 insertions(+), 1 deletion(-)
>  create mode 100644 gcc/testsuite/gcc.dg/pr108117.c
>
> diff --git a/gcc/sched-deps.cc b/gcc/sched-deps.cc
> index 948aa0c3b..5dc4fa4cd 100644
> --- a/gcc/sched-deps.cc
> +++ b/gcc/sched-deps.cc
> @@ -3688,7 +3688,14 @@ deps_analyze_insn (class deps_desc *deps, rtx_insn 
> *insn)
>  
>CANT_MOVE (insn) = 1;
>  
> -  if (find_reg_note (insn, REG_SETJMP, NULL))
> +  if (!reload_completed)
> + {
> +   /* Scheduling across calls may increase register pressure by extending
> +  live ranges of pseudos over the call.  Worse, in presence of setjmp
> +  it may incorrectly move up an assignment over a longjmp.  */
> +   reg_pending_barrier = MOVE_BARRIER;
> + }
> +  else if (find_reg_note (insn, REG_SETJMP, NULL))
>  {
>/* This is setjmp.  Assume that all registers, not just
>   hard registers, may be clobbered by this call.  */
> diff --git a/gcc/testsuite/gcc.dg/pr108117.c b/gcc/testsuite/gcc.dg/pr108117.c
> new file mode 100644
> index 0..ae151693e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/pr108117.c
> @@ -0,0 +1,30 @@
> +/* { dg-do run } */
> +/* { dg-require-effective-target nonlocal_goto } */
> +/* { dg-options "-O2 -fschedule-insns" } */
> +
> +#include <setjmp.h>
> +#include <stdio.h>
> +
> +jmp_buf ex_buf;
> +
> +__attribute__((noipa))
> +void fn_throw(int x)
> +{
> +   if (x)
> +  longjmp(ex_buf, 1);
> +}
> +
> +int main(void)
> +{
> +int vb = 0; // NB: not volatile, not modified after setjmp
> +
> +if (!setjmp(ex_buf)) {
> +fn_throw(1);
> +vb = 1; // not reached in the abstract machine
> +}
> +
> +if (vb) {
> +printf("Failed, vb = %d!\n", vb);
> +return 1;
> +}
> +}


Re: [PATCH] sched-deps: do not schedule pseudos across calls [PR108117]

2023-01-13 Thread Richard Sandiford via Gcc-patches
Alexander Monakov  writes:
> On Fri, 23 Dec 2022, Jose E. Marchesi wrote:
>
>> > +1 for trying this FWIW.  There's still plenty of time to try an
>> > alternative solution if there are unexpected performance problems.
>> 
>> Let me see if Alexander's patch fixes the issue at hand (it must) and
>> will also do some regression testing.
>
> Hi, I'm not sure at which court the ball is, but in the interest of moving
> things forward, here's the complete patch with the testcase. OK to apply?
>
> ---8<---
>
> From: Alexander Monakov 
> Date: Fri, 13 Jan 2023 21:04:02 +0300
> Subject: [PATCH] sched-deps: do not schedule pseudos across calls [PR108117]
>
> Scheduling across calls in the pre-RA scheduler is problematic: we do
> not take liveness info into account, and are thus prone to extending
> lifetime of a pseudo over the loop, requiring a callee-saved hardreg
> or causing a spill.
>
> If current function called a setjmp, lifting an assignment over a call
> may be incorrect if a longjmp would happen before the assignment.
>
> Thanks to Jose Marchesi for testing on AArch64.
>
> gcc/ChangeLog:
>
>   PR rtl-optimization/108117
>   PR rtl-optimization/108132
>   * sched-deps.cc (deps_analyze_insn): Do not schedule across
>   calls before reload.
>
> gcc/testsuite/ChangeLog:
>
>   PR rtl-optimization/108117
>   PR rtl-optimization/108132
>   * gcc.dg/pr108117.c: New test.

OK, thanks.

Richard

> ---
>  gcc/sched-deps.cc   |  9 -
>  gcc/testsuite/gcc.dg/pr108117.c | 30 ++
>  2 files changed, 38 insertions(+), 1 deletion(-)
>  create mode 100644 gcc/testsuite/gcc.dg/pr108117.c
>
> diff --git a/gcc/sched-deps.cc b/gcc/sched-deps.cc
> index 948aa0c3b..5dc4fa4cd 100644
> --- a/gcc/sched-deps.cc
> +++ b/gcc/sched-deps.cc
> @@ -3688,7 +3688,14 @@ deps_analyze_insn (class deps_desc *deps, rtx_insn 
> *insn)
>  
>CANT_MOVE (insn) = 1;
>  
> -  if (find_reg_note (insn, REG_SETJMP, NULL))
> +  if (!reload_completed)
> + {
> +   /* Scheduling across calls may increase register pressure by extending
> +  live ranges of pseudos over the call.  Worse, in presence of setjmp
> +  it may incorrectly move up an assignment over a longjmp.  */
> +   reg_pending_barrier = MOVE_BARRIER;
> + }
> +  else if (find_reg_note (insn, REG_SETJMP, NULL))
>  {
>/* This is setjmp.  Assume that all registers, not just
>   hard registers, may be clobbered by this call.  */
> diff --git a/gcc/testsuite/gcc.dg/pr108117.c b/gcc/testsuite/gcc.dg/pr108117.c
> new file mode 100644
> index 0..ae151693e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/pr108117.c
> @@ -0,0 +1,30 @@
> +/* { dg-do run } */
> +/* { dg-require-effective-target nonlocal_goto } */
> +/* { dg-options "-O2 -fschedule-insns" } */
> +
> +#include <setjmp.h>
> +#include <stdio.h>
> +
> +jmp_buf ex_buf;
> +
> +__attribute__((noipa))
> +void fn_throw(int x)
> +{
> +   if (x)
> +  longjmp(ex_buf, 1);
> +}
> +
> +int main(void)
> +{
> +int vb = 0; // NB: not volatile, not modified after setjmp
> +
> +if (!setjmp(ex_buf)) {
> +fn_throw(1);
> +vb = 1; // not reached in the abstract machine
> +}
> +
> +if (vb) {
> +printf("Failed, vb = %d!\n", vb);
> +return 1;
> +}
> +}


[PATCH] sched-deps: do not schedule pseudos across calls [PR108117]

2023-01-13 Thread Alexander Monakov


On Fri, 23 Dec 2022, Jose E. Marchesi wrote:

> > +1 for trying this FWIW.  There's still plenty of time to try an
> > alternative solution if there are unexpected performance problems.
> 
> Let me see if Alexander's patch fixes the issue at hand (it must) and
> will also do some regression testing.

Hi, I'm not sure at which court the ball is, but in the interest of moving
things forward, here's the complete patch with the testcase. OK to apply?

---8<---

From: Alexander Monakov 
Date: Fri, 13 Jan 2023 21:04:02 +0300
Subject: [PATCH] sched-deps: do not schedule pseudos across calls [PR108117]

Scheduling across calls in the pre-RA scheduler is problematic: we do
not take liveness info into account, and are thus prone to extending
lifetime of a pseudo over the loop, requiring a callee-saved hardreg
or causing a spill.

If current function called a setjmp, lifting an assignment over a call
may be incorrect if a longjmp would happen before the assignment.

Thanks to Jose Marchesi for testing on AArch64.

gcc/ChangeLog:

PR rtl-optimization/108117
PR rtl-optimization/108132
* sched-deps.cc (deps_analyze_insn): Do not schedule across
calls before reload.

gcc/testsuite/ChangeLog:

PR rtl-optimization/108117
PR rtl-optimization/108132
* gcc.dg/pr108117.c: New test.
---
 gcc/sched-deps.cc   |  9 -
 gcc/testsuite/gcc.dg/pr108117.c | 30 ++
 2 files changed, 38 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.dg/pr108117.c

diff --git a/gcc/sched-deps.cc b/gcc/sched-deps.cc
index 948aa0c3b..5dc4fa4cd 100644
--- a/gcc/sched-deps.cc
+++ b/gcc/sched-deps.cc
@@ -3688,7 +3688,14 @@ deps_analyze_insn (class deps_desc *deps, rtx_insn *insn)
 
   CANT_MOVE (insn) = 1;
 
-  if (find_reg_note (insn, REG_SETJMP, NULL))
+  if (!reload_completed)
+   {
+ /* Scheduling across calls may increase register pressure by extending
+live ranges of pseudos over the call.  Worse, in presence of setjmp
+it may incorrectly move up an assignment over a longjmp.  */
+ reg_pending_barrier = MOVE_BARRIER;
+   }
+  else if (find_reg_note (insn, REG_SETJMP, NULL))
 {
   /* This is setjmp.  Assume that all registers, not just
  hard registers, may be clobbered by this call.  */
diff --git a/gcc/testsuite/gcc.dg/pr108117.c b/gcc/testsuite/gcc.dg/pr108117.c
new file mode 100644
index 0..ae151693e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/pr108117.c
@@ -0,0 +1,30 @@
+/* { dg-do run } */
+/* { dg-require-effective-target nonlocal_goto } */
+/* { dg-options "-O2 -fschedule-insns" } */
+
+#include <setjmp.h>
+#include <stdio.h>
+
+jmp_buf ex_buf;
+
+__attribute__((noipa))
+void fn_throw(int x)
+{
+   if (x)
+  longjmp(ex_buf, 1);
+}
+
+int main(void)
+{
+int vb = 0; // NB: not volatile, not modified after setjmp
+
+if (!setjmp(ex_buf)) {
+fn_throw(1);
+vb = 1; // not reached in the abstract machine
+}
+
+if (vb) {
+printf("Failed, vb = %d!\n", vb);
+return 1;
+}
+}
-- 
2.37.2



Re: [PATCH 2/3] rs6000: Delete PRE_GCC3_DWARF_FRAME_REGISTERS

2023-01-13 Thread Jakub Jelinek via Gcc-patches
On Mon, May 06, 2019 at 09:55:50PM +, Segher Boessenkool wrote:
> We don't need this.
> 
> 
> Segher
> 
> 
> 2019-05-06  Segher Boessenkool  
> 
>   * config/rs6000/rs6000.h (PRE_GCC3_DWARF_FRAME_REGISTERS): Delete.

Why do you think so?

This seems to be a clear ABI break to me in the __frame_state_for
API.
So, if a __frame_state_for caller calls the function, it will overflow
the buffer passed by the caller.

> diff --git a/gcc/config/rs6000/rs6000.h b/gcc/config/rs6000/rs6000.h
> index ff9449c..3829e8f 100644
> --- a/gcc/config/rs6000/rs6000.h
> +++ b/gcc/config/rs6000/rs6000.h
> @@ -817,9 +817,6 @@ enum data_align { align_abi, align_opt, align_both };
>  
>  #define FIRST_PSEUDO_REGISTER 115
>  
> -/* This must be included for pre gcc 3.0 glibc compatibility.  */
> -#define PRE_GCC3_DWARF_FRAME_REGISTERS 77
> -
>  /* The sfp register and 3 HTM registers
> aren't included in DWARF_FRAME_REGISTERS.  */
>  #define DWARF_FRAME_REGISTERS (FIRST_PSEUDO_REGISTER - 4)
> -- 
> 1.8.3.1

Jakub



Re: [GCC][PATCH 13/15, v5] arm: Add support for dwarf debug directives and pseudo hard-register for PAC feature.

2023-01-13 Thread Jakub Jelinek via Gcc-patches
On Fri, Jan 13, 2023 at 05:44:15PM +, Srinath Parvathaneni via Gcc-patches 
wrote:
> Hello,
> 
> This patch teaches the DWARF support in gcc about RA_AUTH_CODE pseudo 
> hard-register and also 
> updates the ".save", ".cfi_register", ".cfi_offset", ".cfi_restore" 
> directives accordingly.
> This patch also adds support to emit ".pacspval" directive when "pac ip, lr, 
> sp" instruction
> is generated in the assembly.
> 
> RA_AUTH_CODE register number is 107 and its dwarf register number is 143.

I'm afraid increasing number of DWARF registers is ABI incompatible change.
E.g. libgcc __frame_state_for function fills in:
typedef struct frame_state
{
  void *cfa;
  void *eh_ptr;
  long cfa_offset;
  long args_size;
  long reg_or_offset[PRE_GCC3_DWARF_FRAME_REGISTERS+1];
  unsigned short cfa_reg;
  unsigned short retaddr_column;
  char saved[PRE_GCC3_DWARF_FRAME_REGISTERS+1];
} frame_state;

structure, where PRE_GCC3_DWARF_FRAME_REGISTERS defaults to
__LIBGCC_DWARF_FRAME_REGISTERS__, which is defined to
DWARF_FRAME_REGISTERS, which defaults to FIRST_PSEUDO_REGISTER.
So, changing FIRST_PSEUDO_REGISTER is an ABI change unless you arrange for
PRE_GCC3_DWARF_FRAME_REGISTERS to be defined to the old value.
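
As a minimal sketch of that failure mode (old_caller is a made-up name for the
sketch; it reuses the frame_state struct quoted above):

extern frame_state *__frame_state_for (void *, frame_state *);

void
old_caller (void *pc)
{
  /* reg_or_offset[] and saved[] have PRE_GCC3_DWARF_FRAME_REGISTERS + 1
     entries here, i.e. whatever the caller's copy of the header said.  */
  frame_state fs;

  /* A libgcc rebuilt with a larger DWARF_FRAME_REGISTERS fills entries up
     to the new count, writing past the end of fs -- hence the need to keep
     PRE_GCC3_DWARF_FRAME_REGISTERS pinned to the old value.  */
  __frame_state_for (pc, &fs);
}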

Jakub



[PATCH] IBM zSystems: Fix TARGET_D_CPU_VERSIONS

2023-01-13 Thread Stefan Schulze Frielinghaus via Gcc-patches
In the context of D the interpretation of S390, S390X, and SystemZ is a
bit fuzzy.  The wording S390X was wrongly deprecated in favour of
SystemZ by commit
https://github.com/dlang/dlang.org/commit/3b50a4c3faf01c32234d0ef8be5f82915a61c23f
Thus, SystemZ is used for 64-bit targets, now, and S390 for 31-bit
targets.  However, in TARGET_D_CPU_VERSIONS depending on TARGET_ZARCH we
set the CPU version to SystemZ.  This is also the case if compiled for
31-bit targets leading to the following error:

libphobos/libdruntime/core/sys/posix/sys/stat.d:967:13: error: static assert:  
'96u == 144u' is false
  967 | static assert(stat_t.sizeof == 144);
  | ^

Thus in order to keep this patch simple I went for keeping SystemZ for
64-bit targets and S390, as usual, for 31-bit targets and dropped the
distinction between ESA and z/Architecture.

Bootstrapped and regtested on IBM zSystems.  Ok for mainline?

gcc/ChangeLog:

* config/s390/s390-d.cc (s390_d_target_versions): Fix detection
of CPU version.
---
 gcc/config/s390/s390-d.cc | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/gcc/config/s390/s390-d.cc b/gcc/config/s390/s390-d.cc
index d10b45f7de4..ced7f49a988 100644
--- a/gcc/config/s390/s390-d.cc
+++ b/gcc/config/s390/s390-d.cc
@@ -30,10 +30,8 @@ along with GCC; see the file COPYING3.  If not see
 void
 s390_d_target_versions (void)
 {
-  if (TARGET_ZARCH)
+  if (TARGET_64BIT)
 d_add_builtin_version ("SystemZ");
-  else if (TARGET_64BIT)
-d_add_builtin_version ("S390X");
   else
 d_add_builtin_version ("S390");
 
-- 
2.39.0



Re: [OG12][committed] amdgcn, libgomp: custom USM allocator

2023-01-13 Thread Andrew Stubbs

I changed it to use 128-byte alignment to match the GPU cache-lines.

Committed to OG12.

Andrew

On 11/01/2023 18:05, Andrew Stubbs wrote:
This patch fixes a runtime issue I encountered with the AMD GCN Unified 
Shared Memory implementation.


We were using regular malloc'd memory configured into USM mode, but 
there were random intermittent crashes. I can't be completely sure, but 
my best guess is that the HSA driver is using malloc internally from the 
same heap, and therefore using memory on the same page as the offload 
kernel. What I do know is that I could make the crashes go away by 
simply padding the USM allocations before and after.


With this patch USM allocations are now completely separated from the 
system heap. The custom allocator is probably less optimal in some 
use-cases, but does have the advantage that all the metadata is stored 
in a side-table that won't ever cause any pages to migrate back to 
main-memory unnecessarily. It's still possible for the user program to 
use USM memory in a way that causes it to thrash, and this might have 
been the ultimate cause of the crashes, but there's not much we can do 
about that here.


I've broken the allocator out into a new file because I anticipate it 
being needed in more than one place, but I didn't put full 
data-isolation on it yet.


I'll rebase, merge, and repost all of the OpenMP memory patches sometime 
soonish.


Andrew
libgomp, amdgcn: Switch USM to 128-byte alignment

This should optimize cache-lines on the AMD GPUs somewhat.

libgomp/ChangeLog:

* usm-allocator.c (ALIGN): Use 128-byte alignment.

diff --git a/libgomp/usm-allocator.c b/libgomp/usm-allocator.c
index c45109169ca..68c1ebafec2 100644
--- a/libgomp/usm-allocator.c
+++ b/libgomp/usm-allocator.c
@@ -57,7 +57,8 @@ static int usm_lock = 0;
 static struct usm_splay_tree_s usm_allocations = { NULL };
 static struct usm_splay_tree_s usm_free_space = { NULL };
 
-#define ALIGN(VAR) (((VAR) + 7) & ~7)/* 8-byte granularity.  */
+/* 128-byte granularity means GPU cache-line aligned.  */
+#define ALIGN(VAR) (((VAR) + 127) & ~127)
 
 /* Coalesce contiguous free space into one entry.  This considers the entries
either side of the root node only, so it should be called each time a new


Re: [PATCH v2] ipa-cp: Speculatively call specialized functions

2023-01-13 Thread Martin Jambor
Hello,

sorry for getting to this quite late.  I have only had a quick glance at
ipa-cp.cc hunks so far.

On Fri, Dec 16 2022, Manolis Tsamis wrote:
> The IPA CP pass offers a wide range of optimizations, where most of them
> lead to specialized functions that are called from a call site.
> This can lead to multiple specialized function clones, if more than
> one call-site allows such an optimization.
> If not all call-sites can be optimized, the program might end
> up with call-sites to the original function.
>
> This pass assumes that non-optimized call-sites (i.e. call-sites
> that don't call specialized functions) are likely to be called
> with arguments that would allow calling specialized clones.
> Since we cannot guarantee this (for obvious reasons), we can't
> replace the existing calls. However, we can introduce dynamic
> guards that test the arguments for the collected constants
> and calls the specialized function if there is a match.
>
> To demonstrate the effect, let's consider the following program part:
>
>   func_1()
> myfunc(1)
>   func_2()
> myfunc(2)
>   func_i(i)
> myfunc(i)
>
> In this case the transformation would do the following:
>
>   func_1()
> myfunc.constprop.1() // myfunc() with arg0 == 1
>   func_2()
> myfunc.constprop.2() // myfunc() with arg0 == 2
>   func_i(i)
> if (i == 1)
>   myfunc.constprop.1() // myfunc() with arg0 == 1
> else if (i == 2)
>   myfunc.constprop.2() // myfunc() with arg0 == 2
> else
>   myfunc(i)

My understanding of the code, however, is that it rather creates

  func_i(i)
if (i == 1)
  myfunc.constprop.1_1() // mostly equivalent but separate from 
myfunc.constprop.1
else if (i == 2)
  myfunc.constprop.2_1() // mostly equivalent but separate from 
myfunc.constprop.2
else
  myfunc(i)

Which I find difficult to justify.  From comments it looked like the
reason is avoiding calling find_more_scalar_values, is that correct?

I'd like to know more about the cases you are targeting and cases where
adding the additional known scalar constants were an issue.  I think it
needs to be tackled differently.

By the way, as IPA-CP works now (it would be nice but difficult to lift
that limitation), all but up to one constant in known_csts are constants
in all call contexts, so without calling find_more_scalar_values you
should need just one run-time condition per speculative call.  So
tracking which constant is which might be better than avoiding
find_more_scalar_values?

Also growth limits in ipa-cp are not updated appropriately.

Some more comments inline:

>
> The pass consists of two main parts:
> * collecting all specialized functions and the argument/constant pair(s)
> * insertion of the guards during materialization
>
> The patch integrates well into ipa-cp and related IPA functionality.
> Given the nature of IPA, the changes are touching many IPA-related
> files as well as call-graph data structures.
>
> The impact of the dynamic guard is expected to be less than the speedup
> gained by enabled optimizations (e.g. inlining or constant propagation).
>
> gcc/Changelog:
>
> * cgraph.cc (cgraph_add_edge_to_call_site_hash): Add support for 
> guarded specialized edges.
> (cgraph_edge::set_call_stmt): Likewise.
> (symbol_table::create_edge): Likewise.
> (cgraph_edge::remove): Likewise.
> (cgraph_edge::make_speculative): Likewise.
> (cgraph_edge::make_specialized): Likewise.
> (cgraph_edge::remove_specializations): Likewise.
> (cgraph_edge::redirect_call_stmt_to_callee): Likewise.
> (cgraph_edge::dump_edge_flags): Likewise.
> (verify_speculative_call): Likewise.
> (verify_specialized_call): Likewise.
> (cgraph_node::verify_node): Likewise.
> * cgraph.h (class GTY): Add new class that contains info of 
> specialized edges.
> * cgraphclones.cc (cgraph_edge::clone): Add support for guarded 
> specialized edges.
> (cgraph_node::set_call_stmt_including_clones): Likewise.
> * ipa-cp.cc (want_remove_some_param_p): Likewise.
> (create_specialized_node): Likewise.
> (add_specialized_edges): Likewise.
> (ipcp_driver): Likewise.
> * ipa-fnsummary.cc (redirect_to_unreachable): Likewise.
> (ipa_fn_summary_t::duplicate): Likewise.
> (analyze_function_body): Likewise.
> (estimate_edge_size_and_time): Likewise.
> (remap_edge_summaries): Likewise.
> * ipa-inline-transform.cc (inline_transform): Likewise.
> * ipa-inline.cc (edge_badness): Likewise.
> * lto-cgraph.cc (lto_output_edge): Likewise.
> (input_edge): Likewise.
> * tree-inline.cc (copy_bb): Likewise.
> * value-prof.cc (gimple_sc): Add function to create guarded 
> specializations.
> * value-prof.h (gimple_sc): Likewise.

Please also include test-cases.

>
> Signed-off-by: Manolis Tsamis 
>
> ---
>

[...]

> diff --git 

[GCC][PATCH v4] arm: Add pacbti related multilib support for armv8.1-m.main.

2023-01-13 Thread Srinath Parvathaneni via Gcc-patches
Hi,

This patch adds support for pacbti multilib linking by making
"-mbranch-protection=none" the default multilib option for the arm-none-eabi
target.

Eg 1.

If the passed command line flags are (without mbranch-protection):
a) -march=armv8.1-m.main+mve -mfloat-abi=hard -mfpu=auto

"-mbranch-protection=none" will be used in the multilib matching.

Eg 2.

If the passed command line flags are (with mbranch-protection):
a) -march=armv8.1-m.main+mve+pacbti -mfloat-abi=hard -mfpu=auto  
-mbranch-protection=pac-ret

"-mbranch-protection=standard" will be used in the multilib matching.

Regression tested on arm-none-eabi and bootstrapped on arm-none-linux-gnueabihf.

Ok for master?

Regards,
Srinath.

gcc/ChangeLog:

2023-01-11  Srinath Parvathaneni  

* config.gcc ($tm_file): Update variable.
* config/arm/arm-mlib.h: Create new header file.
* config/arm/t-rmprofile (MULTI_ARCH_DIRS_RM): Rename mbranch-protection
multilib arch directory.
(MULTILIB_REUSE): Add multilib reuse rules.
(MULTILIB_MATCHES): Add multilib match rules.

gcc/testsuite/ChangeLog:

2023-01-11  Srinath Parvathaneni  

* gcc.target/arm/multilib.exp (multilib_config "rmprofile"): Update
tests.
* gcc.target/arm/pac-12.c: New test.
* gcc.target/arm/pac-13.c: Likewise.
* gcc.target/arm/pac-14.c: Likewise.


multilib_pacbti
Description: multilib_pacbti


[PATCH] c, c++, v3: Avoid incorrect shortening of divisions [PR108365]

2023-01-13 Thread Jakub Jelinek via Gcc-patches
On Fri, Jan 13, 2023 at 11:58:06AM -0500, Jason Merrill wrote:
> LGTM, though we might put that condition in c-common somewhere?

So like this then?  Just tested on the new testcases, full bootstrap/regtest
queued?

2023-01-13  Jakub Jelinek  

PR c++/108365
* c-common.h (may_shorten_divmod): New static inline function.

* c-typeck.cc (build_binary_op): Use may_shorten_divmod for integral
division or modulo.

* typeck.cc (cp_build_binary_op): Use may_shorten_divmod for integral
division or modulo.

* c-c++-common/pr108365.c: New test.
* g++.dg/opt/pr108365.C: New test.
* g++.dg/warn/pr108365.C: New test.

--- gcc/c-family/c-common.h.jj  2022-11-14 13:35:34.195160199 +0100
+++ gcc/c-family/c-common.h 2023-01-13 18:35:08.130362228 +0100
@@ -918,6 +918,30 @@ extern tree convert_init (tree, tree);
 /* Subroutine of build_binary_op, used for certain operations.  */
 extern tree shorten_binary_op (tree result_type, tree op0, tree op1, bool 
bitwise);
 
+/* Return true if division or modulo op0 / op1 or op0 % op1 may be shortened.
+   We can shorten only if we can guarantee that op0 is not signed integral
+   minimum or op1 is not -1, because e.g. (long long) INT_MIN / -1 is
+   well defined INT_MAX + 1LL if long long is wider than int, but INT_MIN / -1
+   is UB.  */
+static inline bool
+may_shorten_divmod (tree op0, tree op1)
+{
+  tree type0 = TREE_TYPE (op0);
+  if (TYPE_UNSIGNED (type0))
+return true;
+  /* A cast from narrower unsigned won't be signed integral minimum,
+ but cast from same or wider precision unsigned could be.  */
+  if (TREE_CODE (op0) == NOP_EXPR
+  && INTEGRAL_TYPE_P (TREE_TYPE (TREE_OPERAND (op0, 0)))
+  && TYPE_UNSIGNED (TREE_TYPE (TREE_OPERAND (op0, 0)))
+  && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (op0, 0)))
+ < TYPE_PRECISION (type0)))
+return true;
+  if (TREE_CODE (op1) == INTEGER_CST && !integer_all_onesp (op1))
+return true;
+  return false;
+}
+
 /* Subroutine of build_binary_op, used for comparison operations.
See if the operands have both been converted from subword integer types
and, if so, perhaps change them both back to their original type.  */
--- gcc/c/c-typeck.cc.jj2023-01-13 11:11:45.368016437 +0100
+++ gcc/c/c-typeck.cc   2023-01-13 18:38:25.919538847 +0100
@@ -12431,9 +12431,7 @@ build_binary_op (location_t location, en
   undefined if the quotient can't be represented in the
   computation mode.  We shorten only if unsigned or if
   dividing by something we know != -1.  */
-   shorten = (TYPE_UNSIGNED (TREE_TYPE (orig_op0))
-  || (TREE_CODE (op1) == INTEGER_CST
-  && !integer_all_onesp (op1)));
+   shorten = may_shorten_divmod (op0, op1);
  common = 1;
}
   break;
@@ -12467,9 +12465,7 @@ build_binary_op (location_t location, en
 on some targets, since the modulo instruction is undefined if the
 quotient can't be represented in the computation mode.  We shorten
 only if unsigned or if dividing by something we know != -1.  */
- shorten = (TYPE_UNSIGNED (TREE_TYPE (orig_op0))
-|| (TREE_CODE (op1) == INTEGER_CST
-&& !integer_all_onesp (op1)));
+ shorten = may_shorten_divmod (op0, op1);
  common = 1;
}
   break;
--- gcc/cp/typeck.cc.jj 2023-01-13 11:11:45.418015716 +0100
+++ gcc/cp/typeck.cc2023-01-13 18:38:40.754327078 +0100
@@ -5455,10 +5455,7 @@ cp_build_binary_op (const op_location_t
 point, so we have to dig out the original type to find out if
 it was unsigned.  */
  tree stripped_op1 = tree_strip_any_location_wrapper (op1);
- shorten = ((TREE_CODE (op0) == NOP_EXPR
- && TYPE_UNSIGNED (TREE_TYPE (TREE_OPERAND (op0, 0
-|| (TREE_CODE (stripped_op1) == INTEGER_CST
-&& ! integer_all_onesp (stripped_op1)));
+ shorten = may_shorten_divmod (op0, stripped_op1);
}
 
  common = 1;
@@ -5491,10 +5488,7 @@ cp_build_binary_op (const op_location_t
 quotient can't be represented in the computation mode.  We shorten
 only if unsigned or if dividing by something we know != -1.  */
  tree stripped_op1 = tree_strip_any_location_wrapper (op1);
- shorten = ((TREE_CODE (op0) == NOP_EXPR
- && TYPE_UNSIGNED (TREE_TYPE (TREE_OPERAND (op0, 0
-|| (TREE_CODE (stripped_op1) == INTEGER_CST
-&& ! integer_all_onesp (stripped_op1)));
+ shorten = may_shorten_divmod (op0, stripped_op1);
  common = 1;
}
   break;
--- gcc/testsuite/c-c++-common/pr108365.c.jj2023-01-13 18:25:09.163911212 
+0100
+++ gcc/testsuite/c-c++-common/pr108365.c  

[GCC][PATCH 13/15, v5] arm: Add support for dwarf debug directives and pseudo hard-register for PAC feature.

2023-01-13 Thread Srinath Parvathaneni via Gcc-patches
Hello,

This patch teaches the DWARF support in gcc about RA_AUTH_CODE pseudo 
hard-register and also 
updates the ".save", ".cfi_register", ".cfi_offset", ".cfi_restore" directives 
accordingly.
This patch also adds support to emit ".pacspval" directive when "pac ip, lr, 
sp" instruction
is generated in the assembly.

RA_AUTH_CODE register number is 107 and its dwarf register number is 143.

Applying this patch on top of PACBTI series posted here
https://gcc.gnu.org/pipermail/gcc-patches/2022-August/599658.html and when 
compiling the following
test.c with "-march=armv8.1-m.main+mve+pacbti -mbranch-protection=pac-ret 
-mthumb -mfloat-abi=hard
-fasynchronous-unwind-tables -g -O0 -S" command line options, the assembly 
output after this patch
looks like below:

$cat test.c

void fun1(int a);
void fun(int a,...)
{
  fun1(a);
}

int main()
{
  fun (10);
  return 0;
}

$ arm-none-eabi-gcc -march=armv8.1-m.main+mve+pacbti 
-mbranch-protection=pac-ret -mthumb -mfloat-abi=hard
-fasynchronous-unwind-tables -g -O0 -S test.s

Assembly output:
...
fun:
...
.pacspval
pac ip, lr, sp
.cfi_register 143, 12
push{r3, r7, ip, lr}
.save {r3, r7, ra_auth_code, lr}
...
.cfi_offset 143, -24
...
.cfi_restore 143
...
aut ip, lr, sp
bx  lr
...
main:
...
.pacspval
pac ip, lr, sp
.cfi_register 143, 12
push{r3, r7, ip, lr}
.save {r3, r7, ra_auth_code, lr}
...
.cfi_offset 143, -8
...
.cfi_restore 143
...
aut ip, lr, sp
bx  lr
...

Regression tested on arm-none-eabi target and found no regressions.

Ok for master?

Regards,
Srinath.

2023-01-11  Srinath Parvathaneni  

* config/arm/aout.h (ra_auth_code): Add entry in enum.
(emit_multi_reg_push): Add RA_AUTH_CODE register to
dwarf frame expression.
(arm_emit_multi_reg_pop): Restore RA_AUTH_CODE register.
(arm_expand_prologue): Update frame related information and reg notes
for pac/pacbti insn.
(arm_regno_class): Check for pac pseudo register.
(arm_dbx_register_number): Assign ra_auth_code register number in dwarf.
(arm_init_machine_status): Set pacspval_needed to zero.
(arm_debugger_regno): Check for PAC register.
(arm_unwind_emit_sequence): Print .save directive with ra_auth_code
register.
(arm_unwind_emit_set): Add entry for IP_REGNUM in switch case.
(arm_unwind_emit): Update REG_CFA_REGISTER case.
* config/arm/arm.h (FIRST_PSEUDO_REGISTER): Modify.
(DWARF_PAC_REGNUM): Define.
(IS_PAC_REGNUM): Likewise.
(enum reg_class): Add PAC_REG entry.
(machine_function): Add pacbti_needed state to structure.
* config/arm/arm.md (RA_AUTH_CODE): Define.

gcc/testsuite/ChangeLog:

2023-01-11  Srinath Parvathaneni  

* g++.target/arm/pac-1.C: New test.
* gcc.target/arm/pac-15.c: Likewise.


dwarf_pacbti
Description: dwarf_pacbti


Re: [committed] libstdc++: Do not include in concurrency headers

2023-01-13 Thread Jonathan Wakely via Gcc-patches
On Fri, 13 Jan 2023 at 16:39, Jonathan Wakely wrote:
>
> On Fri, 13 Jan 2023 at 15:08, Rainer Orth wrote:
> >
> > Hi Jonathan,
> >
> > > The , , and  headers use
> > > std::errc constants, but don't use std::system_error itself. They only
> > > use the __throw_system_error(int) function, which is defined in
> > > .
> > >
> > > By including the header for the errc constants instead of the whole of
> > >  we avoid depending on the whole std::string definition.
> >
> > it seems this patch broke many tests on Solaris, e.g.
> >
> > FAIL: 29_atomics/atomic/requirements/types_neg.cc (test for excess errors)
> > Excess errors:
> > /var/gcc/regression/master/11.4-gcc/build/i386-pc-solaris2.11/libstdc++-v3/include/bits/std_mutex.h:157:
> >  error: 'EBUSY' was not declared in this scope
> >
>
> Oops, testing this patch now.

Pushed to trunk - thanks for the report!



Re: [PATCH] gimple-fold.h: Add missing gimple-iterator.h

2023-01-13 Thread Palmer Dabbelt

On Wed, 11 Jan 2023 23:55:15 PST (-0800), richard.guent...@gmail.com wrote:

On Thu, Jan 12, 2023 at 2:46 AM Palmer Dabbelt  wrote:


As of 6f5b06032eb ("Finish gimple_build API enhancement") gimple-fold.h
uses some of the declarations from gimple-iterator.h, which causes
issues when building Linux's stackprotector plugin.

gcc/ChangeLog:

* gimple-fold.h: Add gimple-iterator.h include.

---

I'm not sure if this should instead be fixed in Linux by reordering the
includes along the lines of

diff --git a/scripts/gcc-plugins/gcc-common.h b/scripts/gcc-plugins/gcc-common.h
index 9a1895747b15..2c3a3079128a 100644
--- a/scripts/gcc-plugins/gcc-common.h
+++ b/scripts/gcc-plugins/gcc-common.h
@@ -72,6 +72,7 @@
 #include "stor-layout.h"
 #include "internal-fn.h"
 #include "gimple-expr.h"
+#include "gimple-iterator.h"
 #include "gimple-fold.h"
 #include "context.h"
 #include "tree-ssa-alias.h"
@@ -88,7 +89,6 @@
 #include "gimple.h"
 #include "tree-phinodes.h"
 #include "tree-cfg.h"
-#include "gimple-iterator.h"
 #include "gimple-ssa.h"
 #include "ssa-iterators.h"


The above change is OK.


but I figured it was slightly easier for users to keep these compatible.
It looks like many GCC-internal uses of gimple-fold.h already have the
gimple-iterator.h include right before, though, so not sure if that's
how things are meant to be.
---
 gcc/gimple-fold.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/gcc/gimple-fold.h b/gcc/gimple-fold.h
index 2fd58db9a2e..66bee2b75df 100644
--- a/gcc/gimple-fold.h
+++ b/gcc/gimple-fold.h
@@ -22,6 +22,8 @@ along with GCC; see the file COPYING3.  If not see
 #ifndef GCC_GIMPLE_FOLD_H
 #define GCC_GIMPLE_FOLD_H

+#include "gimple-iterator.h"
+


But this is not - we try to avoid #include directives in headers, we want the
include dependences to be "flat"


Makes sense.  I've sent the diff above to Linux: 
https://lore.kernel.org/r/20230113173033.4380-1-pal...@rivosinc.com/





 extern tree create_tmp_reg_or_ssa_name (tree, gimple *stmt = NULL);
 extern tree canonicalize_constructor_val (tree, tree);
 extern tree get_symbol_constant_value (tree);
--
2.39.0



Re: [PATCH] Use cxx11 abi in versioned namespace

2023-01-13 Thread Jonathan Wakely via Gcc-patches
@@ -396,7 +376,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   // Non-inline namespace for components replaced by alternates in active mode.
   namespace __cxx1998
   {
-# if _GLIBCXX_USE_CXX11_ABI
+# if _GLIBCXX_USE_CXX11_ABI && ! _GLIBCXX_VERSION_NAMESPACE

This should be INLINE not VERSION, right?



Re: [PATCH] Use cxx11 abi in versioned namespace

2023-01-13 Thread Jonathan Wakely via Gcc-patches
On Fri, 13 Jan 2023 at 16:33, Jonathan Wakely  wrote:
>
> On Mon, 5 Dec 2022 at 21:14, François Dumont via Libstdc++
>  wrote:
> >
> > I just rebased this patch.
> >
> > All good apart from the to_chars/from_chars symbols issue.
> >
> > François
> >
> >
> > On 11/10/22 19:28, François Dumont wrote:
> > > Hi
> > >
> > > Now that pretty printer is fixed (once patch validated) I'd like
> > > to propose this patch again.
> > >
> > > Note that I'm adding a check on pretty printer with a std::any on
> > > a std::wstring. I did so because of the FIXME in printers.py which is
> > > dealing with 'std::string' explicitly. Looks like in my case, where
> > > there is no 'std::string' but just a 'std::__8::string' we do not need
> > > the workaround.
> > >
> > > Once again I am attaching also the version namespace bump patch as
> > > I think that adopting the cxx11 abi in this mode is a good enough
> > > reason to bump it. If you agree let me know if I should squash the
> > > commits before pushing.
>
> Yes, I think this change would justify bumping the version.
>
> > >
> > > libstdc++: [_GLIBCXX_INLINE_VERSION] Use cxx11 abi
> > >
> > > Use cxx11 abi when activating versioned namespace mode.
> > >
> > > libstdcxx-v3/ChangeLog:
> > >
> > > * acinclude.m4 [GLIBCXX_ENABLE_LIBSTDCXX_DUAL_ABI]:
> > > Default to "new" libstdcxx abi.
> > > * config/locale/dragonfly/monetary_members.cc
> > > [!_GLIBCXX_USE_DUAL_ABI]: Define money_base
> > > members.
> > > * config/locale/generic/monetary_members.cc
> > > [!_GLIBCXX_USE_DUAL_ABI]: Likewise.
> > > * config/locale/gnu/monetary_members.cc
> > > [!_GLIBCXX_USE_DUAL_ABI]: Likewise.
> > > * config/locale/gnu/numeric_members.cc
> > > [!_GLIBCXX_USE_DUAL_ABI](__narrow_multibyte_chars): Define.
> > > * configure: Regenerate.
> > > * include/bits/c++config
> > > [_GLIBCXX_INLINE_VERSION](_GLIBCXX_NAMESPACE_CXX11,
> > > _GLIBCXX_BEGIN_NAMESPACE_CXX11): Define
> > > empty.
> > > [_GLIBCXX_INLINE_VERSION](_GLIBCXX_END_NAMESPACE_CXX11,
> > > _GLIBCXX_DEFAULT_ABI_TAG): Likewise.
> > > * python/libstdcxx/v6/printers.py
> > > (StdStringPrinter::__init__): Set self.new_string to True
> > > when std::__8::basic_string type is
> > > found.
> > > * src/Makefile.am
> > > [ENABLE_SYMVERS_GNU_NAMESPACE](ldbl_alt128_compat_sources): Define empty.
> > > * src/Makefile.in: Regenerate.
> > > * src/c++11/Makefile.am (cxx11_abi_sources): Rename into...
> > > (dual_abi_sources): ...this, new. Also move several
> > > sources to...
> > > (sources): ...this.
> > > (extra_string_inst_sources): Move several sources to...
> > > (inst_sources): ...this.
>
> I don't understand this part. Moving those files to sources and
> inst_sources will mean they are always compiled, right? But we don't
> want them compiled for --disable-libstdcxx-dual-abi
>
> In those files you've changed the #if conditions so they are empty if
> the dual ABI is disabled, but why do they need to be compiled at all?
> This isn't clear from the patch or the description or the changelog.
>
>
> > > * src/c++11/Makefile.in: Regenerate.
> > > * src/c++11/cow-fstream-inst.cc [_GLIBCXX_USE_CXX11_ABI]:
> > > Skip definitions.
> > > * src/c++11/cow-locale_init.cc [_GLIBCXX_USE_CXX11_ABI]:
> > > Skip definitions.
> > > * src/c++11/cow-sstream-inst.cc [_GLIBCXX_USE_CXX11_ABI]:
> > > Skip definitions.
> > > * src/c++11/cow-stdexcept.cc
> > > [_GLIBCXX_USE_CXX11_ABI](error_category::_M_message):
> > > Skip definition.
> > > [_GLIBCXX_USE_CXX11_ABI]: Skip Transaction Memory TS
> > > definitions.
> > > * src/c++11/cow-string-inst.cc [_GLIBCXX_USE_CXX11_ABI]:
> > > Skip definitions.
> > > * src/c++11/cow-string-io-inst.cc
> > > [_GLIBCXX_USE_CXX11_ABI]: Skip definitions.
> > > * src/c++11/cow-wstring-inst.cc [_GLIBCXX_USE_CXX11_ABI]:
> > > Skip definitions.
> > > * src/c++11/cow-wstring-io-inst.cc
> > > [_GLIBCXX_USE_CXX11_ABI]: Skip definitions.
> > > * src/c++11/cxx11-hash_tr1.cc [!_GLIBCXX_USE_CXX11_ABI]:
> > > Skip definitions.
> > > * src/c++11/cxx11-ios_failure.cc
> > > [!_GLIBCXX_USE_CXX11_ABI]: Skip definitions.
> > > [!_GLIBCXX_USE_DUAL_ABI] (__ios_failure): Remove.
>
> For this file I think your changes make sense, because the definitions
> of the gcc4-compatible and cxx11 ABI are different, we're not just
> compiling it twice.
>
>
> > > * src/c++11/cxx11-locale-inst.cc: Cleanup, just include
> > > locale-inst.cc.
> > > * src/c++11/cxx11-stdexcept.cc [!_GLIBCXX_USE_CXX11_ABI]:
> > > Skip definitions.
> > > [!_GLIBCXX_USE_DUAL_ABI](__cow_string): Remove.
> > > * src

Re: [PATCH] c++: Avoid some false positive -Wfloat-conversion warnings with extended precision [PR108285]

2023-01-13 Thread Jason Merrill via Gcc-patches

On 1/11/23 04:52, Jakub Jelinek wrote:

Hi!

On the following testcase trunk emits a false positive warning on ia32.
convert_like_internal is there called with type of double and
expr EXCESS_PRECISION_EXPR with float type with long double operand
2.L * (long double) x.
Now, for the code generation we do the right thing, cp_convert
to double from that 2.L * (long double) x, but we call even
cp_convert_and_check with that and that emits the -Wfloat-conversion
warning.  Looking at what the C FE does in this case, it calls
convert_and_check with the EXCESS_PRECISION_EXPR expression rather
than its operand, and essentially uses the operand for code generation
and EXCESS_PRECISION_EXPR itself for warnings.

The following patch does that too for the C++ FE.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?


OK.


2023-01-11  Jakub Jelinek  

PR c++/108285
* cvt.cc (cp_convert_and_check): For EXCESS_PRECISION_EXPR
use its operand except that for warning purposes use the original
EXCESS_PRECISION_EXPR.
* call.cc (convert_like_internal): Only look through
EXCESS_PRECISION_EXPR when calling cp_convert, not when calling
cp_convert_and_check.

* g++.dg/warn/pr108285.C: New test.

--- gcc/cp/cvt.cc.jj2022-10-14 09:32:32.403797521 +0200
+++ gcc/cp/cvt.cc   2023-01-10 13:53:00.639130717 +0100
@@ -652,8 +652,10 @@ cp_convert (tree type, tree expr, tsubst
  tree
  cp_convert_and_check (tree type, tree expr, tsubst_flags_t complain)
  {
-  tree result;
+  tree result, expr_for_warning = expr;
  
+  if (TREE_CODE (expr) == EXCESS_PRECISION_EXPR)

+expr = TREE_OPERAND (expr, 0);
if (TREE_TYPE (expr) == type)
  return expr;
if (expr == error_mark_node)
@@ -663,7 +665,7 @@ cp_convert_and_check (tree type, tree ex
if ((complain & tf_warning)
&& c_inhibit_evaluation_warnings == 0)
  {
-  tree folded = cp_fully_fold (expr);
+  tree folded = cp_fully_fold (expr_for_warning);
tree folded_result;
if (folded == expr)
folded_result = result;
--- gcc/cp/call.cc.jj   2023-01-09 23:41:11.135159084 +0100
+++ gcc/cp/call.cc  2023-01-10 13:50:09.277640628 +0100
@@ -8863,12 +8863,14 @@ convert_like_internal (conversion *convs
  return error_mark_node;
  
warning_sentinel w (warn_zero_as_null_pointer_constant);

-  if (TREE_CODE (expr) == EXCESS_PRECISION_EXPR)
-expr = TREE_OPERAND (expr, 0);
if (issue_conversion_warnings)
  expr = cp_convert_and_check (totype, expr, complain);
else
-expr = cp_convert (totype, expr, complain);
+{
+  if (TREE_CODE (expr) == EXCESS_PRECISION_EXPR)
+   expr = TREE_OPERAND (expr, 0);
+  expr = cp_convert (totype, expr, complain);
+}
  
return expr;

  }
--- gcc/testsuite/g++.dg/warn/pr108285.C.jj 2023-01-10 16:52:06.115345345 
+0100
+++ gcc/testsuite/g++.dg/warn/pr108285.C2023-01-10 16:39:26.646532929 
+0100
@@ -0,0 +1,11 @@
+// PR c++/108285
+// { dg-do compile }
+// { dg-options "-fexcess-precision=standard -Wfloat-conversion" }
+
+void bar (double);
+
+void
+foo (float x)
+{
+  bar (2 * x); // { dg-bogus "conversion from '\[^\n\r]\*' to 'double' may change 
value" }
+}

Jakub





Re: [PATCH] c, c++, v2: Avoid incorrect shortening of divisions [PR108365]

2023-01-13 Thread Jason Merrill via Gcc-patches

On 1/12/23 15:31, Jakub Jelinek wrote:

On Thu, Jan 12, 2023 at 08:55:32PM +0100, Jakub Jelinek via Gcc-patches wrote:

So, the following patch for the NOP_EXPR cases checks just in case that
it is from integral type and more importantly checks it is a widening
conversion, and then next to it also allows op0 to be just unsigned,
promoted or not, as that is what the C FE will do for those cases too
and I believe it must work - either the division/modulo common type
will be that unsigned type, then we can shorten and don't need to worry
about UB, or it will be some wider signed type but then it can't be most
negative value of the wider type.


Why not use the same condition in C and C++?


I can test that.  Do you mean change the C FE to match the patched C++
or change C++ FE to just test TYPE_UNSIGNED (orig_op0)?
I think both should work, though what I wrote perhaps can shorten in more
cases.  Can try to construct testcases where it differs...


E.g.
int f1 (int x, int y) { return (unsigned) x / y; }
unsigned short f2 (unsigned short x, unsigned short y) { return (unsigned) x / 
y; }
unsigned int f3 (unsigned int x, unsigned int y) { return (long long) x / y; }
C++ FE before and after the patch shortens the division in f2 and f3,
C FE shortens only in f2.  So using the C FE condition would be a regression
for C++.
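
For a concrete illustration of why the "unsigned, or dividing by something
known != -1" condition matters at all, here is a standalone sketch (assuming
32-bit int; not part of the patch):

/* The widened division is well defined for every input:
   (long long) INT_MIN / -1 == 2147483648LL, i.e. INT_MAX + 1LL.  */
long long
widened_div (int x, int y)
{
  return (long long) x / y;
}

/* Shortening it back to an int division would make x == INT_MIN, y == -1
   overflow, which is undefined behaviour -- so the front ends may only
   shorten when that combination is ruled out.  */
long long
shortened_div (int x, int y)
{
  return x / y;
}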

Therefore I'm going to test following patch:


LGTM, though we might put that condition in c-common somewhere?


2023-01-12  Jakub Jelinek  

PR c++/108365
* c-typeck.cc (build_binary_op): For integral division or modulo,
shorten if type0 is unsigned, or op0 is cast from narrower unsigned
integral type or op1 is INTEGER_CST other than -1.

* typeck.cc (cp_build_binary_op): For integral division or modulo,
shorten if type0 is unsigned, or op0 is cast from narrower unsigned
integral type or stripped_op1 is INTEGER_CST other than -1.

* c-c++-common/pr108365.c: New test.
* g++.dg/opt/pr108365.C: New test.
* g++.dg/warn/pr108365.C: New test.

--- gcc/c/c-typeck.cc.jj2022-11-13 12:29:08.197504249 +0100
+++ gcc/c/c-typeck.cc   2023-01-12 21:06:53.310875131 +0100
@@ -12431,7 +12431,14 @@ build_binary_op (location_t location, en
   undefined if the quotient can't be represented in the
   computation mode.  We shorten only if unsigned or if
   dividing by something we know != -1.  */
-   shorten = (TYPE_UNSIGNED (TREE_TYPE (orig_op0))
+   shorten = (TYPE_UNSIGNED (type0)
+  || (TREE_CODE (op0) == NOP_EXPR
+  && INTEGRAL_TYPE_P (TREE_TYPE (TREE_OPERAND (op0,
+   0)))
+  && TYPE_UNSIGNED (TREE_TYPE (TREE_OPERAND (op0, 0)))
+  && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (op0,
+   0)))
+  < TYPE_PRECISION (type0)))
   || (TREE_CODE (op1) == INTEGER_CST
   && !integer_all_onesp (op1)));
  common = 1;
@@ -12467,7 +12474,12 @@ build_binary_op (location_t location, en
 on some targets, since the modulo instruction is undefined if the
 quotient can't be represented in the computation mode.  We shorten
 only if unsigned or if dividing by something we know != -1.  */
- shorten = (TYPE_UNSIGNED (TREE_TYPE (orig_op0))
+ shorten = (TYPE_UNSIGNED (type0)
+|| (TREE_CODE (op0) == NOP_EXPR
+&& INTEGRAL_TYPE_P (TREE_TYPE (TREE_OPERAND (op0, 0)))
+&& TYPE_UNSIGNED (TREE_TYPE (TREE_OPERAND (op0, 0)))
+&& (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (op0, 0)))
+< TYPE_PRECISION (type0)))
 || (TREE_CODE (op1) == INTEGER_CST
 && !integer_all_onesp (op1)));
  common = 1;
--- gcc/cp/typeck.cc.jj 2023-01-11 12:47:56.099672340 +0100
+++ gcc/cp/typeck.cc2023-01-12 21:04:23.738022528 +0100
@@ -5455,8 +5455,15 @@ cp_build_binary_op (const op_location_t
 point, so we have to dig out the original type to find out if
 it was unsigned.  */
  tree stripped_op1 = tree_strip_any_location_wrapper (op1);
- shorten = ((TREE_CODE (op0) == NOP_EXPR
- && TYPE_UNSIGNED (TREE_TYPE (TREE_OPERAND (op0, 0
+ shorten = (TYPE_UNSIGNED (type0)
+|| (TREE_CODE (op0) == NOP_EXPR
+&& INTEGRAL_TYPE_P (TREE_TYPE (TREE_OPERAND (op0,
+ 0)))
+&& TYPE_UNSIGNED (TREE_TYPE (TREE_OPERAND (op0,
+   0)

Re: [PATCH] IPA: do not release body if still needed

2023-01-13 Thread Martin Jambor
Hi,

sorry for getting to this so late.

On Thu, Dec 01 2022, Martin Liška wrote:
> Hi.
>
> Noticed during building of libbackend.a with the LTO partial linking.

The testcase is a really nice one, too bad it's probably impossible to
get it small enough to be included in the testsuite.  But it also fails
to fail for me on trunk, I could only reproduce the problem on the
gcc-12 branch.

>
> The function release_body is called even if clone_of is a clone
> of a another function and thus it shares tree declaration. We should
> preserve it in that situation.
>

But then PR 100413 could happen just one level higher in the clones
hierarchy, not for clone_of but for clone_of->clone_of, no?

I think we need something like the following (only lightly tested so
far, I'll bootstrap it over the weekend):


diff --git a/gcc/cgraph.cc b/gcc/cgraph.cc
index 4bb9e7ba6af..3734c85db63 100644
--- a/gcc/cgraph.cc
+++ b/gcc/cgraph.cc
@@ -1895,8 +1895,18 @@ cgraph_node::remove (void)
   else if (clone_of)
 {
   clone_of->clones = next_sibling_clone;
-  if (!clone_of->analyzed && !clone_of->clones && !clones)
-   clone_of->release_body ();
+  if (!clones)
+   {
+ bool need_body = false;
+ for (cgraph_node *n = clone_of; n; n = n->clone_of)
+   if (n->analyzed || n->clones)
+ {
+   need_body = true;
+   break;
+ }
+ if (!need_body)
+   clone_of->release_body ();
+   }
 }
   if (next_sibling_clone)
 next_sibling_clone->prev_sibling_clone = prev_sibling_clone;

Thanks for catching this.

Martin


Re: [PATCH] arm: Split up MVE _Generic associations to prevent type clashes [PR107515]

2023-01-13 Thread Richard Earnshaw via Gcc-patches




On 01/12/2022 18:19, Stam Markianos-Wright via Gcc-patches wrote:

Hi all,

With these previous patches:
https://gcc.gnu.org/pipermail/gcc-patches/2022-November/606586.html
https://gcc.gnu.org/pipermail/gcc-patches/2022-November/606587.html
we enabled the MVE overloaded _Generic associations to handle more
scalar types, however at PR 107515 we found a new regression that
wasn't detected in our testing:

With glibc's `posix/types.h`:
```
typedef signed int __int32_t;
...
typedef __int32_t int32_t;
```
We would get an `error: '_Generic' specifies two compatible types`
from `__ARM_mve_coerce3`, because the `type: param` association (with
`type` being `int`) and the `int32_t: param` association name the same
type under the hood.

The same did not happen with Newlib's header `sys/_stdint.h`:
```
typedef long int __int32_t;
...
typedef __int32_t int32_t ;
```
which worked fine, because it uses `long int`.

The same could feasibly happen in `__ARM_mve_coerce2` between
`__fp16` and `float16_t`.

The solution here is to break the _Generic down, so that the similar
types don't appear at the same level, as is done in `__ARM_mve_typeid`.
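
For reference, a reduced standalone sketch of the clash and of the nesting
trick (the names below are made up; the real macros coerce to the MVE
scalar types):

```
/* int32_t_alias stands in for glibc's int32_t, a typedef of plain int.  */
typedef int int32_t_alias;

#if 0
/* Rejected: 'int' and 'int32_t_alias' are compatible types in the same
   association list, giving
   error: '_Generic' specifies two compatible types.  */
int flat (int x)
{
  return _Generic (x, int: 1, int32_t_alias: 2, default: 0);
}
#endif

/* Accepted: the compatible types sit in different _Generic expressions,
   which is the same trick the patched __ARM_mve_coerce3 uses.  */
int nested (int x)
{
  return _Generic (x, int: 1,
   default: _Generic (x, int32_t_alias: 2, default: 0));
}
```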

Ok for trunk?

Thanks,
Stam Markianos-Wright

gcc/ChangeLog:
     PR target/96795
     PR target/107515
     * config/arm/arm_mve.h (__ARM_mve_coerce2): Split types.
     (__ARM_mve_coerce3): Likewise.

gcc/testsuite/ChangeLog:
     PR target/96795
     PR target/107515
     * 
gcc.target/arm/mve/intrinsics/mve_intrinsic_type_overloads-fp.c: New test.
     * 
gcc.target/arm/mve/intrinsics/mve_intrinsic_type_overloads-int.c: New test.


Please fix the missing new lines at the end of the tests.

Otherwise OK.

R.




=== Inline Ctrl+C, Ctrl+V or patch ===

diff --git a/gcc/config/arm/arm_mve.h b/gcc/config/arm/arm_mve.h
index 09167ec118ed3310c5077145e119196f29d83cac..70003653db65736fcfd019e83d9f18153be650dc 100644
--- a/gcc/config/arm/arm_mve.h
+++ b/gcc/config/arm/arm_mve.h
@@ -35659,9 +35659,9 @@ extern void *__ARM_undef;
  #define __ARM_mve_coerce1(param, type) \
  _Generic(param, type: param, const type: param, default: *(type *)__ARM_undef)

  #define __ARM_mve_coerce2(param, type) \
-    _Generic(param, type: param, float16_t: param, float32_t: param, default: *(type *)__ARM_undef)
+    _Generic(param, type: param, __fp16: param, default: _Generic (param, _Float16: param, float16_t: param, float32_t: param, default: *(type *)__ARM_undef))

  #define __ARM_mve_coerce3(param, type) \
-    _Generic(param, type: param, int8_t: param, int16_t: param, int32_t: param, int64_t: param, uint8_t: param, uint16_t: param, uint32_t: param, uint64_t: param, default: *(type *)__ARM_undef)
+    _Generic(param, type: param, default: _Generic (param, int8_t: param, int16_t: param, int32_t: param, int64_t: param, uint8_t: param, uint16_t: param, uint32_t: param, uint64_t: param, default: *(type *)__ARM_undef))


  #if (__ARM_FEATURE_MVE & 2) /* MVE Floating point.  */

diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_intrinsic_type_overloads-fp.c b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_intrinsic_type_overloads-fp.c
new file mode 100644
index ..427dcacb5ff59b53d5eab1f1582ef6460da3f2f3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_intrinsic_type_overloads-fp.c
@@ -0,0 +1,65 @@
+/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
+/* { dg-add-options arm_v8_1m_mve_fp } */
+/* { dg-additional-options "-O2 -Wno-pedantic -Wno-long-long" } */
+#include "arm_mve.h"
+
+float f1;
+double f2;
+float16_t f3;
+float32_t f4;
+__fp16 f5;
+_Float16 f6;
+
+int i1;
+short i2;
+long i3;
+long long i4;
+int8_t i5;
+int16_t i6;
+int32_t i7;
+int64_t i8;
+
+const int ci1;
+const short ci2;
+const long ci3;
+const long long ci4;
+const int8_t ci5;
+const int16_t ci6;
+const int32_t ci7;
+const int64_t ci8;
+
+float16x8_t floatvec;
+int16x8_t intvec;
+
+void test(void)
+{
+    /* Test a few different supported ways of passing an int value.  The
+    intrinsic vmulq was chosen arbitrarily, but it is representative of
+    all intrinsics that take a non-const scalar value.  */
+    intvec = vmulq(intvec, 2);
+    intvec = vmulq(intvec, (int32_t) 2);
+    intvec = vmulq(intvec, (short) 2);
+    intvec = vmulq(intvec, i1);
+    intvec = vmulq(intvec, i2);
+    intvec = vmulq(intvec, i3);
+    intvec = vmulq(intvec, i4);
+    intvec = vmulq(intvec, i5);
+    intvec = vmulq(intvec, i6);
+    intvec = vmulq(intvec, i7);
+    intvec = vmulq(intvec, i8);
+
+    /* Test a few different supported ways of passing a float value.  */
+    floatvec = vmulq(floatvec, 0.5);
+    floatvec = vmulq(floatvec, 0.5f);
+    floatvec = vmulq(floatvec, (__fp16) 0.5);
+    floatvec = vmulq(floatvec, f1);
+    floatvec = vmulq(floatvec, f2);
+    floatvec = vmulq(floatvec, f3);
+    floatvec = vmulq(floatvec, f4);
+    floatvec = vmulq(floatvec, f5);
+    floatvec = vmulq(floatvec, f6);
+    floatvec = vmulq(floatvec, 0.15f

[committed] testsuite: Add another testcase from PR107131

2023-01-13 Thread Jakub Jelinek via Gcc-patches
Hi!

This one is hand-reduced from problematic code in the optimized dump
that used to be miscompiled during combine starting with
r12-303 and was fixed with r13-3530, aka the PR107172 fix.

2023-01-13  Jakub Jelinek  

PR target/107131
* gcc.c-torture/execute/pr107131.c: New test.

--- gcc/testsuite/gcc.c-torture/execute/pr107131.c.jj   2023-01-13 
17:29:42.713370475 +0100
+++ gcc/testsuite/gcc.c-torture/execute/pr107131.c  2023-01-13 
17:29:33.366503884 +0100
@@ -0,0 +1,18 @@
+/* PR target/107131 */
+
+__attribute__((noipa)) unsigned long long
+foo (unsigned char o)
+{
+  unsigned long long t1 = -(long long) (o == 0);
+  unsigned long long t2 = -(long long) (t1 > 10439075533421201520ULL);
+  unsigned long long t3 = -(long long) (t1 <= t2);
+  return t3;
+}
+
+int
+main ()
+{
+  if (foo (0) != -1ULL)
+__builtin_abort ();
+  return 0;
+}

Jakub



Re: [committed] libstdc++: Do not include <system_error> in concurrency headers

2023-01-13 Thread Jonathan Wakely via Gcc-patches
On Fri, 13 Jan 2023 at 15:08, Rainer Orth wrote:
>
> Hi Jonathan,
>
> > The <mutex>, <condition_variable>, and <shared_mutex> headers use
> > std::errc constants, but don't use std::system_error itself. They only
> > use the __throw_system_error(int) function, which is defined in
> > <bits/functexcept.h>.
> >
> > By including the header for the errc constants instead of the whole of
> > <system_error> we avoid depending on the whole std::string definition.
>
> it seems this patch broke many tests on Solaris, e.g.
>
> FAIL: 29_atomics/atomic/requirements/types_neg.cc (test for excess errors)
> Excess errors:
> /var/gcc/regression/master/11.4-gcc/build/i386-pc-solaris2.11/libstdc++-v3/include/bits/std_mutex.h:157:
>  error: 'EBUSY' was not declared in this scope
>

Oops, testing this patch now.
commit 58e6fe334e55f56eeb0211c10697be4d4a8c52b6
Author: Jonathan Wakely 
Date:   Fri Jan 13 16:37:57 2023

libstdc++: Add  to 

This needs to be included explicitly now that we don't include all of
<system_error> here.

libstdc++-v3/ChangeLog:

* include/bits/std_mutex.h: Include .

diff --git a/libstdc++-v3/include/bits/std_mutex.h 
b/libstdc++-v3/include/bits/std_mutex.h
index bc515358d23..f74ddc4123a 100644
--- a/libstdc++-v3/include/bits/std_mutex.h
+++ b/libstdc++-v3/include/bits/std_mutex.h
@@ -36,6 +36,7 @@
 # include 
 #else
 
+#include  // EBUSY
 #include 
 #include 
 


Re: [PATCH] Use cxx11 abi in versioned namespace

2023-01-13 Thread Jonathan Wakely via Gcc-patches
On Mon, 5 Dec 2022 at 21:14, François Dumont via Libstdc++
 wrote:
>
> I just rebased this patch.
>
> All good apart from the to_chars/from_chars symbols issue.
>
> François
>
>
> On 11/10/22 19:28, François Dumont wrote:
> > Hi
> >
> > Now that pretty printer is fixed (once patch validated) I'd like
> > to propose this patch again.
> >
> > Note that I'am adding a check on pretty printer with a std::any on
> > a std::wstring. I did so because of the FIXME in printers.py which is
> > dealing with 'std::string' explicitely. Looks like in my case, where
> > there is no 'std::string' but just a 'std::__8::string' we do not need
> > the workaround.
> >
> > Once again I am attaching also the version namespace bump patch as
> > I think that adopting the cxx11 abi in this mode is a good enough
> > reason to bump it. If you agree let me know if I should squash the
> > commits before pushing.

Yes, I think this change would justify bumping the version.

> >
> > libstdc++: [_GLIBCXX_INLINE_VERSION] Use cxx11 abi
> >
> > Use cxx11 abi when activating versioned namespace mode.
> >
> > libstdc++-v3/ChangeLog:
> >
> > * acinclude.m4 [GLIBCXX_ENABLE_LIBSTDCXX_DUAL_ABI]:
> > Default to "new" libstdcxx abi.
> > * config/locale/dragonfly/monetary_members.cc
> > [!_GLIBCXX_USE_DUAL_ABI]: Define money_base
> > members.
> > * config/locale/generic/monetary_members.cc
> > [!_GLIBCXX_USE_DUAL_ABI]: Likewise.
> > * config/locale/gnu/monetary_members.cc
> > [!_GLIBCXX_USE_DUAL_ABI]: Likewise.
> > * config/locale/gnu/numeric_members.cc
> > [!_GLIBCXX_USE_DUAL_ABI](__narrow_multibyte_chars): Define.
> > * configure: Regenerate.
> > * include/bits/c++config
> > [_GLIBCXX_INLINE_VERSION](_GLIBCXX_NAMESPACE_CXX11,
> > _GLIBCXX_BEGIN_NAMESPACE_CXX11): Define
> > empty.
> > [_GLIBCXX_INLINE_VERSION](_GLIBCXX_END_NAMESPACE_CXX11,
> > _GLIBCXX_DEFAULT_ABI_TAG): Likewise.
> > * python/libstdcxx/v6/printers.py
> > (StdStringPrinter::__init__): Set self.new_string to True
> > when std::__8::basic_string type is
> > found.
> > * src/Makefile.am
> > [ENABLE_SYMVERS_GNU_NAMESPACE](ldbl_alt128_compat_sources): Define empty.
> > * src/Makefile.in: Regenerate.
> > * src/c++11/Makefile.am (cxx11_abi_sources): Rename into...
> > (dual_abi_sources): ...this, new. Also move several
> > sources to...
> > (sources): ...this.
> > (extra_string_inst_sources): Move several sources to...
> > (inst_sources): ...this.

I don't understand this part. Moving those files to sources and
inst_sources will mean they are always compiled, right? But we don't
want them compiled for --disable-libstdcxx-dual-abi

In those files you've changed the #if conditions so they are empty if
the dual ABI is disabled, but why do they need to be compiled at all?
This isn't clear from the patch or the description or the changelog.


> > * src/c++11/Makefile.in: Regenerate.
> > * src/c++11/cow-fstream-inst.cc [_GLIBCXX_USE_CXX11_ABI]:
> > Skip definitions.
> > * src/c++11/cow-locale_init.cc [_GLIBCXX_USE_CXX11_ABI]:
> > Skip definitions.
> > * src/c++11/cow-sstream-inst.cc [_GLIBCXX_USE_CXX11_ABI]:
> > Skip definitions.
> > * src/c++11/cow-stdexcept.cc
> > [_GLIBCXX_USE_CXX11_ABI](error_category::_M_message):
> > Skip definition.
> > [_GLIBCXX_USE_CXX11_ABI]: Skip Transaction Memory TS
> > definitions.
> > * src/c++11/cow-string-inst.cc [_GLIBCXX_USE_CXX11_ABI]:
> > Skip definitions.
> > * src/c++11/cow-string-io-inst.cc
> > [_GLIBCXX_USE_CXX11_ABI]: Skip definitions.
> > * src/c++11/cow-wstring-inst.cc [_GLIBCXX_USE_CXX11_ABI]:
> > Skip definitions.
> > * src/c++11/cow-wstring-io-inst.cc
> > [_GLIBCXX_USE_CXX11_ABI]: Skip definitions.
> > * src/c++11/cxx11-hash_tr1.cc [!_GLIBCXX_USE_CXX11_ABI]:
> > Skip definitions.
> > * src/c++11/cxx11-ios_failure.cc
> > [!_GLIBCXX_USE_CXX11_ABI]: Skip definitions.
> > [!_GLIBCXX_USE_DUAL_ABI] (__ios_failure): Remove.

For this file I think your changes make sense, because the definitions
of the gcc4-compatible and cxx11 ABI are different, we're not just
compiling it twice.


> > * src/c++11/cxx11-locale-inst.cc: Cleanup, just include
> > locale-inst.cc.
> > * src/c++11/cxx11-stdexcept.cc [!_GLIBCXX_USE_CXX11_ABI]:
> > Skip definitions.
> > [!_GLIBCXX_USE_DUAL_ABI](__cow_string): Remove.
> > * src/c++11/cxx11-wlocale-inst.cc
> > [!_GLIBCXX_USE_CXX11_ABI]: Skip definitions.
> > * src/c++11/fstream-inst.cc [!_GLIBCXX_USE_CXX11_ABI]:
> > Skip definitions
> > * src/c++11/locale-inst-numeric.h
> > [!_GLIBCXX_USE_DUAL_ABI](std::use_facet>,
> > std::

[PATCH] arm: Make MVE masked stores read memory operand [PR 108177]

2023-01-13 Thread Andre Simoes Dias Vieira via Gcc-patches
Hi,

This patch adds the memory operand of MVE masked stores as an input operand,
to mimic the 'partial' writes and prevent erroneous write-after-write
optimizations as described in the PR.
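
As a reduced illustration of the problem (assuming the standard MVE
intrinsics from arm_mve.h and compiling with MVE enabled, e.g.
-march=armv8.1-m.main+mve):

```
#include <arm_mve.h>

/* Each predicated store writes only the lanes enabled by its predicate.
   If the store pattern does not also read the memory operand, the first
   store looks fully overwritten by the second one and could wrongly be
   removed as a dead store.  */
void
f (int8_t *p, int8x16_t a, int8x16_t b, mve_pred16_t pa, mve_pred16_t pb)
{
  vstrbq_p_s8 (p, a, pa);
  vstrbq_p_s8 (p, b, pb);
}
```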

Regression tested on arm-none-eabi for armv8.1-m.main+mve.fp. 

OK for trunk?

gcc/ChangeLog:

PR target/108177
* config/arm/mve.md (mve_vstrbq_p_, mve_vstrhq_p_fv8hf,
mve_vstrhq_p_, mve_vstrwq_p_v4si): Add memory operand
as input operand.

gcc/testsuite/ChangeLog:

*   gcc.target/arm/mve/pr108177-1-run.c: New test.
*   gcc.target/arm/mve/pr108177-1.c: New test.
*   gcc.target/arm/mve/pr108177-10-run.c: New test.
*   gcc.target/arm/mve/pr108177-10.c: New test.
*   gcc.target/arm/mve/pr108177-11-run.c: New test.
*   gcc.target/arm/mve/pr108177-11.c: New test.
*   gcc.target/arm/mve/pr108177-12-run.c: New test.
*   gcc.target/arm/mve/pr108177-12.c: New test.
*   gcc.target/arm/mve/pr108177-13-run.c: New test.
*   gcc.target/arm/mve/pr108177-13.c: New test.
*   gcc.target/arm/mve/pr108177-14-run.c: New test.
*   gcc.target/arm/mve/pr108177-14.c: New test.
*   gcc.target/arm/mve/pr108177-2-run.c: New test.
*   gcc.target/arm/mve/pr108177-2.c: New test.
*   gcc.target/arm/mve/pr108177-3-run.c: New test.
*   gcc.target/arm/mve/pr108177-3.c: New test.
*   gcc.target/arm/mve/pr108177-4-run.c: New test.
*   gcc.target/arm/mve/pr108177-4.c: New test.
*   gcc.target/arm/mve/pr108177-5-run.c: New test.
*   gcc.target/arm/mve/pr108177-5.c: New test.
*   gcc.target/arm/mve/pr108177-6-run.c: New test.
*   gcc.target/arm/mve/pr108177-6.c: New test.
*   gcc.target/arm/mve/pr108177-7-run.c: New test.
*   gcc.target/arm/mve/pr108177-7.c: New test.
*   gcc.target/arm/mve/pr108177-8-run.c: New test.
*   gcc.target/arm/mve/pr108177-8.c: New test.
*   gcc.target/arm/mve/pr108177-9-run.c: New test.
*   gcc.target/arm/mve/pr108177-9.c: New test.
*   gcc.target/arm/mve/pr108177-main.x: New test include.
*   gcc.target/arm/mve/pr108177.x: New test include.


pr108177.patch
Description: pr108177.patch


[Committed] arm: Add cde feature support for Cortex-M55 CPU.

2023-01-13 Thread Srinath Parvathaneni via Gcc-patches
Hi,

This patch adds CDE feature (optional) support for the Cortex-M55 CPU; please
refer to [1] for more details. To use this feature we need to specify +cdecpN
(e.g. -mcpu=cortex-m55+cdecp0), where N is the coprocessor number, 0 to 7.
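Multiple coprocessors should be selectable by chaining the options, e.g.
-mcpu=cortex-m55+cdecp0+cdecp1.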

Bootstrapped for arm-none-linux-gnueabihf target, regression tested
on arm-none-eabi target and found no regressions.

[1] https://developer.arm.com/documentation/101051/0101/?lang=en (version: 
r1p1).

Ok for master?

Regards,
Srinath.

gcc/ChangeLog:

2023-01-13  Srinath Parvathaneni  

* common/config/arm/arm-common.cc (arm_canon_arch_option_1): Ignore cde
options for -mlibarch.
* config/arm/arm-cpus.in (begin cpu cortex-m55): Add cde options.
* doc/invoke.texi (CDE): Document options for Cortex-M55 CPU.

gcc/testsuite/ChangeLog:

2023-01-13  Srinath Parvathaneni  

* gcc.target/arm/multilib.exp: Add multilib tests for Cortex-M55 CPU.


### Attachment also inlined for ease of reply###


diff --git a/gcc/common/config/arm/arm-common.cc 
b/gcc/common/config/arm/arm-common.cc
index 
c38812f1ea6a690cd19b0dc74d963c4f5ae155ca..9ed6830417bbcc984f67237fe30beb5ebec76c00
 100644
--- a/gcc/common/config/arm/arm-common.cc
+++ b/gcc/common/config/arm/arm-common.cc
@@ -685,8 +685,10 @@ arm_canon_arch_option_1 (int argc, const char **argv, bool 
arch_for_multilib)
   auto_sbitmap target_isa (isa_num_bits);
   auto_sbitmap base_isa (isa_num_bits);
   auto_sbitmap fpu_isa (isa_num_bits);
+  auto_sbitmap ignore_multilib_isa (isa_num_bits);
 
   bitmap_clear (fpu_isa);
+  bitmap_clear (ignore_multilib_isa);
 
   const arch_option *selected_arch = NULL;
 
@@ -719,15 +721,6 @@ arm_canon_arch_option_1 (int argc, const char **argv, bool 
arch_for_multilib)
   arm_initialize_isa (target_isa, selected_arch->common.isa_bits);
   arm_parse_option_features (target_isa, &selected_arch->common,
 strchr (arch, '+'));
-  if (arch_for_multilib)
-   {
- const enum isa_feature removable_bits[] = {ISA_IGNORE_FOR_MULTILIB,
-isa_nobit};
- sbitmap isa_bits = sbitmap_alloc (isa_num_bits);
- arm_initialize_isa (isa_bits, removable_bits);
- bitmap_and_compl (target_isa, target_isa, isa_bits);
-   }
-
   if (fpu && strcmp (fpu, "auto") != 0)
{
  /* We assume that architectures do not have any FPU bits
@@ -806,6 +799,16 @@ arm_canon_arch_option_1 (int argc, const char **argv, bool 
arch_for_multilib)
   bitmap_clear_bit (target_isa, isa_bit_vfpv2);
 }
 
+  /* Here we remove feature isa bits from -mlibarch string which are not
+ necessary for multilib string comparsion.  */
+  if ((arch || cpu) && arch_for_multilib)
+{
+  const enum isa_feature removable_bits[] = {ISA_IGNORE_FOR_MULTILIB,
+isa_nobit};
+  arm_initialize_isa (ignore_multilib_isa, removable_bits);
+  bitmap_and_compl (target_isa, target_isa, ignore_multilib_isa);
+}
+
   /* If we don't have a selected architecture by now, something's
  badly wrong.  */
   gcc_assert (selected_arch);
diff --git a/gcc/config/arm/arm-cpus.in b/gcc/config/arm/arm-cpus.in
index 
e89106c51b41d709b2159073da3273423af537f8..579cf35636632869947f37753bbf0ba79277808c
 100644
--- a/gcc/config/arm/arm-cpus.in
+++ b/gcc/config/arm/arm-cpus.in
@@ -1644,6 +1644,14 @@ begin cpu cortex-m55
  option nomve remove mve mve_float
  option nofp remove ALL_FP mve_float
  option nodsp remove MVE mve_float
+ option cdecp0 add cdecp0
+ option cdecp1 add cdecp1
+ option cdecp2 add cdecp2
+ option cdecp3 add cdecp3
+ option cdecp4 add cdecp4
+ option cdecp5 add cdecp5
+ option cdecp6 add cdecp6
+ option cdecp7 add cdecp7
  isa quirk_no_asmcpu quirk_vlldm
  costs v7m
  vendor 41
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 
701c228bd0a824dee52c7d4e23c9f687000cd3d6..0b0bdb63436e5a63b12a06c3c6cd5544cc54c808
 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -22168,6 +22168,10 @@ floating-point instructions on @samp{cortex-m55}.
 Disable the M-Profile Vector Extension (MVE) single precision floating-point
 instructions on @samp{cortex-m55}.
 
+@item +cdecp0, +cdecp1, ... , +cdecp7
+Enable the Custom Datapath Extension (CDE) on selected coprocessors according
+to the numbers given in the options in the range 0 to 7 on @samp{cortex-m55}.
+
 @item  +nofp
 Disables the floating-point instructions on @samp{arm9e},
 @samp{arm946e-s}, @samp{arm966e-s}, @samp{arm968e-s}, @samp{arm10e},
diff --git a/gcc/testsuite/gcc.target/arm/multilib.exp 
b/gcc/testsuite/gcc.target/arm/multilib.exp
index 
f903f028a83f884bdc1521f810f7e70e4130a715..e9c9b9cf885155d9a7e4106161c570df1b57ffab
 100644
--- a/gcc/testsuite/gcc.target/arm/multilib.exp
+++ b/gcc/testsuite/gcc.target/arm/multilib.exp
@@ -854,6 +854,18 @@ if {[multilib_config "rmprofile"] } {
{-mcpu=cortex-m55+nomve+nofp -mfpu=auto -mflo

Re: [PATCH v3 2/2] aarch64: Fix bit-field alignment in param passing [PR105549]

2023-01-13 Thread Jakub Jelinek via Gcc-patches
On Wed, Jan 11, 2023 at 03:18:06PM +0100, Christophe Lyon via Gcc-patches wrote:
> While working on enabling DFP for AArch64, I noticed new failures in
> gcc.dg/compat/struct-layout-1.exp (t028) which were not actually
> caused by DFP types handling. These tests are generated during 'make
> check' and enabling DFP made generation different (not sure if new
> non-DFP tests are generated, or if existing ones are generated
> differently, the tests in question are huge and difficult to compare).
> 
> Anyway, I reduced the problem to what I attach at the end of the new
> gcc.target/aarch64/aapcs64/va_arg-17.c test and rewrote it in the same
> scheme as other va_arg* AArch64 tests.  Richard Sandiford further
> reduced this to a non-vararg function, added as a second testcase.
> 
> This is a tough case mixing bit-fields and alignment, where
> aarch64_function_arg_alignment did not follow what its descriptive
> comment says: we want to use the natural alignment of the bit-field
> type only if the user didn't reduce the alignment for the bit-field
> itself.
> 
> The patch also adds a comment and assert that would help someone who
> has to look at this area again.
> 
> The fix would be very small, except that this introduces a new ABI
> break, and we have to warn about that.  Since this actually fixes a
> problem introduced in GCC 9.1, we keep the old computation to detect
> when we now behave differently.
> 
> This patch adds two new tests (va_arg-17.c and
> pr105549.c). va_arg-17.c contains the reduced offending testcase from
> struct-layout-1.exp for reference.  We update some tests introduced by
> the previous patch, where parameters with bit-fields and packed
> attribute now emit a different warning.

I'm seeing
+FAIL: g++.target/aarch64/bitfield-abi-warning-align16-O2.C 
scan-assembler-times and\\tw0, w1, 1 10
+FAIL: g++.target/aarch64/bitfield-abi-warning-align32-O2.C 
scan-assembler-times and\\tw0, w1, 1 10
+FAIL: g++.target/aarch64/bitfield-abi-warning-align8-O2.C scan-assembler-times 
and\\tw0, w0, 1 11
+FAIL: g++.target/aarch64/bitfield-abi-warning-align8-O2.C scan-assembler-times 
and\\tw0, w1, 1 18
+FAIL: gcc.target/aarch64/sve/pcs/struct_3_128.c -march=armv8.2-a+sve (internal 
compiler error: in aarch64_layout_arg, at config/aarch64/aarch64.cc:7696)
+FAIL: gcc.target/aarch64/sve/pcs/struct_3_128.c -march=armv8.2-a+sve (test for 
excess errors)
+FAIL: gcc.target/aarch64/sve/pcs/struct_3_256.c -march=armv8.2-a+sve (internal 
compiler error: in aarch64_layout_arg, at config/aarch64/aarch64.cc:7696)
+FAIL: gcc.target/aarch64/sve/pcs/struct_3_256.c -march=armv8.2-a+sve (test for 
excess errors)
+FAIL: gcc.target/aarch64/sve/pcs/struct_3_512.c -march=armv8.2-a+sve (internal 
compiler error: in aarch64_layout_arg, at config/aarch64/aarch64.cc:7696)
+FAIL: gcc.target/aarch64/sve/pcs/struct_3_512.c -march=armv8.2-a+sve (test for 
excess errors)
regressions with this change.

aarch64.cc:7696 is for me the newly added:

> +  gcc_assert (alignment <= 16 * BITS_PER_UNIT
> +   && (!alignment || abi_break < alignment)
> +   && (!abi_break_packed || alignment < abi_break_packed));

assert.
Details in
https://kojipkgs.fedoraproject.org//work/tasks/2857/96062857/build.log
(configure line etc.), plus if you
wget https://kojipkgs.fedoraproject.org//work/tasks/2857/96062857/build.log
sed -n '/^begin /,/^end/p' build.log | uudecode
you get a compressed tarball with the testsuite *.log files.

Jakub



Re: [PATCH 2/2] Corrected pr25521.c target matching.

2023-01-13 Thread Cupertino Miranda via Gcc-patches


Cupertino Miranda writes:

>> On 12/2/22 10:52, Cupertino Miranda via Gcc-patches wrote:
>>> This commit is a follow up of bugzilla #107181.
>>> The commit /a0aafbc/ changed the default implementation of the
>>> SELECT_SECTION hook in order to match clang/llvm behaviour w.r.t the
>>> placement of `const volatile' objects.
>>> However, the following targets use target-specific selection functions
>>> and they choke on the testcase pr25521.c:
>>>   *rx - target sets its const variables as '.section C,"a",@progbits'.
>> That's presumably a constant section.  We should instead twiddle the test to
>> recognize that section.
>
> Although @progbits is indeed a constant section, I believe it is
> more interesting to detect if the `rx' starts selecting more
> standard sections instead of the current @progbits.
> That was the reason why I opted to XFAIL instead of PASSing it.
> Can I keep it as such ?
>
Jeff: Can you please give me an answer on this ?

Cupertino

>>
>>>   *powerpc - its 32bit version is eager to allocate globals in .sdata
>>>  sections.
>>> Normally, one can expect for the variable to be allocated in .srodata,
>>> however, in case of powerpc-*-* or powerpc64-*-* (with -m32)
>>> 'targetm.have_srodata_section == false' and the code in
>>> categorize_decl_for_section(varasm.cc), forces it to allocate in .sdata.
>>>/* If the target uses small data sections, select it.  */
>>>else if (targetm.in_small_data_p (decl))
>>>  {
>>>if (ret == SECCAT_BSS)
>>> ret = SECCAT_SBSS;
>>>else if (targetm.have_srodata_section && ret == SECCAT_RODATA)
>>> ret = SECCAT_SRODATA;
>>>else
>>> ret = SECCAT_SDATA;
>>>  }
>> I'd just skip the test for 32bit ppc.  There should be suitable 
>> effective-target
>> tests you can use.
>>
>> jeff


Re: [committed] libstdc++: Do not include <system_error> in concurrency headers

2023-01-13 Thread Rainer Orth
Hi Jonathan,

> The <mutex>, <condition_variable>, and <shared_mutex> headers use
> std::errc constants, but don't use std::system_error itself. They only
> use the __throw_system_error(int) function, which is defined in
> <bits/functexcept.h>.
>
> By including the header for the errc constants instead of the whole of
> <system_error> we avoid depending on the whole std::string definition.

it seems this patch broke many tests on Solaris, e.g.

FAIL: 29_atomics/atomic/requirements/types_neg.cc (test for excess errors)
Excess errors:
/var/gcc/regression/master/11.4-gcc/build/i386-pc-solaris2.11/libstdc++-v3/include/bits/std_mutex.h:157:
 error: 'EBUSY' was not declared in this scope

Rainer

-- 
-
Rainer Orth, Center for Biotechnology, Bielefeld University


Re: [PATCH 1/2] select .rodata for const volatile variables.

2023-01-13 Thread Cupertino Miranda via Gcc-patches


Richard Biener writes:

> On Mon, Dec 5, 2022 at 7:07 PM Jeff Law via Gcc-patches
>  wrote:
>>
>>
>>
>> On 12/2/22 10:52, Cupertino Miranda via Gcc-patches wrote:
>> > Changed target code to select .rodata section for 'const volatile'
>> > defined variables.
>> > This change is in the context of the bugzilla #107181.
>> >
>> > gcc/ChangeLog:
>> >
>> >   v850.c(v850_select_section): Changed function.
>> I'm not sure this is safe/correct.  ISTM that you need to look at the
>> underlying TREE_TYPE to check for const-volatile rather than
>> TREE_SIDE_EFFECTS.
>
> Just to quote tree.h:
>
> /* In any expression, decl, or constant, nonzero means it has side effects or
>reevaluation of the whole expression could produce a different value.
>This is set if any subexpression is a function call, a side effect or a
>reference to a volatile variable.  In a ..._DECL, this is set only if the
>declaration said `volatile'.  This will never be set for a constant.  */
> #define TREE_SIDE_EFFECTS(NODE) \
>   (NON_TYPE_CHECK (NODE)->base.side_effects_flag)
>
> so if exp is a decl then that's the volatile check.
>

Thank you Richard for the review.
Jeff: Can you please let me know whether Richard's comments address your
concerns?

Cupertino

>> Of secondary importance is the ChangeLog.  Just saying "Changed
>> function" provides no real information.  Something like this would be
>> better:
>>
>> * config/v850/v850.c (v850_select_section): Put const volatile
>> objects into read-only sections.
>>
>>
>> Jeff
>>
>>
>>
>>
>> > ---
>> >   gcc/config/v850/v850.cc | 1 -
>> >   1 file changed, 1 deletion(-)
>> >
>> > diff --git a/gcc/config/v850/v850.cc b/gcc/config/v850/v850.cc
>> > index c7d432990ab..e66893fede4 100644
>> > --- a/gcc/config/v850/v850.cc
>> > +++ b/gcc/config/v850/v850.cc
>> > @@ -2865,7 +2865,6 @@ v850_select_section (tree exp,
>> >   {
>> > int is_const;
>> > if (!TREE_READONLY (exp)
>> > -   || TREE_SIDE_EFFECTS (exp)
>> > || !DECL_INITIAL (exp)
>> > || (DECL_INITIAL (exp) != error_mark_node
>> > && !TREE_CONSTANT (DECL_INITIAL (exp


Re: [PATCH 2/2] libstdc++: Fix a few !HOSTED test regressions

2023-01-13 Thread Jonathan Wakely via Gcc-patches
On Tue, 10 Jan 2023 at 10:03, Arsen Arsenović via Libstdc++
 wrote:
>
> libstdc++-v3/ChangeLog:
>
> * testsuite/20_util/to_chars/version.cc: Mark hosted-only.
> * testsuite/20_util/uses_allocator/lwg3677.cc: Ditto.
> * testsuite/20_util/weak_ptr/cons/self_move.cc: Ditto.
> * testsuite/std/ranges/adaptors/as_rvalue/1.cc: Replace usage of
> std::make_unique with a freestanding-compatible wrapper around
> unique_ptr.
> * testsuite/21_strings/basic_string_view/operations/contains/char.cc:
> Don't test for presence of __cpp_lib_string_contains on !HOSTED.
> * 
> testsuite/21_strings/basic_string_view/operations/contains/char/2.cc:
> Ditto.
> * testsuite/std/ranges/version_c++23.cc: Don't test for presence
> of __cpp_lib_ranges in !HOSTED.

Tested powerpc64le-linux and pushed to trunk, thanks!



Re: [PATCH 1/2] libstdc++: Enable string_view in freestanding

2023-01-13 Thread Jonathan Wakely via Gcc-patches
On Tue, 10 Jan 2023 at 16:31, Arsen Arsenović via Libstdc++
 wrote:
>
> Hi Jonathan,
>
> Jonathan Wakely  writes:
>
> > Sorry for the top post.
> >
> > -#define __cpp_lib_string_contains 202011L
> > +#if _GLIBCXX_HOSTED
> > +  // This FTM is not hosted as it also implies matching 
> > support,
> > +  // and  is omitted from the freestanding subset.
> > +# define __cpp_lib_string_contains 202011L
> > +#endif // HOSTED
> >
> > That should say "not freestanding", right?
>
> Whoops, yes.  Here's the fixed-up patch.

Tested powerpc64le-linux and pushed to trunk, thanks!



Re: nvptx: Avoid deadlock in 'cuStreamAddCallback' callback, error case (was: [PATCH 6/6, OpenACC, libgomp] Async re-work, nvptx changes)

2023-01-13 Thread Thomas Schwinge
Hi!

On 2023-01-13T21:17:43+0800, Chung-Lin Tang  wrote:
> On 2023/1/12 9:51 PM, Thomas Schwinge wrote:
>> In my case, 'cuda_callback_wrapper' (expectedly) gets invoked with
>> 'res != CUDA_SUCCESS' ("an illegal memory access was encountered").
>> When we invoke 'GOMP_PLUGIN_fatal', this attempts to shut down the device
>> (..., which deadlocks); that's generally problematic: per
>> https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g613d97a277d7640f4cb1c03bd51c2483
>> "'cuStreamAddCallback' [...] Callbacks must not make any CUDA API calls".
>
> I remember running into this myself when first creating this async support
> (IIRC in my case it was cuFree()-ing something) yet you've found another 
> mistake here! :)

;-)

>> Given that eventually we must reach a host/device synchronization point
>> (latest when the device is shut down at program termination), and the
>> non-'CUDA_SUCCESS' will be upheld until then, it does seem safe to
>> replace this 'GOMP_PLUGIN_fatal' with 'GOMP_PLUGIN_error' as per the
>> "nvptx: Avoid deadlock in 'cuStreamAddCallback' callback, error case"
>> attached.  OK to push?
>
> I think this patch is fine. Actual approval powers are your's or Tom's :)

ACK.  I'll let it sit for some more time before 'git push'.


>> (Might we even skip 'GOMP_PLUGIN_error' here, understanding that the
>> error will be caught and reported at the next host/device synchronization
>> point?  But I've not verified that.)
>
> Actually, the CUDA driver API docs are a bit vague on what exactly this
> CUresult arg to the callback actually means. The 'res != CUDA_SUCCESS' 
> handling
> here was basically just generic handling.

I suppose this really is just for its own use: for example, skip certain
things in presence of pre-existing error?

> I am not really sure what is the
> true right thing to do here (is the error still retained by CUDA after the 
> callback
> completes?)

Indeed the latter is what I do observe:

  GOMP_OFFLOAD_openacc_async_exec: prepare mappings
  nvptx_exec: kernel main$_omp_fn$0: launch gangs=1, workers=1, vectors=32
  nvptx_exec: kernel main$_omp_fn$0: finished

libgomp: cuMemcpyDtoHAsync_v2 error: an illegal memory access was 
encountered

libgomp:
libgomp: Copying of dev object [0x7f9a4500..0x7f9a4528) to host 
object [0x1d89350..0x1d89378) failed
cuda_callback_wrapper error: an illegal memory access was encountered

libgomp: cuStreamDestroy error: an illegal memory access was encountered

libgomp: cuMemFree_v2 error: an illegal memory access was encountered

libgomp: device finalization failed

Here, after the 'async' OpenACC 'parallel' a 'copyout' gets enqueued,
thus 'cuMemcpyDtoHAsync_v2', which is where we first get the device-side
fault reported (all as expected).  Then -- CUDA-internally
multi-threaded, I suppose (thus the mangled printing) -- we print the
'Copying [...] failed' error plus get 'cuda_callback_wrapper' invoked.
This receives the previous 'CUresult' as seen, and then the error is
still visible at device shut-down, as shown by the following reports.
(This makes sense, as the 'CUcontext' does not magically recover.)

Also, per the CUDA driver API documentation,
"In the event of a device error, all subsequently executed callbacks will
receive an appropriate 'CUresult'".

But again: I'm perfectly fine with the repeated error reporting.


Grüße
 Thomas
-
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 
München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas 
Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht 
München, HRB 106955


Re: [PATCH] c, c++: Allow ignoring -Winit-self through pragmas [PR105593]

2023-01-13 Thread Marek Polacek via Gcc-patches
On Thu, Jan 12, 2023 at 09:49:56PM -0500, Jason Merrill wrote:
> On 1/12/23 19:32, Jakub Jelinek wrote:
> > Hi!
> > 
> > As mentioned in the PR, various x86 intrinsics need to return
> > an uninitialized vector.  Currently they use self initialization
> > to avoid -Wuninitialized warnings, which works fine in C, but
> > doesn't work in C++ where -Winit-self is enabled in -Wall.
> > We don't have an attribute to mark a variable as knowingly
> > uninitialized (the uninitialized attribute exists but means
> > something else, only in the -ftrivial-auto-var-init context),
> > and trying to suppress either -Wuninitialized or -Winit-self
> > inside of the _mm_undefined_ps etc. intrinsic definitions
> > doesn't work, one needs to currently disable through pragmas
> > -Wuninitialized warning at the point where _mm_undefined_ps etc.
> > result is actually used, but that goes against the intent of
> > those intrinsics.
> > 
> > The -Winit-self warning option actually doesn't do any warning,
> > all we do is record a suppression for -Winit-self if !warn_init_self
> > on the decl definition and later look that up in uninit pass.
> > 
> > The following patch changes those !warn_init_self tests which
> > are true only based on the command line option setting, not based
> > on GCC diagnostic pragma overrides to
> > !warning_enabled_at (DECL_SOURCE_LOCATION (decl), OPT_Winit_self)
> > such that it takes them into account.
> > 
> > Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
> 
> OK on Monday if the C maintainers don't comment.

The C changes LGTM, thanks.
 
> > Will post incremental patch for the intrinsic headers.
> > 
> > 2023-01-13  Jakub Jelinek  
> > 
> > PR c++/105593
> > gcc/c/
> > * c-parser.cc (c_parser_initializer): Check warning_enabled_at
> > at the DECL_SOURCE_LOCATION (decl) for OPT_Winit_self instead
> > of warn_init_self.
> > gcc/cp/
> > * decl.cc (cp_finish_decl): Check warning_enabled_at
> > at the DECL_SOURCE_LOCATION (decl) for OPT_Winit_self instead
> > of warn_init_self.
> > gcc/testsuite/
> > * c-c++-common/Winit-self3.c: New test.
> > * c-c++-common/Winit-self4.c: New test.
> > * c-c++-common/Winit-self5.c: New test.
> > 
> > --- gcc/c/c-parser.cc.jj2023-01-11 22:18:25.560492345 +0100
> > +++ gcc/c/c-parser.cc   2023-01-12 15:30:10.460233783 +0100
> > @@ -5701,7 +5701,7 @@ c_parser_initializer (c_parser *parser,
> >   && !DECL_EXTERNAL (decl)
> >   && !TREE_STATIC (decl)
> >   && ret.value == decl
> > - && !warn_init_self)
> > + && !warning_enabled_at (DECL_SOURCE_LOCATION (decl), OPT_Winit_self))
> > suppress_warning (decl, OPT_Winit_self);
> > if (TREE_CODE (ret.value) != STRING_CST
> >   && (TREE_CODE (ret.value) != COMPOUND_LITERAL_EXPR
> > --- gcc/cp/decl.cc.jj   2023-01-04 18:42:24.597997547 +0100
> > +++ gcc/cp/decl.cc  2023-01-12 15:26:01.257817526 +0100
> > @@ -8407,7 +8407,7 @@ cp_finish_decl (tree decl, tree init, bo
> > if (!DECL_EXTERNAL (decl)
> >   && !TREE_STATIC (decl)
> >   && decl == tree_strip_any_location_wrapper (init)
> > - && !warn_init_self)
> > + && !warning_enabled_at (DECL_SOURCE_LOCATION (decl), OPT_Winit_self))
> > suppress_warning (decl, OPT_Winit_self);
> >   }
> > --- gcc/testsuite/c-c++-common/Winit-self3.c.jj 2023-01-12 
> > 15:49:56.759172518 +0100
> > +++ gcc/testsuite/c-c++-common/Winit-self3.c2023-01-12 
> > 15:50:51.512384963 +0100
> > @@ -0,0 +1,36 @@
> > +/* PR c++/105593 */
> > +/* { dg-do compile } */
> > +/* { dg-options "-W -Wall" } */
> > +
> > +void bar (int);
> > +
> > +static inline int
> > +baz (void)
> > +{
> > +#pragma GCC diagnostic push
> > +#pragma GCC diagnostic ignored "-Winit-self"
> > +  int u = u;   /* { dg-bogus "'u' is used uninitialized" } */
> > +#pragma GCC diagnostic pop
> > +  return u;
> > +}
> > +
> > +void
> > +foo (void)
> > +{
> > +  int u = baz ();
> > +  bar (u);
> > +}
> > +
> > +static inline int
> > +qux (void)
> > +{
> > +  int u = u;   /* { dg-warning "'u' is used uninitialized" "" 
> > { target c++ } } */
> > +  return u;/* { dg-message "'u' was declared here" "" { 
> > target c++ } .-1 } */
> > +}
> > +
> > +void
> > +corge (void)
> > +{
> > +  int u = qux ();
> > +  bar (u);
> > +}
> > --- gcc/testsuite/c-c++-common/Winit-self4.c.jj 2023-01-12 
> > 15:50:15.233906776 +0100
> > +++ gcc/testsuite/c-c++-common/Winit-self4.c2023-01-12 
> > 15:50:42.445515372 +0100
> > @@ -0,0 +1,36 @@
> > +/* PR c++/105593 */
> > +/* { dg-do compile } */
> > +/* { dg-options "-W -Wall -Winit-self" } */
> > +
> > +void bar (int);
> > +
> > +static inline int
> > +baz (void)
> > +{
> > +#pragma GCC diagnostic push
> > +#pragma GCC diagnostic ignored "-Winit-self"
> > +  int u = u;   /* { dg-bogus "'u' is used uninitialized" } */
> > +#pragma GCC diagnostic pop
> > +  return u;
> > +}
> > +
> > +void
> > +foo (void)
> > +{
>

Re: nvptx: Avoid deadlock in 'cuStreamAddCallback' callback, error case (was: [PATCH 6/6, OpenACC, libgomp] Async re-work, nvptx changes)

2023-01-13 Thread Chung-Lin Tang via Gcc-patches
Hi Thomas,

On 2023/1/12 9:51 PM, Thomas Schwinge wrote:
> In my case, 'cuda_callback_wrapper' (expectedly) gets invoked with
> 'res != CUDA_SUCCESS' ("an illegal memory access was encountered").
> When we invoke 'GOMP_PLUGIN_fatal', this attempts to shut down the device
> (..., which deadlocks); that's generally problematic: per
> https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g613d97a277d7640f4cb1c03bd51c2483
> "'cuStreamAddCallback' [...] Callbacks must not make any CUDA API calls".

I remember running into this myself when first creating this async support
(IIRC in my case it was cuFree()-ing something) yet you've found another 
mistake here! :) 

> Given that eventually we must reach a host/device synchronization point
> (latest when the device is shut down at program termination), and the
> non-'CUDA_SUCCESS' will be upheld until then, it does seem safe to
> replace this 'GOMP_PLUGIN_fatal' with 'GOMP_PLUGIN_error' as per the
> "nvptx: Avoid deadlock in 'cuStreamAddCallback' callback, error case"
> attached.  OK to push?

I think this patch is fine. Actual approval powers are your's or Tom's :)

> 
> (Might we even skip 'GOMP_PLUGIN_error' here, understanding that the
> error will be caught and reported at the next host/device synchronization
> point?  But I've not verified that.)

Actually, the CUDA driver API docs are a bit vague on what exactly this
CUresult arg to the callback actually means. The 'res != CUDA_SUCCESS' handling
here was basically just generic handling. I am not really sure what is the
true right thing to do here (is the error still retained by CUDA after the 
callback
completes?)

Chung-Lin



Re: [PATCH] arm: unified syntax for libgcc clear_cache

2023-01-13 Thread Richard Earnshaw via Gcc-patches
I've just noticed that this was never committed.  Presumably that's 
because the patch did not apply cleanly.  I've cleaned it up and pushed 
it now.


R.

On 30/09/2022 16:30, Seija Kijin via Gcc-patches wrote:

Yes, please!

On Tue, Sep 6, 2022 at 10:48 AM Kyrylo Tkachov  wrote:


Hi Seija,


-Original Message-
From: Gcc-patches  On Behalf Of Seija Kijin via
Gcc-patches
Sent: Thursday, August 11, 2022 2:36 PM
To: gcc-patches@gcc.gnu.org
Subject: [PATCH] arm: unified syntax for libgcc clear_cache

The patch to convert all thumb1 code in libgcc to unified syntax
omitted changing all swi instructions to the current name: svc.

This patch fixes this case.


This is ok, thanks.
Do you need someone to commit this for you?

Kyrill



---
  libgcc/config/arm/lib1funcs.S | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libgcc/config/arm/lib1funcs.S b/libgcc/config/arm/lib1funcs.S
index 8c39c9f20a2b..19fa1462ccf3 100644
--- a/libgcc/config/arm/lib1funcs.S
+++ b/libgcc/config/arm/lib1funcs.S
@@ -1522,7 +1522,7 @@ LSYM(Lover12):
   add r7, r7, #2
  #endif
   mov r2, #0
- swi 0
+ svc 0
   do_pop {r7}
   RET
   FUNC_END clear_cache


Re: [PATCH] Fix PR rtl-optimization/108274

2023-01-13 Thread Richard Biener via Gcc-patches
On Fri, Jan 13, 2023 at 11:50 AM Eric Botcazou via Gcc-patches
 wrote:
>
> Hi,
>
> unlike other IPA passes, the ICF pass can be run at -O0 and some testcases
> rely on this in the testsuite.  Now it effectively creates a tail call so the
> DF information needs to be updated in this case after epilogue creation.
>
> Tested on x86-64/Linux, OK for mainline?

OK.

Richard.

>
>
> 2023-01-13  Eric Botcazou  
>
> PR rtl-optimization/108274
> * function.cc (thread_prologue_and_epilogue_insns): Also update the
> DF information for calls in a few more cases.
>
> --
> Eric Botcazou


[committed] C-SKY: Fix skip condition for testcase ldbs.c

2023-01-13 Thread Xianmiao Qu via Gcc-patches
gcc/testsuite/
* gcc.target/csky/ldbs.c : Fix exclude-opts, should not
be "*".
---
 gcc/testsuite/gcc.target/csky/ldbs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.target/csky/ldbs.c 
b/gcc/testsuite/gcc.target/csky/ldbs.c
index 27a02543413..7fce1aa3736 100644
--- a/gcc/testsuite/gcc.target/csky/ldbs.c
+++ b/gcc/testsuite/gcc.target/csky/ldbs.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-skip-if "" { *-*-* } { "-mcpu=ck801" "-march=ck801" } { "*" } } */
+/* { dg-skip-if "ck801 does not support ld.bs" { csky-*-* } { "-mcpu=ck801" 
"-march=ck801" } { "" } } */
 /* { dg-csky-options "-O1" } */
 
 int foo (signed char *pb)
-- 
2.32.1 (Apple Git-133)



[committed] C-SKY: Add missing builtin defines for soft float abi.

2023-01-13 Thread Xianmiao Qu via Gcc-patches
The builtin defines for the soft-float ABI are:
'__csky_soft_float_abi__' and '__CSKY_SOFT_FLOAT_ABI__'.
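
A trivial sketch of how the new macros can be consumed (illustrative, not
part of the patch):

```
#ifdef __CSKY_SOFT_FLOAT_ABI__
/* Built with -mfloat-abi=soft (or its alias -msoft-float).  */
#else
/* Built with a hard-float ABI.  */
#endif
```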

gcc/
* config/csky/csky.cc (csky_cpu_cpp_builtins): Add builtin
defines for soft float abi.
---
 gcc/config/csky/csky.cc | 5 +
 1 file changed, 5 insertions(+)

diff --git a/gcc/config/csky/csky.cc b/gcc/config/csky/csky.cc
index b0e50cfef3e..ddc6954dad1 100644
--- a/gcc/config/csky/csky.cc
+++ b/gcc/config/csky/csky.cc
@@ -396,6 +396,11 @@ csky_cpu_cpp_builtins (cpp_reader *pfile)
  builtin_define ("__csky_hard_float_abi__");
  builtin_define ("__CSKY_HARD_FLOAT_ABI__");
}
+  else
+   {
+ builtin_define ("__csky_soft_float_abi__");
+ builtin_define ("__CSKY_SOFT_FLOAT_ABI__");
+   }
   if (TARGET_SINGLE_FPU)
{
  builtin_define ("__csky_hard_float_fpu_sf__");
-- 
2.32.1 (Apple Git-133)



[committed] C-SKY: Fix float abi option in MULTILIB_DEFAULTS.

2023-01-13 Thread Xianmiao Qu via Gcc-patches
msoft-float is an alias of mfloat-abi=soft; use mfloat-abi=soft
in MULTILIB_DEFAULTS to correspond to the option in MULTILIB_OPTIONS,
otherwise the wrong multilib path is found.

gcc/
* config/csky/csky.h (MULTILIB_DEFAULTS): Fix float abi option.
---
 gcc/config/csky/csky.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/config/csky/csky.h b/gcc/config/csky/csky.h
index d21a57a8bb2..a9dc60cecc5 100644
--- a/gcc/config/csky/csky.h
+++ b/gcc/config/csky/csky.h
@@ -952,7 +952,7 @@ while (0)
specially when using MULTILIB_OPTIONS.  */
 #undef MULTILIB_DEFAULTS
 #define MULTILIB_DEFAULTS\
-{"mlittle-endian", "mcpu=ck810f", "msoft-float"}
+{"mlittle-endian", "mcpu=ck810f", "mfloat-abi=soft"}
 
 /* Support for a compile-time default CPU, et cetera.  The rules are:
--with-arch is ignored if -march or -mcpu are specified.
-- 
2.32.1 (Apple Git-133)



[committed] C-SKY: Define SYSROOT_SUFFIX_SPEC.

2023-01-13 Thread Xianmiao Qu via Gcc-patches
The earlier patch
  https://gcc.gnu.org/pipermail/gcc-patches/2021-July/575418.html
refined the way the sysroot suffix is generated, but it cannot find the
right path for all CPUs.  SYSROOT_SUFFIX_SPEC should be defined
to fix this.
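
For example, with the spec below, -mcpu=ck860 -mfloat-abi=hard selects the
/ck860/hard-fp sysroot suffix, and -mbig-endian -mcpu=ck807 selects
/big/ck807.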

gcc/
* config/csky/csky-linux-elf.h (SYSROOT_SUFFIX_SPEC): New.
---
 gcc/config/csky/csky-linux-elf.h | 8 
 1 file changed, 8 insertions(+)

diff --git a/gcc/config/csky/csky-linux-elf.h b/gcc/config/csky/csky-linux-elf.h
index 677c201b218..3f67af64c15 100644
--- a/gcc/config/csky/csky-linux-elf.h
+++ b/gcc/config/csky/csky-linux-elf.h
@@ -65,6 +65,14 @@
 
 #define GLIBC_DYNAMIC_LINKER 
"/lib/ld-linux-cskyv2%{mfloat-abi=hard:-hf}%{mbig-endian:-be}.so.1"
 
+#define SYSROOT_SUFFIX_SPEC\
+  "%{mbig-endian:/big}"\
+  "%{mcpu=ck807*:/ck807}"  \
+  "%{mcpu=ck860*:/ck860}"  \
+  "%{mcpu=ck800*:/ck800}"  \
+  "%{mfloat-abi=softfp:/soft-fp}"  \
+  "%{mfloat-abi=hard:/hard-fp}"
+
 #define LINUX_TARGET_LINK_SPEC "%{h*} %{version:-v}\
%{b}\
%{static:-Bstatic}  \
-- 
2.32.1 (Apple Git-133)



[committed] C-SKY: Fix patterns' condition for ck802 smart mode.

2023-01-13 Thread Xianmiao Qu via Gcc-patches
CK802 smart mode should not be treated as ck801.
It only allocates registers r0-r8 like ck801,
but it supports 32-bit instructions.
This bug causes an ICE when compiling pr43164.c for ck802 big-endian,
/src/gcc/gcc/testsuite/gcc.c-torture/compile/pr43164.c:16:1: error: insn does 
not satisfy its constraints:
(insn 48 28 30 2 (set (reg:SI 0 a0 [230])
(ior:SI (reg:SI 2 a2 [222])
(ashift:SI (const_int 1 [0x1])
(const_int 24 [0x18] 
"/src/gcc/gcc/testsuite/gcc.c-torture/compile/pr43164.c":15:10 224 {smart_bseti}
 (expr_list:REG_DEAD (reg:SI 2 a2 [222])
(nil)))

gcc/
* config/csky/csky.md (smart_bseti): Change condition to 
CSKY_ISA_FEATURE (E1).
(smart_bclri): Likewise.
(fast_bseti): Change condition to CSKY_ISA_FEATURE (E2).
(fast_bclri): Likewise.
(fast_cmpnesi_i): Likewise.
(*fast_cmpltsi_i): Likewise.
(*fast_cmpgeusi_i): Likewise.
---
 gcc/config/csky/csky.md | 14 +++---
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/gcc/config/csky/csky.md b/gcc/config/csky/csky.md
index d7bafdbebb9..8fb832f9e75 100644
--- a/gcc/config/csky/csky.md
+++ b/gcc/config/csky/csky.md
@@ -573,7 +573,7 @@
(ior:SI (match_operand:SI 1 "register_operand"  "0")
(ashift:SI (const_int 1)
   (match_operand:SI 2 "csky_literal_K_operand" "K"]
-  "TARGET_MINI_REGISTERS"
+  "CSKY_ISA_FEATURE (E1)"
   "bseti\t%0, %2"
   [(set_attr "length" "2")])
 
@@ -582,7 +582,7 @@
(ior:SI (match_operand:SI 1 "register_operand"  "0,r")
(ashift:SI (const_int 1)
   (match_operand:SI 2 "csky_literal_K_operand" 
"K,K"]
-  "!TARGET_MINI_REGISTERS"
+  "CSKY_ISA_FEATURE (E2)"
   "bseti\t%0, %1, %2"
   [(set_attr "length" "2,4")])
 
@@ -599,7 +599,7 @@
(and:SI (match_operand:SI 1 "register_operand"  "0")
(not:SI (ashift:SI (const_int 1)
   (match_operand:SI 2 "csky_literal_K_operand" 
"K")]
-  "TARGET_MINI_REGISTERS"
+  "CSKY_ISA_FEATURE (E1)"
   "bclri\t%0, %2"
   [(set_attr "length" "2")])
 
@@ -608,7 +608,7 @@
(and:SI (match_operand:SI 1 "register_operand"  "0,r")
(not:SI (ashift:SI (const_int 1)
   (match_operand:SI 2 "csky_literal_K_operand" 
"K,K")]
-  "!TARGET_MINI_REGISTERS"
+  "CSKY_ISA_FEATURE (E2)"
   "bclri\t%0, %1, %2"
   [(set_attr "length" "2,4")])
 
@@ -3014,7 +3014,7 @@
   [(set (reg:CC CSKY_CC_REGNUM)
(ne:CC (match_operand:SI 0 "register_operand"   "r")
   (match_operand:SI 1 "csky_literal_I_operand" "I")))]
-  "!TARGET_MINI_REGISTERS && CSKY_ISA_FEATURE (E2)"
+  "CSKY_ISA_FEATURE (E2)"
   "cmpnei\t%0, %1"
   [(set_attr "type" "cmp")]
 )
@@ -3056,7 +3056,7 @@
   [(set (reg:CC CSKY_CC_REGNUM)
(lt:CC (match_operand:SI 0 "register_operand""a,r")
   (match_operand:SI 1 "csky_literal_Uk_operand" "J,Uk")))]
-  "!TARGET_MINI_REGISTERS && CSKY_ISA_FEATURE (E2)"
+  "CSKY_ISA_FEATURE (E2)"
   "cmplti\t%0, %1"
   [(set_attr "length" "2,4")
(set_attr "type" "cmp")]
@@ -3149,7 +3149,7 @@
   [(set (reg:CC CSKY_CC_REGNUM)
(geu:CC (match_operand:SI 0 "register_operand""a,r")
(match_operand:SI 1 "csky_literal_Uk_operand" "J,Uk")))]
-  "!TARGET_MINI_REGISTERS && CSKY_ISA_FEATURE (E2)"
+  "CSKY_ISA_FEATURE (E2)"
   "cmphsi\t%0, %1"
   [(set_attr "length" "2,4")
(set_attr "type" "cmp")]
-- 
2.32.1 (Apple Git-133)



[committed] C-SKY: Skip other CPUs if the testcases are only for ck801.

2023-01-13 Thread Xianmiao Qu via Gcc-patches
Refine some testcases for ck801: if a testcase is only for
ck801, add the filename prefix "ck801-" and add dg-skip-if
to skip other CPUs.

gcc/testsuite/
* gcc.target/csky/and3a.c: Rename to ...
* gcc.target/csky/ck801-and.c: ... this.
* gcc.target/csky/constpool-3.c: Rename to ...
* gcc.target/csky/constpool-2.c: ... this, Rename to ...
* gcc.target/csky/constpool-1.c: ... this, Rename to ...
* gcc.target/csky/ck801-constpool.c: ... this, and skip
if the CPU is not ck801.
* gcc.target/csky/ck801-branch.c: Skip if the CPU is not ck801.
---
 .../gcc.target/csky/{and3a.c => ck801-and.c}|  3 ++-
 gcc/testsuite/gcc.target/csky/ck801-branch.c|  3 ++-
 gcc/testsuite/gcc.target/csky/ck801-constpool.c | 17 +
 gcc/testsuite/gcc.target/csky/constpool-1.c |  7 +++
 gcc/testsuite/gcc.target/csky/constpool-2.c |  8 
 gcc/testsuite/gcc.target/csky/constpool-3.c | 15 ---
 6 files changed, 28 insertions(+), 25 deletions(-)
 rename gcc/testsuite/gcc.target/csky/{and3a.c => ck801-and.c} (64%)
 create mode 100644 gcc/testsuite/gcc.target/csky/ck801-constpool.c
 delete mode 100644 gcc/testsuite/gcc.target/csky/constpool-3.c

diff --git a/gcc/testsuite/gcc.target/csky/and3a.c 
b/gcc/testsuite/gcc.target/csky/ck801-and.c
similarity index 64%
rename from gcc/testsuite/gcc.target/csky/and3a.c
rename to gcc/testsuite/gcc.target/csky/ck801-and.c
index 3d706f6eb58..7c79879278d 100644
--- a/gcc/testsuite/gcc.target/csky/and3a.c
+++ b/gcc/testsuite/gcc.target/csky/ck801-and.c
@@ -1,5 +1,6 @@
 /* { dg-do compile } */
-/* { dg-csky-options "-mcpu=ck801 -O1" } */
+/* { dg-skip-if  "test is specific to ck801"  { csky-*-* }  { "*" }  { 
"-mcpu=ck801" }  }  */
+/* { dg-csky-options "-O1" } */
 
 /* Test special code generation patterns for bit operators.  */
 
diff --git a/gcc/testsuite/gcc.target/csky/ck801-branch.c 
b/gcc/testsuite/gcc.target/csky/ck801-branch.c
index 95e69624475..e4dafb37e89 100644
--- a/gcc/testsuite/gcc.target/csky/ck801-branch.c
+++ b/gcc/testsuite/gcc.target/csky/ck801-branch.c
@@ -1,5 +1,6 @@
 /* { dg-do compile } */
-/* { dg-csky-options "-mcpu=ck801 -O1 -fno-reorder-blocks" } */
+/* { dg-skip-if  "test is specific to ck801"  { csky-*-* }  { "*" }  { 
"-mcpu=ck801" }  }  */
+/* { dg-csky-options "-O1 -fno-reorder-blocks" } */
 
 /* Test branch generation on CK801, which cannot rely on assembler
branch relaxation because long branches clobber lr.  */
diff --git a/gcc/testsuite/gcc.target/csky/ck801-constpool.c 
b/gcc/testsuite/gcc.target/csky/ck801-constpool.c
new file mode 100644
index 000..5c92f39d0ab
--- /dev/null
+++ b/gcc/testsuite/gcc.target/csky/ck801-constpool.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-skip-if  "test is specific to ck801"  { csky-*-* }  { "*" }  { 
"-mcpu=ck801" }  }  */
+/* { dg-csky-options "-O1" } */
+
+/* Make sure that constant pools are emitted by the compiler for ck801.
+   If this is deferred to the assembler, the compiler will compute
+   incorrect branch offsets.  */
+
+void f (unsigned int *u, long long int *l, float *f, double *d)
+{
+  *u = 0xdeadbeef;
+  *l = 0xcafef00dc0ffeeULL;
+  *f = 3.14159F;
+  *d = 2.718281828459;
+}
+
+/* { dg-final { scan-assembler-times "\\.long" 6 } } */
diff --git a/gcc/testsuite/gcc.target/csky/constpool-1.c 
b/gcc/testsuite/gcc.target/csky/constpool-1.c
index 5c7cfdc73bd..d654420bc3f 100644
--- a/gcc/testsuite/gcc.target/csky/constpool-1.c
+++ b/gcc/testsuite/gcc.target/csky/constpool-1.c
@@ -1,9 +1,8 @@
 /* { dg-do compile } */
-/* { dg-csky-options "-mcpu=ck801 -O1" } */
+/* { dg-csky-options "-mcpu=ck810f -O1 -mconstpool" } */
 
-/* Make sure that constant pools are emitted by the compiler for ck801.
-   If this is deferred to the assembler, the compiler will compute
-   incorrect branch offsets.  */
+/* Make sure that constant pools are emitted by the compiler when
+   -mconstpool is provided.  */
 
 void f (unsigned int *u, long long int *l, float *f, double *d)
 {
diff --git a/gcc/testsuite/gcc.target/csky/constpool-2.c 
b/gcc/testsuite/gcc.target/csky/constpool-2.c
index d654420bc3f..e3a6e095f5b 100644
--- a/gcc/testsuite/gcc.target/csky/constpool-2.c
+++ b/gcc/testsuite/gcc.target/csky/constpool-2.c
@@ -1,8 +1,8 @@
 /* { dg-do compile } */
-/* { dg-csky-options "-mcpu=ck810f -O1 -mconstpool" } */
+/* { dg-csky-options "-mcpu=ck810f -O1 -mno-constpool" } */
 
-/* Make sure that constant pools are emitted by the compiler when
-   -mconstpool is provided.  */
+/* Make sure that constant pools are not emitted by the compiler when
+   -mno-constpool is provided.  */
 
 void f (unsigned int *u, long long int *l, float *f, double *d)
 {
@@ -12,4 +12,4 @@ void f (unsigned int *u, long long int *l, float *f, double 
*d)
   *d = 2.718281828459;
 }
 
-/* { dg-final { scan-assembler-times "\\.long" 6 } } */
+/* { dg-final { scan-assembler-not "\\.long" } } */
diff --git a/gcc/testsuite/gcc.target/csky/con

[committed] C-SKY: Add conditions for ceil etc patterns.

2023-01-13 Thread Xianmiao Qu via Gcc-patches
The ceil etc. functions can only be inlined as a single instruction when
that instruction is allowed to raise the "inexact" exception.  Without the
added conditions, the "gcc.dg/torture/builtin-fp-int-inexact-c2x.c"
etc. cases fail.
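
A minimal illustration of the distinction (illustrative only):

```
/* With the default -ffp-int-builtin-inexact (or with -fno-trapping-math),
   GCC may expand this to a single round-to-integer instruction even though
   that instruction raises "inexact" for non-integral x.  With
   -fno-fp-int-builtin-inexact -ftrapping-math it must not, which is what
   the added pattern conditions enforce.  */
double
my_ceil (double x)
{
  return __builtin_ceil (x);
}
```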

gcc/
* config/csky/csky_insn_fpuv3.md (lsi2): 
Test
flag_fp_int_builtin_inexact || !flag_trapping_math.
(2): Likewise.
---
 gcc/config/csky/csky_insn_fpuv3.md | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/gcc/config/csky/csky_insn_fpuv3.md 
b/gcc/config/csky/csky_insn_fpuv3.md
index 628bae597ba..7f8f459621e 100644
--- a/gcc/config/csky/csky_insn_fpuv3.md
+++ b/gcc/config/csky/csky_insn_fpuv3.md
@@ -476,14 +476,16 @@
   [(set (match_operand:SI 0 "register_operand" "=v")
(FIX_SU:SI (unspec:F3ANY [(match_operand:F3ANY 1 "register_operand" 
"0")]
   FRM)))]
-  "CSKY_ISA_FEATURE(fpv3_)"
+  "CSKY_ISA_FEATURE(fpv3_)
+   && (flag_fp_int_builtin_inexact || !flag_trapping_math)"
   "fftoi.f.32\t%0, %1"
 )
 
 (define_insn "2"
   [(set (match_operand:F3ANY 0 "register_operand" "=v")
(unspec:F3ANY [(match_operand:F3ANY 1 "register_operand" "0")] FRMF))]
-  "CSKY_ISA_FEATURE(fpv3_)"
+  "CSKY_ISA_FEATURE(fpv3_)
+   && (flag_fp_int_builtin_inexact || !flag_trapping_math)"
   "fftofi.f\t%0, %1"
 )
 
-- 
2.32.1 (Apple Git-133)



Re: [GCC][PATCH v2] arm: Add cde feature support for Cortex-M55 CPU.

2023-01-13 Thread Richard Earnshaw via Gcc-patches




On 31/10/2022 12:38, Srinath Parvathaneni via Gcc-patches wrote:

Hi,


-Original Message-
From: Christophe Lyon 
Sent: Monday, October 17, 2022 2:30 PM
To: Srinath Parvathaneni ; gcc-
patc...@gcc.gnu.org
Cc: Richard Earnshaw 
Subject: Re: [GCC][PATCH] arm: Add cde feature support for Cortex-M55
CPU.

Hi Srinath,


On 10/10/22 10:20, Srinath Parvathaneni via Gcc-patches wrote:

Hi,

This patch adds cde feature (optional) support for Cortex-M55 CPU,
please refer [1] for more details. To use this feature we need to
specify +cdecpN (e.g. -mcpu=cortex-m55+cdecp), where N is the

coprocessor number 0 to 7.


Bootstrapped for arm-none-linux-gnueabihf target, regression tested on
arm-none-eabi target and found no regressions.

[1] https://developer.arm.com/documentation/101051/0101/?lang=en

(version: r1p1).


Ok for master?

Regards,
Srinath.

gcc/ChangeLog:

2022-10-07  Srinath Parvathaneni  

  * common/config/arm/arm-common.cc (arm_canon_arch_option_1):

Ignore cde

  options for mlibarch.
  * config/arm/arm-cpus.in (begin cpu cortex-m55): Add cde options.
  * doc/invoke.texi (CDE): Document options for Cortex-M55 CPU.

gcc/testsuite/ChangeLog:

2022-10-07  Srinath Parvathaneni  

  * gcc.target/arm/multilib.exp: Add multilib tests for Cortex-M55 CPU.


### Attachment also inlined for ease of reply

###



diff --git a/gcc/common/config/arm/arm-common.cc
b/gcc/common/config/arm/arm-common.cc
index


c38812f1ea6a690cd19b0dc74d963c4f5ae155ca..b6f955b3c012475f398382e72
c9a

3966412991ec 100644
--- a/gcc/common/config/arm/arm-common.cc
+++ b/gcc/common/config/arm/arm-common.cc
@@ -753,6 +753,15 @@ arm_canon_arch_option_1 (int argc, const char

**argv, bool arch_for_multilib)

 arm_initialize_isa (target_isa, selected_cpu->common.isa_bits);
 arm_parse_option_features (target_isa, &selected_cpu->common,
 strchr (cpu, '+'));
+  if (arch_for_multilib)
+   {
+ const enum isa_feature removable_bits[] =

{ISA_IGNORE_FOR_MULTILIB,

+isa_nobit};
+ sbitmap isa_bits = sbitmap_alloc (isa_num_bits);
+ arm_initialize_isa (isa_bits, removable_bits);
+ bitmap_and_compl (target_isa, target_isa, isa_bits);
+   }
+


I can see the piece of code you add here is exactly the same as the one a few
lines above when handling "if (arch)". Can this be moved below and thus be
common to the two cases, or does it have to be performed before
bitmap_ior of fpu_isa?


Thanks for pointing this out; I have moved the common code below the arch and
cpu if blocks in the attached patch.
  

Also, IIUC, CDE was already optional for other CPUs (M33, M35P, star-mc1),
so the hunk above fixes a latent bug when handling multilibs for these CPUs
too? If so, maybe worth splitting the patch into two parts since the above is
not strictly related to M55?


Even though CDE is optional for the mentioned CPUs as per the specs, the code
to enable CDE as an optional feature is missing in the current compiler.
The current GCC compiler supports CDE as an optional feature only with -march
options; this patch adds CDE as optional for M55, so it is not a bug fix.


But I'm not a maintainer ;-)

Thanks,

Christophe


 if (fpu && strcmp (fpu, "auto") != 0)
{
  /* The easiest and safest way to remove the default fpu diff
--git a/gcc/config/arm/arm-cpus.in b/gcc/config/arm/arm-cpus.in index


5a63bc548e54dbfdce5d1df425bd615d81895d80..aa02c04c4924662f3ddd58e
69673

92ba3f4b4a87 100644
--- a/gcc/config/arm/arm-cpus.in
+++ b/gcc/config/arm/arm-cpus.in
@@ -1633,6 +1633,14 @@ begin cpu cortex-m55
option nomve remove mve mve_float
option nofp remove ALL_FP mve_float
option nodsp remove MVE mve_float
+ option cdecp0 add cdecp0
+ option cdecp1 add cdecp1
+ option cdecp2 add cdecp2
+ option cdecp3 add cdecp3
+ option cdecp4 add cdecp4
+ option cdecp5 add cdecp5
+ option cdecp6 add cdecp6
+ option cdecp7 add cdecp7
isa quirk_no_asmcpu quirk_vlldm
costs v7m
vendor 41
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index


aa5655764a0360959f9c1061749d2cc9ebd23489..26857f7a90e42d925bc69086
86ac

78138a53c4ad 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -21698,6 +21698,10 @@ floating-point instructions on @samp{cortex-m55}.
   Disable the M-Profile Vector Extension (MVE) single precision floating-point
   instructions on @samp{cortex-m55}.
 
+@item +cdecp0, +cdecp1, ... , +cdecp7
+Enable the Custom Datapath Extension (CDE) on selected coprocessors according
+to the numbers given in the options in the range 0 to 7 on @samp{cortex-m55}.
+
   @item  +nofp
   Disables the floating-point instructions on @samp{arm9e},
   @samp{arm946e-s}, @samp{arm966e-s}, @samp{arm968e-s}, @samp{arm10e},

diff --git a/gcc/testsuite/gcc.target/arm/multilib.exp b/gcc/testsuite/gcc.target/arm/multilib.exp
index 2fa648c61

Re: [PATCH 1/1] [fwprop]: Add the support of forwarding the vec_duplicate rtx

2023-01-13 Thread juzhe.zhong
Hi, Richard. Would you mind taking a look at this patch?
This is a proposal patch (we could add more test cases for ARM in the future),
but we want to know whether this patch is a correct approach to achieve what we want.

In RVV (RISC-V Vector), we have a bunch of instructions such as
vadd.vx/vsub.vx/vmul.vx, etc.
Such instructions allow the CPU to do operations between a vector and a scalar
directly, without any vector duplicate or broadcast instruction.
So this patch is quite important for RVV auto-vectorization support, since it
can reduce a lot of gimple IR patterns.
We know GCC 13 is not the appropriate time for this patch; we hope this can be
done in GCC 14.
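For reference, here is a minimal sketch of mine (not from the patch) of the
kind of loop where such vector-scalar instructions help; with vadd.vx the
scalar addend can feed the vector add directly, so no separate broadcast of
`x` into a vector register is needed inside the loop:

```c
/* Hedged illustration only: with an instruction like vadd.vx, the scalar
   operand 'x' is used directly by the vector add, avoiding a vec_duplicate.  */
void
add_scalar (int *restrict a, const int *restrict b, int x, int n)
{
  for (int i = 0; i < n; i++)
    a[i] = b[i] + x;
}
```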

Thank you so much.


juzhe.zh...@rivai.ai
 
From: lehua.ding
Date: 2023-01-13 17:42
To: gcc-patches
CC: richard.sandiford; juzhe.zhong; Lehua Ding
Subject: [PATCH 1/1] [fwprop]: Add the support of forwarding the vec_duplicate 
rtx
From: Lehua Ding 
 
ps: Resend for adjusting the width of each line of text.
 
Hi,
 
When I was adding the new RISC-V auto-vectorization function, I found that
converting `vector-reg1 vop vector-reg2` to `scalar-reg3 vop vector-reg2`
is not very easy to handle where `vector-reg1` is a vec_duplicate_expr.
For example the below gimple IR:
 
```gimple

vect_cst__51 = [vec_duplicate_expr] z_14(D);
 

vect_iftmp.13_53 = .LEN_COND_ADD(mask__40.9_47, vect__6.12_50, vect_cst__51, { 
0.0, ... }, curr_cnt_60);
```
 
I once wanted to add corresponding functions to gimple IR, such as adding
.LEN_COND_ADD_VS, and then convert .LEN_COND_ADD to .LEN_COND_ADD_VS in 
match.pd.
This method can be realized, but it will cause too many similar internal 
functions
to be added to gimple IR. It doesn't feel necessary. Later, I tried to combine 
them
on the combine pass but failed. Finally, I thought of adding the ability to 
support
forwarding `(vec_duplicate reg)` in the fwprop pass, so I have this patch.
 
Because the current upstream does not support the RISC-V automatic vectorization
function, I found an example in sve that can also be optimized and simply tried
it. For the float type, one instruction can be reduced, for example the below C
code. The difference between the new and old assembly code is that the new one
uses the mov instruction to directly move the scalar variable to the vector 
register.
The old assembly code first moves the scalar variable to the vector register 
outside
the loop, and then uses the sel instruction. Compared with the entire assembly 
code,
the new assembly code has one instruction less. In addition, I noticed that some
instructions in the new assembly code are ahead of the `ble .L1` instruction.
I debugged and found that the modification was made in the ce1 pass. This pass
believes that moving up is more beneficial to performance.
 
In addition, for the int type, compared with the float type, the new assembly 
code
will have one more `fmov s2, w2` instruction, so I can't judge whether the
performance is better than the previous one. In fact, I mainly do RISC-V 
development work.
 
This patch is an exploratory patch and has not been tested too much. I mainly
want to see your suggestions on whether this method is feasible and possible
potential problems.
 
Best,
Lehua Ding
 
```c
/* compiler options: -O3 -march=armv8.2-a+sve -S */
void test1 (int *pred, float *x, float z, int n)
{
 for (int i = 0; i < n; i += 1)
   {
 x[i] = pred[i] != 1 ? x[i] : z;
   }
}
```
 
The old assembly code like this (compiler explorer link: 
https://godbolt.org/z/hxTnEhaqY):
 
```asm
test1:
 cmp w2, 0
 ble .L1
 mov x3, 0
 cntw x4
 mov z0.s, s0
 whilelo p0.s, wzr, w2
 ptrue p2.b, all
.L3:
 ld1w z2.s, p0/z, [x0, x3, lsl 2]
 ld1w z1.s, p0/z, [x1, x3, lsl 2]
 cmpne p1.s, p2/z, z2.s, #1
 sel z1.s, p1, z1.s, z0.s
 st1w z1.s, p0, [x1, x3, lsl 2]
 add x3, x3, x4
 whilelo p0.s, w3, w2
 b.any .L3
.L1:
 ret
```
 
The new assembly code like this:
 
```asm
test1:
 whilelo p0.s, wzr, w2
 mov x3, 0
 cntw x4
 ptrue p2.b, all
 cmp w2, 0
 ble .L1
.L3:
 ld1w z2.s, p0/z, [x0, x3, lsl 2]
 ld1w z1.s, p0/z, [x1, x3, lsl 2]
 cmpne p1.s, p2/z, z2.s, #1
 mov z1.s, p1/m, s0
 st1w z1.s, p0, [x1, x3, lsl 2]
 add x3, x3, x4
 whilelo p0.s, w3, w2
 b.any .L3
.L1:
 ret
```
 
 
gcc/ChangeLog:
 
* config/aarch64/aarch64-sve.md (@aarch64_sel_dup_vs): Add new 
pattern to capture the new operands order.
* fwprop.cc (fwprop_propagation::profitable_p): Add new check
(reg_single_def_for_src_p): Add new function for src rtx
(forward_propagate_into): Change to new function call
 
---
gcc/config/aarch64/aarch64-sve.md | 20 
gcc/fwprop.cc | 16 +++-
2 files changed, 35 insertions(+), 1 deletion(-)
 
diff --git a/gcc/config/aarch64/aarch64-sve.md

Re: [PATCH 3/9] arm: Don't add crtfastmath.o for -shared

2023-01-13 Thread Richard Earnshaw via Gcc-patches




On 13/01/2023 08:00, Richard Biener via Gcc-patches wrote:

Don't add crtfastmath.o for -shared to avoid altering the FP
environment when loading a shared library.

PR target/55522
* config/arm/linux-eabi.h (ENDFILE_SPEC): Don't add
crtfastmath.o for -shared.
* config/arm/unknown-elf.h (STARTFILE_SPEC): Likewise.


OK.

R.


---
  gcc/config/arm/linux-eabi.h  | 2 +-
  gcc/config/arm/unknown-elf.h | 2 +-
  2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/gcc/config/arm/linux-eabi.h b/gcc/config/arm/linux-eabi.h
index 57f830f0176..a119875599d 100644
--- a/gcc/config/arm/linux-eabi.h
+++ b/gcc/config/arm/linux-eabi.h
@@ -121,7 +121,7 @@
  
  #undef	ENDFILE_SPEC

  #define ENDFILE_SPEC \
-  "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s} "  \
+  "%{Ofast|ffast-math|funsafe-math-optimizations:%{!shared:crtfastmath.o%s}} " 
  \
LINUX_OR_ANDROID_LD (GNU_USER_TARGET_ENDFILE_SPEC, ANDROID_ENDFILE_SPEC)
  
  /* Use the default LIBGCC_SPEC, not the version in linux-elf.h, as we

diff --git a/gcc/config/arm/unknown-elf.h b/gcc/config/arm/unknown-elf.h
index 464d38b6cc6..397ac3f68b9 100644
--- a/gcc/config/arm/unknown-elf.h
+++ b/gcc/config/arm/unknown-elf.h
@@ -33,7 +33,7 @@
  
  #undef  STARTFILE_SPEC

  #define STARTFILE_SPEC\
-  "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s} "  \
+  "%{Ofast|ffast-math|funsafe-math-optimizations:%{!shared:crtfastmath.o%s}} " 
  \
UNKNOWN_ELF_STARTFILE_SPEC
  
  #define UNKNOWN_ELF_ENDFILE_SPEC	"crtend%O%s crtn%O%s"


[PATCH] Fix PR rtl-optimization/108274

2023-01-13 Thread Eric Botcazou via Gcc-patches
Hi,

unlike other IPA passes, the ICF pass can be run at -O0 and some testcases 
rely on this in the testsuite.  Now it effectively creates a tail call so the 
DF information needs to be updated in this case after epilogue creation.
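
As a rough illustration (my own sketch, not from the PR or the testsuite),
two trivially identical functions are enough to hit this with
-O0 -fipa-icf-functions: ICF may turn one of them into a wrapper that
tail-calls the other, and that call is what needs up-to-date DF information
once the epilogue has been threaded:

  /* Hedged example, assuming -O0 -fipa-icf-functions.  */
  int f1 (int x) { return x * 2 + 1; }
  int f2 (int x) { return x * 2 + 1; }  /* ICF may redirect f2 to f1.  */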

Tested on x86-64/Linux, OK for mainline?


2023-01-13  Eric Botcazou  

PR rtl-optimization/108274
* function.cc (thread_prologue_and_epilogue_insns): Also update the
DF information for calls in a few more cases.

-- 
Eric Botcazou

diff --git a/gcc/function.cc b/gcc/function.cc
index d975b001ec9..95f47d287c5 100644
--- a/gcc/function.cc
+++ b/gcc/function.cc
@@ -6261,7 +6261,10 @@ thread_prologue_and_epilogue_insns (void)
 
   /* Threading the prologue and epilogue changes the artificial refs in the
  entry and exit blocks, and may invalidate DF info for tail calls.  */
-  if (optimize)
+  if (optimize
+  || flag_optimize_sibling_calls
+  || flag_ipa_icf_functions
+  || in_lto_p)
 df_update_entry_exit_and_calls ();
   else
 {


Re: [PATCH 1/9] aarch64: Don't add crtfastmath.o for -shared

2023-01-13 Thread Richard Earnshaw via Gcc-patches




On 13/01/2023 07:59, Richard Biener via Gcc-patches wrote:

Don't add crtfastmath.o for -shared to avoid altering the FP
environment when loading a shared library.

PR target/55522
* config/aarch64/aarch64-elf-raw.h (ENDFILE_SPEC): Don't add
crtfastmath.o for -shared.
* config/aarch64/aarch64-freebsd.h (GNU_USER_TARGET_MATHFILE_SPEC):
Likewise.
* config/aarch64/aarch64-linux.h (GNU_USER_TARGET_MATHFILE_SPEC):
Likewise.
---
  gcc/config/aarch64/aarch64-elf-raw.h | 2 +-
  gcc/config/aarch64/aarch64-freebsd.h | 2 +-
  gcc/config/aarch64/aarch64-linux.h   | 2 +-
  3 files changed, 3 insertions(+), 3 deletions(-)



OK.

R.


diff --git a/gcc/config/aarch64/aarch64-elf-raw.h 
b/gcc/config/aarch64/aarch64-elf-raw.h
index d4d820a9d54..fa5b4527ea0 100644
--- a/gcc/config/aarch64/aarch64-elf-raw.h
+++ b/gcc/config/aarch64/aarch64-elf-raw.h
@@ -25,7 +25,7 @@
  #define STARTFILE_SPEC " crti%O%s crtbegin%O%s crt0%O%s"
  #define ENDFILE_SPEC \
" crtend%O%s crtn%O%s " \
-  "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s}"
+  "%{Ofast|ffast-math|funsafe-math-optimizations:%{!shared:crtfastmath.o%s}}"
  
  #ifndef LINK_SPEC

  #define LINK_SPEC "%{h*} \
diff --git a/gcc/config/aarch64/aarch64-freebsd.h 
b/gcc/config/aarch64/aarch64-freebsd.h
index 13beb3781b6..2cf9cf6f046 100644
--- a/gcc/config/aarch64/aarch64-freebsd.h
+++ b/gcc/config/aarch64/aarch64-freebsd.h
@@ -50,7 +50,7 @@
  #define LINK_SPEC FBSD_TARGET_LINK_SPEC AARCH64_ERRATA_LINK_SPEC
  
  #define GNU_USER_TARGET_MATHFILE_SPEC \

-  "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s}"
+  "%{Ofast|ffast-math|funsafe-math-optimizations:%{!shared:crtfastmath.o%s}}"
  
  #undef ENDFILE_SPEC

  #define ENDFILE_SPEC \
diff --git a/gcc/config/aarch64/aarch64-linux.h 
b/gcc/config/aarch64/aarch64-linux.h
index 5e4553d79f5..61ed4067fc5 100644
--- a/gcc/config/aarch64/aarch64-linux.h
+++ b/gcc/config/aarch64/aarch64-linux.h
@@ -50,7 +50,7 @@
  #define LINK_SPEC LINUX_TARGET_LINK_SPEC AARCH64_ERRATA_LINK_SPEC
  
  #define GNU_USER_TARGET_MATHFILE_SPEC \

-  "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s}"
+  "%{Ofast|ffast-math|funsafe-math-optimizations:%{!shared:crtfastmath.o%s}}"
  
  #undef ENDFILE_SPEC

  #define ENDFILE_SPEC   \


[committed] testsuite: Add testcase for PR that went latent in GCC 13 [PR107131]

2023-01-13 Thread Jakub Jelinek via Gcc-patches
Hi!

The following testcase is probably latent since r13-3217-gc4d15dddf6b9e.
Adding testcase so that it doesn't silently reappear.

Tested on x86_64-linux and i686-linux and with GCC 12 where it FAILs,
committed to trunk as obvious.

2023-01-13  Jakub Jelinek  

PR target/107131
* gcc.dg/pr107131.c: New test.

--- gcc/testsuite/gcc.dg/pr107131.c.jj
+++ gcc/testsuite/gcc.dg/pr107131.c
@@ -0,0 +1,30 @@
+/* PR target/107131 */
+/* { dg-do run } */
+/* { dg-options "-Os -fno-ipa-vrp -fno-tree-bit-ccp -Wno-psabi" } */
+
+typedef unsigned char C;
+typedef unsigned long long __attribute__((__vector_size__ (32))) U;
+typedef unsigned long long __attribute__((__vector_size__ (64))) V;
+
+static __attribute__((__noclone__)) C
+foo (C o, U x, U y, U z)
+{
+  V a = __builtin_shufflevector (x, x, 3, 1, 3, 0, 0, 1, 1, 3);
+  V b = (V) { } >= o;
+  V c = b <= (V)(b >= (V) { 0, 0, 0, 0, 0, 0x90DF0BE3990AC871ULL });
+  U d = __builtin_shufflevector (y, z, 3, 1, 4, 5);
+  V e = a + c;
+  U f = ((union { V v; U u[2]; }) e).u[1] + d;
+  return ((union { U u; C c[32]; }) f).c[9];
+}
+
+int
+main ()
+{
+  if (__SIZEOF_LONG_LONG__ != 8 || __CHAR_BIT__ != 8)
+return 0;
+  C x = foo (0, (U) { }, (U) { }, (U) { });
+  if (x != 0xff)
+__builtin_abort();  
+  return 0;
+}

Jakub



[pushed] aarch64: Fix DWARF frame register sizes for predicates

2023-01-13 Thread Richard Sandiford via Gcc-patches
Richard Sandiford  writes:
> Jakub Jelinek  writes:
>> On Thu, Jan 12, 2023 at 04:50:07PM +, Richard Sandiford wrote:
>>> I'm jumping in here without fully understanding the context, so maybe this
>>> is exactly your point, but: the SIMD/FP DWARF registers are supposed to be
>>> size 8 regardless of which features are enabled.  That's already only half
>>> of the hardware register size for base Armv8-A, since Advanced SIMD 
>>> registers
>>> are 16 bytes in size.
>>> 
>>> So yeah, if we're using the hardware register size then something is wrong.
>>
>> I'm talking about what the following compiles to
>> static unsigned char 
>> dwarf_reg_size_table[__LIBGCC_DWARF_FRAME_REGISTERS__+1];
>>
>> void
>> foo (void)
>> {
>>   __builtin_init_dwarf_reg_size_table (dwarf_reg_size_table);
>> }
>> (and therefore what libgcc/unwind-dw2.c (init_dwarf_reg_size_table) as well)
>> with -O2 -fbuilding-libgcc -march=armv8-a vs. -O2 -fbuilding-libgcc 
>> -march=armv8-a+sve
>> The former is setting I think [0..31, 46, 48..63, 72..79, 96]=8, [64..71, 
>> 80..95]=0
>> (and leaving others untouched, which keeps them 0).
>> While the latter is setting [0..31, 46, 72..79, 96]=8, [64..71, 80..95]=0
>> and [48..63]=cntd
>
> Ah, interesting.  So the SIMD/FP registers are OK, but the predicate
> registers are causing a problem.
>
> I think we should set the predicates to size 0 too, like we do for
> call-clobbered FP registers.  Predicate registers should never need
> to be represented in CFI.

Done with the patch below.  Tested on aarch64-linux-gnu & pushed.

Thanks Jakub for pointing this out.

Richard


gcc/
* config/aarch64/aarch64.cc (aarch64_dwarf_frame_reg_mode): New
function.
(TARGET_DWARF_FRAME_REG_MODE): Define.

gcc/testsuite/
* gcc.target/aarch64/dwarf_reg_size_1.c: New test.
* gcc.target/aarch64/dwarf_reg_size_2.c: Likewise.
---
 gcc/config/aarch64/aarch64.cc | 17 
 .../gcc.target/aarch64/dwarf_reg_size_1.c | 27 +++
 .../gcc.target/aarch64/dwarf_reg_size_2.c |  6 +
 3 files changed, 50 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/dwarf_reg_size_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/dwarf_reg_size_2.c

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 80b71a7b612..2821368756b 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -3443,6 +3443,20 @@ aarch64_debugger_regno (unsigned regno)
return DWARF_FRAME_REGISTERS;
 }
 
+/* Implement TARGET_DWARF_FRAME_REG_MODE.  */
+static machine_mode
+aarch64_dwarf_frame_reg_mode (int regno)
+{
+  /* Predicate registers are call-clobbered in the EH ABI (which is
+ ARM_PCS_AAPCS64), so they should not be described by CFI.
+ Their size changes as VL changes, so any values computed by
+ __builtin_init_dwarf_reg_size_table might not be valid for
+ all frames.  */
+  if (PR_REGNUM_P (regno))
+return VOIDmode;
+  return default_dwarf_frame_reg_mode (regno);
+}
+
 /* If X is a CONST_DOUBLE, return its bit representation as a constant
integer, otherwise return X unmodified.  */
 static rtx
@@ -27900,6 +27914,9 @@ aarch64_libgcc_floating_mode_supported_p
 #undef TARGET_SCHED_REASSOCIATION_WIDTH
 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
 
+#undef TARGET_DWARF_FRAME_REG_MODE
+#define TARGET_DWARF_FRAME_REG_MODE aarch64_dwarf_frame_reg_mode
+
 #undef TARGET_PROMOTED_TYPE
 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
 
diff --git a/gcc/testsuite/gcc.target/aarch64/dwarf_reg_size_1.c 
b/gcc/testsuite/gcc.target/aarch64/dwarf_reg_size_1.c
new file mode 100644
index 000..cb7666ddaa8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/dwarf_reg_size_1.c
@@ -0,0 +1,27 @@
+/* { dg-do run } */
+/* { dg-options "-fbuilding-libgcc" } */
+
+static unsigned char dwarf_reg_size_table[__LIBGCC_DWARF_FRAME_REGISTERS__+1];
+
+int
+main (void)
+{
+  __builtin_init_dwarf_reg_size_table (dwarf_reg_size_table);
+  /* X0-X31 and SP.  */
+  for (int i = 0; i < 32; ++i)
+if (dwarf_reg_size_table[i] != 8)
+  __builtin_abort ();
+  /* Q0-Q31/Z0-Z31, of which only the low 64 bits of register 8-15
+ are saved.  */
+  for (int i = 64; i < 96; ++i)
+if (dwarf_reg_size_table[i] != (i >= 72 && i < 80 ? 8 : 0))
+  __builtin_abort ();
+  /* P0-P15, which are never saved.  */
+  for (int i = 48; i < 63; ++i)
+if (dwarf_reg_size_table[i] != 0)
+  __builtin_abort ();
+  /* VG */
+  if (dwarf_reg_size_table[46] != 8)
+__builtin_abort ();
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/dwarf_reg_size_2.c 
b/gcc/testsuite/gcc.target/aarch64/dwarf_reg_size_2.c
new file mode 100644
index 000..8b7e6d4a717
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/dwarf_reg_size_2.c
@@ -0,0 +1,6 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-fbuilding-libgcc" } */
+
+#pragma GCC target "+sve"
+
+#include 

[pushed] aarch64: Don't update EH info when folding [PR107209]

2023-01-13 Thread Richard Biener via Gcc-patches
The AArch64 folders tried to update EH info on the fly, bypassing
the folder's attempts to remove dead EH edges later.  This triggered
an ICE when folding a potentially-trapping call to a constant.

Thanks to Richard for the patch.  Tested on aarch64-linux-gnu & pushed.

gcc/
PR target/107209
* config/aarch64/aarch64.cc (aarch64_gimple_fold_builtin): Don't
update EH info on the fly.

gcc/testsuite/
* gcc.target/aarch64/pr107209.c: New test.

Co-Authored-By: Richard Biener 
---
 gcc/config/aarch64/aarch64.cc   |  2 +-
 gcc/testsuite/gcc.target/aarch64/pr107209.c | 16 
 2 files changed, 17 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/pr107209.c

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index c8335e7def7..80b71a7b612 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -15348,7 +15348,7 @@ aarch64_gimple_fold_builtin (gimple_stmt_iterator *gsi)
   if (!new_stmt)
 return false;
 
-  gsi_replace (gsi, new_stmt, true);
+  gsi_replace (gsi, new_stmt, false);
   return true;
 }
 
diff --git a/gcc/testsuite/gcc.target/aarch64/pr107209.c 
b/gcc/testsuite/gcc.target/aarch64/pr107209.c
new file mode 100644
index 000..b86a6ea9036
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr107209.c
@@ -0,0 +1,16 @@
+/* { dg-options "-O2 -fnon-call-exceptions -fno-tree-fre" } */
+
+#include 
+
+float64x1_t
+foo (void)
+{
+  float64_t v1 = 3.14159265359;
+  float64_t v2 = 1.383894;
+  float64_t vec_1_data[] = {v1};
+  float64_t vec_2_data[] = {v2};
+  float64x1_t vec_1 = vld1_f64 (vec_1_data);
+  float64x1_t vec_2 = vld1_f64 (vec_2_data);
+
+  return vmulx_f64 (vec_1, vec_2);
+}
-- 
2.25.1



[PATCH] Don't add crtfastmath.o for -shared.

2023-01-13 Thread liuhongt via Gcc-patches
Patches [1] and [2] fixed PR55522 for x86-linux but left all other x86
targets unfixed (x86-cygwin, x86-darwin and x86-mingw32).
This patch applies a similar change to other specs using crtfastmath.o.

Ok for trunk?

[1] https://gcc.gnu.org/pipermail/gcc-patches/2022-December/608528.html
[2] https://gcc.gnu.org/pipermail/gcc-patches/2022-December/608529.html

gcc/ChangeLog:

PR target/55522
* config/i386/cygwin.h (ENDFILE_SPEC): Link crtfastmath.o
whenever -mdaz-ftz is specified. Don't link crtfastmath.o when
-shared or -mno-daz-ftz is specified.
* config/i386/darwin.h (ENDFILE_SPEC): Ditto.
* config/i386/mingw32.h (ENDFILE_SPEC): Ditto.
---
 gcc/config/i386/cygwin.h  | 2 +-
 gcc/config/i386/darwin.h  | 2 +-
 gcc/config/i386/mingw32.h | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/gcc/config/i386/cygwin.h b/gcc/config/i386/cygwin.h
index 0a604d65b32..d795ee1e3c5 100644
--- a/gcc/config/i386/cygwin.h
+++ b/gcc/config/i386/cygwin.h
@@ -48,7 +48,7 @@ along with GCC; see the file COPYING3.  If not see
 
 #undef ENDFILE_SPEC
 #define ENDFILE_SPEC \
-  "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s}\
+  "%{mdaz-ftz:crtfastmath.o%s;Ofast|ffast-math|funsafe-math-optimizations:%{!shared:%{!mno-daz-ftz:crtfastmath.o%s}}} \
%{!shared:%:if-exists(default-manifest.o%s)}\
%{fvtable-verify=none:%s; \
 fvtable-verify=preinit:vtv_end.o%s; \
diff --git a/gcc/config/i386/darwin.h b/gcc/config/i386/darwin.h
index 5bcb714..ac198db0d9c 100644
--- a/gcc/config/i386/darwin.h
+++ b/gcc/config/i386/darwin.h
@@ -110,7 +110,7 @@ along with GCC; see the file COPYING3.  If not see
 
 #undef ENDFILE_SPEC
 #define ENDFILE_SPEC \
-  "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s} \
+  "%{mdaz-ftz:crtfastmath.o%s;Ofast|ffast-math|funsafe-math-optimizations:%{!shared:%{!mno-daz-ftz:crtfastmath.o%s}}} \
%{mpc32:crtprec32.o%s} \
%{mpc64:crtprec64.o%s} \
%{mpc80:crtprec80.o%s}" TM_DESTRUCTOR
diff --git a/gcc/config/i386/mingw32.h b/gcc/config/i386/mingw32.h
index 19a98c3d995..4e5b486a3da 100644
--- a/gcc/config/i386/mingw32.h
+++ b/gcc/config/i386/mingw32.h
@@ -196,7 +196,7 @@ along with GCC; see the file COPYING3.  If not see
 
 #undef ENDFILE_SPEC
 #define ENDFILE_SPEC \
-  "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s} \
+  "%{mdaz-ftz:crtfastmath.o%s;Ofast|ffast-math|funsafe-math-optimizations:%{!shared:%{!mno-daz-ftz:crtfastmath.o%s}}} \
%{!shared:%:if-exists(default-manifest.o%s)}\
%{fvtable-verify=none:%s; \
 fvtable-verify=preinit:vtv_end.o%s; \
-- 
2.31.1



[PATCH] tree-optimization/108387 - ICE with VN handling of x << C as x * (1<<C)

2023-01-13 Thread Richard Biener via Gcc-patches
The following fixes unexpected simplification of x << C as
x * (1<<C).

[PATCH 1/1] [fwprop]: Add the support of forwarding the vec_duplicate rtx

2023-01-13 Thread lehua . ding
From: Lehua Ding 

ps: Resend for adjusting the width of each line of text.

Hi,

When I was adding the new RISC-V auto-vectorization function, I found that
converting `vector-reg1 vop vector-reg2` to `scalar-reg3 vop vector-reg2`
is not very easy to handle where `vector-reg1` is a vec_duplicate_expr.
For example the below gimple IR:

```gimple

vect_cst__51 = [vec_duplicate_expr] z_14(D);


vect_iftmp.13_53 = .LEN_COND_ADD(mask__40.9_47, vect__6.12_50, vect_cst__51, { 
0.0, ... }, curr_cnt_60);
```

I once wanted to add corresponding functions to gimple IR, such as adding
.LEN_COND_ADD_VS, and then convert .LEN_COND_ADD to .LEN_COND_ADD_VS in 
match.pd.
This method can be realized, but it will cause too many similar internal 
functions
to be added to gimple IR. It doesn't feel necessary. Later, I tried to combine 
them
on the combine pass but failed. Finally, I thought of adding the ability to 
support
forwarding `(vec_duplicate reg)` in the fwprop pass, so I have this patch.

Because the current upstream does not support the RISC-V automatic vectorization
function, I found an example in sve that can also be optimized and simply tried
it. For the float type, one instruction can be reduced, for example the below C
code. The difference between the new and old assembly code is that the new one
uses the mov instruction to directly move the scalar variable to the vector 
register.
The old assembly code first moves the scalar variable to the vector register 
outside
the loop, and then uses the sel instruction. Compared with the entire assembly 
code,
the new assembly code has one instruction less. In addition, I noticed that some
instructions in the new assembly code are ahead of the `ble .L1` instruction.
I debugged and found that the modification was made in the ce1 pass. This pass
believes that moving up is more beneficial to performance.

In addition, for the int type, compared with the float type, the new assembly 
code
will have one more `fmov s2, w2` instruction, so I can't judge whether the
performance is better than the previous one. In fact, I mainly do RISC-V 
development work.

This patch is an exploratory patch and has not been tested too much. I mainly
want to see your suggestions on whether this method is feasible and possible
potential problems.

Best,
Lehua Ding

```c
/* compiler options: -O3 -march=armv8.2-a+sve -S */
void test1 (int *pred, float *x, float z, int n)
{
 for (int i = 0; i < n; i += 1)
   {
 x[i] = pred[i] != 1 ? x[i] : z;
   }
}
```

The old assembly code like this (compiler explorer link: 
https://godbolt.org/z/hxTnEhaqY):

```asm
test1:
 cmp w2, 0
 ble .L1
 mov x3, 0
 cntw x4
 mov z0.s, s0
 whilelo p0.s, wzr, w2
 ptrue p2.b, all
.L3:
 ld1w z2.s, p0/z, [x0, x3, lsl 2]
 ld1w z1.s, p0/z, [x1, x3, lsl 2]
 cmpne p1.s, p2/z, z2.s, #1
 sel z1.s, p1, z1.s, z0.s
 st1w z1.s, p0, [x1, x3, lsl 2]
 add x3, x3, x4
 whilelo p0.s, w3, w2
 b.any .L3
.L1:
 ret
```

The new assembly code like this:

```asm
test1:
 whilelo p0.s, wzr, w2
 mov x3, 0
 cntw x4
 ptrue p2.b, all
 cmp w2, 0
 ble .L1
.L3:
 ld1w z2.s, p0/z, [x0, x3, lsl 2]
 ld1w z1.s, p0/z, [x1, x3, lsl 2]
 cmpne p1.s, p2/z, z2.s, #1
 mov z1.s, p1/m, s0
 st1w z1.s, p0, [x1, x3, lsl 2]
 add x3, x3, x4
 whilelo p0.s, w3, w2
 b.any .L3
.L1:
 ret
```


gcc/ChangeLog:

* config/aarch64/aarch64-sve.md (@aarch64_sel_dup_vs): Add new 
pattern to capture the new operands order.
* fwprop.cc (fwprop_propagation::profitable_p): Add new check
(reg_single_def_for_src_p): Add new function for src rtx
(forward_propagate_into): Change to new function call

---
 gcc/config/aarch64/aarch64-sve.md | 20 
 gcc/fwprop.cc | 16 +++-
 2 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/gcc/config/aarch64/aarch64-sve.md 
b/gcc/config/aarch64/aarch64-sve.md
index b8cc47ef5fc..84d8ed0924d 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -7636,6 +7636,26 @@
   [(set_attr "movprfx" "*,*,yes,yes,yes,yes")]
 )
 
+;; Swap the order of operand 1 and operand 2 so that it matches the above 
pattern
+(define_insn_and_split "@aarch64_sel_dup_vs"
+  [(set (match_operand:SVE_ALL 0 "register_operand" "=?w, w, ??w, ?&w, ??&w, 
?&w")
+   (unspec:SVE_ALL
+ [(match_operand: 3 "register_operand" "Upl, Upl, Upl, Upl, 
Upl, Upl")
+   (match_operand:SVE_ALL 1 "aarch64_simd_reg_or_zero" "0, 0, Dz, Dz, 
w, w")
+  (vec_duplicate:SVE_ALL
+ (match_operand: 2 "register_operand" "r, w, r, w, r, w"))]
+ UNSPEC_SEL))]
+  "TARGET_SVE"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+(unspec:SVE_ALL
+  [(match_dup 3)
+   (vec_dup

[PATCH 1/1] [fwprop]: Add the support of forwarding the vec_duplicate rtx

2023-01-13 Thread lehua . ding
From: Lehua Ding 

Hi,

When I was adding the new RISC-V auto-vectorization function, I found that 
converting `vector-reg1 vop vector-reg2` to `scalar-reg3 vop vector-reg2` is
not very easy to handle where `vector-reg1` is a vec_duplicate_expr. For
example the below gimple IR:

```gimple

vect_cst__51 = [vec_duplicate_expr] z_14(D);


vect_iftmp.13_53 = .LEN_COND_ADD(mask__40.9_47, vect__6.12_50, vect_cst__51, { 
0.0, ... }, curr_cnt_60);
```

I once wanted to add corresponding functions to gimple IR, such as adding 
.LEN_COND_ADD_VS, and then convert .LEN_COND_ADD to .LEN_COND_ADD_VS in 
match.pd. This method can be realized, but it will cause too many similar 
internal functions to be added to gimple IR. It doesn't feel necessary. Later, 
I tried to combine them on the combine pass but failed. Finally, I thought of 
adding the ability to support forwarding `(vec_duplicate reg)` in the fwprop pass,
so I have this patch.

Because the current upstream does not support the RISC-V automatic 
vectorization function, I found an example in sve that can also be optimized 
and simply tried it. For the float type, one instruction can be reduced, for 
example the below C code. The difference between the new and old assembly code
is that the new one uses the mov instruction to directly move the scalar 
variable to the vector register. The old assembly code first moves the scalar 
variable to the vector register outside the loop, and then uses the sel 
instruction. Compared with the entire assembly code, the new assembly code has 
one instruction less. In addition, I noticed that some instructions in the new 
assembly code are ahead of the `ble .L1` instruction. I debugged and found that 
the modification was made in the ce1 pass. This pass believes that moving up is 
more beneficial to performance.

In addition, for the int type, compared with the float type, the new assembly 
code will have one more `fmov s2, w2` instruction, so I can't judge whether the 
performance is better than the previous one. In fact, I mainly do RISC-V 
development work.

This patch is an exploratory patch and has not been tested too much. I mainly 
want to see your suggestions on whether this method is feasible and possible 
potential problems.

```c
/* compiler options: -O3 -march=armv8.2-a+sve -S */
void test1 (int *pred, float *x, float z, int n)
{
 for (int i = 0; i < n; i += 1)
   {
 x[i] = pred[i] != 1 ? x[i] : z;
   }
}
```

The old assembly code like this (compiler explorer link: 
https://godbolt.org/z/hxTnEhaqY):

```asm
test1:
 cmp w2, 0
 ble .L1
 mov x3, 0
 cntw x4
 mov z0.s, s0
 whilelo p0.s, wzr, w2
 ptrue p2.b, all
.L3:
 ld1w z2.s, p0/z, [x0, x3, lsl 2]
 ld1w z1.s, p0/z, [x1, x3, lsl 2]
 cmpne p1.s, p2/z, z2.s, #1
 sel z1.s, p1, z1.s, z0.s
 st1w z1.s, p0, [x1, x3, lsl 2]
 add x3, x3, x4
 whilelo p0.s, w3, w2
 b.any .L3
.L1:
 ret
```

The new assembly code like this:

```asm
test1:
 whilelo p0.s, wzr, w2
 mov x3, 0
 cntw x4
 ptrue p2.b, all
 cmp w2, 0
 ble .L1
.L3:
 ld1w z2.s, p0/z, [x0, x3, lsl 2]
 ld1w z1.s, p0/z, [x1, x3, lsl 2]
 cmpne p1.s, p2/z, z2.s, #1
 mov z1.s, p1/m, s0
 st1w z1.s, p0, [x1, x3, lsl 2]
 add x3, x3, x4
 whilelo p0.s, w3, w2
 b.any .L3
.L1:
 ret
```


gcc/ChangeLog:

* config/aarch64/aarch64-sve.md (@aarch64_sel_dup_vs): Add new 
pattern to capture the new operands order.
* fwprop.cc (fwprop_propagation::profitable_p): Add new check
(reg_single_def_for_src_p): Add new function for src rtx
(forward_propagate_into): Change to new function call

---
 gcc/config/aarch64/aarch64-sve.md | 20 
 gcc/fwprop.cc | 16 +++-
 2 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/gcc/config/aarch64/aarch64-sve.md 
b/gcc/config/aarch64/aarch64-sve.md
index b8cc47ef5fc..84d8ed0924d 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -7636,6 +7636,26 @@
   [(set_attr "movprfx" "*,*,yes,yes,yes,yes")]
 )
 
+;; Swap the order of operand 1 and operand 2 so that it matches the above 
pattern
+(define_insn_and_split "@aarch64_sel_dup_vs"
+  [(set (match_operand:SVE_ALL 0 "register_operand" "=?w, w, ??w, ?&w, ??&w, 
?&w")
+   (unspec:SVE_ALL
+ [(match_operand: 3 "register_operand" "Upl, Upl, Upl, Upl, 
Upl, Upl")
+   (match_operand:SVE_ALL 1 "aarch64_simd_reg_or_zero" "0, 0, Dz, Dz, 
w, w")
+  (vec_duplicate:SVE_ALL
+ (match_operand: 2 "register_operand" "r, w, r, w, r, w"))]
+ UNSPEC_SEL))]
+  "TARGET_SVE"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+(unspec:SVE_ALL
+  [(match_dup 3)
+   (vec_duplicate:SVE_ALL (match_dup 2))
+   (match_dup 1)]
+  

Re: [PATCH] ipa: silent -Wodr notes with -w

2023-01-13 Thread Martin Liška

PING^3

On 12/22/22 13:15, Martin Liška wrote:

PING^2

On 12/9/22 09:27, Martin Liška wrote:

PING^1

On 12/2/22 12:27, Martin Liška wrote:

If -w is used, warn_odr properly sets *warned = false, and that value should
be respected before calling warn_types_mismatch; previously only the pointer
was checked, so the follow-up note was still emitted.

I noticed this during an LTO reduction where I used -w.

Patch can bootstrap on x86_64-linux-gnu and survives regression tests.

Ready to be installed?
Thanks,
Martin

gcc/ChangeLog:

* ipa-devirt.cc (odr_types_equivalent_p): Respect *warned
value if set.
---
  gcc/ipa-devirt.cc | 12 ++--
  1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/gcc/ipa-devirt.cc b/gcc/ipa-devirt.cc
index 265d07bb354..bcdc50c5bd7 100644
--- a/gcc/ipa-devirt.cc
+++ b/gcc/ipa-devirt.cc
@@ -1300,7 +1300,7 @@ odr_types_equivalent_p (tree t1, tree t2, bool warn, bool 
*warned,
  warn_odr (t1, t2, NULL, NULL, warn, warned,
G_("it is defined as a pointer to different type "
   "in another translation unit"));
- if (warn && warned)
+ if (warn && (warned == NULL || *warned))
warn_types_mismatch (TREE_TYPE (t1), TREE_TYPE (t2),
 loc1, loc2);
  return false;
@@ -1315,7 +1315,7 @@ odr_types_equivalent_p (tree t1, tree t2, bool warn, bool 
*warned,
  warn_odr (t1, t2, NULL, NULL, warn, warned,
G_("a different type is defined "
   "in another translation unit"));
- if (warn && warned)
+ if (warn && (warned == NULL || *warned))
warn_types_mismatch (TREE_TYPE (t1), TREE_TYPE (t2), loc1, loc2);
  return false;
}
@@ -1333,7 +1333,7 @@ odr_types_equivalent_p (tree t1, tree t2, bool warn, bool 
*warned,
warn_odr (t1, t2, NULL, NULL, warn, warned,
  G_("a different type is defined in another "
 "translation unit"));
-   if (warn && warned)
+   if (warn && (warned == NULL || *warned))
  warn_types_mismatch (TREE_TYPE (t1), TREE_TYPE (t2), loc1, loc2);
  }
gcc_assert (TYPE_STRING_FLAG (t1) == TYPE_STRING_FLAG (t2));
@@ -1375,7 +1375,7 @@ odr_types_equivalent_p (tree t1, tree t2, bool warn, bool 
*warned,
  warn_odr (t1, t2, NULL, NULL, warn, warned,
G_("has different return value "
   "in another translation unit"));
- if (warn && warned)
+ if (warn && (warned == NULL || *warned))
warn_types_mismatch (TREE_TYPE (t1), TREE_TYPE (t2), loc1, loc2);
  return false;
}
@@ -1398,7 +1398,7 @@ odr_types_equivalent_p (tree t1, tree t2, bool warn, bool 
*warned,
  warn_odr (t1, t2, NULL, NULL, warn, warned,
G_("has different parameters in another "
   "translation unit"));
- if (warn && warned)
+ if (warn && (warned == NULL || *warned))
warn_types_mismatch (TREE_VALUE (parms1),
 TREE_VALUE (parms2), loc1, loc2);
  return false;
@@ -1484,7 +1484,7 @@ odr_types_equivalent_p (tree t1, tree t2, bool warn, bool 
*warned,
warn_odr (t1, t2, f1, f2, warn, warned,
  G_("a field of same name but different type "
 "is defined in another translation unit"));
-   if (warn && warned)
+   if (warn && (warned == NULL || *warned))
  warn_types_mismatch (TREE_TYPE (f1), TREE_TYPE (f2), 
loc1, loc2);
return false;
  }








[PATCH] Sync LTO type_for_mode with c-family/

2023-01-13 Thread Richard Biener via Gcc-patches
The following adds _FloatN mode support to the LTO copy of
c_common_type_for_mode and also implements the fix for PR94072.

Bootstrapped and tested on x86_64-unknown-linux-gnu, pushed.

gcc/lto/
* lto-lang.cc (lto_type_for_mode): Sync with
c_common_type_for_mode.
---
 gcc/lto/lto-lang.cc | 20 +---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/gcc/lto/lto-lang.cc b/gcc/lto/lto-lang.cc
index 7018dfae4a5..1b2a4a632bb 100644
--- a/gcc/lto/lto-lang.cc
+++ b/gcc/lto/lto-lang.cc
@@ -1004,6 +1004,11 @@ lto_type_for_mode (machine_mode mode, int unsigned_p)
   if (mode == TYPE_MODE (long_double_type_node))
 return long_double_type_node;
 
+  for (i = 0; i < NUM_FLOATN_NX_TYPES; i++)
+if (FLOATN_NX_TYPE_NODE (i) != NULL_TREE
+   && mode == TYPE_MODE (FLOATN_NX_TYPE_NODE (i)))
+  return FLOATN_NX_TYPE_NODE (i);
+
   if (mode == TYPE_MODE (void_type_node))
 return void_type_node;
 
@@ -1029,6 +1034,11 @@ lto_type_for_mode (machine_mode mode, int unsigned_p)
   if (mode == TYPE_MODE (complex_long_double_type_node))
return complex_long_double_type_node;
 
+  for (i = 0; i < NUM_FLOATN_NX_TYPES; i++)
+   if (COMPLEX_FLOATN_NX_TYPE_NODE (i) != NULL_TREE
+   && mode == TYPE_MODE (COMPLEX_FLOATN_NX_TYPE_NODE (i)))
+ return COMPLEX_FLOATN_NX_TYPE_NODE (i);
+
   if (mode == TYPE_MODE (complex_integer_type_node) && !unsigned_p)
return complex_integer_type_node;
 
@@ -1154,9 +1164,13 @@ lto_type_for_mode (machine_mode mode, int unsigned_p)
 }
 
   for (t = registered_builtin_types; t; t = TREE_CHAIN (t))
-if (TYPE_MODE (TREE_VALUE (t)) == mode)
-  return TREE_VALUE (t);
-
+{
+  tree type = TREE_VALUE (t);
+  if (TYPE_MODE (type) == mode
+ && VECTOR_TYPE_P (type) == VECTOR_MODE_P (mode)
+ && !!unsigned_p == !!TYPE_UNSIGNED (type))
+   return type;
+}
   return NULL_TREE;
 }
 
-- 
2.35.3


Re: [PATCH 0/9] Don't add crtfastmath.o for -shared

2023-01-13 Thread Hongtao Liu via Gcc-patches
On Fri, Jan 13, 2023 at 4:05 PM Richard Biener via Gcc-patches
 wrote:
>
>
> This is a series completing the fix for PR55522 which got a fix for
> x86-linux already but left all other targets unfixed (including
> x86-cygwin, x86-darwin and x86-mingw32).  The following series
> applies a similar change to other specs using crtfastmath.o,
> the changes are untested.
>
> Target maintainers are CCed and I hope they can smoke-test the
> changes.
>
> x86 maintainers, can you please adjust the missed specs yourself?
Sure, let me do this.
>
> Thanks,
> Richard.



-- 
BR,
Hongtao


[PATCH 0/9] Don't add crtfastmath.o for -shared

2023-01-13 Thread Richard Biener via Gcc-patches


This is a series completing the fix for PR55522 which got a fix for
x86-linux already but left all other targets unfixed (including
x86-cygwin, x86-darwin and x86-mingw32).  The following series
applies a similar change to other specs using crtfastmath.o,
the changes are untested.

Target maintainers are CCed and I hope they can smoke-test the
changes.

x86 maintainers, can you please adjust the missed specs yourself?

Thanks,
Richard.


[PATCH 9/9] Clarify -shared effect on crtfastmath.o

2023-01-13 Thread Richard Biener via Gcc-patches
This rewords the note to not specifically mention crtfastmath.o
but FP environment altering by -ffast-math or -Ofast.
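
For readers unfamiliar with what that startup code does, here is a hedged
sketch (modeled on the x86 crtfastmath idea; details differ per target) of
why linking it into a shared library is undesirable: it installs a
constructor that changes the FP environment of whichever process loads the
library.

  #include <xmmintrin.h>

  /* Sketch only: set the MXCSR flush-to-zero (0x8000) and
     denormals-are-zero (0x0040) bits at load time.  */
  static void __attribute__((constructor))
  set_fast_math (void)
  {
    _mm_setcsr (_mm_getcsr () | 0x8040);
  }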

I'll push this after the target parts are approved.

PR target/55522
* doc/invoke.texi (-shared): Clarify effect on -ffast-math
and -Ofast FP environment side-effects.
---
 gcc/doc/invoke.texi | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 701c228bd0a..7a6a83b2641 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -17693,8 +17693,9 @@ needs to build supplementary stub code for constructors 
to work.  On
 multi-libbed systems, @samp{gcc -shared} must select the correct support
 libraries to link against.  Failing to supply the correct flags may lead
 to subtle defects.  Supplying them in cases where they are not necessary
-is innocuous. For x86, crtfastmath.o will not be added when
-@option{-shared} is specified. }
+is innocuous.  @option{-shared} suppresses the addition of startup code
+to alter the floating-point environment as done with @option{-ffast-math}
+or @option{-Ofast} on some targets.}
 
 @item -shared-libgcc
 @itemx -static-libgcc
-- 
2.35.3


[PATCH 8/9] solaris2: Don't add crtfastmath.o for -shared

2023-01-13 Thread Richard Biener via Gcc-patches
Don't add crtfastmath.o for -shared to avoid altering the FP
environment when loading a shared library.

PR target/55522
* config/sol2.h (ENDFILE_SPEC): Don't add crtfastmath.o for -shared.
---
 gcc/config/sol2.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/config/sol2.h b/gcc/config/sol2.h
index 05dbaffa8ea..616f9b91212 100644
--- a/gcc/config/sol2.h
+++ b/gcc/config/sol2.h
@@ -295,7 +295,7 @@ along with GCC; see the file COPYING3.  If not see
 
 #undef  ENDFILE_SPEC
 #define ENDFILE_SPEC \
-  "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s} \
+  "%{Ofast|ffast-math|funsafe-math-optimizations:%{!shared:crtfastmath.o%s}} \
%(endfile_arch) %(endfile_vtv) %(endfile_crtend) crtn.o%s"
 
 #undef LINK_ARCH32_SPEC_BASE
-- 
2.35.3



[PATCH 7/9] sparc: Don't add crtfastmath.o for -shared

2023-01-13 Thread Richard Biener via Gcc-patches
Don't add crtfastmath.o for -shared to avoid altering the FP
environment when loading a shared library.

PR target/55522
* config/sparc/freebsd.h (ENDFILE_SPEC): Don't add crtfastmath.o
for -shared.
* config/sparc/linux.h (ENDFILE_SPEC): Likewise.
* config/sparc/linux64.h (ENDFILE_SPEC): Likewise.
* config/sparc/sp-elf.h (ENDFILE_SPEC): Likewise.
* config/sparc/sp64-elf.h (ENDFILE_SPEC): Likewise.
---
 gcc/config/sparc/freebsd.h  | 2 +-
 gcc/config/sparc/linux.h| 2 +-
 gcc/config/sparc/linux64.h  | 2 +-
 gcc/config/sparc/sp-elf.h   | 2 +-
 gcc/config/sparc/sp64-elf.h | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/gcc/config/sparc/freebsd.h b/gcc/config/sparc/freebsd.h
index 73850a31f58..a5aa3679547 100644
--- a/gcc/config/sparc/freebsd.h
+++ b/gcc/config/sparc/freebsd.h
@@ -127,7 +127,7 @@ along with GCC; see the file COPYING3.  If not see
 
 #undef ENDFILE_SPEC
 #define ENDFILE_SPEC   \
-  "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s} "\
+  "%{Ofast|ffast-math|funsafe-math-optimizations:%{!shared:crtfastmath.o%s}} " 
\
   FBSD_ENDFILE_SPEC
 
 /* We use GNU ld so undefine this so that attribute((init_priority)) works.  */
diff --git a/gcc/config/sparc/linux.h b/gcc/config/sparc/linux.h
index 6a809e9092d..48386884d05 100644
--- a/gcc/config/sparc/linux.h
+++ b/gcc/config/sparc/linux.h
@@ -30,7 +30,7 @@ along with GCC; see the file COPYING3.  If not see
 #undef  ENDFILE_SPEC
 #define ENDFILE_SPEC \
   GNU_USER_TARGET_ENDFILE_SPEC \
-  "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s}"
+  "%{Ofast|ffast-math|funsafe-math-optimizations:%{!shared:crtfastmath.o%s}}"
 
 /* -mcpu=native handling only makes sense with compiler running on
a SPARC chip.  */
diff --git a/gcc/config/sparc/linux64.h b/gcc/config/sparc/linux64.h
index d08a2ef96fe..4132cee4c8c 100644
--- a/gcc/config/sparc/linux64.h
+++ b/gcc/config/sparc/linux64.h
@@ -47,7 +47,7 @@ along with GCC; see the file COPYING3.  If not see
 #undef ENDFILE_SPEC
 #define ENDFILE_SPEC \
   GNU_USER_TARGET_ENDFILE_SPEC \
-  "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s}"
+  "%{Ofast|ffast-math|funsafe-math-optimizations:%{!shared:crtfastmath.o%s}}"
 
 /* The default code model.  */
 #undef SPARC_DEFAULT_CMODEL
diff --git a/gcc/config/sparc/sp-elf.h b/gcc/config/sparc/sp-elf.h
index 53f03b951db..8bd2f54f05d 100644
--- a/gcc/config/sparc/sp-elf.h
+++ b/gcc/config/sparc/sp-elf.h
@@ -32,7 +32,7 @@ along with GCC; see the file COPYING3.  If not see
 
 #undef ENDFILE_SPEC
 #define ENDFILE_SPEC \
-  "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s} \
+  "%{Ofast|ffast-math|funsafe-math-optimizations:%{!shared:crtfastmath.o%s}} \
crtend.o%s crtn.o%s"
 
 /* Don't set the target flags, this is done by the linker script */
diff --git a/gcc/config/sparc/sp64-elf.h b/gcc/config/sparc/sp64-elf.h
index dc918c6ae24..866db10a343 100644
--- a/gcc/config/sparc/sp64-elf.h
+++ b/gcc/config/sparc/sp64-elf.h
@@ -44,7 +44,7 @@ along with GCC; see the file COPYING3.  If not see
 
 #undef ENDFILE_SPEC
 #define ENDFILE_SPEC \
-  "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s} \
+  "%{Ofast|ffast-math|funsafe-math-optimizations:%{!shared:crtfastmath.o%s}} \
crtend.o%s crtn.o%s"
 
 /* Use the default (for now).  */
-- 
2.35.3



[PATCH 6/9] mips: Don't add crtfastmath.o for -shared

2023-01-13 Thread Richard Biener via Gcc-patches
Don't add crtfastmath.o for -shared to avoid altering the FP
environment when loading a shared library.

PR target/55522
* config/mips/gnu-user.h (GNU_USER_TARGET_MATHFILE_SPEC):
Don't add crtfastmath.o for -shared.
---
 gcc/config/mips/gnu-user.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/config/mips/gnu-user.h b/gcc/config/mips/gnu-user.h
index 9a540b8b53f..a4e5380b589 100644
--- a/gcc/config/mips/gnu-user.h
+++ b/gcc/config/mips/gnu-user.h
@@ -139,7 +139,7 @@ extern const char *host_detect_local_cpu (int argc, const 
char **argv);
 /* Similar to standard Linux, but adding -ffast-math support.  */
 #undef GNU_USER_TARGET_MATHFILE_SPEC
 #define GNU_USER_TARGET_MATHFILE_SPEC \
-  "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s}"
+  "%{Ofast|ffast-math|funsafe-math-optimizations:%{!shared:crtfastmath.o%s}}"
 #undef  ENDFILE_SPEC
 #define ENDFILE_SPEC \
   GNU_USER_TARGET_MATHFILE_SPEC " " \
-- 
2.35.3



[PATCH 5/9] loongarch: Don't add crtfastmath.o for -shared

2023-01-13 Thread Richard Biener via Gcc-patches
Don't add crtfastmath.o for -shared to avoid altering the FP
environment when loading a shared library.

PR target/55522
* config/loongarch/gnu-user.h (GNU_USER_TARGET_MATHFILE_SPEC):
Don't add crtfastmath.o for -shared.
---
 gcc/config/loongarch/gnu-user.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/config/loongarch/gnu-user.h b/gcc/config/loongarch/gnu-user.h
index c5b1afe530d..1dc6add62d4 100644
--- a/gcc/config/loongarch/gnu-user.h
+++ b/gcc/config/loongarch/gnu-user.h
@@ -49,7 +49,7 @@ along with GCC; see the file COPYING3.  If not see
 /* Similar to standard Linux, but adding -ffast-math support.  */
 #undef GNU_USER_TARGET_MATHFILE_SPEC
 #define GNU_USER_TARGET_MATHFILE_SPEC \
-  "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s}"
+  "%{Ofast|ffast-math|funsafe-math-optimizations:%{!shared:crtfastmath.o%s}}"
 
 #undef LIB_SPEC
 #define LIB_SPEC GNU_USER_TARGET_LIB_SPEC
-- 
2.35.3



[PATCH 2/9] alpha: Don't add crtfastmath.o for -shared

2023-01-13 Thread Richard Biener via Gcc-patches
Don't add crtfastmath.o for -shared to avoid altering the FP
environment when loading a shared library.

PR target/55522
* config/alpha/linux.h (ENDFILE_SPEC): Don't add
crtfastmath.o for -shared.
---
 gcc/config/alpha/linux.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/config/alpha/linux.h b/gcc/config/alpha/linux.h
index 9c3ad5a1097..7d2f0e844f9 100644
--- a/gcc/config/alpha/linux.h
+++ b/gcc/config/alpha/linux.h
@@ -106,7 +106,7 @@ along with GCC; see the file COPYING3.  If not see
 
 #undef ENDFILE_SPEC
 #define ENDFILE_SPEC \
-  "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s} \
+  "%{Ofast|ffast-math|funsafe-math-optimizations:%{!shared:crtfastmath.o%s}} \
%{shared|pie:crtendS.o%s;:crtend.o%s} crtn.o%s"
 
 #define LINK_GCC_C_SEQUENCE_SPEC \
-- 
2.35.3



[PATCH 4/9] ia64: Don't add crtfastmath.o for -shared

2023-01-13 Thread Richard Biener via Gcc-patches
Don't add crtfastmath.o for -shared to avoid altering the FP
environment when loading a shared library.

There's no maintainer listed for ia64, I'll push this myself as
obvious as last part of the series.

PR target/55522
* config/ia64/linux.h (ENDFILE_SPEC): Don't add crtfastmath.o
for -shared.
---
 gcc/config/ia64/linux.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/config/ia64/linux.h b/gcc/config/ia64/linux.h
index 93510098ccc..4106737cb23 100644
--- a/gcc/config/ia64/linux.h
+++ b/gcc/config/ia64/linux.h
@@ -49,7 +49,7 @@ do {  \
 /* Similar to standard Linux, but adding -ffast-math support.  */
 #undef  ENDFILE_SPEC
 #define ENDFILE_SPEC \
-  "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s} \
+  "%{Ofast|ffast-math|funsafe-math-optimizations:%{!shared:crtfastmath.o%s}} \
%{shared|pie:crtendS.o%s;:crtend.o%s} crtn.o%s"
 
 /* Define this for shared library support because it isn't in the main
-- 
2.35.3



[PATCH 3/9] arm: Don't add crtfastmath.o for -shared

2023-01-13 Thread Richard Biener via Gcc-patches
Don't add crtfastmath.o for -shared to avoid altering the FP
environment when loading a shared library.

PR target/55522
* config/arm/linux-eabi.h (ENDFILE_SPEC): Don't add
crtfastmath.o for -shared.
* config/arm/unknown-elf.h (STARTFILE_SPEC): Likewise.
---
 gcc/config/arm/linux-eabi.h  | 2 +-
 gcc/config/arm/unknown-elf.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/gcc/config/arm/linux-eabi.h b/gcc/config/arm/linux-eabi.h
index 57f830f0176..a119875599d 100644
--- a/gcc/config/arm/linux-eabi.h
+++ b/gcc/config/arm/linux-eabi.h
@@ -121,7 +121,7 @@
 
 #undef ENDFILE_SPEC
 #define ENDFILE_SPEC \
-  "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s} "\
+  "%{Ofast|ffast-math|funsafe-math-optimizations:%{!shared:crtfastmath.o%s}} " 
\
   LINUX_OR_ANDROID_LD (GNU_USER_TARGET_ENDFILE_SPEC, ANDROID_ENDFILE_SPEC)
 
 /* Use the default LIBGCC_SPEC, not the version in linux-elf.h, as we
diff --git a/gcc/config/arm/unknown-elf.h b/gcc/config/arm/unknown-elf.h
index 464d38b6cc6..397ac3f68b9 100644
--- a/gcc/config/arm/unknown-elf.h
+++ b/gcc/config/arm/unknown-elf.h
@@ -33,7 +33,7 @@
 
 #undef  STARTFILE_SPEC
 #define STARTFILE_SPEC \
-  "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s} "\
+  "%{Ofast|ffast-math|funsafe-math-optimizations:%{!shared:crtfastmath.o%s}} " 
\
   UNKNOWN_ELF_STARTFILE_SPEC
 
 #define UNKNOWN_ELF_ENDFILE_SPEC   "crtend%O%s crtn%O%s"
-- 
2.35.3



[PATCH 1/9] aarch64: Don't add crtfastmath.o for -shared

2023-01-13 Thread Richard Biener via Gcc-patches
Don't add crtfastmath.o for -shared to avoid altering the FP
environment when loading a shared library.

PR target/55522
* config/aarch64/aarch64-elf-raw.h (ENDFILE_SPEC): Don't add
crtfastmath.o for -shared.
* config/aarch64/aarch64-freebsd.h (GNU_USER_TARGET_MATHFILE_SPEC):
Likewise.
* config/aarch64/aarch64-linux.h (GNU_USER_TARGET_MATHFILE_SPEC):
Likewise.
---
 gcc/config/aarch64/aarch64-elf-raw.h | 2 +-
 gcc/config/aarch64/aarch64-freebsd.h | 2 +-
 gcc/config/aarch64/aarch64-linux.h   | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-elf-raw.h 
b/gcc/config/aarch64/aarch64-elf-raw.h
index d4d820a9d54..fa5b4527ea0 100644
--- a/gcc/config/aarch64/aarch64-elf-raw.h
+++ b/gcc/config/aarch64/aarch64-elf-raw.h
@@ -25,7 +25,7 @@
 #define STARTFILE_SPEC " crti%O%s crtbegin%O%s crt0%O%s"
 #define ENDFILE_SPEC \
   " crtend%O%s crtn%O%s " \
-  "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s}"
+  "%{Ofast|ffast-math|funsafe-math-optimizations:%{!shared:crtfastmath.o%s}}"
 
 #ifndef LINK_SPEC
 #define LINK_SPEC "%{h*}   \
diff --git a/gcc/config/aarch64/aarch64-freebsd.h 
b/gcc/config/aarch64/aarch64-freebsd.h
index 13beb3781b6..2cf9cf6f046 100644
--- a/gcc/config/aarch64/aarch64-freebsd.h
+++ b/gcc/config/aarch64/aarch64-freebsd.h
@@ -50,7 +50,7 @@
 #define LINK_SPEC FBSD_TARGET_LINK_SPEC AARCH64_ERRATA_LINK_SPEC
 
 #define GNU_USER_TARGET_MATHFILE_SPEC \
-  "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s}"
+  "%{Ofast|ffast-math|funsafe-math-optimizations:%{!shared:crtfastmath.o%s}}"
 
 #undef ENDFILE_SPEC
 #define ENDFILE_SPEC \
diff --git a/gcc/config/aarch64/aarch64-linux.h 
b/gcc/config/aarch64/aarch64-linux.h
index 5e4553d79f5..61ed4067fc5 100644
--- a/gcc/config/aarch64/aarch64-linux.h
+++ b/gcc/config/aarch64/aarch64-linux.h
@@ -50,7 +50,7 @@
 #define LINK_SPEC LINUX_TARGET_LINK_SPEC AARCH64_ERRATA_LINK_SPEC
 
 #define GNU_USER_TARGET_MATHFILE_SPEC \
-  "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s}"
+  "%{Ofast|ffast-math|funsafe-math-optimizations:%{!shared:crtfastmath.o%s}}"
 
 #undef ENDFILE_SPEC
 #define ENDFILE_SPEC   \
-- 
2.35.3