[gcc(refs/users/matz/heads/x86-ssw)] x86-ssw: Adjust testcase

2024-07-09 Thread Michael Matz via Gcc-cvs
https://gcc.gnu.org/g:495a687dc93a58110076700f48fb57fa79026bef

commit 495a687dc93a58110076700f48fb57fa79026bef
Author: Michael Matz 
Date:   Tue Jul 9 14:26:31 2024 +0200

x86-ssw: Adjust testcase

This testcase tries to (uselessly) shrink-wrap the frame allocation
in f0() and then calls the prologue expander twice, emitting the
messages the dejagnu directives look for more times than
expected.  Just disable separate shrink wrapping here.

Diff:
---
 gcc/testsuite/gcc.dg/stack-check-5.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.dg/stack-check-5.c b/gcc/testsuite/gcc.dg/stack-check-5.c
index 0243147939c1..b93dabdaea1d 100644
--- a/gcc/testsuite/gcc.dg/stack-check-5.c
+++ b/gcc/testsuite/gcc.dg/stack-check-5.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2 -fstack-clash-protection -fdump-rtl-pro_and_epilogue -fno-optimize-sibling-calls --param stack-clash-protection-probe-interval=12 --param stack-clash-protection-guard-size=12" } */
+/* { dg-options "-O2 -fstack-clash-protection -fno-shrink-wrap-separate -fdump-rtl-pro_and_epilogue -fno-optimize-sibling-calls --param stack-clash-protection-probe-interval=12 --param stack-clash-protection-guard-size=12" } */
 /* { dg-require-effective-target supports_stack_clash_protection } */
 /* { dg-skip-if "" { *-*-* } { "-fstack-protector*" } { "" } } */


[gcc(refs/users/matz/heads/x86-ssw)] x86-ssw: precise using of moves

2024-07-09 Thread Michael Matz via Gcc-cvs
https://gcc.gnu.org/g:d213bc5e67d903143608e0a7879c2577c33ca47e

commit d213bc5e67d903143608e0a7879c2577c33ca47e
Author: Michael Matz 
Date:   Tue Jul 9 06:01:47 2024 +0200

x86-ssw: precise using of moves

We need to distinguish between merely not wanting to use moves
and not being able to.  When the allocated frame is too
large we can't use moves freely and hence need to disable
separate shrink wrapping.  If we merely don't want to use moves
by default, for speed or the like, but nothing else prevents
them, then that is no reason to disable separate shrink wrapping.

Diff:
---
 gcc/config/i386/i386.cc | 20 +++-
 gcc/config/i386/i386.h  |  1 +
 2 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 23226d204a09..20f4dcd61870 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -7120,9 +7120,7 @@ ix86_compute_frame_layout (void)
   /* Size prologue needs to allocate.  */
   to_allocate = offset - frame->sse_reg_save_offset;
 
-  if ((!to_allocate && frame->nregs <= 1
-   /*&& !flag_shrink_wrap_separate*/)
-  || (TARGET_64BIT && to_allocate >= HOST_WIDE_INT_C (0x8000))
+  if ((TARGET_64BIT && to_allocate >= HOST_WIDE_INT_C (0x8000))
/* If static stack checking is enabled and done with probes,
  the registers need to be saved before allocating the frame.  */
   || flag_stack_check == STATIC_BUILTIN_STACK_CHECK
@@ -7135,6 +7133,12 @@ ix86_compute_frame_layout (void)
   || (flag_stack_clash_protection
  && !ix86_target_stack_probe ()
  && to_allocate > get_probe_interval ()))
+{
+  frame->cannot_use_moves = true;
+}
+
+  if ((!to_allocate && frame->nregs <= 1)
+  || frame->cannot_use_moves)
 frame->save_regs_using_mov = false;
 
   if (ix86_using_red_zone ()
@@ -10800,13 +10804,13 @@ separate_frame_alloc_p (void)
 static sbitmap
 ix86_get_separate_components (void)
 {
-  //struct machine_function *m = cfun->machine;
-  //struct ix86_frame *frame = &m->frame;
+  struct machine_function *m = cfun->machine;
+  struct ix86_frame *frame = &m->frame;
   sbitmap components;
 
   ix86_finalize_stack_frame_flags ();
-  if (/*!frame->save_regs_using_mov
-  ||*/ crtl->drap_reg
+  if (frame->cannot_use_moves
+  || crtl->drap_reg
   || cfun->machine->func_type != TYPE_NORMAL)
 return NULL;
 
@@ -10868,9 +10872,7 @@ ix86_components_for_bb (basic_block bb)
{
  need_frame = true;
  break;
-
}
-
}
 }
   if (need_frame)
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index dd73687a8e2c..bda3d97ab4cf 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -2645,6 +2645,7 @@ struct GTY(()) ix86_frame
   /* When save_regs_using_mov is set, emit prologue using
  move instead of push instructions.  */
   bool save_regs_using_mov;
+  bool cannot_use_moves;
 
   /* Assume without checking that:
EXPENSIVE_P = expensive_function_p (EXPENSIVE_COUNT).  */


[gcc(refs/users/matz/heads/x86-ssw)] x86-ssw: adjust testcase

2024-07-09 Thread Michael Matz via Gcc-cvs
https://gcc.gnu.org/g:cf6d794219dd0cf2ca3601e2d6e6b9e5f497a47a

commit cf6d794219dd0cf2ca3601e2d6e6b9e5f497a47a
Author: Michael Matz 
Date:   Tue Jul 9 06:01:22 2024 +0200

x86-ssw: adjust testcase

Diff:
---
 gcc/testsuite/gcc.target/x86_64/abi/callabi/leaf-2.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.target/x86_64/abi/callabi/leaf-2.c b/gcc/testsuite/gcc.target/x86_64/abi/callabi/leaf-2.c
index 2a54bc89cfc2..140389626659 100644
--- a/gcc/testsuite/gcc.target/x86_64/abi/callabi/leaf-2.c
+++ b/gcc/testsuite/gcc.target/x86_64/abi/callabi/leaf-2.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2 -mabi=sysv" } */
+/* { dg-options "-O2 -mabi=sysv -fno-shrink-wrap-separate" } */
 
 extern int glb1, gbl2, gbl3;


[gcc(refs/users/matz/heads/x86-ssw)] x86-ssw: fix testcases

2024-07-09 Thread Michael Matz via Gcc-cvs
https://gcc.gnu.org/g:c5a72cc80939e42518f4021e0640d29c8b8495a7

commit c5a72cc80939e42518f4021e0640d29c8b8495a7
Author: Michael Matz 
Date:   Tue Jul 9 04:27:46 2024 +0200

x86-ssw: fix testcases

the separate-shrink-wrap infrastructure sometimes
considers components as handled when they aren't in fact
handled (e.g. never calling any emit_prologue_components or
emit_epilogue_components hooks for the component in question).

So track stuff ourselves.

Diff:
---
 gcc/config/i386/i386.cc | 34 --
 gcc/config/i386/i386.h  |  1 +
 2 files changed, 21 insertions(+), 14 deletions(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 4aa37c2ffeaa..23226d204a09 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -6970,7 +6970,7 @@ ix86_compute_frame_layout (void)
 }
 
   frame->save_regs_using_mov
-= (TARGET_PROLOGUE_USING_MOVE || flag_shrink_wrap_separate) && m->use_fast_prologue_epilogue;
+= (TARGET_PROLOGUE_USING_MOVE /*|| flag_shrink_wrap_separate*/) && m->use_fast_prologue_epilogue;
 
   /* Skip return address and error code in exception handler.  */
   offset = INCOMING_FRAME_SP_OFFSET;
@@ -7121,7 +7121,7 @@ ix86_compute_frame_layout (void)
   to_allocate = offset - frame->sse_reg_save_offset;
 
   if ((!to_allocate && frame->nregs <= 1
-   && !flag_shrink_wrap_separate)
+   /*&& !flag_shrink_wrap_separate*/)
   || (TARGET_64BIT && to_allocate >= HOST_WIDE_INT_C (0x8000))
/* If static stack checking is enabled and done with probes,
  the registers need to be saved before allocating the frame.  */
@@ -7418,7 +7418,7 @@ ix86_emit_save_regs (void)
   int regno;
   rtx_insn *insn;
 
-  gcc_assert (!crtl->shrink_wrapped_separate);
+  gcc_assert (!cfun->machine->anything_separately);
 
   if (!TARGET_APX_PUSH2POP2
   || !ix86_can_use_push2pop2 ()
@@ -8974,7 +8974,7 @@ ix86_expand_prologue (void)
   if (!int_registers_saved)
 {
   /* If saving registers via PUSH, do so now.  */
-  if (!frame.save_regs_using_mov)
+  if (!frame.save_regs_using_mov && !m->anything_separately)
{
  ix86_emit_save_regs ();
  int_registers_saved = true;
@@ -9489,7 +9489,7 @@ ix86_emit_restore_regs_using_pop (bool ppx_p)
 {
   unsigned int regno;
 
-  gcc_assert (!crtl->shrink_wrapped_separate);
+  gcc_assert (!cfun->machine->anything_separately);
   for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, false, true))
   ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno), ppx_p);
@@ -9506,7 +9506,7 @@ ix86_emit_restore_regs_using_pop2 (void)
   int loaded_regnum = 0;
   bool aligned = cfun->machine->fs.sp_offset % 16 == 0;
 
-  gcc_assert (!crtl->shrink_wrapped_separate);
+  gcc_assert (!cfun->machine->anything_separately);
   for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, false, true))
   {
@@ -9894,7 +9894,7 @@ ix86_expand_epilogue (int style)
   /* EH_RETURN requires the use of moves to function properly.  */
   if (crtl->calls_eh_return)
 restore_regs_via_mov = true;
-  else if (crtl->shrink_wrapped_separate)
+  else if (m->anything_separately)
 {
   gcc_assert (!TARGET_SEH);
   restore_regs_via_mov = true;
@@ -10800,13 +10800,14 @@ separate_frame_alloc_p (void)
 static sbitmap
 ix86_get_separate_components (void)
 {
-  struct machine_function *m = cfun->machine;
-  struct ix86_frame *frame = &m->frame;
+  //struct machine_function *m = cfun->machine;
+  //struct ix86_frame *frame = &m->frame;
   sbitmap components;
 
   ix86_finalize_stack_frame_flags ();
-  if (!frame->save_regs_using_mov
-  || crtl->drap_reg)
+  if (/*!frame->save_regs_using_mov
+  ||*/ crtl->drap_reg
+  || cfun->machine->func_type != TYPE_NORMAL)
 return NULL;
 
   components = sbitmap_alloc (NCOMPONENTS);
@@ -11150,6 +11151,8 @@ ix86_process_components (sbitmap components, bool prologue_p)
   {
if (bitmap_bit_p (components, regno))
  {
+   m->reg_wrapped_separately[regno] = true;
+   m->anything_separately = true;
if (prologue_p)
  ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
else
@@ -11161,6 +11164,8 @@ ix86_process_components (sbitmap components, bool prologue_p)
   {
if (bitmap_bit_p (components, regno))
  {
+   m->reg_wrapped_separately[regno] = true;
+   m->anything_separately = true;
if (prologue_p)
  ix86_emit_save_reg_using_mov (V4SFmode, regno, sse_cfa_offset);
else
@@ -11181,6 +11186,7 @@ ix86_emit_prologue_components (sbitmap components)
   if (bitmap_bit_

[gcc(refs/users/matz/heads/x86-ssw)] x86-ssw: disable if DRAP reg is needed

2024-07-09 Thread Michael Matz via Gcc-cvs
https://gcc.gnu.org/g:f917195f8a4e1767e89ebb0c875abcbe4dcf97ff

commit f917195f8a4e1767e89ebb0c875abcbe4dcf97ff
Author: Michael Matz 
Date:   Tue Jul 9 02:37:55 2024 +0200

x86-ssw: disable if DRAP reg is needed

Diff:
---
 gcc/config/i386/i386.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 734802dbed4f..4aa37c2ffeaa 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -10805,7 +10805,8 @@ ix86_get_separate_components (void)
   sbitmap components;
 
   ix86_finalize_stack_frame_flags ();
-  if (!frame->save_regs_using_mov)
+  if (!frame->save_regs_using_mov
+  || crtl->drap_reg)
 return NULL;
 
   components = sbitmap_alloc (NCOMPONENTS);


[gcc(refs/users/matz/heads/x86-ssw)] x86-ssw: don't clobber flags

2024-07-09 Thread Michael Matz via Gcc-cvs
https://gcc.gnu.org/g:5a9a70a5837aba373e3f36a89943c52e37a19809

commit 5a9a70a5837aba373e3f36a89943c52e37a19809
Author: Michael Matz 
Date:   Tue Jul 9 02:20:10 2024 +0200

x86-ssw: don't clobber flags

Diff:
---
 gcc/config/i386/i386.cc | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 33e69e96008d..734802dbed4f 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -10878,8 +10878,12 @@ ix86_components_for_bb (basic_block bb)
 }
 
 static void
-ix86_disqualify_components (sbitmap, edge, sbitmap, bool)
+ix86_disqualify_components (sbitmap components, edge e, sbitmap, bool)
 {
+  /* If the flags are needed at the start of e->dest then we can't insert
+ our stack adjustment insns (they default to flag-clobbering add/sub).  */
+  if (bitmap_bit_p (DF_LIVE_IN (e->dest), FLAGS_REG))
+bitmap_clear_bit (components, SW_FRAME);
 }
 
 static void


[gcc(refs/users/matz/heads/x86-ssw)] x86: implement separate shrink wrapping

2024-07-09 Thread Michael Matz via Gcc-cvs
https://gcc.gnu.org/g:eb94eb73cf3993c1d544e6eb8c4dcb671f215b25

commit eb94eb73cf3993c1d544e6eb8c4dcb671f215b25
Author: Michael Matz 
Date:   Sun Jun 30 03:52:39 2024 +0200

x86: implement separate shrink wrapping

Diff:
---
 gcc/config/i386/i386.cc | 581 +++-
 gcc/config/i386/i386.h  |   2 +
 2 files changed, 533 insertions(+), 50 deletions(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 4b6b665e5997..33e69e96008d 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -6970,7 +6970,7 @@ ix86_compute_frame_layout (void)
 }
 
   frame->save_regs_using_mov
-= TARGET_PROLOGUE_USING_MOVE && m->use_fast_prologue_epilogue;
+= (TARGET_PROLOGUE_USING_MOVE || flag_shrink_wrap_separate) && m->use_fast_prologue_epilogue;
 
   /* Skip return address and error code in exception handler.  */
   offset = INCOMING_FRAME_SP_OFFSET;
@@ -7120,7 +7120,8 @@ ix86_compute_frame_layout (void)
   /* Size prologue needs to allocate.  */
   to_allocate = offset - frame->sse_reg_save_offset;
 
-  if ((!to_allocate && frame->nregs <= 1)
+  if ((!to_allocate && frame->nregs <= 1
+   && !flag_shrink_wrap_separate)
   || (TARGET_64BIT && to_allocate >= HOST_WIDE_INT_C (0x8000))
/* If static stack checking is enabled and done with probes,
  the registers need to be saved before allocating the frame.  */
@@ -7417,6 +7418,8 @@ ix86_emit_save_regs (void)
   int regno;
   rtx_insn *insn;
 
+  gcc_assert (!crtl->shrink_wrapped_separate);
+
   if (!TARGET_APX_PUSH2POP2
   || !ix86_can_use_push2pop2 ()
   || cfun->machine->func_type != TYPE_NORMAL)
@@ -7589,7 +7592,8 @@ ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
   for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
   {
-ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
+   if (!cfun->machine->reg_wrapped_separately[regno])
+ ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
cfa_offset -= UNITS_PER_WORD;
   }
 }
@@ -7604,7 +7608,8 @@ ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
   for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true, true))
   {
-   ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
+   if (!cfun->machine->reg_wrapped_separately[regno])
+ ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
cfa_offset -= GET_MODE_SIZE (V4SFmode);
   }
 }
@@ -9089,6 +9094,7 @@ ix86_expand_prologue (void)
= frame.sse_reg_save_offset - frame.reg_save_offset;
 
   gcc_assert (int_registers_saved);
+  gcc_assert (!m->frame_alloc_separately);
 
   /* No need to do stack checking as the area will be immediately
 written.  */
@@ -9106,6 +9112,7 @@ ix86_expand_prologue (void)
   && flag_stack_clash_protection
   && !ix86_target_stack_probe ())
 {
+  gcc_assert (!m->frame_alloc_separately);
   ix86_adjust_stack_and_probe (allocate, int_registers_saved, false);
   allocate = 0;
 }
@@ -9116,6 +9123,7 @@ ix86_expand_prologue (void)
 {
   const HOST_WIDE_INT probe_interval = get_probe_interval ();
 
+  gcc_assert (!m->frame_alloc_separately);
   if (STACK_CHECK_MOVING_SP)
{
  if (crtl->is_leaf
@@ -9172,9 +9180,16 @@ ix86_expand_prologue (void)
   else if (!ix86_target_stack_probe ()
   || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
 {
-  pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
-GEN_INT (-allocate), -1,
-m->fs.cfa_reg == stack_pointer_rtx);
+  if (!m->frame_alloc_separately)
+   pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
+  GEN_INT (-allocate), -1,
+  m->fs.cfa_reg == stack_pointer_rtx);
+  else
+   {
+ if (m->fs.cfa_reg == stack_pointer_rtx)
+   m->fs.cfa_offset -= allocate;
+ m->fs.sp_offset += allocate;
+   }
 }
   else
 {
@@ -9184,6 +9199,8 @@ ix86_expand_prologue (void)
   bool eax_live = ix86_eax_live_at_start_p ();
   bool r10_live = false;
 
+  gcc_assert (!m->frame_alloc_separately);
+
   if (TARGET_64BIT)
 r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
 
@@ -9338,6 +9355,7 @@ ix86_emit_restore_reg_using_pop (rtx reg, bool ppx_p)
   struct machine_function *m = cfun->machine;
   rtx_insn *insn = emit_insn (gen_pop (reg, ppx_p));
 
+  gcc_assert (!m->reg_wrapped_separately[REGNO (reg)]);
   ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset)

[gcc] Created branch 'matz/heads/x86-ssw' in namespace 'refs/users'

2024-07-09 Thread Michael Matz via Gcc-cvs
The branch 'matz/heads/x86-ssw' was created in namespace 'refs/users' pointing 
to:

 c27b30552e6c... gomp: testsuite: improve compatibility of bad-array-section


RE: [PATCH][ivopts]: use affine_tree when comparing IVs during candidate selection [PR114932]

2024-06-19 Thread Michael Matz
Hello,

On Wed, 19 Jun 2024, Tamar Christina wrote:

> So this is where we compare different IV expressions to determine which
> IVs compute the same thing and thus can be in the same group.
> 
> The STRIP_NOPS don't work because while the incoming types are the same
> the casts are different.  So:
> 
> >>> p debug (ustep)
> (unsigned long) stride.3_27 * 4
> $3 = void
> >>> p debug (cstep)
> (unsigned long) (stride.3_27 * 4)
> $4 = void
> 
> Which is of course stripped to:
> 
> >>> p debug (top)
> (unsigned long) stride.3_27 * 4
> $1 = void
> >>> p debug (bot)
> stride.3_27 * 4
> 
> Both of these compute the same thing

In isolation these are _not_ computing the same thing when stride's type is 
smaller than ulong, namely when stride is either negative or larger than 
its max-value/4.  I.e. when comparing IVs not only the overflow behaviour 
for the whole {base,+,step} revolution matters, but also the behaviour on 
the constituent expressions.  (It's possible that stride is known to be 
non-problematic here, I haven't checked.  I was just triggered by the 
claim of same-ness :) )


Ciao,
Michael.


Re: How to avoid some built-in expansions in gcc?

2024-06-05 Thread Michael Matz via Gcc
Hey,

On Wed, 5 Jun 2024, David Brown wrote:

> The ideal here would be to have some way to tell gcc that a given 
> function has the semantics of a different function.  For example, a 
> programmer might have several implementations of "memcpy" that are 
> optimised for different purposes based on the size or alignment of the 
> arguments.  Maybe some of these are written with inline assembly or work 
> in a completely different way (I've used DMA on a microcontroller for 
> the purpose).  If you could tell the compiler that the semantic 
> behaviour and results were the same as standard memcpy(), that could 
> lead to optimisations.
> 
> Then you could declare your "isinf" function with 
> __attribute__((semantics_of(__builtin_isinf))).
> 
> And the feature could be used in any situation where you can write a 
> function in a simple, easy-to-analyse version and a more efficient but 
> opaque version.

Hmm, that actually sounds like a useful feature.  There are some details 
to care for, like what to do with arguments: e.g. do they need to have the 
same types as the referred builtin, only compatible ones, or even just 
convertible ones, and suchlike, but yeah, that sounds nice.
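
To make that concrete, a declaration under such an attribute could look
like the sketch below (purely hypothetical syntax following the name
suggested above; no such attribute exists in GCC today):

  /* Hypothetical -- the hand-written replacement shares the builtin's
     prototype, so calls to it could be reasoned about like isinf.  */
  __attribute__((semantics_of (__builtin_isinf)))
  extern int my_isinf (double x);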


Ciao,
Michael.


Re: How to avoid some built-in expansions in gcc?

2024-06-05 Thread Michael Matz via Gcc
Hello,

On Tue, 4 Jun 2024, Jakub Jelinek wrote:

> On Tue, Jun 04, 2024 at 07:43:40PM +0200, Michael Matz via Gcc wrote:
> > (Well, and without reverse-recognition of isfinite-like idioms in the 
> > sources.  That's orthogonal as well.)
> 
> Why?  If isfinite is better done by a libcall, why isn't isfinite-like
> idiom also better done as a libcall?

It is.  I was just trying to avoid derailing the discussion for finding an 
immediately good solution by searching for the perfect solution.  Idiom 
finding simply is completely independent from the posed problem that 
Georg-Johann has, which remains unsolved AFAICS, as using 
-fno-builtin-foobar has its own (perhaps merely theoretical for AVR) 
problems.


Ciao,
Michael.


Re: How to avoid some built-in expansions in gcc?

2024-06-04 Thread Michael Matz via Gcc
Hello,

On Sat, 1 Jun 2024, Richard Biener via Gcc wrote:

> >>> Do you have a pointer on how to define a target optab? I looked into optabs 
> >>> code but found no appropriate hook.  For isinf it seems it is 
> >>> enough to provide a failing expander, but other functions like isnan 
> >>> don't have an optab entry, so is there a hook mechanism to extend optabs?
> >> Just add corresponding optabs for the missing cases (some additions are 
> >> pending, like isnormal).  There’s no hook to prevent folding to FP 
> >> compares nor is that guarded by other means (like availability of native 
> >> FP ops).  Changing the guards would be another reasonable option.
> >> Richard
> > 
> > There are many other such folds, e.g. for isdigit().  The AVR libraries 
> > have all this in hand-optimized assembly, and all these built-in expansions 
> > are bypassing that.  Open-coded C will never beat that assembly code, at 
> > least not with the current GCC capabilities.
> 
> The idea is that isdigit() || isalpha() or similar optimize without 
> special casing the builtins.
> 
> > How would I go about disabling / bypassing non-const folds from ctype.h and 
> > the many others?
> 
> I think this mostly shows we lack late recognition of open-coded isdigit 
> and friends, at least for the targets where inlining them is not 
> profitable.
> 
> A pragmatic solution might be a new target hook, indicating a specified 
> builtin is not to be folded into an open-coded form.

Well, that's what the mechanism behind -fno-builtin-foobar is supposed to 
be IMHO.  Hopefully the newly added additional mechanism using optabs and 
ifns (instead of builtins) heeds it.

> A good solution would base this on (size) costs, the perfect solution 
> would re-discover the builtins late and undo inlining that didn’t turn 
> out to enable further simplification.
> 
> How is inlined isdigit bad on AVR?  Is a call really that cheap 
> considering possible register spilling around it?

On AVR with needing to use 8bit registers to do everything?  I'm pretty 
sure the call is cheaper, yeah :)


Ciao,
Michael.


Re: [PATCH v6 1/8] Improve must tail in RTL backend

2024-06-04 Thread Michael Matz
Hello,

On Mon, 3 Jun 2024, Jakub Jelinek wrote:

> > Hmm.  I count six tests in about 25 lines of code in 
> > tree-tailcall.cc:suitable_for_tail_opt_p and suitable_for_tail_call_opt_p.
> > 
> > Are you perhaps worrying about the sibcall discovery itself (i.e. much of 
> > find_tail_calls)?  Why would that be needed for musttail?  Is that 
> > attribute sometimes applied to calls that aren't in fact sibcall-able?
> > 
> > One thing I'm worried about is the need for a new sibcall pass at O0 just 
> > for sibcall discovery.  find_tail_calls isn't cheap, because it computes 
> > live local variables for the whole function, potentially being quadratic.
> 
> But the pass could be done only if there is at least one musttail call 
> in a function (remembered in some cfun flag).  If people use that 
> attribute, guess they are willing to pay for it.

Yeah, but I think the way the current proposal is doing it is mostly 
equivalent and fine enough, as Andi mentioned (in my worry I haven't 
considered that overall the backward walk stops fairly soon and then only 
does something when a musttail is there).

I still think that the tree pass being necessary for correctness is bad 
design, in the grand scheme of things, especially for those tests that are 
done for the call statement in isolation (i.e. tests about arguments like 
address-taken and suchlike, and return value, flags on the callee, and 
facts about the current function).  Those should all move to calls.cc or 
cfgexpand IMHO.

But I will yield on the discovery part that tree-tailcall is doing (i.e. 
those pieces that need to look at multiple statements, e.g. how the call 
result is used later); those are a bit harder to do in expand and how it's 
structured, and without getting rid of that part in tree-tailcall we have 
to run it at O0 anyway for musttail.  And moving only parts of the tests 
to calls.cc doesn't seem so worthwhile to hold up the patch.

So, I have no objections on the patch design anymore.


Ciao,
Michael.


Re: [PATCH v6 1/8] Improve must tail in RTL backend

2024-06-03 Thread Michael Matz
Hello,

On Fri, 31 May 2024, Andi Kleen wrote:

> > I think the ultimate knowledge if a call can or cannot be implemented as 
> > tail-call lies within calls.cc/expand_call: It is inherently 
> > target and ABI specific how arguments and returns are layed out, how the 
> > stack frame is generated, if arguments are or aren't removed by callers 
> > or callees and so on; all of that being knowledge that tree-tailcall 
> > doesn't have and doesn't want to have.  As such tree-tailcall should 
> > not be regarded as ultimate truth, and failures of tree-tailcall to 
> > recognize something as tail-callable shouldn't matter.
> 
> It's not the ultimate truth, but some of the checks it does are not 
> duplicated at expand time nor the backend. So it's one necessary pre 
> condition with the current code base.
> 
> Yes maybe the checks could be all moved, but that's a much larger 
> project.

Hmm.  I count six tests in about 25 lines of code in 
tree-tailcall.cc:suitable_for_tail_opt_p and suitable_for_tail_call_opt_p.

Are you perhaps worrying about the sibcall discovery itself (i.e. much of 
find_tail_calls)?  Why would that be needed for musttail?  Is that 
attribute sometimes applied to calls that aren't in fact sibcall-able?

One thing I'm worried about is the need for a new sibcall pass at O0 just 
for sibcall discovery.  find_tail_calls isn't cheap, because it computes 
live local variables for the whole function, potentially being quadratic.


Ciao,
Michael.


Re: [PATCH v6 1/8] Improve must tail in RTL backend

2024-05-29 Thread Michael Matz
On Tue, 21 May 2024, Andi Kleen wrote:

> - Give error messages for all causes of non sibling call generation
> - When giving error messages clear the musttail flag to avoid ICEs
> - Error out when tree-tailcall failed to mark a must-tail call
> sibcall. In this case it doesn't know the true reason and only gives
> a vague message.

Sorry for jumping in late, Richi triggered me :)  But some general 
remarks:

I think the ultimate knowledge if a call can or cannot be implemented as 
tail-call lies within calls.cc/expand_call: It is inherently 
target and ABI specific how arguments and returns are laid out, how the 
stack frame is generated, if arguments are or aren't removed by callers 
or callees and so on; all of that being knowledge that tree-tailcall 
doesn't have and doesn't want to have.  As such tree-tailcall should 
not be regarded as ultimate truth, and failures of tree-tailcall to 
recognize something as tail-callable shouldn't matter.

It then follows that tree-tailcall needn't be run at -O0 merely for 
setting the flag.  Instead calls.cc simply should try expanding a 
tail-call when it sees the must-tail flag (as it right now would do), i.e. 
trust the user.  If that fails for some reasons then that means that the 
checks within calls.cc aren't complete enough (and that tree-tailcall 
papered over that problem).  That would be (IMHO) an independend bug to be 
solved.  But _when_ those bugs are fixed then what you merely need to do 
for the musttail attribute is to set that flag on the gimple_call, 
possibly make sure that nothing (tree-tailcall!) removes the flag, and be 
done.

(For avoidance of doubt: with tree-tailcall I mean the tree sibcall call 
pass, "tailc", not the tail-recursion pass).

IOW: I don't see why the tree pass needs to be run at -O0 for musttail.  
If something doesn't work currently then that points to other 
deficiencies.


Ciao,
Michael.

> 
>   PR83324
> 
> gcc/ChangeLog:
> 
>   * calls.cc (expand_call): Fix mustcall implementation.
>   (maybe_complain_about_tail_call): Clear must tail flag on error.
> ---
>  gcc/calls.cc | 30 --
>  1 file changed, 24 insertions(+), 6 deletions(-)
> 
> diff --git a/gcc/calls.cc b/gcc/calls.cc
> index 21d78f9779fe..161e36839654 100644
> --- a/gcc/calls.cc
> +++ b/gcc/calls.cc
> @@ -1249,6 +1249,7 @@ maybe_complain_about_tail_call (tree call_expr, const char *reason)
>  return;
>  
>error_at (EXPR_LOCATION (call_expr), "cannot tail-call: %s", reason);
> +  CALL_EXPR_MUST_TAIL_CALL (call_expr) = 0;
>  }
>  
>  /* Fill in ARGS_SIZE and ARGS array based on the parameters found in
> @@ -2650,7 +2651,11 @@ expand_call (tree exp, rtx target, int ignore)
>/* The type of the function being called.  */
>tree fntype;
>bool try_tail_call = CALL_EXPR_TAILCALL (exp);
> -  bool must_tail_call = CALL_EXPR_MUST_TAIL_CALL (exp);
> +  /* tree-tailcall decided not to do tail calls. Error for the musttail case,
> + unfortunately we don't know the reason so it's fairly vague.
> + When tree-tailcall reported an error it already cleared the flag.  */
> +  if (!try_tail_call)
> +  maybe_complain_about_tail_call (exp, "other reasons");
>int pass;
>  
>/* Register in which non-BLKmode value will be returned,
> @@ -3022,10 +3027,21 @@ expand_call (tree exp, rtx target, int ignore)
>   pushed these optimizations into -O2.  Don't try if we're already
>   expanding a call, as that means we're an argument.  Don't try if
>   there's cleanups, as we know there's code to follow the call.  */
> -  if (currently_expanding_call++ != 0
> -  || (!flag_optimize_sibling_calls && !CALL_FROM_THUNK_P (exp))
> -  || args_size.var
> -  || dbg_cnt (tail_call) == false)
> +  if (currently_expanding_call++ != 0)
> +{
> +  maybe_complain_about_tail_call (exp, "inside another call");
> +  try_tail_call = 0;
> +}
> +  if (!flag_optimize_sibling_calls
> + && !CALL_FROM_THUNK_P (exp)
> + && !CALL_EXPR_MUST_TAIL_CALL (exp))
> +try_tail_call = 0;
> +  if (args_size.var)
> +{
> +  maybe_complain_about_tail_call (exp, "variable size arguments");
> +  try_tail_call = 0;
> +}
> +  if (dbg_cnt (tail_call) == false)
>  try_tail_call = 0;
>  
>/* Workaround buggy C/C++ wrappers around Fortran routines with
> @@ -3046,13 +3062,15 @@ expand_call (tree exp, rtx target, int ignore)
>   if (MEM_P (*iter))
> {
>   try_tail_call = 0;
> + maybe_complain_about_tail_call (exp,
> + "hidden string length argument passed on stack");
>   break;
> }
>   }
>  
>/* If the user has marked the function as requiring tail-call
>   optimization, attempt it.  */
> -  if (must_tail_call)
> +  if (CALL_EXPR_MUST_TAIL_CALL (exp))
>  try_tail_call = 1;
>  
>/*  Rest of purposes for tail call optimizations to fail.  */
> 


Re: Sourceware mitigating and preventing the next xz-backdoor

2024-04-04 Thread Michael Matz via Gcc
Hello,

On Wed, 3 Apr 2024, Jonathon Anderson wrote:

> Of course, this doesn't make the build system any less complex, but 
> projects using newer build systems seem easier to secure and audit than 
> those using overly flexible build systems like Autotools and maybe even 
> CMake. IMHO using a late-model build system is a relatively low 
> technical hurdle to overcome for the benefits noted above, switching 
> should be considered and in a positive light.

Note that we're talking not (only) about the build system itself, i.e. how 
to declare dependencies within the sources, and how to declare how to 
build them.  make is just fine for that (as are many others).  (In a way 
I think we meanwhile wouldn't really need automake and autogen, but 
rewriting all that in pure GNUmake is a major undertaking).

But Martin also specifically asked about alternatives for feature tests, 
i.e. autoconf's purpose.  I simply don't see how any alternative to it 
could be majorly "easier" or "less complex" at its core.  Going with the 
examples given upthread there is usually only one major solution: to check 
if a given system supports FOOBAR you need to bite the bullet and compile 
(and potentially run!) a small program using FOOBAR.  A configuration 
system that can do that (and I don't see any real alternative to that), no 
matter in which language it's written and how traditional or modern it is, 
also gives you enough rope to hang yourself, if you so choose.
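
For concreteness, here is a minimal sketch of the kind of probe meant
above (strnlen() is used purely as an example feature; any other
function or header would be tested the same way):

  /* conftest.c -- the sort of program a configure check compiles, links
     and (when not cross-compiling) runs to decide feature availability.  */
  #include <string.h>

  int main (void)
  {
    /* Compiles and links only if the C library declares and provides
       strnlen; exit status 0 means the run test succeeded as well.  */
    return strnlen ("feature", 3) == 3 ? 0 : 1;
  }

If compiling, linking or running this fails, the feature is recorded as
absent and the build falls back to whatever replacement it has.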

If you get away without many configuration tests in your project then this 
is because what (e.g.) the compiler gives you, in the form of libstdc++ 
for example, abstracts away many of the peculiarities of a system.  But 
in order to be able to do that something (namely the config system of 
libstdc++) needs to determine what is or isn't supported by the system in 
order to correctly implement these abstractions.  I.e. things you depend 
on did the major lifting of hiding system divergence.

(Well, that, or you are very limited in the number of systems you support, 
which can be the right thing as well!)


Ciao,
Michael.


Re: [PATCH] middle-end/114579 - speed up add_scope_conflicts

2024-04-04 Thread Michael Matz
Hello,

On Thu, 4 Apr 2024, Richard Biener wrote:

> I have reworded the comment before the all-to-all conflict recording
> since it seemed to be confusing and missing the point - but maybe I
> am also missing something here.

The point of the comment was to explain how this differs from building a 
conflict graph for named objects without indirect accesses, i.e. how e.g. 
a conflict graph for register allocation is done.  There conflicts are 
added at the points where objects are defined (between that def and all 
currently live objects), and _only_ there.  I.e. not at CFG merge points 
and not at uses.  So something like so:

  foreach INSN in block backwards:
foreach DEF in INSN:
  foreach O in active:
add_conflict (DEF, O);
  remove (DEF from active);

That's for the more usual backwards direction, but it would be similar 
for the forward one.  The point is that this is the only place where conflicts 
are added, not at CFG splits/merges.

The above relies on being able to see all places where the objects are 
written.  With indirect accesses that's not the case anymore:

   ptr = cond ? &foo : &bar;   // MENTION foo, bar
   *ptr = 42;  // no DEF of a named object
   escape (ptr);
   CLOBBER (foo)
   CLOBBER (bar)

We somewhere need to record the conflict between foo and bar.  
Conservatively we did so at the first real instruction of a block.  As 
a minor optimization we also start a block in a mode where we just record 
mentions, and switch to also recording conflicts at the first real insn. 
(An empty block, or just PHIs do not cause conflicts in themself)

This need for an N-to-N conflict generation at some point is the major 
difference between building the conflict graph for named vs. 
potentially indirect objects, and the comment tried to explain that 
from the perspective of someone familiar with conflict graph building 
in regalloc :-)

Where exactly these N-to-N conflicts are generated doesn't matter so much, 
as long as everything is there.  The current way was the most obvious one.

The observation that these N-to-N conflicts only need to be done on the 
commonly disjoint sets of the inputs (which is trivially empty for a 
single predecessor) is correct, i.e. only doing it with more than a single 
predecessor makes sense.  This requires adding the N-to-Ns always even in 
mostly empty blocks, for the danger of that being missed in the successor 
block, as you are doing.

So, fine with me, also with the changed comment if you think it explains 
stuff better :)


Ciao,
Michael.

> 
> Nevertheless for the testcase in the PR the compile-time spend in
> add_scope_conflicts at -O1 drops from previously 67s (39%) to 10s (9%).
> 
> Bootstrapped and tested on x86_64-unknown-linux-gnu, OK for trunk?
> 
> Thanks,
> Richard.
> 
>   PR middle-end/114579
>   * cfgexpand.cc (add_scope_conflicts_1): Record all-to-all
>   conflicts only when there's a CFG merge but for all CFG merges.
> ---
>  gcc/cfgexpand.cc | 46 +-
>  1 file changed, 33 insertions(+), 13 deletions(-)
> 
> diff --git a/gcc/cfgexpand.cc b/gcc/cfgexpand.cc
> index eef565eddb5..fa48a4c633f 100644
> --- a/gcc/cfgexpand.cc
> +++ b/gcc/cfgexpand.cc
> @@ -640,21 +640,26 @@ add_scope_conflicts_1 (basic_block bb, bitmap work, bool for_conflict)
>   {
> if (for_conflict && visit == visit_op)
>   {
> -   /* If this is the first real instruction in this BB we need
> -  to add conflicts for everything live at this point now.
> -  Unlike classical liveness for named objects we can't
> -  rely on seeing a def/use of the names we're interested in.
> -  There might merely be indirect loads/stores.  We'd not add any
> -  conflicts for such partitions.  */
> +   /* When we are inheriting live variables from our predecessors
> +  through a CFG merge we might not see an actual mention of
> +  the variables to record the appropriate conflict as defs/uses
> +  might be through indirect stores/loads.  For this reason
> +  we have to make sure each live variable conflicts with
> +  each other.  When there's just a single predecessor the
> +  set of conflicts is already up-to-date.
> +  We perform this delayed at the first real instruction to
> +  allow clobbers starting this block to remove variables from
> +  the set of live variables.  */
> bitmap_iterator bi;
> unsigned i;
> -   EXECUTE_IF_SET_IN_BITMAP (work, 0, i, bi)
> - {
> -   class stack_var *a = &stack_vars[i];
> -   if (!a->conflicts)
> - a->conflicts = BITMAP_ALLOC (&stack_var_bitmap_obstack);
> -   bitmap_ior_into (a->conflicts, work);
> - }
> +   if (EDGE_COUNT (bb->preds) > 1)
> + EXECUTE_IF_SET_IN_BITMAP 

Re: Sourceware mitigating and preventing the next xz-backdoor

2024-04-03 Thread Michael Matz via Gcc
Hello,

On Wed, 3 Apr 2024, Martin Uecker wrote:

> The backdoor was hidden in a complicated autoconf script...

Which itself had multiple layers and could just as well have been a 
complicated cmake function.

> > (And, FWIW, testing for features isn't "complex".  And have you looked at 
> > other build systems?  I have, and none of them are less complex, just 
> > opaque in different ways from make+autotools).
> 
> I ask a very specific question: To what extend is testing 
> for features instead of semantic versions and/or supported
> standards still necessary?

I can't answer this with absolute certainty, but points to consider: the 
semantic versions need to be maintained just as well, in some magic way.  
Because ultimately software depends on features of its dependencies, not on 
arbitrary numbers given to them.  The numbers encode these features, in 
the best case, when there are no errors.  So, no, version numbers are not 
a replacement for feature tests, they are a proxy.  One that is manually 
maintained, and hence prone to errors.

Now, supported standards: which one? ;-)  Or more in earnest: while on 
this mailing list here we could choose a certain set, POSIX, some 
languages, Windows, MacOS (versions so-and-so).  What about other 
software relying on other 3rdparty feature providers (libraries or system 
services)?  Without standards?

So, without absolute certainty, but with a little bit of it: yes, feature 
tests are required in general.  That doesn't mean that we could not 
do away with quite some of them for (e.g.) GCC, those that hold true on 
any platform we support.  But we can't get rid of the infrastructure for 
that, and can't get rid of certain classes of tests.

> This seems like a problematic approach that may have been necessary 
> decades ago, but it seems it may be time to move on.

I don't see that.  Many aspects of systems remain non-standardized.


Ciao,
Michael.


Re: Sourceware mitigating and preventing the next xz-backdoor

2024-04-03 Thread Michael Matz via Gcc
Hello,

On Wed, 3 Apr 2024, Martin Uecker via Gcc wrote:

> > > Seems reasonable, but note that it wouldn't make any difference to
> > > this attack.  The liblzma library was modified to corrupt the sshd
> > > binary, when sshd was linked against liblzma.  The actual attack
> > > occurred via a connection to a corrupt sshd.  If sshd was running as
> > > root, as is normal, the attacker had root access to the machine.  None
> > > of the attacking steps had anything to do with having root access
> > > while building or installing the program.
> 
> There does not seem a single good solution against something like this.
> 
> My take-away is that software needs to become less complex. Do 
> we really still need complex build systems such as autoconf?

Do we really need complex languages like C++ to write our software in?  
SCNR :)  Complexity lies in the eye of the beholder, but to be honest in 
the software that we're dealing with here, the build system or autoconf 
does _not_ come to mind first when thinking about complexity.

(And, FWIW, testing for features isn't "complex".  And have you looked at 
other build systems?  I have, and none of them are less complex, just 
opaque in different ways from make+autotools).


Ciao,
Michael.


Re: [PATCH] middle-end/114480 - IDF compute is slow

2024-03-27 Thread Michael Matz
Hey,

On Wed, 27 Mar 2024, Jakub Jelinek wrote:

> > @@ -1712,12 +1711,9 @@ compute_idf (bitmap def_blocks, bitmap_head *dfs)
> >gcc_checking_assert (bb_index
> >< (unsigned) last_basic_block_for_fn (cfun));
> >  
> > -  EXECUTE_IF_AND_COMPL_IN_BITMAP (&dfs[bb_index], phi_insertion_points,
> > - 0, i, bi)
> > -   {
> > +  EXECUTE_IF_SET_IN_BITMAP (&dfs[bb_index], 0, i, bi)
> > +   if (bitmap_set_bit (phi_insertion_points, i))
> >   bitmap_set_bit (work_set, i);
> > - bitmap_set_bit (phi_insertion_points, i);
> > -   }
> >  }
> 
> I don't understand why the above is better.
> Wouldn't it be best to do
>   bitmap_ior_and_compl_into (work_set, &dfs[bb_index],
>                              phi_insertion_points);
>   bitmap_ior_into (phi_insertion_points, &dfs[bb_index]);
> ?

I had the same hunch, but:

1) One would have to make work_set be non-tree-view again (which with the 
current structure is a wash anyway, and that makes sense as accesses to 
work_set aren't heavily random here).

2) But doing that and using bitmap_ior.._into is still measurably slower: 
on a reduced testcase with -O0 -fno-checking, proposed structure 
(tree-view or not-tree-view workset doesn't matter):

 tree SSA rewrite   :  14.93 ( 12%)   0.01 (  2%)  14.95 ( 12%)   27M (  8%)

with non-tree-view, and your suggestion:

 tree SSA rewrite   :  20.68 ( 12%)   0.02 (  4%)  20.75 ( 12%)   27M (  8%)

I can only speculate that the usually extreme sparsity of the bitmaps in 
question makes the setup costs of the two bitmap_ior calls actually more 
expensive than the often-skipped second call to bitmap_set_bit in Richi's 
proposed structure.  (That or cache effects.)


Ciao,
Michael.


Re: [PATCH] i386: For noreturn functions save at least the bp register if it is used [PR114116]

2024-02-29 Thread Michael Matz
Hello,

On Tue, 27 Feb 2024, Jakub Jelinek wrote:

> On Tue, Feb 27, 2024 at 10:13:14AM +0100, Jakub Jelinek wrote:
> > For __libc_start_main, glibc surely just could use no_callee_saved_registers
> > attribute, because that is typically the outermost frame in backtrace,
> > there is no need to save those there.
> > And for kernel if it really wants it and nothing will use the backtraces,
> > perhaps the patch wouldn't need to be reverted completely but just guarded
> > the implicit no_callee_saved_registers treatment of noreturn
> > functions on -mcmodel=kernel or -fno-asynchronous-unwind-tables.
> 
> Guarding on -fno-asynchronous-unwind-tables isn't a good idea,
> with just -g we emit in that case unwind info in .debug_frame section
> and even that shouldn't break, and we shouldn't generate different code for
> -g vs. -g0.
> The problem with the changes is that it breaks the unwinding and debugging
> experience not just in the functions on which the optimization triggers,
> but on all functions in the backtrace as well.
> 
> So, IMHO either revert the changes altogether, or guard on -mcmodel=kernel
> (but talk to kernel people on linux-toolchains if that is what they actually
> want).

What is the underlying real purpose of the changes anyway?  It's a 
nano-optimization: for functions to be called multiple times 
they must either return or be recursive.  The latter is not very likely, 
so a noreturn function is called only once in the vast majority of cases.  
Any optimizations that diddle with the frame setup code for functions 
called only once seem to be ... well, not so very useful, especially so 
when they impact anything that is actually useful, like debugging.

I definitely think this shouldn't be done by default.


Ciao,
Michael.


Re: [PATCH] tree-optimization/114074 - CHREC multiplication and undefined overflow

2024-02-26 Thread Michael Matz
Hello,

On Mon, 26 Feb 2024, Jakub Jelinek wrote:

> > Will update the patch, I think any improvement should be done
> > to get_range_pos_neg (it's a bit odd in behavior for unsigned
> > but I have only signed things incoming).
> 
> For unsigned if it always returned 1, it would be quite useless, there would
> be no point for the caller to call it in that case.

Which seems to make sense for a function called ...pos_neg on unsigned 
types.  I would expect calling it to be useless and always return "yep, 
non-negative, why did you ask?".


Ciao,
Michael.


Re: [PATCH] c: Handle scoped attributes in __has*attribute and scoped attribute parsing changes in -std=c11 etc. modes [PR114007]

2024-02-22 Thread Michael Matz
Hi,

On Thu, 22 Feb 2024, Jakub Jelinek wrote:

> > Hmm, shouldn't you be able to use (nonexistence of) the PREV_WHITE flag on 
> > the second COLON token to see that it's indeed a '::' without intervening 
> > whitespace?  Instead of setting a new flag on the first COLON token?
> > 
> > I.e. something like this:
> > 
> >if (c_parser_next_token_is (parser, CPP_SCOPE)
> > -  || (loose_scope_p
> > - && c_parser_next_token_is (parser, CPP_COLON)
> >   && c_parser_peek_2nd_token (parser)->type == CPP_COLON))
> > + && !(c_parser_peek_2nd_token (parser)->flags & PREV_WHITE)))
> > 
> > ?
> 
> That doesn't seem to work.

Too bad then.  I had hoped it would make the code easier without changes 
to c-lex.  Well, then ... was worth a try, I'll crouch back under my stone
:)


Ciao,
Michael.


Re: [PATCH] c: Handle scoped attributes in __has*attribute and scoped attribute parsing changes in -std=c11 etc. modes [PR114007]

2024-02-22 Thread Michael Matz
Hello,

On Thu, 22 Feb 2024, Jakub Jelinek wrote:

> So, the following patch adds a flag during preprocessing at the point
> where we normally create CPP_SCOPE tokens out of 2 consecutive colons
> on the first CPP_COLON to mark the consecutive case (as we are tight
> on the bits, I've reused the PURE_ZERO flag, which is used just by the
> C++ FE and only ever set (both C and C++) on CPP_NUMBER tokens, this
> new flag has the same value and is only ever used on CPP_COLON tokens)

Hmm, shouldn't you be able to use (nonexistence of) the PREV_WHITE flag on 
the second COLON token to see that it's indeed a '::' without intervening 
whitespace?  Instead of setting a new flag on the first COLON token?

I.e. something like this:

   if (c_parser_next_token_is (parser, CPP_SCOPE)
-  || (loose_scope_p
- && c_parser_next_token_is (parser, CPP_COLON)
  && c_parser_peek_2nd_token (parser)->type == CPP_COLON))
+ && !(c_parser_peek_2nd_token (parser)->flags & PREV_WHITE)))

?


Ciao,
Michael.


Re: Improvement of CLOBBER descriptions

2024-02-21 Thread Michael Matz via Gcc
Hello,

On Wed, 21 Feb 2024, Daniil Frolov wrote:

> >> Following the recent introduction of more detailed CLOBBER types in GCC, a
> >> minor
> >> inconsistency has been identified in the description of
> >> CLOBBER_OBJECT_BEGIN:
> >> 
> >>   /* Beginning of object lifetime, e.g. C++ constructor.  */
> >>   CLOBBER_OBJECT_BEGIN

The "e.g." comment mixes concepts of the C++ language with a 
middle-end/GIMPLE concept, and hence is a bit misleading.  What the 
clobbers are intended to convey to the middle-end is the low-level notion 
of "storage associated with this and that object is now accessible 
(BEGIN)/not accessible anymore (END), for purposes related to that very 
object".  The underlying motivation, _why_ that knowledge is interesting 
to the middle-end, is to be able to share storage between different 
objects.

"purposes related to that object" are any operations on the object: 
construction, destruction, access, frobnification.  It's not tied to a 
particular frontend language (although it's the language semantics that 
dictate when emitting the begin/end clobbers is appropriate).  For the 
middle-end the C++ concept of construction/deconstruction are simply 
modifications (conceptual or real) of the storage associated with an 
object ("object" not in the C++ sense, but from a low-level perspective; 
i.e. an "object" doesn't only exist after c++-construction, it comes into 
existence before that, even if perhaps in an indeterminate/invalid state 
from the frontends perspective).

Initially these clobbers were only emitted when decls went out of 
scope, and so did have some relation to frontend language semantics 
(although a fairly universal one, namely "scope").  The 
C++ frontend then found further uses (e.g. after a dtor for an 
object _ends_ its storage can also be reused), and eventually we also 
needed the BEGIN clobbers to convey the meaning of "now storage use for 
object potentially starts, don't share it with any other object".

If certain frontends find use for more fine-grained definitions of 
life-times, then further note kinds need to be invented for those 
frontends use.  They likely won't have influence on the middle-end though 
(perhaps for some sanitizers such new kinds might be useful?).  But the 
current BEGIN/END clobbers need to continue to mark the outermost borders 
of storage validity for an object.


Ciao,
Michael.


Re: [PATCH V1] rs6000: New pass for replacement of adjacent (load) lxv with lxvp

2024-01-17 Thread Michael Matz
Hello,

On Wed, 17 Jan 2024, Ajit Agarwal wrote:

> > first is even, since OOmode is only ok for even vsx register and its
> > size makes it take two consecutive vsx registers.
> > 
> > Hi Peter, is my understanding correct?
> > 
> 
> I tried all the combinations in the past; RA is not allocating sequential 
> registers.  I don't see any such code in RA that generates sequential 
> registers.

See HARD_REGNO_NREGS.  If you form a pseudo of a mode that's larger than a 
native-sized hardreg (and the target is correctly set up) then the RA will 
allocate the correct number of hardregs (consecutively) for this pseudo.  
This is what Kewen was referring to by mentioning the OOmode for the new 
hypothetical pseudo.  The individual parts of such pseudo will then need 
to use subreg to access them.

So, when you work before RA you simply will transform this (I'm going to 
use SImode and DImode for demonstration):

   (set (reg:SI x) (mem:SI (addr)))
   (set (reg:SI y) (mem:SI (addr+4)))
   ...
   ( ...use1... (reg:SI x))
   ( ...use2... (reg:SI y))

into this:

   (set (reg:DI z) (mem:DI (addr)))
   ...
   ( ...use1... (subreg:SI (reg:DI z) 0))
   ( ...use2... (subreg:SI (reg:DI z) 4))

For this to work the target needs to accept the (subreg...) in certain 
operands of instruction patterns, which I assume was what Kewen also 
referred to.  The register allocator will then assign hardregs X and X+1 
to the pseudo-reg 'z'.  (Assuming that DImode is okay for hardreg X, and 
HARD_REGNO_NREGS says that it needs two hardregs to hold DImode).

It will also replace the subregs by their appropriate concrete hardreg.

It seems your problems stem from trying to place your new pass somewhere 
within the register-allocation pipeline, rather than simply completely 
before it.


Ciao,
Michael.


Re: [PATCH] Optimize (ne:SI (subreg:QI (ashift:SI x 7) 0) 0) as (and:SI x 1).

2023-10-10 Thread Michael Matz


On Tue, 10 Oct 2023, Roger Sayle wrote:

> 
> This patch is the middle-end piece of an improvement to PRs 101955 and
> 106245, that adds a missing simplification to the RTL optimizers.
> This transformation is to simplify (char)(x << 7) != 0 as x & 1.

Random observation:

So, why restrict to shifts of LEN-1 and mask 1?  It's always the case that
(type-of-LEN)(x << S) != 0  ===  (x & ((1 << (LEN - S)) - 1)) != 0.

E.g. (char)(x << 5) != 0  ===  (x & 7) != 0.

(Eventually the mask will be a constant that's too costly to compute if S 
is target-dependently too small, but all else being equal avoiding shifts 
seems sensible)
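
(A brute-force check of that identity, just as a sketch, using unsigned
char so that LEN == 8 and exhausting 16-bit values of x:)

  #include <stdio.h>

  int main (void)
  {
    /* Verify (unsigned char)(x << s) != 0
       <==>  (x & ((1 << (8 - s)) - 1)) != 0  for s = 0..7.  */
    for (unsigned x = 0; x < (1u << 16); x++)
      for (unsigned s = 0; s < 8; s++)
        {
          int lhs = (unsigned char) (x << s) != 0;
          int rhs = (x & ((1u << (8 - s)) - 1)) != 0;
          if (lhs != rhs)
            printf ("mismatch at x=%u s=%u\n", x, s);
        }
    puts ("done");
    return 0;
  }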


Ciao,
Michael.


Re: [V2][PATCH 0/3] New attribute "counted_by" to annotate bounds for C99 FAM(PR108896)

2023-08-10 Thread Michael Matz via Gcc-patches
Hey,

On Thu, 10 Aug 2023, Martin Uecker wrote:

> > offset(struct foo_flex, t[0]) + N * sizeof(foo->t);
> > 
> > With GCC, offset(struct foo_flex,t[0]) == 6, which is also correct. 
> 
> This formula might be considered incorrect / dangerous because
> it might allocate less storage than sizeof(struct foo_flex). 

Oh indeed.  I hadn't even considered that.  That could be "fixed" with 
another max(the above, sizeof(struct foo_flex)), but that starts to become 
silly when the obvious choice works fine.


Ciao,
Michael.


Re: [V2][PATCH 0/3] New attribute "counted_by" to annotate bounds for C99 FAM(PR108896)

2023-08-10 Thread Michael Matz via Gcc-patches
Hello,

On Wed, 9 Aug 2023, Qing Zhao wrote:

> > So, should the equivalent FAM struct also have this sizeof()?  If no: 
> > there should be a good argument why it shouldn't be similar to the non-FAM 
> > one.
> 
> The sizeof() of a structure with FAM is defined as: (after I searched online,
>  I think that the one in Wikipedia is the most reasonable one):
> https://en.wikipedia.org/wiki/Flexible_array_member

Well, wikipedia has its uses.  Here we are in language-lawyering land, 
together with a discussion of what makes most sense in many circumstances.  
FWIW, in this case I think the cited text from wikipedia is correct in the 
sense of "not wrong" but not helpful in the sense of "good advice".

> By definition, the sizeof() of a struct with FAM might not be the same 
> as the non-FAM one. i.e, for the following two structures, one with FAM, 
> the other with fixed array:
> 
> struct foo_flex { int a; short b; char t[]; } x = { .t = { 1, 2, 3 } };
> struct foo_fix {int a; short b; char t[3]; } 
> 
> With current GCC:
> sizeof(foo_flex) == 8
> sizeof(foo_fix) == 12
> 
> I think that the current behavior of sizeof for structure with FAM in 
> GCC is correct.

It is, yes.

> The major issue is what was pointed out by Martin in the previous email:
> 
> Whether using the following formula is correct to compute the 
> allocation?
> 
> sizeof(struct foo_flex) + N * sizeof(foo->t);
> 
> As pointed out  in the wikipedia, the value computed by this formula might
> be bigger than the actual size since “sizeof(struct foo_flex)” might include 
> paddings that are used as part of the array.

That doesn't make the formula incorrect, but rather conservatively 
correct.  If you don't want to be conservative, then yes, you can use a 
different formula if you happen to know the layout rules your compiler at 
hand uses (or the ones prescribed by an ABI, if it does that).  I think 
it would be bad advice to the general population to advertise this scheme 
as better.

> So the more accurate formula should be
> 
> offset(struct foo_flex, t[0]) + N * sizeof(foo->t);

"* sizeof(foo->t[0])", but yes.

> For the question, whether the compiler needs to allocate paddings after 
> the FAM field, I don’t know the answer, and it’s not specified in the 
> standard either. Does it matter?

It matters for two things:

1) Abstract reasons: is there a reason to deviate 
from the normal rules?  If not: it shouldn't deviate.  Future 
extensibility: while it right now is not possible to form an array 
of FAM-structs (in C!), who's to say that it may not be eventually added?  
It seems a natural enough extension of an extension, and while it has 
certain implementation problems (the "real" size of the elements needs to 
be computable, and hence be part of the array type) those could be 
overcome.  At that point you _really_ want to have the elements aligned 
naturally, be compatible with sizeof, and be the same as an individual 
object.

2) Practical reasons: code generation works better if the accessible sizes 
of objects are a multiple of their alignment, as often you have 
instructions that can move around alignment-sized blobs (say, words).  If 
you don't pad out objects you have to be careful to use only byte accesses 
when you get to the end of an object.

Let me ask the question in the opposite way: what would you _gain_ by not 
allocating padding?  And is that enough to deviate from the most obvious 
choices?  (Do note that e.g. global or local FAM-typed objects don't exist 
in isolation, and their surrounding objects have their own alignment 
rules, which often will lead to tail padding anyway, even if you would use 
the non-conservative size calculation; the same applies for malloc'ed 
objects).

> > Note that if one choses to allocate less space than sizeof implies that 
> > this will have quite some consequences for code generation, in that 
> > sometimes the instruction sequences (e.g. for copying) need to be careful 
> > to never access tail padding that should be there in array context, but 
> > isn't there in single-object context.  I think this alone should make it 
> > clear that it's advisable that sizeof() and allocated size agree.
> 
> Sizeof by definition is return the size of the TYPE, not the size of the 
> allocated object.

Sure.  Outside special cases like FAM it's the same, though.  And there 
sizeof does include tail padding.

> > And then the next question is what __builtin_object_size should do with 
> > these: should it return the size with or without padding at end (i.e. 
> > could/should it return 9 even if sizeof is 12).  I can see arguments for 
> > both.
> 
> Currently, GCC’s __builtin_object_size use the following formula to 
> compute the object size for The structure with FAM:
> 
> offset(struct foo_flex, t[0]) + N * sizeof(foo->t);
> 
> I think it’s correct.

See above.  It's non-conservatively correct.  And that may be the right 
choice for this builtin, considering its intended use-cases (strict 

Re: [V2][PATCH 0/3] New attribute "counted_by" to annotate bounds for C99 FAM(PR108896)

2023-08-09 Thread Michael Matz via Gcc-patches
Hello,

On Wed, 9 Aug 2023, Qing Zhao wrote:

> Although this is an old FAM related issue that does not relate to my current 
> patch 
> (and might need to be resolved in a separate patch).  I think that it’s 
> necessary to have
> more discussion on this old issue and resolve it. 
> 
> The first thing that I’d like to confirm is:
> 
> What the exact memory layout for the following structure x?
> 
> struct foo { int a; short b; char t[]; } x = { .t = { 1, 2, 3 } };
> 
> And the key that is confusing me is, where should the field “t” start? 
> 
> A.  Starting at offset 8 as the following:
> 
> a 4-bytes
> b 2-bytes
> padding   2-bytes
> t 3-bytes

Why should there be padding before 't'?  It's a char array (FAM or not), 
so it can be (and should be) placed directly after 'b'.  So ...

> B. Starting at offset 6 as the following:
> 
> a 4-bytes
> b 2-bytes
> t 3-bytes

... this is the correct layout, when seen in isolation.  The discussion 
revolves around what should come after 't': if it's a non-FAM struct (with 
t[3]), then it's clear that there needs to be padding after it, so as to 
pad out the whole struct to be 12 bytes long (for sizeof() purposes), as 
required by its alignment (due to the int field 'a').

So, should the equivalent FAM struct also have this sizeof()?  If no: 
there should be a good argument why it shouldn't be similar to the non-FAM 
one.
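
For illustration, a small program printing the numbers in question (the 
values in the comments assume a typical x86-64 ABI; 'foo_fix' is just a 
name made up here for the fixed-size counterpart):

#include <stdio.h>
#include <stddef.h>

struct foo     { int a; short b; char t[]; };   /* FAM struct from above */
struct foo_fix { int a; short b; char t[3]; };  /* fixed-size counterpart */

int main (void)
{
  printf ("offsetof(foo, t) = %zu\n", offsetof (struct foo, t));  /* 6  */
  printf ("sizeof(foo)      = %zu\n", sizeof (struct foo));       /* 8  */
  printf ("sizeof(foo_fix)  = %zu\n", sizeof (struct foo_fix));   /* 12 */
  return 0;
}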

Then there is an argument that the compiler would be fine, when allocating 
a single object of such type (not as part of an array!), to only reserve 9 
bytes of space for the FAM-struct.  Then the question is: should it also 
do that for a non-FAM struct (based on the argument that the padding 
behind 't' isn't accessible, and hence doesn't need to be allocated)?  I 
think it would be quite unexpected for the compiler to actually reserve 
less space than sizeof() implies, so I personally don't buy that argument.  
For FAM or non-FAM structs.

Note that if one chooses to allocate less space than sizeof implies, this 
will have quite some consequences for code generation, in that 
sometimes the instruction sequences (e.g. for copying) need to be careful 
to never access tail padding that should be there in array context, but 
isn't there in single-object context.  I think this alone should make it 
clear that it's advisable that sizeof() and allocated size agree.

As in: I think sizeof for both structs should return 12, and 12 bytes 
should be reserved for objects of such types.

And then the next question is what __builtin_object_size should do with 
these: should it return the size with or without padding at end (i.e. 
could/should it return 9 even if sizeof is 12).  I can see arguments for 
both.


Ciao,
Michael.


RE: Intel AVX10.1 Compiler Design and Support

2023-08-09 Thread Michael Matz via Gcc-patches
Hello,

On Wed, 9 Aug 2023, Zhang, Annita via Gcc-patches wrote:

> > The question is whether you want to mandate the 16-bit floating point
> > extensions.  You might get better adoption if you stay compatible with 
> > shipping
> > CPUs.  Furthermore, the 256-bit tuning apparently benefits current Intel 
> > CPUs,
> > even though they can do 512-bit vectors.
> > 
> > (The thread subject is a bit misleading for this sub-topic, by the way.)
> > 
> > Thanks,
> > Florian
> 
> Since 256bit and 512bit are diverged from AVX10.1 and will continue in 
> the future AVX10 versions, I think it's hard to keep a single version 
> number to cover both and increase monotonically. Hence I'd like to 
> suggest x86-64-v5 for 512bit and x86-64-v5-256 for 256bit, and so on.

The raison d'etre for the x86-64-vX scheme is to make life sensible as a 
distributor.  That goal can only be achieved if this scheme contains only 
a few components that have a simple relationship.  That basically means: 
one dimension only.  If you now add a second dimension (with and without 
-512) we have to add another one if Intel (or whoever else) next does a 
marketing stunt for feature "foobar", and we end up with x86-64-v6, 
x86-64-v6-512, x86-64-v6-1024, x86-64-v6-foobar, x86-64-v6-512-foobar, 
x86-64-v6-1024-foobar.

In short: no.

It isn't the right time anyway to assign meaning to x86-64-v5, as it 
wasn't the right time for assigning x86-64-v4 (as we now see).  These are 
supposed to reflect generally useful feature sets actually shipped in 
generally available CPUs in the market, and be vendor independent.  As 
such it's much too early to define v5 based purely on text documents.


Ciao,
Michael.


Re: [V2][PATCH 0/3] New attribute "counted_by" to annotate bounds for C99 FAM(PR108896)

2023-08-08 Thread Michael Matz via Gcc-patches
Hello,

On Tue, 8 Aug 2023, Martin Uecker via Gcc-patches wrote:

> There at least three different size expression which could
> make sense. Consider
> 
> short foo { int a; short b; char t[]; }; 
> 
> Most people seem to use
> 
> sizeof(struct foo) + N * sizeof(foo->t);
> 
> which for N == 3 yields 11 bytes on x86-64 because the formula
> adds the padding of the original struct. There is an example
> in the  C standard that uses this formula.
> 
> 
> But he minimum size of an object which stores N elements is
> 
> max(sizeof (struct s), offsetof(struct s, t[n]))
> 
> which is 9 bytes. 

But should it really?  struct sizes should usually be a multiple of a 
struct's alignment, and if int is 4-aligned only size 12 would be correct. 
I don't see why one should deviate from that general rule for sizes for 
FAM structs.  (I realize that the above is not about sizeof(), but rather 
bdos/bos, but I think it's fairly useful that both agree when possible).

Say, if you were to allocate an array of such struct foos, each having 3 
elements in foo.t[].  You need to add 12 bytes to go to the next array 
element, not 9.  (Of course arrays of FAM-structs are somewhat meh, but 
still).  It's true that you would be allowed to rely on only 9 bytes of 
those 12 bytes (the rest being padding), so perhaps it's really the right 
answer for __bos, but, hmm, 12 seems to be "more correct" to my guts :-)
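
As a purely illustrative aside, the three candidate sizes from this 
subthread spelled out in code (the numbers in the comments assume x86-64):

#include <stdio.h>
#include <stddef.h>

struct foo { int a; short b; char t[]; };

int main (void)
{
  size_t n = 3;
  size_t align = _Alignof (struct foo);

  size_t s1 = sizeof (struct foo) + n;            /* 8 + 3 = 11          */
  size_t s2 = offsetof (struct foo, t) + n;       /* 6 + 3 = 9           */
  if (s2 < sizeof (struct foo))
    s2 = sizeof (struct foo);                     /* max(...), still 9   */
  size_t s3 = (s2 + align - 1) / align * align;   /* rounded up: 12      */

  printf ("%zu %zu %zu\n", s1, s2, s3);
  return 0;
}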


Ciao,
Michael.


Re: [RFC] GCC Security policy

2023-08-08 Thread Michael Matz via Gcc-patches
Hello,

On Tue, 8 Aug 2023, Jakub Jelinek via Gcc-patches wrote:

> What I'd really like to avoid is having all compiler bugs (primarily ICEs)
> considered to be security bugs (e.g. DoS category), it would be terrible to
> release every week a new compiler because of the "security" issues.
> Running compiler on untrusted sources can trigger ICEs (which we want to fix
> but there will always be some), or run into some compile time and/or compile
> memory issue (we have various quadratic or worse spots), compiler stack
> limits (deeply nested stuff e.g. during parsing but other areas as well).
> So, people running fuzzers and reporting issues is great, but if they'd get
> a CVE assigned for each ice-on-invalid-code, ice-on-valid-code,
> each compile-time-hog and each memory-hog, that wouldn't be useful.

This!  Double-this!

FWIW, the binutils security policy, and by extension the proposed GCC 
policy David posted, handles this.  (To me this is the most important 
aspect of such a policy, having been on the receiving end of such nonsense 
on the binutils side).

> Runtime libraries or security issues in the code we generate for valid
> sources are of course a different thing.

Generate or otherwise provide for consumption.  E.g. a bug with security 
consequences in the runtime libs (either in source form (templates) or as 
executable code, but with the problem being in e.g. libgcc sources 
(unwinder!)) needs proper handling, similar to how glibc is handled.


Ciao,
Michael.


Re: _BitInt vs. _Atomic

2023-08-02 Thread Michael Matz via Gcc-patches
Hello,

On Tue, 1 Aug 2023, Joseph Myers wrote:

> > Only because cmpxchg is defined in terms of memcpy/memcmp.  If it were 
> > defined in terms of the == operator (obviously applied recursively 
> > member-wise for structs) and simple-assignment that wouldn't be a problem.  
> 
> It also wouldn't work for floating point, where I think clearly the atomic 
> operations should consider positive and negative zero as different, and 
> should consider different DFP quantum exponents for the same real number 
> as different - but should also consider the same NaN (same payload, same 
> choice of quiet / signaling) as being the same.

That is all true.  But the current wording can't work either.  It happily 
requires copying around memory between types of different representations 
and sizes, it makes padding observable behaviour, and due to that makes 
basic algebraic guarantees not be observed (after two values are checked 
equal with the predicates of the algebra, they are not then in fact equal 
with predicates of the same algebra).


Ciao,
Michael.


Re: _BitInt vs. _Atomic

2023-08-01 Thread Michael Matz via Gcc-patches
Hello,

On Mon, 31 Jul 2023, Martin Uecker wrote:

> >  Say you have a loop like so:
> > 
> > _Atomic T obj;
> > ...
> > T expected1, expected2, newval;
> > newval = ...;
> > expected1 = ...;
> > do {
> >   expected2 = expected1;
> >   if (atomic_compare_exchange_weak(&obj, &expected2, newval))
> >     break;
> >   expected1 = expected2;
> > } while (1);
> > 
> > As written this looks of course stupid, and you may say "don't do that", 
> > but internally the copies might result from temporaries (compiler 
> > generated or wrapper function arguments, or suchlike). 
> >  Now, while 
> > expected2 will contain the copied padding bits after the cmpxchg the 
> > copies to and from expected1 will possibly destroy them.  Either way I 
> > don't see why the above loop should be out-of-spec, so I can write it and 
> > expect it to proceed eventually (certainly when the _strong variant is 
> > used).  Any argument that would declare the above loop out-of-spec I would 
> > consider a defect in the spec.
> 
> It is "out-of-spec" for C in the sense that it can not be
> expected work with the semantics as specified in the C standard.

(I call that a defect.  See below)

> In practice, what the semantics specified using memcpy/memcmp
> allow one to do is to also apply atomic operations on non-atomic 
> types.  This is not guaranteed to work by the C standard, but
> in practice  people often have to do this.  For example, nobody
> is going to copy a 256 GB numerical array with non-atomic types
> into another data structure with atomic versions of the same
> type just so that you can apply atomic operations on it.
> So one simply does an unsafe cast and hopes the compiler does
> not break this.
> 
> If the non-atomic struct now has non-zero values in the padding, 
> and the compiler would clear those automatically for "expected", 
> you would create the problem of an infinite loop (this time 
> for real).

Only because cmpxchg is defined in terms of memcpy/memcmp.  If it were 
defined in terms of the == operator (obviously applied recursively 
member-wise for structs) and simple-assignment that wouldn't be a problem.  
In addition that would get rid of all discussion of what happens or 
doesn't happen with padding.  Introducing reliance on padding bits (which 
IMHO goes against some fundamental ideas of the C standard) has 
far-reaching consequences, see below.  The current definition of the 
atomic_cmpxchg is also inconsistent with the rest of the standard:

We have:

  ... (C is non-atomic variant of A) ...
  _Bool atomic_compare_exchange_strong(volatile A *object,
   C *expected, C desired);
  ... (is equivalent to atomic variant of:) 
  if (memcmp(object, expected, sizeof (*object)) == 0)
    { memcpy(object, &desired, sizeof (*object)); return true; }
  else
    { memcpy(expected, object, sizeof (*object)); return false; }

But we also have:

  The size, representation, and alignment of an atomic type need not be 
  the same as those of the corresponding unqualified type.

  (with later text only suggesting that at least for atomic integer 
  types these please be the same.  But here we aren't talking about
  integer types even.)

So, already the 'memcmp(object, expected, sizeof (*object))' may be 
undefined.  sizeof(*object) need not be the same as sizeof(*expected).
In particular the memcpy in the else branch might clobber memory outside 
*expected.

That alone should be sufficient to show that defining this all in terms of 
memcpy/memcmp is a bad idea.  But it also has other 
consequences: you can't copy (simple-assign) or compare (== operator) 
atomic values anymore reliably and expect the atomic_cmpxchg to work.  My 
example from earlier shows that you can't copy them; a similar one can be 
constructed for breaking ==.

But it goes further: you can also construct an example that shows an 
internal inconsistency just by using atomic_cmpxchg (of course, assume 
all this to run without concurrent accesses to the respective objects):

  _Atomic T obj;
  ...
  T expected, newval;
  expected = ...;
  newval = expected + 1; // just to make it different
  atomic_store (&obj, expected);
  if (atomic_cmpxchg_strong (&obj, &expected, newval)) {
    /* Now we have: obj == newval.
       Do we also have memcmp(&obj,&newval)==0? */
    if (!atomic_cmpxchg_strong (&obj, &newval, expected)) {
      /* No, we can't rely on that!  */
      error("what's going on?");
    }
  } else {
    /* May happen, padding of expected may not be the same
       as in obj, even after atomic_store.  */
    error("WTH? a compare after a store doesn't even work?");
  }

So, even though cmpxchg is defined in terms of memcpy/memcmp, we still 
can't rely on anything after it succeeded (or failed).  Simply because the 
by-value passing of the 'desired' argument will have unknown padding 
(within the implementation of cmpxchg) that isn't necessarily the same as 
the newval object.

Now, about your suggestion of clearing or ignoring the padding bits at 
specific 

Re: _BitInt vs. _Atomic

2023-07-31 Thread Michael Matz via Gcc-patches
Hello,

On Fri, 28 Jul 2023, Martin Uecker wrote:

> > > Sorry, somehow I must be missing something here.
> > > 
> > > If you add something you would create a new value and this may (in
> > > an object) have random new padding.  But the "expected" value should
> > > be updated by a failed atomic_compare_exchange cycle and then have
> > > same padding as the value stored in the atomic. So the next cycle
> > > should succeed.  The user would not change the representation of
> > > the "expected" value but create a new value for another object
> > > by adding something.
> > 
> > You're right that it would pass the expected value not something after an
> > operation on it usually.  But still, expected type will be something like
> > _BitInt(37) or _BitInt(195) and so neither the atomic_load nor what
> > atomic_compare_exchange copies back on failure is guaranteed to have the
> > padding bits preserved.
> 
> For atomic_load in C a value is returned. A value does not care about
> padding and when stored into a new object can produce new and different
> padding.  
> 
> But for atomic_compare_exchange the memory content is copied into 
> an object passed by pointer, so here the C standard requires to
> that the padding is preserved. It explicitely states that the effect
> is like:
> 
> if (memcmp(object, expected, sizeof(*object)) == 0)
>   memcpy(object, &desired, sizeof(*object));
> else
>   memcpy(expected, object, sizeof(*object));
> 
> > It is true that if it is larger than 16 bytes the libatomic 
> > atomic_compare_exchange will memcpy the value back which copies the 
> > padding bits, but is there a guarantee that the user code doesn't 
> > actually copy that value further into some other variable?  
> 
> I do not think it would be surprising for C user when
> the next atomic_compare_exchange fails in this case.

But that is a problem (the same one the cited C++ paper tries to resolve, 
IIUC).  Say you have a loop like so:

_Atomic T obj;
...
T expected1, expected2, newval;
newval = ...;
expected1 = ...;
do {
  expected2 = expected1;
  if (atomic_compare_exchange_weak(&obj, &expected2, newval))
    break;
  expected1 = expected2;
} while (1);

As written this looks of course stupid, and you may say "don't do that", 
but internally the copies might result from temporaries (compiler 
generated or wrapper function arguments, or suchlike).  Now, while 
expected2 will contain the copied padding bits after the cmpxchg, the 
copies to and from expected1 will possibly destroy them.  Either way I 
don't see why the above loop should be out-of-spec, so I can write it and 
expect it to proceed eventually (certainly when the _strong variant is 
used).  Any argument that would declare the above loop out-of-spec I would 
consider a defect in the spec.

It's never a good idea to introduce reliance on padding bits.  Exactly 
because you can trivially destroy them with simple value copies.
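
A tiny stand-alone illustration of that point (what happens to the padding 
bytes on assignment is unspecified, so the memcmp result here is not 
guaranteed either way):

#include <stdio.h>
#include <string.h>

struct S { char c; /* typically 3 bytes of padding here */ int i; };

int main (void)
{
  struct S a, b;
  memset (&a, 0xff, sizeof a);   /* give a's padding a known pattern */
  a.c = 1;
  a.i = 2;
  b = a;                         /* value copy: padding need not be copied */
  printf ("values equal: %d, memcmp: %d\n",
          a.c == b.c && a.i == b.i, memcmp (&a, &b, sizeof a));
  return 0;
}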

> > Anyway, for smaller or equal to 16 (or 8) bytes if 
> > atomic_compare_exchange is emitted inline I don't see what would 
> > preserve the bits.
> 
> This then seems to be incorrect for C.

Or the spec is.


Ciao,
Michael.


Re: Calling convention for Intel APX extension

2023-07-31 Thread Michael Matz via Gcc
Hello,

On Sun, 30 Jul 2023, Thomas Koenig wrote:

> > I've recently submitted a patch that adds some attributes that basically
> > say "these-and-those regs aren't clobbered by this function" (I did them
> > for not clobbered xmm8-15).  Something similar could be used for the new
> > GPRs as well.  Then it would be a matter of ensuring that the interesting
> > functions are marked with that attributes (and then of course do the
> > necessary call-save/restore).
> 
> Interesting.
> 
> Taking this a bit further: The compiler knows which registers it used
> (and which ones might get clobbered by called functions) and could
> generate such information automatically and embed it in the assembly
> file, and the assembler could, in turn, put it into the object file.
> 
> A linker (or LTO) could then check this and elide save/restore pairs
> where they are not needed.

LTO with interprocedural register allocation (-fipa-ra) already does this.  

Doing it without LTO is possible to implement in the way you suggest, but 
is very hard to get effective: the problem is that saving/restoring of 
registers might be scheduled in non-trivial ways and getting rid of 
instruction bytes within function bodies at link time is fairly 
non-trivial: it needs excessive meta-information to be effective (e.g. all 
jumps that potentially cross the removed bytes must get relocations).

So you either limit the ways that prologues and epilogues are emitted to 
help the linker (thereby limiting effectiveness of unchanged xlogues) or 
you emit more meta-info than the instruction bytes themselves, bloating 
object files for dubious outcomes.

> It would probably be impossible for calls into shared libraries, since
> the saved registers might change from version to version.

The above scheme could be extended to also allow introducing stubs 
(wrappers) for shared lib functions, handled by the dynamic loader.  But 
then you would get hard problems to solve related to function addresses 
and their uniqueness.

> Still, potential gains could be substantial, and it could have an
> effect which could come close to inlining, while actually saving space
> instead of using extra.
> 
> Comments?

I think it would be an interesting experiment to implement such a scheme 
fully just to see how effective it would be in practice.  But it's very 
non-trivial to do, and my guess is that it won't be super effective.  So, 
could be a typical research paper topic :-)

At least outside of extreme cases like the SSE regs, where none are 
callee-saved, and which can be handled in a different way like the 
explicit attributes.


Ciao,
Michael.


Re: Calling convention for Intel APX extension

2023-07-27 Thread Michael Matz via Gcc
Hey,

On Thu, 27 Jul 2023, Thomas Koenig via Gcc wrote:

> Intel recommends to have the new registers as caller-saved for
> compatibility with current calling conventions.  If I understand this
> correctly, this is required for exception unwinding, but not if the
> function called is __attribute__((nothrow)).

That's not the full truth.  It's not (only) exception handling but also 
context switching via setjmp/longjmp and make/get/setcontext.

The data structures for that are part of the ABI unfortunately, and can't 
be assumed to be extensible (as Florian says, for glibc there maybe be 
hacks (or maybe not) on x86-64.  Some other archs implemented 
extensibility from the outset).  So all registers (and register parts!) 
added after the initial psABI is defined usually _have_ to be 
call-clobbered.

> Since Fortran tends to use a lot of registers for its array descriptors,
> and also tends to call nothrow functions (all Fortran functions, and
> all Fortran intrinsics, such as sin/cos/etc) a lot, it could profit from
> making some of the new registers callee-saved, to save some spills
> at function calls.

I've recently submitted a patch that adds some attributes that basically 
say "these-and-those regs aren't clobbered by this function" (I did them 
for not clobbered xmm8-15).  Something similar could be used for the new 
GPRs as well.  Then it would be a matter of ensuring that the interesting 
functions are marked with that attributes (and then of course do the 
necessary call-save/restore).


Ciao,
Michael.


Re: [WIP RFC] Add support for keyword-based attributes

2023-07-17 Thread Michael Matz via Gcc-patches
Hello,

On Mon, 17 Jul 2023, Richard Sandiford via Gcc-patches wrote:

> >> There are some existing attributes that similarly affect semantics
> >> in ways that cannot be ignored.  vector_size is one obvious example.
> >> But that doesn't make it a good thing. :)
> >...
> > If you had added __arm(bar(args)) instead of __arm_bar(args) you would only
> > need one additional keyword - we could set aside a similar one for each
> > target then.  I realize that double-nesting of arguments might prove a bit
> > challenging but still.
> 
> Yeah, that would work.

So, essentially you want unignorable attributes, right?  Then implement 
exactly that: add one new keyword "__known_attribute__" (invent a better 
name, maybe :) ), semantics exactly as with __attribute__ (including using 
the same underlying lists in our data structures), with only one single 
deviation: instead of the warning you give an error for unhandled 
attributes.  Done.

(Old compilers will barf on the unknown new keyword, new compilers will 
error on unknown values used within such an attribute list)

> > In any case I also think that attributes are what you want and their 
> > ugliness/issues are not worse than the ugliness/issues of the keyword 
> > approach IMHO.
> 
> I guess the ugliness of keywords is the double underscore? What are the 
> issues with the keyword approach though?

There are _always_ problems with new keywords, the more new keywords the 
more problems :-)  Is the keyword context-sensitive or not?  What about 
existing system headers that use it right now?  Is it recognized in 
free-standing or not?  Is it only recognized for some targets?  Is it 
recognized only for certain configurations of the target?

So, let's define one new mechanism, for all targets, all configs, and all 
language standards.  Let's use __attribute__ with a twist :)


Ciao,
Michael.


Re: [x86-64] RFC: Add nosse abi attribute

2023-07-11 Thread Michael Matz via Gcc-patches
Hey,

On Tue, 11 Jul 2023, Alexander Monakov via Gcc-patches wrote:

> > > > * nosseclobber: claims (and ensures) that xmm8-15 aren't clobbered
> > > 
> > > This is the weak/active form; I'd suggest "preserve_high_sse".
> > 
> > But it preserves only the low parts :-)  You swapped the two in your 
> > mind when writing the reply?
> 
> Ahhh. By "high SSE" I mean the high-numbered SSE regs, i.e. xmm8-15, not
> the higher halves of (unspecified subset of) SSE regs.

Ah, gotcha :-)  It just shows that all these names are confusing.  Maybe 
I'll just go with "attribute1" and "attribute2" and rely on docu.  (SCNR)


Ciao,
Michael.


Re: [x86-64] RFC: Add nosse abi attribute

2023-07-11 Thread Michael Matz via Gcc-patches
Hello,

On Mon, 10 Jul 2023, Alexander Monakov wrote:

> I think the main question is why you're going with this (weak) form
> instead of the (strong) form "may only clobber the low XMM regs":

I want to provide both.  One of them allows more arbitrary function 
definitions, the other allows more registers (or parts of them) to be preserved.  I 
feel both have their place.

> as Richi noted, surely for libcalls we'd like to know they preserve
> AVX-512 mask registers as well?

Yeah, mask registers.  I'm still pondering this.  We would need to split 
the 8 maskregs into two parts.  Hmm.

> Note this interacts with anything that interposes between the caller
> and the callee, like the Glibc lazy binding stub (which used to
> zero out high halves of 512-bit arguments in ZMM registers).
> Not an immediate problem for the patch, just something to mind perhaps.

Yeah, needs to be kept in mind indeed.  Anything coming in between the 
caller and a so-marked callee needs to preserve things.

> > I chose to make it possible to write function definitions with that
> > attribute with GCC adding the necessary callee save/restore code in
> > the xlogue itself.
> 
> But you can't trivially restore if the callee is sibcalling — what
> happens then (a testcase might be nice)?

I hoped early on that the generic code that prohibits sibcalls between 
call sites of too "different" ABIs would deal with this, and then forgot 
to check.  Turns out you had a good hunch here, it actually does a 
sibcall, destroying the guarantees.  Thanks! :)

> > Carefully note that this is only possible for the SSE2 registers, as 
> > other parts of them would need instructions that are only optional.
> 
> What is supposed to happen on 32-bit x86 with -msse -mno-sse2?

Hmm.  I feel the best answer here is "that should error out".  I'll add a 
test and adjust patch if necessary.

> > When a function doesn't contain calls to
> > unknown functions we can be a bit more lenient: we can make it so that
> > GCC simply doesn't touch xmm8-15 at all, then no save/restore is
> > necessary.
> 
> What if the source code has a local register variable bound to xmm15,
> i.e. register double x asm("xmm15"); asm("..." : "+x"(x)); ?

Makes a good testcase as well.  My take: it's acceptable with the 
only-sse2-preserved attribute (xmm15 will in this case be saved/restored), 
and should be an error with the everything-preserved attribute (maybe we 
can make an exception as here we only specify an XMM reg, instead of 
larger parts).

> > To that end I introduce actually two related attributes (for naming
> > see below):
> > * nosseclobber: claims (and ensures) that xmm8-15 aren't clobbered
> 
> This is the weak/active form; I'd suggest "preserve_high_sse".

But it preserves only the low parts :-)  You swapped the two in your 
mind when writing the reply?

> > I would welcome any comments, about the names, the approach, the attempt
> > at documenting the intricacies of these attributes and anything.
> 
> I hope the new attributes are supposed to be usable with function 
> pointers? From the code it looks that way, but the documentation doesn't 
> promise that.

Yes, like all ABI-influencing attributes they _have_ to be part of the 
function's type (and hence transfer to function pointers), with appropriate 
incompatible-conversion errors and warnings at the appropriate places.  (I 
know that this isn't always the way we're dealing with ABI-influencing 
attributes, and often refer to a decl only.  All those are actual bugs.)

And yes, I will adjust the docu to be explicit about this.


Ciao,
Michael.


Re: [x86-64] RFC: Add nosse abi attribute

2023-07-11 Thread Michael Matz via Gcc-patches
Hello,

On Tue, 11 Jul 2023, Jan Hubicka wrote:

> > > > When a function doesn't contain calls to
> > > > unknown functions we can be a bit more lenient: we can make it so that
> > > > GCC simply doesn't touch xmm8-15 at all, then no save/restore is
> > > > necessary.
> 
> One may also take into account that first 8 registers are cheaper to
> encode than the later 8, so perhaps we may want to choose range that
> contains both.

There is actually none in the low range that's usable.  xmm0/1 are used 
for return values and xmm2-7 are used for argument passing.  Arguments are 
by default callee clobbered, and we do not want to change this (or limit 
the number of register arguments for the alternate ABI).


Ciao,
Michael.


[x86-64] RFC: Add nosse abi attribute

2023-07-10 Thread Michael Matz via Gcc-patches
Hello,

the ELF psABI for x86-64 doesn't have any callee-saved SSE
registers (there were actual reasons for that, but those don't
matter anymore).  This starts to hurt some uses, as it means that
as soon as you have a call (say to memmove/memcpy, even if
implicit as libcall) in a loop that manipulates floating point
or vector data you get saves/restores around those calls.

But in reality many functions can be written such that they only need
to clobber a subset of the 16 XMM registers (or do the save/restore
themselves in the codepaths that need them, hello memcpy again).
So we want to introduce a way to specify this, via an ABI attribute
that basically says "doesn't clobber the high XMM regs".

I've opted to do only the obvious: do something special only for
xmm8 to xmm15, without a way to specify the clobber set in more detail.
I think such a half/half split is reasonable, and as I don't want to
change the argument passing anyway (whose regs are always clobbered)
there isn't that much wiggle room.

I chose to make it possible to write function definitions with that
attribute with GCC adding the necessary callee save/restore code in
the xlogue itself.  Carefully note that this is only possible for
the SSE2 registers, as other parts of them would need instructions
that are only optional.  When a function doesn't contain calls to
unknown functions we can be a bit more lenient: we can make it so that
GCC simply doesn't touch xmm8-15 at all, then no save/restore is
necessary.  If a function contains calls then GCC can't know which
parts of the XMM regset are clobbered by that, it may be parts
which don't even exist yet (say until avx2048 comes out), so we must
restrict ourselves to only save/restore the SSE2 parts and then of course
can only claim to not clobber those parts.

To that end I introduce actually two related attributes (for naming
see below):
* nosseclobber: claims (and ensures) that xmm8-15 aren't clobbered
* noanysseclobber: claims (and ensures) that nothing of any of the
  registers overlapping xmm8-15 is clobbered (not even future, as of
  yet unknown, parts)

Ensuring the first is simple: potentially add saves/restores in the xlogue
(e.g. when xmm8 is either used explicitly or implicitly by a call).
Ensuring the second comes with more: we must also ensure that no
functions are called that don't guarantee the same thing (in addition
to just removing all xmm8-15 parts altogether from the available
registers).

See also the added testcases for what I intended to support.
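
A sketch of the intended use, with the placeholder attribute names from 
this mail (illustrative only; the helper function here is made up):

/* Declared to not clobber xmm8-15; GCC then doesn't have to save/restore
   those registers around the call in the loop below.  */
extern void copy_small (void *dst, const void *src, unsigned long n)
  __attribute__((nosseclobber));

void scale_and_copy (double *a, const double *b, unsigned long n)
{
  for (unsigned long i = 0; i < n; i++)
    {
      a[i] = a[i] * b[i] + 1.0;     /* FP work that wants xmm regs live */
      copy_small (a + i, b + i, sizeof (double));
    }
}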

I chose to use the new target-independent function-abi facility for
this.  I need some adjustments in generic code:
* the "default_abi" is actually more like a "current" abi: it happily
  changes its contents according to conditional_register_usage,
  and other code assumes that such changes do propagate.
  But if that conditional_reg_usage is actually done because the current
  function is of a different ABI, then we must not change default_abi.
* in insn_callee_abi we do look at a potential fndecl for a call
  insn (only set when -fipa-ra), but that doesn't work for calls through
  pointers and (as said) is optional.  So, also always look at the
  called function's type (it's always recorded in the MEM_EXPR for
  non-libcalls), before asking the target.
  (The function-abi accessors working on trees were already doing that,
  it's just the RTL accessor that missed this)

Accordingly I also implement some more target hooks for function-abi.
With that it's possible to also move the other ABI-influencing code
of i386 to function-abi (ms_abi and friends).  I have not done so for
this patch.

Regarding the names of the attributes: gah!  I've left them at
my mediocre attempts of names in order to hopefully get input on better
names :-)

I would welcome any comments, about the names, the approach, the attempt
at documenting the intricacies of these attributes and anything.

FWIW, this particular patch was regstrapped on x86-64-linux
with trunk from a week ago (and sniff-tested on current trunk).


Ciao,
Michael.

diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
index 37cb5a0dcc4..92358f4ac41 100644
--- a/gcc/config/i386/i386-options.cc
+++ b/gcc/config/i386/i386-options.cc
@@ -3244,6 +3244,16 @@ ix86_set_indirect_branch_type (tree fndecl)
 }
 }
 
+unsigned
+ix86_fntype_to_abi_id (const_tree fntype)
+{
+  if (lookup_attribute ("nosseclobber", TYPE_ATTRIBUTES (fntype)))
+return ABI_LESS_SSE;
+  if (lookup_attribute ("noanysseclobber", TYPE_ATTRIBUTES (fntype)))
+return ABI_NO_SSE;
+  return ABI_DEFAULT;
+}
+
 /* Establish appropriate back-end context for processing the function
FNDECL.  The argument might be NULL to indicate processing at top
level, outside of any function scope.  */
@@ -3311,6 +3321,12 @@ ix86_set_current_function (tree fndecl)
   else
TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
 }
+
+  unsigned prev_abi_id = 0;
+  if 

Re: [PATCH] Basic asm blocks should always be volatile

2023-06-29 Thread Michael Matz via Gcc
Hello,

On Thu, 29 Jun 2023, Julian Waters via Gcc wrote:

> int main() {
> asm ("nop" "\n"
>  "\t" "nop" "\n"
>  "\t" "nop");
> 
> asm volatile ("nop" "\n"
>   "\t" "nop" "\n"
>   "\t" "nop");
> }
> 
> objdump --disassemble-all -M intel -M intel-mnemonic a.exe > disassembly.txt
> 
> 0001400028c0 <main>:
>1400028c0: 48 83 ec 28    sub    rsp,0x28
>1400028c4: e8 37 ec ff ff call   140001500 <__main>
>1400028c9: 90             nop
>1400028ca: 90             nop
>1400028cb: 90             nop
>1400028cc: 31 c0          xor    eax,eax
>1400028cd: 48 83 c4 28    add    rsp,0x28
>1400028ce: c3             ret
> 
> Note how there are only 3 nops above when there should be 6, as the first 3
> have been deleted by the compiler. With the patch, the correct 6 nops
> instead of 3 are compiled into the final code.
> 
> Of course, the above was compiled with the most extreme optimizations
> available to stress test the possible bug, -O3, -ffunction-sections
> -fdata-sections -Wl,--gc-sections -flto=auto. Compiled as C++17 and intel
> assembly syntax

Works just fine here:

% cat xx.c
int main() {
asm ("nop" "\n"
 "\t" "nop" "\n"
 "\t" "nop");

asm volatile ("nop" "\n"
  "\t" "nop" "\n"
  "\t" "nop");
}

% g++ -v
...
gcc version 12.2.1 20230124 [revision 
193f7e62815b4089dfaed4c2bd34fd4f10209e27] (SUSE Linux)

% g++ -std=c++17 -flto=auto -O3 -ffunction-sections -fdata-sections xx.c
% objdump --disassemble-all -M intel -M intel-mnemonic a.out
...
00401020 <main>:
  401020:   90  nop
  401021:   90  nop
  401022:   90  nop
  401023:   90  nop
  401024:   90  nop
  401025:   90  nop
  401026:   31 c0   xor    eax,eax
  401028:   c3  ret
  401029:   0f 1f 80 00 00 00 00    nop    DWORD PTR [rax+0x0]
...

Testing with recent trunk works as well with no differences in output.
This is for x86_64-linux.

So, as suspected, something else is broken for you.  Which compiler 
version are you using?  (And we need to check if it's something in the 
mingw target)


Ciao,
Michael.


Re: types in GIMPLE IR

2023-06-29 Thread Michael Matz via Gcc
Hello,

On Thu, 29 Jun 2023, Krister Walfridsson wrote:

> > The thing with signed bools is that the two relevant values are -1 (true)
> > and 0 (false), those are used for vector bool components where we also
> > need them to be of wider type (32bits in this case).
> 
> My main confusion comes from seeing IR doing arithmetic such as
> 
>   <signed-boolean:32> _127;
>   <signed-boolean:32> _169;
>   ...
>   _169 = _127 + -1;
> 
> or
> 
>   <signed-boolean:32> _127;
>   <signed-boolean:32> _169;
>   ...
>   _169 = -_127;
> 
> and it was unclear to me what kind of arithmetic is allowed.
> 
> I have now verified that all cases seems to be just one operation of this form
> (where _127 has the value 0 or 1), so it cannot construct values such as 42.
> But the wide signed Boolean can have the three different values 1, 0, and -1,
> which I still think is at least one too many. :)

It definitely is.  For signed bool it should be -1 and 0, for unsigned 
bool 1 and 0.  And of course, arithmetic on bools is always dubious, that 
should all be logical operations.  Modulo-arithmetic (mod 2) could be 
made to work, but then we would have to give up the idea of signed bools 
and always use conversions to signed int to get a bitmask of all-ones.  
And as mod-2-arithmetic is equivalent to logical ops it seems a bit futile 
to go that way.

Of course, enforcing this all might lead to a surprising heap of errors, 
but one has to start somewhere, so ...

> I'll update my tool to complain if the value is outside the range [-1, 
> 1].

... maybe not do that, or at least make it optional, so that someday 
someone can look into fixing that all up? :-)  -fdubious-bools?


Ciao,
Michael.


Re: types in GIMPLE IR

2023-06-28 Thread Michael Matz via Gcc
Hello,

On Wed, 28 Jun 2023, Krister Walfridsson via Gcc wrote:

> Type safety
> ---
> Some transformations treat 1-bit types as a synonym of _Bool and mix the types
> in expressions, such as:
> 
>_2;
>   _Bool _3;
>   _Bool _4;
>   ...
>   _4 = _2 ^ _3;
> 
> and similarly mixing _Bool and enum
> 
>   enum E:bool { E0, E1 };
> 
> in one operation.
> 
> I had expected this to be invalid... What are the type safety rules in the
> GIMPLE IR?

Type safety in gimple is defined in terms of type compatibility, with 
_many_ exceptions for specific types of statements.  Generally stuff is 
verified in verify_gimple_seq, in this case of a binary assign statement 
in verify_gimple_assign_binary.  As you can see there, the normal rules for 
bread-and-butter binary assigns are simply that the LHS, RHS1 and RHS2 must 
all be type-compatible.

T1 and T2 are compatible if conversions from T1 to T2 are useless and 
conversions from T2 to T1 are also useless.  (types_compatible_p)  The meat 
for that is all in gimple-expr.cc:useless_type_conversion_p.  For this 
specific case again we have:

  /* Preserve conversions to/from BOOLEAN_TYPE if types are not
 of precision one.  */
  if (((TREE_CODE (inner_type) == BOOLEAN_TYPE)
   != (TREE_CODE (outer_type) == BOOLEAN_TYPE))
  && TYPE_PRECISION (outer_type) != 1)
return false;

So, yes, booleans and 1-bit types can be compatible (under certain other 
conditions, see the function).
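
For illustration, one way such mixed _Bool / 1-bit-type IR can arise from 
plain C source (whether a given GCC version produces exactly the GIMPLE 
quoted above from this is not guaranteed):

struct S { unsigned f : 1; };

_Bool g (struct S s, _Bool b)
{
  return s.f ^ b;   /* xor of a 1-bit bit-field with a _Bool */
}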

> Somewhat related, gcc.c-torture/compile/pr96796.c performs a VIEW_CONVERT_EXPR
> from
> 
>   struct S1 {
> long f3;
> char f4;
>   } g_3_4;
> 
> to an int
> 
>   p_51_9 = VIEW_CONVERT_EXPR<int>(g_3_4);
> 
> That must be wrong?

VIEW_CONVERT_EXPR is _very_ generous.  See 
verify_types_in_gimple_reference: 

  if (TREE_CODE (expr) == VIEW_CONVERT_EXPR)
{
  /* For VIEW_CONVERT_EXPRs which are allowed here too, we only check
 that their operand is not a register an invariant when
 requiring an lvalue (this usually means there is a SRA or IPA-SRA
 bug).  Otherwise there is nothing to verify, gross mismatches at
 most invoke undefined behavior.  */
  if (require_lvalue
  && (is_gimple_reg (op) || is_gimple_min_invariant (op)))
{
  error ("conversion of %qs on the left hand side of %qs",
 get_tree_code_name (TREE_CODE (op)), code_name);
  debug_generic_stmt (expr);
  return true;
}
  else if (is_gimple_reg (op)
   && TYPE_SIZE (TREE_TYPE (expr)) != TYPE_SIZE (TREE_TYPE 
(op)))
{
  error ("conversion of register to a different size in %qs",
 code_name);
  debug_generic_stmt (expr);
  return true;
}
}

Here the operand is not a register (but a global memory object), so 
everything goes.

It should be said that over the years gimple's type system became stricter 
and stricter, but it started as mostly everything-goes, so making it 
stricter is a bumpy road that isn't fully travelled yet, because checking 
types often results in testcase regressions :-)

> Semantics of 
> 
> "Wide" Booleans, such as , seems to allow more values than
> 0 and 1. For example, I've seen some IR operations like:
> 
>   _66 = _16 ? _Literal (<signed-boolean:32>) -1 : 0;
> 
> But I guess there must be some semantic difference between 
>  and a 32-bit int, otherwise the wide Boolean type 
> would not be needed... So what are the difference?

See above, normally conversions to booleans that are wider than 1 bit are 
_not_ useless (because they require booleanization to true/false).  In the 
above case the not-useless cast is within a COND_EXPR, so it's quite 
possible that the gimplifier didn't look hard enough to split this out 
into a proper conversion statement.  (The verifier doesn't look inside 
the expressions of the COND_EXPR, so also doesn't catch this one)

If that turns out to be true and the above still happens when -1 is 
replaced by (say) 42, then it might be possible to construct a 
wrong-code testcase based on the fact that _66 as boolean should contain 
only two observable values (true/false), but could then contain 42.  OTOH, 
it might also not be possible to create such testcase, namely when GCC is 
internally too conservative in handling wide bools :-)  In that case we 
probably have a missed optimization somewhere, which when implemented 
would enable construction of such wrong-code testcase ;)

(I'm saying that -1 should be replaced by something else for a wrong-code 
testcase, because -1 is special and could justifiably be special-cased in 
GCC: -1 is the proper arithmetic value for a signed boolean that is 
"true").


Ciao,
Michael.


Re: [PATCH] Basic asm blocks should always be volatile

2023-06-28 Thread Michael Matz via Gcc
Hello,

On Wed, 28 Jun 2023, Julian Waters via Gcc wrote:

> On the contrary, code compiled with gcc with or without the applied patch
> operates very differently, with only gcc with the applied patch producing a
> fully correctly operating program as expected. Even if the above inline
> assembly blocks never worked due to other optimizations however, the
> failure mode of the program would be very different from how it fails now:
> It should fail noisily with an access violation exception due to
> the malformed assembly, but instead all the assembly which installs an
> exception handler never makes it into the final program because with
> anything higher than -O1 gcc deletes all of it (I have verified this with
> objdump too),

Can you please provide a _full_ compilable testcase (preprocessed)?  What 
Andrew says is (supposed to be) correct: ignoring the other 
problems you're going to see with your asms (even if you make them 
volatile), GCC should not remove any of those asm statements.

If something changes when you add 'volatile' by hand then we have another 
problem lurking somewhere, and adding the parser patch might not fully 
solve it (even if it changes behaviour for you).


Ciao,
Michael.


Re: [gimple-ssa] Get the gimple corresponding to the definition of a VAR_DECL

2023-06-27 Thread Michael Matz via Gcc
Hello,

On Tue, 27 Jun 2023, Pierrick Philippe wrote:

> My main problem is regarding uninitialized definition, but still not being
> considered undefined behavior.
> For example, the following code:
> 
>int x;
>int *y = 
>*y = 6;
> 
> What I'm doing is basically looking at each gimple statement if the lhs has a
> given attribute for the purpose of the analysis I'm trying to perform.
> To precise, I'm plugged into the analyzer, so an out-of-tree plugin.
> 
> But in the example above, the definition of x is not really within the
> gimple_seq of the function as it is never directly assigned.

Then you need to be a bit more precise in what exactly you want.  There 
are multiple ways to interpret "definition of a variable".

a) assigning a value to it: that's what Richard alluded to, you need to 
   iterate all gimple statements to see if any assign to variables you're 
   interested in (in SSA form there will only be one, outside SSA there 
   may be many).  As you notice there also may be none at all that 
   directly assign a value.  You need to solve the associated data-flow 
   problem in order to (conservatively) know the answer to this question.
   In particular you need points-to sets (above for instance, that 'y' 
   points to 'x', so that when you modify '*y' you can note down that 
   "whatever y points to (i.e. at least x) is modified").

   There is no ready-made list of statements that potentially modify a 
   local variable in question.  You need to do that yourself, but GCC 
   contains many helper routines for parts of this problem (as it needs to 
   answer these questions itself as well, for optimization purposes).

b) allocating storage for the variable in question (and possibly giving it 
   an initial value): there is _no_ gimple instruction that expresses this 
   idea.  The very fact that variables are mentioned in local_decls (and 
   are used in a meaningful way in the instruction stream) leads to them
   being allocated on the stack during function expansion (see 
   expand_used_vars).

Non-local variables are handled similarly: they are placed in various 
lists that lead to appropriate assembler statements allocating static 
storage for them (in the data or bss, or whatever other appropriate 
segment).  They aren't defined (in the allocate-it sense) by a gimple 
statement either.
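
A rough sketch of option a) in plugin context (header set and details are 
approximate; the points-to/aliasing part described above still has to be 
layered on top):

#include "gcc-plugin.h"
#include "tree.h"
#include "gimple.h"
#include "gimple-iterator.h"

/* Walk all statements of FUN and look at each assignment's LHS; this
   only catches *direct* assignments to a VAR_DECL.  */
static void
scan_direct_defs (struct function *fun)
{
  basic_block bb;
  FOR_EACH_BB_FN (bb, fun)
    for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
         !gsi_end_p (gsi); gsi_next (&gsi))
      {
        gimple *stmt = gsi_stmt (gsi);
        if (!is_gimple_assign (stmt))
          continue;
        tree lhs = gimple_assign_lhs (stmt);
        if (lhs && TREE_CODE (lhs) == VAR_DECL)
          {
            /* 'lhs' is directly assigned here; e.g. inspect
               DECL_ATTRIBUTES (lhs) with lookup_attribute.  */
          }
      }
}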


Ciao,
Michael.


Re: Different ASM for ReLU function between GCC11 and GCC12

2023-06-20 Thread Michael Matz via Gcc
Hello,

On Tue, 20 Jun 2023, Jakub Jelinek via Gcc wrote:

> ce1 pass results in emit_conditional_move with
> (gt (reg/v:SF 83 [ x ]) (reg:SF 84)), (reg/v:SF 83 [ x ]), (reg:SF 84)
> operands in the GCC 11 case and so is successfully matched by
> ix86_expand_fp_movcc as ix86_expand_sse_fp_minmax.
> But, in GCC 12+, emit_conditional_move is called with
> (gt (reg/v:SF 83 [ x ]) (reg:SF 84)), (reg/v:SF 83 [ x ]), (const_double:SF 
> 0.0 [0x0.0p+0])
> instead (reg:SF 84 in both cases contains the (const_double:SF 0.0 [0x0.0p+0])
> value, in the GCC 11 case loaded from memory, in the GCC 12+ case set
> directly in a move.  The reason for the difference is exactly that
> because cheap SSE constant can be moved directly into a reg, it is done so
> instead of reusing a pseudo that contains that value already.

But reg84 is _also_ used as an operand of emit_conditional_move, so there's 
no reason not to also use it as the third operand.  It seems more canonical to 
call a function with

  X-containing-B, A, B

than with

  X-containing-B, A, something-equal-to-B-but-not-B

so either the (const_double) RTL should be used in both, or reg84 should, 
but not a mix.  Exactly so to ...

> actually a minmax.  Even if it allowed the cheap SSE constants,
> it wouldn't know that r84 is also zero (unless the expander checks that
> it is a pseudo with a single setter and verifies it or something similar).

... not have this problem.


Ciao,
Michael.


Re: gcc tricore porting

2023-06-19 Thread Michael Matz via Gcc
Hello,

note that I know next to nothing about Tricore in particular, so take 
everything with grains of salt.  Anyway:

On Mon, 19 Jun 2023, Claudio Eterno wrote:

> in your reply you mentioned "DSP". Do you want to use the DSP instructions
> for final assembly?

It's not a matter of me wanting or not wanting, I have no stake in 
tricore.  From a 20-second look at the Infineon architecture overview I've 
linked, it looked like some DSP instructions could be used for 
implementing normal floating point support, which of course would be 
desirable in a compiler supporting all of C (otherwise you'd have to 
resort to softfloat emulation).  But I have no idea if the CPU and the DSP 
parts are interconnected enough (hardware wise) to make that feasible (or 
even required, maybe the CPU supports floating point itself already?).

> Michael, based on your experience, how much time is necessary to release
> this porting?

Depending on experience in compilers in general and GCC in particular: 
anything between a couple weeks (fulltime) and a year.

> And.. have you any idea about where to start?

If you don't have an assembler and linker already, then start with that.  An 
assembler/linker is not part of GCC, but it relies on one.  So look at 
binutils for this.

Once binutils are taken care of: Richi's suggestion is a good one: start 
with an existing port of a target with similar features as you intend to 
implement, and modify it according to your needs.  After that works (say, 
you can compile a hello-world successfully): throw it away and restart a 
completely new target from scratch with everything you learned until then.  
(This is so that you don't start with all the cruft that the target you 
used as baseline comes with).

It helps if you already have a toolchain that you can work against, but 
it's not required.

You need to be familiar with some GCC internals, and the documentation 
coming with GCC is a good starting point: 
  https://gcc.gnu.org/onlinedocs/gccint/
(the "Machine Description" chapter will be the important one, but for that 
you need to read a couple other chapters as well)

There are a couple online resources about writing new targets for GCC.  
Stackoverflow refers to some.  E.g. 
  
https://stackoverflow.com/questions/44904644/gcc-how-to-add-support-to-a-new-architecture
refers to https://kristerw.blogspot.com/2017/08/writing-gcc-backend_4.html 
which is something not too old.  For concrete questions this mailing list 
is a good place to ask.


Good luck,
Michael.

> 
> Ciao
> Claudio
> 
> Il giorno lun 19 giu 2023 alle ore 16:16 Michael Matz  ha
> scritto:
> 
> > Hello,
> >
> > On Mon, 19 Jun 2023, Richard Biener via Gcc wrote:
> >
> > > On Sun, Jun 18, 2023 at 12:00 PM Claudio Eterno via Gcc 
> > wrote:
> > > >
> > > > Hi, this is my first time with open source development. I worked in
> > > > automotive for 22 years and we (generally) were using tricore series
> > for
> > > > these products. GCC doesn't compile on that platform. I left my work
> > some
> > > > days ago and so I'll have some spare time in the next few months. I
> > would
> > > > like to know how difficult it is to port the tricore platform on gcc
> > and if
> > > > during this process somebody can support me as tutor and... also if
> > the gcc
> > > > team is interested in this item...
> > >
> > > We welcome ports to new architectures.  Quick googling doesn't find me
> > > something like an ISA specification though so it's difficult to assess
> > the
> > > complexity of porting to that architecture.
> >
> > https://en.wikipedia.org/wiki/Infineon_TriCore
> >
> > https://www.infineon.com/dgdl/TC1_3_ArchOverview_1.pdf?fileId=db3a304312bae05f0112be86204c0111
> >
> > CPU part looks like fairly regular 32bit RISC.  DSP part seems quite
> > normal as well.  There even was once a GCC port to Tricore, version 3.3
> > from HighTec (now part of Infineon itself), but not even the wayback
> > machine has the files for that anymore:
> >
> >
> > https://web.archive.org/web/20150205040416/http://www.hightec-rt.com:80/en/downloads/sources.html
> >
> > Given the age of that port it's probably better to start from scratch
> > anyway :)  (the current stuff from them/Infineon doesn't seem to be
> > GCC-based anymore?)
> >
> >
> > Ciao,
> > Michael.
> 
> 
> 
> 


Re: gcc tricore porting

2023-06-19 Thread Michael Matz via Gcc
Hello,

On Mon, 19 Jun 2023, Richard Biener via Gcc wrote:

> On Sun, Jun 18, 2023 at 12:00 PM Claudio Eterno via Gcc  
> wrote:
> >
> > Hi, this is my first time with open source development. I worked in
> > automotive for 22 years and we (generally) were using tricore series for
> > these products. GCC doesn't compile on that platform. I left my work some
> > days ago and so I'll have some spare time in the next few months. I would
> > like to know how difficult it is to port the tricore platform on gcc and if
> > during this process somebody can support me as tutor and... also if the gcc
> > team is interested in this item...
> 
> We welcome ports to new architectures.  Quick googling doesn't find me
> something like an ISA specification though so it's difficult to assess the
> complexity of porting to that architecture.

https://en.wikipedia.org/wiki/Infineon_TriCore
https://www.infineon.com/dgdl/TC1_3_ArchOverview_1.pdf?fileId=db3a304312bae05f0112be86204c0111

CPU part looks like fairly regular 32bit RISC.  DSP part seems quite 
normal as well.  There even was once a GCC port to Tricore, version 3.3 
from HighTec (now part of Infineon itself), but not even the wayback 
machine has the files for that anymore:

https://web.archive.org/web/20150205040416/http://www.hightec-rt.com:80/en/downloads/sources.html

Given the age of that port it's probably better to start from scratch 
anyway :)  (the current stuff from them/Infineon doesn't seem to be 
GCC-based anymore?)


Ciao,
Michael.


Re: Passing the complex args in the GPR's

2023-06-07 Thread Michael Matz via Gcc
Hey,

On Tue, 6 Jun 2023, Umesh Kalappa via Gcc wrote:

> Question is : Why does GCC choose to use GPR's here and have any
> reference to support this decision  ?

You explicitly used -m32 ppc, so 
https://www.polyomino.org.uk/publications/2011/Power-Arch-32-bit-ABI-supp-1.0-Unified.pdf
 
applies.  It explicitly states in "B.1 ATR-Linux Inclusion and 
Conformance" that it is "ATR-PASS-COMPLEX-IN-GPRS", and other sections 
detail what that means (namely passing complex args in r3 .. r10, whatever 
fits).  GCC adheres to that, and has to.

The history of how that came to be was explained in the thread.
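
Illustrative only: compiled for powerpc-linux with -m32, a _Complex double 
argument then arrives in GPRs (r3..r6 for the 16 bytes here) rather than 
in FPRs, exactly as ATR-PASS-COMPLEX-IN-GPRS prescribes:

double real_part (_Complex double z)
{
  return __real__ z;
}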


Ciao,
Michael.

 > 
> Thank you
> ~Umesh
> 
> 
> 
> On Tue, Jun 6, 2023 at 10:16 PM Segher Boessenkool
>  wrote:
> >
> > Hi!
> >
> > On Tue, Jun 06, 2023 at 08:35:22PM +0530, Umesh Kalappa wrote:
> > > Hi Adnrew,
> > > Thank you for the quick response and for PPC64 too ,we do have
> > > mismatches in ABI b/w complex operations like
> > > https://godbolt.org/z/bjsYovx4c .
> > >
> > > Any reason why GCC chose to use GPR 's here ?
> >
> > What did you expect, what happened instead?  Why did you expect that,
> > and why then is it an error what did happen?
> >
> > You used -O0.  As long as the code works, all is fine.  But unoptimised
> > code frequently is hard to read, please use -O2 instead?
> >
> > As Andrew says, why did you use -m32 for GCC but -m64 for LLVM?  It is
> > hard to compare those at all!  32-bit PowerPC Linux ABI (based on 32-bit
> > PowerPC ELF ABI from 1995, BE version) vs. 64-bit ELFv2 ABI from 2015
> > (LE version).
> >
> >
> > Segher
> 


Re: More C type errors by default for GCC 14

2023-05-15 Thread Michael Matz via Gcc
Hello,

On Fri, 12 May 2023, Florian Weimer wrote:

> * Alexander Monakov:
> 
> > This is not valid (constraint violation):
> >
> >   unsigned x;
> >
> >   int *p = 
> >
> > In GCC this is diagnosed under -Wpointer-sign:
> >
> >   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=25892
> 
> Thanks for the reference.  I filed:
> 
>   -Wpointer-sign must be enabled by default
>   

Can you please not extend the scope of your proposals in this thread but 
create a new one?

(FWIW: no, this should not be an error, a warning is fine, and I actually 
think the current state of it not being in -Wall is the right thing as 
well)


Ciao,
Michael.


Re: More C type errors by default for GCC 14

2023-05-15 Thread Michael Matz via Gcc
Hello,

On Fri, 12 May 2023, Jakub Jelinek via Gcc wrote:

> On Fri, May 12, 2023 at 11:33:01AM +0200, Martin Jambor wrote:
> > > One fairly big GCC-internal task is to clear up the C test suite so that
> > > it passes with the new compiler defaults.  I already have an offer of
> > > help for that, so I think we can complete this work in a reasonable time
> > > frame.
> 
> I'd prefer to keep at least significant portion of those tests as is with
> -fpermissive added (plus of course we need new tests that verify the errors
> are emitted), so that we have some testsuite coverage for those.

Yes, this!  Try to (!) never change committed testcase sources, however 
invalid they may be (changing how they are compiled, including by 
introducing new dg-directives and using them in comments, is of course 
okay).

(And FWIW: I'm fine with Florian's proposal.  I personally think the 
arguments for upgrading the warnings to errors are _not_ strong enough, 
but I don't think so very strongly :) )


Ciao,
Michael.


Re: [PATCH] Add targetm.libm_function_max_error

2023-04-27 Thread Michael Matz via Gcc-patches
Hello,

On Thu, 27 Apr 2023, Jakub Jelinek wrote:

> The first really large error I see is for sinl with
> x/2gx 
> 0x748160ed90d9425b0xefd8b811d6293294
> i.e.
> 1.5926552660973502228303666578452949e+253
> with most significant double being
> 1.5926552660973502e+253
> and low double
> -5.9963639272208416e+230

Such large numbers will always be a problem with the range reduction step 
in sin/cos.  With double-double the possible mantissa length can be much 
larger than 106 bits, and the range reduction needs to be precise to at 
least that many bits to give anything sensible.

Realistically I think you can only expect reasonably exact results for 
double-double on operations that require range-reductions for
(a) "smallish" values.  Where the low double is (say) <= 2^16 * PI, or
(b) where the low double is consecutive to the high double, i.e. the
overall mantissa size (including the implicit zeros in the middle) is 
less than 107 (or whatever the current precision for the 
range-reduction step on large values is)

> given is
> -0.4025472157704263326278375983156912
> and expected (mpfr computed)
> -0.46994008859023245970759964236618727
> But if I try on x86_64:
> #define _GNU_SOURCE
> #include <math.h>
> 
> int
> main ()
> {
>   _Float128 f, f2, f3, f4;
>   double d, d2;
>   f = 1.5926552660973502228303666578452949e+253f128;
>   d = 1.5926552660973502e+253;
>   f2 = d;
>   f2 += -5.9963639272208416e+230;
>   f3 = sinf128 (f);
>   f4 = sinf128 (f2);
>   d2 = sin (d);
>   return 0;
> }
> where I think f2 is what matches most closely the 106 bit precision value,
> (gdb) p f
> $7 = 1.5926552660973502228303666578452949e+253
> (gdb) p f2
> $8 = 1.59265526609735022283036665784527174e+253
> (gdb) p f3
> $9 = -0.277062522218693980443596385112227247
> (gdb) p f4
> $10 = -0.402547215770426332627837598315693221
> and f4 is much closer to the given than to expected.

Sure, but that's because f2 is only "close" to the double-double exact 
value of (1.5926552660973502e+253 + -5.9963639272208416e+230) relatively, 
not absolutely.  As you already wrote, the mantissa needed to represent the 
exact value (which double-double and mpfr can!) is larger than 106 bits.  The 
necessary rounding error when representing it in f128 is small in ULPs, but 
very, very large in magnitude.  Large magnitude changes of the input value to 
sin/cos essentially put it into completely different quadrants and 
positions within those quadrants, and hence the result under such rounding 
of the input can be _wildly_ off.

E.g. imagine a double-double representing (2^107 * PI + PI/2) exactly 
(assume PI is the 53-bit representation of pi, that's why I say 
"exactly").  The correct result of sin() on that is 1.  The result on the 
nearest f128 input value (2^107 * PI) will be 0.  So you really can't 
compare f128 arithmetic with double-double one when the mantissas are too 
far away.
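
(Untested illustration of just how sensitive that is: the little program 
below evaluates sin() at that huge high double and at the directly adjacent 
representable double.  They differ by one ULP, which at that magnitude is 
about 2^789 in absolute terms, so modulo 2*pi they land in unrelated places 
and the two printed values will normally have nothing to do with each other; 
the exact output depends on the libm used.)

#include <math.h>
#include <stdio.h>

int main (void)
{
  double x = 1.5926552660973502e+253;   /* the high double from above */
  double y = nextafter (x, INFINITY);   /* the adjacent double, one ULP away */

  printf ("sin(x) = %.17g\n", sin (x));
  printf ("sin(y) = %.17g\n", sin (y));
  return 0;
}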

So, maybe you want to only let your tester test "good" double-double 
values, i.e. those that can be interpreted as an about-106-bit number where 
(high-exp - low-exp) <= about 53.

(Or just not care about the similarities of cos() on double-double to a 
random number generator :) )
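
A rough, untested sketch of such a filter (comparing the binary exponents 
of the two halves via frexp; zeros, infinities and NaNs would need extra 
care):

#include <math.h>

/* Accept a double-double (hi, lo) only if the two halves are close enough
   that the value fits into roughly 106 contiguous mantissa bits.  */
int
good_double_double_p (double hi, double lo)
{
  int ehi, elo;

  if (lo == 0.0)
    return 1;
  frexp (hi, &ehi);
  frexp (lo, &elo);
  return ehi - elo <= 53;
}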


Ciao,
Michael.


Re: [PATCH] Add targetm.libm_function_max_error

2023-04-26 Thread Michael Matz via Gcc-patches
Hello,

On Wed, 26 Apr 2023, Jakub Jelinek via Gcc-patches wrote:

> For glibc I've gathered data from:
> 4) using attached ulp-tester.c (how to invoke in file comment; tested
>both x86_64, ppc64, ppc64le 50M pseudo-random values in all 4 rounding
>modes, plus on x86_64 float/double sin/cos using libmvec - see
>attached libmvec-wrapper.c as well)

That ulp-tester.c file as attached here is not testing what you think it 
does.  (1) It doesn't compile as it doesn't #define the TESTS macro in the 
!LIBMVEC_TEST case, and (2) it almost never progresses 'm', the status 
variable used before the random numbers start, to beyond 1: you start with 
nextafter(0.0, 1.0), which is the smallest subnormal number (with a ERANGE 
error, but that's ignored), and you test for equality with THIS_MIN, the 
smallest normal (!) number, until you start incrementing 'm'.

From subnormal smallest to normal smallest takes 1<

Re: MIN/MAX and trapping math and NANs

2023-04-11 Thread Michael Matz via Gcc
Hello,

On Tue, 11 Apr 2023, Richard Biener via Gcc wrote:

> In the case we ever implement conforming FP exception support
> either targets would need to be fixed to mask unexpected exceptions
> or we have to refrain from moving instructions where the target
> implementation may rise exceptions across operations that might
> raise exceptions as originally written in source (and across
> points of FP exception state inspection).
> 
> That said, the effect to the FP exception state according to IEEE
> is still unanswered.

The IEEE 754-2008 predicate here is minNum/maxNum, and those are 
general-computational with floating point result.  That means any sNaN 
input raises-invalid (and delivers-qNaN if masked).  For qNaN input 
there's a special case: the result is the non-qNaN input (normal handling 
would usually require the qNaN to be returned).

Note that this makes minNum/maxNum (and friends) not associative.  Also, 
different languages and different hardware implement fmin/fmax differently 
and sometimes in conflict with 754-2008 (e.g. on SSE2 maxsd isn't 
commutative but maxNum is!).  This can be considered a defect in 754-2008.  
As a result these operations were demoted in 754-2019 and new functions, 
minimumNumber (and friends), were recommended (those propagate a qNaN).
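
The qNaN special case is easy to see from C, whose fmax/fmin treat a single 
NaN argument as "missing data" (i.e. the maxNum/minNum behaviour); sNaN 
behaviour can't be demonstrated this way, since NAN is a quiet NaN:

#include <math.h>
#include <stdio.h>

int main (void)
{
  printf ("%g\n", fmax (NAN, 1.0));   /* one qNaN operand: prints 1 */
  printf ("%g\n", fmax (1.0, NAN));   /* likewise: prints 1 */
  printf ("%g\n", fmax (NAN, NAN));   /* both operands NaN: prints nan */
  return 0;
}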

Of course IEEE standards aren't publicly available and I don't have 
754-2019 (but -2008), so I can't be sure about the _exact_ wording 
regarding minimumNumber, but for background of the min/max woes: 

  https://754r.ucbtest.org/background/minNum_maxNum_Removal_Demotion_v3.pdf

In short: it's not so easy :-)  (it may not be advisable to slavishly 
follow 754-2008 for min/max)

> The NaN handling then possibly allows
> implementation with unordered compare + mask ops.


Ciao,
Michael.


Re: [RFC PATCH] driver: unfilter default library path [PR 104707]

2023-04-06 Thread Michael Matz via Gcc
Hello,

On Thu, 6 Apr 2023, Shiqi Zhang wrote:

> Currently, gcc delibrately filters out default library paths "/lib/" and
> "/usr/lib/", causing some linkers like mold fails to find libraries.

If linkers claim to be a compatible replacement for other linkers then 
they certainly should behave in a similar way.  In this case: look into 
/lib and /usr/lib when something isn't found till then.

> This behavior was introduced at least 31 years ago in the initial
> revision of the git repo, personally I think it's obsolete because:
>  1. The less than 20 bytes of saving is negligible compares to the command
> line argument space of most hosts today.

That's not the issue that is solved by ignoring these paths in the driver 
for %D/%I directives.  The issue is (traditionally) that even if the 
startfiles sit in /usr/lib (say), you don't want to add -L/usr/lib to the 
linker command line, because the user might have added -L/usr/local/lib 
explicitly to her link command, and depending on the order of spec file 
entries the -L/usr/lib could end up in front of it, interfering with the 
expectation of where libraries are found.

Hence: never add something in (essentially) random places that is a default 
fallback anyway.  (Obviously the above problem could be solved in a 
different, more complicated, way.  But this is the way it has been solved 
since about forever.)

If mold doesn't look into {,/usr}/lib{,64} (as appropriate) by default 
then that's the problem of mold.


Ciao,
Michael.


Re: [Tree-SSA] Question from observation, bogus SSA form?

2023-03-17 Thread Michael Matz via Gcc
Hello,

On Fri, 17 Mar 2023, Pierrick Philippe wrote:

> > This means that global variables, volatile variables, aggregates,
> > variables which are not considered aggregates but are nevertheless
> > partially modified (think insertion into a vector) or variables which
> > need to live in memory (most probably because their address was taken)
> > are not put into an SSA form.  It may not be easily possible.
> 
> Alright, I understand, but I honestly find it confusing.

You can write something only into SSA form if you see _all_ assignments to 
the entity in question.  That's not necessarily the case for stuff sitting 
in memory.  Code you may not readily see (or might not be able to 
statically know the behaviour of) might be able to get ahold of it and 
hence change it behind your back or in unknown ways.  Not in your simple 
example (and if you look at it during some later passes in the compiler 
you will see that 'x' will indeed be written into SSA form), but in some 
that are only a little more complex:

int foo (int i) {
  int x, *y = &x;
  x = i;  // #1
  bar(y); // #2
  return x;
}

or

int foo (int i) {
  int x, z, *y = i ? &x : &z;
  x = z = 1;  // #1
  *y = 42;    // #2
  return x;
}

here point #1 is very obviously a definition of x (and z) in both 
examples.  And point #2?  Is it a definition or not?  And if it is, then 
what entity is assigned to?  Think about that for a while and what that 
means for SSA form.

> I mean, aren't they any passes relying on the pure SSA form properties 
> to analyze code? For example to DSE or DCE.

Of course.  They all have to deal with memory in a special way (many by 
not doing things on memory).  Because of the above problems they would 
need to special-case memory no matter what.  (E.g. in GCC memory is dealt 
with via the virtual operands, the '.MEM_x = VDEF<.MEM_y>' and VUSE 
constructs you saw in the dumps, to make dealing with memory in an 
SSA-based compiler at least somewhat natural).


Ciao,
Michael.


Re: [PATCH] lto: pass through -funwind-tables and -fasynchronous-unwind-tables

2023-01-18 Thread Michael Matz via Gcc-patches
Hello,

On Wed, 18 Jan 2023, Jakub Jelinek wrote:

> > > > > Partly OT, what is riscv not defaulting that on as well?  Does it have
> > > > > usable unwind info even without that option, something else?
> > > > 
> > > > The RISC-V ABI does not address this, AFAICS.
> > > 
> > > And neither do many other ABIs, still we default there to
> > > -fasynchronous-unwind-tables because we've decided it is a good idea.
> > 
> > That might or might not be, but in the context of this thread that's 
> > immaterial.  Doing the same as the other archs will then simply hide the 
> > problem on risc-v as well, instead of fixing it.
> 
> Yeah, that is why I've mentioned "Partly OT".  We want this bug to be fixed
> (but the fix is not what has been posted but rather decide what we want to
> ask there; if it is at the end of compilation, whether it is at least one
> function with that flag has been compiled, or all functions have been with
> that flag, something else),

The answer to this should be guided by normal use cases.  The normal use 
case is that a whole input file is compiled with or without 
-funwind-tables, and not that individual functions change this.  So any 
solution in which that use case doesn't work is not a solution.

The purest solution is to emit unwind tables for all functions that 
request them into .eh_frame, and to put those of functions that don't 
request them into .debug_frame (if -g is also on).  If that requires 
enabling unwind tables globally first (i.e. if even just one input function 
requests them) then that is what needs to be done.  (This seems to be the 
current problem: the per-function activation of unwind tables comes too 
late.)

The easier solution might be to make -funwind-tables a globally 
special-cased option for LTO (so that the usual use case above works).  
That would trade one bug for another, but I'd say the current bug is more 
serious than the one that would be introduced.

> and IMHO riscv should switch to
> -fasynchronous-unwind-tables by default.

I don't disagree, as long as that doesn't lead to the bug being ignored :)


Ciao,
Michael.


Re: [PATCH] lto: pass through -funwind-tables and -fasynchronous-unwind-tables

2023-01-18 Thread Michael Matz via Gcc-patches
Hello,

On Wed, 18 Jan 2023, Jakub Jelinek wrote:

> On Wed, Jan 18, 2023 at 04:09:08PM +0100, Andreas Schwab wrote:
> > On Jan 18 2023, Jakub Jelinek wrote:
> > 
> > > Partly OT, what is riscv not defaulting that on as well?  Does it have
> > > usable unwind info even without that option, something else?
> > 
> > The RISC-V ABI does not address this, AFAICS.
> 
> And neither do many other ABIs, still we default there to
> -fasynchronous-unwind-tables because we've decided it is a good idea.

That might or might not be, but in the context of this thread that's 
immaterial.  Doing the same as the other archs will then simply hide the 
problem on risc-v as well, instead of fixing it.


Ciao,
Michael.


Re: [PATCH] lto: pass through -funwind-tables and -fasynchronous-unwind-tables

2023-01-18 Thread Michael Matz via Gcc-patches
On Wed, 18 Jan 2023, Andreas Schwab via Gcc-patches wrote:

> No unwind tables are generated, as if -funwind-tables is ignored.  If
> LTO is disabled, everything works as expected.

On RISC-V, btw (which, unlike i386, aarch64, s390 and rs6000, does not 
effectively always enable -funwind-tables via either backend or 
common/config/$arch/ code, which is the reason why the problem can't be 
seen there).  It's an interaction with -g:

The problem (with cross compiler here, but shouldn't matter):

% riscv64-gcc -g -flto -funwind-tables -fPIC -c hello.c
% riscv64-gcc -shared hello.o
% readelf -wF a.out
... empty .eh_frame ...
Contents of the .debug_frame section:
... content ...

Using a compiler for any of the above archs makes this work (working means 
that the unwind info is placed into .eh_frame, _not_ into .debug_frame).  
Not using -g makes it work.  Adding -funwind-tables to the link command 
makes it work.  Adding -fexceptions to the compile command makes it work.  
Not using LTO makes it work.

So, it's quite clear that the option merging algorithm related to all this 
is somewhat broken: the global (or per-function, or whatever) 
-funwind-tables option from hello.o doesn't make it correctly into the 
output (when -g is there).  Adding -fexceptions makes it work because then 
the functions will have personalities, and on LTO read-in _that_ will 
implicitly enable -funwind-tables again (which should have been enabled 
already by the option read-in).

As written initially the other archs are working because they all have 
various ways of essentially setting flag_unwind_tables always:

i386 via common/config/i386/i386-common.cc
      opts->x_flag_asynchronous_unwind_tables = 2;
  config/i386/i386-options.cc
      if (opts->x_flag_asynchronous_unwind_tables == 2)
        opts->x_flag_unwind_tables
          = opts->x_flag_asynchronous_unwind_tables = 1;

rs6000 via common/config/rs6000/rs6000-common.cc
      #ifdef OBJECT_FORMAT_ELF
        opts->x_flag_asynchronous_unwind_tables = 1;
      #endif
  (which ultimately also enabled flag_unwind_tables)

s390 via common/config/s390/s390-common.cc
opts->x_flag_asynchronous_unwind_tables = 1;

aarch64 via common/config/aarch64/aarch64-common.cc
  #if (TARGET_DEFAULT_ASYNC_UNWIND_TABLES == 1)
{ OPT_LEVELS_ALL, OPT_fasynchronous_unwind_tables, NULL, 1 },
{ OPT_LEVELS_ALL, OPT_funwind_tables, NULL, 1},
  #endif

  (the #define here is set for aarch64*-*-linux* )

So the problem cannot be readily demonstrated on these architectures.  
risc-v has no such code (and doesn't need to).


Ciao,
Michael.


Re: access to include path in front end

2022-12-05 Thread Michael Matz via Gcc
Hey,

On Fri, 2 Dec 2022, James K. Lowden wrote:

> > > 3.  Correct the entries in the default_compilers array.  Currently I
> > > have in cobol/lang-specs.h:
> > > 
> > > {".cob", "@cobol", 0, 0, 0},
> > > {".COB", "@cobol", 0, 0, 0},
> > > {".cbl", "@cobol", 0, 0, 0},
> > > {".CBL", "@cobol", 0, 0, 0},
> > > {"@cobol", 
> > >   "cobol1 %i %(cc1_options) %{!fsyntax-only:%(invoke_as)}", 
> > >   0, 0, 0}, 
> > 
> > It misses %(cpp_unique_options) which was the reason why your -I
> > arguments weren't passed to cobol1.  
> 
> If I understood you correctly, I don't need to modify gcc.cc.  I only
> need to modify cobol/lang-specs.h, which I've done.  But that's
> evidently not all I need to do, because it doesn't seem to work.  
> 
> The last element in the fragment in cobol/lang-specs.h is now: 
> 
> {"@cobol",
>   "cobol1 %i %(cc1_options) %{!fsyntax-only:%(invoke_as)} "
>   "%(cpp_unique_options) ",

%(invoke_as) needs to be last.  What it does is effectively add this to 
the command line (under certain conditions): "-somemoreoptions | as".  
Note the pipe symbol.  As with normal shell commands, the gcc driver 
interprets this as "and now also start the following command, connecting 
stdout of the first to stdin of the second".  So all in all the generated 
cmdline will be somewhat like:

  cobol1 input.cbl -stuff-from-cc1-options | as - -stuff-from-cpp-options

Your cpp_unique_options addition will effectively be options to that 'as' 
command, but you wanted it to be options for cobol1.  So, just switch 
order of elements.
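
I.e. something like this (untested) for the @cobol entry, with 
%(invoke_as) moved to the end:

{"@cobol",
  "cobol1 %i %(cc1_options) %(cpp_unique_options) "
  "%{!fsyntax-only:%(invoke_as)}",
  0, 0, 0},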

> I see the -B and -I options, and others, with their arguments, contained
> in COLLECT_GCC_OPTIONS on lines 9 and 11.  I guess that represents an
> environment string?

Yes.  It's our roundabout way of passing the gcc options, as the user gave 
them, downwards in case collect2 (a wrapper for the linking step; gah, 
don't ask) needs to call gcc itself recursively.  But in the -### (or -v) 
output, if the assembler is invoked in your example (i.e. cobol1 doesn't 
fail for some reason) you should see your -I options being passed to that 
one (wrongly so, of course :) ).


Ciao,
Michael.


Re: access to include path in front end

2022-12-01 Thread Michael Matz via Gcc
Hey,

On Thu, 1 Dec 2022, James K. Lowden wrote:

> > E.g. look in gcc.cc for '@c' (matching the file extension) how that
> > entry uses '%(cpp_unique_options)', and how cpp_unique_options is
> > defined for the specs language:
> > 
> >   INIT_STATIC_SPEC ("cpp_unique_options",   &cpp_unique_options),
> > 
> > and
> > 
> > static const char *cpp_unique_options =
> >   "%{!Q:-quiet} %{nostdinc*} %{C} %{CC} %{v} %@{I*&F*} %{P} %I\
> 
> Please tell me if this looks right and complete to you:
> 
> 1.  Add an element to the static_specs array: 
> 
> INIT_STATIC_SPEC ("cobol_options", &cobol_options),

That, or expand its contents where you'd use '%(cobol_options)' in the 
strings.

> 
> 2.  Define the referenced structure: 
> 
>   static const char *cobol_options =  "%{v} %@{I*&F*}"
>   or just
>   static const char *cobol_options =  "%{v} %@{I*}"
> 
>   because I don't know what -F does, or if I need it.

I.e. as long as it's that short expanding inline would work nicely.

> I'm using "cobol_options" instead of "cobol_unique_options" because the
> options aren't unique to Cobol, and because only cpp seems to have
> unique options.  
> 
> I'm including %{v} against the future, when the cobol1 compiler
> supports a -v option. 

Makes sense.

> 3.  Correct the entries in the default_compilers array.  Currently I
> have in cobol/lang-specs.h:
> 
> {".cob", "@cobol", 0, 0, 0},
> {".COB", "@cobol", 0, 0, 0},
> {".cbl", "@cobol", 0, 0, 0},
> {".CBL", "@cobol", 0, 0, 0},
> {"@cobol", 
>   "cobol1 %i %(cc1_options) %{!fsyntax-only:%(invoke_as)}", 
>   0, 0, 0}, 
> 
> That last one is a doozy.  Is it even slightly right?

It misses %(cpp_unique_options), which was the reason why your -I arguments 
weren't passed to cobol1.  You would just use your new %(cobol_options), or 
simply '%{v} %{I*}' directly, in addition to cc1_options.

> IIUC, I should at
> least remove 
> 
>   %{!fsyntax-only:%(invoke_as)}
> 
> because I don't need the options from the invoke_as string in gcc.cc. 

I think you do, as cobol1 will write out assembler code (it does, right?), 
so to get an object file the driver needs to invoke 'as' as well.  
Basically invoke_as tacks another command to run onto the end of the already 
built command line (the one that above would start with 'cobol1 
inputfile.cob ... all the options ...').  It will basically tack the 
equivalent of '| as tempfile.s -o realoutput.o' onto the end (which 
eventually will make the driver execute that command as well).

> That would still leave me with too much, because cobol1 ignores most of
> the options cc1 accepts.  What would you do?

I would go through all cc1_options and see if they _really_ shouldn't be 
interpreted by cobol1.  I guess for instance '-Ddefine' really doesn't 
make sense, but e.g. '-m...' and '-f...' do, and maybe -quiet as well, and 
suchlike.  In that case I'd just use cc1_options (in addition to your new 
%{I*} snippet).

If you then really determine that no, most options do not make sense, you 
need to extract the subset of cc1_options that do and write it into the 
@cobol entry.  Look e.g. at what the Fortran frontend does (in 
fortran/lang-specs.h): it simply attaches more things to cc1_options.

> I don't understand the relationship between default_compliers and
> static_specs.  

static_specs lists the names of 'variables' you can use within the specs 
strings, and what they should expand to.  E.g. if I wanted to use 
'%(foobar)' in any of my specs strings, that would need to be registered in 
static_specs[]:

  INIT_STATIC_SPEC ("foobar", &the_variable_containing_a_string)

The specs parser would then replace '%(foobar)' in specs strings with 
whatever that variable contains.

Using such variables mostly makes sense if you want to enable users (who 
can provide their own specs file) to refer to well-known snippets 
maintained by GCC itself.  For most such strings it's not necessary, and 
you'd be fine with the approach of fortran:

 #define MY_FOOBAR_STRING "%{v} ... this and that ..."

...

  {"@mylang", ... "lang1 %i " MY_FOOBAR_STRING "" ... }

> I have made no special provision for "compiler can deal with
> multiple source files", except that cobol1 accepts multiple source
> files on the command line, and iterates over them.  If that's enough,
> then I'll set compiler::combinable to 1.

No good advice here for combinable.  Try it :)

> As I mentioned, for a year I've been able to avoid the Specs Language,
> apparently because some things happen by default.  The options defined
> in cobol/lang.opt are passed from gcobol to cobol1.  The value of the
> -findicator-column option is available (but only if the option starts
> with "f"; -indicator-column doesn't work).  cobol1 sees the value of
> -fmax-errors. 

That's because "%{f*}" is contained in %(cc1_options): 'pass on all 
options starting with "f"', and because you listed cc1_options in your 
cobol1 command line.


Ciao,
Michael.


Re: access to include path in front end

2022-11-30 Thread Michael Matz via Gcc
Hello,

On Tue, 29 Nov 2022, James K. Lowden wrote:

> I don't understand how to access in a front end the arguments to the -I
> option on the command line.  
> 
> Cobol has a feature similar to the C preprecessor, known as the
> Compiler Directing Facility (CDF).  The CDF has a COPY statement that
> resembles an #include directive in C, and shares the property that COPY
> names a file that is normally found in a "copybook" which, for our
> purposes, is a directory of such files.  The name of that directory is
> defined outside the Cobol program.  
> 
> I would like to use the -I option to pass the names of copybook
> directories to the cobol front end.  A bit of exploration yesterday left
> me with the sense that the -I argument, in C at least, is not passed to
> the compiler, but to the preprocessor. Access to -fmax-errors I think
> I've figured out, but -I is a mystery. 
> 
> I'm a little puzzled by the status quo as I understand it.  Unless I
> missed it, it's not discussed in gccint.  ISTM ideally there would be
> some kind of getopt(3) processing, and the whole set of command-line
> options captured in an array of structures accessible to any front
> end.

There is, it's just much more complicated than getopt :)

If you're looking at the C frontends for inspiration, then:

c-family/c.opt defines which options are recognized and several specifics 
about them, e.g. for -I it has:


I
C ObjC C++ ObjC++ Joined Separate MissingArgError(missing path after %qs)
-I <dir>	Add <dir> to the end of the main include path.


(look at some other examples therein, also in common.opt to get a feel).

Then code in c-family/c-opts.c:c_common_handle_option actually handles the 
option:

    case OPT_I:
      if (strcmp (arg, "-"))
        add_path (xstrdup (arg), INC_BRACKET, 0, true);
      else
        ...

That function is made a langhook for option processing so that it's 
actually called via c/c-objc-common.h:

  #define LANG_HOOKS_HANDLE_OPTION c_common_handle_option

If you're also using the model of a compiler driver (i.e. the gcc program, 
source in gcc.cc) that actually calls compiler (cc1), assembler and 
linker, then you also need to arrange for that program to pass all -I 
options to the compiler proper.  That is done with the spec language, by 
somewhere having '{I*}' in the specs for invoking the cobol compiler.  
E.g. look in gcc.cc for '@c' (matching the file extension) how that entry 
uses '%(cpp_unique_options)', and how cpp_unique_options is defined for 
the specs language:

  INIT_STATIC_SPEC ("cpp_unique_options",   &cpp_unique_options),

and

static const char *cpp_unique_options =
  "%{!Q:-quiet} %{nostdinc*} %{C} %{CC} %{v} %@{I**} %{P} %I\  

(the specs language used here is documented in a lengthy comment early in 
gcc.cc, "The Specs Language")

The "%@{I*F*}" is the one that makes gcc pass -Iwhatever to cc1 (and 
ensures relative order with -F options is retained and puts all these into 
an @file if one is given on the cmdline, otherwise leaves it on cmdline).  
If you use the compiler driver then using '-v' when invoking it will 
quickly tell you if that options passing worked, as it will show the 
concrete command it exec's for the compiler proper.

Hope this helps.


Ciao,
Michael.


Re: [PATCH] Various pages: SYNOPSIS: Use VLA syntax in function parameters

2022-11-29 Thread Michael Matz via Gcc
Hey,

On Tue, 29 Nov 2022, Uecker, Martin wrote:

> It does not require any changes on how arrays are represented.
> 
> As part of VM-types the size becomes part of the type and this
> can be used for static or dynamic analysis, e.g. you can 
> - today - get a run-time bounds violation with the sanitizer:
> 
> void foo(int n, char (*buf)[n])
> {
>   (*buf)[n] = 1;
> }

This can already be statically analyzed as being wrong, no need for dynamic 
checking.  What I mean is the checking of the claimed contract.  Above you 
assure for the function body that buf has n elements.  This is also a 
pre-condition for calling this function and _that_ can't be checked in all 
cases because:

  void foo (int n, char (*buf)[n]) { (*buf)[n-1] = 1; }
  void callfoo(char * buf) { foo(10, buf); }

buf doesn't have a known size.  And a pre-condition that can't be checked 
is no pre-condition at all, as only then it can become a guarantee for the 
body.

The compiler has no choice but to trust the user that the pre-condition 
for calling foo is fulfilled.  I can see how being able to just check half 
of the contract might be useful, but if it doesn't give full checking then 
any proposal for syntax should be even more obviously orthogonal than the 
current one.
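
To make that concrete, here's a small (deliberately broken, untested) 
example: the store in foo is within bounds of the *declared* type, so even 
the bounds sanitizer has nothing to complain about, while the violated 
pre-condition at the call site is invisible to any checker that only sees 
the declarations:

void foo (int n, char (*buf)[n]) { (*buf)[n-1] = 1; }  /* fine if contract holds */

void callfoo (char *buf) { foo (10, (char (*)[10]) buf); }

int main (void)
{
  char small[3];
  callfoo (small);   /* breaks foo's pre-condition, silently */
  return 0;
}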

> For
> 
> void foo(int n, char buf[n]);
> 
> it semantically has no meaning according to the C standard,
> but a compiler could still warn. 

Hmm?  Warn about what in this decl?

> It could also warn for
> 
> void foo(int n, char buf[n]);
> 
> int main()
> {
> char buf[9];
> foo(buf);
> }

You mean if you write 'foo(10,buf)' (the above, as is, is simply a syntax 
error for non-matching number of args).  Or was it a mispaste and you mean 
the one from the godbolt link, i.e.:

void foo(char buf[10]){ buf[9] = 1; }
int main()
{
char buf[9];
foo(buf);
}

?  If so, yeah, we warn already.  I don't think this is an argument for 
(or against) introducing new syntax.

...

> But in general: This feature is useful not only for documentation
> but also for analysis.

Which feature are we talking about now?  The ones you used all work today, 
as you demonstrated.  I thought we would be talking about that ".whatever" 
syntax to refer to arbitrary parameters, even following ones?  I think a 
disruptive syntax change like that should have a higher bar than "in some 
cases, depending on circumstance, we might even be able to warn".


Ciao,
Michael.


Re: [PATCH] Various pages: SYNOPSIS: Use VLA syntax in function parameters

2022-11-29 Thread Michael Matz via Gcc
Hey,

On Tue, 29 Nov 2022, Alex Colomar via Gcc wrote:

> How about the compiler parsing the parameter list twice?

This _is_ unbounded look-ahead.  You could avoid this by using "." for 
your new syntax.  Use something unambiguous that can't be confused with 
other syntactic elements, e.g. with a different punctuator like '@' or the 
like.  But I'm generally doubtful of this whole feature within C itself.  
It serves a purpose in documentation, so in man-pages it seems fine enough 
(but then still could use a different punctuator to not be confusable with 
C syntax).

But within C it still can only serve a documentation purpose as no 
checking could be performed without also changes in how e.g. arrays are 
represented (they always would need to come with a size).  It seems 
doubtful to introduce completely new and ambiguous syntax with all the 
problems Joseph lists just in order to be able to write documentation when 
there's a perfectly fine method to do so: comments.


Ciao,
Michael.


Re: How can Autoconf help with the transition to stricter compilation defaults?

2022-11-17 Thread Michael Matz via Gcc
Hello,

On Wed, 16 Nov 2022, Paul Eggert wrote:

> On 2022-11-16 06:26, Michael Matz wrote:
> > char foobar(void);
> > int main(void) {
> >    return &foobar != 0;
> > }
> 
> That still has undefined behavior according to draft C23,

This is correct (and also holds for the actually working variant later, 
with a volatile variable).  If your argument is then that as both 
solutions for the link-test problem are relying on undefined behaviour 
they are equivalent and hence no change is needed, you have a point, but I 
disagree.  In practice one (with the call) will cause more problems than 
the other (with address taking).

> If Clang's threatened pickiness were of some real use elsewhere, it 
> might be justifiable for default Clang to break Autoconf. But so far we 
> haven't seen real-world uses that would justify this pickiness for 
> Autoconf's use of 'char memset_explicit(void);'.

Note that both GCC and clang already warn (not error out!) about the 
mismatching decl, even without any headers.  So we are in the pickiness 
era already.

I.e. a C file containing just the single line "char printf(void);" will be 
warned about, by default.  There is just about nothing that autoconf could 
do to rectify this, except carrying a long list of prototypes for 
well-known functions, with the associated maintenance hassle.  But 
autoconf _can_ do something about how the decls are used in the 
link-tests.


Ciao,
Michael.


Re: How can Autoconf help with the transition to stricter compilation defaults?

2022-11-16 Thread Michael Matz via Gcc
Hello,

On Wed, 16 Nov 2022, Jonathan Wakely wrote:

> > > Unrelated but I was a bit tempted to ask for throwing in
> > > -Wbuiltin-declaration-mismatch to default -Werror while Clang 16 was at
> > > it, but I suppose we don't want the world to burn too much,
> >
> > :-)  It's IMHO a bug in the standard that it misses "if any of its
> > associated headers are included" in the item for reservation of external
> > linkage identifiers; it has that for all other items about reserved
> > identifiers in the Library clause.  If that restriction were added you
> > couldn't justify erroring on the example at hand (because it doesn't
> > include e.g. <stdio.h> and then printf wouldn't be reserved).  A warning
> > is of course always okay and reasonable.  As is, you could justify
> > erroring out, but I too think that would be overzealous.
> 
> 
> I think that's very intentional and not a defect in the standard.
> 
> If one TU was allowed to define:
> 
> void printf() { }
> 
> and have that compiled into the program, then that would cause
> unexpected behaviour for every other TU which includes <stdio.h> and
> calls printf. They would get the non-standard rogue printf.

True.  But suppose the restriction were added.  I could argue that 
then your problem program (in some other TU) _does_ include the header, 
hence the identifier would have been reserved and so the above definition 
would have been wrong.  I.e. I think adding the restriction wouldn't allow 
the problematic situation either.

I'm aware that the argument would then invoke all the usual problems of 
what constitutes a full program, and if that includes the library even 
when not including headers and so on.  And in any case currently the 
standard does say they're reserved so it's idle speculation anyway :)


Ciao,
Michael.


Re: How can Autoconf help with the transition to stricter compilation defaults?

2022-11-16 Thread Michael Matz via Gcc
Hello,

On Wed, 16 Nov 2022, Sam James wrote:

> Unrelated but I was a bit tempted to ask for throwing in 
> -Wbuiltin-declaration-mismatch to default -Werror while Clang 16 was at 
> it, but I suppose we don't want the world to burn too much,

:-)  It's IMHO a bug in the standard that it misses "if any of its 
associated headers are included" in the item for reservation of external 
linkage identifiers; it has that for all other items about reserved 
identifiers in the Library clause.  If that restriction were added you 
couldn't justify erroring on the example at hand (because it doesn't 
include e.g. <stdio.h> and then printf wouldn't be reserved).  A warning 
is of course always okay and reasonable.  As is, you could justify 
erroring out, but I too think that would be overzealous.


Ciao,
Michael.


Re: How can Autoconf help with the transition to stricter compilation defaults?

2022-11-16 Thread Michael Matz via Gcc
Hey,

On Wed, 16 Nov 2022, Alexander Monakov wrote:

> > The idea is so obvious that I'm probably missing something, why autoconf 
> > can't use that idiom instead.  But perhaps the (historic?) reasons why it 
> > couldn't be used are gone now?
> 
> Ironically, modern GCC and LLVM optimize '&foobar != 0' to '1' even at -O0,
> and thus no symbol reference remains in the resulting assembly.

Err, right, *head-->table*.
Playing with volatile should help:

char foobar(void);
char (* volatile ptr)(void);
int main(void) {
ptr = foobar;
return ptr != 0;
}


Ciao,
Michael.


Re: How can Autoconf help with the transition to stricter compilation defaults?

2022-11-16 Thread Michael Matz via Gcc
Hi,

On Tue, 15 Nov 2022, Paul Eggert wrote:

> On 2022-11-15 11:27, Jonathan Wakely wrote:
> > Another perspective is that autoconf shouldn't get in the way of
> > making the C and C++ toolchain more secure by default.
> 
> Can you cite any examples of a real-world security flaw what would be 
> found by Clang erroring out because 'char foo(void);' is the wrong 
> prototype? Is it plausible that any such security flaw exists?
> 
> On the contrary, it's more likely that Clang's erroring out here would 
> *introduce* a security flaw, because it would cause 'configure' to 
> incorrectly infer that an important security-relevant function is 
> missing and that a flawed substitute needs to be used.
> 
> Let's focus on real problems rather than worrying about imaginary ones.

I sympathize, and I would think a compiler emitting an error (not a 
warning) in the situation at hand (in absence of -Werror) is overly 
pedantic.  But, could autoconf perhaps avoid the problem?  AFAICS the 
ac_fn_c_check_func really does only a link test to check for symbol 
existence, and the perceived problem is that the call statement in main() 
invokes UB.  So, let's avoid the call then while retaining the access to 
the symbol?  Like:

-
char foobar(void);
int main(void) {
  return &foobar != 0;
}
-

No call involved: no reason for compiler to complain.  The prototype decl 
itself will still be "wrong", but compilers complaining about that (in 
absence of a pre-existing different prototype, which is avoided by 
autoconf) seem unlikely.

Obviously this program will also say "foobar exists" if it's a data 
symbol, but that's the same with the variant using the call on most 
platforms (after all it's not run).

The idea is so obvious that I'm probably missing something, why autoconf 
can't use that idiom instead.  But perhaps the (historic?) reasons why it 
couldn't be used are gone now?


Ciao,
Michael.


Re: [PATCH 1/3] STABS: remove -gstabs and -gxcoff functionality

2022-11-10 Thread Michael Matz via Gcc-patches
Hello,

On Thu, 10 Nov 2022, Martin Liška wrote:

> > These changes are part of
> > commit r13-2361-g7e0db0cdf01e9c885a29cb37415f5bc00d90c029
> > "STABS: remove -gstabs and -gxcoff functionality".  What this does is
> > remove these identifiers from "poisoning":
> > 
> >  /* As the last action in this file, we poison the identifiers that
> > shouldn't be used.
> >  [...]
> >  /* Other obsolete target macros, or macros that used to be in target
> > headers and were not used, and may be obsolete or may never have
> > been used.  */
> >   #pragma GCC poison [...]
> > 
> > Shouldn't these identifiers actually stay (so that any accidental future
> > use gets flagged, as I understand this machinery), and instead more
> > identifiers be added potentially: those where their definition/use got
> > removed with "STABS: remove -gstabs and -gxcoff functionality"?  (I've
> > not checked.)
> 
> Well, the identifiers are not used any longer, so I don't think we should
> poison them. Or do I miss something?

It's the very nature of poisoned identifiers that they aren't used (every 
use would get flagged as an error).  The point of poisoning them is to keep 
future new uses from creeping in (e.g. via misguided back- or forward-ports, 
which can for instance happen easily with backend macros when an 
out-of-tree port is eventually tried to be integrated).  Hence, generally 
the list of those identifiers is only extended, never reduced.  (There may 
be exceptions, of course.)
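
For reference, the mechanism is just the preprocessor's poisoning pragma; 
a tiny self-contained example:

/* From here on every mention of the identifier is rejected by the
   preprocessor, which is what catches accidental re-introduction, e.g.
   via backports or integration of out-of-tree ports.  */
#pragma GCC poison DBX_REGISTER_NUMBER

/* #define DBX_REGISTER_NUMBER(REGNO) (REGNO)
   ... uncommenting the line above would now be a hard error
   ("attempt to use poisoned ...").  */

int main (void) { return 0; }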


Ciao,
Michael.


Re: [PATCH] sphinx: support Sphinx in lib*/Makefile.am.

2022-11-10 Thread Michael Matz via Gcc-patches
Hello,

On Thu, 10 Nov 2022, Martin Liška wrote:

> This is a patch which adds support for Sphinx in lib*/Makefile.am where
> I wrongly modified Makefile.in that are generated.
> 
> One thing that's missing is that the generated Makefile.in does not
> contain 'install-info-am' target and thus the created info files
> are not installed with 'make install'. Does anybody know?

The whole generation/processing of '*info*' targets (and dvi,pdf,ps,html 
targets) is triggered by the presence of a 'TEXINFO' primary 
(here in the 'info_TEXINFOS' variable), which you removed.  As the sphinx 
result is not appropriate for either TEXINFO or MANS primaries (the only 
ones in automake related specifically to documentation), you probably want 
to include them in the DATA primary.  For backward compatibility you might 
want to add your own {un,}install-info-am targets depending on 
{un,}install-data-am then, though I'm not sure why one would need one.

I currently don't quite see how you make the Sphinx results be installed 
at all, AFAICS there's no mention of them in any of the automake 
variables.  You have to list something somewhere (as said, probably in 
DATA) to enable automake to generate the usual set of Makefile targets.

(beware: I'm not an automake expert, so the above might turn out to be 
misleading advice :-) )


Ciao,
Michael.


> 
> Thanks,
> Martin
> 
> ---
>  libgomp/Makefile.am   |  27 ++-
>  libgomp/Makefile.in   | 275 +++---
>  libgomp/testsuite/Makefile.in |   3 +
>  libitm/Makefile.am|  26 ++-
>  libitm/Makefile.in| 278 ++
>  libitm/testsuite/Makefile.in  |   3 +
>  libquadmath/Makefile.am   |  37 ++--
>  libquadmath/Makefile.in   | 307 +++---
>  8 files changed, 208 insertions(+), 748 deletions(-)
> 
> diff --git a/libgomp/Makefile.am b/libgomp/Makefile.am
> index 428f7a9dab5..ab5e86b0f98 100644
> --- a/libgomp/Makefile.am
> +++ b/libgomp/Makefile.am
> @@ -11,6 +11,8 @@ config_path = @config_path@
>  search_path = $(addprefix $(top_srcdir)/config/, $(config_path))
> $(top_srcdir) \
> $(top_srcdir)/../include
>  +abs_doc_builddir = @abs_top_builddir@/doc
> +
>  fincludedir =
> $(libdir)/gcc/$(target_alias)/$(gcc_version)$(MULTISUBDIR)/finclude
>  libsubincludedir = $(libdir)/gcc/$(target_alias)/$(gcc_version)/include
>  @@ -100,18 +102,6 @@ fortran.o: libgomp_f.h
>  env.lo: libgomp_f.h
>  env.o: libgomp_f.h
>  -
> -# Automake Documentation:
> -# If your package has Texinfo files in many directories, you can use the
> -# variable TEXINFO_TEX to tell Automake where to find the canonical
> -# `texinfo.tex' for your package. The value of this variable should be
> -# the relative path from the current `Makefile.am' to `texinfo.tex'.
> -TEXINFO_TEX   = ../gcc/doc/include/texinfo.tex
> -
> -# Defines info, dvi, pdf and html targets
> -MAKEINFOFLAGS = -I $(srcdir)/../gcc/doc/include
> -info_TEXINFOS = libgomp.texi
> -
>  # AM_CONDITIONAL on configure option --generated-files-in-srcdir
>  if GENINSRC
>  STAMP_GENINSRC = stamp-geninsrc
> @@ -127,7 +117,7 @@ STAMP_BUILD_INFO =
>  endif
>   -all-local: $(STAMP_GENINSRC)
> +all-local: $(STAMP_GENINSRC) $(STAMP_BUILD_INFO)
>   stamp-geninsrc: libgomp.info
>   cp -p $(top_builddir)/libgomp.info $(srcdir)/libgomp.info
> @@ -135,8 +125,15 @@ stamp-geninsrc: libgomp.info
>   libgomp.info: $(STAMP_BUILD_INFO)
>  -stamp-build-info: libgomp.texi
> - $(MAKEINFO) $(AM_MAKEINFOFLAGS) $(MAKEINFOFLAGS) -I $(srcdir) -o
> libgomp.info $(srcdir)/libgomp.texi
> +RST_FILES:=$(shell find $(srcdir) -name *.rst)
> +SPHINX_CONFIG_FILES:=$(srcdir)/doc/conf.py $(srcdir)/../doc/baseconf.py
> +SPHINX_FILES:=$(RST_FILES) $(SPHINX_CONFIG_FILES)
> +
> +stamp-build-info: $(SPHINX_FILES)
> + + if [ x$(HAS_SPHINX_BUILD) = xhas-sphinx-build ]; then \
> +   make -C $(srcdir)/../doc info SOURCEDIR=$(abs_srcdir)/doc
> BUILDDIR=$(abs_doc_builddir)/info SPHINXBUILD=$(SPHINX_BUILD); \
> +   cp ./doc/info/texinfo/libgomp.info libgomp.info; \
> + else true; fi
>   @touch $@
>   diff --git a/libgomp/Makefile.in b/libgomp/Makefile.in
> index 814ccd13dc0..4d0f2184e95 100644
> --- a/libgomp/Makefile.in
> +++ b/libgomp/Makefile.in
> @@ -177,7 +177,7 @@ am__uninstall_files_from_dir = { \
>  || { echo " ( cd '$$dir' && rm -f" $$files ")"; \
>   $(am__cd) "$$dir" && rm -f $$files; }; \
>}
> -am__installdirs = "$(DESTDIR)$(toolexeclibdir)" "$(DESTDIR)$(infodir)" \
> +am__installdirs = "$(DESTDIR)$(toolexeclibdir)" \
>   "$(DESTDIR)$(fincludedir)" "$(DESTDIR)$(libsubincludedir)" \
>   "$(DESTDIR)$(toolexeclibdir)"
>  LTLIBRARIES = $(toolexeclib_LTLIBRARIES)
> @@ -269,16 +269,9 @@ am__v_FCLD_0 = @echo "  FCLD" $@;
>  am__v_FCLD_1 =
>  SOURCES = $(libgomp_plugin_gcn_la_SOURCES) \
>   $(libgomp_plugin_nvptx_la_SOURCES) $(libgomp_la_SOURCES)
> -AM_V_DVIPS = $(am__v_DVIPS_@AM_V@)
> -am__v_DVIPS_ = 

Re: [RFC] docs: remove documentation for unsupported releases

2022-11-09 Thread Michael Matz via Gcc-patches
Hello,

On Wed, 9 Nov 2022, Martin Liška wrote:

> I think we should remove documentation for unsupported GCC releases
> as it's indexed by Google engine. Second reason is that the page is long
> one one can't easily jump to Current development documentation.
> 
> Thoughts?

I think everything that's on the web server (even the 2.95 docu) has to be 
reachable via a (reasonable) set of clicks from the main index.html.  It 
doesn't need to be _directly_ from the main index.html, though.

Also, you simply might want to move the "Current Development" section 
to the front if it's currently too cumbersome to reach.

(E.g. one could move the index of the obsolete versions to a different web 
page, make that one un-indexable by google, but leave a trace to that one 
from the main index.html).


Ciao,
Michael.


> Martin
> 
> ---
>  htdocs/onlinedocs/index.html | 1294 --
>  1 file changed, 1294 deletions(-)
> 
> diff --git a/htdocs/onlinedocs/index.html b/htdocs/onlinedocs/index.html
> index cfa8bf5a..138dde95 100644
> --- a/htdocs/onlinedocs/index.html
> +++ b/htdocs/onlinedocs/index.html
> @@ -255,1300 +255,6 @@
>   href="https://gcc.gnu.org/onlinedocs/gcc-10.4.0/docs-sources.tar.gz;>Texinfo
>   sources of all the GCC 10.4 manuals
>
>  [... the rest of the quoted diff removes the onlinedocs/index.html
>   entries for the GCC 9.5 manuals (GCC, GNU Fortran, CPP, GNAT Reference
>   Manual and User's Guide, libstdc++, GCCGO, libgomp) in their
>   HTML/PDF/PostScript/tarball variants ...]
Re: Local type inference with auto is in C2X

2022-11-03 Thread Michael Matz via Gcc
Hello,

On Thu, 3 Nov 2022, Florian Weimer via Gcc wrote:

> will not have propagated widely once GCC 13 releases, so rejecting
> implicit ints in GCC 13 might be too early.  GCC 14 might want to switch
> to C23/C24 mode by default, activating auto support, if the standard
> comes out in 2023 (which apparently is the plan).
> 
> Then we would go from
> warning to changed semantics in a single release.
> 
> Comments?

I would argue that changing the default C mode to c23 in the year that 
comes out (or even a year later) is too aggressive and early.  Existing 
sources are often compiled with defaults, and hence would change 
semantics, which seems unattractive.  New code can instead easily use 
-std=c23 for a time.

E.g. c99/gnu99 (a largish deviation from gnu90) was never default and 
gnu11 was made default only in 2014.


Ciao,
Michael.


Re: Remove support for Intel MIC offloading (was: [PATCH] Remove dead code.)

2022-10-20 Thread Michael Matz via Gcc-patches
Hey,

On Thu, 20 Oct 2022, Thomas Schwinge wrote:

> This had been done in
> wwwdocs commit 5c7ecfb5627e412a3d142d8dc212f4cd39b3b73f
> "Document deprecation of OpenMP MIC offloading in GCC 12".
> 
> I'm sad about this, because -- in theory -- such a plugin is very useful
> for offloading simulation/debugging (separate host/device memory spaces,
> allow sanitizers to run on offloaded code

Yeah, I think that's a _very_ useful feature, but indeed ...

> (like LLVM a while ago
> implemented), and so on), but all that doesn't help -- in practice -- if
> nobody is maintaining that code.

... it should then be maintained somewhat properly.  Maybe the 
MIC-specifics could be removed from the code, and it could be transformed 
into a "null" offload target, as an example and testing vehicle (implying 
that such a new liboffloadmic^H^H^Hnull would have its upstream in the GCC 
repo).  Alas, if no one is going to do that work, removing is the right 
choice.


Ciao,
Michael.


Re: [PATCH 1/2] gcov: test switch/break line counts

2022-10-11 Thread Michael Matz via Gcc-patches
Hello,

On Tue, 11 Oct 2022, Jørgen Kvalsvik wrote:

> I apologise for the confusion. The diff there is not a part of the 
> change itself (note the indentation) but rather a way to reproduce,

Ah!  Thanks, that explains it, sorry for adding confusion on top :-)


Ciao,
Michael.


Re: [PATCH 1/2] gcov: test switch/break line counts

2022-10-11 Thread Michael Matz via Gcc-patches
Hello,

On Tue, 11 Oct 2022, Jørgen Kvalsvik via Gcc-patches wrote:

> The coverage support will under some conditions decide to split edges to
> accurately report coverage. By running the test suite with/without this
> edge splitting a small diff shows up, addressed by this patch, which
> should catch future regressions.
> 
> Removing the edge splitting:
> 
> $ diff --git a/gcc/profile.cc b/gcc/profile.cc
> --- a/gcc/profile.cc
> +++ b/gcc/profile.cc
> @@ -1244,19 +1244,7 @@ branch_prob (bool thunk)
> Don't do that when the locuses match, so
> if (blah) goto something;
> is not computed twice.  */
> - if (last
> - && gimple_has_location (last)
> - && !RESERVED_LOCATION_P (e->goto_locus)
> - && !single_succ_p (bb)
> - && (LOCATION_FILE (e->goto_locus)
> - != LOCATION_FILE (gimple_location (last))
> - || (LOCATION_LINE (e->goto_locus)
> - != LOCATION_LINE (gimple_location (last)
> -   {
> - basic_block new_bb = split_edge (e);
> - edge ne = single_succ_edge (new_bb);
> - ne->goto_locus = e->goto_locus;
> -   }
> +

Assuming this is correct (I really can't say) then the comment needs 
adjustments.  It specifically talks about this very code you remove.


Ciao,
Michael.


Re: [PATCH] [PR68097] frange::set_nonnegative should not contain -NAN.

2022-09-20 Thread Michael Matz via Gcc-patches
Hello,

On Tue, 20 Sep 2022, Aldy Hernandez wrote:

> > FWIW, in IEEE, 'abs' (like 'copy', 'copysign' and 'negate') are not
> > arithmetic, they are quiet-computational.  Hence they don't raise
> > anything, not even for sNaNs; they copy the input bits and appropriately
> > modify the bit pattern according to the specification (i.e. fiddle the
> > sign bit).
> >
> > That also means that a predicate like negative_p(x) that would be
> > implemented ala
> >
> >   copysign(1.0, x) < 0.0
> 
> I suppose this means -0.0 is not considered negative,

It would be considered negative if the predicate is implemented like 
above:
   copysign(1.0, -0.0) == -1.0

But really, that depends on what _our_ definition of negative_p is 
supposed to be.  I think the most reasonable definition is indeed similar 
to above, which in turn is equivalent to simply looking at the sign bit 
(which is what copysign() does), i.e. ...

> though it has
> the signbit set?  FWIW, on real_value's real_isneg() returns true for
> -0.0 because it only looks at the sign.

... this seems the sensible thing.  I just wanted to argue the case that 
set_negative (or the like) which "sets" the sign bit does not make the 
nan-ness go away.  They are orthogonal.
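
A quick runnable version of that predicate, to make the -0.0 and -NaN 
behaviour concrete (on the usual IEEE setups unary minus just flips the 
sign bit, so -NAN is a NaN with the sign bit set):

#include <math.h>
#include <stdio.h>

/* Looks only at the sign bit; copysign is quiet-computational, so nothing
   is raised even for NaN inputs.  */
static int
negative_p (double x)
{
  return copysign (1.0, x) < 0.0;
}

int main (void)
{
  printf ("%d %d %d %d\n",
          negative_p (-0.0),    /* 1: sign bit is set */
          negative_p (-NAN),    /* 1: "negative" NaN */
          negative_p (0.0),     /* 0 */
          negative_p (-1.0));   /* 1 */
  return 0;
}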

> > deal with NaNs just fine and is required to correctly capture the sign of
> > 'x'.  If frange::set_nonnegative is supposed to be used in such contexts
> > (and I think it's a good idea if that were the case), then set_nonnegative
> > does _not_ imply no-NaN.
> >
> > In particular I would assume that, given a VARYING frange FR,
> > FR.set_nonnegative() would result in an frange {[+0.0,+inf],+nan} .
> 
> That was my understanding as well, and what my original patch did.
> But again, I'm just the messenger.

Ah, I obviously haven't followed the thread carefully then.  If that's 
what it was doing then IMO it was the right thing.


Ciao,
Michael.


Re: [PATCH] [PR68097] frange::set_nonnegative should not contain -NAN.

2022-09-19 Thread Michael Matz via Gcc-patches
Hello,

On Mon, 19 Sep 2022, Richard Biener via Gcc-patches wrote:

> > but I guess it's good we do the right thing for correctness sake, and
> > if it ever gets used by someone else.
> >
> > >
> > > That said, 'set_nonnegative' could be interpreted to be without
> > > NaNs?
> >
> > Sounds good to me.  How's this?
> 
> Hmm, I merely had lots of questions above so I think to answer them
> we at least should document what 'set_nonnegative' implies in the
> abstract vrange class.
> 
> It's probably safer to keep NaN in.  For example unfolded copysign (x, 1.)
> will return true for x == -NaN but the result will be a NaN.

FWIW, in IEEE, 'abs' (like 'copy', 'copysign' and 'negate') are not 
arithmetic, they are quiet-computational.  Hence they don't raise 
anything, not even for sNaNs; they copy the input bits and appropriately 
modify the bit pattern according to the specification (i.e. fiddle the 
sign bit).

That also means that a predicate like negative_p(x) that would be 
implemented ala

  copysign(1.0, x) < 0.0

deal with NaNs just fine and is required to correctly capture the sign of 
'x'.  If frange::set_nonnegative is supposed to be used in such contexts 
(and I think it's a good idea if that were the case), then set_nonnegative 
does _not_ imply no-NaN.

In particular I would assume that, given a VARYING frange FR, 
FR.set_nonnegative() would result in an frange {[+0.0,+inf],+nan}.


Ciao,
Michael.


Re: [PATCH 2/3] rename DBX_REGISTER_NUMBER to DEBUGGER_REGISTER_NUMBER

2022-09-01 Thread Michael Matz via Gcc-patches
Hello,

okay, I'll bite :)  DBG_REGISTER_NUMBER?  DEBUGGER_REGNO?


Ciao,
Michael.


Re: Counting static __cxa_atexit calls

2022-08-24 Thread Michael Matz via Gcc
Hello,

On Wed, 24 Aug 2022, Florian Weimer wrote:

> > On Wed, 24 Aug 2022, Florian Weimer wrote:
> >
> >> > Isn't this merely moving the failure point from exception-at-ctor to 
> >> > dlopen-fails?
> >> 
> >> Yes, and that is a soft error that can be handled (likewise for
> >> pthread_create).
> >
> > Makes sense.  Though that actually hints at a design problem with ELF 
> > static ctors/dtors: they should be able to soft-fail (leading to dlopen or 
> > pthread_create error returns).  So, maybe the _best_ way to deal with this 
> > is to extend the definition of the various object-initialization means 
> > in ELF to allow propagating failure.
> 
> We could enable unwinding through the dynamic linker perhaps.  But as I
> said, those Itanium ABI functions tend to be noexcept, so there's work
> on that front as well.

Yeah, my idea would have been slightly less ambitious: redefine the ABI of 
.init_array functions to be able to return an int.  The loader would abort 
loading if any of them return non-zero.  Now change GCC code emission of 
those helper functions placed in .init_array to catch all exceptions and 
(in case an exception happened) return non-zero.  Or, even easier, don't 
deal with exceptions, but rather just check if __cxa_atexit worked, and if 
not return non-zero right away.  That way all the exception propagation 
(or cxa_atexit error handling) stays purely within the GCC generated code 
and the dynamic loader only needs to deal with return values, not 
exceptions and unwinding.

For backward compat we can't just change the ABI of .init_array, but we 
can devise an alternative: .init_array_mayfail and the associated DT tags.
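
As a hand-written sketch (this is not existing GCC output; the section and 
function names below are made up for illustration), such a failable helper 
could look like this, using the "just check __cxa_atexit" variant:

  #include <new>

  extern "C" int __cxa_atexit (void (*dtor) (void *), void *obj, void *dso);
  extern "C" void *__dso_handle;

  struct S { S () {} ~S () {} };
  alignas (S) static unsigned char obj_storage[sizeof (S)];

  static void obj_dtor (void *p) { static_cast<S *> (p)->~S (); }

  /* Would be placed into the hypothetical .init_array_mayfail section;
     the loader would fail dlopen/startup on a non-zero return instead of
     the registration silently failing or an exception escaping.  */
  extern "C" int static_init_mayfail (void)
  {
    S *obj = new (obj_storage) S ();   /* run the constructor */
    return __cxa_atexit (obj_dtor, obj, &__dso_handle) != 0;
  }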

> For thread-local storage, it's even more difficult because any first
> access can throw even if the constructor is noexcept.

That's extending the scope somewhat, pre-counting cxa_atexit wouldn't 
solve this problem either, right?

> >> I think we need some level of link editor support to avoid drastically
> >> over-counting multiple static calls that get merged into one
> >> implementation as the result of vague linkage.  Not sure how to express
> >> that at the ELF level?
> >
> > Hmm.  The __cxa_atexit calls are coming from the per-file local static 
> > initialization_and_destruction routine which doesn't have vague linkage, 
> > so its contribution to the overall number of cxa_atexit calls doesn't 
> > change from .o to final-exe.  Can you show an example of what you're 
> > worried about?
> 
> Sorry if I didn't use the correct terminology.
> 
> I was thinking about this:
> 
> #include <vector>
> 
> template <int i>
> struct S {
>   static std::vector<int> vec;
> };
> 
> template <int i> std::vector<int> S<i>::vec(i);
> 
> std::vector<int> &
> f()
> {
>   return S<1009>::vec;
> }
> 
> The initialization is deduplicated with the help of a guard variable,
> and that also bounds to number of __cxa_atexit invocations to at most
> one per type.

Ah, right, thanks.  The guard variable is for class-local statics; I was 
thinking of file-scope globals.  Double-hmm.  I don't readily see a nice way 
to correctly precalculate the number of cxa_atexit calls here.  A simple 
problem is the following: assume a couple of files, each defining such class 
templates, that ultimately define and initialize static members A<1>::a 
and B<1>::b (assume vague linkage).  Assume we have four files:

a:  defines A::a
b:  defines B::b
ab: defines A::a and B::b
ba: defines B::b and A::a

Now link order influences which file gets to actually initialize the 
members and which ones skip it due to guard variables.  But the object 
files themselves don't know enough context to tell which will be which.  Not 
even the link editor knows that, because the non-taken cxa_atexit calls 
aren't in linkonce/group sections; they are all there in 
object.o:.text:_Z41__static_initialization_and_destruction_0ii .

So, what would need to be emitted is for instance a list of cxa_atexit 
calls plus guard variable; the link editor could then count all unguarded 
cxa_atexit calls plus all guarded ones, but the latter only once per 
guard.  The key would be the identity of the guard variable.
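
The counting rule itself would be simple; here is a toy illustration of just 
that rule (nothing below is a proposed ELF encoding, and the guard names are 
made up):

  #include <set>
  #include <string>
  #include <vector>
  #include <cstdio>

  /* One record per emitted __cxa_atexit call; an empty guard name means
     the call is unguarded.  */
  struct atexit_record { std::string guard; };

  static unsigned max_cxa_atexit_calls (const std::vector<atexit_record> &recs)
  {
    std::set<std::string> seen_guards;
    unsigned n = 0;
    for (const atexit_record &r : recs)
      if (r.guard.empty ())
        ++n;                                   /* unguarded: always counts */
      else if (seen_guards.insert (r.guard).second)
        ++n;                                   /* guarded: once per guard */
    return n;
  }

  int main ()
  {
    /* E.g. the guarded registration for A<1>::a showing up in a.o, ab.o
       and ba.o, plus one unguarded call: the upper bound is 2, not 4.  */
    std::vector<atexit_record> recs
      = { { "guard_A1_a" }, { "guard_A1_a" }, { "guard_A1_a" }, { "" } };
    std::printf ("%u\n", max_cxa_atexit_calls (recs));   /* prints 2 */
    return 0;
  }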

That seems like an awful lot of complexity at the wrong level for a very 
specific usecase when we could also make .init_array failable, which then 
even might have more usecases.

> > A completely different way would be to not use cxa_atexit at all: 
> > allocate memory statically for the object and dtor addresses in 
> > .rodata (instead of in .text right now), and iterate over those at 
> > static_destruction time.  (For the thread-local ones it would need to 
> > store arguments to __tls_get_addr).
> 
> That only works if the compiler and linker can figure out the
> construction order.  In general, that is not possible, and that case
> seems even quite common with C++.  If the construction order is not
> known ahead of time, it is necessary to record it somewhere, so that
> destruction can happen in reverse.  So I think storing things in .rodata
> is out.

Hmm, right.  The 

Re: Counting static __cxa_atexit calls

2022-08-24 Thread Michael Matz via Gcc
Hello,

On Wed, 24 Aug 2022, Florian Weimer wrote:

> > Isn't this merely moving the failure point from exception-at-ctor to 
> > dlopen-fails?
> 
> Yes, and that is a soft error that can be handled (likewise for
> pthread_create).

Makes sense.  Though that actually hints at a design problem with ELF 
static ctors/dtors: they should be able to soft-fail (leading to dlopen or 
pthread_create error returns).  So, maybe the _best_ way to deal with this 
is to extend the definition of the various object-initialization means 
in ELF to allow propagating failure.

> > Probably a note section, which the link editor could either transform into 
> > a dynamic tag or leave as note(s) in the PT_NOTE segment.  The latter 
> > wouldn't require any specific tooling support in the link editor.  But the 
> > consumer would have to iterate through all the notes to add the 
> > individual counts together.  Might be acceptable, though.
> 
> I think we need some level of link editor support to avoid drastically
> over-counting multiple static calls that get merged into one
> implementation as the result of vague linkage.  Not sure how to express
> that at the ELF level?

Hmm.  The __cxa_atexit calls are coming from the per-file local static 
initialization_and_destruction routine which doesn't have vague linkage, 
so its contribution to the overall number of cxa_atexit calls doesn't 
change from .o to final-exe.  Can you show an example of what you're 
worried about?

A completely different way would be to not use cxa_atexit at all: allocate 
memory statically for the object and dtor addresses in .rodata (instead of 
in .text right now), and iterate over those at static_destruction time.  
(For the thread-local ones it would need to store arguments to 
__tls_get_addr).
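
As a toy model of that alternative (not an existing GCC or glibc mechanism; 
in reality the table would sit in .rodata and be filled in by the compiler, 
and, as discussed elsewhere in the thread, this presumes the construction 
order is known statically):

  #include <cstdio>

  struct dtor_entry { void (*dtor) (void *); void *obj; };

  static int a, b;
  static void dtor_a (void *) { std::printf ("destroy a\n"); }
  static void dtor_b (void *) { std::printf ("destroy b\n"); }

  /* Emission order == construction order; destruction iterates backwards.  */
  static const dtor_entry static_dtor_table[] = {
    { dtor_a, &a },
    { dtor_b, &b },
  };

  static void run_static_dtors (void)
  {
    const dtor_entry *p = static_dtor_table
                          + sizeof (static_dtor_table) / sizeof (static_dtor_table[0]);
    while (p-- > static_dtor_table)
      p->dtor (p->obj);          /* prints "destroy b", then "destroy a" */
  }

  int main () { run_static_dtors (); return 0; }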

Doing that or defining failure modes for ELF init/fini seems a better 
design than hacking around the current limitation via counting static 
cxa_atexit calls.


Ciao,
Michael.


Re: Counting static __cxa_atexit calls

2022-08-23 Thread Michael Matz via Gcc
Hello,

On Tue, 23 Aug 2022, Florian Weimer via Gcc wrote:

> We currently have a latent bug in glibc where C++ constructor calls can
> fail if they have static or thread storage duration and a non-trivial
> destructor.  The reason is that __cxa_atexit (and
> __cxa_thread_atexit_impl) may have to allocate memory.  We can avoid
> that if we know how many such static calls exist in an object (for C++,
> the compiler will never emit these calls repeatedly in a loop).  Then we
> can allocate the resources beforehand, either during process and thread
> start, or when dlopen is called and new objects are loaded.

Isn't this merely moving the failure point from exception-at-ctor to 
dlopen-fails?  If an individual __cxa_atexit can't allocate memory anymore 
for its list structure, why should pre-allocation (which is still dynamic, 
based on the number of actual atexit calls) have any more luck?

> What would be the most ELF-flavored way to implement this?  After the
> final link, I expect that the count (or counts, we need a separate
> counter for thread-local storage) would show up under a new dynamic tag
> in the dynamic segment.  This is actually a very good fit because older
> loaders will just ignore it.  But the question remains what GCC should
> emit into assembler & object files, so that the link editor can compute
> the total count from that.

Probably a note section, which the link editor could either transform into 
a dynamic tag or leave as note(s) in the PT_NOTE segment.  The latter 
wouldn't require any specific tooling support in the link editor.  But the 
consumer would have to iterate through all the notes to add the 
individual counts together.  Might be acceptable, though.


Ciao,
Michael.


Re: [PATCH] match.pd: Add bitwise and pattern [PR106243]

2022-08-04 Thread Michael Matz via Gcc-patches
Hello,

On Wed, 3 Aug 2022, Jeff Law via Gcc-patches wrote:

> > .optimized dump shows:
> >_1 = ABS_EXPR ;
> >_3 = _1 & 1;
> >return _3;
> > 
> > altho combine simplifies it to x & 1 on RTL, resulting in code-gen:
> > f1:
> >  and w0, w0, 1
> >  ret
> Doesn't the abs(x) & mask simplify to x & mask for any mask where the sign bit
> of x is off

No.  Only the lowest bit remains the same between x and -x; all others 
might or might not be inverted (first counterexample: x=-3, mask=3).
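
A quick stand-alone check of that counterexample (two's complement assumed):

  #include <cstdlib>
  #include <cstdio>

  int main ()
  {
    int x = -3, mask = 3;
    /* abs(-3) & 3 == 3, but -3 & 3 == 1, so the folding would be wrong.  */
    std::printf ("%d %d\n", std::abs (x) & mask, x & mask);   /* prints 3 1 */
    return 0;
  }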


Ciao,
Michael.


Re: DWARF question about size of a variable

2022-06-09 Thread Michael Matz via Gcc
Hello,

On Wed, 8 Jun 2022, Carl Love via Gcc wrote:

> Is there dwarf information that gives the size of a variable?

Yes, it's in the type description.  For array types the siblings of it 
give the index types and ranges.  If that range is 
computed at runtime DWARF will (try to) express it as an expression in 
terms of other available values (like registers, constants, or memory), 
and as such can also change depending on where (at which PC) you evaluate 
that expression (and the expression itself can also change per PC).  For 
instance, in your example, on x86 with -O3 we have these relevant DWARF 
snippets (readelf -wi):

 <2>: Abbrev Number: 12 (DW_TAG_variable)
   DW_AT_name: a
   DW_AT_type: <0xa29>

So, 'a' is a variable of type 0xa29, which is:

 <1>: Abbrev Number: 13 (DW_TAG_array_type)
   DW_AT_type: <0xa4a>
   DW_AT_sibling : <0xa43>
 <2>: Abbrev Number: 14 (DW_TAG_subrange_type)
   DW_AT_type: <0xa43>
   DW_AT_upper_bound : 10 byte block: 75 1 8 20 24 8 20 26 31 1c   
(DW_OP_breg5 (rdi): 1; DW_OP_const1u: 32; DW_OP_shl; DW_OP_const1u: 32; 
DW_OP_shra; DW_OP_lit1; DW_OP_minus)
 <2>: Abbrev Number: 0

So, type 0xa29 is an array type, whose element type is 0xa4a (which will 
turn out to be a signed char), and whose (single) dimension type is 0xa43 
(unsigned long) with an upper bound that is runtime computed, see below.
The referenced types from that are:

 <1>: Abbrev Number: 1 (DW_TAG_base_type)
   DW_AT_byte_size   : 8
   DW_AT_encoding: 7   (unsigned)
   DW_AT_name: (indirect string, offset: 0x13b): long unsigned 
int

 <1>: Abbrev Number: 1 (DW_TAG_base_type)
   DW_AT_byte_size   : 1
   DW_AT_encoding: 6   (signed char)
   DW_AT_name: (indirect string, offset: 0x1ce): char

With that gdb has all information to compute the size of this array 
variable in its scope ((upper-bound + 1 minus lower-bound (default 0)) 
times sizeof(basetype)).  Compare the above for instance with the 
debuginfo generated at -O0, only the upper-range expression changes:

 <2>: Abbrev Number: 10 (DW_TAG_subrange_type)
   DW_AT_type: <0xa29>
   DW_AT_upper_bound : 3 byte block: 91 68 6   (DW_OP_fbreg: -24; 
DW_OP_deref)

Keep in mind that DWARF expressions are based on a simple stack machine.
So, for instance, the computation for the upper bound in the O3 case is:
 ((register %rdi + 1) << 32 >> 32) - 1
(i.e. basically the 32-to-64 sign extension of %rdi).
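
As a toy rendering of that evaluation (this is not gdb's evaluator, just the 
seven operations above spelled out; it assumes %rdi holds the array length 
and that >> on a signed type is an arithmetic shift):

  #include <cstdint>
  #include <cstdio>

  static int64_t upper_bound_at_O3 (int64_t rdi)
  {
    int64_t stack[4];
    int sp = 0;
    stack[sp++] = rdi + 1;                              /* DW_OP_breg5 (rdi): 1 */
    stack[sp++] = 32;                                   /* DW_OP_const1u: 32 */
    { int64_t n = stack[--sp]; stack[sp - 1] <<= n; }   /* DW_OP_shl */
    stack[sp++] = 32;                                   /* DW_OP_const1u: 32 */
    { int64_t n = stack[--sp]; stack[sp - 1] >>= n; }   /* DW_OP_shra */
    stack[sp++] = 1;                                    /* DW_OP_lit1 */
    { int64_t n = stack[--sp]; stack[sp - 1] -= n; }    /* DW_OP_minus */
    return stack[0];
  }

  int main ()
  {
    /* For a 'char a[n]' with n == 10 in %rdi the upper bound is 9,
       i.e. 10 elements of 1 byte each.  */
    std::printf ("%lld\n", (long long) upper_bound_at_O3 (10));
    return 0;
  }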

On ppc I assume that either the upper_bound attribute isn't there or 
contains an uninformative expression (or one that isn't valid at the 
program-counter gdb stops at), in which case you would want to look at 
dwarf2out.cc:subrange_type_die or add_subscript_info (look for 
TYPE_MAX_VALUE of the subscripts domain type).  Hope this helps.


Ciao,
Michael.


Re: [PATCH] Add GIMPLE switch support to loop unswitching

2022-05-25 Thread Michael Matz via Gcc-patches
Hello,

On Wed, 25 May 2022, Richard Biener via Gcc-patches wrote:

> I guess we might want to (warn about labels in the "toplevel"
> scope in switch statements).  So warn about
> 
> switch (x)
> {
> case 1:
> bar:
> };

That style is actually used quite a bit in GCC itself.  Sometimes with 
statements between 'case 1:' and 'bar:'.

> Maybe we just want to warn when the label is otherwise unused?

We do, with -Wunused-label (part of -Wall).  The testcases simply aren't 
compiled with that.


Ciao,
Michael.


Re: [x86 PATCH] Avoid andn and generate shorter not;and with -Oz.

2022-04-13 Thread Michael Matz via Gcc-patches
Hello,

On Wed, 13 Apr 2022, Roger Sayle wrote:

> The x86 instruction encoding for SImode andn is longer than the
> equivalent notl/andl sequence when the source for the not operand
> is the same register as the destination.

_And_ when no REX prefixes are necessary for the notl/andl, which they are 
if the respective registers are %r8 or beyond.  As you seem to be fine 
with saving just a byte you ought to test that as well to not waste one 
again :-)


Ciao,
Michael.


Re: Consider 'TDF_UID', 'TDF_NOUID' in 'print_node_brief', 'print_node'

2022-02-10 Thread Michael Matz via Gcc-patches
Hi,

On Thu, 10 Feb 2022, Richard Biener via Gcc-patches wrote:

> On Wed, Feb 9, 2022 at 2:21 PM Thomas Schwinge  
> wrote:
> >
> > Hi!
> >
> > OK to push (now, or in next development stage 1?) the attached
> > "Consider 'TDF_UID', 'TDF_NOUID' in 'print_node_brief', 'print_node'",
> > or should that be done differently -- or, per the current state (why?)
> > not at all?
> >
> > This does work for my current debugging task, but I've not yet run
> > 'make check' in case anything needs to be adjusted there.
> 
> Hmm, I wonder if we shouldn't simply dump DECL_UID as
> 
>  'uid NNN'

Yes, much better in line with the normal dump_tree output.


Ciao,
Michael.

> 
> somewhere.  For example after or before DECL_NAME?
> 
> >
> > Regards,
> >  Thomas
> >
> >
> 


Re: [PATCH 1/4][RFC] middle-end/90348 - add explicit birth

2022-02-09 Thread Michael Matz via Gcc-patches
Hello,

On Wed, 9 Feb 2022, Richard Biener wrote:

> > That breaks down when a birth is there (because it was otherwise 
> > reachable) but not on the taken path:
> > 
> >   if (nevertrue)
> > goto start;
> >   goto forw;
> >   start:
> >   {
> > int i;
> > printf("not really reachable, but unknowingly so\n");
> >   forw:
> > i = 1;
> >   }
> 
> I think to cause breakage you need a use of 'i' on the side-entry
> path that is not reachable from the path with the birth.  I guess sth like
> 
>if (nevertrue)
>  goto start;
>goto forw;
>start:
>{
>  int i = 0;
>  printf("not really reachable, but unknowingly so\n");
>  goto common;
>forw:
>  i = 1;
>common:
>  foo ();
>}
> 
> if we'd have a variable that's live only on the side-entry path
> then it could share the stack slot with 'i' this way, breaking
> things (now we don't move CLOBBERs so it isn't easy to construct
> such case).  The present dataflow would, for the above, indeed
> compute 'i' not live in the forw: block.

Yes, now that we have established (in the subthread with Joseph) that the 
value becomes indeterminate at decls we only need to care for not sharing 
storage invalidly, so yeah, some changes in the conflict computation still 
are needed.

> > Except for switches side-entries are really really seldom, so we might 
> > usefully get away with that latter solution.  And for switches it might be 
> > okay to put the births at the block containing the switch (if it itself 
> > doesn't have side entries, and the switch block doesn't have side 
> > entries except the case labels).
> > 
> > If the birth is moved to outward blocks it might be best if also the 
> > dealloc/death clobbers are moved to it, otherwise there might be paths 
> > containing a birth but no death.
> > 
> > The less precise you get with those births the more non-sharing you'll 
> > get, but that's the price.
> 
> Yes, sure.  I don't see a good way to place births during gimplification
> then.

Well, for each BIND you can compute whether there are side entries at all; 
then, when lowering a BIND, you put the births into the innermost 
containing BIND that doesn't have side-entries, instead of into the 
current BIND.

> The end clobbers rely on our EH lowering machinery.  For the entries we 
> might be able to compute GIMPLE BIND transitions during BIND lowering if 
> we associate labels with BINDs.  There should be a single fallthru into 
> the BIND at this point.  We could put a flag on the goto destination 
> labels whether they are reached from an outer BIND.
> 
>  goto inner;
>  {
>{
> int i;
>  {
>int j;
> inner:
>goto middle;
>  }
> middle:
>}
>  }
> 
> Since an entry CLOBBER is also a clobber we have to insert birth
> clobbers for all decls of all the binds inbetwee the goto source
> and the destination.  So for the above case on the edge to
> inner: have births for i and j and at the edge to middle we'd
> have none.

Correct, that's basically the most precise scheme; it's what I called 
try-finally topside-down ("always-before"? :) ).  (You have to watch out for 
computed gotos, i.e. BINDs containing address-taken labels, which make 
things even uglier.)  I think the easier way to deal with the above is to 
notice that the inner BIND has a side entry and conceptually move the 
decls outwards to BINDs that don't have such (or bind-crossing gotos), 
i.e. do as if it were:

  int i;  // moved
  int j;  // moved
  goto inner;
  {
{
  {
  inner:
goto middle;
  }
  middle:
}
  }

> Requires some kind of back-mapping from label to goto sources
> that are possibly problematic.  One issue is that GIMPLE
> lowering re-builds the BLOCK tree (for whatever reason...),
> so I'm not sure if we need to do it before that (for correctness).
> 
> Does that make sense?

I honestly can't say for 100% :-)  It sounds like it makes sense, yes.


Ciao,
Michael.


Re: [PATCH 1/4][RFC] middle-end/90348 - add explicit birth

2022-02-09 Thread Michael Matz via Gcc-patches
Hey,

On Tue, 8 Feb 2022, Joseph Myers wrote:

> On Tue, 8 Feb 2022, Richard Biener via Gcc-patches wrote:
> 
> > which I think is OK?  That is, when the abstract machine
> > arrives at 'int i;' then the previous content in 'i' goes
> > away?  Or would
> 
> Yes, that's correct.  "If an initialization is specified for the object, 
> it is performed each time the declaration or compound literal is reached 
> in the execution of the block; otherwise, the value becomes indeterminate 
> each time the declaration is reached.".

Okay, that makes things easier then.  We can put the birth 
clobbers at their point of declaration; just the storage associated with a 
decl needs to survive for the whole block.  We still need to make sure 
that side entries skipping declarations correctly "allocate" such storage 
(by introducing proper conflicts with other objects), but at least values 
don't need to survive decls.


Ciao,
Michael.


Re: [PATCH 1/4][RFC] middle-end/90348 - add explicit birth

2022-02-08 Thread Michael Matz via Gcc-patches
Hello,

On Tue, 8 Feb 2022, Richard Biener wrote:

> > int state = 2, *p, camefrom1 = 0;
> > for (;;) switch (state) {
> >   case 1: 
> >   case 2: ;
> > int i;
> > if (state != 1) { p = &i; i = 0; }
> > if (state == 1) { (*p)++; return *p; }
> > state = 1;
> > continue;
> > }
> > 
> > Note how i is initialized during state 2, and needs to be kept initialized 
> > during state 1, so there must not be a CLOBBER (birth or other) at the 
> > point of the declaration of 'i'.  AFAICS in my simple tests a DECL_EXPR 
> > for 'i' is placed with the statement associated with 'case 2' label, 
> > putting a CLOBBER there would be the wrong thing.  If the decl had an 
> > initializer it might be harmless, as it would be overwritten at that 
> > place, but even so, in this case there is no initializer.  Hmm.
> 
> You get after gimplification:
> 
>   state = 2;
>   camefrom1 = 0;
>   :
>   switch (state) <default: <...>, case 1: <...>, case 2: <...>>
>   {
> int i;
> 
> try
>   {
> i = {CLOBBER(birth)};  /// ignore, should go away
> :
> :
> i = {CLOBBER(birth)};

I think this clobber here would be the problem, because ...

> which I think is OK?  That is, when the abstract machine
> arrives at 'int i;' then the previous content in 'i' goes
> away?  Or would
> 
> int foo()
> {
>goto ick;
> use:
>int i, *p;
>return *p;
> ick:
>i = 1;
>p = &i;
>goto use;
> 
> }
> 
> require us to return 1?

... I think this is exactly the logical consequence of lifetime of 'i' 
being the whole block.  We need to return 1. (Joseph: correct me on that! 
:) )  That's what I was trying to get at with my switch example as well.

> With the current patch 'i' is clobbered before the return.
> 
> > Another complication arises from simple forward jumps:
> > 
> >   goto forw;
> >   {
> > int i;
> > printf("not reachable\n");
> >   forw:
> > i = 1;
> >   }
> > 
> > Despite the jump skipping the unreachable head of the BLOCK, 'i' needs to 
> > be considered birthed at the label.  (In a way the placement of births 
> > exactly mirrors the placements of deaths (of course), every path from 
> > outside a BLOCK to inside needs to birth-clobber all variables (in C), 
> > like every path leaving needs to kill them.  It's just that we have a 
> > convenient construct for the latter (try-finally), but not for the former)
> 
> The situation with an unreachable birth is handled conservatively
> since we consider a variable without a (visible at RTL expansion time)
> birth to conflict with all other variables.

That breaks down when a birth is there (because it was otherwise 
reachable) but not on the taken path:

  if (nevertrue)
goto start;
  goto forw;
  start:
  {
int i;
printf("not really reachable, but unknowingly so\n");
  forw:
i = 1;
  }

> I don't see a good way to have a birth magically appear at 'forw' 
> without trying to argue that the following stmt is the first mentioning 
> the variable.

That's what my 'Hmm' alluded to :)  The only correct and precise way I see 
is to implement something similar to try-finally topside-down.  An 
easier but less precise way is to place the births at the (start of the) 
innermost block containing the decl _and all jumps into the block_.  Even 
less precise, but perhaps even easier, is to place the births for decls in 
blocks with side-entries into the function scope (and for blocks without 
side entries at their start).

Except for switches side-entries are really really seldom, so we might 
usefully get away with that latter solution.  And for switches it might be 
okay to put the births at the block containing the switch (if it itself 
doesn't have side entries, and the switch block doesn't have side 
entries except the case labels).

If the birth is moved to outward blocks it might be best if also the 
dealloc/death clobbers are moved to it, otherwise there might be paths 
containing a birth but no death.

The less precise you get with those births the more non-sharing you'll 
get, but that's the price.


Ciao,
Michael.


Re: [PATCH 1/4][RFC] middle-end/90348 - add explicit birth

2022-02-08 Thread Michael Matz via Gcc-patches
Hello,

On Tue, 8 Feb 2022, Richard Biener wrote:

> int foo(int always_true_at_runtime)
> {
>   {
> int *p;
> if (always_true_at_runtime)
>   goto after;
> lab:
> return *p;
> after:
> int i = 0;
> p = &i;
> if (always_true_at_runtime)
>   goto lab;
>   }
>   return 0;
> }
> 
> For the implementation I considered starting lifetime at a DECL_EXPR
> if it exists and otherwise at the start of the BIND_EXPR.  Note the
> complication is that control flow has to reach the lifetime-start
> clobber before the first access.  But that might also save us here,
> since for the example above 'i' would be live via the backedge
> from goto lab.
> 
> That said, the existing complication is that when the gimplifier
> visits the BIND_EXPR it has no way to know whether there will be
> a following DECL_EXPR or not (and the gimplifier notes there are
> cases a DECL_EXPR variable is not in a BIND_EXPR).  My plan is to
> compute this during one of the body walks the gimplifier performs
> before gimplifying.
> 
> Another complication is that at least the C frontend + gimplifier
> combination for
> 
>   switch (i)
>   {
>   case 1:
> int i;
> break;
>   }
> 
> will end up having the BIND_EXPR mentioning 'i' containing the
> case label, so just placing the birth at the BIND will make it
> unreachable as
> 
>   i = BIRTH;
> case_label_for_1:
>   ...

I think anything that needs to happen (conceptually) during the jump from 
switch to case-label needs to happen right before the jump; that might 
mean creating a new fake BLOCK that contains just the jump and the 
associated births.

> conveniently at least the C frontend has a DECL_EXPR for 'i'
> which avoids this and I did not find a way (yet) in the gimplifier
> to rectify this when gimplifying switches.

In C a 'case' label is nothing other than a normal label; it doesn't start 
its own block or the like.  So for that one, too, the births would need to 
be at the start of their containing blocks.

> So there's still work to do but I think that starting the lifetime at a 
> DECL_EXPR if it exists is the way to go?

Depends on where the DECL_EXPR is exactly placed, but probably it wouldn't 
be okay.  You can't place the birth at the fall-through path, because this 
needs to work (basically your jump example above rewritten as switch):

int state = 2, *p, camefrom1 = 0;
for (;;) switch (state) {
  case 1: 
  case 2: ;
int i;
if (state != 1) { p = &i; i = 0; }
if (state == 1) { (*p)++; return *p; }
state = 1;
continue;
}

Note how i is initialized during state 2, and needs to be kept initialized 
during state 1, so there must not be a CLOBBER (birth or other) at the 
point of the declaration of 'i'.  AFAICS in my simple tests a DECL_EXPR 
for 'i' is placed with the statement associated with 'case 2' label, 
putting a CLOBBER there would be the wrong thing.  If the decl had an 
initializer it might be harmless, as it would be overwritten at that 
place, but even so, in this case there is no initializer.  Hmm.

Another complication arises from simple forward jumps:

  goto forw;
  {
int i;
printf("not reachable\n");
  forw:
i = 1;
  }

Despite the jump skipping the unreachable head of the BLOCK, 'i' needs to 
be considered birthed at the label.  (In a way the placement of births 
exactly mirrors the placements of deaths (of course), every path from 
outside a BLOCK to inside needs to birth-clobber all variables (in C), 
like every path leaving needs to kill them.  It's just that we have a 
convenient construct for the latter (try-finally), but not for the former)

Ciao,
Michael.


Re: [PATCH] libgcc: Actually force divide by zero

2022-02-03 Thread Michael Matz via Gcc-patches
Hello,

On Thu, 3 Feb 2022, Richard Biener via Gcc-patches wrote:

> /* Preserve explicit divisions by 0: the C++ front-end wants to detect
>undefined behavior in constexpr evaluation, and assuming that the 
> division
>traps enables better optimizations than these anyway.  */
> (for div (trunc_div ceil_div floor_div round_div exact_div)
>  /* 0 / X is always zero.  */
>  (simplify
>   (div integer_zerop@0 @1)
>   /* But not for 0 / 0 so that we can get the proper warnings and errors.  
> */
>   (if (!integer_zerop (@1))
>@0))
> 
> it suggests we want to preserve all X / 0 when the 0 is literal but
> I think we can go a bit further and require 0/0 to not unnecessarily
> restrict other special cases.

Just remember that 0/0 is completely different from X/0 (with X != 0): the 
latter is a sensible limit, the former is just nonsense.  There's a 
reason why one is a NaN and the other Inf in floating point.  So it does 
make sense to distinguish between both on the integer side as well.

(I'm torn on the usefulness of "1/x -> 0" vs. its effect on 
trapping behaviour)


Ciao,
Michael.

> 
> Comments on the libgcc case below
> 
> > 2022-02-03  Jakub Jelinek  
> > 
> > * libgcc2.c (__udivmoddi4): Add optimization barriers to actually
> > ensure division by zero.
> > 
> > --- libgcc/libgcc2.c.jj 2022-01-11 23:11:23.810270199 +0100
> > +++ libgcc/libgcc2.c2022-02-03 09:24:14.513682731 +0100
> > @@ -1022,8 +1022,13 @@ __udivmoddi4 (UDWtype n, UDWtype d, UDWt
> > {
> >   /* qq = NN / 0d */
> >  
> > - if (d0 == 0)
> > -   d0 = 1 / d0;/* Divide intentionally by zero.  */
> > + if (__builtin_expect (d0 == 0, 0))
> > +   {
> > + UWtype one = 1;
> > + __asm ("" : "+g" (one));
> > + __asm ("" : "+g" (d0));
> > + d0 = one / d0;/* Divide intentionally by zero.  */
> > +   }
> 
> I'm not sure why we even bother - division or modulo by zero is
> undefined behavior and we are not emulating CPU behavior because
> the wide instructions emulated here do not actually exist.  That
> gives us the freedom of choosing the implementation defined
> behavior.
> 
> That said, _if_ we choose to "care" I'd rather choose to
> define the implementation to use the trap mechanism the
> target provides and thus use __builtin_trap ().  That then
> at least traps reliably, compared to the division by zero
> which doesn't do that on all targets.
> 
> So I'm not sure there's anything to fix besides eventually
> just deleting the d0 == 0 special case?
> 
> Richard.
> 


Re: [PATCH] Add CLOBBER_MARKS_EOL to mark storage end-of-life clobbers

2022-02-03 Thread Michael Matz via Gcc-patches
Hello,

On Thu, 3 Feb 2022, Richard Biener via Gcc-patches wrote:

> > > It indeed did occur to me as well, so as we're two now I tried to 
> > > see how it looks like.  It does look like the following (didn't bother to 
> > > replace all build_clobber calls but defaulted the parameter - there 
> > > are too many).
> > 
> > Except for this part I like it (I wouldn't call ca. 25 calls too 
> > many).  I often find optional arguments to be a long term maintenance 
> > burden when reading code.
> 
> But I think in this case it makes clear that the remaining callers are 
> un-audited

My pedantic answer to that would be that to make something clear you want 
to be explicit, and being explicit means writing something, not 
leaving something out, so you'd add another "CLOBBER_DONTKNOW" and use 
that for the unaudited calls.  Pedantic, as I said :)

But, of course, you built the shed, so paint it green, all right :)


Ciao,
Michael.


Re: [PATCH] Add CLOBBER_MARKS_EOL to mark storage end-of-life clobbers

2022-02-03 Thread Michael Matz via Gcc-patches
Hello,

On Thu, 3 Feb 2022, Richard Biener wrote:

> > > This adds a flag to CONSTRUCTOR nodes indicating that for clobbers this 
> > > marks the end-of-life of storage as opposed to just ending the lifetime 
> > > of the object that occupied it. The dangling pointer diagnostics uses 
> > > CLOBBERs but is confused by those emitted by the C++ frontend for 
> > > example which emits them for the second purpose at the start of CTORs.  
> > > The issue is also apparent for aarch64 in PR104092.
> > > 
> > > Distinguishing the two cases is also necessary for the PR90348 fix.
> > 
> > (Just FWIW: I agree with the plan to have different types of CLOBBERs, in 
> > particular those for marking births)
> > 
> > A style nit:
> > 
> > > tree clobber = build_clobber (TREE_TYPE (t));
> > > +   CLOBBER_MARKS_EOL (clobber) = 1;
> > 
> > I think it would really be better if build_clobber() gained an additional 
> > argument (non-optional!) that sets the kind of it.
> 
> It indeed did occur to me as well, so as we're two now I tried to see
> how it looks like.  It does look like the following (didn't bother to
> replace all build_clobber calls but defaulted the parameter - there
> are too many).

Except for this part I like it (I wouldn't call ca. 25 calls too 
many).  I often find optional arguments to be a long term maintenance 
burden when reading code.

(And yeah, an enum is good; put the enum into tree_base.u if it works, 
otherwise use your bit shuffling)


Ciao,
Michael.


Re: [PATCH] Add CLOBBER_MARKS_EOL to mark storage end-of-life clobbers

2022-02-02 Thread Michael Matz via Gcc-patches
Hello,

On Wed, 2 Feb 2022, Richard Biener via Gcc-patches wrote:

> This adds a flag to CONSTRUCTOR nodes indicating that for clobbers this 
> marks the end-of-life of storage as opposed to just ending the lifetime 
> of the object that occupied it. The dangling pointer diagnostics uses 
> CLOBBERs but is confused by those emitted by the C++ frontend for 
> example which emits them for the second purpose at the start of CTORs.  
> The issue is also appearant for aarch64 in PR104092.
> 
> Distinguishing the two cases is also necessary for the PR90348 fix.

(Just FWIW: I agree with the plan to have different types of CLOBBERs, in 
particular those for marking births)

A style nit:

> tree clobber = build_clobber (TREE_TYPE (t));
> +   CLOBBER_MARKS_EOL (clobber) = 1;

I think it would really be better if build_clobber() gained an additional 
argument (non-optional!) that sets the kind of it.


Ciao,
Michael.


Re: reordering of trapping operations and volatile

2022-01-17 Thread Michael Matz via Gcc
Hello,

On Sat, 15 Jan 2022, Martin Uecker wrote:

> > Because it interferes with existing optimisations. An explicit 
> > checkpoint has a clear meaning. Using every volatile access that way 
> > will hurt performance of code that doesn't require that behaviour for 
> > correctness.
> 
> This is why I would like to understand better what real use cases of 
> performance sensitive code actually make use of volatile and are 
> negatively affected. Then one could discuss the tradeoffs.

But you seem to ignore whatever we say in this thread.  There are now 
multiple examples that demonstrate problems with your proposal as imagined 
(for lack of a _concrete_ proposal with wording from you), problems that 
don't involve volatile at all.  They all stem from the fact that you order 
UB with respect to all side effects (because you haven't said how you want 
to avoid such total ordering with all side effects).

As I said upthread: you need to define a concept of time at whose 
granularity you want to limit the effects of UB, and the borders of each 
time step can't simply be (all) the existing side effects.  Then you need 
to have wording for what it means for UB to occur within such a time step, 
in particular if multiple UBs happen within one (for best results it should 
simply be UB, not individual instances of different UBs).

If you look at the C++ proposal (thanks Jonathan) I think you will find 
that if you replace 'std::observable' with 'sequence point containing a 
volatile access' that you basically end up with what you wanted.  The 
crucial point being that the time steps (epochs in that proposal) aren't 
defined by all side effects but by a specific and explicit thing only (new 
function in the proposal, volatile accesses in an alternative).

FWIW: I think for a new language feature reusing volatile accesses as the 
clock ticks is the worse choice: if you intend that feature to be used 
for writing safer programs (a reasonable thing) I think being explicit and 
at the same time zero-overhead is better (i.e. a new internal 
function/keyword/builtin, specified to have no effects except moving the 
clock forward).  volatile accesses obviously already exist and hence are 
easier to integrate into the standard, but in a given new/safe program, 
whenever you see a volatile access you would always need to ask 'is this 
for clock ticks, or is it a "real" volatile access for memory-mapped IO'.


Ciao,
Michael.


Re: git hooks: too strict check

2022-01-14 Thread Michael Matz via Gcc
Hello,

On Fri, 14 Jan 2022, Martin Liška wrote:

> Hello.
> 
> I'm working on a testsuite clean-up where some of the files are wrongly named.
> More precisely, some files have a .cc extension and should use .C.  However there's
> an existing C test-case and it leads to:
> 
> marxin@marxinbox:~/Programming/gcc/gcc/testsuite> find . -name test-asm.*
> ./jit.dg/test-asm.C
> ./jit.dg/test-asm.c

You can't have that, the check is correct.  There are filesystems (NTFS 
for instance) that are case-preserving but case-insensitive, on those you 
really can't have two files that differ only in casing.  You need to find 
a different solution, either consistently use .cc instead of .C, live with 
the inconsistency or rename the base name of these files.


Ciao,
Michael.

> 
> test-kunlun me/rename-testsuite-files
> Enumerating objects: 804, done.
> Counting objects: 100% (804/804), done.
> Delta compression using up to 16 threads
> Compressing objects: 100% (242/242), done.
> Writing objects: 100% (564/564), 142.13 KiB | 7.48 MiB/s, done.
> Total 564 (delta 424), reused 417 (delta 295), pack-reused 0
> remote: Resolving deltas: 100% (424/424), completed with 222 local objects.
> remote: *** The following filename collisions have been detected.
> remote: *** These collisions happen when the name of two or more files
> remote: *** differ in casing only (Eg: "hello.txt" and "Hello.txt").
> remote: *** Please re-do your commit, chosing names that do not collide.
> remote: ***
> remote: *** Commit: 7297e1de9bed96821d2bcfd034bad604ce035afb
> remote: *** Subject: Rename tests in jit sub-folder.
> remote: ***
> remote: *** The matching files are:
> remote: ***
> remote: *** gcc/testsuite/jit.dg/test-quadratic.C
> remote: *** gcc/testsuite/jit.dg/test-quadratic.c
> remote: ***
> remote: *** gcc/testsuite/jit.dg/test-switch.C
> remote: *** gcc/testsuite/jit.dg/test-switch.c
> remote: ***
> remote: *** gcc/testsuite/jit.dg/test-asm.C
> remote: *** gcc/testsuite/jit.dg/test-asm.c
> remote: ***
> remote: *** gcc/testsuite/jit.dg/test-alignment.C
> remote: *** gcc/testsuite/jit.dg/test-alignment.c
> 
> Can we please do something about it?
> 
> Thanks,
> Martin
> 

