[PATCH] Fix discrepancy in Walloca test on 32-bit systems.

2020-10-21 Thread Aldy Hernandez via Gcc-patches
There is a discrepancy in the way we report -Walloca-larger-than=
errors on 32-bit versus 64-bit architectures, due to the nature of
ranges derived from a cast operation.

For the Walloca-1 tests on 64-bits we get:

  int num.0_1;
  long unsigned int _2;

  <bb 2> [local count: 1073741824]:
  num.0_1 = num;
  _2 = (long unsigned int) num.0_1;
  s_8 = __builtin_alloca (_2);

Because of the cast of a 32-bit quantity into a 64-bit quantity in _2,
ranger calculates its range as:

long unsigned int [0, 2147483647][18446744071562067968, +INF]

Thus excluding the numbers that can't exist in _2.

This causes the Walloca pass to report that the argument to alloca may be
too large.

However, for -m32 on x86, the gimple is:

  int num.0_1;
  unsigned int num.1_2;

  <bb 2> [local count: 1073741824]:
  num.0_1 = num;
  num.1_2 = (unsigned int) num.0_1;
  s_8 = __builtin_alloca (num.1_2);

Since num.0_1 and num.1_2 are of the same size, we cannot determine
any useful range, so we return VARYING.  In the Walloca pass, VARYING
basically means "unbounded" (no known bounds for the alloca call argument).
So on 32-bits, the error message issued is slightly different:

warning: unbounded use of ‘alloca’

versus on 64-bits, where due to the cast, it is:

warning: argument to ‘alloca’ may be too large

In reality both versions of the IL show an unbounded call, but in one
version (64-bits) we can exclude some values so we assume the range
was provided, but it was out of bounds.

I've mentioned various times that all these diagnostics passes
(alloca, restrict, printf, etc), could benefit from less specific error
messages since what we have can potentially confuse the user.  However,
no consensus has been reached on how to report these things.

In the meantime, this patch adjusts the testcase to accept both variants.

Pushed.

gcc/testsuite/ChangeLog:

* gcc.dg/Walloca-1.c: Adjust for 32-bits.
---
 gcc/testsuite/gcc.dg/Walloca-1.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.dg/Walloca-1.c b/gcc/testsuite/gcc.dg/Walloca-1.c
index ed1fa929398..37ee1912187 100644
--- a/gcc/testsuite/gcc.dg/Walloca-1.c
+++ b/gcc/testsuite/gcc.dg/Walloca-1.c
@@ -24,7 +24,7 @@ void foo1 (size_t len, size_t len2, size_t len3)
   char *s = alloca (123);
   useit (s);   // OK, constant argument to alloca
 
-  s = alloca (num);// { dg-warning "may be too large" }
+  s = alloca (num);// { dg-warning "\(may be too large|unbounded use\)" }
   useit (s);
 
   s = alloca (3);  /* { dg-warning "is too large" } */
-- 
2.26.2



Re: [PATCH][middle-end][i386][version 3]Add -fzero-call-used-regs=[skip|used-gpr-arg|used-arg|all-arg|used-gpr|all-gpr|used|all]

2020-10-21 Thread Uros Bizjak via Gcc-patches
On Tue, Oct 20, 2020 at 10:04 PM Qing Zhao  wrote:

> +/* Check whether the register REGNO should be zeroed on X86.
> +   When ALL_SSE_ZEROED is true, all SSE registers have been zeroed
> +   together, no need to zero it again.
> +   Stack registers (st0-st7) and mm0-mm7 are aliased with each other,
> +   very hard to be zeroed individually; don't zero individual st or
> +   mm registers at this time.  */
> +
> +static bool
> +zero_call_used_regno_p (const unsigned int regno,
> + bool all_sse_zeroed)
> +{
> +  return GENERAL_REGNO_P (regno)
> +  || (!all_sse_zeroed && SSE_REGNO_P (regno))
> +  || MASK_REGNO_P (regno);
> +}
> +
> +/* Return the machine_mode that is used to zero register REGNO.  */
> +
> +static machine_mode
> +zero_call_used_regno_mode (const unsigned int regno)
> +{
> +  /* NB: We only need to zero the lower 32 bits for integer registers
> + and the lower 128 bits for vector registers since destination are
> + zero-extended to the full register width.  */
> +  if (GENERAL_REGNO_P (regno))
> +return SImode;
> +  else if (SSE_REGNO_P (regno))
> +return V4SFmode;
> +  else
> +return HImode;
> +}
> +
> +/* Generate a rtx to zero all vector registers together if possible,
> +   otherwise, return NULL.  */
> +
> +static rtx
> +zero_all_vector_registers (HARD_REG_SET need_zeroed_hardregs)
> +{
> +  if (!TARGET_AVX)
> +return NULL;
> +
> +  for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
> +if ((IN_RANGE (regno, FIRST_SSE_REG, LAST_SSE_REG)
> +  || (TARGET_64BIT
> +  && (REX_SSE_REGNO_P (regno)
> +  || (TARGET_AVX512F && EXT_REX_SSE_REGNO_P (regno)
> + && !TEST_HARD_REG_BIT (need_zeroed_hardregs, regno))
> +  return NULL;
> +
> +  return gen_avx_vzeroall ();
> +}
> +
> +/* Generate a rtx to zero all st and mm registers together if possible,
> +   otherwise, return NULL.  */
> +
> +static rtx
> +zero_all_st_mm_registers (HARD_REG_SET need_zeroed_hardregs)
> +{
> +  if (!TARGET_MMX)
> +return NULL;
> +
> +  for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
> +if ((STACK_REGNO_P (regno) || MMX_REGNO_P (regno))
> + && !TEST_HARD_REG_BIT (need_zeroed_hardregs, regno))
> +  return NULL;
> +
> +  return gen_mmx_emms ();
>
>
> emms is not clearing any register, it only loads x87FPUTagWord with
> FFFFH (marking all x87 registers empty). So I think, the above is
> useless, as far as register clearing is concerned.
>
>
> Thanks for the info.
>
> So, for mm and st registers, should we clear them, and how?
>
>
> I don't know.
>
> Please note that %mm and %st share the same register file, and
> touching %mm registers will block access to %st until emms is emitted.
> You can't just blindly load 0 to %st registers, because the register
> file can be in MMX mode and vice versa. For 32bit targets, function
> can also  return a value in the %mm0.
>
>
> If data flow determine that %mm0 does not return a value at the return, can 
> we clear all the %st as following:
>
> emms
> mov %st0, 0
> mov %st1, 0
> mov %st2, 0
> mov %st3, 0
> mov %st4, 0
> mov %st5, 0
> mov %st6, 0
> mov %st7, 0

The i386 ABI says:

-- q --
The CPU shall be in x87 mode upon entry to a function. Therefore,
every function that uses the MMX registers is required to issue an
emms or femms instruction after using MMX registers, before returning
or calling another function.
-- /q --

(The above requirement slightly contradicts its own ABI, since we have
3 MMX argument registers and MMX return register, so the CPU obviously
can't be in x87 mode at all function boundaries).

So, assuming that the first sentence is not deliberately vague w.r.t
function exit, emms should not be needed. However, we are dealing with
x87 stack registers that have their own set of peculiarities. It is
not possible to load a random register in the way you show.  Also,
stack should be either empty or one (two in case of complex value
return) levels deep at the function return. I think you want a series
of 8 or 7(6) fldz insns, followed by a series of fstp insn to clear
the stack and mark stack slots empty.

Uros.


[Ada] Simplify the VxWorks implementation of __gnat_environ

2020-10-21 Thread Pierre-Marie de Rodat
The VxWorks kernel implementation of __gnat_environ was more convoluted
than it needed to be and had an unneeded dependency on ppGlobalEnviron,
a symbol which Wind River has removed in newer versions of VxWorks. This
patch simplifies the implementation and uses a common implementation for
both VxWorks 6 and 7 kernel mode since they both support envGet.

Tested on x86_64-pc-linux-gnu, committed on trunk

gcc/ada/

* env.c (__gnat_environ): For VxWorks kernel simply return the
result of the envGet call. Do this for VxWorks 6 and 7 as they
both support the same API.diff --git a/gcc/ada/env.c b/gcc/ada/env.c
--- a/gcc/ada/env.c
+++ b/gcc/ada/env.c
@@ -68,18 +68,9 @@
envLib.h on VxWorks MILS and VxWorks 653.  */
 #include 
 #include 
-  #elif (_WRS_VXWORKS_MAJOR <= 6)
+  #else
+/* Kernel mode */
 #include 
-/* In that mode the following symbol is not defined in any VxWorks
-   include files, prior to vxWorks 7, so we declare it as extern.  */
-extern char** ppGlobalEnviron;
-  #elif (_WRS_VXWORKS_MAJOR >= 7)
-/* This should work for kernel mode on VxWorks 7.x.  In 7.2 the tcb
-   is made private, so accessor functions must be used, in 7.0 it
-   is optional but there is no way to distinguish between 7.2
-   and 7.0 since the version.h header file was never updated.  */
-#include 
-#include 
   #endif
 #endif
 
@@ -144,17 +135,11 @@ __gnat_environ (void)
   extern char **environ;
   return environ;
 #else
-  #if defined (__RTP__) || defined (VTHREADS) || (_WRS_VXWORKS_MAJOR <= 6)
+  #if defined (__RTP__) || defined (VTHREADS)
 return environ;
-  #elif (_WRS_VXWORKS_MAJOR >= 7)
-char **task_environ;
-
-task_environ = envGet (taskIdSelf ());
-
-if (task_environ == NULL)
-   return ppGlobalEnviron;
-else
-   return task_environ;
+  #else
+/* Kernel mode */
+return envGet (NULL);
   #endif
 #endif
 }




[Ada] Raise Capacity_Error on formal vector insertion

2020-10-21 Thread Pierre-Marie de Rodat
Capacity_Error should be raised on insertion inside a formal vector when
the capacity is exceeded.

Tested on x86_64-pc-linux-gnu, committed on trunk

gcc/ada/

* libgnat/a-cofove.adb (Copy): Add explanation in case of
Capacity_Error.
(Insert_Space): Raise Capacity_Error if the new length is
greater than the capacity.
(Reserve_Capacity): Raise Capacity_Error instead of
Constraint_Error.diff --git a/gcc/ada/libgnat/a-cofove.adb b/gcc/ada/libgnat/a-cofove.adb
--- a/gcc/ada/libgnat/a-cofove.adb
+++ b/gcc/ada/libgnat/a-cofove.adb
@@ -171,7 +171,7 @@ is
   elsif Capacity >= LS then
  C := Capacity;
   else
- raise Capacity_Error;
+ raise Capacity_Error with "Capacity too small";
   end if;
 
   return Target : Vector (C) do
@@ -956,6 +956,12 @@ is
 
   if New_Length > Max_Length then
  raise Constraint_Error with "Count is out of range";
+
+  --  Raise Capacity_Error if the new length exceeds the container's
+  --  capacity.
+
+  elsif New_Length > Container.Capacity then
+ raise Capacity_Error with "New length is larger than capacity";
   end if;
 
   J := To_Array_Index (Before);
@@ -1104,7 +1110,7 @@ is
is
begin
   if Capacity > Container.Capacity then
- raise Constraint_Error with "Capacity is out of range";
+ raise Capacity_Error with "Capacity is out of range";
   end if;
end Reserve_Capacity;
 




[Ada] Improve precision of Ada.Directories.Modification_Time

2020-10-21 Thread Pierre-Marie de Rodat
The modification file time precision is now defined by the OS.

Tested on x86_64-pc-linux-gnu, committed on trunk

gcc/ada/

* adaint.c (__gnat_file_time): New routine.
(__gnat_copy_attribs): Copy timestamps in nanoseconds.
* libgnat/a-direct.adb (C_Modification_Time): Bind to
__gnat_file_time.
(Modification_Time): Call to C_Modification_Time.diff --git a/gcc/ada/adaint.c b/gcc/ada/adaint.c
--- a/gcc/ada/adaint.c
+++ b/gcc/ada/adaint.c
@@ -60,6 +60,7 @@
 /* We want to use the POSIX variants of include files.  */
 #define POSIX
 #include "vxWorks.h"
+#include 
 
 #if defined (__mips_vxworks)
 #include "cacheLib.h"
@@ -1474,6 +1475,74 @@ __gnat_file_time_fd (int fd)
return __gnat_file_time_fd_attr (fd, &attr);
 }
 
+extern long long __gnat_file_time(char* name)
+{
+  long long result;
+
+  if (name == NULL) {
+return LLONG_MIN;
+  }
+  /* Number of seconds between the UNIX epoch (Jan 1st 1970) and the
+ Ada epoch (Jan 1st 2150). */
+  static const long long ada_epoch_offset = (136 * 365 + 44 * 366) * 86400LL;
+#if defined(_WIN32)
+
+  /* Number of 100 nanosecond intervals between the Windows epoch
+ (Jan 1st 1601) and the Ada epoch (Jan 1st 2150). */
+  static const long long w32_epoch_offset =
+  (11644473600LL + ada_epoch_offset) * 1E7;
+
+  WIN32_FILE_ATTRIBUTE_DATA fad;
+  union
+  {
+FILETIME ft_time;
+long long ll_time;
+  } t_write;
+
+  if (!GetFileAttributesExA(name, GetFileExInfoStandard, &fad)) {
+return LLONG_MIN;
+  }
+
+  t_write.ft_time = fad.ftLastWriteTime;
+
+  /* Next code similar to (t_write.ll_time - w32_epoch_offset) * 100
+ but on overflow returns LLONG_MIN value. */
+
+  if (__builtin_ssubll_overflow(t_write.ll_time, w32_epoch_offset, &result)) {
+return LLONG_MIN;
+  }
+
+  if (__builtin_smulll_overflow(result, 100, &result)) {
+return LLONG_MIN;
+  }
+
+#else
+
+  struct stat sb;
+  if (stat(name, &sb) != 0) {
+return LLONG_MIN;
+  }
+
+  /* Next code similar to
+ (sb.st_mtime - ada_epoch_offset) * 1E9 + sb.st_mtim.tv_nsec
+ but on overflow returns LLONG_MIN value. */
+
+  if (__builtin_ssubll_overflow(sb.st_mtime, ada_epoch_offset, &result)) {
+return LLONG_MIN;
+  }
+
+  if (__builtin_smulll_overflow(result, 1E9, &result)) {
+return LLONG_MIN;
+  }
+
+#if defined(st_mtime)
+  if (__builtin_saddll_overflow(result, sb.st_mtim.tv_nsec, &result)) {
+return LLONG_MIN;
+  }
+#endif
+
+#endif
+  return result;
+}
+
 /* Set the file time stamp.  */
 
 void
@@ -3173,22 +3242,45 @@ __gnat_copy_attribs (char *from ATTRIBUTE_UNUSED, char *to ATTRIBUTE_UNUSED,
 
 #else
   GNAT_STRUCT_STAT fbuf;
-  struct utimbuf tbuf;
 
   if (GNAT_STAT (from, &fbuf) == -1) {
  return -1;
   }
 
-  /* Do we need to copy timestamp ? */
+#if _POSIX_C_SOURCE >= 200809L
+  struct timespec tbuf[2];
+
   if (mode != 2) {
- tbuf.actime = fbuf.st_atime;
- tbuf.modtime = fbuf.st_mtime;
+ tbuf[0] = fbuf.st_atim;
+ tbuf[1] = fbuf.st_mtim;
 
- if (utime (to, &tbuf) == -1) {
+ if (utimensat (AT_FDCWD, to, tbuf, 0) == -1) {
 return -1;
  }
   }
 
+#else
+  struct timeval tbuf[2];
+  /* Do we need to copy timestamp ? */
+
+  if (mode != 2) {
+ tbuf[0].tv_sec  = fbuf.st_atime;
+ tbuf[1].tv_sec  = fbuf.st_mtime;
+
+ #if defined(st_mtime)
+ tbuf[0].tv_usec = fbuf.st_atim.tv_nsec / 1000;
+ tbuf[1].tv_usec = fbuf.st_mtim.tv_nsec / 1000;
+ #else
+ tbuf[0].tv_usec = 0;
+ tbuf[1].tv_usec = 0;
+ #endif
+
+ if (utimes (to, tbuf) == -1) {
+return -1;
+ }
+  }
+#endif
+
   /* Do we need to copy file permissions ? */
   if (mode != 0 && (chmod (to, fbuf.st_mode) == -1)) {
 	  return -1;


diff --git a/gcc/ada/libgnat/a-direct.adb b/gcc/ada/libgnat/a-direct.adb
--- a/gcc/ada/libgnat/a-direct.adb
+++ b/gcc/ada/libgnat/a-direct.adb
@@ -30,7 +30,6 @@
 --
 
 with Ada.Calendar;   use Ada.Calendar;
-with Ada.Calendar.Formatting;use Ada.Calendar.Formatting;
 with Ada.Characters.Handling;use Ada.Characters.Handling;
 with Ada.Directories.Validity;   use Ada.Directories.Validity;
 with Ada.Directories.Hierarchical_File_Names;
@@ -70,6 +69,15 @@ package body Ada.Directories is
pragma Import (C, Max_Path, "__gnat_max_path_len");
--  The maximum length of a path
 
+   function C_Modification_Time (N : System.Address) return Ada.Calendar.Time;
+   pragma Import (C, C_Modification_Time, "__gnat_file_time");
+   --  Get modification time for file with name referenced by N
+
+   Invalid_Time : constant Ada.Calendar.Time :=
+C_Modification_Time (System.Null_Address);
+   --  Result returned from C_Modification_Time call when routine unable to get
+   --  file modification time.
+
type Search_Data is record
   Is_Valid  : Boolean := False;
   Name  : Unbounded_String;
@@ -991,14 +999,9 @@ package body Ada.Directories is
---
 
function Modification_Time (Name : String) return Time is
-  Date   : OS_Time;
-  Year   : Year_Type;

[Ada] Disable warnings on entities when building finalizers

2020-10-21 Thread Pierre-Marie de Rodat
GNAT emits references to GNAT-generated entities when generating
finalizers. These references are protected by a counter. GCC is unable
to detect that the counter protects these references and emits warnings
when -Wmaybe-uninitialized is used.

This is a problem because users can't do anything to prevent GNAT from
generating these references, so we disable warnings on them.

Tested on x86_64-pc-linux-gnu, committed on trunk

gcc/ada/

* exp_ch7.adb (Build_Finalizer): Disable warnings on referenced
entity.diff --git a/gcc/ada/exp_ch7.adb b/gcc/ada/exp_ch7.adb
--- a/gcc/ada/exp_ch7.adb
+++ b/gcc/ada/exp_ch7.adb
@@ -3157,6 +3157,14 @@ package body Exp_Ch7 is
 
  Append_To (Finalizer_Stmts, Label);
 
+ --  Disable warnings on Obj_Id. This works around an issue where GCC
+ --  is not able to detect that Obj_Id is protected by a counter and
+ --  emits spurious warnings.
+
+ if not Comes_From_Source (Obj_Id) then
+Set_Warnings_Off (Obj_Id);
+ end if;
+
  --  Processing for simple protected objects. Such objects require
  --  manual finalization of their lock managers.
 




[Ada] Make minimum accessibility level a constant object

2020-10-21 Thread Pierre-Marie de Rodat
Minimum accessibility level was stored in a variable which was never
modified. It seems cleaner to store it in a constant, which hopefully
allows better optimization (e.g. when tracking known objects, the value of
a variable is killed on a subprogram call).

No impact on compiler behaviour, except possibly an optimization.

Tested on x86_64-pc-linux-gnu, committed on trunk

gcc/ada/

* exp_ch6.adb (Expand_Actuals): Whitespace cleanup.
* sem_ch6.adb (Analyze_Subprogram_Body_Helper): Make minimum
accessibility a constant object.diff --git a/gcc/ada/exp_ch6.adb b/gcc/ada/exp_ch6.adb
--- a/gcc/ada/exp_ch6.adb
+++ b/gcc/ada/exp_ch6.adb
@@ -1458,12 +1458,12 @@ package body Exp_Ch6 is
   Subp  : Entity_Id;
   Post_Call : out List_Id)
is
-  Loc   : constant Source_Ptr := Sloc (N);
-  Actual: Node_Id;
-  Formal: Entity_Id;
-  N_Node: Node_Id;
-  E_Actual  : Entity_Id;
-  E_Formal  : Entity_Id;
+  Loc  : constant Source_Ptr := Sloc (N);
+  Actual   : Node_Id;
+  Formal   : Entity_Id;
+  N_Node   : Node_Id;
+  E_Actual : Entity_Id;
+  E_Formal : Entity_Id;
 
   procedure Add_Call_By_Copy_Code;
   --  For cases where the parameter must be passed by copy, this routine


diff --git a/gcc/ada/sem_ch6.adb b/gcc/ada/sem_ch6.adb
--- a/gcc/ada/sem_ch6.adb
+++ b/gcc/ada/sem_ch6.adb
@@ -4699,7 +4699,7 @@ package body Sem_Ch6 is
   then
  --  Generate the minimum accessibility level object
 
- --A60b : natural := natural'min(1, paramL);
+ --A60b : constant natural := natural'min(1, paramL);
 
  declare
 Loc  : constant Source_Ptr := Sloc (Body_Nod);
@@ -4708,6 +4708,7 @@ package body Sem_Ch6 is
 Defining_Identifier =>
   Make_Temporary
 (Loc, 'A', Extra_Accessibility (Form)),
+Constant_Present=> True,
 Object_Definition   => New_Occurrence_Of
  (Standard_Natural, Loc),
 Expression  =>




[Ada] Use VxWorks 7 APIs for accessing environment variables in kernel mode

2020-10-21 Thread Pierre-Marie de Rodat
VxWorks 7 provides newer APIs for kernel mode for accessing environment
variables that lead to smaller and more efficient code.

Tested on x86_64-pc-linux-gnu, committed on trunk

gcc/ada/

* env.c (__gnat_setenv): call setenv for VxWorks 7 kernel mode.
(__gnat_environ): envGet takes an int instead of a NULL pointer.
(__gnat_unsetenv): call unsetenv for VxWorks 7 kernel mode.
(__gnat_clearenv): use __gnat_unsetenv to clear environment
variables.diff --git a/gcc/ada/env.c b/gcc/ada/env.c
--- a/gcc/ada/env.c
+++ b/gcc/ada/env.c
@@ -99,7 +99,8 @@ __gnat_getenv (char *name, int *len, char **value)
 void
 __gnat_setenv (char *name, char *value)
 {
-#if (defined (__vxworks) && defined (__RTP__)) || defined (__APPLE__)
+#if (defined (__vxworks) && (defined (__RTP__) || _WRS_VXWORKS_MAJOR >= 7)) \
+|| defined (__APPLE__)
   setenv (name, value, 1);
 
 #else
@@ -110,9 +111,9 @@ __gnat_setenv (char *name, char *value)
 
   sprintf (expression, "%s=%s", name, value);
   putenv (expression);
-#if defined (__MINGW32__) || (defined (__vxworks) && ! defined (__RTP__))
-  /* On some systems like MacOS X and Windows, putenv is making a copy of the
- expression string so we can free it after the call to putenv */
+#if defined (__MINGW32__) || defined (__vxworks)
+  /* putenv for Windows and VxWorks 6 kernel modules makes a copy of the
+ expression string, so we need to free it after the call to putenv. */
   free (expression);
 #endif
 #endif
@@ -138,8 +139,13 @@ __gnat_environ (void)
   #if defined (__RTP__) || defined (VTHREADS)
 return environ;
   #else
-/* Kernel mode */
-return envGet (NULL);
+/* For VxWorks kernel modules use envGet to get the task's environment
+   (either the task's private environment if it has one or the global
+   environment otherwise). taskId parameter of 0 refers to the current
+   task (the VxWorks documentation says to use NULL but the compiler
+   complains that taskId is an int rather than a pointer. Internally,
+   VxWorks uses 0 as well). */
+return envGet (0);
   #endif
 #endif
 }
@@ -147,7 +153,8 @@ __gnat_environ (void)
 void __gnat_unsetenv (char *name)
 {
 #if defined (__hpux__) || defined (__sun__) \
- || (defined (__vxworks) && ! defined (__RTP__)) \
+ || (defined (__vxworks) && ! defined (__RTP__) \
+  && _WRS_VXWORKS_MAJOR <= 6) \
  || defined (_AIX) || defined (__Lynx__)
 
   /* On Solaris and HP-UX there is no function to clear an environment
@@ -170,7 +177,7 @@ void __gnat_unsetenv (char *name)
  if (strlen (env[index]) > size) {
if (strstr (env[index], name) == env[index] &&
 	   env[index][size] == '=') {
-#if defined (__vxworks) && ! defined (__RTP__)
+#if defined (__vxworks)
  /* on Vxworks we are sure that the string has been allocated using
 malloc */
  free (env[index]);
@@ -203,9 +210,10 @@ void __gnat_unsetenv (char *name)
 void __gnat_clearenv (void)
 {
 #if defined (__sun__) \
-  || (defined (__vxworks) && ! defined (__RTP__)) || defined (__Lynx__) \
+  || (defined (__vxworks) && !defined (__RTP__) && _WRS_VXWORKS_MAJOR <= 6) \
+  || defined (__Lynx__) \
   || defined (__PikeOS__)
-  /* On Solaris, VxWorks (not RTPs), and Lynx there is no system
+  /* On Solaris, VxWorks kernel pre 7, and Lynx there is no system
  call to unset a variable or to clear the environment so set all
  the entries in the environ table to NULL (see comment in
  __gnat_unsetenv for more explanation). */
@@ -217,7 +225,8 @@ void __gnat_clearenv (void)
 index++;
   }
 #elif defined (__MINGW32__) || defined (__FreeBSD__) || defined (__APPLE__) \
-   || (defined (__vxworks) && defined (__RTP__)) || defined (__CYGWIN__) \
+   || (defined (__vxworks) && defined (__RTP__) || _WRS_VXWORKS_MAJOR >= 7) \
+   || defined (__CYGWIN__) \
|| defined (__NetBSD__) || defined (__OpenBSD__) || defined (__rtems__) \
|| defined (__DragonFly__) || defined (__DJGPP__)
   /* On Windows, FreeBSD and MacOS there is no function to clean all the




[Ada] Fix bogus error for bit-packed array with volatile component

2020-10-21 Thread Pierre-Marie de Rodat
This fixes a bogus error recently introduced in the compiler for a
bit-packed array component in a record type with Volatile_Components
aspect on the array definition.  The array type should not be deemed
requiring strict alignment, although it is a by-reference type.

Tested on x86_64-pc-linux-gnu, committed on trunk

gcc/ada/

* freeze.adb (Check_Strict_Alignment): Do not set the flag for
a bit-packed array type, even if it is a by-reference type.diff --git a/gcc/ada/freeze.adb b/gcc/ada/freeze.adb
--- a/gcc/ada/freeze.adb
+++ b/gcc/ada/freeze.adb
@@ -1609,7 +1609,10 @@ package body Freeze is
   Comp  : Entity_Id;
 
begin
-  if Is_By_Reference_Type (E) then
+  --  Bit-packed array types do not require strict alignment, even if they
+  --  are by-reference types, because they are accessed in a special way.
+
+  if Is_By_Reference_Type (E) and then not Is_Bit_Packed_Array (E) then
  Set_Strict_Alignment (E);
 
   elsif Is_Array_Type (E) then




[Ada] Incorrect associations for extra accessibility parameters

2020-10-21 Thread Pierre-Marie de Rodat
This patch fixes an error in the compiler whereby extra accessibility
level actuals did not get expanded properly in calls - leading to
mislabeled parameter associations.

Tested on x86_64-pc-linux-gnu, committed on trunk

gcc/ada/

* exp_ch6.adb (Expand_Call_Helper): Modify calls to
Add_Extra_Actual to use Extra_Accessibility instead of
Get_Accessibility for the EF parameter.diff --git a/gcc/ada/exp_ch6.adb b/gcc/ada/exp_ch6.adb
--- a/gcc/ada/exp_ch6.adb
+++ b/gcc/ada/exp_ch6.adb
@@ -3658,7 +3658,7 @@ package body Exp_Ch6 is
 
  --  Create possible extra actual for accessibility level
 
- if Present (Get_Accessibility (Formal)) then
+ if Present (Extra_Accessibility (Formal)) then
 
 --  Ada 2005 (AI-252): If the actual was rewritten as an Access
 --  attribute, then the original actual may be an aliased object
@@ -3748,7 +3748,7 @@ package body Exp_Ch6 is
   Add_Extra_Actual
 (Expr =>
New_Occurrence_Of (Get_Accessibility (Parm_Ent), Loc),
- EF   => Get_Accessibility (Formal));
+ EF   => Extra_Accessibility (Formal));
end;
 
 elsif Is_Entity_Name (Prev_Orig) then
@@ -3782,7 +3782,7 @@ package body Exp_Ch6 is
   (Expr =>
  New_Occurrence_Of
(Get_Accessibility (Parm_Ent), Loc),
-   EF   => Get_Accessibility (Formal));
+   EF   => Extra_Accessibility (Formal));
 
  --  If the actual access parameter does not have an
  --  associated extra formal providing its scope level,
@@ -3794,7 +3794,7 @@ package body Exp_Ch6 is
   (Expr =>
  Make_Integer_Literal (Loc,
Intval => Scope_Depth (Standard_Standard)),
-   EF   => Get_Accessibility (Formal));
+   EF   => Extra_Accessibility (Formal));
  end if;
   end;
 
@@ -3804,7 +3804,7 @@ package body Exp_Ch6 is
else
   Add_Extra_Actual
 (Expr => Dynamic_Accessibility_Level (Prev_Orig),
- EF   => Get_Accessibility (Formal));
+ EF   => Extra_Accessibility (Formal));
end if;
 
 --  If the actual is an access discriminant, then pass the level
@@ -3820,7 +3820,7 @@ package body Exp_Ch6 is
  (Expr =>
 Make_Integer_Literal (Loc,
   Intval => Object_Access_Level (Prefix (Prev_Orig))),
-  EF   => Get_Accessibility (Formal));
+  EF   => Extra_Accessibility (Formal));
 
 --  All other cases
 
@@ -3878,7 +3878,7 @@ package body Exp_Ch6 is
New_Occurrence_Of
  (Get_Accessibility
 (Entity (Prev_Ult)), Loc),
- EF   => Get_Accessibility (Formal));
+ EF   => Extra_Accessibility (Formal));
 
--  Normal case, call Object_Access_Level. Note:
--  should be Dynamic_Accessibility_Level ???
@@ -3889,7 +3889,7 @@ package body Exp_Ch6 is
Make_Integer_Literal (Loc,
  Intval =>
Object_Access_Level (Prev_Orig)),
- EF   => Get_Accessibility (Formal));
+ EF   => Extra_Accessibility (Formal));
end if;
 
 --  Treat the unchecked attributes as library-level
@@ -3901,7 +3901,7 @@ package body Exp_Ch6 is
  (Expr =>
 Make_Integer_Literal (Loc,
   Intval => Scope_Depth (Standard_Standard)),
-  EF   => Get_Accessibility (Formal));
+  EF   => Extra_Accessibility (Formal));
 
 --  No other cases of attributes returning access
 --  values that can be passed to access parameters.
@@ -3923,7 +3923,7 @@ package body Exp_Ch6 is
(Expr =>
   Make_Integer_Literal (Loc,
 Intval => Scope_Depth (Current_Scope) + 1),
-EF   => Get_Accessibility (Formal));
+EF   => Extra_Accessibility (Formal));
 
   --  For most other cases we simply pass the level of the
   --  actual's access type. The type is retrieved from
@@ -4151,7 +4151,7 @@ package body Exp_Ch6 is
 
A

[Ada] Remove -mthreads from Linker_Options pragma for x86-lynx178e

2020-10-21 Thread Pierre-Marie de Rodat
The -mthreads flag is no longer needed.  It was deprecated in 2.2.4 and
in 2.2.5 causes the linker to reference a non-existent directory.

Tested on x86_64-pc-linux-gnu, committed on trunk

gcc/ada/

* libgnarl/s-osinte__lynxos178e.ads: Remove -mthreads switch.diff --git a/gcc/ada/libgnarl/s-osinte__lynxos178e.ads b/gcc/ada/libgnarl/s-osinte__lynxos178e.ads
--- a/gcc/ada/libgnarl/s-osinte__lynxos178e.ads
+++ b/gcc/ada/libgnarl/s-osinte__lynxos178e.ads
@@ -47,10 +47,6 @@ with System.Multiprocessors;
 package System.OS_Interface is
pragma Preelaborate;
 
-   pragma Linker_Options ("-mthreads");
-   --  Selects the POSIX 1.c runtime, rather than the non-threading runtime or
-   --  the deprecated legacy threads library.
-
subtype int    is Interfaces.C.int;
subtype short  is Interfaces.C.short;
subtype long   is Interfaces.C.long;




[Ada] Fix crash with iterated_component_association and -gnatc

2020-10-21 Thread Pierre-Marie de Rodat
When expansion was disabled, e.g. in GNATprove mode or when switch
-gnatc was used, analysis of the iterated_component_association's
expression was crashing when the expression included a function call.

The problem was that a copy of the expression was created with empty
parent.  Then an access-before-elaboration call marker was inserted as
an action associated with this empty parent, which cannot work.

When expansion is enabled, e.g. when compiling the code as usual, then
analysis of the iterated_component_association's expression only happens
after expansion, where the parent link was set. (The decision whether to
analyze the expression's copy and not wait until it is fully expanded it
explained in a comment in Resolve_Aggr_Expr.)

Tested on x86_64-pc-linux-gnu, committed on trunk

gcc/ada/

* sem_aggr.adb (Resolve_Iterated_Component_Association):
Expression's copy and now has the same parent as the original
expression.
(Resolve_Array_Aggregate): Add ??? comment about a still
existing minor issue that led to discovery of the above crash.diff --git a/gcc/ada/sem_aggr.adb b/gcc/ada/sem_aggr.adb
--- a/gcc/ada/sem_aggr.adb
+++ b/gcc/ada/sem_aggr.adb
@@ -1662,6 +1662,7 @@ package body Sem_Aggr is
  --  as a loop with a new index variable.
 
  Expr := New_Copy_Tree (Expression (N));
+ Set_Parent (Expr, N);
  Dummy := Resolve_Aggr_Expr (Expr, False);
 
  --  An iterated_component_association may appear in a nested
@@ -2057,8 +2058,13 @@ package body Sem_Aggr is
  return Failure;
   end if;
 
+   --  ??? Checks for dynamically tagged expressions below will
+   --  be only applied to iterated_component_association after
+   --  expansion; in particular, errors might not be reported when
+   --  -gnatc switch is used.
+
elsif Nkind (Assoc) = N_Iterated_Component_Association then
-  null;   --  handled above, in a loop context.
+  null;   --  handled above, in a loop context
 
elsif not Resolve_Aggr_Expr
(Expression (Assoc), Single_Elmt => Single_Choice)




[Ada] Use helper function in Freeze_Subprogram_Body

2020-10-21 Thread Pierre-Marie de Rodat
This replaces the manual retrieval of the freeze node for the enclosing
body of the generic with a mere call to Package_Freeze_Node.

No functional changes.

Tested on x86_64-pc-linux-gnu, committed on trunk

gcc/ada/

* sem_ch12.adb (Freeze_Subprogram_Body): Call
Package_Freeze_Node to retrieve the freeze node for the
enclosing body of the generic.diff --git a/gcc/ada/sem_ch12.adb b/gcc/ada/sem_ch12.adb
--- a/gcc/ada/sem_ch12.adb
+++ b/gcc/ada/sem_ch12.adb
@@ -8980,8 +8980,8 @@ package body Sem_Ch12 is
   is
   Gen_Unit : constant Entity_Id := Get_Generic_Entity (Inst_Node);
   Par  : constant Entity_Id := Scope (Gen_Unit);
-  E_G_Id   : Entity_Id;
   Enc_G: Entity_Id;
+  Enc_G_F  : Node_Id;
   Enc_I: Node_Id;
   F_Node   : Node_Id;
 
@@ -9128,14 +9128,6 @@ package body Sem_Ch12 is
 and then Enc_G /= Enc_I
 and then Earlier (Inst_Node, Gen_Body)
   then
- if Nkind (Enc_G) = N_Package_Body then
-E_G_Id :=
-  Corresponding_Spec (Enc_G);
- else pragma Assert (Nkind (Enc_G) = N_Package_Body_Stub);
-E_G_Id :=
-  Corresponding_Spec (Proper_Body (Unit (Library_Unit (Enc_G))));
- end if;
-
  --  Freeze package that encloses instance, and place node after the
  --  package that encloses generic. If enclosing package is already
  --  frozen we have to assume it is at the proper place. This may be a
@@ -9163,10 +9155,10 @@ package body Sem_Ch12 is
 
  --  Freeze enclosing subunit before instance
 
- Ensure_Freeze_Node (E_G_Id);
+ Enc_G_F := Package_Freeze_Node (Enc_G);
 
- if not Is_List_Member (Freeze_Node (E_G_Id)) then
-Insert_After (Enc_G, Freeze_Node (E_G_Id));
+ if not Is_List_Member (Enc_G_F) then
+Insert_After (Enc_G, Enc_G_F);
  end if;
 
  Insert_Freeze_Node_For_Instance (Inst_Node, F_Node);




[Ada] Wrong detection of potentially blocking call in protected object

2020-10-21 Thread Pierre-Marie de Rodat
When a protected subprogram invokes a function that returns a limited
type, and the sources are compiled with pragma Detect_Blocking, the
code generated by the compiler erroneously invokes the runtime service
Activate_Tasks, call which is detected by the runtime as a potentially
blocking call (as described in RM 9.5.1) and causes Program_Error to be
raised at runtime.

Tested on x86_64-pc-linux-gnu, committed on trunk

gcc/ada/

* exp_ch9.adb (Build_Task_Activation_Call): Do not generate a
call to activate tasks if we are within the scope of a protected
type and pragma Detect_Blocking is active.diff --git a/gcc/ada/exp_ch9.adb b/gcc/ada/exp_ch9.adb
--- a/gcc/ada/exp_ch9.adb
+++ b/gcc/ada/exp_ch9.adb
@@ -4960,6 +4960,18 @@ package body Exp_Ch9 is
 
   if No (Chain) or else Is_Ignored_Ghost_Entity (Chain) then
  return;
+
+  --  The availability of the activation chain entity does not ensure
+  --  that we have tasks to activate because it may have been declared
+  --  by the frontend to pass a required extra formal to a build-in-place
+  --  subprogram call. If we are within the scope of a protected type and
+  --  pragma Detect_Blocking is active we can assume that no tasks will be
+  --  activated; if tasks are created in a protected object and this pragma
+  --  is active then the frontend emits a warning and Program_Error is
+  --  raised at runtime.
+
+  elsif Detect_Blocking and then Within_Protected_Type (Current_Scope) then
+ return;
   end if;
 
   --  The location of the activation call must be as close as possible to




[Ada] Fix crash on illegal OTHERS in iterated_component_association

2020-10-21 Thread Pierre-Marie de Rodat
The code for detecting an illegal OTHERS clause handles both
component_association and iterated_component_association, whose choice
list must be accessed with Choices and Discrete_Choices, respectively.

Tested on x86_64-pc-linux-gnu, committed on trunk

gcc/ada/

* sem_aggr.adb (Resolve_Array_Aggregate): Use Choice_List, which
internally calls either Choices or Discrete_Choices, depending on
the context.diff --git a/gcc/ada/sem_aggr.adb b/gcc/ada/sem_aggr.adb
--- a/gcc/ada/sem_aggr.adb
+++ b/gcc/ada/sem_aggr.adb
@@ -1837,7 +1837,7 @@ package body Sem_Aggr is
   if Others_Present and then not Others_Allowed then
  Error_Msg_N
("OTHERS choice not allowed here",
-First (Choices (First (Component_Associations (N);
+First (Choice_List (First (Component_Associations (N);
  return Failure;
   end if;
 




[Ada] Implement missing function result finalization.

2020-10-21 Thread Pierre-Marie de Rodat
In the case where a function call
  a) has a non-limited result type that requires finalization; and
  b) the callee has an out-mode (or in-out-mode) formal parameter; and
  c) the corresponding actual parameter's subtype is subject to an
 enabled predicate
, fix a compiler bug that could cause the function result to not be
finalized.  If finalization was being used to reclaim storage then this
missing finalization could result in a storage leak.

Tested on x86_64-pc-linux-gnu, committed on trunk

gcc/ada/

* exp_ch6.adb (Insert_Post_Call_Actions): When a function's
result type requires finalization and we decide to make copy of
a call to the function and subsequently refer only to the copy,
then don't forget to finalize the original function result
object.diff --git a/gcc/ada/exp_ch6.adb b/gcc/ada/exp_ch6.adb
--- a/gcc/ada/exp_ch6.adb
+++ b/gcc/ada/exp_ch6.adb
@@ -8390,13 +8390,28 @@ package body Exp_Ch6 is
 --  the write back to be skipped completely.
 
 --  To deal with this, we replace the call by
-
+--
 --do
 --   Tnnn : constant function-result-type := function-call;
 --   Post_Call actions
 --in
 --   Tnnn;
 --end;
+--
+--   However, that doesn't work if function-result-type requires
+--   finalization (because function-call's result never gets
+--   finalized). So in that case, we instead replace the call by
+--
+--do
+--   type Ref is access all function-result-type;
+--   Ptr : constant Ref := function-call'Reference;
+--   Tnnn : constant function-result-type := Ptr.all;
+--   Finalize (Ptr.all);
+--   Post_Call actions
+--in
+--   Tnnn;
+--end;
+--
 
 declare
Loc   : constant Source_Ptr := Sloc (N);
@@ -8405,12 +8420,63 @@ package body Exp_Ch6 is
Name  : constant Node_Id   := Relocate_Node (N);
 
 begin
-   Prepend_To (Post_Call,
- Make_Object_Declaration (Loc,
-   Defining_Identifier => Tnnn,
-   Object_Definition   => New_Occurrence_Of (FRTyp, Loc),
-   Constant_Present=> True,
-   Expression  => Name));
+   if Needs_Finalization (FRTyp) then
+  declare
+ Ptr_Typ : constant Entity_Id := Make_Temporary (Loc, 'A');
+
+ Ptr_Typ_Decl : constant Node_Id :=
+   Make_Full_Type_Declaration (Loc,
+ Defining_Identifier => Ptr_Typ,
+ Type_Definition =>
+   Make_Access_To_Object_Definition (Loc,
+ All_Present=> True,
+ Subtype_Indication =>
+   New_Occurrence_Of (FRTyp, Loc)));
+
+ Ptr_Obj : constant Entity_Id :=
+   Make_Temporary (Loc, 'P');
+
+ Ptr_Obj_Decl : constant Node_Id :=
+   Make_Object_Declaration (Loc,
+ Defining_Identifier => Ptr_Obj,
+ Object_Definition   =>
+   New_Occurrence_Of (Ptr_Typ, Loc),
+ Constant_Present=> True,
+ Expression  =>
+   Make_Attribute_Reference (Loc,
+   Prefix => Name,
+   Attribute_Name => Name_Unrestricted_Access));
+
+ function Ptr_Dereference return Node_Id is
+   (Make_Explicit_Dereference (Loc,
+  Prefix => New_Occurrence_Of (Ptr_Obj, Loc)));
+
+ Tnn_Decl : constant Node_Id :=
+   Make_Object_Declaration (Loc,
+ Defining_Identifier => Tnnn,
+ Object_Definition   => New_Occurrence_Of (FRTyp, Loc),
+ Constant_Present=> True,
+ Expression  => Ptr_Dereference);
+
+ Finalize_Call : constant Node_Id :=
+   Make_Final_Call
+ (Obj_Ref => Ptr_Dereference, Typ => FRTyp);
+  begin
+ --  Prepend in reverse order
+
+ Prepend_To (Post_Call, Finalize_Call);
+ Prepend_To (Post_Call, Tnn_Decl);
+ Prepend_To (Post_Call, Ptr_Obj_Decl);
+ Prepend_To (Post_Call, Ptr_Typ_Decl);
+  end;
+   else
+  Prepend_To (Post_Call,
+Make_Object_Declaration (Loc,
+

[Ada] Use index parameter for iterated_component_association

2020-10-21 Thread Pierre-Marie de Rodat
Processing of index parameters in iterated_component_association was
different within array aggregates (where we created a synonym
identifier) and array delta aggregates (where we used the existing
defining identifier).  Apparently, we can use the existing defining
identifiers in both cases.

This change is needed to handle iterated_component_association in
GNATprove, where a custom expansion of iterated_component_association
must analyze its expression (because GNAT will only analyze a copy
of this expression for legality checking and then rely on its own
expansion with proper analysis).  With an (anonymous) synonym
identifier, which was not attached to AST, it was not possible to access
the scope entity introduced by GNAT resolution.

Tested on x86_64-pc-linux-gnu, committed on trunk

gcc/ada/

* sem_aggr.adb (Resolve_Iterated_Component_Association): Use
existing defining identifier for index parameter.diff --git a/gcc/ada/sem_aggr.adb b/gcc/ada/sem_aggr.adb
--- a/gcc/ada/sem_aggr.adb
+++ b/gcc/ada/sem_aggr.adb
@@ -1640,21 +1640,16 @@ package body Sem_Aggr is
  Set_Etype  (Ent, Standard_Void_Type);
  Set_Parent (Ent, Parent (N));
  Push_Scope (Ent);
- Id :=
-   Make_Defining_Identifier (Loc,
- Chars => Chars (Defining_Identifier (N)));
 
  --  Insert and decorate the index variable in the current scope.
  --  The expression has to be analyzed once the index variable is
- --  directly visible. Mark the variable as referenced to prevent
- --  spurious warnings, given that subsequent uses of its name in the
- --  expression will reference the internal (synonym) loop variable.
+ --  directly visible.
 
+ Id := Defining_Identifier (N);
  Enter_Name (Id);
  Set_Etype (Id, Index_Typ);
  Set_Ekind (Id, E_Variable);
  Set_Scope (Id, Ent);
- Set_Referenced (Id);
 
  --  Analyze a copy of the expression, to verify legality. We use
  --  a copy because the expression will be analyzed anew when the




[Ada] Compiler crash on prefixed call to controlled function with invariant check

2020-10-21 Thread Pierre-Marie de Rodat
When post call actions are created for a call to a function with a
result of a controlled type, such as for performing an invariant check
on a parameter with mode out or in out, the compiler can violate an
assertion (or crash with a Storage_Error) due to not recognizing the
expanded call as a function call. Calls to functions with controlled
results can be rewritten as an explicit dereference, and that case is
now checked for when processing post-call actions.

Tested on x86_64-pc-linux-gnu, committed on trunk

gcc/ada/

* exp_ch6.adb (Insert_Post_Call_Actions): Test for
N_Explicit_Dereference as part of the existing test for function
calls.diff --git a/gcc/ada/exp_ch6.adb b/gcc/ada/exp_ch6.adb
--- a/gcc/ada/exp_ch6.adb
+++ b/gcc/ada/exp_ch6.adb
@@ -8360,9 +8360,12 @@ package body Exp_Ch6 is
  --  The write-back of (in)-out parameters is handled by the back-end,
  --  but the constraint checks generated when subtypes of formal and
  --  actual don't match must be inserted in the form of assignments.
+ --  Also do this in the case of explicit dereferences, which can occur
+ --  due to rewritings of function calls with controlled results.
 
  if Nkind (N) = N_Function_Call
or else Nkind (Original_Node (N)) = N_Function_Call
+   or else Nkind (N) = N_Explicit_Dereference
  then
 pragma Assert (Ada_Version >= Ada_2012);
 --  Functions with '[in] out' parameters are only allowed in Ada




[Ada] Work around missing Long_Long_Long_Size entry in .atp file

2020-10-21 Thread Pierre-Marie de Rodat
This prevents the compiler from giving an error message when the new
Long_Long_Long_Size entry is missing in a target configuration file.

Tested on x86_64-pc-linux-gnu, committed on trunk

gcc/ada/

* set_targ.adb (DTN): Fix oversight.
(Read_Target_Dependent_Values): Do not error out on missing
Long_Long_Long_Size entry and reuse Long_Long_Size for it.diff --git a/gcc/ada/set_targ.adb b/gcc/ada/set_targ.adb
--- a/gcc/ada/set_targ.adb
+++ b/gcc/ada/set_targ.adb
@@ -84,8 +84,8 @@ package body Set_Targ is
   S_Float_Size 'Unrestricted_Access,
   S_Float_Words_BE 'Unrestricted_Access,
   S_Int_Size   'Unrestricted_Access,
-  S_Long_Long_Long_Size'Unrestricted_Access,
   S_Long_Double_Size   'Unrestricted_Access,
+  S_Long_Long_Long_Size'Unrestricted_Access,
   S_Long_Long_Size 'Unrestricted_Access,
   S_Long_Size  'Unrestricted_Access,
   S_Maximum_Alignment  'Unrestricted_Access,
@@ -748,8 +748,15 @@ package body Set_Targ is
 
   for J in DTR'Range loop
  if not DTR (J) then
-Fail ("missing entry for " & DTN (J).all & " in file "
-  & File_Name);
+--  Make an exception for Long_Long_Long_Size???
+
+if DTN (J) = S_Long_Long_Long_Size'Unrestricted_Access then
+   Long_Long_Long_Size := Long_Long_Size;
+
+else
+   Fail ("missing entry for " & DTN (J).all & " in file "
+ & File_Name);
+end if;
  end if;
   end loop;
 




[Ada] CodePeer remarks taken into account

2020-10-21 Thread Pierre-Marie de Rodat
This patch adds pragma Assert to help CodePeer static analysis and
pragma Annotate to ignore Intentional or False_Positive warning.
Furthermore, some minor changes were added to take into account CodePeer
finding.

Tested on x86_64-pc-linux-gnu, committed on trunk

gcc/ada/

* libgnat/s-carsi8.adb (Compare_Array_S8): Add pragma Assert to
avoid warning concerning Left_Len and RighLen value regarding
Bytes_Compared_As_Words.
* libgnat/s-carun8.adb (Compare_Array_U8): Likewise.
* libgnat/s-geveop.adb (Binary_Operation, Unary_Operation): Add
pragma Assert concerning divide by 0 warning.
* libgnat/s-imgcha.adb (Image_Character): Code update to prevent
constant operation warning.
(Image_Character): Add pragma Assert concerning the unchecked
String size.
* libgnat/s-imgdec.adb
(Round): Update loop code to prevent warning concerning
Digs'First access.
(Round): Add pragma assert.
(Set): Add pragma Assert for the unchecked string size.
(Set_Digits): Add pragma Assert for the input range.
(Set_Decimal_Digits): Add pragma Assert.
(Set_Blank_And_Sign): Add pragma Assert for the input range.
* libgnat/s-arit64.adb (DoubleDivide): Add pragma Assert
concerning Du /= 0.
(Multiply_With_Ovflo_Check): Add pragma Annotate to avoid
warning concerning unsigned -> signed conversion.
* libgnat/s-imguns.adb (Set_Image_Unsigned): Add pragma Assert
to prevent overflow check warning.  Add pragma Assert for
controlling S'First = 1.
* libgnat/s-imgrea.adb (Image_Floating_Point, Set, Set_Digs,
Set_Special_Fill, Convert_Integer): Add pragma Annotate to
prevent overflow check warning.
(Set_Image_Real): Add pragma Annotate to avoid dead code warning
on float check. Add pragma Assert to prevent overflow check
warning.
* libgnat/s-imgwiu.adb (Set_Digits, Set_Image_Width_Unsigned):
Add pragma assert to prevent overflow check warning.
* libgnat/s-imgllu.adb (Set_Image_Long_Long_Unsigned): Add
pragma assert to prevent overflow check warning.
* libgnat/s-imgint.adb (Set_Digits): Add Assert for input
constraint and to prevent overflow check warning, create
Non_Positive subtype, and change the T parameter as Non_Positive
instead Integer.
(Set_Image_Integer): Add pragma assert to prevent overflow check
warning.
* libgnat/s-imglli.adb (Set_Digits): Add Assert for input
constraint and to prevent overflow check warning, create
Non_Positive subtype, and change the T parameter as Non_Positive
instead Integer.
(Set_Image_Long_Long_Integer): Add pragma assert to prevent
overflow check warning.
* libgnat/s-fatgen.adb (Decompose, Pred, Succ): Add pragma
Annotate to prevent dead code due to invalid float check.
* libgnat/s-imenne.adb (Image_Enumeration_8,
Image_Enumeration_16, Image_Enumeration_32): Add pragma Assert
to prevent overflow check warning.  Add Names_Index subtype for
restricting Index_table content.

patch.diff.gz
Description: application/gzip


[Ada] Fix problematic placement of freeze node after instantiation

2020-10-21 Thread Pierre-Marie de Rodat
This prevents Freeze_Subprogram_Body from moving the freeze node of a
package body outside of its scope when it contains the instantiation
of a generic subprogram and the parent of the package body happens to
be in the same declarative part as the freeze node of the scope of the
generic subprogram.  This appears to be both unnecessary and counter-
productive in case the package body itself contains a generic unit.

Tested on x86_64-pc-linux-gnu, committed on trunk

gcc/ada/

* sem_ch12.adb (Freeze_Subprogram_Body): Do not move the freeze
node of the package body enclosing the instance when its parent
is in the same declarative part as the freeze node of the parent.diff --git a/gcc/ada/sem_ch12.adb b/gcc/ada/sem_ch12.adb
--- a/gcc/ada/sem_ch12.adb
+++ b/gcc/ada/sem_ch12.adb
@@ -9106,12 +9106,7 @@ package body Sem_Ch12 is
 and then Present (Freeze_Node (Par))
 and then Present (Enc_I)
   then
- if In_Same_Declarative_Part (Parent (Freeze_Node (Par)), Enc_I)
-   or else
- (Nkind (Enc_I) = N_Package_Body
-   and then In_Same_Declarative_Part
-  (Parent (Freeze_Node (Par)), Parent (Enc_I)))
- then
+ if In_Same_Declarative_Part (Parent (Freeze_Node (Par)), Enc_I) then
 --  The enclosing package may contain several instances. Rather
 --  than computing the earliest point at which to insert its freeze
 --  node, we place it at the end of the declarative part of the




[Ada] Fix analysis of iterated component expression with null range

2020-10-21 Thread Pierre-Marie de Rodat
When checking legality of the iterated_component_association's
expression, the index parameter must be in scope, even if its range is null.

Tested on x86_64-pc-linux-gnu, committed on trunk

gcc/ada/

* exp_aggr.adb (Gen_Loop): Analyze copy of the expression in the
scope of the implicit loop with name of the index parameter
visible.diff --git a/gcc/ada/exp_aggr.adb b/gcc/ada/exp_aggr.adb
--- a/gcc/ada/exp_aggr.adb
+++ b/gcc/ada/exp_aggr.adb
@@ -53,6 +53,7 @@ with Sem;  use Sem;
 with Sem_Aggr; use Sem_Aggr;
 with Sem_Aux;  use Sem_Aux;
 with Sem_Ch3;  use Sem_Ch3;
+with Sem_Ch8;  use Sem_Ch8;
 with Sem_Ch13; use Sem_Ch13;
 with Sem_Eval; use Sem_Eval;
 with Sem_Mech; use Sem_Mech;
@@ -1954,7 +1955,30 @@ package body Exp_Aggr is
   Expander_Mode_Save_And_Set (False);
   Tcopy := New_Copy_Tree (Expr);
   Set_Parent (Tcopy, N);
-  Analyze_And_Resolve (Tcopy, Component_Type (Etype (N)));
+
+  --  For iterated_component_association analyze and resolve
+  --  the expression with name of the index parameter visible.
+  --  To manipulate scopes, we use entity of the implicit loop.
+
+  if Is_Iterated_Component then
+ declare
+Index_Parameter : constant Entity_Id :=
+  Defining_Identifier (Parent (Expr));
+ begin
+Push_Scope (Scope (Index_Parameter));
+Enter_Name (Index_Parameter);
+Analyze_And_Resolve
+  (Tcopy, Component_Type (Etype (N)));
+End_Scope;
+ end;
+
+  --  For ordinary component association, just analyze and
+  --  resolve the expression.
+
+  else
+ Analyze_And_Resolve (Tcopy, Component_Type (Etype (N)));
+  end if;
+
   Expander_Mode_Restore;
end if;
 end if;




[Ada] Fix target configuration file used for CodePeer/SPARK for new ints

2020-10-21 Thread Pierre-Marie de Rodat
The new 128-bits integer support requires adapting the special target
configuration file used for CodePeer and SPARK analysis. One change was
missing in Width_From_Size.

Tested on x86_64-pc-linux-gnu, committed on trunk

gcc/ada/

* ada_get_targ.adb (Width_From_Size): Add case for 128 bits.
Reorder declarations in the same order as get_targ.adb to
facilitate diffs.diff --git a/gcc/ada/ada_get_targ.adb b/gcc/ada/ada_get_targ.adb
--- a/gcc/ada/ada_get_targ.adb
+++ b/gcc/ada/ada_get_targ.adb
@@ -118,15 +118,6 @@ package body Get_Targ is
   return 4;
end Get_Maximum_Alignment;
 
-   
-   -- Get_System_Allocator_Alignment --
-   
-
-   function Get_System_Allocator_Alignment return Nat is
-   begin
-  return 1;
-   end Get_System_Allocator_Alignment;
-

-- Get_Float_Words_BE --

@@ -181,6 +172,15 @@ package body Get_Targ is
   return 1;
end Get_Strict_Alignment;
 
+   
+   -- Get_System_Allocator_Alignment --
+   
+
+   function Get_System_Allocator_Alignment return Nat is
+   begin
+  return 1;
+   end Get_System_Allocator_Alignment;
+

-- Get_Double_Float_Alignment --

@@ -199,15 +199,6 @@ package body Get_Targ is
   return 0;
end Get_Double_Scalar_Alignment;
 
-   -
-   -- Get_Max_Unaligned_Field --
-   -
-
-   function Get_Max_Unaligned_Field return Pos is
-   begin
-  return 64;  -- Can be different on some targets (e.g., AAMP)
-   end Get_Max_Unaligned_Field;
-
--
-- Digits_From_Size --
--
@@ -225,6 +216,15 @@ package body Get_Targ is
end Digits_From_Size;
 
-
+   -- Get_Max_Unaligned_Field --
+   -
+
+   function Get_Max_Unaligned_Field return Pos is
+   begin
+  return 64;  -- Can be different on some targets (e.g., AAMP)
+   end Get_Max_Unaligned_Field;
+
+   -
-- Register_Back_End_Types --
-
 
@@ -255,13 +255,14 @@ package body Get_Targ is
-- Width_From_Size --
-
 
-   function Width_From_Size  (Size : Pos) return Pos is
+   function Width_From_Size (Size : Pos) return Pos is
begin
   case Size is
- when  8 => return  4;
- when 16 => return  6;
- when 32 => return 11;
- when 64 => return 21;
+ when   8=> return  4;
+ when  16=> return  6;
+ when  32=> return 11;
+ when  64=> return 21;
+ when 128=> return 40;
  when others => raise Program_Error;
   end case;
end Width_From_Size;




[Ada] Fix bootstrap with old GCC

2020-10-21 Thread Pierre-Marie de Rodat
Some gcc versions (4 or less) support the __builtin_*_overflow
routines while corresponding g++ compilers do not.

Tested on x86_64-pc-linux-gnu, committed on trunk

gcc/ada/

* adaint.c (__gnat_file_time): Use regular arithmetic instead of
__builtin_*_overflow routines if GCC version 4 or less and
compiler is g++.diff --git a/gcc/ada/adaint.c b/gcc/ada/adaint.c
--- a/gcc/ada/adaint.c
+++ b/gcc/ada/adaint.c
@@ -1503,6 +1503,9 @@ extern long long __gnat_file_time(char* name)
 
   t_write.ft_time = fad.ftLastWriteTime;
 
+#if defined(__GNUG__) && __GNUG__ <= 4
+  result = (t_write.ll_time - w32_epoch_offset) * 100;
+#else
   /* Next code similar to (t_write.ll_time - w32_epoch_offset) * 100
  but on overflow returns LLONG_MIN value. */
 
@@ -1513,6 +1516,7 @@ extern long long __gnat_file_time(char* name)
   if (__builtin_smulll_overflow(result, 100, &result)) {
 return LLONG_MIN;
   }
+#endif
 
 #else
 
@@ -1521,6 +1525,12 @@ extern long long __gnat_file_time(char* name)
 return LLONG_MIN;
   }
 
+#if defined(__GNUG__) && __GNUG__ <= 4
+result = (sb.st_mtime - ada_epoch_offset) * 1E9;
+#if defined(st_mtime)
+result += sb.st_mtim.tv_nsec;
+#endif
+#else
   /* Next code similar to
  (sb.st_mtime - ada_epoch_offset) * 1E9 + sb.st_mtim.tv_nsec
  but on overflow returns LLONG_MIN value. */
@@ -1538,7 +1548,7 @@ extern long long __gnat_file_time(char* name)
 return LLONG_MIN;
   }
 #endif
-
+#endif
 #endif
   return result;
 }




[committed] aarch64: [testsuite] Fix typo in diagnostic message

2020-10-21 Thread Andrea Corallo via Gcc-patches
Hi all,

just committed as very obvious the following simple patch fixing a nit
in an Aarch64 testcase.

   Andrea

>From 9491dfe6c8adc298af34ce13280e0d3411c89d7d Mon Sep 17 00:00:00 2001
From: Andrea Corallo 
Date: Wed, 21 Oct 2020 08:48:16 +0200
Subject: [PATCH] aarch64: [testsuite] Fix typo in diagnostic message

gcc/testsuite/ChangeLog

2020-10-21  Andrea Corallo  

* gcc.target/aarch64/advsimd-intrinsics/vstX_lane.c (CMT):
Adopt the same style used in the rest of the file.
---
 gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vstX_lane.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vstX_lane.c 
b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vstX_lane.c
index 3329e6bbb54..45062d9a4c5 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vstX_lane.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vstX_lane.c
@@ -476,7 +476,7 @@ void exec_vstX_lane (void)
 
   TEST_ALL_EXTRA_CHUNKS(2, 1);
 #undef CMT
-#define CMT " chunk 1"
+#define CMT " (chunk 1)"
   CHECK(TEST_MSG, int, 8, 8, PRIx8, expected_st2_1, CMT);
   CHECK(TEST_MSG, int, 16, 4, PRIx16, expected_st2_1, CMT);
   CHECK(TEST_MSG, int, 32, 2, PRIx32, expected_st2_1, CMT);
-- 
2.20.1



[PATCH][pushed] ASAN: Support detect_invalid_pointer_pairs=1 with detect_stack_use_after_return=1

2020-10-21 Thread Martin Liška

This is one another backport from master.

Do not crash when AsanThread::GetStackVariableShadowStart does not find
a variable for a pointer on a shadow stack.

Cherry-pick from ad2be02a833e56f7fe280797280b219eb3312621.

Differential Revision: https://reviews.llvm.org/D89552
---
 libsanitizer/asan/asan_thread.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/libsanitizer/asan/asan_thread.cpp 
b/libsanitizer/asan/asan_thread.cpp
index f0df8bd4b37..58cdc29d365 100644
--- a/libsanitizer/asan/asan_thread.cpp
+++ b/libsanitizer/asan/asan_thread.cpp
@@ -366,7 +366,9 @@ uptr AsanThread::GetStackVariableShadowStart(uptr addr) {
 bottom = stack_bottom();
   } else if (has_fake_stack()) {
 bottom = fake_stack()->AddrIsInFakeStack(addr);
-CHECK(bottom);
+if (bottom == 0) {
+  return 0;
+}
   } else {
 return 0;
   }
--
2.28.0



Re: [PATCH] Saturate overflows return from SCEV in ranger.

2020-10-21 Thread Aldy Hernandez via Gcc-patches




On 10/21/20 8:19 AM, Richard Biener wrote:

On Tue, Oct 20, 2020 at 5:21 PM Aldy Hernandez via Gcc-patches
 wrote:


bounds_of_var_in_loop is returning an overflowed int, which is causing
us to create a range for which we can't compare the bounds causing
an ICE in verify_range.

Overflowed bounds cause compare_values() to return -2, which we
don't handle in verify_range.

We don't represent overflowed ranges in irange, so this patch just
saturates any overflowed end-points to MIN or MAX.


I don't think TREE_OVERFLOW means what you think it means in the
context of bounds_of_var_in_loop - look at its bottom which does

   /* Even for valid range info, sometimes overflow flag will leak in.
  As GIMPLE IL should have no constants with TREE_OVERFLOW set, we
  drop them.  */
   if (TREE_OVERFLOW_P (*min))
 *min = drop_tree_overflow (*min);
   if (TREE_OVERFLOW_P (*max))
 *max = drop_tree_overflow (*max);


Interesting.

If these values "leaked" in.  Should they have been fixed at the source, 
instead of after the fact?  You mention below that every use of 
TREE_OVERFLOW in the ME is a bug, should we clean them up before 
arriving in gimple, or are there legitimate uses of it?




and the code explicitly checks for overflow, doing range adjustments
accordingly.


Well, not all overflows are adjusted:

  /* Like in PR19590, scev can return a constant function.  */
  if (is_gimple_min_invariant (chrec))
{
  *min = *max = chrec;
  return true;
}

Are these min/max not adjusted for overflow by design, or is this an 
oversight?


If the latter, we could instead what I do below.  What do you think?

Thanks for the feedback.
Aldy

diff --git a/gcc/gimple-range.cc b/gcc/gimple-range.cc
index b790d62d75f..c5520e0700b 100644
--- a/gcc/gimple-range.cc
+++ b/gcc/gimple-range.cc
@@ -1156,9 +1156,9 @@ gimple_ranger::range_of_ssa_name_with_loop_info 
(irange &r, tree name,

   // ?? We could do better here.  Since MIN/MAX can only be an
   // SSA, SSA +- INTEGER_CST, or INTEGER_CST, we could easily call
   // the ranger and solve anything not an integer.
-  if (TREE_CODE (min) != INTEGER_CST || TREE_OVERFLOW (min))
+  if (TREE_CODE (min) != INTEGER_CST)
min = vrp_val_min (type);
-  if (TREE_CODE (max) != INTEGER_CST || TREE_OVERFLOW (max))
+  if (TREE_CODE (max) != INTEGER_CST)
max = vrp_val_max (type);
   r.set (min, max);
 }
diff --git a/gcc/vr-values.c b/gcc/vr-values.c
index 67c88006f13..7778ceccf0a 100644
--- a/gcc/vr-values.c
+++ b/gcc/vr-values.c
@@ -1844,7 +1844,7 @@ bounds_of_var_in_loop (tree *min, tree *max, 
range_query *query,

   if (is_gimple_min_invariant (chrec))
 {
   *min = *max = chrec;
-  return true;
+  goto fix_overflow;
 }

   if (TREE_CODE (chrec) != POLYNOMIAL_CHREC)
@@ -1964,6 +1964,7 @@ bounds_of_var_in_loop (tree *min, tree *max, 
range_query *query,

   else
 *min = init;

+ fix_overflow:
   /* Even for valid range info, sometimes overflow flag will leak in.
  As GIMPLE IL should have no constants with TREE_OVERFLOW set, we
  drop them.  */



[PATCH][pushed] libsanitizer: add test-case

2020-10-21 Thread Martin Liška

And here I'm adding a test-case for the PR.

Thanks,
Martin

gcc/testsuite/ChangeLog:

PR sanitizer/97414
* g++.dg/asan/pr97414.C: New test.
---
 gcc/testsuite/g++.dg/asan/pr97414.C | 19 +++
 1 file changed, 19 insertions(+)
 create mode 100644 gcc/testsuite/g++.dg/asan/pr97414.C

diff --git a/gcc/testsuite/g++.dg/asan/pr97414.C 
b/gcc/testsuite/g++.dg/asan/pr97414.C
new file mode 100644
index 000..6ea03906daa
--- /dev/null
+++ b/gcc/testsuite/g++.dg/asan/pr97414.C
@@ -0,0 +1,19 @@
+/* PR sanitizer/97414 */
+/* { dg-do run } */
+/* { dg-set-target-env-var ASAN_OPTIONS 
"detect_invalid_pointer_pairs=1:halt_on_error=1,detect_stack_use_after_return=1"
 } */
+/* { dg-options "-fsanitize=address,pointer-compare,pointer-subtract" } */
+
+[[gnu::noinline]] auto pointer_diff(const int *begin, const int *end) {
+  return end - begin;
+}
+
+int main() {
+  constexpr auto size = (2048 / sizeof(int)) + 1;
+
+  auto buf = new int[size];
+  auto end = buf + size;
+  pointer_diff(end, buf);
+  delete[] buf;
+
+  return 0;
+}
--
2.28.0



[PATCH] Move simplify_cond_using_ranges_2 to tree-vrp.c

2020-10-21 Thread Aldy Hernandez via Gcc-patches
This was slated to be moved last year, as its only use is in tree-vrp.c

There are no functional changes.  It's just a move and a rename.

Pushed.

gcc/ChangeLog:

* vr-values.h: Remove simplify_cond_using_ranges_2.
(range_fits_type_p): New.
* vr-values.c (range_fits_type_p): Remove static qualifier.
(vrp_simplify_cond_using_ranges): Move...
* tree-vrp.c (vrp_simplify_cond_using_ranges): ...to here.
---
 gcc/tree-vrp.c  | 65 +++--
 gcc/vr-values.c | 63 +--
 gcc/vr-values.h |  5 ++--
 3 files changed, 66 insertions(+), 67 deletions(-)

diff --git a/gcc/tree-vrp.c b/gcc/tree-vrp.c
index 0e19690f41f..e00c034fee3 100644
--- a/gcc/tree-vrp.c
+++ b/gcc/tree-vrp.c
@@ -4395,6 +4395,67 @@ vrp_prop::vrp_finalize (vrp_folder *folder, bool 
warn_array_bounds_p)
 }
 }
 
+/* STMT is a conditional at the end of a basic block.
+
+   If the conditional is of the form SSA_NAME op constant and the SSA_NAME
+   was set via a type conversion, try to replace the SSA_NAME with the RHS
+   of the type conversion.  Doing so makes the conversion dead which helps
+   subsequent passes.  */
+
+static void
+vrp_simplify_cond_using_ranges (vr_values *query, gcond *stmt)
+{
+  tree op0 = gimple_cond_lhs (stmt);
+  tree op1 = gimple_cond_rhs (stmt);
+
+  /* If we have a comparison of an SSA_NAME (OP0) against a constant,
+ see if OP0 was set by a type conversion where the source of
+ the conversion is another SSA_NAME with a range that fits
+ into the range of OP0's type.
+
+ If so, the conversion is redundant as the earlier SSA_NAME can be
+ used for the comparison directly if we just massage the constant in the
+ comparison.  */
+  if (TREE_CODE (op0) == SSA_NAME
+  && TREE_CODE (op1) == INTEGER_CST)
+{
+  gimple *def_stmt = SSA_NAME_DEF_STMT (op0);
+  tree innerop;
+
+  if (!is_gimple_assign (def_stmt)
+ || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def_stmt)))
+   return;
+
+  innerop = gimple_assign_rhs1 (def_stmt);
+
+  if (TREE_CODE (innerop) == SSA_NAME
+ && !POINTER_TYPE_P (TREE_TYPE (innerop))
+ && !SSA_NAME_OCCURS_IN_ABNORMAL_PHI (innerop)
+ && desired_pro_or_demotion_p (TREE_TYPE (innerop), TREE_TYPE (op0)))
+   {
+ const value_range *vr = query->get_value_range (innerop);
+
+ if (range_int_cst_p (vr)
+ && range_fits_type_p (vr,
+   TYPE_PRECISION (TREE_TYPE (op0)),
+   TYPE_SIGN (TREE_TYPE (op0)))
+ && int_fits_type_p (op1, TREE_TYPE (innerop)))
+   {
+ tree newconst = fold_convert (TREE_TYPE (innerop), op1);
+ gimple_cond_set_lhs (stmt, innerop);
+ gimple_cond_set_rhs (stmt, newconst);
+ update_stmt (stmt);
+ if (dump_file && (dump_flags & TDF_DETAILS))
+   {
+ fprintf (dump_file, "Folded into: ");
+ print_gimple_stmt (dump_file, stmt, 0, TDF_SLIM);
+ fprintf (dump_file, "\n");
+   }
+   }
+   }
+}
+}
+
 /* Main entry point to VRP (Value Range Propagation).  This pass is
loosely based on J. R. C. Patterson, ``Accurate Static Branch
Prediction by Value Range Propagation,'' in SIGPLAN Conference on
@@ -4482,8 +4543,8 @@ execute_vrp (struct function *fun, bool 
warn_array_bounds_p)
 {
   gimple *last = last_stmt (bb);
   if (last && gimple_code (last) == GIMPLE_COND)
-   simplify_cond_using_ranges_2 (&vrp_prop.vr_values,
- as_a  (last));
+   vrp_simplify_cond_using_ranges (&vrp_prop.vr_values,
+   as_a  (last));
 }
 
   free_numbers_of_iterations_estimates (fun);
diff --git a/gcc/vr-values.c b/gcc/vr-values.c
index 67c88006f13..cc0ddca2bd3 100644
--- a/gcc/vr-values.c
+++ b/gcc/vr-values.c
@@ -3594,7 +3594,7 @@ test_for_singularity (enum tree_code cond_code, tree op0,
 /* Return whether the value range *VR fits in an integer type specified
by PRECISION and UNSIGNED_P.  */
 
-static bool
+bool
 range_fits_type_p (const value_range *vr,
   unsigned dest_precision, signop dest_sgn)
 {
@@ -3781,67 +3781,6 @@ simplify_using_ranges::simplify_cond_using_ranges_1 
(gcond *stmt)
   return false;
 }
 
-/* STMT is a conditional at the end of a basic block.
-
-   If the conditional is of the form SSA_NAME op constant and the SSA_NAME
-   was set via a type conversion, try to replace the SSA_NAME with the RHS
-   of the type conversion.  Doing so makes the conversion dead which helps
-   subsequent passes.  */
-
-void
-simplify_cond_using_ranges_2 (vr_values *query, gcond *stmt)
-{
-  tree op0 = gimple_cond_lhs (stmt);
-  tree op1 = gimple_cond_rhs (stmt);
-
-  /* If we have a comparison of an SSA_NAME (OP0) against a constant,
- see if

Re: [PATCH] Saturate overflows return from SCEV in ranger.

2020-10-21 Thread Richard Biener via Gcc-patches
On Wed, Oct 21, 2020 at 9:30 AM Aldy Hernandez  wrote:
>
>
>
> On 10/21/20 8:19 AM, Richard Biener wrote:
> > On Tue, Oct 20, 2020 at 5:21 PM Aldy Hernandez via Gcc-patches
> >  wrote:
> >>
> >> bounds_of_var_in_loop is returning an overflowed int, which is causing
> >> us to create a range for which we can't compare the bounds causing
> >> an ICE in verify_range.
> >>
> >> Overflowed bounds cause compare_values() to return -2, which we
> >> don't handle in verify_range.
> >>
> >> We don't represent overflowed ranges in irange, so this patch just
> >> saturates any overflowed end-points to MIN or MAX.
> >
> > I don't think TREE_OVERFLOW means what you think it means in the
> > context of bounds_of_var_in_loop - look at its bottom which does
> >
> >/* Even for valid range info, sometimes overflow flag will leak in.
> >   As GIMPLE IL should have no constants with TREE_OVERFLOW set, we
> >   drop them.  */
> >if (TREE_OVERFLOW_P (*min))
> >  *min = drop_tree_overflow (*min);
> >if (TREE_OVERFLOW_P (*max))
> >  *max = drop_tree_overflow (*max);
>
> Interesting.
>
> If these values "leaked" in.  Should they have been fixed at the source,
> instead of after the fact?  You mention below that every use of
> TREE_OVERFLOW in the ME is a bug, should we clean them up before
> arriving in gimple, or are there legitimate uses of it?

There are no legitimate uses in GIMPLE.  They are (ab-)used by
GENERIC folding for propagating overflow (also used in FE
diagnostics).  Generally the better way is to use wide_ints overflow
handling which also "sticks".

> >
> > and the code explicitly checks for overflow, doing range adjustments
> > accordingly.
>
> Well, not all overflows are adjusted:
>
>/* Like in PR19590, scev can return a constant function.  */
>if (is_gimple_min_invariant (chrec))
>  {
>*min = *max = chrec;
>return true;
>  }
>
> Are these min/max not adjusted for overflow by design, or is this an
> oversight?

Ah, that's an oversight here.  And yes, "fixing" it in scalar evolution
analysis itself (dropping the flag there) would be best

>
> If the latter, we could instead what I do below.  What do you think?

Yeah, though *cough* goto ... (well, not so bad I guess)

Thanks,
Richard.

> Thanks for the feedback.
> Aldy
>
> diff --git a/gcc/gimple-range.cc b/gcc/gimple-range.cc
> index b790d62d75f..c5520e0700b 100644
> --- a/gcc/gimple-range.cc
> +++ b/gcc/gimple-range.cc
> @@ -1156,9 +1156,9 @@ gimple_ranger::range_of_ssa_name_with_loop_info
> (irange &r, tree name,
> // ?? We could do better here.  Since MIN/MAX can only be an
> // SSA, SSA +- INTEGER_CST, or INTEGER_CST, we could easily call
> // the ranger and solve anything not an integer.
> -  if (TREE_CODE (min) != INTEGER_CST || TREE_OVERFLOW (min))
> +  if (TREE_CODE (min) != INTEGER_CST)
> min = vrp_val_min (type);
> -  if (TREE_CODE (max) != INTEGER_CST || TREE_OVERFLOW (max))
> +  if (TREE_CODE (max) != INTEGER_CST)
> max = vrp_val_max (type);
> r.set (min, max);
>   }
> diff --git a/gcc/vr-values.c b/gcc/vr-values.c
> index 67c88006f13..7778ceccf0a 100644
> --- a/gcc/vr-values.c
> +++ b/gcc/vr-values.c
> @@ -1844,7 +1844,7 @@ bounds_of_var_in_loop (tree *min, tree *max,
> range_query *query,
> if (is_gimple_min_invariant (chrec))
>   {
> *min = *max = chrec;
> -  return true;
> +  goto fix_overflow;
>   }
>
> if (TREE_CODE (chrec) != POLYNOMIAL_CHREC)
> @@ -1964,6 +1964,7 @@ bounds_of_var_in_loop (tree *min, tree *max,
> range_query *query,
> else
>   *min = init;
>
> + fix_overflow:
> /* Even for valid range info, sometimes overflow flag will leak in.
>As GIMPLE IL should have no constants with TREE_OVERFLOW set, we
>drop them.  */
>


Re: [PATCH][middle-end][i386][version 3]Add -fzero-call-used-regs=[skip|used-gpr-arg|used-arg|all-arg|used-gpr|all-gpr|used|all]

2020-10-21 Thread Uros Bizjak via Gcc-patches
On Wed, Oct 21, 2020 at 9:18 AM Uros Bizjak  wrote:
>
> On Tue, Oct 20, 2020 at 10:04 PM Qing Zhao  wrote:
>
> > +/* Check whether the register REGNO should be zeroed on X86.
> > +   When ALL_SSE_ZEROED is true, all SSE registers have been zeroed
> > +   together, no need to zero it again.
> > +   Stack registers (st0-st7) and mm0-mm7 are aliased with each other.
> > +   very hard to be zeroed individually, don't zero individual st or
> > +   mm registgers at this time.  */
> > +
> > +static bool
> > +zero_call_used_regno_p (const unsigned int regno,
> > + bool all_sse_zeroed)
> > +{
> > +  return GENERAL_REGNO_P (regno)
> > +  || (!all_sse_zeroed && SSE_REGNO_P (regno))
> > +  || MASK_REGNO_P (regno);
> > +}
> > +
> > +/* Return the machine_mode that is used to zero register REGNO.  */
> > +
> > +static machine_mode
> > +zero_call_used_regno_mode (const unsigned int regno)
> > +{
> > +  /* NB: We only need to zero the lower 32 bits for integer registers
> > + and the lower 128 bits for vector registers since destination are
> > + zero-extended to the full register width.  */
> > +  if (GENERAL_REGNO_P (regno))
> > +return SImode;
> > +  else if (SSE_REGNO_P (regno))
> > +return V4SFmode;
> > +  else
> > +return HImode;
> > +}
> > +
> > +/* Generate a rtx to zero all vector registers togetehr if possible,
> > +   otherwise, return NULL.  */
> > +
> > +static rtx
> > +zero_all_vector_registers (HARD_REG_SET need_zeroed_hardregs)
> > +{
> > +  if (!TARGET_AVX)
> > +return NULL;
> > +
> > +  for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
> > +if ((IN_RANGE (regno, FIRST_SSE_REG, LAST_SSE_REG)
> > +  || (TARGET_64BIT
> > +  && (REX_SSE_REGNO_P (regno)
> > +  || (TARGET_AVX512F && EXT_REX_SSE_REGNO_P (regno)
> > + && !TEST_HARD_REG_BIT (need_zeroed_hardregs, regno))
> > +  return NULL;
> > +
> > +  return gen_avx_vzeroall ();
> > +}
> > +
> > +/* Generate a rtx to zero all st and mm registers togetehr if possible,
> > +   otherwise, return NULL.  */
> > +
> > +static rtx
> > +zero_all_st_mm_registers (HARD_REG_SET need_zeroed_hardregs)
> > +{
> > +  if (!TARGET_MMX)
> > +return NULL;
> > +
> > +  for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
> > +if ((STACK_REGNO_P (regno) || MMX_REGNO_P (regno))
> > + && !TEST_HARD_REG_BIT (need_zeroed_hardregs, regno))
> > +  return NULL;
> > +
> > +  return gen_mmx_emms ();
> >
> >
> > emms is not clearing any register, it only loads the x87 FPU Tag Word with
> > FFFFH. So I think, the above is useless, as far as register clearing
> > is concerned.
> >
> >
> > Thanks for the info.
> >
> > So, for mm and st registers, should we clear them, and how?
> >
> >
> > I don't know.
> >
> > Please note that %mm and %st share the same register file, and
> > touching %mm registers will block access to %st until emms is emitted.
> > You can't just blindly load 0 to %st registers, because the register
> > file can be in MMX mode and vice versa. For 32bit targets, function
> > can also  return a value in the %mm0.
> >
> >
> > If data flow determine that %mm0 does not return a value at the return, can 
> > we clear all the %st as following:
> >
> > emms
> > mov %st0, 0
> > mov %st1, 0
> > mov %st2, 0
> > mov %st3, 0
> > mov %st4, 0
> > mov %st5, 0
> > mov %st6, 0
> > mov %st7, 0
>
> The i386 ABI says:
>
> -- q --
> The CPU shall be in x87 mode upon entry to a function. Therefore,
> every function that uses the MMX registers is required to issue an
> emms or femms instruction after using MMX registers, before returning
> or calling another function.
> -- /q --
>
> (The above requirement slightly contradicts its own ABI, since we have
> 3 MMX argument registers and MMX return register, so the CPU obviously
> can't be in x87 mode at all function boundaries).
>
> So, assuming that the first sentence is not deliberately vague w.r.t
> function exit, emms should not be needed. However, we are dealing with
> x87 stack registers that have their own set of peculiarities. It is
> not possible to load a random register in the way you show.  Also,
> stack should be either empty or one (two in case of complex value
> return) levels deep at the function return. I think you want a series
> of 8 or 7(6) fldz insns, followed by a series of fstp insn to clear
> the stack and mark stack slots empty.

Something like this:

--cut here--
long double
__attribute__ ((noinline))
test (long double a, long double b)
{
  long double r = a + b;

  asm volatile ("fldz;\
fldz;\
fldz;\
fldz;\
fldz;\
fldz;\
fldz;\
fstp %%st(0);\
fstp %%st(0);\
fstp %%st(0);\
fstp %%st(0);\
fstp %%st(0);\
fstp %%st(0);\
fstp %%st(0)" : : "X"(r));
  return r;
}

int
main 

[PATCH] phiopt: Optimize x ? __builtin_clz (x) : 32 in GIMPLE [PR97503]

2020-10-21 Thread Jakub Jelinek via Gcc-patches
Hi!

While we have at the RTL level noce_try_ifelse_collapse combined with
simplify_cond_clz_ctz, that optimization doesn't always trigger because
e.g. on powerpc there is an define_insn to compare a reg against zero and
copy that register to another one and so we end up with a different pseudo
in the simplify_cond_clz_ctz test and punt.

For targets that define C?Z_DEFINED_VALUE_AT_ZERO to 2 for certain modes,
we can optimize it already in phiopt though, just need to ensure that
we transform the __builtin_c?z* calls into .C?Z ifns because my recent
VRP changes codified that the builtin calls are always undefined at zero,
while ifns honor C?Z_DEFINED_VALUE_AT_ZERO equal to 2.
And, in phiopt we already have popcount handling that does pretty much the
same thing, except for always using a zero value rather than the one set
by C?Z_DEFINED_VALUE_AT_ZERO.

So, this patch extends that function to handle not just popcount, but also
clz and ctz.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2020-10-20  Jakub Jelinek  

PR tree-optimization/97503
* tree-ssa-phiopt.c (cond_removal_in_popcount_pattern): Rename to ...
(cond_removal_in_popcount_clz_ctz_pattern): ... this.  Handle not just
popcount, but also clz and ctz if it has C?Z_DEFINED_VALUE_AT_ZERO 2.

* gcc.dg/tree-ssa/pr97503.c: New test.

--- gcc/tree-ssa-phiopt.c.jj2020-07-28 15:39:10.075755306 +0200
+++ gcc/tree-ssa-phiopt.c   2020-10-20 17:46:16.971329154 +0200
@@ -61,8 +61,9 @@ static bool minmax_replacement (basic_bl
edge, edge, gimple *, tree, tree);
 static bool abs_replacement (basic_block, basic_block,
 edge, edge, gimple *, tree, tree);
-static bool cond_removal_in_popcount_pattern (basic_block, basic_block,
- edge, edge, gimple *, tree, tree);
+static bool cond_removal_in_popcount_clz_ctz_pattern (basic_block, basic_block,
+ edge, edge, gimple *,
+ tree, tree);
 static bool cond_store_replacement (basic_block, basic_block, edge, edge,
hash_set *);
 static bool cond_if_else_store_replacement (basic_block, basic_block, 
basic_block);
@@ -344,8 +345,9 @@ tree_ssa_phiopt_worker (bool do_store_el
  else if (abs_replacement (bb, bb1, e1, e2, phi, arg0, arg1))
cfgchanged = true;
  else if (!early_p
-  && cond_removal_in_popcount_pattern (bb, bb1, e1, e2,
-   phi, arg0, arg1))
+  && cond_removal_in_popcount_clz_ctz_pattern (bb, bb1, e1,
+   e2, phi, arg0,
+   arg1))
cfgchanged = true;
  else if (minmax_replacement (bb, bb1, e1, e2, phi, arg0, arg1))
cfgchanged = true;
@@ -1777,16 +1779,20 @@ minmax_replacement (basic_block cond_bb,
 

c_12 = PHI <_9(2)>
-*/
+
+   Similarly for __builtin_clz or __builtin_ctz if
+   C?Z_DEFINED_VALUE_AT_ZERO is 2, optab is present and
+   instead of 0 above it uses the value from that macro.  */
 
 static bool
-cond_removal_in_popcount_pattern (basic_block cond_bb, basic_block middle_bb,
- edge e1, edge e2,
- gimple *phi, tree arg0, tree arg1)
+cond_removal_in_popcount_clz_ctz_pattern (basic_block cond_bb,
+ basic_block middle_bb,
+ edge e1, edge e2, gimple *phi,
+ tree arg0, tree arg1)
 {
   gimple *cond;
   gimple_stmt_iterator gsi, gsi_from;
-  gimple *popcount;
+  gimple *call;
   gimple *cast = NULL;
   tree lhs, arg;
 
@@ -1804,35 +1810,65 @@ cond_removal_in_popcount_pattern (basic_
   gsi_next_nondebug (&gsi);
   if (!gsi_end_p (gsi))
 {
-  popcount = gsi_stmt (gsi);
+  call = gsi_stmt (gsi);
   gsi_next_nondebug (&gsi);
   if (!gsi_end_p (gsi))
return false;
 }
   else
 {
-  popcount = cast;
+  call = cast;
   cast = NULL;
 }
 
-  /* Check that we have a popcount builtin.  */
-  if (!is_gimple_call (popcount))
+  /* Check that we have a popcount/clz/ctz builtin.  */
+  if (!is_gimple_call (call) || gimple_call_num_args (call) != 1)
 return false;
-  combined_fn cfn = gimple_call_combined_fn (popcount);
+
+  arg = gimple_call_arg (call, 0);
+  lhs = gimple_get_lhs (call);
+
+  if (lhs == NULL_TREE)
+return false;
+
+  combined_fn cfn = gimple_call_combined_fn (call);
+  internal_fn ifn = IFN_LAST;
+  int val = 0;
   switch (cfn)
 {
 CASE_CFN_POPCOUNT:
   break;
+CASE_CFN_CLZ:
+  if (INTEGRAL_TYPE_P (TREE_TYPE (arg)))
+   {
+ scalar_int_mode mode = SCALAR_INT_TYPE_MODE (T

Re: [committed][nvptx] Remove -m32

2020-10-21 Thread Tom de Vries
On 10/20/20 3:48 PM, Tobias Burnus wrote:
> On 10/15/20 3:26 PM, Tom de Vries wrote:
>>   PR target/97436
>>   * config/nvptx/nvptx.opt (m32): Comment out.
>>   * doc/invoke.texi (NVPTX options): Remove -m32.
> 
> This caused the warning: doc/invoke.texi:25617: warning: @itemx should
> not begin @table
> 
> Fixed by the committed attached patch.

Thanks for fixing this.

- Tom


[PATCH] rs6000: Don't split constant operator add before reload, move to temp register for future optimization

2020-10-21 Thread Xionghu Luo via Gcc-patches
This is a revised version of the patch posted at
https://gcc.gnu.org/pipermail/gcc-patches/2020-March/542718.html, resend
this since this is a quite high priority performance issue for Power.

Don't split code from add<mode>3 for SDI to allow a later pass to split.
This allows later logic to hoist out constant load in add instructions.
In loop, lis+ori could be hoisted out to improve performance compared with
previous addis+addi (About 15% on typical case), weak point is
one more register is used and one more instruction is generated.  i.e.:

addis 3,3,0x6765
addi 3,3,0x4321

=>

lis 9,0x6765
ori 9,9,0x4321
add 3,3,9

Likewise, paddi is replaced with pli+add for Power10. No obvious performance
and binary size change to SPEC2017.

gcc/ChangeLog:

2020-10-21  Xiong Hu Luo  

* config/rs6000/rs6000.md (add<mode>3 for SDI): Don't split before 
reload,
move constant to temp register for add.

gcc/testsuite/ChangeLog:

2020-10-21  Xiong Hu Luo  

* gcc.target/powerpc/prefix-add.c: Check pli instead of paddi.
* gcc.target/powerpc/prefix-no-update.c: Likewise.
* gcc.target/powerpc/add-const.c: New test.
---
 gcc/config/rs6000/predicates.md   |  3 +-
 gcc/config/rs6000/rs6000.md   | 54 +++
 gcc/testsuite/gcc.target/powerpc/add-const.c  | 18 +++
 gcc/testsuite/gcc.target/powerpc/prefix-add.c |  4 +-
 .../gcc.target/powerpc/prefix-no-update.c |  2 +-
 5 files changed, 54 insertions(+), 27 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/add-const.c

diff --git a/gcc/config/rs6000/predicates.md b/gcc/config/rs6000/predicates.md
index 4c2fe7fa312..af577da669e 100644
--- a/gcc/config/rs6000/predicates.md
+++ b/gcc/config/rs6000/predicates.md
@@ -859,8 +859,7 @@ (define_special_predicate "indexed_address_mem"
 (define_predicate "add_operand"
   (if_then_else (match_code "const_int")
 (match_test "satisfies_constraint_I (op)
-|| satisfies_constraint_L (op)
-|| satisfies_constraint_eI (op)")
+|| satisfies_constraint_L (op)")
 (match_operand 0 "gpc_reg_operand")))
 
 ;; Return 1 if the operand is either a non-special register, or 0, or -1.
diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index 779bfd11237..facf6e12114 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -1750,34 +1750,44 @@ (define_expand "add3"
 
   if (CONST_INT_P (operands[2]) && !add_operand (operands[2], mode))
 {
-  rtx tmp = ((!can_create_pseudo_p ()
- || rtx_equal_p (operands[0], operands[1]))
-? operands[0] : gen_reg_rtx (mode));
+  bool reg0 = (reg_or_subregno (operands[0]) == 0);
+  if (can_create_pseudo_p () || reg0)
+   {
+
+ rtx tmp = (!can_create_pseudo_p ()
+ || rtx_equal_p (operands[0], operands[1]))
+   ? operands[0] : gen_reg_rtx (mode);
 
   /* Adding a constant to r0 is not a valid insn, so use a different
-strategy in that case.  */
-  if (reg_or_subregno (operands[1]) == 0 || reg_or_subregno (tmp) == 0)
-   {
- if (operands[0] == operands[1])
-   FAIL;
- rs6000_emit_move (operands[0], operands[2], mode);
- emit_insn (gen_add3 (operands[0], operands[1], operands[0]));
+strategy in that case.  See stack-limit.c, need generate
+"24: %0:DI=0x20fa0; 25: %0:DI=%14:DI+%0:DI" in pro_and_epilogue
+when can_create_pseudo_p is false.  */
+ if (reg0 == 0 || reg_or_subregno (tmp) == 0)
+ {
+   if (operands[0] == operands[1])
+ FAIL;
+ }
+
+ rs6000_emit_move (tmp, operands[2], mode);
+ emit_insn (gen_add3 (operands[0], operands[1], tmp));
  DONE;
}
+  else
+   {
+ HOST_WIDE_INT val = INTVAL (operands[2]);
+ HOST_WIDE_INT low = ((val & 0x) ^ 0x8000) - 0x8000;
+ HOST_WIDE_INT rest = trunc_int_for_mode (val - low, mode);
 
-  HOST_WIDE_INT val = INTVAL (operands[2]);
-  HOST_WIDE_INT low = ((val & 0x) ^ 0x8000) - 0x8000;
-  HOST_WIDE_INT rest = trunc_int_for_mode (val - low, mode);
-
-  if (mode == DImode && !satisfies_constraint_L (GEN_INT (rest)))
-   FAIL;
+ if (mode == DImode && !satisfies_constraint_L (GEN_INT (rest)))
+   FAIL;
 
-  /* The ordering here is important for the prolog expander.
-When space is allocated from the stack, adding 'low' first may
-produce a temporary deallocation (which would be bad).  */
-  emit_insn (gen_add3 (tmp, operands[1], GEN_INT (rest)));
-  emit_insn (gen_add3 (operands[0], tmp, GEN_INT (low)));
-  DONE;
+ /* The ordering here is important for the prolog expander.
+When space is allocated from the stack, adding 'low' first may
+produce a temporary deallocation (which would be bad).  */
+ emit_insn (gen_add3 (operands[0], operands[1], GEN_INT (

Re: [PATCH] phiopt: Optimize x ? __builtin_clz (x) : 32 in GIMPLE [PR97503]

2020-10-21 Thread Richard Biener
On Wed, 21 Oct 2020, Jakub Jelinek wrote:

> Hi!
> 
> While we have at the RTL level noce_try_ifelse_collapse combined with
> simplify_cond_clz_ctz, that optimization doesn't always trigger because
> e.g. on powerpc there is an define_insn to compare a reg against zero and
> copy that register to another one and so we end up with a different pseudo
> in the simplify_cond_clz_ctz test and punt.
> 
> For targets that define C?Z_DEFINED_VALUE_AT_ZERO to 2 for certain modes,
> we can optimize it already in phiopt though, just need to ensure that
> we transform the __builtin_c?z* calls into .C?Z ifns because my recent
> VRP changes codified that the builtin calls are always undefined at zero,
> while ifns honor C?Z_DEFINED_VALUE_AT_ZERO equal to 2.
> And, in phiopt we already have popcount handling that does pretty much the
> same thing, except for always using a zero value rather than the one set
> by C?Z_DEFINED_VALUE_AT_ZERO.
> 
> So, this patch extends that function to handle not just popcount, but also
> clz and ctz.
> 
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
> 
> 2020-10-20  Jakub Jelinek  
> 
>   PR tree-optimization/97503
>   * tree-ssa-phiopt.c (cond_removal_in_popcount_pattern): Rename to ...
>   (cond_removal_in_popcount_clz_ctz_pattern): ... this.  Handle not just
>   popcount, but also clz and ctz if it has C?Z_DEFINED_VALUE_AT_ZERO 2.
> 
>   * gcc.dg/tree-ssa/pr97503.c: New test.
> 
> --- gcc/tree-ssa-phiopt.c.jj  2020-07-28 15:39:10.075755306 +0200
> +++ gcc/tree-ssa-phiopt.c 2020-10-20 17:46:16.971329154 +0200
> @@ -61,8 +61,9 @@ static bool minmax_replacement (basic_bl
>   edge, edge, gimple *, tree, tree);
>  static bool abs_replacement (basic_block, basic_block,
>edge, edge, gimple *, tree, tree);
> -static bool cond_removal_in_popcount_pattern (basic_block, basic_block,
> -   edge, edge, gimple *, tree, tree);
> +static bool cond_removal_in_popcount_clz_ctz_pattern (basic_block, 
> basic_block,
> +   edge, edge, gimple *,
> +   tree, tree);
>  static bool cond_store_replacement (basic_block, basic_block, edge, edge,
>   hash_set *);
>  static bool cond_if_else_store_replacement (basic_block, basic_block, 
> basic_block);
> @@ -344,8 +345,9 @@ tree_ssa_phiopt_worker (bool do_store_el
> else if (abs_replacement (bb, bb1, e1, e2, phi, arg0, arg1))
>   cfgchanged = true;
> else if (!early_p
> -&& cond_removal_in_popcount_pattern (bb, bb1, e1, e2,
> - phi, arg0, arg1))
> +&& cond_removal_in_popcount_clz_ctz_pattern (bb, bb1, e1,
> + e2, phi, arg0,
> + arg1))
>   cfgchanged = true;
> else if (minmax_replacement (bb, bb1, e1, e2, phi, arg0, arg1))
>   cfgchanged = true;
> @@ -1777,16 +1779,20 @@ minmax_replacement (basic_block cond_bb,
>  
> 
> c_12 = PHI <_9(2)>
> -*/
> +
> +   Similarly for __builtin_clz or __builtin_ctz if
> +   C?Z_DEFINED_VALUE_AT_ZERO is 2, optab is present and
> +   instead of 0 above it uses the value from that macro.  */
>  
>  static bool
> -cond_removal_in_popcount_pattern (basic_block cond_bb, basic_block middle_bb,
> -   edge e1, edge e2,
> -   gimple *phi, tree arg0, tree arg1)
> +cond_removal_in_popcount_clz_ctz_pattern (basic_block cond_bb,
> +   basic_block middle_bb,
> +   edge e1, edge e2, gimple *phi,
> +   tree arg0, tree arg1)
>  {
>gimple *cond;
>gimple_stmt_iterator gsi, gsi_from;
> -  gimple *popcount;
> +  gimple *call;
>gimple *cast = NULL;
>tree lhs, arg;
>  
> @@ -1804,35 +1810,65 @@ cond_removal_in_popcount_pattern (basic_
>gsi_next_nondebug (&gsi);
>if (!gsi_end_p (gsi))
>  {
> -  popcount = gsi_stmt (gsi);
> +  call = gsi_stmt (gsi);
>gsi_next_nondebug (&gsi);
>if (!gsi_end_p (gsi))
>   return false;
>  }
>else
>  {
> -  popcount = cast;
> +  call = cast;
>cast = NULL;
>  }
>  
> -  /* Check that we have a popcount builtin.  */
> -  if (!is_gimple_call (popcount))
> +  /* Check that we have a popcount/clz/ctz builtin.  */
> +  if (!is_gimple_call (call) || gimple_call_num_args (call) != 1)
>  return false;
> -  combined_fn cfn = gimple_call_combined_fn (popcount);
> +
> +  arg = gimple_call_arg (call, 0);
> +  lhs = gimple_get_lhs (call);
> +
> +  if (lhs == NULL_TREE)
> +return false;
> +
> +  combined_fn cfn = gimple_call_combined_fn (call);
> +  internal_fn ifn = 

[PATCH 1/2] Separate new_edges compute in copy_bbs

2020-10-21 Thread Richard Biener
This separates out a loop finding new_edges from edges in copy_bbs,
making its complexity cheaper overall from total number of succs in
copied bbs times num_edges to num_edges times the complexity of
find_edge.

Bootstrapped / tested on x86_64-unknown-linux-gnu, pushed.

2020-10-21  Richard Biener  

* cfghooks.c (copy_bbs): Split out loop computing new_edges.
---
 gcc/cfghooks.c | 21 +++--
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/gcc/cfghooks.c b/gcc/cfghooks.c
index 71c6b63ad3b..14c006df6e1 100644
--- a/gcc/cfghooks.c
+++ b/gcc/cfghooks.c
@@ -1391,8 +1391,6 @@ copy_bbs (basic_block *bbs, unsigned n, basic_block 
*new_bbs,
 }
 
   /* Redirect edges.  */
-  for (j = 0; j < num_edges; j++)
-new_edges[j] = NULL;
   for (i = 0; i < n; i++)
 {
   edge_iterator ei;
@@ -1401,15 +1399,26 @@ copy_bbs (basic_block *bbs, unsigned n, basic_block 
*new_bbs,
 
   FOR_EACH_EDGE (e, ei, new_bb->succs)
{
- for (j = 0; j < num_edges; j++)
-   if (edges[j] && edges[j]->src == bb && edges[j]->dest == e->dest)
- new_edges[j] = e;
-
  if (!(e->dest->flags & BB_DUPLICATED))
continue;
  redirect_edge_and_branch_force (e, get_bb_copy (e->dest));
}
 }
+  for (j = 0; j < num_edges; j++)
+{
+  if (!edges[j])
+   new_edges[j] = NULL;
+  else
+   {
+ basic_block src = edges[j]->src;
+ basic_block dest = edges[j]->dest;
+ if (src->flags & BB_DUPLICATED)
+   src = get_bb_copy (src);
+ if (dest->flags & BB_DUPLICATED)
+   dest = get_bb_copy (dest);
+ new_edges[j] = find_edge (src, dest);
+   }
+}
 
   /* Clear information about duplicates.  */
   for (i = 0; i < n; i++)
-- 
2.26.2



[PATCH 2/2] Simplify CFG copying tables

2020-10-21 Thread Richard Biener
This simplifies the maps between original and new basic blocks and
loops as used for CFG copying.  Instead of using a pointer hash
table to allocated mapping entries use a hash_map with int_hash,
removing the indirection and code duplication.  We can use -1 and
-2 as empty/deleted values as those are not valid basic-block
indices or loop numbers.

Bootstrapped and tested on x86_64-unknown-linux-gnu, pushed.

2020-10-21  Richard Biener  

* cfg.c (htab_bb_copy_original_entry): Remove.
(bb_copy_hasher): Likewise.
(bb_original, bb_copy, loop_copy): Use
hash_map<int_hash<int, -1, -2>, int>.
(original_copy_bb_pool): Remove.
(initialize_original_copy_tables): Adjust.
(reset_original_copy_tables): Likewise.
(free_original_copy_tables): Likewise.
(original_copy_tables_initialized_p): Likewise.
(copy_original_table_clear): Simplify.
(copy_original_table_set): Likewise.
(get_bb_original): Likewise.
(get_bb_copy): Likewise.
(get_loop_copy): Likewise.
---
 gcc/cfg.c | 114 +++---
 1 file changed, 23 insertions(+), 91 deletions(-)

diff --git a/gcc/cfg.c b/gcc/cfg.c
index 270a48f729a..d82324faf03 100644
--- a/gcc/cfg.c
+++ b/gcc/cfg.c
@@ -959,55 +959,23 @@ scale_bbs_frequencies (basic_block *bbs, int nbbs,
 bbs[i]->count = bbs[i]->count.apply_probability (p);
 }
 
-/* Helper types for hash tables.  */
-
-struct htab_bb_copy_original_entry
-{
-  /* Block we are attaching info to.  */
-  int index1;
-  /* Index of original or copy (depending on the hashtable) */
-  int index2;
-};
-
-struct bb_copy_hasher : nofree_ptr_hash 
-{
-  static inline hashval_t hash (const htab_bb_copy_original_entry *);
-  static inline bool equal (const htab_bb_copy_original_entry *existing,
-   const htab_bb_copy_original_entry * candidate);
-};
-
-inline hashval_t
-bb_copy_hasher::hash (const htab_bb_copy_original_entry *data)
-{
-  return data->index1;
-}
-
-inline bool
-bb_copy_hasher::equal (const htab_bb_copy_original_entry *data,
-  const htab_bb_copy_original_entry *data2)
-{
-  return data->index1 == data2->index1;
-}
-
 /* Data structures used to maintain mapping between basic blocks and
copies.  */
-static hash_table *bb_original;
-static hash_table *bb_copy;
+typedef hash_map<int_hash<int, -1, -2>, int> copy_map_t;
+static copy_map_t *bb_original;
+static copy_map_t *bb_copy;
 
 /* And between loops and copies.  */
-static hash_table *loop_copy;
-static object_allocator *original_copy_bb_pool;
+static copy_map_t *loop_copy;
 
 /* Initialize the data structures to maintain mapping between blocks
and its copies.  */
 void
 initialize_original_copy_tables (void)
 {
-  original_copy_bb_pool = new object_allocator
-("original_copy");
-  bb_original = new hash_table (10);
-  bb_copy = new hash_table (10);
-  loop_copy = new hash_table (10);
+  bb_original = new copy_map_t (10);
+  bb_copy = new copy_map_t (10);
+  loop_copy = new copy_map_t (10);
 }
 
 /* Reset the data structures to maintain mapping between blocks and
@@ -1016,7 +984,6 @@ initialize_original_copy_tables (void)
 void
 reset_original_copy_tables (void)
 {
-  gcc_assert (original_copy_bb_pool);
   bb_original->empty ();
   bb_copy->empty ();
   loop_copy->empty ();
@@ -1027,15 +994,12 @@ reset_original_copy_tables (void)
 void
 free_original_copy_tables (void)
 {
-  gcc_assert (original_copy_bb_pool);
   delete bb_copy;
   bb_copy = NULL;
   delete bb_original;
   bb_original = NULL;
   delete loop_copy;
   loop_copy = NULL;
-  delete original_copy_bb_pool;
-  original_copy_bb_pool = NULL;
 }
 
 /* Return true iff we have had a call to initialize_original_copy_tables
@@ -1044,51 +1008,31 @@ free_original_copy_tables (void)
 bool
 original_copy_tables_initialized_p (void)
 {
-  return original_copy_bb_pool != NULL;
+  return bb_copy != NULL;
 }
 
 /* Removes the value associated with OBJ from table TAB.  */
 
 static void
-copy_original_table_clear (hash_table *tab, unsigned obj)
+copy_original_table_clear (copy_map_t *tab, unsigned obj)
 {
-  htab_bb_copy_original_entry **slot;
-  struct htab_bb_copy_original_entry key, *elt;
-
-  if (!original_copy_bb_pool)
+  if (!original_copy_tables_initialized_p ())
 return;
 
-  key.index1 = obj;
-  slot = tab->find_slot (&key, NO_INSERT);
-  if (!slot)
-return;
-
-  elt = *slot;
-  tab->clear_slot (slot);
-  original_copy_bb_pool->remove (elt);
+  tab->remove (obj);
 }
 
 /* Sets the value associated with OBJ in table TAB to VAL.
Do nothing when data structures are not initialized.  */
 
 static void
-copy_original_table_set (hash_table *tab,
+copy_original_table_set (copy_map_t *tab,
 unsigned obj, unsigned val)
 {
-  struct htab_bb_copy_original_entry **slot;
-  struct htab_bb_copy_original_entry key;
-
-  if (!original_copy_bb_pool)
+  if (!original_copy_tables_initialized_p ())
 return;
 
-  key.index1 = obj;
-  slot 

Re: [PATCH] Saturate overflows return from SCEV in ranger.

2020-10-21 Thread Aldy Hernandez via Gcc-patches




On 10/21/20 9:59 AM, Richard Biener wrote:


/* Even for valid range info, sometimes overflow flag will leak in.
   As GIMPLE IL should have no constants with TREE_OVERFLOW set, we
   drop them.  */
if (TREE_OVERFLOW_P (*min))
  *min = drop_tree_overflow (*min);
if (TREE_OVERFLOW_P (*max))
  *max = drop_tree_overflow (*max);


Interesting.

If these values "leaked" in.  Should they have been fixed at the source,
instead of after the fact?  You mention below that every use of
TREE_OVERFLOW in the ME is a bug, should we clean them up before
arriving in gimple, or are there legitimate uses of it?


There are no legitimate uses in GIMPLE.  They are (ab-)used by
GENERIC folding for propagating overflow (also used in FE
diagnostics).  Generally the better way is to use wide_ints overflow
handling which also "sticks".


If there are no legitimate uses, perhaps we should drop them altogether 
as we go into GIMPLE??  I vaguely recall seeing them leak into 
value_range's.






and the code explicitly checks for overflow, doing range adjustments
accordingly.


Well, not all overflows are adjusted:

/* Like in PR19590, scev can return a constant function.  */
if (is_gimple_min_invariant (chrec))
  {
*min = *max = chrec;
return true;
  }

Are these min/max not adjusted for overflow by design, or is this an
oversight?


Ah, that's an oversight here.  And yes, "fixing" it in scalar evolution
analysis itself (dropping the flag there) would be best


Excellent.  I've pushed the patch below after testing it.

Thanks again.
Aldy

Adjust overflow for invariants in bounds_of_var_in_loop.

Invariants returned from SCEV can have TREE_OVERFLOW set.  Clear the
overflow as we do with the rest of the values returned from this
function.

gcc/ChangeLog:

* gimple-range.cc 
(gimple_ranger::range_of_ssa_name_with_loop_info):

Remove TREE_OVERFLOW special case.
* vr-values.c (bounds_of_var_in_loop): Adjust overflow for
invariants.

diff --git a/gcc/gimple-range.cc b/gcc/gimple-range.cc
index b790d62d75f..c5520e0700b 100644
--- a/gcc/gimple-range.cc
+++ b/gcc/gimple-range.cc
@@ -1156,9 +1156,9 @@ gimple_ranger::range_of_ssa_name_with_loop_info 
(irange &r, tree name,

   // ?? We could do better here.  Since MIN/MAX can only be an
   // SSA, SSA +- INTEGER_CST, or INTEGER_CST, we could easily call
   // the ranger and solve anything not an integer.
-  if (TREE_CODE (min) != INTEGER_CST || TREE_OVERFLOW (min))
+  if (TREE_CODE (min) != INTEGER_CST)
min = vrp_val_min (type);
-  if (TREE_CODE (max) != INTEGER_CST || TREE_OVERFLOW (max))
+  if (TREE_CODE (max) != INTEGER_CST)
max = vrp_val_max (type);
   r.set (min, max);
 }
diff --git a/gcc/vr-values.c b/gcc/vr-values.c
index cc0ddca2bd3..7a0e70eab64 100644
--- a/gcc/vr-values.c
+++ b/gcc/vr-values.c
@@ -1844,7 +1844,7 @@ bounds_of_var_in_loop (tree *min, tree *max, 
range_query *query,

   if (is_gimple_min_invariant (chrec))
 {
   *min = *max = chrec;
-  return true;
+  goto fix_overflow;
 }

   if (TREE_CODE (chrec) != POLYNOMIAL_CHREC)
@@ -1964,6 +1964,7 @@ bounds_of_var_in_loop (tree *min, tree *max, 
range_query *query,

   else
 *min = init;

+ fix_overflow:
   /* Even for valid range info, sometimes overflow flag will leak in.
  As GIMPLE IL should have no constants with TREE_OVERFLOW set, we
  drop them.  */



[PATCH] vect: Remove redundant LOOP_VINFO_FULLY_MASKED_P

2020-10-21 Thread Kewen.Lin via Gcc-patches
Hi,

This is a very trivial patch, it's to remove a redundant
LOOP_VINFO_FULLY_MASKED_P condition check which will be
checked in vect_use_loop_mask_for_alignment_p.

Is it OK for trunk?

BR,
Kewen
-
gcc/ChangeLog:

* tree-vect-loop.c (vect_transform_loop): Remove the redundant
LOOP_VINFO_FULLY_MASKED_P check.

diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index dba230f6320..5e9e25add73 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -8913,8 +8913,7 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple 
*loop_vectorized_call)

   split_edge (loop_preheader_edge (loop));

-  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
-  && vect_use_loop_mask_for_alignment_p (loop_vinfo))
+  if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
 /* This will deal with any possible peeling.  */
 vect_prepare_for_masked_peels (loop_vinfo);


Re: [PATCH][PR 97506] Simplify trivial vcond_expr in expander.

2020-10-21 Thread Jakub Jelinek via Gcc-patches
On Wed, Oct 21, 2020 at 02:29:07PM +0800, Hongtao Liu via Gcc-patches wrote:
> gcc/ChangeLog:
> 
> PR target/97506
> * config/i386/i386-expand.c (ix86_expand_sse_movcc): Move
> op_true to dest directly When op_true equals op_false,

Lowercase when in the middle of sentence.  Use . instead of , at the end.

> --- a/gcc/config/i386/i386-expand.c
> +++ b/gcc/config/i386/i386-expand.c
> @@ -3525,6 +3525,13 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx
> op_true, rtx op_false)
>machine_mode mode = GET_MODE (dest);
>machine_mode cmpmode = GET_MODE (cmp);
> 
> +  /* Simplify trivial vcond_expr to avoid ICE error in pr97506.  */

There is no such thing as vcond_expr, I'd say use VEC_COND_EXPR instead.
Please change ICE error to just ICE, ICE stands for internal compiler error,
so the error word is in there already.

Otherwise LGTM.

> +  if (rtx_equal_p (op_true, op_false))
> +{
> +  emit_move_insn (dest, op_true);
> +  return;
> +}
> +
>/* In AVX512F the result of comparison is an integer mask.  */
>bool maskcmp = mode != cmpmode && ix86_valid_mask_cmp_mode (mode);
> 
> diff --git a/gcc/testsuite/gcc.target/i386/pr97506.c
> b/gcc/testsuite/gcc.target/i386/pr97506.c
> new file mode 100644
> index 000..74714cfab2c
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr97506.c
> @@ -0,0 +1,19 @@
> +/* PR target/97506  */
> +/* { dg-do compile } */
> +/* { dg-options "-Og -finline-functions-called-once -fno-tree-ccp
> -mavx512vbmi -mavx512vl" } */
> +
> +typedef unsigned char __attribute__ ((__vector_size__ (16))) U;
> +typedef int __attribute__ ((__vector_size__ (4))) V;
> +U u;
> +
> +void
> +bar (int i, V v)
> +{
> +  u += (char) i & (char) i > (U){};
> +}
> +
> +void
> +foo (void)
> +{
> +  bar (0, (V){});
> +}
> -- 
> 2.18.1
> 
> 
> -- 
> BR,
> Hongtao

Jakub



Re: PING [PATCH] Enable GCC support for Intel Key Locker extension

2020-10-21 Thread Hongyu Wang via Gcc-patches
Hi,

> IIRC, adding a new regclass is O(n^2), so it should be avoided. I
> think that the new patterns should follow the same path as vzeroall
> and vzeroupper patterns, where we emit the pattern with explicit hard
> regs.
>
> BTW: We do have SSE_FIRST_REG class, but this class was added to solve
> some reload problems in the past by marking %xmm0 as likely spilled.

Thanks for your suggestion, we have removed the register classes and
constraints, and
set explicit sse hard registers in the expander. The corresponding patterns
are also adjusted.

Updated and rebased patch.


Uros Bizjak  于2020年10月14日周三 下午11:56写道:

> Hello!
>
> > This patch is about to support Intel Key Locker extension.
> >
> > Key Locker provides a mechanism to encrypt and decrypt data with an AES
> key without having access to the raw key value.
> >
> > For more details, please refer to
>
> https://software.intel.com/content/dam/develop/external/us/en/documents/343965-intel-key-locker-specification.pdf
> .
> >
> > Bootstrap ok, regression test on i386/x86 backend is ok.
> >
> > OK for master?
>
> @@ -1414,6 +1418,13 @@ enum reg_class
>FP_TOP_REG, FP_SECOND_REG, /* %st(0) %st(1) */
>FLOAT_REGS,
>SSE_FIRST_REG,
> +  SSE_SECOND_REG,
> +  SSE_THIRD_REG,
> +  SSE_FOURTH_REG,
> +  SSE_FIFTH_REG,
> +  SSE_SIXTH_REG,
> +  SSE_SEVENTH_REG,
> +  SSE_EIGHTH_REG,
>NO_REX_SSE_REGS,
>SSE_REGS,
>ALL_SSE_REGS,
> @@ -1474,6 +1485,13 @@ enum reg_class
> "FP_TOP_REG", "FP_SECOND_REG", \
> "FLOAT_REGS", \
> "SSE_FIRST_REG", \
> +   "SSE_SECOND_REG", \
> +   "SSE_THIRD_REG", \
> +   "SSE_FOURTH_REG", \
> +   "SSE_FIFTH_REG", \
> +   "SSE_SIXTH_REG", \
> +   "SSE_SEVENTH_REG", \
> +   "SSE_EIGHTH_REG", \
> "NO_REX_SSE_REGS", \
> "SSE_REGS", \
> "ALL_SSE_REGS", \
> @@ -1513,6 +1531,13 @@ enum reg_class
>   { 0x200,0x0,   0x0 }, /* FP_SECOND_REG */ \
>  { 0xff00,0x0,   0x0 }, /* FLOAT_REGS */ \
>{ 0x10,0x0,   0x0 }, /* SSE_FIRST_REG */ \
> +  { 0x20,0x0,   0x0 }, /* SSE_SECOND_REG */ \
> +  { 0x40,0x0,   0x0 }, /* SSE_THIRD_REG */ \
> +  { 0x80,0x0,   0x0 }, /* SSE_FOURTH_REG */ \
> + { 0x100,0x0,   0x0 }, /* SSE_FIFTH_REG */ \
> + { 0x200,0x0,   0x0 }, /* SSE_SIXTH_REG*/ \
> + { 0x400,0x0,   0x0 }, /* SSE_SEVENTH_REG */ \
> + { 0x800,0x0,   0x0 }, /* SSE_EIGHTH_REG */ \
>   { 0xff0,0x0,   0x0 }, /* NO_REX_SSE_REGS */ \
>   { 0xff0,0xff000,   0x0 }, /* SSE_REGS */ \
>   { 0xff0, 0xf000,   0xf }, /* ALL_SSE_REGS */ \
>
> IIRC, adding a new regclass is O(n^2), so it should be avoided. I
> think that the new patterns should follow the same path as vzeroall
> and vzeroupper patterns, where we emit the pattern with explicit hard
> regs.
>
> BTW: We do have SSE_FIRST_REG class, but this class was added to solve
> some reload problems in the past by marking %xmm0 as likely spilled.
>
> Uros.
>
From 6fcb89ab7f51de70baca12e46a14fb2d1fed67d5 Mon Sep 17 00:00:00 2001
From: liuhongt 
Date: Thu, 5 Mar 2020 17:36:02 +0800
Subject: [PATCH] Enable GCC to support Intel Key Locker ISA

gcc/ChangeLog

2018-12-15  Xuepeng Guo  

	* common/config/i386/cpuinfo.h (get_available_features):
	Detect KL, AESKLE and WIDEKL features.
	* common/config/i386/i386-common.c
	(OPTION_MASK_ISA_KL_SET): New.
	(OPTION_MASK_ISA_WIDEKL_SET): Likewise.
	(OPTION_MASK_ISA_KL_UNSET): Likewise.
	(OPTION_MASK_ISA_WIDEKL_UNSET): Likewise.
	(OPTION_MASK_ISA2_AVX2_UNSET): Likewise.
	(OPTION_MASK_ISA2_AVX_UNSET): Likewise.
	(OPTION_MASK_ISA2_SSE4_2_UNSET): Likewise.
	(OPTION_MASK_ISA2_SSE4_1_UNSET): Likewise.
	(OPTION_MASK_ISA2_SSE4_UNSET): Likewise.
	(OPTION_MASK_ISA2_SSSE3_UNSET): Likewise.
	(OPTION_MASK_ISA2_SSE3_UNSET): Likewise.
	(OPTION_MASK_ISA2_SSE2_UNSET): Likewise.
	(OPTION_MASK_ISA2_SSE_UNSET): Likewise.
	(ix86_handle_option): Handle kl and widekl, add dependency chain
	for KL and SSE2.
	* common/config/i386/i386-cpuinfo.h (enum processor_features):
	(FEATURE_KL, FEATURE_AESKLE, FEATURE_WIDEKL): New.
	* common/config/i386/i386-isas.h: Add ISA_NAMES_TABLE_ENTRY
	for KL, AESKLE and WIDEKL.
	* config.gcc: Add keylockerintrin.h.
	* doc/invoke.texi: Document new option -mkl and -mwidekl.
	* doc/extend.texi: Document kl and widekl.
	* config/i386/cpuid.h (bit_KL, bit_AESKLE, bit_WIDEKL): New.
	* config/i386/i386-builtin-types.def ((UINT, UINT, V2DI, V2DI, PVOID),
	(UINT, UINT, V2DI, PVOID), (VOID, V2DI, V2DI, V2DI, UINT),
	(UINT8, PV2DI, V2DI, PCVOID), (UINT8, PV2DI, PCV2DI, PCVOID)): New
	function types.
	* config/i386/i386-builtin.def: Add
	__builtin_ia32_loadiwkey,
	__builtin_ia32_aesdec128kl_u8,
	__builtin_ia32_aesdec256kl_u8,
	__builtin_ia32_aesenc128kl_u8,
	__builtin_ia32_aesenc256kl_u8,
	__builtin_ia32_aesdecwide128kl_u8,
	__builtin_ia32_aesdecwide256kl_u8,
	__builtin_ia32_aesencwide128kl_u8,
	__builtin_ia32_aesencwide256kl_u8,
	__builtin_ia32_encodekey128_u32,
	__builtin_ia32_encodekey256_u32.
	* config

[PATCH 1/2] ASAN: Support detect_invalid_pointer_pairs=1 with detect_stack_use_after_return=1

2020-10-21 Thread Martin Liška

Do not crash when AsanThread::GetStackVariableShadowStart does not find
a variable for a pointer on a shadow stack.

Cherry-pick from ad2be02a833e56f7fe280797280b219eb3312621.

Differential Revision: https://reviews.llvm.org/D89552

(cherry picked from commit b69f33f477b9ac38af3c39465600ae74a3554878)
---
 libsanitizer/asan/asan_thread.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/libsanitizer/asan/asan_thread.cpp 
b/libsanitizer/asan/asan_thread.cpp
index 6734d9a1668..cb374b28622 100644
--- a/libsanitizer/asan/asan_thread.cpp
+++ b/libsanitizer/asan/asan_thread.cpp
@@ -366,7 +366,9 @@ uptr AsanThread::GetStackVariableShadowStart(uptr addr) {
 bottom = stack_bottom();
   } else if (has_fake_stack()) {
 bottom = fake_stack()->AddrIsInFakeStack(addr);
-CHECK(bottom);
+if (bottom == 0) {
+  return 0;
+}
   } else {
 return 0;
   }
--
2.28.0




[PATCH 2/2] libsanitizer: add test-case

2020-10-21 Thread Martin Liška

gcc/testsuite/ChangeLog:

PR sanitizer/97414
* g++.dg/asan/pr97414.C: New test.

(cherry picked from commit 6c5b08a2ca935c5db68e79d33e5c5b752252115c)
---
 gcc/testsuite/g++.dg/asan/pr97414.C | 19 +++
 1 file changed, 19 insertions(+)
 create mode 100644 gcc/testsuite/g++.dg/asan/pr97414.C

diff --git a/gcc/testsuite/g++.dg/asan/pr97414.C 
b/gcc/testsuite/g++.dg/asan/pr97414.C
new file mode 100644
index 000..6ea03906daa
--- /dev/null
+++ b/gcc/testsuite/g++.dg/asan/pr97414.C
@@ -0,0 +1,19 @@
+/* PR sanitizer/97414 */
+/* { dg-do run } */
+/* { dg-set-target-env-var ASAN_OPTIONS 
"detect_invalid_pointer_pairs=1:halt_on_error=1,detect_stack_use_after_return=1"
 } */
+/* { dg-options "-fsanitize=address,pointer-compare,pointer-subtract" } */
+
+[[gnu::noinline]] auto pointer_diff(const int *begin, const int *end) {
+  return end - begin;
+}
+
+int main() {
+  constexpr auto size = (2048 / sizeof(int)) + 1;
+
+  auto buf = new int[size];
+  auto end = buf + size;
+  pointer_diff(end, buf);
+  delete[] buf;
+
+  return 0;
+}
--
2.28.0



Re: [PATCH] Saturate overflows return from SCEV in ranger.

2020-10-21 Thread Richard Biener via Gcc-patches
On Wed, Oct 21, 2020 at 10:50 AM Aldy Hernandez  wrote:
>
>
>
> On 10/21/20 9:59 AM, Richard Biener wrote:
>
> >>> /* Even for valid range info, sometimes overflow flag will leak in.
> >>>As GIMPLE IL should have no constants with TREE_OVERFLOW set, we
> >>>drop them.  */
> >>> if (TREE_OVERFLOW_P (*min))
> >>>   *min = drop_tree_overflow (*min);
> >>> if (TREE_OVERFLOW_P (*max))
> >>>   *max = drop_tree_overflow (*max);
> >>
> >> Interesting.
> >>
> >> If these values "leaked" in.  Should they have been fixed at the source,
> >> instead of after the fact?  You mention below that every use of
> >> TREE_OVERFLOW in the ME is a bug, should we clean them up before
> >> arriving in gimple, or are there legitimate uses of it?
> >
> > There are no legitimate uses in GIMPLE.  They are (ab-)used by
> > GENERIC folding for propagating overflow (also used in FE
> > diagnostics).  Generally the better way is to use wide_ints overflow
> > handling which also "sticks".
>
> If there are no legitimate uses, perhaps we should drop them altogether
> as we go into GIMPLE??

We do, but they tend to creep back in by infrastructure using the
GENERIC folder.  Some key places make sure to clear them (I've tracked
down a lot of them).  But I was never brave enough to assert they do
not end up in IL operands ;)

>  I vaguely recall seeing them leak into
> value_range's.
> >
> >>>
> >>> and the code explicitly checks for overflow, doing range adjustments
> >>> accordingly.
> >>
> >> Well, not all overflows are adjusted:
> >>
> >> /* Like in PR19590, scev can return a constant function.  */
> >> if (is_gimple_min_invariant (chrec))
> >>   {
> >> *min = *max = chrec;
> >> return true;
> >>   }
> >>
> >> Are these min/max not adjusted for overflow by design, or is this an
> >> oversight?
> >
> > Ah, that's an oversight here.  And yes, "fixing" it in scalar evolution
> > analysis itself (dropping the flag there) would be best
>
> Excellent.  I've pushed the patch below after testing it.
>
> Thanks again.
> Aldy
>
>  Adjust overflow for invariants in bounds_of_var_in_loop.
>
>  Invariants returned from SCEV can have TREE_OVERFLOW set.  Clear the
>  overflow as we do with the rest of the values returned from this
>  function.
>
>  gcc/ChangeLog:
>
>  * gimple-range.cc
> (gimple_ranger::range_of_ssa_name_with_loop_info):
>  Remove TREE_OVERFLOW special case.
>  * vr-values.c (bounds_of_var_in_loop): Adjust overflow for
>  invariants.
>
> diff --git a/gcc/gimple-range.cc b/gcc/gimple-range.cc
> index b790d62d75f..c5520e0700b 100644
> --- a/gcc/gimple-range.cc
> +++ b/gcc/gimple-range.cc
> @@ -1156,9 +1156,9 @@ gimple_ranger::range_of_ssa_name_with_loop_info
> (irange &r, tree name,
> // ?? We could do better here.  Since MIN/MAX can only be an
> // SSA, SSA +- INTEGER_CST, or INTEGER_CST, we could easily call
> // the ranger and solve anything not an integer.
> -  if (TREE_CODE (min) != INTEGER_CST || TREE_OVERFLOW (min))
> +  if (TREE_CODE (min) != INTEGER_CST)
> min = vrp_val_min (type);
> -  if (TREE_CODE (max) != INTEGER_CST || TREE_OVERFLOW (max))
> +  if (TREE_CODE (max) != INTEGER_CST)
> max = vrp_val_max (type);
> r.set (min, max);
>   }
> diff --git a/gcc/vr-values.c b/gcc/vr-values.c
> index cc0ddca2bd3..7a0e70eab64 100644
> --- a/gcc/vr-values.c
> +++ b/gcc/vr-values.c
> @@ -1844,7 +1844,7 @@ bounds_of_var_in_loop (tree *min, tree *max,
> range_query *query,
> if (is_gimple_min_invariant (chrec))
>   {
> *min = *max = chrec;
> -  return true;
> +  goto fix_overflow;
>   }
>
> if (TREE_CODE (chrec) != POLYNOMIAL_CHREC)
> @@ -1964,6 +1964,7 @@ bounds_of_var_in_loop (tree *min, tree *max,
> range_query *query,
> else
>   *min = init;
>
> + fix_overflow:
> /* Even for valid range info, sometimes overflow flag will leak in.
>As GIMPLE IL should have no constants with TREE_OVERFLOW set, we
>drop them.  */
>


[PATCH] aarch64: Add vcopy(q)__lane(q)_bf16 intrinsics

2020-10-21 Thread Andrea Corallo via Gcc-patches
Hi all,

I'd like to submit the following patch implementing the bfloat16_t
neon related copy intrinsics: vcopy_lane_bf16, vcopyq_lane_bf16,
vcopyq_laneq_bf16, vcopy_laneq_bf16.

Please refer to:
ACLE 
ISA  

Regtested and bootstrapped.

Regards

  Andrea

>From d1335c0f49df849b87ee522e9507023113051839 Mon Sep 17 00:00:00 2001
From: Andrea Corallo 
Date: Thu, 8 Oct 2020 12:29:00 +0200
Subject: [PATCH] aarch64: Add vcopy(q)__lane(q)_bf16 intrinsics

gcc/ChangeLog

2020-10-20  Andrea Corallo  

* config/aarch64/arm_neon.h (vcopy_lane_bf16, vcopyq_lane_bf16)
(vcopyq_laneq_bf16, vcopy_laneq_bf16): New intrinsics.

gcc/testsuite/ChangeLog

2020-10-20  Andrea Corallo  

* gcc.target/aarch64/advsimd-intrinsics/bf16_vect_copy_lane_1.c:
New test.
* gcc.target/aarch64/advsimd-intrinsics/vcopy_lane_bf16_indices_1.c:
Likewise.
* gcc.target/aarch64/advsimd-intrinsics/vcopy_lane_bf16_indices_2.c:
Likewise.
* gcc.target/aarch64/advsimd-intrinsics/vcopy_laneq_bf16_indices_1.c:
Likewise.
* gcc.target/aarch64/advsimd-intrinsics/vcopy_laneq_bf16_indices_2.c:
Likewise.
* gcc.target/aarch64/advsimd-intrinsics/vcopyq_lane_bf16_indices_1.c:
Likewise.
* gcc.target/aarch64/advsimd-intrinsics/vcopyq_lane_bf16_indices_2.c:
Likewise.
* gcc.target/aarch64/advsimd-intrinsics/vcopyq_laneq_bf16_indices_1.c:
Likewise.
* gcc.target/aarch64/advsimd-intrinsics/vcopyq_laneq_bf16_indices_2.c:
Likewise.
---
 gcc/config/aarch64/arm_neon.h | 36 +++
 .../bf16_vect_copy_lane_1.c   | 32 +
 .../vcopy_lane_bf16_indices_1.c   | 18 ++
 .../vcopy_lane_bf16_indices_2.c   | 18 ++
 .../vcopy_laneq_bf16_indices_1.c  | 17 +
 .../vcopy_laneq_bf16_indices_2.c  | 17 +
 .../vcopyq_lane_bf16_indices_1.c  | 17 +
 .../vcopyq_lane_bf16_indices_2.c  | 17 +
 .../vcopyq_laneq_bf16_indices_1.c | 17 +
 .../vcopyq_laneq_bf16_indices_2.c | 17 +
 10 files changed, 206 insertions(+)
 create mode 100644 
gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_vect_copy_lane_1.c
 create mode 100644 
gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcopy_lane_bf16_indices_1.c
 create mode 100644 
gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcopy_lane_bf16_indices_2.c
 create mode 100644 
gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcopy_laneq_bf16_indices_1.c
 create mode 100644 
gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcopy_laneq_bf16_indices_2.c
 create mode 100644 
gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcopyq_lane_bf16_indices_1.c
 create mode 100644 
gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcopyq_lane_bf16_indices_2.c
 create mode 100644 
gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcopyq_laneq_bf16_indices_1.c
 create mode 100644 
gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcopyq_laneq_bf16_indices_2.c

diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 0088ea9896f..9c801661775 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -35155,6 +35155,42 @@ vcvtq_high_bf16_f32 (bfloat16x8_t __inactive, 
float32x4_t __a)
   return __builtin_aarch64_bfcvtn2v8bf (__inactive, __a);
 }
 
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcopy_lane_bf16 (bfloat16x4_t __a, const int __lane1,
+bfloat16x4_t __b, const int __lane2)
+{
+  return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
+ __a, __lane1);
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcopyq_lane_bf16 (bfloat16x8_t __a, const int __lane1,
+ bfloat16x4_t __b, const int __lane2)
+{
+  return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
+ __a, __lane1);
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcopy_laneq_bf16 (bfloat16x4_t __a, const int __lane1,
+ bfloat16x8_t __b, const int __lane2)
+{
+  return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
+ __a, __lane1);
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcopyq_laneq_bf16 (bfloat16x8_t __a, const int __lane1,
+  bfloat16x8_t __b, const int __lane2)
+{
+  return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
+ __a, __lane1);
+}

Re: [PATCH] vect: Remove redundant LOOP_VINFO_FULLY_MASKED_P

2020-10-21 Thread Richard Biener via Gcc-patches
On Wed, Oct 21, 2020 at 10:58 AM Kewen.Lin via Gcc-patches
 wrote:
>
> Hi,
>
> This is a very trivial patch, it's to remove a redundant
> LOOP_VINFO_FULLY_MASKED_P condition check which will be
> checked in vect_use_loop_mask_for_alignment_p.
>
> Is it OK for trunk?

OK.

>
> BR,
> Kewen
> -
> gcc/ChangeLog:
>
> * tree-vect-loop.c (vect_transform_loop): Remove the redundant
> LOOP_VINFO_FULLY_MASKED_P check.
>
> diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
> index dba230f6320..5e9e25add73 100644
> --- a/gcc/tree-vect-loop.c
> +++ b/gcc/tree-vect-loop.c
> @@ -8913,8 +8913,7 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple 
> *loop_vectorized_call)
>
>split_edge (loop_preheader_edge (loop));
>
> -  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
> -  && vect_use_loop_mask_for_alignment_p (loop_vinfo))
> +  if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
>  /* This will deal with any possible peeling.  */
>  vect_prepare_for_masked_peels (loop_vinfo);


Re: [PATCH][PR 97506] Simplify trivial vcond_expr in expander.

2020-10-21 Thread Hongtao Liu via Gcc-patches
On Wed, Oct 21, 2020 at 5:07 PM Jakub Jelinek  wrote:
>
> On Wed, Oct 21, 2020 at 02:29:07PM +0800, Hongtao Liu via Gcc-patches wrote:
> > gcc/ChangeLog:
> >
> > PR target/97506
> > * config/i386/i386-expand.c (ix86_expand_sse_movcc): Move
> > op_true to dest directly When op_true equals op_false,
>
> Lowercase when in the middle of sentence.  Use . instead of , at the end.
>
> > --- a/gcc/config/i386/i386-expand.c
> > +++ b/gcc/config/i386/i386-expand.c
> > @@ -3525,6 +3525,13 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx
> > op_true, rtx op_false)
> >machine_mode mode = GET_MODE (dest);
> >machine_mode cmpmode = GET_MODE (cmp);
> >
> > +  /* Simplify trivial vcond_expr to avoid ICE error in pr97506.  */
>
> There is no such thing as vcond_expr, I'd say use VEC_COND_EXPR instead.
> Please change ICE error to just ICE, ICE stands for internal compiler error,
> so the error word is in there already.
>
> Otherwise LGTM.
>

Thanks for the review, I'll commit the patch with the above adjustments.

> > +  if (rtx_equal_p (op_true, op_false))
> > +{
> > +  emit_move_insn (dest, op_true);
> > +  return;
> > +}
> > +
> >/* In AVX512F the result of comparison is an integer mask.  */
> >bool maskcmp = mode != cmpmode && ix86_valid_mask_cmp_mode (mode);
> >
> > diff --git a/gcc/testsuite/gcc.target/i386/pr97506.c
> > b/gcc/testsuite/gcc.target/i386/pr97506.c
> > new file mode 100644
> > index 000..74714cfab2c
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr97506.c
> > @@ -0,0 +1,19 @@
> > +/* PR target/97506  */
> > +/* { dg-do compile } */
> > +/* { dg-options "-Og -finline-functions-called-once -fno-tree-ccp
> > -mavx512vbmi -mavx512vl" } */
> > +
> > +typedef unsigned char __attribute__ ((__vector_size__ (16))) U;
> > +typedef int __attribute__ ((__vector_size__ (4))) V;
> > +U u;
> > +
> > +void
> > +bar (int i, V v)
> > +{
> > +  u += (char) i & (char) i > (U){};
> > +}
> > +
> > +void
> > +foo (void)
> > +{
> > +  bar (0, (V){});
> > +}
> > --
> > 2.18.1
> >
> >
> > --
> > BR,
> > Hongtao
>
> Jakub
>


-- 
BR,
Hongtao


[PATCH] tree-optimization/97500 - avoid SLP backedges for inductions

2020-10-21 Thread Richard Biener
Inductions are not vectorized as cycle but materialized from SCEV data.
Filling in backedge SLP nodes confuses this process.

Bootstrapped / tested on x86_64-unknown-linux-gnu, pushed.

2020-10-21  Richard Biener  

PR tree-optimization/97500
* tree-vect-slp.c (vect_analyze_slp_backedges): Do not
fill backedges for inductions.

* gfortran.dg/pr97500.f90: New testcase.
---
 gcc/testsuite/gfortran.dg/pr97500.f90 | 35 +++
 gcc/tree-vect-slp.c   |  6 +
 2 files changed, 41 insertions(+)
 create mode 100644 gcc/testsuite/gfortran.dg/pr97500.f90

diff --git a/gcc/testsuite/gfortran.dg/pr97500.f90 
b/gcc/testsuite/gfortran.dg/pr97500.f90
new file mode 100644
index 000..d63b8616ad6
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/pr97500.f90
@@ -0,0 +1,35 @@
+! { dg-do run }
+! { dg-additional-options "-ftree-vectorize -fno-guess-branch-probability" }
+module testmod
+  implicit none
+
+  contains
+
+  subroutine foo(n)
+integer, intent(in) :: n
+real :: r(0:n,-n:n), a(0:n,-n:n), dj
+integer :: k, j
+
+! initialize with some dummy values
+do j = -n, n
+  a(:, j) = j
+  r(:,j) = j + 1
+end do
+
+! here be dragons
+do k = 0, n
+  dj = r(k, k - 2) * a(k, k - 2)
+  r(k,k) = a(k, k - 1) * dj
+enddo
+
+if (r(0,0) .ne. -2.) STOP 1
+
+  end subroutine
+
+end module
+
+program test
+  use testmod
+  implicit none
+  call foo(5)
+end program
diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
index 0c1447e7aa0..e3f94cb8a2d 100644
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -2380,6 +2380,12 @@ vect_analyze_slp_backedges (vec_info *vinfo, slp_tree 
node,
 if (child)
   vect_analyze_slp_backedges (vinfo, child, bst_map, visited);
 
+  /* Inductions are not vectorized by vectorizing their defining cycle
+ but by materializing the values from SCEV data.  */
+  if (STMT_VINFO_DEF_TYPE (SLP_TREE_REPRESENTATIVE (node))
+  == vect_induction_def)
+return;
+
   if (gphi *phi = dyn_cast  (SLP_TREE_REPRESENTATIVE (node)->stmt))
 for (unsigned i = 0; i < gimple_phi_num_args (phi); ++i)
   {
-- 
2.26.2


[PATCH] LTO: get_section: add new argument

2020-10-21 Thread Martin Liška

Hey.

During partial linking we ipa_prop_write_jump_functions twice from 2 IPA
pass (fnsummary and cp). That produces 2 compressed blocks in an ELF section
and then zstd complains as sections size does not correspond to the compressed
stream.

I'm adding both sanity check changes and the fix in ipa-prop.c.
I guess Martin and Honza can explain it in more detail?

Patch can bootstrap on x86_64-linux-gnu and survives regression tests.

Ready to be installed?
Thanks,
Martin

gcc/ChangeLog:

PR lto/97508
* langhooks.c (lhd_begin_section): Call get_section with
not_existing = true.
* output.h (get_section): Add new argument.
* varasm.c (get_section): Fail when NOT_EXISTING is true
and a section already exists.
* ipa-prop.c (ipa_prop_write_jump_functions): Do not stream
twice.
---
 gcc/ipa-prop.c  |  9 +
 gcc/langhooks.c |  2 +-
 gcc/output.h|  3 ++-
 gcc/varasm.c| 12 ++--
 4 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/gcc/ipa-prop.c b/gcc/ipa-prop.c
index a848f1db95e..d43fd2eee4f 100644
--- a/gcc/ipa-prop.c
+++ b/gcc/ipa-prop.c
@@ -5067,6 +5067,13 @@ ipa_prop_write_jump_functions (void)
   lto_symtab_encoder_iterator lsei;
   lto_symtab_encoder_t encoder;
 
+  /* The function can be called from 2 IPA_PASSES: "fnsummary" and "cp"

+ which happens in partial linking (-r).  Prevent double streaming
+ as reported in PR97508.  */
+  static bool already_stremed = false;
+  if (already_stremed)
+return;
+
   if (!ipa_node_params_sum || !ipa_edge_args_sum)
 return;
 
@@ -5096,6 +5103,8 @@ ipa_prop_write_jump_functions (void)

   streamer_write_char_stream (ob->main_stream, 0);
   produce_asm (ob, NULL);
   destroy_output_block (ob);
+
+  already_stremed = true;
 }
 
 /* Read section in file FILE_DATA of length LEN with data DATA.  */

diff --git a/gcc/langhooks.c b/gcc/langhooks.c
index 8819a8859d4..d82f54251fd 100644
--- a/gcc/langhooks.c
+++ b/gcc/langhooks.c
@@ -790,7 +790,7 @@ lhd_begin_section (const char *name)
 saved_section = text_section;
 
   /* Create a new section and switch to it.  */

-  section = get_section (name, SECTION_DEBUG | SECTION_EXCLUDE, NULL);
+  section = get_section (name, SECTION_DEBUG | SECTION_EXCLUDE, NULL, true);
   switch_to_section (section);
 }
 
diff --git a/gcc/output.h b/gcc/output.h

index eb253c50329..2f2f1697fd8 100644
--- a/gcc/output.h
+++ b/gcc/output.h
@@ -523,7 +523,8 @@ extern GTY(()) bool in_cold_section_p;
 
 extern section *get_unnamed_section (unsigned int, void (*) (const void *),

 const void *);
-extern section *get_section (const char *, unsigned int, tree);
+extern section *get_section (const char *, unsigned int, tree,
+bool not_existing = false);
 extern section *get_named_section (tree, const char *, int);
 extern section *get_variable_section (tree, bool);
 extern void place_block_symbol (rtx);
diff --git a/gcc/varasm.c b/gcc/varasm.c
index ea0b59cf44a..207c9b077d1 100644
--- a/gcc/varasm.c
+++ b/gcc/varasm.c
@@ -277,10 +277,12 @@ get_noswitch_section (unsigned int flags, 
noswitch_section_callback callback)
 }
 
 /* Return the named section structure associated with NAME.  Create

-   a new section with the given fields if no such structure exists.  */
+   a new section with the given fields if no such structure exists.
+   When NOT_EXISTING, then fail if the section already exists.  */
 
 section *

-get_section (const char *name, unsigned int flags, tree decl)
+get_section (const char *name, unsigned int flags, tree decl,
+bool not_existing)
 {
   section *sect, **slot;
 
@@ -297,6 +299,12 @@ get_section (const char *name, unsigned int flags, tree decl)

 }
   else
 {
+  if (not_existing)
+   {
+ error ("Section already exists: %qs", name);
+ gcc_unreachable ();
+   }
+
   sect = *slot;
   /* It is fine if one of the sections has SECTION_NOTYPE as long as
  the other has none of the contrary flags (see the logic at the end
--
2.28.0



Re: [PATCH] LTO: get_section: add new argument

2020-10-21 Thread Jan Hubicka
> Hey.
> 
> During partial linking we ipa_prop_write_jump_functions twice from 2 IPA
> pass (fnsummary and cp). That produces 2 compressed blocks in an ELF section
> and then zstd complains as sections size does not correspond to the compressed
> stream.
> 
> I'm adding both sanity check changes and the fix in ipa-prop.c.
> I guess Martin and Honza can explain it in more detail?
> 
> Patch can bootstrap on x86_64-linux-gnu and survives regression tests.
> 
> Ready to be installed?
> Thanks,
> Martin
> 
> gcc/ChangeLog:
> 
>   PR lto/97508
>   * langhooks.c (lhd_begin_section): Call get_section with
>   not_existing = true.
>   * output.h (get_section): Add new argument.
>   * varasm.c (get_section): Fail when NOT_EXISTING is true
>   and a section already exists.
>   * ipa-prop.c (ipa_prop_write_jump_functions): Do not stream
>   twice.

I think the streaming should happen only from ipa-fnsummary.
Originally ipa-prop was ipa-cp only, then indirect inlining was added,
but these days we have a specialized analysis pass and thus ipa-prop
should be integrated into it.

Honza
> ---
>  gcc/ipa-prop.c  |  9 +
>  gcc/langhooks.c |  2 +-
>  gcc/output.h|  3 ++-
>  gcc/varasm.c| 12 ++--
>  4 files changed, 22 insertions(+), 4 deletions(-)
> 
> diff --git a/gcc/ipa-prop.c b/gcc/ipa-prop.c
> index a848f1db95e..d43fd2eee4f 100644
> --- a/gcc/ipa-prop.c
> +++ b/gcc/ipa-prop.c
> @@ -5067,6 +5067,13 @@ ipa_prop_write_jump_functions (void)
>lto_symtab_encoder_iterator lsei;
>lto_symtab_encoder_t encoder;
> +  /* The function can be called from 2 IPA_PASSES: "fnsummary" and "cp"
> + which happens in partial linking (-r).  Prevent double streaming
> + as reported in PR97508.  */
> +  static bool already_stremed = false;
> +  if (already_stremed)
> +return;
> +
>if (!ipa_node_params_sum || !ipa_edge_args_sum)
>  return;
> @@ -5096,6 +5103,8 @@ ipa_prop_write_jump_functions (void)
>streamer_write_char_stream (ob->main_stream, 0);
>produce_asm (ob, NULL);
>destroy_output_block (ob);
> +
> +  already_stremed = true;
>  }
>  /* Read section in file FILE_DATA of length LEN with data DATA.  */
> diff --git a/gcc/langhooks.c b/gcc/langhooks.c
> index 8819a8859d4..d82f54251fd 100644
> --- a/gcc/langhooks.c
> +++ b/gcc/langhooks.c
> @@ -790,7 +790,7 @@ lhd_begin_section (const char *name)
>  saved_section = text_section;
>/* Create a new section and switch to it.  */
> -  section = get_section (name, SECTION_DEBUG | SECTION_EXCLUDE, NULL);
> +  section = get_section (name, SECTION_DEBUG | SECTION_EXCLUDE, NULL, true);
>switch_to_section (section);
>  }
> diff --git a/gcc/output.h b/gcc/output.h
> index eb253c50329..2f2f1697fd8 100644
> --- a/gcc/output.h
> +++ b/gcc/output.h
> @@ -523,7 +523,8 @@ extern GTY(()) bool in_cold_section_p;
>  extern section *get_unnamed_section (unsigned int, void (*) (const void *),
>const void *);
> -extern section *get_section (const char *, unsigned int, tree);
> +extern section *get_section (const char *, unsigned int, tree,
> +  bool not_existing = false);
>  extern section *get_named_section (tree, const char *, int);
>  extern section *get_variable_section (tree, bool);
>  extern void place_block_symbol (rtx);
> diff --git a/gcc/varasm.c b/gcc/varasm.c
> index ea0b59cf44a..207c9b077d1 100644
> --- a/gcc/varasm.c
> +++ b/gcc/varasm.c
> @@ -277,10 +277,12 @@ get_noswitch_section (unsigned int flags, 
> noswitch_section_callback callback)
>  }
>  /* Return the named section structure associated with NAME.  Create
> -   a new section with the given fields if no such structure exists.  */
> +   a new section with the given fields if no such structure exists.
> +   When NOT_EXISTING, then fail if the section already exists.  */
>  section *
> -get_section (const char *name, unsigned int flags, tree decl)
> +get_section (const char *name, unsigned int flags, tree decl,
> +  bool not_existing)
>  {
>section *sect, **slot;
> @@ -297,6 +299,12 @@ get_section (const char *name, unsigned int flags, tree 
> decl)
>  }
>else
>  {
> +  if (not_existing)
> + {
> +   error ("Section already exists: %qs", name);
> +   gcc_unreachable ();
> + }
> +
>sect = *slot;
>/* It is fine if one of the sections has SECTION_NOTYPE as long as
>   the other has none of the contrary flags (see the logic at the end
> -- 
> 2.28.0
> 


Re: [patch] Introduce vxworks7r2 support for ppc and ppc64

2020-10-21 Thread Olivier Hainque
Hi Segher!

> On 20 Oct 2020, at 22:06, Segher Boessenkool  
> wrote:
> 
>> +# Wind River 7 post SR0600 is mostly like Linux so we setup
>> +# out config in a very similar fashion and adjust to a few
>> +# specificities.
> 
> "our config"?
> 
>> +   - Starting with VxWorks 7 (post SR600), the system environment
>> + was made extremely similar to GNU/Linux and this toolchain is
>> + builtin on top of the corresponding header files.  */
> 
> "built on top"?

Indeed.

>> +/
>> + * Common definitions first *
>> + /
> 
> We don't use such decorated comments in GCC.  But it is your header file
> of course :-)

Hmm, I’d really like to keep some visible separation for the
sections because it’s pretty dense overall and I think hard to
read without some high level hints about the general organization.

There are (a few, agreed :) instances of sectioning in other
sources, tree-core.h or tree-vectorizer.h for example, with a
different style though. I think I’ll adjust to one of these.

> I don't see anything wrong with the actual code itself, fwiw :-)

Great, feedback appreciated, thanks!

Next in line specific to rs6000 are a couple of suggestions
for updates in the testsuite (wrt fpic and dfp).

Regards,

Olivier



PATCH [DR2303][PR97453]

2020-10-21 Thread kamlesh kumar via Gcc-patches
gcc/cp/ChangeLog
---

2020-10-21  Kamlesh Kumar  

PR c++/97453
* pt.c (get_template_base): Implement DR2303,
Consider closest base while template
deduction when base of base also matches.

gcc/testsuite/ChangeLog
--

2020-10-21  Kamlesh Kumar  

* g++.dg/Drs/dr2303.C: New Test

--

As part of this patch I implemented a fix for the defect report below in CWG:
https://wg21.cmeerw.net/cwg/issue2303 .
Regression tested on x86_64 and did not find any failures.
Patch summary: Remove base of base from list of bases

Created a hash_set from the list of bases, then iterate over each
element of the hash_set, find its list of bases, and remove those from
the hash_set if present.
Finally, deduction succeeds if the hash_set contains only a single
element or is empty; otherwise deduction is ambiguous.
---
diff --git a/gcc/cp/pt.c b/gcc/cp/pt.c
index dc664ec3798..7adf461e108 100644
--- a/gcc/cp/pt.c
+++ b/gcc/cp/pt.c
@@ -22643,8 +22643,9 @@ static enum template_base_result
 get_template_base (tree tparms, tree targs, tree parm, tree arg,
 bool explain_p, tree *result)
 {
-  tree rval = NULL_TREE;
+  *result = NULL_TREE;
   tree binfo;
+  hash_set binfo_set;

   gcc_assert (RECORD_OR_UNION_CODE_P (TREE_CODE (arg)));

@@ -22659,31 +22660,51 @@ get_template_base (tree tparms, tree targs,
tree parm, tree arg,
   /* Walk in inheritance graph order.  The search order is not
  important, and this avoids multiple walks of virtual bases.  */
   for (binfo = TREE_CHAIN (binfo); binfo; binfo = TREE_CHAIN (binfo))
-{
-  tree r = try_class_unification (tparms, targs, parm,
-   BINFO_TYPE (binfo), explain_p);
-
-  if (r)
- {
-   /* If there is more than one satisfactory baseclass, then:
-
-[temp.deduct.call]
+ {
+   tree r = try_class_unification (tparms, targs, parm,
+   BINFO_TYPE (binfo), explain_p);
+   if (r)
+ {
+   binfo_set.add(r);
+ }
+ }

-   If they yield more than one possible deduced A, the type
-   deduction fails.
+  /* If there is more than one satisfactory baseclass, then:
+ [temp.deduct.call]
+  If they yield more than one possible deduced A, the type
+  deduction fails.
+ However, if there is a class C that is a (direct or indirect)
base class of
+ D and derived (directly or indirectly) from a class B and that would be a
+ valid deduced A, the deduced A cannot be B or pointer to B,
respectively.  */
+  for (hash_set::iterator it = binfo_set.begin();
+it != binfo_set.end(); ++it)
+{
+  binfo = TYPE_BINFO (*it);
+  for (binfo = TREE_CHAIN (binfo); binfo; binfo = TREE_CHAIN (binfo))
+{
+  tree r = try_class_unification (tparms, targs, parm,
+  BINFO_TYPE (binfo), explain_p);
+  if (r && binfo_set.contains(r))
+{
+  binfo_set.remove(r);
+}
+}
+}

-  applies.  */
-   if (rval && !same_type_p (r, rval))
- {
-   *result = NULL_TREE;
-   return tbr_ambiguous_baseclass;
- }
+  if (binfo_set.elements() > 1)
+{
+  return tbr_ambiguous_baseclass;
+}

-   rval = r;
- }
+  if (binfo_set.is_empty())
+{
+  return tbr_success;
 }

-  *result = rval;
+  if (binfo_set.elements() == 1)
+{
+  *result = *binfo_set.begin();
+}
   return tbr_success;
 }

diff --git a/gcc/testsuite/g++.dg/DRs/dr2303.C
b/gcc/testsuite/g++.dg/DRs/dr2303.C
new file mode 100644
index 000..b4c23332358
--- /dev/null
+++ b/gcc/testsuite/g++.dg/DRs/dr2303.C
@@ -0,0 +1,20 @@
+// DR 2303
+// PR c++/97453
+// { dg-do compile { target c++11 } }
+
+template 
+struct A;
+template <>
+struct A<> {};
+template 
+struct A : A {};
+struct B : A {};
+
+template 
+void f(const A &) {
+  static_assert(sizeof...(T) == 2, "it should duduce to A");
+}
+
+void g() {
+  f(B{});
+}


./kamlesh


Re: [PATCH] LTO: get_section: add new argument

2020-10-21 Thread Martin Liška

On 10/21/20 1:17 PM, Martin Liška wrote:

On 10/21/20 12:06 PM, Jan Hubicka wrote:

I think the streaming should happen only from ipa-fnsummary.
Originally ipa-prop was ipa-cp only, then indirect inlining was added,
but these days we have a specialized analysis pass and thus ipa-prop
should be integrated into it.


All right, there's a WIP patch but it ICEs at various places:

gcc main.o
during IPA pass: fnsummary
lto1: internal compiler error: Segmentation fault
0xc909ff crash_signal
 /home/marxin/Programming/gcc/gcc/toplev.c:330
0x7788e6bf ???
 
/usr/src/debug/glibc-2.32-1.1.x86_64/signal/../sysdeps/unix/sysv/linux/x86_64/sigaction.c:0
0xa7cfbd hash_table_mod1(unsigned int, unsigned int)
 /home/marxin/Programming/gcc/gcc/hash-table.h:344
0xa7cfbd hash_table, ipa_node_params*, 
simple_hashmap_traits >, ipa_node_params*> >::hash_entry, 
false, xcallocator>::find_with_hash(int const&, unsigned int)
 /home/marxin/Programming/gcc/gcc/hash-table.h:911
0xa79216 hash_map, ipa_node_params*, 
simple_hashmap_traits >, ipa_node_params*> 
>::get(int const&)
 /home/marxin/Programming/gcc/gcc/hash-map.h:185
0xa79216 function_summary::get(cgraph_node*)
 /home/marxin/Programming/gcc/gcc/symbol-summary.h:163
0xa79216 inline_read_section
 /home/marxin/Programming/gcc/gcc/ipa-fnsummary.c:4314
0xa79ee0 ipa_fn_summary_read
 /home/marxin/Programming/gcc/gcc/ipa-fnsummary.c:4478
0xbc0ead ipa_read_summaries_1
 /home/marxin/Programming/gcc/gcc/passes.c:2844
0x7e31aa read_cgraph_and_symbols(unsigned int, char const**)
 /home/marxin/Programming/gcc/gcc/lto/lto-common.c:2919
0x7cb6e2 lto_main()
 /home/marxin/Programming/gcc/gcc/lto/lto.c:625

Can you please you or Martin finish the patch?
Thanks,
Martin


... adding missing patch.

Martin
>From 8c765ebad21da7f34a5038b4df8c4d29fb391055 Mon Sep 17 00:00:00 2001
From: Martin Liska 
Date: Wed, 21 Oct 2020 11:11:03 +0200
Subject: [PATCH] LTO: get_section: add new argument

gcc/ChangeLog:

	PR lto/97508
	* langhooks.c (lhd_begin_section): Call get_section with
	not_existing = true.
	* output.h (get_section): Add new argument.
	* varasm.c (get_section): Fail when NOT_EXISTING is true
	and a section already exists.
	* ipa-cp.c (ipcp_write_summary): Remove.
	(ipcp_read_summary): Likewise.
---
 gcc/ipa-cp.c| 20 ++--
 gcc/langhooks.c |  2 +-
 gcc/output.h|  3 ++-
 gcc/varasm.c| 12 ++--
 4 files changed, 15 insertions(+), 22 deletions(-)

diff --git a/gcc/ipa-cp.c b/gcc/ipa-cp.c
index 2152f9e5876..db87329bc0c 100644
--- a/gcc/ipa-cp.c
+++ b/gcc/ipa-cp.c
@@ -5943,22 +5943,6 @@ ipcp_generate_summary (void)
 ipa_analyze_node (node);
 }
 
-/* Write ipcp summary for nodes in SET.  */
-
-static void
-ipcp_write_summary (void)
-{
-  ipa_prop_write_jump_functions ();
-}
-
-/* Read ipcp summary.  */
-
-static void
-ipcp_read_summary (void)
-{
-  ipa_prop_read_jump_functions ();
-}
-
 namespace {
 
 const pass_data pass_data_ipa_cp =
@@ -5980,8 +5964,8 @@ public:
   pass_ipa_cp (gcc::context *ctxt)
 : ipa_opt_pass_d (pass_data_ipa_cp, ctxt,
 		  ipcp_generate_summary, /* generate_summary */
-		  ipcp_write_summary, /* write_summary */
-		  ipcp_read_summary, /* read_summary */
+		  NULL, /* write_summary */
+		  NULL, /* read_summary */
 		  ipcp_write_transformation_summaries, /*
 		  write_optimization_summary */
 		  ipcp_read_transformation_summaries, /*
diff --git a/gcc/langhooks.c b/gcc/langhooks.c
index 8819a8859d4..d82f54251fd 100644
--- a/gcc/langhooks.c
+++ b/gcc/langhooks.c
@@ -790,7 +790,7 @@ lhd_begin_section (const char *name)
 saved_section = text_section;
 
   /* Create a new section and switch to it.  */
-  section = get_section (name, SECTION_DEBUG | SECTION_EXCLUDE, NULL);
+  section = get_section (name, SECTION_DEBUG | SECTION_EXCLUDE, NULL, true);
   switch_to_section (section);
 }
 
diff --git a/gcc/output.h b/gcc/output.h
index eb253c50329..2f2f1697fd8 100644
--- a/gcc/output.h
+++ b/gcc/output.h
@@ -523,7 +523,8 @@ extern GTY(()) bool in_cold_section_p;
 
 extern section *get_unnamed_section (unsigned int, void (*) (const void *),
  const void *);
-extern section *get_section (const char *, unsigned int, tree);
+extern section *get_section (const char *, unsigned int, tree,
+			 bool not_existing = false);
 extern section *get_named_section (tree, const char *, int);
 extern section *get_variable_section (tree, bool);
 extern void place_block_symbol (rtx);
diff --git a/gcc/varasm.c b/gcc/varasm.c
index ea0b59cf44a..207c9b077d1 100644
--- a/gcc/varasm.c
+++ b/gcc/varasm.c
@@ -277,10 +277,12 @@ get_noswitch_section (unsigned int flags, noswitch_section_callback callback)
 }
 
 /* Return the named section structure associated with NAME.  Create
-   a new section with the given fields if no such structure exists.  */
+   a new section with the given fields if no such structure exists.
+   When NOT_EXISTING, then fail if the section already exists.  

Re: [PATCH] LTO: get_section: add new argument

2020-10-21 Thread Martin Liška

On 10/21/20 12:06 PM, Jan Hubicka wrote:

I think the streaming should happen only from ipa-fnsummary.
Originally ipa-prop was ipa-cp only, then indirect inlining was added,
but these days we have a specialized analysis pass and thus ipa-prop
should be integrated into it.


All right, there's a WIP patch but it ICEs at various places:

gcc main.o
during IPA pass: fnsummary
lto1: internal compiler error: Segmentation fault
0xc909ff crash_signal
/home/marxin/Programming/gcc/gcc/toplev.c:330
0x7788e6bf ???

/usr/src/debug/glibc-2.32-1.1.x86_64/signal/../sysdeps/unix/sysv/linux/x86_64/sigaction.c:0
0xa7cfbd hash_table_mod1(unsigned int, unsigned int)
/home/marxin/Programming/gcc/gcc/hash-table.h:344
0xa7cfbd hash_table, ipa_node_params*, 
simple_hashmap_traits >, ipa_node_params*> >::hash_entry, 
false, xcallocator>::find_with_hash(int const&, unsigned int)
/home/marxin/Programming/gcc/gcc/hash-table.h:911
0xa79216 hash_map, ipa_node_params*, 
simple_hashmap_traits >, ipa_node_params*> 
>::get(int const&)
/home/marxin/Programming/gcc/gcc/hash-map.h:185
0xa79216 function_summary::get(cgraph_node*)
/home/marxin/Programming/gcc/gcc/symbol-summary.h:163
0xa79216 inline_read_section
/home/marxin/Programming/gcc/gcc/ipa-fnsummary.c:4314
0xa79ee0 ipa_fn_summary_read
/home/marxin/Programming/gcc/gcc/ipa-fnsummary.c:4478
0xbc0ead ipa_read_summaries_1
/home/marxin/Programming/gcc/gcc/passes.c:2844
0x7e31aa read_cgraph_and_symbols(unsigned int, char const**)
/home/marxin/Programming/gcc/gcc/lto/lto-common.c:2919
0x7cb6e2 lto_main()
/home/marxin/Programming/gcc/gcc/lto/lto.c:625

Can you please you or Martin finish the patch?
Thanks,
Martin


Re: PING [PATCH] Enable GCC support for Intel Key Locker extension

2020-10-21 Thread Uros Bizjak via Gcc-patches
On Wed, Oct 21, 2020 at 11:11 AM Hongyu Wang  wrote:
>
> Hi,
>
> > IIRC, adding a new regclass is O(n^2), so it should be avoided. I
> > think that the new patterns should follow the same path as vzeroall
> > and vzeroupper patterns, where we emit the pattern with explicit hard
> > regs.
> >
> > BTW: We do have SSE_FIRST_REG class, but this class was added to solve
> > some reload problems in the past by marking %xmm0 as likely spilled.
>
> Thanks for your suggestion, we have removed the register classes and 
> constraints, and
> set explicit sse hard registers in the expander. The corresponding patterns 
> are also adjusted,
>
> Update and rebased patch.

The attached patch goes only half-way to using explicit registers. As
said previously, please see how avx_vzeroall expander is generating
its insn pattern, and how *avx_vzeroall matches the generated pattern
using "vzeroall_operation" predicate.

Uros.


[PATCH] openmp: Change omp_get_initial_device () to match OpenMP 5.1 requirements

2020-10-21 Thread Jakub Jelinek via Gcc-patches
On Thu, Oct 15, 2020 at 01:02:29PM +0200, Jakub Jelinek via Gcc-patches wrote:
> Therefore, I think until omp_get_initial_device () value is changed, we

The following so far untested patch implements that change.

OpenMP 4.5 said for omp_get_initial_device:
The value of the device number is implementation defined. If it is between 0 
and one less than
omp_get_num_devices() then it is valid for use with all device constructs and 
routines; if it is
outside that range, then it is only valid for use with the device memory 
routines and not in the
device clause.
and OpenMP 5.0 similarly, but OpenMP 5.1 says:
The value of the device number is the value returned by the omp_get_num_devices 
routine.

As the new value is compatible with what has been required earlier, I think
we can change it already now.

2020-10-21  Jakub Jelinek  

* icv.c (omp_get_initial_device): Remove including corresponding
ialias.
* icv-device.c (omp_get_initial_device): New function.  Return
gomp_get_num_devices ().  Add ialias.
* target.c (resolve_device): Don't fail with
OMP_TARGET_OFFLOAD=mandatory if device_id is equal to
gomp_get_num_devices ().
(omp_target_alloc, omp_target_free, omp_target_is_present,
omp_target_memcpy, omp_target_memcpy_rect, omp_target_associate_ptr,
omp_target_disassociate_ptr, omp_pause_resource): Use
gomp_get_num_devices () instead of GOMP_DEVICE_HOST_FALLBACK on the
first use in the functions, in uses dominated by the
gomp_get_num_devices call use num_devices_openmp instead.
* libgomp.texi (omp_get_initial_device): Document.
* config/gcn/icv-device.c (omp_get_initial_device): New function.
Add ialias.
* config/nvptx/icv-device.c (omp_get_initial_device): Likewise.
* testsuite/libgomp.c/target-40.c: New test.

--- libgomp/icv.c.jj2020-10-13 22:29:22.202958364 +0200
+++ libgomp/icv.c   2020-10-21 12:31:05.941289010 +0200
@@ -156,12 +156,6 @@ omp_get_proc_bind (void)
 }
 
 int
-omp_get_initial_device (void)
-{
-  return GOMP_DEVICE_HOST_FALLBACK;
-}
-
-int
 omp_get_num_places (void)
 {
   return gomp_places_list_len;
@@ -241,7 +235,6 @@ ialias (omp_get_max_active_levels)
 ialias (omp_get_supported_active_levels)
 ialias (omp_get_cancellation)
 ialias (omp_get_proc_bind)
-ialias (omp_get_initial_device)
 ialias (omp_get_max_task_priority)
 ialias (omp_get_num_places)
 ialias (omp_get_place_num)
--- libgomp/icv-device.c.jj 2020-01-12 11:54:39.016374137 +0100
+++ libgomp/icv-device.c2020-10-21 12:32:24.827131320 +0200
@@ -43,6 +43,12 @@ omp_get_default_device (void)
 }
 
 int
+omp_get_initial_device (void)
+{
+  return gomp_get_num_devices ();
+}
+
+int
 omp_get_num_devices (void)
 {
   return gomp_get_num_devices ();
@@ -57,5 +63,6 @@ omp_is_initial_device (void)
 
 ialias (omp_set_default_device)
 ialias (omp_get_default_device)
+ialias (omp_get_initial_device)
 ialias (omp_get_num_devices)
 ialias (omp_is_initial_device)
--- libgomp/target.c.jj 2020-10-20 19:51:38.149361531 +0200
+++ libgomp/target.c2020-10-21 12:43:19.336526122 +0200
@@ -118,7 +118,8 @@ resolve_device (int device_id)
   if (device_id < 0 || device_id >= gomp_get_num_devices ())
 {
   if (gomp_target_offload_var == GOMP_TARGET_OFFLOAD_MANDATORY
- && device_id != GOMP_DEVICE_HOST_FALLBACK)
+ && device_id != GOMP_DEVICE_HOST_FALLBACK
+ && device_id != num_devices_openmp)
gomp_fatal ("OMP_TARGET_OFFLOAD is set to MANDATORY, "
"but device not found");
 
@@ -132,8 +133,7 @@ resolve_device (int device_id)
 {
   gomp_mutex_unlock (&devices[device_id].lock);
 
-  if (gomp_target_offload_var == GOMP_TARGET_OFFLOAD_MANDATORY
- && device_id != GOMP_DEVICE_HOST_FALLBACK)
+  if (gomp_target_offload_var == GOMP_TARGET_OFFLOAD_MANDATORY)
gomp_fatal ("OMP_TARGET_OFFLOAD is set to MANDATORY, "
"but device is finalized");
 
@@ -2716,7 +2716,7 @@ GOMP_teams (unsigned int num_teams, unsi
 void *
 omp_target_alloc (size_t size, int device_num)
 {
-  if (device_num == GOMP_DEVICE_HOST_FALLBACK)
+  if (device_num == gomp_get_num_devices ())
 return malloc (size);
 
   if (device_num < 0)
@@ -2742,7 +2742,7 @@ omp_target_free (void *device_ptr, int d
   if (device_ptr == NULL)
 return;
 
-  if (device_num == GOMP_DEVICE_HOST_FALLBACK)
+  if (device_num == gomp_get_num_devices ())
 {
   free (device_ptr);
   return;
@@ -2773,7 +2773,7 @@ omp_target_is_present (const void *ptr,
   if (ptr == NULL)
 return 1;
 
-  if (device_num == GOMP_DEVICE_HOST_FALLBACK)
+  if (device_num == gomp_get_num_devices ())
 return 1;
 
   if (device_num < 0)
@@ -2807,7 +2807,7 @@ omp_target_memcpy (void *dst, const void
   struct gomp_device_descr *dst_devicep = NULL, *src_devicep = NULL;
   bool ret;
 
-  if (dst_device_num != GOMP_DEVICE_HOST_FALLBACK)
+  if (dst_device_num != gomp_get

Re: PING [PATCH] Enable GCC support for Intel Key Locker extension

2020-10-21 Thread Uros Bizjak via Gcc-patches
On Wed, Oct 21, 2020 at 1:48 PM Uros Bizjak  wrote:
>
> On Wed, Oct 21, 2020 at 11:11 AM Hongyu Wang  wrote:
> >
> > Hi,
> >
> > > IIRC, adding a new regclass is O(n^2), so it should be avoided. I
> > > think that the new patterns should follow the same path as vzeroall
> > > and vzeroupper patterns, where we emit the pattern with explicit hard
> > > regs.
> > >
> > > BTW: We do have SSE_FIRST_REG class, but this class was added to solve
> > > some reload problems in the past by marking %xmm0 as likely spilled.
> >
> > Thanks for your suggestion, we have removed the register classes and 
> > constraints, and
> > set explicit sse hard registers in the expander. The corresponding patterns 
> > are also adjusted,
> >
> > Update and rebased patch.
>
> The attached patch goes only half-way to using explicit registers. As
> said previously, please see how avx_vzeroall expander is generating
> its insn pattern, and how *avx_vzeroall matches the generated pattern
> using "vzeroall_operation" predicate.

For example:

+(define_insn "encodekey128u32"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+(unspec_volatile:SI
+  [(match_operand:SI   1 "register_operand" "r")
+   (match_operand:V2DI 3 "register_operand" "2")]
+ UNSPECV_ENCODEKEY128U32))

should be generated as:

(parallel [
  (set ( ... as above ... )
(unspec_volatile:SI [( ... as above ... ) ( reg:V2DI 20 xmm0 )]
UNSPEC_ENCODEKEY128U32))

followed by the serie of:

   (set (reg:V2DI 20 xmm0)
(unspec_volatile:V2DI [(const_int 0)] UNSPECV_ENCODEKEY128U32))

no need to duplicate already listed input operands in unspec_volatile.

followed by another serie of:

   (set (reg:V2DI 26 xmm6)
(const_vector:V2DI [(const_int 0) (const_int 0)]))

to tell the optimizer that some registers now hold zero, so the value
in the register can eventually be reused elsewhere.

and finish the parallel with clobber of flags_reg.

Another example:

+(define_insn "aesu8"
+  [(set (reg:CCZ FLAGS_REG)
+(unspec_volatile:CCZ [(match_operand:BLK 0 "memory_operand" "m")
+  (match_operand:V2DI 9  "register_operand" "1")
+  (match_operand:V2DI 2  "sse_reg_operand")
+  (match_operand:V2DI 3  "sse_reg_operand")
+  (match_operand:V2DI 4  "sse_reg_operand")
+  (match_operand:V2DI 5  "sse_reg_operand")
+  (match_operand:V2DI 6  "sse_reg_operand")
+  (match_operand:V2DI 7  "sse_reg_operand")
+  (match_operand:V2DI 8  "sse_reg_operand")]
+ AESDECENCWIDEKL))
+   (set (match_operand:V2DI 1 "register_operand" "=Yz")
+(unspec_volatile:V2DI [(const_int 0)] AESDECENCWIDEKL))
+   (set (match_dup 2)
+(unspec_volatile:V2DI [(const_int 0)] AESDECENCWIDEKL))

This should be written as:

parallel [
  (set ( ... as above ... )
(unspec_volatile:CCZ [( ... as above, BLK only ... )]
UNSPEC_AESDECENWIDEKL))

followed by a series of:

   (set (reg:V2DI 20 xmm0)
(unspec_volatile:V2DI [(reg:V2DI 20 xmm0)] UNSPEC_AESDECENCWIDEKL))

And please see the mentioned expander and pattern how the above series
are generated and matched.

Uros.


[RFC2][PATCH] SLP vectorize across PHI nodes

2020-10-21 Thread Richard Biener
This enables SLP build to handle PHI nodes in full, continuing
the SLP build to non-backedges.  For loop vectorization this
enables outer loop vectorization of nested SLP cycles and for
BB vectorization this enables vectorization of PHIs at CFG merges.

Vectorized backedge defs are now filled using this info which
requires sanitizing the SLP tree for SLP reduction chains even
more, manually filling the backedge SLP def.

This also exposes the fact that CFG copying (and edge splitting
until I fixed that) ends up with different edge order in the
copy which doesn't play well with the desired 1:1 mapping of
SLP PHI node children and edges for epilogue vectorization.
I've tried to fixup CFG copying here but this really looks
like a dead (or expensive) end there so I've done fixup in
slpeel_tree_duplicate_loop_to_edge_cfg instead for the cases
we can run into.

There's still NULLs in the SLP_TREE_CHILDREN vectors and I'm
not sure it's possible to eliminate them all so the patch
has quite some checks for this case all over the place.

Bootstrapped and tested  on x86_64-unknown-linux-gnu.

I still have to track down two SPEC 2k6 build ICEs with the patch,
but otherwise it would have been ready.

Richard.

2020-10-21  Richard Biener  

* gimple.h (gimple_expr_type): For PHIs return the type
of the result.
* tree-vect-loop-manip.c (slpeel_tree_duplicate_loop_to_edge_cfg):
Make sure edge order into copied loop headers line up with the
originals.
* tree-vect-loop.c (vect_transform_cycle_phi): Handle nested
loops with SLP.
(vectorizable_phi): New function.
(vectorizable_live_operation): For BB vectorization compute insert
location here.
* tree-vect-slp.c (vect_free_slp_tree): Deal with NULL
SLP_TREE_CHILDREN entries.
(vect_print_slp_graph): Likewise.
(vect_mark_slp_stmts): Likewise.
(vect_mark_slp_stmts_relevant): Likewise.
(vect_gather_slp_loads): Likewise.
(vect_optimize_slp): Likewise.
(vect_slp_analyze_node_operations): Likewise.
(vect_bb_slp_scalar_cost): Likewise.
(vect_remove_slp_scalar_calls): Likewise.
(vect_get_and_check_slp_defs): Handle PHIs and mark backedge
defs.
(vect_build_slp_tree_1): Handle PHIs.
(vect_build_slp_tree_2): Continue SLP build, following PHI
arguments.
(vect_analyze_slp_instance): Set the backedge SLP def for
reduction chains.
(vect_analyze_slp_backedges): Skip already set backedges,
set the SLP child corresponding to the edge.
(vect_slp_build_vertices): Adjust leaf condition.
(vect_bb_slp_mark_live_stmts): Handle PHIs.
(vect_bb_partition_graph_r): Likewise.
(vect_slp_function): Adjust split condition to allow CFG
merges.
(vect_schedule_slp_instance): Adjust.
(vect_fill_vectorized_backedge_defs): New function.
(vect_schedule_slp): Call it.  Remove ad-hoc vectorized
backedge fill code.
* tree-vect-stmts.c (vect_analyze_stmt): Call
vectorizable_phi.
(vect_transform_stmt): Likewise.
(vect_is_simple_use): Handle vect_backedge_def.
* tree-vectorizer.c (vec_info::new_stmt_vec_info): Only
set loop header PHIs to vect_unknown_def_type for loop
vectorization.
* tree-vectorizer.h (enum vect_def_type): Add vect_backedge_def.
(enum stmt_vec_info_type): Add phi_info_type.
(vectorizable_phi): Declare.

* gcc.dg/vect/bb-slp-54.c: New test.
* gcc.dg/vect/vect-outer-slp-1.c: New test.
---
 gcc/gimple.h |   2 +
 gcc/testsuite/gcc.dg/vect/bb-slp-54.c|  23 ++
 gcc/testsuite/gcc.dg/vect/vect-outer-slp-1.c |  31 ++
 gcc/tree-vect-loop-manip.c   |  27 ++
 gcc/tree-vect-loop.c | 108 +-
 gcc/tree-vect-slp.c  | 378 ---
 gcc/tree-vect-stmts.c|  11 +-
 gcc/tree-vectorizer.c|   3 +-
 gcc/tree-vectorizer.h|   3 +
 9 files changed, 442 insertions(+), 144 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/bb-slp-54.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-outer-slp-1.c

diff --git a/gcc/gimple.h b/gcc/gimple.h
index 3c9b9965f5a..87c90be9a6a 100644
--- a/gcc/gimple.h
+++ b/gcc/gimple.h
@@ -6598,6 +6598,8 @@ gimple_expr_type (const gimple *stmt)
 }
   else if (code == GIMPLE_COND)
 return boolean_type_node;
+  else if (code == GIMPLE_PHI)
+return TREE_TYPE (gimple_phi_result (stmt));
   else
 return void_type_node;
 }
diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-54.c 
b/gcc/testsuite/gcc.dg/vect/bb-slp-54.c
new file mode 100644
index 000..d05ce33310d
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/bb-slp-54.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_double

[Patch, committed] Fortran: class.c - update vtable comment

2020-10-21 Thread Tobias Burnus

I was confused as I saw _deallocate – but didn't show up
in the big comment at the beginning of class.c.
Hence, I added it.

Committed as r11-4186-g310fe80babe04ccb7d2e15c8fca7dc98180701a8
but if you have have follow-up suggestions, we can surely change it.

Tobias

-
Mentor Graphics (Deutschland) GmbH, Arnulfstraße 201, 80634 München / Germany
Registergericht München HRB 106955, Geschäftsführer: Thomas Heurung, Alexander 
Walter
commit 310fe80babe04ccb7d2e15c8fca7dc98180701a8
Author: Tobias Burnus 
Date:   Wed Oct 21 14:38:44 2020 +0200

Fortran: class.c - update vtable comment

gcc/fortran/
PR fortran/45516
* class.c: Add _deallocate to the vtable documentation
comment.

diff --git a/gcc/fortran/class.c b/gcc/fortran/class.c
index dfa48400712..5677d920239 100644
--- a/gcc/fortran/class.c
+++ b/gcc/fortran/class.c
@@ -49,6 +49,8 @@ along with GCC; see the file COPYING3.  If not see
 * _copy: A procedure pointer to a copying procedure.
 * _final:A procedure pointer to a wrapper function, which frees
 		 allocatable components and calls FINAL subroutines.
+* _deallocate: A procedure pointer to a deallocation procedure; nonnull
+		 only for a recursive derived type.
 
After these follow procedure pointer components for the specific
type-bound procedures.  */


[committed] libstdc++: Make structured bindings always work for subranges [PR 97512]

2020-10-21 Thread Jonathan Wakely via Gcc-patches
The definition of ranges::subrange was moved to the new
 header so that it could be used in 
without including the whole of . However, the tuple-like support
that enables subrange to be used with structured bindings was left in
. This is arguably conforming (to use a subrange you should
include ) but it's inconvenient and probably confusing.

This change makes the tuple-like support available whenever subrange
itself is available.

libstdc++-v3/ChangeLog:

PR libstdc++/97512
* include/bits/ranges_util.h (tuple_size)
(tuple_element): Move here from ...
* include/std/ranges: ... here.
* testsuite/std/ranges/subrange/97512.cc: New test.

Tested powerpc64le-linux. Committed to trunk.

commit a186ab670e97c4c3883d96506655c4621e7c5515
Author: Jonathan Wakely 
Date:   Wed Oct 21 14:40:54 2020

libstdc++: Make structured bindings always work for subranges [PR 97512]

The definition of ranges::subrange was moved to the new
 header so that it could be used in 
without including the whole of . However, the tuple-like support
that enables subrange to be used with structured bindings was left in
. This is arguably conforming (to use a subrange you should
include ) but it's inconvenient and probably confusing.

This change makes the tuple-like support available whenever subrange
itself is available.

libstdc++-v3/ChangeLog:

PR libstdc++/97512
* include/bits/ranges_util.h (tuple_size)
(tuple_element): Move here from ...
* include/std/ranges: ... here.
* testsuite/std/ranges/subrange/97512.cc: New test.

diff --git a/libstdc++-v3/include/bits/ranges_util.h 
b/libstdc++-v3/include/bits/ranges_util.h
index a98658ff5c8..cc50e2ad4e4 100644
--- a/libstdc++-v3/include/bits/ranges_util.h
+++ b/libstdc++-v3/include/bits/ranges_util.h
@@ -410,6 +410,27 @@ namespace ranges
 
   using ranges::get;
 
+  template
+struct tuple_size>
+: integral_constant
+{ };
+
+  template
+struct tuple_element<0, ranges::subrange<_Iter, _Sent, _Kind>>
+{ using type = _Iter; };
+
+  template
+struct tuple_element<1, ranges::subrange<_Iter, _Sent, _Kind>>
+{ using type = _Sent; };
+
+  template
+struct tuple_element<0, const ranges::subrange<_Iter, _Sent, _Kind>>
+{ using type = _Iter; };
+
+  template
+struct tuple_element<1, const ranges::subrange<_Iter, _Sent, _Kind>>
+{ using type = _Sent; };
+
 _GLIBCXX_END_NAMESPACE_VERSION
 } // namespace std
 #endif // library concepts
diff --git a/libstdc++-v3/include/std/ranges b/libstdc++-v3/include/std/ranges
index b03ae399fa6..4d3a4940c6f 100644
--- a/libstdc++-v3/include/std/ranges
+++ b/libstdc++-v3/include/std/ranges
@@ -3320,27 +3320,6 @@ namespace views
 
   namespace views = ranges::views;
 
-  template
-struct tuple_size>
-: integral_constant
-{ };
-
-  template
-struct tuple_element<0, ranges::subrange<_Iter, _Sent, _Kind>>
-{ using type = _Iter; };
-
-  template
-struct tuple_element<1, ranges::subrange<_Iter, _Sent, _Kind>>
-{ using type = _Sent; };
-
-  template
-struct tuple_element<0, const ranges::subrange<_Iter, _Sent, _Kind>>
-{ using type = _Iter; };
-
-  template
-struct tuple_element<1, const ranges::subrange<_Iter, _Sent, _Kind>>
-{ using type = _Sent; };
-
 _GLIBCXX_END_NAMESPACE_VERSION
 } // namespace
 #endif // library concepts
diff --git a/libstdc++-v3/testsuite/std/ranges/subrange/97512.cc 
b/libstdc++-v3/testsuite/std/ranges/subrange/97512.cc
new file mode 100644
index 000..b0fd0e2577f
--- /dev/null
+++ b/libstdc++-v3/testsuite/std/ranges/subrange/97512.cc
@@ -0,0 +1,33 @@
+// Copyright (C) 2020 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library.  This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License along
+// with this library; see the file COPYING3.  If not see
+// .
+
+// { dg-options "-std=gnu++2a" }
+// { dg-do compile { target c++2a } }
+
+// PR libstdc++/97512
+// Check that structured bindings work for subranges without .
+#include 
+
+constexpr bool
+test01()
+{
+  int r[] = { 1, 2, 2, 3, 3, 3 };
+  auto [first, last] = std::ranges::unique(r);
+  return first == std::ranges::begin(r) + 3 && last == std::ranges::end(r);
+}
+
+static_assert( test01() );


Re: [PATCH][middle-end][i386][version 3]Add -fzero-call-used-regs=[skip|used-gpr-arg|used-arg|all-arg|used-gpr|all-gpr|used|all]

2020-10-21 Thread Qing Zhao via Gcc-patches



> On Oct 21, 2020, at 3:03 AM, Uros Bizjak  wrote:
> 
> On Wed, Oct 21, 2020 at 9:18 AM Uros Bizjak  > wrote:
>> 
>> On Tue, Oct 20, 2020 at 10:04 PM Qing Zhao  wrote:
>> 
>>> +/* Check whether the register REGNO should be zeroed on X86.
>>> +   When ALL_SSE_ZEROED is true, all SSE registers have been zeroed
>>> +   together, no need to zero it again.
>>> +   Stack registers (st0-st7) and mm0-mm7 are aliased with each other.
>>> +   very hard to be zeroed individually, don't zero individual st or
>>> +   mm registers at this time.  */
>>> +
>>> +static bool
>>> +zero_call_used_regno_p (const unsigned int regno,
>>> + bool all_sse_zeroed)
>>> +{
>>> +  return GENERAL_REGNO_P (regno)
>>> +  || (!all_sse_zeroed && SSE_REGNO_P (regno))
>>> +  || MASK_REGNO_P (regno);
>>> +}
>>> +
>>> +/* Return the machine_mode that is used to zero register REGNO.  */
>>> +
>>> +static machine_mode
>>> +zero_call_used_regno_mode (const unsigned int regno)
>>> +{
>>> +  /* NB: We only need to zero the lower 32 bits for integer registers
>>> + and the lower 128 bits for vector registers since destinations are
>>> + zero-extended to the full register width.  */
>>> +  if (GENERAL_REGNO_P (regno))
>>> +return SImode;
>>> +  else if (SSE_REGNO_P (regno))
>>> +return V4SFmode;
>>> +  else
>>> +return HImode;
>>> +}
>>> +
>>> +/* Generate a rtx to zero all vector registers together if possible,
>>> +   otherwise, return NULL.  */
>>> +
>>> +static rtx
>>> +zero_all_vector_registers (HARD_REG_SET need_zeroed_hardregs)
>>> +{
>>> +  if (!TARGET_AVX)
>>> +return NULL;
>>> +
>>> +  for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
>>> +if ((IN_RANGE (regno, FIRST_SSE_REG, LAST_SSE_REG)
>>> +  || (TARGET_64BIT
>>> +  && (REX_SSE_REGNO_P (regno)
>>> +  || (TARGET_AVX512F && EXT_REX_SSE_REGNO_P (regno)
>>> + && !TEST_HARD_REG_BIT (need_zeroed_hardregs, regno))
>>> +  return NULL;
>>> +
>>> +  return gen_avx_vzeroall ();
>>> +}
>>> +
>>> +/* Generate a rtx to zero all st and mm registers together if possible,
>>> +   otherwise, return NULL.  */
>>> +
>>> +static rtx
>>> +zero_all_st_mm_registers (HARD_REG_SET need_zeroed_hardregs)
>>> +{
>>> +  if (!TARGET_MMX)
>>> +return NULL;
>>> +
>>> +  for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
>>> +if ((STACK_REGNO_P (regno) || MMX_REGNO_P (regno))
>>> + && !TEST_HARD_REG_BIT (need_zeroed_hardregs, regno))
>>> +  return NULL;
>>> +
>>> +  return gen_mmx_emms ();
>>> 
>>> 
>>> emms is not clearing any register, it only loads x87FPUTagWord with
>>> H. So I think, the above is useless, as far as register clearing
>>> is concerned.
>>> 
>>> 
>>> Thanks for the info.
>>> 
>>> So, for mm and st registers, should we clear them, and how?
>>> 
>>> 
>>> I don't know.
>>> 
>>> Please note that %mm and %st share the same register file, and
>>> touching %mm registers will block access to %st until emms is emitted.
>>> You can't just blindly load 0 to %st registers, because the register
>>> file can be in MMX mode and vice versa. For 32bit targets, function
>>> can also  return a value in the %mm0.
>>> 
>>> 
>>> If data flow determine that %mm0 does not return a value at the return, can 
>>> we clear all the %st as following:
>>> 
>>> emms
>>> mov %st0, 0
>>> mov %st1, 0
>>> mov %st2, 0
>>> mov %st3, 0
>>> mov %st4, 0
>>> mov %st5, 0
>>> mov %st6, 0
>>> mov %st7, 0
>> 
>> The i386 ABI says:
>> 
>> -- q --
>> The CPU shall be in x87 mode upon entry to a function. Therefore,
>> every function that uses the MMX registers is required to issue an
>> emms or femms instruction after using MMX registers, before returning
>> or calling another function.
>> -- /q --
>> 
>> (The above requirement slightly contradicts its own ABI, since we have
>> 3 MMX argument registers and MMX return register, so the CPU obviously
>> can't be in x87 mode at all function boundaries).
>> 
>> So, assuming that the first sentence is not deliberately vague w.r.t
>> function exit, emms should not be needed. However, we are dealing with
>> x87 stack registers that have their own set of peculiarities. It is
>> not possible to load a random register in the way you show.  Also,
>> stack should be either empty or one (two in case of complex value
>> return) levels deep at the function return. I think you want a series
>> of 8 or 7(6) fldz insns, followed by a series of fstp insn to clear
>> the stack and mark stack slots empty.
> 
> Something like this:
> 
> --cut here--
> long double
> __attribute__ ((noinline))
> test (long double a, long double b)
> {
>  long double r = a + b;
> 
>  asm volatile ("fldz;\
>fldz;\
>fldz;\
>fldz;\
>fldz;\
>fldz;\
>fldz;\
>fstp %%st(0);\
>fstp %%st(0);\
>fstp %%st(0);\
> 

[PATCH] SLP: Move load/store-lanes check till late

2020-10-21 Thread Tamar Christina via Gcc-patches
Hi All,

This moves the code that checks for load/store lanes further in the pipeline and
places it after slp_optimize.  This would allow us to perform optimizations on
the SLP tree and only bail out if we really have a permute.

With this change it allows us to handle permutes such as {1,1,1,1} which should
be handled by a load and replicate.

This change however makes it all or nothing. Either all instances can be handled
or none at all.  This is why some of the test cases have been adjusted.

Bootstrapped Regtested on aarch64-none-linux-gnu, -x86_64-pc-linux-gnu
 and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

* tree-vect-slp.c (vect_analyze_slp_instance): Moved load/store lanes
check to ...
* tree-vect-loop.c (vect_analyze_loop_2): ... here.

gcc/testsuite/ChangeLog:

* gcc.dg/vect/slp-11b.c: Update output scan.
* gcc.dg/vect/slp-perm-6.c: Likewise.

-- 
diff --git a/gcc/testsuite/gcc.dg/vect/slp-11b.c b/gcc/testsuite/gcc.dg/vect/slp-11b.c
index 0cc23770badf0e00ef98769a2dd14a92dca32cca..fe5bb0c3ce7682c7cef1313e342d95aba3fe11b2 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-11b.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-11b.c
@@ -45,4 +45,4 @@ int main (void)
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { vect_strided4 && vect_int_mult } } } } */
 /* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect" { target { ! { vect_strided4 && vect_int_mult } } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" } } */
+/* { dg-final { scan-tree-dump-times "re-trying with SLP disabled" 1 "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-perm-6.c b/gcc/testsuite/gcc.dg/vect/slp-perm-6.c
index 38489291a2659c989121d44c9e9e7bdfaa12f868..07bf8916de7ce88bbb1d65437f8bf6d8ab17efe6 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-perm-6.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-perm-6.c
@@ -106,7 +106,7 @@ int main (int argc, const char* argv[])
 /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target { vect_perm3_int && { {! vect_load_lanes } && {! vect_partial_vectors_usage_1 } } } } } } */
 /* The epilogues are vectorized using partial vectors.  */
 /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" { target { vect_perm3_int && { {! vect_load_lanes } && vect_partial_vectors_usage_1 } } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_load_lanes } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" { target vect_load_lanes } } } */
 /* { dg-final { scan-tree-dump "Built SLP cancelled: can use load/store-lanes" "vect" { target { vect_perm3_int && vect_load_lanes } } } } */
 /* { dg-final { scan-tree-dump "LOAD_LANES" "vect" { target vect_load_lanes } } } */
 /* { dg-final { scan-tree-dump "STORE_LANES" "vect" { target vect_load_lanes } } } */
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index 991fd4572298448c5d074f87a4ed318f0a3c9db6..c1350a8008850ea5e21a27cacd7e340d0da9bc9c 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -2342,6 +2342,60 @@ start_over:
    "unsupported SLP instances\n");
 	  goto again;
 	}
+
+  /* Check whether any load is possibly permuted.  */
+  slp_tree load_node, slp_root;
+  unsigned i, x;
+  slp_instance instance;
+  FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
+	{
+	  bool loads_permuted = false;
+	  slp_root = SLP_INSTANCE_TREE (instance);
+	  int group_size = SLP_TREE_LANES (slp_root);
+	  tree vectype = SLP_TREE_VECTYPE (slp_root);
+
+	  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
+	{
+	  if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
+		continue;
+	  unsigned j;
+	  stmt_vec_info load_info;
+	  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
+		if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
+		  {
+		loads_permuted = true;
+		break;
+		  }
+	}
+
+	  /* If the loads and stores can be handled with load/store-lane
+	 instructions do not generate this SLP instance.  */
+	  if (loads_permuted
+	  && vect_store_lanes_supported (vectype, group_size, false))
+	{
+	  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
+		{
+		  stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
+		  (SLP_TREE_SCALAR_STMTS (load_node)[0]);
+		  /* Use SLP for strided accesses (or if we can't
+		 load-lanes).  */
+		  if (STMT_VINFO_STRIDED_P (stmt_vinfo)
+		  || ! vect_load_lanes_supported
+			(STMT_VINFO_VECTYPE (stmt_vinfo),
+			 DR_GROUP_SIZE (stmt_vinfo), false))
+		break;
+		}
+
+	  if (i == SLP_INSTANCE_LOADS (instance).length ())
+		{
+		ok = opt_result::failure_at (vect_location,
+		 "Built SLP cancelled: can use"
+		 " load/store-lanes\n");
+		  goto again;
+		}
+	}
+	}
+
 }
 
   /* Dissolve SLP-only groups.  */
diff --git a/g

Re: [PATCH 1/2] [target 87767] Refactor AVX512 broadcast patterns with speical memory constraint.

2020-10-21 Thread Vladimir Makarov via Gcc-patches



On 2020-10-20 10:11 p.m., Hongtao Liu wrote:


Changed, and it passed the i386/x86-64 regression test.

Update patch.


Thank you, Hongtao.  This patch is ok for the trunk.




[patch] vxworks: Remove interfering default #undefs from vx-common.h

2020-10-21 Thread Olivier Hainque

This patch removes the #undef issued for LIB_SPEC and LINK_SPEC
in vx-common.h, which all the ports do on their own and which
impairs the bi-arch’d ppc*-vx7r2 targets, relying on linux64
definitions.

Tested together with the previous changes posted for the
newly introduced powerpc ports.

Committing to mainline shortly.

Olivier

2020-10-21  Douglas Rupp  

gcc/
* config/vx-common.h (LINK_SPEC, LIB_SPEC): Remove #undef.


--- a/gcc/config/vx-common.h
+++ b/gcc/config/vx-common.h
@@ -23,8 +23,6 @@ along with GCC; see the file COPYING3.  If not see
 /* Most of these will probably be overridden by subsequent headers.  We
undefine them here just in case, and define VXWORKS_ versions of each,
to be used in port-specific vxworks.h.  */
-#undef LIB_SPEC
-#undef LINK_SPEC
 #undef LIBGCC_SPEC
 #define LIBGCC_SPEC VXWORKS_LIBGCC_SPEC
 #undef STARTFILE_SPEC


Increase inlining limits for inline functions with builtin_constant_p on parameter

2020-10-21 Thread Jan Hubicka
Hi,
this patch implements heuristics that increases inline limits (by the hints
mechanism) for inline functions that use builtin_constant_p on parameter. Those
are very likely intended to be always inlined and simplify after inlining.

The PR is about a function that we used to inline with
 --param inline-insns-single=200 but with new default of 70 for -O2 we no longer
do so.  Hints are currently configured to bump the bound up twice, so we
get limit of 140 that is still not enough to inline the particular testcase
but it should help in general.  I can implement a stronger bump if that seems
useful (maybe it is). The example is bit operation written as a decision chain
with 64 conditions:

return ( __builtin_constant_p((size) - 1) ? ( __builtin_constant_p((size) - 1)
? ( ((size) - 1) < 2 ? 0 : ((size) - 1) & (1ULL << 63) ? 63 : ((size) - 1) &
(1ULL << 62) ? 62 : ((size) - 1) & (1ULL << 61) ? 61 : ((size) - 1) & (1ULL <<
60) ? 60 : ((size) - 1) & (1ULL << 59) ? 59 : ((size) - 1) & (1ULL << 58) ? 58
: ((size) - 1) & (1ULL << 57) ? 57 : ((size) - 1) & (1ULL << 56) ? 56 : ((size)
- 1) & (1ULL << 55) ? 55 : ((size) - 1) & (1ULL << 54) ? 54 : ((size) - 1) &
(1ULL << 53) ? 53 : ((size) - 1) & (1ULL << 52) ? 52 : ((size) - 1) & (1ULL <<
51) ? 51 : ((size) - 1) & (1ULL << 50) ? 50 : ((size) - 1) & (1ULL << 49) ? 49
: ((size) - 1) & (1ULL << 48) ? 48 : ((size) - 1) & (1ULL << 47) ? 47 : ((size)
- 1) & (1ULL << 46) ? 46 : ((size) - 1) & (1ULL << 45) ? 45 : ((size) - 1) &
(1ULL << 44) ? 44 : ((size) - 1) & (1ULL << 43) ? 43 : ((size) - 1) & (1ULL <<
42) ? 42 : ((size) - 1) & (1ULL << 41) ? 41 : ((size) - 1) & (1ULL << 40) ? 40
: ((size) - 1) & (1ULL << 39) ? 39 : ((size) - 1) & (1ULL << 38) ? 38 : ((size)
- 1) & (1ULL << 37) ? 37 : ((size) - 1) & (1ULL << 36) ? 36 : ((size) - 1) &
(1ULL << 35) ? 35 : ((size) - 1) & (1ULL << 34) ? 34 : ((size) - 1) & (1ULL <<
33) ? 33 : ((size) - 1) & (1ULL << 32) ? 32 : ((size) - 1) & (1ULL << 31) ? 31
: ((size) - 1) & (1ULL << 30) ? 30 : ((size) - 1) & (1ULL << 29) ? 29 : ((size)
- 1) & (1ULL << 28) ? 28 : ((size) - 1) & (1ULL << 27) ? 27 : ((size) - 1) &
(1ULL << 26) ? 26 : ((size) - 1) & (1ULL << 25) ? 25 : ((size) - 1) & (1ULL <<
24) ? 24 : ((size) - 1) & (1ULL << 23) ? 23 : ((size) - 1) & (1ULL << 22) ? 22
: ((size) - 1) & (1ULL << 21) ? 21 : ((size) - 1) & (1ULL << 20) ? 20 : ((size)
- 1) & (1ULL << 19) ? 19 : ((size) - 1) & (1ULL << 18) ? 18 : ((size) - 1) &
(1ULL << 17) ? 17 : ((size) - 1) & (1ULL << 16) ? 16 : ((size) - 1) & (1ULL <<
15) ? 15 : ((size) - 1) & (1ULL << 14) ? 14 : ((size) - 1) & (1ULL << 13) ? 13
: ((size) - 1) & (1ULL << 12) ? 12 : ((size) - 1) & (1ULL << 11) ? 11 : ((size)
- 1) & (1ULL << 10) ? 10 : ((size) - 1) & (1ULL << 9) ? 9 : ((size) - 1) &
(1ULL << 8) ? 8 : ((size) - 1) & (1ULL << 7) ? 7 : ((size) - 1) & (1ULL << 6) ?
6 : ((size) - 1) & (1ULL << 5) ? 5 : ((size) - 1) & (1ULL << 4) ? 4 : ((size) -
1) & (1ULL << 3) ? 3 : ((size) - 1) & (1ULL << 2) ? 2 : 1) : -1) :
(sizeof((size) - 1) <= 4) ? __ilog2_u32((size) - 1) : __ilog2_u64((size) - 1) )
- 12 + 1;

This blows up the limit on number of conditions we track per function (which is
30) and thus the size/time estimates are not working that well.

Bootstrapped/regtsted x86_64-linux, will commit it after bit more testing.

gcc/ChangeLog:

2020-10-21  Jan Hubicka  

PR ipa/97445
* ipa-fnsummary.c (ipa_dump_hints): Add INLINE_HINT_builtin_constant_p.
(ipa_fn_summary::~ipa_fn_summary): Free builtin_constant_p_parms.
(ipa_fn_summary_t::duplicate): Duplicate builtin_constant_p_parms.
(ipa_dump_fn_summary): Dump builtin_constant_p_parms.
(add_builtin_constant_p_parm): New function.
(set_cond_stmt_execution_predicate): Update builtin_constant_p_parms.
(ipa_call_context::estimate_size_and_time): Set 
INLINE_HINT_builtin_constant_p.
(ipa_merge_fn_summary_after_inlining): Merge builtin_constant_p_parms.
(inline_read_section): Read builtin_constant_p_parms.
(ipa_fn_summary_write): Write builtin_constant_p_parms.
* ipa-fnsummary.h (enum ipa_hints_vals): Add
INLINE_HINT_builtin_constant_p.
* ipa-inline.c (want_inline_small_function_p): Use
INLINE_HINT_builtin_constant_p.
(edge_badness): Use INLINE_HINT_builtin_constant_p.

gcc/testsuite/ChangeLog:

2020-10-21  Jan Hubicka  

PR ipa/97445
* gcc.dg/ipa/inlinehint-5.c: New test.

diff --git a/gcc/ipa-fnsummary.c b/gcc/ipa-fnsummary.c
index 9e3eda4d3cb..eb7467a8d52 100644
--- a/gcc/ipa-fnsummary.c
+++ b/gcc/ipa-fnsummary.c
@@ -141,6 +141,11 @@ ipa_dump_hints (FILE *f, ipa_hints hints)
   hints &= ~INLINE_HINT_known_hot;
   fprintf (f, " known_hot");
 }
+  if (hints & INLINE_HINT_builtin_constant_p)
+{
+  hints &= ~INLINE_HINT_builtin_constant_p;
+  fprintf (f, " builtin_constant_p");
+}
   gcc_assert (!hints);
 }
 
@@ -751,6 +756,7 @@ ipa_fn_summary::~ipa_fn_summary ()
   vec_free (call_size_time_t

[PATCH] x86: Allow configuring with --with-arch_64=x86-64-v[234]

2020-10-21 Thread Jakub Jelinek via Gcc-patches
On Wed, Sep 30, 2020 at 06:06:31PM +0200, Florian Weimer wrote:
> --- a/gcc/common/config/i386/i386-common.c
> +++ b/gcc/common/config/i386/i386-common.c
> @@ -1795,9 +1795,13 @@ const pta processor_alias_table[] =
>  PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR, 0, P_NONE},
>{"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
>  PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR, 0, P_NONE},
> -  {"x86-64", PROCESSOR_K8, CPU_K8,
> -PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR,
> -0, P_NONE},
> +  {"x86-64", PROCESSOR_K8, CPU_K8, PTA_X86_64_BASELINE, 0, P_NONE},
> +  {"x86-64-v2", PROCESSOR_K8, CPU_GENERIC, PTA_X86_64_V2 | PTA_NO_TUNE,
> +   0, P_NONE},
> +  {"x86-64-v3", PROCESSOR_K8, CPU_GENERIC, PTA_X86_64_V3 | PTA_NO_TUNE,
> +   0, P_NONE},
> +  {"x86-64-v4", PROCESSOR_K8, CPU_GENERIC, PTA_X86_64_V4 | PTA_NO_TUNE,
> +   0, P_NONE},
>{"eden-x2", PROCESSOR_K8, CPU_K8,
>  PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR,
>  0, P_NONE},

I have noticed that one can't configure gcc to default to these.

I've also found various other 32-bit or 64-bit -march= arguments for which
it wasn't possible to configure gcc to default to those.

The x86-64-v* the patch only allows in --with-arch_64=, because otherwise
it fails build miserably - as
./xgcc -B ./ -S -march=x86-64-v2 -m32 test.c
cc1: error: ‘x86-64-v2’ architecture level is only defined for the x86-64 psABI
when building 32-bit multilibs.  Even if multilibs are disallowed, I think
the compiler still supports -m32 and so --with-arch_64= seems to be the only
option in which we can support that.

Ok for trunk if this passes bootstrap/regtest?  So far I've just tested that
e.g. --with-tune=x86-64-v3 build fails as expected and --with-arch_64=x86-64-v3
one went fine.

2020-10-21  Jakub Jelinek  

* config.gcc (x86_archs): Add samuel-2, nehemiah, c7 and esther.
(x86_64_archs): Add eden-x2, nano, nano-1000, nano-2000, nano-3000,
nano-x2, eden-x4, nano-x4, x86-64-v2, x86-64-v3 and x86-64-v4.
(i[34567]86-*-* | x86_64-*-*): Only allow x86-64-v* as argument
to --with-arch_64=.

--- gcc/config.gcc.jj   2020-10-15 09:04:50.614521860 +0200
+++ gcc/config.gcc  2020-10-21 17:03:10.396077993 +0200
@@ -662,7 +662,8 @@ tm_defines="$tm_defines LIBC_GLIBC=1 LIB
 x86_archs="athlon athlon-4 athlon-fx athlon-mp athlon-tbird \
 athlon-xp k6 k6-2 k6-3 geode c3 c3-2 winchip-c6 winchip2 i386 i486 \
 i586 i686 pentium pentium-m pentium-mmx pentium2 pentium3 pentium3m \
-pentium4 pentium4m pentiumpro prescott lakemont"
+pentium4 pentium4m pentiumpro prescott lakemont samuel-2 nehemiah \
+c7 esther"
 
 # 64-bit x86 processors supported by --with-arch=.  Each processor
 # MUST be separated by exactly one space.
@@ -672,7 +673,8 @@ opteron-sse3 nocona core2 corei7 corei7-
 slm nehalem westmere sandybridge ivybridge haswell broadwell bonnell \
 silvermont knl knm skylake-avx512 cannonlake icelake-client icelake-server \
 skylake goldmont goldmont-plus tremont cascadelake tigerlake cooperlake \
-sapphirerapids alderlake x86-64 native"
+sapphirerapids alderlake eden-x2 nano nano-1000 nano-2000 nano-3000 \
+nano-x2 eden-x4 nano-x4 x86-64 x86-64-v2 x86-64-v3 x86-64-v4 native"
 
 # Additional x86 processors supported by --with-cpu=.  Each processor
 # MUST be separated by exactly one space.
@@ -4458,6 +4460,17 @@ case "${target}" in
if test x${val} != x; then
case " $x86_64_archs " in
*" ${val} "*)
+   # Disallow x86-64-v* for 
--with-cpu=/--with-tune=
+   # or --with-arch= or 
--with-arch_32=
+   # It can be only specified in 
--with-arch_64=
+   case "x$which$val" in
+   
xcpu*x86-64-v*|xtune*x86-64-v*|xarchx86-64-v*|xarch_32x86-64-v*)
+   echo "Unknown CPU given 
in --with-$which=$val." 1>&2
+   exit 1
+   ;;
+   *)
+   ;;
+   esac
# OK
;;
*)

Jakub



Re: [PATCH] x86: Allow configuring with --with-arch_64=x86-64-v[234]

2020-10-21 Thread Florian Weimer
* Jakub Jelinek:

> On Wed, Sep 30, 2020 at 06:06:31PM +0200, Florian Weimer wrote:
>> --- a/gcc/common/config/i386/i386-common.c
>> +++ b/gcc/common/config/i386/i386-common.c
>> @@ -1795,9 +1795,13 @@ const pta processor_alias_table[] =
>>  PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR, 0, P_NONE},
>>{"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
>>  PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR, 0, P_NONE},
>> -  {"x86-64", PROCESSOR_K8, CPU_K8,
>> -PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR,
>> -0, P_NONE},
>> +  {"x86-64", PROCESSOR_K8, CPU_K8, PTA_X86_64_BASELINE, 0, P_NONE},
>> +  {"x86-64-v2", PROCESSOR_K8, CPU_GENERIC, PTA_X86_64_V2 | PTA_NO_TUNE,
>> +   0, P_NONE},
>> +  {"x86-64-v3", PROCESSOR_K8, CPU_GENERIC, PTA_X86_64_V3 | PTA_NO_TUNE,
>> +   0, P_NONE},
>> +  {"x86-64-v4", PROCESSOR_K8, CPU_GENERIC, PTA_X86_64_V4 | PTA_NO_TUNE,
>> +   0, P_NONE},
>>{"eden-x2", PROCESSOR_K8, CPU_K8,
>>  PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR,
>>  0, P_NONE},
>
> I have noticed that one can't configure gcc to default to these.
>
> I've also found various other 32-bit or 64-bit -march= arguments for which
> it wasn't possible to configure gcc to default to those.
>
> The x86-64-v* the patch only allows in --with-arch_64=, because otherwise
> it fails build miserably - as
> ./xgcc -B ./ -S -march=x86-64-v2 -m32 test.c
> cc1: error: ‘x86-64-v2’ architecture level is only defined for the x86-64 
> psABI
> when building 32-bit multilibs.  Even if multilibs are disallowed, I think
> the compiler still supports -m32 and so --with-arch_64= seems to be the only
> option in which we can support that.

Yes, that was certainly my intent.


Re: [PATCH] [PR rtl-optimization/97249]Simplify vec_select of paradoxical subreg.

2020-10-21 Thread Richard Sandiford via Gcc-patches
Hongtao Liu  writes:
> +   poly_uint64 nunits
> + = GET_MODE_NUNITS (GET_MODE (SUBREG_REG (trueop0)));
> +   rtx par = trueop1;
> +   for (int i = 0; i != l1; i++)
> + {
> +   rtx idx = XVECEXP (trueop1, 0, i);
> +   if (!CONST_INT_P (idx)
> +   || maybe_ge (UINTVAL (idx) + subreg_offset, nunits))
> + return 0;
> + }

I think the previous version was better.  We shouldn't assume that
further simplification rules will fail just because the conditions
for this rule haven't been met.

Thanks,
Richard


Re: [PATCH][middle-end][i386][version 3]Add -fzero-call-used-regs=[skip|used-gpr-arg|used-arg|all-arg|used-gpr|all-gpr|used|all]

2020-10-21 Thread Richard Sandiford via Gcc-patches
Qing Zhao  writes:
>>> +  /* For each of the hard registers, check to see whether we should zero 
>>> it if:
>>> + 1. it is a call-used-registers;
>>> + and 2. it is not a fixed-registers;
>>> + and 3. it is not live at the return of the routine;
>>> + and 4. it is a general register if gpr_only is true;
>>> + and 5. it is used in the routine if used_only is true;
>>> + and 6. it is a register that passes parameter if arg_only is true;
>>> +   */
>>> +
>>> +  HARD_REG_SET need_zeroed_hardregs;
>>> +  CLEAR_HARD_REG_SET (need_zeroed_hardregs);
>>> +  for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
>>> +{
>>> +  if (!this_target_hard_regs->x_call_used_regs[regno])
>>> +   continue;
>> 
>> This should use crtl->abi instead.  The set of call-used registers
>> can vary from function to function.
>
> You mean to use:
>
> If (!crtl->abi->clobbers_full_reg_p(regno))
>
> ?

Yeah, that's right.  (But with a space before “(regno)” :-))

>>> +static unsigned int
>>> +rest_of_zero_call_used_regs (void)
>>> +{
>>> +  basic_block bb;
>>> +  rtx_insn *insn;
>>> +
>>> +  /* This pass needs data flow information.  */
>>> +  df_analyze ();
>>> +
>>> +  /* Search all the "return"s in the routine, and insert instruction 
>>> sequence to
>>> + zero the call used registers.  */
>>> +  FOR_EACH_BB_REVERSE_FN (bb, cfun)
>>> +if (bb == EXIT_BLOCK_PTR_FOR_FN (cfun)
>>> +   || (single_succ_p (bb)
>>> +   && single_succ (bb) == EXIT_BLOCK_PTR_FOR_FN (cfun)))
>>> +  FOR_BB_INSNS_REVERSE (bb, insn)
>>> +   if (JUMP_P (insn) && ANY_RETURN_P (JUMP_LABEL (insn)))
>>> + {
>>> +   /* Now we can insert the instruction sequence to zero the call used
>>> +  registers before this insn.  */
>>> +   gen_call_used_regs_seq (insn);
>>> +   break;
>>> + }
>> 
>> The exit block never has instructions, so it's only necessary to process
>> predecessors.  A simpler way to do that is to iterate over the edges in:
>> 
>>  EXIT_BLOCK_PTR_FOR_FN (cfun)->preds
>> 
>> You shouldn't need to use FOR_BB_INSNS_REVERSE: it should be enough
>> to check only BB_END (bb), since returns always end blocks.
>
> Something like the following?
>
>   FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
> {
>  insn = BB_END (e->src);
>   If (JUMP_P (insn) && ANY_RETURN_P (JUMP_LABEL (insn)))
> {
> /* Now we can insert the instruction sequence to zero the call used
>  registers before this insn.  */
>   gen_call_used_regs_seq (insn);
>   break;   
> }
>   }

With this you don't want/need the break, since it would break out
of the edge traversal (instead of the FOR_BB_INSNS_REVERSE, as above).
Also, I think the code becomes simple enough that the comment isn't
really needed.

Thanks,
Richard


Re: [PATCH] arm: Fix multiple inheritance thunks for thumb-1 with -mpure-code

2020-10-21 Thread Christophe Lyon via Gcc-patches
On Tue, 20 Oct 2020 at 13:25, Richard Earnshaw
 wrote:
>
> On 20/10/2020 12:22, Richard Earnshaw wrote:
> > On 19/10/2020 17:32, Christophe Lyon via Gcc-patches wrote:
> >> On Mon, 19 Oct 2020 at 16:39, Richard Earnshaw
> >>  wrote:
> >>>
> >>> On 12/10/2020 08:59, Christophe Lyon via Gcc-patches wrote:
>  On Thu, 8 Oct 2020 at 11:58, Richard Earnshaw
>   wrote:
> >
> > On 08/10/2020 10:07, Christophe Lyon via Gcc-patches wrote:
> >> On Tue, 6 Oct 2020 at 18:02, Richard Earnshaw
> >>  wrote:
> >>>
> >>> On 29/09/2020 20:50, Christophe Lyon via Gcc-patches wrote:
>  When mi_delta is > 255 and -mpure-code is used, we cannot load delta
>  from code memory (like we do without -mpure-code).
> 
>  This patch builds the value of mi_delta into r3 with a series of
>  movs/adds/lsls.
> 
>  We also do some cleanup by not emitting the function address and 
>  delta
>  via .word directives at the end of the thunk since we don't use them
>  with -mpure-code.
> 
>  No need for new testcases, this bug was already identified by
>  eg. pr46287-3.C
> 
>  2020-09-29  Christophe Lyon  
> 
>    gcc/
>    * config/arm/arm.c (arm_thumb1_mi_thunk): Build mi_delta in r3 
>  and
>    do not emit function address and delta when -mpure-code is 
>  used.
> >>>
> >> Hi Richard,
> >>
> >> Thanks for your comments.
> >>
> >>> There are some optimizations you can make to this code.
> >>>
> >>> Firstly, for values between 256 and 510 (inclusive), it would be 
> >>> better
> >>> to just expand a mov of 255 followed by an add.
>> I now see the splitter for the "Pe" constraint which I hadn't noticed
> >> before, so I can write something similar indeed.
> >>
> >> However, I'm note quite sure to understand the benefit in the split
> >> when -mpure-code is NOT used.
> >> Consider:
> >> int f3_1 (void) { return 510; }
> >> int f3_2 (void) { return 511; }
> >> Compile with -O2 -mcpu=cortex-m0:
> >> f3_1:
> >> movsr0, #255
> >> lslsr0, r0, #1
> >> bx  lr
> >> f3_2:
> >> ldr r0, .L4
> >> bx  lr
> >>
> >> The splitter makes the code bigger, does it "compensate" for this by
> >> not having to load the constant?
> >> Actually the constant uses 4 more bytes, which should be taken into
> >> account when comparing code size,
> >
> > Yes, the size of the literal pool entry needs to be taken into account.
> >  It might happen that the entry could be shared with another use of that
> > literal, but in general that's rare.
> >
> >> so f3_1 uses 6 bytes, and f3_2 uses 8, so as you say below three
> >> thumb1 instructions would be equivalent in size compared to loading
> >> from the literal pool. Should the 256-510 range be extended?
> >
> > It's a bit borderline at three instructions when literal pools are not
> > expensive to use, but in thumb1 literal pools tend to be quite small due
> > to the limited pc offsets we can use.  I think on balance we probably
> > want to use the instruction sequence unless optimizing for size.
> >
> >>
> >>
> >>> This is also true for
> >>> the literal pools alternative as well, so should be handled before all
> >>> this.
> >> I am not sure what you mean: with -mpure-code, the above sample is 
> >> compiled as:
> >> f3_1:
> >> movsr0, #255
> >> lslsr0, r0, #1
> >> bx  lr
> >> f3_2:
> >> movsr0, #1
> >> lslsr0, r0, #8
> >> addsr0, r0, #255
> >> bx  lr
> >>
> >> so the "return 510" case is already handled as without -mpure-code.
> >
> > I was thinking specifically of the thunk sequence where you seem to be
> > emitting instructions directly rather than generating RTL.  The examples
> > you show here are not thunks.
> >
>  OK thanks for the clarification.
> 
>  Here is an updated version, split into 3 patches to hopefully make
>  review easier.
>  They apply on top of my other mpure-code patches for PR96967 and PR96770:
>  https://gcc.gnu.org/pipermail/gcc-patches/2020-September/554956.html
>  https://gcc.gnu.org/pipermail/gcc-patches/2020-September/554957.html
> 
>  I kept it this way to make incremental changes easier to understand.
> 
>  Patch 1: With the hope to avoid confusion and make maintenance easier,
>  I have updated thumb1_gen_const_int() so that it can generate either RTL 
>  or
>  asm. This way, all the code used to build thumb-1 constants is in the
>  same place,
>   in case we need to improve/fix it later. We now generate shorter 
> >>

Re: [PATCH] arm: Fix multiple inheritance thunks for thumb-1 with -mpure-code

2020-10-21 Thread Richard Earnshaw via Gcc-patches
On 21/10/2020 16:49, Christophe Lyon via Gcc-patches wrote:
> On Tue, 20 Oct 2020 at 13:25, Richard Earnshaw
>  wrote:
>>
>> On 20/10/2020 12:22, Richard Earnshaw wrote:
>>> On 19/10/2020 17:32, Christophe Lyon via Gcc-patches wrote:
 On Mon, 19 Oct 2020 at 16:39, Richard Earnshaw
  wrote:
>
> On 12/10/2020 08:59, Christophe Lyon via Gcc-patches wrote:
>> On Thu, 8 Oct 2020 at 11:58, Richard Earnshaw
>>  wrote:
>>>
>>> On 08/10/2020 10:07, Christophe Lyon via Gcc-patches wrote:
 On Tue, 6 Oct 2020 at 18:02, Richard Earnshaw
  wrote:
>
> On 29/09/2020 20:50, Christophe Lyon via Gcc-patches wrote:
>> When mi_delta is > 255 and -mpure-code is used, we cannot load delta
>> from code memory (like we do without -mpure-code).
>>
>> This patch builds the value of mi_delta into r3 with a series of
>> movs/adds/lsls.
>>
>> We also do some cleanup by not emitting the function address and 
>> delta
>> via .word directives at the end of the thunk since we don't use them
>> with -mpure-code.
>>
>> No need for new testcases, this bug was already identified by
>> eg. pr46287-3.C
>>
>> 2020-09-29  Christophe Lyon  
>>
>>   gcc/
>>   * config/arm/arm.c (arm_thumb1_mi_thunk): Build mi_delta in r3 
>> and
>>   do not emit function address and delta when -mpure-code is 
>> used.
>
 Hi Richard,

 Thanks for your comments.

> There are some optimizations you can make to this code.
>
> Firstly, for values between 256 and 510 (inclusive), it would be 
> better
> to just expand a mov of 255 followed by an add.
>  I now see the splitter for the "Pe" constraint which I hadn't noticed
 before, so I can write something similar indeed.

>  However, I'm not quite sure I understand the benefit in the split
 when -mpure-code is NOT used.
 Consider:
 int f3_1 (void) { return 510; }
 int f3_2 (void) { return 511; }
 Compile with -O2 -mcpu=cortex-m0:
 f3_1:
 movsr0, #255
 lslsr0, r0, #1
 bx  lr
 f3_2:
 ldr r0, .L4
 bx  lr

 The splitter makes the code bigger, does it "compensate" for this by
 not having to load the constant?
 Actually the constant uses 4 more bytes, which should be taken into
 account when comparing code size,
>>>
>>> Yes, the size of the literal pool entry needs to be taken into account.
>>>  It might happen that the entry could be shared with another use of that
>>> literal, but in general that's rare.
>>>
 so f3_1 uses 6 bytes, and f3_2 uses 8, so as you say below three
 thumb1 instructions would be equivalent in size compared to loading
 from the literal pool. Should the 256-510 range be extended?
>>>
>>> It's a bit borderline at three instructions when literal pools are not
>>> expensive to use, but in thumb1 literal pools tend to be quite small due
>>> to the limited pc offsets we can use.  I think on balance we probably
>>> want to use the instruction sequence unless optimizing for size.
>>>


> This is also true for
> the literal pools alternative as well, so should be handled before all
> this.
 I am not sure what you mean: with -mpure-code, the above sample is 
 compiled as:
 f3_1:
 movsr0, #255
 lslsr0, r0, #1
 bx  lr
 f3_2:
 movsr0, #1
 lslsr0, r0, #8
 addsr0, r0, #255
 bx  lr

 so the "return 510" case is already handled as without -mpure-code.
>>>
>>> I was thinking specifically of the thunk sequence where you seem to be
>>> emitting instructions directly rather than generating RTL.  The examples
>>> you show here are not thunks.
>>>
>> OK thanks for the clarification.
>>
>> Here is an updated version, split into 3 patches to hopefully make
>> review easier.
>> They apply on top of my other mpure-code patches for PR96967 and PR96770:
>> https://gcc.gnu.org/pipermail/gcc-patches/2020-September/554956.html
>> https://gcc.gnu.org/pipermail/gcc-patches/2020-September/554957.html
>>
>> I kept it this way to make incremental changes easier to understand.
>>
>> Patch 1: With the hope to avoid confusion and make maintenance easier,
>> I have updated thumb1_gen_const_int() so that it can generate either RTL 
>> or
>> asm. This way, all the code used to build thumb-1 constants is in the
>> same place,
>>  in c

Re: [PATCH][middle-end][i386][version 3]Add -fzero-call-used-regs=[skip|used-gpr-arg|used-arg|all-arg|used-gpr|all-gpr|used|all]

2020-10-21 Thread Uros Bizjak via Gcc-patches
On Wed, Oct 21, 2020 at 4:45 PM Qing Zhao  wrote:
>
>
>
> On Oct 21, 2020, at 3:03 AM, Uros Bizjak  wrote:
>
> On Wed, Oct 21, 2020 at 9:18 AM Uros Bizjak  wrote:
>
>
> On Tue, Oct 20, 2020 at 10:04 PM Qing Zhao  wrote:
>
> +/* Check whether the register REGNO should be zeroed on X86.
> +   When ALL_SSE_ZEROED is true, all SSE registers have been zeroed
> +   together, no need to zero it again.
> +   Stack registers (st0-st7) and mm0-mm7 are aliased with each other.
> +   very hard to be zeroed individually, don't zero individual st or
> +   mm registers at this time.  */
> +
> +static bool
> +zero_call_used_regno_p (const unsigned int regno,
> + bool all_sse_zeroed)
> +{
> +  return GENERAL_REGNO_P (regno)
> +  || (!all_sse_zeroed && SSE_REGNO_P (regno))
> +  || MASK_REGNO_P (regno);
> +}
> +
> +/* Return the machine_mode that is used to zero register REGNO.  */
> +
> +static machine_mode
> +zero_call_used_regno_mode (const unsigned int regno)
> +{
> +  /* NB: We only need to zero the lower 32 bits for integer registers
> + and the lower 128 bits for vector registers since destination are
> + zero-extended to the full register width.  */
> +  if (GENERAL_REGNO_P (regno))
> +return SImode;
> +  else if (SSE_REGNO_P (regno))
> +return V4SFmode;
> +  else
> +return HImode;
> +}
> +
> +/* Generate a rtx to zero all vector registers together if possible,
> +   otherwise, return NULL.  */
> +
> +static rtx
> +zero_all_vector_registers (HARD_REG_SET need_zeroed_hardregs)
> +{
> +  if (!TARGET_AVX)
> +return NULL;
> +
> +  for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
> +if ((IN_RANGE (regno, FIRST_SSE_REG, LAST_SSE_REG)
> +  || (TARGET_64BIT
> +  && (REX_SSE_REGNO_P (regno)
> +  || (TARGET_AVX512F && EXT_REX_SSE_REGNO_P (regno)
> + && !TEST_HARD_REG_BIT (need_zeroed_hardregs, regno))
> +  return NULL;
> +
> +  return gen_avx_vzeroall ();
> +}
> +
> +/* Generate a rtx to zero all st and mm registers together if possible,
> +   otherwise, return NULL.  */
> +
> +static rtx
> +zero_all_st_mm_registers (HARD_REG_SET need_zeroed_hardregs)
> +{
> +  if (!TARGET_MMX)
> +return NULL;
> +
> +  for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
> +if ((STACK_REGNO_P (regno) || MMX_REGNO_P (regno))
> + && !TEST_HARD_REG_BIT (need_zeroed_hardregs, regno))
> +  return NULL;
> +
> +  return gen_mmx_emms ();
>
>
> emms is not clearing any register, it only loads x87FPUTagWord with
> 0xFFFF. So I think, the above is useless, as far as register clearing
> is concerned.
>
>
> Thanks for the info.
>
> So, for mm and st registers, should we clear them, and how?
>
>
> I don't know.
>
> Please note that %mm and %st share the same register file, and
> touching %mm registers will block access to %st until emms is emitted.
> You can't just blindly load 0 to %st registers, because the register
> file can be in MMX mode and vice versa. For 32bit targets, function
> can also  return a value in the %mm0.
>
>
> If data flow determine that %mm0 does not return a value at the return, can 
> we clear all the %st as following:
>
> emms
> mov %st0, 0
> mov %st1, 0
> mov %st2, 0
> mov %st3, 0
> mov %st4, 0
> mov %st5, 0
> mov %st6, 0
> mov %st7, 0
>
>
> The i386 ABI says:
>
> -- q --
> The CPU shall be in x87 mode upon entry to a function. Therefore,
> every function that uses the MMX registers is required to issue an
> emms or femms instruction after using MMX registers, before returning
> or calling another function.
> -- /q --
>
> (The above requirement slightly contradicts its own ABI, since we have
> 3 MMX argument registers and MMX return register, so the CPU obviously
> can't be in x87 mode at all function boundaries).
>
> So, assuming that the first sentence is not deliberately vague w.r.t
> function exit, emms should not be needed. However, we are dealing with
> x87 stack registers that have their own set of peculiarities. It is
> not possible to load a random register in the way you show.  Also,
> stack should be either empty or one (two in case of complex value
> return) levels deep at the function return. I think you want a series
> of 8 or 7(6) fldz insns, followed by a series of fstp insn to clear
> the stack and mark stack slots empty.
>
>
> Something like this:
>
> --cut here--
> long double
> __attribute__ ((noinline))
> test (long double a, long double b)
> {
>  long double r = a + b;
>
>  asm volatile ("fldz;\
>fldz;\
>fldz;\
>fldz;\
>fldz;\
>fldz;\
>fldz;\
>fstp %%st(0);\
>fstp %%st(0);\
>fstp %%st(0);\
>fstp %%st(0);\
>fstp %%st(0);\
>fstp %%st(0);\
>fstp %%st(0)" : : "X"(r));
>  return r;
> }
>
> int
> main ()
> {
>  long double a = 1.1, b = 1.2;
>
>  

Re: [PATCH] arm: Fix multiple inheritance thunks for thumb-1 with -mpure-code

2020-10-21 Thread Christophe Lyon via Gcc-patches
On Wed, 21 Oct 2020 at 18:07, Richard Earnshaw
 wrote:
>
> On 21/10/2020 16:49, Christophe Lyon via Gcc-patches wrote:
> > On Tue, 20 Oct 2020 at 13:25, Richard Earnshaw
> >  wrote:
> >>
> >> On 20/10/2020 12:22, Richard Earnshaw wrote:
> >>> On 19/10/2020 17:32, Christophe Lyon via Gcc-patches wrote:
>  On Mon, 19 Oct 2020 at 16:39, Richard Earnshaw
>   wrote:
> >
> > On 12/10/2020 08:59, Christophe Lyon via Gcc-patches wrote:
> >> On Thu, 8 Oct 2020 at 11:58, Richard Earnshaw
> >>  wrote:
> >>>
> >>> On 08/10/2020 10:07, Christophe Lyon via Gcc-patches wrote:
>  On Tue, 6 Oct 2020 at 18:02, Richard Earnshaw
>   wrote:
> >
> > On 29/09/2020 20:50, Christophe Lyon via Gcc-patches wrote:
> >> When mi_delta is > 255 and -mpure-code is used, we cannot load 
> >> delta
> >> from code memory (like we do without -mpure-code).
> >>
> >> This patch builds the value of mi_delta into r3 with a series of
> >> movs/adds/lsls.
> >>
> >> We also do some cleanup by not emitting the function address and 
> >> delta
> >> via .word directives at the end of the thunk since we don't use 
> >> them
> >> with -mpure-code.
> >>
> >> No need for new testcases, this bug was already identified by
> >> eg. pr46287-3.C
> >>
> >> 2020-09-29  Christophe Lyon  
> >>
> >>   gcc/
> >>   * config/arm/arm.c (arm_thumb1_mi_thunk): Build mi_delta in 
> >> r3 and
> >>   do not emit function address and delta when -mpure-code is 
> >> used.
> >
>  Hi Richard,
> 
>  Thanks for your comments.
> 
> > There are some optimizations you can make to this code.
> >
> > Firstly, for values between 256 and 510 (inclusive), it would be 
> > better
> > to just expand a mov of 255 followed by an add.
>  I now see the splitter for the "Pe" constraint which I hadn't noticed
>  before, so I can write something similar indeed.
> 
>  However, I'm not quite sure I understand the benefit in the split
>  when -mpure-code is NOT used.
>  Consider:
>  int f3_1 (void) { return 510; }
>  int f3_2 (void) { return 511; }
>  Compile with -O2 -mcpu=cortex-m0:
>  f3_1:
>  movsr0, #255
>  lslsr0, r0, #1
>  bx  lr
>  f3_2:
>  ldr r0, .L4
>  bx  lr
> 
>  The splitter makes the code bigger, does it "compensate" for this by
>  not having to load the constant?
>  Actually the constant uses 4 more bytes, which should be taken into
>  account when comparing code size,
> >>>
> >>> Yes, the size of the literal pool entry needs to be taken into 
> >>> account.
> >>>  It might happen that the entry could be shared with another use of 
> >>> that
> >>> literal, but in general that's rare.
> >>>
>  so f3_1 uses 6 bytes, and f3_2 uses 8, so as you say below three
>  thumb1 instructions would be equivalent in size compared to loading
>  from the literal pool. Should the 256-510 range be extended?
> >>>
> >>> It's a bit borderline at three instructions when literal pools are not
> >>> expensive to use, but in thumb1 literal pools tend to be quite small 
> >>> due
> >>> to the limited pc offsets we can use.  I think on balance we probably
> >>> want to use the instruction sequence unless optimizing for size.
> >>>
> 
> 
> > This is also true for
> > the literal pools alternative as well, so should be handled before 
> > all
> > this.
>  I am not sure what you mean: with -mpure-code, the above sample is 
>  compiled as:
>  f3_1:
>  movsr0, #255
>  lslsr0, r0, #1
>  bx  lr
>  f3_2:
>  movsr0, #1
>  lslsr0, r0, #8
>  addsr0, r0, #255
>  bx  lr
> 
>  so the "return 510" case is already handled as without -mpure-code.
> >>>
> >>> I was thinking specifically of the thunk sequence where you seem to be
> >>> emitting instructions directly rather than generating RTL.  The 
> >>> examples
> >>> you show here are not thunks.
> >>>
> >> OK thanks for the clarification.
> >>
> >> Here is an updated version, split into 3 patches to hopefully make
> >> review easier.
> >> They apply on top of my other mpure-code patches for PR96967 and 
> >> PR96770:
> >> https://gcc.gnu.org/pipermail/gcc-patches/2020-September/554956.html
> >> https://gcc.gnu.org/pipermail/gcc-patches/2020-September

Re: [PATCH] x86: Allow configuring with --with-arch_64=x86-64-v[234]

2020-10-21 Thread Uros Bizjak via Gcc-patches
On Wed, Oct 21, 2020 at 5:15 PM Jakub Jelinek  wrote:
>
> On Wed, Sep 30, 2020 at 06:06:31PM +0200, Florian Weimer wrote:
> > --- a/gcc/common/config/i386/i386-common.c
> > +++ b/gcc/common/config/i386/i386-common.c
> > @@ -1795,9 +1795,13 @@ const pta processor_alias_table[] =
> >  PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR, 0, P_NONE},
> >{"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
> >  PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR, 0, P_NONE},
> > -  {"x86-64", PROCESSOR_K8, CPU_K8,
> > -PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR,
> > -0, P_NONE},
> > +  {"x86-64", PROCESSOR_K8, CPU_K8, PTA_X86_64_BASELINE, 0, P_NONE},
> > +  {"x86-64-v2", PROCESSOR_K8, CPU_GENERIC, PTA_X86_64_V2 | PTA_NO_TUNE,
> > +   0, P_NONE},
> > +  {"x86-64-v3", PROCESSOR_K8, CPU_GENERIC, PTA_X86_64_V3 | PTA_NO_TUNE,
> > +   0, P_NONE},
> > +  {"x86-64-v4", PROCESSOR_K8, CPU_GENERIC, PTA_X86_64_V4 | PTA_NO_TUNE,
> > +   0, P_NONE},
> >{"eden-x2", PROCESSOR_K8, CPU_K8,
> >  PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR,
> >  0, P_NONE},
>
> I have noticed that one can't configure gcc to default to these.
>
> I've also found various other 32-bit or 64-bit -march= arguments for which
> it wasn't possible to configure gcc to default to those.
>
> The x86-64-v* the patch only allows in --with-arch_64=, because otherwise
> it fails build miserably - as
> ./xgcc -B ./ -S -march=x86-64-v2 -m32 test.c
> cc1: error: ‘x86-64-v2’ architecture level is only defined for the x86-64 
> psABI
> when building 32-bit multilibs.  Even if multilibs are disallowed, I think
> the compiler still supports -m32 and so --with-arch_64= seems to be the only
> option in which we can support that.
>
> Ok for trunk if this passes bootstrap/regtest?  So far I've just tested that
> e.g. --with-tune=x86-64-v3 build fails as expected and 
> --with-arch_64=x86-64-v3
> one went fine.
>
> 2020-10-21  Jakub Jelinek  
>
> * config.gcc (x86_archs): Add samuel-2, nehemiah, c7 and esther.
> (x86_64_archs): Add eden-x2, nano, nano-1000, nano-2000, nano-3000,
> nano-x2, eden-x4, nano-x4, x86-64-v2, x86-64-v3 and x86-64-v4.
> (i[34567]86-*-* | x86_64-*-*): Only allow x86-64-v* as argument
> to --with-arch_64=.

LGTM.

Thanks,
Uros.

>
> --- gcc/config.gcc.jj   2020-10-15 09:04:50.614521860 +0200
> +++ gcc/config.gcc  2020-10-21 17:03:10.396077993 +0200
> @@ -662,7 +662,8 @@ tm_defines="$tm_defines LIBC_GLIBC=1 LIB
>  x86_archs="athlon athlon-4 athlon-fx athlon-mp athlon-tbird \
>  athlon-xp k6 k6-2 k6-3 geode c3 c3-2 winchip-c6 winchip2 i386 i486 \
>  i586 i686 pentium pentium-m pentium-mmx pentium2 pentium3 pentium3m \
> -pentium4 pentium4m pentiumpro prescott lakemont"
> +pentium4 pentium4m pentiumpro prescott lakemont samuel-2 nehemiah \
> +c7 esther"
>
>  # 64-bit x86 processors supported by --with-arch=.  Each processor
>  # MUST be separated by exactly one space.
> @@ -672,7 +673,8 @@ opteron-sse3 nocona core2 corei7 corei7-
>  slm nehalem westmere sandybridge ivybridge haswell broadwell bonnell \
>  silvermont knl knm skylake-avx512 cannonlake icelake-client icelake-server \
>  skylake goldmont goldmont-plus tremont cascadelake tigerlake cooperlake \
> -sapphirerapids alderlake x86-64 native"
> +sapphirerapids alderlake eden-x2 nano nano-1000 nano-2000 nano-3000 \
> +nano-x2 eden-x4 nano-x4 x86-64 x86-64-v2 x86-64-v3 x86-64-v4 native"
>
>  # Additional x86 processors supported by --with-cpu=.  Each processor
>  # MUST be separated by exactly one space.
> @@ -4458,6 +4460,17 @@ case "${target}" in
> if test x${val} != x; then
> case " $x86_64_archs " in
> *" ${val} "*)
> +   # Disallow x86-64-v* for 
> --with-cpu=/--with-tune=
> +   # or --with-arch= or 
> --with-arch_32=
> +   # It can be only specified in 
> --with-arch_64=
> +   case "x$which$val" in
> +   
> xcpu*x86-64-v*|xtune*x86-64-v*|xarchx86-64-v*|xarch_32x86-64-v*)
> +   echo "Unknown CPU 
> given in --with-$which=$val." 1>&2
> +   exit 1
> +   ;;
> +   *)
> +   ;;
> +   esac
> # OK
> ;;
> *)
>
> Jakub
>


Re: [Ada] Improve precision of Ada.Directories.Modification_Time

2020-10-21 Thread Iain Sandoe via Gcc-patches
Hi Folks,

This patch breaks bootstrap on Darwin platforms.

Pierre-Marie de Rodat  wrote:

> The modification file time precision is now defined by the OS.
> 
> Tested on x86_64-pc-linux-gnu, committed on trunk
> 
> gcc/ada/
> 
>   * adaint.c (__gnat_file_time): New routine.
>   (__gnat_copy_attribs): Copy timestamps in nanoseconds.
>   * libgnat/a-direct.adb (C_Modification_Time): Bind to
>   __gnat_file_time.
>   (Modification_Time): Call to C_Modification_Time.

#if defined(st_mtime)

is a necessary test - but the fields in the stat structure on Darwin platforms 
are
named st_{a,c,m}timespec rather than the Linux st_{a,c,m}tim.

The following patch is a fix lightly tested,
OK for master (if remaining testing is successful) or you have an alternate 
suggestion?

thanks
Iain

diff --git a/gcc/ada/adaint.c b/gcc/ada/adaint.c
index b7406a03c31..ac5738a60d2 100644
--- a/gcc/ada/adaint.c
+++ b/gcc/ada/adaint.c
@@ -1528,8 +1528,12 @@ extern long long __gnat_file_time(char* name)
 #if defined(__GNUG__) && __GNUG__ <= 4
 result = (sb.st_mtime - ada_epoch_offset) * 1E9;
 #if defined(st_mtime)
+#if __APPLE__
+result += sb.st_mtimespec.tv_nsec;
+#else
 result += sb.st_mtim.tv_nsec;
 #endif
+#endif
 #else
   /* Next code similar to
  (sb.st_mtime - ada_epoch_offset) * 1E9 + sb.st_mtim.tv_nsec
@@ -1544,11 +1548,17 @@ extern long long __gnat_file_time(char* name)
   }
 
 #if defined(st_mtime)
+#if __APPLE__
+  if (__builtin_saddll_overflow(result, sb.st_mtimespec.tv_nsec, &result)) {
+return LLONG_MIN;
+  }
+#else
   if (__builtin_saddll_overflow(result, sb.st_mtim.tv_nsec, &result)) {
 return LLONG_MIN;
   }
 #endif
 #endif
+#endif
 #endif
   return result;
 }
@@ -3278,8 +3288,13 @@ __gnat_copy_attribs (char *from ATTRIBUTE_UNUSED, char 
*to ATTRIBUTE_UNUSED,
  tbuf[1].tv_sec  = fbuf.st_mtime;
 
  #if defined(st_mtime)
+ #if __APPLE__
+ tbuf[0].tv_usec = fbuf.st_atimespec.tv_nsec / 1000;
+ tbuf[1].tv_usec = fbuf.st_mtimespec.tv_nsec / 1000;
+ #else
  tbuf[0].tv_usec = fbuf.st_atim.tv_nsec / 1000;
  tbuf[1].tv_usec = fbuf.st_mtim.tv_nsec / 1000;
+ #endif
  #else
  tbuf[0].tv_usec = 0;
  tbuf[1].tv_usec = 0;



Re: [PATCH] [PR rtl-optimization/97249]Simplify vec_select of paradoxical subreg.

2020-10-21 Thread Segher Boessenkool
On Wed, Oct 21, 2020 at 04:43:29PM +0100, Richard Sandiford wrote:
> Hongtao Liu  writes:
> > + poly_uint64 nunits
> > +   = GET_MODE_NUNITS (GET_MODE (SUBREG_REG (trueop0)));
> > + rtx par = trueop1;
> > + for (int i = 0; i != l1; i++)
> > +   {
> > + rtx idx = XVECEXP (trueop1, 0, i);
> > + if (!CONST_INT_P (idx)
> > + || maybe_ge (UINTVAL (idx) + subreg_offset, nunits))
> > +   return 0;
> > +   }
> 
> I think the previous version was better.  We shouldn't assume that
> further simplification rules will fail just because the conditions
> for this rule haven't been met.

Yes.  My suggestion was to factor this big piece of code to a separate
function, and do an early return from *that*.

The patch is okay for trunk without that, with the clumsy booleans.
Thanks Hongtao!


Segher


Re: [Ada] Improve precision of Ada.Directories.Modification_Time

2020-10-21 Thread Arnaud Charlet
> This patch breaks bootstrap on Darwin platforms.
> 
> Pierre-Marie de Rodat  wrote:
> 
> > The modification file time precision now defined by OS.
> > 
> > Tested on x86_64-pc-linux-gnu, committed on trunk
> > 
> > gcc/ada/
> > 
> > * adaint.c (__gnat_file_time): New routine.
> > (__gnat_copy_attribs): Copy timestamps in nanoseconds.
> > * libgnat/a-direct.adb (C_Modification_Time): Bind to
> > __gnat_file_time.
> > (Modification_Time): Call to C_Modification_Time.
> 
> #if defined(st_mtime)
> 
> is a necessary test - but the fields in the stat structure on Darwin 
> platforms are
> named st_{a,c,m}timespec rather than the Linux st_{a,c,m}tim.

What about instead putting above extern long long __gnat_file_time the
following:

#if __APPLE__
#define st_mtim st_mtimespec
#define st_atim st_atimespec
#endif

To avoid having the two (nested) #if __APPLE__ and keep the code easier
to follow?

Arno


[PATCH] c++: constexpr evaluation and bare EMPTY_CLASS_EXPR [PR96575]

2020-10-21 Thread Patrick Palka via Gcc-patches
In the testcase below, folding of the initializer for 'ret' inside the
instantiated f::lambda ends up yielding an initializer for which
potential_constant_expression returns false.  This causes finish_function
to mark the lambda as non-constexpr, which ultimately causes us to reject
'f(g)' as a call to a non-constexpr function.

The initializer for 'ret' inside f::lambda, prior to folding, is
the CALL_EXPR

  ::operator() (&cb, ({}, <<< Unknown tree: empty_class_expr >>>;))

where the second argument is a COMPOUND_EXPR whose second operand is an
EMPTY_CLASS_EXPR, formed by build_class_a.  cp_fully_fold_init is able
to only partially fold this initializer, doing away with the COMPOUND_EXPR
to yield

  ::operator() (&cb, <<< Unknown tree: empty_class_expr >>>)

as the final initializer for 'ret'.  This initializer no longer satisfies
potential_constant_expression because this predicate returns false when
it sees a bare EMPTY_CLASS_EXPR that's not wrapped in a COMPOUND_EXPR.

(cp_fully_fold_init first tries maybe_constant_value on the original
CALL_EXPR, but constexpr evaluation punts upon seeing
__builtin_is_constant_evaluated, since manifestly_const_eval is false.)

To fix this, it seems to me we could either make cp_fold preserve
the COMPOUND_EXPR trees produced by build_call_a, or we could
modify potential_constant_expression and friends to handle "bare"
EMPTY_CLASS_EXPR trees.  Assuming it's safe to continue folding
away these COMPOUND_EXPRs, the second approach seems cleaner, so this
patch implements the second approach.

Bootstrapped and regtested on x86_64-pc-linux-gnu, does this look OK for
trunk?

gcc/cp/ChangeLog:

PR c++/96575
* constexpr.c (cxx_eval_constant_expression)
: Remove now-redundant handling of
COMPOUND_EXPR with EMPTY_CLASS_EXPR second operand.
: Lower it into a CONSTRUCTOR.
(potential_constant_expression_1) : Remove
now-redundant handling of COMPOUND_EXPR with EMPTY_CLASS_EXPR
second operand.
: Return true instead of false.

gcc/testsuite/ChangeLog:

PR c++/96575
* g++.dg/cpp1z/constexpr-96575.C: New test.
---
 gcc/cp/constexpr.c   | 20 
 gcc/testsuite/g++.dg/cpp1z/constexpr-96575.C | 19 +++
 2 files changed, 27 insertions(+), 12 deletions(-)
 create mode 100644 gcc/testsuite/g++.dg/cpp1z/constexpr-96575.C

diff --git a/gcc/cp/constexpr.c b/gcc/cp/constexpr.c
index a118f8a810b..0c13ff4db71 100644
--- a/gcc/cp/constexpr.c
+++ b/gcc/cp/constexpr.c
@@ -6070,13 +6070,11 @@ cxx_eval_constant_expression (const constexpr_ctx *ctx, 
tree t,
 case COMPOUND_EXPR:
   {
/* check_return_expr sometimes wraps a TARGET_EXPR in a
-  COMPOUND_EXPR; don't get confused.  Also handle EMPTY_CLASS_EXPR
-  introduced by build_call_a.  */
+  COMPOUND_EXPR; don't get confused.  */
tree op0 = TREE_OPERAND (t, 0);
tree op1 = TREE_OPERAND (t, 1);
STRIP_NOPS (op1);
-   if ((TREE_CODE (op0) == TARGET_EXPR && op1 == TARGET_EXPR_SLOT (op0))
-   || TREE_CODE (op1) == EMPTY_CLASS_EXPR)
+   if (TREE_CODE (op0) == TARGET_EXPR && op1 == TARGET_EXPR_SLOT (op0))
  r = cxx_eval_constant_expression (ctx, op0,
lval, non_constant_p, overflow_p,
jump_target);
@@ -6403,9 +6401,9 @@ cxx_eval_constant_expression (const constexpr_ctx *ctx, 
tree t,
   break;
 
 case EMPTY_CLASS_EXPR:
-  /* This is good enough for a function argument that might not get
-used, and they can't do anything with it, so just return it.  */
-  return t;
+  /* Handle EMPTY_CLASS_EXPR produced by build_call_a by lowering
+it to an appropriate CONSTRUCTOR.  */
+  return build_constructor (TREE_TYPE (t), NULL);
 
 case STATEMENT_LIST:
   new_ctx = *ctx;
@@ -8186,13 +8184,11 @@ potential_constant_expression_1 (tree t, bool 
want_rval, bool strict, bool now,
 case COMPOUND_EXPR:
   {
/* check_return_expr sometimes wraps a TARGET_EXPR in a
-  COMPOUND_EXPR; don't get confused.  Also handle EMPTY_CLASS_EXPR
-  introduced by build_call_a.  */
+  COMPOUND_EXPR; don't get confused.  */
tree op0 = TREE_OPERAND (t, 0);
tree op1 = TREE_OPERAND (t, 1);
STRIP_NOPS (op1);
-   if ((TREE_CODE (op0) == TARGET_EXPR && op1 == TARGET_EXPR_SLOT (op0))
-   || TREE_CODE (op1) == EMPTY_CLASS_EXPR)
+   if (TREE_CODE (op0) == TARGET_EXPR && op1 == TARGET_EXPR_SLOT (op0))
  return RECUR (op0, want_rval);
else
  goto binary;
@@ -8321,7 +8317,7 @@ potential_constant_expression_1 (tree t, bool want_rval, 
bool strict, bool now,
   return true;
 
 case EMPTY_CLASS_EXPR:
-  return false;
+  return true;
 
 case GOTO_EXPR:
   {
diff --git a/gcc/testsuite/g++.dg/cpp1z/constexpr-96575.C 
b/gcc/testsuite/g

Re: [PATCH] c++: Handle RANGE_EXPR indexes in init_subob_ctx [PR97328]

2020-10-21 Thread Patrick Palka via Gcc-patches
On Thu, 8 Oct 2020, Patrick Palka wrote:

> In the testcase below, we're ICEing during constexpr evaluation of the
> CONSTRUCTOR {.data={{}, [1 ... 7]={}}} of type 'vector'.  The apparently
> unique thing about this CONSTRUCTOR is that it has a RANGE_EXPR index
> whose corresponding sub-aggregate initializer doesn't satisfy
> reduced_constant_expression_p (because its field 't' is uninitialized).
> 
> This is a problem because init_subob_ctx currently punts if the
> constructor index is a RANGE_EXPR, so when cxx_eval_bare_aggregate
> recurses into this sub-aggregate initializer we trip over the
> same_type_p assert in verify_ctor_sanity.
> 
> Fix this by making init_subob_ctx set up an appropriate sub-aggregate
> initialization context even when the index is a RANGE_EXPR.
> 
> Bootstrapped and regtested on x86_64-pc-linux-gnu, does this look OK for
> trunk and the 10 branch?
> 
> gcc/cp/ChangeLog:
> 
>   PR c++/97328
>   * constexpr.c (init_subob_ctx): Don't punt if the index is a
>   RANGE_EXPR, instead build a sub-aggregate initialization context
>   with no subobject.
> 
> gcc/testsuite/ChangeLog:
> 
>   PR c++/97328
>   * g++.dg/cpp2a/constexpr-init19.C: New test.
>   * g++.dg/cpp2a/constexpr-init20.C: New test.

Ping.

> ---
>  gcc/cp/constexpr.c| 13 +++--
>  gcc/testsuite/g++.dg/cpp2a/constexpr-init19.C | 15 +++
>  gcc/testsuite/g++.dg/cpp2a/constexpr-init20.C | 15 +++
>  3 files changed, 37 insertions(+), 6 deletions(-)
>  create mode 100644 gcc/testsuite/g++.dg/cpp2a/constexpr-init19.C
>  create mode 100644 gcc/testsuite/g++.dg/cpp2a/constexpr-init20.C
> 
> diff --git a/gcc/cp/constexpr.c b/gcc/cp/constexpr.c
> index a118f8a810b..e50a2a220cb 100644
> --- a/gcc/cp/constexpr.c
> +++ b/gcc/cp/constexpr.c
> @@ -3953,11 +3953,6 @@ init_subob_ctx (const constexpr_ctx *ctx, 
> constexpr_ctx &new_ctx,
>  {
>new_ctx = *ctx;
>  
> -  if (index && TREE_CODE (index) != INTEGER_CST
> -  && TREE_CODE (index) != FIELD_DECL)
> -/* This won't have an element in the new CONSTRUCTOR.  */
> -return;
> -
>tree type = initialized_type (value);
>if (!AGGREGATE_TYPE_P (type) && !VECTOR_TYPE_P (type))
>  /* A non-aggregate member doesn't get its own CONSTRUCTOR.  */
> @@ -3967,7 +3962,13 @@ init_subob_ctx (const constexpr_ctx *ctx, 
> constexpr_ctx &new_ctx,
>   update object to refer to the subobject and ctor to refer to
>   the (newly created) sub-initializer.  */
>if (ctx->object)
> -new_ctx.object = build_ctor_subob_ref (index, type, ctx->object);
> +{
> +  if (index == NULL_TREE || TREE_CODE (index) == RANGE_EXPR)
> + /* There's no well-defined subobject for this index.  */
> + new_ctx.object = NULL_TREE;
> +  else
> + new_ctx.object = build_ctor_subob_ref (index, type, ctx->object);
> +}
>tree elt = build_constructor (type, NULL);
>CONSTRUCTOR_NO_CLEARING (elt) = true;
>new_ctx.ctor = elt;
> diff --git a/gcc/testsuite/g++.dg/cpp2a/constexpr-init19.C 
> b/gcc/testsuite/g++.dg/cpp2a/constexpr-init19.C
> new file mode 100644
> index 000..d354c5ad609
> --- /dev/null
> +++ b/gcc/testsuite/g++.dg/cpp2a/constexpr-init19.C
> @@ -0,0 +1,15 @@
> +// PR c++/97328
> +// { dg-do compile { target c++20 } }
> +
> +struct vector {
> +  struct storage {
> +int t;
> +constexpr storage() {}
> +  } data[8];
> +};
> +
> +constexpr auto foo() {
> +  vector i;
> +  return i;
> +}
> +auto val = foo();
> diff --git a/gcc/testsuite/g++.dg/cpp2a/constexpr-init20.C 
> b/gcc/testsuite/g++.dg/cpp2a/constexpr-init20.C
> new file mode 100644
> index 000..1a6ed8d86dd
> --- /dev/null
> +++ b/gcc/testsuite/g++.dg/cpp2a/constexpr-init20.C
> @@ -0,0 +1,15 @@
> +// PR c++/97328
> +// { dg-do compile { target c++20 } }
> +
> +struct vector {
> +  union storage {
> +int t;
> +constexpr storage() {}
> +  } data[8];
> +};
> +
> +constexpr auto foo() {
> +  vector i;
> +  return i;
> +}
> +auto val = foo();
> -- 
> 2.29.0.rc0
> 
> 



Re: [Ada] Improve precision of Ada.Directories.Modification_Time

2020-10-21 Thread Iain Sandoe

Arnaud Charlet  wrote:


This patch breaks bootstrap on Darwin platforms.

Pierre-Marie de Rodat  wrote:


The modification file time precision now defined by OS.

Tested on x86_64-pc-linux-gnu, committed on trunk

gcc/ada/

* adaint.c (__gnat_file_time): New routine.
(__gnat_copy_attribs): Copy timestamps in nanoseconds.
* libgnat/a-direct.adb (C_Modification_Time): Bind to
__gnat_file_time.
(Modification_Time): Call to C_Modification_Time.


#if defined(st_mtime)

is a necessary test - but the fields in the stat structure on Darwin  
platforms are

named st_{a,c,m}timespec rather than the Linux st_{a,c,m}tim.


What about instead putting above extern long long __gnat_file_time the
following:

#if __APPLE__
#define st_mtim st_mtimespec
#define st_atim st_atimespec
#endif

To avoid having the two (nested) #if __APPLE__ and keep the code easier
to follow?


works for me (the test patch was drafted quickly to allow bootstrap to  
continue)

- I can amend the patch and (re-)test more widely.

Iain



Re: [PATCH][middle-end][i386][version 3]Add -fzero-call-used-regs=[skip|used-gpr-arg|used-arg|all-arg|used-gpr|all-gpr|used|all]

2020-10-21 Thread Qing Zhao via Gcc-patches
Got it.

thanks.

Qing

> On Oct 21, 2020, at 10:47 AM, Richard Sandiford  
> wrote:
> 
> Qing Zhao  writes:
 +  /* For each of the hard registers, check to see whether we should zero 
 it if:
 + 1. it is a call-used register;
 + and 2. it is not a fixed register;
 + and 3. it is not live at the return of the routine;
 + and 4. it is a general register if gpr_only is true;
 + and 5. it is used in the routine if used_only is true;
 + and 6. it is a register that passes parameter if arg_only is true;
 +   */
 +
 +  HARD_REG_SET need_zeroed_hardregs;
 +  CLEAR_HARD_REG_SET (need_zeroed_hardregs);
 +  for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
 +{
 +  if (!this_target_hard_regs->x_call_used_regs[regno])
 +  continue;
>>> 
>>> This should use crtl->abi instead.  The set of call-used registers
>>> can vary from function to function.
>> 
>> You mean to use:
>> 
>> If (!crtl->abi->clobbers_full_reg_p(regno))
>> 
>> ?
> 
> Yeah, that's right.  (But with a space before “(regno)” :-))
> 
 +static unsigned int
 +rest_of_zero_call_used_regs (void)
 +{
 +  basic_block bb;
 +  rtx_insn *insn;
 +
 +  /* This pass needs data flow information.  */
 +  df_analyze ();
 +
 +  /* Search all the "return"s in the routine, and insert instruction 
 sequence to
 + zero the call used registers.  */
 +  FOR_EACH_BB_REVERSE_FN (bb, cfun)
 +if (bb == EXIT_BLOCK_PTR_FOR_FN (cfun)
 +  || (single_succ_p (bb)
 +  && single_succ (bb) == EXIT_BLOCK_PTR_FOR_FN (cfun)))
 +  FOR_BB_INSNS_REVERSE (bb, insn)
 +  if (JUMP_P (insn) && ANY_RETURN_P (JUMP_LABEL (insn)))
 +{
 +  /* Now we can insert the instruction sequence to zero the call used
 + registers before this insn.  */
 +  gen_call_used_regs_seq (insn);
 +  break;
 +}
>>> 
>>> The exit block never has instructions, so it's only necessary to process
>>> predecessors.  A simpler way to do that is to iterate over the edges in:
>>> 
>>> EXIT_BLOCK_PTR_FOR_FN (cfun)->preds
>>> 
>>> You shouldn't need to use FOR_BB_INSNS_REVERSE: it should be enough
>>> to check only BB_END (bb), since returns always end blocks.
>> 
>> Something like the following?
>> 
>>  FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
>>{
>> insn = BB_END (e->src);
>>  If (JUMP_P (insn) && ANY_RETURN_P (JUMP_LABEL (insn)))
>>{
>>/* Now we can insert the instruction sequence to zero the call used
>> registers before this insn.  */
>>  gen_call_used_regs_seq (insn);
>>  break;   
>>}
>>  }
> 
> With this you don't want/need the break, since it would break out
> of the edge traversal (instead of the FOR_BB_INSNS_REVERSE, as above).
> Also, I think the code becomes simple enough that the comment isn't
> really needed.
> 
> Thanks,
> Richard



Re: [PATCH][middle-end][i386][version 3]Add -fzero-call-used-regs=[skip|used-gpr-arg|used-arg|all-arg|used-gpr|all-gpr|used|all]

2020-10-21 Thread Qing Zhao via Gcc-patches



> On Oct 21, 2020, at 11:09 AM, Uros Bizjak  wrote:
> 
> On Wed, Oct 21, 2020 at 4:45 PM Qing Zhao  wrote:
>> 
>> 
>> 
>> -- q --
>> The CPU shall be in x87 mode upon entry to a function. Therefore,
>> every function that uses the MMX registers is required to issue an
>> emms or femms instruction after using MMX registers, before returning
>> or calling another function.
>> -- /q --
>> 
>> (The above requirement slightly contradicts its own ABI, since we have
>> 3 MMX argument registers and MMX return register, so the CPU obviously
>> can't be in x87 mode at all function boundaries).
>> 
>> So, assuming that the first sentence is not deliberately vague w.r.t
>> function exit, emms should not be needed. However, we are dealing with
>> x87 stack registers that have their own set of peculiarities. It is
>> not possible to load a random register in the way you show.  Also,
>> stack should be either empty or one (two in case of complex value
>> return) levels deep at the function return. I think you want a series
>> of 8 or 7(6) fldz insns, followed by a series of fstp insn to clear
>> the stack and mark stack slots empty.
>> 
>> 
>> Something like this:
>> 
>> --cut here--
>> long double
>> __attribute__ ((noinline))
>> test (long double a, long double b)
>> {
>> long double r = a + b;
>> 
>> asm volatile ("fldz;\
>>   fldz;\
>>   fldz;\
>>   fldz;\
>>   fldz;\
>>   fldz;\
>>   fldz;\
>>   fstp %%st(0);\
>>   fstp %%st(0);\
>>   fstp %%st(0);\
>>   fstp %%st(0);\
>>   fstp %%st(0);\
>>   fstp %%st(0);\
>>   fstp %%st(0)" : : "X"(r));
>> return r;
>> }
>> 
>> int
>> main ()
>> {
>> long double a = 1.1, b = 1.2;
>> 
>> long double c = test (a, b);
>> 
>> printf ("%Lf\n", c);
>> 
>> return 0;
>> }
>> --cut here—
>> 
>> 
>> 
>> Okay, so,
>> 
>> 1. First compute how many st registers need to be zeroed,  num_of_zeroed_st
>> 2. Then issue (8 - num_of_zeroed_st) fldz to push 0 to the stack to clear 
>> all the dead stack slots;
>> 3. Then issue (8 - num_of_zeroed_st) fstp %st(0) to pop the stack and empty 
>> the stack.
>> 
>> Is the above understanding correctly?
> 
> Yes.
> 
>> Another thought is:
>> 
>> Looks like it’s very complicated to use the st/mm register set correctly. So,
>> I assume that this set of registers might be very hard to be used by the 
>> attacker correctly.
>> Right?
> 
> Correct, but "very hard to be used" depends on how determined the attacker is.

Okay, I see.
Then I will clear the st registers per the above algorithm you suggested.

Thanks a lot for the help.

Qing
> 
> Uros.



[PATCH] libstdc++: Add c++2a <syncstream>

2020-10-21 Thread Thomas Rodgers
From: Thomas Rodgers 

libstdc++/Changelog:
libstdc++-v3/doc/doxygen/user.cfg.in (INPUT): Add new header.
libstdc++-v3/include/Makefile.am (std_headers): Add new header.
libstdc++-v3/include/Makefile.in: Regenerate.
libstdc++-v3/include/precompiled/stdc++.h: Include new header.
libstdc++-v3/include/std/streambuf
(__detail::__streambuf_core_access): Define.
(basic_streambuf): Befriend __detail::__streambuf_core_access.
libstdc++-v3/include/std/syncstream: New header.
libstdc++-v3/include/std/version: Add __cpp_lib_syncbuf:
libstdc++-v3/testsuite/27_io/basic_syncbuf/1.cc: New test.
libstdc++-v3/testsuite/27_io/basic_syncbuf/2.cc: Likewise.
libstdc++-v3/testsuite/27_io/basic_syncbuf/basic_ops/1.cc:
Likewise.
libstdc++-v3/testsuite/27_io/basic_syncbuf/requirements/types.cc:
Likewise.
libstdc++-v3/testsuite/27_io/basic_syncbuf/sync_ops/1.cc:
Likewise.
libstdc++-v3/testsuite/27_io/basic_syncstream/1.cc: Likewise.
libstdc++-v3/testsuite/27_io/basic_syncstream/2.cc: Likewise.
libstdc++-v3/testsuite/27_io/basic_syncstream/basic_ops/1.cc:
Likewise.
libstdc++-v3/testsuite/27_io/basic_syncstream/requirements/types.cc:
Likewise.

---
 libstdc++-v3/doc/doxygen/user.cfg.in  |   1 +
 libstdc++-v3/include/Makefile.am  |   1 +
 libstdc++-v3/include/Makefile.in  |   1 +
 libstdc++-v3/include/precompiled/stdc++.h |   2 +-
 libstdc++-v3/include/std/syncstream   | 279 ++
 libstdc++-v3/include/std/version  |   4 +
 .../testsuite/27_io/basic_syncbuf/1.cc|  28 ++
 .../testsuite/27_io/basic_syncbuf/2.cc|  27 ++
 .../27_io/basic_syncbuf/basic_ops/1.cc| 138 +
 .../27_io/basic_syncbuf/requirements/types.cc |  42 +++
 .../27_io/basic_syncbuf/sync_ops/1.cc | 130 
 .../testsuite/27_io/basic_syncstream/1.cc |  28 ++
 .../testsuite/27_io/basic_syncstream/2.cc |  27 ++
 .../27_io/basic_syncstream/basic_ops/1.cc | 135 +
 .../basic_syncstream/requirements/types.cc|  43 +++
 15 files changed, 885 insertions(+), 1 deletion(-)
 create mode 100644 libstdc++-v3/include/std/syncstream
 create mode 100644 libstdc++-v3/testsuite/27_io/basic_syncbuf/1.cc
 create mode 100644 libstdc++-v3/testsuite/27_io/basic_syncbuf/2.cc
 create mode 100644 libstdc++-v3/testsuite/27_io/basic_syncbuf/basic_ops/1.cc
 create mode 100644 
libstdc++-v3/testsuite/27_io/basic_syncbuf/requirements/types.cc
 create mode 100644 libstdc++-v3/testsuite/27_io/basic_syncbuf/sync_ops/1.cc
 create mode 100644 libstdc++-v3/testsuite/27_io/basic_syncstream/1.cc
 create mode 100644 libstdc++-v3/testsuite/27_io/basic_syncstream/2.cc
 create mode 100644 libstdc++-v3/testsuite/27_io/basic_syncstream/basic_ops/1.cc
 create mode 100644 
libstdc++-v3/testsuite/27_io/basic_syncstream/requirements/types.cc

diff --git a/libstdc++-v3/doc/doxygen/user.cfg.in 
b/libstdc++-v3/doc/doxygen/user.cfg.in
index 9b49a15d31b..320f6dea688 100644
--- a/libstdc++-v3/doc/doxygen/user.cfg.in
+++ b/libstdc++-v3/doc/doxygen/user.cfg.in
@@ -897,6 +897,7 @@ INPUT  = @srcdir@/doc/doxygen/doxygroups.cc 
\
  include/streambuf \
  include/string \
  include/string_view \
+ include/syncstream \
  include/system_error \
  include/thread \
  include/tuple \
diff --git a/libstdc++-v3/include/Makefile.am b/libstdc++-v3/include/Makefile.am
index 28d273924ee..61aaff7a2f4 100644
--- a/libstdc++-v3/include/Makefile.am
+++ b/libstdc++-v3/include/Makefile.am
@@ -73,6 +73,7 @@ std_headers = \
${std_srcdir}/shared_mutex \
${std_srcdir}/span \
${std_srcdir}/sstream \
+   ${std_srcdir}/syncstream \
${std_srcdir}/stack \
${std_srcdir}/stdexcept \
${std_srcdir}/stop_token \
diff --git a/libstdc++-v3/include/precompiled/stdc++.h 
b/libstdc++-v3/include/precompiled/stdc++.h
index 7518a98c25a..8899c323a28 100644
--- a/libstdc++-v3/include/precompiled/stdc++.h
+++ b/libstdc++-v3/include/precompiled/stdc++.h
@@ -141,6 +141,6 @@
 #include 
 #include 
 #include 
-// #include <syncstream>
+#include <syncstream>
 #include 
 #endif
diff --git a/libstdc++-v3/include/std/syncstream 
b/libstdc++-v3/include/std/syncstream
new file mode 100644
index 000..3f78cef1d8d
--- /dev/null
+++ b/libstdc++-v3/include/std/syncstream
@@ -0,0 +1,279 @@
+// <syncstream> -*- C++ -*-
+
+// Copyright (C) 2020 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library.  This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This l

Re: [Ada] Improve precision of Ada.Directories.Modification_Time

2020-10-21 Thread Arnaud Charlet
> >What about instead putting above extern long long __gnat_file_time the
> >following:
> >
> >#if __APPLE__
> >#define st_mtim st_mtimespec
> >#define st_atim st_atimespec
> >#endif
> >
> >To avoid having the two (nested) #if __APPLE__ and keep the code easier

two => three :-)

> >to follow?
> 
> works for me (the test patch was drafted quickly to allow bootstrap
> to continue)
> - I can amend the patch and (re-)test more widely.

OK then with these changes, assuming successfully build/test.

Arno


Re: [PATCH] phiopt: Optimize x ? __builtin_clz (x) : 32 in GIMPLE [PR97503]

2020-10-21 Thread Rainer Orth
Hi Jakub,

> While we have at the RTL level noce_try_ifelse_collapse combined with
> simplify_cond_clz_ctz, that optimization doesn't always trigger because
> e.g. on powerpc there is an define_insn to compare a reg against zero and
> copy that register to another one and so we end up with a different pseudo
> in the simplify_cond_clz_ctz test and punt.
>
> For targets that define C?Z_DEFINED_VALUE_AT_ZERO to 2 for certain modes,
> we can optimize it already in phiopt though, just need to ensure that
> we transform the __builtin_c?z* calls into .C?Z ifns because my recent
> VRP changes codified that the builtin calls are always undefined at zero,
> while ifns honor C?Z_DEFINED_VALUE_AT_ZERO equal to 2.
> And, in phiopt we already have popcount handling that does pretty much the
> same thing, except for always using a zero value rather than the one set
> by C?Z_DEFINED_VALUE_AT_ZERO.
>
> So, this patch extends that function to handle not just popcount, but also
> clz and ctz.

this broke sparc-sun-solaris2.11 bootstrap

/vol/gcc/src/hg/master/local/gcc/tree-ssa-phiopt.c: In function 'bool 
cond_removal_in_popcount_clz_ctz_pattern(basic_block, basic_block, edge, edge, 
gimple*, tree, tree)':
/vol/gcc/src/hg/master/local/gcc/tree-ssa-phiopt.c:1858:27: error: variable 
'mode' set but not used [-Werror=unused-but-set-variable]
 1858 |   scalar_int_mode mode = SCALAR_INT_TYPE_MODE (TREE_TYPE (arg));
  |   ^~~~


and doubtlessly several other targets that use the defaults.h definition of

#define CTZ_DEFINED_VALUE_AT_ZERO(MODE, VALUE)  0

Rainer

-- 
-
Rainer Orth, Center for Biotechnology, Bielefeld University


Re: [PATCH] libstdc++: Add c++2a <syncstream>

2020-10-21 Thread Jonathan Wakely via Gcc-patches

On 21/10/20 09:53 -0700, Thomas Rodgers wrote:

From: Thomas Rodgers 

libstdc++/Changelog:
libstdc++-v3/doc/doxygen/user.cfg.in (INPUT): Add new header.
libstdc++-v3/include/Makefile.am (std_headers): Add new header.
libstdc++-v3/include/Makefile.in: Regenerate.
libstdc++-v3/include/precompiled/stdc++.h: Include new header.
libstdc++-v3/include/std/streambuf
   (__detail::__streambuf_core_access): Define.
   (basic_streambuf): Befriend __detail::__streambuf_core_access.


This file is no longer part of the commit, so the server will reject
this changelog. Please ensure the changelog is accurate (the
gcc-verify alias created by contrib/gcc-git-customization.sh can do
that) and push, thanks.



Re: [PATCH] arm: Fix multiple inheritance thunks for thumb-1 with -mpure-code

2020-10-21 Thread Richard Earnshaw via Gcc-patches
On 21/10/2020 17:11, Christophe Lyon via Gcc-patches wrote:
> On Wed, 21 Oct 2020 at 18:07, Richard Earnshaw
>  wrote:
>>
>> On 21/10/2020 16:49, Christophe Lyon via Gcc-patches wrote:
>>> On Tue, 20 Oct 2020 at 13:25, Richard Earnshaw
>>>  wrote:

 On 20/10/2020 12:22, Richard Earnshaw wrote:
> On 19/10/2020 17:32, Christophe Lyon via Gcc-patches wrote:
>> On Mon, 19 Oct 2020 at 16:39, Richard Earnshaw
>>  wrote:
>>>
>>> On 12/10/2020 08:59, Christophe Lyon via Gcc-patches wrote:
 On Thu, 8 Oct 2020 at 11:58, Richard Earnshaw
  wrote:
>
> On 08/10/2020 10:07, Christophe Lyon via Gcc-patches wrote:
>> On Tue, 6 Oct 2020 at 18:02, Richard Earnshaw
>>  wrote:
>>>
>>> On 29/09/2020 20:50, Christophe Lyon via Gcc-patches wrote:
 When mi_delta is > 255 and -mpure-code is used, we cannot load 
 delta
 from code memory (like we do without -mpure-code).

 This patch builds the value of mi_delta into r3 with a series of
 movs/adds/lsls.

 We also do some cleanup by not emitting the function address and 
 delta
 via .word directives at the end of the thunk since we don't use 
 them
 with -mpure-code.

 No need for new testcases, this bug was already identified by
 eg. pr46287-3.C

 2020-09-29  Christophe Lyon  

   gcc/
   * config/arm/arm.c (arm_thumb1_mi_thunk): Build mi_delta in 
 r3 and
   do not emit function address and delta when -mpure-code is 
 used.
>>>
>> Hi Richard,
>>
>> Thanks for your comments.
>>
>>> There are some optimizations you can make to this code.
>>>
>>> Firstly, for values between 256 and 510 (inclusive), it would be 
>>> better
>>> to just expand a mov of 255 followed by an add.
>> I now see the splitted for the "Pe" constraint which I hadn't noticed
>> before, so I can write something similar indeed.
>>
>> However, I'm not quite sure I understand the benefit of the split
>> when -mpure-code is NOT used.
>> Consider:
>> int f3_1 (void) { return 510; }
>> int f3_2 (void) { return 511; }
>> Compile with -O2 -mcpu=cortex-m0:
>> f3_1:
>> movsr0, #255
>> lslsr0, r0, #1
>> bx  lr
>> f3_2:
>> ldr r0, .L4
>> bx  lr
>>
>> The splitter makes the code bigger, does it "compensate" for this by
>> not having to load the constant?
>> Actually the constant uses 4 more bytes, which should be taken into
>> account when comparing code size,
>
> Yes, the size of the literal pool entry needs to be taken into 
> account.
>  It might happen that the entry could be shared with another use of 
> that
> literal, but in general that's rare.
>
>> so f3_1 uses 6 bytes, and f3_2 uses 8, so as you say below three
>> thumb1 instructions would be equivalent in size compared to loading
>> from the literal pool. Should the 256-510 range be extended?
>
> It's a bit borderline at three instructions when literal pools are not
> expensive to use, but in thumb1 literal pools tend to be quite small 
> due
> to the limited pc offsets we can use.  I think on balance we probably
> want to use the instruction sequence unless optimizing for size.
>
>>
>>
>>> This is also true for
>>> the literal pools alternative as well, so should be handled before 
>>> all
>>> this.
>> I am not sure what you mean: with -mpure-code, the above sample is 
>> compiled as:
>> f3_1:
>> movsr0, #255
>> lslsr0, r0, #1
>> bx  lr
>> f3_2:
>> movsr0, #1
>> lslsr0, r0, #8
>> addsr0, r0, #255
>> bx  lr
>>
>> so the "return 510" case is already handled as without -mpure-code.
>
> I was thinking specifically of the thunk sequence where you seem to be
> emitting instructions directly rather than generating RTL.  The 
> examples
> you show here are not thunks.
>
 OK thanks for the clarification.

 Here is an updated version, split into 3 patches to hopefully make
 review easier.
 They apply on top of my other mpure-code patches for PR96967 and 
 PR96770:
 https://gcc.gnu.org/pipermail/gcc-patches/2020-September/554956.html

Re: [PATCH] phiopt: Optimize x ? __builtin_clz (x) : 32 in GIMPLE [PR97503]

2020-10-21 Thread Jakub Jelinek via Gcc-patches
On Wed, Oct 21, 2020 at 07:30:46PM +0200, Rainer Orth wrote:
> this broke sparc-sun-solaris2.11 bootstrap
> 
> /vol/gcc/src/hg/master/local/gcc/tree-ssa-phiopt.c: In function 'bool 
> cond_removal_in_popcount_clz_ctz_pattern(basic_block, basic_block, edge, 
> edge, gimple*, tree, tree)':
> /vol/gcc/src/hg/master/local/gcc/tree-ssa-phiopt.c:1858:27: error: variable 
> 'mode' set but not used [-Werror=unused-but-set-variable]
>  1858 |   scalar_int_mode mode = SCALAR_INT_TYPE_MODE (TREE_TYPE 
> (arg));
>   |   ^~~~
> 
> 
> and doubtlessly several other targets that use the defaults.h definition of
> 
> #define CTZ_DEFINED_VALUE_AT_ZERO(MODE, VALUE)  0

Ugh, seems many of those macros do not evaluate the first argument.
This got broken by the change to direct_internal_fn_supported_p, previously
it used mode also in the optab test.

Anyway, I think this should fix it, I'll bootstrap/regtest it tonight:

2020-10-21  Jakub Jelinek  

* tree-ssa-phiopt.c (cond_removal_in_popcount_clz_ctz_pattern):
For CLZ and CTZ tests, use type temporary instead of mode.

--- gcc/tree-ssa-phiopt.c.jj2020-10-21 19:33:12.358042645 +0200
+++ gcc/tree-ssa-phiopt.c   2020-10-21 19:35:18.113213095 +0200
@@ -1842,10 +1842,10 @@ cond_removal_in_popcount_clz_ctz_pattern
 CASE_CFN_CLZ:
   if (INTEGRAL_TYPE_P (TREE_TYPE (arg)))
{
- scalar_int_mode mode = SCALAR_INT_TYPE_MODE (TREE_TYPE (arg));
- if (direct_internal_fn_supported_p (IFN_CLZ, TREE_TYPE (arg),
- OPTIMIZE_FOR_BOTH)
- && CLZ_DEFINED_VALUE_AT_ZERO (mode, val) == 2)
+ tree type = TREE_TYPE (arg);
+ if (direct_internal_fn_supported_p (IFN_CLZ, type, OPTIMIZE_FOR_BOTH)
+ && CLZ_DEFINED_VALUE_AT_ZERO (SCALAR_INT_TYPE_MODE (type),
+   val) == 2)
{
  ifn = IFN_CLZ;
  break;
@@ -1855,10 +1855,10 @@ cond_removal_in_popcount_clz_ctz_pattern
 CASE_CFN_CTZ:
   if (INTEGRAL_TYPE_P (TREE_TYPE (arg)))
{
- scalar_int_mode mode = SCALAR_INT_TYPE_MODE (TREE_TYPE (arg));
- if (direct_internal_fn_supported_p (IFN_CTZ, TREE_TYPE (arg),
- OPTIMIZE_FOR_BOTH)
- && CTZ_DEFINED_VALUE_AT_ZERO (mode, val) == 2)
+ tree type = TREE_TYPE (arg);
+ if (direct_internal_fn_supported_p (IFN_CTZ, type, OPTIMIZE_FOR_BOTH)
+ && CTZ_DEFINED_VALUE_AT_ZERO (SCALAR_INT_TYPE_MODE (type),
+   val) == 2)
{
  ifn = IFN_CTZ;
  break;


Jakub



[PATCH] rs6000: MMA type causes an ICE in ranger pass due to incompatible types

2020-10-21 Thread Peter Bergner via Gcc-patches
PR97360 shows a problem in how we create our PXI and POI modes that cause
an ICE in the ranger pass.  The problem seems to be that the extra call
to build_distinct_type_copy() also creates new TYPE_{MIN,MAX}_VALUEs that
are not compatible/the same as the base type itself.  The simple "fix" is
to actually remove the unneeded build_distinct_type_copy(), since according
to richi, the types returned from make_unsigned_type() are already distinct.

The following patch from Andrew and richi fixes the ICE on Martin's test
case and passes bootstrap and regtesting on powerpc64le-linux.
Ok for trunk?

Since the ranger code that triggered this doesn't seem to be in GCC 10,
I assume we do not want to backport this change?

Peter


gcc/
PR target/97360
* config/rs6000/rs6000-call.c (rs6000_init_builtins): Remove call to
build_distinct_type_copy().

gcc/testsuite/
PR target/97360
* gcc.target/powerpc/pr97360.c: New test.

diff --git a/gcc/config/rs6000/rs6000-call.c b/gcc/config/rs6000/rs6000-call.c
index 9fdf97bc803..7639aab171d 100644
--- a/gcc/config/rs6000/rs6000-call.c
+++ b/gcc/config/rs6000/rs6000-call.c
@@ -12914,15 +12914,13 @@ rs6000_init_builtins (void)
   /* Vector pair and vector quad support.  */
   if (TARGET_EXTRA_BUILTINS)
 {
-  tree oi_uns_type = make_unsigned_type (256);
-  vector_pair_type_node = build_distinct_type_copy (oi_uns_type);
+  vector_pair_type_node = make_unsigned_type (256);
   SET_TYPE_MODE (vector_pair_type_node, POImode);
   layout_type (vector_pair_type_node);
   lang_hooks.types.register_builtin_type (vector_pair_type_node,
  "__vector_pair");
 
-  tree xi_uns_type = make_unsigned_type (512);
-  vector_quad_type_node = build_distinct_type_copy (xi_uns_type);
+  vector_quad_type_node = make_unsigned_type (512);
   SET_TYPE_MODE (vector_quad_type_node, PXImode);
   layout_type (vector_quad_type_node);
   lang_hooks.types.register_builtin_type (vector_quad_type_node,
diff --git a/gcc/testsuite/gcc.target/powerpc/pr97360.c 
b/gcc/testsuite/gcc.target/powerpc/pr97360.c
new file mode 100644
index 000..2328d28a283
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr97360.c
@@ -0,0 +1,18 @@
+/* PR target/97360 */
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-O2 -mdejagnu-cpu=power10" } */
+
+/* Verify we do not ICE on the test below.  */
+
+typedef unsigned char vec_t __attribute__((vector_size(16)));
+
+void
+foo (__vector_quad *dst, __vector_pair *vpair, vec_t *vec)
+{
+  __vector_quad acc = *dst;
+  for (;;)
+{
+  __builtin_mma_xvf64gerpp(&acc, *vpair, vec[7]);
+}
+}


Re: [PATCH][middle-end][i386][version 3]Add -fzero-call-used-regs=[skip|used-gpr-arg|used-arg|all-arg|used-gpr|all-gpr|used|all]

2020-10-21 Thread Segher Boessenkool
On Wed, Oct 21, 2020 at 06:09:28PM +0200, Uros Bizjak wrote:
> On Wed, Oct 21, 2020 at 4:45 PM Qing Zhao  wrote:
> > Looks like it’s very complicated to use the st/mm register set correctly. So,
> > I assume that this set of registers might be very hard to be used by the 
> > attacker correctly.
> > Right?
> 
> Correct, but "very hard to be used" depends on how determined the attacker is.

Not only that, but the attacker only needs to get it right once, not for
every function (and not even for every program for that matter).


Segher


Re: [PATCH] rs6000: MMA type causes an ICE in ranger pass due to incompatible types

2020-10-21 Thread Segher Boessenkool
Hi!

On Wed, Oct 21, 2020 at 01:00:20PM -0500, Peter Bergner wrote:
> PR97360 shows a problem in how we create our PXI and POI modes that cause
> an ICE in the ranger pass.  The problem seems to be that the extra call
> to build_distinct_type_copy() also creates new TYPE_{MIN,MAX}_VALUEs that
> are not compatible/the same as the base type itself.  The simple "fix" is
> to actually remove the unneeded build_distinct_type_copy(), since according
> to richi, the types returned from make_unsigned_type() are already distinct.
> 
> The following patch from Andrew and richi fixes the ICE on Martin's test
> case and passes bootstrap and regtesting on powerpc64le-linux.
> Ok for trunk?

Yes, okay for trunk.  Thanks!

> Since the ranger code that triggered this doesn't seem to be in GCC 10,
> I assume we do not want to backport this change?

No, please do, in a week or so, it is a pretty serious problem that we
could just as well run into some other way, as far as I can see?


Segher


> gcc/
>   PR target/97360
>   * config/rs6000/rs6000-call.c (rs6000_init_builtins): Remove call to
>   build_distinct_type_copy().
> 
> gcc/testsuite/
>   PR target/97360
>   * gcc.target/powerpc/pr97360.c: New test.


Re: Increase inlining limits for inline functions with builtin_constant_p on parameter

2020-10-21 Thread Martin Liška

On 10/21/20 5:11 PM, Jan Hubicka wrote:

return ( __builtin_constant_p((size) - 1) ? ( __builtin_constant_p((size) - 1)
? ( ((size) - 1) < 2 ? 0 : ((size) - 1) & (1ULL << 63) ? 63 : ((size) - 1) &
(1ULL << 62) ? 62 : ((size) - 1) & (1ULL << 61) ? 61 : ((size) - 1) & (1ULL <<
60) ? 60 : ((size) - 1) & (1ULL << 59) ? 59 : ((size) - 1) & (1ULL << 58) ? 58
: ((size) - 1) & (1ULL << 57) ? 57 : ((size) - 1) & (1ULL << 56) ? 56 : ((size)
- 1) & (1ULL << 55) ? 55 : ((size) - 1) & (1ULL << 54) ? 54 : ((size) - 1) &
(1ULL << 53) ? 53 : ((size) - 1) & (1ULL << 52) ? 52 : ((size) - 1) & (1ULL <<
51) ? 51 : ((size) - 1) & (1ULL << 50) ? 50 : ((size) - 1) & (1ULL << 49) ? 49
: ((size) - 1) & (1ULL << 48) ? 48 : ((size) - 1) & (1ULL << 47) ? 47 : ((size)
- 1) & (1ULL << 46) ? 46 : ((size) - 1) & (1ULL << 45) ? 45 : ((size) - 1) &
(1ULL << 44) ? 44 : ((size) - 1) & (1ULL << 43) ? 43 : ((size) - 1) & (1ULL <<
42) ? 42 : ((size) - 1) & (1ULL << 41) ? 41 : ((size) - 1) & (1ULL << 40) ? 40
: ((size) - 1) & (1ULL << 39) ? 39 : ((size) - 1) & (1ULL << 38) ? 38 : ((size)
- 1) & (1ULL << 37) ? 37 : ((size) - 1) & (1ULL << 36) ? 36 : ((size) - 1) &
(1ULL << 35) ? 35 : ((size) - 1) & (1ULL << 34) ? 34 : ((size) - 1) & (1ULL <<
33) ? 33 : ((size) - 1) & (1ULL << 32) ? 32 : ((size) - 1) & (1ULL << 31) ? 31
: ((size) - 1) & (1ULL << 30) ? 30 : ((size) - 1) & (1ULL << 29) ? 29 : ((size)
- 1) & (1ULL << 28) ? 28 : ((size) - 1) & (1ULL << 27) ? 27 : ((size) - 1) &
(1ULL << 26) ? 26 : ((size) - 1) & (1ULL << 25) ? 25 : ((size) - 1) & (1ULL <<
24) ? 24 : ((size) - 1) & (1ULL << 23) ? 23 : ((size) - 1) & (1ULL << 22) ? 22
: ((size) - 1) & (1ULL << 21) ? 21 : ((size) - 1) & (1ULL << 20) ? 20 : ((size)
- 1) & (1ULL << 19) ? 19 : ((size) - 1) & (1ULL << 18) ? 18 : ((size) - 1) &
(1ULL << 17) ? 17 : ((size) - 1) & (1ULL << 16) ? 16 : ((size) - 1) & (1ULL <<
15) ? 15 : ((size) - 1) & (1ULL << 14) ? 14 : ((size) - 1) & (1ULL << 13) ? 13
: ((size) - 1) & (1ULL << 12) ? 12 : ((size) - 1) & (1ULL << 11) ? 11 : ((size)
- 1) & (1ULL << 10) ? 10 : ((size) - 1) & (1ULL << 9) ? 9 : ((size) - 1) &
(1ULL << 8) ? 8 : ((size) - 1) & (1ULL << 7) ? 7 : ((size) - 1) & (1ULL << 6) ?
6 : ((size) - 1) & (1ULL << 5) ? 5 : ((size) - 1) & (1ULL << 4) ? 4 : ((size) -
1) & (1ULL << 3) ? 3 : ((size) - 1) & (1ULL << 2) ? 2 : 1) : -1) :
(sizeof((size) - 1) <= 4) ? __ilog2_u32((size) - 1) : __ilog2_u64((size) - 1) )
- 12 + 1;


Hello.

Maybe a silly question, but isn't the code only an obfuscation?
What about doing:

if (__builtin_constant_p(x))
  return x < 2 ? 0 : __builtin_clz(x);

which is fully eliminated in a c.031t.ccp1 pass right after einline happens
for a call context where 'x' is a constant.

Isn't the right place for fix in Linux kernel?

Martin


Re: Increase inlining limits for inline functions with builtin_constant_p on parameter

2020-10-21 Thread Jakub Jelinek via Gcc-patches
On Wed, Oct 21, 2020 at 09:03:03PM +0200, Martin Liška wrote:
> Maybe a silly question, but isn't the code only an obfuscation?
> What about doing:
> 
> if (__builtin_constant_p(x))
>   return x < 2 ? 0 : __builtin_clz(x);
> 
> which is fully eliminated in a c.031t.ccp1 pass right after einline happens
> for a call context where 'x' is a constant.
> 
> Isn't the right place for fix in Linux kernel?

Of course it is, see
https://lore.kernel.org/linux-toolchains/21556974-eea1-ed6a-ea6f-3e97a6eea...@csgroup.eu/T/#m12d2586fe18ed27789c8d67a677783a83b79efa8

Jakub



Re: [PATCH] rs6000: MMA type causes an ICE in ranger pass due to incompatible types

2020-10-21 Thread Peter Bergner via Gcc-patches
On 10/21/20 1:34 PM, Segher Boessenkool wrote:
>> The following patch from Andrew and richi fixes the ICE on Martin's test
>> case and passes bootstrap and regtesting on powerpc64le-linux.
>> Ok for trunk?
> 
> Yes, okay for trunk.  Thanks!

Ok, pushed to trunk.  Thanks!



>> Since the ranger code that triggered this doesn't seem to be in GCC 10,
>> I assume we do not want to backport this change?
> 
> No, please do, in a week or so, it is a pretty serious problem that we
> could just as well run into some other way, as far as I can see?

Ok, I'll wait a week and then do the backport and testing.

Peter



Re: [PATCH] libstdc++: Add c++2a <syncstream>

2020-10-21 Thread Thomas Rodgers



> On Oct 21, 2020, at 10:34 AM, Jonathan Wakely  wrote:
> 
> On 21/10/20 09:53 -0700, Thomas Rodgers wrote:
>> From: Thomas Rodgers 
>> 
>> libstdc++/Changelog:
>>  libstdc++-v3/doc/doxygen/user.cfg.in (INPUT): Add new header.
>>  libstdc++-v3/include/Makefile.am (std_headers): Add new header.
>>  libstdc++-v3/include/Makefile.in: Regenerate.
>>  libstdc++-v3/include/precompiled/stdc++.h: Include new header.
>>  libstdc++-v3/include/std/streambuf
>>   (__detail::__streambuf_core_access): Define.
>>   (basic_streambuf): Befriend __detail::__streambuf_core_access.
> 
> This file is no longer part of the commit, so the server will reject
> this changelog. Please ensure the changelog is accurate (the
> gcc-verify alias created by contrib/gcc-git-customization.sh can do
> that) and push, thanks.
> 

This patch is dependent on the changes to <streambuf> so I can’t push until that
patch lands.

[committed] libstdc++: Simplify std::shared_ptr construction from std::weak_ptr

2020-10-21 Thread Jonathan Wakely via Gcc-patches
The _M_add_ref_lock() and _M_add_ref_lock_nothrow() members of
_Sp_counted_base are very similar, except that the former throws an
exception when the use count is zero and the latter returns false. The
former (and its callers) can be implemented in terms of the latter.
This results in a small reduction in code size, because throwing an
exception now only happens in one place.

libstdc++-v3/ChangeLog:

* include/bits/shared_ptr.h (shared_ptr(const weak_ptr&, nothrow_t)):
Add noexcept.
* include/bits/shared_ptr_base.h (_Sp_counted_base::_M_add_ref_lock):
Remove specializations and just call _M_add_ref_lock_nothrow.
(__shared_count, __shared_ptr): Use nullptr for null pointer
constants.
(__shared_count(const __weak_count&)): Use _M_add_ref_lock_nothrow
instead of _M_add_ref_lock.
(__shared_count(const __weak_count&, nothrow_t)): Add noexcept.
(__shared_ptr::operator bool()): Add noexcept.
(__shared_ptr(const __weak_ptr&, nothrow_t)): Add noexcept.

Tested powerpc64le-linux. Committed to trunk.

commit 945151b7f14c5d105abd8117f208ae9e3db91fb4
Author: Jonathan Wakely 
Date:   Wed Oct 21 21:13:41 2020

libstdc++: Simplify std::shared_ptr construction from std::weak_ptr

The _M_add_ref_lock() and _M_add_ref_lock_nothrow() members of
_Sp_counted_base are very similar, except that the former throws an
exception when the use count is zero and the latter returns false. The
former (and its callers) can be implemented in terms of the latter.
This results in a small reduction in code size, because throwing an
exception now only happens in one place.

libstdc++-v3/ChangeLog:

* include/bits/shared_ptr.h (shared_ptr(const weak_ptr&, 
nothrow_t)):
Add noexcept.
* include/bits/shared_ptr_base.h 
(_Sp_counted_base::_M_add_ref_lock):
Remove specializations and just call _M_add_ref_lock_nothrow.
(__shared_count, __shared_ptr): Use nullptr for null pointer
constants.
(__shared_count(const __weak_count&)): Use _M_add_ref_lock_nothrow
instead of _M_add_ref_lock.
(__shared_count(const __weak_count&, nothrow_t)): Add noexcept.
(__shared_ptr::operator bool()): Add noexcept.
(__shared_ptr(const __weak_ptr&, nothrow_t)): Add noexcept.

diff --git a/libstdc++-v3/include/bits/shared_ptr.h 
b/libstdc++-v3/include/bits/shared_ptr.h
index 0c393e23132..0bfb525aae7 100644
--- a/libstdc++-v3/include/bits/shared_ptr.h
+++ b/libstdc++-v3/include/bits/shared_ptr.h
@@ -413,7 +413,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
allocate_shared(const _Alloc& __a, _Args&&... __args);
 
   // This constructor is non-standard, it is used by weak_ptr::lock().
-  shared_ptr(const weak_ptr<_Tp>& __r, std::nothrow_t)
+  shared_ptr(const weak_ptr<_Tp>& __r, std::nothrow_t) noexcept
   : __shared_ptr<_Tp>(__r, std::nothrow) { }
 
   friend class weak_ptr<_Tp>;
diff --git a/libstdc++-v3/include/bits/shared_ptr_base.h 
b/libstdc++-v3/include/bits/shared_ptr_base.h
index ff578e66117..ca37f2bebd6 100644
--- a/libstdc++-v3/include/bits/shared_ptr_base.h
+++ b/libstdc++-v3/include/bits/shared_ptr_base.h
@@ -142,10 +142,14 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   { __gnu_cxx::__atomic_add_dispatch(&_M_use_count, 1); }
 
   void
-  _M_add_ref_lock();
+  _M_add_ref_lock()
+  {
+   if (!_M_add_ref_lock_nothrow())
+ __throw_bad_weak_ptr();
+  }
 
   bool
-  _M_add_ref_lock_nothrow();
+  _M_add_ref_lock_nothrow() noexcept;
 
   void
   _M_release() noexcept
@@ -214,48 +218,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   _Atomic_word  _M_weak_count;// #weak + (#shared != 0)
 };
 
-  template<>
-inline void
-_Sp_counted_base<_S_single>::
-_M_add_ref_lock()
-{
-  if (_M_use_count == 0)
-   __throw_bad_weak_ptr();
-  ++_M_use_count;
-}
-
-  template<>
-inline void
-_Sp_counted_base<_S_mutex>::
-_M_add_ref_lock()
-{
-  __gnu_cxx::__scoped_lock sentry(*this);
-  if (__gnu_cxx::__exchange_and_add_dispatch(&_M_use_count, 1) == 0)
-   {
- _M_use_count = 0;
- __throw_bad_weak_ptr();
-   }
-}
-
-  template<>
-inline void
-_Sp_counted_base<_S_atomic>::
-_M_add_ref_lock()
-{
-  // Perform lock-free add-if-not-zero operation.
-  _Atomic_word __count = _M_get_use_count();
-  do
-   {
- if (__count == 0)
-   __throw_bad_weak_ptr();
- // Replace the current counter value with the old value + 1, as
- // long as it's not changed meanwhile.
-   }
-  while (!__atomic_compare_exchange_n(&_M_use_count, &__count, __count + 1,
- true, __ATOMIC_ACQ_REL,
- __ATOMIC_RELAXED));
-}
-
   template<>
 inline bool
 _Sp_count

Re: [PATCH 2/8] [RS6000] rs6000_rtx_costs for AND

2020-10-21 Thread Segher Boessenkool
On Wed, Oct 21, 2020 at 01:27:42PM +1030, Alan Modra wrote:
> On Tue, Oct 20, 2020 at 01:55:56PM -0500, Segher Boessenkool wrote:
> > On Thu, Oct 08, 2020 at 09:27:54AM +1030, Alan Modra wrote:
> > > The existing "case AND" in this function is not sufficient for
> > > optabs.c:avoid_expensive_constant usage, where the AND is passed in
> > > outer_code.  We'd like to cost AND of rs6000_is_valid_and_mask
> > > or rs6000_is_valid_2insn_and variety there, so that those masks aren't
> > > seen as expensive (ie. better to load to a reg then AND).
> > > 
> > >   * config/rs6000/rs6000.c (rs6000_rtx_costs): Combine CONST_INT
> > >   AND handling with IOR/XOR.  Move costing for AND with
> > >   rs6000_is_valid_and_mask or rs6000_is_valid_2insn_and to
> > >   CONST_INT.
> > 
> > Sorry this took so long to review :-(
> > 
> > On 64-bit BE this leads to *bigger* code, and closer observation shows
> > that some common sequences degrade on all configs.  This seems to mostly
> > be about "andc" (and its dot form).  It wasn't costed properly before,
> > but after your patch, a single instruction is replaced by three.
> > 
> > Could you look into this?
> 
> ~/build/gcc-alan/gcc$ for z in *.o; do if test `objdump -dr $z | grep andc | 
> wc -l` != `objdump -dr ../../gcc/gcc/$z | grep andc | wc -l`; then echo $z; 
> fi; done
> gimplify.o
> insn-emit.o
> insn-opinit.o
> insn-recog.o
> rs6000-string.o
> 
> All of these are exactly the case I talked about in
> https://gcc.gnu.org/pipermail/gcc-patches/2020-September/553919.html

For a kernel build (my testcase) it happens more often.

> "Sometimes correct insn cost leads to unexpected results.  For
> example:
> 
> extern unsigned bar (void);
> unsigned
> f1 (unsigned a)
> {
>   if ((a & 0x01000200) == 0x01000200)
> return bar ();
>   return 0;
> }
> 
> emits for a & 0x01000200
>  (set (reg) (and (reg) (const_int 0x01000200)))
> at expand time (two rlwinm insns) rather than the older
>  (set (reg) (const_int 0x01000200))
>  (set (reg) (and (reg) (reg)))

And that is bad.  Why on earth does expand "optimise" this?  It should
not, it hinders various *real* optimisations!

> which is three insns.  However, since 0x01000200 is needed later the
> older code after optimisation is smaller."
> 
> Things have changed slightly since I wrote the above, with the two
> rlwinm insns being emitted at expand time, so you see
>  (set (reg) (and (reg) (const_int 0xff0003ff)))
>  (set (reg) (and (reg) (const_int 0x01fffe00)))

It has done that for many years?

> but of course that doesn't change anything regarding the cost of
> "a & 0x01000200".

Yeah.  But the problem is that cost that are "better", "closer to
reality", sometimes result in worse results :-(

Anyway:

+  || (outer_code == AND
+  && rs6000_is_valid_2insn_and (x, mode)))
{
  *total = COSTS_N_INSNS (1);
  return true;

It should return COSTS_N_INSNS (2) for that?

Testing with that now.


Segher


[PATCH] c++: Check DECL_TEMPLATE_PARM_P in duplicate_decls [PR97511]

2020-10-21 Thread Patrick Palka via Gcc-patches
This makes duplicate_decls differentiate a TYPE_DECL for an alias
template from a TYPE_DECL for one of its template parameters.  The
recently added assert in template_parm_to_arg revealed this latent issue
because merging of the two TYPE_DECLs cleared the DECL_TEMPLATE_PARM_P
flag.

With this patch, we now also correctly diagnose the name shadowing in
the below testcase (as required by [temp.local]/6).

Bootstrapped and regtested on x86_64-pc-linux-gnu, does this look OK to
commit?

gcc/cp/ChangeLog:

PR c++/97511
* decl.c (duplicate_decls): Return NULL_TREE if
DECL_TEMPLATE_PARM_P differ.

gcc/testsuite/ChangeLog:

PR c++/97511
* g++.dg/template/shadow3.C: New test.
---
 gcc/cp/decl.c   | 3 +++
 gcc/testsuite/g++.dg/template/shadow3.C | 4 ++++
 2 files changed, 7 insertions(+)
 create mode 100644 gcc/testsuite/g++.dg/template/shadow3.C

diff --git a/gcc/cp/decl.c b/gcc/cp/decl.c
index 5f370e60b4e..2de4e1657fb 100644
--- a/gcc/cp/decl.c
+++ b/gcc/cp/decl.c
@@ -2002,6 +2002,9 @@ duplicate_decls (tree newdecl, tree olddecl, bool hiding, 
bool was_hidden)
  || DECL_IMPLICIT_TYPEDEF_P (newdecl)))
 return NULL_TREE;
 
+  if (DECL_TEMPLATE_PARM_P (olddecl) != DECL_TEMPLATE_PARM_P (newdecl))
+return NULL_TREE;
+
   if (!validate_constexpr_redeclaration (olddecl, newdecl))
 return error_mark_node;
 
diff --git a/gcc/testsuite/g++.dg/template/shadow3.C 
b/gcc/testsuite/g++.dg/template/shadow3.C
new file mode 100644
index 000..a5f256384ac
--- /dev/null
+++ b/gcc/testsuite/g++.dg/template/shadow3.C
@@ -0,0 +1,4 @@
+// PR c++/97511
+// { dg-do compile { target c++11 } }
+
+template<class T> using Z = Z<T>; // { dg-error "shadow|declaration" }
-- 
2.29.0.rc0



Re: [PATCH] c++: Check DECL_TEMPLATE_PARM_P in duplicate_decls [PR97511]

2020-10-21 Thread Jason Merrill via Gcc-patches

On 10/21/20 4:35 PM, Patrick Palka wrote:

This makes duplicate_decls differentiate a TYPE_DECL for an alias
template from a TYPE_DECL for one of its template parameters.  The
recently added assert in template_parm_to_arg revealed this latent issue
because merging of the two TYPE_DECLs cleared the DECL_TEMPLATE_PARM_P
flag.

With this patch, we now also correctly diagnose the name shadowing in
the below testcase (as required by [temp.local]/6).

Bootstrapped and regtested on x86_64-pc-linux-gnu, does this look OK to
commit?


OK.


gcc/cp/ChangeLog:

PR c++/97511
* decl.c (duplicate_decls): Return NULL_TREE if
DECL_TEMPLATE_PARM_P differ.

gcc/testsuite/ChangeLog:

PR c++/97511
* g++.dg/template/shadow3.C: New test.
---
  gcc/cp/decl.c   | 3 +++
   gcc/testsuite/g++.dg/template/shadow3.C | 4 ++++
  2 files changed, 7 insertions(+)
  create mode 100644 gcc/testsuite/g++.dg/template/shadow3.C

diff --git a/gcc/cp/decl.c b/gcc/cp/decl.c
index 5f370e60b4e..2de4e1657fb 100644
--- a/gcc/cp/decl.c
+++ b/gcc/cp/decl.c
@@ -2002,6 +2002,9 @@ duplicate_decls (tree newdecl, tree olddecl, bool hiding, 
bool was_hidden)
  || DECL_IMPLICIT_TYPEDEF_P (newdecl)))
  return NULL_TREE;
  
+  if (DECL_TEMPLATE_PARM_P (olddecl) != DECL_TEMPLATE_PARM_P (newdecl))

+return NULL_TREE;
+
if (!validate_constexpr_redeclaration (olddecl, newdecl))
  return error_mark_node;
  
diff --git a/gcc/testsuite/g++.dg/template/shadow3.C b/gcc/testsuite/g++.dg/template/shadow3.C

new file mode 100644
index 000..a5f256384ac
--- /dev/null
+++ b/gcc/testsuite/g++.dg/template/shadow3.C
@@ -0,0 +1,4 @@
+// PR c++/97511
+// { dg-do compile { target c++11 } }
+
+template<class T> using Z = Z<T>; // { dg-error "shadow|declaration" }





Re: [PATCH] c++: Handle RANGE_EXPR indexes in init_subob_ctx [PR97328]

2020-10-21 Thread Jason Merrill via Gcc-patches

On 10/8/20 4:47 PM, Patrick Palka wrote:

In the testcase below, we're ICEing during constexpr evaluation of the
CONSTRUCTOR {.data={{}, [1 ... 7]={}}} of type 'vector'.  The apparently
unique thing about this CONSTRUCTOR is that it has a RANGE_EXPR index
whose corresponding sub-aggregate initializer doesn't satisfy
reduced_constant_expression_p (because its field 't' is uninitialized).

This is a problem because init_subob_ctx currently punts if the
constructor index is a RANGE_EXPR, so when cxx_eval_bare_aggregate
recurses into this sub-aggregate initializer we trip over the
same_type_p assert in verify_ctor_sanity.

Fix this by making init_subob_ctx set up an appropriate sub-aggregate
initialization context even when the index is a RANGE_EXPR.

Bootstrapped and regtested on x86_64-pc-linux-gnu, does this look OK for
trunk and the 10 branch?

gcc/cp/ChangeLog:

PR c++/97328
* constexpr.c (init_subob_ctx): Don't punt if the index is a
RANGE_EXPR, instead build a sub-aggregate initialization context
with no subobject.

gcc/testsuite/ChangeLog:

PR c++/97328
* g++.dg/cpp2a/constexpr-init19.C: New test.
* g++.dg/cpp2a/constexpr-init20.C: New test.
---
  gcc/cp/constexpr.c| 13 +++--
  gcc/testsuite/g++.dg/cpp2a/constexpr-init19.C | 15 +++
  gcc/testsuite/g++.dg/cpp2a/constexpr-init20.C | 15 +++
  3 files changed, 37 insertions(+), 6 deletions(-)
  create mode 100644 gcc/testsuite/g++.dg/cpp2a/constexpr-init19.C
  create mode 100644 gcc/testsuite/g++.dg/cpp2a/constexpr-init20.C

diff --git a/gcc/cp/constexpr.c b/gcc/cp/constexpr.c
index a118f8a810b..e50a2a220cb 100644
--- a/gcc/cp/constexpr.c
+++ b/gcc/cp/constexpr.c
@@ -3953,11 +3953,6 @@ init_subob_ctx (const constexpr_ctx *ctx, constexpr_ctx 
&new_ctx,
  {
new_ctx = *ctx;
  
-  if (index && TREE_CODE (index) != INTEGER_CST

-  && TREE_CODE (index) != FIELD_DECL)
-/* This won't have an element in the new CONSTRUCTOR.  */
-return;


Hmm, I wonder what this was trying to exclude?  I'd be more comfortable 
adding RANGE_EXPR to the allowed index codes.



tree type = initialized_type (value);
if (!AGGREGATE_TYPE_P (type) && !VECTOR_TYPE_P (type))
  /* A non-aggregate member doesn't get its own CONSTRUCTOR.  */
@@ -3967,7 +3962,13 @@ init_subob_ctx (const constexpr_ctx *ctx, constexpr_ctx 
&new_ctx,
   update object to refer to the subobject and ctor to refer to
   the (newly created) sub-initializer.  */
if (ctx->object)
-new_ctx.object = build_ctor_subob_ref (index, type, ctx->object);
+{
+  if (index == NULL_TREE || TREE_CODE (index) == RANGE_EXPR)
+   /* There's no well-defined subobject for this index.  */
+   new_ctx.object = NULL_TREE;
+  else
+   new_ctx.object = build_ctor_subob_ref (index, type, ctx->object);
+}
tree elt = build_constructor (type, NULL);
CONSTRUCTOR_NO_CLEARING (elt) = true;
new_ctx.ctor = elt;
diff --git a/gcc/testsuite/g++.dg/cpp2a/constexpr-init19.C 
b/gcc/testsuite/g++.dg/cpp2a/constexpr-init19.C
new file mode 100644
index 000..d354c5ad609
--- /dev/null
+++ b/gcc/testsuite/g++.dg/cpp2a/constexpr-init19.C
@@ -0,0 +1,15 @@
+// PR c++/97328
+// { dg-do compile { target c++20 } }
+
+struct vector {
+  struct storage {
+int t;
+constexpr storage() {}
+  } data[8];
+};
+
+constexpr auto foo() {
+  vector i;
+  return i;
+}
+auto val = foo();
diff --git a/gcc/testsuite/g++.dg/cpp2a/constexpr-init20.C 
b/gcc/testsuite/g++.dg/cpp2a/constexpr-init20.C
new file mode 100644
index 000..1a6ed8d86dd
--- /dev/null
+++ b/gcc/testsuite/g++.dg/cpp2a/constexpr-init20.C
@@ -0,0 +1,15 @@
+// PR c++/97328
+// { dg-do compile { target c++20 } }
+
+struct vector {
+  union storage {
+int t;
+constexpr storage() {}
+  } data[8];
+};
+
+constexpr auto foo() {
+  vector i;
+  return i;
+}
+auto val = foo();





  1   2   >