Re: [PATCH 0/2] LoongArch: respect --with-* and drop loongarch-driver

2022-12-08 Thread Xi Ruoyao via Gcc-patches
On Fri, 2022-12-09 at 10:43 +0800, Icenowy Zheng wrote:
> This patchset tries to fix the object duplication between the driver and
> the real compiler, which makes libgccjit building fail because of
> linking this two parts together.

Hmm, I think the issue is already fixed by https://gcc.gnu.org/r13-1010
but maybe I'm wrong here...

This looks like a good code clean-up anyway.  But if libgccjit builds
fine with GCC trunk I'd postpone the clean-up to GCC 14 stage 1.

> First, the build-time --with-* values are now respected by being the
> default -m* values in the driver, and then loongarch-driver, which is
> mostly doing no-op now, is dropped.
> 
> This patchset is bootstrapped on a native LoongArch device (without
> any
> --with-* flags). In addition, on a x86 device, cross compilers are
> built
> with --with-{arch,abi} and without any --with-* flags; all these
> configurations can correctly build all supported shipped libraries
> with
> GCC.
> 
> Icenowy Zheng (2):
>   LoongArch: respect the with values in config.gcc
>   LoongArch: drop loongarch-driver
> 
>  gcc/config.gcc   |   1 -
>  gcc/config/loongarch/loongarch-driver.cc | 187 --
> -
>  gcc/config/loongarch/loongarch-driver.h  |  68 -
>  gcc/config/loongarch/loongarch.h |  21 ++-
>  4 files changed, 19 insertions(+), 258 deletions(-)
>  delete mode 100644 gcc/config/loongarch/loongarch-driver.cc
>  delete mode 100644 gcc/config/loongarch/loongarch-driver.h
> 

-- 
Xi Ruoyao 
School of Aerospace Science and Technology, Xidian University


Re: [PATCH 3/3] btf: correct generation for extern funcs [PR106773]

2022-12-08 Thread Indu Bhagat via Gcc-patches

On 12/7/22 12:57, David Faust wrote:

The eBPF loader expects to find entries for functions declared as extern
in the corresponding BTF_KIND_DATASEC record, but we were not generating
these entries.

This patch adds support for the 'extern' linkage of function types in
BTF, and creates entries for for them BTF_KIND_DATASEC records as needed.

PR target/106773

gcc/

* btfout.cc (get_section_name): New function.
(btf_collect_datasec): Use it here. Process functions, marking them
'extern' and generating DATASEC entries for them as appropriate. Move
creation of BTF_KIND_FUNC records to here...
(btf_dtd_emit_preprocess_cb): ... from here.

gcc/testsuite/

* gcc.dg/debug/btf/btf-datasec-2.c: New test.
* gcc.dg/debug/btf/btf-function-6.c: New test.

include/

* btf.h (struct btf_var_secinfo): Update comments with notes about
extern functions.
---
  gcc/btfout.cc | 129 --
  .../gcc.dg/debug/btf/btf-datasec-2.c  |  28 
  .../gcc.dg/debug/btf/btf-function-6.c |  19 +++
  include/btf.h |   9 +-
  4 files changed, 139 insertions(+), 46 deletions(-)
  create mode 100644 gcc/testsuite/gcc.dg/debug/btf/btf-datasec-2.c
  create mode 100644 gcc/testsuite/gcc.dg/debug/btf/btf-function-6.c

diff --git a/gcc/btfout.cc b/gcc/btfout.cc
index 05f3a3f9b6e..d7ead377ec5 100644
--- a/gcc/btfout.cc
+++ b/gcc/btfout.cc
@@ -294,7 +294,35 @@ btf_datasec_push_entry (ctf_container_ref ctfc, const char 
*secname,
ds.entries.safe_push (info);
  
datasecs.safe_push (ds);

-  num_types_created++;
+}
+
+
+/* Return the section name, as of interest to btf_collect_datasec, for the
+   given symtab node. Note that this deliberately returns NULL for objects
+   which do not go in a section btf_collect_datasec cares about.  */


"Dot, space, space, new sentence."


+static const char *
+get_section_name (symtab_node *node)
+{
+  const char *section_name = node->get_section ();
+
+  if (section_name == NULL)
+{
+  switch (categorize_decl_for_section (node->decl, 0))
+   {
+   case SECCAT_BSS:
+ section_name = ".bss";
+ break;
+   case SECCAT_DATA:
+ section_name = ".data";
+ break;
+   case SECCAT_RODATA:
+ section_name = ".rodata";
+ break;
+   default:;
+   }
+}
+
+  return section_name;
  }
  
  /* Construct all BTF_KIND_DATASEC records for CTFC. One such record is created

@@ -305,7 +333,60 @@ btf_datasec_push_entry (ctf_container_ref ctfc, const char 
*secname,
  static void
  btf_collect_datasec (ctf_container_ref ctfc)
  {
-  /* See cgraph.h struct symtab_node, which varpool_node extends.  */
+  cgraph_node *func;
+  FOR_EACH_FUNCTION (func)
+{
+  dw_die_ref die = lookup_decl_die (func->decl);
+  if (die == NULL)
+   continue;
+
+  ctf_dtdef_ref dtd = ctf_dtd_lookup (ctfc, die);
+  if (dtd == NULL)
+   continue;
+
+  /* Functions actually get two types: a BTF_KIND_FUNC_PROTO, and
+also a BTF_KIND_FUNC. But the CTF container only allocates one
+type per function, which matches closely with BTF_KIND_FUNC_PROTO.
+For each such function, also allocate a BTF_KIND_FUNC entry.
+These will be output later.  */


"Dot, space, space, new sentence."


+  ctf_dtdef_ref func_dtd = ggc_cleared_alloc ();
+  func_dtd->dtd_data = dtd->dtd_data;
+  func_dtd->dtd_data.ctti_type = dtd->dtd_type;
+  func_dtd->linkage = dtd->linkage;
+  func_dtd->dtd_type = num_types_added + num_types_created;
+
+  /* Only the BTF_KIND_FUNC type actually references the name. The
+BTF_KIND_FUNC_PROTO is always anonymous.  */
+  dtd->dtd_data.ctti_name = 0;
+
+  vec_safe_push (funcs, func_dtd);
+  num_types_created++;
+
+  /* Mark any 'extern' funcs and add DATASEC entries for them.  */
+  if (DECL_EXTERNAL (func->decl))
+   {
+ func_dtd->linkage = BTF_LINKAGE_EXTERN;
+


What is the expected BTF when both decl and definition are present:

extern int extfunc(int x);
int extfunc (int x) {
  int y = foo ();
  return y;
}


+ const char *section_name = get_section_name (func);
+ /* Note: get_section_name () returns NULL for functions in text
+section. This is intentional, since we do not want to generate
+DATASEC entries for them.  */


"Dot, space, space, new sentence."


+ if (section_name == NULL)
+   continue;
+
+ struct btf_var_secinfo info;
+
+ /* +1 for the sentinel type not in the types map.  */
+ info.type = func_dtd->dtd_type + 1;
+
+ /* Both zero at compile time.  */
+ info.size = 0;
+ info.offset = 0;
+
+ btf_datasec_push_entry (ctfc, section_name, info);
+   }
+}
+
varpool_node *node;
FOR_EACH_VARIABLE (node)
  {
@@ -317,28 +398,13 @@ 

Re: [PATCH 2/3] btf: fix 'extern const void' variables [PR106773]

2022-12-08 Thread Indu Bhagat via Gcc-patches

Looks OK to me overall. Minor comments below.

Thanks

On 12/7/22 12:57, David Faust wrote:

The eBPF loader expects to find BTF_KIND_VAR records for references to
extern const void symbols. We were mistakenly identifing these as
unsupported types, and as a result skipping emitting VAR records for
them.

In addition, the internal DWARF representation from which BTF is
produced does not generate 'const' modifier DIEs for the void type,
which meant in BTF the 'const' qualifier was dropped for 'extern const
void' variables. This patch also adds support for generating a const
void type in BTF to correct emission for these variables.

PR target/106773

gcc/

* btfout.cc (btf_collect_datasec): Correct size of void entries.
(btf_dvd_emit_preprocess_cb): Do not skip emitting variables which
refer to void types.
(btf_init_postprocess): Create 'const void' type record if needed and
adjust variables to refer to it as appropriate.

gcc/testsuite/

* gcc.dg/debug/btf/btf-pr106773.c: New test.
---
  gcc/btfout.cc | 44 +--
  gcc/testsuite/gcc.dg/debug/btf/btf-pr106773.c | 25 +++
  2 files changed, 65 insertions(+), 4 deletions(-)
  create mode 100644 gcc/testsuite/gcc.dg/debug/btf/btf-pr106773.c

diff --git a/gcc/btfout.cc b/gcc/btfout.cc
index a1c6266a7db..05f3a3f9b6e 100644
--- a/gcc/btfout.cc
+++ b/gcc/btfout.cc
@@ -354,6 +354,8 @@ btf_collect_datasec (ctf_container_ref ctfc)
tree size = DECL_SIZE_UNIT (node->decl);
if (tree_fits_uhwi_p (size))
info.size = tree_to_uhwi (size);
+  else if (VOID_TYPE_P (TREE_TYPE (node->decl)))
+   info.size = 1;
  
/* Offset is left as 0 at compile time, to be filled in by loaders such

 as libbpf.  */
@@ -439,7 +441,7 @@ btf_dvd_emit_preprocess_cb (ctf_dvdef_ref *slot, 
ctf_container_ref arg_ctfc)
ctf_dvdef_ref var = (ctf_dvdef_ref) * slot;
  
/* Do not add variables which refer to unsupported types.  */

-  if (btf_removed_type_p (var->dvd_type))
+  if (!voids.contains (var->dvd_type) && btf_removed_type_p (var->dvd_type))
  return 1;
  
arg_ctfc->ctfc_vars_list[num_vars_added] = var;

@@ -1073,15 +1075,49 @@ btf_init_postprocess (void)
  {
ctf_container_ref tu_ctfc = ctf_get_tu_ctfc ();
  
-  size_t i;

-  size_t num_ctf_types = tu_ctfc->ctfc_types->elements ();
-
holes.create (0);
voids.create (0);
  
num_types_added = 0;

num_types_created = 0;
  
+  /* Workaround for 'const void' variables. These variables are sometimes used

+ in eBPF programs to address kernel symbols. DWARF does not generate const
+ qualifier on void type, so we would incorrectly emit these variables
+ without the const qualifier.
+ Unfortunately we need the TREE node to know it was const, and we need
+ to create the const modifier type (if needed) now, before making the types
+ list. So we can't avoid iterating with FOR_EACH_VARIABLE here, and then
+ again when creating the DATASEC entries.  */


"Dot, space, space, new sentence." in 3 places.



+  ctf_id_t constvoid_id = CTF_NULL_TYPEID;
+  varpool_node *var;
+  FOR_EACH_VARIABLE (var)
+{
+  if (!var->decl)
+   continue;
+
+  tree type = TREE_TYPE (var->decl);
+  if (type && VOID_TYPE_P (type) && TYPE_READONLY (type))
+   {
+ dw_die_ref die = lookup_decl_die (var->decl);
+ if (die == NULL)
+   continue;
+
+ ctf_dvdef_ref dvd = ctf_dvd_lookup (tu_ctfc, die);
+ if (dvd == NULL)
+   continue;
+
+ /* Create the 'const' modifier type for void.  */
+ if (constvoid_id == CTF_NULL_TYPEID)
+   constvoid_id = ctf_add_reftype (tu_ctfc, CTF_ADD_ROOT,
+   dvd->dvd_type, CTF_K_CONST, NULL);


No de-duplication of the const void type.  I assume libbpf will take 
care of this eventually.



+ dvd->dvd_type = constvoid_id;
+   }
+}
+
+  size_t i;
+  size_t num_ctf_types = tu_ctfc->ctfc_types->elements ();
+
if (num_ctf_types)
  {
init_btf_id_map (num_ctf_types + 1);
diff --git a/gcc/testsuite/gcc.dg/debug/btf/btf-pr106773.c 
b/gcc/testsuite/gcc.dg/debug/btf/btf-pr106773.c
new file mode 100644
index 000..f90fa773a4b
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/debug/btf/btf-pr106773.c
@@ -0,0 +1,25 @@
+/* Test BTF generation for extern const void symbols.
+   BTF_KIND_VAR records should be emitted for such symbols if they are used,
+   as well as a corresponding entry in the appropriate DATASEC record.  */
+
+/* { dg-do compile } */
+/* { dg-options "-O0 -gbtf -dA" } */
+
+/* Expect 1 variable record only for foo, with 'extern' (2) linkage.  */
+/* { dg-final { scan-assembler-times "\[\t \]0xe00\[\t 
\]+\[^\n\]*btv_info" 1 } } */
+/* { dg-final { scan-assembler-times "\[\t \]0x2\[\t \]+\[^\n\]*btv_linkage" 1 
} } */
+
+/* { dg-final { scan-assembler-times "ascii \"foo.0\"\[\t 

Re: [PATCH]AArch64 div-by-255, ensure that arguments are registers. [PR107988]

2022-12-08 Thread Richard Sandiford via Gcc-patches
Richard Earnshaw  writes:
> On 08/12/2022 16:39, Tamar Christina via Gcc-patches wrote:
>> Hi All,
>> 
>> At -O0 (as opposed to e.g. volatile) we can get into the situation where the
>> in0 and result RTL arguments passed to the division function are memory
>> locations instead of registers.  I think we could reject these early on by
>> checking that the gimple values are GIMPLE registers, but I think it's 
>> better to
>> handle it.
>> 
>> As such I force them to registers and emit a move to the memory locations and
>> leave it up to reload to handle.  This fixes the ICE and still allows the
>> optimization in these cases,  which improves the code quality a lot.
>> 
>> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>> 
>> Ok for master?
>> 
>> Thanks,
>> Tamar
>> 
>> 
>> 
>> gcc/ChangeLog:
>> 
>>  PR target/107988
>>  * config/aarch64/aarch64.cc
>>  (aarch64_vectorize_can_special_div_by_constant): Ensure input and output
>>  RTL are registers.
>> 
>> gcc/testsuite/ChangeLog:
>> 
>>  PR target/107988
>>  * gcc.target/aarch64/pr107988-1.c: New test.
>> 
>> --- inline copy of patch --
>> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
>> index 
>> b8dc3f070c8afc47c85fa18768c4da92c774338f..9f96424993c4fe90e1b241fcb3aa97025225
>>  100644
>> --- a/gcc/config/aarch64/aarch64.cc
>> +++ b/gcc/config/aarch64/aarch64.cc
>> @@ -24337,12 +24337,27 @@ aarch64_vectorize_can_special_div_by_constant 
>> (enum tree_code code,
>> if (!VECTOR_TYPE_P (vectype))
>>  return false;
>>   
>> +  if (!REG_P (in0))
>> +in0 = force_reg (GET_MODE (in0), in0);
>> +
>> gcc_assert (output);
>>   
>> -  if (!*output)
>> -*output = gen_reg_rtx (TYPE_MODE (vectype));
>> +  rtx res =  NULL_RTX;
>> +
>> +  /* Once e get to this point we cannot reject the RTL,  if it's not a reg 
>> then
>> + Create a new reg and write the result to the output afterwards.  */
>> +  if (!*output || !REG_P (*output))
>> +res = gen_reg_rtx (TYPE_MODE (vectype));
>> +  else
>> +res = *output;
>
> Why not write
>rtx res = *output
>if (!res || !REG_P (res))
>  res = gen_reg_rtx...
>
> then you don't need either the else clause or the dead NULL_RTX assignment.

I'd prefer that we use the expand_insn interface, which already has
logic for coercing inputs and outputs to predicates.  Something like:

  machine_mode mode = TYPE_MODE (vectype);
  unsigned int flags = aarch64_classify_vector_mode (mode);
  if ((flags & VEC_ANY_SVE) && !TARGET_SVE2)
return false;

  ...

  expand_operand ops[3];
  create_output_operand ([0], *output, mode);
  create_input_operand ([1], in0, mode);
  create_fixed_operand ([2], in1);
  expand_insn (insn_code, 3, ops);
  *output = ops[0].value;
  return true;

On this function: why do we have the VECTOR_TYPE_P condition in:

  /* We can use the optimized pattern.  */
  if (in0 == NULL_RTX && in1 == NULL_RTX)
return true;

  if (!VECTOR_TYPE_P (vectype))
   return false;

?  It seems odd to be returning false after we have decided (in the
non-generating case) that everything is OK.  When would we see a vector
mode that has an associated division instruction (checked above this),
and yet not have a vector type?

Thanks,
Richard

>> +
>> +  emit_insn (gen_aarch64_bitmask_udiv3 (TYPE_MODE (vectype), res, in0, 
>> in1));
>> +
>> +  if (*output && res != *output)
>> +emit_move_insn (*output, res);
>> +  else
>> +*output = res;
>>   
>> -  emit_insn (gen_aarch64_bitmask_udiv3 (TYPE_MODE (vectype), *output, in0, 
>> in1));
>> return true;
>>   }
>>   
>> diff --git a/gcc/testsuite/gcc.target/aarch64/pr107988-1.c 
>> b/gcc/testsuite/gcc.target/aarch64/pr107988-1.c
>> new file mode 100644
>> index 
>> ..c4fd290271b738345173b569bdc58c092fba7fe9
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/aarch64/pr107988-1.c
>> @@ -0,0 +1,10 @@
>> +/* { dg-do compile } */
>> +/* { dg-additional-options "-O0" } */
>> +typedef unsigned short __attribute__((__vector_size__ (16))) V;
>> +
>> +V
>> +foo (V v)
>> +{
>> +  v /= 255;
>> +  return v;
>> +}
>> 
>> 
>> 
>> 
>
> Otherwise OK.
>
> R.


Re: [PATCH v2 1/2] Allow subtarget customization of CC1_SPEC

2022-12-08 Thread Sebastian Huber

On 07/12/2022 10:50, Richard Sandiford wrote:

How about going back to Jose's suggestion from the original thread
of using OS_CC1_SPEC?  The patch is OK with that change if no-one
objects in 24 hours.


I checked in this change:

https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=3e4b8dc477c12f303171ec7f0394c97494095545

--
embedded brains GmbH
Herr Sebastian HUBER
Dornierstr. 4
82178 Puchheim
Germany
email: sebastian.hu...@embedded-brains.de
phone: +49-89-18 94 741 - 16
fax:   +49-89-18 94 741 - 08

Registergericht: Amtsgericht München
Registernummer: HRB 157899
Vertretungsberechtigte Geschäftsführer: Peter Rasmussen, Thomas Dörfler
Unsere Datenschutzerklärung finden Sie hier:
https://embedded-brains.de/datenschutzerklaerung/


Re: [PATCH 1/3] btf: add 'extern' linkage for variables [PR106773]

2022-12-08 Thread Indu Bhagat via Gcc-patches

Hi David,

On 12/7/22 12:57, David Faust wrote:

Add support for the 'extern' linkage value for BTF_KIND_VAR records,
which is used for variables declared as extern in the source file.

PR target/106773

gcc/

* btfout.cc (BTF_LINKAGE_STATIC): New define.
(BTF_LINKAGE_GLOBAL): Likewise.
(BTF_LINKAGE_EXTERN): Likewise.
(btf_collect_datasec): Mark extern variables as such.
(btf_asm_varent): Accomodate 'extern' linkage.

gcc/testsuite/

* gcc.dg/debug/btf/btf-variables-4.c: New test.

include/

* btf.h (struct btf_var): Update comment to note 'extern' linkage.
---
  gcc/btfout.cc |  9 ++-
  .../gcc.dg/debug/btf/btf-variables-4.c| 24 +++
  include/btf.h |  2 +-
  3 files changed, 33 insertions(+), 2 deletions(-)
  create mode 100644 gcc/testsuite/gcc.dg/debug/btf/btf-variables-4.c

diff --git a/gcc/btfout.cc b/gcc/btfout.cc
index aef9fd70a28..a1c6266a7db 100644
--- a/gcc/btfout.cc
+++ b/gcc/btfout.cc
@@ -66,6 +66,10 @@ static char btf_info_section_label[MAX_BTF_LABEL_BYTES];
  
  #define BTF_INVALID_TYPEID 0x
  
+#define BTF_LINKAGE_STATIC 0

+#define BTF_LINKAGE_GLOBAL 1
+#define BTF_LINKAGE_EXTERN 2
+


I was about to suggest to rename these to use the same name as used in 
the kernel btf.h. What is used there is:

BTF_VAR_STATIC = 0,
BTF_VAR_GLOBAL_ALLOCATED = 1,
BTF_VAR_GLOBAL_EXTERN = 2,

But after looking at the Patch 3/3, I see you reuse these definitions 
for functions as well. I just find the names confusing on the first look 
- "BTF_LINKAGE_STATIC".


Naming aside, what do you think about adding the defines to 
include/btf.h instead ?



  /* Mapping of CTF variables to the IDs they will be assigned when they are
 converted to BTF_KIND_VAR type records. Strictly accounts for the index
 from the start of the variable type entries, does not include the number
@@ -314,6 +318,9 @@ btf_collect_datasec (ctf_container_ref ctfc)
continue;
  
const char *section_name = node->get_section ();

+  /* Mark extern variables.  */
+  if (DECL_EXTERNAL (node->decl))
+   dvd->dvd_visibility = BTF_LINKAGE_EXTERN;
  


This made me think about the following case.

extern const char a[];
const char a[] = "foo";

What is the expected BTF for this? Since BTF can differentiate between 
the non-defining extern variable declaration, I expected to see two 
variables with different "linkage". At this time I see, two variables 
with global linkage but different types:


.long   0xe00   # btv_info
.long   0x4 # btv_type
.long   0x1 # btv_linkage
.long   0x1f# btv_name
.long   0xe00   # btv_info
.long   0x7 # btv_type
.long   0x1 # btv_linkage
.long   0x60# btt_name


if (section_name == NULL)
{
@@ -676,7 +683,7 @@ btf_asm_varent (ctf_dvdef_ref var)
dw2_asm_output_data (4, var->dvd_name_offset, "btv_name");
dw2_asm_output_data (4, BTF_TYPE_INFO (BTF_KIND_VAR, 0, 0), "btv_info");
dw2_asm_output_data (4, get_btf_id (var->dvd_type), "btv_type");
-  dw2_asm_output_data (4, (var->dvd_visibility ? 1 : 0), "btv_linkage");
+  dw2_asm_output_data (4, var->dvd_visibility, "btv_linkage");
  }
  
  /* Asm'out a member description following a BTF_KIND_STRUCT or

diff --git a/gcc/testsuite/gcc.dg/debug/btf/btf-variables-4.c 
b/gcc/testsuite/gcc.dg/debug/btf/btf-variables-4.c
new file mode 100644
index 000..d77600bae1c
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/debug/btf/btf-variables-4.c
@@ -0,0 +1,24 @@
+/* Test BTF generation for extern variables.  */
+
+/* { dg-do compile } */
+/* { dg-options "-O0 -gbtf -dA" } */
+
+/* Expect 4 variables.  */
+/* { dg-final { scan-assembler-times "\[\t \]0xe00\[\t 
\]+\[^\n\]*btv_info" 4 } } */
+
+/* 2 extern, 1 global, 1 static.  */
+/* { dg-final { scan-assembler-times "\[\t \]0\[\t \]+\[^\n\]*btv_linkage" 1 } 
} */
+/* { dg-final { scan-assembler-times "\[\t \]0x1\[\t \]+\[^\n\]*btv_linkage" 1 
} } */
+/* { dg-final { scan-assembler-times "\[\t \]0x2\[\t \]+\[^\n\]*btv_linkage" 2 
} } */
+
+extern int a;
+extern const int b;
+int c;
+static const int d = 5;
+
+int foo (int x)
+{
+  c = a + b + x;
+
+  return c + d;
+}
diff --git a/include/btf.h b/include/btf.h
index eba67f9d599..9a757ce5bc9 100644
--- a/include/btf.h
+++ b/include/btf.h
@@ -182,7 +182,7 @@ struct btf_param
 information about the variable.  */
  struct btf_var
  {
-  uint32_t linkage;/* Currently only 0=static or 1=global.  */
+  uint32_t linkage;/* 0=static, 1=global, 2=extern.  */
  };
  
  /* BTF_KIND_DATASEC is followed by VLEN struct btf_var_secinfo entries,




Re: [PATCH V3] Use reg mode to move sub blocks for parameters and returns

2022-12-08 Thread Jiufu Guo via Gcc-patches
Hi Segher,

Thanks a lot for your helpful comments!

Segher Boessenkool  writes:

> On Thu, Dec 08, 2022 at 09:17:38PM +0800, Jiufu Guo wrote:
>> Segher Boessenkool  writes:
>> > On Wed, Dec 07, 2022 at 08:00:08PM +0800, Jiufu Guo wrote:
>> >> typedef struct SA {double a[3];} A;
>> >> A ret_arg_pt (A *a) {return *a;} // on ppc64le, expect only 3 lfd(s)
>> >> A ret_arg (A a) {return a;} // just empty fun body
>> >> void st_arg (A a, A *p) {*p = a;} //only 3 stfd(s)
>> >
>> > What is this like if you use [5] instead?  Or use an ABI without
>> > homogeneous aggregates?
>> Thanks for this question!  I also tested the cases on different array
>> types or different sizes, or mixed field types.
>> 
>> If it is out of the number of registers for passing the param
>> or return, it is treated as a mem block.
>> For parameter, it is partially passed via registers, and partially
>> passing via stack.
>> For return, it is returned via a pointer (with one invisible pointer
>> parameter). And the  of the function is not with parallel code.
>> 
>> This patch does not cover these cases.
>
> Understood, sure; but my point is, can it degrade code quality in such
> cases?  I don't see anything in the patch that precludes that.

No, the behavior of such cases is not affected in this patch.
The preclude code is in "assign_parm_setup_block". This patch only shows
the different parts, the context is not shown.

In assign_parm_setup_block, this patch marks "DECL_STACK_REGS_P" only
for "REG_P (entry_parm) || GET_CODE (entry_parm) == PARALLEL" which
indicates the registers are enough to pass the param.

>
>> >> --- /dev/null
>> >> +++ b/gcc/testsuite/gcc.target/powerpc/pr65421-1.c
>> >> @@ -0,0 +1,15 @@
>> >> +/* PR target/65421 */
>> >> +/* { dg-options "-O2" } */
>> >> +/* { dg-require-effective-target has_arch_ppc64 } */
>> >> +
>> >> +typedef struct SA
>> >> +{
>> >> +  double a[2];
>> >> +  long l;
>> >> +} A;
>> >> +
>> >> +/* std 3 param regs to return slot */
>> >> +A ret_arg (A a) {return a;}
>> >> +/* { dg-final { scan-assembler-times {\mstd 4,0\(3\)\s} 1 } } */
>> >> +/* { dg-final { scan-assembler-times {\mstd 5,8\(3\)\s} 1 } } *
>> >> +/* { dg-final { scan-assembler-times {\mstd 6,16\(3\)\s} 1 } } */
>> >
>> > This is only correct on certain ABIs, probably only ELFv2 even.
>> Thanks for point out this!
>> This is only correct if the ABI allows this struct to be passed
>> through integer registers, and return through the mem block.
>
> And it needs to be in those specific registers / at those specific
> offsets as well.
Yes.
>
> Btw, please leave out the \s?
Thanks! 
>
>> In the previous version, I added a requirement on ELFv2. As tested on
>> BE environments, this case also pass. So, I deleted the requirement.
>
> BE for ELFv2 also exists, fwiw.
Yeap! We have -mabi=elfv2.
>
>> (While on BE environments, there is another issue: some unnecessary
>> memory stores are not deleted.)
>
> Huh.  Does that happen with the current compiler as well?  Do you have
> an example?

We can use the test case (pr65421-1.c) as the example -:)

typedef struct SA {double a[2]; long l; } A;
A ret_arg (A a) {return a;}

For this case, without the patch, below is generated:
std 4,56(1)
std 5,64(1)
li 10,56
std 6,72(1)
std 6,16(3)
lxvd2x 0,1,10
stxvd2x 0,0,3
With the patch, below is generated:
std 4,56(1)
std 5,64(1)
std 6,72(1)
std 4,0(3)
std 5,8(3)
std 6,16(3)
The first 3 std insns are reductant.  This is an unrelated issue.
With -mabi=elfv2, code can be optimized, and those 3 insns are deleted.

I think it would be fine to just test this case on powerpc_elfv2.
I would merge pr65421-1.c into pr65421.c (with dg-require elfv2).

>
>> But with more reading of the code 'rs6000_function_arg', as you said,
>> I'm not sure if this behavior meets other ABIs (at least, it seems,
>> this is not correct on darwin64).
>> So, as you said, we may add a requirement on ELFv2; Or leave this
>> case there, and add "! target" when hitting failure?
>
> If you do !target the testcase won't test much at all anymore ;-)

Right. we could use this method to exclude the sub-targets which are not
using r4,5,6 for the param for this case.

>
>> > We certainly can improve the homogeneous aggregates stuff, but please
>> > make sure you don't degrade all other stuff?  Older, as well as when
>> > things are not an homogeneous aggregate, for example too big.  Can you
>> > please add tests for such cases?
>> Sure, thanks!  I encounter one issue in this kind of case (large struct)
>> on a previous version path.
>
> Perhaps it would be better to have a hook so that every target (and
> subtarget) can fine tune exactly when this is done.  Then again, perhaps
> I worry too much.

Understand, I also thought about using a hook for targets to tune!
The good news is that: a few target hooks are used by generic code for
arg, and the target-related info (e.g. if a struct param is 

[PATCH 2/2] LoongArch: drop loongarch-driver

2022-12-08 Thread Icenowy Zheng
Currently the loongarch-driver code tries to parse the values of some -m
flags to numeric representation, and then output the strings
corresponding to the numeric representations. This is mostly an no-op,
and it leads to duplication of these flags' parse code, which makes
building libgccjit fail by having multiple copies of this code to link.
However, this driver code is previously used to generate default values
for these -m flags, which is now obsoleted by respecting the values of
--with-* build-time configure flags (or their default values determined
in config.gcc).

Some other architectures include driver code to handle
-m{arch,tune}=native by converting it to the current CPU type. However
as the LoongArch compiler itself can handle native value, such a driver
is not necessary on LoongArch.

Drop loongarch-driver now as it's mostly useless. The specs snippets
that handles -mabi values are moved to loongarch.h as they're still
useful and does not depend on the driver.

gcc/ChangeLog:

* config/loongarch/loongarch.h (ABI_GRLEN_SPEC):
Moved from loongarch-driver.h.
* config/loongarch/loongarch.h (ABI_SPEC): Ditto.
* config/loongarch/loongarch-driver.h: Removed.
* config/loongarch/loongarch-driver.cc: Removed.

Signed-off-by: Icenowy Zheng 
---
 gcc/config.gcc   |   1 -
 gcc/config/loongarch/loongarch-driver.cc | 187 ---
 gcc/config/loongarch/loongarch-driver.h  |  68 -
 gcc/config/loongarch/loongarch.h |  10 +-
 4 files changed, 8 insertions(+), 258 deletions(-)
 delete mode 100644 gcc/config/loongarch/loongarch-driver.cc
 delete mode 100644 gcc/config/loongarch/loongarch-driver.h

diff --git a/gcc/config.gcc b/gcc/config.gcc
index b5eda046033..dc1b1cdaa9d 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -459,7 +459,6 @@ loongarch*-*-*)
cpu_type=loongarch
extra_headers="larchintrin.h"
extra_objs="loongarch-c.o loongarch-builtins.o loongarch-cpu.o 
loongarch-opts.o loongarch-def.o"
-   extra_gcc_objs="loongarch-driver.o loongarch-cpu.o loongarch-opts.o 
loongarch-def.o"
extra_options="${extra_options} g.opt fused-madd.opt"
;;
 nds32*)
diff --git a/gcc/config/loongarch/loongarch-driver.cc 
b/gcc/config/loongarch/loongarch-driver.cc
deleted file mode 100644
index 0adcc923b7d..000
--- a/gcc/config/loongarch/loongarch-driver.cc
+++ /dev/null
@@ -1,187 +0,0 @@
-/* Subroutines for the gcc driver.
-   Copyright (C) 2021-2022 Free Software Foundation, Inc.
-   Contributed by Loongson Ltd.
-
-This file is part of GCC.
-
-GCC is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 3, or (at your option)
-any later version.
-
-GCC is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with GCC; see the file COPYING3.  If not see
-.  */
-
-#define IN_TARGET_CODE 1
-
-#include "config.h"
-#include "system.h"
-#include "coretypes.h"
-#include "tm.h"
-#include "obstack.h"
-#include "diagnostic-core.h"
-
-#include "loongarch-opts.h"
-#include "loongarch-driver.h"
-
-static int
-  opt_arch_driver = M_OPTION_NOT_SEEN,
-  opt_tune_driver = M_OPTION_NOT_SEEN,
-  opt_fpu_driver = M_OPTION_NOT_SEEN,
-  opt_abi_base_driver = M_OPTION_NOT_SEEN,
-  opt_abi_ext_driver = M_OPTION_NOT_SEEN,
-  opt_cmodel_driver = M_OPTION_NOT_SEEN;
-
-int opt_switches = 0;
-
-/* This flag is set to 1 if we believe that the user might be avoiding
-   linking (implicitly) against something from the startfile search paths.  */
-static int no_link = 0;
-
-#define LARCH_DRIVER_SET_M_FLAG(OPTS_ARRAY, N_OPTS, FLAG, STR) \
-  for (int i = 0; i < (N_OPTS); i++)   \
-  {\
-if ((OPTS_ARRAY)[i] != 0)  \
-  if (strcmp ((STR), (OPTS_ARRAY)[i]) == 0)\
-   (FLAG) = i; \
-  }
-
-/* Use the public obstack from the gcc driver (defined in gcc.c).
-   This is for allocating space for the returned string.  */
-extern struct obstack opts_obstack;
-
-#define APPEND_LTR(S)\
-  obstack_grow (_obstack, (const void*) (S), \
-   sizeof ((S)) / sizeof (char) -1)
-
-#define APPEND_VAL(S) \
-  obstack_grow (_obstack, (const void*) (S), strlen ((S)))
-
-
-const char*
-driver_set_m_flag (int argc, const char **argv)
-{
-  int parm_off = 0;
-
-  if (argc != 1)
-return "%eset_m_flag requires exactly 1 argument.";
-
-#undef PARM
-#define PARM (argv[0] + parm_off)
-
-/* Note: sizeof (OPTSTR_##NAME) equals 

[PATCH 1/2] LoongArch: respect the with values in config.gcc

2022-12-08 Thread Icenowy Zheng
In config.gcc, there's a long code snippet that handles
--with-{arch,tune,abi,fpu} and give them default values; however these
"with" values are not used at all.

Use these "with" values to initialize these variables in specs.

gcc/ChangeLog:

* config/loongarch/loongarch.h (OPTION_DEFAULT_SPECS):
New macro that simply injects configure --with values.

Signed-off-by: Icenowy Zheng 
---
 gcc/config/loongarch/loongarch.h | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/gcc/config/loongarch/loongarch.h b/gcc/config/loongarch/loongarch.h
index a402d3ba35a..5e2f4158f70 100644
--- a/gcc/config/loongarch/loongarch.h
+++ b/gcc/config/loongarch/loongarch.h
@@ -50,6 +50,17 @@ along with GCC; see the file COPYING3.  If not see
 /* Driver native functions for SPEC processing in the GCC driver.  */
 #include "loongarch-driver.h"
 
+/* Support for a compile-time default CPU, et cetera.  The rules are:
+   --with-arch is ignored if -march is specified.
+   --with-tune is ignored if -mtune is specified.
+   --with-abi is ignored if -mabi is specified.
+   --with-fpu is ignored if -mfpu is specified. */
+#define OPTION_DEFAULT_SPECS \
+  {"arch", "%{!march=*:-march=%(VALUE)}" }, \
+  {"tune", "%{!mtune=*:-mtune=%(VALUE)}" }, \
+  {"abi", "%{!mabi=*:-mabi=%(VALUE)}" }, \
+  {"fpu", "%{!mfpu=*:-mfpu=%(VALUE)}" }, \
+
 /* This definition replaces the formerly used 'm' constraint with a
different constraint letter in order to avoid changing semantics of
the 'm' constraint when accepting new address formats in
-- 
2.38.1



[committed] analyzer: rename region-model-impl-calls.cc to kf.cc

2022-12-08 Thread David Malcolm via Gcc-patches
Successfully bootstrapped & regrtested on x86_64-pc-linux-gnu.
Pushed to trunk as r13-4579-g63a42ffc083355.

gcc/ChangeLog:
* Makefile.in (ANALYZER_OBJS): Update for renaming of
analyzer/region-model-impl-calls.cc to analyzer/kf.cc.

gcc/analyzer/ChangeLog:
* analyzer.h (class known_function): Expand comment.
* region-model-impl-calls.cc: Rename to...
* kf.cc: ...this.
* known-function-manager.h (class known_function_manager): Add
leading comment.

Signed-off-by: David Malcolm 
---
 gcc/Makefile.in|  2 +-
 gcc/analyzer/analyzer.h|  3 ++-
 gcc/analyzer/{region-model-impl-calls.cc => kf.cc} |  0
 gcc/analyzer/known-function-manager.h  | 12 
 4 files changed, 15 insertions(+), 2 deletions(-)
 rename gcc/analyzer/{region-model-impl-calls.cc => kf.cc} (100%)

diff --git a/gcc/Makefile.in b/gcc/Makefile.in
index 7bcc5e501de..995d77f96c4 100644
--- a/gcc/Makefile.in
+++ b/gcc/Makefile.in
@@ -1269,6 +1269,7 @@ ANALYZER_OBJS = \
analyzer/feasible-graph.o \
analyzer/function-set.o \
analyzer/infinite-recursion.o \
+   analyzer/kf.o \
analyzer/kf-analyzer.o \
analyzer/kf-lang-cp.o \
analyzer/known-function-manager.o \
@@ -1278,7 +1279,6 @@ ANALYZER_OBJS = \
analyzer/region.o \
analyzer/region-model.o \
analyzer/region-model-asm.o \
-   analyzer/region-model-impl-calls.o \
analyzer/region-model-manager.o \
analyzer/region-model-reachability.o \
analyzer/sm.o \
diff --git a/gcc/analyzer/analyzer.h b/gcc/analyzer/analyzer.h
index 418d4210755..19e5b9011fe 100644
--- a/gcc/analyzer/analyzer.h
+++ b/gcc/analyzer/analyzer.h
@@ -229,7 +229,8 @@ extern location_t get_stmt_location (const gimple *stmt, 
function *fun);
 extern bool compat_types_p (tree src_type, tree dst_type);
 
 /* Abstract base class for simulating the behavior of known functions,
-   supplied by the core of the analyzer, or by plugins.  */
+   supplied by the core of the analyzer, or by plugins.
+   The former are typically implemented in the various kf*.cc  */
 
 class known_function
 {
diff --git a/gcc/analyzer/region-model-impl-calls.cc b/gcc/analyzer/kf.cc
similarity index 100%
rename from gcc/analyzer/region-model-impl-calls.cc
rename to gcc/analyzer/kf.cc
diff --git a/gcc/analyzer/known-function-manager.h 
b/gcc/analyzer/known-function-manager.h
index 188cb8e034a..7bacafe8e24 100644
--- a/gcc/analyzer/known-function-manager.h
+++ b/gcc/analyzer/known-function-manager.h
@@ -25,6 +25,18 @@ along with GCC; see the file COPYING3.  If not see
 
 namespace ana {
 
+/* Instances of known_function are registered with the known_function_manager
+   when the analyzer starts.
+
+   The known_function_manager has responsibility for determining which
+   known_function instance (if any) is relevant at a call site, by checking
+   name or id, and by calling known_function::matches_call_types_p to ensure
+   that the known_function's preconditions hold (typically assumptions about
+   types e.g. that "has 3 args, and that arg 0 is of pointer type").
+
+   The known_function subclasses themselves have responsibility for
+   determining the outcome(s) of the call.  */
+
 class known_function_manager : public log_user
 {
 public:
-- 
2.26.3



[committed] analyzer: fix ICE on region creation during get_referenced_base_regions [PR108003]

2022-12-08 Thread David Malcolm via Gcc-patches
Successfully bootstrapped & regrtested on x86_64-pc-linux-gnu.
Pushed to trunk as r13-4578-g7dc0ecafe60b22.

gcc/analyzer/ChangeLog:
PR analyzer/108003
* call-summary.cc
(call_summary_replay::convert_region_from_summary_1): Convert
heap_regs_in_use from auto_sbitmap to auto_bitmap.
* region-model-manager.cc
(region_model_manager::get_or_create_region_for_heap_alloc):
Convert from sbitmap to bitmap.
* region-model-manager.h: Likewise.
* region-model.cc
(region_model::get_or_create_region_for_heap_alloc): Convert from
auto_sbitmap to auto_bitmap.
(region_model::get_referenced_base_regions): Likewise.
* region-model.h: Include "bitmap.h" rather than "sbitmap.h".
(region_model::get_referenced_base_regions): Convert from
auto_sbitmap to auto_bitmap.

gcc/testsuite/ChangeLog:
PR analyzer/108003
* g++.dg/analyzer/pr108003.C: New test.

Signed-off-by: David Malcolm 
---
 gcc/analyzer/call-summary.cc |  2 +-
 gcc/analyzer/region-model-manager.cc |  2 +-
 gcc/analyzer/region-model-manager.h  |  2 +-
 gcc/analyzer/region-model.cc |  4 +--
 gcc/analyzer/region-model.h  |  4 +--
 gcc/testsuite/g++.dg/analyzer/pr108003.C | 37 
 6 files changed, 44 insertions(+), 7 deletions(-)
 create mode 100644 gcc/testsuite/g++.dg/analyzer/pr108003.C

diff --git a/gcc/analyzer/call-summary.cc b/gcc/analyzer/call-summary.cc
index 31674736ac5..a18a1b1b40a 100644
--- a/gcc/analyzer/call-summary.cc
+++ b/gcc/analyzer/call-summary.cc
@@ -726,7 +726,7 @@ call_summary_replay::convert_region_from_summary_1 (const 
region *summary_reg)
/* If we have a heap-allocated region in the summary, then
   it was allocated within the callee.
   Create a new heap-allocated region to summarize this.  */
-   auto_sbitmap heap_regs_in_use (mgr->get_num_regions ());
+   auto_bitmap heap_regs_in_use;
get_caller_model ()->get_referenced_base_regions (heap_regs_in_use);
return mgr->get_or_create_region_for_heap_alloc (heap_regs_in_use);
   }
diff --git a/gcc/analyzer/region-model-manager.cc 
b/gcc/analyzer/region-model-manager.cc
index 0fb96386f28..dad7c411446 100644
--- a/gcc/analyzer/region-model-manager.cc
+++ b/gcc/analyzer/region-model-manager.cc
@@ -1698,7 +1698,7 @@ get_region_for_unexpected_tree_code (region_model_context 
*ctxt,
 
 const region *
 region_model_manager::
-get_or_create_region_for_heap_alloc (const sbitmap _regs_in_use)
+get_or_create_region_for_heap_alloc (const bitmap _regs_in_use)
 {
   /* Try to reuse an existing region, if it's unreferenced in the
  client state.  */
diff --git a/gcc/analyzer/region-model-manager.h 
b/gcc/analyzer/region-model-manager.h
index 13fbe483f6d..ca9a498f12f 100644
--- a/gcc/analyzer/region-model-manager.h
+++ b/gcc/analyzer/region-model-manager.h
@@ -155,7 +155,7 @@ public:
  The number of these within the analysis can grow arbitrarily.
  They are still owned by the manager.  */
   const region *
-  get_or_create_region_for_heap_alloc (const sbitmap _regs_in_use);
+  get_or_create_region_for_heap_alloc (const bitmap _regs_in_use);
   const region *create_region_for_alloca (const frame_region *frame);
 
   void log_stats (logger *logger, bool show_objs) const;
diff --git a/gcc/analyzer/region-model.cc b/gcc/analyzer/region-model.cc
index 18eaf22a5d1..f6cd34f4c22 100644
--- a/gcc/analyzer/region-model.cc
+++ b/gcc/analyzer/region-model.cc
@@ -4904,7 +4904,7 @@ region_model::get_or_create_region_for_heap_alloc (const 
svalue *size_in_bytes,
   /* Determine which regions are referenced in this region_model, so that
  we can reuse an existing heap_allocated_region if it's not in use on
  this path.  */
-  auto_sbitmap base_regs_in_use (m_mgr->get_num_regions ());
+  auto_bitmap base_regs_in_use;
   get_referenced_base_regions (base_regs_in_use);
   const region *reg
 = m_mgr->get_or_create_region_for_heap_alloc (base_regs_in_use);
@@ -4917,7 +4917,7 @@ region_model::get_or_create_region_for_heap_alloc (const 
svalue *size_in_bytes,
reachable in this region_model.  */
 
 void
-region_model::get_referenced_base_regions (auto_sbitmap _ids) const
+region_model::get_referenced_base_regions (auto_bitmap _ids) const
 {
   reachable_regions reachable_regs (const_cast (this));
   m_store.for_each_cluster (reachable_regions::init_cluster_cb,
diff --git a/gcc/analyzer/region-model.h b/gcc/analyzer/region-model.h
index 291bb2ff45a..626b10d2538 100644
--- a/gcc/analyzer/region-model.h
+++ b/gcc/analyzer/region-model.h
@@ -26,7 +26,7 @@ along with GCC; see the file COPYING3.  If not see
   (Zhongxing Xu, Ted Kremenek, and Jian Zhang)
  http://lcs.ios.ac.cn/~xuzb/canalyze/memmodel.pdf  */
 
-#include "sbitmap.h"
+#include "bitmap.h"
 #include "selftest.h"
 #include "analyzer/svalue.h"
 #include "analyzer/region.h"
@@ -390,7 +390,7 @@ 

[committed] analyzer: handle memmove like memcpy

2022-12-08 Thread David Malcolm via Gcc-patches
On Thu, 2022-12-08 at 07:36 -0300, Alexandre Oliva wrote:
> Hello again, David,
> 
> On Dec  2, 2022, David Malcolm  wrote:
> 
> > I had a go at porting your patch to trunk; here's the result.
> 
> Oh, wow, nice!  Thank you so much.
> 
> I confirm it works on riscv64-elf too.

Thanks.

When I ran the full test suite (on x86_64) it turned out that the added
check_for_poison in memcpy was correctly flagging some uses of
uninitialized source buffers in the testsuite.  So I added modified the
patch further, adding dg-warning directives in a few places, and adding
some more test coverage of memcpy/memmove from uninit buffers.

Here's what I've pushed to trunk (as r13-4577-gcf80a23e19db83);
successfully bootstrapped & regrtested on x86_64-pc-linux-gnu.
Hopefully this doesn't introduce any further failures on riscv.

Dave


gcc/analyzer/ChangeLog:
* region-model-impl-calls.cc (class kf_memcpy): Rename to...
(class kf_memcpy_memmove): ...this.
(kf_memcpy::impl_call_pre): Rename to...
(kf_memcpy_memmove::impl_call_pre): ...this, and check the src for
poison.
(register_known_functions): Update for above renaming, and
register BUILT_IN_MEMMOVE and BUILT_IN_MEMMOVE_CHK.

gcc/testsuite/ChangeLog:
* gcc.dg/analyzer/memcpy-1.c (test_8a, test_8b): New tests.
* gcc.dg/analyzer/memmove-1.c: New test, based on memcpy-1.c
* gcc.dg/analyzer/out-of-bounds-1.c (test7): Update expected
result for uninit srcBuf.
* gcc.dg/analyzer/out-of-bounds-5.c (test8, test9): Add
dg-warnings for memcpy from uninit src vla.
* gcc.dg/analyzer/pr104308.c (test_memmove_within_uninit):
Expect creation point note to be missing on riscv*-*-*.

Signed-off-by: David Malcolm 
---
 gcc/analyzer/region-model-impl-calls.cc   |  18 +-
 gcc/testsuite/gcc.dg/analyzer/memcpy-1.c  |  14 ++
 gcc/testsuite/gcc.dg/analyzer/memmove-1.c | 182 ++
 .../gcc.dg/analyzer/out-of-bounds-1.c |   2 +-
 .../gcc.dg/analyzer/out-of-bounds-5.c |   2 +
 gcc/testsuite/gcc.dg/analyzer/pr104308.c  |   2 +-
 6 files changed, 212 insertions(+), 8 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/analyzer/memmove-1.c

diff --git a/gcc/analyzer/region-model-impl-calls.cc 
b/gcc/analyzer/region-model-impl-calls.cc
index 6aeb9281bff..ff2f1b1ef9c 100644
--- a/gcc/analyzer/region-model-impl-calls.cc
+++ b/gcc/analyzer/region-model-impl-calls.cc
@@ -246,10 +246,12 @@ kf_malloc::impl_call_pre (const call_details ) const
 }
 }
 
-/* Handler for "memcpy" and "__builtin_memcpy".  */
-// TODO: complain about overlapping src and dest.
+/* Handler for "memcpy" and "__builtin_memcpy",
+   "memmove", and "__builtin_memmove".  */
+/* TODO: complain about overlapping src and dest for the memcpy
+   variants.  */
 
-class kf_memcpy : public known_function
+class kf_memcpy_memmove : public known_function
 {
 public:
   bool matches_call_types_p (const call_details ) const final override
@@ -263,7 +265,7 @@ public:
 };
 
 void
-kf_memcpy::impl_call_pre (const call_details ) const
+kf_memcpy_memmove::impl_call_pre (const call_details ) const
 {
   const svalue *dest_ptr_sval = cd.get_arg_svalue (0);
   const svalue *src_ptr_sval = cd.get_arg_svalue (1);
@@ -285,6 +287,8 @@ kf_memcpy::impl_call_pre (const call_details ) const
 = mgr->get_sized_region (dest_reg, NULL_TREE, num_bytes_sval);
   const svalue *src_contents_sval
 = model->get_store_value (sized_src_reg, cd.get_ctxt ());
+  model->check_for_poison (src_contents_sval, cd.get_arg_tree (1),
+  cd.get_ctxt ());
   model->set_value (sized_dest_reg, src_contents_sval, cd.get_ctxt ());
 }
 
@@ -927,8 +931,10 @@ register_known_functions (known_function_manager )
 kfm.add (BUILT_IN_EXPECT_WITH_PROBABILITY, make_unique ());
 kfm.add (BUILT_IN_FREE, make_unique ());
 kfm.add (BUILT_IN_MALLOC, make_unique ());
-kfm.add (BUILT_IN_MEMCPY, make_unique ());
-kfm.add (BUILT_IN_MEMCPY_CHK, make_unique ());
+kfm.add (BUILT_IN_MEMCPY, make_unique ());
+kfm.add (BUILT_IN_MEMCPY_CHK, make_unique ());
+kfm.add (BUILT_IN_MEMMOVE, make_unique ());
+kfm.add (BUILT_IN_MEMMOVE_CHK, make_unique ());
 kfm.add (BUILT_IN_MEMSET, make_unique ());
 kfm.add (BUILT_IN_MEMSET_CHK, make_unique ());
 kfm.add (BUILT_IN_REALLOC, make_unique ());
diff --git a/gcc/testsuite/gcc.dg/analyzer/memcpy-1.c 
b/gcc/testsuite/gcc.dg/analyzer/memcpy-1.c
index a9368d3307d..b1ffed0a979 100644
--- a/gcc/testsuite/gcc.dg/analyzer/memcpy-1.c
+++ b/gcc/testsuite/gcc.dg/analyzer/memcpy-1.c
@@ -166,3 +166,17 @@ void test_7b (void *src, size_t sz)
 {
   memcpy ((void *)"hello world", src, sz); /* { dg-warning "write to string 
literal" } */
 }
+
+/* memcpy from uninitialized buffer.  */
+
+void test_8a (void *dst)
+{
+  char src[16];
+  memcpy (dst, src, 16); /* { dg-warning "use of uninitialized value" } */
+}
+
+void test_8b (void *dst, size_t n)
+{
+  

Re: [PATCH 0/2] Support HWASAN with Intel LAM

2022-12-08 Thread Hongtao Liu via Gcc-patches
On Wed, Nov 30, 2022 at 10:07 PM Martin Liška  wrote:
>
> On 11/29/22 03:37, Hongtao Liu wrote:
> > On Mon, Nov 28, 2022 at 10:40 PM Martin Liška  wrote:
> >>
> >> On 11/11/22 02:26, liuhongt via Gcc-patches wrote:
> >>>2 years ago, ARM folks support HWASAN[1] in GCC[2], and introduced 
> >>> several
> >>> target hooks(Many thanks to their work) so other backends can do similar
> >>> things if they have similar feature.
> >>>Intel LAM(linear Address Masking)[3 Charpter 14] supports similar 
> >>> feature with
> >>> the upper bits of pointers can be used as metadata, LAM support two modes:
> >>>LAM_U48:bits 48-62 can be used as metadata
> >>>LAM_U57:bits 57-62 can be used as metedata.
> >>>
> >>> These 2 patches mainly support those target hooks, but HWASAN is not 
> >>> really
> >>> enabled until the final decision for the LAM kernel interface which may 
> >>> take
> >>> quite a long time. We have verified our patches with a "fake" interface 
> >>> locally[4], and
> >>> decided to push the backend patches to the GCC13 to make other HWASAN 
> >>> developper's work
> >>> easy.
I've committed 2 patches.
> >>
> >> Hello.
> >>
> >> A few random comments I noticed:
> >>
> >> 1) please document the new target -mlam in extend.texi
> > I will.
>
> Thanks.
>
> >> 2) the description speaks about bits [48-62] or [57-62], can explain why 
> >> the patch contains:
> >>
> > Kernel will use bit 63 for special purposes, and here we want to
> > extract the tag by shifting right the pointer 57 bits, and need to
> > manually mask off bit63.
>
> And thanks for the explanation.
>
> Martin
>
> >> +  /* Mask off bit63 when LAM_U57.  */
> >> +  if (ix86_lam_type == lam_u57)
> >> ?
> >>
> >> 3) Shouldn't the -lman option emit GNU_PROPERTY_X86_FEATURE_1_LAM_U57 or 
> >> GNU_PROPERTY_X86_FEATURE_1_LAM_U48
> >> .gnu.property note?
> >>
> >> 4) Can you please explain Florian's comment here:
> >> https://gitlab.com/x86-psABIs/x86-64-ABI/-/merge_requests/13#note_1181396487
> >>
> >> Thanks,
> >> Martin
> >>
> >>>
> >>> [1] 
> >>> https://clang.llvm.org/docs/HardwareAssistedAddressSanitizerDesign.html
> >>> [2] https://gcc.gnu.org/pipermail/gcc-patches/2020-November/557857.html
> >>> [3] 
> >>> https://www.intel.com/content/dam/develop/external/us/en/documents/architecture-instruction-set-extensions-programming-reference.pdf
> >>> [4] https://gitlab.com/x86-gcc/gcc/-/tree/users/intel/lam/master
> >>>
> >>>
> >>> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> >>> Ok for trunk?
> >>>
> >>> liuhongt (2):
> >>>Implement hwasan target_hook.
> >>>Enable hwasan for x86-64.
> >>>
> >>>   gcc/config/i386/i386-expand.cc  |  12 
> >>>   gcc/config/i386/i386-options.cc |   3 +
> >>>   gcc/config/i386/i386-opts.h |   6 ++
> >>>   gcc/config/i386/i386-protos.h   |   2 +
> >>>   gcc/config/i386/i386.cc | 123 
> >>>   gcc/config/i386/i386.opt|  16 +
> >>>   libsanitizer/configure.tgt  |   1 +
> >>>   7 files changed, 163 insertions(+)
> >>>
> >>
> >
> >
>


-- 
BR,
Hongtao


Re: [PATCH 2/3]rs6000: NFC use sext_hwi to replace ((v&0xf..f)^0x80..0) - 0x80..0

2022-12-08 Thread Jiufu Guo via Gcc-patches
Hi,

Jiufu Guo via Gcc-patches  writes:

> Hi Kewen,
>
> "Kewen.Lin"  writes:
>> on 2022/12/1 20:16, guojiufu wrote:
>>> On 2022-12-01 15:10, Jiufu Guo via Gcc-patches wrote:
 Hi Kewen,
cut...
>>> From 8aa8e1234b6ec34473434951a3a6177253aac770 Mon Sep 17 00:00:00 2001
>>> From: Jiufu Guo 
>>> Date: Wed, 30 Nov 2022 13:13:37 +0800
>>> Subject: [PATCH 2/2]rs6000: update ((v&0xf..f)^0x80..0) - 0x80..0 with 
>>> code: like sext_hwi
>>> 
>>
>> May be shorter with "rs6000: Update sign extension computation with
>> sext_hwi"?
> Thanks for your great suggestions!
>>
>>> This patch just replaces the expression like: 
>>> ((value & 0xf..f) ^ 0x80..0) - 0x80..0 to better code(e.g. sext_hwi) for
>>> rs6000.cc, rs6000.md and predicates.md (files under rs6000/).
>>
>>
>>> Bootstrap and regtest pass on ppc64{,le}.
>>> 
>>
>> Thanks for updating and testing, this patch is OK.
Committed via r13-4556.  Thanks again!

BR,
Jeff (Jiufu)
>
> BR,
> Jeff (Jiufu)
>
>>
>> BR,
>> Kewen


[committed] libstdc++: Fix some -Wunused warnings in tests

2022-12-08 Thread Jonathan Wakely via Gcc-patches
Tested x86_64-linux. Pushed to trunk.

-- >8 --

libstdc++-v3/ChangeLog:

* include/ext/pb_ds/detail/type_utils.hpp (PB_DS_STATIC_ASSERT):
Add unused attribute to avoid -Wunused-local-typedef warnings.
* testsuite/17_intro/tag_type_explicit_ctor.cc: Add pragma to
ignore -Wunused-variable warnings
---
 libstdc++-v3/include/ext/pb_ds/detail/type_utils.hpp  | 3 ++-
 libstdc++-v3/testsuite/17_intro/tag_type_explicit_ctor.cc | 2 ++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/libstdc++-v3/include/ext/pb_ds/detail/type_utils.hpp 
b/libstdc++-v3/include/ext/pb_ds/detail/type_utils.hpp
index 52d05392975..c3db6c93ea6 100644
--- a/libstdc++-v3/include/ext/pb_ds/detail/type_utils.hpp
+++ b/libstdc++-v3/include/ext/pb_ds/detail/type_utils.hpp
@@ -152,7 +152,8 @@ namespace __gnu_pbds
 };
 
 #define PB_DS_STATIC_ASSERT(UNIQUE, E)  \
-typedef 
__gnu_pbds::detail::__static_assert_dumclass)>
 UNIQUE##__static_assert_type
+typedef 
__gnu_pbds::detail::__static_assert_dumclass)>
 \
+  UNIQUE##__static_assert_type __attribute__((__unused__))
 
 #endif
 
diff --git a/libstdc++-v3/testsuite/17_intro/tag_type_explicit_ctor.cc 
b/libstdc++-v3/testsuite/17_intro/tag_type_explicit_ctor.cc
index 410142d3974..fead30f63c3 100644
--- a/libstdc++-v3/testsuite/17_intro/tag_type_explicit_ctor.cc
+++ b/libstdc++-v3/testsuite/17_intro/tag_type_explicit_ctor.cc
@@ -34,6 +34,8 @@ void f5(std::try_to_lock_t);
 void f6(std::adopt_lock_t);
 #endif
 
+#pragma GCC diagnostic ignored "-Wunused-variable"
+
 int main()
 {
   std::nothrow_t v1;
-- 
2.38.1



[committed] libstdc++: Remove digit separators [PR108015]

2022-12-08 Thread Jonathan Wakely via Gcc-patches
Tested x86_64-linux. Pushed to trunk.

-- >8 --

These are not valid in C++11 and cause a warning when preprocessing,
even though they're inside a skipped group.

chrono:2436: warning: missing terminating ' character

libstdc++-v3/ChangeLog:

PR libstdc++/108015
* include/std/chrono (hh_mm_ss): Remove digit separators.
---
 libstdc++-v3/include/std/chrono | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libstdc++-v3/include/std/chrono b/libstdc++-v3/include/std/chrono
index 8d73d98e4cb..38ecd3142bc 100644
--- a/libstdc++-v3/include/std/chrono
+++ b/libstdc++-v3/include/std/chrono
@@ -2433,7 +2433,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
template
  requires (!treat_as_floating_point_v<_Rep>)
&& ratio_less_v<_Period, ratio<1, 250>>
-   && (ratio_greater_equal_v<_Period, ratio<1, 4'000'000'000>>
+   && (ratio_greater_equal_v<_Period, ratio<1, 40>>
  || __fits)
  struct __subseconds>
  {
-- 
2.38.1



[committed] libstdc++: Add [[nodiscard]] to chrono conversion functions

2022-12-08 Thread Jonathan Wakely via Gcc-patches
Tested x86_64-linux. Pushed to trunk.

-- >8 --

Also add doxygen comments.

libstdc++-v3/ChangeLog:

* include/bits/chrono.h (duration_cast, floor, round, abs, ceil)
(time_point_cast): Add [[nodiscard]] attribute and doxygen
comments.
(treat_as_floating_point): Add doxygen commen.
---
 libstdc++-v3/include/bits/chrono.h | 139 +
 1 file changed, 123 insertions(+), 16 deletions(-)

diff --git a/libstdc++-v3/include/bits/chrono.h 
b/libstdc++-v3/include/bits/chrono.h
index 496e9485a73..22c0be3fbe6 100644
--- a/libstdc++-v3/include/bits/chrono.h
+++ b/libstdc++-v3/include/bits/chrono.h
@@ -246,8 +246,18 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 
 /// @endcond
 
-/// duration_cast
+/** Convert a `duration` to type `ToDur`.
+ *
+ * If the duration cannot be represented accurately in the result type,
+ * returns the result of integer truncation (i.e., rounded towards zero).
+ *
+ * @tparam _ToDur The result type must be a `duration`.
+ * @param __d A duration.
+ * @return The value of `__d` converted to type `_ToDur`.
+ * @since C++11
+ */
 template
+  _GLIBCXX_NODISCARD
   constexpr __enable_if_is_duration<_ToDur>
   duration_cast(const duration<_Rep, _Period>& __d)
   {
@@ -260,7 +270,17 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
return __dc::__cast(__d);
   }
 
-/// treat_as_floating_point
+/** Trait indicating whether to treat a type as a floating-point type.
+ *
+ * The chrono library uses this trait to tell whether a `duration` can
+ * represent fractional values of the given precision, or only integral
+ * values.
+ *
+ * You should specialize this trait for your own numeric types that are
+ * used with `duration` and can represent non-integral values.
+ *
+ * @since C++11
+ */
 template
   struct treat_as_floating_point
   : is_floating_point<_Rep>
@@ -320,8 +340,18 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 #if __cplusplus >= 201703L
 # define __cpp_lib_chrono 201611L
 
+/** Convert a `duration` to type `ToDur` and round down.
+ *
+ * If the duration cannot be represented exactly in the result type,
+ * returns the closest value that is less than the argument.
+ *
+ * @tparam _ToDur The result type must be a `duration`.
+ * @param __d A duration.
+ * @return The value of `__d` converted to type `_ToDur`.
+ * @since C++17
+ */
 template
-  constexpr __enable_if_is_duration<_ToDur>
+  [[nodiscard]] constexpr __enable_if_is_duration<_ToDur>
   floor(const duration<_Rep, _Period>& __d)
   {
auto __to = chrono::duration_cast<_ToDur>(__d);
@@ -330,8 +360,18 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
return __to;
   }
 
+/** Convert a `duration` to type `ToDur` and round up.
+ *
+ * If the duration cannot be represented exactly in the result type,
+ * returns the closest value that is greater than the argument.
+ *
+ * @tparam _ToDur The result type must be a `duration`.
+ * @param __d A duration.
+ * @return The value of `__d` converted to type `_ToDur`.
+ * @since C++17
+ */
 template
-  constexpr __enable_if_is_duration<_ToDur>
+  [[nodiscard]] constexpr __enable_if_is_duration<_ToDur>
   ceil(const duration<_Rep, _Period>& __d)
   {
auto __to = chrono::duration_cast<_ToDur>(__d);
@@ -340,8 +380,20 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
return __to;
   }
 
+/** Convert a `duration` to type `ToDur` and round to the closest value.
+ *
+ * If the duration cannot be represented exactly in the result type,
+ * returns the closest value, rounding ties to even.
+ *
+ * @tparam _ToDur The result type must be a `duration` with a
+ *non-floating-point `rep` type.
+ * @param __d A duration.
+ * @return The value of `__d` converted to type `_ToDur`.
+ * @since C++17
+ */
 template 
-  constexpr enable_if_t<
+  [[nodiscard]] constexpr
+  enable_if_t<
__and_<__is_duration<_ToDur>,
   __not_>>::value,
_ToDur>
@@ -352,18 +404,24 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
auto __diff0 = __d - __t0;
auto __diff1 = __t1 - __d;
if (__diff0 == __diff1)
-   {
+ {
if (__t0.count() & 1)
-   return __t1;
+ return __t1;
return __t0;
-   }
+ }
else if (__diff0 < __diff1)
-   return __t0;
+ return __t0;
return __t1;
   }
 
+/** The absolute (non-negative) value of a duration.
+ *
+ * @param __d A duration with a signed `rep` type.
+ * @return A duration of the same type as the argument, with value |d|.
+ * @since C++17
+ */
 template
-  constexpr
+  [[nodiscard]] constexpr
   enable_if_t::is_signed, duration<_Rep, _Period>>
   

[committed] libstdc++: Change class-key for duration and time_point to class

2022-12-08 Thread Jonathan Wakely via Gcc-patches
Tested x86_64-linux. Pushed to trunk.

-- >8 --

We define these with the 'struct' keyword, but the standard uses
'class'. This results in warnings if users try to refer to them using
elaborated type specifiers.

libstdc++-v3/ChangeLog:

* include/bits/chrono.h (duration, time_point): Change 'struct'
to 'class'.
---
 libstdc++-v3/include/bits/chrono.h | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/libstdc++-v3/include/bits/chrono.h 
b/libstdc++-v3/include/bits/chrono.h
index cabf61264d8..496e9485a73 100644
--- a/libstdc++-v3/include/bits/chrono.h
+++ b/libstdc++-v3/include/bits/chrono.h
@@ -59,11 +59,11 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 
 /// `chrono::duration` represents a distance between two points in time
 template>
-  struct duration;
+  class duration;
 
 /// `chrono::time_point` represents a point in time as measured by a clock
 template
-  struct time_point;
+  class time_point;
 /// @}
   }
 
@@ -431,14 +431,13 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 /// @endcond
 
 template
-  struct duration
+  class duration
   {
static_assert(!__is_duration<_Rep>::value, "rep cannot be a duration");
static_assert(__is_ratio<_Period>::value,
  "period must be a specialization of ratio");
static_assert(_Period::num > 0, "period must be positive");
 
-  private:
template
  using __is_float = treat_as_floating_point<_Rep2>;
 
@@ -844,11 +843,12 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 #undef _GLIBCXX_CHRONO_INT64_T
 
 template
-  struct time_point
+  class time_point
   {
static_assert(__is_duration<_Dur>::value,
"duration must be a specialization of std::chrono::duration");
 
+  public:
typedef _Clock  clock;
typedef _Durduration;
typedef typename duration::rep  rep;
-- 
2.38.1



Re: [PATCH v3] docs: Suggest options to improve ASAN stack traces

2022-12-08 Thread Jakub Jelinek via Gcc-patches
On Thu, Dec 08, 2022 at 05:56:41PM -0500, Marek Polacek wrote:
> Can't hurt.  Here's an updated patch.
>  
> -- >8 --
> I got a complaint that while Clang docs suggest options that improve
> the quality of the backtraces ASAN prints (cf.
> ), our docs
> don't say anything to that effect.  This patch amends that with a new
> paragraph.  (It deliberately doesn't mention -fno-omit-frame-pointer.)
> 
> gcc/ChangeLog:
> 
>   * doc/invoke.texi (-fsanitize=address): Suggest options to improve
>   stack traces.

Ok, thanks.

> diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
> index 726392409b6..3f2512ce16a 100644
> --- a/gcc/doc/invoke.texi
> +++ b/gcc/doc/invoke.texi
> @@ -16510,6 +16510,16 @@ The option cannot be combined with 
> @option{-fsanitize=thread} or
>  @option{-fsanitize=hwaddress}.  Note that the only target
>  @option{-fsanitize=hwaddress} is currently supported on is AArch64.
>  
> +To get more accurate stack traces, it is possible to use options such as
> +@option{-O0}, @option{-O1}, or @option{-Og} (which, for instance, prevent
> +most function inlining), @option{-fno-optimize-sibling-calls} (which prevents
> +optimizing sibling and tail recursive calls; this option is implicit for
> +@option{-O0}, @option{-O1}, or @option{-Og}), or @option{-fno-ipa-icf} (which
> +disables Identical Code Folding for functions).  Since multiple runs of the
> +program may yield backtraces with different addresses due to ASLR (Address
> +Space Layout Randomization), it may be desirable to turn ASLR off.  On Linux,
> +this can be achieved with @samp{setarch `uname -m` -R ./prog}.
> +
>  @item -fsanitize=kernel-address
>  @opindex fsanitize=kernel-address
>  Enable AddressSanitizer for Linux kernel.

Jakub



[PATCH v3] docs: Suggest options to improve ASAN stack traces

2022-12-08 Thread Marek Polacek via Gcc-patches
On Thu, Dec 08, 2022 at 04:00:15PM +0100, Jakub Jelinek wrote:
> On Thu, Dec 08, 2022 at 09:34:34AM -0500, Marek Polacek wrote:
> > I got a complaint that while Clang docs suggest options that improve
> > the quality of the backtraces ASAN prints (cf.
> > ), our docs
> > don't say anything to that effect.  This patch amends that with a new
> > paragraph.  (It deliberately doesn't mention -fno-omit-frame-pointer.)
> > 
> > gcc/ChangeLog:
> > 
> > * doc/invoke.texi (-fsanitize=address): Suggest options to improve
> > stack traces.
> > ---
> >  gcc/doc/invoke.texi | 9 +
> >  1 file changed, 9 insertions(+)
> > 
> > diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
> > index 726392409b6..1641efecf18 100644
> > --- a/gcc/doc/invoke.texi
> > +++ b/gcc/doc/invoke.texi
> > @@ -16510,6 +16510,15 @@ The option cannot be combined with 
> > @option{-fsanitize=thread} or
> >  @option{-fsanitize=hwaddress}.  Note that the only target
> >  @option{-fsanitize=hwaddress} is currently supported on is AArch64.
> >  
> > +To get more accurate stack traces, it is possible to use options such as
> > +@option{-O} (which, for instance, prevents most function inlining),
> 
> Still not sure about this part.  For one, I wonder if we shouldn't
> recommend -O0, -O1 or -Og instead of just one of them, and I'm also not sure
> how much function inlining is prevented with -O1.

Right, that's why I put "most" in there.  But I think we should mention -O0
and -Og as well.

> always_inline functions are certainly inlined even at -O0 or -Og (at least
> when called directly), -O1 adds
> { OPT_LEVELS_1_PLUS_NOT_DEBUG, OPT_finline_functions_called_once, NULL, 1 
> },
> to that, -O2 adds
> { OPT_LEVELS_2_PLUS, OPT_findirect_inlining, NULL, 1 },
> { OPT_LEVELS_2_PLUS, OPT_finline_small_functions, NULL, 1 },
> { OPT_LEVELS_2_PLUS, OPT_fpartial_inlining, NULL, 1 },
> { OPT_LEVELS_2_PLUS, OPT_finline_functions, NULL, 1 },
> and -O3 further bumps some parameters:
> { OPT_LEVELS_3_PLUS, OPT__param_max_inline_insns_auto_, NULL, 30 },
> { OPT_LEVELS_3_PLUS, OPT__param_early_inlining_insns_, NULL, 14 },
> { OPT_LEVELS_3_PLUS, OPT__param_inline_heuristics_hint_percent_, NULL, 
> 600 },
> { OPT_LEVELS_3_PLUS, OPT__param_inline_min_speedup_, NULL, 15 },
> { OPT_LEVELS_3_PLUS, OPT__param_max_inline_insns_single_, NULL, 200 },
> 
> > +@option{-fno-optimize-sibling-calls} (which prevents optimizing sibling
> 
> -fno-optimize-sibling-calls is the default for -O0/-O1/-Og; dunno if we
> want to reiterate it.

Can't hurt.  Here's an updated patch.
 
-- >8 --
I got a complaint that while Clang docs suggest options that improve
the quality of the backtraces ASAN prints (cf.
), our docs
don't say anything to that effect.  This patch amends that with a new
paragraph.  (It deliberately doesn't mention -fno-omit-frame-pointer.)

gcc/ChangeLog:

* doc/invoke.texi (-fsanitize=address): Suggest options to improve
stack traces.
---
 gcc/doc/invoke.texi | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 726392409b6..3f2512ce16a 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -16510,6 +16510,16 @@ The option cannot be combined with 
@option{-fsanitize=thread} or
 @option{-fsanitize=hwaddress}.  Note that the only target
 @option{-fsanitize=hwaddress} is currently supported on is AArch64.
 
+To get more accurate stack traces, it is possible to use options such as
+@option{-O0}, @option{-O1}, or @option{-Og} (which, for instance, prevent
+most function inlining), @option{-fno-optimize-sibling-calls} (which prevents
+optimizing sibling and tail recursive calls; this option is implicit for
+@option{-O0}, @option{-O1}, or @option{-Og}), or @option{-fno-ipa-icf} (which
+disables Identical Code Folding for functions).  Since multiple runs of the
+program may yield backtraces with different addresses due to ASLR (Address
+Space Layout Randomization), it may be desirable to turn ASLR off.  On Linux,
+this can be achieved with @samp{setarch `uname -m` -R ./prog}.
+
 @item -fsanitize=kernel-address
 @opindex fsanitize=kernel-address
 Enable AddressSanitizer for Linux kernel.

base-commit: 3a9f6d5a8ee490adf9a18f93feaf86542642be7d
-- 
2.38.1



Re: [PATCH] expr.cc: avoid unexpected side effects in expand_expr_divmod optimization

2022-12-08 Thread Jose E. Marchesi via Gcc-patches


Hi Jakub.

> On Thu, Dec 08, 2022 at 02:02:36PM +0100, Jose E. Marchesi wrote:
>> So, I guess the right fix would be to call assemble_external_libcall
>> during final?  The `.global FOO' directive would be generated
>> immediately before the call sequence, but I guess that would be ok.
>
> During final only if all the targets can deal with the effects of
> assemble_external_libcall being done in the middle of emitting assembly
> for the function.
>
> Otherwise, it could be e.g. done in the first loop of shorten_branches.
>
> Note, in calls.cc it is done only for emit_library_call_value_1
> and not for emit_call_1, so if we do it late, we need to be able to find
> out what call is to a libcall and what is to a normal call.  If there is
> no way to differentiate it right now, perhaps we need some flag somewhere,
> say on a SYMBOL_REF.  And then assemble_external_libcall either only
> if such a SYMBOL_REF appears in CALL_INSN or sibcall JUMP_INSN, or
> perhaps anywhere in the function and its constant pool.

Allright, the quick-and-dirty patch below seems to DTRT with simple
examples.

First, when libcalls are generated.  Note only one .global is generated
for all calls, and actually it is around the same position than before:

  $ cat foo.c
  int foo(unsigned int len, int flag)
  {
if (flag)
  return (((long)len) * 234 / 5);
return (((long)len) * 2 / 5);
  }
  $ cc1 -O2 foo.c
  $ cat foo.c
.file   "foo.c"
.text
.global __divdi3
.align  3
.global foo
.type   foo, @function
  foo:
mov32   %r1,%r1
lsh %r2,32
jne %r2,0,.L5
mov %r2,5
lsh %r1,1
call__divdi3
lsh %r0,32
arsh%r0,32
exit
  .L5:
mov %r2,5
mul %r1,234
call__divdi3
lsh %r0,32
arsh%r0,32
exit
.size   foo, .-foo
.ident  "GCC: (GNU) 13.0.0 20221207 (experimental)"

Second, when libcalls are tried by expand_moddiv in a sequence, but then
discarded and not linked in the main sequence:

  $ cat foo.c
  int foo(unsigned int len, int flag)
  {
if (flag)
  return (((long)len) * 234 / 5);
return (((long)len) * 2 / 5);
  }
  $ cc1 -O2 foo.c
  $ cat foo.c
.file   "foo.c"
.text
.align  3
.global foo
.type   foo, @function
  foo:
mov32   %r0,%r1
lsh %r2,32
jne %r2,0,.L5
add %r0,%r0
div %r0,5
lsh %r0,32
arsh%r0,32
exit
  .L5:
mul %r0,234
div %r0,5
lsh %r0,32
arsh%r0,32
exit
.size   foo, .-foo
.ident  "GCC: (GNU) 13.0.0 20221207 (experimental)"

Note the .global now is not generated, as desired.

As you can see below, I am adding a new RTX flag `is_libcall', with
written form "/l".

Before I get into serious testing etc, can you please confirm whether
this is the right approach or not?

In particular, I am a little bit concerned about the expectation I am
using that the target of the `call' instruction emitted by emit_call_1
is always a (MEM (SYMBOL_REF ...)) when it is passed a SYMBOL_REF as the
first argument (`fun' in emit_library_call_value_1).

Thanks.

diff --git a/gcc/calls.cc b/gcc/calls.cc
index 6dd6f73e978..6c4a3725272 100644
--- a/gcc/calls.cc
+++ b/gcc/calls.cc
@@ -4370,10 +4370,6 @@ emit_library_call_value_1 (int retval, rtx orgfun, rtx 
value,
|| argvec[i].partial != 0)
   update_stack_alignment_for_call ([i].locate);
 
-  /* If this machine requires an external definition for library
- functions, write one out.  */
-  assemble_external_libcall (fun);
-
   original_args_size = args_size;
   args_size.constant = (aligned_upper_bound (args_size.constant
 + stack_pointer_delta,
@@ -4717,6 +4713,9 @@ emit_library_call_value_1 (int retval, rtx orgfun, rtx 
value,
   valreg,
   old_inhibit_defer_pop + 1, call_fusage, flags, args_so_far);
 
+  /* Mark the emitted call as a libcall with the new flag.  */
+  RTL_LIBCALL_P (last_call_insn ()) = 1;
+
   if (flag_ipa_ra)
 {
   rtx datum = orgfun;
diff --git a/gcc/final.cc b/gcc/final.cc
index eea572238f6..df57de5afd0 100644
--- a/gcc/final.cc
+++ b/gcc/final.cc
@@ -815,6 +815,8 @@ make_pass_compute_alignments (gcc::context *ctxt)
reorg.cc, since the branch splitting exposes new instructions with delay
slots.  */
 
+static rtx call_from_call_insn (rtx_call_insn *insn);
+
 void
 shorten_branches (rtx_insn *first)
 {
@@ -850,6 +852,24 @@ shorten_branches (rtx_insn *first)
   for (insn = get_insns (), i = 1; insn; insn = NEXT_INSN (insn))
 {
   INSN_SHUID (insn) = i++;
+
+  /* If this is a `call' instruction that implements a libcall,
+ and this machine requires an external definition for library
+ functions, write one out.  */
+  if (CALL_P 

Re: [PATCH] Fortran: diagnose and reject duplicate CONTIGUOUS attribute [PR108025]

2022-12-08 Thread Steve Kargl via Gcc-patches
On Thu, Dec 08, 2022 at 10:59:42PM +0100, Harald Anlauf via Fortran wrote:
> Dear all,
> 
> a fairly obvious, or rather trivial fix that appeared while
> analyzing another pr and that can be treated independently:
> reject duplicate CONTIGUOUS attributes.
> 
> (Intel and NAG reject this, Cray warns that this is non-standard.)
> 
> Regtested on x86_64-pc-linux-gnu.  OK for mainline?

Yes, thanks for the patch.

-- 
Steve


Re: [PATCH 1/3] Rework 128-bit complex multiply and divide, PR target/107299

2022-12-08 Thread Michael Meissner via Gcc-patches
On Wed, Dec 07, 2022 at 03:55:41PM +0800, Kewen.Lin wrote:
> Hi Mike,
> 
> on 2022/12/7 14:44, Michael Meissner wrote:
> > On Tue, Dec 06, 2022 at 05:36:54PM +0800, Kewen.Lin wrote:
> >> Hi Mike,
> >>
> >> Thanks for fixing this!
> >>
> >> Could you help to elaborate why we need to disable it during libgcc 
> >> building?
> > 
> > When you are building libgcc, you are building the __mulkc3, __divkc3
> > functions.  The mapping in the compiler interferes with those functions,
> > because at the moment, libgcc uses an alternate IEEE 128-bit type.
> > 
> 
> But I'm still confused.  For __mulkc3 (__divkc3 is similar),
> 
> 1) with -mabi=ieeelongdouble (TARGET_IEEEQUAD true, define 
> __LONG_DOUBLE_IEEE128__),
>the used types are:
> 
>typedef float TFtype __attribute__ ((mode (TF)));
>typedef __complex float TCtype __attribute__ ((mode (TC)));
> 
> 2) with -mabi=ibmlongdouble (TARGET_IEEEQUAD false, not 
> __LONG_DOUBLE_IEEE128__ defined),
>the used types are:
> 
>typedef float TFtype __attribute__ ((mode (KF)));
>typedef __complex float TCtype __attribute__ ((mode (KC)));
> 
> The proposed mapping in the current patch is:
> 
> +
> +  if (id == complex_multiply_builtin_code (KCmode))
> + newname = "__mulkc3";
> +
> +  else if (id == complex_multiply_builtin_code (ICmode))
> + newname = "__multc3";
> +
> +  else if (id == complex_multiply_builtin_code (TCmode))
> + newname = (TARGET_IEEEQUAD) ? "__mulkc3" : "__multc3";
> 
> for 1), TCmode && TARGET_IEEEQUAD => "__mulkc3"
> for 2), KCmode => "__mulkc3"
> 
> Both should be still with name "__mulkc3", do I miss anything?
> 
> BR,
> Kewen

The reason is due to the different internal types, the value range propigation
pass throws an error when we are trying to build libgcc.  This is due to the
underlying problem of different IEEE 128-bit types within the compiler.

The 128-bit IEEE support in libgcc was written before _Float128 was added to
GCC.  One consequence is that you can't get to the complex variant of
__float128.  So libgcc needs to use the attribute mode to get to that type.

But with the support for IEEE 128-bit long double changing things, it makes the
libgcc code use the wrong code.

/home/meissner/fsf-src/work102/libgcc/config/rs6000/_mulkc3.c: In function 
‘__mulkc3_sw’:
/home/meissner/fsf-src/work102/libgcc/config/rs6000/_mulkc3.c:97:1: internal 
compiler error: in fold_stmt, at gimple-range-fold.cc:522
   97 | }
  | ^
0x122784f3 fold_using_range::fold_stmt(vrange&, gimple*, fur_source&, 
tree_node*)
/home/meissner/fsf-src/work102/gcc/gimple-range-fold.cc:522
0x1226477f gimple_ranger::fold_range_internal(vrange&, gimple*, tree_node*)
/home/meissner/fsf-src/work102/gcc/gimple-range.cc:257
0x12264b1f gimple_ranger::range_of_stmt(vrange&, gimple*, tree_node*)
/home/meissner/fsf-src/work102/gcc/gimple-range.cc:318
0x113bdd8b range_query::value_of_stmt(gimple*, tree_node*)
/home/meissner/fsf-src/work102/gcc/value-query.cc:134
0x1134838f rvrp_folder::value_of_stmt(gimple*, tree_node*)
/home/meissner/fsf-src/work102/gcc/tree-vrp.cc:1023
0x111344cf substitute_and_fold_dom_walker::before_dom_children(basic_block_def*)
/home/meissner/fsf-src/work102/gcc/tree-ssa-propagate.cc:819
0x121ecbd3 dom_walker::walk(basic_block_def*)
/home/meissner/fsf-src/work102/gcc/domwalk.cc:311
0x11134ee7 substitute_and_fold_engine::substitute_and_fold(basic_block_def*)
/home/meissner/fsf-src/work102/gcc/tree-ssa-propagate.cc:998
0x11346bb7 execute_ranger_vrp(function*, bool, bool)
/home/meissner/fsf-src/work102/gcc/tree-vrp.cc:1084
0x11347063 execute
/home/meissner/fsf-src/work102/gcc/tree-vrp.cc:1165
Please submit a full bug report, with preprocessed source (by using 
-freport-bug).
Please include the complete backtrace with any bug report.
See  for instructions.
make[1]: *** [/home/meissner/fsf-src/work102/libgcc/shared-object.mk:14: 
_mulkc3.o] Error 1
make[1]: Leaving directory 
'/home/meissner/fsf-build-ppc64le/work102/powerpc64le-unknown-linux-gnu/libgcc'
make: *** [Makefile:20623: all-target-libgcc] Error 2

> > I have a patch for making libgcc use the 'right' type that I haven't 
> > submitted
> > yet.  This is because the more general fix that these 3 patches do impacts 
> > other
> > functions (due to __float128 and _Float128 being different in the current
> > compiler when -mabi=ieeelongdouble).
> > 

The patch is to use _Float128 and _Complex _Float128 in libgcc.h instead of
trying to use attribute((mode(TF))) and attribute((mode(TC))) in libgcc.

Now, this patch fixes the specific problem of not being able to build libgcc
(along with patch #1 of the series).  But other things show the differences
from time time because we are using different internal types and the middle end
doesn't know that these types are really the same bits.

It is better long term (IMHO) if we have the two types (__float128 and
_Float128) 

[PATCH RFA] gimplify: avoid unnecessary copy of init array [PR105838]

2022-12-08 Thread Jason Merrill via Gcc-patches
After the previous patches, I noticed that we were putting the array of
strings into .rodata, but then memcpying it into an automatic array, which
is pointless; we should be able to use it directly.

C++ doesn't allow us to do this for the backing array of an
initializer_list, but should be able to do it for the implementation detail
array we use to construct the backing array.

This doesn't happen automatically because TREE_ADDRESSABLE is set, and
gimplify_init_constructor uses that to decide whether to promote a variable
to static.  Ideally this could use escape analysis to recognize that the
address, though taken, never leaves the function; that should allow
promotion when we're only using the address for indexing within the
function, as in initlist-opt2.C.

But in initlist-opt1.C, we're passing the array address to another function,
so it definitely escapes; it's only safe in this case because it's calling a
standard library function that we know only uses it for indexing.  So, a
flag seems needed.  I first thought to put the flag on the TARGET_EXPR, but
the VAR_DECL seems more appropriate.

Bikeshedding, or other approaches, welcome.

PR c++/105838

gcc/ChangeLog:

* tree.h (DECL_NOT_OBSERVABLE): New.
* tree-core.h (struct tree_decl_common): Mention it.
* gimplify.cc (gimplify_init_constructor): Check it.

gcc/cp/ChangeLog:

* call.cc (maybe_init_list_as_array): Set DECL_NOT_OBSERVABLE.

gcc/testsuite/ChangeLog:

* g++.dg/tree-ssa/initlist-opt1.C: Check for static array.
* g++.dg/tree-ssa/initlist-opt2.C: Likewise.
---
 gcc/tree-core.h   | 3 ++-
 gcc/tree.h| 5 +
 gcc/cp/call.cc| 4 +++-
 gcc/gimplify.cc   | 3 ++-
 gcc/testsuite/g++.dg/tree-ssa/initlist-opt1.C | 1 +
 gcc/testsuite/g++.dg/tree-ssa/initlist-opt2.C | 1 +
 6 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/gcc/tree-core.h b/gcc/tree-core.h
index e146b133dbd..c0d63632c1e 100644
--- a/gcc/tree-core.h
+++ b/gcc/tree-core.h
@@ -1808,7 +1808,8 @@ struct GTY(()) tree_decl_common {
  In VAR_DECL, PARM_DECL and RESULT_DECL, this is
  DECL_HAS_VALUE_EXPR_P.  */
   unsigned decl_flag_2 : 1;
-  /* In FIELD_DECL, this is DECL_PADDING_P.  */
+  /* In FIELD_DECL, this is DECL_PADDING_P.
+ In VAR_DECL, this is DECL_NOT_OBSERVABLE.  */
   unsigned decl_flag_3 : 1;
   /* Logically, these two would go in a theoretical base shared by var and
  parm decl. */
diff --git a/gcc/tree.h b/gcc/tree.h
index 23223ca0c87..7ba88fd16db 100644
--- a/gcc/tree.h
+++ b/gcc/tree.h
@@ -3221,6 +3221,11 @@ extern void decl_fini_priority_insert (tree, 
priority_type);
 #define DECL_NONALIASED(NODE) \
   (VAR_DECL_CHECK (NODE)->base.nothrow_flag)
 
+/* In a VAR_DECL, nonzero if this variable is not observable by user code (and
+   so e.g. it can be promoted to static even if it's addressable).  */
+#define DECL_NOT_OBSERVABLE(NODE) \
+  (VAR_DECL_CHECK (NODE)->decl_common.decl_flag_3)
+
 /* This field is used to reference anything in decl.result and is meant only
for use by the garbage collector.  */
 #define DECL_RESULT_FLD(NODE) \
diff --git a/gcc/cp/call.cc b/gcc/cp/call.cc
index 14aa96dd328..a9052c64265 100644
--- a/gcc/cp/call.cc
+++ b/gcc/cp/call.cc
@@ -4247,7 +4247,9 @@ maybe_init_list_as_array (tree elttype, tree init)
 
   init_elttype = cp_build_qualified_type (init_elttype, TYPE_QUAL_CONST);
   tree arr = build_array_of_n_type (init_elttype, CONSTRUCTOR_NELTS (init));
-  return finish_compound_literal (arr, init, tf_none);
+  arr = finish_compound_literal (arr, init, tf_none);
+  DECL_NOT_OBSERVABLE (TARGET_EXPR_SLOT (arr)) = true;
+  return arr;
 }
 
 /* If we were going to call e.g. vector(initializer_list) starting
diff --git a/gcc/gimplify.cc b/gcc/gimplify.cc
index 250782b1140..87c913c48c5 100644
--- a/gcc/gimplify.cc
+++ b/gcc/gimplify.cc
@@ -5234,7 +5234,8 @@ gimplify_init_constructor (tree *expr_p, gimple_seq 
*pre_p, gimple_seq *post_p,
&& TREE_READONLY (object)
&& VAR_P (object)
&& !DECL_REGISTER (object)
-   && (flag_merge_constants >= 2 || !TREE_ADDRESSABLE (object))
+   && (flag_merge_constants >= 2 || !TREE_ADDRESSABLE (object)
+   || DECL_NOT_OBSERVABLE (object))
/* For ctors that have many repeated nonzero elements
   represented through RANGE_EXPRs, prefer initializing
   those through runtime loops over copies of large amounts
diff --git a/gcc/testsuite/g++.dg/tree-ssa/initlist-opt1.C 
b/gcc/testsuite/g++.dg/tree-ssa/initlist-opt1.C
index 053317b59d8..b1d2d25faf4 100644
--- a/gcc/testsuite/g++.dg/tree-ssa/initlist-opt1.C
+++ b/gcc/testsuite/g++.dg/tree-ssa/initlist-opt1.C
@@ -4,6 +4,7 @@
 
 // Test that we do range-initialization from const char *.
 // { dg-final { scan-tree-dump {_M_range_initialize} 
"gimple" } }
+// { dg-final { 

[PATCH] Fortran: diagnose and reject duplicate CONTIGUOUS attribute [PR108025]

2022-12-08 Thread Harald Anlauf via Gcc-patches
Dear all,

a fairly obvious, or rather trivial fix that appeared while
analyzing another pr and that can be treated independently:
reject duplicate CONTIGUOUS attributes.

(Intel and NAG reject this, Cray warns that this is non-standard.)

Regtested on x86_64-pc-linux-gnu.  OK for mainline?

Thanks,
Harald

From 3a9f6d5a8ee490adf9a18f93feaf86542642be7d Mon Sep 17 00:00:00 2001
From: Harald Anlauf 
Date: Thu, 8 Dec 2022 22:50:45 +0100
Subject: [PATCH] Fortran: diagnose and reject duplicate CONTIGUOUS attribute
 [PR108025]

gcc/fortran/ChangeLog:

	PR fortran/108025
	* symbol.cc (gfc_add_contiguous): Diagnose and reject duplicate
	CONTIGUOUS attribute.

gcc/testsuite/ChangeLog:

	PR fortran/108025
	* gfortran.dg/contiguous_12.f90: New test.
---
 gcc/fortran/symbol.cc   | 6 ++
 gcc/testsuite/gfortran.dg/contiguous_12.f90 | 7 +++
 2 files changed, 13 insertions(+)
 create mode 100644 gcc/testsuite/gfortran.dg/contiguous_12.f90

diff --git a/gcc/fortran/symbol.cc b/gcc/fortran/symbol.cc
index 49fb37864bd..e704e7ac2bd 100644
--- a/gcc/fortran/symbol.cc
+++ b/gcc/fortran/symbol.cc
@@ -1108,6 +1108,12 @@ gfc_add_contiguous (symbol_attribute *attr, const char *name, locus *where)
   if (check_used (attr, name, where))
 return false;

+  if (attr->contiguous)
+{
+  duplicate_attr ("CONTIGUOUS", where);
+  return false;
+}
+
   attr->contiguous = 1;
   return gfc_check_conflict (attr, name, where);
 }
diff --git a/gcc/testsuite/gfortran.dg/contiguous_12.f90 b/gcc/testsuite/gfortran.dg/contiguous_12.f90
new file mode 100644
index 000..9c477a7a06a
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/contiguous_12.f90
@@ -0,0 +1,7 @@
+! { dg-do compile }
+! PR fortran/108025
+
+subroutine foo (x)
+  real, contiguous :: x(:)
+  contiguous   :: x! { dg-error "Duplicate CONTIGUOUS attribute" }
+end
--
2.35.3



Re: [PATCH] RISC-V: Produce better code with complex constants [PR95632] [PR106602]

2022-12-08 Thread Palmer Dabbelt

On Thu, 08 Dec 2022 10:15:47 PST (-0800), gcc-patches@gcc.gnu.org wrote:



On 12/8/22 10:53, Palmer Dabbelt wrote:

On Wed, 07 Dec 2022 12:55:17 PST (-0800), rzin...@ventanamicro.com wrote:

Due to RISC-V limitations on operations with big constants combine
is failing to match such operations and is not being able to
produce optimal code as it keeps splitting them. By pretending we
can do those operations we can get more opportunities for
simplification of surrounding instructions.


I saw Jeff's comments.  This is always the kind of thing that worries
me: we're essentially lying to the optimizer in order to trick it into
generating better code, which might just make it generate worse code.
It's always easy to see a small example that improves, but those could
be wiped out by secondary effects in real code.  So I'd usually want to
have some benchmarking for a patch like this.

That said, if this is just the standard way of doing things then maybe
it's just fine?

Bridge combiner patterns are pretty standard.  The insn's condition of
cse_not_expected is also in there to minimize the potential for
surprises by not exposing this too early.


OK, I'm fine with this, then -- aside from the fairly minor issues 
pointed out.


Re: [PATCH] bpf: add define_insn for bswap

2022-12-08 Thread Jose E. Marchesi via Gcc-patches


Hi David.

> The eBPF architecture provides 'end[be,le]' instructions for endianness
> swapping. Add a define_insn for bswap2 to use them instaed of
> falling back on a libcall.
>
> Tested on bpf-unknown-none, no known regressions.
>
> OK to commit?
> Thanks

OK for master.
Thanks!

> gcc/
>
>   * config/bpf/bpf.md (bswap2): New define_insn.
>
> gcc/testsuite/
>
>   * gcc.target/bpf/bswap-1.c: New test.
> ---
>  gcc/config/bpf/bpf.md  | 17 +
>  gcc/testsuite/gcc.target/bpf/bswap-1.c | 23 +++
>  2 files changed, 40 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/bpf/bswap-1.c
>
> diff --git a/gcc/config/bpf/bpf.md b/gcc/config/bpf/bpf.md
> index a28021aef26..22a133f1c79 100644
> --- a/gcc/config/bpf/bpf.md
> +++ b/gcc/config/bpf/bpf.md
> @@ -341,6 +341,23 @@ (define_insn "lshr3"
>"rsh\t%0,%2"
>[(set_attr "type" "")])
>  
> + Endianness conversion
> +
> +(define_mode_iterator BSM [HI SI DI])
> +(define_mode_attr endmode [(HI "16") (SI "32") (DI "64")])
> +
> +(define_insn "bswap2"
> +  [(set (match_operand:BSM 0 "register_operand""=r")
> +(bswap:BSM (match_operand:BSM 1 "register_operand" " r")))]
> +  ""
> +{
> +  if (TARGET_BIG_ENDIAN)
> +return "endle\t%0, ";
> +  else
> +return "endbe\t%0, ";
> +}
> +  [(set_attr "type" "end")])
> +
>   Conditional branches
>  
>  ;; The eBPF jump instructions use 64-bit arithmetic when evaluating
> diff --git a/gcc/testsuite/gcc.target/bpf/bswap-1.c 
> b/gcc/testsuite/gcc.target/bpf/bswap-1.c
> new file mode 100644
> index 000..4748143ada5
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/bpf/bswap-1.c
> @@ -0,0 +1,23 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mlittle-endian" } */
> +
> +unsigned short in16 = 0x1234U;
> +unsigned int   in32 = 0x12345678U;
> +unsigned long  in64 = 0x123456789abcdef0ULL;
> +
> +unsigned short out16 = 0;
> +unsigned int   out32 = 0;
> +unsigned long  out64 = 0;
> +
> +int foo (void)
> +{
> +  out16 = __builtin_bswap16 (in16);
> +  out32 = __builtin_bswap32 (in32);
> +  out64 = __builtin_bswap64 (in64);
> +
> +  return 0;
> +}
> +
> +/* { dg-final { scan-assembler "endbe\t%r., 16" } } */
> +/* { dg-final { scan-assembler "endbe\t%r., 32" } } */
> +/* { dg-final { scan-assembler "endbe\t%r., 64" } } */


[pushed] c++: build initializer_list in a loop [PR105838]

2022-12-08 Thread Jason Merrill via Gcc-patches
Tested x86_64-pc-linux-gnu, applying to trunk.

-- 8< --

The previous patch avoided building an initializer_list at all when
building a vector, but in situations where that isn't possible, we
could still build the initializer_list with a loop over a constant array.

This is represented using a VEC_INIT_EXPR, which required adjusting a couple
of places that expected the initializer array to have the same type as the
target array and fixing build_vec_init not to undo our efforts.

PR c++/105838

gcc/cp/ChangeLog:

* call.cc (convert_like_internal) [ck_list]: Use
maybe_init_list_as_array.
* constexpr.cc (cxx_eval_vec_init_1): Init might have
a different type.
* tree.cc (build_vec_init_elt): Likewise.
* init.cc (build_vec_init): Handle from_array from a
TARGET_EXPR.  Retain TARGET_EXPR of a different type.

gcc/testsuite/ChangeLog:

* g++.dg/tree-ssa/initlist-opt2.C: New test.
---
 gcc/cp/call.cc| 11 -
 gcc/cp/constexpr.cc   |  6 ++---
 gcc/cp/init.cc| 13 --
 gcc/cp/tree.cc|  2 --
 gcc/testsuite/g++.dg/tree-ssa/initlist-opt2.C | 24 +++
 5 files changed, 48 insertions(+), 8 deletions(-)
 create mode 100644 gcc/testsuite/g++.dg/tree-ssa/initlist-opt2.C

diff --git a/gcc/cp/call.cc b/gcc/cp/call.cc
index 33b5e7f87f5..14aa96dd328 100644
--- a/gcc/cp/call.cc
+++ b/gcc/cp/call.cc
@@ -8501,7 +8501,16 @@ convert_like_internal (conversion *convs, tree expr, 
tree fn, int argnum,
unsigned len = CONSTRUCTOR_NELTS (expr);
tree array;
 
-   if (len)
+   if (tree init = maybe_init_list_as_array (elttype, expr))
+ {
+   elttype = cp_build_qualified_type
+ (elttype, cp_type_quals (elttype) | TYPE_QUAL_CONST);
+   array = build_array_of_n_type (elttype, len);
+   array = build_vec_init_expr (array, init, complain);
+   array = get_target_expr (array);
+   array = cp_build_addr_expr (array, complain);
+ }
+   else if (len)
  {
tree val; unsigned ix;
 
diff --git a/gcc/cp/constexpr.cc b/gcc/cp/constexpr.cc
index e43d92864f5..3f7892aa88a 100644
--- a/gcc/cp/constexpr.cc
+++ b/gcc/cp/constexpr.cc
@@ -5255,12 +5255,12 @@ cxx_eval_vec_init_1 (const constexpr_ctx *ctx, tree 
atype, tree init,
   else
{
  /* Copying an element.  */
- gcc_assert (same_type_ignoring_top_level_qualifiers_p
- (atype, TREE_TYPE (init)));
  eltinit = cp_build_array_ref (input_location, init, idx, complain);
  if (!lvalue_p (init))
eltinit = move (eltinit);
- eltinit = force_rvalue (eltinit, complain);
+ eltinit = (perform_implicit_conversion_flags
+(elttype, eltinit, complain,
+ LOOKUP_IMPLICIT|LOOKUP_NO_NARROWING));
  eltinit = cxx_eval_constant_expression (_ctx, eltinit, lval,
  non_constant_p, overflow_p);
}
diff --git a/gcc/cp/init.cc b/gcc/cp/init.cc
index 428fac5621c..1ccde7cf8ad 100644
--- a/gcc/cp/init.cc
+++ b/gcc/cp/init.cc
@@ -4420,7 +4420,9 @@ build_vec_init (tree base, tree maxindex, tree init,
   /* Look through the TARGET_EXPR around a compound literal.  */
   if (init && TREE_CODE (init) == TARGET_EXPR
   && TREE_CODE (TARGET_EXPR_INITIAL (init)) == CONSTRUCTOR
-  && from_array != 2)
+  && from_array != 2
+  && (same_type_ignoring_top_level_qualifiers_p
+ (TREE_TYPE (init), atype)))
 init = TARGET_EXPR_INITIAL (init);
 
   if (tree vi = get_vec_init_expr (init))
@@ -4546,7 +4548,14 @@ build_vec_init (tree base, tree maxindex, tree init,
 {
   if (lvalue_kind (init) & clk_rvalueref)
xvalue = true;
-  base2 = decay_conversion (init, complain);
+  if (TREE_CODE (init) == TARGET_EXPR)
+   {
+ /* Avoid error in decay_conversion.  */
+ base2 = decay_conversion (TARGET_EXPR_SLOT (init), complain);
+ base2 = cp_build_compound_expr (init, base2, tf_none);
+   }
+  else
+   base2 = decay_conversion (init, complain);
   if (base2 == error_mark_node)
return error_mark_node;
   itype = TREE_TYPE (base2);
diff --git a/gcc/cp/tree.cc b/gcc/cp/tree.cc
index 33bde16f128..a600178239c 100644
--- a/gcc/cp/tree.cc
+++ b/gcc/cp/tree.cc
@@ -785,8 +785,6 @@ build_vec_init_elt (tree type, tree init, tsubst_flags_t 
complain)
   releasing_vec argvec;
   if (init && !BRACE_ENCLOSED_INITIALIZER_P (init))
 {
-  gcc_assert (same_type_ignoring_top_level_qualifiers_p
- (type, TREE_TYPE (init)));
   tree init_type = strip_array_types (TREE_TYPE (init));
   tree dummy = build_dummy_object (init_type);
   if (!lvalue_p (init))
diff --git a/gcc/testsuite/g++.dg/tree-ssa/initlist-opt2.C 

[pushed] c++: avoid initializer_list [PR105838]

2022-12-08 Thread Jason Merrill via Gcc-patches
Tested x86_64-pc-linux-gnu, applying to trunk.

-- 8< --

When constructing a vector from { "strings" }, first is built an
initializer_list, which is then copied into the strings in the
vector.  But this is inefficient: better would be treat the { "strings" }
as a range and construct the strings in the vector directly from the
string-literals.  We can do this transformation for standard library
classes because we know the design patterns they follow.

PR c++/105838

gcc/cp/ChangeLog:

* call.cc (list_ctor_element_type): New.
(braced_init_element_type): New.
(has_non_trivial_temporaries): New.
(maybe_init_list_as_array): New.
(maybe_init_list_as_range): New.
(build_user_type_conversion_1): Use maybe_init_list_as_range.
* parser.cc (cp_parser_braced_list): Call
recompute_constructor_flags.
* cp-tree.h (find_temps_r): Declare.

gcc/testsuite/ChangeLog:

* g++.dg/tree-ssa/initlist-opt1.C: New test.
---
 gcc/cp/cp-tree.h  |   1 +
 gcc/cp/call.cc| 138 ++
 gcc/cp/parser.cc  |   1 +
 gcc/testsuite/g++.dg/tree-ssa/initlist-opt1.C |  25 
 4 files changed, 165 insertions(+)
 create mode 100644 gcc/testsuite/g++.dg/tree-ssa/initlist-opt1.C

diff --git a/gcc/cp/cp-tree.h b/gcc/cp/cp-tree.h
index 581ac2b1817..0d6c234b3b0 100644
--- a/gcc/cp/cp-tree.h
+++ b/gcc/cp/cp-tree.h
@@ -7087,6 +7087,7 @@ extern void set_global_friend (tree);
 extern bool is_global_friend   (tree);
 
 /* in init.cc */
+extern tree find_temps_r   (tree *, int *, void *);
 extern tree expand_member_init (tree);
 extern void emit_mem_initializers  (tree);
 extern tree build_aggr_init(tree, tree, int,
diff --git a/gcc/cp/call.cc b/gcc/cp/call.cc
index 459e86b5f09..33b5e7f87f5 100644
--- a/gcc/cp/call.cc
+++ b/gcc/cp/call.cc
@@ -4154,6 +4154,134 @@ add_list_candidates (tree fns, tree first_arg,
  access_path, flags, candidates, complain);
 }
 
+/* Given C(std::initializer_list), return A.  */
+
+static tree
+list_ctor_element_type (tree fn)
+{
+  gcc_checking_assert (is_list_ctor (fn));
+
+  tree parm = FUNCTION_FIRST_USER_PARMTYPE (fn);
+  parm = non_reference (TREE_VALUE (parm));
+  return TREE_VEC_ELT (CLASSTYPE_TI_ARGS (parm), 0);
+}
+
+/* If EXPR is a braced-init-list where the elements all decay to the same type,
+   return that type.  */
+
+static tree
+braced_init_element_type (tree expr)
+{
+  if (TREE_CODE (expr) == CONSTRUCTOR
+  && TREE_CODE (TREE_TYPE (expr)) == ARRAY_TYPE)
+return TREE_TYPE (TREE_TYPE (expr));
+  if (!BRACE_ENCLOSED_INITIALIZER_P (expr))
+return NULL_TREE;
+
+  tree elttype = NULL_TREE;
+  for (constructor_elt : CONSTRUCTOR_ELTS (expr))
+{
+  tree type = TREE_TYPE (e.value);
+  type = type_decays_to (type);
+  if (!elttype)
+   elttype = type;
+  else if (!same_type_p (type, elttype))
+   return NULL_TREE;
+}
+  return elttype;
+}
+
+/* True iff EXPR contains any temporaries with non-trivial destruction.
+
+   ??? Also ignore classes with non-trivial but no-op destruction other than
+   std::allocator?  */
+
+static bool
+has_non_trivial_temporaries (tree expr)
+{
+  auto_vec temps;
+  cp_walk_tree_without_duplicates (, find_temps_r, );
+  for (tree *p : temps)
+{
+  tree t = TREE_TYPE (*p);
+  if (!TYPE_HAS_TRIVIAL_DESTRUCTOR (t)
+ && !is_std_allocator (t))
+   return true;
+}
+  return false;
+}
+
+/* We're initializing an array of ELTTYPE from INIT.  If it seems useful,
+   return INIT as an array (of its own type) so the caller can initialize the
+   target array in a loop.  */
+
+static tree
+maybe_init_list_as_array (tree elttype, tree init)
+{
+  /* Only do this if the array can go in rodata but not once converted.  */
+  if (!CLASS_TYPE_P (elttype))
+return NULL_TREE;
+  tree init_elttype = braced_init_element_type (init);
+  if (!init_elttype || !SCALAR_TYPE_P (init_elttype) || !TREE_CONSTANT (init))
+return NULL_TREE;
+
+  tree first = CONSTRUCTOR_ELT (init, 0)->value;
+  if (TREE_CODE (init_elttype) == INTEGER_TYPE && null_ptr_cst_p (first))
+/* Avoid confusion from treating 0 as a null pointer constant.  */
+first = build1 (UNARY_PLUS_EXPR, init_elttype, first);
+  first = (perform_implicit_conversion_flags
+  (elttype, first, tf_none, LOOKUP_IMPLICIT|LOOKUP_NO_NARROWING));
+  if (first == error_mark_node)
+/* Let the normal code give the error.  */
+return NULL_TREE;
+
+  /* Don't do this if the conversion would be constant.  */
+  first = maybe_constant_init (first);
+  if (TREE_CONSTANT (first))
+return NULL_TREE;
+
+  /* We can't do this if the conversion creates temporaries that need
+ to live until the whole array is initialized.  */
+  if (has_non_trivial_temporaries (first))
+

[PATCH] bpf: add define_insn for bswap

2022-12-08 Thread David Faust via Gcc-patches
The eBPF architecture provides 'end[be,le]' instructions for endianness
swapping. Add a define_insn for bswap2 to use them instaed of
falling back on a libcall.

Tested on bpf-unknown-none, no known regressions.

OK to commit?
Thanks

gcc/

* config/bpf/bpf.md (bswap2): New define_insn.

gcc/testsuite/

* gcc.target/bpf/bswap-1.c: New test.
---
 gcc/config/bpf/bpf.md  | 17 +
 gcc/testsuite/gcc.target/bpf/bswap-1.c | 23 +++
 2 files changed, 40 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/bpf/bswap-1.c

diff --git a/gcc/config/bpf/bpf.md b/gcc/config/bpf/bpf.md
index a28021aef26..22a133f1c79 100644
--- a/gcc/config/bpf/bpf.md
+++ b/gcc/config/bpf/bpf.md
@@ -341,6 +341,23 @@ (define_insn "lshr3"
   "rsh\t%0,%2"
   [(set_attr "type" "")])
 
+ Endianness conversion
+
+(define_mode_iterator BSM [HI SI DI])
+(define_mode_attr endmode [(HI "16") (SI "32") (DI "64")])
+
+(define_insn "bswap2"
+  [(set (match_operand:BSM 0 "register_operand""=r")
+(bswap:BSM (match_operand:BSM 1 "register_operand" " r")))]
+  ""
+{
+  if (TARGET_BIG_ENDIAN)
+return "endle\t%0, ";
+  else
+return "endbe\t%0, ";
+}
+  [(set_attr "type" "end")])
+
  Conditional branches
 
 ;; The eBPF jump instructions use 64-bit arithmetic when evaluating
diff --git a/gcc/testsuite/gcc.target/bpf/bswap-1.c 
b/gcc/testsuite/gcc.target/bpf/bswap-1.c
new file mode 100644
index 000..4748143ada5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/bpf/bswap-1.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-mlittle-endian" } */
+
+unsigned short in16 = 0x1234U;
+unsigned int   in32 = 0x12345678U;
+unsigned long  in64 = 0x123456789abcdef0ULL;
+
+unsigned short out16 = 0;
+unsigned int   out32 = 0;
+unsigned long  out64 = 0;
+
+int foo (void)
+{
+  out16 = __builtin_bswap16 (in16);
+  out32 = __builtin_bswap32 (in32);
+  out64 = __builtin_bswap64 (in64);
+
+  return 0;
+}
+
+/* { dg-final { scan-assembler "endbe\t%r., 16" } } */
+/* { dg-final { scan-assembler "endbe\t%r., 32" } } */
+/* { dg-final { scan-assembler "endbe\t%r., 64" } } */
-- 
2.38.1



[pushed] c++: fewer allocator temps [PR105838]

2022-12-08 Thread Jason Merrill via Gcc-patches
Tested x86_64-pc-linux-gnu, applying to trunk.

-- 8< --

In this PR, initializing the array of std::string to pass to the vector
initializer_list constructor gets very confusing to the optimizers as the
number of elements increases, primarily because of all the std::allocator
temporaries passed to all the string constructors.  Instead of creating one
for each string, let's share an allocator between all the strings; we can do
this safely because we know that std::allocator is stateless and that string
doesn't care about the object identity of its allocator parameter.

PR c++/105838

gcc/cp/ChangeLog:

* cp-tree.h (is_std_allocator): Declare.
* constexpr.cc (is_std_allocator): Split out  from...
(is_std_allocator_allocate): ...here.
* init.cc (find_temps_r): New.
(find_allocator_temp): New.
(build_vec_init): Use it.

gcc/testsuite/ChangeLog:

* g++.dg/tree-ssa/allocator-opt1.C: New test.
---
 gcc/cp/cp-tree.h  |  1 +
 gcc/cp/constexpr.cc   | 27 +
 gcc/cp/init.cc| 59 ++-
 .../g++.dg/tree-ssa/allocator-opt1.C  | 12 
 4 files changed, 88 insertions(+), 11 deletions(-)
 create mode 100644 gcc/testsuite/g++.dg/tree-ssa/allocator-opt1.C

diff --git a/gcc/cp/cp-tree.h b/gcc/cp/cp-tree.h
index addd26ea077..581ac2b1817 100644
--- a/gcc/cp/cp-tree.h
+++ b/gcc/cp/cp-tree.h
@@ -8472,6 +8472,7 @@ extern bool is_rvalue_constant_expression (tree);
 extern bool is_nondependent_constant_expression (tree);
 extern bool is_nondependent_static_init_expression (tree);
 extern bool is_static_init_expression(tree);
+extern bool is_std_allocator (tree);
 extern bool potential_rvalue_constant_expression (tree);
 extern bool require_potential_constant_expression (tree);
 extern bool require_constant_expression (tree);
diff --git a/gcc/cp/constexpr.cc b/gcc/cp/constexpr.cc
index 23a27a962de..e43d92864f5 100644
--- a/gcc/cp/constexpr.cc
+++ b/gcc/cp/constexpr.cc
@@ -2214,6 +2214,22 @@ is_std_construct_at (const constexpr_call *call)
  && is_std_construct_at (call->fundef->decl));
 }
 
+/* True if CTX is an instance of std::allocator.  */
+
+bool
+is_std_allocator (tree ctx)
+{
+  if (ctx == NULL_TREE || !CLASS_TYPE_P (ctx) || !TYPE_MAIN_DECL (ctx))
+return false;
+
+  tree decl = TYPE_MAIN_DECL (ctx);
+  tree name = DECL_NAME (decl);
+  if (name == NULL_TREE || !id_equal (name, "allocator"))
+return false;
+
+  return decl_in_std_namespace_p (decl);
+}
+
 /* Return true if FNDECL is std::allocator::{,de}allocate.  */
 
 static inline bool
@@ -2224,16 +2240,7 @@ is_std_allocator_allocate (tree fndecl)
   || !(id_equal (name, "allocate") || id_equal (name, "deallocate")))
 return false;
 
-  tree ctx = DECL_CONTEXT (fndecl);
-  if (ctx == NULL_TREE || !CLASS_TYPE_P (ctx) || !TYPE_MAIN_DECL (ctx))
-return false;
-
-  tree decl = TYPE_MAIN_DECL (ctx);
-  name = DECL_NAME (decl);
-  if (name == NULL_TREE || !id_equal (name, "allocator"))
-return false;
-
-  return decl_in_std_namespace_p (decl);
+  return is_std_allocator (DECL_CONTEXT (fndecl));
 }
 
 /* Overload for the above taking constexpr_call*.  */
diff --git a/gcc/cp/init.cc b/gcc/cp/init.cc
index 2fff4ad2dc7..428fac5621c 100644
--- a/gcc/cp/init.cc
+++ b/gcc/cp/init.cc
@@ -4308,6 +4308,51 @@ finish_length_check (tree atype, tree iterator, tree 
obase, unsigned n)
 }
 }
 
+/* walk_tree callback to collect temporaries in an expression.  */
+
+tree
+find_temps_r (tree *tp, int *walk_subtrees, void *data)
+{
+  vec  = *static_cast *>(data);
+  tree t = *tp;
+  if (TREE_CODE (t) == TARGET_EXPR
+  && !TARGET_EXPR_ELIDING_P (t))
+temps.safe_push (tp);
+  else if (TYPE_P (t))
+*walk_subtrees = 0;
+
+  return NULL_TREE;
+}
+
+/* If INIT initializes a standard library class, and involves a temporary
+   std::allocator, return a pointer to the temp.
+
+   Used by build_vec_init when initializing an array of e.g. strings to reuse
+   the same temporary allocator for all of the strings.  We can do this because
+   std::allocator has no data and the standard library doesn't care about the
+   address of allocator objects.
+
+   ??? Add an attribute to allow users to assert the same property for other
+   classes, i.e. one object of the type is interchangeable with any other?  */
+
+static tree*
+find_allocator_temp (tree init)
+{
+  if (TREE_CODE (init) == EXPR_STMT)
+init = EXPR_STMT_EXPR (init);
+  if (TREE_CODE (init) == CONVERT_EXPR)
+init = TREE_OPERAND (init, 0);
+  tree type = TREE_TYPE (init);
+  if (!CLASS_TYPE_P (type) || !decl_in_std_namespace_p (TYPE_NAME (type)))
+return NULL;
+  auto_vec temps;
+  cp_walk_tree_without_duplicates (, find_temps_r, );
+  for (tree *p : temps)
+if (is_std_allocator (TREE_TYPE (*p)))
+  return p;
+  return NULL;
+}
+
 /* `build_vec_init' returns tree structure that performs
initialization of a 

Re: [PATCH RFA(tree)] c++: source position of lambda captures [PR84471]

2022-12-08 Thread Jason Merrill via Gcc-patches

Ping.

On 12/2/22 10:45, Jason Merrill wrote:

Tested x86_64-pc-linux-gnu, OK for trunk?

-- 8< --

If the DECL_VALUE_EXPR of a VAR_DECL has EXPR_LOCATION set, then any use of
that variable looks like it has that location, which leads to the debugger
jumping back and forth for both lambdas and structured bindings.

Rather than fix all the uses, it seems simplest to remove any EXPR_LOCATION
when setting DECL_VALUE_EXPR.  So the cp/ hunks aren't necessary, but it
seems cleaner not to work to add a location that will immediately get
stripped.

PR c++/84471
PR c++/107504

gcc/cp/ChangeLog:

* coroutines.cc (transform_local_var_uses): Don't
specify a location for DECL_VALUE_EXPR.
* decl.cc (cp_finish_decomp): Likewise.

gcc/ChangeLog:

* tree.cc (decl_value_expr_insert): Clear EXPR_LOCATION.

gcc/testsuite/ChangeLog:

* g++.dg/tree-ssa/value-expr1.C: New test.
* g++.dg/tree-ssa/value-expr2.C: New test.
* g++.dg/analyzer/pr93212.C: Move warning.
---
  gcc/cp/coroutines.cc|  4 ++--
  gcc/cp/decl.cc  | 12 +++---
  gcc/testsuite/g++.dg/analyzer/pr93212.C |  4 ++--
  gcc/testsuite/g++.dg/tree-ssa/value-expr1.C | 16 +
  gcc/testsuite/g++.dg/tree-ssa/value-expr2.C | 26 +
  gcc/tree.cc |  3 +++
  6 files changed, 52 insertions(+), 13 deletions(-)
  create mode 100644 gcc/testsuite/g++.dg/tree-ssa/value-expr1.C
  create mode 100644 gcc/testsuite/g++.dg/tree-ssa/value-expr2.C

diff --git a/gcc/cp/coroutines.cc b/gcc/cp/coroutines.cc
index 01a3e831ee5..a72bd6bbef0 100644
--- a/gcc/cp/coroutines.cc
+++ b/gcc/cp/coroutines.cc
@@ -2047,8 +2047,8 @@ transform_local_var_uses (tree *stmt, int *do_subtree, 
void *d)
= lookup_member (lvd->coro_frame_type, local_var.field_id,
 /*protect=*/1, /*want_type=*/0,
 tf_warning_or_error);
- tree fld_idx = build3_loc (lvd->loc, COMPONENT_REF, TREE_TYPE (lvar),
-lvd->actor_frame, fld_ref, NULL_TREE);
+ tree fld_idx = build3 (COMPONENT_REF, TREE_TYPE (lvar),
+lvd->actor_frame, fld_ref, NULL_TREE);
  local_var.field_idx = fld_idx;
  SET_DECL_VALUE_EXPR (lvar, fld_idx);
  DECL_HAS_VALUE_EXPR_P (lvar) = true;
diff --git a/gcc/cp/decl.cc b/gcc/cp/decl.cc
index 7af0b05d5f8..59e21581503 100644
--- a/gcc/cp/decl.cc
+++ b/gcc/cp/decl.cc
@@ -9133,9 +9133,7 @@ cp_finish_decomp (tree decl, tree first, unsigned int 
count)
  if (processing_template_decl)
continue;
  tree t = unshare_expr (dexp);
- t = build4_loc (DECL_SOURCE_LOCATION (v[i]), ARRAY_REF,
- eltype, t, size_int (i), NULL_TREE,
- NULL_TREE);
+ t = build4 (ARRAY_REF, eltype, t, size_int (i), NULL_TREE, NULL_TREE);
  SET_DECL_VALUE_EXPR (v[i], t);
  DECL_HAS_VALUE_EXPR_P (v[i]) = 1;
}
@@ -9154,9 +9152,7 @@ cp_finish_decomp (tree decl, tree first, unsigned int 
count)
  if (processing_template_decl)
continue;
  tree t = unshare_expr (dexp);
- t = build1_loc (DECL_SOURCE_LOCATION (v[i]),
- i ? IMAGPART_EXPR : REALPART_EXPR, eltype,
- t);
+ t = build1 (i ? IMAGPART_EXPR : REALPART_EXPR, eltype, t);
  SET_DECL_VALUE_EXPR (v[i], t);
  DECL_HAS_VALUE_EXPR_P (v[i]) = 1;
}
@@ -9180,9 +9176,7 @@ cp_finish_decomp (tree decl, tree first, unsigned int 
count)
  tree t = unshare_expr (dexp);
  convert_vector_to_array_for_subscript (DECL_SOURCE_LOCATION (v[i]),
 , size_int (i));
- t = build4_loc (DECL_SOURCE_LOCATION (v[i]), ARRAY_REF,
- eltype, t, size_int (i), NULL_TREE,
- NULL_TREE);
+ t = build4 (ARRAY_REF, eltype, t, size_int (i), NULL_TREE, NULL_TREE);
  SET_DECL_VALUE_EXPR (v[i], t);
  DECL_HAS_VALUE_EXPR_P (v[i]) = 1;
}
diff --git a/gcc/testsuite/g++.dg/analyzer/pr93212.C 
b/gcc/testsuite/g++.dg/analyzer/pr93212.C
index 41507e2b837..1029e8d547b 100644
--- a/gcc/testsuite/g++.dg/analyzer/pr93212.C
+++ b/gcc/testsuite/g++.dg/analyzer/pr93212.C
@@ -4,8 +4,8 @@
  auto lol()
  {
  int aha = 3;
-return [] { // { dg-warning "dereferencing pointer '.*' to within stale 
stack frame" }
-return aha;
+return [] {
+return aha; // { dg-warning "dereferencing pointer '.*' to within stale 
stack frame" }
  };
  /* TODO: may be worth special-casing the reporting of dangling
 references from lambdas, to highlight the declaration, and maybe fix
diff --git a/gcc/testsuite/g++.dg/tree-ssa/value-expr1.C 
b/gcc/testsuite/g++.dg/tree-ssa/value-expr1.C
new file mode 100644
index 

Re: [PATCH] RISC-V: Produce better code with complex constants [PR95632] [PR106602]

2022-12-08 Thread Jeff Law via Gcc-patches




On 12/8/22 10:53, Palmer Dabbelt wrote:

On Wed, 07 Dec 2022 12:55:17 PST (-0800), rzin...@ventanamicro.com wrote:

Due to RISC-V limitations on operations with big constants combine
is failing to match such operations and is not being able to
produce optimal code as it keeps splitting them. By pretending we
can do those operations we can get more opportunities for
simplification of surrounding instructions.


I saw Jeff's comments.  This is always the kind of thing that worries 
me: we're essentially lying to the optimizer in order to trick it into 
generating better code, which might just make it generate worse code. 
It's always easy to see a small example that improves, but those could 
be wiped out by secondary effects in real code.  So I'd usually want to 
have some benchmarking for a patch like this.


That said, if this is just the standard way of doing things then maybe 
it's just fine?
Bridge combiner patterns are pretty standard.  The insn's condition of 
cse_not_expected is also in there to minimize the potential for 
surprises by not exposing this too early.


jeff


Re: [PATCH] RISC-V: Produce better code with complex constants [PR95632] [PR106602]

2022-12-08 Thread Palmer Dabbelt

On Wed, 07 Dec 2022 12:55:17 PST (-0800), rzin...@ventanamicro.com wrote:

Due to RISC-V limitations on operations with big constants combine
is failing to match such operations and is not being able to
produce optimal code as it keeps splitting them. By pretending we
can do those operations we can get more opportunities for
simplification of surrounding instructions.


I saw Jeff's comments.  This is always the kind of thing that worries 
me: we're essentially lying to the optimizer in order to trick it into 
generating better code, which might just make it generate worse code.  
It's always easy to see a small example that improves, but those could 
be wiped out by secondary effects in real code.  So I'd usually want to 
have some benchmarking for a patch like this.


That said, if this is just the standard way of doing things then maybe 
it's just fine?



2022-12-06 Raphael Moreira Zinsly 
   Jeff Law 

gcc/Changelog:
PR target/95632
PR target/106602
* config/riscv/riscv.md: New pattern to simulate complex
const_int loads.

gcc/testsuite/ChangeLog:
* gcc.target/riscv/pr95632.c: New test.
* gcc.target/riscv/pr106602.c: Likewise.
---
 gcc/config/riscv/riscv.md | 16 
 gcc/testsuite/gcc.target/riscv/pr106602.c | 14 ++
 gcc/testsuite/gcc.target/riscv/pr95632.c  | 15 +++
 3 files changed, 45 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/riscv/pr106602.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/pr95632.c

diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
index df57e2b0b4a..0a9b5ec22b0 100644
--- a/gcc/config/riscv/riscv.md
+++ b/gcc/config/riscv/riscv.md
@@ -1667,6 +1667,22 @@
  MAX_MACHINE_MODE, [3], TRUE);
 })

+;; Pretend to have the ability to load complex const_int in order to get
+;; better code generation around them.
+(define_insn_and_split ""
+  [(set (match_operand:GPR 0 "register_operand" "=r")
+(match_operand:GPR 1 "splittable_const_int_operand" "i"))]
+  "cse_not_expected"
+  "#"
+  "&& 1"
+  [(const_int 0)]
+
+{
+  riscv_move_integer (operands[0], operands[0], INTVAL (operands[1]),
+ mode, TRUE);
+  DONE;
+})


There's some comments from Jakub on this, I don't see any additional 
issues with the code (aside from the "does it help" stuff from above).



+
 ;; 64-bit integer moves

 (define_expand "movdi"
diff --git a/gcc/testsuite/gcc.target/riscv/pr106602.c 
b/gcc/testsuite/gcc.target/riscv/pr106602.c
new file mode 100644
index 000..83b70877012
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/pr106602.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=rv64gc" } */


There's a DG hook to limit this to 64-bit targets, that way it'll run 
with whatever target is being tested.



+
+unsigned long
+foo2 (unsigned long a)
+{
+  return (unsigned long)(unsigned int) a << 6;
+}
+
+/* { dg-final { scan-assembler-times "slli\t" 1 } } */
+/* { dg-final { scan-assembler-times "srli\t" 1 } } */
+/* { dg-final { scan-assembler-not "\tli\t" } } */
+/* { dg-final { scan-assembler-not "addi\t" } } */
+/* { dg-final { scan-assembler-not "and\t" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/pr95632.c 
b/gcc/testsuite/gcc.target/riscv/pr95632.c
new file mode 100644
index 000..bd316ab1d7b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/pr95632.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=rv32imafc -mabi=ilp32f" } */


Is there a reason to make this rv32-only?  Unless I'm missing something 
this should generate pretty much the same code for rv64.



+
+unsigned short
+foo (unsigned short crc)
+{
+  crc ^= 0x4002;
+  crc >>= 1;
+  crc |= 0x8000;
+
+  return crc;
+}
+
+/* { dg-final { scan-assembler-times "srli\t" 1 } } */
+/* { dg-final { scan-assembler-not "slli\t" } } */


Re: [PATCH] RISC-V: Produce better code with complex constants [PR95632] [PR106602]

2022-12-08 Thread Palmer Dabbelt

On Wed, 07 Dec 2022 13:30:32 PST (-0800), gcc-patches@gcc.gnu.org wrote:

On Wed, Dec 07, 2022 at 05:55:17PM -0300, Raphael Moreira Zinsly wrote:

Due to RISC-V limitations on operations with big constants combine
is failing to match such operations and is not being able to
produce optimal code as it keeps splitting them. By pretending we
can do those operations we can get more opportunities for
simplification of surrounding instructions.

2022-12-06 Raphael Moreira Zinsly 
   Jeff Law 


Just nits, not a proper review.
2 spaces after date and 2 spaces before <, rather than just 1.



gcc/Changelog:
PR target/95632
PR target/106602
* config/riscv/riscv.md: New pattern to simulate complex
const_int loads.

gcc/testsuite/ChangeLog:
* gcc.target/riscv/pr95632.c: New test.
* gcc.target/riscv/pr106602.c: Likewise.


All lines in the ChangeLog should be tab indented, rather than just some of
them and others with 8 spaces.


There's alsot contrib/git-commit-mklog.py, which provides a template for 
these (I also have trouble remembering the formatting rules).





--- a/gcc/config/riscv/riscv.md
+++ b/gcc/config/riscv/riscv.md
@@ -1667,6 +1667,22 @@
  MAX_MACHINE_MODE, [3], TRUE);
 })

+;; Pretend to have the ability to load complex const_int in order to get
+;; better code generation around them.
+(define_insn_and_split ""


define_insn_and_split patterns better should have some name, even if it
starts with *.  It makes dumps more readable, and you can refer to it
in the ChangeLog when it is added or changed etc.


+  [(set (match_operand:GPR 0 "register_operand" "=r")
+(match_operand:GPR 1 "splittable_const_int_operand" "i"))]
+  "cse_not_expected"
+  "#"
+  "&& 1"
+  [(const_int 0)]
+


Why the empty line?


+{
+  riscv_move_integer (operands[0], operands[0], INTVAL (operands[1]),
+ mode, TRUE);


You can just use  if there is only one iterator in the pattern.

Jakub


Re: [PATCH]AArch64 div-by-255, ensure that arguments are registers. [PR107988]

2022-12-08 Thread Richard Earnshaw via Gcc-patches




On 08/12/2022 16:39, Tamar Christina via Gcc-patches wrote:

Hi All,

At -O0 (as opposed to e.g. volatile) we can get into the situation where the
in0 and result RTL arguments passed to the division function are memory
locations instead of registers.  I think we could reject these early on by
checking that the gimple values are GIMPLE registers, but I think it's better to
handle it.

As such I force them to registers and emit a move to the memory locations and
leave it up to reload to handle.  This fixes the ICE and still allows the
optimization in these cases,  which improves the code quality a lot.

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar



gcc/ChangeLog:

PR target/107988
* config/aarch64/aarch64.cc
(aarch64_vectorize_can_special_div_by_constant): Ensure input and output
RTL are registers.

gcc/testsuite/ChangeLog:

PR target/107988
* gcc.target/aarch64/pr107988-1.c: New test.

--- inline copy of patch --
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 
b8dc3f070c8afc47c85fa18768c4da92c774338f..9f96424993c4fe90e1b241fcb3aa97025225
 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -24337,12 +24337,27 @@ aarch64_vectorize_can_special_div_by_constant (enum 
tree_code code,
if (!VECTOR_TYPE_P (vectype))
 return false;
  
+  if (!REG_P (in0))

+in0 = force_reg (GET_MODE (in0), in0);
+
gcc_assert (output);
  
-  if (!*output)

-*output = gen_reg_rtx (TYPE_MODE (vectype));
+  rtx res =  NULL_RTX;
+
+  /* Once e get to this point we cannot reject the RTL,  if it's not a reg then
+ Create a new reg and write the result to the output afterwards.  */
+  if (!*output || !REG_P (*output))
+res = gen_reg_rtx (TYPE_MODE (vectype));
+  else
+res = *output;


Why not write
  rtx res = *output
  if (!res || !REG_P (res))
res = gen_reg_rtx...

then you don't need either the else clause or the dead NULL_RTX assignment.



+
+  emit_insn (gen_aarch64_bitmask_udiv3 (TYPE_MODE (vectype), res, in0, in1));
+
+  if (*output && res != *output)
+emit_move_insn (*output, res);
+  else
+*output = res;
  
-  emit_insn (gen_aarch64_bitmask_udiv3 (TYPE_MODE (vectype), *output, in0, in1));

return true;
  }
  
diff --git a/gcc/testsuite/gcc.target/aarch64/pr107988-1.c b/gcc/testsuite/gcc.target/aarch64/pr107988-1.c

new file mode 100644
index 
..c4fd290271b738345173b569bdc58c092fba7fe9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr107988-1.c
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O0" } */
+typedef unsigned short __attribute__((__vector_size__ (16))) V;
+
+V
+foo (V v)
+{
+  v /= 255;
+  return v;
+}






Otherwise OK.

R.


[PATCH] c++: class-scope qualified constrained auto [PR107188]

2022-12-08 Thread Patrick Palka via Gcc-patches
Here when parsing the class-scope auto constrained by a qualified
concept-id, we first tentatively parse the overall member-declaration as
a deprecated access-declaration, during which we parse C as a
standalone TEMPLATE_ID_EXPR (not part of the auto) and end up emitting
the bogus error

concepts-placeholder11.C:9:6: error: wrong number of template arguments (1, 
should be 2)
9 |   N::C auto f() { return 0; }
  |  ^~
concepts-placeholder11.C:5:34: note: provided for ‘template 
concept N::C’
5 |   template concept C = true;
  |  ^

from build_concept_id called from cp_parser_template_id_expr.

We could fix this by adding a complain parameter to build_concept_id and
passing tf_none when parsing tentatively.  However, it seems we can fix
this in a more general way that might benefit non-concepts code: when
tentatively parsing an access-declaration, abort the parse early if the
qualifying scope isn't possibly a class type, so that we avoid parsing
C as a TEMPLATE_ID_EXPR in the first place.  This patch takes this
latter approach.

Bootstrapped and regtested on x86_64-pc-linux-gnu, does this look OK for
trunk?

PR c++/107188

gcc/cp/ChangeLog:

* parser.cc (cp_parser_using_declaration): Abort the tentative
parse early if the scope of an access-declaration isn't possibly
a class type.

gcc/testsuite/ChangeLog:

* g++.dg/cpp2a/concepts-placeholder11.C: New test.
---
 gcc/cp/parser.cc|  5 +
 gcc/testsuite/g++.dg/cpp2a/concepts-placeholder11.C | 10 ++
 2 files changed, 15 insertions(+)
 create mode 100644 gcc/testsuite/g++.dg/cpp2a/concepts-placeholder11.C

diff --git a/gcc/cp/parser.cc b/gcc/cp/parser.cc
index e8a50904243..ccacf6d7dd0 100644
--- a/gcc/cp/parser.cc
+++ b/gcc/cp/parser.cc
@@ -21670,6 +21670,11 @@ cp_parser_using_declaration (cp_parser* parser,
 
   cp_warn_deprecated_use_scopes (qscope);
 
+  if (access_declaration_p && !MAYBE_CLASS_TYPE_P (qscope))
+/* If the qualifying scope of an access-declaration isn't possibly
+   a class type then it must be invalid.  */
+cp_parser_simulate_error (parser);
+
   if (access_declaration_p && cp_parser_error_occurred (parser))
 /* Something has already gone wrong; there's no need to parse
further.  Since an error has occurred, the return value of
diff --git a/gcc/testsuite/g++.dg/cpp2a/concepts-placeholder11.C 
b/gcc/testsuite/g++.dg/cpp2a/concepts-placeholder11.C
new file mode 100644
index 000..61eef743bae
--- /dev/null
+++ b/gcc/testsuite/g++.dg/cpp2a/concepts-placeholder11.C
@@ -0,0 +1,10 @@
+// PR c++/107188
+// { dg-do compile { target c++20 } }
+
+namespace N {
+  template concept C = true;
+}
+
+struct X {
+  N::C auto f() { return 0; }
+};
-- 
2.39.0.rc2



[PATCH]AArch64 div-by-255, ensure that arguments are registers. [PR107988]

2022-12-08 Thread Tamar Christina via Gcc-patches
Hi All,

At -O0 (as opposed to e.g. volatile) we can get into the situation where the
in0 and result RTL arguments passed to the division function are memory
locations instead of registers.  I think we could reject these early on by
checking that the gimple values are GIMPLE registers, but I think it's better to
handle it.

As such I force them to registers and emit a move to the memory locations and
leave it up to reload to handle.  This fixes the ICE and still allows the
optimization in these cases,  which improves the code quality a lot.

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar



gcc/ChangeLog:

PR target/107988
* config/aarch64/aarch64.cc
(aarch64_vectorize_can_special_div_by_constant): Ensure input and output
RTL are registers.

gcc/testsuite/ChangeLog:

PR target/107988
* gcc.target/aarch64/pr107988-1.c: New test.

--- inline copy of patch -- 
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 
b8dc3f070c8afc47c85fa18768c4da92c774338f..9f96424993c4fe90e1b241fcb3aa97025225
 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -24337,12 +24337,27 @@ aarch64_vectorize_can_special_div_by_constant (enum 
tree_code code,
   if (!VECTOR_TYPE_P (vectype))
return false;
 
+  if (!REG_P (in0))
+in0 = force_reg (GET_MODE (in0), in0);
+
   gcc_assert (output);
 
-  if (!*output)
-*output = gen_reg_rtx (TYPE_MODE (vectype));
+  rtx res =  NULL_RTX;
+
+  /* Once e get to this point we cannot reject the RTL,  if it's not a reg then
+ Create a new reg and write the result to the output afterwards.  */
+  if (!*output || !REG_P (*output))
+res = gen_reg_rtx (TYPE_MODE (vectype));
+  else
+res = *output;
+
+  emit_insn (gen_aarch64_bitmask_udiv3 (TYPE_MODE (vectype), res, in0, in1));
+
+  if (*output && res != *output)
+emit_move_insn (*output, res);
+  else
+*output = res;
 
-  emit_insn (gen_aarch64_bitmask_udiv3 (TYPE_MODE (vectype), *output, in0, 
in1));
   return true;
 }
 
diff --git a/gcc/testsuite/gcc.target/aarch64/pr107988-1.c 
b/gcc/testsuite/gcc.target/aarch64/pr107988-1.c
new file mode 100644
index 
..c4fd290271b738345173b569bdc58c092fba7fe9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr107988-1.c
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O0" } */
+typedef unsigned short __attribute__((__vector_size__ (16))) V;
+
+V
+foo (V v)
+{
+  v /= 255;
+  return v;
+}




-- 
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 
b8dc3f070c8afc47c85fa18768c4da92c774338f..9f96424993c4fe90e1b241fcb3aa97025225
 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -24337,12 +24337,27 @@ aarch64_vectorize_can_special_div_by_constant (enum 
tree_code code,
   if (!VECTOR_TYPE_P (vectype))
return false;
 
+  if (!REG_P (in0))
+in0 = force_reg (GET_MODE (in0), in0);
+
   gcc_assert (output);
 
-  if (!*output)
-*output = gen_reg_rtx (TYPE_MODE (vectype));
+  rtx res =  NULL_RTX;
+
+  /* Once e get to this point we cannot reject the RTL,  if it's not a reg then
+ Create a new reg and write the result to the output afterwards.  */
+  if (!*output || !REG_P (*output))
+res = gen_reg_rtx (TYPE_MODE (vectype));
+  else
+res = *output;
+
+  emit_insn (gen_aarch64_bitmask_udiv3 (TYPE_MODE (vectype), res, in0, in1));
+
+  if (*output && res != *output)
+emit_move_insn (*output, res);
+  else
+*output = res;
 
-  emit_insn (gen_aarch64_bitmask_udiv3 (TYPE_MODE (vectype), *output, in0, 
in1));
   return true;
 }
 
diff --git a/gcc/testsuite/gcc.target/aarch64/pr107988-1.c 
b/gcc/testsuite/gcc.target/aarch64/pr107988-1.c
new file mode 100644
index 
..c4fd290271b738345173b569bdc58c092fba7fe9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr107988-1.c
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O0" } */
+typedef unsigned short __attribute__((__vector_size__ (16))) V;
+
+V
+foo (V v)
+{
+  v /= 255;
+  return v;
+}





Re: [PATCH] expr.cc: avoid unexpected side effects in expand_expr_divmod optimization

2022-12-08 Thread Jose E. Marchesi via Gcc-patches


>> Am 08.12.2022 um 11:56 schrieb Jose E. Marchesi via Gcc-patches 
>> :
>> 
>> The expand_expr_divmod function in expr.cc attempts to optimize cases
>> where both arguments of a division/modulus are known to be positive
>> when interpreted as signed.  In these cases, both signed division and
>> unsigned division will raise the same value, and therefore the
>> cheapest option can be used.
>> 
>> In order to determine what is the cheaper option in the current
>> target, expand_expr_divmod actually expands both a signed divmod and
>> an unsigned divmod using local "sequences":
>> 
>>  start_sequence ();
>>  ...
>>  expand_divmod (... signed ...);
>>  ...
>>  end_sequence ();
>> 
>>  start_sequence ();
>>  ...
>>  expand_divmod (... unsigned ...);
>>  ...
>>  end_sequence ();
>> 
>> And then compares the cost of each generated sequence, choosing the
>> best one.  Finally, it emits the selected expanded sequence and
>> returns the rtx with the result.
>> 
>> This approach has a caveat.  Some targets do not provide instructions
>> for division/modulus instructions.  In the case of BPF, it provides
>> unsigned division/modulus, but not signed division/modulus.
>> 
>> In these cases, the expand_divmod tries can contain calls to funcalls.
>> For example, in BPF:
>> 
>>  start_sequence ();
>>  ...
>>  expand_divmod (... signed ...); -> This generates funcall to __divdi3
>>  ...
>>  end_sequence ();
>> 
>>  start_sequence ();
>>  ...
>>  expand_divmod (... unsigned ...); -> This generates direct `div' insn.
>>  ...
>>  end_sequence ();
>> 
>> The problem is that when a funcall is expanded, an accompanying global
>> symbol definition is written in the output stream:
>> 
>>  .global __divdi3
>> 
>> And this symbol definition remains in the compiled assembly file, even
>> if the sequence using the direct `div' instruction above is used.
>> 
>> This is particularly bad in BPF, because the kernel bpf loader chokes
>> on the spurious symbol __divdi3 and makes the resulting BPF object
>> unloadable (note that BPF objects are not linked before processed by
>> the kernel.)
>> 
>> In order to fix this, this patch modifies expand_expr_divmod in the
>> following way:
>> 
>> - When trying each sequence (signed, unsigned) the expand_divmod calls
>>  are told to _not_ use libcalls if everything else fails.  This is
>>  done by passing OPTAB_WIDEN as the `methods' argument.  (Before it
>>  was using the default value OPTAB_LIB_WIDEN.)
>> 
>> - If any of the tried expanded sequences contain a funcall, then the
>>  optimization is not attempted.
>
> How do libcalls appear in iff you specify OPTABS_WIDEN only?  Doesn’t
> that allow to simplify this and also use the sequence without a
> libcall?

If you pass OPTABS_WIDEN only then libcalls are not an option and (as
far as I can tell) expand_divmod returns NULL if a libcall is the only
possibility.

> Richard 
>
>> 
>> A couple of BPF tests are also added to make sure this doesn't break
>> at any point in the future.
>> 
>> Tested in bpf-unknown-none and x86_64-linux-gnu.
>> Regtested in x86_64-linux-gnu.  No regressions.
>> 
>> gcc/ChangeLog
>> 
>>* expr.cc (expand_expr_divmod): Avoid side-effects of trying
>>sequences involving funcalls in optimization.
>> 
>> gcc/testsuite/ChangeLog:
>> 
>>* gcc.target/bpf/divmod-funcall-1.c: New test.
>>* gcc.target/bpf/divmod-funcall-2.c: Likewise.
>> ---
>> gcc/expr.cc   | 44 +++
>> .../gcc.target/bpf/divmod-funcall-1.c |  8 
>> .../gcc.target/bpf/divmod-funcall-2.c |  8 
>> 3 files changed, 41 insertions(+), 19 deletions(-)
>> create mode 100644 gcc/testsuite/gcc.target/bpf/divmod-funcall-1.c
>> create mode 100644 gcc/testsuite/gcc.target/bpf/divmod-funcall-2.c
>> 
>> diff --git a/gcc/expr.cc b/gcc/expr.cc
>> index d9407432ea5..4d4be5d7bda 100644
>> --- a/gcc/expr.cc
>> +++ b/gcc/expr.cc
>> @@ -9168,32 +9168,38 @@ expand_expr_divmod (tree_code code, machine_mode 
>> mode, tree treeop0,
>>   do_pending_stack_adjust ();
>>   start_sequence ();
>>   rtx uns_ret = expand_divmod (mod_p, code, mode, treeop0, treeop1,
>> -   op0, op1, target, 1);
>> +   op0, op1, target, 1, OPTAB_WIDEN);
>>   rtx_insn *uns_insns = get_insns ();
>>   end_sequence ();
>>   start_sequence ();
>>   rtx sgn_ret = expand_divmod (mod_p, code, mode, treeop0, treeop1,
>> -   op0, op1, target, 0);
>> +   op0, op1, target, 0, OPTAB_WIDEN);
>>   rtx_insn *sgn_insns = get_insns ();
>>   end_sequence ();
>> -  unsigned uns_cost = seq_cost (uns_insns, speed_p);
>> -  unsigned sgn_cost = seq_cost (sgn_insns, speed_p);
>> 
>> -  /* If costs are the same then use as tie breaker the other other
>> - factor.  */
>> -  if (uns_cost == sgn_cost)
>> -{
>> -  uns_cost = seq_cost (uns_insns, !speed_p);
>> -  sgn_cost = seq_cost (sgn_insns, !speed_p);
>> -}
>> -
>> -  

Re: [PATCH 02/17] libgomp: pinned memory

2022-12-08 Thread Tobias Burnus

On 08.12.22 15:35, Andrew Stubbs wrote:

On 08/12/2022 14:02, Tobias Burnus wrote:

With available, I assume that nvptx is an 'available device' (per OpenMP
definition, finally added in TR11), i.e. there is an image for nvptx and
- after omp_requires filtering - there remains at least one nvptx
device.


If plugin-nvptx has been loaded then the function will be available.
Do we need to get fancier than that?


I think it does not really make sense to use CUDA if there is no single device.
In terms of loading, the code does:

gomp_target_init(void)
{
...
  cur = OFFLOAD_PLUGINS;  /* This is a comma-separated string with the 
supported plugins. */
...
if (gomp_load_plugin_for_device (_device, plugin_name))
  {
int omp_req = omp_requires_mask & ~GOMP_REQUIRES_TARGET_USED;
new_num_devs = current_device.get_num_devices_func (omp_req);

Thus, CUDA is loaded at the 'gomp_load_plugin_for_device' line and at the
'new_num_devs =' line, it has been filtered for OpenMP's 'requires' demands.*

Thus, 'new_num_devs' contains the number of 'accessible devices' (OpenMP 
definition),
filtered for the 'requires'* (which part of the 'supported devices' 
requirements).

(* With some caveats related to late loading of offloading code from (shared) 
libraries.)

 * * *

Admittedly, this does not yet cover the last suggested feature:

GOMP_offload_register_ver (...)
{
gomp_load_image_to_device (devicep, version,

which is relevant for the first part of:

'supported devices' - '... supported by the implementation for execution of 
target code ...
requires directive are fulfilled'.

(available = (intersection of 'accessible devices' and 'supported devices') 
possibly
filtered + reordered via the OMP_AVAILABLE_DEVICES env var.)


I am not sure how strictly it is required and when we know when the all 
offload_register are
over; I do note that OpenMP TR 11 has an over-engineered OMP_AVAILABLE_DEVICES 
environment
variable which permits to filter the list of available devices – which also 
requires early
access to the initial 'available devices' list. But it might be sufficient to 
rely on the
device-is-accessible + requires filtering and ignore whether an actual image is 
available.

Tobias

-
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 
München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas 
Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht 
München, HRB 106955


Re: [PATCH v2] docs: Suggest options to improve ASAN stack traces

2022-12-08 Thread Jakub Jelinek via Gcc-patches
On Thu, Dec 08, 2022 at 09:34:34AM -0500, Marek Polacek wrote:
> I got a complaint that while Clang docs suggest options that improve
> the quality of the backtraces ASAN prints (cf.
> ), our docs
> don't say anything to that effect.  This patch amends that with a new
> paragraph.  (It deliberately doesn't mention -fno-omit-frame-pointer.)
> 
> gcc/ChangeLog:
> 
>   * doc/invoke.texi (-fsanitize=address): Suggest options to improve
>   stack traces.
> ---
>  gcc/doc/invoke.texi | 9 +
>  1 file changed, 9 insertions(+)
> 
> diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
> index 726392409b6..1641efecf18 100644
> --- a/gcc/doc/invoke.texi
> +++ b/gcc/doc/invoke.texi
> @@ -16510,6 +16510,15 @@ The option cannot be combined with 
> @option{-fsanitize=thread} or
>  @option{-fsanitize=hwaddress}.  Note that the only target
>  @option{-fsanitize=hwaddress} is currently supported on is AArch64.
>  
> +To get more accurate stack traces, it is possible to use options such as
> +@option{-O} (which, for instance, prevents most function inlining),

Still not sure about this part.  For one, I wonder if we shouldn't
recommend -O0, -O1 or -Og instead of just one of them, and I'm also not sure
how much function inlining is prevented with -O1.
always_inline functions are certainly inlined even at -O0 or -Og (at least
when called directly), -O1 adds
{ OPT_LEVELS_1_PLUS_NOT_DEBUG, OPT_finline_functions_called_once, NULL, 1 },
to that, -O2 adds
{ OPT_LEVELS_2_PLUS, OPT_findirect_inlining, NULL, 1 },
{ OPT_LEVELS_2_PLUS, OPT_finline_small_functions, NULL, 1 },
{ OPT_LEVELS_2_PLUS, OPT_fpartial_inlining, NULL, 1 },
{ OPT_LEVELS_2_PLUS, OPT_finline_functions, NULL, 1 },
and -O3 further bumps some parameters:
{ OPT_LEVELS_3_PLUS, OPT__param_max_inline_insns_auto_, NULL, 30 },
{ OPT_LEVELS_3_PLUS, OPT__param_early_inlining_insns_, NULL, 14 },
{ OPT_LEVELS_3_PLUS, OPT__param_inline_heuristics_hint_percent_, NULL, 600 
},
{ OPT_LEVELS_3_PLUS, OPT__param_inline_min_speedup_, NULL, 15 },
{ OPT_LEVELS_3_PLUS, OPT__param_max_inline_insns_single_, NULL, 200 },

> +@option{-fno-optimize-sibling-calls} (which prevents optimizing sibling

-fno-optimize-sibling-calls is the default for -O0/-O1/-Og; dunno if we
want to reiterate it.

> +and tail recursive calls), or @option{-fno-ipa-icf} (which disables Identical
> +Code Folding for functions).  Since multiple runs of the program may yield
> +backtraces with different addresses due to ASLR (Address Space Layout
> +Randomization), it may be desirable to turn ASLR off.  On Linux, this can be
> +achieved with @samp{setarch `uname -m` -R ./prog}.
> +
>  @item -fsanitize=kernel-address
>  @opindex fsanitize=kernel-address
>  Enable AddressSanitizer for Linux kernel.

Jakub



Re: [PATCH V3] Use reg mode to move sub blocks for parameters and returns

2022-12-08 Thread Segher Boessenkool
On Thu, Dec 08, 2022 at 09:17:38PM +0800, Jiufu Guo wrote:
> Segher Boessenkool  writes:
> > On Wed, Dec 07, 2022 at 08:00:08PM +0800, Jiufu Guo wrote:
> >> typedef struct SA {double a[3];} A;
> >> A ret_arg_pt (A *a) {return *a;} // on ppc64le, expect only 3 lfd(s)
> >> A ret_arg (A a) {return a;} // just empty fun body
> >> void st_arg (A a, A *p) {*p = a;} //only 3 stfd(s)
> >
> > What is this like if you use [5] instead?  Or use an ABI without
> > homogeneous aggregates?
> Thanks for this question!  I also tested the cases on different array
> types or different sizes, or mixed field types.
> 
> If it is out of the number of registers for passing the param
> or return, it is treated as a mem block.
> For parameter, it is partially passed via registers, and partially
> passing via stack.
> For return, it is returned via a pointer (with one invisible pointer
> parameter). And the  of the function is not with parallel code.
> 
> This patch does not cover these cases.

Understood, sure; but my point is, can it degrade code quality in such
cases?  I don't see anything in the patch that precludes that.

> >> --- /dev/null
> >> +++ b/gcc/testsuite/gcc.target/powerpc/pr65421-1.c
> >> @@ -0,0 +1,15 @@
> >> +/* PR target/65421 */
> >> +/* { dg-options "-O2" } */
> >> +/* { dg-require-effective-target has_arch_ppc64 } */
> >> +
> >> +typedef struct SA
> >> +{
> >> +  double a[2];
> >> +  long l;
> >> +} A;
> >> +
> >> +/* std 3 param regs to return slot */
> >> +A ret_arg (A a) {return a;}
> >> +/* { dg-final { scan-assembler-times {\mstd 4,0\(3\)\s} 1 } } */
> >> +/* { dg-final { scan-assembler-times {\mstd 5,8\(3\)\s} 1 } } *
> >> +/* { dg-final { scan-assembler-times {\mstd 6,16\(3\)\s} 1 } } */
> >
> > This is only correct on certain ABIs, probably only ELFv2 even.
> Thanks for point out this!
> This is only correct if the ABI allows this struct to be passed
> through integer registers, and return through the mem block.

And it needs to be in those specific registers / at those specific
offsets as well.

Btw, please leave out the \s?

> In the previous version, I added a requirement on ELFv2. As tested on
> BE environments, this case also pass. So, I deleted the requirement.

BE for ELFv2 also exists, fwiw.

> (While on BE environments, there is another issue: some unnecessary
> memory stores are not deleted.)

Huh.  Does that happen with the current compiler as well?  Do you have
an example?

> But with more reading of the code 'rs6000_function_arg', as you said,
> I'm not sure if this behavior meets other ABIs (at least, it seems,
> this is not correct on darwin64).
> So, as you said, we may add a requirement on ELFv2; Or leave this
> case there, and add "! target" when hitting failure?

If you do !target the testcase won't test much at all anymore ;-)

> > We certainly can improve the homogeneous aggregates stuff, but please
> > make sure you don't degrade all other stuff?  Older, as well as when
> > things are not an homogeneous aggregate, for example too big.  Can you
> > please add tests for such cases?
> Sure, thanks!  I encounter one issue in this kind of case (large struct)
> on a previous version path.

Perhaps it would be better to have a hook so that every target (and
subtarget) can fine tune exactly when this is done.  Then again, perhaps
I worry too much.


Segher


Re: [GCC][PATCH 13/15, v4] arm: Add support for dwarf debug directives and pseudo hard-register for PAC feature.

2022-12-08 Thread Richard Earnshaw via Gcc-patches




On 09/11/2022 14:32, Srinath Parvathaneni via Gcc-patches wrote:

Hello,

This patch teaches the DWARF support in gcc about RA_AUTH_CODE pseudo 
hard-register and also
updates the ".save", ".cfi_register", ".cfi_offset", ".cfi_restore" directives 
accordingly.
This patch also adds support to emit ".pacspval" directive when "pac ip, lr, 
sp" instruction
in generated in the assembly.

RA_AUTH_CODE register number is 107 and it's dwarf register number is 143.

Applying this patch on top of PACBTI series posted here
https://gcc.gnu.org/pipermail/gcc-patches/2022-August/599658.html and when 
compiling the following
test.c with "-march=armv8.1-m.main+mve+pacbti -mbranch-protection=pac-ret 
-mthumb -mfloat-abi=hard
fasynchronous-unwind-tables -g -O0 -S" command line options, the assembly 
output after this patch
looks like below:

$cat test.c

void fun1(int a);
void fun(int a,...)
{
   fun1(a);
}

int main()
{
   fun (10);
   return 0;
}

$ arm-none-eabi-gcc -march=armv8.1-m.main+mve+pacbti 
-mbranch-protection=pac-ret -mthumb -mfloat-abi=hard
-fasynchronous-unwind-tables -g -O0 -S test.s

Assembly output:
...
fun:
...
 .pacspval
 pac ip, lr, sp
 .cfi_register 143, 12
 push{r3, r7, ip, lr}
 .save {r3, r7, ra_auth_code, lr}
...
 .cfi_offset 143, -24
...
 .cfi_restore 143
...
 aut ip, lr, sp
 bx  lr
...
main:
...
 .pacspval
 pac ip, lr, sp
 .cfi_register 143, 12
 push{r3, r7, ip, lr}
 .save {r3, r7, ra_auth_code, lr}
...
 .cfi_offset 143, -8
...
 .cfi_restore 143
...
 aut ip, lr, sp
 bx  lr
...

Regression tested on arm-none-eabi target and found no regressions.

Ok for master?

Regards,
Srinath.

gcc/testsuite/ChangeLog:

2022-11-04  Srinath Parvathaneni  

 * g++.target/arm/pac-1.C: New test.
 * gcc.target/arm/pac-9.c: New test.


2022-11-04  Srinath Parvathaneni  

 * config/arm/aout.h (ra_auth_code): Add entry in enum.
 * config/arm/arm.cc (pac_emit): Declare new global boolean variable.
 (emit_multi_reg_push): Add RA_AUTH_CODE register to
 dwarf frame expression.
 (arm_emit_multi_reg_pop): Restore RA_AUTH_CODE register.
 (arm_expand_prologue): Update frame related infomration and reg notes
 for pac/pacbit insn.
 (arm_regno_class): Check for pac pseudo reigster.
 (arm_dbx_register_number): Assign ra_auth_code register number in 
dwarf.
 (arm_unwind_emit_sequence): Print .save directive with ra_auth_code
 register.
 (arm_unwind_emit_set): Add entry for IP_REGNUM in switch case.
 (arm_unwind_emit): Update REG_CFA_REGISTER case._
 (arm_conditional_register_usage): Mark ra_auth_code in fixed reigsters.
 * config/arm/arm.h (FIRST_PSEUDO_REGISTER): Modify.
 (IS_PAC_PSEUDO_REGNUM): Define.
 (enum reg_class): Add PAC_REG entry.
 * config/arm/arm.md (RA_AUTH_CODE): Define.

gcc/testsuite/ChangeLog:

2022-11-04  Srinath Parvathaneni  

 * g++.target/arm/pac-1.C: New test.
 * gcc.target/arm/pac-9.c: Likewise.


### Attachment also inlined for ease of reply###


diff --git a/gcc/config/arm/aout.h b/gcc/config/arm/aout.h
index 
b918ad3782fbee82320febb8b6e72ad615780261..ffeed45a678f17c63d5b42c21f020ca416cbf23f
 100644
--- a/gcc/config/arm/aout.h
+++ b/gcc/config/arm/aout.h
@@ -74,7 +74,8 @@
"wr8",   "wr9",   "wr10",  "wr11",  \
"wr12",  "wr13",  "wr14",  "wr15",  \
"wcgr0", "wcgr1", "wcgr2", "wcgr3", \
-  "cc", "vfpcc", "sfp", "afp", "apsrq", "apsrge", "p0"   \
+  "cc", "vfpcc", "sfp", "afp", "apsrq", "apsrge", "p0",  \
+  "ra_auth_code" \
  }
  #endif
  
diff --git a/gcc/config/arm/arm.h b/gcc/config/arm/arm.h

index 
a2dc3fc145c52d8381c54634687376089a47e704..91c400f12568156ed29bf5d5e59460bf887fbefb
 100644
--- a/gcc/config/arm/arm.h
+++ b/gcc/config/arm/arm.h
@@ -820,7 +820,8 @@ extern const int arm_arch_cde_coproc_bits[];
s16-s31   S VFP variable (aka d8-d15).
vfpcc   Not a real register.  Represents the VFP condition
code flags.
-   vpr Used to represent MVE VPR predication.  */
+   vpr Used to represent MVE VPR predication.
+   ra_auth_codePseudo register to save PAC.  */
  
  /* The stack backtrace structure is as follows:

fp points to here:  |  save code pointer  |  [fp]
@@ -861,7 +862,7 @@ extern const int arm_arch_cde_coproc_bits[];
1,1,1,1,1,1,1,1,\
1,1,1,1,\
/* Specials.  */\
-  1,1,1,1,1,1,1\
+  1,1,1,1,1,1,1,1  \
  }
  
  /* 1 for registers not available across function calls.

@@ -891,7 +892,7 @@ 

Re: [PATCH 02/17] libgomp: pinned memory

2022-12-08 Thread Andrew Stubbs

On 08/12/2022 14:02, Tobias Burnus wrote:

On 08.12.22 13:51, Andrew Stubbs wrote:

On 08/12/2022 12:11, Jakub Jelinek wrote:

On Thu, Jul 07, 2022 at 11:34:33AM +0100, Andrew Stubbs wrote:

Implement the OpenMP pinned memory trait on Linux hosts using the mlock
syscall.  Pinned allocations are performed using mmap, not malloc,
to ensure
that they can be unpinned safely when freed.

As I said before, I think the pinned memory is too precious to waste
it this
way, we should handle the -> pinned case through memkind_create_fixed on
mmap + mlock area, that way we can create even quite small pinned
allocations.


This has been delayed due to other priorities, but our current plan is
to switch to using cudaHostAlloc, when available, but we can certainly
use memkind_create_fixed for the fallback case (including amdgcn).


With available, I assume that nvptx is an 'available device' (per OpenMP
definition, finally added in TR11), i.e. there is an image for nvptx and
- after omp_requires filtering - there remains at least one nvptx device.


If plugin-nvptx has been loaded then the function will be available. Do 
we need to get fancier than that?


Andrew


[PATCH v2] docs: Suggest options to improve ASAN stack traces

2022-12-08 Thread Marek Polacek via Gcc-patches
On Thu, Dec 08, 2022 at 03:28:13PM +0100, Jakub Jelinek wrote:
> On Thu, Dec 08, 2022 at 09:11:54AM -0500, Marek Polacek via Gcc-patches wrote:
> > On Thu, Dec 08, 2022 at 08:25:26AM +0100, Florian Weimer wrote:
> > > * Marek Polacek via Gcc-patches:
> > > 
> > > > diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
> > > > index 726392409b6..2de14466dd3 100644
> > > > --- a/gcc/doc/invoke.texi
> > > > +++ b/gcc/doc/invoke.texi
> > > > @@ -16510,6 +16510,14 @@ The option cannot be combined with 
> > > > @option{-fsanitize=thread} or
> > > >  @option{-fsanitize=hwaddress}.  Note that the only target
> > > >  @option{-fsanitize=hwaddress} is currently supported on is AArch64.
> > > >  
> > > > +To get more accurate stack traces, it is possible to use options such 
> > > > as
> > > > +@option{-O} (which, for instance, prevents most function inlining),
> > > > +@option{-fno-optimize-sibling-calls} (which prevents optimizing sibling
> > > > +and tail recursive calls), or @option{-fno-ipa-icf} (which disables 
> > > > Identical
> > > > +Code Folding for functions and read-only variables).  Since multiple 
> > > > runs
> > > > +of the program may yield backtraces with different addresses due to 
> > > > ASLR,
> > > > +it may be desirable to turn off ASLR: @samp{setarch `uname -m` -R 
> > > > ./prog}.
> > > 
> > > What about -fasynchronous-unwind-tables?  It should help if ASAN ever
> > > reports stray segmentation faults.  Whether it also helps in general
> > > depends on whether ASAN maintains ABI around its instrumentation.
> > 
> > I'm not sure.  Someone else will have to decide if we want to mention
> > that option as well.
> 
> -fasynchronous-unwind-tables is on by default on many targets, so I wouldn't
> mention it:
> grep asynchronous_unwind_tables common/*/*/* config/*/*
> common/config/aarch64/aarch64-common.cc:{ OPT_LEVELS_ALL, 
> OPT_fasynchronous_unwind_tables, NULL, 1 },
> common/config/i386/i386-common.cc:  opts->x_flag_asynchronous_unwind_tables = 
> 2;
> common/config/loongarch/loongarch-common.cc:  { OPT_LEVELS_ALL, 
> OPT_fasynchronous_unwind_tables, NULL, 1 },
> common/config/rs6000/rs6000-common.cc:  
> opts->x_flag_asynchronous_unwind_tables = 1;
> common/config/s390/s390-common.cc:  opts->x_flag_asynchronous_unwind_tables = 
> 1;
> config/i386/i386-options.cc:  if (opts->x_flag_asynchronous_unwind_tables 
> == 2)
> config/i386/i386-options.cc:  opts->x_flag_asynchronous_unwind_tables = 
> !USE_IX86_FRAME_POINTER;
> config/mips/mips.cc:  && 
> !global_options_set.x_flag_asynchronous_unwind_tables)
> config/mips/mips.cc:flag_asynchronous_unwind_tables = 1;
> config/rs6000/rs6000.cc:  && !OPTION_SET_P 
> (flag_asynchronous_unwind_tables))
> config/rs6000/rs6000.cc:flag_asynchronous_unwind_tables = 1;
> 
> On the other side, the @samp{setarch `uname -m` -R ./prog} suggestion is
> very Linux specific, so if we mention it at all, it should mention that
> "e.g. on Linux through ..." or something similar.
> I also wouldn't mention the "and read-only variables" part, that is
> irrelevant for stack traces.

Thanks, updated patch here.  I've also expanded the ASLR acronym.

Ok?

-- >8 --
I got a complaint that while Clang docs suggest options that improve
the quality of the backtraces ASAN prints (cf.
), our docs
don't say anything to that effect.  This patch amends that with a new
paragraph.  (It deliberately doesn't mention -fno-omit-frame-pointer.)

gcc/ChangeLog:

* doc/invoke.texi (-fsanitize=address): Suggest options to improve
stack traces.
---
 gcc/doc/invoke.texi | 9 +
 1 file changed, 9 insertions(+)

diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 726392409b6..1641efecf18 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -16510,6 +16510,15 @@ The option cannot be combined with 
@option{-fsanitize=thread} or
 @option{-fsanitize=hwaddress}.  Note that the only target
 @option{-fsanitize=hwaddress} is currently supported on is AArch64.
 
+To get more accurate stack traces, it is possible to use options such as
+@option{-O} (which, for instance, prevents most function inlining),
+@option{-fno-optimize-sibling-calls} (which prevents optimizing sibling
+and tail recursive calls), or @option{-fno-ipa-icf} (which disables Identical
+Code Folding for functions).  Since multiple runs of the program may yield
+backtraces with different addresses due to ASLR (Address Space Layout
+Randomization), it may be desirable to turn ASLR off.  On Linux, this can be
+achieved with @samp{setarch `uname -m` -R ./prog}.
+
 @item -fsanitize=kernel-address
 @opindex fsanitize=kernel-address
 Enable AddressSanitizer for Linux kernel.

base-commit: d9f9d5d30feb33c359955d7030cc6be50ef6dc0a
-- 
2.38.1



Re: [PATCH] docs: Suggest options to improve ASAN stack traces

2022-12-08 Thread Jakub Jelinek via Gcc-patches
On Thu, Dec 08, 2022 at 09:11:54AM -0500, Marek Polacek via Gcc-patches wrote:
> On Thu, Dec 08, 2022 at 08:25:26AM +0100, Florian Weimer wrote:
> > * Marek Polacek via Gcc-patches:
> > 
> > > diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
> > > index 726392409b6..2de14466dd3 100644
> > > --- a/gcc/doc/invoke.texi
> > > +++ b/gcc/doc/invoke.texi
> > > @@ -16510,6 +16510,14 @@ The option cannot be combined with 
> > > @option{-fsanitize=thread} or
> > >  @option{-fsanitize=hwaddress}.  Note that the only target
> > >  @option{-fsanitize=hwaddress} is currently supported on is AArch64.
> > >  
> > > +To get more accurate stack traces, it is possible to use options such as
> > > +@option{-O} (which, for instance, prevents most function inlining),
> > > +@option{-fno-optimize-sibling-calls} (which prevents optimizing sibling
> > > +and tail recursive calls), or @option{-fno-ipa-icf} (which disables 
> > > Identical
> > > +Code Folding for functions and read-only variables).  Since multiple runs
> > > +of the program may yield backtraces with different addresses due to ASLR,
> > > +it may be desirable to turn off ASLR: @samp{setarch `uname -m` -R 
> > > ./prog}.
> > 
> > What about -fasynchronous-unwind-tables?  It should help if ASAN ever
> > reports stray segmentation faults.  Whether it also helps in general
> > depends on whether ASAN maintains ABI around its instrumentation.
> 
> I'm not sure.  Someone else will have to decide if we want to mention
> that option as well.

-fasynchronous-unwind-tables is on by default on many targets, so I wouldn't
mention it:
grep asynchronous_unwind_tables common/*/*/* config/*/*
common/config/aarch64/aarch64-common.cc:{ OPT_LEVELS_ALL, 
OPT_fasynchronous_unwind_tables, NULL, 1 },
common/config/i386/i386-common.cc:  opts->x_flag_asynchronous_unwind_tables = 2;
common/config/loongarch/loongarch-common.cc:  { OPT_LEVELS_ALL, 
OPT_fasynchronous_unwind_tables, NULL, 1 },
common/config/rs6000/rs6000-common.cc:  opts->x_flag_asynchronous_unwind_tables 
= 1;
common/config/s390/s390-common.cc:  opts->x_flag_asynchronous_unwind_tables = 1;
config/i386/i386-options.cc:  if (opts->x_flag_asynchronous_unwind_tables 
== 2)
config/i386/i386-options.cc:opts->x_flag_asynchronous_unwind_tables = 
!USE_IX86_FRAME_POINTER;
config/mips/mips.cc:  && 
!global_options_set.x_flag_asynchronous_unwind_tables)
config/mips/mips.cc:flag_asynchronous_unwind_tables = 1;
config/rs6000/rs6000.cc:  && !OPTION_SET_P 
(flag_asynchronous_unwind_tables))
config/rs6000/rs6000.cc:flag_asynchronous_unwind_tables = 1;

On the other side, the @samp{setarch `uname -m` -R ./prog} suggestion is
very Linux specific, so if we mention it at all, it should mention that
"e.g. on Linux through ..." or something similar.
I also wouldn't mention the "and read-only variables" part, that is
irrelevant for stack traces.

Jakub



Re: [PATCH] docs: Suggest options to improve ASAN stack traces

2022-12-08 Thread Marek Polacek via Gcc-patches
On Thu, Dec 08, 2022 at 08:25:26AM +0100, Florian Weimer wrote:
> * Marek Polacek via Gcc-patches:
> 
> > diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
> > index 726392409b6..2de14466dd3 100644
> > --- a/gcc/doc/invoke.texi
> > +++ b/gcc/doc/invoke.texi
> > @@ -16510,6 +16510,14 @@ The option cannot be combined with 
> > @option{-fsanitize=thread} or
> >  @option{-fsanitize=hwaddress}.  Note that the only target
> >  @option{-fsanitize=hwaddress} is currently supported on is AArch64.
> >  
> > +To get more accurate stack traces, it is possible to use options such as
> > +@option{-O} (which, for instance, prevents most function inlining),
> > +@option{-fno-optimize-sibling-calls} (which prevents optimizing sibling
> > +and tail recursive calls), or @option{-fno-ipa-icf} (which disables 
> > Identical
> > +Code Folding for functions and read-only variables).  Since multiple runs
> > +of the program may yield backtraces with different addresses due to ASLR,
> > +it may be desirable to turn off ASLR: @samp{setarch `uname -m` -R ./prog}.
> 
> What about -fasynchronous-unwind-tables?  It should help if ASAN ever
> reports stray segmentation faults.  Whether it also helps in general
> depends on whether ASAN maintains ABI around its instrumentation.

I'm not sure.  Someone else will have to decide if we want to mention
that option as well.

Marek



Re: [PATCH 02/17] libgomp: pinned memory

2022-12-08 Thread Tobias Burnus

On 08.12.22 13:51, Andrew Stubbs wrote:

On 08/12/2022 12:11, Jakub Jelinek wrote:

On Thu, Jul 07, 2022 at 11:34:33AM +0100, Andrew Stubbs wrote:

Implement the OpenMP pinned memory trait on Linux hosts using the mlock
syscall.  Pinned allocations are performed using mmap, not malloc,
to ensure
that they can be unpinned safely when freed.

As I said before, I think the pinned memory is too precious to waste
it this
way, we should handle the -> pinned case through memkind_create_fixed on
mmap + mlock area, that way we can create even quite small pinned
allocations.


This has been delayed due to other priorities, but our current plan is
to switch to using cudaHostAlloc, when available, but we can certainly
use memkind_create_fixed for the fallback case (including amdgcn).


With available, I assume that nvptx is an 'available device' (per OpenMP
definition, finally added in TR11), i.e. there is an image for nvptx and
- after omp_requires filtering - there remains at least one nvptx device.

* * *

For completeness, I want to note that OpenMP TR11 adds support for
creating memory spaces that are accessible from multiple devices, e.g.
host + one/all devices, and adds some convenience functions for the
latter (all devices, host and a specific device etc.) →
https://openmp.org/specifications/ TR11 (see Appendix B.2 for the
release notes, esp. for Section 6.2).

I think it makes sense to keep those addition in mind when doing the
actual implementation to avoid incompatibilities.

Side note regarding ompx_ additions proposed in
https://gcc.gnu.org/pipermail/gcc-patches/2022-July/597979.html (adds
ompx_pinned_mem_alloc),
https://gcc.gnu.org/pipermail/gcc-patches/2022-July/597983.html
(ompx_unified_shared_mem_alloc and ompx_host_mem_alloc;
ompx_unified_shared_mem_space and ompx_host_mem_space).

While TR11 does not add any predefined allocators or new memory spaces,
using e.g. omp_get_devices_all_allocator(memspace) returns a
unified-shared-memory allocator.

I note that LLVM does not seem to have any ompx_ in this regard (yet?).
(It has some ompx_ – but related to assumptions.)



Using Cuda might be trickier to implement because there's a layering
violation inherent in routing target independent allocations through
the nvptx plugin, but benchmarking shows that that's the only way to
get the faster path through the Cuda black box; being pinned is good
because it avoids page faults, but apparently if Cuda *knows* it is
pinned then you get a speed boost even when there would be *no* faults
(i.e. on a quiet machine). Additionally, Cuda somehow ignores the
OS-defining limits.


I wonder whether for a NUMA machine (and non-offloading access), using
memkind_create_fixed will have an advantage over cuHostAlloc or not.
(BTW, I find cuHostAlloc vs. cuAllocHost confusing.) And if so, whether
we should provide a means (GOMP_... env var?) to toggle the preference.

My feeling is that, on most systems, it does not matter - except (a)
possibly for large NUMA systems, where the memkind tuning will probably
make a difference and (b) we know that CUDA's cu(HostAlloc/AllocHost) is
faster with nvptx offloading. (cu(HostAlloc/AllocHost) also permits DMA
from the device. (If unified-shared address is supported, but that's the
case [cf. comment + assert in plugin-nvptx.c].)

Tobias

-
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 
München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas 
Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht 
München, HRB 106955


Re: [PATCH] PR tree-optimization/107985 - Ensure arguments to range-op handler are supported.

2022-12-08 Thread Andrew MacLeod via Gcc-patches

On 12/7/22 12:26, Richard Biener wrote:

On Wed, Dec 7, 2022 at 5:45 PM Andrew MacLeod via Gcc-patches
 wrote:

THis patch invalidates a range-op handler object if an operand type in
the statement is not supported.

This also triggered a check in stmt dependency resolution which assumed
there must be a valid handler for any stmt with an appropriate LHS
type... which is a false assumption.

This should do for now, but long term I will rework the dispatch code to
ensure it matches the specifically supported patterns of operands. This
will make the handler creation a little slower, but speed up the actual
dispatch, especially as we add new range types next release.  Its also
much more invasive... too much for this release I think.

bootstraps on x86_64-pc-linux-gnu with no regressions.  OK?

+ if (!Value_Range::supports_type_p (TREE_TYPE (m_op1)) ||
+ !Value_Range::supports_type_p (TREE_TYPE (m_op2)))

The ||s go to the next line.  Since in a GIMPLE_COND both operand types
are compatible it's enough to check one of them.

Likewise for the GIMPLE_ASSIGN case I think - I don't know of any
binary operator that has operands that would not be both compatible
or not compatible (but it's less clear-cut here).


Doh.  Checked this in:

Andrew

commit e3251e14bccf3891b265293371c7b7f95e306271
Author: Andrew MacLeod 
Date:   Tue Dec 6 10:41:29 2022 -0500

Ensure arguments to range-op handler are supported.

PR tree-optimization/107985
gcc/
* gimple-range-op.cc
(gimple_range_op_handler::gimple_range_op_handler): Check if type
of the operands is supported.
* gimple-range.cc (gimple_ranger::prefill_stmt_dependencies): Do
not assert if here is no range-op handler.

gcc/testsuite/
* g++.dg/pr107985.C: New.

diff --git a/gcc/gimple-range-op.cc b/gcc/gimple-range-op.cc
index 7764166d5fb..12068544bc5 100644
--- a/gcc/gimple-range-op.cc
+++ b/gcc/gimple-range-op.cc
@@ -148,6 +148,9 @@ gimple_range_op_handler::gimple_range_op_handler (gimple *s)
 	case GIMPLE_COND:
 	  m_op1 = gimple_cond_lhs (m_stmt);
 	  m_op2 = gimple_cond_rhs (m_stmt);
+	  // Check that operands are supported types.  One check is enough.
+	  if (!Value_Range::supports_type_p (TREE_TYPE (m_op1)))
+	m_valid = false;
 	  return;
 	case GIMPLE_ASSIGN:
 	  m_op1 = gimple_range_base_of_assignment (m_stmt);
@@ -164,6 +167,9 @@ gimple_range_op_handler::gimple_range_op_handler (gimple *s)
 	}
 	  if (gimple_num_ops (m_stmt) >= 3)
 	m_op2 = gimple_assign_rhs2 (m_stmt);
+	  // Check that operands are supported types.  One check is enough.
+	  if ((m_op1 && !Value_Range::supports_type_p (TREE_TYPE (m_op1
+	m_valid = false;
 	  return;
 	default:
 	  gcc_unreachable ();
diff --git a/gcc/gimple-range.cc b/gcc/gimple-range.cc
index ecd6039e0fd..8c055826e17 100644
--- a/gcc/gimple-range.cc
+++ b/gcc/gimple-range.cc
@@ -422,18 +422,20 @@ gimple_ranger::prefill_stmt_dependencies (tree ssa)
   else
 	{
 	  gimple_range_op_handler handler (stmt);
-	  gcc_checking_assert (handler);
-	  tree op = handler.operand2 ();
-	  if (op)
+	  if (handler)
 	{
-	  Value_Range r (TREE_TYPE (op));
-	  prefill_name (r, op);
-	}
-	  op = handler.operand1 ();
-	  if (op)
-	{
-	  Value_Range r (TREE_TYPE (op));
-	  prefill_name (r, op);
+	  tree op = handler.operand2 ();
+	  if (op)
+		{
+		  Value_Range r (TREE_TYPE (op));
+		  prefill_name (r, op);
+		}
+	  op = handler.operand1 ();
+	  if (op)
+		{
+		  Value_Range r (TREE_TYPE (op));
+		  prefill_name (r, op);
+		}
 	}
 	}
 }
diff --git a/gcc/testsuite/g++.dg/pr107985.C b/gcc/testsuite/g++.dg/pr107985.C
new file mode 100644
index 000..8d244b54efb
--- /dev/null
+++ b/gcc/testsuite/g++.dg/pr107985.C
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O1 -ftree-vrp -fno-tree-ccp -fno-tree-forwprop -fno-tree-fre" } */
+
+struct B {
+  int f;
+};
+
+struct D : public B {
+};
+
+void foo() {
+  D d;
+  d.f = 7;
+
+  int B::* pfb = ::f;
+  int D::* pfd = pfb;
+  int v = d.*pfd;
+}


Re: [PATCH] expr.cc: avoid unexpected side effects in expand_expr_divmod optimization

2022-12-08 Thread Richard Biener via Gcc-patches



> Am 08.12.2022 um 11:56 schrieb Jose E. Marchesi via Gcc-patches 
> :
> 
> The expand_expr_divmod function in expr.cc attempts to optimize cases
> where both arguments of a division/modulus are known to be positive
> when interpreted as signed.  In these cases, both signed division and
> unsigned division will raise the same value, and therefore the
> cheapest option can be used.
> 
> In order to determine what is the cheaper option in the current
> target, expand_expr_divmod actually expands both a signed divmod and
> an unsigned divmod using local "sequences":
> 
>  start_sequence ();
>  ...
>  expand_divmod (... signed ...);
>  ...
>  end_sequence ();
> 
>  start_sequence ();
>  ...
>  expand_divmod (... unsigned ...);
>  ...
>  end_sequence ();
> 
> And then compares the cost of each generated sequence, choosing the
> best one.  Finally, it emits the selected expanded sequence and
> returns the rtx with the result.
> 
> This approach has a caveat.  Some targets do not provide instructions
> for division/modulus instructions.  In the case of BPF, it provides
> unsigned division/modulus, but not signed division/modulus.
> 
> In these cases, the expand_divmod tries can contain calls to funcalls.
> For example, in BPF:
> 
>  start_sequence ();
>  ...
>  expand_divmod (... signed ...); -> This generates funcall to __divdi3
>  ...
>  end_sequence ();
> 
>  start_sequence ();
>  ...
>  expand_divmod (... unsigned ...); -> This generates direct `div' insn.
>  ...
>  end_sequence ();
> 
> The problem is that when a funcall is expanded, an accompanying global
> symbol definition is written in the output stream:
> 
>  .global __divdi3
> 
> And this symbol definition remains in the compiled assembly file, even
> if the sequence using the direct `div' instruction above is used.
> 
> This is particularly bad in BPF, because the kernel bpf loader chokes
> on the spurious symbol __divdi3 and makes the resulting BPF object
> unloadable (note that BPF objects are not linked before processed by
> the kernel.)
> 
> In order to fix this, this patch modifies expand_expr_divmod in the
> following way:
> 
> - When trying each sequence (signed, unsigned) the expand_divmod calls
>  are told to _not_ use libcalls if everything else fails.  This is
>  done by passing OPTAB_WIDEN as the `methods' argument.  (Before it
>  was using the default value OPTAB_LIB_WIDEN.)
> 
> - If any of the tried expanded sequences contain a funcall, then the
>  optimization is not attempted.

How do libcalls appear in iff you specify OPTABS_WIDEN only?  Doesn’t that 
allow to simplify this and also use the sequence without a libcall?

Richard 

> 
> A couple of BPF tests are also added to make sure this doesn't break
> at any point in the future.
> 
> Tested in bpf-unknown-none and x86_64-linux-gnu.
> Regtested in x86_64-linux-gnu.  No regressions.
> 
> gcc/ChangeLog
> 
>* expr.cc (expand_expr_divmod): Avoid side-effects of trying
>sequences involving funcalls in optimization.
> 
> gcc/testsuite/ChangeLog:
> 
>* gcc.target/bpf/divmod-funcall-1.c: New test.
>* gcc.target/bpf/divmod-funcall-2.c: Likewise.
> ---
> gcc/expr.cc   | 44 +++
> .../gcc.target/bpf/divmod-funcall-1.c |  8 
> .../gcc.target/bpf/divmod-funcall-2.c |  8 
> 3 files changed, 41 insertions(+), 19 deletions(-)
> create mode 100644 gcc/testsuite/gcc.target/bpf/divmod-funcall-1.c
> create mode 100644 gcc/testsuite/gcc.target/bpf/divmod-funcall-2.c
> 
> diff --git a/gcc/expr.cc b/gcc/expr.cc
> index d9407432ea5..4d4be5d7bda 100644
> --- a/gcc/expr.cc
> +++ b/gcc/expr.cc
> @@ -9168,32 +9168,38 @@ expand_expr_divmod (tree_code code, machine_mode 
> mode, tree treeop0,
>   do_pending_stack_adjust ();
>   start_sequence ();
>   rtx uns_ret = expand_divmod (mod_p, code, mode, treeop0, treeop1,
> -   op0, op1, target, 1);
> +   op0, op1, target, 1, OPTAB_WIDEN);
>   rtx_insn *uns_insns = get_insns ();
>   end_sequence ();
>   start_sequence ();
>   rtx sgn_ret = expand_divmod (mod_p, code, mode, treeop0, treeop1,
> -   op0, op1, target, 0);
> +   op0, op1, target, 0, OPTAB_WIDEN);
>   rtx_insn *sgn_insns = get_insns ();
>   end_sequence ();
> -  unsigned uns_cost = seq_cost (uns_insns, speed_p);
> -  unsigned sgn_cost = seq_cost (sgn_insns, speed_p);
> 
> -  /* If costs are the same then use as tie breaker the other other
> - factor.  */
> -  if (uns_cost == sgn_cost)
> -{
> -  uns_cost = seq_cost (uns_insns, !speed_p);
> -  sgn_cost = seq_cost (sgn_insns, !speed_p);
> -}
> -
> -  if (uns_cost < sgn_cost || (uns_cost == sgn_cost && unsignedp))
> -{
> -  emit_insn (uns_insns);
> -  return uns_ret;
> -}
> -  emit_insn (sgn_insns);
> -  return sgn_ret;
> +  /* Do not try to optimize if any of the sequences tried above
> + resulted 

Re: [PATCH] expr.cc: avoid unexpected side effects in expand_expr_divmod optimization

2022-12-08 Thread Jakub Jelinek via Gcc-patches
On Thu, Dec 08, 2022 at 02:02:36PM +0100, Jose E. Marchesi wrote:
> So, I guess the right fix would be to call assemble_external_libcall
> during final?  The `.global FOO' directive would be generated
> immediately before the call sequence, but I guess that would be ok.

During final only if all the targets can deal with the effects of
assemble_external_libcall being done in the middle of emitting assembly
for the function.

Otherwise, it could be e.g. done in the first loop of shorten_branches.

Note, in calls.cc it is done only for emit_library_call_value_1
and not for emit_call_1, so if we do it late, we need to be able to find
out what call is to a libcall and what is to a normal call.  If there is
no way to differentiate it right now, perhaps we need some flag somewhere,
say on a SYMBOL_REF.  And then assemble_external_libcall either only
if such a SYMBOL_REF appears in CALL_INSN or sibcall JUMP_INSN, or
perhaps anywhere in the function and its constant pool.

Jakub



Re: [PATCH V3] Use reg mode to move sub blocks for parameters and returns

2022-12-08 Thread Jiufu Guo via Gcc-patches
Hi Segher,

Thanks a lot for your review!

Segher Boessenkool  writes:

> Hi!
>
> On Wed, Dec 07, 2022 at 08:00:08PM +0800, Jiufu Guo wrote:
>> When assigning a parameter to a variable, or assigning a variable to
>> return value with struct type, "block move" are used to expand
>> the assignment. It would be better to use the register mode according
>> to the target/ABI to move the blocks if the parameter/return is passed
>> through registers. And then this would raise more opportunities for
>> other optimization passes(cse/dse/xprop).
>> 
>> As the example code (like code in PR65421):
>> 
>> typedef struct SA {double a[3];} A;
>> A ret_arg_pt (A *a) {return *a;} // on ppc64le, expect only 3 lfd(s)
>> A ret_arg (A a) {return a;} // just empty fun body
>> void st_arg (A a, A *p) {*p = a;} //only 3 stfd(s)
>
> What is this like if you use [5] instead?  Or use an ABI without
> homogeneous aggregates?
Thanks for this question!  I also tested the cases on different array
types or different sizes, or mixed field types.

If it is out of the number of registers for passing the param
or return, it is treated as a mem block.
For parameter, it is partially passed via registers, and partially
passing via stack.
For return, it is returned via a pointer (with one invisible pointer
parameter). And the  of the function is not with parallel code.

This patch does not cover these cases.

>
>> +static void
>> +move_sub_blocks (rtx to_rtx, tree from, machine_mode sub_mode, bool 
>> nontemporal)
>> +{
>> +  HOST_WIDE_INT size, sub_size;
>> +  int len;
>> +
>> +  gcc_assert (MEM_P (to_rtx));
>> +
>> +  size = MEM_SIZE (to_rtx).to_constant ();
>> +  sub_size = GET_MODE_SIZE (sub_mode).to_constant ();
>> +  len = size / sub_size;
>
> Unrelated, but a pet peeve: it is much more modern (and imo much better
> taste) to not put all declarations at the start; just declare at first
> use:
>
>   gcc_assert (MEM_P (to_rtx));
>
>   HOST_WIDE_INT size = MEM_SIZE (to_rtx).to_constant ();
>   HOST_WIDE_INT sub_size = GET_MODE_SIZE (sub_mode).to_constant ();
>   int len = size / sub_size;
>
Thanks for catch this!

>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/powerpc/pr65421-1.c
>> @@ -0,0 +1,15 @@
>> +/* PR target/65421 */
>> +/* { dg-options "-O2" } */
>> +/* { dg-require-effective-target has_arch_ppc64 } */
>> +
>> +typedef struct SA
>> +{
>> +  double a[2];
>> +  long l;
>> +} A;
>> +
>> +/* std 3 param regs to return slot */
>> +A ret_arg (A a) {return a;}
>> +/* { dg-final { scan-assembler-times {\mstd 4,0\(3\)\s} 1 } } */
>> +/* { dg-final { scan-assembler-times {\mstd 5,8\(3\)\s} 1 } } *
>> +/* { dg-final { scan-assembler-times {\mstd 6,16\(3\)\s} 1 } } */
>
> This is only correct on certain ABIs, probably only ELFv2 even.
Thanks for point out this!
This is only correct if the ABI allows this struct to be passed
through integer registers, and return through the mem block.

In the previous version, I added a requirement on ELFv2. As tested on
BE environments, this case also pass. So, I deleted the requirement.
(While on BE environments, there is another issue: some unnecessary
memory stores are not deleted.)

But with more reading of the code 'rs6000_function_arg', as you said,
I'm not sure if this behavior meets other ABIs (at least, it seems,
this is not correct on darwin64).
So, as you said, we may add a requirement on ELFv2; Or leave this
case there, and add "! target" when hitting failure?

>
>
> We certainly can improve the homogeneous aggregates stuff, but please
> make sure you don't degrade all other stuff?  Older, as well as when
> things are not an homogeneous aggregate, for example too big.  Can you
> please add tests for such cases?
Sure, thanks!  I encounter one issue in this kind of case (large struct)
on a previous version path.

Thanks again for your comments and suggestions!

BR,
Jeff (Jiufu)

>
>
> Segher


Re: [PATCH] expr.cc: avoid unexpected side effects in expand_expr_divmod optimization

2022-12-08 Thread Jose E. Marchesi via Gcc-patches


Hi Jakub.

> On Thu, Dec 08, 2022 at 11:59:44AM +0100, Jose E. Marchesi via Gcc-patches 
> wrote:
>> gcc/ChangeLog
>> 
>>  * expr.cc (expand_expr_divmod): Avoid side-effects of trying
>>  sequences involving funcalls in optimization.
>
> That looks wrong.
> The globals for mentioned calls just shouldn't be emitted during expansion,
> especially if it is bigger annoyance than just having some extra symbols
> in the symbol table.
> expand_expr_divmod is definitely not the only place where something is
> expanded and later not used, lots of other places in the expander do that,
> and more importantly, there are over 80 optimization passes after expansion,
> many of them can remove code determined to be dead, and while lots of dead
> code is removed in GIMPLE optimizations already, definitely not all.
> So, rather than add hacks for this in a single spot, much better is to emit
> the globals only for stuff that is actually needed (so during final or
> immediately before it).

Yeah I see the point.

The culprit of the leadked .global seems to be a call to
assemble_external_libcall in emit_library_call_value_1:

expand_expr_divmod
  expand_divmod -> This will result in libcall
   sign_expand_divmod
 emit_library_call_value
   emit_library_call_value_1
 ...
 /* If this machine requires an external definition for library
functions, write one out.  */
 assemble_external_libcall (fun);
 ...

The documented purpose of assemble_external_libcall is, as stated in
output.h, to "Assemble a string constant".

So, it seems to me that emit_library_call_value should not assemble
anything, since it is used by expand functions whose expansions may be
eventually discarded.

However, simply removing that call to assemble_external_libcall makes
.global declarations to not be emitted even when the funcall is actually
emitted in final:

For:

  int foo(unsigned int len)
  {
return ((long)len) * 234 / 5;
  }

we get:

.file   "foo.c"
.text
<- NO .global __divdi3
.align  3
.global foo
.type   foo, @function
  foo:
mov32   %r1,%r1
mov %r2,5
mul %r1,234
call__divdi3
exit
  .size   foo, .-foo
  .ident  "GCC: (GNU) 13.0.0 20221207 (experimental)"

Note that BPF lacks signed division instructions.

So, I guess the right fix would be to call assemble_external_libcall
during final?  The `.global FOO' directive would be generated
immediately before the call sequence, but I guess that would be ok.

WDYT?


Re: [PATCH 02/17] libgomp: pinned memory

2022-12-08 Thread Andrew Stubbs

On 08/12/2022 12:11, Jakub Jelinek wrote:

On Thu, Jul 07, 2022 at 11:34:33AM +0100, Andrew Stubbs wrote:


Implement the OpenMP pinned memory trait on Linux hosts using the mlock
syscall.  Pinned allocations are performed using mmap, not malloc, to ensure
that they can be unpinned safely when freed.


As I said before, I think the pinned memory is too precious to waste it this
way, we should handle the -> pinned case through memkind_create_fixed on
mmap + mlock area, that way we can create even quite small pinned
allocations.


This has been delayed due to other priorities, but our current plan is 
to switch to using cudaHostAlloc, when available, but we can certainly 
use memkind_create_fixed for the fallback case (including amdgcn).


Using Cuda might be trickier to implement because there's a layering 
violation inherent in routing target independent allocations through the 
nvptx plugin, but benchmarking shows that that's the only way to get the 
faster path through the Cuda black box; being pinned is good because it 
avoids page faults, but apparently if Cuda *knows* it is pinned then you 
get a speed boost even when there would be *no* faults (i.e. on a quiet 
machine). Additionally, Cuda somehow ignores the OS-defining limits.


Thomas Schwinge has been assigned this task and will be getting to it 
soonish.


Andrew


Re: [PATCH] c++: modules and std::source_location::current() def arg [PR100881]

2022-12-08 Thread Nathan Sidwell via Gcc-patches

On 12/7/22 16:50, Patrick Palka wrote:

We currently declare __builtin_source_location with a const void* return
type instead of the true type (const std::source_location::__impl*), and
later when folding this builtin we just obtain the true type via name
lookup.

But the below testcase demonstrates this name lookup approach seems to
interact poorly with modules, since we may import an entity that uses
std::source_location::current() in a default argument (or DMI) without
also importing , and thus the name lookup will fail
when folding the builtin at the call site unless we also import
.

This patch fixes by instead initially declaring __builtin_source_location
with an auto return type and updating it appropriately upon its first use.
Thus when folding calls to this builtin we can fish out the true return
type through the type of the CALL_EXPR and avoid needing to do name
lookup.


That's a clever approach!  LGTM

nathan

--
Nathan Sidwell



Re: [PATCH] expr.cc: avoid unexpected side effects in expand_expr_divmod optimization

2022-12-08 Thread Jakub Jelinek via Gcc-patches
On Thu, Dec 08, 2022 at 11:59:44AM +0100, Jose E. Marchesi via Gcc-patches 
wrote:
> gcc/ChangeLog
> 
>   * expr.cc (expand_expr_divmod): Avoid side-effects of trying
>   sequences involving funcalls in optimization.

That looks wrong.
The globals for mentioned calls just shouldn't be emitted during expansion,
especially if it is bigger annoyance than just having some extra symbols
in the symbol table.
expand_expr_divmod is definitely not the only place where something is
expanded and later not used, lots of other places in the expander do that,
and more importantly, there are over 80 optimization passes after expansion,
many of them can remove code determined to be dead, and while lots of dead
code is removed in GIMPLE optimizations already, definitely not all.
So, rather than add hacks for this in a single spot, much better is to emit
the globals only for stuff that is actually needed (so during final or
immediately before it).

Jakub



Re: [PATCH 02/17] libgomp: pinned memory

2022-12-08 Thread Jakub Jelinek via Gcc-patches
On Thu, Jul 07, 2022 at 11:34:33AM +0100, Andrew Stubbs wrote:
> 
> Implement the OpenMP pinned memory trait on Linux hosts using the mlock
> syscall.  Pinned allocations are performed using mmap, not malloc, to ensure
> that they can be unpinned safely when freed.

As I said before, I think the pinned memory is too precious to waste it this
way, we should handle the -> pinned case through memkind_create_fixed on
mmap + mlock area, that way we can create even quite small pinned
allocations.

Jakub



Re: [PATCH 2/2] OpenMP: Duplicate checking for map clauses in Fortran (PR107214)

2022-12-08 Thread Tobias Burnus

Hi Julian:

On 07.12.22 20:13, Julian Brown wrote:

I know that this was the case before, but can you move the mark:1 etc.
after 'tlink'? In that case all bitfields are grouped together.

Thanks for doing so.

I wonder whether that also rejects the following – which seems to be
valid. The 'map' goes to 'target' and the 'firstprivate' to
'parallel', cf. OpenMP 5.2, "17.2 Clauses on Combined and Composite
Constructs", [340:3-4 & 12-14]. (BTW: While some fixes went into 5.1
regarding this section, a likewise wording is already in 5.0.)

(Testing showed: it give an ICE without the patch and an error with.)

...and this patch avoids the error for combined directives, and
reorders the gfc_symbol bitfields.


All in all, I am fine with the patch - but I spotted some issues.

First, I think you need to set for some error cases mark = 0 to avoid 
duplicated errors.
Namely:

  ! Outputs the error twice ('Symbol ‘y’ present on multiple clauses')
  !$omp target has_device_addr(y) firstprivate(y)
  block; end block

 * * *

Additionally, I think it would be good to have besides 'target' + 
map/firstprivate (→ error)
also a testcase for 'target simd' + map/firstprivate → error

And I think also gives-no-error checks all combined 'target ...' that take 
firstprivate
should be added, cf. your own patch - possibly with looking at the original 
dump (scan-tree-dump)
to see that the clause is properly attached correctly. Example for 'target 
teams':

  !$omp target teams map(x) firstprivate(x)
  block; end block

(Works but no testcase.)

 * * *

The following is not diagnosed and gives an ICE:

!$omp target in_reduction(+: x) private(x)
  block; end block
end

The C testcase properly has:
  error: ‘x’ appears more than once in data-sharing clauses

Note: Using 'firstprivate' instead of 'private' shows the proper error also in 
Fortran.


The following does not ICE but does not make sense (and is rejected in C):

4 | #pragma omp target private(x) map(x)

vs.

  !$omp target map(x) private(x)
  block; end block

(The latter produces "#pragma omp target private(x.0) map(tofrom:*x.0)", ups!)

 * * *

I also note that 'simd' accepts private such that

#pragma omp target simd private(x) map(x)
 for (int i=0; i < 0; i++)
 ;

!$omp target simd map(x) private(x)
do i = 1, 0; end do

is valid. (It is accepted by gcc and gfortran, i.e. it just needs to be added 
as testcase.)

 * * *

I note that C rejects {map(x),firstprivate(x)} + 
{has_device_addr(x),is_device_ptr(x)}',
but gfortran + your patch accepts:

  !$omp target map(x) has_device_addr(x)
  !$omp target map(x) is_device_ptr(x)

while

  !$omp target firstprivate(x) has_device_addr(x)
  !$omp target firstprivate(x) is_device_ptr(x)

is rejected – showing the error message twice.

Expected: I think it should show an error in all four cases - but only once.


2022-12-06  Julian Brown  

gcc/fortran/
 PR fortran/107214
 * gfortran.h (gfc_symbol): Add data_mark, dev_mark, gen_mark and
 reduc_mark bitfields.
 * openmp.cc (resolve_omp_clauses): Use above bitfields to improve
 duplicate clause detection.

gcc/testsuite/
 PR fortran/107214
 * gfortran.dg/gomp/pr107214.f90: New test.


Thanks,

Tobias

-
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 
München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas 
Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht 
München, HRB 106955


Re: [PATCH 01/17] libgomp, nvptx: low-latency memory allocator

2022-12-08 Thread Jakub Jelinek via Gcc-patches
On Thu, Jul 07, 2022 at 11:34:32AM +0100, Andrew Stubbs wrote:
> libgomp/ChangeLog:
> 
>   * allocator.c (MEMSPACE_ALLOC): New macro.
>   (MEMSPACE_CALLOC): New macro.
>   (MEMSPACE_REALLOC): New macro.
>   (MEMSPACE_FREE): New macro.
>   (dynamic_smem_size): New constants.
>   (omp_alloc): Use MEMSPACE_ALLOC.
>   Implement fall-backs for predefined allocators.
>   (omp_free): Use MEMSPACE_FREE.
>   (omp_calloc): Use MEMSPACE_CALLOC.
>   Implement fall-backs for predefined allocators.
>   (omp_realloc): Use MEMSPACE_REALLOC and MEMSPACE_ALLOC..
>   Implement fall-backs for predefined allocators.
>   * config/nvptx/team.c (__nvptx_lowlat_heap_root): New variable.
>   (__nvptx_lowlat_pool): New asm varaible.
>   (gomp_nvptx_main): Initialize the low-latency heap.
>   * plugin/plugin-nvptx.c (lowlat_pool_size): New variable.
>   (GOMP_OFFLOAD_init_device): Read the GOMP_NVPTX_LOWLAT_POOL envvar.
>   (GOMP_OFFLOAD_run): Apply lowlat_pool_size.
>   * config/nvptx/allocator.c: New file.
>   * testsuite/libgomp.c/allocators-1.c: New test.
>   * testsuite/libgomp.c/allocators-2.c: New test.
>   * testsuite/libgomp.c/allocators-3.c: New test.
>   * testsuite/libgomp.c/allocators-4.c: New test.
>   * testsuite/libgomp.c/allocators-5.c: New test.
>   * testsuite/libgomp.c/allocators-6.c: New test.
> 
> co-authored-by: Kwok Cheung Yeung  

> +/* These macros may be overridden in config//allocator.c.  */
> +#ifndef MEMSPACE_ALLOC
> +#define MEMSPACE_ALLOC(MEMSPACE, SIZE) malloc (SIZE)
> +#endif

Rather than uglifying the sources with __attribute__((unused)) on the
memspace variables, wouldn't it be better to always use MEMSPACE?
So,
#define MEMSPACE_ALLOC(MEMSPACE, SIZE) malloc (((MEMSPACE), (SIZE)))
or so (similarly other macros)?

> +#ifndef MEMSPACE_CALLOC
> +#define MEMSPACE_CALLOC(MEMSPACE, SIZE) calloc (1, SIZE)
> +#endif
> +#ifndef MEMSPACE_REALLOC
> +#define MEMSPACE_REALLOC(MEMSPACE, ADDR, OLDSIZE, SIZE) realloc (ADDR, SIZE)
> +#endif
> +#ifndef MEMSPACE_FREE
> +#define MEMSPACE_FREE(MEMSPACE, ADDR, SIZE) free (ADDR)
> +#endif

> +/* Map the predefined allocators to the correct memory space.
> +   The index to this table is the omp_allocator_handle_t enum value.  */
> +static const omp_memspace_handle_t predefined_alloc_mapping[] = {
> +  omp_default_mem_space,   /* omp_null_allocator. */
> +  omp_default_mem_space,   /* omp_default_mem_alloc. */
> +  omp_large_cap_mem_space, /* omp_large_cap_mem_alloc. */
> +  omp_default_mem_space,   /* omp_const_mem_alloc. */

Shouldn't this be omp_const_mem_space ?
That is what the standard says and you need to handle it in MEMSPACE_ALLOC
etc. anyway because omp_init_allocator could be done with that memspace.

> +  omp_high_bw_mem_space,   /* omp_high_bw_mem_alloc. */
> +  omp_low_lat_mem_space,   /* omp_low_lat_mem_alloc. */
> +  omp_low_lat_mem_space,   /* omp_cgroup_mem_alloc. */
> +  omp_low_lat_mem_space,   /* omp_pteam_mem_alloc. */
> +  omp_low_lat_mem_space,   /* omp_thread_mem_alloc. */

The above 3 are implementation defined, so we can choose whatever we want.

> @@ -496,35 +530,38 @@ retry:
>return ret;
>  
>  fail:
> -  if (allocator_data)
> +  int fallback = (allocator_data
> +   ? allocator_data->fallback
> +   : allocator == omp_default_mem_alloc
> +   ? omp_atv_null_fb
> +   : omp_atv_default_mem_fb);

A label can be only followed by variable declaration in C2X (and in C++),
I think we should keep libgomp in C99 for the time being.
So, it should be
fail:;

> +   || (allocator_data
> +   && allocator_data->pool_size < ~(uintptr_t) 0)
> +   || !allocator_data)

This would be better written as:
  || allocator_data == NULL
  || allocator_data->pool_size < ~(uintptr_t) 0)

> @@ -766,35 +816,38 @@ retry:
>return ret;
>  
>  fail:
> -  if (allocator_data)
> +  int fallback = (allocator_data
> +   ? allocator_data->fallback
> +   : allocator == omp_default_mem_alloc
> +   ? omp_atv_null_fb
> +   : omp_atv_default_mem_fb);

See above.

> +   || (allocator_data
> +   && allocator_data->pool_size < ~(uintptr_t) 0)
> +   || !allocator_data)

And again.

> @@ -1073,35 +1139,38 @@ retry:
>return ret;
>  
>  fail:
> -  if (allocator_data)
> +  int fallback = (allocator_data

And again.

> +   || (allocator_data
> +   && allocator_data->pool_size < ~(uintptr_t) 0)
> +   || !allocator_data)

And again.

> --- /dev/null
> +++ b/libgomp/config/nvptx/allocator.c
> @@ -0,0 +1,370 @@
> +/* Copyright (C) 2021 Free Software Foundation, Inc.

-2022

> +static void *
> +nvptx_memspace_alloc (omp_memspace_handle_t memspace, size_t size)
> +{
> +  if (memspace == omp_low_lat_mem_space)
> +{
> +  char *shared_pool;
> +  asm ("cvta.shared.u64\t%0, __nvptx_lowlat_pool;" : "=r"(shared_pool));

Space between 

[PATCH] tree-optimization/99919 - bogus uninit diagnostic with bitfield guards

2022-12-08 Thread Richard Biener via Gcc-patches
For the testcase in this PR what fold-const.cc optimize_bit_field_compare
does to bitfield against constant compares is confusing the uninit
predicate analysis and it also makes SRA obfuscate instead of optimize
the code.  We've long had the opinion that those optimizations are
premature but we do not have any replacement for the more complicated
ones combining multiple bitfield tests.  The following disables mangling
the case of a single bitfield test against constants but preserving
the existing diagnostic and optimization to a compile-time determined
value.

This requires silencing a bogus uninit diagnostic in the Fortran
frontend which I've done in a minimal way, avoiding initializing
the 40 byte symbol_attribute structure.  There's several issues,
one is the flag_coarrays is a global variable likely not CSEd
to help the uninit predicate analysis, the other is us short-circuiting
the flag_coarray == GFC_FCOARRAY_LIB && lhs_caf_attr.codimension
accesses as both have no side-effects so the guard isn't effective.
If the frontend folks are happy with this I can localize both
lhs_caf_attr and rhs_caf_attr and copy out the two only flags
tested by the code instead of the somewhat incomplete approach in
the patch.  Any opinions here?

Bootstrapped on x86_64-unknown-linux-gnu, testing in progress.

OK for the fortran parts?

Thanks,
Richard.

PR tree-optimization/99919
* fold-const.cc (optimize_bit_field_compare): Disable
transforming the bitfield against constant compare optimization
if the result is not statically determinable.

gcc/fortran/
* trans-expr.cc (gfc_trans_assignment_1): Split out
lhs_codimension from lhs_caf_attr to avoid bogus uninit
diagnostics.

* gcc.dg/uninit-pr99919.c: New testcase.
---
 gcc/fold-const.cc | 37 +++
 gcc/fortran/trans-expr.cc |  6 +++--
 gcc/testsuite/gcc.dg/uninit-pr99919.c | 22 
 3 files changed, 30 insertions(+), 35 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/uninit-pr99919.c

diff --git a/gcc/fold-const.cc b/gcc/fold-const.cc
index cdfe3f50ae3..b72cc0a1d51 100644
--- a/gcc/fold-const.cc
+++ b/gcc/fold-const.cc
@@ -4559,7 +4559,6 @@ optimize_bit_field_compare (location_t loc, enum 
tree_code code,
 {
   poly_int64 plbitpos, plbitsize, rbitpos, rbitsize;
   HOST_WIDE_INT lbitpos, lbitsize, nbitpos, nbitsize;
-  tree type = TREE_TYPE (lhs);
   tree unsigned_type;
   int const_p = TREE_CODE (rhs) == INTEGER_CST;
   machine_mode lmode, rmode;
@@ -4667,13 +4666,7 @@ optimize_bit_field_compare (location_t loc, enum 
tree_code code,
 }
 
   /* Otherwise, we are handling the constant case.  See if the constant is too
- big for the field.  Warn and return a tree for 0 (false) if so.  We do
- this not only for its own sake, but to avoid having to test for this
- error case below.  If we didn't, we might generate wrong code.
-
- For unsigned fields, the constant shifted right by the field length should
- be all zero.  For signed fields, the high-order bits should agree with
- the sign bit.  */
+ big for the field.  Warn and return a tree for 0 (false) if so.  */
 
   if (lunsignedp)
 {
@@ -4695,31 +4688,9 @@ optimize_bit_field_compare (location_t loc, enum 
tree_code code,
}
 }
 
-  if (nbitpos < 0)
-return 0;
-
-  /* Single-bit compares should always be against zero.  */
-  if (lbitsize == 1 && ! integer_zerop (rhs))
-{
-  code = code == EQ_EXPR ? NE_EXPR : EQ_EXPR;
-  rhs = build_int_cst (type, 0);
-}
-
-  /* Make a new bitfield reference, shift the constant over the
- appropriate number of bits and mask it with the computed mask
- (in case this was a signed field).  If we changed it, make a new one.  */
-  lhs = make_bit_field_ref (loc, linner, lhs, unsigned_type,
-   nbitsize, nbitpos, 1, lreversep);
-
-  rhs = const_binop (BIT_AND_EXPR,
-const_binop (LSHIFT_EXPR,
- fold_convert_loc (loc, unsigned_type, rhs),
- size_int (lbitpos)),
-mask);
-
-  lhs = build2_loc (loc, code, compare_type,
-   build2 (BIT_AND_EXPR, unsigned_type, lhs, mask), rhs);
-  return lhs;
+  /* Otherwise do not prematurely optimize compares of bitfield members
+ to constants.  */
+  return 0;
 }
 
 /* Subroutine for fold_truth_andor_1: decode a field reference.
diff --git a/gcc/fortran/trans-expr.cc b/gcc/fortran/trans-expr.cc
index b95c5cf2f96..12c7dd7f26a 100644
--- a/gcc/fortran/trans-expr.cc
+++ b/gcc/fortran/trans-expr.cc
@@ -11654,9 +11654,11 @@ gfc_trans_assignment_1 (gfc_expr * expr1, gfc_expr * 
expr2, bool init_flag,
 
   /* Only analyze the expressions for coarray properties, when in coarray-lib
  mode.  */
+  bool lhs_codimension = false;
   if (flag_coarray == GFC_FCOARRAY_LIB)
 {
   lhs_caf_attr = gfc_caf_attr (expr1, 

[PATCH] expr.cc: avoid unexpected side effects in expand_expr_divmod optimization

2022-12-08 Thread Jose E. Marchesi via Gcc-patches
The expand_expr_divmod function in expr.cc attempts to optimize cases
where both arguments of a division/modulus are known to be positive
when interpreted as signed.  In these cases, both signed division and
unsigned division will raise the same value, and therefore the
cheapest option can be used.

In order to determine what is the cheaper option in the current
target, expand_expr_divmod actually expands both a signed divmod and
an unsigned divmod using local "sequences":

  start_sequence ();
  ...
  expand_divmod (... signed ...);
  ...
  end_sequence ();

  start_sequence ();
  ...
  expand_divmod (... unsigned ...);
  ...
  end_sequence ();

And then compares the cost of each generated sequence, choosing the
best one.  Finally, it emits the selected expanded sequence and
returns the rtx with the result.

This approach has a caveat.  Some targets do not provide instructions
for division/modulus instructions.  In the case of BPF, it provides
unsigned division/modulus, but not signed division/modulus.

In these cases, the expand_divmod tries can contain calls to funcalls.
For example, in BPF:

  start_sequence ();
  ...
  expand_divmod (... signed ...); -> This generates funcall to __divdi3
  ...
  end_sequence ();

  start_sequence ();
  ...
  expand_divmod (... unsigned ...); -> This generates direct `div' insn.
  ...
  end_sequence ();

The problem is that when a funcall is expanded, an accompanying global
symbol definition is written in the output stream:

  .global __divdi3

And this symbol definition remains in the compiled assembly file, even
if the sequence using the direct `div' instruction above is used.

This is particularly bad in BPF, because the kernel bpf loader chokes
on the spurious symbol __divdi3 and makes the resulting BPF object
unloadable (note that BPF objects are not linked before processed by
the kernel.)

In order to fix this, this patch modifies expand_expr_divmod in the
following way:

- When trying each sequence (signed, unsigned) the expand_divmod calls
  are told to _not_ use libcalls if everything else fails.  This is
  done by passing OPTAB_WIDEN as the `methods' argument.  (Before it
  was using the default value OPTAB_LIB_WIDEN.)

- If any of the tried expanded sequences contain a funcall, then the
  optimization is not attempted.

A couple of BPF tests are also added to make sure this doesn't break
at any point in the future.

Tested in bpf-unknown-none and x86_64-linux-gnu.
Regtested in x86_64-linux-gnu.  No regressions.

gcc/ChangeLog

* expr.cc (expand_expr_divmod): Avoid side-effects of trying
sequences involving funcalls in optimization.

gcc/testsuite/ChangeLog:

* gcc.target/bpf/divmod-funcall-1.c: New test.
* gcc.target/bpf/divmod-funcall-2.c: Likewise.
---
 gcc/expr.cc   | 44 +++
 .../gcc.target/bpf/divmod-funcall-1.c |  8 
 .../gcc.target/bpf/divmod-funcall-2.c |  8 
 3 files changed, 41 insertions(+), 19 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/bpf/divmod-funcall-1.c
 create mode 100644 gcc/testsuite/gcc.target/bpf/divmod-funcall-2.c

diff --git a/gcc/expr.cc b/gcc/expr.cc
index d9407432ea5..4d4be5d7bda 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -9168,32 +9168,38 @@ expand_expr_divmod (tree_code code, machine_mode mode, 
tree treeop0,
   do_pending_stack_adjust ();
   start_sequence ();
   rtx uns_ret = expand_divmod (mod_p, code, mode, treeop0, treeop1,
-  op0, op1, target, 1);
+  op0, op1, target, 1, OPTAB_WIDEN);
   rtx_insn *uns_insns = get_insns ();
   end_sequence ();
   start_sequence ();
   rtx sgn_ret = expand_divmod (mod_p, code, mode, treeop0, treeop1,
-  op0, op1, target, 0);
+  op0, op1, target, 0, OPTAB_WIDEN);
   rtx_insn *sgn_insns = get_insns ();
   end_sequence ();
-  unsigned uns_cost = seq_cost (uns_insns, speed_p);
-  unsigned sgn_cost = seq_cost (sgn_insns, speed_p);
 
-  /* If costs are the same then use as tie breaker the other other
-factor.  */
-  if (uns_cost == sgn_cost)
-   {
- uns_cost = seq_cost (uns_insns, !speed_p);
- sgn_cost = seq_cost (sgn_insns, !speed_p);
-   }
-
-  if (uns_cost < sgn_cost || (uns_cost == sgn_cost && unsignedp))
-   {
- emit_insn (uns_insns);
- return uns_ret;
-   }
-  emit_insn (sgn_insns);
-  return sgn_ret;
+  /* Do not try to optimize if any of the sequences tried above
+ resulted in a funcall.  */
+  if (uns_ret && sgn_ret)
+{
+  unsigned uns_cost = seq_cost (uns_insns, speed_p);
+  unsigned sgn_cost = seq_cost (sgn_insns, speed_p);
+
+  /* If costs are the same then use as tie breaker the other
+ other factor.  */
+  if (uns_cost == sgn_cost)
+{
+

[PATCH] tree-optimization/107699 - missed _M_elems + _1 != _M_elems folding

2022-12-08 Thread Richard Biener via Gcc-patches
The following addresses a missed folding noticed in PR107699 that can
be fixed amending the existing  + a !=  + b pattern to also handle
the case of only one side having a pointer plus.  I'm moving the
patterns next to related simpifications showing there'd be an existing
pattern matching this if it were not gated with an explicit single_use
constraint.  Note the new pattern also handles  + a != , but
this hints at some unification / generalization opportunities here.

Bootstrapped and tested on x86_64-unknown-linux-gnu, pushed.

PR tree-optimization/107699
* match.pd ( !=/==  + c -> ( - ) !=/== c): New
pattern variant.

* gcc.dg/tree-ssa/pr107699.c: New testcase.
---
 gcc/match.pd | 21 +
 gcc/testsuite/gcc.dg/tree-ssa/pr107699.c | 15 +++
 2 files changed, 28 insertions(+), 8 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr107699.c

diff --git a/gcc/match.pd b/gcc/match.pd
index f48cbd9b73b..127cef9a610 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -2260,6 +2260,19 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
&& (CONSTANT_CLASS_P (@1) || (single_use (@2) && single_use (@3
(op @1 { build_zero_cst (TREE_TYPE (@1)); }
 
+/* ( + b) !=/== ([1] + c) -> ([0] - [1]) + b !=/== c */
+(for neeq (ne eq)
+ (simplify
+  (neeq:c ADDR_EXPR@0 (pointer_plus ADDR_EXPR@2 @3))
+   (with { poly_int64 diff; tree inner_type = TREE_TYPE (@3);}
+(if (ptr_difference_const (@0, @2, ))
+ (neeq { build_int_cst_type (inner_type, diff); } @3
+ (simplify
+  (neeq (pointer_plus ADDR_EXPR@0 @1) (pointer_plus ADDR_EXPR@2 @3))
+   (with { poly_int64 diff; tree inner_type = TREE_TYPE (@1);}
+(if (ptr_difference_const (@0, @2, ))
+ (neeq (plus { build_int_cst_type (inner_type, diff); } @1) @3)
+
 /* X - Y < X is the same as Y > 0 when there is no overflow.
For equality, this is also true with wrapping overflow.  */
 (for op (simple_comparison)
@@ -2439,14 +2452,6 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
(if (ptr_difference_const (@0, @2, ))
 (plus { build_int_cst_type (type, diff); } (convert (minus @1 @3))
 
-/* (+b) !=/== ([1] + c) ->  sizeof(a[0]) + b !=/== c */
-(for neeq (ne eq)
- (simplify
-  (neeq (pointer_plus ADDR_EXPR@0 @1) (pointer_plus ADDR_EXPR@2 @3))
-   (with { poly_int64 diff; tree inner_type = TREE_TYPE (@1);}
-(if (ptr_difference_const (@0, @2, ))
- (neeq (plus { build_int_cst_type (inner_type, diff); } @1) @3)
-
 /* Canonicalize (T *)(ptr - ptr-cst) to [ptr + -ptr-cst].  */
 (simplify
  (convert (pointer_diff @0 INTEGER_CST@1))
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr107699.c 
b/gcc/testsuite/gcc.dg/tree-ssa/pr107699.c
new file mode 100644
index 000..4bf864dfd72
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr107699.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-O -fdump-tree-forwprop1" } */
+
+struct { int data[16]; } x;
+
+int foo (int n)
+{
+  int *p = x.data + n;
+  /* Should simplify this to n * 4 != 0.  */
+  if ((void *) != (void *)p)
+return 1;
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump " != 0" "forwprop1" } } */
-- 
2.35.3


Re: [PATCH] i386: Add *concat3_{5, 6, 7} patterns [PR107627]

2022-12-08 Thread Uros Bizjak via Gcc-patches
On Thu, Dec 8, 2022 at 10:49 AM Jakub Jelinek  wrote:
>
> Hi!
>
> On Thu, Dec 01, 2022 at 09:09:51AM +0100, Jakub Jelinek via Gcc-patches wrote:
> > BTW, I wonder if we couldn't add additional patterns which would catch
> > the case where one of the operands is constant.
>
> The following patch does add those.
> The difference with the patch on the 2 testcases is:
>  baz:
> -   movq8(%rsi), %rax
> +   movq8(%rsi), %rsi
> +   movq%rdi, %r8
> movl%edx, %ecx
> -   xorl%r8d, %r8d
> -   xorl%edx, %edx
> -   movabsq $-2401053089206453570, %r9
> -   orq %r8, %rax
> -   orq %r9, %rdx
> -   shrdq   %rdx, %rax
> -   movq%rax, (%rdi)
> +   movabsq $-2401053089206453570, %rdi
> +   movq%rsi, %rax
> +   shrdq   %rdi, %rax
> +   movq%rax, (%r8)
>  qux:
> -   movq(%rsi), %rax
> +   movq%rdi, %r8
> +   movq(%rsi), %rdi
> movl%edx, %ecx
> -   xorl%r9d, %r9d
> -   movabsq $-2401053089206453570, %r8
> -   movq%rax, %rdx
> -   xorl%eax, %eax
> -   orq %r8, %rax
> -   orq %r9, %rdx
> -   shrdq   %rdx, %rax
> -   movq%rax, (%rdi)
> +   movabsq $-2401053089206453570, %rsi
> +   movq%rsi, %rax
> +   shrdq   %rdi, %rax
> +   movq%rax, (%r8)
> and
>  garply:
> pushl   %esi
> -   xorl%edx, %edx
> +   movl$-559038737, %esi
> pushl   %ebx
> movl16(%esp), %eax
> -   orl $-559038737, %edx
> movl20(%esp), %ecx
> -   movl4(%eax), %eax
> -   shrdl   %edx, %eax
> movl12(%esp), %edx
> +   movl4(%eax), %ebx
> +   movl%ebx, %eax
> +   shrdl   %esi, %eax
>  fred:
> ...
> movl16(%esp), %eax
> +   movl$-889275714, %ebx
> movl20(%esp), %ecx
> -   movl(%eax), %eax
> -   movl%eax, %edx
> -   movl$0, %eax
> -   orl $-889275714, %eax
> -   shrdl   %edx, %eax
> movl12(%esp), %edx
> +   movl(%eax), %esi
> +   movl%ebx, %eax
> +   shrdl   %esi, %eax
>
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
>
> 2022-12-08  Jakub Jelinek  
>
> PR target/107627
> * config/i386/i386.md (HALF, half): New mode attributes.
> (*concat3_5, *concat3_6,
> *concat3_7): New define_insn_and_split patterns.
>
> * gcc.target/i386/pr107627-3.c: New test.
> * gcc.target/i386/pr107627-4.c: New test.

LGTM.

Thanks,
Uros.

> --- gcc/config/i386/i386.md.jj  2022-12-06 12:16:31.059905487 +0100
> +++ gcc/config/i386/i386.md 2022-12-07 15:11:55.297914206 +0100
> @@ -1134,6 +1134,10 @@ (define_mode_attr MODE_SIZE [(QI "1") (H
>  (define_mode_attr DWI [(QI "HI") (HI "SI") (SI "DI") (DI "TI") (TI "OI")])
>  (define_mode_attr dwi [(QI "hi") (HI "si") (SI "di") (DI "ti") (TI "oi")])
>
> +;; Half sized integer modes.
> +(define_mode_attr HALF [(TI "DI") (DI "SI")])
> +(define_mode_attr half [(TI "di") (DI "si")])
> +
>  ;; LEA mode corresponding to an integer mode
>  (define_mode_attr LEAMODE [(QI "SI") (HI "SI") (SI "SI") (DI "DI")])
>
> @@ -11464,6 +11468,80 @@ (define_insn_and_split "*concatsplit_double_concat (mode, operands[0], operands[1], operands[2]);
>DONE;
>  })
> +
> +(define_insn_and_split "*concat3_5"
> +  [(set (match_operand:DWI 0 "nonimmediate_operand" "=ro")
> +   (any_or_plus:DWI
> + (ashift:DWI (match_operand:DWI 1 "register_operand" "r")
> + (match_operand:DWI 2 "const_int_operand"))
> + (match_operand:DWI 3 "const_scalar_int_operand")))]
> +  "INTVAL (operands[2]) ==  * BITS_PER_UNIT / 2
> +   && (mode == DImode
> +   ? CONST_INT_P (operands[3])
> +&& (UINTVAL (operands[3]) & ~GET_MODE_MASK (SImode)) == 0
> +   : CONST_INT_P (operands[3])
> +   ? INTVAL (operands[3]) >= 0
> +   : CONST_WIDE_INT_NUNITS (operands[3]) == 2
> +&& CONST_WIDE_INT_ELT (operands[3], 1) == 0)"
> +  "#"
> +  "&& reload_completed"
> +  [(clobber (const_int 0))]
> +{
> +  rtx op3 = simplify_subreg (mode, operands[3], mode, 0);
> +  split_double_concat (mode, operands[0], op3,
> +  gen_lowpart (mode, operands[1]));
> +  DONE;
> +})
> +
> +(define_insn_and_split "*concat3_6"
> +  [(set (match_operand: 0 "nonimmediate_operand" "=ro,r")
> +   (any_or_plus:
> + (ashift:
> +   (zero_extend:
> + (match_operand:DWIH 1 "nonimmediate_operand" "r,m"))
> +   (match_operand: 2 "const_int_operand"))
> + (match_operand: 3 "const_scalar_int_operand")))]
> +  "INTVAL (operands[2]) ==  * BITS_PER_UNIT
> +   && (mode == DImode
> +   ? CONST_INT_P (operands[3])
> +&& (UINTVAL (operands[3]) & ~GET_MODE_MASK (SImode)) == 0
> +   : CONST_INT_P (operands[3])
> +   ? INTVAL (operands[3]) >= 0
> +   : CONST_WIDE_INT_NUNITS (operands[3]) == 2
> +&& 

Re: [PATCH] [PR102706] [testsuite] -Wno-stringop-overflow vs Warray-bounds

2022-12-08 Thread Alexandre Oliva via Gcc-patches
On Dec  3, 2022, Richard Biener  wrote:

>> On riscv64-elf and arm-eabi/-mcpu=cortex-r5, for example, though the
>> Warray-bounds-48.c condition passes, we don't issue warnings because
>> we decide not to vectorize the assignments.

> If it’s cost can you try-fno-vect-cost-model? If that works it might
> be better? Otherwise OK.

I tried it, and learned it wasn't about costs after all, so I've
adjusted the commit message to reflect this finding.

I also caught a mistake in the paragraph quoted above from the commit
message.  We *do* vectorize on ARM, but don't issue the expected bogus
warning.


Here's what I'm checking in, thanks!


[PR102706] [testsuite] -Wno-stringop-overflow vs Warray-bounds

The bogus Wstringop-overflow warnings conditionally issued for
Warray-bounds-48.c and -Wzero-length-array-bounds-2.c are expected
under conditions that depend on the availability of certain vector
patterns, but that don't seem to model the conditions under which the
warnings are expected.

On riscv64-elf and arm-eabi/-mcpu=cortex-r5, for example, though the
Warray-bounds-48.c condition passes, we don't issue warnings.  On
riscv64-elf, we decide not to vectorize the assignments; on cortex-r5,
we do vectorize pairs of assignments, but that doesn't yield the
expected warning, even though assignments that should trigger the
bogus warning are vectorized and associated with the earlier line
where the bogus warning would be expected.

On riscv64, for Wzero-length-array-bounds-2.c, we issue the expected
warning in test_C_global_buf, but we also issue a warning for
test_C_local_buf under the same conditions, that would be expected on
other platforms but that is not issued on them.  On
arm-eabi/-mcpu=cortex-r5, the condition passes so we'd expect the
warning in both functions, but we don't warn on either.

Instead of further extending the effective target tests, introduced to
temporarily tolerate these expected bogus warnings, so as to capture
the vectorizer analyses that lead to the mismatched decisions, I'm
disabling the undesired warnings for these two tests.


for  gcc/testsuite/ChangeLog

PR tree-optimization/102706
* gcc.dg/Warray-bounds-48.c: Disable -Wstringop-overflow.
* gcc.dg/Wzero-length-array-bounds-2.c: Likewise.
---
 gcc/testsuite/gcc.dg/Warray-bounds-48.c|   11 +--
 gcc/testsuite/gcc.dg/Wzero-length-array-bounds-2.c |   11 +--
 2 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/Warray-bounds-48.c 
b/gcc/testsuite/gcc.dg/Warray-bounds-48.c
index 775b301e37537..e9203140a274a 100644
--- a/gcc/testsuite/gcc.dg/Warray-bounds-48.c
+++ b/gcc/testsuite/gcc.dg/Warray-bounds-48.c
@@ -4,6 +4,11 @@
{ dg-options "-O2 -Wall" }
{ dg-require-effective-target alloca } */
 
+/* pr102706: disabled warnings because the now-disabled conditions for the
+   bogus warnings to come up do not take cost analysis into account, and often
+   come up wrong.  */
+/* { dg-additional-options "-Wno-stringop-overflow" } */
+
 typedef __INT16_TYPE__ int16_t;
 typedef __INT32_TYPE__ int32_t;
 
@@ -30,7 +35,8 @@ static void nowarn_ax_extern (struct AX *p)
 
 static void warn_ax_local_buf (struct AX *p)
 {
-  p->ax[0] = 4; p->ax[1] = 5;  // { dg-warning "\\\[-Wstringop-overflow" 
"pr102706" { target { vect_slp_v2hi_store_align &&  { ! 
vect_slp_v4hi_store_unalign } } } }
+  p->ax[0] = 4; p->ax[1] = 5;  // { dg-bogus "\\\[-Wstringop-overflow" 
"pr102706" }
+   //   { xfail { vect_slp_v2hi_store_align &&  { 
! vect_slp_v4hi_store_unalign } } }
 
   p->ax[2] = 6; // { dg-warning "\\\[-Warray-bounds" }
   p->ax[3] = 7; // { dg-warning "\\\[-Warray-bounds" }
@@ -130,7 +136,8 @@ static void warn_a0_extern (struct A0 *p)
 
 static void warn_a0_local_buf (struct A0 *p)
 {
-  p->a0[0] = 4; p->a0[1] = 5;  // { dg-warning "\\\[-Wstringop-overflow" 
"pr102706" { target { vect_slp_v2hi_store_align && { ! 
vect_slp_v4hi_store_unalign } } } }
+  p->a0[0] = 4; p->a0[1] = 5;  // { dg-bogus "\\\[-Wstringop-overflow" 
"pr102706" }
+   //   { xfail { vect_slp_v2hi_store_align && { ! 
vect_slp_v4hi_store_unalign } } }
 
   p->a0[2] = 6; // { dg-warning "\\\[-Warray-bounds" }
   p->a0[3] = 7; // { dg-warning "\\\[-Warray-bounds" }
diff --git a/gcc/testsuite/gcc.dg/Wzero-length-array-bounds-2.c 
b/gcc/testsuite/gcc.dg/Wzero-length-array-bounds-2.c
index 2ef5ccd564ac4..19932d05a315f 100644
--- a/gcc/testsuite/gcc.dg/Wzero-length-array-bounds-2.c
+++ b/gcc/testsuite/gcc.dg/Wzero-length-array-bounds-2.c
@@ -4,6 +4,11 @@
{ dg-do compile }
{ dg-options "-O2 -Wall" } */
 
+/* pr102706: disabled warnings because the now-disabled conditions for the
+   bogus warnings to come up do not take cost analysis into account, and often
+   come up wrong.  */
+/* { dg-additional-options "-Wno-stringop-overflow" } */
+
 void sink (void*);
 
 struct A { int i; };
@@ -87,7 +92,8 @@ void test_C_global_buf (void)
   

Re: [PATCH trunk] [PR104308] [analyzer] handle memmove like memcpy

2022-12-08 Thread Alexandre Oliva via Gcc-patches
Hello again, David,

On Dec  2, 2022, David Malcolm  wrote:

> I had a go at porting your patch to trunk; here's the result.

Oh, wow, nice!  Thank you so much.

I confirm it works on riscv64-elf too.

-- 
Alexandre Oliva, happy hackerhttps://FSFLA.org/blogs/lxo/
   Free Software Activist   GNU Toolchain Engineer
Disinformation flourishes because many people care deeply about injustice
but very few check the facts.  Ask me about 


Re: [PATCH] cfgbuild: Fix DEBUG_INSN handling in find_bb_boundaries [PR106719]

2022-12-08 Thread Richard Biener via Gcc-patches
On Thu, Dec 8, 2022 at 11:12 AM Jakub Jelinek via Gcc-patches
 wrote:
>
> Hi!
>
> The following testcase FAILs on aarch64-linux.  We have some atomic
> instruction followed by 2 DEBUG_INSNs (if -g only of course) followed
> by NOTE_INSN_EPILOGUE_BEG followed by some USE insn.
> Now, split3 pass replaces the atomic instruction with a code sequence
> which ends with a conditional jump and the split3 pass calls
> find_many_sub_basic_blocks.
> For -g0, find_bb_boundaries sees the flow_transfer_insn (the new conditional
> jump), then NOTE_INSN_EPILOGUE_BEG which can live in between basic blocks
> and then the USE insn, so splits block after the NOTE_INSN_EPILOGUE_BEG
> and puts the NOTE in between the blocks.
> For -g, if sees a DEBUG_INSN after the flow_transfer_insn, so sets
> debug_insn to it, then walks over another DEBUG_INSN, NOTE_INSN_EPILOGUE_BEG
> until it finally sees the USE insn, and triggers the:
>   rtx_insn *prev = PREV_INSN (insn);
>
>   /* If the first non-debug inside_basic_block_p insn after a control
>  flow transfer is not a label, split the block before the debug
>  insn instead of before the non-debug insn, so that the debug
>  insns are not lost.  */
>   if (debug_insn && code != CODE_LABEL && code != BARRIER)
> prev = PREV_INSN (debug_insn);
> code I've added for PR81325.  If there are only DEBUG_INSNs, that is
> the right thing to do, but if in between debug_insn and insn there are
> notes which can stay in between basic blocks or simnilarly JUMP_TABLE_DATA
> or their associated CODE_LABELs, it causes -fcompare-debug differences.
>
> The following patch fixes it by clearing debug_insn if JUMP_TABLE_DATA
> or associated CODE_LABEL is seen (I'm afraid there is no good answer
> what to do with DEBUG_INSNs before those; the code then removes them:
>   /* Clean up the bb field for the insns between the blocks.  */
>   for (x = NEXT_INSN (flow_transfer_insn);
>x != BB_HEAD (fallthru->dest);
>x = next)
> {
>   next = NEXT_INSN (x);
>   /* Debug insns should not be in between basic blocks,
>  drop them on the floor.  */
>   if (DEBUG_INSN_P (x))
> delete_insn (x);
>   else if (!BARRIER_P (x))
> set_block_for_insn (x, NULL);
> }
> but if there are NOTEs, the patch just reorders the NOTEs and DEBUG_INSNs,
> such that the NOTEs come first (so that they stay in between basic blocks
> like with -g0) and DEBUG_INSNs after those (so that bb is split before
> them, so they will be in the basic block after NOTE_INSN_BASIC_BLOCK).
>
> Bootstrapped/regtested on x86_64-linux and i686-linux plus tested on
> the testcase in a cross to aarch64-linux, ok for trunk?

OK.

Thanks,
Richard.

> 2022-12-08  Jakub Jelinek  
>
> PR debug/106719
> * cfgbuild.cc (find_bb_boundaries): If there are NOTEs in between
> debug_insn (seen after flow_transfer_insn) and insn, move NOTEs
> before all the DEBUG_INSNs and split after NOTEs.  If there are
> other insns like jump table data, clear debug_insn.
>
> * gcc.dg/pr106719.c: New test.
>
> --- gcc/cfgbuild.cc.jj  2022-01-18 11:58:58.944991171 +0100
> +++ gcc/cfgbuild.cc 2022-12-07 21:36:27.493363173 +0100
> @@ -445,6 +445,7 @@ find_bb_boundaries (basic_block bb)
>rtx_insn *debug_insn = NULL;
>edge fallthru = NULL;
>bool skip_purge;
> +  bool seen_note_after_debug = false;
>
>if (insn == end)
>  return;
> @@ -492,7 +493,10 @@ find_bb_boundaries (basic_block bb)
>if (code == DEBUG_INSN)
> {
>   if (flow_transfer_insn && !debug_insn)
> -   debug_insn = insn;
> +   {
> + debug_insn = insn;
> + seen_note_after_debug = false;
> +   }
> }
>/* In case we've previously seen an insn that effects a control
>  flow transfer, split the block.  */
> @@ -506,7 +510,40 @@ find_bb_boundaries (basic_block bb)
>  insn instead of before the non-debug insn, so that the debug
>  insns are not lost.  */
>   if (debug_insn && code != CODE_LABEL && code != BARRIER)
> -   prev = PREV_INSN (debug_insn);
> +   {
> + prev = PREV_INSN (debug_insn);
> + if (seen_note_after_debug)
> +   {
> + /* Though, if there are NOTEs intermixed with DEBUG_INSNs,
> +move the NOTEs before the DEBUG_INSNs and split after
> +the last NOTE.  */
> + rtx_insn *first = NULL, *last = NULL;
> + for (x = debug_insn; x != insn; x = NEXT_INSN (x))
> +   {
> + if (NOTE_P (x))
> +   {
> + if (first == NULL)
> +   

Re: [PATCH 1/2] OpenMP/Fortran: Combined directives with map/firstprivate of same symbol

2022-12-08 Thread Tobias Burnus

On 07.12.22 20:09, Julian Brown wrote:

On Wed, 26 Oct 2022 12:39:39 +0200
Tobias Burnus  wrote:

The ICE seems to be because gcc/fortran/trans-openmp.cc's
gfc_split_omp_clauses mishandles this as the dump shows the following:

#pragma omp target firstprivate(a) map(tofrom:a)
  #pragma omp parallel firstprivate(a)

In contrast, for the C testcase:

#pragma omp target parallel for simd map(x) firstprivate(x)

the dump is as follows, which seems to be sensible:

#pragma omp target map(tofrom:x)
  #pragma omp parallel firstprivate(x)

This patch fixes a case where a combined directive (e.g. "!$omp target
parallel ...") contains both a map and a firstprivate clause for the
same variable.  When the combined directive is split into two nested
directives, the outer "target" gets the "map" clause, and the inner
"parallel" gets the "firstprivate" clause, like so:

...

This is not a recent regression, but appears to fix a long-standing ICE.

...

gcc/fortran/
 * trans-openmp.cc (gfc_add_firstprivate_if_unmapped): New function.
 (gfc_split_omp_clauses): Call above.

libgomp/
 * testsuite/libgomp.fortran/combined-directive-splitting-1.f90: New
 test.


LGTM – thanks!

Tobias

-
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 
München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas 
Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht 
München, HRB 106955


[PATCH] cfgbuild: Fix DEBUG_INSN handling in find_bb_boundaries [PR106719]

2022-12-08 Thread Jakub Jelinek via Gcc-patches
Hi!

The following testcase FAILs on aarch64-linux.  We have some atomic
instruction followed by 2 DEBUG_INSNs (if -g only of course) followed
by NOTE_INSN_EPILOGUE_BEG followed by some USE insn.
Now, split3 pass replaces the atomic instruction with a code sequence
which ends with a conditional jump and the split3 pass calls
find_many_sub_basic_blocks.
For -g0, find_bb_boundaries sees the flow_transfer_insn (the new conditional
jump), then NOTE_INSN_EPILOGUE_BEG which can live in between basic blocks
and then the USE insn, so splits block after the NOTE_INSN_EPILOGUE_BEG
and puts the NOTE in between the blocks.
For -g, if sees a DEBUG_INSN after the flow_transfer_insn, so sets
debug_insn to it, then walks over another DEBUG_INSN, NOTE_INSN_EPILOGUE_BEG
until it finally sees the USE insn, and triggers the:
  rtx_insn *prev = PREV_INSN (insn);

  /* If the first non-debug inside_basic_block_p insn after a control
 flow transfer is not a label, split the block before the debug
 insn instead of before the non-debug insn, so that the debug
 insns are not lost.  */
  if (debug_insn && code != CODE_LABEL && code != BARRIER)
prev = PREV_INSN (debug_insn);
code I've added for PR81325.  If there are only DEBUG_INSNs, that is
the right thing to do, but if in between debug_insn and insn there are
notes which can stay in between basic blocks or simnilarly JUMP_TABLE_DATA
or their associated CODE_LABELs, it causes -fcompare-debug differences.

The following patch fixes it by clearing debug_insn if JUMP_TABLE_DATA
or associated CODE_LABEL is seen (I'm afraid there is no good answer
what to do with DEBUG_INSNs before those; the code then removes them:
  /* Clean up the bb field for the insns between the blocks.  */
  for (x = NEXT_INSN (flow_transfer_insn);
   x != BB_HEAD (fallthru->dest);
   x = next)
{
  next = NEXT_INSN (x);
  /* Debug insns should not be in between basic blocks,
 drop them on the floor.  */
  if (DEBUG_INSN_P (x))
delete_insn (x);
  else if (!BARRIER_P (x))
set_block_for_insn (x, NULL);
}
but if there are NOTEs, the patch just reorders the NOTEs and DEBUG_INSNs,
such that the NOTEs come first (so that they stay in between basic blocks
like with -g0) and DEBUG_INSNs after those (so that bb is split before
them, so they will be in the basic block after NOTE_INSN_BASIC_BLOCK).

Bootstrapped/regtested on x86_64-linux and i686-linux plus tested on
the testcase in a cross to aarch64-linux, ok for trunk?

2022-12-08  Jakub Jelinek  

PR debug/106719
* cfgbuild.cc (find_bb_boundaries): If there are NOTEs in between
debug_insn (seen after flow_transfer_insn) and insn, move NOTEs
before all the DEBUG_INSNs and split after NOTEs.  If there are
other insns like jump table data, clear debug_insn.

* gcc.dg/pr106719.c: New test.

--- gcc/cfgbuild.cc.jj  2022-01-18 11:58:58.944991171 +0100
+++ gcc/cfgbuild.cc 2022-12-07 21:36:27.493363173 +0100
@@ -445,6 +445,7 @@ find_bb_boundaries (basic_block bb)
   rtx_insn *debug_insn = NULL;
   edge fallthru = NULL;
   bool skip_purge;
+  bool seen_note_after_debug = false;
 
   if (insn == end)
 return;
@@ -492,7 +493,10 @@ find_bb_boundaries (basic_block bb)
   if (code == DEBUG_INSN)
{
  if (flow_transfer_insn && !debug_insn)
-   debug_insn = insn;
+   {
+ debug_insn = insn;
+ seen_note_after_debug = false;
+   }
}
   /* In case we've previously seen an insn that effects a control
 flow transfer, split the block.  */
@@ -506,7 +510,40 @@ find_bb_boundaries (basic_block bb)
 insn instead of before the non-debug insn, so that the debug
 insns are not lost.  */
  if (debug_insn && code != CODE_LABEL && code != BARRIER)
-   prev = PREV_INSN (debug_insn);
+   {
+ prev = PREV_INSN (debug_insn);
+ if (seen_note_after_debug)
+   {
+ /* Though, if there are NOTEs intermixed with DEBUG_INSNs,
+move the NOTEs before the DEBUG_INSNs and split after
+the last NOTE.  */
+ rtx_insn *first = NULL, *last = NULL;
+ for (x = debug_insn; x != insn; x = NEXT_INSN (x))
+   {
+ if (NOTE_P (x))
+   {
+ if (first == NULL)
+   first = x;
+ last = x;
+   }
+ else
+   {
+ gcc_assert (DEBUG_INSN_P (x));
+ if (first)
+   {
+ 

[PATCH] i386: Add *concat3_{5,6,7} patterns [PR107627]

2022-12-08 Thread Jakub Jelinek via Gcc-patches
Hi!

On Thu, Dec 01, 2022 at 09:09:51AM +0100, Jakub Jelinek via Gcc-patches wrote:
> BTW, I wonder if we couldn't add additional patterns which would catch
> the case where one of the operands is constant.

The following patch does add those.
The difference with the patch on the 2 testcases is:
 baz:
-   movq8(%rsi), %rax
+   movq8(%rsi), %rsi
+   movq%rdi, %r8
movl%edx, %ecx
-   xorl%r8d, %r8d
-   xorl%edx, %edx
-   movabsq $-2401053089206453570, %r9
-   orq %r8, %rax
-   orq %r9, %rdx
-   shrdq   %rdx, %rax
-   movq%rax, (%rdi)
+   movabsq $-2401053089206453570, %rdi
+   movq%rsi, %rax
+   shrdq   %rdi, %rax
+   movq%rax, (%r8)
 qux:
-   movq(%rsi), %rax
+   movq%rdi, %r8
+   movq(%rsi), %rdi
movl%edx, %ecx
-   xorl%r9d, %r9d
-   movabsq $-2401053089206453570, %r8
-   movq%rax, %rdx
-   xorl%eax, %eax
-   orq %r8, %rax
-   orq %r9, %rdx
-   shrdq   %rdx, %rax
-   movq%rax, (%rdi)
+   movabsq $-2401053089206453570, %rsi
+   movq%rsi, %rax
+   shrdq   %rdi, %rax
+   movq%rax, (%r8)
and
 garply:
pushl   %esi
-   xorl%edx, %edx
+   movl$-559038737, %esi
pushl   %ebx
movl16(%esp), %eax
-   orl $-559038737, %edx
movl20(%esp), %ecx
-   movl4(%eax), %eax
-   shrdl   %edx, %eax
movl12(%esp), %edx
+   movl4(%eax), %ebx
+   movl%ebx, %eax
+   shrdl   %esi, %eax
 fred:
...
movl16(%esp), %eax
+   movl$-889275714, %ebx
movl20(%esp), %ecx
-   movl(%eax), %eax
-   movl%eax, %edx
-   movl$0, %eax
-   orl $-889275714, %eax
-   shrdl   %edx, %eax
movl12(%esp), %edx
+   movl(%eax), %esi
+   movl%ebx, %eax
+   shrdl   %esi, %eax

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2022-12-08  Jakub Jelinek  

PR target/107627
* config/i386/i386.md (HALF, half): New mode attributes.
(*concat3_5, *concat3_6,
*concat3_7): New define_insn_and_split patterns.

* gcc.target/i386/pr107627-3.c: New test.
* gcc.target/i386/pr107627-4.c: New test.

--- gcc/config/i386/i386.md.jj  2022-12-06 12:16:31.059905487 +0100
+++ gcc/config/i386/i386.md 2022-12-07 15:11:55.297914206 +0100
@@ -1134,6 +1134,10 @@ (define_mode_attr MODE_SIZE [(QI "1") (H
 (define_mode_attr DWI [(QI "HI") (HI "SI") (SI "DI") (DI "TI") (TI "OI")])
 (define_mode_attr dwi [(QI "hi") (HI "si") (SI "di") (DI "ti") (TI "oi")])
 
+;; Half sized integer modes.
+(define_mode_attr HALF [(TI "DI") (DI "SI")])
+(define_mode_attr half [(TI "di") (DI "si")])
+
 ;; LEA mode corresponding to an integer mode
 (define_mode_attr LEAMODE [(QI "SI") (HI "SI") (SI "SI") (DI "DI")])
 
@@ -11464,6 +11468,80 @@ (define_insn_and_split "*concatmode, operands[0], operands[1], operands[2]);
   DONE;
 })
+
+(define_insn_and_split "*concat3_5"
+  [(set (match_operand:DWI 0 "nonimmediate_operand" "=ro")
+   (any_or_plus:DWI
+ (ashift:DWI (match_operand:DWI 1 "register_operand" "r")
+ (match_operand:DWI 2 "const_int_operand"))
+ (match_operand:DWI 3 "const_scalar_int_operand")))]
+  "INTVAL (operands[2]) ==  * BITS_PER_UNIT / 2
+   && (mode == DImode
+   ? CONST_INT_P (operands[3])
+&& (UINTVAL (operands[3]) & ~GET_MODE_MASK (SImode)) == 0
+   : CONST_INT_P (operands[3])
+   ? INTVAL (operands[3]) >= 0
+   : CONST_WIDE_INT_NUNITS (operands[3]) == 2
+&& CONST_WIDE_INT_ELT (operands[3], 1) == 0)"
+  "#"
+  "&& reload_completed"
+  [(clobber (const_int 0))]
+{
+  rtx op3 = simplify_subreg (mode, operands[3], mode, 0);
+  split_double_concat (mode, operands[0], op3,
+  gen_lowpart (mode, operands[1]));
+  DONE;
+})
+
+(define_insn_and_split "*concat3_6"
+  [(set (match_operand: 0 "nonimmediate_operand" "=ro,r")
+   (any_or_plus:
+ (ashift:
+   (zero_extend:
+ (match_operand:DWIH 1 "nonimmediate_operand" "r,m"))
+   (match_operand: 2 "const_int_operand"))
+ (match_operand: 3 "const_scalar_int_operand")))]
+  "INTVAL (operands[2]) ==  * BITS_PER_UNIT
+   && (mode == DImode
+   ? CONST_INT_P (operands[3])
+&& (UINTVAL (operands[3]) & ~GET_MODE_MASK (SImode)) == 0
+   : CONST_INT_P (operands[3])
+   ? INTVAL (operands[3]) >= 0
+   : CONST_WIDE_INT_NUNITS (operands[3]) == 2
+&& CONST_WIDE_INT_ELT (operands[3], 1) == 0)"
+  "#"
+  "&& reload_completed"
+  [(clobber (const_int 0))]
+{
+  rtx op3 = simplify_subreg (mode, operands[3], mode, 0);
+  split_double_concat (mode, operands[0], op3, operands[1]);
+  DONE;
+})
+
+(define_insn_and_split "*concat3_7"
+  [(set (match_operand: 0 "nonimmediate_operand" "=ro,r")
+   (any_or_plus:
+ 

RE: Zen4 tuning part 1 - cost tables

2022-12-08 Thread Kumar, Venkataramanan via Gcc-patches
[AMD Official Use Only - General]

Hi Honza,

Thank you for posting the tuning patch.

> -Original Message-
> From: Jan Hubicka 
> Sent: Tuesday, December 6, 2022 3:31 PM
> To: gcc-patches@gcc.gnu.org; mjam...@suse.cz; Alexander Monakov
> ; Kumar, Venkataramanan
> ; Joshi, Tejas Sanjay
> 
> Subject: Zen4 tuning part 1 - cost tables
>
> Caution: This message originated from an External Source. Use proper
> caution when opening attachments, clicking links, or responding.
>
>
> Hi
> this patch updates cost of znver4 mostly based on data measued by Agner
> Fog.
> Compared to previous generations x87 became bit slower which is probably
> not big deal (and we have minimal benchmarking coverage for it).  One
> interesting improvement is reducation of FMA cost.  I also updated costs of
> AVX256 loads/stores  based on latencies (not throughput which is twice of
> avx256).
> Overall AVX512 vectorization seems to improve noticeably some of TSVC
> benchmarks but since internally 512 vectors are split to 256 vectors it is
> somewhat risky and does not win in SPEC scores (mostly by regressing
> benchmarks with loop that have small trip count like x264 and exchange), so
> for now I am going to set AVX256_OPTIMAL tune but I am still playing with it.
> We improved since ZNVER1 on choosing vectorization size and also have
> vectorized prologues/epilogues so it may be possible to make avx512 small
> win overall.

I also noted improvements to TSVC benchmarks when we enable AVX512 
vectorization.  I think we should allow full AVX512 bit vectorization for 
znver4.   Even if the 512 vectors are broken into two 256 vectors we can 
pipeline the higher half immediately in the next cycle.  Also we have less 
instructions to decode with avx512 instructions.  Overall AVX512 operations 
should be better.

>
> In general I would like to keep cost tables latency based unless we have a
> good reason to not do so.  There are some interesting diferences in
> znver3 tables that I also patched and seems performance neutral.  I will send
> that separately.
>
> Bootstrapped/regtested x86_64-linux, also benchmarked on SPEC2017 along
> with AVX512 tuning.  I plan to commit it tomorrow unless there are some
> comments.
>
> Honza
>
> * x86-tune-costs.h (znver4_cost): Upate costs of FP and SSE moves,
> division multiplication, gathers, L2 cache size, and more complex
> FP instrutions.
> diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-
> costs.h
> index f01b8ee9eef..3a6ce02f093 100644
> --- a/gcc/config/i386/x86-tune-costs.h
> +++ b/gcc/config/i386/x86-tune-costs.h
> @@ -1867,9 +1868,9 @@ struct processor_costs znver4_cost = {
>{8, 8, 8},   /* cost of storing integer
>registers.  */
>2,   /* cost of reg,reg fld/fst.  */
> -  {6, 6, 16},  /* cost of loading fp registers
> +  {14, 14, 17},/* cost of loading fp 
> registers
>in SFmode, DFmode and XFmode.  */
> -  {8, 8, 16},  /* cost of storing fp registers
> +  {12, 12, 16},/* cost of storing fp 
> registers
>in SFmode, DFmode and XFmode.  */
>2,   /* cost of moving MMX register.  */
>{6, 6},  /* cost of loading MMX registers
> @@ -1878,13 +1879,13 @@ struct processor_costs znver4_cost = {
>in SImode and DImode.  */
>2, 2, 3, /* cost of moving XMM,YMM,ZMM
>register.  */
> -  {6, 6, 6, 6, 12},/* cost of loading SSE registers
> +  {6, 6, 10, 10, 12},  /* cost of loading SSE registers
>in 32,64,128,256 and 512-bit.  */
> -  {8, 8, 8, 8, 16},/* cost of storing SSE registers
> +  {8, 8, 8, 12, 12},   /* cost of storing SSE registers
>in 32,64,128,256 and 512-bit.  */
> -  6, 6,/* SSE->integer and 
> integer->SSE
> +  6, 8,/* SSE->integer and 
> integer->SSE
>moves.  */
> -  8, 8,/* mask->integer and integer->mask 
> moves */
> +  8, 8,/* mask->integer and 
> integer->mask moves */
>{6, 6, 6},   /* cost of loading mask register
>in QImode, HImode, SImode.  */
>{8, 8, 8},   /* cost if storing mask register
> @@ -1894,6 +1895,7 @@ struct processor_costs znver4_cost = {
>},
>
>COSTS_N_INSNS (1),   

Re: [PATCH] arm: fix mve intrinsics scan body tests for C++

2022-12-08 Thread Andrea Corallo via Gcc-patches
Kyrylo Tkachov  writes:

> Hi Andrea,
>
>> -Original Message-
>> From: Andrea Corallo 
>> Sent: Wednesday, December 7, 2022 3:03 PM
>> To: gcc-patches@gcc.gnu.org
>> Cc: Kyrylo Tkachov ; Richard Earnshaw
>> ; Andrea Corallo 
>> Subject: [PATCH] arm: fix mve intrinsics scan body tests for C++
>> 
>> Hi all,
>> 
>> this patch is to export the functions defined in these MVE tests as C
>> so the body scan assembler works as expected also for our C++ tests.
>> 
>> Best Regards and sorry for the regression!
>
> Ok.
> Thanks,
> Kyrill

Thanks,

into trunk as 8d4f007398b.

Regards

  Andrea


Re: [PATCH] arm: fix mve intrinsics scan body tests for C++

2022-12-08 Thread Andrea Corallo via Gcc-patches
Kyrylo Tkachov  writes:

> Hi Andrea,
>
>> -Original Message-
>> From: Andrea Corallo 
>> Sent: Wednesday, December 7, 2022 3:03 PM
>> To: gcc-patches@gcc.gnu.org
>> Cc: Kyrylo Tkachov ; Richard Earnshaw
>> ; Andrea Corallo 
>> Subject: [PATCH] arm: fix mve intrinsics scan body tests for C++
>> 
>> Hi all,
>> 
>> this patch is to export the functions defined in these MVE tests as C
>> so the body scan assembler works as expected also for our C++ tests.
>> 
>> Best Regards and sorry for the regression!
>
> Ok.
> Thanks,
> Kyrill

Thanks attaching the original patch as compressed, the original it's
still stuck for moderator review (more than 400KB).

  Andrea



0001-arm-fix-mve-intrinsics-scan-body-tests-for-C.patch.gz
Description: application/gzip


[COMMITTED] libgcc: xtensa: remove stray symbols from X*HAL macro definitions

2022-12-08 Thread Max Filippov via Gcc-patches
libgcc/
* config/xtensa/xtensa-config-builtin.h (XCHAL_NUM_AREGS)
(XCHAL_ICACHE_SIZE, XCHAL_DCACHE_SIZE, XCHAL_ICACHE_LINESIZE)
(XCHAL_DCACHE_LINESIZE, XCHAL_MMU_MIN_PTE_PAGE_SIZE)
(XSHAL_ABI): Remove stray symbols from macro definitions.
---
 libgcc/config/xtensa/xtensa-config-builtin.h | 14 +++---
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/libgcc/config/xtensa/xtensa-config-builtin.h 
b/libgcc/config/xtensa/xtensa-config-builtin.h
index 36d4d9db330b..47782a064523 100644
--- a/libgcc/config/xtensa/xtensa-config-builtin.h
+++ b/libgcc/config/xtensa/xtensa-config-builtin.h
@@ -130,7 +130,7 @@
 #define XCHAL_HAVE_WINDOWED__XCHAL_HAVE_WINDOWED
 
 #undef XCHAL_NUM_AREGS
-#define XCHAL_NUM_AREGS__XCHAL_NUM_AREGS2
+#define XCHAL_NUM_AREGS__XCHAL_NUM_AREGS
 
 #undef XCHAL_HAVE_WIDE_BRANCHES
 #define XCHAL_HAVE_WIDE_BRANCHES   __XCHAL_HAVE_WIDE_BRANCHES
@@ -140,16 +140,16 @@
 
 
 #undef XCHAL_ICACHE_SIZE
-#define XCHAL_ICACHE_SIZE  __XCHAL_ICACHE_SIZE6384
+#define XCHAL_ICACHE_SIZE  __XCHAL_ICACHE_SIZE
 
 #undef XCHAL_DCACHE_SIZE
-#define XCHAL_DCACHE_SIZE  __XCHAL_DCACHE_SIZE6384
+#define XCHAL_DCACHE_SIZE  __XCHAL_DCACHE_SIZE
 
 #undef XCHAL_ICACHE_LINESIZE
-#define XCHAL_ICACHE_LINESIZE  __XCHAL_ICACHE_LINESIZE2
+#define XCHAL_ICACHE_LINESIZE  __XCHAL_ICACHE_LINESIZE
 
 #undef XCHAL_DCACHE_LINESIZE
-#define XCHAL_DCACHE_LINESIZE  __XCHAL_DCACHE_LINESIZE2
+#define XCHAL_DCACHE_LINESIZE  __XCHAL_DCACHE_LINESIZE
 
 #undef XCHAL_ICACHE_LINEWIDTH
 #define XCHAL_ICACHE_LINEWIDTH __XCHAL_ICACHE_LINEWIDTH
@@ -165,7 +165,7 @@
 #define XCHAL_HAVE_MMU __XCHAL_HAVE_MMU
 
 #undef XCHAL_MMU_MIN_PTE_PAGE_SIZE
-#define XCHAL_MMU_MIN_PTE_PAGE_SIZE__XCHAL_MMU_MIN_PTE_PAGE_SIZE2
+#define XCHAL_MMU_MIN_PTE_PAGE_SIZE__XCHAL_MMU_MIN_PTE_PAGE_SIZE
 
 
 #undef XCHAL_HAVE_DEBUG
@@ -191,7 +191,7 @@
 #undef XSHAL_ABI
 #undef XTHAL_ABI_WINDOWED
 #undef XTHAL_ABI_CALL0
-#define XSHAL_ABI  __XSHAL_ABITHAL_ABI_WINDOWED
+#define XSHAL_ABI  __XSHAL_ABI
 #define XTHAL_ABI_WINDOWED __XTHAL_ABI_WINDOWED
 #define XTHAL_ABI_CALL0__XTHAL_ABI_CALL0
 
-- 
2.30.2



Re: [PATCH] Fix aarch64 PR 99657: ICE with SVE types used without an error

2022-12-08 Thread Kewen.Lin via Gcc-patches
on 2022/12/8 15:43, Richard Sandiford wrote:
> "Kewen.Lin"  writes:
>> on 2022/12/7 20:55, Richard Sandiford wrote:
>>> "Kewen.Lin"  writes:
 Hi Richard,

 on 2022/12/7 17:16, Richard Sandiford wrote:
> "Kewen.Lin"  writes:
>> Hi,
>>
>> In the recent discussion on how to make some built-in type only valid for
>> some target features efficiently[1], Andrew mentioned this patch which he
>> made previously (Thanks!).  I confirmed it can help rs6000 related issue,
>> and noticed PR99657 is still opened, so I think we still want this to
>> be reviewed.
>
> But does it work for things like:
>
> void f(foo_t *x, foo_t *y) { *x = *y; }
>
> where no variables are being created with foo_t type?
>

 I think it can work for this case as it touches build_indirect_ref.
>>>
>>> Ah, ok.  But indirecting through a pointer doesn't seem to match
>>> TCTX_AUTO_STORAGE.
>>>
>>
>> Indeed. :)
>>
>>> I guess another case is where there are global variables of the type
>>> that you want to forbid, compiled while the target feature is enabled,
>>> and then a function tries to access those variables with the target
>>> feature locally disabled (through a pragma or attribute).  Does that
>>> case work?
>>>
>>
>> Thanks for pointing out this, I tried with the below test case:
>>
>> __vector_quad a1;
>> __vector_quad a2;
>>
>> __attribute__((target("cpu=power8")))
>> void foo ()
>> {
>>   a2 = a3;
>> }
>>
>> the verify_type_context doesn't catch it as you suspected, I think
>> it needs some enhancements somewhere.
> 
> FWIW, another possible case is:
> 
>   foo_t f();
>   void g(foo_t);
>   void h() { g(f()); }
> 
> I'm not aware of any verify_type_context checks that would catch this
> for SVE (since it's valid for SVE types).


OK, thanks for the information, MMA built-in types are not allowed to be
used in function arguments, by hacking with the restriction relaxing, I
confirm the verify_type_context check can't catch this case.

BR,
Kewen


Re: [PATCH v4, rs6000] Enable have_cbranchcc4 on rs6000

2022-12-08 Thread Kewen.Lin via Gcc-patches
Hi Haochen,

on 2022/12/8 11:08, HAO CHEN GUI wrote:
> Hi,
>   This patch enables "have_cbranchcc4" on rs6000 by defining
> a "cbranchcc4" expander. "have_cbrnachcc4" is a flag in ifcvt.cc
> to indicate if branch by CC bits is invalid or not. With this
> flag enabled, some branches can be optimized to conditional
> moves.
> 
>   Compared to last version, the main changes are on the test
> cases. Test case is renamed and comments are modified.
> 
>   Bootstrapped and tested on powerpc64-linux BE and LE with no
> regressions. Is this okay for trunk? Any recommendations? Thanks
> a lot.
> 

This patch is OK, thanks!

BR,
Kewen

> BR
> Gui Haochen
> 
> ChangeLog
> 2022-12-07  Haochen Gui 
> 
> gcc/
>   * config/rs6000/rs6000.md (cbranchcc4): New expander.
> 
> gcc/testsuite
>   * gcc.target/powerpc/cbranchcc4-1.c: New.
>   * gcc.target/powerpc/cbranchcc4-2.c: New.
> 
> patch.diff
> diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
> index e9e5cd1e54d..d7ddd96cc70 100644
> --- a/gcc/config/rs6000/rs6000.md
> +++ b/gcc/config/rs6000/rs6000.md
> @@ -11932,6 +11932,16 @@ (define_expand "cbranch4"
>DONE;
>  })
> 
> +(define_expand "cbranchcc4"
> +  [(set (pc)
> + (if_then_else (match_operator 0 "branch_comparison_operator"
> + [(match_operand 1 "cc_reg_operand")
> +  (match_operand 2 "zero_constant")])
> +   (label_ref (match_operand 3))
> +   (pc)))]
> +  ""
> +  "")
> +
>  (define_expand "cstore4_signed"
>[(use (match_operator 1 "signed_comparison_operator"
>   [(match_operand:P 2 "gpc_reg_operand")
> diff --git a/gcc/testsuite/gcc.target/powerpc/cbranchcc4-1.c 
> b/gcc/testsuite/gcc.target/powerpc/cbranchcc4-1.c
> new file mode 100644
> index 000..6c2cd130b6d
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/cbranchcc4-1.c
> @@ -0,0 +1,15 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +
> +/* Verify there is no ICE with cbranchcc4 enabled.  */
> +
> +int foo (double d)
> +{
> +  if (d == 0.0)
> +return 0;
> +
> +  d = ((d) >= 0 ? (d) : -(d));
> +
> +  if (d < 1.0)
> +return 1;
> +}
> diff --git a/gcc/testsuite/gcc.target/powerpc/cbranchcc4-2.c 
> b/gcc/testsuite/gcc.target/powerpc/cbranchcc4-2.c
> new file mode 100644
> index 000..528ba1a878d
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/cbranchcc4-2.c
> @@ -0,0 +1,11 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-rtl-ce1" } */
> +/* { dg-final { scan-rtl-dump "noce_try_store_flag_constants" "ce1" } } */
> +
> +/* The inner branch should be detected by ifcvt then be converted to a setcc
> +   with a plus by noce_try_store_flag_constants.  */
> +
> +int test (unsigned int a, unsigned int b)
> +{
> +return (a < b ? 0 : (a > b ? 2 : 1));
> +}


Re: [PATCH v5, rs6000] Change mode and insn condition for VSX scalar extract/insert instructions

2022-12-08 Thread Kewen.Lin via Gcc-patches
Hi Haochen,

Thanks for the update, some comments are inlined as below.

on 2022/12/2 15:03, HAO CHEN GUI wrote:
> Hi,
>   For scalar extract/insert instructions, exponent field can be stored in a
> 32-bit register. So this patch changes the mode of exponent field from DI to
> SI so that these instructions can be generated in a 32-bit environment. Also
> it removes TARGET_64BIT check for these instructions.
> 
>   The instructions using DI registers can be invoked with -mpowerpc64 in a
> 32-bit environment. The patch changes insn condition from TARGET_64BIT to
> TARGET_POWERPC64 for those instructions.
> 
>   This patch also changes prototypes and catagories of relevant built-ins and
   ~ categories
> effective target checks of test cases.
> 
>   Compared to last version, main changes are to remove 64-bit environment
> requirement for relevant built-ins in extend.texi. And to change the type of
> arguments of relevant built-ins in rs6000-overload.def.
> 
>   Bootstrapped and tested on powerpc64-linux BE and LE with no regressions.
> Is this okay for trunk? Any recommendations? Thanks a lot.
> 
> ChangeLog
> 2022-12-01  Haochen Gui  
> 
> gcc/
>   * config/rs6000/rs6000-builtins.def
>   (__builtin_vsx_scalar_extract_exp): Set return type to const unsigned
>   int and move it from power9-64 to power9 catatlog.
 ~~~ catalog
>   (__builtin_vsx_scalar_extract_sig): Set return type to const unsigned
>   long long.
>   (__builtin_vsx_scalar_insert_exp): Set type of second argument to
>   unsigned int.
>   (__builtin_vsx_scalar_insert_exp_dp): Set type of second argument to
>   unsigned int and move it from power9-64 to power9 catatlog.
  ~~~ 

>   * config/rs6000/vsx.md (xsxexpdp): Set mode of first operand to
>   SImode.  Remove TARGET_64BIT from insn condition.
>   (xsxsigdp): Change insn condition from TARGET_64BIT to TARGET_POWERPC64.
>   (xsiexpdp): Change insn condition from TARGET_64BIT to
>   TARGET_POWERPC64.  Set mode of third operand to SImode.
>   (xsiexpdpf): Set mode of third operand to SImode.  Remove TARGET_64BIT
>   from insn condition.
>   * config/rs6000/rs6000-overload.def
>   (__builtin_vec_scalar_insert_exp): Set type of second argument to
>   unsigned int.
>   * doc/extend.texi (scalar_insert_exp): Set type of second argument to
>   unsigned int and remove 64-bit environment requirement when
>   significand has a float type.
>   (scalar_extract_exp): Remove 64-bit environment requirement.
> 
> gcc/testsuite/
>   * gcc.target/powerpc/bfp/scalar-extract-exp-0.c: Remove lp64 check.
>   * gcc.target/powerpc/bfp/scalar-extract-exp-1.c: Remove lp64 check.
>   * gcc.target/powerpc/bfp/scalar-extract-exp-2.c: Deleted as the case is
>   invalid now.
>   * gcc.target/powerpc/bfp/scalar-extract-exp-6.c: Replace lp64 check
>   with has_arch_ppc64.
>   * gcc.target/powerpc/bfp/scalar-extract-sig-0.c: Likewise.
>   * gcc.target/powerpc/bfp/scalar-extract-sig-6.c: Likewise.
>   * gcc.target/powerpc/bfp/scalar-insert-exp-0.c: Replace lp64 check
>   with has_arch_ppc64. Set type of exponent to unsigned int.
>   * gcc.target/powerpc/bfp/scalar-insert-exp-1.c: Set type of exponent
>   to unsigned int.
>   * gcc.target/powerpc/bfp/scalar-insert-exp-12.c: Replace lp64 check
>   with has_arch_ppc64. Set type of exponent to unsigned int.
>   * gcc.target/powerpc/bfp/scalar-insert-exp-13.c: Remove lp64 check.
>   Set type of exponent to unsigned int.
>   * gcc.target/powerpc/bfp/scalar-insert-exp-2.c: Set type of exponent to
>   unsigned int.
>   * gcc.target/powerpc/bfp/scalar-insert-exp-3.c: Remove lp64 check. Set
>   type of exponent to unsigned int.
>   * gcc.target/powerpc/bfp/scalar-insert-exp-4.c: Likewise.
>   * gcc.target/powerpc/bfp/scalar-insert-exp-5.c: Deleted as the case is
>   invalid now.
> 
> patch.diff
> diff --git a/gcc/config/rs6000/rs6000-builtins.def 
> b/gcc/config/rs6000/rs6000-builtins.def
> index f76f54793d7..d8d67fa0cad 100644
> --- a/gcc/config/rs6000/rs6000-builtins.def
> +++ b/gcc/config/rs6000/rs6000-builtins.def
> @@ -2833,6 +2833,11 @@
>const signed int __builtin_dtstsfi_ov_td (const int<6>, _Decimal128);
>  TSTSFI_OV_TD dfptstsfi_unordered_td {}
> 
> +  const unsigned int __builtin_vsx_scalar_extract_exp (double);
> +VSEEDP xsxexpdp {}
> +
> +  const double __builtin_vsx_scalar_insert_exp_dp (double, unsigned int);
> +VSIEDPF xsiexpdpf {}

This __builtin_vsx_scalar_insert_exp_dp still requires 64-bit, see further
explanation below ...

> 
>  [power9-64]
>void __builtin_altivec_xst_len_r (vsc, void *, long);
> @@ -2847,19 +2852,13 @@
>pure vsc __builtin_vsx_lxvl (const void *, signed long);
>  LXVL lxvl {}
> 
> -  const