[gcc r15-482] Reduce recursive inlining of always_inline functions

2024-05-14 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:1ec49897253e093e1ef6261eb104ac0c111bac83

commit r15-482-g1ec49897253e093e1ef6261eb104ac0c111bac83
Author: Jan Hubicka 
Date:   Tue May 14 12:58:56 2024 +0200

Reduce recursive inlining of always_inline functions

This patch tames down the inliner on (multiply) self-recursive always_inline
functions.  While we already have caps on recursive inlining, the testcase
combines the early inliner and the late inliner to get a very wide recursive
inlining tree.  The basic idea is to ignore DISREGARD_INLINE_LIMITS when
deciding on inlining self-recursive functions (so we cut off once the
function becomes large) and to clear the flag once self-recursion is
detected.

I did not include the testcase since it still produces a lot of code and
would slow down testing.  It also outputs many "inlining failed" messages,
which is not very nice, but it is hard to detect self-recursion cycles in
full generality when indirect calls and other tricks may happen.
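
As a rough illustration, here is a minimal sketch (not the actual PR
ipa/113291 testcase, which is larger) of the kind of multiply self-recursive
always_inline pattern being tamed; GCC may emit "inlining failed"
diagnostics for the calls it eventually refuses to inline:

/* Hypothetical sketch: two mutually recursive always_inline functions,
   each with two recursive call sites, so every round of inlining doubles
   the number of calls still waiting to be inlined.  */
static inline int __attribute__ ((always_inline)) odd_steps (int);
static inline int __attribute__ ((always_inline)) even_steps (int);

static inline int __attribute__ ((always_inline))
odd_steps (int n)
{
  return n <= 0 ? 0 : even_steps (n - 1) + even_steps (n / 2);
}

static inline int __attribute__ ((always_inline))
even_steps (int n)
{
  return n <= 0 ? 1 : odd_steps (n - 1) + odd_steps (n / 2);
}

int
entry (int n)
{
  return odd_steps (n);
}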

gcc/ChangeLog:

PR ipa/113291

* ipa-inline.cc (enum can_inline_edge_by_limits_flags): New enum.
(can_inline_edge_by_limits_p): Take flags instead of multiple 
bools; add flag
for forcing inlinie limits.
(can_early_inline_edge_p): Update.
(want_inline_self_recursive_call_p): Update; use FORCE_LIMITS mode.
(check_callers): Update.
(update_caller_keys): Update.
(update_callee_keys): Update.
(recursive_inlining): Update.
(add_new_edges_to_heap): Update.
(speculation_useful_p): Update.
(inline_small_functions): Clear DECL_DISREGARD_INLINE_LIMITS on 
self recursion.
(flatten_function): Update.
(inline_to_all_callers_1): Update.

Diff:
---
 gcc/ipa-inline.cc | 79 +--
 1 file changed, 53 insertions(+), 26 deletions(-)

diff --git a/gcc/ipa-inline.cc b/gcc/ipa-inline.cc
index e52757510ce9..9fc41b7696d8 100644
--- a/gcc/ipa-inline.cc
+++ b/gcc/ipa-inline.cc
@@ -496,24 +496,33 @@ inline_insns_auto (cgraph_node *n, bool hint, bool hint2)
   return max_inline_insns_auto;
 }
 
+enum can_inline_edge_by_limits_flags
+{
+  /* True if we are early inlining.  */
+  CAN_INLINE_EARLY = 1,
+  /* Ignore size limits.  */
+  CAN_INLINE_DISREGARD_LIMITS = 2,
+  /* Force size limits (ignore always_inline).  This is used for
+ recrusive inlining where always_inline may lead to inline bombs
+ and technically it is non-sential anyway.  */
+  CAN_INLINE_FORCE_LIMITS = 4,
+  /* Report decision to dump file.  */
+  CAN_INLINE_REPORT = 8,
+};
+
 /* Decide if we can inline the edge and possibly update
inline_failed reason.  
We check whether inlining is possible at all and whether
-   caller growth limits allow doing so.  
-
-   if REPORT is true, output reason to the dump file.
-
-   if DISREGARD_LIMITS is true, ignore size limits.  */
+   caller growth limits allow doing so.  */
 
 static bool
-can_inline_edge_by_limits_p (struct cgraph_edge *e, bool report,
-bool disregard_limits = false, bool early = false)
+can_inline_edge_by_limits_p (struct cgraph_edge *e, int flags)
 {
   gcc_checking_assert (e->inline_failed);
 
   if (cgraph_inline_failed_type (e->inline_failed) == CIF_FINAL_ERROR)
 {
-  if (report)
+  if (flags & CAN_INLINE_REPORT)
 report_inline_failed_reason (e);
   return false;
 }
@@ -527,10 +536,11 @@ can_inline_edge_by_limits_p (struct cgraph_edge *e, bool 
report,
   tree callee_tree
 = callee ? DECL_FUNCTION_SPECIFIC_OPTIMIZATION (callee->decl) : NULL;
   /* Check if caller growth allows the inlining.  */
-  if (!DECL_DISREGARD_INLINE_LIMITS (callee->decl)
-  && !disregard_limits
-  && !lookup_attribute ("flatten",
-DECL_ATTRIBUTES (caller->decl))
+  if (!(flags & CAN_INLINE_DISREGARD_LIMITS)
+  && ((flags & CAN_INLINE_FORCE_LIMITS)
+ || (!DECL_DISREGARD_INLINE_LIMITS (callee->decl)
+ && !lookup_attribute ("flatten",
+DECL_ATTRIBUTES (caller->decl
   && !caller_growth_limits (e))
 inlinable = false;
   else if (callee->externally_visible
@@ -558,7 +568,7 @@ can_inline_edge_by_limits_p (struct cgraph_edge *e, bool 
report,
to inline library always_inline functions. See PR65873.
Disable the check for early inlining for now until better solution
is found.  */
- if (always_inline && early)
+ if (always_inline && (flags & CAN_INLINE_EARLY))
;
   /* There are some options that change IL semantics which means
  we cannot inline in these cases for correctness reason.
@@ -594,7 +604,7 @@ can_inline_edge_by_limits_p (struct cgraph_edge *e, bool 
report,
  /* When devirtualization is disabled for callee, it is not safe
 to inline it as we possibly mangled the type info.
 Allo

[gcc r15-512] Avoid pointer compares on TYPE_MAIN_VARIANT in TBAA

2024-05-15 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:9b7cad5884f21cc5783075be0043777448db3fab

commit r15-512-g9b7cad5884f21cc5783075be0043777448db3fab
Author: Jan Hubicka 
Date:   Wed May 15 14:14:27 2024 +0200

Avoid pointer compares on TYPE_MAIN_VARIANT in TBAA

While building more testcases for ipa-icf I noticed that there are two
places in the aliasing code where we still compare TYPE_MAIN_VARIANT for
pointer equality.  This is not a good idea for LTO, since type merging may
not happen, for example, when the pointed-to type is forward-declared in one
unit while it is fully defined in the other.  We have same_type_for_tbaa for
that.

Bootstrapped/regtested x86_64-linux, OK?
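
As a hedged sketch of the scenario (illustrative only; whether merging
actually fails depends on the LTO type-merging details), under -flto the two
units below can end up with distinct trees for struct node, so a pointer
compare of TYPE_MAIN_VARIANT spuriously differs even though
same_type_for_tbaa still treats the types as compatible:

/* unit1.c: the pointed-to type is only forward-declared here.  */
struct node;
struct node *head;

void
set_head (struct node *n)
{
  head = n;
}

/* unit2.c: the same type, fully defined.  */
struct node { struct node *next; int val; };

int
first_val (struct node *n)
{
  return n->val;
}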

gcc/ChangeLog:

* alias.cc (reference_alias_ptr_type_1): Use 
view_converted_memref_p.
* alias.h (view_converted_memref_p): Declare.
* tree-ssa-alias.cc (view_converted_memref_p): Export.
(ao_compare::compare_ao_refs): Use same_type_for_tbaa.

Diff:
---
 gcc/alias.cc  | 5 +
 gcc/alias.h   | 1 +
 gcc/tree-ssa-alias.cc | 6 +++---
 3 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/gcc/alias.cc b/gcc/alias.cc
index 808e2095d9b4..853e84d7439a 100644
--- a/gcc/alias.cc
+++ b/gcc/alias.cc
@@ -770,10 +770,7 @@ reference_alias_ptr_type_1 (tree *t)
   /* If the innermost reference is a MEM_REF that has a
  conversion embedded treat it like a VIEW_CONVERT_EXPR above,
  using the memory access type for determining the alias-set.  */
-  if (TREE_CODE (inner) == MEM_REF
-  && (TYPE_MAIN_VARIANT (TREE_TYPE (inner))
- != TYPE_MAIN_VARIANT
-  (TREE_TYPE (TREE_TYPE (TREE_OPERAND (inner, 1))
+  if (view_converted_memref_p (inner))
 {
   tree alias_ptrtype = TREE_TYPE (TREE_OPERAND (inner, 1));
   /* Unless we have the (aggregate) effective type of the access
diff --git a/gcc/alias.h b/gcc/alias.h
index f8d93e8b5f4c..36095f0bf736 100644
--- a/gcc/alias.h
+++ b/gcc/alias.h
@@ -41,6 +41,7 @@ bool alias_ptr_types_compatible_p (tree, tree);
 int compare_base_decls (tree, tree);
 bool refs_same_for_tbaa_p (tree, tree);
 bool mems_same_for_tbaa_p (rtx, rtx);
+bool view_converted_memref_p (tree);
 
 /* This alias set can be used to force a memory to conflict with all
other memories, creating a barrier across which no memory reference
diff --git a/gcc/tree-ssa-alias.cc b/gcc/tree-ssa-alias.cc
index 374ba04e6fd0..96301bbde7fa 100644
--- a/gcc/tree-ssa-alias.cc
+++ b/gcc/tree-ssa-alias.cc
@@ -2049,7 +2049,7 @@ decl_refs_may_alias_p (tree ref1, tree base1,
which is done by ao_ref_base and thus one extra walk
of handled components is needed.  */
 
-static bool
+bool
 view_converted_memref_p (tree base)
 {
   if (TREE_CODE (base) != MEM_REF && TREE_CODE (base) != TARGET_MEM_REF)
@@ -4330,8 +4330,8 @@ ao_compare::compare_ao_refs (ao_ref *ref1, ao_ref *ref2,
   else if ((end_struct_ref1 != NULL) != (end_struct_ref2 != NULL))
 return flags | ACCESS_PATH;
   if (end_struct_ref1
-  && TYPE_MAIN_VARIANT (TREE_TYPE (end_struct_ref1))
-!= TYPE_MAIN_VARIANT (TREE_TYPE (end_struct_ref2)))
+  && same_type_for_tbaa (TREE_TYPE (end_struct_ref1),
+TREE_TYPE (end_struct_ref2)) != 1)
 return flags | ACCESS_PATH;
 
   /* Now compare all handled components of the access path.


[gcc r15-581] Fix points_to_local_or_readonly_memory_p wrt TARGET_MEM_REF

2024-05-16 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:96d53252aefcbc2fe419c4c3b4bcd3fc03d4d187

commit r15-581-g96d53252aefcbc2fe419c4c3b4bcd3fc03d4d187
Author: Jan Hubicka 
Date:   Thu May 16 15:33:55 2024 +0200

Fix points_to_local_or_readonly_memory_p wrt TARGET_MEM_REF

TARGET_MEM_REF can be used to offset a constant base into a memory object
(to produce an lea instruction).  This confuses
points_to_local_or_readonly_memory_p, which treats the constant address as
the base of the access.

Bootstrapped/regtested x86_64-linux, committed.
Honza

gcc/ChangeLog:

PR ipa/113787
* ipa-fnsummary.cc (points_to_local_or_readonly_memory_p): Do not
look into TARGET_MEM_REFS with constant operand 0.

gcc/testsuite/ChangeLog:

* gcc.c-torture/execute/pr113787.c: New test.

Diff:
---
 gcc/ipa-fnsummary.cc   |  4 ++-
 gcc/testsuite/gcc.c-torture/execute/pr113787.c | 38 ++
 2 files changed, 41 insertions(+), 1 deletion(-)

diff --git a/gcc/ipa-fnsummary.cc b/gcc/ipa-fnsummary.cc
index 07a853f78e39..2faf23892971 100644
--- a/gcc/ipa-fnsummary.cc
+++ b/gcc/ipa-fnsummary.cc
@@ -2648,7 +2648,9 @@ points_to_local_or_readonly_memory_p (tree t)
return true;
   return !ptr_deref_may_alias_global_p (t, false);
 }
-  if (TREE_CODE (t) == ADDR_EXPR)
+  if (TREE_CODE (t) == ADDR_EXPR
+  && (TREE_CODE (TREE_OPERAND (t, 0)) != TARGET_MEM_REF
+ || TREE_CODE (TREE_OPERAND (TREE_OPERAND (t, 0), 0)) != INTEGER_CST))
 return refs_local_or_readonly_memory_p (TREE_OPERAND (t, 0));
   return false;
 }
diff --git a/gcc/testsuite/gcc.c-torture/execute/pr113787.c 
b/gcc/testsuite/gcc.c-torture/execute/pr113787.c
new file mode 100644
index ..702b6c35fc68
--- /dev/null
+++ b/gcc/testsuite/gcc.c-torture/execute/pr113787.c
@@ -0,0 +1,38 @@
+void foo(int x, int y, int z, int d, int *buf)
+{
+  for(int i = z; i < y-z; ++i)
+for(int j = 0; j < d; ++j)
+  /* buf[x(i+1) + j] = buf[x(i+1)-j-1] */
+  buf[i*x+(x-z+j)] = buf[i*x+(x-z-1-j)];
+}
+
+void bar(int x, int y, int z, int d, int *buf)
+{
+  for(int i = 0; i < d; ++i)
+for(int j = z; j < x-z; ++j)
+  /* buf[j+(y+i)*x] = buf[j+(y-1-i)*x] */
+  buf[j+(y-z+i)*x] = buf[j+(y-z-1-i)*x];
+}
+
+__attribute__((noipa))
+void baz(int x, int y, int d, int *buf)
+{
+  foo(x, y, 0, d, buf);
+  bar(x, y, 0, d, buf);
+}
+
+int main(void)
+{
+  int a[] = { 1, 2, 3 };
+  baz (1, 2, 1, a);
+  /* foo does:
+ buf[1] = buf[0];
+ buf[2] = buf[1];
+
+ bar does:
+ buf[2] = buf[1]; (no-op)
+ so we should have { 1, 1, 1 }.  */
+  for (int i = 0; i < 3; i++)
+if (a[i] != 1)
+  __builtin_abort ();
+}


[gcc/devel/nothrow-detection] Fix handling of types

2024-08-27 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:c16d4a0ae162abc00d97bb73e598ca00d16cf555

commit c16d4a0ae162abc00d97bb73e598ca00d16cf555
Author: Jan Hubicka 
Date:   Tue Aug 27 13:50:32 2024 +0200

Fix handling of types

* ipa-devirt.cc (odr_equivalent_or_derived_p): New.
* ipa-utils.h (odr_equivalent_or_derived_p): Declare.
* tree-eh.cc (same_or_derived_type): New.
(match_lp): Use it.
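
To illustrate why pointer identity of types is not enough here, consider a
hedged C++ sketch (not part of the commit): a handler for a base class must
match a thrown derived class, so the landing-pad matching in match_lp has to
ask "same or derived" rather than compare types for equality.

struct base_error { virtual ~base_error () {} };
struct io_error : base_error {};

void
may_throw ()
{
  throw io_error ();
}

int
caller ()
{
  try
    {
      may_throw ();
    }
  /* The catch type (base_error) is not the thrown type (io_error), yet
     this landing pad handles the exception via derivation.  */
  catch (const base_error &)
    {
      return 1;
    }
  return 0;
}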

Diff:
---
 gcc/ipa-devirt.cc | 24 
 gcc/ipa-utils.h   |  1 +
 gcc/tree-eh.cc| 26 +-
 3 files changed, 50 insertions(+), 1 deletion(-)

diff --git a/gcc/ipa-devirt.cc b/gcc/ipa-devirt.cc
index a7ce434bffb4..d6cfcf2c676c 100644
--- a/gcc/ipa-devirt.cc
+++ b/gcc/ipa-devirt.cc
@@ -1211,6 +1211,30 @@ skip_in_fields_list_p (tree t)
   return false;
 }
 
+/* Return true if T2 is derived form T1.  */
+
+bool
+odr_equivalent_or_derived_p (tree t1, tree t2)
+{
+  if (in_lto_p)
+{
+  if (odr_types_equivalent_p (t1, t2))
+   return true;
+}
+  else
+{
+  if (TYPE_MAIN_VARIANT (t1) == TYPE_MAIN_VARIANT (t2))
+   return true;
+}
+  if (!TYPE_BINFO (t2))
+return false;
+  for (unsigned int i = 0; i < BINFO_N_BASE_BINFOS (TYPE_BINFO (t2)); i++)
+if (odr_equivalent_or_derived_p
+(t1, BINFO_TYPE (BINFO_BASE_BINFO (TYPE_BINFO (t2), i
+return true;
+  return false;
+}
+
 /* Compare T1 and T2, report ODR violations if WARN is true and set
WARNED to true if anything is reported.  Return true if types match.
If true is returned, the types are also compatible in the sense of
diff --git a/gcc/ipa-utils.h b/gcc/ipa-utils.h
index d1da9c31e09e..908b425e98c5 100644
--- a/gcc/ipa-utils.h
+++ b/gcc/ipa-utils.h
@@ -106,6 +106,7 @@ cgraph_node *try_speculative_devirtualization (tree, 
HOST_WIDE_INT,
 void warn_types_mismatch (tree t1, tree t2, location_t loc1 = UNKNOWN_LOCATION,
  location_t loc2 = UNKNOWN_LOCATION);
 bool odr_or_derived_type_p (const_tree t);
+bool odr_equivalent_or_derived_p (tree t1, tree t2);
 bool odr_types_equivalent_p (tree type1, tree type2);
 bool odr_type_violation_reported_p (tree type);
 tree prevailing_odr_type (tree type);
diff --git a/gcc/tree-eh.cc b/gcc/tree-eh.cc
index eec1e6af70d7..ab8e00972b06 100644
--- a/gcc/tree-eh.cc
+++ b/gcc/tree-eh.cc
@@ -47,6 +47,8 @@ along with GCC; see the file COPYING3.  If not see
 #include "attribs.h"
 #include "asan.h"
 #include "gimplify.h"
+#include "print-tree.h"
+#include "ipa-utils.h"
 
 /* In some instances a tree and a gimple need to be stored in a same table,
i.e. in hash tables. This is a structure to do this. */
@@ -2270,6 +2272,25 @@ make_eh_dispatch_edges (geh_dispatch *stmt)
 
   return true;
 }
+bool
+same_or_derived_type (tree t1, tree t2)
+{
+  t1 = TYPE_MAIN_VARIANT (t1);
+  t2 = TYPE_MAIN_VARIANT (t2);
+  if (t1 == t2)
+return true;
+  while ((TREE_CODE (t1) == POINTER_TYPE || TREE_CODE (t1) == REFERENCE_TYPE)
+&& TREE_CODE (t1) == TREE_CODE (t2))
+  {
+t1 = TYPE_MAIN_VARIANT (TREE_TYPE (t1));
+t2 = TYPE_MAIN_VARIANT (TREE_TYPE (t2));
+  }
+  if (t1 == t2)
+return true;
+  if (!AGGREGATE_TYPE_P (t1) || !AGGREGATE_TYPE_P (t2))
+return false;
+  return odr_equivalent_or_derived_p (t1, t2);
+}
 
 // Check if a landing pad can handle any of the given exception types
 bool match_lp(eh_landing_pad lp, vec *exception_types) {
@@ -2282,11 +2303,14 @@ bool match_lp(eh_landing_pad lp, vec 
*exception_types) {
 while (catch_handler) {
 tree type_list = catch_handler->type_list;
 
+   if (!type_list)
+ return true;
+
 for (tree t = type_list; t; t = TREE_CHAIN(t)) {
 tree type = TREE_VALUE(t);
 for (unsigned i = 0; i < exception_types->length(); ++i) {
   // match found or a catch-all handler (NULL)
-if (type == (*exception_types)[i] || !type) {
+if (!type || same_or_derived_type ((*exception_types)[i], 
type)) {
 return true;
 }
 }


[gcc r15-3414] Zen5 tuning part 1: avoid FMA chains

2024-09-03 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:d6360b4083695970789fd65b9c515c11a5ce25b4

commit r15-3414-gd6360b4083695970789fd65b9c515c11a5ce25b4
Author: Jan Hubicka 
Date:   Tue Sep 3 13:38:33 2024 +0200

Zen5 tuning part 1: avoid FMA chains

Testing matrix multiplication benchmarks shows that FMA on a critical chain
is a performance loss compared to separate multiply and add.  While the FMA
latency of 4 is lower than multiply + add (3+2), the problem is that all
input values need to be ready before the computation starts.

While znver4 AVX512 code fared well with FMA, that was because of the split
registers.  Znver5 benefits from avoiding FMA on all widths.  This may be
different with the mobile version though.

On a naive matrix multiplication benchmark the difference is 8% with -O3
only, since with -Ofast loop interchange solves the problem differently.
It is a 30% win, for example, on S323 from TSVC:

real_t s323(struct args_t * func_args)
{

//recurrences
//coupled recurrence

initialise_arrays(__func__);
gettimeofday(&func_args->t1, NULL);

for (int nl = 0; nl < iterations/2; nl++) {
for (int i = 1; i < LEN_1D; i++) {
a[i] = b[i-1] + c[i] * d[i];
b[i] = a[i] + c[i] * e[i];
}
dummy(a, b, c, d, e, aa, bb, cc, 0.);
}

gettimeofday(&func_args->t2, NULL);
return calc_checksum(__func__);
}
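
A minimal sketch of the critical-chain effect (assuming the latencies stated
above: 4 for FMA, 3 for multiply, 2 for add): in the reduction below each
fused multiply-add would depend on the previous one, so the loop runs at one
full FMA latency per iteration, while with separate instructions only the
add sits on the recurrence.

double
dot (const double *a, const double *b, int n)
{
  double s = 0.0;
  for (int i = 0; i < n; i++)
    /* With FMA contraction: s = fma (a[i], b[i], s), a 4-cycle chain.
       Split form: the multiply (latency 3) is off the chain and can run
       ahead; only the add (latency 2) is serialized.  */
    s = s + a[i] * b[i];
  return s;
}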

gcc/ChangeLog:

* config/i386/x86-tune.def (X86_TUNE_AVOID_128FMA_CHAINS): Enable 
for
znver5.
(X86_TUNE_AVOID_256FMA_CHAINS): Likewise.
(X86_TUNE_AVOID_512FMA_CHAINS): Likewise.

Diff:
---
 gcc/config/i386/x86-tune.def | 9 +
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index 3d29bffc49c3..da1a3d6a3c6c 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -508,17 +508,18 @@ DEF_TUNE (X86_TUNE_USE_SCATTER_8PARTS, 
"use_scatter_8parts",
 
 /* X86_TUNE_AVOID_128FMA_CHAINS: Avoid creating loops with tight 128bit or
smaller FMA chain.  */
-DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER1 | 
m_ZNVER2 | m_ZNVER3 | m_ZNVER4
+DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER
   | m_YONGFENG | m_SHIJIDADAO | m_GENERIC)
 
 /* X86_TUNE_AVOID_256FMA_CHAINS: Avoid creating loops with tight 256bit or
smaller FMA chain.  */
-DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains", m_ZNVER2 | 
m_ZNVER3 | m_ZNVER4
- | m_CORE_HYBRID | m_SAPPHIRERAPIDS | m_CORE_ATOM | m_GENERIC)
+DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains",
+ m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_ZNVER5 | m_CORE_HYBRID
+ | m_SAPPHIRERAPIDS | m_CORE_ATOM | m_GENERIC)
 
 /* X86_TUNE_AVOID_512FMA_CHAINS: Avoid creating loops with tight 512bit or
smaller FMA chain.  */
-DEF_TUNE (X86_TUNE_AVOID_512FMA_CHAINS, "avoid_fma512_chains", m_NONE)
+DEF_TUNE (X86_TUNE_AVOID_512FMA_CHAINS, "avoid_fma512_chains", m_ZNVER5)
 
 /* X86_TUNE_V2DF_REDUCTION_PREFER_PHADDPD: Prefer haddpd
for v2df vector reduction.  */


[gcc r15-3417] Zen5 tuning part 2: disable gather and scatter

2024-09-03 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:d82edbe92eed53a479736fcbbe6d54d0fb42daa4

commit r15-3417-gd82edbe92eed53a479736fcbbe6d54d0fb42daa4
Author: Jan Hubicka 
Date:   Tue Sep 3 15:07:41 2024 +0200

Zen5 tuning part 2: disable gather and scatter

We disable gathers for zen4.  It seems that gather has improved a bit
compared to zen4, and the Zen5 optimization manual suggests "Avoid GATHER
instructions when the indices are known ahead of time. Vector loads followed
by shuffles result in a higher load bandwidth."  However, the situation
seems to be more complicated.

Gather is a 5-10% loss on the parest benchmark as well as a 30% loss on
sparse dot products in TSVC.  Curiously enough, breaking these out into a
microbenchmark reversed the situation, and it turns out that the performance
depends on how the indices are distributed: gather is a loss if the indices
are sequential, neutral if they are random, and a win for some strides
(4, 8).

This seems to be similar to earlier Zens, so I think (especially for
backporting znver5 support) that it makes sense to be consistent and disable
gather unless we work out a good heuristic on when to use it.  Since we
typically do not know the indices in advance, I don't see how that can be
done.

I opened PR116582 with some examples of wins and losses.
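
A hedged sketch of the kind of sparse dot product measured above; the
indexed load is what the vectorizer turns into a gather, and whether that
wins depends on how idx[] is distributed (sequential, random, or strided),
which is rarely known at compile time:

double
sparse_dot (const double *x, const double *y, const int *idx, int n)
{
  double s = 0.0;
  for (int i = 0; i < n; i++)
    /* x[idx[i]] is an indexed load, i.e. a gather candidate once the
       loop is vectorized.  */
    s += x[idx[i]] * y[i];
  return s;
}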

gcc/ChangeLog:

* config/i386/x86-tune.def (X86_TUNE_USE_GATHER_2PARTS): Disable for
ZNVER5.
(X86_TUNE_USE_SCATTER_2PARTS): Disable for ZNVER5.
(X86_TUNE_USE_GATHER_4PARTS): Disable for ZNVER5.
(X86_TUNE_USE_SCATTER_4PARTS): Disable for ZNVER5.
(X86_TUNE_USE_GATHER_8PARTS): Disable for ZNVER5.
(X86_TUNE_USE_SCATTER_8PARTS): Disable for ZNVER5.

Diff:
---
 gcc/config/i386/x86-tune.def | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index da1a3d6a3c6c..ed26136faee5 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -476,35 +476,35 @@ DEF_TUNE (X86_TUNE_AVOID_4BYTE_PREFIXES, 
"avoid_4byte_prefixes",
 /* X86_TUNE_USE_GATHER_2PARTS: Use gather instructions for vectors with 2
elements.  */
 DEF_TUNE (X86_TUNE_USE_GATHER_2PARTS, "use_gather_2parts",
- ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_CORE_HYBRID
+ ~(m_ZNVER | m_CORE_HYBRID
| m_YONGFENG | m_SHIJIDADAO | m_CORE_ATOM | m_GENERIC | m_GDS))
 
 /* X86_TUNE_USE_SCATTER_2PARTS: Use scater instructions for vectors with 2
elements.  */
 DEF_TUNE (X86_TUNE_USE_SCATTER_2PARTS, "use_scatter_2parts",
- ~(m_ZNVER4))
+ ~(m_ZNVER4 | m_ZNVER5))
 
 /* X86_TUNE_USE_GATHER_4PARTS: Use gather instructions for vectors with 4
elements.  */
 DEF_TUNE (X86_TUNE_USE_GATHER_4PARTS, "use_gather_4parts",
- ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_CORE_HYBRID
+ ~(m_ZNVER | m_CORE_HYBRID
| m_YONGFENG | m_SHIJIDADAO | m_CORE_ATOM | m_GENERIC | m_GDS))
 
 /* X86_TUNE_USE_SCATTER_4PARTS: Use scater instructions for vectors with 4
elements.  */
 DEF_TUNE (X86_TUNE_USE_SCATTER_4PARTS, "use_scatter_4parts",
- ~(m_ZNVER4))
+ ~(m_ZNVER4 | m_ZNVER5))
 
 /* X86_TUNE_USE_GATHER: Use gather instructions for vectors with 8 or more
elements.  */
 DEF_TUNE (X86_TUNE_USE_GATHER_8PARTS, "use_gather_8parts",
- ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER4 | m_CORE_HYBRID | m_CORE_ATOM
+ ~(m_ZNVER | m_CORE_HYBRID | m_CORE_ATOM
| m_YONGFENG | m_SHIJIDADAO | m_GENERIC | m_GDS))
 
 /* X86_TUNE_USE_SCATTER: Use scater instructions for vectors with 8 or more
elements.  */
 DEF_TUNE (X86_TUNE_USE_SCATTER_8PARTS, "use_scatter_8parts",
- ~(m_ZNVER4))
+ ~(m_ZNVER4 | m_ZNVER5))
 
 /* X86_TUNE_AVOID_128FMA_CHAINS: Avoid creating loops with tight 128bit or
smaller FMA chain.  */


[gcc r15-3420] Zen5 tuning part 3: scheduler tweaks

2024-09-03 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:e2125a600552bc6e0329e3f1224eea14804db8d3

commit r15-3420-ge2125a600552bc6e0329e3f1224eea14804db8d3
Author: Jan Hubicka 
Date:   Tue Sep 3 16:26:16 2024 +0200

Zen5 tuning part 3: scheduler tweaks

This patch adds support for a new fusion in znver5 documented in the
optimization manual:

   The Zen5 microarchitecture adds support to fuse reg-reg MOV instructions
   with certain ALU instructions. The following conditions need to be met
   for fusion to happen:
 - The MOV should be a reg-reg mov with Opcode 0x89 or 0x8B.
 - The MOV is followed by an ALU instruction where the MOV and ALU
   destination register match.
 - The ALU instruction may source only registers or immediate data.
   There cannot be any memory source.
 - The ALU instruction sources either the source or the dest of the MOV
   instruction.
 - If the ALU instruction has 2 reg sources, they should be different.
 - The following ALU instructions can fuse with an older qualified MOV
   instruction:
   ADD ADC AND XOR OP SUB SBB INC DEC NOT SAL / SHL SHR SAR
   (I assume OP is OR)
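
As a hypothetical illustration (not from the commit), a pair satisfying
these rules can be spelled out with GNU inline asm; the function and the
exact instruction pair are made up for the sketch:

long
fusable_pair (long a, long c)
{
  long b;
  /* movq: reg-reg MOV; addq: ALU whose destination matches the MOV
     destination and whose sources are registers only (a memory source
     would disqualify the pair from fusing).  */
  __asm__ ("movq %1, %0\n\t"
           "addq %2, %0"
           : "=&r" (b) : "r" (a), "r" (c));
  return b;
}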

I also increased the issue rate from 4 to 6.  Theoretically znver5 can do
more, but with our model we can't really use it.  Increasing the issue rate
to 8 leads to an infinite loop in the scheduler.

Finally, I also enabled fuse_alu_and_branch since it is supported by
znver5 (I think by earlier zens too).

The new fusion pattern moves quite a few instructions around in common code:
@@ -2210,13 +2210,13 @@
.cfi_offset 3, -32
leaq63(%rsi), %rbx
movq%rbx, %rbp
+   shrq$6, %rbp
+   salq$3, %rbp
subq$16, %rsp
.cfi_def_cfa_offset 48
movq%rdi, %r12
-   shrq$6, %rbp
-   movq%rsi, 8(%rsp)
-   salq$3, %rbp
movq%rbp, %rdi
+   movq%rsi, 8(%rsp)
call_Znwm
movq8(%rsp), %rsi
movl$0, 8(%r12)
@@ -2224,8 +2224,8 @@
movq%rax, (%r12)
movq%rbp, 32(%r12)
testq   %rsi, %rsi
-   movq%rsi, %rdx
cmovns  %rsi, %rbx
+   movq%rsi, %rdx
sarq$63, %rdx
shrq$58, %rdx
sarq$6, %rbx
which should help decoder bandwidth and perhaps also the cache, though I was
not able to measure an off-noise effect on SPEC.

gcc/ChangeLog:

* config/i386/i386.h (TARGET_FUSE_MOV_AND_ALU): New tune.
* config/i386/x86-tune-sched.cc (ix86_issue_rate): Update for znver5.
(ix86_adjust_cost): Add TODO about znver5 memory latency.
(ix86_fuse_mov_alu_p): New.
(ix86_macro_fusion_pair_p): Use it.
* config/i386/x86-tune.def (X86_TUNE_FUSE_ALU_AND_BRANCH): Add 
ZNVER5.
(X86_TUNE_FUSE_MOV_AND_ALU): New tune.

Diff:
---
 gcc/config/i386/i386.h|  2 ++
 gcc/config/i386/x86-tune-sched.cc | 67 ++-
 gcc/config/i386/x86-tune.def  | 11 +--
 3 files changed, 77 insertions(+), 3 deletions(-)

diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index eabb3248ea00..c1ec92ffb150 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -430,6 +430,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
ix86_tune_features[X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS]
 #define TARGET_FUSE_ALU_AND_BRANCH \
ix86_tune_features[X86_TUNE_FUSE_ALU_AND_BRANCH]
+#define TARGET_FUSE_MOV_AND_ALU \
+   ix86_tune_features[X86_TUNE_FUSE_MOV_AND_ALU]
 #define TARGET_OPT_AGU ix86_tune_features[X86_TUNE_OPT_AGU]
 #define TARGET_AVOID_LEA_FOR_ADDR \
ix86_tune_features[X86_TUNE_AVOID_LEA_FOR_ADDR]
diff --git a/gcc/config/i386/x86-tune-sched.cc 
b/gcc/config/i386/x86-tune-sched.cc
index d77298b0e34d..c6d5426ae8d3 100644
--- a/gcc/config/i386/x86-tune-sched.cc
+++ b/gcc/config/i386/x86-tune-sched.cc
@@ -67,7 +67,6 @@ ix86_issue_rate (void)
 case PROCESSOR_ZNVER2:
 case PROCESSOR_ZNVER3:
 case PROCESSOR_ZNVER4:
-case PROCESSOR_ZNVER5:
 case PROCESSOR_CORE2:
 case PROCESSOR_NEHALEM:
 case PROCESSOR_SANDYBRIDGE:
@@ -91,6 +90,13 @@ ix86_issue_rate (void)
   return 5;
 
 case PROCESSOR_SAPPHIRERAPIDS:
+/* For znver5 decoder can handle 4 or 8 instructions per cycle,
+   op cache 12 instruction/cycle, dispatch 8 instructions
+   integer rename 8 instructions and Fp 6 instructions.
+
+   The scheduler, without understanding out of order nature of the CPU
+   is unlikely going to be able to fill all of these.  */
+case PROCESSOR_ZNVER5:
   return 6;
 
 default:
@@ -434,6 +440,8 @@ ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn 
*dep_insn, int cost,
  enum attr_unit unit = get_attr_unit (insn);
  int loadcost;
 
+ /* TODO: On

[gcc r15-3425] Zen5 tuning part 3: fix typo in previous patch

2024-09-03 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:910e1769a0653ac32bd8c1d6aabb39c797d5d773

commit r15-3425-g910e1769a0653ac32bd8c1d6aabb39c797d5d773
Author: Jan Hubicka 
Date:   Tue Sep 3 17:25:05 2024 +0200

Zen5 tuning part 3: fix typo in previous patch

gcc/ChangeLog:

* config/i386/x86-tune-sched.cc (ix86_fuse_mov_alu_p): Fix
typo.

Diff:
---
 gcc/config/i386/x86-tune-sched.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/config/i386/x86-tune-sched.cc 
b/gcc/config/i386/x86-tune-sched.cc
index c6d5426ae8d3..4ebdf111269b 100644
--- a/gcc/config/i386/x86-tune-sched.cc
+++ b/gcc/config/i386/x86-tune-sched.cc
@@ -613,7 +613,7 @@ ix86_fuse_mov_alu_p (rtx_insn *mov, rtx_insn *alu)
   /* One of operands should be register.  */
   if (op1 && (!REG_P (op0) || REGNO (op0) != REGNO (reg)))
 std::swap (op0, op1);
-  if (!REG_P (op0) || REGNO (op1) != REGNO (reg))
+  if (!REG_P (op0) || REGNO (op0) != REGNO (reg))
 return false;
   if (op1
   && !REG_P (op1)


[gcc r15-3427] Zen5 tuning part 4: update reassociation width

2024-09-03 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:f0ab3de6ec0e3540f2e57f3f5628005f0a4e3fa5

commit r15-3427-gf0ab3de6ec0e3540f2e57f3f5628005f0a4e3fa5
Author: Jan Hubicka 
Date:   Tue Sep 3 18:20:34 2024 +0200

Zen5 tuning part 4: update reassociation width

Zen5 has 6 instead of 4 ALUs, and integer multiplication can now execute in
3 of them.  The FP units can do 2 additions and 2 multiplications with
latencies of 2 and 3.  This patch updates the reassociation width
accordingly.  This has the potential of increasing register pressure, but
unlike when benchmarking znver1 tuning I did not notice it actually causing
problems on SPEC, so this patch bumps the reassociation width up to 6 for
everything except integer vectors, where there are 4 units with a typical
latency of 1.

Bootstrapped/regtested x86_64-linux, committed.
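
A hedged sketch of what the reassociation width buys: with width 1 the sum
below is evaluated as one serial chain of 7 dependent adds, while a larger
width lets the reassociation pass split it into independent partial sums
that the 6 ALUs can execute in parallel.

long
sum8 (long a, long b, long c, long d, long e, long f, long g, long h)
{
  /* Width 1: ((((((a+b)+c)+d)+e)+f)+g)+h, a critical path of 7 adds.
     Wider: ((a+b)+(c+d)) + ((e+f)+(g+h)), a critical path of 3 adds,
     with the partial sums computed in parallel.  */
  return a + b + c + d + e + f + g + h;
}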

gcc/ChangeLog:

* config/i386/i386.cc (ix86_reassociation_width): Update for Znver5.
* config/i386/x86-tune-costs.h (znver5_costs): Update reassociation
widths.

Diff:
---
 gcc/config/i386/i386.cc  | 10 +++---
 gcc/config/i386/x86-tune-costs.h | 23 +--
 2 files changed, 20 insertions(+), 13 deletions(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 7af9ceca429f..e8744fa77ead 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -24483,13 +24483,17 @@ ix86_reassociation_width (unsigned int op, 
machine_mode mode)
   if (width == 1)
return 1;
 
-  /* Integer vector instructions execute in FP unit
+  /* Znver1-4 Integer vector instructions execute in FP unit
 and can execute 3 additions and one multiplication per cycle.  */
   if ((ix86_tune == PROCESSOR_ZNVER1 || ix86_tune == PROCESSOR_ZNVER2
-  || ix86_tune == PROCESSOR_ZNVER3 || ix86_tune == PROCESSOR_ZNVER4
-  || ix86_tune == PROCESSOR_ZNVER5)
+  || ix86_tune == PROCESSOR_ZNVER3 || ix86_tune == PROCESSOR_ZNVER4)
  && INTEGRAL_MODE_P (mode) && op != PLUS && op != MINUS)
return 1;
+  /* Znver5 can do 2 integer multiplications per cycle with latency
+of 3.  */
+  if (ix86_tune == PROCESSOR_ZNVER5
+ && INTEGRAL_MODE_P (mode) && op != PLUS && op != MINUS)
+   width = 6;
 
   /* Account for targets that splits wide vectors into multiple parts.  */
   if (TARGET_AVX512_SPLIT_REGS && GET_MODE_BITSIZE (mode) > 256)
diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index 2bfaee554d53..b90567fbbf23 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -2100,16 +2100,19 @@ struct processor_costs znver5_cost = {
   COSTS_N_INSNS (13),  /* cost of DIVSD instruction.  */
   COSTS_N_INSNS (14),  /* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (20),  /* cost of SQRTSD instruction.  */
-  /* Zen can execute 4 integer operations per cycle.  FP operations
- take 3 cycles and it can execute 2 integer additions and 2
- multiplications thus reassociation may make sense up to with of 6.
- SPEC2k6 bencharks suggests
- that 4 works better than 6 probably due to register pressure.
-
- Integer vector operations are taken by FP unit and execute 3 vector
- plus/minus operations per cycle but only one multiply.  This is adjusted
- in ix86_reassociation_width.  */
-  4, 4, 3, 6,  /* reassoc int, fp, vec_int, vec_fp.  */
+  /* Zen5 can execute:
+  - integer ops: 6 per cycle, at most 3 multiplications.
+   latency 1 for additions, 3 for multiplications (pipelined)
+
+   Setting width of 9 for multiplication is probably excessive
+   for register pressure.
+  - fp ops: 2 additions per cycle, latency 2-3
+   2 multiplicaitons per cycle, latency 3
+  - vector intger ops: 4 additions, latency 1
+  2 multiplications, latency 4
+   We increase width to 6 for multiplications
+   in ix86_reassociation_width.  */
+  6, 6, 4, 6,  /* reassoc int, fp, vec_int, vec_fp.  */
   znver2_memcpy,
   znver2_memset,
   COSTS_N_INSNS (4),   /* cond_taken_branch_cost.  */


[gcc r15-3441] Zen5 tuning part 5: update instruction latencies in x86-tune-costs

2024-09-04 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:4292297a0f938ffc953422fa246ff00fe345fe3d

commit r15-3441-g4292297a0f938ffc953422fa246ff00fe345fe3d
Author: Jan Hubicka 
Date:   Wed Sep 4 09:19:08 2024 +0200

Zen5 tuning part 5: update instruction latencies in x86-tune-costs

There is nothing exciting in this patch.  I measured latencies and also
compared them with the newly released optimization guide.  There are no
dramatic changes compared to zen4.  One interesting new bit is that addss is
faster and can be 2 cycles when fed by another addss.

I also increased the large insn bound since the decoders no longer seem to
require instructions to be 8 bytes or less.
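
A hedged sketch of the addss observation (assuming scalar SSE codegen, i.e.
no vectorization): in the loop below each addss is fed directly by the
previous one, which is exactly the case where the latency can drop to
2 cycles.

float
serial_sum (const float *a, int n)
{
  float s = 0.0f;
  for (int i = 0; i < n; i++)
    s += a[i];   /* dependent addss chain: each addss feeds the next */
  return s;
}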

gcc/ChangeLog:

* config/i386/x86-tune-costs.h (znver5_cost): Update instruction
costs.

Diff:
---
 gcc/config/i386/x86-tune-costs.h | 28 +---
 1 file changed, 21 insertions(+), 7 deletions(-)

diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index b90567fbbf2..1b3227ace16 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -2034,6 +2034,7 @@ struct processor_costs znver5_cost = {
   COSTS_N_INSNS (1),   /* cost of a lea instruction.  */
   COSTS_N_INSNS (1),   /* variable shift costs.  */
   COSTS_N_INSNS (1),   /* constant shift costs.  */
+  /* mul has latency 3, executes in 3 integer units.  */
   {COSTS_N_INSNS (3),  /* cost of starting multiply for QI.  */
COSTS_N_INSNS (3),  /*   HI.  */
COSTS_N_INSNS (3),  /*   SI.  */
@@ -2041,6 +2042,8 @@ struct processor_costs znver5_cost = {
COSTS_N_INSNS (3)}, /*  other.  */
   0,   /* cost of multiply per each bit
   set.  */
+  /* integer divide has latency of 8 cycles
+ plus 1 for every 9 bits of quotient.  */
   {COSTS_N_INSNS (10), /* cost of a divide/mod for QI.  */
COSTS_N_INSNS (11), /*  HI.  */
COSTS_N_INSNS (13), /*  SI.  */
@@ -2048,7 +2051,7 @@ struct processor_costs znver5_cost = {
COSTS_N_INSNS (16)},/*  
other.  */
   COSTS_N_INSNS (1),   /* cost of movsx.  */
   COSTS_N_INSNS (1),   /* cost of movzx.  */
-  8,   /* "large" insn.  */
+  15,  /* "large" insn.  */
   9,   /* MOVE_RATIO.  */
   6,   /* CLEAR_RATIO */
   {6, 6, 6},   /* cost of loading integer registers
@@ -2065,12 +2068,13 @@ struct processor_costs znver5_cost = {
   2, 2, 2, /* cost of moving XMM,YMM,ZMM
   register.  */
   6,   /* cost of moving SSE register to 
integer.  */
-  /* VGATHERDPD is 17 uops and throughput is 4, VGATHERDPS is 24 uops,
- throughput 5.  Approx 7 uops do not depend on vector size and every load
- is 5 uops.  */
+
+  /* TODO: gather and scatter instructions are currently disabled in
+ x86-tune.def.  In some cases they are however a win, see PR116582
+ We however need good cost model for them.  */
   14, 10,  /* Gather load static, per_elt.  */
   14, 20,  /* Gather store static, per_elt.  */
-  32,  /* size of l1 cache.  */
+  48,  /* size of l1 cache.  */
   1024,/* size of l2 cache.  */
   64,  /* size of prefetch block.  */
   /* New AMD processors never drop prefetches; if they cannot be performed
@@ -2080,6 +2084,8 @@ struct processor_costs znver5_cost = {
  time).  */
   100, /* number of parallel prefetches.  */
   3,   /* Branch cost.  */
+  /* TODO x87 latencies are still based on znver4.
+ Probably not very important these days.  */
   COSTS_N_INSNS (7),   /* cost of FADD and FSUB insns.  */
   COSTS_N_INSNS (7),   /* cost of FMUL instruction.  */
   /* Latency of fdiv is 8-15.  */
@@ -2089,16 +2095,24 @@ struct processor_costs znver5_cost = {
   /* Latency of fsqrt is 4-10.  */
   COSTS_N_INSNS (25),  /* cost of FSQRT instruction.  */
 
+  /* SSE instructions have typical throughput 4 and latency 1.  */
   COSTS_N_INSNS (1),   /* cost of cheap SSE instruction.  */
-  COSTS_N_INSNS (3),   /* cost of ADDSS/SD SUBSS/SD insns.  */
+  /* ADDSS has throughput 2 and latency 2
+ (in some cases when source is

[gcc r15-2200] Compare loop bounds in ipa-icf

2024-07-22 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:0d19fbc7b0760ce665fa6a88cd40cfa0311358d7

commit r15-2200-g0d19fbc7b0760ce665fa6a88cd40cfa0311358d7
Author: Jan Hubicka 
Date:   Mon Jul 22 18:01:57 2024 +0200

Compare loop bounds in ipa-icf

Hi,
this testcase shows another problem with missing comparators for metadata
in ICF.  With value ranges available to loop optimizations during early
opts, we can estimate the number of iterations based on a guarding condition
that can later be split away by the fnsplit pass.  This patch disables ICF
when the number of iterations does not match.

Bootstrapped/regtested x86_64-linux, will commit it shortly.

gcc/ChangeLog:

PR ipa/115277
* ipa-icf-gimple.cc (func_checker::compare_loops): Compare loop
bounds.

gcc/testsuite/ChangeLog:

* gcc.c-torture/compile/pr115277.c: New test.

Diff:
---
 gcc/ipa-icf-gimple.cc  |  4 
 gcc/testsuite/gcc.c-torture/compile/pr115277.c | 28 ++
 2 files changed, 32 insertions(+)

diff --git a/gcc/ipa-icf-gimple.cc b/gcc/ipa-icf-gimple.cc
index c25eb24710f6..4c3174b68b67 100644
--- a/gcc/ipa-icf-gimple.cc
+++ b/gcc/ipa-icf-gimple.cc
@@ -543,6 +543,10 @@ func_checker::compare_loops (basic_block bb1, basic_block 
bb2)
 return return_false_with_msg ("unroll");
   if (!compare_variable_decl (l1->simduid, l2->simduid))
 return return_false_with_msg ("simduid");
+  if ((l1->any_upper_bound != l2->any_upper_bound)
+  || (l1->any_upper_bound
+ && (l1->nb_iterations_upper_bound != l2->nb_iterations_upper_bound)))
+return return_false_with_msg ("nb_iterations_upper_bound");
 
   return true;
 }
diff --git a/gcc/testsuite/gcc.c-torture/compile/pr115277.c 
b/gcc/testsuite/gcc.c-torture/compile/pr115277.c
new file mode 100644
index ..27449eb254f8
--- /dev/null
+++ b/gcc/testsuite/gcc.c-torture/compile/pr115277.c
@@ -0,0 +1,28 @@
+int array[1000];
+void
+test (int a)
+{
+if (__builtin_expect (a > 3, 1))
+return;
+for (int i = 0; i < a; i++)
+array[i]=i;
+}
+void
+test2 (int a)
+{
+if (__builtin_expect (a > 10, 1))
+return;
+for (int i = 0; i < a; i++)
+array[i]=i;
+}
+int
+main()
+{
+test(1);
+test(2);
+test(3);
+test2(10);
+if (array[9] != 9)
+__builtin_abort ();
+return 0;
+}


[gcc r15-2201] Fix accounting of offsets in unadjusted_ptr_and_unit_offset

2024-07-22 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:391f46f10b0586c074014de82efe76787739bb0c

commit r15-2201-g391f46f10b0586c074014de82efe76787739bb0c
Author: Jan Hubicka 
Date:   Mon Jul 22 18:05:26 2024 +0200

Fix accounting of offsets in unadjusted_ptr_and_unit_offset

unadjusted_ptr_and_unit_offset accidentally throws away the offset computed
by get_addr_base_and_unit_offset.  Instead of passing extra_offset it passes
offset.

PR ipa/114207

gcc/ChangeLog:

* ipa-prop.cc (unadjusted_ptr_and_unit_offset): Fix accounting of 
offsets in ADDR_EXPR.

gcc/testsuite/ChangeLog:

* gcc.c-torture/execute/pr114207.c: New test.

Diff:
---
 gcc/ipa-prop.cc|  4 ++--
 gcc/testsuite/gcc.c-torture/execute/pr114207.c | 23 +++
 2 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/gcc/ipa-prop.cc b/gcc/ipa-prop.cc
index 7d7cb3835d2b..99ebd6229ec4 100644
--- a/gcc/ipa-prop.cc
+++ b/gcc/ipa-prop.cc
@@ -1370,9 +1370,9 @@ unadjusted_ptr_and_unit_offset (tree op, tree *ret, 
poly_int64 *offset_ret)
 {
   if (TREE_CODE (op) == ADDR_EXPR)
{
- poly_int64 extra_offset = 0;
+ poly_int64 extra_offset;
  tree base = get_addr_base_and_unit_offset (TREE_OPERAND (op, 0),
-&offset);
+&extra_offset);
  if (!base)
{
  base = get_base_address (TREE_OPERAND (op, 0));
diff --git a/gcc/testsuite/gcc.c-torture/execute/pr114207.c 
b/gcc/testsuite/gcc.c-torture/execute/pr114207.c
new file mode 100644
index ..052fa85e9fc6
--- /dev/null
+++ b/gcc/testsuite/gcc.c-torture/execute/pr114207.c
@@ -0,0 +1,23 @@
+#include 
+#include 
+
+struct S {
+int a, b;
+};
+
+__attribute__((noinline))
+void foo (struct S *s) {
+struct S ss = (struct S) {
+.a = s->b,
+.b = s->a
+};
+*s = ss;
+}
+
+int main() {
+  struct S s = {6, 12};
+  foo(&s);
+  if (s.a != 12 || s.b != 6)
+__builtin_abort ();
+  return 0;
+}


[gcc r15-2202] Fix modref_eaf_analysis::analyze_ssa_name handling of values dereferenced to function call parameter

2024-07-22 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:cf8ffc58aad3127031c229a75cc4b99c8ace25e0

commit r15-2202-gcf8ffc58aad3127031c229a75cc4b99c8ace25e0
Author: Jan Hubicka 
Date:   Mon Jul 22 18:08:08 2024 +0200

Fix modref_eaf_analysis::analyze_ssa_name handling of values dereferenced 
to function call parameters

modref_eaf_analysis::analyze_ssa_name misinterprets EAF flags.  If a
dereferenced parameter is passed (to map_iterator in the testcase), it can
be returned indirectly, which in turn makes it escape into the next function
call.

PR ipa/115033

gcc/ChangeLog:

* ipa-modref.cc (modref_eaf_analysis::analyze_ssa_name): Fix 
checking of
EAF flags when analysing values dereferenced as function parameters.

gcc/testsuite/ChangeLog:

* gcc.c-torture/execute/pr115033.c: New test.

Diff:
---
 gcc/ipa-modref.cc  |  6 +++--
 gcc/testsuite/gcc.c-torture/execute/pr115033.c | 35 ++
 2 files changed, 39 insertions(+), 2 deletions(-)

diff --git a/gcc/ipa-modref.cc b/gcc/ipa-modref.cc
index a5adce8ea396..f994388a96ab 100644
--- a/gcc/ipa-modref.cc
+++ b/gcc/ipa-modref.cc
@@ -2571,8 +2571,10 @@ modref_eaf_analysis::analyze_ssa_name (tree name, bool 
deferred)
int call_flags = deref_flags
(gimple_call_arg_flags (call, i), ignore_stores);
if (!ignore_retval && !(call_flags & EAF_UNUSED)
-   && !(call_flags & EAF_NOT_RETURNED_DIRECTLY)
-   && !(call_flags & EAF_NOT_RETURNED_INDIRECTLY))
+   && (call_flags & (EAF_NOT_RETURNED_DIRECTLY
+ | EAF_NOT_RETURNED_INDIRECTLY))
+   != (EAF_NOT_RETURNED_DIRECTLY
+   | EAF_NOT_RETURNED_INDIRECTLY))
  merge_call_lhs_flags (call, i, name, false, true);
if (ecf_flags & (ECF_CONST | ECF_NOVOPS))
  m_lattice[index].merge_direct_load ();
diff --git a/gcc/testsuite/gcc.c-torture/execute/pr115033.c 
b/gcc/testsuite/gcc.c-torture/execute/pr115033.c
new file mode 100644
index ..3e79367d401c
--- /dev/null
+++ b/gcc/testsuite/gcc.c-torture/execute/pr115033.c
@@ -0,0 +1,35 @@
+
+typedef struct func
+{
+  int *a;
+}func;
+__attribute__((noinline))
+void ff(struct func *t)
+{
+  *(t->a) = 0;
+}
+
+
+typedef struct mapped_iterator {
+  func F;
+}mapped_iterator;
+
+__attribute__((noinline))
+mapped_iterator map_iterator(func F) {
+  mapped_iterator t = {F};
+  return t;
+}
+
+void map_to_vector(func *F) {
+  mapped_iterator t = map_iterator(*F);
+  ff(&t.F);
+}
+int main() {
+  int resultIsStatic = 1;
+  func t ={&resultIsStatic};
+  map_to_vector(&t);
+
+  if (resultIsStatic)
+__builtin_trap();
+  __builtin_exit(0);
+}


[gcc r14-10486] Reduce recursive inlining of always_inline functions

2024-07-22 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:9a7d668fc58f817027ec7f9fa7e20a6dce08bddb

commit r14-10486-g9a7d668fc58f817027ec7f9fa7e20a6dce08bddb
Author: Jan Hubicka 
Date:   Tue May 14 12:58:56 2024 +0200

Reduce recursive inlining of always_inline functions

This patch tames down the inliner on (multiply) self-recursive always_inline
functions.  While we already have caps on recursive inlining, the testcase
combines the early inliner and the late inliner to get a very wide recursive
inlining tree.  The basic idea is to ignore DISREGARD_INLINE_LIMITS when
deciding on inlining self-recursive functions (so we cut off once the
function becomes large) and to clear the flag once self-recursion is
detected.

I did not include the testcase since it still produces a lot of code and
would slow down testing.  It also outputs many "inlining failed" messages,
which is not very nice, but it is hard to detect self-recursion cycles in
full generality when indirect calls and other tricks may happen.

gcc/ChangeLog:

PR ipa/113291

* ipa-inline.cc (enum can_inline_edge_by_limits_flags): New enum.
(can_inline_edge_by_limits_p): Take flags instead of multiple 
bools; add flag
for forcing inlinie limits.
(can_early_inline_edge_p): Update.
(want_inline_self_recursive_call_p): Update; use FORCE_LIMITS mode.
(check_callers): Update.
(update_caller_keys): Update.
(update_callee_keys): Update.
(recursive_inlining): Update.
(add_new_edges_to_heap): Update.
(speculation_useful_p): Update.
(inline_small_functions): Clear DECL_DISREGARD_INLINE_LIMITS on 
self recursion.
(flatten_function): Update.
(inline_to_all_callers_1): Update.

(cherry picked from commit 1ec49897253e093e1ef6261eb104ac0c111bac83)

Diff:
---
 gcc/ipa-inline.cc | 79 +--
 1 file changed, 53 insertions(+), 26 deletions(-)

diff --git a/gcc/ipa-inline.cc b/gcc/ipa-inline.cc
index e52757510ce9..9fc41b7696d8 100644
--- a/gcc/ipa-inline.cc
+++ b/gcc/ipa-inline.cc
@@ -496,24 +496,33 @@ inline_insns_auto (cgraph_node *n, bool hint, bool hint2)
   return max_inline_insns_auto;
 }
 
+enum can_inline_edge_by_limits_flags
+{
+  /* True if we are early inlining.  */
+  CAN_INLINE_EARLY = 1,
+  /* Ignore size limits.  */
+  CAN_INLINE_DISREGARD_LIMITS = 2,
+  /* Force size limits (ignore always_inline).  This is used for
+ recrusive inlining where always_inline may lead to inline bombs
+ and technically it is non-sential anyway.  */
+  CAN_INLINE_FORCE_LIMITS = 4,
+  /* Report decision to dump file.  */
+  CAN_INLINE_REPORT = 8,
+};
+
 /* Decide if we can inline the edge and possibly update
inline_failed reason.  
We check whether inlining is possible at all and whether
-   caller growth limits allow doing so.  
-
-   if REPORT is true, output reason to the dump file.
-
-   if DISREGARD_LIMITS is true, ignore size limits.  */
+   caller growth limits allow doing so.  */
 
 static bool
-can_inline_edge_by_limits_p (struct cgraph_edge *e, bool report,
-bool disregard_limits = false, bool early = false)
+can_inline_edge_by_limits_p (struct cgraph_edge *e, int flags)
 {
   gcc_checking_assert (e->inline_failed);
 
   if (cgraph_inline_failed_type (e->inline_failed) == CIF_FINAL_ERROR)
 {
-  if (report)
+  if (flags & CAN_INLINE_REPORT)
 report_inline_failed_reason (e);
   return false;
 }
@@ -527,10 +536,11 @@ can_inline_edge_by_limits_p (struct cgraph_edge *e, bool 
report,
   tree callee_tree
 = callee ? DECL_FUNCTION_SPECIFIC_OPTIMIZATION (callee->decl) : NULL;
   /* Check if caller growth allows the inlining.  */
-  if (!DECL_DISREGARD_INLINE_LIMITS (callee->decl)
-  && !disregard_limits
-  && !lookup_attribute ("flatten",
-DECL_ATTRIBUTES (caller->decl))
+  if (!(flags & CAN_INLINE_DISREGARD_LIMITS)
+  && ((flags & CAN_INLINE_FORCE_LIMITS)
+ || (!DECL_DISREGARD_INLINE_LIMITS (callee->decl)
+ && !lookup_attribute ("flatten",
+DECL_ATTRIBUTES (caller->decl
   && !caller_growth_limits (e))
 inlinable = false;
   else if (callee->externally_visible
@@ -558,7 +568,7 @@ can_inline_edge_by_limits_p (struct cgraph_edge *e, bool 
report,
to inline library always_inline functions. See PR65873.
Disable the check for early inlining for now until better solution
is found.  */
- if (always_inline && early)
+ if (always_inline && (flags & CAN_INLINE_EARLY))
;
   /* There are some options that change IL semantics which means
  we cannot inline in these cases for correctness reason.
@@ -594,7 +604,7 @@ can_inline_edge_by_limits_p (struct cgraph_edge *e, bool 
report,
  /* When devirtualization is disabled for callee, it is not safe
 

[gcc r14-10487] Compare loop bounds in ipa-icf

2024-07-22 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:c5397d343ff1365fcebcf3ebabe140608874aac3

commit r14-10487-gc5397d343ff1365fcebcf3ebabe140608874aac3
Author: Jan Hubicka 
Date:   Mon Jul 22 18:01:57 2024 +0200

Compare loop bounds in ipa-icf

Hi,
this testcase shows another problem with missing comparators for metadata
in ICF.  With value ranges available to loop optimizations during early
opts, we can estimate the number of iterations based on a guarding condition
that can later be split away by the fnsplit pass.  This patch disables ICF
when the number of iterations does not match.

Bootstrapped/regtested x86_64-linux, will commit it shortly.

gcc/ChangeLog:

PR ipa/115277
* ipa-icf-gimple.cc (func_checker::compare_loops): Compare loop
bounds.

gcc/testsuite/ChangeLog:

* gcc.c-torture/compile/pr115277.c: New test.

(cherry picked from commit 0d19fbc7b0760ce665fa6a88cd40cfa0311358d7)

Diff:
---
 gcc/ipa-icf-gimple.cc  |  4 
 gcc/testsuite/gcc.c-torture/compile/pr115277.c | 28 ++
 2 files changed, 32 insertions(+)

diff --git a/gcc/ipa-icf-gimple.cc b/gcc/ipa-icf-gimple.cc
index c25eb24710f6..4c3174b68b67 100644
--- a/gcc/ipa-icf-gimple.cc
+++ b/gcc/ipa-icf-gimple.cc
@@ -543,6 +543,10 @@ func_checker::compare_loops (basic_block bb1, basic_block 
bb2)
 return return_false_with_msg ("unroll");
   if (!compare_variable_decl (l1->simduid, l2->simduid))
 return return_false_with_msg ("simduid");
+  if ((l1->any_upper_bound != l2->any_upper_bound)
+  || (l1->any_upper_bound
+ && (l1->nb_iterations_upper_bound != l2->nb_iterations_upper_bound)))
+return return_false_with_msg ("nb_iterations_upper_bound");
 
   return true;
 }
diff --git a/gcc/testsuite/gcc.c-torture/compile/pr115277.c 
b/gcc/testsuite/gcc.c-torture/compile/pr115277.c
new file mode 100644
index ..27449eb254f8
--- /dev/null
+++ b/gcc/testsuite/gcc.c-torture/compile/pr115277.c
@@ -0,0 +1,28 @@
+int array[1000];
+void
+test (int a)
+{
+if (__builtin_expect (a > 3, 1))
+return;
+for (int i = 0; i < a; i++)
+array[i]=i;
+}
+void
+test2 (int a)
+{
+if (__builtin_expect (a > 10, 1))
+return;
+for (int i = 0; i < a; i++)
+array[i]=i;
+}
+int
+main()
+{
+test(1);
+test(2);
+test(3);
+test2(10);
+if (array[9] != 9)
+__builtin_abort ();
+return 0;
+}


[gcc r14-10488] Fix accounting of offsets in unadjusted_ptr_and_unit_offset

2024-07-22 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:f2e98084792821c3849074867d5b007c49028854

commit r14-10488-gf2e98084792821c3849074867d5b007c49028854
Author: Jan Hubicka 
Date:   Mon Jul 22 18:05:26 2024 +0200

Fix accounting of offsets in unadjusted_ptr_and_unit_offset

unadjusted_ptr_and_unit_offset accidentally throws away the offset computed
by get_addr_base_and_unit_offset.  Instead of passing extra_offset it passes
offset.

PR ipa/114207

gcc/ChangeLog:

* ipa-prop.cc (unadjusted_ptr_and_unit_offset): Fix accounting of 
offsets in ADDR_EXPR.

gcc/testsuite/ChangeLog:

* gcc.c-torture/execute/pr114207.c: New test.

(cherry picked from commit 391f46f10b0586c074014de82efe76787739bb0c)

Diff:
---
 gcc/ipa-prop.cc|  4 ++--
 gcc/testsuite/gcc.c-torture/execute/pr114207.c | 23 +++
 2 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/gcc/ipa-prop.cc b/gcc/ipa-prop.cc
index 374e998aa64b..99d4c33e8fdc 100644
--- a/gcc/ipa-prop.cc
+++ b/gcc/ipa-prop.cc
@@ -1370,9 +1370,9 @@ unadjusted_ptr_and_unit_offset (tree op, tree *ret, 
poly_int64 *offset_ret)
 {
   if (TREE_CODE (op) == ADDR_EXPR)
{
- poly_int64 extra_offset = 0;
+ poly_int64 extra_offset;
  tree base = get_addr_base_and_unit_offset (TREE_OPERAND (op, 0),
-&offset);
+&extra_offset);
  if (!base)
{
  base = get_base_address (TREE_OPERAND (op, 0));
diff --git a/gcc/testsuite/gcc.c-torture/execute/pr114207.c 
b/gcc/testsuite/gcc.c-torture/execute/pr114207.c
new file mode 100644
index ..052fa85e9fc6
--- /dev/null
+++ b/gcc/testsuite/gcc.c-torture/execute/pr114207.c
@@ -0,0 +1,23 @@
+#include 
+#include 
+
+struct S {
+int a, b;
+};
+
+__attribute__((noinline))
+void foo (struct S *s) {
+struct S ss = (struct S) {
+.a = s->b,
+.b = s->a
+};
+*s = ss;
+}
+
+int main() {
+  struct S s = {6, 12};
+  foo(&s);
+  if (s.a != 12 || s.b != 6)
+__builtin_abort ();
+  return 0;
+}


[gcc r14-10489] Fix modref_eaf_analysis::analyze_ssa_name handling of values dereferenced to function call parameter

2024-07-22 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:27ef3a0779e551ca116c56c431436c8d2191b253

commit r14-10489-g27ef3a0779e551ca116c56c431436c8d2191b253
Author: Jan Hubicka 
Date:   Mon Jul 22 18:08:08 2024 +0200

Fix modref_eaf_analysis::analyze_ssa_name handling of values dereferenced 
to function call parameters

modref_eaf_analysis::analyze_ssa_name misinterprets EAF flags.  If a
dereferenced parameter is passed (to map_iterator in the testcase), it can
be returned indirectly, which in turn makes it escape into the next function
call.

PR ipa/115033

gcc/ChangeLog:

* ipa-modref.cc (modref_eaf_analysis::analyze_ssa_name): Fix 
checking of
EAF flags when analysing values dereferenced as function parameters.

gcc/testsuite/ChangeLog:

* gcc.c-torture/execute/pr115033.c: New test.

(cherry picked from commit cf8ffc58aad3127031c229a75cc4b99c8ace25e0)

Diff:
---
 gcc/ipa-modref.cc  |  6 +++--
 gcc/testsuite/gcc.c-torture/execute/pr115033.c | 35 ++
 2 files changed, 39 insertions(+), 2 deletions(-)

diff --git a/gcc/ipa-modref.cc b/gcc/ipa-modref.cc
index a5adce8ea396..f994388a96ab 100644
--- a/gcc/ipa-modref.cc
+++ b/gcc/ipa-modref.cc
@@ -2571,8 +2571,10 @@ modref_eaf_analysis::analyze_ssa_name (tree name, bool 
deferred)
int call_flags = deref_flags
(gimple_call_arg_flags (call, i), ignore_stores);
if (!ignore_retval && !(call_flags & EAF_UNUSED)
-   && !(call_flags & EAF_NOT_RETURNED_DIRECTLY)
-   && !(call_flags & EAF_NOT_RETURNED_INDIRECTLY))
+   && (call_flags & (EAF_NOT_RETURNED_DIRECTLY
+ | EAF_NOT_RETURNED_INDIRECTLY))
+   != (EAF_NOT_RETURNED_DIRECTLY
+   | EAF_NOT_RETURNED_INDIRECTLY))
  merge_call_lhs_flags (call, i, name, false, true);
if (ecf_flags & (ECF_CONST | ECF_NOVOPS))
  m_lattice[index].merge_direct_load ();
diff --git a/gcc/testsuite/gcc.c-torture/execute/pr115033.c 
b/gcc/testsuite/gcc.c-torture/execute/pr115033.c
new file mode 100644
index ..3e79367d401c
--- /dev/null
+++ b/gcc/testsuite/gcc.c-torture/execute/pr115033.c
@@ -0,0 +1,35 @@
+
+typedef struct func
+{
+  int *a;
+}func;
+__attribute__((noinline))
+void ff(struct func *t)
+{
+  *(t->a) = 0;
+}
+
+
+typedef struct mapped_iterator {
+  func F;
+}mapped_iterator;
+
+__attribute__((noinline))
+mapped_iterator map_iterator(func F) {
+  mapped_iterator t = {F};
+  return t;
+}
+
+void map_to_vector(func *F) {
+  mapped_iterator t = map_iterator(*F);
+  ff(&t.F);
+}
+int main() {
+  int resultIsStatic = 1;
+  func t ={&resultIsStatic};
+  map_to_vector(&t);
+
+  if (resultIsStatic)
+__builtin_trap();
+  __builtin_exit(0);
+}


[gcc r15-2205] Fix modref's interaction with store merging

2024-07-22 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:14074773350ffed7efdebbc553adf0f23b572e87

commit r15-2205-g14074773350ffed7efdebbc553adf0f23b572e87
Author: Jan Hubicka 
Date:   Mon Jul 22 19:00:39 2024 +0200

Fix modref's interaction with store merging

Hi,
this patch fixes wrong code in case store merging introduces a load of a
function parameter that was previously write-only (which happens for
bitfields).  Without this, the whole store-merged area is considered to be
killed.

PR ipa/111613

gcc/ChangeLog:

* ipa-modref.cc (analyze_parms): Do not preserve EAF_NO_DIRECT_READ 
and
EAF_NO_INDIRECT_READ from past flags.

gcc/testsuite/ChangeLog:

* gcc.c-torture/pr111613.c: New test.

Diff:
---
 gcc/ipa-modref.cc  |  3 +++
 gcc/testsuite/gcc.c-torture/pr111613.c | 29 +
 2 files changed, 32 insertions(+)

diff --git a/gcc/ipa-modref.cc b/gcc/ipa-modref.cc
index f994388a96ab..53a2e35133da 100644
--- a/gcc/ipa-modref.cc
+++ b/gcc/ipa-modref.cc
@@ -3004,6 +3004,9 @@ analyze_parms (modref_summary *summary, 
modref_summary_lto *summary_lto,
 (past, ecf_flags,
  VOID_TYPE_P (TREE_TYPE
  (TREE_TYPE (current_function_decl;
+ /* Store merging can produce reads when combining together multiple
+bitfields.  See PR111613.  */
+ past &= ~(EAF_NO_DIRECT_READ | EAF_NO_INDIRECT_READ);
  if (dump_file && (flags | past) != flags && !(flags & EAF_UNUSED))
{
  fprintf (dump_file,
diff --git a/gcc/testsuite/gcc.c-torture/pr111613.c 
b/gcc/testsuite/gcc.c-torture/pr111613.c
new file mode 100644
index ..1ea1c4dec072
--- /dev/null
+++ b/gcc/testsuite/gcc.c-torture/pr111613.c
@@ -0,0 +1,29 @@
+#include 
+#include 
+
+struct bitfield {
+   unsigned int field1 : 1;
+   unsigned int field2 : 1;
+   unsigned int field3 : 1;
+};
+
+__attribute__((noinline)) static void
+set_field1_and_field2(struct bitfield *b) {
+   b->field1 = 1;
+   b->field2 = 1;
+}
+
+__attribute__((noinline)) static struct bitfield *
+new_bitfield(void) {
+   struct bitfield *b = (struct bitfield *)malloc(sizeof(*b));
+   b->field3 = 1;
+   set_field1_and_field2(b);
+   return b;
+}
+
+int main(void) {
+   struct bitfield *b = new_bitfield();
+   if (b->field3 != 1)
+   __builtin_abort ();
+   return 0;
+}


[gcc r14-10495] Fix modref's interaction with store merging

2024-07-22 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:9ddd5f88e60972147dff74b48658e2b12040d468

commit r14-10495-g9ddd5f88e60972147dff74b48658e2b12040d468
Author: Jan Hubicka 
Date:   Mon Jul 22 19:00:39 2024 +0200

Fix modref's interaction with store merging

Hi,
this patch fixes wrong code in case store merging introduces a load of a
function parameter that was previously write-only (which happens for
bitfields).  Without this, the whole store-merged area is considered to be
killed.

PR ipa/111613

gcc/ChangeLog:

* ipa-modref.cc (analyze_parms): Do not preserve EAF_NO_DIRECT_READ 
and
EAF_NO_INDIRECT_READ from past flags.

gcc/testsuite/ChangeLog:

* gcc.c-torture/pr111613.c: New test.

(cherry picked from commit 14074773350ffed7efdebbc553adf0f23b572e87)

Diff:
---
 gcc/ipa-modref.cc  |  3 +++
 gcc/testsuite/gcc.c-torture/pr111613.c | 29 +
 2 files changed, 32 insertions(+)

diff --git a/gcc/ipa-modref.cc b/gcc/ipa-modref.cc
index f994388a96ab..53a2e35133da 100644
--- a/gcc/ipa-modref.cc
+++ b/gcc/ipa-modref.cc
@@ -3004,6 +3004,9 @@ analyze_parms (modref_summary *summary, 
modref_summary_lto *summary_lto,
 (past, ecf_flags,
  VOID_TYPE_P (TREE_TYPE
  (TREE_TYPE (current_function_decl;
+ /* Store merging can produce reads when combining together multiple
+bitfields.  See PR111613.  */
+ past &= ~(EAF_NO_DIRECT_READ | EAF_NO_INDIRECT_READ);
  if (dump_file && (flags | past) != flags && !(flags & EAF_UNUSED))
{
  fprintf (dump_file,
diff --git a/gcc/testsuite/gcc.c-torture/pr111613.c 
b/gcc/testsuite/gcc.c-torture/pr111613.c
new file mode 100644
index ..1ea1c4dec072
--- /dev/null
+++ b/gcc/testsuite/gcc.c-torture/pr111613.c
@@ -0,0 +1,29 @@
+#include 
+#include 
+
+struct bitfield {
+   unsigned int field1 : 1;
+   unsigned int field2 : 1;
+   unsigned int field3 : 1;
+};
+
+__attribute__((noinline)) static void
+set_field1_and_field2(struct bitfield *b) {
+   b->field1 = 1;
+   b->field2 = 1;
+}
+
+__attribute__((noinline)) static struct bitfield *
+new_bitfield(void) {
+   struct bitfield *b = (struct bitfield *)malloc(sizeof(*b));
+   b->field3 = 1;
+   set_field1_and_field2(b);
+   return b;
+}
+
+int main(void) {
+   struct bitfield *b = new_bitfield();
+   if (b->field3 != 1)
+   __builtin_abort ();
+   return 0;
+}


[gcc r15-2207] Fix handling of ECF_NOVOPS in ipa-modref

2024-07-22 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:efcbe7b985e24ac002a863afd609c44a67761195

commit r15-2207-gefcbe7b985e24ac002a863afd609c44a67761195
Author: Jan Hubicka 
Date:   Mon Jul 22 23:01:50 2024 +0200

Fix handling of ECF_NOVOPS in ipa-modref

As shown by a somewhat convoluted testcase, ipa-modref is mistreating
ECF_NOVOPS as "having no side effects".  This comes from the time when
modref cared only about memory accesses and thus it was possible to
shortcut on it.

This patch removes (hopefully) all those bad shortcuts.
Bootstrapped/regtested x86_64-linux, committed.
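
As a hedged sketch of the distinction being enforced (condensing the
analyze_call change from the diff below, not verbatim code): ECF_CONST
still licenses ignoring memory, while ECF_NOVOPS only means the call has
no virtual operands and must keep its side effects:

    /* Sketch only.  */
    if ((flags & ECF_CONST) && !(flags & ECF_LOOPING_CONST_OR_PURE))
      ;  /* ignore all stores and all loads except for args */
    else if (flags & ECF_NOVOPS)
      {
        set_side_effects ();       /* may still do something...  */
        set_nondeterministic ();   /* ...and need not be repeatable.  */
      }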

gcc/ChangeLog:

PR ipa/109985

* ipa-modref.cc (modref_summary::useful_p): Fix handling of 
ECF_NOVOPS.
(modref_access_analysis::process_fnspec): Likewise.
(modref_access_analysis::analyze_call): Likewise.
(propagate_unknown_call): Likewise.
(modref_propagate_in_scc): Likewise.
(modref_propagate_flags_in_scc): Likewise.
(ipa_merge_modref_summary_after_inlining): Likewise.

Diff:
---
 gcc/ipa-modref.cc | 36 +---
 1 file changed, 21 insertions(+), 15 deletions(-)

diff --git a/gcc/ipa-modref.cc b/gcc/ipa-modref.cc
index 53a2e35133da..f6a758b5f427 100644
--- a/gcc/ipa-modref.cc
+++ b/gcc/ipa-modref.cc
@@ -334,7 +334,7 @@ modref_summary::useful_p (int ecf_flags, bool check_flags)
   if (check_flags
   && remove_useless_eaf_flags (static_chain_flags, ecf_flags, false))
 return true;
-  if (ecf_flags & (ECF_CONST | ECF_NOVOPS))
+  if (ecf_flags & ECF_CONST)
 return ((!side_effects || !nondeterministic)
&& (ecf_flags & ECF_LOOPING_CONST_OR_PURE));
   if (loads && !loads->every_base)
@@ -1263,7 +1263,7 @@ modref_access_analysis::merge_call_side_effects
   int flags = gimple_call_flags (call);
 
   /* Nothing to do for non-looping cont functions.  */
-  if ((flags & (ECF_CONST | ECF_NOVOPS))
+  if ((flags & ECF_CONST)
   && !(flags & ECF_LOOPING_CONST_OR_PURE))
 return false;
 
@@ -1276,7 +1276,7 @@ modref_access_analysis::merge_call_side_effects
   /* Merge side effects and non-determinism.
  PURE/CONST flags makes functions deterministic and if there is
  no LOOPING_CONST_OR_PURE they also have no side effects.  */
-  if (!(flags & (ECF_CONST | ECF_NOVOPS | ECF_PURE))
+  if (!(flags & (ECF_CONST | ECF_PURE))
   || (flags & ECF_LOOPING_CONST_OR_PURE))
 {
   if (!m_summary->side_effects && callee_summary->side_effects)
@@ -1465,7 +1465,7 @@ modref_access_analysis::process_fnspec (gcall *call)
 
   /* PURE/CONST flags makes functions deterministic and if there is
  no LOOPING_CONST_OR_PURE they also have no side effects.  */
-  if (!(flags & (ECF_CONST | ECF_NOVOPS | ECF_PURE))
+  if (!(flags & (ECF_CONST | ECF_PURE))
   || (flags & ECF_LOOPING_CONST_OR_PURE)
   || (cfun->can_throw_non_call_exceptions
  && stmt_could_throw_p (cfun, call)))
@@ -1604,12 +1604,12 @@ modref_access_analysis::analyze_call (gcall *stmt)
   print_gimple_stmt (dump_file, stmt, 0);
 }
 
-  if ((flags & (ECF_CONST | ECF_NOVOPS))
+  if ((flags & ECF_CONST)
   && !(flags & ECF_LOOPING_CONST_OR_PURE))
 {
   if (dump_file)
fprintf (dump_file,
-" - ECF_CONST | ECF_NOVOPS, ignoring all stores and all loads "
+" - ECF_CONST, ignoring all stores and all loads "
 "except for args.\n");
   return;
 }
@@ -1624,7 +1624,13 @@ modref_access_analysis::analyze_call (gcall *stmt)
   if (dump_file)
fprintf (dump_file, gimple_call_internal_p (stmt)
 ? " - Internal call" : " - Indirect call.\n");
-  process_fnspec (stmt);
+  if (flags & ECF_NOVOPS)
+{
+ set_side_effects ();
+ set_nondeterministic ();
+}
+  else
+   process_fnspec (stmt);
   return;
 }
   /* We only need to handle internal calls in IPA mode.  */
@@ -4568,7 +4574,7 @@ propagate_unknown_call (cgraph_node *node,
   return changed;
 }
 
-  if (!(ecf_flags & (ECF_CONST | ECF_NOVOPS | ECF_PURE))
+  if (!(ecf_flags & (ECF_CONST | ECF_PURE))
   || (ecf_flags & ECF_LOOPING_CONST_OR_PURE)
   || nontrivial_scc)
 {
@@ -4782,7 +4788,7 @@ modref_propagate_in_scc (cgraph_node *component_node)
  struct cgraph_node *callee;
 
  if (!callee_edge->inline_failed
-|| ((flags & (ECF_CONST | ECF_NOVOPS))
+|| ((flags & ECF_CONST)
 && !(flags & ECF_LOOPING_CONST_OR_PURE)))
continue;
 
@@ -5205,8 +5211,8 @@ modref_propagate_flags_in_scc (cgraph_node 
*component_node)
{
  escape_summary *sum = escape_summaries->get (e);
 
- if (!sum || (e->indirect_info->ecf_flags
-  & (ECF_CONST | ECF_NOVOPS)))
+ if (!sum || ((e->indirect_info->ecf_flags & ECF_CONST)
+ && !(e->indirect_info->ec

[gcc r14-9515] Add AMD znver5 processor enablement with scheduler model

2024-03-18 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:d0aa0af9a9b7dd709a8c7ff6604ed6b7da0fc23a

commit r14-9515-gd0aa0af9a9b7dd709a8c7ff6604ed6b7da0fc23a
Author: Jan Hubicka 
Date:   Mon Mar 18 10:22:44 2024 +0100

Add AMD znver5 processor enablement with scheduler model

2024-02-14  Jan Hubicka  
Karthiban Anbazhagan  

gcc/ChangeLog:
* common/config/i386/cpuinfo.h (get_amd_cpu): Recognize znver5.
* common/config/i386/i386-common.cc (processor_names): Add znver5.
(processor_alias_table): Likewise.
* common/config/i386/i386-cpuinfo.h (processor_types): Add new zen
family.
(processor_subtypes): Add znver5.
* config.gcc (x86_64-*-* |...): Likewise.
* config/i386/driver-i386.cc (host_detect_local_cpu): Let
march=native detect znver5 cpu's.
* config/i386/i386-c.cc (ix86_target_macros_internal): Add
znver5.
* config/i386/i386-options.cc (m_ZNVER5): New definition
(processor_cost_table): Add znver5.
* config/i386/i386.cc (ix86_reassociation_width): Likewise.
* config/i386/i386.h (processor_type): Add PROCESSOR_ZNVER5
(PTA_ZNVER5): New definition.
* config/i386/i386.md (define_attr "cpu"): Add znver5.
(Scheduling descriptions) Add znver5.md.
* config/i386/x86-tune-costs.h (znver5_cost): New definition.
* config/i386/x86-tune-sched.cc (ix86_issue_rate): Add znver5.
(ix86_adjust_cost): Likewise.
* config/i386/x86-tune.def (avx512_move_by_pieces): Add m_ZNVER5.
(avx512_store_by_pieces): Add m_ZNVER5.
* doc/extend.texi: Add znver5.
* doc/invoke.texi: Likewise.
* config/i386/znver4.md: Rename to zn4zn5.md; combine znver4 and 
znver5 Scheduler.

gcc/testsuite/ChangeLog:
* g++.target/i386/mv29.C: Handle znver5 arch.
* gcc.target/i386/funcspec-56.inc:Likewise.

Diff:
---
 gcc/common/config/i386/cpuinfo.h  |   16 +
 gcc/common/config/i386/i386-common.cc |6 +-
 gcc/common/config/i386/i386-cpuinfo.h |2 +
 gcc/config.gcc|   14 +-
 gcc/config/i386/driver-i386.cc|5 +
 gcc/config/i386/i386-c.cc |7 +
 gcc/config/i386/i386-options.cc   |6 +-
 gcc/config/i386/i386.cc   |3 +-
 gcc/config/i386/i386.h|4 +-
 gcc/config/i386/i386.md   |5 +-
 gcc/config/i386/x86-tune-costs.h  |  136 
 gcc/config/i386/x86-tune-sched.cc |2 +
 gcc/config/i386/x86-tune.def  |4 +-
 gcc/config/i386/znver4.md | 1068 -
 gcc/doc/extend.texi   |3 +
 gcc/doc/invoke.texi   |   10 +
 gcc/testsuite/g++.target/i386/mv29.C  |6 +
 gcc/testsuite/gcc.target/i386/funcspec-56.inc |2 +
 18 files changed, 219 insertions(+), 1080 deletions(-)

diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h
index a595ee537a8..017a952a5db 100644
--- a/gcc/common/config/i386/cpuinfo.h
+++ b/gcc/common/config/i386/cpuinfo.h
@@ -310,6 +310,22 @@ get_amd_cpu (struct __processor_model *cpu_model,
  cpu_model->__cpu_subtype = AMDFAM19H_ZNVER3;
}
   break;
+case 0x1a:
+  cpu_model->__cpu_type = AMDFAM1AH;
+  if (model <= 0x77)
+   {
+ cpu = "znver5";
+ CHECK___builtin_cpu_is ("znver5");
+ cpu_model->__cpu_subtype = AMDFAM1AH_ZNVER5;
+   }
+  else if (has_cpu_feature (cpu_model, cpu_features2,
+   FEATURE_AVX512VP2INTERSECT))
+   {
+ cpu = "znver5";
+ CHECK___builtin_cpu_is ("znver5");
+ cpu_model->__cpu_subtype = AMDFAM1AH_ZNVER5;
+   }
+  break;
 default:
   break;
 }
diff --git a/gcc/common/config/i386/i386-common.cc 
b/gcc/common/config/i386/i386-common.cc
index c35191e6925..f814df8385b 100644
--- a/gcc/common/config/i386/i386-common.cc
+++ b/gcc/common/config/i386/i386-common.cc
@@ -2166,7 +2166,8 @@ const char *const processor_names[] =
   "znver1",
   "znver2",
   "znver3",
-  "znver4"
+  "znver4",
+  "znver5"
 };
 
 /* Guarantee that the array is aligned with enum processor_type.  */
@@ -2435,6 +2436,9 @@ const pta processor_alias_table[] =
   {"znver4", PROCESSOR_ZNVER4, CPU_ZNVER4,
 PTA_ZNVER4,
 M_CPU_SUBTYPE (AMDFAM19H_ZNVER4), P_PROC_AVX512F},
+  {"znver5", PROCESSOR_ZNVER5, CPU_ZNVER5,
+PTA_ZNVER5,
+M_CPU_SUBTYPE (AMDFAM1AH_ZNVER5), P_PROC_AVX512F},
   {"btver1", PROCESSOR_BTVER1, CPU_GENERIC,
 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
   | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_PRFCHW
diff --git a/gcc/common/config/i386/i386-cpuinfo.h 
b/gcc/common/config/i386/i386-cpui

[gcc r14-9516] Add missing config/i386/zn4zn5.md file

2024-03-18 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:dfc9d1cc8353bdd7fbc37bc10bb3fd40f49fa4af

commit r14-9516-gdfc9d1cc8353bdd7fbc37bc10bb3fd40f49fa4af
Author: Jan Hubicka 
Date:   Mon Mar 18 14:24:10 2024 +0100

Add missing config/i386/zn4zn5.md file

gcc/ChangeLog:

* config/i386/zn4zn5.md: Add file missed in the previous commit.

Diff:
---
 gcc/config/i386/zn4zn5.md | 1785 +
 1 file changed, 1785 insertions(+)

diff --git a/gcc/config/i386/zn4zn5.md b/gcc/config/i386/zn4zn5.md
new file mode 100644
index 000..ba9cfbb5dfc
--- /dev/null
+++ b/gcc/config/i386/zn4zn5.md
@@ -0,0 +1,1785 @@
+;; Copyright (C) 2012-2024 Free Software Foundation, Inc.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 3, or (at your option)
+;; any later version.
+;;
+;; GCC is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;; GNU General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; .
+;;
+
+
+(define_attr "znver4_decode" "direct,vector,double"
+  (const_string "direct"))
+
+;; AMD znver4 and znver5 Scheduling
+;; Modeling automatons for zen decoders, integer execution pipes,
+;; AGU pipes, branch, floating point execution and fp store units.
+(define_automaton "znver4, znver4_ieu, znver4_idiv, znver4_fdiv, znver4_agu, 
znver4_fpu, znver4_fp_store")
+
+;; Decoders unit has 4 decoders and all of them can decode fast path
+;; and vector type instructions.
+(define_cpu_unit "znver4-decode0" "znver4")
+(define_cpu_unit "znver4-decode1" "znver4")
+(define_cpu_unit "znver4-decode2" "znver4")
+(define_cpu_unit "znver4-decode3" "znver4")
+
+;; Currently blocking all decoders for vector path instructions as
+;; they are dispatched separetely as microcode sequence.
+(define_reservation "znver4-vector" 
"znver4-decode0+znver4-decode1+znver4-decode2+znver4-decode3")
+
+;; Direct instructions can be issued to any of the four decoders.
+(define_reservation "znver4-direct" 
"znver4-decode0|znver4-decode1|znver4-decode2|znver4-decode3")
+
+;; Fix me: Need to revisit this later to simulate fast path double behavior.
+(define_reservation "znver4-double" "znver4-direct")
+
+
+;; Integer unit 4 ALU pipes in znver4 6 ALU pipes in znver5.
+(define_cpu_unit "znver4-ieu0" "znver4_ieu")
+(define_cpu_unit "znver4-ieu1" "znver4_ieu")
+(define_cpu_unit "znver4-ieu2" "znver4_ieu")
+(define_cpu_unit "znver4-ieu3" "znver4_ieu")
+(define_cpu_unit "znver5-ieu4" "znver4_ieu")
+(define_cpu_unit "znver5-ieu5" "znver4_ieu")
+
+;; Znver4 has an additional branch unit.
+(define_cpu_unit "znver4-bru0" "znver4_ieu")
+
+(define_reservation "znver4-ieu" 
"znver4-ieu0|znver4-ieu1|znver4-ieu2|znver4-ieu3")
+(define_reservation "znver5-ieu" 
"znver4-ieu0|znver4-ieu1|znver4-ieu2|znver4-ieu3|znver5-ieu4|znver5-ieu5")
+
+;; 3 AGU pipes in znver4 and 4 AGU pipes in znver5
+(define_cpu_unit "znver4-agu0" "znver4_agu")
+(define_cpu_unit "znver4-agu1" "znver4_agu")
+(define_cpu_unit "znver4-agu2" "znver4_agu")
+(define_cpu_unit "znver5-agu3" "znver4_agu")
+
+(define_reservation "znver4-agu-reserve" "znver4-agu0|znver4-agu1|znver4-agu2")
+(define_reservation "znver5-agu-reserve" 
"znver4-agu0|znver4-agu1|znver4-agu2|znver5-agu3")
+
+;; Load is 4 cycles. We do not model reservation of load unit.
+(define_reservation "znver4-load" "znver4-agu-reserve")
+(define_reservation "znver4-store" "znver4-agu-reserve")
+(define_reservation "znver5-load" "znver5-agu-reserve")
+(define_reservation "znver5-store" "znver5-agu-reserve")
+
+;; vectorpath (microcoded) instructions are single issue instructions.
+;; So, they occupy all the integer units.
+;; This is used for both Znver4 and Znver5, since reserving extra units not 
used otherwise
+;; is harmless.
+(define_reservation "znver4-ivector" "znver4-ieu0+znver4-ieu1
+ 
+znver4-ieu2+znver4-ieu3+znver5-ieu4+znver5-ieu5+znver4-bru0
+ 
+znver4-agu0+znver4-agu1+znver4-agu2+znver5-agu3")
+
+;; Floating point unit 4 FP pipes in znver4 and znver5.
+(define_cpu_unit "znver4-fpu0" "znver4_fpu")
+(define_cpu_unit "znver4-fpu1" "znver4_fpu")
+(define_cpu_unit "znver4-fpu2" "znver4_fpu")
+(define_cpu_unit "znver4-fpu3" "znver4_fpu")
+
+(define_reservation "znver4-fpu" 
"znver4-fpu0|znver4-fpu1|znver4-fpu2|znver4-fpu3")
+
+;; DIV units
+(define_cpu_unit "znver4-idiv" "znver4_idiv")
+(define_cpu_unit "znver4-fdiv" "znver4_fdiv")
+
+;; Separate fp store and fp-to-int store. Although there are 2 store pipes, the
+;; throughput is limited to only one per cycle.
+(define_cpu_unit "znver4-fp-st

[gcc r14-9705] Hash operands of PHI in ipa-icf

2024-03-28 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:0923fe2d4808c16b72c1d1bfe28220dd326d8b76

commit r14-9705-g0923fe2d4808c16b72c1d1bfe28220dd326d8b76
Author: Jan Hubicka 
Date:   Thu Mar 28 13:24:54 2024 +0100

Hash operands of PHI in ipa-icf

This patch fixes hash collisions for functions whose bodies differ only
by the constants at PHI operands, as in:

if (test)
  a = cst1;
else
  a = cst2;
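
A minimal sketch (hypothetical functions, not from the patch) of a pair
that used to collide because only the PHI arguments differ:

    /* f and g hashed identically before this patch, since the PHI node
       merging the two constants was not included in the hash.  */
    int f (int test) { int a; if (test) a = 1; else a = 2; return a; }
    int g (int test) { int a; if (test) a = 3; else a = 4; return a; }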

gcc/ChangeLog:

PR middle-end/113907
* ipa-icf.cc (sem_function::init): Hash PHI operands
(sem_function::compare_phi_node): Add argument about preserving 
order

Diff:
---
 gcc/ipa-icf.cc | 19 +++
 1 file changed, 19 insertions(+)

diff --git a/gcc/ipa-icf.cc b/gcc/ipa-icf.cc
index 120d8544988..e84922c3ef8 100644
--- a/gcc/ipa-icf.cc
+++ b/gcc/ipa-icf.cc
@@ -1387,6 +1387,23 @@ sem_function::init (ipa_icf_gimple::func_checker 
*checker)
  cfg_checksum = iterative_hash_host_wide_int (e->flags,
 cfg_checksum);
 
+   /* TODO: We should be able to match PHIs with different order of
+  parameters.  This needs to be also updated in
+  sem_function::compare_phi_node.  */
+   gphi_iterator si;
+   for (si = gsi_start_nonvirtual_phis (bb); !gsi_end_p (si);
+gsi_next_nonvirtual_phi (&si))
+ {
+   hstate.add_int (GIMPLE_PHI);
+   gphi *phi = si.phi ();
+   m_checker->hash_operand (gimple_phi_result (phi), hstate, 0,
+func_checker::OP_NORMAL);
+   hstate.add_int (gimple_phi_num_args (phi));
+   for (unsigned int i = 0; i < gimple_phi_num_args (phi); i++)
+ m_checker->hash_operand (gimple_phi_arg_def (phi, i),
+  hstate, 0, func_checker::OP_NORMAL);
+ }
+
for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
 gsi_next (&gsi))
  {
@@ -1579,6 +1596,8 @@ sem_function::compare_phi_node (basic_block bb1, 
basic_block bb2)
   if (size1 != size2)
return return_false ();
 
+  /* TODO: We should be able to match PHIs with different order of
+parameters.  This needs to be also updated in sem_function::init.  */
   for (i = 0; i < size1; ++i)
{
  t1 = gimple_phi_arg (phi1, i)->def;


[gcc r14-10093] Remove repeated information in -ftree-loop-distribute-patterns doc

2024-04-23 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:6f0a646dd2fc59e9c9cde63718b36085f84a19ba

commit r14-10093-g6f0a646dd2fc59e9c9cde63718b36085f84a19ba
Author: Jan Hubicka 
Date:   Tue Apr 23 15:51:42 2024 +0200

Remove repeated information in -ftree-loop-distribute-patterns doc

We have:

   -ftree-loop-distribute-patterns
   Perform loop distribution of patterns that can be code generated 
with calls to a library.  This flag is enabled by default at -O2 and higher, 
and by -fprofile-use and -fauto-profile.

   This pass distributes the initialization loops and generates a 
call to memset zero.  For example, the loop

...

   and the initialization loop is transformed into a call to memset 
zero.  This flag is enabled by default at -O3.  It is also enabled by 
-fprofile-use and -fauto-profile.

This mentions optimization flags twice, and the repeated mention is out of
date since we enable this option at -O2 as well.
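
For reference, the transformation the remaining paragraph describes (a
minimal C sketch; the elided example in the quoted documentation is a
Fortran loop of the same shape):

    /* The initialization loop...  */
    for (int i = 0; i < n; i++)
      a[i] = 0;

    /* ...is distributed and code generated as a call to memset zero.  */
    memset (a, 0, n * sizeof (a[0]));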

gcc/ChangeLog:

* doc/invoke.texi (-ftree-loop-distribute-patterns): Remove 
duplicated
sentence about optimization flags implying this.

Diff:
---
 gcc/doc/invoke.texi | 2 --
 1 file changed, 2 deletions(-)

diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 2a35dc7ac75..27c31ab0c86 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -13852,8 +13852,6 @@ DO I = 1, N
 ENDDO
 @end smallexample
 and the initialization loop is transformed into a call to memset zero.
-This flag is enabled by default at @option{-O3}.
-It is also enabled by @option{-fprofile-use} and @option{-fauto-profile}.
 
 @opindex floop-interchange
 @item -floop-interchange


[gcc r14-10515] Fix ICE with -fdump-tree-modref

2024-07-29 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:98baaa17561ca299eefc98f469f4326e551604c9

commit r14-10515-g98baaa17561ca299eefc98f469f4326e551604c9
Author: Jan Hubicka 
Date:   Mon Jul 29 10:48:34 2024 +0200

Fix ICE with -fdump-tree-modref

gcc/ChangeLog:

PR ipa/116055
* ipa-modref.cc (analyze_function): Do not ICE when flags regress.

Diff:
---
 gcc/ipa-modref.cc | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/gcc/ipa-modref.cc b/gcc/ipa-modref.cc
index 53a2e35133da..37221215a65b 100644
--- a/gcc/ipa-modref.cc
+++ b/gcc/ipa-modref.cc
@@ -3291,7 +3291,8 @@ analyze_function (bool ipa)
fprintf (dump_file, "  Flags for param %i improved:",
 (int)i);
  else
-   gcc_unreachable ();
+   fprintf (dump_file, "  Flags for param %i changed:",
+(int)i);
  dump_eaf_flags (dump_file, old_flags, false);
  fprintf (dump_file, " -> ");
  dump_eaf_flags (dump_file, new_flags, true);
@@ -3307,7 +3308,7 @@ analyze_function (bool ipa)
  || (summary->retslot_flags & EAF_UNUSED))
fprintf (dump_file, "  Flags for retslot improved:");
  else
-   gcc_unreachable ();
+   fprintf (dump_file, "  Flags for retslot changed:");
  dump_eaf_flags (dump_file, past_retslot_flags, false);
  fprintf (dump_file, " -> ");
  dump_eaf_flags (dump_file, summary->retslot_flags, true);
@@ -3322,7 +3323,7 @@ analyze_function (bool ipa)
  || (summary->static_chain_flags & EAF_UNUSED))
fprintf (dump_file, "  Flags for static chain improved:");
  else
-   gcc_unreachable ();
+   fprintf (dump_file, "  Flags for static chain changed:");
  dump_eaf_flags (dump_file, past_static_chain_flags, false);
  fprintf (dump_file, " -> ");
  dump_eaf_flags (dump_file, summary->static_chain_flags, true);


[gcc r15-2376] Fix ICE with -fdump-tree-modref

2024-07-29 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:b3176b620ff29a06c90992ca3d29f3cffd459537

commit r15-2376-gb3176b620ff29a06c90992ca3d29f3cffd459537
Author: Jan Hubicka 
Date:   Mon Jul 29 10:49:49 2024 +0200

Fix ICE with -fdump-tree-modref

gcc/ChangeLog:

PR ipa/116055
* ipa-modref.cc (analyze_function): Do not ICE when flags regress.

Diff:
---
 gcc/ipa-modref.cc | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/gcc/ipa-modref.cc b/gcc/ipa-modref.cc
index f6a758b5f427..59cfe91f987a 100644
--- a/gcc/ipa-modref.cc
+++ b/gcc/ipa-modref.cc
@@ -3297,7 +3297,8 @@ analyze_function (bool ipa)
fprintf (dump_file, "  Flags for param %i improved:",
 (int)i);
  else
-   gcc_unreachable ();
+   fprintf (dump_file, "  Flags for param %i changed:",
+(int)i);
  dump_eaf_flags (dump_file, old_flags, false);
  fprintf (dump_file, " -> ");
  dump_eaf_flags (dump_file, new_flags, true);
@@ -3313,7 +3314,7 @@ analyze_function (bool ipa)
  || (summary->retslot_flags & EAF_UNUSED))
fprintf (dump_file, "  Flags for retslot improved:");
  else
-   gcc_unreachable ();
+   fprintf (dump_file, "  Flags for retslot changed:");
  dump_eaf_flags (dump_file, past_retslot_flags, false);
  fprintf (dump_file, " -> ");
  dump_eaf_flags (dump_file, summary->retslot_flags, true);
@@ -3328,7 +3329,7 @@ analyze_function (bool ipa)
  || (summary->static_chain_flags & EAF_UNUSED))
fprintf (dump_file, "  Flags for static chain improved:");
  else
-   gcc_unreachable ();
+   fprintf (dump_file, "  Flags for static chain changed:");
  dump_eaf_flags (dump_file, past_static_chain_flags, false);
  fprintf (dump_file, " -> ");
  dump_eaf_flags (dump_file, summary->static_chain_flags, true);


[gcc r15-5176] Verify that empty std::vector is optimized away

2024-11-12 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:2264b68796aa6f1a609987cf7edb9183bf070e7e

commit r15-5176-g2264b68796aa6f1a609987cf7edb9183bf070e7e
Author: Jan Hubicka 
Date:   Tue Nov 12 15:58:02 2024 +0100

Verify that empty std::vector is optimized away

With __builtin_operator_new we can now optimize away unused std::vectors.
This adds testcases mentioned in the PR.
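
A short C++ sketch (an assumption on my side: libstdc++'s allocator goes
through __builtin_operator_new, as the message implies) of why the builtin
form matters: unlike direct calls to the replaceable ::operator new, a
paired builtin allocation with no other uses may be elided entirely:

    // Sketch only; not part of the patch.
    void f ()
    {
      void *p = __builtin_operator_new (32);
      __builtin_operator_delete (p);   // the whole pair can be removed
    }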

PR tree-optimization/96945

gcc/testsuite/ChangeLog:

* g++.dg/tree-ssa/pr96945.C: New test.

Diff:
---
 gcc/testsuite/g++.dg/tree-ssa/pr96945.C | 60 +
 1 file changed, 60 insertions(+)

diff --git a/gcc/testsuite/g++.dg/tree-ssa/pr96945.C 
b/gcc/testsuite/g++.dg/tree-ssa/pr96945.C
new file mode 100644
index ..4cb234c2f711
--- /dev/null
+++ b/gcc/testsuite/g++.dg/tree-ssa/pr96945.C
@@ -0,0 +1,60 @@
+/* { dg-do compile } */
+// { dg-options "-O1 -fdump-tree-optimized -std=c++14" }
+#include <vector>
+struct c {
+c() = default;
+c(const c&) =default;
+c(c&&) = default;
+};
+void foo(){
+std::vector<c> vi = {c(),c(),c()};
+}
+
+struct c2 {
+c2() = default;
+c2(const c2&) =default;
+c2(c2&&) = default;
+};
+void foo2(){
+std::vector<c2> vi = {c2(),c2(),c2()};
+}
+
+struct c3 {
+c3() {};
+};
+void foo3(){
+std::vector<c3> vi = {c3(),c3(),c3()};
+}
+
+struct c4 {
+c4() = default;
+c4(const c4&) {};
+};
+void foo4(){
+std::vector<c4> vi = {c4(),c4(),c4()};
+}
+
+struct c5 {
+c5() = default;
+c5(const c5&) {};
+c5(c5&&) = default;
+};
+void foo5(){
+std::vector<c5> vi = {c5(),c5(),c5()};
+}
+
+struct c6 {
+c6() {}
+};
+void foo6(){
+std::vector<c6> vi = {c6(),c6(),c6()};
+}
+
+struct c7 {
+c7() = default;
+c7(const c7&) =default;
+};
+void foo7(){
+std::vector<c7> vi = {c7(),c7(),c7()};
+}
+// { dg-final { scan-tree-dump-not "delete" "optimized" } }


[gcc r13-9061] Reduce recursive inlining of always_inline functions

2024-09-28 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:2532944e3588cf69bce019eaf03de9c63b78568f

commit r13-9061-g2532944e3588cf69bce019eaf03de9c63b78568f
Author: Jan Hubicka 
Date:   Tue May 14 12:58:56 2024 +0200

Reduce recursive inlining of always_inline functions

this patch tames down the inliner on (multiply) self-recursive always_inline
functions.  While we already have caps on recursive inlining, the testcase
combines the early inliner and the late inliner to get a very wide recursive
inlining tree.  The basic idea is to ignore DISREGARD_INLINE_LIMITS when
deciding on inlining self-recursive functions (so we cut on the function
being large) and clear the flag once self-recursion is detected.

I did not include the testcase since it still produces a lot of code and
would slow down testing.  It also outputs many inlining-failed messages,
which is not very nice, but it is hard to detect self-recursion cycles in
full generality when indirect calls and other tricks may happen.
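
A hypothetical shape of the problem (a sketch, not the omitted testcase): a
multiply self-recursive always_inline function whose recursive edges are now
subject to the normal size limits instead of DISREGARD_INLINE_LIMITS:

    /* Each inlined copy exposes two more recursive candidates, so the
       inline tree can grow exponentially wide without a cap.  */
    static inline void __attribute__ ((always_inline))
    f (int n)
    {
      if (n <= 0)
        return;
      f (n - 1);
      f (n - 2);
    }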

gcc/ChangeLog:

PR ipa/113291

* ipa-inline.cc (enum can_inline_edge_by_limits_flags): New enum.
(can_inline_edge_by_limits_p): Take flags instead of multiple 
bools; add flag
for forcing inlinie limits.
(can_early_inline_edge_p): Update.
(want_inline_self_recursive_call_p): Update; use FORCE_LIMITS mode.
(check_callers): Update.
(update_caller_keys): Update.
(update_callee_keys): Update.
(recursive_inlining): Update.
(add_new_edges_to_heap): Update.
(speculation_useful_p): Update.
(inline_small_functions): Clear DECL_DISREGARD_INLINE_LIMITS on 
self recursion.
(flatten_function): Update.
(inline_to_all_callers_1): Update.

(cherry picked from commit 1ec49897253e093e1ef6261eb104ac0c111bac83)

Diff:
---
 gcc/ipa-inline.cc | 79 +--
 1 file changed, 53 insertions(+), 26 deletions(-)

diff --git a/gcc/ipa-inline.cc b/gcc/ipa-inline.cc
index 474fbff20574..77cb0726f9f0 100644
--- a/gcc/ipa-inline.cc
+++ b/gcc/ipa-inline.cc
@@ -443,24 +443,33 @@ inline_insns_auto (cgraph_node *n, bool hint, bool hint2)
   return max_inline_insns_auto;
 }
 
+enum can_inline_edge_by_limits_flags
+{
+  /* True if we are early inlining.  */
+  CAN_INLINE_EARLY = 1,
+  /* Ignore size limits.  */
+  CAN_INLINE_DISREGARD_LIMITS = 2,
+  /* Force size limits (ignore always_inline).  This is used for
+ recrusive inlining where always_inline may lead to inline bombs
+ and technically it is non-sential anyway.  */
+  CAN_INLINE_FORCE_LIMITS = 4,
+  /* Report decision to dump file.  */
+  CAN_INLINE_REPORT = 8,
+};
+
 /* Decide if we can inline the edge and possibly update
inline_failed reason.  
We check whether inlining is possible at all and whether
-   caller growth limits allow doing so.  
-
-   if REPORT is true, output reason to the dump file.
-
-   if DISREGARD_LIMITS is true, ignore size limits.  */
+   caller growth limits allow doing so.  */
 
 static bool
-can_inline_edge_by_limits_p (struct cgraph_edge *e, bool report,
-bool disregard_limits = false, bool early = false)
+can_inline_edge_by_limits_p (struct cgraph_edge *e, int flags)
 {
   gcc_checking_assert (e->inline_failed);
 
   if (cgraph_inline_failed_type (e->inline_failed) == CIF_FINAL_ERROR)
 {
-  if (report)
+  if (flags & CAN_INLINE_REPORT)
 report_inline_failed_reason (e);
   return false;
 }
@@ -474,10 +483,11 @@ can_inline_edge_by_limits_p (struct cgraph_edge *e, bool 
report,
   tree callee_tree
 = callee ? DECL_FUNCTION_SPECIFIC_OPTIMIZATION (callee->decl) : NULL;
   /* Check if caller growth allows the inlining.  */
-  if (!DECL_DISREGARD_INLINE_LIMITS (callee->decl)
-  && !disregard_limits
-  && !lookup_attribute ("flatten",
-DECL_ATTRIBUTES (caller->decl))
+  if (!(flags & CAN_INLINE_DISREGARD_LIMITS)
+  && ((flags & CAN_INLINE_FORCE_LIMITS)
+ || (!DECL_DISREGARD_INLINE_LIMITS (callee->decl)
+ && !lookup_attribute ("flatten",
+DECL_ATTRIBUTES (caller->decl
   && !caller_growth_limits (e))
 inlinable = false;
   else if (callee->externally_visible
@@ -505,7 +515,7 @@ can_inline_edge_by_limits_p (struct cgraph_edge *e, bool 
report,
to inline library always_inline functions. See PR65873.
Disable the check for early inlining for now until better solution
is found.  */
- if (always_inline && early)
+ if (always_inline && (flags & CAN_INLINE_EARLY))
;
   /* There are some options that change IL semantics which means
  we cannot inline in these cases for correctness reason.
@@ -541,7 +551,7 @@ can_inline_edge_by_limits_p (struct cgraph_edge *e, bool 
report,
  /* When devirtualization is disabled for callee, it is not safe
  

[gcc r14-10717] Zen5 tuning part 1: avoid FMA chains

2024-09-28 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:fce2fe0406aa66c5d6f6465984a6af9ccc63370d

commit r14-10717-gfce2fe0406aa66c5d6f6465984a6af9ccc63370d
Author: Jan Hubicka 
Date:   Tue Sep 3 13:38:33 2024 +0200

Zen5 tuning part 1: avoid FMA chains

Testing matrix multiplication benchmarks shows that FMA on a critical chain
is a performance loss over separate multiply and add.  While the latency of 4
is lower than multiply + add (3+2), the problem is that all values need to
be ready before the computation starts.

While on znver4 AVX512 code fared well with FMA, it was because of the split
registers.  Znver5 benefits from avoiding FMA on all widths.  This may be
different with the mobile version though.
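
To make the dependence-chain argument concrete, a sketch using the latencies
quoted above (multiply 3, add 2, FMA 4):

    /* The accumulator is the critical chain.  With FMA every iteration
       pays the full FMA latency (4 cycles); split, the multiply runs off
       the chain and only the add (2 cycles) is serialized.  */
    float dot (const float *b, const float *c, int n)
    {
      float acc = 0.0f;
      for (int i = 0; i < n; i++)
        acc += b[i] * c[i];
      return acc;
    }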

On a naive matrix multiplication benchmark the difference is 8% with -O3
only, since with -Ofast loop interchange solves the problem differently.
It is a 30% win, for example, on S323 from TSVC:

real_t s323(struct args_t * func_args)
{

//recurrences
//coupled recurrence

initialise_arrays(__func__);
gettimeofday(&func_args->t1, NULL);

for (int nl = 0; nl < iterations/2; nl++) {
for (int i = 1; i < LEN_1D; i++) {
a[i] = b[i-1] + c[i] * d[i];
b[i] = a[i] + c[i] * e[i];
}
dummy(a, b, c, d, e, aa, bb, cc, 0.);
}

gettimeofday(&func_args->t2, NULL);
return calc_checksum(__func__);
}

gcc/ChangeLog:

* config/i386/x86-tune.def (X86_TUNE_AVOID_128FMA_CHAINS): Enable 
for
znver5.
(X86_TUNE_AVOID_256FMA_CHAINS): Likewise.
(X86_TUNE_AVOID_512FMA_CHAINS): Likewise.

(cherry picked from commit d6360b4083695970789fd65b9c515c11a5ce25b4)

Diff:
---
 gcc/config/i386/x86-tune.def | 9 +
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index 1ab2f444b569..4a3bd15d0ad2 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -515,17 +515,18 @@ DEF_TUNE (X86_TUNE_USE_SCATTER_8PARTS, 
"use_scatter_8parts",
 
 /* X86_TUNE_AVOID_128FMA_CHAINS: Avoid creating loops with tight 128bit or
smaller FMA chain.  */
-DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER1 | 
m_ZNVER2 | m_ZNVER3 | m_ZNVER4
+DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER
   | m_YONGFENG | m_GENERIC)
 
 /* X86_TUNE_AVOID_256FMA_CHAINS: Avoid creating loops with tight 256bit or
smaller FMA chain.  */
-DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains", m_ZNVER2 | 
m_ZNVER3 | m_ZNVER4
- | m_CORE_HYBRID | m_SAPPHIRERAPIDS | m_CORE_ATOM | m_GENERIC)
+DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains",
+ m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_ZNVER5 | m_CORE_HYBRID
+ | m_SAPPHIRERAPIDS | m_CORE_ATOM | m_GENERIC)
 
 /* X86_TUNE_AVOID_512FMA_CHAINS: Avoid creating loops with tight 512bit or
smaller FMA chain.  */
-DEF_TUNE (X86_TUNE_AVOID_512FMA_CHAINS, "avoid_fma512_chains", m_NONE)
+DEF_TUNE (X86_TUNE_AVOID_512FMA_CHAINS, "avoid_fma512_chains", m_ZNVER5)
 
 /* X86_TUNE_V2DF_REDUCTION_PREFER_PHADDPD: Prefer haddpd
for v2df vector reduction.  */


[gcc r14-10718] Zen5 tuning part 2: disable gather and scatter

2024-09-28 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:3d0a91130eceaf428387ba314cfdfceb99b51709

commit r14-10718-g3d0a91130eceaf428387ba314cfdfceb99b51709
Author: Jan Hubicka 
Date:   Tue Sep 3 15:07:41 2024 +0200

Zen5 tuning part 2: disable gather and scatter

We disable gathers for zen4.  It seems that gather has improved a bit on
zen5 compared to zen4, and the Zen5 optimization manual suggests "Avoid
GATHER instructions when the indices are known ahead of time. Vector loads
followed by shuffles result in a higher load bandwidth."  However, the
situation seems to be more complicated.

Gather is a 5-10% loss on the parest benchmark as well as a 30% loss on
sparse dot products in TSVC.  Curiously enough, breaking these out into a
microbenchmark reversed the situation, and it turns out that the performance
depends on how the indices are distributed: gather is a loss if the indices
are sequential, neutral if they are random, and a win for some strides
(4, 8).
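
For reference, the sparse-dot-product shape in question (a minimal C sketch,
not the exact TSVC kernel); the indexed load is what the vectorizer would
turn into a gather, and its cost depends on how idx is laid out:

    float sparse_dot (const float *a, const float *b, const int *idx, int n)
    {
      float s = 0.0f;
      for (int i = 0; i < n; i++)
        s += a[i] * b[idx[i]];   /* gather candidate */
      return s;
    }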

This seems to be similar to earlier zens, so I think (especially for
backporting znver5 support) that it makes sense to be consistent and disable
gather unless we work out a good heuristic on when to use it.  Since we
typically do not know the indices in advance, I don't see how that can be
done.

I opened PR116582 with some examples of wins and losses.

gcc/ChangeLog:

* config/i386/x86-tune.def (X86_TUNE_USE_GATHER_2PARTS): Disable for
ZNVER5.
(X86_TUNE_USE_SCATTER_2PARTS): Disable for ZNVER5.
(X86_TUNE_USE_GATHER_4PARTS): Disable for ZNVER5.
(X86_TUNE_USE_SCATTER_4PARTS): Disable for ZNVER5.
(X86_TUNE_USE_GATHER_8PARTS): Disable for ZNVER5.
(X86_TUNE_USE_SCATTER_8PARTS): Disable for ZNVER5.

(cherry picked from commit d82edbe92eed53a479736fcbbe6d54d0fb42daa4)

Diff:
---
 gcc/config/i386/x86-tune.def | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index 4a3bd15d0ad2..01324a88a3be 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -483,35 +483,35 @@ DEF_TUNE (X86_TUNE_AVOID_4BYTE_PREFIXES, 
"avoid_4byte_prefixes",
 /* X86_TUNE_USE_GATHER_2PARTS: Use gather instructions for vectors with 2
elements.  */
 DEF_TUNE (X86_TUNE_USE_GATHER_2PARTS, "use_gather_2parts",
- ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_CORE_HYBRID
+ ~(m_ZNVER | m_CORE_HYBRID
| m_YONGFENG | m_CORE_ATOM | m_GENERIC | m_GDS))
 
 /* X86_TUNE_USE_SCATTER_2PARTS: Use scater instructions for vectors with 2
elements.  */
 DEF_TUNE (X86_TUNE_USE_SCATTER_2PARTS, "use_scatter_2parts",
- ~(m_ZNVER4))
+ ~(m_ZNVER4 | m_ZNVER5))
 
 /* X86_TUNE_USE_GATHER_4PARTS: Use gather instructions for vectors with 4
elements.  */
 DEF_TUNE (X86_TUNE_USE_GATHER_4PARTS, "use_gather_4parts",
- ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_CORE_HYBRID
+ ~(m_ZNVER | m_CORE_HYBRID
| m_YONGFENG | m_CORE_ATOM | m_GENERIC | m_GDS))
 
 /* X86_TUNE_USE_SCATTER_4PARTS: Use scater instructions for vectors with 4
elements.  */
 DEF_TUNE (X86_TUNE_USE_SCATTER_4PARTS, "use_scatter_4parts",
- ~(m_ZNVER4))
+ ~(m_ZNVER4 | m_ZNVER5))
 
 /* X86_TUNE_USE_GATHER: Use gather instructions for vectors with 8 or more
elements.  */
 DEF_TUNE (X86_TUNE_USE_GATHER_8PARTS, "use_gather_8parts",
- ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER4 | m_CORE_HYBRID | m_CORE_ATOM
+ ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER4 | m_ZNVER5 | m_CORE_HYBRID | 
m_CORE_ATOM
| m_YONGFENG | m_GENERIC | m_GDS))
 
 /* X86_TUNE_USE_SCATTER: Use scater instructions for vectors with 8 or more
elements.  */
 DEF_TUNE (X86_TUNE_USE_SCATTER_8PARTS, "use_scatter_8parts",
- ~(m_ZNVER4))
+ ~(m_ZNVER4 | m_ZNVER5))
 
 /* X86_TUNE_AVOID_128FMA_CHAINS: Avoid creating loops with tight 128bit or
smaller FMA chain.  */


[gcc r14-10721] Zen5 tuning part 4: update reassociation width

2024-09-28 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:b17cb7ed709ea7250eaa4ddc4a713ebbb6b94b37

commit r14-10721-gb17cb7ed709ea7250eaa4ddc4a713ebbb6b94b37
Author: Jan Hubicka 
Date:   Tue Sep 3 18:20:34 2024 +0200

Zen5 tuning part 4: update reassociation width

Zen5 has 6 instead of 4 ALUs, and integer multiplication can now execute in
3 of them.  FP units can do 2 additions and 2 multiplications with latencies
2 and 3.  This patch updates the reassociation width accordingly.  This has
the potential of increasing register pressure, but unlike when benchmarking
znver1 tuning I did not notice this actually causing problems on SPEC, so
this patch bumps the reassociation width up to 6 for everything except
integer vectors, where there are 4 units with a typical latency of 1.
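
To illustrate what the width controls, a hedged sketch (not from the patch):
with width 6 the reassociation pass may rewrite a long dependent sum into
independent partial sums that keep several ALUs busy:

    /* Serial chain: 7 dependent adds, roughly 7 cycles at latency 1.  */
    s = ((((((a + b) + c) + d) + e) + f) + g) + h;

    /* Reassociated: partial sums run in parallel on separate ALUs,
       finishing in about 3 cycles.  */
    s = ((a + b) + (c + d)) + ((e + f) + (g + h));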

Bootstrapped/regtested x86_64-linux, committed.

gcc/ChangeLog:

* config/i386/i386.cc (ix86_reassociation_width): Update for Znver5.
* config/i386/x86-tune-costs.h (znver5_costs): Update reassociation
widths.

(cherry picked from commit f0ab3de6ec0e3540f2e57f3f5628005f0a4e3fa5)

Diff:
---
 gcc/config/i386/i386.cc  | 10 +++---
 gcc/config/i386/x86-tune-costs.h | 23 +--
 2 files changed, 20 insertions(+), 13 deletions(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 93d05a301c92..2a0a79888be3 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -24537,13 +24537,17 @@ ix86_reassociation_width (unsigned int op, 
machine_mode mode)
   if (width == 1)
return 1;
 
-  /* Integer vector instructions execute in FP unit
+  /* Znver1-4 Integer vector instructions execute in FP unit
 and can execute 3 additions and one multiplication per cycle.  */
   if ((ix86_tune == PROCESSOR_ZNVER1 || ix86_tune == PROCESSOR_ZNVER2
-  || ix86_tune == PROCESSOR_ZNVER3 || ix86_tune == PROCESSOR_ZNVER4
-  || ix86_tune == PROCESSOR_ZNVER5)
+  || ix86_tune == PROCESSOR_ZNVER3 || ix86_tune == PROCESSOR_ZNVER4)
  && INTEGRAL_MODE_P (mode) && op != PLUS && op != MINUS)
return 1;
+  /* Znver5 can do 2 integer multiplications per cycle with latency
+of 3.  */
+  if (ix86_tune == PROCESSOR_ZNVER5
+ && INTEGRAL_MODE_P (mode) && op != PLUS && op != MINUS)
+   width = 6;
 
   /* Account for targets that splits wide vectors into multiple parts.  */
   if (TARGET_AVX512_SPLIT_REGS && GET_MODE_BITSIZE (mode) > 256)
diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index 8348ab8230ad..da36d2adfeca 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -2100,16 +2100,19 @@ struct processor_costs znver5_cost = {
   COSTS_N_INSNS (13),  /* cost of DIVSD instruction.  */
   COSTS_N_INSNS (14),  /* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (20),  /* cost of SQRTSD instruction.  */
-  /* Zen can execute 4 integer operations per cycle.  FP operations
- take 3 cycles and it can execute 2 integer additions and 2
- multiplications thus reassociation may make sense up to with of 6.
- SPEC2k6 bencharks suggests
- that 4 works better than 6 probably due to register pressure.
-
- Integer vector operations are taken by FP unit and execute 3 vector
- plus/minus operations per cycle but only one multiply.  This is adjusted
- in ix86_reassociation_width.  */
-  4, 4, 3, 6,  /* reassoc int, fp, vec_int, vec_fp.  */
+  /* Zen5 can execute:
+  - integer ops: 6 per cycle, at most 3 multiplications.
+   latency 1 for additions, 3 for multiplications (pipelined)
+
+   Setting width of 9 for multiplication is probably excessive
+   for register pressure.
+  - fp ops: 2 additions per cycle, latency 2-3
+   2 multiplicaitons per cycle, latency 3
+  - vector intger ops: 4 additions, latency 1
+  2 multiplications, latency 4
+   We increase width to 6 for multiplications
+   in ix86_reassociation_width.  */
+  6, 6, 4, 6,  /* reassoc int, fp, vec_int, vec_fp.  */
   znver2_memcpy,
   znver2_memset,
   COSTS_N_INSNS (4),   /* cond_taken_branch_cost.  */


[gcc r13-9062] Add AMD znver5 processor enablement with scheduler model

2024-09-28 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:499afa8e6899d8e866bbd1e6cc340e5a52557883

commit r13-9062-g499afa8e6899d8e866bbd1e6cc340e5a52557883
Author: Jan Hubicka 
Date:   Mon Mar 18 10:22:44 2024 +0100

Add AMD znver5 processor enablement with scheduler model

2024-02-14  Jan Hubicka  
Karthiban Anbazhagan  

gcc/ChangeLog:
* common/config/i386/cpuinfo.h (get_amd_cpu): Recognize znver5.
* common/config/i386/i386-common.cc (processor_names): Add znver5.
(processor_alias_table): Likewise.
* common/config/i386/i386-cpuinfo.h (processor_types): Add new zen
family.
(processor_subtypes): Add znver5.
* config.gcc (x86_64-*-* |...): Likewise.
* config/i386/driver-i386.cc (host_detect_local_cpu): Let
march=native detect znver5 cpu's.
* config/i386/i386-c.cc (ix86_target_macros_internal): Add
znver5.
* config/i386/i386-options.cc (m_ZNVER5): New definition
(processor_cost_table): Add znver5.
* config/i386/i386.cc (ix86_reassociation_width): Likewise.
* config/i386/i386.h (processor_type): Add PROCESSOR_ZNVER5
(PTA_ZNVER5): New definition.
* config/i386/i386.md (define_attr "cpu"): Add znver5.
(Scheduling descriptions) Add znver5.md.
* config/i386/x86-tune-costs.h (znver5_cost): New definition.
* config/i386/x86-tune-sched.cc (ix86_issue_rate): Add znver5.
(ix86_adjust_cost): Likewise.
* config/i386/x86-tune.def (avx512_move_by_pieces): Add m_ZNVER5.
(avx512_store_by_pieces): Add m_ZNVER5.
* doc/extend.texi: Add znver5.
* doc/invoke.texi: Likewise.
* config/i386/znver4.md: Rename to zn4zn5.md; combine znver4 and 
znver5 Scheduler.

gcc/testsuite/ChangeLog:
* g++.target/i386/mv29.C: Handle znver5 arch.
* gcc.target/i386/funcspec-56.inc:Likewise.

(cherry picked from commit d0aa0af9a9b7dd709a8c7ff6604ed6b7da0fc23a)

Diff:
---
 gcc/common/config/i386/cpuinfo.h  |  16 +
 gcc/common/config/i386/i386-common.cc |   6 +-
 gcc/common/config/i386/i386-cpuinfo.h |   2 +
 gcc/config.gcc|  14 +-
 gcc/config/i386/driver-i386.cc|   5 +
 gcc/config/i386/i386-c.cc |   7 +
 gcc/config/i386/i386-options.cc   |   6 +-
 gcc/config/i386/i386.cc   |   3 +-
 gcc/config/i386/i386.h|   3 +
 gcc/config/i386/i386.md   |   4 +-
 gcc/config/i386/x86-tune-costs.h  | 136 +
 gcc/config/i386/x86-tune-sched.cc |   2 +
 gcc/config/i386/x86-tune.def  |   4 +-
 gcc/config/i386/{znver4.md => zn4zn5.md}  | 817 --
 gcc/doc/extend.texi   |   3 +
 gcc/doc/invoke.texi   |  10 +
 gcc/testsuite/g++.target/i386/mv29.C  |   6 +
 gcc/testsuite/gcc.target/i386/funcspec-56.inc |   2 +
 18 files changed, 985 insertions(+), 61 deletions(-)

diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h
index 441fae0cdc9f..a2e28e47a7d2 100644
--- a/gcc/common/config/i386/cpuinfo.h
+++ b/gcc/common/config/i386/cpuinfo.h
@@ -310,6 +310,22 @@ get_amd_cpu (struct __processor_model *cpu_model,
  cpu_model->__cpu_subtype = AMDFAM19H_ZNVER3;
}
   break;
+case 0x1a:
+  cpu_model->__cpu_type = AMDFAM1AH;
+  if (model <= 0x77)
+   {
+ cpu = "znver5";
+ CHECK___builtin_cpu_is ("znver5");
+ cpu_model->__cpu_subtype = AMDFAM1AH_ZNVER5;
+   }
+  else if (has_cpu_feature (cpu_model, cpu_features2,
+   FEATURE_AVX512VP2INTERSECT))
+   {
+ cpu = "znver5";
+ CHECK___builtin_cpu_is ("znver5");
+ cpu_model->__cpu_subtype = AMDFAM1AH_ZNVER5;
+   }
+  break;
 default:
   break;
 }
diff --git a/gcc/common/config/i386/i386-common.cc 
b/gcc/common/config/i386/i386-common.cc
index a8809889360b..f36101558077 100644
--- a/gcc/common/config/i386/i386-common.cc
+++ b/gcc/common/config/i386/i386-common.cc
@@ -1983,7 +1983,8 @@ const char *const processor_names[] =
   "znver1",
   "znver2",
   "znver3",
-  "znver4"
+  "znver4",
+  "znver5"
 };
 
 /* Guarantee that the array is aligned with enum processor_type.  */
@@ -2243,6 +2244,9 @@ const pta processor_alias_table[] =
   {"znver4", PROCESSOR_ZNVER4, CPU_ZNVER4,
 PTA_ZNVER4,
 M_CPU_SUBTYPE (AMDFAM19H_ZNVER4), P_PROC_AVX512F},
+  {"znver5", PROCESSOR_ZNVER5, CPU_ZNVER5,
+PTA_ZNVER5,
+M_CPU_SUBTYPE (AMDFAM1AH_ZNVER5), P_PROC_AVX512F},
   {"btver1", PROCESSOR_BTVER1, CPU_GENERIC,
 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
   | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_PRFCHW
diff --git a/gcc/common

[gcc r13-9063] Fixup unaligned load/store cost for znver5

2024-09-28 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:22f4e4a5043fa69c54b1010d04bcd77958646189

commit r13-9063-g22f4e4a5043fa69c54b1010d04bcd77958646189
Author: Richard Biener 
Date:   Tue Jul 16 10:45:27 2024 +0200

Fixup unaligned load/store cost for znver5

Currently unaligned YMM and ZMM load and store costs are cheaper than
aligned ones, which causes the vectorizer to purposely mis-align accesses
by adding an alignment prologue.  It looks like the unaligned costs
were simply copied from the bogus znver4 costs.  The following makes
the unaligned costs equal to the aligned costs like in the fixed znver4
version.
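
In numbers (taken from the diff below; entries are the 32/64/128/256/512-bit
access costs), the fix simply copies the aligned costs over:

    /* znver5_cost before: unaligned cheaper than aligned for YMM/ZMM.
         unaligned loads  {6, 6, 6, 6, 6},    stores {8, 8, 8, 8, 8}
       znver5_cost after: equal to the aligned costs.
         unaligned loads  {6, 6, 10, 10, 12}, stores {8, 8, 8, 12, 12}  */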

* config/i386/x86-tune-costs.h (znver5_cost): Update unaligned
load and store cost from the aligned costs.

(cherry picked from commit 896393791ee34ffc176c87d232dfee735db3aaab)

Diff:
---
 gcc/config/i386/x86-tune-costs.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index 4d3194323e14..02fad74c4d1c 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -2060,8 +2060,8 @@ struct processor_costs znver5_cost = {
   in 32bit, 64bit, 128bit, 256bit and 
512bit */
   {8, 8, 8, 12, 12},   /* cost of storing SSE register
   in 32bit, 64bit, 128bit, 256bit and 
512bit */
-  {6, 6, 6, 6, 6}, /* cost of unaligned loads.  */
-  {8, 8, 8, 8, 8}, /* cost of unaligned stores.  */
+  {6, 6, 10, 10, 12},  /* cost of unaligned loads.  */
+  {8, 8, 8, 12, 12},   /* cost of unaligned stores.  */
   2, 2, 2, /* cost of moving XMM,YMM,ZMM
   register.  */
   6,   /* cost of moving SSE register to 
integer.  */


[gcc r14-10719] Zen5 tuning part 3: scheduler tweaks

2024-09-28 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:2c01292411044adbd67f79355c1e24decd2fd3c0

commit r14-10719-g2c01292411044adbd67f79355c1e24decd2fd3c0
Author: Jan Hubicka 
Date:   Tue Sep 3 16:26:16 2024 +0200

Zen5 tuning part 3: scheduler tweaks

this patch adds support for a new fusion in znver5 documented in the
optimization manual:

   The Zen5 microarchitecture adds support to fuse reg-reg MOV instructions
   with certain ALU instructions.  The following conditions need to be met
   for fusion to happen:
     - The MOV should be a reg-reg mov with opcode 0x89 or 0x8B.
     - The MOV is followed by an ALU instruction where the MOV and ALU
       destination registers match.
     - The ALU instruction may source only registers or immediate data.
       There cannot be any memory source.
     - The ALU instruction sources either the source or the destination of
       the MOV instruction.
     - If the ALU instruction has 2 reg sources, they should be different.
     - The following ALU instructions can fuse with an older qualified MOV
       instruction:
       ADD ADC AND XOR OP SUB SBB INC DEC NOT SAL / SHL SHR SAR
       (I assume OP is OR)
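
As a concrete instance (the pair appears in the assembly diff below), this
MOV/SHR sequence now qualifies for fusion:

    movq    %rbx, %rbp      # reg-reg MOV
    shrq    $6, %rbp        # ALU op whose destination matches the MOV's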

I also increased the issue rate from 4 to 6.  Theoretically znver5 can do
more, but with our model we can't really use it.  Increasing the issue rate
to 8 leads to an infinite loop in the scheduler.

Finally, I also enabled fuse_alu_and_branch since it is supported by
znver5 (I think by earlier zens too).

The new fusion pattern moves quite a few instructions around in common code:
@@ -2210,13 +2210,13 @@
.cfi_offset 3, -32
leaq63(%rsi), %rbx
movq%rbx, %rbp
+   shrq$6, %rbp
+   salq$3, %rbp
subq$16, %rsp
.cfi_def_cfa_offset 48
movq%rdi, %r12
-   shrq$6, %rbp
-   movq%rsi, 8(%rsp)
-   salq$3, %rbp
movq%rbp, %rdi
+   movq%rsi, 8(%rsp)
call_Znwm
movq8(%rsp), %rsi
movl$0, 8(%r12)
@@ -2224,8 +2224,8 @@
movq%rax, (%r12)
movq%rbp, 32(%r12)
testq   %rsi, %rsi
-   movq%rsi, %rdx
cmovns  %rsi, %rbx
+   movq%rsi, %rdx
sarq$63, %rdx
shrq$58, %rdx
sarq$6, %rbx
which should help decoder bandwidth and perhaps also the cache, though I was
not able to measure an off-noise effect on SPEC.

gcc/ChangeLog:

* config/i386/i386.h (TARGET_FUSE_MOV_AND_ALU): New tune.
* config/i386/x86-tune-sched.cc (ix86_issue_rate): Updat for znver5.
(ix86_adjust_cost): Add TODO about znver5 memory latency.
(ix86_fuse_mov_alu_p): New.
(ix86_macro_fusion_pair_p): Use it.
* config/i386/x86-tune.def (X86_TUNE_FUSE_ALU_AND_BRANCH): Add 
ZNVER5.
(X86_TUNE_FUSE_MOV_AND_ALU): New tune;

(cherry picked from commit e2125a600552bc6e0329e3f1224eea14804db8d3)

Diff:
---
 gcc/config/i386/i386.h|  2 ++
 gcc/config/i386/x86-tune-sched.cc | 67 ++-
 gcc/config/i386/x86-tune.def  | 11 +--
 3 files changed, 77 insertions(+), 3 deletions(-)

diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 26e15d2677fb..2de838ef15ce 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -427,6 +427,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
ix86_tune_features[X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS]
 #define TARGET_FUSE_ALU_AND_BRANCH \
ix86_tune_features[X86_TUNE_FUSE_ALU_AND_BRANCH]
+#define TARGET_FUSE_MOV_AND_ALU \
+   ix86_tune_features[X86_TUNE_FUSE_MOV_AND_ALU]
 #define TARGET_OPT_AGU ix86_tune_features[X86_TUNE_OPT_AGU]
 #define TARGET_AVOID_LEA_FOR_ADDR \
ix86_tune_features[X86_TUNE_AVOID_LEA_FOR_ADDR]
diff --git a/gcc/config/i386/x86-tune-sched.cc 
b/gcc/config/i386/x86-tune-sched.cc
index 578ba57e6b22..07b79876c36f 100644
--- a/gcc/config/i386/x86-tune-sched.cc
+++ b/gcc/config/i386/x86-tune-sched.cc
@@ -69,7 +69,6 @@ ix86_issue_rate (void)
 case PROCESSOR_ZNVER2:
 case PROCESSOR_ZNVER3:
 case PROCESSOR_ZNVER4:
-case PROCESSOR_ZNVER5:
 case PROCESSOR_CORE2:
 case PROCESSOR_NEHALEM:
 case PROCESSOR_SANDYBRIDGE:
@@ -92,6 +91,13 @@ ix86_issue_rate (void)
   return 5;
 
 case PROCESSOR_SAPPHIRERAPIDS:
+/* For znver5 decoder can handle 4 or 8 instructions per cycle,
+   op cache 12 instruction/cycle, dispatch 8 instructions
+   integer rename 8 instructions and Fp 6 instructions.
+
+   The scheduler, without understanding out of order nature of the CPU
+   is unlikely going to be able to fill all of these.  */
+case PROCESSOR_ZNVER5:
   return 6;
 
 default:
@@ -435,6 +441,8 @@ ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn 
*dep_insn, int cost,
  enum attr_un

[gcc r14-10720] Zen5 tuning part 3: fix typo in previous patch

2024-09-28 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:2eade72b0e2ac9dd18ef517bc3b868157f1ddf48

commit r14-10720-g2eade72b0e2ac9dd18ef517bc3b868157f1ddf48
Author: Jan Hubicka 
Date:   Tue Sep 3 17:25:05 2024 +0200

Zen5 tuning part 3: fix typo in previous patch

gcc/ChangeLog:

* config/i386/x86-tune-sched.cc (ix86_fuse_mov_alu_p): Fix
typo.

(cherry picked from commit 910e1769a0653ac32bd8c1d6aabb39c797d5d773)

Diff:
---
 gcc/config/i386/x86-tune-sched.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/config/i386/x86-tune-sched.cc 
b/gcc/config/i386/x86-tune-sched.cc
index 07b79876c36f..746f23b3cbc4 100644
--- a/gcc/config/i386/x86-tune-sched.cc
+++ b/gcc/config/i386/x86-tune-sched.cc
@@ -615,7 +615,7 @@ ix86_fuse_mov_alu_p (rtx_insn *mov, rtx_insn *alu)
   /* One of operands should be register.  */
   if (op1 && (!REG_P (op0) || REGNO (op0) != REGNO (reg)))
 std::swap (op0, op1);
-  if (!REG_P (op0) || REGNO (op1) != REGNO (reg))
+  if (!REG_P (op0) || REGNO (op0) != REGNO (reg))
 return false;
   if (op1
   && !REG_P (op1)


[gcc r12-10732] Add AMD znver5 processor enablement with scheduler model

2024-09-28 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:54806268b47775449c7e237f8f03e922d6da26f6

commit r12-10732-g54806268b47775449c7e237f8f03e922d6da26f6
Author: Jan Hubicka 
Date:   Mon Mar 18 10:22:44 2024 +0100

Add AMD znver5 processor enablement with scheduler model

2024-02-14  Jan Hubicka  
Karthiban Anbazhagan  

gcc/ChangeLog:
* common/config/i386/cpuinfo.h (get_amd_cpu): Recognize znver5.
* common/config/i386/i386-common.cc (processor_names): Add znver5.
(processor_alias_table): Likewise.
* common/config/i386/i386-cpuinfo.h (processor_types): Add new zen
family.
(processor_subtypes): Add znver5.
* config.gcc (x86_64-*-* |...): Likewise.
* config/i386/driver-i386.cc (host_detect_local_cpu): Let
march=native detect znver5 cpu's.
* config/i386/i386-c.cc (ix86_target_macros_internal): Add
znver5.
* config/i386/i386-options.cc (m_ZNVER5): New definition
(processor_cost_table): Add znver5.
* config/i386/i386.cc (ix86_reassociation_width): Likewise.
* config/i386/i386.h (processor_type): Add PROCESSOR_ZNVER5
(PTA_ZNVER5): New definition.
* config/i386/i386.md (define_attr "cpu"): Add znver5.
(Scheduling descriptions) Add znver5.md.
* config/i386/x86-tune-costs.h (znver5_cost): New definition.
* config/i386/x86-tune-sched.cc (ix86_issue_rate): Add znver5.
(ix86_adjust_cost): Likewise.
* config/i386/x86-tune.def (avx512_move_by_pieces): Add m_ZNVER5.
(avx512_store_by_pieces): Add m_ZNVER5.
* doc/extend.texi: Add znver5.
* doc/invoke.texi: Likewise.
* config/i386/znver4.md: Rename to zn4zn5.md; combine znver4 and 
znver5 Scheduler.

gcc/testsuite/ChangeLog:
* g++.target/i386/mv29.C: Handle znver5 arch.
* gcc.target/i386/funcspec-56.inc:Likewise.

(cherry picked from commit d0aa0af9a9b7dd709a8c7ff6604ed6b7da0fc23a)

Diff:
---
 gcc/common/config/i386/cpuinfo.h  |  16 +
 gcc/common/config/i386/i386-common.cc |   6 +-
 gcc/common/config/i386/i386-cpuinfo.h |   2 +
 gcc/config.gcc|  14 +-
 gcc/config/i386/driver-i386.cc|   5 +
 gcc/config/i386/i386-c.cc |   7 +
 gcc/config/i386/i386-options.cc   |   6 +-
 gcc/config/i386/i386.cc   |   3 +-
 gcc/config/i386/i386.h|   3 +
 gcc/config/i386/i386.md   |   4 +-
 gcc/config/i386/x86-tune-costs.h  | 134 +
 gcc/config/i386/x86-tune-sched.cc |   2 +
 gcc/config/i386/x86-tune.def  |   4 +-
 gcc/config/i386/{znver4.md => zn4zn5.md}  | 817 --
 gcc/doc/extend.texi   |   3 +
 gcc/doc/invoke.texi   |  10 +
 gcc/testsuite/g++.target/i386/mv29.C  |   6 +
 gcc/testsuite/gcc.target/i386/funcspec-56.inc |   2 +
 18 files changed, 983 insertions(+), 61 deletions(-)

diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h
index 316ad3cb3e9b..d79534331f77 100644
--- a/gcc/common/config/i386/cpuinfo.h
+++ b/gcc/common/config/i386/cpuinfo.h
@@ -282,6 +282,22 @@ get_amd_cpu (struct __processor_model *cpu_model,
  cpu_model->__cpu_subtype = AMDFAM19H_ZNVER3;
}
   break;
+case 0x1a:
+  cpu_model->__cpu_type = AMDFAM1AH;
+  if (model <= 0x77)
+   {
+ cpu = "znver5";
+ CHECK___builtin_cpu_is ("znver5");
+ cpu_model->__cpu_subtype = AMDFAM1AH_ZNVER5;
+   }
+  else if (has_cpu_feature (cpu_model, cpu_features2,
+   FEATURE_AVX512VP2INTERSECT))
+   {
+ cpu = "znver5";
+ CHECK___builtin_cpu_is ("znver5");
+ cpu_model->__cpu_subtype = AMDFAM1AH_ZNVER5;
+   }
+  break;
 default:
   break;
 }
diff --git a/gcc/common/config/i386/i386-common.cc 
b/gcc/common/config/i386/i386-common.cc
index e2594cae4cc1..a01172cab2fb 100644
--- a/gcc/common/config/i386/i386-common.cc
+++ b/gcc/common/config/i386/i386-common.cc
@@ -1831,7 +1831,8 @@ const char *const processor_names[] =
   "znver1",
   "znver2",
   "znver3",
-  "znver4"
+  "znver4",
+  "znver5"
 };
 
 /* Guarantee that the array is aligned with enum processor_type.  */
@@ -2067,6 +2068,9 @@ const pta processor_alias_table[] =
   {"znver4", PROCESSOR_ZNVER4, CPU_ZNVER4,
 PTA_ZNVER4,
 M_CPU_SUBTYPE (AMDFAM19H_ZNVER4), P_PROC_AVX512F},
+  {"znver5", PROCESSOR_ZNVER5, CPU_ZNVER5,
+PTA_ZNVER5,
+M_CPU_SUBTYPE (AMDFAM1AH_ZNVER5), P_PROC_AVX512F},
   {"btver1", PROCESSOR_BTVER1, CPU_GENERIC,
 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
   | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_PRFCHW
diff --git a/gcc/commo

[gcc r12-10733] Fixup unaligned load/store cost for znver5

2024-09-28 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:c77b1c833e84b62928a729556c502e1311782b2d

commit r12-10733-gc77b1c833e84b62928a729556c502e1311782b2d
Author: Richard Biener 
Date:   Tue Jul 16 10:45:27 2024 +0200

Fixup unaligned load/store cost for znver5

Currently unaligned YMM and ZMM load and store costs are cheaper than
aligned ones, which causes the vectorizer to purposely mis-align accesses
by adding an alignment prologue.  It looks like the unaligned costs
were simply copied from the bogus znver4 costs.  The following makes
the unaligned costs equal to the aligned costs like in the fixed znver4
version.

* config/i386/x86-tune-costs.h (znver5_cost): Update unaligned
load and store cost from the aligned costs.

(cherry picked from commit 896393791ee34ffc176c87d232dfee735db3aaab)

Diff:
---
 gcc/config/i386/x86-tune-costs.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index 11a9dd0ff9ed..b8e7ab9372ea 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -2028,8 +2028,8 @@ struct processor_costs znver5_cost = {
   in 32bit, 64bit, 128bit, 256bit and 
512bit */
   {8, 8, 8, 12, 12},   /* cost of storing SSE register
   in 32bit, 64bit, 128bit, 256bit and 
512bit */
-  {6, 6, 6, 6, 6}, /* cost of unaligned loads.  */
-  {8, 8, 8, 8, 8}, /* cost of unaligned stores.  */
+  {6, 6, 10, 10, 12},  /* cost of unaligned loads.  */
+  {8, 8, 8, 12, 12},   /* cost of unaligned stores.  */
   2, 2, 2, /* cost of moving XMM,YMM,ZMM
   register.  */
   6,   /* cost of moving SSE register to 
integer.  */


[gcc r13-9065] Re-add m_ZNVER4 to X86_TUNE_AVOID_256FMA_CHAINS

2024-09-28 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:ad9ba1eccec5086b84f1030fb3e87947242ba904

commit r13-9065-gad9ba1eccec5086b84f1030fb3e87947242ba904
Author: Jan Hubicka 
Date:   Sun Sep 29 02:10:14 2024 +0200

Re-add m_ZNVER4 to X86_TUNE_AVOID_256FMA_CHAINS

* config/i386/x86-tune.def (X86_TUNE_AVOID_256FMA_CHAINS): Re-add 
m_ZNVER4
accidentally removed during znver5 merge.

Diff:
---
 gcc/config/i386/x86-tune.def | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index 0ef75e986be9..629e1fdf5f77 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -518,7 +518,7 @@ DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", 
m_ZNVER)
 /* X86_TUNE_AVOID_256FMA_CHAINS: Avoid creating loops with tight 256bit or
smaller FMA chain.  */
 DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains", m_ZNVER2 | 
m_ZNVER3
- | m_ALDERLAKE | m_SAPPHIRERAPIDS | m_CORE_ATOM | m_GENERIC | m_ZNVER5)
+ | m_ALDERLAKE | m_SAPPHIRERAPIDS | m_CORE_ATOM | m_GENERIC | m_ZNVER4 
| m_ZNVER5)
 
 /* X86_TUNE_AVOID_512FMA_CHAINS: Avoid creating loops with tight 512bit or
smaller FMA chain.  */


[gcc r13-9064] Zen5 tuning part 1: avoid FMA chains

2024-09-28 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:7c0c772e4fb89bf4d9bc09f7d8e41c6bc0b0e093

commit r13-9064-g7c0c772e4fb89bf4d9bc09f7d8e41c6bc0b0e093
Author: Jan Hubicka 
Date:   Tue Sep 3 13:38:33 2024 +0200

Zen5 tuning part 1: avoid FMA chains

Testing matrix multiplication benchmarks shows that FMA on a critical
chain is a performance loss compared to separate multiply and add.  While
the FMA latency of 4 is lower than multiply + add (3+2), the problem is
that all values need to be ready before the computation starts.

While on znver4 AVX512 code fared well with FMA, that was because of the
split registers.  Znver5 benefits from avoiding FMA on all widths.  This
may be different with the mobile version, though.

On a naive matrix multiplication benchmark the difference is 8% with -O3
only, since with -Ofast loop interchange solves the problem differently.
It is a 30% win, for example, on S323 from TSVC:

real_t s323(struct args_t * func_args)
{

//recurrences
//coupled recurrence

initialise_arrays(__func__);
gettimeofday(&func_args->t1, NULL);

for (int nl = 0; nl < iterations/2; nl++) {
for (int i = 1; i < LEN_1D; i++) {
a[i] = b[i-1] + c[i] * d[i];
b[i] = a[i] + c[i] * e[i];
}
dummy(a, b, c, d, e, aa, bb, cc, 0.);
}

gettimeofday(&func_args->t2, NULL);
return calc_checksum(__func__);
}
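
The same critical-chain effect shows up in a plain dot product; a minimal
sketch (mine, not from the commit):

/* Each fused multiply-add depends on the previous value of sum, so the
   loop runs at FMA latency (4).  With separate mul + add the multiply
   is off the critical path and only the add latency (2) chains.  */
double dot (const double *a, const double *b, int n)
{
  double sum = 0.0;
  for (int i = 0; i < n; i++)
    sum += a[i] * b[i];
  return sum;
}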

gcc/ChangeLog:

* config/i386/x86-tune.def (X86_TUNE_AVOID_128FMA_CHAINS): Enable 
for
znver5.
(X86_TUNE_AVOID_256FMA_CHAINS): Likewise.
(X86_TUNE_AVOID_512FMA_CHAINS): Likewise.

(cherry picked from commit d6360b4083695970789fd65b9c515c11a5ce25b4)

Diff:
---
 gcc/config/i386/x86-tune.def | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index 9cc44e2b628c..0ef75e986be9 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -513,16 +513,16 @@ DEF_TUNE (X86_TUNE_USE_SCATTER_8PARTS, 
"use_scatter_8parts",
 
 /* X86_TUNE_AVOID_128FMA_CHAINS: Avoid creating loops with tight 128bit or
smaller FMA chain.  */
-DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER1 | 
m_ZNVER2 | m_ZNVER3)
+DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER)
 
 /* X86_TUNE_AVOID_256FMA_CHAINS: Avoid creating loops with tight 256bit or
smaller FMA chain.  */
 DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains", m_ZNVER2 | 
m_ZNVER3
- | m_ALDERLAKE | m_SAPPHIRERAPIDS | m_CORE_ATOM | m_GENERIC | m_ZNVER4)
+ | m_ALDERLAKE | m_SAPPHIRERAPIDS | m_CORE_ATOM | m_GENERIC | m_ZNVER5)
 
 /* X86_TUNE_AVOID_512FMA_CHAINS: Avoid creating loops with tight 512bit or
smaller FMA chain.  */
-DEF_TUNE (X86_TUNE_AVOID_512FMA_CHAINS, "avoid_fma512_chains", m_NONE)
+DEF_TUNE (X86_TUNE_AVOID_512FMA_CHAINS, "avoid_fma512_chains", m_ZNVER5)
 
 /* X86_TUNE_V2DF_REDUCTION_PREFER_PHADDPD: Prefer haddpd
for v2df vector reduction.  */


[gcc r13-9068] Zen5 tuning part 2: disable gather and scatter

2024-09-30 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:456719b5c0705a6c2065fc261f41d0c2a30f3045

commit r13-9068-g456719b5c0705a6c2065fc261f41d0c2a30f3045
Author: Jan Hubicka 
Date:   Tue Sep 3 15:07:41 2024 +0200

Zen5 tuning part 2: disable gather and scatter

We disable gathers for zen4.  It seems that gather has improved a bit
compared to zen4, and the Zen5 optimization manual suggests "Avoid GATHER
instructions when the indices are known ahead of time. Vector loads
followed by shuffles result in a higher load bandwidth."  However, the
situation seems to be more complicated.

Gather is a 5-10% loss on the parest benchmark as well as a 30% loss on
sparse dot products in TSVC.  Curiously enough, breaking these out into a
microbenchmark reversed the situation, and it turns out that the
performance depends on how the indices are distributed: gather is a loss
if indices are sequential, neutral if they are random, and a win for some
strides (4, 8).

This seems to be similar to earlier Zens, so I think (especially for
backporting znver5 support) that it makes sense to be consistent and
disable gather unless we work out a good heuristic on when to use it.
Since we typically do not know the indices in advance, I don't see how
that can be done.

I opened PR116582 with some examples of wins and losses.
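
For reference, the sparse dot product shape in question; a sketch along
the lines of the TSVC kernels (not the exact benchmark source):

/* When vectorized with gathers the a[idx[i]] load becomes vgatherdpd;
   with the tune flag disabled GCC emits scalar loads plus shuffles.
   Which one wins depends on how idx is distributed, as noted above.  */
double sdot (const double *a, const int *idx, const double *b, int n)
{
  double sum = 0.0;
  for (int i = 0; i < n; i++)
    sum += a[idx[i]] * b[i];
  return sum;
}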

gcc/ChangeLog:

* config/i386/x86-tune.def (X86_TUNE_USE_GATHER_2PARTS): Disable for
ZNVER5.
(X86_TUNE_USE_SCATTER_2PARTS): Disable for ZNVER5.
(X86_TUNE_USE_GATHER_4PARTS): Disable for ZNVER5.
(X86_TUNE_USE_SCATTER_4PARTS): Disable for ZNVER5.
(X86_TUNE_USE_GATHER_8PARTS): Disable for ZNVER5.
(X86_TUNE_USE_SCATTER_8PARTS): Disable for ZNVER5.

(cherry picked from commit d82edbe92eed53a479736fcbbe6d54d0fb42daa4)

Diff:
---
 gcc/config/i386/x86-tune.def | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index 629e1fdf5f77..4231ca90b0ed 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -481,35 +481,35 @@ DEF_TUNE (X86_TUNE_AVOID_4BYTE_PREFIXES, 
"avoid_4byte_prefixes",
 /* X86_TUNE_USE_GATHER_2PARTS: Use gather instructions for vectors with 2
elements.  */
 DEF_TUNE (X86_TUNE_USE_GATHER_2PARTS, "use_gather_2parts",
- ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_ALDERLAKE
+ ~(m_ZNVER | m_ALDERLAKE
| m_CORE_ATOM | m_GENERIC | m_GDS))
 
 /* X86_TUNE_USE_SCATTER_2PARTS: Use scater instructions for vectors with 2
elements.  */
 DEF_TUNE (X86_TUNE_USE_SCATTER_2PARTS, "use_scatter_2parts",
- ~(m_ZNVER4))
+ ~(m_ZNVER4 | m_ZNVER5))
 
 /* X86_TUNE_USE_GATHER_4PARTS: Use gather instructions for vectors with 4
elements.  */
 DEF_TUNE (X86_TUNE_USE_GATHER_4PARTS, "use_gather_4parts",
- ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_ALDERLAKE
+ ~(m_ZNVER | m_ALDERLAKE
| m_CORE_ATOM | m_GENERIC | m_GDS))
 
 /* X86_TUNE_USE_SCATTER_4PARTS: Use scater instructions for vectors with 4
elements.  */
 DEF_TUNE (X86_TUNE_USE_SCATTER_4PARTS, "use_scatter_4parts",
- ~(m_ZNVER4))
+ ~(m_ZNVER4 | m_ZNVER5))
 
 /* X86_TUNE_USE_GATHER: Use gather instructions for vectors with 8 or more
elements.  */
 DEF_TUNE (X86_TUNE_USE_GATHER_8PARTS, "use_gather_8parts",
- ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER4 | m_ALDERLAKE
+ ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER4 | m_ZNVER5 | m_ALDERLAKE
| m_CORE_ATOM | m_GENERIC | m_GDS))
 
 /* X86_TUNE_USE_SCATTER: Use scater instructions for vectors with 8 or more
elements.  */
 DEF_TUNE (X86_TUNE_USE_SCATTER_8PARTS, "use_scatter_8parts",
- ~(m_ZNVER4))
+ ~(m_ZNVER4 | m_ZNVER5))
 
 /* X86_TUNE_AVOID_128FMA_CHAINS: Avoid creating loops with tight 128bit or
smaller FMA chain.  */


[gcc r12-10736] Zen5 tuning part 1: avoid FMA chains

2024-09-30 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:be6334fffdf2a7df3b7f92ea933b804664dfc383

commit r12-10736-gbe6334fffdf2a7df3b7f92ea933b804664dfc383
Author: Jan Hubicka 
Date:   Tue Sep 3 13:38:33 2024 +0200

Zen5 tuning part 1: avoid FMA chains

Testing matrix multiplication benchmarks shows that FMA on a critical
chain is a performance loss compared to separate multiply and add.  While
the FMA latency of 4 is lower than multiply + add (3+2), the problem is
that all values need to be ready before the computation starts.

While on znver4 AVX512 code fared well with FMA, that was because of the
split registers.  Znver5 benefits from avoiding FMA on all widths.  This
may be different with the mobile version, though.

On a naive matrix multiplication benchmark the difference is 8% with -O3
only, since with -Ofast loop interchange solves the problem differently.
It is a 30% win, for example, on S323 from TSVC:

real_t s323(struct args_t * func_args)
{

//recurrences
//coupled recurrence

initialise_arrays(__func__);
gettimeofday(&func_args->t1, NULL);

for (int nl = 0; nl < iterations/2; nl++) {
for (int i = 1; i < LEN_1D; i++) {
a[i] = b[i-1] + c[i] * d[i];
b[i] = a[i] + c[i] * e[i];
}
dummy(a, b, c, d, e, aa, bb, cc, 0.);
}

gettimeofday(&func_args->t2, NULL);
return calc_checksum(__func__);
}

gcc/ChangeLog:

* config/i386/x86-tune.def (X86_TUNE_AVOID_128FMA_CHAINS): Enable 
for
znver5.
(X86_TUNE_AVOID_256FMA_CHAINS): Likewise.
(X86_TUNE_AVOID_512FMA_CHAINS): Likewise.

(cherry picked from commit d6360b4083695970789fd65b9c515c11a5ce25b4)

Diff:
---
 gcc/config/i386/x86-tune.def | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index f5bf331242aa..249a239de775 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -499,16 +499,16 @@ DEF_TUNE (X86_TUNE_USE_SCATTER_8PARTS, 
"use_scatter_8parts",
 
 /* X86_TUNE_AVOID_128FMA_CHAINS: Avoid creating loops with tight 128bit or
smaller FMA chain.  */
-DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER1 | 
m_ZNVER2 | m_ZNVER3)
+DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER)
 
 /* X86_TUNE_AVOID_256FMA_CHAINS: Avoid creating loops with tight 256bit or
smaller FMA chain.  */
 DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains", m_ZNVER2 | 
m_ZNVER3
- | m_ALDERLAKE | m_SAPPHIRERAPIDS | m_GENERIC | m_ZNVER4)
+ | m_ALDERLAKE | m_SAPPHIRERAPIDS | m_GENERIC | m_ZNVER4 | m_ZNVER5)
 
 /* X86_TUNE_AVOID_512FMA_CHAINS: Avoid creating loops with tight 512bit or
smaller FMA chain.  */
-DEF_TUNE (X86_TUNE_AVOID_512FMA_CHAINS, "avoid_fma512_chains", m_NONE)
+DEF_TUNE (X86_TUNE_AVOID_512FMA_CHAINS, "avoid_fma512_chains", m_ZNVER5)
 
 /* X86_TUNE_V2DF_REDUCTION_PREFER_PHADDPD: Prefer haddpd
for v2df vector reduction.  */


[gcc r15-5362] ipa-modref bits for unsequenced and reproducible

2024-11-17 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:addf02282026cc23d24fc4445b47b408d484d1e7

commit r15-5362-gaddf02282026cc23d24fc4445b47b408d484d1e7
Author: Jan Hubicka 
Date:   Sun Nov 17 11:54:10 2024 +0100

ipa-modref bits for unsequenced and reproducible

The C attributes reproducible and unsequenced imply that calling a
function twice leads to the same effect if the parameters are otherwise
unchanged (the function call itself does not count).  This is a bit
stronger than modref's notion of nondeterminism, which says that the same
inputs will yield the same outputs (the function call itself does count).

This patch makes reproducible/unsequenced imply determinism and cleans up
the determinism handling.  By itself it is not useful, since we cannot
make use of it unless we know what the inputs/outputs of the function
are, which I plan to handle via the "fn spec" attribute.
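
For reference, the C23 attributes in question appertain to the function
type; a minimal sketch (my example, not from the patch) of declarations
modref can now treat as deterministic:

/* unsequenced: effectless, idempotent, stateless and independent;
   reproducible: the weaker effectless + idempotent variant.  Either
   way, two calls with unchanged arguments have the same effect.  */
int isqrt (int x) [[unsequenced]];
int lookup (const int *table, int i) [[reproducible]];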

gcc/ChangeLog:

* ipa-modref.cc (modref_summary::useful_p): const/pure implies
determinism.
(modref_summary_lto::useful_p): Likewise.
(ignore_nondeterminism_p): Add CALLEE_FNTYPE parameter; check for
reproducible/unsequenced
(modref_access_analysis::record_access_p): Use 
ignore_nondeterminism_p
when handling volatile accesses.
(modref_access_analysis::get_access_for_fnspec): Update.
(modref_access_analysis::process_fnspec): Cleanup handling of 
NOVOPS.
(modref_access_analysis::analyze_call): Use ignore_nondeterminism_p
when handling asm statements.
(modref_access_analysis::analyze_stmt): Update.
(propagate_unknown_call): Update.
(modref_propagate_in_scc): Update.
(ipa_merge_modref_summary_after_inlining): Update.

Diff:
---
 gcc/ipa-modref.cc | 90 ---
 1 file changed, 53 insertions(+), 37 deletions(-)

diff --git a/gcc/ipa-modref.cc b/gcc/ipa-modref.cc
index 12ac0e7865a7..08a7740de943 100644
--- a/gcc/ipa-modref.cc
+++ b/gcc/ipa-modref.cc
@@ -336,15 +336,13 @@ modref_summary::useful_p (int ecf_flags, bool check_flags)
   && remove_useless_eaf_flags (static_chain_flags, ecf_flags, false))
 return true;
   if (ecf_flags & ECF_CONST)
-return ((!side_effects || !nondeterministic)
-   && (ecf_flags & ECF_LOOPING_CONST_OR_PURE));
+return (!side_effects && (ecf_flags & ECF_LOOPING_CONST_OR_PURE));
   if (loads && !loads->every_base)
 return true;
   else
 kills.release ();
   if (ecf_flags & ECF_PURE)
-return ((!side_effects || !nondeterministic)
-   && (ecf_flags & ECF_LOOPING_CONST_OR_PURE));
+return (!side_effects && (ecf_flags & ECF_LOOPING_CONST_OR_PURE));
   return stores && !stores->every_base;
 }
 
@@ -409,15 +407,13 @@ modref_summary_lto::useful_p (int ecf_flags, bool 
check_flags)
   && remove_useless_eaf_flags (static_chain_flags, ecf_flags, false))
 return true;
   if (ecf_flags & (ECF_CONST | ECF_NOVOPS))
-return ((!side_effects || !nondeterministic)
-   && (ecf_flags & ECF_LOOPING_CONST_OR_PURE));
+return (!side_effects && (ecf_flags & ECF_LOOPING_CONST_OR_PURE));
   if (loads && !loads->every_base)
 return true;
   else
 kills.release ();
   if (ecf_flags & ECF_PURE)
-return ((!side_effects || !nondeterministic)
-   && (ecf_flags & ECF_LOOPING_CONST_OR_PURE));
+return (!side_effects && (ecf_flags & ECF_LOOPING_CONST_OR_PURE));
   return stores && !stores->every_base;
 }
 
@@ -794,13 +790,25 @@ namespace {
 /* Return true if ECF flags says that nondeterminism can be ignored.  */
 
 static bool
-ignore_nondeterminism_p (tree caller, int flags)
+ignore_nondeterminism_p (tree caller, int flags, tree callee_fntype)
 {
-  if (flags & (ECF_CONST | ECF_PURE))
+  int caller_flags = flags_from_decl_or_type (caller);
+  if ((flags | caller_flags) & (ECF_CONST | ECF_PURE))
 return true;
   if ((flags & (ECF_NORETURN | ECF_NOTHROW)) == (ECF_NORETURN | ECF_NOTHROW)
   || (!opt_for_fn (caller, flag_exceptions) && (flags & ECF_NORETURN)))
 return true;
+  /* C language defines unsequenced and reproducible functions
+ to be deterministic.  */
+  if (lookup_attribute ("unsequenced", TYPE_ATTRIBUTES (TREE_TYPE (caller)))
+  || lookup_attribute ("reproducible",
+  TYPE_ATTRIBUTES (TREE_TYPE (caller
+return true;
+  if (callee_fntype
+  && (lookup_attribute ("unsequenced", TYPE_ATTRIBUTES (callee_fntype))
+ || lookup_attribute ("reproducible",
+  TYPE_ATTRIBUTES (callee_fntype
+return true;
   return false;
 }
 
@@ -1151,7 +1159,8 @@ modref_access_analysis::record_access_lto 
(modref_records_lto *tt, ao_ref *ref,
 bool
 modref_access_analysis::record_access_p (tree expr)
 {
-  if (TREE_THIS_VOLATILE (expr))
+  if (TREE_THIS_VOLATILE (expr)
+  && !ignore_nondeterminism_p (current_function_decl, 0, NULL))
 {
   if

[gcc r15-5358] Minor cleanup to cxx_init_decl_processing

2024-11-16 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:101f8c73d575b4746e49b0ea35eb6cc24de1dfdc

commit r15-5358-g101f8c73d575b4746e49b0ea35eb6cc24de1dfdc
Author: Jan Hubicka 
Date:   Sat Nov 16 23:44:13 2024 +0100

Minor cleanup to cxx_init_decl_processing

gcc/cp/ChangeLog:

* decl.cc (cxx_build_operator_new): Break out from ...
(cxx_build_operator_delete): Break out from ...
(cxx_init_operator_new_delete_decls): Break out from ...
(cxx_init_decl_processing): ... here.

Diff:
---
 gcc/cp/decl.cc | 266 -
 1 file changed, 132 insertions(+), 134 deletions(-)

diff --git a/gcc/cp/decl.cc b/gcc/cp/decl.cc
index b4e7ceefedb0..25296a9de8f5 100644
--- a/gcc/cp/decl.cc
+++ b/gcc/cp/decl.cc
@@ -4851,6 +4851,131 @@ initialize_predefined_identifiers (void)
 }
 }
 
+/* Build a specific variant of operator new.  */
+
+static void
+cxx_build_operator_new (tree newtype)
+{
+  tree opnew = push_cp_library_fn (NEW_EXPR, newtype, 0);
+  DECL_IS_MALLOC (opnew) = 1;
+  DECL_SET_IS_OPERATOR_NEW (opnew, true);
+  DECL_IS_REPLACEABLE_OPERATOR (opnew) = 1;
+  opnew = push_cp_library_fn (VEC_NEW_EXPR, newtype, 0);
+  DECL_IS_MALLOC (opnew) = 1;
+  DECL_SET_IS_OPERATOR_NEW (opnew, true);
+  DECL_IS_REPLACEABLE_OPERATOR (opnew) = 1;
+}
+
+/* Build a specific variant of operator delete.  */
+
+static void
+cxx_build_operator_delete (tree deltype)
+{
+  tree opdel = push_cp_library_fn (DELETE_EXPR, deltype, ECF_NOTHROW);
+  DECL_SET_IS_OPERATOR_DELETE (opdel, true);
+  DECL_IS_REPLACEABLE_OPERATOR (opdel) = 1;
+  opdel = push_cp_library_fn (VEC_DELETE_EXPR, deltype, ECF_NOTHROW);
+  DECL_SET_IS_OPERATOR_DELETE (opdel, true);
+  DECL_IS_REPLACEABLE_OPERATOR (opdel) = 1;
+}
+
+/* Declare all variants of operator new and delete.  */
+
+static void
+cxx_init_operator_new_delete_decls (void)
+{
+  tree newattrs, extvisattr;
+  tree newtype, deltype;
+  tree ptr_ftype_sizetype;
+  tree new_eh_spec;
+  tree void_ftype_ptr = build_function_type_list (void_type_node,
+ ptr_type_node, NULL_TREE);
+  void_ftype_ptr
+= build_exception_variant (void_ftype_ptr, empty_except_spec);
+
+  ptr_ftype_sizetype
+= build_function_type_list (ptr_type_node, size_type_node, NULL_TREE);
+  if (cxx_dialect == cxx98)
+{
+  tree bad_alloc_id;
+  tree bad_alloc_type_node;
+  tree bad_alloc_decl;
+
+  push_nested_namespace (std_node);
+  bad_alloc_id = get_identifier ("bad_alloc");
+  bad_alloc_type_node = make_class_type (RECORD_TYPE);
+  TYPE_CONTEXT (bad_alloc_type_node) = current_namespace;
+  bad_alloc_decl
+   = create_implicit_typedef (bad_alloc_id, bad_alloc_type_node);
+  DECL_CONTEXT (bad_alloc_decl) = current_namespace;
+  pop_nested_namespace (std_node);
+
+  new_eh_spec
+   = add_exception_specifier (NULL_TREE, bad_alloc_type_node, -1);
+}
+  else
+new_eh_spec = noexcept_false_spec;
+
+  extvisattr = build_tree_list (get_identifier ("externally_visible"),
+   NULL_TREE);
+  newattrs = tree_cons (get_identifier ("alloc_size"),
+   build_tree_list (NULL_TREE, integer_one_node),
+   extvisattr);
+  newtype = cp_build_type_attribute_variant (ptr_ftype_sizetype, newattrs);
+  newtype = build_exception_variant (newtype, new_eh_spec);
+  deltype = cp_build_type_attribute_variant (void_ftype_ptr, extvisattr);
+  deltype = build_exception_variant (deltype, empty_except_spec);
+  cxx_build_operator_new (newtype);
+  cxx_build_operator_delete (deltype);
+  if (flag_sized_deallocation)
+{
+  /* Also push the sized deallocation variants:
+  void operator delete(void*, std::size_t) throw();
+  void operator delete[](void*, std::size_t) throw();  */
+  tree void_ftype_ptr_size
+   = build_function_type_list (void_type_node, ptr_type_node,
+   size_type_node, NULL_TREE);
+  deltype = cp_build_type_attribute_variant (void_ftype_ptr_size,
+extvisattr);
+  deltype = build_exception_variant (deltype, empty_except_spec);
+  cxx_build_operator_delete (deltype);
+}
+
+  if (aligned_new_threshold)
+{
+  push_nested_namespace (std_node);
+  tree align_id = get_identifier ("align_val_t");
+  align_type_node = start_enum (align_id, NULL_TREE, size_type_node,
+   NULL_TREE, /*scoped*/true, NULL);
+  pop_nested_namespace (std_node);
+
+  /* operator new (size_t, align_val_t); */
+  newtype = build_function_type_list (ptr_type_node, size_type_node,
+ align_type_node, NULL_TREE);
+  newtype = cp_build_type_attribute_variant (newtype, newattrs);
+  newtype = build_exception_variant (newtype, new_eh_spec);
+  cxx_build_operator_new (newtype);
+
+  /* operator delete (voi

[gcc r15-5322] Fix type of malloc parameter in trans-expr.cc

2024-11-15 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:d5af5657fa4173dd8d3155e44fe6a1e3914cde8f

commit r15-5322-gd5af5657fa4173dd8d3155e44fe6a1e3914cde8f
Author: Jan Hubicka 
Date:   Fri Nov 15 15:51:14 2024 +0100

Fix type of malloc parameter in trans-expr.cc

gcc/fortran/ChangeLog:

* trans-expr.cc (gfc_trans_subcomponent_assign): Fix type of malloc
parameter.

Diff:
---
 gcc/fortran/trans-expr.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/gcc/fortran/trans-expr.cc b/gcc/fortran/trans-expr.cc
index f004af713344..a3c1dc0b7af4 100644
--- a/gcc/fortran/trans-expr.cc
+++ b/gcc/fortran/trans-expr.cc
@@ -9740,6 +9740,7 @@ gfc_trans_subcomponent_assign (tree dest, gfc_component * 
cm,
  gfc_init_se (&se, NULL);
  gfc_conv_expr (&se, expr);
  size = size_of_string_in_bytes (cm->ts.kind, se.string_length);
+ size = fold_convert (size_type_node, size);
  tmp = build_call_expr_loc (input_location,
 builtin_decl_explicit (BUILT_IN_MALLOC),
 1, size);


[gcc r15-5359] Avoid explicit builtin list in tree-ssa-dce

2024-11-17 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:cc33f880e553d1aa94d19a349ad755f34c33de9e

commit r15-5359-gcc33f880e553d1aa94d19a349ad755f34c33de9e
Author: Jan Hubicka 
Date:   Sat Nov 16 23:45:57 2024 +0100

Avoid explicit builtin list in tree-ssa-dce

While working on -fmalloc-dce I noticed that tree-ssa-dce.cc still has an
outdated list of builtins that are known to not read memory, which can be
replaced by a query to fnspec and modref.

If I get things right, DCE does some dead store removal, but only on
memory objects that are non-aliased (automatic variables with no address
taken); for all other memory addresses it resorts to
mark_all_reaching_defs_necessary, expecting DSE to do the rest.  So we
really want to only check whether there are no memory reads at all,
rather than trying to understand them by parsing the fnspec or modref
summary.

I did run the testsuite ensuring that all builtins matched previously are
still matched.  There are a few testcases where this check fails, due to
type incompatibility.  The new code uses gimple_call_builtin while the
old one just checked the callee decl.

We test things like calling free() without a parameter, which I don't
think we want to care about, but there is also a testcase declaring

void * calloc (long, long)

where the builtin declaration expects unsigned long.  I am not sure
whether this case should not be allowed by gimple_call_builtin?

Bootstrapped/regtested x86_64-linux. OK?
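
The type-incompatibility case mentioned above looks roughly like this (a
sketch, not the exact testcase):

/* The builtin is declared as calloc (size_t, size_t); with 64-bit long
   the mismatch is only in signedness, yet gimple_call_builtin does not
   match it, so the new query no longer recognizes the call.  */
void *calloc (long, long);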

gcc/ChangeLog:

* ipa-modref.cc (ipa_modref_callee_reads_no_memory_p): New function.
* ipa-modref.h (ipa_modref_callee_reads_no_memory_p): Declare
* tree-ssa-dce.cc (propagate_necessity): Use it.

Diff:
---
 gcc/ipa-modref.cc| 32 
 gcc/ipa-modref.h |  1 +
 gcc/testsuite/g++.dg/tree-ssa/pr109442.C | 12 
 gcc/tree-ssa-dce.cc  | 28 +---
 4 files changed, 54 insertions(+), 19 deletions(-)

diff --git a/gcc/ipa-modref.cc b/gcc/ipa-modref.cc
index c1973aa36c4c..12ac0e7865a7 100644
--- a/gcc/ipa-modref.cc
+++ b/gcc/ipa-modref.cc
@@ -5600,4 +5600,36 @@ ipa_modref_cc_finalize ()
   escape_summaries = NULL;
 }
 
+/* Return true if call is known to perform no memory reads.  */
+
+bool
+ipa_modref_callee_reads_no_memory_p (gcall *call)
+{
+  if (gimple_call_flags (call) & ECF_CONST)
+return true;
+  attr_fnspec fnspec = gimple_call_fnspec (call);
+  if (fnspec.known_p ()
+  && !fnspec.global_memory_read_p ())
+{
+  bool found = false;
+  for (unsigned int i = 0; i < gimple_call_num_args (call) && !found; i++)
+   if (!POINTER_TYPE_P (TREE_TYPE (gimple_call_arg (call, i
+ ;
+  else if (!fnspec.arg_specified_p (i)
+  || fnspec.arg_maybe_read_p (i))
+ found = true;
+  if (!found)
+   return true;
+}
+
+  /* For interposed calls we can not be sure that the other, semantically
+ equivalent body, will not perform some redundant load from memory
+ that may become undefined if we optimize out some stores.  */
+  bool interposed;
+  modref_summary *sum = get_modref_function_summary (call, &interposed);
+  if (sum && !interposed && !sum->global_memory_read && !sum->loads)
+return true;
+  return false;
+}
+
 #include "gt-ipa-modref.h"
diff --git a/gcc/ipa-modref.h b/gcc/ipa-modref.h
index 1bbe9bffee05..a0eb63a0afab 100644
--- a/gcc/ipa-modref.h
+++ b/gcc/ipa-modref.h
@@ -75,6 +75,7 @@ modref_summary *get_modref_function_summary (cgraph_node 
*func);
 modref_summary *get_modref_function_summary (gcall *call, bool *interposed);
 void ipa_modref_cc_finalize ();
 void ipa_merge_modref_summary_after_inlining (cgraph_edge *e);
+bool ipa_modref_callee_reads_no_memory_p (gcall *call);
 
 /* All flags that are implied by the ECF_CONST functions.  */
 static const int implicit_const_eaf_flags
diff --git a/gcc/testsuite/g++.dg/tree-ssa/pr109442.C 
b/gcc/testsuite/g++.dg/tree-ssa/pr109442.C
new file mode 100644
index ..ec40c470c8dd
--- /dev/null
+++ b/gcc/testsuite/g++.dg/tree-ssa/pr109442.C
@@ -0,0 +1,12 @@
+// { dg-do compile { target c++11 } }
+// { dg-options "-O1 -fdump-tree-optimized" }
+#include 
+#define T int
+T vat1(std::vector v1) {
+auto v = v1;
+return 10;
+}
+// This should compile to empty function; check that no size of
+// vector is determined and there is no allocation
+// { dg-final { scan-tree-dump-not "_M_start" "optimized" } }
+// { dg-final { scan-tree-dump-not "delete" "optimized" } }
diff --git a/gcc/tree-ssa-dce.cc b/gcc/tree-ssa-dce.cc
index 643a1efd8707..70e3843cabfd 100644
--- a/gcc/tree-ssa-dce.cc
+++ b/gcc/tree-ssa-dce.cc
@@ -69,6 +69,8 @@ along with GCC; see the file COPYING3.  If not see
 #include "tree-ssa-propagate.h"
 #include "gimple-fold.h"
 #include "tree-ssa.h"
+#include "ipa-modref-tree.h"
+#include "ipa-modref.h"
 
 static struct stmt_stats
 {
@@ 

[gcc r15-5336] Ignore conditions guarding __builtin_unreachable in inliner metrics

2024-11-16 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:cee7d080d5c2a5fb8125878998b742c040ec88b4

commit r15-5336-gcee7d080d5c2a5fb8125878998b742c040ec88b4
Author: Jan Hubicka 
Date:   Sat Nov 16 14:04:32 2024 +0100

Ignore conditions guarding __builtin_unreachable in inliner metrics

This extends my last year's attempt to make the inliner metrics ignore
conditionals guarding __builtin_unreachable.  Compared to the previous
patch, this one implements a "mini-DCE" in ipa-fnsummary to avoid
accounting all statements that are only used to determine conditionals
guarding __builtin_unreachable.  These will be removed later once value
ranges are determined.

While working on this, I noticed that we do have a lot of dead code while
computing the fnsummary for early inlining.  Those summaries are only
used to apply the large-function growth limit, and it seems there is
enough dead code to make this value kind of irrelevant.  Also, there seem
to be quite a lot of const/pure calls that can be cheaply removed before
we inline them.  So I wonder if we want to run one DCE pass before early
inlining.
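
The pattern being discounted is the usual assume-style guard; a minimal
sketch (mine, not from the patch):

void consume (int);

void
f (int n)
{
  if (n < 0 || n > 64)          /* exists only to feed the guard; the
                                   mini-DCE skips it and whatever
                                   computes it when sizing the function */
    __builtin_unreachable ();   /* kept to preserve the value range */
  for (int i = 0; i < n; i++)
    consume (i);                /* the body the summary should cost */
}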

gcc/ChangeLog:

PR tree-optimization/109442
* ipa-fnsummary.cc (builtin_unreachable_bb_p): New function.
(guards_builtin_unreachable): New function.
(STMT_NECESSARY): New macro.
(mark_stmt_necessary): New function.
(mark_operand_necessary): New function.
(find_necessary_statements): New function.
(analyze_function_body): Use it.

gcc/testsuite/ChangeLog:

* gcc.dg/ipa/fnsummary-1.c: New test.

Diff:
---
 gcc/ipa-fnsummary.cc   | 181 -
 gcc/testsuite/gcc.dg/ipa/fnsummary-1.c |   9 ++
 2 files changed, 189 insertions(+), 1 deletion(-)

diff --git a/gcc/ipa-fnsummary.cc b/gcc/ipa-fnsummary.cc
index e921cd495f69..87e08dad8467 100644
--- a/gcc/ipa-fnsummary.cc
+++ b/gcc/ipa-fnsummary.cc
@@ -2674,6 +2674,169 @@ points_to_possible_sra_candidate_p (tree t)
   return false;
 }
 
+/* Return true if BB only calls builtin_unreachable.
+   We skip empty basic blocks, debug statements, clobbers and predicts.
+   CACHE is used to memoize already analyzed blocks.  */
+
+static bool
+builtin_unreachable_bb_p (basic_block bb, vec &cache)
+{
+  if (cache[bb->index])
+return cache[bb->index] - 1;
+  gimple_stmt_iterator si;
+  auto_vec  visited_bbs;
+  bool ret = false;
+  while (true)
+{
+  bool empty_bb = true;
+  visited_bbs.safe_push (bb);
+  cache[bb->index] = 3;
+  for (si = gsi_start_nondebug_bb (bb);
+  !gsi_end_p (si) && empty_bb;
+  gsi_next_nondebug (&si))
+   {
+ if (gimple_code (gsi_stmt (si)) != GIMPLE_PREDICT
+ && !gimple_clobber_p (gsi_stmt (si))
+ && !gimple_nop_p (gsi_stmt (si)))
+   {
+ empty_bb = false;
+ break;
+   }
+   }
+  if (!empty_bb)
+   break;
+  else
+   bb = single_succ_edge (bb)->dest;
+  if (cache[bb->index])
+   {
+ ret = cache[bb->index] == 3 ? false : cache[bb->index] - 1;
+ goto done;
+   }
+}
+  if (gimple_call_builtin_p (gsi_stmt (si), BUILT_IN_UNREACHABLE)
+  || gimple_call_builtin_p (gsi_stmt (si), BUILT_IN_UNREACHABLE_TRAP))
+ret = true;
+done:
+  for (basic_block vbb:visited_bbs)
+cache[vbb->index] = (unsigned char)ret + 1;
+  return ret;
+}
+
+static bool
+guards_builtin_unreachable (basic_block bb, vec &cache)
+{
+  edge_iterator ei;
+  edge e;
+  FOR_EACH_EDGE (e, ei, bb->succs)
+if (builtin_unreachable_bb_p (e->dest, cache))
+  {
+   if (dump_file && (dump_flags & TDF_DETAILS))
+ fprintf (dump_file,
+  "BB %i ends with conditional guarding __builtin_unreachable;"
+  " conditinal is unnecesary\n", bb->index);
+   return true;
+  }
+  return false;
+}
+
+#define STMT_NECESSARY GF_PLF_1
+
+/* If STMT is not already marked necessary, mark it, and add it to the
+   worklist if ADD_TO_WORKLIST is true.  */
+
+static inline void
+mark_stmt_necessary (gimple *stmt, auto_vec &worklist)
+{
+  gcc_assert (stmt);
+
+  if (gimple_plf (stmt, STMT_NECESSARY))
+return;
+
+  if (dump_file && (dump_flags & TDF_DETAILS))
+{
+  fprintf (dump_file, "Marking useful stmt: ");
+  print_gimple_stmt (dump_file, stmt, 0, TDF_SLIM);
+  fprintf (dump_file, "\n");
+}
+
+  gimple_set_plf (stmt, STMT_NECESSARY, true);
+  worklist.safe_push (stmt);
+}
+
+/* Mark the statement defining operand OP as necessary.  */
+
+static inline void
+mark_operand_necessary (tree op, auto_vec &worklist)
+{
+  gimple *stmt = SSA_NAME_DEF_STMT (op);
+  if (gimple_nop_p (stmt))
+return;
+  mark_stmt_necessary (stmt, worklist);
+}
+
+/* Mark all statements that will remain in the body after optimizing out
+   conditionals guarding __builtin_unreachable which we keep to preserve
+   value ranges.  */
+
+static void
+find_necess

[gcc r15-5281] Fix common.opt.urls

2024-11-14 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:236104908257d779b06894379f99e007a1c4c897

commit r15-5281-g236104908257d779b06894379f99e007a1c4c897
Author: Jan Hubicka 
Date:   Thu Nov 14 17:29:14 2024 +0100

Fix common.opt.urls

gcc/ChangeLog:

* common.opt.urls: Fix.

Diff:
---
 gcc/common.opt.urls | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/common.opt.urls b/gcc/common.opt.urls
index 0b12f3433f95..db98158d6522 100644
--- a/gcc/common.opt.urls
+++ b/gcc/common.opt.urls
@@ -1169,7 +1169,7 @@ flive-patching=
 UrlSuffix(gcc/Optimize-Options.html#index-flive-patching)
 
 fallocation-dce
-UrlSuffix(gcc/Optimize-Options.html#index-fno-allocation-dce)
+UrlSuffix(gcc/Optimize-Options.html#index-fallocation-dce)
 
 flive-range-shrinkage
 UrlSuffix(gcc/Optimize-Options.html#index-flive-range-shrinkage)


[gcc r15-5255] Remove allocations which are used only for NULL pointer check and free

2024-11-14 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:7828dc070510f8f5c06765858815fa6e5d2d1ec6

commit r15-5255-g7828dc070510f8f5c06765858815fa6e5d2d1ec6
Author: Jan Hubicka 
Date:   Thu Nov 14 17:01:12 2024 +0100

Remove allocations which are used only for NULL pointer check and free

Extend tree-ssa-dce to remove memory allocations that are used only
to check that the return value is non-NULL and then freed.

The new -fmalloc-dce flag can be used to control malloc/free removal.  I
ended up copying what -fallocation-dce does, so -fmalloc-dce=1 enables
malloc/free removal provided the return value is otherwise unused, and
-fmalloc-dce=2 additionally allows NULL pointer checks, which it folds
to the non-NULL direction.

I also added compensation for the gcc.dg/analyzer/pr101837.c testcase
and added a testcase that the std::nothrow variant of operator new is
now optimized away.

With -fmalloc-dce=n I can also add a level which emits a runtime check
for half of the address space and for calloc overflow if it seems
useful, but perhaps incrementally.  Adding size parameter tracking is
not that hard (I posted a WIP patch for that).
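
A sketch of the shape -fmalloc-dce=2 can now remove entirely (my
example, not from the patch):

#include <stdlib.h>

int
f (void)
{
  int *p = malloc (4);  /* result is only NULL-checked and freed */
  if (!p)
    return 1;           /* check folded to the non-NULL direction */
  free (p);
  return 0;             /* whole function reduces to "return 0" */
}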

gcc/ChangeLog:

PR tree-optimization/117370
* common.opt: Add -fmalloc-dce.
* common.opt.urls: Update.
* doc/invoke.texi: Document it; also add missing -flifetime-dse 
entry.
* tree-ssa-dce.cc (is_removable_allocation_p): Break out from
...
(mark_stmt_if_obviously_necessary): ... here; also check that
operator new satisfies gimple_call_from_new_or_delete.
(checks_return_value_of_removable_allocation_p): New Function.
(mark_all_reaching_defs_necessary_1): add missing case for
STRDUP and STRNDUP
(propagate_necessity): Use is_removable_allocation_p and
checks_return_value_of_removable_allocation_p.
(eliminate_unnecessary_stmts): Update conditionals that use
removed allocation; use is_removable_allocation_p.

gcc/testsuite/ChangeLog:

* g++.dg/cdce3.C: Disable allocation dce.
* g++.dg/tree-ssa/pr19476-1.C: Likewise.
* g++.dg/tree-ssa/pr19476-2.C: Likewise.
* g++.dg/tree-ssa/pr19476-3.C: Likewise.
* g++.dg/tree-ssa/pr19476-4.C: Likewise.
* gcc.dg/analyzer/pr101837.c: Disable malloc dce.
* gcc.dg/tree-ssa/pr19831-3.c: Update.
* gfortran.dg/pr68078.f90: Disable malloc DCE.

Diff:
---
 gcc/common.opt|   7 ++
 gcc/common.opt.urls   |   6 ++
 gcc/doc/invoke.texi   |  22 +++-
 gcc/testsuite/g++.dg/cdce3.C  |   2 +-
 gcc/testsuite/g++.dg/tree-ssa/pr19476-1.C |   2 +-
 gcc/testsuite/g++.dg/tree-ssa/pr19476-2.C |   2 +-
 gcc/testsuite/g++.dg/tree-ssa/pr19476-3.C |   2 +-
 gcc/testsuite/g++.dg/tree-ssa/pr19476-4.C |   2 +-
 gcc/testsuite/gcc.dg/analyzer/pr101837.c  |   2 +-
 gcc/testsuite/gcc.dg/tree-ssa/pr19831-3.c |   9 +-
 gcc/testsuite/gfortran.dg/pr68078.f90 |   2 +
 gcc/tree-ssa-dce.cc   | 164 +++---
 12 files changed, 148 insertions(+), 74 deletions(-)

diff --git a/gcc/common.opt b/gcc/common.opt
index 0b1f1ec26e14..33be6b8042a8 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -2282,6 +2282,13 @@ fmax-errors=
 Common Joined RejectNegative UInteger Var(flag_max_errors)
 -fmax-errors=  Maximum number of errors to report.
 
+fmalloc-dce
+Common Var(flag_malloc_dce,2) Init(2) Optimization
+Allow removal of malloc and free pairs when allocated block is unused.
+
+fmalloc-dce=
+Common Joined RejectNegative UInteger Var(flag_malloc_dse) Optimization 
IntegerRange(0, 2)
+
 fmem-report
 Common Var(mem_report)
 Report on permanent memory allocation.
diff --git a/gcc/common.opt.urls b/gcc/common.opt.urls
index 78e0dc209d14..0b12f3433f95 100644
--- a/gcc/common.opt.urls
+++ b/gcc/common.opt.urls
@@ -947,6 +947,12 @@ UrlSuffix(gcc/Optimize-Options.html#index-fmath-errno)
 fmax-errors=
 UrlSuffix(gcc/Warning-Options.html#index-fmax-errors) 
LangUrlSuffix_D(gdc/Warnings.html#index-fmax-errors)
 
+fmalloc-dce
+UrlSuffix(gcc/Optimize-Options.html#index-fmalloc-dce)
+
+fmalloc-dce=
+UrlSuffix(gcc/Optimize-Options.html#index-fmalloc-dce)
+
 fmem-report
 UrlSuffix(gcc/Developer-Options.html#index-fmem-report)
 
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 4a494f6a668c..e00a04720e05 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -585,7 +585,7 @@ Objective-C and Objective-C++ Dialects}.
 -fipa-bit-cp  -fipa-vrp  -fipa-pta  -fipa-profile  -fipa-pure-const
 -fipa-reference  -fipa-reference-addressable
 -fipa-stack-alignment  -fipa-icf  -fira-algorithm=@var{algorithm}
--flate-combine-instructions  -flive-patching=@var{level}
+-flate-combine-instructions -flifetime-dse -flive-patching=@var{level}
 -fira-region=@var{region}  -fira-hoist-pressure
 -fira-loop-pres

[gcc r15-5256] New testcase for operator new/delete removal.

2024-11-14 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:269b6477744f25ab0a89b364295b86a4bce392f9

commit r15-5256-g269b6477744f25ab0a89b364295b86a4bce392f9
Author: Jan Hubicka 
Date:   Thu Nov 14 17:08:03 2024 +0100

New testcase for operator new/delete removal.

* g++.dg/tree-ssa/dce-1.C: New test.

Diff:
---
 gcc/testsuite/g++.dg/tree-ssa/dce-1.C | 9 +
 1 file changed, 9 insertions(+)

diff --git a/gcc/testsuite/g++.dg/tree-ssa/dce-1.C 
b/gcc/testsuite/g++.dg/tree-ssa/dce-1.C
new file mode 100644
index ..1d5115ccb14a
--- /dev/null
+++ b/gcc/testsuite/g++.dg/tree-ssa/dce-1.C
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+#include 
+void test ()
+{
+   int *a = new (std::nothrow) int;
+   delete a;
+}
+/* { dg-final { scan-tree-dump-not "operator new" "optimized" } } */


[gcc r15-5238] Add testcases for std::vector optimization

2024-11-14 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:20faea418d580d4db99aa5293c976ab16867aca5

commit r15-5238-g20faea418d580d4db99aa5293c976ab16867aca5
Author: Jan Hubicka 
Date:   Thu Nov 14 14:19:04 2024 +0100

Add testcases for std::vector optimization

gcc/testsuite/ChangeLog:

PR tree-optimization/110819
PR tree-optimization/116868
PR tree-optimization/58483

* g++.dg/tree-ssa/pr96945.C: cleanup
* g++.dg/tree-ssa/pr110819.C: New test.
* g++.dg/tree-ssa/pr116868.C: New test.
* g++.dg/tree-ssa/pr58483.C: New test.

Diff:
---
 gcc/testsuite/g++.dg/tree-ssa/pr110819.C | 14 ++
 gcc/testsuite/g++.dg/tree-ssa/pr116868.C | 12 
 gcc/testsuite/g++.dg/tree-ssa/pr58483.C  | 15 +++
 gcc/testsuite/g++.dg/tree-ssa/pr96945.C  |  4 ++--
 4 files changed, 43 insertions(+), 2 deletions(-)

diff --git a/gcc/testsuite/g++.dg/tree-ssa/pr110819.C 
b/gcc/testsuite/g++.dg/tree-ssa/pr110819.C
new file mode 100644
index ..04a074f1cb22
--- /dev/null
+++ b/gcc/testsuite/g++.dg/tree-ssa/pr110819.C
@@ -0,0 +1,14 @@
+// { dg-do compile { target c++14 } }
+// { dg-options "-O1 -fdump-tree-optimized" }
+#include
+
+void f(int);
+
+void use_idx_const_size_reserve() {
+std::vector v;
+v.reserve(10);
+auto s = v.size();
+for (std::vector::size_type i = 0; i < s; i++)
+f(v[i]);
+}
+// { dg-final { scan-tree-dump-not "delete" "optimized" } }
diff --git a/gcc/testsuite/g++.dg/tree-ssa/pr116868.C 
b/gcc/testsuite/g++.dg/tree-ssa/pr116868.C
new file mode 100644
index ..d89c7a3b09cc
--- /dev/null
+++ b/gcc/testsuite/g++.dg/tree-ssa/pr116868.C
@@ -0,0 +1,12 @@
+// { dg-do compile { target c++14 } }
+// { dg-options "-O2 -fdump-tree-optimized" }
+#include 
+int sumVector() {
+const std::vector vec = {1};
+int sum = 0;
+for (int i = 0; i < vec.size(); i++) {
+sum += vec[i];
+}
+return sum;
+}
+// { dg-final { scan-tree-dump-not "delete" "optimized" } }
diff --git a/gcc/testsuite/g++.dg/tree-ssa/pr58483.C 
b/gcc/testsuite/g++.dg/tree-ssa/pr58483.C
new file mode 100644
index ..c99664b57577
--- /dev/null
+++ b/gcc/testsuite/g++.dg/tree-ssa/pr58483.C
@@ -0,0 +1,15 @@
+// { dg-do compile { target c++14 } }
+// { dg-do compile }
+// { dg-options "-O1 -fdump-tree-optimized" }
+#include
+
+void f(int);
+
+void use_idx_const_size_reserve() {
+std::vector v;
+v.reserve(10);
+auto s = v.size();
+for (std::vector::size_type i = 0; i < s; i++)
+f(v[i]);
+}
+// { dg-final { scan-tree-dump-not "delete" "optimized" } }
diff --git a/gcc/testsuite/g++.dg/tree-ssa/pr96945.C 
b/gcc/testsuite/g++.dg/tree-ssa/pr96945.C
index 4cb234c2f711..63d55672e0c2 100644
--- a/gcc/testsuite/g++.dg/tree-ssa/pr96945.C
+++ b/gcc/testsuite/g++.dg/tree-ssa/pr96945.C
@@ -1,5 +1,5 @@
-/* { dg-do compile } */
-// { dg-options "-O1 -fdump-tree-optimized -std=c++14" }
+// { dg-do compile { target c++14 } }
+// { dg-options "-O1 -fdump-tree-optimized" }
 #include 
 struct c {
 c() = default;


[gcc r15-5694] improve std::deque::_M_reallocate_map

2024-11-26 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:1729147f2b1a05ad2a4e8e2d91d1d2ba914039d6

commit r15-5694-g1729147f2b1a05ad2a4e8e2d91d1d2ba914039d6
Author: Jan Hubicka 
Date:   Tue Nov 26 13:52:09 2024 +0100

improve std::deque::_M_reallocate_map

Looking into the reason why we still emit throw_bad_alloc in the clang
binary, I noticed that quite a few calls come from
deque::_M_reallocate_map.  This patch adds an unreachable to limit the
size of the reallocated map.  _M_reallocate_map is called only if the
new size is smaller than max_size.  The map is an array holding pointers
to entries of fixed size.

Since reallocation is done by doubling the map size, I think the maximal
size of the map allocated is max_size / deque_buf_size, rounded up,
times two.  This should also be safe against overflows since we have an
extra bit.

The map size is always at least 8.  Theoretically this computation may
be wrong for very large T, but in that case callers should never
reallocate.
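
As a worked example (my arithmetic, assuming the usual 512-byte buffer
and 64-bit size_t): for std::deque<int>, __deque_buf_size(4) gives
512 / 4 = 128 elements per node, and max_size() is roughly 2^61, so the
map can never legitimately hold more than about 2 * (2^61 / 128 + 1)
node pointers; any larger __new_map_size is flagged
__builtin_unreachable.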

On the testcase I get:
jh@shroud:~> ~/trunk-install-new4/bin/g++ -O2 dq.C -c ; size -A dq.o | grep 
text
.text  284  0
.text._ZNSt5dequeIiSaIiEE17_M_reallocate_mapEmb485  0
.text.unlikely  10  0
jh@shroud:~> ~/trunk-install-new5/bin/g++ -O2 dq.C -c ; size -A dq.o | grep 
text
.text  284  0
.text._ZNSt5dequeIiSaIiEE17_M_reallocate_mapEmb465  0
.text.unlikely  10  0

so this saves about 20 bytes of _M_reallocate_map, which I think is
worthwhile.  Curiously enough, gcc14 does:

jh@shroud:~> g++ -O2 dq.C -c ; size -A dq.o | grep text
.text 604  0
.text.unlikely 10  0

which is 145 bytes smaller.  The obvious difference is that
_M_reallocate_map gets inlined.
Compiling the gcc14 preprocessed file with trunk gives:

jh@shroud:~> g++ -O2 dq.C -S ; size -A dq.o | grep text
.text 762  0
.text.unlikely 10  0

So the inlining is due to changes on the libstdc++ side, but the code
size growth is due to something else.

For clang this reduced the number of throw_bad_array_new_length calls from 121 to 61.

libstdc++-v3/ChangeLog:

* include/bits/deque.tcc (std::deque::_M_reallocate_map): Add
__builtin_unreachable check to declare that maps are not very large.
* include/bits/stl_deque.h (std::deque::size): Add 
__builtin_unreachable
to check for maximal size of map.

gcc/testsuite/ChangeLog:

* g++.dg/tree-ssa/deque-1.C: New test.
* g++.dg/tree-ssa/deque-2.C: New test.

Diff:
---
 gcc/testsuite/g++.dg/tree-ssa/deque-1.C |  9 +
 gcc/testsuite/g++.dg/tree-ssa/deque-2.C | 10 ++
 libstdc++-v3/include/bits/deque.tcc |  4 
 libstdc++-v3/include/bits/stl_deque.h   |  7 ++-
 4 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/gcc/testsuite/g++.dg/tree-ssa/deque-1.C 
b/gcc/testsuite/g++.dg/tree-ssa/deque-1.C
new file mode 100644
index ..c639ebb1a5f3
--- /dev/null
+++ b/gcc/testsuite/g++.dg/tree-ssa/deque-1.C
@@ -0,0 +1,9 @@
+// { dg-do compile } 
+// { dg-options "-O1 -fdump-tree-optimized" }
+#include 
+void
+test(std::deque &q, int v)
+{
+  q.push_back (v);
+}
+// { dg-final { scan-tree-dump-not "throw_bad_alloc" "optimized" } }
diff --git a/gcc/testsuite/g++.dg/tree-ssa/deque-2.C 
b/gcc/testsuite/g++.dg/tree-ssa/deque-2.C
new file mode 100644
index ..7e268b3f018d
--- /dev/null
+++ b/gcc/testsuite/g++.dg/tree-ssa/deque-2.C
@@ -0,0 +1,10 @@
+// { dg-do compile }
+// { dg-options "-O3 -fdump-tree-optimized" }
+#include 
+std::deque
+test2(std::deque &q)
+{
+  return q;
+}
+// rethrow is OK, but throw is not.
+// { dg-final { scan-tree-dump-not {[^e]throw} "optimized" } }
diff --git a/libstdc++-v3/include/bits/deque.tcc 
b/libstdc++-v3/include/bits/deque.tcc
index deb010a0ebb5..ee03c917a295 100644
--- a/libstdc++-v3/include/bits/deque.tcc
+++ b/libstdc++-v3/include/bits/deque.tcc
@@ -956,6 +956,10 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER
 + std::max(this->_M_impl._M_map_size,
__nodes_to_add) + 2;
 
+ const size_t __bufsz = __deque_buf_size(sizeof(_Tp));
+ if (__new_map_size > ((max_size() + __bufsz - 1) / __bufsz) * 2)
+   __builtin_unreachable();
+
  _Map_pointer __new_map = this->_M_allocate_map(__new_map_size);
  __new_nstart = __new_map + (__new_map_size - __new_num_nodes) / 2
 + (__add_at_front ? __nodes_to_add : 0);
diff --git a/libstdc++-v3/include/bits/stl_deque.h 
b/libstdc++-v3/include/bits/stl_deque.h
index c617933bd81c..dd1212793ded 100644
--- a/libstdc++-v3/include/bits/stl_deque.h
+++ b/libstdc++-v3/include/bits/stl_deque.h
@@ -1266,7 +126

[gcc r12-10888] Zen5 tuning part 4: update reassociation width

2025-01-07 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:4d7efc031fbd925565b049670bf755aca21bd2e3

commit r12-10888-g4d7efc031fbd925565b049670bf755aca21bd2e3
Author: Jan Hubicka 
Date:   Tue Sep 3 18:20:34 2024 +0200

Zen5 tuning part 4: update reassociation width

Zen5 has 6 instead of 4 ALUs, and integer multiplication can now execute
in 3 of them.  The FP units can do 2 additions and 2 multiplications
with latencies of 2 and 3.  This patch updates the reassociation width
accordingly.  This has the potential of increasing register pressure,
but unlike when benchmarking znver1 tuning, I did not notice it actually
causing problems on SPEC, so this patch bumps the reassociation width up
to 6 for everything except integer vectors, where there are 4 units with
a typical latency of 1.

Bootstrapped/regtested x86_64-linux, committed.
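
To illustrate what the width controls, a sketch (mine, not from the
patch):

/* With width 6 the reassociation pass rebalances the single dependent
   chain ((((x0 + x1) + x2) + x3) + ...) into up to six independent
   partial sums that can issue in parallel on the six ALUs, at the cost
   of more live registers.  */
double
sum8 (const double *x)
{
  return x[0] + x[1] + x[2] + x[3] + x[4] + x[5] + x[6] + x[7];
}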

gcc/ChangeLog:

* config/i386/i386.cc (ix86_reassociation_width): Update for Znver5.
* config/i386/x86-tune-costs.h (znver5_costs): Update reassociation
widths.

(cherry picked from commit f0ab3de6ec0e3540f2e57f3f5628005f0a4e3fa5)

Diff:
---
 gcc/config/i386/i386.cc  | 10 +++---
 gcc/config/i386/x86-tune-costs.h | 23 +--
 2 files changed, 20 insertions(+), 13 deletions(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 2087f8633eb8..ea25e56ad644 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -22923,13 +22923,17 @@ ix86_reassociation_width (unsigned int op, 
machine_mode mode)
   if (width == 1)
return 1;
 
-  /* Integer vector instructions execute in FP unit
+  /* Znver1-4 Integer vector instructions execute in FP unit
 and can execute 3 additions and one multiplication per cycle.  */
   if ((ix86_tune == PROCESSOR_ZNVER1 || ix86_tune == PROCESSOR_ZNVER2
-  || ix86_tune == PROCESSOR_ZNVER3 || ix86_tune == PROCESSOR_ZNVER4
-  || ix86_tune == PROCESSOR_ZNVER5)
+  || ix86_tune == PROCESSOR_ZNVER3 || ix86_tune == PROCESSOR_ZNVER4)
  && INTEGRAL_MODE_P (mode) && op != PLUS && op != MINUS)
return 1;
+  /* Znver5 can do 2 integer multiplications per cycle with latency
+of 3.  */
+  if (ix86_tune == PROCESSOR_ZNVER5
+ && INTEGRAL_MODE_P (mode) && op != PLUS && op != MINUS)
+   width = 6;
 
   /* Account for targets that splits wide vectors into multiple parts.  */
   if (TARGET_AVX512_SPLIT_REGS && GET_MODE_BITSIZE (mode) > 256)
diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index b8e7ab9372ea..0f2308bb079c 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -2068,16 +2068,19 @@ struct processor_costs znver5_cost = {
   COSTS_N_INSNS (13),  /* cost of DIVSD instruction.  */
   COSTS_N_INSNS (14),  /* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (20),  /* cost of SQRTSD instruction.  */
-  /* Zen can execute 4 integer operations per cycle.  FP operations
- take 3 cycles and it can execute 2 integer additions and 2
- multiplications thus reassociation may make sense up to with of 6.
- SPEC2k6 bencharks suggests
- that 4 works better than 6 probably due to register pressure.
-
- Integer vector operations are taken by FP unit and execute 3 vector
- plus/minus operations per cycle but only one multiply.  This is adjusted
- in ix86_reassociation_width.  */
-  4, 4, 3, 6,  /* reassoc int, fp, vec_int, vec_fp.  */
+  /* Zen5 can execute:
+  - integer ops: 6 per cycle, at most 3 multiplications.
+   latency 1 for additions, 3 for multiplications (pipelined)
+
+   Setting width of 9 for multiplication is probably excessive
+   for register pressure.
+  - fp ops: 2 additions per cycle, latency 2-3
+   2 multiplicaitons per cycle, latency 3
+  - vector intger ops: 4 additions, latency 1
+  2 multiplications, latency 4
+   We increase width to 6 for multiplications
+   in ix86_reassociation_width.  */
+  6, 6, 4, 6,  /* reassoc int, fp, vec_int, vec_fp.  */
   znver2_memcpy,
   znver2_memset,
   COSTS_N_INSNS (4),   /* cond_taken_branch_cost.  */


[gcc r13-9285] Zen5 tuning part 4: update reassociation width

2025-01-07 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:52507e15aa31bc66e99f2273306f1b45be919bba

commit r13-9285-g52507e15aa31bc66e99f2273306f1b45be919bba
Author: Jan Hubicka 
Date:   Tue Sep 3 18:20:34 2024 +0200

Zen5 tuning part 4: update reassociation width

Zen5 has 6 instead of 4 ALUs, and integer multiplication can now execute
in 3 of them.  The FP units can do 2 additions and 2 multiplications
with latencies of 2 and 3.  This patch updates the reassociation width
accordingly.  This has the potential of increasing register pressure,
but unlike when benchmarking znver1 tuning, I did not notice it actually
causing problems on SPEC, so this patch bumps the reassociation width up
to 6 for everything except integer vectors, where there are 4 units with
a typical latency of 1.

Bootstrapped/regtested x86_64-linux, committed.

gcc/ChangeLog:

* config/i386/i386.cc (ix86_reassociation_width): Update for Znver5.
* config/i386/x86-tune-costs.h (znver5_costs): Update reassociation
widths.

(cherry picked from commit f0ab3de6ec0e3540f2e57f3f5628005f0a4e3fa5)

Diff:
---
 gcc/config/i386/i386.cc  | 10 +++---
 gcc/config/i386/x86-tune-costs.h | 23 +--
 2 files changed, 20 insertions(+), 13 deletions(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 8323b2e7cd39..395eeab70064 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -23262,13 +23262,17 @@ ix86_reassociation_width (unsigned int op, 
machine_mode mode)
   if (width == 1)
return 1;
 
-  /* Integer vector instructions execute in FP unit
+  /* Znver1-4 Integer vector instructions execute in FP unit
 and can execute 3 additions and one multiplication per cycle.  */
   if ((ix86_tune == PROCESSOR_ZNVER1 || ix86_tune == PROCESSOR_ZNVER2
-  || ix86_tune == PROCESSOR_ZNVER3 || ix86_tune == PROCESSOR_ZNVER4
-  || ix86_tune == PROCESSOR_ZNVER5)
+  || ix86_tune == PROCESSOR_ZNVER3 || ix86_tune == PROCESSOR_ZNVER4)
  && INTEGRAL_MODE_P (mode) && op != PLUS && op != MINUS)
return 1;
+  /* Znver5 can do 2 integer multiplications per cycle with latency
+of 3.  */
+  if (ix86_tune == PROCESSOR_ZNVER5
+ && INTEGRAL_MODE_P (mode) && op != PLUS && op != MINUS)
+   width = 6;
 
   /* Account for targets that splits wide vectors into multiple parts.  */
   if (TARGET_AVX512_SPLIT_REGS && GET_MODE_BITSIZE (mode) > 256)
diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index 02fad74c4d1c..b89ac640ea5f 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -2100,16 +2100,19 @@ struct processor_costs znver5_cost = {
   COSTS_N_INSNS (13),  /* cost of DIVSD instruction.  */
   COSTS_N_INSNS (14),  /* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (20),  /* cost of SQRTSD instruction.  */
-  /* Zen can execute 4 integer operations per cycle.  FP operations
- take 3 cycles and it can execute 2 integer additions and 2
- multiplications thus reassociation may make sense up to with of 6.
- SPEC2k6 bencharks suggests
- that 4 works better than 6 probably due to register pressure.
-
- Integer vector operations are taken by FP unit and execute 3 vector
- plus/minus operations per cycle but only one multiply.  This is adjusted
- in ix86_reassociation_width.  */
-  4, 4, 3, 6,  /* reassoc int, fp, vec_int, vec_fp.  */
+  /* Zen5 can execute:
+  - integer ops: 6 per cycle, at most 3 multiplications.
+   latency 1 for additions, 3 for multiplications (pipelined)
+
+   Setting width of 9 for multiplication is probably excessive
+   for register pressure.
+  - fp ops: 2 additions per cycle, latency 2-3
+   2 multiplicaitons per cycle, latency 3
+  - vector intger ops: 4 additions, latency 1
+  2 multiplications, latency 4
+   We increase width to 6 for multiplications
+   in ix86_reassociation_width.  */
+  6, 6, 4, 6,  /* reassoc int, fp, vec_int, vec_fp.  */
   znver2_memcpy,
   znver2_memset,
   COSTS_N_INSNS (4),   /* cond_taken_branch_cost.  */


[gcc r12-10887] Zen5 tuning part 3: scheduler tweaks

2025-01-07 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:f0718f1d7815c7845243a182c66f4a454efbfb72

commit r12-10887-gf0718f1d7815c7845243a182c66f4a454efbfb72
Author: Jan Hubicka 
Date:   Tue Sep 3 16:26:16 2024 +0200

Zen5 tuning part 3: scheduler tweaks

This patch adds support for the new fusion in znver5 documented in the
optimization manual:

   The Zen5 microarchitecture adds support to fuse reg-reg MOV Instructions
   with certain ALU instructions. The following conditions need to be met 
for
   fusion to happen:
 - The MOV should be reg-reg mov with Opcode 0x89 or 0x8B
 - The MOV is followed by an ALU instruction where the MOV and ALU 
destination register match.
 - The ALU instruction may source only registers or immediate data. 
There cannot be any memory source.
 - The ALU instruction sources either the source or dest of MOV 
instruction.
 - If ALU instruction has 2 reg sources, they should be different.
 - The following ALU instructions can fuse with an older qualified MOV 
instruction:
   ADD ADC AND XOR OP SUB SBB INC DEC NOT SAL / SHL SHR SAR
   (I assume OP is OR)
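
For illustration, a hand-written pair that meets all of the conditions
above (not taken from the patch):

	movq	%rsi, %rdx	# reg-reg MOV (opcode 0x89/0x8B)
	sarq	$63, %rdx	# ALU insn: destination matches the MOV
				# destination, sources are reg/imm only,
				# so Zen5 fuses the two into one macro-op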

I also increased the issue rate from 4 to 6.  Theoretically znver5 can
do more, but with our model we can't really use it.
Increasing the issue rate to 8 leads to an infinite loop in the
scheduler.

Finally, I also enabled fuse_alu_and_branch since it is supported by
znver5 (I think by earlier zens too).

The new fusion pattern moves quite a few instructions around in common code:
@@ -2210,13 +2210,13 @@
.cfi_offset 3, -32
leaq63(%rsi), %rbx
movq%rbx, %rbp
+   shrq$6, %rbp
+   salq$3, %rbp
subq$16, %rsp
.cfi_def_cfa_offset 48
movq%rdi, %r12
-   shrq$6, %rbp
-   movq%rsi, 8(%rsp)
-   salq$3, %rbp
movq%rbp, %rdi
+   movq%rsi, 8(%rsp)
call_Znwm
movq8(%rsp), %rsi
movl$0, 8(%r12)
@@ -2224,8 +2224,8 @@
movq%rax, (%r12)
movq%rbp, 32(%r12)
testq   %rsi, %rsi
-   movq%rsi, %rdx
cmovns  %rsi, %rbx
+   movq%rsi, %rdx
sarq$63, %rdx
shrq$58, %rdx
sarq$6, %rbx
which should help decoder bandwidth and perhaps also the cache, though I
was not able to measure an off-noise effect on SPEC.

gcc/ChangeLog:

* config/i386/i386.h (TARGET_FUSE_MOV_AND_ALU): New tune.
* config/i386/x86-tune-sched.cc (ix86_issue_rate): Updat for znver5.
(ix86_adjust_cost): Add TODO about znver5 memory latency.
(ix86_fuse_mov_alu_p): New.
(ix86_macro_fusion_pair_p): Use it.
* config/i386/x86-tune.def (X86_TUNE_FUSE_ALU_AND_BRANCH): Add 
ZNVER5.
(X86_TUNE_FUSE_MOV_AND_ALU): New tune.

(cherry picked from commit e2125a600552bc6e0329e3f1224eea14804db8d3)

Diff:
---
 gcc/config/i386/i386.h|  2 ++
 gcc/config/i386/x86-tune-sched.cc | 59 +++
 gcc/config/i386/x86-tune.def  |  6 +++-
 3 files changed, 66 insertions(+), 1 deletion(-)

diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 2bf294eb172a..ed988ca280ea 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -413,6 +413,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
ix86_tune_features[X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS]
 #define TARGET_FUSE_ALU_AND_BRANCH \
ix86_tune_features[X86_TUNE_FUSE_ALU_AND_BRANCH]
+#define TARGET_FUSE_MOV_AND_ALU \
+   ix86_tune_features[X86_TUNE_FUSE_MOV_AND_ALU]
 #define TARGET_OPT_AGU ix86_tune_features[X86_TUNE_OPT_AGU]
 #define TARGET_AVOID_LEA_FOR_ADDR \
ix86_tune_features[X86_TUNE_AVOID_LEA_FOR_ADDR]
diff --git a/gcc/config/i386/x86-tune-sched.cc 
b/gcc/config/i386/x86-tune-sched.cc
index ebfde5962495..f170f6476ce4 100644
--- a/gcc/config/i386/x86-tune-sched.cc
+++ b/gcc/config/i386/x86-tune-sched.cc
@@ -419,6 +419,8 @@ ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn 
*dep_insn, int cost,
  enum attr_unit unit = get_attr_unit (insn);
  int loadcost;
 
+ /* TODO: On znver5 complex addressing modes have
+greater latency.  */
  if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
loadcost = 4;
  else
@@ -524,6 +526,60 @@ ix86_macro_fusion_p ()
   return TARGET_FUSE_CMP_AND_BRANCH;
 }
 
+static bool
+ix86_fuse_mov_alu_p (rtx_insn *mov, rtx_insn *alu)
+{
+  /* Validate mov:
+  - It should be reg-reg move with opcode 0x89 or 0x8B.  */
+  rtx set1 = PATTERN (mov);
+  if (GET_CODE (set1) != SET
+  || !GENERAL_REG_P (SET_SRC (set1))
+  || !GENERAL_REG_P (SET_DEST (set1)))
+return false;
+  rtx reg = SET_DEST (set1);
+  /*  - it should have 0x89 or 0x8B opcode.  */
+  

[gcc r13-9284] Zen5 tuning part 3: scheduler tweaks

2025-01-07 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:7392e9e480afe3143e72a99f7b5ac99b2f49c284

commit r13-9284-g7392e9e480afe3143e72a99f7b5ac99b2f49c284
Author: Jan Hubicka 
Date:   Tue Sep 3 16:26:16 2024 +0200

Zen5 tuning part 3: scheduler tweaks

This patch adds support for the new fusion in znver5 documented in the
optimization manual:

   The Zen5 microarchitecture adds support to fuse reg-reg MOV Instructions
   with certain ALU instructions. The following conditions need to be met 
for
   fusion to happen:
 - The MOV should be reg-reg mov with Opcode 0x89 or 0x8B
 - The MOV is followed by an ALU instruction where the MOV and ALU 
destination register match.
 - The ALU instruction may source only registers or immediate data. 
There cannot be any memory source.
 - The ALU instruction sources either the source or dest of MOV 
instruction.
 - If ALU instruction has 2 reg sources, they should be different.
 - The following ALU instructions can fuse with an older qualified MOV 
instruction:
   ADD ADC AND XOR OP SUB SBB INC DEC NOT SAL / SHL SHR SAR
   (I assume OP is OR)

I also increased the issue rate from 4 to 6.  Theoretically znver5 can
do more, but with our model we can't really use it.
Increasing the issue rate to 8 leads to an infinite loop in the
scheduler.

Finally, I also enabled fuse_alu_and_branch since it is supported by
znver5 (I think by earlier zens too).

The new fusion pattern moves quite a few instructions around in common code:
@@ -2210,13 +2210,13 @@
.cfi_offset 3, -32
leaq63(%rsi), %rbx
movq%rbx, %rbp
+   shrq$6, %rbp
+   salq$3, %rbp
subq$16, %rsp
.cfi_def_cfa_offset 48
movq%rdi, %r12
-   shrq$6, %rbp
-   movq%rsi, 8(%rsp)
-   salq$3, %rbp
movq%rbp, %rdi
+   movq%rsi, 8(%rsp)
call_Znwm
movq8(%rsp), %rsi
movl$0, 8(%r12)
@@ -2224,8 +2224,8 @@
movq%rax, (%r12)
movq%rbp, 32(%r12)
testq   %rsi, %rsi
-   movq%rsi, %rdx
cmovns  %rsi, %rbx
+   movq%rsi, %rdx
sarq$63, %rdx
shrq$58, %rdx
sarq$6, %rbx
which should help decoder bandwidth and perhaps also cache, though I was not
able to measure an off-noise effect on SPEC.

gcc/ChangeLog:

* config/i386/i386.h (TARGET_FUSE_MOV_AND_ALU): New tune.
* config/i386/x86-tune-sched.cc (ix86_issue_rate): Update for znver5.
(ix86_adjust_cost): Add TODO about znver5 memory latency.
(ix86_fuse_mov_alu_p): New.
(ix86_macro_fusion_pair_p): Use it.
* config/i386/x86-tune.def (X86_TUNE_FUSE_ALU_AND_BRANCH): Add 
ZNVER5.
(X86_TUNE_FUSE_MOV_AND_ALU): New tune.

(cherry picked from commit e2125a600552bc6e0329e3f1224eea14804db8d3)

Diff:
---
 gcc/config/i386/i386.h|  2 ++
 gcc/config/i386/x86-tune-sched.cc | 59 +++
 gcc/config/i386/x86-tune.def  |  6 +++-
 3 files changed, 66 insertions(+), 1 deletion(-)

diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 08309367c18b..25c6540fb2c9 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -418,6 +418,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
ix86_tune_features[X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS]
 #define TARGET_FUSE_ALU_AND_BRANCH \
ix86_tune_features[X86_TUNE_FUSE_ALU_AND_BRANCH]
+#define TARGET_FUSE_MOV_AND_ALU \
+   ix86_tune_features[X86_TUNE_FUSE_MOV_AND_ALU]
 #define TARGET_OPT_AGU ix86_tune_features[X86_TUNE_OPT_AGU]
 #define TARGET_AVOID_LEA_FOR_ADDR \
ix86_tune_features[X86_TUNE_AVOID_LEA_FOR_ADDR]
diff --git a/gcc/config/i386/x86-tune-sched.cc 
b/gcc/config/i386/x86-tune-sched.cc
index cbaba5f9e3c3..28b9ed84d03b 100644
--- a/gcc/config/i386/x86-tune-sched.cc
+++ b/gcc/config/i386/x86-tune-sched.cc
@@ -435,6 +435,8 @@ ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn 
*dep_insn, int cost,
  enum attr_unit unit = get_attr_unit (insn);
  int loadcost;
 
+ /* TODO: On znver5 complex addressing modes have
+greater latency.  */
  if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
loadcost = 4;
  else
@@ -540,6 +542,60 @@ ix86_macro_fusion_p ()
   return TARGET_FUSE_CMP_AND_BRANCH;
 }
 
+static bool
+ix86_fuse_mov_alu_p (rtx_insn *mov, rtx_insn *alu)
+{
+  /* Validate mov:
+  - It should be reg-reg move with opcode 0x89 or 0x8B.  */
+  rtx set1 = PATTERN (mov);
+  if (GET_CODE (set1) != SET
+  || !GENERAL_REG_P (SET_SRC (set1))
+  || !GENERAL_REG_P (SET_DEST (set1)))
+return false;
+  rtx reg = SET_DEST (set1);
+  /*  - it should have 0x89 or 0x8B opcode.  */
+  i

[gcc r15-5365] Mark asm statements as necessary in ipa-fnsummary

2024-11-17 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:bd59f2eeacd41b91e4e79b32dda83cc60d499e25

commit r15-5365-gbd59f2eeacd41b91e4e79b32dda83cc60d499e25
Author: Jan Hubicka 
Date:   Sun Nov 17 15:48:29 2024 +0100

Mark asm statements as necessary in ipa-fnsummary

I forgot to mark asm statements as necessary in ipa-fnsummary.  This should
mask the failure of gcc.dg/guality/pr36728-2.c where the patch enabled
cloning, which breaks debug info.
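
A minimal sketch of the kind of statement that has to stay necessary (an
illustrative example, not the regression test itself):

int counter;

void
bump (void)
{
  /* Nothing below consumes a result of this asm, so a walk that only
     follows data dependencies would consider it dead unless GIMPLE_ASM
     statements are marked necessary.  */
  asm volatile ("lock incl %0" : "+m" (counter));
}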

gcc/ChangeLog:

* ipa-fnsummary.cc (find_necessary_statements): ASM statements are
necessary.

Diff:
---
 gcc/ipa-fnsummary.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/gcc/ipa-fnsummary.cc b/gcc/ipa-fnsummary.cc
index 87e08dad8467..e6bdc0069694 100644
--- a/gcc/ipa-fnsummary.cc
+++ b/gcc/ipa-fnsummary.cc
@@ -2804,7 +2804,8 @@ find_necessary_statements (struct cgraph_node *node)
  || (is_ctrl_stmt (stmt)
  && (gimple_code (stmt) != GIMPLE_COND
  || !guards_builtin_unreachable (bb, cache)))
- || gimple_store_p (stmt))
+ || gimple_store_p (stmt)
+ || gimple_code (stmt) == GIMPLE_ASM)
mark_stmt_necessary (stmt, worklist);
}
 }


[gcc r12-10923] Zen5 tuning part 2: disable gather and scatter

2025-01-21 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:e909afe8a8a2924dd6ced6bdf7d8e397f14310b5

commit r12-10923-ge909afe8a8a2924dd6ced6bdf7d8e397f14310b5
Author: Jan Hubicka 
Date:   Tue Sep 3 15:07:41 2024 +0200

Zen5 tuning part 2: disable gather and scatter

We disable gathers for zen4.  It seems that gather has improved a bit
compared to zen4, and the Zen5 optimization manual suggests "Avoid GATHER
instructions when the indices are known ahead of time. Vector loads followed
by shuffles result in a higher load bandwidth."  However, the situation
seems to be more complicated.

gather is a 5-10% loss on the parest benchmark as well as a 30% loss on
sparse dot products in TSVC.  Curiously enough, breaking these out into a
microbenchmark reversed the situation, and it turns out that the performance
depends on how indices are distributed: gather is a loss if indices are
sequential, neutral if they are random, and a win for some strides (4, 8).
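
For illustration, the sparse dot product shape in question looks roughly
like this (a sketch, not the actual TSVC or parest source):

double
sdot (const double *x, const double *y, const int *idx, int n)
{
  double s = 0.0;
  for (int i = 0; i < n; i++)
    s += x[i] * y[idx[i]];	/* indexed load; may become vgatherdpd */
  return s;
}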

This seems to be similar to earlier zens, so I think (especially for
backporting znver5 support) that it makes sense to be consistent and disable
gather unless we work out a good heuristic on when to use it.  Since we
typically do not know the indices in advance, I don't see how that can be
done.

I opened PR116582 with some examples of wins and losses.

gcc/ChangeLog:

* config/i386/x86-tune.def (X86_TUNE_USE_GATHER_2PARTS): Disable for
ZNVER5.
(X86_TUNE_USE_SCATTER_2PARTS): Disable for ZNVER5.
(X86_TUNE_USE_GATHER_4PARTS): Disable for ZNVER5.
(X86_TUNE_USE_SCATTER_4PARTS): Disable for ZNVER5.
(X86_TUNE_USE_GATHER_8PARTS): Disable for ZNVER5.
(X86_TUNE_USE_SCATTER_8PARTS): Disable for ZNVER5.

(cherry picked from commit d82edbe92eed53a479736fcbbe6d54d0fb42daa4)

Diff:
---
 gcc/config/i386/x86-tune.def | 15 ++-
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index 561bd17b6e54..3fa7501fc72e 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -471,35 +471,32 @@ DEF_TUNE (X86_TUNE_AVOID_4BYTE_PREFIXES, 
"avoid_4byte_prefixes",
 /* X86_TUNE_USE_GATHER_2PARTS: Use gather instructions for vectors with 2
elements.  */
 DEF_TUNE (X86_TUNE_USE_GATHER_2PARTS, "use_gather_2parts",
- ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_ALDERLAKE
-   | m_GENERIC | m_GDS))
+ ~(m_ZNVER | m_ALDERLAKE | m_GENERIC | m_GDS))
 
 /* X86_TUNE_USE_SCATTER_2PARTS: Use scater instructions for vectors with 2
elements.  */
 DEF_TUNE (X86_TUNE_USE_SCATTER_2PARTS, "use_scatter_2parts",
- ~(m_ZNVER4))
+ ~(m_ZNVER4 | m_ZNVER5))
 
 /* X86_TUNE_USE_GATHER_4PARTS: Use gather instructions for vectors with 4
elements.  */
 DEF_TUNE (X86_TUNE_USE_GATHER_4PARTS, "use_gather_4parts",
- ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_ALDERLAKE
-   | m_GENERIC | m_GDS))
+ ~(m_ZNVER | m_ALDERLAKE | m_GENERIC | m_GDS))
 
 /* X86_TUNE_USE_SCATTER_4PARTS: Use scater instructions for vectors with 4
elements.  */
 DEF_TUNE (X86_TUNE_USE_SCATTER_4PARTS, "use_scatter_4parts",
- ~(m_ZNVER4))
+ ~(m_ZNVER4 | m_ZNVER5))
 
 /* X86_TUNE_USE_GATHER: Use gather instructions for vectors with 8 or more
elements.  */
 DEF_TUNE (X86_TUNE_USE_GATHER_8PARTS, "use_gather_8parts",
- ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER4 | m_ALDERLAKE
-   | m_GENERIC | m_GDS))
+ ~(m_ZNVER | m_ALDERLAKE | m_GENERIC | m_GDS))
 
 /* X86_TUNE_USE_SCATTER: Use scater instructions for vectors with 8 or more
elements.  */
 DEF_TUNE (X86_TUNE_USE_SCATTER_8PARTS, "use_scatter_8parts",
- ~(m_ZNVER4))
+ ~(m_ZNVER4 | m_ZNVER5))
 
 /* X86_TUNE_AVOID_128FMA_CHAINS: Avoid creating loops with tight 128bit or
smaller FMA chain.  */


[gcc r13-9309] Zen5 tuning part 5: update instruction latencies in x86-tune-costs

2025-01-12 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:f10d381dfc983ea32e5f72faadc7eb8126f114f6

commit r13-9309-gf10d381dfc983ea32e5f72faadc7eb8126f114f6
Author: Jan Hubicka 
Date:   Wed Sep 4 09:19:08 2024 +0200

Zen5 tuning part 5: update instruction latencies in x86-tune-costs

There is nothing exciting in this patch.  I measured latencies and also
compared them with the newly released optimization guide.  There are no
dramatic changes compared to zen4.  One interesting new bit is that addss is
faster and can be 2 cycles when fed by another addss.

I also increased the large insn bound since the decoders no longer seem to
require instructions to be 8 bytes or less.

gcc/ChangeLog:

* config/i386/x86-tune-costs.h (znver5_cost): Update instruction
costs.

(cherry picked from commit 4292297a0f938ffc953422fa246ff00fe345fe3d)

Diff:
---
 gcc/config/i386/x86-tune-costs.h | 28 +---
 1 file changed, 21 insertions(+), 7 deletions(-)

diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index b89ac640ea5f..9edc6e36557d 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -2034,6 +2034,7 @@ struct processor_costs znver5_cost = {
   COSTS_N_INSNS (1),   /* cost of a lea instruction.  */
   COSTS_N_INSNS (1),   /* variable shift costs.  */
   COSTS_N_INSNS (1),   /* constant shift costs.  */
+  /* mul has latency 3, executes in 3 integer units.  */
   {COSTS_N_INSNS (3),  /* cost of starting multiply for QI.  */
COSTS_N_INSNS (3),  /*   HI.  */
COSTS_N_INSNS (3),  /*   SI.  */
@@ -2041,6 +2042,8 @@ struct processor_costs znver5_cost = {
COSTS_N_INSNS (3)}, /*  other.  */
   0,   /* cost of multiply per each bit
   set.  */
+  /* integer divide has latency of 8 cycles
+ plus 1 for every 9 bits of quotient.  */
   {COSTS_N_INSNS (10), /* cost of a divide/mod for QI.  */
COSTS_N_INSNS (11), /*  HI.  */
COSTS_N_INSNS (13), /*  SI.  */
@@ -2048,7 +2051,7 @@ struct processor_costs znver5_cost = {
COSTS_N_INSNS (16)},/*  
other.  */
   COSTS_N_INSNS (1),   /* cost of movsx.  */
   COSTS_N_INSNS (1),   /* cost of movzx.  */
-  8,   /* "large" insn.  */
+  15,  /* "large" insn.  */
   9,   /* MOVE_RATIO.  */
   6,   /* CLEAR_RATIO */
   {6, 6, 6},   /* cost of loading integer registers
@@ -2065,12 +2068,13 @@ struct processor_costs znver5_cost = {
   2, 2, 2, /* cost of moving XMM,YMM,ZMM
   register.  */
   6,   /* cost of moving SSE register to 
integer.  */
-  /* VGATHERDPD is 17 uops and throughput is 4, VGATHERDPS is 24 uops,
- throughput 5.  Approx 7 uops do not depend on vector size and every load
- is 5 uops.  */
+
+  /* TODO: gather and scatter instructions are currently disabled in
+ x86-tune.def.  In some cases they are however a win, see PR116582
+ We however need good cost model for them.  */
   14, 10,  /* Gather load static, per_elt.  */
   14, 20,  /* Gather store static, per_elt.  */
-  32,  /* size of l1 cache.  */
+  48,  /* size of l1 cache.  */
   1024,/* size of l2 cache.  */
   64,  /* size of prefetch block.  */
   /* New AMD processors never drop prefetches; if they cannot be performed
@@ -2080,6 +2084,8 @@ struct processor_costs znver5_cost = {
  time).  */
   100, /* number of parallel prefetches.  */
   3,   /* Branch cost.  */
+  /* TODO x87 latencies are still based on znver4.
+ Probably not very important these days.  */
   COSTS_N_INSNS (7),   /* cost of FADD and FSUB insns.  */
   COSTS_N_INSNS (7),   /* cost of FMUL instruction.  */
   /* Latency of fdiv is 8-15.  */
@@ -2089,16 +2095,24 @@ struct processor_costs znver5_cost = {
   /* Latency of fsqrt is 4-10.  */
   COSTS_N_INSNS (25),  /* cost of FSQRT instruction.  */
 
+  /* SSE instructions have typical throughput 4 and latency 1.  */
   COSTS_N_INSNS (1),   /* cost of cheap SSE instruction.  */
-  COSTS_N_INSNS (3),   /* cost of ADDSS/SD SUBSS/SD insns.  *

[gcc r12-10901] Zen5 tuning part 5: update instruction latencies in x86-tune-costs

2025-01-14 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:3911b6366ee49dffe2f16578093b49664b3a2d72

commit r12-10901-g3911b6366ee49dffe2f16578093b49664b3a2d72
Author: Jan Hubicka 
Date:   Wed Sep 4 09:19:08 2024 +0200

Zen5 tuning part 5: update instruction latencies in x86-tune-costs

There is nothing exciting in this patch.  I measured latencies and also
compared them with the newly released optimization guide.  There are no
dramatic changes compared to zen4.  One interesting new bit is that addss is
faster and can be 2 cycles when fed by another addss.

I also increased the large insn bound since the decoders no longer seem to
require instructions to be 8 bytes or less.

gcc/ChangeLog:

* config/i386/x86-tune-costs.h (znver5_cost): Update instruction
costs.

(cherry picked from commit 4292297a0f938ffc953422fa246ff00fe345fe3d)

Diff:
---
 gcc/config/i386/x86-tune-costs.h | 28 +---
 1 file changed, 21 insertions(+), 7 deletions(-)

diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index 0f2308bb079c..6bf09342feb0 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -2002,6 +2002,7 @@ struct processor_costs znver5_cost = {
   COSTS_N_INSNS (1),   /* cost of a lea instruction.  */
   COSTS_N_INSNS (1),   /* variable shift costs.  */
   COSTS_N_INSNS (1),   /* constant shift costs.  */
+  /* mul has latency 3, executes in 3 integer units.  */
   {COSTS_N_INSNS (3),  /* cost of starting multiply for QI.  */
COSTS_N_INSNS (3),  /*   HI.  */
COSTS_N_INSNS (3),  /*   SI.  */
@@ -2009,6 +2010,8 @@ struct processor_costs znver5_cost = {
COSTS_N_INSNS (3)}, /*  other.  */
   0,   /* cost of multiply per each bit
   set.  */
+  /* integer divide has latency of 8 cycles
+ plus 1 for every 9 bits of quotient.  */
   {COSTS_N_INSNS (10), /* cost of a divide/mod for QI.  */
COSTS_N_INSNS (11), /*  HI.  */
COSTS_N_INSNS (13), /*  SI.  */
@@ -2016,7 +2019,7 @@ struct processor_costs znver5_cost = {
COSTS_N_INSNS (16)},/*  
other.  */
   COSTS_N_INSNS (1),   /* cost of movsx.  */
   COSTS_N_INSNS (1),   /* cost of movzx.  */
-  8,   /* "large" insn.  */
+  15,  /* "large" insn.  */
   9,   /* MOVE_RATIO.  */
   6,   /* CLEAR_RATIO */
   {6, 6, 6},   /* cost of loading integer registers
@@ -2033,12 +2036,13 @@ struct processor_costs znver5_cost = {
   2, 2, 2, /* cost of moving XMM,YMM,ZMM
   register.  */
   6,   /* cost of moving SSE register to 
integer.  */
-  /* VGATHERDPD is 17 uops and throughput is 4, VGATHERDPS is 24 uops,
- throughput 5.  Approx 7 uops do not depend on vector size and every load
- is 5 uops.  */
+
+  /* TODO: gather and scatter instructions are currently disabled in
+ x86-tune.def.  In some cases they are however a win, see PR116582
+ We however need good cost model for them.  */
   14, 10,  /* Gather load static, per_elt.  */
   14, 20,  /* Gather store static, per_elt.  */
-  32,  /* size of l1 cache.  */
+  48,  /* size of l1 cache.  */
   1024,/* size of l2 cache.  */
   64,  /* size of prefetch block.  */
   /* New AMD processors never drop prefetches; if they cannot be performed
@@ -2048,6 +2052,8 @@ struct processor_costs znver5_cost = {
  time).  */
   100, /* number of parallel prefetches.  */
   3,   /* Branch cost.  */
+  /* TODO x87 latencies are still based on znver4.
+ Probably not very important these days.  */
   COSTS_N_INSNS (7),   /* cost of FADD and FSUB insns.  */
   COSTS_N_INSNS (7),   /* cost of FMUL instruction.  */
   /* Latency of fdiv is 8-15.  */
@@ -2057,16 +2063,24 @@ struct processor_costs znver5_cost = {
   /* Latency of fsqrt is 4-10.  */
   COSTS_N_INSNS (25),  /* cost of FSQRT instruction.  */
 
+  /* SSE instructions have typical throughput 4 and latency 1.  */
   COSTS_N_INSNS (1),   /* cost of cheap SSE instruction.  */
-  COSTS_N_INSNS (3),   /* cost of ADDSS/SD SUBSS/SD insns.  

[gcc r16-89] Fix vectorizer costs of COND_EXPR, MIN_EXPR, MAX_EXPR, ABS_EXPR, ABSU_EXPR

2025-04-22 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:0650ea627399a0ef23db434d4fce6b52b9faf557

commit r16-89-g0650ea627399a0ef23db434d4fce6b52b9faf557
Author: Jan Hubicka 
Date:   Tue Apr 22 23:47:14 2025 +0200

Fix vectorizer costs of COND_EXPR, MIN_EXPR, MAX_EXPR, ABS_EXPR, ABSU_EXPR

This patch adds special cases for vectorizer costs of COND_EXPR, MIN_EXPR,
MAX_EXPR, ABS_EXPR and ABSU_EXPR.  We previously costed ABS_EXPR and
ABSU_EXPR, but it was only correct for the FP variant (where it corresponds
to andps clearing the sign bit).  Integer abs/absu is open coded as a
conditional move for SSE2, and SSSE3 introduced an instruction.

MIN_EXPR/MAX_EXPR compile to minss/maxss for FP, and according to Agner
Fog's tables they cost the same as sse_op on all targets.  Integer versions
translate to a single instruction since SSE3.

COND_EXPR translates to an open-coded conditional move for SSE2; SSE4.1
simplified the sequence and AVX512 introduced masked registers.
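
A minimal loop whose vectorized body contains such a COND_EXPR (an assumed
example, only for illustration):

void
f (int *r, const int *a, const int *b, const int *x, const int *y, int n)
{
  for (int i = 0; i < n; i++)
    /* SSE2: pcmpgtd + pand + pandn + por; SSE4.1: compare + blend;
       AVX512: compare into a mask register + masked move.  */
    r[i] = a[i] > b[i] ? x[i] : y[i];
}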

gcc/ChangeLog:

* config/i386/i386.cc (ix86_vector_costs::add_stmt_cost): Add 
special cases
for COND_EXPR; make MIN_EXPR, MAX_EXPR, ABS_EXPR and ABSU_EXPR more 
realistic.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr89618-2.c: XFAIL.

Diff:
---
 gcc/config/i386/i386.cc   | 95 ---
 gcc/testsuite/gcc.target/i386/pr89618-2.c |  8 ++-
 2 files changed, 92 insertions(+), 11 deletions(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index d15f91ddd2cb..aef41454d9d5 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -25300,7 +25300,7 @@ ix86_vector_costs::add_stmt_cost (int count, 
vect_cost_for_stmt kind,
  else if (X87_FLOAT_MODE_P (mode))
stmt_cost = ix86_cost->fadd;
  else
-   stmt_cost = ix86_cost->add;
+   stmt_cost = ix86_cost->add;
}
  else
stmt_cost = ix86_vec_cost (mode, fp ? ix86_cost->addss
@@ -25355,7 +25355,7 @@ ix86_vector_costs::add_stmt_cost (int count, 
vect_cost_for_stmt kind,
(subcode == RSHIFT_EXPR
 && !TYPE_UNSIGNED (TREE_TYPE (op1)))
? ASHIFTRT : LSHIFTRT, mode,
-   TREE_CODE (op2) == INTEGER_CST,
+   TREE_CODE (op2) == INTEGER_CST,
cst_and_fits_in_hwi (op2)
? int_cst_value (op2) : -1,
false, false, NULL, NULL);
@@ -25364,7 +25364,7 @@ ix86_vector_costs::add_stmt_cost (int count, 
vect_cost_for_stmt kind,
case NOP_EXPR:
  /* Only sign-conversions are free.  */
  if (tree_nop_conversion_p
-   (TREE_TYPE (gimple_assign_lhs (stmt_info->stmt)),
+   (TREE_TYPE (gimple_assign_lhs (stmt_info->stmt)),
 TREE_TYPE (gimple_assign_rhs1 (stmt_info->stmt
stmt_cost = 0;
  else if (fp)
@@ -25372,17 +25372,94 @@ ix86_vector_costs::add_stmt_cost (int count, 
vect_cost_for_stmt kind,
  (ix86_tune_cost, GET_MODE_BITSIZE (mode));
  break;
 
-   case BIT_IOR_EXPR:
-   case ABS_EXPR:
-   case ABSU_EXPR:
+   case COND_EXPR:
+ {
+   /* SSE2 conditinal move sequence is:
+pcmpgtd %xmm5, %xmm0
+pand%xmm0, %xmm2
+pandn   %xmm1, %xmm0
+por %xmm2, %xmm0
+  while SSE4 uses cmp + blend
+  and AVX512 masked moves.  */
+
+   int ninsns = TARGET_SSE4_1 ? 2 : 4;
+
+   if (SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode))
+ stmt_cost = ninsns * ix86_cost->sse_op;
+   else if (X87_FLOAT_MODE_P (mode))
+ /* x87 requires conditional branch.  We don't have cost for
+that.  */
+ ;
+   else if (VECTOR_MODE_P (mode))
+ stmt_cost = ix86_vec_cost (mode, ninsns * ix86_cost->sse_op);
+   else
+ /* compare + cmov.  */
+ stmt_cost = ix86_cost->add * 2;
+ }
+ break;
+
case MIN_EXPR:
case MAX_EXPR:
+ if (fp)
+   {
+ if (X87_FLOAT_MODE_P (mode))
+   /* x87 requires conditional branch.  We don't have cost for
+  that.  */
+   ;
+ else
+   /* minss  */
+   stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op);
+   }
+ else
+   {
+ if (VECTOR_MODE_P (mode))
+   {
+ stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op);
+ /* vpmin was introduced in SSE3.
+SSE2 needs pcmpgtd + pand + pandn + pxor.  */
+ if (!TARGET_SSSE3)
+   stmt_cost *= 4;
+   }
+ else
+   /* cmp + cmov.  */
+  

[gcc r16-117] Fix i386 vectorizer cost of COND_EXPR and MIN_MAX with one of parameters 0 or -1

2025-04-24 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:2056d52d74070f50c5f8a22e4a600fcc3974fd88

commit r16-117-g2056d52d74070f50c5f8a22e4a600fcc3974fd88
Author: Jan Hubicka 
Date:   Thu Apr 24 18:37:55 2025 +0200

Fix i386 vectorizer cost of COND_EXPR and MIN_MAX with one of parameters 0 
or -1
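
A hedged sketch of a conditional with a zero arm that the cheaper sequence
covers (illustrative, not the PR testcase):

void
g (int *r, const int *a, const int *b, const int *x, int n)
{
  for (int i = 0; i < n; i++)
    /* if_false is 0, so (x & mask) | (0 & ~mask) simplifies to x & mask,
       i.e. pcmpgtd + pand on SSE2.  */
    r[i] = a[i] > b[i] ? x[i] : 0;
}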

gcc/ChangeLog:

PR target/119919
* config/i386/i386.cc (ix86_vector_costs::add_stmt_cost): Account
correctly cond_expr and min/max when one of operands is 0 or -1.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr119919.c: New test.

Diff:
---
 gcc/config/i386/i386.cc  | 43 ++--
 gcc/testsuite/gcc.target/i386/pr119919.c | 13 ++
 2 files changed, 48 insertions(+), 8 deletions(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 3b4dfd9a9903..78df3d9525ae 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -25375,14 +25375,32 @@ ix86_vector_costs::add_stmt_cost (int count, 
vect_cost_for_stmt kind,
case COND_EXPR:
  {
/* SSE2 conditinal move sequence is:
-pcmpgtd %xmm5, %xmm0
+pcmpgtd %xmm5, %xmm0 (accounted separately)
 pand%xmm0, %xmm2
 pandn   %xmm1, %xmm0
 por %xmm2, %xmm0
   while SSE4 uses cmp + blend
-  and AVX512 masked moves.  */
-
-   int ninsns = TARGET_SSE4_1 ? 2 : 4;
+  and AVX512 masked moves.
+
+  The condition is accounted separately since we usually have
+p = a < b
+c = p ? x : y
+  and we will account first statement as setcc.  Exception is when
+  p is loaded from memory as bool and then we will not acocunt
+  the compare, but there is no way to check for this.  */
+
+   int ninsns = TARGET_SSE4_1 ? 1 : 3;
+
+   /* If one of parameters is 0 or -1 the sequence will be simplified:
+  (if_true & mask) | (if_false & ~mask) -> if_true & mask  */
+   if (ninsns > 1
+   && (zerop (gimple_assign_rhs2 (stmt_info->stmt))
+   || zerop (gimple_assign_rhs3 (stmt_info->stmt))
+   || integer_minus_onep
+   (gimple_assign_rhs2 (stmt_info->stmt))
+   || integer_minus_onep
+   (gimple_assign_rhs3 (stmt_info->stmt
+ ninsns = 1;
 
if (SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode))
  stmt_cost = ninsns * ix86_cost->sse_op;
@@ -25393,8 +25411,8 @@ ix86_vector_costs::add_stmt_cost (int count, 
vect_cost_for_stmt kind,
else if (VECTOR_MODE_P (mode))
  stmt_cost = ix86_vec_cost (mode, ninsns * ix86_cost->sse_op);
else
- /* compare + cmov.  */
- stmt_cost = ix86_cost->add * 2;
+ /* compare (accounted separately) + cmov.  */
+ stmt_cost = ix86_cost->add;
  }
  break;
 
@@ -25416,9 +25434,18 @@ ix86_vector_costs::add_stmt_cost (int count, 
vect_cost_for_stmt kind,
{
  stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op);
  /* vpmin was introduced in SSE3.
-SSE2 needs pcmpgtd + pand + pandn + pxor.  */
+SSE2 needs pcmpgtd + pand + pandn + pxor.
+If one of parameters is 0 or -1 the sequence is simplified
+to pcmpgtd + pand.  */
  if (!TARGET_SSSE3)
-   stmt_cost *= 4;
+   {
+ if (zerop (gimple_assign_rhs2 (stmt_info->stmt))
+ || integer_minus_onep
+   (gimple_assign_rhs2 (stmt_info->stmt)))
+   stmt_cost *= 2;
+ else
+   stmt_cost *= 4;
+   }
}
  else
/* cmp + cmov.  */
diff --git a/gcc/testsuite/gcc.target/i386/pr119919.c 
b/gcc/testsuite/gcc.target/i386/pr119919.c
new file mode 100644
index ..ed646561bd1f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr119919.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2 -fdump-tree-vect-details" } */
+int a[9*9];
+bool b[9];
+void test()
+{
+for (int i = 0; i < 9; i++)
+{
+b[i] = a[i*9] != 0;
+}
+}
+
+/* { dg-final { scan-tree-dump "loop vectorized using 8 byte vectors" "vect" } 
} */


[gcc r16-116] Fix ICE building deepsjeng with -fprofile-use

2025-04-24 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:cfb04e0de6aa438df9d8b83a3d8c7f93789b5c9f

commit r16-116-gcfb04e0de6aa438df9d8b83a3d8c7f93789b5c9f
Author: Jan Hubicka 
Date:   Thu Apr 24 18:35:54 2025 +0200

Fix ICE building deepsjeng with -fprofile-use

The problem here is division by zero, since adjusted 0 > precise 0.  Fixed
by using the right test.

gcc/ChangeLog:

PR ipa/119924
* ipa-cp.cc (update_counts_for_self_gen_clones): Use nonzero_p.
(update_profiling_info): Likewise.
(update_specialized_profile): Likewise.

Diff:
---
 gcc/ipa-cp.cc | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/gcc/ipa-cp.cc b/gcc/ipa-cp.cc
index abde64b6f296..b4b96997d750 100644
--- a/gcc/ipa-cp.cc
+++ b/gcc/ipa-cp.cc
@@ -4639,7 +4639,7 @@ update_counts_for_self_gen_clones (cgraph_node *orig_node,
   const vec &self_gen_clones)
 {
   profile_count redist_sum = orig_node->count.ipa ();
-  if (!(redist_sum > profile_count::zero ()))
+  if (!redist_sum.nonzero_p ())
 return;
 
   if (dump_file)
@@ -4710,7 +4710,7 @@ update_counts_for_self_gen_clones (cgraph_node *orig_node,
  it.  */
   for (cgraph_node *n : self_gen_clones)
 {
-  if (!(n->count.ipa () > profile_count::zero ()))
+  if (!n->count.ipa ().nonzero_p ())
continue;
 
   desc_incoming_count_struct desc;
@@ -4756,7 +4756,7 @@ update_profiling_info (struct cgraph_node *orig_node,
   profile_count new_sum;
   profile_count remainder, orig_node_count = orig_node->count.ipa ();
 
-  if (!(orig_node_count > profile_count::zero ()))
+  if (!orig_node_count.nonzero_p ())
 return;
 
   if (dump_file)
@@ -4920,7 +4920,7 @@ update_specialized_profile (struct cgraph_node *new_node,
   orig_node_count.dump (dump_file);
   fprintf (dump_file, "\n");
 }
-  if (!(orig_node_count > profile_count::zero ()))
+  if (!orig_node_count.nonzero_p ())
 return;
 
   new_node_count = new_node->count;


[gcc r16-100] Cost truth_value exprs in i386 vectorizer costs.

2025-04-23 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:9b9d605d68cf27a24e8ed9d4f1ead1f00131cec1

commit r16-100-g9b9d605d68cf27a24e8ed9d4f1ead1f00131cec1
Author: Jan Hubicka 
Date:   Wed Apr 23 17:04:32 2025 +0200

Cost truth_value exprs in i386 vectorizer costs.

This patch implements costing of truth_value exprs, i.e.
  a = b < c;
Those now seem to be the most common operations that go to the addss path
except for int->fp and fp->int conversions.

For integer we use setcc; for FP there is CMPccSS and variants, which set
the destination register as a mask (i.e. -1 on true and 0 on false).
Technically these need res&1 to get 1 on true and 0 on false, but looking at
examples where this is used, it is common that the resulting code is
optimized to avoid the need for this (except for cases where the result is
directly saved to memory).  For this reason I am accounting only one sse_op
(CMPccSS) itself.
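
A small example of the shape being costed (an assumed illustration; storing
the flag to memory is the case where the 0/1 fixup survives):

void
store_flags (int *out, const double *b, const double *c, int n)
{
  for (int i = 0; i < n; i++)
    out[i] = b[i] < c[i];	/* FP compare yields a 0/-1 mask */
}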

gcc/ChangeLog:

* config/i386/i386.cc (ix86_vector_costs::add_stmt_cost): Cost 
truth_value
exprs.

Diff:
---
 gcc/config/i386/i386.cc | 18 ++
 1 file changed, 18 insertions(+)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index aef41454d9d5..3b4dfd9a9903 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -25464,7 +25464,25 @@ ix86_vector_costs::add_stmt_cost (int count, 
vect_cost_for_stmt kind,
  else
stmt_cost = ix86_cost->add;
  break;
+
default:
+ if (truth_value_p (subcode))
+   {
+ if (SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode))
+   /* CMPccS? insructions are cheap, so use sse_op.  While they
+  produce a mask which may need to be turned to 0/1 by and,
+  expect that this will be optimized away in a common case.  */
+   stmt_cost = ix86_cost->sse_op;
+ else if (X87_FLOAT_MODE_P (mode))
+   /* fcmp + setcc.  */
+   stmt_cost = ix86_cost->fadd + ix86_cost->add;
+ else if (VECTOR_MODE_P (mode))
+   stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op);
+ else
+   /* setcc.  */
+   stmt_cost = ix86_cost->add;
+ break;
+   }
  break;
}
 }


[gcc r16-101] Enable ip-cp cloning over non-hot edges

2025-04-23 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:132d01d96ea9d617aaffdd5dfba3284a8958e529

commit r16-101-g132d01d96ea9d617aaffdd5dfba3284a8958e529
Author: Jan Hubicka 
Date:   Wed Apr 23 18:39:14 2025 +0200

Enable ip-cp cloning over non-hot edges

Currently enabling profile feedback regresses x264 and exchange.  In both
cases the root of the issue is that the ipa-cp cost model thinks cloning is
not relevant when feedback is available, while it clones without feedback.

Consider:

__attribute__ ((used))
int a[1000];

__attribute__ ((noinline))
void
test2(int sz)
{
  for (int i = 0; i < sz; i++)
  a[i]++;
  asm volatile (""::"m"(a));
}

__attribute__ ((noinline))
void
test1 (int sz)
{
  for (int i = 0; i < 1000; i++)
  test2(sz);
}
int main()
{
test1(1000);
return 0;
}

Here we want to clone both test1 and test2 and specialize for 1000, but
ipa-cp will not do that, since it will skip the call main->test1 as not hot,
as it is called just once both with and without profile feedback.
In this simple testcase even without profile feedback we will track that
main is called once.

I think the testcase shows that hotness of a call is not that relevant when
deciding whether we want to propagate constants across it.  ipa-cp with IPA
profile can compute an overall estimate of time saved (which is the existing
time benefit computing time saved per invocation of the function multiplied
by the number of executions) and see if the result is big enough.  An easy
check is to simply call maybe_hot_p on the resulting count.
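
In other words, the check boils down to something like the following toy
model (names and types are illustrative, not the actual ipa-cp interfaces):

/* Toy model only: per-call benefit scaled by the IPA call count is the
   quantity compared against the hotness cutoff.  */
static int
worth_cloning_p (double time_saved_per_call, double ipa_call_count,
		 double hot_threshold)
{
  return time_saved_per_call * ipa_call_count >= hot_threshold;
}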

So this patch makes ipa-cp consider all call sites except those known to be
unlikely executed (i.e. run 0 times in the train run or known to lead to
something bad) as interesting, which makes ipa-cp propagate across them,
find cloning candidates and feed them into good_cloning_opportunity_p.

For this I added cs_interesting_for_ipcp_p which also attempts to do the
right thing with partial training.

Now good_cloning_opportunity_p will currently return false, since it will
figure out that the call edge is not very frequent.
It already kind of knows that the frequency of the call instruction itself
is not too important, but instead of computing overall time saved, it tries
to compare it with the param_ipa_cp_profile_count_base percentage of counts
of call edges.  I think this is not very relevant since the estimated time
saved per call can be large.  So I dropped this logic and replaced it with a
simple use of overall saved time.

Since ipa-cp is not dealing well with the cases where it hits the allowed
unit growth limit, we probably want to be more careful, so I keep the
existing metric with this change.

So now we get:

Evaluating opportunities for test1/3.
 - considering value 1000 for param #0 sz (caller_count: 1)
 good_cloning_opportunity_p (time: 1, size: 8, count_sum: 1 (precise), 
overall time saved: 1 (adjusted)) -> evaluation: 0.12, threshold: 500
 not cloning: time saved is not hot
 good_cloning_opportunity_p (time: 129001, size: 20, count_sum: 1 
(precise), overall time saved: 129001 (adjusted)) -> evaluation: 6450.05, 
threshold: 500

The first call to good_cloning_opportunity_p considers the case where only
test1 is cloned.  In this case time saved is 1 (for passing the value
around) and since it is called just once (count_sum) overall time saved is
1, which is not considered hot, and we also get a very low evaluation score.

In the second call we consider the cloning chain test1->test2.  In this case
time saved is large (129001) since test2 is invoked many times and it is
used to control the loop.  We still know that the count is 1 but overall
time is 129001 which is already considered relevant and we clone.

I also try to do something sensible in case we have calls both with and
without IPA profile (which can happen for comdats where the profile got
lost, or with LTO if some units were not trained).
Instead of checking whether the sum of calls with known profile is nonzero,
I keep track of whether there are other calls and if so, also try the local
heuristic that is used without profile feedback.

The patch improves SPECint with -Ofast -fprofile-use by approx 1% by 
speeding
up x264 from 99.3s to 91.3s (9%) and exchange from 99.7s to 95.5s (3.3%).

We still get better x264 runtime without profile (86.4s for x264 and 93.8 
for exchange).

The main problem I see is that ipa-cp has a global limit for growth of 10%
but does not consider the opportunities in priority order.  Consequently, if
the limit is hit, some clone opportunities are randomly dropped in favour of
others.

I dumped unit size changes with -flto -Ofast build of

[gcc r15-9522] Stream ipa_return_value_summary

2025-04-16 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:eabba7be040e81690332070873d59d23e8c93e11

commit r15-9522-geabba7be040e81690332070873d59d23e8c93e11
Author: Jan Hubicka 
Date:   Wed Apr 16 15:28:32 2025 +0200

Stream ipa_return_value_summary

Add streaming of return summaries from compile time to ltrans
which are now needed for VRP to not output false errors on musttail.
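
A hedged sketch of the shape involved (an assumed example; the actual
testcase is g++.dg/lto/pr119614_0.C):

int callee (int x) { return x & 0xff; }	/* return range [0, 255] recorded */

int
caller (int x)
{
  /* Under LTO the ltrans unit needs the streamed return range so VRP
     does not emit a false error for this musttail call.  */
  __attribute__((musttail)) return callee (x);
}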

Co-authored-by: Jakub Jelinek 

gcc/ChangeLog:
PR tree-optimization/119614

* ipa-prop.cc (ipa_write_return_summaries): New function.
(ipa_record_return_value_range_1): Break out from 
(ipa_record_return_value_range): ... here.
(ipa_read_return_summaries): New function.
(ipa_prop_read_section): Read return summaries.
(read_ipcp_transformation_info): Read return summaries.
(ipcp_write_transformation_summaries): Write return summaries;
do not stream stray 0.

gcc/testsuite/ChangeLog:

* g++.dg/lto/pr119614_0.C: New test.

Diff:
---
 gcc/ipa-prop.cc   | 115 --
 gcc/testsuite/g++.dg/lto/pr119614_0.C |  34 ++
 2 files changed, 131 insertions(+), 18 deletions(-)

diff --git a/gcc/ipa-prop.cc b/gcc/ipa-prop.cc
index 49d68ab044b7..0398d69962f8 100644
--- a/gcc/ipa-prop.cc
+++ b/gcc/ipa-prop.cc
@@ -5439,6 +5439,49 @@ ipa_read_node_info (class lto_input_block *ib, struct 
cgraph_node *node,
 }
 }
 
+/* Stream out ipa_return_summary.  */
+static void
+ipa_write_return_summaries (output_block *ob)
+{
+  if (!ipa_return_value_sum)
+{
+  streamer_write_uhwi (ob, 0);
+  return;
+}
+
+  lto_symtab_encoder_t encoder = ob->decl_state->symtab_node_encoder;
+  unsigned int count = 0;
+  for (int i = 0; i < lto_symtab_encoder_size (encoder); i++)
+{
+  symtab_node *snode = lto_symtab_encoder_deref (encoder, i);
+  cgraph_node *cnode = dyn_cast  (snode);
+  ipa_return_value_summary *v;
+
+  if (cnode && cnode->definition && !cnode->alias
+ && (v = ipa_return_value_sum->get (cnode))
+ && v->vr)
+   count++;
+}
+  streamer_write_uhwi (ob, count);
+
+  for (int i = 0; i < lto_symtab_encoder_size (encoder); i++)
+{
+  symtab_node *snode = lto_symtab_encoder_deref (encoder, i);
+  cgraph_node *cnode = dyn_cast  (snode);
+  ipa_return_value_summary *v;
+
+  if (cnode && cnode->definition && !cnode->alias
+ && (v = ipa_return_value_sum->get (cnode))
+ && v->vr)
+   {
+ streamer_write_uhwi
+   (ob,
+lto_symtab_encoder_encode (encoder, cnode));
+ v->vr->streamer_write (ob);
+   }
+}
+}
+
 /* Write jump functions for nodes in SET.  */
 
 void
@@ -5475,11 +5518,58 @@ ipa_prop_write_jump_functions (void)
  && ipa_node_params_sum->get (node) != NULL)
 ipa_write_node_info (ob, node);
 }
-  streamer_write_char_stream (ob->main_stream, 0);
+  ipa_write_return_summaries (ob);
   produce_asm (ob);
   destroy_output_block (ob);
 }
 
+/* Record that return value range of N is VAL.  */
+
+static void
+ipa_record_return_value_range_1 (cgraph_node *n, value_range val)
+{
+  if (!ipa_return_value_sum)
+{
+  if (!ipa_vr_hash_table)
+   ipa_vr_hash_table = hash_table::create_ggc (37);
+  ipa_return_value_sum = new (ggc_alloc_no_dtor  
())
+ ipa_return_value_sum_t (symtab, true);
+  ipa_return_value_sum->disable_insertion_hook ();
+}
+  ipa_return_value_sum->get_create (n)->vr = ipa_get_value_range (val);
+  if (dump_file && (dump_flags & TDF_DETAILS))
+{
+  fprintf (dump_file, "Recording return range of %s:", n->dump_name ());
+  val.dump (dump_file);
+  fprintf (dump_file, "\n");
+}
+}
+
+/* Stream out ipa_return_summary.  */
+static void
+ipa_read_return_summaries (lto_input_block *ib,
+  struct lto_file_decl_data *file_data,
+  class data_in *data_in)
+{
+  unsigned int f_count = streamer_read_uhwi (ib);
+  for (unsigned int i = 0; i < f_count; i++)
+{
+  unsigned int index = streamer_read_uhwi (ib);
+  lto_symtab_encoder_t encoder = file_data->symtab_node_encoder;
+  struct cgraph_node *node
+ = dyn_cast 
+ (lto_symtab_encoder_deref (encoder, index));
+  ipa_vr rvr;
+  rvr.streamer_read (ib, data_in);
+  if (node->prevailing_p ())
+   {
+ value_range tmp;
+ rvr.get_vrange (tmp);
+ ipa_record_return_value_range_1 (node, tmp);
+   }
+}
+}
+
 /* Read section in file FILE_DATA of length LEN with data DATA.  */
 
 static void
@@ -5516,6 +5606,7 @@ ipa_prop_read_section (struct lto_file_decl_data 
*file_data, const char *data,
   gcc_assert (node->definition);
   ipa_read_node_info (&ib_main, node, data_in);
 }
+  ipa_read_return_summaries (&ib_main, file_data, data_in);
   lto_free_section_data (file_data, LTO

[gcc r16-401] Fix i386 bootstrap on non-Windows platforms

2025-05-06 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:673d446894c063c92cafce9ba41340c82e960a11

commit r16-401-g673d446894c063c92cafce9ba41340c82e960a11
Author: Jan Hubicka 
Date:   Tue May 6 12:07:15 2025 +0200

Fix i386 bootstrap on non-Windows platforms

* config/i386/i386.cc (ix86_tls_index): Add ifdef.

Diff:
---
 gcc/config/i386/i386.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index f28c92a9d3aa..89f518c86b5e 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -12320,6 +12320,7 @@ get_thread_pointer (machine_mode tp_mode, bool to_reg)
 
 static GTY(()) rtx ix86_tls_index_symbol;
 
+#if TARGET_WIN32_TLS
 static rtx
 ix86_tls_index (void)
 {
@@ -12331,6 +12332,7 @@ ix86_tls_index (void)
   else
 return ix86_tls_index_symbol;
 }
+#endif
 
 /* Construct the SYMBOL_REF for the tls_get_addr function.  */


[gcc r16-358] Make ix86 cost of VEC_SELECT equivalent to SUBREG cost 1

2025-05-02 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:c85148d036d17295bb2560e10020c924c83a5d13

commit r16-358-gc85148d036d17295bb2560e10020c924c83a5d13
Author: Jan Hubicka 
Date:   Fri May 2 15:53:35 2025 +0200

Make ix86 cost of VEC_SELECT equivalent to SUBREG cost 1

This patch fixes a regression of imagick with PGO and AVX512 where
correcting the size cost of SSE operations (to be 4 instead of 2, originally
cut&pasted from x87) made late combine eliminate zero registers introduced
by rapd.  The problem is that the cost model mistakenly accounts VEC_SELECT
as a real instruction while it is optimized to nothing if src==dest (which
is the case in these testcases).  This register is used to eliminate a false
dependency between source and destination
of int->fp conversions.
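
Concretely, the emitted code looks roughly like this (illustrative):

	pxor	%xmm0, %xmm0		# zero register breaking the false
					# dependency on old %xmm0 contents
	cvtsi2ss	%eax, %xmm0	# int->fp; writes only the low element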

While the ix86_insn_cost hook already contains logic to increase the cost of
the zero-extend, the cost was not enough.

gcc/ChangeLog:

PR target/119900
* config/i386/i386.cc (ix86_can_change_mode_class): Add TODO
comment.
(ix86_rtx_costs): Make VEC_SELECT equivalent to SUBREG cost 1.

Diff:
---
 gcc/config/i386/i386.cc | 39 ---
 1 file changed, 36 insertions(+), 3 deletions(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index cb348cb9cfb8..0c808c22b4f0 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -20978,7 +20978,11 @@ ix86_can_change_mode_class (machine_mode from, 
machine_mode to,
 return true;
 
   /* x87 registers can't do subreg at all, as all values are reformatted
- to extended precision.  */
+ to extended precision.
+
+ ??? middle-end queries mode changes for ALL_REGS and this makes
+ vec_series_lowpart_p to always return false.  We probably should
+ restrict this to modes supported by i387 and check if it is enabled.  */
   if (MAYBE_FLOAT_CLASS_P (regclass))
 return false;
 
@@ -22756,13 +22760,41 @@ ix86_rtx_costs (rtx x, machine_mode mode, int 
outer_code_i, int opno,
}
   return false;
 
-case VEC_SELECT:
 case VEC_CONCAT:
   /* ??? Assume all of these vector manipulation patterns are
 recognizable.  In which case they all pretty much have the
-same cost.  */
+same cost.
+??? We should still recruse when computing cost.  */
  *total = cost->sse_op;
  return true;
+
+case VEC_SELECT:
+ /* Special case extracting lower part from the vector.
+   This by itself needs to code and most of SSE/AVX instructions have
+   packed and single forms where the single form may be represented
+   by such VEC_SELECT.
+
+   Use cost 1 (despite the fact that functionally equivalent SUBREG has
+   cost 0).  Making VEC_SELECT completely free, for example instructs CSE
+   to forward propagate VEC_SELECT into
+
+  (set (reg eax) (reg src))
+
+   which then prevents fwprop and combining. See i.e.
+   gcc.target/i386/pr91103-1.c.
+
+   ??? rtvec_series_p test should be, for valid patterns, equivalent to
+   vec_series_lowpart_p but is not, since the latter calls
+   can_cange_mode_class on ALL_REGS and this return false since x87 does
+   not support subregs at all.  */
+ if (rtvec_series_p (XVEC (XEXP (x, 1), 0), 0))
+   *total = rtx_cost (XEXP (x, 0), GET_MODE (XEXP (x, 0)),
+ outer_code, opno, speed) + 1;
+ else
+   /* ??? We should still recruse when computing cost.  */
+   *total = cost->sse_op;
+ return true;
+
 case VEC_DUPLICATE:
   *total = rtx_cost (XEXP (x, 0),
 GET_MODE (XEXP (x, 0)),
@@ -22780,6 +22812,7 @@ ix86_rtx_costs (rtx x, machine_mode mode, int 
outer_code_i, int opno,
   if (TARGET_AVX512F && register_operand (mask, GET_MODE (mask)))
*total = rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed);
   else
+   /* ??? We should still recruse when computing cost.  */
*total = cost->sse_op;
   return true;


[gcc r15-9496] Set znver5 issue rate to 4.

2025-04-15 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:4a01869b963520d689fe9242cf2ff24984ea41d8

commit r15-9496-g4a01869b963520d689fe9242cf2ff24984ea41d8
Author: Jan Hubicka 
Date:   Tue Apr 15 19:09:20 2025 +0200

Set znver5 issue rate to 4.

This patch sets the issue rate of znver5 to 4.  With the current model,
unless a reservation is missing, we will never issue more than 4
instructions per cycle, since that is the limit of the decoders and the
model does not take into account the fact that typically code is run from
the op cache.

gcc/ChangeLog:

* config/i386/x86-tune-sched.cc (ix86_issue_rate): Set
to 4 for znver5.

Diff:
---
 gcc/config/i386/x86-tune-sched.cc | 15 ---
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/gcc/config/i386/x86-tune-sched.cc 
b/gcc/config/i386/x86-tune-sched.cc
index 685a83c4311b..15d3d91a83b6 100644
--- a/gcc/config/i386/x86-tune-sched.cc
+++ b/gcc/config/i386/x86-tune-sched.cc
@@ -81,6 +81,14 @@ ix86_issue_rate (void)
 case PROCESSOR_YONGFENG:
 case PROCESSOR_SHIJIDADAO:
 case PROCESSOR_GENERIC:
+/* For znver5 decoder can handle 4 or 8 instructions per cycle,
+   op cache 12 instruction/cycle, dispatch 8 instructions
+   integer rename 8 instructions and Fp 6 instructions.
+
+   The scheduler, without understanding out of order nature of the CPU
+   is not going to be able to use more than 4 instructions since that
+   is limits of the decoders.  */
+case PROCESSOR_ZNVER5:
   return 4;
 
 case PROCESSOR_ICELAKE_CLIENT:
@@ -91,13 +99,6 @@ ix86_issue_rate (void)
   return 5;
 
 case PROCESSOR_SAPPHIRERAPIDS:
-/* For znver5 decoder can handle 4 or 8 instructions per cycle,
-   op cache 12 instruction/cycle, dispatch 8 instructions
-   integer rename 8 instructions and Fp 6 instructions.
-
-   The scheduler, without understanding out of order nature of the CPU
-   is unlikely going to be able to fill all of these.  */
-case PROCESSOR_ZNVER5:
   return 6;
 
 default:


[gcc r15-9495] Set ADDSS cost to 3 for znver5

2025-04-15 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:e2011ab13de3e70774f869b356f5f9c750780b34

commit r15-9495-ge2011ab13de3e70774f869b356f5f9c750780b34
Author: Jan Hubicka 
Date:   Tue Apr 15 19:04:15 2025 +0200

Set ADDSS cost to 3 for znver5

Znver5 has an addss latency of 2 in the typical case while all earlier
versions have latency 3.
Unfortunately the addss cost is used to cost many other SSE instructions
than just addss, and setting the cost to 2 makes us vectorize 4 64bit stores
into one 256bit store, which in turn regresses imagemagick.

This patch sets the cost back to 3.  Next stage1 we can untie addss from the
other operations and set it correctly.

bootstrapped/regtested x86_64-linux and also benchmarked on SPEC2k17

gcc/ChangeLog:

PR target/119298
* config/i386/x86-tune-costs.h (znver5_cost): Set ADDSS cost to 3.

Diff:
---
 gcc/config/i386/x86-tune-costs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index 7c8cb738d7cd..9477345bdd7e 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -2120,7 +2120,7 @@ struct processor_costs znver5_cost = {
   COSTS_N_INSNS (1),   /* cost of cheap SSE instruction.  */
   /* ADDSS has throughput 2 and latency 2
  (in some cases when source is another addition).  */
-  COSTS_N_INSNS (2),   /* cost of ADDSS/SD SUBSS/SD insns.  */
+  COSTS_N_INSNS (3),   /* cost of ADDSS/SD SUBSS/SD insns.  */
   /* MULSS has throughput 2 and latency 3.  */
   COSTS_N_INSNS (3),   /* cost of MULSS instruction.  */
   COSTS_N_INSNS (3),   /* cost of MULSD instruction.  */


[gcc r16-162] Fix i386 vectorizer cost of FP scalar MAX_EXPR and MIN_EXPR

2025-04-26 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:1d635e79b3c2d26f864964b79717132bffbcad20

commit r16-162-g1d635e79b3c2d26f864964b79717132bffbcad20
Author: Jan Hubicka 
Date:   Sat Apr 26 22:10:19 2025 +0200

Fix i386 vectorizer cost of FP scalar MAX_EXPR and MIN_EXPR

I introduced a bug with last-minute cleanups unifying the scalar and vector
SSE conditional.
This patch fixes it and restores the cost of 1 for SSE scalar MIN/MAX.

Bootstrapped/regtested x86_64-linux, committed.

gcc/ChangeLog:

PR target/105275
* config/i386/i386.cc (ix86_vector_costs::add_stmt_cost): Fix cost 
of FP scalar
MAX_EXPR and MIN_EXPR

Diff:
---
 gcc/config/i386/i386.cc | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 78df3d9525ae..3171d6e0ad45 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -25420,7 +25420,8 @@ ix86_vector_costs::add_stmt_cost (int count, 
vect_cost_for_stmt kind,
case MAX_EXPR:
  if (fp)
{
- if (X87_FLOAT_MODE_P (mode))
+ if (X87_FLOAT_MODE_P (mode)
+ && !SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode))
/* x87 requires conditional branch.  We don't have cost for
   that.  */
;
@@ -25457,7 +25458,8 @@ ix86_vector_costs::add_stmt_cost (int count, 
vect_cost_for_stmt kind,
case ABSU_EXPR:
  if (fp)
{
- if (X87_FLOAT_MODE_P (mode))
+ if (X87_FLOAT_MODE_P (mode)
+ && !SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode))
/* fabs.  */
stmt_cost = ix86_cost->fabs;
  else


[gcc r16-291] Fix cs_interesting_for_ipcp_p wrt flag_profile_partial_training.

2025-04-29 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:1efd525e828d5c30d0962d05ba81f9d7c8176ca7

commit r16-291-g1efd525e828d5c30d0962d05ba81f9d7c8176ca7
Author: Jan Hubicka 
Date:   Tue Apr 29 22:43:45 2025 +0200

Fix cs_interesting_for_ipcp_p wrt flag_profile_partial_training.

As noticed by Martin Jambor, I introduced a bug while simplifying
cs_interesting_for_ipcp_p and reversed the condition for
flag_profile_partial_training.  Also I noticed that we probably want to
consider calls with uninitialized counts for cloning so the pass does
something with -fno-guess-branch-probability, even though this is probably
not very useful in practice.

gcc/ChangeLog:

* ipa-cp.cc (cs_interesting_for_ipcp_p): Fix handling of 
uninitialized
counts and 0 IPA cost wrt flag_profile_partial_training.

Diff:
---
 gcc/ipa-cp.cc | 10 --
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/gcc/ipa-cp.cc b/gcc/ipa-cp.cc
index b4b96997d750..f7e5aa9bfd5c 100644
--- a/gcc/ipa-cp.cc
+++ b/gcc/ipa-cp.cc
@@ -545,14 +545,12 @@ cs_interesting_for_ipcp_p (cgraph_edge *e)
 return true;
   /* If local (possibly guseed or adjusted 0 profile) claims edge is
  not executed, do not propagate.  */
-  if (!e->count.nonzero_p ())
+  if (e->count.initialized_p () && !e->count.nonzero_p ())
 return false;
-  /* If IPA profile says edge is executed zero times, but zero
- is quality is ADJUSTED, still consider it for cloning in
- case we have partial training.  */
+  /* If we have zero IPA profile, still consider edge for cloning
+ in case we do partial training.  */
   if (e->count.ipa ().initialized_p ()
-  && opt_for_fn (e->callee->decl,flag_profile_partial_training)
-  && e->count.nonzero_p ())
+  && !opt_for_fn (e->callee->decl,flag_profile_partial_training))
 return false;
   return true;
 }


[gcc r16-367] Improve ix86 VEC_MERGE costs

2025-05-03 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:20d184e3f84d859e7e9f44a8d91772a02b658872

commit r16-367-g20d184e3f84d859e7e9f44a8d91772a02b658872
Author: Jan Hubicka 
Date:   Sat May 3 00:26:29 2025 +0200

Improve ix86 VEC_MERGE costs

ix86_rtx_costs handled VEC_MERGE by special-casing AVX512 mask operations
and otherwise returning cost->sse_op, completely ignoring the costs of the
operands.  Since VEC_MERGE is also used to represent the scalar variant of
an SSE/AVX operation, this means that many instructions (such as SSE
conversions) were often costed as sse_op instead of their real cost.

This patch adds pattern matching for the VEC_MERGE pattern, which also
forced me to add special cases for masked versions and vcmp, since otherwise
combine is confused by the default cost compared to the cost of the
recognized version of the instruction.

Since the important cases should now be handled, I also added recursion to
the remaining cases so substituting constants and memory is adequately
costed.

gcc/ChangeLog:

* config/i386/i386.cc (unspec_pcmp_p): New function.
(ix86_rtx_costs): Cost VEC_MERGE more realistically.

Diff:
---
 gcc/config/i386/i386.cc | 82 ++---
 1 file changed, 77 insertions(+), 5 deletions(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 0c808c22b4f0..5ad47e194348 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -22025,6 +22025,15 @@ vec_fp_conversion_cost (const struct processor_costs 
*cost, int size)
 return cost->vcvtps2pd512;
 }
 
+/* Return true of X is UNSPEC with UNSPEC_PCMP or UNSPEC_UNSIGNED_PCMP.  */
+
+static bool
+unspec_pcmp_p (rtx x)
+{
+  return GET_CODE (x) == UNSPEC
+&& (XINT (x, 1) == UNSPEC_PCMP || XINT (x, 1) == UNSPEC_UNSIGNED_PCMP);
+}
+
 /* Compute a (partial) cost for rtx X.  Return true if the complete
cost has been computed, and false if subexpressions should be
scanned.  In either case, *TOTAL contains the cost result.  */
@@ -22807,14 +22816,77 @@ ix86_rtx_costs (rtx x, machine_mode mode, int 
outer_code_i, int opno,
 
 case VEC_MERGE:
   mask = XEXP (x, 2);
+  /* Scalar versions of SSE instructions may be represented as:
+
+(vec_merge (vec_duplicate (operation ))
+(register or memory)
+(const_int 1))
+
+In this case vec_merge and vec_duplicate is for free.
+Just recurse into operation and second operand.  */
+  if (mask == const1_rtx
+ && GET_CODE (XEXP (x, 0)) == VEC_DUPLICATE)
+   {
+ *total = rtx_cost (XEXP (XEXP (x, 0), 0), mode,
+outer_code, opno, speed)
+  + rtx_cost (XEXP (x, 1), mode, outer_code, opno, speed);
+ return true;
+   }
   /* This is masked instruction, assume the same cost,
 as nonmasked variant.  */
-  if (TARGET_AVX512F && register_operand (mask, GET_MODE (mask)))
-   *total = rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed);
+  else if (TARGET_AVX512F && register_operand (mask, GET_MODE (mask)))
+   {
+ *total = rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed)
+  + rtx_cost (XEXP (x, 1), mode, outer_code, opno, speed);
+ return true;
+   }
+  /* Combination of the two above:
+
+(vec_merge (vec_merge (vec_duplicate (operation ...))
+  (register or memory)
+  (reg:QI mask))
+   (register or memory)
+   (const_int 1))
+
+i.e. avx512fp16_vcvtss2sh_mask.  */
+  else if (TARGET_AVX512F
+  && mask == const1_rtx
+  && GET_CODE (XEXP (x, 0)) == VEC_MERGE
+  && GET_CODE (XEXP (XEXP (x, 0), 0)) == VEC_DUPLICATE
+  && register_operand (XEXP (XEXP (x, 0), 2),
+   GET_MODE (XEXP (XEXP (x, 0), 2
+   {
+ *total = rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0),
+mode, outer_code, opno, speed)
+  + rtx_cost (XEXP (XEXP (x, 0), 1),
+  mode, outer_code, opno, speed)
+  + rtx_cost (XEXP (x, 1), mode, outer_code, opno, speed);
+ return true;
+   }
+  /* vcmp.  */
+  else if (unspec_pcmp_p (mask)
+  || (GET_CODE (mask) == NOT
+  && unspec_pcmp_p (XEXP (mask, 0
+   {
+ rtx uns = GET_CODE (mask) == NOT ? XEXP (mask, 0) : mask;
+ rtx unsop0 = XVECEXP (uns, 0, 0);
+ /* Make (subreg:V4SI (not:V16QI (reg:V16QI ..)) 0)
+cost the same as register.
+This is used by avx_cmp3_ltint_not.  */
+ if (GET_CODE (unsop0) == SUBREG)
+   unsop0 = XEXP (unsop0, 0);
+ if (GET_CODE (unsop0) == NOT)
+   unsop0 = XEXP (unsop0, 0);
+ *total = rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed)

[gcc r16-372] Improve maybe_hot handling in inliner heuristics

2025-05-04 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:064cac730f88dc71c6da578f9ae5b8e092ab6cd4

commit r16-372-g064cac730f88dc71c6da578f9ae5b8e092ab6cd4
Author: Jan Hubicka 
Date:   Sun May 4 10:52:35 2025 +0200

Improve maybe_hot handling in inliner heuristics

The inliner currently applies different heuristics to hot and cold calls
(the latter are inlined only if the code size will shrink).  It may happen
that the call itself is hot, but the significant time is spent in the callee
and inlining makes it faster.  For this reason we want to check if the
anticipated speedup is considered hot, which is done by this patch (that is
similar to my earlier ipa-cp change).

In general I think this is less important compared to the ipa-cp change,
since a large benefit from inlining happens only when something useful is
propagated into the callee, which should be handled earlier by ipa-cp.
However the patch improves SPEC2k17 imagick runtime by about 9% as discussed
in PR 119900, though it is mostly a problem of a bad train data set which
does not train well parts of the program that are hot for the ref data set.
As discussed in the PR log, the particular call that needs to be inlined has
a count that falls very slightly below the cutoff, and scaling it up by the
expected savings enables inlining.

gcc/ChangeLog:

PR target/119900
* cgraph.cc (cgraph_edge::maybe_hot_p): Add
a variant accepting a sreal scale; use reliability of
profile.
* cgraph.h (cgraph_edge::maybe_hot_p): Declare
a variant accepting a sreal scale.
* ipa-inline.cc (callee_speedup): New function.
(want_inline_small_function_p): add early return
and avoid duplicated lookup of summaries; use scaled
maybe_hot predicate.

Diff:
---
 gcc/cgraph.cc | 42 ++
 gcc/cgraph.h  |  9 +++--
 gcc/ipa-inline.cc | 41 -
 3 files changed, 73 insertions(+), 19 deletions(-)

diff --git a/gcc/cgraph.cc b/gcc/cgraph.cc
index 6ae6a97f6f56..1a2ec38374ab 100644
--- a/gcc/cgraph.cc
+++ b/gcc/cgraph.cc
@@ -2984,13 +2984,22 @@ cgraph_edge::cannot_lead_to_return_p (void)
 return callee->cannot_return_p ();
 }
 
-/* Return true if the edge may be considered hot.  */
+/* Return true if the edge after scaling it profile by SCALE
+   may be considered hot.  */
 
 bool
-cgraph_edge::maybe_hot_p (void)
+cgraph_edge::maybe_hot_p (sreal scale)
 {
-  if (!maybe_hot_count_p (NULL, count.ipa ()))
+  /* Never consider calls in functions optimized for size hot.  */
+  if (opt_for_fn (caller->decl, optimize_size))
 return false;
+
+  /* If reliable IPA count is available, just use it.  */
+  profile_count c = count.ipa ();
+  if (c.reliable_p ())
+return maybe_hot_count_p (NULL, c * scale);
+
+  /* See if we can determine hotness using caller frequency.  */
   if (caller->frequency == NODE_FREQUENCY_UNLIKELY_EXECUTED
   || (callee
  && callee->frequency == NODE_FREQUENCY_UNLIKELY_EXECUTED))
@@ -2999,25 +3008,42 @@ cgraph_edge::maybe_hot_p (void)
   && (callee
  && callee->frequency <= NODE_FREQUENCY_EXECUTED_ONCE))
 return false;
-  if (opt_for_fn (caller->decl, optimize_size))
-return false;
+  /* ??? This may make sense for hot functions determined by
+ user attribute, but if function is hot by profile, it may
+ contains non-hot calls.  In most practical cases this case
+ is handled by the reliable ipa count above, but i.e. after
+ inlining function with no profile to function with profile
+ we get here.. */
   if (caller->frequency == NODE_FREQUENCY_HOT)
 return true;
+
+  /* Use IPA count and if it s not available appy local heuristics.  */
+  if (c.initialized_p ())
+return maybe_hot_count_p (NULL, c * scale);
   if (!count.initialized_p ())
 return true;
   cgraph_node *where = caller->inlined_to ? caller->inlined_to : caller;
   if (!where->count.initialized_p ())
-return false;
+return true;
+  c = count * scale;
   if (caller->frequency == NODE_FREQUENCY_EXECUTED_ONCE)
 {
-  if (count * 2 < where->count * 3)
+  if (c * 2 < where->count * 3)
return false;
 }
-  else if (count * param_hot_bb_frequency_fraction < where->count)
+  else if (c * param_hot_bb_frequency_fraction < where->count)
 return false;
   return true;
 }
 
+/* Return true if the edge may be considered hot.  */
+
+bool
+cgraph_edge::maybe_hot_p ()
+{
+  return maybe_hot_p (1);
+}
+
 /* Worker for cgraph_can_remove_if_no_direct_calls_p.  */
 
 static bool
diff --git a/gcc/cgraph.h b/gcc/cgraph.h
index abde770ba2b3..f7b67ed0a6c5 100644
--- a/gcc/cgraph.h
+++ b/gcc/cgraph.h
@@ -1872,8 +1872,13 @@ public:
   /* Return true when the edge represents a direct recursion.  */
   bool recursive_p (void);
 
-  /* Return true if the edge may be considered hot.  */
-  bool maybe_hot_p (void);
+  /* 

[gcc r16-39] Add tables for SSE fp conversion costs

2025-04-19 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:f6859fb621179ec9bf5631eb8902619ab8d4467b

commit r16-39-gf6859fb621179ec9bf5631eb8902619ab8d4467b
Author: Jan Hubicka 
Date:   Sat Apr 19 18:51:27 2025 +0200

Add tables for SSE fp conversion costs

As discussed, I will proceed adding costs for common SSE operations which are
currently globbed into addss cost, so we do not need to set it incorrectly for
znver5.  Looking through the stats, there are quite a few missing cases, so I
am starting with those that I think are more common.  I plan to do it in
smaller steps so individual changes get benchmarked by LNT and can also be
bisected.

This patch adds costs for various SSE and AVX FP->FP conversions (extensions
and truncations).  Looking through Agner Fog's tables, these are a bit
asymmetric, so I added a cost for CVTSS2SD which is also used for CVTSD2SS,
CVTPS2PD and CVTPD2PS, a cost for 256bit VCVTPS2PD (also used for the
opposite direction) and a cost for the 512bit one.
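
A hedged reading of how the new helper in the diff below picks an entry,
by conversion width in bits:

    /* size <  128  -> cvtss2sd (scalar conversions)
       size == 128  -> cvtss2sd, doubled on TARGET_SSE_SPLIT_REGS targets
       size == 256  -> vcvtps2pd256
       size == 512  -> vcvtps2pd512  */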

I plan to add int->int conversions next and then int->fp & fp->int which are
more tricky since they may bundle inter-unit move.

I also noticed that size tables are wrong for all SSE instructions, so I
updated them.  With some love I think vectorization can work as a size
optimization too, but we need more work on that.

Those values I can find in Agner Fog's tables are taken from there; others
are guesses (especially for yongfeng_cost and shijidadao_cost).

gcc/ChangeLog:

* config/i386/i386.cc (vec_fp_conversion_cost): New function.
(ix86_rtx_costs): Use it for SSE/AVX FP conversions.
(ix86_builtin_vectorization_cost): Fix indentation;
and use vec_fp_conversion_cost in vec_promote_demote.
(fp_conversion_stmt_cost): New function.
(ix86_vector_costs::add_stmt_cost): Use it to cost NOP_EXPR
and vec_promote_demote.
* config/i386/i386.h (struct processor_costs):
* config/i386/x86-tune-costs.h (struct processor_costs):

Diff:
---
 gcc/config/i386/i386.cc  |  64 -
 gcc/config/i386/i386.h   |   6 ++
 gcc/config/i386/x86-tune-costs.h | 121 +++
 3 files changed, 178 insertions(+), 13 deletions(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 38df84f7db24..28603c2943ee 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -100,6 +100,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "i386-features.h"
 #include "function-abi.h"
 #include "rtl-error.h"
+#include "gimple-pretty-print.h"
 
 /* This file should be included last.  */
 #include "target-def.h"
@@ -21816,6 +21817,25 @@ ix86_insn_cost (rtx_insn *insn, bool speed)
   return insn_cost + pattern_cost (PATTERN (insn), speed);
 }
 
+/* Return cost of SSE/AVX FP->FP conversion (extensions and truncates).  */
+
+static int
+vec_fp_conversion_cost (const struct processor_costs *cost, int size)
+{
+  if (size < 128)
+return cost->cvtss2sd;
+  else if (size < 256)
+{
+  if (TARGET_SSE_SPLIT_REGS)
+   return cost->cvtss2sd * size / 64;
+  return cost->cvtss2sd;
+}
+  if (size < 512)
+return cost->vcvtps2pd256;
+  else
+return cost->vcvtps2pd512;
+}
+
 /* Compute a (partial) cost for rtx X.  Return true if the complete
cost has been computed, and false if subexpressions should be
scanned.  In either case, *TOTAL contains the cost result.  */
@@ -22479,17 +22499,18 @@ ix86_rtx_costs (rtx x, machine_mode mode, int 
outer_code_i, int opno,
   return false;
 
 case FLOAT_EXTEND:
+  /* x87 represents all values extended to 80bit.  */
   if (!SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode))
*total = 0;
   else
-*total = ix86_vec_cost (mode, cost->addss);
+   *total = vec_fp_conversion_cost (cost, GET_MODE_BITSIZE (mode));
   return false;
 
 case FLOAT_TRUNCATE:
   if (!SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode))
*total = cost->fadd;
   else
-*total = ix86_vec_cost (mode, cost->addss);
+   *total = vec_fp_conversion_cost (cost, GET_MODE_BITSIZE (mode));
   return false;
 
 case ABS:
@@ -24683,7 +24704,7 @@ ix86_builtin_vectorization_cost (enum 
vect_cost_for_stmt type_of_cost,
   switch (type_of_cost)
 {
   case scalar_stmt:
-return fp ? ix86_cost->addss : COSTS_N_INSNS (1);
+   return fp ? ix86_cost->addss : COSTS_N_INSNS (1);
 
   case scalar_load:
/* load/store costs are relative to register move which is 2. Recompute
@@ -24754,7 +24775,11 @@ ix86_builtin_vectorization_cost (enum 
vect_cost_for_stmt type_of_cost,
 return ix86_cost->cond_not_taken_branch_cost;
 
   case vec_perm:
+   return ix86_vec_cost (mode, ix86_cost->sse_op);
+
   case vec_promote_demote:
+   if (fp)
+ return vec_fp_conversion_cost (ix86_tune_cost, mode);
  

[gcc r16-54] Fix cost of vectorized double->float conversion

2025-04-21 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:0907a810f586b07636cc5b83dba6025eb5240655

commit r16-54-g0907a810f586b07636cc5b83dba6025eb5240655
Author: Jan Hubicka 
Date:   Mon Apr 21 20:16:50 2025 +0200

Fix cost of vectorized double->float conversion

In the previous patch I miscomputed costs of the cvtpd2ps instruction,
which mistakenly gets accounted as 2 (VEC_PACK_TRUNC_EXPR).
The vectorizer can produce both, but when producing VEC_PACK_TRUNC_EXPR
it uses the promote_demote path.  This patch thus simplifies
handling of NOP_EXPR since in that case we should always be producing
only one instruction.
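
A hedged worked example of the difference: packing V4DF operands into a
V8SF result via VEC_PACK_TRUNC_EXPR has inner size 64 and outer size 32,
so the old helper charged

    int n = inner_size / outer_size;          /* 64 / 32 == 2 */
    stmt_cost = conv * n + (n - 1) * sse_op;  /* two cvtpd2ps + one pack */

which is right for the pack but wrong for a plain NOP_EXPR conversion,
where a single instruction is produced and the cost should be just conv.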

PR target/119879
* config/i386/i386.cc (fp_conversion_stmt_cost): Inline to ...
(ix86_vector_costs::add_stmt_cost): ... here; fix handling of 
NOP_EXPR.

Diff:
---
 gcc/config/i386/i386.cc | 51 +
 1 file changed, 22 insertions(+), 29 deletions(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 28603c2943ee..d15f91ddd2cb 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -25257,32 +25257,6 @@ ix86_vectorize_create_costs (vec_info *vinfo, bool 
costing_for_scalar)
   return new ix86_vector_costs (vinfo, costing_for_scalar);
 }
 
-/* Return cost of statement doing FP conversion.  */
-
-static unsigned
-fp_conversion_stmt_cost (machine_mode mode, gimple *stmt, bool scalar_p)
-{
-  int outer_size
-= tree_to_uhwi
-   (TYPE_SIZE
-   (TREE_TYPE (gimple_assign_lhs (stmt))));
-  int inner_size
-= tree_to_uhwi
-   (TYPE_SIZE
-   (TREE_TYPE (gimple_assign_rhs1 (stmt))));
-  int stmt_cost = vec_fp_conversion_cost
-   (ix86_tune_cost, GET_MODE_BITSIZE (mode));
-  /* VEC_PACK_TRUNC_EXPR: If inner size is greater than outer size we will end
- up doing two conversions and packing them.  */
-  if (!scalar_p && inner_size > outer_size)
-{
-  int n = inner_size / outer_size;
-  stmt_cost = stmt_cost * n
- + (n - 1) * ix86_vec_cost (mode, ix86_cost->sse_op);
-}
-  return stmt_cost;
-}
-
 unsigned
 ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
  stmt_vec_info stmt_info, slp_tree node,
@@ -25394,8 +25368,8 @@ ix86_vector_costs::add_stmt_cost (int count, 
vect_cost_for_stmt kind,
 TREE_TYPE (gimple_assign_rhs1 (stmt_info->stmt
stmt_cost = 0;
  else if (fp)
-   stmt_cost = fp_conversion_stmt_cost (mode, stmt_info->stmt,
-scalar_p);
+   stmt_cost = vec_fp_conversion_cost
+ (ix86_tune_cost, GET_MODE_BITSIZE (mode));
  break;
 
case BIT_IOR_EXPR:
@@ -25439,7 +25413,26 @@ ix86_vector_costs::add_stmt_cost (int count, 
vect_cost_for_stmt kind,
 
   if (kind == vec_promote_demote
   && fp && FLOAT_TYPE_P (TREE_TYPE (gimple_assign_rhs1 (stmt_info->stmt))))
-stmt_cost = fp_conversion_stmt_cost (mode, stmt_info->stmt, scalar_p);
+{
+  int outer_size
+   = tree_to_uhwi
+   (TYPE_SIZE
+   (TREE_TYPE (gimple_assign_lhs (stmt_info->stmt))));
+  int inner_size
+   = tree_to_uhwi
+   (TYPE_SIZE
+   (TREE_TYPE (gimple_assign_rhs1 (stmt_info->stmt))));
+  int stmt_cost = vec_fp_conversion_cost
+   (ix86_tune_cost, GET_MODE_BITSIZE (mode));
+  /* VEC_PACK_TRUNC_EXPR: If inner size is greater than outer size we will end
+up doing two conversions and packing them.  */
+  if (inner_size > outer_size)
+   {
+ int n = inner_size / outer_size;
+ stmt_cost = stmt_cost * n
+ + (n - 1) * ix86_vec_cost (mode, ix86_cost->sse_op);
+   }
+}
 
   /* If we do elementwise loads into a vector then we are bound by
  latency and execution resources for the many scalar loads


[gcc r15-7813] Break false dependency chain on Zen5

2025-03-04 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:8c4a00f9a48f1b2af10448c9f2058b44b8cb7234

commit r15-7813-g8c4a00f9a48f1b2af10448c9f2058b44b8cb7234
Author: Jan Hubicka 
Date:   Tue Mar 4 16:22:01 2025 +0100

Break false dependency chain on Zen5

Zen5 on some variants has false dependency on tzcnt, blsi, blsr and blsmsk
instructions.  Those can be tested by the following benchmark

jh@shroud:~> cat ee.c
int
main()
{
   int a = 10;
   int b = 0;
   for (int i = 0; i < 10; i++)
   {
   asm volatile ("xor %0, %0": "=r" (b));
   asm volatile (INST " %2, %0": "=r"(b): "0"(b),"r"(a));
   asm volatile (INST " %2, %0": "=r"(b): "0"(b),"r"(a));
   asm volatile (INST " %2, %0": "=r"(b): "0"(b),"r"(a));
   asm volatile (INST " %2, %0": "=r"(b): "0"(b),"r"(a));
   asm volatile (INST " %2, %0": "=r"(b): "0"(b),"r"(a));
   asm volatile (INST " %2, %0": "=r"(b): "0"(b),"r"(a));
   asm volatile (INST " %2, %0": "=r"(b): "0"(b),"r"(a));
   asm volatile (INST " %2, %0": "=r"(b): "0"(b),"r"(a));
   asm volatile (INST " %2, %0": "=r"(b): "0"(b),"r"(a));
   asm volatile (INST " %2, %0": "=r"(b): "0"(b),"r"(a));
   }
   return 0;
}
jh@shroud:~> cat bmk.sh
gcc ee.c -DBREAK -DINST=\"$1\" -O2 ; time ./a.out ; gcc ee.c -DINST=\"$1\" 
-O2 ; time ./a.out
jh@shroud:~> sh bmk.sh tzcnt

real0m0.886s
user0m0.886s
sys 0m0.000s

real0m0.886s
user0m0.886s
sys 0m0.000s

jh@shroud:~> sh bmk.sh blsi

real0m0.979s
user0m0.979s
sys 0m0.000s

real0m2.418s
user0m2.418s
sys 0m0.000s

jh@shroud:~> sh bmk.sh blsr

real0m0.986s
user0m0.986s
sys 0m0.000s

real0m2.422s
user0m2.421s
sys 0m0.000s
jh@shroud:~> sh bmk.sh blsmsk

real0m0.973s
user0m0.973s
sys 0m0.000s

real0m2.422s
user0m2.422s
sys 0m0.000s

We already have a tunable that controls tzcnt together with lzcnt and popcnt.
Since it seems that only tzcnt is affected, I added a new tunable to control
tzcnt only.  I also added splitters for blsi/blsr/blsmsk implemented
analogously to the existing splitter for lzcnt.

The patch is neutral on SPEC.  We produce blsi and blsr in some internal
loops, but they usually have the same destination as source.  However it is
good to break the dependency chain to avoid pathological cases, and it is
quite cheap overall, so I think we want to enable this for generic.  I will
send a followup patch for this.

Bootstrapped/regtested x86_64-linux, will commit it shortly.

gcc/ChangeLog:

* config/i386/i386.h (TARGET_AVOID_FALSE_DEP_FOR_TZCNT): New macro.
(TARGET_AVOID_FALSE_DEP_FOR_BLS): New macro.
* config/i386/i386.md (*bmi_blsi_): Add splitter for false
dependency.
(*bmi_blsi__ccno): Add splitter for false dependency.
(*bmi_blsi__falsedep): New pattern.
(*bmi_blsmsk_): Add splitter for false dependency.
(*bmi_blsmsk__falsedep): New pattern.
(*bmi_blsr_): Add splitter for false dependency.
(*bmi_blsr__cmp): Add splitter for false dependency
(*bmi_blsr__cmp_falsedep): New pattern.
* config/i386/x86-tune.def (X86_TUNE_AVOID_FALSE_DEP_FOR_TZCNT): 
New tune.
(X86_TUNE_AVOID_FALSE_DEP_FOR_BLS): New tune.

gcc/testsuite/ChangeLog:

* gcc.target/i386/blsi.c: New test.
* gcc.target/i386/blsmsk.c: New test.
* gcc.target/i386/blsr.c: New test.

Diff:
---
 gcc/config/i386/i386.h |   4 +
 gcc/config/i386/i386.md| 168 +++--
 gcc/config/i386/x86-tune.def   |  10 ++
 gcc/testsuite/gcc.target/i386/blsi.c   |  26 +
 gcc/testsuite/gcc.target/i386/blsmsk.c |   9 ++
 gcc/testsuite/gcc.target/i386/blsr.c   |  26 +
 6 files changed, 233 insertions(+), 10 deletions(-)

diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 2696bfb3a81e..ce29c272bc0b 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -461,6 +461,10 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
 ix86_tune_features[X86_TUNE_ADJUST_UNROLL]
 #define TARGET_AVOID_FALSE_DEP_FOR_BMI \
ix86_tune_features[X86_TUNE_AVOID_FALSE_DEP_FOR_BMI]
+#define TARGET_AVOID_FALSE_DEP_FOR_TZCNT \
+   ix86_tune_features[X86_TUNE_AVOID_FALSE_DEP_FOR_TZCNT]
+#define TARGET_AVOID_FALSE_DEP_FOR_BLS \
+   ix86_tune_features[X86_TUNE_AVOID_FALSE_DEP_FOR_BLS]
 #define TARGET_ONE_IF_CONV_INSN \
ix86_tune_features[X86_TUNE_ONE_IF_CONV_INSN]
 #define TARGET_AVOID_MFENCE ix86_t

[gcc r15-7811] Make ix86_macro_fusion_pair_p and ix86_fuse_mov_alu_p match current CPUs

2025-03-04 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:c84be624e079cd748df93a3dc0b5168865fefee9

commit r15-7811-gc84be624e079cd748df93a3dc0b5168865fefee9
Author: Jan Hubicka 
Date:   Mon Mar 3 19:12:20 2025 +0100

Make ix86_macro_fusion_pair_p and ix86_fuse_mov_alu_p match current CPUs

The current implementation of fusion predicates misses some common
fusion cases on zen and more recent cores.  I added knobs for
individual conditionals we test; see the examples after the list below.

 1) I split checks for fusing ALU with conditional operands when the ALU
 has a memory operand.  This seems to be supported by zen3+ and by
 tigerlake and cooperlake (according to Agner Fog's manual).

 2) znver4 and 5 support fusion of ALU and conditional even if the ALU has
memory and immediate operands.
This seems to be relatively important, enabling 25% more fusions on
gcc bootstrap.

 3) no CPU supports fusing when the ALU contains IP relative memory
references.  I added a separate knob so we do not forget about this if
this gets supported later.

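Hedged examples of the three shapes above (AT&T syntax, illustrative
operands; none of these lines come from the patch):

    /* 1) ALU with memory operand + jcc:
          addl 8(%rsp), %eax ; jne .L1
       2) ALU with memory and immediate operands + jcc:
          addl $1, 8(%rsp)   ; jne .L1
       3) ALU with IP relative memory + jcc (fuses on no known CPU):
          addl v(%rip), %eax ; jne .L1  */
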
The patch does not solve the limitation of sched that fused pairs must be
adjacent on input and the first operation must be single-set.  Fixing
single-set is easy (I have a separate patch for this); for non-adjacent
pairs we need bigger surgery.

To verify what the CPU really does I made a simple test script.

jh@ryzen3:~> cat fuse-test.c
int b;
const int z = 0;
const int o = 1;
int
main()
{
int a = 10;
int b;
int z = 0;
int o = 1;
asm volatile ("\n"
".L1234:\n"
"nop\n"
"subl   %3, %0\n"

"movl %0, %1\n"
"cmpl %2, %1\n"
"movl %0, %1\n"
"test %1, %1\n"

"nop\n"
"jne.L1234":"=a"(a),
"=m"(b)
"=r"(b)
:
"m"(z),
"m"(o),
"i"(0),
"i"(1),
"0"(a)
);
}
jh@ryzen3:~> cat fuse-test.sh
EVENT=ex_ret_fused_instr
dotest()
{
gcc -O2  fuse-test.c $* -o fuse-cmp-imm-mem-nofuse
perf stat -e $EVENT ./fuse-cmp-imm-mem-nofuse  2>&1 | grep $EVENT
gcc -O2 fuse-test.c -DFUSE $* -o fuse-cmp-imm-mem-fuse
perf stat  -e $EVENT ./fuse-cmp-imm-mem-fuse 2>&1 | grep $EVENT
}

echo ALU with immediate
dotest
echo ALU with memory
dotest -D MEM
echo ALU with IP relative memory
dotest -D MEM -D IPRELATIVE
echo CMP with immediate
dotest -D CMP
echo CMP with memory
dotest -D CMP -D MEM
echo CMP with memory and immediate
dotest -D CMP -D MEMIMM
echo CMP with IP relative memory
dotest -D CMP -D MEM -D IPRELATIVE
echo TEST
dotest -D TEST

On zen5 I get:
ALU with immediate
20,345  ex_ret_fused_instr:u
 1,000,020,278  ex_ret_fused_instr:u
ALU with memory
20,367  ex_ret_fused_instr:u
 1,000,020,290  ex_ret_fused_instr:u
ALU with IP relative memory
20,395  ex_ret_fused_instr:u
20,403  ex_ret_fused_instr:u
CMP with immediate
20,369  ex_ret_fused_instr:u
 1,000,020,301  ex_ret_fused_instr:u
CMP with memory
20,314  ex_ret_fused_instr:u
 1,000,020,341  ex_ret_fused_instr:u
CMP with memory and immediate
20,372  ex_ret_fused_instr:u
 1,000,020,266  ex_ret_fused_instr:u
CMP with IP relative memory
20,382  ex_ret_fused_instr:u
20,369  ex_ret_fused_instr:u
TEST
20,346  ex_ret_fused_instr:u
 1,000,020,301  ex_ret_fused_instr:u

IP relative memory seems to not be documented.

On zen3/4 I get:

ALU with immediate
20,263  ex_ret_fused_instr:u
 1,000,020,051  ex_ret_fused_instr:u
ALU with memory
20,255  ex_ret_fused_instr:u
 1,000,020,056  ex_ret_fused_instr:u
ALU with IP relative memory
20,253  ex_ret_fused_instr:u
20,266  ex_ret_fused_instr:u
CMP with immediate
20,264  ex_ret_fused_instr:u
 1,000,020,052  ex_ret_fused_instr:u
CMP with memory
20,253  ex_ret_fused_instr:u
 1,000,019,794  ex_ret_fused_instr:u
CMP with memory and immediate
20,260  ex_ret_fused_instr:u
20,264  ex_ret_fused_instr:u
CMP with IP relative memory
20,258  ex_ret_fused_instr:u
20,256  ex_ret_fused_instr:u
TEST
20,261  ex_ret_fused_instr:u
 1,000,020,048  ex_ret_fused_instr:u

zen1 and 2 get:

ALU with immediate
2

[gcc r15-8041] Fix speculation_useful_p

2025-03-13 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:57dbbdd8e34b80926e06b352b6c442c555b303ed

commit r15-8041-g57dbbdd8e34b80926e06b352b6c442c555b303ed
Author: Jan Hubicka 
Date:   Thu Mar 13 20:11:02 2025 +0100

Fix speculation_useful_p

This patch fixes an issue with speculation and x264.  With profile feedback
we first introduce speculative calls to mc_chroma, which is called
indirectly.  Then we propagate constants across these calls (which is a
useful transform), but then speculation_useful_p decides that these
speculations are not useful and we end up calling the unspecialized version.
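
For reference, a speculative indirect call in the IL is essentially (a
simplified sketch, not actual dump output):

    if (fn == mc_chroma)             /* speculative edge from the profile */
      mc_chroma.constprop.0 (...);   /* direct call to the ipa-cp clone   */
    else
      fn (...);                      /* fallback indirect call            */

so dropping the speculation means always taking the indirect path even
though ipa-cp produced a specialized clone.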

This patch updates speculation_useful_p to consider edges redirected earlier
to clones as useful, since we can expect that ipa-cp knows what it is doing
(originally it only looked for inlined calls).  I also noticed that we want
to keep edges even if they are not hot.

Finally I noticed a typo in computing the target in code which intends to
keep devirtualized calls to functions where we propagated
pureness/constness.  Newly we also track ipa-modref summaries as they may
also be useful.

gcc/ChangeLog:

PR ipa/119147
* ipa-inline.cc: Include ipa-modref-tree.h and
ipa-modref.h.
(speculation_useful_p): If target is a clone, speculation is useful;
fix mixup of caller and callee; speculate also calls not considered
hot; consider modref summary also possibly useful for optimization.
* ipa-profile.cc (ipa_profile): Keep non-hot speculations.

Diff:
---
 gcc/ipa-inline.cc  | 33 -
 gcc/ipa-profile.cc |  7 ---
 2 files changed, 24 insertions(+), 16 deletions(-)

diff --git a/gcc/ipa-inline.cc b/gcc/ipa-inline.cc
index 163129540ac5..9e6e85d714be 100644
--- a/gcc/ipa-inline.cc
+++ b/gcc/ipa-inline.cc
@@ -121,6 +121,8 @@ along with GCC; see the file COPYING3.  If not see
 #include "attribs.h"
 #include "asan.h"
 #include "ipa-strub.h"
+#include "ipa-modref-tree.h"
+#include "ipa-modref.h"
 
 /* Inliner uses greedy algorithm to inline calls in a priority order.
Badness is used as the key in a Fibonacci heap which roughly corresponds
@@ -1941,23 +1943,30 @@ heap_edge_removal_hook (struct cgraph_edge *e, void 
*data)
 bool
 speculation_useful_p (struct cgraph_edge *e, bool anticipate_inlining)
 {
-  /* If we have already decided to inline the edge, it seems useful.  */
-  if (!e->inline_failed)
+  /* If we have already decided to inline the edge, it seems useful.
+ Also if ipa-cp or other pass worked hard enough to produce a clone,
+ we already decided this is a good idea.  */
+  if (!e->inline_failed
+  || e->callee->clone_of)
 return true;
 
   enum availability avail;
   struct cgraph_node *target = e->callee->ultimate_alias_target (&avail,
-e->caller);
+e->callee);
 
   gcc_assert (e->speculative && !e->indirect_unknown_callee);
 
-  if (!e->maybe_hot_p ())
+  /* Even if call statement is not hot, we can still have useful speculation
+ in cases where a lot of time is spent in callee.
+ Do not check maybe_hot_p.  */
+  if (!e->count.nonzero_p ())
 return false;
 
   /* See if IP optimizations found something potentially useful about the
- function.  For now we look only for CONST/PURE flags.  Almost everything
- else we propagate is useless.  */
-  if (avail >= AVAIL_AVAILABLE)
+ function.  Do this only if the call seems hot since this is about
+ optimizing the code surrounding call site rather than improving
+ callee.  */
+  if (avail >= AVAIL_AVAILABLE && e->maybe_hot_p ())
 {
   int ecf_flags = flags_from_decl_or_type (target->decl);
   if (ecf_flags & ECF_CONST)
@@ -1972,12 +1981,18 @@ speculation_useful_p (struct cgraph_edge *e, bool 
anticipate_inlining)
->ecf_flags & ECF_PURE))
return true;
 }
+  else if (get_modref_function_summary (target))
+   return true;
 }
   /* If we did not managed to inline the function nor redirect
  to an ipa-cp clone (that are seen by having local flag set),
  it is probably pointless to inline it unless hardware is missing
- indirect call predictor.  */
-  if (!anticipate_inlining && !target->local)
+ indirect call predictor.
+
+ At this point we know we will not dispatch into faster version of
+ callee, so if call itself is not hot, we definitely can give up
+ speculating.  */
+  if (!anticipate_inlining && (!target->local || !e->maybe_hot_p ()))
 return false;
   /* For overwritable targets there is not much to do.  */
   if (!can_inline_edge_p (e, false)
diff --git a/gcc/ipa-profile.cc b/gcc/ipa-profile.cc
index 51616def254f..08638667f65c 100644
--- a/gcc/ipa-profile.cc
+++ b/gcc/ipa-profile.cc
@@ -882,13 +882,6 @@ ipa_profile (void)
 "Not 

[gcc r15-9047] Optimize string constructor

2025-03-30 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:9c5505a35d9d71705464f9254f55407192d31ec3

commit r15-9047-g9c5505a35d9d71705464f9254f55407192d31ec3
Author: Jan Hubicka 
Date:   Sun Mar 30 23:49:49 2025 +0200

Optimize string constructor

This patch improves code generation on string constructors.  We currently
have _M_construct which takes as parameters two iterators (begin/end
pointers to the other string) and produces a new string.  This patch adds a
special case of the constructor where instead of begin/end pointers we
readily know the string size, and also a special case when we know that the
source is 0 terminated.  This happens commonly when producing string copies.
Moreover, currently ipa-prop is not able to propagate the information that
beg-end is a known constant (the copied string size), which makes it
impossible for the inliner to spot the common case where the string size is
known to be shorter than 15 bytes and fits in the local buffer.
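
A minimal sketch of the idea (the template signature matches the hunk
below; the constructor body here is illustrative, not the patch's exact
code):

    // Copying another basic_string: the length is known and the source
    // buffer is 0 terminated, so the _Terminated=true instantiation can
    // copy size()+1 bytes at once instead of walking an iterator range.
    basic_string (const basic_string& __str)
    : _M_dataplus (_M_local_data ())
    { _M_construct<true> (__str._M_data (), __str.size ()); }

This also hands the inliner a constant size, so the short-string
local-buffer case can be spotted.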

Finally I made the new constructor inline.  Because it is explicitly
instantiated without C++20 constexpr, we do not produce an implicit
instantiation (as required by the standard), which prevents inlining,
ipa-modref and any other IPA analysis from happening.  I think we need to
make many of the other functions inline, since optimization across string
manipulation is quite important.  There is PR94960 to track this issue.

Bootstrapped/regtested x86_64-linux, OK?

libstdc++-v3/ChangeLog:

PR tree-optimization/103827
PR tree-optimization/80331
PR tree-optimization/87502

* config/abi/pre/gnu.ver: Add version for _M_construct.
* include/bits/basic_string.h: (basic_string::_M_construct): 
Declare.
(basic_string constructors): Use it.
* include/bits/basic_string.tcc: 
(basic_string::_M_construct): New template.
* src/c++11/string-inst.cc: Instantiate S::_M_construct.

gcc/testsuite/ChangeLog:

* g++.dg/tree-ssa/pr80331.C: New test.
* g++.dg/tree-ssa/pr87502.C: New test.

Diff:
---
 gcc/testsuite/g++.dg/tree-ssa/pr80331.C|  8 
 gcc/testsuite/g++.dg/tree-ssa/pr87502.C| 15 +++
 libstdc++-v3/config/abi/pre/gnu.ver|  3 +++
 libstdc++-v3/include/bits/basic_string.h   | 10 --
 libstdc++-v3/include/bits/basic_string.tcc | 25 +
 libstdc++-v3/src/c++11/string-inst.cc  |  8 
 6 files changed, 67 insertions(+), 2 deletions(-)

diff --git a/gcc/testsuite/g++.dg/tree-ssa/pr80331.C 
b/gcc/testsuite/g++.dg/tree-ssa/pr80331.C
new file mode 100644
index ..85034504f2f8
--- /dev/null
+++ b/gcc/testsuite/g++.dg/tree-ssa/pr80331.C
@@ -0,0 +1,8 @@
+// { dg-do compile }
+// { dg-additional-options "-O2 -fdump-tree-optimized" }
+#include <string>
+int sain() {
+  const std::string remove_me("remove_me");
+  return 0;
+}
+// { dg-final { scan-tree-dump-not "remove_me" "optimized" } }
diff --git a/gcc/testsuite/g++.dg/tree-ssa/pr87502.C 
b/gcc/testsuite/g++.dg/tree-ssa/pr87502.C
new file mode 100644
index ..ad3e9d254044
--- /dev/null
+++ b/gcc/testsuite/g++.dg/tree-ssa/pr87502.C
@@ -0,0 +1,15 @@
+// { dg-do compile }
+// { dg-additional-options "-O2 -fdump-tree-optimized" }
+#include <string>
+
+
+__attribute__ ((pure))
+extern int foo (const std::string &);
+
+int
+bar ()
+{
+  return foo ("abc") + foo (std::string("abc"));
+}
+// We used to add the terminating zero explicitly instead of using the fact
+// that the memcpy source is already 0 terminated.
diff --git a/libstdc++-v3/config/abi/pre/gnu.ver 
b/libstdc++-v3/config/abi/pre/gnu.ver
index adadc62e3533..eb230290313c 100644
--- a/libstdc++-v3/config/abi/pre/gnu.ver
+++ b/libstdc++-v3/config/abi/pre/gnu.ver
@@ -2540,6 +2540,9 @@ GLIBCXX_3.4.34 {
 
_ZNSt8__format25__locale_encoding_to_utf8ERKSt6localeSt17basic_string_viewIcSt11char_traitsIcEEPv;
 # __sso_string constructor and destructor
 _ZNSt12__sso_string[CD][12]Ev;
+# void std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >::_M_construct(char const*, unsigned long)
+# and wide char version
+
_ZNSt7__cxx1112basic_stringI[cw]St11char_traitsI[cw]ESaI[cw]EE12_M_constructILb[01]EEEvPK[cw]m;
 } GLIBCXX_3.4.33;
 
 # Symbols in the support library (libsupc++) have their own tag.
diff --git a/libstdc++-v3/include/bits/basic_string.h 
b/libstdc++-v3/include/bits/basic_string.h
index e3b484d7a53f..86841cb2c5ed 100644
--- a/libstdc++-v3/include/bits/basic_string.h
+++ b/libstdc++-v3/include/bits/basic_string.h
@@ -341,6 +341,13 @@ _GLIBCXX_BEGIN_NAMESPACE_CXX11
   void
   _M_construct(size_type __req, _CharT __c);
 
+  // Construct using block of memory of known size.
+  // If _Terminated is true assume that source is already 0 terminated.
+  template<bool _Terminated>
+   _GLIBCXX20_CONSTEXPR
+   void
+   _M_construct(const _CharT *__c, size_type __n);
+
   _GLIBCXX20_CONSTEXPR
   allocator_type&
   _M_get_allocator()
@@ -56

[gcc r15-9176] Fix costs of x86 move instructions at -Os

2025-04-03 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:564e4e0819022925dd160e455ee44baf0fda5805

commit r15-9176-g564e4e0819022925dd160e455ee44baf0fda5805
Author: Jan Hubicka 
Date:   Thu Apr 3 13:06:07 2025 +0200

Fix costs of x86 move instructions at -Os

This patch fixes a problem with size costs declaring all moves to have equal
size (which was caught by the sanity check I tried in the prologue move cost
hook).  Costs are relative to the reg-reg move which is two.  Coincidentally
that is also the size of its encoding, so the costs should represent the
typical size of a move instruction.
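
Hedged illustration of the numbers (encodings as described in the patch
comments below, not independently measured):

    /* movl %eax, %ebx          -- 2 bytes: the reg-reg baseline, cost 2
       typical stack-frame load -- 4 bytes via %ebp, 5 via %esp (SIB byte),
       hence integer load costs of {5, 6, 5} instead of the old {2, 2, 2}.  */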

The patch reduces cc1plus text size 26391115->26205707 (0.7%) and similar
changes also happen to other binaries built during bootstrap.

Bootstrapped/regtested x86_64-linux, plan to commit it tomorrow if there
are no complaints.

There are other targets that define some load/store costs to be 2 that
probably should be fixed too, but they are mostly very old ones and I don't
have a way of benchmarking them.

* config/i386/x86-tune-costs.h (ix86_size_cost): Fix sizes of move
instructions.

Diff:
---
 gcc/config/i386/x86-tune-costs.h | 57 ++--
 1 file changed, 31 insertions(+), 26 deletions(-)

diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index a4a128cd5dde..7c8cb738d7cd 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -37,34 +37,37 @@ static stringop_algs ix86_size_memset[2] = {
 const
 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
   {
-  /* Start of register allocator costs.  integer->integer move cost is 2. */
-  2,/* cost for loading QImode using movzbl */
-  {2, 2, 2},   /* cost of loading integer registers
+  /* Start of register allocator costs.  integer->integer move cost is 2
+ and costs are relative to it.  movl %eax, %ebx is 2 bytes, so the
+ sizes coincide with average size of instruction encoding.  */
+  3,/* cost for loading QImode using movzbl */
+  /* Typical load/save from stack frame is 4 bytes with ebp and 5 with esp.  */
+  {5, 6, 5},   /* cost of loading integer registers
   in QImode, HImode and SImode.
   Relative to reg-reg move (2).  */
-  {2, 2, 2},   /* cost of storing integer registers */
+  {5, 6, 5},   /* cost of storing integer registers */
   2,   /* cost of reg,reg fld/fst */
-  {2, 2, 2},   /* cost of loading fp registers
+  {5, 6, 5},   /* cost of loading fp registers
   in SFmode, DFmode and XFmode */
-  {2, 2, 2},   /* cost of storing fp registers
+  {5, 6, 5},   /* cost of storing fp registers
   in SFmode, DFmode and XFmode */
   3,   /* cost of moving MMX register */
-  {3, 3},  /* cost of loading MMX registers
+  {6, 6},  /* cost of loading MMX registers
   in SImode and DImode */
-  {3, 3},  /* cost of storing MMX registers
+  {6, 6},  /* cost of storing MMX registers
   in SImode and DImode */
-  3, 3, 3, /* cost of moving XMM,YMM,ZMM register 
*/
-  {3, 3, 3, 3, 3}, /* cost of loading SSE registers
+  4, 4, 6, /* cost of moving XMM,YMM,ZMM register 
*/
+  {6, 6, 6, 6, 11},/* cost of loading SSE registers
   in 32,64,128,256 and 512-bit */
-  {3, 3, 3, 3, 3}, /* cost of storing SSE registers
+  {6, 6, 6, 6, 11},/* cost of storing SSE registers
   in 32,64,128,256 and 512-bit */
-  3, 3,/* SSE->integer and integer->SSE moves 
*/
-  3, 3,/* mask->integer and integer->mask 
moves */
-  {2, 2, 2},   /* cost of loading mask register
+  4, 4,/* SSE->integer and integer->SSE moves 
*/
+  4, 4,/* mask->integer and integer->mask 
moves */
+  {7, 7, 7},   /* cost of loading mask register
   in QImode, HImode, SImode.  */
-  {2, 2, 2},   /* cost if storing mask register
+  {7, 7, 7},   /* cost if storing mask register
   in QImode, HImo

[gcc r16-447] i386: implement costs for float<->int conversions in ix86_vector_costs::add_stmt_cost

2025-05-07 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:2c8d632d9ed4e3aeee2156ba17fe631ecbc90dbf

commit r16-447-g2c8d632d9ed4e3aeee2156ba17fe631ecbc90dbf
Author: Jan Hubicka 
Date:   Wed May 7 15:33:44 2025 +0200

i386: implement costs for float<->int conversions in 
ix86_vector_costs::add_stmt_cost

This patch adds pattern matching for float<->int conversions both as normal
statements and promote_demote.  While updating promote_demote I noticed that
in cleanups I turned "stmt_cost =" into "int stmt_cost = " which turned
the existing FP costing into a NOOP.  I also added a comment on how demotes
are done when turning e.g. a 32bit into an 8bit value (which is the case of
pr119919.c).
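
Hedged C-level examples of the constructs being costed (the instruction
mapping follows the hunk below):

    int i = 42;
    float f = (float) i;   /* FLOAT_EXPR: cvtsi2ss for scalar SSE math,
                              cvtpi2ps when vectorized */
    int   j = (int) f;     /* FIX_TRUNC_EXPR: cvtss2si, or cvtps2pi
                              when vectorized */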

The patch disables vectorization in pr119919.c on generic tuning, but keeps
it at both zen and skylake+.  The underlying problem is the bad cost of
open-coded scatter, which is tracked by PR 119902, so I simply added
-mtune=znver1 so the testcase keeps testing vectorization.

gcc/ChangeLog:

* config/i386/i386.cc (ix86_vector_costs::add_stmt_cost): Add 
FLOAT_EXPR;
FIX_TRUNC_EXPR and vec_promote_demote costs.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr119919.c: Add -mtune=znver1

Diff:
---
 gcc/config/i386/i386.cc  | 50 +---
 gcc/testsuite/gcc.target/i386/pr119919.c |  2 +-
 2 files changed, 40 insertions(+), 12 deletions(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index bef95ea18c87..fd36ea802c00 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -25767,6 +25767,26 @@ ix86_vector_costs::add_stmt_cost (int count, 
vect_cost_for_stmt kind,
  (ix86_tune_cost, GET_MODE_BITSIZE (mode));
  break;
 
+   case FLOAT_EXPR:
+   if (SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode))
+ stmt_cost = ix86_cost->cvtsi2ss;
+   else if (X87_FLOAT_MODE_P (mode))
+ /* TODO: We do not have cost tables for x87.  */
+ stmt_cost = ix86_cost->fadd;
+   else
+ stmt_cost = ix86_vec_cost (mode, ix86_cost->cvtpi2ps);
+   break;
+
+   case FIX_TRUNC_EXPR:
+   if (SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode))
+ stmt_cost = ix86_cost->cvtss2si;
+   else if (X87_FLOAT_MODE_P (mode))
+ /* TODO: We do not have cost tables for x87.  */
+ stmt_cost = ix86_cost->fadd;
+   else
+ stmt_cost = ix86_vec_cost (mode, ix86_cost->cvtps2pi);
+   break;
+
case COND_EXPR:
  {
/* SSE2 conditinal move sequence is:
@@ -25930,8 +25950,7 @@ ix86_vector_costs::add_stmt_cost (int count, 
vect_cost_for_stmt kind,
break;
   }
 
-  if (kind == vec_promote_demote
-  && fp && FLOAT_TYPE_P (TREE_TYPE (gimple_assign_rhs1 (stmt_info->stmt))))
+  if (kind == vec_promote_demote)
 {
   int outer_size
= tree_to_uhwi
@@ -25941,16 +25960,25 @@ ix86_vector_costs::add_stmt_cost (int count, 
vect_cost_for_stmt kind,
= tree_to_uhwi
(TYPE_SIZE
(TREE_TYPE (gimple_assign_rhs1 (stmt_info->stmt;
-  int stmt_cost = vec_fp_conversion_cost
-   (ix86_tune_cost, GET_MODE_BITSIZE (mode));
-  /* VEC_PACK_TRUNC_EXPR: If inner size is greater than outer size we will 
end
-up doing two conversions and packing them.  */
+  bool inner_fp = FLOAT_TYPE_P
+   (TREE_TYPE (gimple_assign_rhs1 (stmt_info->stmt)));
+
+  if (fp && inner_fp)
+   stmt_cost = vec_fp_conversion_cost
+ (ix86_tune_cost, GET_MODE_BITSIZE (mode));
+  else if (fp && !inner_fp)
+   stmt_cost = ix86_vec_cost (mode, ix86_cost->cvtpi2ps);
+  else if (!fp && inner_fp)
+   stmt_cost = ix86_vec_cost (mode, ix86_cost->cvtps2pi);
+  else
+   stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op);
+  /* VEC_PACK_TRUNC_EXPR and similar demote operations: If outer size is
+greater than inner size we will end up doing two conversions and
+packing them.  We always pack pairs; if the size difference is greater
+it is split into multiple demote operations.  */
   if (inner_size > outer_size)
-   {
- int n = inner_size / outer_size;
- stmt_cost = stmt_cost * n
- + (n - 1) * ix86_vec_cost (mode, ix86_cost->sse_op);
-   }
+   stmt_cost = stmt_cost * 2
+   + ix86_vec_cost (mode, ix86_cost->sse_op);
 }
 
   /* If we do elementwise loads into a vector then we are bound by
diff --git a/gcc/testsuite/gcc.target/i386/pr119919.c 
b/gcc/testsuite/gcc.target/i386/pr119919.c
index ed646561bd1f..e39819f682db 100644
--- a/gcc/testsuite/gcc.target/i386/pr119919.c
+++ b/gcc/testsuite/gcc.target/i386/pr119919.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2 -msse2 -fdump-tree-vect-details" } */
+/* { dg-options "-O2 -msse2 -fdump-tree-ve

[gcc r16-531] i386: Fix move costs in vectorizer cost model.

2025-05-11 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:37e61c793c1b22bdcfbf142cd6086da2745be596

commit r16-531-g37e61c793c1b22bdcfbf142cd6086da2745be596
Author: Jan Hubicka 
Date:   Sun May 11 23:49:11 2025 +0200

i386: Fix move costs in vectorizer cost model.

This patch complements the change to stv and uses COSTS_N_INSNS (...)/2
to convert move costs to COSTS_N_INSNS based costs used by the vectorizer.
The patch makes pr99881 XPASS so I removed the xfail, but it also makes
pr91446 fail.  This is about SLP:

/* { dg-options "-O2 -march=icelake-server -ftree-slp-vectorize 
-mtune-ctrl=^sse_typeless_stores" } */

typedef struct
{
  unsigned long long width, height;
  long long x, y;
} info;

extern void bar (info *);

void
foo (unsigned long long width, unsigned long long height,
 long long x, long long y)
{
  info t;
  t.width = width;
  t.height = height;
  t.x = x;
  t.y = y;
  bar (&t);
}

/* { dg-final { scan-assembler-times "vmovdqa\[^\n\r\]*xmm\[0-9\]" 2 } } */

With the fixed cost the construction cost is now too large so vectorization
does not happen.  This is the hack increasing cost to account for the
integer->sse move, which I think we can handle incrementally.

gcc/ChangeLog:

* config/i386/i386.cc (ix86_widen_mult_cost): Use sse_op to cost
SSE integer addition.
(ix86_multiplication_cost): Use COSTS_N_INSNS (...)/2 to cost sse
loads.
(ix86_shift_rotate_cost): Likewise.
(ix86_vector_costs::add_stmt_cost): Likewise.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr91446.c: Add xfail.
* gcc.target/i386/pr99881.c: Remove xfail.

Diff:
---
 gcc/config/i386/i386.cc | 26 +++---
 gcc/testsuite/gcc.target/i386/pr91446.c |  2 +-
 gcc/testsuite/gcc.target/i386/pr99881.c |  2 +-
 3 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 9c24a926a890..3d629b06094a 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -21753,7 +21753,7 @@ ix86_widen_mult_cost (const struct processor_costs 
*cost,
   /* pmuludq under sse2, pmuldq under sse4.1, for sign_extend,
 require extra 4 mul, 4 add, 4 cmp and 2 shift.  */
   if (!TARGET_SSE4_1 && !uns_p)
-   extra_cost = (cost->mulss + cost->addss + cost->sse_op) * 4
+   extra_cost = (cost->mulss + cost->sse_op + cost->sse_op) * 4
  + cost->sse_op * 2;
   /* Fallthru.  */
 case V4DImode:
@@ -21803,11 +21803,11 @@ ix86_multiplication_cost (const struct 
processor_costs *cost,
  else if (TARGET_AVX2)
nops += 2;
  else if (TARGET_XOP)
-   extra += cost->sse_load[2];
+   extra += COSTS_N_INSNS (cost->sse_load[2]) / 2;
  else
{
  nops += 1;
- extra += cost->sse_load[2];
+ extra += COSTS_N_INSNS (cost->sse_load[2]) / 2;
}
  goto do_qimode;
 
@@ -21826,13 +21826,13 @@ ix86_multiplication_cost (const struct 
processor_costs *cost,
{
  nmults += 1;
  nops += 2;
- extra += cost->sse_load[2];
+ extra += COSTS_N_INSNS (cost->sse_load[2]) / 2;
}
  else
{
  nmults += 1;
  nops += 4;
- extra += cost->sse_load[2];
+ extra += COSTS_N_INSNS (cost->sse_load[2]) / 2;
}
  goto do_qimode;
 
@@ -21845,14 +21845,16 @@ ix86_multiplication_cost (const struct 
processor_costs *cost,
{
  nmults += 1;
  nops += 4;
- extra += cost->sse_load[3] * 2;
+ /* 2 loads, so no division by 2.  */
+ extra += COSTS_N_INSNS (cost->sse_load[3]);
}
  goto do_qimode;
 
case V64QImode:
  nmults = 2;
  nops = 9;
- extra = cost->sse_load[3] * 2 + cost->sse_load[4] * 2;
+ /* 2 loads of each size, so no division by 2.  */
+ extra = COSTS_N_INSNS (cost->sse_load[3] + cost->sse_load[4]);
 
do_qimode:
  return ix86_vec_cost (mode, cost->mulss * nmults
@@ -21945,7 +21947,7 @@ ix86_shift_rotate_cost (const struct processor_costs 
*cost,
/* Use vpbroadcast.  */
extra = cost->sse_op;
  else
-   extra = cost->sse_load[2];
+   extra = COSTS_N_INSNS (cost->sse_load[2]) / 2;
 
  if (constant_op1)
{
@@ -21976,7 +21978,7 @@ ix86_shift_rotate_cost (const struct processor_costs 
*cost,
 shift with one insn set the cost to prefer paddb.  */
  if (constant_op1)
{
- extra = cost->sse_load[2];
+ extra = COSTS_N_INSNS (cost->sse_load[2]) / 2;
  return ix86_vec_cost (mode, cost->sse_op) + extr

[gcc r16-517] i386: Fix some problems in stv cost model

2025-05-10 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:993aa0bd28722c7f01fb8310f1c79814aef217ed

commit r16-517-g993aa0bd28722c7f01fb8310f1c79814aef217ed
Author: Jan Hubicka 
Date:   Sat May 10 22:23:48 2025 +0200

i386: Fix some problems in stv cost model

This patch fixes some of the problems with costing in the scalar to vector
pass.  In particular:
 1) the pass uses optimize_insn_for_size which is intended to be used by
expanders and splitters and requires the optimization pass to use
set_rtl_profile (bb) for currently processed bb.
This is not done, so we get random stale info about hotness of insn.
 2) register allocator move costs are all relative to integer reg-reg move
which has cost of 2, so it is (except for size tables and i386)
the latency of the instruction multiplied by 2.
These costs have been duplicated and are now used in combination with
rtx costs which are all based on COSTS_N_INSNS that multiplies latency
by 4.
Some of the vectorizer costing contains COSTS_N_INSNS (move_cost) / 2
to compensate, but some new code does not.  This patch adds compensation
(see the sketch after this list).

Perhaps we should update the cost tables to use COSTS_N_INSNS everywhere
but I think we want to first fix the inconsistencies.  Also the tables will
get optically much longer, since we have many move costs and COSTS_N_INSNS
is a lot of characters.
 3) variable m which decides how much to multiply the integer variant (to
account that with -m32 all 64bit computations need 2 instructions) is
declared unsigned, which makes the signed computation of instruction gain
be done in unsigned type and breaks e.g. for division.
 4) I added integer_to_sse costs which are currently all a duplication of
sse_to_integer.  AMD chips are asymmetric and moving in one direction is
faster than the other.  I will change costs incrementally once the
vectorizer part is fixed up, too.
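
A small sketch of the two scales from 2) (the constants follow from the
definitions above; `rtx_units` is an illustrative name):

    /* rtx costs: COSTS_N_INSNS (1) == 4 units per instruction.
       Move tables: a reg-reg move is 2 units, i.e. latency * 2.
       Converting a move-table entry to rtx-cost units therefore is:  */
    int rtx_units = COSTS_N_INSNS (cost->sse_load[2]) / 2;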

There are two failures, gcc.target/i386/minmax-6.c and
gcc.target/i386/minmax-7.c.  Both test stv on haswell, which no longer
happens since SSE->INT and INT->SSE moves are now more expensive.

There is only one instruction to convert:

Computing gain for chain #1...
  Instruction gain 8 for11: {r110:SI=smax(r116:SI,0);clobber flags:CC;}
  Instruction conversion gain: 8
  Registers conversion cost: 8   <- this is integer_to_sse and sse_to_integer
  Total gain: 0

The total gain used to be 4 since the patch doubles the conversion costs.
According to Agner Fog's tables the costs should be 1 cycle, which is correct
here.

Final code generated is:

vmovd   %esi, %xmm0 * latency 1
cmpl%edx, %esi
je  .L2
vpxor   %xmm1, %xmm1, %xmm1 * latency 1
vpmaxsd %xmm1, %xmm0, %xmm0 * latency 1
vmovd   %xmm0, %eax * latency 1
imull   %edx, %eax
cltq
movzwl  (%rdi,%rax,2), %eax
ret

cmpl%edx, %esi
je  .L2
xorl%eax, %eax  * latency 1
testl   %esi, %esi  * latency 1
cmovs   %eax, %esi  * latency 2
imull   %edx, %esi
movslq  %esi, %rsi
movzwl  (%rdi,%rsi,2), %eax
ret

Instructions with latency info are those really different.
So the uncoverted code has sum of latencies 4 and real latency 3.
Converted code has sum of latencies 4 and real latency 3 
(vmod+vpmaxsd+vmov).
So I do not quite see it should be a win.

There is also a bug in costing MIN/MAX

case ABS:
case SMAX:
case SMIN:
case UMAX:
case UMIN:
  /* We do not have any conditional move cost, estimate it as a
 reg-reg move.  Comparisons are costed as adds.  */
  igain += m * (COSTS_N_INSNS (2) + ix86_cost->add);
  /* Integer SSE ops are all costed the same.  */
  igain -= ix86_cost->sse_op;
  break;

Now COSTS_N_INSNS (2) is not quite right since a reg-reg move should be 1 or
perhaps 0.  For Haswell cmov really is 2 cycles, but I guess we want to have
that in the cost vectors like all other instructions.

I am not sure if this is really a win in this case (other minmax testcases
seem to make sense).  I have xfailed it for now and will check if that
affects SPEC on the LNT testers.

I will proceed with similar fixes on the vectorizer cost side.  Sadly those
introduce quite some differences in the testsuite (partly triggered by other
costing problems, such as the one of scatter/gather).

gcc/ChangeLog:

* config/i386/i386-features.cc
(general_scalar_chain::vector_const_cost): Add BB parameter

[gcc r16-1756] Fix handling of dwarf name and duplicated names

2025-06-27 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:61f07bcb509cca051e39b698a1bd9a5907f26cf4

commit r16-1756-g61f07bcb509cca051e39b698a1bd9a5907f26cf4
Author: Jan Hubicka 
Date:   Sat Jun 28 05:02:27 2025 +0200

Fix handling of dwarf name and duplicated names

I have tested Kugan's patch on exchange2 and noticed multiple problems:
  1) with LTO the translation from dwarf names to symbol names is disabled
 since we free lang data sooner.  I moved the offline pass upstream, which
 however also may make us miss clones introduced between free lang data
 and annotation.  This is not very important right now and may be further
 fixed by splitting off auto-profile-read and offline passes.
  2) I noticed that we miss a lot of AFDO inlines because some code compares
 name indexes for equality in the belief that it compares symbol names.
 This is not true if we drop prefixes.  For this reason I integrated
 get_original_name into the renaming machinery which actually updates
 indexes so the string table continues to work as a symbol table.
 This lets me drop the
 afdo_string_table->get_index (afdo_string_table->get_name (other->name ()))
 hops that were introduced at some places (see the sketch after this list).

 Now after renaming all afdo instances should go by DECL_ASSEMBLER_NAME
 names.
  3) Detection of realized offline instances had an ordering issue where we
 omitted marking of those that were offlined later.  Since we can now
 look up assembler names, I simplified the logic into a single pass.
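
To spell out the hop dropped in 2) (`idx` is an illustrative name):

    /* Before: canonicalize an index through a name round-trip.  */
    int idx = afdo_string_table->get_index
                (afdo_string_table->get_name (other->name ()));
    /* After: indexes are keyed by assembler names already, so a plain
       other->name () compares directly.  */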

Autoprofiledbootstrapped/regtested x86_64-linux, committed.

gcc/ChangeLog:

* auto-profile.cc (get_original_name): Only strip suffixes 
introduced
after auto-fdo annotation.
(string_table::get_index_by_decl):  Simplify.
(string_table::add_name): New member function.
(string_table::read): Micro-optimize allocation.
(function_instance::get_function_instance_by_decl): Dump reasons
for failure; try to compensate lost discriminators.
(function_instance::merge): Simplify sanity check; do not check
for realized flag; fix merging of targets.
(function_instance::offline_if_in_set): Simplify.
(function_instance::dump): Sanity check that names are consistent.
(autofdo_source_profile::offline_external_functions): Also handle
stripping suffixes.
(walk_block): Move up in source.
(autofdo_source_profile::offline_unrealized_inlines): Also compute
realized functions.
(autofdo_source_profile::get_function_instance_by_name_index): 
Simplify.
(autofdo_source_profile::add_function_instance): Simplify.
(autofdo_source_profile::read): Do not strip suffixes; error on
duplicates.
(mark_realized_functions): Remove.
(auto_profile): Do not call mark_realized_functions.
* passes.def: Move auto_profile_offline before free_lang_data.

gcc/testsuite/ChangeLog:

* gcc.dg/tree-prof/clone-test.c: New test.
* gcc.dg/tree-prof/clone-merge-1.c: Updae template.

Co-authored-by: Kugan Vivekanandarajah  

Diff:
---
 gcc/auto-profile.cc| 443 -
 gcc/passes.def |   4 +-
 gcc/testsuite/gcc.dg/tree-prof/clone-merge-1.c |   4 +-
 gcc/testsuite/gcc.dg/tree-prof/clone-test.c|  63 
 4 files changed, 360 insertions(+), 154 deletions(-)

diff --git a/gcc/auto-profile.cc b/gcc/auto-profile.cc
index 12d96bd5c195..7cf1e8f1b815 100644
--- a/gcc/auto-profile.cc
+++ b/gcc/auto-profile.cc
@@ -211,6 +211,15 @@ public:
   /* Read profile, return TRUE on success.  */
   bool read ();
 
+  /* Return number of entries.  */
+  size_t num_entries ()
+  {
+return vector_.length ();
+  }
+
+  /* Add new name and return its index.  */
+  int add_name (char *);
+
 private:
   typedef std::map string_index_map;
   string_vector vector_;
@@ -518,16 +527,50 @@ static gcov_type afdo_count_scale = 1;
 
 /* Helper functions.  */
 
+
 /* Return the original name of NAME: strip the suffix that starts
-   with '.' Caller is responsible for freeing RET.  */
+   with '.' for names that are generated after auto-profile pass.
+   This is to match profiled names with the names in the IR at this stage.
+   Note that we only have to strip a suffix and not in the middle.
+   Caller is responsible for freeing RET.  */
 
 static char *
-get_original_name (const char *name)
+get_original_name (const char *name, bool alloc = true)
 {
-  char *ret = xstrdup (name);
-  char *find = strchr (ret, '.');
-  if (find != NULL)
-*find = 0;
+  char *ret = alloc ? xstrdup (name) : const_cast<char *> (name);
+  char *last_dot = strrchr (ret, '.');
+  if (last_dot == NULL)
+return ret;
+  bool only_digits = true;
+  char *ptr = l

[gcc r16-1734] Fix afdo profiles for functions that were not early-inlined

2025-06-27 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:ad18f7f88aee15b3f15aa74483ca2ebdc89e18cb

commit r16-1734-gad18f7f88aee15b3f15aa74483ca2ebdc89e18cb
Author: Jan Hubicka 
Date:   Fri Jun 27 16:10:31 2025 +0200

Fix afdo profiles for functions that were not early-inlined

This patch should finish the offlining infrastructure by offlining
(prior to AFDO annotation) all inline function instances that were not
early inlined.  This is mostly the case of recursive inlining or when
-fno-auto-profile-inlining is used, which should now produce comparable
code.

I also cleaned up offlining of self-recursive functions, which now
happens through the worklist and reduces problems with recursive invocation
of the function merging modifying data structures at unexpected places.
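
A hedged sketch of the worklist pattern (member names follow the ChangeLog
below; the drain loop and helper are illustrative):

    /* Instead of merging a duplicate immediately, which could recurse
       into the maps currently being walked, queue it and drain the
       queue at a safe point.  */
    while (!worklist.is_empty ())
      {
        function_instance *f = worklist.pop ();
        f->clear_in_worklist ();
        merge_into_offline_copy (f);  /* hypothetical helper */
      }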

gcc/ChangeLog:

* auto-profile.cc (function_instance::set_name,
function_instance::set_realized, function_instance::realized_p,
function_instance::set_in_worklist,
function_instance::clear_in_worklist,
function_instance::in_worklist_p): New member functions.
(function_instance::in_worklist, function_instance::realized_):
new.
(get_relative_location_for_locus): Break out from 
(get_relative_location_for_stmt): ... here.
(function_instance::~function_instance): Sanity check that
removed function is not in worklist.
(function_instance::merge): Do not offline realized instances.
(function_instance::offline): Make private; add duplicate functions
to worklist rather than merging immediately.
(function_instance::offline_if_in_set):  Cleanup.
(function_instance::remove_external_functions): Likewise.
(function_instance::offline_if_not_realized): New member function.
(autofdo_source_profile::offline_external_functions): Handle delayed
functions.
(autofdo_source_profile::offline_unrealized_inlines): New member 
function.
(walk_block): New function.
(mark_realized_functions): New function.
(afdo_annotate_cfg): Fix dump.
(auto_profile): Mark realized functions and offline rest; do not 
compute
fn summary.

gcc/testsuite/ChangeLog:

* gcc.dg/tree-prof/afdo-crossmodule-1.c: Update template.

Diff:
---
 gcc/auto-profile.cc| 395 ++---
 .../gcc.dg/tree-prof/afdo-crossmodule-1.c  |   2 +-
 2 files changed, 339 insertions(+), 58 deletions(-)

diff --git a/gcc/auto-profile.cc b/gcc/auto-profile.cc
index d19afd73fae7..12d96bd5c195 100644
--- a/gcc/auto-profile.cc
+++ b/gcc/auto-profile.cc
@@ -61,7 +61,8 @@ along with GCC; see the file COPYING3.  If not see
 
There are three phases in AutoFDO:
 
-   Phase 1: Read profile from the profile data file.
+   Phase 1: At startup.
+ Read profile from the profile data file.
  The following info is read from the profile datafile:
 * string_table: a map between function name and its index.
 * autofdo_source_profile: a map from function_instance name to
@@ -76,7 +77,14 @@ along with GCC; see the file COPYING3.  If not see
  standalone symbol, or a clone of a function that is inlined into another
  function.
 
-   Phase 2: AFDO inline + value profile transformation.
+   Phase 2: In afdo_offline pass.
+ Remove function instances from other translation units
+ and offline all cross-translation unit inlining done during train
+ run compilation.  This is necessary to not lose profiles with
+ LTO train run.
+
+   Phase 3: During early optimization.
+ AFDO inline + value profile transformation.
  This happens during early optimization.
  During early inlning AFDO inliner is executed which
  uses autofdo_source_profile to find if a callsite is:
@@ -94,14 +102,19 @@ along with GCC; see the file COPYING3.  If not see
  This is controlled by -fauto-profile-inlinig and is independent
  of -fearly-inlining.
 
-   Phase 3: Annotate control flow graph.
- AutoFDO uses a separate pass to:
+   Phase 4: In AFDO pass.
+ Offline all functions that have been inlined in the
+ train run but were not inlined in early inlining nor AFDO
+ inline.
+
+   Phase 5: In AFDO pass.
+ Annotate control flow graph.
 * Annotate basic block count
 * Estimate branch probability
* Use earlier static profile to fill in the gaps
  if AFDO profile is ambigous
 
-   After the above 3 phases, all profile is readily annotated on the GCC IR.
+   After the above 5 phases, all profile is readily annotated on the GCC IR.
AutoFDO tries to reuse all FDO infrastructure as much as possible to make
use of the profile. E.g. it uses existing mechanism to calculate the basic
block/edge frequency, as well as the cgraph node/edge count.
@@ -232,6 +245,11 @@ publ

[gcc r16-2048] Print discriminators in dump_scope_block

2025-07-07 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:7891c0b450d5ba662fa1817af667b2ba35dee661

commit r16-2048-g7891c0b450d5ba662fa1817af667b2ba35dee661
Author: Jan Hubicka 
Date:   Mon Jul 7 10:07:53 2025 +0200

Print discriminators in dump_scope_block

gcc/ChangeLog:

* tree-ssa-live.cc (dump_scope_block): Print discriminators
of inlined functions.

Diff:
---
 gcc/tree-ssa-live.cc | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/gcc/tree-ssa-live.cc b/gcc/tree-ssa-live.cc
index 5b8bfd06bec1..5e0891361dc7 100644
--- a/gcc/tree-ssa-live.cc
+++ b/gcc/tree-ssa-live.cc
@@ -702,7 +702,10 @@ dump_scope_block (FILE *file, int indent, tree scope, 
dump_flags_t flags)
   if (LOCATION_LOCUS (BLOCK_SOURCE_LOCATION (scope)) != UNKNOWN_LOCATION)
 {
   expanded_location s = expand_location (BLOCK_SOURCE_LOCATION (scope));
-  fprintf (file, " %s:%i", s.file, s.line);
+  fprintf (file, " %s:%i:%i", s.file, s.line, s.column);
+  if (has_discriminator (BLOCK_SOURCE_LOCATION (scope)))
+   fprintf (file, " discrim %i",
+get_discriminator_from_loc (BLOCK_SOURCE_LOCATION (scope)));
 }
   if (BLOCK_ABSTRACT_ORIGIN (scope))
 {


[gcc r16-2124] Fix auto-profile.cc:get_original_name

2025-07-09 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:4de3524f9e88b7b22bdb481163b05a624f090cf9

commit r16-2124-g4de3524f9e88b7b22bdb481163b05a624f090cf9
Author: Jan Hubicka 
Date:   Mon Jul 7 17:18:23 2025 +0200

Fix auto-profile.cc:get_original_name

There are two bugs in get_original_name.  First, the for loop walking the
list of known suffixes uses sizeof (suffixes), which is the byte size of
the array, so it eventually walks past the end of the array.  The second
problem is that strncmp may accept suffixes that are longer, i.e. mix up
.isra with .israabc.  This is probably not a big deal, but the first bug
makes get_original_name effectively strip all suffixes, even important
ones on my setup.
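
To spell the first bug out (a standalone illustration assuming a 64-bit
host):

    const char *suffixes[] = {"isra", "constprop", "lto_priv", "part", "cold"};
    size_t bytes = sizeof (suffixes);                         /* 5 * 8 == 40 */
    size_t count = sizeof (suffixes) / sizeof (suffixes[0]);  /* 5, intended */

Looping with i < sizeof (suffixes) therefore indexes far past the last
entry, which is why effectively every suffix got stripped.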

gcc/ChangeLog:

* auto-profile.cc (get_original_name): Fix loop walking the
suffixes.

Diff:
---
 gcc/auto-profile.cc | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/gcc/auto-profile.cc b/gcc/auto-profile.cc
index a970eb8972fa..1700bf8f2cd4 100644
--- a/gcc/auto-profile.cc
+++ b/gcc/auto-profile.cc
@@ -622,9 +622,11 @@ get_original_name (const char *name, bool alloc = true)
 }
   /* Suffixes of clones that compiler generates after auto-profile.  */
   const char *suffixes[] = {"isra", "constprop", "lto_priv", "part", "cold"};
-  for (unsigned i = 0; i < sizeof (suffixes); ++i)
+  for (unsigned i = 0; i < sizeof (suffixes) / sizeof (const char *); ++i)
 {
-  if (strncmp (next_dot + 1, suffixes[i], strlen (suffixes[i])) == 0)
+  int len = strlen (suffixes[i]);
+  if (len == last_dot - next_dot - 1
+ && strncmp (next_dot + 1, suffixes[i], strlen (suffixes[i])) == 0)
{
  *next_dot = 0;
  return get_original_name (ret, false);


[gcc r16-2125] Fix profile scaling in tree-inline.cc:initialize_cfun

2025-07-09 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:8bd7504cab6fc3289700c1cdb7d03b5e6e9c6c54

commit r16-2125-g8bd7504cab6fc3289700c1cdb7d03b5e6e9c6c54
Author: Jan Hubicka 
Date:   Mon Jul 7 19:20:25 2025 +0200

Fix profile scaling in tree-inline.cc:initialize_cfun

initialize_cfun calls
 profile_count::adjust_for_ipa_scaling (&num, &den);
but then the result is never used.  This patch fixes it.  Overall scaling
of the entry/exit blocks is a bit sloppy in tree-inline.  I will see if I
can clean it up.

* tree-inline.cc (initialize_cfun): Use num and den for scaling.

Diff:
---
 gcc/tree-inline.cc | 6 ++
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/gcc/tree-inline.cc b/gcc/tree-inline.cc
index 7e0ac698e5e0..e8fe035b180e 100644
--- a/gcc/tree-inline.cc
+++ b/gcc/tree-inline.cc
@@ -2888,11 +2888,9 @@ initialize_cfun (tree new_fndecl, tree callee_fndecl, 
profile_count count)
   profile_count::adjust_for_ipa_scaling (&num, &den);
 
   ENTRY_BLOCK_PTR_FOR_FN (cfun)->count =
-ENTRY_BLOCK_PTR_FOR_FN (src_cfun)->count.apply_scale (count,
-   ENTRY_BLOCK_PTR_FOR_FN (src_cfun)->count);
+ENTRY_BLOCK_PTR_FOR_FN (src_cfun)->count.apply_scale (num, den);
   EXIT_BLOCK_PTR_FOR_FN (cfun)->count =
-EXIT_BLOCK_PTR_FOR_FN (src_cfun)->count.apply_scale (count,
-   ENTRY_BLOCK_PTR_FOR_FN (src_cfun)->count);
+EXIT_BLOCK_PTR_FOR_FN (src_cfun)->count.apply_scale (num, den);
   if (src_cfun->eh)
 init_eh_for_function ();


[gcc r16-2126] Improve afdo_adjust_guessed_profile

2025-07-09 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:3c0db87b13ed034196d8b77f1acdf40a538d585f

commit r16-2126-g3c0db87b13ed034196d8b77f1acdf40a538d585f
Author: Jan Hubicka 
Date:   Wed Jul 9 11:51:03 2025 +0200

Improve afdo_adjust_guessed_profile

This patch makes afdo_adjust_guessed_profile more robust.  Instead of using
the median of the scales, we compute a robust average where the weight is
taken from the execution count of the edge each scale originates from.  I
also added a cap, since in some cases the scaling factor may end up being
very large, introducing artificial hottest regions of the program and
confusing ipa-profile's histogram based cutoff.
This was the problem with roms.
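
A hedged sketch of the weighted selection (the final hunk below is
truncated; this shows one plausible shape, not the committed code):

    uint64_t overall_weight = 0, cummulated = 0;
    for (scale &e : scales)
      overall_weight += e.weight;
    /* Walk the sorted scales and stop once half of the total edge weight
       is covered -- a weighted median.  */
    sreal result = scales.last ().scale;
    for (scale &e : scales)
      if ((cummulated += e.weight) * 2 >= overall_weight)
        {
          result = e.scale;
          break;
        }
    /* The factor is then capped by the maximal annotated count observed
       in the function.  */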

Bootstrapped/regtested x86_64-linux, committed.

gcc/ChangeLog:

* auto-profile.cc (struct scale): New structure.
(add_scale): Also record weights.
(afdo_adjust_guessed_profile): Compute robust average
of scales and cap by max count in function.

Diff:
---
 gcc/auto-profile.cc | 81 +
 1 file changed, 69 insertions(+), 12 deletions(-)

diff --git a/gcc/auto-profile.cc b/gcc/auto-profile.cc
index 1700bf8f2cd4..e27bcc7b58db 100644
--- a/gcc/auto-profile.cc
+++ b/gcc/auto-profile.cc
@@ -3306,10 +3306,22 @@ cmp (const void *a, const void *b)
   return 0;
 }
 
+/* To scale a connected component of the graph we collect desired scales of
+   basic blocks on the boundary and then compute a robust average.  */
+
+struct scale
+{
+  /* Scale desired.  */
+  sreal scale;
+  /* Weight for averaging computed from execution count of the edge
+ scale originates from.  */
+  uint64_t weight;
+};
+
 /* Add scale ORIG/ANNOTATED to SCALES.  */
 
 static void
-add_scale (vec  *scales, profile_count annotated, profile_count orig)
+add_scale (vec  *scales, profile_count annotated, profile_count orig)
 {
   if (dump_file)
 {
@@ -3318,15 +3330,15 @@ add_scale (vec  *scales, profile_count 
annotated, profile_count orig)
   annotated.dump (dump_file);
   fprintf (dump_file, "\n");
 }
-  if (orig.nonzero_p ())
+  if (orig.force_nonzero () == orig)
 {
   sreal scale
= annotated.guessed_local ()
.to_sreal_scale (orig);
   if (dump_file)
-   fprintf (dump_file, "adding scale %.16f\n",
-scale.to_double ());
-  scales->safe_push (scale);
+   fprintf (dump_file, "adding scale %.16f, weight %" PRId64 "\n",
+scale.to_double (), annotated.value () + 1);
+  scales->safe_push ({scale, annotated.value () + 1});
 }
 }
 
@@ -3372,7 +3384,7 @@ afdo_adjust_guessed_profile (bb_set *annotated_bb)
   /* Basic blocks of connected component currently processed.  */
   auto_vec  bbs (n_basic_blocks_for_fn (cfun));
   /* Scale factors found.  */
-  auto_vec  scales;
+  auto_vec  scales;
   auto_vec  stack (n_basic_blocks_for_fn (cfun));
 
   basic_block seed_bb;
@@ -3384,9 +3396,15 @@ afdo_adjust_guessed_profile (bb_set *annotated_bb)
  >=2 is an id of the component BB belongs to.  */
   auto_vec  component;
   component.safe_grow (last_basic_block_for_fn (cfun));
+  profile_count max_count_in_fn = profile_count::zero ();
   FOR_ALL_BB_FN (seed_bb, cfun)
-component[seed_bb->index]
-   = is_bb_annotated (seed_bb, *annotated_bb) ? 1 : 0;
+if (is_bb_annotated (seed_bb, *annotated_bb))
+  {
+   component[seed_bb->index] = 1;
+   max_count_in_fn = max_count_in_fn.max (seed_bb->count);
+  }
+else
+  component[seed_bb->index] = 0;
   FOR_ALL_BB_FN (seed_bb, cfun)
if (!component[seed_bb->index])
  {
@@ -3509,12 +3527,15 @@ afdo_adjust_guessed_profile (bb_set *annotated_bb)
 profile_count annotated_count = e->dest->count;
 profile_count out_count = profile_count::zero ();
 bool ok = true;
+
 for (edge e2: e->dest->preds)
   if (AFDO_EINFO (e2)->is_annotated ())
 annotated_count -= AFDO_EINFO (e2)->get_count ();
-  else if (component[e->src->index] == component_id)
-out_count += e->count ();
-  else if (e->probability.nonzero_p ())
+  else if (component[e2->src->index] == component_id)
+out_count += e2->count ();
+  else if (is_bb_annotated (e2->src, *annotated_bb))
+annotated_count -= e2->count ();
+  else if (e2->probability.nonzero_p ())
 {
   ok = false;
   break;
@@ -3561,7 +3582,43 @@ afdo_adjust_guessed_profile (bb_set *annotated_bb)
 }
gcc_checking_assert (scales.length ());
scales.qsort (cmp);
-   scale_bbs (bbs, scales[scales.length () / 2]);
+
+   uint64_t overall_weight = 0;
+   for (scale &e : scales)
+overall_weight += e.weight;
+
+   uint64_t cummulated = 0, weight_sum = 0;
+   srea

[gcc r16-2150] Fix ICE in afdo_adjust_guessed_profile

2025-07-09 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:18324422fdd790b0c11ba300a706a86df1023b74

commit r16-2150-g18324422fdd790b0c11ba300a706a86df1023b74
Author: Jan Hubicka 
Date:   Wed Jul 9 18:30:09 2025 +0200

Fix ICE in afdo_adjust_guessed_profile

gcc/ChangeLog:

* auto-profile.cc (afdo_adjust_guessed_profile): Add forgotten
if (dump_file) guard.

Diff:
---
 gcc/auto-profile.cc | 10 +++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/gcc/auto-profile.cc b/gcc/auto-profile.cc
index e27bcc7b58db..219676012e76 100644
--- a/gcc/auto-profile.cc
+++ b/gcc/auto-profile.cc
@@ -3613,9 +3613,13 @@ afdo_adjust_guessed_profile (bb_set *annotated_bb)
  of hot basic blocks.  */
if (max_count * scale > max_count_in_fn.guessed_local ())
 {
-  fprintf (dump_file, "Scaling by %.16f produces max count ", 
scale.to_double ());
-  (max_count * scale).dump (dump_file);
-  fprintf (dump_file, " that exceeds max count in fn; capping\n");
+  if (dump_file)
+{
+  fprintf (dump_file, "Scaling by %.16f produces max count ",
+   scale.to_double ());
+  (max_count * scale).dump (dump_file);
+  fprintf (dump_file, " that exceeds max count in fn; capping\n");
+}
   scale = max_count_in_fn.guessed_local ().to_sreal_scale (max_count);
 }
scale_bbs (bbs, scale);


[gcc r16-2217] Fix some auto-profile issues

2025-07-12 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:f3186568d09c02a6d8915e43c0f5d7df704dfa0d

commit r16-2217-gf3186568d09c02a6d8915e43c0f5d7df704dfa0d
Author: Jan Hubicka 
Date:   Sat Jul 12 17:57:25 2025 +0200

Fix some auto-profile issues

This patch fixes minor things that have accumulated in my tree.  Except for
formatting fixes, an important change is that the seen set is now kept up to
date.  The original code first populated it for all strings in the string
table, but gimple matching may now introduce new ones that need to be checked
for a match with the symbol table as well.

This makes imagemagick from spec2017 faster with auto-fdo than without, at
least when trained with the ref run.  The train run is problematic since it
does not exercise the innermost loop at all, so even with normal PGO it is
slower than without.

Autoprofiledbootstrapped/regtested x86_64-linux, committed.

gcc/ChangeLog:

* auto-profile.cc (function_instance::~function_instance):
Move down in source.
(string_table::get_cgraph_node): New member function with
logic broken out from ...
(function_instance::get_cgraph_node): ... here.
(match_with_target): Fix formatting.
(function_instance::match): Fix formatting; do not use iterators
after modifying map; remove incorrect set of warned flag.
(autofdo_source_profile::offline_external_functions): Keep
seen set up to date.
(function_instance::read_function_instance): Fix formatting.

Diff:
---
 gcc/auto-profile.cc | 228 ++--
 1 file changed, 130 insertions(+), 98 deletions(-)

diff --git a/gcc/auto-profile.cc b/gcc/auto-profile.cc
index 5226e4550257..d1954b4fad69 100644
--- a/gcc/auto-profile.cc
+++ b/gcc/auto-profile.cc
@@ -240,6 +240,8 @@ public:
   /* Add new name and return its index.  */
   int add_name (char *);
 
+  /* Return cgraph node corresponding to given name index.  */
+  cgraph_node *get_cgraph_node (int);
 private:
   typedef std::map string_index_map;
   string_vector vector_;
@@ -445,7 +447,6 @@ public:
   /* Lookup count and warn about duplicates.  */
   count_info *lookup_count (location_t loc, inline_stack &stack,
cgraph_node *node);
-
 private:
   /* Callsite, represented as (decl_lineno, callee_function_name_index).  */
   typedef std::pair callsite;
@@ -888,21 +889,13 @@ string_table::read ()
   return true;
 }
 
-/* Member functions for function_instance.  */
-
-function_instance::~function_instance ()
-{
-  gcc_assert (!in_worklist_p ());
-  for (callsite_map::iterator iter = callsites.begin ();
-   iter != callsites.end (); ++iter)
-delete iter->second;
-}
-
-/* Return corresponding cgraph node, NULL if unavailable.  */
+/* Return cgraph node corresponding to given NAME_INDEX,
+   NULL if unavailable.  */
 cgraph_node *
-function_instance::get_cgraph_node ()
+string_table::get_cgraph_node (int name_index)
 {
-  const char *sname = afdo_string_table->get_name (name ());
+  const char *sname = get_name (name_index);
+
   symtab_node *n = cgraph_node::get_for_asmname (get_identifier (sname));
   for (;n; n = n->next_sharing_asm_name)
 if (cgraph_node *cn = dyn_cast  (n))
@@ -911,6 +904,24 @@ function_instance::get_cgraph_node ()
   return NULL;
 }
 
+/* Return corresponding cgraph node.  */
+
+cgraph_node *
+function_instance::get_cgraph_node ()
+{
+  return afdo_string_table->get_cgraph_node (name ());
+}
+
+/* Member functions for function_instance.  */
+
+function_instance::~function_instance ()
+{
+  gcc_assert (!in_worklist_p ());
+  for (callsite_map::iterator iter = callsites.begin ();
+   iter != callsites.end (); ++iter)
+delete iter->second;
+}
+
 /* Traverse callsites of the current function_instance to find one at the
location of LINENO and callee name represented in DECL.  */
 
@@ -1169,7 +1180,7 @@ match_with_target (cgraph_node *n,
}
   /* Accept dwarf names and stripped suffixes.  */
   if (!strcmp (lang_hooks.dwarf_name (callee->decl, 0),
-  afdo_string_table->get_name (inlined_fn->name ()))
+  afdo_string_table->get_name (inlined_fn->name ()))
  || (!name[i] && symbol_name[i] == '.')
  || in_suffix)
{
@@ -1183,8 +1194,8 @@ match_with_target (cgraph_node *n,
  inlined_fn->set_name (index);
  return 2;
}
-  /* Only warn about declarations.  It is possible that the function is
-declared as alias in other module and we inlined cross-module.  */
+  /* Only warn about declarations.  It is possible that the function
+is declared as alias in other module and we inlined cross-module.  */
   if (callee->definition
  && warning (OPT_Wauto_profile,
  "auto-profile of %q+F contains inlined "
@@ -1491,8 +1502,8 @@ function_instance::match (cgraph_node *node,

[gcc r16-2196] Fix ICE in speculative devirtualization

2025-07-11 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:52d9c2272f6366cd5b30e9540ce8ef16b482cee5

commit r16-2196-g52d9c2272f6366cd5b30e9540ce8ef16b482cee5
Author: Jan Hubicka 
Date:   Fri Jul 11 12:37:24 2025 +0200

Fix ICE in speculative devirtualization

This patch fixes an ICE building lto1 with autoprofiledbootstrap and in
pr114790.  What happens is that auto-fdo speculatively devirtualizes to a
wrong target.  This is due to a bug where it mixes up dwarf names and linkage
names of inline functions, which I need to fix as well.

Later we clone at WPA time.  At ltrans time the clone is materialized and the
call is turned into a direct call (this optimization is missed by ipa-cp
propagation).  At this point we should resolve the speculation, but we don't.
As a result we get an error from the verifier after inlining, complaining
that there is a speculative call whose corresponding direct call lacks the
speculative flag.

This seems to be a long-standing problem in
cgraph_update_edges_for_call_stmt_node, but I suppose it does not trigger
since we usually speculate correctly or notice the direct call at WPA time
already.

Bootstrapped/regtested x86_64-linux.

gcc/ChangeLog:

PR ipa/114790
* cgraph.cc (cgraph_update_edges_for_call_stmt_node): Resolve 
devirtualization
if call statement was optimized out or turned to direct call.

gcc/testsuite/ChangeLog:

* g++.dg/lto/pr114790_0.C: New test.
* g++.dg/lto/pr114790_1.C: New test.

Diff:
---
 gcc/cgraph.cc | 13 +
 gcc/testsuite/g++.dg/lto/pr114790_0.C | 16 
 gcc/testsuite/g++.dg/lto/pr114790_1.C | 15 +++
 3 files changed, 44 insertions(+)

diff --git a/gcc/cgraph.cc b/gcc/cgraph.cc
index 94a2e6e61058..32071a84bacc 100644
--- a/gcc/cgraph.cc
+++ b/gcc/cgraph.cc
@@ -1790,6 +1790,19 @@ cgraph_update_edges_for_call_stmt_node (cgraph_node 
*node,
 
   if (e)
{
+ /* If the call was devirtualized during cloning, mark the edge
+as resolved.  */
+ if (e->speculative)
+   {
+ if (new_stmt && is_gimple_call (new_stmt))
+   {
+ tree decl = gimple_call_fndecl (new_stmt);
+ if (decl)
+   e = cgraph_edge::resolve_speculation (e, decl);
+   }
+ else
+   e = cgraph_edge::resolve_speculation (e, NULL);
+   }
  /* Keep calls marked as dead dead.  */
  if (new_stmt && is_gimple_call (new_stmt) && e->callee
  && fndecl_built_in_p (e->callee->decl, BUILT_IN_UNREACHABLE,
diff --git a/gcc/testsuite/g++.dg/lto/pr114790_0.C 
b/gcc/testsuite/g++.dg/lto/pr114790_0.C
new file mode 100644
index ..eed112df3897
--- /dev/null
+++ b/gcc/testsuite/g++.dg/lto/pr114790_0.C
@@ -0,0 +1,16 @@
+// { dg-lto-do link }
+// { dg-lto-options { { -w -flto -g -flto-partition=1to1 -O2 -shared -fPIC 
-fvisibility=hidden} } }
+// { dg-require-effective-target fpic }
+// { dg-require-effective-target shared }
+struct APITracerContext {
+  virtual ~APITracerContext() = default;
+  virtual void releaseActivetracersList() = 0;
+};
+struct APITracerContextImp : APITracerContext {
+  ~APITracerContextImp() override;
+  void releaseActivetracersList() override;
+};
+struct APITracerContextImp globalAPITracerContextImp;
+struct APITracerContextImp *pGlobalAPITracerContextImp = 
&globalAPITracerContextImp;
+APITracerContextImp::~APITracerContextImp() {}
+
diff --git a/gcc/testsuite/g++.dg/lto/pr114790_1.C 
b/gcc/testsuite/g++.dg/lto/pr114790_1.C
new file mode 100644
index ..511fae45be8b
--- /dev/null
+++ b/gcc/testsuite/g++.dg/lto/pr114790_1.C
@@ -0,0 +1,15 @@
+struct APITracerContext {
+  virtual void releaseActivetracersList() = 0;
+};
+extern struct APITracerContextImp *pGlobalAPITracerContextImp;
+struct APITracerContextImp : APITracerContext { void 
releaseActivetracersList();};
+int g();
+inline int
+apiTracerWrapperImp(  ) {
+  for (int i = 0; i < g(); i++) 
+  pGlobalAPITracerContextImp->releaseActivetracersList();
+}
+__attribute__((visibility("default"))) int
+zeCommandListAppendMemoryCopyTracing() {
+  return apiTracerWrapperImp(  );
+}


[gcc r16-2197] Rewrite assign_discriminators

2025-07-11 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:385d9937f0e23cbf9c62f0b2553a33ff70e56ecf

commit r16-2197-g385d9937f0e23cbf9c62f0b2553a33ff70e56ecf
Author: Jan Hubicka 
Date:   Fri Jul 11 13:01:13 2025 +0200

Rewrite assign_discriminators

To assign debug locations to corresponding statements auto-fdo uses
discriminators.  The documentation says that if a given statement belongs to
multiple basic blocks, the discriminator distinguishes them.

The current implementation however only works for statements that expand into
a sequence of gimple statements forming a linear sequence, since it
essentially tracks a current location and renews it each time a new BB is
found.
This is commonly not true for C++ code as in:

   :
  [simulator/csimplemodule.cc:379:85] _40 = 
std::__cxx11::basic_string::c_str ([simulator/csimplemodule.cc:379:85] 
&D.80680);
  [simulator/csimplemodule.cc:379:85 discrim 13] _41 = 
[simulator/csimplemodule.cc:379:85] 
&this->D.78503.D.78106.D.72008.D.68585.D.67935.D.67879.D.67782;
  [simulator/csimplemodule.cc:379:85 discrim 13] _42 = 
&this->D.78503.D.78106.D.72008.D.68585.D.67935.D.67879.D.67782;
  [simulator/csimplemodule.cc:377:45] _43 = 
this->D.78503.D.78106.D.72008.D.68585.D.67935.D.67879.D.67782._vptr.cObject;
  [simulator/csimplemodule.cc:377:45] _44 = _43 + 40;
  [simulator/csimplemodule.cc:377:45] _45 = 
[simulator/csimplemodule.cc:377:45] *_44;
  [simulator/csimplemodule.cc:379:85] D.89001 = OBJ_TYPE_REF(_45;(const 
struct cObject)_42->5B) (_41);

This is a fragment of code that is expanded from:

371 if (this!=simulation.getContextModule())
372 throw cRuntimeError("send()/sendDelayed() of module (%s)%s 
called in the context of "
373 "module (%s)%s: method called from the 
latter module "
374 "lacks Enter_Method() or 
Enter_Method_Silent()? "
375 "Also, if message to be sent is passed 
from that module, "
376 "you'll need to call take(msg) after 
Enter_Method() as well",
377 getClassName(), getFullPath().c_str(),
378 
simulation.getContextModule()->getClassName(),
379 
simulation.getContextModule()->getFullPath().c_str());

Notice that 379:85 is interleaved with 377:45 and the pass does not assign a
new discriminator.  With the patch we get:

   :
  [simulator/csimplemodule.cc:379:85 discrim 7] _40 = 
std::__cxx11::basic_string::c_str ([simulator/csimplemodule.cc:379:85] 
&D.80680);
  [simulator/csimplemodule.cc:379:85 discrim 8] _41 = 
[simulator/csimplemodule.cc:379:85] 
&this->D.78503.D.78106.D.72008.D.68585.D.67935.D.67879.D.67782;
  [simulator/csimplemodule.cc:379:85 discrim 8] _42 = 
&this->D.78503.D.78106.D.72008.D.68585.D.67935.D.67879.D.67782;
  [simulator/csimplemodule.cc:377:45 discrim 1] _43 = 
this->D.78503.D.78106.D.72008.D.68585.D.67935.D.67879.D.67782._vptr.cObject;
  [simulator/csimplemodule.cc:377:45 discrim 1] _44 = _43 + 40;
  [simulator/csimplemodule.cc:377:45 discrim 1] _45 = 
[simulator/csimplemodule.cc:377:45] *_44;
  [simulator/csimplemodule.cc:379:85 discrim 8] D.89001 = 
OBJ_TYPE_REF(_45;(const struct cObject)_42->5B) (_41);

There are earlier statements with line number 379, which is why the call gets
discriminator 7.  After that the discriminator is increased.  There are two
reasons for it:
 1) AFDO requires every callsite to have a unique lineno:discriminator pair
 2) the call may not terminate and thus the profile of the first statement
    may be higher than that of the rest.

The old pass also contained logic to skip debug statements.  This is not a
good idea since we output them to the debug output, and if the AFDO tool
picks these locations up they will be misplaced in basic blocks.

Debug statements are naturally quite useful to track back the AFDO profiles,
and in the meantime LLVM folks implemented something similar called
pseudoprobe.  I think it makes sense to enable debug statements with
-fauto-profile even if debug info is off, and to make use of them as done in
this patch.

Sadly the AFDO tool is quite broken and built around the assumption that
every address has at most one debug location assigned to it (i.e. debug info
from before debug statements were introduced).  I have a WIP patch fixing
this.

Note that LLVM also has -fdebug-info-for-auto-profile (on by default it
seems) that controls discriminator production and some other little bits.  I
wonder if we want to have something similar.  Should it be
-gdebug-info-for-auto-profile instead?
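
As an illustration, here is a minimal standalone sketch of the discriminator
assignment property described above; the stmt record and the per-line
counters are simplifications, not the tree-cfg.cc implementation:

#include <cstdio>
#include <map>
#include <vector>

struct stmt
{
  int line;      /* Source line of the statement.  */
  bool is_call;  /* Calls force a fresh discriminator afterwards.  */
};

int
main ()
{
  /* A basic block resembling the example above: two runs of line 379
     separated by statements from line 377.  */
  std::vector<stmt> bb = {{379, true}, {379, false}, {377, false},
                          {377, false}, {379, false}};
  std::map<int, int> next_discrim;  /* Per-line counter of runs seen.  */
  int prev_line = -1;
  for (const stmt &s : bb)
    {
      /* A new run of this line starts: hand out a fresh discriminator.  */
      if (s.line != prev_line)
        next_discrim[s.line]++;
      std::printf ("line %d discrim %d%s\n", s.line, next_discrim[s.line],
                   s.is_call ? " (call)" : "");
      /* The call may not return, so later statements on the same line
         must be distinguishable from it.  */
      if (s.is_call)
        next_discrim[s.line]++;
      prev_line = s.line;
    }
}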

gcc/ChangeLog:

* opts.cc (finish_options): Enable debug_nonbind_markers_p for
auto-profile.
* tree-cfg.cc (struct locus_discrim_m

[gcc r16-2176] Fixes to auto-profile and Gimple matching.

2025-07-10 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:50f3a6a437ad4f2438191b6d9aa9aed8575b9372

commit r16-2176-g50f3a6a437ad4f2438191b6d9aa9aed8575b9372
Author: Jan Hubicka 
Date:   Thu Jul 10 16:56:21 2025 +0200

Fixes to auto-profile and Gimple matching.

This patch fixes several issues I noticed in gimple matching and the
-Wauto-profile warning.  One problem is that we mismatched symbols with user
names, such as "*strlen" instead of "strlen".  I added raw_symbol_name to
strip the extra '*', which is ok on ELF targets, the only targets we support
with auto-profile, but eventually we will want to add the user label prefix.
There is a sorry about this.
Also I think dwarf2out is wrong:

static void
add_linkage_attr (dw_die_ref die, tree decl)
{
  const char *name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME 
(decl));

  /* Mimic what assemble_name_raw does with a leading '*'.  */
  if (name[0] == '*')
name = &name[1];

The patch also fixes the locations of warnings.  I used the location of the
problematic statement as the warning_at parameter but also included info
about the containing function.  This made warning_at ignore the first
location; that is fixed now.

I also fixed the ICE with -Wno-auto-profile discussed earlier.

Bootstrapped/regtested x86_64-linux.  Autoprofiled bootstrap now fails for
weird reasons for me (it does not build the training stage), so I will try to
debug this before committing.

gcc/ChangeLog:

* auto-profile.cc: Include output.h.
(function_instance::set_call_location): Also sanity check
that location is known.
(raw_symbol_name): Two new static functions.
(dump_inline_stack): Use it.
(string_table::get_index_by_decl): Likewise.
(function_instance::get_cgraph_node): Likewise.
(function_instance::get_function_instance_by_decl): Fix typo
in warning; use raw names; fix lineno decoding.
(match_with_target): Add containing function parameter;
correctly output function and call location in warning.
(function_instance::lookup_count): Fix warning locations.
(function_instance::match): Fix warning locations; avoid
crash with mismatched callee; do not warn about broken callsites
twice.
(autofdo_source_profile::offline_external_functions): Use
raw_assembler_name.
(walk_block): Use raw_assembler_name.

gcc/testsuite/ChangeLog:

* gcc.dg/tree-prof/afdo-inline.c: Add user symbol names.

Diff:
---
 gcc/auto-profile.cc  | 231 +--
 gcc/testsuite/gcc.dg/tree-prof/afdo-inline.c |   9 ++
 2 files changed, 156 insertions(+), 84 deletions(-)

diff --git a/gcc/auto-profile.cc b/gcc/auto-profile.cc
index 219676012e76..5226e4550257 100644
--- a/gcc/auto-profile.cc
+++ b/gcc/auto-profile.cc
@@ -53,6 +53,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "auto-profile.h"
 #include "tree-pretty-print.h"
 #include "gimple-pretty-print.h"
+#include "output.h"
 
 /* The following routines implements AutoFDO optimization.
 
@@ -430,7 +431,8 @@ public:
   void
   set_call_location (location_t l)
   {
-gcc_checking_assert (call_location_ == UNKNOWN_LOCATION);
+gcc_checking_assert (call_location_ == UNKNOWN_LOCATION
+&& l != UNKNOWN_LOCATION);
 call_location_= l;
   }
 
@@ -685,6 +687,26 @@ dump_afdo_loc (FILE *f, unsigned loc)
 fprintf (f, "%i", loc >> 16);
 }
 
+/* Return assembler name as in symbol table and DW_AT_linkage_name.  */
+
+static const char *
+raw_symbol_name (const char *asmname)
+{
+  /* If we start supporting user_label_prefixes, add_linkage_attr will also
+ need to be fixed.  */
+  if (strlen (user_label_prefix))
+sorry ("auto-profile is not supported for targets with user label prefix");
+  return asmname + (asmname[0] == '*');
+}
+
+/* Convenience wrapper that looks up assembler name.  */
+
+static const char *
+raw_symbol_name (tree decl)
+{
+  return raw_symbol_name (IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
+}
+
 /* Dump STACK to F.  */
 
 static void
@@ -695,7 +717,7 @@ dump_inline_stack (FILE *f, inline_stack *stack)
 {
   fprintf (f, "%s%s:",
   first ? "" : "; ",
-  IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (p.decl)));
+  raw_symbol_name (p.decl));
   dump_afdo_loc (f, p.afdo_loc);
   first = false;
 }
@@ -817,7 +839,7 @@ string_table::get_index (const char *name) const
 int
 string_table::get_index_by_decl (tree decl) const
 {
-  const char *name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
+  const char *name = raw_symbol_name (decl);
   int ret = get_index (name);
   if (ret != -1)
 return ret;
@@ -880,10 +902,9 @@ function_instance::~function_instan

[gcc r16-1968] Fix division by zero in ipa-cp.cc:update_profiling_info

2025-07-03 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:a52484f1ac34dbb604dc862407d9abb32df444dd

commit r16-1968-ga52484f1ac34dbb604dc862407d9abb32df444dd
Author: Jan Hubicka 
Date:   Thu Jul 3 11:56:28 2025 +0200

Fix division by zero in ipa-cp.cc:update_profiling_info

This ICE triggered for me during autoprofiledbootstrap.  The code already
takes the possible range into account, so I think in this case we can just
push to one side of it.

Bootstrapped/regtested x86_64-linux, OK?

gcc/ChangeLog:

* ipa-cp.cc (update_profiling_info): Watch for division by zero.

Diff:
---
 gcc/ipa-cp.cc | 11 ++-
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/gcc/ipa-cp.cc b/gcc/ipa-cp.cc
index 901d4a5616e9..480cf48786c7 100644
--- a/gcc/ipa-cp.cc
+++ b/gcc/ipa-cp.cc
@@ -4838,11 +4838,12 @@ update_profiling_info (struct cgraph_node *orig_node,
   profile_count unexp = orig_node_count - new_sum - orig_nonrec_call_count;
 
   int limit_den = 2 * (orig_nonrec_calls + new_nonrec_calls);
-  profile_count new_part
-   = MAX(MIN (unexp.apply_scale (new_sum,
- new_sum + orig_nonrec_call_count),
-  unexp.apply_scale (limit_den - 1, limit_den)),
- unexp.apply_scale (new_nonrec_calls, limit_den));
+  profile_count new_part = unexp.apply_scale (limit_den - 1, limit_den);
+  profile_count den = new_sum + orig_nonrec_call_count;
+  if (den.nonzero_p ())
+   new_part = MIN (unexp.apply_scale (new_sum, den), new_part);
+  new_part = MAX (new_part,
+ unexp.apply_scale (new_nonrec_calls, limit_den));
   if (dump_file)
{
  fprintf (dump_file, "   Claiming ");


[gcc r16-1663] Remove early inlining from afdo pass

2025-06-25 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:3fde750a29820a1ccd6dd399bdfa0cf3d97a2c30

commit r16-1663-g3fde750a29820a1ccd6dd399bdfa0cf3d97a2c30
Author: Jan Hubicka 
Date:   Wed Jun 25 02:59:54 2025 +0200

Remove early inlining from afdo pass

This patch removes early inlining from the afdo pass since all inlining
should now happen from the early inliner.  I tested this on spec and there
are 3 inlines happening here which are blocked at early-inline time by
hitting the large function growth limit.  We probably want to bypass that
limit; I will look into that incrementally.

This should hopefully make the profile merging for non-inlined functions
easier.

It may still make sense to separate the afdo inliner from the early inliner
to solve the non-transitivity issues, which is not that hard to do with the
current code organization.  However, this should be a separate IPA pass
rather than another part of the afdo pass, since it is conceptually separate.
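
As an illustration, a minimal sketch of the AFDO inlining test mentioned
below in the phase 2 comment; the inlined_instance record and hot_threshold
parameter are simplifications of the auto-profile.cc data structures:

#include <cstdint>

struct inlined_instance
{
  uint64_t total_count;  /* Samples attributed to the inlined body.  */
};

/* PROFILE is the inlined instance recorded for the callsite, or NULL if
   the profiled binary did not inline it.  HOT_THRESHOLD stands in for
   the afdo hotness cutoff.  */
static bool
afdo_callsite_hot_enough (const inlined_instance *profile,
                          uint64_t hot_threshold)
{
  /* Condition 1: the callsite was inlined in the profiled binary.  */
  if (!profile)
    return false;
  /* Condition 2: the inlined body was hot in the profiling run.  */
  return profile->total_count >= hot_threshold;
}

int
main ()
{
  inlined_instance hot = {1000000};
  return afdo_callsite_hot_enough (&hot, 1000) ? 0 : 1;
}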

gcc/ChangeLog:

* auto-profile.cc: Update toplevel comment.
(early_inline): Remove.
(auto_profile): Don't do early inlining.

Diff:
---
 gcc/auto-profile.cc | 39 +++
 1 file changed, 15 insertions(+), 24 deletions(-)

diff --git a/gcc/auto-profile.cc b/gcc/auto-profile.cc
index 8a1d9f878c65..3f8310e6324b 100644
--- a/gcc/auto-profile.cc
+++ b/gcc/auto-profile.cc
@@ -76,21 +76,30 @@ along with GCC; see the file COPYING3.  If not see
  standalone symbol, or a clone of a function that is inlined into another
  function.
 
-   Phase 2: Early inline + value profile transformation.
- Early inline uses autofdo_source_profile to find if a callsite is:
+   Phase 2: AFDO inline + value profile transformation.
+ This happens during early optimization.
+ During early inlining the AFDO inliner is executed, which
+ uses autofdo_source_profile to find if a callsite is:
 * inlined in the profiled binary.
 * callee body is hot in the profiling run.
  If both condition satisfies, early inline will inline the callsite
  regardless of the code growth.
- Phase 2 is an iterative process. During each iteration, we also check
- if an indirect callsite is promoted and inlined in the profiling run.
- If yes, vpt will happen to force promote it and in the next iteration,
- einline will inline the promoted callsite in the next iteration.
+
+ Performing this early has the benefit of doing early optimizations
+ before the profile read IPA pass and getting more "context sensitivity" of
+ the profile read.  Profile of inlined functions may differ
+ significantly from one inline instance to another and from the
+ offline version.
+
+ This is controlled by -fauto-profile-inlining and is independent
+ of -fearly-inlining.
 
Phase 3: Annotate control flow graph.
  AutoFDO uses a separate pass to:
 * Annotate basic block count
 * Estimate branch probability
+   * Use earlier static profile to fill in the gaps
+ if the AFDO profile is ambiguous
 
After the above 3 phases, all profile is readily annotated on the GCC IR.
AutoFDO tries to reuse all FDO infrastructure as much as possible to make
@@ -2217,18 +2226,6 @@ afdo_annotate_cfg (void)
   free_dominance_info (CDI_POST_DOMINATORS);
 }
 
-/* Wrapper function to invoke early inliner.  */
-
-static unsigned int
-early_inline ()
-{
-  compute_fn_summary (cgraph_node::get (current_function_decl), true);
-  unsigned int todo = early_inliner (cfun);
-  if (todo & TODO_update_ssa_any)
-update_ssa (TODO_update_ssa);
-  return todo;
-}
-
 /* Use AutoFDO profile to annoate the control flow graph.
Return the todo flag.  */
 
@@ -2254,15 +2251,9 @@ auto_profile (void)
 
 push_cfun (DECL_STRUCT_FUNCTION (node->decl));
 
-unsigned int todo = early_inline ();
 autofdo::afdo_annotate_cfg ();
 compute_function_frequency ();
 
-/* Local pure-const may imply need to fixup the cfg.  */
-todo |= execute_fixup_cfg ();
-if (todo & TODO_cleanup_cfg)
-  cleanup_tree_cfg ();
-
 free_dominance_info (CDI_DOMINATORS);
 free_dominance_info (CDI_POST_DOMINATORS);
 cgraph_edge::rebuild_edges ();


[gcc r16-1961] Auto-FDO/FDO profile comparator

2025-07-03 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:09db37f7cea79f1cfcede455763e5e2da28ae2d5

commit r16-1961-g09db37f7cea79f1cfcede455763e5e2da28ae2d5
Author: Jan Hubicka 
Date:   Tue Jul 1 08:32:56 2025 +0200

Auto-FDO/FDO profile comparator

The patch I sent from the airport only worked if you produced the gcda files
with an unpatched compiler.  For some reason auto-profile reading was
intertwined with gcov reading, which is not necessary.  Here is a cleaner
version, which also makes the format a bit more convenient.  One can now grep
as:

grep "bb.*fdo.*very hot.*cold" *.profile | sort -n -k 5 -r | less

digits_2/30 bb 307 fdo 10273284651 (very hot) afdo 0 (auto FDO) (cold)  
scaled 0 diff -10273284651, -100.00%
digits_2/30 bb 201 fdo 2295561442 (very hot) afdo 19074 (auto FDO) (cold)  
scaled 1341585 diff -2294219857, -99.94%
digits_2/30 bb 203 fdo 1236123372 (very hot) afdo 9537 (auto FDO) (cold)  
scaled 670792 diff -1235452580, -99.95%
digits_2/30 bb 200 fdo 1236123372 (very hot) afdo 9537 (auto FDO) (cold)  
scaled 670792 diff -1235452580, -99.95%
digits_2/30 bb 202 fdo 1059438070 (very hot) afdo 9537 (auto FDO) (cold)  
scaled 670792 diff -1058767278, -99.94%
new_solver/9 bb 246 fdo 413879041 (very hot) afdo 76594 (guessed) (cold)  
scaled 5387299 diff -408491742, -98.70%
new_solver/9 bb 167 fdo 413792205 (very hot) afdo 76594 (guessed) (cold)  
scaled 5387299 diff -408404906, -98.70%
new_solver/9 bb 159 fdo 387809230 (very hot) afdo 57182 (guessed) (cold)  
scaled 4021940 diff -383787290, -98.96%
new_solver/9 bb 158 fdo 387809230 (very hot) afdo 60510 (guessed) (cold)  
scaled 4256018 diff -383553212, -98.90%
new_solver/9 bb 138 fdo 387809230 (very hot) afdo 40917 (guessed) (cold)  
scaled 2877929 diff -384931301, -99.26%
new_solver/9 bb 137 fdo 387809230 (very hot) afdo 43298 (guessed) (cold)  
scaled 3045398 diff -384763832, -99.21%

This dumps basic blocks that have large counts with normal profile feedback
but to which autofdo gives a small count (so they become cold).  These indeed
seem to be mostly basic blocks controlling loops.
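
As an illustration, one line of the dump can be reproduced with the following
standalone sketch; the scale value is back-computed from the digits_2/30 bb
201 line and is only an assumption, and the real records live in profile.cc:

#include <cstdint>
#include <cstdio>

int
main ()
{
  uint64_t fdo = 2295561442;  /* Count from instrumented profiling.  */
  uint64_t afdo = 19074;      /* Count read from the afdo profile.  */
  double scale = 70.336;      /* Assumed afdo->fdo domain scale.  */

  uint64_t scaled = (uint64_t) (afdo * scale);       /* ~1341585.  */
  int64_t diff = (int64_t) scaled - (int64_t) fdo;   /* ~-2294219857.  */
  std::printf ("fdo %llu afdo %llu scaled %llu diff %lld, %.2f%%\n",
               (unsigned long long) fdo, (unsigned long long) afdo,
               (unsigned long long) scaled, (long long) diff,
               100.0 * diff / fdo);
}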

gcc/ChangeLog:

* auto-profile.cc (afdo_hot_bb_threshod): New global
variable.
(maybe_hot_afdo_count_p): New function.
(autofdo_source_profile::read): Do not set up dump file;
set afdo_hot_bb_threshod.
(afdo_annotate_cfg): Handle partial training.
(afdo_callsite_hot_enough_for_early_inline):
Use maybe_hot_afdo_count_p.
(auto_profile_offline::execute): Read autofdo file.
* auto-profile.h (maybe_hot_afdo_count_p): Declare.
(afdo_hot_bb_threshold): Declare.
* coverage.cc (read_counts_file): Also set gcov_profile_info.
(coverage_init): Do not read autofdo file.
* opts.cc (enable_fdo_optimizations): Add autofdo parameter;
do not set flag_branch_probabilities and flag_profile_values
with it.
(common_handle_option): Update.
* passes.cc (finish_optimization_passes): Do not end branch
prob here.
(pass_manager::dump_profile_report): Also mark change after
autofdo pass.
* profile.cc: Include auto-profile.h
(gcov_profile_info): New global variable.
(struct afdo_fdo_record): New structure.
(compute_branch_probabilities): Record afdo profile.
(end_branch_prob): Dump afdo/fdo profile comparison.
* profile.h (gcov_profile_info): Declare.
* tree-profile.cc (tree_profiling): Call end_branch_prob.
(pass_ipa_tree_profile::gate): Also enable with autoFDO.

Diff:
---
 gcc/auto-profile.cc |  53 ++---
 gcc/auto-profile.h  |   7 
 gcc/coverage.cc |   6 +--
 gcc/opts.cc |  13 +++---
 gcc/passes.cc   |   8 +---
 gcc/profile.cc  | 111 +++-
 gcc/profile.h   |   2 +-
 gcc/tree-profile.cc |   7 ++--
 8 files changed, 170 insertions(+), 37 deletions(-)

diff --git a/gcc/auto-profile.cc b/gcc/auto-profile.cc
index d78f2cb42b5c..743b005938c1 100644
--- a/gcc/auto-profile.cc
+++ b/gcc/auto-profile.cc
@@ -123,6 +123,18 @@ along with GCC; see the file COPYING3.  If not see
 #define DEFAULT_AUTO_PROFILE_FILE "fbdata.afdo"
 #define AUTO_PROFILE_VERSION 2
 
+/* Profile counts determined by AFDO smaller than afdo_hot_bb_threshold are
+   considered cold.  */
+gcov_type afdo_hot_bb_threshod = -1;
+
+/* Return true if COUNT is possibly hot.  */
+bool
+maybe_hot_afdo_count_p (profile_count count)
+{
+  gcc_checking_assert (count.ipa ().initialized_p ());
+  return count.ipa ().to_gcov_type () >= afdo_hot_bb_threshod;
+}
+
 namespace autofdo
 {
 
@@ -1908,9 +1920,6 @@ autofdo_source_profile::read ()
   /* Read in the function/callsite profile, and store it in local
  data structure.  */
   unsigned function_num =

[gcc r16-1962] Fix overflow in ipa-cp heuristics

2025-07-03 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:bba817adbfde5c44fb77cc284c1917d33407ec2e

commit r16-1962-gbba817adbfde5c44fb77cc284c1917d33407ec2e
Author: Jan Hubicka 
Date:   Thu Jul 3 10:19:31 2025 +0200

Fix overflow in ipa-cp heuristics

ipa-cp converts sreal times to int, while the point of sreal is to
accommodate very large values that can happen for loops with a large number
of iterations and also when the profile is inconsistent.  This happens with
afdo in the testsuite, where a loop preheader is estimated to have 0
executions while the loop body has a large number of executions.

Bootstrapped/regtested x86_64-linux, committed.
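
As an illustration of the overflow being avoided, with plain double standing
in for sreal and hypothetical values:

#include <cstdio>

int
main ()
{
  /* sreal can hold values far outside of int's range.  */
  double evaluation = 1e12;  /* E.g. a loop with a huge iteration count.  */
  int eval_threshold = 500;

  /* Buggy pattern: the conversion overflows (undefined behavior), so the
     comparison result is meaningless:
       bool bad = (int) evaluation >= eval_threshold;  */

  /* Fixed pattern: lift the threshold into the wide type instead.  */
  bool good = evaluation >= (double) eval_threshold;
  std::printf ("%d\n", (int) good);
}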

gcc/ChangeLog:

* ipa-cp.cc (hint_time_bonus): Return sreal and avoid
conversions to integer.
(good_cloning_opportunity_p): Avoid sreal to integer
conversions
(perform_estimation_of_a_value): Update.

Diff:
---
 gcc/ipa-cp.cc | 14 +++---
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/gcc/ipa-cp.cc b/gcc/ipa-cp.cc
index 3e073af662a6..75ea94f2ad85 100644
--- a/gcc/ipa-cp.cc
+++ b/gcc/ipa-cp.cc
@@ -3341,10 +3341,10 @@ devirtualization_time_bonus (struct cgraph_node *node,
 
 /* Return time bonus incurred because of hints stored in ESTIMATES.  */
 
-static int
+static sreal
 hint_time_bonus (cgraph_node *node, const ipa_call_estimates &estimates)
 {
-  int result = 0;
+  sreal result = 0;
   ipa_hints hints = estimates.hints;
   if (hints & (INLINE_HINT_loop_iterations | INLINE_HINT_loop_stride))
 result += opt_for_fn (node->decl, param_ipa_cp_loop_hint_bonus);
@@ -3352,10 +3352,10 @@ hint_time_bonus (cgraph_node *node, const 
ipa_call_estimates &estimates)
   sreal bonus_for_one = opt_for_fn (node->decl, param_ipa_cp_loop_hint_bonus);
 
   if (hints & INLINE_HINT_loop_iterations)
-result += (estimates.loops_with_known_iterations * bonus_for_one).to_int 
();
+result += estimates.loops_with_known_iterations * bonus_for_one;
 
   if (hints & INLINE_HINT_loop_stride)
-result += (estimates.loops_with_known_strides * bonus_for_one).to_int ();
+result += estimates.loops_with_known_strides * bonus_for_one;
 
   return result;
 }
@@ -3436,7 +3436,7 @@ good_cloning_opportunity_p (struct cgraph_node *node, 
sreal time_benefit,
 introduced.  This is likely almost always going to be true, since we
 already checked that time saved is large enough to be considered
 hot.  */
-  else if (evaluation.to_int () >= eval_threshold)
+  else if (evaluation >= (sreal)eval_threshold)
return true;
   /* If all call sites have profile known; we know we do not want t clone.
 If there are calls with unknown profile; try local heuristics.  */
@@ -3457,7 +3457,7 @@ good_cloning_opportunity_p (struct cgraph_node *node, 
sreal time_benefit,
 info->node_calling_single_call ? ", single_call" : "",
 evaluation.to_double (), eval_threshold);
 
-  return evaluation.to_int () >= eval_threshold;
+  return evaluation >= eval_threshold;
 }
 
 /* Grow vectors in AVALS and fill them with information about values of
@@ -3543,8 +3543,8 @@ perform_estimation_of_a_value (cgraph_node *node,
 time_benefit = 0;
   else
 time_benefit = (estimates.nonspecialized_time - estimates.time)
+  + hint_time_bonus (node, estimates)
   + (devirtualization_time_bonus (node, avals)
-+ hint_time_bonus (node, estimates)
 + removable_params_cost + est_move_cost);
 
   int size = estimates.size;


[gcc r16-1963] Enable ipa-cp cloning for cold wrappers of hot functions

2025-07-03 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:328ef9aaede3c59224e52a1337416e5489e7c6c8

commit r16-1963-g328ef9aaede3c59224e52a1337416e5489e7c6c8
Author: Jan Hubicka 
Date:   Thu Jul 3 10:25:39 2025 +0200

Enable ipa-cp cloning for cold wrappers of hot functions

ipa-cp cloning disables itself for all functions not passing the opt_for_fn
(node->decl, optimize_size) test, which disables it for cold wrappers of hot
functions where we want to propagate.  Since we later want the time saved to
be considered hot, we do not need this early test.

The patch also fixes a few other places where an AFDO count of 0 disables
ipa-cp.

gcc/ChangeLog:

* ipa-cp.cc (cs_interesting_for_ipcp_p): Handle
correctly GLOBAL0 afdo counts.
(ipcp_cloning_candidate_p): Do not rule out nodes
!node->optimize_for_size_p ().
(good_cloning_opportunity_p): Handle afdo counts
as non-zero.

Diff:
---
 gcc/ipa-cp.cc | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/gcc/ipa-cp.cc b/gcc/ipa-cp.cc
index 75ea94f2ad85..901d4a5616e9 100644
--- a/gcc/ipa-cp.cc
+++ b/gcc/ipa-cp.cc
@@ -554,6 +554,7 @@ cs_interesting_for_ipcp_p (cgraph_edge *e)
   /* If we have zero IPA profile, still consider edge for cloning
  in case we do partial training.  */
   if (e->count.ipa ().initialized_p ()
+  && e->count.ipa ().quality () != AFDO
   && !opt_for_fn (e->callee->decl,flag_profile_partial_training))
 return false;
   return true;
@@ -617,7 +618,9 @@ ipcp_cloning_candidate_p (struct cgraph_node *node)
   return false;
 }
 
-  if (node->optimize_for_size_p ())
+  /* Do not use the profile here since cold wrappers may wrap
+ hot functions.  */
+  if (opt_for_fn (node->decl, optimize_size))
 {
   if (dump_file)
fprintf (dump_file, "Not considering %s for cloning; "
@@ -3391,9 +3394,10 @@ good_cloning_opportunity_p (struct cgraph_node *node, 
sreal time_benefit,
int size_cost, bool called_without_ipa_profile)
 {
   gcc_assert (count_sum.ipa () == count_sum);
+  if (count_sum.quality () == AFDO)
+count_sum = count_sum.force_nonzero ();
   if (time_benefit == 0
   || !opt_for_fn (node->decl, flag_ipa_cp_clone)
-  || node->optimize_for_size_p ()
   /* If there is no call which was executed in profiling or where
 profile is missing, we do not want to clone.  */
   || (!called_without_ipa_profile && !count_sum.nonzero_p ()))


[gcc r16-1970] Add -Wauto-profile warning

2025-07-03 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:99f9e90160cb83b09ec7421e9b53e4fffe3ee5ec

commit r16-1970-g99f9e90160cb83b09ec7421e9b53e4fffe3ee5ec
Author: Jan Hubicka 
Date:   Thu Jul 3 12:05:45 2025 +0200

Add -Wauto-profile warning

This patch adds a new warning, -Wauto-profile, which warns about mismatches
between profile data and function bodies.  This is implemented during the
offline pass, where every function instance is compared with the actual
gimple body (if available) and we verify that the statement locations in the
profile data can be matched with statements in the function.

Currently it is mostly useful to find bugs, but eventually I hope it will be
useful for users to verify that auto-profile works as expected or to evaluate
how much of an old auto-profile can still be applied to current sources.
There will probably always be some corner cases we can not handle with the
auto-profile format (such as functions with bodies in multiple files) that
can be patched into the compiled program.

I also added logic to fix up missing discriminators in function callsites.
I am not sure how those happen (they seem to go away with -fno-crossjumping)
and will dig into it.

Another problem is that without -flto at the train run, inlined functions
have dwarf names rather than symbol names.  LLVM solves this by the
-gdebug-for-autoprofile flag that we could also have.  With this flag we
could output assembler names as well as multiplicities of statements.

Building SPECint there are approx 7k profile mismatches.

Bootstrapped/regtested x86_64-linux. Plan to commit it after some extra 
testing.
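
As an illustration, a minimal sketch of the matching idea; the body and
profile sets are hypothetical, and the real pass walks gimple statements and
function_instance counts:

#include <cstdio>
#include <set>
#include <vector>

/* Relative location encoded as (lineno << 16) | discriminator, the same
   shape dump_afdo_loc prints.  */
typedef unsigned afdo_loc;

int
main ()
{
  /* Locations present in the (hypothetical) gimple body.  */
  std::set<afdo_loc> body = {(3u << 16) | 0, (5u << 16) | 1};
  /* Locations recorded in the (hypothetical) profile data.  */
  std::vector<afdo_loc> profile = {(3u << 16) | 0, (7u << 16) | 0};

  for (afdo_loc loc : profile)
    if (!body.count (loc))
      std::printf ("warning: profile entry %u:%u has no matching stmt\n",
                   loc >> 16, loc & 0xffff);
}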

gcc/ChangeLog:

* auto-profile.cc (get_combined_location): Handle negative
offsets; output better diagnostics.
(get_relative_location_for_locus): Return -1 for unknown location.
(function_instance::get_cgraph_node): New member function.
(match_with_target): New function.
(dump_stmt): New function.
(function_instance::lookup_count): New function.
(mark_expr_locations): New function.
(function_instance::match): New function.
(autofdo_source_profile::offline_external_functions): Do
not repeat renaming; manage two worklists and do matching.
(autofdo_source_profile::offline_unrealized_inlines): Simplify.
(afdo_set_bb_count): do not look for lost discriminators.
(auto_profile): Do not ICE when profile reading failed.
* common.opt (Wauto-profile): New warning flag
* doc/invoke.texi (-Wauto-profile): Document.

Diff:
---
 gcc/auto-profile.cc | 788 
 gcc/common.opt  |   4 +
 gcc/doc/invoke.texi |   7 +-
 3 files changed, 684 insertions(+), 115 deletions(-)

diff --git a/gcc/auto-profile.cc b/gcc/auto-profile.cc
index 743b005938c1..64f4cda1b52d 100644
--- a/gcc/auto-profile.cc
+++ b/gcc/auto-profile.cc
@@ -307,6 +307,10 @@ public:
  remove them while possibly merging them to offline variants.  */
   void offline_if_not_realized (vec  &new_functions);
 
+  /* Match function instance with gimple body.  */
+  bool match (cgraph_node *node, vec  &new_functions,
+ name_index_map &to_symbol_name);
+
   /* Offline all inlined functions with name in SEEN.
  If new toplevel functions are created, add them to NEW_FUNCTIONS.  */
   void offline_if_in_set (name_index_set &seen,
@@ -407,6 +411,39 @@ public:
 return in_worklist_;
   }
 
+  /* Return corresponding cgraph node.  */
+  cgraph_node *get_cgraph_node ();
+
+  void
+  set_location (location_t l)
+  {
+gcc_checking_assert (location_ == UNKNOWN_LOCATION);
+location_= l;
+  }
+
+  location_t
+  get_location ()
+  {
+return location_;
+  }
+
+  void
+  set_call_location (location_t l)
+  {
+gcc_checking_assert (call_location_ == UNKNOWN_LOCATION);
+call_location_= l;
+  }
+
+  location_t
+  get_call_location ()
+  {
+return call_location_;
+  }
+
+  /* Lookup count and warn about duplicates.  */
+  count_info *lookup_count (location_t loc, inline_stack &stack,
+   cgraph_node *node);
+
 private:
   /* Callsite, represented as (decl_lineno, callee_function_name_index).  */
   typedef std::pair callsite;
@@ -415,9 +452,10 @@ private:
   typedef std::map callsite_map;
 
   function_instance (unsigned name, gcov_type head_count)
-  : name_ (name), total_count_ (0), head_count_ (head_count),
+ : name_ (name), total_count_ (0), head_count_ (head_count),
   removed_icall_target_ (false), realized_ (false),
-  in_worklist_ (false), inlined_to_ (NULL)
+  in_worklist_ (false), inlined_to_ (NULL),
+  location_ (UNKNOWN_LOCATION), call_location_ (UNKNOWN_LOCATION)
   {
   }
 
@@ -454,6 +492,9 @@ private:
  is a toplevel one.  */
   function_instance *inlined_to_;
 
+  /* Lo
