Re: More znver4 x86-tune flags

2023-01-09 Thread Hongtao Liu via Gcc-patches
On Tue, Jan 10, 2023 at 12:32 PM Jan Hubicka via Gcc-patches
 wrote:
>
>
> Hi,
> this patch adds more tunes for zen4:
>  - new tunes for avx512 scatter instructions.
>    In micro benchmarks these seem to be a consistent loss compared to
>    open-coded code.
>  - disable use of gather for zen4
>    While these are a win for micro benchmarks (based on TSVC), enabling
>    gather is a loss for parest. So for now it seems safe to keep it off.
>  - disable pass to avoid FMA chains for znver4 since fmadd was optimized
>    and does not seem to cause regressions.
>
> Bootstrapped/regtested x86_64.
> Honza
>
> * i386.cc (ix86_vectorize_builtin_scatter): Guard scatter by 
> TARGET_USE_SCATTER.
> * i386.h (TARGET_USE_SCATTER_2PARTS, TARGET_USE_SCATTER_4PARTS, 
> TARGET_USE_SCATTER): New macros.
> * x86-tune.def (TARGET_USE_SCATTER_2PARTS, TARGET_USE_SCATTER_4PARTS, 
> TARGET_USE_SCATTER): New tunes.
> (X86_TUNE_AVOID_256FMA_CHAINS, X86_TUNE_AVOID_512FMA_CHAINS): Disable 
> for znver4.
> (X86_TUNE_USE_GATHER): Disable for zen4.
> diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> index de978d19063..9fb69f6c174 100644
> --- a/gcc/config/i386/i386.cc
> +++ b/gcc/config/i386/i386.cc
> @@ -19051,6 +19051,13 @@ ix86_vectorize_builtin_scatter (const_tree vectype,
>if (!TARGET_AVX512F)
>  return NULL_TREE;
>
> +  if (known_eq (TYPE_VECTOR_SUBPARTS (vectype), 2u)
> +  ? !TARGET_USE_SCATTER_2PARTS
> +  : (known_eq (TYPE_VECTOR_SUBPARTS (vectype), 4u)
> +? !TARGET_USE_SCATTER_4PARTS
> +: !TARGET_USE_SCATTER))
> +return NULL_TREE;
> +
>if ((TREE_CODE (index_type) != INTEGER_TYPE
> && !POINTER_TYPE_P (index_type))
>|| (TYPE_MODE (index_type) != SImode
> diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
> index e6a603ed31a..cd7ed19e29c 100644
> --- a/gcc/config/i386/i386.h
> +++ b/gcc/config/i386/i386.h
> @@ -397,10 +397,16 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
> ix86_tune_features[X86_TUNE_AVOID_4BYTE_PREFIXES]
>  #define TARGET_USE_GATHER_2PARTS \
> ix86_tune_features[X86_TUNE_USE_GATHER_2PARTS]
> +#define TARGET_USE_SCATTER_2PARTS \
> +   ix86_tune_features[X86_TUNE_USE_SCATTER_2PARTS]
>  #define TARGET_USE_GATHER_4PARTS \
> ix86_tune_features[X86_TUNE_USE_GATHER_4PARTS]
> +#define TARGET_USE_SCATTER_4PARTS \
> +   ix86_tune_features[X86_TUNE_USE_SCATTER_4PARTS]
>  #define TARGET_USE_GATHER \
> ix86_tune_features[X86_TUNE_USE_GATHER]
> +#define TARGET_USE_SCATTER \
> +   ix86_tune_features[X86_TUNE_USE_SCATTER]
>  #define TARGET_FUSE_CMP_AND_BRANCH_32 \
> ix86_tune_features[X86_TUNE_FUSE_CMP_AND_BRANCH_32]
>  #define TARGET_FUSE_CMP_AND_BRANCH_64 \
> diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
> index fae3b650434..7e9c7244fc0 100644
> --- a/gcc/config/i386/x86-tune.def
> +++ b/gcc/config/i386/x86-tune.def
> @@ -483,28 +483,43 @@ DEF_TUNE (X86_TUNE_AVOID_4BYTE_PREFIXES, 
> "avoid_4byte_prefixes",
>  DEF_TUNE (X86_TUNE_USE_GATHER_2PARTS, "use_gather_2parts",
>   ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_ALDERLAKE | 
> m_CORE_ATOM | m_GENERIC))
>
> +/* X86_TUNE_USE_SCATTER_2PARTS: Use scatter instructions for vectors with 2
> +   elements.  */
> +DEF_TUNE (X86_TUNE_USE_SCATTER_2PARTS, "use_scatter_2parts",
> + ~(m_ZNVER4 | m_GENERIC))
> +
>  /* X86_TUNE_USE_GATHER_4PARTS: Use gather instructions for vectors with 4
> elements.  */
>  DEF_TUNE (X86_TUNE_USE_GATHER_4PARTS, "use_gather_4parts",
>   ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_ALDERLAKE | 
> m_CORE_ATOM | m_GENERIC))
>
> +/* X86_TUNE_USE_SCATTER_4PARTS: Use scatter instructions for vectors with 4
> +   elements.  */
> +DEF_TUNE (X86_TUNE_USE_SCATTER_4PARTS, "use_scatter_4parts",
> + ~(m_ZNVER4 | m_GENERIC))
> +
>  /* X86_TUNE_USE_GATHER: Use gather instructions for vectors with 8 or more
> elements.  */
>  DEF_TUNE (X86_TUNE_USE_GATHER, "use_gather",
> - ~(m_ZNVER1 | m_ZNVER2 | m_ALDERLAKE | m_CORE_ATOM | m_GENERIC))
> + ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER4 | m_ALDERLAKE | m_CORE_ATOM | 
> m_GENERIC))
> +
> +/* X86_TUNE_USE_SCATTER: Use scatter instructions for vectors with 8 or more
> +   elements.  */
> +DEF_TUNE (X86_TUNE_USE_SCATTER, "use_scatter",
> + ~(m_ZNVER4 | m_GENERIC))
>
>  /* X86_TUNE_AVOID_128FMA_CHAINS: Avoid creating loops with tight 128bit or
> smaller FMA chain.  */
> -DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER)
> +DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER1 | 
> m_ZNVER2 | m_ZNVER3)
According to the comments, it's *256bit or smaller*, so shouldn't
avoid_fma_chains be implied by avoid_fma256_chains?
>
>  /* X86_TUNE_AVOID_256FMA_CHAINS: Avoid creating loops with tight 256bit or
> smaller FMA chain.  */
> -DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains", m_ZNVER2 | 
> m_ZNVER3 | m_ZNVER4
> +DEF_TUNE 

More znver4 x86-tune flags

2023-01-09 Thread Jan Hubicka via Gcc-patches


Hi,
this patch adds more tunes for zen4:
 - new tunes for avx512 scatter instructions.
   In micro benchmarks these seem to be a consistent loss compared to
   open-coded code.
 - disable use of gather for zen4
   While these are a win for micro benchmarks (based on TSVC), enabling
   gather is a loss for parest. So for now it seems safe to keep it off.
 - disable pass to avoid FMA chains for znver4 since fmadd was optimized
   and does not seem to cause regressions.

Bootstrapped/regtested x86_64.
Honza

* i386.cc (ix86_vectorize_builtin_scatter): Guard scatter by 
TARGET_USE_SCATTER.
* i386.h (TARGET_USE_SCATTER_2PARTS, TARGET_USE_SCATTER_4PARTS, 
TARGET_USE_SCATTER): New macros.
* x86-tune.def (TARGET_USE_SCATTER_2PARTS, TARGET_USE_SCATTER_4PARTS, 
TARGET_USE_SCATTER): New tunes.
(X86_TUNE_AVOID_256FMA_CHAINS, X86_TUNE_AVOID_512FMA_CHAINS): Disable 
for znver4.
(X86_TUNE_USE_GATHER): Disable for zen4.
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index de978d19063..9fb69f6c174 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -19051,6 +19051,13 @@ ix86_vectorize_builtin_scatter (const_tree vectype,
   if (!TARGET_AVX512F)
 return NULL_TREE;
 
+  if (known_eq (TYPE_VECTOR_SUBPARTS (vectype), 2u)
+  ? !TARGET_USE_SCATTER_2PARTS
+  : (known_eq (TYPE_VECTOR_SUBPARTS (vectype), 4u)
+? !TARGET_USE_SCATTER_4PARTS
+: !TARGET_USE_SCATTER))
+return NULL_TREE;
+
   if ((TREE_CODE (index_type) != INTEGER_TYPE
&& !POINTER_TYPE_P (index_type))
   || (TYPE_MODE (index_type) != SImode
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index e6a603ed31a..cd7ed19e29c 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -397,10 +397,16 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
ix86_tune_features[X86_TUNE_AVOID_4BYTE_PREFIXES]
 #define TARGET_USE_GATHER_2PARTS \
ix86_tune_features[X86_TUNE_USE_GATHER_2PARTS]
+#define TARGET_USE_SCATTER_2PARTS \
+   ix86_tune_features[X86_TUNE_USE_SCATTER_2PARTS]
 #define TARGET_USE_GATHER_4PARTS \
ix86_tune_features[X86_TUNE_USE_GATHER_4PARTS]
+#define TARGET_USE_SCATTER_4PARTS \
+   ix86_tune_features[X86_TUNE_USE_SCATTER_4PARTS]
 #define TARGET_USE_GATHER \
ix86_tune_features[X86_TUNE_USE_GATHER]
+#define TARGET_USE_SCATTER \
+   ix86_tune_features[X86_TUNE_USE_SCATTER]
 #define TARGET_FUSE_CMP_AND_BRANCH_32 \
ix86_tune_features[X86_TUNE_FUSE_CMP_AND_BRANCH_32]
 #define TARGET_FUSE_CMP_AND_BRANCH_64 \
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index fae3b650434..7e9c7244fc0 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -483,28 +483,43 @@ DEF_TUNE (X86_TUNE_AVOID_4BYTE_PREFIXES, 
"avoid_4byte_prefixes",
 DEF_TUNE (X86_TUNE_USE_GATHER_2PARTS, "use_gather_2parts",
  ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_ALDERLAKE | 
m_CORE_ATOM | m_GENERIC))
 
+/* X86_TUNE_USE_SCATTER_2PARTS: Use scatter instructions for vectors with 2
+   elements.  */
+DEF_TUNE (X86_TUNE_USE_SCATTER_2PARTS, "use_scatter_2parts",
+ ~(m_ZNVER4 | m_GENERIC))
+
 /* X86_TUNE_USE_GATHER_4PARTS: Use gather instructions for vectors with 4
elements.  */
 DEF_TUNE (X86_TUNE_USE_GATHER_4PARTS, "use_gather_4parts",
  ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_ALDERLAKE | 
m_CORE_ATOM | m_GENERIC))
 
+/* X86_TUNE_USE_SCATTER_4PARTS: Use scatter instructions for vectors with 4
+   elements.  */
+DEF_TUNE (X86_TUNE_USE_SCATTER_4PARTS, "use_scatter_4parts",
+ ~(m_ZNVER4 | m_GENERIC))
+
 /* X86_TUNE_USE_GATHER: Use gather instructions for vectors with 8 or more
elements.  */
 DEF_TUNE (X86_TUNE_USE_GATHER, "use_gather",
- ~(m_ZNVER1 | m_ZNVER2 | m_ALDERLAKE | m_CORE_ATOM | m_GENERIC))
+ ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER4 | m_ALDERLAKE | m_CORE_ATOM | 
m_GENERIC))
+
+/* X86_TUNE_USE_SCATTER: Use scatter instructions for vectors with 8 or more
+   elements.  */
+DEF_TUNE (X86_TUNE_USE_SCATTER, "use_scatter",
+ ~(m_ZNVER4 | m_GENERIC))
 
 /* X86_TUNE_AVOID_128FMA_CHAINS: Avoid creating loops with tight 128bit or
smaller FMA chain.  */
-DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER)
+DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER1 | 
m_ZNVER2 | m_ZNVER3)
 
 /* X86_TUNE_AVOID_256FMA_CHAINS: Avoid creating loops with tight 256bit or
smaller FMA chain.  */
-DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains", m_ZNVER2 | 
m_ZNVER3 | m_ZNVER4
+DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains", m_ZNVER2 | 
m_ZNVER3
  | m_ALDERLAKE | m_SAPPHIRERAPIDS | m_CORE_ATOM)
 
 /* X86_TUNE_AVOID_512FMA_CHAINS: Avoid creating loops with tight 512bit or
smaller FMA chain.  */
-DEF_TUNE (X86_TUNE_AVOID_512FMA_CHAINS, "avoid_fma512_chains", m_ZNVER4)
+DEF_TUNE (X86_TUNE_AVOID_512FMA_CHAINS, "avoid_fma512_chains", m_NONE)
 
 /*