https://gcc.gnu.org/g:753e5c8a3b04320ae183a7546fb8b926a4678bdb
commit r16-4031-g753e5c8a3b04320ae183a7546fb8b926a4678bdb Author: liuhongt <[email protected]> Date: Thu Sep 18 19:13:22 2025 -0700 Disable vect unroll for znver2/znver1. Since it regressed SPEC performance (refer to PR121994), I guess it's related to register pressure and can be tuned by adjusting reduc_lat_mult_thr. I don't have a Zen2 machine, so for simplicity, I'll just disable unrolling in the vectorizer for Zen2. Also adjust count number for {AVX256,AVX512}_SPLIT_REGS. gcc/ChangeLog: PR target/121994 * config/i386/x86-tune-costs.h (znver2_cost): Set vect_unroll_limit to 1. (znver1_cost): Ditto. * config/i386/i386.cc (ix86_vector_costs::add_stmt_cost): Adjust count number for {AVX256,AVX512}_SPLIT_REGS. Diff: --- gcc/config/i386/i386.cc | 18 +++++++++++++----- gcc/config/i386/x86-tune-costs.h | 4 ++-- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index 5ef7c315091d..6eb26cd7b824 100644 --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -26144,6 +26144,14 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, /* Record number of load/store/gather/scatter in vectorized body. */ if (where == vect_body && !m_costing_for_scalar) { + int scale = 1; + if (vectype + && ((GET_MODE_SIZE (TYPE_MODE (vectype)) == 64 + && TARGET_AVX512_SPLIT_REGS) + || (GET_MODE_SIZE (TYPE_MODE (vectype)) == 32 + && TARGET_AVX256_SPLIT_REGS))) + scale = 2; + switch (kind) { /* Emulated gather/scatter or any scalarization. */ @@ -26166,7 +26174,7 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, /* Handle __builtin_fma.
*/ if (gimple_call_combined_fn (stmt_info->stmt) == CFN_FMA) { - m_num_reduc[X86_REDUC_FMA] += count; + m_num_reduc[X86_REDUC_FMA] += count * scale; break; } @@ -26203,12 +26211,12 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, && (def = SSA_NAME_DEF_STMT (rhs1), true) && is_gimple_assign (def) && gimple_assign_rhs_code (def) == MULT_EXPR) - m_num_reduc[X86_REDUC_FMA] += count; + m_num_reduc[X86_REDUC_FMA] += count * scale; else if (TREE_CODE (rhs2) == SSA_NAME && (def = SSA_NAME_DEF_STMT (rhs2), true) && is_gimple_assign (def) && gimple_assign_rhs_code (def) == MULT_EXPR) - m_num_reduc[X86_REDUC_FMA] += count; + m_num_reduc[X86_REDUC_FMA] += count * scale; break; /* Vectorizer lane_reducing_op_p supports DOT_PROX_EXPR, @@ -26237,7 +26245,7 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, ? TARGET_AVX10_2 : (TARGET_AVXVNNIINT8 || TARGET_AVX10_2)); } - m_num_reduc[X86_REDUC_DOT_PROD] += count; + m_num_reduc[X86_REDUC_DOT_PROD] += count * scale; /* Dislike to do unroll and partial sum for emulated DOT_PROD_EXPR. */ @@ -26246,7 +26254,7 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, break; case SAD_EXPR: - m_num_reduc[X86_REDUC_SAD] += count; + m_num_reduc[X86_REDUC_SAD] += count * scale; break; default: diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h index 1649ea2fe3e5..c7a0f6805ca1 100644 --- a/gcc/config/i386/x86-tune-costs.h +++ b/gcc/config/i386/x86-tune-costs.h @@ -1744,7 +1744,7 @@ struct processor_costs znver1_cost = { FMA/DOT_PROD_EXPR/SAD_EXPR, it's used to determine unroll factor in the vectorizer. */ - 4, /* Limit how much the autovectorizer + 1, /* Limit how much the autovectorizer may unroll a loop. */ znver1_memcpy, znver1_memset, @@ -1918,7 +1918,7 @@ struct processor_costs znver2_cost = { FMA/DOT_PROD_EXPR/SAD_EXPR, it's used to determine unroll factor in the vectorizer. 
*/ - 4, /* Limit how much the autovectorizer + 1, /* Limit how much the autovectorizer may unroll a loop. */ znver2_memcpy, znver2_memset,
