Re: [PATCH] aarch64: Add the cost model for Neoverse N1

Evandro Menezes via Gcc-patches Mon, 24 Apr 2023 15:49:30 -0700

Hi, Tamara.

Does this work?


Thank you,

-- 
Evandro Menezes ◊ evan...@yahoo.com ◊ Austin, TX
Άγιος ο Θεός ⁂ ܩܕܝܫܐ ܐܢ̱ܬ ܠܐ ܡܝܘܬܐ ⁂ Sanctus Deus

> Em 24 de abr. de 2023, à(s) 12:37, Tamar Christina <tamar.christ...@arm.com> 
> escreveu:
> 
> Hi Evandro,
> 
> I wanted to give this patch a try, but the diff seems corrupt, the 
> whitespaces at the start of the context lines seem to have gone missing.
> 
> Could you try resending it?
> 
> Thanks,
> Tamar

gcc/ChangeLog:

        * config/aarch64/aarch64-cores.def: Use the Neoverse N1 cost model.
        * config/aarch64/aarch64.cc
        (cortexa76_tunings): Rename variable.
        (neoversen1_addrcost_table): New variable.
        (neoversen1_vector_cost): Likewise.
        (neoversen1_regmove_cost): Likewise.
        (neoversen1_advsimd_vector_cost): Likewise.
        (neoversen1_scalar_issue_info): Likewise.
        (neoversen1_advsimd_issue_info): Likewise.
        (neoversen1_vec_issue_info): Likewise.
        (neoversen1_vector_cost): Likewise.
        (neoversen1_tunings): Likewise.
        * config/arm/aarch-cost-tables.h
        (neoversen1_extra_costs): New variable.

Signed-off-by: Evandro Menezes <evan...@gcc.gnu.org>
---
 gcc/config/aarch64/aarch64-cores.def |  20 ++--
 gcc/config/aarch64/aarch64.cc        | 155 ++++++++++++++++++++++++---
 gcc/config/arm/aarch-cost-tables.h   | 107 ++++++++++++++++++
 3 files changed, 259 insertions(+), 23 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-cores.def 
b/gcc/config/aarch64/aarch64-cores.def
index 2ec88c98400..e352e4077b1 100644
--- a/gcc/config/aarch64/aarch64-cores.def
+++ b/gcc/config/aarch64/aarch64-cores.def
@@ -105,17 +105,17 @@ AARCH64_CORE("thunderx2t99",  thunderx2t99,  
thunderx2t99, V8_1A,  (CRYPTO), thu
 /* ARM ('A') cores. */
 AARCH64_CORE("cortex-a55",  cortexa55, cortexa53, V8_2A,  (F16, RCPC, 
DOTPROD), cortexa53, 0x41, 0xd05, -1)
 AARCH64_CORE("cortex-a75",  cortexa75, cortexa57, V8_2A,  (F16, RCPC, 
DOTPROD), cortexa73, 0x41, 0xd0a, -1)
-AARCH64_CORE("cortex-a76",  cortexa76, cortexa57, V8_2A,  (F16, RCPC, 
DOTPROD), neoversen1, 0x41, 0xd0b, -1)
-AARCH64_CORE("cortex-a76ae",  cortexa76ae, cortexa57, V8_2A,  (F16, RCPC, 
DOTPROD, SSBS), neoversen1, 0x41, 0xd0e, -1)
-AARCH64_CORE("cortex-a77",  cortexa77, cortexa57, V8_2A,  (F16, RCPC, DOTPROD, 
SSBS), neoversen1, 0x41, 0xd0d, -1)
-AARCH64_CORE("cortex-a78",  cortexa78, cortexa57, V8_2A,  (F16, RCPC, DOTPROD, 
SSBS, PROFILE), neoversen1, 0x41, 0xd41, -1)
-AARCH64_CORE("cortex-a78ae",  cortexa78ae, cortexa57, V8_2A,  (F16, RCPC, 
DOTPROD, SSBS, PROFILE), neoversen1, 0x41, 0xd42, -1)
-AARCH64_CORE("cortex-a78c",  cortexa78c, cortexa57, V8_2A,  (F16, RCPC, 
DOTPROD, SSBS, PROFILE, FLAGM, PAUTH), neoversen1, 0x41, 0xd4b, -1)
+AARCH64_CORE("cortex-a76",  cortexa76, cortexa57, V8_2A,  (F16, RCPC, 
DOTPROD), cortexa76, 0x41, 0xd0b, -1)
+AARCH64_CORE("cortex-a76ae",  cortexa76ae, cortexa57, V8_2A,  (F16, RCPC, 
DOTPROD, SSBS), cortexa76, 0x41, 0xd0e, -1)
+AARCH64_CORE("cortex-a77",  cortexa77, cortexa57, V8_2A,  (F16, RCPC, DOTPROD, 
SSBS), cortexa76, 0x41, 0xd0d, -1)
+AARCH64_CORE("cortex-a78",  cortexa78, cortexa57, V8_2A,  (F16, RCPC, DOTPROD, 
SSBS, PROFILE), cortexa76, 0x41, 0xd41, -1)
+AARCH64_CORE("cortex-a78ae",  cortexa78ae, cortexa57, V8_2A,  (F16, RCPC, 
DOTPROD, SSBS, PROFILE), cortexa76, 0x41, 0xd42, -1)
+AARCH64_CORE("cortex-a78c",  cortexa78c, cortexa57, V8_2A,  (F16, RCPC, 
DOTPROD, SSBS, PROFILE, FLAGM, PAUTH), cortexa76, 0x41, 0xd4b, -1)
 AARCH64_CORE("cortex-a65",  cortexa65, cortexa53, V8_2A,  (F16, RCPC, DOTPROD, 
SSBS), cortexa73, 0x41, 0xd06, -1)
 AARCH64_CORE("cortex-a65ae",  cortexa65ae, cortexa53, V8_2A,  (F16, RCPC, 
DOTPROD, SSBS), cortexa73, 0x41, 0xd43, -1)
-AARCH64_CORE("cortex-x1",  cortexx1, cortexa57, V8_2A,  (F16, RCPC, DOTPROD, 
SSBS, PROFILE), neoversen1, 0x41, 0xd44, -1)
-AARCH64_CORE("cortex-x1c",  cortexx1c, cortexa57, V8_2A,  (F16, RCPC, DOTPROD, 
SSBS, PROFILE, PAUTH), neoversen1, 0x41, 0xd4c, -1)
-AARCH64_CORE("ares",  ares, cortexa57, V8_2A,  (F16, RCPC, DOTPROD, PROFILE), 
neoversen1, 0x41, 0xd0c, -1)
+AARCH64_CORE("cortex-x1",  cortexx1, cortexa57, V8_2A,  (F16, RCPC, DOTPROD, 
SSBS, PROFILE), cortexa76, 0x41, 0xd44, -1)
+AARCH64_CORE("cortex-x1c",  cortexx1c, cortexa57, V8_2A,  (F16, RCPC, DOTPROD, 
SSBS, PROFILE, PAUTH), cortexa76, 0x41, 0xd4c, -1)
+AARCH64_CORE("ares",  ares, cortexa57, V8_2A,  (F16, RCPC, DOTPROD, PROFILE), 
cortexa76, 0x41, 0xd0c, -1)
 AARCH64_CORE("neoverse-n1",  neoversen1, cortexa57, V8_2A,  (F16, RCPC, 
DOTPROD, PROFILE), neoversen1, 0x41, 0xd0c, -1)
 AARCH64_CORE("neoverse-e1",  neoversee1, cortexa53, V8_2A,  (F16, RCPC, 
DOTPROD, SSBS), cortexa73, 0x41, 0xd4a, -1)
 
@@ -160,7 +160,7 @@ AARCH64_CORE("cortex-a73.cortex-a53",  cortexa73cortexa53, 
cortexa53, V8A,  (CRC
 /* ARM DynamIQ big.LITTLE configurations.  */
 
 AARCH64_CORE("cortex-a75.cortex-a55",  cortexa75cortexa55, cortexa53, V8_2A,  
(F16, RCPC, DOTPROD), cortexa73, 0x41, AARCH64_BIG_LITTLE (0xd0a, 0xd05), -1)
-AARCH64_CORE("cortex-a76.cortex-a55",  cortexa76cortexa55, cortexa53, V8_2A,  
(F16, RCPC, DOTPROD), neoversen1, 0x41, AARCH64_BIG_LITTLE (0xd0b, 0xd05), -1)
+AARCH64_CORE("cortex-a76.cortex-a55",  cortexa76cortexa55, cortexa53, V8_2A,  
(F16, RCPC, DOTPROD), cortexa76, 0x41, AARCH64_BIG_LITTLE (0xd0b, 0xd05), -1)
 
 /* Armv8-R Architecture Processors.  */
 AARCH64_CORE("cortex-r82", cortexr82, cortexa53, V8R, (), cortexa53, 0x41, 
0xd15, -1)
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 2b0de7ca038..b1adce34711 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -1869,7 +1869,7 @@ static const struct tune_params thunderx3t110_tunings =
   &thunderx3t110_prefetch_tune
 };
 
-static const struct tune_params neoversen1_tunings =
+static const struct tune_params cortexa76_tunings =
 {
   &cortexa76_extra_costs,
   &generic_addrcost_table,
@@ -1887,18 +1887,18 @@ static const struct tune_params neoversen1_tunings =
   }, /* memmov_cost.  */
   3, /* issue_rate  */
   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
-  "32:16",     /* function_align.  */
-  "4",         /* jump_align.  */
-  "32:16",     /* loop_align.  */
-  2,   /* int_reassoc_width.  */
-  4,   /* fp_reassoc_width.  */
-  1,   /* fma_reassoc_width.  */
-  2,   /* vec_reassoc_width.  */
-  2,   /* min_div_recip_mul_sf.  */
-  2,   /* min_div_recip_mul_df.  */
-  0,   /* max_case_values.  */
-  tune_params::AUTOPREFETCHER_WEAK,    /* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND),     /* tune_flags.  */
+  "32:16",     /* function_align.  */
+  "4",         /* jump_align.  */
+  "32:16",     /* loop_align.  */
+  2,   /* int_reassoc_width.  */
+  4,   /* fp_reassoc_width.  */
+  1,   /* fma_reassoc_width.  */
+  2,   /* vec_reassoc_width.  */
+  2,   /* min_div_recip_mul_sf.  */
+  2,   /* min_div_recip_mul_df.  */
+  0,   /* max_case_values.  */
+  tune_params::AUTOPREFETCHER_WEAK,    /* autoprefetcher_model.  */
+  (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND),     /* tune_flags.  */
   &generic_prefetch_tune
 };
 
@@ -2295,6 +2295,135 @@ static const struct tune_params neoverse512tvb_tunings =
   &generic_prefetch_tune
 };
 
+static const struct cpu_addrcost_table neoversen1_addrcost_table =
+{
+    {
+      0, /* hi  */
+      0, /* si  */
+      0, /* di  */
+      1, /* ti  */
+    },
+  0, /* pre_modify  */
+  0, /* post_modify  */
+  1, /* post_modify_ld3_st3  */
+  1, /* post_modify_ld4_st4  */
+  0, /* register_offset  */
+  0, /* register_sextend  */
+  0, /* register_zextend  */
+  0 /* imm_offset  */
+};
+
+static const struct cpu_regmove_cost neoversen1_regmove_cost =
+{
+  1, /* GP2GP  */
+  /* Avoid the use of slow int<->fp moves for spilling by setting
+     their cost higher than memmov_cost.  */
+  3, /* GP2FP  */
+  2, /* FP2GP  */
+  2 /* FP2FP  */
+};
+
+static const advsimd_vec_cost neoversen1_advsimd_vector_cost =
+{
+  2, /* int_stmt_cost  */
+  2, /* fp_stmt_cost  */
+  0, /* ld2_st2_permute_cost  */
+  0, /* ld3_st3_permute_cost  */
+  0, /* ld4_st4_permute_cost  */
+  3, /* permute_cost  */
+  6, /* reduc_i8_cost  */
+  5, /* reduc_i16_cost  */
+  3, /* reduc_i32_cost  */
+  3, /* reduc_i64_cost  */
+  8, /* reduc_f16_cost  */
+  5, /* reduc_f32_cost  */
+  5, /* reduc_f64_cost  */
+  0, /* store_elt_extra_cost  */
+  2, /* vec_to_scalar_cost  */
+  2, /* scalar_to_vec_cost  */
+  4, /* align_load_cost  */
+  4, /* unalign_load_cost  */
+  1, /* unalign_store_cost  */
+  1  /* store_cost  */
+};
+
+static const aarch64_scalar_vec_issue_info neoversen1_scalar_issue_info =
+{
+  2, /* loads_stores_per_cycle  */
+  2, /* stores_per_cycle  */
+  2, /* general_ops_per_cycle  */
+  0, /* fp_simd_load_general_ops  */
+  1 /* fp_simd_store_general_ops  */
+};
+
+static const aarch64_advsimd_vec_issue_info neoversen1_advsimd_issue_info =
+{
+  {
+    2, /* loads_stores_per_cycle  */
+    2, /* stores_per_cycle  */
+    2, /* general_ops_per_cycle  */
+    0, /* fp_simd_load_general_ops  */
+    1 /* fp_simd_store_general_ops  */
+  },
+  3, /* ld2_st2_general_ops  */
+  5, /* ld3_st3_general_ops  */
+  11 /* ld4_st4_general_ops  */
+};
+
+static const aarch64_vec_issue_info neoversen1_vec_issue_info =
+{
+  &neoversen1_scalar_issue_info, /* scalar  */
+  &neoversen1_advsimd_issue_info, /* advsimd  */
+  nullptr /* sve  */
+};
+
+
+static const struct cpu_vector_cost neoversen1_vector_cost =
+{
+  1, /* scalar_int_stmt_cost  */
+  1, /* scalar_fp_stmt_cost  */
+  4, /* scalar_load_cost  */
+  1, /* scalar_store_cost  */
+  1, /* cond_taken_branch_cost  */
+  1, /* cond_not_taken_branch_cost  */
+  &neoversen1_advsimd_vector_cost, /* advsimd  */
+  nullptr, /* sve  */
+  &neoversen1_vec_issue_info /* issue_info  */
+};
+
+static const struct tune_params neoversen1_tunings =
+{
+  &neoversen1_extra_costs,
+  &neoversen1_addrcost_table,
+  &neoversen1_regmove_cost,
+  &neoversen1_vector_cost,
+  &generic_branch_cost,
+  &generic_approx_modes,
+  SVE_NOT_IMPLEMENTED, /* sve_width  */
+  { 4, /* load_int.  */
+    2, /* store_int.  */
+    5, /* load_fp.  */
+    2, /* store_fp.  */
+    4, /* load_pred.  */
+    4 /* store_pred.  */
+  }, /* memmov_cost.  */
+  4, /* issue_rate  */
+  AARCH64_FUSE_AES_AESMC, /* fusible_ops  */
+  "32:16", /* function_align.  */
+  "4", /* jump_align.  */
+  "32:16", /* loop_align.  */
+  2, /* int_reassoc_width.  */
+  4, /* fp_reassoc_width.  */
+  1, /* fma_reassoc_width.  */
+  2, /* vec_reassoc_width.  */
+  2, /* min_div_recip_mul_sf.  */
+  2, /* min_div_recip_mul_df.  */
+  0, /* max_case_values.  */
+  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
+  AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND, /* tune_flags.  */
+  &generic_prefetch_tune
+};
+
 static const advsimd_vec_cost neoversen2_advsimd_vector_cost =
 {
   2, /* int_stmt_cost  */
diff --git a/gcc/config/arm/aarch-cost-tables.h 
b/gcc/config/arm/aarch-cost-tables.h
index e3848214728..fce6da6bbcc 100644
--- a/gcc/config/arm/aarch-cost-tables.h
+++ b/gcc/config/arm/aarch-cost-tables.h
@@ -450,6 +450,113 @@ const struct cpu_cost_table cortexa76_extra_costs =
   }
 };
 
+const struct cpu_cost_table neoversen1_extra_costs =
+{
+  /* ALU */
+  {
+    0,                 /* arith.  */
+    0,                 /* logical.  */
+    0,                 /* shift.  */
+    0,                 /* shift_reg.  */
+    COSTS_N_INSNS (1), /* arith_shift.  */
+    COSTS_N_INSNS (1), /* arith_shift_reg.  */
+    0,                /* log_shift.  */
+    COSTS_N_INSNS (1), /* log_shift_reg.  */
+    0,                 /* extend.  */
+    COSTS_N_INSNS (1), /* extend_arith.  */
+    COSTS_N_INSNS (1), /* bfi.  */
+    0,                 /* bfx.  */
+    0,                 /* clz.  */
+    0,                 /* rev.  */
+    0,                 /* non_exec.  */
+    true               /* non_exec_costs_exec.  */
+  },
+  {
+    /* MULT SImode */
+    {
+      COSTS_N_INSNS (1),       /* simple.  */
+      COSTS_N_INSNS (2),       /* flag_setting.  */
+      COSTS_N_INSNS (1),       /* extend.  */
+      COSTS_N_INSNS (1),       /* add.  */
+      COSTS_N_INSNS (1),       /* extend_add.  */
+      COSTS_N_INSNS (11)              /* idiv.  */
+    },
+    /* MULT DImode */
+    {
+      COSTS_N_INSNS (3),       /* simple.  */
+      0,                       /* flag_setting (N/A).  */
+      COSTS_N_INSNS (1),       /* extend.  */
+      COSTS_N_INSNS (3),       /* add.  */
+      COSTS_N_INSNS (1),       /* extend_add.  */
+      COSTS_N_INSNS (19)       /* idiv.  */
+    }
+  },
+  /* LD/ST */
+  {
+    COSTS_N_INSNS (3),         /* load.  */
+    COSTS_N_INSNS (3),         /* load_sign_extend.  */
+    COSTS_N_INSNS (3),         /* ldrd.  */
+    COSTS_N_INSNS (2),         /* ldm_1st.  */
+    1,                         /* ldm_regs_per_insn_1st.  */
+    2,                         /* ldm_regs_per_insn_subsequent.  */
+    COSTS_N_INSNS (4),         /* loadf.  */
+    COSTS_N_INSNS (4),         /* loadd.  */
+    COSTS_N_INSNS (3),         /* load_unaligned.  */
+    0,                         /* store.  */
+    0,                         /* strd.  */
+    0,                         /* stm_1st.  */
+    1,                         /* stm_regs_per_insn_1st.  */
+    2,                         /* stm_regs_per_insn_subsequent.  */
+    0,                         /* storef.  */
+    0,                         /* stored.  */
+    COSTS_N_INSNS (1),         /* store_unaligned.  */
+    COSTS_N_INSNS (1),         /* loadv.  */
+    COSTS_N_INSNS (1)          /* storev.  */
+  },
+  {
+    /* FP SFmode */
+    {
+      COSTS_N_INSNS (9),       /* div.  */
+      COSTS_N_INSNS (2),       /* mult.  */
+      COSTS_N_INSNS (3),       /* mult_addsub.  */
+      COSTS_N_INSNS (3),       /* fma.  */
+      COSTS_N_INSNS (1),       /* addsub.  */
+      COSTS_N_INSNS (1),       /* fpconst.  */
+      0,                       /* neg.  */
+      0,                       /* compare.  */
+      COSTS_N_INSNS (1),       /* widen.  */
+      COSTS_N_INSNS (1),       /* narrow.  */
+      COSTS_N_INSNS (1),       /* toint.  */
+      COSTS_N_INSNS (1),       /* fromint.  */
+      COSTS_N_INSNS (1)        /* roundint.  */
+    },
+    /* FP DFmode */
+    {
+      COSTS_N_INSNS (14),      /* div.  */
+      COSTS_N_INSNS (2),       /* mult.  */
+      COSTS_N_INSNS (3),       /* mult_addsub.  */
+      COSTS_N_INSNS (3),       /* fma.  */
+      COSTS_N_INSNS (1),       /* addsub.  */
+      COSTS_N_INSNS (1),       /* fpconst.  */
+      0,                       /* neg.  */
+      0,                       /* compare.  */
+      COSTS_N_INSNS (1),       /* widen.  */
+      COSTS_N_INSNS (1),       /* narrow.  */
+      COSTS_N_INSNS (1),       /* toint.  */
+      COSTS_N_INSNS (1),       /* fromint.  */
+      COSTS_N_INSNS (1)        /* roundint.  */
+    }
+  },
+  /* Vector */
+  {
+    COSTS_N_INSNS (1),  /* alu.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (1),  /* dup.  */
+    COSTS_N_INSNS (1)   /* extract.  */
+  }
+};
+
 const struct cpu_cost_table exynosm1_extra_costs =
 {
   /* ALU */
-- 
2.39.2 (Apple Git-143)

Re: [PATCH] aarch64: Add the cost model for Neoverse N1

Reply via email to