[PATCH/AARCH64] Update ThunderX schedule model

Andrew Pinski Tue, 23 Jun 2015 14:00:47 -0700

Hi,
  This patch updates the schedule model to be more accurate and to model
the SIMD and FP instructions that I missed in the previous
patch.


OK?  Bootstrapped and tested on aarch64-linux-gnu with no regressions.

Thanks,
Andrew Pinski

ChangeLog:

 * config/aarch64/thunderx.md (thunderx_shift): Add rbit and rev.
(thunderx_crc32): New reservation.
(thunderx_fmov): Remove fcsel, ffarithd and ffariths.
(thunderx_fabs): New reservation.
(thunderx_fcsel): New reservation.
(thunderx_fcmp): New reservation.
(thunderx_fsqrtd): Correct latency.
(thunderx_frint): Add f_cvt.
(thunderx_f_cvt): Remove f_cvt.
(thunderx_simd_fp_store): Add neon_store1_one_lane
and neon_store1_one_lane_q.
(thunderx_neon_ld1): New reservation.
(thunderx_neon_move): Add neon_dup,
neon_ins, neon_from_gp, neon_to_gp,
neon_abs, neon_neg,
neon_fp_neg_s, and neon_fp_abs_s.
(thunderx_neon_move_q): Add neon_dup_q,
neon_ins_q, neon_from_gp_q, neon_to_gp_q,
neon_abs_q, neon_neg_q,
neon_fp_neg_s_q, neon_fp_neg_d_q,
neon_fp_abs_s_q, and neon_fp_abs_d_q.
(thunderx_neon_add): Add neon_arith_acc, neon_rev, neon_fp_abd_s,
neon_fp_abd_d, and neon_fp_reduc_minmax_s.
(thunderx_neon_add_q): Add neon_fp_abd_s_q, neon_fp_abd_d_q,
neon_arith_acc_q, neon_rev_q,
neon_fp_reduc_minmax_s_q, and neon_fp_reduc_minmax_d_q.
(thunderx_neon_mult): New reservation.
(thunderx_neon_mult_q): New reservation.
(thunderx_crypto_aese): New reservation.
(thunderx_crypto_aesmc): New reservation.
(bypasses): Add bypass to thunderx_neon_mult_q.
(thunderx_tbl): New reservation.
(thunderx_tblq): New reservation.

Index: config/aarch64/thunderx.md
===================================================================
--- config/aarch64/thunderx.md  (revision 224856)
+++ config/aarch64/thunderx.md  (working copy)
@@ -39,7 +39,7 @@ (define_insn_reservation "thunderx_add"
 
 (define_insn_reservation "thunderx_shift" 1
   (and (eq_attr "tune" "thunderx")
-       (eq_attr "type" "bfm,extend,shift_imm,shift_reg"))
+       (eq_attr "type" "bfm,extend,shift_imm,shift_reg,rbit,rev"))
   "thunderx_pipe0 | thunderx_pipe1")
 
 
@@ -66,12 +66,18 @@ (define_insn_reservation "thunderx_mul"
        (eq_attr "type" "mul,muls,mla,mlas,clz,smull,umull,smlal,umlal"))
   "thunderx_pipe1 + thunderx_mult")
 
-;; Multiply high instructions take an extra cycle and cause the muliply unit to
-;; be busy for an extra cycle.
+;; crcb,crch,crcw is 4 cycles and can only happen on pipe 1
 
-;(define_insn_reservation "thunderx_mul_high" 5
+(define_insn_reservation "thunderx_crc32" 4
+  (and (eq_attr "tune" "thunderx")
+       (eq_attr "type" "crc"))
+  "thunderx_pipe1 + thunderx_mult")
+
+;; crcx is 5 cycles and can only happen on pipe 1
+;(define_insn_reservation "thunderx_crc64" 5
 ;  (and (eq_attr "tune" "thunderx")
-;       (eq_attr "type" "smull,umull"))
+;       (eq_attr "type" "crc")
+;       (eq_attr "mode" "DI"))
 ;  "thunderx_pipe1 + thunderx_mult")
 
 (define_insn_reservation "thunderx_div32" 22
@@ -97,6 +103,11 @@ (define_insn_reservation "thunderx_store
        (eq_attr "type" "store2"))
   "thunderx_pipe0 + thunderx_pipe1")
 
+;; Prefetches are single issued
+;(define_insn_reservation "thunderx_prefetch" 1
+;  (and (eq_attr "tune" "thunderx")
+;       (eq_attr "type" "prefetch"))
+;  "thunderx_pipe0 + thunderx_pipe1")
 
 ;; loads (and load pairs) from L1 take 3 cycles in pipe 0
 (define_insn_reservation "thunderx_load" 3
@@ -121,10 +132,21 @@ (define_insn_reservation "thunderx_fcons
        (eq_attr "type" "fconsts,fconstd"))
   "thunderx_pipe1")
 
-;; Moves between fp are 2 cycles including min/max/select/abs/neg
+;; Moves between fp are 2 cycles including min/max
 (define_insn_reservation "thunderx_fmov" 2
   (and (eq_attr "tune" "thunderx")
-       (eq_attr "type" "fmov,f_minmaxs,f_minmaxd,fcsel,ffarithd,ffariths"))
+       (eq_attr "type" "fmov,f_minmaxs,f_minmaxd"))
+  "thunderx_pipe1")
+
+;; ABS, and NEG are 1 cycle
+(define_insn_reservation "thunderx_fabs" 1
+  (and (eq_attr "tune" "thunderx")
+       (eq_attr "type" "ffariths,ffarithd"))
+  "thunderx_pipe1")
+
+(define_insn_reservation "thunderx_fcsel" 3
+  (and (eq_attr "tune" "thunderx")
+       (eq_attr "type" "fcsel"))
   "thunderx_pipe1")
 
 (define_insn_reservation "thunderx_fmovgpr" 2
@@ -132,6 +154,11 @@ (define_insn_reservation "thunderx_fmovg
        (eq_attr "type" "f_mrc, f_mcr"))
   "thunderx_pipe1")
 
+(define_insn_reservation "thunderx_fcmp" 3
+  (and (eq_attr "tune" "thunderx")
+       (eq_attr "type" "fcmps,fcmpd"))
+  "thunderx_pipe1")
+
 (define_insn_reservation "thunderx_fmul" 6
   (and (eq_attr "tune" "thunderx")
        (eq_attr "type" "fmacs,fmacd,fmuls,fmuld"))
@@ -152,21 +179,21 @@ (define_insn_reservation "thunderx_fsqrt
        (eq_attr "type" "fsqrts"))
   "thunderx_pipe1 + thunderx_divide, thunderx_divide*13")
 
-(define_insn_reservation "thunderx_fsqrtd" 28
+(define_insn_reservation "thunderx_fsqrtd" 31
   (and (eq_attr "tune" "thunderx")
        (eq_attr "type" "fsqrtd"))
-  "thunderx_pipe1 + thunderx_divide, thunderx_divide*31")
+  "thunderx_pipe1 + thunderx_divide, thunderx_divide*27")
 
 ;; The rounding conversion inside fp is 4 cycles
 (define_insn_reservation "thunderx_frint" 4
   (and (eq_attr "tune" "thunderx")
-       (eq_attr "type" "f_rints,f_rintd"))
+       (eq_attr "type" "f_cvt,f_rints,f_rintd"))
   "thunderx_pipe1")
 
 ;; Float to integer with a move from int to/from float is 6 cycles
 (define_insn_reservation "thunderx_f_cvt" 6
   (and (eq_attr "tune" "thunderx")
-       (eq_attr "type" "f_cvt,f_cvtf2i,f_cvti2f"))
+       (eq_attr "type" "f_cvtf2i,f_cvti2f"))
   "thunderx_pipe1")
 
 ;; FP/SIMD load/stores happen in pipe 0
@@ -184,9 +211,12 @@ (define_insn_reservation "thunderx_128si
   "thunderx_pipe0+thunderx_pipe1")
 
 ;; FP/SIMD Stores takes one cycle in pipe 0
+;; ST1 with one register, either multiple structures or single structure, is
+;;    also one cycle.
 (define_insn_reservation "thunderx_simd_fp_store" 1
   (and (eq_attr "tune" "thunderx")
-       (eq_attr "type" 
"f_stored,f_stores,neon_store1_1reg,neon_store1_1reg_q"))
+       (eq_attr "type" "f_stored,f_stores,neon_store1_1reg,neon_store1_1reg_q, 
\
+                       neon_store1_one_lane, neon_store1_one_lane_q"))
   "thunderx_pipe0")
 
 ;; 64bit neon store pairs are single issue for one cycle
@@ -201,24 +231,38 @@ (define_insn_reservation "thunderx_128ne
        (eq_attr "type" "neon_store1_2reg_q"))
   "(thunderx_pipe0 + thunderx_pipe1)*2")
 
+;; LD1R/LD1 (with a single struct) takes 6 cycles and is issued in pipe0
+(define_insn_reservation "thunderx_neon_ld1" 6
+  (and (eq_attr "tune" "thunderx")
+       (eq_attr "type" "neon_load1_all_lanes"))
+  "thunderx_pipe0")
 
 ;; SIMD/NEON (q forms take an extra cycle)
+;; SIMD for ThunderX is 64bit wide.
 
-;; Thunder simd move instruction types - 2/3 cycles
+;; ThunderX simd move instruction types - 2/3 cycles
+;; ThunderX dup, ins is the same
+;; ThunderX SIMD fabs/fneg instruction types
 (define_insn_reservation "thunderx_neon_move" 2
   (and (eq_attr "tune" "thunderx")
        (eq_attr "type" "neon_logic, neon_bsl, neon_fp_compare_s, \
-                       neon_fp_compare_d, neon_move"))
+                       neon_fp_compare_d, neon_move, neon_dup, \
+                       neon_ins, neon_from_gp, neon_to_gp, \
+                       neon_abs, neon_neg, \
+                       neon_fp_neg_s, neon_fp_abs_s"))
   "thunderx_pipe1 + thunderx_simd")
 
 (define_insn_reservation "thunderx_neon_move_q" 3
   (and (eq_attr "tune" "thunderx")
        (eq_attr "type" "neon_logic_q, neon_bsl_q, neon_fp_compare_s_q, \
-                       neon_fp_compare_d_q, neon_move_q"))
+                       neon_fp_compare_d_q, neon_move_q, neon_dup_q, \
+                       neon_ins_q, neon_from_gp_q, neon_to_gp_q, \
+                       neon_abs_q, neon_neg_q, \
+                       neon_fp_neg_s_q, neon_fp_neg_d_q, \
+                       neon_fp_abs_s_q, neon_fp_abs_d_q"))
   "thunderx_pipe1 + thunderx_simd, thunderx_simd")
 
-
-;; Thunder simd simple/add instruction types - 4/5 cycles
+;; ThunderX simd simple/add instruction types - 4/5 cycles
 
 (define_insn_reservation "thunderx_neon_add" 4
   (and (eq_attr "tune" "thunderx")
@@ -227,7 +271,9 @@ (define_insn_reservation "thunderx_neon_
                        neon_add_halve, neon_sub_halve, neon_qadd, 
neon_compare, \
                        neon_compare_zero, neon_minmax, neon_abd, neon_add, 
neon_sub, \
                        neon_fp_minmax_s, neon_fp_minmax_d, neon_reduc_add, 
neon_cls, \
-                       neon_qabs, neon_qneg, neon_fp_addsub_s, 
neon_fp_addsub_d"))
+                       neon_qabs, neon_qneg, neon_fp_addsub_s, 
neon_fp_addsub_d, \
+                       neon_arith_acc, neon_rev, neon_fp_abd_s, neon_fp_abd_d, 
\
+                       neon_fp_reduc_minmax_s"))
   "thunderx_pipe1 + thunderx_simd")
 
 ;; BIG NOTE: neon_add_long/neon_sub_long don't have a q form which is incorrect
@@ -240,13 +286,74 @@ (define_insn_reservation "thunderx_neon_
                        neon_compare_zero_q, neon_minmax_q, neon_abd_q, 
neon_add_q, neon_sub_q, \
                        neon_fp_minmax_s_q, neon_fp_minmax_d_q, 
neon_reduc_add_q, neon_cls_q, \
                        neon_qabs_q, neon_qneg_q, neon_fp_addsub_s_q, 
neon_fp_addsub_d_q, \
-                       neon_add_long, neon_sub_long"))
+                       neon_add_long, neon_sub_long, neon_fp_abd_s_q, 
neon_fp_abd_d_q, \
+                       neon_arith_acc_q, neon_rev_q, \
+                       neon_fp_reduc_minmax_s_q, neon_fp_reduc_minmax_d_q"))
+  "thunderx_pipe1 + thunderx_simd, thunderx_simd")
+
+;; Multiplies (float and integer) and shifts and permutes (except for TBL) and float conversions
+;; are 6/7 cycles
+(define_insn_reservation "thunderx_neon_mult" 6
+  (and (eq_attr "tune" "thunderx")
+       (eq_attr "type" "neon_fp_mul_s, neon_fp_mul_d, neon_fp_mla_s, 
neon_fp_mla_d, \
+                       neon_mla_b, neon_mla_h, neon_mla_s, \
+                       neon_mla_h_scalar, neon_mla_s_scalar, \
+                       neon_ext, neon_shift_imm, neon_permute, \
+                       neon_int_to_fp_s, neon_int_to_fp_d, neon_shift_reg, \
+                       neon_sat_shift_reg, neon_shift_acc, \
+                       neon_mul_b, neon_mul_h, neon_mul_s, \
+                       neon_mul_h_scalar, neon_mul_s_scalar, \
+                       neon_fp_mul_s_scalar, \
+                       neon_fp_mla_s_scalar"))
+  "thunderx_pipe1 + thunderx_simd")
+
+(define_insn_reservation "thunderx_neon_mult_q" 7
+  (and (eq_attr "tune" "thunderx")
+       (eq_attr "type" "neon_fp_mul_s_q, neon_fp_mul_d_q, neon_fp_mla_s_q, 
neon_fp_mla_d_q, \
+                       neon_mla_b_q, neon_mla_h_q, neon_mla_s_q, \
+                       neon_mla_h_scalar_q, neon_mla_s_scalar_q, \
+                       neon_ext_q, neon_shift_imm_q, neon_permute_q, \
+                       neon_int_to_fp_s_q, neon_int_to_fp_d_q, 
neon_shift_reg_q, \
+                       neon_sat_shift_reg_q, neon_shift_acc_q, \
+                       neon_shift_imm_long, \
+                       neon_mul_b_q, neon_mul_h_q, neon_mul_s_q, \
+                       neon_mul_h_scalar_q, neon_mul_s_scalar_q, \
+                       neon_fp_mul_s_scalar_q, neon_fp_mul_d_scalar_q, \
+                       neon_mul_b_long, neon_mul_h_long, neon_mul_s_long, \
+                       neon_shift_imm_narrow_q, neon_fp_cvt_widen_s, 
neon_fp_cvt_narrow_d_q, \
+                       neon_fp_mla_s_scalar_q, neon_fp_mla_d_scalar_q"))
+  "thunderx_pipe1 + thunderx_simd, thunderx_simd")
+
+
+;; AES[ED] is 5 cycles
+(define_insn_reservation "thunderx_crypto_aese" 5
+  (and (eq_attr "tune" "thunderx")
+       (eq_attr "type" "crypto_aese"))
+  "thunderx_pipe1 + thunderx_simd, thunderx_simd")
+
+;; AES{,I}MC is 3 cycles
+(define_insn_reservation "thunderx_crypto_aesmc" 3
+  (and (eq_attr "tune" "thunderx")
+       (eq_attr "type" "crypto_aesmc"))
   "thunderx_pipe1 + thunderx_simd, thunderx_simd")
 
 
-;; Thunder 128bit SIMD reads the upper halve in cycle 2 and writes in the last 
cycle
-(define_bypass 2 "thunderx_neon_move_q" "thunderx_neon_move_q, 
thunderx_neon_add_q")
-(define_bypass 4 "thunderx_neon_add_q" "thunderx_neon_move_q, 
thunderx_neon_add_q")
+;; Thunder 128bit SIMD reads the upper half in cycle 2 and writes the upper half in the last cycle
+(define_bypass 2 "thunderx_neon_move_q" "thunderx_neon_move_q, 
thunderx_neon_add_q, thunderx_neon_mult_q")
+(define_bypass 4 "thunderx_neon_add_q" "thunderx_neon_move_q, 
thunderx_neon_add_q, thunderx_neon_mult_q")
+(define_bypass 6 "thunderx_neon_mult_q" "thunderx_neon_move_q, 
thunderx_neon_add_q, thunderx_neon_mult_q")
+
+;; 64bit TBL is emulated and takes 160 cycles
+(define_insn_reservation "thunderx_tbl" 160
+  (and (eq_attr "tune" "thunderx")
+       (eq_attr "type" "neon_tbl1"))
+  "(thunderx_pipe1+thunderx_pipe0)*160")
+
+;; 128bit TBL is emulated and takes 320 cycles
+(define_insn_reservation "thunderx_tblq" 320
+  (and (eq_attr "tune" "thunderx")
+       (eq_attr "type" "neon_tbl1_q"))
+  "(thunderx_pipe1+thunderx_pipe0)*320")
 
 ;; Assume both pipes are needed for unknown and multiple-instruction
 ;; patterns.

[PATCH/AARCH64] Update ThunderX schedule model

Reply via email to