Update in V3
> > > + cost_sse_integer = 0;
> > > + weighted_cost_sse_integer = 0 ;
> Extra space here.
Changed.
> > > + : ix86_size_cost.sse_to_integer;
>
> Please be sure to not revert the changes from my patch adding
> COSTS_N_INSNS (...) / 2
> here and some other places.
Yes, keep the original cost, and weights it with bb_freq.
>
> I think with your change the size metrics will get less accurate. Old
> code had hard-wired sizes which depends on ISA we use:
>
> else if (TARGET_64BIT || smode == SImode)
> {
> cost += n_sse_to_integer * COSTS_N_BYTES (4);
> cost += n_integer_to_sse * COSTS_N_BYTES (4);
> }
> else if (TARGET_SSE4_1)
> {
> /* vmovd (4 bytes) + vpextrd (6 bytes). */
> cost += n_sse_to_integer * COSTS_N_BYTES (10);
> /* vmovd (4 bytes) + vpinsrd (6 bytes). */
> cost += n_integer_to_sse * COSTS_N_BYTES (10);
> }
> else
> {
> /* movd (4 bytes) + psrlq (5 bytes) + movd (4 bytes). */
> cost += n_sse_to_integer * COSTS_N_BYTES (13);
> /* movd (4 bytes) + movd (4 bytes) + unpckldq (4 bytes). */
> cost += n_integer_to_sse * COSTS_N_BYTES (12);
> }
>
> While you now use common size_costs->sse_to_integer which will not be
> able to distinguish these.
>
> Perhaps we want to have a function returning correct sizes depending on
> ISA and use it here? (In future it may be useful to get correct size
> costs for vectorizer too.)
Also keep the hard-wired sizes.
> > > }
> > >
> > > + if (speed_p)
> > > + weighted_cost_sse_integer += bb_freq * cost;
>
> You use bb_freq just once, so it may be easier to just call the
> function.
Changed
> > > - if (optimize_insn_for_size_p ())
> > > + if (!speed_p)
>
> If the size tables are right, we should be able to eliminate some of
> these tests since same computation as for speed_p should work for size.
> However sizes are somewhat approximate, so this needs to be done
> carefully, so it is probably better done incremeentally.
Yes, still use ix86_cost, and just refactor optimize_insn_for_size_p () with
!speed_p.
> Patch is OK with these changes.
Thansk for review.
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?
n some benchmark, I notice stv failed due to cost unprofitable, but the igain
is inside the loop, but sse<->integer conversion is outside the loop, current
cost
model doesn't consider the frequency of those gain/cost.
The patch weights those cost with frequency.
gcc/ChangeLog:
* config/i386/i386-features.cc
(scalar_chain::mark_dual_mode_def): Weight
cost of integer<->sse move with bb frequency when it's
optimized_for_speed_p.
(general_scalar_chain::compute_convert_gain): Ditto, and
adjust function prototype to return true/false when cost model
is profitable or not.
(timode_scalar_chain::compute_convert_gain): Ditto.
(convert_scalars_to_vector): Adjust after the upper two
function prototype are changed.
* config/i386/i386-features.h (class scalar_chain): Change
n_integer_to_sse/n_sse_to_integer to cost_sse_integer, and add
weighted_cost_sse_integer.
(class general_scalar_chain): Adjust prototype to return bool
intead of int.
(class timode_scalar_chain): Ditto.
---
gcc/config/i386/i386-features.cc | 177 +++++++++++++++++--------------
gcc/config/i386/i386-features.h | 11 +-
2 files changed, 105 insertions(+), 83 deletions(-)
diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index cc8313bd292..6491c6b063f 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -296,9 +296,8 @@ scalar_chain::scalar_chain (enum machine_mode smode_, enum
machine_mode vmode_)
insns_conv = BITMAP_ALLOC (NULL);
queue = NULL;
- n_sse_to_integer = 0;
- n_integer_to_sse = 0;
-
+ cost_sse_integer = 0;
+ weighted_cost_sse_integer = 0 ;
max_visits = x86_stv_max_visits;
}
@@ -337,20 +336,52 @@ scalar_chain::mark_dual_mode_def (df_ref def)
/* Record the def/insn pair so we can later efficiently iterate over
the defs to convert on insns not in the chain. */
bool reg_new = bitmap_set_bit (defs_conv, DF_REF_REGNO (def));
+ basic_block bb = BLOCK_FOR_INSN (DF_REF_INSN (def));
+ profile_count entry_count = ENTRY_BLOCK_PTR_FOR_FN (cfun)->count;
+ bool speed_p = optimize_bb_for_speed_p (bb);
+ int cost = 0;
+
if (!bitmap_bit_p (insns, DF_REF_INSN_UID (def)))
{
if (!bitmap_set_bit (insns_conv, DF_REF_INSN_UID (def))
&& !reg_new)
return;
- n_integer_to_sse++;
+
+ /* Cost integer to sse moves. */
+ if (speed_p)
+ cost = COSTS_N_INSNS (ix86_cost->integer_to_sse) / 2;
+ else if (TARGET_64BIT || smode == SImode)
+ cost = COSTS_N_BYTES (4);
+ /* vmovd (4 bytes) + vpinsrd (6 bytes). */
+ else if (TARGET_SSE4_1)
+ cost = COSTS_N_BYTES (10);
+ /* movd (4 bytes) + movd (4 bytes) + unpckldq (4 bytes). */
+ else
+ cost = COSTS_N_BYTES (12);
}
else
{
if (!reg_new)
return;
- n_sse_to_integer++;
+
+ /* Cost sse to integer moves. */
+ if (speed_p)
+ cost = COSTS_N_INSNS (ix86_cost->sse_to_integer) / 2;
+ else if (TARGET_64BIT || smode == SImode)
+ cost = COSTS_N_BYTES (4);
+ /* vmovd (4 bytes) + vpextrd (6 bytes). */
+ else if (TARGET_SSE4_1)
+ cost = COSTS_N_BYTES (10);
+ /* movd (4 bytes) + psrlq (5 bytes) + movd (4 bytes). */
+ else
+ cost = COSTS_N_BYTES (13);
}
+ if (speed_p)
+ weighted_cost_sse_integer += bb->count.to_sreal_scale (entry_count) * cost;
+
+ cost_sse_integer += cost;
+
if (dump_file)
fprintf (dump_file,
" Mark r%d def in insn %d as requiring both modes in chain #%d\n",
@@ -531,15 +562,15 @@ general_scalar_chain::vector_const_cost (rtx exp,
basic_block bb)
return COSTS_N_INSNS (ix86_cost->sse_load[smode == DImode ? 1 : 0]) / 2;
}
-/* Compute a gain for chain conversion. */
+/* Return true if it's cost profitable for chain conversion. */
-int
+bool
general_scalar_chain::compute_convert_gain ()
{
bitmap_iterator bi;
unsigned insn_uid;
int gain = 0;
- int cost = 0;
+ sreal weighted_gain = 0;
if (dump_file)
fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id);
@@ -559,10 +590,13 @@ general_scalar_chain::compute_convert_gain ()
rtx dst = SET_DEST (def_set);
basic_block bb = BLOCK_FOR_INSN (insn);
int igain = 0;
+ profile_count entry_count = ENTRY_BLOCK_PTR_FOR_FN (cfun)->count;
+ bool speed_p = optimize_bb_for_speed_p (bb);
+ sreal bb_freq = bb->count.to_sreal_scale (entry_count);
if (REG_P (src) && REG_P (dst))
{
- if (optimize_bb_for_size_p (bb))
+ if (!speed_p)
/* reg-reg move is 2 bytes, while SSE 3. */
igain += COSTS_N_BYTES (2 * m - 3);
else
@@ -571,7 +605,7 @@ general_scalar_chain::compute_convert_gain ()
}
else if (REG_P (src) && MEM_P (dst))
{
- if (optimize_bb_for_size_p (bb))
+ if (!speed_p)
/* Integer load/store is 3+ bytes and SSE 4+. */
igain += COSTS_N_BYTES (3 * m - 4);
else
@@ -581,7 +615,7 @@ general_scalar_chain::compute_convert_gain ()
}
else if (MEM_P (src) && REG_P (dst))
{
- if (optimize_bb_for_size_p (bb))
+ if (!speed_p)
igain += COSTS_N_BYTES (3 * m - 4);
else
igain += COSTS_N_INSNS (m * ix86_cost->int_load[2]
@@ -593,7 +627,7 @@ general_scalar_chain::compute_convert_gain ()
of explicit load and store instructions. */
if (MEM_P (dst))
{
- if (optimize_bb_for_size_p (bb))
+ if (!speed_p)
/* ??? This probably should account size difference
of SSE and integer load rather than full SSE load. */
igain -= COSTS_N_BYTES (8);
@@ -667,7 +701,7 @@ general_scalar_chain::compute_convert_gain ()
igain -= vector_const_cost (XEXP (src, 1), bb);
if (MEM_P (XEXP (src, 1)))
{
- if (optimize_bb_for_size_p (bb))
+ if (!speed_p)
igain -= COSTS_N_BYTES (m == 2 ? 3 : 5);
else
igain += COSTS_N_INSNS
@@ -730,7 +764,7 @@ general_scalar_chain::compute_convert_gain ()
case CONST_INT:
if (REG_P (dst))
{
- if (optimize_bb_for_size_p (bb))
+ if (!speed_p)
{
/* xor (2 bytes) vs. xorps (3 bytes). */
if (src == const0_rtx)
@@ -769,14 +803,14 @@ general_scalar_chain::compute_convert_gain ()
if (XVECEXP (XEXP (src, 1), 0, 0) == const0_rtx)
{
// movd (4 bytes) replaced with movdqa (4 bytes).
- if (!optimize_bb_for_size_p (bb))
+ if (!!speed_p)
igain += COSTS_N_INSNS (ix86_cost->sse_to_integer
- ix86_cost->xmm_move) / 2;
}
else
{
// pshufd; movd replaced with pshufd.
- if (optimize_bb_for_size_p (bb))
+ if (!speed_p)
igain += COSTS_N_BYTES (4);
else
igain += ix86_cost->sse_to_integer;
@@ -788,55 +822,34 @@ general_scalar_chain::compute_convert_gain ()
}
}
+ if (speed_p)
+ weighted_gain += bb_freq * igain;
+ gain += igain;
+
if (igain != 0 && dump_file)
{
- fprintf (dump_file, " Instruction gain %d for ", igain);
+ fprintf (dump_file, " Instruction gain %d with bb_freq %.2f for",
+ igain, bb_freq.to_double ());
dump_insn_slim (dump_file, insn);
}
- gain += igain;
}
if (dump_file)
- fprintf (dump_file, " Instruction conversion gain: %d\n", gain);
-
- /* Cost the integer to sse and sse to integer moves. */
- if (!optimize_function_for_size_p (cfun))
{
- cost += n_sse_to_integer * COSTS_N_INSNS (ix86_cost->sse_to_integer) / 2;
- /* ??? integer_to_sse but we only have that in the RA cost table.
- Assume sse_to_integer/integer_to_sse are the same which they
- are at the moment. */
- cost += n_integer_to_sse * COSTS_N_INSNS (ix86_cost->integer_to_sse) / 2;
+ fprintf (dump_file, " Instruction conversion gain: %d, \n",
+ gain);
+ fprintf (dump_file, " Registers conversion cost: %d\n",
+ cost_sse_integer);
+ fprintf (dump_file, " Weighted instruction conversion gain: %.2f, \n",
+ weighted_gain.to_double ());
+ fprintf (dump_file, " Weighted registers conversion cost: %.2f\n",
+ weighted_cost_sse_integer.to_double ());
}
- else if (TARGET_64BIT || smode == SImode)
- {
- cost += n_sse_to_integer * COSTS_N_BYTES (4);
- cost += n_integer_to_sse * COSTS_N_BYTES (4);
- }
- else if (TARGET_SSE4_1)
- {
- /* vmovd (4 bytes) + vpextrd (6 bytes). */
- cost += n_sse_to_integer * COSTS_N_BYTES (10);
- /* vmovd (4 bytes) + vpinsrd (6 bytes). */
- cost += n_integer_to_sse * COSTS_N_BYTES (10);
- }
- else
- {
- /* movd (4 bytes) + psrlq (5 bytes) + movd (4 bytes). */
- cost += n_sse_to_integer * COSTS_N_BYTES (13);
- /* movd (4 bytes) + movd (4 bytes) + unpckldq (4 bytes). */
- cost += n_integer_to_sse * COSTS_N_BYTES (12);
- }
-
- if (dump_file)
- fprintf (dump_file, " Registers conversion cost: %d\n", cost);
-
- gain -= cost;
- if (dump_file)
- fprintf (dump_file, " Total gain: %d\n", gain);
-
- return gain;
+ if (weighted_gain != weighted_cost_sse_integer)
+ return weighted_gain > weighted_cost_sse_integer;
+ else
+ return gain > cost_sse_integer;;
}
/* Insert generated conversion instruction sequence INSNS
@@ -1553,21 +1566,22 @@ timode_immed_const_gain (rtx cst, basic_block bb)
return 0;
}
-/* Compute a gain for chain conversion. */
+/* Return true it's cost profitable for for chain conversion. */
-int
+bool
timode_scalar_chain::compute_convert_gain ()
{
/* Assume that if we have to move TImode values between units,
then transforming this chain isn't worth it. */
- if (n_sse_to_integer || n_integer_to_sse)
- return -1;
+ if (cost_sse_integer)
+ return false;
bitmap_iterator bi;
unsigned insn_uid;
/* Split ties to prefer V1TImode when not optimizing for size. */
int gain = optimize_size ? 0 : 1;
+ sreal weighted_gain = 0;
if (dump_file)
fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id);
@@ -1582,32 +1596,33 @@ timode_scalar_chain::compute_convert_gain ()
basic_block bb = BLOCK_FOR_INSN (insn);
int scost, vcost;
int igain = 0;
+ profile_count entry_count = ENTRY_BLOCK_PTR_FOR_FN (cfun)->count;
+ bool speed_p = optimize_bb_for_speed_p (bb);
+ sreal bb_freq = bb->count.to_sreal_scale (entry_count);
switch (GET_CODE (src))
{
case REG:
- if (optimize_bb_for_size_p (bb))
+ if (!speed_p)
igain = MEM_P (dst) ? COSTS_N_BYTES (6) : COSTS_N_BYTES (3);
else
igain = COSTS_N_INSNS (1);
break;
case MEM:
- igain = optimize_bb_for_size_p (bb) ? COSTS_N_BYTES (7)
- : COSTS_N_INSNS (1);
+ igain = !speed_p ? COSTS_N_BYTES (7) : COSTS_N_INSNS (1);
break;
case CONST_INT:
if (MEM_P (dst)
&& standard_sse_constant_p (src, V1TImode))
- igain = optimize_bb_for_size_p (bb) ? COSTS_N_BYTES (11) : 1;
+ igain = !speed_p ? COSTS_N_BYTES (11) : 1;
break;
case CONST_WIDE_INT:
/* 2 x mov vs. vmovdqa. */
if (MEM_P (dst))
- igain = optimize_bb_for_size_p (bb) ? COSTS_N_BYTES (3)
- : COSTS_N_INSNS (1);
+ igain = !speed_p ? COSTS_N_BYTES (3) : COSTS_N_INSNS (1);
break;
case NOT:
@@ -1628,7 +1643,7 @@ timode_scalar_chain::compute_convert_gain ()
case LSHIFTRT:
/* See ix86_expand_v1ti_shift. */
op1val = INTVAL (XEXP (src, 1));
- if (optimize_bb_for_size_p (bb))
+ if (!speed_p)
{
if (op1val == 64 || op1val == 65)
scost = COSTS_N_BYTES (5);
@@ -1662,7 +1677,7 @@ timode_scalar_chain::compute_convert_gain ()
case ASHIFTRT:
/* See ix86_expand_v1ti_ashiftrt. */
op1val = INTVAL (XEXP (src, 1));
- if (optimize_bb_for_size_p (bb))
+ if (!speed_p)
{
if (op1val == 64 || op1val == 127)
scost = COSTS_N_BYTES (7);
@@ -1740,7 +1755,7 @@ timode_scalar_chain::compute_convert_gain ()
case ROTATERT:
/* See ix86_expand_v1ti_rotate. */
op1val = INTVAL (XEXP (src, 1));
- if (optimize_bb_for_size_p (bb))
+ if (!speed_p)
{
scost = COSTS_N_BYTES (13);
if ((op1val & 31) == 0)
@@ -1772,34 +1787,40 @@ timode_scalar_chain::compute_convert_gain ()
{
if (GET_CODE (XEXP (src, 0)) == AND)
/* and;and;or (9 bytes) vs. ptest (5 bytes). */
- igain = optimize_bb_for_size_p (bb) ? COSTS_N_BYTES (4)
- : COSTS_N_INSNS (2);
+ igain = !speed_p ? COSTS_N_BYTES (4) : COSTS_N_INSNS (2);
/* or (3 bytes) vs. ptest (5 bytes). */
- else if (optimize_bb_for_size_p (bb))
+ else if (!speed_p)
igain = -COSTS_N_BYTES (2);
}
else if (XEXP (src, 1) == const1_rtx)
/* and;cmp -1 (7 bytes) vs. pcmpeqd;pxor;ptest (13 bytes). */
- igain = optimize_bb_for_size_p (bb) ? -COSTS_N_BYTES (6)
- : -COSTS_N_INSNS (1);
+ igain = !speed_p ? -COSTS_N_BYTES (6) : -COSTS_N_INSNS (1);
break;
default:
break;
}
+ gain += igain;
+ if (speed_p)
+ weighted_gain += bb_freq * igain;
+
if (igain != 0 && dump_file)
{
- fprintf (dump_file, " Instruction gain %d for ", igain);
+ fprintf (dump_file, " Instruction gain %d with bb_freq %.2f for ",
+ igain, bb_freq.to_double ());
dump_insn_slim (dump_file, insn);
}
- gain += igain;
}
if (dump_file)
- fprintf (dump_file, " Total gain: %d\n", gain);
+ fprintf (dump_file, " Total gain: %d, weighted gain %.2f\n",
+ gain, weighted_gain.to_double ());
- return gain;
+ if (weighted_gain > (sreal) 0)
+ return true;
+ else
+ return gain > 0;
}
/* Fix uses of converted REG in debug insns. */
@@ -2595,7 +2616,7 @@ convert_scalars_to_vector (bool timode_p)
conversions. */
if (chain->build (&candidates[i], uid, disallowed))
{
- if (chain->compute_convert_gain () > 0)
+ if (chain->compute_convert_gain ())
converted_insns += chain->convert ();
else if (dump_file)
fprintf (dump_file, "Chain #%d conversion is not profitable\n",
diff --git a/gcc/config/i386/i386-features.h b/gcc/config/i386/i386-features.h
index 7f7c0f78c96..e3719b31529 100644
--- a/gcc/config/i386/i386-features.h
+++ b/gcc/config/i386/i386-features.h
@@ -153,12 +153,13 @@ class scalar_chain
bitmap insns_conv;
hash_map<rtx, rtx> defs_map;
- unsigned n_sse_to_integer;
- unsigned n_integer_to_sse;
+ /* Cost of inserted conversion between ineteger and sse. */
+ int cost_sse_integer;
+ sreal weighted_cost_sse_integer;
auto_vec<rtx_insn *> control_flow_insns;
bool build (bitmap candidates, unsigned insn_uid, bitmap disallowed);
- virtual int compute_convert_gain () = 0;
+ virtual bool compute_convert_gain () = 0;
int convert ();
protected:
@@ -184,7 +185,7 @@ class general_scalar_chain : public scalar_chain
public:
general_scalar_chain (enum machine_mode smode_, enum machine_mode vmode_)
: scalar_chain (smode_, vmode_) {}
- int compute_convert_gain () final override;
+ bool compute_convert_gain () final override;
private:
void convert_insn (rtx_insn *insn) final override;
@@ -196,7 +197,7 @@ class timode_scalar_chain : public scalar_chain
{
public:
timode_scalar_chain () : scalar_chain (TImode, V1TImode) {}
- int compute_convert_gain () final override;
+ bool compute_convert_gain () final override;
private:
void fix_debug_reg_uses (rtx reg);
--
2.34.1