Re: [PATCH 2/3] Add profiling support for IVOPTS

Martin Liška Thu, 19 May 2016 03:29:05 -0700

On 05/17/2016 12:27 AM, Bin.Cheng wrote:
>> As profile-guided optimization can provide very useful information
>> about basic block frequencies within a loop, following patch set leverages
>> that information. It speeds up a single benchmark from upcoming SPECv6
>> suite by 20% (-O2 -profile-generate/-fprofile use) and I think it can
>> also improve others (currently measuring numbers for PGO).
> Hi,
> Is this 20% improvement from this patch, or does it include the
> existing PGO's improvement?


Hello.

It shows that current trunk (compared to GCC 6 branch)
has significantly improved the benchmark with PGO.
Currently, my patch improves PGO by ~5% w/ -O2, but our plan is to
improve static profile that would utilize the patch.

> 
> For the patch:
>> +
>> +  /* Return true if the frequency has a valid value.  */
>> +  bool has_frequency ();
>> +
>>    /* Return infinite comp_cost.  */
>>    static comp_cost get_infinite ();
>>
>> @@ -249,6 +272,9 @@ private:
>>       complexity field should be larger for more
>>       complex expressions and addressing modes).  */
>>    int m_scratch;  /* Scratch used during cost computation.  */
>> +  sreal m_frequency;  /* Frequency of the basic block this comp_cost
>> +     belongs to.  */
>> +  sreal m_cost_scaled;  /* Scalled runtime cost.  */
> IMHO we shouldn't embed frequency in comp_cost, neither record scaled
> cost in it.  I would suggest we compute cost and amortize the cost
> over frequency in get_computation_cost_at before storing it into
> comp_cost.  That is, once cost is computed/stored in comp_cost, it is
> already scaled with frequency.  One argument is frequency info is only
> valid for use's statement/basic_block, it really doesn't have clear
> meaning in comp_cost structure.  Outside of function
> get_computation_cost_at, I found it's hard to understand/remember
> what's the meaning of comp_cost.m_frequency and where it came from.
> There are other reasons embedded in below comments.
>>
>>
>>  comp_cost&
>> @@ -257,6 +283,8 @@ comp_cost::operator= (const comp_cost& other)
>>    m_cost = other.m_cost;
>>    m_complexity = other.m_complexity;
>>    m_scratch = other.m_scratch;
>> +  m_frequency = other.m_frequency;
>> +  m_cost_scaled = other.m_cost_scaled;
>>
>>    return *this;
>>  }
>> @@ -275,6 +303,7 @@ operator+ (comp_cost cost1, comp_cost cost2)
>>
>>    cost1.m_cost += cost2.m_cost;
>>    cost1.m_complexity += cost2.m_complexity;
>> +  cost1.m_cost_scaled += cost2.m_cost_scaled;
>>
>>    return cost1;
>>  }
>> @@ -290,6 +319,8 @@ comp_cost
>>  comp_cost::operator+= (HOST_WIDE_INT c)
> This and below operators need check for infinite cost first and return
> immediately.
>>  {
>>    this->m_cost += c;
>> +  if (has_frequency ())
>> +    this->m_cost_scaled += scale_cost (c);
>>
>>    return *this;
>>  }
>> @@ -5047,18 +5128,21 @@ get_computation_cost_at (struct ivopts_data *data,
>>       (symbol/var1/const parts may be omitted).  If we are looking for an
>>       address, find the cost of addressing this.  */
>>    if (address_p)
>> -    return cost + get_address_cost (symbol_present, var_present,
>> -    offset, ratio, cstepi,
>> -    mem_mode,
>> -    TYPE_ADDR_SPACE (TREE_TYPE (utype)),
>> -    speed, stmt_is_after_inc, can_autoinc);
>> +    {
>> +      cost += get_address_cost (symbol_present, var_present,
>> + offset, ratio, cstepi,
>> + mem_mode,
>> + TYPE_ADDR_SPACE (TREE_TYPE (utype)),
>> + speed, stmt_is_after_inc, can_autoinc);
>> +      goto ret;
>> +    }
>>
>>    /* Otherwise estimate the costs for computing the expression.  */
>>    if (!symbol_present && !var_present && !offset)
>>      {
>>        if (ratio != 1)
>>   cost += mult_by_coeff_cost (ratio, TYPE_MODE (ctype), speed);
>> -      return cost;
>> +      goto ret;
>>      }
>>
>>    /* Symbol + offset should be compile-time computable so consider that they
>> @@ -5077,7 +5161,8 @@ get_computation_cost_at (struct ivopts_data *data,
>>    aratio = ratio > 0 ? ratio : -ratio;
>>    if (aratio != 1)
>>      cost += mult_by_coeff_cost (aratio, TYPE_MODE (ctype), speed);
>> -  return cost;
>> +
>> +  goto ret;
>>
>>  fallback:
>>    if (can_autoinc)
>> @@ -5093,8 +5178,13 @@ fallback:
>>      if (address_p)
>>        comp = build_simple_mem_ref (comp);
>>
>> -    return comp_cost (computation_cost (comp, speed), 0);
>> +    cost = comp_cost (computation_cost (comp, speed), 0);
>>    }
>> +
>> +ret:
>> +  cost.calculate_scaled_cost (at->bb->frequency,
>> +      data->current_loop->header->frequency);
> Here cost consists of two parts.  One is for loop invariant
> computation, we amortize is against avg_loop_niter and record register
> pressure (either via invriant variables or invariant expressions) for
> it;  the other is loop variant part.  For the first part, we should
> not scaled it using frequency, since we have already assumed it would
> be hoisted out of loop.  No matter where the use is, hoisted loop
> invariant has the same frequency as loop header.  This is the second
> reason I want to factor frequency out of comp_cost.  It's easier to
> scale with frequency only it's necessary.
> 
>> +  return cost;
>>  }
>>
>>  /* Determines the cost of the computation by that USE is expressed
>> @@ -5922,16 +6012,19 @@ determine_group_iv_costs (struct ivopts_data *data)
>>    group = data->vgroups[i];
>>
>>    fprintf (dump_file, "Group %d:\n", i);
>> -  fprintf (dump_file, "  cand\tcost\tcompl.\tinv.ex.\tdepends on\n");
>> +  fprintf (dump_file, "  cand\tcost\tscaled\tfreq.\tcompl.\tinv.ex."
>> +   "\tdepends on\n");
>>    for (j = 0; j < group->n_map_members; j++)
>>      {
>>        if (!group->cost_map[j].cand
>>    || group->cost_map[j].cost.infinite_cost_p ())
>>   continue;
>>
>> -      fprintf (dump_file, "  %d\t%d\t%d\t",
>> +      fprintf (dump_file, "  %d\t%d\t%2.2f\t%2.2f\t%d\t",
>>         group->cost_map[j].cand->id,
>>         group->cost_map[j].cost.get_cost (),
>> +       group->cost_map[j].cost.get_cost_scaled (),
>> +       group->cost_map[j].cost.get_frequency (),
>>         group->cost_map[j].cost.get_complexity ());
>>        if (group->cost_map[j].inv_expr != NULL)
>>   fprintf (dump_file, "%d\t",
>> @@ -5948,6 +6041,19 @@ determine_group_iv_costs (struct ivopts_data *data)
>>   }
>>        fprintf (dump_file, "\n");
>>      }
>> +
>> +  for (i = 0; i < data->vgroups.length (); i++)
>> +    {
>> +      group = data->vgroups[i];
>> +      for (j = 0; j < group->n_map_members; j++)
>> + {
>> +  if (!group->cost_map[j].cand
>> +      || group->cost_map[j].cost.infinite_cost_p ())
>> +    continue;
>> +
>> +  group->cost_map[j].cost.propagate_scaled_cost ();
>> + }
>> +    }
> This is wrong.  m_frequency and m_cost_scaled are initialized to
> sreal(0) by default, and are never changed later for conditional
> iv_use.  As a matter of factor, costs computed for all conditional
> iv_uses are wrong (value is 0).  This makes the observed improvement
> not that promising.  Considering code generation is very sensitive to
> cost computation, it maybe just hit some special cases.  Eitherway we
> need more work/investigation on the impact of this patch.
> 
> Again, I would suggest we factor out frequency out of comp_cost and
> only scale the cost in place when we compute cost for each use.  Then
> we can measure what's the impact on code generation.
> 
> Thanks,
> bin
> 

All remarks were applied in third version of the patch. Together with the 
previous
patch, it survives bootstrap and regression tests on x86_64-linux-gnu.
I'm going to re-test the patch on SPEC benchmarks.

Martin

>From 24e5d3f6747c77d1437feab11ff1e3888779b4d4 Mon Sep 17 00:00:00 2001
From: marxin <mli...@suse.cz>
Date: Tue, 17 May 2016 15:22:43 +0200
Subject: [PATCH 2/4] Add profiling support for IVOPTS

gcc/ChangeLog:

2016-05-17  Martin Liska  <mli...@suse.cz>

	* tree-ssa-loop-ivopts.c (get_computation_cost_at): Scale
	computed costs by frequency of BB they belong to.
---
 gcc/tree-ssa-loop-ivopts.c | 42 ++++++++++++++++++++++++++++++++++--------
 1 file changed, 34 insertions(+), 8 deletions(-)

diff --git a/gcc/tree-ssa-loop-ivopts.c b/gcc/tree-ssa-loop-ivopts.c
index f48b2f6..8a82831 100644
--- a/gcc/tree-ssa-loop-ivopts.c
+++ b/gcc/tree-ssa-loop-ivopts.c
@@ -4995,18 +4995,21 @@ get_computation_cost_at (struct ivopts_data *data,
      (symbol/var1/const parts may be omitted).  If we are looking for an
      address, find the cost of addressing this.  */
   if (address_p)
-    return cost + get_address_cost (symbol_present, var_present,
-				    offset, ratio, cstepi,
-				    mem_mode,
-				    TYPE_ADDR_SPACE (TREE_TYPE (utype)),
-				    speed, stmt_is_after_inc, can_autoinc);
+    {
+      cost += get_address_cost (symbol_present, var_present,
+				offset, ratio, cstepi,
+				mem_mode,
+				TYPE_ADDR_SPACE (TREE_TYPE (utype)),
+				speed, stmt_is_after_inc, can_autoinc);
+      goto ret;
+    }
 
   /* Otherwise estimate the costs for computing the expression.  */
   if (!symbol_present && !var_present && !offset)
     {
       if (ratio != 1)
 	cost += mult_by_coeff_cost (ratio, TYPE_MODE (ctype), speed);
-      return cost;
+      goto ret;
     }
 
   /* Symbol + offset should be compile-time computable so consider that they
@@ -5025,7 +5028,7 @@ get_computation_cost_at (struct ivopts_data *data,
   aratio = ratio > 0 ? ratio : -ratio;
   if (aratio != 1)
     cost += mult_by_coeff_cost (aratio, TYPE_MODE (ctype), speed);
-  return cost;
+  goto ret;
 
 fallback:
   if (can_autoinc)
@@ -5041,8 +5044,31 @@ fallback:
     if (address_p)
       comp = build_simple_mem_ref (comp);
 
-    return comp_cost (computation_cost (comp, speed), 0);
+    cost = comp_cost (computation_cost (comp, speed), 0);
   }
+
+ret:
+  /* Scale (multiply) the computed cost (except scratch part that should be
+     hoisted out a loop) by header->frequency / at->frequency,
+     which makes expected cost more accurate.  */
+   int loop_freq = data->current_loop->header->frequency;
+   int bb_freq = at->bb->frequency;
+   if (loop_freq != 0)
+     {
+       gcc_assert (cost.scratch <= cost.cost);
+       int scaled_cost
+	 = cost.scratch + (cost.cost - cost.scratch) * bb_freq / loop_freq;
+
+       if (dump_file && (dump_flags & TDF_DETAILS))
+	 fprintf (dump_file, "Scaling iv_use based on cand %d "
+		  "by %2.2f: %d (scratch: %d) -> %d (%d/%d)\n",
+		  cand->id, 1.0f * bb_freq / loop_freq, cost.cost,
+		  cost.scratch, scaled_cost, bb_freq, loop_freq);
+
+       cost.cost = scaled_cost;
+     }
+
+   return cost;
 }
 
 /* Determines the cost of the computation by that USE is expressed
-- 
2.8.2

Re: [PATCH 2/3] Add profiling support for IVOPTS

Reply via email to