> 
> /* If checking is enabled, verify that no lattice is in the TOP state, i.e.
>    not bottom, not containing a variable component and without any known
>    value at the same time.  */
> 
> static void
> verify_propagated_values (void)
> {
> #ifdef ENABLE_CHECKING

Hmm, would it not be better to keep this function around so it can be called
from the debugger, like the other verifiers do?  (A rough sketch of that
pattern follows after the quoted function.)
>   struct cgraph_node *node;
> 
>   FOR_EACH_DEFINED_FUNCTION (node)
>     {
>       struct ipa_node_params *info = IPA_NODE_REF (node);
>       int i, count = ipa_get_param_count (info);
> 
>       for (i = 0; i < count; i++)
>       {
>         struct ipcp_lattice *lat = ipa_get_lattice (info, i);
> 
>         if (!lat->bottom
>             && !lat->contains_variable
>             && lat->values_count == 0)
>           {
>             if (dump_file)
>               {
>                 fprintf (dump_file, "\nIPA lattices after constant "
>                          "propagation:\n");
>                 print_all_lattices (dump_file, true, false);
>               }
> 
>             gcc_unreachable ();
>           }
>       }
>     }
> #endif
> }
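
For reference, the pattern the other verifiers use looks roughly like the
sketch below; the body would stay exactly as in the patch, only the #ifdef
would move to (or disappear from) the caller:

/* Sketch: compile the verifier unconditionally and mark it DEBUG_FUNCTION so
   it is kept as a standalone symbol that can be invoked by hand from gdb,
   instead of compiling the whole body away when ENABLE_CHECKING is off.  */

DEBUG_FUNCTION void
verify_propagated_values (void)
{
  /* ... the FOR_EACH_DEFINED_FUNCTION walk from the patch, calling
     gcc_unreachable () when a TOP lattice is found ...  */
}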
> 
> /* Propagate values through a pass-through jump function JFUNC associated with
>    edge CS, taking values from SRC_LAT and putting them into DEST_LAT.
>    SRC_IDX is the index of the source parameter.  */
> 
> static bool
> propagate_vals_accross_pass_through (struct cgraph_edge *cs,
>                                    struct ipa_jump_func *jfunc,
>                                    struct ipcp_lattice *src_lat,
>                                    struct ipcp_lattice *dest_lat,
>                                    int src_idx)
> {
>   struct ipcp_value *src_val;
>   bool ret = false;
> 
>   if (jfunc->value.pass_through.operation == NOP_EXPR)
>     for (src_val = src_lat->values; src_val; src_val = src_val->next)
>       ret |= add_value_to_lattice (dest_lat, src_val->value, cs,
>                                  src_val, src_idx);
>   else if (edge_within_scc (cs))
>     ret = set_lattice_contains_variable (dest_lat);

Hmm, is the reason for not using arithmetic within an SCC documented somewhere?
It seems like code someone will eventually run into and remove without much
consideration.
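
For instance, a comment along these lines above the branch would capture the
reasoning (at least my understanding of it):

  /* Within an SCC, an arithmetic pass-through could keep producing new
     values on every propagation round (e.g. i, i + 1, i + 2, ...), so the
     lattices would never stabilize.  Drop to variable instead.  */
  else if (edge_within_scc (cs))
    ret = set_lattice_contains_variable (dest_lat);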
> 
> /* Calculate devirtualization time bonus for NODE, assuming we know KNOWN_CSTS
>    and KNOWN_BINFOS.  */
> 
> static int
> devirtualization_time_bonus (struct cgraph_node *node,
>                            VEC (tree, heap) *known_csts,
>                            VEC (tree, heap) *known_binfos)

Eventually it would be nice to make this common with inliner analysis.  We want
to increase inlining limits here too....
>       /* Only bare minimum benefit for clearly un-inlineable targets.  */
>       res += 1;
>       callee = cgraph_get_node (target);
>       if (!callee)
>       continue;

Hmm, when you test it here, you could probably also ask whether the callee is
analyzed and inlinable ;)
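
Something along these lines, assuming the inline summaries are already computed
by the time ipa-cp runs and with isummary as a local struct inline_summary *
(sketch only):

      callee = cgraph_get_node (target);
      if (!callee || !callee->analyzed)
	continue;

      /* Grant the bonus only when the target has an analyzed body and the
	 inliner considers it inlinable at all.  */
      isummary = inline_summary (callee);
      if (!isummary->inlinable)
	continue;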
> 
> /* Return true if cloning NODE is a good idea, given the estimated 
> TIME_BENEFIT
>    and SIZE_COST and with the sum of frequencies of incoming edges to the
>    potential new clone in FREQUENCIES.  */
> 
> static bool
> good_cloning_opportunity_p (struct cgraph_node *node, int time_benefit,
>                           int freq_sum, gcov_type count_sum, int size_cost)
> {
>   if (time_benefit == 0
>       || !flag_ipa_cp_clone
>       || !optimize_function_for_speed_p (DECL_STRUCT_FUNCTION (node->decl)))
>     return false;

Still, I think a cloning opportunity is good if the savings in call size are
enough to pay for the duplication of the function body.
It would then probably make sense to create simple wrapper functions instead of
duplicating the function body.

> 
>   gcc_checking_assert (size_cost >= 0);
> 
>   /* FIXME:  These decisions need tuning.  */
>   if (max_count)
>     {
>       int evaluation, factor = (count_sum * 1000) / max_count;
> 
>       evaluation = (time_benefit * factor) / size_cost;
> 
>       if (dump_file && (dump_flags & TDF_DETAILS))
>       fprintf (dump_file, "     good_cloning_opportunity_p (time: %i, "
>                "size: %i, count_sum: " HOST_WIDE_INT_PRINT_DEC
>                ") -> evaluation: %i, threshold: %i\n",
>                time_benefit, size_cost, (HOST_WIDE_INT) count_sum,
>                evaluation, 500);
> 
>       return evaluation > 500;

Hmm, the magic numbers look a bit scary... putting size and time into a
fraction causes the problem that the units are not really related.

But we will see.  I guess we will need the 500 as a --param value, however
(a possible params.def entry is sketched below the quoted hunk).

We probably want ipa-profile to collect the expected running time of the unit;
then we can compute something like the relative speedup of the unit versus its
relative growth, which is somewhat better defined and closer to a temperature
metric (but of course off from reality).
>     }
>   else
>     {
>       int evaluation = (time_benefit * freq_sum) / size_cost;
> 
>       if (dump_file && (dump_flags & TDF_DETAILS))
>       fprintf (dump_file, "     good_cloning_opportunity_p (time: %i, "
>                "size: %i, freq_sum: %i) -> evaluation: %i, threshold: %i\n",
>                time_benefit, size_cost, freq_sum, evaluation,
>                CGRAPH_FREQ_BASE /2);
> 
>       return evaluation > (CGRAPH_FREQ_BASE /2);

Similarly here, the fraction is quite crazy, but I don't have many ideas on how
to handle it better.
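
If the 500 ends up as a --param, a params.def entry along these lines would do;
the name and wording are of course just a suggestion:

DEFPARAM (PARAM_IPA_CP_EVAL_THRESHOLD,
	  "ipa-cp-eval-threshold",
	  "Threshold ipa-cp opportunity evaluation that is still considered "
	  "beneficial to clone.",
	  500, 0, 0)

and the comparison would then read

  return evaluation > PARAM_VALUE (PARAM_IPA_CP_EVAL_THRESHOLD);

with a similar treatment for the CGRAPH_FREQ_BASE / 2 threshold in the
frequency-based branch.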

>       init_caller_stats (&stats);
>       cgraph_for_node_and_aliases (node, gather_caller_stats, &stats, false);
>       estimate_ipcp_clone_size_and_time (node, known_csts, &size, &time);
>       time -= devirtualization_time_bonus (node, known_csts, known_binfos);
>       size -= stats.n_calls * removable_params;

You could probably use the same logic as when we estimate call size.  Time also
benefits from the caller getting smaller... so I think you should subtract the
sum of the move costs of the removable params.
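
Roughly this kind of thing, i.e. crediting each removable parameter with its
move cost at every call; the removable_parms bitmap and the freq_sum field are
just stand-ins for however that information is actually tracked in the patch
(sketch only):

      for (i = 0; i < count; i++)
	if (bitmap_bit_p (removable_parms, i))   /* hypothetical bitmap */
	  {
	    int cost = estimate_move_cost (TREE_TYPE (ipa_get_param (info, i)));

	    /* Every call saves passing the argument...  */
	    size -= stats.n_calls * cost;
	    /* ...and callers speed up in proportion to how often they make
	       the call.  */
	    time -= stats.freq_sum * cost / CGRAPH_FREQ_BASE;
	  }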
> 
> /* Update the respective profile of specialized NEW_NODE and the original
>    ORIG_NODE after additional edges with cumulative count sum REDIRECTED_SUM
>    have been redirected to the specialized version.  */
> 
> static void
> update_specialized_profile (struct cgraph_node *new_node,
>                           struct cgraph_node *orig_node,
>                           gcov_type redirected_sum)
> {
>   struct cgraph_edge *cs;
>   gcov_type new_node_count, orig_node_count = orig_node->count;
> 
>   if (dump_file)
>     fprintf (dump_file, "    the sum of counts of redirected  edges is "
>            HOST_WIDE_INT_PRINT_DEC "\n", (HOST_WIDE_INT) redirected_sum);
>   if (orig_node_count == 0)
>     return;
> 
>   gcc_assert (orig_node_count >= redirected_sum);
> 
>   new_node_count = new_node->count;
>   new_node->count += redirected_sum;
>   orig_node->count -= redirected_sum;
> 
>   for (cs = new_node->callees; cs ; cs = cs->next_callee)
>     if (cs->frequency)
>       cs->count += cs->count * redirected_sum / new_node_count;
>     else
>       cs->count = 0;
> 
>   for (cs = orig_node->callees; cs ; cs = cs->next_callee)
>     {
>       gcov_type dec = cs->count * redirected_sum / orig_node_count;
Are you sure we can't divide by 0 here or get negative numbers for insane
profiles?  (One possible guard is sketched below the quoted hunk.)
>       if (dec < cs->count)
>       cs->count -= dec;
>       else
>       cs->count = 0;
>     }
> 
>   if (dump_file)
>     dump_profile_updates (orig_node, new_node);
> }
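
Regarding the question above, one defensive option would be something like the
sketch below; orig_node_count is already known to be non-zero thanks to the
early return, but a capped or otherwise insane profile could still make
redirected_sum exceed it despite the assert:

      /* Clamp the scaling factor so the decrement never exceeds cs->count
	 and the result stays non-negative even when the profile does not
	 add up.  */
      gcov_type dec = cs->count * MIN (redirected_sum, orig_node_count)
		      / orig_node_count;

The division by new_node_count in the loop over new_node->callees looks like
the more likely candidate for a zero divisor, since only orig_node_count is
checked above.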
> 
>   if (node->local.can_change_signature)
>     {
>       args_to_skip = BITMAP_GGC_ALLOC ();
>       for (i = 0; i < count; i++)
>       {
>         tree t = VEC_index (tree, known_vals, i);
> 
>         if ((t && TREE_CODE (t) != TREE_BINFO)
>             || !ipa_is_param_used (info, i))
>           bitmap_set_bit (args_to_skip, i);
>       }
>     }
>   else
>     args_to_skip = NULL;
Even when we can't change the signature, we can still set the is_parm_unused
flag for the callee to aid later optimizers.

The rest of the patch looks OK.  It definitely reads better than the previous
ipa-cp.c ;)
I suppose we will need to get some experience with the logic deciding whether
to clone...
Honza
