On x86_64, when the expected size of memcpy/memset is known (e.g., with FDO), the libcall strategy is used when the size is > 8192. This value is hard-coded, which makes performance tuning difficult. This patch adds two new parameters that make the threshold tunable. Potential usage includes per-application tuning of the libcall strategy's minimum size based on summary data from FDO (e.g., instruction working-set size).
Bootstrapped and tested on x86_64/linux. Ok for trunk? Thanks, David 2013-08-02 Xinliang David Li <davi...@google.com> * params.def: New parameters. * config/i386/i386.c (ix86_option_override_internal): Override default libcall size limit with parameters.
Index: config/i386/i386.c =================================================================== --- config/i386/i386.c (revision 201458) +++ config/i386/i386.c (working copy) @@ -156,7 +156,7 @@ struct processor_costs ix86_size_cost = }; /* Processor costs (relative to an add) */ -static const +static struct processor_costs i386_cost = { /* 386 specific costs */ COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1), /* cost of a lea instruction */ @@ -226,7 +226,7 @@ struct processor_costs i386_cost = { /* 1, /* cond_not_taken_branch_cost. */ }; -static const +static struct processor_costs i486_cost = { /* 486 specific costs */ COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1), /* cost of a lea instruction */ @@ -298,7 +298,7 @@ struct processor_costs i486_cost = { /* 1, /* cond_not_taken_branch_cost. */ }; -static const +static struct processor_costs pentium_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1), /* cost of a lea instruction */ @@ -368,7 +368,7 @@ struct processor_costs pentium_cost = { 1, /* cond_not_taken_branch_cost. */ }; -static const +static struct processor_costs pentiumpro_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1), /* cost of a lea instruction */ @@ -447,7 +447,7 @@ struct processor_costs pentiumpro_cost = 1, /* cond_not_taken_branch_cost. */ }; -static const +static struct processor_costs geode_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1), /* cost of a lea instruction */ @@ -518,7 +518,7 @@ struct processor_costs geode_cost = { 1, /* cond_not_taken_branch_cost. */ }; -static const +static struct processor_costs k6_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (2), /* cost of a lea instruction */ @@ -591,7 +591,7 @@ struct processor_costs k6_cost = { 1, /* cond_not_taken_branch_cost. 
*/ }; -static const +static struct processor_costs athlon_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (2), /* cost of a lea instruction */ @@ -664,7 +664,7 @@ struct processor_costs athlon_cost = { 1, /* cond_not_taken_branch_cost. */ }; -static const +static struct processor_costs k8_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (2), /* cost of a lea instruction */ @@ -1265,7 +1265,7 @@ struct processor_costs btver2_cost = { 1, /* cond_not_taken_branch_cost. */ }; -static const +static struct processor_costs pentium4_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (3), /* cost of a lea instruction */ @@ -1336,7 +1336,7 @@ struct processor_costs pentium4_cost = { 1, /* cond_not_taken_branch_cost. */ }; -static const +static struct processor_costs nocona_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1), /* cost of a lea instruction */ @@ -1409,7 +1409,7 @@ struct processor_costs nocona_cost = { 1, /* cond_not_taken_branch_cost. */ }; -static const +static struct processor_costs atom_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ @@ -1556,7 +1556,7 @@ struct processor_costs slm_cost = { }; /* Generic64 should produce code tuned for Nocona and K8. */ -static const +static struct processor_costs generic64_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ /* On all chips taken into consideration lea is 2 cycles and more. With @@ -1635,7 +1635,7 @@ struct processor_costs generic64_cost = }; /* core_cost should produce code tuned for Core familly of CPUs. */ -static const +static struct processor_costs core_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ /* On all chips taken into consideration lea is 2 cycles and more. With @@ -1717,7 +1717,7 @@ struct processor_costs core_cost = { /* Generic32 should produce code tuned for PPro, Pentium4, Nocona, Athlon and K8. 
*/ -static const +static struct processor_costs generic32_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ @@ -4021,6 +4021,34 @@ ix86_option_override_internal (bool main /* Handle stack protector */ if (!global_options_set.x_ix86_stack_protector_guard) ix86_stack_protector_guard = TARGET_HAS_BIONIC ? SSP_GLOBAL : SSP_TLS; + + /* Now override the memcpy/memset inline strategy parameters */ + if (PARAM_VALUE (PARAM_MEMCPY_LIBCALL_MIN_SIZE) != -1 + || PARAM_VALUE (PARAM_MEMSET_LIBCALL_MIN_SIZE) != -1) + { + const struct stringop_algs *algs[2]; + int k; + int min_sizes[2]; + + algs[0] = &ix86_cost->memset[TARGET_64BIT != 0]; + algs[1] = &ix86_cost->memcpy[TARGET_64BIT != 0]; + + min_sizes[0] = PARAM_VALUE (PARAM_MEMSET_LIBCALL_MIN_SIZE); + min_sizes[1] = PARAM_VALUE (PARAM_MEMCPY_LIBCALL_MIN_SIZE); + + for (k = 0; k < 2; k++) + { + if (min_sizes[k] == -1) + continue; + + for (i = 0; i < MAX_STRINGOP_ALGS - 1; i++) + { + if (algs[k]->size[i].max >= min_sizes[k] + || algs[k]->size[i + 1].alg == libcall) + *const_cast<int *>(&algs[k]->size[i].max) = min_sizes[k] - 1; + } + } + } } /* Implement the TARGET_OPTION_OVERRIDE hook. */ Index: params.def =================================================================== --- params.def (revision 201458) +++ params.def (working copy) @@ -117,6 +117,18 @@ DEFPARAM (PARAM_COMDAT_SHARING_PROBABILI "Probability that COMDAT function will be shared with different compilation unit", 20, 0, 0) +/* Use libcall strategy when the expected size is no less than this parameter for memcpy. */ +DEFPARAM (PARAM_MEMCPY_LIBCALL_MIN_SIZE, + "memcpy-libcall-min-size", + "The minimal expected size to force libcall expansion strategy for memcpy", + -1, 1, 0) + +/* Use libcall strategy when the expected size is no less than this parameter for memset. 
*/ +DEFPARAM (PARAM_MEMSET_LIBCALL_MIN_SIZE, + "memset-libcall-min-size", + "The minimal expected size to force libcall expansion strategy for memset", + -1, 1, 0) + /* Limit on probability of entry BB. */ DEFPARAM (PARAM_PARTIAL_INLINING_ENTRY_PROBABILITY, "partial-inlining-entry-probability",