;
>
> +/* Determine whether LOOP contains floating-point computation. */
> +bool
> +loop_has_FP_comp(struct loop *loop)
> +{
> +  rtx set, dest;

This should probably be extended in the future to detect other
long-latency operations as well.


> +
> +  if (ix86_tune != PROCESSOR_COREI7_64 &&
> +      ix86_tune != PROCESSOR_COREI7_32)
> +    return nunroll;

Would it be better to generalize this and model the Loop Stream Detector
(LSD) and its size in the target model description? That is probably
material for a separate patch, though.


> +
> +  /* Look for instructions that store a constant into HImode (16-bit)
> +     memory. These require a length-changing prefix and on corei7 are
> +     prone to LCP stalls. These stalls can be avoided if the loop
> +     is streamed from the loop stream detector. */
> +  body = get_loop_body (loop);
> +  for (i = 0; i < loop->num_nodes && !found; i++)
> +    {
> +      bb = body[i];
> +
> +      FOR_BB_INSNS (bb, insn)
> +        {
> +          rtx set_expr;
> +          set_expr = single_set (insn);
> +          if (set_expr != NULL_RTX
> +              && GET_MODE (SET_DEST (set_expr)) == HImode
> +              && CONST_INT_P (SET_SRC (set_expr))
> +              && MEM_P (SET_DEST (set_expr)))
> +            {
> +              found = true;
> +              break;
> +            }
> +        }
> +    }
> +  free (body);


This should probably be generalized to handle other long-latency
front-end (FE) stalls -- for now it only handles LCP stalls.

> +
> +  if (!found)
> +    return nunroll;
> +
> +  /* Don't reduce unroll factor in loops with floating point
> +     computation, which tend to benefit more heavily from
> +     larger unroll factors and are less likely to bottleneck
> +     at the decoder. */
> +  has_FP = loop_has_FP_comp(loop);
> +  if (has_FP)
> +    return nunroll;
> +
> +  if (dump_file)
> +    {
> +      fprintf (dump_file,
> +               ";; Loop contains HImode store of const (possible LCP
> stalls),\n");
> +      fprintf (dump_file,
> +               "   reduce unroll factor to fit into Loop Stream Detector\n");
> +    }
> +
> +  /* On corei7 the loop stream detector can hold about 28 instructions, so
> +     don't allow unrolling to exceed that. */
> +  newunroll = 28 / loop->av_ninsns;

Is 28 the number of instructions or the number of uops?

thanks,

David

> +  if (newunroll < nunroll)
> +    return newunroll;
> +
> +  return nunroll;
> +}
> +
>  /* Initialize the GCC target structure.  */
>  #undef TARGET_RETURN_IN_MEMORY
>  #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
> @@ -38685,6 +38755,9 @@ ix86_autovectorize_vector_sizes (void)
>  #define TARGET_INIT_LIBFUNCS darwin_rename_builtins
>  #endif
>
> +#undef TARGET_LOOP_UNROLL_ADJUST
> +#define TARGET_LOOP_UNROLL_ADJUST ix86_loop_unroll_adjust
> +
>  struct gcc_target targetm = TARGET_INITIALIZER;
>  ^L
>  #include "gt-i386.h"
>
> --
> Teresa Johnson | Software Engineer | tejohn...@google.com | 408-460-2413

Reply via email to