On Fri, Dec 2, 2011 at 11:59 AM, Xinliang David Li <davi...@google.com> wrote:
> ;
>>
>> +/* Determine whether LOOP contains floating-point computation. */
>> +bool
>> +loop_has_FP_comp(struct loop *loop)
>> +{
>> +  rtx set, dest;
>
> This probably should be extended to detect other long latency
> operations in the future.
>
>
>> +
>> +  if (ix86_tune != PROCESSOR_COREI7_64 &&
>> +      ix86_tune != PROCESSOR_COREI7_32)
>> +    return nunroll;
>
> Would it be better to generalize this and model the loop stream detector
> (LSD) and its size in the target model description? That is probably a
> separate patch, though.

Yes, I thought it made sense to keep the check here for now, but it
could be generalized that way to handle the limits in different
implementations.

>
>
>> +
>> +  /* Look for instructions that store a constant into HImode (16-bit)
>> +     memory. These require a length-changing prefix and on corei7 are
>> +     prone to LCP stalls. These stalls can be avoided if the loop
>> +     is streamed from the loop stream detector. */
>> +  body = get_loop_body (loop);
>> +  for (i = 0; i < loop->num_nodes && !found; i++)
>> +    {
>> +      bb = body[i];
>> +
>> +      FOR_BB_INSNS (bb, insn)
>> +        {
>> +          rtx set_expr;
>> +          set_expr = single_set (insn);
>> +          if (set_expr != NULL_RTX
>> +              && GET_MODE (SET_DEST (set_expr)) == HImode
>> +              && CONST_INT_P (SET_SRC (set_expr))
>> +              && MEM_P (SET_DEST (set_expr)))
>> +            {
>> +              found = true;
>> +              break;
>> +            }
>> +        }
>> +    }
>> +  free (body);
>
>
> This should probably be generalized to handle other long-latency
> front-end (FE) stalls -- for now it only handles LCP stalls.
>
>> +
>> +  if (!found)
>> +    return nunroll;
>> +
>> +  /* Don't reduce unroll factor in loops with floating point
>> +     computation, which tend to benefit more heavily from
>> +     larger unroll factors and are less likely to bottleneck
>> +     at the decoder. */
>> +  has_FP = loop_has_FP_comp(loop);
>> +  if (has_FP)
>> +    return nunroll;
>> +
>> +  if (dump_file)
>> +    {
>> +      fprintf (dump_file,
>> +               ";; Loop contains HImode store of const (possible LCP stalls),\n");
>> +      fprintf (dump_file,
>> +               "   reduce unroll factor to fit into Loop Stream Detector\n");
>> +    }
>> +
>> +  /* On corei7 the loop stream detector can hold about 28 instructions, so
>> +     don't allow unrolling to exceed that. */
>> +  newunroll = 28 / loop->av_ninsns;
>
> Is 28 the number of instructions or the number of uops?

It is actually 28 uops. I have updated the comments in the latest
patch, which I am sending out in a follow-on email.

Thanks,
Teresa

>
> thanks,
>
> David
>
>> +  if (newunroll < nunroll)
>> +    return newunroll;
>> +
>> +  return nunroll;
>> +}
>> +
>>  /* Initialize the GCC target structure.  */
>>  #undef TARGET_RETURN_IN_MEMORY
>>  #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
>> @@ -38685,6 +38755,9 @@ ix86_autovectorize_vector_sizes (void)
>>  #define TARGET_INIT_LIBFUNCS darwin_rename_builtins
>>  #endif
>>
>> +#undef TARGET_LOOP_UNROLL_ADJUST
>> +#define TARGET_LOOP_UNROLL_ADJUST ix86_loop_unroll_adjust
>> +
>>  struct gcc_target targetm = TARGET_INITIALIZER;
>>  ^L
>>  #include "gt-i386.h"
>>
>> --
>> Teresa Johnson | Software Engineer | tejohn...@google.com | 408-460-2413



-- 
Teresa Johnson | Software Engineer | tejohn...@google.com | 408-460-2413

Reply via email to