On Thu, Apr 21, 2011 at 11:50 AM, Richard Sandiford
<richard.sandif...@linaro.org> wrote:
> To get back to this...
>
> Richard Sandiford <richard.sandif...@linaro.org> writes:
>> Richard Guenther <richard.guent...@gmail.com> writes:
>>> On Thu, Mar 31, 2011 at 3:32 PM, Richard Sandiford
>>> <richard.sandif...@linaro.org> wrote:
>>>> This patch adds an array_mode_supported_p hook, which says whether
>>>> MAX_FIXED_MODE_SIZE should be ignored for a given type of array.
>>>> It follows on from the discussion here:
>>>>
>>>>    http://gcc.gnu.org/ml/gcc/2011-03/msg00342.html
>>>>
>>>> The intended use of the hook is to allow small arrays of vectors
>>>> to have a non-BLK mode, and hence to be stored in rtl registers.
>>>> These arrays are used both in the ARM arm_neon.h API and in the
>>>> optabs proposed in:
>>>>
>>>>    http://gcc.gnu.org/ml/gcc/2011-03/msg00322.html
>>>>
>>>> The tail end of the thread was about the definition of TYPE_MODE:
>>>>
>>>> #define TYPE_MODE(NODE) \
>>>>  (TREE_CODE (TYPE_CHECK (NODE)) == VECTOR_TYPE \
>>>>   ? vector_type_mode (NODE) : (NODE)->type.mode)
>>>>
>>>> with this outcome:
>>>>
>>>>    http://gcc.gnu.org/ml/gcc/2011-03/msg00470.html
>>>>
>>>> To summarise my take on it:
>>>>
>>>> - The current definition of TYPE_MODE isn't sufficient even for vector
>>>>  modes and vector_mode_supported_p, because non-vector types can have
>>>>  vector modes.
>>>>
>>>> - We should no longer treat types as having one mode everywhere.
>>>>  We should instead replace TYPE_MODE with a function that takes
>>>>  a context.  Tests of things like vector_mode_supported_p would
>>>>  move from layout_type to this new function.
>>>>
>>>> I think this patch fits within that scheme.  array_mode_supported_p
>>>> would be treated in the same way as vector_mode_supported_p.
>>>>
>>>> I realise the ideal would be to get rid of TYPE_MODE first.
>>>> But that's going to be a longer-term thing.  Now that there's
>>>> at least a plan, I'd like to press ahead with the array stuff
>>>> on the basis that
>>>>
>>>> (a) although the new hook won't work with the "target" attribute,
>>>>    our current mode handling doesn't work in just the same way.
>>>>
>>>> (b) the new hook doesn't interfere with the plan.
>>>>
>>>> (c) getting good code from the intrinsics (and support for these
>>>>    instructions in the vectoriser) is going to be much more important
>>>>    to most ARM users than the ability to turn Neon on and off for
>>>>    individual functions in a TU.
>>>>
>>>> To give an example of the difference, the Neon code posted here:
>>>>
>>>>    http://hilbert-space.de/?p=22
>>>>
>>>> produces this inner loop before the patch (but with
>>>> http://gcc.gnu.org/ml/gcc-patches/2011-03/msg01996.html applied):
>>>>
>>>> .L3:
>>>>        vld3.8  {d16-d18}, [r1]!
>>>>        vstmia  ip, {d16-d18}
>>>>        fldd    d19, [sp, #24]
>>>>        adr     r5, .L6
>>>>        ldmia   r5, {r4-r5}
>>>>        fldd    d16, [sp, #32]
>>>>        vmov    d18, r4, r5  @ v8qi
>>>>        vmull.u8        q9, d19, d18
>>>>        adr     r5, .L6+8
>>>>        ldmia   r5, {r4-r5}
>>>>        vmov    d17, r4, r5  @ v8qi
>>>>        vstmia  sp, {d18-d19}
>>>>        vmlal.u8        q9, d16, d17
>>>>        fldd    d16, [sp, #40]
>>>>        adr     r5, .L6+16
>>>>        ldmia   r5, {r4-r5}
>>>>        vmov    d17, r4, r5  @ v8qi
>>>>        vmlal.u8        q9, d16, d17
>>>>        add     r3, r3, #1
>>>>        vshrn.i16       d16, q9, #8
>>>>        cmp     r3, r2
>>>>        vst1.8  {d16}, [r0]!
>>>>        bne     .L3
>>>>
>>>> With both patches applied, the inner loop is:
>>>>
>>>> .L3:
>>>>        vld3.8  {d18-d20}, [r1]!
>>>>        vmull.u8        q8, d18, d21
>>>>        vmlal.u8        q8, d19, d22
>>>>        vmlal.u8        q8, d20, d23
>>>>        add     r3, r3, #1
>>>>        vshrn.i16       d16, q8, #8
>>>>        cmp     r3, r2
>>>>        vst1.8  {d16}, [r0]!
>>>>        bne     .L3
>>>>
>>>> Tested on arm-linux-gnueabi.  OK to install?
>>>
>>> It looks reasonable given the past discussion, but - can you move forward
>>> with the Neon stuff a bit to see if it really fits?  Or is this all
>>> that is needed
>>> for the load/store lane support as well (apart from vectorizer changes of
>>> course).
>>
>> Yeah, I have a prototype that hacks up some C support for generating the
>> (otherwise internal-only) load/store built-in functions that the vectoriser
>> is suppsoed to generate.  This patch is all that seems to be needed for the
>> types and optabs generation to work in the natural way.
>>
>> I'm happy to leave it until the vectoriser stuff is in a more
>> submittable state though.
>
> The vectorisation stuff has now been approved and uses this hook to
> detect whether interleaved loads & stores are supported.  Also...
>
>> Especially given:
>>
>>> Can you check the code generated by for example
>>>
>>> float foo(char *p)
>>> {
>>>   float a[2];
>>>   int i;
>>>   ((char *)a)[0] = p[0];
>>>   ((char *)a)[1] = p[1];
>>>   ((char *)a)[2] = p[2];
>>>   ((char *)a)[3] = p[3];
>>>   ((char *)a)[4] = p[4];
>>>   ((char *)a)[5] = p[5];
>>>   ((char *)a)[6] = p[6];
>>>   ((char *)a)[7] = p[7];
>>>   return a[0] + a[1];
>>> }
>>>
>>> for an array a that would get such a larger mode?  Thus, check what
>>> happens with partial defs of different types (just to avoid ICEs like the
>>> ones Jakub was fixing yesterday).
>>
>> OK, I tried:
>>
>> #include "arm_neon.h"
>>
>> uint32x2_t foo(char *p)
>> {
>>   uint32x2_t a[2];
>>   int i;
>>   ((char *)a)[0] = p[0];
>>   ((char *)a)[1] = p[1];
>>   ((char *)a)[2] = p[2];
>>   ((char *)a)[3] = p[3];
>>   ((char *)a)[4] = p[4];
>>   ((char *)a)[5] = p[5];
>>   ((char *)a)[6] = p[6];
>>   ((char *)a)[7] = p[7];
>>   ((char *)a)[8] = p[8];
>>   ((char *)a)[9] = p[9];
>>   ((char *)a)[10] = p[10];
>>   ((char *)a)[11] = p[11];
>>   ((char *)a)[12] = p[12];
>>   ((char *)a)[13] = p[13];
>>   ((char *)a)[14] = p[14];
>>   ((char *)a)[15] = p[15];
>>   return vadd_u32 (a[0], a[1]);
>> }
>>
>> uint32x4_t bar(char *p, uint32x4_t *b)
>> {
>>   uint32x4_t a[2];
>>   int i;
>>   ((char *)a)[0] = p[0];
>>   ((char *)a)[1] = p[1];
>>   ((char *)a)[2] = p[2];
>>   ((char *)a)[3] = p[3];
>>   ((char *)a)[4] = p[4];
>>   ((char *)a)[5] = p[5];
>>   ((char *)a)[6] = p[6];
>>   ((char *)a)[7] = p[7];
>>   ((char *)a)[8] = p[8];
>>   ((char *)a)[9] = p[9];
>>   ((char *)a)[10] = p[10];
>>   ((char *)a)[11] = p[11];
>>   ((char *)a)[12] = p[12];
>>   ((char *)a)[13] = p[13];
>>   ((char *)a)[14] = p[14];
>>   ((char *)a)[15] = p[15];
>>   ((char *)a)[16 + 0] = p[16 + 0];
>>   ((char *)a)[16 + 1] = p[16 + 1];
>>   ((char *)a)[16 + 2] = p[16 + 2];
>>   ((char *)a)[16 + 3] = p[16 + 3];
>>   ((char *)a)[16 + 4] = p[16 + 4];
>>   ((char *)a)[16 + 5] = p[16 + 5];
>>   ((char *)a)[16 + 6] = p[16 + 6];
>>   ((char *)a)[16 + 7] = p[16 + 7];
>>   ((char *)a)[16 + 8] = p[16 + 8];
>>   ((char *)a)[16 + 9] = p[16 + 9];
>>   ((char *)a)[16 + 10] = p[16 + 10];
>>   ((char *)a)[16 + 11] = p[16 + 11];
>>   ((char *)a)[16 + 12] = p[16 + 12];
>>   ((char *)a)[16 + 13] = p[16 + 13];
>>   ((char *)a)[16 + 14] = p[16 + 14];
>>   ((char *)a)[16 + 15] = p[16 + 15];
>>   return vaddq_u32 (a[0], a[1]);
>> }
>>
>> It seemed to avoid the problem Jakub was seeing, but the second function
>> hit the known const_int reload failure for these modes:
>>
>>     http://gcc.gnu.org/bugzilla/show_bug.cgi?id=46329
>
> ...I've just committed the fix for this PR.  Thanks to everyone for
> all the reviews.
>
> Tested on x86_64-linux-gnu and arm-linux-gnueabi.  Do the
> target-independent bits look OK?  How about the ARM bits?

The middle-end pieces look OK.

Thanks,
Richard.

> Thanks,
> Richard
>
>
> gcc/
>        * hooks.h (hook_bool_mode_uhwi_false): Declare.
>        * hooks.c (hook_bool_mode_uhwi_false): New function.
>        * target.def (array_mode_supported_p): New hook.
>        * doc/tm.texi.in (TARGET_ARRAY_MODE_SUPPORTED_P): Add @hook.
>        * doc/tm.texi: Regenerate.
>        * stor-layout.c (mode_for_array): New function.
>        (layout_type): Use it.
>        * config/arm/arm.c (arm_array_mode_supported_p): New function.
>        (TARGET_ARRAY_MODE_SUPPORTED_P): Define.
>
> Index: gcc/hooks.h
> ===================================================================
> --- gcc/hooks.h 2011-04-21 10:47:30.000000000 +0100
> +++ gcc/hooks.h 2011-04-21 10:47:48.000000000 +0100
> @@ -36,6 +36,8 @@ extern bool hook_bool_mode_const_rtx_fal
>  extern bool hook_bool_mode_const_rtx_true (enum machine_mode, const_rtx);
>  extern bool hook_bool_mode_rtx_false (enum machine_mode, rtx);
>  extern bool hook_bool_mode_rtx_true (enum machine_mode, rtx);
> +extern bool hook_bool_mode_uhwi_false (enum machine_mode,
> +                                      unsigned HOST_WIDE_INT);
>  extern bool hook_bool_tree_false (tree);
>  extern bool hook_bool_const_tree_false (const_tree);
>  extern bool hook_bool_tree_true (tree);
> Index: gcc/hooks.c
> ===================================================================
> --- gcc/hooks.c 2011-04-21 10:47:30.000000000 +0100
> +++ gcc/hooks.c 2011-04-21 10:47:48.000000000 +0100
> @@ -117,6 +117,15 @@ hook_bool_mode_rtx_true (enum machine_mo
>   return true;
>  }
>
> +/* Generic hook that takes (enum machine_mode, unsigned HOST_WIDE_INT)
> +   and returns false.  */
> +bool
> +hook_bool_mode_uhwi_false (enum machine_mode mode ATTRIBUTE_UNUSED,
> +                          unsigned HOST_WIDE_INT value ATTRIBUTE_UNUSED)
> +{
> +  return false;
> +}
> +
>  /* Generic hook that takes (FILE *, const char *) and does nothing.  */
>  void
>  hook_void_FILEptr_constcharptr (FILE *a ATTRIBUTE_UNUSED, const char *b 
> ATTRIBUTE_UNUSED)
> Index: gcc/target.def
> ===================================================================
> --- gcc/target.def      2011-04-21 10:47:30.000000000 +0100
> +++ gcc/target.def      2011-04-21 10:47:48.000000000 +0100
> @@ -1565,6 +1565,38 @@ DEFHOOK
>  bool, (enum machine_mode mode),
>  hook_bool_mode_false)
>
> +/* True if we should try to use a scalar mode to represent an array,
> +   overriding the usual MAX_FIXED_MODE limit.  */
> +DEFHOOK
> +(array_mode_supported_p,
> + "Return true if GCC should try to use a scalar mode to store an array\n\
> +of @var{nelems} elements, given that each element has mode @var{mode}.\n\
> +Returning true here overrides the usual @code{MAX_FIXED_MODE} limit\n\
> +and allows GCC to use any defined integer mode.\n\
> +\n\
> +One use of this hook is to support vector load and store operations\n\
> +that operate on several homogeneous vectors.  For example, ARM NEON\n\
> +has operations like:\n\
> +\n\
> +@smallexample\n\
> +int8x8x3_t vld3_s8 (const int8_t *)\n\
> +@end smallexample\n\
> +\n\
> +where the return type is defined as:\n\
> +\n\
> +@smallexample\n\
> +typedef struct int8x8x3_t\n\
> +@{\n\
> +  int8x8_t val[3];\n\
> +@} int8x8x3_t;\n\
> +@end smallexample\n\
> +\n\
> +If this hook allows @code{val} to have a scalar mode, then\n\
> +@code{int8x8x3_t} can have the same mode.  GCC can then store\n\
> +@code{int8x8x3_t}s in registers rather than forcing them onto the stack.",
> + bool, (enum machine_mode mode, unsigned HOST_WIDE_INT nelems),
> + hook_bool_mode_uhwi_false)
> +
>  /* Compute cost of moving data from a register of class FROM to one of
>    TO, using MODE.  */
>  DEFHOOK
> Index: gcc/doc/tm.texi.in
> ===================================================================
> --- gcc/doc/tm.texi.in  2011-04-21 10:47:30.000000000 +0100
> +++ gcc/doc/tm.texi.in  2011-04-21 10:47:48.000000000 +0100
> @@ -4263,6 +4263,8 @@ insns involving vector mode @var{mode}.
>  must have move patterns for this mode.
>  @end deftypefn
>
> +@hook TARGET_ARRAY_MODE_SUPPORTED_P
> +
>  @hook TARGET_SMALL_REGISTER_CLASSES_FOR_MODE_P
>  Define this to return nonzero for machine modes for which the port has
>  small register classes.  If this target hook returns nonzero for a given
> Index: gcc/doc/tm.texi
> ===================================================================
> --- gcc/doc/tm.texi     2011-04-21 10:47:30.000000000 +0100
> +++ gcc/doc/tm.texi     2011-04-21 10:47:48.000000000 +0100
> @@ -4277,6 +4277,34 @@ insns involving vector mode @var{mode}.
>  must have move patterns for this mode.
>  @end deftypefn
>
> +@deftypefn {Target Hook} bool TARGET_ARRAY_MODE_SUPPORTED_P (enum 
> machine_mode @var{mode}, unsigned HOST_WIDE_INT @var{nelems})
> +Return true if GCC should try to use a scalar mode to store an array
> +of @var{nelems} elements, given that each element has mode @var{mode}.
> +Returning true here overrides the usual @code{MAX_FIXED_MODE} limit
> +and allows GCC to use any defined integer mode.
> +
> +One use of this hook is to support vector load and store operations
> +that operate on several homogeneous vectors.  For example, ARM NEON
> +has operations like:
> +
> +@smallexample
> +int8x8x3_t vld3_s8 (const int8_t *)
> +@end smallexample
> +
> +where the return type is defined as:
> +
> +@smallexample
> +typedef struct int8x8x3_t
> +@{
> +  int8x8_t val[3];
> +@} int8x8x3_t;
> +@end smallexample
> +
> +If this hook allows @code{val} to have a scalar mode, then
> +@code{int8x8x3_t} can have the same mode.  GCC can then store
> +@code{int8x8x3_t}s in registers rather than forcing them onto the stack.
> +@end deftypefn
> +
>  @deftypefn {Target Hook} bool TARGET_SMALL_REGISTER_CLASSES_FOR_MODE_P (enum 
> machine_mode @var{mode})
>  Define this to return nonzero for machine modes for which the port has
>  small register classes.  If this target hook returns nonzero for a given
> Index: gcc/stor-layout.c
> ===================================================================
> --- gcc/stor-layout.c   2011-04-21 10:47:30.000000000 +0100
> +++ gcc/stor-layout.c   2011-04-21 10:47:48.000000000 +0100
> @@ -546,6 +546,34 @@ get_mode_alignment (enum machine_mode mo
>   return MIN (BIGGEST_ALIGNMENT, MAX (1, 
> mode_base_align[mode]*BITS_PER_UNIT));
>  }
>
> +/* Return the natural mode of an array, given that it is SIZE bytes in
> +   total and has elements of type ELEM_TYPE.  */
> +
> +static enum machine_mode
> +mode_for_array (tree elem_type, tree size)
> +{
> +  tree elem_size;
> +  unsigned HOST_WIDE_INT int_size, int_elem_size;
> +  bool limit_p;
> +
> +  /* One-element arrays get the component type's mode.  */
> +  elem_size = TYPE_SIZE (elem_type);
> +  if (simple_cst_equal (size, elem_size))
> +    return TYPE_MODE (elem_type);
> +
> +  limit_p = true;
> +  if (host_integerp (size, 1) && host_integerp (elem_size, 1))
> +    {
> +      int_size = tree_low_cst (size, 1);
> +      int_elem_size = tree_low_cst (elem_size, 1);
> +      if (int_elem_size > 0
> +         && int_size % int_elem_size == 0
> +         && targetm.array_mode_supported_p (TYPE_MODE (elem_type),
> +                                            int_size / int_elem_size))
> +       limit_p = false;
> +    }
> +  return mode_for_size_tree (size, MODE_INT, limit_p);
> +}
>
>  /* Subroutine of layout_decl: Force alignment required for the data type.
>    But if the decl itself wants greater alignment, don't override that.  */
> @@ -2040,14 +2068,8 @@ layout_type (tree type)
>            && (TYPE_MODE (TREE_TYPE (type)) != BLKmode
>                || TYPE_NO_FORCE_BLK (TREE_TYPE (type))))
>          {
> -           /* One-element arrays get the component type's mode.  */
> -           if (simple_cst_equal (TYPE_SIZE (type),
> -                                 TYPE_SIZE (TREE_TYPE (type))))
> -             SET_TYPE_MODE (type, TYPE_MODE (TREE_TYPE (type)));
> -           else
> -             SET_TYPE_MODE (type, mode_for_size_tree (TYPE_SIZE (type),
> -                                                      MODE_INT, 1));
> -
> +           SET_TYPE_MODE (type, mode_for_array (TREE_TYPE (type),
> +                                                TYPE_SIZE (type)));
>            if (TYPE_MODE (type) != BLKmode
>                && STRICT_ALIGNMENT && TYPE_ALIGN (type) < BIGGEST_ALIGNMENT
>                && TYPE_ALIGN (type) < GET_MODE_ALIGNMENT (TYPE_MODE (type)))
> Index: gcc/config/arm/arm.c
> ===================================================================
> --- gcc/config/arm/arm.c        2011-04-21 10:47:30.000000000 +0100
> +++ gcc/config/arm/arm.c        2011-04-21 10:47:48.000000000 +0100
> @@ -243,6 +243,8 @@ static rtx arm_pic_static_addr (rtx orig
>  static bool cortex_a9_sched_adjust_cost (rtx, rtx, rtx, int *);
>  static bool xscale_sched_adjust_cost (rtx, rtx, rtx, int *);
>  static bool fa726te_sched_adjust_cost (rtx, rtx, rtx, int *);
> +static bool arm_array_mode_supported_p (enum machine_mode,
> +                                       unsigned HOST_WIDE_INT);
>  static enum machine_mode arm_preferred_simd_mode (enum machine_mode);
>  static bool arm_class_likely_spilled_p (reg_class_t);
>  static bool arm_vector_alignment_reachable (const_tree type, bool is_packed);
> @@ -399,6 +401,8 @@ #define TARGET_ADDRESS_COST arm_address_
>  #define TARGET_SHIFT_TRUNCATION_MASK arm_shift_truncation_mask
>  #undef TARGET_VECTOR_MODE_SUPPORTED_P
>  #define TARGET_VECTOR_MODE_SUPPORTED_P arm_vector_mode_supported_p
> +#undef TARGET_ARRAY_MODE_SUPPORTED_P
> +#define TARGET_ARRAY_MODE_SUPPORTED_P arm_array_mode_supported_p
>  #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
>  #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE arm_preferred_simd_mode
>  #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
> @@ -22514,6 +22518,20 @@ arm_vector_mode_supported_p (enum machin
>   return false;
>  }
>
> +/* Implements target hook array_mode_supported_p.  */
> +
> +static bool
> +arm_array_mode_supported_p (enum machine_mode mode,
> +                           unsigned HOST_WIDE_INT nelems)
> +{
> +  if (TARGET_NEON
> +      && (VALID_NEON_DREG_MODE (mode) || VALID_NEON_QREG_MODE (mode))
> +      && (nelems >= 2 && nelems <= 4))
> +    return true;
> +
> +  return false;
> +}
> +
>  /* Use the option -mvectorize-with-neon-quad to override the use of 
> doubleword
>    registers when autovectorizing for Neon, at least until multiple vector
>    widths are supported properly by the middle-end.  */
>

Reply via email to