To get back to this... Richard Sandiford <richard.sandif...@linaro.org> writes: > Richard Guenther <richard.guent...@gmail.com> writes: >> On Thu, Mar 31, 2011 at 3:32 PM, Richard Sandiford >> <richard.sandif...@linaro.org> wrote: >>> This patch adds an array_mode_supported_p hook, which says whether >>> MAX_FIXED_MODE_SIZE should be ignored for a given type of array. >>> It follows on from the discussion here: >>> >>> http://gcc.gnu.org/ml/gcc/2011-03/msg00342.html >>> >>> The intended use of the hook is to allow small arrays of vectors >>> to have a non-BLK mode, and hence to be stored in rtl registers. >>> These arrays are used both in the ARM arm_neon.h API and in the >>> optabs proposed in: >>> >>> http://gcc.gnu.org/ml/gcc/2011-03/msg00322.html >>> >>> The tail end of the thread was about the definition of TYPE_MODE: >>> >>> #define TYPE_MODE(NODE) \ >>> (TREE_CODE (TYPE_CHECK (NODE)) == VECTOR_TYPE \ >>> ? vector_type_mode (NODE) : (NODE)->type.mode) >>> >>> with this outcome: >>> >>> http://gcc.gnu.org/ml/gcc/2011-03/msg00470.html >>> >>> To summarise my take on it: >>> >>> - The current definition of TYPE_MODE isn't sufficient even for vector >>> modes and vector_mode_supported_p, because non-vector types can have >>> vector modes. >>> >>> - We should no longer treat types as having one mode everywhere. >>> We should instead replace TYPE_MODE with a function that takes >>> a context. Tests of things like vector_mode_supported_p would >>> move from layout_type to this new function. >>> >>> I think this patch fits within that scheme. array_mode_supported_p >>> would be treated in the same way as vector_mode_supported_p. >>> >>> I realise the ideal would be to get rid of TYPE_MODE first. >>> But that's going to be a longer-term thing. Now that there's >>> at least a plan, I'd like to press ahead with the array stuff >>> on the basis that >>> >>> (a) although the new hook won't work with the "target" attribute, >>> our current mode handling doesn't work in just the same way. >>> >>> (b) the new hook doesn't interfere with the plan. >>> >>> (c) getting good code from the intrinsics (and support for these >>> instructions in the vectoriser) is going to be much more important >>> to most ARM users than the ability to turn Neon on and off for >>> individual functions in a TU. >>> >>> To give an example of the difference, the Neon code posted here: >>> >>> http://hilbert-space.de/?p=22 >>> >>> produces this inner loop before the patch (but with >>> http://gcc.gnu.org/ml/gcc-patches/2011-03/msg01996.html applied): >>> >>> .L3: >>> vld3.8 {d16-d18}, [r1]! >>> vstmia ip, {d16-d18} >>> fldd d19, [sp, #24] >>> adr r5, .L6 >>> ldmia r5, {r4-r5} >>> fldd d16, [sp, #32] >>> vmov d18, r4, r5 @ v8qi >>> vmull.u8 q9, d19, d18 >>> adr r5, .L6+8 >>> ldmia r5, {r4-r5} >>> vmov d17, r4, r5 @ v8qi >>> vstmia sp, {d18-d19} >>> vmlal.u8 q9, d16, d17 >>> fldd d16, [sp, #40] >>> adr r5, .L6+16 >>> ldmia r5, {r4-r5} >>> vmov d17, r4, r5 @ v8qi >>> vmlal.u8 q9, d16, d17 >>> add r3, r3, #1 >>> vshrn.i16 d16, q9, #8 >>> cmp r3, r2 >>> vst1.8 {d16}, [r0]! >>> bne .L3 >>> >>> With both patches applied, the inner loop is: >>> >>> .L3: >>> vld3.8 {d18-d20}, [r1]! >>> vmull.u8 q8, d18, d21 >>> vmlal.u8 q8, d19, d22 >>> vmlal.u8 q8, d20, d23 >>> add r3, r3, #1 >>> vshrn.i16 d16, q8, #8 >>> cmp r3, r2 >>> vst1.8 {d16}, [r0]! >>> bne .L3 >>> >>> Tested on arm-linux-gnueabi. OK to install? >> >> It looks reasonable given the past discussion, but - can you move forward >> with the Neon stuff a bit to see if it really fits? Or is this all >> that is needed >> for the load/store lane support as well (apart from vectorizer changes of >> course). > > Yeah, I have a prototype that hacks up some C support for generating the > (otherwise internal-only) load/store built-in functions that the vectoriser > is suppsoed to generate. This patch is all that seems to be needed for the > types and optabs generation to work in the natural way. > > I'm happy to leave it until the vectoriser stuff is in a more > submittable state though.
The vectorisation stuff has now been approved and uses this hook to detect whether interleaved loads & stores are supported. Also... > Especially given: > >> Can you check the code generated by for example >> >> float foo(char *p) >> { >> float a[2]; >> int i; >> ((char *)a)[0] = p[0]; >> ((char *)a)[1] = p[1]; >> ((char *)a)[2] = p[2]; >> ((char *)a)[3] = p[3]; >> ((char *)a)[4] = p[4]; >> ((char *)a)[5] = p[5]; >> ((char *)a)[6] = p[6]; >> ((char *)a)[7] = p[7]; >> return a[0] + a[1]; >> } >> >> for an array a that would get such a larger mode? Thus, check what >> happens with partial defs of different types (just to avoid ICEs like the >> ones Jakub was fixing yesterday). > > OK, I tried: > > #include "arm_neon.h" > > uint32x2_t foo(char *p) > { > uint32x2_t a[2]; > int i; > ((char *)a)[0] = p[0]; > ((char *)a)[1] = p[1]; > ((char *)a)[2] = p[2]; > ((char *)a)[3] = p[3]; > ((char *)a)[4] = p[4]; > ((char *)a)[5] = p[5]; > ((char *)a)[6] = p[6]; > ((char *)a)[7] = p[7]; > ((char *)a)[8] = p[8]; > ((char *)a)[9] = p[9]; > ((char *)a)[10] = p[10]; > ((char *)a)[11] = p[11]; > ((char *)a)[12] = p[12]; > ((char *)a)[13] = p[13]; > ((char *)a)[14] = p[14]; > ((char *)a)[15] = p[15]; > return vadd_u32 (a[0], a[1]); > } > > uint32x4_t bar(char *p, uint32x4_t *b) > { > uint32x4_t a[2]; > int i; > ((char *)a)[0] = p[0]; > ((char *)a)[1] = p[1]; > ((char *)a)[2] = p[2]; > ((char *)a)[3] = p[3]; > ((char *)a)[4] = p[4]; > ((char *)a)[5] = p[5]; > ((char *)a)[6] = p[6]; > ((char *)a)[7] = p[7]; > ((char *)a)[8] = p[8]; > ((char *)a)[9] = p[9]; > ((char *)a)[10] = p[10]; > ((char *)a)[11] = p[11]; > ((char *)a)[12] = p[12]; > ((char *)a)[13] = p[13]; > ((char *)a)[14] = p[14]; > ((char *)a)[15] = p[15]; > ((char *)a)[16 + 0] = p[16 + 0]; > ((char *)a)[16 + 1] = p[16 + 1]; > ((char *)a)[16 + 2] = p[16 + 2]; > ((char *)a)[16 + 3] = p[16 + 3]; > ((char *)a)[16 + 4] = p[16 + 4]; > ((char *)a)[16 + 5] = p[16 + 5]; > ((char *)a)[16 + 6] = p[16 + 6]; > ((char *)a)[16 + 7] = p[16 + 7]; > ((char *)a)[16 + 8] = p[16 + 8]; > ((char *)a)[16 + 9] = p[16 + 9]; > ((char *)a)[16 + 10] = p[16 + 10]; > ((char *)a)[16 + 11] = p[16 + 11]; > ((char *)a)[16 + 12] = p[16 + 12]; > ((char *)a)[16 + 13] = p[16 + 13]; > ((char *)a)[16 + 14] = p[16 + 14]; > ((char *)a)[16 + 15] = p[16 + 15]; > return vaddq_u32 (a[0], a[1]); > } > > It seemed to avoid the problem Jakub was seeing, but the second function > hit the known const_int reload failure for these modes: > > http://gcc.gnu.org/bugzilla/show_bug.cgi?id=46329 ...I've just committed the fix for this PR. Thanks to everyone for all the reviews. Tested on x86_64-linux-gnu and arm-linux-gnueabi. Do the target-independent bits look OK? How about the ARM bits? Thanks, Richard gcc/ * hooks.h (hook_bool_mode_uhwi_false): Declare. * hooks.c (hook_bool_mode_uhwi_false): New function. * target.def (array_mode_supported_p): New hook. * doc/tm.texi.in (TARGET_ARRAY_MODE_SUPPORTED_P): Add @hook. * doc/tm.texi: Regenerate. * stor-layout.c (mode_for_array): New function. (layout_type): Use it. * config/arm/arm.c (arm_array_mode_supported_p): New function. (TARGET_ARRAY_MODE_SUPPORTED_P): Define. Index: gcc/hooks.h =================================================================== --- gcc/hooks.h 2011-04-21 10:47:30.000000000 +0100 +++ gcc/hooks.h 2011-04-21 10:47:48.000000000 +0100 @@ -36,6 +36,8 @@ extern bool hook_bool_mode_const_rtx_fal extern bool hook_bool_mode_const_rtx_true (enum machine_mode, const_rtx); extern bool hook_bool_mode_rtx_false (enum machine_mode, rtx); extern bool hook_bool_mode_rtx_true (enum machine_mode, rtx); +extern bool hook_bool_mode_uhwi_false (enum machine_mode, + unsigned HOST_WIDE_INT); extern bool hook_bool_tree_false (tree); extern bool hook_bool_const_tree_false (const_tree); extern bool hook_bool_tree_true (tree); Index: gcc/hooks.c =================================================================== --- gcc/hooks.c 2011-04-21 10:47:30.000000000 +0100 +++ gcc/hooks.c 2011-04-21 10:47:48.000000000 +0100 @@ -117,6 +117,15 @@ hook_bool_mode_rtx_true (enum machine_mo return true; } +/* Generic hook that takes (enum machine_mode, unsigned HOST_WIDE_INT) + and returns false. */ +bool +hook_bool_mode_uhwi_false (enum machine_mode mode ATTRIBUTE_UNUSED, + unsigned HOST_WIDE_INT value ATTRIBUTE_UNUSED) +{ + return false; +} + /* Generic hook that takes (FILE *, const char *) and does nothing. */ void hook_void_FILEptr_constcharptr (FILE *a ATTRIBUTE_UNUSED, const char *b ATTRIBUTE_UNUSED) Index: gcc/target.def =================================================================== --- gcc/target.def 2011-04-21 10:47:30.000000000 +0100 +++ gcc/target.def 2011-04-21 10:47:48.000000000 +0100 @@ -1565,6 +1565,38 @@ DEFHOOK bool, (enum machine_mode mode), hook_bool_mode_false) +/* True if we should try to use a scalar mode to represent an array, + overriding the usual MAX_FIXED_MODE limit. */ +DEFHOOK +(array_mode_supported_p, + "Return true if GCC should try to use a scalar mode to store an array\n\ +of @var{nelems} elements, given that each element has mode @var{mode}.\n\ +Returning true here overrides the usual @code{MAX_FIXED_MODE} limit\n\ +and allows GCC to use any defined integer mode.\n\ +\n\ +One use of this hook is to support vector load and store operations\n\ +that operate on several homogeneous vectors. For example, ARM NEON\n\ +has operations like:\n\ +\n\ +@smallexample\n\ +int8x8x3_t vld3_s8 (const int8_t *)\n\ +@end smallexample\n\ +\n\ +where the return type is defined as:\n\ +\n\ +@smallexample\n\ +typedef struct int8x8x3_t\n\ +@{\n\ + int8x8_t val[3];\n\ +@} int8x8x3_t;\n\ +@end smallexample\n\ +\n\ +If this hook allows @code{val} to have a scalar mode, then\n\ +@code{int8x8x3_t} can have the same mode. GCC can then store\n\ +@code{int8x8x3_t}s in registers rather than forcing them onto the stack.", + bool, (enum machine_mode mode, unsigned HOST_WIDE_INT nelems), + hook_bool_mode_uhwi_false) + /* Compute cost of moving data from a register of class FROM to one of TO, using MODE. */ DEFHOOK Index: gcc/doc/tm.texi.in =================================================================== --- gcc/doc/tm.texi.in 2011-04-21 10:47:30.000000000 +0100 +++ gcc/doc/tm.texi.in 2011-04-21 10:47:48.000000000 +0100 @@ -4263,6 +4263,8 @@ insns involving vector mode @var{mode}. must have move patterns for this mode. @end deftypefn +@hook TARGET_ARRAY_MODE_SUPPORTED_P + @hook TARGET_SMALL_REGISTER_CLASSES_FOR_MODE_P Define this to return nonzero for machine modes for which the port has small register classes. If this target hook returns nonzero for a given Index: gcc/doc/tm.texi =================================================================== --- gcc/doc/tm.texi 2011-04-21 10:47:30.000000000 +0100 +++ gcc/doc/tm.texi 2011-04-21 10:47:48.000000000 +0100 @@ -4277,6 +4277,34 @@ insns involving vector mode @var{mode}. must have move patterns for this mode. @end deftypefn +@deftypefn {Target Hook} bool TARGET_ARRAY_MODE_SUPPORTED_P (enum machine_mode @var{mode}, unsigned HOST_WIDE_INT @var{nelems}) +Return true if GCC should try to use a scalar mode to store an array +of @var{nelems} elements, given that each element has mode @var{mode}. +Returning true here overrides the usual @code{MAX_FIXED_MODE} limit +and allows GCC to use any defined integer mode. + +One use of this hook is to support vector load and store operations +that operate on several homogeneous vectors. For example, ARM NEON +has operations like: + +@smallexample +int8x8x3_t vld3_s8 (const int8_t *) +@end smallexample + +where the return type is defined as: + +@smallexample +typedef struct int8x8x3_t +@{ + int8x8_t val[3]; +@} int8x8x3_t; +@end smallexample + +If this hook allows @code{val} to have a scalar mode, then +@code{int8x8x3_t} can have the same mode. GCC can then store +@code{int8x8x3_t}s in registers rather than forcing them onto the stack. +@end deftypefn + @deftypefn {Target Hook} bool TARGET_SMALL_REGISTER_CLASSES_FOR_MODE_P (enum machine_mode @var{mode}) Define this to return nonzero for machine modes for which the port has small register classes. If this target hook returns nonzero for a given Index: gcc/stor-layout.c =================================================================== --- gcc/stor-layout.c 2011-04-21 10:47:30.000000000 +0100 +++ gcc/stor-layout.c 2011-04-21 10:47:48.000000000 +0100 @@ -546,6 +546,34 @@ get_mode_alignment (enum machine_mode mo return MIN (BIGGEST_ALIGNMENT, MAX (1, mode_base_align[mode]*BITS_PER_UNIT)); } +/* Return the natural mode of an array, given that it is SIZE bytes in + total and has elements of type ELEM_TYPE. */ + +static enum machine_mode +mode_for_array (tree elem_type, tree size) +{ + tree elem_size; + unsigned HOST_WIDE_INT int_size, int_elem_size; + bool limit_p; + + /* One-element arrays get the component type's mode. */ + elem_size = TYPE_SIZE (elem_type); + if (simple_cst_equal (size, elem_size)) + return TYPE_MODE (elem_type); + + limit_p = true; + if (host_integerp (size, 1) && host_integerp (elem_size, 1)) + { + int_size = tree_low_cst (size, 1); + int_elem_size = tree_low_cst (elem_size, 1); + if (int_elem_size > 0 + && int_size % int_elem_size == 0 + && targetm.array_mode_supported_p (TYPE_MODE (elem_type), + int_size / int_elem_size)) + limit_p = false; + } + return mode_for_size_tree (size, MODE_INT, limit_p); +} /* Subroutine of layout_decl: Force alignment required for the data type. But if the decl itself wants greater alignment, don't override that. */ @@ -2040,14 +2068,8 @@ layout_type (tree type) && (TYPE_MODE (TREE_TYPE (type)) != BLKmode || TYPE_NO_FORCE_BLK (TREE_TYPE (type)))) { - /* One-element arrays get the component type's mode. */ - if (simple_cst_equal (TYPE_SIZE (type), - TYPE_SIZE (TREE_TYPE (type)))) - SET_TYPE_MODE (type, TYPE_MODE (TREE_TYPE (type))); - else - SET_TYPE_MODE (type, mode_for_size_tree (TYPE_SIZE (type), - MODE_INT, 1)); - + SET_TYPE_MODE (type, mode_for_array (TREE_TYPE (type), + TYPE_SIZE (type))); if (TYPE_MODE (type) != BLKmode && STRICT_ALIGNMENT && TYPE_ALIGN (type) < BIGGEST_ALIGNMENT && TYPE_ALIGN (type) < GET_MODE_ALIGNMENT (TYPE_MODE (type))) Index: gcc/config/arm/arm.c =================================================================== --- gcc/config/arm/arm.c 2011-04-21 10:47:30.000000000 +0100 +++ gcc/config/arm/arm.c 2011-04-21 10:47:48.000000000 +0100 @@ -243,6 +243,8 @@ static rtx arm_pic_static_addr (rtx orig static bool cortex_a9_sched_adjust_cost (rtx, rtx, rtx, int *); static bool xscale_sched_adjust_cost (rtx, rtx, rtx, int *); static bool fa726te_sched_adjust_cost (rtx, rtx, rtx, int *); +static bool arm_array_mode_supported_p (enum machine_mode, + unsigned HOST_WIDE_INT); static enum machine_mode arm_preferred_simd_mode (enum machine_mode); static bool arm_class_likely_spilled_p (reg_class_t); static bool arm_vector_alignment_reachable (const_tree type, bool is_packed); @@ -399,6 +401,8 @@ #define TARGET_ADDRESS_COST arm_address_ #define TARGET_SHIFT_TRUNCATION_MASK arm_shift_truncation_mask #undef TARGET_VECTOR_MODE_SUPPORTED_P #define TARGET_VECTOR_MODE_SUPPORTED_P arm_vector_mode_supported_p +#undef TARGET_ARRAY_MODE_SUPPORTED_P +#define TARGET_ARRAY_MODE_SUPPORTED_P arm_array_mode_supported_p #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE arm_preferred_simd_mode #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES @@ -22514,6 +22518,20 @@ arm_vector_mode_supported_p (enum machin return false; } +/* Implements target hook array_mode_supported_p. */ + +static bool +arm_array_mode_supported_p (enum machine_mode mode, + unsigned HOST_WIDE_INT nelems) +{ + if (TARGET_NEON + && (VALID_NEON_DREG_MODE (mode) || VALID_NEON_QREG_MODE (mode)) + && (nelems >= 2 && nelems <= 4)) + return true; + + return false; +} + /* Use the option -mvectorize-with-neon-quad to override the use of doubleword registers when autovectorizing for Neon, at least until multiple vector widths are supported properly by the middle-end. */