https://gcc.gnu.org/g:77b2eaf09c77bf2dddcb8c35ee8bc9cc99e2f93c
commit r17-839-g77b2eaf09c77bf2dddcb8c35ee8bc9cc99e2f93c
Author: oltolm <[email protected]>
Date: Sun May 24 12:57:49 2026 +0200

x86: fix under-aligned indirect AVX argument/return stack slots on win64 [PR54412]

Fix x86 caller/callee handling for over-aligned indirect arguments/returns.

On x86_64-w64-mingw32, TARGET_SEH limits MAX_SUPPORTED_STACK_ALIGNMENT to 128 bits, but 256-bit AVX values are still passed and returned indirectly. Some caller/callee stack-slot paths still used generic allocators that cap requested alignment to MAX_SUPPORTED_STACK_ALIGNMENT, producing slots that are under-aligned for later vmovapd/vmovaps accesses.

Fix caller-side paths by using dynamically allocated stack space for:
* over-aligned by-reference argument copies
* over-aligned hidden return slots

Fix callee-side paths by overallocating the local stack slot, then aligning the effective address within that slot when required alignment exceeds MAX_SUPPORTED_STACK_ALIGNMENT.

This preserves ABI behavior while ensuring alignment-sensitive AVX accesses are correctly aligned in both caller and callee paths.

Use a target hook to control when this over-aligned stack-slot handling is required, instead of hardcoding target conditionals in generic code.

gcc/ChangeLog:

PR target/54412
* target.def (overaligned_stack_slot_required): New calls hook.
* calls.cc (allocate_call_dynamic_stack_space): New helper.
  (initialize_argument_information): Use targetm.calls.overaligned_stack_slot_required for over-aligned by-reference argument copies.
  (expand_call): Use targetm.calls.overaligned_stack_slot_required for over-aligned hidden return slots.
* function.cc (assign_stack_local_aligned): New helper.
  (assign_parm_setup_block): Use targetm.calls.overaligned_stack_slot_required for over-aligned stack parm slots.
  (assign_parm_setup_reg): Likewise.
* config/i386/i386.cc (ix86_overaligned_stack_slot_required): New.
  (TARGET_OVERALIGNED_STACK_SLOT_REQUIRED): Define for i386.
* doc/tm.texi.in: Add hook placement.
* doc/tm.texi: Regenerate. Signed-off-by: oltolm <[email protected]> Signed-off-by: Jonathan Yong <[email protected]> Diff: --- gcc/calls.cc | 72 +++++++++++++++++++++++++++++++++++-------------- gcc/config/i386/i386.cc | 11 ++++++++ gcc/doc/tm.texi | 10 +++++++ gcc/doc/tm.texi.in | 2 ++ gcc/function.cc | 57 ++++++++++++++++++++++++++------------- gcc/target.def | 12 +++++++++ 6 files changed, 125 insertions(+), 39 deletions(-) diff --git a/gcc/calls.cc b/gcc/calls.cc index d491a4146115..ee885a60b582 100644 --- a/gcc/calls.cc +++ b/gcc/calls.cc @@ -159,6 +159,8 @@ static void compute_argument_addresses (struct arg_data *, rtx, int); static rtx rtx_for_function_call (tree, tree); static void load_register_parameters (struct arg_data *, int, rtx *, int, int, bool *); +static rtx allocate_call_dynamic_stack_space (rtx, unsigned int, HOST_WIDE_INT, + rtx *, poly_int64 *); static int special_function_p (const_tree, int); static bool check_sibcall_argument_overlap_1 (rtx); static bool check_sibcall_argument_overlap (rtx_insn *, struct arg_data *, @@ -211,6 +213,28 @@ mark_stack_region_used (poly_uint64 lower_bound, poly_uint64 upper_bound) stack_usage_watermark = MIN (stack_usage_watermark, const_lower); } +/* Allocate temporary call-related stack space with ALIGN alignment. + Save the stack pointer on first use so the caller can restore it after + the call sequence completes. */ + +static rtx +allocate_call_dynamic_stack_space (rtx size, unsigned int align, + HOST_WIDE_INT known_size, + rtx *old_stack_level, + poly_int64 *old_pending_adj) +{ + if (*old_stack_level == 0) + { + emit_stack_save (SAVE_BLOCK, old_stack_level); + *old_pending_adj = pending_stack_adjust; + pending_stack_adjust = 0; + } + + /* We can pass TRUE as the 5th argument because we just saved the stack + pointer and will restore it right after the call. 
*/ + return allocate_dynamic_stack_space (size, align, align, known_size, true); +} + /* Force FUNEXP into a form suitable for the address of a CALL, and return that as an rtx. Also load the static chain register if FNDECL is a nested function. @@ -1481,30 +1505,23 @@ initialize_argument_information (int num_actuals ATTRIBUTE_UNUSED, if (!COMPLETE_TYPE_P (type) || TREE_CODE (TYPE_SIZE_UNIT (type)) != INTEGER_CST + || (targetm.calls.overaligned_stack_slot_required () + && TYPE_ALIGN (type) > MAX_SUPPORTED_STACK_ALIGNMENT) || (flag_stack_check == GENERIC_STACK_CHECK && compare_tree_int (TYPE_SIZE_UNIT (type), STACK_CHECK_MAX_VAR_SIZE) > 0)) { - /* This is a variable-sized object. Make space on the stack - for it. */ + /* Variable-sized or over-aligned by-reference arguments cannot + use a regular stack temp when the target can't guarantee the + requested alignment for stack slots. Allocate temporary + space dynamically and restore the stack right after the call. */ rtx size_rtx = expr_size (args[i].tree_value); - if (*old_stack_level == 0) - { - emit_stack_save (SAVE_BLOCK, old_stack_level); - *old_pending_adj = pending_stack_adjust; - pending_stack_adjust = 0; - } - - /* We can pass TRUE as the 4th argument because we just - saved the stack pointer and will restore it right after - the call. */ - copy = allocate_dynamic_stack_space (size_rtx, - TYPE_ALIGN (type), - TYPE_ALIGN (type), - max_int_size_in_bytes - (type), - true); + copy = allocate_call_dynamic_stack_space (size_rtx, + TYPE_ALIGN (type), + max_int_size_in_bytes (type), + old_stack_level, + old_pending_adj); copy = gen_rtx_MEM (BLKmode, copy); set_mem_attributes (copy, type, 1); } @@ -2911,8 +2928,23 @@ expand_call (tree exp, rtx target, int ignore) /* For variable-sized objects, we must be called with a target specified. If we were to allocate space on the stack here, we would have no way of knowing when to free it. 
*/ - rtx d = assign_temp (rettype, 1, 1); - structure_value_addr = XEXP (d, 0); + if (targetm.calls.overaligned_stack_slot_required () + && TYPE_ALIGN (rettype) > MAX_SUPPORTED_STACK_ALIGNMENT) + { + unsigned HOST_WIDE_INT size; + + gcc_checking_assert (TREE_CODE (TYPE_SIZE_UNIT (rettype)) + == INTEGER_CST); + size = tree_to_uhwi (TYPE_SIZE_UNIT (rettype)); + structure_value_addr = allocate_call_dynamic_stack_space ( + gen_int_mode (size, Pmode), TYPE_ALIGN (rettype), size, + &old_stack_level, &old_pending_adj); + } + else + { + rtx d = assign_temp (rettype, 1, 1); + structure_value_addr = XEXP (d, 0); + } target = 0; } } diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index a5559fe8a330..dd1f715b460b 100644 --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -430,6 +430,7 @@ static rtx ix86_function_value (const_tree, const_tree, bool); static bool ix86_function_value_regno_p (const unsigned int); static unsigned int ix86_function_arg_boundary (machine_mode, const_tree); +static bool ix86_overaligned_stack_slot_required (void); static rtx ix86_static_chain (const_tree, bool); static int ix86_function_regparm (const_tree, const_tree); static void ix86_compute_frame_layout (void); @@ -1631,6 +1632,14 @@ ix86_must_pass_in_stack (const function_arg_info &arg) && arg.type && TREE_CODE (arg.type) != VECTOR_TYPE); } +/* Implement TARGET_OVERALIGNED_STACK_SLOT_REQUIRED. */ + +static bool +ix86_overaligned_stack_slot_required (void) +{ + return TARGET_SEH; +} + /* It returns the size, in bytes, of the area reserved for arguments passed in registers for the function represented by fndecl dependent to the used abi format. 
*/ @@ -28505,6 +28514,8 @@ static const scoped_attribute_specs *const ix86_attribute_table[] = #define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs #undef TARGET_MUST_PASS_IN_STACK #define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack +#undef TARGET_OVERALIGNED_STACK_SLOT_REQUIRED +#define TARGET_OVERALIGNED_STACK_SLOT_REQUIRED ix86_overaligned_stack_slot_required #undef TARGET_ALLOCATE_STACK_SLOTS_FOR_ARGS #define TARGET_ALLOCATE_STACK_SLOTS_FOR_ARGS ix86_allocate_stack_slots_for_args #undef TARGET_FUNCTION_ARG_ADVANCE diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi index a4ae17decb07..04537aa32006 100644 --- a/gcc/doc/tm.texi +++ b/gcc/doc/tm.texi @@ -4300,6 +4300,16 @@ definition that is usually appropriate, refer to @file{expr.h} for additional documentation. @end deftypefn +@deftypefn {Target Hook} bool TARGET_OVERALIGNED_STACK_SLOT_REQUIRED (void) +This hook should return @code{true} when call-related stack slots whose +requested alignment exceeds @code{MAX_SUPPORTED_STACK_ALIGNMENT} need special +handling on the target. This covers stack slots created by call expansion +(such as by-reference argument copies and hidden structure return storage) and +incoming argument setup. When @code{true}, GCC may avoid normal fixed stack +slots for such cases and use over-allocation plus dynamic address alignment +instead. +@end deftypefn + @deftypefn {Target Hook} rtx TARGET_FUNCTION_INCOMING_ARG (cumulative_args_t @var{ca}, const function_arg_info @var{&arg}) Define this hook if the caller and callee on the target have different views of where arguments are passed. Also define this hook if there are diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in index 1acda0c264cc..e31d7440b75a 100644 --- a/gcc/doc/tm.texi.in +++ b/gcc/doc/tm.texi.in @@ -3358,6 +3358,8 @@ the stack. 
@hook TARGET_MUST_PASS_IN_STACK +@hook TARGET_OVERALIGNED_STACK_SLOT_REQUIRED + @hook TARGET_FUNCTION_INCOMING_ARG @hook TARGET_USE_PSEUDO_PIC_REG diff --git a/gcc/function.cc b/gcc/function.cc index 109821c16fc9..84023070ad8b 100644 --- a/gcc/function.cc +++ b/gcc/function.cc @@ -155,6 +155,8 @@ static bool contains (const rtx_insn *, hash_table<insn_cache_hasher> *); static void prepare_function_start (void); static void do_clobber_return_reg (rtx, void *); static void do_use_return_reg (rtx, void *); +static rtx assign_stack_local_aligned (machine_mode, poly_int64, + unsigned int); /* Stack of nested functions. */ @@ -551,6 +553,36 @@ assign_stack_local (machine_mode mode, poly_int64 size, int align) return assign_stack_local_1 (mode, size, align, ASLK_RECORD_PAD); } +/* Like assign_stack_local, but preserve requested over-alignment by + overallocating a BLKmode slot and aligning an address within it. */ + +static rtx +assign_stack_local_aligned (machine_mode mode, poly_int64 size, + unsigned int align) +{ + if (targetm.calls.overaligned_stack_slot_required () + && align > MAX_SUPPORTED_STACK_ALIGNMENT) + { + if (!size.is_constant ()) + return assign_stack_local (mode, size, MAX_SUPPORTED_STACK_ALIGNMENT); + + rtx allocsize = gen_int_mode (size, Pmode); + get_dynamic_stack_size (&allocsize, 0, align, NULL); + + if (!CONST_INT_P (allocsize)) + return assign_stack_local (mode, size, MAX_SUPPORTED_STACK_ALIGNMENT); + + rtx slot = assign_stack_local (BLKmode, UINTVAL (allocsize), + MAX_SUPPORTED_STACK_ALIGNMENT); + rtx addr = align_dynamic_address (XEXP (slot, 0), align); + mark_reg_pointer (addr, align); + slot = gen_rtx_MEM (mode, addr); + MEM_NOTRAP_P (slot) = 1; + return slot; + } + return assign_stack_local (mode, size, align); +} + /* In order to evaluate some expressions, such as function calls returning structures in memory, we need to temporarily allocate stack locations. We record each allocated temporary in the following structure. 
@@ -2948,21 +2980,8 @@ assign_parm_setup_block (struct assign_parm_data_all *all, ? MAX (DECL_ALIGN (parm), BITS_PER_WORD) : DECL_ALIGN (parm)); SET_DECL_ALIGN (parm, parm_align); - if (DECL_ALIGN (parm) > MAX_SUPPORTED_STACK_ALIGNMENT) - { - rtx allocsize = gen_int_mode (size_stored, Pmode); - get_dynamic_stack_size (&allocsize, 0, DECL_ALIGN (parm), NULL); - stack_parm = assign_stack_local (BLKmode, UINTVAL (allocsize), - MAX_SUPPORTED_STACK_ALIGNMENT); - rtx addr = align_dynamic_address (XEXP (stack_parm, 0), - DECL_ALIGN (parm)); - mark_reg_pointer (addr, DECL_ALIGN (parm)); - stack_parm = gen_rtx_MEM (GET_MODE (stack_parm), addr); - MEM_NOTRAP_P (stack_parm) = 1; - } - else - stack_parm = assign_stack_local (BLKmode, size_stored, - DECL_ALIGN (parm)); + stack_parm + = assign_stack_local_aligned (BLKmode, size_stored, DECL_ALIGN (parm)); if (known_eq (GET_MODE_SIZE (GET_MODE (entry_parm)), size)) PUT_MODE (stack_parm, GET_MODE (entry_parm)); set_mem_attributes (stack_parm, parm, 1); @@ -3366,10 +3385,10 @@ assign_parm_setup_reg (struct assign_parm_data_all *all, tree parm, int align = STACK_SLOT_ALIGNMENT (TREE_TYPE (parm), TYPE_MODE (TREE_TYPE (parm)), TYPE_ALIGN (TREE_TYPE (parm))); - parmreg - = assign_stack_local (TYPE_MODE (TREE_TYPE (parm)), - GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (parm))), - align); + parmreg = assign_stack_local_aligned (TYPE_MODE (TREE_TYPE (parm)), + GET_MODE_SIZE ( + TYPE_MODE (TREE_TYPE (parm))), + align); set_mem_attributes (parmreg, parm, 1); } diff --git a/gcc/target.def b/gcc/target.def index cdb3a6a6c840..0b7436b67a4b 100644 --- a/gcc/target.def +++ b/gcc/target.def @@ -5211,6 +5211,18 @@ documentation.", bool, (const function_arg_info &arg), must_pass_in_stack_var_size_or_pad) +DEFHOOK +(overaligned_stack_slot_required, + "This hook should return @code{true} when call-related stack slots whose\n\ +requested alignment exceeds @code{MAX_SUPPORTED_STACK_ALIGNMENT} need special\n\ +handling on the target. 
This covers stack slots created by call expansion\n\ +(such as by-reference argument copies and hidden structure return storage) and\n\ +incoming argument setup. When @code{true}, GCC may avoid normal fixed stack\n\ +slots for such cases and use over-allocation plus dynamic address alignment\n\ +instead.", + bool, (void), + hook_bool_void_false) + /* Return true if type TYPE, mode MODE, which is passed by reference, should have the object copy generated by the callee rather than the caller. It is never called for TYPE requiring constructors. */
